def _tokenize_from_country_name_tag(self, instID):
    """Get country name and country code tokens for record instID.

    Reads the country_name_tag fields of the record, skipping those
    marked as secondary, and maps each country name to its country
    code via the country-codes knowledge base.

    :param instID: id of the record whose address fields are read
    :return: flat list [name, code, name, code, ...]
             (empty if nothing could be resolved)
    """
    tokens = []
    record = get_record(instID)

    # Read the country name tags that are not marked as secondary
    country_name_list = []
    for field in record[self.address_field]:
        if "secondary" not in field.get_subfield_values(
                self.secondary_address_subfield):
            country_name_list += field.get_subfield_values(
                self.country_name_subfield)
    country_name_list = [s.encode('utf-8') for s in country_name_list]

    for country_name in country_name_list:
        # Reset for every name: previously a failed KB lookup raised
        # NameError on the first iteration and silently reused the
        # previous country's code on later iterations.
        country_code = None
        # Find the country code using KB
        kb_country_code = get_kb_mapping(kb_name=self.kb_country_codes,
                                         value=country_name)
        if kb_country_code:
            country_code = kb_country_code["key"]
        if country_name and country_code:
            tokens += [country_name, country_code]

    return tokens
def extract_arxiv_ids_from_recid(recid):
    """Yield the arXiv identifiers stored in record ``recid``.

    They are read from 037 fields shaped like:
        037__ $9arXiv$arXiv:1010.1111
    """
    for field in get_record(recid).get('037', []):
        sources = field.get_subfield_values('9')
        if not sources or sources[0] != 'arXiv':
            continue
        numbers = field.get_subfield_values('a')
        if not numbers:
            continue
        report_number = numbers[0]
        # Strip the "arXiv:" prefix when present
        if report_number.startswith('arXiv'):
            report_number = report_number.split(':')[1]
        # Drop a trailing version suffix such as "v1"
        if ARXIV_VERSION_PATTERN.search(report_number):
            report_number = report_number[:-2]
        yield report_number
def create_our_record(recid):
    """Build the correction XML that renames 'Lisbon, LIFEP'
    affiliations to 'LIP, Lisbon' in the 100/700 fields of ``recid``.

    :return: MARCXML string containing the (possibly updated) 100 and
        700 instances of the record.
    """
    old_record = get_record(recid)

    # The same substitution applies to first-author (100) and
    # additional-author (700) fields; one loop replaces the two
    # copy-pasted blocks of the original.
    for tag in ('100', '700'):
        for subfield in old_record.find_subfields(tag + '__u'):
            if subfield.value.lower() == 'lisbon, lifep':
                subfield.value = 'LIP, Lisbon'

    record = BibRecord(recid=recid)
    for tag in ('100', '700'):
        try:
            record[tag] = old_record[tag]
        except KeyError:
            record[tag] = []
    return record.to_xml()
def cb_process_one(recid): print 'processing', recid try: rec = get_record(recid) except UnicodeDecodeError: pass else: if len(rec.find_fields('773__')) > 1: bibrank.add(recid)
def append_to_record(rec_id, doi, published_date):
    """Attempt to add a DOI to a record.

    Also adds a 980 'Published' tag if not already there and adds the
    extrapolated PubNote data to 773.  (The code adds 980, not 930 as
    an earlier version of this docstring claimed.)

    :return: MARCXML of the correction record, or None when there is
        nothing to upload.
    """
    record = get_record(recid=rec_id)
    new_record = BibRecord(rec_id)
    # make sure that there is no DOI for this record
    if not record_has_doi(record, rec_id, doi):
        # create new record with only 0247 field, that we will append
        # to the existing record with bibupload function
        new_field = new_record.add_field('0247_')
        new_field.add_subfield('2', 'DOI')
        new_field.add_subfield('a', doi.decode('utf-8'))
        _print('DOI to be added: ' + doi +
               ' to the record ' + str(rec_id), 3)
    if not is_marked_published(record):
        new_field_980 = new_record.add_field('980__')
        new_field_980.add_subfield('a', 'Published')

    append_773 = False
    field_773 = record.find_fields('773__')
    # Pubnote extrapolated from the DOI and publication date
    new_field_773 = create_pubnote(doi, published_date)
    if len(field_773) == 0:
        append_773 = True
        _print("No pubnote, adding field 773 to record...", 7)
    elif not is_pubnote_identical(field_773, new_field_773):
        append_773 = True
        _print("Field 773 already exists for record, " +
               "differs from DOI extract", 3)
    else:
        _print("Field 773 already exists, does not " +
               "contradict DOI extract.", 6)

    if append_773:
        new_field = new_record.add_field('773__')
        for code, value in new_field_773.iteritems():
            new_field.add_subfield(code, value)

    field_260 = record.find_subfields("260__c")
    if len(field_260) == 0:
        # We add 260__c publication date
        new_field = new_record.add_field('260__')
        new_field.add_subfield("c", published_date)

    # More than just the recid present -> something to upload
    if len(new_record.record) > 1:
        return new_record.to_xml()
    else:
        return None
def append_to_record(rec_id, doi, published_date):
    """Attempt to add a DOI to a record.

    Also adds a 980 'Published' tag if not already there and adds the
    extrapolated PubNote data to 773.  (The code adds 980, not 930 as
    an earlier version of this docstring claimed.)

    :return: MARCXML of the correction record, or None when there is
        nothing to upload.
    """
    record = get_record(recid=rec_id)
    new_record = BibRecord(rec_id)
    # make sure that there is no DOI for this record
    if not record_has_doi(record, rec_id, doi):
        # create new record with only 0247 field, that we will append
        # to the existing record with bibupload function
        new_field = new_record.add_field('0247_')
        new_field.add_subfield('2', 'DOI')
        new_field.add_subfield('a', doi.decode('utf-8'))
        _print('DOI to be added: ' + doi +
               ' to the record ' + str(rec_id), 3)
    if not is_marked_published(record):
        new_field_980 = new_record.add_field('980__')
        new_field_980.add_subfield('a', 'Published')

    append_773 = False
    field_773 = record.find_fields('773__')
    # Pubnote extrapolated from the DOI and publication date
    new_field_773 = create_pubnote(doi, published_date)
    if len(field_773) == 0:
        append_773 = True
        _print("No pubnote, adding field 773 to record...", 7)
    elif not is_pubnote_identical(field_773, new_field_773):
        append_773 = True
        _print("Field 773 already exists for record, " +
               "differs from DOI extract", 3)
    else:
        _print("Field 773 already exists, does not " +
               "contradict DOI extract.", 6)

    if append_773:
        new_field = new_record.add_field('773__')
        for code, value in new_field_773.iteritems():
            new_field.add_subfield(code, value)

    field_260 = record.find_subfields("260__c")
    if len(field_260) == 0:
        # We add 260__c publication date
        new_field = new_record.add_field('260__')
        new_field.add_subfield("c", published_date)

    # More than just the recid present -> something to upload
    if len(new_record.record) > 1:
        return new_record.to_xml()
    else:
        return None
def test_equality(self):
    """Each cached record equals a freshly fetched copy of itself
    (both as objects and as XML) and differs from every other cached
    record.

    The original fetched ``self.xml_cache[recid2]`` into an unused
    local in the inequality branch; that dead lookup is removed.
    """
    for recid, record in self.records_cache.iteritems():
        xml = self.xml_cache[recid]
        for recid2 in self.records_cache.iterkeys():
            if recid == recid2:
                # Re-fetch and compare object and serialized forms
                record2 = get_record(recid)
                self.assertEqual(record, record2)
                self.assertXmlEqual(xml, record2.to_xml())
            else:
                self.assertNotEqual(record, self.records_cache[recid2])
def process_chunk(recids):
    """Compute updated affiliation info for the authors of ``recids``.

    :param recids: iterable of record ids to scan
    :return: dict personid -> {'aff': ..., 'last_recid': ...,
        'last_occurence': ...}; newer occurrences win.

    Side effect: aidAFFILIATIONS rows of authors that disappeared from
    a paper are recomputed from their remaining papers or deleted.
    """
    # Map for PID -> Affiliation info
    aff = {}
    for recid in recids:
        pids = get_personids(recid)
        record = get_record(recid)
        # Check for authors removed from papers
        missing_pids = intbitset(run_sql("SELECT personid FROM aidAFFILIATIONS WHERE last_recid = %s", [recid]))
        if pids:
            # Anyone still attached to the paper is not missing
            missing_pids -= intbitset(pids.values())
        for pid in missing_pids:
            try:
                recomputed_aff_info = recompute_affiliation(pid)
            except RecomputeException:
                continue
            if recomputed_aff_info:
                # Keep only the most recent affiliation per person
                if pid not in aff or aff[pid]['last_occurence'] <= recomputed_aff_info['last_occurence']:
                    aff[pid] = recomputed_aff_info
            else:
                # No remaining paper carries an affiliation: drop the row
                run_sql("DELETE FROM aidAFFILIATIONS WHERE personid = %s", [pid])
        if not pids:
            continue
        # Check for new affiliations
        for field in chain(record['100'], record['700']):
            if not field['a']:
                continue
            field_author = field['a'][0]
            field_aff = field['u']
            if field_aff:
                try:
                    pid = pids[field_author]
                except KeyError:
                    # This happens when bibupload runs while
                    # bibauthorid (rabbit) is sleeping
                    continue
                record_date = get_creation_date(recid)
                if pid not in aff or aff[pid]['last_occurence'] <= record_date:
                    aff[pid] = {'aff': field_aff,
                                'last_recid': recid,
                                'last_occurence': record_date}
    return aff
def setUp(self):
    """Cache every MARC record and its XML for the equality tests."""
    self.maxDiff = None

    def order_by_tag(field1, field2):
        """Function used to order the fields according to their tag"""
        return cmp(field1[0], field2[0])
    bibrecord._order_by_ord = order_by_tag  # pylint: disable-msg=W0212

    self.records_cache = {}
    self.xml_cache = {}
    for recid in perform_request_search(p=""):
        rows = run_sql("SELECT master_format FROM bibrec WHERE id=%s",
                       [recid])
        self.assertTrue(rows, msg="bibrec row for %s missing" % recid)
        # Only MARC master-format records are cached
        if rows[0][0] != 'marc':
            continue
        rec = get_record(recid)
        self.records_cache[recid] = rec
        self.xml_cache[recid] = rec.to_xml()
def get_all_authority_record_for_field(field, value):
    """Return the authority records matching ``value`` for ``field``.

    Looks in the configuration for a link between the field and an
    authority category; when one exists, queries the database for (at
    most 20) AUTHORITY records whose indexed subfields contain
    ``value``.

    :param field: field tag used for searching
    :param value: substring the authority subfield must contain
    :return: list of records (empty when the field is not linked)

    Security: the original interpolated ``value`` and the tag directly
    into the SQL string (injection risk); they are now bound as query
    parameters.  Table names cannot be bound, but they derive from the
    configuration, not from user input.
    """
    if field not in CFG_BIBAUTHORITY_RECORD_AUTHOR_CONTROL_NUMBER_FIELDS_REVERSED:
        return []

    authority_type = CFG_BIBAUTHORITY_RECORD_AUTHOR_CONTROL_NUMBER_FIELDS_REVERSED[
        field]
    list_index_fields = CFG_BIBAUTHORITY_AUTHORITY_SUBFIELDS_TO_INDEX[
        authority_type]

    list_request = []
    params = []
    for index_field in list_index_fields:
        first_two_char = index_field[:2]
        list_request.append(
            'SELECT DISTINCT bibrec_bib{0}x.id_bibrec '
            'FROM bibrec_bib{0}x, bib{0}x, bibrec_bib98x, bib98x '
            'WHERE bib{0}x.tag like %s '
            'AND bib{0}x.value like %s '
            'AND bibrec_bib{0}x.id_bibxxx=bib{0}x.id '
            'AND bibrec_bib{0}x.id_bibrec=bibrec_bib98x.id_bibrec '
            'AND bibrec_bib98x.id_bibxxx=bib98x.id '
            'AND bib98x.value<>"DELETED" '
            'AND bib98x.value="AUTHORITY"'.format(first_two_char))
        params += [index_field + '%', '%' + value + '%']

    sql_request = " UNION ".join(list_request)
    authority_records_matching = run_sql(sql_request, tuple(params))
    # Deduplicate and cap the result size
    authority_records_matching = list(set(authority_records_matching))[:20]

    authority_records = []
    for authority_record in authority_records_matching:
        authority_records.append(get_record(authority_record[0]).record)
    return authority_records
def create_our_record(recid, bibupload, bibupload2):
    """Move PACS classification from 650 fields ($2 PACS) into 084.

    Queues one correction record (the merged 084 instances) on
    ``bibupload`` and, when PACS 650 instances were found, a second
    record (the 650 fields to remove) on ``bibupload2``.

    :return: None when nothing changed; otherwise falls off the end
        (also None) after queueing the uploads.
    """
    old_record = get_record(recid)
    try:
        instances_084 = old_record['084']
    except KeyError:
        instances_084 = []

    to_remove_instances_650 = []
    modified = False
    for field in old_record['650']:
        if 'PACS' in field.get_subfield_values('2'):
            # NOTE(review): asserts are stripped under ``python -O``;
            # they only sanity-check the expected field shape here.
            assert len(field.subfields) >= 2
            assert len(field.subfields) - 1 == len(field.get_subfield_values('a'))
            to_remove_instances_650.append(field)
            # One new 084 instance per $a value, tagged as PACS
            for value in field.get_subfield_values('a'):
                sub_2 = BibRecordSubField(code='2', value='PACS')
                sub_a = BibRecordSubField(code='a', value=value)
                f = BibRecordField(subfields=[sub_2, sub_a])
                instances_084.append(f)
            modified = True

    if not modified:
        return None

    # Remove wrong indicator
    for field in instances_084[:]:
        if field.ind1 == '1' and field.ind2 == '7' \
                and 'PACS' in field.get_subfield_values('2'):
            field.ind1 = ' '
            field.ind2 = ' '

    record = BibRecord(recid=recid)
    # set() deduplicates identical 084 instances before upload
    record['084'] = set(instances_084)
    bibupload.add(record.to_xml())

    if to_remove_instances_650:
        record = BibRecord(recid=recid)
        record['650'] = to_remove_instances_650
        bibupload2.add(record.to_xml())
def recompute_affiliation(pid):
    """Return the affiliation info for person ``pid`` from their papers.

    Scans the papers attached to the person through 100 and 700
    bibrefs; returns {'aff': ..., 'last_recid': ...,
    'last_occurence': ...} for the first author field whose $a matches
    the stored name and which carries a $u affiliation.  Returns None
    implicitly when no such field exists.

    :raises RecomputeException: wraps the IndexError raised by a
        malformed field (e.g. empty $a subfield list).
    """
    pid_100_rows = run_sql("""SELECT bibrec, bib10x.value as name FROM aidPERSONIDPAPERS INNER JOIN bib10x ON aidPERSONIDPAPERS.bibref_value = bib10x.id WHERE personid = %s and bibref_table = '100'""", [pid])
    pid_700_rows = run_sql("""SELECT bibrec, bib70x.value as name FROM aidPERSONIDPAPERS INNER JOIN bib70x ON aidPERSONIDPAPERS.bibref_value = bib70x.id WHERE personid = %s and bibref_table = '700'""", [pid])

    for recid, name in chain(pid_100_rows, pid_700_rows):
        record = get_record(recid)
        for field in chain(record['100'], record['700']):
            try:
                # Exact match on the stored name, and $u must be set
                if field['a'][0] == name and field['u']:
                    return {'aff': field['u'],
                            'last_recid': recid,
                            'last_occurence': get_creation_date(recid)}
            except IndexError, e:
                print 'WARNING: problem in recomputing affiliations for pid ', pid
                raise RecomputeException(str(e))
def get_all_record_for_field(field, value):
    """Return (at most 20) records whose ``field`` subfields contain
    ``value``.

    :param field: field tag used for searching (first two characters
        select the bibXXx table)
    :param value: substring the subfield value must contain
    :return: list of records

    Security: the original interpolated ``value`` and the tag directly
    into the SQL string (injection risk); they are now bound as query
    parameters.  The table suffix comes from the field tag itself and
    cannot be bound.
    """
    sql_request = (
        'SELECT DISTINCT bibrec_bib{0}x.id_bibrec '
        'from bibrec_bib{0}x, bib{0}x '
        'where bib{0}x.tag like %s '
        'and bib{0}x.value like %s '
        'and bibrec_bib{0}x.id_bibxxx = bib{0}x.id'.format(field[:2]))
    authority_records_matching = run_sql(sql_request,
                                         (field + '%', '%' + value + '%'))
    # Cap the result size
    authority_records_matching = authority_records_matching[:20]
    authority_records = []
    for authority_record in authority_records_matching:
        authority_records.append(get_record(authority_record[0]).record)
    return authority_records
def append_doi(recID, doi):
    """Build MARCXML appending ``doi`` (0247_ $2 DOI $a) to record recID.

    :return: XML string of the correction record, or None (implicitly)
        when the record already has a DOI or an error occurred.

    Side effect: status text is appended to the module-level
    ``messages`` and ``errors`` lists.
    """
    record = get_record(recid=recID)
    try:
        # make sure that there is no DOI for this record
        if record.find_subfields('0247_a'):
            messages.append('Record %s already has a doi' % recID)
            if record.find_subfields('0247_a')[0].value != doi:
                errors.append('DOI of %s record is different than the new doi (%s)!'
                              % (recID, doi))
        else:
            # create new record with only 0247 field, that we will append
            # to the existing record with bibupload function
            new_record = BibRecord(recID)
            new_field = new_record.add_field('0247_')
            new_field.add_subfield('a', doi.decode('utf-8'))
            new_field.add_subfield('2', 'DOI')
            messages.append('Successfully inserted the doi: ' + doi +
                            ' to the record ' + str(recID))
            return new_record.to_xml()
    except Exception, e:
        # Broad catch is deliberate: any failure is reported in
        # ``errors`` instead of aborting the batch.
        traceback.print_exc()
        errors.append('Unknown error: ' + repr(e))
def get_citation_informations(recid_list, tags, config,
                              fetch_catchup_info=True):
    """Scan records for citation-related metadata.

    For every recid in ``recid_list`` collects the identifiers found in
    its reference fields (999C5x) and, unless ``fetch_catchup_info`` is
    False, the record's own identifiers.

    :param recid_list: iterable of record ids to process
    :param tags: dict mapping logical names ('refs_doi', 'isbn', ...)
        to MARC tags
    :param config: task configuration; may restrict processing to the
        configured collections
    :param fetch_catchup_info: when False, skip the record's own
        metadata (only the references are gathered)
    :return: tuple ``(records_info, references_info)``; each is a dict
        keyed by 'report-numbers', 'journals', 'doi', 'hdl', 'isbn'
        and 'record_id', mapping recid -> list of extracted values,
        e.g. records_info['report-numbers'][5] == ['SUT-DP-92-70-5']
        (an earlier docstring claimed "a 4 list of dictionaries",
        which does not match the return statement).

    NB: stuff here is for analysing new or changed records.
    see "ref_analyzer" for more.
    """
    begin_time = os.times()[4]

    records_info = {
        'report-numbers': {},
        'journals': {},
        'doi': {},
        'hdl': {},
        'isbn': {},
        'record_id': {},
    }

    references_info = {
        'report-numbers': {},
        'journals': {},
        'doi': {},
        'record_id': {},
        'isbn': {},
        'hdl': {},
    }

    # perform quick check to see if there are some records with
    # reference tags, because otherwise get.cit.inf would be slow even
    # if there is nothing to index:
    for done, recid in enumerate(recid_list):
        if done % 10 == 0:
            task_sleep_now_if_required()

        if done % 50 == 0:
            mesg = "get cit.inf done %s of %s" % (done, len(recid_list))
            write_message(mesg)
            task_update_progress(mesg)

        record = get_record(recid)
        records_info['record_id'][recid] = [unicode(recid)]

        function = config.get("rank_method", "function")
        if config.get(function, 'collections'):
            if recid not in recids_cache(config.get(function,
                                                    'collections')):
                # do not treat this record since it is not in the collections
                # we want to process
                continue
        elif recid in deleted_recids_cache():
            # do not treat this record since it was deleted; we
            # skip it like this in case it was only soft-deleted
            # e.g. via bibedit (i.e. when collection tag 980 is
            # DELETED but other tags like report number or journal
            # publication info remained the same, so the calls to
            # get_fieldvalues() below would return old values)
            continue

        if tags['refs_report_number']:
            references_info['report-numbers'][recid] = [
                t.value for t in
                record.find_subfields(tags['refs_report_number'])
            ]
            msg = "references_info['report-numbers'][%s] = %r" \
                % (recid, references_info['report-numbers'][recid])
            write_message(msg, verbose=9)

        if tags['refs_journal']:
            references_info['journals'][recid] = []
            for ref in record.find_subfields(tags['refs_journal']):
                try:
                    # Inspire specific parsing
                    journal, volume, page = ref.value.split(',')
                except ValueError:
                    pass
                else:
                    # Also record the alternative volume form, if any
                    alt_volume = get_alt_volume(volume)
                    if alt_volume:
                        alt_ref = ','.join([journal, alt_volume, page])
                        references_info['journals'][recid] += [alt_ref]
                references_info['journals'][recid] += [ref.value]
            msg = "references_info['journals'][%s] = %r" \
                % (recid, references_info['journals'][recid])
            write_message(msg, verbose=9)

        if tags['refs_doi']:
            references = [
                t.value for t in
                record.find_subfields(tags['refs_doi'])
            ]
            dois = []
            hdls = []
            # A bare value with no prefix is treated as a DOI
            for ref in references:
                if ref.startswith("hdl:"):
                    hdls.append(ref[4:])
                elif ref.startswith("doi:"):
                    dois.append(ref[4:])
                else:
                    dois.append(ref)
            references_info['doi'][recid] = dois
            references_info['hdl'][recid] = hdls

            msg = "references_info['doi'][%s] = %r" % (recid, dois)
            write_message(msg, verbose=9)
            msg = "references_info['hdl'][%s] = %r" % (recid, hdls)
            write_message(msg, verbose=9)

        if tags['refs_record_id']:
            references_info['record_id'][recid] = [
                t.value for t in
                record.find_subfields(tags['refs_record_id'])
            ]
            msg = "references_info['record_id'][%s] = %r" \
                % (recid, references_info['record_id'][recid])
            write_message(msg, verbose=9)

        if tags['refs_isbn']:
            references_info['isbn'][recid] = [
                t.value for t in
                record.find_subfields(tags['refs_isbn'])
            ]
            msg = "references_info['isbn'][%s] = %r" \
                % (recid, references_info['isbn'][recid])
            write_message(msg, verbose=9)

        if not fetch_catchup_info:
            # We do not need the extra info
            continue

        if tags['record_pri_number'] or tags['record_add_number']:
            records_info['report-numbers'][recid] = []

            if tags['record_pri_number']:
                records_info['report-numbers'][recid] += [
                    t.value for t in
                    record.find_subfields(tags['record_pri_number'])
                ]

            if tags['record_add_number']:
                records_info['report-numbers'][recid] += [
                    t.value for t in
                    record.find_subfields(tags['record_add_number'])
                ]

            msg = "records_info[%s]['report-numbers'] = %r" \
                % (recid, records_info['report-numbers'][recid])
            write_message(msg, verbose=9)

        if tags['doi']:
            records_info['doi'][recid] = []
            records_info['hdl'][recid] = []
            for tag in tags['doi']:
                # $2 distinguishes DOI from HDL identifiers
                for field in record.find_fields(tag[:5]):
                    if 'DOI' in field.get_subfield_values('2'):
                        dois = field.get_subfield_values('a')
                        records_info['doi'][recid].extend(dois)
                    elif 'HDL' in field.get_subfield_values('2'):
                        hdls = field.get_subfield_values('a')
                        records_info['hdl'][recid].extend(hdls)

            msg = "records_info[%s]['doi'] = %r" \
                % (recid, records_info['doi'][recid])
            write_message(msg, verbose=9)
            msg = "records_info[%s]['hdl'] = %r" \
                % (recid, records_info['hdl'][recid])
            write_message(msg, verbose=9)

        if tags['isbn']:
            records_info['isbn'][recid] = []
            for tag in tags['isbn']:
                values = [t.value for t in record.find_subfields(tag)]
                records_info['isbn'][recid] += values

            msg = "records_info[%s]['isbn'] = %r" \
                % (recid, records_info['isbn'][recid])
            write_message(msg, verbose=9)

        # get a combination of
        # journal vol (year) pages
        if tags['publication']:
            records_info['journals'][recid] = get_journal_info(record, tags)
            msg = "records_info[%s]['journals'] = %r" \
                % (recid, records_info['journals'][recid])
            write_message(msg, verbose=9)

    mesg = "get cit.inf done fully"
    write_message(mesg)
    task_update_progress(mesg)

    end_time = os.times()[4]
    write_message("Execution time for generating citation info "
                  "from record: %.2f sec" % (end_time - begin_time))

    return records_info, references_info
def get_citation_informations(recid_list, tags, config,
                              fetch_catchup_info=True):
    """Scan records for citation-related metadata.

    For every recid in ``recid_list`` collects the identifiers found in
    its reference fields (999C5x) and, unless ``fetch_catchup_info`` is
    False, the record's own identifiers.

    :param recid_list: iterable of record ids to process
    :param tags: dict mapping logical names ('refs_doi', 'isbn', ...)
        to MARC tags
    :param config: task configuration; may restrict processing to the
        configured collections
    :param fetch_catchup_info: when False, skip the record's own
        metadata (only the references are gathered)
    :return: tuple ``(records_info, references_info)``; each is a dict
        keyed by 'report-numbers', 'journals', 'doi', 'hdl', 'isbn'
        and 'record_id', mapping recid -> list of extracted values,
        e.g. records_info['report-numbers'][5] == ['SUT-DP-92-70-5']
        (an earlier docstring claimed "a 4 list of dictionaries",
        which does not match the return statement).

    NB: stuff here is for analysing new or changed records.
    see "ref_analyzer" for more.
    """
    begin_time = os.times()[4]

    records_info = {
        'report-numbers': {},
        'journals': {},
        'doi': {},
        'hdl': {},
        'isbn': {},
        'record_id': {},
    }

    references_info = {
        'report-numbers': {},
        'journals': {},
        'doi': {},
        'record_id': {},
        'isbn': {},
        'hdl': {},
    }

    # perform quick check to see if there are some records with
    # reference tags, because otherwise get.cit.inf would be slow even
    # if there is nothing to index:
    for done, recid in enumerate(recid_list):
        if done % 10 == 0:
            task_sleep_now_if_required()

        if done % 50 == 0:
            mesg = "get cit.inf done %s of %s" % (done, len(recid_list))
            write_message(mesg)
            task_update_progress(mesg)

        record = get_record(recid)
        records_info['record_id'][recid] = [unicode(recid)]

        function = config.get("rank_method", "function")
        if config.get(function, 'collections'):
            if recid not in recids_cache(config.get(function,
                                                    'collections')):
                # do not treat this record since it is not in the collections
                # we want to process
                continue
        elif recid in deleted_recids_cache():
            # do not treat this record since it was deleted; we
            # skip it like this in case it was only soft-deleted
            # e.g. via bibedit (i.e. when collection tag 980 is
            # DELETED but other tags like report number or journal
            # publication info remained the same, so the calls to
            # get_fieldvalues() below would return old values)
            continue

        if tags['refs_report_number']:
            references_info['report-numbers'][recid] = \
                [t.value for t in
                 record.find_subfields(tags['refs_report_number'])]
            msg = "references_info['report-numbers'][%s] = %r" \
                % (recid, references_info['report-numbers'][recid])
            write_message(msg, verbose=9)

        if tags['refs_journal']:
            references_info['journals'][recid] = []
            for ref in record.find_subfields(tags['refs_journal']):
                try:
                    # Inspire specific parsing
                    journal, volume, page = ref.value.split(',')
                except ValueError:
                    pass
                else:
                    # Also record the alternative volume form, if any
                    alt_volume = get_alt_volume(volume)
                    if alt_volume:
                        alt_ref = ','.join([journal, alt_volume, page])
                        references_info['journals'][recid] += [alt_ref]
                references_info['journals'][recid] += [ref.value]
            msg = "references_info['journals'][%s] = %r" \
                % (recid, references_info['journals'][recid])
            write_message(msg, verbose=9)

        if tags['refs_doi']:
            references = [t.value for t in
                          record.find_subfields(tags['refs_doi'])]
            dois = []
            hdls = []
            # A bare value with no prefix is treated as a DOI
            for ref in references:
                if ref.startswith("hdl:"):
                    hdls.append(ref[4:])
                elif ref.startswith("doi:"):
                    dois.append(ref[4:])
                else:
                    dois.append(ref)
            references_info['doi'][recid] = dois
            references_info['hdl'][recid] = hdls

            msg = "references_info['doi'][%s] = %r" % (recid, dois)
            write_message(msg, verbose=9)
            msg = "references_info['hdl'][%s] = %r" % (recid, hdls)
            write_message(msg, verbose=9)

        if tags['refs_record_id']:
            references_info['record_id'][recid] = \
                [t.value for t in
                 record.find_subfields(tags['refs_record_id'])]
            msg = "references_info['record_id'][%s] = %r" \
                % (recid, references_info['record_id'][recid])
            write_message(msg, verbose=9)

        if tags['refs_isbn']:
            references_info['isbn'][recid] = \
                [t.value for t in
                 record.find_subfields(tags['refs_isbn'])]
            msg = "references_info['isbn'][%s] = %r" \
                % (recid, references_info['isbn'][recid])
            write_message(msg, verbose=9)

        if not fetch_catchup_info:
            # We do not need the extra info
            continue

        if tags['record_pri_number'] or tags['record_add_number']:
            records_info['report-numbers'][recid] = []

            if tags['record_pri_number']:
                records_info['report-numbers'][recid] += \
                    [t.value for t in
                     record.find_subfields(tags['record_pri_number'])]

            if tags['record_add_number']:
                records_info['report-numbers'][recid] += \
                    [t.value for t in
                     record.find_subfields(tags['record_add_number'])]

            msg = "records_info[%s]['report-numbers'] = %r" \
                % (recid, records_info['report-numbers'][recid])
            write_message(msg, verbose=9)

        if tags['doi']:
            records_info['doi'][recid] = []
            records_info['hdl'][recid] = []
            for tag in tags['doi']:
                # $2 distinguishes DOI from HDL identifiers
                for field in record.find_fields(tag[:5]):
                    if 'DOI' in field.get_subfield_values('2'):
                        dois = field.get_subfield_values('a')
                        records_info['doi'][recid].extend(dois)
                    elif 'HDL' in field.get_subfield_values('2'):
                        hdls = field.get_subfield_values('a')
                        records_info['hdl'][recid].extend(hdls)

            msg = "records_info[%s]['doi'] = %r" \
                % (recid, records_info['doi'][recid])
            write_message(msg, verbose=9)
            msg = "records_info[%s]['hdl'] = %r" \
                % (recid, records_info['hdl'][recid])
            write_message(msg, verbose=9)

        if tags['isbn']:
            records_info['isbn'][recid] = []
            for tag in tags['isbn']:
                values = [t.value for t in record.find_subfields(tag)]
                records_info['isbn'][recid] += values

            msg = "records_info[%s]['isbn'] = %r" \
                % (recid, records_info['isbn'][recid])
            write_message(msg, verbose=9)

        # get a combination of
        # journal vol (year) pages
        if tags['publication']:
            records_info['journals'][recid] = get_journal_info(record, tags)
            msg = "records_info[%s]['journals'] = %r" \
                % (recid, records_info['journals'][recid])
            write_message(msg, verbose=9)

    mesg = "get cit.inf done fully"
    write_message(mesg)
    task_update_progress(mesg)

    end_time = os.times()[4]
    write_message("Execution time for generating citation info "
                  "from record: %.2f sec" % (end_time - begin_time))

    return records_info, references_info
def cb_process_one(recid):
    """Queue ``recid`` for reference extraction when it has fulltext
    but no reference fields (999C5/999C6) yet."""
    record = get_record(recid)
    has_references = (record.find_fields('999C5') or
                      record.find_fields('999C6'))
    if not has_references and record_has_fulltext(recid):
        refextract.add(recid)