def get_ids_from_recid(recid):
    """Get all relevant identifiers from metadata of local record.

    Returns a list of strings:
    [recid, doi, eprint, other_id, system_number] + reportnumbers,
    with newlines/carriage returns stripped from every value.
    Missing identifiers are returned as empty strings.
    """
    record = get_record(recid)
    # Retrieving DOI
    doi = ""
    dois = record_get_field_values(record, '024', '7', code='a')
    # Only values that look like DOIs (registrant prefixes start with "10.")
    dois = [doi for doi in dois if doi.startswith('10.')]
    if len(dois) > 1:
        # More than one DOI: warn, but still use the first one found.
        print >> sys.stderr, "WARNING: record %s have more than one DOI: %s" % (recid, dois)
        doi = dois[0]
    elif len(dois) == 1:
        doi = dois[0]
    # Retrieving arXiv eprint
    eprint = ""
    eprints = record_get_field_values(record, '035', code='a')
    # Strip the OAI prefix; match case-insensitively but slice assuming the
    # canonical 'oai:arXiv.org:' prefix length.
    eprints = [an_eprint[len('oai:arXiv.org:'):] for an_eprint in eprints if an_eprint.lower().startswith('oai:arxiv.org:')]
    if len(eprints) > 1:
        # Same policy as for DOIs: warn and take the first.
        print >> sys.stderr, "WARNING: record %s have more than one arXiv eprint: %s" % (recid, eprints)
        eprint = eprints[0]
    elif len(eprints) == 1:
        eprint = eprints[0]
    # Retrieving Other service ID (035 tagged with the partner site name);
    # if several match, the last one wins.
    other_id = ''
    for field in record_get_field_instances(record, '035'):
        subfields = dict(field_get_subfield_instances(field))
        if subfields.get('9', '').upper() == CFG_OTHER_SITE.upper() and subfields.get('a'):
            other_id = subfields['a']
    # On INSPIRE, fall back to a CDS id hidden in a 595 note field,
    # keeping only the trailing numeric part (e.g. "CDS-1234" -> "1234").
    if CFG_INSPIRE_SITE and not other_id:
        for field in record_get_field_instances(record, '595'):
            subfields = dict(field_get_subfield_instances(field))
            if "CDS" in subfields.get('a', '').upper():
                other_id = subfields.get('a', 0).split("-")[-1]
                try:
                    int(other_id)
                except ValueError:
                    # Not an integer, we move on
                    other_id = ''
    reportnumbers = record_get_field_values(record, '037', code='a')
    # Legacy SPIRES system number (970__a), trailing part after the dash.
    system_number = ""
    if CFG_INSPIRE_SITE:
        for value in record_get_field_values(record, '970',
                                             filter_subfield_code="a",
                                             filter_subfield_value="SPIRES",
                                             filter_subfield_mode="s"):
            system_number = value.split("-")[-1]
            break  # There is typically only one
    out = [str(recid), doi, eprint, other_id, system_number] + reportnumbers
    # Sanitize: the output is consumed line/field-wise, so embedded newlines
    # would corrupt it.
    return [val.replace('\n', ' ').replace('\r', '') for val in out]
def check_records(records, field):
    """Normalize '. ' -> '.' in the given field of each record.

    For any field other than 999C5s the replacement is applied value by
    value.  For 999C5s (citation pubnotes) the subfield list is patched in
    place and, when the pubnote had no record match ($$0) yet, a search on
    the new pubnote is attempted to add one.
    """
    for record in records:
        if field != '999C5s':
            # Simple case: amend each value of the requested field directly.
            for position, value in record.iterfields([field]):
                newval = value.replace('. ', '.')
                if newval != value:
                    record.amend_field(position, newval)
            continue
        # 999C5s case: operate on the citation fields' subfield lists.
        for afield in record_get_field_instances(record, '999', 'C', '5'):
            subfields = field_get_subfield_instances(afield)
            subfields_dict = dict(subfields)
            if 's' in subfields_dict:
                old_pubnote = subfields_dict['s']
                new_pubnote = old_pubnote.replace('. ', '.')
                if old_pubnote != new_pubnote:
                    # Replace the pubnote subfield in place.
                    subfields.remove(('s', old_pubnote))
                    subfields.append(('s', new_pubnote))
                    if not '0' in subfields_dict:
                        # No record match yet: try to find exactly one record
                        # for the normalized pubnote.
                        recids = perform_request_search(p=new_pubnote, f='journal')
                        if len(recids) == 1:
                            recid = recids.pop()
                            subfields.append(('0', str(recid)))
                            record.set_amended(
                                "Pubnote changed from %s to %s and matched a new record %s: Sam is the best, HURRAY!!!"
                                % (old_pubnote, new_pubnote, recid))
                            continue
                    record.set_amended("Pubnote changed from %s to %s" % (old_pubnote, new_pubnote))
def check_records(records):
    """
    Update field 700__i:
    * Replace substring INSPIRE-00227069 with INSPIRE-00341324
      when subfield __a is equal to "Yang, Yi" AND __u is equal to
      "Beijing, Inst. High Energy Phys.".
    * ADD subfield __i INSPIRE-00341324 under the same author/affiliation
      condition IF subfield __i does not exist.

    Note: 100 fields are scanned as well as 700 ones.
    """
    for record in records:
        for field in record_get_field_instances(
                record, tag="100") + record_get_field_instances(record, "700"):
            subfields = field_get_subfield_instances(field)
            subfields_dict = dict(subfields)
            if subfields_dict.get('a') == 'Yang, Yi' and subfields_dict.get(
                    'u') == 'Beijing, Inst. High Energy Phys.':
                if 'i' not in subfields_dict:
                    # No INSPIRE ID yet: add it (mutates the field in place).
                    subfields.append(('i', 'INSPIRE-00341324'))
                    record.set_amended('Added INSPIRE-00341324 to Yang, Yi')
                else:
                    # An __i exists: fix it only if it carries the wrong ID.
                    for i, (code, value) in enumerate(subfields):
                        if code == 'i' and 'INSPIRE-00227069' in value:
                            subfields[i] = ('i', 'INSPIRE-00341324')
                            record.set_amended(
                                'Corrected INSPIRE-00227069 with INSPIRE-00341324 for Yang, Yi'
                            )
def check_records(records):
    """Rewrite broken 999C5 pubnotes and try to match them to a record.

    Only citations that have a pubnote ($$s) but no record match ($$0) are
    considered.  A pubnote matching RE_BROKEN_PUBNOTES is rewritten to the
    'journal,volume,Pid' form; if a journal-index search on the new pubnote
    yields exactly one record, its id is added as $$0.
    """
    from invenio.bibrank import ConfigParser, CFG_ETCDIR
    from invenio.bibrank_citation_indexer import get_recids_matching_query
    config = ConfigParser.ConfigParser()
    # Citation-indexer configuration is needed by get_recids_matching_query.
    config.read("%s/bibrank/%s.cfg" % (CFG_ETCDIR, "citation"))
    for record in records:
        for field in record_get_field_instances(record, '999', 'C', '5'):
            subfields = field_get_subfield_instances(field)
            subfields_dict = dict(subfields)
            if '0' not in subfields_dict and 's' in subfields_dict:
                old_pubnote = subfields_dict['s']
                g = RE_BROKEN_PUBNOTES.match(old_pubnote)
                if g:
                    # Rebuild the pubnote; note the 'P' prefix on the id part.
                    new_pubnote = '%(journal)s,%(volume)s,P%(id)s' % g.groupdict(
                    )
                    subfields.remove(('s', old_pubnote))
                    subfields.append(('s', new_pubnote))
                    recids = get_recids_matching_query(p=new_pubnote,
                                                       f='journal',
                                                       config=config)
                    if len(recids) == 1:
                        recid = recids.pop()
                        subfields.append(('0', str(recid)))
                        record.set_amended(
                            "Pubnote changed from %s to %s and matched a new record %s: Sam is the best, HURRAY!!!"
                            % (old_pubnote, new_pubnote, recid))
                    else:
                        # Zero or ambiguous matches: amend the pubnote only.
                        record.set_amended("Pubnote changed from %s to %s" %
                                           (old_pubnote, new_pubnote))
def check_records(records):
    """Rewrite broken 999C5 pubnotes and try to match them to a record.

    Same logic as the yapf-formatted variant elsewhere in this file:
    citations with a pubnote ($$s) but no match ($$0) whose pubnote matches
    RE_BROKEN_PUBNOTES get rewritten to 'journal,volume,Pid'; a unique
    journal-index hit on the new pubnote is recorded as $$0.
    """
    from invenio.bibrank import ConfigParser, CFG_ETCDIR
    from invenio.bibrank_citation_indexer import get_recids_matching_query
    config = ConfigParser.ConfigParser()
    # Citation-indexer configuration is needed by get_recids_matching_query.
    config.read("%s/bibrank/%s.cfg" % (CFG_ETCDIR, "citation"))
    for record in records:
        for field in record_get_field_instances(record, '999', 'C', '5'):
            subfields = field_get_subfield_instances(field)
            subfields_dict = dict(subfields)
            if '0' not in subfields_dict and 's' in subfields_dict:
                old_pubnote = subfields_dict['s']
                g = RE_BROKEN_PUBNOTES.match(old_pubnote)
                if g:
                    # Rebuild the pubnote; note the 'P' prefix on the id part.
                    new_pubnote = '%(journal)s,%(volume)s,P%(id)s' % g.groupdict()
                    subfields.remove(('s', old_pubnote))
                    subfields.append(('s', new_pubnote))
                    recids = get_recids_matching_query(p=new_pubnote, f='journal', config=config)
                    if len(recids) == 1:
                        recid = recids.pop()
                        subfields.append(('0', str(recid)))
                        record.set_amended("Pubnote changed from %s to %s and matched a new record %s: Sam is the best, HURRAY!!!" % (old_pubnote, new_pubnote, recid))
                    else:
                        # Zero or ambiguous matches: amend the pubnote only.
                        record.set_amended("Pubnote changed from %s to %s" % (old_pubnote, new_pubnote))
def get_ids_from_recid(recid): record = get_record(recid) ## Retrieving DOI doi = "" dois = record_get_field_values(record, "024", "7", code="a") dois = [doi for doi in dois if doi.startswith("10.")] if len(dois) > 1: print >> sys.stderr, "WARNING: record %s have more than one DOI: %s" % (recid, dois) elif len(dois) == 1: doi = dois[0] ## Retrieving arXiv eprint eprint = "" eprints = record_get_field_values(record, "035", code="a") eprints = [ an_eprint[len("oai:arXiv.org:") :] for an_eprint in eprints if an_eprint.lower().startswith("oai:arxiv.org:") ] if len(eprints) > 1: print >> sys.stderr, "WARNING: record %s have more than one arXiv eprint: %s" % (recid, eprints) elif len(eprints) == 1: eprint = eprints[0] ## Retrieving Other service ID other_id = "" for field in record_get_field_instances(record, "035"): subfields = dict(field_get_subfield_instances(field)) if subfields.get("9", "").upper() == CFG_OTHER_SITE.upper() and subfields.get("a"): other_id = subfields["a"] reportnumbers = record_get_field_values(record, "037", code="a") return [str(recid), doi, eprint, other_id] + reportnumbers
def check_records(records, field):
    """Normalize '. ' -> '.' in the given field of each record.

    For fields other than 999C5s the replacement is applied value by value.
    For 999C5s the citation subfield list is patched in place and, when no
    record match ($$0) exists yet, a search on the normalized pubnote is
    attempted to add one.
    """
    for record in records:
        if field != '999C5s':
            # Simple case: amend each value of the requested field directly.
            for position, value in record.iterfields([field]):
                newval = value.replace('. ', '.')
                if newval != value:
                    record.amend_field(position, newval)
            continue
        # 999C5s case: operate on the citation fields' subfield lists.
        for afield in record_get_field_instances(record, '999', 'C', '5'):
            subfields = field_get_subfield_instances(afield)
            subfields_dict = dict(subfields)
            if 's' in subfields_dict:
                old_pubnote = subfields_dict['s']
                new_pubnote = old_pubnote.replace('. ', '.')
                if old_pubnote != new_pubnote:
                    # Replace the pubnote subfield in place.
                    subfields.remove(('s', old_pubnote))
                    subfields.append(('s', new_pubnote))
                    if not '0' in subfields_dict:
                        # No record match yet: try to find exactly one record
                        # for the normalized pubnote.
                        recids = perform_request_search(p=new_pubnote, f='journal')
                        if len(recids) == 1:
                            recid = recids.pop()
                            subfields.append(('0', str(recid)))
                            record.set_amended("Pubnote changed from %s to %s and matched a new record %s: Sam is the best, HURRAY!!!" % (old_pubnote, new_pubnote, recid))
                            continue
                    record.set_amended("Pubnote changed from %s to %s" % (old_pubnote, new_pubnote))
def get_signatures_with_orcid(record):
    """Return a mapping {author name: ORCID} for a record.

    Scans all 100/700 author fields whose $$j subfield starts with 'ORCID:'
    (case-insensitive) and maps the $$a author name to the ORCID value with
    the prefix stripped.  If the same author name appears several times the
    last occurrence wins.

    Bug fix: the original unconditionally read subfields['a'], raising
    KeyError for a malformed field carrying an ORCID but no author name;
    such fields are now skipped.
    """
    out = {}
    for field in record_get_field_instances(record, '100') + record_get_field_instances(record, '700'):
        subfields = dict(field_get_subfield_instances(field))
        if subfields.get('j', '').upper().startswith('ORCID:'):
            author = subfields.get('a')
            if not author:
                # Malformed field: ORCID present but no $$a author name.
                continue
            out[author] = subfields['j'][len('ORCID:'):]
    return out
def create_our_record(recid):
    """Return MARCXML for `recid` containing only its deduplicated 980 fields.

    980 fields whose subfields are exactly [('a', 'unknown')] are dropped;
    the remaining ones are deduplicated via the OurInstance wrapper (so the
    resulting field order is arbitrary, as it goes through a set).
    """
    old_record = get_record(recid)
    instances = record_get_field_instances(old_record, '980')
    # OurInstance presumably provides __eq__/__hash__ over the field content
    # to make set-deduplication work — TODO confirm against its definition.
    new_instances = [l.field for l in set(OurInstance(i) for i in instances
                     if field_get_subfield_instances(i) != [('a', 'unknown')])]
    record = {}
    record_add_field(record, '001', controlfield_value=str(recid))
    record_add_fields(record, '980', new_instances)
    return print_rec(record)
def get_rn(revision):
    """Return the set of report numbers cited in a record revision.

    Parses the MARCXML of the given revision, walks its 999C5 citation
    fields, and collects every $$r (report number) value after passing it
    through tag_arxiv_more().

    Cleanup: the original iterated with enumerate() and never used the
    index, and addressed subfields by position (s[0]/s[1]); plain tuple
    unpacking is clearer.
    """
    rns = set()
    record = create_record(get_marcxml_of_revision_id(revision))[0]
    fields = record_get_field_instances(record, tag='999', ind1='C', ind2='5')
    for field in fields:
        for code, value in field_get_subfield_instances(field):
            if code == 'r':
                rns.add(tag_arxiv_more(value))
    return rns
def check_records(records):
    """Ensure authors listed in CHANGES carry their INSPIRE ID.

    For every 100/700 author field whose $$a appears in CHANGES:
    * if an $$i is present but differs from the expected ID, mark the
      record invalid;
    * if no $$i is present, append the expected ID and mark the record
      amended;
    * if the expected $$i is already there, do nothing.
    """
    for record in records:
        author_fields = (record_get_field_instances(record, '100')
                         + record_get_field_instances(record, '700'))
        for author_field in author_fields:
            subfield_list = field_get_subfield_instances(author_field)
            by_code = dict(subfield_list)
            if 'a' not in by_code:
                continue
            author = by_code['a']
            if author not in CHANGES:
                continue
            expected_id = CHANGES[author]
            if 'i' in by_code:
                if by_code['i'] != expected_id:
                    record.set_invalid("Author %s should have INSPIRE ID %s but has already INSPIRE ID %s" % (author, expected_id, by_code['i']))
            else:
                # Mutate the field's subfield list in place.
                subfield_list.append(('i', expected_id))
                record.set_amended("Added INSPIRE ID %s to author %s" % (expected_id, author))
def record_drop_fields_matching_pattern(record, pattern, fields, tag):
    """Remove fields matching given pattern from record.

    A field is dropped as soon as any one of its subfield values matches
    `pattern` (matched against the lowercased value, case-insensitively).
    Positions are collected first and deleted afterwards so that deletions
    do not interfere with the scan.
    """
    doomed = [
        (field[1], field[2], field[4])
        for field in fields
        if any(re.match(pattern, value.lower(), re.IGNORECASE)
               for _code, value in field_get_subfield_instances(field))
    ]
    for ind1, ind2, global_pos in doomed:
        record_delete_field(record, tag, ind1=ind1, ind2=ind2,
                            field_position_global=global_pos)
def check_records(records):
    """Translate CODEN-based 999C5 pubnotes to journal titles and (re)match.

    For each citation whose $$s pubnote matches RE_BROKEN_PUBNOTES, the
    CODEN is upper-cased and, if known, replaced by the journal title; the
    pubnote is rewritten as 'journal,volume,id'.  A journal-index search on
    the new pubnote then decides the outcome:
    * one hit equal to the existing $$0     -> amended (same known record)
    * one hit different from existing $$0   -> warning only
    * one hit, no $$0 yet                   -> $$0 appended, amended
    * zero or several hits                  -> pubnote amended, no match

    Bug fix: the warning branch read subfields_dict[0] (integer key) and
    raised KeyError — subfield codes are strings, so it must be
    subfields_dict['0'].
    """
    from invenio.bibrank import ConfigParser, CFG_ETCDIR
    from invenio.bibrank_citation_indexer import get_recids_matching_query
    codens = get_codens()
    config = ConfigParser.ConfigParser()
    # Citation-indexer configuration is needed by get_recids_matching_query.
    config.read("%s/bibrank/%s.cfg" % (CFG_ETCDIR, "citation"))
    for record in records:
        for field in record_get_field_instances(record, '999', 'C', '5'):
            subfields = field_get_subfield_instances(field)
            subfields_dict = dict(subfields)
            if 's' not in subfields_dict:
                continue
            old_pubnote = subfields_dict['s']
            g = RE_BROKEN_PUBNOTES.match(old_pubnote)
            if not g:
                continue
            new_groupdict = g.groupdict()
            new_groupdict['coden'] = new_groupdict['coden'].upper()
            if new_groupdict['coden'] in codens:
                new_groupdict['journal'] = codens[new_groupdict['coden']]
            new_pubnote = '%(journal)s,%(volume)s,%(id)s' % new_groupdict
            if new_pubnote == old_pubnote:
                # No change, e.g. due to JINST == JINST
                continue
            subfields.remove(('s', old_pubnote))
            subfields.append(('s', new_pubnote))
            recids = get_recids_matching_query(p=new_pubnote,
                                               f='journal',
                                               config=config)
            if len(recids) == 1:
                recid = recids.pop()
                if '0' in subfields_dict:
                    if str(recid) == subfields_dict['0']:
                        record.set_amended(
                            "Pubnote changed from %s to %s and matched the same known record %s"
                            % (old_pubnote, new_pubnote, recid))
                    else:
                        # BUG FIX: was subfields_dict[0] -> KeyError.
                        record.warn(
                            "Pubnote changed from %s to %s and matched a different record %s (instead of %s)!"
                            % (old_pubnote, new_pubnote, recid,
                               subfields_dict['0']))
                else:
                    subfields.append(('0', str(recid)))
                    record.set_amended(
                        "Pubnote changed from %s to %s and matched a new record %s: Sam is the best, HURRAY!!!"
                        % (old_pubnote, new_pubnote, recid))
            else:
                record.set_amended("Pubnote changed from %s to %s" %
                                   (old_pubnote, new_pubnote))
def record_drop_fields_matching_pattern(record, pattern, fields, tag):
    """Remove fields matching given pattern from record.

    A field is dropped as soon as any one of its subfield values matches
    `pattern` (value is lowercased; re.IGNORECASE is also passed, which is
    redundant but harmless).  Positions are collected first and deleted in
    a second pass so deletions do not disturb the iteration.
    """
    field_positions = []
    for field in fields:
        subfields = field_get_subfield_instances(field)
        for subfield in subfields:
            if re.match(pattern, subfield[1].lower(), re.IGNORECASE):
                # field[1]=ind1, field[2]=ind2, field[4]=global position.
                field_positions.append((field[1], field[2], field[4]))
                break
    for ind1, ind2, pos in field_positions:
        record_delete_field(record, tag, ind1=ind1, ind2=ind2,
                            field_position_global=pos)
def bst_hepnames_orcid_sync():
    """Sync ORCIDs from author profiles into their matching HepNames records.

    For every (canonical BAI, ORCID) pair in aidPERSONIDDATA:
    * find the HepNames record whose 035 matches the BAI;
    * if the record already has a 035 ORCID field, warn when it is empty or
      conflicts with the profile ORCID;
    * otherwise queue a correction record adding the ORCID, and upload all
      queued corrections in one ChunkedBibUpload at the end.
    Prints summary counters when done.
    """
    bai_orcids = run_sql("SELECT bai.data, orcid.data FROM aidPERSONIDDATA as bai JOIN aidPERSONIDDATA as orcid ON bai.personid=orcid.personid WHERE orcid.tag='extid:ORCID' AND bai.tag='canonical_name'")
    recs = []
    not_matched_profiles = 0
    enhanced_records = 0
    conflicting_orcids = 0
    for bai, orcid in bai_orcids:
        recids = perform_request_search(p="035:%s" % bai, cc="HepNames")
        if len(recids) > 1:
            # Ambiguous: one profile must match exactly one HepNames record.
            write_message("WARNING: %s/author/profile/%s, %s matches more than one HepNames: %s" % (CFG_SITE_URL, bai, orcid, recids), stream=sys.stderr)
            not_matched_profiles += 1
        elif not recids:
            write_message("WARNING: %s/author/profile/%s, %s does not match any HepName" % (CFG_SITE_URL, bai, orcid), stream=sys.stderr)
            not_matched_profiles += 1
        else:
            recid = recids[0]
            record = get_record(recid)
            for field in record_get_field_instances(record, tag="035"):
                subfields = field_get_subfield_instances(field)
                subfields_dict = dict(subfields)
                if subfields_dict.get('9') == 'ORCID':
                    if subfields_dict.get('a') != orcid:
                        if not subfields_dict.get('a', '').strip():
                            # Empty ORCID subfield: warn and keep scanning the
                            # remaining 035 fields.
                            write_message("WARNING: record %s/record/%s has an empty ORCID" % (CFG_SITE_URL, recid), stream=sys.stderr)
                            continue
                        write_message("WARNING: record %s/record/%s matched by BAI %s/author/profile/%s has a different ORCID %s than the profile one: %s" % (CFG_SITE_URL, recid, CFG_SITE_URL, bai, subfields_dict.get('a'), orcid), stream=sys.stderr)
                        conflicting_orcids += 1
                    # A non-empty ORCID field was found (same or conflicting):
                    # stop scanning; the for/else below is skipped.
                    break
            else:
                # No ORCID 035 field found: queue a correction record that
                # appends the profile ORCID.
                new_record = {}
                record_add_field(new_record, tag="001", controlfield_value=str(recid))
                record_add_field(new_record, tag="035", subfields=[('a', orcid), ('9', 'ORCID')])
                recs.append(new_record)
                write_message("INFO: adding ORCID %s to record %s/record/%s matched by BAI %s/author/profile/%s" % (orcid, CFG_SITE_URL, recid, CFG_SITE_URL, bai))
                enhanced_records += 1
    if recs:
        write_message("INFO: initiating uploads")
        # NOTE(review): the user name looks scrubbed ('******') — confirm the
        # intended bibupload user.
        bibupload = ChunkedBibUpload(mode="a", user='******')
        for record in recs:
            bibupload.add(record_xml_output(record))
        bibupload.cleanup()
    else:
        write_message("INFO: no modification are necessary")
    write_message("INFO: not_matched_profiles: %s, enhanced_records: %s, conflicting_orcids: %s" % (not_matched_profiles, enhanced_records, conflicting_orcids))
def build_hepnames_knowledge():
    """Build a list of identifier dicts for all HepNames records.

    For each HepNames record, every 035 field is reduced to one
    (id_type, id_value) pair taken from its $$9/$$a subfields; validated
    IDs are stored upper-cased under their type key, e.g.
    {'recid': ..., 'BAI': ..., 'ORIGINAL_BAI': ..., 'ORCID': ...}.
    Returns the list of these dicts (one per record).
    """
    recids = get_collection_reclist('HepNames')
    ret = {}
    for recid in recids:
        ids = {'recid': recid}
        record = get_record(recid)
        for field in record_get_field_instances(record, '035'):
            id_type = None
            id_value = None
            for code, value in field_get_subfield_instances(field):
                code = code.strip()
                value = value.strip()
                if code == '9':
                    # Conflicting repeated $$9 within one field is an error.
                    if id_type and id_type != value.upper():
                        write_message("ERROR: http://inspirehep.net/record/{recid} has invalid IDs".format(recid=recid), stream=sys.stderr)
                        break
                    id_type = value.upper()
                if code == 'a':
                    # Conflicting repeated $$a within one field is an error.
                    if id_value and id_value != value:
                        write_message("ERROR: http://inspirehep.net/record/{recid} has invalid IDs".format(recid=recid), stream=sys.stderr)
                        break
                    id_value = value
            # NOTE(review): after a `break` above, a field with both id_type
            # and id_value already set still falls through to the validation
            # below and may be recorded — confirm this is intended.
            if not id_type or not id_value:
                # Incomplete IDs
                continue
            else:
                # Per-type syntactic validation; invalid IDs are skipped.
                if id_type == 'BAI':
                    if not valid_bai(id_value):
                        write_message("ERROR: http://inspirehep.net/record/{recid} has invalid BAI: {value}".format(recid=recid, value=id_value), stream=sys.stderr)
                        continue
                elif id_type == 'INSPIRE':
                    if not valid_inspire(id_value):
                        write_message("ERROR: http://inspirehep.net/record/{recid} has invalid INSPIRE: {value}".format(recid=recid, value=id_value), stream=sys.stderr)
                        continue
                elif id_type == 'ORCID':
                    if not valid_orcid(id_value):
                        write_message("ERROR: http://inspirehep.net/record/{recid} has invalid ORCID: {value}".format(recid=recid, value=id_value), stream=sys.stderr)
                        continue
                elif id_type == 'KAKEN':
                    if not valid_kaken(id_value):
                        write_message("ERROR: http://inspirehep.net/record/{recid} has invalid KAKEN: {value}".format(recid=recid, value=id_value), stream=sys.stderr)
                        continue
                ids[id_type] = id_value.upper()
                if id_type == 'BAI':
                    # Keep the original (case-preserved) BAI as well.
                    ids['ORIGINAL_BAI'] = id_value
        ret[recid] = ids
    return ret.values()
def check_records(records):
    """Ensure authors listed in CHANGES carry their INSPIRE ID.

    For every 100/700 author field whose $$a appears in CHANGES: mark the
    record invalid if a conflicting $$i is present, or append the expected
    $$i (mutating the field in place) if none exists.
    """
    for record in records:
        for field in record_get_field_instances(
                record, '100') + record_get_field_instances(record, '700'):
            subfields = field_get_subfield_instances(field)
            subfields_dict = dict(subfields)
            if 'a' in subfields_dict and subfields_dict['a'] in CHANGES:
                if 'i' in subfields_dict and subfields_dict['i'] != CHANGES[
                        subfields_dict['a']]:
                    # Existing ID conflicts with the expected one: flag only.
                    record.set_invalid(
                        "Author %s should have INSPIRE ID %s but has already INSPIRE ID %s"
                        % (subfields_dict['a'], CHANGES[subfields_dict['a']],
                           subfields_dict['i']))
                elif not 'i' in subfields_dict:
                    subfields.append(('i', CHANGES[subfields_dict['a']]))
                    record.set_amended(
                        "Added INSPIRE ID %s to author %s" %
                        (CHANGES[subfields_dict['a']], subfields_dict['a']))
def check_records(records):
    """Translate CODEN-based 999C5 pubnotes to journal titles and (re)match.

    Same behavior as the yapf-formatted variant elsewhere in this file:
    pubnotes matching RE_BROKEN_PUBNOTES get their CODEN upper-cased and
    translated to a journal title, and the rewritten pubnote is searched in
    the journal index to confirm/add/flag the $$0 record match.

    Bug fix: the warning branch read subfields_dict[0] (integer key) and
    raised KeyError — subfield codes are strings, so it must be
    subfields_dict['0'].
    """
    from invenio.bibrank import ConfigParser, CFG_ETCDIR
    from invenio.bibrank_citation_indexer import get_recids_matching_query
    codens = get_codens()
    config = ConfigParser.ConfigParser()
    # Citation-indexer configuration is needed by get_recids_matching_query.
    config.read("%s/bibrank/%s.cfg" % (CFG_ETCDIR, "citation"))
    for record in records:
        for field in record_get_field_instances(record, '999', 'C', '5'):
            subfields = field_get_subfield_instances(field)
            subfields_dict = dict(subfields)
            if 's' not in subfields_dict:
                continue
            old_pubnote = subfields_dict['s']
            g = RE_BROKEN_PUBNOTES.match(old_pubnote)
            if not g:
                continue
            new_groupdict = g.groupdict()
            new_groupdict['coden'] = new_groupdict['coden'].upper()
            if new_groupdict['coden'] in codens:
                new_groupdict['journal'] = codens[new_groupdict['coden']]
            new_pubnote = '%(journal)s,%(volume)s,%(id)s' % new_groupdict
            if new_pubnote == old_pubnote:
                # No change, e.g. due to JINST == JINST
                continue
            subfields.remove(('s', old_pubnote))
            subfields.append(('s', new_pubnote))
            recids = get_recids_matching_query(p=new_pubnote, f='journal', config=config)
            if len(recids) == 1:
                recid = recids.pop()
                if '0' in subfields_dict:
                    if str(recid) == subfields_dict['0']:
                        record.set_amended("Pubnote changed from %s to %s and matched the same known record %s" % (old_pubnote, new_pubnote, recid))
                    else:
                        # BUG FIX: was subfields_dict[0] -> KeyError.
                        record.warn("Pubnote changed from %s to %s and matched a different record %s (instead of %s)!" % (old_pubnote, new_pubnote, recid, subfields_dict['0']))
                else:
                    subfields.append(('0', str(recid)))
                    record.set_amended("Pubnote changed from %s to %s and matched a new record %s: Sam is the best, HURRAY!!!" % (old_pubnote, new_pubnote, recid))
            else:
                record.set_amended("Pubnote changed from %s to %s" % (old_pubnote, new_pubnote))
def check_records(records):
    """
    Update field 700__i:
    * Replace substring INSPIRE-00227069 with INSPIRE-00341324
      when subfield __a is equal to "Yang, Yi" AND __u is equal to
      "Beijing, Inst. High Energy Phys.".
    * ADD subfield __i INSPIRE-00341324 under the same author/affiliation
      condition IF subfield __i does not exist.

    Note: 100 fields are scanned as well as 700 ones.
    """
    for record in records:
        for field in record_get_field_instances(record, tag="100") + record_get_field_instances(record, "700"):
            subfields = field_get_subfield_instances(field)
            subfields_dict = dict(subfields)
            if subfields_dict.get('a') == 'Yang, Yi' and subfields_dict.get('u') == 'Beijing, Inst. High Energy Phys.':
                if 'i' not in subfields_dict:
                    # No INSPIRE ID yet: add it (mutates the field in place).
                    subfields.append(('i', 'INSPIRE-00341324'))
                    record.set_amended('Added INSPIRE-00341324 to Yang, Yi')
                else:
                    # An __i exists: fix it only if it carries the wrong ID.
                    for i, (code, value) in enumerate(subfields):
                        if code == 'i' and 'INSPIRE-00227069' in value:
                            subfields[i] = ('i', 'INSPIRE-00341324')
                            record.set_amended('Corrected INSPIRE-00227069 with INSPIRE-00341324 for Yang, Yi')
def get_ids_from_recid(recid): record = get_record(recid) ## Retrieving DOI doi = "" dois = record_get_field_values(record, '024', '7', code='a') dois = [doi for doi in dois if doi.startswith('10.')] if len(dois) > 1: print >> sys.stderr, "WARNING: record %s have more than one DOI: %s" % ( recid, dois) elif len(dois) == 1: doi = dois[0] ## Retrieving arXiv eprint eprint = "" eprints = record_get_field_values(record, '035', code='a') eprints = [ an_eprint[len('oai:arXiv.org:'):] for an_eprint in eprints if an_eprint.lower().startswith('oai:arxiv.org:') ] if len(eprints) > 1: print >> sys.stderr, "WARNING: record %s have more than one arXiv eprint: %s" % ( recid, eprints) elif len(eprints) == 1: eprint = eprints[0] ## Retrieving Other service ID other_id = '' for field in record_get_field_instances(record, '035'): subfields = dict(field_get_subfield_instances(field)) if subfields.get( '9', '').upper() == CFG_OTHER_SITE.upper() and subfields.get('a'): other_id = subfields['a'] reportnumbers = record_get_field_values(record, '037', code='a') return [str(recid), doi, eprint, other_id] + reportnumbers
def lazy_parser(collection, left_tags, right_tags, volume_subfield):
    """Lazily yield (key, value) mapping pairs from a collection's records.

    Generator.  For each record of `collection` and each non-empty value of
    the `right_tags` fields (tag strings of the form 'tttiis': tag, ind1,
    ind2, subfield code):
    * yield the identity pair (right_value, right_value);
    * for each `left_tags` field, yield (left_value, right_value), or
      (left_value, 'right_value;volume') when a volume subfield is present.
    """
    for recid in get_collection_reclist(collection):
        record = get_record(recid)
        for right_tag in right_tags:
            # right_tag[:3]=tag, [3]=ind1, [4]=ind2, [5]=subfield code.
            for right_value in record_get_field_values(
                    record, right_tag[:3], right_tag[3], right_tag[4],
                    right_tag[5]):
                if not right_value:
                    continue  # Empty metadata
                yield right_value, right_value
                for left_tag in left_tags:
                    for left_field in record_get_field_instances(
                            record, left_tag[:3], left_tag[3], left_tag[4]):
                        left_subfields = dict(
                            field_get_subfield_instances(left_field))
                        if left_tag[5] not in left_subfields:
                            continue  # Empty field
                        if volume_subfield in left_subfields:
                            yield left_subfields[left_tag[5]], '%s;%s' % (
                                right_value, left_subfields[volume_subfield])
                        else:
                            yield left_subfields[left_tag[5]], right_value
def add_other_id(other_id=None, doi="", eprint="", recid=None,
                 reportnumbers=None, all_recids=None):
    """Build MARCXML adding the partner site's record id to a local record.

    Tries to identify the local record for `other_id` (the partner site's
    record id) using, in order: the provided `recid` (validated against
    `all_recids`), the arXiv eprint, the DOI, and finally the report
    numbers.  Ambiguous matches or a conflicting existing back-pointer
    cause an error message on stderr and a None return.  On success returns
    the MARCXML of a correction record appending a
    035 $$9 CFG_OTHER_SITE $$a other_id field.

    Bug fixes:
    * `reportnumbers_ids &= all_recids()` called an intbitset, raising
      TypeError; `all_recids` is a set of record ids, not a callable.
    * `stored_recid != other_id` compared an int to a string, which is
      always true in Python 2, so every record with an existing (correct)
      back-pointer was reported as conflicting; compare as strings.
    """
    if all_recids is None:
        all_recids = get_all_recids()
    if reportnumbers is None:
        reportnumbers = []
    if recid is not None and recid not in all_recids:
        write_message(
            "WARNING: %s thought that their record %s had recid %s in %s but this seems wrong"
            % (CFG_OTHER_SITE, other_id, recid, CFG_THIS_SITE),
            stream=sys.stderr)
        recid = None
    # Fall back to arXiv eprint matching.
    if recid is None and eprint:
        arxiv_ids = search_pattern(
            p='oai:arXiv.org:%s' % (eprint, ), f='035__a', m='e') & all_recids
        if len(arxiv_ids) > 1:
            write_message(
                "ERROR: %s record %s matches more than one record in %s via arXiv eprint matching: %s"
                % (CFG_OTHER_SITE, other_id, CFG_THIS_SITE, arxiv_ids),
                stream=sys.stderr)
            return
        elif len(arxiv_ids) == 1:
            recid = arxiv_ids[0]
    # Fall back to DOI matching.
    if recid is None and doi:
        doi_ids = search_pattern(p='doi:"%s"' % doi) & all_recids
        if len(doi_ids) > 1:
            write_message(
                "ERROR: %s record %s matches more than one record in %s via DOI matching: %s"
                % (CFG_OTHER_SITE, other_id, CFG_THIS_SITE, doi_ids),
                stream=sys.stderr)
            return
        elif len(doi_ids) == 1:
            recid = doi_ids[0]
    # Fall back to report-number matching.
    if recid is None and reportnumbers:
        reportnumbers_ids = intbitset()
        for rn in reportnumbers:
            reportnumbers_ids |= search_pattern(p=rn, f='037__a', m='e')
        # BUG FIX: was `all_recids()` — intbitset is not callable.
        reportnumbers_ids &= all_recids
        if len(reportnumbers_ids) > 1:
            write_message(
                "ERROR: %s record %s matches more than one record in %s via reportnumber matching: %s"
                % (CFG_OTHER_SITE, other_id, CFG_THIS_SITE,
                   reportnumbers_ids),
                stream=sys.stderr)
            return
        elif len(reportnumbers_ids) == 1:
            recid = reportnumbers_ids[0]
    if recid:
        record = get_record(recid)
        fields = record_get_field_instances(record, '035')
        for field in fields:
            subfields = dict(field_get_subfield_instances(field))
            if CFG_OTHER_SITE.upper() == subfields.get('9', '').upper():
                # NOTE(review): int() raises ValueError on a non-numeric
                # stored id — assumed numeric here; confirm upstream data.
                stored_recid = int(subfields.get('a', 0))
                # BUG FIX: compare as strings (int != str is always true
                # in Python 2).
                if stored_recid and str(stored_recid) != str(other_id):
                    write_message(
                        "ERROR: %s record %s matches %s record %s which already points back to a different record %s in %s"
                        % (CFG_OTHER_SITE, other_id, CFG_THIS_SITE, recid,
                           stored_recid, CFG_OTHER_SITE),
                        stream=sys.stderr)
                    return
        rec = {}
        record_add_field(rec, '001', controlfield_value='%s' % recid)
        record_add_field(rec, '035', ind1=' ', ind2=' ',
                         subfields=(('9', CFG_OTHER_SITE), ('a', other_id)))
        return record_xml_output(rec)
# Record exists, fetch existing record existing_record = get_record(recid) if existing_record is None: # Did not find existing record in database holdingpen_records.append(record) continue # We remove 500 field temporary/brief entry from revision if record already exists fields_500 = record_get_field_instances(record, '500', ind1="%", ind2="%") if fields_500 is not None: field_positions = [] for field in fields_500: subfields = field_get_subfield_instances(field) for subfield in subfields: if re.match("^.?((temporary|brief) entry).?$", subfield[1].lower(), re.IGNORECASE): field_positions.append( (field[1], field[2], field[4])) for ind1, ind2, pos in field_positions: record_delete_field(record, '500', ind1=ind1, ind2=ind2, field_position_global=pos) # Now compare new version with existing one, returning a diff[tag] = (diffcode, [..]) # None - if field is the same for both records
def apply_filter(rec): """ Filters the record to be compatible within Inspire Parameters: * rec - dictionary: BibRecord structure Returns: dictionary, BibRecord structure """ # Move recid from 001 to 035 if not hidden cds_id = rec['001'][0][3] if not 'hidden' in [ x.lower() for x in record_get_field_values(rec, "980", code="a") ]: record_add_field(rec, '035', subfields=[('9', 'CDS'), ('a', cds_id)]) # Clear control fields record_strip_controlfields(rec) # Clear other uninteresting fields interesting_fields = [ "024", "041", "035", "037", "088", "100", "110", "111", "242", "245", "246", "260", "269", "300", "502", "650", "653", "693", "700", "710", "773", "856", "520", "500", "980" ] for tag in rec.keys(): if tag not in interesting_fields: record_delete_fields(rec, tag) # 980 Determine Collections collections = set([]) for value in record_get_field_values(rec, '980', code='a'): if 'NOTE' in value.upper(): collections.add('NOTE') if 'THESIS' in value.upper(): collections.add('THESIS') if 'CONFERENCEPAPER' in value.upper(): collections.add('ConferencePaper') if is_published(rec): collections.add("PUBLISHED") collections.add("CITEABLE") if not 'NOTE' in collections: # TODO: Move this to a KB kb = [ 'ATLAS-CONF-', 'CMS-PAS-', 'ATL-', 'CMS-DP-', 'ALICE-INT-', 'LHCb-PUB-' ] values = record_get_field_values(rec, "088", code='a') for val, rep in product(values, kb): if val.startswith(rep): collections.add('NOTE') break # 980 Arxiv tag if record_get_field_values(rec, '035', filter_subfield_code="a", filter_subfield_value="arXiv"): collections.add("arXiv") # 980 HEP && CORE collections.add('HEP') collections.add('CORE') # 980 Conference Note if not 'ConferencePaper' in collections: for value in record_get_field_values(rec, '962', code='n'): if value[-2:].isdigit(): collections.add('ConferencePaper') break record_delete_fields(rec, "980") intnote = record_get_field_values(rec, '690', filter_subfield_code="a", filter_subfield_value='INTNOTE') if intnote: val_088 = 
record_get_field_values(rec, '088', filter_subfield_code="a") for val in val_088: if 'CMS' in val: url = ('http://weblib.cern.ch/abstract?CERN-CMS' + val.split('CMS', 1)[-1]) record_add_field(rec, '856', ind1='4', subfields=[('u', url)]) # 041 Language languages = get_languages() language_fields = record_get_field_instances(rec, '041') record_delete_fields(rec, "041") for field in language_fields: subs = field_get_subfields(field) if 'a' in subs: if "eng" in subs['a']: continue new_value = translate_config(subs['a'][0], languages) new_subs = [('a', new_value)] record_add_field(rec, "041", subfields=new_subs) # 035 Externals scn_035_fields = record_get_field_instances(rec, '035') forbidden_values = [ "cercer", "inspire", "xx", "cern annual report", "cmscms", "wai01" ] for field in scn_035_fields: subs = field_get_subfields(field) if '9' in subs: if not 'a' in subs: continue for sub in subs['9']: if sub.lower() in forbidden_values: break else: # No forbidden values (We did not "break") suffixes = [s.lower() for s in subs['9']] if 'spires' in suffixes: new_subs = [('a', 'SPIRES-%s' % subs['a'][0])] record_add_field(rec, '970', subfields=new_subs) continue if 'a' in subs: for sub in subs['a']: if sub.lower() in forbidden_values: record_delete_field(rec, tag="035", field_position_global=field[4]) rep_088_fields = record_get_field_instances(rec, '088') for field in rep_088_fields: subs = field_get_subfields(field) if '9' in subs: for val in subs['9']: if val.startswith('P0') or val.startswith('CM-P0'): sf = [('9', 'CERN'), ('b', val)] record_add_field(rec, '595', subfields=sf) for key, val in field[0]: if key in ['a', '9'] and not val.startswith('SIS-'): record_add_field(rec, '037', subfields=[('a', val)]) record_delete_fields(rec, "088") # 037 Externals also... 
rep_037_fields = record_get_field_instances(rec, '037') for field in rep_037_fields: subs = field_get_subfields(field) if 'a' in subs: for value in subs['a']: if 'arXiv' in value: new_subs = [('a', value), ('9', 'arXiv')] for fld in record_get_field_instances(rec, '695'): for key, val in field_get_subfield_instances(fld): if key == 'a': new_subs.append(('c', val)) break nf = create_field(subfields=new_subs) record_replace_field(rec, '037', nf, field[4]) for key, val in field[0]: if key in ['a', '9'] and val.startswith('SIS-'): record_delete_field(rec, '037', field_position_global=field[4]) for field in record_get_field_instances(rec, '242'): record_add_field(rec, '246', subfields=field[0]) record_delete_fields(rec, '242') # 269 Date normalization for field in record_get_field_instances(rec, '269'): for idx, (key, value) in enumerate(field[0]): if key == "c": field[0][idx] = ("c", convert_date_to_iso(value)) record_delete_fields(rec, "260") if not 'THESIS' in collections: for field in record_get_field_instances(rec, '260'): record_add_field(rec, '269', subfields=field[0]) record_delete_fields(rec, '260') # 300 page number for field in record_get_field_instances(rec, '300'): for idx, (key, value) in enumerate(field[0]): if key == 'a': if "mult." 
not in value and value != " p": field[0][idx] = ('a', re.sub(r'[^\d-]+', '', value)) else: record_delete_field(rec, '300', field_position_global=field[4]) break # 100 & 700 punctuate author names author_names = record_get_field_instances(rec, '100') author_names.extend(record_get_field_instances(rec, '700')) for field in author_names: subs = field_get_subfields(field) if not 'i' in subs or 'XX' in subs['i']: if not 'j' in subs or 'YY' in subs['j']: for idx, (key, value) in enumerate(field[0]): if key == 'a': field[0][idx] = ('a', punctuate_authorname(value)) # 700 -> 701 Thesis supervisors if 'THESIS' in collections: for field in record_get_field_instances(rec, '700'): record_add_field(rec, '701', subfields=field[0]) record_delete_fields(rec, '700') # 501 move subfields fields_501 = record_get_field_instances(rec, '502') for idx, field in enumerate(fields_501): new_subs = [] for key, value in field[0]: if key == 'a': new_subs.append(('b', value)) elif key == 'b': new_subs.append(('c', value)) elif key == 'c': new_subs.append(('d', value)) else: new_subs.append((key, value)) fields_501[idx] = field_swap_subfields(field, new_subs) # 650 Translate Categories categories = get_categories() category_fields = record_get_field_instances(rec, '650', ind1='1', ind2='7') record_delete_fields(rec, "650") for field in category_fields: for idx, (key, value) in enumerate(field[0]): if key == 'a': new_value = translate_config(value, categories) if new_value != value: new_subs = [('2', 'INSPIRE'), ('a', new_value)] else: new_subs = [('2', 'SzGeCERN'), ('a', value)] record_add_field(rec, "650", ind1="1", ind2="7", subfields=new_subs) break # 653 Free Keywords for field in record_get_field_instances(rec, '653', ind1='1'): subs = field_get_subfields(field) new_subs = [] if 'a' in subs: for val in subs['a']: new_subs.extend([('9', 'author'), ('a', val)]) new_field = create_field(subfields=new_subs, ind1='1') record_replace_field(rec, '653', new_field, field_position_global=field[4]) 
experiments = get_experiments() # 693 Remove if 'not applicable' for field in record_get_field_instances(rec, '693'): subs = field_get_subfields(field) all_subs = subs.get('a', []) + subs.get('e', []) if 'not applicable' in [x.lower() for x in all_subs]: record_delete_field(rec, '693', field_position_global=field[4]) new_subs = [] experiment_a = "" experiment_e = "" for (key, value) in subs.iteritems(): if key == 'a': experiment_a = value[0] new_subs.append((key, value[0])) elif key == 'e': experiment_e = value[0] experiment = "%s---%s" % (experiment_a.replace(" ", "-"), experiment_e) translated_experiments = translate_config(experiment, experiments) new_subs.append(("e", translated_experiments)) record_delete_field(rec, tag="693", field_position_global=field[4]) record_add_field(rec, "693", subfields=new_subs) # 710 Collaboration for field in record_get_field_instances(rec, '710'): subs = field_get_subfield_instances(field) for idx, (key, value) in enumerate(subs[:]): if key == '5': subs.pop(idx) elif value.startswith('CERN. 
Geneva'): subs.pop(idx) if len(subs) == 0: record_delete_field(rec, '710', field_position_global=field[4]) # 773 journal translations journals = get_journals() for field in record_get_field_instances(rec, '773'): subs = field_get_subfield_instances(field) new_subs = [] for idx, (key, value) in enumerate(subs): if key == 'p': new_subs.append((key, translate_config(value, journals))) else: new_subs.append((key, value)) record_delete_field(rec, tag="773", field_position_global=field[4]) record_add_field(rec, "773", subfields=new_subs) # FFT (856) Dealing with graphs figure_counter = 0 for field in record_get_field_instances(rec, '856', ind1='4'): subs = field_get_subfields(field) newsubs = [] remove = False if 'z' in subs: is_figure = [s for s in subs['z'] if "figure" in s.lower()] if is_figure and 'u' in subs: is_subformat = [ s for s in subs['u'] if "subformat" in s.lower() ] if not is_subformat: url = subs['u'][0] if url.endswith(".pdf"): # We try to convert fd, local_url = mkstemp(suffix=os.path.basename(url), dir=CFG_TMPSHAREDDIR) os.close(fd) _print("Downloading %s into %s" % (url, local_url), verbose=5) plotfile = "" try: plotfile = download_url(url=url, download_to_file=local_url, timeout=30.0) except InvenioFileDownloadError: _print( "Download failed while attempting to reach %s. Skipping.." % (url, )) remove = True if plotfile: converted = convert_images([plotfile]) if converted: url = converted.pop() _print("Successfully converted %s to %s" % (local_url, url), verbose=5) else: _print("Conversion failed on %s" % (local_url, )) url = None remove = True if url: newsubs.append(('a', url)) newsubs.append(('t', 'Plot')) figure_counter += 1 if 'y' in subs: newsubs.append( ('d', "%05d %s" % (figure_counter, subs['y'][0]))) newsubs.append(('n', subs['y'][0])) else: # Get basename without extension. 
name = os.path.basename( os.path.splitext(subs['u'][0])[0]) newsubs.append( ('d', "%05d %s" % (figure_counter, name))) newsubs.append(('n', name)) if not newsubs and 'u' in subs: is_fulltext = [s for s in subs['u'] if ".pdf" in s] if is_fulltext: newsubs = [('t', 'INSPIRE-PUBLIC'), ('a', subs['u'][0])] if not newsubs and 'u' in subs: remove = True is_zipfile = [s for s in subs['u'] if ".zip" in s] if is_zipfile: url = is_zipfile[0] local_url = os.path.join(CFG_TMPSHAREDDIR, os.path.basename(url)) _print("Downloading %s into %s" % (url, local_url), verbose=5) zipped_archive = "" try: zipped_archive = download_url(url=is_zipfile[0], download_to_file=local_url, timeout=30.0) except InvenioFileDownloadError: _print( "Download failed while attempting to reach %s. Skipping.." % (is_zipfile[0], )) remove = True if zipped_archive: unzipped_archive = unzip(zipped_archive) list_of_pngs = locate("*.png", unzipped_archive) for png in list_of_pngs: if "_vti_" in png or "__MACOSX" in png: continue figure_counter += 1 plotsubs = [] plotsubs.append(('a', png)) caption = '%05d %s' % (figure_counter, os.path.basename(png)) plotsubs.append(('d', caption)) plotsubs.append(('t', 'Plot')) record_add_field(rec, 'FFT', subfields=plotsubs) if not remove and not newsubs and 'u' in subs: urls = ('http://cdsweb.cern.ch', 'http://cms.cern.ch', 'http://cmsdoc.cern.ch', 'http://documents.cern.ch', 'http://preprints.cern.ch', 'http://cds.cern.ch') for val in subs['u']: if any(url in val for url in urls): remove = True break if val.endswith('ps.gz'): remove = True if newsubs: record_add_field(rec, 'FFT', subfields=newsubs) remove = True if remove: record_delete_field(rec, '856', ind1='4', field_position_global=field[4]) # 500 - Preliminary results if "THESIS" not in collections: subs = [('a', "Preliminary results")] record_add_field(rec, "500", subfields=subs) for collection in collections: record_add_field(rec, '980', subfields=[('a', collection)]) return rec
def format_element(bfo, oai=0):
    """Produce MARCXML with enhanced fields.

    Adds:
      100/700/701/702  $x with Record ID of linked HepName,
                       $y with True/False if the signature is claimed
                       $z with Record ID of institution
                       $w with BAI of linked Profile
      371/110          $z with Record ID of institution
      119/502          $z with Record ID of institution
      999C5            $0 with on the fly discovered Record IDs (not for books)
      773              $0 with Record ID of corresponding Book or Proceeding or Report
                       $1 with Record ID of corresponding Journal
                       $2 with Record ID of corresponding Conference
      693/710          $0 with Record ID of corresponding experiment

    :param bfo: BibFormat object for the record being formatted
    :param oai: when truthy, namespace the output for OAI-PMH
        (``marc:`` prefixes and a leader line)
    :return: the enhanced record serialized as MARCXML string
    """
    record = bfo.get_record()
    recid = bfo.recID
    # Let's filter hidden fields
    if acc_authorize_action(bfo.user_info, "runbibedit")[0]:
        # Non-zero auth code -> not authorized: strip configured hidden tags.
        for tag in CFG_BIBFORMAT_HIDDEN_TAGS:
            if tag in record:
                del record[tag]
    else:
        # Authorized: expose attached documents as FFT fields,
        # one per latest-version bibdoc file.
        bibrecdocs = BibRecDocs(recid)
        for bibdocfile in bibrecdocs.list_latest_files():
            fft = [
                ("a", bibdocfile.fullpath),
                ("d", bibdocfile.description or ""),
                ("f", bibdocfile.format or ""),
                ("n", bibdocfile.name or ""),
                ("r", bibdocfile.status or ""),
                ("s", bibdocfile.cd.strftime("%Y-%m-%d %H:%M:%S")),
                ("t", bibdocfile.get_type()),
                ("v", str(bibdocfile.version)),
                ("z", bibdocfile.comment or ""),
            ]
            for flag in bibdocfile.flags:
                fft.append(("o", flag))
            record_add_field(record, "FFT", subfields=fft)
    is_institution = "INSTITUTION" in [collection.upper() for collection in bfo.fields("980__a")]
    if "100" in record or "700" in record:
        # name -> (personid, claim flag); flag == 2 means a claimed signature.
        signatures = dict(
            (name, (personid, flag))
            for name, personid, flag in run_sql(
                "SELECT name, personid, flag FROM aidPERSONIDPAPERS WHERE bibrec=%s AND flag>-2",
                (recid,)
            )
        )
        # Let's add signatures
        for field in (
            record_get_field_instances(record, "100")
            + record_get_field_instances(record, "700")
            + record_get_field_instances(record, "701")
            + record_get_field_instances(record, "702")
        ):
            subfields = field_get_subfield_instances(field)
            subfield_dict = dict(subfields)
            if "a" in subfield_dict:
                author_name = subfield_dict["a"]
                if "i" in subfield_dict:
                    # An INSPIRE ID is present: link the HepName directly.
                    inspire_id = subfield_dict["i"]
                    hepname_id = get_hepname_id_from_inspire_id(inspire_id)
                    if hepname_id:
                        subfields.append(("x", "%i" % hepname_id))
                        subfields.append(("y", "1"))
                else:
                    # Fall back to the disambiguation tables.
                    personid, flag = signatures.get(author_name, (None, None))
                    bai = get_personid_canonical_id().get(personid)
                    if bai:
                        subfields.append(("w", bai))
                        hepname_id = get_hepname_id(personid)
                        if hepname_id:
                            subfields.append(("x", "%i" % hepname_id))
                        subfields.append(("y", "%i" % (flag == 2)))
            # And matched affiliations
            # NOTE(review): appending to `subfields` while iterating it is
            # safe here only because appended codes never equal "u".
            if "u" in subfield_dict:
                for code, value in subfields:
                    if code == "u":
                        ids = get_institution_ids(value)
                        if len(ids) == 1:
                            subfields.append(("z", "%i" % ids[0]))
    # Thesis institution
    for field in record_get_field_instances(record, "502"):
        subfields = field_get_subfield_instances(field)
        subfield_dict = dict(subfields)
        if "c" in subfield_dict:
            for code, value in subfields:
                if code == "c":
                    ids = get_institution_ids(value)
                    if len(ids) == 1:
                        subfields.append(("z", "%i" % ids[0]))
    # Enhance affiliation in Experiments
    for field in record_get_field_instances(record, "119"):
        subfields = field_get_subfield_instances(field)
        subfield_dict = dict(subfields)
        if "u" in subfield_dict:
            for code, value in subfields:
                if code == "u":
                    ids = get_institution_ids(value)
                    if len(ids) == 1:
                        subfields.append(("z", "%i" % ids[0]))
    # Enhance affiliation in HepNames and Jobs and Institutions
    for field in record_get_field_instances(record, "371"):
        subfields = field_get_subfield_instances(field)
        subfield_dict = dict(subfields)
        if "a" in subfield_dict:
            for code, value in subfields:
                if code == "a":
                    ids = get_institution_ids(value)
                    if len(ids) == 1:
                        subfields.append(("z", "%i" % ids[0]))
    for field in record_get_field_instances(record, "110"):
        subfields = field_get_subfield_instances(field)
        subfield_dict = dict(subfields)
        if is_institution:
            # We try to resolve obsolete ICNs
            if "x" in subfield_dict:
                for code, value in subfields:
                    if code == "x":
                        ids = get_institution_ids(value)
                        if len(ids) == 1:
                            subfields.append(("z", "%i" % ids[0]))
        else:
            # In other collections institution is in a
            if "a" in subfield_dict:
                for code, value in subfields:
                    if code == "a":
                        ids = get_institution_ids(value)
                        if len(ids) == 1:
                            subfields.append(("z", "%i" % ids[0]))
    # Enhance citation: try to resolve references that carry no recid yet.
    for field in record_get_field_instances(record, "999", ind1="C", ind2="5"):
        subfields = field_get_subfield_instances(field)
        subfield_dict = dict(subfields)
        if "0" not in subfield_dict:
            matched_id = get_matched_id(subfields)
            if matched_id:
                subfields.append(("0", str(matched_id)))
    # Enhance CNUMs and Journals
    for field in record_get_field_instances(record, "773"):
        subfields = field_get_subfield_instances(field)
        for code, value in subfields:
            if code == "w":
                # Conference CNUMs
                recids = perform_request_search(p='111__g:"%s"' % value, cc="Conferences")
                if len(recids) == 1:
                    subfields.append(("2", str(recids.pop())))
                recids = perform_request_search(p='773__w:"%s" 980:PROCEEDINGS' % value)
                if recid in recids:
                    # We remove this very record, since it can be a proceedings
                    recids.remove(recid)
                if len(recids) == 1:
                    subfields.append(("0", str(recids.pop())))
            elif code == "p":
                # Journal title
                recids = perform_request_search(p='711__a:"%s"' % value, cc="Journals")
                if len(recids) == 1:
                    subfields.append(("1", str(recids.pop())))
            elif code == "z":
                # ISBN
                recids = find_isbn({"ISBN": value})
                if len(recids) == 1:
                    subfields.append(("0", str(recids.pop())))
            elif code == "r":
                # Report
                recids = perform_request_search(p='reportnumber:"%s"' % value)
                if len(recids) == 1:
                    subfields.append(("0", str(recids.pop())))
    # Enhance Experiments
    for field in record_get_field_instances(record, "693"):
        subfields = field_get_subfield_instances(field)
        for code, value in subfields:
            if code == "e":
                recids = perform_request_search(p='119__a:"%s"' % value, cc="Experiments")
                if len(recids) == 1:
                    subfields.append(("0", str(recids.pop())))
    # Enhance Experiments
    for field in record_get_field_instances(record, "710"):
        subfields = field_get_subfield_instances(field)
        for code, value in subfields:
            if code == "g":
                recids = perform_request_search(p='119__a:"%s"' % value, cc="Experiments")
                if len(recids) == 1:
                    subfields.append(("0", str(recids.pop())))
    # Add Creation date:
    if "961" in record:
        del record["961"]
    creation_date, modification_date = run_sql(
        "SELECT creation_date, modification_date FROM bibrec WHERE id=%s", (recid,)
    )[0]
    # NOTE(review): here $c carries the creation date and $x the modification
    # date — confirm the intended subfield codes against the 961 convention.
    record_add_field(
        record,
        "961",
        subfields=[("c", creation_date.strftime("%Y-%m-%d")), ("x", modification_date.strftime("%Y-%m-%d"))],
    )
    formatted_record = record_xml_output(record)
    if oai:
        # Rewrite the plain MARCXML tags into the marc: namespaced form
        # expected by OAI-PMH harvesters, adding a leader line.
        formatted_record = formatted_record.replace(
            "<record>",
            '<marc:record xmlns:marc="http://www.loc.gov/MARC21/slim" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.loc.gov/MARC21/slim http://www.loc.gov/standards/marcxml/schema/MARC21slim.xsd" type="Bibliographic">\n <marc:leader>00000coc 2200000uu 4500</marc:leader>',
        )
        formatted_record = formatted_record.replace(
            '<record xmlns="http://www.loc.gov/MARC21/slim">',
            '<marc:record xmlns:marc="http://www.loc.gov/MARC21/slim" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.loc.gov/MARC21/slim http://www.loc.gov/standards/marcxml/schema/MARC21slim.xsd" type="Bibliographic">\n <marc:leader>00000coc 2200000uu 4500</marc:leader>',
        )
        formatted_record = formatted_record.replace("</record", "</marc:record")
        formatted_record = formatted_record.replace("<controlfield", "<marc:controlfield")
        formatted_record = formatted_record.replace("</controlfield", "</marc:controlfield")
        formatted_record = formatted_record.replace("<datafield", "<marc:datafield")
        formatted_record = formatted_record.replace("</datafield", "</marc:datafield")
        formatted_record = formatted_record.replace("<subfield", "<marc:subfield")
        formatted_record = formatted_record.replace("</subfield", "</marc:subfield")
    return formatted_record
elif tag == "773": # Special check for publication notes to make sure we are # not adding double information. correct_773 = True for existing_773 in existing_field_list: if field_get_subfield_values(existing_773, 'p'): correct_773 = False fields_to_correct.append((tag, [existing_773])) if correct_773: fields_to_correct.append((tag, new_field_list)) elif (tag == "100" or tag == "700") and take_authors: # Take authors since $i is missing # Check if some $$i is missing from new records as well and report it missing_identifier_fields = [] for field in new_fields_authors: subfields = dict(field_get_subfield_instances(field)) if "i" not in subfields: missing_identifier_fields.append(field) if missing_identifier_fields: create_authorlist_ticket( [("700", missing_identifier_fields)], current_record_arxiv_id, "AUTHORS_long_list", missing_ids=True ) fields_to_correct.append((tag, new_field_list)) else: corrected_fields = [] if has_field_origin(new_field_list, "arXiv", "9"): for field in existing_field_list: if not "arXiv" in field_get_subfield_values(field, "9"): corrected_fields.append(field)
def add_other_id(other_id=None, doi="", eprint="", recid=None, reportnumbers=None, all_recids=None):
    """Match a remote record to a local one and link it via a 035 field.

    Tries, in order: the remote site's claimed ``recid``, the arXiv
    ``eprint``, the ``doi``, and finally the ``reportnumbers``.  On a unique
    match, returns MARCXML that adds ``035 $9 CFG_OTHER_SITE $a other_id``
    to the matched record; on an ambiguous or conflicting match, logs an
    error and returns None.

    :param other_id: identifier of the record on CFG_OTHER_SITE
    :param doi: DOI of the record, if any
    :param eprint: arXiv identifier of the record, if any
    :param recid: recid the other site believes the record has locally
    :param reportnumbers: report numbers to match against 037__a
    :param all_recids: intbitset of all valid local recids (fetched if None)
    :return: MARCXML string for the linking upload, or None
    """
    if all_recids is None:
        all_recids = get_all_recids()
    if reportnumbers is None:
        reportnumbers = []
    if recid is not None and recid not in all_recids:
        write_message(
            "WARNING: %s thought that their record %s had recid %s in %s but this seems wrong"
            % (CFG_OTHER_SITE, other_id, recid, CFG_THIS_SITE),
            stream=sys.stderr,
        )
        recid = None
    if recid is None and eprint:
        arxiv_ids = search_pattern(p="oai:arXiv.org:%s" % (eprint,), f="035__a", m="e") & all_recids
        if len(arxiv_ids) > 1:
            write_message(
                "ERROR: %s record %s matches more than one record in %s via arXiv eprint matching: %s"
                % (CFG_OTHER_SITE, other_id, CFG_THIS_SITE, arxiv_ids),
                stream=sys.stderr,
            )
            return
        elif len(arxiv_ids) == 1:
            recid = arxiv_ids[0]
    if recid is None and doi:
        doi_ids = search_pattern(p='doi:"%s"' % doi) & all_recids
        if len(doi_ids) > 1:
            write_message(
                "ERROR: %s record %s matches more than one record in %s via DOI matching: %s"
                % (CFG_OTHER_SITE, other_id, CFG_THIS_SITE, doi_ids),
                stream=sys.stderr,
            )
            return
        elif len(doi_ids) == 1:
            recid = doi_ids[0]
    if recid is None and reportnumbers:
        reportnumbers_ids = intbitset()
        for rn in reportnumbers:
            reportnumbers_ids |= search_pattern(p=rn, f="037__a", m="e")
        # BUGFIX: all_recids is an intbitset, not a callable; the previous
        # "all_recids()" raised TypeError before the restriction happened.
        reportnumbers_ids &= all_recids
        if len(reportnumbers_ids) > 1:
            write_message(
                "ERROR: %s record %s matches more than one record in %s via reportnumber matching: %s"
                % (CFG_OTHER_SITE, other_id, CFG_THIS_SITE, reportnumbers_ids),
                stream=sys.stderr,
            )
            return
        elif len(reportnumbers_ids) == 1:
            recid = reportnumbers_ids[0]
    if recid:
        record = get_record(recid)
        fields = record_get_field_instances(record, "035")
        for field in fields:
            subfields = dict(field_get_subfield_instances(field))
            if CFG_OTHER_SITE.upper() == subfields.get("9", "").upper():
                # BUGFIX: guard against non-numeric stored identifiers, and
                # compare numbers with numbers -- other_id may arrive as a
                # string, and int != str never matches, which used to flag
                # spurious conflicts (mirrors the newer add_other_id variant;
                # assumes other_id is numeric when a conflict is possible).
                try:
                    stored_recid = int(subfields.get("a", 0))
                except ValueError:
                    # Not an integer, we move on
                    continue
                if stored_recid and stored_recid != int(other_id):
                    write_message(
                        "ERROR: %s record %s matches %s record %s which already points back to a different record %s in %s"
                        % (CFG_OTHER_SITE, other_id, CFG_THIS_SITE, recid, stored_recid, CFG_OTHER_SITE),
                        stream=sys.stderr,
                    )
                    return
        rec = {}
        record_add_field(rec, "001", controlfield_value="%s" % recid)
        record_add_field(rec, "035", ind1=" ", ind2=" ", subfields=(("9", CFG_OTHER_SITE), ("a", other_id)))
        return record_xml_output(rec)
def get_ids_from_recid(recid): """Get all relevant identifiers from metadata of local record.""" record = get_record(recid) # Retrieving DOI doi = "" dois = record_get_field_values(record, '024', '7', code='a') dois = [doi for doi in dois if doi.startswith('10.')] if len(dois) > 1: print >> sys.stderr, "WARNING: record %s have more than one DOI: %s" % ( recid, dois) doi = dois[0] elif len(dois) == 1: doi = dois[0] # Retrieving arXiv eprint eprint = "" eprints = record_get_field_values(record, '035', code='a') eprints = [ an_eprint[len('oai:arXiv.org:'):] for an_eprint in eprints if an_eprint.lower().startswith('oai:arxiv.org:') ] if len(eprints) > 1: print >> sys.stderr, "WARNING: record %s have more than one arXiv eprint: %s" % ( recid, eprints) eprint = eprints[0] elif len(eprints) == 1: eprint = eprints[0] # Retrieving Other service ID other_id = '' for field in record_get_field_instances(record, '035'): subfields = dict(field_get_subfield_instances(field)) if subfields.get( '9', '').upper() == CFG_OTHER_SITE.upper() and subfields.get('a'): other_id = subfields['a'] if CFG_INSPIRE_SITE and not other_id: for field in record_get_field_instances(record, '595'): subfields = dict(field_get_subfield_instances(field)) if "CDS" in subfields.get('a', '').upper(): other_id = subfields.get('a', 0).split("-")[-1] try: int(other_id) except ValueError: # Not an integer, we move on other_id = '' reportnumbers = record_get_field_values(record, '037', code='a') system_number = "" if CFG_INSPIRE_SITE: for value in record_get_field_values(record, '970', filter_subfield_code="a", filter_subfield_value="SPIRES", filter_subfield_mode="s"): system_number = value.split("-")[-1] break # There is typically only one out = [str(recid), doi, eprint, other_id, system_number] + reportnumbers return [val.replace('\n', ' ').replace('\r', '') for val in out]
def get_bai(record): for field in record_get_field_instances(record, '035'): subfields = dict(field_get_subfield_instances(field)) if subfields.get('9', '').upper() == 'BAI': return subfields.get('a', '') return ''
def get_author_name(record): for field in record_get_field_instances(record, '100'): subfields = dict(field_get_subfield_instances(field)) if 'a' in subfields: return subfields['a'] return ''
def format_element(bfo, oai=0):
    """Produce MARCXML with enhanced fields.

    Adds:
      100/700/701/702  $x with Record ID of linked HepName,
                       $y with True/False if the signature is claimed
                       $z with Record ID of institution
                       $w with BAI of linked Profile
      371/110          $z with Record ID of institution
      119/502          $z with Record ID of institution
      999C5            $0 with on the fly discovered Record IDs (not for books)
      773              $0 with Record ID of corresponding Book or Proceeding or Report
                       $1 with Record ID of corresponding Journal
                       $2 with Record ID of corresponding Conference
      693/710          $0 with Record ID of corresponding experiment

    :param bfo: BibFormat object for the record being formatted
    :param oai: when truthy, namespace the output for OAI-PMH
        (``marc:`` prefixes and a leader line)
    :return: the enhanced record serialized as MARCXML string
    """
    # Zero auth code means the user may run bibedit, i.e. may see hidden data.
    can_see_hidden_stuff = not acc_authorize_action(bfo.user_info, 'runbibedit')[0]
    recid = bfo.recID
    if can_see_hidden_stuff and is_record_deleted(bfo):
        # Deleted record: recover its last known content from the history.
        record = salvage_deleted_record_from_history(recid)
    else:
        record = bfo.get_record()
    # Let's filter hidden fields
    if can_see_hidden_stuff:
        # Let's add bibdoc info: one FFT per latest-version bibdoc file.
        bibrecdocs = BibRecDocs(recid)
        for bibdocfile in bibrecdocs.list_latest_files():
            fft = [
                ('a', bibdocfile.fullpath),
                ('d', bibdocfile.description or ''),
                ('f', bibdocfile.format or ''),
                ('n', bibdocfile.name or ''),
                ('r', bibdocfile.status or ''),
                ('s', bibdocfile.cd.strftime('%Y-%m-%d %H:%M:%S')),
                ('t', bibdocfile.bibdoc.doctype),
                ('v', str(bibdocfile.version)),
                ('z', bibdocfile.comment or ''),
            ]
            for flag in bibdocfile.flags:
                fft.append(('o', flag))
            record_add_field(record, 'FFT', subfields=fft)
    else:
        # not authorized: strip every configured hidden tag.
        for tag in CFG_BIBFORMAT_HIDDEN_TAGS:
            if tag in record:
                del record[tag]
    is_institution = 'INSTITUTION' in [collection.upper() for collection in bfo.fields('980__a')]
    signatures = {}
    if '100' in record or '700' in record:
        # name -> (personid, claim flag); flag == 2 means a claimed signature.
        signatures = dict((name, (personid, flag)) for name, personid, flag in run_sql("SELECT name, personid, flag FROM aidPERSONIDPAPERS WHERE bibrec=%s AND flag>-2", (recid, )))
    # Let's add signatures
    for field in record_get_field_instances(record, '100') + record_get_field_instances(record, '700') + record_get_field_instances(record, '701') + record_get_field_instances(record, '702'):
        subfields = field_get_subfield_instances(field)
        subfield_dict = dict(subfields)
        if 'a' in subfield_dict:
            author_name = subfield_dict['a']
            personid, flag = signatures.get(author_name, (None, None))
            bai = get_personid_canonical_id().get(personid)
            if bai:
                subfields.append(('w', bai))
                hepname_id = get_hepname_id(personid)
                if hepname_id:
                    subfields.append(('x', '%i' % hepname_id))
                subfields.append(('y', '%i' % (flag == 2)))
        # And matched affiliations
        # NOTE(review): appending to `subfields` while iterating it is safe
        # here only because appended codes never equal 'u'.
        if 'u' in subfield_dict:
            for code, value in subfields:
                if code == 'u':
                    ids = get_institution_ids(value)
                    if len(ids) == 1:
                        subfields.append(('z', '%i' % ids[0]))
    # Thesis institution
    for field in record_get_field_instances(record, '502'):
        subfields = field_get_subfield_instances(field)
        subfield_dict = dict(subfields)
        if 'c' in subfield_dict:
            for code, value in subfields:
                if code == 'c':
                    ids = get_institution_ids(value)
                    if len(ids) == 1:
                        subfields.append(('z', '%i' % ids[0]))
    # Related institution
    for field in record_get_field_instances(record, '510'):
        subfields = field_get_subfield_instances(field)
        subfield_dict = dict(subfields)
        if 'a' in subfield_dict and not '0' in subfield_dict:
            ids = get_institution_ids(subfield_dict['a'])
            if len(ids) == 1:
                subfields.append(('0', '%i' % ids[0]))
    # Related journal
    # NOTE(review): this resolves 530 $a through get_institution_ids, same as
    # the 510 block above — confirm that is intended for journal titles.
    for field in record_get_field_instances(record, '530'):
        subfields = field_get_subfield_instances(field)
        subfield_dict = dict(subfields)
        if 'a' in subfield_dict and not '0' in subfield_dict:
            ids = get_institution_ids(subfield_dict['a'])
            if len(ids) == 1:
                subfields.append(('0', '%i' % ids[0]))
    # Enhance affiliation in Experiments
    for field in record_get_field_instances(record, '119'):
        subfields = field_get_subfield_instances(field)
        subfield_dict = dict(subfields)
        if 'u' in subfield_dict:
            for code, value in subfields:
                if code == 'u':
                    ids = get_institution_ids(value)
                    if len(ids) == 1:
                        subfields.append(('z', '%i' % ids[0]))
    # Enhance affiliation in HepNames and Jobs and Institutions and
    # naked affiliations in HEP
    for field in record_get_field_instances(record, '371') + record_get_field_instances(record, '902'):
        subfields = field_get_subfield_instances(field)
        subfield_dict = dict(subfields)
        if 'a' in subfield_dict:
            for code, value in subfields:
                if code == 'a':
                    ids = get_institution_ids(value)
                    if len(ids) == 1:
                        subfields.append(('z', '%i' % ids[0]))
    for field in record_get_field_instances(record, '110'):
        subfields = field_get_subfield_instances(field)
        subfield_dict = dict(subfields)
        if is_institution:
            # We try to resolve obsolete ICNs
            if 'x' in subfield_dict:
                for code, value in subfields:
                    if code == 'x':
                        ids = get_institution_ids(value)
                        if len(ids) == 1:
                            subfields.append(('z', '%i' % ids[0]))
        else:
            # In other collections institution is in a
            if 'a' in subfield_dict:
                for code, value in subfields:
                    if code == 'a':
                        ids = get_institution_ids(value)
                        if len(ids) == 1:
                            subfields.append(('z', '%i' % ids[0]))
    # Enhance citation
    for field in record_get_field_instances(record, '999', ind1='C', ind2='5'):
        subfields = field_get_subfield_instances(field)
        subfield_dict = dict(subfields)
        if '0' in subfield_dict:
            # Already available recid
            subfields.append(('z', '1'))
        else:
            matched_id = get_matched_id(subfields)
            if matched_id:
                subfields.append(('0', str(matched_id)))
    # Enhance related records (continues/continued-by/other relationship
    # fields): try to resolve a missing $w from reportnumber/ISBN clues.
    for field in (record_get_field_instances(record, '780', ind1='0', ind2='2') + record_get_field_instances(record, '785', ind1='0', ind2='2') + record_get_field_instances(record, '787', ind1='0', ind2='8')):
        subfields = field_get_subfield_instances(field)
        subfield_dict = dict(subfields)
        subfield_citation = []
        if subfield_dict.get('r'):
            # Reportnumber
            subfield_citation.append(('r', subfield_dict['r']))
        if subfield_dict.get('z'):
            # ISBN
            subfield_citation.append(('i', subfield_dict['z']))
        if 'w' not in subfield_dict and subfield_citation:
            matched_id = get_matched_id(subfield_citation)
            if matched_id:
                subfields.append(('w', str(matched_id)))
    # Enhance CNUMs and Journals
    for field in record_get_field_instances(record, '773'):
        subfields = field_get_subfield_instances(field)
        subfield_dict = dict(subfields)
        for code, value in subfields:
            if code == 'w':
                # Conference CNUMs
                recids = perform_request_search(p='111__g:"%s"' % value, cc='Conferences')
                if len(recids) == 1:
                    subfields.append(('2', str(recids.pop())))
                if '0' not in subfield_dict:
                    recids = perform_request_search(p='773__w:"%s" 980:PROCEEDINGS' % value)
                    if recid in recids:
                        # We remove this very record, since it can be a proceedings
                        recids.remove(recid)
                    if len(recids) == 1:
                        subfields.append(('0', str(recids.pop())))
            elif code == 'p':
                # Journal title
                recids = perform_request_search(p='711__a:"%s"' % value, cc='Journals')
                if len(recids) == 1:
                    subfields.append(('1', str(recids.pop())))
            elif code == 'z' and '0' not in subfield_dict:
                # ISBN
                recids = find_isbn({'ISBN': value})
                if len(recids) == 1:
                    subfields.append(('0', str(recids.pop())))
            elif code == 'r' and '0' not in subfield_dict:
                # Report
                recids = perform_request_search(p='reportnumber:"%s"' % value)
                if len(recids) == 1:
                    subfields.append(('0', str(recids.pop())))
    # Enhance Experiments
    for field in record_get_field_instances(record, '693'):
        subfields = field_get_subfield_instances(field)
        for code, value in subfields:
            if code == 'e':
                recids = perform_request_search(p='119__a:"%s"' % value, cc='Experiments')
                if len(recids) == 1:
                    subfields.append(('0', str(recids.pop())))
            elif code == 'a':
                recids = perform_request_search(p='119__b:"%s"' % value, cc='Experiments')
                if len(recids) == 1:
                    subfields.append(('0', str(recids.pop())))
    # Enhance Experiments
    for field in record_get_field_instances(record, '710'):
        subfields = field_get_subfield_instances(field)
        for code, value in subfields:
            if code == 'g':
                recids = perform_request_search(p='119__a:"%s"' % value, cc='Experiments')
                if len(recids) == 1:
                    subfields.append(('0', str(recids.pop())))
    # Add Creation date:
    if '961' in record:
        del record['961']
    creation_date, modification_date = run_sql("SELECT creation_date, modification_date FROM bibrec WHERE id=%s", (recid,))[0]
    # NOTE(review): here $x carries the creation date and $c the modification
    # date — confirm the intended subfield codes against the 961 convention.
    record_add_field(record, '961', subfields=[('x', creation_date.strftime('%Y-%m-%d')), ('c', modification_date.strftime('%Y-%m-%d'))])
    formatted_record = record_xml_output(record)
    if oai:
        # Rewrite the plain MARCXML tags into the marc: namespaced form
        # expected by OAI-PMH harvesters, adding a leader line.
        formatted_record = formatted_record.replace("<record>", "<marc:record xmlns:marc=\"http://www.loc.gov/MARC21/slim\" xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\" xsi:schemaLocation=\"http://www.loc.gov/MARC21/slim http://www.loc.gov/standards/marcxml/schema/MARC21slim.xsd\" type=\"Bibliographic\">\n <marc:leader>00000coc 2200000uu 4500</marc:leader>")
        formatted_record = formatted_record.replace("<record xmlns=\"http://www.loc.gov/MARC21/slim\">", "<marc:record xmlns:marc=\"http://www.loc.gov/MARC21/slim\" xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\" xsi:schemaLocation=\"http://www.loc.gov/MARC21/slim http://www.loc.gov/standards/marcxml/schema/MARC21slim.xsd\" type=\"Bibliographic\">\n <marc:leader>00000coc 2200000uu 4500</marc:leader>")
        formatted_record = formatted_record.replace("</record", "</marc:record")
        formatted_record = formatted_record.replace("<controlfield", "<marc:controlfield")
        formatted_record = formatted_record.replace("</controlfield", "</marc:controlfield")
        formatted_record = formatted_record.replace("<datafield", "<marc:datafield")
        formatted_record = formatted_record.replace("</datafield", "</marc:datafield")
        formatted_record = formatted_record.replace("<subfield", "<marc:subfield")
        formatted_record = formatted_record.replace("</subfield", "</marc:subfield")
    return formatted_record
def add_other_id(other_id=None, doi="", eprint="", recid=None, system_number=None,
                 reportnumbers=None, all_recids=None):
    """Search and match using given identifiers.

    Tries, in order: the remote site's claimed ``recid``, the arXiv
    ``eprint``, the ``doi``, the ``reportnumbers`` and (on CERN site) the
    legacy SPIRES ``system_number``.  On a unique, non-conflicting match,
    returns MARCXML that re-links the matched record to ``other_id`` via
    035 (or 595 for non-CERN-relevant papers on INSPIRE).  On an ambiguous
    match, returns ``[other_id] + candidate recids``; on an unresolved
    conflict, returns None.
    """
    query = ""
    if all_recids is None:
        all_recids = get_all_recids()
    if reportnumbers is None:
        reportnumbers = []
    if recid is not None:
        query = "existing recid"
        try:
            recid = int(recid)
        except ValueError:
            # Remote site sent a non-numeric recid: ignore it.
            recid = None
        if recid and recid not in all_recids:
            write_message("WARNING: %s thought that their record %s had recid %s in %s but this seems wrong" % (CFG_OTHER_SITE, other_id, recid, CFG_THIS_SITE), stream=sys.stderr)
            recid = None
    if recid is None and eprint:
        query = 'oai:arXiv.org:%s' % (eprint,)
        arxiv_ids = search_pattern(p=query, f='035__a', m='e') & all_recids
        if len(arxiv_ids) > 1:
            write_message("ERROR: %s record %s matches more than one record in %s via %s: %s" % (CFG_OTHER_SITE, other_id, CFG_THIS_SITE, query, arxiv_ids), stream=sys.stderr)
            # Ambiguous: hand back the candidates for manual resolution.
            return [other_id] + list(arxiv_ids)
        elif len(arxiv_ids) == 1:
            recid = arxiv_ids[0]
    if recid is None and doi:
        query = 'doi:"%s"' % doi
        doi_ids = search_pattern(p=query) & all_recids
        if len(doi_ids) > 1:
            write_message("ERROR: %s record %s matches more than one record in %s via %s: %s" % (CFG_OTHER_SITE, other_id, CFG_THIS_SITE, query, doi_ids), stream=sys.stderr)
            return [other_id] + list(doi_ids)
        elif len(doi_ids) == 1:
            recid = doi_ids[0]
    if recid is None and reportnumbers:
        # query is only used for logging; the actual search is the OR of the
        # exact per-reportnumber lookups below.
        query = "037__a:" + " OR 037__a:".join(reportnumbers)
        reportnumbers_ids = intbitset()
        for rn in reportnumbers:
            reportnumbers_ids |= search_pattern(p=rn, f='037__a', m='e')
        reportnumbers_ids &= all_recids
        if len(reportnumbers_ids) > 1:
            write_message("ERROR: %s record %s matches more than one record in %s via %s: %s" % (CFG_OTHER_SITE, other_id, CFG_THIS_SITE, query, reportnumbers_ids), stream=sys.stderr)
            return [other_id] + list(reportnumbers_ids)
        elif len(reportnumbers_ids) == 1:
            recid = reportnumbers_ids[0]
    if recid is None and system_number and CFG_CERN_SITE:
        # Legacy SPIRES system number, only meaningful on the CERN site.
        query = "035:%s 035:SPIRES" % (system_number,)
        system_number_ids = search_pattern(p=query)
        system_number_ids &= all_recids
        if len(system_number_ids) > 1:
            write_message("ERROR: %s record %s matches more than one record in %s via %s: %s" % (CFG_OTHER_SITE, other_id, CFG_THIS_SITE, query, system_number_ids), stream=sys.stderr)
            return [other_id] + list(system_number_ids)
        elif len(system_number_ids) == 1:
            recid = system_number_ids[0]
    if recid:
        recid = int(recid)
        record = get_record(recid)
        # Conflict detection: does the matched record already point to a
        # DIFFERENT remote record in 035?
        fields = record_get_field_instances(record, '035')
        for field in fields:
            subfields = dict(field_get_subfield_instances(field))
            if CFG_OTHER_SITE.upper() == subfields.get('9', '').upper():
                stored_recid = subfields.get('a', 0)
                try:
                    stored_recid = int(stored_recid)
                except ValueError:
                    # Not an integer, we move on and add the new ID.
                    continue
                if stored_recid and int(stored_recid) != int(other_id):
                    write_message("ERROR: %s record %s matches %s record %s which already points back to a different record %s in %s" % (CFG_OTHER_SITE, other_id, CFG_THIS_SITE, recid, stored_recid, CFG_OTHER_SITE), stream=sys.stderr)
                    if CFG_INSPIRE_SITE and int(other_id) not in CERN_IDS:
                        # Stale 035 for a non-core-CERN paper: proceed, the
                        # link will be rewritten into 595 below.
                        write_message("INFO: ID was found in 035 but the record is not core CERN hence it should be moved into 595")
                    else:
                        return
        if CFG_INSPIRE_SITE:
            # Same conflict detection for CDS links stored in 595 notes.
            fields = record_get_field_instances(record, '595')
            for field in fields:
                subfields = dict(field_get_subfield_instances(field))
                if "CDS" in subfields.get('a', '').upper():
                    stored_recid = subfields.get('a', 0).split("-")[-1]
                    try:
                        stored_recid = int(stored_recid)
                    except ValueError:
                        # Not an integer, we move on and add the new ID.
                        continue
                    if stored_recid and int(stored_recid) != int(other_id):
                        write_message("ERROR: %s record %s matches %s record %s which already points back to a different record %s in %s" % (CFG_OTHER_SITE, other_id, CFG_THIS_SITE, recid, stored_recid, CFG_OTHER_SITE), stream=sys.stderr)
                        if int(other_id) in CERN_IDS:
                            write_message("INFO: ID was found in 595 but the record is core CERN hence it should be moved into 035")
                        else:
                            return
        write_message("Matched {1}/{0} to {3}/{2} with {4}".format(
            other_id, CFG_OTHER_URL, recid, CFG_THIS_URL, query
        ))
        # Build the correcting record: keep every 035/595 except the (possibly
        # stale) link to other_id, which is re-added in the right place below.
        rec = {}
        record_add_field(rec, '001', controlfield_value='%s' % recid)
        # Let's filter out previous values in 035/595
        for field in record_get_field_instances(record, '035'):
            subfields = field_get_subfield_instances(field)
            subfields_dict = dict(subfields)
            if subfields_dict.get('a') != str(other_id) or subfields_dict.get('9') != CFG_OTHER_SITE:
                record_add_field(rec, '035', subfields=subfields)
        for field in record_get_field_instances(record, '595'):
            subfields = field_get_subfield_instances(field)
            subfields_dict = dict(subfields)
            if subfields_dict.get('a') != "CDS-{0}".format(other_id) or subfields_dict.get('9') != 'CERN':
                record_add_field(rec, '595', subfields=subfields)
        if CFG_INSPIRE_SITE:
            if int(other_id) in CERN_IDS:
                # Core-CERN paper: the CDS link belongs in a visible 035.
                write_message("CERN relevant paper: adding 035")
                record_add_field(rec, '035', ind1=' ', ind2=' ', subfields=(('9', CFG_OTHER_SITE), ('a', other_id)))
            else:
                # Otherwise keep the link as an internal 595 note.
                write_message("Non-CERN relevant paper: adding 595")
                record_add_field(rec, '595', ind1=' ', ind2=' ', subfields=(('9', "CERN"), ('a', "CDS-{0}".format(other_id))))
        else:
            record_add_field(rec, '035', ind1=' ', ind2=' ', subfields=(('9', CFG_OTHER_SITE), ('a', other_id)))
        return record_xml_output(rec)
def author_merger(fields1, fields2, tag):
    """Merge two lists of author fields for `tag` and return the merged list.

    Picks a trusted and an untrusted list (by origin, falling back to
    `_get_best_fields` when origins are equal) and enriches each trusted
    author field with subfields that only the matching untrusted field has.

    :param fields1: first list of bibrecord author field instances
    :param fields2: second list of bibrecord author field instances
    :param tag: MARC tag being merged (used for logging / trust lookup)
    :return: the merged (trusted) list of field instances
    """
    # If one of the two lists is empty there is nothing to merge.
    if len(fields1) == 0 or len(fields2) == 0:
        logger.info(' Only one field for "%s".' % tag)
        return fields1 + fields2
    # Work on local copies: the merge below mutates subfield lists.
    fields1 = deepcopy(fields1)
    fields2 = deepcopy(fields2)
    try:
        trusted, untrusted = get_trusted_and_untrusted_fields(fields1, fields2, tag)
    except EqualOrigins:
        # Same origin: fall back to the best-fields heuristic.  The two sets
        # are already too similar to enrich one with the other, so return
        # the selected list directly.
        trusted, untrusted = _get_best_fields(fields1, fields2, tag)
        return trusted

    # Sanity check: we have a problem if we have identical normalized author
    # names in the trusted list or if we have identical author names in the
    # untrusted list that is present in the trusted list of authors.
    trusted_authors = set()
    for field in trusted:
        author = bibrecord.field_get_subfield_values(field, AUTHOR_NORM_NAME_SUBFIELD)[0]
        if author in trusted_authors:
            # Duplicated normalized names make matching ambiguous: bail out
            # and return the trusted list unmodified instead of raising.
            logger.info(' Duplicated normalized author name. Skipping author subfield merging.')
            return trusted
            #raise DuplicateNormalizedAuthorError(author)
        else:
            trusted_authors.add(author)

    # Index the untrusted fields by normalized name, keeping only authors
    # that also appear in the trusted list (candidates for enrichment).
    untrusted_authors = {}
    for field in untrusted:
        author = bibrecord.field_get_subfield_values(field, AUTHOR_NORM_NAME_SUBFIELD)[0]
        if author in trusted_authors:
            untrusted_authors[author] = field

    # Now add information from the least trusted list of authors to the most
    # trusted list of authors.
    for index, field in enumerate(trusted):
        author = bibrecord.field_get_subfield_values(field, AUTHOR_NORM_NAME_SUBFIELD)[0]
        if author in untrusted_authors:
            trusted_subfield_codes = bibrecord.field_get_subfield_codes(field)
            untrusted_field = untrusted_authors[author]
            untrusted_subfield_codes = bibrecord.field_get_subfield_codes(untrusted_field)
            trusted_subfields = bibrecord.field_get_subfield_instances(field)
            # Subfield codes present only on the untrusted side get copied over.
            additional_subfield_codes = set(untrusted_subfield_codes) - set(trusted_subfield_codes)
            for code in additional_subfield_codes:
                logger.info(' Subfield "%s" to add to author "%s".' % (code, author))
                additional_subfields = bibrecord.field_get_subfield_values(untrusted_field, code)
                for additional_subfield in additional_subfields:
                    # Mutates the field's subfield list in place.
                    trusted_subfields.append((code, additional_subfield))
            else:
                # Replace the subfields with the new subfields.
                # NOTE(review): this `else` belongs to a `for` with no `break`,
                # so it always runs; and rebinding the local name `field` does
                # not write back into `trusted[index]` (the in-place append
                # above is what actually updates the field) — confirm intent.
                field = (trusted_subfields, field[1], field[2], field[3], field[4])
    return trusted
def get_author_name(record):
    """Return the first author's name ($a of the first 100 field), or ''.

    :param record: bibrecord structure to inspect
    :return: the author name string, or the empty string when absent
    """
    for instance in record_get_field_instances(record, '100'):
        subfield_map = dict(field_get_subfield_instances(instance))
        name = subfield_map.get('a')
        if name is not None:
            return name
    return ''
def get_bai(record):
    """Return the BAI stored in an 035 field carrying $9 == 'BAI', or ''.

    :param record: bibrecord structure to inspect
    :return: the BAI string from $a, or the empty string when absent
    """
    for instance in record_get_field_instances(record, '035'):
        subfield_map = dict(field_get_subfield_instances(instance))
        provenance = subfield_map.get('9', '')
        if provenance.upper() == 'BAI':
            return subfield_map.get('a', '')
    return ''
def __hash__(self):
    """Hash this object by the (code, value) subfield pairs of its field."""
    subfield_pairs = tuple(field_get_subfield_instances(self.field))
    return hash(subfield_pairs)
def add_other_id(other_id=None, doi="", eprint="",
                 recid=None, system_number=None,
                 reportnumbers=None, all_recids=None):
    """Search and match using given identifiers.

    Tries to locate exactly one local record for the remote record
    `other_id`, by (in order): the claimed local `recid`, the arXiv
    `eprint`, the `doi`, the `reportnumbers` and, on CERN, the SPIRES
    `system_number`.

    :return: MARCXML (string) that updates the matched local record with an
        035/595 pointer back to `other_id`; a list ``[other_id, recid, ...]``
        when a query matched more than one local record (ambiguous); or
        ``None`` when nothing matched or the link is already present.
    """
    query = ""
    if all_recids is None:
        all_recids = get_all_recids()
    if reportnumbers is None:
        reportnumbers = []
    # 1) Trust the recid claimed by the other site, if it is plausible.
    if recid is not None:
        query = "existing recid"
        try:
            recid = int(recid)
        except ValueError:
            recid = None
        if recid and recid not in all_recids:
            write_message(
                "WARNING: %s thought that their record %s had recid %s in %s but this seems wrong" %
                (CFG_OTHER_SITE, other_id, recid, CFG_THIS_SITE),
                stream=sys.stderr)
            recid = None
    # 2) Match on the arXiv eprint (exact 035__a search).
    if recid is None and eprint:
        query = 'oai:arXiv.org:%s' % (eprint, )
        arxiv_ids = search_pattern(p=query, f='035__a', m='e') & all_recids
        if len(arxiv_ids) > 1:
            write_message(
                "ERROR: %s record %s matches more than one record in %s via %s: %s" %
                (CFG_OTHER_SITE, other_id, CFG_THIS_SITE, query, arxiv_ids),
                stream=sys.stderr)
            return [other_id] + list(arxiv_ids)
        elif len(arxiv_ids) == 1:
            recid = arxiv_ids[0]
    # 3) Match on the DOI.
    if recid is None and doi:
        query = 'doi:"%s"' % doi
        doi_ids = search_pattern(p=query) & all_recids
        if len(doi_ids) > 1:
            write_message(
                "ERROR: %s record %s matches more than one record in %s via %s: %s" %
                (CFG_OTHER_SITE, other_id, CFG_THIS_SITE, query, doi_ids),
                stream=sys.stderr)
            return [other_id] + list(doi_ids)
        elif len(doi_ids) == 1:
            recid = doi_ids[0]
    # 4) Match on any of the report numbers (union of exact 037__a searches;
    #    `query` is only built for the log messages).
    if recid is None and reportnumbers:
        query = "037__a:" + " OR 037__a:".join(reportnumbers)
        reportnumbers_ids = intbitset()
        for rn in reportnumbers:
            reportnumbers_ids |= search_pattern(p=rn, f='037__a', m='e')
        reportnumbers_ids &= all_recids
        if len(reportnumbers_ids) > 1:
            write_message(
                "ERROR: %s record %s matches more than one record in %s via %s: %s" %
                (CFG_OTHER_SITE, other_id, CFG_THIS_SITE, query, reportnumbers_ids),
                stream=sys.stderr)
            return [other_id] + list(reportnumbers_ids)
        elif len(reportnumbers_ids) == 1:
            recid = reportnumbers_ids[0]
    # 5) CERN only: match on the SPIRES system number.
    # NOTE(review): this query string was reconstructed across a mangled line
    # break — confirm the exact pattern against the original file.
    if recid is None and system_number and CFG_CERN_SITE:
        query = "035:%s 035:SPIRES" % (system_number, )
        system_number_ids = search_pattern(p=query)
        system_number_ids &= all_recids
        if len(system_number_ids) > 1:
            write_message(
                "ERROR: %s record %s matches more than one record in %s via %s: %s" %
                (CFG_OTHER_SITE, other_id, CFG_THIS_SITE, query, system_number_ids),
                stream=sys.stderr)
            return [other_id] + list(system_number_ids)
        elif len(system_number_ids) == 1:
            recid = system_number_ids[0]

    if recid:
        recid = int(recid)
        record = get_record(recid)
        # Check existing 035 back-links: bail out (return None) when a valid
        # link is already there, unless it should be migrated to 595.
        fields = record_get_field_instances(record, '035')
        for field in fields:
            subfields = dict(field_get_subfield_instances(field))
            if CFG_OTHER_SITE.upper() == subfields.get('9', '').upper():
                stored_recid = subfields.get('a', 0)
                try:
                    stored_recid = int(stored_recid)
                except ValueError:
                    # Not an integer, we move on and add the new ID.
                    continue
                if stored_recid and int(stored_recid) != int(other_id):
                    write_message(
                        "ERROR: %s record %s matches %s record %s which already points back to a different record %s in %s" %
                        (CFG_OTHER_SITE, other_id, CFG_THIS_SITE, recid, stored_recid, CFG_OTHER_SITE),
                        stream=sys.stderr)
                if CFG_INSPIRE_SITE and int(other_id) not in CERN_IDS:
                    write_message(
                        "INFO: ID was found in 035 but the record is not core CERN hence it should be moved into 595"
                    )
                else:
                    return
        # Same check for the hidden 595 "CDS-<id>" back-links (INSPIRE only).
        if CFG_INSPIRE_SITE:
            fields = record_get_field_instances(record, '595')
            for field in fields:
                subfields = dict(field_get_subfield_instances(field))
                if "CDS" in subfields.get('a', '').upper():
                    stored_recid = subfields.get('a', 0).split("-")[-1]
                    try:
                        stored_recid = int(stored_recid)
                    except ValueError:
                        # Not an integer, we move on and add the new ID.
                        continue
                    if stored_recid and int(stored_recid) != int(other_id):
                        write_message(
                            "ERROR: %s record %s matches %s record %s which already points back to a different record %s in %s" %
                            (CFG_OTHER_SITE, other_id, CFG_THIS_SITE, recid, stored_recid, CFG_OTHER_SITE),
                            stream=sys.stderr)
                    if int(other_id) in CERN_IDS:
                        write_message(
                            "INFO: ID was found in 595 but the record is core CERN hence it should be moved into 035"
                        )
                    else:
                        return
        write_message("Matched {1}/{0} to {3}/{2} with {4}".format(
            other_id, CFG_OTHER_URL, recid, CFG_THIS_URL, query))
        # Build the correction record: keep everything except stale 035/595
        # pointers to `other_id`, then add the fresh pointer.
        rec = {}
        record_add_field(rec, '001', controlfield_value='%s' % recid)
        # Let's filter out previous values in 035/595
        for field in record_get_field_instances(record, '035'):
            subfields = field_get_subfield_instances(field)
            subfields_dict = dict(subfields)
            if subfields_dict.get('a') != str(other_id) or subfields_dict.get('9') != CFG_OTHER_SITE:
                record_add_field(rec, '035', subfields=subfields)
        for field in record_get_field_instances(record, '595'):
            subfields = field_get_subfield_instances(field)
            subfields_dict = dict(subfields)
            if subfields_dict.get('a') != "CDS-{0}".format(other_id) or subfields_dict.get('9') != 'CERN':
                record_add_field(rec, '595', subfields=subfields)
        if CFG_INSPIRE_SITE:
            # Core-CERN papers get a visible 035; the rest a hidden 595 note.
            if int(other_id) in CERN_IDS:
                write_message("CERN relevant paper: adding 035")
                record_add_field(rec, '035', ind1=' ', ind2=' ',
                                 subfields=(('9', CFG_OTHER_SITE), ('a', other_id)))
            else:
                write_message("Non-CERN relevant paper: adding 595")
                record_add_field(rec, '595', ind1=' ', ind2=' ',
                                 subfields=(('9', "CERN"), ('a', "CDS-{0}".format(other_id))))
        else:
            record_add_field(rec, '035', ind1=' ', ind2=' ',
                             subfields=(('9', CFG_OTHER_SITE), ('a', other_id)))
        return record_xml_output(rec)
def apply_filter(rec):
    """Filter the record to be compatible within Inspire.

    Parameters:
     * rec - dictionary: BibRecord structure

    Returns: dictionary, BibRecord structure (mutated in place and returned)
    """
    # Move recid from 001 to 035 if not hidden
    cds_id = rec['001'][0][3]
    if not 'hidden' in [x.lower() for x in record_get_field_values(rec, "980", code="a")]:
        record_add_field(rec, '035', subfields=[('9', 'CDS'), ('a', cds_id)])

    # Clear control fields
    record_strip_controlfields(rec)

    # Clear other uninteresting fields
    interesting_fields = ["024", "041", "035", "037", "088", "100",
                          "110", "111", "242", "245", "246", "260",
                          "269", "300", "502", "650", "653", "693",
                          "700", "710", "773", "856", "520", "500",
                          "980"]
    for tag in rec.keys():
        if tag not in interesting_fields:
            record_delete_fields(rec, tag)

    # 980 Determine Collections
    collections = set([])
    for value in record_get_field_values(rec, '980', code='a'):
        if 'NOTE' in value.upper():
            collections.add('NOTE')
        if 'THESIS' in value.upper():
            collections.add('THESIS')
        if 'CONFERENCEPAPER' in value.upper():
            collections.add('ConferencePaper')

    if is_published(rec):
        collections.add("PUBLISHED")
        collections.add("CITEABLE")

    if not 'NOTE' in collections:
        # TODO: Move this to a KB
        kb = ['ATLAS-CONF-', 'CMS-PAS-', 'ATL-', 'CMS-DP-',
              'ALICE-INT-', 'LHCb-PUB-']
        values = record_get_field_values(rec, "088", code='a')
        for val, rep in product(values, kb):
            if val.startswith(rep):
                collections.add('NOTE')
                break

    # 980 Arxiv tag
    if record_get_field_values(rec, '035', filter_subfield_code="a",
                               filter_subfield_value="arXiv"):
        collections.add("arXiv")

    # 980 HEP && CORE
    collections.add('HEP')
    collections.add('CORE')

    # 980 Conference Note
    if not 'ConferencePaper' in collections:
        for value in record_get_field_values(rec, '962', code='n'):
            if value[-2:].isdigit():
                collections.add('ConferencePaper')
                break

    record_delete_fields(rec, "980")

    intnote = record_get_field_values(rec, '690', filter_subfield_code="a",
                                      filter_subfield_value='INTNOTE')
    if intnote:
        # CMS internal notes get a weblib abstract URL in 856.
        val_088 = record_get_field_values(rec, '088', filter_subfield_code="a")
        for val in val_088:
            if 'CMS' in val:
                url = ('http://weblib.cern.ch/abstract?CERN-CMS' +
                       val.split('CMS', 1)[-1])
                record_add_field(rec, '856', ind1='4', subfields=[('u', url)])

    # 041 Language
    languages = get_languages()
    language_fields = record_get_field_instances(rec, '041')
    record_delete_fields(rec, "041")
    for field in language_fields:
        subs = field_get_subfields(field)
        if 'a' in subs:
            if "eng" in subs['a']:
                continue
            new_value = translate_config(subs['a'][0], languages)
            new_subs = [('a', new_value)]
            record_add_field(rec, "041", subfields=new_subs)

    # 035 Externals
    scn_035_fields = record_get_field_instances(rec, '035')
    forbidden_values = ["cercer", "inspire", "xx",
                        "cern annual report", "cmscms", "wai01"]
    for field in scn_035_fields:
        subs = field_get_subfields(field)
        if '9' in subs:
            if not 'a' in subs:
                continue
            for sub in subs['9']:
                if sub.lower() in forbidden_values:
                    break
            else:
                # No forbidden values (We did not "break")
                suffixes = [s.lower() for s in subs['9']]
                if 'spires' in suffixes:
                    # Move SPIRES external IDs to 970.
                    new_subs = [('a', 'SPIRES-%s' % subs['a'][0])]
                    record_add_field(rec, '970', subfields=new_subs)
                    continue
        if 'a' in subs:
            for sub in subs['a']:
                if sub.lower() in forbidden_values:
                    record_delete_field(rec, tag="035",
                                        field_position_global=field[4])

    # 088: internal P0/CM-P0 numbers go to hidden 595; others to 037.
    rep_088_fields = record_get_field_instances(rec, '088')
    for field in rep_088_fields:
        subs = field_get_subfields(field)
        if '9' in subs:
            for val in subs['9']:
                if val.startswith('P0') or val.startswith('CM-P0'):
                    sf = [('9', 'CERN'), ('b', val)]
                    record_add_field(rec, '595', subfields=sf)
        for key, val in field[0]:
            if key in ['a', '9'] and not val.startswith('SIS-'):
                record_add_field(rec, '037', subfields=[('a', val)])
    record_delete_fields(rec, "088")

    # 037 Externals also...
    rep_037_fields = record_get_field_instances(rec, '037')
    for field in rep_037_fields:
        subs = field_get_subfields(field)
        if 'a' in subs:
            for value in subs['a']:
                if 'arXiv' in value:
                    # Rebuild arXiv 037 with $9 source and $c category (695$a).
                    new_subs = [('a', value), ('9', 'arXiv')]
                    for fld in record_get_field_instances(rec, '695'):
                        for key, val in field_get_subfield_instances(fld):
                            if key == 'a':
                                new_subs.append(('c', val))
                                break
                    nf = create_field(subfields=new_subs)
                    record_replace_field(rec, '037', nf, field[4])
        for key, val in field[0]:
            if key in ['a', '9'] and val.startswith('SIS-'):
                record_delete_field(rec, '037',
                                    field_position_global=field[4])

    # 242 alternate titles become 246.
    for field in record_get_field_instances(rec, '242'):
        record_add_field(rec, '246', subfields=field[0])
    record_delete_fields(rec, '242')

    # 269 Date normalization
    for field in record_get_field_instances(rec, '269'):
        for idx, (key, value) in enumerate(field[0]):
            if key == "c":
                field[0][idx] = ("c", convert_date_to_iso(value))
                # NOTE(review): the 260 deletion sits inside the $c branch,
                # so it only fires when a 269$c exists — confirm intended.
                record_delete_fields(rec, "260")

    if not 'THESIS' in collections:
        for field in record_get_field_instances(rec, '260'):
            record_add_field(rec, '269', subfields=field[0])
        record_delete_fields(rec, '260')

    # 300 page number
    for field in record_get_field_instances(rec, '300'):
        for idx, (key, value) in enumerate(field[0]):
            if key == 'a':
                if "mult." not in value and value != " p":
                    # Keep only digits and dashes in the page count.
                    field[0][idx] = ('a', re.sub(r'[^\d-]+', '', value))
                else:
                    record_delete_field(rec, '300',
                                        field_position_global=field[4])
                    break

    # 100 & 700 punctuate author names
    author_names = record_get_field_instances(rec, '100')
    author_names.extend(record_get_field_instances(rec, '700'))
    for field in author_names:
        subs = field_get_subfields(field)
        # Only touch names without (or with placeholder) author IDs.
        if not 'i' in subs or 'XX' in subs['i']:
            if not 'j' in subs or 'YY' in subs['j']:
                for idx, (key, value) in enumerate(field[0]):
                    if key == 'a':
                        field[0][idx] = ('a', punctuate_authorname(value))

    # 700 -> 701 Thesis supervisors
    if 'THESIS' in collections:
        for field in record_get_field_instances(rec, '700'):
            record_add_field(rec, '701', subfields=field[0])
        record_delete_fields(rec, '700')

    # 501 move subfields
    fields_501 = record_get_field_instances(rec, '502')
    for idx, field in enumerate(fields_501):
        new_subs = []
        for key, value in field[0]:
            if key == 'a':
                new_subs.append(('b', value))
            elif key == 'b':
                new_subs.append(('c', value))
            elif key == 'c':
                new_subs.append(('d', value))
            else:
                new_subs.append((key, value))
        fields_501[idx] = field_swap_subfields(field, new_subs)

    # 650 Translate Categories
    categories = get_categories()
    category_fields = record_get_field_instances(rec, '650', ind1='1', ind2='7')
    record_delete_fields(rec, "650")
    for field in category_fields:
        for idx, (key, value) in enumerate(field[0]):
            if key == 'a':
                new_value = translate_config(value, categories)
                if new_value != value:
                    new_subs = [('2', 'INSPIRE'), ('a', new_value)]
                else:
                    new_subs = [('2', 'SzGeCERN'), ('a', value)]
                record_add_field(rec, "650", ind1="1", ind2="7",
                                 subfields=new_subs)
                break

    # 653 Free Keywords
    for field in record_get_field_instances(rec, '653', ind1='1'):
        subs = field_get_subfields(field)
        new_subs = []
        if 'a' in subs:
            for val in subs['a']:
                new_subs.extend([('9', 'author'), ('a', val)])
        new_field = create_field(subfields=new_subs, ind1='1')
        record_replace_field(rec, '653', new_field,
                             field_position_global=field[4])

    experiments = get_experiments()
    # 693 Remove if 'not applicable'
    for field in record_get_field_instances(rec, '693'):
        subs = field_get_subfields(field)
        all_subs = subs.get('a', []) + subs.get('e', [])
        if 'not applicable' in [x.lower() for x in all_subs]:
            record_delete_field(rec, '693',
                                field_position_global=field[4])
        new_subs = []
        experiment_a = ""
        experiment_e = ""
        for (key, value) in subs.iteritems():
            if key == 'a':
                experiment_a = value[0]
                new_subs.append((key, value[0]))
            elif key == 'e':
                experiment_e = value[0]
        # Translate the "accelerator---experiment" pair via the config KB.
        experiment = "%s---%s" % (experiment_a.replace(" ", "-"),
                                  experiment_e)
        translated_experiments = translate_config(experiment,
                                                  experiments)
        new_subs.append(("e", translated_experiments))
        record_delete_field(rec, tag="693",
                            field_position_global=field[4])
        record_add_field(rec, "693", subfields=new_subs)

    # 710 Collaboration
    for field in record_get_field_instances(rec, '710'):
        subs = field_get_subfield_instances(field)
        for idx, (key, value) in enumerate(subs[:]):
            if key == '5':
                subs.pop(idx)
            elif value.startswith('CERN. Geneva'):
                subs.pop(idx)
        if len(subs) == 0:
            record_delete_field(rec, '710',
                                field_position_global=field[4])

    # 773 journal translations
    journals = get_journals()
    for field in record_get_field_instances(rec, '773'):
        subs = field_get_subfield_instances(field)
        new_subs = []
        for idx, (key, value) in enumerate(subs):
            if key == 'p':
                new_subs.append((key, translate_config(value, journals)))
            else:
                new_subs.append((key, value))
        record_delete_field(rec, tag="773",
                            field_position_global=field[4])
        record_add_field(rec, "773", subfields=new_subs)

    # FFT (856) Dealing with graphs
    figure_counter = 0
    for field in record_get_field_instances(rec, '856', ind1='4'):
        subs = field_get_subfields(field)

        newsubs = []
        remove = False

        if 'z' in subs:
            is_figure = [s for s in subs['z'] if "figure" in s.lower()]
            if is_figure and 'u' in subs:
                is_subformat = [s for s in subs['u'] if "subformat" in s.lower()]
                if not is_subformat:
                    url = subs['u'][0]
                    if url.endswith(".pdf"):
                        # We try to convert
                        fd, local_url = mkstemp(suffix=os.path.basename(url),
                                                dir=CFG_TMPSHAREDDIR)
                        os.close(fd)
                        _print("Downloading %s into %s" % (url, local_url), verbose=5)
                        plotfile = ""
                        try:
                            plotfile = download_url(url=url,
                                                    download_to_file=local_url,
                                                    timeout=30.0)
                        except InvenioFileDownloadError:
                            _print("Download failed while attempting to reach %s. Skipping.." % (url,))
                            remove = True
                        if plotfile:
                            converted = convert_images([plotfile])
                            if converted:
                                url = converted.pop()
                                _print("Successfully converted %s to %s" % (local_url, url), verbose=5)
                            else:
                                _print("Conversion failed on %s" % (local_url,))
                                url = None
                                remove = True
                    if url:
                        newsubs.append(('a', url))
                        newsubs.append(('t', 'Plot'))
                        figure_counter += 1
                        if 'y' in subs:
                            newsubs.append(('d', "%05d %s" % (figure_counter, subs['y'][0])))
                            newsubs.append(('n', subs['y'][0]))
                        else:
                            # Get basename without extension.
                            name = os.path.basename(os.path.splitext(subs['u'][0])[0])
                            newsubs.append(('d', "%05d %s" % (figure_counter, name)))
                            newsubs.append(('n', name))

        if not newsubs and 'u' in subs:
            # Fulltext PDF (PDF/A alternates explicitly excluded).
            is_fulltext = [s for s in subs['u']
                           if ".pdf" in s and not "subformat=pdfa" in s]
            if is_fulltext:
                newsubs = [('t', 'INSPIRE-PUBLIC'), ('a', subs['u'][0])]

        if not newsubs and 'u' in subs:
            remove = True
            is_zipfile = [s for s in subs['u'] if ".zip" in s]
            if is_zipfile:
                url = is_zipfile[0]
                local_url = os.path.join(CFG_TMPSHAREDDIR, os.path.basename(url))
                _print("Downloading %s into %s" % (url, local_url), verbose=5)
                zipped_archive = ""
                try:
                    zipped_archive = download_url(url=is_zipfile[0],
                                                  download_to_file=local_url,
                                                  timeout=30.0)
                except InvenioFileDownloadError:
                    _print("Download failed while attempting to reach %s. Skipping.." % (is_zipfile[0],))
                    remove = True
                if zipped_archive:
                    unzipped_archive = unzip(zipped_archive)
                    list_of_pngs = locate("*.png", unzipped_archive)
                    for png in list_of_pngs:
                        if "_vti_" in png or "__MACOSX" in png:
                            continue
                        figure_counter += 1
                        plotsubs = []
                        plotsubs.append(('a', png))
                        caption = '%05d %s' % (figure_counter, os.path.basename(png))
                        plotsubs.append(('d', caption))
                        plotsubs.append(('t', 'Plot'))
                        record_add_field(rec, 'FFT', subfields=plotsubs)

        if not remove and not newsubs and 'u' in subs:
            # Drop links that point back to CERN servers or to ps.gz files.
            urls = ('http://cdsweb.cern.ch', 'http://cms.cern.ch',
                    'http://cmsdoc.cern.ch', 'http://documents.cern.ch',
                    'http://preprints.cern.ch', 'http://cds.cern.ch')
            for val in subs['u']:
                if any(url in val for url in urls):
                    remove = True
                    break
                if val.endswith('ps.gz'):
                    remove = True

        if newsubs:
            record_add_field(rec, 'FFT', subfields=newsubs)
            remove = True

        if remove:
            record_delete_field(rec, '856', ind1='4',
                                field_position_global=field[4])

    # 500 - Preliminary results
    if "THESIS" not in collections:
        subs = [('a', "Preliminary results")]
        record_add_field(rec, "500", subfields=subs)

    for collection in collections:
        record_add_field(rec, '980', subfields=[('a', collection)])

    return rec
def apply_filter(rec):
    """Filter the record to be compatible within Inspire.

    Parameters:
     * rec - dictionary: BibRecord structure

    Returns: dictionary, BibRecord structure (mutated in place and returned)
    """
    # Move recid from 001 to 035 if not hidden
    cds_id = rec["001"][0][3]
    if not "hidden" in [x.lower() for x in record_get_field_values(rec, "980", code="a")]:
        record_add_field(rec, "035", subfields=[("9", "CDS"), ("a", cds_id)])
    # Clear control fields
    record_strip_controlfields(rec)

    # Clear other uninteresting fields
    interesting_fields = [
        "024", "041", "035", "037", "088", "100",
        "110", "111", "242", "245", "246", "260",
        "269", "300", "502", "650", "653", "693",
        "700", "710", "773", "856", "520", "500",
        "980",
    ]
    for tag in rec.keys():
        if tag not in interesting_fields:
            record_delete_fields(rec, tag)

    # 980 Determine Collections
    collections = set([])
    for value in record_get_field_values(rec, "980", code="a"):
        if "NOTE" in value.upper():
            collections.add("NOTE")
        if "THESIS" in value.upper():
            collections.add("THESIS")
        if "CONFERENCEPAPER" in value.upper():
            collections.add("ConferencePaper")

    if is_published(rec):
        collections.add("PUBLISHED")
        collections.add("CITEABLE")

    if not "NOTE" in collections:
        # TODO: Move this to a KB
        kb = ["ATLAS-CONF-", "CMS-PAS-", "ATL-", "CMS-DP-",
              "ALICE-INT-", "LHCb-PUB-"]
        values = record_get_field_values(rec, "088", code="a")
        for val, rep in product(values, kb):
            if val.startswith(rep):
                collections.add("NOTE")
                break

    # 980 Arxiv tag
    if record_get_field_values(rec, "035", filter_subfield_code="a",
                               filter_subfield_value="arXiv"):
        collections.add("arXiv")

    # 980 HEP && CORE
    collections.add("HEP")
    collections.add("CORE")

    # 980 Conference Note
    if not "ConferencePaper" in collections:
        for value in record_get_field_values(rec, "962", code="n"):
            if value[-2:].isdigit():
                collections.add("ConferencePaper")
                break

    record_delete_fields(rec, "980")

    intnote = record_get_field_values(rec, "690", filter_subfield_code="a",
                                      filter_subfield_value="INTNOTE")
    if intnote:
        # CMS internal notes get a weblib abstract URL in 856.
        val_088 = record_get_field_values(rec, "088", filter_subfield_code="a")
        for val in val_088:
            if "CMS" in val:
                url = "http://weblib.cern.ch/abstract?CERN-CMS" + val.split("CMS", 1)[-1]
                record_add_field(rec, "856", ind1="4", subfields=[("u", url)])

    # 041 Language
    languages = get_languages()
    language_fields = record_get_field_instances(rec, "041")
    record_delete_fields(rec, "041")
    for field in language_fields:
        subs = field_get_subfields(field)
        if "a" in subs:
            if "eng" in subs["a"]:
                continue
            new_value = translate_config(subs["a"][0], languages)
            new_subs = [("a", new_value)]
            record_add_field(rec, "041", subfields=new_subs)

    # 035 Externals
    scn_035_fields = record_get_field_instances(rec, "035")
    forbidden_values = ["cercer", "inspire", "xx",
                        "cern annual report", "cmscms", "wai01"]
    for field in scn_035_fields:
        subs = field_get_subfields(field)
        if "9" in subs:
            if not "a" in subs:
                continue
            for sub in subs["9"]:
                if sub.lower() in forbidden_values:
                    break
            else:
                # No forbidden values (We did not "break")
                suffixes = [s.lower() for s in subs["9"]]
                if "spires" in suffixes:
                    # Move SPIRES external IDs to 970.
                    new_subs = [("a", "SPIRES-%s" % subs["a"][0])]
                    record_add_field(rec, "970", subfields=new_subs)
                    continue
        if "a" in subs:
            for sub in subs["a"]:
                if sub.lower() in forbidden_values:
                    record_delete_field(rec, tag="035",
                                        field_position_global=field[4])

    # 088: internal P0/CM-P0 numbers go to hidden 595; others to 037.
    rep_088_fields = record_get_field_instances(rec, "088")
    for field in rep_088_fields:
        subs = field_get_subfields(field)
        if "9" in subs:
            for val in subs["9"]:
                if val.startswith("P0") or val.startswith("CM-P0"):
                    sf = [("9", "CERN"), ("b", val)]
                    record_add_field(rec, "595", subfields=sf)
        for key, val in field[0]:
            if key in ["a", "9"] and not val.startswith("SIS-"):
                record_add_field(rec, "037", subfields=[("a", val)])
    record_delete_fields(rec, "088")

    # 037 Externals also...
    rep_037_fields = record_get_field_instances(rec, "037")
    for field in rep_037_fields:
        subs = field_get_subfields(field)
        if "a" in subs:
            for value in subs["a"]:
                if "arXiv" in value:
                    # Rebuild arXiv 037 with $9 source and $c category (695$a).
                    new_subs = [("a", value), ("9", "arXiv")]
                    for fld in record_get_field_instances(rec, "695"):
                        for key, val in field_get_subfield_instances(fld):
                            if key == "a":
                                new_subs.append(("c", val))
                                break
                    nf = create_field(subfields=new_subs)
                    record_replace_field(rec, "037", nf, field[4])
        for key, val in field[0]:
            if key in ["a", "9"] and val.startswith("SIS-"):
                record_delete_field(rec, "037",
                                    field_position_global=field[4])

    # 242 alternate titles become 246.
    for field in record_get_field_instances(rec, "242"):
        record_add_field(rec, "246", subfields=field[0])
    record_delete_fields(rec, "242")

    # 269 Date normalization
    for field in record_get_field_instances(rec, "269"):
        for idx, (key, value) in enumerate(field[0]):
            if key == "c":
                field[0][idx] = ("c", convert_date_to_iso(value))
                record_delete_fields(rec, "260")

    if not "THESIS" in collections:
        for field in record_get_field_instances(rec, "260"):
            record_add_field(rec, "269", subfields=field[0])
        record_delete_fields(rec, "260")

    # 300 page number
    for field in record_get_field_instances(rec, "300"):
        for idx, (key, value) in enumerate(field[0]):
            if key == "a":
                if "mult." not in value and value != " p":
                    # Keep only digits and dashes in the page count.
                    field[0][idx] = ("a", re.sub(r"[^\d-]+", "", value))
                else:
                    record_delete_field(rec, "300",
                                        field_position_global=field[4])
                    break

    # 100 & 700 punctuate author names
    author_names = record_get_field_instances(rec, "100")
    author_names.extend(record_get_field_instances(rec, "700"))
    for field in author_names:
        subs = field_get_subfields(field)
        # Only touch names without (or with placeholder) author IDs.
        if not "i" in subs or "XX" in subs["i"]:
            if not "j" in subs or "YY" in subs["j"]:
                for idx, (key, value) in enumerate(field[0]):
                    if key == "a":
                        field[0][idx] = ("a", punctuate_authorname(value))

    # 700 -> 701 Thesis supervisors
    if "THESIS" in collections:
        for field in record_get_field_instances(rec, "700"):
            record_add_field(rec, "701", subfields=field[0])
        record_delete_fields(rec, "700")

    # 501 move subfields
    fields_501 = record_get_field_instances(rec, "502")
    for idx, field in enumerate(fields_501):
        new_subs = []
        for key, value in field[0]:
            if key == "a":
                new_subs.append(("b", value))
            elif key == "b":
                new_subs.append(("c", value))
            elif key == "c":
                new_subs.append(("d", value))
            else:
                new_subs.append((key, value))
        fields_501[idx] = field_swap_subfields(field, new_subs)

    # 650 Translate Categories
    categories = get_categories()
    category_fields = record_get_field_instances(rec, "650", ind1="1", ind2="7")
    record_delete_fields(rec, "650")
    for field in category_fields:
        for idx, (key, value) in enumerate(field[0]):
            if key == "a":
                new_value = translate_config(value, categories)
                if new_value != value:
                    new_subs = [("2", "INSPIRE"), ("a", new_value)]
                else:
                    new_subs = [("2", "SzGeCERN"), ("a", value)]
                record_add_field(rec, "650", ind1="1", ind2="7",
                                 subfields=new_subs)
                break

    # 653 Free Keywords
    for field in record_get_field_instances(rec, "653", ind1="1"):
        subs = field_get_subfields(field)
        new_subs = []
        if "a" in subs:
            for val in subs["a"]:
                new_subs.extend([("9", "author"), ("a", val)])
        new_field = create_field(subfields=new_subs, ind1="1")
        record_replace_field(rec, "653", new_field,
                             field_position_global=field[4])

    experiments = get_experiments()
    # 693 Remove if 'not applicable'
    for field in record_get_field_instances(rec, "693"):
        subs = field_get_subfields(field)
        all_subs = subs.get("a", []) + subs.get("e", [])
        if "not applicable" in [x.lower() for x in all_subs]:
            record_delete_field(rec, "693",
                                field_position_global=field[4])
        new_subs = []
        experiment_a = ""
        experiment_e = ""
        for (key, value) in subs.iteritems():
            if key == "a":
                experiment_a = value[0]
                new_subs.append((key, value[0]))
            elif key == "e":
                experiment_e = value[0]
        # Translate the "accelerator---experiment" pair via the config KB.
        experiment = "%s---%s" % (experiment_a.replace(" ", "-"),
                                  experiment_e)
        translated_experiments = translate_config(experiment, experiments)
        new_subs.append(("e", translated_experiments))
        record_delete_field(rec, tag="693",
                            field_position_global=field[4])
        record_add_field(rec, "693", subfields=new_subs)

    # 710 Collaboration
    for field in record_get_field_instances(rec, "710"):
        subs = field_get_subfield_instances(field)
        for idx, (key, value) in enumerate(subs[:]):
            if key == "5":
                subs.pop(idx)
            elif value.startswith("CERN. Geneva"):
                subs.pop(idx)
        if len(subs) == 0:
            record_delete_field(rec, "710",
                                field_position_global=field[4])

    # 773 journal translations
    journals = get_journals()
    for field in record_get_field_instances(rec, "773"):
        subs = field_get_subfield_instances(field)
        new_subs = []
        for idx, (key, value) in enumerate(subs):
            if key == "p":
                new_subs.append((key, translate_config(value, journals)))
            else:
                new_subs.append((key, value))
        record_delete_field(rec, tag="773",
                            field_position_global=field[4])
        record_add_field(rec, "773", subfields=new_subs)

    # FFT (856) Dealing with graphs
    figure_counter = 0
    for field in record_get_field_instances(rec, "856", ind1="4"):
        subs = field_get_subfields(field)

        newsubs = []
        remove = False

        if "z" in subs:
            is_figure = [s for s in subs["z"] if "figure" in s.lower()]
            if is_figure and "u" in subs:
                is_subformat = [s for s in subs["u"] if "subformat" in s.lower()]
                if not is_subformat:
                    url = subs["u"][0]
                    if url.endswith(".pdf"):
                        # We try to convert
                        fd, local_url = mkstemp(suffix=os.path.basename(url),
                                                dir=CFG_TMPSHAREDDIR)
                        os.close(fd)
                        _print("Downloading %s into %s" % (url, local_url), verbose=5)
                        plotfile = ""
                        try:
                            plotfile = download_url(url=url,
                                                    download_to_file=local_url,
                                                    timeout=30.0)
                        except InvenioFileDownloadError:
                            _print("Download failed while attempting to reach %s. Skipping.." % (url,))
                            remove = True
                        if plotfile:
                            converted = convert_images([plotfile])
                            if converted:
                                url = converted.pop()
                                _print("Successfully converted %s to %s" % (local_url, url), verbose=5)
                            else:
                                _print("Conversion failed on %s" % (local_url,))
                                url = None
                                remove = True
                    if url:
                        newsubs.append(("a", url))
                        newsubs.append(("t", "Plot"))
                        figure_counter += 1
                        if "y" in subs:
                            newsubs.append(("d", "%05d %s" % (figure_counter, subs["y"][0])))
                            newsubs.append(("n", subs["y"][0]))
                        else:
                            # Get basename without extension.
                            name = os.path.basename(os.path.splitext(subs["u"][0])[0])
                            newsubs.append(("d", "%05d %s" % (figure_counter, name)))
                            newsubs.append(("n", name))

        if not newsubs and "u" in subs:
            # FIX: exclude PDF/A alternates ("subformat=pdfa") from fulltext
            # detection, consistent with the sibling apply_filter
            # implementation in this file; otherwise PDF/A copies would be
            # registered as the INSPIRE-PUBLIC fulltext.
            is_fulltext = [s for s in subs["u"]
                           if ".pdf" in s and "subformat=pdfa" not in s]
            if is_fulltext:
                newsubs = [("t", "INSPIRE-PUBLIC"), ("a", subs["u"][0])]

        if not newsubs and "u" in subs:
            remove = True
            is_zipfile = [s for s in subs["u"] if ".zip" in s]
            if is_zipfile:
                url = is_zipfile[0]
                local_url = os.path.join(CFG_TMPSHAREDDIR, os.path.basename(url))
                _print("Downloading %s into %s" % (url, local_url), verbose=5)
                zipped_archive = ""
                try:
                    zipped_archive = download_url(url=is_zipfile[0],
                                                  download_to_file=local_url,
                                                  timeout=30.0)
                except InvenioFileDownloadError:
                    _print("Download failed while attempting to reach %s. Skipping.." % (is_zipfile[0],))
                    remove = True
                if zipped_archive:
                    unzipped_archive = unzip(zipped_archive)
                    list_of_pngs = locate("*.png", unzipped_archive)
                    for png in list_of_pngs:
                        if "_vti_" in png or "__MACOSX" in png:
                            continue
                        figure_counter += 1
                        plotsubs = []
                        plotsubs.append(("a", png))
                        caption = "%05d %s" % (figure_counter, os.path.basename(png))
                        plotsubs.append(("d", caption))
                        plotsubs.append(("t", "Plot"))
                        record_add_field(rec, "FFT", subfields=plotsubs)

        if not remove and not newsubs and "u" in subs:
            # Drop links that point back to CERN servers or to ps.gz files.
            urls = (
                "http://cdsweb.cern.ch",
                "http://cms.cern.ch",
                "http://cmsdoc.cern.ch",
                "http://documents.cern.ch",
                "http://preprints.cern.ch",
                "http://cds.cern.ch",
            )
            for val in subs["u"]:
                if any(url in val for url in urls):
                    remove = True
                    break
                if val.endswith("ps.gz"):
                    remove = True

        if newsubs:
            record_add_field(rec, "FFT", subfields=newsubs)
            remove = True

        if remove:
            record_delete_field(rec, "856", ind1="4",
                                field_position_global=field[4])

    # 500 - Preliminary results
    if "THESIS" not in collections:
        subs = [("a", "Preliminary results")]
        record_add_field(rec, "500", subfields=subs)

    for collection in collections:
        record_add_field(rec, "980", subfields=[("a", collection)])

    return rec
elif tag == "773": # Special check for publication notes to make sure we are # not adding double information. correct_773 = True for existing_773 in existing_field_list: if field_get_subfield_values(existing_773, 'p'): correct_773 = False fields_to_correct.append((tag, [existing_773])) if correct_773: fields_to_correct.append((tag, new_field_list)) elif (tag == "100" or tag == "700") and take_authors: # Take authors since $i is missing # Check if some $$i is missing from new records as well and report it missing_identifier_fields = [] for field in new_fields_authors: subfields = dict(field_get_subfield_instances(field)) if "i" not in subfields: missing_identifier_fields.append(field) if missing_identifier_fields: create_authorlist_ticket( [("700", missing_identifier_fields)], current_record_arxiv_id, "AUTHORS_long_list", missing_ids=True) fields_to_correct.append((tag, new_field_list)) else: corrected_fields = [] if has_field_origin(new_field_list, "arXiv", "9"): for field in existing_field_list: if not "arXiv" in field_get_subfield_values( field, "9"):
def format_element(bfo, oai=0):
    """Produce MARCXML with enhanced fields.

    Adds 100/700 $x with Record ID of linked HepName,
         701/702 $y with True/False if the signature is claimed
                 $z with Record ID of institution
                 $w with BAI of linked Profile
         371/110 $z with Record ID of institution
         119/502 $z with Record ID of institution
         999C5   $0 with on the fly discovered Record IDs (not for books)
         773     $0 with Record ID of corresponding Book or Proceeding or Report
                 $1 with Record ID of corresponding Journal
                 $2 with Record ID of corresponding Conference
         693/710 $0 with Record ID of corresponding experiment

    :param bfo: BibFormat object wrapping the record being formatted
    :param oai: if true, rewrite the serialized XML to use the ``marc:``
        namespace prefix and add a leader, as needed for OAI-PMH output
    :return: the enhanced record serialized as a MARCXML string
    """
    # Users allowed to run bibedit may also see hidden/deleted material.
    can_see_hidden_stuff = not acc_authorize_action(bfo.user_info,
                                                    'runbibedit')[0]
    recid = bfo.recID
    if can_see_hidden_stuff and is_record_deleted(bfo):
        # Deleted record: recover its last good revision from history.
        record = salvage_deleted_record_from_history(recid)
    else:
        record = bfo.get_record()

    # Let's filter hidden fields
    if can_see_hidden_stuff:
        # Let's add bibdoc info: expose each latest attached file as an
        # FFT field so the full document inventory travels with the XML.
        bibrecdocs = BibRecDocs(recid)
        for bibdocfile in bibrecdocs.list_latest_files():
            fft = [
                ('a', bibdocfile.fullpath),
                ('d', bibdocfile.description or ''),
                ('f', bibdocfile.format or ''),
                ('n', bibdocfile.name or ''),
                ('r', bibdocfile.status or ''),
                ('s', bibdocfile.cd.strftime('%Y-%m-%d %H:%M:%S')),
                ('t', bibdocfile.bibdoc.doctype),
                ('v', str(bibdocfile.version)),
                ('z', bibdocfile.comment or ''),
            ]
            for flag in bibdocfile.flags:
                fft.append(('o', flag))
            record_add_field(record, 'FFT', subfields=fft)
    else:
        # not authorized: strip every configured hidden MARC tag.
        for tag in CFG_BIBFORMAT_HIDDEN_TAGS:
            if tag in record:
                del record[tag]

    is_institution = 'INSTITUTION' in [
        collection.upper() for collection in bfo.fields('980__a')
    ]

    # Map author-name string -> (personid, claim flag) for this record.
    # flag>-2 excludes rejected signatures; flag==2 means "claimed".
    signatures = {}
    if '100' in record or '700' in record:
        signatures = dict(( name, (personid, flag) )
                          for name, personid, flag in run_sql(
            "SELECT name, personid, flag FROM aidPERSONIDPAPERS WHERE bibrec=%s AND flag>-2",
            (recid, )))

    # Let's add signatures.
    # NOTE: throughout this function new subfields are appended to the
    # very list being iterated; the appended codes never match the codes
    # being looked for, so the loops terminate correctly — keep it that way.
    for field in record_get_field_instances(
            record, '100') + record_get_field_instances(
            record, '700') + record_get_field_instances(
            record, '701') + record_get_field_instances(record, '702'):
        subfields = field_get_subfield_instances(field)
        subfield_dict = dict(subfields)
        if 'a' in subfield_dict:
            author_name = subfield_dict['a']
            personid, flag = signatures.get(author_name, (None, None))
            bai = get_personid_canonical_id().get(personid)
            if bai:
                subfields.append(('w', bai))
                hepname_id = get_hepname_id(personid)
                if hepname_id:
                    subfields.append(('x', '%i' % hepname_id))
                # '%i' % (flag == 2) renders '1' if claimed, else '0'.
                subfields.append(('y', '%i' % (flag == 2)))
        # And matched affiliations: only attach $z when the lookup is
        # unambiguous (exactly one institution record matches).
        if 'u' in subfield_dict:
            for code, value in subfields:
                if code == 'u':
                    ids = get_institution_ids(value)
                    if len(ids) == 1:
                        subfields.append(('z', '%i' % ids[0]))

    # Thesis institution
    for field in record_get_field_instances(record, '502'):
        subfields = field_get_subfield_instances(field)
        subfield_dict = dict(subfields)
        if 'c' in subfield_dict:
            for code, value in subfields:
                if code == 'c':
                    ids = get_institution_ids(value)
                    if len(ids) == 1:
                        subfields.append(('z', '%i' % ids[0]))

    # Related institution
    for field in record_get_field_instances(record, '510'):
        subfields = field_get_subfield_instances(field)
        subfield_dict = dict(subfields)
        if 'a' in subfield_dict and not '0' in subfield_dict:
            ids = get_institution_ids(subfield_dict['a'])
            if len(ids) == 1:
                subfields.append(('0', '%i' % ids[0]))

    # Related journal
    # NOTE(review): this resolves 530 $a via get_institution_ids(), same
    # as the 510 block above — looks copy-pasted; confirm a journal
    # lookup was not intended here.
    for field in record_get_field_instances(record, '530'):
        subfields = field_get_subfield_instances(field)
        subfield_dict = dict(subfields)
        if 'a' in subfield_dict and not '0' in subfield_dict:
            ids = get_institution_ids(subfield_dict['a'])
            if len(ids) == 1:
                subfields.append(('0', '%i' % ids[0]))

    # Enhance affiliation in Experiments
    for field in record_get_field_instances(record, '119'):
        subfields = field_get_subfield_instances(field)
        subfield_dict = dict(subfields)
        if 'u' in subfield_dict:
            for code, value in subfields:
                if code == 'u':
                    ids = get_institution_ids(value)
                    if len(ids) == 1:
                        subfields.append(('z', '%i' % ids[0]))

    # Enhance affiliation in HepNames and Jobs and Institutions and
    # naked affiliations in HEP
    for field in record_get_field_instances(
            record, '371') + record_get_field_instances(record, '902'):
        subfields = field_get_subfield_instances(field)
        subfield_dict = dict(subfields)
        if 'a' in subfield_dict:
            for code, value in subfields:
                if code == 'a':
                    ids = get_institution_ids(value)
                    if len(ids) == 1:
                        subfields.append(('z', '%i' % ids[0]))

    for field in record_get_field_instances(record, '110'):
        subfields = field_get_subfield_instances(field)
        subfield_dict = dict(subfields)
        if is_institution:
            # We try to resolve obsolete ICNs
            if 'x' in subfield_dict:
                for code, value in subfields:
                    if code == 'x':
                        ids = get_institution_ids(value)
                        if len(ids) == 1:
                            subfields.append(('z', '%i' % ids[0]))
        else:
            # In other collections institution is in a
            if 'a' in subfield_dict:
                for code, value in subfields:
                    if code == 'a':
                        ids = get_institution_ids(value)
                        if len(ids) == 1:
                            subfields.append(('z', '%i' % ids[0]))

    # Enhance citation
    for field in record_get_field_instances(record, '999', ind1='C',
                                            ind2='5'):
        subfields = field_get_subfield_instances(field)
        subfield_dict = dict(subfields)
        if '0' in subfield_dict:
            # Already available recid; $z '1' appears to flag the
            # reference as resolved — TODO confirm consumer semantics.
            subfields.append(('z', '1'))
        else:
            matched_id = get_matched_id(subfields)
            if matched_id:
                subfields.append(('0', str(matched_id)))

    # Enhance related records (continuations/successors/translations)
    for field in (
            record_get_field_instances(record, '780', ind1='0', ind2='2') +
            record_get_field_instances(record, '785', ind1='0', ind2='2') +
            record_get_field_instances(record, '787', ind1='0', ind2='8')):
        subfields = field_get_subfield_instances(field)
        subfield_dict = dict(subfields)
        # Build a pseudo-citation from reportnumber/ISBN to reuse the
        # reference matcher for finding the related record's recid.
        subfield_citation = []
        if subfield_dict.get('r'):
            # Reportnumber
            subfield_citation.append(('r', subfield_dict['r']))
        if subfield_dict.get('z'):
            # ISBN
            subfield_citation.append(('i', subfield_dict['z']))
        if 'w' not in subfield_dict and subfield_citation:
            matched_id = get_matched_id(subfield_citation)
            if matched_id:
                subfields.append(('w', str(matched_id)))

    # Enhance CNUMs and Journals
    # NOTE(review): the "'0' not in subfield_dict" checks below read the
    # dict snapshot taken before any appends, so two branches could each
    # add a $0 to the same field — confirm duplicates are acceptable.
    for field in record_get_field_instances(record, '773'):
        subfields = field_get_subfield_instances(field)
        subfield_dict = dict(subfields)
        for code, value in subfields:
            if code == 'w':
                # Conference CNUMs
                recids = perform_request_search(p='111__g:"%s"' % value,
                                                cc='Conferences')
                if len(recids) == 1:
                    subfields.append(('2', str(recids.pop())))
                if '0' not in subfield_dict:
                    recids = perform_request_search(
                        p='773__w:"%s" 980:PROCEEDINGS' % value)
                    if recid in recids:
                        # We remove this very record, since it can be a proceedings
                        recids.remove(recid)
                    if len(recids) == 1:
                        subfields.append(('0', str(recids.pop())))
            elif code == 'p':
                # Journal title
                recids = perform_request_search(p='711__a:"%s"' % value,
                                                cc='Journals')
                if len(recids) == 1:
                    subfields.append(('1', str(recids.pop())))
            elif code == 'z' and '0' not in subfield_dict:
                # ISBN
                recids = find_isbn({'ISBN': value})
                if len(recids) == 1:
                    subfields.append(('0', str(recids.pop())))
            elif code == 'r' and '0' not in subfield_dict:
                # Report
                recids = perform_request_search(p='reportnumber:"%s"' % value)
                if len(recids) == 1:
                    subfields.append(('0', str(recids.pop())))

    # Enhance Experiments
    for field in record_get_field_instances(record, '693'):
        subfields = field_get_subfield_instances(field)
        for code, value in subfields:
            if code == 'e':
                recids = perform_request_search(p='119__a:"%s"' % value,
                                                cc='Experiments')
                if len(recids) == 1:
                    subfields.append(('0', str(recids.pop())))
            elif code == 'a':
                recids = perform_request_search(p='119__b:"%s"' % value,
                                                cc='Experiments')
                if len(recids) == 1:
                    subfields.append(('0', str(recids.pop())))

    # Enhance Experiments
    for field in record_get_field_instances(record, '710'):
        subfields = field_get_subfield_instances(field)
        for code, value in subfields:
            if code == 'g':
                recids = perform_request_search(p='119__a:"%s"' % value,
                                                cc='Experiments')
                if len(recids) == 1:
                    subfields.append(('0', str(recids.pop())))

    # Add Creation date: replace any pre-existing 961 with fresh
    # creation/modification dates taken straight from bibrec.
    if '961' in record:
        del record['961']
    creation_date, modification_date = run_sql(
        "SELECT creation_date, modification_date FROM bibrec WHERE id=%s",
        (recid, ))[0]
    record_add_field(record, '961',
                     subfields=[('x', creation_date.strftime('%Y-%m-%d')),
                                ('c', modification_date.strftime('%Y-%m-%d'))])

    formatted_record = record_xml_output(record)
    if oai:
        # Textual namespace rewrite of the serialized XML: both the bare
        # and the namespaced <record> openings are mapped to marc:record
        # with an added leader, then every element gets the marc: prefix.
        formatted_record = formatted_record.replace(
            "<record>",
            "<marc:record xmlns:marc=\"http://www.loc.gov/MARC21/slim\" xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\" xsi:schemaLocation=\"http://www.loc.gov/MARC21/slim http://www.loc.gov/standards/marcxml/schema/MARC21slim.xsd\" type=\"Bibliographic\">\n <marc:leader>00000coc 2200000uu 4500</marc:leader>"
        )
        formatted_record = formatted_record.replace(
            "<record xmlns=\"http://www.loc.gov/MARC21/slim\">",
            "<marc:record xmlns:marc=\"http://www.loc.gov/MARC21/slim\" xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\" xsi:schemaLocation=\"http://www.loc.gov/MARC21/slim http://www.loc.gov/standards/marcxml/schema/MARC21slim.xsd\" type=\"Bibliographic\">\n <marc:leader>00000coc 2200000uu 4500</marc:leader>"
        )
        # "</record" (no '>') also covers the closing "</record>" tag.
        formatted_record = formatted_record.replace("</record", "</marc:record")
        formatted_record = formatted_record.replace("<controlfield",
                                                    "<marc:controlfield")
        formatted_record = formatted_record.replace("</controlfield",
                                                    "</marc:controlfield")
        formatted_record = formatted_record.replace("<datafield",
                                                    "<marc:datafield")
        formatted_record = formatted_record.replace("</datafield",
                                                    "</marc:datafield")
        formatted_record = formatted_record.replace("<subfield",
                                                    "<marc:subfield")
        formatted_record = formatted_record.replace("</subfield",
                                                    "</marc:subfield")
    return formatted_record
# FIXME: Add some automatic deny/accept parameters, perhaps also bibmatch call insert_records.append(record) else: # Record exists, fetch existing record existing_record = get_record(recid) if existing_record is None: # Did not find existing record in database holdingpen_records.append(record) continue # We remove 500 field temporary/brief entry from revision if record already exists fields_500 = record_get_field_instances(record, '500', ind1="%", ind2="%") if fields_500 is not None: field_positions = [] for field in fields_500: subfields = field_get_subfield_instances(field) for subfield in subfields: if re.match("^.?((temporary|brief) entry).?$", subfield[1].lower(), re.IGNORECASE): field_positions.append((field[1], field[2], field[4])) for ind1, ind2, pos in field_positions: record_delete_field(record, '500', ind1=ind1, ind2=ind2, field_position_global=pos) # Now compare new version with existing one, returning a diff[tag] = (diffcode, [..]) # None - if field is the same for both records # ('r',) - field missing from input record, ignored ATM # ('a',) - new field added, should be updated with append # ('c', difference_comparison) -> if field field_id exists in both records, but it's value has changed # -> uploaded with correct if accepted fields_to_add = [] fields_to_correct = []