def replace_references(recid):
    """Return the marc xml of a record with re-extracted references

    The record itself is not updated; only the returned marc xml carries
    the new 999 fields.

    Parameters:
    * recid: the id of the record

    Returns: the result of record_xml_output for the updated record, or
    None when the record parsed from the extracted references is empty.
    """
    # Parse references
    raw_xml = extract_references_from_record_xml(recid)
    parsed = create_record(raw_xml.encode("utf-8"))
    # Record marc xml
    record = get_record(recid)

    extracted = parsed[0]
    if not extracted:
        return None

    # Replace 999 fields with the ones fetched from the extracted record
    new_fields = record_get_field_instances(extracted,
                                            tag="999",
                                            ind1="%",
                                            ind2="%")
    record_delete_fields(record, "999")
    record_add_fields(record, "999", new_fields)
    # Update record references
    return record_xml_output(record)
def replace_references(recid):
    """Rebuild the references of a record and return the resulting marc xml

    Nothing is written back: the stored record stays as it is and the
    caller receives the marc xml of the document with updated references.

    Parameters:
    * recid: the id of the record

    Returns: marc xml produced by record_xml_output, or None if the
    references record obtained from create_record is empty.
    """
    # Parse references
    references = create_record(
        extract_references_from_record_xml(recid).encode('utf-8'))
    # Record marc xml
    record = get_record(recid)

    out_xml = None
    if references[0]:
        # Replace 999 fields
        replacement_fields = record_get_field_instances(
            references[0], tag='999', ind1='%', ind2='%')
        record_delete_fields(record, '999')
        record_add_fields(record, '999', replacement_fields)
        # Update record references
        out_xml = record_xml_output(record)
    return out_xml
def _filter_fields(self, record, output_fields):
    """Removes from the record all the fields that are not output_fields.

    The record is modified in place and also returned.

    @param record: record structure (@see: bibrecord.py for details)
    @param output_fields: list of fields that should remain in the record

    @return: record containing only fields among output_fields
    """
    # Filtering is done on whole tags only. An older per-field/per-subfield
    # implementation used to sit (unreachable) after the return statement;
    # it was dropped because it led to a bibrecord-related bug, see
    # <https://savannah.cern.ch/task/?10267>.
    #
    # Iterate over a snapshot of the tags: fields are deleted from the
    # record inside the loop, which must not disturb the iteration.
    for tag in list(record.keys()):
        if tag not in output_fields:
            bibrecord.record_delete_fields(record, tag)
    return record
def _filter_fields(self, record, output_fields):
    """Removes from the record all the fields that are not output_fields.

    The record is modified in place and also returned.

    @param record: record structure (@see: bibrecord.py for details)
    @param output_fields: list of fields that should remain in the record

    @return: record containing only fields among output_fields
    """
    # Only whole tags are filtered. The former per-field/per-subfield
    # implementation that followed the return statement could never run
    # and has been removed; it led to a bibrecord-related bug, see
    # <https://savannah.cern.ch/task/?10267>.
    #
    # The tag list is copied first because the loop body deletes fields
    # from the very record being walked.
    for tag in list(record.keys()):
        if tag not in output_fields:
            bibrecord.record_delete_fields(record, tag)
    return record
def replace_references(recid, uid=None, txt=None, url=None):
    """Replace references for a record

    The record itself is not updated, the marc xml of the document with
    updated references is returned

    Parameters:
    * recid: the id of the record
    * uid: passed together with recid to get_cache_file_contents to fetch
           the record being worked on
    * txt: references in text mode; used in preference to url when given
    * url: handed to extract_references_from_url_xml when txt is None

    When neither txt nor url is given the references are extracted from
    the record itself.

    Returns: the result of record_xml_output for the updated record, or
    None when no 999C5 fields were extracted.
    """
    # Parse references
    if txt is not None:
        references_xml = extract_references_from_string_xml(
            txt, is_only_references=True)
    elif url is not None:
        references_xml = extract_references_from_url_xml(url)
    else:
        references_xml = extract_references_from_record_xml(recid)
    parsed = create_record(references_xml.encode('utf-8'))

    # Only the third item of the cache contents (the record) is needed
    cache_contents = get_cache_file_contents(recid, uid)
    (unused1, unused2, record,
     unused3, unused4, unused5, unused6) = cache_contents

    extracted = parsed[0]
    references_to_add = record_get_field_instances(extracted,
                                                   tag='999',
                                                   ind1='C',
                                                   ind2='5')
    refextract_status = record_get_field_instances(extracted,
                                                   tag='999',
                                                   ind1='C',
                                                   ind2='6')
    if not references_to_add:
        return None

    # Replace 999 fields
    record_delete_fields(record, '999')
    record_add_fields(record, '999', references_to_add)
    record_add_fields(record, '999', refextract_status)
    # Update record references
    return record_xml_output(record)
def replace_references(recid, uid=None, txt=None, url=None):
    """Replace references for a record

    The record itself is not updated, the marc xml of the document with
    updated references is returned

    Parameters:
    * recid: the id of the record
    * uid: passed together with recid to get_cache_contents to fetch the
           record being worked on
    * txt: references in text mode; used in preference to url when given
    * url: handed to extract_references_from_url_xml when txt is None

    When neither txt nor url is given the references are extracted from
    the record itself.

    Returns: the result of record_xml_output for the updated record, or
    None when no 999C5 fields were extracted.
    """
    # Parse references
    if txt is not None:
        references_xml = extract_references_from_string_xml(
            txt, is_only_references=True)
    elif url is not None:
        references_xml = extract_references_from_url_xml(url)
    else:
        references_xml = extract_references_from_record_xml(recid)
    parsed = create_record(references_xml)

    # Only the third item of the cache contents (the record) is needed
    cache_contents = get_cache_contents(recid, uid)
    (unused1, unused2, record,
     unused3, unused4, unused5, unused6) = cache_contents

    extracted = parsed[0]
    references_to_add = record_get_field_instances(extracted,
                                                   tag='999',
                                                   ind1='C',
                                                   ind2='5')
    refextract_status = record_get_field_instances(extracted,
                                                   tag='999',
                                                   ind1='C',
                                                   ind2='6')
    if not references_to_add:
        return None

    # Replace 999 fields
    record_delete_fields(record, '999')
    record_add_fields(record, '999', references_to_add)
    record_add_fields(record, '999', refextract_status)
    # Update record references
    return record_xml_output(record)
def apply_filter(rec):
    """ Filters the record to be compatible within Inspire

    The record is modified in place and returned. Handling 856 fields may
    download files (download_url) and convert or unzip them.

    Parameters:
     * rec - dictionary: BibRecord structure

    Returns: dictionary, BibRecord structure
    """
    # NOTE(review): the nesting below was reconstructed from a source whose
    # whitespace had been collapsed - confirm against the upstream file.

    # Move recid from 001 to 035 if not hidden
    cds_id = rec['001'][0][3]
    if 'hidden' not in [x.lower() for x in
                        record_get_field_values(rec, "980", code="a")]:
        record_add_field(rec, '035', subfields=[('9', 'CDS'), ('a', cds_id)])
    # Clear control fields
    record_strip_controlfields(rec)

    # Clear other uninteresting fields
    interesting_fields = ["024", "041", "035", "037", "088", "100",
                          "110", "111", "242", "245", "246", "260",
                          "269", "300", "502", "650", "653", "693",
                          "700", "710", "773", "856", "520", "500",
                          "980"]
    # Snapshot the tags: fields are deleted from rec while looping.
    for tag in list(rec.keys()):
        if tag not in interesting_fields:
            record_delete_fields(rec, tag)

    # 980 Determine Collections
    collections = set()
    for value in record_get_field_values(rec, '980', code='a'):
        if 'NOTE' in value.upper():
            collections.add('NOTE')
        if 'THESIS' in value.upper():
            collections.add('THESIS')
        if 'CONFERENCEPAPER' in value.upper():
            collections.add('ConferencePaper')

    if is_published(rec):
        collections.add("PUBLISHED")
        collections.add("CITEABLE")

    if 'NOTE' not in collections:
        # TODO: Move this to a KB
        kb = ['ATLAS-CONF-', 'CMS-PAS-', 'ATL-', 'CMS-DP-',
              'ALICE-INT-', 'LHCb-PUB-']
        values = record_get_field_values(rec, "088", code='a')
        for val, rep in product(values, kb):
            if val.startswith(rep):
                collections.add('NOTE')
                break

    # 980 Arxiv tag
    if record_get_field_values(rec, '035', filter_subfield_code="a",
                               filter_subfield_value="arXiv"):
        collections.add("arXiv")

    # 980 HEP && CORE
    collections.add('HEP')
    collections.add('CORE')

    # 980 Conference Note
    if 'ConferencePaper' not in collections:
        for value in record_get_field_values(rec, '962', code='n'):
            if value[-2:].isdigit():
                collections.add('ConferencePaper')
                break

    # The 980 fields are rebuilt from `collections` at the very end.
    record_delete_fields(rec, "980")

    intnote = record_get_field_values(rec, '690', filter_subfield_code="a",
                                      filter_subfield_value='INTNOTE')
    if intnote:
        val_088 = record_get_field_values(rec, '088',
                                          filter_subfield_code="a")
        for val in val_088:
            if 'CMS' in val:
                url = ('http://weblib.cern.ch/abstract?CERN-CMS' +
                       val.split('CMS', 1)[-1])
                record_add_field(rec, '856', ind1='4',
                                 subfields=[('u', url)])

    # 041 Language
    languages = get_languages()
    language_fields = record_get_field_instances(rec, '041')
    record_delete_fields(rec, "041")
    for field in language_fields:
        subs = field_get_subfields(field)
        if 'a' in subs:
            # Fields containing "eng" are dropped rather than translated
            if "eng" in subs['a']:
                continue
            new_value = translate_config(subs['a'][0], languages)
            new_subs = [('a', new_value)]
            record_add_field(rec, "041", subfields=new_subs)

    # 035 Externals
    scn_035_fields = record_get_field_instances(rec, '035')
    forbidden_values = ["cercer",
                        "inspire",
                        "xx",
                        "cern annual report",
                        "cmscms",
                        "wai01"]
    for field in scn_035_fields:
        subs = field_get_subfields(field)
        if '9' in subs:
            if 'a' not in subs:
                continue
            for sub in subs['9']:
                if sub.lower() in forbidden_values:
                    break
            else:
                # No forbidden values (We did not "break")
                suffixes = [s.lower() for s in subs['9']]
                if 'spires' in suffixes:
                    new_subs = [('a', 'SPIRES-%s' % subs['a'][0])]
                    record_add_field(rec, '970', subfields=new_subs)
                    continue
        if 'a' in subs:
            for sub in subs['a']:
                if sub.lower() in forbidden_values:
                    record_delete_field(rec, tag="035",
                                        field_position_global=field[4])

    # 088 report numbers: archive codes go to 595, the rest to 037
    rep_088_fields = record_get_field_instances(rec, '088')
    for field in rep_088_fields:
        subs = field_get_subfields(field)
        if '9' in subs:
            for val in subs['9']:
                if val.startswith('P0') or val.startswith('CM-P0'):
                    sf = [('9', 'CERN'), ('b', val)]
                    record_add_field(rec, '595', subfields=sf)
        for key, val in field[0]:
            if key in ['a', '9'] and not val.startswith('SIS-'):
                record_add_field(rec, '037', subfields=[('a', val)])
    record_delete_fields(rec, "088")

    # 037 Externals also...
    rep_037_fields = record_get_field_instances(rec, '037')
    for field in rep_037_fields:
        subs = field_get_subfields(field)
        if 'a' in subs:
            for value in subs['a']:
                if 'arXiv' in value:
                    new_subs = [('a', value), ('9', 'arXiv')]
                    # First $a of every 695 field is appended as $c
                    for fld in record_get_field_instances(rec, '695'):
                        for key, val in field_get_subfield_instances(fld):
                            if key == 'a':
                                new_subs.append(('c', val))
                                break
                    nf = create_field(subfields=new_subs)
                    record_replace_field(rec, '037', nf, field[4])
        for key, val in field[0]:
            if key in ['a', '9'] and val.startswith('SIS-'):
                record_delete_field(rec, '037',
                                    field_position_global=field[4])

    # 242 -> 246
    for field in record_get_field_instances(rec, '242'):
        record_add_field(rec, '246', subfields=field[0])
    record_delete_fields(rec, '242')

    # 269 Date normalization
    for field in record_get_field_instances(rec, '269'):
        for idx, (key, value) in enumerate(field[0]):
            if key == "c":
                field[0][idx] = ("c", convert_date_to_iso(value))
                # NOTE(review): placement under the `c` branch is
                # reconstructed - confirm against upstream.
                record_delete_fields(rec, "260")

    if 'THESIS' not in collections:
        for field in record_get_field_instances(rec, '260'):
            record_add_field(rec, '269', subfields=field[0])
        record_delete_fields(rec, '260')

    # 300 page number
    for field in record_get_field_instances(rec, '300'):
        for idx, (key, value) in enumerate(field[0]):
            if key == 'a':
                if "mult." not in value and value != " p":
                    # Keep digits and dashes only
                    field[0][idx] = ('a', re.sub(r'[^\d-]+', '', value))
                else:
                    record_delete_field(rec, '300',
                                        field_position_global=field[4])
                    break

    # 100 & 700 punctuate author names
    author_names = record_get_field_instances(rec, '100')
    author_names.extend(record_get_field_instances(rec, '700'))
    for field in author_names:
        subs = field_get_subfields(field)
        if 'i' not in subs or 'XX' in subs['i']:
            if 'j' not in subs or 'YY' in subs['j']:
                for idx, (key, value) in enumerate(field[0]):
                    if key == 'a':
                        field[0][idx] = ('a', punctuate_authorname(value))

    # 700 -> 701 Thesis supervisors
    if 'THESIS' in collections:
        for field in record_get_field_instances(rec, '700'):
            record_add_field(rec, '701', subfields=field[0])
        record_delete_fields(rec, '700')

    # 501 move subfields
    # NOTE(review): the swapped field is stored only in the local list
    # `fields_501`; whether rec sees it depends on what
    # record_get_field_instances / field_swap_subfields return - confirm.
    fields_501 = record_get_field_instances(rec, '502')
    for idx, field in enumerate(fields_501):
        new_subs = []
        for key, value in field[0]:
            if key == 'a':
                new_subs.append(('b', value))
            elif key == 'b':
                new_subs.append(('c', value))
            elif key == 'c':
                new_subs.append(('d', value))
            else:
                new_subs.append((key, value))
        fields_501[idx] = field_swap_subfields(field, new_subs)

    # 650 Translate Categories
    categories = get_categories()
    category_fields = record_get_field_instances(rec, '650',
                                                 ind1='1', ind2='7')
    record_delete_fields(rec, "650")
    for field in category_fields:
        for idx, (key, value) in enumerate(field[0]):
            if key == 'a':
                new_value = translate_config(value, categories)
                if new_value != value:
                    new_subs = [('2', 'INSPIRE'), ('a', new_value)]
                else:
                    new_subs = [('2', 'SzGeCERN'), ('a', value)]
                record_add_field(rec, "650", ind1="1", ind2="7",
                                 subfields=new_subs)
                break

    # 653 Free Keywords
    for field in record_get_field_instances(rec, '653', ind1='1'):
        subs = field_get_subfields(field)
        new_subs = []
        if 'a' in subs:
            for val in subs['a']:
                new_subs.extend([('9', 'author'), ('a', val)])
        new_field = create_field(subfields=new_subs, ind1='1')
        record_replace_field(rec, '653', new_field,
                             field_position_global=field[4])

    experiments = get_experiments()
    # 693 Remove if 'not applicable'
    for field in record_get_field_instances(rec, '693'):
        subs = field_get_subfields(field)
        all_subs = subs.get('a', []) + subs.get('e', [])
        if 'not applicable' in [x.lower() for x in all_subs]:
            # NOTE(review): the field is deleted here, yet the code below
            # still deletes it again and adds a translated 693 - confirm
            # whether a `continue` was intended.
            record_delete_field(rec, '693',
                                field_position_global=field[4])
        new_subs = []
        experiment_a = ""
        experiment_e = ""
        for (key, value) in subs.items():
            if key == 'a':
                experiment_a = value[0]
                new_subs.append((key, value[0]))
            elif key == 'e':
                experiment_e = value[0]
        experiment = "%s---%s" % (experiment_a.replace(" ", "-"),
                                  experiment_e)
        translated_experiments = translate_config(experiment,
                                                  experiments)
        new_subs.append(("e", translated_experiments))
        record_delete_field(rec, tag="693",
                            field_position_global=field[4])
        record_add_field(rec, "693", subfields=new_subs)

    # 710 Collaboration
    for field in record_get_field_instances(rec, '710'):
        subs = field_get_subfield_instances(field)
        # Filter in place with a slice assignment. Popping by indexes taken
        # from a copy (as done previously) goes wrong after the first
        # removal: the indexes are stale, so another subfield is removed
        # or an IndexError is raised.
        subs[:] = [(key, value) for (key, value) in subs
                   if key != '5' and not value.startswith('CERN. Geneva')]
        if not subs:
            record_delete_field(rec, '710',
                                field_position_global=field[4])

    # 773 journal translations
    journals = get_journals()
    for field in record_get_field_instances(rec, '773'):
        subs = field_get_subfield_instances(field)
        new_subs = []
        for idx, (key, value) in enumerate(subs):
            if key == 'p':
                new_subs.append((key, translate_config(value, journals)))
            else:
                new_subs.append((key, value))
        record_delete_field(rec, tag="773",
                            field_position_global=field[4])
        record_add_field(rec, "773", subfields=new_subs)

    # FFT (856) Dealing with graphs
    figure_counter = 0
    for field in record_get_field_instances(rec, '856', ind1='4'):
        subs = field_get_subfields(field)

        newsubs = []      # subfields of the FFT field replacing this 856
        remove = False    # whether this 856 field gets deleted

        if 'z' in subs:
            is_figure = [s for s in subs['z'] if "figure" in s.lower()]
            if is_figure and 'u' in subs:
                is_subformat = [s for s in subs['u']
                                if "subformat" in s.lower()]
                if not is_subformat:
                    url = subs['u'][0]
                    if url.endswith(".pdf"):
                        # We try to convert
                        fd, local_url = mkstemp(
                            suffix=os.path.basename(url),
                            dir=CFG_TMPSHAREDDIR)
                        os.close(fd)
                        _print("Downloading %s into %s" % (url, local_url),
                               verbose=5)
                        plotfile = ""
                        try:
                            plotfile = download_url(
                                url=url,
                                download_to_file=local_url,
                                timeout=30.0)
                        except InvenioFileDownloadError:
                            _print("Download failed while attempting to reach %s. Skipping.." % (url, ))
                            remove = True
                        if plotfile:
                            converted = convert_images([plotfile])
                            if converted:
                                url = converted.pop()
                                _print("Successfully converted %s to %s"
                                       % (local_url, url), verbose=5)
                            else:
                                _print("Conversion failed on %s"
                                       % (local_url, ))
                                url = None
                                remove = True
                    if url:
                        newsubs.append(('a', url))
                        newsubs.append(('t', 'Plot'))
                        figure_counter += 1
                        if 'y' in subs:
                            newsubs.append(
                                ('d', "%05d %s" % (figure_counter,
                                                   subs['y'][0])))
                            newsubs.append(('n', subs['y'][0]))
                        else:
                            # Get basename without extension.
                            name = os.path.basename(
                                os.path.splitext(subs['u'][0])[0])
                            newsubs.append(
                                ('d', "%05d %s" % (figure_counter, name)))
                            newsubs.append(('n', name))

        if not newsubs and 'u' in subs:
            is_fulltext = [s for s in subs['u'] if ".pdf" in s]
            if is_fulltext:
                newsubs = [('t', 'INSPIRE-PUBLIC'), ('a', subs['u'][0])]

        if not newsubs and 'u' in subs:
            remove = True
            is_zipfile = [s for s in subs['u'] if ".zip" in s]
            if is_zipfile:
                url = is_zipfile[0]
                local_url = os.path.join(CFG_TMPSHAREDDIR,
                                         os.path.basename(url))
                _print("Downloading %s into %s" % (url, local_url),
                       verbose=5)
                zipped_archive = ""
                try:
                    zipped_archive = download_url(
                        url=is_zipfile[0],
                        download_to_file=local_url,
                        timeout=30.0)
                except InvenioFileDownloadError:
                    _print("Download failed while attempting to reach %s. Skipping.." % (is_zipfile[0], ))
                    remove = True
                if zipped_archive:
                    unzipped_archive = unzip(zipped_archive)
                    list_of_pngs = locate("*.png", unzipped_archive)
                    for png in list_of_pngs:
                        if "_vti_" in png or "__MACOSX" in png:
                            continue
                        figure_counter += 1
                        plotsubs = []
                        plotsubs.append(('a', png))
                        caption = '%05d %s' % (figure_counter,
                                               os.path.basename(png))
                        plotsubs.append(('d', caption))
                        plotsubs.append(('t', 'Plot'))
                        record_add_field(rec, 'FFT', subfields=plotsubs)

        if not remove and not newsubs and 'u' in subs:
            urls = ('http://cdsweb.cern.ch', 'http://cms.cern.ch',
                    'http://cmsdoc.cern.ch', 'http://documents.cern.ch',
                    'http://preprints.cern.ch', 'http://cds.cern.ch')
            for val in subs['u']:
                if any(url in val for url in urls):
                    remove = True
                    break
                if val.endswith('ps.gz'):
                    remove = True

        if newsubs:
            record_add_field(rec, 'FFT', subfields=newsubs)
            remove = True

        if remove:
            record_delete_field(rec, '856', ind1='4',
                                field_position_global=field[4])

    # 500 - Preliminary results
    if "THESIS" not in collections:
        subs = [('a', "Preliminary results")]
        record_add_field(rec, "500", subfields=subs)

    for collection in collections:
        record_add_field(rec, '980', subfields=[('a', collection)])

    return rec
def apply_filter(rec):
    """ Filters the record to be compatible within Inspire

    The record is modified in place and returned. Handling 856 fields may
    download files (download_url) and convert or unzip them.

    Parameters:
     * rec - dictionary: BibRecord structure

    Returns: dictionary, BibRecord structure
    """
    # NOTE(review): the nesting below was reconstructed from a source whose
    # whitespace had been collapsed - confirm against the upstream file.

    # Move recid from 001 to 035 if not hidden
    cds_id = rec['001'][0][3]
    if 'hidden' not in [x.lower() for x in
                        record_get_field_values(rec, "980", code="a")]:
        record_add_field(rec, '035', subfields=[('9', 'CDS'), ('a', cds_id)])
    # Clear control fields
    record_strip_controlfields(rec)

    # Clear other uninteresting fields
    interesting_fields = ["024", "041", "035", "037", "088", "100",
                          "110", "111", "242", "245", "246", "260",
                          "269", "300", "502", "650", "653", "693",
                          "700", "710", "773", "856", "520", "500",
                          "980"]
    # Snapshot the tags: fields are deleted from rec while looping.
    for tag in list(rec.keys()):
        if tag not in interesting_fields:
            record_delete_fields(rec, tag)

    # 980 Determine Collections
    collections = set()
    for value in record_get_field_values(rec, '980', code='a'):
        if 'NOTE' in value.upper():
            collections.add('NOTE')
        if 'THESIS' in value.upper():
            collections.add('THESIS')
        if 'CONFERENCEPAPER' in value.upper():
            collections.add('ConferencePaper')

    if is_published(rec):
        collections.add("PUBLISHED")
        collections.add("CITEABLE")

    if 'NOTE' not in collections:
        # TODO: Move this to a KB
        kb = ['ATLAS-CONF-', 'CMS-PAS-', 'ATL-', 'CMS-DP-',
              'ALICE-INT-', 'LHCb-PUB-']
        values = record_get_field_values(rec, "088", code='a')
        for val, rep in product(values, kb):
            if val.startswith(rep):
                collections.add('NOTE')
                break

    # 980 Arxiv tag
    if record_get_field_values(rec, '035', filter_subfield_code="a",
                               filter_subfield_value="arXiv"):
        collections.add("arXiv")

    # 980 HEP && CORE
    collections.add('HEP')
    collections.add('CORE')

    # 980 Conference Note
    if 'ConferencePaper' not in collections:
        for value in record_get_field_values(rec, '962', code='n'):
            if value[-2:].isdigit():
                collections.add('ConferencePaper')
                break

    # The 980 fields are rebuilt from `collections` at the very end.
    record_delete_fields(rec, "980")

    intnote = record_get_field_values(rec, '690', filter_subfield_code="a",
                                      filter_subfield_value='INTNOTE')
    if intnote:
        val_088 = record_get_field_values(rec, '088',
                                          filter_subfield_code="a")
        for val in val_088:
            if 'CMS' in val:
                url = ('http://weblib.cern.ch/abstract?CERN-CMS' +
                       val.split('CMS', 1)[-1])
                record_add_field(rec, '856', ind1='4',
                                 subfields=[('u', url)])

    # 041 Language
    languages = get_languages()
    language_fields = record_get_field_instances(rec, '041')
    record_delete_fields(rec, "041")
    for field in language_fields:
        subs = field_get_subfields(field)
        if 'a' in subs:
            # Fields containing "eng" are dropped rather than translated
            if "eng" in subs['a']:
                continue
            new_value = translate_config(subs['a'][0], languages)
            new_subs = [('a', new_value)]
            record_add_field(rec, "041", subfields=new_subs)

    # 035 Externals
    scn_035_fields = record_get_field_instances(rec, '035')
    forbidden_values = ["cercer",
                        "inspire",
                        "xx",
                        "cern annual report",
                        "cmscms",
                        "wai01"]
    for field in scn_035_fields:
        subs = field_get_subfields(field)
        if '9' in subs:
            if 'a' not in subs:
                continue
            for sub in subs['9']:
                if sub.lower() in forbidden_values:
                    break
            else:
                # No forbidden values (We did not "break")
                suffixes = [s.lower() for s in subs['9']]
                if 'spires' in suffixes:
                    new_subs = [('a', 'SPIRES-%s' % subs['a'][0])]
                    record_add_field(rec, '970', subfields=new_subs)
                    continue
        if 'a' in subs:
            for sub in subs['a']:
                if sub.lower() in forbidden_values:
                    record_delete_field(rec, tag="035",
                                        field_position_global=field[4])

    # 088 report numbers: archive codes go to 595, the rest to 037
    rep_088_fields = record_get_field_instances(rec, '088')
    for field in rep_088_fields:
        subs = field_get_subfields(field)
        if '9' in subs:
            for val in subs['9']:
                if val.startswith('P0') or val.startswith('CM-P0'):
                    sf = [('9', 'CERN'), ('b', val)]
                    record_add_field(rec, '595', subfields=sf)
        for key, val in field[0]:
            if key in ['a', '9'] and not val.startswith('SIS-'):
                record_add_field(rec, '037', subfields=[('a', val)])
    record_delete_fields(rec, "088")

    # 037 Externals also...
    rep_037_fields = record_get_field_instances(rec, '037')
    for field in rep_037_fields:
        subs = field_get_subfields(field)
        if 'a' in subs:
            for value in subs['a']:
                if 'arXiv' in value:
                    new_subs = [('a', value), ('9', 'arXiv')]
                    # First $a of every 695 field is appended as $c
                    for fld in record_get_field_instances(rec, '695'):
                        for key, val in field_get_subfield_instances(fld):
                            if key == 'a':
                                new_subs.append(('c', val))
                                break
                    nf = create_field(subfields=new_subs)
                    record_replace_field(rec, '037', nf, field[4])
        for key, val in field[0]:
            if key in ['a', '9'] and val.startswith('SIS-'):
                record_delete_field(rec, '037',
                                    field_position_global=field[4])

    # 242 -> 246
    for field in record_get_field_instances(rec, '242'):
        record_add_field(rec, '246', subfields=field[0])
    record_delete_fields(rec, '242')

    # 269 Date normalization
    for field in record_get_field_instances(rec, '269'):
        for idx, (key, value) in enumerate(field[0]):
            if key == "c":
                field[0][idx] = ("c", convert_date_to_iso(value))
                # NOTE(review): placement under the `c` branch is
                # reconstructed - confirm against upstream.
                record_delete_fields(rec, "260")

    if 'THESIS' not in collections:
        for field in record_get_field_instances(rec, '260'):
            record_add_field(rec, '269', subfields=field[0])
        record_delete_fields(rec, '260')

    # 300 page number
    for field in record_get_field_instances(rec, '300'):
        for idx, (key, value) in enumerate(field[0]):
            if key == 'a':
                if "mult." not in value and value != " p":
                    # Keep digits and dashes only
                    field[0][idx] = ('a', re.sub(r'[^\d-]+', '', value))
                else:
                    record_delete_field(rec, '300',
                                        field_position_global=field[4])
                    break

    # 100 & 700 punctuate author names
    author_names = record_get_field_instances(rec, '100')
    author_names.extend(record_get_field_instances(rec, '700'))
    for field in author_names:
        subs = field_get_subfields(field)
        if 'i' not in subs or 'XX' in subs['i']:
            if 'j' not in subs or 'YY' in subs['j']:
                for idx, (key, value) in enumerate(field[0]):
                    if key == 'a':
                        field[0][idx] = ('a', punctuate_authorname(value))

    # 700 -> 701 Thesis supervisors
    if 'THESIS' in collections:
        for field in record_get_field_instances(rec, '700'):
            record_add_field(rec, '701', subfields=field[0])
        record_delete_fields(rec, '700')

    # 501 move subfields
    # NOTE(review): the swapped field is stored only in the local list
    # `fields_501`; whether rec sees it depends on what
    # record_get_field_instances / field_swap_subfields return - confirm.
    fields_501 = record_get_field_instances(rec, '502')
    for idx, field in enumerate(fields_501):
        new_subs = []
        for key, value in field[0]:
            if key == 'a':
                new_subs.append(('b', value))
            elif key == 'b':
                new_subs.append(('c', value))
            elif key == 'c':
                new_subs.append(('d', value))
            else:
                new_subs.append((key, value))
        fields_501[idx] = field_swap_subfields(field, new_subs)

    # 650 Translate Categories
    categories = get_categories()
    category_fields = record_get_field_instances(rec, '650',
                                                 ind1='1', ind2='7')
    record_delete_fields(rec, "650")
    for field in category_fields:
        for idx, (key, value) in enumerate(field[0]):
            if key == 'a':
                new_value = translate_config(value, categories)
                if new_value != value:
                    new_subs = [('2', 'INSPIRE'), ('a', new_value)]
                else:
                    new_subs = [('2', 'SzGeCERN'), ('a', value)]
                record_add_field(rec, "650", ind1="1", ind2="7",
                                 subfields=new_subs)
                break

    # 653 Free Keywords
    for field in record_get_field_instances(rec, '653', ind1='1'):
        subs = field_get_subfields(field)
        new_subs = []
        if 'a' in subs:
            for val in subs['a']:
                new_subs.extend([('9', 'author'), ('a', val)])
        new_field = create_field(subfields=new_subs, ind1='1')
        record_replace_field(rec, '653', new_field,
                             field_position_global=field[4])

    experiments = get_experiments()
    # 693 Remove if 'not applicable'
    for field in record_get_field_instances(rec, '693'):
        subs = field_get_subfields(field)
        all_subs = subs.get('a', []) + subs.get('e', [])
        if 'not applicable' in [x.lower() for x in all_subs]:
            # NOTE(review): the field is deleted here, yet the code below
            # still deletes it again and adds a translated 693 - confirm
            # whether a `continue` was intended.
            record_delete_field(rec, '693',
                                field_position_global=field[4])
        new_subs = []
        experiment_a = ""
        experiment_e = ""
        for (key, value) in subs.items():
            if key == 'a':
                experiment_a = value[0]
                new_subs.append((key, value[0]))
            elif key == 'e':
                experiment_e = value[0]
        experiment = "%s---%s" % (experiment_a.replace(" ", "-"),
                                  experiment_e)
        translated_experiments = translate_config(experiment,
                                                  experiments)
        new_subs.append(("e", translated_experiments))
        record_delete_field(rec, tag="693",
                            field_position_global=field[4])
        record_add_field(rec, "693", subfields=new_subs)

    # 710 Collaboration
    for field in record_get_field_instances(rec, '710'):
        subs = field_get_subfield_instances(field)
        # Filter in place with a slice assignment. Popping by indexes taken
        # from a copy (as done previously) goes wrong after the first
        # removal: the indexes are stale, so another subfield is removed
        # or an IndexError is raised.
        subs[:] = [(key, value) for (key, value) in subs
                   if key != '5' and not value.startswith('CERN. Geneva')]
        if not subs:
            record_delete_field(rec, '710',
                                field_position_global=field[4])

    # 773 journal translations
    journals = get_journals()
    for field in record_get_field_instances(rec, '773'):
        subs = field_get_subfield_instances(field)
        new_subs = []
        for idx, (key, value) in enumerate(subs):
            if key == 'p':
                new_subs.append((key, translate_config(value, journals)))
            else:
                new_subs.append((key, value))
        record_delete_field(rec, tag="773",
                            field_position_global=field[4])
        record_add_field(rec, "773", subfields=new_subs)

    # FFT (856) Dealing with graphs
    figure_counter = 0
    for field in record_get_field_instances(rec, '856', ind1='4'):
        subs = field_get_subfields(field)

        newsubs = []      # subfields of the FFT field replacing this 856
        remove = False    # whether this 856 field gets deleted

        if 'z' in subs:
            is_figure = [s for s in subs['z'] if "figure" in s.lower()]
            if is_figure and 'u' in subs:
                is_subformat = [s for s in subs['u']
                                if "subformat" in s.lower()]
                if not is_subformat:
                    url = subs['u'][0]
                    if url.endswith(".pdf"):
                        # We try to convert
                        fd, local_url = mkstemp(
                            suffix=os.path.basename(url),
                            dir=CFG_TMPSHAREDDIR)
                        os.close(fd)
                        _print("Downloading %s into %s" % (url, local_url),
                               verbose=5)
                        plotfile = ""
                        try:
                            plotfile = download_url(
                                url=url,
                                download_to_file=local_url,
                                timeout=30.0)
                        except InvenioFileDownloadError:
                            _print("Download failed while attempting to reach %s. Skipping.." % (url,))
                            remove = True
                        if plotfile:
                            converted = convert_images([plotfile])
                            if converted:
                                url = converted.pop()
                                _print("Successfully converted %s to %s"
                                       % (local_url, url), verbose=5)
                            else:
                                _print("Conversion failed on %s"
                                       % (local_url,))
                                url = None
                                remove = True
                    if url:
                        newsubs.append(('a', url))
                        newsubs.append(('t', 'Plot'))
                        figure_counter += 1
                        if 'y' in subs:
                            newsubs.append(
                                ('d', "%05d %s" % (figure_counter,
                                                   subs['y'][0])))
                            newsubs.append(('n', subs['y'][0]))
                        else:
                            # Get basename without extension.
                            name = os.path.basename(
                                os.path.splitext(subs['u'][0])[0])
                            newsubs.append(
                                ('d', "%05d %s" % (figure_counter, name)))
                            newsubs.append(('n', name))

        if not newsubs and 'u' in subs:
            # PDF/A subformats are not treated as the public fulltext
            is_fulltext = [s for s in subs['u']
                           if ".pdf" in s and not "subformat=pdfa" in s]
            if is_fulltext:
                newsubs = [('t', 'INSPIRE-PUBLIC'), ('a', subs['u'][0])]

        if not newsubs and 'u' in subs:
            remove = True
            is_zipfile = [s for s in subs['u'] if ".zip" in s]
            if is_zipfile:
                url = is_zipfile[0]
                local_url = os.path.join(CFG_TMPSHAREDDIR,
                                         os.path.basename(url))
                _print("Downloading %s into %s" % (url, local_url),
                       verbose=5)
                zipped_archive = ""
                try:
                    zipped_archive = download_url(
                        url=is_zipfile[0],
                        download_to_file=local_url,
                        timeout=30.0)
                except InvenioFileDownloadError:
                    _print("Download failed while attempting to reach %s. Skipping.." % (is_zipfile[0],))
                    remove = True
                if zipped_archive:
                    unzipped_archive = unzip(zipped_archive)
                    list_of_pngs = locate("*.png", unzipped_archive)
                    for png in list_of_pngs:
                        if "_vti_" in png or "__MACOSX" in png:
                            continue
                        figure_counter += 1
                        plotsubs = []
                        plotsubs.append(('a', png))
                        caption = '%05d %s' % (figure_counter,
                                               os.path.basename(png))
                        plotsubs.append(('d', caption))
                        plotsubs.append(('t', 'Plot'))
                        record_add_field(rec, 'FFT', subfields=plotsubs)

        if not remove and not newsubs and 'u' in subs:
            urls = ('http://cdsweb.cern.ch', 'http://cms.cern.ch',
                    'http://cmsdoc.cern.ch', 'http://documents.cern.ch',
                    'http://preprints.cern.ch', 'http://cds.cern.ch')
            for val in subs['u']:
                if any(url in val for url in urls):
                    remove = True
                    break
                if val.endswith('ps.gz'):
                    remove = True

        if newsubs:
            record_add_field(rec, 'FFT', subfields=newsubs)
            remove = True

        if remove:
            record_delete_field(rec, '856', ind1='4',
                                field_position_global=field[4])

    # 500 - Preliminary results
    if "THESIS" not in collections:
        subs = [('a', "Preliminary results")]
        record_add_field(rec, "500", subfields=subs)

    for collection in collections:
        record_add_field(rec, '980', subfields=[('a', collection)])

    return rec
def apply_filter(rec):
    """
    Filters a Zenodo record to be compatible within Inspire.

    The record is modified in place and the same object is returned.
    Steps, in order:
     * copy the 001 value into 035 (source 'Zenodo') unless a 980__a
       value equals 'hidden' (case-insensitive)
     * strip control fields and every tag not listed in
       ``interesting_fields``
     * rewrite 520 $a as 520 $9 Zenodo / $h
     * punctuate author names in 100 and 700
     * resolve 773 'isSupplementTo' links (DOI / arXiv) into 786 $w
     * add the mandatory 786 $q and 980 $a DATA fields

    Parameters:
     * rec - dictionary: BibRecord structure

    Returns: dictionary, BibRecord structure
    """
    # Move recid from 001 to 035 if not hidden
    zenodo_id = rec['001'][0][3]
    collection_values = record_get_field_values(rec, "980", code="a")
    if 'hidden' not in [value.lower() for value in collection_values]:
        record_add_field(rec, '035',
                         subfields=[('9', 'Zenodo'), ('a', zenodo_id)])

    # Clear control fields
    record_strip_controlfields(rec)

    # Clear other uninteresting fields
    interesting_fields = ["024", "035", "100", "245", "260",
                          "700", "710", "773", "856", "520", "500"]
    # Loop over a snapshot of the tags: fields are removed from the record
    # while looping, which must not disturb the iteration.
    for tag in list(rec.keys()):
        if tag not in interesting_fields:
            record_delete_fields(rec, tag)

    # 520 descriptions: keep only $a, re-labelled as $h with source Zenodo
    descriptions = record_get_field_instances(rec, '520')
    record_delete_fields(rec, '520')
    for desc in descriptions:
        subs = field_get_subfields(desc)
        if 'a' in subs:
            record_add_field(rec, "520",
                             subfields=[('9', 'Zenodo'), ('h', subs['a'][0])])

    # 100 & 700 punctuate author names
    author_names = record_get_field_instances(rec, '100')
    author_names.extend(record_get_field_instances(rec, '700'))
    for field in author_names:
        subs = field_get_subfields(field)
        # Only rewrite names whose $i / $j are absent or hold the
        # 'XX' / 'YY' values.
        if 'i' not in subs or 'XX' in subs['i']:
            if 'j' not in subs or 'YY' in subs['j']:
                for idx, (key, value) in enumerate(field[0]):
                    if key == 'a':
                        field[0][idx] = ('a', punctuate_authorname(value))

    # 773 is cited by, DOI of the extended paper
    # match the INSPIRE record ID of that paper and add it in 786__w
    for field in record_get_field_instances(rec, '773'):
        subs = field_get_subfields(field)
        if 'i' not in subs or 'isSupplementTo' not in subs['i']:
            continue
        # Without an identifier ($a) or a scheme ($n) there is nothing to
        # search for; skip instead of failing on the missing subfield.
        if 'a' not in subs or 'n' not in subs:
            continue
        identifier = subs['a'][0]
        schemes = [scheme.lower() for scheme in subs['n']]
        if "doi" in schemes:
            paper_recid = perform_request_search(
                p="0247_a:%s" % identifier,
                of="id"
            )
            if paper_recid:
                record_add_field(rec, "786",
                                 subfields=[('w', str(paper_recid[0]))])
        if "arxiv" in schemes:
            paper_recid = perform_request_search(
                p="037__a:%s" % identifier,
                of="id"
            )
            if paper_recid:
                record_add_field(rec, "786",
                                 subfields=[('w', str(paper_recid[0]))])

    # Other mandatory fields
    # 786 formatting
    record_add_field(rec, "786", subfields=[('q', '0')])

    # 980 only DATA Collection
    record_add_field(rec, '980', subfields=[('a', 'DATA')])

    return rec
def apply_filter(rec):
    """
    Filters a CDS record to be compatible within Inspire.

    The record is modified in place and the same object is returned.
    Besides editing MARC fields, the 856 pass downloads figure PDFs and
    zip archives into CFG_TMPSHAREDDIR and registers the results as FFT
    fields.

    Parameters:
     * rec - dictionary: BibRecord structure

    Returns: dictionary, BibRecord structure
    """
    # Move recid from 001 to 035 if not hidden
    cds_id = rec["001"][0][3]
    if "hidden" not in [x.lower() for x in record_get_field_values(rec, "980", code="a")]:
        record_add_field(rec, "035", subfields=[("9", "CDS"), ("a", cds_id)])

    # Clear control fields
    record_strip_controlfields(rec)

    # Clear other uninteresting fields
    interesting_fields = [
        "024", "041", "035", "037", "088", "100", "110", "111", "242",
        "245", "246", "260", "269", "300", "502", "650", "653", "693",
        "700", "710", "773", "856", "520", "500", "980",
    ]
    # Loop over a snapshot of the tags: fields are removed from the record
    # while looping, which must not disturb the iteration.
    for tag in list(rec.keys()):
        if tag not in interesting_fields:
            record_delete_fields(rec, tag)

    # 980 Determine Collections
    collections = set()
    for value in record_get_field_values(rec, "980", code="a"):
        upper_value = value.upper()
        if "NOTE" in upper_value:
            collections.add("NOTE")
        if "THESIS" in upper_value:
            collections.add("THESIS")
        if "CONFERENCEPAPER" in upper_value:
            collections.add("ConferencePaper")

    if is_published(rec):
        collections.add("PUBLISHED")
        collections.add("CITEABLE")

    if "NOTE" not in collections:
        # TODO: Move this to a KB
        kb = ["ATLAS-CONF-", "CMS-PAS-", "ATL-", "CMS-DP-", "ALICE-INT-", "LHCb-PUB-"]
        values = record_get_field_values(rec, "088", code="a")
        for val, rep in product(values, kb):
            if val.startswith(rep):
                collections.add("NOTE")
                break

    # 980 Arxiv tag
    if record_get_field_values(rec, "035", filter_subfield_code="a",
                               filter_subfield_value="arXiv"):
        collections.add("arXiv")

    # 980 HEP && CORE
    collections.add("HEP")
    collections.add("CORE")

    # 980 Conference Note
    # NOTE(review): 962 is not in interesting_fields, so it has presumably
    # been removed above and this lookup finds nothing -- confirm intent.
    if "ConferencePaper" not in collections:
        for value in record_get_field_values(rec, "962", code="n"):
            if value[-2:].isdigit():
                collections.add("ConferencePaper")
                break

    # The 980 fields are rebuilt from `collections` at the very end.
    record_delete_fields(rec, "980")

    # NOTE(review): 690 is not in interesting_fields either -- confirm that
    # this INTNOTE check can still match.
    intnote = record_get_field_values(rec, "690", filter_subfield_code="a",
                                      filter_subfield_value="INTNOTE")
    if intnote:
        val_088 = record_get_field_values(rec, "088", filter_subfield_code="a")
        for val in val_088:
            if "CMS" in val:
                url = "http://weblib.cern.ch/abstract?CERN-CMS" + val.split("CMS", 1)[-1]
                record_add_field(rec, "856", ind1="4", subfields=[("u", url)])

    # 041 Language
    languages = get_languages()
    language_fields = record_get_field_instances(rec, "041")
    record_delete_fields(rec, "041")
    for field in language_fields:
        subs = field_get_subfields(field)
        if "a" in subs:
            if "eng" in subs["a"]:
                # English entries are dropped rather than re-added.
                continue
            new_value = translate_config(subs["a"][0], languages)
            new_subs = [("a", new_value)]
            record_add_field(rec, "041", subfields=new_subs)

    # 035 Externals
    scn_035_fields = record_get_field_instances(rec, "035")
    forbidden_values = ["cercer", "inspire", "xx", "cern annual report", "cmscms", "wai01"]
    for field in scn_035_fields:
        subs = field_get_subfields(field)
        if "9" in subs:
            if "a" not in subs:
                continue
            for sub in subs["9"]:
                if sub.lower() in forbidden_values:
                    break
            else:
                # No forbidden values (We did not "break")
                suffixes = [s.lower() for s in subs["9"]]
                if "spires" in suffixes:
                    new_subs = [("a", "SPIRES-%s" % subs["a"][0])]
                    record_add_field(rec, "970", subfields=new_subs)
                    continue
        if "a" in subs:
            for sub in subs["a"]:
                if sub.lower() in forbidden_values:
                    record_delete_field(rec, tag="035",
                                        field_position_global=field[4])

    # 088 report numbers: archive references go to 595, the rest to 037
    rep_088_fields = record_get_field_instances(rec, "088")
    for field in rep_088_fields:
        subs = field_get_subfields(field)
        if "9" in subs:
            for val in subs["9"]:
                if val.startswith("P0") or val.startswith("CM-P0"):
                    sf = [("9", "CERN"), ("b", val)]
                    record_add_field(rec, "595", subfields=sf)
        for key, val in field[0]:
            if key in ["a", "9"] and not val.startswith("SIS-"):
                record_add_field(rec, "037", subfields=[("a", val)])
    record_delete_fields(rec, "088")

    # 037 Externals also...
    rep_037_fields = record_get_field_instances(rec, "037")
    for field in rep_037_fields:
        subs = field_get_subfields(field)
        if "a" in subs:
            for value in subs["a"]:
                if "arXiv" in value:
                    new_subs = [("a", value), ("9", "arXiv")]
                    # NOTE(review): 695 is not in interesting_fields, so no
                    # $c is presumably ever appended here -- confirm intent.
                    for fld in record_get_field_instances(rec, "695"):
                        for key, val in field_get_subfield_instances(fld):
                            if key == "a":
                                new_subs.append(("c", val))
                                break
                    nf = create_field(subfields=new_subs)
                    record_replace_field(rec, "037", nf, field[4])
        for key, val in field[0]:
            if key in ["a", "9"] and val.startswith("SIS-"):
                record_delete_field(rec, "037", field_position_global=field[4])

    # 242 -> 246
    for field in record_get_field_instances(rec, "242"):
        record_add_field(rec, "246", subfields=field[0])
    record_delete_fields(rec, "242")

    # 269 Date normalization
    for field in record_get_field_instances(rec, "269"):
        for idx, (key, value) in enumerate(field[0]):
            if key == "c":
                field[0][idx] = ("c", convert_date_to_iso(value))
                # A dated 269 makes the 260 imprint redundant.
                record_delete_fields(rec, "260")

    if "THESIS" not in collections:
        for field in record_get_field_instances(rec, "260"):
            record_add_field(rec, "269", subfields=field[0])
        record_delete_fields(rec, "260")

    # 300 page number
    for field in record_get_field_instances(rec, "300"):
        for idx, (key, value) in enumerate(field[0]):
            if key == "a":
                if "mult." not in value and value != " p":
                    field[0][idx] = ("a", re.sub(r"[^\d-]+", "", value))
                else:
                    record_delete_field(rec, "300",
                                        field_position_global=field[4])
                    break

    # 100 & 700 punctuate author names
    author_names = record_get_field_instances(rec, "100")
    author_names.extend(record_get_field_instances(rec, "700"))
    for field in author_names:
        subs = field_get_subfields(field)
        # Only rewrite names whose $i / $j are absent or hold the
        # 'XX' / 'YY' values.
        if "i" not in subs or "XX" in subs["i"]:
            if "j" not in subs or "YY" in subs["j"]:
                for idx, (key, value) in enumerate(field[0]):
                    if key == "a":
                        field[0][idx] = ("a", punctuate_authorname(value))

    # 700 -> 701 Thesis supervisors
    if "THESIS" in collections:
        for field in record_get_field_instances(rec, "700"):
            record_add_field(rec, "701", subfields=field[0])
        record_delete_fields(rec, "700")

    # 502 move subfields: a -> b, b -> c, c -> d
    remap_502 = {"a": "b", "b": "c", "c": "d"}
    for field in record_get_field_instances(rec, "502"):
        new_subs = [(remap_502.get(key, key), value) for key, value in field[0]]
        # Rewrite the subfield list in place (as the 269 / 300 / author
        # passes do) so the remapping reaches the record; rebinding a slot
        # of the list returned by record_get_field_instances only changes
        # the record if that list is the record's own.
        field[0][:] = new_subs

    # 650 Translate Categories
    categories = get_categories()
    category_fields = record_get_field_instances(rec, "650", ind1="1", ind2="7")
    record_delete_fields(rec, "650")
    for field in category_fields:
        for key, value in field[0]:
            if key == "a":
                new_value = translate_config(value, categories)
                if new_value != value:
                    new_subs = [("2", "INSPIRE"), ("a", new_value)]
                else:
                    new_subs = [("2", "SzGeCERN"), ("a", value)]
                record_add_field(rec, "650", ind1="1", ind2="7",
                                 subfields=new_subs)
                # Only the first $a of each field is used.
                break

    # 653 Free Keywords
    for field in record_get_field_instances(rec, "653", ind1="1"):
        subs = field_get_subfields(field)
        new_subs = []
        if "a" in subs:
            for val in subs["a"]:
                new_subs.extend([("9", "author"), ("a", val)])
        new_field = create_field(subfields=new_subs, ind1="1")
        record_replace_field(rec, "653", new_field,
                             field_position_global=field[4])

    experiments = get_experiments()
    # 693 Remove if 'not applicable'
    for field in record_get_field_instances(rec, "693"):
        subs = field_get_subfields(field)
        all_subs = subs.get("a", []) + subs.get("e", [])
        if "not applicable" in [x.lower() for x in all_subs]:
            record_delete_field(rec, "693", field_position_global=field[4])
            # Stop here: falling through would rebuild and re-add the
            # field that was just removed.
            continue
        new_subs = []
        experiment_a = ""
        experiment_e = ""
        if "a" in subs:
            experiment_a = subs["a"][0]
            new_subs.append(("a", experiment_a))
        if "e" in subs:
            experiment_e = subs["e"][0]
        experiment = "%s---%s" % (experiment_a.replace(" ", "-"), experiment_e)
        translated_experiments = translate_config(experiment, experiments)
        new_subs.append(("e", translated_experiments))
        record_delete_field(rec, tag="693", field_position_global=field[4])
        record_add_field(rec, "693", subfields=new_subs)

    # 710 Collaboration
    for field in record_get_field_instances(rec, "710"):
        subs = field_get_subfield_instances(field)
        # Filter in place. Popping by an index taken from a copy goes wrong
        # as soon as one element has been removed, because the remaining
        # indices no longer line up.
        subs[:] = [(key, value) for key, value in subs
                   if key != "5" and not value.startswith("CERN. Geneva")]
        if not subs:
            record_delete_field(rec, "710", field_position_global=field[4])

    # 773 journal translations
    journals = get_journals()
    for field in record_get_field_instances(rec, "773"):
        subs = field_get_subfield_instances(field)
        new_subs = []
        for key, value in subs:
            if key == "p":
                new_subs.append((key, translate_config(value, journals)))
            else:
                new_subs.append((key, value))
        record_delete_field(rec, tag="773", field_position_global=field[4])
        record_add_field(rec, "773", subfields=new_subs)

    # FFT (856) Dealing with graphs
    figure_counter = 0
    for field in record_get_field_instances(rec, "856", ind1="4"):
        subs = field_get_subfields(field)
        newsubs = []
        remove = False

        if "z" in subs:
            is_figure = [s for s in subs["z"] if "figure" in s.lower()]
            if is_figure and "u" in subs:
                is_subformat = [s for s in subs["u"] if "subformat" in s.lower()]
                if not is_subformat:
                    url = subs["u"][0]
                    if url.endswith(".pdf"):
                        # We try to convert
                        fd, local_url = mkstemp(suffix=os.path.basename(url),
                                                dir=CFG_TMPSHAREDDIR)
                        os.close(fd)
                        _print("Downloading %s into %s" % (url, local_url), verbose=5)
                        plotfile = ""
                        try:
                            plotfile = download_url(url=url,
                                                    download_to_file=local_url,
                                                    timeout=30.0)
                        except InvenioFileDownloadError:
                            _print("Download failed while attempting to reach %s. Skipping.." % (url,))
                            remove = True
                        if plotfile:
                            converted = convert_images([plotfile])
                            if converted:
                                url = converted.pop()
                                _print("Successfully converted %s to %s" % (local_url, url),
                                       verbose=5)
                            else:
                                _print("Conversion failed on %s" % (local_url,))
                                url = None
                                remove = True
                    if url:
                        newsubs.append(("a", url))
                        newsubs.append(("t", "Plot"))
                        figure_counter += 1
                        if "y" in subs:
                            newsubs.append(("d", "%05d %s" % (figure_counter, subs["y"][0])))
                            newsubs.append(("n", subs["y"][0]))
                        else:
                            # Get basename without extension.
                            name = os.path.basename(os.path.splitext(subs["u"][0])[0])
                            newsubs.append(("d", "%05d %s" % (figure_counter, name)))
                            newsubs.append(("n", name))

        if not newsubs and "u" in subs:
            is_fulltext = [s for s in subs["u"] if ".pdf" in s]
            if is_fulltext:
                newsubs = [("t", "INSPIRE-PUBLIC"), ("a", subs["u"][0])]

        if not newsubs and "u" in subs:
            remove = True
            is_zipfile = [s for s in subs["u"] if ".zip" in s]
            if is_zipfile:
                url = is_zipfile[0]
                local_url = os.path.join(CFG_TMPSHAREDDIR, os.path.basename(url))
                _print("Downloading %s into %s" % (url, local_url), verbose=5)
                zipped_archive = ""
                try:
                    zipped_archive = download_url(url=is_zipfile[0],
                                                  download_to_file=local_url,
                                                  timeout=30.0)
                except InvenioFileDownloadError:
                    _print("Download failed while attempting to reach %s. Skipping.."
                           % (is_zipfile[0],))
                    remove = True
                if zipped_archive:
                    unzipped_archive = unzip(zipped_archive)
                    list_of_pngs = locate("*.png", unzipped_archive)
                    for png in list_of_pngs:
                        if "_vti_" in png or "__MACOSX" in png:
                            continue
                        figure_counter += 1
                        plotsubs = []
                        plotsubs.append(("a", png))
                        caption = "%05d %s" % (figure_counter, os.path.basename(png))
                        plotsubs.append(("d", caption))
                        plotsubs.append(("t", "Plot"))
                        record_add_field(rec, "FFT", subfields=plotsubs)

        # NOTE(review): the previous block sets `remove` whenever
        # `not newsubs and "u" in subs` holds, so this condition looks
        # unreachable -- confirm before relying on it.
        if not remove and not newsubs and "u" in subs:
            urls = (
                "http://cdsweb.cern.ch",
                "http://cms.cern.ch",
                "http://cmsdoc.cern.ch",
                "http://documents.cern.ch",
                "http://preprints.cern.ch",
                "http://cds.cern.ch",
            )
            for val in subs["u"]:
                if any(url in val for url in urls):
                    remove = True
                    break
                if val.endswith("ps.gz"):
                    remove = True

        if newsubs:
            record_add_field(rec, "FFT", subfields=newsubs)
            remove = True

        if remove:
            record_delete_field(rec, "856", ind1="4",
                                field_position_global=field[4])

    # 500 - Preliminary results
    if "THESIS" not in collections:
        subs = [("a", "Preliminary results")]
        record_add_field(rec, "500", subfields=subs)

    for collection in collections:
        record_add_field(rec, "980", subfields=[("a", collection)])

    return rec
def apply_filter(rec):
    """
    Filters a Zenodo record to be compatible within Inspire.

    The record is modified in place and the same object is returned.
    Steps, in order:
     * copy the 001 value into 035 (source "Zenodo") unless a 980__a
       value equals "hidden" (case-insensitive)
     * strip control fields and every tag not listed in
       ``interesting_fields``
     * rewrite 520 $a as 520 $9 Zenodo / $h
     * punctuate author names in 100 and 700
     * resolve 773 "isSupplementTo" links (DOI / arXiv) into 786 $w
     * add the mandatory 786 $q and 980 $a DATA fields

    Parameters:
     * rec - dictionary: BibRecord structure

    Returns: dictionary, BibRecord structure
    """
    # Move recid from 001 to 035 if not hidden
    zenodo_id = rec["001"][0][3]
    collection_values = record_get_field_values(rec, "980", code="a")
    if "hidden" not in [value.lower() for value in collection_values]:
        record_add_field(rec, "035",
                         subfields=[("9", "Zenodo"), ("a", zenodo_id)])

    # Clear control fields
    record_strip_controlfields(rec)

    # Clear other uninteresting fields
    interesting_fields = ["024", "035", "100", "245", "260",
                          "700", "710", "773", "856", "520", "500"]
    # Loop over a snapshot of the tags: fields are removed from the record
    # while looping, which must not disturb the iteration.
    for tag in list(rec.keys()):
        if tag not in interesting_fields:
            record_delete_fields(rec, tag)

    # 520 descriptions: keep only $a, re-labelled as $h with source Zenodo
    descriptions = record_get_field_instances(rec, "520")
    record_delete_fields(rec, "520")
    for desc in descriptions:
        subs = field_get_subfields(desc)
        if "a" in subs:
            record_add_field(rec, "520",
                             subfields=[("9", "Zenodo"), ("h", subs["a"][0])])

    # 100 & 700 punctuate author names
    author_names = record_get_field_instances(rec, "100")
    author_names.extend(record_get_field_instances(rec, "700"))
    for field in author_names:
        subs = field_get_subfields(field)
        # Only rewrite names whose $i / $j are absent or hold the
        # "XX" / "YY" values.
        if "i" not in subs or "XX" in subs["i"]:
            if "j" not in subs or "YY" in subs["j"]:
                for idx, (key, value) in enumerate(field[0]):
                    if key == "a":
                        field[0][idx] = ("a", punctuate_authorname(value))

    # 773 is cited by, DOI of the extended paper
    # match the INSPIRE record ID of that paper and add it in 786__w
    for field in record_get_field_instances(rec, "773"):
        subs = field_get_subfields(field)
        if "i" not in subs or "isSupplementTo" not in subs["i"]:
            continue
        # Without an identifier ($a) or a scheme ($n) there is nothing to
        # search for; skip instead of failing on the missing subfield.
        if "a" not in subs or "n" not in subs:
            continue
        identifier = subs["a"][0]
        schemes = [scheme.lower() for scheme in subs["n"]]
        if "doi" in schemes:
            paper_recid = perform_request_search(p="0247_a:%s" % identifier, of="id")
            if paper_recid:
                record_add_field(rec, "786",
                                 subfields=[("w", str(paper_recid[0]))])
        if "arxiv" in schemes:
            paper_recid = perform_request_search(p="037__a:%s" % identifier, of="id")
            if paper_recid:
                record_add_field(rec, "786",
                                 subfields=[("w", str(paper_recid[0]))])

    # Other mandatory fields
    # 786 formatting
    record_add_field(rec, "786", subfields=[("q", "0")])

    # 980 only DATA Collection
    record_add_field(rec, "980", subfields=[("a", "DATA")])

    return rec