def test_unzip(self):
    """
    Test uncompressing function.

    Three things are checked:
     * when a target directory is passed as second argument, the returned
       folder path starts with that directory;
     * when no target directory is passed, the returned folder exists;
     * the number of entries found in the extracted folder equals the
       number of entries listed in the archive.
    """
    test_zipped_file = "./test/test.zip"
    desired_dir = os.path.join(CFG_TMPSHAREDDIR, "apsharvest")

    # Extraction into an explicitly requested directory.
    unzipped_folder = unzip(test_zipped_file, desired_dir)
    self.assertTrue(
        unzipped_folder.startswith(desired_dir),
        "Unzipped folder is located in the wrong place %s!" % unzipped_folder,
    )

    # Extraction without a target directory.
    unzipped_folder = unzip(test_zipped_file)
    self.assertTrue(os.path.exists(unzipped_folder),
                    "Unzipped folder does not exist!")

    # Read the archive listing and release the file handle right away
    # (try/finally rather than "with" so this also runs on interpreters
    # where ZipFile is not a context manager).
    z = zipfile.ZipFile(test_zipped_file)
    try:
        list_of_zipfiles = z.namelist()
    finally:
        z.close()

    found_list_of_files = get_files_and_folders(unzipped_folder)
    self.assertEqual(len(list_of_zipfiles), len(found_list_of_files),
                     "Looks like all files were not extracted!")
except StandardError, e: if 'urlopen' in str(e) or 'URL could not be opened' in str(e): msg = "URL could not be opened: %s" % (url,) write_message("Error: %s" % (msg,), stream=sys.stderr) write_message("No fulltext found for %s" % (record.recid or record.doi,)) yield record, msg continue raise finally: request_end = time.time() # Unzip the compressed file unzipped_folder = unzip(result_file, base_directory=self.out_folder) # Validate the checksum of the compressed fulltext file. try: checksum_validated_files = find_and_validate_md5_checksums( in_folder=unzipped_folder, md5key_filename=CFG_APSHARVEST_MD5_FILE) except APSFileChecksumError, e: info_msg = "Skipping %s in %s" % \ (record.recid or record.doi, unzipped_folder) msg = "Error while validating checksum: %s\n%s\n%s" % \ (info_msg, str(e), traceback.format_exc()[:-1]) write_message(msg) yield record, msg continue if not checksum_validated_files:
def apply_filter(rec):
    """
    Filters the record to be compatible within Inspire

    The record is rewritten in place, one MARC tag at a time (the tag being
    handled is named in the comment heading each section), and the very same
    object is returned. Processing 856 fields may download remote files
    into CFG_TMPSHAREDDIR.

    Parameters:
     * rec - dictionary: BibRecord structure

    Returns: dictionary, BibRecord structure
    """
    # Move recid from 001 to 035 if not hidden
    cds_id = rec['001'][0][3]
    if 'hidden' not in [x.lower() for x in
                        record_get_field_values(rec, "980", code="a")]:
        record_add_field(rec, '035', subfields=[('9', 'CDS'), ('a', cds_id)])

    # Clear control fields
    record_strip_controlfields(rec)

    # Clear other uninteresting fields
    interesting_fields = ["024", "041", "035", "037", "088", "100",
                          "110", "111", "242", "245", "246", "260",
                          "269", "300", "502", "650", "653", "693",
                          "700", "710", "773", "856", "520", "500",
                          "980"]
    # Loop over a snapshot of the tags: the record is modified in the loop.
    for tag in list(rec.keys()):
        if tag not in interesting_fields:
            record_delete_fields(rec, tag)

    # 980 Determine Collections
    collections = set([])
    for value in record_get_field_values(rec, '980', code='a'):
        upper_value = value.upper()
        if 'NOTE' in upper_value:
            collections.add('NOTE')
        if 'THESIS' in upper_value:
            collections.add('THESIS')
        if 'CONFERENCEPAPER' in upper_value:
            collections.add('ConferencePaper')

    if is_published(rec):
        collections.add("PUBLISHED")
        collections.add("CITEABLE")

    if 'NOTE' not in collections:
        # A report number (088 $a) starting with one of these prefixes
        # also marks the record as a NOTE.
        # TODO: Move this to a KB
        kb = ['ATLAS-CONF-', 'CMS-PAS-', 'ATL-', 'CMS-DP-',
              'ALICE-INT-', 'LHCb-PUB-']
        values = record_get_field_values(rec, "088", code='a')
        for val, rep in product(values, kb):
            if val.startswith(rep):
                collections.add('NOTE')
                break

    # 980 Arxiv tag
    if record_get_field_values(rec, '035',
                               filter_subfield_code="a",
                               filter_subfield_value="arXiv"):
        collections.add("arXiv")

    # 980 HEP && CORE
    collections.add('HEP')
    collections.add('CORE')

    # 980 Conference Note
    if 'ConferencePaper' not in collections:
        for value in record_get_field_values(rec, '962', code='n'):
            if value[-2:].isdigit():
                collections.add('ConferencePaper')
                break

    # The collected values are written back as fresh 980 fields at the end.
    record_delete_fields(rec, "980")

    # Internal notes: add a weblib link for every CMS report number.
    intnote = record_get_field_values(rec, '690',
                                      filter_subfield_code="a",
                                      filter_subfield_value='INTNOTE')
    if intnote:
        val_088 = record_get_field_values(rec, '088',
                                          filter_subfield_code="a")
        for val in val_088:
            if 'CMS' in val:
                url = ('http://weblib.cern.ch/abstract?CERN-CMS' +
                       val.split('CMS', 1)[-1])
                record_add_field(rec, '856', ind1='4',
                                 subfields=[('u', url)])

    # 041 Language
    languages = get_languages()
    language_fields = record_get_field_instances(rec, '041')
    record_delete_fields(rec, "041")
    for field in language_fields:
        subs = field_get_subfields(field)
        if 'a' in subs:
            if "eng" in subs['a']:
                # "eng" entries are dropped, not re-added.
                continue
            new_value = translate_config(subs['a'][0], languages)
            new_subs = [('a', new_value)]
            record_add_field(rec, "041", subfields=new_subs)

    # 035 Externals
    scn_035_fields = record_get_field_instances(rec, '035')
    forbidden_values = ["cercer", "inspire", "xx", "cern annual report",
                        "cmscms", "wai01"]
    for field in scn_035_fields:
        subs = field_get_subfields(field)
        if '9' in subs:
            if 'a' not in subs:
                continue
            for sub in subs['9']:
                if sub.lower() in forbidden_values:
                    break
            else:
                # No forbidden values (We did not "break")
                suffixes = [s.lower() for s in subs['9']]
                if 'spires' in suffixes:
                    new_subs = [('a', 'SPIRES-%s' % subs['a'][0])]
                    record_add_field(rec, '970', subfields=new_subs)
                    continue
        if 'a' in subs:
            for sub in subs['a']:
                if sub.lower() in forbidden_values:
                    record_delete_field(rec, tag="035",
                                        field_position_global=field[4])

    # 088 report numbers: P0/CM-P0 values go to 595, everything except
    # SIS- values is copied to 037, then 088 is dropped.
    rep_088_fields = record_get_field_instances(rec, '088')
    for field in rep_088_fields:
        subs = field_get_subfields(field)
        if '9' in subs:
            for val in subs['9']:
                if val.startswith('P0') or val.startswith('CM-P0'):
                    sf = [('9', 'CERN'), ('b', val)]
                    record_add_field(rec, '595', subfields=sf)
        for key, val in field[0]:
            if key in ['a', '9'] and not val.startswith('SIS-'):
                record_add_field(rec, '037', subfields=[('a', val)])
    record_delete_fields(rec, "088")

    # 037 Externals also...
    rep_037_fields = record_get_field_instances(rec, '037')
    for field in rep_037_fields:
        subs = field_get_subfields(field)
        if 'a' in subs:
            for value in subs['a']:
                if 'arXiv' in value:
                    new_subs = [('a', value), ('9', 'arXiv')]
                    # Only the first $a of each 695 field is taken over.
                    for fld in record_get_field_instances(rec, '695'):
                        for key, val in field_get_subfield_instances(fld):
                            if key == 'a':
                                new_subs.append(('c', val))
                                break
                    nf = create_field(subfields=new_subs)
                    record_replace_field(rec, '037', nf, field[4])
        for key, val in field[0]:
            if key in ['a', '9'] and val.startswith('SIS-'):
                record_delete_field(rec, '037',
                                    field_position_global=field[4])

    # 242 -> 246
    for field in record_get_field_instances(rec, '242'):
        record_add_field(rec, '246', subfields=field[0])
    record_delete_fields(rec, '242')

    # 269 Date normalization
    for field in record_get_field_instances(rec, '269'):
        for idx, (key, value) in enumerate(field[0]):
            if key == "c":
                field[0][idx] = ("c", convert_date_to_iso(value))
                # A 269 $c is present, so 260 is dropped rather than
                # copied into 269 below.
                record_delete_fields(rec, "260")

    if 'THESIS' not in collections:
        for field in record_get_field_instances(rec, '260'):
            record_add_field(rec, '269', subfields=field[0])
        record_delete_fields(rec, '260')

    # 300 page number
    for field in record_get_field_instances(rec, '300'):
        for idx, (key, value) in enumerate(field[0]):
            if key == 'a':
                if "mult." not in value and value != " p":
                    # Keep digits and dashes only.
                    field[0][idx] = ('a', re.sub(r'[^\d-]+', '', value))
                else:
                    record_delete_field(rec, '300',
                                        field_position_global=field[4])
                    break

    # 100 & 700 punctuate author names
    author_names = record_get_field_instances(rec, '100')
    author_names.extend(record_get_field_instances(rec, '700'))
    for field in author_names:
        subs = field_get_subfields(field)
        if 'i' not in subs or 'XX' in subs['i']:
            if 'j' not in subs or 'YY' in subs['j']:
                for idx, (key, value) in enumerate(field[0]):
                    if key == 'a':
                        field[0][idx] = ('a', punctuate_authorname(value))

    # 700 -> 701 Thesis supervisors
    if 'THESIS' in collections:
        for field in record_get_field_instances(rec, '700'):
            record_add_field(rec, '701', subfields=field[0])
        record_delete_fields(rec, '700')

    # 502 move subfields (a -> b, b -> c, c -> d)
    # NOTE(review): the swapped field is only stored in the local list
    # below, which is not used afterwards - confirm whether the result
    # was meant to be written back into the record.
    fields_501 = record_get_field_instances(rec, '502')
    for idx, field in enumerate(fields_501):
        new_subs = []
        for key, value in field[0]:
            if key == 'a':
                new_subs.append(('b', value))
            elif key == 'b':
                new_subs.append(('c', value))
            elif key == 'c':
                new_subs.append(('d', value))
            else:
                new_subs.append((key, value))
        fields_501[idx] = field_swap_subfields(field, new_subs)

    # 650 Translate Categories
    categories = get_categories()
    category_fields = record_get_field_instances(rec, '650',
                                                 ind1='1', ind2='7')
    record_delete_fields(rec, "650")
    for field in category_fields:
        for idx, (key, value) in enumerate(field[0]):
            if key == 'a':
                new_value = translate_config(value, categories)
                if new_value != value:
                    new_subs = [('2', 'INSPIRE'), ('a', new_value)]
                else:
                    new_subs = [('2', 'SzGeCERN'), ('a', value)]
                record_add_field(rec, "650", ind1="1", ind2="7",
                                 subfields=new_subs)
                # Only the first $a of each field is used.
                break

    # 653 Free Keywords
    for field in record_get_field_instances(rec, '653', ind1='1'):
        subs = field_get_subfields(field)
        new_subs = []
        if 'a' in subs:
            for val in subs['a']:
                new_subs.extend([('9', 'author'), ('a', val)])
        new_field = create_field(subfields=new_subs, ind1='1')
        record_replace_field(rec, '653', new_field,
                             field_position_global=field[4])

    experiments = get_experiments()
    # 693 Remove if 'not applicable'
    for field in record_get_field_instances(rec, '693'):
        subs = field_get_subfields(field)
        all_subs = subs.get('a', []) + subs.get('e', [])
        if 'not applicable' in [x.lower() for x in all_subs]:
            record_delete_field(rec, '693',
                                field_position_global=field[4])
            # Skip the translation below, otherwise the field that was
            # just removed would be rebuilt and added again.
            continue
        new_subs = []
        experiment_a = ""
        experiment_e = ""
        for (key, value) in subs.items():
            if key == 'a':
                experiment_a = value[0]
                new_subs.append((key, value[0]))
            elif key == 'e':
                experiment_e = value[0]
        experiment = "%s---%s" % (experiment_a.replace(" ", "-"),
                                  experiment_e)
        translated_experiments = translate_config(experiment, experiments)
        new_subs.append(("e", translated_experiments))
        record_delete_field(rec, tag="693",
                            field_position_global=field[4])
        record_add_field(rec, "693", subfields=new_subs)

    # 710 Collaboration
    for field in record_get_field_instances(rec, '710'):
        subs = field_get_subfield_instances(field)
        # Drop $5 subfields and 'CERN. Geneva' values. The list is
        # filtered in place (slice assignment) instead of popping by
        # index while enumerating a copy: after the first pop those
        # indices no longer match and the wrong entry is removed or an
        # IndexError is raised.
        subs[:] = [(key, value) for (key, value) in subs
                   if key != '5' and not value.startswith('CERN. Geneva')]
        if not subs:
            record_delete_field(rec, '710',
                                field_position_global=field[4])

    # 773 journal translations
    journals = get_journals()
    for field in record_get_field_instances(rec, '773'):
        subs = field_get_subfield_instances(field)
        new_subs = []
        for key, value in subs:
            if key == 'p':
                new_subs.append((key, translate_config(value, journals)))
            else:
                new_subs.append((key, value))
        record_delete_field(rec, tag="773",
                            field_position_global=field[4])
        record_add_field(rec, "773", subfields=new_subs)

    # FFT (856) Dealing with graphs
    figure_counter = 0
    for field in record_get_field_instances(rec, '856', ind1='4'):
        subs = field_get_subfields(field)

        newsubs = []       # subfields of the FFT field replacing this 856
        remove = False     # whether this 856 field is deleted at the end

        if 'z' in subs:
            is_figure = [s for s in subs['z'] if "figure" in s.lower()]
            if is_figure and 'u' in subs:
                is_subformat = [s for s in subs['u']
                                if "subformat" in s.lower()]
                if not is_subformat:
                    url = subs['u'][0]
                    if url.endswith(".pdf"):
                        # We try to convert
                        fd, local_url = mkstemp(suffix=os.path.basename(url),
                                                dir=CFG_TMPSHAREDDIR)
                        # Only the path is needed, not the open descriptor.
                        os.close(fd)
                        _print("Downloading %s into %s" % (url, local_url),
                               verbose=5)
                        plotfile = ""
                        try:
                            plotfile = download_url(url=url,
                                                    download_to_file=local_url,
                                                    timeout=30.0)
                        except InvenioFileDownloadError:
                            _print("Download failed while attempting to reach %s. Skipping.." % (url,))
                            remove = True
                        if plotfile:
                            converted = convert_images([plotfile])
                            if converted:
                                url = converted.pop()
                                _print("Successfully converted %s to %s" % (local_url, url),
                                       verbose=5)
                            else:
                                _print("Conversion failed on %s" % (local_url,))
                                url = None
                                remove = True
                    if url:
                        newsubs.append(('a', url))
                        newsubs.append(('t', 'Plot'))
                        figure_counter += 1
                        if 'y' in subs:
                            newsubs.append(('d', "%05d %s" % (figure_counter, subs['y'][0])))
                            newsubs.append(('n', subs['y'][0]))
                        else:
                            # Get basename without extension.
                            name = os.path.basename(os.path.splitext(subs['u'][0])[0])
                            newsubs.append(('d', "%05d %s" % (figure_counter, name)))
                            newsubs.append(('n', name))

        if not newsubs and 'u' in subs:
            is_fulltext = [s for s in subs['u']
                           if ".pdf" in s and "subformat=pdfa" not in s]
            if is_fulltext:
                newsubs = [('t', 'INSPIRE-PUBLIC'), ('a', subs['u'][0])]

        if not newsubs and 'u' in subs:
            remove = True
            is_zipfile = [s for s in subs['u'] if ".zip" in s]
            if is_zipfile:
                url = is_zipfile[0]
                local_url = os.path.join(CFG_TMPSHAREDDIR,
                                         os.path.basename(url))
                _print("Downloading %s into %s" % (url, local_url),
                       verbose=5)
                zipped_archive = ""
                try:
                    zipped_archive = download_url(url=is_zipfile[0],
                                                  download_to_file=local_url,
                                                  timeout=30.0)
                except InvenioFileDownloadError:
                    _print("Download failed while attempting to reach %s. Skipping.." % (is_zipfile[0],))
                    remove = True
                if zipped_archive:
                    unzipped_archive = unzip(zipped_archive)
                    list_of_pngs = locate("*.png", unzipped_archive)
                    for png in list_of_pngs:
                        if "_vti_" in png or "__MACOSX" in png:
                            continue
                        figure_counter += 1
                        plotsubs = []
                        plotsubs.append(('a', png))
                        caption = '%05d %s' % (figure_counter,
                                               os.path.basename(png))
                        plotsubs.append(('d', caption))
                        plotsubs.append(('t', 'Plot'))
                        record_add_field(rec, 'FFT', subfields=plotsubs)

        if not remove and not newsubs and 'u' in subs:
            urls = ('http://cdsweb.cern.ch', 'http://cms.cern.ch',
                    'http://cmsdoc.cern.ch', 'http://documents.cern.ch',
                    'http://preprints.cern.ch', 'http://cds.cern.ch')
            for val in subs['u']:
                if any(url in val for url in urls):
                    remove = True
                    break
                if val.endswith('ps.gz'):
                    remove = True

        if newsubs:
            record_add_field(rec, 'FFT', subfields=newsubs)
            remove = True

        if remove:
            record_delete_field(rec, '856', ind1='4',
                                field_position_global=field[4])

    # 500 - Preliminary results
    if "THESIS" not in collections:
        subs = [('a', "Preliminary results")]
        record_add_field(rec, "500", subfields=subs)

    # Write the collections gathered above back as 980 fields.
    for collection in collections:
        record_add_field(rec, '980', subfields=[('a', collection)])

    return rec
def apply_filter(rec):
    """
    Filters the record to be compatible within Inspire

    Each section below handles one MARC tag (named in the section comment).
    The record passed in is modified in place and returned; handling of 856
    fields may download remote files into CFG_TMPSHAREDDIR.

    Parameters:
     * rec - dictionary: BibRecord structure

    Returns: dictionary, BibRecord structure
    """
    # Move recid from 001 to 035 if not hidden
    cds_id = rec["001"][0][3]
    visibility = [x.lower() for x in record_get_field_values(rec, "980", code="a")]
    if "hidden" not in visibility:
        record_add_field(rec, "035", subfields=[("9", "CDS"), ("a", cds_id)])

    # Clear control fields
    record_strip_controlfields(rec)

    # Clear other uninteresting fields
    interesting_fields = [
        "024",
        "041",
        "035",
        "037",
        "088",
        "100",
        "110",
        "111",
        "242",
        "245",
        "246",
        "260",
        "269",
        "300",
        "502",
        "650",
        "653",
        "693",
        "700",
        "710",
        "773",
        "856",
        "520",
        "500",
        "980",
    ]
    # The record changes inside the loop, so walk a snapshot of its tags.
    for tag in list(rec.keys()):
        if tag not in interesting_fields:
            record_delete_fields(rec, tag)

    # 980 Determine Collections
    collections = set([])
    for value in record_get_field_values(rec, "980", code="a"):
        upper_value = value.upper()
        if "NOTE" in upper_value:
            collections.add("NOTE")
        if "THESIS" in upper_value:
            collections.add("THESIS")
        if "CONFERENCEPAPER" in upper_value:
            collections.add("ConferencePaper")

    if is_published(rec):
        collections.add("PUBLISHED")
        collections.add("CITEABLE")

    if "NOTE" not in collections:
        # Report numbers (088 $a) with one of these prefixes also make
        # the record a NOTE.
        # TODO: Move this to a KB
        kb = ["ATLAS-CONF-", "CMS-PAS-", "ATL-", "CMS-DP-", "ALICE-INT-", "LHCb-PUB-"]
        values = record_get_field_values(rec, "088", code="a")
        for val, rep in product(values, kb):
            if val.startswith(rep):
                collections.add("NOTE")
                break

    # 980 Arxiv tag
    if record_get_field_values(rec, "035", filter_subfield_code="a", filter_subfield_value="arXiv"):
        collections.add("arXiv")

    # 980 HEP && CORE
    collections.add("HEP")
    collections.add("CORE")

    # 980 Conference Note
    if "ConferencePaper" not in collections:
        for value in record_get_field_values(rec, "962", code="n"):
            if value[-2:].isdigit():
                collections.add("ConferencePaper")
                break

    # Old 980 fields go away; the set built above is written out at the end.
    record_delete_fields(rec, "980")

    # Internal notes get a weblib link per CMS report number.
    intnote = record_get_field_values(rec, "690", filter_subfield_code="a", filter_subfield_value="INTNOTE")
    if intnote:
        val_088 = record_get_field_values(rec, "088", filter_subfield_code="a")
        for val in val_088:
            if "CMS" in val:
                url = "http://weblib.cern.ch/abstract?CERN-CMS" + val.split("CMS", 1)[-1]
                record_add_field(rec, "856", ind1="4", subfields=[("u", url)])

    # 041 Language
    languages = get_languages()
    language_fields = record_get_field_instances(rec, "041")
    record_delete_fields(rec, "041")
    for field in language_fields:
        subs = field_get_subfields(field)
        if "a" in subs:
            if "eng" in subs["a"]:
                # Fields containing "eng" are not re-added.
                continue
            new_value = translate_config(subs["a"][0], languages)
            new_subs = [("a", new_value)]
            record_add_field(rec, "041", subfields=new_subs)

    # 035 Externals
    scn_035_fields = record_get_field_instances(rec, "035")
    forbidden_values = ["cercer", "inspire", "xx", "cern annual report", "cmscms", "wai01"]
    for field in scn_035_fields:
        subs = field_get_subfields(field)
        if "9" in subs:
            if "a" not in subs:
                continue
            for sub in subs["9"]:
                if sub.lower() in forbidden_values:
                    break
            else:
                # No forbidden values (We did not "break")
                suffixes = [s.lower() for s in subs["9"]]
                if "spires" in suffixes:
                    new_subs = [("a", "SPIRES-%s" % subs["a"][0])]
                    record_add_field(rec, "970", subfields=new_subs)
                    continue
        if "a" in subs:
            for sub in subs["a"]:
                if sub.lower() in forbidden_values:
                    record_delete_field(rec, tag="035", field_position_global=field[4])

    # 088 report numbers: P0/CM-P0 values are recorded in 595, all values
    # but SIS- ones are copied to 037, and 088 itself is removed.
    rep_088_fields = record_get_field_instances(rec, "088")
    for field in rep_088_fields:
        subs = field_get_subfields(field)
        if "9" in subs:
            for val in subs["9"]:
                if val.startswith("P0") or val.startswith("CM-P0"):
                    sf = [("9", "CERN"), ("b", val)]
                    record_add_field(rec, "595", subfields=sf)
        for key, val in field[0]:
            if key in ["a", "9"] and not val.startswith("SIS-"):
                record_add_field(rec, "037", subfields=[("a", val)])
    record_delete_fields(rec, "088")

    # 037 Externals also...
    rep_037_fields = record_get_field_instances(rec, "037")
    for field in rep_037_fields:
        subs = field_get_subfields(field)
        if "a" in subs:
            for value in subs["a"]:
                if "arXiv" in value:
                    new_subs = [("a", value), ("9", "arXiv")]
                    # Take the first $a of every 695 field as $c.
                    for fld in record_get_field_instances(rec, "695"):
                        for key, val in field_get_subfield_instances(fld):
                            if key == "a":
                                new_subs.append(("c", val))
                                break
                    nf = create_field(subfields=new_subs)
                    record_replace_field(rec, "037", nf, field[4])
        for key, val in field[0]:
            if key in ["a", "9"] and val.startswith("SIS-"):
                record_delete_field(rec, "037", field_position_global=field[4])

    # 242 -> 246
    for field in record_get_field_instances(rec, "242"):
        record_add_field(rec, "246", subfields=field[0])
    record_delete_fields(rec, "242")

    # 269 Date normalization
    for field in record_get_field_instances(rec, "269"):
        for idx, (key, value) in enumerate(field[0]):
            if key == "c":
                field[0][idx] = ("c", convert_date_to_iso(value))
                # With a 269 $c available, 260 is dropped instead of
                # being copied into 269 below.
                record_delete_fields(rec, "260")

    if "THESIS" not in collections:
        for field in record_get_field_instances(rec, "260"):
            record_add_field(rec, "269", subfields=field[0])
        record_delete_fields(rec, "260")

    # 300 page number
    for field in record_get_field_instances(rec, "300"):
        for idx, (key, value) in enumerate(field[0]):
            if key == "a":
                if "mult." not in value and value != " p":
                    # Strip everything that is not a digit or a dash.
                    field[0][idx] = ("a", re.sub(r"[^\d-]+", "", value))
                else:
                    record_delete_field(rec, "300", field_position_global=field[4])
                    break

    # 100 & 700 punctuate author names
    author_names = record_get_field_instances(rec, "100")
    author_names.extend(record_get_field_instances(rec, "700"))
    for field in author_names:
        subs = field_get_subfields(field)
        if "i" not in subs or "XX" in subs["i"]:
            if "j" not in subs or "YY" in subs["j"]:
                for idx, (key, value) in enumerate(field[0]):
                    if key == "a":
                        field[0][idx] = ("a", punctuate_authorname(value))

    # 700 -> 701 Thesis supervisors
    if "THESIS" in collections:
        for field in record_get_field_instances(rec, "700"):
            record_add_field(rec, "701", subfields=field[0])
        record_delete_fields(rec, "700")

    # 502 move subfields (a -> b, b -> c, c -> d)
    # NOTE(review): the result is assigned only into the local list, which
    # is not read again - confirm whether it should be written back into
    # the record.
    fields_501 = record_get_field_instances(rec, "502")
    for idx, field in enumerate(fields_501):
        new_subs = []
        for key, value in field[0]:
            if key == "a":
                new_subs.append(("b", value))
            elif key == "b":
                new_subs.append(("c", value))
            elif key == "c":
                new_subs.append(("d", value))
            else:
                new_subs.append((key, value))
        fields_501[idx] = field_swap_subfields(field, new_subs)

    # 650 Translate Categories
    categories = get_categories()
    category_fields = record_get_field_instances(rec, "650", ind1="1", ind2="7")
    record_delete_fields(rec, "650")
    for field in category_fields:
        for idx, (key, value) in enumerate(field[0]):
            if key == "a":
                new_value = translate_config(value, categories)
                if new_value != value:
                    new_subs = [("2", "INSPIRE"), ("a", new_value)]
                else:
                    new_subs = [("2", "SzGeCERN"), ("a", value)]
                record_add_field(rec, "650", ind1="1", ind2="7", subfields=new_subs)
                # Later $a subfields of the same field are ignored.
                break

    # 653 Free Keywords
    for field in record_get_field_instances(rec, "653", ind1="1"):
        subs = field_get_subfields(field)
        new_subs = []
        if "a" in subs:
            for val in subs["a"]:
                new_subs.extend([("9", "author"), ("a", val)])
        new_field = create_field(subfields=new_subs, ind1="1")
        record_replace_field(rec, "653", new_field, field_position_global=field[4])

    experiments = get_experiments()
    # 693 Remove if 'not applicable'
    for field in record_get_field_instances(rec, "693"):
        subs = field_get_subfields(field)
        all_subs = subs.get("a", []) + subs.get("e", [])
        if "not applicable" in [x.lower() for x in all_subs]:
            record_delete_field(rec, "693", field_position_global=field[4])
            # Without this the code below would rebuild the field and add
            # it to the record again.
            continue
        new_subs = []
        experiment_a = ""
        experiment_e = ""
        for (key, value) in subs.items():
            if key == "a":
                experiment_a = value[0]
                new_subs.append((key, value[0]))
            elif key == "e":
                experiment_e = value[0]
        experiment = "%s---%s" % (experiment_a.replace(" ", "-"), experiment_e)
        translated_experiments = translate_config(experiment, experiments)
        new_subs.append(("e", translated_experiments))
        record_delete_field(rec, tag="693", field_position_global=field[4])
        record_add_field(rec, "693", subfields=new_subs)

    # 710 Collaboration
    for field in record_get_field_instances(rec, "710"):
        subs = field_get_subfield_instances(field)
        # Remove $5 subfields and 'CERN. Geneva' values by filtering the
        # list in place. Popping by an index obtained from a copy goes
        # wrong after the first removal (wrong entry or IndexError).
        subs[:] = [
            (key, value)
            for (key, value) in subs
            if key != "5" and not value.startswith("CERN. Geneva")
        ]
        if not subs:
            record_delete_field(rec, "710", field_position_global=field[4])

    # 773 journal translations
    journals = get_journals()
    for field in record_get_field_instances(rec, "773"):
        subs = field_get_subfield_instances(field)
        new_subs = []
        for key, value in subs:
            if key == "p":
                new_subs.append((key, translate_config(value, journals)))
            else:
                new_subs.append((key, value))
        record_delete_field(rec, tag="773", field_position_global=field[4])
        record_add_field(rec, "773", subfields=new_subs)

    # FFT (856) Dealing with graphs
    figure_counter = 0
    for field in record_get_field_instances(rec, "856", ind1="4"):
        subs = field_get_subfields(field)

        newsubs = []  # subfields of the FFT field that replaces this 856
        remove = False  # set when this 856 field has to be deleted

        if "z" in subs:
            is_figure = [s for s in subs["z"] if "figure" in s.lower()]
            if is_figure and "u" in subs:
                is_subformat = [s for s in subs["u"] if "subformat" in s.lower()]
                if not is_subformat:
                    url = subs["u"][0]
                    if url.endswith(".pdf"):
                        # We try to convert
                        fd, local_url = mkstemp(suffix=os.path.basename(url), dir=CFG_TMPSHAREDDIR)
                        # Only the file name is used; close the descriptor.
                        os.close(fd)
                        _print("Downloading %s into %s" % (url, local_url), verbose=5)
                        plotfile = ""
                        try:
                            plotfile = download_url(url=url, download_to_file=local_url, timeout=30.0)
                        except InvenioFileDownloadError:
                            _print("Download failed while attempting to reach %s. Skipping.." % (url,))
                            remove = True
                        if plotfile:
                            converted = convert_images([plotfile])
                            if converted:
                                url = converted.pop()
                                _print("Successfully converted %s to %s" % (local_url, url), verbose=5)
                            else:
                                _print("Conversion failed on %s" % (local_url,))
                                url = None
                                remove = True
                    if url:
                        newsubs.append(("a", url))
                        newsubs.append(("t", "Plot"))
                        figure_counter += 1
                        if "y" in subs:
                            newsubs.append(("d", "%05d %s" % (figure_counter, subs["y"][0])))
                            newsubs.append(("n", subs["y"][0]))
                        else:
                            # Get basename without extension.
                            name = os.path.basename(os.path.splitext(subs["u"][0])[0])
                            newsubs.append(("d", "%05d %s" % (figure_counter, name)))
                            newsubs.append(("n", name))

        if not newsubs and "u" in subs:
            # NOTE(review): this also matches PDF/A subformat URLs
            # ("subformat=pdfa") - confirm that is intended.
            is_fulltext = [s for s in subs["u"] if ".pdf" in s]
            if is_fulltext:
                newsubs = [("t", "INSPIRE-PUBLIC"), ("a", subs["u"][0])]

        if not newsubs and "u" in subs:
            remove = True
            is_zipfile = [s for s in subs["u"] if ".zip" in s]
            if is_zipfile:
                url = is_zipfile[0]
                local_url = os.path.join(CFG_TMPSHAREDDIR, os.path.basename(url))
                _print("Downloading %s into %s" % (url, local_url), verbose=5)
                zipped_archive = ""
                try:
                    zipped_archive = download_url(url=is_zipfile[0], download_to_file=local_url, timeout=30.0)
                except InvenioFileDownloadError:
                    _print("Download failed while attempting to reach %s. Skipping.." % (is_zipfile[0],))
                    remove = True
                if zipped_archive:
                    unzipped_archive = unzip(zipped_archive)
                    list_of_pngs = locate("*.png", unzipped_archive)
                    for png in list_of_pngs:
                        if "_vti_" in png or "__MACOSX" in png:
                            continue
                        figure_counter += 1
                        plotsubs = []
                        plotsubs.append(("a", png))
                        caption = "%05d %s" % (figure_counter, os.path.basename(png))
                        plotsubs.append(("d", caption))
                        plotsubs.append(("t", "Plot"))
                        record_add_field(rec, "FFT", subfields=plotsubs)

        if not remove and not newsubs and "u" in subs:
            urls = (
                "http://cdsweb.cern.ch",
                "http://cms.cern.ch",
                "http://cmsdoc.cern.ch",
                "http://documents.cern.ch",
                "http://preprints.cern.ch",
                "http://cds.cern.ch",
            )
            for val in subs["u"]:
                if any(url in val for url in urls):
                    remove = True
                    break
                if val.endswith("ps.gz"):
                    remove = True

        if newsubs:
            record_add_field(rec, "FFT", subfields=newsubs)
            remove = True

        if remove:
            record_delete_field(rec, "856", ind1="4", field_position_global=field[4])

    # 500 - Preliminary results
    if "THESIS" not in collections:
        subs = [("a", "Preliminary results")]
        record_add_field(rec, "500", subfields=subs)

    # Emit one 980 field per collection determined above.
    for collection in collections:
        record_add_field(rec, "980", subfields=[("a", collection)])

    return rec
continue except APSHarvesterFileExits: write_message("File exists at %s" % (result_file,), verbose=2) except StandardError, e: if 'urlopen' in str(e) or 'URL could not be opened' in str(e): write_message("Error: URL could not be opened: %s" % (url,), stream=sys.stderr) write_message("No fulltext found for %s" % (record.recid or record.doi,)) yield record, "%s cannot be opened." % (url,) raise # Unzip the compressed file unzipped_folder = unzip(result_file) # Validate the checksum of the compressed fulltext file. try: checksum_validated_files = find_and_validate_md5_checksums( in_folder=unzipped_folder, md5key_filename=CFG_APSHARVEST_MD5_FILE) except InvenioFileChecksumError, e: write_message("Error while validating checksum: %s" % (str(e),)) write_message("Skipping %s in %s" % (record.recid or record.doi, unzipped_folder)) continue if not checksum_validated_files: write_message("Warning: No files found to perform checksum" " validation on inside %s" % (unzipped_folder,)) elif len(checksum_validated_files) != 1 or \
write_message("File exists at %s" % (result_file, ), verbose=2) except StandardError, e: if 'urlopen' in str(e) or 'URL could not be opened' in str(e): msg = "URL could not be opened: %s" % (url, ) write_message("Error: %s" % (msg, ), stream=sys.stderr) write_message("No fulltext found for %s" % (record.recid or record.doi, )) yield record, msg continue raise finally: request_end = time.time() # Unzip the compressed file unzipped_folder = unzip(result_file, base_directory=out_folder) # Validate the checksum of the compressed fulltext file. try: checksum_validated_files = find_and_validate_md5_checksums( in_folder=unzipped_folder, md5key_filename=CFG_APSHARVEST_MD5_FILE) except InvenioFileChecksumError, e: info_msg = "Skipping %s in %s" % \ (record.recid or record.doi, unzipped_folder) msg = "Error while validating checksum: %s\n%s\n%s" % \ (info_msg, str(e), traceback.format_exc()[:-1]) write_message(msg) yield record, msg continue if not checksum_validated_files:
write_message("File exists at %s" % (result_file, ), verbose=2) except StandardError, e: if 'urlopen' in str(e) or 'URL could not be opened' in str(e): msg = "URL could not be opened: %s" % (url, ) write_message("Error: %s" % (msg, ), stream=sys.stderr) write_message("No fulltext found for %s" % (record.recid or record.doi, )) yield record, msg continue raise finally: request_end = time.time() # Unzip the compressed file unzipped_folder = unzip(result_file) # Validate the checksum of the compressed fulltext file. try: checksum_validated_files = find_and_validate_md5_checksums( in_folder=unzipped_folder, md5key_filename=CFG_APSHARVEST_MD5_FILE) except InvenioFileChecksumError, e: info_msg = "Skipping %s in %s" % \ (record.recid or record.doi, unzipped_folder) msg = "Error while validating checksum: %s\n%s\n%s" % \ (info_msg, str(e), traceback.format_exc()[:-1]) write_message(msg) yield record, msg continue if not checksum_validated_files:
def apply_filter(rec):
    """
    Filters the record to be compatible within Inspire.

    The record is modified in place: the CDS record id is copied into an
    035 field, control fields and every tag outside a fixed whitelist are
    dropped, and the remaining fields (languages, report numbers, dates,
    page numbers, authors, categories, keywords, experiments,
    collaborations, journals and 856 links) are rewritten one group at a
    time. Finally the 980 collection fields are regenerated.

    Parameters:
     * rec - dictionary: BibRecord structure

    Returns: dictionary, BibRecord structure (the same object as `rec`)
    """
    # Move recid from 001 to 035
    cds_id = rec['001'][0][3]
    record_add_field(rec, '035', subfields=[('9', 'CDS'), ('a', cds_id)])

    # Clear control fields
    record_strip_controlfields(rec)

    # Clear other uninteresting fields
    interesting_fields = [
        "024", "041", "035", "037", "088", "100", "110", "111", "242",
        "245", "246", "260", "269", "300", "502", "650", "653", "693",
        "700", "710", "773", "856", "520", "500", "980"
    ]
    # Iterate over a snapshot of the tags: fields are deleted from `rec`
    # inside the loop.
    for tag in list(rec.keys()):
        if tag not in interesting_fields:
            record_delete_fields(rec, tag)

    # 980 Determine Collections
    collections = set()
    for value in record_get_field_values(rec, '980', code='a'):
        upper_value = value.upper()
        if 'NOTE' in upper_value:
            collections.add('NOTE')
        if 'THESIS' in upper_value:
            collections.add('THESIS')

    if is_published(rec):
        collections.add("PUBLISHED")
        collections.add("CITEABLE")

    # 980 Arxiv tag
    if record_get_field_values(rec, '035',
                               filter_subfield_code="a",
                               filter_subfield_value="arXiv"):
        collections.add("arXiv")

    # 980 HEP && CORE
    collections.add('HEP')
    collections.add('CORE')

    # The 980 fields are rebuilt from `collections` at the very end.
    record_delete_fields(rec, "980")

    # NOTE(review): "690" is not in `interesting_fields`, so it has presumably
    # already been removed above and this lookup may never match -- confirm.
    intnote = record_get_field_values(rec, '690',
                                      filter_subfield_code="a",
                                      filter_subfield_value='INTNOTE')
    if intnote:
        val_088 = record_get_field_values(rec, '088',
                                          filter_subfield_code="a")
        for val in val_088:
            if 'CMS' in val:
                url = ('http://weblib.cern.ch/abstract?CERN-CMS' +
                       val.split('CMS', 1)[-1])
                record_add_field(rec, '856', ind1='4',
                                 subfields=[('u', url)])

    # 041 Language
    languages = get_languages()
    language_fields = record_get_field_instances(rec, '041')
    record_delete_fields(rec, "041")
    for field in language_fields:
        subs = field_get_subfields(field)
        if 'a' in subs:
            if "eng" in subs['a']:
                # English entries are dropped rather than re-added.
                continue
            new_value = translate_config(subs['a'][0], languages)
            record_add_field(rec, "041", subfields=[('a', new_value)])

    # 035 Externals
    scn_035_fields = record_get_field_instances(rec, '035')
    forbidden_values = [
        "cercer", "inspire", "xx", "cern annual report", "cmscms", "wai01"
    ]
    for field in scn_035_fields:
        subs = field_get_subfields(field)
        if '9' in subs:
            if 'a' not in subs:
                continue
            for sub in subs['9']:
                if sub.lower() in forbidden_values:
                    break
            else:
                # No forbidden values (We did not "break")
                suffixes = [s.lower() for s in subs['9']]
                if 'spires' in suffixes:
                    # `subfields` must be a list of (code, value) tuples and
                    # subs['a'] is a list of values: use its first entry
                    # instead of formatting the whole list.
                    new_subs = [('a', 'SPIRES-%s' % subs['a'][0])]
                    record_add_field(rec, '970', subfields=new_subs)
                continue
        if 'a' in subs:
            for sub in subs['a']:
                if sub.lower() in forbidden_values:
                    record_delete_field(rec, tag="035",
                                        field_position_global=field[4])

    # 088 Report numbers: copied to 595 (CERN internal) / 037, then removed
    rep_088_fields = record_get_field_instances(rec, '088')
    for field in rep_088_fields:
        subs = field_get_subfields(field)
        if '9' in subs:
            for val in subs['9']:
                if val.startswith(('P0', 'CM-P0')):
                    sf = [('9', 'CERN'), ('b', val)]
                    record_add_field(rec, '595', subfields=sf)
        for key, val in field[0]:
            if key in ['a', '9'] and not val.startswith('SIS-'):
                record_add_field(rec, '037', subfields=[('a', val)])
    record_delete_fields(rec, "088")

    # 037 Report numbers: tag arXiv identifiers, drop SIS- numbers
    rep_037_fields = record_get_field_instances(rec, '037')
    for field in rep_037_fields:
        subs = field_get_subfields(field)
        if 'a' in subs:
            for value in subs['a']:
                if 'arXiv' in value:
                    new_subs = [('a', value), ('9', 'arXiv')]
                    # NOTE(review): "695" is not in `interesting_fields`, so
                    # this loop presumably finds nothing -- confirm.
                    for fld in record_get_field_instances(rec, '695'):
                        for key, val in field_get_subfield_instances(fld):
                            if key == 'a':
                                new_subs.append(('c', val))
                                break
                    nf = create_field(subfields=new_subs)
                    record_replace_field(rec, '037', nf, field[4])
        for key, val in field[0]:
            if key in ['a', '9'] and val.startswith('SIS-'):
                record_delete_field(rec, '037',
                                    field_position_global=field[4])

    # 242 -> 246 Translated titles
    for field in record_get_field_instances(rec, '242'):
        record_add_field(rec, '246', subfields=field[0])
    record_delete_fields(rec, '242')

    # 269 Date normalization
    for field in record_get_field_instances(rec, '269'):
        for idx, (key, value) in enumerate(field[0]):
            if key == "c":
                field[0][idx] = ("c", convert_date_to_iso(value))
                # A 269 $c date is present, so any 260 field is dropped
                # instead of being copied to 269 below.
                record_delete_fields(rec, "260")

    if 'THESIS' not in collections:
        for field in record_get_field_instances(rec, '260'):
            record_add_field(rec, '269', subfields=field[0])
        record_delete_fields(rec, '260')

    # 300 page number
    for field in record_get_field_instances(rec, '300'):
        for idx, (key, value) in enumerate(field[0]):
            if key == 'a':
                if "mult." not in value and value != " p":
                    # Keep only digits and dashes.
                    field[0][idx] = ('a', re.sub(r'[^\d-]+', '', value))
                else:
                    record_delete_field(rec, '300',
                                        field_position_global=field[4])
                    break

    # 100 & 700 punctuate author names
    author_names = record_get_field_instances(rec, '100')
    author_names.extend(record_get_field_instances(rec, '700'))
    for field in author_names:
        subs = field_get_subfields(field)
        if 'i' not in subs or 'XX' in subs['i']:
            if 'j' not in subs or 'YY' in subs['j']:
                for idx, (key, value) in enumerate(field[0]):
                    if key == 'a':
                        field[0][idx] = ('a', punctuate_authorname(value))

    # 700 -> 701 Thesis supervisors
    if 'THESIS' in collections:
        for field in record_get_field_instances(rec, '700'):
            record_add_field(rec, '701', subfields=field[0])
        record_delete_fields(rec, '700')

    # 502 move subfields (a -> b, b -> c, c -> d)
    # NOTE(review): the swapped field is only stored back into the local
    # list; whether the record itself is updated depends on what
    # field_swap_subfields() does -- confirm.
    fields_501 = record_get_field_instances(rec, '502')
    for idx, field in enumerate(fields_501):
        new_subs = []
        for key, value in field[0]:
            if key == 'a':
                new_subs.append(('b', value))
            elif key == 'b':
                new_subs.append(('c', value))
            elif key == 'c':
                new_subs.append(('d', value))
            else:
                new_subs.append((key, value))
        fields_501[idx] = field_swap_subfields(field, new_subs)

    # 650 Translate Categories
    categories = get_categories()
    category_fields = record_get_field_instances(rec, '650',
                                                 ind1='1', ind2='7')
    record_delete_fields(rec, "650")
    for field in category_fields:
        for key, value in field[0]:
            if key == 'a':
                new_value = translate_config(value, categories)
                if new_value != value:
                    new_subs = [('2', 'INSPIRE'), ('a', new_value)]
                else:
                    new_subs = [('2', 'SzGeCERN'), ('a', value)]
                record_add_field(rec, "650", ind1="1", ind2="7",
                                 subfields=new_subs)
                # Only the first $a of each field is used.
                break

    # 653 Free Keywords
    for field in record_get_field_instances(rec, '653', ind1='1'):
        subs = field_get_subfields(field)
        new_subs = []
        if 'a' in subs:
            for val in subs['a']:
                new_subs.extend([('9', 'author'), ('a', val)])
        new_field = create_field(subfields=new_subs)
        record_replace_field(rec, '653', new_field,
                             field_position_global=field[4])

    experiments = get_experiments()
    # 693 Remove if 'not applicable'
    for field in record_get_field_instances(rec, '693'):
        subs = field_get_subfields(field)
        # .get() tolerates fields lacking $a or $e.
        values_a = subs.get('a', [])
        values_e = subs.get('e', [])
        if ('not applicable' in [x.lower() for x in values_a] and
                'not applicable' in [x.lower() for x in values_e]):
            record_delete_field(rec, '693',
                                field_position_global=field[4])
            # The field is gone; do not translate and re-add it.
            continue

        new_subs = []
        experiment_a = ""
        experiment_e = ""
        if values_a:
            experiment_a = values_a[0]
            new_subs.append(('a', experiment_a))
        if values_e:
            experiment_e = values_e[0]
        experiment = "%s---%s" % (experiment_a.replace(" ", "-"),
                                  experiment_e)
        translated_experiments = translate_config(experiment, experiments)
        new_subs.append(("e", translated_experiments))
        record_delete_field(rec, tag="693", field_position_global=field[4])
        record_add_field(rec, "693", subfields=new_subs)

    # 710 Collaboration
    for field in record_get_field_instances(rec, '710'):
        subs = field_get_subfield_instances(field)
        # Filter through slice assignment so the same list object is
        # updated; popping by index while enumerating a copy shifts the
        # indices and removes the wrong entries.
        subs[:] = [(key, value) for key, value in subs
                   if key != '5' and not value.startswith('CERN. Geneva')]
        if len(subs) == 0:
            record_delete_field(rec, '710', field_position_global=field[4])

    # 773 journal translations
    journals = get_journals()
    for field in record_get_field_instances(rec, '773'):
        subs = field_get_subfield_instances(field)
        new_subs = []
        for key, value in subs:
            if key == 'p':
                new_subs.append((key, translate_config(value, journals)))
            else:
                new_subs.append((key, value))
        record_delete_field(rec, tag="773", field_position_global=field[4])
        record_add_field(rec, "773", subfields=new_subs)

    # FFT (856) Dealing with graphs
    figure_counter = 0
    for field in record_get_field_instances(rec, '856', ind1='4'):
        subs = field_get_subfields(field)

        newsubs = []
        remove = False

        if 'z' in subs:
            is_figure = [s for s in subs['z'] if "figure" in s.lower()]
            if is_figure and 'u' in subs:
                is_subformat = [
                    s for s in subs['u'] if "subformat" in s.lower()
                ]
                if not is_subformat:
                    url = subs['u'][0]
                    if url.endswith(".pdf"):
                        # We try to convert
                        fd, local_url = mkstemp(
                            suffix=os.path.basename(url),
                            dir=CFG_TMPSHAREDDIR)
                        os.close(fd)
                        _print("Downloading %s into %s" % (url, local_url),
                               verbose=5)
                        plotfile = ""
                        try:
                            plotfile = download_file(
                                url_for_file=url,
                                downloaded_file=local_url,
                                timeout=30.0)
                        except InvenioDownloadError:
                            # NOTE(review): `url` is left untouched here, so
                            # the figure entry below is still emitted with
                            # the remote URL -- confirm this is intended.
                            _print(
                                "Download failed while attempting to reach %s. Skipping.."
                                % (url, ))
                            remove = True
                        if plotfile:
                            converted = convert_images([plotfile])
                            if converted:
                                url = converted.pop()
                                _print("Successfully converted %s to %s" %
                                       (local_url, url), verbose=5)
                            else:
                                _print("Conversion failed on %s" %
                                       (local_url, ))
                                url = None
                                remove = True
                    if url:
                        newsubs.append(('a', url))
                        newsubs.append(('t', 'Plot'))
                        figure_counter += 1
                        if 'y' in subs:
                            newsubs.append(
                                ('d', "%05d %s" % (figure_counter,
                                                   subs['y'][0])))
                        else:
                            newsubs.append(
                                ('d', "%05d %s" % (figure_counter,
                                                   os.path.basename(url))))

        if not newsubs and 'u' in subs:
            is_fulltext = [s for s in subs['u'] if ".pdf" in s]
            if is_fulltext:
                newsubs = [('t', 'INSPIRE-PUBLIC'), ('a', subs['u'][0])]

        if not newsubs and 'u' in subs:
            remove = True
            is_zipfile = [s for s in subs['u'] if ".zip" in s]
            if is_zipfile:
                url = is_zipfile[0]
                local_url = os.path.join(CFG_TMPSHAREDDIR,
                                         os.path.basename(url))
                _print("Downloading %s into %s" % (url, local_url),
                       verbose=5)
                zipped_archive = ""
                try:
                    zipped_archive = download_file(
                        url_for_file=is_zipfile[0],
                        downloaded_file=local_url,
                        timeout=30.0)
                except InvenioDownloadError:
                    _print(
                        "Download failed while attempting to reach %s. Skipping.."
                        % (is_zipfile[0], ))
                    remove = True
                if zipped_archive:
                    unzipped_archive = unzip(zipped_archive)
                    list_of_pngs = locate("*.png", unzipped_archive)
                    for png in list_of_pngs:
                        if "_vti_" in png or "__MACOSX" in png:
                            continue
                        figure_counter += 1
                        caption = '%05d %s' % (figure_counter,
                                               os.path.basename(png))
                        plotsubs = [('a', png),
                                    ('d', caption),
                                    ('t', 'Plot')]
                        record_add_field(rec, 'FFT', subfields=plotsubs)

        # NOTE(review): the block above sets `remove` whenever `newsubs` is
        # empty and $u exists, so this check looks unreachable -- confirm.
        if not remove and not newsubs and 'u' in subs:
            urls = ('http://cdsweb.cern.ch', 'http://cms.cern.ch',
                    'http://cmsdoc.cern.ch', 'http://documents.cern.ch',
                    'http://preprints.cern.ch', 'http://cds.cern.ch')
            for val in subs['u']:
                if any(known_url in val for known_url in urls):
                    remove = True
                    break
                if val.endswith('ps.gz'):
                    remove = True

        if newsubs:
            record_add_field(rec, 'FFT', subfields=newsubs)
            remove = True

        if remove:
            record_delete_field(rec, '856', ind1='4',
                                field_position_global=field[4])

    # 500 - Preliminary results
    subs = [('a', "Preliminary results")]
    record_add_field(rec, "500", subfields=subs)

    # 980 Write back the collections determined above
    for collection in collections:
        record_add_field(rec, '980', subfields=[('a', collection)])

    return rec