def extract_arxiv_ids_from_recid(recid):
    """Yield the arXiv identifiers stored in the 037 fields of a record.

    Only 037 fields whose first ``$9`` subfield is exactly ``arXiv`` are
    used; the identifier is read from the first ``$a`` subfield, e.g.::

        037__ $9arXiv $aarXiv:1010.1111

    Fields lacking either subfield are skipped.  For values starting with
    ``arXiv`` the text between the first and the second colon is kept
    (``arXiv:1010.1111`` becomes ``1010.1111``).  When
    ARXIV_VERSION_PATTERN matches the result, its last two characters are
    dropped.

    :param recid: record identifier, handed to get_record()
    :return: generator yielding one identifier string per matching field
    """
    record = get_record(recid)
    for field in record.get("037", []):
        try:
            source = field.get_subfield_values("9")[0]
        except IndexError:
            # No $9 subfield: the origin of this report number is unknown.
            continue
        if source != "arXiv":
            continue

        try:
            report_number = field.get_subfield_values("a")[0]
        except IndexError:
            # No $a subfield: nothing to extract.
            continue

        # Extract arxiv id
        if report_number.startswith("arXiv"):
            pieces = report_number.split(":")
            # A value such as "arXiv1010.1111" has no colon; keep it as is
            # instead of letting an IndexError escape from the generator.
            if len(pieces) > 1:
                report_number = pieces[1]

        # NOTE(review): presumably the pattern matches a trailing
        # two-character version suffix such as "v1" -- confirm against the
        # definition of ARXIV_VERSION_PATTERN.
        if ARXIV_VERSION_PATTERN.search(report_number):
            report_number = report_number[:-2]

        yield report_number
def extract_arxiv_ids_from_recid(recid):
    """Yield the arXiv identifiers stored in the 037 fields of a record.

    Only 037 fields whose first ``$9`` subfield is exactly ``arXiv`` are
    used; the identifier is read from the first ``$a`` subfield, e.g.::

        037__ $9arXiv $aarXiv:1010.1111

    Fields lacking either subfield are skipped.  For values starting with
    ``arXiv`` the text between the first and the second colon is kept
    (``arXiv:1010.1111`` becomes ``1010.1111``).  When
    ARXIV_VERSION_PATTERN matches the result, its last two characters are
    dropped.

    :param recid: record identifier, handed to get_record()
    :return: generator yielding one identifier string per matching field
    """
    record = get_record(recid)
    for field in record.get('037', []):
        try:
            source = field.get_subfield_values('9')[0]
        except IndexError:
            # No $9 subfield: the origin of this report number is unknown.
            continue
        if source != 'arXiv':
            continue

        try:
            report_number = field.get_subfield_values('a')[0]
        except IndexError:
            # No $a subfield: nothing to extract.
            continue

        # Extract arxiv id
        if report_number.startswith('arXiv'):
            pieces = report_number.split(':')
            # A value such as 'arXiv1010.1111' has no colon; keep it as is
            # instead of letting an IndexError escape from the generator.
            if len(pieces) > 1:
                report_number = pieces[1]

        # NOTE(review): presumably the pattern matches a trailing
        # two-character version suffix such as 'v1' -- confirm against the
        # definition of ARXIV_VERSION_PATTERN.
        if ARXIV_VERSION_PATTERN.search(report_number):
            report_number = report_number[:-2]

        yield report_number
def get_citation_informations(recid_list, tags, config, fetch_catchup_info=True):
    """Collect identifiers and references for the records in recid_list.

    Every record is loaded with get_record() and two dictionaries are
    filled in, each keyed first by kind of identifier ('report-numbers',
    'journals', 'doi', 'hdl', 'isbn', 'record_id') and then by record id:

    - records_info: identifiers carried by the record itself
    - references_info: identifiers found in the record's reference tags

    Example of one entry:

        references_info['report-numbers'][93] = ['astro-ph/9812088']

    A record that is skipped (not in the configured collections, or
    listed by deleted_recids_cache()) only gets its
    records_info['record_id'] entry.

    :param recid_list: record ids to scan; len() is taken for the
        progress messages
    :param tags: mapping holding the tags to read; a false value disables
        the corresponding lookup
    :param config: object whose get(section, option) is read for
        ("rank_method", "function") and (<function>, 'collections')
    :param fetch_catchup_info: when false, the identifiers of the record
        itself (other than 'record_id') are not collected
    :return: tuple (records_info, references_info)

    NB: stuff here is for analysing new or changed records.
    see "ref_analyzer" for more.
    """
    begin_time = os.times()[4]

    records_info = dict((kind, {}) for kind in (
        'report-numbers', 'journals', 'doi', 'hdl', 'isbn', 'record_id'))
    references_info = dict((kind, {}) for kind in (
        'report-numbers', 'journals', 'doi', 'record_id', 'isbn', 'hdl'))

    def values_of(record, tag):
        """Return the values of every subfield of record found for tag."""
        return [subfield.value for subfield in record.find_subfields(tag)]

    for position, recid in enumerate(recid_list):
        # Give the task scheduler a chance to pause us every 10 records.
        if position % 10 == 0:
            task_sleep_now_if_required()

        # Report progress every 50 records.
        if position % 50 == 0:
            progress = "get cit.inf done %s of %s" % (position, len(recid_list))
            write_message(progress)
            task_update_progress(progress)

        record = get_record(recid)
        records_info['record_id'][recid] = [unicode(recid)]

        function = config.get("rank_method", "function")
        if config.get(function, 'collections'):
            allowed_recids = recids_cache(config.get(function, 'collections'))
            if recid not in allowed_recids:
                # Outside of the collections we want to process.
                continue
        elif recid in deleted_recids_cache():
            # Deleted record. It is skipped explicitly because a
            # soft-deleted record (e.g. via bibedit, when collection tag
            # 980 is DELETED) may still carry its old report number or
            # journal publication info.
            continue

        # ---- what this record cites ----
        report_number_tag = tags['refs_report_number']
        if report_number_tag:
            cited_numbers = values_of(record, report_number_tag)
            references_info['report-numbers'][recid] = cited_numbers
            write_message("references_info['report-numbers'][%s] = %r"
                          % (recid, cited_numbers), verbose=9)

        journal_tag = tags['refs_journal']
        if journal_tag:
            cited_journals = references_info['journals'][recid] = []
            for ref in record.find_subfields(journal_tag):
                # Inspire specific parsing: "journal,volume,page"
                pieces = ref.value.split(',')
                if len(pieces) == 3:
                    journal, volume, page = pieces
                    alt_volume = get_alt_volume(volume)
                    if alt_volume:
                        cited_journals.append(
                            ','.join([journal, alt_volume, page]))
                # The reference as written is always recorded.
                cited_journals.append(ref.value)
            write_message("references_info['journals'][%s] = %r"
                          % (recid, cited_journals), verbose=9)

        doi_tag = tags['refs_doi']
        if doi_tag:
            dois = []
            hdls = []
            for ref in values_of(record, doi_tag):
                if ref.startswith("hdl:"):
                    hdls.append(ref[4:])
                elif ref.startswith("doi:"):
                    dois.append(ref[4:])
                else:
                    # No scheme prefix: treated as a DOI.
                    dois.append(ref)
            references_info['doi'][recid] = dois
            references_info['hdl'][recid] = hdls
            write_message("references_info['doi'][%s] = %r" % (recid, dois),
                          verbose=9)
            write_message("references_info['hdl'][%s] = %r" % (recid, hdls),
                          verbose=9)

        record_id_tag = tags['refs_record_id']
        if record_id_tag:
            cited_recids = values_of(record, record_id_tag)
            references_info['record_id'][recid] = cited_recids
            write_message("references_info['record_id'][%s] = %r"
                          % (recid, cited_recids), verbose=9)

        isbn_tag = tags['refs_isbn']
        if isbn_tag:
            cited_isbns = values_of(record, isbn_tag)
            references_info['isbn'][recid] = cited_isbns
            write_message("references_info['isbn'][%s] = %r"
                          % (recid, cited_isbns), verbose=9)

        if not fetch_catchup_info:
            # We do not need the extra info
            continue

        # ---- how this record itself can be identified ----
        if tags['record_pri_number'] or tags['record_add_number']:
            own_numbers = records_info['report-numbers'][recid] = []
            if tags['record_pri_number']:
                own_numbers.extend(values_of(record, tags['record_pri_number']))
            if tags['record_add_number']:
                own_numbers.extend(values_of(record, tags['record_add_number']))
            write_message("records_info[%s]['report-numbers'] = %r"
                          % (recid, own_numbers), verbose=9)

        if tags['doi']:
            own_dois = records_info['doi'][recid] = []
            own_hdls = records_info['hdl'][recid] = []
            for tag in tags['doi']:
                for field in record.find_fields(tag[:5]):
                    # Subfield 2 names the scheme of the value in subfield a.
                    if 'DOI' in field.get_subfield_values('2'):
                        own_dois.extend(field.get_subfield_values('a'))
                    elif 'HDL' in field.get_subfield_values('2'):
                        own_hdls.extend(field.get_subfield_values('a'))
            write_message("records_info[%s]['doi'] = %r" % (recid, own_dois),
                          verbose=9)
            write_message("records_info[%s]['hdl'] = %r" % (recid, own_hdls),
                          verbose=9)

        if tags['isbn']:
            own_isbns = records_info['isbn'][recid] = []
            for tag in tags['isbn']:
                own_isbns.extend(values_of(record, tag))
            write_message("records_info[%s]['isbn'] = %r" % (recid, own_isbns),
                          verbose=9)

        # get a combination of
        # journal vol (year) pages
        if tags['publication']:
            own_journals = get_journal_info(record, tags)
            records_info['journals'][recid] = own_journals
            write_message("records_info[%s]['journals'] = %r"
                          % (recid, own_journals), verbose=9)

    progress = "get cit.inf done fully"
    write_message(progress)
    task_update_progress(progress)

    elapsed = os.times()[4] - begin_time
    write_message("Execution time for generating citation info "
                  "from record: %.2f sec" % elapsed)

    return records_info, references_info
def get_citation_informations(recid_list, tags, config, fetch_catchup_info=True):
    """Collect identifiers and references for the records in recid_list.

    Every record is loaded with get_record() and two dictionaries are
    filled in, each keyed first by kind of identifier ("report-numbers",
    "journals", "doi", "hdl", "isbn", "record_id") and then by record id:

    - records_info: identifiers carried by the record itself
    - references_info: identifiers found in the record's reference tags

    Example of one entry:

        references_info["report-numbers"][93] = ["astro-ph/9812088"]

    A record that is skipped (not in the configured collections, or
    listed by deleted_recids_cache()) only gets its
    records_info["record_id"] entry.

    :param recid_list: record ids to scan; len() is taken for the
        progress messages
    :param tags: mapping holding the tags to read; a false value disables
        the corresponding lookup
    :param config: object whose get(section, option) is read for
        ("rank_method", "function") and (<function>, "collections")
    :param fetch_catchup_info: when false, the identifiers of the record
        itself (other than "record_id") are not collected
    :return: tuple (records_info, references_info)

    NB: stuff here is for analysing new or changed records.
    see "ref_analyzer" for more.
    """
    begin_time = os.times()[4]

    records_info = dict((kind, {}) for kind in (
        "report-numbers", "journals", "doi", "hdl", "isbn", "record_id"))
    references_info = dict((kind, {}) for kind in (
        "report-numbers", "journals", "doi", "record_id", "isbn", "hdl"))

    def values_of(record, tag):
        """Return the values of every subfield of record found for tag."""
        return [subfield.value for subfield in record.find_subfields(tag)]

    for position, recid in enumerate(recid_list):
        # Give the task scheduler a chance to pause us every 10 records.
        if position % 10 == 0:
            task_sleep_now_if_required()

        # Report progress every 50 records.
        if position % 50 == 0:
            progress = "get cit.inf done %s of %s" % (position, len(recid_list))
            write_message(progress)
            task_update_progress(progress)

        record = get_record(recid)
        records_info["record_id"][recid] = [unicode(recid)]

        function = config.get("rank_method", "function")
        if config.get(function, "collections"):
            allowed_recids = recids_cache(config.get(function, "collections"))
            if recid not in allowed_recids:
                # Outside of the collections we want to process.
                continue
        elif recid in deleted_recids_cache():
            # Deleted record. It is skipped explicitly because a
            # soft-deleted record (e.g. via bibedit, when collection tag
            # 980 is DELETED) may still carry its old report number or
            # journal publication info.
            continue

        # ---- what this record cites ----
        report_number_tag = tags["refs_report_number"]
        if report_number_tag:
            cited_numbers = values_of(record, report_number_tag)
            references_info["report-numbers"][recid] = cited_numbers
            write_message("references_info['report-numbers'][%s] = %r"
                          % (recid, cited_numbers), verbose=9)

        journal_tag = tags["refs_journal"]
        if journal_tag:
            cited_journals = references_info["journals"][recid] = []
            for ref in record.find_subfields(journal_tag):
                # Inspire specific parsing: "journal,volume,page"
                pieces = ref.value.split(",")
                if len(pieces) == 3:
                    journal, volume, page = pieces
                    alt_volume = get_alt_volume(volume)
                    if alt_volume:
                        cited_journals.append(
                            ",".join([journal, alt_volume, page]))
                # The reference as written is always recorded.
                cited_journals.append(ref.value)
            write_message("references_info['journals'][%s] = %r"
                          % (recid, cited_journals), verbose=9)

        doi_tag = tags["refs_doi"]
        if doi_tag:
            dois = []
            hdls = []
            for ref in values_of(record, doi_tag):
                if ref.startswith("hdl:"):
                    hdls.append(ref[4:])
                elif ref.startswith("doi:"):
                    dois.append(ref[4:])
                else:
                    # No scheme prefix: treated as a DOI.
                    dois.append(ref)
            references_info["doi"][recid] = dois
            references_info["hdl"][recid] = hdls
            write_message("references_info['doi'][%s] = %r" % (recid, dois),
                          verbose=9)
            write_message("references_info['hdl'][%s] = %r" % (recid, hdls),
                          verbose=9)

        record_id_tag = tags["refs_record_id"]
        if record_id_tag:
            cited_recids = values_of(record, record_id_tag)
            references_info["record_id"][recid] = cited_recids
            write_message("references_info['record_id'][%s] = %r"
                          % (recid, cited_recids), verbose=9)

        isbn_tag = tags["refs_isbn"]
        if isbn_tag:
            cited_isbns = values_of(record, isbn_tag)
            references_info["isbn"][recid] = cited_isbns
            write_message("references_info['isbn'][%s] = %r"
                          % (recid, cited_isbns), verbose=9)

        if not fetch_catchup_info:
            # We do not need the extra info
            continue

        # ---- how this record itself can be identified ----
        if tags["record_pri_number"] or tags["record_add_number"]:
            own_numbers = records_info["report-numbers"][recid] = []
            if tags["record_pri_number"]:
                own_numbers.extend(values_of(record, tags["record_pri_number"]))
            if tags["record_add_number"]:
                own_numbers.extend(values_of(record, tags["record_add_number"]))
            write_message("records_info[%s]['report-numbers'] = %r"
                          % (recid, own_numbers), verbose=9)

        if tags["doi"]:
            own_dois = records_info["doi"][recid] = []
            own_hdls = records_info["hdl"][recid] = []
            for tag in tags["doi"]:
                for field in record.find_fields(tag[:5]):
                    # Subfield 2 names the scheme of the value in subfield a.
                    if "DOI" in field.get_subfield_values("2"):
                        own_dois.extend(field.get_subfield_values("a"))
                    elif "HDL" in field.get_subfield_values("2"):
                        own_hdls.extend(field.get_subfield_values("a"))
            write_message("records_info[%s]['doi'] = %r" % (recid, own_dois),
                          verbose=9)
            write_message("records_info[%s]['hdl'] = %r" % (recid, own_hdls),
                          verbose=9)

        if tags["isbn"]:
            own_isbns = records_info["isbn"][recid] = []
            for tag in tags["isbn"]:
                own_isbns.extend(values_of(record, tag))
            write_message("records_info[%s]['isbn'] = %r" % (recid, own_isbns),
                          verbose=9)

        # get a combination of
        # journal vol (year) pages
        if tags["publication"]:
            own_journals = get_journal_info(record, tags)
            records_info["journals"][recid] = own_journals
            write_message("records_info[%s]['journals'] = %r"
                          % (recid, own_journals), verbose=9)

    progress = "get cit.inf done fully"
    write_message(progress)
    task_update_progress(progress)

    elapsed = os.times()[4] - begin_time
    write_message("Execution time for generating citation info "
                  "from record: %.2f sec" % elapsed)

    return records_info, references_info