def bad_experiments_affilations(): """Check to see bad metadata.""" metadata = { 'Institutions': { 'HepNames': '371__a', 'Truth': '110__u' }, 'Experiments': { 'HepNames': '693__e', 'Truth': '119__a' } } for aff in metadata: for value in numpy.setdiff1d( get_all_field_values(metadata[aff]['HepNames']), get_all_field_values(metadata[aff]['Truth'])): search = metadata[aff]['HepNames'] + ':"' + value + '"' result = perform_request_search(p=search, cc='HepNames') result = intbitset(result) & RECIDS_HEPN if len(result) > 0: if perform_request_search(p=metadata[aff]['Truth'] + ':"' + value + '"', cc=aff) == []: print search, result
def generate_data(data_set):
    '''Collect identifier sets from INSPIRE for the requested data set.

    data_set selects what is gathered:
      'INSPIRE_JOURNALS' -- the set of all 711__a values
      'INSPIRE_EPRINTS'  -- eprint identifiers and ADS bibcodes found among
                            the 035__a, 035__z and 037__a values
      'INSPIRE_DOIS'     -- the 0247_a values that start with '10.'

    Returns a two-element list.  Only the eprint case fills the second slot
    (with the bibcode set); the other cases put None there, and an
    unrecognised data_set yields [None, None].
    '''
    if data_set == 'INSPIRE_JOURNALS':
        journals = set(get_all_field_values('711__a'))
        return [journals, None]

    if data_set == 'INSPIRE_EPRINTS':
        identifiers = set()
        for tag in ('035__a', '035__z', '037__a'):
            identifiers |= set(get_all_field_values(tag))
        eprints = set()
        bibcodes = set()
        for identifier in identifiers:
            if identifier.startswith('oai:arXiv.org:'):
                eprints.add(identifier.replace('oai:arXiv.org:', ''))
            elif identifier.startswith('arXiv:'):
                eprints.add(identifier.replace('arXiv:', ''))
            elif re.match(ARXIV_REGEX, identifier):
                eprints.add(identifier)
            elif len(identifier) != 19:
                # Not an eprint and not 19 characters long: nothing to test.
                continue
            # Reached by every identifier that was not skipped above.
            if re.match(ADS_REGEX, identifier):
                bibcodes.add(identifier)
        return [eprints, bibcodes]

    if data_set == 'INSPIRE_DOIS':
        dois = set(value for value in get_all_field_values('0247_a')
                   if value.startswith('10.'))
        return [dois, None]

    return [None, None]
def get_all_sets():
    """
    Return all the sets.

    Sets come from three sources: the rows of the oaiREPOSITORY table, the
    values stored in the CFG_OAI_SET_FIELD field, and every ancestor of
    those sets (for 'a:b:c' the ancestors are 'a:b' and 'a').

    @return: dictionary mapping each setSpec to its
        (setSpec, setName, setDescription) row; sets that are not in
        oaiREPOSITORY get (setSpec, setSpec, '').  The global set
        CFG_OAI_REPOSITORY_GLOBAL_SET_SPEC and the empty setSpec are
        never included.
    """
    res = run_sql("SELECT setSpec, setName, setDescription FROM oaiREPOSITORY")
    ret = {}
    for row in res:
        ret[row[0]] = row

    ## Let's expand with all the set that exist in the DB
    for a_set in get_all_field_values(CFG_OAI_SET_FIELD):
        if a_set not in ret:
            ret[a_set] = (a_set, a_set, '')

    ## Let's expand with all the supersets
    ## New keys are added to ret inside the loop, so iterate over an
    ## explicit snapshot of the keys rather than over the dictionary.
    for a_set in list(ret):
        while ':' in a_set:
            a_set = a_set.rsplit(":", 1)[0]
            if a_set not in ret:
                ret[a_set] = (a_set, a_set, '')

    ## Let's remove the special global set
    ret.pop(CFG_OAI_REPOSITORY_GLOBAL_SET_SPEC, None)
    ## '' is not a valid setSpec but might be in the MARC
    ret.pop('', None)
    return ret
def get_citations(self): """Find all the citations of records in this repository.""" citations_list = [] citations = '' for ref in get_all_field_values('999C5a'): if self.regex_base.match(ref): search = '999C5a:' + ref cites = perform_request_search(p=search, cc='HEP') if len(cites): if not self.regex.match(ref): print 'Problem with DOI extraction:', search, cites continue try: metadata = self.get_ref_metadata_inspire( ref, self.dois) if not metadata: metadata = self.get_ref_metadata_repository(ref) except ValueError: print 'Problem with DOI:', search, cites, '\n' continue citations_list.append((len(cites), ref, cites, metadata)) for doi in sorted(citations_list, reverse=True): doi_url = 'https://doi.org/' + doi[1].replace('doi:', '') citations += \ '''{0} citations to {3} {2} https://inspirehep.net/search?p=999C5a:{1} '''.format(doi[0], doi[1], doi[3], doi_url) return citations
def get_all_sets():
    """
    Return all the sets.

    Sets come from three sources: the rows of the oaiREPOSITORY table, the
    values stored in the CFG_OAI_SET_FIELD field, and every ancestor of
    those sets (for 'a:b:c' the ancestors are 'a:b' and 'a').

    @return: dictionary mapping each setSpec to its
        (setSpec, setName, setDescription) row; sets that are not in
        oaiREPOSITORY get (setSpec, setSpec, '').  The global set
        CFG_OAI_REPOSITORY_GLOBAL_SET_SPEC and the empty setSpec are
        never included.
    """
    res = run_sql("SELECT setSpec, setName, setDescription FROM oaiREPOSITORY")
    ret = {}
    for row in res:
        ret[row[0]] = row

    ## Let's expand with all the set that exist in the DB
    for a_set in get_all_field_values(CFG_OAI_SET_FIELD):
        if a_set not in ret:
            ret[a_set] = (a_set, a_set, '')

    ## Let's expand with all the supersets
    ## New keys are added to ret inside the loop, so iterate over an
    ## explicit snapshot of the keys rather than over the dictionary.
    for a_set in list(ret):
        while ':' in a_set:
            a_set = a_set.rsplit(":", 1)[0]
            if a_set not in ret:
                ret[a_set] = (a_set, a_set, '')

    ## Let's remove the special global set
    ret.pop(CFG_OAI_REPOSITORY_GLOBAL_SET_SPEC, None)
    ## '' is not a valid setSpec but might be in the MARC
    ret.pop('', None)
    return ret
def get_inspire_dois(cls, regex):
    """Return the set of INSPIRE DOIs accepted by regex.

    Every 0247_a value is prefixed with 'doi:' before being tested with
    regex.match; the returned strings keep that prefix.
    """
    prefixed = ('doi:' + value for value in get_all_field_values('0247_a'))
    return set(candidate for candidate in prefixed if regex.match(candidate))
def new_orcids(already_seen): """Search for new ORCIDs in HEP.""" orcid_counter = 0 fields = ('100__j', '700__j', '100__k', '700__k') for field in fields: for orcid in get_all_field_values(field): if not re.search('00-000', orcid): continue if not orcid.startswith('ORCID:'): search = field + ':' + orcid recid = perform_request_search(p=search, cc='HEP') if len(recid): print 'Needs to start with "ORCID:"', recid, orcid continue orcid = orcid.replace('ORCID:', '') if bad_id_check(orcid): search = field + ':' + orcid recid = perform_request_search(p=search, cc='HEP') if len(recid): print 'Bad ORCID', recid, orcid search = "{0}:ORCID:* or {1}:ORCID:* or {2}:ORCID:* \ or {3}:ORCID:* 980:core".format(fields[0], fields[1], fields[2], fields[3]) search = "{0}:ORCID:* or {1}:ORCID:* or {2}:ORCID:* \ or {3}:ORCID:* (037__c:hep-* or 037__c:nucl-* \ or 037__c:math*)".format(fields[0], fields[1], fields[2], fields[3]) search = "{0}:ORCID:* or {1}:ORCID:* or {2}:ORCID:* \ or {3}:ORCID:* 037__c:hep-th".format(fields[0], fields[1], fields[2], fields[3]) result = perform_request_search(p=search, cc='HEP') #search = "{0}:ORCID:* or {1}:ORCID:* or {2}:ORCID:* \ # or {3}:ORCID:*".format(fields[0], # fields[1], fields[2], fields[3]) #result = perform_request_search(p=search, cc='Fermilab') for recid, field in [(recid, field) for recid in result \ for field in fields]: for orcid in get_fieldvalues(recid, field): if not orcid.startswith('ORCID:'): continue orcid = orcid.strip('ORCID:') if orcid in already_seen: continue if bad_id_check(orcid): print "Bad ORCID in HEP:", orcid already_seen[orcid] = recid print "http://inspirehep.net/record/{0}\thttp://orcid.org/{1}".\ format(str(recid), orcid) orcid_counter += 1 #search = '001:' + str(recid) + ' 980:CORE' #if perform_request_search(p=search, cc='HEP'): # print "http://inspirehep.net/record/{0}\thttp://orcid.org/{1}".\ # format(str(recid), orcid) print 'New orcids:', orcid_counter
def bad_experiments_affilations(): """Check to see bad metadata.""" metadata = {'Institutions':{'HepNames':'371__a', 'Truth':'110__u'}, 'Experiments':{'HepNames':'693__e', 'Truth':'119__a'}} for aff in metadata: for value in numpy.setdiff1d(get_all_field_values( metadata[aff]['HepNames']), get_all_field_values( metadata[aff]['Truth'])): search = metadata[aff]['HepNames'] + ':"' + value + '"' result = perform_request_search(p=search, cc='HepNames') result = intbitset(result) & RECIDS_HEPN if len(result) > 0: if perform_request_search( p=metadata[aff]['Truth'] + ':"' + value + '"', cc=aff) == []: print search, result
def bad_url_z(): """Check to make sure $$z field is correct.""" field = '8564_z' good_values = set(['postprint', 'openaccess']) for value in get_all_field_values(field): search = field + ':' + value result = perform_request_search(p=search, cc='Fermilab') if len(result) == 0: continue if value in good_values: print value, len(result) else: print value, result
def bad_url_z(): """Check to make sure $$z field is correct.""" field = '8564_z' good_values = set(['postprint', 'openaccess']) for value in get_all_field_values(field): search = field + ':' + value result = perform_request_search(p=search, cc='Fermilab') if len(result) == 0: continue if value in good_values: print value, len(result) else: print value, result
def get_jacow_dois():
    """Return all the JACoW DOIs INSPIRE has.

    Collects the 0247_a values that start with '10.18429/JACoW-' (stored
    here with a 'doi:' prefix) and the 999C5a values that start with
    'doi:10.18429/JACoW-'.  When every DOI found in 999C5a is also present
    in 0247_a, the 0247_a set is returned.  Otherwise the missing DOIs are
    examined, a https://doi.org/ URL is printed for those that pass the
    checks below, and the function ends in sys.exit() instead of returning.
    """
    jacow_dois_record = set()
    for doi in get_all_field_values('0247_a'):
        if doi.startswith('10.18429/JACoW-'):
            # Add the 'doi:' prefix so these compare equal to 999C5a values.
            jacow_dois_record.add('doi:' + doi)
    jacow_dois_ref = set()
    for doi in get_all_field_values('999C5a'):
        if doi.startswith('doi:10.18429/JACoW-'):
            jacow_dois_ref.add(doi)
    # DOIs that appear in 999C5a but not in 0247_a.
    missing_dois = jacow_dois_ref - jacow_dois_record
    if not missing_dois:
        return jacow_dois_record
    for doi in sorted(missing_dois):
        if good_doi(doi):
            # NOTE(review): the result of this call is discarded -- confirm
            # whether it is a leftover that can be removed.
            search_unit('doi', f='0247_2', m='a')
            doi = doi.replace('doi:', '')
            # NOTE(review): the DOI value is searched in 0247_2, not in
            # 0247_a where the DOIs above were read from -- confirm intent.
            if search_unit(doi, f='0247_2', m='a'):
                continue
            print 'https://doi.org/{0}'.format(doi)
    # Reached only when some DOIs were missing; no value is returned.
    sys.exit()
def new_orcids(already_seen): """Search for new ORCIDs in HEP.""" orcid_counter = 0 fields = ('100__j', '700__j', '100__k', '700__k') for field in fields: for orcid in get_all_field_values(field): if not re.search('00-000', orcid): continue if not orcid.startswith('ORCID:'): search = field + ':' + orcid recid = perform_request_search(p=search, cc='HEP') if len(recid): print 'Needs to start with "ORCID:"', recid, orcid continue orcid = orcid.replace('ORCID:', '') if bad_id_check(orcid): search = field + ':' + orcid recid = perform_request_search(p=search, cc='HEP') if len(recid): print 'Bad ORCID', recid, orcid search = "{0}:ORCID:* or {1}:ORCID:* or {2}:ORCID:* \ or {3}:ORCID:* 980:CORE".format(fields[0], fields[1], fields[2], fields[3]) result = perform_request_search(p=search, cc='HEP') for recid, field in [(recid, field) for recid in result \ for field in fields]: for orcid in get_fieldvalues(recid, field): if not orcid.startswith('ORCID:'): continue orcid = orcid.strip('ORCID:') if orcid in already_seen: continue if bad_id_check(orcid): print "Bad ORCID in HEP:", orcid already_seen[orcid] = recid print "http://inspirehep.net/record/{0}\thttp://orcid.org/{1}".\ format(str(recid), orcid) orcid_counter += 1 #search = '001:' + str(recid) + ' 980:CORE' #if perform_request_search(p=search, cc='HEP'): # print "http://inspirehep.net/record/{0}\thttp://orcid.org/{1}".\ # format(str(recid), orcid) print 'New orcids:', orcid_counter
import re
from sys import argv
from invenio.search_engine import perform_request_search, get_record, \
                                  search_unit, get_all_field_values
from invenio.bibrecord import print_rec, record_get_field_instances, \
                              record_add_field
from invenio.intbitset import intbitset
from invenio.bibformat_engine import BibFormatObject
from invenio.search_engine import get_collection_reclist
from hep_convert_email_to_id_input import RECIDS, SEARCH, VERBOSE

# The lookups below run once, when the module is loaded.

# Records of the 'HepNames' collection.
HN = get_collection_reclist('HepNames')
# All values of 371__m, 371__o, 595__m and 595__o joined into one sequence
# (presumably the e-mail addresses known to HepNames, going by the name).
EMAILS_HEPNAMES = get_all_field_values('371__m') + \
                  get_all_field_values('371__o') + \
                  get_all_field_values('595__m') + \
                  get_all_field_values('595__o')
# All values of 100__m and 700__m joined into one sequence.
EMAILS_HEP = get_all_field_values('100__m') + get_all_field_values('700__m')
# NOTE(review): presumably an upper bound on the work done per run -- its
# use is not visible in this part of the file.
COUNTER_MAX = 400

def generate_check_digit(base_digits):
    ''' Taken from https://github.com/tjwds/generate-orcid-checksum '''
    total = 0
    for digit in str(base_digits):
        total = (total + int(digit)) * 2
    remainder = total % 11
from os.path import exists
import re
import random
#from hep_ads_xml_dois import DOIS
#DIRECTORY = '/afs/cern.ch/project/inspire/TEST/hoc/'

# Four digits, then letters or '&', then digits/dots, then letters, dots
# or digits; matched case-insensitively.
ADS_REGEX = re.compile(r"^\d{4}([a-z&]+)[\d.]+[a-z.\d]+$", re.IGNORECASE)
# Lower-case archive name with an optional hyphen, an optional '.' and
# upper-case letters, then '/' and seven digits, e.g. 'hep-th/9901001'.
ARXIV_REGEX = re.compile(r"^[a-z]+\-?[a-z]+\.?[A-Z]*/\d{7}$")
# Filled in by the loops below, which run when the module is loaded.
INSPIRE_EPRINTS = set()
INSPIRE_ADS_BIBCODE = set()
INSPIRE_DOIS = set()
# Every value of 035__a, 035__z and 037__a.
EPRINT_UNION = get_all_field_values('035__a') + \
               get_all_field_values('035__z') + \
               get_all_field_values('037__a')
for item in EPRINT_UNION:
    if item.startswith('oai:arXiv.org:'):
        INSPIRE_EPRINTS.add(item.replace('oai:arXiv.org:', ''))
    elif item.startswith('arXiv:'):
        INSPIRE_EPRINTS.add(item.replace('arXiv:', ''))
    elif re.match(ARXIV_REGEX, item):
        INSPIRE_EPRINTS.add(item)
    elif len(item) != 19:
        # Not an eprint and not 19 characters long: skip the test below.
        continue
    if re.match(ADS_REGEX, item):
        INSPIRE_ADS_BIBCODE.add(item)
for item in get_all_field_values('0247_a'):
    if item.startswith('10.'):
import re

# NOTE(review): presumably an upper bound on the work done per run -- its
# use is not visible in this part of the file.
COUNTER_MAX = 100
# Values beginning with FERMILAB or FNAL, in any letter case.
FERMILAB = re.compile('^FERMILAB.*', re.I)
FNAL = re.compile('^FNAL.*', re.I)
# Values beginning with FERMILAB and ending in THY or EXP, in any case.
ARCHAIC = re.compile('^FERMILAB.*(THY|EXP)$', re.I)
# The four patterns below are case-sensitive (upper-case letters only).
# LETTERS-LETTERS-dd-ddd followed by anything.
YY_FORM = re.compile(r'^[A-Z]+-[A-Z]+-\d\d-\d\d\d.*')
# LETTERS-LETTERS-dddd-dd followed by anything.
YYYY_FORM = re.compile(r'^[A-Z]+-[A-Z]+-\d\d\d\d-\d\d.*')
# LETTERS-(TM|FN|PROPOSAL)-dddd followed by anything.
DDDD_FORM = re.compile(r'^[A-Z]+-(TM|FN|PROPOSAL)-\d\d\d\d.*')
# FERMILAB-THESIS-2005..2008-ddd followed by anything.
EXCEPTION = re.compile(r'^FERMILAB-THESIS-200[5-8]-\d\d\d.*')
# MARC fields read below.
REF = '999C5r'
REP = '037__a'

# The lookups below run once, when the module is loaded.
REPORT_CITATIONS = get_all_field_values(REF)
# Single-value override, left commented out.
#REPORT_CITATIONS = ['FERMILAB-PUB-96-357-']
# Upper-cased 999C5r values that begin with FERMILAB or FNAL.
FERMILAB_CITATIONS = set([
    report.upper() for report in REPORT_CITATIONS
    if FERMILAB.match(report) or FNAL.match(report)
])
REPORT_NUMBERS = get_all_field_values(REP)
# Single-value override, left commented out.
#REPORT_NUMBERS = ['FERMILAB-PUB-96-357-E']
# Upper-cased 037__a values that begin with FERMILAB or FNAL, leaving out
# those that match ARCHAIC.
FERMILAB_REPORTS = set([
    report.upper() for report in REPORT_NUMBERS
    if (FERMILAB.match(report) or FNAL.match(report))
    and not ARCHAIC.match(report)
])
import re
from sys import argv
from invenio.search_engine import perform_request_search, get_record, \
                                  search_unit, get_all_field_values
from invenio.bibrecord import print_rec, record_get_field_instances, \
                              record_add_field, record_get_field_value
from invenio.intbitset import intbitset
from invenio.bibformat_engine import BibFormatObject
from invenio.search_engine import get_collection_reclist
from hep_convert_email_to_id_input import RECIDS, SEARCH, VERBOSE

# The lookups below run once, when the module is loaded.

# Records of the 'HepNames' collection.
HN = get_collection_reclist('HepNames')
# All values of 371__m, 371__o, 595__m and 595__o joined into one sequence
# (presumably the e-mail addresses known to HepNames, going by the name).
EMAILS_HEPNAMES = get_all_field_values('371__m') + \
                  get_all_field_values('371__o') + \
                  get_all_field_values('595__m') + \
                  get_all_field_values('595__o')
# All values of 100__m and 700__m joined into one sequence.
EMAILS_HEP = get_all_field_values('100__m') + get_all_field_values('700__m')
# NOTE(review): presumably an upper bound on the work done per run -- its
# use is not visible in this part of the file.
COUNTER_MAX = 400

def generate_check_digit(base_digits):
    ''' Taken from https://github.com/tjwds/generate-orcid-checksum '''
    total = 0
    for digit in str(base_digits):
        total = (total + int(digit)) * 2