Ejemplo n.º 1
0
def bad_experiments_affilations():
    """Check to see bad metadata."""

    metadata = {
        'Institutions': {
            'HepNames': '371__a',
            'Truth': '110__u'
        },
        'Experiments': {
            'HepNames': '693__e',
            'Truth': '119__a'
        }
    }

    for aff in metadata:
        for value in numpy.setdiff1d(
                get_all_field_values(metadata[aff]['HepNames']),
                get_all_field_values(metadata[aff]['Truth'])):
            search = metadata[aff]['HepNames'] + ':"' + value + '"'
            result = perform_request_search(p=search, cc='HepNames')
            result = intbitset(result) & RECIDS_HEPN
            if len(result) > 0:
                if perform_request_search(p=metadata[aff]['Truth'] + ':"' +
                                          value + '"',
                                          cc=aff) == []:
                    print search, result
Ejemplo n.º 2
0
def generate_data(data_set):
    '''Gets data from INSPIRE.

    Returns a two-element list whose content depends on ``data_set``:

    * 'INSPIRE_JOURNALS': [set of 711__a values, None]
    * 'INSPIRE_EPRINTS':  [set of eprint identifiers, set of strings
      matching ADS_REGEX], both taken from 035__a, 035__z and 037__a
    * 'INSPIRE_DOIS':     [set of 0247_a values starting with '10.', None]
    * anything else:      [None, None]
    '''

    if data_set == 'INSPIRE_JOURNALS':
        journals = set(get_all_field_values('711__a'))
        return [journals, None]

    if data_set == 'INSPIRE_EPRINTS':
        candidates = set()
        for tag in ('035__a', '035__z', '037__a'):
            candidates.update(get_all_field_values(tag))

        eprints = set()
        bibcodes = set()
        for identifier in candidates:
            eprint = None
            if identifier.startswith('oai:arXiv.org:'):
                eprint = identifier.replace('oai:arXiv.org:', '')
            elif identifier.startswith('arXiv:'):
                eprint = identifier.replace('arXiv:', '')
            elif re.match(ARXIV_REGEX, identifier):
                eprint = identifier

            if eprint is not None:
                eprints.add(eprint)
            elif len(identifier) != 19:
                # Not an eprint and not 19 characters long: ignore it.
                continue
            # Reached by eprints as well as by 19-character leftovers.
            if re.match(ADS_REGEX, identifier):
                bibcodes.add(identifier)
        return [eprints, bibcodes]

    if data_set == 'INSPIRE_DOIS':
        dois = set(value for value in get_all_field_values('0247_a')
                   if value.startswith('10.'))
        return [dois, None]

    return [None, None]
Ejemplo n.º 3
0
def get_all_sets():
    """
    Return all the sets.

    The result starts from the rows of the ``oaiREPOSITORY`` table, is
    extended with every value found in ``CFG_OAI_SET_FIELD``, and finally
    with every superset of a colon-separated setSpec (``a:b:c`` also
    yields ``a:b`` and ``a``).  Sets that do not come from the table use
    their setSpec as name and an empty description.

    @return: dictionary mapping each setSpec to a
        ``(setSpec, setName, setDescription)`` row.  The special global
        set ``CFG_OAI_REPOSITORY_GLOBAL_SET_SPEC`` and the empty setSpec
        are never part of the result.
    """
    res = run_sql("SELECT setSpec, setName, setDescription FROM oaiREPOSITORY")
    ret = {}
    for row in res:
        ret[row[0]] = row

    ## Let's expand with all the set that exist in the DB
    for a_set in get_all_field_values(CFG_OAI_SET_FIELD):
        if a_set not in ret:
            ret[a_set] = (a_set, a_set, '')

    ## Let's expand with all the supersets.
    ## Iterate over a snapshot of the keys: new entries are added to ret
    ## inside the loop, which is an error when iterating a live key view.
    for a_set in list(ret):
        while ':' in a_set:
            a_set = a_set.rsplit(":", 1)[0]
            if a_set not in ret:
                ret[a_set] = (a_set, a_set, '')

    ## Let's remove the special global set
    ret.pop(CFG_OAI_REPOSITORY_GLOBAL_SET_SPEC, None)

    ## '' is not a valid setSpec but might be in the MARC
    ret.pop('', None)

    return ret
    def get_citations(self):
        """Find all the citations of records in this repository."""

        citations_list = []
        citations = ''
        for ref in get_all_field_values('999C5a'):
            if self.regex_base.match(ref):
                search = '999C5a:' + ref
                cites = perform_request_search(p=search, cc='HEP')
                if len(cites):
                    if not self.regex.match(ref):
                        print 'Problem with DOI extraction:', search, cites
                        continue
                    try:
                        metadata = self.get_ref_metadata_inspire(
                            ref, self.dois)
                        if not metadata:
                            metadata = self.get_ref_metadata_repository(ref)
                    except ValueError:
                        print 'Problem with DOI:', search, cites, '\n'
                        continue
                    citations_list.append((len(cites), ref, cites, metadata))
        for doi in sorted(citations_list, reverse=True):
            doi_url = 'https://doi.org/' + doi[1].replace('doi:', '')
            citations += \
'''{0} citations to {3}
    {2}
    https://inspirehep.net/search?p=999C5a:{1}

'''.format(doi[0], doi[1], doi[3], doi_url)
        return citations
Ejemplo n.º 5
0
def get_all_sets():
    """
    Return all the sets.

    The result starts from the rows of the ``oaiREPOSITORY`` table, is
    extended with every value found in ``CFG_OAI_SET_FIELD``, and finally
    with every superset of a colon-separated setSpec (``a:b:c`` also
    yields ``a:b`` and ``a``).  Sets that do not come from the table use
    their setSpec as name and an empty description.

    @return: dictionary mapping each setSpec to a
        ``(setSpec, setName, setDescription)`` row.  The special global
        set ``CFG_OAI_REPOSITORY_GLOBAL_SET_SPEC`` and the empty setSpec
        are never part of the result.
    """
    res = run_sql("SELECT setSpec, setName, setDescription FROM oaiREPOSITORY")
    ret = {}
    for row in res:
        ret[row[0]] = row

    ## Let's expand with all the set that exist in the DB
    for a_set in get_all_field_values(CFG_OAI_SET_FIELD):
        if a_set not in ret:
            ret[a_set] = (a_set, a_set, '')

    ## Let's expand with all the supersets.
    ## Iterate over a snapshot of the keys: new entries are added to ret
    ## inside the loop, which is an error when iterating a live key view.
    for a_set in list(ret):
        while ':' in a_set:
            a_set = a_set.rsplit(":", 1)[0]
            if a_set not in ret:
                ret[a_set] = (a_set, a_set, '')

    ## Let's remove the special global set
    ret.pop(CFG_OAI_REPOSITORY_GLOBAL_SET_SPEC, None)

    ## '' is not a valid setSpec but might be in the MARC
    ret.pop('', None)

    return ret
 def get_inspire_dois(cls, regex):
     """Return the set of 0247_a values, prefixed with 'doi:', that match regex.

     The prefix is added before matching, so ``regex`` is applied to
     strings of the form 'doi:<value>'.
     """
     prefixed = ('doi:' + value for value in get_all_field_values('0247_a'))
     return set(candidate for candidate in prefixed if regex.match(candidate))
Ejemplo n.º 7
0
def new_orcids(already_seen):
    """Search for new ORCIDs in HEP."""

    orcid_counter = 0
    fields = ('100__j', '700__j', '100__k', '700__k')
    for field in fields:
        for orcid in get_all_field_values(field):
            if not re.search('00-000', orcid):
                continue
            if not orcid.startswith('ORCID:'):
                search = field + ':' + orcid
                recid = perform_request_search(p=search, cc='HEP')
                if len(recid):
                    print 'Needs to start with "ORCID:"', recid, orcid
                continue
            orcid = orcid.replace('ORCID:', '')
            if bad_id_check(orcid):
                search = field + ':' + orcid
                recid = perform_request_search(p=search, cc='HEP')
                if len(recid):
                    print 'Bad ORCID', recid, orcid

    search = "{0}:ORCID:* or {1}:ORCID:* or {2}:ORCID:* \
              or {3}:ORCID:* 980:core".format(fields[0], fields[1], fields[2],
                                              fields[3])
    search = "{0}:ORCID:* or {1}:ORCID:* or {2}:ORCID:* \
              or {3}:ORCID:* (037__c:hep-* or 037__c:nucl-* \
              or 037__c:math*)".format(fields[0], fields[1], fields[2],
                                       fields[3])
    search = "{0}:ORCID:* or {1}:ORCID:* or {2}:ORCID:* \
              or {3}:ORCID:* 037__c:hep-th".format(fields[0], fields[1],
                                                   fields[2], fields[3])

    result = perform_request_search(p=search, cc='HEP')
    #search = "{0}:ORCID:* or {1}:ORCID:* or {2}:ORCID:* \
    #          or {3}:ORCID:*".format(fields[0],
    #              fields[1], fields[2], fields[3])
    #result = perform_request_search(p=search, cc='Fermilab')

    for recid, field in [(recid, field) for recid in result \
                                        for field in fields]:
        for orcid in get_fieldvalues(recid, field):
            if not orcid.startswith('ORCID:'):
                continue
            orcid = orcid.strip('ORCID:')
            if orcid in already_seen:
                continue
            if bad_id_check(orcid):
                print "Bad ORCID in HEP:", orcid
            already_seen[orcid] = recid
            print "http://inspirehep.net/record/{0}\thttp://orcid.org/{1}".\
                  format(str(recid), orcid)
            orcid_counter += 1
            #search = '001:' + str(recid) + ' 980:CORE'
            #if perform_request_search(p=search, cc='HEP'):
            #  print "http://inspirehep.net/record/{0}\thttp://orcid.org/{1}".\
            #      format(str(recid), orcid)
    print 'New orcids:', orcid_counter
def bad_experiments_affilations():
    """Check to see bad metadata."""

    metadata = {'Institutions':{'HepNames':'371__a', 'Truth':'110__u'},
                'Experiments':{'HepNames':'693__e', 'Truth':'119__a'}}

    for aff in metadata:
        for value in numpy.setdiff1d(get_all_field_values(
                                     metadata[aff]['HepNames']),
                                     get_all_field_values(
                                     metadata[aff]['Truth'])):
            search = metadata[aff]['HepNames'] + ':"' + value + '"'
            result = perform_request_search(p=search, cc='HepNames')
            result = intbitset(result) & RECIDS_HEPN
            if len(result) > 0:
                if perform_request_search(
                p=metadata[aff]['Truth'] + ':"' + value + '"', cc=aff) == []:
                    print search, result
Ejemplo n.º 9
0
def bad_url_z():
    """Check to make sure $$z field is correct."""

    field = '8564_z'
    good_values = set(['postprint', 'openaccess'])
    for value in get_all_field_values(field):
        search = field + ':' + value
        result = perform_request_search(p=search, cc='Fermilab')
        if len(result) == 0:
            continue
        if value in good_values:
            print value, len(result)
        else:
            print value, result
def bad_url_z():
    """Check to make sure $$z field is correct."""

    field = '8564_z'
    good_values = set(['postprint', 'openaccess'])
    for value in get_all_field_values(field):
        search = field + ':' + value
        result = perform_request_search(p=search, cc='Fermilab')
        if len(result) == 0:
            continue
        if value in good_values:
            print value, len(result)
        else:
            print value, result
Ejemplo n.º 11
0
def get_jacow_dois():
    """Return all the JACoW DOIs INSPIRE has."""

    jacow_dois_record = set()
    for doi in get_all_field_values('0247_a'):
        if doi.startswith('10.18429/JACoW-'):
            jacow_dois_record.add('doi:' + doi)

    jacow_dois_ref = set()
    for doi in get_all_field_values('999C5a'):
        if doi.startswith('doi:10.18429/JACoW-'):
            jacow_dois_ref.add(doi)
    missing_dois = jacow_dois_ref - jacow_dois_record
    if not missing_dois:
        return jacow_dois_record

    for doi in sorted(missing_dois):
        if good_doi(doi):
            search_unit('doi', f='0247_2', m='a')
            doi = doi.replace('doi:', '')
            if search_unit(doi, f='0247_2', m='a'):
                continue
            print 'https://doi.org/{0}'.format(doi)
    sys.exit()
def new_orcids(already_seen):
    """Search for new ORCIDs in HEP."""

    orcid_counter = 0
    fields = ('100__j', '700__j', '100__k', '700__k')
    for field in fields:
        for orcid in get_all_field_values(field):
            if not re.search('00-000', orcid):
                continue
            if not orcid.startswith('ORCID:'):
                search = field + ':' + orcid
                recid = perform_request_search(p=search, cc='HEP')
                if len(recid):
                    print 'Needs to start with "ORCID:"', recid, orcid
                continue
            orcid = orcid.replace('ORCID:', '')
            if bad_id_check(orcid):
                search = field + ':' + orcid
                recid = perform_request_search(p=search, cc='HEP')
                if len(recid):
                    print 'Bad ORCID', recid, orcid


    search = "{0}:ORCID:* or {1}:ORCID:* or {2}:ORCID:* \
              or {3}:ORCID:* 980:CORE".format(fields[0],
                  fields[1], fields[2], fields[3])
    result = perform_request_search(p=search, cc='HEP')
    for recid, field in [(recid, field) for recid in result \
                                        for field in fields]:
        for orcid in get_fieldvalues(recid, field):
            if not orcid.startswith('ORCID:'):
                continue
            orcid = orcid.strip('ORCID:')
            if orcid in already_seen:
                continue
            if bad_id_check(orcid):
                print "Bad ORCID in HEP:", orcid
            already_seen[orcid] = recid
            print "http://inspirehep.net/record/{0}\thttp://orcid.org/{1}".\
                  format(str(recid), orcid)
            orcid_counter += 1
            #search = '001:' + str(recid) + ' 980:CORE'
            #if perform_request_search(p=search, cc='HEP'):
            #  print "http://inspirehep.net/record/{0}\thttp://orcid.org/{1}".\
            #      format(str(recid), orcid)
    print 'New orcids:', orcid_counter
import re
from sys import argv

from invenio.search_engine import perform_request_search, get_record, \
                                  search_unit, get_all_field_values
from invenio.bibrecord import print_rec, record_get_field_instances, \
                              record_add_field
from invenio.intbitset import intbitset
from invenio.bibformat_engine import BibFormatObject
from invenio.search_engine import get_collection_reclist

from hep_convert_email_to_id_input import RECIDS, SEARCH, VERBOSE

# Record list of the 'HepNames' collection.
HN = get_collection_reclist('HepNames')

# All values of fields 371__m, 371__o, 595__m and 595__o, concatenated in
# that order.
EMAILS_HEPNAMES = (get_all_field_values('371__m') +
                   get_all_field_values('371__o') +
                   get_all_field_values('595__m') +
                   get_all_field_values('595__o'))

# All values of fields 100__m and 700__m.
EMAILS_HEP = (get_all_field_values('100__m') +
              get_all_field_values('700__m'))

COUNTER_MAX = 400

def generate_check_digit(base_digits):
    '''
    Taken from https://github.com/tjwds/generate-orcid-checksum

    Folds the decimal digits of ``base_digits`` into a running total
    (add the digit, then double) and reduces that total modulo 11.

    NOTE(review): the visible body stops after computing ``remainder``
    and contains no return statement; the remainder of the function
    appears to be missing from this excerpt.
    '''
    total = 0
    # str() lets the caller pass either an integer or a string of digits;
    # int(digit) raises ValueError on any non-digit character.
    for digit in str(base_digits):
        total = (total + int(digit)) * 2
    remainder = total % 11
Ejemplo n.º 14
0
from os.path import exists
import re
import random

#from hep_ads_xml_dois import DOIS

#DIRECTORY = '/afs/cern.ch/project/inspire/TEST/hoc/'

# Case-insensitive pattern: four digits, then letters/'&', then digits/dots,
# then letters/dots/digits up to the end of the string.
ADS_REGEX = re.compile(r"^\d{4}([a-z&]+)[\d.]+[a-z.\d]+$", 
                       re.IGNORECASE)
# Pattern for identifiers of the form <archive>/<seven digits>, where the
# archive part may contain a hyphen and an upper-case subject suffix.
ARXIV_REGEX = re.compile(r"^[a-z]+\-?[a-z]+\.?[A-Z]*/\d{7}$")

# Sets filled at import time by the loops below.
INSPIRE_EPRINTS = set()
INSPIRE_ADS_BIBCODE = set()
INSPIRE_DOIS = set()
# All values of fields 035__a, 035__z and 037__a, concatenated in that order.
EPRINT_UNION = get_all_field_values('035__a') + \
               get_all_field_values('035__z') + \
               get_all_field_values('037__a')
for item in EPRINT_UNION:
    # Store eprint identifiers without their 'oai:arXiv.org:' / 'arXiv:'
    # prefix; identifiers matching ARXIV_REGEX are stored unchanged.
    if item.startswith('oai:arXiv.org:'):
        INSPIRE_EPRINTS.add(item.replace('oai:arXiv.org:', ''))
    elif item.startswith('arXiv:'):
        INSPIRE_EPRINTS.add(item.replace('arXiv:', ''))
    elif re.match(ARXIV_REGEX, item):
        INSPIRE_EPRINTS.add(item)
    elif len(item) != 19:
        # Neither an eprint nor 19 characters long: skip the ADS check.
        continue
    # Reached by the eprint branches above as well as by 19-character items.
    if re.match(ADS_REGEX, item):
        INSPIRE_ADS_BIBCODE.add(item)
for item in get_all_field_values('0247_a'):
    if item.startswith('10.'):
Ejemplo n.º 15
0
import re

COUNTER_MAX = 100
# Case-insensitive prefixes identifying the report strings of interest.
FERMILAB = re.compile('^FERMILAB.*', re.I)
FNAL = re.compile('^FNAL.*', re.I)
# Strings starting with FERMILAB and ending in THY or EXP; these are
# excluded from FERMILAB_REPORTS below.
ARCHAIC = re.compile('^FERMILAB.*(THY|EXP)$', re.I)

# Report-number layouts (upper-case only):
#   YY_FORM    WORD-WORD-dd-ddd...
#   YYYY_FORM  WORD-WORD-dddd-dd...
#   DDDD_FORM  WORD-(TM|FN|PROPOSAL)-dddd...
#   EXCEPTION  FERMILAB-THESIS-2005..2008-ddd...
YY_FORM = re.compile(r'^[A-Z]+-[A-Z]+-\d\d-\d\d\d.*')
YYYY_FORM = re.compile(r'^[A-Z]+-[A-Z]+-\d\d\d\d-\d\d.*')
DDDD_FORM = re.compile(r'^[A-Z]+-(TM|FN|PROPOSAL)-\d\d\d\d.*')
EXCEPTION = re.compile(r'^FERMILAB-THESIS-200[5-8]-\d\d\d.*')

# Field tags read below; presumably the cited report number (REF) and the
# record's own report number (REP) -- TODO confirm.
REF = '999C5r'
REP = '037__a'

# Upper-cased REF values that start with FERMILAB or FNAL.
REPORT_CITATIONS = get_all_field_values(REF)
#REPORT_CITATIONS = ['FERMILAB-PUB-96-357-']
FERMILAB_CITATIONS = set([
    report.upper() for report in REPORT_CITATIONS
    if FERMILAB.match(report) or FNAL.match(report)
])

# Upper-cased REP values that start with FERMILAB or FNAL and do not match
# ARCHAIC.
REPORT_NUMBERS = get_all_field_values(REP)
#REPORT_NUMBERS = ['FERMILAB-PUB-96-357-E']
FERMILAB_REPORTS = set([
    report.upper() for report in REPORT_NUMBERS
    if (FERMILAB.match(report) or FNAL.match(report))
    and not ARCHAIC.match(report)
])

Ejemplo n.º 16
0
import re
from sys import argv

from invenio.search_engine import perform_request_search, get_record, \
                                  search_unit, get_all_field_values
from invenio.bibrecord import print_rec, record_get_field_instances, \
                              record_add_field, record_get_field_value
from invenio.intbitset import intbitset
from invenio.bibformat_engine import BibFormatObject
from invenio.search_engine import get_collection_reclist

from hep_convert_email_to_id_input import RECIDS, SEARCH, VERBOSE

# Record list of the 'HepNames' collection.
HN = get_collection_reclist('HepNames')

# All values of fields 371__m, 371__o, 595__m and 595__o, concatenated in
# that order.
EMAILS_HEPNAMES = (get_all_field_values('371__m') +
                   get_all_field_values('371__o') +
                   get_all_field_values('595__m') +
                   get_all_field_values('595__o'))

# All values of fields 100__m and 700__m.
EMAILS_HEP = (get_all_field_values('100__m') +
              get_all_field_values('700__m'))

COUNTER_MAX = 400


def generate_check_digit(base_digits):
    '''
    Taken from https://github.com/tjwds/generate-orcid-checksum
    '''
    total = 0
    for digit in str(base_digits):
        total = (total + int(digit)) * 2