Code example #1
def start_bibupload_job(id_pairs):
    """ Submits the append job to bibupload
    id_pairs - {local_recid: remote_recid} """
    bibupload = ChunkedBibUpload(mode='a', user=SCRIPT_NAME, notimechange=True)
    for local, remote in id_pairs.iteritems():
        bibupload.add(generate_marc_to_append(local, remote))
    bibupload.cleanup()  # This initiates the job
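A minimal usage sketch for the helper above; the record IDs are hypothetical, and SCRIPT_NAME and generate_marc_to_append are assumed to be defined elsewhere in the same module:

# Hypothetical mapping of local record IDs to the remote record IDs to append.
id_pairs = {11111: 90210, 22222: 90211}
start_bibupload_job(id_pairs)  # queues one MARCXML append per pair and submits the job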
Code example #2
def apply_hepnames_updates(hepname_updates):
    bibupload = ChunkedBibUpload(mode='a', user='******')
    for recid, entry in hepname_updates.iteritems():
        record = {}
        record_add_field(record, '001', controlfield_value=str(recid))
        for key, value in entry.iteritems():
            if key in ('ORCID', 'ORIGINAL_BAI', 'INSPIRE', 'KAKEN'):
                if key == 'ORIGINAL_BAI':
                    key = 'BAI'
                record_add_field(record, '035', subfields=[('a', value), ('9', key)])
        write_message(record_xml_output(record))
        bibupload.add(record_xml_output(record))
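A sketch of the input this function expects, inferred from the keys it handles; the record ID and identifier values below are hypothetical:

# Hypothetical payload: external identifiers keyed by HepNames record ID.
hepname_updates = {
    12345: {
        'ORCID': '0000-0002-1825-0097',
        'ORIGINAL_BAI': 'J.Doe.1',  # written to 035__9 under the label 'BAI'
        'KAKEN': '70123456',
    },
}
apply_hepnames_updates(hepname_updates)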
Code example #3
def bst_hepnames_orcid_sync():
    bai_orcids = run_sql("SELECT bai.data, orcid.data FROM aidPERSONIDDATA as bai JOIN aidPERSONIDDATA as orcid ON bai.personid=orcid.personid WHERE orcid.tag='extid:ORCID' AND bai.tag='canonical_name'")

    recs = []

    not_matched_profiles = 0
    enhanced_records = 0
    conflicting_orcids = 0

    for bai, orcid in bai_orcids:
        recids = perform_request_search(p="035:%s" % bai, cc="HepNames")
        if len(recids) > 1:
            write_message("WARNING: %s/author/profile/%s, %s matches more than one HepNames: %s" % (CFG_SITE_URL, bai, orcid, recids), stream=sys.stderr)
            not_matched_profiles += 1
        elif not recids:
            write_message("WARNING: %s/author/profile/%s, %s does not match any HepName" % (CFG_SITE_URL, bai, orcid), stream=sys.stderr)
            not_matched_profiles += 1
        else:
            recid = recids[0]
            record = get_record(recid)
            for field in record_get_field_instances(record, tag="035"):
                subfields = field_get_subfield_instances(field)
                subfields_dict = dict(subfields)
                if subfields_dict.get('9') == 'ORCID':
                    if subfields_dict.get('a') != orcid:
                        if not subfields_dict.get('a', '').strip():
                            write_message("WARNING: record %s/record/%s has an empty ORCID" % (CFG_SITE_URL, recid), stream=sys.stderr)
                            continue
                        write_message("WARNING: record %s/record/%s matched by BAI %s/author/profile/%s has a different ORCID %s than the profile one: %s" % (CFG_SITE_URL, recid, CFG_SITE_URL, bai, subfields_dict.get('a'), orcid), stream=sys.stderr)
                        conflicting_orcids += 1
                    break
            else:
                new_record = {}
                record_add_field(new_record, tag="001", controlfield_value=str(recid))
                record_add_field(new_record, tag="035", subfields=[('a', orcid), ('9', 'ORCID')])
                recs.append(new_record)
                write_message("INFO: adding ORCID %s to record %s/record/%s matched by BAI %s/author/profile/%s" % (orcid, CFG_SITE_URL, recid, CFG_SITE_URL, bai))
                enhanced_records += 1
    if recs:
        write_message("INFO: initiating uploads")
        bibupload = ChunkedBibUpload(mode="a", user='******')
        for record in recs:
            bibupload.add(record_xml_output(record))
        bibupload.cleanup()
    else:
        write_message("INFO: no modification are necessary")
    write_message("INFO: not_matched_profiles: %s, enhanced_records: %s, conflicting_orcids: %s" % (not_matched_profiles, enhanced_records, conflicting_orcids))
Code example #4
File: bst_hepdata.py  Project: turtle321/inspire
def bst_hepdata():
    uploader = ChunkedHepDataUpload()
    dumper = HepDataDumper()
    for record in dumper:
        marcxml_record = hepdata2marcxml(record)
        uploader.add(marcxml_record)
    inspire_ids = dumper.inspire_ids
    current_inspire_ids = intbitset(perform_request_search(p='035__9:HEPDATA'))
    records_to_amend = inspire_ids - current_inspire_ids
    id_appender = ChunkedBibUpload(mode='a', user='******')
    for recid in records_to_amend:
        rec = {}
        record_add_field(rec, tag="001", controlfield_value=str(recid))
        record_add_field(rec,
                         tag="035",
                         subfields=[('a', 'ins%s' % recid), ('9', 'HEPDATA')])
        id_appender.add(record_xml_output(rec))
Code example #5
File: bst_hal.py  Project: rzarref/inspire_inspirehep
def bst_hal():
    doi_map, arxiv_map = get_hal_maps()
    matchable_records = get_record_ids_to_export()
    write_message("Total matchable records: %s" % len(matchable_records))
    hal_records = get_hal_records()
    write_message("Already matched records: %s" % len(hal_records))
    bibupload = ChunkedBibUpload(mode='a', notimechange=True, user='******')
    tot_records = matchable_records - hal_records
    write_message("Records to be checked: %s" % len(tot_records))
    for i, recid in enumerate(tot_records):
        if i % 1000 == 0:
            write_message("%s records done out of %s" % (i, len(tot_records)))
            task_sleep_now_if_required()
        dois = get_fieldvalues(recid, tag='0247__a', sort=False)
        arxivs = get_fieldvalues(recid, tag='037__a', sort=False)
        matched_hal = [doi_map[doi] for doi in dois if doi in doi_map]
        matched_hal += [
            arxiv_map[arxiv] for arxiv in arxivs if arxiv in arxiv_map
        ]

        # Let's assert that we matched only one single hal document at most
        matched_hal_id = set(id(entry) for entry in matched_hal)
        if len(matched_hal) > 1:
            write_message(
                "WARNING: record %s matches more than 1 HAL record: %s" %
                (recid, matched_hal),
                stream=sys.stderr)
            continue
        elif not matched_hal:
            continue
        hal_id = matched_hal[0]['halId_s']

        rec = {}
        record_add_field(rec, '001', controlfield_value=str(recid))
        record_add_field(rec, '035', subfields=[('a', hal_id), ('9', 'HAL')])

        write_message("Record %s matched HAL record %s" % (recid, hal_id))

        bibupload.add(record_xml_output(rec))

    return True
Code example #6
File: bst_labssync.py  Project: turtle321/inspire
def bst_labssync():
    """
    Synchronizes from Labs via redis.

    """
    r = redis.StrictRedis.from_url(CFG_REDIS_HOST_LABS)
    user_agent = make_user_agent_string('labssync')
    s = requests.Session()
    s.headers['User-Agent'] = user_agent
    s.headers['Accept'] = 'application/marcxml+xml'

    tot = r.scard(CFG_REDIS_KEY)
    if tot == 0:
        write_message("Nothing to do")
        return
    else:
        write_message("At least %s records to synchronize from labs" % tot)

    errors = []
    final_total = 0
    uploader = ChunkedBibUpload(mode='r', user='******')
    while True:
        elem = r.spop(CFG_REDIS_KEY)
        if not elem:
            break
        final_total += 1
        try:
            record = s.get("https://%s/api/%s" % (CFG_LABS_HOSTNAME, elem)).text

            # Let's strip collection/XML header
            record = record_xml_output(create_record(record)[0])
            uploader.add(record)
            task_sleep_now_if_required()
        except Exception as err:
            register_exception()
            write_message("ERROR: when retrieving %s: %s" % (elem, err), stream=sys.stderr)
            errors.append(elem)

    write_message("Finally synced %s records from labs" % final_total)
    if errors:
        write_message("All those %s records had errors and might need to be resynced: %s" % (len(errors), ', '.join(errors)))
Code example #7
File: bst_hal.py  Project: kaplun/inspire
def bst_hal():
    doi_map, arxiv_map = get_hal_maps()
    matchable_records = get_record_ids_to_export()
    write_message("Total matchable records: %s" % len(matchable_records))
    hal_records = get_hal_records()
    write_message("Already matched records: %s" % len(hal_records))
    bibupload = ChunkedBibUpload(mode='a', notimechange=True, user='******')
    tot_records = matchable_records - hal_records
    write_message("Records to be checked: %s" % len(tot_records))
    for i, recid in enumerate(tot_records):
        if i % 1000 == 0:
            write_message("%s records done out of %s" % (i, len(tot_records)))
            task_sleep_now_if_required()
        dois = get_fieldvalues(recid, tag='0247__a', sort=False)
        arxivs = get_fieldvalues(recid, tag='037__a', sort=False)
        matched_hal = [doi_map[doi] for doi in dois if doi in doi_map]
        matched_hal += [arxiv_map[arxiv] for arxiv in arxivs if arxiv in arxiv_map]

        # Let's assert that we matched only one single hal document at most
        matched_hal_id = set(id(entry) for entry in matched_hal)
        if len(matched_hal) > 1:
            write_message("WARNING: record %s matches more than 1 HAL record: %s" % (recid, matched_hal), stream=sys.stderr)
            continue
        elif not matched_hal:
            continue
        hal_id = matched_hal[0]['halId_s']

        rec = {}
        record_add_field(rec, '001', controlfield_value=str(recid))
        record_add_field(rec, '035', subfields=[('a', hal_id), ('9', 'HAL')])

        write_message("Record %s matched HAL record %s" % (recid, hal_id))

        bibupload.add(record_xml_output(rec))

    return True
Code example #8
File: bst_hal.py  Project: turtle321/inspire
def bst_hal():
    doi_map, arxiv_map, recid_map = get_hal_maps()
    matchable_records = get_record_ids_to_export()
    write_message("Total matchable records: %s" % len(matchable_records))
    hal_records = get_hal_records()
    write_message("Already matched records: %s" % len(hal_records))
    new_inspire_ids = intbitset(recid_map.keys()) - hal_records
    write_message("New records pushed from Inspire: %s" % len(new_inspire_ids))
    bibupload = ChunkedBibUpload(mode='a', notimechange=True, user='******')
    for recid in new_inspire_ids:
        hal_id = recid_map[recid]['halId_s']
        update_record(recid, hal_id, bibupload)
    write_message('Added HAL ids to all records pushed from Inspire')
    task_sleep_now_if_required()
    tot_records = matchable_records - hal_records - new_inspire_ids
    write_message("Additional records to be checked: %s" % len(tot_records))
    for i, recid in enumerate(tot_records):
        if i % 1000 == 0:
            write_message("%s records done out of %s" % (i, len(tot_records)))
            task_sleep_now_if_required()
        dois = get_fieldvalues(recid, tag='0247__a', sort=False)
        arxivs = get_fieldvalues(recid, tag='037__a', sort=False)
        matched_hal = [doi_map[doi] for doi in dois if doi in doi_map]
        matched_hal += [arxiv_map[arxiv] for arxiv in arxivs if arxiv in arxiv_map]

        # Let's assert that we matched only one single hal document at most
        matched_hal_id = set(id(entry) for entry in matched_hal)
        if len(matched_hal_id) > 1:
            write_message("WARNING: record %s matches more than 1 HAL record: %s" % (recid, matched_hal), stream=sys.stderr)
            continue
        elif not matched_hal:
            continue
        hal_id = matched_hal[0]['halId_s']

        update_record(recid, hal_id, bibupload)

    return True
Code example #9
#!/usr/bin/python
# -*- coding: utf-8 -*-


""" Deletes records from a local Invenio instance """


from invenio.search_engine import get_record
from invenio.bibtaskutils import ChunkedBibUpload
from invenio.bibrecord import record_xml_output


bibupload = ChunkedBibUpload(mode='d', user='******', notimechange=True)


print "Invenio record deleter!"
print "Enter range of record IDs to be deleted:"
print "Start: "
range_start = int(raw_input())
print "End: "
range_end = int(raw_input()) + 1

print " ========== Let's do this! =========="

for recid in range(range_start, range_end):
    record = get_record(recid)
    if record:
        # Keep only the 001 controlfield identifying the record to delete.
        marc = record_xml_output(record, tags=['001'])
        bibupload.add(marc)
        print "%s: Got it!" % str(recid)
    else:
        print "%s: No such record, skipping." % str(recid)
Code example #10
def bst_arxiv_doi_update(input_uri=None,
                         log_dir=CFG_TMPSHAREDDIR,
                         logging=True,
                         asana_key=CFG_ASANA_API_KEY,
                         asana_parent_id=ASANA_PARENT_TASK_ID,
                         skip_result_types='missing'):
    """Update DOIs on documents harvested from ArXiv.

    Parameters:
    :param input_uri: Link to new URI data
        DEFAULT: https://vendor.ridge.aps.org/arXiv/latest_pub.xml
        NOTE: Test data can be taken from http://arxiv.org/schemas/doi_feed_test.xml
    :param log_dir: Directory to store log files in
    :param logging: True or False, default True
    :param asana_key: The Asana API key; by default uses the value of CFG_ASANA_API_KEY
        NOTE: Passing the value of None for this parameter will skip writing
        to Asana and instead email the instance admin
    :param asana_parent_id: The taskID of the task in Asana to log subtasks to
    :param skip_result_types: Error messages to skip during reporting,
        given as comma-separated values (CSV)
        Possible values: missing, ambiguous, incorrect
    """
    skip_results = verify_skip_results(skip_result_types)

    if input_uri is None:
        _print("Notice: No URI specified, defaulting to " + URI_DEFAULT)
        input_uri = URI_DEFAULT

    task_update_progress("Resolving URI: %s" % (input_uri,))

    # Testing builds characters
    bibupload = ChunkedBibUpload(mode='a', user=SCRIPT_NAME, notimechange=False)

    # open url and parse xml
    try:
        tree = ET.parse(urllib.urlopen(input_uri))
        _print('Opened DOI file ' + input_uri)
    except IOError:
        _print("FATAL ERROR: Could not open URL: " + input_uri, 1)
        task_update_progress("Failed retrieving DOI data")
        return False
    except ExpatError:
        _print("FATAL ERROR: Could not parse XML from: " + input_uri, 1)
        task_update_progress("Failed parsing DOI data")
        return False

    root = tree.getroot()

    try:
        date_el = root.find('date')
        date_str = '%s-%s-%s' % (date_el.get('year'), date_el.get('month'),
                                 date_el.get('day'))
        _print("Processing DOIs last updated on date %s" % date_str)
    except AttributeError:
        _print("Warning: Couldn't get last published date of Arxiv DOI feed.")

    doi_count = 0
    new_count = 0

    # Stores any DOIs we have issues with, in the structure:
    # Missing: (doi, arxiv preprint_id, published date)
    # Ambiguous: (doi, arxiv preprint_id, rec_ids)
    # Incorrect: (rec_id, old-doi, new-doi)
    problem_dois = {'missing': [], 'ambiguous': [], 'incorrect': []}

    task_update_progress("Processing records...")
    # NB: Element.getiterator() is deprecated since version 2.7: Use
    # method Element.iter() instead.
    for item in root.getiterator('article'):
        doi_count += 1
        doi = item.get('doi')
        arxiv = item.get('preprint_id')
        published_date = item.get('published')
        _print("XML entry #%s: %s" % (str(doi_count), arxiv), 6)
        rec_id = get_record_by_arxiv_id(arxiv)
        if len(rec_id) == 1:
            rec_id = rec_id[0]
            try:
                record_xml = append_to_record(rec_id, doi, published_date)
            except DOIError as ex:
                problem_dois['incorrect'].append((rec_id, ex.message, doi))
                continue
            if record_xml:
                new_count += 1
                _print("* Now we will run the bibupload for " +
                       "%s record" % rec_id, 5)
                _print("** We will upload the following xml code %s" %
                       repr(record_xml), 9)
                bibupload.add(record_xml)
        elif len(rec_id) > 1:
            _print('ERROR: %d records found with matching arXiv ID %s' %
                   (len(rec_id), arxiv))
            problem_dois['ambiguous'].append((doi, arxiv, repr(rec_id)))
        else:
            _print('No record found matching arxiv ID: %s' % arxiv, 9)
            problem_dois['missing'].append((doi, arxiv, published_date))

    _print("========================| FINAL SCORE |=======================", 1)
    _print("DOIs found and processed: %d" % doi_count, 1)
    _print("Arxiv IDs without corresponding records: %d"
           % len(problem_dois['missing']), 1)
    _print("Arxiv IDs corresponding to multiple records (duplicates): %d"
           % len(problem_dois['ambiguous']), 1)
    _print("Inspire records with an incorrect DOI: %d"
           % len(problem_dois['incorrect']), 1)
    _print("Records without DOIs requiring appends: %d" % new_count, 1)
    _print("==============================================================", 1)

    bibupload.cleanup()

    notify_on_errors(problem_dois, log_dir, doi_count, new_count,
                     asana_key, asana_parent_id, skip_results)

    return True
Code example #11
def bst_arxiv_doi_update(input_uri=None,
                         log_dir=CFG_TMPSHAREDDIR,
                         logging=True,
                         asana_key=CFG_ASANA_API_KEY,
                         asana_parent_id=ASANA_PARENT_TASK_ID,
                         skip_result_types='missing'):
    """Update DOIs on documents harvested from ArXiv.

    Parameters:
    :param input_uri: Link to new URI data
        DEFAULT: https://vendor.ridge.aps.org/arXiv/latest_pub.xml
        NOTE: Test data can be taken from http://arxiv.org/schemas/doi_feed_test.xml
    :param log_dir: Directory to store log files in
    :param logging: True or False, default True
    :param asana_key: The Asana API key; by default uses the value of CFG_ASANA_API_KEY
        NOTE: Passing the value of None for this parameter will skip writing
        to Asana and instead email the instance admin
    :param asana_parent_id: The taskID of the task in Asana to log subtasks to
    :param skip_result_types: Error messages to skip during reporting,
        given as comma-separated values (CSV)
        Possible values: missing, ambiguous, incorrect
    """
    skip_results = verify_skip_results(skip_result_types)

    if input_uri is None:
        _print("Notice: No URI specified, defaulting to " + URI_DEFAULT)
        input_uri = URI_DEFAULT

    task_update_progress("Resolving URI: %s" % (input_uri, ))

    # Testing builds characters
    bibupload = ChunkedBibUpload(mode='a',
                                 user=SCRIPT_NAME,
                                 notimechange=False)

    # open url and parse xml
    try:
        tree = ET.parse(urllib.urlopen(input_uri))
        _print('Opened DOI file ' + input_uri)
    except IOError:
        _print("FATAL ERROR: Could not open URL: " + input_uri, 1)
        task_update_progress("Failed retrieving DOI data")
        return False
    except ExpatError:
        _print("FATAL ERROR: Could not parse XML from: " + input_uri, 1)
        task_update_progress("Failed parsing DOI data")
        return False

    root = tree.getroot()

    try:
        date_el = root.find('date')
        date_str = '%s-%s-%s' % (date_el.get('year'), date_el.get('month'),
                                 date_el.get('day'))
        _print("Processing DOIs last updated on date %s" % date_str)
    except AttributeError:
        _print("Warning: Couldn't get last published date of Arxiv DOI feed.")

    doi_count = 0
    new_count = 0

    # Stores any DOIs we have issues with, in the structure:
    # Missing: (doi, arxiv preprint_id, published date)
    # Ambiguous: (doi, arxiv preprint_id, rec_ids)
    # Incorrect: (rec_id, old-doi, new-doi)
    problem_dois = {'missing': [], 'ambiguous': [], 'incorrect': []}

    task_update_progress("Processing records...")
    # NB: Element.getiterator() is deprecated since version 2.7: Use
    # method Element.iter() instead.
    for item in root.getiterator('article'):
        doi_count += 1
        doi = item.get('doi')
        arxiv = item.get('preprint_id')
        published_date = item.get('published')
        _print("XML entry #%s: %s" % (str(doi_count), arxiv), 6)
        rec_id = get_record_by_arxiv_id(arxiv)
        if len(rec_id) == 1:
            rec_id = rec_id[0]
            try:
                record_xml = append_to_record(rec_id, doi, published_date)
            except DOIError as ex:
                problem_dois['incorrect'].append((rec_id, ex.message, doi))
                continue
            if record_xml:
                new_count += 1
                _print(
                    "* Now we will run the bibupload for " +
                    "%s record" % rec_id, 5)
                _print(
                    "** We will upload the following xml code %s" %
                    repr(record_xml), 9)
                bibupload.add(record_xml)
        elif len(rec_id) > 1:
            _print('ERROR: %d records found with matching arXiv ID %s' %
                   (len(rec_id), arxiv))
            problem_dois['ambiguous'].append((doi, arxiv, repr(rec_id)))
        else:
            _print('No record found matching arxiv ID: %s' % arxiv, 9)
            problem_dois['missing'].append((doi, arxiv, published_date))

    _print("========================| FINAL SCORE |=======================", 1)
    _print("DOIs found and processed: %d" % doi_count, 1)
    _print(
        "Arxiv IDs without corresponding records: %d" %
        len(problem_dois['missing']), 1)
    _print(
        "Arxiv IDs corresponding to multiple records (duplicates): %d" %
        len(problem_dois['ambiguous']), 1)
    _print(
        "Inspire records with an incorrect DOI: %d" %
        len(problem_dois['incorrect']), 1)
    _print("Records without DOIs requiring appends: %d" % new_count, 1)
    _print("==============================================================", 1)

    bibupload.cleanup()

    notify_on_errors(problem_dois, log_dir, doi_count, new_count, asana_key,
                     asana_parent_id, skip_results)

    return True
Code example #12
def bst_arxiv_doi_update(input_uri=None,
                         log_dir=CFG_TMPSHAREDDIR, logging=True):
    """
    bst_arxiv_doi_update
    Updates DOIs on documents harvested from ArXiv.

    Parameters:
        * input_uri - Link to new URI data
            DEFAULT: https://vendor.ridge.aps.org/arXiv/latest_pub.xml
            NOTE: Test data can be taken from http://arxiv.org/schemas/doi_feed_test.xml
        * log_dir - Directory to store log files in
        * logging - True or False, default True
    """

    if input_uri is None:
        _print("Notice: No URI specified, defaulting to " + URI_DEFAULT)
        input_uri = URI_DEFAULT

    task_update_progress("Resolving URI...")

    # Testing builds characters
    bibupload = ChunkedBibUpload(mode='a', user=SCRIPT_NAME, notimechange=True)
    bibindex = ChunkedBibIndex(indexes='year,global,journal',
                               user=SCRIPT_NAME)
    # open url and parse xml
    try:
        tree = ET.parse(urllib.urlopen(input_uri))
        _print('Opened DOI file ' + input_uri)
    except IOError:
        _print("FATAL ERROR: Could not open URL: " + input_uri, 1)
        task_update_progress("Failed retreiving DOI data")
        task_update_status("FAILED")
        return False
    root = tree.getroot()

    doi_count = 0
    new_count = 0
    missing_count = 0

    task_update_progress("Processing records...")
    # NB: Element.getiterator() is deprecated since version 2.7: Use
    # method Element.iter() instead.
    for item in root.getiterator('article'):
        doi_count += 1
        doi = item.get('doi')
        arxiv = item.get('preprint_id')
        published_date = item.get('published')
        _print("XML entry #%s: %s" % (str(doi_count), arxiv), 6)
        rec_id = get_record_by_arxiv_id(arxiv)
        if len(rec_id) == 1:
            rec_id = rec_id[0]
            record_xml = append_to_record(rec_id, doi, published_date)
            if record_xml:
                new_count += 1
                _print("* Now we will run the bibupload and bibindex for " +
                       str(rec_id) + " record", 5)
                _print("** We will upload the following xml code " +
                       repr(record_xml), 9)
                bibupload.add(record_xml)
                bibindex.add(rec_id)
        elif len(rec_id) > 1:
            _print('ERROR: %d records found with matching arXiv ID %s' %
                   (len(rec_id), arxiv))
        else:
            missing_count += 1
            _print('No record found matching arxiv ID: ' + arxiv, 9)

    _print("======================== FINAL SCORE ========================", 1)
    _print("DOIs found and processed: " + str(doi_count), 1)
    _print("Arxiv IDs without corresponding records: " + str(missing_count), 1)
    _print("Records requiring appends: " + str(new_count), 1)

    if logging:
        task_update_progress("Logging...")
        write_list_to_file(log_dir, 'errors', ERRORS)
        write_list_to_file(log_dir, 'messages', MESSAGES)

    task_update_progress(SCRIPT_NAME + " finished. %s DOIs processed, %s to add"
                         % (str(doi_count), str(new_count)))
    task_update_status("DONE")

    bibupload.__del__()
    bibindex.__del__()

    return True
Code example #13
def bst_arxiv_doi_update(input_uri=None,
                         log_dir=CFG_TMPSHAREDDIR,
                         logging=True):
    """
    bst_arxiv_doi_update
    Updates DOIs on documents harvested from ArXiv.

    Parameters:
        * input_uri - Link to new URI data
            DEFAULT: https://vendor.ridge.aps.org/arXiv/latest_pub.xml
            NOTE: Test data can be taken from http://arxiv.org/schemas/doi_feed_test.xml
        * log_dir - Directory to store log files in
        * logging - True or False, default True
    """

    if input_uri is None:
        _print("Notice: No URI specified, defaulting to " + URI_DEFAULT)
        input_uri = URI_DEFAULT

    task_update_progress("Resolving URI...")

    # Testing builds characters
    bibupload = ChunkedBibUpload(mode='a', user=SCRIPT_NAME, notimechange=True)
    bibindex = ChunkedBibIndex(indexes='year,global,journal', user=SCRIPT_NAME)
    # open url and parse xml
    try:
        tree = ET.parse(urllib.urlopen(input_uri))
        _print('Opened DOI file ' + input_uri)
    except IOError:
        _print("FATAL ERROR: Could not open URL: " + input_uri, 1)
        task_update_progress("Failed retreiving DOI data")
        task_update_status("FAILED")
        return False
    root = tree.getroot()

    doi_count = 0
    new_count = 0
    missing_count = 0

    task_update_progress("Processing records...")
    # NB: Element.getiterator() is deprecated since version 2.7: Use
    # method Element.iter() instead.
    for item in root.getiterator('article'):
        doi_count += 1
        doi = item.get('doi')
        arxiv = item.get('preprint_id')
        published_date = item.get('published')
        _print("XML entry #%s: %s" % (str(doi_count), arxiv), 6)
        rec_id = get_record_by_arxiv_id(arxiv)
        if len(rec_id) == 1:
            rec_id = rec_id[0]
            record_xml = append_to_record(rec_id, doi, published_date)
            if record_xml:
                new_count += 1
                _print(
                    "* Now we will run the bibupload and bibindex for " +
                    str(rec_id) + " record", 5)
                _print(
                    "** We will upload the following xml code " +
                    repr(record_xml), 9)
                bibupload.add(record_xml)
                bibindex.add(rec_id)
        elif len(rec_id) > 1:
            _print('ERROR: %d records found with matching arXiv ID %s' %
                   (len(rec_id), arxiv))
        else:
            missing_count += 1
            _print('No record found matching arxiv ID: ' + arxiv, 9)

    _print("======================== FINAL SCORE ========================", 1)
    _print("DOIs found and processed: " + str(doi_count), 1)
    _print("Arxiv IDs without corresponding records: " + str(missing_count), 1)
    _print("Records requiring appends: " + str(new_count), 1)

    if logging:
        task_update_progress("Logging...")
        write_list_to_file(log_dir, 'errors', ERRORS)
        write_list_to_file(log_dir, 'messages', MESSAGES)

    task_update_progress(SCRIPT_NAME +
                         " finished. %s DOIs processed, %s to add" %
                         (str(doi_count), str(new_count)))
    task_update_status("DONE")

    bibupload.__del__()
    bibindex.__del__()

    return True