def bst_arxiv_doi_update(input_uri=None,
                         log_dir=CFG_TMPSHAREDDIR,
                         logging=True,
                         asana_key=CFG_ASANA_API_KEY,
                         asana_parent_id=ASANA_PARENT_TASK_ID,
                         skip_result_types='missing'):
    """
    bst_arxiv_doi_update
    Updates DOIs on documents harvested from ArXiv.

    Parameters:
    :param input_uri: Link to new URI data
        DEFAULT: https://vendor.ridge.aps.org/arXiv/latest_pub.xml
        NOTE: Test data can be taken from http://arxiv.org/schemas/doi_feed_test.xml
    :param log_dir: Directory to store log files in
    :param logging: True or False, default True
    :param asana_key: The Asana API key; by default uses the value of CFG_ASANA_API_KEY
        NOTE: Passing the value of None for this parameter will skip writing
        to Asana and instead email the instance admin
    :param asana_parent_id: The task ID of the task in Asana to log subtasks to
    :param skip_result_types: Error types to skip during reporting,
        given as comma-separated values (CSV)
        Possible values: missing, ambiguous, incorrect
    """
    skip_results = verify_skip_results(skip_result_types)

    if input_uri is None:
        _print("Notice: No URI specified, defaulting to " + URI_DEFAULT)
        input_uri = URI_DEFAULT

    task_update_progress("Resolving URI: %s" % (input_uri,))

    # Testing builds characters
    bibupload = ChunkedBibUpload(mode='a', user=SCRIPT_NAME, notimechange=True)
    bibindex = ChunkedBibIndex(indexes='year,global,journal',
                               user=SCRIPT_NAME)
    # open url and parse xml
    try:
        tree = ET.parse(urllib.urlopen(input_uri))
        _print('Opened DOI file ' + input_uri)
    except IOError:
        _print("FATAL ERROR: Could not open URL: " + input_uri, 1)
        task_update_progress("Failed retreiving DOI data")
        task_update_status("FAILED")
        return False
    except ExpatError:
        _print("FATAL ERROR: Could not parse XML from: " + input_uri, 1)
        task_update_progress("Failed parsing DOI data")
        task_update_status("FAILED")
        return False

    root = tree.getroot()

    try:
        date_el = root.find('date')
        date_str = '%s-%s-%s' % (date_el.get('year'), date_el.get('month'),
                                 date_el.get('day'))
        _print("Processing DOIs last updated on date %s" % date_str)
    except AttributeError:
        _print("Warning: Couldn't get last published date of Arxiv DOI feed.")

    doi_count = 0
    new_count = 0

    # Stores any DOIs we have issues with, using the structure:
    # Missing: (doi, arxiv preprint_id, published date)
    # Ambiguous: (doi, arxiv preprint_id, rec_ids)
    # Incorrect: (rec_id, old-doi, new-doi)
    problem_dois = {'missing': [], 'ambiguous': [], 'incorrect': []}

    task_update_progress("Processing records...")
    # NB: Element.getiterator() is deprecated since version 2.7: Use
    # method Element.iter() instead.
    for item in root.getiterator('article'):
        doi_count += 1
        doi = item.get('doi')
        arxiv = item.get('preprint_id')
        published_date = item.get('published')
        _print("XML entry #%s: %s" % (str(doi_count), arxiv), 6)
        rec_id = get_record_by_arxiv_id(arxiv)
        if len(rec_id) == 1:
            rec_id = rec_id[0]
            try:
                record_xml = append_to_record(rec_id, doi, published_date)
            except DOIError as ex:
                problem_dois['incorrect'].append((rec_id, ex.message, doi))
                continue
            if record_xml:
                new_count += 1
                _print("* Now we will run the bibupload and bibindex for " +
                       "%s record" % rec_id, 5)
                _print("** We will upload the following xml code %s" %
                       repr(record_xml), 9)
                bibupload.add(record_xml)
                bibindex.add(rec_id)
        elif len(rec_id) > 1:
            _print('ERROR: %d records found with matching arXiv ID %s' %
                   (len(rec_id), arxiv))
            problem_dois['ambiguous'].append((doi, arxiv, repr(rec_id)))
        else:
            _print('No record found matching arxiv ID: %s' % arxiv, 9)
            problem_dois['missing'].append((doi, arxiv, published_date))

    _print("========================| FINAL SCORE |=======================", 1)
    _print("DOIs found and processed: %d" % doi_count, 1)
    _print("Arxiv IDs without corresponding records: %d"
           % len(problem_dois['missing']), 1)
    _print("Arxiv IDs corresponding to multiple records (duplicates): %d"
           % len(problem_dois['ambiguous']), 1)
    _print("Inspire records with an incorrect DOI: %d"
           % len(problem_dois['incorrect']), 1)
    _print("Records without DOIs requiring appends: %d" % new_count, 1)
    _print("==============================================================", 1)

    if logging:
        task_update_progress("Logging...")
        write_list_to_file(log_dir, 'errors', ERRORS)
        write_list_to_file(log_dir, 'messages', MESSAGES)

    notify_on_errors(problem_dois, log_dir, doi_count, new_count,
                     asana_key, asana_parent_id, skip_results)

    task_update_progress("%s finished. %s DOIs processed, %s to add"
                         % (SCRIPT_NAME, str(doi_count), str(new_count)))
    task_update_status("DONE")

    bibupload.__del__()
    bibindex.__del__()

    return True
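
A hedged example of how this tasklet might be invoked directly (e.g. from a Python shell on the Invenio instance) rather than through the bibsched scheduler; the test feed URI is the one mentioned in the docstring, and the skip list uses two of the documented skip_result_types values:

# Hypothetical direct call, a sketch only; in production the tasklet is
# normally scheduled as a bibsched task rather than run by hand.
bst_arxiv_doi_update(
    input_uri='http://arxiv.org/schemas/doi_feed_test.xml',  # test feed from the docstring
    logging=True,
    asana_key=None,  # skip Asana reporting and email the instance admin instead
    skip_result_types='missing,ambiguous',
)
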
def bst_arxiv_doi_update(input_uri=None,
                         log_dir=CFG_TMPSHAREDDIR, logging=True):
    """
    bst_arxiv_doi_update
    Updates DOIs on documents harvested from ArXiv.

    Parameters:
        * input_uri - Link to new URI data
            DEFAULT: https://vendor.ridge.aps.org/arXiv/latest_pub.xml
            NOTE: Test data can be taken from http://arxiv.org/schemas/doi_feed_test.xml
        * log_dir - Directory to store log files in
        * logging - True or False, default True
    """

    if input_uri is None:
        _print("Notice: No URI specified, defaulting to " + URI_DEFAULT)
        input_uri = URI_DEFAULT

    task_update_progress("Resolving URI...")

    # Testing builds characters
    bibupload = ChunkedBibUpload(mode='a', user=SCRIPT_NAME, notimechange=True)
    bibindex = ChunkedBibIndex(indexes='year,global,journal',
                               user=SCRIPT_NAME)
    # open url and parse xml
    try:
        tree = ET.parse(urllib.urlopen(input_uri))
        _print('Opened DOI file ' + input_uri)
    except IOError:
        _print("FATAL ERROR: Could not open URL: " + input_uri, 1)
        task_update_progress("Failed retreiving DOI data")
        task_update_status("FAILED")
        return False
    root = tree.getroot()

    doi_count = 0
    new_count = 0
    missing_count = 0

    task_update_progress("Processing records...")
    # NB: Element.getiterator() is deprecated since version 2.7: Use
    # method Element.iter() instead.
    for item in root.getiterator('article'):
        doi_count += 1
        doi = item.get('doi')
        arxiv = item.get('preprint_id')
        published_date = item.get('published')
        _print("XML entry #%s: %s" % (str(doi_count), arxiv), 6)
        rec_id = get_record_by_arxiv_id(arxiv)
        if len(rec_id) == 1:
            rec_id = rec_id[0]
            record_xml = append_to_record(rec_id, doi, published_date)
            if record_xml:
                new_count += 1
                _print("* Now we will run the bibupload and bibindex for " +
                       str(rec_id) + " record", 5)
                _print("** We will upload the following xml code " +
                       repr(record_xml), 9)
                bibupload.add(record_xml)
                bibindex.add(rec_id)
        elif len(rec_id) > 1:
            _print('ERROR: %d records found with matching arXiv ID %s' %
                   (len(rec_id), arxiv))
        else:
            missing_count += 1
            _print('No record found matching arxiv ID: ' + arxiv, 9)

    _print("======================== FINAL SCORE ========================", 1)
    _print("DOIs found and processed: " + str(doi_count), 1)
    _print("Arxiv IDs without corresponding records: " + str(missing_count), 1)
    _print("Records requiring appends: " + str(new_count), 1)

    if logging:
        task_update_progress("Logging...")
        write_list_to_file(log_dir, 'errors', ERRORS)
        write_list_to_file(log_dir, 'messages', MESSAGES)

    task_update_progress(SCRIPT_NAME + " finished. %s DOIs processed, %s to add"
                         % (str(doi_count), str(new_count)))
    task_update_status("DONE")

    bibupload.__del__()
    bibindex.__del__()

    return True
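
For reference, a minimal sketch of the feed shape the parsing loops above expect: a date element with year/month/day attributes (read by the first version) and one article element per DOI carrying doi, preprint_id and published attributes. The root tag name and the attribute values below are illustrative assumptions, not taken from the real APS feed:

import xml.etree.ElementTree as ET

# Illustrative feed; only the 'date' and 'article' elements and their
# attributes match what the tasklet actually reads.
SAMPLE_FEED = """
<preprint_data>
  <date year="2014" month="03" day="21"/>
  <article doi="10.1103/PhysRevD.00.000000"
           preprint_id="arXiv:1401.0000"
           published="2014-03-20"/>
</preprint_data>
"""

root = ET.fromstring(SAMPLE_FEED)
date_el = root.find('date')
print('%s-%s-%s' % (date_el.get('year'), date_el.get('month'), date_el.get('day')))
for item in root.iter('article'):
    print('%s %s %s' % (item.get('doi'), item.get('preprint_id'), item.get('published')))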