Example #1
def get_recids_to_load():
    """
    Generates the final list of record IDs to load.

    Returns a list of tuples like: (recid, date)
    """
    recids_given = task_get_option("recids", default=[])
    query_given = task_get_option("query")
    reportnumbers_given = task_get_option("reportnumbers")
    if query_given:
        write_message("Performing given search query: %s" % (query_given,))
        result = perform_request_search(p=query_given,
                                        of='id',
                                        rg=0,
                                        wl=0)
        recids_given.extend(result)

    if reportnumbers_given:
        write_message("Searching for records referring to given reportnumbers")
        for reportnumber in reportnumbers_given:
            result = perform_request_search(p='reportnumber:%s' % (reportnumber,),
                                            of='id',
                                            rg=0,
                                            wl=0)
            recids_given.extend(result)

    recids_given = [(recid, None) for recid in recids_given]

    last_id, last_date = fetch_last_updated(name="bibcatalog")
    records_found = []
    if task_get_option("new", default=False):
        records_found.extend(get_all_new_records(since=last_date, last_id=last_id))
    if task_get_option("modified", default=False):
        records_found.extend(get_all_modified_records(since=last_date, last_id=last_id))

    for recid, date in records_found:
        recids_given.append((recid, date))
    return recids_given
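
Because the explicit recids option, the search query, the report-number lookups and the new/modified scans are simply concatenated, the same record ID can end up in the returned list more than once. A caller will typically deduplicate before processing; below is a minimal, self-contained sketch of one way to do that (the helper name dedupe_recids and the preference for the latest non-None date are assumptions, not part of the original code):

def dedupe_recids(pairs):
    """Collapse duplicate (recid, date) pairs, keeping the latest
    non-None date seen for each record ID (hypothetical helper)."""
    best = {}
    for recid, date in pairs:
        if recid not in best:
            best[recid] = date
        elif date is not None and (best[recid] is None or date > best[recid]):
            best[recid] = date
    return sorted(best.items())

if __name__ == "__main__":
    sample = [(11, None), (42, None), (11, "2013-05-02 10:00:00"), (42, None)]
    print(dedupe_recids(sample))
    # [(11, '2013-05-02 10:00:00'), (42, None)]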
Example #2
def task_run_core(name=NAME):
    """Entry point for the arxiv-pdf-checker task"""

    # First gather recids to process
    recids = task_get_option("recids")
    if recids:
        start_date = None
        recids = [(recid, None) for recid in recids]
    else:
        start_date = datetime.now()
        dummy, last_date = fetch_last_updated(name)
        recids = fetch_updated_arxiv_records(last_date)

    updated_recids = set()

    try:

        for count, (recid, dummy) in enumerate(recids):
            if count % 50 == 0:
                msg = "Done %s of %s" % (count, len(recids))
                write_message(msg)
                task_update_progress(msg)

            # BibTask sleep
            task_sleep_now_if_required(can_stop_too=True)

            write_message("processing %s" % recid, verbose=9)
            try:
                if process_one(recid):
                    updated_recids.add(recid)
                time.sleep(6)
            except AlreadyHarvested:
                write_message("already harvested successfully")
                time.sleep(6)
            except FoundExistingPdf:
                write_message("pdf already attached (matching md5)")
                time.sleep(6)
            except PdfNotAvailable:
                write_message("no pdf available")
                time.sleep(20)
            except InvenioFileDownloadError as e:
                write_message("failed to download: %s" % e)
                time.sleep(20)

    finally:
        # We want to process updated records even in case we are interrupted
        msg = "Updated %s records" % len(updated_recids)
        write_message(msg)
        task_update_progress(msg)
        write_message(repr(updated_recids))

        # For all updated records, we want to sync the 8564 tags
        # and reextract references
        if updated_recids:
            submit_fixmarc_task(updated_recids)
            submit_refextract_task(updated_recids)

    # Store last run date of the daemon
    # not if it ran on specific recids from the command line with --id
    # but only if it ran on the modified records
    if start_date:
        store_last_updated(0, start_date, name)

    return True
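
One way to read the loop above: each record is processed in isolation, ordinary outcomes pause briefly (6 s) before the next request, "no PDF" and download failures back off longer (20 s), and the finally block guarantees the follow-up tasks are submitted even if the run is interrupted. A stripped-down, self-contained sketch of that pattern (run_batch, the DownloadFailed stand-in for InvenioFileDownloadError and the demo at the bottom are illustrative assumptions, not Invenio API):

import time

class PdfNotAvailable(Exception):
    """Stand-in for one of the exceptions raised by process_one() above."""

class DownloadFailed(Exception):
    """Stand-in for InvenioFileDownloadError (assumption, not the real class)."""

def run_batch(recids, process_record, short_pause=6, long_pause=20):
    """Process records one by one, isolating per-record failures and
    backing off longer when the remote side has nothing to offer."""
    updated = set()
    try:
        for recid in recids:
            try:
                if process_record(recid):
                    updated.add(recid)
                time.sleep(short_pause)   # polite delay between requests
            except PdfNotAvailable:
                time.sleep(long_pause)    # nothing to fetch: back off longer
            except DownloadFailed as err:
                print("failed to download %s: %s" % (recid, err))
                time.sleep(long_pause)
    finally:
        # In the example above this is where submit_fixmarc_task() and
        # submit_refextract_task() are called for the updated records.
        print("updated %d records" % len(updated))
    return updated

if __name__ == "__main__":
    # Simulate a small batch with zero pauses so the sketch runs instantly.
    demo = run_batch([1, 2, 3], lambda r: r % 2 == 0, short_pause=0, long_pause=0)
    print(sorted(demo))   # [2]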