def get_recids_to_load():
    """
    Generates the final list of record IDs to load.

    Returns a list of tuples like: (recid, date)
    """
    recids_given = task_get_option("recids", default=[])
    query_given = task_get_option("query")
    reportnumbers_given = task_get_option("reportnumbers")

    if query_given:
        write_message("Performing given search query: %s" % (query_given,))
        result = perform_request_search(p=query_given, of='id', rg=0, wl=0)
        recids_given.extend(result)

    if reportnumbers_given:
        write_message("Searching for records referring to given reportnumbers")
        for reportnumber in reportnumbers_given:
            result = perform_request_search(p='reportnumber:%s' % (reportnumber,),
                                            of='id', rg=0, wl=0)
            recids_given.extend(result)

    recids_given = [(recid, None) for recid in recids_given]

    last_id, last_date = fetch_last_updated(name="bibcatalog")
    records_found = []
    if task_get_option("new", default=False):
        records_found.extend(get_all_new_records(since=last_date,
                                                 last_id=last_id))
    if task_get_option("modified", default=False):
        records_found.extend(get_all_modified_records(since=last_date,
                                                      last_id=last_id))

    for recid, date in records_found:
        recids_given.append((recid, date))

    return recids_given
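# A minimal usage sketch (not part of the original module) showing how the
# (recid, date) tuples returned by get_recids_to_load() are meant to be
# consumed; handle_record() is a hypothetical placeholder for the real
# per-record processing.
def _example_consume_recids(handle_record):
    for recid, modification_date in get_recids_to_load():
        # modification_date is None for records requested explicitly via
        # --recids, --query or --reportnumbers; only records picked up
        # through the "new"/"modified" options carry a date.
        handle_record(recid, modification_date)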
def task_run_core(name=NAME):
    """Entry point for the arxiv-pdf-checker task"""
    # First gather recids to process
    recids = task_get_option("recids")
    if recids:
        start_date = None
        recids = [(recid, None) for recid in recids]
    else:
        start_date = datetime.now()
        dummy, last_date = fetch_last_updated(name)
        recids = fetch_updated_arxiv_records(last_date)

    updated_recids = set()

    try:
        for count, (recid, dummy) in enumerate(recids):
            if count % 50 == 0:
                msg = "Done %s of %s" % (count, len(recids))
                write_message(msg)
                task_update_progress(msg)

            # BibTask sleep
            task_sleep_now_if_required(can_stop_too=True)

            write_message("processing %s" % recid, verbose=9)
            try:
                if process_one(recid):
                    updated_recids.add(recid)
                time.sleep(6)
            except AlreadyHarvested:
                write_message("already harvested successfully")
                time.sleep(6)
            except FoundExistingPdf:
                write_message("pdf already attached (matching md5)")
                time.sleep(6)
            except PdfNotAvailable:
                write_message("no pdf available")
                time.sleep(20)
            except InvenioFileDownloadError, e:
                write_message("failed to download: %s" % e)
                time.sleep(20)
    finally:
        # We want to process updated records even in case we are interrupted
        msg = "Updated %s records" % len(updated_recids)
        write_message(msg)
        task_update_progress(msg)
        write_message(repr(updated_recids))

        # For all updated records, we want to sync the 8564 tags
        # and reextract references
        if updated_recids:
            submit_fixmarc_task(updated_recids)
            submit_refextract_task(updated_recids)

    # Store last run date of the daemon
    # not if it ran on specific recids from the command line with --id
    # but only if it ran on the modified records
    if start_date:
        store_last_updated(0, start_date, name)

    return True
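# A sketch (assuming the Invenio 1.x bibtask API) of how task_run_core() is
# typically registered with BibSched; the authorization_action value below is
# an assumed name, not taken from this file.
def _example_main():
    from invenio.bibtask import task_init
    task_init(authorization_action='runarxivpdfchecker',  # assumed action name
              description=__doc__,
              task_run_fnc=task_run_core)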