def main():
    """Submit tasks for records found by the arxiv report-number search
    that have no fulltext.

    Records are collected from ``search_pattern(p='arxiv',
    f='reportnumber')``.  Every record for which ``record_has_fulltext``
    is false is queued; each time the queue reaches the batch size it is
    handed to ``submit_task`` and the function blocks on ``wait_for_task``
    before continuing.  Whatever is left over at the end is submitted as a
    final task, which is *not* waited for.

    Progress is reported on stdout.  The ``print`` calls deliberately take
    a single pre-formatted string so that they emit the same text whether
    ``print`` is the Python 2 statement or the Python 3 function.
    """
    batch_size = 1000

    recids = search_pattern(p='arxiv', f='reportnumber')
    to_process = []
    for count, recid in enumerate(recids):
        # Progress line every 50 records examined.
        if count % 50 == 0:
            print('done %s of %s' % (count, len(recids)))

        if not record_has_fulltext(recid):
            # Same output as the former ``print 'adding', recid``.
            print('adding %s' % (recid,))
            to_process.append(recid)

        if len(to_process) == batch_size:
            task_id = submit_task(to_process)
            print('submitted task id %s' % task_id)
            wait_for_task(task_id)
            # Rebind instead of clearing in place: the submitted list
            # object is left untouched.
            to_process = []

    # Submit the remainder (fewer than batch_size records), if any.
    if to_process:
        task_id = submit_task(to_process)
        print('submitted final task id %s' % task_id)
def process_one(recid):
    """Compare our harvested PDF version of ``recid`` with arXiv's and
    download the PDF when it is needed.

    Returns:
        True when ``download_one`` completed without raising; the new
        version is then stored with ``STATUS_OK``.

    Raises:
        PdfNotAvailable: when no arXiv version could be fetched, when a
            previous attempt for this same version is already stored as
            ``STATUS_MISSING``, or when ``download_one`` raises it (the
            version is then stored as ``STATUS_MISSING``).
        AlreadyHarvested: when the record has a fulltext and the harvested
            version equals the arXiv version.
        FoundExistingPdf: re-raised from ``download_one`` after the
            version has been stored with ``STATUS_OK``.
    """
    write_message('checking %s' % recid)

    # What we recorded the last time this record was harvested.
    harvest_status, harvest_version = fetch_arxiv_pdf_status(recid)

    # What arXiv currently reports.
    arxiv_version = fetch_arxiv_version(recid)
    if not arxiv_version:
        msg = 'version information unavailable'
        write_message(msg)
        raise PdfNotAvailable(msg)

    write_message('harvested_version %s' % harvest_version)
    write_message('arxiv_version %s' % arxiv_version)

    version_unchanged = harvest_version == arxiv_version

    # Nothing to do: we hold a fulltext and it is the current version.
    if record_has_fulltext(recid) and version_unchanged:
        write_message('our version matches arxiv')
        raise AlreadyHarvested(status=harvest_status)

    # This exact version was already tried before and recorded as missing.
    if harvest_status == STATUS_MISSING and version_unchanged:
        raise PdfNotAvailable()

    try:
        download_one(recid, arxiv_version)
    except PdfNotAvailable:
        # Record the failure for this version, then propagate it.
        store_arxiv_pdf_status(recid, STATUS_MISSING, arxiv_version)
        raise
    except FoundExistingPdf:
        # Record the version as OK, but still tell the caller what happened.
        store_arxiv_pdf_status(recid, STATUS_OK, arxiv_version)
        raise

    # Only reached when download_one returned normally, since both
    # handlers above re-raise.
    store_arxiv_pdf_status(recid, STATUS_OK, arxiv_version)
    return True
def cb_process_one(recid):
    """Add ``recid`` to ``refextract`` when it has a fulltext and no
    999C5 / 999C6 fields.

    The record is loaded with ``get_record``.  If it already has any
    ``999C5`` or ``999C6`` field nothing is done; otherwise, when
    ``record_has_fulltext`` is true, ``refextract.add`` is called with
    the record id.  Always returns None.
    """
    record = get_record(recid)

    # NOTE(review): 999C5/999C6 presumably hold already-extracted
    # references -- confirm against the record schema.
    for tag in ('999C5', '999C6'):
        if record.find_fields(tag):
            return

    if record_has_fulltext(recid):
        refextract.add(recid)