def check_slave_is_in_consistent_state(connection=None):
    """
    Check if the slave is already aware that the dbdump task is running.
    Since dbdump is a monotask, this guarantees that no other task is
    currently running and it is hence safe to detach the slave and start
    the actual dump.
    """
    if connection is None:
        connection = get_connection_for_dump_on_slave()
    i = 0
    ## Let's take the current status of dbdump (e.g. RUNNING, ABOUT TO STOP, etc.)...
    current_status = run_sql("""SELECT status FROM "schTASK" WHERE id=%s""",
                             (task_get_task_param('task_id'), ))[0][0]
    while True:
        if i == 10:
            ## Timeout!!
            raise StandardError(
                "The slave seems not to pick up with the master")
        ## ...and let's see if it matches with what the slave sees.
        if run_sql("""SELECT status FROM "schTASK" WHERE id=%s AND status=%s""",
                   (task_get_task_param('task_id'), current_status),
                   connection=connection):
            ## Bingo!
            return
        time.sleep(3)
        i += 1
def update_rule_last_run(rule_name):
    """
    Set the last time a rule was run to now. This function should be
    called after a rule has been run.
    """
    if task_has_option('record_ids') or task_get_option('no_upload', False) \
            or task_get_option('no_tickets', False):
        return  # We don't want to update the database in this case

    updated = run_sql("UPDATE bibcheck_rules SET last_run=%s WHERE name=%s;",
                      (task_get_task_param('task_starting_time'), rule_name,))
    if not updated:  # rule not in the database, insert it
        run_sql("INSERT INTO bibcheck_rules(name, last_run) VALUES (%s, %s)",
                (rule_name, task_get_task_param('task_starting_time')))
def watch_directory(new_job_dir=CFG_BIBENCODE_DAEMON_DIR_NEWJOBS,
                    old_job_dir=CFG_BIBENCODE_DAEMON_DIR_OLDJOBS):
    """ Checks a folder for job files, parses and executes them
    @param new_job_dir: path to the directory with new jobs
    @type new_job_dir: string
    @param old_job_dir: path to the directory where the old jobs are moved
    @type old_job_dir: string
    """
    global _NUMBER, _TASKID
    write_message('Checking directory %s for new jobs' % new_job_dir)
    task_update_progress('Checking for new jobs')
    _TASKID = task_get_task_param('task_id')
    files = os.listdir(new_job_dir)
    for file in files:
        file_fullpath = os.path.join(new_job_dir, file)
        if has_signature(file_fullpath):
            write_message('New Job found: %s' % file)
            job = json_decode_file(file_fullpath)
            if not getval(job, 'isbatch'):
                args = job_to_args(job)
                if not launch_task(args):
                    write_message('Error submitting task')
            else:
                ## We need the job description for the batch engine
                ## So we need to use the new path inside the oldjobs dir
                process_batch(os.path.join(old_job_dir, file))
            ## Move the file to the done dir
            shutil.move(file_fullpath, os.path.join(old_job_dir, file))
            ## Update number for next job
            _NUMBER += 1
    return 1
def _dbdump_run_task_core():
    """
    Run DB dumper core stuff.
    Note: do not use task_can_sleep() stuff here because we don't want
    other tasks to interrupt us while we are dumping the DB content.
    """
    # read params:
    task_update_progress("Reading parameters")
    write_message("Reading parameters started")
    output_dir = task_get_option('output', CFG_LOGDIR)
    output_num = task_get_option('number', 5)
    output_fil_prefix = CFG_DATABASE_NAME + '-dbdump-'
    output_fil_suffix = task_get_task_param('task_starting_time').replace(' ', '_') + '.sql.gz'
    output_fil = output_fil_prefix + output_fil_suffix
    write_message("Reading parameters ended")
    # make dump:
    task_update_progress("Dumping database")
    write_message("Database dump started")
    _dump_database(output_dir, output_fil)
    write_message("Database dump ended")
    # prune old dump files:
    task_update_progress("Pruning old dump files")
    write_message("Pruning old dump files started")
    _delete_old_dumps(output_dir, output_fil_prefix, output_num)
    write_message("Pruning old dump files ended")
    # we are done:
    task_update_progress("Done.")
    return True
def _update_job_lastrun_time(jobname):
    """Update expJOB table and set lastrun time of JOBNAME to the task
    starting time."""
    run_sql("""UPDATE "expJOB" SET lastrun=%s WHERE jobname=%s""",
            (task_get_task_param('task_starting_time'), jobname,))
def iterate_over_new(list, fmt):
    """ Iterate over list of IDs
    @param list: the list of record IDs to format
    @param fmt: the output format to use
    @return: tuple (total number of records, time taken to format,
             time taken to insert)
    """
    global total_rec

    formatted_records = ''  # (string-)List of formatted record of an iteration
    tbibformat = 0          # time taken up by external call
    tbibupload = 0          # time taken up by external call
    start_date = task_get_task_param('task_starting_time')  # Time at which the record was formatted

    tot = len(list)
    count = 0
    for recID in list:
        t1 = os.times()[4]
        start_date = time.strftime('%Y-%m-%d %H:%M:%S')
        format_record(recID, fmt, on_the_fly=True)
        formatted_record = zlib.compress(format_record(recID, fmt, on_the_fly=True))
        run_sql('REPLACE LOW_PRIORITY INTO bibfmt (id_bibrec, format, last_updated, value) VALUES (%s, %s, %s, %s)',
                (recID, fmt, start_date, formatted_record))
        t2 = os.times()[4]
        tbibformat += (t2 - t1)
        count += 1
        if (count % 100) == 0:
            write_message(" ... formatted %s records out of %s" % (count, tot))
            task_update_progress('Formatted %s out of %s' % (count, tot))
            task_sleep_now_if_required(can_stop_too=True)
    if (tot % 100) != 0:
        write_message(" ... formatted %s records out of %s" % (count, tot))
    return (tot, tbibformat, tbibupload)
def _task_run_core():
    """Runs analyse_documents for each ontology, collection, record ids set."""
    automated_daemon_mode_p = True
    recids = bibtask.task_get_option('recids')
    collections = bibtask.task_get_option('collections')
    taxonomy = bibtask.task_get_option('taxonomy')

    if recids or collections:
        # We want to run some records/collection only, so we are not
        # in the automated daemon mode; this will be useful later.
        automated_daemon_mode_p = False

    # Check if the user specified which documents to extract keywords from.
    if recids:
        onto_recids = _get_recids_foreach_ontology(recids=recids,
                                                   taxonomy=taxonomy)
    elif collections:
        onto_recids = _get_recids_foreach_ontology(collections=collections,
                                                   taxonomy=taxonomy)
    else:
        onto_recids = _get_recids_foreach_ontology()

    if not onto_recids:
        # Nothing to do.
        if automated_daemon_mode_p:
            _update_date_of_last_run(
                bibtask.task_get_task_param('task_starting_time'))
        return 1

    # We will write to a temporary file as we go, because we might be
    # processing big collections with many docs
    _rid = time.strftime("%Y%m%d%H%M%S", time.localtime())
    abs_path = engine.get_tmp_file(_rid)
    fo = open(abs_path, 'w')

    fo.write('<?xml version="1.0" encoding="UTF-8"?>\n')
    fo.write('<collection xmlns="http://www.loc.gov/MARC21/slim">\n')

    # Count the total number of records in order to update the progression.
    global _RECIDS_NUMBER
    for onto_rec in onto_recids:
        _RECIDS_NUMBER += len(onto_rec['recIDs'])

    rec_added = False

    for onto_rec in onto_recids:
        bibtask.task_sleep_now_if_required(can_stop_too=False)

        if onto_rec['collection'] is not None:
            bibtask.write_message('INFO: Applying taxonomy %s to collection %s (%s '
                                  'records)' % (onto_rec['ontology'],
                                                onto_rec['collection'],
                                                len(onto_rec['recIDs'])),
                                  stream=sys.stderr, verbose=3)
        else:
            bibtask.write_message('INFO: Applying taxonomy %s to recIDs %s. ' %
                                  (onto_rec['ontology'],
                                   ', '.join([str(recid) for recid in onto_rec['recIDs']])),
                                  stream=sys.stderr, verbose=3)

        if onto_rec['recIDs']:
            xml = _analyze_documents(onto_rec['recIDs'],
                                     onto_rec['ontology'],
                                     onto_rec['collection'])
            if len(xml) > 5:
                fo.write(xml)
                rec_added = True

    fo.write('</collection>\n')
    fo.close()

    # Apply the changes.
    if rec_added:
        if bconfig.CFG_DB_SAVE_KW:
            webinterface.upload_keywords(abs_path)
        else:
            bibtask.write_message("INFO: CFG_DB_SAVE_KW is false, we don't save results",
                                  stream=sys.stderr, verbose=0)
    else:
        bibtask.write_message("WARNING: No keywords found, recids: %s" % onto_recids,
                              stream=sys.stderr, verbose=0)
        os.remove(abs_path)

    # Update the date of last run in the clsMETHOD table, but only if
    # we were running in an automated mode.
    if automated_daemon_mode_p:
        _update_date_of_last_run(
            bibtask.task_get_task_param('task_starting_time'))
    return 1
def task_run_core():
    """Core task of oaiharvest.

    This function will run all the operations needed
    to run an oaiharvest task into bibsched.

    :return:
    :raise InvenioOAIHarvestWarning:
    """
    workflow_id_preservation = 0
    workflow = None
    start_time = time.time()
    list_of_workflow_without_repository = []
    list_of_repository_per_workflow = {}

    repository = task_get_option("repository")
    if not repository:
        workflow_option = task_get_option("workflow")
        if isinstance(workflow_option, list):
            for name in workflow_option:
                if name not in list_of_workflow_without_repository:
                    list_of_workflow_without_repository.append(name)
        else:
            list_of_workflow_without_repository.append(workflow_option)
    else:
        if task_get_option("workflow"):
            workflow_option = task_get_option("workflow")
            if isinstance(workflow_option, list):
                for name in workflow_option:
                    if name not in list_of_repository_per_workflow:
                        list_of_repository_per_workflow[name] = repository
            else:
                list_of_repository_per_workflow[workflow_option] = repository
        elif isinstance(repository, list):
            for name_repository in repository:
                name_workflow = OaiHARVEST.get(
                    OaiHARVEST.name == name_repository).one().workflows
                if name_workflow not in list_of_repository_per_workflow:
                    list_of_repository_per_workflow[name_workflow] = [name_repository]
                else:
                    list_of_repository_per_workflow[name_workflow].append(name_repository)
        else:
            workflow_found = OaiHARVEST.get(
                OaiHARVEST.name == repository).one().workflows
            list_of_repository_per_workflow[workflow_found] = repository

    try:
        if list_of_repository_per_workflow:
            for workflow_to_launch in list_of_repository_per_workflow:
                options = task_get_option(None)
                options["repository"] = list_of_repository_per_workflow[workflow_to_launch]
                workflow = start(workflow_to_launch,
                                 data=[""],
                                 stop_on_error=True,
                                 options=options)
        else:
            for workflow_to_launch in list_of_workflow_without_repository:
                workflow = start(workflow_to_launch,
                                 data=[""],
                                 stop_on_error=True,
                                 options=task_get_option(None))
        if workflow:
            workflow_id_preservation = workflow.uuid
            workflowlog = BibWorkflowEngineLog.query.filter(
                BibWorkflowEngineLog.id_object == workflow.uuid
            ).all()
            for log in workflowlog:
                write_message(log.message)
        execution_time = round(time.time() - start_time, 2)
        write_message("Execution time :" + str(execution_time))
    except WorkflowError as e:
        write_message("ERRORS HAPPENED")
        write_message("____________Workflow log output____________")
        workflow_id_preservation = e.id_workflow
        workflowlog = BibWorkflowEngineLog.query.filter(
            BibWorkflowEngineLog.id_object == e.id_workflow
        ).filter(BibWorkflowEngineLog.log_type >= 40).all()
        for log in workflowlog:
            write_message(log.message)

        for i in e.payload:
            write_message("\n\n____________Workflow " + i + " log output____________")
            workflowlog = BibWorkflowEngineLog.query.filter(
                BibWorkflowEngineLog.id_object == i
            ).filter(BibWorkflowEngineLog.log_type >= 40).all()
            for log in workflowlog:
                write_message(log.message)

        write_message("____________Object log output____________")
        objectlog = BibWorkflowObjectLog.query.filter(
            BibWorkflowObjectLog.id_object == e.id_object
        ).filter(BibWorkflowEngineLog.log_type >= 40).all()
        for log in objectlog:
            write_message(log.message)

        execution_time = round(time.time() - start_time, 2)
        write_message("Execution time :" + str(execution_time))

    # Generate reports
    ticket_queue = task_get_option("create-ticket-in")
    notification_email = task_get_option("notify-email-to")
    workflow_main = Workflow.query.filter(
        Workflow.uuid == workflow_id_preservation
    ).one()

    if ticket_queue or notification_email:
        subject, text = generate_harvest_report(
            workflow_main,
            current_task_id=task_get_task_param("task_id")
        )
        # Create ticket for finished harvest?
        if ticket_queue:
            ticketid = create_ticket(ticket_queue, subject=subject, text=text)
            if ticketid:
                write_message("Ticket %s submitted." % (str(ticketid),))

        # Send e-mail for finished harvest?
        if notification_email:
            send_email(fromaddr=CFG_SITE_SUPPORT_EMAIL,
                       toaddr=notification_email,
                       subject=subject,
                       content=text)

    if workflow_main.counter_error:
        if CFG_OAI_FAILED_HARVESTING_STOP_QUEUE == 0 or \
           not task_get_task_param("sleeptime") or \
           workflow_main.counter_error > 1:
            # Admin wants BibSched to stop, or the task is not set to
            # run at a later date: we must stop the queue.
            write_message("An error occurred. Task is configured to stop")
            return False
        else:
            # An error happened, but it can be recovered at next run
            # (task is re-scheduled) and admin set BibSched to
            # continue even after failure.
            write_message("Error occurred, but task is configured to continue")
            if CFG_OAI_FAILED_HARVESTING_EMAILS_ADMIN:
                try:
                    raise InvenioOAIHarvestWarning(
                        "OAIHarvest (task #%s) failed at fully harvesting."
                        " BibSched has NOT been stopped, and OAIHarvest will"
                        " try to recover at next run" %
                        (task_get_task_param("task_id"),)
                    )
                except InvenioOAIHarvestWarning:
                    register_exception(stream='warning', alert_admin=True)
            return True
    else:
        return True
def _dbdump_run_task_core():
    """
    Run DB dumper core stuff.
    Note: do not use task_can_sleep() stuff here because we don't want
    other tasks to interrupt us while we are dumping the DB content.
    """
    # read params:
    host = CFG_DATABASE_HOST
    port = CFG_DATABASE_PORT
    connection = None
    active_queues = []
    try:
        if task_get_option('slave') and not task_get_option('dump_on_slave_helper_mode'):
            connection = get_connection_for_dump_on_slave()
            write_message("Dump on slave requested")
            write_message("... checking if slave is well up...")
            check_slave_is_up(connection)
            write_message("... checking if slave is in consistent state...")
            check_slave_is_in_consistent_state(connection)
            write_message("... detaching slave database...")
            detach_slave(connection)
            write_message("... scheduling dump on slave helper...")
            helper_arguments = []
            if task_get_option("number"):
                helper_arguments += ["--number", str(task_get_option("number"))]
            if task_get_option("output"):
                helper_arguments += ["--output", str(task_get_option("output"))]
            if task_get_option("params"):
                helper_arguments += ["--params", str(task_get_option("params"))]
            if task_get_option("ignore_tables"):
                helper_arguments += ["--ignore-tables", str(task_get_option("ignore_tables"))]
            if task_get_option("compress"):
                helper_arguments += ["--compress"]
            if task_get_option("slave"):
                helper_arguments += ["--slave", str(task_get_option("slave"))]
            helper_arguments += ['-N', 'slavehelper', '--dump-on-slave-helper']
            task_id = task_low_level_submission('dbdump',
                                                task_get_task_param('user'),
                                                '-P4', *helper_arguments)
            write_message("Slave scheduled with ID %s" % task_id)
            task_update_progress("DONE")
            return True
        elif task_get_option('dump_on_slave_helper_mode'):
            write_message("Dumping on slave mode")
            connection = get_connection_for_dump_on_slave()
            write_message("... checking if slave is well down...")
            check_slave_is_down(connection)
            host = CFG_DATABASE_SLAVE

        task_update_progress("Reading parameters")
        write_message("Reading parameters started")
        output_dir = task_get_option('output', CFG_LOGDIR)
        output_num = task_get_option('number', 5)
        params = task_get_option('params', None)
        compress = task_get_option('compress', False)
        slave = task_get_option('slave', False)
        ignore_tables = task_get_option('ignore_tables', None)
        if ignore_tables:
            ignore_tables = get_table_names(ignore_tables)
        else:
            ignore_tables = None

        output_file_suffix = task_get_task_param('task_starting_time')
        output_file_suffix = output_file_suffix.replace(' ', '_') + '.sql'
        if compress:
            output_file_suffix = "%s.gz" % (output_file_suffix,)
        write_message("Reading parameters ended")

        if task_get_option('disable_workers'):
            active_queues = get_queues()
            if active_queues:
                write_message("Suspend workers and wait for any running tasks to complete")
                suspend_queues(active_queues)
                write_message("Workers suspended")

        # make dump:
        task_update_progress("Dumping database")
        write_message("Database dump started")

        if slave:
            output_file_prefix = 'slave-%s-dbdump-' % (CFG_DATABASE_NAME,)
        else:
            output_file_prefix = '%s-dbdump-' % (CFG_DATABASE_NAME,)
        output_file = output_file_prefix + output_file_suffix
        dump_path = output_dir + os.sep + output_file
        dump_database(dump_path,
                      host=host,
                      port=port,
                      params=params,
                      compress=compress,
                      ignore_tables=ignore_tables)
        write_message("Database dump ended")
    finally:
        for queue in active_queues:
            enable_queue(queue)
        if connection and task_get_option('dump_on_slave_helper_mode'):
            write_message("Reattaching slave")
            attach_slave(connection)

    # prune old dump files:
    task_update_progress("Pruning old dump files")
    write_message("Pruning old dump files started")
    _delete_old_dumps(output_dir, output_file_prefix, output_num)
    write_message("Pruning old dump files ended")
    # we are done:
    task_update_progress("Done.")
    return True
def ref_analyzer(citation_informations, updated_recids, tags, config):
    """Analyze the citation informations and calculate the citation weight
       and cited by list dictionary.
    """
    citations = {}
    for recid in updated_recids:
        citations[recid] = set()
    references = {}
    for recid in updated_recids:
        references[recid] = set()

    def step(msg_prefix, recid, done, total):
        if done % 30 == 0:
            task_sleep_now_if_required()

        if done % 1000 == 0:
            mesg = "%s done %s of %s" % (msg_prefix, done, total)
            write_message(mesg)
            task_update_progress(mesg)

        write_message("Processing: %s" % recid, verbose=9)

    def add_to_cites(citer, citee):
        # Make sure we don't add ourselves
        # Workaround till we know why we are adding ourselves.
        if citer == citee:
            return

        citations[citee].add(citer)
        if citer in updated_recids:
            references[citer].add(citee)

    def add_to_refs(citer, citee):
        # Make sure we don't add ourselves
        # Workaround till we know why we are adding ourselves.
        if citer == citee:
            return

        if citee in updated_recids:
            citations[citee].add(citer)
        references[citer].add(citee)

    # dict of recid -> institute_give_publ_id
    records_info, references_info = citation_informations

    t1 = os.times()[4]

    # Try to find references based on 999C5r
    # e.g 8 -> ([astro-ph/9889],[hep-ph/768])
    # meaning: rec 8 contains these in bibliography
    write_message("Phase 1: Report numbers references")
    done = 0
    for thisrecid, refnumbers in iteritems(references_info['report-numbers']):
        step("Report numbers references", thisrecid, done,
             len(references_info['report-numbers']))
        done += 1

        for refnumber in (r for r in refnumbers if r):
            field = 'reportnumber'
            refnumber = standardize_report_number(refnumber)
            # Search for "hep-th/5644654 or such" in existing records
            recids = get_recids_matching_query(p=refnumber, f=field, config=config)
            write_message("These match searching %s in %s: %s" %
                          (refnumber, field, list(recids)), verbose=9)

            if not recids:
                insert_into_missing(thisrecid, refnumber)
            else:
                remove_from_missing(refnumber)

            if len(recids) > 1:
                store_citation_warning('multiple-matches', refnumber)
                msg = "Whoops: record '%d' report number value '%s' " \
                      "matches many records; taking only the first one. %s" % \
                      (thisrecid, refnumber, repr(recids))
                write_message(msg, stream=sys.stderr)

            for recid in list(recids)[:1]:  # take only the first one
                add_to_refs(thisrecid, recid)

    mesg = "done fully"
    write_message(mesg)
    task_update_progress(mesg)

    t2 = os.times()[4]

    # Try to find references based on 999C5s
    # e.g. Phys.Rev.Lett. 53 (1986) 2285
    write_message("Phase 2: Journal references")
    done = 0
    for thisrecid, refs in iteritems(references_info['journals']):
        step("Journal references", thisrecid, done,
             len(references_info['journals']))
        done += 1

        for reference in (r for r in refs if r):
            p = reference
            field = 'journal'

            # check reference value to see whether it is well formed:
            if not re_CFG_JOURNAL_PUBINFO_STANDARD_FORM_REGEXP_CHECK.match(p):
                store_citation_warning('not-well-formed', p)
                msg = "Whoops, record '%d' reference value '%s' " \
                      "is not well formed; skipping it." % (thisrecid, p)
                write_message(msg, stream=sys.stderr)
                continue  # skip this ill-formed value

            recids = get_recids_matching_query(p=p, f=field, config=config)
            write_message("These match searching %s in %s: %s" %
                          (reference, field, list(recids)), verbose=9)

            if not recids:
                insert_into_missing(thisrecid, p)
            else:
                remove_from_missing(p)

            if len(recids) > 1:
                store_citation_warning('multiple-matches', p)
                msg = "Whoops: record '%d' reference value '%s' " \
                      "matches many records; taking only the first one. %s" % \
                      (thisrecid, p, repr(recids))
                write_message(msg, stream=sys.stderr)

            for recid in list(recids)[:1]:  # take only the first one
                add_to_refs(thisrecid, recid)

    mesg = "done fully"
    write_message(mesg)
    task_update_progress(mesg)

    t3 = os.times()[4]

    # Try to find references based on 999C5a
    # e.g. 10.1007/BF03170733
    write_message("Phase 3: DOI references")
    done = 0
    for thisrecid, refs in iteritems(references_info['doi']):
        step("DOI references", thisrecid, done, len(references_info['doi']))
        done += 1

        for reference in (r for r in refs if r):
            p = reference
            field = 'doi'

            recids = get_recids_matching_query(p=p, f=field, config=config)
            write_message("These match searching %s in %s: %s" %
                          (reference, field, list(recids)), verbose=9)

            if not recids:
                insert_into_missing(thisrecid, p)
            else:
                remove_from_missing(p)

            if len(recids) > 1:
                store_citation_warning('multiple-matches', p)
                msg = "Whoops: record '%d' DOI value '%s' " \
                      "matches many records; taking only the first one. %s" % \
                      (thisrecid, p, repr(recids))
                write_message(msg, stream=sys.stderr)

            for recid in list(recids)[:1]:  # take only the first one
                add_to_refs(thisrecid, recid)

    mesg = "done fully"
    write_message(mesg)
    task_update_progress(mesg)

    t4 = os.times()[4]

    # Try to find references based on 999C5a (hdl references)
    # e.g. 4263537/4000
    write_message("Phase 4: HDL references")
    done = 0
    for thisrecid, refs in references_info['hdl'].iteritems():
        step("HDL references", thisrecid, done, len(references_info['hdl']))
        done += 1

        for reference in (r for r in refs if r):
            p = reference
            field = 'hdl'

            recids = get_recids_matching_query(p=p, f=field, config=config)
            write_message("These match searching %s in %s: %s" %
                          (reference, field, list(recids)), verbose=9)

            if not recids:
                insert_into_missing(thisrecid, p)
            else:
                remove_from_missing(p)

            if len(recids) > 1:
                store_citation_warning('multiple-matches', p)
                msg = "Whoops: record '%d' HDL value '%s' " \
                      "matches many records; taking only the first one. %s" % \
                      (thisrecid, p, repr(recids))
                write_message(msg, stream=sys.stderr)

            for recid in list(recids)[:1]:  # take only the first one
                add_to_refs(thisrecid, recid)

    mesg = "done fully"
    write_message(mesg)
    task_update_progress(mesg)

    t5 = os.times()[4]

    # Try to find references based on 999C50
    # e.g. 1244
    write_message("Phase 5: Record ID references")
    done = 0
    for thisrecid, refs in references_info['record_id'].iteritems():
        step("Record ID references", thisrecid, done,
             len(references_info['record_id']))
        done += 1
        field = "001"
        for recid in (r for r in refs if r):
            valid = get_recids_matching_query(p=recid, f=field, config=config)
            write_message("These match searching %s in %s: %s" %
                          (recid, field, list(valid)), verbose=9)
            if valid:
                add_to_refs(thisrecid, valid[0])

    mesg = "done fully"
    write_message(mesg)
    task_update_progress(mesg)

    t6 = os.times()[4]

    # Try to find references based on 999C5i
    # e.g. 978-3-942171-73-1
    write_message("Phase 6: ISBN references")
    done = 0
    for thisrecid, refs in references_info['isbn'].iteritems():
        step("ISBN references", thisrecid, done, len(references_info['isbn']))
        done += 1

        for reference in (r for r in refs if r):
            p = reference
            field = 'isbn'

            recids = get_recids_matching_query(p=p, f=field, config=config)
            write_message("These match searching %s in %s: %s" %
                          (reference, field, list(recids)), verbose=9)

            if not recids:
                insert_into_missing(thisrecid, p)
            else:
                remove_from_missing(p)

            if len(recids) > 1:
                store_citation_warning('multiple-matches', p)
                msg = "Whoops: record '%d' ISBN value '%s' " \
                      "matches many records; taking only the first one. %s" % \
                      (thisrecid, p, repr(recids))
                write_message(msg, stream=sys.stderr)

            for recid in list(recids)[:1]:  # take only the first one
                add_to_refs(thisrecid, recid)

    mesg = "done fully"
    write_message(mesg)
    task_update_progress(mesg)

    t7 = os.times()[4]

    # Search for stuff like CERN-TH-4859/87 in list of refs
    write_message("Phase 7: report numbers catchup")
    done = 0
    for thisrecid, reportcodes in iteritems(records_info['report-numbers']):
        step("Report numbers catchup", thisrecid, done,
             len(records_info['report-numbers']))
        done += 1

        for reportcode in (r for r in reportcodes if r):
            if reportcode.startswith('arXiv'):
                std_reportcode = standardize_report_number(reportcode)
                report_pattern = r'^%s( *\[[a-zA-Z.-]*\])?' % \
                                 re.escape(std_reportcode)
                recids = get_recids_matching_query(p=report_pattern,
                                                   f=tags['refs_report_number'],
                                                   m='r',
                                                   config=config)
            else:
                recids = get_recids_matching_query(p=reportcode,
                                                   f=tags['refs_report_number'],
                                                   config=config)
            for recid in recids:
                add_to_cites(recid, thisrecid)

    mesg = "done fully"
    write_message(mesg)
    task_update_progress(mesg)

    # Find this record's pubinfo in other records' bibliography
    write_message("Phase 8: journals catchup")
    done = 0
    t8 = os.times()[4]
    for thisrecid, rec_journals in iteritems(records_info['journals']):
        step("Journals catchup", thisrecid, done,
             len(records_info['journals']))
        done += 1

        for journal in rec_journals:
            journal = journal.replace("\"", "")
            # Search the publication string like
            # Phys. Lett., B 482 (2000) 417 in 999C5s
            recids = get_recids_matching_query(p=journal,
                                               f=tags['refs_journal'],
                                               config=config)
            write_message("These records match %s in %s: %s" %
                          (journal, tags['refs_journal'], list(recids)),
                          verbose=9)

            for recid in recids:
                add_to_cites(recid, thisrecid)

    mesg = "done fully"
    write_message(mesg)
    task_update_progress(mesg)

    write_message("Phase 9: DOI catchup")
    done = 0
    t9 = os.times()[4]
    for thisrecid, dois in iteritems(records_info['doi']):
        step("DOI catchup", thisrecid, done, len(records_info['doi']))
        done += 1

        for doi in dois:
            recids = get_recids_matching_query(p=doi,
                                               f=tags['refs_doi'],
                                               config=config)
            write_message("These records match %s in %s: %s" %
                          (doi, tags['refs_doi'], list(recids)), verbose=9)

            for recid in recids:
                add_to_cites(recid, thisrecid)

    mesg = "done fully"
    write_message(mesg)
    task_update_progress(mesg)

    write_message("Phase 10: HDL catchup")
    done = 0
    t10 = os.times()[4]
    for thisrecid, hdls in records_info['hdl'].iteritems():
        step("HDL catchup", thisrecid, done, len(records_info['hdl']))
        done += 1

        for hdl in hdls:
            recids = get_recids_matching_query(p=hdl,
                                               f=tags['refs_doi'],
                                               config=config)
            write_message("These records match %s in %s: %s" %
                          (hdl, tags['refs_doi'], list(recids)), verbose=9)

            for recid in recids:
                add_to_cites(recid, thisrecid)

    mesg = "done fully"
    write_message(mesg)
    task_update_progress(mesg)

    write_message("Phase 11: ISBN catchup")
    done = 0
    t11 = os.times()[4]
    for thisrecid, isbns in records_info['isbn'].iteritems():
        step("ISBN catchup", thisrecid, done, len(records_info['isbn']))
        done += 1

        for isbn in isbns:
            recids = get_recids_matching_query(p=isbn,
                                               f=tags['refs_isbn'],
                                               config=config)
            write_message("These records match %s in %s: %s" %
                          (isbn, tags['refs_isbn'], list(recids)), verbose=9)

            for recid in recids:
                add_to_cites(recid, thisrecid)

    write_message("Phase 12: Record ID catchup")
    done = 0
    t12 = os.times()[4]
    for thisrecid, record_ids in records_info['record_id'].iteritems():
        step("Record ID catchup", thisrecid, done,
             len(records_info['record_id']))
        done += 1

        for record_id in record_ids:
            recids = get_recids_matching_query(p=record_id,
                                               f=tags['refs_record_id'],
                                               config=config)
            write_message("These records match %s in %s: %s" %
                          (record_id, tags['refs_record_id'], list(recids)),
                          verbose=9)

            for recid in recids:
                add_to_cites(recid, thisrecid)

    mesg = "done fully"
    write_message(mesg)
    task_update_progress(mesg)

    if task_get_task_param('verbose') >= 3:
        # Print only X first to prevent flood
        write_message("citation_list (x is cited by y):")
        write_message(dict(islice(iteritems(citations), 10)))
        write_message("size: %s" % len(citations))
        write_message("reference_list (x cites y):")
        write_message(dict(islice(iteritems(references), 10)))
        write_message("size: %s" % len(references))

    t13 = os.times()[4]

    write_message("Execution time for analyzing the citation information "
                  "generating the dictionary:")
    write_message("... checking ref report numbers: %.2f sec" % (t2 - t1))
    write_message("... checking ref journals: %.2f sec" % (t3 - t2))
    write_message("... checking ref DOI: %.2f sec" % (t4 - t3))
    write_message("... checking ref HDL: %.2f sec" % (t5 - t4))
    write_message("... checking ref Record ID: %.2f sec" % (t6 - t5))
    write_message("... checking ref ISBN: %.2f sec" % (t7 - t6))
    write_message("... checking rec report numbers: %.2f sec" % (t8 - t7))
    write_message("... checking rec journals: %.2f sec" % (t9 - t8))
    write_message("... checking rec DOI: %.2f sec" % (t10 - t9))
    write_message("... checking rec HDL: %.2f sec" % (t11 - t10))
    write_message("... checking rec ISBN: %.2f sec" % (t12 - t11))
    write_message("... checking rec Record ID: %.2f sec" % (t13 - t12))
    write_message("... total time of ref_analyze: %.2f sec" % (t13 - t1))

    return citations, references
def ref_analyzer(citation_informations, dicts, updated_recids, tags,
                 do_catchup=True):
    """Analyze the citation informations and calculate the citation weight
       and cited by list dictionary.
    """
    citations_weight = dicts['cites_weight']
    citations = dicts['cites']
    references = dicts['refs']
    selfcites = dicts['selfcites']
    selfrefs = dicts['selfrefs']
    authorcites = dicts['authorcites']

    def step(msg_prefix, recid, done, total):
        if done % 30 == 0:
            task_sleep_now_if_required()

        if done % 1000 == 0:
            mesg = "%s done %s of %s" % (msg_prefix, done, total)
            write_message(mesg)
            task_update_progress(mesg)

        write_message("Processing: %s" % recid, verbose=9)

    def add_to_dicts(citer, cited):
        # Make sure we don't add ourselves
        # Workaround till we know why we are adding ourselves.
        if citer == cited:
            return
        if cited not in citations_weight:
            citations_weight[cited] = 0
        # Citations and citations weight
        if citer not in citations.setdefault(cited, []):
            citations[cited].append(citer)
            citations_weight[cited] += 1
        # References
        if cited not in references.setdefault(citer, []):
            references[citer].append(cited)

    # dict of recid -> institute_give_publ_id
    records_info, references_info = citation_informations

    t1 = os.times()[4]

    write_message("Phase 0: temporarily remove changed records from "
                  "citation dictionaries; they will be filled later")
    if do_catchup:
        for somerecid in updated_recids:
            try:
                del citations[somerecid]
            except KeyError:
                pass

    for somerecid in updated_recids:
        try:
            del references[somerecid]
        except KeyError:
            pass

    # Try to find references based on 999C5r
    # e.g 8 -> ([astro-ph/9889],[hep-ph/768])
    # meaning: rec 8 contains these in bibliography
    write_message("Phase 1: Report numbers references")
    done = 0
    for thisrecid, refnumbers in iteritems(references_info['report-numbers']):
        step("Report numbers references", thisrecid, done,
             len(references_info['report-numbers']))
        done += 1

        for refnumber in (r for r in refnumbers if r):
            field = 'reportnumber'
            refnumber = standardize_report_number(refnumber)
            # Search for "hep-th/5644654 or such" in existing records
            recids = get_recids_matching_query(p=refnumber, f=field)
            write_message("These match searching %s in %s: %s" %
                          (refnumber, field, list(recids)), verbose=9)

            if not recids:
                insert_into_missing(thisrecid, refnumber)
            else:
                remove_from_missing(refnumber)

            if len(recids) > 1:
                store_citation_warning('multiple-matches', refnumber)
                msg = "Whoops: record '%d' report number value '%s' " \
                      "matches many records; taking only the first one. %s" % \
                      (thisrecid, refnumber, repr(recids))
                write_message(msg, stream=sys.stderr)

            for recid in list(recids)[:1]:  # take only the first one
                add_to_dicts(thisrecid, recid)

    mesg = "done fully"
    write_message(mesg)
    task_update_progress(mesg)

    t2 = os.times()[4]

    # Try to find references based on 999C5s
    # e.g. Phys.Rev.Lett. 53 (1986) 2285
    write_message("Phase 2: Journal references")
    done = 0
    for thisrecid, refs in iteritems(references_info['journals']):
        step("Journal references", thisrecid, done,
             len(references_info['journals']))
        done += 1

        for reference in (r for r in refs if r):
            p = reference
            field = 'journal'

            # check reference value to see whether it is well formed:
            if not re_CFG_JOURNAL_PUBINFO_STANDARD_FORM_REGEXP_CHECK.match(p):
                store_citation_warning('not-well-formed', p)
                msg = "Whoops, record '%d' reference value '%s' " \
                      "is not well formed; skipping it." % (thisrecid, p)
                write_message(msg, stream=sys.stderr)
                continue  # skip this ill-formed value

            recids = search_unit(p, field) - INTBITSET_OF_DELETED_RECORDS
            write_message("These match searching %s in %s: %s" %
                          (reference, field, list(recids)), verbose=9)

            if not recids:
                insert_into_missing(thisrecid, p)
            else:
                remove_from_missing(p)

            if len(recids) > 1:
                store_citation_warning('multiple-matches', p)
                msg = "Whoops: record '%d' reference value '%s' " \
                      "matches many records; taking only the first one. %s" % \
                      (thisrecid, p, repr(recids))
                write_message(msg, stream=sys.stderr)

            for recid in list(recids)[:1]:  # take only the first one
                add_to_dicts(thisrecid, recid)

    mesg = "done fully"
    write_message(mesg)
    task_update_progress(mesg)

    t3 = os.times()[4]

    # Try to find references based on 999C5a
    # e.g. 10.1007/BF03170733
    write_message("Phase 3: DOI references")
    done = 0
    for thisrecid, refs in iteritems(references_info['doi']):
        step("DOI references", thisrecid, done, len(references_info['doi']))
        done += 1

        for reference in (r for r in refs if r):
            p = reference
            field = 'doi'

            recids = get_recids_matching_query(p, field)
            write_message("These match searching %s in %s: %s" %
                          (reference, field, list(recids)), verbose=9)

            if not recids:
                insert_into_missing(thisrecid, p)
            else:
                remove_from_missing(p)

            if len(recids) > 1:
                store_citation_warning('multiple-matches', p)
                msg = "Whoops: record '%d' DOI value '%s' " \
                      "matches many records; taking only the first one. %s" % \
                      (thisrecid, p, repr(recids))
                write_message(msg, stream=sys.stderr)

            for recid in list(recids)[:1]:  # take only the first one
                add_to_dicts(thisrecid, recid)

    mesg = "done fully"
    write_message(mesg)
    task_update_progress(mesg)

    t4 = os.times()[4]

    # Search for stuff like CERN-TH-4859/87 in list of refs
    write_message("Phase 4: report numbers catchup")
    done = 0
    for thisrecid, reportcodes in iteritems(records_info['report-numbers']):
        step("Report numbers catchup", thisrecid, done,
             len(records_info['report-numbers']))
        done += 1

        for reportcode in (r for r in reportcodes if r):
            if reportcode.startswith('arXiv'):
                std_reportcode = standardize_report_number(reportcode)
                report_pattern = r'^%s( *\[[a-zA-Z.-]*\])?' % \
                                 re.escape(std_reportcode)
                recids = get_recids_matching_query(report_pattern,
                                                   tags['refs_report_number'],
                                                   'r')
            else:
                recids = get_recids_matching_query(reportcode,
                                                   tags['refs_report_number'],
                                                   'e')
            for recid in recids:
                add_to_dicts(recid, thisrecid)

    mesg = "done fully"
    write_message(mesg)
    task_update_progress(mesg)

    # Find this record's pubinfo in other records' bibliography
    write_message("Phase 5: journals catchup")
    done = 0
    t5 = os.times()[4]
    for thisrecid, rec_journals in iteritems(records_info['journals']):
        step("Journals catchup", thisrecid, done,
             len(records_info['journals']))
        done += 1

        for journal in rec_journals:
            journal = journal.replace("\"", "")
            # Search the publication string like
            # Phys. Lett., B 482 (2000) 417 in 999C5s
            recids = search_unit(p=journal, f=tags['refs_journal'], m='a') \
                     - INTBITSET_OF_DELETED_RECORDS
            write_message("These records match %s in %s: %s" %
                          (journal, tags['refs_journal'], list(recids)),
                          verbose=9)

            for recid in recids:
                add_to_dicts(recid, thisrecid)

    mesg = "done fully"
    write_message(mesg)
    task_update_progress(mesg)

    write_message("Phase 6: DOI catchup")
    done = 0
    t6 = os.times()[4]
    for thisrecid, dois in iteritems(records_info['doi']):
        step("DOI catchup", thisrecid, done, len(records_info['doi']))
        done += 1

        for doi in dois:
            # Search the publication string like
            # Phys. Lett., B 482 (2000) 417 in 999C5a
            recids = search_unit(p=doi, f=tags['refs_doi'], m='a') \
                     - INTBITSET_OF_DELETED_RECORDS
            write_message("These records match %s in %s: %s" %
                          (doi, tags['refs_doi'], list(recids)), verbose=9)

            for recid in recids:
                add_to_dicts(recid, thisrecid)

    mesg = "done fully"
    write_message(mesg)
    task_update_progress(mesg)

    write_message("Phase 7: remove empty lists from dicts")

    # Remove empty lists in citation and reference
    keys = citations.keys()
    for k in keys:
        if not citations[k]:
            del citations[k]

    keys = references.keys()
    for k in keys:
        if not references[k]:
            del references[k]

    if task_get_task_param('verbose') >= 3:
        # Print only X first to prevent flood
        write_message("citation_list (x is cited by y):")
        write_message(dict(islice(iteritems(citations), 10)))
        write_message("size: %s" % len(citations))
        write_message("reference_list (x cites y):")
        write_message(dict(islice(iteritems(references), 10)))
        write_message("size: %s" % len(references))
        write_message("selfcitedbydic (x is cited by y and one of the "
                      "authors of x same as y's):")
        write_message(dict(islice(iteritems(selfcites), 10)))
        write_message("size: %s" % len(selfcites))
        write_message("selfdic (x cites y and one of the authors of x "
                      "same as y's):")
        write_message(dict(islice(iteritems(selfrefs), 10)))
        write_message("size: %s" % len(selfrefs))
        write_message("authorcitdic (author is cited in recs):")
        write_message(dict(islice(iteritems(authorcites), 10)))
        write_message("size: %s" % len(authorcites))

    t7 = os.times()[4]

    write_message("Execution time for analyzing the citation information "
                  "generating the dictionary:")
    write_message("... checking ref report numbers: %.2f sec" % (t2 - t1))
    write_message("... checking ref journals: %.2f sec" % (t3 - t2))
    write_message("... checking ref DOI: %.2f sec" % (t4 - t3))
    write_message("... checking rec report numbers: %.2f sec" % (t5 - t4))
    write_message("... checking rec journals: %.2f sec" % (t6 - t5))
    write_message("... checking rec DOI: %.2f sec" % (t7 - t6))
    write_message("... total time of ref_analyze: %.2f sec" % (t7 - t1))

    return citations_weight, citations, references, selfcites, \
        selfrefs, authorcites
def _update_job_lastrun_time(jobname):
    """Update expJOB table and set lastrun time of JOBNAME to the task
    starting time."""
    run_sql("UPDATE expJOB SET lastrun=%s WHERE jobname=%s",
            (task_get_task_param('task_starting_time'), jobname,))