def compute_cache(pids):
    bibtask.write_message("WebAuthorProfile: %s persons to go" % len(pids),
                          stream=sys.stdout, verbose=0)
    for i, p in enumerate(pids):
        # Use the enumerate index instead of pids.index(p), which is O(n)
        # per iteration and wrong if a person id appears twice.
        bibtask.write_message("WebAuthorProfile: doing %s out of %s"
                              % (i + 1, len(pids)))
        bibtask.task_update_progress("WebAuthorProfile: doing %s out of %s"
                                     % (i + 1, len(pids)))
        _compute_cache_for_person(p)
        bibtask.task_sleep_now_if_required(can_stop_too=True)
def compute_cache(pids):
    bibtask.write_message("WebAuthorProfile: %s persons to go" % len(pids),
                          stream=sys.stdout, verbose=0)
    for i, p in enumerate(pids):
        bibtask.write_message("WebAuthorProfile: doing %s out of %s "
                              "(personid: %s)" % (i + 1, len(pids), p))
        bibtask.task_update_progress("WebAuthorProfile: doing %s out of %s "
                                     "(personid: %s)" % (i + 1, len(pids), p))
        _compute_cache_for_person(p)
        bibtask.task_sleep_now_if_required(can_stop_too=True)
def compute_cache_mp(pids):
    from multiprocessing import Pool
    p = Pool()
    bibtask.write_message("WebAuthorProfileMP: %s persons to go" % len(pids),
                          stream=sys.stdout, verbose=0)
    sl = 100
    ss = [pids[i: i + sl] for i in range(0, len(pids), sl)]
    for i, bunch in enumerate(ss):
        bibtask.write_message("WebAuthorProfileMP: doing bunch %s out of %s"
                              % (str(i + 1), len(ss)))
        bibtask.task_update_progress("WebAuthorProfileMP: doing bunch %s out of %s"
                                     % (str(i + 1), len(ss)))
        p.map(_compute_cache_for_person, bunch)
        bibtask.task_sleep_now_if_required(can_stop_too=True)
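# compute_cache() above processes person ids serially, while compute_cache_mp()
# cuts them into bunches of 100 and hands each bunch to a multiprocessing.Pool.
# A minimal, self-contained sketch of that batching pattern follows; the
# expensive_profile() stand-in and the batch size are illustrative assumptions,
# not part of the WebAuthorProfile code.

from multiprocessing import Pool


def expensive_profile(pid):
    # Stand-in for _compute_cache_for_person(); here it just squares the id.
    return pid * pid


def batched_map(pids, batch_size=100):
    # Split the id list into fixed-size bunches and run one Pool.map() per
    # bunch, mirroring the sl/ss chunking used by compute_cache_mp().
    pool = Pool()
    results = []
    batches = [pids[i:i + batch_size] for i in range(0, len(pids), batch_size)]
    for batch in batches:
        results.extend(pool.map(expensive_profile, batch))
    pool.close()
    pool.join()
    return results


if __name__ == '__main__':
    print batched_map(range(10), batch_size=4)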
def _task_run_core(): """ Runs the requested task in the bibsched environment. """ repair_pid = bibtask.task_get_option('repair_pid') fast_update_personid = bibtask.task_get_option('fast_update_personid') personid_gc = bibtask.task_get_option('personid_gc') record_ids = bibtask.task_get_option('record_ids') all_records = bibtask.task_get_option('all_records') if record_ids: record_ids_nested = [[p] for p in record_ids] else: record_ids_nested = None if repair_pid: bibtask.task_update_progress('Updating names cache...') _run_update_authornames_tables_from_paper() bibtask.task_sleep_now_if_required(can_stop_too=False) bibtask.task_update_progress('Removing person entities not touched by ' 'humans...') personid_remove_automatically_assigned_papers() bibtask.task_sleep_now_if_required(can_stop_too=False) bibtask.task_update_progress('Updating person entities...') update_personID_from_algorithm() bibtask.task_sleep_now_if_required(can_stop_too=False) bibtask.task_update_progress('Cleaning person tables...') _run_update_personID_table_from_paper() bibtask.task_sleep_now_if_required(can_stop_too=False) bibtask.task_update_progress('All repairs done.') if fast_update_personid: bibtask.task_update_progress('Updating personid...') _run_personid_fast_assign_papers(record_ids_nested, all_records) bibtask.task_update_progress('PersonID update finished!') if personid_gc: bibtask.task_update_progress('Updating personid (GC)...') _run_personid_gc(record_ids_nested, all_records) bibtask.task_update_progress('PersonID update finished (GC)!') return 1
def _task_run_core(): """ Runs the requested task in the bibsched environment. """ lastname = bibtask.task_get_option("lastname") process_all = bibtask.task_get_option("process_all") prepare_grid = bibtask.task_get_option("prepare_grid") load_grid = bibtask.task_get_option("load_grid_results") data_dir = bibtask.task_get_option("data_dir") prefix = bibtask.task_get_option("prefix") max_records_option = bibtask.task_get_option("max_records") update = bibtask.task_get_option("update") clean_cache = bibtask.task_get_option("clean_cache") update_cache = bibtask.task_get_option("update_cache") record_ids = bibtask.task_get_option("record_ids") record_ids_nested = None all_records = bibtask.task_get_option("all_records") repair_pid = bibtask.task_get_option("repair_pid") fast_update_personid = bibtask.task_get_option("fast_update_personid") if record_ids: record_ids_nested = [[p] for p in record_ids] if fast_update_personid: fast_update_personid = [[p] for p in fast_update_personid] # automated_daemon_mode_p = True if lastname: bibtask.write_message("Processing last name %s" % (lastname), stream=sys.stdout, verbose=0) if process_all: if bconfig.STANDALONE: bibtask.write_message("Processing not possible in standalone!", stream=sys.stdout, verbose=0) return 0 bibtask.write_message("Processing all names...", stream=sys.stdout, verbose=0) lengths = get_len_authornames_bibrefs() if not check_and_create_aid_tables(): bibtask.write_message("Failed to create database tables!", stream=sys.stdout, verbose=0) return 0 if lengths["names"] < 1: bibtask.write_message("Populating Authornames table. It's Empty.", stream=sys.stdout, verbose=0) bibtask.task_update_progress("Populating Authornames table.") populate_authornames() insert_user_log( "daemon", "-1", "UATFP", "bibsched", "status", comment="bibauthorid_daemon, " "update_authornames_tables_from_paper", ) if lengths["bibrefs"] < 1: bibtask.write_message("Populating Bibrefs lookup. 
It's Empty.", stream=sys.stdout, verbose=0) bibtask.task_update_progress("Populating Bibrefs lookup table.") populate_authornames_bibrefs_from_authornames() bibtask.task_update_progress("Processing all authors.") start_full_disambiguation( last_names="all", process_orphans=True, db_exists=False, populate_doclist=True, write_to_db=True ) update_personID_from_algorithm() insert_user_log( "daemon", "-1", "update_aid", "bibsched", "status", comment="bibauthorid_daemon, update_authorid_universe" ) if prepare_grid: bibtask.write_message("Preparing Grid Job", stream=sys.stdout, verbose=0) data_dir_name = "grid_data" workdir_prefix = "job" max_records = 4000 if data_dir: data_dir_name = data_dir if prefix: workdir_prefix = prefix if max_records_option: max_records = max_records_option _prepare_data_files_from_db(data_dir_name, workdir_prefix, max_records) if load_grid: bibtask.write_message( "Reading Grid Job results and will write" " them to the database.", stream=sys.stdout, verbose=0 ) _write_data_files_to_db(data_dir) if update or update_cache: bibtask.write_message("update-cache: Processing recently updated" " papers", stream=sys.stdout, verbose=0) bibtask.task_update_progress("update-cache: Processing recently" " updated papers") _run_update_authornames_tables_from_paper(record_ids_nested, all_records) bibtask.write_message("update-cache: Finished processing papers", stream=sys.stdout, verbose=0) bibtask.task_update_progress("update-cache: DONE") if update: bibtask.write_message("updating authorid universe", stream=sys.stdout, verbose=0) bibtask.task_update_progress("updating authorid universe") _update_authorid_universe(record_ids, all_records) bibtask.write_message("done updating authorid universe", stream=sys.stdout, verbose=0) bibtask.task_update_progress("done updating authorid universe") if clean_cache: bibtask.write_message("clean-cache: Processing recently updated" " papers", stream=sys.stdout, verbose=0) bibtask.task_update_progress("clean-cache: Processing recently updated" " papers for names") _run_authornames_tables_gc() bibtask.write_message("update-cache: Finished cleaning authornames " "tables", stream=sys.stdout, verbose=0) bibtask.task_update_progress("clean-cache: Processing recently updated" " papers for persons") _run_update_personID_table_from_paper(record_ids_nested, all_records) bibtask.write_message("update-cache: Finished cleaning PersonID" " table", stream=sys.stdout, verbose=0) bibtask.task_update_progress("clean-cache: DONE") if repair_pid: bibtask.task_update_progress("Updating names cache...") _run_update_authornames_tables_from_paper() bibtask.task_sleep_now_if_required(can_stop_too=False) bibtask.task_update_progress("Removing person entities not touched by " "humans...") personid_remove_automatically_assigned_papers() bibtask.task_sleep_now_if_required(can_stop_too=False) bibtask.task_update_progress("Updating person entities...") update_personID_from_algorithm() bibtask.task_sleep_now_if_required(can_stop_too=False) bibtask.task_update_progress("Cleaning person tables...") _run_update_personID_table_from_paper() bibtask.task_sleep_now_if_required(can_stop_too=False) bibtask.task_update_progress("All repairs done.") if fast_update_personid: bibtask.task_update_progress("Updating personid...") _run_personid_fast_assign_papers(fast_update_personid) bibtask.task_update_progress("Update finished...") # TODO: remember to pass the papers list! return 1
def personid_fast_assign_papers(paperslist=None,
                                use_threading_not_multiprocessing=True):
    '''
    Assign papers to the most compatible person.
    Compares only the name to find the right person to assign to.
    If nobody seems compatible, create a new person.
    '''
    class Worker(Thread):
        def __init__(self, i, p_q, atul, personid_new_id_lock, checker):
            Thread.__init__(self)
            self.i = i
            self.checker = checker
            self.p_q = p_q
            self.atul = atul
            self.personid_new_id_lock = personid_new_id_lock

        def run(self):
            while True:
                if checker.should_stop():
                    break
                try:
                    bibrec = self.p_q.get_nowait()
                except Empty:
                    break
                close_connection()
                pfap_assign_paper_iteration(self.i, bibrec, self.atul,
                                            self.personid_new_id_lock)

    def _pfap_assign_paper(i, p_q, atul, personid_new_id_lock, checker):
        while True:
            # check bibsched
            if checker.should_stop():
                break
            try:
                bibrec = p_q.get_nowait()
            except Empty:
                break
            pfap_assign_paper_iteration(i, bibrec, atul,
                                        personid_new_id_lock)

    _pfap_printmsg('starter', 'Started')
    if not paperslist:
        #paperslist = run_sql('select id from bibrec where 1')
        paperslist = [[x] for x in perform_request_search(p="")]

    paperslist = [k[0] for k in paperslist]

    _pfap_printmsg('starter', 'Starting on %s papers ' % len(paperslist))

    if use_threading_not_multiprocessing:
        authornames_table_update_lock = Lock()
        personid_new_id_lock = Lock()
        papers_q = Queue()
    else:
        authornames_table_update_lock = multiprocessing.Lock()
        personid_new_id_lock = multiprocessing.Lock()
        papers_q = multiprocessing.Queue()

    for p in paperslist:
        papers_q.put(p)

    process_list = []
    c = 0
    if not use_threading_not_multiprocessing:
        while not papers_q.empty():
            checker = status_checker()
            while len(process_list) <= bconfig.CFG_BIBAUTHORID_MAX_PROCESSES:
                p = multiprocessing.Process(target=_pfap_assign_paper,
                                            args=(c, papers_q,
                                                  authornames_table_update_lock,
                                                  personid_new_id_lock,
                                                  checker))
                c += 1
                process_list.append(p)
                p.start()

            for i, p in enumerate(tuple(process_list)):
                if not p.is_alive():
                    p.join()
                    process_list.remove(p)
            task_sleep_now_if_required(can_stop_too=False)
    else:
        max_processes = bconfig.CFG_BIBAUTHORID_PERSONID_SQL_MAX_THREADS
        checker = status_checker()
        workers = []
        while not papers_q.empty():
            i = 0
            while len(workers) < max_processes:
                w = Worker(i, papers_q, authornames_table_update_lock,
                           personid_new_id_lock, checker)
                i += 1
                w.start()
                workers.append(w)
            for c, p in enumerate(tuple(workers)):
                if not p.is_alive():
                    p.join()
                    workers.remove(p)
            task_sleep_now_if_required(can_stop_too=False)
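# personid_fast_assign_papers() drains a shared queue from a pool of either
# threads or processes. The isolated sketch below shows the same
# drain-until-Empty pattern with plain standard-library threads; the
# handle_paper() stand-in and the worker count are illustrative assumptions.

from Queue import Queue, Empty
from threading import Thread


def handle_paper(bibrec):
    # Stand-in for pfap_assign_paper_iteration().
    print "assigning paper %s" % bibrec


def drain(queue):
    # Each worker pops items until the queue raises Empty, then exits,
    # much like Worker.run() above (minus the bibsched stop check and the
    # database connection handling).
    while True:
        try:
            bibrec = queue.get_nowait()
        except Empty:
            break
        handle_paper(bibrec)


if __name__ == '__main__':
    q = Queue()
    for recid in range(20):
        q.put(recid)
    workers = [Thread(target=drain, args=(q,)) for _ in range(4)]
    for w in workers:
        w.start()
    for w in workers:
        w.join()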
def _task_run_core(): """Runs analyse_documents for each ontology, collection, record ids set.""" automated_daemon_mode_p = True recids = bibtask.task_get_option('recids') collections = bibtask.task_get_option('collections') taxonomy = bibtask.task_get_option('taxonomy') if recids or collections: # We want to run some records/collection only, so we are not # in the automated daemon mode; this will be useful later. automated_daemon_mode_p = False # Check if the user specified which documents to extract keywords from. if recids: onto_recids = _get_recids_foreach_ontology(recids=recids, taxonomy=taxonomy) elif collections: onto_recids = _get_recids_foreach_ontology(collections=collections, taxonomy=taxonomy) else: onto_recids = _get_recids_foreach_ontology() if not onto_recids: # Nothing to do. if automated_daemon_mode_p: _update_date_of_last_run(bibtask.task_get_task_param('task_starting_time')) return 1 # We will write to a temporary file as we go, because we might be processing # big collections with many docs _rid = time.strftime("%Y%m%d%H%M%S", time.localtime()) abs_path = bibclassify_engine.get_tmp_file(_rid) fo = open(abs_path, 'w') fo.write('<?xml version="1.0" encoding="UTF-8"?>\n') fo.write('<collection xmlns="http://www.loc.gov/MARC21/slim">\n') # Count the total number of records in order to update the progression. global _RECIDS_NUMBER for onto_rec in onto_recids: _RECIDS_NUMBER += len(onto_rec['recIDs']) rec_added = False for onto_rec in onto_recids: bibtask.task_sleep_now_if_required(can_stop_too=False) if onto_rec['collection'] is not None: bibtask.write_message('INFO: Applying taxonomy %s to collection %s (%s ' 'records)' % (onto_rec['ontology'], onto_rec['collection'], len(onto_rec['recIDs'])), stream=sys.stderr, verbose=3) else: bibtask.write_message('INFO: Applying taxonomy %s to recIDs %s. ' % (onto_rec['ontology'], ', '.join([str(recid) for recid in onto_rec['recIDs']])), stream=sys.stderr, verbose=3) if onto_rec['recIDs']: xml = _analyze_documents(onto_rec['recIDs'], onto_rec['ontology'], onto_rec['collection']) if len(xml) > 5: fo.write(xml) rec_added = True fo.write('</collection>\n') fo.close() # Apply the changes. if rec_added: if bconfig.CFG_DB_SAVE_KW: bibclassify_webinterface.upload_keywords(abs_path) else: bibtask.write_message("INFO: CFG_DB_SAVE_KW is false, we don't save results", stream=sys.stderr, verbose=0) else: bibtask.write_message("WARNING: No keywords found, recids: %s" % onto_recids, stream=sys.stderr, verbose=0) os.remove(abs_path) # Update the date of last run in the clsMETHOD table, but only if # we were running in an automated mode. if automated_daemon_mode_p: _update_date_of_last_run(bibtask.task_get_task_param('task_starting_time')) return 1
def _analyze_documents(records, taxonomy_name, collection,
                       output_limit=bconfig.CFG_BIBCLASSIFY_DEFAULT_OUTPUT_NUMBER):
    """For each collection, parse the documents attached to the records
    in collection with the corresponding taxonomy_name.
    @var records: list of recids to process
    @var taxonomy_name: str, name of the taxonomy, e.g. HEP
    @var collection: str, collection name
    @keyword output_limit: int, max number of keywords to extract [3]
    @return: str, marcxml output format of results
    """
    global _INDEX

    if not records:
        # No records could be found.
        bibtask.write_message("WARNING: No records were found in collection %s."
                              % collection, stream=sys.stderr, verbose=2)
        return False

    # Process records:
    output = []
    for record in records:
        # TODO: why this doesn't call list_all_files() ?
        bibdocfiles = BibRecDocs(record).list_latest_files()
        keywords = {}
        akws = {}
        acro = {}
        single_keywords = composite_keywords = author_keywords = acronyms = None

        for doc in bibdocfiles:
            # Get the keywords for all PDF documents contained in the record.
            if bibclassify_text_extractor.is_pdf(doc.get_full_path()):
                bibtask.write_message('INFO: Generating keywords for record %d.'
                                      % record, stream=sys.stderr, verbose=3)
                fulltext = doc.get_path()
                single_keywords, composite_keywords, author_keywords, acronyms = \
                    bibclassify_engine.get_keywords_from_local_file(
                        fulltext, taxonomy_name,
                        with_author_keywords=True,
                        output_mode="raw",
                        output_limit=output_limit,
                        match_mode='partial')
            else:
                bibtask.write_message('WARNING: BibClassify does not know how '
                                      'to process doc: %s (type: %s) '
                                      '-- ignoring it.'
                                      % (doc.fullpath, doc.doctype),
                                      stream=sys.stderr, verbose=3)

        if single_keywords or composite_keywords:
            cleaned_single = bibclassify_engine.clean_before_output(single_keywords)
            cleaned_composite = bibclassify_engine.clean_before_output(composite_keywords)
            # merge the groups into one
            keywords.update(cleaned_single)
            keywords.update(cleaned_composite)
            acro.update(acronyms)
            akws.update(author_keywords)

        if len(keywords):
            output.append('<record>')
            output.append('<controlfield tag="001">%s</controlfield>' % record)
            output.append(bibclassify_engine._output_marc(
                keywords.items(), (), akws, acro,
                spires=bconfig.CFG_SPIRES_FORMAT))
            output.append('</record>')
        else:
            bibtask.write_message('WARNING: No keywords found for record %d.'
                                  % record, stream=sys.stderr, verbose=0)

        _INDEX += 1

        bibtask.task_update_progress('Done %d out of %d.' %
                                     (_INDEX, _RECIDS_NUMBER))
        bibtask.task_sleep_now_if_required(can_stop_too=False)

    return '\n'.join(output)
deleted_recs = dbinter.get_deleted_papers()
deleted_recs = frozenset(x[0] for x in deleted_recs)
if bconfig.TABLES_UTILS_DEBUG:
    print "%d total deleted papers" % (len(deleted_recs),)

if personid:
    personid_q = dbinter.list_2_SQL_str(personid, lambda x: str(x[0]))
else:
    personid_q = None

counter = 0
rows_limit = 10000000
end_loop = False
while not end_loop:
    task_sleep_now_if_required(True)
    papers_data = dbinter.collect_personid_papers(person=personid_q,
                                                  limit=(counter, rows_limit,))
    if bconfig.TABLES_UTILS_DEBUG:
        print "query with limit %d %d" % (counter, rows_limit)

    if len(papers_data) == rows_limit:
        counter += rows_limit
    else:
        end_loop = True

    papers_data = tuple((extract_bibrec(p[3]), p) for p in papers_data)
    to_remove = set()
    jobs = dict()
    for p in papers_data:
def _task_run_core(): """Runs analyse_documents for each ontology, collection, record ids set.""" automated_daemon_mode_p = True recids = bibtask.task_get_option('recids') collections = bibtask.task_get_option('collections') taxonomy = bibtask.task_get_option('taxonomy') if recids or collections: # We want to run some records/collection only, so we are not # in the automated daemon mode; this will be useful later. automated_daemon_mode_p = False # Check if the user specified which documents to extract keywords from. if recids: onto_recids = _get_recids_foreach_ontology(recids=recids, taxonomy=taxonomy) elif collections: onto_recids = _get_recids_foreach_ontology(collections=collections, taxonomy=taxonomy) else: onto_recids = _get_recids_foreach_ontology() if not onto_recids: # Nothing to do. if automated_daemon_mode_p: _update_date_of_last_run( bibtask.task_get_task_param('task_starting_time')) return 1 # We will write to a temporary file as we go, because we might be processing # big collections with many docs _rid = time.strftime("%Y%m%d%H%M%S", time.localtime()) abs_path = bibclassify_engine.get_tmp_file(_rid) fo = open(abs_path, 'w') fo.write('<?xml version="1.0" encoding="UTF-8"?>\n') fo.write('<collection xmlns="http://www.loc.gov/MARC21/slim">\n') # Count the total number of records in order to update the progression. global _RECIDS_NUMBER for onto_rec in onto_recids: _RECIDS_NUMBER += len(onto_rec['recIDs']) rec_added = False for onto_rec in onto_recids: bibtask.task_sleep_now_if_required(can_stop_too=False) if onto_rec['collection'] is not None: bibtask.write_message( 'INFO: Applying taxonomy %s to collection %s (%s ' 'records)' % (onto_rec['ontology'], onto_rec['collection'], len(onto_rec['recIDs'])), stream=sys.stderr, verbose=3) else: bibtask.write_message( 'INFO: Applying taxonomy %s to recIDs %s. ' % (onto_rec['ontology'], ', '.join( [str(recid) for recid in onto_rec['recIDs']])), stream=sys.stderr, verbose=3) if onto_rec['recIDs']: xml = _analyze_documents(onto_rec['recIDs'], onto_rec['ontology'], onto_rec['collection']) if len(xml) > 5: fo.write(xml) rec_added = True fo.write('</collection>\n') fo.close() # Apply the changes. if rec_added: if bconfig.CFG_DB_SAVE_KW: bibclassify_webinterface.upload_keywords(abs_path) else: bibtask.write_message( "INFO: CFG_DB_SAVE_KW is false, we don't save results", stream=sys.stderr, verbose=0) else: bibtask.write_message("WARNING: No keywords found, recids: %s" % onto_recids, stream=sys.stderr, verbose=0) os.remove(abs_path) # Update the date of last run in the clsMETHOD table, but only if # we were running in an automated mode. if automated_daemon_mode_p: _update_date_of_last_run( bibtask.task_get_task_param('task_starting_time')) return 1
def _task_run_core(): """ Runs the requested task in the bibsched environment. """ lastname = bibtask.task_get_option('lastname') process_all = bibtask.task_get_option('process_all') prepare_grid = bibtask.task_get_option('prepare_grid') load_grid = bibtask.task_get_option('load_grid_results') data_dir = bibtask.task_get_option('data_dir') prefix = bibtask.task_get_option('prefix') max_records_option = bibtask.task_get_option('max_records') update = bibtask.task_get_option('update') clean_cache = bibtask.task_get_option('clean_cache') update_cache = bibtask.task_get_option('update_cache') record_ids = bibtask.task_get_option('record_ids') record_ids_nested = None all_records = bibtask.task_get_option('all_records') repair_pid = bibtask.task_get_option('repair_pid') if record_ids: record_ids_nested = [[p] for p in record_ids] # automated_daemon_mode_p = True if lastname: bibtask.write_message("Processing last name %s" % (lastname), stream=sys.stdout, verbose=0) if process_all: if bconfig.STANDALONE: bibtask.write_message("Processing not possible in standalone!", stream=sys.stdout, verbose=0) return 0 bibtask.write_message("Processing all names...", stream=sys.stdout, verbose=0) lengths = get_len_authornames_bibrefs() if not check_and_create_aid_tables(): bibtask.write_message("Failed to create database tables!", stream=sys.stdout, verbose=0) return 0 if lengths['names'] < 1: bibtask.write_message("Populating Authornames table. It's Empty.", stream=sys.stdout, verbose=0) bibtask.task_update_progress('Populating Authornames table.') populate_authornames() insert_user_log('daemon', '-1', 'UATFP', 'bibsched', 'status', comment='bibauthorid_daemon, ' 'update_authornames_tables_from_paper') if lengths['bibrefs'] < 1: bibtask.write_message("Populating Bibrefs lookup. 
It's Empty.", stream=sys.stdout, verbose=0) bibtask.task_update_progress('Populating Bibrefs lookup table.') populate_authornames_bibrefs_from_authornames() bibtask.task_update_progress('Processing all authors.') start_full_disambiguation(last_names="all", process_orphans=True, db_exists=False, populate_doclist=True, write_to_db=True) update_personID_from_algorithm() insert_user_log('daemon', '-1', 'update_aid', 'bibsched', 'status', comment='bibauthorid_daemon, update_authorid_universe') if prepare_grid: bibtask.write_message("Preparing Grid Job", stream=sys.stdout, verbose=0) data_dir_name = "grid_data" workdir_prefix = "job" max_records = 4000 if data_dir: data_dir_name = data_dir if prefix: workdir_prefix = prefix if max_records_option: max_records = max_records_option _prepare_data_files_from_db(data_dir_name, workdir_prefix, max_records) if load_grid: bibtask.write_message("Reading Grid Job results and will write" " them to the database.", stream=sys.stdout, verbose=0) _write_data_files_to_db(data_dir) if update or update_cache: bibtask.write_message("update-cache: Processing recently updated" " papers", stream=sys.stdout, verbose=0) bibtask.task_update_progress('update-cache: Processing recently' ' updated papers') _run_update_authornames_tables_from_paper(record_ids_nested, all_records) bibtask.write_message("update-cache: Finished processing papers", stream=sys.stdout, verbose=0) bibtask.task_update_progress('update-cache: DONE') if update: bibtask.write_message("updating authorid universe", stream=sys.stdout, verbose=0) bibtask.task_update_progress('updating authorid universe') _update_authorid_universe(record_ids, all_records) bibtask.write_message("done updating authorid universe", stream=sys.stdout, verbose=0) bibtask.task_update_progress('done updating authorid universe') if clean_cache: bibtask.write_message("clean-cache: Processing recently updated" " papers", stream=sys.stdout, verbose=0) bibtask.task_update_progress('clean-cache: Processing recently updated' ' papers for names') _run_authornames_tables_gc() bibtask.write_message("update-cache: Finished cleaning authornames " "tables", stream=sys.stdout, verbose=0) bibtask.task_update_progress('clean-cache: Processing recently updated' ' papers for persons') _run_update_personID_table_from_paper(record_ids_nested, all_records) bibtask.write_message("update-cache: Finished cleaning PersonID" " table", stream=sys.stdout, verbose=0) bibtask.task_update_progress('clean-cache: DONE') if repair_pid: bibtask.task_update_progress('Updating names cache...') _run_update_authornames_tables_from_paper() bibtask.task_sleep_now_if_required(can_stop_too=False) bibtask.task_update_progress('Removing person entities not touched by ' 'humans...') personid_remove_automatically_assigned_papers() bibtask.task_sleep_now_if_required(can_stop_too=False) bibtask.task_update_progress('Updating person entities...') update_personID_from_algorithm() bibtask.task_sleep_now_if_required(can_stop_too=False) bibtask.task_update_progress('Cleaning person tables...') _run_update_personID_table_from_paper() bibtask.task_sleep_now_if_required(can_stop_too=False) bibtask.task_update_progress('All repairs done.') return 1
def _prepare_data_files_from_db(data_dir_name="grid_data", workdir_prefix="job", max_records=4000): ''' Prepares grid jobs. Is a task running in bibsched. Meaning: 1. Find all last names in the database 2. For each last name: - find all documents regarding this last name (ignore first names) - if number of documents loaded into memory exceeds max_records, write the memory cache into files (cf. Files section). Each write back procedure will happen into a newly created directory. The prefix for the respective job directory may be specified as well as the name of the data directory where these job directories will be created. Files: - authornames.dat - virtual_authors.dat - virtual_author_data.dat - virtual_author_clusters.dat - virtual_author_cluster_cache.dat - realauthors.dat - realauthor_data.dat - doclist.dat - records.dat - ids.dat - ra_va_cache.dat @param data_dir_name: the name of the directory that will hold all the sub directories for the jobs. @type data_dir_name: string @param workdir_prefix: prefix for the job sub directories. @type workdir_prefix: string @param max_records: maximum number of records after which the memory cache is to be flushed to files. @type max_records: int ''' try: max_records = int(max_records) except ValueError: max_records = 4000 bibtask.write_message("Loading last names", stream=sys.stdout, verbose=0) bibtask.write_message("Limiting files to %s records" % (max_records, ), stream=sys.stdout, verbose=0) bibtask.task_update_progress('Loading last names...') last_names = find_all_last_names() last_name_queue = Queue.Queue() for last_name in sorted(last_names): last_name_queue.put(last_name) total = len(last_names) status = 1 bibtask.write_message("Done. Loaded %s last names." % (total), stream=sys.stdout, verbose=0) job_id = 0 data_dir = "" if data_dir_name.startswith("/"): data_dir = data_dir_name else: data_dir = "%s/%s/" % (bconfig.FILE_PATH, data_dir_name) if not data_dir.endswith("/"): data_dir = "%s/" % (data_dir, ) job_lnames = [] while True: if last_name_queue.empty(): bibtask.write_message("Done with all names.", stream=sys.stdout, verbose=0) break bibtask.task_sleep_now_if_required(can_stop_too=False) lname_list = last_name_queue.get() lname = None if lname_list: lname = lname_list[0] del (lname_list[0]) else: bconfig.LOGGER.warning("Got an empty Queue element. " "Queue seems corrupted.") continue job_lnames.append(lname) bibtask.task_update_progress('Preparing job %d of %d: %s.' 
% (status, total, lname)) bibtask.write_message( ("Processing: %s (%d/%d).") % (lname, status, total), stream=sys.stdout, verbose=0) bibtask.task_sleep_now_if_required(can_stop_too=False) populate_doclist_for_author_surname(lname) post_remove_names = set() for name in [ row['name'] for row in dat.AUTHOR_NAMES if not row['processed'] ]: potential_removal = "%s," % (name.split(',')[0], ) if not potential_removal == "%s" % (lname, ): post_remove_names.add(potential_removal) if len(post_remove_names) > 1: removed = 0 removed_names = [] for post_remove_name in post_remove_names: if post_remove_name in lname_list: lname_list.remove(post_remove_name) removed_names.append(post_remove_name) removed += 1 bibtask.write_message( ("-> Removed %s entries from the " + "computation list: %s") % (removed, removed_names), stream=sys.stdout, verbose=0) total -= removed if lname_list: last_name_queue.put(lname_list) if len(dat.RELEVANT_RECORDS) >= max_records: if not os.path.exists(data_dir): os.mkdir(data_dir) work_dir = "%s%s%s" % (data_dir, workdir_prefix, job_id) _write_to_files(work_dir, job_lnames) bibtask.task_sleep_now_if_required(can_stop_too=True) job_lnames = [] job_id += 1 status += 1 if dat.RELEVANT_RECORDS: if not os.path.exists(data_dir): os.mkdir(data_dir) work_dir = "%s%s%s" % (data_dir, workdir_prefix, job_id) _write_to_files(work_dir, job_lnames) bibtask.task_sleep_now_if_required(can_stop_too=True) return True
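# _prepare_data_files_from_db() above accumulates records in memory and
# flushes them into a fresh job directory whenever the in-memory count
# exceeds max_records. The sketch below reproduces that flush-when-full idea
# in isolation; the item source, the single records.dat output file and the
# directory layout are illustrative assumptions, not the actual .dat writers.

import os


def flush(work_dir, items):
    # Stand-in for _write_to_files(): one directory per job.
    if not os.path.exists(work_dir):
        os.makedirs(work_dir)
    out = open(os.path.join(work_dir, 'records.dat'), 'w')
    out.write('\n'.join(str(i) for i in items))
    out.close()


def batch_into_jobs(items, data_dir='grid_data', prefix='job', max_records=4000):
    job_id = 0
    pending = []
    for item in items:
        pending.append(item)
        if len(pending) >= max_records:
            flush(os.path.join(data_dir, '%s%d' % (prefix, job_id)), pending)
            pending = []
            job_id += 1
    if pending:
        flush(os.path.join(data_dir, '%s%d' % (prefix, job_id)), pending)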
deleted_recs = dbinter.get_deleted_papers()
deleted_recs = frozenset(x[0] for x in deleted_recs)
if bconfig.TABLES_UTILS_DEBUG:
    print "%d total deleted papers" % (len(deleted_recs),)

if personid:
    personid_q = dbinter.list_2_SQL_str(personid, lambda x: str(x[0]))
else:
    personid_q = None

counter = 0
rows_limit = 10000000
end_loop = False
while not end_loop:
    task_sleep_now_if_required(True)
    papers_data = dbinter.collect_personid_papers(person=personid_q,
                                                  limit=(counter, rows_limit))
    if bconfig.TABLES_UTILS_DEBUG:
        print "query with limit %d %d" % (counter, rows_limit)

    if len(papers_data) == rows_limit:
        counter += rows_limit
    else:
        end_loop = True

    papers_data = tuple((extract_bibrec(p[3]), p) for p in papers_data)
    to_remove = set()
    jobs = dict()
    for p in papers_data:
        if int(p[0]) in deleted_recs:
def _task_run_core(): """ Runs the requested task in the bibsched environment. """ lastname = bibtask.task_get_option('lastname') process_all = bibtask.task_get_option('process_all') prepare_grid = bibtask.task_get_option('prepare_grid') load_grid = bibtask.task_get_option('load_grid_results') data_dir = bibtask.task_get_option('data_dir') prefix = bibtask.task_get_option('prefix') max_records_option = bibtask.task_get_option('max_records') update = bibtask.task_get_option('update') clean_cache = bibtask.task_get_option('clean_cache') update_cache = bibtask.task_get_option('update_cache') record_ids = bibtask.task_get_option('record_ids') record_ids_nested = None all_records = bibtask.task_get_option('all_records') repair_pid = bibtask.task_get_option('repair_pid') if record_ids: record_ids_nested = [[p] for p in record_ids] # automated_daemon_mode_p = True if lastname: bibtask.write_message("Processing last name %s" % (lastname), stream=sys.stdout, verbose=0) if process_all: if bconfig.STANDALONE: bibtask.write_message("Processing not possible in standalone!", stream=sys.stdout, verbose=0) return 0 bibtask.write_message("Processing all names...", stream=sys.stdout, verbose=0) lengths = get_len_authornames_bibrefs() if not check_and_create_aid_tables(): bibtask.write_message("Failed to create database tables!", stream=sys.stdout, verbose=0) return 0 if lengths['names'] < 1: bibtask.write_message("Populating Authornames table. It's Empty.", stream=sys.stdout, verbose=0) bibtask.task_update_progress('Populating Authornames table.') populate_authornames() insert_user_log('daemon', '-1', 'UATFP', 'bibsched', 'status', comment='bibauthorid_daemon, ' 'update_authornames_tables_from_paper') if lengths['bibrefs'] < 1: bibtask.write_message("Populating Bibrefs lookup. 
It's Empty.", stream=sys.stdout, verbose=0) bibtask.task_update_progress('Populating Bibrefs lookup table.') populate_authornames_bibrefs_from_authornames() bibtask.task_update_progress('Processing all authors.') start_full_disambiguation(last_names="all", process_orphans=True, db_exists=False, populate_doclist=True, write_to_db=True) update_personID_from_algorithm() insert_user_log('daemon', '-1', 'update_aid', 'bibsched', 'status', comment='bibauthorid_daemon, update_authorid_universe') if prepare_grid: bibtask.write_message("Preparing Grid Job", stream=sys.stdout, verbose=0) data_dir_name = "grid_data" workdir_prefix = "job" max_records = 4000 if data_dir: data_dir_name = data_dir if prefix: workdir_prefix = prefix if max_records_option: max_records = max_records_option _prepare_data_files_from_db(data_dir_name, workdir_prefix, max_records) if load_grid: bibtask.write_message( "Reading Grid Job results and will write" " them to the database.", stream=sys.stdout, verbose=0) _write_data_files_to_db(data_dir) if update or update_cache: bibtask.write_message( "update-cache: Processing recently updated" " papers", stream=sys.stdout, verbose=0) bibtask.task_update_progress('update-cache: Processing recently' ' updated papers') _run_update_authornames_tables_from_paper(record_ids_nested, all_records) bibtask.write_message("update-cache: Finished processing papers", stream=sys.stdout, verbose=0) bibtask.task_update_progress('update-cache: DONE') if update: bibtask.write_message("updating authorid universe", stream=sys.stdout, verbose=0) bibtask.task_update_progress('updating authorid universe') _update_authorid_universe(record_ids, all_records) bibtask.write_message("done updating authorid universe", stream=sys.stdout, verbose=0) bibtask.task_update_progress('done updating authorid universe') if clean_cache: bibtask.write_message( "clean-cache: Processing recently updated" " papers", stream=sys.stdout, verbose=0) bibtask.task_update_progress('clean-cache: Processing recently updated' ' papers for names') _run_authornames_tables_gc() bibtask.write_message( "update-cache: Finished cleaning authornames " "tables", stream=sys.stdout, verbose=0) bibtask.task_update_progress('clean-cache: Processing recently updated' ' papers for persons') _run_update_personID_table_from_paper(record_ids_nested, all_records) bibtask.write_message( "update-cache: Finished cleaning PersonID" " table", stream=sys.stdout, verbose=0) bibtask.task_update_progress('clean-cache: DONE') if repair_pid: bibtask.task_update_progress('Updating names cache...') _run_update_authornames_tables_from_paper() bibtask.task_sleep_now_if_required(can_stop_too=False) bibtask.task_update_progress('Removing person entities not touched by ' 'humans...') personid_remove_automatically_assigned_papers() bibtask.task_sleep_now_if_required(can_stop_too=False) bibtask.task_update_progress('Updating person entities...') update_personID_from_algorithm() bibtask.task_sleep_now_if_required(can_stop_too=False) bibtask.task_update_progress('Cleaning person tables...') _run_update_personID_table_from_paper() bibtask.task_sleep_now_if_required(can_stop_too=False) bibtask.task_update_progress('All repairs done.') return 1
def _update_authorid_universe(record_ids=None, all_records=False):
    '''
    Updates all data related to the authorid algorithm.
    Sequence of operations:
        - Get all recently updated papers and remember time in the log
        - Get all authors on all papers
        - Extract collection of last names
        - For each last name:
            - Populate mem cache with cluster data
            - Delete updated records and their virtual authors from mem cache
            - Create virtual authors for new and updated records
            - Start matching algorithm
        - Update tables with results of the computation
        - Start personid update procedure
    '''
    def create_vas_from_specific_doclist(bibrec_ids):
        '''
        Processes the document list and creates a new minimal virtual author
        for each author in each record specified in the given list.

        @param bibrec_ids: Record IDs to concern in this update
        @type bibrec_ids: list of int
        '''
        num_docs = len([row for row in dat.DOC_LIST
                        if row['bibrecid'] in bibrec_ids])
        bconfig.LOGGER.log(25, "Creating minimal virtual authors for "
                               "all loaded docs (%s)" % (num_docs))

        for docs in [row for row in dat.DOC_LIST
                     if row['bibrecid'] in bibrec_ids]:
            for author_id in docs['authornameids']:
                author_name = [an['name'] for an in dat.AUTHOR_NAMES
                               if an['id'] == author_id]
                refrecs = [ref[1] for ref in docs['authornameid_bibrefrec']
                           if ref[0] == author_id]
                refrec = -1

                if len(refrecs) > 1:
                    refrec = refrecs[0]
                elif refrecs:
                    refrec = refrecs[0]

                if refrec and author_name:
                    add_minimum_virtualauthor(author_id, author_name[0],
                                              docs['bibrecid'], 0, [], refrec)
                elif author_name:
                    add_minimum_virtualauthor(author_id, author_name[0],
                                              docs['bibrecid'], 0, [])

    dat.reset_mem_cache(True)
    last_log = None
    updated_records = []

    if not record_ids and not all_records:
        last_log = get_user_log(userinfo='daemon',
                                action='update_aid',
                                only_most_recent=True)
        if last_log:
            #select only the most recent papers
            recently_modified, last_update_time = get_papers_recently_modified(
                date=last_log[0][2])
            insert_user_log('daemon', '-1', 'update_aid', 'bibsched', 'status',
                            comment='bibauthorid_daemon, '
                                    'update_authorid_universe',
                            timestamp=last_update_time[0][0])
            bibtask.write_message("Update authorid will operate on %s records."
                                  % (len(recently_modified)),
                                  stream=sys.stdout, verbose=0)

            if not recently_modified:
                bibtask.write_message("Update authorid: Nothing to do",
                                      stream=sys.stdout, verbose=0)
                return

            for rec in recently_modified:
                updated_records.append(rec[0])
                dat.update_log("rec_updates", rec[0])
        else:
            bibtask.write_message("Update authorid: Nothing to do",
                                  stream=sys.stdout, verbose=0)
            return
    elif record_ids and not all_records:
        updated_records = record_ids
    elif not record_ids and all_records:
        bibtask.write_message("Update is going to empty all aid tables...",
                              stream=sys.stdout, verbose=0)
        empty_aid_tables()
        bibtask.write_message("Update authorid will operate on all! records.",
                              stream=sys.stdout, verbose=0)
        bibtask.task_update_progress('Update is operating on all! records.')
        start_full_disambiguation(process_orphans=True,
                                  db_exists=False,
                                  populate_doclist=True,
                                  write_to_db=True)
        bibtask.task_update_progress('Update is done.')
        return

    bibtask.task_sleep_now_if_required(can_stop_too=True)

    authors = []
    author_last_names = set()

    bibtask.task_update_progress('Reading authors from updated records')
    bibtask.write_message("Reading authors from updated records",
                          stream=sys.stdout, verbose=0)
    updated_ras = set()

    # get all authors from all updated records
    for rec in updated_records:
        rec_authors = get_field_values_on_condition(rec, ['100', '700'], "a",
                                                    source="API")

        for rec_author in rec_authors:
            if not rec_author:
                bconfig.LOGGER.error("Invalid empty author string, which "
                                     "will be skipped on record %s" % (rec))
                continue

            author_in_list = [row for row in authors
                              if row['db_name'] == rec_author]

            if author_in_list:
                for upd in [row for row in authors
                            if row['db_name'] == rec_author]:
                    upd['records'].append(rec)
            else:
                last_name = split_name_parts(rec_author)[0]
                author_last_names.add(last_name)
                authors.append({'db_name': rec_author,
                                'records': [rec],
                                'last_name': last_name})

    for status, author_last_name in enumerate(author_last_names):
        bibtask.task_sleep_now_if_required(can_stop_too=False)
        current_authors = [row for row in authors
                           if row['last_name'] == author_last_name]
        total_lnames = len(author_last_names)
        total_authors = len(current_authors)

        bibtask.task_update_progress('Processing %s of %s cluster: "%s" '
                                     '(%s authors)'
                                     % (status + 1, total_lnames,
                                        author_last_name, total_authors))
        bibtask.write_message('Processing %s of %s cluster: "%s" '
                              '(%s authors)'
                              % (status + 1, total_lnames,
                                 author_last_name, total_authors),
                              stream=sys.stdout, verbose=0)

        dat.reset_mem_cache(True)
        init_authornames(author_last_name)
        load_mem_cache_from_tables()
        bconfig.LOGGER.log(25, "-- Relevant data successfully read into memory"
                               " to start processing")

        for current_author in current_authors:
            load_records_to_mem_cache(current_author['records'])
            authornamesid = [row['id'] for row in dat.AUTHOR_NAMES
                             if row['db_name'] == current_author['db_name']]

            if not authornamesid:
                bconfig.LOGGER.error("The author '%s' rec '%s' is not in "
                                     "authornames and will be skipped. "
                                     "You might want to run authornames "
                                     "update before?"
                                     % (current_author['db_name'], rec))
                continue
            else:
                try:
                    authornamesid = int(authornamesid[0])
                except (IndexError, TypeError, ValueError):
                    bconfig.LOGGER.error("Invalid authornames ID!")
                    continue

            if not current_author['records']:
                bconfig.LOGGER.error("The author '%s' is not associated to any"
                                     " document and will be skipped."
                                     % (current_author['db_name']))
                continue

            for rec in current_author['records']:
                # remove VAs already existing for the record
                va_ids = get_va_ids_by_recid_lname(rec,
                                                   current_author["last_name"])

                if va_ids:
                    for va_id in va_ids:
                        ra_list = get_realauthors_by_virtuala_id(va_id)

                        for ra_id in ra_list:
                            remove_va_from_ra(ra_id, va_id)
                            del_ra_data_by_vaid(ra_id, va_id)

                        va_anames_id = get_virtualauthor_records(
                            va_id, "orig_authorname_id")

                        for an_list in [row['authornameids'] for row
                                        in dat.DOC_LIST
                                        if row['bibrecid'] == rec]:
                            try:
                                an_list.remove(va_anames_id)
                            except (ValueError):
                                # This names id is not in the list...don't care
                                pass

                        delete_virtual_author(va_id)

                # create new VAs for the record.
                update_doclist(rec, authornamesid)
                dat.update_log("rec_updates", rec)

            create_vas_from_specific_doclist(current_author['records'])

        bconfig.LOGGER.log(25, "-- Relevant data pre-processed successfully.")
        bibtask.task_sleep_now_if_required(can_stop_too=False)
        start_computation(process_doclist=False,
                          process_orphans=True,
                          print_stats=True)
        bconfig.LOGGER.log(25, "-- Computation finished. Will write back to "
                               "the database now.")
        update_db_result = update_tables_from_mem_cache(return_ra_updates=True)
        bibtask.task_sleep_now_if_required(can_stop_too=True)

        if not update_db_result[0]:
            bconfig.LOGGER.log(25, "Writing to persistence layer failed.")
        else:
            if update_db_result[1]:
                for updated_ra in update_db_result[1]:
                    if updated_ra:
                        updated_ras.add(updated_ra[0])

            bconfig.LOGGER.log(25, "Done updating authorid universe.")

    personid_ra_format = []

    for ra_id in updated_ras:
        personid_ra_format.append((ra_id,))

    bconfig.LOGGER.log(25, "Will now run personid update to make the "
                           "changes visible also on the front end and to "
                           "create person IDs for %s newly created and changed "
                           "authors." % len(updated_ras))
    bibtask.task_update_progress('Updating persistent Person IDs')
    bibtask.task_sleep_now_if_required(can_stop_too=False)
    update_personID_from_algorithm(personid_ra_format)
    bconfig.LOGGER.log(25, "Done updating everything. Thanks for flying "
                           "with bibauthorid!")
def _prepare_data_files_from_db(data_dir_name="grid_data", workdir_prefix="job", max_records=4000): ''' Prepares grid jobs. Is a task running in bibsched. Meaning: 1. Find all last names in the database 2. For each last name: - find all documents regarding this last name (ignore first names) - if number of documents loaded into memory exceeds max_records, write the memory cache into files (cf. Files section). Each write back procedure will happen into a newly created directory. The prefix for the respective job directory may be specified as well as the name of the data directory where these job directories will be created. Files: - authornames.dat - virtual_authors.dat - virtual_author_data.dat - virtual_author_clusters.dat - virtual_author_cluster_cache.dat - realauthors.dat - realauthor_data.dat - doclist.dat - records.dat - ids.dat - ra_va_cache.dat @param data_dir_name: the name of the directory that will hold all the sub directories for the jobs. @type data_dir_name: string @param workdir_prefix: prefix for the job sub directories. @type workdir_prefix: string @param max_records: maximum number of records after which the memory cache is to be flushed to files. @type max_records: int ''' try: max_records = int(max_records) except ValueError: max_records = 4000 bibtask.write_message("Loading last names", stream=sys.stdout, verbose=0) bibtask.write_message("Limiting files to %s records" % (max_records,), stream=sys.stdout, verbose=0) bibtask.task_update_progress('Loading last names...') last_names = find_all_last_names() last_name_queue = Queue.Queue() for last_name in sorted(last_names): last_name_queue.put(last_name) total = len(last_names) status = 1 bibtask.write_message("Done. Loaded %s last names." % (total), stream=sys.stdout, verbose=0) job_id = 0 data_dir = "" if data_dir_name.startswith("/"): data_dir = data_dir_name else: data_dir = "%s/%s/" % (bconfig.FILE_PATH, data_dir_name) if not data_dir.endswith("/"): data_dir = "%s/" % (data_dir,) job_lnames = [] while True: if last_name_queue.empty(): bibtask.write_message("Done with all names.", stream=sys.stdout, verbose=0) break bibtask.task_sleep_now_if_required(can_stop_too=False) lname_list = last_name_queue.get() lname = None if lname_list: lname = lname_list[0] del(lname_list[0]) else: bconfig.LOGGER.warning("Got an empty Queue element. " "Queue seems corrupted.") continue job_lnames.append(lname) bibtask.task_update_progress('Preparing job %d of %d: %s.' 
% (status, total, lname)) bibtask.write_message(("Processing: %s (%d/%d).") % (lname, status, total), stream=sys.stdout, verbose=0) bibtask.task_sleep_now_if_required(can_stop_too=False) populate_doclist_for_author_surname(lname) post_remove_names = set() for name in [row['name'] for row in dat.AUTHOR_NAMES if not row['processed']]: potential_removal = "%s," % (name.split(',')[0],) if not potential_removal == "%s" % (lname,): post_remove_names.add(potential_removal) if len(post_remove_names) > 1: removed = 0 removed_names = [] for post_remove_name in post_remove_names: if post_remove_name in lname_list: lname_list.remove(post_remove_name) removed_names.append(post_remove_name) removed += 1 bibtask.write_message(("-> Removed %s entries from the " + "computation list: %s") % (removed, removed_names), stream=sys.stdout, verbose=0) total -= removed if lname_list: last_name_queue.put(lname_list) if len(dat.RELEVANT_RECORDS) >= max_records: if not os.path.exists(data_dir): os.mkdir(data_dir) work_dir = "%s%s%s" % (data_dir, workdir_prefix, job_id) _write_to_files(work_dir, job_lnames) bibtask.task_sleep_now_if_required(can_stop_too=True) job_lnames = [] job_id += 1 status += 1 if dat.RELEVANT_RECORDS: if not os.path.exists(data_dir): os.mkdir(data_dir) work_dir = "%s%s%s" % (data_dir, workdir_prefix, job_id) _write_to_files(work_dir, job_lnames) bibtask.task_sleep_now_if_required(can_stop_too=True) return True
def rabbit(bibrecs, check_invalid_papers=False):
    '''
    @param bibrecs: an iterable full of bibrecs
    @type bibrecs: an iterable of ints
    @return: none
    '''
    compare_names = cached_sym(lambda x: x)(comp_names)
    # fast assign threshold
    threshold = 0.80

    if not bibrecs or check_invalid_papers:
        all_bibrecs = get_all_valid_bibrecs()

        if not bibrecs:
            bibrecs = all_bibrecs

        if check_invalid_papers:
            filter_bibrecs_outside(all_bibrecs)

    updated_pids = set()
    deleted = frozenset(p[0] for p in get_deleted_papers())

    for idx, rec in enumerate(bibrecs):
        task_sleep_now_if_required(True)
        update_status(float(idx) / len(bibrecs),
                      "%d/%d current: %d" % (idx, len(bibrecs), rec))

        if rec in deleted:
            delete_paper_from_personid(rec)
            continue

        markrefs = frozenset(chain(
            izip(cycle([100]), imap(itemgetter(0), get_authors_from_paper(rec))),
            izip(cycle([700]), imap(itemgetter(0), get_coauthors_from_paper(rec)))))

        personid_rows = [map(int, row[:3]) + [row[4]]
                         for row in get_signatures_from_rec(rec)]
        personidrefs_names = dict(((row[1], row[2]), row[3])
                                  for row in personid_rows)

        personidrefs = frozenset(personidrefs_names.keys())
        new_signatures = list(markrefs - personidrefs)
        old_signatures = list(personidrefs - markrefs)

        new_signatures_names = dict(
            (new, create_normalized_name(split_name_parts(
                get_name_by_bibrecref(new))))
            for new in new_signatures)

        # matrix |new_signatures| X |old_signatures|
        matrix = [[compare_names(new_signatures_names[new],
                                 personidrefs_names[old])
                   for old in old_signatures]
                  for new in new_signatures]

        # [(new_signatures, old_signatures)]
        best_match = [(new_signatures[new], old_signatures[old])
                      for new, old, score in maximized_mapping(matrix)
                      if score > threshold]

        for new, old in best_match:
            modify_signature(old, rec, new, new_signatures_names[new])

        remove_sigs(tuple(list(old) + [rec]) for old in old_signatures)

        not_matched = frozenset(new_signatures) - frozenset(
            map(itemgetter(0), best_match))

        if not_matched:
            used_pids = set(r[0] for r in personid_rows)

        for sig in not_matched:
            name = new_signatures_names[sig]
            matched_pids = find_pids_by_exact_name(name)
            matched_pids = [p for p in matched_pids
                            if int(p[0]) not in used_pids]

            if not matched_pids:
                new_pid = new_person_from_signature(list(sig) + [rec], name)
                used_pids.add(new_pid)
                updated_pids.add(new_pid)
            else:
                add_signature(list(sig) + [rec], name, matched_pids[0][0])
                used_pids.add(matched_pids[0][0])
                updated_pids.add(matched_pids[0][0])

    update_status_final()

    if updated_pids:  # an empty set will update all canonical_names
        update_personID_canonical_names(updated_pids)
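# rabbit() above matches new signatures against old ones by building a score
# matrix and asking maximized_mapping() for the best one-to-one pairing. The
# implementation of maximized_mapping is not shown in this file; the sketch
# below is one plausible realization using SciPy's Hungarian solver
# (scipy.optimize.linear_sum_assignment), returning the same
# (row, column, score) triples that rabbit() consumes. Treat it as an
# assumption about the interface, not as the shipped code.

import numpy as np
from scipy.optimize import linear_sum_assignment


def maximized_mapping_sketch(matrix):
    # matrix[i][j] is the similarity between new signature i and old
    # signature j; higher is better, so negate for the minimizing solver.
    if not matrix or not matrix[0]:
        return []
    scores = np.asarray(matrix, dtype=float)
    rows, cols = linear_sum_assignment(-scores)
    return [(int(r), int(c), float(scores[r, c])) for r, c in zip(rows, cols)]


if __name__ == '__main__':
    example = [[0.9, 0.1],
               [0.2, 0.85],
               [0.3, 0.4]]
    print maximized_mapping_sketch(example)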