def _prepare_data_files_from_db(data_dir_name="grid_data",
                                workdir_prefix="job",
                                max_records=4000):
    '''
    Prepares grid jobs. Is a task running in bibsched.

    Meaning:
        1. Find all last names in the database
        2. For each last name:
            - find all documents regarding this last name (ignore
              first names)
            - if the number of records loaded into memory exceeds
              max_records, write the memory cache into files (cf. Files
              section). Each write-back goes into a newly created
              directory.

    The prefix for the respective job directory may be specified as well
    as the name of the data directory where these job directories will
    be created.

    Files (written per job directory):
        - authornames.dat
        - virtual_authors.dat
        - virtual_author_data.dat
        - virtual_author_clusters.dat
        - virtual_author_cluster_cache.dat
        - realauthors.dat
        - realauthor_data.dat
        - doclist.dat
        - records.dat
        - ids.dat
        - ra_va_cache.dat

    @param data_dir_name: the name of the directory that will hold all
        the sub directories for the jobs.
    @type data_dir_name: string
    @param workdir_prefix: prefix for the job sub directories.
    @type workdir_prefix: string
    @param max_records: maximum number of records after which the memory
        cache is to be flushed to files.
    @type max_records: int
    '''
    try:
        max_records = int(max_records)
    except ValueError:
        # Fall back to the documented default on a non-numeric value.
        max_records = 4000

    bibtask.write_message("Loading last names",
                          stream=sys.stdout, verbose=0)
    bibtask.write_message("Limiting files to %s records" % (max_records,),
                          stream=sys.stdout, verbose=0)
    bibtask.task_update_progress('Loading last names...')

    last_names = find_all_last_names()
    last_name_queue = Queue.Queue()

    for last_name in sorted(last_names):
        last_name_queue.put(last_name)

    total = len(last_names)
    status = 1
    bibtask.write_message("Done. Loaded %s last names." % (total),
                          stream=sys.stdout, verbose=0)

    job_id = 0

    # Resolve the data directory: absolute names are used verbatim,
    # relative ones live under bconfig.FILE_PATH. Always end with "/".
    if data_dir_name.startswith("/"):
        data_dir = data_dir_name
    else:
        data_dir = "%s/%s/" % (bconfig.FILE_PATH, data_dir_name)

    if not data_dir.endswith("/"):
        data_dir = "%s/" % (data_dir,)

    def _flush_cache(lnames, job_no):
        # Write the current in-memory cache into a fresh job directory.
        # makedirs (instead of mkdir) tolerates a nested data_dir_name.
        if not os.path.exists(data_dir):
            os.makedirs(data_dir)
        work_dir = "%s%s%s" % (data_dir, workdir_prefix, job_no)
        _write_to_files(work_dir, lnames)

    job_lnames = []

    while True:
        if last_name_queue.empty():
            bibtask.write_message("Done with all names.",
                                  stream=sys.stdout, verbose=0)
            break

        lname_list = last_name_queue.get()

        if lname_list:
            # Idiomatic pop(0) replaces read-then-del of the head element.
            lname = lname_list.pop(0)
        else:
            bconfig.LOGGER.warning("Got an empty Queue element. "
                                   "Queue seems corrupted.")
            continue

        job_lnames.append(lname)
        bibtask.task_update_progress('Preparing job %d of %d: %s.'
                                     % (status, total, lname))
        bibtask.write_message(("Processing: %s (%d/%d).")
                              % (lname, status, total),
                              stream=sys.stdout, verbose=0)

        populate_doclist_for_author_surname(lname)

        # Surnames pulled in as a side effect of processing the current
        # surname must not be processed again later.
        post_remove_names = set()
        for name in [row['name'] for row in dat.AUTHOR_NAMES
                     if not row['processed']]:
            potential_removal = "%s," % (name.split(',')[0],)
            # Idiomatic "!=" replaces "not ... ==".
            if potential_removal != "%s" % (lname,):
                post_remove_names.add(potential_removal)

        # NOTE(review): threshold is "> 1", not "> 0" — looks deliberate
        # but unexplained; confirm against the original intent.
        if len(post_remove_names) > 1:
            removed = 0
            removed_names = []
            for post_remove_name in post_remove_names:
                if post_remove_name in lname_list:
                    lname_list.remove(post_remove_name)
                    removed_names.append(post_remove_name)
                    removed += 1
            bibtask.write_message(("-> Removed %s entries from the "
                                   "computation list: %s")
                                  % (removed, removed_names),
                                  stream=sys.stdout, verbose=0)
            total -= removed

        if lname_list:
            last_name_queue.put(lname_list)

        # Flush the memory cache once it grows past the record limit.
        if len(dat.RELEVANT_RECORDS) >= max_records:
            _flush_cache(job_lnames, job_id)
            job_lnames = []
            job_id += 1

        status += 1

    # Flush whatever remained in memory after the queue drained.
    if dat.RELEVANT_RECORDS:
        _flush_cache(job_lnames, job_id)

    return True
def _prepare_data_files_from_db(data_dir_name="grid_data",
                                workdir_prefix="job",
                                max_records=4000):
    '''
    Prepares grid jobs. Is a task running in bibsched.

    Meaning:
        1. Find all last names in the database
        2. For each last name:
            - find all documents regarding this last name (ignore
              first names)
            - if the number of records loaded into memory exceeds
              max_records, write the memory cache into files (cf. Files
              section). Each write-back goes into a newly created
              directory.

    The prefix for the respective job directory may be specified as well
    as the name of the data directory where these job directories will
    be created.

    Files (written per job directory):
        - authornames.dat
        - virtual_authors.dat
        - virtual_author_data.dat
        - virtual_author_clusters.dat
        - virtual_author_cluster_cache.dat
        - realauthors.dat
        - realauthor_data.dat
        - doclist.dat
        - records.dat
        - ids.dat
        - ra_va_cache.dat

    @param data_dir_name: the name of the directory that will hold all
        the sub directories for the jobs.
    @type data_dir_name: string
    @param workdir_prefix: prefix for the job sub directories.
    @type workdir_prefix: string
    @param max_records: maximum number of records after which the memory
        cache is to be flushed to files.
    @type max_records: int
    '''
    try:
        max_records = int(max_records)
    except ValueError:
        # Non-numeric input: revert to the documented default.
        max_records = 4000

    bibtask.write_message("Loading last names",
                          stream=sys.stdout, verbose=0)
    bibtask.write_message("Limiting files to %s records" % (max_records,),
                          stream=sys.stdout, verbose=0)
    bibtask.task_update_progress('Loading last names...')

    last_names = find_all_last_names()
    last_name_queue = Queue.Queue()

    for last_name in sorted(last_names):
        last_name_queue.put(last_name)

    total = len(last_names)
    status = 1
    bibtask.write_message("Done. Loaded %s last names." % (total),
                          stream=sys.stdout, verbose=0)

    job_id = 0

    # Absolute data_dir_name is used as-is; otherwise the directory is
    # rooted at bconfig.FILE_PATH. A trailing slash is guaranteed.
    if data_dir_name.startswith("/"):
        data_dir = data_dir_name
    else:
        data_dir = "%s/%s/" % (bconfig.FILE_PATH, data_dir_name)

    if not data_dir.endswith("/"):
        data_dir = "%s/" % (data_dir,)

    def _dump_cache(lnames, job_no):
        # Persist the in-memory cache into a new job work directory.
        # os.makedirs (not mkdir) also handles nested data_dir_name.
        if not os.path.exists(data_dir):
            os.makedirs(data_dir)
        work_dir = "%s%s%s" % (data_dir, workdir_prefix, job_no)
        _write_to_files(work_dir, lnames)

    job_lnames = []

    while True:
        if last_name_queue.empty():
            bibtask.write_message("Done with all names.",
                                  stream=sys.stdout, verbose=0)
            break

        lname_list = last_name_queue.get()

        if lname_list:
            # pop(0) instead of the read-then-del pair.
            lname = lname_list.pop(0)
        else:
            bconfig.LOGGER.warning("Got an empty Queue element. "
                                   "Queue seems corrupted.")
            continue

        job_lnames.append(lname)
        bibtask.task_update_progress('Preparing job %d of %d: %s.'
                                     % (status, total, lname))
        bibtask.write_message(("Processing: %s (%d/%d).")
                              % (lname, status, total),
                              stream=sys.stdout, verbose=0)

        populate_doclist_for_author_surname(lname)

        # Names already handled as a side effect of this surname are
        # collected so they can be dropped from the pending list.
        post_remove_names = set()
        for name in [row['name'] for row in dat.AUTHOR_NAMES
                     if not row['processed']]:
            potential_removal = "%s," % (name.split(',')[0],)
            # "!=" instead of "not ... ==".
            if potential_removal != "%s" % (lname,):
                post_remove_names.add(potential_removal)

        # NOTE(review): "> 1" (not "> 0") kept as-is; confirm intent.
        if len(post_remove_names) > 1:
            removed = 0
            removed_names = []
            for post_remove_name in post_remove_names:
                if post_remove_name in lname_list:
                    lname_list.remove(post_remove_name)
                    removed_names.append(post_remove_name)
                    removed += 1
            bibtask.write_message(("-> Removed %s entries from the "
                                   "computation list: %s")
                                  % (removed, removed_names),
                                  stream=sys.stdout, verbose=0)
            total -= removed

        if lname_list:
            last_name_queue.put(lname_list)

        # Spill to disk once the cache exceeds the configured limit.
        if len(dat.RELEVANT_RECORDS) >= max_records:
            _dump_cache(job_lnames, job_id)
            job_lnames = []
            job_id += 1

        status += 1

    # Final spill of anything still cached once the queue is empty.
    if dat.RELEVANT_RECORDS:
        _dump_cache(job_lnames, job_id)

    return True
def start_full_disambiguation(last_names="all",
                              process_orphans=False,
                              db_exists=False,
                              populate_doclist=True,
                              write_to_db=True,
                              populate_aid_from_personid=False):
    '''
    Runs the disambiguation process on a chosen set of authors (or on
    every author in the database) while honoring the given preconditions.

    @param last_names: "all" to process all authors, a single last name,
        or a list of last names
    @type last_names: string or list
    @param process_orphans: process the orphans left after the first
        pass?
    @type process_orphans: boolean
    @param db_exists: is there a data representation already in memory?
    @type db_exists: boolean
    @param populate_doclist: shall we populate the document list with
        the authors?
    @type populate_doclist: boolean
    @param write_to_db: write the results back to the database?
    @type write_to_db: boolean
    @param populate_aid_from_personid: populate all AID tables as a
        backtrack from aidPERSONID?
    @type populate_aid_from_personid: boolean

    @return: True if the process went through smoothly, False if it
        didn't
    @rtype: boolean
    '''
    if bconfig.STANDALONE:
        bconfig.LOGGER.critical("This method is not available in "
                                "standalone mode.")
        return False

    # Normalize last_names into the surname work list.
    if last_names == "all":
        surnames = find_all_last_names()
    elif isinstance(last_names, str):
        surnames = [last_names]
    elif isinstance(last_names, list):
        surnames = last_names
    else:
        bconfig.LOGGER.error("Failed to detect parameter type. "
                             "Exiting.")
        return False

    if populate_aid_from_personid:
        dat.RUNTIME_CONFIG['populate_aid_from_personid'] = True

    if db_exists:
        # Drop surnames whose results are already persisted in the db.
        done_lnames = get_existing_last_names()
        for done_lname in done_lnames:
            if done_lname in surnames:
                surnames.remove(done_lname)
        bconfig.LOGGER.log(25, "Removed %s entries from the computation list, "
                           " since they've already been processed and written "
                           " to the db" % (len(done_lnames)))
        del done_lnames

    total_count = len(surnames)

    if MP_ENABLED:
        # Multi-process mode: one list-generator process feeds a pool of
        # worker processes through a shared queue.
        work_queue = multiprocessing.Queue()
        termination_queue = multiprocessing.Queue(1)
        write_lock = multiprocessing.Lock()

        list_maker = multiprocessing.Process(
            target=list_creation_process,
            name='baid-listgen',
            args=(work_queue, surnames, termination_queue))
        list_maker.start()
        del surnames

        workers = []
        for worker_no in range(bconfig.BIBAUTHORID_MAX_PROCESSES):
            worker = multiprocessing.Process(
                target=computation_process_starter,
                name='baid-worker-' + str(worker_no),
                args=(worker_no, termination_queue, work_queue,
                      write_lock, populate_doclist, True,
                      process_orphans, True, write_to_db))
            workers.append(worker)
            worker.start()

        for worker in workers:
            worker.join()
        list_maker.join()
    else:
        # Single-process mode: run list creation and computation inline
        # on plain (non-multiprocessing) queues; no lock is needed.
        work_queue = Queue.Queue()
        termination_queue = Queue.Queue()
        write_lock = None
        list_creation_process(work_queue, surnames, termination_queue)
        del surnames
        computation_process_starter(0, termination_queue, work_queue,
                                    write_lock, populate_doclist, True,
                                    process_orphans, True, write_to_db)

    bconfig.LOGGER.log(25, "Done. Loaded %s last names." % (total_count))
def start_full_disambiguation(last_names="all",
                              process_orphans=False,
                              db_exists=False,
                              populate_doclist=True,
                              write_to_db=True):
    '''
    Runs the disambiguation process on a chosen set of authors (or on
    every author in the database) while honoring the given preconditions.

    @param last_names: "all" to process all authors, a single last name,
        or a list of last names
    @type last_names: string or list
    @param process_orphans: process the orphans left after the first
        pass?
    @type process_orphans: boolean
    @param db_exists: is there a data representation already in memory?
    @type db_exists: boolean
    @param populate_doclist: shall we populate the document list with
        the authors?
    @type populate_doclist: boolean
    @param write_to_db: write the results back to the database?
    @type write_to_db: boolean

    @return: True if the process went through smoothly, False if it
        didn't
    @rtype: boolean
    '''
    if bconfig.STANDALONE:
        bconfig.LOGGER.critical("This method is not available in "
                                "standalone mode.")
        return False

    # Normalize last_names into the surname work list.
    if last_names == "all":
        surnames = find_all_last_names()
    elif isinstance(last_names, str):
        surnames = [last_names]
    elif isinstance(last_names, list):
        surnames = last_names
    else:
        bconfig.LOGGER.error("Failed to detect parameter type. "
                             "Exiting.")
        return False

    if db_exists:
        # Drop surnames whose results are already persisted in the db.
        done_lnames = get_existing_last_names()
        for done_lname in done_lnames:
            if done_lname in surnames:
                surnames.remove(done_lname)
        bconfig.LOGGER.log(25, "Removed %s entries from the computation list, "
                           " since they've already been processed and written "
                           " to the db" % (len(done_lnames)))
        del done_lnames

    total_count = len(surnames)

    if MP_ENABLED:
        # Multi-process mode: one list-generator process feeds a pool of
        # worker processes through a shared queue.
        work_queue = multiprocessing.Queue()
        termination_queue = multiprocessing.Queue(1)
        # Seed the queue with an empty batch — presumably a primer for
        # the first worker to pick up; TODO(review) confirm semantics.
        work_queue.put([])
        write_lock = multiprocessing.Lock()

        list_maker = multiprocessing.Process(
            target=list_creation_process,
            name='baid-listgen',
            args=(work_queue, surnames, termination_queue))
        list_maker.start()
        del surnames

        workers = []
        for worker_no in range(bconfig.BIBAUTHORID_MAX_PROCESSES):
            worker = multiprocessing.Process(
                target=computation_process_starter,
                name='baid-worker-' + str(worker_no),
                args=(worker_no, termination_queue, work_queue,
                      write_lock, populate_doclist, True,
                      process_orphans, True, write_to_db))
            workers.append(worker)
            worker.start()

        for worker in workers:
            worker.join()
        list_maker.join()
    else:
        # Single-process mode: run list creation and computation inline
        # on plain (non-multiprocessing) queues; no lock is needed.
        work_queue = Queue.Queue()
        termination_queue = Queue.Queue()
        work_queue.put([])
        write_lock = None
        list_creation_process(work_queue, surnames, termination_queue)
        del surnames
        computation_process_starter(0, termination_queue, work_queue,
                                    write_lock, populate_doclist, True,
                                    process_orphans, True, write_to_db)

    bconfig.LOGGER.log(25, "Done. Loaded %s last names." % (total_count))