def _prepare_data_files_from_db(data_dir_name="grid_data",
                                workdir_prefix="job",
                                max_records=4000):
    '''
    Prepares grid jobs. Is a task running in bibsched.

    Meaning:
        1. Find all last names in the database
        2. For each last name:
            - find all documents regarding this last name (ignore first
              names)
            - if the number of documents loaded into memory exceeds
              max_records, write the memory cache into files (cf. Files
              section). Each write-back procedure will happen into a newly
              created directory. The prefix for the respective job
              directory may be specified as well as the name of the data
              directory where these job directories will be created.

    Files:
        - authornames.dat
        - virtual_authors.dat
        - virtual_author_data.dat
        - virtual_author_clusters.dat
        - virtual_author_cluster_cache.dat
        - realauthors.dat
        - realauthor_data.dat
        - doclist.dat
        - records.dat
        - ids.dat
        - ra_va_cache.dat

    @param data_dir_name: the name of the directory that will hold all the
        sub directories for the jobs.
    @type data_dir_name: string
    @param workdir_prefix: prefix for the job sub directories.
    @type workdir_prefix: string
    @param max_records: maximum number of records after which the memory
        cache is to be flushed to files.
    @type max_records: int
    '''
    try:
        max_records = int(max_records)
    except ValueError:
        max_records = 4000

    bibtask.write_message("Loading last names",
                          stream=sys.stdout, verbose=0)
    bibtask.write_message("Limiting files to %s records" % (max_records,),
                          stream=sys.stdout, verbose=0)
    bibtask.task_update_progress('Loading last names...')

    last_names = find_all_last_names()
    last_name_queue = Queue.Queue()

    for last_name in sorted(last_names):
        last_name_queue.put(last_name)

    total = len(last_names)
    status = 1
    bibtask.write_message("Done. Loaded %s last names." % (total,),
                          stream=sys.stdout, verbose=0)

    job_id = 0

    if data_dir_name.startswith("/"):
        data_dir = data_dir_name
    else:
        data_dir = "%s/%s/" % (bconfig.FILE_PATH, data_dir_name)

    if not data_dir.endswith("/"):
        data_dir = "%s/" % (data_dir,)

    job_lnames = []

    while True:
        if last_name_queue.empty():
            bibtask.write_message("Done with all names.",
                                  stream=sys.stdout, verbose=0)
            break

        lname_list = last_name_queue.get()
        lname = None

        if lname_list:
            lname = lname_list[0]
            del lname_list[0]
        else:
            bconfig.LOGGER.warning("Got an empty Queue element. "
                                   "Queue seems corrupted.")
            continue

        job_lnames.append(lname)
        bibtask.task_update_progress('Preparing job %d of %d: %s.'
                                     % (status, total, lname))
        bibtask.write_message("Processing: %s (%d/%d)."
                              % (lname, status, total),
                              stream=sys.stdout, verbose=0)

        populate_doclist_for_author_surname(lname)

        post_remove_names = set()

        for name in [row['name'] for row in dat.AUTHOR_NAMES
                     if not row['processed']]:
            potential_removal = "%s," % (name.split(',')[0],)

            if not potential_removal == "%s" % (lname,):
                post_remove_names.add(potential_removal)

        if len(post_remove_names) > 1:
            removed = 0
            removed_names = []

            for post_remove_name in post_remove_names:
                if post_remove_name in lname_list:
                    lname_list.remove(post_remove_name)
                    removed_names.append(post_remove_name)
                    removed += 1

            bibtask.write_message("-> Removed %s entries from the "
                                  "computation list: %s"
                                  % (removed, removed_names),
                                  stream=sys.stdout, verbose=0)
            total -= removed

        if lname_list:
            last_name_queue.put(lname_list)

        if len(dat.RELEVANT_RECORDS) >= max_records:
            if not os.path.exists(data_dir):
                os.mkdir(data_dir)

            work_dir = "%s%s%s" % (data_dir, workdir_prefix, job_id)
            _write_to_files(work_dir, job_lnames)
            job_lnames = []
            job_id += 1

        status += 1

    if dat.RELEVANT_RECORDS:
        if not os.path.exists(data_dir):
            os.mkdir(data_dir)

        work_dir = "%s%s%s" % (data_dir, workdir_prefix, job_id)
        _write_to_files(work_dir, job_lnames)

    return True
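# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the original module): the flush logic in
# _prepare_data_files_from_db() above effectively partitions surnames into
# job directories holding at least `max_records` records each. The helper
# below, `demo_partition_into_jobs`, is a hypothetical, self-contained
# rendering of that threshold test, shown only to make the grouping
# behaviour easy to follow.
# ---------------------------------------------------------------------------
def demo_partition_into_jobs(record_counts, max_records=4000):
    '''
    Group (surname, record_count) pairs into jobs, flushing a job as soon
    as its accumulated record count reaches max_records -- the same
    threshold test as `len(dat.RELEVANT_RECORDS) >= max_records` above.
    '''
    jobs = []
    current_lnames = []
    current_size = 0

    for lname, count in record_counts:
        current_lnames.append(lname)
        current_size += count

        if current_size >= max_records:
            jobs.append(current_lnames)
            current_lnames = []
            current_size = 0

    # Trailing partial job, mirroring the final `if dat.RELEVANT_RECORDS:`
    # flush at the end of the function above.
    if current_lnames:
        jobs.append(current_lnames)

    return jobs

# e.g. demo_partition_into_jobs([("ellis", 2500), ("smith", 1800),
#                                ("wang", 900)])
# --> [["ellis", "smith"], ["wang"]]   (i.e. job0, job1)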
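# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the original module): both functions in
# this file prune surname variants that were already swept up while
# processing the current name (e.g. "t hooft" vs. "t'hooft"), so the
# remaining work list shrinks. `demo_prune_processed_variants` is a
# hypothetical stand-alone version of that snippet.
# ---------------------------------------------------------------------------
def demo_prune_processed_variants(lname, unprocessed_author_names,
                                  lname_list):
    '''
    Collect the surnames of all author names still marked as unprocessed
    and drop every one of them (except the current `lname`) from the
    remaining computation list, as done above right after
    populate_doclist_for_author_surname().
    '''
    post_remove_names = set()

    for name in unprocessed_author_names:
        surname = name.split(',')[0]    # "Smith, J." -> "Smith"

        if surname != lname:
            post_remove_names.add(surname)

    removed_names = [n for n in lname_list if n in post_remove_names]
    remaining = [n for n in lname_list if n not in post_remove_names]

    return remaining, removed_names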
def computation_process_starter(i, mp_termination_queue, job_mp_queue,
                                db_write_lock,
                                populate_doclist=True,
                                process_doclist=True,
                                process_orphans=False,
                                print_stats=True,
                                write_to_db=False):
    '''
    Sub process that starts the disambiguation process on a specified
    set of authors.

    @param i: ID of the process (int between 0 and MAX_PROCESSES in bconfig)
    @type i: int
    @param mp_termination_queue: queue holding the exit token for the
        processes to terminate upon finishing all queue elements
    @type mp_termination_queue: queue
    @param job_mp_queue: queue holding the last name blocks
    @type job_mp_queue: queue
    @param db_write_lock: shields the database from too many concurrent
        accesses
    @type db_write_lock: multiprocessing.Lock
    @param populate_doclist: shall we populate the document list w/ the
        authors
    @type populate_doclist: boolean
    @param process_doclist: shall the document list be processed?
    @type process_doclist: boolean
    @param process_orphans: process the orphans left after the first process?
    @type process_orphans: boolean
    @param print_stats: print statistics at the end of each run?
    @type print_stats: boolean
    @param write_to_db: write the results back to the database?
    @type write_to_db: boolean
    '''
    while True:
        debugmsg(i, "getting name from queue")

        if job_mp_queue.qsize() > 0:
            job_last_names = job_mp_queue.get()
            debugmsg(i, "got queue item! %s items left in queue"
                        % job_mp_queue.qsize())
        else:
            debugmsg(i, "Queue is currently empty...")

            if not mp_termination_queue.empty():
                debugmsg(i, "Exit token there, Process %s salutes to quit!"
                            % i)
                return
            else:
                debugmsg(i, "Exit token not present, continuing in 15s!")
                time.sleep(15)
                continue

        last_name_queue = Queue.Queue()
        last_name_queue.put(sorted(job_last_names))
        gc.collect()

        while True:
            dat.reset_mem_cache(True)
            gc.collect()

            if last_name_queue.empty():
                bconfig.LOGGER.log(25, "Done with all names.")
                break

            debugmsg(i, "starting with queue: "
                        + str(last_name_queue.queue))

            lname_list = last_name_queue.get()
            lname = None

            if lname_list:
                lname = lname_list[0]
                del lname_list[0]
            else:
                bconfig.LOGGER.warning("Got an empty Queue element. "
                                       "Queue seems corrupted.")
                continue

            # bconfig.LOGGER.log(25, "Processing: %s (%d/%d)."
            #                    % (lname, status, total))

            if populate_doclist:
                populate_doclist_for_author_surname(lname, job_last_names)

            start_computation(process_orphans=process_orphans,
                              process_doclist=process_doclist,
                              print_stats=print_stats)

            post_remove_names = set()

            # The following snippet finds additionally processed last names
            # and removes them from the processing queue.
            # E.g. 't hooft and t'hooft
            for name in [row['name'] for row in dat.AUTHOR_NAMES
                         if not row['processed']]:
                potential_removal = "%s" % (name.split(',')[0],)

                if not potential_removal == "%s" % (lname,):
                    post_remove_names.add(potential_removal)

            if len(post_remove_names) > 0:
                removed = 0
                removed_names = []

                for post_remove_name in post_remove_names:
                    if post_remove_name in lname_list:
                        lname_list.remove(post_remove_name)
                        removed_names.append(post_remove_name)
                        removed += 1

                bconfig.LOGGER.log(25, "-> Removed %s entries from the "
                                       "computation list: %s"
                                       % (removed, removed_names))

            if lname_list:
                last_name_queue.put(lname_list)

            if write_to_db:
                if MP_ENABLED:
                    db_write_lock.acquire()

                if dat.ID_TRACKER:
                    try:
                        write_mem_cache_to_tables()
                    except Exception, emsg:
                        bconfig.LOGGER.error("An error occurred while "
                                             "writing to the db: %s" % emsg)
                else:
                    bconfig.LOGGER.info("The ID tracker appears to be empty. "
                                        "Nothing will be written to the "
                                        "database from this job. That's ok "
                                        "when excluding collections. Last "
                                        "processed last name: %s" % lname)

                if MP_ENABLED:
                    db_write_lock.release()

                dat.reset_mem_cache(True)
                gc.collect()