def _write_data_files_to_db(data_dir_name):
    '''
    Reads all the files of a specified directory and writes the content
    to the memory cache and from there to the database.

    Each sub directory of data_dir_name is treated as one job: it must
    contain a "results/" folder with the expected .dat files and a
    "<jobname>.log" file whose tail carries the success marker.

    @param data_dir_name: Directory where to look for the files
    @type data_dir_name: string

    @return: False on a fatal precondition failure (missing or invalid
        directory); otherwise None (per-job problems are logged and the
        job is skipped).
    '''
    if data_dir_name.endswith("/"):
        data_dir_name = data_dir_name[0:-1]

    if not data_dir_name:
        bibtask.write_message("Data directory not specified. Task failed.",
                              stream=sys.stdout, verbose=0)
        return False

    if not osp.isdir(data_dir_name):
        bibtask.write_message("Specified Data directory is not a directory. "
                              "Task failed.", stream=sys.stdout, verbose=0)
        return False

    # Files every valid result set must provide. Loop invariant: hoisted
    # out of the per-job loop instead of being rebuilt each iteration.
    correct_files = set(['realauthors.dat',
                         'ids.dat',
                         'virtual_author_clusters.dat',
                         'virtual_authors.dat',
                         'doclist.dat',
                         'virtual_author_data.dat',
                         'authornames.dat',
                         'virtual_author_cluster_cache.dat',
                         'realauthor_data.dat',
                         'ra_va_cache.dat'])

    job_dirs = os.listdir(data_dir_name)
    total = len(job_dirs)
    status = 0

    for job_dir in job_dirs:
        status += 1
        job_dir = "%s/%s" % (data_dir_name, job_dir)

        if not osp.isdir(job_dir):
            bibtask.write_message("This is not a directory and therefore "
                                  "skipped: %s." % job_dir,
                                  stream=sys.stdout, verbose=0)
            continue

        results_dir = "%s/results/" % (job_dir,)

        if not osp.isdir(results_dir):
            bibtask.write_message("No result set found in %s"
                                  % (results_dir,),
                                  stream=sys.stdout, verbose=0)
            continue

        # The log file is named after the job directory itself.
        log_name = osp.abspath(job_dir).split("/")
        logfile = "%s/%s.log" % (job_dir, log_name[-1])
        logfile_lastline = ""

        if not osp.isfile(logfile):
            bibtask.write_message("No log file found in %s" % (job_dir,),
                                  stream=sys.stdout, verbose=0)
            continue

        try:
            logfile_lastline = tail(logfile)
        except IOError:
            # An unreadable log is treated the same as a missing
            # success marker: the job is skipped below.
            logfile_lastline = ""

        if logfile_lastline.count("Finish! The computation finished in") < 1:
            bibtask.write_message("Log file indicates broken results for %s"
                                  % (job_dir,),
                                  stream=sys.stdout, verbose=0)
            continue

        result_files = os.listdir(results_dir)

        if not correct_files.issubset(set(result_files)):
            # BUGFIX: message previously read "Reults folder".
            bibtask.write_message("Results folder does not hold the "
                                  "correct files: %s" % (results_dir,),
                                  stream=sys.stdout, verbose=0)
            continue

        bibtask.task_update_progress('Loading job %s of %s: %s'
                                     % (status, total, log_name[-1]))

        if (populate_structs_from_files(results_dir, results=True)
                and write_mem_cache_to_tables(sanity_checks=True)):
            bibtask.write_message("All Done.",
                                  stream=sys.stdout, verbose=0)
        else:
            bibtask.write_message("Could not write data to the tables from %s"
                                  % (results_dir,),
                                  stream=sys.stdout, verbose=0)
def _write_data_files_to_db(data_dir_name):
    '''
    Reads all the files of a specified directory and writes the content
    to the memory cache and from there to the database.

    A job sub directory is loaded only if its "results/" folder exists,
    its "<jobname>.log" tail contains the finish marker, and all the
    expected .dat files are present.

    @param data_dir_name: Directory where to look for the files
    @type data_dir_name: string

    @return: False if no valid data directory was given; None otherwise
        (broken jobs are merely reported and skipped).
    '''
    if data_dir_name.endswith("/"):
        data_dir_name = data_dir_name[0:-1]

    if not data_dir_name:
        bibtask.write_message("Data directory not specified. Task failed.",
                              stream=sys.stdout, verbose=0)
        return False

    if not osp.isdir(data_dir_name):
        bibtask.write_message(
            "Specified Data directory is not a directory. "
            "Task failed.", stream=sys.stdout, verbose=0)
        return False

    # Required result files; built once here since the set never changes
    # between jobs (previously re-created inside the loop).
    correct_files = set([
        'realauthors.dat',
        'ids.dat',
        'virtual_author_clusters.dat',
        'virtual_authors.dat',
        'doclist.dat',
        'virtual_author_data.dat',
        'authornames.dat',
        'virtual_author_cluster_cache.dat',
        'realauthor_data.dat',
        'ra_va_cache.dat'
    ])

    job_dirs = os.listdir(data_dir_name)
    total = len(job_dirs)
    status = 0

    for job_dir in job_dirs:
        status += 1
        job_dir = "%s/%s" % (data_dir_name, job_dir)

        if not osp.isdir(job_dir):
            bibtask.write_message("This is not a directory and therefore "
                                  "skipped: %s." % job_dir,
                                  stream=sys.stdout, verbose=0)
            continue

        results_dir = "%s/results/" % (job_dir, )

        if not osp.isdir(results_dir):
            bibtask.write_message("No result set found in %s"
                                  % (results_dir, ),
                                  stream=sys.stdout, verbose=0)
            continue

        # Log file carries the job directory's own name.
        log_name = osp.abspath(job_dir).split("/")
        logfile = "%s/%s.log" % (job_dir, log_name[-1])
        logfile_lastline = ""

        if not osp.isfile(logfile):
            bibtask.write_message("No log file found in %s" % (job_dir, ),
                                  stream=sys.stdout, verbose=0)
            continue

        try:
            logfile_lastline = tail(logfile)
        except IOError:
            # Unreadable log => no success marker => job skipped below.
            logfile_lastline = ""

        if logfile_lastline.count("Finish! The computation finished in") < 1:
            bibtask.write_message("Log file indicates broken results for %s"
                                  % (job_dir, ),
                                  stream=sys.stdout, verbose=0)
            continue

        result_files = os.listdir(results_dir)

        if not correct_files.issubset(set(result_files)):
            # BUGFIX: corrected the "Reults" typo in this message.
            bibtask.write_message("Results folder does not hold the "
                                  "correct files: %s" % (results_dir, ),
                                  stream=sys.stdout, verbose=0)
            continue

        bibtask.task_update_progress('Loading job %s of %s: %s'
                                     % (status, total, log_name[-1]))

        if (populate_structs_from_files(results_dir, results=True)
                and write_mem_cache_to_tables(sanity_checks=True)):
            bibtask.write_message("All Done.", stream=sys.stdout, verbose=0)
        else:
            bibtask.write_message(
                "Could not write data to the tables from %s"
                % (results_dir, ), stream=sys.stdout, verbose=0)
def computation_process_starter(i, mp_termination_queue, job_mp_queue,
                                db_write_lock,
                                populate_doclist=True,
                                process_doclist=True,
                                process_orphans=False,
                                print_stats=True,
                                write_to_db=False):
    '''
    Sub process that starts the disambiguation process on a specified
    set of authors.

    Worker loop: repeatedly takes one block of last names from
    job_mp_queue and processes the block name by name. The worker only
    terminates when the job queue is empty AND the exit token is present
    in mp_termination_queue; otherwise it sleeps and polls again.

    @param i: ID of the process (int between 0 and MAX_PROCESSES in bconfig)
    @type i: int
    @param mp_termination_queue: queue holding the exit token for the
        processes to terminate upon finishing all queue elements
    @type mp_termination_queue: queue
    @param job_mp_queue: queue holding the last name blocks
    @type job_mp_queue: queue
    @param db_write_lock: shields the database from too many concurrent
        accesses
    @type db_write_lock: multiprocessing.Lock
    @param populate_doclist: shall we populate the document list w/ the
        authors
    @type populate_doclist: boolean
    @param process_doclist: process the document list in the computation?
    @type process_doclist: boolean
    @param process_orphans: process the orphans left after the first process?
    @type process_orphans: boolean
    @param print_stats: print statistics about the computation?
    @type print_stats: boolean
    @param write_to_db: write the results back to the database?
    @type write_to_db: boolean
    '''
    while True:
        debugmsg(i, "getting name from queue")

        # NOTE(review): qsize() followed by get() is not atomic -- with
        # several consumers another worker may drain the queue in between.
        # Presumably tolerable here, since a miss only delays this worker
        # by one polling round -- confirm against the master process.
        if job_mp_queue.qsize() > 0:
            job_last_names = job_mp_queue.get()
            debugmsg(i, "got queue item! %s items left in queue"
                        % job_mp_queue.qsize())
        else:
            debugmsg(i, "Queue is currently empty...")

            # No work available: exit only if the master has placed the
            # termination token, otherwise poll again after 15 seconds.
            if not mp_termination_queue.empty():
                debugmsg(i, "Exit token there, Process %s salutes to quit!"
                            % i)
                return
            else:
                debugmsg(i, "Exit token not present, continuing in 15s!")
                time.sleep(15)
                continue

        # Local (single-process) FIFO holding the sorted block of last
        # names still to be processed.
        last_name_queue = Queue.Queue()
        last_name_queue.put(sorted(job_last_names))

        gc.collect()

        while True:
            # Start each name with a clean in-memory data representation.
            dat.reset_mem_cache(True)
            gc.collect()

            if last_name_queue.empty():
                bconfig.LOGGER.log(25, "Done with all names.")
                break

            debugmsg(i, "starting with queue: " + str(last_name_queue.queue))

            lname_list = last_name_queue.get()
            lname = None

            if lname_list:
                # Take the first name; the (mutated) remainder is
                # re-queued further down.
                lname = lname_list[0]
                del(lname_list[0])
            else:
                bconfig.LOGGER.warning("Got an empty Queue element. "
                                       "Queue seems corrupted.")
                continue

#            bconfig.LOGGER.log(25, "Processing: %s (%d/%d)."
#                                % (lname, status, total))

            if populate_doclist:
                populate_doclist_for_author_surname(lname, job_last_names)

            start_computation(process_orphans=process_orphans,
                              process_doclist=process_doclist,
                              print_stats=print_stats)

            post_remove_names = set()

            # The following snippet finds additionally processed last names
            # and removes them from the processing queue.
            # E.g. 't hooft and t'hooft
            for name in [row['name'] for row in dat.AUTHOR_NAMES
                         if not row['processed']]:
                potential_removal = "%s" % (name.split(',')[0])

                if not potential_removal == "%s" % (lname):
                    post_remove_names.add(potential_removal)

            if len(post_remove_names) > 0:
                removed = 0
                removed_names = []

                for post_remove_name in post_remove_names:
                    if post_remove_name in lname_list:
                        lname_list.remove(post_remove_name)
                        removed_names.append(post_remove_name)
                        removed += 1

                bconfig.LOGGER.log(25, "-> Removed %s entries from the "
                                       "computation list: %s"
                                       % (removed, removed_names))

            # Re-queue whatever is left of this block of names.
            if lname_list:
                last_name_queue.put(lname_list)

            if write_to_db:
                # Serialize database writes across the worker processes.
                if MP_ENABLED:
                    db_write_lock.acquire()

                if dat.ID_TRACKER:
                    try:
                        write_mem_cache_to_tables()
                    except Exception, emsg:
                        bconfig.LOGGER.error("An error occurred while writing "
                                             "to the db: %s" % emsg)
                else:
                    bconfig.LOGGER.info("The ID tracker appears to be empty. "
                                        "Nothing will be written to the "
                                        "database from this job. That's ok, "
                                        "when excluding collections. Last "
                                        "processed last name: %s" % lname)

                if MP_ENABLED:
                    db_write_lock.release()

                # Release the memory cache right after the write.
                dat.reset_mem_cache(True)
                gc.collect()
def computation_process_starter(i, mp_termination_queue, job_mp_queue,
                                db_write_lock,
                                populate_doclist=True,
                                process_doclist=True,
                                process_orphans=False,
                                print_stats=True,
                                write_to_db=False):
    '''
    Sub process that starts the disambiguation process on a specified
    set of authors.

    Worker loop variant that communicates through a single shared list
    kept in job_mp_queue: each worker pops the whole list, takes the
    first block and puts the remainder (possibly empty) back so the
    queue is never left without an element. Termination requires both an
    empty list and the exit token in mp_termination_queue.

    @param i: ID of the process (int between 0 and MAX_PROCESSES in bconfig)
    @type i: int
    @param mp_termination_queue: queue holding the exit token for the
        processes to terminate upon finishing all queue elements
    @type mp_termination_queue: queue
    @param job_mp_queue: queue holding the last name blocks
    @type job_mp_queue: queue
    @param db_write_lock: shields the database from too many concurrent
        accesses
    @type db_write_lock: multiprocessing.Lock
    @param populate_doclist: shall we populate the document list w/ the
        authors
    @type populate_doclist: boolean
    @param process_doclist: process the document list in the computation?
    @type process_doclist: boolean
    @param process_orphans: process the orphans left after the first process?
    @type process_orphans: boolean
    @param print_stats: print statistics about the computation?
    @type print_stats: boolean
    @param write_to_db: write the results back to the database?
    @type write_to_db: boolean
    '''
    while True:
        if bconfig.TABLES_UTILS_DEBUG:
            print time.strftime('%H:%M:%S') + ' ' + str(
                i) + ': getting name from queue'

        # Blocking get of the shared job list. The protocol below always
        # puts a (possibly empty) list back, so no worker blocks forever.
        job_last_names_list = job_mp_queue.get()

        if bconfig.TABLES_UTILS_DEBUG:
            print time.strftime('%H:%M:%S') + ' ' + str(i) + ': got queue'

        if len(job_last_names_list) > 0:
            # Claim the first block for this worker...
            job_last_names = job_last_names_list[0]

            # ...and return the remaining blocks for the other workers.
            if len(job_last_names_list) > 1:
                job_mp_queue.put(job_last_names_list[1:])

                if bconfig.TABLES_UTILS_DEBUG:
                    print time.strftime('%H:%M:%S') + ' ' + str(
                        i) + ': put non empty list'
            else:
                job_mp_queue.put([])

                if bconfig.TABLES_UTILS_DEBUG:
                    print time.strftime('%H:%M:%S') + ' ' + str(
                        i) + ': put empty list'
        else:
            if bconfig.TABLES_UTILS_DEBUG:
                print time.strftime('%H:%M:%S') + ' ' + str(
                    i) + ': we got an empty list...'

            # Keep the (empty) element in the queue for the other workers.
            job_mp_queue.put([])

            if bconfig.TABLES_UTILS_DEBUG:
                print time.strftime('%H:%M:%S') + ' ' + str(
                    i) + ': put empty list'

            # No work left: exit only when the termination token is set,
            # otherwise poll again after 15 seconds.
            if not mp_termination_queue.empty():
                if bconfig.TABLES_UTILS_DEBUG:
                    print time.strftime('%H:%M:%S') + ' ' + str(
                        i) + ': token there, exiting!'
                return
            else:
                if bconfig.TABLES_UTILS_DEBUG:
                    print time.strftime('%H:%M:%S') + ' ' + str(
                        i) + ': token not there, continuing!'
                time.sleep(15)
                continue

        # Local (single-process) FIFO holding the sorted block of last
        # names still to be processed.
        last_name_queue = Queue.Queue()
        last_name_queue.put(sorted(job_last_names))

        # The shared list is no longer needed in this worker.
        del (job_last_names_list)
        gc.collect()

        while True:
            # Start each name with a clean in-memory data representation.
            dat.reset_mem_cache(True)
            gc.collect()

            if last_name_queue.empty():
                bconfig.LOGGER.log(25, "Done with all names.")
                break

            if bconfig.TABLES_UTILS_DEBUG:
                print time.strftime('%H:%M:%S') + ' ' + str(
                    i) + ': starting with queue: ' + str(last_name_queue.queue)

            lname_list = last_name_queue.get()
            lname = None

            if lname_list:
                # Take the first name; the (mutated) remainder is
                # re-queued further down.
                lname = lname_list[0]
                del (lname_list[0])
            else:
                bconfig.LOGGER.warning("Got an empty Queue element. "
                                       "Queue seems corrupted.")
                continue

#            bconfig.LOGGER.log(25, "Processing: %s (%d/%d)."
#                                % (lname, status, total))

            if populate_doclist:
                populate_doclist_for_author_surname(lname, job_last_names)

            start_computation(process_orphans=process_orphans,
                              process_doclist=process_doclist,
                              print_stats=print_stats)

            post_remove_names = set()

            # The following snippet finds additionally processed last names
            # and removes them from the processing queue.
            # E.g. 't hooft and t'hooft
            for name in [
                    row['name'] for row in dat.AUTHOR_NAMES
                    if not row['processed']
            ]:
                potential_removal = "%s" % (name.split(',')[0])

                if not potential_removal == "%s" % (lname):
                    post_remove_names.add(potential_removal)

            if len(post_remove_names) > 0:
                removed = 0
                removed_names = []

                for post_remove_name in post_remove_names:
                    if post_remove_name in lname_list:
                        lname_list.remove(post_remove_name)
                        removed_names.append(post_remove_name)
                        removed += 1

                bconfig.LOGGER.log(
                    25, "-> Removed %s entries from the "
                    "computation list: %s" % (removed, removed_names))

            # Re-queue whatever is left of this block of names.
            if lname_list:
                last_name_queue.put(lname_list)

            if write_to_db:
                # Serialize database writes across the worker processes.
                if MP_ENABLED:
                    db_write_lock.acquire()

                if dat.ID_TRACKER:
                    try:
                        write_mem_cache_to_tables()
                    except Exception, emsg:
                        bconfig.LOGGER.error("An error occurred while writing "
                                             "to the db: %s" % emsg)
                else:
                    bconfig.LOGGER.info("The ID tracker appears to be empty. "
                                        "Nothing will be written to the "
                                        "database from this job. That's ok, "
                                        "when excluding collections. Last "
                                        "processed last name: %s" % lname)

                if MP_ENABLED:
                    db_write_lock.release()

                # Release the memory cache right after the write.
                dat.reset_mem_cache(True)
                gc.collect()