def processor(workflow_class,
              graph_builder_constructor, graph_builder_params,
              ranker_constructor, ranker_params,
              eval_parameters,
              ranking_cutoff,
              mesh_tree_filename, distance_matrix_filename,
              distance_function,
              umls_converter_data_filename,
              extra_data_name, extra_data_contents,
              my_input_queue, my_output_queue,
              my_own_name=None):
    logging.info("Setting up worker.")
    if my_own_name is not None:
        proctitle.setproctitle(my_own_name)
    my_workflow = workflow_class(graph_builder_constructor,
                                 graph_builder_params,
                                 ranker_constructor, ranker_params,
                                 eval_parameters, ranking_cutoff,
                                 mesh_tree_filename,
                                 distance_matrix_filename,
                                 distance_function,
                                 umls_converter_data_filename)
    if extra_data_name is not None:
        setattr(my_workflow, extra_data_name, extra_data_contents)
    logging.info("Finished setting up worker process. Waiting for requests.")
    try:
        while True:
            request = my_input_queue.get()
            logging.log(ULTRADEBUG, "Processing request %r", request)
            if request == 'STOP':
                logging.log(ULTRADEBUG, "Received stop request.")
                break
            try:
                my_workflow.process_article(request)
                # Recover the article's results, push them onto the output
                # queue, then clear them so the next article starts from an
                # empty result set.
                my_output_queue.put(my_workflow.all_results)
                my_workflow.all_results = {}
            except CouldNotRank:
                # We could push the task back into the queue here with
                # my_input_queue.put(request); instead, unrankable articles
                # are simply skipped.
                logging.info("Skipping unrankable article.")
            except:
                logging.warn("EXCEPTION RAISED: \n%s", traceback.format_exc())
                raise
    finally:
        logging.log(ULTRADEBUG, "Returning results to caller.")
    logging.log(ULTRADEBUG, "Ending processor execution.")
    return
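
# ---------------------------------------------------------------------------
# Hedged sketch (not part of the original module): the queue protocol that
# processor() expects. A hypothetical stub workflow stands in for the real
# MEDRank workflow classes; every other constructor argument is passed as
# None because the stub ignores them. 'STOP' is the sentinel processor()
# recognizes.
# ---------------------------------------------------------------------------
def _example_processor_protocol():
    from multiprocessing import Queue

    class _StubWorkflow(object):
        """Hypothetical workflow: records one fake result per article."""
        def __init__(self, *unused_args):
            self.all_results = {}

        def process_article(self, article):
            self.all_results[article] = {'rank': 1}

    in_queue, out_queue = Queue(), Queue()
    in_queue.put('pmid-12345')   # a fake article request
    in_queue.put('STOP')         # tell the worker to shut down
    processor(_StubWorkflow, None, None, None, None, None, None, None,
              None, None, None, None, None, in_queue, out_queue)
    return out_queue.get()       # -> {'pmid-12345': {'rank': 1}}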
def multi_processor(reader,
                    workflow_class,
                    graph_builder_constructor, graph_builder_params,
                    ranker_constructor, ranker_params,
                    eval_parameters,
                    ranking_cutoff,
                    mesh_tree_filename, distance_matrix_filename,
                    distance_function,
                    umls_converter_data_filename,
                    umls_concept_data_filename,
                    extra_data_name, extra_data_contents,
                    output_file,
                    num_processes=None,
                    queue_size=None,
                    output_callback=output,
                    output_headers_callback=output_headers,
                    output_item_callback=output_one_item,
                    performance_tuning=True):
    """Perform the evaluation.

    Multiprocessing notes: it is the responsibility of the caller to make
    sure that extra_data_contents, if any, are multiprocessing-safe - for
    example, by using a SyncManager and Namespace and passing the proxy.
    See umls/concept for an example.
    """
    if num_processes is None:
        num_processes = cpu_count()

    if performance_tuning:
        # Reading the file creates and destroys an awful lot of short-lived
        # objects, so there is no sense garbage-collecting the later
        # generations very often. Raise the gc thresholds to about 10x, 5x,
        # and 5x their usual values.
        original_threshold = gc.get_threshold()
        gc.set_threshold(10 * original_threshold[0],
                         5 * original_threshold[1],
                         5 * original_threshold[1])
        # Similarly, minimize overhead from thread switches by raising the
        # check interval to 5x its usual value.
        original_check_interval = sys.getcheckinterval()
        sys.setcheckinterval(5 * original_check_interval)

    logging.debug("Initializing Concept storage from %s",
                  umls_concept_data_filename)
    if umls_concept_data_filename is None:
        Concept.init_storage()
    else:
        Concept.init_storage(StringDBDict(umls_concept_data_filename))
    Pmid.init_storage()

    proctitle.setproctitle("MEDRank-main")

    processes = []
    logging.info("Creating %d worker processes.", num_processes)
    # CHANGED - one task queue per worker instead of a single shared
    # dispatch queue, to avoid starvation while workers wait on the queue's
    # semlocks.
    task_queues = [Queue(queue_size) for x in xrange(num_processes)]
    # Unbounded output queue if no queue_size was given (2 * None would
    # raise a TypeError).
    this_output_queue = Queue(2 * queue_size if queue_size else 0)
    # Create an output processor
    output_processor = Process(target=output_callback,
                               args=(output_file,
                                     this_output_queue,
                                     output_headers_callback,
                                     output_item_callback))
    output_processor.start()
    for i in xrange(num_processes):
        this_process = Process(target=processor,
                               args=(workflow_class,
                                     graph_builder_constructor,
                                     graph_builder_params,
                                     ranker_constructor,
                                     ranker_params,
                                     eval_parameters,
                                     ranking_cutoff,
                                     mesh_tree_filename,
                                     distance_matrix_filename,
                                     distance_function,
                                     umls_converter_data_filename,
                                     extra_data_name,
                                     extra_data_contents,
                                     task_queues[i],
                                     this_output_queue,
                                     "MEDRank-Worker-%d" % i),
                               name="MEDRank-Worker-%d" % i)
        logging.log(ULTRADEBUG, "Created process: %r", this_process)
        this_process.start()
        processes.append((this_process, this_output_queue, task_queues[i]))

    count = 0
    # Round-robin dispatch across the per-worker queues. (An earlier
    # version picked the lowest-loaded queue by sorting on qsize().)
    for each_article in reader:
        count += 1
        target_process = (count - 1) % num_processes
        logging.info("Dispatching article %d: %s to %s", count,
                     each_article.set_id,
                     processes[target_process][0].name)
        task_queues[target_process].put(each_article)

    logging.log(ULTRADEBUG, "Waiting for processing to end.")
    alive_processes = [x for x in processes if x[0].is_alive()]
    remaining_processes = len(alive_processes)
    logging.info("There are %d processes (out of %d) still alive.",
                 remaining_processes, num_processes)
    for i in xrange(remaining_processes):
        alive_processes[i][2].put('STOP')
        alive_processes[i][2].close()
    logging.debug("Sent STOP requests. Notifying queues that no further "
                  "requests will come.")
    logging.info("All information sent to the processors.")

    # Back to normal gc and thread-switch settings
    if performance_tuning:
        gc.set_threshold(original_threshold[0],
                         original_threshold[1],
                         original_threshold[2])
        sys.setcheckinterval(original_check_interval)

    # Wait for the workers to finish. (Results travel through the output
    # queue to the output processor, so nothing is collected here.)
    while len(processes) > 0:
        a_process = processes.pop()
        a_process[0].join()
    logging.info("Finishing writing out results.")
    this_output_queue.put('STOP')
    output_processor.join()
    logging.info("Results written. Finishing multiprocessing.")
    return
def output(output_file, result_queue, headers_callback=output_headers,
           item_callback=output_one_item, initial_result_set_size=100):
    """Actually dumps the result set to output. Override for easy output
    customization."""
    result_set = {}
    proctitle.setproctitle("MEDRank-output-processor")
    stop_requested = False
    # Gather a few values first, so the full set of columns is known before
    # the headers are emitted.
    logging.log(ULTRADEBUG, "Gathering values for initial analysis.")
    for i in xrange(initial_result_set_size):
        logging.log(ULTRADEBUG, "Getting results %d.", i)
        try:
            request = result_queue.get()
            if request == 'STOP':
                stop_requested = True
                break
            result_set.update(request)
        except KeyboardInterrupt:
            return
        except:
            logging.warn("EXCEPTION RAISED: \n%s", traceback.format_exc())
    logging.log(ULTRADEBUG, "Values gathered. Computing columns.")
    column_names = set([])
    # Add the column names to the CSV
    if headers_callback is not None:
        for result in result_set.itervalues():
            column_names |= result.columns()
    # Create a writer
    column_names = ['pmid'] + [x for x in column_names]
    if headers_callback is not None:
        headers_callback(output_file, column_names)
    logging.log(ULTRADEBUG, "Looping to get more results and output them.")
    while True:
        if not stop_requested:
            try:
                request = result_queue.get()
                if request == 'STOP':
                    stop_requested = True
                else:
                    result_set.update(request)
            except KeyboardInterrupt:
                return
            except:
                logging.warn("EXCEPTION RAISED: \n%s",
                             traceback.format_exc())
        if stop_requested and len(result_set) == 0:
            break
        if len(result_set) == 0:
            # It can happen! We might get no results, or an empty set.
            continue
        pmid = result_set.keys()[0]
        logging.log(ULTRADEBUG, "Output: article %r.", pmid)
        result = result_set[pmid]
        item_callback(output_file, pmid, result, column_names)
        del result_set[pmid]
        try:
            output_file.flush()
        except:
            logging.warn("The output file object does not support flushing.")
        try:
            os.fsync(output_file.fileno())
        except:
            logging.warn("Could not fsync the output file. Traceback "
                         "follows.\n%s", traceback.format_exc())
    return
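
# ---------------------------------------------------------------------------
# Hedged sketch (not part of the original module): a drop-in replacement for
# the item_callback that output() invokes as
# item_callback(output_file, pmid, result, column_names). It assumes the
# result object can be read like a mapping from column name to value, which
# may not match the real result class in this package.
# ---------------------------------------------------------------------------
def _example_tsv_item_callback(output_file, pmid, result, column_names):
    """Write one result as a tab-separated line (illustrative only)."""
    values = [str(pmid)]
    for column in column_names[1:]:    # column_names[0] is 'pmid'
        try:
            values.append(str(result[column]))
        except (KeyError, TypeError):
            values.append('')          # column absent from this result
    output_file.write('\t'.join(values) + '\n')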