def main():
    mrsty_file = sys.argv[3]
    original_filename = sys.argv[2]
    data_store_name = sys.argv[1]
    original_file = Text(bz2.BZ2File(original_filename, 'r'))
    print "Loading semantic types from %s" % mrsty_file
    stypes = SemanticTypes()
    stypes.build_from_mrsty_file(MRSTYTable(bz2.BZ2File(mrsty_file)))
    print "Semantic types loaded."
    print "Turning the data from %s into %s. Please wait." % (
        original_filename, data_store_name)
    data_store = StringDBDict(data_store_name,
                              sync_every_transactions=0,
                              write_out_every_transactions=200000,
                              file_mode='c')
    data_store.sync_every = 0
    build_concept_dictionary(original_file, data_store, stypes)
    data_store.sync_every = 100
    print "Conversion done."
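# A hedged sketch of how this script might be invoked; the script and data
# file names below are placeholders, and only the argument order comes from
# the sys.argv indices used in main() above:
#
#   python build_concept_store.py concepts.db MRCONSO.RRF.bz2 MRSTY.RRF.bz2
#
# i.e. sys.argv[1] is the output data store, sys.argv[2] the compressed
# concept source file, and sys.argv[3] the compressed MRSTY file.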
def __init__(self, reader, graph_builder, ranker, eval_parameters,
             ranking_cutoff, mesh_tree_filename, distance_matrix_filename,
             distance_function, umls_converter_data_filename,
             umls_concept_data_filename, output_file):
    logging.debug("Setting up a Workflow instance.")
    logging.debug("My reader is: %r", reader)
    self._reader = reader
    logging.debug("My graph builder is: %r", graph_builder)
    self._graph_builder = graph_builder
    self._ranker = MappedRanker(ranker)
    logging.debug("My ranker is: %r", self._ranker)
    self._ranking_cutoff = ranking_cutoff
    logging.debug("My ranking cutoff is: %r", self._ranking_cutoff)
    logging.debug("Creating a Tree instance from %s", mesh_tree_filename)
    self._mesh_tree = Tree(mesh_tree_filename)
    logging.debug("Creating SAVCC distance matrix with %r and distance "
                  "function %r", distance_matrix_filename, distance_function)
    self._matrix = SavccNormalizedMatrix(
        open(distance_matrix_filename, "rb"), distance_function)
    logging.debug("Filling in the rest of the evaluation parameters.")
    self._eval_parameters = eval_parameters
    self._eval_parameters.mesh_tree = self._mesh_tree
    self._eval_parameters.savcc_matrix = self._matrix
    logging.debug("My evaluation parameters are: %r", self._eval_parameters)
    if umls_converter_data_filename is None:
        converter_data = None
    else:
        converter_data = pickle.load(open(umls_converter_data_filename, "rb"))
    self._umls_converter = RankedConverter(
        Converter(self._mesh_tree, converter_data))
    logging.debug("My converter is: %r", self._umls_converter)
    logging.debug("Initializing Concept storage from %s",
                  umls_concept_data_filename)
    if umls_concept_data_filename is None:
        Concept.init_storage()
    else:
        Concept.init_storage(StringDBDict(umls_concept_data_filename))
    self._output_file = output_file
    logging.debug("My output file is: %r", self._output_file)
    return
class Tree(object):
    """Describes a tree of MeSH terms. The contents should be tree_node
    objects generated by the build_mesh_tree_file script. The tree contains
    a term name (string)->tree_node mapping."""

    def __init__(self, filename="*&$#$%#", file_mode="r", cachesize=1048576):
        # If the filename isn't specified, use the default one (None has a
        # special meaning, so we can't use it - it means create a temp file)
        if filename == "*&$#$%#":
            filename = _DEFAULT_TREE_DATA
        logging.info("Initializing tree with data from %r", filename)
        self._tree = StringDBDict(persistent_file=filename,
                                  file_mode=file_mode,
                                  cachesize=cachesize)
        self._invlookup = None  # Init the inverse name lookup database lazily
        self._origname = filename
        self.terms = self._tree.keys()
        self.terms.sort()
        # These are for speedy retrieval and indexing
        self._term_list_as_dict = None
        self._search_dict = None
        self.num_terms = len(self.terms)
        return

    def original_filename(self):
        """Returns the original filename of the tree."""
        return self._origname

    def __repr__(self):
        return "<MeSH Semantic tree from %s with %d terms>" % \
            (self._origname, self.num_terms)

    @staticmethod
    def common_root(pos1, pos2):
        """Determines the common dotted root of a pair of tree positions."""
        pos1_split = pos1.split(".")
        pos2_split = pos2.split(".")
        common_terms = []
        for x in zip(pos1_split, pos2_split):
            if x[0] != x[1]:
                break
            common_terms.append(x[0])
        return '.'.join(common_terms)

    def semantic_distance(self, term1, term2):
        """Distance between two nodes, assuming there is a single root node
        for the tree linking all subtrees. Qualifiers and descriptors are
        automatically excluded."""
        node1 = self._tree[term1]
        node2 = self._tree[term2]
        if node1.is_qualifier() or node2.is_qualifier():
            return -1
        if node1.is_descriptor() or node2.is_descriptor():
            return -1
        distance = 999999999999
        for pos1 in node1.position:
            pos1 = '#.%s' % pos1
            for pos2 in node2.position:
                # The extra item in pos1 and pos2 emulates the common root
                # node
                pos2 = '#.%s' % pos2
                root = self.common_root(pos1, pos2)
                rootdots = root.count(".")
                dist_1 = pos1.count(".") - rootdots
                dist_2 = pos2.count(".") - rootdots
                dist = dist_1 + dist_2
                if dist < 0:
                    raise ValueError("Problem: %s<->%s have a negative "
                                     "distance" % (pos1, pos2))
                if dist < distance:
                    distance = dist
        return distance

    def distance(self, term1, term2):
        """Distance between two nodes, assuming no single root node for the
        tree linking all subtrees."""
        # Check for same-treeness
        possible_trees1 = self._tree[term1].get_trees()
        possible_trees2 = self._tree[term2].get_trees()
        combination_thereof = [x in possible_trees2 for x in possible_trees1]
        if True not in combination_thereof:
            return -1
        sd = self.semantic_distance(term1, term2)
        return sd

    def deepest_of_list(self, list_of_terms):
        return max((self._tree[x].deepest_depth(), x)
                   for x in list_of_terms)[1]

    def _init_inverse_lookup(self):
        """Sets up the internal data store to perform reverse lookups."""
        logging.debug("First request of a reverse lookup. Building the "
                      "inverse lookup dictionary.")
        self._invlookup = {}
        for k, items in self._tree.iteritems():
            for item in items.position:
                self._invlookup[item] = k
        logging.log(ULTRADEBUG, "Done building inverse lookup dictionary.")
        return

    def reverse_lookup(self, term):
        """Perform a reverse lookup, after setting up the reverse lookup
        dictionary if necessary."""
        if self._invlookup is None:
            self._init_inverse_lookup()
        try:
            return self._invlookup[term]
        except KeyError:
            raise PositionNotInTree("%s is not a position in this tree."
                                    % term)

    def __getitem__(self, key):
        try:
            return self._tree[key.lower()]
        except KeyError:
            raise TermNotInTree("The term %s is not in the tree %r." %
                                (key, self))

    def eliminate_checktags(self, list_of_terms):
        """Returns a list of terms with the checktags omitted."""
        return [x for x in list_of_terms if x not in checktags]

    def eliminate_descriptors(self, list_of_terms):
        return [x for x in list_of_terms
                if not self._tree[x].is_descriptor()]

    def eliminate_qualifiers(self, list_of_terms):
        return [x for x in list_of_terms if not self[x].is_qualifier()]

    def only_checktags(self, list_of_terms):
        return [x for x in list_of_terms if x in checktags]

    def only_qualifiers(self, list_of_terms):
        return [x for x in list_of_terms if self._tree[x].is_qualifier()]

    def only_descriptors(self, list_of_terms):
        return [x for x in list_of_terms if self._tree[x].is_descriptor()]

    def index(self, term):
        """Returns the index of a term in the sorted term list."""
        if self._term_list_as_dict is None:
            # Precompute all indexes
            logging.debug("Building MeSH tree index.")
            currindex = 0
            self._term_list_as_dict = {}
            for each_term in self.terms:
                self._term_list_as_dict[each_term] = currindex
                for each_synonym in self[each_term].synonyms:
                    self._term_list_as_dict[each_synonym] = currindex
                currindex += 1
        try:
            return self._term_list_as_dict[term]
        except KeyError:
            raise TermNotInTree("Term %s is not a member of tree %r" %
                                (term, self))

    def term_vector(self, list_of_terms):
        """Returns a VocabularyVector representing the list of terms as seen
        by this tree."""
        new_vector = VocabularyVector(self.num_terms)
        for term in list_of_terms:
            try:
                new_vector[self.index(term)] = 1
            except TermNotInTree:
                logging.warn('Weird: term %r could not be found in %r. It '
                             'should be there.', term, self)
        return new_vector

    def _init_search_dict(self):
        """Sets up the internal data store to perform searches."""
        logging.debug("First request of a search. Building the "
                      "search dictionary.")
        self._search_dict = {}
        for k, items in self._tree.iteritems():
            for synonym in items.synonyms:
                if synonym in self._search_dict:
                    self._search_dict[synonym].append(k)
                else:
                    self._search_dict[synonym] = [k]
            if k in self._search_dict:
                self._search_dict[k].append(k)
            else:
                self._search_dict[k] = [k]

    def search(self, term):
        """Searches the tree for a term, looking at synonyms as well as
        keys."""
        if self._search_dict is None:
            self._init_search_dict()
        try:
            result = self._search_dict[term]
        except KeyError:
            return TreeSearchResults([])
        if len(result) == 1:
            return TreeSearchResults(self[result[0]])
        return TreeSearchResults([self[x] for x in result])
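# A minimal usage sketch for the Tree class above, assuming a tree data file
# already built by the build_mesh_tree_file script; the filename and the
# terms 'arm' and 'eye' are illustrative only.
tree = Tree("mesh_tree.db", file_mode="r")
print repr(tree)                            # <MeSH Semantic tree from ... with N terms>
arm_node = tree['arm']                      # __getitem__ lowercases the key
print tree.semantic_distance('arm', 'eye')  # -1 when a term is excluded
results = tree.search('eye')                # matches keys and synonyms
print tree.index('arm')                     # index into the sorted term list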
def __init__(self):
    try:
        self._cache_location = sys.argv[2]
    except IndexError:
        self._cache_location = DEFAULT_CACHE_NAME
    self._cache = StringDBDict(self._cache_location, file_mode="c")
def __init__(self):
    try:
        self._cache_location = sys.argv[2]
    except IndexError:
        self._cache_location = _DEFAULT_CONCEPT_STORAGE
    self._cache = StringDBDict(self._cache_location, file_mode="r")
        chunk = chunk.strip()
        for k, v in tok_bact.iteritems():
            chunk = chunk.replace(k, v)
        if reduce(operator.or_, [x in chunk.lower() for x in useless_lines]):
            continue
        if len(chunk) < 2:
            continue
        outfile.write('%010d|%s\n' % (chunkid, chunk))
        chunkmap[fakename].append(chunkid)
        chunkid += 1

outfile.close()
print "Saving chunkmap"
pickle.dump(chunkmap, open(outmapname, "wb"), pickle.HIGHEST_PROTOCOL)
print "These files couldn't be processed:"
print '\n'.join(skipped)

print "Opening (or creating) cache in", sys.argv[2]
the_cache = StringDBDict(os.path.join(sys.argv[2], DEFAULT_CACHE_NAME),
                         file_mode='c')
PubMed.download_many([str(x) for x in known_articles
                      if str(x) not in the_cache.keys()],
                     download_callback, parser=Medline.RecordParser())

mti_filename = sys.argv[1] + '.mti'
print "Finished processing the cache. Using the cache to build", \
    mti_filename
mti_file = open(mti_filename, "w")
chunkmap = {}
hexfinder = re.compile(r'\\x[a-f0-9][a-f0-9]', re.IGNORECASE)
for article in known_articles:
    try:
        article_record = the_cache[str(article)]
    except KeyError:
        print "Article doesn't exist in cache. Skipping."
        continue
        trees[term] = TreeNode(term, role, synonyms, set(position))
    return trees


if __name__ == "__main__":
    # The pickling and unpickling make this horribly slow, so we'll trade some
    # memory for speed in the build process and later turn the dictionary into
    # a DB-backed one.
    tree_storage = {}
    for treefile in sys.argv[2:]:
        treesfile = bz2.BZ2File(treefile, 'rU')
        print "Reading %s..." % treefile
        tree_storage = build_tree_from_descriptor_file(treesfile, tree_storage)
    print "Tree built. It has %d unique terms." % len(tree_storage)
    print "For example... arm=", tree_storage['arm'], " and eye=", \
        tree_storage['eye']
    print "Done generating tree."
    print "Storing tree in", sys.argv[1]
    tree_on_disk = StringDBDict(persistent_file=sys.argv[1],
                                sync_every_transactions=0,
                                write_out_every_transactions=0,
                                file_mode='c')
    write_counter = 0
    for k, v in tree_storage.iteritems():
        tree_on_disk[k] = v
        write_counter += 1
        if write_counter % 1000 == 0:
            print "Stored", write_counter, "terms."
    tree_on_disk.sync_every = 1
    print "Done storing."
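# A minimal sketch (assumed usage): the file written above is what the Tree
# class expects, e.g. Tree(sys.argv[1]) as used by the Workflow constructor;
# the direct StringDBDict read-back below is illustrative only.
check = StringDBDict(persistent_file=sys.argv[1], file_mode='r')
print "arm =", check['arm']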
def multi_processor(reader,
                    workflow_class,
                    graph_builder_constructor, graph_builder_params,
                    ranker_constructor, ranker_params,
                    eval_parameters,
                    ranking_cutoff,
                    mesh_tree_filename,
                    distance_matrix_filename,
                    distance_function,
                    umls_converter_data_filename,
                    umls_concept_data_filename,
                    extra_data_name,
                    extra_data_contents,
                    output_file,
                    num_threads=None,
                    queue_size=None,
                    output_callback=output,
                    output_headers_callback=output_headers,
                    output_item_callback=output_one_item,
                    performance_tuning=True):
    """Perform the evaluation.

    Multithreading notes: it's the responsibility of the caller to make sure
    that extra_data_contents, if any, are thread-safe.
    """
    if num_threads is None:
        num_threads = 1
    logging.debug("Initializing Concept storage from %s",
                  umls_concept_data_filename)
    # Since there's no direct way of setting the concept cache's title,
    # we set it here, wait for it to be inherited, and then get the 'real'
    # process title for this one.
    if umls_concept_data_filename is None:
        Concept.init_storage()
    else:
        Concept.init_storage(StringDBDict(umls_concept_data_filename))
    Pmid.init_storage()

    threads = []
    logging.info("Creating %d worker threads.", num_threads)
    #task_queue=[JoinableQueue(queue_size) for x in xrange(num_processes)]
    task_queues = [Queue(queue_size) for x in xrange(num_threads)]
    this_output_queue = Queue(2 * queue_size)

    # Create an output processor
    output_processor = Thread(target=output_callback,
                              args=(output_file,
                                    this_output_queue,
                                    output_headers_callback,
                                    output_item_callback))
    output_processor.start()

    for i in xrange(num_threads):
        this_thread = Thread(target=processor,
                             args=(workflow_class,
                                   graph_builder_constructor,
                                   graph_builder_params,
                                   ranker_constructor,
                                   ranker_params,
                                   eval_parameters,
                                   ranking_cutoff,
                                   mesh_tree_filename,
                                   distance_matrix_filename,
                                   distance_function,
                                   umls_converter_data_filename,
                                   extra_data_name,
                                   extra_data_contents,
                                   task_queues[i],
                                   this_output_queue),
                             name="MEDRank-Worker-%d" % i)
        logging.log(ULTRADEBUG, "Created thread: %r", this_thread)
        this_thread.start()
        threads.append((this_thread, this_output_queue, task_queues[i]))

    all_results = {}
    count = 0
    # Use a single dispatch queue for automagical load balancing
    # CHANGED - Now uses multiple queues to avoid starving due to waiting on
    # semlocks
    for each_article in reader:
        count += 1
        #logging.info("Dispatching article %d: %r", count, each_article)
        target_thread = (count - 1) % num_threads
        logging.info("Dispatching article %d: %s to %s", count,
                     each_article.set_id,
                     threads[target_thread][0].name)
        task_queues[target_thread].put(each_article)
        #task_queue[target_process].put(each_article)
        #task_queue.put(each_article)
        #logging.info("The task queue is approximately %d items long.",
        #             task_queue.qsize())

    logging.log(ULTRADEBUG, "Waiting for processing to end.")
    all_results = {}

    alive_threads = [x for x in threads if x[0].is_alive()]
    remaining_threads = len(alive_threads)
    logging.info("There are %d threads (out of %d) still alive.",
                 remaining_threads, num_threads)
    for i in xrange(remaining_threads):
        alive_threads[i][2].put('STOP')
        #alive_threads[i][2].close()
    logging.debug("Sent STOP requests. Notifying queue that no further "
                  "requests will come.")
    logging.info("All information sent to the threads.")

    # Note end of output
    while len(threads) > 0:
        a_thread = threads.pop()
        # We join the thread to wait for the end of the reading
        a_thread[0].join()
        #logging.log(ULTRADEBUG, "Fetching results from finished process.")
        #all_results.update(a_process[1].get())  # Add results to result pool
        #logging.log(ULTRADEBUG, "Received results.")
    logging.info("Finishing writing out results.")
    this_output_queue.put("STOP")
    output_processor.join()
    logging.info("Results written. Finishing multithreading.")
    Pmid.close_storage()
    return
def multi_processor(reader,
                    workflow_class,
                    graph_builder_constructor, graph_builder_params,
                    ranker_constructor, ranker_params,
                    eval_parameters,
                    ranking_cutoff,
                    mesh_tree_filename,
                    distance_matrix_filename,
                    distance_function,
                    umls_converter_data_filename,
                    umls_concept_data_filename,
                    extra_data_name,
                    extra_data_contents,
                    output_file,
                    num_processes=None,
                    queue_size=None,
                    output_callback=output,
                    output_headers_callback=output_headers,
                    output_item_callback=output_one_item,
                    performance_tuning=True):
    """Perform the evaluation.

    Multiprocessing notes: it's the responsibility of the caller to make sure
    that extra_data_contents, if any, are multiprocessing-safe. For example,
    by using a SyncManager and Namespace and passing the proxy. See
    umls/concept for an example.
    """
    if num_processes is None:
        num_processes = cpu_count()

    if performance_tuning:
        # Since reading the file involves an awful lot of object creation and
        # destruction, we'll tweak the gc thresholds to sweep less frequently.
        # IOW - we have a LOT of short-lived objects; there's no sense
        # garbage-collecting the later generations very often.
        # (this is about 10x, 5x, and 5x the usual)
        original_threshold = gc.get_threshold()
        gc.set_threshold(10 * original_threshold[0],
                         5 * original_threshold[1],
                         5 * original_threshold[1])
        original_check_interval = sys.getcheckinterval()
        # Similarly, we'll try to minimize overhead from thread switches
        # (5x the usual value)
        sys.setcheckinterval(5 * original_check_interval)

    logging.debug("Initializing Concept storage from %s",
                  umls_concept_data_filename)
    if umls_concept_data_filename is None:
        Concept.init_storage()
    else:
        Concept.init_storage(StringDBDict(umls_concept_data_filename))
    Pmid.init_storage()
    proctitle.setproctitle("MEDRank-main")

    processes = []
    logging.info("Creating %d worker processes.", num_processes)
    #task_queue=[JoinableQueue(queue_size) for x in xrange(num_processes)]
    task_queues = [Queue(queue_size) for x in xrange(num_processes)]
    this_output_queue = Queue(2 * queue_size)

    # Create an output processor
    output_processor = Process(target=output_callback,
                               args=(output_file,
                                     this_output_queue,
                                     output_headers_callback,
                                     output_item_callback))
    output_processor.start()

    for i in xrange(num_processes):
        this_process = Process(target=processor,
                               args=(workflow_class,
                                     graph_builder_constructor,
                                     graph_builder_params,
                                     ranker_constructor,
                                     ranker_params,
                                     eval_parameters,
                                     ranking_cutoff,
                                     mesh_tree_filename,
                                     distance_matrix_filename,
                                     distance_function,
                                     umls_converter_data_filename,
                                     extra_data_name,
                                     extra_data_contents,
                                     task_queues[i],
                                     this_output_queue,
                                     "MEDRank-Worker-%d" % i),
                               name="MEDRank-Worker-%d" % i)
        logging.log(ULTRADEBUG, "Created process: %r", this_process)
        this_process.start()
        processes.append((this_process, this_output_queue, task_queues[i]))

    all_results = {}
    count = 0
    # Use a single dispatch queue for automagical load balancing
    # CHANGED - Now uses multiple queues to avoid starving due to waiting on
    # semlocks
    for each_article in reader:
        count += 1
        #queues_and_sizes=[(task_queues[x].qsize(), x)
        #                  for x in xrange(num_processes)]
        #queues_and_sizes.sort()
        #target_process=queues_and_sizes[0][1]  # Lowest-loaded process first
        #logging.info("Dispatching article %d: %r", count, each_article)
        target_process = (count - 1) % num_processes
        logging.info("Dispatching article %d: %s to %s", count,
                     each_article.set_id,
                     processes[target_process][0].name)
        task_queues[target_process].put(each_article)
        #task_queue[target_process].put(each_article)
        #task_queue.put(each_article)
        #logging.info("The task queue is approximately %d items long.",
        #             task_queue.qsize())

    logging.log(ULTRADEBUG, "Waiting for processing to end.")
    all_results = {}

    alive_processes = [x for x in processes if x[0].is_alive()]
    remaining_processes = len(alive_processes)
    logging.info("There are %d processes (out of %d) still alive.",
                 remaining_processes, num_processes)
    for i in xrange(remaining_processes):
        alive_processes[i][2].put('STOP')
        alive_processes[i][2].close()
    logging.debug("Sent STOP requests. Notifying queue that no further "
                  "requests will come.")
    logging.info("All information sent to the processors.")

    # Back to normal
    if performance_tuning:
        gc.set_threshold(original_threshold[0],
                         original_threshold[1],
                         original_threshold[2])
        sys.setcheckinterval(original_check_interval)

    # Note end of output
    while len(processes) > 0:
        a_process = processes.pop()
        # We join the process to wait for the end of the reading
        a_process[0].join()
        #logging.log(ULTRADEBUG, "Fetching results from finished process.")
        #all_results.update(a_process[1].get())  # Add results to result pool
        #logging.log(ULTRADEBUG, "Received results.")
    logging.info("Finishing writing out results.")
    this_output_queue.put("STOP")
    output_processor.join()
    logging.info("Results written. Finishing multiprocessing.")
    return