Exemple #1
0
 def __init__(self, CUI):
     """Store the CUI and make sure the class-wide concept storage exists.

     When Concept.__storage is still None, the storage is set up by calling
     Concept.init_storage() with no arguments, and that fact is logged."""
     self.__cui = CUI
     if Concept.__storage is not None:
         # Storage was already initialized; nothing else to do.
         return
     logging.info("Initializing concept storage from default location. "
                  "If this isn't what you want, call Concept.init_storage() "
                  "before allocating a Concept")
     Concept.init_storage()
Exemple #2
0
 def __init__(self, tree, rule_data=None, skip_unknown_concepts=True,
              accepted_types=None):
     """Set up a converter bound to a tree.

     tree is stored as self._tree. rule_data is the conversion data; when
     it is None the data is unpickled from _DEFAULT_CONVERTER_DATA.
     skip_unknown_concepts is stored as self._skip_unknown.
     accepted_types is stored as self._accepted_types; when omitted, a
     new set containing 'a' and 'i' is created for this instance."""
     logging.debug("Creating Converter with tree %r", tree)
     self._tree = tree
     if rule_data is None:
         # Close the data file deterministically instead of leaking the
         # handle until garbage collection.
         with open(_DEFAULT_CONVERTER_DATA, "rb") as rule_file:
             rule_data = pickle.load(rule_file)
         logging.info("Using converter data from %r",
                      _DEFAULT_CONVERTER_DATA)
     self._data = rule_data
     self._extra_checktags = set()
     self._skip_unknown = skip_unknown_concepts
     if accepted_types is None:
         # Build the default per instance: a default written in the
         # signature would be one set object shared by every Converter.
         accepted_types = set(['a', 'i'])
     self._accepted_types = accepted_types
Exemple #3
0
 def __init__(self,
              tree,
              rule_data=None,
              skip_unknown_concepts=True,
              accepted_types=None):
     """Set up a converter bound to a tree.

     tree is stored as self._tree. rule_data is the conversion data; when
     it is None the data is unpickled from _DEFAULT_CONVERTER_DATA.
     skip_unknown_concepts is stored as self._skip_unknown.
     accepted_types is stored as self._accepted_types; when omitted, a
     new set containing 'a' and 'i' is created for this instance."""
     logging.debug("Creating Converter with tree %r", tree)
     self._tree = tree
     if rule_data is None:
         # Close the data file deterministically instead of leaking the
         # handle until garbage collection.
         with open(_DEFAULT_CONVERTER_DATA, "rb") as rule_file:
             rule_data = pickle.load(rule_file)
         logging.info("Using converter data from %r",
                      _DEFAULT_CONVERTER_DATA)
     self._data = rule_data
     self._extra_checktags = set()
     self._skip_unknown = skip_unknown_concepts
     if accepted_types is None:
         # Build the default per instance: a default written in the
         # signature would be one set object shared by every Converter.
         accepted_types = set(['a', 'i'])
     self._accepted_types = accepted_types
Exemple #4
0
 def __init__(self, filename="*&$#$%#", file_mode="r", cachesize=1048576):
     """Open the tree data and build the sorted list of its terms.

     filename uses an odd marker string as its "not specified" default
     because None has a special meaning downstream (create a temp file);
     when the marker is seen, _DEFAULT_TREE_DATA is used instead.
     file_mode and cachesize are passed straight to StringDBDict."""
     data_file = _DEFAULT_TREE_DATA if filename == "*&$#$%#" else filename
     logging.info("Initializing tree with data from %r", data_file)
     self._tree = StringDBDict(persistent_file=data_file,
                               file_mode=file_mode,
                               cachesize=cachesize)
     # The inverse name lookup database is initialized lazily
     self._invlookup = None
     self._origname = data_file
     term_list = self._tree.keys()
     term_list.sort()
     self.terms = term_list
     # These two exist for speedy retrieval and indexing
     self._term_list_as_dict = None
     self._search_dict = None
     self.num_terms = len(term_list)
Exemple #5
0
 def graph_and_rank(self, article):
     """Graph the article, map the graph to a link matrix and rank it.

     Returns whatever self._ranker.evaluate() produces for the matrix.
     Raises CouldNotRank when the matrix is empty or when the ranker
     raises ValueError."""
     link_matrix = self.graph_article(article).as_mapped_link_matrix()
     if not len(link_matrix):
         logging.info("Skipping article %r. It has an empty matrix.",
                      article)
         raise CouldNotRank("Article %r is not rankable." % article)
     try:
         return self._ranker.evaluate(link_matrix)
     except ValueError:
         logging.info("%r returned an exception while ranking %r. Skipping.",
                      self._ranker, article)
         raise CouldNotRank("There was an exception while ranking %r." %
                            article)
 def graph_and_rank(self, item):
     """Graph the item, map the graph to a link matrix and rank it.

     Returns whatever self._ranker.evaluate() produces for the matrix.
     Raises CouldNotRank when the matrix is empty or when the ranker
     raises ValueError."""
     graph = self.graph_item(item)
     logging.log(ULTRADEBUG, "The item graph is %r.", graph)
     link_matrix = graph.as_mapped_link_matrix()
     if not len(link_matrix):
         logging.info("Skipping item %r. It has an empty matrix.", item)
         raise CouldNotRank("Item %r is not rankable." % item)
     try:
         return self._ranker.evaluate(link_matrix)
     except ValueError:
         logging.info("%r returned an exception while ranking %r. Skipping.",
                      self._ranker, item)
         raise CouldNotRank("There was an exception while ranking %r." %
                            item)
Exemple #7
0
 def __init__(self, filename="*&$#$%#", file_mode="r", cachesize=1048576):
     """Open the tree data and build the sorted list of its terms.

     filename uses an odd marker string as its "not specified" default
     because None has a special meaning downstream (create a temp file);
     when the marker is seen, _DEFAULT_TREE_DATA is used instead.
     file_mode and cachesize are passed straight to StringDBDict."""
     data_file = _DEFAULT_TREE_DATA if filename == "*&$#$%#" else filename
     logging.info("Initializing tree with data from %r", data_file)
     self._tree = StringDBDict(persistent_file=data_file,
                               file_mode=file_mode,
                               cachesize=cachesize)
     # The inverse name lookup database is initialized lazily
     self._invlookup = None
     self._origname = data_file
     term_list = self._tree.keys()
     term_list.sort()
     self.terms = term_list
     # These two exist for speedy retrieval and indexing
     self._term_list_as_dict = None
     self._search_dict = None
     self.num_terms = len(term_list)
Exemple #8
0
 def graph_and_rank(self, article):
     """Graph the article, map the graph to a link matrix and rank it.

     Returns whatever self._ranker.evaluate() produces for the matrix.
     Raises CouldNotRank when the matrix is empty or when the ranker
     raises ValueError."""
     link_matrix = self.graph_article(article).as_mapped_link_matrix()
     if not len(link_matrix):
         logging.info("Skipping article %r. It has an empty matrix.",
                      article)
         raise CouldNotRank("Article %r is not rankable." % article)
     try:
         return self._ranker.evaluate(link_matrix)
     except ValueError:
         logging.info("%r returned an exception while ranking %r. Skipping.",
                      self._ranker, article)
         raise CouldNotRank("There was an exception while ranking %r." %
                            article)
Exemple #9
0
 def graph_and_rank(self, item):
     """Graph the item, map the graph to a link matrix and rank it.

     Returns whatever self._ranker.evaluate() produces for the matrix.
     Raises CouldNotRank when the matrix is empty or when the ranker
     raises ValueError."""
     graph = self.graph_item(item)
     logging.log(ULTRADEBUG, "The item graph is %r.", graph)
     link_matrix = graph.as_mapped_link_matrix()
     if not len(link_matrix):
         logging.info("Skipping item %r. It has an empty matrix.", item)
         raise CouldNotRank("Item %r is not rankable." % item)
     try:
         return self._ranker.evaluate(link_matrix)
     except ValueError:
         logging.info("%r returned an exception while ranking %r. Skipping.",
                      self._ranker, item)
         raise CouldNotRank("There was an exception while ranking %r." %
                            item)
Exemple #10
0
 def run(self):
     """Evaluate every article supplied by the reader and output the results.

     Each article that passes include_article() and can be ranked has its
     ranked terms converted, cut at self._ranking_cutoff, flattened and
     evaluated against the article's MEDLINE MeSH headings, once against
     all headings and once against the major headings only (the latter
     stored under the "mh_" prefix). The total recall is added as well.
     Articles that cannot be ranked, or that have no gold-standard
     headings, are skipped. Results are keyed by the article's set_id and
     handed to self.output(), followed by self.output_metadata()."""
     logging.info("Starting workflow %r run", self)
     all_results = {}
     evaluator = self.create_evaluator()
     count = 0
     for each_article in self._reader:
         count += 1
         logging.info("Working on article %d: %r", count, each_article)
         if not self.include_article(each_article):
             logging.log(
                 ULTRADEBUG, "Skipping article %r due to exclusion "
                 " criteria.", each_article)
             continue
         try:
             ranked_article = self.graph_and_rank(each_article)
         except CouldNotRank:
             continue
         converted_terms = self.convert(ranked_article)
         cut_terms = converted_terms.terms_higher_than_or_equal_to(
             self._ranking_cutoff)
         # Logging arguments are evaluated eagerly, so indexing an empty
         # score list here would abort the whole run from a debug message.
         cut_scores = [x[1] for x in cut_terms]
         if cut_scores:
             logging.debug(
                 "Lowest-ranking term is term #%d out of %d"
                 " (score=%1.5f, highest score=%1.5f)", len(cut_terms),
                 len(converted_terms), cut_scores[-1], cut_scores[0])
         else:
             logging.debug(
                 "No terms reached the ranking cutoff (out of %d).",
                 len(converted_terms))
         medline_record_mesh_terms = ExpressionList().from_medline(
             each_article.set_id.article_record().mesh_headings)
         flat_medline = medline_record_mesh_terms.flatten()
         flattened_terms = self.flatten_generated_terms(
             flat_medline, cut_terms)
         flattened_terms = self.limit_length(flat_medline, flattened_terms)
         if len(flat_medline) == 0:
             logging.warning(
                 "No gold standard available for article %r. "
                 "Omitting it from the result set.", each_article)
             continue
         eval_result = self.perform_evaluation(each_article, evaluator,
                                               flat_medline,
                                               flattened_terms)
         flattened_major_headings = \
             medline_record_mesh_terms.major_headings()
         logging.debug("Original headings: %r Major headings: %r",
                       medline_record_mesh_terms, flattened_major_headings)
         mh_result_temp = self.perform_evaluation(each_article, evaluator,
                                                  flattened_major_headings,
                                                  flattened_terms)
         mh_result = NamedResultSet("mh_", mh_result_temp)
         # Compute the total recall, too
         total_recall = self.compute_total_recall(flat_medline,
                                                  converted_terms)
         eval_result.add(total_recall)
         # Unify the result sets
         all_results[each_article.set_id] = eval_result | mh_result
     logging.info("Writing out results.")
     self.output(all_results)
     self.output_metadata()
     return
Exemple #11
0
 def run(self):
     """Evaluate every article supplied by the reader and output the results.

     Each article that passes include_article() and can be ranked has its
     ranked terms converted, cut at self._ranking_cutoff, flattened and
     evaluated against the article's MEDLINE MeSH headings, once against
     all headings and once against the major headings only (the latter
     stored under the "mh_" prefix). The total recall is added as well.
     Articles that cannot be ranked, or that have no gold-standard
     headings, are skipped. Results are keyed by the article's set_id and
     handed to self.output(), followed by self.output_metadata()."""
     logging.info("Starting workflow %r run", self)
     all_results = {}
     evaluator = self.create_evaluator()
     count = 0
     for each_article in self._reader:
         count += 1
         logging.info("Working on article %d: %r", count, each_article)
         if not self.include_article(each_article):
             logging.log(ULTRADEBUG, "Skipping article %r due to exclusion "
                         " criteria.", each_article)
             continue
         try:
             ranked_article = self.graph_and_rank(each_article)
         except CouldNotRank:
             continue
         converted_terms = self.convert(ranked_article)
         cut_terms = converted_terms.terms_higher_than_or_equal_to(
             self._ranking_cutoff)
         # Logging arguments are evaluated eagerly, so indexing an empty
         # score list here would abort the whole run from a debug message.
         cut_scores = [x[1] for x in cut_terms]
         if cut_scores:
             logging.debug("Lowest-ranking term is term #%d out of %d"
                           " (score=%1.5f, highest score=%1.5f)",
                           len(cut_terms), len(converted_terms),
                           cut_scores[-1], cut_scores[0])
         else:
             logging.debug("No terms reached the ranking cutoff (out of %d).",
                           len(converted_terms))
         medline_record_mesh_terms = ExpressionList().from_medline(
             each_article.set_id.article_record().mesh_headings)
         flat_medline = medline_record_mesh_terms.flatten()
         flattened_terms = self.flatten_generated_terms(flat_medline,
                                                        cut_terms)
         flattened_terms = self.limit_length(flat_medline, flattened_terms)
         if len(flat_medline) == 0:
             logging.warning("No gold standard available for article %r. "
                             "Omitting it from the result set.", each_article)
             continue
         eval_result = self.perform_evaluation(each_article,
                                               evaluator,
                                               flat_medline,
                                               flattened_terms)
         flattened_major_headings = \
             medline_record_mesh_terms.major_headings()
         logging.debug("Original headings: %r Major headings: %r",
                       medline_record_mesh_terms,
                       flattened_major_headings)
         mh_result_temp = self.perform_evaluation(each_article, evaluator,
                                                  flattened_major_headings,
                                                  flattened_terms)
         mh_result = NamedResultSet("mh_", mh_result_temp)
         # Compute the total recall, too
         total_recall = self.compute_total_recall(flat_medline,
                                                  converted_terms)
         eval_result.add(total_recall)
         # Unify the result sets
         all_results[each_article.set_id] = eval_result | mh_result
     logging.info("Writing out results.")
     self.output(all_results)
     self.output_metadata()
     return
Exemple #12
0
def processor(workflow_class,
              graph_builder_constructor, graph_builder_params,
              ranker_constructor, ranker_params,
              eval_parameters,
              ranking_cutoff,
              mesh_tree_filename, distance_matrix_filename,
              distance_function,
              umls_converter_data_filename,
              extra_data_name,
              extra_data_contents,
              my_input_queue, my_output_queue,
              my_own_name=None):
    """Worker loop: build one workflow and feed it requests from a queue.

    A workflow_class instance is created from the constructor arguments
    (graph builder, ranker, evaluation parameters, cutoff and data file
    names). If extra_data_name is not None, extra_data_contents is attached
    to the workflow under that attribute name. If my_own_name is given it
    becomes the process title.

    Requests are then read from my_input_queue until the string 'STOP'
    arrives. Each request is passed to the workflow's process_article();
    the workflow's all_results is put on my_output_queue and then reset to
    an empty dict. Requests that raise CouldNotRank are skipped; any other
    exception is logged with its traceback and re-raised.
    """
    logging.info("Setting up worker.")
    if my_own_name is not None:
        proctitle.setproctitle(my_own_name)

    my_workflow = workflow_class(graph_builder_constructor,
                                 graph_builder_params,
                                 ranker_constructor,
                                 ranker_params,
                                 eval_parameters,
                                 ranking_cutoff,
                                 mesh_tree_filename,
                                 distance_matrix_filename,
                                 distance_function,
                                 umls_converter_data_filename)
    if extra_data_name is not None:
        # setattr() also works for classes that don't expose __setattr__
        setattr(my_workflow, extra_data_name, extra_data_contents)
    logging.info("Finished setting up worker process. Waiting for requests.")
    try:
        while True:
            request = my_input_queue.get()
            logging.log(ULTRADEBUG, "Processing request %r", request)
            if request == 'STOP':
                logging.log(ULTRADEBUG, "Received stop request.")
                break
            try:
                my_workflow.process_article(request)
                # Push the results for this article on the output queue
                my_output_queue.put(my_workflow.all_results)
                # Start the next article with a clean result set
                my_workflow.all_results = {}
            except CouldNotRank:
                # The request is dropped, not pushed back into the queue
                logging.info("Skipping unrankable article.")
            except:
                # Deliberately broad: log whatever happened, then let it
                # propagate unchanged.
                logging.warning("EXCEPTION RAISED: \n%s",
                                traceback.format_exc())
                raise
    finally:
        logging.log(ULTRADEBUG, "Returning results to caller.")
        logging.log(ULTRADEBUG, "Ending processor execution.")
    return
Exemple #13
0
def processor(workflow_class,
              graph_builder_constructor,
              graph_builder_params,
              ranker_constructor,
              ranker_params,
              eval_parameters,
              ranking_cutoff,
              mesh_tree_filename,
              distance_matrix_filename,
              distance_function,
              umls_converter_data_filename,
              extra_data_name,
              extra_data_contents,
              my_input_queue,
              my_output_queue,
              my_own_name=None):
    """Worker loop: build one workflow and feed it requests from a queue.

    A workflow_class instance is created from the constructor arguments
    (graph builder, ranker, evaluation parameters, cutoff and data file
    names). If extra_data_name is not None, extra_data_contents is attached
    to the workflow under that attribute name. If my_own_name is given it
    becomes the process title.

    Requests are then read from my_input_queue until the string 'STOP'
    arrives. Each request is passed to the workflow's process_article();
    the workflow's all_results is put on my_output_queue and then reset to
    an empty dict. Requests that raise CouldNotRank are skipped; any other
    exception is logged with its traceback and re-raised.
    """
    logging.info("Setting up worker.")
    if my_own_name is not None:
        proctitle.setproctitle(my_own_name)

    my_workflow = workflow_class(graph_builder_constructor,
                                 graph_builder_params, ranker_constructor,
                                 ranker_params, eval_parameters,
                                 ranking_cutoff, mesh_tree_filename,
                                 distance_matrix_filename, distance_function,
                                 umls_converter_data_filename)
    if extra_data_name is not None:
        # setattr() also works for classes that don't expose __setattr__
        setattr(my_workflow, extra_data_name, extra_data_contents)
    logging.info("Finished setting up worker process. Waiting for requests.")
    try:
        while True:
            request = my_input_queue.get()
            logging.log(ULTRADEBUG, "Processing request %r", request)
            if request == 'STOP':
                logging.log(ULTRADEBUG, "Received stop request.")
                break
            try:
                my_workflow.process_article(request)
                # Push the results for this article on the output queue
                my_output_queue.put(my_workflow.all_results)
                # Start the next article with a clean result set
                my_workflow.all_results = {}
            except CouldNotRank:
                # The request is dropped, not pushed back into the queue
                logging.info("Skipping unrankable article.")
            except:
                # Deliberately broad: log whatever happened, then let it
                # propagate unchanged.
                logging.warning("EXCEPTION RAISED: \n%s",
                                traceback.format_exc())
                raise
    finally:
        logging.log(ULTRADEBUG, "Returning results to caller.")
        logging.log(ULTRADEBUG, "Ending processor execution.")
    return
Exemple #14
0
 def build_idf_from_file(self, file_reader, default_score=None):
     """Fill this mapping with an IDF score for every CUI in file_reader.

     A cache file named after file_reader.original_file.name is tried
     first; if it loads, nothing else is done. Otherwise every article
     with a non-negative PubMed ID is scanned and the number of articles
     each CUI appears in is counted. self[cui] is then set to
     log(count / article_frequency) + 1.0, where count is one more than
     the number of articles scanned, and the result is dumped to the
     cache file.

     default_score is accepted but not used by this implementation."""
     logging.info("Building the term frequency dictionary")
     logging.debug("Checking for a cache file, and loading from it.")
     try:
         self.populate_from_cache(
             self.cache_file_name(file_reader.original_file.name))
     except Exception:
         # Any ordinary failure to load the cache means "rebuild";
         # KeyboardInterrupt and SystemExit are no longer swallowed.
         logging.debug("Nope. Proceeding with building the dictionary.")
     else:
         logging.info("Loaded from cache. It's not necessary to build.")
         return
     article_frequency = {}
     count = 1
     for article in file_reader:
         logging.debug(
             "Processing article %r (number %d) for the term"
             " frequency dictionary", article, count)
         if article.set_id.pmid < 0:
             logging.warning("Article with unknown PubMed ID - skipping")
             continue
         count += 1
         # Each CUI counts once per article, however often it appears
         seen_cuis = set()
         for line in article.lines:
             try:
                 this_cui = line.CUI
             except AttributeError:
                 # Lines without a CUI don't contribute
                 continue
             seen_cuis.add(this_cui)
         for cui in seen_cuis:
             article_frequency[cui] = article_frequency.get(cui, 0) + 1
     logging.debug("Built a dictionary with %d items. Computing IDFs.",
                   len(article_frequency))
     for cui in article_frequency:
         self[cui] = math.log(count / float(article_frequency[cui])) + 1.0
     logging.info("Done building the dictionary. Dumping it to a cache "
                  "file.")
     self.dump_to_cache(self.cache_file_name(
         file_reader.original_file.name))
     return
Exemple #15
0
 def build_idf_from_file(self, file_reader, default_score=None):
     """Fill this mapping with an IDF score for every CUI in file_reader.

     A cache file named after file_reader.original_file.name is tried
     first; if it loads, nothing else is done. Otherwise every article
     with a non-negative PubMed ID is scanned and the number of articles
     each CUI appears in is counted. self[cui] is then set to
     log(count / article_frequency) + 1.0, where count is one more than
     the number of articles scanned, and the result is dumped to the
     cache file.

     default_score is accepted but not used by this implementation."""
     logging.info("Building the term frequency dictionary")
     logging.debug("Checking for a cache file, and loading from it.")
     try:
         self.populate_from_cache(
             self.cache_file_name(file_reader.original_file.name))
     except Exception:
         # Any ordinary failure to load the cache means "rebuild";
         # KeyboardInterrupt and SystemExit are no longer swallowed.
         logging.debug("Nope. Proceeding with building the dictionary.")
     else:
         logging.info("Loaded from cache. It's not necessary to build.")
         return
     article_frequency = {}
     count = 1
     for article in file_reader:
         logging.debug("Processing article %r (number %d) for the term"
                       " frequency dictionary", article, count)
         if article.set_id.pmid < 0:
             logging.warning("Article with unknown PubMed ID - skipping")
             continue
         count += 1
         # Each CUI counts once per article, however often it appears
         seen_cuis = set()
         for line in article.lines:
             try:
                 this_cui = line.CUI
             except AttributeError:
                 # Lines without a CUI don't contribute
                 continue
             seen_cuis.add(this_cui)
         for cui in seen_cuis:
             article_frequency[cui] = article_frequency.get(cui, 0) + 1
     logging.debug("Built a dictionary with %d items. Computing IDFs.",
                   len(article_frequency))
     for cui in article_frequency:
         self[cui] = math.log(count / float(article_frequency[cui])) + 1.0
     logging.info("Done building the dictionary. Dumping it to a cache "
                  "file.")
     self.dump_to_cache(
         self.cache_file_name(file_reader.original_file.name))
     return
 def _generate_normalization_factors(self):
     """Compute the normalization factor of every row of the matrix.

     The factor of row i is the sum of the row's elements and is stored in
     self.normfactors[i]. A row that adds up to 0 (including a row with no
     elements) gets a factor of 1 instead."""
     logging.info("Generating array of normalization factors. This is a "
                  "slow operation. Please wait.")
     for i in xrange(self._height):
         logging.debug("Generating normalization factor for row %d", i)
         matrix_row = self._get_row(i)
         logging.log(ULTRADEBUG, "Row %d contains: %s", i, matrix_row)
         # sum() adds the elements like the reduce() it replaces, but
         # yields 0 for an empty row instead of raising TypeError.
         this_row = sum(matrix_row)

         if this_row == 0.0:
             logging.info("Row %d in the matrix adds up to 0. This may "
             "be a problem, depending on your evaluation function. Since "
             "this is a normalization calculation, it will be replaced by "
             "1.", i)
             this_row = 1.0
         self.normfactors[i] = this_row
         logging.log(ULTRADEBUG, "Normalization factor for row %d=%1.5f", i,
                     this_row)
     logging.info("Normalization factor generation done.")
Exemple #17
0
    def _generate_normalization_factors(self):
        """Compute the normalization factor of every row of the matrix.

        The factor of row i is the sum of the row's elements and is stored
        in self.normfactors[i]. A row that adds up to 0 (including a row
        with no elements) gets a factor of 1 instead."""
        logging.info("Generating array of normalization factors. This is a "
                     "slow operation. Please wait.")
        for i in xrange(self._height):
            logging.debug("Generating normalization factor for row %d", i)
            matrix_row = self._get_row(i)
            logging.log(ULTRADEBUG, "Row %d contains: %s", i, matrix_row)
            # sum() adds the elements like the reduce() it replaces, but
            # yields 0 for an empty row instead of raising TypeError.
            this_row = sum(matrix_row)

            if this_row == 0.0:
                logging.info(
                    "Row %d in the matrix adds up to 0. This may "
                    "be a problem, depending on your evaluation function. Since "
                    "this is a normalization calculation, it will be replaced by "
                    "1.", i)
                this_row = 1.0
            self.normfactors[i] = this_row
            logging.log(ULTRADEBUG, "Normalization factor for row %d=%1.5f", i,
                        this_row)
        logging.info("Normalization factor generation done.")
def multi_processor(reader,
                    workflow_class,
                    graph_builder_constructor, graph_builder_params,
                    ranker_constructor, ranker_params,
                    eval_parameters,
                    ranking_cutoff,
                    mesh_tree_filename, distance_matrix_filename,
                    distance_function,
                    umls_converter_data_filename,
                    umls_concept_data_filename,
                    extra_data_name,
                    extra_data_contents,
                    output_file,
                    num_processes=None,
                    queue_size=None,
                    output_callback=output,
                    output_headers_callback=output_headers,
                    output_item_callback=output_one_item,
                    performance_tuning=True):
    """
    Perform the evaluation, spreading the articles over worker processes.

    Each article produced by reader is handed, round-robin, to one of
    num_processes worker processes running processor(). The workers push
    their results onto a shared output queue, which is consumed by
    output_callback running in a process of its own.

    num_processes defaults to cpu_count(). queue_size is the maximum size
    of each worker's task queue (the output queue gets twice that); None
    asks for no explicit limit. When performance_tuning is true, the
    garbage collector thresholds and the interpreter check interval are
    raised while the articles are dispatched and restored afterwards.

    Multiprocessing notes: It's the responsibility of the caller to make
    sure that extra_data_contents, if any, are multiprocessing-safe. For
    example, by using a SyncManager and Namespace and passing the proxy.
    See umls/concept for an example.
    """
    if num_processes is None:
        num_processes = cpu_count()
    if queue_size is None:
        # The default used to crash on 2*None below. A maxsize of 0 asks
        # the queue for no explicit limit.
        queue_size = 0

    if performance_tuning:
        # Reading the file creates and destroys an awful lot of short-lived
        # objects, so there is no sense in garbage-collecting the later
        # generations very often.
        # (this is about 10x, 5x, and 5x the usual)
        original_threshold = gc.get_threshold()
        gc.set_threshold(10 * original_threshold[0],
                         5 * original_threshold[1],
                         5 * original_threshold[2])
        original_check_interval = sys.getcheckinterval()
        # Similarly, we'll try to minimize overhead from thread switches
        # 5x usual value
        sys.setcheckinterval(5 * original_check_interval)
    logging.debug("Initializing Concept storage from %s",
                  umls_concept_data_filename)

    if umls_concept_data_filename is None:
        Concept.init_storage()
    else:
        Concept.init_storage(StringDBDict(umls_concept_data_filename))
    Pmid.init_storage()

    proctitle.setproctitle("MEDRank-main")

    processes = []
    logging.info("Creating %d worker processes.", num_processes)
    task_queues = [Queue(queue_size) for x in xrange(num_processes)]
    this_output_queue = Queue(2 * queue_size)

    # Create an output processor
    output_processor = Process(target=output_callback,
                               args=(output_file,
                                     this_output_queue,
                                     output_headers_callback,
                                     output_item_callback))
    output_processor.start()

    for i in xrange(num_processes):
        worker_name = "MEDRank-Worker-%d" % i
        this_process = Process(target=processor,
                               args=(workflow_class,
                                     graph_builder_constructor,
                                     graph_builder_params,
                                     ranker_constructor,
                                     ranker_params,
                                     eval_parameters,
                                     ranking_cutoff,
                                     mesh_tree_filename,
                                     distance_matrix_filename,
                                     distance_function,
                                     umls_converter_data_filename,
                                     extra_data_name,
                                     extra_data_contents,
                                     task_queues[i],
                                     this_output_queue,
                                     worker_name),
                               name=worker_name)
        logging.log(ULTRADEBUG, "Created process: %r", this_process)
        this_process.start()
        processes.append((this_process, this_output_queue, task_queues[i]))

    count = 0

    # One task queue per worker (rather than a single shared dispatch
    # queue) to avoid starving due to waiting on semlocks. Articles are
    # dealt out round-robin.
    for each_article in reader:
        count += 1
        target_process = (count - 1) % num_processes
        logging.info("Dispatching article %d: %s to %s",
                     count,
                     each_article.set_id,
                     processes[target_process][0].name)
        task_queues[target_process].put(each_article)

    logging.log(ULTRADEBUG, "Waiting for processing to end.")

    alive_processes = [x for x in processes if x[0].is_alive()]
    remaining_processes = len(alive_processes)

    logging.info("There are %d processes (out of %d) still alive.",
                 remaining_processes,
                 num_processes)
    # Only workers that are still running are told to stop
    for alive_entry in alive_processes:
        alive_entry[2].put('STOP')
        alive_entry[2].close()
    logging.debug("Sent STOP requests. Notifying queue that no further "
                  "requests will come.")

    logging.info("All information sent to the processors.")

    # Back to normal
    if performance_tuning:
        gc.set_threshold(original_threshold[0],
                         original_threshold[1],
                         original_threshold[2])
        sys.setcheckinterval(original_check_interval)

    # Wait for every worker (most recently created first) before telling
    # the output process that no more results are coming.
    while len(processes) > 0:
        a_process = processes.pop()
        a_process[0].join()
    logging.info("Finishing writing out results.")
    this_output_queue.put("STOP")
    output_processor.join()
    logging.info("Results written. Finishing multiprocessing.")
    return
def multi_processor(
    reader,
    workflow_class,
    graph_builder_constructor,
    graph_builder_params,
    ranker_constructor,
    ranker_params,
    eval_parameters,
    ranking_cutoff,
    mesh_tree_filename,
    distance_matrix_filename,
    distance_function,
    umls_converter_data_filename,
    umls_concept_data_filename,
    extra_data_name,
    extra_data_contents,
    output_file,
    num_threads=None,
    queue_size=None,
    output_callback=output,
    output_headers_callback=output_headers,
    output_item_callback=output_one_item,
    performance_tuning=True,
):
    """
    Perform the evaluation with a pool of worker threads.

    Articles obtained by iterating over reader are dealt round-robin to
    num_threads worker threads. Each worker runs processor() with its own
    task queue; all workers share one output queue, which is handed to an
    extra thread running output_callback. The call returns only after every
    worker thread and the output thread have been joined.

    Multithreading notes: It's the responsibility of the caller to make sure
    that extra_data_contents, if any, are thread-safe.

    Parameters:
        reader: iterable of articles. Each article must expose a set_id
            attribute (it is logged when the article is dispatched).
        workflow_class, graph_builder_constructor, graph_builder_params,
        ranker_constructor, ranker_params, eval_parameters, ranking_cutoff,
        mesh_tree_filename, distance_matrix_filename, distance_function,
        umls_converter_data_filename, extra_data_name, extra_data_contents:
            forwarded unchanged, in this order, to every processor() worker.
        umls_concept_data_filename: when not None, Concept storage is
            initialised from StringDBDict(umls_concept_data_filename);
            otherwise Concept.init_storage() is called with no arguments.
        output_file, output_headers_callback, output_item_callback:
            forwarded to output_callback together with the output queue.
        num_threads: number of worker threads; None means 1.
        queue_size: maximum size of each task queue (the output queue gets
            twice that size); None means no limit.
        output_callback: callable executed in the output thread.
        performance_tuning: not used by this implementation.

    Returns:
        None.
    """
    if num_threads is None:
        num_threads = 1
    # The default queue_size of None used to reach "2 * queue_size" below and
    # raise TypeError. Map it to 0, the standard library queues' value for
    # "no size limit".
    if queue_size is None:
        queue_size = 0

    logging.debug("Initializing Concept storage from %s", umls_concept_data_filename)

    if umls_concept_data_filename is None:
        Concept.init_storage()
    else:
        Concept.init_storage(StringDBDict(umls_concept_data_filename))
    Pmid.init_storage()

    threads = []
    logging.info("Creating %d worker threads.", num_threads)
    # One task queue per worker, one output queue shared by all of them.
    task_queues = [Queue(queue_size) for x in xrange(num_threads)]
    this_output_queue = Queue(2 * queue_size)

    # Create an output processor
    output_processor = Thread(
        target=output_callback,
        args=(output_file, this_output_queue, output_headers_callback, output_item_callback),
    )
    output_processor.start()

    for i in xrange(num_threads):
        this_thread = Thread(
            target=processor,
            args=(
                workflow_class,
                graph_builder_constructor,
                graph_builder_params,
                ranker_constructor,
                ranker_params,
                eval_parameters,
                ranking_cutoff,
                mesh_tree_filename,
                distance_matrix_filename,
                distance_function,
                umls_converter_data_filename,
                extra_data_name,
                extra_data_contents,
                task_queues[i],
                this_output_queue,
            ),
            name="MEDRank-Worker-%d" % i,
        )
        logging.log(ULTRADEBUG, "Created thread: %r", this_thread)
        this_thread.start()
        threads.append((this_thread, this_output_queue, task_queues[i]))

    # Round-robin dispatch: article number k goes to worker (k - 1) modulo
    # num_threads. Per-worker queues replaced a single shared queue to avoid
    # starving while waiting on semlocks.
    for count, each_article in enumerate(reader, 1):
        target_thread = (count - 1) % num_threads
        logging.info(
            "Dispatching article %d: %s to %s",
            count,
            each_article.set_id,
            threads[target_thread][0].name,
        )
        task_queues[target_thread].put(each_article)

    logging.log(ULTRADEBUG, "Waiting for processing to end.")

    # Only workers that are still running are sent the "STOP" sentinel.
    alive_threads = [x for x in threads if x[0].is_alive()]
    logging.info("There are %d threads (out of %d) still alive.", len(alive_threads), num_threads)
    for alive_thread in alive_threads:
        alive_thread[2].put("STOP")
    logging.debug("Sent STOP requests. Notifying queue that no further requests will come.")

    logging.info("All information sent to the threads.")

    # Wait for every worker before telling the output thread to stop, so that
    # no result can be queued behind the output queue's "STOP" sentinel.
    while threads:
        threads.pop()[0].join()
    logging.info("Finishing writing out results.")
    this_output_queue.put("STOP")
    output_processor.join()
    logging.info("Results written. Finishing multithreading.")
    Pmid.close_storage()
    return
Exemple #20
0
def multi_processor(reader,
                    workflow_class,
                    graph_builder_constructor,
                    graph_builder_params,
                    ranker_constructor,
                    ranker_params,
                    eval_parameters,
                    ranking_cutoff,
                    mesh_tree_filename,
                    distance_matrix_filename,
                    distance_function,
                    umls_converter_data_filename,
                    umls_concept_data_filename,
                    extra_data_name,
                    extra_data_contents,
                    output_file,
                    num_processes=None,
                    queue_size=None,
                    output_callback=output,
                    output_headers_callback=output_headers,
                    output_item_callback=output_one_item,
                    performance_tuning=True):
    """
    Perform the evaluation with a pool of worker processes.

    Articles obtained by iterating over reader are dealt round-robin to
    num_processes worker processes. Each worker runs processor() with its own
    task queue; all workers share one output queue, which is handed to an
    extra process running output_callback. The call returns only after every
    worker process and the output process have been joined.

    Multiprocessing notes: It's the responsibility of the caller to make sure that
    extra_data_contents, if any, are multiprocessing-safe. For example, by using
    a SyncManager and Namespace and passing the proxy. See umls/concept for an example.

    Parameters:
        reader: iterable of articles. Each article must expose a set_id
            attribute (it is logged when the article is dispatched).
        workflow_class, graph_builder_constructor, graph_builder_params,
        ranker_constructor, ranker_params, eval_parameters, ranking_cutoff,
        mesh_tree_filename, distance_matrix_filename, distance_function,
        umls_converter_data_filename, extra_data_name, extra_data_contents:
            forwarded unchanged, in this order, to every processor() worker,
            followed by the worker's task queue, the output queue and the
            worker's name.
        umls_concept_data_filename: when not None, Concept storage is
            initialised from StringDBDict(umls_concept_data_filename);
            otherwise Concept.init_storage() is called with no arguments.
        output_file, output_headers_callback, output_item_callback:
            forwarded to output_callback together with the output queue.
        num_processes: number of worker processes; None means cpu_count().
        queue_size: maximum size of each task queue (the output queue gets
            twice that size); None means no limit.
        output_callback: callable executed in the output process.
        performance_tuning: when true, the garbage collector thresholds and
            the interpreter check interval are raised while articles are
            being read and dispatched, and restored afterwards.

    Returns:
        None.
    """

    if num_processes is None:
        num_processes = cpu_count()
    # The default queue_size of None used to reach "2 * queue_size" below and
    # raise TypeError. Map it to 0, the standard library queues' value for
    # "no size limit".
    if queue_size is None:
        queue_size = 0

    if performance_tuning:
        # Since reading the file involves an awful lot of object creation
        # and destruction we'll tweak the gc adjustments to sweep less frequently
        # IOW - we have a LOT of short-lived objects. No sense garbage-collecting
        # the latter generations very often.
        # (this is about 10x, 5x, and 5x the usual)
        original_threshold = gc.get_threshold()
        # The third threshold is scaled from its own original value; it was
        # previously derived from original_threshold[1] by mistake.
        gc.set_threshold(10 * original_threshold[0], 5 * original_threshold[1],
                         5 * original_threshold[2])
        original_check_interval = sys.getcheckinterval()
        # Similarly, we'll try to minimize overhead from thread switches
        # 5x usual value
        sys.setcheckinterval(5 * original_check_interval)
    logging.debug("Initializing Concept storage from %s",
                  umls_concept_data_filename)

    if umls_concept_data_filename is None:
        Concept.init_storage()
    else:
        Concept.init_storage(StringDBDict(umls_concept_data_filename))
    Pmid.init_storage()

    proctitle.setproctitle("MEDRank-main")

    processes = []
    logging.info("Creating %d worker processes.", num_processes)
    # One task queue per worker, one output queue shared by all of them.
    task_queues = [Queue(queue_size) for x in xrange(num_processes)]
    this_output_queue = Queue(2 * queue_size)

    # Create an output processor
    output_processor = Process(target=output_callback,
                               args=(output_file, this_output_queue,
                                     output_headers_callback,
                                     output_item_callback))
    output_processor.start()

    for i in xrange(num_processes):
        worker_name = "MEDRank-Worker-%d" % i
        this_process = Process(
            target=processor,
            args=(workflow_class, graph_builder_constructor,
                  graph_builder_params, ranker_constructor, ranker_params,
                  eval_parameters, ranking_cutoff, mesh_tree_filename,
                  distance_matrix_filename, distance_function,
                  umls_converter_data_filename, extra_data_name,
                  extra_data_contents, task_queues[i], this_output_queue,
                  worker_name),
            name=worker_name)
        logging.log(ULTRADEBUG, "Created process: %r", this_process)
        this_process.start()
        processes.append((this_process, this_output_queue, task_queues[i]))

    # Round-robin dispatch: article number k goes to worker (k - 1) modulo
    # num_processes. Per-worker queues replaced a single shared queue to
    # avoid starving while waiting on semlocks.
    for count, each_article in enumerate(reader, 1):
        target_process = (count - 1) % num_processes
        logging.info("Dispatching article %d: %s to %s", count,
                     each_article.set_id, processes[target_process][0].name)
        task_queues[target_process].put(each_article)

    logging.log(ULTRADEBUG, "Waiting for processing to end.")

    # Only workers that are still running are sent the "STOP" sentinel; their
    # task queues are then closed because nothing else will be put on them.
    alive_processes = [x for x in processes if x[0].is_alive()]
    logging.info("There are %d processes (out of %d) still alive.",
                 len(alive_processes), num_processes)
    for alive_process in alive_processes:
        alive_process[2].put('STOP')
        alive_process[2].close()
    logging.debug("Sent STOP requests. Notifying queue that no further "
                  "requests will come.")

    logging.info("All information sent to the processors.")

    # Back to normal
    if performance_tuning:
        gc.set_threshold(original_threshold[0], original_threshold[1],
                         original_threshold[2])
        sys.setcheckinterval(original_check_interval)

    # Wait for every worker before telling the output process to stop, so
    # that no result can be queued behind the output queue's "STOP" sentinel.
    while processes:
        processes.pop()[0].join()
    logging.info("Finishing writing out results.")
    this_output_queue.put("STOP")
    output_processor.join()
    logging.info("Results written. Finishing multiprocessing.")
    return
Exemple #21
0
def multi_processor(reader,
                    workflow_class,
                    graph_builder_constructor, graph_builder_params,
                    ranker_constructor, ranker_params,
                    eval_parameters, 
                    ranking_cutoff,
                    mesh_tree_filename, distance_matrix_filename,
                    distance_function,
                    umls_converter_data_filename,
                    umls_concept_data_filename,
                    extra_data_name,
                    extra_data_contents,
                    output_file,
                    num_threads=None,
                    queue_size=None,
                    output_callback=output,
                    output_headers_callback=output_headers,
                    output_item_callback=output_one_item,
                    performance_tuning=True):
    """
    Perform the evaluation with a pool of worker threads.

    Articles obtained by iterating over reader are dealt round-robin to
    num_threads worker threads. Each worker runs processor() with its own
    task queue; all workers share one output queue, which is handed to an
    extra thread running output_callback. The call returns only after every
    worker thread and the output thread have been joined.

    Multithreading notes: It's the responsibility of the caller to make sure
    that extra_data_contents, if any, are thread-safe.

    Parameters:
        reader: iterable of articles. Each article must expose a set_id
            attribute (it is logged when the article is dispatched).
        workflow_class, graph_builder_constructor, graph_builder_params,
        ranker_constructor, ranker_params, eval_parameters, ranking_cutoff,
        mesh_tree_filename, distance_matrix_filename, distance_function,
        umls_converter_data_filename, extra_data_name, extra_data_contents:
            forwarded unchanged, in this order, to every processor() worker.
        umls_concept_data_filename: when not None, Concept storage is
            initialised from StringDBDict(umls_concept_data_filename);
            otherwise Concept.init_storage() is called with no arguments.
        output_file, output_headers_callback, output_item_callback:
            forwarded to output_callback together with the output queue.
        num_threads: number of worker threads; None means 1.
        queue_size: maximum size of each task queue (the output queue gets
            twice that size); None means no limit.
        output_callback: callable executed in the output thread.
        performance_tuning: not used by this implementation.

    Returns:
        None.
    """
    if num_threads is None:
        num_threads = 1
    # The default queue_size of None used to reach "2*queue_size" below and
    # raise TypeError. Map it to 0, the standard library queues' value for
    # "no size limit".
    if queue_size is None:
        queue_size = 0

    logging.debug("Initializing Concept storage from %s",
                  umls_concept_data_filename)

    if umls_concept_data_filename is None:
        Concept.init_storage()
    else:
        Concept.init_storage(StringDBDict(umls_concept_data_filename))
    Pmid.init_storage()

    threads = []
    logging.info("Creating %d worker threads.", num_threads)
    # One task queue per worker, one output queue shared by all of them.
    task_queues = [Queue(queue_size) for x in xrange(num_threads)]
    this_output_queue = Queue(2 * queue_size)

    # Create an output processor
    output_processor = Thread(target=output_callback,
                              args=(output_file,
                                    this_output_queue,
                                    output_headers_callback,
                                    output_item_callback))
    output_processor.start()

    for i in xrange(num_threads):
        worker_args = (workflow_class,
                       graph_builder_constructor,
                       graph_builder_params,
                       ranker_constructor,
                       ranker_params,
                       eval_parameters,
                       ranking_cutoff,
                       mesh_tree_filename,
                       distance_matrix_filename,
                       distance_function,
                       umls_converter_data_filename,
                       extra_data_name,
                       extra_data_contents,
                       task_queues[i],
                       this_output_queue)
        this_thread = Thread(target=processor, args=worker_args,
                             name="MEDRank-Worker-%d" % i)
        logging.log(ULTRADEBUG, "Created thread: %r", this_thread)
        this_thread.start()
        threads.append((this_thread, this_output_queue, task_queues[i]))

    # Round-robin dispatch: article number k goes to worker (k - 1) modulo
    # num_threads. Per-worker queues replaced a single shared queue to avoid
    # starving while waiting on semlocks.
    for count, each_article in enumerate(reader, 1):
        target_thread = (count - 1) % num_threads
        logging.info("Dispatching article %d: %s to %s",
                     count,
                     each_article.set_id,
                     threads[target_thread][0].name)
        task_queues[target_thread].put(each_article)

    logging.log(ULTRADEBUG, "Waiting for processing to end.")

    # Only workers that are still running are sent the 'STOP' sentinel.
    alive_threads = [x for x in threads if x[0].is_alive()]
    logging.info("There are %d threads (out of %d) still alive.",
                 len(alive_threads),
                 num_threads)
    for alive_thread in alive_threads:
        alive_thread[2].put('STOP')
    logging.debug("Sent STOP requests. Notifying queue that no further "
                  "requests will come.")

    logging.info("All information sent to the threads.")

    # Wait for every worker before telling the output thread to stop, so that
    # no result can be queued behind the output queue's "STOP" sentinel.
    while threads:
        threads.pop()[0].join()
    logging.info("Finishing writing out results.")
    this_output_queue.put("STOP")
    output_processor.join()
    logging.info("Results written. Finishing multithreading.")
    Pmid.close_storage()
    return