Code Example #1
File: summarize_with_nltk.py - Project: YZWD/MEDRank
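The snippet below omits its module-level setup. A minimal sketch of the missing pieces, assuming NLTK's punkt tokenizer provides sentence_detector (Node, Graph, AdirectionalLink, MappedRanker, TextRanker, and the sentence_pairs, sentence_similarity, and cmp_two_nodes_by_id helpers come from elsewhere in MEDRank):

import sys

import nltk.data

# The punkt model supplies the sentence_detector used in main()
sentence_detector = nltk.data.load('tokenizers/punkt/english.pickle')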
def main():
    # Read the whole input file and split it into sentences, stripping
    # surrounding whitespace from each one
    sentences = [s.strip()
                 for s in sentence_detector.tokenize(
                     open(sys.argv[1], 'rU').read().strip())]
    
    # Discard empty sentences
    sentences = [s for s in sentences if len(s) > 0]
    # Create one Node per sentence, with a unique ID based on sequential
    # numbering, the contents of the sentence, and an initial node weight
    # of 1.0
    sentnodes = [Node(x, sentences[x], 1.0) for x in xrange(len(sentences))]
    
    # Create an empty graph
    sentgraph = Graph()

    # Compute the similarity between every pair of sentences and add an
    # undirected link between their nodes to the graph
    for p in sentence_pairs(sentences):
        n1, n2 = sentnodes[p[0]], sentnodes[p[1]]
        sentlink = AdirectionalLink(n1, n2,
                                    sentence_similarity(sentences[p[0]],
                                                        sentences[p[1]]))
        sentgraph.add_relationship(sentlink)
        
    # Create a default TextRanker (that implements TextRank as described) and
    # wrap it in the MappedRanker class, which returns (node, score) pairings
    # instead of just scores
    ranker = MappedRanker(TextRanker())
    # Convert the graph to a link matrix
    matrix = sentgraph.as_mapped_link_matrix()
    # Run the ranker on the matrix
    results = ranker.evaluate(matrix)
    # The ranker returns a RankedResultSet that behaves like a list of
    # (node, score) pairings. By default these are sorted in reverse order,
    # i.e., the highest scores at the beginning. To obtain the desired sentences
    # we just trim the list to size.
    try:
        # Try to read the summary ratio from the command line
        desired_length = int(round(float(sys.argv[2]) * len(sentences)))
    except (IndexError, ValueError):
        # No valid ratio given; default to 20% of the sentences
        desired_length = int(round(float(len(sentences)) * 0.2))

    # For the final output we only need the node, not its score
    shortened_results = [x[0] for x in results]
    # Now we trim it to the desired length
    shortened_results = shortened_results[:desired_length]
    # For presentation purposes, restore the truncated list to its
    # original order
    shortened_results.sort(cmp=cmp_two_nodes_by_id)
    
    # Output the summary as a paragraph
    print ' '.join([x.name for x in shortened_results])
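Usage sketch (the file name and ratio are illustrative): the script takes an input text file as its first argument and, optionally, the fraction of sentences to keep as its second argument, defaulting to 20%:

    python summarize_with_nltk.py article.txt 0.3

This prints roughly 30% of the article's sentences as a single paragraph, restored to their original order.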
Code Example #2
File: workflow.py - Project: YZWD/MEDRank
 def __init__(self, reader, graph_builder, ranker, eval_parameters,
              ranking_cutoff, mesh_tree_filename, distance_matrix_filename,
              distance_function, umls_converter_data_filename,
              umls_concept_data_filename, output_file):
     logging.debug("Setting up a Workflow instance.")
     logging.debug("My reader is: %r", reader)
     self._reader = reader
     logging.debug("My graph builder is: %r", graph_builder)
     self._graph_builder = graph_builder
     self._ranker = MappedRanker(ranker)
     logging.debug("My ranker is: %r", self._ranker)
     self._ranking_cutoff = ranking_cutoff
     logging.debug("My ranking cutoff is: %r", self._ranking_cutoff)
     logging.debug("Creating a Tree instance from %s", mesh_tree_filename)
     self._mesh_tree = Tree(mesh_tree_filename)
     logging.debug("Creating SAVCC distance matrix with %r and distance "
                   "function %r", distance_matrix_filename, distance_function)
     self._matrix = SavccNormalizedMatrix(
         open(distance_matrix_filename, "rb"), distance_function)
     logging.debug("Filling in the rest of the evaluation parameters.")
     self._eval_parameters = eval_parameters
     self._eval_parameters.mesh_tree = self._mesh_tree
     self._eval_parameters.savcc_matrix = self._matrix
     logging.debug("My evaluation parameters are: %r",
                   self._eval_parameters)
     if umls_converter_data_filename is None:
         converter_data = None
     else:
         converter_data = pickle.load(
             open(umls_converter_data_filename, "rb"))
     self._umls_converter = RankedConverter(
         Converter(self._mesh_tree, converter_data))
     logging.debug("My converter is: %r", self._umls_converter)
     logging.debug("Initializing Concept storage from %s",
                   umls_concept_data_filename)
     if umls_concept_data_filename is None:
         Concept.init_storage()
     else:
         Concept.init_storage(StringDBDict(umls_concept_data_filename))
     self._output_file = output_file
     logging.debug("My output file is: %r", self._output_file)
     return
Code Example #3
File: single_item_workflow.py - Project: YZWD/MEDRank
 def __init__(self, graph_builder_constructor, graph_builder_params,
              ranker_constructor, ranker_params, ranking_cutoff):
     logging.debug("Setting up a SingleItemWorkflow instance.")
     logging.debug("My graph builder is: %r", graph_builder_constructor)
     if graph_builder_constructor is not None:
         self._graph_builder = \
             graph_builder_constructor(*graph_builder_params)
     else:
         self._graph_builder = None
     if ranker_constructor is not None:
         self._ranker = MappedRanker(ranker_constructor(*ranker_params))
     else:
         self._ranker = None
     logging.debug("My ranker is: %r", ranker_constructor)
     self._ranking_cutoff = ranking_cutoff
     logging.debug("My ranking cutoff is: %r", self._ranking_cutoff)
     self.all_results = {}
     return
Code Example #4
File: single_item_workflow.py - Project: YZWD/MEDRank
class SingleItemWorkflow(object):
    """Contains a skeleton simple workflow through the system for a single 
    item (i.e. a document, clinical note, article, etc.). It expects a
    graph builder constructor, its parameter set, a ranker constructor, its
    parameter set, and a ranking cutoff. (Otherwise, why are you using MEDRank?)
    
    Parameters:
    graph_builder_constructor: A class that knows how to build a Graph (as in
                               MEDRank.computation.graph.Graph)
    graph_builder_params:   The parameters you want to use to call the 
                            aforementioned constructor.
    ranker_constructor:     A class that knows how to build a Ranker (as in
                            MEDRank.computation.ranker.Ranker) or descendant
    ranker_params: The parameters to pass to THAT constructor
    ranking_cutoff: A float value between 0.0 (no filtering) and 1.0. 
                   Everything below ranking_cutoff gets discarded.
    """
    def __init__(self, graph_builder_constructor, graph_builder_params,
                 ranker_constructor, ranker_params, ranking_cutoff):
        logging.debug("Setting up a SingleItemWorkflow instance.")
        logging.debug("My graph builder is: %r", graph_builder_constructor)
        if graph_builder_constructor is not None:
            self._graph_builder = \
                graph_builder_constructor(*graph_builder_params)
        else:
            self._graph_builder = None
        if ranker_constructor is not None:
            self._ranker = MappedRanker(ranker_constructor(*ranker_params))
        else:
            self._ranker = None
        logging.debug("My ranker is: %r", ranker_constructor)
        self._ranking_cutoff = ranking_cutoff
        logging.debug("My ranking cutoff is: %r", self._ranking_cutoff)
        self.all_results = {}
        return

    def __repr__(self):
        return "<%s instance>" % self.__class__.__name__

    def graph_item(self, item):
        if self._graph_builder is None:
            return None
        return self._graph_builder.create_graph(item)

    def graph_and_rank(self, item):
        """Turn the item into a graph, then a link matrix, and then rank
        it. Returns the ranked list of nodes."""
        item_graph = self.graph_item(item)
        logging.log(ULTRADEBUG, "The item graph is %r.", item_graph)
        item_matrix = item_graph.as_mapped_link_matrix()
        if len(item_matrix) == 0:
            logging.info("Skipping item %r. It has an empty matrix.", item)
            raise CouldNotRank("Item %r is not rankable." % item)
        try:
            ranked_item = self._ranker.evaluate(item_matrix)
        except ValueError:
            logging.info("%r returned an exception while ranking %r. "
                         "Skipping.", self._ranker, item)
            raise CouldNotRank("There was an exception while ranking %r." %
                               item)
        return ranked_item

    def include_item(self, item):
        """Should this item be included in the sample? Return a boolean
        specifying so. Override to customize."""
        return True

    def process_item(self, one_item):
        if not self.include_item(one_item):
            logging.log(ULTRADEBUG, "Skipping item %r due to exclusion "
                        "criteria.", one_item)
            return
        try:
            ranked_item = self.graph_and_rank(one_item)
        except CouldNotRank:
            return
        # Keep only the results at or above the ranking cutoff
        cut_item = [x for x in ranked_item if x[1] >= self._ranking_cutoff]
        # Unify the result sets
        self.all_results[one_item.set_id] = cut_item
        return
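A minimal usage sketch for the class above. MyGraphBuilder and my_item are hypothetical stand-ins; only the constructor signature, process_item(), and all_results come from the class itself, and TextRanker is the ranker seen in Code Example #1:

    # MyGraphBuilder and my_item are placeholders for a real MEDRank graph
    # builder and an input item carrying a set_id attribute
    workflow = SingleItemWorkflow(
        graph_builder_constructor=MyGraphBuilder,
        graph_builder_params=(),
        ranker_constructor=TextRanker,
        ranker_params=(),
        ranking_cutoff=0.1)
    workflow.process_item(my_item)
    # Nodes scoring >= 0.1 now sit in workflow.all_results[my_item.set_id]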
Code Example #5
File: workflow.py - Project: YZWD/MEDRank
class Workflow(object):
    """Contains a basic workflow through the system. The parameters are:
    reader: an instance of an NLMOutput descendant (that knows how to read an
            output file, basically)
    graph_builder: a graph builder
    ranker: a Ranker 
    eval_parameters: Pass an instance of EvaluationParameters, but just fill
                     in the numerical ones. The matrix and tree will be 
                     instantiated and loaded by the Workflow constructor.
    mesh_tree_filename: a filename containing the tree you want to use
    distance_matrix_filename: the distance matrix you wish to use for SAVCC
                              computations
    distance_function:  the distance function used to interpret the distance
                        matrix
    umls_converter_data_filename: a filename pointing to the file built by
                                  the preprocess_checktag_boost_lists.sh 
                                  script
    umls_concept_data_filename: a filename pointing to the file built by the
                                preprocess_umls_mesh_mappings.sh script                            
    output_file: a file object in which you want to place the results of the
                 computation
    Call the run() method to execute it."""
    def __init__(self, reader, graph_builder, ranker, eval_parameters,
                 ranking_cutoff, mesh_tree_filename, distance_matrix_filename,
                 distance_function, umls_converter_data_filename,
                 umls_concept_data_filename, output_file):
        logging.debug("Setting up a Workflow instance.")
        logging.debug("My reader is: %r", reader)
        self._reader = reader
        logging.debug("My graph builder is: %r", graph_builder)
        self._graph_builder = graph_builder
        self._ranker = MappedRanker(ranker)
        logging.debug("My ranker is: %r", self._ranker)
        self._ranking_cutoff = ranking_cutoff
        logging.debug("My ranking cutoff is: %r", self._ranking_cutoff)
        logging.debug("Creating a Tree instance from %s", mesh_tree_filename)
        self._mesh_tree = Tree(mesh_tree_filename)
        logging.debug(
            "Creating SAVCC distance matrix with %r and distance "
            "function %r", distance_matrix_filename, distance_function)
        self._matrix = SavccNormalizedMatrix(
            open(distance_matrix_filename, "rb"), distance_function)
        logging.debug("Filling in the rest of the evaluation parameters.")
        self._eval_parameters = eval_parameters
        self._eval_parameters.mesh_tree = self._mesh_tree
        self._eval_parameters.savcc_matrix = self._matrix
        logging.debug("My evaluation parameters are: %r",
                      self._eval_parameters)
        if umls_converter_data_filename is None:
            converter_data = None
        else:
            converter_data = pickle.load(
                open(umls_converter_data_filename, "rb"))
        self._umls_converter = RankedConverter(
            Converter(self._mesh_tree, converter_data))
        logging.debug("My converter is: %r", self._umls_converter)
        logging.debug("Initializing Concept storage from %s",
                      umls_concept_data_filename)
        if umls_concept_data_filename is None:
            Concept.init_storage()
        else:
            Concept.init_storage(StringDBDict(umls_concept_data_filename))
        self._output_file = output_file
        logging.debug("My output file is: %r", self._output_file)
        return

    def __repr__(self):
        return "<%s instance>" % self.__class__.__name__

    def limit_length(self, gold_standard_terms, generated_terms):
        """Limits the length of the generated terms and returns the new list.
        By default it just truncates the list to the length of the gold 
        standard. Override to customize."""
        return generated_terms[:len(gold_standard_terms)]

    def create_evaluator(self):
        """Creates and returns the evaluator we'll use - override for easy
        customization"""
        return comprehensive(self._eval_parameters)

    def output(self, result_set):
        """Actually dumps the result set to output. Override for easy output
        customization."""
        column_names = set([])
        for result in result_set.itervalues():
            column_names |= result.columns()
        # Create a writer
        column_names = ['pmid'] + [x for x in column_names]
        output_writer = DictWriter(self._output_file, fieldnames=column_names)
        # Add the colnames to the csv
        output_writer.writer.writerow(column_names)
        for pmid, result in result_set.iteritems():
            outdict = result.as_dict()
            # Convert the PMID to a string; this is harmless for any
            # datatype, but forces a Pmid() to display just the actual
            # number
            outdict['pmid'] = str(pmid)
            output_writer.writerow(outdict)
        return

    def output_metadata(self):
        """Exports a sidecar file (with the creative extension .metadata) that
        describes the evaluation. Override for easy customization."""
        metadata_filename = self._output_file.name + '.metadata'
        data = []
        for potential_var, potential_var_value in self.__dict__.iteritems():
            if potential_var[0] == '_' and potential_var[1] != '_':
                data.append('%s=%r' % (potential_var[1:], potential_var_value))
        meta_out = open(metadata_filename, 'w')
        meta_out.write("[MEDRank_metadata]\n")
        meta_out.write('\n'.join(data))
        meta_out.close()

    def convert(self, terms_to_convert):
        """Override for easy customization"""
        return self._umls_converter.convert(terms_to_convert)

    def graph_article(self, article):
        if self._graph_builder is None:
            return None
        return self._graph_builder.create_graph(article)

    def graph_and_rank(self, article):
        """Turn the article into a graph, then a link matrix, and then rank
        it. Returns the ranked list of nodes."""
        article_graph = self.graph_article(article)
        article_matrix = article_graph.as_mapped_link_matrix()
        if len(article_matrix) == 0:
            logging.info("Skipping article %r. It has an empty matrix.",
                         article)
            raise CouldNotRank("Article %r is not rankable." % article)
        try:
            ranked_article = self._ranker.evaluate(article_matrix)
        except ValueError:
            logging.info(
                "%r returned an exception while ranking %r. "
                "Skipping.", self._ranker, article)
            raise CouldNotRank("There was an exception while ranking %r." %
                               article)
        return ranked_article

    def flatten_generated_terms(self, gold_standard_terms, generated_terms):
        """Flatten without any further preprocessing - this may be desirable 
        if, for example, all terms after the pagerank cutoff should be 
        considered equivalent. This may not always be the case."""
        return generated_terms.as_ExpressionList().flatten()

    def perform_evaluation(self, article, evaluator, flat_medline,
                           flattened_terms):
        results = evaluator.evaluate(flat_medline, flattened_terms)
        # Get the size of the LinkMatrix - we'll have to build the graph again
        # but only if we have something to build with.
        article_graph = self.graph_article(article)
        if article_graph is not None:
            results.update(article_graph.compute_measures())
        if self._graph_builder is not None:
            results.update(self._graph_builder.measurements)
        return results

    def include_article(self, article):
        """Should this article be included in the sample? Return a boolean
        specifying so. Override to customize."""
        return True

    def compute_total_recall(self, flat_gold_standard, converted_terms):
        """Computes the Total Recall of an article."""
        flat_converted = converted_terms.as_ExpressionList().flatten()
        tr = TotalRecall().evaluate(flat_gold_standard, flat_converted)
        return tr

    def run(self):
        """Perform the evaluation"""
        logging.info("Starting workflow %r run", self)
        all_results = {}
        evaluator = self.create_evaluator()
        count = 0
        for each_article in self._reader:
            count += 1
            logging.info("Working on article %d: %r", count, each_article)
            if not self.include_article(each_article):
                logging.log(
                    ULTRADEBUG, "Skipping article %r due to exclusion "
                    "criteria.", each_article)
                continue
            try:
                ranked_article = self.graph_and_rank(each_article)
            except CouldNotRank:
                continue
            converted_terms = self.convert(ranked_article)
            cut_terms = converted_terms.terms_higher_than_or_equal_to(
                self._ranking_cutoff)
            logging.debug(
                "Lowest-ranking term is term #%d out of %d"
                " (score=%1.5f, highest score=%1.5f)", len(cut_terms),
                len(converted_terms), [x[1] for x in cut_terms][-1],
                [x[1] for x in cut_terms][0])
            medline_record_mesh_terms = ExpressionList().from_medline(
                each_article.set_id.article_record().mesh_headings)
            flat_medline = medline_record_mesh_terms.flatten()
            flattened_terms = self.flatten_generated_terms(
                flat_medline, cut_terms)
            flattened_terms = self.limit_length(flat_medline, flattened_terms)
            if len(flat_medline) == 0:
                logging.warn(
                    "No gold standard available for article %r. "
                    "Omitting it from the result set.", each_article)
                continue
            eval_result = self.perform_evaluation(each_article, evaluator,
                                                  flat_medline,
                                                  flattened_terms)
            flattened_major_headings = \
                medline_record_mesh_terms.major_headings()
            logging.debug("Original headings: %r Major headings: %r",
                          medline_record_mesh_terms, flattened_major_headings)
            mh_result_temp = self.perform_evaluation(each_article, evaluator,
                                                     flattened_major_headings,
                                                     flattened_terms)
            mh_result = NamedResultSet("mh_", mh_result_temp)
            # Compute the total recall, too
            total_recall = self.compute_total_recall(flat_medline,
                                                     converted_terms)
            eval_result.add(total_recall)
            # Unify the result sets
            all_results[each_article.set_id] = eval_result | mh_result
        logging.info("Writing out results.")
        self.output(all_results)
        self.output_metadata()
        return
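A construction sketch for the class above; every argument value is a placeholder (only the constructor signature and run() come from the class):

    workflow = Workflow(
        reader=my_nlm_output_reader,         # an NLMOutput descendant
        graph_builder=my_graph_builder,
        ranker=my_ranker,                    # wrapped in MappedRanker internally
        eval_parameters=my_eval_parameters,  # numerical fields pre-filled
        ranking_cutoff=0.1,
        mesh_tree_filename='mesh_tree.txt',
        distance_matrix_filename='distance_matrix.bin',
        distance_function=my_distance_function,
        umls_converter_data_filename='converter_data.pickle',
        umls_concept_data_filename='concept_data.db',
        output_file=open('results.csv', 'wb'))
    workflow.run()

run() iterates the reader, ranks each article, evaluates the results against the article's MEDLINE MeSH headings, and writes one CSV row per article plus a sidecar results.csv.metadata file beginning with a [MEDRank_metadata] line followed by name=value pairs for the workflow's attributes.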