def main():
    """Print an extractive summary of the text file named in sys.argv[1].

    The text is split into sentences, every pair of sentences is linked in
    a graph weighted by their similarity, the graph is ranked with TextRank,
    and the top-ranked sentences are printed in their original order.

    sys.argv[2], when present and usable as a float, is the fraction of
    sentences to keep; otherwise 20% of the sentences are kept.
    """
    # Read the whole document; the context manager guarantees the handle is
    # released even if reading fails.
    with open(sys.argv[1], 'rU') as source:
        text = source.read().strip()

    # Strip surrounding whitespace from every sentence and drop empty ones
    stripped = [s.strip() for s in sentence_detector.tokenize(text)]
    sentences = [s for s in stripped if len(s) > 0]

    # Create one Node per sentence, with a unique ID based on sequential
    # numbering, the contents of the sentence, and an initial node weight
    # of 1.0
    sentnodes = [Node(position, sentence, 1.0)
                 for position, sentence in enumerate(sentences)]

    # Compute the similarity between every pair of sentences and add a link
    # to the (initially empty) graph connecting those nodes.
    sentgraph = Graph()
    for p in sentence_pairs(sentences):
        n1, n2 = sentnodes[p[0]], sentnodes[p[1]]
        similarity = sentence_similarity(sentences[p[0]], sentences[p[1]])
        sentgraph.add_relationship(AdirectionalLink(n1, n2, similarity))

    # Create a default TextRanker (that implements TextRank as described)
    # and wrap it in the MappedRanker class, which returns (node, score)
    # pairings instead of just scores
    ranker = MappedRanker(TextRanker())
    # Convert the graph to a link matrix and run the ranker on it
    matrix = sentgraph.as_mapped_link_matrix()
    results = ranker.evaluate(matrix)

    # The ranker returns a RankedResultSet that behaves like a list of
    # (node, score) pairings. By default these are sorted in reverse order,
    # i.e., the highest scores at the beginning. To obtain the desired
    # sentences we just trim the list to size.
    try:
        # Try to get a float from the command line
        desired_length = int(round(float(sys.argv[2]) * len(sentences)))
    except (IndexError, ValueError, OverflowError):
        # Missing argument, not a number, or nan/inf: fall back to 20%.
        # Only these errors are caught so that Ctrl-C and real bugs are
        # not silently swallowed.
        desired_length = int(round(float(len(sentences)) * 0.2))

    # For the final output we only need the node, not its score; then we
    # trim the list to the desired length
    shortened_results = [x[0] for x in results][:desired_length]
    # Now, for presentation purposes, we reorder the truncated list in its
    # original order.
    shortened_results.sort(cmp=cmp_two_nodes_by_id)
    # Output the summary as a paragraph (parenthesised form prints the same
    # text under the Python 2 print statement)
    print(' '.join([x.name for x in shortened_results]))
def main():
    """Print an extractive summary of the text file named in sys.argv[1].

    The text is split into sentences, every pair of sentences is linked in
    a graph weighted by their similarity, the graph is ranked with TextRank,
    and the top-ranked sentences are printed in their original order.

    sys.argv[2], when present and usable as a float, is the fraction of
    sentences to keep; otherwise 20% of the sentences are kept.
    """
    # Read the whole document; the context manager guarantees the handle is
    # released even if reading fails.
    with open(sys.argv[1], 'rU') as source:
        text = source.read().strip()

    # Strip surrounding whitespace from every sentence and drop empty ones
    stripped = [s.strip() for s in sentence_detector.tokenize(text)]
    sentences = [s for s in stripped if len(s) > 0]

    # Create one Node per sentence, with a unique ID based on sequential
    # numbering, the contents of the sentence, and an initial node weight
    # of 1.0
    sentnodes = [Node(position, sentence, 1.0)
                 for position, sentence in enumerate(sentences)]

    # Compute the similarity between every pair of sentences and add a link
    # to the (initially empty) graph connecting those nodes.
    sentgraph = Graph()
    for p in sentence_pairs(sentences):
        n1, n2 = sentnodes[p[0]], sentnodes[p[1]]
        similarity = sentence_similarity(sentences[p[0]], sentences[p[1]])
        sentgraph.add_relationship(AdirectionalLink(n1, n2, similarity))

    # Create a default TextRanker (that implements TextRank as described)
    # and wrap it in the MappedRanker class, which returns (node, score)
    # pairings instead of just scores
    ranker = MappedRanker(TextRanker())
    # Convert the graph to a link matrix and run the ranker on it
    matrix = sentgraph.as_mapped_link_matrix()
    results = ranker.evaluate(matrix)

    # The ranker returns a RankedResultSet that behaves like a list of
    # (node, score) pairings. By default these are sorted in reverse order,
    # i.e., the highest scores at the beginning. To obtain the desired
    # sentences we just trim the list to size.
    try:
        # Try to get a float from the command line
        desired_length = int(round(float(sys.argv[2]) * len(sentences)))
    except (IndexError, ValueError, OverflowError):
        # Missing argument, not a number, or nan/inf: fall back to 20%.
        # Only these errors are caught so that Ctrl-C and real bugs are
        # not silently swallowed.
        desired_length = int(round(float(len(sentences)) * 0.2))

    # For the final output we only need the node, not its score; then we
    # trim the list to the desired length
    shortened_results = [x[0] for x in results][:desired_length]
    # Now, for presentation purposes, we reorder the truncated list in its
    # original order.
    shortened_results.sort(cmp=cmp_two_nodes_by_id)
    # Output the summary as a paragraph (parenthesised form prints the same
    # text under the Python 2 print statement)
    print(' '.join([x.name for x in shortened_results]))
def __init__(self, graph_builder_constructor, graph_builder_params,
             ranker_constructor, ranker_params, ranking_cutoff):
    """Instantiate the graph builder and ranker for a single-item workflow.

    graph_builder_constructor is called with *graph_builder_params and
    ranker_constructor with *ranker_params (the ranker is then wrapped in a
    MappedRanker). Either constructor may be None, in which case the
    corresponding attribute is left as None. ranking_cutoff is stored
    as-is, and all_results starts out as an empty dictionary.
    """
    logging.debug("Setting up a SingleItemWorkflow instance.")
    logging.debug("My graph builder is: %r", graph_builder_constructor)

    builder = None
    if graph_builder_constructor is not None:
        builder = graph_builder_constructor(*graph_builder_params)
    self._graph_builder = builder

    wrapped_ranker = None
    if ranker_constructor is not None:
        wrapped_ranker = MappedRanker(ranker_constructor(*ranker_params))
    self._ranker = wrapped_ranker
    logging.debug("My ranker is: %r", ranker_constructor)

    self._ranking_cutoff = ranking_cutoff
    logging.debug("My ranking cutoff is: %r", self._ranking_cutoff)

    # Results accumulate here, keyed by each processed item's set_id
    self.all_results = {}
def __init__(self, reader, graph_builder, ranker, eval_parameters,
             ranking_cutoff, mesh_tree_filename, distance_matrix_filename,
             distance_function, umls_converter_data_filename,
             umls_concept_data_filename, output_file):
    """Store the workflow components and load the supporting data files.

    The ranker is wrapped in a MappedRanker. A Tree is built from
    mesh_tree_filename and a SavccNormalizedMatrix from
    distance_matrix_filename; both are then attached to eval_parameters
    (which is modified in place). umls_converter_data_filename may be
    None, in which case the Converter receives None as its data.
    umls_concept_data_filename may be None, in which case Concept storage
    is initialized with its defaults.
    """
    logging.debug("Setting up a Workflow instance.")
    logging.debug("My reader is: %r", reader)
    self._reader = reader
    logging.debug("My graph builder is: %r", graph_builder)
    self._graph_builder = graph_builder
    self._ranker = MappedRanker(ranker)
    logging.debug("My ranker is: %r", self._ranker)
    self._ranking_cutoff = ranking_cutoff
    logging.debug("My ranking cutoff is: %r", self._ranking_cutoff)

    logging.debug("Creating a Tree instance from %s", mesh_tree_filename)
    self._mesh_tree = Tree(mesh_tree_filename)
    logging.debug("Creating SAVCC distance matrix with %r and distance "
                  "function %r", distance_matrix_filename,
                  distance_function)
    # NOTE(review): this handle is passed on still open and is not closed
    # here, because it is not visible whether SavccNormalizedMatrix keeps
    # reading from it after construction - confirm before closing it.
    self._matrix = SavccNormalizedMatrix(
        open(distance_matrix_filename, "rb"), distance_function)

    logging.debug("Filling in the rest of the evaluation parameters.")
    self._eval_parameters = eval_parameters
    self._eval_parameters.mesh_tree = self._mesh_tree
    self._eval_parameters.savcc_matrix = self._matrix
    logging.debug("My evaluation parameters are: %r",
                  self._eval_parameters)

    if umls_converter_data_filename is None:
        converter_data = None
    else:
        # NOTE(review): pickle.load can execute arbitrary code; this file
        # must come from a trusted preprocessing step.
        # The handle is closed as soon as the data has been loaded.
        with open(umls_converter_data_filename, "rb") as converter_file:
            converter_data = pickle.load(converter_file)
    self._umls_converter = RankedConverter(
        Converter(self._mesh_tree, converter_data))
    logging.debug("My converter is: %r", self._umls_converter)

    logging.debug("Initializing Concept storage from %s",
                  umls_concept_data_filename)
    if umls_concept_data_filename is None:
        Concept.init_storage()
    else:
        Concept.init_storage(StringDBDict(umls_concept_data_filename))

    self._output_file = output_file
    logging.debug("My output file is: %r", self._output_file)
def __init__(self, reader, graph_builder, ranker, eval_parameters,
             ranking_cutoff, mesh_tree_filename, distance_matrix_filename,
             distance_function, umls_converter_data_filename,
             umls_concept_data_filename, output_file):
    """Store the workflow components and load the supporting data files.

    The ranker is wrapped in a MappedRanker. A Tree is built from
    mesh_tree_filename and a SavccNormalizedMatrix from
    distance_matrix_filename; both are then attached to eval_parameters
    (which is modified in place). umls_converter_data_filename may be
    None, in which case the Converter receives None as its data.
    umls_concept_data_filename may be None, in which case Concept storage
    is initialized with its defaults.
    """
    logging.debug("Setting up a Workflow instance.")
    logging.debug("My reader is: %r", reader)
    self._reader = reader
    logging.debug("My graph builder is: %r", graph_builder)
    self._graph_builder = graph_builder
    self._ranker = MappedRanker(ranker)
    logging.debug("My ranker is: %r", self._ranker)
    self._ranking_cutoff = ranking_cutoff
    logging.debug("My ranking cutoff is: %r", self._ranking_cutoff)

    logging.debug("Creating a Tree instance from %s", mesh_tree_filename)
    self._mesh_tree = Tree(mesh_tree_filename)
    logging.debug("Creating SAVCC distance matrix with %r and distance "
                  "function %r", distance_matrix_filename,
                  distance_function)
    # NOTE(review): this handle is passed on still open and is not closed
    # here, because it is not visible whether SavccNormalizedMatrix keeps
    # reading from it after construction - confirm before closing it.
    self._matrix = SavccNormalizedMatrix(
        open(distance_matrix_filename, "rb"), distance_function)

    logging.debug("Filling in the rest of the evaluation parameters.")
    self._eval_parameters = eval_parameters
    self._eval_parameters.mesh_tree = self._mesh_tree
    self._eval_parameters.savcc_matrix = self._matrix
    logging.debug("My evaluation parameters are: %r",
                  self._eval_parameters)

    if umls_converter_data_filename is None:
        converter_data = None
    else:
        # NOTE(review): pickle.load can execute arbitrary code; this file
        # must come from a trusted preprocessing step.
        # The handle is closed as soon as the data has been loaded.
        with open(umls_converter_data_filename, "rb") as converter_file:
            converter_data = pickle.load(converter_file)
    self._umls_converter = RankedConverter(
        Converter(self._mesh_tree, converter_data))
    logging.debug("My converter is: %r", self._umls_converter)

    logging.debug("Initializing Concept storage from %s",
                  umls_concept_data_filename)
    if umls_concept_data_filename is None:
        Concept.init_storage()
    else:
        Concept.init_storage(StringDBDict(umls_concept_data_filename))

    self._output_file = output_file
    logging.debug("My output file is: %r", self._output_file)
def __init__(self, graph_builder_constructor, graph_builder_params,
             ranker_constructor, ranker_params, ranking_cutoff):
    """Instantiate the graph builder and ranker for a single-item workflow.

    graph_builder_constructor is called with *graph_builder_params and
    ranker_constructor with *ranker_params (the ranker is then wrapped in a
    MappedRanker). Either constructor may be None, in which case the
    corresponding attribute is left as None. ranking_cutoff is stored
    as-is, and all_results starts out as an empty dictionary.
    """
    logging.debug("Setting up a SingleItemWorkflow instance.")
    logging.debug("My graph builder is: %r", graph_builder_constructor)

    builder = None
    if graph_builder_constructor is not None:
        builder = graph_builder_constructor(*graph_builder_params)
    self._graph_builder = builder

    wrapped_ranker = None
    if ranker_constructor is not None:
        wrapped_ranker = MappedRanker(ranker_constructor(*ranker_params))
    self._ranker = wrapped_ranker
    logging.debug("My ranker is: %r", ranker_constructor)

    self._ranking_cutoff = ranking_cutoff
    logging.debug("My ranking cutoff is: %r", self._ranking_cutoff)

    # Results accumulate here, keyed by each processed item's set_id
    self.all_results = {}
class SingleItemWorkflow(object):
    """Skeleton of a simple workflow through the system for a single item
    (i.e. a document, clinical note, article, etc.).

    It expects a graph builder constructor, its parameter set, a ranker
    constructor, its parameter set, and a ranking cutoff. (Otherwise, why
    are you using MEDRank?)

    Parameters:
      graph_builder_constructor: A class that knows how to build a Graph
                                 (as in MEDRank.computation.graph.Graph)
      graph_builder_params: The parameters you want to use to call the
                            aforementioned constructor.
      ranker_constructor: A class that knows how to build a Ranker (as in
                          MEDRank.computation.ranker.Ranker) or descendant
      ranker_params: The parameters to pass to THAT constructor
      ranking_cutoff: A float value between 0.0 (no filtering) and 1.0.
                      Everything below ranking_cutoff gets discarded.
    """

    def __init__(self, graph_builder_constructor, graph_builder_params,
                 ranker_constructor, ranker_params, ranking_cutoff):
        """Instantiate the graph builder and the (MappedRanker-wrapped)
        ranker; a None constructor leaves the matching attribute None."""
        logging.debug("Setting up a SingleItemWorkflow instance.")
        logging.debug("My graph builder is: %r", graph_builder_constructor)

        builder = None
        if graph_builder_constructor is not None:
            builder = graph_builder_constructor(*graph_builder_params)
        self._graph_builder = builder

        wrapped_ranker = None
        if ranker_constructor is not None:
            wrapped_ranker = MappedRanker(
                ranker_constructor(*ranker_params))
        self._ranker = wrapped_ranker
        logging.debug("My ranker is: %r", ranker_constructor)

        self._ranking_cutoff = ranking_cutoff
        logging.debug("My ranking cutoff is: %r", self._ranking_cutoff)

        # Results accumulate here, keyed by each processed item's set_id
        self.all_results = {}

    def __repr__(self):
        class_name = self.__class__.__name__
        return "<%s instance>" % class_name

    def graph_item(self, item):
        """Build the graph for item, or return None without a builder."""
        builder = self._graph_builder
        if builder is not None:
            return builder.create_graph(item)
        return None

    def graph_and_rank(self, item):
        """Turn the item into a graph, then a link matrix, and then rank
        it. Returns the ranked list of nodes.

        Raises CouldNotRank when the link matrix is empty or when the
        ranker raises ValueError."""
        graph = self.graph_item(item)
        logging.log(ULTRADEBUG, "The item graph is %r.", graph)
        link_matrix = graph.as_mapped_link_matrix()
        if len(link_matrix) == 0:
            logging.info("Skipping item %r. It has an empty matrix.", item)
            raise CouldNotRank("Item %r is not rankable." % item)
        try:
            return self._ranker.evaluate(link_matrix)
        except ValueError:
            logging.info("%r returned an exception while ranking %r. "
                         "Skipping.", self._ranker, item)
            raise CouldNotRank("There was an exception while ranking %r."
                               % item)

    def include_item(self, item):
        """Should this item be included in the sample? Return a boolean
        specifying so. Override to customize."""
        return True

    def process_item(self, one_item):
        """Rank one_item and store the nodes that reach the ranking cutoff
        in all_results under one_item.set_id. Excluded or unrankable items
        are skipped without storing anything."""
        if not self.include_item(one_item):
            logging.log(ULTRADEBUG, "Skipping item %r due to exclusion "
                        " criteria.", one_item)
            return
        try:
            ranking = self.graph_and_rank(one_item)
        except CouldNotRank:
            return
        # Keep only the (node, score) pairs at or above the cutoff
        survivors = []
        for entry in ranking:
            if entry[1] >= self._ranking_cutoff:
                survivors.append(entry)
        self.all_results[one_item.set_id] = survivors
class Workflow(object):
    """Contains a basic workflow through the system.

    The parameters are:
      reader: an instance of an NLMOutput descendant (that knows how to read
              an output file, basically)
      graph_builder: a graph builder
      ranker: a Ranker
      eval_parameters: Pass an instance of EvaluationParameters, but just
                       fill in the numerical ones. The matrix and tree will
                       be instantiated and loaded by the Workflow
                       constructor.
      ranking_cutoff: converted terms scoring below this value are dropped
                      before evaluation
      mesh_tree_filename: a filename containing the tree you want to use
      distance_matrix_filename: the distance matrix you wish to use for
                                SAVCC computations
      distance_function: the distance function used to interpret the
                         distance matrix
      umls_converter_data_filename: a filename pointing to the file built by
                                    the preprocess_checktag_boost_lists.sh
                                    script
      umls_concept_data_filename: a filename pointing to the file built by
                                  the preprocess_umls_mesh_mappings.sh
                                  script
      output_file: a file object in which you want to place the results of
                   the computation

    Call the run() method to execute it."""

    def __init__(self, reader, graph_builder, ranker, eval_parameters,
                 ranking_cutoff, mesh_tree_filename,
                 distance_matrix_filename, distance_function,
                 umls_converter_data_filename, umls_concept_data_filename,
                 output_file):
        logging.debug("Setting up a Workflow instance.")
        logging.debug("My reader is: %r", reader)
        self._reader = reader
        logging.debug("My graph builder is: %r", graph_builder)
        self._graph_builder = graph_builder
        self._ranker = MappedRanker(ranker)
        logging.debug("My ranker is: %r", self._ranker)
        self._ranking_cutoff = ranking_cutoff
        logging.debug("My ranking cutoff is: %r", self._ranking_cutoff)

        logging.debug("Creating a Tree instance from %s",
                      mesh_tree_filename)
        self._mesh_tree = Tree(mesh_tree_filename)
        logging.debug("Creating SAVCC distance matrix with %r and distance "
                      "function %r", distance_matrix_filename,
                      distance_function)
        # NOTE(review): this handle is passed on still open and is not
        # closed here, because it is not visible whether
        # SavccNormalizedMatrix keeps reading from it - confirm.
        self._matrix = SavccNormalizedMatrix(
            open(distance_matrix_filename, "rb"), distance_function)

        logging.debug("Filling in the rest of the evaluation parameters.")
        self._eval_parameters = eval_parameters
        self._eval_parameters.mesh_tree = self._mesh_tree
        self._eval_parameters.savcc_matrix = self._matrix
        logging.debug("My evaluation parameters are: %r",
                      self._eval_parameters)

        if umls_converter_data_filename is None:
            converter_data = None
        else:
            # NOTE(review): pickle.load can execute arbitrary code; this
            # file must come from a trusted preprocessing step.
            # The handle is closed as soon as the data has been loaded.
            with open(umls_converter_data_filename, "rb") as converter_file:
                converter_data = pickle.load(converter_file)
        self._umls_converter = RankedConverter(
            Converter(self._mesh_tree, converter_data))
        logging.debug("My converter is: %r", self._umls_converter)

        logging.debug("Initializing Concept storage from %s",
                      umls_concept_data_filename)
        if umls_concept_data_filename is None:
            Concept.init_storage()
        else:
            Concept.init_storage(StringDBDict(umls_concept_data_filename))

        self._output_file = output_file
        logging.debug("My output file is: %r", self._output_file)

    def __repr__(self):
        return "<%s instance>" % self.__class__.__name__

    def limit_length(self, gold_standard_terms, generated_terms):
        """Limits the length of the generated terms and returns the new
        list. By default it just truncates the list to the length of the
        gold standard. Override to customize."""
        return generated_terms[:len(gold_standard_terms)]

    def create_evaluator(self):
        """Creates and returns the evaluator we'll use - override for easy
        customization"""
        return comprehensive(self._eval_parameters)

    def output(self, result_set):
        """Actually dumps the result set to output as CSV: a header row,
        then one row per entry of result_set (a mapping of PMID to result).
        Override for easy output customization."""
        # The header is the union of the columns of every result
        column_names = set()
        for result in result_set.values():
            column_names |= result.columns()
        # Create a writer
        column_names = ['pmid'] + list(column_names)
        output_writer = DictWriter(self._output_file,
                                   fieldnames=column_names)
        # Add the colnames to the csv
        output_writer.writer.writerow(column_names)
        for pmid, result in result_set.items():
            outdict = result.as_dict()
            # Convert the PMID to string, which will be harmless if it's
            # any datatype but force it to display just the actual number
            # if it's a Pmid()
            outdict['pmid'] = str(pmid)
            output_writer.writerow(outdict)

    def output_metadata(self):
        """Exports a sidecar file (with the creative extension .metadata)
        that describes the evaluation: one name=repr(value) line for every
        instance attribute whose name starts with exactly one underscore.
        Override for easy customization."""
        metadata_filename = self._output_file.name + '.metadata'
        data = []
        # Sorted so that the sidecar file is reproducible between runs
        for name in sorted(self.__dict__):
            # The length check keeps an attribute named just "_" from
            # raising IndexError on name[1]
            if len(name) > 1 and name[0] == '_' and name[1] != '_':
                data.append('%s=%r' % (name[1:], self.__dict__[name]))
        # The context manager closes the file even if a write fails
        with open(metadata_filename, 'w') as meta_out:
            meta_out.write("[MEDRank_metadata]\n")
            meta_out.write('\n'.join(data))

    def convert(self, terms_to_convert):
        """Override for easy customization"""
        return self._umls_converter.convert(terms_to_convert)

    def graph_article(self, article):
        """Build the graph for article, or return None when there is no
        graph builder."""
        if self._graph_builder is None:
            return None
        return self._graph_builder.create_graph(article)

    def graph_and_rank(self, article):
        """Turn the article into a graph, then a link matrix, and then rank
        it. Returns the ranked list of nodes.

        Raises CouldNotRank when the link matrix is empty or when the
        ranker raises ValueError."""
        article_graph = self.graph_article(article)
        article_matrix = article_graph.as_mapped_link_matrix()
        if len(article_matrix) == 0:
            logging.info("Skipping article %r. It has an empty matrix.",
                         article)
            raise CouldNotRank("Article %r is not rankable." % article)
        try:
            ranked_article = self._ranker.evaluate(article_matrix)
        except ValueError:
            logging.info("%r returned an exception while ranking %r. "
                         "Skipping.", self._ranker, article)
            raise CouldNotRank("There was an exception while ranking %r."
                               % article)
        return ranked_article

    def flatten_generated_terms(self, gold_standard_terms, generated_terms):
        """Flatten without any further preprocessing - this may be
        desirable if, for example, all terms after the pagerank cutoff
        should be considered equivalent. This may not always be the
        case."""
        return generated_terms.as_ExpressionList().flatten()

    def perform_evaluation(self, article, evaluator, flat_medline,
                           flattened_terms):
        """Evaluate flattened_terms against flat_medline and add the graph
        and graph builder measurements to the returned results."""
        results = evaluator.evaluate(flat_medline, flattened_terms)
        # Get the size of the LinkMatrix - we'll have to build the graph
        # again but only if we have something to build with.
        article_graph = self.graph_article(article)
        if article_graph is not None:
            results.update(article_graph.compute_measures())
        if self._graph_builder is not None:
            results.update(self._graph_builder.measurements)
        return results

    def include_article(self, article):
        """Should this article be included in the sample? Return a boolean
        specifying so. Override to customize."""
        return True

    def compute_total_recall(self, flat_gold_standard, converted_terms):
        """Computes the Total Recall of an article."""
        flat_converted = converted_terms.as_ExpressionList().flatten()
        return TotalRecall().evaluate(flat_gold_standard, flat_converted)

    def run(self):
        """Perform the evaluation: rank, convert and evaluate every article
        from the reader, then write the results and the metadata file."""
        logging.info("Starting workflow %r run", self)
        all_results = {}
        evaluator = self.create_evaluator()
        count = 0
        for each_article in self._reader:
            count += 1
            logging.info("Working on article %d: %r", count, each_article)
            if not self.include_article(each_article):
                logging.log(ULTRADEBUG,
                            "Skipping article %r due to exclusion "
                            " criteria.", each_article)
                continue
            try:
                ranked_article = self.graph_and_rank(each_article)
            except CouldNotRank:
                continue
            converted_terms = self.convert(ranked_article)
            cut_terms = converted_terms.terms_higher_than_or_equal_to(
                self._ranking_cutoff)
            # Only index into the scores when at least one term survived
            # the cutoff; the log arguments are evaluated even when debug
            # logging is off, so an empty list used to raise IndexError.
            if len(cut_terms) > 0:
                scores = [x[1] for x in cut_terms]
                logging.debug("Lowest-ranking term is term #%d out of %d"
                              " (score=%1.5f, highest score=%1.5f)",
                              len(cut_terms), len(converted_terms),
                              scores[-1], scores[0])
            else:
                logging.debug("None of the %d terms reached the ranking "
                              "cutoff %r", len(converted_terms),
                              self._ranking_cutoff)
            medline_record_mesh_terms = ExpressionList().from_medline(
                each_article.set_id.article_record().mesh_headings)
            flat_medline = medline_record_mesh_terms.flatten()
            flattened_terms = self.flatten_generated_terms(flat_medline,
                                                           cut_terms)
            flattened_terms = self.limit_length(flat_medline,
                                                flattened_terms)
            if len(flat_medline) == 0:
                logging.warning("No gold standard available for article "
                                "%r. Omitting it from the result set.",
                                each_article)
                continue
            eval_result = self.perform_evaluation(each_article, evaluator,
                                                  flat_medline,
                                                  flattened_terms)
            flattened_major_headings = \
                medline_record_mesh_terms.major_headings()
            logging.debug("Original headings: %r Major headings: %r",
                          medline_record_mesh_terms,
                          flattened_major_headings)
            mh_result_temp = self.perform_evaluation(
                each_article, evaluator, flattened_major_headings,
                flattened_terms)
            mh_result = NamedResultSet("mh_", mh_result_temp)
            # Compute the total recall, too
            total_recall = self.compute_total_recall(flat_medline,
                                                     converted_terms)
            eval_result.add(total_recall)
            # Unify the result sets
            all_results[each_article.set_id] = eval_result | mh_result
        logging.info("Writing out results.")
        self.output(all_results)
        self.output_metadata()
class Workflow(object):
    """Contains a basic workflow through the system.

    The parameters are:
      reader: an instance of an NLMOutput descendant (that knows how to read
              an output file, basically)
      graph_builder: a graph builder
      ranker: a Ranker
      eval_parameters: Pass an instance of EvaluationParameters, but just
                       fill in the numerical ones. The matrix and tree will
                       be instantiated and loaded by the Workflow
                       constructor.
      ranking_cutoff: converted terms scoring below this value are dropped
                      before evaluation
      mesh_tree_filename: a filename containing the tree you want to use
      distance_matrix_filename: the distance matrix you wish to use for
                                SAVCC computations
      distance_function: the distance function used to interpret the
                         distance matrix
      umls_converter_data_filename: a filename pointing to the file built by
                                    the preprocess_checktag_boost_lists.sh
                                    script
      umls_concept_data_filename: a filename pointing to the file built by
                                  the preprocess_umls_mesh_mappings.sh
                                  script
      output_file: a file object in which you want to place the results of
                   the computation

    Call the run() method to execute it."""

    def __init__(self, reader, graph_builder, ranker, eval_parameters,
                 ranking_cutoff, mesh_tree_filename,
                 distance_matrix_filename, distance_function,
                 umls_converter_data_filename, umls_concept_data_filename,
                 output_file):
        logging.debug("Setting up a Workflow instance.")
        logging.debug("My reader is: %r", reader)
        self._reader = reader
        logging.debug("My graph builder is: %r", graph_builder)
        self._graph_builder = graph_builder
        self._ranker = MappedRanker(ranker)
        logging.debug("My ranker is: %r", self._ranker)
        self._ranking_cutoff = ranking_cutoff
        logging.debug("My ranking cutoff is: %r", self._ranking_cutoff)

        logging.debug("Creating a Tree instance from %s",
                      mesh_tree_filename)
        self._mesh_tree = Tree(mesh_tree_filename)
        logging.debug("Creating SAVCC distance matrix with %r and distance "
                      "function %r", distance_matrix_filename,
                      distance_function)
        # NOTE(review): this handle is passed on still open and is not
        # closed here, because it is not visible whether
        # SavccNormalizedMatrix keeps reading from it - confirm.
        self._matrix = SavccNormalizedMatrix(
            open(distance_matrix_filename, "rb"), distance_function)

        logging.debug("Filling in the rest of the evaluation parameters.")
        self._eval_parameters = eval_parameters
        self._eval_parameters.mesh_tree = self._mesh_tree
        self._eval_parameters.savcc_matrix = self._matrix
        logging.debug("My evaluation parameters are: %r",
                      self._eval_parameters)

        if umls_converter_data_filename is None:
            converter_data = None
        else:
            # NOTE(review): pickle.load can execute arbitrary code; this
            # file must come from a trusted preprocessing step.
            # The handle is closed as soon as the data has been loaded.
            with open(umls_converter_data_filename, "rb") as converter_file:
                converter_data = pickle.load(converter_file)
        self._umls_converter = RankedConverter(
            Converter(self._mesh_tree, converter_data))
        logging.debug("My converter is: %r", self._umls_converter)

        logging.debug("Initializing Concept storage from %s",
                      umls_concept_data_filename)
        if umls_concept_data_filename is None:
            Concept.init_storage()
        else:
            Concept.init_storage(StringDBDict(umls_concept_data_filename))

        self._output_file = output_file
        logging.debug("My output file is: %r", self._output_file)

    def __repr__(self):
        return "<%s instance>" % self.__class__.__name__

    def limit_length(self, gold_standard_terms, generated_terms):
        """Limits the length of the generated terms and returns the new
        list. By default it just truncates the list to the length of the
        gold standard. Override to customize."""
        return generated_terms[:len(gold_standard_terms)]

    def create_evaluator(self):
        """Creates and returns the evaluator we'll use - override for easy
        customization"""
        return comprehensive(self._eval_parameters)

    def output(self, result_set):
        """Actually dumps the result set to output as CSV: a header row,
        then one row per entry of result_set (a mapping of PMID to result).
        Override for easy output customization."""
        # The header is the union of the columns of every result
        column_names = set()
        for result in result_set.values():
            column_names |= result.columns()
        # Create a writer
        column_names = ['pmid'] + list(column_names)
        output_writer = DictWriter(self._output_file,
                                   fieldnames=column_names)
        # Add the colnames to the csv
        output_writer.writer.writerow(column_names)
        for pmid, result in result_set.items():
            outdict = result.as_dict()
            # Convert the PMID to string, which will be harmless if it's
            # any datatype but force it to display just the actual number
            # if it's a Pmid()
            outdict['pmid'] = str(pmid)
            output_writer.writerow(outdict)

    def output_metadata(self):
        """Exports a sidecar file (with the creative extension .metadata)
        that describes the evaluation: one name=repr(value) line for every
        instance attribute whose name starts with exactly one underscore.
        Override for easy customization."""
        metadata_filename = self._output_file.name + '.metadata'
        data = []
        # Sorted so that the sidecar file is reproducible between runs
        for name in sorted(self.__dict__):
            # The length check keeps an attribute named just "_" from
            # raising IndexError on name[1]
            if len(name) > 1 and name[0] == '_' and name[1] != '_':
                data.append('%s=%r' % (name[1:], self.__dict__[name]))
        # The context manager closes the file even if a write fails
        with open(metadata_filename, 'w') as meta_out:
            meta_out.write("[MEDRank_metadata]\n")
            meta_out.write('\n'.join(data))

    def convert(self, terms_to_convert):
        """Override for easy customization"""
        return self._umls_converter.convert(terms_to_convert)

    def graph_article(self, article):
        """Build the graph for article, or return None when there is no
        graph builder."""
        if self._graph_builder is None:
            return None
        return self._graph_builder.create_graph(article)

    def graph_and_rank(self, article):
        """Turn the article into a graph, then a link matrix, and then rank
        it. Returns the ranked list of nodes.

        Raises CouldNotRank when the link matrix is empty or when the
        ranker raises ValueError."""
        article_graph = self.graph_article(article)
        article_matrix = article_graph.as_mapped_link_matrix()
        if len(article_matrix) == 0:
            logging.info("Skipping article %r. It has an empty matrix.",
                         article)
            raise CouldNotRank("Article %r is not rankable." % article)
        try:
            ranked_article = self._ranker.evaluate(article_matrix)
        except ValueError:
            logging.info("%r returned an exception while ranking %r. "
                         "Skipping.", self._ranker, article)
            raise CouldNotRank("There was an exception while ranking %r."
                               % article)
        return ranked_article

    def flatten_generated_terms(self, gold_standard_terms, generated_terms):
        """Flatten without any further preprocessing - this may be
        desirable if, for example, all terms after the pagerank cutoff
        should be considered equivalent. This may not always be the
        case."""
        return generated_terms.as_ExpressionList().flatten()

    def perform_evaluation(self, article, evaluator, flat_medline,
                           flattened_terms):
        """Evaluate flattened_terms against flat_medline and add the graph
        and graph builder measurements to the returned results."""
        results = evaluator.evaluate(flat_medline, flattened_terms)
        # Get the size of the LinkMatrix - we'll have to build the graph
        # again but only if we have something to build with.
        article_graph = self.graph_article(article)
        if article_graph is not None:
            results.update(article_graph.compute_measures())
        if self._graph_builder is not None:
            results.update(self._graph_builder.measurements)
        return results

    def include_article(self, article):
        """Should this article be included in the sample? Return a boolean
        specifying so. Override to customize."""
        return True

    def compute_total_recall(self, flat_gold_standard, converted_terms):
        """Computes the Total Recall of an article."""
        flat_converted = converted_terms.as_ExpressionList().flatten()
        return TotalRecall().evaluate(flat_gold_standard, flat_converted)

    def run(self):
        """Perform the evaluation: rank, convert and evaluate every article
        from the reader, then write the results and the metadata file."""
        logging.info("Starting workflow %r run", self)
        all_results = {}
        evaluator = self.create_evaluator()
        count = 0
        for each_article in self._reader:
            count += 1
            logging.info("Working on article %d: %r", count, each_article)
            if not self.include_article(each_article):
                logging.log(ULTRADEBUG,
                            "Skipping article %r due to exclusion "
                            " criteria.", each_article)
                continue
            try:
                ranked_article = self.graph_and_rank(each_article)
            except CouldNotRank:
                continue
            converted_terms = self.convert(ranked_article)
            cut_terms = converted_terms.terms_higher_than_or_equal_to(
                self._ranking_cutoff)
            # Only index into the scores when at least one term survived
            # the cutoff; the log arguments are evaluated even when debug
            # logging is off, so an empty list used to raise IndexError.
            if len(cut_terms) > 0:
                scores = [x[1] for x in cut_terms]
                logging.debug("Lowest-ranking term is term #%d out of %d"
                              " (score=%1.5f, highest score=%1.5f)",
                              len(cut_terms), len(converted_terms),
                              scores[-1], scores[0])
            else:
                logging.debug("None of the %d terms reached the ranking "
                              "cutoff %r", len(converted_terms),
                              self._ranking_cutoff)
            medline_record_mesh_terms = ExpressionList().from_medline(
                each_article.set_id.article_record().mesh_headings)
            flat_medline = medline_record_mesh_terms.flatten()
            flattened_terms = self.flatten_generated_terms(flat_medline,
                                                           cut_terms)
            flattened_terms = self.limit_length(flat_medline,
                                                flattened_terms)
            if len(flat_medline) == 0:
                logging.warning("No gold standard available for article "
                                "%r. Omitting it from the result set.",
                                each_article)
                continue
            eval_result = self.perform_evaluation(each_article, evaluator,
                                                  flat_medline,
                                                  flattened_terms)
            flattened_major_headings = \
                medline_record_mesh_terms.major_headings()
            logging.debug("Original headings: %r Major headings: %r",
                          medline_record_mesh_terms,
                          flattened_major_headings)
            mh_result_temp = self.perform_evaluation(
                each_article, evaluator, flattened_major_headings,
                flattened_terms)
            mh_result = NamedResultSet("mh_", mh_result_temp)
            # Compute the total recall, too
            total_recall = self.compute_total_recall(flat_medline,
                                                     converted_terms)
            eval_result.add(total_recall)
            # Unify the result sets
            all_results[each_article.set_id] = eval_result | mh_result
        logging.info("Writing out results.")
        self.output(all_results)
        self.output_metadata()
class SingleItemWorkflow(object):
    """Skeleton of a simple workflow through the system for a single item
    (i.e. a document, clinical note, article, etc.).

    It expects a graph builder constructor, its parameter set, a ranker
    constructor, its parameter set, and a ranking cutoff. (Otherwise, why
    are you using MEDRank?)

    Parameters:
      graph_builder_constructor: A class that knows how to build a Graph
                                 (as in MEDRank.computation.graph.Graph)
      graph_builder_params: The parameters you want to use to call the
                            aforementioned constructor.
      ranker_constructor: A class that knows how to build a Ranker (as in
                          MEDRank.computation.ranker.Ranker) or descendant
      ranker_params: The parameters to pass to THAT constructor
      ranking_cutoff: A float value between 0.0 (no filtering) and 1.0.
                      Everything below ranking_cutoff gets discarded.
    """

    def __init__(self, graph_builder_constructor, graph_builder_params,
                 ranker_constructor, ranker_params, ranking_cutoff):
        """Instantiate the graph builder and the (MappedRanker-wrapped)
        ranker; a None constructor leaves the matching attribute None."""
        logging.debug("Setting up a SingleItemWorkflow instance.")
        logging.debug("My graph builder is: %r", graph_builder_constructor)

        builder = None
        if graph_builder_constructor is not None:
            builder = graph_builder_constructor(*graph_builder_params)
        self._graph_builder = builder

        wrapped_ranker = None
        if ranker_constructor is not None:
            wrapped_ranker = MappedRanker(
                ranker_constructor(*ranker_params))
        self._ranker = wrapped_ranker
        logging.debug("My ranker is: %r", ranker_constructor)

        self._ranking_cutoff = ranking_cutoff
        logging.debug("My ranking cutoff is: %r", self._ranking_cutoff)

        # Results accumulate here, keyed by each processed item's set_id
        self.all_results = {}

    def __repr__(self):
        class_name = self.__class__.__name__
        return "<%s instance>" % class_name

    def graph_item(self, item):
        """Build the graph for item, or return None without a builder."""
        builder = self._graph_builder
        if builder is not None:
            return builder.create_graph(item)
        return None

    def graph_and_rank(self, item):
        """Turn the item into a graph, then a link matrix, and then rank
        it. Returns the ranked list of nodes.

        Raises CouldNotRank when the link matrix is empty or when the
        ranker raises ValueError."""
        graph = self.graph_item(item)
        logging.log(ULTRADEBUG, "The item graph is %r.", graph)
        link_matrix = graph.as_mapped_link_matrix()
        if len(link_matrix) == 0:
            logging.info("Skipping item %r. It has an empty matrix.", item)
            raise CouldNotRank("Item %r is not rankable." % item)
        try:
            return self._ranker.evaluate(link_matrix)
        except ValueError:
            logging.info("%r returned an exception while ranking %r. "
                         "Skipping.", self._ranker, item)
            raise CouldNotRank("There was an exception while ranking %r."
                               % item)

    def include_item(self, item):
        """Should this item be included in the sample? Return a boolean
        specifying so. Override to customize."""
        return True

    def process_item(self, one_item):
        """Rank one_item and store the nodes that reach the ranking cutoff
        in all_results under one_item.set_id. Excluded or unrankable items
        are skipped without storing anything."""
        if not self.include_item(one_item):
            logging.log(ULTRADEBUG, "Skipping item %r due to exclusion "
                        " criteria.", one_item)
            return
        try:
            ranking = self.graph_and_rank(one_item)
        except CouldNotRank:
            return
        # Keep only the (node, score) pairs at or above the cutoff
        survivors = []
        for entry in ranking:
            if entry[1] >= self._ranking_cutoff:
                survivors.append(entry)
        self.all_results[one_item.set_id] = survivors