def _evaluate(self, gold_standard, seen_terms):
    """Compute the SAVCC similarity between two sets of terms.

    Both term sets are projected into vocabulary-vector space; the seen
    vector is blended with its matrix-weighted counterpart before the
    cosine-style ratio is taken against the gold-standard vector.
    """
    logging.debug('Gold standard=%s Seen terms=%s alpha=%1.5f',
                  gold_standard, seen_terms, self._alpha)
    gold_vec = self._my_tree.term_vector(gold_standard)
    observed_vec = self._my_tree.term_vector(seen_terms)
    # Blend: (alpha * I2) + ((1 - alpha) * M * I2)
    weighted = self._my_matrix.mult_by_vector(observed_vec)
    blended = observed_vec.scale(self._alpha) + \
              weighted.scale(1 - self._alpha)
    logging.log(ULTRADEBUG, "Modified term=%r", blended)
    # Numerator: I1 . blended; denominator: product of vector lengths.
    numerator = gold_vec.dot(blended)
    denominator = gold_vec.length() * blended.length()
    try:
        result = numerator / denominator
    except ZeroDivisionError:
        # A zero-length vector on either side yields similarity 0.
        logging.warn(
            "ZeroDivisionError when computing SAVCC for %r and %r:",
            gold_standard, seen_terms)
        result = 0
    logging.log(ULTRADEBUG,
                "Numerator=%1.7f Denominator=%1.7f Result=%1.7f",
                numerator, denominator, result)
    return result
def __iter__(self):
    # Generator that groups consecutive NLMOutput lines into per-article
    # line lists, using the chunkmap to translate each line's line_id
    # into a PubMed ID. Yields one self._lines_type(set_id, lines) per
    # contiguous group.
    current_set=[]
    current_id=None   # id of the group being accumulated; None before first line
    bad_id=-1         # synthetic negative id for lines unknown to the chunkmap
    for line in NLMOutput.__iter__(self):
        try:
            this_lines_set_id=self._chunkmap.pmid_from_block(line.line_id)
        except KeyError:
            logging.warn("Line without chunkmap equivalent. Emitting"
                         " as id %d", bad_id)
            this_lines_set_id=Pmid(bad_id)
        if this_lines_set_id!=current_id:
            # Is this the first invocation? If not, we have to emit the
            # linelist that just ended, but if it is we'll just pretend
            # that we did.
            if current_id is not None:
                # Emit the linelist that just ended
                logging.log(ULTRADEBUG, "Completed set of lines %s "
                            "according to the chunkmap. Emitting them.",
                            current_id)
                if current_id<0:
                    # Decrement bad line counter so the next unknown
                    # group receives a distinct synthetic id.
                    bad_id-=1
                yield self._lines_type(current_id, current_set)
            # Start a new, empty linelist
            current_id=this_lines_set_id
            current_set=[]
        current_set.append(line)
    # Is there something left to emit after the iteration's over?
    if len(current_set)>0:
        logging.log(ULTRADEBUG, "Completed iteration. Emitting the last "
                    "lines left with set id %s", current_id)
        yield self._lines_type(current_id, current_set)
    return
def _evaluate(self, gold_standard, seen_terms):
    "Compute SAVCC between two sets of terms"
    # SAVCC: cosine-style similarity between the gold-standard vector
    # and an alpha-blended version of the seen-terms vector.
    logging.debug('Gold standard=%s Seen terms=%s alpha=%1.5f',
                  gold_standard, seen_terms, self._alpha)
    gold_standard_vector=self._my_tree.term_vector(gold_standard)
    seen_vector=self._my_tree.term_vector(seen_terms)
    # This computes [(alpha*I2)+(1-alpha x M)I2]
    modified_term=seen_vector.scale(self._alpha)+\
        self._my_matrix.mult_by_vector(seen_vector).scale(1-self._alpha)
    logging.log(ULTRADEBUG, "Modified term=%r", modified_term)
    # I1 * modified_term
    numerator=gold_standard_vector.dot(modified_term)
    # Denominator of the whole thing
    denominator=gold_standard_vector.length()*modified_term.length()
    try:
        result=numerator/denominator
    except ZeroDivisionError:
        # Either vector may have zero length; report similarity 0.
        logging.warn("ZeroDivisionError when computing SAVCC for %r and %r:",
                     gold_standard, seen_terms)
        result=0
    logging.log(ULTRADEBUG,
                "Numerator=%1.7f Denominator=%1.7f Result=%1.7f",
                numerator, denominator, result)
    return result
def __init__(self, fileobject, transform_function): SavccMatrix.__init__(self, fileobject, transform_function) # Add normalization factors logging.log(ULTRADEBUG, "Initializing normalization array") # Default behavior: no normalization self.normfactors=[1.0]*self._height # Tentative normalization array name array_filename=self._expected_norm_array_name() logging.debug("Trying to load a normalization array from disk. The " "file should be named %s.", array_filename) # Make sure that only one process or thread at a time can attempt to get # the normalization factors _normfactor_lock.acquire() try: try: self._load_normalization_factors(open(array_filename, 'rb')) logging.debug('Normalization factors loaded from disk.') except IOError: logging.debug("Unable to load normalization factors from disk.") self._generate_normalization_factors() # Only save normalization factors if they are not a StringIO # object if not isinstance(fileobject, StringIO.StringIO): logging.debug("Saving normalization factors to %s", array_filename) try: self._save_normalization_factors(open(array_filename, 'wb')) except IOError: logging.warn("Unable to save the normalization array. " "It will have to be regenerated each " "time.") finally: _normfactor_lock.release()
def start_conversion(self):
    """start_conversion: Begin the conversion process by cleaning up
    the internal state of the converter."""
    # Warn when leftover checktags were never collected by a caller.
    if self._extra_checktags:
        logging.warn("Cleaning up _extra_checktags, but there was content"
                     " there. Someone didn't retrieve it.")
    self._extra_checktags = set()
def start_conversion(self):
    """start_conversion: Begin the conversion process by cleaning up
    the internal state of the converter."""
    leftovers = len(self._extra_checktags)
    if leftovers > 0:
        # A previous conversion left tags behind that nobody retrieved.
        logging.warn("Cleaning up _extra_checktags, but there was content"
                     " there. Someone didn't retrieve it.")
    self._extra_checktags = set()
def run(self):
    """Perform the evaluation"""
    # Iterates over every article from self._reader, graphs and ranks
    # it, converts the ranked terms, and evaluates them against the
    # article's MEDLINE MeSH headings (full set and major headings).
    # Per-article results are keyed by set_id and written out at the end.
    logging.info("Starting workflow %r run", self)
    all_results = {}
    evaluator = self.create_evaluator()
    count = 0
    for each_article in self._reader:
        count += 1
        logging.info("Working on article %d: %r", count, each_article)
        if not self.include_article(each_article):
            logging.log(
                ULTRADEBUG, "Skipping article %r due to exclusion "
                " criteria.", each_article)
            continue
        try:
            ranked_article = self.graph_and_rank(each_article)
        except CouldNotRank:
            # Unrankable articles are skipped without a result entry.
            continue
        converted_terms = self.convert(ranked_article)
        cut_terms = converted_terms.terms_higher_than_or_equal_to(
            self._ranking_cutoff)
        # NOTE(review): the [-1]/[0] indexing below raises IndexError
        # when no term reaches the cutoff -- confirm cut_terms is
        # always non-empty here.
        logging.debug(
            "Lowest-ranking term is term #%d out of %d"
            " (score=%1.5f, highest score=%1.5f)",
            len(cut_terms), len(converted_terms),
            [x[1] for x in cut_terms][-1],
            [x[1] for x in cut_terms][0])
        medline_record_mesh_terms = ExpressionList().from_medline(
            each_article.set_id.article_record().mesh_headings)
        flat_medline = medline_record_mesh_terms.flatten()
        flattened_terms = self.flatten_generated_terms(
            flat_medline, cut_terms)
        flattened_terms = self.limit_length(flat_medline, flattened_terms)
        if len(flat_medline) == 0:
            # Without gold-standard headings no evaluation is possible.
            logging.warn("No gold standard available for article %r. "
                         "Omitting it from the result set.", each_article)
            continue
        eval_result = self.perform_evaluation(each_article, evaluator,
                                              flat_medline,
                                              flattened_terms)
        flattened_major_headings=\
            medline_record_mesh_terms.major_headings()
        logging.debug("Original headings: %r Major headings: %r",
                      medline_record_mesh_terms,
                      flattened_major_headings)
        # Second evaluation pass against the major headings only.
        mh_result_temp = self.perform_evaluation(each_article, evaluator,
                                                 flattened_major_headings,
                                                 flattened_terms)
        mh_result = NamedResultSet("mh_", mh_result_temp)
        # Compute the total recall, too
        total_recall = self.compute_total_recall(flat_medline,
                                                 converted_terms)
        eval_result.add(total_recall)
        # Unify the result sets
        all_results[each_article.set_id] = eval_result | mh_result
    logging.info("Writing out results.")
    self.output(all_results)
    self.output_metadata()
    return
def run(self):
    """Perform the evaluation.

    Iterates over every article from self._reader, graphs and ranks it,
    converts the ranked output, and evaluates it against the article's
    MEDLINE MeSH headings (both the full set and the major headings).
    Results are accumulated per set_id and written out at the end.
    """
    logging.info("Starting workflow %r run", self)
    all_results={}
    evaluator=self.create_evaluator()
    count=0
    for each_article in self._reader:
        count+=1
        logging.info("Working on article %d: %r", count, each_article)
        if not self.include_article(each_article):
            logging.log(ULTRADEBUG, "Skipping article %r due to exclusion "
                        " criteria.", each_article)
            continue
        try:
            ranked_article=self.graph_and_rank(each_article)
        except CouldNotRank:
            # Unrankable articles are skipped without a result entry.
            continue
        converted_terms=self.convert(ranked_article)
        cut_terms=converted_terms.terms_higher_than_or_equal_to(
            self._ranking_cutoff)
        # BUGFIX: guard the score logging -- indexing [-1]/[0] on an
        # empty cut_terms list used to raise IndexError when no term
        # reached the ranking cutoff.
        if len(cut_terms)>0:
            logging.debug("Lowest-ranking term is term #%d out of %d"
                          " (score=%1.5f, highest score=%1.5f)",
                          len(cut_terms), len(converted_terms),
                          [x[1] for x in cut_terms][-1],
                          [x[1] for x in cut_terms][0])
        else:
            logging.debug("No terms at or above the ranking cutoff for "
                          "article %r", each_article)
        medline_record_mesh_terms=ExpressionList().from_medline(
            each_article.set_id.article_record().mesh_headings)
        flat_medline=medline_record_mesh_terms.flatten()
        flattened_terms=self.flatten_generated_terms(flat_medline,
                                                     cut_terms)
        flattened_terms=self.limit_length(flat_medline, flattened_terms)
        if len(flat_medline)==0:
            # Without gold-standard headings no evaluation is possible.
            logging.warn("No gold standard available for article %r. "
                         "Omitting it from the result set.", each_article)
            continue
        eval_result=self.perform_evaluation(each_article, evaluator,
                                            flat_medline, flattened_terms)
        flattened_major_headings=\
            medline_record_mesh_terms.major_headings()
        logging.debug("Original headings: %r Major headings: %r",
                      medline_record_mesh_terms,
                      flattened_major_headings)
        # Second evaluation pass against the major headings only.
        mh_result_temp=self.perform_evaluation(each_article, evaluator,
                                               flattened_major_headings,
                                               flattened_terms)
        mh_result=NamedResultSet("mh_", mh_result_temp)
        # Compute the total recall, too
        total_recall=self.compute_total_recall(flat_medline,
                                               converted_terms)
        eval_result.add(total_recall)
        # Unify the result sets
        all_results[each_article.set_id]=eval_result | mh_result
    logging.info("Writing out results.")
    self.output(all_results)
    self.output_metadata()
    return
def processor(workflow_class, graph_builder_constructor, graph_builder_params,
              ranker_constructor, ranker_params, eval_parameters,
              ranking_cutoff, mesh_tree_filename, distance_matrix_filename,
              distance_function, umls_converter_data_filename,
              extra_data_name, extra_data_contents, my_input_queue,
              my_output_queue, my_own_name=None):
    # Worker-process entry point: builds a workflow instance, then
    # serves article-processing requests from my_input_queue until the
    # 'STOP' sentinel arrives, pushing results onto my_output_queue.
    logging.info("Setting up worker.")
    if my_own_name is not None:
        # Rename the OS process so it's identifiable in ps/top output.
        proctitle.setproctitle(my_own_name)
    my_workflow=workflow_class(graph_builder_constructor,
                               graph_builder_params,
                               ranker_constructor,
                               ranker_params,
                               eval_parameters,
                               ranking_cutoff,
                               mesh_tree_filename,
                               distance_matrix_filename,
                               distance_function,
                               umls_converter_data_filename
                               )
    if extra_data_name is not None:
        # Attach optional extra payload under the caller-chosen name.
        my_workflow.__setattr__(extra_data_name, extra_data_contents)
    logging.info("Finished setting up worker process. Waiting for requests.")
    try:
        while True:
            request=my_input_queue.get()
            logging.log(ULTRADEBUG, "Processing request %r", request)
            if request=='STOP':
                logging.log(ULTRADEBUG, "Received stop request.")
                break
            try:
                my_workflow.process_article(request)
                # Recover the article, push it on the output queue
                my_output_queue.put(my_workflow.all_results)
                # Clear the output queue
                my_workflow.all_results={}
            except CouldNotRank:
                #my_input_queue.put(request) # On error, push the task
                                             # back into the queue
                logging.info("Skipping unrankable article.")
            except:
                # Unexpected failure: log the traceback, then re-raise
                # so the worker dies loudly instead of hanging.
                logging.warn("EXCEPTION RAISED: \n%s",
                             traceback.format_exc())
                raise
    finally:
        logging.log(ULTRADEBUG, "Returning results to caller.")
    logging.log(ULTRADEBUG, "Ending processor execution.")
    return
def _create_graph(self, list_of_lines):
    """Build a graph by generating a relationship for every pair of
    co-occurring nodes. We take advantage of the fact that (for our
    purposes) all lines with the same line_id in METAMAP output come
    from the same sentence."""
    new_graph = self._type_of_graph_to_build()
    logging.debug("Retrieving semantic predications for %r",
                  self._line_set_id)
    try:
        predications = get_predications(self._line_set_id)
    except:
        # Best-effort: without predications we return the empty graph.
        logging.warn(
            "No predications for %r: an exception was raised.\n%s",
            self._line_set_id, traceback.format_exc())
        return new_graph
    logging.log(ULTRADEBUG,
                "Building a METAMAP co-occurrence graph from %r",
                list_of_lines)
    for sentence in self.sentence_iterator(list_of_lines):
        # Each "sentence" contains a set of potential nodes that need
        # screening.
        nodes = []
        for concept in sentence:
            new_node = self._node_factory(concept.CUI, concept.description,
                                          concept.confidence, concept.line)
            if self.include_node(new_node):
                #nodes.append((concept.CUI, concept.confidence))
                nodes.append(new_node)
            else:
                logging.log(ULTRADEBUG, "%r excluded from the graph.",
                            new_node)
        # Once we have all the nodes in a sentence, we generate all
        # possible combinations (O(n^2)), yes it's ugly.
        for i in xrange(len(nodes)):
            # Since relationships are not directional we can skip half
            # of the generation (i.e. if we have i<-->j we don't need
            # j<-->i
            for j in xrange(i + 1, len(nodes)):
                node1, node2 = nodes[i], nodes[j]
                #new_link=AdirectionalLink(node1[0], node2[0],
                #                          (node1[1]+node2[1])/2.0)
                # Is there a predication?
                try:
                    this_link = predications[(node1, node2)]
                except KeyError:
                    # No predication links this pair; skip it.
                    continue
                new_link = self._adirectional_link_factory(
                    node1, node2, this_link.weight)
                if self.include_link(new_link):
                    new_graph.add_relationship(new_link)
                else:
                    logging.log(ULTRADEBUG,
                                "Excluding link %r from the graph",
                                new_link)
    return new_graph
def _create_graph(self, list_of_lines):
    """Build a co-occurrence graph from METAMAP output lines.

    Every pair of screened nodes within the same sentence that has a
    semantic predication becomes an adirectional relationship. Lines
    sharing a line_id are assumed to come from the same sentence.
    """
    graph = self._type_of_graph_to_build()
    logging.debug("Retrieving semantic predications for %r",
                  self._line_set_id)
    try:
        predications = get_predications(self._line_set_id)
    except:
        # Best-effort: without predications, return the empty graph.
        logging.warn("No predications for %r: an exception was raised.\n%s",
                     self._line_set_id, traceback.format_exc())
        return graph
    logging.log(ULTRADEBUG,
                "Building a METAMAP co-occurrence graph from %r",
                list_of_lines)
    for sentence in self.sentence_iterator(list_of_lines):
        # Screen each concept in the sentence before it becomes a node.
        accepted = []
        for concept in sentence:
            candidate = self._node_factory(concept.CUI,
                                           concept.description,
                                           concept.confidence,
                                           concept.line)
            if not self.include_node(candidate):
                logging.log(ULTRADEBUG, "%r excluded from the graph.",
                            candidate)
                continue
            accepted.append(candidate)
        # Pair up every two distinct accepted nodes; links are
        # adirectional, so each unordered pair is generated once.
        for first in xrange(len(accepted)):
            for second in xrange(first + 1, len(accepted)):
                left, right = accepted[first], accepted[second]
                try:
                    predication = predications[(left, right)]
                except KeyError:
                    # No predication connects this pair; skip it.
                    continue
                link = self._adirectional_link_factory(left, right,
                                                       predication.weight)
                if self.include_link(link):
                    graph.add_relationship(link)
                else:
                    logging.log(ULTRADEBUG,
                                "Excluding link %r from the graph",
                                link)
    return graph
def term_vector(self, list_of_terms):
    """Return a VocabularyVector flagging each term of *list_of_terms*
    that this tree knows about."""
    vector = VocabularyVector(self.num_terms)
    for each_term in list_of_terms:
        try:
            position = self.index(each_term)
        except TermNotInTree:
            # Tolerate (but report) terms missing from the tree.
            logging.warn('Weird: term %r could not be found in %r. It '
                         'should be there.', each_term, self)
            continue
        vector[position] = 1
    return vector
def processor(workflow_class, graph_builder_constructor, graph_builder_params,
              ranker_constructor, ranker_params, eval_parameters,
              ranking_cutoff, mesh_tree_filename, distance_matrix_filename,
              distance_function, umls_converter_data_filename,
              extra_data_name, extra_data_contents, my_input_queue,
              my_output_queue, my_own_name=None):
    # Worker-process entry point: builds a workflow and serves requests
    # from my_input_queue until the 'STOP' sentinel arrives, pushing
    # per-article results onto my_output_queue.
    logging.info("Setting up worker.")
    if my_own_name is not None:
        # Rename the OS process so it's identifiable in ps/top output.
        proctitle.setproctitle(my_own_name)
    my_workflow = workflow_class(graph_builder_constructor,
                                 graph_builder_params, ranker_constructor,
                                 ranker_params, eval_parameters,
                                 ranking_cutoff, mesh_tree_filename,
                                 distance_matrix_filename,
                                 distance_function,
                                 umls_converter_data_filename)
    if extra_data_name is not None:
        # Attach optional extra payload under the caller-chosen name.
        my_workflow.__setattr__(extra_data_name, extra_data_contents)
    logging.info("Finished setting up worker process. Waiting for requests.")
    try:
        while True:
            request = my_input_queue.get()
            logging.log(ULTRADEBUG, "Processing request %r", request)
            if request == 'STOP':
                logging.log(ULTRADEBUG, "Received stop request.")
                break
            try:
                my_workflow.process_article(request)
                # Recover the article, push it on the output queue
                my_output_queue.put(my_workflow.all_results)
                # Clear the output queue
                my_workflow.all_results = {}
            except CouldNotRank:
                #my_input_queue.put(request) # On error, push the task
                                             # back into the queue
                logging.info("Skipping unrankable article.")
            except:
                # Unexpected failure: log the traceback, then re-raise
                # so the worker dies loudly instead of hanging.
                logging.warn("EXCEPTION RAISED: \n%s",
                             traceback.format_exc())
                raise
    finally:
        logging.log(ULTRADEBUG, "Returning results to caller.")
    logging.log(ULTRADEBUG, "Ending processor execution.")
    return
def term_vector(self, list_of_terms):
    """Returns a VocabularyVector representing the list of terms as seen
    by this tree."""
    new_vector=VocabularyVector(self.num_terms)
    for term in list_of_terms:
        try:
            # Flag the vocabulary slot of each known term with a 1.
            new_vector[self.index(term)]=1
        except TermNotInTree:
            # Tolerate (but report) terms missing from the tree.
            logging.warn('Weird: term %r could not be found in %r. It '
                         'should be there.', term, self)
    return new_vector
def process_article(self, each_article):
    # Graph, rank, convert and evaluate a single article against its
    # MEDLINE MeSH headings; results accumulate in self.all_results
    # keyed by the article's set_id. Returns None in every case.
    if not self.include_article(each_article):
        logging.log(ULTRADEBUG, "Skipping article %r due to exclusion "
                    " criteria.", each_article)
        return
    try:
        ranked_article=self.graph_and_rank(each_article)
    except CouldNotRank:
        # Unrankable articles are skipped without a result entry.
        return
    logging.debug("Ranked article: %r", ranked_article)
    converted_terms=self.convert(ranked_article)
    logging.debug("Converted terms: %r", converted_terms)
    cut_terms=converted_terms.terms_higher_than_or_equal_to(
        self._ranking_cutoff)
    logging.debug("Cut terms: %r", cut_terms)
    try:
        medline_record_mesh_terms=ExpressionList().from_medline(
            each_article.set_id.article_record()['MH'])
    except:
        # No retrievable record (or no 'MH' field): skip the article.
        logging.warn("Could not obtain an article record for %r. "
                     "Skipping.", each_article)
        return
    flat_medline=medline_record_mesh_terms.flatten()
    flattened_terms=self.flatten_generated_terms(flat_medline,
                                                 cut_terms)
    flattened_terms=self.limit_length(flat_medline, flattened_terms)
    if len(flat_medline)==0:
        # Without gold-standard headings no evaluation is possible.
        logging.warn("No gold standard available for article %r. "
                     "Omitting it from the result set.", each_article)
        return
    eval_result=self.perform_evaluation(each_article, self.evaluator,
                                        flat_medline, flattened_terms)
    flattened_major_headings=\
        medline_record_mesh_terms.major_headings()
    #logging.debug("Original headings: %r Major headings: %r",
    #              medline_record_mesh_terms,
    #              flattened_major_headings)
    logging.debug("Flattened MeSH terms: %r", flat_medline)
    logging.debug("Flattened generated terms: %r", flattened_terms)
    # Second evaluation pass against the major headings only.
    mh_result_temp=self.perform_evaluation(each_article, self.evaluator,
                                           flattened_major_headings,
                                           flattened_terms)
    mh_result=NamedResultSet("major_", mh_result_temp)
    # Compute the total recall, too
    total_recall=self.compute_total_recall(flat_medline, converted_terms)
    eval_result.add(total_recall)
    # Unify the result sets
    self.all_results[each_article.set_id]=eval_result | mh_result
    return
def build_idf_from_file(self, file_reader, default_score=None):
    """Build the inverse-document-frequency dictionary from a reader.

    Tries a cache file first; otherwise counts, for each CUI, the number
    of articles it occurs in, stores log(count/df)+1.0 per CUI, and
    writes the cache for next time.
    NOTE(review): count starts at 1 and is incremented once per kept
    article, so the IDF numerator ends up N+1 -- presumably deliberate
    smoothing; confirm before changing.
    NOTE(review): default_score is only referenced by the commented-out
    confidence-weighting variant below.
    """
    tempdict = {}
    logging.info("Building the term frequency dictionary")
    count = 1
    logging.debug("Checking for a cache file, and loading from it.")
    try:
        self.populate_from_cache(
            self.cache_file_name(file_reader.original_file.name))
        logging.info("Loaded from cache. It's not necessary to build.")
        return
    except:
        # Any cache failure simply means we rebuild from scratch.
        logging.debug("Nope. Proceeding with building the dictionary.")
    for article in file_reader:
        logging.debug(
            "Processing article %r (number %d) for the term"
            " frequency dictionary", article, count)
        if article.set_id.pmid < 0:
            logging.warn("Article with unknown PubMed ID - skipping")
            continue
        count += 1
        tempcounts = {}
        for line in article.lines:
            try:
                this_cui = line.CUI
            except AttributeError:
                # Not every line carries a CUI; ignore those.
                continue
            # Use the confidence as the score if no default is specified
            #if default_score is None:
            #    try:
            #        this_score=line.confidence
            #    except AttributeError:
            #        continue
            #else:
            #    this_score=default_score
            #tempdict[this_cui]=tempdict.get(this_cui, 0.0)+this_score
            tempcounts[this_cui] = 1
        # Now have all the CUIs that appeared in the article. Update
        # the total counts.
        for k in tempcounts:
            tempdict[k] = tempdict.get(k, 0) + 1
    logging.debug("Built a dictionary with %d items. Computing IDFs.",
                  len(tempdict))
    # max_value=max(tempdict.itervalues())
    #logging.debug("Saving it to permanent storage.")
    for k, v in tempdict.iteritems():
        self[k] = math.log(count / float(v)) + 1.0
    logging.info("Done building the dictionary. Dumping it to a cache "
                 "file.")
    self.dump_to_cache(self.cache_file_name(
        file_reader.original_file.name))
    return
def from_graphml_file(self, file_object, default_link=Link):
    # Populate this graph from a GraphML stream: first harvest the
    # <key> declarations to find attribute ids, then build nodes and
    # edges as their elements are parsed.
    from xml.etree.ElementTree import iterparse

    def get_subelement_data(elem, key):
        # Return the text of elem's <data> child whose key attribute
        # matches, or None when absent.
        result = [
            x.text for x in elem.getiterator()
            if x.tag == "{http://graphml.graphdrawing.org/xmlns}data"
            and x.get('key') == key
        ]
        if len(result) == 0:
            return None
        return result[0]

    nodes = {}
    # Discover the names of the attributes we're looking for by investigating the keys
    # Then actually read the file
    keystore = {}
    for event, element in iterparse(file_object):
        #print element
        if element.tag == "{http://graphml.graphdrawing.org/xmlns}key":
            if element.get('attr.name') is None:
                continue
            # Index attribute ids as '<for>.<attr.name>', e.g.
            # 'node.description' or 'edge.weight'.
            keystore[element.get('for') + '.' +
                     element.get('attr.name')] = element.get('id')
            # print keystore
        if element.tag == "{http://graphml.graphdrawing.org/xmlns}node":
            # The next line supports yEd's NodeLabel and Profuse's label
            nodename = get_subelement_data(element,
                                           keystore['node.description'])
            if nodename is None:
                nodename = "NoName"
            nodekey = get_subelement_data(element, keystore['node.MR_id'])
            nodes[element.get('id')] = Node(nodekey, nodename, 1.0)
        if element.tag == "{http://graphml.graphdrawing.org/xmlns}edge":
            n1 = nodes[element.get('source')]
            n2 = nodes[element.get('target')]
            try:
                weight = float(
                    get_subelement_data(element, keystore['edge.weight']))
            except:
                # Missing/unparsable weight: fall back to 1.0.
                logging.warn('Failed at reading weight because of:\n%s',
                             traceback.format_exc())
                weight = 1.0
            try:
                relname = get_subelement_data(element,
                                              keystore['edge.description'])
            except:
                relname = ""
            self.add_relationship(default_link(n1, n2, weight, relname))
    self.consolidate_graph()
    return
def iter_concepts(self): """Iterates through the concepts (only one per position) so that they can be extracted in order. We will get the first concept that covers each positional 'slot' in the original. """ #concepts_iter=MappingLine.ev_parser.finditer(self.line) #concept_slots={} #for concept in concepts_iter: # concept=concept.groupdict() # positions=MappingLine.position_extractor.findall( # concept['match_positions']) # this_pos=int(positions[0]) # covered_pos=reduce(operator.or_, [int(x) in concept_slots for x in # positions]) # if covered_pos in concept_slots: # continue # concept_slots[this_pos]=ConceptLine(concept['cui'], # concept['preferred_concept_name'], # -int(concept['candidate_score'])) # # Fill in the rest of the slots covered by this concept # for each_slot in positions[1:]: # concept_slots[int(each_slot)]=None # #ordered_slots=concept_slots.keys() #ordered_slots.sort() #for slot in ordered_slots: # if concept_slots[slot] is not None: # yield concept_slots[slot] #return try: all_mappings=mappings.parseString(self.line)[0] except: logging.warn("FAIL parsing %s", self.line) raise if len(all_mappings)==0: return # Get the mapping with the best score. If all have the same score, # uses the first one. best_mapping=all_mappings[0]['Expression'][0] best_mapping_score=all_mappings[0]['Score'] for m in all_mappings[1:]: if m['Score']>best_mapping_score: best_mapping_score=m['Score'] best_mapping=m['Expression'][0] # The EVs are in order for e in best_mapping: new_concept=ConceptLine(e['ConceptID'], e['Name'], int(e['Score'])) logging.debug("Emitting %r", new_concept) yield new_concept return
def build_idf_from_file(self, file_reader, default_score=None):
    # Build the inverse-document-frequency dictionary: try the cache
    # first, otherwise count per-CUI document frequencies and store
    # log(count/df)+1.0, then dump the cache.
    # NOTE(review): count starts at 1 and ends at N+1 -- presumably
    # deliberate smoothing; confirm before changing.
    tempdict={}
    logging.info("Building the term frequency dictionary")
    count=1
    logging.debug("Checking for a cache file, and loading from it.")
    try:
        self.populate_from_cache(
            self.cache_file_name(file_reader.original_file.name))
        logging.info("Loaded from cache. It's not necessary to build.")
        return
    except:
        # Any cache failure simply means we rebuild from scratch.
        logging.debug("Nope. Proceeding with building the dictionary.")
    for article in file_reader:
        logging.debug("Processing article %r (number %d) for the term"
                      " frequency dictionary", article, count)
        if article.set_id.pmid < 0:
            logging.warn("Article with unknown PubMed ID - skipping")
            continue
        count+=1
        tempcounts={}
        for line in article.lines:
            try:
                this_cui=line.CUI
            except AttributeError:
                # Not every line carries a CUI; ignore those.
                continue
            # Use the confidence as the score if no default is specified
            #if default_score is None:
            #    try:
            #        this_score=line.confidence
            #    except AttributeError:
            #        continue
            #else:
            #    this_score=default_score
            #tempdict[this_cui]=tempdict.get(this_cui, 0.0)+this_score
            tempcounts[this_cui]=1
        # Now have all the CUIs that appeared in the article. Update
        # the total counts.
        for k in tempcounts:
            tempdict[k]=tempdict.get(k, 0)+1
    logging.debug("Built a dictionary with %d items. Computing IDFs.",
                  len(tempdict))
    # max_value=max(tempdict.itervalues())
    #logging.debug("Saving it to permanent storage.")
    for k, v in tempdict.iteritems():
        self[k]=math.log(count/float(v))+1.0
    logging.info("Done building the dictionary. Dumping it to a cache "
                 "file.")
    self.dump_to_cache(
        self.cache_file_name(file_reader.original_file.name))
    return
def process_article(self, each_article):
    # Graph, rank, convert and evaluate one article against its MEDLINE
    # MeSH headings; results accumulate in self.all_results keyed by
    # the article's set_id. Returns None in every case.
    if not self.include_article(each_article):
        logging.log(ULTRADEBUG, "Skipping article %r due to exclusion "
                    " criteria.", each_article)
        return
    try:
        ranked_article = self.graph_and_rank(each_article)
    except CouldNotRank:
        # Unrankable articles are skipped without a result entry.
        return
    logging.debug("Ranked article: %r", ranked_article)
    converted_terms = self.convert(ranked_article)
    logging.debug("Converted terms: %r", converted_terms)
    cut_terms = converted_terms.terms_higher_than_or_equal_to(
        self._ranking_cutoff)
    logging.debug("Cut terms: %r", cut_terms)
    try:
        medline_record_mesh_terms = ExpressionList().from_medline(
            each_article.set_id.article_record()['MH'])
    except:
        # No retrievable record (or no 'MH' field): skip the article.
        logging.warn(
            "Could not obtain an article record for %r. "
            "Skipping.", each_article)
        return
    flat_medline = medline_record_mesh_terms.flatten()
    flattened_terms = self.flatten_generated_terms(flat_medline,
                                                   cut_terms)
    flattened_terms = self.limit_length(flat_medline, flattened_terms)
    if len(flat_medline) == 0:
        # Without gold-standard headings no evaluation is possible.
        logging.warn(
            "No gold standard available for article %r. "
            "Omitting it from the result set.", each_article)
        return
    eval_result = self.perform_evaluation(each_article, self.evaluator,
                                          flat_medline, flattened_terms)
    flattened_major_headings=\
        medline_record_mesh_terms.major_headings()
    #logging.debug("Original headings: %r Major headings: %r",
    #              medline_record_mesh_terms,
    #              flattened_major_headings)
    logging.debug("Flattened MeSH terms: %r", flat_medline)
    logging.debug("Flattened generated terms: %r", flattened_terms)
    # Second evaluation pass against the major headings only.
    mh_result_temp = self.perform_evaluation(each_article,
                                             self.evaluator,
                                             flattened_major_headings,
                                             flattened_terms)
    mh_result = NamedResultSet("major_", mh_result_temp)
    # Compute the total recall, too
    total_recall = self.compute_total_recall(flat_medline,
                                             converted_terms)
    eval_result.add(total_recall)
    # Unify the result sets
    self.all_results[each_article.set_id] = eval_result | mh_result
    return
def iter_concepts(self):
    """Iterates through the concepts (only one per position) so that
    they can be extracted in order. We will get the first concept that
    covers each positional 'slot' in the original.

    Parses self.line with the `mappings` grammar, picks the
    best-scoring mapping (the first one wins ties), and yields its
    ConceptLine objects in order. Re-raises the parser's exception if
    the line cannot be parsed. (A retired regex-based implementation
    that used to live here has been removed.)
    """
    try:
        all_mappings = mappings.parseString(self.line)[0]
    except:
        # Log the offending line before propagating the parser error.
        logging.warn("FAIL parsing %s", self.line)
        raise
    if len(all_mappings) == 0:
        return
    # Get the mapping with the best score. If all have the same score,
    # uses the first one.
    best_mapping = all_mappings[0]['Expression'][0]
    best_mapping_score = all_mappings[0]['Score']
    for m in all_mappings[1:]:
        if m['Score'] > best_mapping_score:
            best_mapping_score = m['Score']
            best_mapping = m['Expression'][0]
    # The EVs are in order
    for e in best_mapping:
        new_concept = ConceptLine(e['ConceptID'], e['Name'],
                                  int(e['Score']))
        logging.debug("Emitting %r", new_concept)
        yield new_concept
    return
def freeze(self):
    """Dumps the configuration to special keys so that the DBDict state
    can be replicated later (i.e. for persistence). This procedure is
    performed every WRITE_EVERY writes, but COUNTER_KEY is kept updated
    continuously."""
    if self.my_mode == "r":
        # Read-only stores are never written to.
        return
    #self.my_lock.acquire()
    store = self.my_store
    try:
        store[SYNC_KEY] = str(self.sync_every)
        store[COUNTER_KEY] = str(self.write_counter)
        store[WRITE_EVERY_KEY] = str(self.write_every)
        store.sync()
    except:
        # Persisting state is best-effort; failures are only logged.
        logging.warn("ERROR while storing state: %s",
                     traceback.format_exc())
def freeze(self):
    """Dumps the configuration to special keys so that the DBDict state
    can be replicated later (i.e. for persistence). This procedure is
    performed every WRITE_EVERY writes, but COUNTER_KEY is kept updated
    continuously.

    Best-effort: storage failures are logged, never raised.
    """
    if self.my_mode=="r":
        # Read-only stores are never written to.
        return
    #self.my_lock.acquire()
    try:
        self.my_store[SYNC_KEY]=str(self.sync_every)
        self.my_store[COUNTER_KEY]=str(self.write_counter)
        self.my_store[WRITE_EVERY_KEY]=str(self.write_every)
        self.my_store.sync()
    except Exception:
        # BUGFIX: was a bare `except:`, which also swallowed
        # SystemExit/KeyboardInterrupt raised mid-sync. Ordinary
        # storage errors remain best-effort and are only logged.
        logging.warn("ERROR while storing state: %s",
                     traceback.format_exc())
def from_graphml_file(self, file_object, default_link=Link):
    # Populate this graph from a GraphML stream: harvest the <key>
    # declarations to learn attribute ids, then build nodes and edges
    # as their elements stream past.
    from xml.etree.ElementTree import iterparse

    def get_subelement_data(elem, key):
        # Return the text of elem's <data> child whose key attribute
        # matches, or None when absent.
        result=[x.text for x in elem.getiterator()
                if x.tag=="{http://graphml.graphdrawing.org/xmlns}data"
                and x.get('key')==key]
        if len(result)==0:
            return None
        return result[0]

    nodes={}
    # Discover the names of the attributes we're looking for by investigating the keys
    # Then actually read the file
    keystore={}
    for event, element in iterparse(file_object):
        #print element
        if element.tag=="{http://graphml.graphdrawing.org/xmlns}key":
            if element.get('attr.name') is None:
                continue
            # Index attribute ids as '<for>.<attr.name>', e.g.
            # 'node.description' or 'edge.weight'.
            keystore[element.get('for')+'.'+
                     element.get('attr.name')]=element.get('id')
            # print keystore
        if element.tag=="{http://graphml.graphdrawing.org/xmlns}node":
            # The next line supports yEd's NodeLabel and Profuse's label
            nodename=get_subelement_data(element,
                                         keystore['node.description'])
            if nodename is None:
                nodename="NoName"
            nodekey=get_subelement_data(element, keystore['node.MR_id'])
            nodes[element.get('id')]=Node(nodekey, nodename, 1.0)
        if element.tag=="{http://graphml.graphdrawing.org/xmlns}edge":
            n1=nodes[element.get('source')]
            n2=nodes[element.get('target')]
            try:
                weight=float(get_subelement_data(element,
                                                 keystore['edge.weight']))
            except:
                # Missing/unparsable weight: fall back to 1.0.
                logging.warn('Failed at reading weight because of:\n%s',
                             traceback.format_exc())
                weight=1.0
            try:
                relname=get_subelement_data(element,
                                            keystore['edge.description'])
            except:
                relname=""
            self.add_relationship(default_link(n1, n2, weight, relname))
    self.consolidate_graph()
    return
def article_record(self):
    # Return the MEDLINE record for this PMID, consulting the shared
    # cache first. On a miss, fetch a fresh copy when self.__fetch_new
    # allows it; otherwise raise KeyError.
    if Pmid.__article_cache is None:
        # Lazily initialize the class-level record cache.
        Pmid.init_storage()
    fetch = False
    my_record = self.__article_cache.get_record(self.__pmid)
    if my_record is not None:
        return my_record
    else:
        fetch = self.__fetch_new
        logging.warn('Could not read %s from the cache: \n', self.__pmid)
    if fetch:
        my_record = self.__article_cache.fetch_record(self.__pmid)
        if not self.__article_cache.put_record(self.__pmid, my_record):
            # Best-effort write-back; failure is only logged.
            logging.warn("Unable to update the database: \n%r", my_record)
    else:
        raise KeyError("No record for article %r could be found." %
                       self.__pmid)
    return my_record
def article_record(self):
    """Return the MEDLINE record for this PMID, consulting the shared
    cache first and optionally fetching a fresh copy on a miss.

    Raises KeyError when the record is absent and fetching is disabled.
    """
    if Pmid.__article_cache is None:
        # Lazily initialize the class-level record cache.
        Pmid.init_storage()
    my_record=self.__article_cache.get_record(self.__pmid)
    if my_record is not None:
        return my_record
    logging.warn('Could not read %s from the cache: \n', self.__pmid)
    if not self.__fetch_new:
        raise KeyError("No record for article %r could be found." %
                       self.__pmid)
    my_record=self.__article_cache.fetch_record(self.__pmid)
    if not self.__article_cache.put_record(self.__pmid, my_record):
        # Best-effort write-back; failure is only logged.
        logging.warn("Unable to update the database: \n%r", my_record)
    return my_record
def ignore_exception(self, which_exception, on_which_line):
    """Decides whether exceptions during parsing correspond to known
    problems with SEMREP's output, and whether to ignore the
    corresponding lines."""
    # Returns True for known, ignorable parse failures; False tells the
    # caller the exception was unexpected. Exact type checks (not
    # isinstance) are used throughout.
    if type(which_exception) is CUINotFoundError:
        # NOTE(review): this branch %-interpolates eagerly, unlike the
        # lazy logging args used in the branches below.
        logging.log(ULTRADEBUG,
                    "Skipping line '%s' because no CUI could be found "
                    "on it" % on_which_line)
        return True
    if type(which_exception) is NoLineTypeError:
        logging.log(ULTRADEBUG,
                    "Skipping line '%s' because its type could not be "
                    "determined.", on_which_line)
        return True
    if type(which_exception) is NoConfidenceError:
        logging.log(ULTRADEBUG,
                    "Skipping line '%s' because it has no confidence.",
                    on_which_line)
        return True
    if type(which_exception) is UnknownLineTypeError:
        logging.warn("Skipping line '%s' because it has an unknown type",
                     on_which_line)
        return True
    return False
def __iter__(self):
    """Iterate over the underlying NLM output, grouping consecutive
    lines by the set id (PMID) the chunkmap assigns to them, and yield
    one self._lines_type(set_id, lines) object per group.

    Lines with no chunkmap entry are emitted under synthetic negative
    ids, decrementing from -1 so each orphan group gets a distinct id.
    """
    current_set = []
    current_id = None
    bad_id = -1  # next synthetic id for lines without a chunkmap entry
    for line in NLMOutput.__iter__(self):
        try:
            this_lines_set_id = self._chunkmap.pmid_from_block(
                line.line_id)
        except KeyError:
            logging.warn(
                "Line without chunkmap equivalent. Emitting"
                " as id %d", bad_id)
            this_lines_set_id = Pmid(bad_id)
        if this_lines_set_id != current_id:
            # Is this the first invocation? If not, we have to emit the
            # linelist that just ended, but if it is we'll just pretend
            # that we did.
            if current_id is not None:
                # Emit the linelist that just ended
                logging.log(
                    ULTRADEBUG, "Completed set of lines %s "
                    "according to the chunkmap. Emitting them.",
                    current_id)
                if current_id < 0:
                    # Decrement bad line counter so the next orphan
                    # group gets a fresh synthetic id.
                    bad_id -= 1
                yield self._lines_type(current_id, current_set)
            # Start a new, empty linelist
            current_id = this_lines_set_id
            current_set = []
        current_set.append(line)
    # Is there something left to emit after the iteration's over?
    if len(current_set) > 0:
        logging.log(
            ULTRADEBUG, "Completed iteration. Emitting the last "
            "lines left with set id %s", current_id)
        yield self._lines_type(current_id, current_set)
    return
def ignore_exception(self, which_exception, on_which_line):
    """Decides whether exceptions during parsing correspond to known
    problems with SEMREP's output, and whether to ignore the
    corresponding lines.

    Returns True when the line should be skipped, False otherwise.
    """
    exception_type = type(which_exception)
    # Known-ignorable parse problems, each with its debug explanation.
    # Exact type matching is preserved (dict lookup on the type object
    # is identity-based, like the original "type(...) is" tests).
    debug_messages = {
        CUINotFoundError: "Skipping line '%s' because no CUI could be "
                          "found on it",
        NoLineTypeError: "Skipping line '%s' because its type could not "
                         "be determined.",
        NoConfidenceError: "Skipping line '%s' because it has no "
                           "confidence.",
    }
    if exception_type in debug_messages:
        # Fix: the CUINotFoundError message was eagerly %-formatted in
        # the original; all messages now use lazy logging arguments.
        logging.log(ULTRADEBUG, debug_messages[exception_type],
                    on_which_line)
        return True
    if exception_type is UnknownLineTypeError:
        # Unknown line types are noteworthy enough to warn about.
        logging.warn("Skipping line '%s' because it has an unknown type",
                     on_which_line)
        return True
    # Anything else is unexpected; let the caller handle it.
    return False
def __init__(self, fileobject, transform_function):
    """Build the matrix and attach normalization factors.

    The factors are loaded from a companion file on disk when one
    exists; otherwise they are generated and (for disk-backed matrices)
    saved for future runs. Defaults to no normalization (all 1.0).
    """
    SavccMatrix.__init__(self, fileobject, transform_function)
    # Add normalization factors
    logging.log(ULTRADEBUG, "Initializing normalization array")
    # Default behavior: no normalization
    self.normfactors = [1.0] * self._height
    # Tentative normalization array name
    array_filename = self._expected_norm_array_name()
    logging.debug(
        "Trying to load a normalization array from disk. The "
        "file should be named %s.", array_filename)
    # Make sure that only one process or thread at a time can attempt
    # to get the normalization factors.
    with _normfactor_lock:
        try:
            # Fix: file handles were opened and never closed; "with"
            # guarantees they are released even if loading fails.
            with open(array_filename, 'rb') as norm_file:
                self._load_normalization_factors(norm_file)
            logging.debug('Normalization factors loaded from disk.')
        except IOError:
            logging.debug(
                "Unable to load normalization factors from disk.")
            self._generate_normalization_factors()
            # Only save normalization factors if they are not a StringIO
            # object (those have no on-disk identity to save next to).
            if not isinstance(fileobject, StringIO.StringIO):
                logging.debug("Saving normalization factors to %s",
                              array_filename)
                try:
                    with open(array_filename, 'wb') as norm_file:
                        self._save_normalization_factors(norm_file)
                except IOError:
                    logging.warn("Unable to save the normalization array. "
                                 "It will have to be regenerated each "
                                 "time.")
import os.path import sys from MEDRank.computation.base_matrix import Matrix from ctypes import cdll, CDLL, byref # Disable warnings about spaces before operators (they drive me crazy) # pylint: disable-msg=C0322 try: LIBRARY_LOCATION = os.path.join(sys.exec_prefix, 'lib', 'python' + sys.version[:3], 'site-packages', 'MEDRank', 'computation', '_distmat.so') cdll.LoadLibrary(LIBRARY_LOCATION) DISTLIB = CDLL(LIBRARY_LOCATION) except: DISTLIB = None logging.warn("_distmat.so is not available; attempts to compute graph " "metrics will result in an exception.") class DistanceMatrix(object): """Represents a distance matrix, in which each C[i, j] encodes the distance from i to j in a graph. Pass the value you plan on using as an unreachable distance to the constructor. If you omit it, it will default to the link matrix's size (reasonable in most cases). The distance matrix is meant to compute stats on, so it's immutable by design. """ def __init__(self, a_link_matrix, unreachable_distance=None): self._matrix = Matrix(len(a_link_matrix)) if unreachable_distance is None:
def output(output_file, result_queue, headers_callback=output_headers,
           item_callback=output_one_item, initial_result_set_size=100):
    """Actually dumps the result set to output. Override for easy output
    customization.

    output_file: writable file(-like) object the results go to.
    result_queue: queue delivering {pmid: result} dicts; the string
                  'STOP' marks the end of the stream.
    headers_callback: called once as headers_callback(output_file,
                      column_names); pass None to skip the header row.
    item_callback: called as item_callback(output_file, pmid, result,
                   column_names) for every result.
    initial_result_set_size: number of results gathered up front so the
                             union of all column names can be computed
                             before the header row is written.
    """
    result_set = {}
    proctitle.setproctitle("MEDRank-output-processor")
    stop_requested = False
    # Gather a few values first to discover the columns.
    logging.log(ULTRADEBUG, "Gathering values for initial analysis.")
    for i in xrange(initial_result_set_size):
        logging.log(ULTRADEBUG, "Getting results %d.", i)
        try:
            request = result_queue.get()
            if request == 'STOP':
                stop_requested = True
                break
            result_set.update(request)
        except KeyboardInterrupt:
            return
        except Exception:
            # Fix: was a bare "except:", which also swallowed
            # SystemExit; KeyboardInterrupt is still handled above.
            logging.warn("EXCEPTION RAISED: \n%s", traceback.format_exc())
    logging.log(ULTRADEBUG, "Values gathered. Computing columns.")
    column_names = set([])
    # Add the colnames to the csv
    if headers_callback is not None:
        for result in result_set.itervalues():
            column_names |= result.columns()
        # Create a writer
        column_names = ['pmid'] + [x for x in column_names]
        headers_callback(output_file, column_names)
    # NOTE(review): when headers_callback is None, item_callback still
    # receives the empty column_names set — confirm callers expect this.
    logging.log(ULTRADEBUG, "Looping to get more results and output them.")
    while True:
        if not stop_requested:
            try:
                request = result_queue.get()
                if request == 'STOP':
                    stop_requested = True
                else:
                    result_set.update(request)
            except KeyboardInterrupt:
                return
            except Exception:
                logging.warn("EXCEPTION RAISED: \n%s",
                             traceback.format_exc())
        if stop_requested and len(result_set) == 0:
            break
        if len(result_set) == 0:
            continue  # It can happen! We might get no results, or an empty set.
        pmid = result_set.keys()[0]
        logging.log(ULTRADEBUG, "Output: article %r.", pmid)
        result = result_set[pmid]
        item_callback(output_file, pmid, result, column_names)
        del result_set[pmid]
        try:
            output_file.flush()
        except Exception:
            logging.warn("The output file object does not support "
                         "flushing.")
        try:
            os.fsync(output_file.fileno())
        except Exception:
            logging.warn("Could not fsync the output file. Traceback "
                         "follows.\n%s", traceback.format_exc())
    return
import os.path import sys from MEDRank.computation.base_matrix import Matrix from ctypes import cdll, CDLL, byref # Disable warnings about spaces before operators (they drive me crazy) # pylint: disable-msg=C0322 try: LIBRARY_LOCATION=os.path.join(sys.exec_prefix, 'lib', 'python'+sys.version[:3], 'site-packages', 'MEDRank', 'computation', '_distmat.so') cdll.LoadLibrary(LIBRARY_LOCATION) DISTLIB=CDLL(LIBRARY_LOCATION) except: DISTLIB=None logging.warn("_distmat.so is not available; attempts to compute graph " "metrics will result in an exception.") class DistanceMatrix(object): """Represents a distance matrix, in which each C[i, j] encodes the distance from i to j in a graph. Pass the value you plan on using as an unreachable distance to the constructor. If you omit it, it will default to the link matrix's size (reasonable in most cases). The distance matrix is meant to compute stats on, so it's immutable by design. """ def __init__(self, a_link_matrix, unreachable_distance=None): self._matrix=Matrix(len(a_link_matrix)) if unreachable_distance is None: unreachable_distance=len(a_link_matrix)