def _evaluate(self, gold_standard, seen_terms):
    """Compute the SAVCC similarity between two sets of terms."""
    logging.debug('Gold standard=%s Seen terms=%s alpha=%1.5f',
                  gold_standard, seen_terms, self._alpha)
    gold_vector = self._my_tree.term_vector(gold_standard)
    seen_vector = self._my_tree.term_vector(seen_terms)
    # Build [(alpha*I2)+(1-alpha x M)I2]
    modified_term = (seen_vector.scale(self._alpha) +
                     self._my_matrix.mult_by_vector(seen_vector).scale(
                         1 - self._alpha))
    logging.log(ULTRADEBUG, "Modified term=%r", modified_term)
    # I1 * modified_term
    numerator = gold_vector.dot(modified_term)
    # The product of the vector lengths normalizes the dot product.
    denominator = gold_vector.length() * modified_term.length()
    try:
        result = numerator / denominator
    except ZeroDivisionError:
        logging.warn("ZeroDivisionError when computing SAVCC for %r and %r:",
                     gold_standard, seen_terms)
        result = 0
    logging.log(ULTRADEBUG,
                "Numerator=%1.7f Denominator=%1.7f Result=%1.7f",
                numerator, denominator, result)
    return result
def sentence_iterator(self, list_of_lines):
    """Yield successive groups of lines that share the same line_id.

    When the iteration finishes, the number of sentences emitted is
    recorded in the graph builder's measurements."""
    group = []
    group_id = None
    emitted = 0
    for each_line in list_of_lines:
        if each_line.line_id != group_id:
            # A new id closes the previous sentence — unless this is
            # the very first line we see.
            if group_id is not None:
                logging.log(ULTRADEBUG,
                            "Emitting sentence %s with %d terms",
                            group_id, len(group))
                emitted += 1
                yield group
            group_id = each_line.line_id
            group = []
        group.append(each_line)
    # Emit whatever remains after the loop.
    if len(group) > 0:
        logging.log(ULTRADEBUG,
                    "Emitting last sentence %s with %d terms",
                    group_id, len(group))
        emitted += 1
        yield group
    # Iteration ended - record the measurement
    self._measurements.add(ArticleSentenceCount(emitted))
    return
def __init__(self, original_line,
             cui_position=6,  # Support for different SEMREP output formats
             description_position=7,
             semantic_type_position=8):
    """Parse a SEMREP entity line: CUI, description, semantic type and
    confidence."""
    SemrepLine.__init__(self, original_line)
    try:
        self._cui = self.split_line[cui_position]
    except IndexError:
        raise CUINotFoundError("There was no CUI in the line '%s'" %
                               self._line)
    if self._cui == '':
        raise CUINotFoundError("There was no CUI in the line '%s'" %
                               self._line)
    try:
        self._description = self.split_line[description_position]
        self._semantic_type = self.split_line[semantic_type_position]
    except IndexError:
        raise ParsingError("Data missing from line '%s'" % self._line)
    # Some entities have no stated confidence. We use 0 in such cases,
    # so they can be eliminated from the workflow later.
    try:
        self.confidence = float(self.split_line[-3]) / 1000.0
    except ValueError:
        raise NoConfidenceError("Could not parse a confidence value in "
                                "line '%s'" % self._line)
    logging.log(ULTRADEBUG, "Created an entity_line @ %d: %s (%s) %1.3f",
                self.line_id, self._cui, self._description,
                self.confidence)
def __init__(self, original_line):
    """Parse a SEMREP relation line: CUI1 --relation--> CUI2 plus its
    confidence."""
    SemrepLine.__init__(self, original_line)
    try:
        self._cui1 = self.split_line[11]
        if self._cui1 == '':
            raise IndexError()  # An empty CUI counts as a missing one
    except IndexError:
        raise CUINotFoundError("There was no CUI1 in the line '%s'" %
                               self._line)
    try:
        self._cui2 = self.split_line[33]
        if self._cui2 == '':
            raise IndexError()  # An empty CUI counts as a missing one
    except IndexError:
        raise CUINotFoundError("There was no CUI2 in the line '%s'" %
                               self._line)
    try:
        self._relation_type = self.split_line[24]
    except IndexError:
        raise ParsingError("Data missing from line '%s'" % self._line)
    try:
        self.confidence = float(self.split_line[-3]) / 1000.0
    except ValueError:
        raise NoConfidenceError("Could not parse a confidence value in "
                                "line '%s'" % self._line)
    logging.log(ULTRADEBUG,
                "Created a relation_line @ %d: %s--%s-->%s (%1.3f)",
                self.line_id, self._cui1, self._relation_type,
                self._cui2, self.confidence)
def __init__(self, original_line,
             cui_position=6,  # Support for different SEMREP output formats
             description_position=7,
             semantic_type_position=8):
    """Build an entity line from SEMREP output, pulling out the CUI,
    description, semantic type, and confidence value."""
    SemrepLine.__init__(self, original_line)
    try:
        cui = self.split_line[cui_position]
    except IndexError:
        raise CUINotFoundError("There was no CUI in the line '%s'" %
                               self._line)
    self._cui = cui
    if self._cui == '':
        raise CUINotFoundError("There was no CUI in the line '%s'" %
                               self._line)
    try:
        self._description = self.split_line[description_position]
        self._semantic_type = self.split_line[semantic_type_position]
    except IndexError:
        raise ParsingError("Data missing from line '%s'" % self._line)
    # Entities without a stated confidence get 0, so they can be
    # eliminated from the workflow later.
    try:
        self.confidence = float(self.split_line[-3]) / 1000.0
    except ValueError:
        raise NoConfidenceError("Could not parse a confidence value in "
                                "line '%s'" % self._line)
    logging.log(ULTRADEBUG, "Created an entity_line @ %d: %s (%s) %1.3f",
                self.line_id, self._cui, self._description,
                self.confidence)
def __init__(self, fileobject, transform_function):
    """Initialize the matrix and its per-row normalization factors.

    Tries to load precomputed factors from disk; when that fails they
    are regenerated and — unless the matrix came from an in-memory
    StringIO object — saved for future runs. Access to the factors
    file is serialized through _normfactor_lock.
    """
    SavccMatrix.__init__(self, fileobject, transform_function)
    logging.log(ULTRADEBUG, "Initializing normalization array")
    # Default behavior: no normalization
    self.normfactors = [1.0] * self._height
    # Tentative normalization array name
    array_filename = self._expected_norm_array_name()
    logging.debug("Trying to load a normalization array from disk. The "
                  "file should be named %s.", array_filename)
    # Make sure that only one process or thread at a time can attempt
    # to get the normalization factors.
    _normfactor_lock.acquire()
    try:
        try:
            # FIX: the original opened these files without ever closing
            # them; 'with' guarantees the handles are released.
            # NOTE(review): assumes _load/_save consume the file eagerly
            # and don't keep the handle — confirm.
            with open(array_filename, 'rb') as norm_file:
                self._load_normalization_factors(norm_file)
            logging.debug('Normalization factors loaded from disk.')
        except IOError:
            logging.debug("Unable to load normalization factors from disk.")
            self._generate_normalization_factors()
            # Only save normalization factors if they are not a StringIO
            # object
            if not isinstance(fileobject, StringIO.StringIO):
                logging.debug("Saving normalization factors to %s",
                              array_filename)
                try:
                    with open(array_filename, 'wb') as norm_file:
                        self._save_normalization_factors(norm_file)
                except IOError:
                    logging.warn("Unable to save the normalization array. "
                                 "It will have to be regenerated each "
                                 "time.")
    finally:
        _normfactor_lock.release()
def __init__(self, original_line):
    """Parse a SEMREP relation line, extracting both CUIs, the relation
    type, and the confidence."""
    SemrepLine.__init__(self, original_line)
    try:
        self._cui1 = self.split_line[11]
        if self._cui1 == '':
            raise IndexError()  # Empty CUI -> treat as missing
    except IndexError:
        raise CUINotFoundError("There was no CUI1 in the line '%s'" %
                               self._line)
    try:
        self._cui2 = self.split_line[33]
        if self._cui2 == '':
            raise IndexError()  # Empty CUI -> treat as missing
    except IndexError:
        raise CUINotFoundError("There was no CUI2 in the line '%s'" %
                               self._line)
    try:
        self._relation_type = self.split_line[24]
    except IndexError:
        raise ParsingError("Data missing from line '%s'" % self._line)
    try:
        self.confidence = float(self.split_line[-3]) / 1000.0
    except ValueError:
        raise NoConfidenceError("Could not parse a confidence value in "
                                "line '%s'" % self._line)
    logging.log(ULTRADEBUG,
                "Created a relation_line @ %d: %s--%s-->%s (%1.3f)",
                self.line_id, self._cui1, self._relation_type,
                self._cui2, self.confidence)
def __init__(self, damping_factor=0.85, max_iterations=10000,
             epsilon=0.0001):
    """Set up a ranker.

    damping_factor, max_iterations, and epsilon parameterize the
    iterative ranking; no statistics exist until a ranking has run."""
    logging.log(ULTRADEBUG, "Creating a ranker object.")
    self._max_iter = max_iterations
    self._e = epsilon
    self._d = damping_factor
    self._latest_stats = None
def _evaluate(self, gold_standard, seen_terms):
    """Compute SAVCC between a gold-standard term set and a seen set."""
    logging.debug('Gold standard=%s Seen terms=%s alpha=%1.5f',
                  gold_standard, seen_terms, self._alpha)
    standard = self._my_tree.term_vector(gold_standard)
    seen = self._my_tree.term_vector(seen_terms)
    # This computes [(alpha*I2)+(1-alpha x M)I2]
    weighted = seen.scale(self._alpha) + \
        self._my_matrix.mult_by_vector(seen).scale(1 - self._alpha)
    logging.log(ULTRADEBUG, "Modified term=%r", weighted)
    # I1 * modified_term
    numerator = standard.dot(weighted)
    # Denominator of the whole thing
    denominator = standard.length() * weighted.length()
    try:
        result = numerator / denominator
    except ZeroDivisionError:
        logging.warn(
            "ZeroDivisionError when computing SAVCC for %r and %r:",
            gold_standard, seen_terms)
        result = 0
    logging.log(ULTRADEBUG,
                "Numerator=%1.7f Denominator=%1.7f Result=%1.7f",
                numerator, denominator, result)
    return result
def unfreeze(self):
    """Restore dictionary state from the database.

    Falls back to very conservative defaults for any value that cannot
    be read from disk."""
    logging.log(ULTRADEBUG, "Restoring state from disk.")
    # A brand-new (empty) store has nothing to restore.
    if len(self.my_store.keys()) == 0:
        return
    try:
        self.sync_every = int(self.my_store[SYNC_KEY])
    except KeyError:
        self.sync_every = 1
    try:
        self.write_counter = int(self.my_store[COUNTER_KEY])
    except KeyError:
        self.write_counter = 0
    # Older DBDict versions did not store WRITE_EVERY; keep the current
    # default in that case (for compatibility).
    try:
        stored_write_every = int(self.my_store[WRITE_EVERY_KEY])
    except KeyError:
        stored_write_every = self.write_every
    self.write_every = stored_write_every
    logging.log(ULTRADEBUG, "State restored.")
def check_extra_checktag_rules(self, an_expression):
    """Check whether any term in the expression lives in a known
    checktag-emitting tree, and collect the matching checktags."""
    positions = [self._tree[a_term.term].position
                 for a_term in an_expression.utterance]
    for position in positions:
        for rule in self.data.extra_checktag_rules:
            # Check each position in each tree for membership in the
            # tree-based checktag rules.
            if 'in' in rule:
                in_rule = any([any([x in y for x in rule['in']])
                               for y in position])
            else:
                in_rule = False
            if 'not in' in rule:
                not_in_rule = any([any([x in y for x in rule['not in']])
                                   for y in position])
            else:
                not_in_rule = False
            if in_rule and not not_in_rule:
                logging.log(ULTRADEBUG,
                            "Expression %r matches checktag rule %r",
                            an_expression, rule)
                self._extra_checktags |= set([Term(x)
                                              for x in rule['terms']])
    return
def check_extra_checktag_rules(self, an_expression):
    """Scan the expression's tree positions against the tree-based
    checktag rules and accumulate any checktags they emit."""
    positions = [self._tree[a_term.term].position
                 for a_term in an_expression.utterance]
    for position in positions:
        for rule in self.data.extra_checktag_rules:
            # Membership test: at least one 'in' prefix must appear in
            # some position, and no 'not in' prefix may.
            matched = ('in' in rule and
                       any([any([x in y for x in rule['in']])
                            for y in position]))
            excluded = ('not in' in rule and
                        any([any([x in y for x in rule['not in']])
                             for y in position]))
            if matched and not excluded:
                logging.log(ULTRADEBUG,
                            "Expression %r matches checktag rule %r",
                            an_expression, rule)
                self._extra_checktags |= set([Term(x)
                                              for x in rule['terms']])
    return
def __init__(self, fileobject=None, transform_function=None):
    """Open a matrix file (the packaged default when none is given),
    read its size header, and precompute the byte->value transform."""
    if fileobject is None:
        # The default matrix is installed together with the package
        fileobject = open(_DEFAULT_MATRIX_NAME, "rb")
    # The first bytes of the file describe the matrix dimensions as two
    # big-endian unsigned shorts.
    self.header_size = struct.calcsize(">HH")
    self._height, self._width = struct.unpack(
        '>HH', fileobject.read(self.header_size))
    logging.debug("We're reading a %dx%d matrix", self._height,
                  self._width)
    # Keep a link to the file. We'll need it.
    self._matrix_file = fileobject
    # Maps each possible byte in the matrix to its actual value.
    self.transform = [0.0] * 256
    logging.log(ULTRADEBUG, "Building the transformation array.")
    for byte_value in xrange(255):
        self.transform[byte_value] = transform_function(byte_value)
    # We leave the last value (index 255) blank; it's always 0.
    logging.debug("The transformation array is %s.", str(self.transform))
    # Save the size of a byte for later
    self.byte_size = struct.calcsize('<B')
    self._cached_row = -1  # No row cached yet
    self._row_cache = None
def unfreeze(self):
    """Reload persisted dictionary state, assuming very conservative
    defaults when values are missing from disk."""
    logging.log(ULTRADEBUG, "Restoring state from disk.")
    # Nothing to do on a first-ever load (empty store).
    if len(self.my_store.keys()) == 0:
        return
    try:
        self.sync_every = int(self.my_store[SYNC_KEY])
    except KeyError:
        self.sync_every = 1
    try:
        self.write_counter = int(self.my_store[COUNTER_KEY])
    except KeyError:
        self.write_counter = 0
    try:
        # If there's no WRITE_EVERY, keep the default (compatibility
        # with previous versions of the DBDict).
        result = int(self.my_store[WRITE_EVERY_KEY])
    except KeyError:
        result = self.write_every
    self.write_every = result
    logging.log(ULTRADEBUG, "State restored.")
def init_storage(cache_name=pmid_cache.DEFAULT_CACHE_HOST,
                 fetch_new_articles=True):
    """Connect to the pubmed cache database (only once) and remember
    whether new articles should be fetched.

    NOTE(review): the collapsed original leaves the exact scope of the
    'if' ambiguous; this reading initializes everything on first call
    only — confirm against version control."""
    if Pmid.__article_cache is None:
        Pmid.__article_cache = pmid_cache.Client(cache_name)
        Pmid.__fetch_new = fetch_new_articles
        logging.log(ULTRADEBUG,
                    "Started up a connection to the pubmed cache database.")
def __init__(self, fileobject=None, transform_function=None):
    """Read a matrix file header and build the byte transformation
    table; the packaged default matrix is used when no file is given."""
    if fileobject is None:
        # The default matrix is installed together with the package
        fileobject = open(_DEFAULT_MATRIX_NAME, "rb")
    # Header: matrix height and width as two big-endian shorts.
    self.header_size = struct.calcsize(">HH")
    header_bytes = fileobject.read(self.header_size)
    self._height, self._width = struct.unpack('>HH', header_bytes)
    logging.debug("We're reading a %dx%d matrix", self._height,
                  self._width)
    # Keep a link to the file. We'll need it.
    self._matrix_file = fileobject
    # Mapping from stored bytes to actual matrix values.
    self.transform = [0.0] * 256
    logging.log(ULTRADEBUG, "Building the transformation array.")
    for i in xrange(255):
        self.transform[i] = transform_function(i)
    # The last entry is intentionally left at 0; it's always 0.
    logging.debug("The transformation array is %s.", str(self.transform))
    # Save the size of a byte for later
    self.byte_size = struct.calcsize('<B')
    self._cached_row = -1
    self._row_cache = None
def __init__(self, original_line):
    """Parse an MTI output line: CUI, description, source, type, and
    confidence."""
    Line.__init__(self, original_line, id_position=0)
    try:
        self._cui = self.split_line[2]
    except IndexError:
        raise CUINotFoundError("There was no CUI in the line '%s'" %
                               self._line)
    if self._cui == '':
        raise CUINotFoundError("There was no CUI in the line '%s'" %
                               self._line)
    try:
        self._description = self.split_line[1]
        self._source = self.split_line[7]
        self._type = self.split_line[4].upper()
    except IndexError:
        raise ParsingError("Data missing from line '%s'" % self._line)
    # Some entities have no stated confidence. We use 0 in such cases,
    # so they can be eliminated from the workflow later.
    try:
        self.confidence = float(self.split_line[3]) / 1000.0
    except ValueError:
        raise NoConfidenceError("Could not parse a confidence value in "
                                "line '%s'" % self._line)
    logging.log(ULTRADEBUG, "Created a MtiLine @ %d: %s (%s) %1.3f",
                self.line_id, self._cui, self._description,
                self.confidence)
def init_storage(cache_name=pmid_cache.DEFAULT_CACHE_HOST,
                 fetch_new_articles=True):
    """One-time connection to the pubmed article cache.

    NOTE(review): the collapsed original makes the 'if' body boundary
    ambiguous; this reading performs all setup only on the first call —
    confirm against version control."""
    if Pmid.__article_cache is None:
        Pmid.__article_cache = pmid_cache.Client(cache_name)
        Pmid.__fetch_new = fetch_new_articles
        logging.log(ULTRADEBUG,
                    "Started up a connection to the pubmed cache database.")
def check_for_subheadings(self, an_expression):
    """Return the subheading terms (if any) that this expression's tree
    positions require; an empty list when no rule matches."""
    positions = [self._tree[a_term.term].position
                 for a_term in an_expression.utterance]
    for position in positions:
        for rule in self.data.subheading_rules:
            # Check each position in each tree for membership in the
            # tree-based subheading rules.
            if 'in' in rule:
                in_rule = any([any([x in y for x in rule['in']])
                               for y in position])
            else:
                in_rule = False
            if 'not in' in rule:
                not_in_rule = any([any([x in y for x in rule['not in']])
                                   for y in position])
            else:
                not_in_rule = False
            if in_rule and not not_in_rule:
                logging.log(ULTRADEBUG,
                            "Expression %r matches subheading rule %r",
                            an_expression, rule)
                # First matching rule wins.
                return [Term(x) for x in rule['terms']]
    return []
def sentence_iterator(self, list_of_lines):
    """Iterate through the lines, yielding one group per line_id.

    Once iteration completes, the emitted-sentence count is added to
    the graph builder's measurements."""
    current_group = []
    current_id = None
    sentences = 0
    for line in list_of_lines:
        if line.line_id == current_id:
            current_group.append(line)
            continue
        # The id changed: flush the finished group (none exists on the
        # very first line).
        if current_id is not None:
            logging.log(ULTRADEBUG, "Emitting sentence %s with %d terms",
                        current_id, len(current_group))
            sentences += 1
            yield current_group
        current_id = line.line_id
        current_group = [line]
    # Flush the trailing group, if any.
    if len(current_group) > 0:
        logging.log(ULTRADEBUG, "Emitting last sentence %s with %d terms",
                    current_id, len(current_group))
        sentences += 1
        yield current_group
    # Iteration ended - record the measurement
    self._measurements.add(ArticleSentenceCount(sentences))
    return
def __iter__(self):
    """Group the underlying lines into sets keyed by the chunkmap and
    yield each completed set.

    Lines without a chunkmap entry are emitted under negative fallback
    ids, decremented after each orphan set is emitted."""
    current_set = []
    current_id = None
    bad_id = -1  # Fallback id for lines with no chunkmap equivalent
    for line in NLMOutput.__iter__(self):
        try:
            this_lines_set_id = self._chunkmap.pmid_from_block(line.line_id)
        except KeyError:
            logging.warn("Line without chunkmap equivalent. Emitting"
                         " as id %d", bad_id)
            this_lines_set_id = Pmid(bad_id)
        if this_lines_set_id != current_id:
            # On the first pass there is nothing to emit; afterwards a
            # new id means the previous set is complete.
            if current_id is not None:
                logging.log(ULTRADEBUG, "Completed set of lines %s "
                            "according to the chunkmap. Emitting them.",
                            current_id)
                if current_id < 0:
                    # Move on to the next fallback id for orphans.
                    bad_id -= 1
                yield self._lines_type(current_id, current_set)
            # Start a new, empty linelist
            current_id = this_lines_set_id
            current_set = []
        current_set.append(line)
    # Emit whatever remains once the iteration is over.
    if len(current_set) > 0:
        logging.log(ULTRADEBUG, "Completed iteration. Emitting the last "
                    "lines left with set id %s", current_id)
        yield self._lines_type(current_id, current_set)
    return
def run(self):
    """Perform the evaluation over every article in the reader and
    write out the combined results and metadata."""
    logging.info("Starting workflow %r run", self)
    all_results = {}
    evaluator = self.create_evaluator()
    count = 0
    for each_article in self._reader:
        count += 1
        logging.info("Working on article %d: %r", count, each_article)
        if not self.include_article(each_article):
            logging.log(ULTRADEBUG, "Skipping article %r due to exclusion "
                        " criteria.", each_article)
            continue
        try:
            ranked_article = self.graph_and_rank(each_article)
        except CouldNotRank:
            # Unrankable articles are silently skipped.
            continue
        converted_terms = self.convert(ranked_article)
        cut_terms = converted_terms.terms_higher_than_or_equal_to(
            self._ranking_cutoff)
        logging.debug("Lowest-ranking term is term #%d out of %d"
                      " (score=%1.5f, highest score=%1.5f)",
                      len(cut_terms), len(converted_terms),
                      [x[1] for x in cut_terms][-1],
                      [x[1] for x in cut_terms][0])
        medline_record_mesh_terms = ExpressionList().from_medline(
            each_article.set_id.article_record().mesh_headings)
        flat_medline = medline_record_mesh_terms.flatten()
        flattened_terms = self.flatten_generated_terms(flat_medline,
                                                       cut_terms)
        flattened_terms = self.limit_length(flat_medline, flattened_terms)
        if len(flat_medline) == 0:
            logging.warn("No gold standard available for article %r. "
                         "Omitting it from the result set.", each_article)
            continue
        eval_result = self.perform_evaluation(each_article, evaluator,
                                              flat_medline,
                                              flattened_terms)
        flattened_major_headings = \
            medline_record_mesh_terms.major_headings()
        logging.debug("Original headings: %r Major headings: %r",
                      medline_record_mesh_terms,
                      flattened_major_headings)
        mh_result_temp = self.perform_evaluation(each_article, evaluator,
                                                 flattened_major_headings,
                                                 flattened_terms)
        mh_result = NamedResultSet("mh_", mh_result_temp)
        # Compute the total recall, too
        total_recall = self.compute_total_recall(flat_medline,
                                                 converted_terms)
        eval_result.add(total_recall)
        # Unify the result sets
        all_results[each_article.set_id] = eval_result | mh_result
    logging.info("Writing out results.")
    self.output(all_results)
    self.output_metadata()
    return
def run(self):
    """Perform the evaluation over every article in the reader and
    write out the combined results and metadata.

    FIX: the original text had the "No gold standard available..."
    warning string literal split by a raw newline (syntax corruption);
    it is reconstructed here to match the intact sibling workflow.
    """
    logging.info("Starting workflow %r run", self)
    all_results = {}
    evaluator = self.create_evaluator()
    count = 0
    for each_article in self._reader:
        count += 1
        logging.info("Working on article %d: %r", count, each_article)
        if not self.include_article(each_article):
            logging.log(
                ULTRADEBUG, "Skipping article %r due to exclusion "
                " criteria.", each_article)
            continue
        try:
            ranked_article = self.graph_and_rank(each_article)
        except CouldNotRank:
            # Unrankable articles are silently skipped.
            continue
        converted_terms = self.convert(ranked_article)
        cut_terms = converted_terms.terms_higher_than_or_equal_to(
            self._ranking_cutoff)
        logging.debug(
            "Lowest-ranking term is term #%d out of %d"
            " (score=%1.5f, highest score=%1.5f)", len(cut_terms),
            len(converted_terms), [x[1] for x in cut_terms][-1],
            [x[1] for x in cut_terms][0])
        medline_record_mesh_terms = ExpressionList().from_medline(
            each_article.set_id.article_record().mesh_headings)
        flat_medline = medline_record_mesh_terms.flatten()
        flattened_terms = self.flatten_generated_terms(
            flat_medline, cut_terms)
        flattened_terms = self.limit_length(flat_medline, flattened_terms)
        if len(flat_medline) == 0:
            logging.warn(
                "No gold standard available for article %r. "
                "Omitting it from the result set.", each_article)
            continue
        eval_result = self.perform_evaluation(each_article, evaluator,
                                              flat_medline,
                                              flattened_terms)
        flattened_major_headings = \
            medline_record_mesh_terms.major_headings()
        logging.debug("Original headings: %r Major headings: %r",
                      medline_record_mesh_terms,
                      flattened_major_headings)
        mh_result_temp = self.perform_evaluation(each_article, evaluator,
                                                 flattened_major_headings,
                                                 flattened_terms)
        mh_result = NamedResultSet("mh_", mh_result_temp)
        # Compute the total recall, too
        total_recall = self.compute_total_recall(flat_medline,
                                                 converted_terms)
        eval_result.add(total_recall)
        # Unify the result sets
        all_results[each_article.set_id] = eval_result | mh_result
    logging.info("Writing out results.")
    self.output(all_results)
    self.output_metadata()
    return
def __init__(self, damping_factor=0.85, max_iterations=10000,
             epsilon=0.0001):
    """Create a ranker with the given damping factor, iteration cap,
    and convergence threshold."""
    logging.log(ULTRADEBUG, "Creating a ranker object.")
    # Store the ranking parameters; statistics are filled in after a
    # ranking actually runs.
    self._d = damping_factor
    self._max_iter = max_iterations
    self._e = epsilon
    self._latest_stats = None
def ignore_exception(self, which_exception, on_which_line):
    """Decide whether a parser exception is actionable or ignorable.

    CUINotFoundError is ignorable: without a CUI, MEDRank can do
    nothing with the line. Returns True to ignore, False otherwise.
    """
    if type(which_exception) is CUINotFoundError:
        # FIX: pass the line as a lazy logging argument instead of
        # %-formatting it eagerly (the original built the string even
        # when the log level was disabled, unlike the rest of the file).
        logging.log(ULTRADEBUG,
                    "Skipping line '%s' because no CUI could be found "
                    "on it", on_which_line)
        return True
    return False
def ignore_exception(self, which_exception, on_which_line):
    """Report whether a parsing exception matches a known SEMREP output
    problem whose line should simply be skipped."""
    if type(which_exception) is not WrongTypeOfLineError:
        return False
    logging.log(ULTRADEBUG,
                "Skipping line '%s' because its type could not be "
                "determined.", on_which_line)
    return True
def is_ignorable(self, which_line):
    """Return True when the line contains any of the configured
    skippable strings."""
    for skippable in self.__lines_to_ignore:
        if skippable in which_line:
            logging.log(ULTRADEBUG, "Line '%s' contains this "
                        "skippable string: '%s'", which_line, skippable)
            return True
    return False
def evaluate(self, term_list_1, term_list_2):
    """Run every evaluator in this group (in no particular order) and
    collect their outputs into a single ResultSet."""
    collected = ResultSet()
    for member in self:
        logging.log(ULTRADEBUG, "Applying %s as part of an EvaluationGroup",
                    member.__class__.__name__)
        collected.add(member.evaluate(term_list_1, term_list_2))
    return collected
def _init_inverse_lookup(self):
    """Build the dictionary that maps tree positions back to their
    terms, enabling reverse lookups."""
    logging.debug("First request of a reverse lookup. Building the " \
                  "inverse lookup dictionary.")
    inverse = {}
    for term, entry in self._tree.iteritems():
        for each_position in entry.position:
            inverse[each_position] = term
    self._invlookup = inverse
    logging.log(ULTRADEBUG, "Done building inverse lookup dictionary.")
    return
def is_ignorable(self, which_line):
    """Check the input line against the configured list of useless
    strings; True means the line should be skipped."""
    for marker in self.__lines_to_ignore:
        if marker not in which_line:
            continue
        logging.log(
            ULTRADEBUG, "Line '%s' contains this "
            "skippable string: '%s'", which_line, marker)
        return True
    return False
def _init_inverse_lookup(self):
    """Populate self._invlookup so positions can be mapped back to the
    terms that own them (reverse lookups)."""
    logging.debug("First request of a reverse lookup. Building the " \
                  "inverse lookup dictionary.")
    self._invlookup = {}
    for key, node in self._tree.iteritems():
        # Every position owned by this node points back to its key.
        for pos in node.position:
            self._invlookup[pos] = key
    logging.log(ULTRADEBUG, "Done building inverse lookup dictionary.")
    return
def ignore_exception(self, which_exception, on_which_line):
    """Tell whether a parsing exception is a known SEMREP output
    problem; such lines are skipped rather than treated as errors."""
    if type(which_exception) is WrongTypeOfLineError:
        logging.log(ULTRADEBUG,
                    "Skipping line '%s' because its type could not be "
                    "determined.", on_which_line)
        return True
    return False
def __init__(self, original_line):
    """Parse an utterance line and extract its numeric set id."""
    MachineOutputLine.__init__(self, original_line)
    if self.line_type != 'utterance':
        raise WrongTypeOfLineError("%r is not an utterance." % self.line)
    parsed = UtteranceLine.parser.match(self.line).groupdict()
    self._my_line_id = int(UtteranceLine.numbers.findall(parsed['id'])[0])
    logging.log(ULTRADEBUG, "Created an UtteranceLine with set id %d",
                self._my_line_id)
    # Subsequent lines from this lexer belong to this utterance's id.
    MachineOutputLine._line_id = self._my_line_id
def __del__(self):
    """Tear down the backing store under the lock: commit when
    persistent, close, and unlink the temporary file otherwise.

    NOTE(review): the collapsed original leaves the scope of the first
    'if' ambiguous; this reading commits only when persistent but
    always closes — confirm against version control."""
    self._lock.acquire()
    try:
        if self.my_persistence:
            self.__t.commit()
        self.__t.close()
        if not self.my_persistence:
            logging.log(ULTRADEBUG, "Deleting temporary file %r",
                        self.my_filename)
            os.unlink(self.my_filename)
    finally:
        self._lock.release()
def __init__(self, original_line):
    """Build an UtteranceLine, pulling the numeric set id out of the
    parsed 'id' group."""
    MachineOutputLine.__init__(self, original_line)
    if self.line_type != 'utterance':
        raise WrongTypeOfLineError("%r is not an utterance." % self.line)
    groups = UtteranceLine.parser.match(self.line).groupdict()
    digits = UtteranceLine.numbers.findall(groups['id'])
    self._my_line_id = int(digits[0])
    logging.log(ULTRADEBUG, "Created an UtteranceLine with set id %d",
                self._my_line_id)
    # Update the line id for this lexer
    MachineOutputLine._line_id = self._my_line_id
def process_article(self, each_article):
    """Rank one article, evaluate the result against its MEDLINE
    record, and store the combined outcome in self.all_results.

    Articles that are excluded, unrankable, or missing records/gold
    standards are skipped silently (with a log message).
    """
    if not self.include_article(each_article):
        logging.log(ULTRADEBUG, "Skipping article %r due to exclusion "
                    " criteria.", each_article)
        return
    try:
        ranked_article = self.graph_and_rank(each_article)
    except CouldNotRank:
        return
    logging.debug("Ranked article: %r", ranked_article)
    converted_terms = self.convert(ranked_article)
    logging.debug("Converted terms: %r", converted_terms)
    cut_terms = converted_terms.terms_higher_than_or_equal_to(
        self._ranking_cutoff)
    logging.debug("Cut terms: %r", cut_terms)
    try:
        medline_record_mesh_terms = ExpressionList().from_medline(
            each_article.set_id.article_record()['MH'])
    except Exception:
        # FIX: was a bare 'except:'; narrowed to Exception so
        # SystemExit/KeyboardInterrupt still propagate while keeping
        # the best-effort record fetch.
        logging.warn("Could not obtain an article record for %r. "
                     "Skipping.", each_article)
        return
    flat_medline = medline_record_mesh_terms.flatten()
    flattened_terms = self.flatten_generated_terms(flat_medline,
                                                   cut_terms)
    flattened_terms = self.limit_length(flat_medline, flattened_terms)
    if len(flat_medline) == 0:
        logging.warn("No gold standard available for article %r. "
                     "Omitting it from the result set.", each_article)
        return
    eval_result = self.perform_evaluation(each_article, self.evaluator,
                                          flat_medline, flattened_terms)
    flattened_major_headings = \
        medline_record_mesh_terms.major_headings()
    logging.debug("Flattened MeSH terms: %r", flat_medline)
    logging.debug("Flattened generated terms: %r", flattened_terms)
    mh_result_temp = self.perform_evaluation(each_article, self.evaluator,
                                             flattened_major_headings,
                                             flattened_terms)
    mh_result = NamedResultSet("major_", mh_result_temp)
    # Compute the total recall, too
    total_recall = self.compute_total_recall(flat_medline,
                                             converted_terms)
    eval_result.add(total_recall)
    # Unify the result sets
    self.all_results[each_article.set_id] = eval_result | mh_result
    return
def check_checktag_rules(self, CUI):
    """Emit checktags for a CUI that belongs to a checktag-mapped MTI
    list.

    Every list except the special '_exclusions' entry is tested for
    membership; each matching list contributes its checktags to
    self._extra_checktags."""
    for (listname, checktags) in self.data.checktag_rules.iteritems():
        if listname == '_exclusions':
            continue  # Exclusion lists never emit checktags
        if CUI not in self.data.lists[listname]:
            continue
        logging.log(ULTRADEBUG,
                    'CUI %r matches list %r. Checktags %r added.',
                    CUI, listname, checktags)
        self._extra_checktags |= set([Term(x) for x in checktags])
def process_item(self, one_item):
    """Rank one item and store the terms that clear the ranking
    cutoff in self.all_results."""
    if not self.include_item(one_item):
        logging.log(ULTRADEBUG, "Skipping item %r due to exclusion "
                    " criteria.", one_item)
        return
    try:
        ranked = self.graph_and_rank(one_item)
    except CouldNotRank:
        # Items that cannot be ranked are dropped silently.
        return
    surviving = [term for term in ranked
                 if term[1] >= self._ranking_cutoff]
    # Unify the result sets
    self.all_results[one_item.set_id] = surviving
    return
def _create_graph(self, list_of_lines):
    """Build a METAMAP proximity co-occurrence graph from the lines.

    Within each sentence, every pair of admitted concepts is linked.
    Link strength is a function of the distance between the concepts;
    when a direction inferrer is available it may orient the link.
    """
    new_graph = self._type_of_graph_to_build()
    logging.log(
        ULTRADEBUG, "Building a METAMAP proximity co-occurrence graph "
        "from %r", list_of_lines)
    # Iterate through each sentence, emitting links for each pair of
    # adjacent concepts (concept evaluators permitting)
    for sentence in self.sentence_iterator(list_of_lines):
        nodes = []
        for concept in sentence:
            if not isinstance(concept, MetamapLine):
                # FIX: log the offending line itself; the original
                # passed the whole sentence to this message.
                logging.log(
                    ULTRADEBUG, "Skipping line %r, as it isn't a "
                    "MetamapLine", concept)
                continue
            new_node = self._node_factory(concept.CUI, concept.description,
                                          concept.confidence, concept.line)
            if self.include_node(new_node):
                nodes.append(new_node)
                logging.log(ULTRADEBUG, "%r included in the graph",
                            new_node)
            else:
                logging.log(ULTRADEBUG, "%r excluded from the graph",
                            new_node)
        for i in xrange(len(nodes) - 1):
            for j in xrange(i + 1, len(nodes)):
                # Adjacent nodes are related more in this model.
                # The weight of the relationship is given by the distance
                node1, node2 = nodes[i], nodes[j]
                if self._direction_inferrer is None:
                    new_link = self._adirectional_link_factory(
                        node1, node2, self._link_strength(j - i))
                else:
                    new_dir = \
                        self._direction_inferrer.infer_relation_direction(
                            node1.node_id, node2.node_id)
                    if new_dir == 0:
                        new_link = self._adirectional_link_factory(
                            node1, node2, self._link_strength(j - i))
                    else:
                        new_link = self._link_factory(
                            node1, node2,
                            new_dir * self._link_strength(j - i))
                if self.include_link(new_link):
                    new_graph.add_relationship(new_link)
                else:
                    logging.log(ULTRADEBUG,
                                "Excluding link %r from the graph",
                                new_link)
    return new_graph
def compute_measures(self):
    """Compute graph metrics for the current object and return them as
    a ResultSet."""
    self._consolidate_if_necessary()
    logging.log(ULTRADEBUG, "Computing graph metrics for %r", self)
    measures = ResultSet()
    measures.add(GraphNumberLinks(len(self._relationships)))
    # Collect the distinct nodes appearing on either end of a link.
    unique_nodes = set()
    for a_relation in self._relationships:
        unique_nodes.add(a_relation.node1)
        unique_nodes.add(a_relation.node2)
    measures.add(GraphNumberNodes(len(unique_nodes)))
    measures.add(GraphAverageNodeWeight(
        reduce(operator.add, [x.weight for x in unique_nodes]) /
        float(len(unique_nodes))))
    measures.add(GraphAverageLinkWeight(
        reduce(operator.add, [x.weight for x in self._relationships]) /
        float(len(self._relationships))))
    measures.add(GraphLinkDegree(float(len(self._relationships)) /
                                 float(len(unique_nodes))))
    logging.log(ULTRADEBUG, "Starting computation of the distance matrix.")
    distmat = DistanceMatrix(self.as_mapped_link_matrix())
    logging.log(ULTRADEBUG, "Distance matrix obtained. Computing stats.")
    rocs = [distmat.relative_out_centrality(x)
            for x in xrange(len(distmat))]
    rics = [distmat.relative_in_centrality(x)
            for x in xrange(len(distmat))]
    measures.add(GraphRelativeOutCentrality(
        reduce(operator.add, rocs) / float(len(distmat))))
    measures.add(GraphRelativeInCentrality(
        reduce(operator.add, rics) / float(len(distmat))))
    measures.add(GraphStratum(distmat.stratum()))
    measures.add(GraphCompactness(distmat.compactness()))
    logging.log(ULTRADEBUG, "Finished computing graph metrics.")
    return measures
def _create_graph(self, list_of_lines):
    """Builds a SEMREP co-occurrence graph in which only adjacent
    EntityMeSHLine concepts within a sentence are linked; each link's
    weight is the mean of the two node weights."""
    graph = self._type_of_graph_to_build()
    logging.log(ULTRADEBUG,
                "Building a SEMREP co-occurrence graph from %r",
                list_of_lines)
    # Iterate through each sentence, emitting links for each pair of
    # adjacent concepts (concept evaluators permitting)
    for sentence in self.sentence_iterator(list_of_lines):
        accepted = []
        for line in sentence:
            if not isinstance(line, EntityMeSHLine):
                logging.log(ULTRADEBUG,
                            "Skipping line %r, as it isn't an "
                            "EntityMeSHLine", line)
                continue
            candidate = self._node_factory(line.CUI, line.description,
                                           line.confidence, line.mesh)
            if not self.include_node(candidate):
                logging.log(ULTRADEBUG, "%r excluded from the graph",
                            candidate)
                continue
            accepted.append(candidate)
        # Adjacent nodes are related in this model.
        for left, right in zip(accepted, accepted[1:]):
            link = self._adirectional_link_factory(
                left, right, (left.weight + right.weight) / 2.0)
            if self.include_link(link):
                graph.add_relationship(link)
            else:
                logging.log(ULTRADEBUG,
                            "Excluding link %r from the graph", link)
    return graph
def _create_graph(self, list_of_lines):
    """Builds a SEMREP co-occurrence graph in which only adjacent
    EntityLine concepts within a sentence are linked; each link's
    weight is the mean of the two node weights."""
    graph = self._type_of_graph_to_build()
    logging.log(ULTRADEBUG,
                "Building a SEMREP co-occurrence graph from %r",
                list_of_lines)
    # Iterate through each sentence, emitting links for each pair of
    # adjacent concepts (concept evaluators permitting)
    for sentence in self.sentence_iterator(list_of_lines):
        accepted = []
        for line in sentence:
            if not isinstance(line, EntityLine):
                logging.log(ULTRADEBUG,
                            "Skipping line %r, as it isn't an "
                            "EntityLine", line)
                continue
            candidate = self._node_factory(line.CUI, line.description,
                                           line.confidence)
            if not self.include_node(candidate):
                logging.log(ULTRADEBUG, "%r excluded from the graph",
                            candidate)
                continue
            accepted.append(candidate)
        # Adjacent nodes are related in this model.
        for left, right in zip(accepted, accepted[1:]):
            link = self._adirectional_link_factory(
                left, right, (left.weight + right.weight) / 2.0)
            if self.include_link(link):
                graph.add_relationship(link)
            else:
                logging.log(ULTRADEBUG,
                            "Excluding link %r from the graph", link)
    return graph
def check_checktag_rules(self, CUI):
    """Compares a CUI to the checktag rules, and emits checktags if it
    matches. If a CUI is a member of an MTI list, and the list is a known
    match to a checktag, we emit the checktag at the end of the
    process."""
    for listname, checktags in self.data.checktag_rules.iteritems():
        # Exclusion lists never trigger checktags.
        if listname == '_exclusions':
            continue
        if CUI not in self.data.lists[listname]:
            continue
        logging.log(ULTRADEBUG,
                    'CUI %r matches list %r. Checktags %r added.',
                    CUI, listname, checktags)
        self._extra_checktags |= set(Term(tag) for tag in checktags)
def process_item(self, one_item):
    """Ranks a single item and stores the scores that survive the
    ranking cutoff in self.all_results, keyed by the item's set_id.
    Excluded or unrankable items are skipped silently."""
    if not self.include_item(one_item):
        logging.log(ULTRADEBUG, "Skipping item %r due to exclusion "
                    " criteria.", one_item)
        return
    try:
        ranked = self.graph_and_rank(one_item)
    except CouldNotRank:
        # Item could not be turned into a rankable matrix; skip it.
        return
    surviving = [pair for pair in ranked
                 if pair[1] >= self._ranking_cutoff]
    # Unify the result sets
    self.all_results[one_item.set_id] = surviving
    return
def process_article(self, each_article):
    """Graph, rank, convert, and evaluate one article against its
    MEDLINE gold standard.

    Returns early (recording nothing) when the article is excluded,
    cannot be ranked, has no retrievable MEDLINE record, or has an
    empty gold standard. Otherwise stores the combined evaluation
    (overall + major-heading) in self.all_results under the article's
    set_id.

    Fix: the record fetch used a bare except:, which also swallowed
    SystemExit/KeyboardInterrupt; narrowed to Exception.
    """
    if not self.include_article(each_article):
        logging.log(ULTRADEBUG, "Skipping article %r due to exclusion "
                    " criteria.", each_article)
        return
    try:
        ranked_article = self.graph_and_rank(each_article)
    except CouldNotRank:
        return
    logging.debug("Ranked article: %r", ranked_article)
    converted_terms = self.convert(ranked_article)
    logging.debug("Converted terms: %r", converted_terms)
    cut_terms = converted_terms.terms_higher_than_or_equal_to(
        self._ranking_cutoff)
    logging.debug("Cut terms: %r", cut_terms)
    try:
        medline_record_mesh_terms = ExpressionList().from_medline(
            each_article.set_id.article_record()['MH'])
    except Exception:
        # Any failure to fetch/parse the record (network, missing 'MH'
        # field, etc.) means this article cannot be evaluated.
        logging.warn("Could not obtain an article record for %r. "
                     "Skipping.", each_article)
        return
    flat_medline = medline_record_mesh_terms.flatten()
    flattened_terms = self.flatten_generated_terms(flat_medline,
                                                   cut_terms)
    flattened_terms = self.limit_length(flat_medline, flattened_terms)
    if len(flat_medline) == 0:
        logging.warn("No gold standard available for article %r. "
                     "Omitting it from the result set.", each_article)
        return
    eval_result = self.perform_evaluation(each_article, self.evaluator,
                                          flat_medline, flattened_terms)
    flattened_major_headings = \
        medline_record_mesh_terms.major_headings()
    logging.debug("Flattened MeSH terms: %r", flat_medline)
    logging.debug("Flattened generated terms: %r", flattened_terms)
    mh_result_temp = self.perform_evaluation(each_article,
                                             self.evaluator,
                                             flattened_major_headings,
                                             flattened_terms)
    mh_result = NamedResultSet("major_", mh_result_temp)
    # Compute the total recall, too
    total_recall = self.compute_total_recall(flat_medline,
                                             converted_terms)
    eval_result.add(total_recall)
    # Unify the result sets
    self.all_results[each_article.set_id] = eval_result | mh_result
    return
def _create_table_if_necessary(self):
    """Creates the key/value table 's' on first use.

    Probes the table with a cheap SELECT; sqlite3.OperationalError is
    taken to mean the table does not exist yet, in which case it is
    created. The entire probe-and-create sequence runs under the
    instance lock so concurrent callers cannot race the CREATE.
    """
    self._lock.acquire()
    try:
        try:
            self.__t.execute('select * from s limit 1')
        except sqlite3.OperationalError:
            # No such table yet: this must be a fresh database file.
            logging.log(ULTRADEBUG,
                        "Table doesn't exist - must be a new database.")
            self.__t.execute("""create table s (pkey TEXT PRIMARY KEY NOT NULL, data BLOB NOT NULL)""")
            logging.debug("Table created.")
    finally:
        self._lock.release()
    return
def convert(self, a_ranked_result_set):
    """Convert a ranked result set into a RankedConversionResult. In
    other words, convert a ranked term list to its MeSH equivalents.

    Fix: incoming_score was bound only inside the loop, so an empty
    result set whose end_conversion() still produced an utterance
    raised NameError; it now defaults to 0.0.
    """
    result = RankedConversionResult()
    self._my_converter.start_conversion()
    incoming_score = 0.0  # Fallback score for an empty result set.
    for incoming_term, incoming_score in a_ranked_result_set:
        converted = self._my_converter.convert(
            Concept(incoming_term.node_id))
        if converted.utterance != []:
            result.add_term_score(converted, incoming_score)
    # Terms accumulated during conversion (e.g. checktags) get a boost
    # over the last score seen in the ranked list.
    converted = self._my_converter.end_conversion()
    if converted.utterance != []:
        result.add_term_score(converted,
                              incoming_score + self._checktag_boost)
    logging.log(ULTRADEBUG, "RankedConverter results: %r", result)
    return result
def convert(self, a_ranked_result_set):
    """Convert a ranked result set into a RankedConversionResult. In
    other words, convert a ranked term list to its MeSH equivalents.

    Fix: incoming_score was bound only inside the loop, so an empty
    result set whose end_conversion() still produced an utterance
    raised NameError; it now defaults to 0.0.
    """
    result = RankedConversionResult()
    self._my_converter.start_conversion()
    incoming_score = 0.0  # Fallback score for an empty result set.
    for incoming_term, incoming_score in a_ranked_result_set:
        converted = self._my_converter.convert(
            Concept(incoming_term.node_id))
        if converted.utterance != []:
            result.add_term_score(converted, incoming_score)
    # Terms accumulated during conversion (e.g. checktags) get a boost
    # over the last score seen in the ranked list.
    converted = self._my_converter.end_conversion()
    if converted.utterance != []:
        result.add_term_score(converted,
                              incoming_score + self._checktag_boost)
    logging.log(ULTRADEBUG, "RankedConverter results: %r", result)
    return result
def _create_graph(self, list_of_lines):
    """Build a METAMAP proximity co-occurrence graph from parsed lines.

    Iterates sentence by sentence; within each sentence, every pair of
    accepted nodes is linked, and the link strength decreases with the
    positional distance between the two concepts. When a direction
    inferrer is configured, its verdict orients the link (0 means keep
    it adirectional).

    Fix: the "Skipping line" message previously logged the whole
    sentence; it now logs the offending concept line itself.
    """
    new_graph = self._type_of_graph_to_build()
    logging.log(ULTRADEBUG,
                "Building a METAMAP proximity co-occurrence graph "
                "from %r", list_of_lines)
    # Iterate through each sentence, emitting links for each pair of
    # adjacent concepts (concept evaluators permitting)
    for sentence in self.sentence_iterator(list_of_lines):
        nodes = []
        for concept in sentence:
            if not isinstance(concept, MetamapLine):
                # Log the actual offending line, not the sentence.
                logging.log(ULTRADEBUG,
                            "Skipping line %r, as it isn't a "
                            "MetamapLine", concept)
                continue
            new_node = self._node_factory(concept.CUI,
                                          concept.description,
                                          concept.confidence,
                                          concept.line)
            if self.include_node(new_node):
                nodes.append(new_node)
                logging.log(ULTRADEBUG, "%r included in the graph",
                            new_node)
            else:
                logging.log(ULTRADEBUG, "%r excluded from the graph",
                            new_node)
        for i in xrange(len(nodes) - 1):
            for j in xrange(i + 1, len(nodes)):
                # Adjacent nodes are related more in this model.
                # The weight of the relationship is given by the distance
                node1, node2 = nodes[i], nodes[j]
                if self._direction_inferrer is None:
                    new_link = self._adirectional_link_factory(
                        node1, node2, self._link_strength(j - i))
                else:
                    new_dir = \
                        self._direction_inferrer.infer_relation_direction(
                            node1.node_id, node2.node_id)
                    if new_dir == 0:
                        new_link = self._adirectional_link_factory(
                            node1, node2, self._link_strength(j - i))
                    else:
                        new_link = self._link_factory(
                            node1, node2,
                            new_dir * self._link_strength(j - i))
                if self.include_link(new_link):
                    new_graph.add_relationship(new_link)
                else:
                    logging.log(ULTRADEBUG,
                                "Excluding link %r from the graph",
                                new_link)
    return new_graph
def mult_by_vector(self, vector):
    """Multiplies the matrix by a vocabulary_vector, returning a new
    vocabulary_vector"""
    logging.log(ULTRADEBUG, "Multiplying by %s", vector)
    # The vector length must match the matrix width.
    if len(vector) != self._width:
        raise ValueError("The vector and matrix shapes do not match.")
    product = VocabularyVector(self._height)
    # Only columns where the vector is nonzero can contribute.
    active_columns = vector.nonzero()
    for row in xrange(self._height):
        product[row] = sum((self[row, col] * vector[col]
                            for col in active_columns), 0.0)
    return product
def mult_by_vector(self, vector):
    """Multiplies the matrix by a vocabulary_vector, returning a new
    vocabulary_vector"""
    logging.log(ULTRADEBUG, "Multiplying by %s", vector)
    # The vector length must match the matrix width.
    if len(vector) != self._width:
        raise ValueError("The vector and matrix shapes do not match.")
    product = VocabularyVector(self._height)
    # Only columns where the vector is nonzero can contribute.
    active_columns = vector.nonzero()
    for row in xrange(self._height):
        product[row] = sum((self[row, col] * vector[col]
                            for col in active_columns), 0.0)
    return product
def processor(workflow_class, graph_builder_constructor, graph_builder_params,
              ranker_constructor, ranker_params, eval_parameters,
              ranking_cutoff, mesh_tree_filename, distance_matrix_filename,
              distance_function, umls_converter_data_filename,
              extra_data_name, extra_data_contents, my_input_queue,
              my_output_queue, my_own_name=None):
    """Worker-process entry point.

    Builds a workflow from the supplied constructors/parameters, then
    services requests from my_input_queue until the 'STOP' sentinel
    arrives, pushing each article's results onto my_output_queue.
    Unrankable articles are skipped; any other exception is logged and
    re-raised so the parent can see the failure.
    """
    logging.info("Setting up worker.")
    if my_own_name is not None:
        proctitle.setproctitle(my_own_name)
    workflow = workflow_class(graph_builder_constructor,
                              graph_builder_params,
                              ranker_constructor,
                              ranker_params,
                              eval_parameters,
                              ranking_cutoff,
                              mesh_tree_filename,
                              distance_matrix_filename,
                              distance_function,
                              umls_converter_data_filename)
    if extra_data_name is not None:
        workflow.__setattr__(extra_data_name, extra_data_contents)
    logging.info("Finished setting up worker process. Waiting for requests.")
    try:
        while True:
            request = my_input_queue.get()
            logging.log(ULTRADEBUG, "Processing request %r", request)
            if request == 'STOP':
                logging.log(ULTRADEBUG, "Received stop request.")
                break
            try:
                workflow.process_article(request)
                # Hand the accumulated results back, then reset them so
                # the next request starts from a clean slate.
                my_output_queue.put(workflow.all_results)
                workflow.all_results = {}
            except CouldNotRank:
                logging.info("Skipping unrankable article.")
            except:
                logging.warn("EXCEPTION RAISED: \n%s",
                             traceback.format_exc())
                raise
    finally:
        logging.log(ULTRADEBUG, "Returning results to caller.")
    logging.log(ULTRADEBUG, "Ending processor execution.")
    return
def graph_and_rank(self, item):
    """Turn the item into a graph, then a link matrix, and then rank it.
    Returns the ranked list of nodes.

    Raises CouldNotRank when the matrix is empty or the ranker fails."""
    graph = self.graph_item(item)
    logging.log(ULTRADEBUG, "The item graph is %r.", graph)
    matrix = graph.as_mapped_link_matrix()
    if len(matrix) == 0:
        logging.info("Skipping item %r. It has an empty matrix.", item)
        raise CouldNotRank("Item %r is not rankable." % item)
    try:
        return self._ranker.evaluate(matrix)
    except ValueError:
        # The ranker signals numerical trouble via ValueError; surface
        # it to callers as a ranking failure.
        logging.info("%r returned an exception while ranking %r. "
                     "Skipping.", self._ranker, item)
        raise CouldNotRank("There was an exception while ranking %r."
                           % item)
def processor(workflow_class, graph_builder_constructor, graph_builder_params,
              ranker_constructor, ranker_params, eval_parameters,
              ranking_cutoff, mesh_tree_filename, distance_matrix_filename,
              distance_function, umls_converter_data_filename,
              extra_data_name, extra_data_contents, my_input_queue,
              my_output_queue, my_own_name=None):
    """Worker-process entry point.

    Builds a workflow from the supplied constructors/parameters, then
    services requests from my_input_queue until the 'STOP' sentinel
    arrives, pushing each article's results onto my_output_queue.
    Unrankable articles are skipped; any other exception is logged and
    re-raised so the parent can see the failure.
    """
    logging.info("Setting up worker.")
    if my_own_name is not None:
        proctitle.setproctitle(my_own_name)
    workflow = workflow_class(graph_builder_constructor,
                              graph_builder_params,
                              ranker_constructor,
                              ranker_params,
                              eval_parameters,
                              ranking_cutoff,
                              mesh_tree_filename,
                              distance_matrix_filename,
                              distance_function,
                              umls_converter_data_filename)
    if extra_data_name is not None:
        workflow.__setattr__(extra_data_name, extra_data_contents)
    logging.info("Finished setting up worker process. Waiting for requests.")
    try:
        while True:
            request = my_input_queue.get()
            logging.log(ULTRADEBUG, "Processing request %r", request)
            if request == 'STOP':
                logging.log(ULTRADEBUG, "Received stop request.")
                break
            try:
                workflow.process_article(request)
                # Hand the accumulated results back, then reset them so
                # the next request starts from a clean slate.
                my_output_queue.put(workflow.all_results)
                workflow.all_results = {}
            except CouldNotRank:
                logging.info("Skipping unrankable article.")
            except:
                logging.warn("EXCEPTION RAISED: \n%s",
                             traceback.format_exc())
                raise
    finally:
        logging.log(ULTRADEBUG, "Returning results to caller.")
    logging.log(ULTRADEBUG, "Ending processor execution.")
    return
def _post_process_graph(self, built_graph):
    """Post-processes the graph. The default implementation consolidates
    it and adds orphan nodes to the graph, consolidating it again."""
    built_graph.consolidate_graph()
    # Make sure the scores aren't recycled accidentally.
    self._tf_idf_scores = None
    if self._add_orphan_nodes:
        links = built_graph.relationships
        linked_nodes = set(link.node1 for link in links)
        linked_nodes.update(link.node2 for link in links)
        orphans = [node for node in self._node_cache
                   if node not in linked_nodes]
        for orphan in orphans:
            # A self-link keeps the orphan present in the link matrix.
            built_graph.add_relationship(
                AdirectionalLink(orphan, orphan, orphan.weight))
        built_graph.consolidate_graph()
        logging.log(ULTRADEBUG, "Added %d orphan nodes", len(orphans))
    self._node_cache = set([])
    return built_graph