Example #1
0
 def __init__(self, fileobject=None, transform_function=None):
     """Read the matrix header from fileobject and precompute the
     byte-to-value transformation table."""
     # Fall back on the matrix that ships with the package.
     if fileobject is None:
         fileobject = open(_DEFAULT_MATRIX_NAME, "rb")
     # The file starts with a big-endian header holding the matrix
     # dimensions as two unsigned shorts.
     self.header_size = struct.calcsize(">HH")
     header = fileobject.read(self.header_size)
     self._height, self._width = struct.unpack('>HH', header)
     logging.debug("We're reading a %dx%d matrix", self._height,
                   self._width)
     # Hold on to the file; matrix rows are read from it later.
     self._matrix_file = fileobject
     #self._matrix_file_handle=self._matrix_file.fileno()
     # Map every possible byte value to its transformed result.
     self.transform = [0.0] * 256
     logging.log(ULTRADEBUG, "Building the transformation array.")
     for byte_value in xrange(255):
         self.transform[byte_value] = transform_function(byte_value)
     # Entry 255 is deliberately left at 0.0.
     logging.debug("The transformation array is %s.", str(self.transform))
     # Cache the size of a single byte for later reads.
     self.byte_size = struct.calcsize('<B')
     # Row cache: -1 means "nothing cached yet".
     self._cached_row = -1
     self._row_cache = None
Example #2
0
 def __init__(self, fileobject=None, transform_function=None):
     """Read the matrix header from fileobject and precompute the
     transformation table mapping byte values to results.

     fileobject -- binary file containing the matrix (defaults to the
                   matrix installed with the package)
     transform_function -- callable mapping an int in 0..254 to a value
     """
     # The default matrix is installed together with the package
     if fileobject is None:
         fileobject = open(_DEFAULT_MATRIX_NAME, "rb")
     # The matrix file has a header that describes the size of the
     # matrix in the first bytes
     self.header_size = struct.calcsize(">HH")
     # Read the file header and get the height and width of the matrix
     self._height, self._width = struct.unpack(
         '>HH', fileobject.read(self.header_size))
     logging.debug("We're reading a %dx%d matrix", self._height,
                   self._width)
     # Keep a link to the file. We'll need it.
     self._matrix_file = fileobject
     #self._matrix_file_handle=self._matrix_file.fileno()
     # This will store the mappings from bytes in the matrix to actual
     # results
     self.transform = [0.0] * 256
     logging.log(ULTRADEBUG, "Building the transformation array.")
     # Only indices 0..254 are transformed; index 255 keeps its 0.0.
     for i in xrange(255):
         self.transform[i] = transform_function(i)
     # We leave the last value blank; it's always 0.
     logging.debug("The transformation array is %s.", str(self.transform))
     # Save the size of a byte for later
     self.byte_size = struct.calcsize('<B')
     # Row cache: -1 means no row has been read yet.
     self._cached_row = -1
     self._row_cache = None
Example #3
0
 def _evaluate(self, gold_standard, seen_terms):
     """Compute SAVCC between the gold standard and the seen terms."""
     logging.debug('Gold standard=%s Seen terms=%s alpha=%1.5f',
                   gold_standard, seen_terms, self._alpha)
     gold_vector = self._my_tree.term_vector(gold_standard)
     observed_vector = self._my_tree.term_vector(seen_terms)
     # Build [(alpha*I2)+(1-alpha x M)I2] in two named pieces.
     direct_part = observed_vector.scale(self._alpha)
     spread_part = self._my_matrix.mult_by_vector(
         observed_vector).scale(1 - self._alpha)
     modified = direct_part + spread_part
     logging.log(ULTRADEBUG, "Modified term=%r", modified)
     # Cosine-style score: (I1 . modified) / (|I1| * |modified|)
     top = gold_vector.dot(modified)
     bottom = gold_vector.length() * modified.length()
     try:
         score = top / bottom
     except ZeroDivisionError:
         logging.warn("ZeroDivisionError when computing SAVCC for %r and %r:",
                      gold_standard, seen_terms)
         score = 0
     logging.log(ULTRADEBUG, "Numerator=%1.7f Denominator=%1.7f Result=%1.7f",
                 top, bottom, score)
     return score
Example #4
0
    def _evaluate(self, term_list_1, term_list_2):
        "Compute Hooper's IC between two lists of terms."
        logging.debug(
            """Evaluating Hooper's IC with term_list_1=%s and
        term_list_2=%s""",
            term_list_1,
            term_list_2,
        )
        terms_in_common = []
        length_list_1 = len(term_list_1)
        # Keep this number separate, because it will change!
        original_len_list_1 = length_list_1
        length_list_2 = len(term_list_2)
        for i in xrange(original_len_list_1):
            term = term_list_1[i]
            if term in term_list_2:
                terms_in_common.append(term)
                length_list_1 = length_list_1 - 1
                length_list_2 = length_list_2 - 1
        # Compute A and return the result
        common = len(terms_in_common)
        logging.debug("No terms in common, returning 0")
        if common == 0:
            return 0.0

        return float(common) / (common + length_list_1 + length_list_2)
Example #5
0
 def _consolidate_if_necessary(self):
     """Consolidates the graph if it's necessary, ignores the call 
     otherwise"""
     if len(self._temp_relationships) > 0:
         logging.debug("The graph needs to be consolidated. There are "
                       "relationships in temp storage. Consolidating now.")
         self.consolidate_graph()
Example #6
0
 def _evaluate(self, gold_standard, seen_terms):
     """Compute SAVCC between two sets of terms.

     Both term sets are turned into vectors via the MeSH tree; the seen
     vector is blended with its matrix-spread version using alpha, and
     the score is a cosine-style ratio against the gold standard.
     Returns 0 when either vector has zero length.
     """
     logging.debug('Gold standard=%s Seen terms=%s alpha=%1.5f',
                   gold_standard, seen_terms, self._alpha)
     gold_standard_vector = self._my_tree.term_vector(gold_standard)
     seen_vector = self._my_tree.term_vector(seen_terms)
     # This computes [(alpha*I2)+(1-alpha x M)I2]
     modified_term=seen_vector.scale(self._alpha)+\
         self._my_matrix.mult_by_vector(seen_vector).scale(1-self._alpha)
     logging.log(ULTRADEBUG, "Modified term=%r", modified_term)
     # I1 * modified_term
     numerator = gold_standard_vector.dot(modified_term)
     # Denominator of the whole thing
     denominator = gold_standard_vector.length() * modified_term.length()
     try:
         result = numerator / denominator
     except ZeroDivisionError:
         # An empty gold standard or an all-zero modified vector makes
         # the denominator 0; treat that as "no similarity".
         logging.warn(
             "ZeroDivisionError when computing SAVCC for %r and %r:",
             gold_standard, seen_terms)
         result = 0
     logging.log(ULTRADEBUG,
                 "Numerator=%1.7f Denominator=%1.7f Result=%1.7f",
                 numerator, denominator, result)
     return result
 def build_from_mrrel_file_and_stype_table(self, mrrel_table,
                                           semantic_types):
     """Builds a relationship dictionary and stores a semantic type 
     table.

     mrrel_table -- iterable of MRREL lines with original_direction,
                    cui1 and cui2 attributes
     semantic_types -- mapping from CUI to a list of TUIs (stored on
                       self._stypes for later use)
     """
     # Running count of processed lines; used only for progress logging.
     count=0
     self._stypes=semantic_types
     for l in mrrel_table:
         if l.original_direction:
             if l.cui1==l.cui2: # Relationships of a concept to itself are
                                # not interesting to us.
                 continue
             # n^2 computation FTW!
             # Credit every (type-of-cui1, type-of-cui2) combination.
             tuis1=semantic_types.get(l.cui1, [])
             tuis2=semantic_types.get(l.cui2, [])
             for t1 in tuis1:
                 for t2 in tuis2:
                     try:
                         self[(t1, t2)]+=1
                     except KeyError:
                         self[(t1, t2)]=1
                     # NOTE(review): the reverse pair is decremented but
                     # initialized to 1 (not -1) when first seen — this
                     # looks asymmetric with the increment branch above;
                     # confirm it is intentional.
                     try:
                         self[(t2, t1)]-=1
                     except KeyError:
                         self[(t2, t1)]=1
         count+=1
         if (count % 1000)==0: logging.debug("%d lines processed", count)
Example #8
0
 def _consolidate_if_necessary(self):
     """Consolidates the graph if it's necessary, ignores the call 
     otherwise"""
     if len(self._temp_relationships)>0:
         logging.debug("The graph needs to be consolidated. There are "
                       "relationships in temp storage. Consolidating now.")
         self.consolidate_graph()
Example #9
0
 def __init__(self, parameters):
     """Build a SAVCC evaluator from a parameter bundle."""
     Evaluation.__init__(self, parameters)
     # Cache the pieces the evaluation itself needs.
     self._my_tree = self._parameters.mesh_tree
     self._my_matrix = self._parameters.savcc_matrix
     self._alpha = self._parameters.alpha
     logging.debug("Creating SAVCC evaluator. Tree %r Alpha %1.5f "
                   "Matrix %r", self._my_tree, self._alpha,
                   self._my_matrix)
Example #10
0
 def __init__(self, parameters):
     """Initialize the SAVCC evaluator.

     parameters -- bundle providing savcc_matrix, alpha and mesh_tree
     (assumes Evaluation.__init__ stores it as self._parameters — it is
     read back from there below).
     """
     Evaluation.__init__(self, parameters)
     self._my_matrix = self._parameters.savcc_matrix
     self._alpha = self._parameters.alpha
     self._my_tree = self._parameters.mesh_tree
     logging.debug(
         "Creating SAVCC evaluator. Tree %r Alpha %1.5f "
         "Matrix %r", self._my_tree, self._alpha, self._my_matrix)
Example #11
0
def predications_name_and_path(pubmed_id, path):
    """Return the full pathname of the pickled predication file for
    pubmed_id under path, creating its directory if necessary."""
    target = os.path.join(path, "%s.pickle.bz2" %
                          predication_filename(pubmed_id))
    parent = os.path.dirname(target)
    # Create the nested directory the first time an ID prefix is seen.
    if not os.access(parent, os.F_OK):
        os.makedirs(parent)
    logging.debug("Fully-specced pathname for %r=%s", pubmed_id, target)
    return target
Example #12
0
def predications_name_and_path(pubmed_id, path):
    """Return the full pathname of pubmed_id's pickled predications.

    Builds '<path>/<nested name>.pickle.bz2' and creates the containing
    directory when it does not exist yet.
    """
    filename = os.path.join(path,
                            "%s.pickle.bz2" % predication_filename(pubmed_id))
    filedir = os.path.dirname(filename)
    # os.access(F_OK) is an existence check; makedirs runs only on the
    # first request for this ID prefix.
    if not os.access(filedir, os.F_OK):
        os.makedirs(filedir)
    logging.debug("Fully-specced pathname for %r=%s", pubmed_id, filename)
    return filename
Example #13
0
 def run(self):
     """Perform the evaluation.

     Iterates over every article from the reader, ranks it, converts
     the ranking to MeSH terms, and evaluates the result against the
     article's MEDLINE gold standard (full headings and major headings
     separately). Results are accumulated per article set_id and
     written out at the end.
     """
     logging.info("Starting workflow %r run", self)
     all_results={}
     evaluator=self.create_evaluator()
     count=0
     for each_article in self._reader:
         count+=1
         logging.info("Working on article %d: %r", count, each_article)
         if not self.include_article(each_article):
             logging.log(ULTRADEBUG, "Skipping article %r due to exclusion "
                           " criteria.", each_article)
             continue
         # Articles that cannot be ranked are silently skipped.
         try:
             ranked_article=self.graph_and_rank(each_article)
         except CouldNotRank:
             continue
         converted_terms=self.convert(ranked_article)
         # Keep only terms scoring at or above the configured cutoff.
         cut_terms=converted_terms.terms_higher_than_or_equal_to(
                             self._ranking_cutoff)
         # NOTE(review): the [-1]/[0] indexing below raises IndexError
         # when cut_terms is empty — confirm that cannot happen here.
         logging.debug("Lowest-ranking term is term #%d out of %d"
                       " (score=%1.5f, highest score=%1.5f)",
                       len(cut_terms), len(converted_terms),
                       [x[1] for x in cut_terms][-1],
                       [x[1] for x in cut_terms][0])
         medline_record_mesh_terms=ExpressionList().from_medline(
                 each_article.set_id.article_record().mesh_headings)
         flat_medline=medline_record_mesh_terms.flatten()
         flattened_terms=self.flatten_generated_terms(flat_medline,
                         cut_terms)
         flattened_terms=self.limit_length(flat_medline, flattened_terms)
         if len(flat_medline)==0:
             logging.warn("No gold standard available for article %r. "
                          "Omitting it from the result set.", each_article)
             continue
         # Evaluate against the full gold standard...
         eval_result=self.perform_evaluation(each_article,
                                             evaluator,
                                             flat_medline,
                                             flattened_terms)
         # ...and separately against the major headings only.
         flattened_major_headings=\
             medline_record_mesh_terms.major_headings()
         logging.debug("Original headings: %r Major headings: %r", 
                         medline_record_mesh_terms,
                         flattened_major_headings)
         mh_result_temp=self.perform_evaluation(each_article, evaluator,
                                                flattened_major_headings,
                                                flattened_terms)
         mh_result=NamedResultSet("mh_", mh_result_temp)
         # Compute the total recall, too
         total_recall=self.compute_total_recall(flat_medline, 
                                                converted_terms)
         eval_result.add(total_recall)
         # Unify the result sets
         all_results[each_article.set_id]=eval_result | mh_result
     logging.info("Writing out results.")
     self.output(all_results)
     self.output_metadata()
     return
Example #14
0
 def run(self):
     """Perform the evaluation.

     For each article from the reader: rank it, convert the ranking to
     terms, cut at the ranking threshold, and evaluate against the
     MEDLINE gold standard (both full and major headings). Skips
     articles that are excluded, unrankable, or lack a gold standard;
     writes the accumulated results at the end.
     """
     logging.info("Starting workflow %r run", self)
     all_results = {}
     evaluator = self.create_evaluator()
     count = 0
     for each_article in self._reader:
         count += 1
         logging.info("Working on article %d: %r", count, each_article)
         if not self.include_article(each_article):
             logging.log(
                 ULTRADEBUG, "Skipping article %r due to exclusion "
                 " criteria.", each_article)
             continue
         # Unrankable articles are skipped without being recorded.
         try:
             ranked_article = self.graph_and_rank(each_article)
         except CouldNotRank:
             continue
         converted_terms = self.convert(ranked_article)
         cut_terms = converted_terms.terms_higher_than_or_equal_to(
             self._ranking_cutoff)
         # NOTE(review): the [-1]/[0] indexing below raises IndexError
         # when cut_terms is empty — confirm that cannot happen here.
         logging.debug(
             "Lowest-ranking term is term #%d out of %d"
             " (score=%1.5f, highest score=%1.5f)", len(cut_terms),
             len(converted_terms), [x[1] for x in cut_terms][-1],
             [x[1] for x in cut_terms][0])
         medline_record_mesh_terms = ExpressionList().from_medline(
             each_article.set_id.article_record().mesh_headings)
         flat_medline = medline_record_mesh_terms.flatten()
         flattened_terms = self.flatten_generated_terms(
             flat_medline, cut_terms)
         flattened_terms = self.limit_length(flat_medline, flattened_terms)
         if len(flat_medline) == 0:
             logging.warn(
                 "No gold standard available for article %r. "
                 "Omitting it from the result set.", each_article)
             continue
         # Evaluate against the full gold standard...
         eval_result = self.perform_evaluation(each_article, evaluator,
                                               flat_medline,
                                               flattened_terms)
         # ...and separately against the major headings only.
         flattened_major_headings=\
             medline_record_mesh_terms.major_headings()
         logging.debug("Original headings: %r Major headings: %r",
                       medline_record_mesh_terms, flattened_major_headings)
         mh_result_temp = self.perform_evaluation(each_article, evaluator,
                                                  flattened_major_headings,
                                                  flattened_terms)
         mh_result = NamedResultSet("mh_", mh_result_temp)
         # Compute the total recall, too
         total_recall = self.compute_total_recall(flat_medline,
                                                  converted_terms)
         eval_result.add(total_recall)
         # Unify the result sets
         all_results[each_article.set_id] = eval_result | mh_result
     logging.info("Writing out results.")
     self.output(all_results)
     self.output_metadata()
     return
Example #15
0
    def _evaluate(self, gold_standard, seen_terms):
        "Compute precision."
        logging.debug("Computing precision between gold standard %r and "
                      "term list %r", gold_standard, seen_terms)
        correct_terms=[x for x in seen_terms if x in gold_standard]
        size_seen_terms=len(seen_terms)
        size_correct_terms=len(correct_terms)

        return float(size_correct_terms)/size_seen_terms
Example #16
0
File: f2.py Project: YZWD/MEDRank
 def _evaluate(self, gold_standard, seen_terms):
     """Compute the actual F2 score from recall and precision."""
     logging.debug("Computing F2 between gold standard %r and "
                   "term list %r", gold_standard, seen_terms)
     recall_score = Recall().evaluate(gold_standard, seen_terms).result
     precision_score = Precision().evaluate(gold_standard,
                                            seen_terms).result
     # With both components at zero the F2 denominator would vanish.
     if recall_score == 0.0 and precision_score == 0.0:
         return 0.0
     return (5.0 * recall_score * precision_score) / \
         (4.0 * precision_score + recall_score)
Example #17
0
    def _evaluate(self, gold_standard, seen_terms):
        "Compute the actual recall."
        logging.debug("Computing recall between gold standard %r and "
                      "term list %r", gold_standard, seen_terms)
        correct_terms=[x for x in seen_terms if x in gold_standard]
        size_gold_standard=len(gold_standard)
        size_correct_terms=len(correct_terms)

        return float(size_correct_terms)/size_gold_standard
Example #18
0
def predication_filename(pubmed_id):
    """Map a PubMed ID to a nested relative filename.

    Every digit except the last two becomes one '_<digit>' directory
    level; the final two digits form the basename, spreading the files
    across a directory tree.
    """
    digits = list(str(pubmed_id))
    prefix_digits, last_two = digits[:-2], digits[-2:]
    directories = tuple('_%s' % d for d in prefix_digits)
    if directories == ():
        # One- and two-digit IDs live at the root of the tree.
        fn = ''.join(last_two)
    else:
        fn = os.path.join(os.path.join(*directories), ''.join(last_two))
    logging.debug("Predication filename for %r=%s", pubmed_id, fn)
    return fn
Example #19
0
def predication_filename(pubmed_id):
    """Return the relative path of pubmed_id's predication file.

    All digits except the last two become one '_<digit>' directory
    level each; the last two digits form the basename, so files are
    spread over a directory tree instead of one flat directory.
    """
    p = list(str(pubmed_id))
    # Split off the final two digits; the rest become directories.
    p, last_two = p[:-2], p[-2:]
    ptuple = tuple(['_%s' % x for x in p])
    if ptuple == ():
        # One- and two-digit IDs have no directory component.
        fn = ''.join(last_two)
    else:
        fn = os.path.join(os.path.join(*ptuple), ''.join(last_two))
    logging.debug("Predication filename for %r=%s", pubmed_id, fn)
    return fn
Example #20
0
 def _init_inverse_lookup(self):
     """Sets up the internal data store to perform reverse lookups."""
     logging.debug("First request of a reverse lookup. Building the " \
                   "inverse lookup dictionary.")
     # Map every position covered by a tree entry back to its key.
     inverse = {}
     for key, entry in self._tree.iteritems():
         for position in entry.position:
             inverse[position] = key
     self._invlookup = inverse
     logging.log(ULTRADEBUG, "Done building inverse lookup dictionary.")
     return
Example #21
0
    def _evaluate(self, gold_standard, seen_terms):
        "Compute the actual recall."
        logging.debug(
            "Computing recall between gold standard %r and "
            "term list %r", gold_standard, seen_terms)
        correct_terms = [x for x in seen_terms if x in gold_standard]
        size_gold_standard = len(gold_standard)
        size_correct_terms = len(correct_terms)

        return float(size_correct_terms) / size_gold_standard
Example #22
0
 def _evaluate(self, gold_standard, seen_terms):
     """Compute the actual F2 (recall-weighted F-measure)."""
     logging.debug(
         "Computing F2 between gold standard %r and "
         "term list %r", gold_standard, seen_terms)
     r = Recall().evaluate(gold_standard, seen_terms).result
     p = Precision().evaluate(gold_standard, seen_terms).result
     # Guard the degenerate case where the denominator would be zero.
     no_signal = r == 0.0 and p == 0.0
     return 0.0 if no_signal else (5.0 * r * p) / (4.0 * p + r)
Example #23
0
File: tree.py Project: YZWD/MEDRank
 def _init_inverse_lookup(self):
     """Sets up the internal data store to perform reverse lookups.

     Built lazily on the first reverse lookup: maps each position back
     to the tree key whose entry covers it.
     """
     logging.debug("First request of a reverse lookup. Building the " \
                   "inverse lookup dictionary.")
     self._invlookup={}
     for k, items in self._tree.iteritems():
         # Every position covered by this entry maps back to its key.
         # NOTE(review): later duplicates overwrite earlier ones —
         # confirm positions are unique across entries.
         for item in items.position:
             self._invlookup[item]=k
     logging.log(ULTRADEBUG, "Done building inverse lookup dictionary.")
     return
 def _create_graph(self, list_of_lines):
     """Build a graph by generating a relationship for every pair of
     co-occurring nodes. We take advantage of the fact that (for our
     purposes) all lines with the same line_id in METAMAP output come from
     the same sentence."""
     new_graph = self._type_of_graph_to_build()
     logging.debug("Retrieving semantic predications for %r",
                   self._line_set_id)
     try:
         predications = get_predications(self._line_set_id)
     except Exception:
         # BUG FIX: was a bare 'except:', which also swallowed
         # SystemExit and KeyboardInterrupt. Best-effort behavior is
         # kept: log the failure and return the empty graph.
         logging.warn(
             "No predications for %r: an exception was raised.\n%s",
             self._line_set_id, traceback.format_exc())
         return new_graph
     logging.log(ULTRADEBUG,
                 "Building a METAMAP co-occurrence graph from %r",
                 list_of_lines)
     for sentence in self.sentence_iterator(list_of_lines):
         # Each "sentence" contains a set of potential nodes that need
         # screening.
         nodes = []
         for concept in sentence:
             new_node = self._node_factory(concept.CUI, concept.description,
                                           concept.confidence, concept.line)
             if self.include_node(new_node):
                 #nodes.append((concept.CUI, concept.confidence))
                 nodes.append(new_node)
             else:
                 logging.log(ULTRADEBUG, "%r excluded from the graph.",
                             new_node)
         # Once we have all the nodes in a sentence, we generate all
         # possible combinations (O(n^2)), yes it's ugly.
         for i in xrange(len(nodes)):
             # Since relationships are not directional we can skip half
             # of the generation (i.e. if we have i<-->j we don't need
             # j<-->i
             for j in xrange(i + 1, len(nodes)):
                 node1, node2 = nodes[i], nodes[j]
                 #new_link=AdirectionalLink(node1[0], node2[0],
                 #                          (node1[1]+node2[1])/2.0)
                 # Only node pairs backed by a predication become links.
                 try:
                     this_link = predications[(node1, node2)]
                 except KeyError:
                     continue
                 new_link = self._adirectional_link_factory(
                     node1, node2, this_link.weight)
                 if self.include_link(new_link):
                     new_graph.add_relationship(new_link)
                 else:
                     logging.log(ULTRADEBUG,
                                 "Excluding link %r from the graph",
                                 new_link)
     return new_graph
 def _create_graph(self, list_of_lines):
     """Build a graph by generating a relationship for every pair of
     co-occurring nodes. We take advantage of the fact that (for our
     purposes) all lines with the same line_id in METAMAP output come from
     the same sentence."""
     new_graph=self._type_of_graph_to_build()
     logging.debug("Retrieving semantic predications for %r", 
                   self._line_set_id)
     try:
         predications=get_predications(self._line_set_id)
     # NOTE(review): bare 'except:' also swallows SystemExit and
     # KeyboardInterrupt; consider narrowing to Exception.
     except:
         logging.warn("No predications for %r: an exception was raised.\n%s",
                      self._line_set_id, traceback.format_exc())
         return new_graph
     logging.log(ULTRADEBUG, 
                 "Building a METAMAP co-occurrence graph from %r",
                 list_of_lines)
     for sentence in self.sentence_iterator(list_of_lines):
         # Each "sentence" contains a set of potential nodes that need
         # screening.
         nodes=[]
         for concept in sentence:
             new_node=self._node_factory(concept.CUI, 
                                         concept.description, 
                                         concept.confidence,
                                         concept.line)
             if self.include_node(new_node):
                 #nodes.append((concept.CUI, concept.confidence))
                 nodes.append(new_node)
             else:
                 logging.log(ULTRADEBUG, "%r excluded from the graph.", new_node)
         # Once we have all the nodes in a sentence, we generate all
         # possible combinations (O(n^2)), yes it's ugly.
         for i in xrange(len(nodes)):
             # Since relationships are not directional we can skip half
             # of the generation (i.e. if we have i<-->j we don't need
             # j<-->i
             for j in xrange(i+1, len(nodes)):
                 node1, node2=nodes[i],nodes[j] 
                 #new_link=AdirectionalLink(node1[0], node2[0], 
                 #                          (node1[1]+node2[1])/2.0)
                 # Only node pairs backed by a predication become links.
                 try:
                     this_link=predications[(node1, node2)]
                 except KeyError:
                     continue
                 new_link=self._adirectional_link_factory(node1, node2,
                                        this_link.weight)
                 if self.include_link(new_link):
                     new_graph.add_relationship(new_link)
                 else:
                     logging.log(ULTRADEBUG, "Excluding link %r from the graph",
                                   new_link)
     return new_graph
Example #26
0
 def __eq__(self, other):
     """SLOW element-by-element comparison (O(n^2) lookups)."""
     if len(self) != len(other):
         return False
     size = len(self)
     for row in xrange(size):
         for col in xrange(size):
             if self[row, col] != other[row, col]:
                 logging.debug("FAIL @%d, %d. Self=%r, other=%r",
                               row, col, self[row, col], other[row, col])
                 return False
     return True
Example #27
0
 def __init__(self, tree, rule_data=None, skip_unknown_concepts=True,
              accepted_types=set(['a', 'i'])):
     logging.debug("Creating Converter with tree %r", tree)
     self._tree=tree
     if rule_data is None:
         rule_data=pickle.load(open(_DEFAULT_CONVERTER_DATA, "rb"))
         logging.info("Using converter data from %r", 
                      _DEFAULT_CONVERTER_DATA)
     self._data=rule_data
     self._extra_checktags=set() 
     self._skip_unknown=skip_unknown_concepts
     self._accepted_types=accepted_types
Example #28
0
 def process_article(self, each_article):
     """Rank one article, convert the ranking to MeSH terms, and store
     the evaluation against the article's MEDLINE gold standard (full
     headings and major headings) in self.all_results.

     Returns early (recording nothing) when the article is excluded,
     cannot be ranked, has no retrievable record, or has no gold
     standard.
     """
     if not self.include_article(each_article):
         logging.log(ULTRADEBUG, "Skipping article %r due to exclusion "
                       " criteria.", each_article)
         return
     try:
         ranked_article=self.graph_and_rank(each_article)
     except CouldNotRank:
         return
     logging.debug("Ranked article: %r", ranked_article)
     converted_terms=self.convert(ranked_article)
     logging.debug("Converted terms: %r", converted_terms)
     # Keep only terms scoring at or above the configured cutoff.
     cut_terms=converted_terms.terms_higher_than_or_equal_to(
                         self._ranking_cutoff)
     logging.debug("Cut terms: %r", cut_terms)
     # NOTE(review): bare 'except:' also catches SystemExit and
     # KeyboardInterrupt; consider narrowing to Exception.
     try:
         medline_record_mesh_terms=ExpressionList().from_medline(
                 each_article.set_id.article_record()['MH'])
     except:
         logging.warn("Could not obtain an article record for %r. "
                      "Skipping.", each_article)
         return
     flat_medline=medline_record_mesh_terms.flatten()
     flattened_terms=self.flatten_generated_terms(flat_medline,
                     cut_terms)
     flattened_terms=self.limit_length(flat_medline, flattened_terms)
     if len(flat_medline)==0:
         logging.warn("No gold standard available for article %r. "
                      "Omitting it from the result set.", each_article)
         return
     # Evaluate against the full gold standard...
     eval_result=self.perform_evaluation(each_article,
                                         self.evaluator,
                                         flat_medline,
                                         flattened_terms)
     # ...and separately against the major headings only.
     flattened_major_headings=\
         medline_record_mesh_terms.major_headings()
     #logging.debug("Original headings: %r Major headings: %r", 
     #                medline_record_mesh_terms,
     #                flattened_major_headings)
     logging.debug("Flattened MeSH terms: %r", flat_medline)
     logging.debug("Flattened generated terms: %r", flattened_terms)
     mh_result_temp=self.perform_evaluation(each_article, self.evaluator,
                                            flattened_major_headings,
                                            flattened_terms)
     mh_result=NamedResultSet("major_", mh_result_temp)
     # Compute the total recall, too
     total_recall=self.compute_total_recall(flat_medline, 
                                            converted_terms)
     eval_result.add(total_recall)
     # Unify the result sets
     self.all_results[each_article.set_id]=eval_result | mh_result
     return
Example #29
0
 def iter_concepts(self):
     """Iterates through the concepts (only one per position) so that
     they can be extracted in order. We will get the first concept that
     covers each positional 'slot' in the original. """
     # (An older regex-based slot-filling implementation used to live
     # here as commented-out code; it was removed for readability.)
     try:
         all_mappings = mappings.parseString(self.line)[0]
     except:
         logging.warn("FAIL parsing %s", self.line)
         raise
     if len(all_mappings) == 0:
         return
     # Keep the highest-scoring mapping; ties go to the earliest one.
     best_expression = all_mappings[0]['Expression'][0]
     best_score = all_mappings[0]['Score']
     for candidate in all_mappings[1:]:
         if candidate['Score'] > best_score:
             best_score = candidate['Score']
             best_expression = candidate['Expression'][0]
     # The EVs are already in positional order.
     for ev in best_expression:
         new_concept = ConceptLine(ev['ConceptID'],
                                   ev['Name'],
                                   int(ev['Score']))
         logging.debug("Emitting %r", new_concept)
         yield new_concept
     return
Example #30
0
 def __init__(self, fileobject, transform_function):
     """Initialize the matrix and attach per-row normalization factors.

     Factors are loaded from a cached file alongside the matrix when
     possible; otherwise they are computed, and saved for next time
     when the matrix source is a real file (not a StringIO).
     """
     SavccMatrix.__init__(self, fileobject, transform_function)
     # Add normalization factors
     logging.log(ULTRADEBUG, "Initializing normalization array")
     # Default behavior: no normalization
     self.normfactors=[1.0]*self._height
     # Tentative normalization array name
     array_filename=self._expected_norm_array_name()
     logging.debug("Trying to load a normalization array from disk. The "
                   "file should be named %s.", array_filename)
     # Make sure that only one process or thread at a time can attempt to get 
     # the normalization factors
     # NOTE(review): a module-level lock only serializes threads within
     # this process, not separate processes — confirm intent.
     _normfactor_lock.acquire()
     try:
         try:
             self._load_normalization_factors(open(array_filename, 'rb'))
             logging.debug('Normalization factors loaded from disk.')
         except IOError:
             # Cache miss: compute the factors from the matrix itself.
             logging.debug("Unable to load normalization factors from disk.")
             self._generate_normalization_factors()
             # Only save normalization factors if they are not a StringIO
             # object
             if not isinstance(fileobject, StringIO.StringIO):
                 logging.debug("Saving normalization factors to %s",
                               array_filename)
                 try:
                     self._save_normalization_factors(open(array_filename,
                                                           'wb'))
                 except IOError:
                     # Best-effort cache: failure just means recomputing
                     # on every startup.
                     logging.warn("Unable to save the normalization array. "
                                  "It will have to be regenerated each "
                                  "time.")
     finally:
         _normfactor_lock.release()
Example #31
0
 def iter_concepts(self):
     """Iterates through the concepts (only one per position) so that
     they can be extracted in order. We will get the first concept that
     covers each positional 'slot' in the original. """
     # (A commented-out regex-based predecessor of this method was
     # removed; it filled positional slots from ev_parser matches.)
     try:
         all_mappings = mappings.parseString(self.line)[0]
     except:
         logging.warn("FAIL parsing %s", self.line)
         raise
     if len(all_mappings) == 0:
         return
     # Scan for the best-scoring mapping; on ties the first one wins.
     winner = all_mappings[0]
     winning_expression = winner['Expression'][0]
     winning_score = winner['Score']
     for mapping in all_mappings[1:]:
         if mapping['Score'] > winning_score:
             winning_score = mapping['Score']
             winning_expression = mapping['Expression'][0]
     # The EVs come back already ordered by position.
     for ev in winning_expression:
         concept = ConceptLine(ev['ConceptID'], ev['Name'],
                               int(ev['Score']))
         logging.debug("Emitting %r", concept)
         yield concept
     return
Example #32
0
 def process_article(self, each_article):
     """Runs the full evaluation pipeline on a single article.

     Graphs and ranks the article, converts the ranking to terms, cuts
     them at the ranking threshold, and evaluates them against the
     article's MEDLINE MeSH terms (both the full set and the major
     headings only, the latter prefixed "major_"). The combined result
     set is stored in self.all_results keyed by the article's set_id.
     Articles that fail exclusion criteria, cannot be ranked, or have no
     retrievable record/gold standard are skipped silently.
     """
     if not self.include_article(each_article):
         logging.log(ULTRADEBUG, "Skipping article %r due to exclusion "
                     " criteria.", each_article)
         return
     try:
         ranked_article = self.graph_and_rank(each_article)
     except CouldNotRank:
         # Ranking is best-effort; unrankable articles are dropped.
         return
     logging.debug("Ranked article: %r", ranked_article)
     converted_terms = self.convert(ranked_article)
     logging.debug("Converted terms: %r", converted_terms)
     cut_terms = converted_terms.terms_higher_than_or_equal_to(
         self._ranking_cutoff)
     logging.debug("Cut terms: %r", cut_terms)
     try:
         medline_record_mesh_terms = ExpressionList().from_medline(
             each_article.set_id.article_record()['MH'])
     except Exception:
         # Narrowed from a bare except: still best-effort, but no longer
         # swallows SystemExit/KeyboardInterrupt.
         logging.warn(
             "Could not obtain an article record for %r. "
             "Skipping.", each_article)
         return
     flat_medline = medline_record_mesh_terms.flatten()
     flattened_terms = self.flatten_generated_terms(flat_medline, cut_terms)
     flattened_terms = self.limit_length(flat_medline, flattened_terms)
     if len(flat_medline) == 0:
         logging.warn(
             "No gold standard available for article %r. "
             "Omitting it from the result set.", each_article)
         return
     eval_result = self.perform_evaluation(each_article, self.evaluator,
                                           flat_medline, flattened_terms)
     flattened_major_headings = \
         medline_record_mesh_terms.major_headings()
     logging.debug("Flattened MeSH terms: %r", flat_medline)
     logging.debug("Flattened generated terms: %r", flattened_terms)
     mh_result_temp = self.perform_evaluation(each_article, self.evaluator,
                                              flattened_major_headings,
                                              flattened_terms)
     mh_result = NamedResultSet("major_", mh_result_temp)
     # Compute the total recall, too
     total_recall = self.compute_total_recall(flat_medline, converted_terms)
     eval_result.add(total_recall)
     # Unify the result sets
     self.all_results[each_article.set_id] = eval_result | mh_result
     return
Example #33
0
File: tree.py Project: YZWD/MEDRank
 def _init_search_dict(self):
     """Sets up the internal data store to perform searches."""
     logging.debug("First request of a search. Building the " \
                   "search dictionary.")
     self._search_dict={}
     for k, items in self._tree.iteritems():
         for synonym in items.synonyms:
             if synonym in self._search_dict:
                 self._search_dict[synonym].append(k)
             else:
                 self._search_dict[synonym]=[k]
         if k in self._search_dict:
             self._search_dict[k].append(k)
         else:
             self._search_dict[k]=[k]
Example #34
0
 def __init__(self,
              tree,
              rule_data=None,
              skip_unknown_concepts=True,
              accepted_types=None):
     """Creates a Converter.

     tree: the term tree used for lookups.
     rule_data: conversion rules; when None, the default pickled rule
         set is loaded from _DEFAULT_CONVERTER_DATA.
     skip_unknown_concepts: when True, unknown concepts are skipped
         instead of raising.
     accepted_types: set of accepted type codes; defaults to
         set(['a', 'i']). (BUGFIX: this used to be a mutable default
         argument shared between every call; it is now built fresh per
         instance, with the same default value.)
     """
     logging.debug("Creating Converter with tree %r", tree)
     self._tree = tree
     if rule_data is None:
         rule_data = pickle.load(open(_DEFAULT_CONVERTER_DATA, "rb"))
         logging.info("Using converter data from %r",
                      _DEFAULT_CONVERTER_DATA)
     self._data = rule_data
     self._extra_checktags = set()
     self._skip_unknown = skip_unknown_concepts
     if accepted_types is None:
         accepted_types = set(['a', 'i'])
     self._accepted_types = accepted_types
Example #35
0
 def _init_search_dict(self):
     """Sets up the internal data store to perform searches."""
     logging.debug("First request of a search. Building the " \
                   "search dictionary.")
     self._search_dict = {}
     for k, items in self._tree.iteritems():
         for synonym in items.synonyms:
             if synonym in self._search_dict:
                 self._search_dict[synonym].append(k)
             else:
                 self._search_dict[synonym] = [k]
         if k in self._search_dict:
             self._search_dict[k].append(k)
         else:
             self._search_dict[k] = [k]
Example #36
0
 def _create_table_if_necessary(self):
     """Creates the key/value table 's' when the database is brand new."""
     self._lock.acquire()
     try:
         try:
             # Probe the table; a missing table raises OperationalError.
             self.__t.execute('select * from s limit 1')
         except sqlite3.OperationalError:
             logging.log(ULTRADEBUG, "Table doesn't exist - must be a new database.")
             self.__t.execute("""create table s 
                                 (pkey TEXT PRIMARY KEY NOT NULL,
                                  data BLOB NOT NULL)""")
             logging.debug("Table created.")
     finally:
         self._lock.release()
     return
Example #37
0
 def convert(self, umls_concept):
     """Converts a UMLS concept, optionally skipping unknown concepts.

     Returns an empty Expression for unknown concepts when
     self._skip_unknown is set; otherwise the lookup error propagates.
     """
     try:
         return self.convert_step_1(umls_concept)
     except NoConceptInfoError:
         if self._skip_unknown:
             logging.debug("There is no detailed info on %r", umls_concept)
             return Expression([])
         raise
     except TermNotInTree:
         if self._skip_unknown:
             logging.debug(
                 "I could not find an equivalence for %r "
                 "in the tree %r", umls_concept, self._tree)
             return Expression([])
         raise
Example #38
0
 def index(self, term):
     """Returns the index of a term in the sorted term list"""
     if self._term_list_as_dict is None:
         # Lazily build the term -> position cache, including synonyms.
         logging.debug("Building MeSH tree index.")
         self._term_list_as_dict = {}
         for position, known_term in enumerate(self.terms):
             self._term_list_as_dict[known_term] = position
             for alias in self[known_term].synonyms:
                 self._term_list_as_dict[alias] = position
     try:
         return self._term_list_as_dict[term]
     except KeyError:
         raise TermNotInTree("Term %s is not a member of tree %r" %
                             (term, self))
Example #39
0
File: tree.py Project: YZWD/MEDRank
 def index(self, term):
     """Returns the index of a term in the sorted term list"""
     if self._term_list_as_dict is None:
         # First call: precompute the position of every term and every
         # synonym, so later lookups are a single dict access.
         logging.debug("Building MeSH tree index.")
         self._term_list_as_dict = {}
         position = 0
         for current in self.terms:
             self._term_list_as_dict[current] = position
             for synonym in self[current].synonyms:
                 self._term_list_as_dict[synonym] = position
             position += 1
     if term not in self._term_list_as_dict:
         raise TermNotInTree("Term %s is not a member of tree %r" %
                             (term, self))
     return self._term_list_as_dict[term]
Example #40
0
 def convert(self, umls_concept):
     """Converts one UMLS concept via convert_step_1.

     Unknown or untreed concepts yield an empty Expression when
     self._skip_unknown is true; otherwise the exception propagates.
     """
     try:
         return self.convert_step_1(umls_concept)
     except NoConceptInfoError:
         if not self._skip_unknown:
             raise
         logging.debug("There is no detailed info on %r", umls_concept)
         return Expression([])
     except TermNotInTree:
         if not self._skip_unknown:
             raise
         logging.debug("I could not find an equivalence for %r "
                       "in the tree %r",
                       umls_concept, self._tree)
         return Expression([])
Example #41
0
 def __init__(self, persistent_file_name, isolation_level="IMMEDIATE", 
              compression=False):
     """Opens (or creates) the SQLite database backing this store.

     persistent_file_name: path to the database file, or None to use a
         throwaway file in the system temp directory.
     isolation_level: passed straight through to sqlite3.connect.
     compression: whether stored values should be compressed.
     """
     # True when the caller supplied a real path (data should persist).
     self.my_persistence=persistent_file_name is not None
     if not self.my_persistence:
         # Not very elegant... NOTE(review): a random integer suffix can
         # collide with an existing file; tempfile.mkstemp would be
         # safer -- confirm nothing depends on this naming scheme.
         self.my_filename=os.path.join(tempfile.gettempdir(),
                                       tempfile.gettempprefix() + 
                                       str(random.randint(0, 19283673)))
         logging.debug('No filename specified - using tempfile %s', 
                       self.my_filename)
     else:
         self.my_filename=os.path.abspath(persistent_file_name)
     self._lock=self._get_lock()
     self.__t=sqlite3.connect(self.my_filename, isolation_level=isolation_level)
     # Have sqlite return plain byte strings instead of unicode.
     self.__t.text_factory=str
     self._commits_enabled=True # Only disable in very specific cases!
     self._create_table_if_necessary()
     self.compressed=compression
Example #42
0
 def evaluate(self, link_matrix):
     """Perform an iterative computation of HITS"""
     logging.debug("Setting up to compute HITS on %r", link_matrix)
     # An empty matrix cannot be ranked at all.
     if len(link_matrix) == 0:
         raise ValueError("Attempting to HITS-rank an empty link matrix.")
     start = time.clock()
     logging.log(ULTRADEBUG, "Normalizing the link matrix.")
     # All computations happen on a normalized copy of the matrix.
     try:
         normatrix = link_matrix.normalize()
     except ZeroDivisionError:
         raise ZeroDivisionError("Aberrant matrix: There are no links.")
     logging.log(ULTRADEBUG, "Iterating for HITS.")
     # Iteration state: convergence measure, round counter, and the two
     # weight vectors (x = authority-side, y = hub-side).
     convergence = 2 * self._e
     rounds = 0
     current_x = [1.0] * len(normatrix)
     current_y = current_x[:]
     start_iter = time.clock()  # Benchmarking
     while (convergence > self._e):
         if rounds > self._max_iter:
             logging.debug(
                 "Reached the iteration limit of %d. Ending the "
                 "HITS computation prematurely.", self._max_iter)
             break
         next_x = self.i_operation(normatrix, current_y)
         next_y = self.o_operation(normatrix, next_x)
         rounds += 1
         next_x = self.normalize_weights(next_x)
         next_y = self.normalize_weights(next_y)
         # Total absolute change over both vectors this round.
         paired = zip(current_x, next_x) + zip(current_y, next_y)
         convergence = sum(abs(old - new) for old, new in paired)
         current_x = next_x
         current_y = next_y
     logging.log(ULTRADEBUG, "Iteration done.")
     finished_iter = time.clock()
     self._latest_stats = RankerStats(rounds, convergence, start,
                                      start_iter, finished_iter)
     return (current_x, current_y)
Example #43
0
 def __init__(self, persistent_file=None, sync_every_transactions=5,
              write_out_every_transactions=100, file_mode="r", 
              cachesize=1048576,
              compression=False):
     """Creates a dict-like store backed by a SQLiteDict.

     persistent_file: path of the backing database; None makes the
         store temporary (and skips unfreeze/integrity checks).
     sync_every_transactions: commit frequency; 0 disables commits on
         the underlying store entirely.
     write_out_every_transactions: how often data is written out.
     file_mode: nominal open mode ("r" by default).
     cachesize: NOTE(review): accepted but never used in this
         constructor -- confirm whether it is consumed elsewhere.
     compression: forwarded to the underlying SQLiteDict.
     """
     logging.debug("Creating database for %r", persistent_file)
     self.my_store=SQLiteDict(persistent_file)
     logging.log(ULTRADEBUG, "Initializing internal state")
     self.my_mode=file_mode
     self.my_file=persistent_file
     self.my_store.compressed=compression
     self._sync_every=sync_every_transactions
     if self._sync_every==0:
         # Batch mode: the caller takes full control of commits.
         self.my_store.commits_enabled=False
     self.write_counter=0 
     self.write_every=write_out_every_transactions
     self.persistent=persistent_file is not None
     #self.my_lock=RLock()
     if self.persistent:
         # Restore previous state and verify it before first use.
         self.unfreeze()
         self.integrity_checking()
Example #44
0
 def build_idf_from_file(self, file_reader, default_score=None):
     """Builds the inverse document frequency table from an article reader.

     Tries a cache file first; on a cache miss it counts, for every CUI,
     the number of articles it appears in, stores log(N / df) + 1.0 per
     CUI (N starts at 1, so it is effectively articles+1 -- preserved
     from the original), and dumps the table to the cache.

     file_reader: iterable of articles; must expose original_file.name.
     default_score: currently unused; kept for interface compatibility.
     """
     tempdict = {}
     logging.info("Building the term frequency dictionary")
     count = 1
     logging.debug("Checking for a cache file, and loading from it.")
     try:
         self.populate_from_cache(
             self.cache_file_name(file_reader.original_file.name))
         logging.info("Loaded from cache. It's not necessary to build.")
         return
     except Exception:
         # A missing/unreadable cache is expected: fall through and
         # rebuild. (Narrowed from a bare except so SystemExit and
         # KeyboardInterrupt are not swallowed.)
         logging.debug("Nope. Proceeding with building the dictionary.")
     for article in file_reader:
         logging.debug(
             "Processing article %r (number %d) for the term"
             " frequency dictionary", article, count)
         if article.set_id.pmid < 0:
             logging.warn("Article with unknown PubMed ID - skipping")
             continue
         count += 1
         tempcounts = {}
         for line in article.lines:
             try:
                 this_cui = line.CUI
             except AttributeError:
                 # Lines without a CUI are ignored.
                 continue
             # Count each CUI at most once per article.
             tempcounts[this_cui] = 1
         # Now have all the CUIs that appeared in the article. Update
         # the total counts.
         for k in tempcounts:
             tempdict[k] = tempdict.get(k, 0) + 1
     logging.debug("Built a dictionary with %d items. Computing IDFs.",
                   len(tempdict))
     for k, v in tempdict.items():
         self[k] = math.log(count / float(v)) + 1.0
     logging.info("Done building the dictionary. Dumping it to a cache "
                  "file.")
     self.dump_to_cache(self.cache_file_name(
         file_reader.original_file.name))
     return
Example #45
0
 def evaluate(self, link_matrix):
     """Perform an iterative computation of HITS"""
     logging.debug("Setting up to compute HITS on %r", link_matrix)
     # Refuse to rank an empty matrix.
     if len(link_matrix) == 0:
         raise ValueError("Attempting to HITS-rank an empty link matrix.")
     start = time.clock()
     logging.log(ULTRADEBUG, "Normalizing the link matrix.")
     # Work on a normalized copy of the link matrix.
     try:
         normatrix = link_matrix.normalize()
     except ZeroDivisionError:
         raise ZeroDivisionError("Aberrant matrix: There are no links.")
     logging.log(ULTRADEBUG, "Iterating for HITS.")
     # Start above the convergence threshold so the loop runs at least once.
     accumulator = 2 * self._e
     iterations = 0
     x_w = [1.0] * len(normatrix)
     y_w = x_w[:]
     start_iter = time.clock()  # Benchmarking
     while (accumulator > self._e):
         if iterations > self._max_iter:
             logging.debug("Reached the iteration limit of %d. Ending the "
                           "HITS computation prematurely.", self._max_iter)
             break
         new_x_w = self.i_operation(normatrix, y_w)
         new_y_w = self.o_operation(normatrix, new_x_w)
         iterations += 1
         new_x_w = self.normalize_weights(new_x_w)
         new_y_w = self.normalize_weights(new_y_w)
         # Accumulate the absolute change across both weight vectors.
         diff_total = 0.0
         for previous, current in zip(x_w, new_x_w) + zip(y_w, new_y_w):
             diff_total += abs(previous - current)
         accumulator = diff_total
         x_w = new_x_w
         y_w = new_y_w
     logging.log(ULTRADEBUG, "Iteration done.")
     finished_iter = time.clock()
     self._latest_stats = RankerStats(iterations, accumulator, start,
                                      start_iter, finished_iter)
     return (x_w, y_w)
def read_lists(the_zip):
    """Reads UMLS checktag boost lists from a zip archive of CSV files.

    Each CSV must contain Row, CUI and Description columns, with Row
    numbered consecutively from 1. Returns a dict mapping the lowercased
    file basename (without extension) to a {cui: description} dict, both
    lowercased.

    Raises ValueError on blank or out-of-sequence rows; any other error
    is logged and re-raised.
    """
    lists = {}
    zf = ZipFile(the_zip)
    files = zf.namelist()
    for each_file in files:
        try:
            listname = os.path.splitext(os.path.basename(each_file))[0].lower()
            logging.debug('Reading UMLS Checktag boost list %s from file %s',
                          listname, each_file)
            this_list = {}
            row_counter = 0
            # To handle boneheadedness on behalf of the CSV readers, we'll
            # use a temporary file
            filedata = zf.read(each_file)
            tmphandle, tmpname = tempfile.mkstemp()
            os.write(tmphandle, filedata)
            os.fsync(tmphandle)
            os.close(tmphandle)
            print("Decompressed %s into %s. Processing it." % (each_file,
                                                               tmpname))
            # 'rU' was deprecated/removed; plain 'r' behaves the same for
            # the usual \n and \r\n line endings.
            fakefile = open(tmpname, 'r')
            try:
                this_reader = DictReader(fakefile)
                for item in this_reader:
                    row_counter += 1
                    if item['Row'].strip() == '':
                        # BUGFIX: the %-arguments were passed to ValueError
                        # instead of being interpolated into the message.
                        raise ValueError(
                            'Blank line in UMLS file %s when '
                            'expecting row %d' % (each_file, row_counter))
                    if int(item['Row']) != row_counter:
                        raise ValueError("Inconsistent UMLS list file %s" %
                                         each_file)
                    this_list[item['CUI'].strip().lower()] = \
                        item['Description'].strip().lower()
            finally:
                # BUGFIX: the original leaked this handle; close it before
                # unlinking the temporary file.
                fakefile.close()
            os.unlink(tmpname)
            lists[listname] = this_list
        except Exception:
            logging.debug("Exception happened while processing file %s",
                          each_file)
            raise
    return lists
Example #47
0
 def build_idf_from_file(self, file_reader, default_score=None):
     """Builds the IDF table from a reader of articles.

     Loads from a cache file when possible; otherwise counts, per CUI,
     the number of articles containing it and stores log(N / df) + 1.0
     for each CUI (N starts at 1, i.e. effectively articles+1, preserved
     from the original), then writes the table to the cache.

     file_reader: iterable of articles exposing original_file.name.
     default_score: currently unused; kept for interface compatibility.
     """
     tempdict = {}
     logging.info("Building the term frequency dictionary")
     count = 1
     logging.debug("Checking for a cache file, and loading from it.")
     try:
         self.populate_from_cache(
             self.cache_file_name(file_reader.original_file.name))
         logging.info("Loaded from cache. It's not necessary to build.")
         return
     except Exception:
         # Cache misses are expected here; rebuild from scratch.
         # (Narrowed from a bare except so SystemExit/KeyboardInterrupt
         # are not swallowed.)
         logging.debug("Nope. Proceeding with building the dictionary.")
     for article in file_reader:
         logging.debug("Processing article %r (number %d) for the term"
                       " frequency dictionary", article, count)
         if article.set_id.pmid < 0:
             logging.warn("Article with unknown PubMed ID - skipping")
             continue
         count += 1
         tempcounts = {}
         for line in article.lines:
             try:
                 this_cui = line.CUI
             except AttributeError:
                 # Lines without a CUI are ignored.
                 continue
             # Each CUI counts at most once per article.
             tempcounts[this_cui] = 1
         # Now have all the CUIs that appeared in the article. Update
         # the total counts.
         for k in tempcounts:
             tempdict[k] = tempdict.get(k, 0) + 1
     logging.debug("Built a dictionary with %d items. Computing IDFs.",
                   len(tempdict))
     for k, v in tempdict.items():
         self[k] = math.log(count / float(v)) + 1.0
     logging.info("Done building the dictionary. Dumping it to a cache "
                  "file.")
     self.dump_to_cache(
             self.cache_file_name(file_reader.original_file.name))
     return
def read_lists(the_zip):
    """Loads UMLS checktag boost lists from a zip of CSV files.

    Every member CSV needs Row, CUI and Description columns, rows
    numbered consecutively from 1. Returns {list_name: {cui:
    description}} with names, CUIs and descriptions lowercased.

    Raises ValueError on blank or out-of-sequence rows; other failures
    are logged and re-raised.
    """
    lists = {}
    zf = ZipFile(the_zip)
    files = zf.namelist()
    for each_file in files:
        try:
            listname = os.path.splitext(os.path.basename(each_file))[0].lower()
            logging.debug('Reading UMLS Checktag boost list %s from file %s',
                          listname,
                          each_file)
            this_list = {}
            row_counter = 0
            # To handle boneheadedness on behalf of the CSV readers, we'll
            # use a temporary file
            filedata = zf.read(each_file)
            tmphandle, tmpname = tempfile.mkstemp()
            os.write(tmphandle, filedata)
            os.fsync(tmphandle)
            os.close(tmphandle)
            print("Decompressed %s into %s. Processing it." % (each_file,
                                                               tmpname))
            # 'rU' was deprecated/removed; 'r' is equivalent for the
            # common \n and \r\n line endings.
            fakefile = open(tmpname, 'r')
            try:
                this_reader = DictReader(fakefile)
                for item in this_reader:
                    row_counter += 1
                    if item['Row'].strip() == '':
                        # BUGFIX: arguments were passed to ValueError
                        # instead of being %-interpolated into the message.
                        raise ValueError('Blank line in UMLS file %s when '
                                         'expecting row %d'
                                         % (each_file, row_counter))
                    if int(item['Row']) != row_counter:
                        raise ValueError("Inconsistent UMLS list file %s" %
                                         each_file)
                    this_list[item['CUI'].strip().lower()] = \
                        item['Description'].strip().lower()
            finally:
                # BUGFIX: close the handle the original version leaked.
                fakefile.close()
            os.unlink(tmpname)
            lists[listname] = this_list
        except Exception:
            logging.debug("Exception happened while processing file %s",
                          each_file)
            raise
    return lists
Example #49
0
    def _evaluate(self, term_list_1, term_list_2):
        "Compute Hooper's IC between two lists of terms."
        logging.debug("""Evaluating Hooper's IC with term_list_1=%s and
        term_list_2=%s""", term_list_1, term_list_2)
        terms_in_common=[]
        length_list_1=len(term_list_1)
        # Keep this number separate, because it will change!
        original_len_list_1=length_list_1
        length_list_2=len(term_list_2)
        for i in xrange(original_len_list_1):
            term=term_list_1[i]
            if term in term_list_2:
                terms_in_common.append(term)
                length_list_1=length_list_1-1
                length_list_2=length_list_2-1
        # Compute A and return the result
        common=len(terms_in_common)
        logging.debug("No terms in common, returning 0")
        if common==0: 
            return 0.0

        return float(common)/(common+length_list_1+length_list_2)
Example #50
0
 def __init__(self,
              persistent_file=None,
              sync_every_transactions=5,
              write_out_every_transactions=100,
              file_mode="r",
              cachesize=1048576,
              compression=False):
     """Builds the dict-like store on top of a SQLiteDict.

     persistent_file: backing database path; None means a temporary
         store (startup unfreeze/integrity checks are skipped).
     sync_every_transactions: commit frequency; 0 turns commits off on
         the underlying store.
     write_out_every_transactions: write-out frequency.
     file_mode: nominal open mode ("r" by default).
     cachesize: NOTE(review): never referenced inside this constructor
         -- confirm whether anything else consumes it.
     compression: forwarded to the underlying SQLiteDict.
     """
     logging.debug("Creating database for %r", persistent_file)
     self.my_store = SQLiteDict(persistent_file)
     logging.log(ULTRADEBUG, "Initializing internal state")
     self.my_mode = file_mode
     self.my_file = persistent_file
     self.my_store.compressed = compression
     self._sync_every = sync_every_transactions
     if self._sync_every == 0:
         # Batch mode: leave commit control entirely to the caller.
         self.my_store.commits_enabled = False
     self.write_counter = 0
     self.write_every = write_out_every_transactions
     self.persistent = persistent_file is not None
     #self.my_lock=RLock()
     if self.persistent:
         # Restore saved state and verify it before first use.
         self.unfreeze()
         self.integrity_checking()
Example #51
0
 def _generate_normalization_factors(self):
     """Computes and stores one normalization factor per matrix row.

     Each factor is the sum of the row's elements; rows adding up to
     zero get a factor of 1.0 so later divisions remain safe.
     """
     import operator
     logging.info("Generating array of normalization factors. This is a "
                  "slow operation. Please wait.")
     for row_index in range(self._height):
         logging.debug("Generating normalization factor for row %d",
                       row_index)
         # Total every element in this row.
         row_values = self._get_row(row_index)
         logging.log(ULTRADEBUG, "Row %d contains: %s", row_index,
                     row_values)
         row_total = reduce(operator.add, row_values)
         if row_total == 0.0:
             logging.info("Row %d in the matrix adds up to 0. This may "
             "be a problem, depending on your evaluation function. Since "
             "this is a normalization calculation, it will be replaced by "
             "1.", row_index)
             row_total = 1.0
         self.normfactors[row_index] = row_total
         logging.log(ULTRADEBUG, "Normalization factor for row %d=%1.5f",
                     row_index, row_total)
     logging.info("Normalization factor generation done.")
Example #52
0
 def __init__(self, original_line):
     """Parses one line of Metamap output into a MetamapLine.

     original_line: the raw text line; field 0 must hold the line id.

     Raises:
         CUINotFoundError: when field 4 (the CUI) is missing or empty.
         ParsingError: when description/source/semtype are missing.
         NoConfidenceError: when the confidence field cannot be parsed.
     """
     Line.__init__(self, original_line, id_position=0)
     try:
         self._cui = self.split_line[4]
     except IndexError:
         raise CUINotFoundError("There was no CUI in the line '%s'" % 
                                self._line)
     if self._cui == '':
         raise CUINotFoundError("There was no CUI in the line '%s'" % 
                                self._line)
     try:
         self._description = self.split_line[3]
         self._source = self.split_line[6]
         self._semtype = self.split_line[5]
     except IndexError:
         raise ParsingError("Data missing from line '%s'" % self._line)
     # Some entities have no stated confidence. We use 0 in such cases,
     # so they can be eliminated from the workflow later.
     try:
         self.confidence = float(self.split_line[2]) / 1000.0
     except ValueError:
         raise NoConfidenceError("Could not parse a confidence value in "
                                 "line '%s'" % self._line)
     try:
         locations = self.split_line[8].split(',')
     except Exception:
         # Narrowed from a bare except: a missing locations field is
         # still tolerated and replaced with an empty location.
         logging.debug("Could not find a location in line %s", self._line)
         locations = [""]
     locations = [x.split(':') for x in locations]
     locations.sort()
     # Use the first appearance of a term as its location
     self._location = locations[0]
     logging.log(ULTRADEBUG, "Created a MetamapLine @ %d: %s (%s) %1.3f", 
                 self.line_id, self._cui,
                 self._description, self.confidence)
Example #53
0
 def __init__(self, original_line):
     """Builds a MetamapLine from one raw Metamap output line.

     original_line: the raw text; the line id is taken from field 0.

     Raises:
         CUINotFoundError: missing or empty CUI (field 4).
         ParsingError: missing description/source/semtype fields.
         NoConfidenceError: unparseable confidence field.
     """
     Line.__init__(self, original_line, id_position=0)
     try:
         self._cui = self.split_line[4]
     except IndexError:
         raise CUINotFoundError("There was no CUI in the line '%s'" %
                                self._line)
     if self._cui == '':
         raise CUINotFoundError("There was no CUI in the line '%s'" %
                                self._line)
     try:
         self._description = self.split_line[3]
         self._source = self.split_line[6]
         self._semtype = self.split_line[5]
     except IndexError:
         raise ParsingError("Data missing from line '%s'" % self._line)
     # Some entities have no stated confidence. We use 0 in such cases,
     # so they can be eliminated from the workflow later.
     try:
         self.confidence = float(self.split_line[2]) / 1000.0
     except ValueError:
         raise NoConfidenceError("Could not parse a confidence value in "
                                 "line '%s'" % self._line)
     try:
         locations = self.split_line[8].split(',')
     except Exception:
         # Narrowed from a bare except; missing locations stay tolerated.
         logging.debug("Could not find a location in line %s", self._line)
         locations = [""]
     locations = [x.split(':') for x in locations]
     locations.sort()
     # Use the first appearance of a term as its location
     self._location = locations[0]
     logging.log(ULTRADEBUG, "Created a MetamapLine @ %d: %s (%s) %1.3f",
                 self.line_id, self._cui, self._description,
                 self.confidence)
Example #54
0
    def _generate_normalization_factors(self):
        """Fills self.normfactors with the sum of each matrix row.

        Rows summing to zero receive a factor of 1.0 instead, so that
        normalization never divides by zero.
        """
        import operator
        logging.info("Generating array of normalization factors. This is a "
                     "slow operation. Please wait.")
        row_number = 0
        while row_number < self._height:
            logging.debug("Generating normalization factor for row %d",
                          row_number)
            # Sum every element of the current row.
            current_row = self._get_row(row_number)
            logging.log(ULTRADEBUG, "Row %d contains: %s", row_number,
                        current_row)
            total = reduce(operator.add, current_row)
            if total == 0.0:
                logging.info(
                    "Row %d in the matrix adds up to 0. This may "
                    "be a problem, depending on your evaluation function. Since "
                    "this is a normalization calculation, it will be replaced by "
                    "1.", row_number)
                total = 1.0
            self.normfactors[row_number] = total
            logging.log(ULTRADEBUG, "Normalization factor for row %d=%1.5f",
                        row_number, total)
            row_number += 1
        logging.info("Normalization factor generation done.")
Example #55
0
 def __init__(self, graph_builder_constructor, graph_builder_params,
              ranker_constructor, ranker_params, ranking_cutoff):
     """Creates a SingleItemWorkflow.

     Either component can be omitted by passing None as its constructor;
     the ranker, when present, is wrapped in a MappedRanker.
     """
     logging.debug("Setting up a SingleItemWorkflow instance.")
     logging.debug("My graph builder is: %r", graph_builder_constructor)
     if graph_builder_constructor is None:
         self._graph_builder = None
     else:
         self._graph_builder = graph_builder_constructor(
             *graph_builder_params)
     if ranker_constructor is None:
         self._ranker = None
     else:
         self._ranker = MappedRanker(ranker_constructor(*ranker_params))
     logging.debug("My ranker is: %r", ranker_constructor)
     self._ranking_cutoff = ranking_cutoff
     logging.debug("My ranking cutoff is: %r", self._ranking_cutoff)
     # Results accumulate here, keyed by article set id.
     self.all_results = {}
     return