Example #1
File: tree.py Project: YZWD/MEDRank
 def __init__(self, filename="*&$#$%#", file_mode="r", cachesize=1048576):
     # If the filename isn't specified, use the default one (None has a
     # special meaning, so we can't use it - it means create a temp file)
     if filename == "*&$#$%#":
         filename = _DEFAULT_TREE_DATA
     logging.info("Initializing tree with data from %r", filename)
     self._tree = StringDBDict(persistent_file=filename,
                               file_mode=file_mode,
                               cachesize=cachesize)
     self._invlookup = None  # Init the inverse name lookup database lazily
     self._origname = filename
     self.terms = self._tree.keys()
     self.terms.sort()
     # This one is for speedy retrieval and indexing
     self._term_list_as_dict = None
     self._search_dict = None
     self.num_terms = len(self.terms)
     return
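The odd sentinel default ("*&$#$%#") lets callers either rely on the packaged tree data or pass an explicit path, while reserving None for "create a temp file". A minimal usage sketch, assuming a pre-built tree file (the path below is hypothetical):

# Minimal usage sketch; "mesh.tree.db" is a hypothetical pre-built tree file.
tree = Tree("mesh.tree.db")      # opens the store read-only by default
print tree.num_terms             # terms are sorted at load time
tree_default = Tree()            # falls back to _DEFAULT_TREE_DATA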
Example #2
def main():
    mrsty_file = sys.argv[3]
    original_filename = sys.argv[2]
    data_store_name = sys.argv[1]
    original_file = Text(bz2.BZ2File(original_filename, 'r'))
    print "Loading semantic types from %s" % mrsty_file
    stypes = SemanticTypes()
    stypes.build_from_mrsty_file(MRSTYTable(bz2.BZ2File(mrsty_file)))
    print "Semantic types loaded."
    print "Turning the data from %s into %s. Please wait." % (
            original_filename, data_store_name)
    data_store = StringDBDict(data_store_name,
                              sync_every_transactions=0,
                              write_out_every_transactions=200000,
                              file_mode='c')
    data_store.sync_every = 0
    build_concept_dictionary(original_file, data_store, stypes)
    data_store.sync_every = 100
    print "Conversion done."
Example #3
 def __init__(self, reader, graph_builder, ranker, eval_parameters,
              ranking_cutoff, mesh_tree_filename, distance_matrix_filename,
              distance_function, umls_converter_data_filename,
              umls_concept_data_filename, output_file):
     logging.debug("Setting up a Workflow instance.")
     logging.debug("My reader is: %r", reader)
     self._reader = reader
     logging.debug("My graph builder is: %r", graph_builder)
     self._graph_builder = graph_builder
     self._ranker = MappedRanker(ranker)
     logging.debug("My ranker is: %r", self._ranker)
     self._ranking_cutoff = ranking_cutoff
     logging.debug("My ranking cutoff is: %r", self._ranking_cutoff)
     logging.debug("Creating a Tree instance from %s", mesh_tree_filename)
     self._mesh_tree = Tree(mesh_tree_filename)
     logging.debug(
         "Creating SAVCC distance matrix with %r and distance "
         "function %r", distance_matrix_filename, distance_function)
     self._matrix = SavccNormalizedMatrix(
         open(distance_matrix_filename, "rb"), distance_function)
     logging.debug("Filling in the rest of the evaluation parameters.")
     self._eval_parameters = eval_parameters
     self._eval_parameters.mesh_tree = self._mesh_tree
     self._eval_parameters.savcc_matrix = self._matrix
     logging.debug("My evaluation parameters are: %r",
                   self._eval_parameters)
     if umls_converter_data_filename is None:
         converter_data = None
     else:
         converter_data = pickle.load(
             open(umls_converter_data_filename, "rb"))
     self._umls_converter = RankedConverter(
         Converter(self._mesh_tree, converter_data))
     logging.debug("My converter is: %r", self._umls_converter)
     logging.debug("Initializing Concept storage from %s",
                   umls_concept_data_filename)
     if umls_concept_data_filename is None:
         Concept.init_storage()
     else:
         Concept.init_storage(StringDBDict(umls_concept_data_filename))
     self._output_file = output_file
     logging.debug("My output file is: %r", self._output_file)
     return
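Note the two None fallbacks above: with no converter data file the Converter simply receives None, and with no concept data file Concept.init_storage() is called without a backing StringDBDict. A hedged sketch of the same optional-pickle-load pattern in isolation (the helper name is hypothetical):

import pickle

# Load a pickle if a filename was given, else return the default.
def load_or_default(filename, default=None):
    if filename is None:
        return default
    with open(filename, "rb") as f:
        return pickle.load(f)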
Example #4
File: tree.py Project: YZWD/MEDRank
class Tree(object):
    """Describes a tree of MeSH terms. The contents should be tree_node,
    generated by the build_mesh_tree_file script. The tree contains a 
    term name (string)->tree_node mapping."""
    def __init__(self, filename="*&$#$%#", file_mode="r", cachesize=1048576):
        # If the filename isn't specified, use the default one (None has a
        # special meaning, so we can't use it - it means create a temp file)
        if filename == "*&$#$%#":
            filename = _DEFAULT_TREE_DATA
        logging.info("Initializing tree with data from %r", filename)
        self._tree = StringDBDict(persistent_file=filename,
                                  file_mode=file_mode,
                                  cachesize=cachesize)
        self._invlookup = None  # Init the inverse name lookup database lazily
        self._origname = filename
        self.terms = self._tree.keys()
        self.terms.sort()
        # This one is for speedy retrieval and indexing
        self._term_list_as_dict = None
        self._search_dict = None
        self.num_terms = len(self.terms)
        return

    def original_filename(self):
        """Returns the original filename of the tree."""
        return self._origname

    def __repr__(self):
        return "<MeSH Semantic tree from %s with %d terms>" % \
                (self._origname, self.num_terms)

    @staticmethod
    def common_root(pos1, pos2):
        """Determines the common dotted root of a pair of tree positions."""
        pos1_split = pos1.split(".")
        pos2_split = pos2.split(".")
        common_terms = []
        for x in zip(pos1_split, pos2_split):
            if x[0] != x[1]: break
            common_terms.append(x[0])
        return '.'.join(common_terms)

    def semantic_distance(self, term1, term2):
        """Distance between two nodes, assuming there is a single root node
        for the tree linking all subtrees. Qualifiers and descriptors are
        automatically excluded"""
        node1 = self._tree[term1]
        node2 = self._tree[term2]
        if node1.is_qualifier() or node2.is_qualifier():
            return -1
        if node1.is_descriptor() or node2.is_descriptor():
            return -1
        distance = 999999999999
        for pos1 in node1.position:
            pos1 = '#.%s' % pos1
            for pos2 in node2.position:
                # The extra item in pos 1 and pos 2 emulates the common root
                # node
                pos2 = '#.%s' % pos2
                root = self.common_root(pos1, pos2)
                rootdots = root.count(".")
                dist_1 = pos1.count(".") - rootdots
                dist_2 = pos2.count(".") - rootdots
                dist = dist_1 + dist_2
                if dist < 0:
                    raise ValueError(
                        "Problem: %s<->%s have a negative "
                        "distance" % (pos1, pos2))
                if dist < distance: distance = dist
        return distance

    def distance(self, term1, term2):
        """Distance between two nodes, assuming no single root node
        for the tree linking all subtrees."""
        # Check for same-treeness
        possible_trees1 = self._tree[term1].get_trees()
        possible_trees2 = self._tree[term2].get_trees()
        combination_thereof = [x in possible_trees2 for x in possible_trees1]
        if True not in combination_thereof:
            return -1
        sd = self.semantic_distance(term1, term2)
        return sd

    def deepest_of_list(self, list_of_terms):
        """Returns the term with the deepest position among list_of_terms."""
        return max(
            (self._tree[x].deepest_depth(), x) for x in list_of_terms)[1]

    def _init_inverse_lookup(self):
        """Sets up the internal data store to perform reverse lookups."""
        logging.debug("First request of a reverse lookup. Building the " \
                      "inverse lookup dictionary.")
        self._invlookup = {}
        for k, items in self._tree.iteritems():
            for item in items.position:
                self._invlookup[item] = k
        logging.log(ULTRADEBUG, "Done building inverse lookup dictionary.")
        return

    def reverse_lookup(self, term):
        """Perform a reverse lookup, after setting up the reverse lookup
        dictionary if necessary."""
        if self._invlookup is None:
            self._init_inverse_lookup()
        try:
            return self._invlookup[term]
        except KeyError:
            raise PositionNotInTree("%s is not a position in this tree." %
                                    term)

    def __getitem__(self, key):
        try:
            return self._tree[key.lower()]
        except KeyError:
            raise TermNotInTree("The term %s is not in the tree %r." %
                                (key, self))

    def eliminate_checktags(self, list_of_terms):
        """Returns a list of terms with the checktags omitted."""
        return [x for x in list_of_terms if x not in checktags]

    def eliminate_descriptors(self, list_of_terms):
        """Returns a list of terms with the descriptors omitted."""
        return [x for x in list_of_terms if not self._tree[x].is_descriptor()]

    def eliminate_qualifiers(self, list_of_terms):
        """Returns a list of terms with the qualifiers omitted."""
        return [x for x in list_of_terms if not self[x].is_qualifier()]

    def only_checktags(self, list_of_terms):
        """Returns only the terms that are checktags."""
        return [x for x in list_of_terms if x in checktags]

    def only_qualifiers(self, list_of_terms):
        """Returns only the terms that are qualifiers."""
        return [x for x in list_of_terms if self._tree[x].is_qualifier()]

    def only_descriptors(self, list_of_terms):
        """Returns only the terms that are descriptors."""
        return [x for x in list_of_terms if self._tree[x].is_descriptor()]

    def index(self, term):
        """Returns the index of a term in the sorted term list"""
        if self._term_list_as_dict is None:
            # Precompute all indexes
            logging.debug("Building MeSH tree index.")
            currindex = 0
            self._term_list_as_dict = {}
            for each_term in self.terms:
                self._term_list_as_dict[each_term] = currindex
                for each_synonym in self[each_term].synonyms:
                    self._term_list_as_dict[each_synonym] = currindex
                currindex += 1
        try:
            return self._term_list_as_dict[term]
        except KeyError:
            raise TermNotInTree("Term %s is not a member of tree %r" %
                                (term, self))

    def term_vector(self, list_of_terms):
        """Returns a VocabularyVector representing the list of terms as seen 
        by this tree."""
        new_vector = VocabularyVector(self.num_terms)
        for term in list_of_terms:
            try:
                new_vector[self.index(term)] = 1
            except TermNotInTree:
                logging.warn(
                    'Weird: term %r could not be found in %r. It '
                    'should be there.', term, self)
        return new_vector

    def _init_search_dict(self):
        """Sets up the internal data store to perform searches."""
        logging.debug("First request of a search. Building the "
                      "search dictionary.")
        self._search_dict = {}
        for k, items in self._tree.iteritems():
            for synonym in items.synonyms:
                self._search_dict.setdefault(synonym, []).append(k)
            self._search_dict.setdefault(k, []).append(k)

    def search(self, term):
        """Searches the tree for a term, looking at synonyms as well as keys"""
        if self._search_dict is None:
            self._init_search_dict()
        try:
            result = self._search_dict[term]
        except KeyError:
            return TreeSearchResults([])
        if len(result) == 1:
            return TreeSearchResults(self[result[0]])
        return TreeSearchResults([self[x] for x in result])
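common_root is a pure static method, so it (and the distance arithmetic built on it) can be checked directly. A worked example with hypothetical positions:

# Worked example of the distance arithmetic:
print Tree.common_root("A01.456.313", "A01.456.505")   # -> "A01.456"
# semantic_distance prefixes both positions with "#." to emulate a shared
# root; "#.A01.456.313" and "#.A01.456.505" each sit one edge below their
# common root "#.A01.456", so their distance is 1 + 1 = 2.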
Example #5
 def __init__(self):
     try:
         self._cache_location = sys.argv[2]
     except IndexError:
         self._cache_location = DEFAULT_CACHE_NAME
     self._cache = StringDBDict(self._cache_location, file_mode="c")
Example #6
 def __init__(self):
     try:
         self._cache_location = sys.argv[2]
     except IndexError:
         self._cache_location = _DEFAULT_CONCEPT_STORAGE
     self._cache = StringDBDict(self._cache_location, file_mode="r")
Example #7
         chunk = chunk.strip()
         for k, v in tok_bact.iteritems():
             chunk = chunk.replace(k, v)
         if reduce(operator.or_,
                   [x in chunk.lower() for x in useless_lines]):
             continue
         if len(chunk) < 2:
             continue
         outfile.write('%010d|%s\n' % (chunkid, chunk))
         chunkmap[fakename].append(chunkid)
         chunkid += 1
 outfile.close()
 print "Saving chunkmap"
 pickle.dump(chunkmap, open(outmapname, "wb"), pickle.HIGHEST_PROTOCOL)
 print "These files couldn't be processed:"
 print '\n'.join(skipped)
 print "Opening (or creating) cache in", sys.argv[2]
 the_cache=StringDBDict(os.path.join(sys.argv[2], DEFAULT_CACHE_NAME),
                        file_mode='c')
 PubMed.download_many([str(x) for x in known_articles if str(x) not in 
                       the_cache.keys()], download_callback,
                      parser=Medline.RecordParser())
 mti_filename=sys.argv[1]+'.mti'
 print "Finished processing the cache. Using the cache to build", \
        mti_filename
 mti_file=open(mti_filename, "w")
 chunkmap={}
 hexfinder=re.compile(r'\\x[a-f0-9][a-f0-9]', re.IGNORECASE)
 for article in known_articles:
     try:
         article_record=the_cache[str(article)]
     except KeyError:
         print "Article doesn't exist in cache. Skipping."
         continue
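The reduce(operator.or_, [...]) test above builds the whole list before OR-ing it together; the builtin any() short-circuits and reads more directly. An equivalent form:

# Equivalent, short-circuiting form of the reduce(operator.or_, ...) test:
if any(x in chunk.lower() for x in useless_lines):
    continue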
Example #8
            trees[term] = TreeNode(term, role, synonyms, set(position))
    return trees


if __name__ == "__main__":
    # The pickling and unpickling make this horribly slow, so we'll trade some
    # memory for speed in the build process and later turn the dictionary into
    # a DB-backed one.
    tree_storage = {}
    for treefile in sys.argv[2:]:
        treesfile = bz2.BZ2File(treefile, 'rU')
        print "Reading %s..." % treefile
        tree_storage = build_tree_from_descriptor_file(treesfile, tree_storage)

    print "Tree built. It has %d unique terms." % len(tree_storage)
    print "For example... arm=", tree_storage['arm'], " and eye=", \
          tree_storage['eye']
    print "Done generating tree."
    print "Storing tree in", sys.argv[1]
    tree_on_disk = StringDBDict(persistent_file=sys.argv[1],
                                sync_every_transactions=0,
                                write_out_every_transactions=0,
                                file_mode='c')
    write_counter = 0
    for k, v in tree_storage.iteritems():
        tree_on_disk[k] = v
        write_counter += 1
        if write_counter % 1000 == 0:
            print "Stored", write_counter, "terms."
    tree_on_disk.sync_every = 1
    print "Done storing."
Example #9
def multi_processor(reader,
                    workflow_class,
                    graph_builder_constructor, graph_builder_params,
                    ranker_constructor, ranker_params,
                    eval_parameters, 
                    ranking_cutoff,
                    mesh_tree_filename, distance_matrix_filename,
                    distance_function,
                    umls_converter_data_filename,
                    umls_concept_data_filename,
                    extra_data_name,
                    extra_data_contents,
                    output_file,
                    num_threads=None,
                    queue_size=None,
                    output_callback=output,
                    output_headers_callback=output_headers,
                    output_item_callback=output_one_item,
                    performance_tuning=True):
    """
    Perform the evaluation.
    Multithreading notes: it's the responsibility of the caller to make sure
    that extra_data_contents, if any, are thread-safe.
    """
    if num_threads is None:
        num_threads = 1
    if queue_size is None:
        queue_size = 0  # Queue treats 0 as "unbounded"; also avoids 2*None below

    logging.debug("Initializing Concept storage from %s", 
                  umls_concept_data_filename)
                  
    # Since there's no direct way of setting the concept cache's title, 
    # we set it here, wait for it to be inherited, and then get the 'real' 
    # process title for this one. 
    if umls_concept_data_filename is None:
        Concept.init_storage()
    else:
        Concept.init_storage(StringDBDict(umls_concept_data_filename))
    Pmid.init_storage()

    threads = []
    logging.info("Creating %d worker threads.", num_threads)
    task_queues = [Queue(queue_size) for x in xrange(num_threads)]
    this_output_queue = Queue(2 * queue_size)

    # Create an output processor
    output_processor = Thread(target=output_callback,
                              args=(output_file,
                                    this_output_queue,
                                    output_headers_callback,
                                    output_item_callback))
    output_processor.start()
    
    for i in xrange(num_threads):
        this_thread = Thread(target=processor,
                             args=(workflow_class,
                                   graph_builder_constructor,
                                   graph_builder_params,
                                   ranker_constructor,
                                   ranker_params,
                                   eval_parameters,
                                   ranking_cutoff,
                                   mesh_tree_filename,
                                   distance_matrix_filename,
                                   distance_function,
                                   umls_converter_data_filename,
                                   extra_data_name,
                                   extra_data_contents,
                                   task_queues[i],
                                   this_output_queue),
                             name="MEDRank-Worker-%d" % i)
        logging.log(ULTRADEBUG, "Created thread: %r", this_thread)
        this_thread.start()
        threads.append((this_thread, this_output_queue, task_queues[i]))
    
    all_results = {}
    count = 0

    # Use one task queue per worker thread (rather than a single dispatch
    # queue) to avoid starvation while waiting on semlocks.
    for each_article in reader:
        count += 1
        target_thread = (count - 1) % num_threads
        logging.info("Dispatching article %d: %s to %s",
                     count,
                     each_article.set_id,
                     threads[target_thread][0].name)
        task_queues[target_thread].put(each_article)

    logging.log(ULTRADEBUG, "Waiting for processing to end.")

    alive_threads = [x for x in threads if x[0].is_alive()]
    remaining_threads = len(alive_threads)

    logging.info("There are %d threads (out of %d) still alive.",
                 remaining_threads,
                 num_threads)
    for i in xrange(remaining_threads):
        alive_threads[i][2].put('STOP')
    logging.debug("Sent STOP requests. Notifying queue that no further "
                  "requests will come.")

    logging.info("All information sent to the threads.")

    # Note end of output

    while len(threads) > 0:
        a_thread = threads.pop()
        # Join each worker thread to wait for it to finish
        a_thread[0].join()
    logging.info("Finishing writing out results.")
    this_output_queue.put("STOP")
    output_processor.join()
    logging.info("Results written. Finishing multithreading.")
    Pmid.close_storage()
    return
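Dispatch is a simple round-robin over the per-worker queues. A worked illustration of the target computation:

# Round-robin dispatch: with num_threads = 3, articles 1..6 land on
# workers 0, 1, 2, 0, 1, 2, since target = (count - 1) % num_threads.
for count in range(1, 7):
    print count, "->", (count - 1) % 3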
Example #10
def multi_processor(reader,
                    workflow_class,
                    graph_builder_constructor,
                    graph_builder_params,
                    ranker_constructor,
                    ranker_params,
                    eval_parameters,
                    ranking_cutoff,
                    mesh_tree_filename,
                    distance_matrix_filename,
                    distance_function,
                    umls_converter_data_filename,
                    umls_concept_data_filename,
                    extra_data_name,
                    extra_data_contents,
                    output_file,
                    num_processes=None,
                    queue_size=None,
                    output_callback=output,
                    output_headers_callback=output_headers,
                    output_item_callback=output_one_item,
                    performance_tuning=True):
    """
    Perform the evaluation.
    Multiprocessing notes: It's the responsibility of the caller to make sure that
    extra_data_contents, if any, are multiprocessing-safe. For example, by using
    a SyncManager and Namespace and passing the proxy. See umls/concept for an example.
    """

    if num_processes is None:
        num_processes = cpu_count()
    if queue_size is None:
        queue_size = 0  # Queue treats 0 as "unbounded"; also avoids 2*None below

    if performance_tuning:
        # Since reading the file involves an awful lot of object creation
        # and destruction, we'll tweak the gc thresholds to sweep less
        # frequently. IOW, we have a LOT of short-lived objects; there's no
        # sense garbage-collecting the later generations very often.
        # (This is about 10x, 5x, and 5x the usual values.)
        original_threshold = gc.get_threshold()
        gc.set_threshold(10 * original_threshold[0],
                         5 * original_threshold[1],
                         5 * original_threshold[2])
        original_check_interval = sys.getcheckinterval()
        # Similarly, we'll try to minimize overhead from thread switches
        # 5x usual value
        sys.setcheckinterval(5 * original_check_interval)
    logging.debug("Initializing Concept storage from %s",
                  umls_concept_data_filename)

    if umls_concept_data_filename is None:
        Concept.init_storage()
    else:
        Concept.init_storage(StringDBDict(umls_concept_data_filename))
    Pmid.init_storage()

    proctitle.setproctitle("MEDRank-main")

    processes = []
    logging.info("Creating %d worker processes.", num_processes)
    task_queues = [Queue(queue_size) for x in xrange(num_processes)]
    this_output_queue = Queue(2 * queue_size)

    # Create an output processor
    output_processor = Process(target=output_callback,
                               args=(output_file, this_output_queue,
                                     output_headers_callback,
                                     output_item_callback))
    output_processor.start()

    for i in xrange(num_processes):
        this_process = Process(
            target=processor,
            args=(workflow_class, graph_builder_constructor,
                  graph_builder_params, ranker_constructor, ranker_params,
                  eval_parameters, ranking_cutoff, mesh_tree_filename,
                  distance_matrix_filename, distance_function,
                  umls_converter_data_filename, extra_data_name,
                  extra_data_contents, task_queues[i], this_output_queue,
                  "MEDRank-Worker-%d" % i),
            name="MEDRank-Worker-%d" % i)
        logging.log(ULTRADEBUG, "Created process: %r", this_process)
        this_process.start()
        processes.append((this_process, this_output_queue, task_queues[i]))

    all_results = {}
    count = 0

    # Use one task queue per worker process (rather than a single dispatch
    # queue) to avoid starvation while waiting on semlocks.
    for each_article in reader:
        count += 1
        target_process = (count - 1) % num_processes
        logging.info("Dispatching article %d: %s to %s", count,
                     each_article.set_id, processes[target_process][0].name)
        task_queues[target_process].put(each_article)

    logging.log(ULTRADEBUG, "Waiting for processing to end.")

    alive_processes = [x for x in processes if x[0].is_alive()]
    remaining_processes = len(alive_processes)

    logging.info("There are %d processes (out of %d) still alive.",
                 remaining_processes, num_processes)
    for i in xrange(remaining_processes):
        alive_processes[i][2].put('STOP')
        alive_processes[i][2].close()
    logging.debug("Sent STOP requests. Notifying queue that no further "
                  "requests will come.")

    logging.info("All information sent to the processors.")

    # Back to normal
    if performance_tuning:
        gc.set_threshold(original_threshold[0], original_threshold[1],
                         original_threshold[2])
        sys.setcheckinterval(original_check_interval)

    # Note end of output

    while len(processes) > 0:
        a_process = processes.pop()
        # Join each worker process to wait for it to finish
        a_process[0].join()
    logging.info("Finishing writing out results.")
    this_output_queue.put("STOP")
    output_processor.join()
    logging.info("Results written. Finishing multiprocessing.")
    return
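The gc tuning above follows a save/tweak/restore pattern; wrapping the restore in a finally block would keep an exception mid-run from leaving the interpreter detuned. A minimal standalone sketch of the same idea:

import gc

# Save, raise, and restore the gc thresholds around allocation-heavy work.
saved = gc.get_threshold()
gc.set_threshold(saved[0] * 10, saved[1] * 5, saved[2] * 5)
try:
    pass  # ... allocation-heavy work goes here ...
finally:
    gc.set_threshold(*saved)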