Example #1
def fill_in_reverse_chunkmap(self):
    """Creates a reverse chunkmap. The original chunkmap holds a
    filename->[chunk ids] map. We want to turn this into a chunk id->
    pubmed id map. To do that, we extract the pubmed id from each filename
    and add its chunk ids as new keys in the reverse dictionary, with the
    pubmed id as their value."""
    for k, value in self.iteritems():
        pubmed_id = Pmid()
        pubmed_id.set_from_string(k)
        chunk_ids = [int(x) for x in value]
        self._reverse_chunkmap.update(dict.fromkeys(chunk_ids, pubmed_id))
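A minimal standalone sketch of the same inversion, using plain dicts so it runs outside MEDRank; the digit-stripping helper stands in for Pmid.set_from_string and is an assumption, not the real class.

# Minimal sketch of the inversion described in the docstring above, assuming a
# plain dict of filename -> [chunk ids]; a digit-stripping helper stands in
# for Pmid.set_from_string.
chunkmap = {'12345.txt': ['1', '2'], '67890.txt': ['3']}

def pubmed_id_from_filename(filename):
    # Stand-in for Pmid.set_from_string: keep only the digits of the name.
    return int(''.join(ch for ch in filename if ch.isdigit()))

reverse_chunkmap = {}
for filename, chunk_ids in chunkmap.items():
    pubmed_id = pubmed_id_from_filename(filename)
    reverse_chunkmap.update(dict.fromkeys((int(x) for x in chunk_ids), pubmed_id))

print(reverse_chunkmap)  # {1: 12345, 2: 12345, 3: 67890}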
Example #2
def fill_in_reverse_chunkmap(self):
    """Creates a reverse chunkmap. The original chunkmap holds a
    filename->[chunk ids] map. We want to turn this into a chunk id->
    pubmed id map. To do that, we extract the pubmed id from each filename
    and add its chunk ids as new keys in the reverse dictionary, with the
    pubmed id as their value."""
    for k, value in self.iteritems():
        pubmed_id = Pmid()
        pubmed_id.set_from_string(k)
        chunk_ids = [int(x) for x in value]
        self._reverse_chunkmap.update(
            dict.fromkeys(chunk_ids, pubmed_id))
Example #3
def testChunkedOutput(self):
    from MEDRank.file.chunkmap import chunkmap_factory
    from MEDRank.pubmed.pmid import Pmid
    _cm = {'1.txt': [12345], '2.txt': [56789, 56790]}
    cm = chunkmap_factory(_cm)
    cno = ChunkedNLMOutput(self.fakefile, Line, self.lines_to_skip, cm)
    processed_sets = [x for x in cno]
    self.assertEquals(len(processed_sets), 2)
    self.assertEquals(len(processed_sets[0].lines), 1)
    self.assertEquals(len(processed_sets[1].lines), 2)
    self.assertEquals(processed_sets[0].set_id, Pmid(1))
    self.assertEquals(processed_sets[1].set_id, Pmid(2))
    self.assertEquals(processed_sets[1].lines[1].line_id, 56790)
Example #4
    def __iter__(self):
        current_set = []
        current_id = None
        bad_id = -1
        for line in NLMOutput.__iter__(self):
            try:
                this_lines_set_id = self._chunkmap.pmid_from_block(
                    line.line_id)
            except KeyError:
                logging.warn(
                    "Line without chunkmap equivalent. Emitting"
                    " as id %d", bad_id)
                this_lines_set_id = Pmid(bad_id)
            if this_lines_set_id != current_id:
                # Is this the first invocation? If not, we have to emit the
                # linelist that just ended, but if it is we'll just pretend
                # that we did.
                if current_id is not None:
                    # Emit the linelist that just ended
                    logging.log(
                        ULTRADEBUG, "Completed set of lines %s "
                        "according to the chunkmap. Emitting them.",
                        current_id)
                    if current_id < 0:
                        # Decrement bad line counter
                        bad_id -= 1
                    yield self._lines_type(current_id, current_set)

                # Start a new, empty linelist
                current_id = this_lines_set_id
                current_set = []
            current_set.append(line)
        # Is there something left to emit after the iteration's over?
        if len(current_set) > 0:
            logging.log(
                ULTRADEBUG, "Completed iteration. Emitting the last "
                "lines left with set id %s", current_id)
            yield self._lines_type(current_id, current_set)
        return
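The core of __iter__ above is a run-length grouping: lines are accumulated while their set id stays the same, the finished group is yielded when the id changes, and the final group is flushed after the loop. A self-contained sketch of just that pattern, with plain tuples instead of NLM output lines and a chunkmap:

# Standalone sketch of the consecutive-grouping pattern used by __iter__ above.
def group_consecutive(items, key):
    current_key = object()   # sentinel that never compares equal to a real key
    current_group = []
    for item in items:
        item_key = key(item)
        if item_key != current_key:
            if current_group:
                yield current_key, current_group   # emit the group that just ended
            current_key = item_key
            current_group = []
        current_group.append(item)
    if current_group:
        yield current_key, current_group           # flush the leftover group

lines = [(1, 'a'), (1, 'b'), (2, 'c')]
print(list(group_consecutive(lines, key=lambda pair: pair[0])))
# -> [(1, [(1, 'a'), (1, 'b')]), (2, [(2, 'c')])]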
Example #5
def multi_processor(reader,
                    workflow_class,
                    graph_builder_constructor, graph_builder_params,
                    ranker_constructor, ranker_params,
                    eval_parameters, 
                    ranking_cutoff,
                    mesh_tree_filename, distance_matrix_filename,
                    distance_function,
                    umls_converter_data_filename,
                    umls_concept_data_filename,
                    extra_data_name,
                    extra_data_contents,
                    output_file,
                    num_processes=None,
                    queue_size=None,
                    output_callback=output,
                    output_headers_callback=output_headers,
                    output_item_callback=output_one_item,
                    performance_tuning=True):
    """
    Perform the evaluation.
    Multiprocessing notes: It's the responsibility of the caller to make sure that
    extra_data_contents, if any, are multiprocessing-safe, for example by using
    a SyncManager and Namespace and passing the proxy. See umls/concept for an example.
    """
    
    if num_processes is None:
        num_processes=cpu_count()

    if performance_tuning:
        # Since reading the file involves an awful lot of object creation
        # and destruction, we'll raise the gc thresholds so it sweeps less often.
        # In other words, we have a LOT of short-lived objects; there's no sense
        # garbage-collecting the older generations very often.
        # (this is about 10x, 5x, and 5x the usual)
        original_threshold=gc.get_threshold()
        gc.set_threshold(10 * original_threshold[0], 
                         5 * original_threshold[1],
                         5 * original_threshold[1]) 
        original_check_interval=sys.getcheckinterval()
        # Similarly, we'll try to minimize overhead from thread switches
        # 5x usual value
        sys.setcheckinterval(5*original_check_interval)
    logging.debug("Initializing Concept storage from %s", 
                  umls_concept_data_filename)
                  
    if umls_concept_data_filename is None:
        Concept.init_storage()
    else:
        Concept.init_storage(StringDBDict(umls_concept_data_filename))
    Pmid.init_storage()

    proctitle.setproctitle("MEDRank-main")
    
    processes=[]
    logging.info("Creating %d worker processes.", num_processes)
    #task_queue=[JoinableQueue(queue_size) for x in xrange(num_processes)]
    task_queues=[Queue(queue_size) for x in xrange(num_processes)]
    this_output_queue=Queue(2*queue_size)

    # Create an output processor
    output_processor=Process(target=output_callback, 
                             args=(output_file, 
                                   this_output_queue,
                                   output_headers_callback,
                                   output_item_callback))
    output_processor.start()
    
    for i in xrange(num_processes):
        this_process=Process(target=processor, args=(workflow_class,
                                                graph_builder_constructor, 
                                                graph_builder_params,
                                                ranker_constructor, 
                                                ranker_params,
                                                eval_parameters, 
                                                ranking_cutoff,
                                                mesh_tree_filename,
                                                distance_matrix_filename,
                                                distance_function,
                                                umls_converter_data_filename,
                                                extra_data_name,
                                                extra_data_contents,
                                                task_queues[i],
                                                this_output_queue,
                                                "MEDRank-Worker-%d" % i),
                             name="MEDRank-Worker-%d" % i)
        logging.log(ULTRADEBUG, "Created process: %r", this_process)
        this_process.start()
        processes.append((this_process, this_output_queue, task_queues[i]))
    
    all_results={}
    count=0

    # Use a single dispatch queue for automagical load balancing
    # CHANGED - Now uses multiple queues to avoid starving due to waiting on semlocks
    for each_article in reader:
        count+=1
        #queues_and_sizes=[(task_queues[x].qsize(), x) 
        #                  for x in xrange(num_processes)]
        #queues_and_sizes.sort()
        #target_process=queues_and_sizes[0][1]
        # logging.info("Dispatching article %d: %r", count, each_article)
        target_process=(count-1) % num_processes
        #Lowest-loaded process first.
        logging.info("Dispatching article %d: %s to %s", 
                     count,
                     each_article.set_id,
                     processes[target_process][0].name)
        task_queues[target_process].put(each_article)
        #task_queue[target_process].put(each_article)
        #task_queue.put(each_article)
        #logging.info("The task queue is approximately %d items long.", 
        #             task_queue.qsize())

    logging.log(ULTRADEBUG, "Waiting for processing to end.")
    all_results={}

    alive_processes=[x for x in processes if x[0].is_alive()]
    remaining_processes=len(alive_processes)

    logging.info("There are %d processes (out of %d) still alive.", 
                 remaining_processes,
                 num_processes)
    for i in xrange(remaining_processes):
        alive_processes[i][2].put('STOP')
        alive_processes[i][2].close()
    logging.debug("Sent STOP requests. Notifying queue that no further "
                  "requests will come.")

    logging.info("All information sent to the processors.")

    # Back to normal
    if performance_tuning:
        gc.set_threshold(original_threshold[0],
                         original_threshold[1],
                         original_threshold[2])
        sys.setcheckinterval(original_check_interval)

    # Note end of output

    while len(processes)>0:
        a_process=processes.pop()
        # We join the process to wait for the end of the reading 
        a_process[0].join()
        # logging.log(ULTRADEBUG, "Fetching results from finished process.")
        # all_results.update(a_process[1].get()) # Add results to result pool
        # logging.log(ULTRADEBUG, "Received results.")
    logging.info("Finishing writing out results.")
    this_output_queue.put("STOP")
    output_processor.join()
    logging.info("Results written. Finishing multiprocessing.")
    return
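Stripped of the MEDRank-specific parameters, the dispatch scheme above is: one Queue per worker process, round-robin assignment of incoming items, and a 'STOP' sentinel per queue to shut the workers down. A self-contained sketch of that pattern follows; the per-item work is a placeholder, not the real processor.

# Standalone sketch of the per-worker-queue / round-robin / STOP-sentinel
# pattern used above; the work done per item is a placeholder.
from multiprocessing import Process, Queue

def worker(task_queue, result_queue):
    # iter(get, 'STOP') keeps pulling items until the sentinel arrives.
    for item in iter(task_queue.get, 'STOP'):
        result_queue.put(item * item)

if __name__ == '__main__':
    num_workers = 2
    task_queues = [Queue() for _ in range(num_workers)]
    results = Queue()
    workers = [Process(target=worker, args=(q, results)) for q in task_queues]
    for w in workers:
        w.start()
    expected = 10
    for count, item in enumerate(range(expected)):
        task_queues[count % num_workers].put(item)  # round-robin dispatch
    for q in task_queues:
        q.put('STOP')                               # one sentinel per worker
    print(sorted(results.get() for _ in range(expected)))
    for w in workers:
        w.join()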
Example #6
def pmid_from_block(self, chunknum):
    num = self._number_finder.findall(str(chunknum))
    if len(num) == 0:
        raise KeyError("No chunk number found in %r" % chunknum)
    return Pmid(num[0])
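self._number_finder is not shown in this example; it is presumably a precompiled regular expression that pulls digit runs out of the chunk name. A rough standalone equivalent is sketched below; the pattern and the plain-int return are assumptions, and the real method wraps the result in Pmid.

import re

# Assumed stand-in for self._number_finder: grab runs of digits from a chunk
# identifier such as 'chunk_0001234' or '1234.txt'.
_number_finder = re.compile(r'\d+')

def pmid_from_block(chunknum):
    num = _number_finder.findall(str(chunknum))
    if len(num) == 0:
        raise KeyError("No chunk number found in %r" % chunknum)
    return int(num[0])  # the method above returns Pmid(num[0]) instead

print(pmid_from_block('chunk_0001234'))  # 1234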
Example #7
def pmid_from_block(self, chunknum):
    return Pmid(self[chunknum])
Example #8
def multi_processor(reader,
                    workflow_class,
                    graph_builder_constructor, graph_builder_params,
                    ranker_constructor, ranker_params,
                    eval_parameters, 
                    ranking_cutoff,
                    mesh_tree_filename, distance_matrix_filename,
                    distance_function,
                    umls_converter_data_filename,
                    umls_concept_data_filename,
                    extra_data_name,
                    extra_data_contents,
                    output_file,
                    num_threads=None,
                    queue_size=None,
                    output_callback=output,
                    output_headers_callback=output_headers,
                    output_item_callback=output_one_item,
                    performance_tuning=True):
    """
    Perform the evaluation.
    Multithreading notes: It's the responsibility of the caller to make sure
    that extra_data_contents, if any, are thread-safe. 
    """
    if num_threads is None:
        num_threads=1

    logging.debug("Initializing Concept storage from %s", 
                  umls_concept_data_filename)
                  
    # Since there's no direct way of setting the concept cache's title, 
    # we set it here, wait for it to be inherited, and then get the 'real' 
    # process title for this one. 
    if umls_concept_data_filename is None:
        Concept.init_storage()
    else:
        Concept.init_storage(StringDBDict(umls_concept_data_filename))
    Pmid.init_storage()

    threads=[]
    logging.info("Creating %d worker threads.", num_threads)
    #task_queue=[JoinableQueue(queue_size) for x in xrange(num_processes)]
    task_queues=[Queue(queue_size) for x in xrange(num_threads)]
    this_output_queue=Queue(2*queue_size)

    # Create an output processor
    output_processor=Thread(target=output_callback, 
                             args=(output_file, 
                                   this_output_queue,
                                   output_headers_callback,
                                   output_item_callback))
    output_processor.start()
    
    for i in xrange(num_threads):
        this_thread=Thread(target=processor, args=(workflow_class,
                                                graph_builder_constructor, 
                                                graph_builder_params,
                                                ranker_constructor, 
                                                ranker_params,
                                                eval_parameters, 
                                                ranking_cutoff,
                                                mesh_tree_filename,
                                                distance_matrix_filename,
                                                distance_function,
                                                umls_converter_data_filename,
                                                extra_data_name,
                                                extra_data_contents,
                                                task_queues[i],
                                                this_output_queue),
                             name="MEDRank-Worker-%d" % i)
        logging.log(ULTRADEBUG, "Created thread: %r", this_thread)
        this_thread.start()
        threads.append((this_thread, this_output_queue, task_queues[i]))
    
    all_results={}
    count=0

    # Use a single dispatch queue for automagical load balancing
    # CHANGED - Now uses multiple queues to avoid starving due to waiting on semlocks
    for each_article in reader:
        count+=1
        # logging.info("Dispatching article %d: %r", count, each_article)
        target_thread=(count-1) % num_threads
        logging.info("Dispatching article %d: %s to %s", 
                     count,
                     each_article.set_id,
                     threads[target_thread][0].name)
        task_queues[target_thread].put(each_article)
        #task_queue[target_process].put(each_article)
        #task_queue.put(each_article)
        #logging.info("The task queue is approximately %d items long.", 
        #             task_queue.qsize())

    logging.log(ULTRADEBUG, "Waiting for processing to end.")
    all_results={}

    alive_threads=[x for x in threads if x[0].is_alive()]
    remaining_threads=len(alive_threads)

    logging.info("There are %d threads (out of %d) still alive.", 
                 remaining_threads,
                 num_threads)
    for i in xrange(remaining_threads):
        alive_threads[i][2].put('STOP')
        #alive_threads[i][2].close()
    logging.debug("Sent STOP requests. Notifying queue that no further "
                  "requests will come.")

    logging.info("All information sent to the threads.")

    # Note end of output

    while len(threads)>0:
        a_thread=threads.pop()
        # We join the process to wait for the end of the reading 
        a_thread[0].join()
        # logging.log(ULTRADEBUG, "Fetching results from finished process.")
        # all_results.update(a_process[1].get()) # Add results to result pool
        # logging.log(ULTRADEBUG, "Received results.")
    logging.info("Finishing writing out results.")
    this_output_queue.put("STOP")
    output_processor.join()
    logging.info("Results written. Finishing multithreading.")
    Pmid.close_storage()
    return
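The processor target handed to each Thread is not shown in these examples. Judging from the queue protocol above, it is essentially a loop that drains its task queue until the 'STOP' sentinel and pushes results onto the shared output queue; the sketch below is a hypothetical simplification, not MEDRank's actual processor signature.

import logging

# Hypothetical shape of a worker compatible with the STOP-sentinel protocol
# used above; the real processor() takes many more parameters (workflow class,
# graph builder, ranker, and so on).
def simple_processor(task_queue, output_queue, evaluate):
    for article in iter(task_queue.get, 'STOP'):
        try:
            result = evaluate(article)
        except Exception:
            logging.exception("Evaluation of %r failed; skipping it.", article)
            continue
        output_queue.put(result)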
Example #9
def multi_processor(
    reader,
    workflow_class,
    graph_builder_constructor,
    graph_builder_params,
    ranker_constructor,
    ranker_params,
    eval_parameters,
    ranking_cutoff,
    mesh_tree_filename,
    distance_matrix_filename,
    distance_function,
    umls_converter_data_filename,
    umls_concept_data_filename,
    extra_data_name,
    extra_data_contents,
    output_file,
    num_threads=None,
    queue_size=None,
    output_callback=output,
    output_headers_callback=output_headers,
    output_item_callback=output_one_item,
    performance_tuning=True,
):
    """
    Perform the evaluation.
    Multithreading notes: It's the responsibility of the caller to make sure
    that extra_data_contents, if any, are thread-safe. 
    """
    if num_threads is None:
        num_threads = 1

    logging.debug("Initializing Concept storage from %s", umls_concept_data_filename)

    # Since there's no direct way of setting the concept cache's title,
    # we set it here, wait for it to be inherited, and then get the 'real'
    # process title for this one.
    if umls_concept_data_filename is None:
        Concept.init_storage()
    else:
        Concept.init_storage(StringDBDict(umls_concept_data_filename))
    Pmid.init_storage()

    threads = []
    logging.info("Creating %d worker threads.", num_threads)
    # task_queue=[JoinableQueue(queue_size) for x in xrange(num_processes)]
    task_queues = [Queue(queue_size) for x in xrange(num_threads)]
    this_output_queue = Queue(2 * queue_size)

    # Create an output processor
    output_processor = Thread(
        target=output_callback, args=(output_file, this_output_queue, output_headers_callback, output_item_callback)
    )
    output_processor.start()

    for i in xrange(num_threads):
        this_thread = Thread(
            target=processor,
            args=(
                workflow_class,
                graph_builder_constructor,
                graph_builder_params,
                ranker_constructor,
                ranker_params,
                eval_parameters,
                ranking_cutoff,
                mesh_tree_filename,
                distance_matrix_filename,
                distance_function,
                umls_converter_data_filename,
                extra_data_name,
                extra_data_contents,
                task_queues[i],
                this_output_queue,
            ),
            name="MEDRank-Worker-%d" % i,
        )
        logging.log(ULTRADEBUG, "Created thread: %r", this_thread)
        this_thread.start()
        threads.append((this_thread, this_output_queue, task_queues[i]))

    all_results = {}
    count = 0

    # Use a single dispatch queue for automagical load balancing
    # CHANGED - Now uses multiple queues to avoid starving due to waiting on semlocks
    for each_article in reader:
        count += 1
        # logging.info("Dispatching article %d: %r", count, each_article)
        target_thread = (count - 1) % num_threads
        logging.info("Dispatching article %d: %s to %s", count, each_article.set_id, threads[target_thread][0].name)
        task_queues[target_thread].put(each_article)
        # task_queue[target_process].put(each_article)
        # task_queue.put(each_article)
        # logging.info("The task queue is approximately %d items long.",
        #             task_queue.qsize())

    logging.log(ULTRADEBUG, "Waiting for processing to end.")
    all_results = {}

    alive_threads = [x for x in threads if x[0].is_alive()]
    remaining_threads = len(alive_threads)

    logging.info("There are %d threads (out of %d) still alive.", remaining_threads, num_threads)
    for i in xrange(remaining_threads):
        alive_threads[i][2].put("STOP")
        # alive_threads[i][2].close()
    logging.debug("Sent STOP requests. Notifying queue that no further " "requests will come.")

    logging.info("All information sent to the threads.")

    # Note end of output

    while len(threads) > 0:
        a_thread = threads.pop()
        # We join the process to wait for the end of the reading
        a_thread[0].join()
        # logging.log(ULTRADEBUG, "Fetching results from finished process.")
        # all_results.update(a_process[1].get()) # Add results to result pool
        # logging.log(ULTRADEBUG, "Received results.")
    logging.info("Finishing writing out results.")
    this_output_queue.put("STOP")
    output_processor.join()
    logging.info("Results written. Finishing multithreading.")
    Pmid.close_storage()
    return
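The output side follows the same convention: output_callback (not shown in these examples) presumably writes the headers once and then drains the output queue until the "STOP" string arrives. A hypothetical sketch of that shape, with the two writer callbacks left abstract:

# Hypothetical sketch of an output callback compatible with the protocol used
# above: write headers once, then drain the queue until the "STOP" sentinel.
def simple_output(output_file, output_queue, write_headers, write_item):
    write_headers(output_file)
    for item in iter(output_queue.get, "STOP"):
        write_item(output_file, item)
    output_file.flush()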
Example #10
def multi_processor(reader,
                    workflow_class,
                    graph_builder_constructor,
                    graph_builder_params,
                    ranker_constructor,
                    ranker_params,
                    eval_parameters,
                    ranking_cutoff,
                    mesh_tree_filename,
                    distance_matrix_filename,
                    distance_function,
                    umls_converter_data_filename,
                    umls_concept_data_filename,
                    extra_data_name,
                    extra_data_contents,
                    output_file,
                    num_processes=None,
                    queue_size=None,
                    output_callback=output,
                    output_headers_callback=output_headers,
                    output_item_callback=output_one_item,
                    performance_tuning=True):
    """
    Perform the evaluation.
    Multiprocessing notes: It's the responsibility of the caller to make sure that
    extra_data_contents, if any, are multiprocessing-safe, for example by using
    a SyncManager and Namespace and passing the proxy. See umls/concept for an example.
    """

    if num_processes is None:
        num_processes = cpu_count()

    if performance_tuning:
        # Since reading the file involves an awful lot of object creation
        # and destruction, we'll raise the gc thresholds so it sweeps less often.
        # In other words, we have a LOT of short-lived objects; there's no sense
        # garbage-collecting the older generations very often.
        # (this is about 10x, 5x, and 5x the usual)
        original_threshold = gc.get_threshold()
        gc.set_threshold(10 * original_threshold[0], 5 * original_threshold[1],
                         5 * original_threshold[1])
        original_check_interval = sys.getcheckinterval()
        # Similarly, we'll try to minimize overhead from thread switches
        # 5x usual value
        sys.setcheckinterval(5 * original_check_interval)
    logging.debug("Initializing Concept storage from %s",
                  umls_concept_data_filename)

    if umls_concept_data_filename is None:
        Concept.init_storage()
    else:
        Concept.init_storage(StringDBDict(umls_concept_data_filename))
    Pmid.init_storage()

    proctitle.setproctitle("MEDRank-main")

    processes = []
    logging.info("Creating %d worker processes.", num_processes)
    #task_queue=[JoinableQueue(queue_size) for x in xrange(num_processes)]
    task_queues = [Queue(queue_size) for x in xrange(num_processes)]
    this_output_queue = Queue(2 * queue_size)

    # Create an output processor
    output_processor = Process(target=output_callback,
                               args=(output_file, this_output_queue,
                                     output_headers_callback,
                                     output_item_callback))
    output_processor.start()

    for i in xrange(num_processes):
        this_process = Process(
            target=processor,
            args=(workflow_class, graph_builder_constructor,
                  graph_builder_params, ranker_constructor, ranker_params,
                  eval_parameters, ranking_cutoff, mesh_tree_filename,
                  distance_matrix_filename, distance_function,
                  umls_converter_data_filename, extra_data_name,
                  extra_data_contents, task_queues[i], this_output_queue,
                  "MEDRank-Worker-%d" % i),
            name="MEDRank-Worker-%d" % i)
        logging.log(ULTRADEBUG, "Created process: %r", this_process)
        this_process.start()
        processes.append((this_process, this_output_queue, task_queues[i]))

    all_results = {}
    count = 0

    # Use a single dispatch queue for automagical load balancing
    # CHANGED - Now uses multiple queues to avoid starving due to waiting on semlocks
    for each_article in reader:
        count += 1
        #queues_and_sizes=[(task_queues[x].qsize(), x)
        #                  for x in xrange(num_processes)]
        #queues_and_sizes.sort()
        #target_process=queues_and_sizes[0][1]
        # logging.info("Dispatching article %d: %r", count, each_article)
        target_process = (count - 1) % num_processes
        #Lowest-loaded process first.
        logging.info("Dispatching article %d: %s to %s", count,
                     each_article.set_id, processes[target_process][0].name)
        task_queues[target_process].put(each_article)
        #task_queue[target_process].put(each_article)
        #task_queue.put(each_article)
        #logging.info("The task queue is approximately %d items long.",
        #             task_queue.qsize())

    logging.log(ULTRADEBUG, "Waiting for processing to end.")
    all_results = {}

    alive_processes = [x for x in processes if x[0].is_alive()]
    remaining_processes = len(alive_processes)

    logging.info("There are %d processes (out of %d) still alive.",
                 remaining_processes, num_processes)
    for i in xrange(remaining_processes):
        alive_processes[i][2].put('STOP')
        alive_processes[i][2].close()
    logging.debug("Sent STOP requests. Notifying queue that no further "
                  "requests will come.")

    logging.info("All information sent to the processors.")

    # Back to normal
    if performance_tuning:
        gc.set_threshold(original_threshold[0], original_threshold[1],
                         original_threshold[2])
        sys.setcheckinterval(original_check_interval)

    # Note end of output

    while len(processes) > 0:
        a_process = processes.pop()
        # We join the process to wait for the end of the reading
        a_process[0].join()
        # logging.log(ULTRADEBUG, "Fetching results from finished process.")
        # all_results.update(a_process[1].get()) # Add results to result pool
        # logging.log(ULTRADEBUG, "Received results.")
    logging.info("Finishing writing out results.")
    this_output_queue.put("STOP")
    output_processor.join()
    logging.info("Results written. Finishing multiprocessing.")
    return
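The performance_tuning branch boils down to a save / scale / restore pattern around the hot dispatch loop. A standalone sketch is below, written in the Python 2 style of the examples; sys.getcheckinterval and sys.setcheckinterval are deprecated and later removed in Python 3, where sys.setswitchinterval is the closest knob.

import gc
import sys

# Save / scale / restore sketch of the tuning used above: collect the older
# gc generations less often and switch between threads less frequently.
original_threshold = gc.get_threshold()
original_check_interval = sys.getcheckinterval()
gc.set_threshold(10 * original_threshold[0],
                 5 * original_threshold[1],
                 5 * original_threshold[2])
sys.setcheckinterval(5 * original_check_interval)
try:
    pass  # ... read and dispatch the articles here ...
finally:
    gc.set_threshold(*original_threshold)
    sys.setcheckinterval(original_check_interval)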