def calculate(path_data, path_artists, talky=False):
    """
    Reads all connected components from the given dataset and computes the
    measures for each of them. 'calculate_connected_component' is called for
    every component - see that method for documentation.
    """
    top_artists = get_top_artists(path_artists)
    for i, graph in enumerate(iterator.components(path_data)):
        calculate_connected_component(i, graph, top_artists, talky)
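
# A minimal usage sketch (the file paths below are placeholders, not files
# that ship with this project): run the sequential variant over a dataset of
# connected components and a file listing the top artists.
#
#     calculate("components.tsv", "top_artists.txt", talky=True)
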
def calculate_concurrent(path_data, path_artists, num_threads=4, talky=False):
    """
    Same as `calculate`, but uses multiple threads to accelerate the
    computation. Threading is only applied to the calculation of the
    measures - the input data is still read sequentially.

    Note: this consumes even more memory than `calculate`.
    Further note: the real bottleneck seems to be reading the data from disk.
    """
    from threading import Thread
    from Queue import Queue
    from sys import stdout
    import time

    def worker(worker_id):
        while True:
            index, graph = queue.get()
            # Filling the queue might take longer than processing it (due to
            # file reads), so mark the item as done right away; the queue is
            # not involved afterwards, so this is safe.
            queue.task_done()
            do_work(index, graph)
            status[worker_id] += 1

    def do_work(index, graph):
        calculate_connected_component(index, graph, top_artists)

    def print_stati():
        # fancy output looks ugly...
        if talky:
            # print the table head
            stdout.write(
                "Progress:" + "all".rjust(11) + " || "
                + " | ".join([("T%s" % (i + 1)).rjust(5) for i in range(num_threads)])
                + "\n"
            )
            while do_the_print:
                # reprint the table body
                stdout.write(
                    "\r" + str(sum(status)).rjust(20) + " || "
                    + " | ".join([str(i).rjust(5) for i in status])
                )
                stdout.flush()
                time.sleep(0.75)
            stdout.write("\n")

    num_threads = max(num_threads, 1)  # guard against zero or negative thread counts
    queue = Queue(maxsize=num_threads * 4)
    top_artists = get_top_artists(path_artists)
    status = [0 for i in range(num_threads)]
    do_the_print = True

    # create the workers
    for i in range(num_threads):
        t = Thread(target=worker, args=(i,))
        t.daemon = True
        t.start()

    # thread for the status output
    status_thread = Thread(target=print_stati)
    status_thread.daemon = True
    status_thread.start()

    # load the data and feed it to the workers
    for tupel in enumerate(iterator.components(path_data)):
        queue.put(tupel)

    # wait until all components have been processed
    queue.join()
    do_the_print = False
    status_thread.join()  # let it write a newline
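
# A minimal command-line entry point (a sketch, not part of the original
# module): it takes the graph dataset and the artist file as positional
# arguments, an optional thread count, and runs the threaded variant. The
# bounded queue inside `calculate_concurrent` (num_threads * 4 slots) keeps
# memory in check while the reader stays ahead of the workers.
if __name__ == "__main__":
    import sys
    if len(sys.argv) < 3:
        sys.exit("usage: %s <path_data> <path_artists> [num_threads]" % sys.argv[0])
    threads = int(sys.argv[3]) if len(sys.argv) > 3 else 4
    calculate_concurrent(sys.argv[1], sys.argv[2], num_threads=threads, talky=True)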