Example #1
0
def calculate(path_data, path_artists, talky=False):
    """
    Run the measure computation sequentially over a whole dataset.

    The top artists are loaded once via `get_top_artists(path_artists)`.
    Then every connected component yielded by
    `iterator.components(path_data)` is handed, together with its running
    index (starting at 0), the top artists and the `talky` flag, to
    `calculate_connected_component` - see that function for details on the
    measures themselves.

    Returns nothing; all results are produced by
    `calculate_connected_component`.
    """
    artists = get_top_artists(path_artists)
    components = iterator.components(path_data)
    for number, component in enumerate(components):
        calculate_connected_component(number, component, artists, talky)
Example #2
0
def calculate_concurrent(path_data, path_artists, num_threads=4, talky=False):
    """
    Same as `calculate`, but uses multiple threads to accelerate the
    computation process. Threading is only applied to the calculation of the
    measures - the input data is still read sequentially by the calling
    thread and handed to the workers through a bounded queue.

    Parameters:
        path_data    -- passed to `iterator.components` to read the components.
        path_artists -- passed to `get_top_artists`.
        num_threads  -- number of worker threads; values below 1 are raised
                        to 1.
        talky        -- if true, a progress table (total and per-thread count
                        of processed components) is written to stdout.
                        Unlike in `calculate`, the flag is NOT forwarded to
                        `calculate_connected_component`.

    The function returns only after every queued component has been fully
    processed. If processing a component raises, the traceback is printed,
    the component is skipped and the worker carries on with the next one.

    Note: this is even more memory consuming than `calculate`.
    Further note: the real bottleneck seems to be reading the data from disk.
    """
    from sys import stdout
    from threading import Event, Thread
    import traceback

    try:
        from queue import Queue  # Python 3
    except ImportError:
        from Queue import Queue  # Python 2

    def worker(slot):
        while True:
            index, graph = work_queue.get()
            try:
                calculate_connected_component(index, graph, top_artists)
                # each worker only ever touches its own slot
                status[slot] += 1
            except Exception:
                # keep the worker alive: a dead worker could leave the
                # producer blocked forever on the bounded queue
                traceback.print_exc()
            finally:
                # mark the task done only AFTER the work, otherwise
                # work_queue.join() returns while components are still
                # being computed by these daemon threads
                work_queue.task_done()

    def status_row():
        return (
            "\r" + str(sum(status)).rjust(20) + "  || " + " | ".join([str(n).rjust(5) for n in status])
        )

    def print_stati():
        if not talky:
            return
        # print table head
        stdout.write(
            "Progress:"
            + "all".rjust(11)
            + "  || "
            + " | ".join([("T%s" % (i + 1)).rjust(5) for i in range(num_threads)])
            + "\n"
        )
        while not finished.is_set():
            # reprint table body
            stdout.write(status_row())
            stdout.flush()
            # wakes up early once `finished` is set
            finished.wait(0.75)
        # one last row so the table shows the final counts
        stdout.write(status_row())
        stdout.write("\n")
        stdout.flush()

    num_threads = max(num_threads, 1)  # guard against zero/negative values
    # bounded, so reading ahead cannot pile up components without limit
    work_queue = Queue(maxsize=num_threads * 4)
    top_artists = get_top_artists(path_artists)
    status = [0] * num_threads
    finished = Event()

    # create workers
    for slot in range(num_threads):
        t = Thread(target=worker, args=(slot,))
        t.daemon = True
        t.start()

    # for status information
    status_thread = Thread(target=print_stati)
    status_thread.daemon = True
    status_thread.start()

    try:
        # load data
        for item in enumerate(iterator.components(path_data)):
            work_queue.put(item)

        # wait until all queued components have been processed
        work_queue.join()
    finally:
        # stop the status output even if reading the data failed
        finished.set()
        status_thread.join()  # let it write the final row and a newline