# Example no. 1
def run_task(user_tweets, nodes, nodes_params, detect_interval, directed):
    """Run the distributed flow-detection pipeline for one parameter set.

    Args:
        user_tweets: RDD of (user_id, tweets) pairs — presumably keyed by
            user; verify against the caller.
        nodes: Spark broadcast variable holding the node set (read via
            ``nodes.value`` inside the workers).
        nodes_params: sequence whose first three items are embedded in the
            output filename to identify the run.
        detect_interval: detection window forwarded to ``Flow.infer_flows``.
        directed: whether flows are treated as directed.

    Side effects:
        Writes two JSON text datasets under ``PATH_BASE/results``:
        one for the final flows and one for the aggregated node weights.
    """
    # Detect candidate flows on each cluster machine.
    flows = user_tweets.flatMap(
        lambda x: Flow.infer_flows(x[0], x[1], nodes.value, detect_interval, directed))

    # Aggregate the partial attributes emitted for each flow key.
    agg_flows = flows.reduceByKey(Flow.reduce_flows_helper)

    # Build the final Flow objects from the aggregated attributes.
    final_flows = agg_flows.map(lambda x: Flow.build_final_flows(x[0], x[1]))

    # Node weights: each flow contributes its weight to both endpoints,
    # summed per node and emitted as JSON-ready dicts.
    weighted_nodes = (final_flows
                      .flatMap(lambda x: [(x.src, x.weight), (x.dst, x.weight)])
                      .reduceByKey(lambda a, b: a + b)
                      .map(lambda x: {'node': x[0], 'weight': x[1]}))

    # Filename suffix encodes the full parameter set of this run.
    filename = '_{}_{}_{}_{}_{}.json'.format(nodes_params[0],
                                             nodes_params[1],
                                             nodes_params[2],
                                             detect_interval,
                                             directed)

    # PEP 8 (E731): use a named function instead of assigning a lambda.
    def json_mapper(record):
        # NOTE(review): the fallback serializer assumes objects expose a
        # ``.json`` attribute (Flow apparently does) — confirm against
        # Flow's definition.
        return json.dumps(record, default=lambda obj: obj.json)

    # Persist both result sets as JSON text files.
    final_flows.map(json_mapper).saveAsTextFile(
        os.path.join(PATH_BASE, 'results', 'flows' + filename))
    weighted_nodes.map(json_mapper).saveAsTextFile(
        os.path.join(PATH_BASE, 'results', 'nodes' + filename))