def run_task(user_tweets, nodes, nodes_params, detect_interval, directed):
    """Run one parallelized flow-detection pass for a given parameter set.

    Builds the flow RDD pipeline, aggregates flows, derives per-node
    weights, and writes both results as JSON text files under
    ``PATH_BASE/results``.

    Args:
        user_tweets: RDD of ``(user, tweets)`` pairs to detect flows from.
        nodes: Spark broadcast variable; ``nodes.value`` is passed to
            ``Flow.infer_flows`` (presumably the candidate node set —
            TODO confirm against Flow's API).
        nodes_params: Indexable with at least three elements; the first
            three are embedded in the output filenames.
        detect_interval: Detection interval forwarded to ``Flow.infer_flows``
            and embedded in the output filenames.
        directed: Whether flows are directed; forwarded to
            ``Flow.infer_flows`` and embedded in the output filenames.

    Returns:
        None. Side effect: two ``saveAsTextFile`` outputs
        (``flows_*.json`` and ``nodes_*.json`` directories).
    """
    # Detect tweets on each cluster machine.
    flows = user_tweets.flatMap(
        lambda x: Flow.infer_flows(x[0], x[1], nodes.value, detect_interval, directed))
    # Aggregate results for each Flow.
    agg_flows = flows.reduceByKey(Flow.reduce_flows_helper)
    # Build the final Flow objects from attributes.
    # cache(): final_flows feeds TWO actions below (its own save and the
    # weighted_nodes save); without caching, Spark would recompute the
    # entire infer_flows/reduceByKey lineage once per action.
    final_flows = agg_flows.map(lambda x: Flow.build_final_flows(x[0], x[1])).cache()
    # Generate node weights: each flow contributes its weight to both of
    # its endpoints, then weights are summed per node.
    weighted_nodes = final_flows.flatMap(
        lambda x: [(x.src, x.weight), (x.dst, x.weight)])
    weighted_nodes = weighted_nodes.reduceByKey(lambda a, b: a + b)
    weighted_nodes = weighted_nodes.map(lambda x: {'node': x[0], 'weight': x[1]})

    # Save the results; the parameter set is encoded in the filename so
    # different runs don't collide.
    filename = '_{}_{}_{}_{}_{}.json'.format(
        nodes_params[0], nodes_params[1], nodes_params[2], detect_interval, directed)

    def json_mapper(x):
        # Serialize records; non-JSON-native objects (Flow instances) are
        # rendered via their .json property.
        return json.dumps(x, default=lambda y: y.json)

    final_flows.map(json_mapper).saveAsTextFile(
        os.path.join(PATH_BASE, 'results', 'flows' + filename))
    weighted_nodes.map(json_mapper).saveAsTextFile(
        os.path.join(PATH_BASE, 'results', 'nodes' + filename))