def run(sc, TH, infile, outfile):
    rdd = sc.textFile(infile)
    first_line = rdd.first()
    data = rdd.filter(lambda line: line != first_line)
    uid = data.map(lambda line: (line.split(',')[0], line.split(',')[1])) \
        .groupByKey() \
        .mapValues(lambda x: sorted(list(x))) \
        .collectAsMap()
    cand_pairs = list(itertools.combinations(list(uid.keys()), 2))
    edge, vertex = list(), set()
    for pair in cand_pairs:
        if len(set(uid[pair[0]]).intersection(set(uid[pair[1]]))) >= TH:
            edge.append(tuple((pair[0], pair[1])))
            edge.append(tuple((pair[1], pair[0])))
            vertex.add(pair[0])
            vertex.add(pair[1])
    graph = GraphFrame(sc.parallelize(list(vertex)).map(lambda uid: (uid,)).toDF(['id']),
                       sc.parallelize(edge).toDF(["src", "dst"]))
    communities = graph.labelPropagation(maxIter=5)
    communities = communities.rdd.coalesce(1) \
        .map(lambda idx_label: (idx_label[1], idx_label[0])) \
        .groupByKey() \
        .map(lambda label_idxes: sorted(list(label_idxes[1]))) \
        .sortBy(lambda idxes: (len(idxes), idxes)) \
        .collect()
    Task1.toFile(outfile, communities)

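# Hedged sketch (assumption, not part of the snippet above): Task1.toFile is referenced
# but not defined here. Presumably it is a helper in the Task1 module that writes one
# community per line as quoted, comma-separated user ids, roughly like this:
def toFile(outfile, communities):
    with open(outfile, "w") as f:
        for community in communities:
            f.write(", ".join("'{}'".format(uid) for uid in community) + "\n")
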
def main(argv):
    filter_threshold = int(argv[1])
    input_file_path = argv[2]
    output_file_path = argv[3]
    # os.environ["PYSPARK_SUBMIT_ARGS"] = ("--packages graphframes:graphframes:0.6.0-spark2.4-s_2.11")
    # filter_threshold = 7
    # input_file_path = "/Users/zhijunliao/Marks/USC/INF-553/HW/INF553HW4/data/ub_sample_data.csv"
    # output_file_path = "/Users/zhijunliao/Marks/USC/INF-553/HW/INF553HW4/output/task1.txt"
    # 38648 records
    total_start = time.time()
    start = time.time()
    input_data = sc.textFile(input_file_path).\
        filter(lambda line: "user_id" not in line).\
        map(lambda line: tuple(line.split(","))).\
        groupByKey().\
        mapValues(set).\
        persist()  # 3374
    edges = input_data.\
        cartesian(input_data).\
        filter(lambda pair: pair[0][0] < pair[1][0]).\
        filter(lambda pair: len(pair[0][1].intersection(pair[1][1])) >= filter_threshold).\
        flatMap(lambda pair: [(pair[0][0], pair[1][0]), (pair[1][0], pair[0][0])]).\
        persist()  # 996 498
    edges_df = edges.map(lambda pair: Row(src=pair[0], dst=pair[1])).toDF()
    vertices = edges.flatMap(lambda _: _).distinct().persist()  # 222
    vertices_df = vertices.map(Row("id")).toDF()
    print("finish building edges and vertices:", time.time() - start)
    start = time.time()
    graph = GraphFrame(vertices_df, edges_df)
    result = graph.labelPropagation(maxIter=5)
    print("finish running LPA:", time.time() - start)
    # result.count()  # 222
    # result.show()
    result_rdd = result.rdd.\
        map(lambda pair: (pair['label'], pair['id'])).\
        groupByKey().\
        mapValues(lambda values: (sorted(list(values)), len(values))).\
        persist()
    result_collection = result_rdd.collect()
    result_collection.sort(key=lambda kv: (kv[1][1], kv[1][0][0]))
    with open(output_file_path, "w") as output_file:
        for community_id, (user_list, length) in result_collection:
            output_file.write(f"'{user_list[0]}'")
            for user in user_list[1:]:
                output_file.write(f", '{user}'")
            output_file.write("\n")
    print("total running time:", time.time() - total_start)

def lpa(self, graph, iter):
    print("Community Detection\t1\tInitializing Algorithm", flush=True)
    edges = graph.get_df()
    vertices = edges.select('src').union(
        edges.select('dst')).distinct().withColumnRenamed('src', 'id')
    print("Community Detection\t2\tExecuting Label Propagation Algorithm", flush=True)
    graph = GraphFrame(vertices, edges)
    result = graph.labelPropagation(maxIter=iter)
    return result.orderBy('label', ascending=True).withColumnRenamed(
        'label', 'Community')

def main(argv):
    assert len(argv) == 3, "Script takes 3 arguments <filter_threshold><input_file><community_output_file>"
    filter_threshold, input_file, output_file = argv
    filter_threshold = int(filter_threshold)
    config = SparkConf().setMaster("local[*]") \
        .setAppName("Task2") \
        .set("spark.executor.memory", "4g") \
        .set("spark.driver.memory", "4g")
    sc = SparkContext.getOrCreate(conf=config)
    spark = SparkSession(sc)
    sc.setLogLevel("ERROR")
    lines = sc.textFile(input_file)
    header = lines.first()
    rdd_dict = lines.filter(lambda x: x != header) \
        .map(lambda x: (x.split(',')[0], x.split(',')[1])) \
        .groupByKey().collectAsMap()
    user_pairs = list(combinations(rdd_dict.keys(), 2))
    edges_rdd = sc.parallelize(user_pairs) \
        .map(lambda x: (x[0], x[1])) \
        .filter(lambda x: get_intersection(rdd_dict[x[0]], rdd_dict[x[1]]) >= filter_threshold) \
        .cache()
    nodes_df = edges_rdd.flatMap(lambda x: x).distinct().map(
        lambda x: (x, )).toDF(["id"])
    edges_df = edges_rdd.toDF(["src", "dst"])
    gf = GraphFrame(nodes_df, edges_df)
    communities_rdd = gf.labelPropagation(maxIter=5).rdd.coalesce(1)
    communities = communities_rdd.map(lambda x: (x[1], x[0])) \
        .groupByKey() \
        .map(lambda x: sorted(list(x[1]))) \
        .sortBy(lambda x: (len(x), x)) \
        .collect()
    with open(output_file, "w+") as file:
        for community in communities:
            value = str(community)[1:-1]
            file.writelines(value + "\n")

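# Hedged sketch (assumption): get_intersection is not defined in the snippet above;
# presumably it returns the number of businesses two users have in common.
def get_intersection(businesses_a, businesses_b):
    return len(set(businesses_a) & set(businesses_b))
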
class LPA():
    def __init__(self):
        self.spark = SparkSession \
            .builder \
            .appName('Example_2') \
            .getOrCreate()

    def graphx(self):
        self.df = self.spark.read.option("header", "true").csv('results_new/data-00000-of-00010.csv')
        # print(self.df.show(n=5))
        self.df = self.df.dropna()
        self.rdd = self.df.select("url", "mention").rdd.flatMap(lambda x: x).distinct()
        # print(self.rdd.take(5))

        def hashnode(x):
            return hashlib.sha1(x.encode("UTF-8")).hexdigest()[:8]

        hashnode_udf = udf(hashnode)
        vertices = self.rdd.map(lambda x: (hashnode(x), x)).toDF(["id", "url"])
        vertices.show(5)
        edges = self.df.select("url", "mention") \
            .withColumn("src", hashnode_udf("url")) \
            .withColumn("dst", hashnode_udf("mention")) \
            .select("src", "dst")
        edges.show(5)
        self.graph = GraphFrame(vertices, edges)
        # print(self.graph)
        print('communities are ')
        self.communities = self.graph.labelPropagation(maxIter=2)
        print(self.communities.persist().show(10))
        print(self.communities.sort(desc("label")).show(50))
        self.communities.coalesce(1).write.format("com.databricks.spark.csv").option("header", "true").save("communities")
        print("There are " + str(self.communities.select('label').distinct().count()) +
              " communities in sample graph.")
        print(self.graph.inDegrees.join(vertices, on="id")
              .orderBy("inDegree", ascending=False).show(10))
        print(self.graph.stronglyConnectedComponents(maxIter=2).select('url', 'component').show(20))

edgelist = []
verticeSet = set()
for x in combinations(dic.keys(), 2):
    if len(dic[x[0]].intersection(dic[x[1]])) >= threshold:
        edgelist.append(x)
        edgelist.append((x[1], x[0]))
        verticeSet.add(x[0])
        verticeSet.add(x[1])
verticelist = list(combinations(verticeSet, 1))
sqlContext = SQLContext(sc)
vertices = sqlContext.createDataFrame(verticelist, ["id"])
edges = sqlContext.createDataFrame(edgelist, ["src", "dst"])
g = GraphFrame(vertices, edges)
labeled = g.labelPropagation(maxIter=5)
resRDD = labeled.rdd.map(lambda x: (x['label'], [x['id']]))\
    .reduceByKey(lambda x, y: x + y)\
    .map(lambda x: (len(x[1]), [sorted(x[1])]))\
    .reduceByKey(lambda x, y: x + y)\
    .map(lambda x: (x[0], sorted(x[1])))\
    .sortByKey()
f = open(output, "w")
for x in resRDD.collect():
    communities = x[1]
    for community in communities:
        f.write(str(community)[1:-1] + '\n')
f.close()
print("runtime", time.time() - curr, "s")

start = time.time()
threshold = int(sys.argv[1])
inputfile = sys.argv[2]
outputfile = sys.argv[3]
sc = SparkContext(master="local[3]")
sc.setLogLevel("WARN")
sql_sc = SQLContext(sc)
# sql_sc.sql("set spark.sql.shuffle.partitions=200")
data = sc.textFile(inputfile).map(lambda x: x.split(',')).filter(
    lambda x: x[0] != 'user_id')
user_pairs = data.map(lambda x: [x[1], x[0]]).groupByKey().mapValues(
    sorted).mapValues(lambda x: combinations(x, 2)).flatMap(
    lambda x: x[1]).flatMap(lambda x: [[x, 1], [x[::-1], 1]]).reduceByKey(
    add).map(lambda x: [x[0][0], x[0][1], x[1]]).filter(
    lambda x: x[2] >= threshold)
users = user_pairs.flatMap(lambda x: x[:2]).distinct().map(lambda x: [x])
print(users.count())
vertices = sql_sc.createDataFrame(users, ['id'])
edges = sql_sc.createDataFrame(user_pairs, ["src", "dst", "intersection"])
graph = GraphFrame(vertices, edges)
result = graph.labelPropagation(maxIter=5).select(
    'id', 'label').rdd.map(lambda x: [x[1], x[0]]).groupByKey().map(
    lambda x: sorted(list(x[1]))).sortBy(lambda x: x[0]).sortBy(len).collect()
file = open(outputfile, 'w')
for item in result:
    file.write("'" + "', '".join(item) + "'")
    file.write('\n')
print("Duration: ", time.time() - start)

output_path = sys.argv[2]
os.environ['PYSPARK_SUBMIT_ARGS'] = (
    '--packages graphframes:graphframes:0.6.0-spark2.3-s_2.11')
sc = SparkContext('local[*]', 'task1')
sc.setLogLevel('OFF')
start = time.time()
vertices, edges = read_file(input_path)
sqlc = SQLContext(sc)
vertices = sqlc.createDataFrame(vertices, ['id'])
edges = sqlc.createDataFrame(edges, ['src', 'dst'])
graph = GraphFrame(vertices, edges)
community = graph.labelPropagation(maxIter=5)
communityRDD = community.rdd.map(
    lambda x: (x['label'], '\'{}\''.format(x['id']))).groupByKey()
communityRDD = communityRDD.map(lambda x: sorted(x[1])).sortBy(
    lambda x: (len(x), x[0]))
result = communityRDD.collect()
with open(output_path, 'w') as f:
    for group in result:
        f.write(', '.join(group))
        f.write('\n')
print('Duration: {}'.format(time.time() - start))

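# Hedged sketch (assumption): read_file is not shown in the snippet above. Presumably it
# parses the user_id,business_id CSV and returns ([(user,), ...], [(src, dst), ...]),
# adding both directed edges whenever two users share at least `threshold` businesses.
# The threshold source and header handling are assumptions here.
import itertools

def read_file(path, threshold=7):
    user_businesses = {}
    with open(path) as fin:
        next(fin)  # skip the header line
        for line in fin:
            user, business = line.strip().split(',')
            user_businesses.setdefault(user, set()).add(business)
    vertices, edges = set(), []
    for u1, u2 in itertools.combinations(user_businesses, 2):
        if len(user_businesses[u1] & user_businesses[u2]) >= threshold:
            edges += [(u1, u2), (u2, u1)]
            vertices.update([u1, u2])
    return [(v,) for v in vertices], edges
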
sourceRdd = sc.textFile(input_file)
first_line = sourceRdd.first()
uid_bid = sourceRdd.filter(lambda line: line != first_line).distinct().map(lambda tup: tup.split(','))
gen_pair = uid_bid.map(lambda line: (line[1], [line[0]])).reduceByKey(add).flatMap(
    lambda list_pair: make_pair(list_pair[1]))
freq_pair = gen_pair.map(lambda line: (line, 1)).reduceByKey(add).filter(
    lambda a: a[1] >= filter_threshold).keys()
edges = freq_pair.flatMap(lambda line: ((line[0], line[1]), (line[1], line[0])))
# createDataFrame needs a tuple as input
vertices = freq_pair.flatMap(lambda line: line).distinct().map(lambda line: (line, ))
sql_ctx = SQLContext(sc)
v_df = sql_ctx.createDataFrame(vertices, ["id"])
e_df = sql_ctx.createDataFrame(edges, ["src", "dst"])
graph_f = GraphFrame(v_df, e_df)
label_prop = graph_f.labelPropagation(maxIter=5)
result_community = label_prop.rdd.coalesce(1).map(lambda id_label: (id_label[1], id_label[0])).groupByKey()\
    .map(lambda communities: sorted(list(communities[1]))).sortBy(lambda comms: (len(comms), comms))
dumpResult(output_file, result_community)
end = time.time()
print("Duration:", end - start)

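# Hedged sketches (assumptions): make_pair and dumpResult are referenced above but not
# defined. make_pair presumably emits every sorted pair of users who reviewed the same
# business, and dumpResult presumably collects the RDD and writes one community per line.
from itertools import combinations

def make_pair(users):
    return [tuple(sorted(pair)) for pair in combinations(users, 2)]

def dumpResult(path, community_rdd):
    with open(path, "w") as f:
        for community in community_rdd.collect():
            f.write(", ".join("'{}'".format(uid) for uid in community) + "\n")
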
edge_list = []
vertex_list = []
for each in user_pair:
    edge_list.append(tuple([each[0], each[1]]))
    edge_list.append(tuple([each[1], each[0]]))
    vertex_list.append(each[0])
    vertex_list.append(each[1])
# remove duplicate vertices
vertex_list = list(set(vertex_list))
edge = sc.parallelize(edge_list).toDF(["src", "dst"])
vertex = sc.parallelize(vertex_list).map(lambda s: Row(id=s)).toDF(['id'])
graph = GraphFrame(vertex, edge)
community = graph.labelPropagation(maxIter=5).rdd.map(lambda s: (s[1], s[0])).groupByKey()\
    .map(lambda s: list(sorted(s[1]))).collect()
sorted_community = sorted(community, key=lambda s: (len(s), min(s)))
with open(community_output_file_path, 'w') as output_file:
    for i in sorted_community:
        output_file.writelines(str(i)[1:-1] + "\n")
end = time.time()
print("Duration: %s Seconds" % (end - start))

input_lines = sc.textFile(input_file_path).map(lambda x: x.split(',')).map(
    lambda x: (x[0], x[1])).filter(lambda x: x[0] != "user_id").groupByKey(
    ).mapValues(lambda x: list(x))
ub_dict = input_lines.collectAsMap()
edges = []
points = set()
for x in list(itertools.combinations(ub_dict.keys(), 2)):
    if len(set(ub_dict[x[0]]).intersection(set(
            ub_dict[x[1]]))) >= int(filter_threshold):
        edges.append(x)
        edges.append((x[1], x[0]))
        points.add(x[0])
        points.add(x[1])
points_df = sc.parallelize(list(points)).map(lambda x: (x, ))
points_df = ss.createDataFrame(points_df, ['id'])
edges_df = sc.parallelize(edges)
edges_df = ss.createDataFrame(edges_df, schema=['src', 'dst'])
graph = GraphFrame(points_df, edges_df)
lpa_graph = graph.labelPropagation(maxIter=5)
communities = lpa_graph.rdd.map(lambda x: (x[1], x[0])).groupByKey().map(
    lambda x: sorted(list(x[1]))).sortBy(lambda x: (len(x), x))
result = communities.collect()
# output
with open(output_file_path, 'w+') as output_file:
    for line in result:
        output_file.writelines(str(line)[1:-1] + "\n")
print('Duration:', (time.time() - start_time))

os.environ["PYSPARK_SUBMIT_ARGS"] = ( "--packages graphframes:graphframes:0.6.0-spark2.3-s_2.11") filter_threshold = 7 fpath = sys.argv[1] output_path = sys.argv[2] start = time.time() rdd = sc.textFile(fpath).filter(lambda x: x != 'user_id,business_id') \ .map(lambda x: (x.split(',')[0], x.split(',')[1])) \ .groupByKey() \ .map(lambda x: (x[0], list(x[1]))).persist() uid_lst = rdd.map(lambda x: x[0]).collect() bid_lst_info = rdd.collectAsMap() edges = rdd.flatMap(count_common).distinct().map(lambda x: (x[0][0], x[0][1])).toDF(["src", "dst"]) v = rdd.map(lambda x: (x[0],)).toDF(['id']) gf = GraphFrame(v, edges).dropIsolatedVertices() result = gf.labelPropagation(maxIter=5) ans_rdd = result.rdd.coalesce(4).map(tuple) ans = ans_rdd.map(lambda x: (x[1], x[0])).groupByKey().mapValues(list).mapValues(sorted).sortBy( lambda x: len(x[1])).map(lambda x: x[1]).collect() #export answers write_files(output_path, ans) end = time.time() duration = end - start print(f'finished in {duration} seconds!')
# is built as an instance of a GraphFrame, which is a pair of
# vertices (as `v`) and edges (as `e`):
graph = GraphFrame(v, e)
print("graph=", graph)
# GraphFrame(v:[id: string, name: string ... 1 more field],
#            e:[src: string, dst: string ... 1 more field])

# ==================================
# Label Propagation Algorithm (LPA)
# ==================================
# Run static Label Propagation Algorithm for detecting
# communities in networks.
# Each node in the network is initially assigned to
# its own community. At every superstep, nodes send
# their community affiliation to all neighbors and
# update their state to the mode community affiliation
# of incoming messages.
#
# LPA is a standard community detection algorithm for
# graphs. It is very inexpensive computationally, although
# (1) convergence is not guaranteed and
# (2) one can end up with trivial solutions
#     (all nodes are identified into a single community).
#
result = graph.labelPropagation(maxIter=5)
result.show(truncate=False)
result.select("id", "label").show(truncate=False)

# done!
spark.stop()

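# Hedged sketch (assumption): `v` and `e` are referenced in the excerpt above but not
# defined there. They are presumably built along the lines of the standard GraphFrames
# "friends" example; a minimal, self-contained version might look like this:
from pyspark.sql import SparkSession
from graphframes import GraphFrame

spark = SparkSession.builder.appName("lpa-example").getOrCreate()
v = spark.createDataFrame(
    [("a", "Alice", 34), ("b", "Bob", 36), ("c", "Charlie", 30)],
    ["id", "name", "age"])
e = spark.createDataFrame(
    [("a", "b", "friend"), ("b", "c", "follow"), ("c", "b", "follow")],
    ["src", "dst", "relationship"])
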
    .map(lambda x: tuple([x])) \
    .distinct()
nodes_df = nodes.toDF(["id"])
# nodes_df.show()
# get all possible edges
edges_df = user_pairs.union(user_pairs.map(lambda x: (x[1], x[0]))).toDF(
    ["src", "dst"])
# edges_df.show()
# create graph
community_graph = GraphFrame(nodes_df, edges_df)
# get labels
result = community_graph.labelPropagation(maxIter=5)
# result.show()
# group communities
detected_communities = result \
    .rdd \
    .map(tuple) \
    .map(lambda x: (x[1], x[0])) \
    .groupByKey() \
    .map(lambda x: sorted(list(x[1]))) \
    .sortBy(lambda x: (len(x), x[0]))
detected_communities_collection = detected_communities.collect()
write_results(detected_communities_collection, argv[3])

## Create the mapping.
graphVertices = (rawData.map(lambda f: (f[0])) +
                 rawData.map(lambda f: (f[1]))).distinct().map(lambda x: Row(id=x)).collect()
pprint(graphVertices)
graphEdges = rawData.flatMap(lambda f: ((f[0], f[1]), (f[1], f[0]))).collect()

## Create the vertices.
vertices = sqlContext.createDataFrame(graphVertices, ['id'])
## Create the edges.
edges = sqlContext.createDataFrame(graphEdges, ["src", "dst"])
## Construct the graph.
g = GraphFrame(vertices, edges)
## Run the label propagation algorithm.
result = g.labelPropagation(maxIter=5).collect()

resultDict = {}
for i in result:
    label = i["label"]
    if label not in resultDict.keys():
        resultDict[label] = []
    resultDict[label].append(str(i["id"]))
ordered_keys = sorted(resultDict, key=lambda k: (len(resultDict[k]), min(resultDict[k])))
with open(outfilePath, "w") as f:
    for key in ordered_keys:
        temp = sorted(resultDict[key])
        f.write("'" + "', '".join(temp) + "'\n")
tockTime = time.time()

input_file = sys.argv[1]
output_file = sys.argv[2]
# input_file = 'dataset/power_input.txt'
# output_file = 'output/task1.csv'
conf = SparkConf().setAppName("INF553").setMaster('local[*]')
sc = SparkContext(conf=conf)
sc.setLogLevel("ERROR")
sqlContext = SQLContext(sc)
input_data = sc.textFile(input_file)
input_data = input_data.map(lambda x: x.split(" "))
vertices_src = input_data.map(lambda x: x[0]).persist()
vertices_dst = input_data.map(lambda x: x[1]).persist()
vertices = sc.union([vertices_src, vertices_dst]).distinct().map(lambda x: Row(x))
edges_forward = input_data.map(lambda x: (x[0], x[1])).persist()
edges_backward = input_data.map(lambda x: (x[1], x[0])).persist()
edges = sc.union([edges_forward, edges_backward]).distinct()
vertices = sqlContext.createDataFrame(vertices, ["id"])
edges = sqlContext.createDataFrame(edges, ["src", "dst"])
gf = GraphFrame(vertices, edges)
lpa_df = gf.labelPropagation(maxIter=5)
output = lpa_df.rdd.map(lambda x: (x[1], x[0])).groupByKey().mapValues(lambda x: sorted(list(x))) \
    .sortBy(lambda x: (len(x[1]), x[1])).map(lambda x: tuple(x[1])).collect()
write_to_file(output_file, output)
print("Duration:", time.time() - start_time)

g.edges.show()
result_df_political = sqlContext.read.parquet("result.df.f_political")
# result_df_political.show()

def get_groups(row):
    return row[0]

min_length = 9999999
iteration_min_length = 100
yeah_good_length = False
for i in range(7, 15):
    result_df = g.labelPropagation(maxIter=i)
    clusters_result = result_df_political.join(result_df, result_df_political["identifier"] == result_df["id"])
    clusters_result = clusters_result.drop("id")
    clusters_result_groups = clusters_result.groupBy("label").count().sort(col("label")).rdd.map(get_groups).collect()
    clusters_result.groupBy("label").count().sort(col("label")).show()
    users_by_groups_7 = {}
    groups_length = 0
    for group in clusters_result_groups:
        groups_length += 1
    print("Length:\n" + str(groups_length))
    print("Iteration:" + str(i))
    result_df.show()
    if min_length > groups_length:

    lambda x: (str(x[0]), str(x[1]))).collect()
edges2_rdd = data.map(lambda x: x.split(' ')).map(
    lambda x: (str(x[1]), str(x[0]))).collect()
edges1 = edges1_rdd + edges2_rdd
edges_rdd = sc.parallelize(edges1)
vertices = sqlContext.createDataFrame(vertices_rdd, ["id"])
edges = sqlContext.createDataFrame(edges_rdd, ["src", "dst"])
# add edges and vertices to the graph
g = GraphFrame(vertices, edges)
# print(g)
# call the algorithm to detect communities
result = g.labelPropagation(maxIter=5).rdd.map(
    lambda x: (x[1], str(x[0]))).groupByKey().map(lambda x: sorted(x[1]))
sorted_graph_list = result.collect()
sorted_graph_list.sort()
sorted_graph_list.sort(key=len)
# print(sorted_graph_list)
f = open(output_file_name, 'w')
for i in sorted_graph_list:
    s = str(i).replace("[", "").replace("]", "")
    f.write(str(s))
    f.write("\n")
f.close()

# |  a|  Alice| 34|
# |  b|    Bob| 36|
# |  c|Charlie| 30|
# |  d|  David| 29|
# |  e| Esther| 32|
# |  f|  Fanny| 36|
# |  g|  Gabby| 60|
# +---+-------+---+

g.edges.show(5)
# +---+---+------------+
# |src|dst|relationship|
# +---+---+------------+
# |  a|  b|      friend|
# |  b|  c|      follow|
# |  c|  b|      follow|
# |  f|  c|      follow|
# |  e|  f|      follow|
# |  e|  d|      friend|
# |  d|  a|      friend|
# |  a|  e|      friend|
# +---+---+------------+

# Get a DataFrame with columns "id" and "inDegree" (in-degree)
vertexInDegrees = g.inDegrees
vertexInDegrees.show(5)

# Run LPA
communities = g.labelPropagation(maxIter=5)
communities.persist().show(10)

user_distinct = graph_rdd.map(lambda line: (line[0], line[1])) \
    .flatMap(lambda x: x) \
    .distinct() \
    .map(lambda x: [x])
graph_with_reverse = graph_rdd.map(lambda line: ([line[0], line[1]], [line[1], line[0]])) \
    .flatMap(lambda x: x)
vertex_df = spark_sql.createDataFrame(user_distinct, ["id"]).coalesce(6)
edge_df = spark_sql.createDataFrame(graph_with_reverse, ["src", "dst"]).coalesce(6)
time_here = time.time()
g = GraphFrame(vertex_df, edge_df)
res_df = g.labelPropagation(maxIter=5)
# res_df.show()
print("Duration for graph propagation:", time.time() - time_here)
res_rdd = res_df.coalesce(1).rdd.map(tuple)
res_list = res_rdd.map(lambda pair: (pair[1], pair[0])) \
    .groupByKey() \
    .map(lambda line: sorted(list(line[1]))) \
    .sortBy(lambda x: [len(x), x]) \
    .map(lambda x: ", ".join(x)) \
    .collect()
convert_to_file(OUTPUT_PATH, res_list)
print("Duration: ", time.time() - start_time)

sqlContext = SQLContext(sc)
sc.setLogLevel("ERROR")
sample_text = sc.textFile(sys.argv[2]).map(lambda x: x.split(','))
sample_first = sample_text.take(1)
sample_text = sample_text.filter(lambda x: x != sample_first[0]) \
    .map(lambda x: (x[1], [x[0]])) \
    .reduceByKey(lambda x, y: x + y)
#edges = sample_text.map(lambda x : (x[0],[x[1])])).reduceByKey(lambda x,y : x+y).map(lambda x :(1,[x])).reduceByKey(lambda x,y : x+y).flatMap(lambda x : itertools.combinations(x[1],2)).filter(lambda x : len(set(x[0][1]).intersection(set(x[1][1]))) >= 7).map(lambda x : (x[0][0].encode('UTF-8'),x[1][0].encode('UTF-8'))).distinct().map(lambda x : [x[0],x[1]])
t = int(sys.argv[1])
edges = sample_text.flatMap(lambda x: ([x[0], y] for y in list(itertools.permutations(x[1], 2)))) \
    .filter(lambda x: len(set(x[1])) != 1) \
    .map(lambda x: (x[1], [x[0]])) \
    .reduceByKey(lambda x, y: x + y) \
    .filter(lambda x: len(set(x[1])) >= t) \
    .map(lambda x: (x[0][0], x[0][1])) \
    .distinct() \
    .map(lambda x: [x[0], x[1]])
edgesDF = sqlContext.createDataFrame(edges, ["src", "dst"])
vertices = edges.flatMap(lambda x: ((x[0]), (x[1]))).distinct().map(lambda x: [x])
verticesDF = sqlContext.createDataFrame(vertices, ["id"])
g = GraphFrame(verticesDF, edgesDF)
comm = g.labelPropagation(maxIter=5).select('label', 'id')
comm_rdd = comm.rdd.map(lambda x: (x[0], [x[1]])) \
    .reduceByKey(lambda x, y: x + y) \
    .sortBy(lambda x: (len(x[1]), sorted(x[1]))) \
    .collect()
f = open(sys.argv[3], "w")
for community in comm_rdd:
    print(community)
    f.write(str(sorted(community[1])).strip("[]"))
    f.write("\n")
f.close()
sc.stop()
print("Duration : " + str(time.time() - start))

edge_list = list()
vertex_set = set()
for pair in uid_pairs:
    if len(
            set(uid_bidxes_dict[pair[0]]).intersection(
                set(uid_bidxes_dict[pair[1]]))) >= int(filter_threshold):
        edge_list.append(tuple(pair))
        edge_list.append(tuple((pair[1], pair[0])))
        vertex_set.add(pair[0])
        vertex_set.add(pair[1])
# vertex_df = vertex_rdd.toDF(["id"]).write.csv('vertex.csv')
# edge_df = edge_rdd.toDF(["src", "dst"]).write.csv('edge.csv')
vertex_df = sc.parallelize(list(vertex_set)).map(lambda uid: (uid, )).toDF(['id'])
edge_df = sc.parallelize(edge_list).toDF(["src", "dst"])
graph_frame = GraphFrame(vertex_df, edge_df)
communities = graph_frame.labelPropagation(maxIter=5)
communities_rdd = communities.rdd.coalesce(1) \
    .map(lambda idx_label: (idx_label[1], idx_label[0])) \
    .groupByKey().map(lambda label_idxes: sorted(list(label_idxes[1]))) \
    .sortBy(lambda idxes: (len(idxes), idxes))
# export your finding
export2File(communities_rdd.collect(), output_file_path)
print("Duration: %d s." % (time.time() - start))

    return False

new_rddr = user_rdd.cartesian(user_rdd).filter(lambda a: a[0] < a[1]).filter(
    lambda a: check3(user_busi_dict[a[0]], user_busi_dict[a[1]]))
user = new_rddr.map(lambda a: list([a[0], a[1]])).flatMap(
    lambda a: a).distinct()
user_df = user.map(lambda x: (x, )).toDF(["id"])
edge_df = new_rddr.toDF(["src", "dst"])
# print(user.take(10))
# edge_df.show()
# user_df.show()
g = GraphFrame(user_df, edge_df)
# print(g)
result = g.labelPropagation(maxIter=5).persist()
# result.show()
res = result.select(
    "id", "label").rdd.map(tuple).map(lambda a: (a[1], a[0])).groupByKey().map(
    lambda a: (a[0], tuple(sorted(list(a[1]))))).map(lambda a: (a[1][0], a[
        1])).sortByKey().map(lambda a: (len(a[1]), a[1])).sortByKey().map(
        lambda a: a[1]).collect()
# print(res.take(50))
filename = out_path
with open(filename, 'w') as zaili:
    for a in res:
        l = len(a)
        for i in range(l):
            if i == l - 1:
                zaili.write(str(a[i]))
            else:

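# Hedged sketch (assumption): only the tail of the check3 helper appears at the top of
# the snippet above. Presumably it tests whether two users share at least `threshold`
# businesses (the threshold value here is an assumption):
def check3(businesses_a, businesses_b, threshold=7):
    if len(set(businesses_a) & set(businesses_b)) >= threshold:
        return True
    return False
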
e = sqlContext.createDataFrame(edges, ["src", "dst", "relationship"])

# Create a GraphFrame
print("Vertex: " + str(len(vertex)))
print("Edges: " + str(len(edges)))
g = GraphFrame(v, e)
g.vertices.show()
g.edges.show()

def get_groups(row):
    return row[0]

result_df_7 = g.labelPropagation(maxIter=7)
result_df_7.show()
groups_by_7 = result_df_7.groupBy("label").count().sort(
    col("label")).rdd.map(get_groups).collect()
result_df_7.groupBy("label").count().sort(col("label")).show()
groups_length = 0
for group in groups_by_7:
    groups_length += 1
print("Length:\n" + str(groups_length))
"""result_df_7.write.parquet("result.df.7")
result_df_6 = g.labelPropagation(maxIter=6)
result_df_6.show()
result_df_6.write.parquet("result.df.6")