def process_batch(batch_filename, batch_num):
    import graphframes
    print('processing', batch_filename)
    # Read the edge list for this batch; `spark` and `vertices` come from the enclosing scope
    edges = spark.read.csv(batch_filename, header=True)
    graph = graphframes.GraphFrame(vertices, edges)
    # Detect communities with label propagation and persist them as CSV
    communities = graph.labelPropagation(maxIter=5)
    comm_filename = 'data/models/comm_{}.csv'.format(batch_num)
    communities.toPandas().to_csv(comm_filename, header=True, index=False)
def createGraph(vertices, edges):
    '''
    Function:   Create graph
    Parameters: PySpark DataFrames - vertices, edges
    Returns:    GraphFrame
    '''
    print("Creating graph..")
    # Generate the graph
    graph = GF.GraphFrame(vertices, edges)
    print("Graph creation complete.")
    return graph
def main():
    sc = initializeSpark()
    spark = SparkSession(sc)
    directory, post_id = parse_pls()

    rdds = make_rdds_from_dir(directory, sc)
    post_rdd = rdds["posts_rdd"]
    string = make_stripped_string(post_rdd, post_id)

    print("\n Body from post_id: " + str(post_id) + ", stripped of whitespace and special characters:\n")
    print("'" + string + "'\n")

    # Tokenize the string
    tokens = tokenize(string)
    # Remove duplicate entries
    tokens_unique = remove_dupes(tokens)
    # Assign an id to each unique token
    token_id_tuple = assign_id_to_list(tokens_unique)
    # Now assign these ids to the original token list
    token_id_all = assign_unique_ids(token_id_tuple, tokens)

    print("\nTokens retrieved from the body with their respective ids:\n")
    for i in token_id_all:
        print(i)

    print("\n\nEdges:\n")
    ids = []
    for i in token_id_all:
        ids.append(i[0])
    # Create edges over a window of size 5, using the ids of the tokens (see the sketch after this function)
    edges = create_edges(ids, 5)
    # Remove duplicate edges from the list
    edges = remove_dupe_tuples(edges)
    print(edges)

    print("\n\nPageRank:")
    sqlContext = SQLContext(sc)
    v = sqlContext.createDataFrame(token_id_tuple, ["id", "word"])
    e = sqlContext.createDataFrame(edges, ["src", "dst"])
    g = graphframes.GraphFrame(v, e)
    results = g.pageRank(resetProbability=0.15, tol=0.0001)
    results.vertices.select("word", "pagerank").show(truncate=False)
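# The create_edges helper is defined elsewhere in this project. A minimal sketch of a
# sliding-window edge builder, under the assumption that each token id is paired with the
# ids that follow it inside the window; this is not the original implementation.
def create_edges(ids, window_size):
    """Pair each token id with every id that follows it within `window_size` positions."""
    edges = []
    for i, src in enumerate(ids):
        for dst in ids[i + 1:i + window_size]:
            if src != dst:
                edges.append((src, dst))
    return edges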
def calculaSugestaoDeAmigos(usuarios, relacionamentos, intervalo):
    grafo = graphframes.GraphFrame(usuarios, relacionamentos)
    resultado = []
    # Compute friend suggestions for every user in the given range
    for inter in intervalo:
        # For each user, generate the friend suggestions
        resultado.append(sugestao(grafo, inter, usuarios))
    """We wrote our own function, but this could also have been done with
    graph.find("(a)-[e]->(b); (b)-[e2]->(c); !(a)-[]->(c)"):
    we want a user "a" that is connected to "b" ((a)-[e]->(b)), where "b" is connected
    to "c" ((b)-[e2]->(c)), but "a" and "c" are not connected (!(a)-[]->(c)).
    A sketch of this motif query follows the function."""
    return resultado
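# A minimal sketch of the motif-finding alternative mentioned in the docstring above.
# It assumes the same vertex/edge DataFrames (usuarios, relacionamentos); the helper name
# sugestaoPorMotif is hypothetical and not part of the original code.
import graphframes
from pyspark.sql import functions as F

def sugestaoPorMotif(usuarios, relacionamentos):
    grafo = graphframes.GraphFrame(usuarios, relacionamentos)
    # Users "a" and "c" share a common friend "b" but are not connected themselves
    motifs = grafo.find("(a)-[e]->(b); (b)-[e2]->(c); !(a)-[]->(c)")
    # Drop the trivial case where the friend-of-a-friend is the user itself
    return motifs.filter("a.id != c.id") \
        .select(F.col("a.id").alias("usuario"), F.col("c.id").alias("sugestao"))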
def loadGraph(context, name):
    '''
    Function:   Load graph from file
    Parameters: context - Spark context used for reading, name - base name of the stored graph
    Returns:    GraphFrame
    '''
    # Load the vertex and edge DataFrames from their parquet files
    print("\nLoading graph data..")
    vertices = context.read.parquet('store/' + name + 'Vertices.parquet')
    edges = context.read.parquet('store/' + name + 'Edges.parquet')
    print("\nGenerating graph..")
    graph = GF.GraphFrame(vertices, edges)
    print("\nGraph load complete.")
    return graph
.appName("PostLinksRelation")\ .getOrCreate() #people = spark.createDataFrame([("Bilbo Baggins", 50), ("Gandalf", 1000), ("Thorin", 195), ("Balin", 178), ("Kili", 77), # ("Dwalin", 169), ("Oin", 167), ("Gloin", 158), ("Fili", 82), ("Bombur", None)], ["name", "age"]) #people.write.format("com.mongodb.spark.sql.DefaultSource").mode("append").save() #''' #df = spark.read.format("com.mongodb.spark.sql.DefaultSource").load() df = spark.read.json(sys.argv[1]) df.printSchema() edges = df.rdd.map(parseEdges).cache() nodes = edges.map(lambda x: (x[0], x[1])).flatMap(lambda X: X).distinct() edges = changeToDF_v2(edges, "src dst weight", "edges") #nodes=changeToDF_v1(nodes,"nodes") nodes = changeToDF_v2(nodes, "id", "nodes") g = graphframes.GraphFrame(nodes, edges) g.dropIsolatedVertices() print("+" * 100) vertices = g.vertices.collect() vertices = list(map(lambda x: x["id"], vertices)) print(vertices[:36]) print("+" * 100) distances_data = None for i in range(len(vertices)): print("*" * 50 + "{}/{}".format(i + 1, len(vertices)) + "*" * 50) for j in range(i): distances = g.bfs( fromExpr="id!=-123", toExpr="id!=-123",
            if determine_edge((input_list[i], input_list[j])):
                output.append((input_list[i], input_list[j]))
    return output

edge = select_pair(t0)

# Collect the distinct vertices that appear in any edge
v = set()
for i in edge:
    for t in i:
        if t not in v:
            v.add(t)
vertices = sqlContext.createDataFrame(list(map(lambda a: [a], v)), ["id"])

# The graph is undirected, so add the reversed edge for every pair
edge += list(map(lambda x: tuple(reversed(x)), edge))
edges = sqlContext.createDataFrame(edge, ["src", "dst"])

g = graphframes.GraphFrame(vertices, edges)
result = g.labelPropagation(maxIter=5).groupBy("label").agg(F.collect_list("id")).collect()
# result.select("id", "label").show()

# Sort each community's members, then sort communities by size and first member
output = []
for i in result:
    output.append(list(sorted(i["collect_list(id)"])))
output.sort(key=lambda x: (len(x), x[0]))

file2 = open(community_output_file_path, "w")
# for i in range(len(output)-1):
#     file2.write(''.join(str(s) for s in output[i]) + '\n')
    biz1 = user_biz_dict[user_pair[0]]
    biz2 = user_biz_dict[user_pair[1]]
    return len(biz1 & biz2) >= filter_threshold

edgeRDD = user_pairsRDD.filter(filter_edge)
nodeRDD = edgeRDD.flatMap(lambda x: x).distinct()

# Undirected graph, so edges need to go both ways
edgeRDD = edgeRDD.flatMap(lambda x: [x, (x[1], x[0])]) \
    .map(lambda x: (idx_user_dict[x[0]], idx_user_dict[x[1]]))
edgeDF = edgeRDD.toDF(["src", "dst"])
print("number of edges in the graph:", edgeDF.count())
nodeDF = nodeRDD.map(lambda x: (idx_user_dict[x],)).toDF(["id"])
print("number of nodes in the graph:", nodeDF.count())

# Note: the graph's nodes and edges must still use the original user_id values (easy to get wrong here)
g = graphframes.GraphFrame(nodeDF, edgeDF)
resultDF = g.labelPropagation(maxIter=5)

# (label, list of user_id)
resultRDD = resultDF.rdd.map(lambda x: (x[1], x[0])) \
    .groupByKey().mapValues(list) \
    .map(lambda x: sorted(x[1])) \
    .sortBy(lambda x: (len(x), x[0]))

result = resultRDD.collect()
with open(community_output_file_path, 'w') as f:
    for one in result:
        print(str(one)[1:-1], file=f)

end_time = time.time()
def resolve(sc, nodes, edges):
    graph = gf.GraphFrame(nodes, edges)
    # connectedComponents() requires a checkpoint directory to be set
    sc.setCheckpointDir("/tmp/checkpoints")
    return graph.connectedComponents()
def generate_spark_graph(strings, sc, mat=None, min_ld=1, max_ld=1):
    """
    Make a graph using the Spark graphframes library

    Inputs
    ------
    strings : list
        a list of strings to use for the pairwise distance matrix
    sc : pyspark.SparkContext
        a live SparkContext
    mat : pyspark.RDD, optional
        an RDD representing the distance matrix (returned by
        `distance_matrix`). If not given, it is generated automatically
    min_ld : int, optional
        minimum Levenshtein distance
    max_ld : int, optional
        maximum Levenshtein distance

    Returns
    -------
    g : graphframes.GraphFrame
        object with strings as node names
    """
    try:
        import findspark
        findspark.init()

        import graphframes
        from pyspark.sql import Row, SQLContext
        from pyspark.sql.types import StructField, StructType, IntegerType, ShortType, StringType, LongType
    except ImportError:
        warn('Problem importing pyspark -- are you sure your SPARK_HOME is set?')

    sqc = SQLContext(sc)
    strings_b = sc.broadcast(strings)
    size = len(strings)

    # make the vertex DataFrame
    v_schema = StructType([
        StructField('id', IntegerType()),
        StructField('string', StringType())
    ])
    v_rdd = sc.parallelize(range(size)).map(
        lambda x: Row(id=x, string=strings_b.value[x]))
    v = sqc.createDataFrame(v_rdd, schema=v_schema)

    # make the edge DataFrame
    if mat is None:
        mat = distance_matrix(strings, min_ld=min_ld, max_ld=max_ld, sc=sc)
    e_schema = StructType([
        StructField('src', IntegerType()),
        StructField('dst', IntegerType()),
        StructField('weight', ShortType())
    ])
    e = sqc.createDataFrame(mat, schema=e_schema)

    gf = graphframes.GraphFrame(v, e)
    return gf
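# A minimal usage sketch. The strings list and app name are hypothetical, and it assumes
# a distance_matrix helper is available alongside generate_spark_graph.
from pyspark import SparkContext

sc = SparkContext(appName="string-graph")
strings = ["kitten", "sitten", "sitting", "mitten"]
g = generate_spark_graph(strings, sc, min_ld=1, max_ld=1)
# Vertices carry the strings; edges connect pairs within the Levenshtein distance bounds
g.vertices.show()
g.edges.show()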
vertices_df = vertices_df.repartition(conf.partitions, "cell")
effective_partitions = vertices_df.rdd.getNumPartitions()
print("effective partitions # =", effective_partitions)

# ---------- edges
edge_values = lambda start, stop: [(i, e[0], e[1]) for i, e in enumerate(
    edge_it(conf.vertices, range(start, stop), conf.degree_max))]

edges = batch_create(directory=conf.graphs,
                     file="edges",
                     build_values=edge_values,
                     columns=["eid", "src", "dst"],
                     total_rows=conf.vertices,
                     batches=conf.batches_edges,
                     vertices=vertices_df,
                     grid_size=conf.g)

graph = graphframes.GraphFrame(vertices_df, edges)
stepper.show_step("Create a GraphFrame")

print("count: vertices=", graph.vertices.count(), "edges=", graph.edges.count())
stepper.show_step("count GraphFrame")

graph.vertices.show()
graph.edges.show()

spark.sparkContext.stop()
# company to product - minedFrom
# product to company - belongsTo
edges_to_product = values.withColumn("relationship", F.lit("minedFrom"))
edges_to_product = edges_to_product \
    .select(F.col("company_nr").alias("src"), F.col("product").alias("dst"), "relationship") \
    .distinct()

# Since we need PageRank for companies, we create a second edge set so the graph is directed both ways
edges_from_product = values.withColumn("relationship", F.lit("offers"))
edges_from_product = edges_from_product \
    .select(F.col("product").alias("dst"), F.col("company_nr").alias("src"), "relationship") \
    .distinct()

edges = edges_from_product.unionAll(edges_to_product)
g = graphframes.GraphFrame(nodes, edges)

# For each vertex, find the average neighbour degree:
# first calculate the degree
print("Calculating Degree")
vertices_out = g.degrees
gx = graphframes.GraphFrame(vertices_out, edges)

print("Calculating Nearest Neighbour Degree")
# Send each source vertex's degree to its destination and average the incoming messages
msgToSrc = None
msgToDst = AM.src["degree"]
nnd = gx.aggregateMessages(F.avg(AM.msg).alias("nearest-neighbour-degree"),
                           sendToSrc=msgToSrc,
                           sendToDst=msgToDst)
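# A hypothetical follow-up, not part of the original snippet: join the aggregated
# nearest-neighbour degree back onto the degree DataFrame for a side-by-side view.
nnd_summary = vertices_out.join(nnd, on="id", how="left") \
    .select("id", "degree", F.col("nearest-neighbour-degree"))
nnd_summary.show(truncate=False)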