def runBPwithGraphFrames(cls, g, numIter):
    """Run Belief Propagation using GraphFrame.

    This implementation of BP shows how to use GraphFrame's aggregateMessages
    method.
    """
    # choose colors for vertices for BP scheduling
    colorG = cls._colorGraph(g)
    numColors = colorG.vertices.select('color').distinct().count()

    # TODO: handle vertices without any edges

    # initialize vertex beliefs at 0.0
    gx = GraphFrame(
        colorG.vertices.withColumn('belief', sqlfunctions.lit(0.0)),
        colorG.edges)

    # run BP for numIter iterations
    for iter_ in range(numIter):
        # for each color, have that color receive messages from neighbors
        for color in range(numColors):
            # Send messages to vertices of the current color.
            # We may send to source or destination since edges are treated as
            # undirected.
            msgForSrc = sqlfunctions.when(
                AM.src['color'] == color,
                AM.edge['b'] * AM.dst['belief'])
            msgForDst = sqlfunctions.when(
                AM.dst['color'] == color,
                AM.edge['b'] * AM.src['belief'])
            # numerically stable sigmoid
            logistic = sqlfunctions.udf(cls._sigmoid,
                                        returnType=types.DoubleType())
            aggregates = gx.aggregateMessages(
                sqlfunctions.sum(AM.msg).alias("aggMess"),
                sendToSrc=msgForSrc,
                sendToDst=msgForDst)
            v = gx.vertices
            # receive messages and update beliefs for vertices of the current color
            newBeliefCol = sqlfunctions.when(
                (v['color'] == color) & (aggregates['aggMess'].isNotNull()),
                logistic(aggregates['aggMess'] + v['a'])
            ).otherwise(v['belief'])  # keep old beliefs for other colors
            newVertices = (v
                .join(aggregates, on=(v['id'] == aggregates['id']),
                      how='left_outer')
                .drop(aggregates['id'])  # drop duplicate ID column (from outer join)
                .withColumn('newBelief', newBeliefCol)  # compute new beliefs
                .drop('aggMess')  # drop messages
                .drop('belief')  # drop old beliefs
                .withColumnRenamed('newBelief', 'belief'))
            # cache new vertices using workaround for SPARK-1334
            cachedNewVertices = AM.getCachedDataFrame(newVertices)
            gx = GraphFrame(cachedNewVertices, gx.edges)

    # Drop the "color" column from vertices
    return GraphFrame(gx.vertices.drop('color'), gx.edges)
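# The BP snippet above assumes a cls._sigmoid helper that is not shown. A
# minimal sketch of such a numerically stable logistic function (hypothetical;
# it would live on the same class as runBPwithGraphFrames):
import math

def _sigmoid(x):
    # Numerically stable logistic: 1 / (1 + exp(-x)).
    if x >= 0:
        return 1.0 / (1.0 + math.exp(-x))
    z = math.exp(x)  # x < 0 here, so exp(x) <= 1 and cannot overflow
    return z / (1.0 + z)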
vertices = grouped_edges.withColumn(
    'neighbors_of_neighbors',
    create_id_list_column_udf(grouped_edges['id'], grouped_edges['neighbors']))\
    .join(vertices, on='id', how='left_outer')\
    .drop('neighbors')
vertices.show(500, truncate=False)

#---------------------------------------
print('State 3')
g = GraphFrame(vertices, edges)
#g.vertices.show(500, truncate=False)
#g.edges.show(500, truncate=False)
aggregates = g.aggregateMessages(
    F.collect_set(AM.msg).alias('neighbors_of_neighbors2'),
    sendToDst=AM.src['neighbors_of_neighbors'])
vertices = vertices.join(aggregates, on='id', how='left_outer')\
    .withColumnRenamed('neighbors_of_neighbors', 'neighbors')\
    .withColumnRenamed('neighbors_of_neighbors2', 'neighbors_of_neighbors')
vertices.show(500, truncate=False)

#---------------------------------------
print('State 4')
def same_neighbors(neighbors, neighbors_of_neighbors):
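# create_id_list_column_udf is used in the snippet above but never defined
# there. One plausible sketch (assumed, not from the original): pair the
# sending vertex's id with its neighbor list, so receivers know whose
# neighbors each message carries.
from pyspark.sql import functions as F, types

id_list_type = types.StructType([
    types.StructField("id", types.StringType()),
    types.StructField("neighbors", types.ArrayType(types.StringType())),
])

create_id_list_column_udf = F.udf(
    lambda vid, neighbors: {"id": vid, "neighbors": neighbors or []},
    id_list_type)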
    .save()
# logging.info('Done!')

## closeness centrality
vertices = g.vertices.withColumn("ids", F.array())
cached_vertices = AM.getCachedDataFrame(vertices)
g2 = GraphFrame(cached_vertices, g.edges)

for i in range(0, g2.vertices.count()):
    msg_dst = new_paths_udf(AM.src["ids"], AM.src["id"])
    msg_src = new_paths_udf(AM.dst["ids"], AM.dst["id"])
    agg = g2.aggregateMessages(F.collect_set(AM.msg).alias("agg"),
                               sendToSrc=msg_src,
                               sendToDst=msg_dst)
    res = agg.withColumn("newIds", flatten_udf("agg")).drop("agg")
    new_vertices = (g2.vertices
        .join(res, on="id", how="left_outer")
        .withColumn("mergedIds", merge_paths_udf("ids", "newIds", "id"))
        .drop("ids", "newIds")
        .withColumnRenamed("mergedIds", "ids"))
    cached_new_vertices = AM.getCachedDataFrame(new_vertices)
    g2 = GraphFrame(cached_new_vertices, g2.edges)

closeness_centrality = g2.vertices\
    .withColumn("closeness", closeness_udf("ids"))\
    .sort("closeness", ascending=False)
# closeness_centrality.write.csv('Closeness_centrality.csv')
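# The closeness-centrality loop above relies on new_paths_udf, flatten_udf,
# merge_paths_udf, and closeness_udf, which are not shown. A minimal sketch of
# what they might look like (assumed), treating each "ids" entry as an
# (id, distance) struct:
from pyspark.sql import functions as F, types

paths_type = types.ArrayType(types.StructType([
    types.StructField("id", types.StringType()),
    types.StructField("distance", types.IntegerType()),
]))

def new_paths(paths, vid):
    # extend every known path by one hop and add the direct one-hop path
    extended = [{"id": p, "distance": d + 1}
                for p, d in (paths or []) if p != vid]
    extended.append({"id": vid, "distance": 1})
    return extended

new_paths_udf = F.udf(new_paths, paths_type)

def flatten(ids):
    # merge the path lists from all messages, keeping the shortest distance per id
    best = {}
    for sublist in (ids or []):
        for p, d in sublist:
            if p not in best or d < best[p]:
                best[p] = d
    return [{"id": p, "distance": d} for p, d in best.items()]

flatten_udf = F.udf(flatten, paths_type)

def merge_paths(ids, new_ids, vid):
    # merge a vertex's stored paths with the newly received ones
    best = {}
    for p, d in (ids or []) + (new_ids or []):
        if p != vid and (p not in best or d < best[p]):
            best[p] = d
    return [{"id": p, "distance": d} for p, d in best.items()]

merge_paths_udf = F.udf(merge_paths, paths_type)

def closeness(ids):
    # closeness = number of reachable nodes / sum of shortest-path distances
    total = sum(d for _, d in ids)
    return 0.0 if total == 0 else len(ids) * 1.0 / total

closeness_udf = F.udf(closeness, types.DoubleType())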
def run(self, graph, num_iter=1):
    model = self.model

    # Make a spark UDF to initialize each node with a random trait
    @sqlfunctions.udf(returnType=types.ArrayType(types.IntegerType()))
    def udf_initialize_traits(arg):
        return model.initialize_traits(arg)

    # Make a spark UDF handling inter-node communication
    @sqlfunctions.udf(returnType=types.ArrayType(types.IntegerType()))
    def udf_node_interaction(my_traits, neighbor_traits):
        return model.check_neighbor_encounters(my_traits, neighbor_traits)

    # Make a spark UDF handling trait mixing
    @sqlfunctions.udf(returnType=types.ArrayType(types.IntegerType()))
    def udf_combine_traits(my_traits, other_traits):
        return model.combine_traits(my_traits, other_traits)

    # Initialize random traits
    new_vertices = graph.vertices\
        .withColumn("traits", udf_initialize_traits(graph.vertices.id))
    # Make a new graph with this random trait
    current_graph = GraphFrame(new_vertices, graph.edges)

    # For the number of iterations we want to run:
    diversities = []
    for i in range(num_iter):
        # Send neighbor traits to each node
        neighbor_traits = current_graph.aggregateMessages(
            sqlfunctions.collect_list(AM.msg).alias("neighbor_traits"),
            sendToSrc=None,
            sendToDst=AM.src["traits"])
        # Join neighbor traits back to the main table
        new_vertices = current_graph.vertices\
            .join(neighbor_traits, "id", "left_outer")
        # Select which neighbor to interact with
        new_vertices = new_vertices\
            .withColumn("interaction_traits",
                        udf_node_interaction("traits", "neighbor_traits"))\
            .drop("neighbor_traits")
        # Mix the node's traits with the chosen neighbor's traits
        new_vertices = new_vertices\
            .withColumn("combined_traits",
                        udf_combine_traits("traits", "interaction_traits"))
        # Drop intermediate columns
        new_vertices = new_vertices\
            .drop("traits", "interaction_traits")\
            .withColumnRenamed("combined_traits", "traits")
        # Cache
        cached_new_vertices = AM.getCachedDataFrame(new_vertices)
        # Update current graph with new nodes
        current_graph = GraphFrame(cached_new_vertices, graph.edges)
        # Record trait diversity
        diversity = current_graph.vertices.select('traits').distinct().count()
        diversities.append(diversity)
        print("Iteration: {}, trait diversity: {}".format(i, diversity))

    return current_graph, diversities
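# run() above assumes a self.model object exposing initialize_traits,
# check_neighbor_encounters, and combine_traits. A hypothetical minimal model
# satisfying that interface (method names from the snippet, behavior assumed):
import random

class TraitModel:
    def __init__(self, num_traits=5, num_values=10):
        self.num_traits = num_traits
        self.num_values = num_values

    def initialize_traits(self, vertex_id):
        # random trait vector for each vertex
        return [random.randrange(self.num_values)
                for _ in range(self.num_traits)]

    def check_neighbor_encounters(self, my_traits, neighbor_traits):
        # pick one neighboring trait vector to interact with
        if not neighbor_traits:
            return my_traits
        return random.choice(neighbor_traits)

    def combine_traits(self, my_traits, other_traits):
        # copy one differing trait from the chosen neighbor
        diffs = [i for i, (a, b) in enumerate(zip(my_traits, other_traits))
                 if a != b]
        if not diffs:
            return my_traits
        i = random.choice(diffs)
        mixed = list(my_traits)
        mixed[i] = other_traits[i]
        return mixed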
# Page rank
print("\n Page rank")
# # run until convergence to tol
# results = g.pageRank(resetProbability=0.15, tol=0.01)
# results.vertices.select("id", "pagerank").show()
# results.edges.select("src", "dst", "weight").show()

## Run PageRank personalized for vertices ["a", "b", "c", "d"] in parallel
# results4 = g.parallelPersonalizedPageRank(resetProbability=0.15, sourceIds=["a", "b", "c", "d"], maxIter=10)

print("\n shortest paths from each node to landmark nodes")
results = g.shortestPaths(landmarks=["a", "d"])
results.select("id", "distances").show()

# # Saving and Loading GraphFrames
# g.vertices.write.parquet("hdfs://myLocation/vertices")
# g.edges.write.parquet("hdfs://myLocation/edges")
#
# # Load the vertices and edges back.
# sameV = sqlContext.read.parquet("hdfs://myLocation/vertices")
# sameE = sqlContext.read.parquet("hdfs://myLocation/edges")

# message passing via AggregateMessages
# For each user, sum the ages of the adjacent users.
from pyspark.sql.functions import sum as sqlsum  # avoid shadowing by the Python builtin

msgToSrc = AM.dst["age"]
msgToDst = AM.src["age"]
agg = g.aggregateMessages(sqlsum(AM.msg).alias("summedAges"),
                          sendToSrc=msgToSrc,
                          sendToDst=msgToDst)
agg.show()
display(e)

# COMMAND ----------

from pyspark.sql.functions import array, col, collect_list, concat
from graphframes import GraphFrame
from graphframes.lib import AggregateMessages as AM

# Create a graph with vertices containing an empty parents array column
g = GraphFrame(v.withColumn("parents", array()), e)

# Initial message to be passed to neighbor vertices. We want to traverse from
# the leaf, hence AM.src
msgToDst = AM.src["name"]
for i in range(6):
    # AM.msg contains the next message, i.e. the next parent in our case
    agg = g.aggregateMessages(collect_list(AM.msg).alias("tmpParent"),
                              sendToDst=msgToDst)
    # Append this message to the parents array column of vertices and also
    # keep it as a standalone column for the next iteration
    currentV = g.vertices
    newV = currentV.join(agg, "id", how="left") \
        .drop(agg["id"]) \
        .withColumn("parents", concat(agg["tmpParent"], currentV["parents"])) \
        .withColumn("lastParent", col("tmpParent")[0]) \
        .drop("tmpParent")
    # Caching the transitory vertices DataFrame is important here; otherwise
    # the Spark job will take a very long time to complete
    cachedNewV = AM.getCachedDataFrame(newV)
    g = GraphFrame(cachedNewV, g.edges)
    # Pass the standalone column, i.e. the most recent parent, to the next iteration
    msgToDst = AM.src["lastParent"]
types.StructField("neighbours", types.ArrayType(types.StringType())) ]) new_neighbours_udf = F.udf(new_neighbours, neighbours_type) vertices = vertices.withColumn( "neighbours", new_neighbours_udf(vertices["id"], vertices["neighbours_list"])) # construct the graph graph = GraphFrame(vertices, edges) print("Graph after tweaks:") graph.vertices.show() graph.edges.show() # send neighbours list to neighbours aggregates = graph.aggregateMessages(F.collect_set(AM.msg).alias("agg"), sendToDst=AM.src["neighbours"]) print("Using aggregateMessages:") aggregates.show() # find common neighbours print("Finding common neighbours:") aggregates = aggregates.join(graph.vertices, on="id").drop("neighbours") # aggregates.show() def common_neighbours(node_neighbours, messagers_neighbours): common_list = [] for neighbours in messagers_neighbours: common_list.append({ "id": neighbours.id,
def LPAImp(self, numIter, modularity=True):
    """Label propagation algorithm for bipartite networks with a synchronous
    updating scheme; returns a data frame whose columns contain the vertex
    IDs, the label assignment, and the modularity (if specified to be
    returned).

    Keyword Arguments:
    numIter -- Number of iterations for LPAb
    modularity -- A boolean indicating whether the modularity should be
        calculated and returned.
    """
    # Assign initial labels to the users
    initLabelUDF = F.udf(lambda i, j: i if j == 1 else None,
                         types.IntegerType())
    v = self.gf.vertices.withColumn(
        'label', initLabelUDF(F.col('id'), F.col('nodeType')))

    # Add a self-loop edge for every node
    E_self = self.SS.createDataFrame(v.select(F.col('id')).rdd)
    E = AM.getCachedDataFrame(
        self.gf.edges.union(
            E_self.withColumn('dst', F.col('id'))
                  .withColumnRenamed('id', 'src')))

    # Create a new GraphFrame object with labels attached
    LPAbgf = GraphFrame(v, E)

    # Create a UDAF (User Defined Aggregate Function) that returns the most
    # frequent label
    @pandas_udf("int", PandasUDFType.GROUPED_AGG)
    def maxLabel_udf(label_list):
        label_list = list(filter(None, label_list))
        LabelCounts = Counter(label_list)
        mostCommonLabels = [
            i[0] for i in LabelCounts.items()
            if i[1] == max(LabelCounts.values())
        ]
        return np.random.choice(mostCommonLabels)

    for iter_ in range(numIter):
        for nodeType in [1, 2]:
            # User and repo nodes send their labels to their destination
            # nodes in alternating order
            msgForDst = F.when(AM.src['nodeType'] == nodeType,
                               AM.src['label'])
            # When it is the repos' turn to send labels to their
            # destinations, also send each repo's label to its contributors
            if nodeType == 2:
                msgForSrc = F.when(AM.src['nodeType'] == 1, AM.dst['label'])
            else:
                msgForSrc = None
            # Aggregate the messages received by each node
            aggregates = LPAbgf.aggregateMessages(
                aggCol=maxLabel_udf(AM.msg).alias("aggMess"),
                sendToDst=msgForDst,
                sendToSrc=msgForSrc)
            v = LPAbgf.vertices
            # Update the label of each node; if there is a message for the
            # node, update the node's label
            newLabelCol = F.when(aggregates["aggMess"].isNotNull(),
                                 aggregates["aggMess"]).otherwise(v['label'])
            # Outer join aggregates and vertices
            vNew = (v
                .join(aggregates, on=(v['id'] == aggregates['id']),
                      how='left_outer')
                .drop(aggregates['id'])
                # Compute new column
                .withColumn('newLabel', newLabelCol)
                # Drop messages
                .drop('aggMess')
                # Drop old labels
                .drop('label')
                .withColumnRenamed('newLabel', 'label'))
            cachedvNew = AM.getCachedDataFrame(vNew)
            LPAbgf = GraphFrame(cachedvNew, E)

    # Remove the self-loop edges
    LPAbgf = GraphFrame(LPAbgf.vertices, self.gf.edges)
    return LPAbgf
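# Module-level imports the LPAImp method above relies on (not shown in the
# snippet; reconstructed from the names it uses):
from collections import Counter
import numpy as np
from pyspark.sql import functions as F, types
from pyspark.sql.functions import pandas_udf, PandasUDFType
from graphframes import GraphFrame
from graphframes.lib import AggregateMessages as AM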
# UDF that updates a vertex's local-maximum state: keep the old color, advance
# the step, and mark the vertex as a maximum (colored with the current step)
# once it beats the best remaining neighbor.
def compare_local_max_value(old_local_max, new_local_max):
    maxima = False
    color = old_local_max.color
    step = old_local_max.step + 1
    if new_local_max.id < old_local_max.id:
        maxima = True
        color = old_local_max.step
    return {"id": old_local_max.id, "color": color, "maxima": maxima,
            "step": step}

compare_local_max_value_udf = F.udf(compare_local_max_value,
                                    local_max_value_type)

# Local Maxima First algorithm
while True:
    # Aggregate messages from the neighbors.
    aggregates = g.aggregateMessages(
        F.collect_set(AM.msg).alias("agg"),
        sendToDst=AM.src["localMaxima"]
    )
    res = aggregates.withColumn(
        "newlocalMaxima", greater_local_max_value_udf("agg")
    ).drop("agg")
    # Join the aggregated values back onto the vertices and update the
    # localMaxima column
    new_vertices = (
        g.vertices.join(res, on="id", how="left_outer")
        .withColumnRenamed("localMaxima", "oldlocalMaxima")
        .withColumn(
            "localMaxima",
            compare_local_max_value_udf(
                F.col("oldlocalMaxima"), F.col("newlocalMaxima")
            ),
        )
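# local_max_value_type and greater_local_max_value_udf are used in the loop
# above but not shown. A plausible sketch (assumed): the struct mirrors the
# dict returned by compare_local_max_value, and the aggregator keeps the
# neighbor state with the greatest id.
from pyspark.sql import functions as F, types

local_max_value_type = types.StructType([
    types.StructField("id", types.StringType()),
    types.StructField("color", types.IntegerType()),
    types.StructField("maxima", types.BooleanType()),
    types.StructField("step", types.IntegerType()),
])

def greater_local_max_value(msgs):
    # among the received neighbor states, keep the one with the largest id
    best = msgs[0]
    for msg in msgs[1:]:
        if msg.id > best.id:
            best = msg
    return {"id": best.id, "color": best.color, "maxima": best.maxima,
            "step": best.step}

greater_local_max_value_udf = F.udf(greater_local_max_value,
                                    local_max_value_type)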
# UDF for finding the minimum rating among the messages received by a vertex.
def min_rating(ratings):
    min_rating = None
    min_rating_id = None
    for rating in ratings:
        if min_rating is None or rating.rating < min_rating:
            min_rating = rating.rating
            min_rating_id = rating.id
    return {"id": min_rating_id, "rating": min_rating}

min_rating_udf = F.udf(min_rating, player_rating_type)

# UDF for picking the minimum rating between the old one and the new one.
def compare_rating(old_rating, new_rating):
    if new_rating is None:  # vertex received no messages this round
        return old_rating
    return old_rating if old_rating.rating < new_rating.rating else new_rating

compare_rating_udf = F.udf(compare_rating, player_rating_type)

# Iterative graph computations
max_iterations = 5
for _ in range(max_iterations):
    aggregates = g.aggregateMessages(F.collect_set(AM.msg).alias("agg"),
                                     sendToDst=AM.src["minRating"])
    res = aggregates.withColumn("newMinRating",
                                min_rating_udf("agg")).drop("agg")
    new_vertices = (g.vertices
        .join(res, on="id", how="left_outer")
        .withColumnRenamed("minRating", "oldMinRating")
        .withColumn("minRating",
                    compare_rating_udf(F.col("oldMinRating"),
                                       F.col("newMinRating")))
        .drop("oldMinRating")
        .drop("newMinRating"))
    cached_new_vertices = AM.getCachedDataFrame(new_vertices)
    g = GraphFrame(cached_new_vertices, g.edges)

g.vertices.show()
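# player_rating_type is assumed by the UDFs above but not shown. A plausible
# definition (assumed) matching the {"id", "rating"} dicts they return:
from pyspark.sql import types

player_rating_type = types.StructType([
    types.StructField("id", types.StringType()),
    types.StructField("rating", types.IntegerType()),
])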
def agg_src_func(msgs):
    """Handle the messages received from src vertices.

    :param msgs: the collected messages
    :return: the message state with the largest max_ratio
    """
    max_ratio = -1
    cnodes = []
    for msg in msgs:
        if msg.max_ratio > max_ratio:
            max_ratio = msg.max_ratio
            cnodes = msg.cnodes
    return {'max_ratio': max_ratio, 'cnodes': cnodes}

aggregates = g.aggregateMessages(F.collect_set(AM.msg).alias("agg"),
                                 sendToDst=AM.src["rc"])
agg_src_udf = F.udf(agg_src_func, rc_type)
res = aggregates.withColumn("rc", agg_src_udf("agg")).drop("agg")

print("First initialization".center(88, "*"))
init_vertices = g.vertices.join(res, res.id == g.vertices.id, "left")\
    .select(g.vertices.id, g.vertices.name, res.rc)
new_vertices = init_vertices.select(
    init_vertices.id,
    init_vertices.name,
    F.when(
        init_vertices.rc.isNull(),
        rc_func_udf(init_vertices["id"], F.lit(100),
                    F.array(init_vertices['id']))
    ).otherwise(init_vertices['rc']).alias('rc'))
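# rc_type and rc_func_udf are referenced above but not shown. A plausible
# sketch (assumed; field types are guesses consistent with F.lit(100) and the
# {'max_ratio', 'cnodes'} dicts built above):
from pyspark.sql import functions as F, types

rc_type = types.StructType([
    types.StructField("max_ratio", types.IntegerType()),
    types.StructField("cnodes", types.ArrayType(types.StringType())),
])

def rc_func(node_id, ratio, cnodes):
    # build the initial rc state for a vertex that received no messages;
    # node_id is accepted only to match the three-argument call site above
    return {'max_ratio': ratio, 'cnodes': cnodes}

rc_func_udf = F.udf(rc_func, rc_type)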