def bfs_recursive_worker(annotate, df_start, df_end, nodes, edges, list_results, index, num_hops):
    """Recursively collect shortest paths between a source set and a target set.

    Runs a GraphFrame BFS from the current source nodes (``df_start``) to the
    current target nodes (``df_end``).  When paths are found, the endpoints
    already covered by a shortest path are removed and the search recurses
    twice with an increased hop budget:

    - a 'left' branch with the reduced source set and the full target set, and
    - a 'right' branch with the full source set and the reduced target set,

    forming a binary recursion tree in which each level allows one more hop,
    up to ``num_hops``.

    :param annotate: 'left' or 'right'; combined with ``index`` to build a
        unique marker column name (e.g. ``leftAttribute2``) so each
        branch/depth tags nodes independently
    :type annotate: str
    :param df_start: current source nodes; must have an ``id`` string column
    :type df_start: pyspark.sql.DataFrame
    :param df_end: current target nodes; must have an ``id`` string column
    :type df_end: pyspark.sql.DataFrame
    :param nodes: all graph nodes with columns ``id`` and ``Category``
    :type nodes: pyspark.sql.DataFrame
    :param edges: all graph edges with columns ``src``, ``dst``,
        ``relationship``, ``Type``, ``Source_Type`` and ``Target_Type``
    :type edges: pyspark.sql.DataFrame
    :param list_results: accumulator receiving one BFS result DataFrame per
        successful search; each result has the GraphFrame BFS schema
        ``from, e0[, v1, e1, ...], to`` where vertex structs carry
        ``id``/``Category``/marker fields and edge structs carry the edge
        columns listed above
    :type list_results: list of pyspark.sql.DataFrame
    :param index: the current maximum number of hops (recursion depth)
    :type index: int
    :param num_hops: the maximum allowed number of hops
    :type num_hops: int
    :return: None
    :rtype: None
    """
    from graphframes import GraphFrame

    # Stop once the recursion has exceeded the allowed search depth.
    if index > num_hops:
        return

    # Tag sources/targets with a branch+depth specific marker column so the
    # BFS from/to expressions below only match this invocation's nodes.
    # NOTE(review): relies on pyspark.sql.functions.lit being imported at
    # module level — confirm against the file header.
    df_start = df_start.withColumn(
        annotate + 'Attribute' + str(index), lit('start'))
    df_end = df_end.withColumn(
        annotate + 'Attribute' + str(index), lit('end'))

    # Attach the marker column to the full node set; the left outer join
    # keeps every node, non-source/target nodes simply get a null marker.
    nodes_new = (nodes
                 .join(df_start.unionAll(df_end).distinct(),
                       'id',
                       'left_outer'))[['id', 'Category',
                                       annotate + 'Attribute' + str(index)]]
    nodes_new.persist()

    coi_graph = GraphFrame(nodes_new, edges)
    results = coi_graph.bfs(
        "%sAttribute%d='start'" % (annotate, index),
        "%sAttribute%d='end'" % (annotate, index),
        maxPathLength=num_hops)
    if results.count() >= 1:
        results.persist()
        list_results.append(results)
        if index < num_hops:
            # Endpoints already matched by a shortest path are removed, so
            # the deeper searches only run for still-uncovered nodes.
            start_remove = results[['from']] \
                .withColumn('id', results['from'].id)[['id']].distinct()
            end_remove = results[['to']].withColumn('id', results['to'].id)[
                ['id']].distinct()
            new_start = df_start[['id']].subtract(start_remove).distinct()
            new_end = df_end[['id']].subtract(end_remove).distinct()
            # BFS result columns look like (from, e0, v1, e1, ..., to); half
            # the column count equals the path length actually found, which
            # becomes the new recursion depth.
            index = int(len(results.columns) / 2)
            bfs_recursive_worker('left', new_start, df_end[['id']], nodes,
                                 edges, list_results, index + 1, num_hops)
            bfs_recursive_worker('right', df_start[['id']], new_end, nodes,
                                 edges, list_results, index + 1, num_hops)
# Out-degree per vertex: the number of trips leaving each station.
outDeg = stationGraph.outDegrees
outDeg.orderBy(desc("outDegree")).show(5, truncate=False)

# COMMAND ----------

# Ratio of incoming to outgoing trips per station; large values identify
# net "sink" stations, small values net "source" stations.
degreeRatio = inDeg.join(outDeg, "id").selectExpr(
    "id",
    "double(inDegree)/double(outDegree) as degreeRatio",
)
degreeRatio.orderBy(desc("degreeRatio")).show(10, truncate=False)
degreeRatio.orderBy("degreeRatio").show(10, truncate=False)

# COMMAND ----------

# Shortest path (by hop count, at most 2 edges) between two stations.
stationGraph.bfs(
    fromExpr="id = 'Townsend at 7th'",
    toExpr="id = 'Spear at Folsom'",
    maxPathLength=2,
).show(10)

# COMMAND ----------

# Connected components require a checkpoint directory to be configured.
spark.sparkContext.setCheckpointDir("/tmp/checkpoints")

# COMMAND ----------

# Sample 10% of the trip edges so the (expensive) connected-components
# computation stays tractable.
minGraph = GraphFrame(stationVertices, tripEdges.sample(False, 0.1))
cc = minGraph.connectedComponents()

# COMMAND ----------
# | b| e| follow|
# | c| e| follow|
# | e| f| follow|
# +---+---+------------+

# Step-3: Build the GraphFrame from the vertex DataFrame `v` and the
# edge DataFrame `e`.
graph = GraphFrame(v, e)
print("graph=", graph)
# GraphFrame(v:[id: string, name: string ... 1 more field],
#            e:[src: string, dst: string ... 1 more field])

# ==============
# BFS Algorithm
# ==============
# Shortest paths from the vertex named "Alice" to any vertex whose
# age is below 27.
paths = graph.bfs("name = 'Alice'", "age < 27")
paths.show()

# Same idea with extra constraints: only 'follow' edges may be
# traversed, paths are capped at 4 hops, and targets must be over 30.
paths2 = graph.bfs(
    "name = 'Alice'",
    "age > 30",
    edgeFilter="relationship == 'follow'",
    maxPathLength=4,
)
paths2.show()

# Shut down the Spark session.
spark.stop()
# Combine the user vertices and the business vertices into one vertex set.
all_vertices = userV.union(businessV)
# Combine the friendship edges and the review edges into one edge set.
all_edges = friendE.union(reviewE)

# Build the GraphFrame and cache it, since it will be queried repeatedly.
g = GraphFrame(all_vertices, all_edges)
g.cache()

### 12.2 #####
# BFS from users named Eva to companies with at least 10 reviews,
# traversing only 5-star review edges or friendship edges.
paths = g.bfs(
    " name = 'Eva'",
    " type = 'company' and review_count >=10",
    " stars='5' or relationship = 'friend'",
)

# The second-to-last result column is the final edge of each path.
cols = paths.columns
last_edge = cols[len(cols) - 2]

# BFS results are ordinary DataFrames; rank the paths by the star
# rating carried on that final (review-type) edge.
print("BFS :: ")
paths.orderBy(last_edge + ".stars", ascending=False).show(5, truncate=False)

###### 12.3.1 ###
# Motif: a single user `u` with edges to two businesses b1 and b2.
query = "(u)-[e1]->(b1); (u)-[e2] -> (b2)"
results = g.find(query)
# Motif search: pairs of vertices that point at each other
# (a -> b and b -> a).
motifs = g.find("(a)-[e]->(b); (b)-[e2]->(a)")
motifs.show()

# Motif results are DataFrames, so ordinary filters apply.
motifs.filter("b.age > 30").show()

print("\ngenerate subgraph --- ")
# Subgraph: keep vertices over 30 and friend edges, then drop any
# vertex left without edges.
g1 = (g.filterVertices("age > 30")
       .filterEdges("relationship = 'friend'")
       .dropIsolatedVertices())
g1.vertices.show()
g1.edges.show()

# Breadth-first search over non-friend edges, at most 3 hops.
print("\n BFS")
paths = g.bfs(
    "name = 'Esther'",
    "age < 32",
    edgeFilter="relationship != 'friend'",
    maxPathLength=3,
).show()

# Degree metrics.
print("\n Degree--------------")
inDeg = g.inDegrees
inDeg.orderBy(desc("inDegree"))
outDeg = g.outDegrees
outDeg.orderBy(desc("outDegree"))
# NOTE(review): the two orderBy calls above are not followed by an
# action, so they have no visible effect; only the show() below prints.
degreeRatio = (inDeg.join(outDeg, "id")
               .selectExpr(
                   "*",
                   "double(inDegree)/ double(outDegree) as degreeRatio")
               .orderBy(desc("degreeRatio"))
               .show(10, False))

# print("\n strong connected component")
# result = g.stronglyConnectedComponents(maxIter=10)
# Register the GraphFrames package before any SparkContext is created so
# that spark-submit pulls it in.
import os
os.environ['PYSPARK_SUBMIT_ARGS'] = '--packages graphframes:graphframes:0.7.0-spark2.4-s_2.11'

import sys
from functools import reduce

from pyspark.sql.functions import col, lit, when
import pyspark
from graphframes.examples import Graphs
from graphframes import GraphFrame

import config

sc = pyspark.SparkContext()
sqlContext = pyspark.SQLContext(sc)

# Path to the delimited edge-list file, given on the command line.
inputFile = sys.argv[1]

# g = Graphs(sqlContext).friends()  # Get example graph

# Load the edge list; the field delimiter comes from the project config.
df = sqlContext.read.format("csv").option(
    "delimiter", config.delimiter).load(inputFile)

# Rename the positional CSV columns to something decent.
df = df.withColumnRenamed("_c0", "src")\
       .withColumnRenamed("_c1", "dst")\
       .withColumnRenamed("_c2", "weight")
df.show(5)

# Every distinct endpoint becomes a vertex; the name defaults to the id.
aggcodes = df.select("src", "dst").rdd.flatMap(lambda x: x).distinct()
vertices = aggcodes.map(lambda x: (x, x)).toDF(["id", "name"])
edges = df.select("src", "dst")

graph = GraphFrame(vertices, edges)

# BFS between the vertices named '500' and '100'.
# Fix: vertex names are strings, so the target literal must be quoted
# ('100'); the original expression "name = 100" compared the string
# column against the integer 100 and relied on implicit casting.
paths = graph.bfs("name = '500'", "name = '100'")
paths.show()
# Breadth-first search example:
# 1- Builds a graph using the GraphFrame package.
# 2- Applies BFS to find the shortest paths from one vertex to another.
import sys, os

from pyspark.sql import SparkSession, SQLContext
from graphframes import GraphFrame

if __name__ == '__main__':
    spark = SparkSession.builder.appName('BFS').getOrCreate()

    # 1- Load the data.
    vertices = [("a", "Alice", 30),
                ("b", "Bob", 31),
                ("c", "Charlie", 32),
                ("d", "David", 23),
                ("e", "Emma", 24),
                ("f", "Frank", 26)]
    v = spark.createDataFrame(vertices, ["id", "name", "age"])

    edges = [("a", "b", "follow"),
             ("b", "c", "follow"),
             ("c", "d", "follow"),
             ("d", "e", "follow"),
             ("b", "e", "follow"),
             ("c", "e", "follow"),
             ("e", "f", "follow")]
    e = spark.createDataFrame(edges, ["src", "dst", "relationship"])
    e.show()

    # 2- Build the graph.
    # Missing-jar fix reference:
    # https://blog.csdn.net/qq_15098623/article/details/91533349
    graph = GraphFrame(v, e)
    print("graph=", graph)

    # 3- BFS: from the vertex named Alice to any vertex younger than 27.
    paths = graph.bfs("name = 'Alice'", "age < 27")
    paths.show()

    spark.stop()