def bipartition(graph: GraphFrame, partitions: List[str] = [], iteration: int = 0):
    """Recursively split ``graph`` by the sign of a per-partition Fiedler value.

    One call performs one round: it derives a ``fiedler_<iteration>`` score
    for the vertices, stores the boolean ``score >= 0`` in a
    ``sign_<iteration>`` column, and recurses with that sign column appended
    to ``partitions``.

    ``max_iter``, ``should_checkpoint`` and ``checkpoint_interval`` are not
    parameters; they are resolved from the surrounding scope.

    Args:
        graph: Graph whose vertices are partitioned.
        partitions: Names of the sign columns created by earlier rounds.
            The shared default list is not mutated in this function (only
            concatenated and unpacked).
            NOTE(review): it is also handed to helpers defined elsewhere;
            confirm they do not mutate it either.
        iteration: Index of the current round.

    Returns:
        ``graph`` unchanged when ``iteration == max_iter``; otherwise the
        result of the remaining rounds.
    """
    if iteration == max_iter:
        return graph

    sign_col = f"sign_{iteration}"
    fiedler_col = f"fiedler_{iteration}"

    # relabel all partitions for scipy.sparse performance
    graph.cache()
    induced = induce_graph(graph, True, partitions)
    induced.cache()

    # The Fiedler vector is the eigenvector belonging to the second-smallest
    # eigenvalue of the graph Laplacian; that eigenvalue represents the
    # algebraic connectivity of the graph. Spectral clustering is implemented
    # recursively by partitioning on the sign of the Fiedler value. The
    # partitions are evenly distributed.
    grouped_edges = edges_with_partitions(induced, partitions).groupBy(*partitions)
    fiedler_scores = grouped_edges.apply(compute_fiedler_udf(fiedler_col, partitions))
    # The UDF output must carry a `fiedler_<iteration>` column, because the
    # expression below refers to it by name.
    non_negative = F.expr(f"{fiedler_col} >= 0").astype("boolean")
    fiedler_frame = fiedler_scores.withColumn(sign_col, non_negative)

    join_keys = ["id"] + partitions
    range_keys = partitions + [sign_col]
    joined = induced.vertices.join(fiedler_frame, on=join_keys, how="left")
    vertices = undo_relabel(joined.repartitionByRange(*range_keys))

    checkpoint_now = should_checkpoint and iteration % checkpoint_interval == 0
    if not checkpoint_now:
        parted_graph = GraphFrame(vertices, graph.edges)
    else:
        # Truncate the logical plan to prevent out-of-memory on the query
        # plan's string representation. The edges are reused every iteration
        # and should not need to be checkpointed.
        vertices.cache()
        parted_graph = GraphFrame(vertices.localCheckpoint(eager=True), graph.edges)

    return bipartition(parted_graph, partitions + [sign_col], iteration + 1)
.csv("/data/bike-data/201508_trip_data.csv") # COMMAND ---------- stationVertices = bikeStations.withColumnRenamed("name", "id").distinct() tripEdges = tripData\ .withColumnRenamed("Start Station", "src")\ .withColumnRenamed("End Station", "dst") # COMMAND ---------- from graphframes import GraphFrame stationGraph = GraphFrame(stationVertices, tripEdges) stationGraph.cache() # COMMAND ---------- print "Total Number of Stations: " + str(stationGraph.vertices.count()) print "Total Number of Trips in Graph: " + str(stationGraph.edges.count()) print "Total Number of Trips in Original Data: " + str(tripData.count()) # COMMAND ---------- from pyspark.sql.functions import desc stationGraph.edges.groupBy("src", "dst").count().orderBy(desc("count")).show(10)
"relationship", "stars", "useful") print("Friendship Edges: ") friendE.show(8, False) #union user vertices together with business vertices. all_vertices = userV.union(businessV) #union friend Edges together with review edges. all_edges = friendE.union(reviewE) # Create the GraphFrame object g = GraphFrame(all_vertices, all_edges) # Make sure GraphFrame is cached in memory in case we want to query/manipulate it multiple times in a row. g.cache() ### 12.2 ##### # Find shortest paths between users named Eric and Restaurants with 'Taco Bell' in their name paths = g.bfs(" name = 'Eva'", " type = 'company' and review_count >=10", " stars='5' or relationship = 'friend'") # Get list of columns cols = paths.columns # Get the label/name of the last Edge in the path last_edge = cols[len(cols) - 2] # The resulting paths can be manipulated as normal DataFrames # order by the stars of the last edge (which must be reviewd type edge) print("BFS :: ") paths.orderBy(last_edge + ".stars", ascending=False).show(5, False)
# COMMAND ----------

# Bind the vertex DataFrame first and display it second: show() returns None,
# so assigning its result would leave friendsVertices set to None.
friendsVertices = friends.withColumnRenamed("userID", "id").distinct()
friendsVertices.show()

# COMMAND ----------

# Build a graph: distinct rows keyed by "id" are the vertices, and each
# (userID, friendID) row becomes a directed src -> dst edge.
friendsVertices = friends.withColumnRenamed("userID", "id").distinct()
friendsEdges = (
    friends
    .withColumnRenamed("userID", "src")
    .withColumnRenamed("friendID", "dst")
)

# Build GraphFrame object
from graphframes import GraphFrame

friendsGraph = GraphFrame(friendsVertices, friendsEdges)
friendsGraph.cache()

# COMMAND ----------

# Inspect the GraphFrame
friendsGraph.vertices.show(3, False)
friendsGraph.edges.show(3, False)

# COMMAND ----------

# Count the edges pointing into each user (in-degree) and out of each user
# (out-degree).
from pyspark.sql.functions import desc

inDeg = friendsGraph.inDegrees
inDeg.orderBy(desc("inDegree")).show(5, False)
outDeg = friendsGraph.outDegrees
tripVertices.cache()

# COMMAND ----------

display(tripVertices)

# COMMAND ----------

display(tripEdges)

# COMMAND ----------

# `tripGraph`: the full graph, built from the trip (flight) vertices and
# edges, and cached.
tripGraph = GraphFrame(tripVertices, tripEdges)
tripGraph.cache()

# `tripGraphPrime`: same vertices, but its edges keep only the delay, src and
# dst columns of df2 -- a smaller data set that makes motifs and subgraphs
# (below) easier to display.
tripEdgesPrime = df2.select(["delay", "src", "dst"])
tripGraphPrime = GraphFrame(tripVertices, tripEdgesPrime)

# COMMAND ----------

tripGraph.vertices.count()

# COMMAND ----------

tripGraph.edges.count()

# COMMAND ----------