# Nested helper (closure): max_iter, should_checkpoint, checkpoint_interval,
# and the induce_graph / compute_fiedler_udf / undo_relabel helpers are
# defined in the enclosing scope.
def bipartition(graph: GraphFrame,
                partitions: List[str] = [],
                iteration: int = 0):

        if iteration == max_iter:
            return graph

        # relabel all partitions for scipy.sparse performance
        graph.cache()
        induced = induce_graph(graph, True, partitions)
        induced.cache()

        partition = f"sign_{iteration}"
        fiedler_value = f"fiedler_{iteration}"

        # The Fiedler vector is the eigenvector associated with the second-
        # smallest eigenvalue of the graph Laplacian (the algebraic
        # connectivity of the graph). Spectral clustering is applied
        # recursively by splitting each partition on the sign of its vertices'
        # Fiedler values, which tends to distribute the partitions evenly.
        fiedler = (
            edges_with_partitions(induced, partitions)
            .groupBy(*partitions)
            .apply(compute_fiedler_udf(fiedler_value, partitions))
            .withColumn(partition,
                        F.expr(f"{fiedler_value} >= 0").astype("boolean")))
        vertices = undo_relabel(
            induced.vertices
            .join(fiedler, on=["id"] + partitions, how="left")
            .repartitionByRange(*(partitions + [partition])))

        if should_checkpoint and iteration % checkpoint_interval == 0:
            # Truncate the logical plan to avoid out-of-memory errors from the
            # query plan's string representation, which grows each iteration.
            # The edges are reused every iteration and should not need to be
            # checkpointed.
            vertices.cache()
            parted_graph = GraphFrame(vertices.localCheckpoint(eager=True),
                                      graph.edges)
        else:
            parted_graph = GraphFrame(vertices, graph.edges)

        return bipartition(parted_graph, partitions + [partition],
                           iteration + 1)
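
# A minimal sketch of the per-group computation that compute_fiedler_udf
# wraps, assuming each group's edges arrive as a pandas DataFrame of
# relabeled integer ids; the helper name, signature, and shapes here are
# illustrative, not the original implementation.
import numpy as np
import pandas as pd
from scipy.sparse import coo_matrix
from scipy.sparse.csgraph import laplacian
from scipy.sparse.linalg import eigsh


def compute_fiedler(edges: pd.DataFrame, num_vertices: int) -> np.ndarray:
    # Build a symmetric sparse adjacency matrix from (src, dst) pairs.
    weights = np.ones(len(edges))
    adjacency = coo_matrix((weights, (edges["src"], edges["dst"])),
                           shape=(num_vertices, num_vertices))
    adjacency = (adjacency + adjacency.T).tocsr()
    # The Fiedler vector is the eigenvector paired with the second-smallest
    # eigenvalue of the graph Laplacian; its signs split the graph in two.
    lap = laplacian(adjacency)
    eigenvalues, eigenvectors = eigsh(lap.asfptype(), k=2, which="SM")
    return eigenvectors[:, 1]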
  .csv("/data/bike-data/201508_trip_data.csv")


# COMMAND ----------

stationVertices = bikeStations.withColumnRenamed("name", "id").distinct()
tripEdges = tripData\
  .withColumnRenamed("Start Station", "src")\
  .withColumnRenamed("End Station", "dst")


# COMMAND ----------

from graphframes import GraphFrame
stationGraph = GraphFrame(stationVertices, tripEdges)
stationGraph.cache()


# COMMAND ----------

print "Total Number of Stations: " + str(stationGraph.vertices.count())
print "Total Number of Trips in Graph: " + str(stationGraph.edges.count())
print "Total Number of Trips in Original Data: " + str(tripData.count())


# COMMAND ----------

from pyspark.sql.functions import desc
stationGraph.edges.groupBy("src", "dst").count().orderBy(desc("count")).show(10)

# Example 3
                "relationship", "stars", "useful")

    print("Friendship Edges: ")
    friendE.show(8, False)

    #union user vertices together with business vertices.
    all_vertices = userV.union(businessV)

    #union friend Edges together with review edges.
    all_edges = friendE.union(reviewE)

    # Create the GraphFrame object
    g = GraphFrame(all_vertices, all_edges)

    # Make sure GraphFrame is cached in memory in case we want to query/manipulate it multiple times in a row.
    g.cache()

    ### 12.2 #####
    # Find shortest paths from users named 'Eva' to businesses of type
    # 'company' with at least 10 reviews, traversing only 5-star review
    # edges or friendship edges
    paths = g.bfs(" name = 'Eva'", " type = 'company' and review_count >=10",
                  " stars='5' or relationship = 'friend'")
    # Get list of columns
    cols = paths.columns

    # Get the label/name of the last Edge in the path
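    # bfs() returns columns [from, e0, v1, e1, ..., to], so the
    # second-to-last column is always the final edge in the path.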
    last_edge = cols[-2]

    # The resulting paths can be manipulated as normal DataFrames;
    # order by the stars of the last edge (which must be a review-type edge)
    print("BFS :: ")
    paths.orderBy(last_edge + ".stars", ascending=False).show(5, False)
# COMMAND ----------

friends.withColumnRenamed("userID", "id").distinct().show()

# COMMAND ----------

#Build a graph
friendsVertices = friends.withColumnRenamed("userID", "id").distinct()
friendsEdges = friends.withColumnRenamed("userID", "src")\
              .withColumnRenamed("friendID", "dst")

#Build GraphFrame object
from graphframes import GraphFrame

friendsGraph = GraphFrame(friendsVertices, friendsEdges)
friendsGraph.cache()

# COMMAND ----------

#Inspect the GraphFrame
friendsGraph.vertices.show(3, False)
friendsGraph.edges.show(3, False)

# COMMAND ----------

#Count each user's incoming friendships (in-degree) and outgoing friendships (out-degree)
from pyspark.sql.functions import desc
inDeg = friendsGraph.inDegrees
inDeg.orderBy(desc("inDegree")).show(5, False)

outDeg = friendsGraph.outDegrees
outDeg.orderBy(desc("outDegree")).show(5, False)

# COMMAND ----------

tripVertices.cache()

# COMMAND ----------

display(tripVertices)

# COMMAND ----------

display(tripEdges)

# COMMAND ----------

# Build `tripGraph` GraphFrame
# This GraphFrame is built from the vertices and edges of our trips (flights)
tripGraph = GraphFrame(tripVertices, tripEdges)
tripGraph.cache()

#Build `tripGraphPrime` GraphFrame
#This GraphFrame contains a smaller subset of the data, to make it easier to display motifs and subgraphs (below)
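# NOTE: df2 is assumed to be the flight-delay DataFrame prepared earlier in
# the notebook (not shown in this excerpt).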
tripEdgesPrime = df2.select("delay", "src", "dst")
tripGraphPrime = GraphFrame(tripVertices, tripEdgesPrime)

# COMMAND ----------
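
# An illustrative motif query on tripGraphPrime (a sketch, not part of the
# original notebook): find one-stop routes a -> b -> c with the GraphFrames
# find() motif API.
oneStopRoutes = tripGraphPrime.find("(a)-[ab]->(b); (b)-[bc]->(c)")
oneStopRoutes.show(5, False)

# COMMAND ----------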

tripGraph.vertices.count()

# COMMAND ----------

tripGraph.edges.count()

# COMMAND ----------