# Nested helper (closure): max_iter, should_checkpoint, checkpoint_interval,
# and the induce_graph / compute_fiedler_udf / undo_relabel helpers are
# defined in the enclosing scope.
def bipartition(graph: GraphFrame,
                partitions: List[str] = [],
                iteration: int = 0):

        if iteration == max_iter:
            return graph

        # relabel all partitions for scipy.sparse performance
        graph.cache()
        induced = induce_graph(graph, True, partitions)
        induced.cache()

        partition = f"sign_{iteration}"
        fiedler_value = f"fiedler_{iteration}"

        # The Fiedler vector is the eigenvector associated with the second-
        # smallest eigenvalue of the graph Laplacian (the algebraic
        # connectivity of the graph). Spectral clustering is applied
        # recursively by splitting each partition on the sign of its vertices'
        # Fiedler values, which tends to distribute the partitions evenly.
        fiedler = (
            edges_with_partitions(induced, partitions)
            .groupBy(*partitions)
            .apply(compute_fiedler_udf(fiedler_value, partitions))
            .withColumn(partition,
                        F.expr(f"{fiedler_value} >= 0").astype("boolean")))
        vertices = undo_relabel(
            induced.vertices
            .join(fiedler, on=["id"] + partitions, how="left")
            .repartitionByRange(*(partitions + [partition])))

        if should_checkpoint and iteration % checkpoint_interval == 0:
            # Truncate the logical plan to avoid out-of-memory errors from the
            # query plan's string representation, which grows each iteration.
            # The edges are reused every iteration and should not need to be
            # checkpointed.
            vertices.cache()
            parted_graph = GraphFrame(vertices.localCheckpoint(eager=True),
                                      graph.edges)
        else:
            parted_graph = GraphFrame(vertices, graph.edges)

        return bipartition(parted_graph, partitions + [partition],
                           iteration + 1)
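
# A minimal sketch of the per-group computation that compute_fiedler_udf
# wraps, assuming each group's edges arrive as a pandas DataFrame of
# relabeled integer ids; the helper name, signature, and shapes here are
# illustrative, not the original implementation.
import numpy as np
import pandas as pd
from scipy.sparse import coo_matrix
from scipy.sparse.csgraph import laplacian
from scipy.sparse.linalg import eigsh


def compute_fiedler(edges: pd.DataFrame, num_vertices: int) -> np.ndarray:
    # Build a symmetric sparse adjacency matrix from (src, dst) pairs.
    weights = np.ones(len(edges))
    adjacency = coo_matrix((weights, (edges["src"], edges["dst"])),
                           shape=(num_vertices, num_vertices))
    adjacency = (adjacency + adjacency.T).tocsr()
    # The Fiedler vector is the eigenvector paired with the second-smallest
    # eigenvalue of the graph Laplacian; its signs split the graph in two.
    lap = laplacian(adjacency)
    eigenvalues, eigenvectors = eigsh(lap.asfptype(), k=2, which="SM")
    return eigenvectors[:, 1]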
  .csv("/data/bike-data/201508_trip_data.csv")


# COMMAND ----------

stationVertices = bikeStations.withColumnRenamed("name", "id").distinct()
tripEdges = tripData\
  .withColumnRenamed("Start Station", "src")\
  .withColumnRenamed("End Station", "dst")


# COMMAND ----------

from graphframes import GraphFrame
stationGraph = GraphFrame(stationVertices, tripEdges)
stationGraph.cache()


# COMMAND ----------

print "Total Number of Stations: " + str(stationGraph.vertices.count())
print "Total Number of Trips in Graph: " + str(stationGraph.edges.count())
print "Total Number of Trips in Original Data: " + str(tripData.count())


# COMMAND ----------

from pyspark.sql.functions import desc
stationGraph.edges.groupBy("src", "dst").count().orderBy(desc("count")).show(10)

# Example 3
                "relationship", "stars", "useful")

    print("Friendship Edges: ")
    friendE.show(8, False)

    #union user vertices together with business vertices.
    all_vertices = userV.union(businessV)

    #union friend Edges together with review edges.
    all_edges = friendE.union(reviewE)

    # Create the GraphFrame object
    g = GraphFrame(all_vertices, all_edges)

    # Make sure GraphFrame is cached in memory in case we want to query/manipulate it multiple times in a row.
    g.cache()

    ### 12.2 #####
    # Find shortest paths from users named 'Eva' to businesses of type
    # 'company' with at least 10 reviews, traversing only 5-star review
    # edges or friendship edges
    paths = g.bfs(" name = 'Eva'", " type = 'company' and review_count >=10",
                  " stars='5' or relationship = 'friend'")
    # Get list of columns
    cols = paths.columns

    # Get the label/name of the last Edge in the path
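    # bfs() returns columns [from, e0, v1, e1, ..., to], so the
    # second-to-last column is always the final edge in the path.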
    last_edge = cols[-2]

    # The resulting paths can be manipulated as normal DataFrames;
    # order by the stars of the last edge (which must be a review-type edge)
    print("BFS :: ")
    paths.orderBy(last_edge + ".stars", ascending=False).show(5, False)
# COMMAND ----------

friends.withColumnRenamed("userID", "id").distinct().show()

# COMMAND ----------

#Build a graph
friendsVertices = friends.withColumnRenamed("userID", "id").distinct()
friendsEdges = friends.withColumnRenamed("userID", "src")\
              .withColumnRenamed("friendID", "dst")

#Build GraphFrame object
from graphframes import GraphFrame

friendsGraph = GraphFrame(friendsVertices, friendsEdges)
friendsGraph.cache()

# COMMAND ----------

#Inspect the GraphFrame
friendsGraph.vertices.show(3, False)
friendsGraph.edges.show(3, False)

# COMMAND ----------

#Count each user's incoming friendships (in-degree) and outgoing friendships (out-degree)
from pyspark.sql.functions import desc
inDeg = friendsGraph.inDegrees
inDeg.orderBy(desc("inDegree")).show(5, False)

outDeg = friendsGraph.outDegrees
outDeg.orderBy(desc("outDegree")).show(5, False)

# COMMAND ----------

tripVertices.cache()

# COMMAND ----------

display(tripVertices)

# COMMAND ----------

display(tripEdges)

# COMMAND ----------

# Build `tripGraph` GraphFrame
# This GraphFrame is built from the vertices and edges of our trips (flights)
tripGraph = GraphFrame(tripVertices, tripEdges)
tripGraph.cache()

#Build `tripGraphPrime` GraphFrame
#This GraphFrame contains a smaller subset of the data, to make it easier to display motifs and subgraphs (below)
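# NOTE: df2 is assumed to be the flight-delay DataFrame prepared earlier in
# the notebook (not shown in this excerpt).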
tripEdgesPrime = df2.select("delay", "src", "dst")
tripGraphPrime = GraphFrame(tripVertices, tripEdgesPrime)

# COMMAND ----------
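
# An illustrative motif query on tripGraphPrime (a sketch, not part of the
# original notebook): find one-stop routes a -> b -> c with the GraphFrames
# find() motif API.
oneStopRoutes = tripGraphPrime.find("(a)-[ab]->(b); (b)-[bc]->(c)")
oneStopRoutes.show(5, False)

# COMMAND ----------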

tripGraph.vertices.count()

# COMMAND ----------

tripGraph.edges.count()

# COMMAND ----------