def bfs_recursive_worker(annotate,
                         df_start,
                         df_end,
                         nodes,
                         edges,
                         list_results,
                         index,
                         num_hops):
    """
    Recursively find shortest paths from the set of current source nodes
    (``df_start``) to the set of current target nodes (``df_end``) with an
    increasing number of hops.  Each level runs one GraphFrame BFS and then
    recurses twice: a 'left' branch that removes the already-matched sources
    and a 'right' branch that removes the already-matched targets, following
    the tree below.  Refer to the code for more details.

    .. code-block:: none

                                  root
                                /     \\
                               /       \\
                              /         \\
                             /           \\
                           l1            r1          one hop
                          /  \\           / \\
                         /    \\         /   \\
                        /      \\       /     \\
                      l2        r2    l2     r2      two hops
                     /\\        /\\     /\\     /\\
                    /  \\      /  \\   /  \\   /  \\
                   /    \\    /    \\ /    \\ /    \\
                   .     .   .     ..     ..     .   three hops
                   .     .   .     ..     ..     .   .
                   .     .   .     ..     ..     .   .
                   .     .   .     ..     ..     .   four hops

    :param annotate: indicate 'left' or 'right' branch
    :type annotate: str
    :param df_start: the current set of source nodes. The schema must be:

        .. code-block:: none

            |-- id: string (nullable = true)
    :type df_start: pyspark.sql.DataFrame
    :param df_end: the current set of target nodes. The schema must be:

        .. code-block:: none

            |-- id: string (nullable = true)
    :type df_end: pyspark.sql.DataFrame
    :param nodes: all nodes for constructing the GraphFrame. The schema is:

        .. code-block:: none

            |-- id: string (nullable = true)
            |-- Category: string (nullable = true)
    :type nodes: pyspark.sql.DataFrame
    :param edges: all edges for constructing the GraphFrame

        .. code-block:: none

            |-- src: string (nullable = true)
            |-- dst: string (nullable = true)
            |-- relationship: string (nullable = true)
            |-- Type: string (nullable = true)
            |-- Source_Type: string (nullable = true)
            |-- Target_Type: string (nullable = true)
    :type edges: pyspark.sql.DataFrame
    :param list_results:
        accumulator to store the results of shortest paths.  A k-hop result
        dataframe follows the GraphFrame BFS column layout

        .. code-block:: none

            from, e0, v1, e1, ..., v{k-1}, e{k-1}, to

        (e.g. ``from, e0, to`` for one hop, up to
        ``from, e0, v1, e1, v2, e2, v3, e3, to`` for four hops), where every
        vertex struct (``from``, ``v*``, ``to``) carries

        .. code-block:: none

            |-- id: string (nullable = true)
            |-- Category: string (nullable = true)
            |-- <annotate>Attribute<index>: string (nullable = true)

        (the branch-marker column added below, e.g. ``rightAttribute4``) and
        every edge struct (``e*``) carries

        .. code-block:: none

            |-- src: string (nullable = true)
            |-- dst: string (nullable = true)
            |-- relationship: string (nullable = true)
            |-- Type: string (nullable = true)
            |-- Source_Type: string (nullable = true)
            |-- Target_Type: string (nullable = true)
    :type list_results: list of pyspark.sql.DataFrame
    :param index: the current maximum number of hops
    :type index: int
    :param num_hops: the maximum allowed number of hops
    :type num_hops: int
    :return: None
    :rtype: None
    """
    from graphframes import GraphFrame

    # Recursion bound: stop once the requested search depth is exceeded.
    if index > num_hops:
        return

    # Tag the current sources/targets with a per-call marker column
    # (<annotate>Attribute<index>) so the BFS filter expressions below can
    # recognise them.  ``lit`` is assumed to be pyspark.sql.functions.lit,
    # imported at module level (not visible in this chunk).
    df_start = df_start.withColumn(
        annotate + 'Attribute' + str(index), lit('start'))
    df_end = df_end.withColumn(
        annotate + 'Attribute' + str(index), lit('end'))
    # Left-join the marker onto the full node set; nodes that are neither a
    # source nor a target get NULL in the marker column and therefore match
    # neither BFS endpoint expression.
    nodes_new = (nodes
                 .join(df_start.unionAll(df_end).distinct(),
                       'id',
                       'left_outer'))[['id',
                                       'Category',
                                       annotate + 'Attribute' + str(index)]]
    nodes_new.persist()

    coi_graph = GraphFrame(nodes_new, edges)
    # Find shortest paths from any 'start'-tagged node to any 'end'-tagged
    # node, up to num_hops edges long.
    results = coi_graph.bfs(
        "%sAttribute%d='start'" % (annotate, index),
        "%sAttribute%d='end'" % (annotate, index),
        maxPathLength=num_hops)

    # NOTE(review): count() materialises the BFS result before persist() is
    # marked; the persisted frame is then kept alive for the accumulator.
    if results.count() >= 1:
        results.persist()
        list_results.append(results)

        if index < num_hops:
            # Endpoints already covered by a shortest path are removed from
            # the next level's search sets, so deeper recursion only looks
            # for paths between not-yet-connected nodes.
            start_remove = results[['from']] \
                .withColumn('id', results['from'].id)[['id']].distinct()
            end_remove = results[['to']].withColumn('id', results['to'].id)[
                ['id']].distinct()

            new_start = df_start[['id']].subtract(start_remove).distinct()
            new_end = df_end[['id']].subtract(end_remove).distinct()

            # A k-hop BFS result has 2k+1 columns (from, e0, v1, ..., to),
            # so this recovers the actual hop count of the paths just found.
            index = int(len(results.columns) / 2)

            # Left branch: unmatched sources vs. all current targets.
            bfs_recursive_worker('left',
                                 new_start,
                                 df_end[['id']],
                                 nodes,
                                 edges,
                                 list_results,
                                 index + 1,
                                 num_hops)
            # Right branch: all current sources vs. unmatched targets.
            bfs_recursive_worker('right',
                                 df_start[['id']],
                                 new_end,
                                 nodes,
                                 edges,
                                 list_results,
                                 index + 1,
                                 num_hops)
# NOTE(review): notebook fragment — ``stationGraph``, ``inDeg``, ``desc``,
# ``spark``, ``GraphFrame``, ``stationVertices`` and ``tripEdges`` are
# defined in cells outside this chunk.

# Out-degree per vertex (DataFrame with columns: id, outDegree);
# show the five busiest origin stations.
outDeg = stationGraph.outDegrees
outDeg.orderBy(desc("outDegree")).show(5, False)


# COMMAND ----------

# Ratio of incoming to outgoing trips per station; high values are mostly
# trip sinks, low values mostly trip sources.
degreeRatio = inDeg.join(outDeg, "id")\
  .selectExpr("id", "double(inDegree)/double(outDegree) as degreeRatio")
degreeRatio.orderBy(desc("degreeRatio")).show(10, False)
degreeRatio.orderBy("degreeRatio").show(10, False)


# COMMAND ----------

# Breadth-first search: shortest paths (at most 2 edges) between two stations.
stationGraph.bfs(fromExpr="id = 'Townsend at 7th'",
  toExpr="id = 'Spear at Folsom'", maxPathLength=2).show(10)


# COMMAND ----------

# connectedComponents() requires a checkpoint directory to be set.
spark.sparkContext.setCheckpointDir("/tmp/checkpoints")


# COMMAND ----------

# Connected components on a 10% edge sample to keep the job small.
minGraph = GraphFrame(stationVertices, tripEdges.sample(False, 0.1))
cc = minGraph.connectedComponents()
# COMMAND ----------
    # NOTE(review): interior of a function whose def lies outside this
    # chunk; ``v``, ``e`` and ``spark`` are presumably built above — verify.
    # Tail of the expected edge listing printed by the preceding step:
    # |  b|  e|      follow|
    # |  c|  e|      follow|
    # |  e|  f|      follow|
    # +---+---+------------+

    # Step-3: Create a GraphFrame. Using GraphFrames API, a graph
    # is built as an instance of a GraphFrame, which is a pair of
    # vertices (as `v`) and edges (as `e`):
    graph = GraphFrame(v, e)
    print("graph=", graph)
    # Expected repr:
    # GraphFrame(v:[id: string, name: string ... 1 more field],
    #            e:[src: string, dst: string ... 1 more field])

    #==============
    # BFS Algorithm
    #==============
    # The following code snippets use BFS to find a path between the
    # vertex with name "Alice" and any vertex with age < 27.
    #
    # Search from "Alice" for users of age < 27.
    paths = graph.bfs("name = 'Alice'", "age < 27")
    paths.show()

    # Same search, additionally constrained: traverse only 'follow' edges
    # and allow paths of at most 4 edges.
    paths2 = graph.bfs("name = 'Alice'", "age > 30",\
               edgeFilter="relationship == 'follow'", maxPathLength=4)
    paths2.show()

    # done!
    spark.stop()
Esempio n. 4
0
    # NOTE(review): interior of a function whose def lies outside this
    # chunk; ``userV``, ``businessV``, ``friendE`` and ``reviewE`` are
    # presumably built above — verify.

    # Union user vertices together with business vertices.
    all_vertices = userV.union(businessV)

    # Union friend edges together with review edges.
    all_edges = friendE.union(reviewE)

    # Create the GraphFrame object
    g = GraphFrame(all_vertices, all_edges)

    # Make sure GraphFrame is cached in memory in case we want to query/manipulate it multiple times in a row.
    g.cache()

    ### 12.2 #####
    # Find shortest paths from a user named 'Eva' to vertices of type
    # 'company' with at least 10 reviews, traversing only 5-star review
    # edges or 'friend' edges (third argument is the BFS edge filter).
    paths = g.bfs(" name = 'Eva'", " type = 'company' and review_count >=10",
                  " stars='5' or relationship = 'friend'")
    # Get list of columns
    cols = paths.columns
    # The BFS layout is from, e0, v1, ..., to, so the second-to-last
    # column is the label/name of the last edge in the path.
    last_edge = cols[len(cols) - 2]

    # The resulting paths can be manipulated as normal DataFrames:
    # order by the stars of the last edge (which must be a review-type edge).
    print("BFS :: ")
    paths.orderBy(last_edge + ".stars", ascending=False).show(5, False)

    ###### 12.3.1   ###
    # Motif search: one user ``u`` pointing at two businesses b1 and b2.
    query = "(u)-[e1]->(b1); (u)-[e2] -> (b2)"
    results = g.find(query)
Esempio n. 5
0
# NOTE(review): script fragment — ``g`` (a GraphFrame) and ``desc`` (from
# pyspark.sql.functions) are defined outside this chunk.

# Motif search: pairs of vertices that point at each other.
motifs = g.find("(a)-[e]->(b); (b)-[e2]->(a)")
motifs.show()
# More complex queries
motifs.filter("b.age > 30").show()

print("\ngenerate subgraph --- ")
# Subgraph: keep vertices with age > 30 and 'friend' edges, then drop
# vertices left without any edge.
g1 = (g.filterVertices("age > 30").filterEdges(
    "relationship = 'friend'").dropIsolatedVertices())
g1.vertices.show()
g1.edges.show()

# Breadth-first search (BFS)
print("\n BFS")
# BUG FIX: DataFrame.show() returns None, so the original
# ``paths = g.bfs(...).show()`` bound None to ``paths``.  Keep the
# DataFrame and display it separately so ``paths`` remains usable.
paths = g.bfs(
    "name = 'Esther'",
    "age < 32",
    edgeFilter="relationship != 'friend'",
    maxPathLength=3,
)
paths.show()

# In-Degree and Out-Degree Metrics
print("\n Degree--------------")
inDeg = g.inDegrees
# NOTE(review): the two orderBy results below are discarded — orderBy is a
# lazy transformation, so these lines have no effect as written.
inDeg.orderBy(desc("inDegree"))
outDeg = g.outDegrees
outDeg.orderBy(desc("outDegree"))
# BUG FIX: same show()-returns-None issue — keep the joined DataFrame in
# ``degreeRatio`` and call show() as a separate action.
degreeRatio = (inDeg.join(outDeg, "id").selectExpr(
    "*", "double(inDegree)/ double(outDegree) as degreeRatio").orderBy(
        desc("degreeRatio")))
degreeRatio.show(10, False)

# print("\n strong connected component")
# result = g.stronglyConnectedComponents(maxIter=10)
Esempio n. 6
0
# Build a GraphFrame from a delimited edge-list file given on the command
# line, then BFS between two named vertices.
# Add the GraphFrames package to spark-submit (must be set before the
# SparkContext is created).
import os
os.environ['PYSPARK_SUBMIT_ARGS'] = '--packages graphframes:graphframes:0.7.0-spark2.4-s_2.11'
import sys
from functools import reduce
from pyspark.sql.functions import col, lit, when
import pyspark
from graphframes.examples import Graphs
from graphframes import GraphFrame
import config

sc = pyspark.SparkContext()
sqlContext = pyspark.SQLContext(sc)
inputFile = sys.argv[1]
# g = Graphs(sqlContext).friends()  # Get example graph
df = sqlContext.read.format("csv").option("delimiter", config.delimiter).load(inputFile)
# Rename columns to something decent.
df = df.withColumnRenamed("_c0", "src")\
.withColumnRenamed("_c1", "dst")\
.withColumnRenamed("_c2", "weight")
df.show(5)

# Vertices are the distinct node codes appearing in either endpoint column;
# each gets its code as both id and name (CSV loads both as strings).
aggcodes = df.select("src","dst").rdd.flatMap(lambda x: x).distinct()
vertices = aggcodes.map(lambda x: (x, x)).toDF(["id","name"])

edges = df.select("src", "dst")
graph = GraphFrame(vertices, edges)

# BUG FIX: the target expression compared the string column ``name`` to the
# integer literal 100 (inconsistent with the quoted source expression and
# reliant on Spark's implicit cast, which turns non-numeric names into
# NULL).  Quote the literal so both endpoints are matched as strings.
paths = graph.bfs("name = '500'", "name = '100'")
paths.show()
Esempio n. 7
0
# Breadth-first search demo:
# 1- Builds a graph using the GraphFrame package
# 2- Applies BFS to find the shortest paths from one vertex to another

import sys, os
from pyspark.sql import SparkSession, SQLContext
from graphframes import GraphFrame

if __name__ == '__main__':
    spark = SparkSession.builder.appName('BFS').getOrCreate()

    # 1- Load the sample data: six people as vertices, "follow" relations
    # as directed edges.
    vertex_rows = [
        ("a", "Alice", 30),
        ("b", "Bob", 31),
        ("c", "Charlie", 32),
        ("d", "David", 23),
        ("e", "Emma", 24),
        ("f", "Frank", 26),
    ]
    edge_rows = [
        ("a", "b", "follow"),
        ("b", "c", "follow"),
        ("c", "d", "follow"),
        ("d", "e", "follow"),
        ("b", "e", "follow"),
        ("c", "e", "follow"),
        ("e", "f", "follow"),
    ]
    v = spark.createDataFrame(vertex_rows, ["id", "name", "age"])
    e = spark.createDataFrame(edge_rows, ["src", "dst", "relationship"])
    e.show()

    # 2- Create the graph.
    # Fix for a missing-jars error, see:
    # https://blog.csdn.net/qq_15098623/article/details/91533349
    graph = GraphFrame(v, e)
    print("graph=", graph)

    # 3- BFS from the vertex named "Alice" to any vertex with age < 27.
    paths = graph.bfs("name = 'Alice'", "age < 27")
    paths.show()

    spark.stop()