Example #1
0
def sample_induced_subgraph(
    graph: GraphFrame,
    seed: int,
    k_hops: int = 2,
    pr_alpha: float = 0.85,
    pr_tol: float = 0.001,
) -> GraphFrame:
    """Extract the neighbourhood of ``seed`` and run PageRank on it.

    All directed paths of ``k_hops`` edges that pass through the seed vertex
    (at a fixed position, see below) are located with a motif query. Every
    vertex on those paths is kept, together with each edge of ``graph`` whose
    two endpoints both belong to that vertex set.

    Args:
        graph: Graph to sample from.
        seed: Id of the vertex the sample is built around. It is interpolated
            into a SQL filter as-is.
        k_hops: Number of edges in each matched path; must not exceed 3.
        pr_alpha: Forwarded as the first positional argument of
            ``GraphFrame.pageRank``.
        pr_tol: Forwarded as ``tol`` to ``GraphFrame.pageRank``.

    Returns:
        Whatever ``pageRank`` returns for the induced subgraph.
    """
    assert k_hops <= 3

    # Label path positions a, b, c, ... and chain consecutive labels into a
    # motif such as "(a)-[]->(b);(b)-[]->(c)".
    labels = string.ascii_letters
    hop_patterns = [
        f"({labels[i]})-[]->({labels[i + 1]})" for i in range(k_hops)
    ]
    all_paths = graph.find(";".join(hop_patterns))

    # Position of the seed on the path: 'b' for 1 and 2 hops, 'c' for 3 hops.
    # For odd lengths this lands just past the midpoint; the original author
    # expected that to raise the seed's PageRank score on 1-hop networks.
    seed_label = labels[k_hops - k_hops // 2]
    centered_paths = all_paths.where(f"{seed_label}.id = {seed}")

    # Stack the vertex struct found at every path position, then flatten the
    # struct into columns and de-duplicate.
    stacked = centered_paths.selectExpr(f"{labels[0]} as v")
    for position in range(1, k_hops + 1):
        next_column = centered_paths.selectExpr(f"{labels[position]} as v")
        stacked = stacked.union(next_column)
    vertices = stacked.select("v.*").distinct()
    vertices.cache()

    # Two right joins against the vertex set (first on src, then on dst)
    # followed by a null filter: only edges with both endpoints in the sample
    # survive.
    src_matched = graph.edges.join(
        vertices,
        on=graph.edges["src"] == vertices["id"],
        how="right",
    ).select("src", "dst")
    both_matched = src_matched.join(
        vertices,
        on=graph.edges["dst"] == vertices["id"],
        how="right",
    ).select("src", "dst")
    edges = both_matched.where("src is not null AND dst is not null")

    # NOTE(review): GraphFrames names pageRank's first parameter
    # resetProbability; confirm that 0.85 is really meant as the reset
    # probability and not as the damping factor (1 - resetProbability).
    induced = GraphFrame(vertices, edges)
    return induced.pageRank(pr_alpha, tol=pr_tol)
  .where("src = 'Townsend at 7th' OR dst = 'Townsend at 7th'")\
  .groupBy("src", "dst").count()\
  .orderBy(desc("count"))\
  .show(10)


# COMMAND ----------

# Keep only the edges that start or end at the "Townsend at 7th" station and
# pair them with the complete vertex set of the station graph.
townAnd7thEdges = stationGraph.edges.where(
    "src = 'Townsend at 7th' OR dst = 'Townsend at 7th'"
)
subgraph = GraphFrame(stationGraph.vertices, townAnd7thEdges)


# COMMAND ----------

# Triangle motif: three vertices joined by the edges a->b, b->c and c->a.
motifs = stationGraph.find("(a)-[ab]->(b); (b)-[bc]->(c); (c)-[ca]->(a)")


# COMMAND ----------

from pyspark.sql.functions import expr

# Among all triangles, show the one a single bike completed fastest:
#   * parse the start time of each leg,
#   * require the same bike on all three legs and distinct consecutive stations,
#   * require the legs to happen in order ab < bc < ca,
#   * sort by the seconds elapsed between the first and last leg's start.
(
    motifs.selectExpr(
        "*",
        "to_timestamp(ab.`Start Date`, 'MM/dd/yyyy HH:mm') as abStart",
        "to_timestamp(bc.`Start Date`, 'MM/dd/yyyy HH:mm') as bcStart",
        "to_timestamp(ca.`Start Date`, 'MM/dd/yyyy HH:mm') as caStart",
    )
    .where("ca.`Bike #` = bc.`Bike #`")
    .where("ab.`Bike #` = bc.`Bike #`")
    .where("a.id != b.id")
    .where("b.id != c.id")
    .where("abStart < bcStart")
    .where("bcStart < caStart")
    .orderBy(expr("cast(caStart as long) - cast(abStart as long)"))
    .selectExpr("a.id", "b.id", "c.id", "ab.`Start Date`", "ca.`End Date`")
    .limit(1)
    .show(1, False)
)
Example #3
0
def process_graphs(sc, in_dir, partitions):
    """
    Build (or reload) the chem2bio2rdf graph and run a few sample queries.

    When both ``<in_dir>/vertices`` and ``<in_dir>/edges`` exist they are read
    back as parquet, repartitioned and cached. Otherwise every file found
    under ``<in_dir>/drugbank`` and ``<in_dir>/pubchem`` is handed to
    ``process_chunk``; the resulting triples (objects with ``S``, ``P`` and
    ``O`` attributes) are turned into vertex and edge DataFrames, which are
    then written to disk as parquet, replacing any leftover output.

    The graph is then exercised with demonstration queries whose results and
    timings are printed: schemas, edge count, highest in-degrees, number of
    PDB_ID edges, two SQL lookups over the edge table (names and SMILES) and
    one motif query.

    Args:
        sc (SparkContext): Access to the Spark compute fabric.
        in_dir (str): Path to Chemotext data storage for raw chem2bio2rdf N3 RDF models.
        partitions (int): Number of data partitions.

    Returns:
        GraphFrame: The graph built from the vertex and edge DataFrames.
    """
    sql_ctx = SQLContext(sc)

    source_dirs = [os.path.join(in_dir, name) for name in ["drugbank", "pubchem"]]

    # Local paths are used for existence checks; file:// URIs go to Spark.
    vertex_dir = os.path.join(in_dir, "vertices")
    edge_dir = os.path.join(in_dir, "edges")
    vertex_uri = "file://{0}".format(vertex_dir)
    edge_uri = "file://{0}".format(edge_dir)

    triples = vertices = edges = graph = None

    have_saved_graph = os.path.exists(vertex_dir) and os.path.exists(edge_dir)

    if not have_saved_graph:
        print("Constructing vertices and edges from chem2bio2rdf data sources")

        files = [
            os.path.join(source_dir, file_name)
            for source_dir in source_dirs
            for file_name in os.listdir(source_dir)
        ]
        # NOTE(review): triples is not cached, so it is presumably recomputed
        # for each DataFrame derived from it and again for the count below.
        triples = sc.parallelize(files, numSlices=partitions).flatMap(
            lambda n3_file: process_chunk(n3_file))

        # Each triple contributes its subject and its object as a vertex row.
        vertex_rows = triples.flatMap(lambda d: [
            (trim_uri(d.S), "attr0"),
            (trim_uri(d.O), "attr1")])
        vertices = sql_ctx.createDataFrame(
            data=vertex_rows, schema=["id", "attr"]).cache()

        # Each triple becomes one subject -> object edge labelled by predicate.
        edge_rows = triples.map(lambda d: (
            trim_uri(d.S),
            trim_uri(d.O),
            trim_uri(d.P)))
        edges = sql_ctx.createDataFrame(
            data=edge_rows, schema=["src", "dst", "relationship"]).cache()

        print("Triples: {0}".format(triples.count()))

        # Only one of the two outputs may be present; clear it before writing.
        for stale_dir in (vertex_dir, edge_dir):
            if os.path.exists(stale_dir):
                shutil.rmtree(stale_dir)
        vertices.write.parquet(vertex_uri)
        edges.write.parquet(edge_uri)

    else:
        print("Loading existing vertices: {0}".format(vertex_uri))
        began = time.time()
        vertices = sql_ctx.read.parquet(vertex_uri).repartition(partitions).cache()
        print("Elapsed time for loading precomputed vertices: {0} seconds.".format(
            time.time() - began))

        print("Loading existing edges: {0}".format(edge_uri))
        began = time.time()
        edges = sql_ctx.read.parquet(edge_uri).repartition(partitions).cache()
        print("Elapsed time for loading precomputed edges: {0} seconds.".format(
            time.time() - began))

    if vertices is not None and edges is not None:
        began = time.time()
        vertices.printSchema()
        edges.printSchema()
        print("Elapsed time for print schema: {0} seconds.".format(
            time.time() - began))

        began = time.time()
        edge_total = edges.count()
        print(" Total of {0} edges.".format(edge_total))
        print("Elapsed time for count edges: {0}".format(time.time() - began))

        graph = GraphFrame(vertices, edges)

        print("Query: Get in-degree of each vertex.")
        began = time.time()
        top_in_degrees = graph.inDegrees.sort("inDegree", ascending=False)
        top_in_degrees.show(n=3, truncate=False)
        print("Elapsed time for computing in-degree: {0} seconds.".format(
            time.time() - began))

        began = time.time()
        pdb_edge_total = graph.edges.filter(
            "relationship LIKE '%resource/PDB_ID%' ").count()
        print("Query: Number of protein database relationships: {0}".format(
            pdb_edge_total))
        print("Elapsed time for edge filter and count query: {0} seconds.".format(
            time.time() - began))

        # Expose the edge DataFrame to SQL under the table name "edges".
        edges.registerTempTable("edges")

        drug_names = sql_ctx.sql("""
           SELECT substring(src, length(src)-7, 6) as Drug,
                  dst as Name
           FROM edges
           WHERE relationship LIKE '%resource/Name%'
        """)
        drug_names.show(n=3, truncate=False)

        began = time.time()
        compound_smiles = sql_ctx.sql("""
           SELECT substring(src, length(src)-7, 6) as Compound,
                  dst as SMILES
           FROM edges
           WHERE relationship LIKE '%open%_smiles%'
        """)
        compound_smiles.show(n=3, truncate=False)
        print("Elapsed time for SQL query: {0} seconds.".format(
            time.time() - began))

        began = time.time()
        pdb_links = graph.find("()-[Drug2PDB]->()").filter(
            "Drug2PDB.relationship LIKE '%/PDB_ID' ")
        pdb_links.show(n=3, truncate=False)
        print("Elapsed time for graph motif query: {0} seconds.".format(
            time.time() - began))

    return graph
Example #4
0
    paths = g.bfs(" name = 'Eva'", " type = 'company' and review_count >=10",
                  " stars='5' or relationship = 'friend'")
    # Get list of columns
    cols = paths.columns

    # Get the label/name of the last Edge in the path
    last_edge = cols[len(cols) - 2]

    # The resulting paths can be manipulated as normal DataFrames
    # order by the stars of the last edge (which must be a "reviewed"-type edge)
    print("BFS :: ")
    paths.orderBy(last_edge + ".stars", ascending=False).show(5, False)

    ###### 12.3.1   ###
    query = "(u)-[e1]->(b1); (u)-[e2] -> (b2)"
    results = g.find(query)

    out = results.filter(
        " b1.type != 'user' and b2.type != 'user' and e1.stars = '5' and  e2.stars = '5' and b1.city != b2.city "
    )
    grouped = out.groupBy("u").count().orderBy("count",
                                               ascending=False).limit(5)
    #print(grouped.count())
    print(
        "12.3.1 --------------------------------12.3.1--------------------------------------------- "
    )
    grouped.show(10, False)

    ##### 12.3.2 ####
    query2 = "(u1)-[e1]->(b); (u2)-[e2] -> (b); (u1)-[e3]->(u2)"
    results2 = g.find(query2)
    ],
    ["src", "dst", "relationship"],
)
# Create the graph from the vertex DataFrame `v` and edge DataFrame `e`
# (both built earlier in the script) and print both tables.
g = GraphFrame(v, e)
g.vertices.show()
g.edges.show()

# Find the youngest user's age in the graph.
# (groupBy() with no columns aggregates over the whole vertex table.)
g.vertices.groupBy().min("age").show()

# Count the number of "follows" in the graph.
numFollows = g.edges.filter("relationship = 'follow'").count()

# Motif finding: two-edge chains a -> b -> c whose endpoints differ.
# NOTE(review): this result is overwritten by the next assignment without
# ever being shown or used.
motifs = g.find("(a)-[e]->(b); (b)-[e2]->(c)").filter("a.id != c.id")

# Motif finding: pairs of vertices connected by edges in both directions.
motifs = g.find("(a)-[e]->(b); (b)-[e2]->(a)")
motifs.show()
# More complex queries: restrict the pairs above to those where b is over 30.
motifs.filter("b.age > 30").show()

# Subgraph: keep vertices older than 30 and "friend" edges, then drop any
# vertex that is left without an edge.
print("\ngenerate subgraph --- ")
g1 = (g.filterVertices("age > 30").filterEdges(
    "relationship = 'friend'").dropIsolatedVertices())
g1.vertices.show()
g1.edges.show()

# Breadth-first search (BFS)
print("\n BFS")
paths = g.bfs(
 # find all triangles, which 
 # might have duplicates, since
 # a triangle can be represented
 # in 6 different ways: given 3 
 # vertices {a, b, c} of a triangle, 
 # it can be represented by the 
 # following 6 representations:
 #
 #   a -> b -> c -> a
 #   a -> c -> b -> a
 #   b -> a -> c -> b
 #   b -> c -> a -> b
 #   c -> a -> b -> c
 #   c -> b -> a -> c
 #-------------------------------
 motifs = graph.find("(a)-[]->(b); (b)-[]->(c); (c)-[]->(a)")
 print("motifs=", motifs)
 print("motifs.count()=", motifs.count())
 motifs.show(1000, truncate=False)
 
 #-------------------------------
 # now remove duplicate triangles
 # keep only one representation of a triangle 
 # {a, b, c} where a > b > c
 #-------------------------------
 unique_triangles = motifs[ (motifs.a > motifs.b) & (motifs.b > motifs.c)]
 print("unique_triangles=", unique_triangles)
 unique_triangles.show(truncate=False)
 
 # done!
 spark.stop()
Example #7
0
                           ("e", "f", "follow"), ("e", "d", "friend"),
                           ("d", "a", "friend"), ("a", "e", "friend")],
                          ["src", "dst", "relationship"])
# Create a GraphFrame; under the hood it is essentially two DataFrames
g = GraphFrame(v, e)

# Access the two underlying DataFrames
g.vertices
g.edges

## In-degree and out-degree of every vertex
g.inDegrees
g.outDegrees

# Motif syntax: match chains of three consecutive edges a -> b -> c -> d
chain4 = g.find("(a)-[ab]->(b); (b)-[bc]->(c); (c)-[cd]->(d)")
# chain4.show()
# g.find("(c)-[m]->()").show()


# Query on sequence, with state (cnt)
#  (a) State update: bump the running count when the next edge is a "friend".
def sumFriends(cnt, relationship):
    """Return ``cnt + 1`` where ``relationship`` equals "friend", else ``cnt``."""
    return F.when(relationship == "friend", cnt + 1).otherwise(cnt)


#  (b) Fold the update over the motif's three edges, starting from zero.
condition = reduce(
    lambda running, edge_name: sumFriends(running, F.col(edge_name).relationship),
    ["ab", "bc", "cd"],
    F.lit(0),
)
#  (c) Keep only the chains that contain at least two "friend" edges.
chainWith2Friends2 = chain4.where(condition >= 2)
# chainWith2Friends2.show()

# NOTE(review): GraphFrames' default connectedComponents implementation needs
# a Spark checkpoint directory; confirm one is configured before this line.
result = g.connectedComponents()
Example #8
0
# Directed edge list as (src, dst) pairs; `vertices` is built earlier in the
# script.
edges = sqlContext.createDataFrame([(1, 2), (1, 3), (1, 4), (2, 3), (2, 4),
                                    (3, 1), (4, 1), (4, 3)], ["src", "dst"])

graph = GraphFrame(vertices, edges)
# The bare triple-quoted strings below are no-op expression statements for
# Python; presumably the tool behind `doc` renders them as section text
# (TODO confirm).
# NOTE(review): display_graph is defined further down in this file, after
# these calls; run top to bottom as a plain script they would raise
# NameError. Confirm the intended execution order.
""" ## Show Vertices """
display_graph(graph.vertices)
doc.show()
""" ## Show Edges """
display_graph(graph.edges)
doc.show()
""" ## Show Degrees (Sum of in and out degrees by node) """
display_graph(graph.degrees)
doc.show()
# NOTE(review): the heading says a->b->c, but the motif below matches pairs
# connected in both directions (a->b and b->a). Confirm which is intended.
""" Show all motifs which satisfy a->b->c """
display_graph(graph.find("(a)-[e]->(b); (b)-[e2]->(a)"))


def display_graph(item):
    """Capture what ``item.show()`` prints and pass it to ``redirect_to_handout``.

    Args:
        item: Any object with a ``show()`` method that prints to standard
            output, such as the DataFrames passed in by the callers in this
            script.
    """
    # Redirects Standard Out to the document
    with io.StringIO() as buf, redirect_stdout(buf):
        # Show the object that was passed in. The previous version ignored
        # `item` and always printed the same hard-coded motif query, so every
        # call rendered identical output.
        item.show()
        # Read the buffer before the `with` block closes it.
        redirect_to_handout(buf.getvalue())


# Flush the output produced so far (the motif table queued before the
# display_graph definition).
doc.show()
""" ## Get pagerank using m=0.15 and tolerance=0.01
"""
# Run PageRank until convergence within `tol`; the "m" in the heading above
# is the value passed as resetProbability.
pr = graph.pageRank(resetProbability=0.15, tol=0.01)
""" ### look at the pagerank score for every vertex """
# pr is the result of pageRank; its vertex table is handed to display_graph.
display_graph(pr.vertices)