import string

from graphframes import GraphFrame


def sample_induced_subgraph(
    graph: GraphFrame,
    seed: int,
    k_hops: int = 2,
    pr_alpha: float = 0.85,
    pr_tol: float = 0.001,
) -> GraphFrame:
    assert k_hops <= 3

    # build a motif for finding a k-hop neighborhood localized to a node
    symbols = string.ascii_letters
    motif_edges = [(symbols[i], symbols[i + 1]) for i in range(k_hops)]
    paths = graph.find(";".join(
        [f"({e1})-[]->({e2})" for e1, e2 in motif_edges]))

    # the center of an odd-length path has an index above the midpoint;
    # this should increase the seed article's pagerank score on 1-hop networks
    centered_paths = paths.where(f"{symbols[k_hops - k_hops // 2]}.id = {seed}")

    # collect every vertex that appears anywhere on a centered path
    vertices = centered_paths.selectExpr(f"{symbols[0]} as v")
    for i in range(1, k_hops + 1):
        vertices = vertices.union(
            centered_paths.selectExpr(f"{symbols[i]} as v"))
    vertices = vertices.select("v.*").distinct()
    vertices.cache()

    # keep only edges whose endpoints both fall inside the sampled vertex set
    edges = (graph.edges.join(
        vertices, on=graph.edges["src"] == vertices["id"],
        how="right").select("src", "dst").join(
            vertices, on=graph.edges["dst"] == vertices["id"],
            how="right").select(
                "src", "dst").where("src is not null AND dst is not null"))

    return GraphFrame(vertices, edges).pageRank(pr_alpha, tol=pr_tol)
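# Hypothetical usage sketch (not part of the original module): `article_graph`
# is a placeholder GraphFrame whose vertex ids are integers, since the motif
# filter above compares `id` against the integer seed without quoting.
ranked = sample_induced_subgraph(article_graph, seed=42, k_hops=2)
ranked.vertices.orderBy("pagerank", ascending=False).show(10)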
.where("src = 'Townsend at 7th' OR dst = 'Townsend at 7th'")\ .groupBy("src", "dst").count()\ .orderBy(desc("count"))\ .show(10) # COMMAND ---------- townAnd7thEdges = stationGraph.edges\ .where("src = 'Townsend at 7th' OR dst = 'Townsend at 7th'") subgraph = GraphFrame(stationGraph.vertices, townAnd7thEdges) # COMMAND ---------- motifs = stationGraph.find("(a)-[ab]->(b); (b)-[bc]->(c); (c)-[ca]->(a)") # COMMAND ---------- from pyspark.sql.functions import expr motifs.selectExpr("*", "to_timestamp(ab.`Start Date`, 'MM/dd/yyyy HH:mm') as abStart", "to_timestamp(bc.`Start Date`, 'MM/dd/yyyy HH:mm') as bcStart", "to_timestamp(ca.`Start Date`, 'MM/dd/yyyy HH:mm') as caStart")\ .where("ca.`Bike #` = bc.`Bike #`").where("ab.`Bike #` = bc.`Bike #`")\ .where("a.id != b.id").where("b.id != c.id")\ .where("abStart < bcStart").where("bcStart < caStart")\ .orderBy(expr("cast(caStart as long) - cast(abStart as long)"))\ .selectExpr("a.id", "b.id", "c.id", "ab.`Start Date`", "ca.`End Date`")\ .limit(1).show(1, False)
def process_graphs (sc, in_dir, partitions):
    """
    Read graph vertices and edges from disk if already saved. Otherwise,
    read the chem2bio2rdf drugbank, pubchem, and other N3 RDF models and
    save the vertices and edges to disk. Traverse the resulting graph,
    calculating PageRank and using SQL to get the names and PDB links of drugs.

    Args:
        sc (SparkContext): Access to the Spark compute fabric.
        in_dir (str): Path to Chemotext data storage for raw chem2bio2rdf N3 RDF models.
        partitions (int): Number of data partitions.
    """
    sqlContext = SQLContext (sc)
    n3_dirs = [ os.path.join (in_dir, d) for d in [ "drugbank", "pubchem" ] ]
    vertices_path_posix = os.path.join (in_dir, "vertices")
    edges_path_posix = os.path.join (in_dir, "edges")
    vertices_path = "file://{0}".format (vertices_path_posix)
    edges_path = "file://{0}".format (edges_path_posix)
    triples = None
    vertices = None
    edges = None
    g = None

    if os.path.exists (vertices_path_posix) and os.path.exists (edges_path_posix):
        print ("Loading existing vertices: {0}".format (vertices_path))
        start = time.time ()
        vertices = sqlContext.read.parquet (vertices_path).repartition (partitions).cache ()
        print ("Elapsed time for loading precomputed vertices: {0} seconds.".format (
            time.time () - start))

        print ("Loading existing edges: {0}".format (edges_path))
        start = time.time ()
        edges = sqlContext.read.parquet (edges_path).repartition (partitions).cache ()
        print ("Elapsed time for loading precomputed edges: {0} seconds.".format (
            time.time () - start))
    else:
        print ("Constructing vertices and edges from chem2bio2rdf data sources")
        files = [ os.path.join (n3_dir, n3_file)
                  for n3_dir in n3_dirs
                  for n3_file in os.listdir (n3_dir) ]
        triples = sc.parallelize (files, numSlices=partitions). \
                  flatMap (lambda n3_file : process_chunk (n3_file))
        vertices = sqlContext.createDataFrame (
            data = triples.flatMap (lambda d : [
                ( trim_uri (d.S), "attr0" ),
                ( trim_uri (d.O), "attr1" ) ]),
            schema = [ "id", "attr" ]). \
            cache ()
        edges = sqlContext.createDataFrame (
            data = triples.map (lambda d : ( trim_uri (d.S), trim_uri (d.O), trim_uri (d.P) )),
            schema = [ "src", "dst", "relationship" ]). \
            cache ()
        print ("Triples: {0}".format (triples.count ()))
        if os.path.exists (vertices_path_posix):
            shutil.rmtree (vertices_path_posix)
        if os.path.exists (edges_path_posix):
            shutil.rmtree (edges_path_posix)
        vertices.write.parquet (vertices_path)
        edges.write.parquet (edges_path)

    if vertices is not None and edges is not None:
        start = time.time ()
        vertices.printSchema ()
        edges.printSchema ()
        print ("Elapsed time for print schema: {0} seconds.".format (
            time.time () - start))

        start = time.time ()
        print (" Total of {0} edges.".format (edges.count ()))
        print ("Elapsed time for count edges: {0}".format (time.time () - start))

        g = GraphFrame (vertices, edges)

        print ("Query: Get in-degree of each vertex.")
        start = time.time ()
        g.inDegrees. \
            sort ("inDegree", ascending=False). \
            show (n=3, truncate=False)
        print ("Elapsed time for computing in-degree: {0} seconds.".format (
            time.time () - start))

        start = time.time ()
        print ("Query: Number of protein database relationships: {0}".format (
            g.edges. \
                filter ("relationship LIKE '%resource/PDB_ID%' "). \
                count ()))
        print ("Elapsed time for edge filter and count query: {0} seconds.".format (
            time.time () - start))

        edges.registerTempTable ("edges")
        sqlContext.sql ("""
            SELECT substring(src, length(src)-7, 6) as Drug,
                   dst as Name
            FROM edges
            WHERE relationship LIKE '%resource/Name%'
        """).show (n=3, truncate=False)

        start = time.time ()
        sqlContext.sql ("""
            SELECT substring(src, length(src)-7, 6) as Compound,
                   dst as SMILES
            FROM edges
            WHERE relationship LIKE '%open%_smiles%'
        """).show (n=3, truncate=False)
        print ("Elapsed time for SQL query: {0} seconds.".format (
            time.time () - start))

        start = time.time ()
        g.find ("()-[Drug2PDB]->()"). \
            filter ("Drug2PDB.relationship LIKE '%/PDB_ID' "). \
            show (n=3, truncate=False)
        print ("Elapsed time for graph motif query: {0} seconds.".format (
            time.time () - start))
    return g
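# Hypothetical driver sketch (not part of the original module): the Spark
# configuration, input directory, and partition count below are placeholders.
if __name__ == "__main__":
    from pyspark import SparkConf, SparkContext
    conf = SparkConf ().setAppName ("chem2bio2rdf-graphframe")
    sc = SparkContext (conf=conf)
    process_graphs (sc, in_dir="/data/chem2bio2rdf", partitions=200)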
paths = g.bfs(" name = 'Eva'",
              " type = 'company' and review_count >=10",
              " stars='5' or relationship = 'friend'")

# Get the list of columns
cols = paths.columns
# Get the label/name of the last edge in the path
last_edge = cols[len(cols) - 2]

# The resulting paths can be manipulated as normal DataFrames.
# Order by the stars of the last edge (which must be a review-type edge).
print("BFS :: ")
paths.orderBy(last_edge + ".stars", ascending=False).show(5, False)

###### 12.3.1 ###
query = "(u)-[e1]->(b1); (u)-[e2]->(b2)"
results = g.find(query)
out = results.filter(
    " b1.type != 'user' and b2.type != 'user' and e1.stars = '5' "
    "and e2.stars = '5' and b1.city != b2.city "
)
grouped = out.groupBy("u").count().orderBy("count", ascending=False).limit(5)
# print(grouped.count())
print(
    "12.3.1 --------------------------------12.3.1--------------------------------------------- "
)
grouped.show(10, False)

##### 12.3.2 ####
query2 = "(u1)-[e1]->(b); (u2)-[e2]->(b); (u1)-[e3]->(u2)"
results2 = g.find(query2)
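# A possible continuation for 12.3.2 (a sketch, not from the original script):
# keep friend pairs where both users rated the shared business 5 stars,
# assuming the same vertex/edge columns used in 12.3.1.
out2 = results2.filter(
    " u1.type = 'user' and u2.type = 'user' "
    "and e1.stars = '5' and e2.stars = '5' "
    "and e3.relationship = 'friend' "
)
print(
    "12.3.2 --------------------------------12.3.2--------------------------------------------- "
)
out2.select("u1.id", "u2.id", "b.id").show(5, False)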
], ["src", "dst", "relationship"], ) # create graph g = GraphFrame(v, e) g.vertices.show() g.edges.show() # Find the youngest user's age in the graph. g.vertices.groupBy().min("age").show() # Count the number of "follows" in the graph. numFollows = g.edges.filter("relationship = 'follow'").count() # motif finding motifs = g.find("(a)-[e]->(b); (b)-[e2]->(c)").filter("a.id != c.id") motifs = g.find("(a)-[e]->(b); (b)-[e2]->(a)") motifs.show() # More complex queries motifs.filter("b.age > 30").show() print("\ngenerate subgraph --- ") g1 = (g.filterVertices("age > 30").filterEdges( "relationship = 'friend'").dropIsolatedVertices()) g1.vertices.show() g1.edges.show() # Breadth-first search (BFS) print("\n BFS") paths = g.bfs(
# find all triangles, which
# might have duplicates, since
# a triangle can be represented
# in 6 different ways: given 3
# vertices {a, b, c} of a triangle,
# it can be represented by the
# following 6 representations:
#
#  a -> b -> c -> a
#  a -> c -> b -> a
#  b -> a -> c -> b
#  b -> c -> a -> b
#  c -> a -> b -> c
#  c -> b -> a -> c
#-------------------------------
motifs = graph.find("(a)-[]->(b); (b)-[]->(c); (c)-[]->(a)")
print("motifs=", motifs)
print("motifs.count()=", motifs.count())
motifs.show(1000, truncate=False)

#-------------------------------
# now remove duplicate triangles:
# keep only one representation of a triangle
# {a, b, c} where a > b > c
#-------------------------------
unique_triangles = motifs[(motifs.a > motifs.b) & (motifs.b > motifs.c)]
print("unique_triangles=", unique_triangles)
unique_triangles.show(truncate=False)

# done!
spark.stop()
("e", "f", "follow"), ("e", "d", "friend"), ("d", "a", "friend"), ("a", "e", "friend")], ["src", "dst", "relationship"]) # Create a GraphFrame,本质上就是两个dataframe g = GraphFrame(v, e) # 获取两个dataframe g.vertices g.edges ## 每个节点的出度 入度 g.inDegrees g.outDegrees # motifs 语法 chain4 = g.find("(a)-[ab]->(b); (b)-[bc]->(c); (c)-[cd]->(d)") # chain4.show() # g.find("(c)-[m]->()").show() # Query on sequence, with state (cnt) # (a) Define method for updating state given the next element of the motif. sumFriends = \ lambda cnt, relationship: F.when(relationship == "friend", cnt + 1).otherwise(cnt) # (b) Use sequence operation to apply method to sequence of elements in motif. # In this case, the elements are the 3 edges. condition = \ reduce(lambda cnt, e: sumFriends(cnt, F.col(e).relationship), ["ab", "bc", "cd"], F.lit(0)) # (c) Apply filter to DataFrame. chainWith2Friends2 = chain4.where(condition >= 2) # chainWith2Friends2.show() result = g.connectedComponents()
import io
from contextlib import redirect_stdout


def display_graph(item):
    # Redirects standard out to the document
    with io.StringIO() as buf, redirect_stdout(buf):
        item.show()
        redirect_to_handout(buf.getvalue())


edges = sqlContext.createDataFrame(
    [(1, 2), (1, 3), (1, 4), (2, 3), (2, 4), (3, 1), (4, 1), (4, 3)],
    ["src", "dst"])

graph = GraphFrame(vertices, edges)

"""
## Show Vertices
"""
display_graph(graph.vertices)
doc.show()

"""
## Show Edges
"""
display_graph(graph.edges)
doc.show()

"""
## Show Degrees (Sum of in and out degrees by node)
"""
display_graph(graph.degrees)
doc.show()

"""
Show all motifs where a -> b and b -> a
"""
display_graph(graph.find("(a)-[e]->(b); (b)-[e2]->(a)"))
doc.show()

"""
## Get pagerank using m=0.15 and tolerance=0.01
"""
pr = graph.pageRank(resetProbability=0.15, tol=0.01)

"""
### look at the pagerank score for every vertex
"""
display_graph(pr.vertices)
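# Follow-up sketch (not part of the original handout script): pageRank also
# annotates edges with a "weight" column, and vertices can be ordered by
# their "pagerank" column to surface the most central nodes.
"""
### Top vertices by pagerank and the edge weights
"""
display_graph(pr.vertices.orderBy("pagerank", ascending=False))
display_graph(pr.edges.select("src", "dst", "weight"))
doc.show()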