def graphframes_pagerank(self, sc, sqlc):
    """ GraphFrame's PageRank implementation """

    from graphframes import GraphFrame  # pylint: disable=import-error

    edge_df = sqlc.read.load(os.path.join(self.args.webgraph, "edges"))
    vertex_df = sqlc.read.load(os.path.join(self.args.webgraph, "vertices"))

    graph = GraphFrame(vertex_df, edge_df)

    withPageRank = graph.pageRank(maxIter=self.args.maxiter)

    final_df = sql(
        sqlc,
        """
        SELECT CONCAT(ranks.domain, ' ', ranks.pagerank) r
        FROM ranks
        ORDER BY ranks.pagerank DESC
        """,
        {"ranks": withPageRank.vertices})

    if self.args.dump:
        final_df.coalesce(1).write.text(
            self.args.dump,
            compression="gzip" if self.args.gzip else "none")
    else:
        print(final_df.rdd.collect())
def test_gf(self):
    vertices = spark.createDataFrame(
        [('1', 'Carter', 'Derrick', 50),
         ('2', 'May', 'Derrick', 26),
         ('3', 'Mills', 'Jeff', 80),
         ('4', 'Hood', 'Robert', 65),
         ('5', 'Banks', 'Mike', 93),
         ('98', 'Berg', 'Tim', 28),
         ('99', 'Page', 'Allan', 16)],
        ['id', 'name', 'firstname', 'age'])
    edges = spark.createDataFrame(
        [('1', '2', 'friend'),
         ('2', '1', 'friend'),
         ('3', '1', 'friend'),
         ('1', '3', 'friend'),
         ('2', '3', 'follows'),
         ('3', '4', 'friend'),
         ('4', '3', 'friend'),
         ('5', '3', 'friend'),
         ('3', '5', 'friend'),
         ('4', '5', 'follows'),
         ('98', '99', 'friend'),
         ('99', '98', 'friend')],
        ['src', 'dst', 'type'])
    g = GraphFrame(vertices, edges)
    g.connectedComponents().show()
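# Note (not part of the original test): graphframes' default connectedComponents()
# implementation needs a Spark checkpoint directory to be set before the call above.
# A minimal sketch, assuming `spark` is the active SparkSession and the path is arbitrary:
spark.sparkContext.setCheckpointDir("/tmp/graphframes-checkpoints")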
def run(sc, TH, infile, outfile):
    rdd = sc.textFile(infile)
    first_line = rdd.first()
    data = rdd.filter(lambda line: line != first_line)
    uid = data.map(lambda line: (line.split(',')[0], line.split(',')[1])) \
        .groupByKey() \
        .mapValues(lambda x: sorted(list(x))) \
        .collectAsMap()

    cand_pairs = list(itertools.combinations(list(uid.keys()), 2))
    edge, vertex = list(), set()
    for pair in cand_pairs:
        if len(set(uid[pair[0]]).intersection(set(uid[pair[1]]))) >= TH:
            edge.append(tuple((pair[0], pair[1])))
            edge.append(tuple((pair[1], pair[0])))
            vertex.add(pair[0])
            vertex.add(pair[1])

    graph = GraphFrame(
        sc.parallelize(list(vertex)).map(lambda uid: (uid,)).toDF(['id']),
        sc.parallelize(edge).toDF(["src", "dst"]))
    communities = graph.labelPropagation(maxIter=5)
    communities = communities.rdd.coalesce(1) \
        .map(lambda idx_label: (idx_label[1], idx_label[0])) \
        .groupByKey() \
        .map(lambda label_idxes: sorted(list(label_idxes[1]))) \
        .sortBy(lambda idxes: (len(idxes), idxes)) \
        .collect()
    Task1.toFile(outfile, communities)
def algorithm2(i, g):
    while True:
        aggregates = g.aggregateMessages(
            F.collect_set(AM.msg).alias("agg"),
            sendToDst=F.when(AM.src['value'] == -1, AM.src["id"]))
        new_vertices = (g.vertices
                        .join(aggregates, on="id", how="left_outer")
                        .withColumn("newValue",
                                    getid_maximum_udf2("id", "agg", lit(i), "value"))
                        .drop("agg")
                        .withColumn("max_by_rows", greatest("value", "newValue"))
                        .drop("value", "newValue")
                        .withColumnRenamed("max_by_rows", "value"))
        cached_new_vertices = AM.getCachedDataFrame(new_vertices)
        g = GraphFrame(cached_new_vertices, g.edges)
        i += 1
        g.vertices.show()
        if (g.filterVertices("value == -1")
                .dropIsolatedVertices().edges.count() == 0):
            final_df = g.vertices
            final_df = final_df.withColumn(
                "value",
                F.when(final_df["value"] == -1, i).otherwise(final_df["value"]))
            break
    return final_df
def sample_induced_subgraph(
    graph: GraphFrame,
    seed: int,
    k_hops: int = 2,
    pr_alpha: float = 0.85,
    pr_tol: float = 0.001,
) -> GraphFrame:
    assert k_hops <= 3
    # build motif for finding a k-hop neighborhood localized to a node
    symbols = string.ascii_letters
    motif_edges = [(symbols[i], symbols[i + 1]) for i in range(k_hops)]
    paths = graph.find(";".join(
        [f"({e1})-[]->({e2})" for e1, e2 in motif_edges]))
    # the center of odd paths has an index greater than the midpoint;
    # this should increase the seed article's pagerank score on 1-hop networks
    centered_paths = paths.where(f"{symbols[k_hops-k_hops//2]}.id = {seed}")
    vertices = centered_paths.selectExpr(f"{symbols[0]} as v")
    for i in range(1, k_hops + 1):
        vertices = vertices.union(
            centered_paths.selectExpr(f"{symbols[i]} as v"))
    vertices = vertices.select("v.*").distinct()
    vertices.cache()
    edges = (graph.edges
             .join(vertices, on=graph.edges["src"] == vertices["id"], how="right")
             .select("src", "dst")
             .join(vertices, on=graph.edges["dst"] == vertices["id"], how="right")
             .select("src", "dst")
             .where("src is not null AND dst is not null"))
    return GraphFrame(vertices, edges).pageRank(pr_alpha, tol=pr_tol)
def main():
    # create spark session
    spark = SparkSession.builder.appName(
        "keepindoors graphx connectedComponents()").getOrCreate()
    # get a mongo client
    cli = mongo.__get__()

    # vertices: ["id", "url", "title", "datetime"]
    localVertices = []
    cursor = mongo.getCollection(cli, "keepindoors", "docs").find()
    for r in cursor:
        # the mongo "_id" ObjectId would break createDataFrame, so only plain fields are copied out
        r["id"] = r["docno"]
        localVertices.append((r["docno"], r["url"], r["title"],
                              str(r["_id"].generation_time + timedelta(hours=8))))

    # edges: ["src", "dst", "distance"]
    cursor = mongo.getCollection(cli, "keepindoors", "distances").find()
    localEdges = []
    for r in cursor:
        localEdges.append((r["docno1"], r["docno2"], r["distance"]))

    v = spark.createDataFrame(localVertices, ["id", "url", "title", "datetime"])
    e = spark.createDataFrame(localEdges, ["src", "dst", "distance"])
    g = GraphFrame(v, e)

    # get sparkContext from sparkSession; connectedComponents() needs a checkpoint dir
    spark.sparkContext.setCheckpointDir("/tmp/spark/checkpoint")
    result = g.connectedComponents()

    # order by component, datetime
    result = result.orderBy(["component", "datetime"], ascending=[1, 0]).collect()

    # create component dict
    component_dict = {}
    for row in result:
        record = row.asDict()
        if record["component"] not in component_dict:
            component_dict[record["component"]] = []
        component_dict[record["component"]].append(record)

    # delete mongo collection "components"
    mongo.deleteAll(cli, "keepindoors", "components")

    # save component_dict into mongo
    index = 1
    for key, item in component_dict.items():
        links = []
        titles = []
        title = "empty title"
        update_time = "1970-01-01 00:00:00+00:00"
        for doc in item:
            titles.append(doc["title"])
            links.append(doc["url"])
            if doc["datetime"] > update_time:
                update_time = doc["datetime"]
                title = doc["title"]
        mongo.insertDoc(
            {"no": index, "component": key, "title": title, "size": len(item),
             "links": links, "titles": titles, "update_time": update_time,
             "docs": item},
            cli, "keepindoors", "components")
        index += 1
def main(argv):
    filter_threshold = int(argv[1])
    input_file_path = argv[2]
    output_file_path = argv[3]
    # os.environ["PYSPARK_SUBMIT_ARGS"] = ("--packages graphframes:graphframes:0.6.0-spark2.4-s_2.11")
    # filter_threshold = 7
    # input_file_path = "/Users/zhijunliao/Marks/USC/INF-553/HW/INF553HW4/data/ub_sample_data.csv"
    # output_file_path = "/Users/zhijunliao/Marks/USC/INF-553/HW/INF553HW4/output/task1.txt"
    # 38648 records

    total_start = time.time()
    start = time.time()
    input_data = sc.textFile(input_file_path).\
        filter(lambda line: "user_id" not in line).\
        map(lambda line: tuple(line.split(","))).\
        groupByKey().\
        mapValues(set).\
        persist()  # 3374
    edges = input_data.\
        cartesian(input_data).\
        filter(lambda pair: pair[0][0] < pair[1][0]).\
        filter(lambda pair: len(pair[0][1].intersection(pair[1][1])) >= filter_threshold).\
        flatMap(lambda pair: [(pair[0][0], pair[1][0]), (pair[1][0], pair[0][0])]).\
        persist()  # 996 498
    edges_df = edges.map(lambda pair: Row(src=pair[0], dst=pair[1])).toDF()
    vertices = edges.flatMap(lambda _: _).distinct().persist()  # 222
    vertices_df = vertices.map(Row("id")).toDF()
    print("finish building edges and vertices:", time.time() - start)

    start = time.time()
    graph = GraphFrame(vertices_df, edges_df)
    result = graph.labelPropagation(maxIter=5)
    print("finish running LPA:", time.time() - start)
    # result.count()  # 222
    # result.show()
    result_rdd = result.rdd.\
        map(lambda pair: (pair['label'], pair['id'])).\
        groupByKey().\
        mapValues(lambda values: (sorted(list(values)), len(values))).\
        persist()
    result_collection = result_rdd.collect()
    result_collection.sort(key=lambda kv: (kv[1][1], kv[1][0][0]))

    with open(output_file_path, "w") as output_file:
        for community_id, (user_list, length) in result_collection:
            output_file.write(f"'{user_list[0]}'")
            for user in user_list[1:]:
                output_file.write(f", '{user}'")
            output_file.write("\n")
    print("total running time:", time.time() - total_start)
def lpa(self, graph, iter):
    print("Community Detection\t1\tInitializing Algorithm", flush=True)
    edges = graph.get_df()
    vertices = edges.select('src').union(
        edges.select('dst')).distinct().withColumnRenamed('src', 'id')

    print("Community Detection\t2\tExecuting Label Propagation Algorithm",
          flush=True)
    graph = GraphFrame(vertices, edges)
    result = graph.labelPropagation(maxIter=iter)
    return result.orderBy('label', ascending=True).withColumnRenamed(
        'label', 'Community')
def runBPwithGraphFrames(cls, g, numIter):
    """Run Belief Propagation using GraphFrame.

    This implementation of BP shows how to use GraphFrame's aggregateMessages method.
    """
    # choose colors for vertices for BP scheduling
    colorG = cls._colorGraph(g)
    numColors = colorG.vertices.select('color').distinct().count()

    # TODO: handle vertices without any edges

    # initialize vertex beliefs at 0.0
    gx = GraphFrame(
        colorG.vertices.withColumn('belief', sqlfunctions.lit(0.0)),
        colorG.edges)

    # run BP for numIter iterations
    for iter_ in range(numIter):
        # for each color, have that color receive messages from neighbors
        for color in range(numColors):
            # Send messages to vertices of the current color.
            # We may send to source or destination since edges are treated as undirected.
            msgForSrc = sqlfunctions.when(
                AM.src['color'] == color,
                AM.edge['b'] * AM.dst['belief'])
            msgForDst = sqlfunctions.when(
                AM.dst['color'] == color,
                AM.edge['b'] * AM.src['belief'])
            # numerically stable sigmoid
            logistic = sqlfunctions.udf(cls._sigmoid, returnType=types.DoubleType())
            aggregates = gx.aggregateMessages(
                sqlfunctions.sum(AM.msg).alias("aggMess"),
                sendToSrc=msgForSrc,
                sendToDst=msgForDst)
            v = gx.vertices
            # receive messages and update beliefs for vertices of the current color
            newBeliefCol = sqlfunctions.when(
                (v['color'] == color) & (aggregates['aggMess'].isNotNull()),
                logistic(aggregates['aggMess'] + v['a'])
            ).otherwise(v['belief'])  # keep old beliefs for other colors
            newVertices = (v
                           .join(aggregates, on=(v['id'] == aggregates['id']), how='left_outer')
                           .drop(aggregates['id'])  # drop duplicate ID column (from outer join)
                           .withColumn('newBelief', newBeliefCol)  # compute new beliefs
                           .drop('aggMess')  # drop messages
                           .drop('belief')  # drop old beliefs
                           .withColumnRenamed('newBelief', 'belief'))
            # cache new vertices using workaround for SPARK-1334
            cachedNewVertices = AM.getCachedDataFrame(newVertices)
            gx = GraphFrame(cachedNewVertices, gx.edges)

    # Drop the "color" column from vertices
    return GraphFrame(gx.vertices.drop('color'), gx.edges)
def main(argv):
    assert len(argv) == 3, \
        "Script takes 3 arguments <filter_threshold> <input_file> <community_output_file>"
    filter_threshold, input_file, output_file = argv
    filter_threshold = int(filter_threshold)

    config = SparkConf().setMaster("local[*]") \
        .setAppName("Task2") \
        .set("spark.executor.memory", "4g") \
        .set("spark.driver.memory", "4g")
    sc = SparkContext(conf=config).getOrCreate()
    spark = SparkSession(sc)
    sc.setLogLevel("ERROR")

    lines = sc.textFile(input_file)
    header = lines.first()
    rdd_dict = lines.filter(lambda x: x != header) \
        .map(lambda x: (x.split(',')[0], x.split(',')[1])) \
        .groupByKey().collectAsMap()

    user_pairs = list(combinations(rdd_dict.keys(), 2))
    edges_rdd = sc.parallelize(user_pairs) \
        .map(lambda x: (x[0], x[1])) \
        .filter(lambda x: get_intersection(rdd_dict[x[0]], rdd_dict[x[1]]) >= filter_threshold) \
        .cache()
    nodes_df = edges_rdd.flatMap(lambda x: x).distinct().map(
        lambda x: (x,)).toDF(["id"])
    edges_df = edges_rdd.toDF(["src", "dst"])

    gf = GraphFrame(nodes_df, edges_df)
    communities_rdd = gf.labelPropagation(maxIter=5).rdd.coalesce(1)
    communities = communities_rdd.map(lambda x: (x[1], x[0])) \
        .groupByKey() \
        .map(lambda x: sorted(list(x[1]))) \
        .sortBy(lambda x: (len(x), x)) \
        .collect()

    with open(output_file, "w+") as file:
        for community in communities:
            value = str(community)[1:-1]
            file.writelines(value + "\n")
def cull_graph(graph, by="degree", quantile=0.25, quantile_accuracy=0.1, max_iter=2):
    """Reduce a spark graph by getting rid of nodes that are not high value.

    This is done either by removing nodes that have number of degrees below a
    quantile or removing nodes with pagerank below a quantile."""
    wanted_nodes = None
    if by == "degree":
        nth_percentile = graph.degrees.approxQuantile(
            "degree", [quantile], quantile_accuracy)[0]
        wanted_nodes = graph.degrees \
            .filter(graph.degrees.degree > nth_percentile) \
            .select("id")
    elif by == "pagerank":
        results = graph.pageRank(resetProbability=0.15, maxIter=max_iter).vertices
        nth_percentile = results.approxQuantile(
            "pagerank", [quantile], quantile_accuracy)[0]
        wanted_nodes = results \
            .filter(results.pagerank > nth_percentile) \
            .select("id")
    else:
        raise ValueError("by must be degree or pagerank!")

    filtered_nodes = graph.vertices.join(wanted_nodes, "id")
    filtered_edges = graph.edges.join(
        wanted_nodes,
        (graph.edges.src == wanted_nodes.id) | (graph.edges.dst == wanted_nodes.id))
    return GraphFrame(filtered_nodes, filtered_edges)
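# Illustrative usage sketch for cull_graph (the toy data and the `spark` session
# are assumptions, not taken from the original code): keep only nodes whose
# degree is above the 25th percentile.
from graphframes import GraphFrame

toy_vertices = spark.createDataFrame([("a",), ("b",), ("c",), ("d",)], ["id"])
toy_edges = spark.createDataFrame(
    [("a", "b"), ("b", "c"), ("c", "a"), ("c", "d")], ["src", "dst"])
toy_graph = GraphFrame(toy_vertices, toy_edges)

culled = cull_graph(toy_graph, by="degree", quantile=0.25)
culled.vertices.show()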
def algorithm1(i, g):
    while True:
        aggregates = g.aggregateMessages(
            F.collect_set(AM.msg).alias("agg"),
            sendToDst=F.when(AM.src['value'] == -1, AM.src["id"]))
        new_vertices = (g.vertices
                        .join(aggregates, on="id", how="left_outer")
                        .withColumn("newValue",
                                    getid_maximum_udf2("id", "agg", lit(i), "value"))
                        .drop("agg")
                        .withColumn("max_by_rows", greatest("value", "newValue"))
                        .drop("value", "newValue")
                        .withColumnRenamed("max_by_rows", "value"))
        cached_new_vertices = AM.getCachedDataFrame(new_vertices)
        g = GraphFrame(cached_new_vertices, g.edges)
        i += 1
        g.vertices.show()
        g.vertices.createOrReplaceTempView("temp_table")
        if spark.sql("SELECT * from temp_table where value = -1").count() == 0:
            final_df = g.vertices
            break
    return final_df
def set_infected_nodes(self, list_or_dataframe):
    """
    Set nodes that are infected or are the source of influence using a pyspark dataframe.
    :param list_or_dataframe: pyspark dataframe with column 'id', or a python list
    :return:
    """
    infected_dataframe = list_or_dataframe

    # Convert list to dataframe
    if type(list_or_dataframe) == list:
        rdd_list = self.sc.parallelize(list_or_dataframe)
        row_rdd_list = rdd_list.map(lambda x: Row(x))
        field_list = [StructField("id", LongType(), True)]
        schema_list = StructType(field_list)
        infected_dataframe = self.sqlContext.createDataFrame(row_rdd_list, schema_list)

    # Create column for influence attribute containing 1's
    infected_dataframe = infected_dataframe.withColumn(self.attribute, lit(1.0))
    infected = infected_dataframe
    self.infected_nodes = infected_dataframe

    # Merge to original vertices of graph
    orig_vertices = self.graph.vertices.selectExpr("id as id")

    # Update graph
    orig_edges = self.graph.edges
    new_vertices = orig_vertices.join(infected, "id", "left_outer").na.fill(0)
    self.graph = GraphFrame(new_vertices, orig_edges)
def create_graph(V, E):
    v = sqlContext.createDataFrame(V, ['id'])
    e = sqlContext.createDataFrame(E, ["src", "dst"])
    G = GraphFrame(v, e)
    return G
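# Illustrative usage sketch for create_graph; it relies on a global `sqlContext`
# already being defined, and the sample vertex/edge lists below are assumptions.
V = [("a",), ("b",), ("c",)]
E = [("a", "b"), ("b", "c")]
G = create_graph(V, E)
G.edges.show()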
def induce_graph(graph, relabel=True, partitions=[]):
    """Remove extra edges that do not belong to the graph"""
    # small dataframe for reindexing/relabeling
    window = Window.orderBy("id")
    if partitions:
        window = window.partitionBy(partitions)

    # ensure 0 index for mapping into a scipy.sparse matrix
    rank = graph.vertices.select(
        "id",
        F.row_number().over(window).alias("rank")).withColumn(
            "rank", F.expr("rank - 1"))

    vertices = graph.vertices.join(rank, on="id", how="left")
    edges = graph.edges.join(
        vertices.selectExpr("id as src", "rank as rank_src"),
        on="src", how="inner").join(
            vertices.selectExpr("id as dst", "rank as rank_dst"),
            on="dst", how="inner")

    if relabel:
        vertices = vertices.withColumn("relabeled_id", F.col("id")).withColumn(
            "id", F.col("rank"))
        edges = (edges.withColumn("relabeled_src", F.col("src"))
                 .withColumn("relabeled_dst", F.col("dst"))
                 .withColumn("src", F.col("rank_src"))
                 .withColumn("dst", F.col("rank_dst")))

    vertices = vertices.drop("rank")
    edges = edges.drop("rank_src", "rank_dst")
    return GraphFrame(vertices, edges)
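# Illustrative usage sketch for induce_graph (the toy data and the `spark` session
# are assumptions): relabel string ids to dense, 0-based integer ids, keeping the
# original values in the relabeled_* columns.
from graphframes import GraphFrame

v = spark.createDataFrame([("n1",), ("n2",), ("n3",)], ["id"])
e = spark.createDataFrame([("n1", "n2"), ("n2", "n3")], ["src", "dst"])
relabeled = induce_graph(GraphFrame(v, e), relabel=True)
relabeled.vertices.show()
relabeled.edges.show()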
def readFile(filename, large, sqlContext=sqlContext):
    # lines = sc.textFile(filename)
    spark = SparkSession.builder.getOrCreate()

    if large:
        delim = " "
        # Strip off header row.
        # lines = lines.mapPartitionsWithIndex(lambda ind, it: iter(list(it)[1:]) if ind == 0 else it)
        header = True
    else:
        delim = ","
        header = False

    # Extract pairs from input file and convert to data frame matching
    # schema for graphframe edges.
    # YOUR CODE HERE
    edges = spark.read.csv(path=filename, sep=delim,
                           schema='src INT, dst INT', header=header)

    # Extract all endpoints from input file (hence flatmap) and create
    # data frame containing all those node names in schema matching
    # graphframe vertices
    # YOUR CODE HERE
    vertices = edges.select(edges['src'].alias('id')) \
        .union(edges.select('dst')) \
        .distinct()

    # Create graphframe g from the vertices and edges.
    g = GraphFrame(vertices, edges)
    return g
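# Illustrative usage sketch for readFile; "edges.csv" is an assumed small,
# comma-delimited edge list with no header row (hence large=False).
g = readFile("edges.csv", large=False)
print(g.vertices.count(), g.edges.count())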
def compute_degrees(self, graph):
    """
    Compute weighted and unweighted in and out degrees in graph.
    Re-declares graph to add the following attributes:
    inDegree, outDegree, w_inDegree, w_outDegree.
    :param graph: graphframe object, network
    :return:
    """
    g_vertices = graph.vertices
    g_edges = graph.edges

    # Get unweighted degrees
    indeg = graph.inDegrees
    outdeg = graph.outDegrees

    # Get weighted degrees (note: `sum` here must be pyspark.sql.functions.sum,
    # not the Python builtin, for the aggregation on the "weight" column to work)
    w_indeg = (g_edges.groupby("dst").agg(
        sum("weight").alias("w_inDegree"))).selectExpr(
            "dst as id", "w_inDegree as w_inDegree")
    w_outdeg = (g_edges.groupby("src").agg(
        sum("weight").alias("w_outDegree"))).selectExpr(
            "src as id", "w_outDegree as w_outDegree")

    # Update vertices attribute
    new_v = g_vertices.join(indeg, "id", "left_outer")
    new_v = new_v.join(outdeg, "id", "left_outer")
    new_v = new_v.join(w_indeg, "id", "left_outer")
    new_v = new_v.join(w_outdeg, "id", "left_outer")
    new_v = new_v.na.fill(0)

    # Update graph
    self.graph = GraphFrame(new_v, g_edges)
def __init__(self, vertices_pq, edges_pq):
    # Create configuration for Spark Session
    conf = SparkConf() \
        .setAll([('spark.executor.memory', '16g'),
                 ('spark.executor.cores', '8'),
                 ('spark.cores.max', '8'),
                 ('spark.driver.memory', '16g'),
                 ('spark.sql.execution.arrow.enabled', True),
                 ('spark.python.profile', True),
                 ('spark.python.profile.dump', './spark_profile')])

    # Create a spark session
    self.SS = SparkSession.builder.config(conf=conf).getOrCreate()

    # Construct the vertices and edges DataFrame
    vertices_df = self.SS.read.parquet(vertices_pq)
    edges_df = self.SS.read.parquet(edges_pq)

    # Append a column that specifies whether the node is a user or a repo
    # in the table of vertices: 1 is for user, 2 is for repo
    nodeTypeUDF = F.udf(lambda i: 1 if i > 0 else 2, types.IntegerType())
    vertices_df = vertices_df.withColumn('nodeType', nodeTypeUDF(F.col('id')))

    # Create the graphframe object
    self.gf = GraphFrame(vertices_df, edges_df)
class LPA():
    def __init__(self):
        self.spark = SparkSession \
            .builder \
            .appName('Example_2') \
            .getOrCreate()

    def graphx(self):
        self.df = self.spark.read.option("header", "true").csv(
            'results_new/data-00000-of-00010.csv')
        # print(self.df.show(n=5))
        self.df = self.df.dropna()
        self.rdd = self.df.select("url", "mention").rdd.flatMap(lambda x: x).distinct()
        # print(self.rdd.take(5))

        def hashnode(x):
            return hashlib.sha1(x.encode("UTF-8")).hexdigest()[:8]

        hashnode_udf = udf(hashnode)

        vertices = self.rdd.map(lambda x: (hashnode(x), x)).toDF(["id", "url"])
        vertices.show(5)

        edges = self.df.select("url", "mention") \
            .withColumn("src", hashnode_udf("url")) \
            .withColumn("dst", hashnode_udf("mention")) \
            .select("src", "dst")
        edges.show(5)

        self.graph = GraphFrame(vertices, edges)
        # print(self.graph)

        print('communities are ')
        self.communities = self.graph.labelPropagation(maxIter=2)
        print(self.communities.persist().show(10))
        print(self.communities.sort(desc("label")).show(50))
        self.communities.coalesce(1).write.format("com.databricks.spark.csv") \
            .option("header", "true").save("communities")

        print("There are " + str(self.communities.select('label').distinct().count())
              + " communities in sample graph.")
        print(self.graph.inDegrees.join(vertices, on="id")
              .orderBy("inDegree", ascending=False).show(10))
        print(self.graph.stronglyConnectedComponents(maxIter=2)
              .select('url', 'component').show(20))
def find_the_largest_subgraph(graph):
    result = graph.connectedComponents()
    componentCount = result.groupBy('component').count().orderBy(desc('count'))
    componentCount.show()
    largestComponent = componentCount.first()['component']
    vertices = result \
        .filter(result.component == largestComponent) \
        .select('id')
    return GraphFrame(vertices, graph.edges)
def GraphFrame(vertices: pyspark.sql.DataFrame,
               edges: pyspark.sql.DataFrame) -> "graphframes.GraphFrame":
    """Simply calls graphframes.GraphFrame.

    Args:
        vertices (pyspark.sql.DataFrame):
        edges (pyspark.sql.DataFrame):
    """
    # As originally written, `return GraphFrame(vertices, edges)` recursed into this
    # wrapper itself. Calling the library class explicitly avoids that; this assumes
    # `import graphframes` at module level.
    return graphframes.GraphFrame(vertices, edges)
def graphFrame(self):
    """A GraphFrame representation of the constructed graph.

    :type: :class:`graphframes.GraphFrame`
    """
    return GraphFrame(
        self.verticesDataFrame,
        self.edgesDataFrame
    )
def get_graph(orig_df, predictions, orig_df_id_col="row_id", predictions_id_col="id"):
    predictions_nodes = orig_df.withColumnRenamed(orig_df_id_col, "id")
    predictions_edges = predictions \
        .withColumnRenamed(f"{predictions_id_col}_l", "src") \
        .withColumnRenamed(f"{predictions_id_col}_r", "dst") \
        .filter(predictions.prediction == 1.0)
    return GraphFrame(predictions_nodes, predictions_edges)
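# Illustrative usage sketch for get_graph; the toy tables, column names, and the
# `spark` session are assumptions that mirror the function's defaults
# (row_id, id_l/id_r, prediction).
records = spark.createDataFrame(
    [(1, "alice"), (2, "alicia"), (3, "bob")], ["row_id", "name"])
pair_predictions = spark.createDataFrame(
    [(1, 2, 1.0), (1, 3, 0.0)], ["id_l", "id_r", "prediction"])
g = get_graph(records, pair_predictions)
g.edges.show()  # only the pair predicted as a match (prediction == 1.0) remains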
def bipartition(graph: GraphFrame, partitions: List[str] = [], iteration: int = 0):
    if iteration == max_iter:
        return graph

    # relabel all partitions for scipy.sparse performance
    graph.cache()
    induced = induce_graph(graph, True, partitions)
    induced.cache()

    partition = f"sign_{iteration}"
    fiedler_value = f"fiedler_{iteration}"

    # The Fiedler vector is the second smallest eigenvector associated with
    # the graph laplacian, representing the algebraic connectivity of the graph.
    # This is used to implement spectral clustering, recursively, by partitioning
    # by the sign of the fiedler value. The partitions are evenly distributed.
    fiedler = (edges_with_partitions(induced, partitions)
               .groupBy(*partitions)
               .apply(compute_fiedler_udf(fiedler_value, partitions))
               .withColumn(partition,
                           F.expr(f"{fiedler_value} >= 0").astype("boolean")))

    vertices = undo_relabel(
        induced.vertices.join(fiedler, on=["id"] + partitions, how="left")
        .repartitionByRange(*partitions + [partition]))

    if should_checkpoint and iteration % checkpoint_interval == 0:
        # truncate logical plan to prevent out-of-memory on query plan
        # string representation. The edges are reused every iteration
        # and should not need to be checkpointed.
        vertices.cache()
        parted_graph = GraphFrame(vertices.localCheckpoint(eager=True), graph.edges)
    else:
        parted_graph = GraphFrame(vertices, graph.edges)

    return bipartition(parted_graph, partitions + [partition], iteration + 1)
def sample_graph(pages, pagelinks, sampling_ratio, relabel=True, ensure_connected=True):
    vertices = pages.sample(sampling_ratio)
    edges = pagelinks.selectExpr("from as src", "dest as dst")
    graph = induce_graph(GraphFrame(vertices, edges), False)
    if ensure_connected:
        # only do this when sampling; on the full dataset it takes 12 minutes.
        # This may be required in order to guarantee connectivity.
        components = graph.connectedComponents()
        largest_component = (components.groupBy("component").count()
                             .orderBy(F.desc("count"))
                             .limit(1)
                             .select("component"))
        vertices = components.join(largest_component, on="component",
                                   how="inner").drop("component")
        return induce_graph(GraphFrame(vertices, graph.edges), relabel=relabel)
    else:
        return graph
def comments_to_graph(df, id_col, src_col, dest_col):
    '''
    takes in a table of raw reddit data
    returns a graphframe
    '''
    vertices = df.withColumnRenamed(id_col, 'id')
    edges = vertices.select(src_col, dest_col) \
        .withColumnRenamed(src_col, 'src') \
        .withColumnRenamed(dest_col, 'dst')
    graph = GraphFrame(vertices, edges)
    return graph
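# Illustrative usage sketch for comments_to_graph; the tiny comment table, column
# names, and `spark` session below are assumptions standing in for real reddit data.
comments = spark.createDataFrame(
    [("c1", "c0", "t3_post1"), ("c2", "c1", "t3_post1")],
    ["comment_id", "parent_id", "link_id"])
g = comments_to_graph(comments, "comment_id", "id", "parent_id")
g.edges.show()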
def do_triangles(conf: Conf, g: graphframes.GraphFrame, s: Stepper, vertices_count: int) -> None:
    """
    Pattern for batch oriented iteration
    - we split the graph into batches using the filterVertices mechanism
    - we mark the total count of triangles and the partial count
    - in case of error:
      * we double the number of batches and the batch number
      * we restart the iteration at this point with a smaller subgraph
    """
    full_set = vertices_count
    batches = conf.batches_for_triangles
    total_triangles = conf.count_at_restart
    batch = conf.batch_at_restart
    subset = int(full_set / batches)

    while batch < batches:
        st = Stepper()
        count = 0
        try:
            print("try batches=", batches, "subset=", subset, "at batch=", batch)
            gc.collect()
            # g1 = g.filterVertices("int(cell/{}) == {}".format(subset, batch))
            g1 = g.filterVertices("int(id/{}) == {}".format(subset, batch))
            triangles = g1.triangleCount()
            st.show_step("partial triangleCount")
            gc.collect()
            count = triangles.agg({"cell": "sum"}).toPandas()["sum(cell)"][0]
            st.show_step("partial triangleCount sum")
            total_triangles += count
            print("batch=", batch, "vertices=", g1.vertices.count(),
                  "edges=", g1.edges.count(),
                  "total=", total_triangles, "partial", count)
        except Exception:
            print("memory error")
            batches *= 2
            batch *= 2
            subset = int(full_set / batches)
            print("restarting with batches=", batches, "subset=", subset,
                  "at batch=", batch)
            if subset >= 1:
                continue
            break

        batch += 1

    s.show_step("triangleCount")
    print("total=", total_triangles)
def get_connected_components(vertices_path, edges_path, checkpoint_dir, num_reads):
    # Read vertices and edges files
    df_vertices = build_vertices(vertices_path)
    df_edges = build_edges(edges_path, num_reads)

    # Build Graph
    spark = SparkSession.builder.appName("build_graph").getOrCreate()
    vertices = spark.createDataFrame(df_vertices)
    edges = spark.createDataFrame(df_edges)
    g = GraphFrame(vertices, edges)

    # Display Graph
    g.vertices.show()
    g.edges.show()

    # Connected Components
    # Get SparkContext using spark.sparkContext
    spark.sparkContext.setCheckpointDir(dirName=checkpoint_dir)
    result = g.connectedComponents()

    dictionary = {}
    sorted_result = result.select("id", "component").orderBy('component', ascending=False)
    for row in sorted_result.collect():
        if row[1] in dictionary:
            dictionary[row[1]].append(row[0])
        else:
            dictionary[row[1]] = [row[0]]

    GL = []
    for _, value in dictionary.items():
        GL.append(value)

    return GL, spark, g
def main():
    print('Read data from BigQuery')
    vertices = load_data(bq_vertices_table)
    edges = load_data(bq_edges_table)
    graph = GraphFrame(vertices, edges)

    print('Find the largest connected subgraph')
    subgraph = find_the_largest_subgraph(graph)

    print('Calculate pagerank')
    results = subgraph.pageRank(resetProbability=0.15, maxIter=10)
    results.vertices \
        .select('id', 'pagerank') \
        .orderBy(desc('pagerank')) \
        .show(20, False)

    spark.stop()
def runBPwithGraphFrames(cls, g, numIter):
    """Run Belief Propagation using GraphFrame.

    This implementation of BP shows how to use GraphFrame's aggregateMessages method.
    """
    # choose colors for vertices for BP scheduling
    colorG = cls._colorGraph(g)
    numColors = colorG.vertices.select('color').distinct().count()

    # TODO: handle vertices without any edges

    # initialize vertex beliefs at 0.0
    gx = GraphFrame(
        colorG.vertices.withColumn('belief', sqlfunctions.lit(0.0)),
        colorG.edges)

    # run BP for numIter iterations
    for iter_ in range(numIter):
        # for each color, have that color receive messages from neighbors
        for color in range(numColors):
            # Send messages to vertices of the current color.
            # We may send to source or destination since edges are treated as undirected.
            msgForSrc = sqlfunctions.when(AM.src['color'] == color,
                                          AM.edge['b'] * AM.dst['belief'])
            msgForDst = sqlfunctions.when(AM.dst['color'] == color,
                                          AM.edge['b'] * AM.src['belief'])
            # numerically stable sigmoid
            logistic = sqlfunctions.udf(cls._sigmoid, returnType=types.DoubleType())
            # aggregateMessages takes sendToSrc/sendToDst keyword arguments
            aggregates = gx.aggregateMessages(
                sqlfunctions.sum(AM.msg).alias("aggMess"),
                sendToSrc=msgForSrc,
                sendToDst=msgForDst)
            v = gx.vertices
            # receive messages and update beliefs for vertices of the current color
            newBeliefCol = sqlfunctions.when(
                (v['color'] == color) & (aggregates['aggMess'].isNotNull()),
                logistic(aggregates['aggMess'] + v['a'])).otherwise(
                    v['belief'])  # keep old beliefs for other colors
            newVertices = (
                v.join(aggregates, on=(v['id'] == aggregates['id']), how='left_outer')
                .drop(aggregates['id'])  # drop duplicate ID column (from outer join)
                .withColumn('newBelief', newBeliefCol)  # compute new beliefs
                .drop('aggMess')  # drop messages
                .drop('belief')  # drop old beliefs
                .withColumnRenamed('newBelief', 'belief'))
            # cache new vertices using workaround for SPARK-1334
            cachedNewVertices = AM.getCachedDataFrame(newVertices)
            gx = GraphFrame(cachedNewVertices, gx.edges)

    # Drop the "color" column from vertices
    return GraphFrame(gx.vertices.drop('color'), gx.edges)
def _colorGraph(g):
    """Given a GraphFrame, choose colors for each vertex.

    No neighboring vertices will share the same color. The number of colors is minimized.

    This is written specifically for grid graphs. For non-grid graphs, it should
    be generalized, such as by using a greedy coloring scheme.

    :param g: Grid graph generated by :meth:`Graphs.gridIsingModel()`
    :return: Same graph, but with a new vertex column "color" of type Int (0 or 1)
    """
    colorUDF = sqlfunctions.udf(lambda i, j: (i + j) % 2,
                                returnType=types.IntegerType())
    v = g.vertices.withColumn(
        'color', colorUDF(sqlfunctions.col('i'), sqlfunctions.col('j')))
    return GraphFrame(v, g.edges)
def process_graphs(sc, in_dir, partitions):
    """ Read graph vertices and edges from disk if already saved.
    Otherwise, read chem2bio2rdf drugbank, pubchem, and other N3 RDF models.
    Save vertices and edges to disk.

    Traverse the resulting graph - calculating page rank, using SQL to get
    names and PDB links of drugs.

    Args:
        sc (SparkContext): Access to the Spark compute fabric.
        in_dir (str): Path to Chemotext data storage for raw chem2bio2rdf N3 RDF models.
        partitions (int): Number of data partitions.
    """
    sqlContext = SQLContext(sc)
    n3_dirs = [os.path.join(in_dir, d) for d in ["drugbank", "pubchem"]]

    vertices_path_posix = os.path.join(in_dir, "vertices")
    edges_path_posix = os.path.join(in_dir, "edges")
    vertices_path = "file://{0}".format(vertices_path_posix)
    edges_path = "file://{0}".format(edges_path_posix)

    triples = None
    vertices = None
    edges = None
    g = None

    if os.path.exists(vertices_path_posix) and os.path.exists(edges_path_posix):
        print("Loading existing vertices: {0}".format(vertices_path))
        start = time.time()
        vertices = sqlContext.read.parquet(vertices_path).repartition(partitions).cache()
        print("Elapsed time for loading precomputed vertices: {0} seconds.".format(
            time.time() - start))

        print("Loading existing edges: {0}".format(edges_path))
        start = time.time()
        edges = sqlContext.read.parquet(edges_path).repartition(partitions).cache()
        print("Elapsed time for loading precomputed edges: {0} seconds.".format(
            time.time() - start))
    else:
        print("Constructing vertices and edges from chem2bio2rdf data sources")
        files = [os.path.join(n3_dir, n3_file)
                 for n3_dir in n3_dirs
                 for n3_file in os.listdir(n3_dir)]
        triples = sc.parallelize(files, numSlices=partitions). \
            flatMap(lambda n3_file: process_chunk(n3_file))
        vertices = sqlContext.createDataFrame(
            data=triples.flatMap(lambda d: [
                (trim_uri(d.S), "attr0"),
                (trim_uri(d.O), "attr1")]),
            schema=["id", "attr"]).cache()
        edges = sqlContext.createDataFrame(
            data=triples.map(lambda d: (trim_uri(d.S), trim_uri(d.O), trim_uri(d.P))),
            schema=["src", "dst", "relationship"]).cache()

        print("Triples: {0}".format(triples.count()))

        if os.path.exists(vertices_path_posix):
            shutil.rmtree(vertices_path_posix)
        if os.path.exists(edges_path_posix):
            shutil.rmtree(edges_path_posix)
        vertices.write.parquet(vertices_path)
        edges.write.parquet(edges_path)

    if vertices is not None and edges is not None:
        start = time.time()
        vertices.printSchema()
        edges.printSchema()
        print("Elapsed time for print schema: {0} seconds.".format(
            time.time() - start))

        start = time.time()
        print("  Total of {0} edges.".format(edges.count()))
        print("Elapsed time for count edges: {0}".format(time.time() - start))

        g = GraphFrame(vertices, edges)

        print("Query: Get in-degree of each vertex.")
        start = time.time()
        g.inDegrees. \
            sort("inDegree", ascending=False). \
            show(n=3, truncate=False)
        print("Elapsed time for computing in-degree: {0} seconds.".format(
            time.time() - start))

        start = time.time()
        print("Query: Number of protein database relationships: {0}".format(
            g.edges.
            filter("relationship LIKE '%resource/PDB_ID%' ").
            count()))
        print("Elapsed time for edge filter and count query: {0} seconds.".format(
            time.time() - start))

        edges.registerTempTable("edges")
        sqlContext.sql("""
            SELECT substring(src, length(src)-7, 6) as Drug,
                   dst as Name
            FROM edges
            WHERE relationship LIKE '%resource/Name%'
        """).show(n=3, truncate=False)

        start = time.time()
        sqlContext.sql("""
            SELECT substring(src, length(src)-7, 6) as Compound,
                   dst as SMILES
            FROM edges
            WHERE relationship LIKE '%open%_smiles%'
        """).show(n=3, truncate=False)
        print("Elapsed time for SQL query: {0} seconds.".format(
            time.time() - start))

        start = time.time()
        g.find("()-[Drug2PDB]->()"). \
            filter("Drug2PDB.relationship LIKE '%/PDB_ID' "). \
            show(n=3, truncate=False)
        print("Elapsed time for graph motif query: {0} seconds.".format(
            time.time() - start))

    return g
# filename = '/home/user/leaflet-spark/atom_position_frame_1.npz.npy'
coord_matrix = np.load(filename)
coord_matrix_broadcast = sc.broadcast(coord_matrix)
matrix_size = len(coord_matrix)

dist_Matrix = sc.parallelize(coord_matrix)
dist_Matrix = dist_Matrix.zipWithIndex()  # key-value pairs

edge_list = dist_Matrix.flatMap(find_edges)
edge_list = edge_list.filter(lambda x: x[0] != -1)  # filter the -1 values

sqlContext = SQLContext(sc)
Edges = Row('src', 'dst')
edge = edge_list.map(lambda x: Edges(*x))
e = sqlContext.createDataFrame(edge)
# e.take(10)
v = sqlContext.createDataFrame(
    sc.parallelize(range(matrix_size)).map(lambda i: Row(id=i + 1)))
# v.show()

# create the graph
g = GraphFrame(v, e)
# g.vertices.show()
# g.edges.show()

total_time = time() - start_time
cc = g.connectedComponents()
print(cc.select("id", "component").orderBy("component").show())
print('Total time to create the Graphframe: %i sec' % total_time)
print('Time to calculate the connected components: %i sec' %
      (time() - total_time - start_time))