class LPA():
    """Community detection via Label Propagation on a url -> mention graph.

    Reads an edge list from CSV, hashes each url/mention string into a short
    vertex id, builds a GraphFrame, and reports LPA communities, in-degrees,
    and strongly connected components.
    """

    def __init__(self):
        # Reuse an active session if one exists; otherwise start a new one.
        self.spark = SparkSession \
            .builder \
            .appName('Example_2') \
            .getOrCreate()

    def graphx(self):
        """Build the graph from the CSV shard and run LPA / degree / SCC analyses.

        Side effects: prints several DataFrame previews to stdout and writes
        the community assignments to the ``communities`` output directory.
        """
        self.df = self.spark.read.option("header", "true").csv('results_new/data-00000-of-00010.csv')
        # Rows with a null url or mention cannot form an edge.
        self.df = self.df.dropna()

        # Distinct node labels: every url and every mention becomes a vertex.
        self.rdd = self.df.select("url", "mention").rdd.flatMap(lambda x: x).distinct()

        def hashnode(x):
            # Short, deterministic vertex id derived from the string.
            return hashlib.sha1(x.encode("UTF-8")).hexdigest()[:8]

        hashnode_udf = udf(hashnode)

        vertices = self.rdd.map(lambda x: (hashnode(x), x)).toDF(["id", "url"])
        vertices.show(5)

        # Edges reuse the same hash so src/dst line up with the vertex ids.
        edges = self.df.select("url", "mention") \
            .withColumn("src", hashnode_udf("url")) \
            .withColumn("dst", hashnode_udf("mention")) \
            .select("src", "dst")
        edges.show(5)

        self.graph = GraphFrame(vertices, edges)

        print('communities are ')
        self.communities = self.graph.labelPropagation(maxIter=2)
        # NOTE: .show() prints to stdout and returns None, so it must not be
        # wrapped in print() — the original printed a stray "None" after each
        # table.
        self.communities.persist().show(10)
        self.communities.sort(desc("label")).show(50)

        self.communities.coalesce(1).write.format("com.databricks.spark.csv").option("header", "true").save("communities")

        print("There are " + str(self.communities.select('label').distinct().count()) + " communities in sample graph.")

        # Top-10 most-mentioned vertices, joined back to their url labels.
        self.graph.inDegrees.join(vertices, on="id") \
            .orderBy("inDegree", ascending=False).show(10)

        self.graph.stronglyConnectedComponents(maxIter=2).select('url', 'component').show(20)
# COMMAND ---------- stationGraph.bfs(fromExpr="id = 'Townsend at 7th'", toExpr="id = 'Spear at Folsom'", maxPathLength=2).show(10) # COMMAND ---------- spark.sparkContext.setCheckpointDir("/tmp/checkpoints") # COMMAND ---------- minGraph = GraphFrame(stationVertices, tripEdges.sample(False, 0.1)) cc = minGraph.connectedComponents() # COMMAND ---------- cc.where("component != 0").show() # COMMAND ---------- scc = minGraph.stronglyConnectedComponents(maxIter=3) # COMMAND ----------
"""Compute strongly connected components of a delimited edge-list file.

Usage (via spark-submit): python this_script.py <input-file>
The delimiter is taken from the local ``config`` module.
"""

# add GraphFrames package to spark-submit (must be set before the JVM starts)
import os
os.environ['PYSPARK_SUBMIT_ARGS'] = '--packages graphframes:graphframes:0.7.0-spark2.4-s_2.11'

import sys
from functools import reduce
from pyspark.sql.functions import col, lit, when
import pyspark
from graphframes.examples import Graphs
from graphframes import GraphFrame
import config

sc = pyspark.SparkContext()
sqlContext = pyspark.SQLContext(sc)

# Fail fast with a usage message instead of a bare IndexError when the
# input path argument is missing.
if len(sys.argv) < 2:
    sys.exit("usage: %s <input-file>" % sys.argv[0])
inputFile = sys.argv[1]

# g = Graphs(sqlContext).friends()  # Get example graph

df = sqlContext.read.format("csv").option("delimiter", config.delimiter).load(inputFile)

# Rename columns to something decent.
df = df.withColumnRenamed("_c0", "src")\
       .withColumnRenamed("_c1", "dst")\
       .withColumnRenamed("_c2", "weight")
df.show(5)

# Every id appearing as a source or destination becomes a vertex; the vertex
# id doubles as its display name.
aggcodes = df.select("src", "dst").rdd.flatMap(lambda x: x).distinct()
vertices = aggcodes.map(lambda x: (x, x)).toDF(["id", "name"])
edges = df.select("src", "dst")

graph = GraphFrame(vertices, edges)

result = graph.stronglyConnectedComponents(maxIter=10)
result.select("id", "component").orderBy("component").show()