Example no. 1
import hashlib

from pyspark.sql import SparkSession
from pyspark.sql.functions import desc, udf
from graphframes import GraphFrame


class LPA:

    def __init__(self):
        self.spark = SparkSession \
            .builder \
            .appName('Example_2') \
            .getOrCreate()

    def graphx(self):
        # Load the (url, mention) edge list from a headered CSV shard.
        self.df = self.spark.read.option("header", "true") \
            .csv('results_new/data-00000-of-00010.csv')
        # print(self.df.show(n=5))

        self.df = self.df.dropna()
        # Collect the distinct node names appearing in either column.
        self.rdd = self.df.select("url", "mention").rdd.flatMap(lambda x: x).distinct()
        # print(self.rdd.take(5))

        def hashnode(x):
            # Short, stable node id: the first 8 hex chars of the SHA-1 digest.
            return hashlib.sha1(x.encode("UTF-8")).hexdigest()[:8]

        hashnode_udf = udf(hashnode)

        vertices = self.rdd.map(lambda x: (hashnode(x), x)).toDF(["id", "url"])

        vertices.show(5)

        edges = self.df.select("url", "mention") \
            .withColumn("src", hashnode_udf("url")) \
            .withColumn("dst", hashnode_udf("mention")) \
            .select("src", "dst")

        edges.show(5)

        self.graph = GraphFrame(vertices, edges)
        # print(self.graph)
        # Detect communities via label propagation.
        print('Communities:')
        self.communities = self.graph.labelPropagation(maxIter=2)

        self.communities.persist().show(10)
        self.communities.sort(desc("label")).show(50)
        self.communities.coalesce(1).write \
            .option("header", "true").csv("communities")
        print("There are " + str(self.communities.select('label').distinct().count()) + " communities in sample graph.")

        # Top 10 nodes by in-degree.
        self.graph.inDegrees.join(vertices, on="id") \
            .orderBy("inDegree", ascending=False).show(10)

        self.graph.stronglyConnectedComponents(maxIter=2) \
            .select('url', 'component').show(20)
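
# A minimal usage sketch (an addition, not part of the original snippet):
# instantiate the class and run the whole pipeline.
if __name__ == "__main__":
    LPA().graphx()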

Example no. 2

# COMMAND ----------

stationGraph.bfs(fromExpr="id = 'Townsend at 7th'",
  toExpr="id = 'Spear at Folsom'", maxPathLength=2).show(10)


# COMMAND ----------

# connectedComponents() (used below) requires a checkpoint directory to be set.
spark.sparkContext.setCheckpointDir("/tmp/checkpoints")


# COMMAND ----------

# Work on a 10% sample of the trip edges to keep the example tractable.
minGraph = GraphFrame(stationVertices, tripEdges.sample(False, 0.1))
cc = minGraph.connectedComponents()


# COMMAND ----------

cc.where("component != 0").show()


# COMMAND ----------

scc = minGraph.stronglyConnectedComponents(maxIter=3)
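
# COMMAND ----------

# A follow-up sketch (an addition): list only the non-trivial strongly
# connected components, i.e. those containing more than one station.
scc.groupBy("component").count().where("count > 1").show()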


# COMMAND ----------

Example no. 3
# Add the GraphFrames package to the Spark launch arguments. Note that
# PYSPARK_SUBMIT_ARGS must end with 'pyspark-shell' to take effect when the
# script is launched with plain python.
import os
os.environ['PYSPARK_SUBMIT_ARGS'] = \
    '--packages graphframes:graphframes:0.7.0-spark2.4-s_2.11 pyspark-shell'
import sys
import pyspark
from graphframes.examples import Graphs
from graphframes import GraphFrame
import config  # local module supplying the CSV delimiter

sc = pyspark.SparkContext()
sqlContext = pyspark.SQLContext(sc)
inputFile = sys.argv[1]
# g = Graphs(sqlContext).friends()  # Get example graph
df = sqlContext.read.format("csv").option("delimiter", config.delimiter).load(inputFile)
# Rename the positional CSV columns to meaningful names.
df = df.withColumnRenamed("_c0", "src") \
    .withColumnRenamed("_c1", "dst") \
    .withColumnRenamed("_c2", "weight")
df.show(5)

# Vertices are the distinct node names found on either end of an edge.
aggcodes = df.select("src", "dst").rdd.flatMap(lambda x: x).distinct()
vertices = aggcodes.map(lambda x: (x, x)).toDF(["id", "name"])

edges = df.select("src", "dst")
graph = GraphFrame(vertices, edges)

result = graph.stronglyConnectedComponents(maxIter=10)
result.select("id", "component").orderBy("component").show()