Example n. 1
    def test_gf(self):

        vertices = spark.createDataFrame([('1', 'Carter', 'Derrick', 50),
                                          ('2', 'May', 'Derrick', 26),
                                          ('3', 'Mills', 'Jeff', 80),
                                          ('4', 'Hood', 'Robert', 65),
                                          ('5', 'Banks', 'Mike', 93),
                                          ('98', 'Berg', 'Tim', 28),
                                          ('99', 'Page', 'Allan', 16)],
                                         ['id', 'name', 'firstname', 'age'])
        edges = spark.createDataFrame([('1', '2', 'friend'),
                                       ('2', '1', 'friend'),
                                       ('3', '1', 'friend'),
                                       ('1', '3', 'friend'),
                                       ('2', '3', 'follows'),
                                       ('3', '4', 'friend'),
                                       ('4', '3', 'friend'),
                                       ('5', '3', 'friend'),
                                       ('3', '5', 'friend'),
                                       ('4', '5', 'follows'),
                                       ('98', '99', 'friend'),
                                       ('99', '98', 'friend')],
                                      ['src', 'dst', 'type'])
        g = GraphFrame(vertices, edges)
        g.connectedComponents().show()
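
Note: as the comments in Example n. 6 below point out, the default connectedComponents() algorithm in GraphFrames 0.3.0 and later requires a Spark checkpoint directory, which this test does not set. A minimal sketch of the extra step, assuming any writable path is acceptable (the path below is only an illustrative choice):

# assumption: "/tmp/graphframes_checkpoints" is an arbitrary writable directory, not from the original test
spark.sparkContext.setCheckpointDir("/tmp/graphframes_checkpoints")
g = GraphFrame(vertices, edges)
g.connectedComponents().show()   # original vertex columns plus an added "component" column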
Example n. 2
def main():
    # create the Spark session
    spark = SparkSession.builder.appName("keepindoors graphx connectedComponents()").getOrCreate()

    # get a mongo client
    cli = mongo.__get__()

    # vertices: ["id", "url", "title", "datetime"]
    localVertices = []
    cursor = mongo.getCollection(cli,"keepindoors","docs").find()
    for r in cursor:
        # del "_id" key which will throws error when createDataFrame
        r["id"] = r["docno"]
        localVertices.append((r["docno"],r["url"],r["title"],str(r["_id"].generation_time + timedelta(hours=8))))

    # edges: ["src", "dst", "distance"]
    cursor = mongo.getCollection(cli, "keepindoors", "distances").find()
    localEdges = []
    for r in cursor:
        localEdges.append((r["docno1"],r["docno2"],r["distance"]))

    v = spark.createDataFrame(localVertices,["id","url","title","datetime"])
    e = spark.createDataFrame(localEdges, ["src", "dst","distance"])
    g = GraphFrame(v,e)
    # get sparkContext from sparkSession
    spark.sparkContext.setCheckpointDir("/tmp/spark/checkpoint")
    result = g.connectedComponents()

    # order by component (ascending), then datetime (descending)
    result = result.orderBy(["component", "datetime"], ascending=[1, 0]).collect()

    # create component dict
    component_dict = {}
    for row in result:
        record = row.asDict()
        if record["component"] not in component_dict.keys():
            component_dict[record["component"]] = []
        component_dict[record["component"]].append(record)

    # delete everything in the mongo collection "components"
    mongo.deleteAll(cli,"keepindoors","components")

    # save component_dict into mongo
    index = 1
    for key,item in component_dict.items():
        links = []
        titles = []
        title = "empty title"
        update_time = "1970-01-01 00:00:00+00:00"
        for doc in item:
            titles.append(doc["title"])
            links.append(doc["url"])
            if doc["datetime"] > update_time:
                update_time = doc["datetime"]
                title = doc["title"]
        mongo.insertDoc({"no": index, "component": key, "title": title, "size": len(item),
                         "links": links, "titles": titles, "update_time": update_time,
                         "docs": item},
                        cli, "keepindoors", "components")
        index += 1
Example n. 3
def get_connected_components(vertices_path, edges_path, checkpoint_dir,
                             num_reads):
    # Read vertices and edges files
    df_vertices = build_vertices(vertices_path)
    df_edges = build_edges(edges_path, num_reads)

    # Build Graph
    spark = SparkSession.builder.appName("build_graph").getOrCreate()
    vertices = spark.createDataFrame(df_vertices)

    edges = spark.createDataFrame(df_edges)
    g = GraphFrame(vertices, edges)

    # Display Graph
    g.vertices.show()
    g.edges.show()

    # Connected Components
    # Get SparkContext using spark.sparkContext
    spark.sparkContext.setCheckpointDir(dirName=checkpoint_dir)
    result = g.connectedComponents()

    dictionary = {}

    sorted_result = result.select("id", "component").orderBy('component',
                                                             ascending=False)

    for row in sorted_result.collect():
        if row[1] in dictionary:
            dictionary[row[1]].append(row[0])
        else:
            dictionary[row[1]] = [row[0]]

    # one list of vertex ids per connected component
    GL = list(dictionary.values())

    return GL, spark, g
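
A hedged usage sketch for get_connected_components(); the file paths, checkpoint directory, and num_reads value below are hypothetical, and build_vertices()/build_edges() are assumed to return data that spark.createDataFrame() accepts:

# hypothetical inputs, shown only to illustrate the call signature
components, spark, g = get_connected_components(
    vertices_path="data/vertices.tsv",     # hypothetical path
    edges_path="data/edges.tsv",           # hypothetical path
    checkpoint_dir="/tmp/cc_checkpoints",  # any writable directory
    num_reads=2)                           # hypothetical threshold for build_edges
print(components)  # list of lists: vertex ids grouped by connected component
spark.stop()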
Example n. 4
import os
os.environ['PYSPARK_SUBMIT_ARGS'] = '--packages graphframes:graphframes:0.7.0-spark2.4-s_2.11'
import sys
from functools import reduce
from pyspark.sql.functions import col, lit, when
import pyspark
from graphframes.examples import Graphs
from graphframes import GraphFrame
import config

sc = pyspark.SparkContext()
sc.setCheckpointDir('/tmp')
sqlContext = pyspark.SQLContext(sc)
inputFile = sys.argv[1]
# g = Graphs(sqlContext).friends()  # Get example graph
df = sqlContext.read.format("csv").option("delimiter", config.delimiter).load(inputFile)
# Rename columns to something decent.
df = df.withColumnRenamed("_c0", "src")\
.withColumnRenamed("_c1", "dst")\
.withColumnRenamed("_c2", "weight")
df.show(5)

aggcodes = df.select("src","dst").rdd.flatMap(lambda x: x).distinct()
vertices = aggcodes.map(lambda x: (x, x)).toDF(["id","name"])

edges = df.select("src", "dst")
graph = GraphFrame(vertices, edges)

result = graph.connectedComponents()
result.select("id", "component").orderBy("component").show()
Example n. 5

# COMMAND ----------

stationGraph.bfs(fromExpr="id = 'Townsend at 7th'",
  toExpr="id = 'Spear at Folsom'", maxPathLength=2).show(10)


# COMMAND ----------

spark.sparkContext.setCheckpointDir("/tmp/checkpoints")


# COMMAND ----------

minGraph = GraphFrame(stationVertices, tripEdges.sample(False, 0.1))
cc = minGraph.connectedComponents()


# COMMAND ----------

cc.where("component != 0").show()


# COMMAND ----------

scc = minGraph.stronglyConnectedComponents(maxIter=3)



Example n. 6
    # Connected components: computes the connected component membership
    # of each vertex and returns a graph with each
    # vertex assigned a component ID.
    # NOTE: With GraphFrames 0.3.0 and later releases, 
    # the default Connected Components algorithm requires 
    # setting a Spark checkpoint directory. Users can 
    # revert to the old algorithm using 
    # connectedComponents.setAlgorithm("graphx").
    
    #=====================================
    # setting a Spark "checkpoint" directory
    #=====================================
    # What is checkpointing? Checkpointing is the process
    # of truncating the RDD lineage graph and saving it to a
    # reliable distributed (HDFS) or local file system.
    #
    # You call SparkContext.setCheckpointDir(directory: String) 
    # to set the checkpoint directory - the directory where RDDs 
    # are checkpointed.
    #
    spark.sparkContext.setCheckpointDir("/tmp/spark_check_point_dir")
    
    #==========================================
    # apply the connectedComponents() algorithm
    #==========================================
    #
    connected_components = graph.connectedComponents()
    connected_components.select("id", "component").orderBy("component").show()
    
    # done!
    spark.stop()
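
The note above mentions reverting to the older GraphX-based algorithm; in the Scala API this is done through connectedComponents.setAlgorithm("graphx"), and recent Python releases expose the same switch as an algorithm keyword argument (treat this as an assumption to verify against your GraphFrames version). A minimal sketch, assuming the graph object is named graph as above:

# assumption: this GraphFrames version accepts the algorithm keyword;
# "graphx" selects the legacy implementation and avoids the checkpoint-directory requirement
connected_components = graph.connectedComponents(algorithm="graphx")
connected_components.select("id", "component").orderBy("component").show()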
Example n. 7
    #filename = '/home/user/leaflet-spark/atom_position_frame_1.npz.npy'
    
    coord_matrix = np.load(filename)
    coord_matrix_broadcast = sc.broadcast(coord_matrix)
    matrix_size = len(coord_matrix)
    dist_Matrix = sc.parallelize(coord_matrix)
    dist_Matrix = dist_Matrix.zipWithIndex()  #key-value pairs
    edge_list = dist_Matrix.flatMap(find_edges)
    
    edge_list = edge_list.filter(lambda x: x[0]!=-1) # filter the -1 values
    
    sqlContext = SQLContext(sc)
    
    Edges = Row('src','dst')
    edge = edge_list.map(lambda x: Edges(*x))
    e = sqlContext.createDataFrame(edge)
    # e.take(10)
    v = sqlContext.createDataFrame(sc.parallelize(range(matrix_size)).map(lambda i: Row(id=i+1)))
    # v.show()
    
    # create the graph
    g = GraphFrame(v, e)
    #g.vertices.show()
    #g.edges.show()
    total_time = time() - start_time
    cc = g.connectedComponents()
    cc.select("id", "component").orderBy("component").show()
    print('Total time to create the GraphFrame: %i sec' % total_time)
    print('Time to calculate the connected components: %i sec' % (time() - total_time - start_time))
    
Example n. 8
chain4 = g.find("(a)-[ab]->(b); (b)-[bc]->(c); (c)-[cd]->(d)")
# chain4.show()
# g.find("(c)-[m]->()").show()
# Query on sequence, with state (cnt)
#  (a) Define method for updating state given the next element of the motif.
sumFriends = \
    lambda cnt, relationship: F.when(relationship == "friend", cnt + 1).otherwise(cnt)
#  (b) Use sequence operation to apply method to sequence of elements in motif.
#      In this case, the elements are the 3 edges.
condition = \
    reduce(lambda cnt, e: sumFriends(cnt, F.col(e).relationship), ["ab", "bc", "cd"], F.lit(0))
#  (c) Apply filter to DataFrame.
chainWith2Friends2 = chain4.where(condition >= 2)
# chainWith2Friends2.show()

result = g.connectedComponents()
# explanation of what the result means
# result.show()

# the difference between a strongly connected graph and a connected graph
'''
Connected is usually associated with undirected graphs (two way edges): there is a path between every two nodes.
Strongly connected is usually associated with directed graphs (one way edges): there is a route between every two nodes.
Complete graphs are undirected graphs where there is an edge between every pair of nodes.
'''
result = g.stronglyConnectedComponents(maxIter=10)
# result.orderBy("component").show()

## community detection; essentially a clustering algorithm
result = g.labelPropagation(maxIter=5)
# result.show()
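
To make the quoted distinction concrete, here is a small hedged sketch with invented data: a single one-way edge keeps the graph in one (weakly) connected component, while stronglyConnectedComponents() separates the two vertices. It assumes an existing SparkSession named spark and a writable checkpoint directory, as in the earlier examples:

# toy graph: a -> b with no edge back
v2 = spark.createDataFrame([("a", "Alice"), ("b", "Bob")], ["id", "name"])
e2 = spark.createDataFrame([("a", "b", "follows")], ["src", "dst", "type"])
g2 = GraphFrame(v2, e2)
spark.sparkContext.setCheckpointDir("/tmp/checkpoints")    # required by the default algorithm
g2.connectedComponents().show()                            # one component containing both a and b
g2.stronglyConnectedComponents(maxIter=10).show()          # a and b fall into different components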