Esempio n. 1
0
'''
Connected is usually associated with undirected graphs (two way edges): there is a path between every two nodes.
Strongly connected is usually associated with directed graphs (one way edges): for every two nodes there is a directed route from each one to the other.
Complete graphs are undirected graphs where there is an edge between every pair of nodes.
'''
# Compute strongly connected components of `g` with maxIter=10.
# NOTE(review): `g` is not defined in the lines visible before this point;
# presumably it is built earlier in the file -- confirm.
result = g.stronglyConnectedComponents(maxIter=10)
# result.orderBy("component").show()

## Community detection; in essence this is a clustering algorithm
# This rebinds `result`, so the strongly-connected-components output above is
# no longer reachable unless it is inspected before this line runs.
result = g.labelPropagation(maxIter=5)
# result.show()

## A worked example, with some background

# Vertex DataFrame: one row per vertex, keyed by the "id" column.
vertex_rows = [
    ("a", "Alice", 34),
    ("b", "Bob", 36),
    ("c", "Charlie", 30),
    ("d", "David", 29),
    ("e", "Esther", 32),
    ("f", "Fanny", 36),
    ("g", "Gabby", 60),
]
v = spark.createDataFrame(vertex_rows, ["id", "name", "age"])

# Edge DataFrame: one row per (src, dst) pair, carrying a "sim" value.
edge_rows = [
    ("a", "b", 0.7),
    ("b", "c", 0.8),
    ("c", "b", 0.1),
    ("f", "c", 0.8),
    ("e", "f", 0.2),
    ("e", "d", 0.2),
    ("d", "a", 0.3),
    ("a", "e", 0.22),
]
e = spark.createDataFrame(edge_rows, ["src", "dst", "sim"])

# Create a GraphFrame (in essence it is just the two DataFrames above) and
# keep only the edges whose "sim" exceeds 0.6 before looking for components.
g = GraphFrame(v, e).filterEdges("sim > 0.6")
result = g.connectedComponents()

# A benefit of GraphFrames: the result can be post-processed with the
# DataFrame API we are already familiar with. Here the vertex names are
# collected per component and the component column is shown as "group".
names_in_component = F.collect_list("name").alias("name")
grouped = result.groupby("component").agg(names_in_component)
grouped.select(F.col("component").alias("group"), F.col("name")).show()