Example #1
    def run(sc, TH, infile, outfile):
        rdd = sc.textFile(infile)
        first_line = rdd.first()
        data = rdd.filter(lambda line: line != first_line)

        uid = data.map(lambda line: (line.split(',')[0], line.split(',')[1])) \
            .groupByKey() \
            .mapValues(lambda x: sorted(list(x))) \
            .collectAsMap()
         
        cand_pairs = list(itertools.combinations(list(uid.keys()), 2))
        
        edge, vertex = list(), set()
        for pair in cand_pairs:
            if len(set(uid[pair[0]]).intersection(set(uid[pair[1]]))) >= TH:
                edge.append(tuple((pair[0], pair[1])))
                edge.append(tuple((pair[1], pair[0])))
                vertex.add(pair[0])
                vertex.add(pair[1])
        graph = GraphFrame(sc.parallelize(list(vertex)).map(lambda uid: (uid,)).toDF(['id']),
                                 sc.parallelize(edge).toDF(["src", "dst"]))
        communities = graph.labelPropagation(maxIter=5)
        communities = communities.rdd.coalesce(1) \
            .map(lambda idx_label: (idx_label[1], idx_label[0])) \
            .groupByKey() \
            .map(lambda label_idxes: sorted(list(label_idxes[1]))) \
            .sortBy(lambda idxes: (len(idxes), idxes)) \
            .collect()
        Task1.toFile(outfile, communities)
def main(argv):
    filter_threshold = int(argv[1])
    input_file_path = argv[2]
    output_file_path = argv[3]
    # os.environ["PYSPARK_SUBMIT_ARGS"] = ("--packages graphframes:graphframes:0.6.0-spark2.4-s_2.11")

    # filter_threshold = 7
    # input_file_path = "/Users/zhijunliao/Marks/USC/INF-553/HW/INF553HW4/data/ub_sample_data.csv"
    # output_file_path = "/Users/zhijunliao/Marks/USC/INF-553/HW/INF553HW4/output/task1.txt"

    # 38648 records
    total_start = time.time()
    start = time.time()
    input_data = sc.textFile(input_file_path).\
        filter(lambda line: "user_id" not in line).\
        map(lambda line: tuple(line.split(","))).\
        groupByKey().\
        mapValues(set).\
        persist()  # 3374

    edges = input_data.\
        cartesian(input_data).\
        filter(lambda pair: pair[0][0] < pair[1][0]).\
        filter(lambda pair: len(pair[0][1].intersection(pair[1][1])) >= filter_threshold).\
        flatMap(lambda pair: [(pair[0][0], pair[1][0]), (pair[1][0], pair[0][0])]).\
        persist()  # 996 498
    edges_df = edges.map(lambda pair: Row(src=pair[0], dst=pair[1])).toDF()

    vertices = edges.flatMap(lambda _: _).distinct().persist()  # 222
    vertices_df = vertices.map(Row("id")).toDF()
    print("finish building edges and vertices:", time.time() - start)

    start = time.time()
    graph = GraphFrame(vertices_df, edges_df)
    result = graph.labelPropagation(maxIter=5)
    print("finish running LPA:", time.time() - start)
    # result.count()  # 222
    # result.show()

    result_rdd = result.rdd.\
        map(lambda pair: (pair['label'], pair['id'])).\
        groupByKey().\
        mapValues(lambda values: (sorted(list(values)), len(values))).\
        persist()

    result_collection = result_rdd.collect()
    result_collection.sort(key=lambda kv: (kv[1][1], kv[1][0][0]))
    with open(output_file_path, "w") as output_file:
        for community_id, (user_list, length) in result_collection:
            output_file.write(f"'{user_list[0]}'")
            for user in user_list[1:]:
                output_file.write(f", '{user}'")
            output_file.write("\n")
    print("total running time:", time.time() - total_start)
Example #3
    def lpa(self, graph, iter):
        print("Community Detection\t1\tInitializing Algorithm", flush=True)
        edges = graph.get_df()
        vertices = edges.select('src').union(
            edges.select('dst')).distinct().withColumnRenamed('src', 'id')

        print("Community Detection\t2\tExecuting Label Propagation Algorithm",
              flush=True)
        graph = GraphFrame(vertices, edges)
        result = graph.labelPropagation(maxIter=iter)
        return result.orderBy('label', ascending=True).withColumnRenamed(
            'label', 'Community')
Example #4
def main(argv):
    assert len(argv) == 3, \
        "Script takes 3 arguments: <filter_threshold> <input_file> <community_output_file>"

    filter_threshold, input_file, output_file = argv

    filter_threshold = int(filter_threshold)

    config = SparkConf().setMaster("local[*]") \
                        .setAppName("Task2") \
                        .set("spark.executor.memory", "4g") \
                        .set("spark.driver.memory", "4g")

    sc = SparkContext.getOrCreate(conf=config)
    spark = SparkSession(sc)
    sc.setLogLevel("ERROR")

    lines = sc.textFile(input_file)
    header = lines.first()

    rdd_dict = lines.filter(lambda x: x != header) \
               .map(lambda x: (x.split(',')[0], x.split(',')[1])) \
               .groupByKey().collectAsMap()

    user_pairs = list(combinations(rdd_dict.keys(), 2))

    edges_rdd = sc.parallelize(user_pairs) \
                       .map(lambda x: (x[0], x[1])) \
                       .filter(lambda x: get_intersection(rdd_dict[x[0]], rdd_dict[x[1]]) >= filter_threshold) \
                       .cache()

    nodes_df = edges_rdd.flatMap(lambda x: x).distinct().map(
        lambda x: (x, )).toDF(["id"])

    edges_df = edges_rdd.toDF(["src", "dst"])

    gf = GraphFrame(nodes_df, edges_df)

    communities_rdd = gf.labelPropagation(maxIter=5).rdd.coalesce(1)

    communities = communities_rdd.map(lambda x: (x[1], x[0])) \
                                 .groupByKey() \
                                 .map(lambda x: sorted(list(x[1]))) \
                                 .sortBy(lambda x: (len(x), x)) \
                                 .collect()

    with open(output_file, "w+") as file:
        for community in communities:
            value = str(community)[1:-1]
            file.writelines(value + "\n")
        file.close()
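# get_intersection is not defined in this excerpt. A plausible sketch
# (hypothetical helper returning how many businesses two users have in common):
def get_intersection(businesses_a, businesses_b):
    return len(set(businesses_a).intersection(set(businesses_b)))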
Example #5
class LPA():

    def __init__(self):
        self.spark = SparkSession \
            .builder \
            .appName('Example_2') \
            .getOrCreate()

    def graphx(self):
        self.df = self.spark.read.option("header", "true").csv('results_new/data-00000-of-00010.csv')
        # print(self.df.show(n=5))

        self.df = self.df.dropna()
        self.rdd = self.df.select("url","mention").rdd.flatMap(lambda x: x).distinct()
        # print(self.rdd.take(5))

        def hashnode(x):
            return hashlib.sha1(x.encode("UTF-8")).hexdigest()[:8]

        hashnode_udf = udf(hashnode)

        vertices = self.rdd.map(lambda x: (hashnode(x), x)).toDF(["id", "url"])

        vertices.show(5)

        edges = self.df.select("url", "mention") \
            .withColumn("src", hashnode_udf("url")) \
            .withColumn("dst", hashnode_udf("mention")) \
            .select("src", "dst")

        edges.show(5)

        self.graph = GraphFrame(vertices, edges)
        # print(self.graph)
        print('communities are ')
        self.communities = self.graph.labelPropagation(maxIter=2)

        self.communities.persist().show(10)
        self.communities.sort(desc("label")).show(50)
        self.communities.coalesce(1).write.format("com.databricks.spark.csv").option("header", "true").save("communities")
        print("There are " + str(self.communities.select('label').distinct().count()) + " communities in sample graph.")

        self.graph.inDegrees.join(vertices, on="id") \
            .orderBy("inDegree", ascending=False).show(10)

        self.graph.stronglyConnectedComponents(maxIter=2).select('url', 'component').show(20)
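# A minimal usage sketch for the class above (assumes graphframes is available
# on the Spark packages classpath and the CSV path read in graphx() exists):
if __name__ == '__main__':
    lpa_job = LPA()
    lpa_job.graphx()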
Example #6
edgelist = []
verticeSet = set()
for x in combinations(dic.keys(), 2):
    if len(dic[x[0]].intersection(dic[x[1]])) >= threshold:
        edgelist.append(x)
        edgelist.append((x[1], x[0]))
        verticeSet.add(x[0])
        verticeSet.add(x[1])

verticelist = list(combinations(verticeSet, 1))

sqlContext = SQLContext(sc)
vertices = sqlContext.createDataFrame(verticelist, ["id"])
edges = sqlContext.createDataFrame(edgelist, ["src", "dst"])
g = GraphFrame(vertices, edges)
labeled = g.labelPropagation(maxIter=5)
resRDD = labeled.rdd.map(lambda x: (x['label'], [x['id']]))\
    .reduceByKey(lambda x, y: x+y)\
    .map(lambda x: (len(x[1]), [sorted(x[1])]))\
    .reduceByKey(lambda x, y: x+y)\
    .map(lambda x: (x[0], sorted(x[1])))\
    .sortByKey()

f = open(output, "w")
for x in resRDD.collect():
    communities = x[1]
    for community in communities:
        f.write(str(community)[1:-1]+'\n')
f.close()

print("runtime", time.time()-curr, "s")
Example #7
start = time.time()
threshold = int(sys.argv[1])
inputfile = sys.argv[2]
outputfile = sys.argv[3]
sc = SparkContext(master="local[3]")
sc.setLogLevel("WARN")
sql_sc = SQLContext(sc)
#sql_sc.sql("set spark.sql.shuffle.partitions=200")
data = sc.textFile(inputfile).map(lambda x: x.split(',')).filter(
    lambda x: x[0] != 'user_id')
user_pairs = data.map(lambda x: [x[1], x[0]]).groupByKey().mapValues(
    sorted).mapValues(lambda x: combinations(x, 2)).flatMap(
        lambda x: x[1]).flatMap(lambda x: [[x, 1], [x[::-1], 1]]).reduceByKey(
            add).map(lambda x: [x[0][0], x[0][1], x[1]]).filter(
                lambda x: x[2] >= threshold)
users = user_pairs.flatMap(lambda x: x[:2]).distinct().map(lambda x: [x])
print(users.count())
vertices = sql_sc.createDataFrame(users, ['id'])
edges = sql_sc.createDataFrame(user_pairs, ["src", "dst", "intersection"])
graph = GraphFrame(vertices, edges)
result = graph.labelPropagation(maxIter=5).select(
    'id',
    'label').rdd.map(lambda x: [x[1], x[0]]).groupByKey().map(lambda x: sorted(
        list(x[1]))).sortBy(lambda x: x[0]).sortBy(len).collect()
file = open(outputfile, 'w')
for item in result:
    file.write("'" + "', '".join(item) + "'")
    file.write('\n')
file.close()

print("Duration: ", time.time() - start)
Example #8
    output_path = sys.argv[2]

    os.environ['PYSPARK_SUBMIT_ARGS'] = (
        '--packages graphframes:graphframes:0.6.0-spark2.3-s_2.11')

    sc = SparkContext('local[*]', 'task1')
    sc.setLogLevel('OFF')

    start = time.time()

    vertices, edges = read_file(input_path)

    sqlc = SQLContext(sc)
    vertices = sqlc.createDataFrame(vertices, ['id'])
    edges = sqlc.createDataFrame(edges, ['src', 'dst'])
    graph = GraphFrame(vertices, edges)

    community = graph.labelPropagation(maxIter=5)
    communityRDD = community.rdd.map(
        lambda x: (x['label'], '\'{}\''.format(x['id']))).groupByKey()
    communityRDD = communityRDD.map(lambda x: sorted(x[1])).sortBy(
        lambda x: (len(x), x[0]))
    result = communityRDD.collect()

    with open(output_path, 'w') as f:
        for group in result:
            f.write(', '.join(group))
            f.write('\n')

    print('Duration: {}'.format(time.time() - start))
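# read_file is not shown in this excerpt. A plausible sketch (hypothetical
# helper returning vertex 1-tuples and bidirectional edge tuples from a
# user_id,business_id CSV; the threshold value 7 is an assumption):
def read_file(path):
    from itertools import combinations
    user_to_biz = {}
    with open(path) as f:
        next(f)  # skip the header line
        for line in f:
            user, biz = line.strip().split(',')
            user_to_biz.setdefault(user, set()).add(biz)
    vertices, edges = set(), []
    for u1, u2 in combinations(user_to_biz, 2):
        if len(user_to_biz[u1] & user_to_biz[u2]) >= 7:
            edges.extend([(u1, u2), (u2, u1)])
            vertices.update([u1, u2])
    return [(v,) for v in vertices], edges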
Example #9
    
    sourceRdd = sc.textFile(input_file)
    first_line = sourceRdd.first()
    
    uid_bid = sourceRdd.filter(lambda line: line != first_line).distinct().map(lambda tup: tup.split(','))
    
    gen_pair = uid_bid.map(lambda line: (line[1], [line[0]])).reduceByKey(add).flatMap(lambda list_pair:make_pair(list_pair[1]))
    
    freq_pair = gen_pair.map(lambda line: (line, 1)).reduceByKey(add).filter(lambda a: a[1] >= filter_threshold).keys()
    
    edges = freq_pair.flatMap(lambda line: ((line[0], line[1]), (line[1], line[0])))
    
    #create DataFrame needs a tuple as input
    vertices = freq_pair.flatMap(lambda line: line).distinct().map(lambda line: (line, ))
    
    sql_ctx = SQLContext(sc)

    v_df = sql_ctx.createDataFrame(vertices, ["id"])
    e_df = sql_ctx.createDataFrame(edges, ["src", "dst"])
    
    graph_f = GraphFrame(v_df, e_df)
    
    label_prop = graph_f.labelPropagation(maxIter=5)
    
    result_community = label_prop.rdd.coalesce(1).map(lambda id_label: (id_label[1], id_label[0])).groupByKey()\
                       .map(lambda communities: sorted(list(communities[1]))).sortBy(lambda comms: (len(comms), comms))
    
    dumpResult(output_file, result_community)
    
    end = time.time()
    print("Duration:", end-start)
Example #10
edge_list = []
vertex_list = []
for each in user_pair:
    edge_list.append(tuple([each[0],each[1]]))
    edge_list.append(tuple([each[1], each[0]]))
    vertex_list.append(each[0])
    vertex_list.append(each[1])
#remove same vertex
vertex_list = list(set(vertex_list))


edge = sc.parallelize(edge_list).toDF(["src","dst"])
vertex = sc.parallelize(vertex_list).map(lambda s: Row(id=s)).toDF(['id'])

graph = GraphFrame(vertex,edge)
community = graph.labelPropagation(maxIter=5).rdd.map(lambda s: (s[1],s[0])).groupByKey()\
    .map(lambda s: list(sorted(s[1]))).collect()

sorted_community = sorted(community,key=lambda s:(len(s),min(s)))

with open(community_output_file_path, 'w') as output_file:
    for i in sorted_community:
        output_file.writelines(str(i)[1:-1] + "\n")

end = time.time()
print ("Duration: %s Seconds"%(end-start))




Example #11
    input_lines = sc.textFile(input_file_path).map(lambda x: x.split(',')).map(
        lambda x: (x[0], x[1])).filter(lambda x: x[0] != "user_id").groupByKey(
        ).mapValues(lambda x: list(x))
    ub_dict = input_lines.collectAsMap()

    edges = []
    points = set()
    for x in list(itertools.combinations(ub_dict.keys(), 2)):
        if len(set(ub_dict[x[0]]).intersection(set(
                ub_dict[x[1]]))) >= int(filter_threshold):
            edges.append(x)
            edges.append((x[1], x[0]))
            points.add(x[0])
            points.add(x[1])
    points_df = sc.parallelize(list(points)).map(lambda x: (x, ))
    points_df = ss.createDataFrame(points_df, ['id'])
    edges_df = sc.parallelize(edges)
    edges_df = ss.createDataFrame(edges_df, schema=['src', 'dst'])
    graph = GraphFrame(points_df, edges_df)
    lpa_graph = graph.labelPropagation(maxIter=5)
    communities = lpa_graph.rdd.map(lambda x: (x[1], x[0])).groupByKey().map(
        lambda x: sorted(list(x[1]))).sortBy(lambda x: (len(x), x))

    result = communities.collect()
    # output
    with open(output_file_path, 'w+') as output_file:
        for line in result:
            output_file.writelines(str(line)[1:-1] + "\n")
    print('Duration:', (time.time() - start_time))
Example #12
    os.environ["PYSPARK_SUBMIT_ARGS"] = (
        "--packages graphframes:graphframes:0.6.0-spark2.3-s_2.11")

    filter_threshold = 7
    fpath = sys.argv[1]
    output_path = sys.argv[2]


    start = time.time()

    rdd = sc.textFile(fpath).filter(lambda x: x != 'user_id,business_id') \
        .map(lambda x: (x.split(',')[0], x.split(',')[1])) \
        .groupByKey() \
        .map(lambda x: (x[0], list(x[1]))).persist()
    uid_lst = rdd.map(lambda x: x[0]).collect()
    bid_lst_info = rdd.collectAsMap()
    edges = rdd.flatMap(count_common).distinct().map(lambda x: (x[0][0], x[0][1])).toDF(["src", "dst"])
    v = rdd.map(lambda x: (x[0],)).toDF(['id'])

    gf = GraphFrame(v, edges).dropIsolatedVertices()
    result = gf.labelPropagation(maxIter=5)
    ans_rdd = result.rdd.coalesce(4).map(tuple)
    ans = ans_rdd.map(lambda x: (x[1], x[0])).groupByKey().mapValues(list).mapValues(sorted).sortBy(
        lambda x: len(x[1])).map(lambda x: x[1]).collect()

    #export answers
    write_files(output_path, ans)

    end = time.time()
    duration = end - start
    print(f'finished in {duration} seconds!')
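# count_common and write_files come from earlier, unshown code. A plausible
# sketch of count_common (hypothetical), based on how it is used above: for
# one (user, business_list) row it emits ((user, other_user), 1) for every
# other user sharing at least filter_threshold businesses, using the
# collected uid_lst / bid_lst_info:
def count_common(row):
    uid, bids = row[0], set(row[1])
    out = []
    for other in uid_lst:
        if other != uid and len(bids & set(bid_lst_info[other])) >= filter_threshold:
            out.append(((uid, other), 1))
    return out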
Example #13
    # is built as an instance of a GraphFrame, which is a pair of
    # vertices (as `v`) and edges (as `e`):
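    # `v` and `e` are created in earlier, unshown code. A sketch of how they
    # might be built, assuming the usual GraphFrames "friends"-style sample
    # data suggested by the schemas below (illustrative, not the author's code):
    v = spark.createDataFrame(
        [("a", "Alice", 34), ("b", "Bob", 36), ("c", "Charlie", 30)],
        ["id", "name", "age"])
    e = spark.createDataFrame(
        [("a", "b", "friend"), ("b", "c", "follow"), ("c", "b", "follow")],
        ["src", "dst", "relationship"])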
    graph = GraphFrame(v, e)
    print("graph=", graph)
    # GraphFrame(v:[id: string, name: string ... 1 more field],
    #            e:[src: string, dst: string ... 1 more field])

    #==================================
    # Label Propagation Algorithm (LPA)
    #==================================
    # Run static Label Propagation Algorithm for detecting
    # communities in networks.
    # Each node in the network is initially assigned to
    # its own community. At every superstep, nodes send
    # their community affiliation to all neighbors and
    # update their state to the mode community affiliation
    # of incoming messages.
    #
    # LPA is a standard community detection algorithm for
    # graphs. It is very inexpensive computationally, although
    # (1) convergence is not guaranteed and
    # (2) one can end up with trivial solutions
    # (all nodes are identified into a single community).
    #
    result = graph.labelPropagation(maxIter=5)
    result.show(truncate=False)
    result.select("id", "label").show(truncate=False)

    # done!
    spark.stop()
Example #14
        .map(lambda x: tuple([x]))\
        .distinct()

    nodes_df = nodes.toDF(["id"])
    # nodes_df.show()

    # get all possible edges
    edges_df = user_pairs.union(user_pairs.map(lambda x: (x[1], x[0]))).toDF(
        ["src", "dst"])
    # edges_df.show()

    # create graph
    community_graph = GraphFrame(nodes_df, edges_df)

    # get labels
    result = community_graph.labelPropagation(maxIter=5)
    # result.show()

    # group communities
    detected_communities = result\
        .rdd\
        .map(tuple)\
        .map(lambda x: (x[1], x[0]))\
        .groupByKey()\
        .map(lambda x: sorted(list(x[1])))\
        .sortBy(lambda x: (len(x), x[0]))

    detected_communities_collection = detected_communities.collect()

    write_results(detected_communities_collection, argv[3])
Example #15
## Create the mapping.
graphVertices = (rawData.map(lambda f : (f[0])) + rawData.map(lambda f : (f[1]))).distinct().map(lambda x: Row(id = x)).collect()
pprint(graphVertices)
graphEdges = rawData.flatMap(lambda f : ((f[0], f[1]), (f[1], f[0]))).collect()

## Create the vertices.
vertices = sqlContext.createDataFrame(graphVertices, ['id'])

## Create the edges.
edges = sqlContext.createDataFrame(graphEdges, ["src", "dst"])

## Construct the graph.
g = GraphFrame(vertices, edges)

## Run the label propagation algorithm.
result = g.labelPropagation(maxIter = 5).collect()
resultDict = {}

for i in result:
    label = i["label"]
    if label not in resultDict.keys():
        resultDict[label] = []
    resultDict[label].append(str(i["id"]))

ordered_keys = sorted(resultDict, key=lambda k: (len(resultDict[k]), min(resultDict[k])))
with open(outfilePath, "w") as f:
    for key in ordered_keys:
        temp = sorted(resultDict[key])
        f.write("'" + "', '".join(temp) + "'\n")

tockTime = time.time()
Example #16
input_file = sys.argv[1]
output_file = sys.argv[2]
# input_file = 'dataset/power_input.txt'
# output_file = 'output/task1.csv'

conf = SparkConf().setAppName("INF553").setMaster('local[*]')
sc = SparkContext(conf=conf)
sc.setLogLevel("ERROR")
sqlContext = SQLContext(sc)

input_data = sc.textFile(input_file)
input_data = input_data.map(lambda x: x.split(" "))

vertices_src = input_data.map(lambda x: x[0]).persist()
vertices_dst = input_data.map(lambda x: x[1]).persist()
vertices = sc.union([vertices_src, vertices_dst]).distinct().map(lambda x: Row(x))

edges_forward = input_data.map(lambda x: (x[0], x[1])).persist()
edges_backward = input_data.map(lambda x: (x[1], x[0])).persist()
edges = sc.union([edges_forward, edges_backward]).distinct()

vertices = sqlContext.createDataFrame(vertices, ["id"])
edges = sqlContext.createDataFrame(edges, ["src", "dst"])

gf = GraphFrame(vertices, edges)
lpa_df = gf.labelPropagation(maxIter=5)
output = lpa_df.rdd.map(lambda x: (x[1], x[0])).groupByKey().mapValues(lambda x: sorted(list(x))) \
    .sortBy(lambda x: (len(x[1]), x[1])).map(lambda x: tuple(x[1])).collect()
write_to_file(output_file, output)
print("Duration:", time.time() - start_time)
Example #17
g.edges.show()

result_df_political = sqlContext.read.parquet("result.df.f_political")


# result_df_political.show()

def get_groups(row):
    return row[0]


min_length = 9999999
iteration_min_length = 100
yeah_good_length = False
for i in range(7, 15):
    result_df = g.labelPropagation(maxIter=i)

    clusters_result = result_df_political.join(result_df, result_df_political["identifier"] == result_df["id"])
    clusters_result = clusters_result.drop("id")

    clusters_result_groups = clusters_result.groupBy("label").count().sort(col("label")).rdd.map(get_groups).collect()
    clusters_result.groupBy("label").count().sort(col("label")).show()
    users_by_groups_7 = {}
    groups_length = 0
    for group in clusters_result_groups:
        groups_length += 1
    print "Length:\n" + str(groups_length)
    print "Iteration:" + str(i)
    result_df.show()

    if min_length > groups_length:
Example #18
edges1_rdd = data.map(lambda x: x.split(' ')).map(
    lambda x: (str(x[0]), str(x[1]))).collect()
edges2_rdd = data.map(lambda x: x.split(' ')).map(
    lambda x: (str(x[1]), str(x[0]))).collect()
edges1 = edges1_rdd + edges2_rdd
edges_rdd = sc.parallelize(edges1)

vertices = sqlContext.createDataFrame(vertices_rdd, ["id"])

edges = sqlContext.createDataFrame(edges_rdd, ["src", "dst"])

#add edges and vertices to graph in library
g = GraphFrame(vertices, edges)
#print(g)

#call algorithm to detect communities
result = g.labelPropagation(maxIter=5).rdd.map(
    lambda x: (x[1], str(x[0]))).groupByKey().map(lambda x: sorted(x[1]))
sorted_graph_list = result.collect()

sorted_graph_list.sort()
sorted_graph_list.sort(key=len)

#print(sorted_graph_list)

f = open(output_file_name, 'w')

for i in sorted_graph_list:
    s = str(i).replace("[", "").replace("]", "")
    f.write(str(s))
    f.write("\n")
f.close()
Example #19
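# The vertex/edge tables below are the GraphFrames "friends" example graph.
# A sketch of how `g` might have been created (assumes an existing SQLContext
# named sqlContext and the graphframes examples module):
from graphframes.examples import Graphs
g = Graphs(sqlContext).friends()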
# | a|  Alice| 34|
# | b|    Bob| 36|
# | c|Charlie| 30|
# | d|  David| 29|
# | e| Esther| 32|
# | f|  Fanny| 36|
# | g|  Gabby| 60|
# +--+-------+---+

g.edges.show(5)
# +---+---+------------+
# |src|dst|relationship|
# +---+---+------------+
# |  a|  b|      friend|
# |  b|  c|      follow|
# |  c|  b|      follow|
# |  f|  c|      follow|
# |  e|  f|      follow|
# |  e|  d|      friend|
# |  d|  a|      friend|
# |  a|  e|      friend|
# +---+---+------------+

# Get a DataFrame with columns "id" and "inDegree" (in-degree)
vertexInDegrees = g.inDegrees
vertexInDegrees.show(5)

# Run LPA
communities = g.labelPropagation(maxIter=5)
communities.persist().show(10)
Example #20
user_distinct = graph_rdd.map(lambda line: (line[0], line[1]))\
            .flatMap(lambda x: x)\
            .distinct()\
            .map(lambda x: [x])

graph_with_reverse = graph_rdd.map(lambda line: ([line[0], line[1]], [line[1], line[0]]) )\
            .flatMap(lambda x: x)


vertex_df = spark_sql.createDataFrame(user_distinct, ["id"]).coalesce(6)
edge_df = spark_sql.createDataFrame(graph_with_reverse,["src", "dst"]).coalesce(6)

time_here = time.time()
g = GraphFrame(vertex_df, edge_df)

res_df = g.labelPropagation(maxIter=5)
# res_df.show()
print("Duration for graph propagtion:", time.time() - time_here)

res_rdd = res_df.coalesce(1).rdd.map(tuple)

res_list = res_rdd.map(lambda pair: (pair[1], pair[0]))\
        .groupByKey()\
        .map(lambda line: sorted(list(line[1])))\
        .sortBy(lambda x: [len(x), x] )\
        .map(lambda x: ", ".join(x))\
        .collect()

convert_to_file(OUTPUT_PATH, res_list)
print("Duration: ", time.time() - start_time)
Example #21
sqlContext = SQLContext(sc)

sc.setLogLevel("ERROR") 
sample_text = sc.textFile(sys.argv[2]).map(lambda x : x.split(','))
sample_first = sample_text.take(1) 
sample_text = sample_text.filter(lambda x : x !=sample_first[0]).map(lambda x : (x[1],[x[0]])).reduceByKey(lambda x,y: x+y)
#edges = sample_text.map(lambda x : (x[0],[x[1])])).reduceByKey(lambda x,y : x+y).map(lambda x :(1,[x])).reduceByKey(lambda x,y : x+y).flatMap(lambda x : itertools.combinations(x[1],2)).filter(lambda x : len(set(x[0][1]).intersection(set(x[1][1]))) >= 7).map(lambda x : (x[0][0].encode('UTF-8'),x[1][0].encode('UTF-8'))).distinct().map(lambda x : [x[0],x[1]])

t = int(sys.argv[1])
edges = sample_text.flatMap(lambda x: ([x[0], y] for y in list(itertools.permutations(x[1], 2)))) \
    .filter(lambda x: len(set(x[1])) != 1) \
    .map(lambda x: (x[1], [x[0]])) \
    .reduceByKey(lambda x, y: x + y) \
    .filter(lambda x: len(set(x[1])) >= t) \
    .map(lambda x: (x[0][0], x[0][1])) \
    .distinct() \
    .map(lambda x: [x[0], x[1]])
edgesDF = sqlContext.createDataFrame(edges,["src","dst"])
vertices = edges.flatMap(lambda x: ((x[0]),(x[1]))).distinct().map(lambda x : [x])

verticesDF = sqlContext.createDataFrame(vertices,["id"])

g = GraphFrame(verticesDF, edgesDF)
comm = g.labelPropagation(maxIter=5).select('label','id')
comm_rdd = comm.rdd.map(lambda x:(x[0],[x[1]])).reduceByKey(lambda x,y: x+y).sortBy(lambda x : (len(x[1]),sorted(x[1]))).collect()

f = open(sys.argv[3],"w")
for community in comm_rdd :  
    print(community)
    f.write(str(sorted(community[1])).strip("[]"))
    f.write("\n")
f.close()
sc.stop()
print("Duration : " + str(time.time()-start))



Example #22
    edge_list = list()
    vertex_set = set()
    for pair in uid_pairs:
        if len(
                set(uid_bidxes_dict[pair[0]]).intersection(
                    set(uid_bidxes_dict[pair[1]]))) >= int(filter_threshold):
            edge_list.append(tuple(pair))
            edge_list.append(tuple((pair[1], pair[0])))
            vertex_set.add(pair[0])
            vertex_set.add(pair[1])

    # vertex_df = vertex_rdd.toDF(["id"]).write.csv('vertex.csv')
    # edge_df = edge_rdd.toDF(["src", "dst"]).write.csv('edge.csv')
    vertex_df = sc.parallelize(list(vertex_set)).map(lambda uid: (uid, )).toDF(
        ['id'])
    edge_df = sc.parallelize(edge_list).toDF(["src", "dst"])

    graph_frame = GraphFrame(vertex_df, edge_df)

    communities = graph_frame.labelPropagation(maxIter=5)

    communities_rdd = communities.rdd.coalesce(1) \
        .map(lambda idx_label: (idx_label[1], idx_label[0])) \
        .groupByKey().map(lambda label_idxes: sorted(list(label_idxes[1]))) \
        .sortBy(lambda idxes: (len(idxes), idxes))

    # export your finding
    export2File(communities_rdd.collect(), output_file_path)

    print("Duration: %d s." % (time.time() - start))
Example #23
        return False


new_rddr = user_rdd.cartesian(user_rdd).filter(lambda a: a[0] < a[1]).filter(
    lambda a: check3(user_busi_dict[a[0]], user_busi_dict[a[1]]))
user = new_rddr.map(lambda a: list([a[0], a[1]])).flatMap(
    lambda a: a).distinct()
user_df = user.map(lambda x: (x, )).toDF(["id"])
edge_df = new_rddr.toDF(["src", "dst"])
# print(user.take(10))
# edge_df.show()

# user_df.show()
g = GraphFrame(user_df, edge_df)
# print(g)
result = g.labelPropagation(maxIter=5).persist()
# result.show()
res = result.select(
    "id", "label").rdd.map(tuple).map(lambda a: (a[1], a[0])).groupByKey().map(
        lambda a: (a[0], tuple(sorted(list(a[1]))))).map(lambda a: (a[1][0], a[
            1])).sortByKey().map(lambda a: (len(a[1]), a[1])).sortByKey().map(
                lambda a: a[1]).collect()
# print(res.take(50))
filename = out_path
with open(filename, 'w') as zaili:
    for a in res:
        l = len(a)
        for i in range(l):
            if i == l - 1:
                zaili.write(str(a[i]))
            else:
Example #24
e = sqlContext.createDataFrame(edges, ["src", "dst", "relationship"])
# Create a GraphFrame

print "Vertex: " + str(len(vertex))
print "Edges: " + str(len(edges))

g = GraphFrame(v, e)
g.vertices.show()
g.edges.show()


def get_groups(row):
    return row[0]


result_df_7 = g.labelPropagation(maxIter=7)
result_df_7.show()
groups_by_7 = result_df_7.groupBy("label").count().sort(
    col("label")).rdd.map(get_groups).collect()
result_df_7.groupBy("label").count().sort(col("label")).show()

groups_length = 0
for group in groups_by_7:
    groups_length += 1
print "Length:\n" + str(groups_length)
"""result_df_7.write.parquet("result.df.7")

result_df_6 = g.labelPropagation(maxIter=6)
result_df_6.show()
result_df_6.write.parquet("result.df.6")