Example no. 1
    def graphframes_pagerank(self, sc, sqlc):
        """ GraphFrame's PageRank implementation """

        from graphframes import GraphFrame  # pylint: disable=import-error

        edge_df = sqlc.read.load(os.path.join(self.args.webgraph, "edges"))
        vertex_df = sqlc.read.load(os.path.join(self.args.webgraph,
                                                "vertices"))

        graph = GraphFrame(vertex_df, edge_df)

        withPageRank = graph.pageRank(maxIter=self.args.maxiter)

        final_df = sql(
            sqlc, """
            SELECT CONCAT(ranks.domain, ' ', ranks.pagerank) r
            FROM ranks
            ORDER BY ranks.pagerank DESC
        """, {"ranks": withPageRank.vertices})

        if self.args.dump:

            final_df.coalesce(1).write.text(
                self.args.dump,
                compression="gzip" if self.args.gzip else "none")

        else:
            print(final_df.rdd.collect())
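As a side note, pageRank() returns a new GraphFrame whose vertices carry the scores in a "pagerank" column, so the same ranking could be read without the custom sql() helper. A minimal sketch reusing the names from this example (withPageRank, domain):

# standard GraphFrames behaviour: the scores live on the result's vertices
withPageRank.vertices \
    .select("domain", "pagerank") \
    .orderBy("pagerank", ascending=False) \
    .show(10, truncate=False)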
Example no. 2
    def test_gf(self):

        vertices = spark.createDataFrame([('1', 'Carter', 'Derrick', 50),
                                          ('2', 'May', 'Derrick', 26),
                                          ('3', 'Mills', 'Jeff', 80),
                                          ('4', 'Hood', 'Robert', 65),
                                          ('5', 'Banks', 'Mike', 93),
                                          ('98', 'Berg', 'Tim', 28),
                                          ('99', 'Page', 'Allan', 16)],
                                         ['id', 'name', 'firstname', 'age'])
        edges = spark.createDataFrame([('1', '2', 'friend'),
                                       ('2', '1', 'friend'),
                                       ('3', '1', 'friend'),
                                       ('1', '3', 'friend'),
                                       ('2', '3', 'follows'),
                                       ('3', '4', 'friend'),
                                       ('4', '3', 'friend'),
                                       ('5', '3', 'friend'),
                                       ('3', '5', 'friend'),
                                       ('4', '5', 'follows'),
                                       ('98', '99', 'friend'),
                                       ('99', '98', 'friend')],
                                      ['src', 'dst', 'type'])
        g = GraphFrame(vertices, edges)
        g.connectedComponents().show()
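Note that connectedComponents() needs a Spark checkpoint directory to be configured (the test above assumes the session already has one). A minimal setup sketch reusing the names above, with the path being only an example:

spark.sparkContext.setCheckpointDir("/tmp/graphframes-checkpoints")  # example location
g.connectedComponents().show()
# alternatively, the GraphX-based variant does not require a checkpoint directory:
# g.connectedComponents(algorithm="graphx").show()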
Example no. 3
    def run(sc, TH, infile, outfile):
        rdd = sc.textFile(infile)
        first_line = rdd.first()
        data = rdd.filter(lambda line: line != first_line)

        uid = data.map(lambda line: (line.split(',')[0], line.split(',')[1])) \
            .groupByKey() \
            .mapValues(lambda x: sorted(list(x))) \
            .collectAsMap()
         
        cand_pairs = list(itertools.combinations(list(uid.keys()), 2))
        
        edge, vertex = list(), set()
        for pair in cand_pairs:
            if len(set(uid[pair[0]]).intersection(set(uid[pair[1]]))) >= TH:
                edge.append((pair[0], pair[1]))
                edge.append((pair[1], pair[0]))
                vertex.add(pair[0])
                vertex.add(pair[1])
        graph = GraphFrame(sc.parallelize(list(vertex)).map(lambda v: (v,)).toDF(['id']),
                           sc.parallelize(edge).toDF(["src", "dst"]))
        communities = graph.labelPropagation(maxIter=5)
        communities = communities.rdd.coalesce(1) \
            .map(lambda idx_label: (idx_label[1], idx_label[0])) \
            .groupByKey() \
            .map(lambda label_idxes: sorted(list(label_idxes[1]))) \
            .sortBy(lambda idxes: (len(idxes), idxes)) \
            .collect()
        Task1.toFile(outfile, communities)
Example no. 4
def algorithm2(i, g):
    while True:
        aggregates = g.aggregateMessages(F.collect_set(AM.msg).alias("agg"),
                                         sendToDst=F.when(
                                             AM.src['value'] == -1,
                                             AM.src["id"]))

        new_vertices = (
            g.vertices.join(aggregates, on="id", how="left_outer")
            .withColumn("newValue", getid_maximum_udf2("id", "agg", lit(i), "value"))
            .drop("agg")
            .withColumn("max_by_rows", greatest("value", "newValue"))
            .drop("value", "newValue")
            .withColumnRenamed("max_by_rows", "value"))
        cached_new_vertices = AM.getCachedDataFrame(new_vertices)
        g = GraphFrame(cached_new_vertices, g.edges)
        i += 1
        g.vertices.show()
        if g.filterVertices("value == -1").dropIsolatedVertices().edges.count() == 0:
            final_df = g.vertices
            final_df = final_df.withColumn(
                "value",
                F.when(final_df["value"] == -1,
                       i).otherwise(final_df["value"]))
            break
    return final_df
Example no. 6
def sample_induced_subgraph(
    graph: GraphFrame,
    seed: int,
    k_hops: int = 2,
    pr_alpha: float = 0.85,
    pr_tol: float = 0.001,
) -> GraphFrame:
    assert k_hops <= 3

    # build motif for finding a k-hop neighborhood localized to a node
    symbols = string.ascii_letters
    motif_edges = [(symbols[i], symbols[i + 1]) for i in range(k_hops)]
    paths = graph.find(";".join(
        [f"({e1})-[]->({e2})" for e1, e2 in motif_edges]))
    # the center of odd paths have indices greater than the midpoint
    # this should increase the seed article's pagerank score on 1-hop networks
    centered_paths = paths.where(f"{symbols[k_hops-k_hops//2]}.id = {seed}")

    vertices = centered_paths.selectExpr(f"{symbols[0]} as v")
    for i in range(1, k_hops + 1):
        vertices = vertices.union(
            centered_paths.selectExpr(f"{symbols[i]} as v"))
    vertices = vertices.select("v.*").distinct()
    vertices.cache()

    edges = (graph.edges.join(
        vertices, on=graph.edges["src"] == vertices["id"],
        how="right").select("src", "dst").join(
            vertices,
            on=graph.edges["dst"] == vertices["id"], how="right").select(
                "src", "dst").where("src is not null AND dst is not null"))
    return GraphFrame(vertices, edges).pageRank(pr_alpha, tol=pr_tol)
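For reference, with k_hops=2 the motif string assembled above expands to two chained edges, and the seed filter lands on the middle node. Written out literally, reusing the graph and seed variables of the function:

paths = graph.find("(a)-[]->(b);(b)-[]->(c)")  # what ";".join(...) produces for k_hops=2
centered_paths = paths.where(f"b.id = {seed}")  # symbols[k_hops - k_hops // 2] == "b"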
Example no. 7
def main():
    # create Spark session
    spark = SparkSession.builder.appName("keepindoors graphx connectedComponents()").getOrCreate()

    # get a mongo client
    cli = mongo.__get__()

    # v, ["id","url","title","datetime"]
    localVertices=[]
    cursor = mongo.getCollection(cli,"keepindoors","docs").find()
    for r in cursor:
        # del "_id" key which will throws error when createDataFrame
        r["id"] = r["docno"]
        localVertices.append((r["docno"],r["url"],r["title"],str(r["_id"].generation_time + timedelta(hours=8))))

    # edges
    cursor = mongo.getCollection(cli, "keepindoors", "distances").find()
    localEdges = []
    for r in cursor:
        localEdges.append((r["docno1"], r["docno2"], r["distance"]))

    v = spark.createDataFrame(localVertices, ["id", "url", "title", "datetime"])
    e = spark.createDataFrame(localEdges, ["src", "dst", "distance"])
    g = GraphFrame(v, e)
    # get sparkContext from sparkSession
    spark.sparkContext.setCheckpointDir("/tmp/spark/checkpoint")
    result = g.connectedComponents()

    # order by component,datetime
    result = result.orderBy(["component", "datetime"], ascending=[1, 0]).collect()

    # create component dict
    component_dict = {}
    for row in result:
        record = row.asDict()
        if record["component"] not in component_dict.keys():
            component_dict[record["component"]] = []
        component_dict[record["component"]].append(record)

    # delete the mongo collection "components"
    mongo.deleteAll(cli, "keepindoors", "components")

    # save component_dict into mongo
    index = 1
    for key,item in component_dict.items():
        links = []
        titles = []
        title = "empty title"
        update_time = "1970-01-01 00:00:00+00:00"
        for doc in item:
            titles.append(doc["title"])
            links.append(doc["url"])
            if doc["datetime"] > update_time:
                update_time = doc["datetime"]
                title = doc["title"]
        mongo.insertDoc({"no": index, "component": key, "title": title, "size": len(item),
                         "links": links, "titles": titles, "update_time": update_time,
                         "docs": item}, cli, "keepindoors", "components")
        index += 1
Example no. 8
def main(argv):
    filter_threshold = int(argv[1])
    input_file_path = argv[2]
    output_file_path = argv[3]
    # os.environ["PYSPARK_SUBMIT_ARGS"] = ("--packages graphframes:graphframes:0.6.0-spark2.4-s_2.11")

    # filter_threshold = 7
    # input_file_path = "/Users/zhijunliao/Marks/USC/INF-553/HW/INF553HW4/data/ub_sample_data.csv"
    # output_file_path = "/Users/zhijunliao/Marks/USC/INF-553/HW/INF553HW4/output/task1.txt"

    # 38648 records
    total_start = time.time()
    start = time.time()
    input_data = sc.textFile(input_file_path).\
        filter(lambda line: "user_id" not in line).\
        map(lambda line: tuple(line.split(","))).\
        groupByKey().\
        mapValues(set).\
        persist()  # 3374

    edges = input_data.\
        cartesian(input_data).\
        filter(lambda pair: pair[0][0] < pair[1][0]).\
        filter(lambda pair: len(pair[0][1].intersection(pair[1][1])) >= filter_threshold).\
        flatMap(lambda pair: [(pair[0][0], pair[1][0]), (pair[1][0], pair[0][0])]).\
        persist()  # 996 498
    edges_df = edges.map(lambda pair: Row(src=pair[0], dst=pair[1])).toDF()

    vertices = edges.flatMap(lambda _: _).distinct().persist()  # 222
    vertices_df = vertices.map(Row("id")).toDF()
    print("finish building edges and vertices:", time.time() - start)

    start = time.time()
    graph = GraphFrame(vertices_df, edges_df)
    result = graph.labelPropagation(maxIter=5)
    print("finish running LPA:", time.time() - start)
    # result.count()  # 222
    # result.show()

    result_rdd = result.rdd.\
        map(lambda pair: (pair['label'], pair['id'])).\
        groupByKey().\
        mapValues(lambda values: (sorted(list(values)), len(values))).\
        persist()

    result_collection = result_rdd.collect()
    result_collection.sort(key=lambda kv: (kv[1][1], kv[1][0][0]))
    with open(output_file_path, "w") as output_file:
        for community_id, (user_list, length) in result_collection:
            output_file.write(f"'{user_list[0]}'")
            for user in user_list[1:]:
                output_file.write(f", '{user}'")
            output_file.write("\n")
    print("total running time:", time.time() - total_start)
Example no. 9
    def lpa(self, graph, iter):
        print("Community Detection\t1\tInitializing Algorithm", flush=True)
        edges = graph.get_df()
        vertices = edges.select('src').union(
            edges.select('dst')).distinct().withColumnRenamed('src', 'id')

        print("Community Detection\t2\tExecuting Label Propagation Algorithm",
              flush=True)
        graph = GraphFrame(vertices, edges)
        result = graph.labelPropagation(maxIter=iter)
        return result.orderBy('label', ascending=True).withColumnRenamed(
            'label', 'Community')
Example no. 10
    def runBPwithGraphFrames(cls, g, numIter):
        """Run Belief Propagation using GraphFrame.

        This implementation of BP shows how to use GraphFrame's aggregateMessages method.
        """
        # choose colors for vertices for BP scheduling
        colorG = cls._colorGraph(g)
        numColors = colorG.vertices.select('color').distinct().count()

        # TODO: handle vertices without any edges

        # initialize vertex beliefs at 0.0
        gx = GraphFrame(colorG.vertices.withColumn('belief', sqlfunctions.lit(0.0)), colorG.edges)

        # run BP for numIter iterations
        for iter_ in range(numIter):
            # for each color, have that color receive messages from neighbors
            for color in range(numColors):
                # Send messages to vertices of the current color.
                # We may send to source or destination since edges are treated as undirected.
                msgForSrc = sqlfunctions.when(
                    AM.src['color'] == color,
                    AM.edge['b'] * AM.dst['belief'])
                msgForDst = sqlfunctions.when(
                    AM.dst['color'] == color,
                    AM.edge['b'] * AM.src['belief'])
                # numerically stable sigmoid
                logistic = sqlfunctions.udf(cls._sigmoid, returnType=types.DoubleType())
                aggregates = gx.aggregateMessages(
                    sqlfunctions.sum(AM.msg).alias("aggMess"),
                    sendToSrc=msgForSrc,
                    sendToDst=msgForDst)
                v = gx.vertices
                # receive messages and update beliefs for vertices of the current color
                newBeliefCol = sqlfunctions.when(
                    (v['color'] == color) & (aggregates['aggMess'].isNotNull()),
                    logistic(aggregates['aggMess'] + v['a'])
                ).otherwise(v['belief'])  # keep old beliefs for other colors
                newVertices = (v
                    .join(aggregates, on=(v['id'] == aggregates['id']), how='left_outer')
                    .drop(aggregates['id'])  # drop duplicate ID column (from outer join)
                    .withColumn('newBelief', newBeliefCol)  # compute new beliefs
                    .drop('aggMess')  # drop messages
                    .drop('belief')  # drop old beliefs
                    .withColumnRenamed('newBelief', 'belief')
                )
                # cache new vertices using workaround for SPARK-1334
                cachedNewVertices = AM.getCachedDataFrame(newVertices)
                gx = GraphFrame(cachedNewVertices, gx.edges)

        # Drop the "color" column from vertices
        return GraphFrame(gx.vertices.drop('color'), gx.edges)
Example no. 11
def main(argv):
    assert len(argv) == 3, \
        "Script takes 3 arguments: <filter_threshold> <input_file> <community_output_file>"

    filter_threshold, input_file, output_file = argv

    filter_threshold = int(filter_threshold)

    config = SparkConf().setMaster("local[*]") \
                        .setAppName("Task2") \
                        .set("spark.executor.memory", "4g") \
                        .set("spark.driver.memory", "4g")

    sc = SparkContext.getOrCreate(conf=config)
    spark = SparkSession(sc)
    sc.setLogLevel("ERROR")

    lines = sc.textFile(input_file)
    header = lines.first()

    rdd_dict = lines.filter(lambda x: x != header) \
               .map(lambda x: (x.split(',')[0], x.split(',')[1])) \
               .groupByKey().collectAsMap()

    user_pairs = list(combinations(rdd_dict.keys(), 2))

    edges_rdd = sc.parallelize(user_pairs) \
                       .map(lambda x: (x[0], x[1])) \
                       .filter(lambda x: get_intersection(rdd_dict[x[0]], rdd_dict[x[1]]) >= filter_threshold) \
                       .cache()

    nodes_df = edges_rdd.flatMap(lambda x: x).distinct().map(
        lambda x: (x, )).toDF(["id"])

    edges_df = edges_rdd.toDF(["src", "dst"])

    gf = GraphFrame(nodes_df, edges_df)

    communities_rdd = gf.labelPropagation(maxIter=5).rdd.coalesce(1)

    communities = communities_rdd.map(lambda x: (x[1], x[0])) \
                                 .groupByKey() \
                                 .map(lambda x: sorted(list(x[1]))) \
                                 .sortBy(lambda x: (len(x), x)) \
                                 .collect()

    with open(output_file, "w+") as file:
        for community in communities:
            value = str(community)[1:-1]
            file.write(value + "\n")
Example no. 12
def cull_graph(graph,
               by="degree",
               quantile=0.25,
               quantile_accuracy=0.1,
               max_iter=2):
    """Reduce a spark graph by getting rid of nodes that are not high value.
    This is done either by removing nodes that have number of degrees below a
    quantile or removing nodes with pagerank below a quantile."""

    wanted_nodes = None
    if by == "degree":
        nth_percentile = graph.degrees.approxQuantile("degree", [quantile],
                                                      quantile_accuracy)[0]
        wanted_nodes = graph.degrees\
            .filter(graph.degrees.degree > nth_percentile)\
            .select("id")
    elif by == "pagerank":
        results = graph.pageRank(resetProbability=0.15,
                                 maxIter=max_iter).vertices
        nth_percentile = results.approxQuantile("pagerank", [quantile],
                                                quantile_accuracy)[0]
        wanted_nodes = results\
            .filter(results.pagerank > nth_percentile)\
            .select("id")
    else:
        raise ValueError("by must be degree or pagerank!")

    filtered_nodes = graph.vertices.join(wanted_nodes, "id")
    filtered_edges = graph.edges.join(wanted_nodes,
                                      (graph.edges.src == wanted_nodes.id) |
                                      (graph.edges.dst == wanted_nodes.id))

    return GraphFrame(filtered_nodes, filtered_edges)
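Hypothetical usage of cull_graph(), assuming g is an existing GraphFrame:

trimmed_by_degree = cull_graph(g, by="degree", quantile=0.25)
trimmed_by_rank = cull_graph(g, by="pagerank", quantile=0.5, max_iter=5)
print(trimmed_by_degree.vertices.count(), trimmed_by_rank.vertices.count())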
Example no. 13
def algorithm1(i, g):
    while True:
        aggregates = g.aggregateMessages(F.collect_set(AM.msg).alias("agg"),
                                         sendToDst=F.when(
                                             AM.src['value'] == -1,
                                             AM.src["id"]))

        new_vertices = (
            g.vertices.join(aggregates, on="id", how="left_outer")
            .withColumn("newValue", getid_maximum_udf2("id", "agg", lit(i), "value"))
            .drop("agg")
            .withColumn("max_by_rows", greatest("value", "newValue"))
            .drop("value", "newValue")
            .withColumnRenamed("max_by_rows", "value"))
        cached_new_vertices = AM.getCachedDataFrame(new_vertices)
        g = GraphFrame(cached_new_vertices, g.edges)
        i += 1
        g.vertices.show()
        g.vertices.createOrReplaceTempView("temp_table")
        if spark.sql("SELECT * FROM temp_table WHERE value = -1").count() == 0:
            final_df = g.vertices
            break
    return final_df
Example no. 14
    def set_infected_nodes(self, list_or_dataframe):
        """
        Set nodes that is infected or is the source of influence using pyspark dataframe.
        :param dataframe: pyspark dataframe with column 'id' or python list
        :return:
        """

        infected_dataframe = list_or_dataframe

        # Convert list to dataframe
        if isinstance(list_or_dataframe, list):
            rdd_list = self.sc.parallelize(list_or_dataframe)
            row_rdd_list = rdd_list.map(lambda x: Row(x))
            field_list = [StructField("id", LongType(), True)]
            schema_list = StructType(field_list)
            infected_dataframe = self.sqlContext.createDataFrame(
                row_rdd_list, schema_list)

        # Create column for influence attribute containing 1's
        infected_dataframe = infected_dataframe.withColumn(
            self.attribute, lit(1.0))
        infected = infected_dataframe

        self.infected_nodes = infected_dataframe

        # Merge to original vertices of graph
        orig_vertices = self.graph.vertices.selectExpr("id as id")

        # Update graph
        orig_edges = self.graph.edges
        new_vertices = orig_vertices.join(infected, "id",
                                          "left_outer").na.fill(0)
        self.graph = GraphFrame(new_vertices, orig_edges)
Example no. 15
def create_graph(V, E):
    v = sqlContext.createDataFrame(V, ['id'])
    e = sqlContext.createDataFrame(E, ["src", "dst"])

    G = GraphFrame(v, e)

    return G
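A hypothetical call to create_graph(), assuming the same global sqlContext; vertices are one-element tuples and edges are (src, dst) pairs:

V = [("a",), ("b",), ("c",)]
E = [("a", "b"), ("b", "c"), ("c", "a")]
G = create_graph(V, E)
G.inDegrees.show()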
Example no. 16
def induce_graph(graph, relabel=True, partitions=[]):
    """Remove extra edges that do not belong to the graph"""
    # small dataframe for reindexing/relabeling

    window = Window.orderBy("id")
    if partitions:
        window = window.partitionBy(partitions)

    # ensure 0 index for mapping into a scipy.sparse matrix
    rank = graph.vertices.select(
        "id",
        F.row_number().over(window).alias("rank")).withColumn(
            "rank", F.expr("rank - 1"))

    vertices = graph.vertices.join(rank, on="id", how="left")

    edges = graph.edges.join(vertices.selectExpr("id as src",
                                                 "rank as rank_src"),
                             on="src",
                             how="inner").join(vertices.selectExpr(
                                 "id as dst", "rank as rank_dst"),
                                               on="dst",
                                               how="inner")

    if relabel:
        vertices = vertices.withColumn("relabeled_id", F.col("id")).withColumn(
            "id", F.col("rank"))
        edges = (edges.withColumn("relabeled_src", F.col("src")).withColumn(
            "relabeled_dst",
            F.col("dst")).withColumn("src", F.col("rank_src")).withColumn(
                "dst", F.col("rank_dst")))

    vertices = vertices.drop("rank")
    edges = edges.drop("rank_src", "rank_dst")
    return GraphFrame(vertices, edges)
Example no. 17
def readFile(filename, large, sqlContext=sqlContext):
    # lines = sc.textFile(filename)
    spark = SparkSession.builder.getOrCreate()

    if large:
        delim = " "
        # Strip off header row.
        # lines = lines.mapPartitionsWithIndex(lambda ind, it: iter(list(it)[1:]) if ind == 0 else it)
        header = True
    else:
        delim = ","
        header = False

    # Extract pairs from input file and convert to data frame matching
    # schema for graphframe edges.
    # YOUR CODE HERE
    edges = spark.read.csv(path=filename, sep=delim, schema='src INT, dst INT', header=header)

    # Extract all endpoints from input file (hence flatmap) and create
    # data frame containing all those node names in schema matching
    # graphframe vertices
    # YOUR CODE HERE

    vertices = edges.select(edges['src'].alias('id')).union(edges.select('dst')) \
        .distinct()

    # Create graphframes g from the vertices and edges.
    g = GraphFrame(vertices, edges)

    return g
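A hypothetical call to readFile() for the small, comma-delimited, headerless case (the file name is only an example):

g = readFile("edges_small.csv", large=False)
print(g.vertices.count(), g.edges.count())
g.inDegrees.orderBy("inDegree", ascending=False).show(5)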
Example no. 18
    def compute_degrees(self, graph):
        """
        Compute weighted and unweighted in and out degrees in graph. Re-declares graph to add the following
        attributes: inDegree, outDegree, w_inDegree, w_outDegree.
        :param graph: graphframe object, network
        :return:
        """

        g_vertices = graph.vertices
        g_edges = graph.edges

        # Get unweighted degrees
        indeg = graph.inDegrees
        outdeg = graph.outDegrees

        # Get weighted degrees
        w_indeg = (g_edges.groupby("dst").agg(
            sum("weight").alias("w_inDegree"))).selectExpr(
                "dst as id", "w_inDegree as w_inDegree")
        w_outdeg = (g_edges.groupby("src").agg(
            sum("weight").alias("w_outDegree"))).selectExpr(
                "src as id", "w_outDegree as w_outDegree")
        # Update vertices attribute
        new_v = g_vertices.join(indeg, "id", "left_outer")
        new_v = new_v.join(outdeg, "id", "left_outer")
        new_v = new_v.join(w_indeg, "id", "left_outer")
        new_v = new_v.join(w_outdeg, "id", "left_outer")
        new_v = new_v.na.fill(0)

        # Update graph
        self.graph = GraphFrame(new_v, g_edges)
Example no. 19
    def __init__(self, vertices_pq, edges_pq):
        # Create configuration for Spark Session
        conf = SparkConf() \
            .setAll([('spark.executor.memory', '16g'),
                     ('spark.executor.cores', '8'),
                     ('spark.cores.max', '8'),
                     ('spark.driver.memory','16g'),
                     ('spark.sql.execution.arrow.enabled', True),
                     ('spark.python.profile', True),
                     ('spark.python.profile.dump',
                      './spark_profile')])

        # Create a spark session
        self.SS = SparkSession.builder.config(conf=conf).getOrCreate()

        # Construct the vertices and edges DataFrame
        vertices_df = self.SS.read.parquet(vertices_pq)
        edges_df = self.SS.read.parquet(edges_pq)

        # Append a column that specifies whether the
        # node is a user or a repo in the table of vertices
        # 1 is for user, 2 is for repo
        nodeTypeUDF = F.udf(lambda i: 1 if i > 0 else 2, types.IntegerType())
        vertices_df = vertices_df.withColumn('nodeType',
                                             nodeTypeUDF(F.col('id')))
        # Create the graphframe object
        self.gf = GraphFrame(vertices_df, edges_df)
Example no. 20
class LPA():

    def __init__(self):
        self.spark = SparkSession \
            .builder \
            .appName('Example_2') \
            .getOrCreate()

    def graphx(self):
        self.df = self.spark.read.option("header", "true").csv('results_new/data-00000-of-00010.csv')
        # print(self.df.show(n=5))

        self.df = self.df.dropna()
        self.rdd = self.df.select("url","mention").rdd.flatMap(lambda x: x).distinct()
        # print(self.rdd.take(5))

        def hashnode(x):
            return hashlib.sha1(x.encode("UTF-8")).hexdigest()[:8]

        hashnode_udf = udf(hashnode)

        vertices = self.rdd.map(lambda x: (hashnode(x), x)).toDF(["id", "url"])

        vertices.show(5)

        edges = self.df.select("url", "mention") \
            .withColumn("src", hashnode_udf("url")) \
            .withColumn("dst", hashnode_udf("mention")) \
            .select("src", "dst")

        edges.show(5)

        self.graph = GraphFrame(vertices, edges)
        # print(self.graph)
        print('communities are ')
        self.communities = self.graph.labelPropagation(maxIter=2)

        print(self.communities.persist().show(10))
        print(self.communities.sort(desc("label")).show(50))
        self.communities.coalesce(1).write.format("com.databricks.spark.csv").option("header", "true").save("communities")
        print("There are " + str(self.communities.select('label').distinct().count()) + " communities in sample graph.")

        print(self.graph.inDegrees.join(vertices, on="id") \
            .orderBy("inDegree", ascending=False).show(10))

        print(self.graph.stronglyConnectedComponents(maxIter=2).select('url','component').show(20))
Example no. 21
def find_the_largest_subgraph(graph):
    result = graph.connectedComponents()
    componentCount = result.groupBy('component').count().orderBy(desc('count'))
    componentCount.show()
    largestComponent = componentCount.first()['component']
    vertices = result\
        .filter(result.component == largestComponent)\
        .select('id')
    return GraphFrame(vertices, graph.edges)
Example no. 22
    def GraphFrame(vertices: pyspark.sql.DataFrame,
                   edges: pyspark.sql.DataFrame) -> GraphFrame:
        """Simply calls the graphframes.GraphFrame

        Args:
            vertices (pyspark.sql.DataFrame):
            edges (pyspark.sql.DataFrame):
        """

        return GraphFrame(vertices, edges)
Example no. 23
    def graphFrame(self):
        """A GraphFrame representation of the constructed graph.

        :type: :class:`graphframes.GraphFrame`
        """

        return GraphFrame(
            self.verticesDataFrame, 
            self.edgesDataFrame
        )
Example no. 24
def get_graph(orig_df,
              predictions,
              orig_df_id_col="row_id",
              predictions_id_col="id"):
    predictions_nodes = orig_df.withColumnRenamed(orig_df_id_col, "id")
    predictions_edges = predictions.withColumnRenamed(
        f"{predictions_id_col}_l",
        "src").withColumnRenamed(f"{predictions_id_col}_r",
                                 "dst").filter(predictions.prediction == 1.0)
    return GraphFrame(predictions_nodes, predictions_edges)
Example no. 25
    def bipartition(graph: GraphFrame,
                    partitions: List[str] = [],
                    iteration: int = 0):

        if iteration == max_iter:
            return graph

        # relabel all partitions for scipy.sparse performance
        graph.cache()
        induced = induce_graph(graph, True, partitions)
        induced.cache()

        partition = f"sign_{iteration}"
        fiedler_value = f"fiedler_{iteration}"

        # The Fiedler vector is the eigenvector of the graph Laplacian associated
        # with its second-smallest eigenvalue, representing the algebraic connectivity of
        # the graph. This is used to implement spectral clustering, recursively,
        # by partitioning by the sign of the fiedler value. The partitions are
        # evenly distributed.
        fiedler = (edges_with_partitions(
            induced, partitions).groupBy(*partitions).apply(
                compute_fiedler_udf(fiedler_value, partitions)).withColumn(
                    partition,
                    F.expr(f"{fiedler_value} >= 0").astype("boolean")))
        vertices = undo_relabel(
            induced.vertices.join(fiedler, on=["id"] + partitions,
                                  how="left").repartitionByRange(*partitions +
                                                                 [partition]))

        if should_checkpoint and iteration % checkpoint_interval == 0:
            # truncate logical plan to prevent out-of-memory on query plan
            # string representation. The edges are reused every iteration
            # and should not need to be checkpointed.
            vertices.cache()
            parted_graph = GraphFrame(vertices.localCheckpoint(eager=True),
                                      graph.edges)
        else:
            parted_graph = GraphFrame(vertices, graph.edges)

        return bipartition(parted_graph, partitions + [partition],
                           iteration + 1)
Example no. 26
def sample_graph(pages,
                 pagelinks,
                 sampling_ratio,
                 relabel=True,
                 ensure_connected=True):
    vertices = pages.sample(sampling_ratio)
    edges = pagelinks.selectExpr("from as src", "dest as dst")
    graph = induce_graph(GraphFrame(vertices, edges), False)
    if ensure_connected:
        # only do this when sampling, on the full dataset takes 12 minutes. This may
        # be required in order to guarantee connectivity.
        components = graph.connectedComponents()
        largest_component = (components.groupBy("component").count().orderBy(
            F.desc("count")).limit(1).select("component"))
        vertices = components.join(largest_component,
                                   on="component",
                                   how="inner").drop("component")
        return induce_graph(GraphFrame(vertices, graph.edges), relabel=relabel)
    else:
        return graph
Example no. 27
def comments_to_graph(df, id_col, src_col, dest_col):
    '''
    Takes in a table of raw reddit comment data and returns a GraphFrame.
    '''
    vertices = df.withColumnRenamed(id_col, 'id')
    edges = vertices.select(src_col, dest_col).withColumnRenamed(
        src_col, 'src').withColumnRenamed(dest_col, 'dst')

    graph = GraphFrame(vertices, edges)
    return graph
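Toy usage of comments_to_graph(); the data and column names below are invented to match the call signature:

df = spark.createDataFrame(
    [("c1", "alice", "bob"), ("c2", "bob", "carol")],
    ["comment_id", "author", "parent_author"])
graph = comments_to_graph(df, "comment_id", "author", "parent_author")
graph.edges.show()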
Example no. 28
def do_triangles(conf: Conf, g: graphframes.GraphFrame, s: Stepper,
                 vertices_count: int) -> None:
    """
    Pattern for batch oriented iteration

    - we split the graph into batches using the filterVertices mechanism
    - we mark the total count of triangles and the partial count
    - in case of error:
       * we double the number of batches and the batch number
       * we restart the iteration at this point with smaller subgraph
    """

    full_set = vertices_count
    batches = conf.batches_for_triangles
    total_triangles = conf.count_at_restart
    batch = conf.batch_at_restart
    subset = int(full_set / batches)

    while batch < batches:
        st = Stepper()
        count = 0
        try:
            print("try batches=", batches, "subset=", subset, "at batch=",
                  batch)
            gc.collect()
            # g1 = g.filterVertices("int(cell/{}) == {}".format(subset, batch))
            g1 = g.filterVertices("int(id/{}) == {}".format(subset, batch))
            triangles = g1.triangleCount()
            st.show_step("partial triangleCount")
            gc.collect()
            count = triangles.agg({"cell": "sum"}).toPandas()["sum(cell)"][0]
            st.show_step("partial triangleCount sum")

            total_triangles += count

            print("batch=", batch, "vertices=", g1.vertices.count(), "edges=",
                  g1.edges.count(), "total=", total_triangles, "partial",
                  count)
        except Exception:
            print("memory error")
            batches *= 2
            batch *= 2
            subset = int(full_set / batches)
            print("restarting with batches=", batches, "subset=", subset,
                  "at batch=", batch)
            if subset >= 1:
                continue
            break

        batch += 1

    s.show_step("triangleCount")
    print("total=", total_triangles)
Example no. 29
def get_connected_components(vertices_path, edges_path, checkpoint_dir,
                             num_reads):
    # Read vertices and edges files
    df_vertices = build_vertices(vertices_path)
    df_edges = build_edges(edges_path, num_reads)

    # Build Graph
    spark = SparkSession.builder.appName("build_graph").getOrCreate()
    vertices = spark.createDataFrame(df_vertices)

    edges = spark.createDataFrame(df_edges)
    g = GraphFrame(vertices, edges)

    # Display Graph
    g.vertices.show()
    g.edges.show()

    # Connected Components
    # Get SparkContext using spark.sparkContext
    spark.sparkContext.setCheckpointDir(dirName=checkpoint_dir)
    result = g.connectedComponents()

    dictionary = {}

    sorted_result = result.select("id", "component").orderBy('component',
                                                             ascending=False)

    for row in sorted_result.collect():
        if row[1] in dictionary:
            dictionary[row[1]].append(row[0])
        else:
            dictionary[row[1]] = [row[0]]

    GL = list(dictionary.values())

    return GL, spark, g
Example no. 30
def main():
    print('Read data from BigQuery')
    vertices = load_data(bq_vertices_table)
    edges = load_data(bq_edges_table)
    graph = GraphFrame(vertices, edges)
    print('Find the largest connected subgraph')
    subgraph = find_the_largest_subgraph(graph)
    print('Calculate PageRank')
    results = subgraph.pageRank(resetProbability=0.15, maxIter=10)
    results.vertices\
        .select('id', 'pagerank')\
        .orderBy(desc('pagerank'))\
        .show(20, False)
    spark.stop()
Example no. 31
    def runBPwithGraphFrames(cls, g, numIter):
        """Run Belief Propagation using GraphFrame.

        This implementation of BP shows how to use GraphFrame's aggregateMessages method.
        """
        # choose colors for vertices for BP scheduling
        colorG = cls._colorGraph(g)
        numColors = colorG.vertices.select('color').distinct().count()

        # TODO: handle vertices without any edges

        # initialize vertex beliefs at 0.0
        gx = GraphFrame(
            colorG.vertices.withColumn('belief', sqlfunctions.lit(0.0)),
            colorG.edges)

        # run BP for numIter iterations
        for iter_ in range(numIter):
            # for each color, have that color receive messages from neighbors
            for color in range(numColors):
                # Send messages to vertices of the current color.
                # We may send to source or destination since edges are treated as undirected.
                msgForSrc = sqlfunctions.when(AM.src['color'] == color,
                                              AM.edge['b'] * AM.dst['belief'])
                msgForDst = sqlfunctions.when(AM.dst['color'] == color,
                                              AM.edge['b'] * AM.src['belief'])
                # numerically stable sigmoid
                logistic = sqlfunctions.udf(cls._sigmoid,
                                            returnType=types.DoubleType())
                aggregates = gx.aggregateMessages(
                    sqlfunctions.sum(AM.msg).alias("aggMess"),
                    sendToSrc=msgForSrc,
                    sendToDst=msgForDst)
                v = gx.vertices
                # receive messages and update beliefs for vertices of the current color
                newBeliefCol = sqlfunctions.when(
                    (v['color'] == color) &
                    (aggregates['aggMess'].isNotNull()),
                    logistic(aggregates['aggMess'] + v['a'])).otherwise(
                        v['belief'])  # keep old beliefs for other colors
                newVertices = (v
                    .join(aggregates, on=(v['id'] == aggregates['id']), how='left_outer')
                    .drop(aggregates['id'])  # drop duplicate ID column (from outer join)
                    .withColumn('newBelief', newBeliefCol)  # compute new beliefs
                    .drop('aggMess')  # drop messages
                    .drop('belief')  # drop old beliefs
                    .withColumnRenamed('newBelief', 'belief')
                )
                # cache new vertices using workaround for SPARK-1334
                cachedNewVertices = AM.getCachedDataFrame(newVertices)
                gx = GraphFrame(cachedNewVertices, gx.edges)

        # Drop the "color" column from vertices
        return GraphFrame(gx.vertices.drop('color'), gx.edges)
Example no. 32
    def _colorGraph(g):
        """Given a GraphFrame, choose colors for each vertex.

        No neighboring vertices will share the same color. The number of colors is minimized.

        This is written specifically for grid graphs. For non-grid graphs, it should be generalized,
        such as by using a greedy coloring scheme.

        :param g: Grid graph generated by :meth:`Graphs.gridIsingModel()`
        :return: Same graph, but with a new vertex column "color" of type Int (0 or 1)

        """

        colorUDF = sqlfunctions.udf(lambda i, j: (i + j) % 2, returnType=types.IntegerType())
        v = g.vertices.withColumn('color', colorUDF(sqlfunctions.col('i'), sqlfunctions.col('j')))
        return GraphFrame(v, g.edges)
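The grid graphs this coloring expects can be generated with the example helper bundled with GraphFrames (assuming graphframes.examples is available in the installed release, and a sqlContext as elsewhere in these examples):

from graphframes.examples import Graphs

g = Graphs(sqlContext).gridIsingModel(3)  # vertex columns: id, i, j, a; edge column: b
g.vertices.show()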
Example no. 33
def process_graphs (sc, in_dir, partitions):
    """
    Read graph vertices and edges from disk if already saved.
    Otherwise, read the chem2bio2rdf drugbank, pubchem, and other N3 RDF models
    and save the resulting vertices and edges to disk.
    
    Traverse the resulting graph - calculating page rank, using
    SQL to get names and PDB links of drugs.

    Args:
        sc (SparkContext): Access to the Spark compute fabric.
        in_dir (str): Path to Chemotext data storage for raw chem2bio2rdf N3 RDF models.
        partitions (int): Number of data partitions.
    """
    sqlContext = SQLContext (sc)

    n3_dirs = [ os.path.join (in_dir, d) for d in [ "drugbank", "pubchem" ] ]

    vertices_path_posix = os.path.join (in_dir, "vertices")
    edges_path_posix = os.path.join (in_dir, "edges")
    vertices_path = "file://{0}".format (vertices_path_posix)
    edges_path = "file://{0}".format (edges_path_posix)

    triples = None
    vertices = None
    edges = None
    g = None

    if os.path.exists (vertices_path_posix) and os.path.exists (edges_path_posix):

        print ("Loading existing vertices: {0}".format (vertices_path))
        start = time.time ()
        vertices = sqlContext.read.parquet (vertices_path).repartition(partitions).cache ()
        print ("Elapsed time for loading precomputed vertices: {0} seconds.".format (
            time.time () - start))

        print ("Loading existing edges: {0}".format (edges_path))
        start = time.time ()
        edges = sqlContext.read.parquet (edges_path).repartition(partitions).cache ()
        print ("Elapsed time for loading precomputed edges: {0} seconds.".format (
            time.time () - start))

    else:
        print ("Constructing vertices and edges from chem2bio2rdf data sources")

        files = [ os.path.join (n3_dir, n3_file) for n3_dir in n3_dirs for n3_file in os.listdir (n3_dir) ]
        triples = sc.parallelize (files, numSlices=partitions). \
                  flatMap (lambda n3_file : process_chunk (n3_file))

        vertices = sqlContext.createDataFrame (
            data = triples.flatMap (lambda d : [
                ( trim_uri (d.S), "attr0" ),
                ( trim_uri (d.O), "attr1" ) ]),
            schema=[ "id", "attr" ]).\
            cache () 
        edges = sqlContext.createDataFrame (
            data = triples.map (lambda d : (
                trim_uri (d.S),
                trim_uri (d.O),
                trim_uri (d.P) )),
            schema = [ "src", "dst", "relationship" ]). \
            cache ()
 
        print ("Triples: {0}".format (triples.count ()))

        if os.path.exists (vertices_path_posix):
            shutil.rmtree (vertices_path_posix)
        if os.path.exists (edges_path_posix):
            shutil.rmtree (edges_path_posix)
        vertices.write.parquet (vertices_path)
        edges.write.parquet (edges_path)

    if vertices is not None and edges is not None:
        start = time.time ()
        vertices.printSchema ()
        edges.printSchema ()
        print ("Elapsed time for print schema: {0} seconds.".format (
            time.time () - start))

        start = time.time ()
        print (" Total of {0} edges.".format (edges.count ()))
        print ("Elapsed time for count edges: {0}".format (time.time () - start))

        g = GraphFrame(vertices, edges)

        print ("Query: Get in-degree of each vertex.")
        start = time.time ()
        g.inDegrees.\
            sort ("inDegree", ascending=False).\
            show(n=3, truncate=False)
        print ("Elapsed time for computing in-degree: {0} seconds.".format (
            time.time () - start))

        start = time.time ()
        print ("Query: Number of protein database relationships: {0}".format (
            g.edges.\
            filter("relationship LIKE '%resource/PDB_ID%' ").\
            count ()))
        print ("Elapsed time for edge filter and count query: {0} seconds.".format (
            time.time () - start))
        
        edges.registerTempTable ("edges")

        sqlContext.sql ("""
           SELECT substring(src, length(src)-7, 6) as Drug,
                  dst as Name
           FROM edges
           WHERE relationship LIKE '%resource/Name%'
        """).show (n=3, truncate=False)

        start = time.time ()
        sqlContext.sql ("""
           SELECT substring(src, length(src)-7, 6) as Compound,
                  dst as SMILES
           FROM edges
           WHERE relationship LIKE '%open%_smiles%'
        """).show (n=3, truncate=False)
        print ("Elapsed time for SQL query: {0} seconds.".format (
            time.time () - start))

        start = time.time ()
        g.find ("()-[Drug2PDB]->()"). \
            filter ("Drug2PDB.relationship LIKE '%/PDB_ID' "). \
            show (n=3, truncate=False)
        print ("Elapsed time for graph motif query: {0} seconds.".format (
            time.time () - start))

    return g
Example no. 34
    #filename = '/home/user/leaflet-spark/atom_position_frame_1.npz.npy'
    
    coord_matrix = np.load(filename)
    coord_matrix_broadcast = sc.broadcast(coord_matrix)
    matrix_size = len(coord_matrix)
    dist_Matrix = sc.parallelize(coord_matrix)
    dist_Matrix = dist_Matrix.zipWithIndex()  #key-value pairs
    edge_list = dist_Matrix.flatMap(find_edges)
    
    edge_list = edge_list.filter(lambda x: x[0]!=-1) # filter the -1 values
    
    sqlContext = SQLContext(sc)
    
    Edges = Row('src','dst')
    edge = edge_list.map(lambda x: Edges(*x))
    e = sqlContext.createDataFrame(edge)
    # e.take(10)
    v = sqlContext.createDataFrame(sc.parallelize(range(matrix_size)).map(lambda i: Row(id=i + 1)))
    # v.show()
    
    # create the graph
    g = GraphFrame(v, e)
    #g.vertices.show()
    #g.edges.show()
    total_time = time() - start_time
    cc = g.connectedComponents()
    cc.select("id", "component").orderBy("component").show()
    print('Total time to create the Graphframe: %i sec' % total_time)
    print('Time to calculate the connected components: %i sec' % (time() - total_time - start_time))