Example no. 1
    def runBPwithGraphFrames(cls, g, numIter):
        """Run Belief Propagation using GraphFrame.

        This implementation of BP shows how to use GraphFrame's aggregateMessages method.
        """
        # choose colors for vertices for BP scheduling
        colorG = cls._colorGraph(g)
        numColors = colorG.vertices.select('color').distinct().count()

        # TODO: handle vertices without any edges

        # initialize vertex beliefs at 0.0
        gx = GraphFrame(
            colorG.vertices.withColumn('belief', sqlfunctions.lit(0.0)),
            colorG.edges)

        # run BP for numIter iterations
        for iter_ in range(numIter):
            # for each color, have that color receive messages from neighbors
            for color in range(numColors):
                # Send messages to vertices of the current color.
                # We may send to source or destination since edges are treated as undirected.
                msgForSrc = sqlfunctions.when(AM.src['color'] == color,
                                              AM.edge['b'] * AM.dst['belief'])
                msgForDst = sqlfunctions.when(AM.dst['color'] == color,
                                              AM.edge['b'] * AM.src['belief'])
                # numerically stable sigmoid
                logistic = sqlfunctions.udf(cls._sigmoid,
                                            returnType=types.DoubleType())
                aggregates = gx.aggregateMessages(
                    sqlfunctions.sum(AM.msg).alias("aggMess"),
                    sendToSrc=msgForSrc,
                    sendToDst=msgForDst)
                v = gx.vertices
                # receive messages and update beliefs for vertices of the current color
                newBeliefCol = sqlfunctions.when(
                    (v['color'] == color) &
                    (aggregates['aggMess'].isNotNull()),
                    logistic(aggregates['aggMess'] + v['a'])).otherwise(
                        v['belief'])  # keep old beliefs for other colors
                newVertices = (
                    v.join(aggregates,
                           on=(v['id'] == aggregates['id']),
                           how='left_outer').drop(
                               aggregates['id']
                           )  # drop duplicate ID column (from outer join)
                    .withColumn('newBelief',
                                newBeliefCol)  # compute new beliefs
                    .drop('aggMess')  # drop messages
                    .drop('belief')  # drop old beliefs
                    .withColumnRenamed('newBelief', 'belief'))
                # cache new vertices using workaround for SPARK-1334
                cachedNewVertices = AM.getCachedDataFrame(newVertices)
                gx = GraphFrame(cachedNewVertices, gx.edges)

        # Drop the "color" column from vertices
        return GraphFrame(gx.vertices.drop('color'), gx.edges)
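A minimal driver sketch for the routine above, assuming the enclosing class is the BeliefPropagation example class shipped with GraphFrames and that a sqlContext is available; everything beyond the GraphFrames API itself is an assumption:

# Sketch: run the BP routine above on a small grid Ising model (assumes the
# enclosing class is importable as BeliefPropagation and that a sqlContext
# exists; gridIsingModel ships with graphframes.examples).
from graphframes.examples import Graphs

g = Graphs(sqlContext).gridIsingModel(3)   # vertices carry 'a', edges carry 'b'
results = BeliefPropagation.runBPwithGraphFrames(g, 5)
results.vertices.select("id", "belief").show()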
Example no. 2
    def test_gf(self):

        vertices = spark.createDataFrame([('1', 'Carter', 'Derrick', 50),
                                          ('2', 'May', 'Derrick', 26),
                                          ('3', 'Mills', 'Jeff', 80),
                                          ('4', 'Hood', 'Robert', 65),
                                          ('5', 'Banks', 'Mike', 93),
                                          ('98', 'Berg', 'Tim', 28),
                                          ('99', 'Page', 'Allan', 16)],
                                         ['id', 'name', 'firstname', 'age'])
        edges = spark.createDataFrame([('1', '2', 'friend'),
                                       ('2', '1', 'friend'),
                                       ('3', '1', 'friend'),
                                       ('1', '3', 'friend'),
                                       ('2', '3', 'follows'),
                                       ('3', '4', 'friend'),
                                       ('4', '3', 'friend'),
                                       ('5', '3', 'friend'),
                                       ('3', '5', 'friend'),
                                       ('4', '5', 'follows'),
                                       ('98', '99', 'friend'),
                                       ('99', '98', 'friend')],
                                      ['src', 'dst', 'type'])
        g = GraphFrame(vertices, edges)
        g.connectedComponents().show()
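One practical note, shown as a sketch below: with its default algorithm, GraphFrames' connectedComponents() refuses to run unless a Spark checkpoint directory has been set, so the test above needs something like the following beforehand (the path is illustrative):

# connectedComponents() (default algorithm) requires a checkpoint directory
spark.sparkContext.setCheckpointDir("/tmp/spark-checkpoint")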
def algorithm1(i, g):
    while (True):
        aggregates = g.aggregateMessages(F.collect_set(AM.msg).alias("agg"),
                                         sendToDst=F.when(
                                             AM.src['value'] == -1,
                                             AM.src["id"]))

        new_vertices = g.vertices.join(
            aggregates, on="id", how="left_outer").withColumn(
                "newValue",
                getid_maximum_udf2("id", "agg", lit(i),
                                   "value")).drop("agg").withColumn(
                                       'max_by_rows',
                                       greatest('value', 'newValue')).drop(
                                           "value",
                                           "newValue").withColumnRenamed(
                                               "max_by_rows", "value")
        cached_new_vertices = AM.getCachedDataFrame(new_vertices)
        g = GraphFrame(cached_new_vertices, g.edges)
        i += 1
        g.vertices.show()
        g.vertices.createOrReplaceTempView("temp_table")
        if (spark.sql("SELECT * from temp_table where value = -1").count() == 0
            ):
            final_df = g.vertices
            break
    return final_df
def algorithm2(i, g):
    while (True):
        aggregates = g.aggregateMessages(F.collect_set(AM.msg).alias("agg"),
                                         sendToDst=F.when(
                                             AM.src['value'] == -1,
                                             AM.src["id"]))

        new_vertices = g.vertices.join(
            aggregates, on="id", how="left_outer").withColumn(
                "newValue",
                getid_maximum_udf2("id", "agg", lit(i),
                                   "value")).drop("agg").withColumn(
                                       'max_by_rows',
                                       greatest('value', 'newValue')).drop(
                                           "value",
                                           "newValue").withColumnRenamed(
                                               "max_by_rows", "value")
        cached_new_vertices = AM.getCachedDataFrame(new_vertices)
        g = GraphFrame(cached_new_vertices, g.edges)
        i += 1
        g.vertices.show()
        if (g.filterVertices(
                "value == -1").dropIsolatedVertices().edges.count() == 0):
            final_df = g.vertices
            final_df = final_df.withColumn(
                "value",
                F.when(final_df["value"] == -1,
                       i).otherwise(final_df["value"]))
            break
    return final_df
Example no. 5
def readFile(filename, large, sqlContext=sqlContext):
    # lines = sc.textFile(filename)
    spark = SparkSession.builder.getOrCreate()

    if large:
        delim = " "
        # Strip off header row.
        # lines = lines.mapPartitionsWithIndex(lambda ind, it: iter(list(it)[1:]) if ind == 0 else it)
        header = True
    else:
        delim = ","
        header = False

    # Extract pairs from input file and convert to data frame matching
    # schema for graphframe edges.
    # YOUR CODE HERE
    edges = spark.read.csv(path=filename, sep=delim, schema='src INT, dst INT', header=header)

    # Extract all endpoints from input file (hence flatmap) and create
    # data frame containing all those node names in schema matching
    # graphframe vertices
    # YOUR CODE HERE

    vertices = edges.select(edges['src'].alias('id')).union(edges.select('dst')) \
        .distinct()

    # Create graphframes g from the vertices and edges.
    g = GraphFrame(vertices, edges)

    return g
Example no. 6
    def run(sc, TH, infile, outfile):
        rdd = sc.textFile(infile)
        firstline = rdd.first()
        data = rdd.filter(lambda line: line != firstline)

        uid = data.map(lambda line: (line.split(',')[0], line.split(',')[1])) \
            .groupByKey() \
            .mapValues(lambda x: sorted(list(x))) \
            .collectAsMap()
         
        cand_pairs = list(itertools.combinations(list(uid.keys()), 2))
        
        edge, vertex = list(), set()
        for pair in cand_pairs:
            if len(set(uid[pair[0]]).intersection(set(uid[pair[1]]))) >= TH:
                edge.append(tuple((pair[0], pair[1])))
                edge.append(tuple((pair[1], pair[0])))
                vertex.add(pair[0])
                vertex.add(pair[1])
        graph = GraphFrame(sc.parallelize(list(vertex)).map(lambda uid: (uid,)).toDF(['id']),
                                 sc.parallelize(edge).toDF(["src", "dst"]))
        communities = graph.labelPropagation(maxIter=5)
        communities = communities.rdd.coalesce(1) \
            .map(lambda idx_label: (idx_label[1], idx_label[0])) \
            .groupByKey() \
            .map(lambda label_idxes: sorted(list(label_idxes[1]))) \
            .sortBy(lambda idxes: (len(idxes), idxes)) \
            .collect()
        Task1.toFile(outfile, communities)
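Task1.toFile is referenced above but not shown; a hypothetical sketch of such a writer, assuming communities is a list of sorted id lists and the expected output format is one comma-separated line of quoted ids per community (as in the later examples):

# Hypothetical sketch of a Task1.toFile-style writer (not part of the original
# listing): one community per line, ids quoted and comma-separated.
def to_file(outfile, communities):
    with open(outfile, "w") as f:
        for community in communities:
            f.write(", ".join("'{}'".format(uid) for uid in community) + "\n")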
Example no. 7
    def graphframes_pagerank(self, sc, sqlc):
        """ GraphFrame's PageRank implementation """

        from graphframes import GraphFrame  # pylint: disable=import-error

        edge_df = sqlc.read.load(os.path.join(self.args.webgraph, "edges"))
        vertex_df = sqlc.read.load(os.path.join(self.args.webgraph,
                                                "vertices"))

        graph = GraphFrame(vertex_df, edge_df)

        withPageRank = graph.pageRank(maxIter=self.args.maxiter)

        final_df = sql(
            sqlc, """
            SELECT CONCAT(ranks.domain, ' ', ranks.pagerank) r
            FROM ranks
            ORDER BY ranks.pagerank DESC
        """, {"ranks": withPageRank.vertices})

        if self.args.dump:

            final_df.coalesce(1).write.text(
                self.args.dump,
                compression="gzip" if self.args.gzip else "none")

        else:
            print(final_df.rdd.collect())
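A small sketch of the two pageRank call styles GraphFrames accepts, assuming a GraphFrame named graph; the example above uses the fixed-iteration form, while the tolerance form runs until convergence (parameter values are illustrative):

# Fixed number of iterations (as in the example above)
ranked = graph.pageRank(resetProbability=0.15, maxIter=10)
# Alternative: iterate until ranks change by less than the tolerance
ranked = graph.pageRank(resetProbability=0.15, tol=0.01)
ranked.vertices.select("id", "pagerank").show()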
Example no. 8
    def compute_degrees(self, graph):
        """
        Compute weighted and unweighted in- and out-degrees of the graph, and rebuild self.graph with the
        following vertex attributes added: inDegree, outDegree, w_inDegree, w_outDegree.
        :param graph: graphframe object, network
        :return:
        """

        g_vertices = graph.vertices
        g_edges = graph.edges

        # Get unweighted degrees
        indeg = graph.inDegrees
        outdeg = graph.outDegrees

        # Get weighted degrees
        w_indeg = (g_edges.groupby("dst").agg(
            sum("weight").alias("w_inDegree"))).selectExpr(
                "dst as id", "w_inDegree as w_inDegree")
        w_outdeg = (g_edges.groupby("src").agg(
            sum("weight").alias("w_outDegree"))).selectExpr(
                "src as id", "w_outDegree as w_outDegree")
        # Update vertices attribute
        new_v = g_vertices.join(indeg, "id", "left_outer")
        new_v = new_v.join(outdeg, "id", "left_outer")
        new_v = new_v.join(w_indeg, "id", "left_outer")
        new_v = new_v.join(w_outdeg, "id", "left_outer")
        new_v = new_v.na.fill(0)

        # Update graph
        self.graph = GraphFrame(new_v, g_edges)
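The aggregation above assumes `sum` is pyspark.sql.functions.sum (imported elsewhere in the source file); the Python builtin would fail here. A self-contained sketch of the same weighted-degree computation, reusing the g_edges DataFrame:

from pyspark.sql import functions as F

# Weighted in-degree: total incoming edge weight per vertex
w_indeg = (g_edges.groupBy("dst")
           .agg(F.sum("weight").alias("w_inDegree"))
           .withColumnRenamed("dst", "id"))
# Weighted out-degree: total outgoing edge weight per vertex
w_outdeg = (g_edges.groupBy("src")
            .agg(F.sum("weight").alias("w_outDegree"))
            .withColumnRenamed("src", "id"))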
Example no. 9
def cull_graph(graph,
               by="degree",
               quantile=0.25,
               quantile_accuracy=0.1,
               max_iter=2):
    """Reduce a spark graph by getting rid of nodes that are not high value.
    This is done either by removing nodes that have number of degrees below a
    quantile or removing nodes with pagerank below a quantile."""

    wanted_nodes = None
    if by == "degree":
        nth_percentile = graph.degrees.approxQuantile("degree", [quantile],
                                                      quantile_accuracy)[0]
        wanted_nodes = graph.degrees\
            .filter(graph.degrees.degree > nth_percentile)\
            .select("id")
    elif by == "pagerank":
        results = graph.pageRank(resetProbability=0.15,
                                 maxIter=max_iter).vertices
        nth_percentile = results.approxQuantile("pagerank", [quantile],
                                                quantile_accuracy)[0]
        wanted_nodes = results\
            .filter(results.pagerank > nth_percentile)\
            .select("id")
    else:
        raise ValueError("by must be degree or pagerank!")

    filtered_nodes = graph.vertices.join(wanted_nodes, "id")
    filtered_edges = graph.edges.join(wanted_nodes,
                                      (graph.edges.src == wanted_nodes.id) |
                                      (graph.edges.dst == wanted_nodes.id))

    return GraphFrame(filtered_nodes, filtered_edges)
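Note that the OR-condition join above keeps any edge with at least one surviving endpoint and can duplicate edges whose endpoints both survive; a variant sketch that keeps only edges with both endpoints retained, assuming the same wanted_nodes DataFrame of ids (not the original author's choice):

# Keep only edges whose source and destination both survive the cut
filtered_edges = (graph.edges
                  .join(wanted_nodes.withColumnRenamed("id", "src"), on="src")
                  .join(wanted_nodes.withColumnRenamed("id", "dst"), on="dst"))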
Example no. 10
def create_graph(V, E):
    v = sqlContext.createDataFrame(V, ['id'])
    e = sqlContext.createDataFrame(E, ["src", "dst"])

    G = GraphFrame(v, e)

    return G
def induce_graph(graph, relabel=True, partitions=[]):
    """Remove extra edges that do not belong to the graph"""
    # small dataframe for reindexing/relabeling

    window = Window.orderBy("id")
    if partitions:
        window = window.partitionBy(partitions)

    # ensure 0 index for mapping into a scipy.sparse matrix
    rank = graph.vertices.select(
        "id",
        F.row_number().over(window).alias("rank")).withColumn(
            "rank", F.expr("rank - 1"))

    vertices = graph.vertices.join(rank, on="id", how="left")

    edges = graph.edges.join(vertices.selectExpr("id as src",
                                                 "rank as rank_src"),
                             on="src",
                             how="inner").join(vertices.selectExpr(
                                 "id as dst", "rank as rank_dst"),
                                               on="dst",
                                               how="inner")

    if relabel:
        vertices = vertices.withColumn("relabeled_id", F.col("id")).withColumn(
            "id", F.col("rank"))
        edges = (edges.withColumn("relabeled_src", F.col("src")).withColumn(
            "relabeled_dst",
            F.col("dst")).withColumn("src", F.col("rank_src")).withColumn(
                "dst", F.col("rank_dst")))

    vertices = vertices.drop("rank")
    edges = edges.drop("rank_src", "rank_dst")
    return GraphFrame(vertices, edges)
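The 0-indexed rank computed by induce_graph is intended for building a scipy.sparse matrix; a short downstream sketch, assuming `graph` is any GraphFrame and relabeling is left on:

import numpy as np
from scipy.sparse import coo_matrix

induced = induce_graph(graph)                      # ids relabeled to 0..n-1
n = induced.vertices.count()
edges_pd = induced.edges.select("src", "dst").toPandas()
adjacency = coo_matrix((np.ones(len(edges_pd)),
                        (edges_pd["src"], edges_pd["dst"])),
                       shape=(n, n))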
Example no. 12
def sample_induced_subgraph(
    graph: GraphFrame,
    seed: int,
    k_hops: int = 2,
    pr_alpha: float = 0.85,
    pr_tol: float = 0.001,
) -> GraphFrame:
    assert k_hops <= 3

    # build motif for finding a k-hop neighborhood localized to a node
    symbols = string.ascii_letters
    motif_edges = [(symbols[i], symbols[i + 1]) for i in range(k_hops)]
    paths = graph.find(";".join(
        [f"({e1})-[]->({e2})" for e1, e2 in motif_edges]))
    # the center of odd-length paths has an index greater than the midpoint
    # this should increase the seed article's pagerank score on 1-hop networks
    centered_paths = paths.where(f"{symbols[k_hops-k_hops//2]}.id = {seed}")

    vertices = centered_paths.selectExpr(f"{symbols[0]} as v")
    for i in range(1, k_hops + 1):
        vertices = vertices.union(
            centered_paths.selectExpr(f"{symbols[i]} as v"))
    vertices = vertices.select("v.*").distinct()
    vertices.cache()

    edges = (graph.edges.join(
        vertices, on=graph.edges["src"] == vertices["id"],
        how="right").select("src", "dst").join(
            vertices,
            on=graph.edges["dst"] == vertices["id"], how="right").select(
                "src", "dst").where("src is not null AND dst is not null"))
    return GraphFrame(vertices, edges).pageRank(pr_alpha, tol=pr_tol)
Example no. 13
    def __init__(self, vertices_pq, edges_pq):
        # Create configuration for Spark Session
        conf = SparkConf() \
            .setAll([('spark.executor.memory', '16g'),
                     ('spark.executor.cores', '8'),
                     ('spark.cores.max', '8'),
                     ('spark.driver.memory','16g'),
                     ('spark.sql.execution.arrow.enabled', True),
                     ('spark.python.profile', True),
                     ('spark.python.profile.dump',
                      './spark_profile')])

        # Create a spark session
        self.SS = SparkSession.builder.config(conf=conf).getOrCreate()

        # Construct the vertices and edges DataFrame
        vertices_df = self.SS.read.parquet(vertices_pq)
        edges_df = self.SS.read.parquet(edges_pq)

        # Append a column that specifies whether the
        # node is a user or a repo in the table of vertices
        # 1 is for user, 2 is for repo
        nodeTypeUDF = F.udf(lambda i: 1 if i > 0 else 2, types.IntegerType())
        vertices_df = vertices_df.withColumn('nodeType',
                                             nodeTypeUDF(F.col('id')))
        # Create the graphframe object
        self.gf = GraphFrame(vertices_df, edges_df)
Example no. 14
    def set_infected_nodes(self, list_or_dataframe):
        """
        Set the nodes that are infected or are the source of influence.
        :param list_or_dataframe: pyspark dataframe with a column 'id', or a python list of ids
        :return:
        """

        infected_dataframe = list_or_dataframe

        # Convert list to dataframe
        if type(list_or_dataframe) == list:
            rdd_list = self.sc.parallelize(list_or_dataframe)
            row_rdd_list = rdd_list.map(lambda x: Row(x))
            field_list = [StructField("id", LongType(), True)]
            schema_list = StructType(field_list)
            infected_dataframe = self.sqlContext.createDataFrame(
                row_rdd_list, schema_list)

        # Create column for influence attribute containing 1's
        infected_dataframe = infected_dataframe.withColumn(
            self.attribute, lit(1.0))
        infected = infected_dataframe

        self.infected_nodes = infected_dataframe

        # Merge to original vertices of graph
        orig_vertices = self.graph.vertices.selectExpr("id as id")

        # Update graph
        orig_edges = self.graph.edges
        new_vertices = orig_vertices.join(infected, "id",
                                          "left_outer").na.fill(0)
        self.graph = GraphFrame(new_vertices, orig_edges)
Example no. 15
def find_the_largest_subgraph(graph):
    result = graph.connectedComponents()
    componentCount = result.groupBy('component').count().orderBy(desc('count'))
    componentCount.show()
    largestComponent = componentCount.first()['component']
    vertices = result\
        .filter(result.component == largestComponent)\
        .select('id')
    return GraphFrame(vertices, graph.edges)
Example no. 16
def main():
    # create spark session
    spark = SparkSession.builder.appName("keepindoors graphx connectedComponents()").getOrCreate()

    # get a mongo client
    cli = mongo.__get__()

    # v, ["id","url","title","datetime"]
    localVertices=[]
    cursor = mongo.getCollection(cli,"keepindoors","docs").find()
    for r in cursor:
        # del "_id" key which will throws error when createDataFrame
        r["id"] = r["docno"]
        localVertices.append((r["docno"],r["url"],r["title"],str(r["_id"].generation_time + timedelta(hours=8))))

    # e
    cursor = mongo.getCollection(cli, "keepindoors", "distances").find()
    localEdges = []
    for r in cursor:
        localEdges.append((r["docno1"],r["docno2"],r["distance"]))

    v = spark.createDataFrame(localVertices,["id","url","title","datetime"])
    e = spark.createDataFrame(localEdges, ["src", "dst","distance"])
    g = GraphFrame(v,e)
    # get sparkContext from sparkSession
    spark.sparkContext.setCheckpointDir("/tmp/spark/checkpoint")
    result = g.connectedComponents()

    # order by component,datetime
    result = result.orderBy(["component", "datetime"], ascending=[1, 0]).collect()

    # create component dict
    component_dict = {}
    for row in result:
        record = row.asDict()
        if record["component"] not in component_dict.keys():
            component_dict[record["component"]] = []
        component_dict[record["component"]].append(record)

    # delete mongo collection "components"
    mongo.deleteAll(cli,"keepindoors","components")

    # save component_dict into mongo
    index = 1
    for key,item in component_dict.items():
        links = []
        titles = []
        title = "empty title"
        update_time = "1970-01-01 00:00:00+00:00"
        for doc in item:
            titles.append(doc["title"])
            links.append(doc["url"])
            if doc["datetime"] > update_time:
                update_time = doc["datetime"]
                title = doc["title"]
        mongo.insertDoc({"no":index,"component":key,"title":title,"size":len(item),"links":links,"titles":titles,"update_time":update_time,"docs":item},cli,"keepindoors","components")
        index += 1
Example no. 17
    def graphFrame(self):
        """A GraphFrame representation of the constructed graph.

        :type: :class:`graphframes.GraphFrame`
        """

        return GraphFrame(
            self.verticesDataFrame, 
            self.edgesDataFrame
        )
Example no. 18
def get_graph(orig_df,
              predictions,
              orig_df_id_col="row_id",
              predictions_id_col="id"):
    predictions_nodes = orig_df.withColumnRenamed(orig_df_id_col, "id")
    predictions_edges = predictions.withColumnRenamed(
        f"{predictions_id_col}_l",
        "src").withColumnRenamed(f"{predictions_id_col}_r",
                                 "dst").filter(predictions.prediction == 1.0)
    return GraphFrame(predictions_nodes, predictions_edges)
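A hypothetical follow-up sketch (not in the original): once the predicted matches form the edges, connected components groups matched records into entity clusters, assuming orig_df, predictions, and an active spark session as above:

g = get_graph(orig_df, predictions)
spark.sparkContext.setCheckpointDir("/tmp/spark-checkpoint")  # needed by connectedComponents()
clusters = g.connectedComponents()      # adds a "component" column per record
clusters.groupBy("component").count().show()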
Example no. 19
    def GraphFrame(vertices: pyspark.sql.DataFrame,
                   edges: pyspark.sql.DataFrame) -> GraphFrame:
        """Simply calls the graphframes.GraphFrame

        Args:
            vertices (pyspark.sql.DataFrame):
            edges (pyspark.sql.DataFrame):
        """

        return GraphFrame(vertices, edges)
    def bipartition(graph: GraphFrame,
                    partitions: List[str] = [],
                    iteration: int = 0):

        if iteration == max_iter:
            return graph

        # relabel all partitions for scipy.sparse performance
        graph.cache()
        induced = induce_graph(graph, True, partitions)
        induced.cache()

        partition = f"sign_{iteration}"
        fiedler_value = f"fiedler_{iteration}"

        # The Fiedler vector is the eigenvector associated with the second-
        # smallest eigenvalue of the graph Laplacian; that eigenvalue is the
        # algebraic connectivity of the graph. This is used to implement
        # spectral clustering recursively, by partitioning on the sign of the
        # Fiedler value. The partitions are evenly distributed.
        fiedler = (edges_with_partitions(
            induced, partitions).groupBy(*partitions).apply(
                compute_fiedler_udf(fiedler_value, partitions)).withColumn(
                    partition,
                    F.expr(f"{fiedler_value} >= 0").astype("boolean")))
        vertices = undo_relabel(
            induced.vertices.join(fiedler, on=["id"] + partitions,
                                  how="left").repartitionByRange(*partitions +
                                                                 [partition]))

        if should_checkpoint and iteration % checkpoint_interval == 0:
            # truncate logical plan to prevent out-of-memory on query plan
            # string representation. The edges are reused every iteration
            # and should not need to be checkpointed.
            vertices.cache()
            parted_graph = GraphFrame(vertices.localCheckpoint(eager=True),
                                      graph.edges)
        else:
            parted_graph = GraphFrame(vertices, graph.edges)

        return bipartition(parted_graph, partitions + [partition],
                           iteration + 1)
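compute_fiedler_udf is referenced above but not shown; a minimal standalone sketch of the computation it presumably wraps (hypothetical, dense numpy on a small symmetrized graph with 0-indexed vertices):

import numpy as np
from scipy.sparse import coo_matrix
from scipy.sparse.csgraph import laplacian

def compute_fiedler(src, dst, n):
    """Return the Fiedler vector of the undirected graph given by 0-indexed
    src/dst arrays on n vertices (hypothetical helper, not the original UDF)."""
    adj = coo_matrix((np.ones(len(src)), (src, dst)), shape=(n, n))
    adj = ((adj + adj.T) > 0).astype(float)          # symmetrize
    lap = laplacian(adj.tocsr())
    vals, vecs = np.linalg.eigh(lap.toarray())       # eigenvalues ascending
    return vecs[:, 1]                                # second-smallest eigenpair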
Example no. 21
def main(argv):
    filter_threshold = int(argv[1])
    input_file_path = argv[2]
    output_file_path = argv[3]
    # os.environ["PYSPARK_SUBMIT_ARGS"] = ("--packages graphframes:graphframes:0.6.0-spark2.4-s_2.11")

    # filter_threshold = 7
    # input_file_path = "/Users/zhijunliao/Marks/USC/INF-553/HW/INF553HW4/data/ub_sample_data.csv"
    # output_file_path = "/Users/zhijunliao/Marks/USC/INF-553/HW/INF553HW4/output/task1.txt"

    # 38648 records
    total_start = time.time()
    start = time.time()
    input_data = sc.textFile(input_file_path).\
        filter(lambda line: "user_id" not in line).\
        map(lambda line: tuple(line.split(","))).\
        groupByKey().\
        mapValues(set).\
        persist()  # 3374

    edges = input_data.\
        cartesian(input_data).\
        filter(lambda pair: pair[0][0] < pair[1][0]).\
        filter(lambda pair: len(pair[0][1].intersection(pair[1][1])) >= filter_threshold).\
        flatMap(lambda pair: [(pair[0][0], pair[1][0]), (pair[1][0], pair[0][0])]).\
        persist()  # 996 498
    edges_df = edges.map(lambda pair: Row(src=pair[0], dst=pair[1])).toDF()

    vertices = edges.flatMap(lambda _: _).distinct().persist()  # 222
    vertices_df = vertices.map(Row("id")).toDF()
    print("finish building edges and vertices:", time.time() - start)

    start = time.time()
    graph = GraphFrame(vertices_df, edges_df)
    result = graph.labelPropagation(maxIter=5)
    print("finish running LPA:", time.time() - start)
    # result.count()  # 222
    # result.show()

    result_rdd = result.rdd.\
        map(lambda pair: (pair['label'], pair['id'])).\
        groupByKey().\
        mapValues(lambda values: (sorted(list(values)), len(values))).\
        persist()

    result_collection = result_rdd.collect()
    result_collection.sort(key=lambda kv: (kv[1][1], kv[1][0][0]))
    with open(output_file_path, "w") as output_file:
        for community_id, (user_list, length) in result_collection:
            output_file.write(f"'{user_list[0]}'")
            for user in user_list[1:]:
                output_file.write(f", '{user}'")
            output_file.write("\n")
    print("total running time:", time.time() - total_start)
Example no. 22
def comments_to_graph(df, id_col, src_col, dest_col):
    '''
    takes in a table of raw reddit data
    returns a graphframe
    '''
    vertices = df.withColumnRenamed(id_col, 'id')
    edges = vertices.select(src_col, dest_col).withColumnRenamed(
        src_col, 'src').withColumnRenamed(dest_col, 'dst')

    graph = GraphFrame(vertices, edges)
    return graph
def sample_graph(pages,
                 pagelinks,
                 sampling_ratio,
                 relabel=True,
                 ensure_connected=True):
    vertices = pages.sample(sampling_ratio)
    edges = pagelinks.selectExpr("from as src", "dest as dst")
    graph = induce_graph(GraphFrame(vertices, edges), False)
    if ensure_connected:
        # only do this when sampling, on the full dataset takes 12 minutes. This may
        # be required in order to guarantee connectivity.
        components = graph.connectedComponents()
        largest_component = (components.groupBy("component").count().orderBy(
            F.desc("count")).limit(1).select("component"))
        vertices = components.join(largest_component,
                                   on="component",
                                   how="inner").drop("component")
        return induce_graph(GraphFrame(vertices, graph.edges), relabel=relabel)
    else:
        return graph
Example no. 24
    def lpa(self, graph, iter):
        print("Community Detection\t1\tInitializing Algorithm", flush=True)
        edges = graph.get_df()
        vertices = edges.select('src').union(
            edges.select('dst')).distinct().withColumnRenamed('src', 'id')

        print("Community Detection\t2\tExecuting Label Propagation Algorithm",
              flush=True)
        graph = GraphFrame(vertices, edges)
        result = graph.labelPropagation(maxIter=iter)
        return result.orderBy('label', ascending=True).withColumnRenamed(
            'label', 'Community')
Example no. 25
def main(argv):
    assert len(
        argv
    ) == 3, "Script takes 3 arguments <filter_threshold><input_file><community_output_file>"

    filter_threshold, input_file, output_file = argv

    filter_threshold = int(filter_threshold)

    config = SparkConf().setMaster("local[*]") \
                        .setAppName("Task2") \
                        .set("spark.executor.memory", "4g") \
                        .set("spark.driver.memory", "4g")

    sc = SparkContext(conf=config).getOrCreate()
    spark = SparkSession(sc)
    sc.setLogLevel("ERROR")

    lines = sc.textFile(input_file)
    header = lines.first()

    rdd_dict = lines.filter(lambda x: x != header) \
               .map(lambda x: (x.split(',')[0], x.split(',')[1])) \
               .groupByKey().collectAsMap()

    user_pairs = list(combinations(rdd_dict.keys(), 2))

    edges_rdd = sc.parallelize(user_pairs) \
                       .map(lambda x: (x[0], x[1])) \
                       .filter(lambda x: get_intersection(rdd_dict[x[0]], rdd_dict[x[1]]) >= filter_threshold) \
                       .cache()

    nodes_df = edges_rdd.flatMap(lambda x: x).distinct().map(
        lambda x: (x, )).toDF(["id"])

    edges_df = edges_rdd.toDF(["src", "dst"])

    gf = GraphFrame(nodes_df, edges_df)

    communities_rdd = gf.labelPropagation(maxIter=5).rdd.coalesce(1)

    communities = communities_rdd.map(lambda x: (x[1], x[0])) \
                                 .groupByKey() \
                                 .map(lambda x: sorted(list(x[1]))) \
                                 .sortBy(lambda x: (len(x), x)) \
                                 .collect()

    with open(output_file, "w+") as file:
        for community in communities:
            value = str(community)[1:-1]
            file.writelines(value + "\n")
        file.close()
Example no. 26
def main():
    print('Read data from BigQuery')
    vertices = load_data(bq_vertices_table)
    edges = load_data(bq_edges_table)
    graph = GraphFrame(vertices, edges)
    print('Find the largest connected subgraph')
    subgraph = find_the_largest_subgraph(graph)
    print('Calculate pagerank')
    results = subgraph.pageRank(resetProbability=0.15, maxIter=10)
    results.vertices\
        .select('id', 'pagerank')\
        .orderBy(desc('pagerank'))\
        .show(20, False)
    spark.stop()
Example no. 27
    def _colorGraph(g):
        """Given a GraphFrame, choose colors for each vertex.

        No neighboring vertices will share the same color. The number of colors is minimized.

        This is written specifically for grid graphs. For non-grid graphs, it should be generalized,
        such as by using a greedy coloring scheme.

        :param g: Grid graph generated by :meth:`Graphs.gridIsingModel()`
        :return: Same graph, but with a new vertex column "color" of type Int (0 or 1)

        """

        colorUDF = sqlfunctions.udf(lambda i, j: (i + j) % 2, returnType=types.IntegerType())
        v = g.vertices.withColumn('color', colorUDF(sqlfunctions.col('i'), sqlfunctions.col('j')))
        return GraphFrame(v, g.edges)
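The docstring notes that non-grid graphs would need a generalized coloring scheme; a hypothetical sketch of one such generalization, collecting a small graph to the driver and using networkx's greedy coloring (the function name and the collect step are assumptions):

import networkx as nx

def color_graph_greedy(g, spark):
    """Hypothetical greedy-coloring variant for small, non-grid graphs."""
    nxg = nx.Graph()
    nxg.add_nodes_from(r["id"] for r in g.vertices.collect())
    nxg.add_edges_from((r["src"], r["dst"]) for r in g.edges.collect())
    colors = nx.coloring.greedy_color(nxg, strategy="largest_first")
    color_df = spark.createDataFrame(list(colors.items()), ["id", "color"])
    return GraphFrame(g.vertices.join(color_df, on="id"), g.edges)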
Example no. 28
	def getConnectivity(self,rddv,spark):
		sc = spark.sparkContext
		radius = self.getRadius()
		dist = self.getDistance()
		dlist = rddv.collect()
		featurecol = self.getFeaturesCol()
		irows = [IndexedRow(i,dlist[i][featurecol].toArray()) for i in range(0,len(dlist))]
		imatrix = IndexedRowMatrix(sc.parallelize(irows))
		cart = imatrix.rows.cartesian(imatrix.rows)

		rows = Row("id","vector")
		usr_row = [rows(i, np.float_(x[featurecol].toArray()).tolist()) for i, x in enumerate(dlist)]
		verts = spark.createDataFrame(usr_row)
		A = cart.filter(lambda x : dist(x[0].vector,x[1].vector) <= radius).map(lambda x : (x[0].index, x[1].index, 1))
		edges = spark.createDataFrame(A,['src','dst','connected'])
		return GraphFrame(verts,edges)
Example no. 29
    def spread_activation_step(self, graph, attribute, spreading_factor,
                               transfer_function):
        """
        One step in the spread activation model.
        :param graph: graphframe object, network
        :param attribute: str, name of attribute/influence
        :param spreading_factor: 0 - 1, amount of influence to spread
        :param transfer_function: weighted or unweighted, how to transfer influence along edges
        :return: graphframe object, new network with updated new calculation of attribute in vertices
        """

        # Pass influence/message to neighboring nodes (weighted/unweighted option)
        if transfer_function == "unweighted":
            msgToSrc = (AM.src[attribute] /
                        AM.src["outDegree"]) * (1 - spreading_factor)
            msgToDst = sqlfunctions.when(
                AM.dst["outDegree"] != 0,
                ((AM.src[attribute] / AM.src["outDegree"]) * (spreading_factor)
                 )).otherwise(((1 / AM.dst["inDegree"]) * AM.dst[attribute]) +
                              ((AM.src[attribute] / AM.src["outDegree"]) *
                               (spreading_factor)))
        if transfer_function == "weighted":
            weight = AM.edge["weight"] / AM.src["w_outDegree"]
            msgToSrc = (AM.src[attribute] /
                        AM.src["outDegree"]) * (1 - spreading_factor)
            msgToDst = sqlfunctions.when(
                AM.dst["outDegree"] != 0,
                ((AM.src[attribute]) *
                 (spreading_factor * weight))).otherwise((
                     (1 / AM.dst["inDegree"]) * AM.dst[attribute]) + (
                         (AM.src[attribute]) * (spreading_factor * weight)))

        # Aggregate messages
        agg = graph.aggregateMessages(sqlsum(AM.msg).alias(attribute),
                                      sendToSrc=msgToSrc,
                                      sendToDst=msgToDst)

        # Create a new cached copy of the dataFrame to get new calculated attribute
        cachedNewVertices = AM.getCachedDataFrame(agg)
        tojoin = graph.vertices.select("id", "inDegree", "outDegree",
                                       "w_inDegree", "w_outDegree")
        new_cachedNewVertices = cachedNewVertices.join(tojoin, "id",
                                                       "left_outer")
        new_cachedNewVertices = new_cachedNewVertices.na.fill(0)

        # Return graph with new calculated attribute
        return GraphFrame(new_cachedNewVertices, graph.edges)
Example no. 30
def sample_subgraph(
    pages_path,
    pagelinks_path,
    pageviews_path,
    artifact_path,
    article_seed,
    k_hops,
    pagerank_alpha,
    pagerank_tol,
):
    start_time = time()
    click.echo("Starting!")

    if not os.path.exists(artifact_path):
        os.makedirs(artifact_path)

    # fetch the source datasets
    spark = SparkSession.builder.getOrCreate()
    pages = spark.read.parquet(pages_path)
    pagelinks = spark.read.parquet(pagelinks_path)
    pageviews = spark.read.parquet(pageviews_path)

    # extract induced subgraph
    graph = GraphFrame(pages, pagelinks.selectExpr("from as src",
                                                   "dest as dst"))
    if not article_seed:
        seed = (graph.vertices.sample(False, 0.001).orderBy(
            F.rand()).limit(1).collect()[0])
    else:
        seed = graph.vertices.where(f"id = {article_seed}").collect()[0]
    induced_subgraph = sample_induced_subgraph(graph, seed.id, k_hops,
                                               pagerank_alpha, pagerank_tol)

    # write to disk
    with open(f"{artifact_path}/seed.txt", "w") as f:
        f.write(f"{seed.id},{seed.title},{k_hops}")

    edges_df = induced_subgraph.edges.toPandas()
    edges_df.to_csv(f"{artifact_path}/edges.csv", index=False)

    vertices_df = induced_subgraph.vertices.orderBy(
        F.desc("pagerank")).toPandas()
    vertices_df.to_csv(f"{artifact_path}/mapping.csv", index=False)
    end_time = time()
    click.echo(f"Done! Took {end_time-start_time}")