Example no. 1
    def runBPwithGraphFrames(cls, g, numIter):
        """Run Belief Propagation using GraphFrame.

        This implementation of BP shows how to use GraphFrame's aggregateMessages method.
        """
        # choose colors for vertices for BP scheduling
        colorG = cls._colorGraph(g)
        numColors = colorG.vertices.select('color').distinct().count()

        # TODO: handle vertices without any edges

        # initialize vertex beliefs at 0.0
        gx = GraphFrame(
            colorG.vertices.withColumn('belief', sqlfunctions.lit(0.0)),
            colorG.edges)

        # run BP for numIter iterations
        for iter_ in range(numIter):
            # for each color, have that color receive messages from neighbors
            for color in range(numColors):
                # Send messages to vertices of the current color.
                # We may send to source or destination since edges are treated as undirected.
                msgForSrc = sqlfunctions.when(AM.src['color'] == color,
                                              AM.edge['b'] * AM.dst['belief'])
                msgForDst = sqlfunctions.when(AM.dst['color'] == color,
                                              AM.edge['b'] * AM.src['belief'])
                # numerically stable sigmoid
                logistic = sqlfunctions.udf(cls._sigmoid,
                                            returnType=types.DoubleType())
                aggregates = gx.aggregateMessages(
                    sqlfunctions.sum(AM.msg).alias("aggMess"),
                    sendToSrc=msgForSrc,
                    sendToDst=msgForDst)
                v = gx.vertices
                # receive messages and update beliefs for vertices of the current color
                newBeliefCol = sqlfunctions.when(
                    (v['color'] == color) &
                    (aggregates['aggMess'].isNotNull()),
                    logistic(aggregates['aggMess'] + v['a'])).otherwise(
                        v['belief'])  # keep old beliefs for other colors
                newVertices = (v
                    .join(aggregates, on=(v['id'] == aggregates['id']), how='left_outer')
                    .drop(aggregates['id'])  # drop duplicate ID column (from outer join)
                    .withColumn('newBelief', newBeliefCol)  # compute new beliefs
                    .drop('aggMess')  # drop messages
                    .drop('belief')  # drop old beliefs
                    .withColumnRenamed('newBelief', 'belief'))
                # cache new vertices using workaround for SPARK-1334
                cachedNewVertices = AM.getCachedDataFrame(newVertices)
                gx = GraphFrame(cachedNewVertices, gx.edges)

        # Drop the "color" column from vertices
        return GraphFrame(gx.vertices.drop('color'), gx.edges)
Example no. 2
    def runBPwithGraphFrames(cls, g, numIter):
        """Run Belief Propagation using GraphFrame.

        This implementation of BP shows how to use GraphFrame's aggregateMessages method.
        """
        # choose colors for vertices for BP scheduling
        colorG = cls._colorGraph(g)
        numColors = colorG.vertices.select('color').distinct().count()

        # TODO: handle vertices without any edges

        # initialize vertex beliefs at 0.0
        gx = GraphFrame(colorG.vertices.withColumn('belief', sqlfunctions.lit(0.0)), colorG.edges)

        # run BP for numIter iterations
        for iter_ in range(numIter):
            # for each color, have that color receive messages from neighbors
            for color in range(numColors):
                # Send messages to vertices of the current color.
                # We may send to source or destination since edges are treated as undirected.
                msgForSrc = sqlfunctions.when(
                    AM.src['color'] == color,
                    AM.edge['b'] * AM.dst['belief'])
                msgForDst = sqlfunctions.when(
                    AM.dst['color'] == color,
                    AM.edge['b'] * AM.src['belief'])
                # numerically stable sigmoid
                logistic = sqlfunctions.udf(cls._sigmoid, returnType=types.DoubleType())
                aggregates = gx.aggregateMessages(
                    sqlfunctions.sum(AM.msg).alias("aggMess"),
                    sendToSrc=msgForSrc,
                    sendToDst=msgForDst)
                v = gx.vertices
                # receive messages and update beliefs for vertices of the current color
                newBeliefCol = sqlfunctions.when(
                    (v['color'] == color) & (aggregates['aggMess'].isNotNull()),
                    logistic(aggregates['aggMess'] + v['a'])
                ).otherwise(v['belief'])  # keep old beliefs for other colors
                newVertices = (v
                    .join(aggregates, on=(v['id'] == aggregates['id']), how='left_outer')
                    .drop(aggregates['id'])  # drop duplicate ID column (from outer join)
                    .withColumn('newBelief', newBeliefCol)  # compute new beliefs
                    .drop('aggMess')  # drop messages
                    .drop('belief')  # drop old beliefs
                    .withColumnRenamed('newBelief', 'belief')
                )
                # cache new vertices using workaround for SPARK-1334
                cachedNewVertices = AM.getCachedDataFrame(newVertices)
                gx = GraphFrame(cachedNewVertices, gx.edges)

        # Drop the "color" column from vertices
        return GraphFrame(gx.vertices.drop('color'), gx.edges)
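Both listings above are the same belief-propagation routine; the primitive they build on is GraphFrame's aggregateMessages. Below is a minimal, self-contained sketch of that pattern on a hypothetical toy graph (standard GraphFrames API only, not part of either example).

# Minimal aggregateMessages sketch on a hypothetical toy graph: each vertex
# sums a value received from its neighbours, which is exactly the
# send/aggregate step the belief-propagation loop above repeats per color.
from pyspark.sql import SparkSession, functions as F
from graphframes import GraphFrame
from graphframes.lib import AggregateMessages as AM

spark = SparkSession.builder.getOrCreate()
v = spark.createDataFrame([('a', 1.0), ('b', 2.0), ('c', 3.0)], ['id', 'belief'])
e = spark.createDataFrame([('a', 'b'), ('b', 'c')], ['src', 'dst'])
g = GraphFrame(v, e)

# Every edge sends the destination's belief to its source and vice versa;
# the aggregation column sums whatever arrived at each vertex.
agg = g.aggregateMessages(
    F.sum(AM.msg).alias('summedBelief'),
    sendToSrc=AM.dst['belief'],
    sendToDst=AM.src['belief'])
agg.show()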
vertices = (grouped_edges
    .withColumn('neighbors_of_neighbors',
                create_id_list_column_udf(grouped_edges['id'], grouped_edges['neighbors']))
    .join(vertices, on='id', how='left_outer')
    .drop('neighbors'))
vertices.show(500, truncate=False)

#---------------------------------------
print('State 3')
g = GraphFrame(vertices, edges)
#g.vertices.show(500, truncate=False)
#g.edges.show(500, truncate=False)

aggregates = g.aggregateMessages(
    F.collect_set(AM.msg).alias('neighbors_of_neighbors2'),
    sendToDst=AM.src['neighbors_of_neighbors'])

vertices = (vertices
    .join(aggregates, on='id', how='left_outer')
    .withColumnRenamed('neighbors_of_neighbors', 'neighbors')
    .withColumnRenamed('neighbors_of_neighbors2', 'neighbors_of_neighbors'))
vertices.show(500, truncate=False)

#---------------------------------------
print('State 4')


def same_neighbors(neighbors, neighbors_of_neighbors):
    .save()
#
logging.info('Done!')
#
# #
# ## closeness centrality
#
vertices = g.vertices.withColumn("ids", F.array())
cached_vertices = AM.getCachedDataFrame(vertices)
g2 = GraphFrame(cached_vertices, g.edges)

for i in range(0, g2.vertices.count()):
    msg_dst = new_paths_udf(AM.src["ids"], AM.src["id"])
    msg_src = new_paths_udf(AM.dst["ids"], AM.dst["id"])
    agg = g2.aggregateMessages(F.collect_set(AM.msg).alias("agg"),
                               sendToSrc=msg_src,
                               sendToDst=msg_dst)
    res = agg.withColumn("newIds", flatten_udf("agg")).drop("agg")
    new_vertices = (g2.vertices
        .join(res, on="id", how="left_outer")
        .withColumn("mergedIds", merge_paths_udf("ids", "newIds", "id"))
        .drop("ids", "newIds")
        .withColumnRenamed("mergedIds", "ids"))
    cached_new_vertices = AM.getCachedDataFrame(new_vertices)
    g2 = GraphFrame(cached_new_vertices, g2.edges)


closeness_centrality = g2.vertices\
    .withColumn("closeness", closeness_udf("ids"))\
    .sort("closeness", ascending=False)

# closeness_centrality.write.csv('Closeness_centrality.csv')
Example no. 5
    def run(self, graph, num_iter=1):

        model = self.model

        # Make a spark UDF to initialize each node with a random trait
        @sqlfunctions.udf(returnType=types.ArrayType(types.IntegerType()))
        def udf_initialize_traits(arg):
            return model.initialize_traits(arg)

        # Make a spark UDF handling inter-node communication
        @sqlfunctions.udf(returnType=types.ArrayType(types.IntegerType()))
        def udf_node_interaction(my_traits, neighbor_traits):
            return model.check_neighbor_encounters(my_traits, neighbor_traits)

        # Make a Spark UDF that merges a node's traits with its chosen neighbor's traits
        @sqlfunctions.udf(returnType=types.ArrayType(types.IntegerType()))
        def udf_combine_traits(my_traits, other_traits):
            return model.combine_traits(my_traits, other_traits)

        # Initialize random traits
        new_vertices = graph.vertices\
            .withColumn("traits", udf_initialize_traits(graph.vertices.id))

        # Make a new graph with this random trait
        current_graph = GraphFrame(new_vertices, graph.edges)

        # For number of iterations we want to run:
        diversities = []
        for i in range(num_iter):

            # Send neighbor traits to each node
            neighbor_traits = current_graph.aggregateMessages(
                sqlfunctions.collect_list(AM.msg).alias("neighbor_traits"),
                sendToSrc=None,
                sendToDst=AM.src["traits"])

            # Join neighbor traits back to main table
            new_vertices = current_graph.vertices\
                .join(neighbor_traits, "id", "left_outer")

            # Select which neighbor to interact with
            new_vertices = new_vertices\
                .withColumn("interaction_traits", udf_node_interaction("traits", "neighbor_traits"))\
                .drop("neighbor_traits")

            # Mix your and neighbor traits
            new_vertices = new_vertices\
                .withColumn("combined_traits", udf_combine_traits("traits", "interaction_traits"))

            # Drop intermediate columns
            new_vertices = new_vertices\
                .drop("traits", "interaction_traits")\
                .withColumnRenamed("combined_traits", "traits")

            # Cache
            cached_new_vertices = AM.getCachedDataFrame(new_vertices)

            # Update current graph with new nodes
            current_graph = GraphFrame(cached_new_vertices, graph.edges)

            # Record trait diversity
            diversity = current_graph.vertices.select(
                'traits').distinct().count()
            diversities.append(diversity)

            print("Iteration: {}, trait diversity: {}".format(i, diversity))

        return current_graph, diversities
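A hypothetical driver for run(): the enclosing class and its model object are not part of this excerpt, so TraitSimulation, my_model and spark below are assumptions, not names from the original source.

# Hypothetical usage sketch for the method above. `TraitSimulation` stands in
# for the unshown class that owns run() and holds self.model; `spark` is an
# existing SparkSession.
from graphframes import GraphFrame

v = spark.createDataFrame([('a',), ('b',), ('c',)], ['id'])
e = spark.createDataFrame([('a', 'b'), ('b', 'c'), ('c', 'a')], ['src', 'dst'])
g = GraphFrame(v, e)

sim = TraitSimulation(model=my_model)           # hypothetical constructor
final_graph, diversities = sim.run(g, num_iter=10)
print(diversities)                              # trait diversity after each iteration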
Example no. 6
# Page rank
print("\n Page rank")
# # run until convergence to tol
# results = g.pageRank(resetProbability=0.15, tol=0.01)
# results.vertices.select("id", "pagerank").show()
# results.edges.select("src", "dst", "weight").show()

## Run PageRank personalized for vertex ["a", "b", "c", "d"] in parallel
# results4 = g.parallelPersonalizedPageRank(resetProbability=0.15, sourceIds=["a", "b", "c", "d"], maxIter=10)\

print("\n shortest paths from each node to landmark nodes")
results = g.shortestPaths(landmarks=["a", "d"])
results.select("id", "distances").show()

# # Saving and Loading GraphFrames
# g.vertices.write.parquet("hdfs://myLocation/vertices")
# g.edges.write.parquet("hdfs://myLocation/edges")
#
# # Load the vertices and edges back.
# sameV = sqlContext.read.parquet("hdfs://myLocation/vertices")
# sameE = sqlContext.read.parquet("hdfs://myLocation/edges")

# message passing via AggregateMessages
# For each user, sum the ages of the adjacent users.
msgToSrc = AM.dst["age"]
msgToDst = AM.src["age"]
agg = g.aggregateMessages(sum(AM.msg).alias("summedAges"),
                          sendToSrc=msgToSrc,
                          sendToDst=msgToDst)
agg.show()
display(e)

# COMMAND ----------

from graphframes import GraphFrame
from graphframes.lib import AggregateMessages as AM

# Create a graph with vertices containing an empty parents array column
g = GraphFrame(v.withColumn("parents", array()), e)

# Initial message to be passed to neighbor vertices. We want to traverse from the leaf, hence AM.src
msgToDst = AM.src["name"]

for i in range(6):
    # AM.msg contains the next message i.e. next parent in our case
    agg = g.aggregateMessages(collect_list(AM.msg).alias("tmpParent"),
                              sendToDst=msgToDst)

    # Append this message to the parents array column of vertices and also keep it as a standalone column for next iteration
    currentV = g.vertices
    newV = currentV.join(agg, "id", how = "left") \
      .drop(agg["id"]) \
      .withColumn("parents", concat(agg["tmpParent"], currentV["parents"])) \
      .withColumn("lastParent", col("tmpParent")[0]) \
      .drop("tmpParent")

    # Caching the intermediate vertices DataFrame is important here; otherwise the Spark job will take a very long time to complete
    cachedNewV = AM.getCachedDataFrame(newV)
    g = GraphFrame(cachedNewV, g.edges)

    # Pass the standalone column i.e recent parent to the next iteration
    msgToDst = AM.src["lastParent"]
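A hedged side note on the loop above: Spark's concat returns null as soon as any of its arguments is null, so vertices that receive no message in an iteration (the left join leaves tmpParent null) end up with a null parents column. If that matters for your graph, one defensive variant of the join step coalesces the aggregated column first, as in the sketch below (same hypothetical column names as above).

# Hedged variant of the join step above: coalesce the aggregated column to an
# empty array so concat never sees a null and the parents column stays intact.
from pyspark.sql.functions import coalesce, array, concat, col

newV = currentV.join(agg, "id", how="left") \
    .drop(agg["id"]) \
    .withColumn("tmpParent", coalesce(agg["tmpParent"], array())) \
    .withColumn("parents", concat(col("tmpParent"), currentV["parents"])) \
    .withColumn("lastParent", col("tmpParent")[0]) \
    .drop("tmpParent")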
    types.StructField("neighbours", types.ArrayType(types.StringType()))
])
new_neighbours_udf = F.udf(new_neighbours, neighbours_type)

vertices = vertices.withColumn(
    "neighbours",
    new_neighbours_udf(vertices["id"], vertices["neighbours_list"]))

# construct the graph
graph = GraphFrame(vertices, edges)
print("Graph after tweaks:")
graph.vertices.show()
graph.edges.show()

# send neighbours list to neighbours
aggregates = graph.aggregateMessages(F.collect_set(AM.msg).alias("agg"),
                                     sendToDst=AM.src["neighbours"])
print("Using aggregateMessages:")
aggregates.show()

# find common neighbours
print("Finding common neighbours:")
aggregates = aggregates.join(graph.vertices, on="id").drop("neighbours")
# aggregates.show()


def common_neighbours(node_neighbours, messagers_neighbours):
    common_list = []
    for neighbours in messagers_neighbours:
        common_list.append({
            "id": neighbours.id,
Example no. 9
    def LPAImp(self, numIter, modularity=True):
        """Label propagation algorithm for bipartite networks with a synchronous
        updating scheme. Returns a GraphFrame whose vertices contain the vertex
        ID, the label assignment and the modularity (if specified to be
        returned).

        Keyword Arguments:

        numIter -- Number of iterations for LPAb

        modularity -- A boolean variable indicating whether the
        modularity should be calculated and returned.
        """
        # Assign initial label to the users
        initLabelUDF = F.udf(lambda i, j: i if j == 1 else None,
                             types.IntegerType())
        v = self.gf.vertices.withColumn(
            'label', initLabelUDF(F.col('id'), F.col('nodeType')))
        # Add a self-loop edge for every node
        E_self = self.SS.createDataFrame(v.select(F.col('id')).rdd)
        E = AM.getCachedDataFrame(
            self.gf.edges.union(
                E_self.withColumn('dst',
                                  F.col('id')).withColumnRenamed('id', 'src')))

        # Create a new graphframe object with labels attached
        LPAbgf = GraphFrame(v, E)

        # Create a UDAF (User Defined Aggregate Function) that returns the most frequent
        # label
        @pandas_udf("int", PandasUDFType.GROUPED_AGG)
        def maxLabel_udf(label_list):
            label_list = list(filter(None, label_list))
            LabelCounts = Counter(label_list)
            mostCommonLabels = [
                i[0] for i in LabelCounts.items()
                if i[1] == max(LabelCounts.values())
            ]
            return np.random.choice(mostCommonLabels)

        for iter_ in range(numIter):
            for nodeType in [1, 2]:
                # For user and repo nodes, send their labels to
                # their destination nodes in alternating order
                msgForDst = F.when(AM.src['nodeType'] == nodeType,
                                   AM.src['label'])
                # If it is the repos' turn to send labels to their destinations,
                # also send each repo's label back to its contributors
                if nodeType == 2:
                    msgForSrc = F.when(AM.src['nodeType'] == 1,
                                       AM.dst['label'])
                else:
                    msgForSrc = None

                # Aggregate messages received from each node
                aggregates = LPAbgf.aggregateMessages(
                    aggCol=maxLabel_udf(AM.msg).alias("aggMess"),
                    sendToDst=msgForDst,
                    sendToSrc=msgForSrc)
                v = LPAbgf.vertices

                # Update Labels for each node; If there is message for
                # the node, update the node's Label
                newLabelCol = F.when(aggregates["aggMess"].isNotNull(),
                                     aggregates["aggMess"]).otherwise(
                                         v['label'])
                # Outer join aggregates and vertices
                vNew = (
                    v.join(aggregates,
                           on=(v['id'] == aggregates['id']),
                           how='left_outer').drop(aggregates['id'])
                    # Compute new column
                    .withColumn('newLabel', newLabelCol)
                    # Drop messages
                    .drop('aggMess')
                    # Drop old labels
                    .drop('label').withColumnRenamed('newLabel', 'label'))

                cachedvNew = AM.getCachedDataFrame(vNew)
                LPAbgf = GraphFrame(cachedvNew, E)
        # Drop the self-loop edges by restoring the original edge set
        LPAbgf = GraphFrame(LPAbgf.vertices, self.gf.edges)
        return LPAbgf
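The class that owns LPAImp is not part of this excerpt, so the following driver is only a sketch: LPAb, spark and the toy bipartite data are assumptions, and only the columns the method actually reads (id, nodeType, src, dst) are populated.

# Hypothetical driver for LPAImp(); `LPAb` stands in for the unshown wrapper
# class that exposes self.gf (a GraphFrame) and self.SS (a SparkSession).
from graphframes import GraphFrame

v = spark.createDataFrame(
    [(1, 1), (2, 1), (3, 2), (4, 2)],          # users have nodeType 1, repos 2
    ['id', 'nodeType'])
e = spark.createDataFrame(
    [(1, 3), (2, 3), (2, 4)],                  # user -> repo contribution edges
    ['src', 'dst'])

lpa = LPAb(GraphFrame(v, e), spark)            # hypothetical constructor
communities = lpa.LPAImp(numIter=5, modularity=False)
communities.vertices.select('id', 'label').show()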
    color = old_local_max.color
    step = old_local_max.step + 1
    if new_local_max.id < old_local_max.id:
        maxima = True
        color = old_local_max.step

    return {"id": old_local_max.id, "color": color, "maxima": maxima, "step": step}


compare_local_max_value_udf = F.udf(compare_local_max_value, local_max_value_type)

# Local Maxima First Algorithm
while True:
    # Aggregate messages from the neighbors.
    aggregates = g.aggregateMessages(
        F.collect_set(AM.msg).alias("agg"), sendToDst=AM.src["localMaxima"]
    )
    res = aggregates.withColumn(
        "newlocalMaxima", greater_local_max_value_udf("agg")
    ).drop("agg")

    # Aggregate and Join vertices leveraging localMaxima values
    new_vertices = (
        g.vertices.join(res, on="id", how="left_outer")
        .withColumnRenamed("localMaxima", "oldlocalMaxima")
        .withColumn(
            "localMaxima",
            compare_local_max_value_udf(
                F.col("oldlocalMaxima"), F.col("newlocalMaxima")
            ),
        )
Example no. 11
            min_rating_id = rating.id
    return {"id": min_rating_id, "rating": min_rating}


min_rating_udf = F.udf(min_rating, player_rating_type)


# UDF for finding the minimum rating between the old one and the new one.
def compare_rating(old_rating, new_rating):
    return old_rating if old_rating.rating < new_rating.rating else new_rating


compare_rating_udf = F.udf(compare_rating, player_rating_type)

# Iterative graph computations
max_iterations = 5
for _ in range(max_iterations):
    aggregates = g.aggregateMessages(F.collect_set(AM.msg).alias("agg"),
                                     sendToDst=AM.src["minRating"])
    res = aggregates.withColumn("newMinRating",
                                min_rating_udf("agg")).drop("agg")
    new_vertices = (g.vertices
        .join(res, on="id", how="left_outer")
        .withColumnRenamed("minRating", "oldMinRating")
        .withColumn("minRating",
                    compare_rating_udf(F.col("oldMinRating"), F.col("newMinRating")))
        .drop("oldMinRating", "newMinRating"))
    cached_new_vertices = AM.getCachedDataFrame(new_vertices)
    g = GraphFrame(cached_new_vertices, g.edges)
    g.vertices.show()
Example no. 12
    Handle the aggregated messages received from src.
    :param msgs:
    :return:
    """
    max_ratio = -1
    cnodes = []

    for msg in msgs:
        if msg.max_ratio > max_ratio:
            max_ratio = msg.max_ratio
            cnodes = msg.cnodes

    return {'max_ratio': max_ratio, 'cnodes': cnodes}


aggregates = g.aggregateMessages(F.collect_set(AM.msg).alias("agg"),
                                 sendToDst=AM.src["rc"])

agg_src_udf = F.udf(agg_src_func, rc_type)
res = aggregates.withColumn("rc", agg_src_udf("agg")).drop("agg")
print("First initialization".center(88, "*"))
init_vertices = (g.vertices
    .join(res, res.id == g.vertices.id, "left")
    .select(g.vertices.id, g.vertices.name, res.rc))

new_vertices = init_vertices.select(
    init_vertices.id,
    init_vertices.name,
    F.when(init_vertices.rc.isNull(),
           rc_func_udf(init_vertices['id'], F.lit(100), F.array(init_vertices['id'])))
     .otherwise(init_vertices['rc'])
     .alias('rc'))