Example no. 1
def process_batch(batch_filename, batch_num):
    import graphframes
    print('processing', batch_filename)
    edges = spark.read.csv(batch_filename, header=True)
    graph = graphframes.GraphFrame(vertices, edges)
    communities = graph.labelPropagation(maxIter=5)
    comm_filename = 'data/models/comm_{}.csv'.format(batch_num)
    communities.toPandas().to_csv(comm_filename, header=True, index=False)
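The function reads `spark` and `vertices` from its enclosing scope; a hypothetical driver loop, assuming both are defined at module level and an illustrative data/batches/ directory of CSV files:

import glob

# `spark` and `vertices` are assumed to exist in the enclosing scope,
# since process_batch references them without defining them.
for batch_num, batch_filename in enumerate(sorted(glob.glob('data/batches/*.csv'))):
    process_batch(batch_filename, batch_num)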
Example no. 2
def createGraph(vertices, edges):
	'''
	Function: Create graph
	Parameters: PySpark DataFrames - vertices, edges
	Returns: GraphFrame
	'''
	print("Creating graph..")
	# Generate the graph
	graph = GF.GraphFrame(vertices, edges)
	print("Graph creation complete.")
	return graph
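The snippet assumes graphframes was imported under the alias `GF`; a minimal call might look like this, assuming a live SparkSession `spark` (per the GraphFrame convention, vertices need an `id` column and edges need `src` and `dst`):

import graphframes as GF  # assumed alias used by createGraph

vertices = spark.createDataFrame([(1, 'Alice'), (2, 'Bob')], ['id', 'name'])
edges = spark.createDataFrame([(1, 2)], ['src', 'dst'])
graph = createGraph(vertices, edges)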
Example no. 3
def main():

    sc = initializeSpark()

    spark = SparkSession(sc)

    directory, post_id = parse_pls()
    rdds = make_rdds_from_dir(directory, sc)
    post_rdd = rdds["posts_rdd"]

    string = make_stripped_string(post_rdd, post_id)

    print("\n Body from post_id: " + str(post_id) +
          ", stripped of shitespaces and special characters:\n")
    print("'" + string + "'\n")

    # Tokenize the string
    tokens = tokenize(string)
    # remove duplicate entries
    tokens_unique = remove_dupes(tokens)

    # Assign id to the unique tokens
    token_id_tuple = assign_id_to_list(tokens_unique)
    # Now assign these ids to the original token list
    token_id_all = assign_unique_ids(token_id_tuple, tokens)

    print("\nTokens retrieved from the body with their respective id's: \n")
    for i in token_id_all:
        print(i)

    print("\n\nEdges:\n")
    ids = []
    for i in token_id_all:
        ids.append(i[0])

    # Create edges on a window size of 5, using the ids of the tokens
    edges = create_edges(ids, 5)
    # Removes duplicate edges from list
    edges = remove_dupe_tuples(edges)
    print(edges)
    print("\n\nPageRank:")

    sqlContext = SQLContext(sc)

    v = sqlContext.createDataFrame(token_id_tuple, ["id", "word"])

    e = sqlContext.createDataFrame(edges, ["src", "dst"])

    g = graphframes.GraphFrame(v, e)

    results = g.pageRank(resetProbability=0.15, tol=0.0001)
    results.vertices.select("word", "pagerank").show(truncate=False)
Example no. 4
def calculaSugestaoDeAmigos(usuarios, relacionamentos, intervalo):
    grafo = graphframes.GraphFrame(usuarios, relacionamentos)
    resultado = []
    # compute the friend suggestion for every user in the interval
    for inter in intervalo:
        # for each user we must generate the friend suggestion
        resultado.append(sugestao(grafo, inter, usuarios))
        """We wrote our own function, but this could also have been done with
            graph.find("(a)-[e]->(b); (b)-[e2]->(c); !(a)-[]->(c)")
            i.e. we want a user "a" that has a connection to "b" (a)-[e]->(b),
            where "b" has a connection to "c" (b)-[e2]->(c),
            but "a" and "c" have no connection !(a)-[]->(c)."""

    return resultado
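The motif described in the comment above can be written out directly; a minimal sketch (the `sugestao` helper is not shown in the snippet, so the column handling here is illustrative):

# Friend-of-friend motif: "a" knows "b", "b" knows "c", and "a" does not know "c".
candidates = grafo.find("(a)-[e]->(b); (b)-[e2]->(c); !(a)-[]->(c)")
# Drop matches where a and c are the same vertex, then keep the suggestion pairs.
suggestions = candidates.filter("a.id != c.id").selectExpr("a.id AS user", "c.id AS suggestion")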
Example no. 5
def loadGraph(context, name):
	'''
	Function: Load graph from file
	Parameters: SparkSession - context, str - name
	Returns: GraphFrame
	'''
	# Load the graph from file
	print("\nLoading graph data..")
	vertices = context.read.parquet('store/'+name+'Vertices.parquet')
	edges = context.read.parquet('store/'+name+'Edges.parquet')
	print("\nGenerating graph..")
	graph = GF.GraphFrame(vertices, edges)
	print("\nGraph load complete.")
	return graph
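A save counterpart is not shown; sketched under the same store/<name>Vertices.parquet and store/<name>Edges.parquet naming the loader expects, it might look like:

def saveGraph(graph, name):
	'''
	Function: Save graph to file (hypothetical inverse of loadGraph)
	Parameters: GraphFrame - graph, str - name
	Returns: N/A
	'''
	graph.vertices.write.mode('overwrite').parquet('store/'+name+'Vertices.parquet')
	graph.edges.write.mode('overwrite').parquet('store/'+name+'Edges.parquet')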
Example no. 6
        .appName("PostLinksRelation")\
        .getOrCreate()
    #people = spark.createDataFrame([("Bilbo Baggins",  50), ("Gandalf", 1000), ("Thorin", 195), ("Balin", 178), ("Kili", 77),
    #        ("Dwalin", 169), ("Oin", 167), ("Gloin", 158), ("Fili", 82), ("Bombur", None)], ["name", "age"])
    #people.write.format("com.mongodb.spark.sql.DefaultSource").mode("append").save()

    #df = spark.read.format("com.mongodb.spark.sql.DefaultSource").load()
    df = spark.read.json(sys.argv[1])
    df.printSchema()
    edges = df.rdd.map(parseEdges).cache()
    nodes = edges.map(lambda x: (x[0], x[1])).flatMap(lambda x: x).distinct()
    edges = changeToDF_v2(edges, "src dst weight", "edges")
    #nodes=changeToDF_v1(nodes,"nodes")
    nodes = changeToDF_v2(nodes, "id", "nodes")
    g = graphframes.GraphFrame(nodes, edges)
    # dropIsolatedVertices returns a new GraphFrame rather than mutating g in place
    g = g.dropIsolatedVertices()

    print("+" * 100)
    vertices = g.vertices.collect()
    vertices = list(map(lambda x: x["id"], vertices))
    print(vertices[:36])
    print("+" * 100)

    distances_data = None
    for i in range(len(vertices)):
        print("*" * 50 + "{}/{}".format(i + 1, len(vertices)) + "*" * 50)
        for j in range(i):
            distances = g.bfs(
                fromExpr="id!=-123",
                toExpr="id!=-123",
Example no. 7
            if determine_edge((input_list[i], input_list[j])):
                output.append((input_list[i], input_list[j]))
    return output

edge = select_pair(t0)

v = set()
for i in edge:
    for t in i:
        if t not in v:
            v.add(t)
vertices = sqlContext.createDataFrame(list(map(lambda a: [a], v)), ["id"])

# make the graph undirected by adding the reverse of every edge
edge += list(map(lambda x: tuple(reversed(x)), edge))
edges = sqlContext.createDataFrame(edge, ["src", "dst"])
g = graphframes.GraphFrame(vertices, edges)



result = g.labelPropagation(maxIter=5).groupBy("label").agg(F.collect_list("id")).collect()
# result.select("id", "label").show()

output = []
for i in result:
    output.append(list(sorted(i["collect_list(id)"])))

# sort communities by size, then by their smallest member
output.sort(key=lambda x: (len(x), x[0]))
file2 = open(community_output_file_path, "w")
# for i in range(len(output)-1):
#     file2.write(''.join(str(s) for s in output[i]) + '\n')
Example no. 8
        biz1 = user_biz_dict[user_pair[0]]
        biz2 = user_biz_dict[user_pair[1]]
        return len(biz1 & biz2) >= filter_threshold

    edgeRDD = user_pairsRDD.filter(filter_edge)
    nodeRDD = edgeRDD.flatMap(lambda x: x).distinct()
    # undirected graph, edges need to be both way
    edgeRDD = edgeRDD.flatMap(lambda x: [x, (x[1], x[0])]) \
        .map(lambda x: (idx_user_dict[x[0]], idx_user_dict[x[1]]))
    edgeDF = edgeRDD.toDF(["src", "dst"])
    print("number of edges in the graph:", edgeDF.count())
    nodeDF = nodeRDD.map(lambda x: (idx_user_dict[x],)).toDF(["id"])
    print("number of nodes in the graph:", nodeDF.count())

    # gotcha: the graph's nodes and edges must still use the original user_id values
    g = graphframes.GraphFrame(nodeDF, edgeDF)
    resultDF = g.labelPropagation(maxIter=5)

    # (label, list of user_id)
    resultRDD = resultDF.rdd.map(lambda x: (x[1], x[0])) \
        .groupByKey().mapValues(list) \
        .map(lambda x: sorted(x[1])) \
        .sortBy(lambda x: (len(x), x[0]))
    
    result = resultRDD.collect()

    with open(community_output_file_path, 'w') as f:
        for one in result:
            print(str(one)[1:-1], file=f)

    end_time = time.time()
Example no. 9
def resolve(sc, nodes, edges):
    graph = gf.GraphFrame(nodes, edges)
    # connectedComponents requires a Spark checkpoint directory to be set
    sc.setCheckpointDir("/tmp/checkpoints")
    return graph.connectedComponents()
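An illustrative invocation, assuming a live SparkSession `spark`, a `nodes` DataFrame with an `id` column, and an `edges` DataFrame with `src`/`dst` columns:

components = resolve(spark.sparkContext, nodes, edges)
# each vertex comes back tagged with the id of its connected component
components.select("id", "component").show()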
Example no. 10
def generate_spark_graph(strings, sc, mat=None, min_ld=1, max_ld=1):
    """
    Make a graph using the Spark graphframes library

    Inputs
    ------

    strings: list
        a list of strings to use for the pairwise distance matrix
    sc : pyspark.SparkContext
        a live SparkContext
    mat : pyspark.RDD, optional
        an RDD representing the distance matrix (returned by `distance_matrix`). If not given, 
        it is generated automatically
    min_ld : int, optional
        minimum Levenshtein distance
    max_ld : int, optional
        maximum Levenshtein distance

    Returns
    -------
    g : graphframes.GraphFrame object with strings as node names

    """
    try:
        import findspark
        findspark.init()
        import graphframes
        from pyspark.sql import Row, SQLContext
        from pyspark.sql.types import StructField, StructType, IntegerType, ShortType, StringType
    except ImportError:
        warn(
            'Problem importing pyspark -- are you sure your SPARK_HOME is set?'
        )
        raise

    sqc = SQLContext(sc)

    strings_b = sc.broadcast(strings)
    size = len(strings)

    # make the vertex DataFrame
    v_schema = StructType([
        StructField('id', IntegerType()),
        StructField('string', StringType())
    ])
    v_rdd = sc.parallelize(
        range(size)).map(lambda x: Row(id=x, string=strings_b.value[x]))
    v = sqc.createDataFrame(v_rdd, schema=v_schema)

    # make the edge DataFrame
    if mat is None:
        mat = distance_matrix(strings, min_ld=min_ld, max_ld=max_ld, sc=sc)
    e_schema = StructType([
        StructField('src', IntegerType()),
        StructField('dst', IntegerType()),
        StructField('weight', ShortType())
    ])
    e = sqc.createDataFrame(mat, schema=e_schema)
    gf = graphframes.GraphFrame(v, e)

    return gf
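An illustrative call, assuming a live SparkContext `sc` and the module-level `distance_matrix` helper the docstring refers to:

strings = ['kitten', 'mitten', 'sitting']
g = generate_spark_graph(strings, sc)
# edges connect string pairs whose Levenshtein distance falls in [min_ld, max_ld]
g.edges.show()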
Example no. 11
vertices_df = vertices_df.repartition(conf.partitions, "cell")
effective_partitions = vertices_df.rdd.getNumPartitions()
print("effective partitions # =", effective_partitions)

# ---------- edges

edge_values = lambda start, stop: [(i, e[0], e[1]) for i, e in enumerate(
    edge_it(conf.vertices, range(start, stop), conf.degree_max))]

edges = batch_create(directory=conf.graphs,
                     file="edges",
                     build_values=edge_values,
                     columns=["eid", "src", "dst"],
                     total_rows=conf.vertices,
                     batches=conf.batches_edges,
                     vertices=vertices_df,
                     grid_size=conf.g)

graph = graphframes.GraphFrame(vertices_df, edges)
stepper.show_step("Create a GraphFrame")

print("count: vertices=", graph.vertices.count(), "edges=",
      graph.edges.count())
stepper.show_step("count GraphFrame")

graph.vertices.show()
graph.edges.show()

spark.sparkContext.stop()
Example no. 12
    # company to product - minedFrom
    # product to company - offers
    edges_to_product = values.withColumn("relationship", F.lit("minedFrom"))
    edges_to_product = edges_to_product \
     .select(F.col("company_nr").alias("src"), F.col("product").alias("dst"), "relationship") \
     .distinct()

    # since we need PageRank for the companies, we create a second DataFrame
    # with edges pointing from products to companies, keeping the graph directed
    edges_from_product = values.withColumn("relationship", F.lit("offers"))
    edges_from_product = edges_from_product \
     .select(F.col("product").alias("src"), F.col("company_nr").alias("dst"), "relationship") \
     .distinct()

    edges = edges_from_product.unionAll(edges_to_product)

    g = graphframes.GraphFrame(nodes, edges)

    # For each vertex, find the average neighbour degree
    # first calculate the degree of each vertex
    print("Calculating Degree")
    vertices_out = g.degrees

    gx = graphframes.GraphFrame(vertices_out, edges)

    print("Calculating Nearest Neighbour Degree")
    msgToSrc = None
    msgToDst = AM.src["degree"]
    nnd = gx.aggregateMessages(F.avg(AM.msg).alias("nearest-neighbour-degree"),
                               sendToSrc=msgToSrc,
                               sendToDst=msgToDst)
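The aggregate comes back keyed by vertex id; a sketch of joining it back onto the degrees to inspect both figures side by side:

# nnd has columns `id` and `nearest-neighbour-degree`; vertices_out has `id` and `degree`
nnd_with_degree = vertices_out.join(nnd, on="id", how="left")
nnd_with_degree.orderBy(F.desc("degree")).show()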