Example No. 1
from pyspark.mllib.clustering import PowerIterationClustering


def power_iteration_clustering(unclustered_data,
                               number_of_clusters,
                               max_iterations=10,
                               init_mode='random'):
    if number_of_clusters < 1:
        raise ValueError("While clustering with PowerIterationClustering, "
                         "the given number of clusters is not positive")

    # Train a PIC model on an RDD of (src, dst, similarity) tuples
    model = PowerIterationClustering.train(rdd=unclustered_data,
                                           k=number_of_clusters,
                                           maxIterations=max_iterations,
                                           initMode=init_mode)
    # Collect the (id, cluster) assignments to the driver
    assignments = model.assignments().collect()
    return [model, assignments]
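A minimal usage sketch (the app name and the toy edge list are illustrative, not from the original repo); PowerIterationClustering expects an RDD of (src, dst, similarity) tuples with nonnegative similarities:

from pyspark import SparkContext

sc = SparkContext(appName="pic-wrapper-demo")
# Two tightly connected triangles joined by one weak edge
edges = sc.parallelize([
    (0, 1, 1.0), (0, 2, 1.0), (1, 2, 1.0),
    (3, 4, 1.0), (3, 5, 1.0), (4, 5, 1.0),
    (2, 3, 0.1),
])
model, assignments = power_iteration_clustering(edges, number_of_clusters=2)
for a in assignments:
    print(a.id, "->", a.cluster)
sc.stop()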
Example No. 2
import numpy as np
from pyspark import SparkConf, SparkContext
from pyspark.mllib.clustering import PowerIterationClustering


def Power_Iteration_Clustering(X, K, Adj=False, Lp_norm=2, sigma=1, max_Iter=20):
    '''
    Input:
        X: [n_samples, n_samples] numpy array if Adj=True, or a [n_samples, n_features] array otherwise;
        K: int, The number of clusters;
        Adj: boolean, Indicating whether the adjacency matrix is pre-computed. Default: False;
        Lp_norm: int, Indicating which L^p norm to use. Default: 2;
        sigma: float, The variance for the Gaussian (aka RBF) kernel. Default: 1;
        max_Iter: int, Maximum number of iterations of the PIC algorithm. Default: 20.
    Output:
        cluster labels: A [n_samples, ] numpy array;
        node ids: A list of length "n_samples".
    '''
    # Setting up PySpark Context
    conf = SparkConf()
    sc = SparkContext(conf=conf)
    
    if Adj:
        # Concatenate the point ID to the last column of the array
        X1 = np.concatenate((X, np.array(range(X.shape[0]), ndmin=2).T), axis=1)
        data = sc.parallelize(X1.tolist())
        # Manipulate the RDD such that each entry is a tuple of the form (ID, distance_list)
        Adj_matRDD = data.map(lambda x: (int(x[len(x)-1]), x[:(len(x)-1)]))
    else:
        X1 = np.concatenate((X, np.array(range(X.shape[0]), ndmin=2).T), axis=1)
        data = sc.parallelize(X1.tolist())
        data = data.map(lambda x: (int(x[len(x)-1]), x[:(len(x)-1)]))
        # Compute the pairwise distances between points
        Adj_matRDD = data.map(lambda item: Distance_Computing(item, DF=X, p=Lp_norm))
    
    # Transform the affinity matrix so that each element has the form (i, j, s_{ij})
    A_RDD = Adj_matRDD.flatMap(lambda item: Affinity(item, sigma=sigma))
    # Cluster the data into K classes using PowerIterationClustering
    model = PowerIterationClustering.train(A_RDD, K, max_Iter)
    
    cluster_id = model.assignments().collect()
    sc.stop()
    IDs = [k.id for k in cluster_id]
    clusters = [k.cluster for k in cluster_id]
    # Sort the cluster label list based on the ascending order of their IDs
    IDs_sorted = sorted(IDs)
    clusters_sorted = np.array(clusters)[np.argsort(IDs)]
    
    return clusters_sorted, IDs_sorted
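The function above relies on two helpers that are not shown. Plausible definitions that match how they are called (hypothetical sketches, not the original code):

import numpy as np

def Distance_Computing(item, DF, p=2):
    # item = (ID, coordinates); return (ID, list of L^p distances to every row of DF)
    i, point = item
    dists = np.linalg.norm(np.asarray(DF) - np.asarray(point), ord=p, axis=1)
    return (i, dists.tolist())

def Affinity(item, sigma=1):
    # item = (i, distance_list); emit (i, j, s_ij) edges with a Gaussian (RBF) kernel,
    # skipping the self-pair i == j
    i, dists = item
    return [(i, j, float(np.exp(-d ** 2 / (2.0 * sigma ** 2))))
            for j, d in enumerate(dists) if j != i]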
Example No. 3
    lambda x: (x, )).toDF()
scalerModel = scaler.fit(trial_df)
vec_df = spark.createDataFrame(
    scalerModel.transform(trial_df).select("scaled_1").rdd.map(
        lambda x: (float(x[0][0]), float(x[0][1]), float(x[0][2]))))

# Create a RowMatrix from the transpose of the scaled data
spark_df = spark.createDataFrame(vec_df.toPandas().transpose()).rdd
vector_df = sc.parallelize(spark_df.map(lambda s: Vectors.dense(s)).collect())
mat = RowMatrix(vector_df)
bun = mat.rows.collect()
num_clusters = 3

pre = sc.parallelize(mat.columnSimilarities().entries.map(
    lambda e: (e.i, e.j, e.value)).collect())
model = PowerIterationClustering.train(pre, num_clusters, 20, "random")
err = model.assignments().map(lambda x: (Vectors.dense(bun[0][x.id], bun[1][
    x.id], bun[2][x.id]), x.cluster)).collect()

# Silhouette value
ag = 0
agi = 1700
for er in err:
    avg = [0] * num_clusters
    avgi = [0] * num_clusters
    for e in err:
        avg[e[1]] += Vectors.squared_distance(er[0], e[0])
        avgi[e[1]] += 1
    a = avg[er[1]] / avgi[er[1]]
    b = float("inf")  # smallest mean distance to any other cluster, found below
    for i in range(len(avg)):
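The silhouette loop above stops short. A self-contained sketch of the same idea, computed over (coordinates, cluster) pairs such as [(list(v), c) for v, c in err] and using squared distances to mirror the Vectors.squared_distance calls (an illustration under those assumptions, not the missing original code):

def mean_silhouette(points_with_clusters, num_clusters):
    # points_with_clusters: list of (coords, cluster_id) pairs, coords a sequence of floats
    def sq_dist(p, q):
        return sum((a - b) ** 2 for a, b in zip(p, q))

    scores = []
    for p, c in points_with_clusters:
        sums = [0.0] * num_clusters
        counts = [0] * num_clusters
        for q, cq in points_with_clusters:
            sums[cq] += sq_dist(p, q)
            counts[cq] += 1
        a = sums[c] / max(counts[c] - 1, 1)   # mean distance within own cluster (excluding p)
        b = min(sums[k] / counts[k]           # mean distance to the nearest other cluster
                for k in range(num_clusters) if k != c and counts[k] > 0)
        scores.append((b - a) / max(a, b) if max(a, b) > 0 else 0.0)
    return sum(scores) / len(scores)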
Example No. 4
from __future__ import print_function

from pyspark import SparkContext
# $example on$
from pyspark.mllib.clustering import PowerIterationClustering, PowerIterationClusteringModel
# $example off$

if __name__ == "__main__":
    sc = SparkContext(appName="PowerIterationClusteringExample")  # SparkContext

    # $example on$
    # Load and parse the data
    data = sc.textFile("data/mllib/pic_data.txt")
    similarities = data.map(lambda line: tuple([float(x) for x in line.split(' ')]))

    # Cluster the data into two classes using PowerIterationClustering
    model = PowerIterationClustering.train(similarities, 2, 10)

    model.assignments().foreach(lambda x: print(str(x.id) + " -> " + str(x.cluster)))

    # Save and load model
    model.save(sc, "target/org/apache/spark/PythonPowerIterationClusteringExample/PICModel")
    sameModel = PowerIterationClusteringModel\
        .load(sc, "target/org/apache/spark/PythonPowerIterationClusteringExample/PICModel")
    # $example off$

    sc.stop()
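If the bundled data/mllib/pic_data.txt file is not available, an equivalent similarities RDD can be built in memory in place of the sc.textFile load above; each tuple is (src, dst, similarity) with nonnegative similarity (toy values, for illustration only):

similarities = sc.parallelize([
    (0, 1, 1.0), (0, 2, 1.0), (1, 2, 1.0),
    (3, 4, 1.0), (3, 5, 1.0), (4, 5, 1.0),
])
model = PowerIterationClustering.train(similarities, 2, 10)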
Example No. 5
meta = raw_meta.map(parse_meta)
auths = raw_auth.map(parse_auth)

uniq_auths = auths.flatMap(lambda d: d["authors"]).distinct()
uniq_auths_int = uniq_auths.zipWithIndex().collectAsMap()

int_auth = sc.parallelize(int_auth_map(uniq_auths_int), 100)

authID_pairs = auths.flatMap(lambda x: auth_pairs(x, uniq_auths_int))

auth_ct = authID_pairs.map(auth_pairs_ct).reduceByKey(lambda a, b: a + b, numPartitions=100)
# Sort co-author pairs by count (descending) by swapping key and value and back again
auth_ct = auth_ct.map(lambda kv: (kv[1], kv[0])).sortByKey(ascending=False, numPartitions=100).map(lambda kv: (kv[1], kv[0]))

auth_net_edges = auth_ct.map(auth_ct_to_three_tpl)

auth_cluster_mdl = PowerIterationClustering.train(auth_net_edges, 200, 25)

result = sc.parallelize(sorted(auth_cluster_mdl.assignments().collect(), key=lambda x: x.id), 100)
result = result.map(result_to_tpl)

auth_net_clust = int_auth.join(result).map(lambda d: (d[1][0], d[1][1]))

auth_net_edges_names = auth_net_edges.map(lambda d: (d[0], (d[1], d[2])))
auth_net_edges_names = int_auth.join(auth_net_edges_names).map(lambda d: (d[1][1][0], (d[1][0], d[1][1][1])))
auth_net_edges_names = int_auth.join(auth_net_edges_names).map(lambda d: (d[1][1][0], d[1][0], d[1][1][1]))

auth_nodes_out = "s3://XX/auth_nodes"
auth_nodes_lines = auth_net_clust.map(auths_nodes_clusters_to_csv)
auth_nodes_lines.coalesce(1).saveAsTextFile(auth_nodes_out)

auth_edges_out = "s3://XX/auth_edges"
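The snippet assumes several helpers that are not shown; the two that shape the PIC input and output could plausibly look like this (hypothetical sketches matching how they are used, not the original definitions):

def auth_ct_to_three_tpl(pair_count):
    # ((author_i, author_j), count) -> (author_i, author_j, weight) edge for PIC
    (i, j), count = pair_count
    return (i, j, float(count))

def result_to_tpl(assignment):
    # PIC Assignment(id, cluster) -> (id, cluster), so it can be joined with int_auth
    return (assignment.id, assignment.cluster)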
Example No. 6
if __name__ == "__main__":

    # Create the Spark context
    sc = SparkContext(appName="Spectral Clustering in Spark")
    # Read the input file
    input_file_RDD = sc.textFile(input_file)

    withIndex = input_file_RDD.map(split_function).zipWithIndex()
    indexKey = withIndex.map(lambda kv: (kv[1], kv[0]))

    C = indexKey.cartesian(indexKey)

    input_affinities = C.map(affinities)

    model = PowerIterationClustering.train(input_affinities, num_clusters, upper_bound)

    joined = sc.parallelize(sorted(indexKey.join(model.assignments()).collect()))

    if (num_clusters == 2):
        two_clusters(joined)
        plt.scatter(x1, y1, c='r')
        plt.scatter(x2, y2, c='g')
        plt.show()
    elif (num_clusters == 3):
        three_clusters(joined)
        plt.scatter(x1, y1, c='r')
        plt.scatter(x2, y2, c='g')
        plt.scatter(x3, y3, c='b')
        plt.show()
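split_function and affinities are defined elsewhere in the original script; versions consistent with how they are used above (hypothetical sketches, assuming a Gaussian kernel with a fixed bandwidth) might be:

import math

def split_function(line):
    # Parse one text line into a list of float coordinates
    return [float(x) for x in line.split()]

def affinities(pair, sigma=1.0):
    # pair = ((i, point_i), (j, point_j)) from the cartesian product;
    # return an (i, j, similarity) edge using a Gaussian (RBF) kernel
    (i, pi), (j, pj) = pair
    sq_dist = sum((a - b) ** 2 for a, b in zip(pi, pj))
    return (i, j, math.exp(-sq_dist / (2.0 * sigma ** 2)))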
Example No. 7
	meta_math = meta.map(lambda d: (d["id"], d["subj"])).filter(lambda x: x[1] == u"math")

	uniq_papers = meta_math.map(lambda d: d[0]).distinct()
	uniq_papers_int = uniq_papers.zipWithIndex().collectAsMap()
	int_paper = sc.parallelize(int_paper_map(uniq_papers_int), numSlices=numParts)
	paper_int = int_paper.map(lambda x: (x[1], x[0]))

	intID_words = paper_int.join(id_words, numPartitions=numParts).map(lambda x: (x[1][0], x[1][1]))

	pairs = intID_words.cartesian(intID_words)

	sims = pairs.map(lambda x: (x[0][0], x[1][0], compute_jaccard(x[0][1], x[1][1])))
	# Drop self-pairs and identical word sets (Jaccard similarity of exactly 1.0)
	sims = sims.filter(lambda x: x[2] < 1.0)

	subj_cluster_mdl = PowerIterationClustering.train(sims, 5, 25)

	result = sc.parallelize(sorted(subj_cluster_mdl.assignments().collect(), key=lambda x: x.id), numSlices=numParts)
	result = result.map(result_to_tpl)

	words_net_clust = int_paper.join(result, numPartitions=numParts).map(lambda d: (d[1][0], d[1][1]))

	words_net_clust_subj = words_net_clust.join(subj, numPartitions=numParts).map(lambda d: (d[0], d[1][1], d[1][0]))

	words_edges = sims.filter(lambda x: x[2] > 0.0)

	words_edges_names = words_edges.map(lambda d: (d[0], (d[1], d[2])))
	words_edges_names = int_paper.join(words_edges_names, numPartitions=numParts).map(lambda d: (d[1][1][0], (d[1][0], d[1][1][1])))
	words_edges_names = int_paper.join(words_edges_names, numPartitions=numParts).map(lambda d: (d[1][1][0], d[1][0], d[1][1][1]))

	words_nodes_out = "s3://XX/words_nodes"
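compute_jaccard is assumed to compare the word sets of two papers; a plausible definition (a sketch, not the original) is:

def compute_jaccard(words_a, words_b):
    # Jaccard similarity of two word collections: |A intersect B| / |A union B|
    a, b = set(words_a), set(words_b)
    if not a and not b:
        return 0.0
    return float(len(a & b)) / len(a | b)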
Example No. 8
# Start working with Spark
app_name = "PIC_Amazon_20030601"
source_path = "/home/ophidian/dataset/Amazon20030601_transform.txt"
my_model_path = "/home/ophidian/pyspark_models/PIC_amazon0601"
out_path = "/home/ophidian/pyspark_results/PIC_amazon0601.result"

conf = SparkConf().setAppName(app_name)
sc = SparkContext(conf=conf)

"""
# example in document
data = sc.textFile("data/mllib/pic_data.txt")
similarities = data.map(lambda line: tuple([float(x) for x in line.split(' ')]))
model = PowerIterationClustering.train(similarities, 2, 10)
"""

# Use PySpark PIC clustering on the graph vertices
# Load data
data = sc.textFile(source_path)
weighted_edges = data.map(lambda line: tuple([float(x) for x in line.split(' ')]))
# Cluster the data into 10 classes using PowerIterationClustering
model = PowerIterationClustering.train(weighted_edges, 10, 100)

#model.assignments().foreach(lambda x: print(str(x.id) + " -> " + str(x.cluster)))
# Write each assignment as "id -> cluster"; collect to the driver, since the local
# file handle only exists there (a foreach would run on the executors)
with open(out_path, "w") as out_file:
    for x in model.assignments().collect():
        out_file.write(str(x.id) + " -> " + str(x.cluster) + "\n")

# Save and load model
model.save(sc, my_model_path)
#sameModel = PowerIterationClusteringModel.load(sc, my_model_path)
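For large graphs, the assignments can instead be written by the executors themselves, avoiding the collect to the driver (a sketch; the output directory below is illustrative):

model.assignments() \
    .map(lambda x: str(x.id) + " -> " + str(x.cluster)) \
    .saveAsTextFile("/home/ophidian/pyspark_results/PIC_amazon0601_assignments")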
Example No. 9
if __name__ == "__main__":

    # Create the Spark context
    sc = SparkContext(appName="Spectral Clustering in Spark")
    # Read the input file
    input_file_RDD = sc.textFile(input_file)

    withIndex = input_file_RDD.map(split_function).zipWithIndex()
    indexKey = withIndex.map(lambda kv: (kv[1], kv[0]))

    C = indexKey.cartesian(indexKey)

    input_affinities = C.map(affinities)

    model = PowerIterationClustering.train(input_affinities, num_clusters,
                                           upper_bound)

    joined = sc.parallelize(
        sorted(indexKey.join(model.assignments()).collect()))

    if (num_clusters == 2):
        two_clusters(joined)
        plt.scatter(x1, y1, c='r')
        plt.scatter(x2, y2, c='g')
        plt.show()
    elif (num_clusters == 3):
        three_clusters(joined)
        plt.scatter(x1, y1, c='r')
        plt.scatter(x2, y2, c='g')
        plt.scatter(x3, y3, c='b')
        plt.show()
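two_clusters and three_clusters are not shown; a version of the first that is consistent with how it is called above (filling the module-level coordinate lists read by the plt.scatter calls) might look like this hypothetical sketch, with three_clusters extending the same pattern with a third pair of lists:

def two_clusters(joined):
    # joined holds (index, (point, cluster)) pairs produced by the join above
    global x1, y1, x2, y2
    x1, y1, x2, y2 = [], [], [], []
    for _, (point, cluster) in joined.collect():
        xs, ys = (x1, y1) if cluster == 0 else (x2, y2)
        xs.append(float(point[0]))
        ys.append(float(point[1]))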