# requires: from pyspark.mllib.clustering import KMeans
def spark_means(self, Matrix, Kcluster=2, MaxIterations=10, runs=10):
    # Distribute the rows of Matrix, train a k-means model, and
    # return the predicted cluster index for every row.
    cluster_data = self.sc.parallelize(Matrix)
    trains = KMeans.train(cluster_data, Kcluster, MaxIterations, runs)
    results = trains.predict(cluster_data).collect()
    return results
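# A minimal usage sketch (an assumption for illustration, not part of the
# original code): it presumes spark_means is reachable at module level and
# fakes the owning instance with a SimpleNamespace carrying a SparkContext.
from types import SimpleNamespace
from pyspark import SparkContext

sc = SparkContext(appName="spark_means_demo")
points = [[1.0, 1.0], [1.2, 0.8], [8.0, 9.0], [8.5, 9.5]]  # two tight groups
holder = SimpleNamespace(sc=sc)  # stands in for the real class instance
print(spark_means(holder, points, Kcluster=2, MaxIterations=10, runs=3))
# e.g. [0, 0, 1, 1] -- one label per input row
sc.stop()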
# Train k-means models (pyspark.mllib) on the movie and user factor vectors.
movie_cluster_model = KMeans.train(movie_vectors, k=5, maxIterations=10, runs=3)
print("movie cluster model kmeans :")
print(movie_cluster_model)
user_cluster_model = KMeans.train(user_vectors, k=5, maxIterations=10, runs=3)
print("user cluster model kmeans :")
print(user_cluster_model)

# predict the cluster of a single movie vector
movie_1 = movie_vectors.first()
movie_cluster = movie_cluster_model.predict(movie_1)
print(movie_cluster)

# evaluation: within-cluster sum of squares (WCSS)
movie_cost = movie_cluster_model.computeCost(movie_vectors)
print("WCSS for movies :" + str(movie_cost))

train_test_split_movies = movie_vectors.randomSplit((0.6, 0.4), 123)
train_movies = train_test_split_movies[0]
test_movies = train_test_split_movies[1]

def costs_movies(cluster, train, test):
    # Train a model for each candidate k and report WCSS on the held-out split.
    for c in cluster:
        m = KMeans.train(train, k=c, maxIterations=10, runs=3)
        wcss = m.computeCost(test)
        print("WCSS for k=" + str(c) + ":" + str(wcss))
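# Hypothetical sweep over candidate k values using the train/test split
# built above (the specific k list is an illustrative assumption, not from
# the original code); comparing the printed WCSS values is the usual way
# to pick k by the elbow method.
costs_movies([2, 3, 4, 5, 10, 20], train_movies, test_movies)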
# requires: from pyspark.mllib.clustering import KMeans
def K_means(self, data):
    # Parallelize the input, train a k-means model with the instance's
    # k, iteration, and runs settings, and return per-point cluster labels.
    cluster_data = self.sc.parallelize(data)
    trains = KMeans.train(cluster_data, self.k, self.iteration, self.runs)
    results = trains.predict(cluster_data).collect()
    return results
# requires: import pyspark; import numpy as np; from numpy import array;
#           from sklearn.cluster import KMeans  (scikit-learn's, unlike above)
# (fragment: tail of an earlier .persist(...) call on first_part)
    pyspark.StorageLevel.DISK_ONLY)

# add the first sample's ids to data_used and remove those rows from originalRDD
data_used = first_part.map(lambda line: int(line[0])).collect()
data_used = set(data_used)
originalRDD = originalRDD.filter(lambda line: int(line[0]) not in data_used)

# Train a k-means model with 5 * the requested number of clusters,
# then predict a cluster for every point in the first part.
train_data = first_part.map(lambda line: array(line[2:]))
train_data = np.array(train_data.collect())
kmeans = KMeans(n_clusters=input_clusters * 5, random_state=0).fit(train_data)
print(kmeans.labels_)
results = first_part.map(
    lambda line: (kmeans.predict([line[2:]]), [int(line[0])])
).map(
    lambda line: (line[0].tolist()[0], line[1])
).reduceByKey(lambda a, b: a + b).persist(pyspark.StorageLevel.DISK_ONLY)

# separate the clusters with only one point and add them to the retained set
RetainedSetRDD = results.filter(lambda line: len(line[1]) == 1).map(lambda line: line[1][0])
retained_set.update(set(RetainedSetRDD.collect()))
# print(retained_set)

# Running k-means on the candidates for the Discard Set
remaining = results.filter(lambda line: len(line[1]) > 1).flatMap(lambda line: line[1])
remaining = set(remaining.collect())
# print(remaining)
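# The snippet breaks off just as the Discard Set step begins. A possible
# continuation (an assumption, not the original code): re-cluster the
# multi-point candidates with k = input_clusters and keep the BFR-style
# (N, SUM, SUMSQ) summary per cluster. Names like remaining_rows and
# ds_model are illustrative.
import numpy as np
from sklearn.cluster import KMeans

remaining_rows = first_part.filter(lambda line: int(line[0]) in remaining).collect()
remaining_data = np.array([row[2:] for row in remaining_rows], dtype=float)
ds_model = KMeans(n_clusters=input_clusters, random_state=0).fit(remaining_data)

discard_set = {}  # cluster id -> (N, SUM, SUMSQ) summary statistics
for label in set(ds_model.labels_):
    points = remaining_data[ds_model.labels_ == label]
    discard_set[label] = (len(points), points.sum(axis=0), (points ** 2).sum(axis=0))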