import pandas as pd
from pyspark.ml.clustering import KMeans
from pyspark.ml.evaluation import ClusteringEvaluator


def pick_k(df_vec, sample_rate=0.0005, sample_size=5, ktop=10):
    """
    Input:
        df_vec: pyspark dataframe with a "features" vector column
        sample_rate: float, the ratio used when sampling df_vec
        sample_size: int, how many times to run the elbow-cost and silhouette evaluations
        ktop: int, the upper bound of the k range to evaluate
    Output:
        pyspark dataframe with one row per (seed, k) holding the elbow cost and silhouette score
    """
    choose_k_list = []
    for seed in range(sample_size):
        df_sample = df_vec.sample(False, sample_rate, seed=seed)  # withReplacement: False
        for k in range(2, ktop + 1):
            kmeans = KMeans(k=k, seed=seed)
            tmp_model = kmeans.fit(df_sample)
            elbow_cost = tmp_model.summary.trainingCost
            predictions = tmp_model.transform(df_sample)
            evaluator = ClusteringEvaluator()
            silhouette = evaluator.evaluate(predictions)
            # Record one row per (seed, k) so both metrics can be compared across runs.
            choose_k_list.append([seed, k, elbow_cost, silhouette])
    # Assumes an active SparkSession named `spark` in scope.
    return spark.createDataFrame(
        pd.DataFrame(choose_k_list,
                     columns=["seed", "k", "elbow_cost", "silhouette"]))
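# Hedged usage sketch for pick_k (not from the original source): assumes a raw
# DataFrame `df_raw` with numeric columns "x" and "y" -- hypothetical names
# used only for illustration.
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler
import pyspark.sql.functions as F

spark = SparkSession.builder.getOrCreate()  # pick_k also expects a global `spark`
assembler = VectorAssembler(inputCols=["x", "y"], outputCol="features")
df_vec_demo = assembler.transform(df_raw)

result = pick_k(df_vec_demo, sample_rate=0.01, sample_size=3, ktop=8)
# Average the silhouette over seeds and report the best-scoring k.
best = (result.groupBy("k")
              .agg(F.avg("silhouette").alias("mean_silhouette"))
              .orderBy(F.desc("mean_silhouette"))
              .first())
print("best k by mean silhouette:", best["k"])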
def kmeans_usecase():
    spark = getSparkSession()
    # Build a schema string of 65 DOUBLE columns: "_c0 DOUBLE,_c1 DOUBLE,..."
    schema = ','.join('_c{} DOUBLE'.format(i) for i in range(65))
    df_train = spark.read.csv('../data/optdigits.tra', schema=schema)
    df_test = spark.read.csv('../data/optdigits.tes', schema=schema)
    # Columns are already named _c0.._c64 by the schema; the last one is the
    # digit label, the first 64 are pixel features.
    cols = ['_c{}'.format(i) for i in range(65)]
    assembler = VectorAssembler(inputCols=cols[:-1], outputCol="features")
    train_output = assembler.transform(df_train)
    test_output = assembler.transform(df_test)
    train_features = train_output.select("features")
    test_features = test_output.select("features")
    train_features.show(truncate=False)
    test_features.show(truncate=False)
    kmeans = KMeans().setK(10).setSeed(1)
    model = kmeans.fit(train_features)
    predictions = model.transform(test_features)
    evaluator = ClusteringEvaluator()
    silhouette = evaluator.evaluate(predictions)
    print("Silhouette with squared euclidean distance = " + str(silhouette))
def kmeans(data):
    (trainingData, testData) = data.randomSplit([0.7, 0.3])

    # Trains a k-means model.
    kmeans = KMeans().setK(2).setSeed(1)
    model = kmeans.fit(trainingData)

    # Make predictions
    predictions = model.transform(testData)

    # Evaluate clustering by computing Silhouette score
    evaluator = ClusteringEvaluator()
    silhouette = evaluator.evaluate(predictions)
    print("Silhouette with squared euclidean distance = " + str(silhouette))

    # Shows the result.
    centers = model.clusterCenters()
    print("Cluster Centers: ")
    for center in centers:
        print(center)

    predictions.select("prediction", "label", "features").show(5)
    print("prediction=1.0 count: " + str(predictions.filter("prediction=1.0").count()))
    print("label=1.0 count: " + str(predictions.filter("label=1.0").count()))
    print("total count: " + str(predictions.count()))

    # Select (prediction, true label) and compute test error
    evaluator = MulticlassClassificationEvaluator(
        labelCol="indexedLabel", predictionCol="prediction", metricName="accuracy")
    predictions = predictions.withColumn("prediction", predictions["prediction"].cast("double"))
    predictions = predictions.withColumnRenamed("label", "indexedLabel")
    accuracy = evaluator.evaluate(predictions)
    print("Test Error = %g" % (1.0 - accuracy))
def main(spark, model_file, data_file):
    '''Main routine for unsupervised evaluation

    Parameters
    ----------
    spark : SparkSession object
    model_file : string, path to the serialized model file
    data_file : string, path to the parquet file to load
    '''
    DF = spark.read.parquet(data_file)
    DF = DF.select("mfcc_00", "mfcc_01", "mfcc_02", "mfcc_03", "mfcc_04",
                   "mfcc_05", "mfcc_06", "mfcc_07", "mfcc_08", "mfcc_09",
                   "mfcc_10", "mfcc_11", "mfcc_12", "mfcc_13", "mfcc_14",
                   "mfcc_15", "mfcc_16", "mfcc_17", "mfcc_18", "mfcc_19")
    # The loaded PipelineModel is expected to assemble and scale these columns
    # itself, so no separate VectorAssembler/scaler step is needed here.
    model = PipelineModel.load(model_file)
    predictions = model.transform(DF)
    evaluator = ClusteringEvaluator()
    result = evaluator.evaluate(predictions)
    print(str(result))
def kmeans(coordinates_list, spark):
    coordinates_list = [
        [float(coordinates[0]), float(coordinates[1])]
        for coordinates in coordinates_list
    ]
    df = spark.createDataFrame(coordinates_list, ["Longitude", "Latitude"])
    vecAssembler = VectorAssembler(
        inputCols=["Longitude", "Latitude"], outputCol="features"
    )
    new_df = vecAssembler.transform(df)
    silhouettes = []
    for k in range(2, 10):
        kmeans = KMeans().setK(k).setSeed(1)
        model = kmeans.fit(new_df.select("features"))
        predictions = model.transform(new_df)
        evaluator = ClusteringEvaluator()
        silhouette = evaluator.evaluate(predictions)
        silhouettes.append([silhouette, predictions, k])
    # Keep the clustering with the highest silhouette score.
    _, predictions, k = max(silhouettes, key=lambda x: x[0])
    predictions.show()
    print(k)
    return predictions
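# Hedged usage sketch for the coordinate-clustering helper above (not from the
# original source); the coordinates are synthetic, generated only so the k=2..9
# scan inside the function has enough points to work with.
from pyspark.sql import SparkSession
import random

spark = SparkSession.builder.appName("coordinate-kmeans-demo").getOrCreate()
random.seed(0)
coords = [[str(random.uniform(-74.0, -73.0)), str(random.uniform(45.0, 46.0))]
          for _ in range(100)]
best_predictions = kmeans(coords, spark)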
def get_evaluation_results(self, predictions):
    self.evaluator = ClusteringEvaluator()
    silhouette = self.evaluator.evaluate(predictions)
    result = '\nsilhouette_score:' + str(silhouette)
    with open('kmeansresult.txt', 'a+') as fp:
        fp.write(result)
    return silhouette
def main(spark, model_file, data_file):
    '''Main routine for unsupervised evaluation

    Parameters
    ----------
    spark : SparkSession object
    model_file : string, path to the serialized model file
    data_file : string, path to the parquet file to load
    '''
    K_Model = PipelineModel.load(model_file)
    df = spark.read.parquet(data_file)
    df_mfcc = df.select("mfcc_00", "mfcc_01", "mfcc_02", "mfcc_03", "mfcc_04",
                        "mfcc_05", "mfcc_06", "mfcc_07", "mfcc_08", "mfcc_09",
                        "mfcc_10", "mfcc_11", "mfcc_12", "mfcc_13", "mfcc_14",
                        "mfcc_15", "mfcc_16", "mfcc_17", "mfcc_18", "mfcc_19")
    predictions = K_Model.transform(df_mfcc)
    evaluator = ClusteringEvaluator()
    K_model_evaluation = evaluator.evaluate(predictions)
    print("Score of K-Means Clustering Model: ", str(K_model_evaluation))
def find_elbow(self):
    x, y = [], []
    for k in range(2, 50):
        # Define the model; the seed is fixed between iterations
        # to prevent it from being a source of variance.
        kmeans = self.kmeans_type(k=k, seed=SEED)
        model = kmeans.fit(self.dataset)

        # Make predictions; we predict straight on our training
        # dataset since the clustering was derived from it.
        predictions = model.transform(self.dataset)

        # Compute the silhouette score.
        evaluator = ClusteringEvaluator()
        silhouette = evaluator.evaluate(predictions)
        x.append(k)
        y.append(silhouette)

    ax = sns.lineplot(x=x, y=y, palette="coolwarm", marker="o")
    ax.set_xlabel("Number of Clusters")
    ax.set_ylabel("Silhouette Score")
    ax.set_title("Cluster Quality by Number of Clusters")
    plot_name = f"elbow-{self.dataset_name}-{self.kmeans_name}.png"
    plt.savefig(os.path.join("analysis", "results", "charts", plot_name))
def train_cluster(df, k):
    evaluator = ClusteringEvaluator(predictionCol='cluster',
                                    featuresCol='final_features_scaled',
                                    metricName='silhouette',
                                    distanceMeasure='squaredEuclidean')
    kmeans = KMeans() \
        .setK(k) \
        .setFeaturesCol("final_features_scaled") \
        .setPredictionCol("cluster")
    kmeans_model = kmeans.fit(df)
    output = kmeans_model.transform(df)
    score = evaluator.evaluate(output)
    print("k: {}, silhouette score: {}".format(k, score))

    expr_mean = [F.avg(col).alias(col + '_mean') for col in final_features]
    # A grouped-aggregate pandas UDF could compute per-cluster medians as well:
    # @pandas_udf(FloatType(), functionType=PandasUDFType.GROUPED_AGG)
    # def _func_median(v):
    #     return v.median()
    # expr_median = [_func_median(output[col]).alias(col + '_median') for col in numeric_features]
    # df_median = output.groupBy('cluster').agg(*expr_median).toPandas()
    df_mean = output.groupBy('cluster').agg(
        F.count(F.lit(1)).alias("audience_num"), *expr_mean).toPandas()
    # result = pd.merge(df_mean, df_median, on='cluster')
    return output, df_mean
def main(spark, model_file, data_file):
    '''Main routine for unsupervised evaluation

    Parameters
    ----------
    spark : SparkSession object
    model_file : string, path to the serialized model file
    data_file : string, path to the parquet file to load
    '''
    # Load the Pipeline model we trained
    model = PipelineModel.load(model_file)

    # Read the validation data
    val = spark.read.parquet(data_file)

    # Predictions
    predictions = model.transform(val)

    # Evaluation
    evaluator = ClusteringEvaluator(predictionCol='prediction',
                                    featuresCol='scaled_features')
    result = evaluator.evaluate(predictions)
    print("Silhouette with squared euclidean distance = " + str(result))
def cluster(self):
    from pyspark.ml.clustering import KMeans
    from pyspark.ml.evaluation import ClusteringEvaluator

    # Loads data.
    dataset = self.read.format("libsvm").load(
        self.dataDir + "data/mllib/sample_kmeans_data.txt")

    # Trains a k-means model.
    kmeans = KMeans().setK(2).setSeed(1)
    model = kmeans.fit(dataset)

    # Make predictions
    predictions = model.transform(dataset)

    # Evaluate clustering by computing Silhouette score
    evaluator = ClusteringEvaluator()
    silhouette = evaluator.evaluate(predictions)
    print("Silhouette with squared euclidean distance = " + str(silhouette))

    # Shows the result.
    centers = model.clusterCenters()
    print("Cluster Centers: ")
    for center in centers:
        print(center)
def main(spark, model_file, data_file):
    '''Main routine for unsupervised evaluation

    Parameters
    ----------
    spark : SparkSession object
    model_file : string, path to the serialized model file
    data_file : string, path to the parquet file to load
    '''
    # Load data.
    dataset = spark.read.parquet(data_file)

    # Load the k-means pipeline model.
    model = PipelineModel.load(model_file)

    # Make predictions
    predictions = model.transform(dataset)

    # Evaluate clustering by computing Silhouette score
    evaluator = ClusteringEvaluator()
    silhouette = evaluator.evaluate(predictions)
    print("Silhouette with squared euclidean distance = " + str(silhouette))
def kmeans_scan(_data, _k_min=2, _k_max=6, _tmp_dir='tmp_models'):
    """Scan k-means models over the specified k range.

    The function assumes the input data are ready to be used and
    already contain the features column.
    """
    # Define the evaluator used to find the optimal k; it computes the Silhouette score.
    evaluator = ClusteringEvaluator()

    # Dictionaries used to save the results obtained for the different k considered.
    silhouette_scores = {}
    centers = {}

    # If the temporary directory already exists, remove it and create a fresh one.
    # Other ways of handling this case are possible but won't be considered here;
    # extending to them is straightforward.
    if os.path.exists(_tmp_dir):
        shutil.rmtree(_tmp_dir)
    os.mkdir(_tmp_dir)

    # Fit and save a model for each k in the specified range.
    for k in range(_k_min, _k_max + 1):
        kmeans = KMeans().setK(k).setSeed(1).setFeaturesCol('features')
        model = kmeans.fit(_data)
        transformed = model.transform(_data)
        silhouette_scores[k] = evaluator.evaluate(transformed)
        centers[k] = model.clusterCenters()
        model.save(os.path.join(_tmp_dir, "model_w_k_{}".format(k)))
    return centers, silhouette_scores
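# Hedged usage sketch for kmeans_scan (not from the original source): pick the
# k with the highest silhouette and reload the model saved for it. Assumes
# `df_features` (a hypothetical name) already carries a "features" column.
from pyspark.ml.clustering import KMeansModel
import os

centers, silhouette_scores = kmeans_scan(df_features, _k_min=2, _k_max=6)
best_k = max(silhouette_scores, key=silhouette_scores.get)
print("best k:", best_k, "silhouette:", silhouette_scores[best_k])
best_model = KMeansModel.load(os.path.join("tmp_models", "model_w_k_{}".format(best_k)))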
def main():
    spark = SparkSession.builder.getOrCreate()

    # load dataset
    # datapath = os.path.dirname(os.path.dirname(os.path.abspath(sys.argv[0])))
    # dataset = spark.read.json(datapath + '/data/business.json')
    filename = '/Users/nicolasg-chausseau/Downloads/yelp_dataset/business_MTL_ONLY.json'
    # filename = '/Users/nicolasg-chausseau/Downloads/yelp_dataset/review_MTL_ONLY.json'
    dataset = spark.read.json(filename)
    print(dataset)

    # get the first category, longitude and latitude
    ll = dataset.select(dataset.categories[0], dataset.longitude, dataset.latitude)
    ll = ll.withColumnRenamed('categories[0]', 'categories')
    ll.show()
    print(ll.schema.names)

    # assemble longitude/latitude into a dense feature vector
    assembler = VectorAssembler(inputCols=['longitude', 'latitude'], outputCol='features')
    df = assembler.transform(ll)

    # set KMeans k and seed
    kmeans = KMeans(k=4, seed=1)

    # generate model
    model = kmeans.fit(df)

    # Make predictions
    predictions = model.transform(df)
    predictions.show(20)

    # Evaluate clustering by computing Silhouette score
    evaluator = ClusteringEvaluator()
    silhouette = evaluator.evaluate(predictions)
    print("Silhouette with squared euclidean distance = " + str(silhouette))

    # number of locations in each cluster
    print('Number of businesses in each cluster: ')
    predictions.groupBy('prediction').count().sort(desc('count')).show()

    # show which cluster has the most restaurants
    print('Number of restaurants per cluster')
    predictions.where(predictions.categories == 'Restaurants').groupBy(
        'prediction').count().sort(desc('count')).show()

    # Shows the result.
    centers = model.clusterCenters()
    print("Cluster Centers: ")
    for center in centers:
        print(center)
def getAllSimilar(self):
    resp = []
    try:
        listings = spark.read.format('org.apache.spark.sql.cassandra').options(
            table=listings_table, keyspace=keyspace).load().cache()
        # read favorited listings
        favorites = spark.read.format('org.apache.spark.sql.cassandra').options(
            table=fav_table, keyspace=keyspace).load().cache()
        if not favorites.rdd.isEmpty():
            # one value at any given time
            city = spark.sql("SELECT city FROM listings AS l "
                             "WHERE l.postingid=(SELECT postingid FROM favorites "
                             "WHERE userid='potato' LIMIT 1)").collect()[0]['city']
            # get all listings within the city
            data = listings.where(listings['city'] == city)
            kval = 15
            prediction = self.clusterize(data, kval)
            evaluator = ClusteringEvaluator()
            silhouette = evaluator.evaluate(prediction)  # score
            print('Silhouette score (k=%s) with Euclidean distance: %s' % (kval, silhouette))
            # find similar posts
            similar = self.find_similar(prediction, favorites)
            resp = similar.drop('features').rdd.collect()
    except Exception as e:
        print("Error fetching similar items:", e)
    return resp
def train_cluster(df, k):
    evaluator = ClusteringEvaluator(predictionCol='cluster',
                                    featuresCol='final_features_scaled',
                                    metricName='silhouette',
                                    distanceMeasure='squaredEuclidean')
    kmeans = KMeans() \
        .setK(k) \
        .setFeaturesCol("final_features_scaled") \
        .setPredictionCol("cluster")
    kmeans_model = kmeans.fit(df)
    output = kmeans_model.transform(df)
    score = evaluator.evaluate(output)
    print("k: {}, silhouette score: {}".format(k, score))

    expr_mean = [F.avg(col).alias(col + '_mean') for col in final_features]
    # percentile(col, array(0.5)) returns a one-element array holding the median.
    expr_median = [
        F.expr('percentile({}, array(0.5))'.format(col))[0].alias(col + '_median')
        for col in final_features
    ]
    df_median = output.groupBy('cluster').agg(*expr_median).toPandas()
    df_mean = output.groupBy('cluster').agg(
        F.count(F.lit(1)).alias("audience_num"), *expr_mean).toPandas()
    return output, df_median, df_mean
def optimal_k(df_in, k_min, k_max, num_runs):
    '''
    Determine the optimal number of clusters by Silhouette score analysis.

    :param df_in: the input dataframe
    :param k_min: the minimum number of clusters
    :param k_max: the maximum number of clusters
    :param num_runs: the number of runs for each fixed number of clusters

    :return k: optimal number of clusters
    :return silhouette: Silhouette score per k
    :return r_table: the running-results table
    '''
    from pyspark.ml.clustering import KMeans
    from pyspark.ml.evaluation import ClusteringEvaluator
    import time
    import numpy as np
    import pandas as pd

    start = time.time()
    silh_lst = []
    k_lst = np.arange(k_min, k_max + 1)
    r_table = pd.DataFrame(index=range(df_in.count()))

    for k in k_lst:
        silh_val = []
        for run in np.arange(1, num_runs + 1):
            # Trains a k-means model.
            kmeans = KMeans() \
                .setK(int(k)) \
                .setSeed(int(np.random.randint(100, size=1)))
            model = kmeans.fit(df_in)

            # Make predictions
            predictions = model.transform(df_in)
            r_table['cluster_{k}_{run}'.format(
                k=k, run=run)] = predictions.select('prediction').toPandas()

            # Evaluate clustering by computing Silhouette score
            evaluator = ClusteringEvaluator()
            silhouette = evaluator.evaluate(predictions)
            silh_val.append(silhouette)

        silh_array = np.asanyarray(silh_val)
        silh_lst.append(silh_array.mean())

    elapsed = time.time() - start
    silhouette = pd.DataFrame(list(zip(k_lst, silh_lst)),
                              columns=['k', 'silhouette'])
    print('+------------------------------------------------------------+')
    print("|        The optimal-k search phase took %8.0f s.        |" % (elapsed))
    print('+------------------------------------------------------------+')
    return k_lst[np.argmax(silh_lst, axis=0)], silhouette, r_table
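# Hedged usage sketch for optimal_k (not from the original source): assumes a
# DataFrame `df_features` (hypothetical) with the default "features" column
# expected by KMeans.
k_best, silhouette_table, run_table = optimal_k(df_features, k_min=2, k_max=8, num_runs=3)
print("optimal k:", k_best)
print(silhouette_table)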
def test_clustering_evaluator_with_cosine_distance(self):
    featureAndPredictions = map(lambda x: (Vectors.dense(x[0]), x[1]),
                                [([1.0, 1.0], 1.0), ([10.0, 10.0], 1.0),
                                 ([1.0, 0.5], 2.0), ([10.0, 4.4], 2.0),
                                 ([-1.0, 1.0], 3.0), ([-100.0, 90.0], 3.0)])
    dataset = self.spark.createDataFrame(featureAndPredictions,
                                         ["features", "prediction"])
    evaluator = ClusteringEvaluator(predictionCol="prediction",
                                    distanceMeasure="cosine")
    self.assertEqual(evaluator.getDistanceMeasure(), "cosine")
    self.assertTrue(np.isclose(evaluator.evaluate(dataset), 0.992671213, atol=1e-5))
def silhouette_score(self, data):
    """Return the silhouette score of `data` under the fitted model."""
    predictions = self.model.transform(data)
    evaluator = ClusteringEvaluator()
    silhouette = evaluator.evaluate(predictions)
    print(f"silhouette score: {silhouette:.4f}")
    return silhouette
def evaluators(self):
    evaluator = ClusteringEvaluator()
    silhouette = evaluator.evaluate(self.predictions)
    print("Silhouette with squared euclidean distance = " + str(silhouette))

    # Shows the result.
    centers = self.model.clusterCenters()
    print("Cluster Centers: ")
    for center in centers:
        print(center)
def plot_silhouette(data):
    evaluator = ClusteringEvaluator()
    ks = range(2, 50)
    scores = []
    for k in ks:
        kmeans = KMeans(k=k)
        model = kmeans.fit(data)
        predictions = model.transform(data)
        scores.append(evaluator.evaluate(predictions))
    # Plot the full k range against the scores, not just the last k.
    plt.plot(list(ks), scores)
    plt.show()
def findSimillar():
    # Dealing with the server request
    # project_ID = request.args.get('project_ID', None)
    project_ID = 'afd99a01739ad5557b51b1ba0174e832'
    projects.createOrReplaceTempView('projects')
    silhouette = []
    cols = ["Project_Subject_Category_Tree", "Project_Subject_Subcategory_Tree",
            "Project_Grade_Level_Category", "Project_Resource_Category"]
    colsa = []
    # df = projects.select(cols)
    df = projects
    df = df.where(df.Project_Subject_Category_Tree.isNotNull())
    df = df.where(df.Project_Subject_Subcategory_Tree.isNotNull())
    df = df.where(df.Project_Grade_Level_Category.isNotNull())
    df = df.where(df.Project_Resource_Category.isNotNull())

    for i in range(len(cols)):
        stringIndexer = StringIndexer(inputCol=cols[i], outputCol=cols[i] + "a")
        model = stringIndexer.fit(df)
        df = model.transform(df)
        colsa.append(cols[i] + "a")

    # Chain the encoders so every encoded column ends up on the same dataframe
    # (transforming the original df each time would keep only the last column).
    encoded = df
    for i in range(len(cols)):
        encoder = OneHotEncoder(inputCol=cols[i] + "a", outputCol=cols[i] + "v")
        encoded = encoder.transform(encoded)

    assembler = VectorAssembler(inputCols=colsa, outputCol="features")
    output = assembler.transform(encoded)

    kmax = 10  # the optimal k happens at k=4
    for i in range(2, kmax):
        # Trains a k-means model.
        kmeans = KMeans().setK(i).setSeed(1)
        model = kmeans.fit(output)

        # Evaluate clustering by computing Silhouette score
        predictions = model.transform(output)
        evaluator = ClusteringEvaluator()
        silhouette.append([i, evaluator.evaluate(predictions)])

    # Pick the k with the highest silhouette score.
    silhouette = np.array(silhouette)
    k_optimal = int(silhouette[np.argmax(silhouette[:, 1]), 0])
    kmeans = KMeans().setK(k_optimal).setSeed(1)
def compute_metrics(model_list, i, metric, distance):
    from pyspark.ml.evaluation import ClusteringEvaluator

    # Evaluate clustering by computing Silhouette score
    evaluator = ClusteringEvaluator(distanceMeasure=distance)
    if metric == "wsse":
        res = model_list[i].summary.trainingCost
    elif metric == "asw":
        res = evaluator.evaluate(model_list[i].summary.predictions)
    else:
        print('WARNING: wrong metric specified. Use either "wsse" or "asw".')
        return None
    return res
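# Hedged usage sketch for compute_metrics (not from the original source): fit a
# few k-means models up front, then query either metric by index. `train_df` is
# a hypothetical DataFrame with a "features" column.
from pyspark.ml.clustering import KMeans

model_list = [KMeans(k=k, seed=1).fit(train_df) for k in range(2, 6)]
wsse = compute_metrics(model_list, i=0, metric="wsse", distance="squaredEuclidean")
asw = compute_metrics(model_list, i=0, metric="asw", distance="squaredEuclidean")
print("WSSE:", wsse, "silhouette:", asw)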
def __train_model(self):
    """Train the models with the current dataset."""
    logger.info("Splitting dataset into 3...")
    # Model 0: the first 1/3 of the data.
    # Model 1: the first 2/3 of the data.
    # Model 2: all of the data.
    self.df0 = self.dforiginal.limit(int(self.dataset_count / 3))
    self.df1 = self.dforiginal.limit(int(self.dataset_count * 2 / 3))
    self.df2 = self.dforiginal
    print('df 0 count = ' + str(self.df0.count()))
    print('df 1 count = ' + str(self.df1.count()))
    print('df 2 count = ' + str(self.df2.count()))
    logger.info("Dataset split!")

    logger.info("Training model 0...")
    kmeans_0 = KMeans().setK(5).setSeed(1)
    model_0 = kmeans_0.fit(self.df0)
    self.predictions_0 = model_0.transform(self.df0)
    logger.info("Model 0 built!")
    logger.info("Evaluating model 0...")
    evaluator_0 = ClusteringEvaluator()
    silhouette_0 = evaluator_0.evaluate(self.predictions_0)
    logger.info("Silhouette with squared euclidean distance = " + str(silhouette_0))
    self.centers_0 = model_0.clusterCenters()
    logger.info("Model 0 done!")

    logger.info("Training model 1...")
    kmeans_1 = KMeans().setK(5).setSeed(1)
    model_1 = kmeans_1.fit(self.df1)
    self.predictions_1 = model_1.transform(self.df1)
    logger.info("Model 1 built!")
    logger.info("Evaluating model 1...")
    evaluator_1 = ClusteringEvaluator()
    silhouette_1 = evaluator_1.evaluate(self.predictions_1)
    logger.info("Silhouette with squared euclidean distance = " + str(silhouette_1))
    self.centers_1 = model_1.clusterCenters()
    logger.info("Model 1 done!")

    logger.info("Training model 2...")
    kmeans_2 = KMeans().setK(5).setSeed(1)
    model_2 = kmeans_2.fit(self.df2)
    self.predictions_2 = model_2.transform(self.df2)
    logger.info("Model 2 built!")
    logger.info("Evaluating model 2...")
    evaluator_2 = ClusteringEvaluator()
    silhouette_2 = evaluator_2.evaluate(self.predictions_2)
    logger.info("Silhouette with squared euclidean distance = " + str(silhouette_2))
    self.centers_2 = model_2.clusterCenters()
    logger.info("Model 2 done!")
def kmeans_algorithm(dataframe):
    kmeans = KMeans().setK(5).setSeed(1)
    model = kmeans.fit(dataframe)
    predictions = model.transform(dataframe)
    evaluator = ClusteringEvaluator()
    silhouette = evaluator.evaluate(predictions)
    print("Silhouette with squared euclidean distance = " + str(silhouette))
    centers = model.clusterCenters()
    print("Cluster Centers: ")
    for center in centers:
        print(center)
def get_kmeans_scores(model, dataset):
    # Evaluate clustering by computing Within Set Sum of Squared Errors.
    wssse = model.computeCost(dataset)
    print("Within Set Sum of Squared Errors = " + str(wssse))

    # Shows the result.
    centers = model.clusterCenters()
    print("Cluster Centers: ")
    for center in centers:
        print(center)

    # Evaluate clustering by computing Silhouette score
    evaluator = ClusteringEvaluator()
    predictions = model.transform(dataset)
    silhouette = evaluator.evaluate(predictions)
    print("Silhouette with squared euclidean distance = " + str(silhouette))
def metric(data, metric):
    # If Spark went down for some reason (e.g. someone accidentally broke the
    # session while running another task), try to finish the remaining
    # computation locally on a single node.
    try:
        spark_context = SparkSession.getActiveSession().sparkContext
        SQLContext(spark_context).clearCache()
    except AttributeError:
        spark_context = SparkContext.getOrCreate(
            SparkConf().setMaster("local[*]"))
    spark = SparkSession \
        .builder \
        .getOrCreate()
    data = data.drop('probability')
    try:
        if metric == 'sil':
            res = -ClusteringEvaluator(
                predictionCol='labels',
                distanceMeasure='squaredEuclidean').evaluate(data)
        elif metric == 'ch':
            res = ChIndex().find(data, spark_context)
        elif metric == 'db':
            res = DaviesIndex().find(data, spark_context)
        return res
    except TypeError:
        print("\n\nTYPE ERROR OCCURRED IN Metric.py:\n\nDATA: {}\n\n".format(data))
        return 0
    except Py4JJavaError:
        print("\n\nPy4JJavaError OCCURRED IN Metric.py:\n\nDATA: {}\n\n".format(
            data.printSchema()))
        return sys.float_info.max
def __optimal_k_kmeans_gmm_spark(cls, feature_col, df_norm, n_iterations, kmeans_method, sc):
    from pyspark.ml.clustering import KMeans, GaussianMixture
    from pyspark.ml.evaluation import ClusteringEvaluator

    start_time = time.time()

    # convert to a Spark dataframe
    sqlContext = SQLContext(sc)
    spark_df = sqlContext.createDataFrame(df_norm)

    # assemble the feature vector
    vecAssembler = VectorAssembler(inputCols=feature_col, outputCol="features")
    spark_df_clustering = vecAssembler.transform(spark_df).select('features')

    n_components_list = []
    n_range = np.arange(2, 20)
    for iteration in np.arange(n_iterations):
        silh_val = []
        cost = []
        for k in n_range:
            if kmeans_method.lower() == 'kmeans':
                print("Kmeans Elbow Method K = ", k)
                kmeans = KMeans().setK(int(k)).setSeed(1).setFeaturesCol("features")
                model = kmeans.fit(spark_df_clustering)
                cost.append(model.computeCost(spark_df_clustering))  # requires Spark 2.0 or later
            elif kmeans_method.lower() == 'gmm':
                print("Gmm Elbow Method K = ", k)
                gmm = GaussianMixture().setK(int(k)).setSeed(1).setFeaturesCol("features")
                model = gmm.fit(spark_df_clustering)
                # cost.append(model.computeCost(spark_df_clustering))  # not available for GMM
            predictions = model.transform(spark_df_clustering)

            # Evaluate clustering by computing Silhouette score
            evaluator = ClusteringEvaluator()
            silhouette = evaluator.evaluate(predictions)
            silh_val.append(silhouette)

        print('Cluster List: ', list(n_range))
        print('Silhouette score: ', silh_val)
        print('Sum of Square Distance Score: ', cost)

        n_split_silh = n_range[silh_val.index(np.max(silh_val))]
        if len(cost) > 0:
            n_split_knee = cls.__knee_locator(n_range, cost, 'convex', 'decreasing',
                                              'sum_of_square_error')
            print('Knee of sum of square distance: ', str(n_split_knee))
        else:
            n_split_knee = n_split_silh
        print("Recommended no. of components by Silhouette Score: " + str(n_split_silh))
        n_clusters = math.ceil(np.median([n_split_knee, n_split_silh]))
        n_components_list.append(n_clusters)

    n_components = int(np.median(n_components_list).round(0))
    print('Recommended median number of splits: ', n_components)
    print("training time: ", time.time() - start_time, "(sec)")
    return n_components
def run():
    dataset = spark.read.format("parquet").load(
        "hdfs:///user/spark/warehouse/kmeans-data.parquet")
    assembler = VectorAssembler(
        inputCols=["c{}".format(x) for x in range(0, 14)],
        outputCol="features")
    dataset = assembler.transform(dataset)
    kmeans = KMeans().setK(3).setSeed(1)
    model = kmeans.fit(dataset)
    predictions = model.transform(dataset)
    evaluator = ClusteringEvaluator()
    silhouette = evaluator.evaluate(predictions)
    # print("Silhouette with squared euclidean distance = " + str(silhouette))
    centers = model.clusterCenters()
def evaluateCluster(model, df):
    from pyspark.ml.evaluation import ClusteringEvaluator

    wssse = model.computeCost(df.select('features'))
    print("Within Set Sum of Squared Errors = " + str(wssse))

    evaluator = ClusteringEvaluator()
    predictions = model.transform(df)
    silhouette = evaluator.evaluate(predictions)
    print("Silhouette with squared euclidean distance = " + str(silhouette))

    # Shows the result.
    centers = model.clusterCenters()
    print("Cluster Centers: ")
    for center in centers:
        print(center)
spark = SparkSession \
    .builder \
    .appName("KMeansExample") \
    .getOrCreate()

# $example on$
# Loads data.
dataset = spark.read.format("libsvm").load("data/mllib/sample_kmeans_data.txt")

# Trains a k-means model.
kmeans = KMeans().setK(2).setSeed(1)
model = kmeans.fit(dataset)

# Make predictions
predictions = model.transform(dataset)

# Evaluate clustering by computing Silhouette score
evaluator = ClusteringEvaluator()
silhouette = evaluator.evaluate(predictions)
print("Silhouette with squared euclidean distance = " + str(silhouette))

# Shows the result.
centers = model.clusterCenters()
print("Cluster Centers: ")
for center in centers:
    print(center)
# $example off$

spark.stop()