def kmeans(data):
    """Cluster `data` with k-means (k=2) and report clustering quality.

    Splits the input 70/30 into train/test, fits a 2-cluster KMeans model on
    the training split, prints the Silhouette score and cluster centers, then
    treats the cluster ids as class predictions and prints a classification
    "test error" against the true labels.

    :param data: DataFrame with a "features" vector column and a "label"
        column (assumed from the select/filter calls below — confirm with
        the caller).
    """
    training_data, test_data = data.randomSplit([0.7, 0.3])

    # Train a k-means model. Named `km` so the local does not shadow this
    # function's own name (the original used `kmeans = KMeans()...`).
    km = KMeans().setK(2).setSeed(1)
    model = km.fit(training_data)

    # Make predictions on the held-out split.
    predictions = model.transform(test_data)

    # Evaluate clustering by computing the Silhouette score
    # (squared Euclidean distance is the ClusteringEvaluator default).
    evaluator = ClusteringEvaluator()
    silhouette = evaluator.evaluate(predictions)
    print("Silhouette with squared euclidean distance = " + str(silhouette))

    # Show the result.
    centers = model.clusterCenters()
    print("Cluster Centers: ")
    for center in centers:
        print(center)

    predictions.select("prediction", "label", "features").show(5)
    print("prediction=1.0 count: " + str(predictions.filter("prediction=1.0").count()))
    print("label=1.0 count: " + str(predictions.filter("label=1.0").count()))
    print("total count: " + str(predictions.count()))

    # Select (prediction, true label) and compute test error.
    # NOTE(review): this treats cluster id 1.0 as class 1.0, but k-means
    # assigns cluster ids arbitrarily — the reported accuracy may be
    # inverted depending on which cluster got id 0. Confirm the intended
    # id/label alignment.
    evaluator = MulticlassClassificationEvaluator(
        labelCol="indexedLabel", predictionCol="prediction", metricName="accuracy")
    # The evaluator requires a double-typed prediction column and reads the
    # label from "indexedLabel", hence the cast and rename.
    predictions = predictions.withColumn(
        "prediction", predictions["prediction"].cast("double"))
    predictions = predictions.withColumnRenamed("label", "indexedLabel")
    accuracy = evaluator.evaluate(predictions)
    print("Test Error = %g" % (1.0 - accuracy))
def __train_model(self):
    """Train three k-means models (k=5) on growing prefixes of the dataset.

    Splits self.dforiginal into nested subsets and, for each, fits a model,
    logs its Silhouette score, and stores the results on the instance:

    - Model 0: first third of the data   -> self.df0, self.predictions_0, self.centers_0
    - Model 1: first two thirds of data  -> self.df1, self.predictions_1, self.centers_1
    - Model 2: all of the data           -> self.df2, self.predictions_2, self.centers_2

    Reads self.dforiginal and self.dataset_count; log/print output is
    identical to the previous copy-pasted implementation.
    """
    logger.info("Splitting dataset into 3...")
    # Nested prefixes of the original dataset (limit() keeps the first N rows).
    self.df0 = self.dforiginal.limit(int(self.dataset_count / 3))
    self.df1 = self.dforiginal.limit(int(self.dataset_count * 2 / 3))
    self.df2 = self.dforiginal
    print('df 0 count = ' + str(self.df0.count()))
    print('df 1 count = ' + str(self.df1.count()))
    print('df 2 count = ' + str(self.df2.count()))
    logger.info("Dataset Splitted !")

    # The train/evaluate sequence was identical for all three models, so it
    # is expressed once; setattr() writes the same attribute names the old
    # copy-pasted version assigned (predictions_i, centers_i).
    for i, df in enumerate((self.df0, self.df1, self.df2)):
        logger.info("Training model {}...".format(i))
        model = KMeans().setK(5).setSeed(1).fit(df)
        predictions = model.transform(df)
        setattr(self, "predictions_{}".format(i), predictions)
        logger.info("Model {} built!".format(i))

        logger.info("Evaluating the model {}...".format(i))
        silhouette = ClusteringEvaluator().evaluate(predictions)
        logger.info("Silhouette with squared euclidean distance = " + str(silhouette))
        setattr(self, "centers_{}".format(i), model.clusterCenters())
        logger.info("Model {} Done !".format(i))
def pick_k(df_vec, sample_rate=0.0005, sample_size=5, ktop=10):
    """Evaluate candidate k values for k-means on samples of `df_vec`.

    For each of `sample_size` random samples (seeded 0..sample_size-1) and
    each k in [2, ktop], fits a KMeans model and records both the elbow cost
    (training cost / WSSSE) and the Silhouette score, one row per (seed, k).

    Input:
        df_vec: pyspark dataframe with a features vector column
        sample_rate: float, the ratio rate of sampling df_vec
        sample_size: int, how many sampled runs to perform
        ktop: int, the top of the k range for evaluation
    Output:
        pyspark dataframe with columns seed, k, elbow_cost, silhouette
    """
    rows = []
    for seed in range(sample_size):
        # withReplacement=False; seed varies per run so samples differ.
        df_sample = df_vec.sample(False, sample_rate, seed=seed)
        for k in range(2, ktop + 1):
            tmp_model = KMeans(k=k, seed=seed).fit(df_sample)
            predictions = tmp_model.transform(df_sample)
            # Record one row per (seed, k). The previous version funneled
            # values through intermediate lists and read only [-1], which
            # risked recording a single row per seed if the append sat
            # outside the k-loop.
            rows.append([
                seed,
                k,
                tmp_model.summary.trainingCost,   # elbow / WSSSE cost
                ClusteringEvaluator().evaluate(predictions),
            ])
    return spark.createDataFrame(
        pd.DataFrame(rows, columns=["seed", "k", "elbow_cost", "silhouette"]))
def find_elbow(self):
    """Plot Silhouette score against cluster count and save the chart.

    Fits self.kmeans_type on self.dataset for k = 2..49, scores each model
    with the Silhouette metric, and writes a line chart to
    analysis/results/charts/elbow-<dataset>-<kmeans>.png.

    Reads self.kmeans_type, self.dataset, self.dataset_name,
    self.kmeans_name and the module-level SEED constant.
    """
    x, y = [], []
    for k in range(2, 50):
        # Seed is fixed between iterations to prevent it from being a
        # source of variance.
        kmeans = self.kmeans_type(k=k, seed=SEED)
        model = kmeans.fit(self.dataset)

        # Predict straight on the training dataset since the clustering
        # was derived from it.
        predictions = model.transform(self.dataset)

        # Compute the Silhouette score for this k.
        evaluator = ClusteringEvaluator()
        silhouette = evaluator.evaluate(predictions)
        x.append(k)
        y.append(silhouette)

    # Start a fresh figure so repeated calls don't draw over axes left
    # behind by a previous run (pyplot keeps global state). The `palette`
    # argument was dropped: sns.lineplot ignores it without `hue` and
    # emits a warning.
    fig = plt.figure()
    ax = sns.lineplot(x=x, y=y, marker="o")
    ax.set_xlabel("Number of Clusters")
    ax.set_ylabel("Silhouette Score")
    ax.set_title("Cluster Quality by Number of Clusters")

    plot_name = f"elbow-{self.dataset_name}-{self.kmeans_name}.png"
    plt.savefig(os.path.join("analysis", "results", "charts", plot_name))
    # Release the figure so memory doesn't accumulate across calls.
    plt.close(fig)
def test_clustering_evaluator_with_cosine_distance(self):
    """ClusteringEvaluator with cosine distance computes the expected score."""
    # Three well-separated directions, two points per cluster.
    points = [
        ([1.0, 1.0], 1.0),
        ([10.0, 10.0], 1.0),
        ([1.0, 0.5], 2.0),
        ([10.0, 4.4], 2.0),
        ([-1.0, 1.0], 3.0),
        ([-100.0, 90.0], 3.0),
    ]
    rows = [(Vectors.dense(features), prediction) for features, prediction in points]
    dataset = self.spark.createDataFrame(rows, ["features", "prediction"])

    evaluator = ClusteringEvaluator(predictionCol="prediction",
                                    distanceMeasure="cosine")
    self.assertEqual(evaluator.getDistanceMeasure(), "cosine")
    self.assertTrue(np.isclose(evaluator.evaluate(dataset),
                               0.992671213, atol=1e-5))
def run():
    """Cluster the warehouse parquet dataset with k-means (k=3).

    Assembles columns c0..c13 into a feature vector, fits the model, and
    computes the Silhouette score and cluster centers.
    """
    dataset = spark.read.format("parquet").load(
        "hdfs:///user/spark/warehouse/kmeans-data.parquet")

    # Collapse the 14 numeric columns into a single "features" vector.
    feature_columns = ["c{}".format(i) for i in range(0, 14)]
    assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")
    dataset = assembler.transform(dataset)

    model = KMeans().setK(3).setSeed(1).fit(dataset)
    predictions = model.transform(dataset)

    # Silhouette score (squared Euclidean by default) and the fitted centers.
    silhouette = ClusteringEvaluator().evaluate(predictions)
    centers = model.clusterCenters()
def get_silhouette_score(self, clustered_url_vectors: DataFrame,
                         distance_measure: str = "euclidean") -> float:
    """
    Calculates and returns the silhouette score of the given clustering.

    :param distance_measure: The distance measure used for clustering;
        "euclidean" is mapped to Spark's "squaredEuclidean".
    :param clustered_url_vectors: A DataFrame of URLs and the cluster id
        assigned to each URL by the clustering algorithm, with columns:
        id, url, split_url, coefficients, vector, cluster_id.
    :return: silhouette score of the clustering of clustered URLs.
    """
    # Spark's ClusteringEvaluator names the euclidean measure
    # "squaredEuclidean"; pass any other value through unchanged.
    measure = ("squaredEuclidean" if distance_measure == "euclidean"
               else distance_measure)
    evaluator = ClusteringEvaluator(predictionCol="cluster_id",
                                    featuresCol="vector",
                                    distanceMeasure=measure)
    return evaluator.evaluate(clustered_url_vectors)
# Build (or reuse) the Spark session for this example.
spark = (
    SparkSession
    .builder
    .appName("KMeansExample")
    .getOrCreate()
)

# $example on$
# Loads data.
dataset = spark.read.format("libsvm").load("data/mllib/sample_kmeans_data.txt")

# Trains a k-means model.
model = KMeans().setK(2).setSeed(1).fit(dataset)

# Make predictions
predictions = model.transform(dataset)

# Evaluate clustering by computing Silhouette score
silhouette = ClusteringEvaluator().evaluate(predictions)
print("Silhouette with squared euclidean distance = " + str(silhouette))

# Shows the result.
print("Cluster Centers: ")
for center in model.clusterCenters():
    print(center)
# $example off$

spark.stop()