Example #1
from pyspark.ml.clustering import KMeans
from pyspark.ml.evaluation import ClusteringEvaluator, MulticlassClassificationEvaluator


def kmeans(data):
    (trainingData, testData) = data.randomSplit([0.7, 0.3])
    # Trains a k-means model (local name chosen not to shadow the function).
    km = KMeans().setK(2).setSeed(1)
    model = km.fit(trainingData)

    # Make predictions
    predictions = model.transform(testData)

    # Evaluate clustering by computing Silhouette score
    evaluator = ClusteringEvaluator()
    silhouette = evaluator.evaluate(predictions)
    print("Silhouette with squared euclidean distance = " + str(silhouette))

    # Shows the result.
    centers = model.clusterCenters()
    print("Cluster Centers: ")
    for center in centers:
        print(center)

    predictions.select("prediction", "label", "features").show(5)

    print("prediction=1.0 count: " + str(predictions.filter("prediction=1.0").count()))
    print("label=1.0 count: " + str(predictions.filter("label=1.0").count()))
    print("total count: " + str(predictions.count()))

    # Select (prediction, true label) and compute test error.
    # KMeans emits an integer "prediction" column, so it is cast to double and
    # "label" is renamed to match the evaluator's "indexedLabel" column.
    evaluator = MulticlassClassificationEvaluator(
        labelCol="indexedLabel", predictionCol="prediction", metricName="accuracy")

    predictions = predictions.withColumn("prediction", predictions["prediction"].cast("double"))
    predictions = predictions.withColumnRenamed("label", "indexedLabel")
    accuracy = evaluator.evaluate(predictions)
    print("Test Error = %g" % (1.0 - accuracy))
Example #2
    def __train_model(self):
        """Train the model with the current dataset
        """
        logger.info("Splitting dataset into 3...")
        # Model 0: the first 1/3 of the data.
        # Model 1: the first 1/3 + the second 1/3 of the data.
        # Model 2: all of the data.
        self.df0 = self.dforiginal.limit(int(self.dataset_count / 3))
        self.df1 = self.dforiginal.limit(int(self.dataset_count * 2 / 3))
        self.df2 = self.dforiginal
        print('df 0 count = ' + str(self.df0.count()))
        print('df 1 count = ' + str(self.df1.count()))
        print('df 2 count = ' + str(self.df2.count()))
        logger.info("Dataset Splitted !")

        logger.info("Training model 0...")
        kmeans_0 = KMeans().setK(5).setSeed(1)
        model_0 = kmeans_0.fit(self.df0)
        self.predictions_0 = model_0.transform(self.df0)
        logger.info("Model 0 built!")
        logger.info("Evaluating the model 0...")
        evaluator_0 = ClusteringEvaluator()
        silhouette_0 = evaluator_0.evaluate(self.predictions_0)
        logger.info("Silhouette with squared euclidean distance = " +
                    str(silhouette_0))
        self.centers_0 = model_0.clusterCenters()
        logger.info("Model 0 Done !")

        logger.info("Training model 1...")
        kmeans_1 = KMeans().setK(5).setSeed(1)
        model_1 = kmeans_1.fit(self.df1)
        self.predictions_1 = model_1.transform(self.df1)
        logger.info("Model 1 built!")
        logger.info("Evaluating the model 1...")
        evaluator_1 = ClusteringEvaluator()
        silhouette_1 = evaluator_1.evaluate(self.predictions_1)
        logger.info("Silhouette with squared euclidean distance = " +
                    str(silhouette_1))
        self.centers_1 = model_1.clusterCenters()
        logger.info("Model 1 Done !")

        logger.info("Training model 2...")
        kmeans_2 = KMeans().setK(5).setSeed(1)
        model_2 = kmeans_2.fit(self.df2)
        self.predictions_2 = model_2.transform(self.df2)
        logger.info("Model 2 built!")
        logger.info("Evaluating the model 2...")
        evaluator_2 = ClusteringEvaluator()
        silhouette_2 = evaluator_2.evaluate(self.predictions_2)
        logger.info("Silhouette with squared euclidean distance = " +
                    str(silhouette_2))
        self.centers_2 = model_2.clusterCenters()
        logger.info("Model 2 Done !")
Example #3
import pandas as pd

from pyspark.ml.clustering import KMeans
from pyspark.ml.evaluation import ClusteringEvaluator


def pick_k(df_vec, sample_rate=0.0005, sample_size=5, ktop=10):
    """
    Input:
    df: pyspark dataframe
    sample_rate: float, the ratio rate of sampling df
    sample_size: int, how many time to run the elbow cost and silhouette_list methods
    ktop: int, the top k range for evaluation

    Output:
    df: pyspark dataframe, result for elbow cost and silhouette_list methods
    """

    choose_k_list = []
    for seed in range(sample_size):
        df_sample = df_vec.sample(False, sample_rate,
                                  seed=seed)  # withReplacement: False
        elbow_cost = []
        silhouette = []
        for k in range(2, ktop + 1):
            kmeans = KMeans(k=k, seed=seed)
            tmp_model = kmeans.fit(df_sample)
            elbow_cost.append(tmp_model.summary.trainingCost)
            predictions = tmp_model.transform(df_sample)
            evaluator = ClusteringEvaluator()
            silhouette.append(evaluator.evaluate(predictions))
            choose_k_list.append([seed, k, elbow_cost[-1], silhouette[-1]])
    # "spark" is assumed to be the active SparkSession of the enclosing module.
    return spark.createDataFrame(
        pd.DataFrame(choose_k_list,
                     columns=["seed", "k", "elbow_cost", "silhouette"]))
Example #4
    def find_elbow(self):
        x, y = [], []

        for k in range(2, 50):
            # Define the model, seed should be fixed between iteration
            # to prevent it from being a source of variance
            kmeans = self.kmeans_type(k=k, seed=SEED)
            model = kmeans.fit(self.dataset)

            # Make predictions; we are going to predict straight on our
            # training dataset since the clustering was derived from it
            predictions = model.transform(self.dataset)

            # Compute the silhouette score
            evaluator = ClusteringEvaluator()
            silhouette = evaluator.evaluate(predictions)

            x.append(k)
            y.append(silhouette)

        ax = sns.lineplot(x=x, y=y, palette="coolwarm", marker="o")
        ax.set_xlabel("Number of Clusters")
        ax.set_ylabel("Silhouette Score")
        ax.set_title("Cluster Quality by Number of Clusters")
        plot_name = f"elbow-{self.dataset_name}-{self.kmeans_name}.png"
        plt.savefig(os.path.join("analysis", "results", "charts", plot_name))
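
Note that despite its name, find_elbow plots the silhouette score. A classical elbow curve uses the within-set sum of squared errors instead, which Spark exposes as model.summary.trainingCost (as used in Example #3). A minimal standalone sketch, assuming a DataFrame with a vectorized "features" column:

from pyspark.ml.clustering import KMeans

def elbow_costs(dataset, k_max=50, seed=1):
    # Within-set sum of squared errors for each k; plot these and look for
    # the point where the decrease flattens out (the "elbow").
    return [(k, KMeans(k=k, seed=seed).fit(dataset).summary.trainingCost)
            for k in range(2, k_max)]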
Example #5
    def test_clustering_evaluator_with_cosine_distance(self):
        featureAndPredictions = map(lambda x: (Vectors.dense(x[0]), x[1]),
                                    [([1.0, 1.0], 1.0), ([10.0, 10.0], 1.0), ([1.0, 0.5], 2.0),
                                     ([10.0, 4.4], 2.0), ([-1.0, 1.0], 3.0), ([-100.0, 90.0], 3.0)])
        dataset = self.spark.createDataFrame(featureAndPredictions, ["features", "prediction"])
        evaluator = ClusteringEvaluator(predictionCol="prediction", distanceMeasure="cosine")
        self.assertEqual(evaluator.getDistanceMeasure(), "cosine")
        self.assertTrue(np.isclose(evaluator.evaluate(dataset), 0.992671213, atol=1e-5))
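
The same check can be run outside a test harness, as a sketch (spark is assumed to be an active SparkSession):

from pyspark.ml.evaluation import ClusteringEvaluator
from pyspark.ml.linalg import Vectors

rows = [(Vectors.dense([1.0, 1.0]), 1.0), (Vectors.dense([10.0, 10.0]), 1.0),
        (Vectors.dense([1.0, 0.5]), 2.0), (Vectors.dense([10.0, 4.4]), 2.0),
        (Vectors.dense([-1.0, 1.0]), 3.0), (Vectors.dense([-100.0, 90.0]), 3.0)]
dataset = spark.createDataFrame(rows, ["features", "prediction"])
evaluator = ClusteringEvaluator(predictionCol="prediction", distanceMeasure="cosine")
print(evaluator.evaluate(dataset))  # ~0.9927, matching the assertion above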
Example #6
from pyspark.ml.clustering import KMeans
from pyspark.ml.evaluation import ClusteringEvaluator
from pyspark.ml.feature import VectorAssembler


def run():
    # "spark" is assumed to be an active SparkSession in the enclosing module.
    dataset = spark.read.format("parquet").load(
        "hdfs:///user/spark/warehouse/kmeans-data.parquet")
    assembler = VectorAssembler(
        inputCols=["c{}".format(x) for x in range(0, 14)],
        outputCol="features")
    dataset = assembler.transform(dataset)

    kmeans = KMeans().setK(3).setSeed(1)
    model = kmeans.fit(dataset)
    predictions = model.transform(dataset)
    evaluator = ClusteringEvaluator()

    silhouette = evaluator.evaluate(predictions)
    # print("Silhouette with squared euclidean distance = " + str(silhouette))

    centers = model.clusterCenters()
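
    # A possible follow-up (sketch; the output path is hypothetical): persist
    # the fitted model for reuse and reload it later with KMeansModel.load().
    model.write().overwrite().save("hdfs:///user/spark/warehouse/kmeans-model")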
Example #7
    def get_silhouette_score(self,
                             clustered_url_vectors: DataFrame,
                             distance_measure: str = "euclidean") -> float:
        """
        Calculates the silhouette score of the given cluster in the parameter and returns.

        :param distance_measure: The distance measure that is used for clustering.
        :param clustered_url_vectors: A DataFrame of URLs and the cluster id of the URLs are assigned by the clustering
                                      algorithm with columns: id, url, split_url, coefficients, vector, cluster_id.
        :return: silhouette score of the clustering of clustered URLs.
        """
        if distance_measure == "euclidean":
            # ClusteringEvaluator only accepts "squaredEuclidean" or "cosine".
            distance_measure = "squaredEuclidean"
        evaluator = ClusteringEvaluator(predictionCol="cluster_id",
                                        featuresCol="vector",
                                        distanceMeasure=distance_measure)
        return evaluator.evaluate(clustered_url_vectors)
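
Outside the class, the same evaluation is a one-liner, sketched here with a hypothetical clustered_df carrying the "vector" and "cluster_id" columns described above:

score = ClusteringEvaluator(predictionCol="cluster_id", featuresCol="vector",
                            distanceMeasure="squaredEuclidean").evaluate(clustered_df)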
Example #8
    spark = SparkSession\
        .builder\
        .appName("KMeansExample")\
        .getOrCreate()

    # $example on$
    # Loads data.
    dataset = spark.read.format("libsvm").load("data/mllib/sample_kmeans_data.txt")

    # Trains a k-means model.
    kmeans = KMeans().setK(2).setSeed(1)
    model = kmeans.fit(dataset)

    # Make predictions
    predictions = model.transform(dataset)

    # Evaluate clustering by computing Silhouette score
    evaluator = ClusteringEvaluator()

    silhouette = evaluator.evaluate(predictions)
    print("Silhouette with squared euclidean distance = " + str(silhouette))

    # Shows the result.
    centers = model.clusterCenters()
    print("Cluster Centers: ")
    for center in centers:
        print(center)
    # $example off$

    spark.stop()
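
The $example on$ / $example off$ markers are the include tags used by Spark's documentation build; this listing matches the stock kmeans_example.py shipped with Spark (the KMeans and ClusteringEvaluator imports sit above the excerpted region). From a Spark distribution it can be run with:

bin/spark-submit examples/src/main/python/ml/kmeans_example.py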