Example #1
def pick_k(df_vec, sample_rate=0.0005, sample_size=5, ktop=10):
    """
    Input:
    df_vec: pyspark dataframe with a vector "features" column
    sample_rate: float, the fraction of df_vec to sample
    sample_size: int, how many times to sample and run the elbow cost and silhouette methods
    ktop: int, the upper bound of the k range to evaluate

    Output:
    df: pyspark dataframe, one row per (seed, k) with the elbow cost and silhouette score
    """

    choose_k_list = []
    for seed in range(sample_size):
        df_sample = df_vec.sample(False, sample_rate,
                                  seed=seed)  # withReplacement: False
        elbow_cost = []
        silhouette = []
        for k in range(2, ktop + 1):
            kmeans = KMeans(k=k, seed=seed)
            tmp_model = kmeans.fit(df_sample)
            elbow_cost.append(tmp_model.summary.trainingCost)
            predictions = tmp_model.transform(df_sample)
            evaluator = ClusteringEvaluator()
            silhouette.append(evaluator.evaluate(predictions))
            choose_k_list.append([seed, k, elbow_cost[-1], silhouette[-1]])
    return spark.createDataFrame(
        pd.DataFrame(choose_k_list,
                     columns=["seed", "k", "elbow_cost", "silhouette"]))
Example #2
def kmeans_usecase():
    spark = getSparkSession()
    schema = ''
    for i in range(65):
        schema = schema + '_c' + str(i) + ' DOUBLE' + ','
    schema = schema[:len(schema) - 1]
    df_train = spark.read.csv('../data/optdigits.tra', schema=schema)
    df_test = spark.read.csv('../data/optdigits.tes', schema=schema)
    # The schema already names the columns _c0.._c64, so no renaming is needed.
    cols = ["_c" + str(i) for i in range(65)]
    assembler = VectorAssembler(inputCols=cols[:-1], outputCol="features")
    train_output = assembler.transform(df_train)
    test_output = assembler.transform(df_test)
    train_features = train_output.select("features").toDF('features')
    test_features = test_output.select("features").toDF('features')
    train_features.show(truncate=False)
    test_features.show(truncate=False)
    kmeans = KMeans().setK(10).setSeed(1)
    model = kmeans.fit(train_features)
    predictions = model.transform(test_features)

    evaluator = ClusteringEvaluator()
    silhouette = evaluator.evaluate(predictions)
    print("Silhouette with squared euclidean distance = " + str(silhouette))
Example #3
def kmeans(data):
    (trainingData, testData) = data.randomSplit([0.7, 0.3])
    # Trains a k-means model.
    kmeans = KMeans().setK(2).setSeed(1)
    model = kmeans.fit(trainingData)

    # Make predictions
    predictions = model.transform(testData)

    # Evaluate clustering by computing Silhouette score
    evaluator = ClusteringEvaluator()
    silhouette = evaluator.evaluate(predictions)
    print("Silhouette with squared euclidean distance = " + str(silhouette))

    # Shows the result.
    centers = model.clusterCenters()
    print("Cluster Centers: ")
    for center in centers:
        print(center)

    predictions.select("prediction", "label", "features").show(5)

    print("prediction=1.0 count: " + str(predictions.filter("prediction=1.0").count()))
    print("label=1.0 count: " + str(predictions.filter("label=1.0").count()))
    print("total count: " + str(predictions.count()))

    # Select (prediction, true label) and compute test error
    evaluator = MulticlassClassificationEvaluator(
        labelCol="indexedLabel", predictionCol="prediction", metricName="accuracy")

    predictions = predictions.withColumn("prediction", predictions["prediction"].cast("double"))
    predictions = predictions.withColumnRenamed("label", "indexedLabel")
    accuracy = evaluator.evaluate(predictions)
    print("Test Error = %g" % (1.0 - accuracy))
def main(spark, model_file, data_file):
    '''Main routine for unsupervised evaluation

    Parameters
    ----------
    spark : SparkSession object

    model_file : string, path to the serialized model file to load

    data_file : string, path to the parquet file to load
    '''
    DF = spark.read.parquet(data_file)
    DF = DF.select("mfcc_00", "mfcc_01", "mfcc_02", "mfcc_03", "mfcc_04",
                   "mfcc_05", "mfcc_06", "mfcc_07", "mfcc_08", "mfcc_09",
                   "mfcc_10", "mfcc_11", "mfcc_12", "mfcc_13", "mfcc_14",
                   "mfcc_15", "mfcc_16", "mfcc_17", "mfcc_18", "mfcc_19")
    #assembler = VectorAssembler(
               # inputCols=["mfcc_00", "mfcc_01", "mfcc_02","mfcc_03","mfcc_04","mfcc_05","mfcc_06","mfcc_07","mfcc_08","mfcc_09","mfcc_10", "mfcc_11", "mfcc_12","mfcc_13","mfcc_14","mfcc_15","mfcc_16","mfcc_17","mfcc_18","mfcc_19"],
               #outputCol="features")
    #DF = assembler.transform(DF)
    #DFnew = scalerModel.transform(DF)
    Model=PipelineModel.load(model_file)
    predictions = Model.transform(DF)
    evaluator = ClusteringEvaluator()
    Result = evaluator.evaluate(predictions)
    print(str(Result))
    ###
    # TODO: YOUR CODE GOES HERE
    ###

    pass
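A hedged sketch of a driver for this main() routine; the app name and the command-line argument order are assumptions, not from the original.

if __name__ == "__main__":
    import sys
    from pyspark.sql import SparkSession

    spark = SparkSession.builder.appName("unsupervised_eval").getOrCreate()
    # Expected invocation: spark-submit script.py <model_file> <data_file>
    main(spark, sys.argv[1], sys.argv[2])
    spark.stop()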
def kmeans(coordinates_list, spark):
    coordinates_list = [
        [float(coordinates[0]), float(coordinates[1])]
        for coordinates in coordinates_list
    ]
    df = spark.createDataFrame(coordinates_list, ["Longitude", "Latitude"])

    vecAssembler = VectorAssembler(
        inputCols=["Longitude", "Latitude"], outputCol="features"
    )
    new_df = vecAssembler.transform(df)

    silhouettes = []
    for k in range(2, 10):
        kmeans = KMeans().setK(k).setSeed(1)
        model = kmeans.fit(new_df.select("features"))
        predictions = model.transform(new_df)

        evaluator = ClusteringEvaluator()
        silhouette = evaluator.evaluate(predictions)
        silhouettes.append([silhouette, predictions, k])

    _, predictions, k = max(silhouettes, key=lambda x: x[0])

    predictions.show()
    print(k)

    return predictions
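A usage sketch for this coordinate-clustering helper; the random points are invented for illustration, and it assumes the VectorAssembler, KMeans, and ClusteringEvaluator imports the snippet relies on are already in scope.

import random
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("coords_kmeans_demo").getOrCreate()
# Two loose blobs of points, roughly around Montreal and Paris.
coords = [[str(-73.6 + random.random()), str(45.4 + random.random())] for _ in range(100)]
coords += [[str(2.3 + random.random()), str(48.8 + random.random())] for _ in range(100)]
best_predictions = kmeans(coords, spark)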
Example #6
 def get_evaluation_results(self, predictions):
     self.evaluator = ClusteringEvaluator()
     silhouette = self.evaluator.evaluate(predictions)
     result = '\nsilhouette_score:' + str(silhouette)
     with open('kmeansresult.txt', 'a+') as fp:
         fp.write(result)
     return silhouette
def main(spark, model_file, data_file):
    '''Main routine for unsupervised evaluation

    Parameters
    ----------
    spark : SparkSession object

    model_file : string, path to the serialized model file to load

    data_file : string, path to the parquet file to load
    '''

    ###
    # TODO: YOUR CODE GOES HERE

    K_Model = PipelineModel.load(model_file)

    df = spark.read.parquet(data_file)
    df_mfcc = df.select("mfcc_00", "mfcc_01", "mfcc_02", "mfcc_03", "mfcc_04",
                        "mfcc_05", "mfcc_06", "mfcc_07", "mfcc_08", "mfcc_09",
                        "mfcc_10", "mfcc_11", "mfcc_12", "mfcc_13", "mfcc_14",
                        "mfcc_15", "mfcc_16", "mfcc_17", "mfcc_18", "mfcc_19")

    predictions = K_Model.transform(df_mfcc)
    evaluator = ClusteringEvaluator()
    K_model_evaluation = evaluator.evaluate(predictions)

    print("Score of K-Means Clustering Model: ", str(K_model_evaluation))
    ###

    pass
    def find_elbow(self):
        x, y = [], []

        for k in range(2, 50):
            # Define the model, seed should be fixed between iteration
            # to prevent it from being a source of variance
            kmeans = self.kmeans_type(k=k, seed=SEED)
            model = kmeans.fit(self.dataset)

            # Make predictions; we are going to predict straight on our
            # training dataset since the clustering was derived from it
            predictions = model.transform(self.dataset)

            # Compute error
            evaluator = ClusteringEvaluator()
            silhouette = evaluator.evaluate(predictions)

            x.append(k)
            y.append(silhouette)

        ax = sns.lineplot(x=x, y=y, palette="coolwarm", marker="o")
        ax.set_xlabel("Number of Clusters")
        ax.set_ylabel("Silhouette Score")
        ax.set_title("Cluster Quality by Number of Clusters")
        plot_name = f"elbow-{self.dataset_name}-{self.kmeans_name}.png"
        plt.savefig(os.path.join("analysis", "results", "charts", plot_name))
Example #9
def train_cluster(df, k):
    evaluator = ClusteringEvaluator(predictionCol='cluster', featuresCol='final_features_scaled', \
                                    metricName='silhouette', distanceMeasure='squaredEuclidean')
    kmeans = KMeans() \
        .setK(k) \
        .setFeaturesCol("final_features_scaled") \
        .setPredictionCol("cluster")

    kmeans_model = kmeans.fit(df)

    output = kmeans_model.transform(df)

    score = evaluator.evaluate(output)
    print("k: {}, silhouette score: {}".format(k, score))
    expr_mean = [F.avg(col).alias(col + '_mean') for col in final_features]

    #     @pandas_udf(FloatType(), functionType=PandasUDFType.GROUPED_AGG)
    #     def _func_median(v):
    #         return v.median()
    #     expr_median = [_func_median(output[col]).alias(col+'_median') for col in numeric_features]
    #     df_median = output.groupBy('cluster').agg(*expr_median).toPandas()
    df_mean = output.groupBy('cluster').agg(
        F.count(F.lit(1)).alias("audience_num"), *expr_mean).toPandas()
    #     result = pd.merge(df_mean, df_median, on='cluster')
    return output, df_mean
Example #10
def main(spark, model_file, data_file):
    '''Main routine for unsupervised evaluation

    Parameters
    ----------
    spark : SparkSession object

    model_file : string, path to the serialized model file to load

    data_file : string, path to the parquet file to load
    '''

    ###
    # TODO: YOUR CODE GOES HERE
    ###

    # Load the Pipeline model we trained
    model = PipelineModel.load(model_file)

    # Read the val
    val = spark.read.parquet(data_file)

    # Predictions
    predictions = model.transform(val)

    # Evaluations
    evaluator = ClusteringEvaluator(predictionCol='prediction',
                                    featuresCol='scaled_features')
    result = evaluator.evaluate(predictions)
    print("Silhouette with squared euclidean distance = " + str(result))
Example #11
    def cluster(self):
        from pyspark.ml.clustering import KMeans
        from pyspark.ml.evaluation import ClusteringEvaluator

        # Loads data.
        dataset = self.read.format("libsvm").load(self.dataDir + "data/mllib/sample_kmeans_data.txt")

        # Trains a k-means model.
        kmeans = KMeans().setK(2).setSeed(1)
        model = kmeans.fit(dataset)

        # Make predictions
        predictions = model.transform(dataset)

        # Evaluate clustering by computing Silhouette score
        evaluator = ClusteringEvaluator()

        silhouette = evaluator.evaluate(predictions)
        print("Silhouette with squared euclidean distance = " + str(silhouette))

        # Shows the result.
        centers = model.clusterCenters()
        print("Cluster Centers: ")
        for center in centers:
            print(center)
Example #12
def main(spark, model_file, data_file):
    '''Main routine for unsupervised evaluation

    Parameters
    ----------
    spark : SparkSession object

    model_file : string, path to the serialized model file to load

    data_file : string, path to the parquet file to load
    '''

    # Load data.
    dataset = spark.read.parquet(data_file)

    # Load k-means model.
    model = PipelineModel.load(model_file)

    # Make predictions
    predictions = model.transform(dataset)

    # Evaluate clustering by computing Silhouette score
    evaluator = ClusteringEvaluator()

    silhouette = evaluator.evaluate(predictions)
    print("Silhouette with squared euclidean distance = " + str(silhouette))
Example #13
def kmeans_scan(_data, _k_min=2, _k_max=6, _tmp_dir='tmp_models'):
    """Scan different kmeans model within the specified k range.
       The function assume that the input data are ready to be used and already contain the features column.
    """
    # Define the evaluator to find the optimal k. The evaluator compute the Siluhette score.
    evaluator = ClusteringEvaluator()

    # Dictionaries use to save the results obtained for the diferent k considered.
    silhuette_scores = {}
    centers = {}

    # If the temporary directory already exists it will be removed to create a fresh one.
    # Other ways of handling this case are possible but are not considered here;
    # extending to them is straightforward.
    if os.path.exists(_tmp_dir):
        shutil.rmtree(_tmp_dir)

    os.mkdir(_tmp_dir)

    # Fit and save the model for each k in the specified range.
    for k in range(_k_min, _k_max + 1):
        kmeans = KMeans().setK(k).setSeed(1).setFeaturesCol('features')
        model = kmeans.fit(_data)
        transformed = model.transform(_data)
        silhouette_scores[k] = evaluator.evaluate(transformed)
        centers[k] = model.clusterCenters()
        model.save(os.path.join(_tmp_dir, "model_w_k_{}".format(k)))

    return centers, silhouette_scores
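A hedged usage sketch for kmeans_scan; the toy rows and column names are assumptions, and it presumes the imports the function relies on (os, shutil, KMeans, ClusteringEvaluator) are already in scope.

from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler

spark = SparkSession.builder.getOrCreate()
# Three small, well-separated pairs of 2-D points.
raw = spark.createDataFrame(
    [(0.0, 0.1), (0.2, 0.0), (5.0, 5.1), (5.2, 4.9), (9.0, 9.1), (9.2, 8.9)],
    ["x", "y"])
data = VectorAssembler(inputCols=["x", "y"], outputCol="features").transform(raw)
centers, scores = kmeans_scan(data, _k_min=2, _k_max=3, _tmp_dir="tmp_models")
print(scores)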
def main():
    spark = SparkSession.Builder().getOrCreate()
    # load dataset
    # datapath = os.path.dirname(os.path.dirname(os.path.abspath(sys.argv[0])))
    # dataset = spark.read.format('libsvm').json(datapath+'/data/business.json')

    filename = '/Users/nicolasg-chausseau/Downloads/yelp_dataset/business_MTL_ONLY.json'
    # filename = '/Users/nicolasg-chausseau/Downloads/yelp_dataset/review_MTL_ONLY.json'
    dataset = spark.read.json(filename)
    print(dataset)

    # get longitude and latitude
    ll = dataset.select(dataset.categories[0], dataset.longitude,
                        dataset.latitude)
    ll = ll.withColumnRenamed('categories[0]', 'categories')

    ll.show()

    print(ll.schema.names)
    # for item in ll.schema.names:
    #   print(item)
    #   for item2 in item:
    #     print(item2)
    # convert ll to dense vectors
    # data =ll.rdd.map(lambda x:(Vectors.dense(float(x[0]), float(x[1])),)).collect()
    assembler = VectorAssembler(inputCols=['longitude', 'latitude'],
                                outputCol='features')

    df = assembler.transform(ll)

    # set KMeans k and seed
    kmeans = KMeans(k=4, seed=1)

    # generate model
    model = kmeans.fit(df)

    # Make predictions
    predictions = model.transform(df)
    predictions.show(20)
    # Evaluate clustering by computing Silhouette score
    evaluator = ClusteringEvaluator()

    silhouette = evaluator.evaluate(predictions)
    print("Silhouette with squared euclidean distance = " + str(silhouette))

    # number of location in each cluster
    print('Number of businesses in each cluster: ')
    predictions.groupBy('prediction').count().sort(desc('count')).show()

    # show in which cluster do we have more restaurants
    print('Number of restaurants per cluster')
    predictions.where(predictions.categories == 'Restaurants').groupBy(
        'prediction').count().sort(desc('count')).show()

    # Shows the result.
    centers = model.clusterCenters()
    print("Cluster Centers: ")
    for center in centers:
        print(center)
Example #15
    def getAllSimilar(self):
        resp = []
        try:
            listings = spark.read.format('org.apache.spark.sql.cassandra').options(table=listings_table,keyspace=keyspace).load().cache()
            # read favorited listings
            favorites = spark.read.format('org.apache.spark.sql.cassandra').options(table=fav_table,keyspace=keyspace).load().cache()

            if(not favorites.rdd.isEmpty()):
                # one value at any given time
                city = spark.sql("SELECT city from listings as l \
                    WHERE l.postingid=(SELECT postingid FROM favorites WHERE userid='potato' LIMIT 1)").collect()[0]['city']
                # get all listings with in the city
                data = listings.where(listings['city']==city)

                kval = 15
                prediction = self.clusterize(data, kval)
                evaluator = ClusteringEvaluator()

                silhouette = evaluator.evaluate(prediction)    #score
                print('Silhouette score (k=%s) with squared Euclidean distance: %s' % (kval, silhouette))

                # find similar posts
                similar = self.find_similar(prediction, favorites)
                resp = similar.drop('features').rdd.collect()
        except Exception:
            print("Error fetching similar items")

        return resp
Example #16
def train_cluster(df, k):
    evaluator = ClusteringEvaluator(predictionCol='cluster', featuresCol='final_features_scaled', \
                                    metricName='silhouette', distanceMeasure='squaredEuclidean')
    kmeans = KMeans() \
        .setK(k) \
        .setFeaturesCol("final_features_scaled") \
        .setPredictionCol("cluster")

    kmeans_model = kmeans.fit(df)

    output = kmeans_model.transform(df)

    score = evaluator.evaluate(output)
    print("k: {}, silhouette score: {}".format(k, score))
    expr_mean = [F.avg(col).alias(col + '_mean') for col in final_features]

    expr_median = [
        F.expr('percentile({}, array(0.5))'.format(col))[0].alias(col + '_median')
        for col in final_features
    ]

    df_median = output.groupBy('cluster').agg(*expr_median).toPandas()
    df_mean = output.groupBy('cluster').agg(
        F.count(F.lit(1)).alias("audience_num"), *expr_mean).toPandas()
    return output, df_median, df_mean
Example #17
def optimal_k(df_in, k_min, k_max, num_runs):
    '''
    Determine the optimal number of clusters by using Silhouette Score Analysis.
    :param df_in: the input dataframe
    :param k_min: the minimum number of clusters
    :param k_max: the maximum number of clusters
    :param num_runs: the number of runs for each fixed number of clusters

    :return k: optimal number of clusters
    :return silh_lst: Silhouette scores
    :return r_table: the running results table
    '''
    from pyspark.ml.clustering import KMeans
    from pyspark.ml.evaluation import ClusteringEvaluator
    import time
    import numpy as np
    import pandas as pd

    start = time.time()
    silh_lst = []
    k_lst = np.arange(k_min, k_max + 1)

    r_table = pd.DataFrame(index=range(df_in.count()))
    centers = pd.DataFrame()

    for k in k_lst:
        silh_val = []
        for run in np.arange(1, num_runs + 1):
            # Trains a k-means model.
            kmeans = KMeans() \
                .setK(k) \
                .setSeed(int(np.random.randint(100, size=1)))
            model = kmeans.fit(df_in)

            # Make predictions
            predictions = model.transform(df_in)
            r_table['cluster_{k}_{run}'.format(
                k=k, run=run)] = predictions.select('prediction').toPandas()

            # Evaluate clustering by computing Silhouette score
            evaluator = ClusteringEvaluator()
            silhouette = evaluator.evaluate(predictions)
            silh_val.append(silhouette)

        silh_array = np.asanyarray(silh_val)
        silh_lst.append(silh_array.mean())

    elapsed = time.time() - start

    silhouette = pd.DataFrame(list(zip(k_lst, silh_lst)),
                              columns=['k', 'silhouette'])

    print('+------------------------------------------------------------+')
    print("|         The finding optimal k phase took %8.0f s.       |" %
          (elapsed))
    print('+------------------------------------------------------------+')

    return k_lst[np.argmax(silh_lst, axis=0)], silhouette, r_table
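A hedged usage sketch for optimal_k; the toy blobs and column names are invented for illustration.

import numpy as np
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler

spark = SparkSession.builder.getOrCreate()
# Three well-separated 2-D blobs of 20 points each.
pts = np.random.rand(60, 2) + np.repeat([[0.0], [5.0], [10.0]], 20, axis=0)
raw = spark.createDataFrame([(float(x), float(y)) for x, y in pts], ["x", "y"])
df_in = VectorAssembler(inputCols=["x", "y"], outputCol="features").transform(raw)
best_k, silhouette_table, runs = optimal_k(df_in, k_min=2, k_max=5, num_runs=2)
print("best k:", best_k)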
Example #18
 def test_clustering_evaluator_with_cosine_distance(self):
     featureAndPredictions = map(lambda x: (Vectors.dense(x[0]), x[1]),
                                 [([1.0, 1.0], 1.0), ([10.0, 10.0], 1.0), ([1.0, 0.5], 2.0),
                                  ([10.0, 4.4], 2.0), ([-1.0, 1.0], 3.0), ([-100.0, 90.0], 3.0)])
     dataset = self.spark.createDataFrame(featureAndPredictions, ["features", "prediction"])
     evaluator = ClusteringEvaluator(predictionCol="prediction", distanceMeasure="cosine")
     self.assertEqual(evaluator.getDistanceMeasure(), "cosine")
     self.assertTrue(np.isclose(evaluator.evaluate(dataset),  0.992671213, atol=1e-5))
 def silhouette_score(self, data):
     """
     Returns the silhouette score of the data.
     """
     predictions = self.model.transform(data)
     evaluator = ClusteringEvaluator()
     silhouette = evaluator.evaluate(predictions)
     print(f"silhouette score: {silhouette:.4f}")
     return silhouette
 def evaluators(self):
     evaluator = ClusteringEvaluator()
     silhouette = evaluator.evaluate(self.predictions)
     print("Silhouette with squared euclidean distance = " +
           str(silhouette))
     # Shows the result.
     centers = self.model.clusterCenters()
     print("Cluster Centers: ")
     for center in centers:
         print(center)
def plot_silhouette(data):
    evaluator = ClusteringEvaluator()
    ks = list(range(2, 50))
    scores = []
    for k in ks:
        kmeans = KMeans(k=k)
        model = kmeans.fit(data)
        predictions = model.transform(data)
        scores.append(evaluator.evaluate(predictions))

    plt.plot(ks, scores)
    plt.show()
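A usage sketch for plot_silhouette; the random feature vectors are invented, and it assumes matplotlib, KMeans, and ClusteringEvaluator are imported as in the snippet above.

import random
from pyspark.sql import SparkSession
from pyspark.ml.linalg import Vectors

spark = SparkSession.builder.getOrCreate()
# 200 random 2-D points assembled into a "features" column.
rows = [(Vectors.dense([random.random(), random.random()]),) for _ in range(200)]
data = spark.createDataFrame(rows, ["features"])
plot_silhouette(data)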
Example #23
def findSimilar():

	#Dealing with the server request
	#project_ID = request.args.get('project_ID', None)
	project_ID = 'afd99a01739ad5557b51b1ba0174e832'
	projects.createOrReplaceTempView('projects')

	silhouette = []

	cols = ["Project_Subject_Category_Tree","Project_Subject_Subcategory_Tree","Project_Grade_Level_Category","Project_Resource_Category"]
	colsa = []

	#df = projects.select(cols)
	df = projects

	df = df.where(df.Project_Subject_Category_Tree.isNotNull())
	df = df.where(df.Project_Subject_Subcategory_Tree.isNotNull())
	df = df.where(df.Project_Grade_Level_Category.isNotNull())
	df = df.where(df.Project_Resource_Category.isNotNull())

	for i in range(len(cols)):
		stringIndexer = StringIndexer(inputCol=cols[i], outputCol=cols[i]+"a")
		model = stringIndexer.fit(df)
		df = model.transform(df)
		colsa.append(cols[i]+"a")

	for i in range(len(cols)):
		encoder = OneHotEncoder(inputCol=cols[i]+"a", outputCol=cols[i]+"v")
		encoded = encoder.transform(df)	

		
	assembler = VectorAssembler(
	inputCols=colsa,
	outputCol="features")
	output = assembler.transform(encoded)

	kmax = 10; #optimal K happens at k=4

	for i in range(2,kmax):
		# Trains a k-means model.
		kmeans = KMeans().setK(i).setSeed(1)
		model = kmeans.fit(output)
		# Evaluate clustering by computing Silhouette score
		predictions = model.transform(output)

		evaluator = ClusteringEvaluator()
		silhouette.append([i,evaluator.evaluate(predictions)])

	k_optimal = int(max(silhouette, key=lambda x: x[1])[0])
	kmeans = KMeans().setK(k_optimal).setSeed(1)
def compute_metrics(model_list, i, metric, distance):
    from pyspark.ml.evaluation import ClusteringEvaluator

    # Evaluate clustering by computing Silhouette score
    evaluator = ClusteringEvaluator(distanceMeasure=distance)

    if metric == "wsse":
        res = model_list[i].summary.trainingCost
    elif metric == "asw":
        res = evaluator.evaluate(model_list[i].summary.predictions)
    else:
        print(
            "WARNING: wrong metric specified. Use either \"wsse\" or \"asw\".")
        return (None)
    return (res)
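A hedged usage sketch for compute_metrics; the toy DataFrame and the pre-fitted model list are assumptions.

from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.clustering import KMeans

spark = SparkSession.builder.getOrCreate()
raw = spark.createDataFrame(
    [(0.0, 0.1), (0.2, 0.0), (5.0, 5.1), (5.2, 4.9), (9.0, 9.1), (9.2, 8.9)],
    ["x", "y"])
data = VectorAssembler(inputCols=["x", "y"], outputCol="features").transform(raw)
# Fit one model per candidate k; compute_metrics indexes into this list.
model_list = [KMeans(k=k, seed=1).fit(data) for k in (2, 3)]
print(compute_metrics(model_list, 0, "wsse", "squaredEuclidean"))
print(compute_metrics(model_list, 0, "asw", "squaredEuclidean"))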
Example #25
    def __train_model(self):
        """Train the model with the current dataset
        """
        logger.info("Splitting dataset into 3...")
        # Model 0: first 1/3 of the data.
        # Model 1: first 1/3 + second 1/3 of the data.
        # Model 2: all of the data.
        self.df0 = self.dforiginal.limit(int(self.dataset_count / 3))
        self.df1 = self.dforiginal.limit(int(self.dataset_count * 2 / 3))
        self.df2 = self.dforiginal
        print('df 0 count = ' + str(self.df0.count()))
        print('df 1 count = ' + str(self.df1.count()))
        print('df 2 count = ' + str(self.df2.count()))
        logger.info("Dataset Splitted !")

        logger.info("Training model 0...")
        kmeans_0 = KMeans().setK(5).setSeed(1)
        model_0 = kmeans_0.fit(self.df0)
        self.predictions_0 = model_0.transform(self.df0)
        logger.info("Model 0 built!")
        logger.info("Evaluating the model 0...")
        evaluator_0 = ClusteringEvaluator()
        silhouette_0 = evaluator_0.evaluate(self.predictions_0)
        logger.info("Silhouette with squared euclidean distance = " +
                    str(silhouette_0))
        self.centers_0 = model_0.clusterCenters()
        logger.info("Model 0 Done !")

        logger.info("Training model 1...")
        kmeans_1 = KMeans().setK(5).setSeed(1)
        model_1 = kmeans_1.fit(self.df1)
        self.predictions_1 = model_1.transform(self.df1)
        logger.info("Model 1 built!")
        logger.info("Evaluating the model 1...")
        evaluator_1 = ClusteringEvaluator()
        silhouette_1 = evaluator_1.evaluate(self.predictions_1)
        logger.info("Silhouette with squared euclidean distance = " +
                    str(silhouette_1))
        self.centers_1 = model_1.clusterCenters()
        logger.info("Model 1 Done !")

        logger.info("Training model 2...")
        kmeans_2 = KMeans().setK(5).setSeed(1)
        model_2 = kmeans_2.fit(self.df2)
        self.predictions_2 = model_2.transform(self.df2)
        logger.info("Model 2 built!")
        logger.info("Evaluating the model 2...")
        evaluator_2 = ClusteringEvaluator()
        silhouette_2 = evaluator_2.evaluate(self.predictions_2)
        logger.info("Silhouette with squared euclidean distance = " +
                    str(silhouette_2))
        self.centers_2 = model_2.clusterCenters()
        logger.info("Model 2 Done !")
def kmeans_algorithm(dataframe):

    kmeans = KMeans().setK(5).setSeed(1)
    model = kmeans.fit(dataframe)

    predictions = model.transform(dataframe)

    evaluator = ClusteringEvaluator()
    silhouette = evaluator.evaluate(predictions)
    print("Silhouette with squared euclidean distance = " + str(silhouette))

    centers = model.clusterCenters()
    print("Cluster Centers: ")

    for center in centers:
        print(center)
def get_kmeans_scores(model, dataset):
    # Evaluate clustering by computing Within Set Sum of Squared Errors.
    # Note: KMeansModel.computeCost was deprecated in Spark 2.4 and removed in 3.0;
    # on newer versions use model.summary.trainingCost instead.
    wssse = model.computeCost(dataset)
    print("Within Set Sum of Squared Errors = " + str(wssse))

    # Shows the result.
    centers = model.clusterCenters()
    print("Cluster Centers: ")
    for center in centers:
        print(center)

    # Evaluate clustering by computing Silhouette score
    evaluator = ClusteringEvaluator()
    predictions = model.transform(dataset)
    silhouette = evaluator.evaluate(predictions)
    print("Silhouette with squared euclidean distance = " + str(silhouette))
def metric(data, metric):
    # If for some reason Spark has gone down (e.g. someone accidentally broke the session
    # while trying to run another task), try to finish the remaining computation locally
    # on a single node.
    try:
        spark_context = SparkSession.getActiveSession().sparkContext
        SQLContext(spark_context).clearCache()
    except AttributeError:
        spark_context = SparkContext.getOrCreate(
            SparkConf().setMaster("local[*]"))

        spark = SparkSession \
            .builder \
            .getOrCreate()

    data = data.drop('probability')
    try:
        if metric == 'sil':
            res = -ClusteringEvaluator(
                predictionCol='labels',
                distanceMeasure='squaredEuclidean').evaluate(data)
        elif metric == 'ch':
            res = ChIndex().find(data, spark_context)
        elif metric == 'db':
            res = DaviesIndex().find(data, spark_context)
        return res
    except TypeError:
        print("\n\nTYPE ERROR OCCURED IN Metric.py:\n\nDATA: {}\n\n".format(
            data))
        return 0
    except Py4JJavaError:
        print("\n\nPy4JJavaError ERROR OCCURED IN Metric.py:\n\nDATA: {}\n\n".
              format(data.printSchema()))
        return sys.float_info.max
Example #29
 def __optimal_k_kmeans_gmm_spark(cls, feature_col, df_norm, n_iterations, kmeans_method, sc):
     from pyspark.ml.clustering import KMeans, GaussianMixture
     from pyspark.ml.evaluation import ClusteringEvaluator
     start_time = time.time()
     #convert to spark df
     sqlContext = SQLContext(sc)
     spark_df = sqlContext.createDataFrame(df_norm)
     #assemble vector
     vecAssembler = VectorAssembler(inputCols=feature_col, outputCol="features")
     spark_df_clustering = vecAssembler.transform(spark_df).select('features')
     n_components_list = []
     n_range = np.arange(2, 20)
     for iteration in np.arange(n_iterations):
         silh_val = []
         cost = []
         for k in n_range:
             if kmeans_method.lower() == 'kmeans':
                 print("Kmeans Elbow Method K = ", k)
                 kmeans = KMeans().setK(k).setSeed(1).setFeaturesCol("features")
                 model = kmeans.fit(spark_df_clustering)
                 cost.append(model.computeCost(spark_df_clustering)) # requires Spark 2.0 or later
             elif kmeans_method.lower() == 'gmm':
                 print("Gmm Elbow Method K = ", k)
                 gmm = GaussianMixture().setK(k).setSeed(1).setFeaturesCol("features")
                 model = gmm.fit(spark_df_clustering)
             #cost.append(model.computeCost(spark_df_clustering)) # requires Spark 2.0 or later
             predictions = model.transform(spark_df_clustering)
             # Evaluate clustering by computing Silhouette score
             evaluator = ClusteringEvaluator()
             silhouette = evaluator.evaluate(predictions)
             silh_val.append(silhouette)
         print('Cluster List: ', list(n_range))
         print('Silhouette score: ', silh_val)
         print('Sum of Square Distance Score: ', cost)
         n_split_silh = n_range[silh_val.index(np.max(silh_val))]
         if len(cost)>0:
             n_split_knee = cls.__knee_locator(n_range, cost, 'convex', 'decreasing', 'sum_of_square_error')
             print('Knee of sum of square distance: ', str(n_split_knee))
         else:
             n_split_knee = n_split_silh
         print("Recommended no. of components by Silhouette Score: " + str(n_split_silh))
         n_clusters = math.ceil(np.median([n_split_knee, n_split_silh]))
         n_components_list.append(n_clusters)
     n_components = int(np.median(n_components_list).round(0))
     print('Recommended median number of splits: ', n_components)
     print("training time: ", time.time()-start_time, "(sec)")
     return n_components
def run():
    dataset = spark.read.format("parquet").load(
        "hdfs:///user/spark/warehouse/kmeans-data.parquet")
    assembler = VectorAssembler(
        inputCols=["c{}".format(x) for x in range(0, 14)],
        outputCol="features")
    dataset = assembler.transform(dataset)

    kmeans = KMeans().setK(3).setSeed(1)
    model = kmeans.fit(dataset)
    predictions = model.transform(dataset)
    evaluator = ClusteringEvaluator()

    silhouette = evaluator.evaluate(predictions)
    # print("Silhouette with squared euclidean distance = " + str(silhouette))

    centers = model.clusterCenters()
def evaluateCluster(model, df):
    from pyspark.ml.clustering import KMeans
    from pyspark.ml.evaluation import ClusteringEvaluator
    # Note: computeCost was removed in Spark 3.0; use model.summary.trainingCost on newer versions.
    wssse = model.computeCost(df.select('features'))
    print("Within Set Sum of Squared Errors = " + str(wssse))

    evaluator = ClusteringEvaluator()

    predictions = model.transform(df)
    silhouette = evaluator.evaluate(predictions)
    print("Silhouette with squared euclidean distance = " + str(silhouette))

    # Shows the result.
    centers = model.clusterCenters()
    print("Cluster Centers: ")
    for center in centers:
        print(center)
Example #32
    spark = SparkSession\
        .builder\
        .appName("KMeansExample")\
        .getOrCreate()

    # $example on$
    # Loads data.
    dataset = spark.read.format("libsvm").load("data/mllib/sample_kmeans_data.txt")

    # Trains a k-means model.
    kmeans = KMeans().setK(2).setSeed(1)
    model = kmeans.fit(dataset)

    # Make predictions
    predictions = model.transform(dataset)

    # Evaluate clustering by computing Silhouette score
    evaluator = ClusteringEvaluator()

    silhouette = evaluator.evaluate(predictions)
    print("Silhouette with squared euclidean distance = " + str(silhouette))

    # Shows the result.
    centers = model.clusterCenters()
    print("Cluster Centers: ")
    for center in centers:
        print(center)
    # $example off$

    spark.stop()