def cluster(self, df, session, repartition_num=8):
        n = df.count()
        # index rows
        df_index = df.select((row_number().over(
            Window.partitionBy(lit(0)).orderBy(self.featureCol)) -
                              1).alias('id'), "*")
        df_features = df_index.select('id', self.featureCol)

        # prep for joining
        df_features = df_features.repartitionByRange(repartition_num, 'id')

        left_df = df_features.select(
            df_features['id'].alias('left_id'),
            df_features[self.featureCol].alias('left_features'))
        right_df = df_features.select(
            df_features['id'].alias('right_id'),
            df_features[self.featureCol].alias('right_features'))

        # join on self where left_id does not equal right_id
        joined_df = left_df.join(right_df,
                                 left_df['left_id'] != right_df['right_id'])

        # compute cosine similarity between vectors
        joined_df = joined_df.select(
            'left_id', 'right_id',
            cosine_similarity_udf(
                array(joined_df['left_features'],
                      joined_df['right_features'])).alias('norm'))
        ranked = joined_df.select(
            'left_id', 'right_id',
            rank().over(
                Window.partitionBy('left_id').orderBy('norm')).alias('rank'))
        knn = ranked.where(ranked['rank'] <= 5)
        knn_grouped = knn.groupBy('left_id').agg(
            f.collect_list('right_id').alias('nn'))

        # generate laplacian
        laplacian = knn_grouped.select(
            'left_id',
            laplacian_vector_udf(knn_grouped['left_id'], knn_grouped['nn'],
                                 lit(n),
                                 lit(self.k_nearest)).alias('lap_vector'))

        laplacian_matrix = RowMatrix(
            laplacian.select('lap_vector').rdd.map(lambda x: x[0]))
        eigenvectors = laplacian_matrix.computePrincipalComponents(
            k=self.num_eigenvectors)

        eigenvectors = [
            (idx, Vectors.dense([float(item) for item in row]))
            for idx, row in enumerate(eigenvectors.toArray().tolist())
        ]

        eigen_df = session.createDataFrame(eigenvectors,
                                           ['id', self.featureCol])
        model = KMeans(featuresCol=self.featureCol,
                       predictionCol=self.predictionCol,
                       k=self.k).fit(eigen_df)
        predictions = model.transform(eigen_df).join(df_index, on='id')
        return predictions
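
The helper UDFs referenced above (cosine_similarity_udf, laplacian_vector_udf, and toVector_udf in the later variants) are not defined in these snippets. The sketch below is an assumption of what they might look like, with the feature pair passed as an array column and each Laplacian row built from the k-nearest-neighbour list; the original project's definitions may differ.

import numpy as np
from pyspark.sql import functions as f
from pyspark.sql.types import ArrayType, DoubleType
from pyspark.ml.linalg import Vectors, VectorUDT


@f.udf(returnType=DoubleType())
def cosine_similarity_udf(pair):
    # pair = array(left_features, right_features); treat each entry as a numeric vector
    a = np.asarray(pair[0], dtype=float)
    b = np.asarray(pair[1], dtype=float)
    return float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)))


@f.udf(returnType=ArrayType(DoubleType()))
def laplacian_vector_udf(row_id, neighbours, n, k_nearest):
    # one row of an (unnormalized) graph Laplacian: the node degree on the
    # diagonal and -1 for each of its k nearest neighbours
    row = [0.0] * n
    row[row_id] = float(k_nearest)
    for j in neighbours:
        row[j] = -1.0
    return row


@f.udf(returnType=VectorUDT())
def toVector_udf(values):
    # wrap a plain array column as an ml DenseVector so PCA / RowMatrix accept it
    return Vectors.dense(values)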
Example 2
def cluster(data, n_clusters):
    # fit k-means on the pre-scaled "features" column of the input DataFrame
    model = KMeans().setK(n_clusters).setSeed(1).setFeaturesCol("features").fit(data)
    centers = model.clusterCenters()
    #print("Cluster Centers: ")
    #for center in centers:
    #    print(center)
    # cluster label for each row
    cl_labels = model.transform(data).select('prediction')
    # size of each cluster
    gr = cl_labels.groupBy("prediction").count()
    return cl_labels, gr
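
A hedged usage sketch for this helper. The raw_df, the input column names, and the scaling stage below are illustrative assumptions; the function itself only expects a DataFrame with a pre-built "features" column.

from pyspark.ml.feature import VectorAssembler, StandardScaler

# illustrative preprocessing: raw_df and the column names are assumptions
assembled = VectorAssembler(inputCols=["x1", "x2", "x3"],
                            outputCol="raw").transform(raw_df)
scaled = StandardScaler(inputCol="raw", outputCol="features") \
    .fit(assembled).transform(assembled)

labels, sizes = cluster(scaled, n_clusters=5)
sizes.show()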
Example 3
    def cluster(self, df, session, repartition_num=8):
        n = df.count()
        # index rows
        df_index = df.select((row_number().over(
            Window.partitionBy(lit(0)).orderBy(self.featureCol)) -
                              1).alias('id'), "*")
        df_features = df_index.select('id', self.featureCol)

        # prep for joining
        df_features = df_features.repartitionByRange(repartition_num, 'id')

        left_df = df_features.select(
            df_features['id'].alias('left_id'),
            df_features[self.featureCol].alias('left_features'))
        right_df = df_features.select(
            df_features['id'].alias('right_id'),
            df_features[self.featureCol].alias('right_features'))

        # join on self where left_id does not equal right_id
        joined_df = left_df.join(right_df,
                                 left_df['left_id'] != right_df['right_id'])

        # compute cosine similarity between vectors
        joined_df = joined_df.select(
            'left_id', 'right_id',
            cosine_similarity_udf(
                array(joined_df['left_features'],
                      joined_df['right_features'])).alias('norm'))
        ranked = joined_df.select(
            'left_id', 'right_id',
            rank().over(
                Window.partitionBy('left_id').orderBy('norm')).alias('rank'))
        knn = ranked.where(ranked['rank'] <= 5)
        knn_grouped = knn.groupBy('left_id').agg(
            f.collect_list('right_id').alias('nn'))

        # generate laplacian
        laplacian = knn_grouped.select(
            knn_grouped['left_id'].alias('id'),
            toVector_udf(
                laplacian_vector_udf(knn_grouped['left_id'], knn_grouped['nn'],
                                     lit(n),
                                     lit(self.k_nearest))).alias('lap_vector'))

        pca = PCA(k=self.num_eigenvectors,
                  inputCol='lap_vector',
                  outputCol='features').fit(laplacian)
        eigenvectors = pca.transform(laplacian).select('id', 'features')

        model = KMeans(featuresCol='features',
                       predictionCol=self.predictionCol,
                       k=self.k).fit(eigenvectors)
        predictions = model.transform(eigenvectors).join(df_index, on='id')
        return predictions
Example 4
def kmeans(params):
    path = params[0]
    k = int(params[1])
    iterations = int(params[2])
    target_dir = params[3]

    try:
        # Creating session
        spark_session = SparkSession.builder.appName(
            "project4-jwj").getOrCreate()

        # loading the files from HDFS and getting a DataFrame
        data = spark_session.read.format("csv").option("header", "true").load(
            "{}/*.csv".format(path))
        #data.show()
        # Getting the column names
        columns = data.columns
        # Removing null rows
        for i in columns:
            data = data.filter(col(i).isNotNull())

        # Breaking the content column into individual words
        tokenizer = Tokenizer(inputCol="content", outputCol="Words")
        tokenized = tokenizer.transform(data)
        #tokenized.show()
        # Removing stop words
        remover = StopWordsRemover(inputCol="Words", outputCol="Filtered")
        removed = remover.transform(tokenized)
        #removed.show()

        # Term frequency - inverse document frequency
        hashingTF = HashingTF(inputCol="Filtered",
                              outputCol="rawFeatures",
                              numFeatures=3000)

        # Getting the term-frequency vectors to train k-means
        featurizedData = hashingTF.transform(removed)
        #featurizedData.show()

        idf = IDF(inputCol="rawFeatures", outputCol="features", minDocFreq=5)
        idfModel = idf.fit(featurizedData)
        rescaledData = idfModel.transform(featurizedData)
        rescaledData.show()
        # Train KMeans
        kmean = KMeans().setK(k).setMaxIter(iterations).fit(rescaledData)
        clustersTable = kmean.transform(rescaledData)
        clustersTable.show()
        clustersTable.select("title", "prediction").repartition(
            1).write.format("com.databricks.spark.csv").save(target_dir)
    except Exception as e:
        print(str(e), file=sys.stderr)
        sys.exit(1)
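
A hedged sketch of how this job might be invoked; the argument order follows the function body (input path, k, max iterations, target directory), and the entry-point wrapper is an assumption.

import sys

if __name__ == "__main__":
    # e.g. spark-submit job.py hdfs:///articles 8 20 hdfs:///out/clusters
    kmeans(sys.argv[1:])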
Example 5
def process(sc):
    hiveContext = HiveContext(sc)
    hql = "select * from kmeans_cluster_feature where pt = '%s'" % (pt)
    df_raw = hiveContext.sql(hql).repartition(160)
    columns = df_raw.columns[1: -2]    
    feature_num = len(columns)
    # type
    #df_tmp = df_raw
    #for k, i in zip(columns, range(feature_num)):
    #    df_tmp = df_tmp.withColumn(k, df_tmp[i + 1] * 1.0)
    # Imputer
    mean_value = df_raw.describe().collect()[1]
    print(mean_value)
    df_train = df_raw
    for k, i in zip(columns, range(feature_num)):
        df_train = df_train.na.fill({k:mean_value[i + 1]})
    # minmax
    vecAssembler = VectorAssembler(inputCols=columns, outputCol="features")
    df_b_s = vecAssembler.transform(df_train)
    mmScaler = MinMaxScaler(inputCol="features", outputCol="scaled")
    model = mmScaler.fit(df_b_s)
    df_scaled = model.transform(df_b_s)
    # kmeans
    n_clusters_ = 20
    model = KMeans(k=n_clusters_, initSteps=10, maxIter=300, featuresCol='scaled').fit(df_scaled)
    df_result = model.transform(df_scaled)
    # map
    global sensitivity_1, sensitivity_3
    sensitivity_1 = []
    sensitivity_2 = []
    sensitivity_3 = []
    key_cnt = []
    centers = model.clusterCenters()
    for xx, yy in zip(centers, range(n_clusters_)):
        key_cnt.append([yy, xx[0]])
    sorted_cluster = sorted(key_cnt, key=lambda asd: asd[1])
    split = n_clusters_ // 3
    split_end = n_clusters_ - split
    for xx, yy in zip(sorted_cluster, range(n_clusters_)):
        if yy < split:
            sensitivity_3.append(xx[0])
        elif yy >= split_end:
            sensitivity_1.append(xx[0])
        else:
            sensitivity_2.append(xx[0])
    #result
    df_result.rdd.map(result_process).saveAsTextFile("kmeans_cluster_result/pt=%s/" % (pt))
Example 6
def user_cluster_model(spark, ratings, movies, k, genres):
    """ Returns a clustering model for users' genre preferences """

    # Get all user ids
    all_user_ids = ratings.select("userId").distinct().rdd.flatMap(
        lambda x: x).collect()

    # Calculate scores for each user
    scores = user_genre_scores(spark, ratings, movies, all_user_ids)\
        .sort(col("userId"), col("genre"))

    # Convert genres in rows to columns
    scores = scores.groupBy("userId").pivot("genre").agg(
        first("score")).na.fill(0)

    # Ignore movies without genres
    if "(no genres listed)" in scores.columns:
        scores = scores.drop("(no genres listed)")
    scores.cache()

    # Find genres in dataset used
    genres_in_scores = scores.drop("userId").columns

    # Train a k-means model
    scores = VectorAssembler(inputCols=genres_in_scores,
                             outputCol="features").transform(scores)
    kmeans_model = KMeans().setK(k).setSeed(5052).fit(scores)

    # Save genres used in model to model object
    kmeans_model.genres = genres_in_scores

    # Calculate silhouette score & save to model
    train_predictions = kmeans_model.transform(scores)
    kmeans_model.silhouette_score = ClusteringEvaluator().evaluate(
        train_predictions)

    return kmeans_model
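
A hedged usage sketch; the ratings and movies DataFrames, the value of k, and the genres argument are assumptions (the function derives the genre columns from the data itself and relies on a user_genre_scores helper not shown here).

# illustrative only: the inputs and k are assumptions
kmeans_model = user_cluster_model(spark, ratings, movies, k=8, genres=None)
print(kmeans_model.genres)            # genre columns the model was trained on
print(kmeans_model.silhouette_score)  # silhouette on the training assignments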
Example 7
def chartShow():
    data = train()
    # ---- K-means clustering ----
    print("------------------ K-means clustering --------------------")
    print("------------ Set different values of K, cluster, and compute the sum of squared errors ------------")

    errors = []
    results = []
    centers = []

    for k in range(2, 10):
        # fit the model
        kmeansmodel = KMeans().setK(k).setFeaturesCol('feature').setPredictionCol('prediction').fit(data)
        print("With K={}".format(k))

        # dataset with the predicted cluster labels
        kmeans_results = kmeansmodel.transform(data).collect()
        results.append(kmeans_results)
        #     for item in kmeans_results:
        #         print(item)
        #         print(str(item[0]) + ' is predicted as cluster ' + str(item[1]))

        # get all of the model's cluster centers
        kmeans_centers = kmeansmodel.clusterCenters()
        centers.append(kmeans_centers)
        center_seq = 0
        print(len(kmeans_centers))
        for item in kmeans_centers:
            print(item)
            #         print("Cluster" + str(center_seq) + "  Center" + str(item))
            center_seq = center_seq + 1

        # compute the Within Set Sum of Squared Error (WSSSE)
        WSSSE = kmeansmodel.computeCost(data)
        errors.append(WSSSE)
        print("Within Set Sum of Squared Error = " + str(WSSSE))

        print('--' * 30 + '\n')

    # ---- WSSSE visualization ----
    plt.figure()
    k_number = range(2, 10)
    plt.plot(k_number, errors)
    plt.xlabel('Number of K')
    plt.ylabel('WSSSE')
    plt.title('K-WSSSE')

    # ---- visualize the clustering results ----
    print("--------- Convert the data to a pandas structure and view the cluster centers in a 3D plot -----------")
    # according to the K-WSSSE plot, the clustering looks good around k=6
    k = 4

    cluster_vis = plt.figure(figsize=(10, 10)).gca(projection='3d')

    for item in results[k - 2]:
        if item[1] == 0:
            cluster_vis.scatter(item[0][0], item[0][1], item[0][2], c='b')  # blue
        if item[1] == 1:
            cluster_vis.scatter(item[0][0], item[0][1], item[0][2], c='y')  # yellow
        if item[1] == 2:
            cluster_vis.scatter(item[0][0], item[0][1], item[0][2], c='m')  # magenta
        if item[1] == 3:
            cluster_vis.scatter(item[0][0], item[0][1], item[0][2], c='k')  # black
        if item[1] == 4:
            cluster_vis.scatter(item[0][0], item[0][1], item[0][2], c='g')  # green
        if item[1] == 5:
            cluster_vis.scatter(item[0][0], item[0][1], item[0][2], c='c')  # cyan

    for item in centers[k - 2]:
        cluster_vis.scatter(item[0], item[1], item[2], c='r', marker='p')  # red, pentagon marker

    plt.show()
Example 8
    # ---- K-means clustering ----
    print("------------------ K-means clustering --------------------")
    print("------------ Set different values of K, cluster, and compute the sum of squared errors ------------")

    errors = []
    results = []
    centers = []

    for k in range(2, 10):
        kmeansmodel = KMeans().setK(k).setFeaturesCol(
            'iris_features').setPredictionCol('prediction').fit(iris_DF)

        print("With K={}".format(k))

        # dataset with the predicted cluster labels
        kmeans_results = kmeansmodel.transform(iris_DF).collect()
        results.append(kmeans_results)
        for item in kmeans_results:
            print(str(item[0]) + ' is predicted as cluster ' + str(item[1]))

        # get all of the model's cluster centers
        kmeans_centers = kmeansmodel.clusterCenters()
        centers.append(kmeans_centers)
        center_seq = 0
        for item in kmeans_centers:
            print("Cluster" + str(center_seq) + "  Center" + str(item))
            center_seq = center_seq + 1

        # compute the Within Set Sum of Squared Error (WSSSE)
        WSSSE = kmeansmodel.computeCost(iris_DF)
        errors.append(WSSSE)
Example 9
    def cluster(self, df, session, repartition_num=8):
        n = df.count()
        # index rows
        df_index = df.select(
            (row_number().over(
                Window.partitionBy(lit(0)).orderBy(self.featureCol)) -
             1).alias("id"),
            "*",
        )
        df_features = df_index.select("id", self.featureCol)

        # prep for joining
        df_features = df_features.repartitionByRange(repartition_num, "id")

        left_df = df_features.select(
            df_features["id"].alias("left_id"),
            df_features[self.featureCol].alias("left_features"),
        )
        right_df = df_features.select(
            df_features["id"].alias("right_id"),
            df_features[self.featureCol].alias("right_features"),
        )

        # join on self where left_id does not equal right_id
        joined_df = left_df.join(right_df,
                                 left_df["left_id"] != right_df["right_id"])

        # compute cosine similarity between vectors
        joined_df = joined_df.select(
            "left_id",
            "right_id",
            self.distance(
                array(joined_df["left_features"],
                      joined_df["right_features"])).alias("norm"),
        )
        ranked = joined_df.select(
            "left_id",
            "right_id",
            rank().over(
                Window.partitionBy("left_id").orderBy("norm")).alias("rank"),
        )
        knn = ranked.where(ranked["rank"] <= 5)
        knn_grouped = knn.groupBy("left_id").agg(
            f.collect_list("right_id").alias("nn"))

        # generate laplacian
        laplacian = knn_grouped.select(
            knn_grouped["left_id"].alias("id"),
            toVector_udf(
                laplacian_vector_udf(
                    knn_grouped["left_id"],
                    knn_grouped["nn"],
                    lit(n),
                    lit(self.k_nearest),
                )).alias("lap_vector"),
        )

        laplacian_matrix = RowMatrix(
            laplacian.select("lap_vector").rdd.map(lambda x: x[0].toArray()),
            -1, -1)

        svd = laplacian_matrix.computeSVD(
            k=laplacian_matrix.numRows()  # self.num_eigenvectors
        )
        eigenvectors = [
            (idx, Vectors.dense([float(item) for item in row]))
            for idx, row in enumerate(svd.V.toArray()[:, -self.k:].tolist())
        ]

        eigen_df = session.createDataFrame(eigenvectors,
                                           ["id", self.featureCol])
        model = KMeans(featuresCol=self.featureCol,
                       predictionCol=self.predictionCol,
                       k=self.k).fit(eigen_df)
        predictions = model.transform(eigen_df).join(df_index, on="id")
        return predictions
Example 10
# MAGIC
# MAGIC **Note:** This command runs multiple Spark jobs (one job per iteration of the KMeans algorithm). You will see the progress bar start over and over again.

# COMMAND ----------

from pyspark.ml.clustering import KMeans

model = KMeans().setK(2).fit(trainingData)

# COMMAND ----------

# MAGIC %md To see the result of our clustering, we produce a scatter plot matrix that shows the interaction between the input variables and the learned clusters. To get that, we apply the model to the original data and pick four columns: `prediction` and the original features (`duration`, `tempo`, and `loudness`).

# COMMAND ----------

transformed = model.transform(trainingData).select("duration", "tempo", "loudness", "prediction")

# COMMAND ----------

# MAGIC %md To comfortably visualize the data we produce a random sample.
# MAGIC Remember the `display()` function? We can use it to produce a nicely rendered table of the transformed DataFrame.

# COMMAND ----------

display(transformed.sample(False, fraction=0.005))

# COMMAND ----------

# MAGIC %md To generate a scatter plot matrix, click on the plot button below the table and select `scatter`. That will transform your table into a scatter plot matrix. It automatically picks all numeric columns as values. To include the predicted clusters, click on `Plot Options` and drag `prediction` to the list of Keys. You will get the following plot. The diagonal panels show the PDF of the marginal distribution of each variable. The off-diagonal panels show a scatter plot between the two variables of the corresponding row and column. For example, the top-right panel shows the scatter plot between duration and loudness. Each point is colored according to the cluster it is assigned to.

# COMMAND ----------
Example 11
# TFIDF
tfidf_dataFrame = genre_and_sentences_after_flatmap.toDF(["genre","sentence"])
tokenizer = Tokenizer(inputCol="sentence", outputCol="words")
tfidf_words_data = tokenizer.transform(tfidf_dataFrame)

hashing_tf = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=512)
tfidf_featurized_data = hashing_tf.transform(tfidf_words_data)

idf_model = IDF(inputCol="rawFeatures", outputCol="features").fit(tfidf_featurized_data)
tfidf_rescaled_data = idf_model.transform(tfidf_featurized_data)
tfidf_genre_features = tfidf_rescaled_data.select("genre", "features")

# Confusion matrix for TFIDF
tfidf_kmeansmodel = KMeans().setK(5).setFeaturesCol('features').setPredictionCol('prediction').fit(tfidf_genre_features)
tfidf_predictions = tfidf_kmeansmodel.transform(tfidf_genre_features).select("prediction", "genre")
tfidf_res = tfidf_predictions.groupBy(['prediction', 'genre']).count().collect()
print("Confusion matrix for TFIDF:")
toPrint(tfidf_res)
print()

#######################################################################
## Vocabulary Exploration - Part B                                   ##
#######################################################################

# pretrained
pretrained_genre_features = genre_and_sentences_after_flatmap.mapPartitions(emb)
pretrained_dataFrame = pretrained_genre_features.map(toList).toDF(["genre","features"])

new_schema = ArrayType(DoubleType(), containsNull=False)
udf_foo = udf(lambda x:x, new_schema)
Example 12
# MAGIC %md We can now pass this new DataFrame to the `KMeans` model and ask it to categorize the rows in our data into two different classes (`setK(2)`). We place the model in a variable named `model`.
# MAGIC 
# MAGIC **Note:** This command runs multiple Spark jobs (one job per iteration of the KMeans algorithm). You will see the progress bar start over and over again.

# COMMAND ----------

from pyspark.ml.clustering import KMeans
model = KMeans().setK(2).fit(trainingData)

# COMMAND ----------

# MAGIC %md To see the result of our clustering, we produce a scatter plot matrix that shows the interaction between the input variables and the learned clusters. To get that, we apply the model to the original data and pick four columns: `prediction` and the original features (`duration`, `tempo`, and `loudness`).

# COMMAND ----------

transformed = model.transform(trainingData).select("duration", "tempo", "loudness", "prediction")

# COMMAND ----------

# MAGIC %md To comfortably visualize the data we produce a random sample. 
# MAGIC Remember the `display()` function? We can use it to produce a nicely rendered table of the transformed DataFrame.

# COMMAND ----------

display(transformed.sample(False, fraction = 0.005))

# COMMAND ----------

# MAGIC %md To generate a scatter plot matrix, click on the plot button below the table and select `scatter`. That will transform your table into a scatter plot matrix. It automatically picks all numeric columns as values. To include the predicted clusters, click on `Plot Options` and drag `prediction` to the list of Keys. You will get the following plot. The diagonal panels show the PDF of the marginal distribution of each variable. The off-diagonal panels show a scatter plot between the two variables of the corresponding row and column. For example, the top-right panel shows the scatter plot between duration and loudness. Each point is colored according to the cluster it is assigned to.

# COMMAND ----------
Example 13
k = args.k_clusters

if algorithm not in ['kmeans', 'gmm', 'lda', 'spectral']:
    raise ValueError('Not a valid algorithm')

ss = SparkSession.builder.getOrCreate()

df = ss.read.csv(path, header=True, inferSchema=True)

df_preprocessed = preprocessing(df, num_pca=num_pca_features)

df_preprocessed.write.parquet("preprocessed", mode="Overwrite")

if algorithm == 'kmeans':
    model = KMeans(k=k).setSeed(1).fit(df_preprocessed)
    predictions = model.transform(df_preprocessed)
elif algorithm == 'spectral':
    model = SpectralClustering(k=k, k_nearest=7)
    predictions = model.cluster(df_preprocessed, ss, repartition_num=num_nodes)
elif algorithm == 'lda':
    model = LDA(k=k, maxIter=10).fit(df_preprocessed)
    predictions = model.transform(df_preprocessed)
elif algorithm == 'gmm':
    model = GaussianMixture(k=k).fit(df_preprocessed)
    predictions = model.transform(df_preprocessed)


predictions.select([col for col in predictions.columns if col != 'features'])\
           .toPandas()\
           .to_csv(sys.stdout)
Example 14
from sklearn.grid_search import GridSearchCV


def f(x):
    rel = {}
    rel['features'] = Vectors.dense(float(x[1]), float(x[2]), float(x[3]),
                                    float(x[4]))
    rel['label'] = str(x[5]).strip("\"")
    return rel


spark = SparkSession.builder.appName("logistic_regression").getOrCreate()

df = spark.sparkContext.textFile("iris.txt").map(
    lambda line: line.split(",")).map(lambda p: Row(**f(p))).toDF()
"""创建Estimator并调用其fit()方法来生成相应的Transformer对象,
很显然,在这里KMeans类是Estimator,而用于保存训练后模型的KMeansModel类则属于Transformer"""
kmeans_model = KMeans().setK(3).setFeaturesCol("features").setPredictionCol(
    "prediction").fit(df)

results = kmeans_model.transform(df).collect()
for item in results:
    print(str(item[0]) + " is predicted as cluster " + str(item[1]))
"""有可以通过KMeansModel类自带的clusterCenter属性获取到模型的所有聚类中心情况"""
results2 = kmeans_model.clusterCenters()
for item in results2:
    print(item)
"""与MLLib下的实现相同,KMeansModel类也提供了计算集合内误差平方和(Within Set Sum of Squared Error, WSSSE)
的方法来度量聚类的有效性,在真实K值未知的情况下,该值的变化可以作为选取合适K值的一个重要参考"""
print(kmeans_model.computeCost(df))
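
`computeCost` is deprecated in recent Spark releases; a hedged alternative for the same snippet is to read the training cost from the model summary or to score a silhouette with ClusteringEvaluator, as other examples on this page do.

from pyspark.ml.evaluation import ClusteringEvaluator

predictions = kmeans_model.transform(df)
# silhouette score on the clustered data
silhouette = ClusteringEvaluator(featuresCol="features",
                                 predictionCol="prediction").evaluate(predictions)
print("Silhouette = " + str(silhouette))
# WSSSE-style training cost without computeCost (Spark >= 2.4)
print("Within Set Sum of Squared Error = " + str(kmeans_model.summary.trainingCost))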
Example 15
# Reference https://chrisalbon.com/machine_learning/trees_and_forests/random_forest_classifier_example/

import time
from pyspark.ml.clustering import KMeans
from pyspark.ml.evaluation import ClusteringEvaluator
from pyspark import SparkConf, SparkContext, SQLContext

conf = SparkConf().setMaster("local[*]")
sc = SparkContext(conf=conf)
spark = SQLContext(sc)

data = spark.read.format("libsvm").load(
    "D:\Outils\Spark\data\mllib\iris_libsvm.txt")

model = KMeans().setK(3)

model = model.fit(data)
predictions = model.transform(data)

evaluator = ClusteringEvaluator()

# silhouette score from ClusteringEvaluator
score = evaluator.evaluate(predictions)
print('Silhouette: ', score)

# within-set sum of squared errors from the model summary
wssse = model.summary.trainingCost
print("Within Set Sum of Squared Errors = " + str(wssse))

# Shows the result.
centers = model.clusterCenters()
print("Cluster Centers: ")
for center in centers:
    print(center)

Example 16

# COMMAND ----------

from pyspark.ml.feature import PCA as PCAml
from pyspark.ml.linalg import Vectors  # Pre 2.0 pyspark.mllib.linalg

pca = PCAml(k=2, inputCol="features", outputCol="pca")
model = pca.fit(df_)
df_pca = model.transform(df_)

# COMMAND ----------

model.explainedVariance

# COMMAND ----------

display(df_pca)

# COMMAND ----------

from pyspark.ml.clustering import KMeans

# Trains a k-means model.
model = KMeans().setParams(featuresCol="pca", k=20, seed=1).fit(df_pca)
Example 17
from pyspark.ml.clustering import KMeans
# search for the best value of k
for i in range(2,11):
    km = KMeans().setK(i).setSeed(4603).setFeaturesCol('feature').setPredictionCol('prediction')
    res_kmval = km.fit(clsdata_model).summary.trainingCost
    print(i,': ',res_kmval)
    

# k = 4
model = KMeans().setK(3).setSeed(4603).setFeaturesCol('feature').setPredictionCol('prediction').fit(clsdata_model)
res_km = model.transform(clsdata_model)
summary = model.summary
summary.clusterSizes
# [739011, 463649, 807578]
summary.trainingCost
# 7632810.723481619
model.clusterCenters()

model.save('kmeans3_model')
clsdata_vecform.createOrReplaceTempView('clsdata')
res_km.createOrReplaceTempView('reskm')
res4 = spark.sql('select c.*, r.prediction as prediction from clsdata c, reskm r where c.id = r.id').drop('feature')
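
The persisted model can later be reloaded with KMeansModel.load; a short hedged sketch of the round trip, reusing the clsdata_model DataFrame from above.

from pyspark.ml.clustering import KMeansModel

# reload the saved model and reuse it on data with the same 'feature' column
reloaded = KMeansModel.load('kmeans3_model')
res_reloaded = reloaded.transform(clsdata_model)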
Example 18
        print('Wrong number of arguments')
        sys.exit(-1)

    spark = SparkSession.builder.appName('part2').getOrCreate()
    df=spark.read.csv(sys.argv[1], inferSchema=True, header=True)\
            .select(['Street Code1','Street Code2','Street Code3','Vehicle Color'])

    assembler = VectorAssembler(
        inputCols=['Street Code1', 'Street Code2', 'Street Code3'],
        outputCol='features')

    data = assembler.transform(df)

    # Setup KMeans model and train
    model = KMeans(featuresCol='features', k=int(sys.argv[2])).fit(data)
    prediction = model.transform(data)

    # out of sample data
    newdf = spark.createDataFrame(
        [(34510, 10030, 34050, 'BLK')],
        ('Street Code1', 'Street Code2', 'Street Code3', 'Vehicle Color'))

    newpred = model.transform(assembler.transform(newdf))

    # cluster of the out of sample data and its color
    cluster = newpred.select('prediction').collect()[0].prediction
    color = newpred.select('Vehicle Color').collect()[0]['Vehicle Color']

    colors = prediction.filter(
        prediction['prediction'] == cluster).select('Vehicle Color').collect()