Example no. 1
from pyspark.ml.clustering import KMeans

def cluster(data, n_clusters):
    # Fit k-means on the input DataFrame (expects an assembled "features" column).
    model = KMeans().setK(n_clusters).setSeed(1).setFeaturesCol("features").fit(data)
    centers = model.clusterCenters()
    # Label every row with its cluster, then count the rows per cluster
    # (the original grouped countDistinct("prediction"), which is always 1 per group).
    cl_labels = model.transform(data).select('prediction')
    gr = cl_labels.groupBy("prediction").count()
    return cl_labels, gr
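
A minimal usage sketch, assuming a DataFrame whose numeric columns still need to be assembled into the "features" vector (the column names and the `spark` session are assumptions):

from pyspark.ml.feature import VectorAssembler

# Hypothetical input with three numeric columns.
raw = spark.createDataFrame([(1.0, 0.5, 3.2), (0.9, 0.4, 3.1), (5.0, 4.8, 0.2)],
                            ["c1", "c2", "c3"])
assembler = VectorAssembler(inputCols=["c1", "c2", "c3"], outputCol="features")
labels, sizes = cluster(assembler.transform(raw), n_clusters=2)
sizes.show()  # one row per cluster with its size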
Example no. 2
from pyspark.sql import HiveContext
from pyspark.ml.feature import VectorAssembler, MinMaxScaler
from pyspark.ml.clustering import KMeans

def process(sc):
    hiveContext = HiveContext(sc)
    # `pt` (the partition key) is assumed to be defined elsewhere.
    hql = "select * from kmeans_cluster_feature where pt = '%s'" % (pt)
    df_raw = hiveContext.sql(hql).repartition(160)
    columns = df_raw.columns[1:-2]
    feature_num = len(columns)
    # Impute missing values with each column's mean
    # (row 1 of describe() holds the per-column means, as strings).
    mean_value = df_raw.describe().collect()[1]
    print(mean_value)
    df_train = df_raw
    for i, k in enumerate(columns):
        df_train = df_train.na.fill({k: float(mean_value[i + 1])})
    # Min-max scaling of the assembled feature vector
    vecAssembler = VectorAssembler(inputCols=columns, outputCol="features")
    df_b_s = vecAssembler.transform(df_train)
    mmScaler = MinMaxScaler(inputCol="features", outputCol="scaled")
    model = mmScaler.fit(df_b_s)
    df_scaled = model.transform(df_b_s)
    # k-means on the scaled features
    n_clusters_ = 20
    model = KMeans(k=n_clusters_, initSteps=10, maxIter=300, featuresCol='scaled').fit(df_scaled)
    df_result = model.transform(df_scaled)
    # Map clusters to sensitivity buckets by the first coordinate of each center:
    # lowest third -> sensitivity_3, highest third -> sensitivity_1, the rest -> sensitivity_2.
    global sensitivity_1, sensitivity_2, sensitivity_3
    sensitivity_1 = []
    sensitivity_2 = []
    sensitivity_3 = []
    key_cnt = []
    centers = model.clusterCenters()
    for yy, xx in enumerate(centers):
        key_cnt.append([yy, xx[0]])
    sorted_cluster = sorted(key_cnt, key=lambda asd: asd[1])
    split = n_clusters_ // 3
    split_end = n_clusters_ - split
    for yy, xx in enumerate(sorted_cluster):
        if yy < split:
            sensitivity_3.append(xx[0])
        elif yy >= split_end:
            sensitivity_1.append(xx[0])
        else:
            sensitivity_2.append(xx[0])
    # Persist the per-row results (DataFrames no longer expose map(); go through the RDD).
    df_result.rdd.map(result_process).saveAsTextFile("kmeans_cluster_result/pt=%s/" % (pt))
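
result_process is referenced above but never defined in this snippet; a minimal sketch of what it might do, assuming each output line should carry a row id, its cluster, and the sensitivity bucket built above (the 'id' field name is hypothetical):

def result_process(row):
    # Hypothetical serializer: map the row's cluster to its sensitivity level
    # using the globals filled in by process(), then emit a tab-separated line.
    if row['prediction'] in sensitivity_1:
        level = 1
    elif row['prediction'] in sensitivity_3:
        level = 3
    else:
        level = 2
    return '\t'.join([str(row['id']), str(row['prediction']), str(level)])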
Example no. 3
from pyspark.ml.clustering import KMeans
# Search for the best k (elbow method on the training cost)
for i in range(2, 11):
    km = KMeans().setK(i).setSeed(4603).setFeaturesCol('feature').setPredictionCol('prediction')
    res_kmval = km.fit(clsdata_model).summary.trainingCost
    print(i, ':', res_kmval)
    

# k = 3
model = KMeans().setK(3).setSeed(4603).setFeaturesCol('feature').setPredictionCol('prediction').fit(clsdata_model)
res_km = model.transform(clsdata_model)
summary = model.summary
summary.clusterSizes
# [739011, 463649, 807578]
summary.trainingCost
# 7632810.723481619
model.clusterCenters()

model.save('kmeans3_model')
clsdata_vecform.createOrReplaceTempView('clsdata')
res_km.createOrReplaceTempView('reskm')
res4 = spark.sql('select c.*, r.prediction as prediction from clsdata c, reskm r where c.id = r.id').drop('feature')
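
The model persisted with model.save('kmeans3_model') can be reloaded later through the companion model class; a minimal sketch:

from pyspark.ml.clustering import KMeansModel

# Reload the persisted k-means model and reuse it on data with the same 'feature' column.
reloaded = KMeansModel.load('kmeans3_model')
res_reloaded = reloaded.transform(clsdata_model)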
Example no. 4
def chartShow():
    data = train()
    # ---- K-means clustering ----
    print("------------------ K-means clustering --------------------")
    print("------------ Try different values of K, cluster, and compute the sum of squared errors ------------")

    errors = []
    results = []
    centers = []

    for k in range(2, 10):
        # Fit the model
        kmeansmodel = KMeans().setK(k).setFeaturesCol('feature').setPredictionCol('prediction').fit(data)
        print("With K={}".format(k))

        # Dataset with the predicted cluster labels
        kmeans_results = kmeansmodel.transform(data).collect()
        results.append(kmeans_results)
        #     for item in kmeans_results:
        #         print(item)
        #         print(str(item[0]) + ' is predicted as cluster' + str(item[1]))

        # Get all of the model's cluster centers
        kmeans_centers = kmeansmodel.clusterCenters()
        centers.append(kmeans_centers)
        center_seq = 0
        print(len(kmeans_centers))
        for item in kmeans_centers:
            print(item)
            #         print("Cluster" + str(center_seq) + "  Center" + str(item))
            center_seq = center_seq + 1

        # Compute the Within Set Sum of Squared Errors (WSSSE)
        WSSSE = kmeansmodel.computeCost(data)
        errors.append(WSSSE)
        print("Within Set Sum of Squared Error = " + str(WSSSE))

        print('--' * 30 + '\n')

    # ---- WSSSE visualization ----
    plt.figure()
    k_number = range(2, 10)
    plt.plot(k_number, errors)
    plt.xlabel('Number of K')
    plt.ylabel('WSSSE')
    plt.title('K-WSSSE')

    # ---- Clustering result visualization ----
    print("--------- Convert the data to a pandas structure and view the 3D cluster centers ---------")
    # According to the K-WSSSE plot, the clustering looks good around k = 6
    k = 4

    cluster_vis = plt.figure(figsize=(10, 10)).add_subplot(projection='3d')

    # One color per cluster label 0..5: blue, yellow, magenta, black, green, cyan.
    colors = ['b', 'y', 'm', 'k', 'g', 'c']
    for item in results[k - 2]:
        cluster_vis.scatter(item[0][0], item[0][1], item[0][2], c=colors[item[1]])

    for item in centers[k - 2]:
        cluster_vis.scatter(item[0], item[1], item[2], c='r', marker='p')  # red pentagon markers

    plt.show()
    centers = []

    for k in range(2, 10):
        kmeansmodel = KMeans().setK(k).setFeaturesCol(
            'iris_features').setPredictionCol('prediction').fit(iris_DF)

        print("With K={}".format(k))

        #带有预测簇标签的数据集
        kmeans_results = kmeansmodel.transform(iris_DF).collect()
        results.append(kmeans_results)
        for item in kmeans_results:
            print(str(item[0]) + ' is predcted as cluster' + str(item[1]))

        #获取到模型的所有聚类中心情况
        kmeans_centers = kmeansmodel.clusterCenters()
        centers.append(kmeans_centers)
        center_seq = 0
        for item in kmeans_centers:
            print("Cluster" + str(center_seq) + "  Center" + str(item))
            center_seq = center_seq + 1

        #计算集合内误差平方和(Within Set Sum of Squared Error, WSSSE)
        WSSSE = kmeansmodel.computeCost(iris_DF)
        errors.append(WSSSE)
        print("Within Set Sum of Squared Error = " + str(WSSSE))

        print('--' * 30 + '\n')

    #----WSSSE可视化----
    plt.figure()
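
train() and iris_DF are not defined in this snippet; a minimal sketch of the data preparation chartShow appears to expect, where the input path and column choice are assumptions:

from pyspark.ml.feature import VectorAssembler

def train():
    # Hypothetical loader: assemble the numeric columns into the 'feature'
    # vector column that the KMeans stages above consume.
    raw = spark.read.csv("data.csv", header=True, inferSchema=True)  # path is an assumption
    assembler = VectorAssembler(inputCols=raw.columns[:3], outputCol="feature")
    return assembler.transform(raw)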
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# predictionAndLabels is assumed to be defined upstream of this fragment.
evaluator = MulticlassClassificationEvaluator(metricName="f1")
print("F1 Score = " + str(evaluator.evaluate(predictionAndLabels)))

# COMMAND ----------

from pyspark.ml.clustering import KMeans

# Trains a k-means model.
model = KMeans().setK(20).setSeed(1).fit(df_)

# Evaluate clustering by computing Within Set Sum of Squared Errors.
wssse = model.computeCost(df_)
print("Within Set Sum of Squared Errors = " + str(wssse))

# Shows the result.
centers = model.clusterCenters()
print("Cluster Centers: ")
for center in centers:
    print(center)
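
computeCost was deprecated in Spark 3.0 in favor of ClusteringEvaluator; an equivalent check via the silhouette score might look like this sketch:

from pyspark.ml.evaluation import ClusteringEvaluator

# Silhouette with squared euclidean distance on the clustered data; closer to 1 is better.
predictions = model.transform(df_)
evaluator = ClusteringEvaluator(featuresCol="features", predictionCol="prediction",
                                metricName="silhouette", distanceMeasure="squaredEuclidean")
print("Silhouette = " + str(evaluator.evaluate(predictions)))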

# COMMAND ----------

from pyspark.ml.feature import PCA as PCAml
from pyspark.ml.linalg import Vectors  # Pre 2.0 pyspark.mllib.linalg

pca = PCAml(k=2, inputCol="features", outputCol="pca")
model = pca.fit(df_)
df_pca = model.transform(df_)

# COMMAND ----------
Example no. 7
from sklearn.model_selection import GridSearchCV  # sklearn.grid_search was removed; unused in this fragment

from pyspark.sql import SparkSession, Row
from pyspark.ml.linalg import Vectors
from pyspark.ml.clustering import KMeans


def f(x):
    rel = {}
    rel['features'] = Vectors.dense(float(x[1]), float(x[2]), float(x[3]),
                                    float(x[4]))
    rel['label'] = str(x[5]).strip("\"")
    return rel


spark = SparkSession.builder.appName("logistic_regression").getOrCreate()

df = spark.sparkContext.textFile("iris.txt").map(
    lambda line: line.split(",")).map(lambda p: Row(**f(p))).toDF()
"""创建Estimator并调用其fit()方法来生成相应的Transformer对象,
很显然,在这里KMeans类是Estimator,而用于保存训练后模型的KMeansModel类则属于Transformer"""
kmeans_model = KMeans().setK(3).setFeaturesCol("features").setPredictionCol(
    "prediction").fit(df)

results = kmeans_model.transform(df).collect()
for item in results:
    print(str(item[0]) + " is predicted as cluster " + str(item[1]))
"""有可以通过KMeansModel类自带的clusterCenter属性获取到模型的所有聚类中心情况"""
results2 = kmeans_model.clusterCenters()
for item in results2:
    print(item)
"""与MLLib下的实现相同,KMeansModel类也提供了计算集合内误差平方和(Within Set Sum of Squared Error, WSSSE)
的方法来度量聚类的有效性,在真实K值未知的情况下,该值的变化可以作为选取合适K值的一个重要参考"""
print(kmeans_model.computeCost(df))
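
Since KMeans is the Estimator and KMeansModel the Transformer, the same flow can also be packaged as a Pipeline; a minimal sketch reusing the df built above:

from pyspark.ml import Pipeline

# A one-stage pipeline: fit() returns a PipelineModel that wraps the trained KMeansModel.
pipeline = Pipeline(stages=[KMeans().setK(3).setFeaturesCol("features").setPredictionCol("prediction")])
pipeline_model = pipeline.fit(df)
pipeline_model.transform(df).select("prediction").show(5)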