from pyspark.ml.clustering import KMeans
from pyspark.sql.functions import count


def cluster(data, n_clusters):
    # Fit k-means on the input DataFrame (expects a "features" vector column).
    model = KMeans().setK(n_clusters).setSeed(1).setFeaturesCol("features").fit(data)
    centers = model.clusterCenters()
    # print("Cluster Centers: ")
    # for center in centers:
    #     print(center)
    cl_labels = model.transform(data).select('prediction')
    # Count rows per cluster; countDistinct on the grouping key itself would
    # always return 1, so count is what we actually want here.
    gr = cl_labels.groupBy("prediction").agg(count("prediction").alias("cnt"))
    # gr.show()
    return cl_labels, gr
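# Usage sketch (not from the original source): assemble raw numeric columns
# into the "features" vector that cluster() expects, then call it. The
# DataFrame `df` and the column names "f1"/"f2" are hypothetical placeholders.
from pyspark.ml.feature import VectorAssembler
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("cluster_demo").getOrCreate()
df = spark.createDataFrame([(1.0, 0.5), (9.0, 8.5), (1.2, 0.7)], ["f1", "f2"])
assembled = VectorAssembler(inputCols=["f1", "f2"], outputCol="features").transform(df)
labels, counts_df = cluster(assembled, n_clusters=2)
counts_df.show()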
from pyspark.ml.clustering import KMeans
from pyspark.ml.feature import MinMaxScaler, VectorAssembler
from pyspark.sql import HiveContext


def process(sc):
    hiveContext = HiveContext(sc)
    # `pt` is a partition key assumed to be defined at module level.
    hql = "select * from kmeans_cluster_feature where pt = '%s'" % (pt)
    df_raw = hiveContext.sql(hql).repartition(160)
    columns = df_raw.columns[1:-2]
    feature_num = len(columns)

    # type conversion (kept from the original, disabled)
    # df_tmp = df_raw
    # for k, i in zip(columns, range(feature_num)):
    #     df_tmp = df_tmp.withColumn(k, df_tmp[i + 1] * 1.0)

    # Impute missing values with each column's mean. describe() returns
    # strings (index 0 is the "summary" column, hence i + 1), so cast to
    # float before filling; otherwise na.fill silently skips numeric columns.
    mean_value = df_raw.describe().collect()[1]
    print(mean_value)
    df_train = df_raw
    for k, i in zip(columns, range(feature_num)):
        df_train = df_train.na.fill({k: float(mean_value[i + 1])})

    # Min-max scale the assembled feature vector.
    vecAssembler = VectorAssembler(inputCols=columns, outputCol="features")
    df_b_s = vecAssembler.transform(df_train)
    mmScaler = MinMaxScaler(inputCol="features", outputCol="scaled")
    df_scaled = mmScaler.fit(df_b_s).transform(df_b_s)

    # K-means on the scaled features.
    n_clusters_ = 20
    model = KMeans(k=n_clusters_, initSteps=10, maxIter=300,
                   featuresCol='scaled').fit(df_scaled)
    df_result = model.transform(df_scaled)

    # Rank clusters by the first coordinate of their centers and split them
    # into three sensitivity tiers (bottom / middle / top third).
    global sensitivity_1, sensitivity_2, sensitivity_3
    sensitivity_1 = []
    sensitivity_2 = []
    sensitivity_3 = []
    key_cnt = []
    centers = model.clusterCenters()
    for xx, yy in zip(centers, range(n_clusters_)):
        key_cnt.append([yy, xx[0]])
    sorted_cluster = sorted(key_cnt, key=lambda asd: asd[1])
    split = n_clusters_ // 3
    split_end = n_clusters_ - split
    for xx, yy in zip(sorted_cluster, range(n_clusters_)):
        if yy < split:
            sensitivity_3.append(xx[0])
        elif yy >= split_end:
            sensitivity_1.append(xx[0])
        else:
            sensitivity_2.append(xx[0])

    # Write one text line per row; DataFrames have no map() in recent Spark,
    # so go through the underlying RDD. result_process is defined elsewhere.
    df_result.rdd.map(result_process).saveAsTextFile(
        "kmeans_cluster_result/pt=%s/" % (pt))
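# The source never defines result_process. A minimal sketch of what such a
# row-mapper might look like, assuming the goal is to emit one
# "id <tab> sensitivity_tier" text line per row using the global tier lists
# built above. The output format is purely an assumption for illustration.
def result_process(row):
    if row['prediction'] in sensitivity_1:
        tier = 1
    elif row['prediction'] in sensitivity_2:
        tier = 2
    else:
        tier = 3
    return '%s\t%d' % (row[0], tier)  # hypothetical output format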
from pyspark.ml.clustering import KMeans

# Search for the best k: compare the training cost (WSSSE) for k = 2..10.
for i in range(2, 11):
    km = KMeans().setK(i).setSeed(4603).setFeaturesCol('feature').setPredictionCol('prediction')
    res_kmval = km.fit(clsdata_model).summary.trainingCost
    print(i, ': ', res_kmval)

# Fit the final model with k = 3.
model = KMeans().setK(3).setSeed(4603).setFeaturesCol('feature').setPredictionCol('prediction').fit(clsdata_model)
res_km = model.transform(clsdata_model)
summary = model.summary
summary.clusterSizes   # [739011, 463649, 807578]
summary.trainingCost   # 7632810.723481619
model.clusterCenters()
model.save('kmeans3_model')

# Join the predictions back onto the original rows by id.
clsdata_vecform.createOrReplaceTempView('clsdata')
res_km.createOrReplaceTempView('reskm')
res4 = spark.sql('select c.*, r.prediction as prediction from clsdata c, reskm r where c.id = r.id').drop('feature')
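# trainingCost (WSSSE) decreases monotonically as k grows, so the elbow can be
# ambiguous. A minimal alternative sketch using silhouette scores via
# ClusteringEvaluator (available since Spark 2.3); column names match the
# snippet above, and `clsdata_model` is assumed to be the same DataFrame.
from pyspark.ml.evaluation import ClusteringEvaluator

evaluator = ClusteringEvaluator(featuresCol='feature',
                                predictionCol='prediction',
                                metricName='silhouette')
for i in range(2, 11):
    preds = KMeans().setK(i).setSeed(4603).setFeaturesCol('feature') \
        .setPredictionCol('prediction').fit(clsdata_model).transform(clsdata_model)
    print(i, ': ', evaluator.evaluate(preds))  # higher silhouette is better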
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D  # noqa: F401  (registers the 3d projection)
from pyspark.ml.clustering import KMeans


def chartShow():
    data = train()
    # ---- K-means clustering ----
    print("------------------ K-means clustering --------------------")
    print("------ Fit with different K values and compute the sum of squared errors ------")
    errors = []
    results = []
    centers = []
    for k in range(2, 10):
        # Fit the model
        kmeansmodel = KMeans().setK(k).setFeaturesCol('feature').setPredictionCol('prediction').fit(data)
        print("With K={}".format(k))
        # Dataset with the predicted cluster label attached
        kmeans_results = kmeansmodel.transform(data).collect()
        results.append(kmeans_results)
        # for item in kmeans_results:
        #     print(str(item[0]) + ' is predicted as cluster ' + str(item[1]))
        # All cluster centers of the model
        kmeans_centers = kmeansmodel.clusterCenters()
        centers.append(kmeans_centers)
        center_seq = 0
        print(len(kmeans_centers))
        for item in kmeans_centers:
            print(item)
            # print("Cluster" + str(center_seq) + " Center" + str(item))
            center_seq = center_seq + 1
        # Within Set Sum of Squared Error (WSSSE); computeCost is the
        # Spark 2.x API (removed in 3.0).
        WSSSE = kmeansmodel.computeCost(data)
        errors.append(WSSSE)
        print("Within Set Sum of Squared Error = " + str(WSSSE))
        print('--' * 30 + '\n')

    # ---- WSSSE elbow plot ----
    plt.figure()
    k_number = range(2, 10)
    plt.plot(k_number, errors)
    plt.xlabel('Number of K')
    plt.ylabel('WSSSE')
    plt.title('K-WSSSE')

    # ---- Visualize the clustering result ----
    print("--------- Visualize the clusters and centers in a 3D scatter plot -----------")
    # Choose k based on the K-WSSSE elbow plot
    k = 4
    cluster_vis = plt.figure(figsize=(10, 10)).add_subplot(projection='3d')
    # Color per cluster label: blue, yellow, magenta, black, green, cyan
    color_map = {0: 'b', 1: 'y', 2: 'm', 3: 'k', 4: 'g', 5: 'c'}
    for item in results[k - 2]:
        cluster_vis.scatter(item[0][0], item[0][1], item[0][2], c=color_map[item[1]])
    for item in centers[k - 2]:
        cluster_vis.scatter(item[0], item[1], item[2], c='r', marker='p')  # red pentagon for centers
    plt.show()
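# Collecting every Row to the driver and plotting point-by-point is slow. A
# minimal vectorized sketch instead, assuming the 'feature' vectors have at
# least three dimensions; `data` and the chosen k follow chartShow() above.
import numpy as np

best_model = KMeans().setK(4).setFeaturesCol('feature').setPredictionCol('prediction').fit(data)
preds_pd = best_model.transform(data).select('feature', 'prediction').toPandas()
xyz = np.array([v.toArray() for v in preds_pd['feature']])[:, :3]
ax = plt.figure(figsize=(10, 10)).add_subplot(projection='3d')
ax.scatter(xyz[:, 0], xyz[:, 1], xyz[:, 2], c=preds_pd['prediction'], cmap='tab10', s=8)
plt.show()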
errors = []
results = []
centers = []
for k in range(2, 10):
    kmeansmodel = KMeans().setK(k).setFeaturesCol(
        'iris_features').setPredictionCol('prediction').fit(iris_DF)
    print("With K={}".format(k))
    # Dataset with the predicted cluster label attached
    kmeans_results = kmeansmodel.transform(iris_DF).collect()
    results.append(kmeans_results)
    for item in kmeans_results:
        print(str(item[0]) + ' is predicted as cluster ' + str(item[1]))
    # All cluster centers of the model
    kmeans_centers = kmeansmodel.clusterCenters()
    centers.append(kmeans_centers)
    center_seq = 0
    for item in kmeans_centers:
        print("Cluster" + str(center_seq) + " Center" + str(item))
        center_seq = center_seq + 1
    # Within Set Sum of Squared Error (WSSSE)
    WSSSE = kmeansmodel.computeCost(iris_DF)
    errors.append(WSSSE)
    print("Within Set Sum of Squared Error = " + str(WSSSE))
    print('--' * 30 + '\n')

# ---- WSSSE elbow plot ----
plt.figure()
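# The snippet assumes iris_DF already exists. A minimal sketch of building it
# from sklearn's bundled iris data; the SparkSession `spark`, the use of
# sklearn/pandas, and the short column names are assumptions for illustration.
import pandas as pd
from sklearn.datasets import load_iris
from pyspark.ml.feature import VectorAssembler

iris = load_iris()
pdf = pd.DataFrame(iris.data, columns=['sl', 'sw', 'pl', 'pw'])
iris_DF = VectorAssembler(inputCols=['sl', 'sw', 'pl', 'pw'],
                          outputCol='iris_features').transform(spark.createDataFrame(pdf))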
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

evaluator = MulticlassClassificationEvaluator(metricName="f1")
print("F1 Score = " + str(evaluator.evaluate(predictionAndLabels)))

# COMMAND ----------

from pyspark.ml.clustering import KMeans

# Train a k-means model.
model = KMeans().setK(20).setSeed(1).fit(df_)

# Evaluate clustering by computing the Within Set Sum of Squared Errors.
wssse = model.computeCost(df_)
print("Within Set Sum of Squared Errors = " + str(wssse))

# Show the result.
centers = model.clusterCenters()
print("Cluster Centers: ")
for center in centers:
    print(center)

# COMMAND ----------

from pyspark.ml.feature import PCA as PCAml
from pyspark.ml.linalg import Vectors  # pre-2.0: pyspark.mllib.linalg

pca = PCAml(k=2, inputCol="features", outputCol="pca")
model = pca.fit(df_)
df_pca = model.transform(df_)

# COMMAND ----------
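# The PCA projection above is never used downstream. A sketch that plots the
# 2-D projection colored by cluster, assuming the k-means model was kept in a
# separate variable (here `kmeans_model`) before `model` was rebound to PCA.
import matplotlib.pyplot as plt
import numpy as np

preds = kmeans_model.transform(df_pca).select("pca", "prediction").toPandas()
xy = np.array([v.toArray() for v in preds["pca"]])
plt.scatter(xy[:, 0], xy[:, 1], c=preds["prediction"], cmap="tab20", s=8)
plt.xlabel("PC1")
plt.ylabel("PC2")
plt.show()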
from pyspark.ml.clustering import KMeans
from pyspark.ml.linalg import Vectors
from pyspark.sql import Row, SparkSession


def f(x):
    rel = {}
    rel['features'] = Vectors.dense(float(x[1]), float(x[2]), float(x[3]), float(x[4]))
    rel['label'] = str(x[5]).strip("\"")
    return rel


spark = SparkSession.builder.appName("logistic_regression").getOrCreate()
df = spark.sparkContext.textFile("iris.txt").map(
    lambda line: line.split(",")).map(lambda p: Row(**f(p))).toDF()

"""Create the Estimator and call its fit() method to produce the corresponding
Transformer: here the KMeans class is the Estimator, while the KMeansModel
class that holds the trained model is the Transformer."""
kmeans_model = KMeans().setK(3).setFeaturesCol("features").setPredictionCol(
    "prediction").fit(df)
results = kmeans_model.transform(df).collect()
for item in results:
    print(str(item[0]) + " is predicted as cluster " + str(item[1]))

"""The clusterCenters() method of KMeansModel returns all cluster centers of
the trained model."""
results2 = kmeans_model.clusterCenters()
for item in results2:
    print(item)

"""As in the MLlib implementation, KMeansModel also provides the Within Set
Sum of Squared Error (WSSSE) to gauge clustering quality; when the true K is
unknown, the change in this value is a useful reference for choosing K."""
print(kmeans_model.computeCost(df))
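# A sketch of scoring a single new observation without building a DataFrame.
# KMeansModel.predict(vector) exists in Spark >= 3.0 (where computeCost is
# removed in favor of ClusteringEvaluator); the sample values are made up.
new_point = Vectors.dense(5.1, 3.5, 1.4, 0.2)
print("predicted cluster:", kmeans_model.predict(new_point))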