def chartShow():
    """Scan KMeans over k = 2..9, plot the WSSSE elbow curve, and draw a
    3-D scatter of the clustering for one chosen k.

    Relies on module-level `train()` (returns a DataFrame with a 'feature'
    vector column), `KMeans` from pyspark.ml.clustering, and `plt`
    (matplotlib.pyplot) — all imported elsewhere in this file.
    """
    data = train()
    # ---- KMeans clustering ----
    print("------------------Kmeans聚类--------------------")
    print("------------设定不同的K值,进行分类,计算平方误差之和------------")
    errors = []   # WSSSE per k, in k order
    results = []  # collected rows (features, prediction) per k
    centers = []  # cluster centers per k
    k_values = range(2, 10)
    for k in k_values:
        # Fit one model per candidate k.
        kmeansmodel = (KMeans()
                       .setK(k)
                       .setFeaturesCol('feature')
                       .setPredictionCol('prediction')
                       .fit(data))
        print("With K={}".format(k))

        # Dataset with the predicted cluster label attached.
        kmeans_results = kmeansmodel.transform(data).collect()
        results.append(kmeans_results)

        # All cluster centers of the fitted model.
        kmeans_centers = kmeansmodel.clusterCenters()
        centers.append(kmeans_centers)
        print(len(kmeans_centers))
        for item in kmeans_centers:
            print(item)

        # Within Set Sum of Squared Error (WSSSE).
        # NOTE(review): computeCost was deprecated in Spark 2.4 and removed
        # in 3.0 — on newer Spark use ClusteringEvaluator instead; confirm
        # the Spark version pinned by this project.
        WSSSE = kmeansmodel.computeCost(data)
        errors.append(WSSSE)
        print("Within Set Sum of Squared Error = " + str(WSSSE))
        print('--' * 30 + '\n')

    # ---- WSSSE (elbow) plot ----
    plt.figure()
    plt.plot(k_values, errors)
    plt.xlabel('Number of K')
    plt.ylabel('WSSSE')
    plt.title('K-WSSSE')

    # ---- Cluster visualization ----
    print("---------将数据转换为panda结构,并查看空间3d图心-----------")
    # NOTE(review): the original comment said k=6 looked best on the elbow
    # plot, but the code visualizes k=4 — confirm which k is intended.
    k = 4
    # figure().gca(projection='3d') was deprecated and removed in
    # Matplotlib 3.7; add_subplot is the supported way to get a 3-D axes.
    cluster_vis = plt.figure(figsize=(10, 10)).add_subplot(projection='3d')

    # One fixed color per cluster label: blue, yellow, magenta, black,
    # green, cyan (same mapping as before, table lookup instead of a chain
    # of ifs).
    label_colors = ('b', 'y', 'm', 'k', 'g', 'c')
    for item in results[k - 2]:
        label = item[1]
        if 0 <= label < len(label_colors):
            cluster_vis.scatter(item[0][0], item[0][1], item[0][2],
                                c=label_colors[label])
    for item in centers[k - 2]:
        # Centroids in red, pentagon marker.
        cluster_vis.scatter(item[0], item[1], item[2], c='r', marker='p')
    plt.show()
# Fragment: evaluate a fitted KMeans model on iris_DF, record its results,
# centers and WSSSE, then start the elbow plot.
# NOTE(review): `kmeansmodel`, `iris_DF`, `results`, `centers`, `errors`
# and `plt` are defined earlier in this file (outside this fragment).

# Dataset with the predicted cluster label attached.
kmeans_results = kmeansmodel.transform(iris_DF).collect()
results.append(kmeans_results)
for item in kmeans_results:
    # FIX: typo in the printed message ("predcted" -> "predicted").
    print(str(item[0]) + ' is predicted as cluster' + str(item[1]))

# All cluster centers of the fitted model.
kmeans_centers = kmeansmodel.clusterCenters()
centers.append(kmeans_centers)
center_seq = 0
for item in kmeans_centers:
    print("Cluster" + str(center_seq) + " Center" + str(item))
    center_seq = center_seq + 1

# Within Set Sum of Squared Error (WSSSE).
# NOTE(review): computeCost is deprecated since Spark 2.4 (removed in 3.0);
# confirm the Spark version before upgrading.
WSSSE = kmeansmodel.computeCost(iris_DF)
errors.append(WSSSE)
print("Within Set Sum of Squared Error = " + str(WSSSE))
print('--' * 30 + '\n')

# ---- WSSSE (elbow) visualization ----
plt.figure()
k_number = range(2, 10)
plt.plot(k_number, errors)
plt.xlabel('Number of K')
plt.ylabel('WSSSE')
plt.title('K-WSSSE')

# ---- Cluster visualization ----
print("---------将数据转换为panda结构,并查看空间3d图心-----------")
# Two KMeans estimators on the same feature column, fixed seed so the
# clustering is reproducible; each writes its labels to its own column.
est2 = KMeans(featuresCol='features', predictionCol='pred2', k=2, seed=1)
est3 = KMeans(featuresCol='features', predictionCol='pred3', k=3, seed=1)
#%%
from pyspark.ml import Pipeline

# `assembler`, `scaler`, `selector` and `data_raw` are defined earlier in
# this file (outside this fragment).
pipeline = Pipeline(stages=[assembler, scaler, selector, est2, est3])
#%%
# BUG FIX: computeCost() exists on the fitted KMeansModel, not on the
# KMeans estimator — the original called est2/est3.computeCost(), which
# raises AttributeError. Keep the fitted PipelineModel so the fitted
# cluster models are reachable via .stages.
pipeline_model = pipeline.fit(data_raw)
result = pipeline_model.transform(data_raw)
result.show()
#%%
# The last two fitted stages correspond to est2 and est3 respectively.
wssse2 = pipeline_model.stages[-2].computeCost(result)
print(wssse2)
wssse3 = pipeline_model.stages[-1].computeCost(result)
print(wssse3)
#%%
#%%
#%%
print("Test set accuracy = " + str(evaluator.evaluate(predictionAndLabels))) # COMMAND ---------- evaluator = MulticlassClassificationEvaluator(metricName="f1") print("F1 Score = " + str(evaluator.evaluate(predictionAndLabels))) # COMMAND ---------- from pyspark.ml.clustering import KMeans # Trains a k-means model. model = KMeans().setK(20).setSeed(1).fit(df_) # Evaluate clustering by computing Within Set Sum of Squared Errors. wssse = model.computeCost(df_) print("Within Set Sum of Squared Errors = " + str(wssse)) # Shows the result. centers = model.clusterCenters() print("Cluster Centers: ") for center in centers: print(center) # COMMAND ---------- from pyspark.ml.feature import PCA as PCAml from pyspark.ml.linalg import Vectors # Pre 2.0 pyspark.mllib.linalg pca = PCAml(k=2, inputCol="features", outputCol="pca") model = pca.fit(df_)
# Read exported user-info CSVs from GCS, assemble and standardize numeric
# user features, then scan KMeans over even k values printing WSSSE for
# each (elbow-method scan). `spark` is the active session, created
# elsewhere (e.g. by the Dataproc runtime).
from datetime import datetime
from pyspark.ml.linalg import Vectors
from pyspark.ml.clustering import KMeans
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import StandardScaler
from pyspark.ml.regression import LinearRegression
# NOTE(review): this shadows the builtin abs() at module level — it is not
# used in this fragment; verify later code relies on the Spark column abs.
from pyspark.sql.functions import abs

data_url = "gs://bigdatasystems_alex_bucket/project/user_info16/part*"
# Infer the schema from the header and silently drop malformed rows.
raw_data = (spark.read.option("header", "true").option("inferschema", "true").option(
    "mode", "DROPMALFORMED").csv(data_url))

# Pack the numeric user attributes into one vector column; nulls are
# filled with 0 first so VectorAssembler does not fail on missing values.
assembler = VectorAssembler(inputCols=[
    "followers", "friends", "favorited", "status_count", "region_id",
    "user_desc_rating", "count"
],
                            outputCol="feat_vector")
featured_data = assembler.transform(raw_data.na.fill(0))

# Standardize the assembled vector (unit variance; mean-centering is off
# by default for StandardScaler).
featuresScaler = StandardScaler(inputCol="feat_vector", outputCol="features")
featuresModel = featuresScaler.fit(featured_data)
scFeatData = featuresModel.transform(featured_data)

# Elbow scan: fit KMeans for even k in [2, 24] and print k with its WSSSE.
# NOTE(review): computeCost is deprecated since Spark 2.4 (removed in 3.0).
for k in range(2, 25, 2):
    model = KMeans().setK(k).setSeed(0).fit(scFeatData)
    wssse = model.computeCost(scFeatData)
    print(k, "\t", wssse)
# FIX: sklearn.grid_search was deprecated in 0.18 and removed in 0.20;
# GridSearchCV now lives in sklearn.model_selection. (It is unused in this
# fragment but may be used later in the file.)
from sklearn.model_selection import GridSearchCV


def f(x):
    """Map a split CSV row of iris.txt to a dict with a dense 4-feature
    vector (columns 1-4) and a string label (column 5, quotes stripped)."""
    rel = {}
    rel['features'] = Vectors.dense(float(x[1]), float(x[2]), float(x[3]),
                                    float(x[4]))
    rel['label'] = str(x[5]).strip("\"")
    return rel


spark = SparkSession.builder.appName("logistic_regression").getOrCreate()
df = spark.sparkContext.textFile("iris.txt").map(
    lambda line: line.split(",")).map(lambda p: Row(**f(p))).toDF()

# KMeans is the Estimator; calling fit() produces the KMeansModel, which
# is the Transformer that holds the trained centers.
kmeans_model = KMeans().setK(3).setFeaturesCol("features").setPredictionCol(
    "prediction").fit(df)
results = kmeans_model.transform(df).collect()
for item in results:
    print(str(item[0]) + " is predicted as cluster " + str(item[1]))

# clusterCenters() exposes every fitted cluster center.
results2 = kmeans_model.clusterCenters()
for item in results2:
    print(item)

# WSSSE (Within Set Sum of Squared Error) measures cluster cohesion; with
# the true k unknown, its trend over k guides the choice of k.
# NOTE(review): computeCost is deprecated since Spark 2.4 (removed in 3.0).
print(kmeans_model.computeCost(df))