def cluster(self, df, session, repartition_num=8):
    """Cluster the rows of ``df`` with a k-nearest-neighbour spectral method.

    Steps, as implemented below:
      1. give every row a 0-based ``id``;
      2. self-join the feature column (excluding self pairs) and score each
         pair with ``cosine_similarity_udf``;
      3. keep the ``self.k_nearest`` best-ranked partners of every row;
      4. build one Laplacian row per point with ``laplacian_vector_udf``;
      5. compute ``self.num_eigenvectors`` principal components of that
         matrix; row ``i`` of the component matrix is used as the embedding
         of the point with ``id == i``;
      6. run KMeans (``k=self.k``) on the embeddings and join the result
         with the indexed input on ``id``.

    Args:
        df: input DataFrame containing the column named ``self.featureCol``.
        session: SparkSession used to create the embedding DataFrame.
        repartition_num: number of range partitions used before the self-join.

    Returns:
        The KMeans output joined with the indexed input rows on ``id``.
    """
    n = df.count()

    # Index rows: a constant partition key yields one global numbering.
    df_index = df.select(
        (row_number().over(
            Window.partitionBy(lit(0)).orderBy(self.featureCol)) - 1).alias('id'),
        "*")
    df_features = df_index.select('id', self.featureCol)

    # Prepare both sides of the self-join.
    df_features = df_features.repartitionByRange(repartition_num, 'id')
    left_df = df_features.select(
        df_features['id'].alias('left_id'),
        df_features[self.featureCol].alias('left_features'))
    right_df = df_features.select(
        df_features['id'].alias('right_id'),
        df_features[self.featureCol].alias('right_features'))

    # Pair every row with every other row (left_id != right_id).
    joined_df = left_df.join(right_df,
                             left_df['left_id'] != right_df['right_id'])

    # Compute the pairwise score between the two feature vectors.
    joined_df = joined_df.select(
        'left_id', 'right_id',
        cosine_similarity_udf(
            array(joined_df['left_features'],
                  joined_df['right_features'])).alias('norm'))

    # NOTE(review): the window orders ascending by 'norm', so the
    # lowest-scoring partners are kept -- confirm that matches what
    # cosine_similarity_udf returns (distance vs. similarity).
    ranked = joined_df.select(
        'left_id', 'right_id',
        rank().over(
            Window.partitionBy('left_id').orderBy('norm')).alias('rank'))

    # Keep as many neighbours as the Laplacian UDF is told about below.
    # This used to be a hard-coded 5, which disagreed with self.k_nearest
    # whenever the two differed. rank() gives tied pairs the same rank, so
    # ties can still let a few extra neighbours through.
    knn = ranked.where(ranked['rank'] <= self.k_nearest)
    knn_grouped = knn.groupBy('left_id').agg(
        f.collect_list('right_id').alias('nn'))

    # Generate the Laplacian, one row per point.
    laplacian = knn_grouped.select(
        'left_id',
        laplacian_vector_udf(knn_grouped['left_id'], knn_grouped['nn'],
                             lit(n), lit(self.k_nearest)).alias('lap_vector'))
    laplacian_matrix = RowMatrix(
        laplacian.select('lap_vector').rdd.map(lambda x: x[0]))

    eigenvectors = laplacian_matrix.computePrincipalComponents(
        k=self.num_eigenvectors)
    eigenvectors = [
        (idx, Vectors.dense([float(item) for item in row]))
        for idx, row in enumerate(eigenvectors.toArray().tolist())
    ]
    eigen_df = session.createDataFrame(eigenvectors, ['id', self.featureCol])

    model = KMeans(featuresCol=self.featureCol,
                   predictionCol=self.predictionCol,
                   k=self.k).fit(eigen_df)
    predictions = model.transform(eigen_df).join(df_index, on='id')
    return predictions
def cluster(data, n_clusters):
    """Fit KMeans on ``data`` and report how many rows fall in each cluster.

    Args:
        data: DataFrame with a vector column named "features".
        n_clusters: number of clusters (``k``) to fit.

    Returns:
        tuple: ``(cl_labels, gr)``. ``cl_labels`` is a DataFrame holding only
        the "prediction" column; ``gr`` is a DataFrame with the number of
        rows per predicted cluster. ``gr`` is also printed via ``show()``.
    """
    # Fit on the DataFrame that was passed in. The previous version ignored
    # ``data`` and silently used a module-level ``scaledData`` instead.
    model = (KMeans()
             .setK(n_clusters)
             .setSeed(1)
             .setFeaturesCol("features")
             .fit(data))

    cl_labels = model.transform(data).select("prediction")

    # Rows per cluster. countDistinct("prediction") inside a group keyed by
    # "prediction" is always 1, so it never reported the cluster sizes.
    gr = cl_labels.groupBy("prediction").count()
    gr.show()

    # show() returns None; hand back the grouped DataFrame itself so the
    # second element of the tuple is usable by the caller.
    return cl_labels, gr
def cluster(self, df, session, repartition_num=8):
    """Cluster the rows of ``df`` with a k-nearest-neighbour spectral method.

    Steps, as implemented below:
      1. give every row a 0-based ``id``;
      2. self-join the feature column (excluding self pairs) and score each
         pair with ``cosine_similarity_udf``;
      3. keep the ``self.k_nearest`` best-ranked partners of every row;
      4. build one Laplacian vector per point (``laplacian_vector_udf``
         wrapped in ``toVector_udf``);
      5. reduce the Laplacian vectors to ``self.num_eigenvectors``
         dimensions with ``PCA``;
      6. run KMeans (``k=self.k``) on the reduced vectors and join the
         result with the indexed input on ``id``.

    Args:
        df: input DataFrame containing the column named ``self.featureCol``.
        session: SparkSession; not used by this implementation.
        repartition_num: number of range partitions used before the self-join.

    Returns:
        The KMeans output joined with the indexed input rows on ``id``.
    """
    n = df.count()

    # Index rows: a constant partition key yields one global numbering.
    df_index = df.select(
        (row_number().over(
            Window.partitionBy(lit(0)).orderBy(self.featureCol)) - 1).alias('id'),
        "*")
    df_features = df_index.select('id', self.featureCol)

    # Prepare both sides of the self-join.
    df_features = df_features.repartitionByRange(repartition_num, 'id')
    left_df = df_features.select(
        df_features['id'].alias('left_id'),
        df_features[self.featureCol].alias('left_features'))
    right_df = df_features.select(
        df_features['id'].alias('right_id'),
        df_features[self.featureCol].alias('right_features'))

    # Pair every row with every other row (left_id != right_id).
    joined_df = left_df.join(right_df,
                             left_df['left_id'] != right_df['right_id'])

    # Compute the pairwise score between the two feature vectors.
    joined_df = joined_df.select(
        'left_id', 'right_id',
        cosine_similarity_udf(
            array(joined_df['left_features'],
                  joined_df['right_features'])).alias('norm'))

    # NOTE(review): the window orders ascending by 'norm', so the
    # lowest-scoring partners are kept -- confirm that matches what
    # cosine_similarity_udf returns (distance vs. similarity).
    ranked = joined_df.select(
        'left_id', 'right_id',
        rank().over(
            Window.partitionBy('left_id').orderBy('norm')).alias('rank'))

    # Keep as many neighbours as the Laplacian UDF is told about below.
    # This used to be a hard-coded 5, which disagreed with self.k_nearest
    # whenever the two differed. rank() gives tied pairs the same rank, so
    # ties can still let a few extra neighbours through.
    knn = ranked.where(ranked['rank'] <= self.k_nearest)
    knn_grouped = knn.groupBy('left_id').agg(
        f.collect_list('right_id').alias('nn'))

    # Generate the Laplacian, one ml Vector per point, keyed by id.
    laplacian = knn_grouped.select(
        knn_grouped['left_id'].alias('id'),
        toVector_udf(
            laplacian_vector_udf(knn_grouped['left_id'], knn_grouped['nn'],
                                 lit(n),
                                 lit(self.k_nearest))).alias('lap_vector'))

    pca = PCA(k=self.num_eigenvectors,
              inputCol='lap_vector',
              outputCol='features').fit(laplacian)
    eigenvectors = pca.transform(laplacian).select('id', 'features')

    model = KMeans(featuresCol='features',
                   predictionCol=self.predictionCol,
                   k=self.k).fit(eigenvectors)
    predictions = model.transform(eigenvectors).join(df_index, on='id')
    return predictions
def kmeans(params):
    """Cluster CSV documents by the TF-IDF of their "content" column.

    ``params`` is read positionally:
      0 - directory holding the input ``*.csv`` files,
      1 - number of clusters,
      2 - maximum number of KMeans iterations,
      3 - output directory for the "title"/"prediction" CSV.

    Any exception raised while the job runs is printed to stderr and the
    process exits with status 1.
    """
    source_dir = params[0]
    n_clusters = int(params[1])
    max_iter = int(params[2])
    output_dir = params[3]
    try:
        spark = SparkSession.builder.appName("project4-jwj").getOrCreate()

        # Load every CSV under the source directory; the first line of each
        # file is treated as the header.
        reader = spark.read.format("csv").option("header", "true")
        docs = reader.load("{}/*.csv".format(source_dir))

        # Keep only rows that have a value in every column.
        for column_name in docs.columns:
            docs = docs.filter(col(column_name).isNotNull())

        # Split the content column into individual words.
        words = Tokenizer(inputCol="content",
                          outputCol="Words").transform(docs)

        # Drop stop words.
        filtered = StopWordsRemover(inputCol="Words",
                                    outputCol="Filtered").transform(words)

        # Term frequency (hashed into 3000 buckets) ...
        term_freq = HashingTF(inputCol="Filtered",
                              outputCol="rawFeatures",
                              numFeatures=3000).transform(filtered)

        # ... rescaled by inverse document frequency.
        idf_model = IDF(inputCol="rawFeatures",
                        outputCol="features",
                        minDocFreq=5).fit(term_freq)
        tfidf = idf_model.transform(term_freq)
        tfidf.show()

        # Train KMeans and label every document.
        fitted = KMeans().setK(n_clusters).setMaxIter(max_iter).fit(tfidf)
        labelled = fitted.transform(tfidf)
        labelled.show()

        # Write a single-partition CSV of title -> cluster.
        single_part = labelled.select("title", "prediction").repartition(1)
        single_part.write.format("com.databricks.spark.csv").save(output_dir)
    except Exception as e:
        print(str(e), file=sys.stderr)
        sys.exit(1)
def process(sc):
    """Cluster the ``kmeans_cluster_feature`` rows of partition ``pt``.

    The feature columns are every column of the Hive table except the first
    and the last two. Nulls are replaced by the column mean, the features
    are min-max scaled, and a 20-cluster KMeans model is fitted. Clusters
    are then ordered by the first coordinate of their centre and split into
    three groups: the lowest third goes to ``sensitivity_3``, the highest
    third to ``sensitivity_1`` and the rest to ``sensitivity_2``. Finally
    every result row is passed through ``result_process`` and saved as text
    under ``kmeans_cluster_result/pt=<pt>/``.

    Reads the module-level ``pt`` and rebinds the module-level
    ``sensitivity_1`` and ``sensitivity_3``.

    Args:
        sc: SparkContext used to create the HiveContext.
    """
    # NOTE(review): sensitivity_2 is not declared global, so it stays local
    # to this function -- confirm result_process does not need it.
    global sensitivity_1, sensitivity_3

    hiveContext = HiveContext(sc)
    hql = "select * from kmeans_cluster_feature where pt = '%s'" % (pt)
    df_raw = hiveContext.sql(hql).repartition(160)
    columns = df_raw.columns[1:-2]

    # Imputer: row 1 of describe() is the "mean" row.
    mean_value = df_raw.describe().collect()[1]
    print(mean_value)
    # Look the means up by column name. Positional indexing only lines up
    # when describe() reports exactly the same columns, in the same order,
    # as the DataFrame itself.
    means = mean_value.asDict()
    df_train = df_raw.na.fill(dict((name, means[name]) for name in columns))

    # Min-max scaling of the assembled feature vector.
    vecAssembler = VectorAssembler(inputCols=columns, outputCol="features")
    df_b_s = vecAssembler.transform(df_train)
    mmScaler = MinMaxScaler(inputCol="features", outputCol="scaled")
    df_scaled = mmScaler.fit(df_b_s).transform(df_b_s)

    # KMeans on the scaled features.
    n_clusters_ = 20
    model = KMeans(k=n_clusters_, initSteps=10, maxIter=300,
                   featuresCol='scaled').fit(df_scaled)
    df_result = model.transform(df_scaled)

    # Map cluster ids to sensitivity groups.
    sensitivity_1 = []
    sensitivity_2 = []
    sensitivity_3 = []
    centers = model.clusterCenters()
    key_cnt = [[cluster_id, center[0]]
               for cluster_id, center in enumerate(centers)]
    sorted_cluster = sorted(key_cnt, key=lambda pair: pair[1])

    # Floor division keeps the Python 2 integer result on Python 3 as well.
    split = n_clusters_ // 3
    split_end = n_clusters_ - split
    for position, pair in enumerate(sorted_cluster):
        if position < split:
            sensitivity_3.append(pair[0])
        elif position >= split_end:
            sensitivity_1.append(pair[0])
        else:
            sensitivity_2.append(pair[0])

    # Result: go through .rdd explicitly, which is what DataFrame.map did in
    # Spark 1.x and also works on later versions.
    df_result.rdd.map(result_process).saveAsTextFile(
        "kmeans_cluster_result/pt=%s/" % (pt))
def user_cluster_model(spark, ratings, movies, k, genres):
    """Fit a k-means model on per-user genre scores.

    The output of ``user_genre_scores`` is pivoted so that each user is one
    row and each genre one column (missing scores become 0); the
    "(no genres listed)" column is dropped when present.

    Two extra attributes are attached to the returned model:
    ``genres`` (the genre columns used as features) and
    ``sihlouette_score`` (``ClusteringEvaluator().evaluate`` on the training
    predictions).

    ``genres`` is accepted but not used by this function.
    """
    # Every distinct user id found in the ratings.
    distinct_users = ratings.select("userId").distinct()
    user_ids = distinct_users.rdd.flatMap(lambda row: row).collect()

    # Per-user genre scores, ordered by user and then genre.
    long_scores = user_genre_scores(spark, ratings, movies, user_ids)
    long_scores = long_scores.sort(col("userId"), col("genre"))

    # Turn the genre rows into columns; absent scores become 0.
    wide_scores = (long_scores.groupBy("userId")
                   .pivot("genre")
                   .agg(first("score"))
                   .na.fill(0))

    # Movies without genres are ignored.
    if "(no genres listed)" in wide_scores.columns:
        wide_scores = wide_scores.drop("(no genres listed)")
    wide_scores.cache()

    # The genres that actually occur in this data set.
    feature_genres = wide_scores.drop("userId").columns

    # Assemble the feature vector and train k-means.
    assembler = VectorAssembler(inputCols=feature_genres,
                                outputCol="features")
    training = assembler.transform(wide_scores)
    kmeans_model = KMeans().setK(k).setSeed(5052).fit(training)

    # Remember which genres the model was trained on.
    kmeans_model.genres = feature_genres

    # Evaluate the training predictions and keep the score on the model.
    evaluator = ClusteringEvaluator()
    kmeans_model.sihlouette_score = evaluator.evaluate(
        kmeans_model.transform(training))

    return kmeans_model
def chartShow():
    """Fit KMeans for k = 2..9 on the ``train()`` data and plot the results.

    For every k the collected predictions and the cluster centres are kept
    and the within-set sum of squared errors (WSSSE) is printed. Two figures
    are then drawn: WSSSE against k, and a 3-D scatter of the points and
    centres for one fixed k.
    """
    data = train()

    # ---- KMeans clustering ----
    print("------------------Kmeans聚类--------------------")
    print("------------设定不同的K值,进行分类,计算平方误差之和------------")
    errors = []
    results = []
    centers = []
    for k in range(2, 10):
        # Fit the model for this k.
        kmeansmodel = KMeans(k=k,
                             featuresCol='feature',
                             predictionCol='prediction').fit(data)
        print("With K={}".format(k))

        # Data set with the predicted cluster label attached.
        results.append(kmeansmodel.transform(data).collect())

        # All cluster centres of the model.
        kmeans_centers = kmeansmodel.clusterCenters()
        centers.append(kmeans_centers)
        print(len(kmeans_centers))
        for center in kmeans_centers:
            print(center)

        # Within Set Sum of Squared Error (WSSSE).
        WSSSE = kmeansmodel.computeCost(data)
        errors.append(WSSSE)
        print("Within Set Sum of Squared Error = " + str(WSSSE))
        print('--' * 30 + '\n')

    # ---- WSSSE visualisation ----
    plt.figure()
    plt.plot(range(2, 10), errors)
    plt.xlabel('Number of K')
    plt.ylabel('WSSSE')
    plt.title('K-WSSSE')

    # ---- Clustering result visualisation ----
    print("---------将数据转换为panda结构,并查看空间3d图心-----------")
    # NOTE(review): the original comment says the K-WSSSE plot favours k=6,
    # yet k is set to 4 here -- confirm which value is intended.
    k = 4
    cluster_vis = plt.figure(figsize=(10, 10)).gca(projection='3d')

    # Cluster label -> marker colour.
    colour_by_cluster = (
        (0, 'b'),  # blue
        (1, 'y'),  # yellow
        (2, 'm'),  # magenta
        (3, 'k'),  # black
        (4, 'g'),  # green
        (5, 'c'),  # cyan
    )
    # assumes field 0 of each row is the feature vector and field 1 the
    # predicted cluster -- TODO confirm against train()
    for point in results[k - 2]:
        for cluster_id, colour in colour_by_cluster:
            if point[1] == cluster_id:
                cluster_vis.scatter(point[0][0], point[0][1], point[0][2],
                                    c=colour)

    # Cluster centres: red pentagon markers.
    for center in centers[k - 2]:
        cluster_vis.scatter(center[0], center[1], center[2],
                            c='r', marker='p')

    plt.show()
# ---- KMeans clustering ----
# Fit a model for each K from 2 to 9, keeping the predictions, the cluster
# centres and the within-set sum of squared errors for every K.
print("------------------Kmeans聚类--------------------")
print("------------设定不同的K值,进行分类,计算平方误差之和------------")
errors = []
results = []
centers = []
for k in range(2, 10):
    kmeansmodel = KMeans(k=k,
                         featuresCol='iris_features',
                         predictionCol='prediction').fit(iris_DF)
    print("With K={}".format(k))

    # Data set with the predicted cluster label attached.
    kmeans_results = kmeansmodel.transform(iris_DF).collect()
    results.append(kmeans_results)
    for item in kmeans_results:
        print('{} is predcted as cluster{}'.format(item[0], item[1]))

    # All cluster centres of the model.
    kmeans_centers = kmeansmodel.clusterCenters()
    centers.append(kmeans_centers)
    center_seq = 0
    for item in kmeans_centers:
        print("Cluster{} Center{}".format(center_seq, item))
        center_seq += 1

    # Within Set Sum of Squared Error (WSSSE).
    WSSSE = kmeansmodel.computeCost(iris_DF)
    errors.append(WSSSE)
def cluster(self, df, session, repartition_num=8):
    """Cluster the rows of ``df`` with a k-nearest-neighbour spectral method.

    Steps, as implemented below:
      1. give every row a 0-based ``id``;
      2. self-join the feature column (excluding self pairs) and score each
         pair with ``self.distance``;
      3. keep the ``self.k_nearest`` best-ranked partners of every row;
      4. build one Laplacian vector per point (``laplacian_vector_udf``
         wrapped in ``toVector_udf``);
      5. run a full SVD of the Laplacian and keep the last ``self.k``
         columns of ``V``; row ``i`` of that slice is used as the embedding
         of the point with ``id == i``;
      6. run KMeans (``k=self.k``) on the embeddings and join the result
         with the indexed input on ``id``.

    Args:
        df: input DataFrame containing the column named ``self.featureCol``.
        session: SparkSession used to create the embedding DataFrame.
        repartition_num: number of range partitions used before the self-join.

    Returns:
        The KMeans output joined with the indexed input rows on ``id``.
    """
    n = df.count()

    # Index rows: a constant partition key yields one global numbering.
    df_index = df.select(
        (row_number().over(
            Window.partitionBy(lit(0)).orderBy(self.featureCol)) - 1).alias("id"),
        "*",
    )
    df_features = df_index.select("id", self.featureCol)

    # Prepare both sides of the self-join.
    df_features = df_features.repartitionByRange(repartition_num, "id")
    left_df = df_features.select(
        df_features["id"].alias("left_id"),
        df_features[self.featureCol].alias("left_features"),
    )
    right_df = df_features.select(
        df_features["id"].alias("right_id"),
        df_features[self.featureCol].alias("right_features"),
    )

    # Pair every row with every other row (left_id != right_id).
    joined_df = left_df.join(right_df,
                             left_df["left_id"] != right_df["right_id"])

    # Compute the pairwise score between the two feature vectors.
    joined_df = joined_df.select(
        "left_id",
        "right_id",
        self.distance(
            array(joined_df["left_features"],
                  joined_df["right_features"])).alias("norm"),
    )

    # Ascending order: the smallest scores rank first.
    ranked = joined_df.select(
        "left_id",
        "right_id",
        rank().over(
            Window.partitionBy("left_id").orderBy("norm")).alias("rank"),
    )

    # Keep as many neighbours as the Laplacian UDF is told about below.
    # This used to be a hard-coded 5, which disagreed with self.k_nearest
    # whenever the two differed. rank() gives tied pairs the same rank, so
    # ties can still let a few extra neighbours through.
    knn = ranked.where(ranked["rank"] <= self.k_nearest)
    knn_grouped = knn.groupBy("left_id").agg(
        f.collect_list("right_id").alias("nn"))

    # Generate the Laplacian, one ml Vector per point, keyed by id.
    laplacian = knn_grouped.select(
        knn_grouped["left_id"].alias("id"),
        toVector_udf(
            laplacian_vector_udf(
                knn_grouped["left_id"],
                knn_grouped["nn"],
                lit(n),
                lit(self.k_nearest),
            )).alias("lap_vector"),
    )
    laplacian_matrix = RowMatrix(
        laplacian.select("lap_vector").rdd.map(lambda x: x[0].toArray()),
        -1, -1)

    # Full decomposition (k = number of rows) rather than
    # self.num_eigenvectors; only the last self.k columns of V are used.
    svd = laplacian_matrix.computeSVD(k=laplacian_matrix.numRows())
    eigenvectors = [
        (idx, Vectors.dense([float(item) for item in row]))
        for idx, row in enumerate(svd.V.toArray()[:, -self.k:].tolist())
    ]
    eigen_df = session.createDataFrame(eigenvectors, ["id", self.featureCol])

    model = KMeans(featuresCol=self.featureCol,
                   predictionCol=self.predictionCol,
                   k=self.k).fit(eigen_df)
    predictions = model.transform(eigen_df).join(df_index, on="id")
    return predictions
# MAGIC
# MAGIC **Note:** This command runs multiple Spark jobs (one job per iteration of the KMeans algorithm). You will see the progress bar starting over and over again.

# COMMAND ----------

from pyspark.ml.clustering import KMeans
model = KMeans().setK(2).fit(trainingData)

# COMMAND ----------

# MAGIC %md To see the result of our clustering, we produce a scatter plot matrix that shows the interaction between the input variables and the learned clusters. To get that, we apply the model to the original data and pick four columns: `prediction` and the original features (`duration`, `tempo`, and `loudness`).

# COMMAND ----------

transformed = model.transform(trainingData).select("duration", "tempo", "loudness", "prediction")

# COMMAND ----------

# MAGIC %md To comfortably visualize the data we produce a random sample.
# MAGIC Remember the `display()` function? We can use it to produce a nicely rendered table of the transformed DataFrame.

# COMMAND ----------

display(transformed.sample(False, fraction=0.005))

# COMMAND ----------

# MAGIC %md To generate a scatter plot matrix, click on the plot button below the table and select `scatter`. That will transform your table into a scatter plot matrix. It automatically picks all numeric columns as values. To include the predicted clusters, click on `Plot Options` and drag `prediction` to the list of Keys. You will get the following plot. On the diagonal panels you see the PDF of the marginal distribution of each variable. Non-diagonal panels show a scatter plot between the two variables of the row and column. For example, the top right panel shows the scatter plot between duration and loudness. Each point is colored according to the cluster it is assigned to.

# COMMAND ----------
# TF-IDF features: tokenize each sentence, hash the words into 512 term
# frequency buckets, then rescale by inverse document frequency.
tfidf_dataFrame = genre_and_sentences_after_flatmap.toDF(["genre", "sentence"])

tokenizer = Tokenizer(inputCol="sentence", outputCol="words")
tfidf_words_data = tokenizer.transform(tfidf_dataFrame)

hashing_tf = HashingTF(inputCol="words",
                       outputCol="rawFeatures",
                       numFeatures=512)
tfidf_featurized_data = hashing_tf.transform(tfidf_words_data)

idf_model = IDF(inputCol="rawFeatures",
                outputCol="features").fit(tfidf_featurized_data)
tfidf_rescaled_data = idf_model.transform(tfidf_featurized_data)
tfidf_genre_features = tfidf_rescaled_data.select("genre", "features")

# Confusion matrix for TF-IDF: cluster into 5 groups and count how often
# each (prediction, genre) pair occurs.
tfidf_kmeansmodel = KMeans(k=5,
                           featuresCol='features',
                           predictionCol='prediction').fit(tfidf_genre_features)
tfidf_predictions = tfidf_kmeansmodel.transform(
    tfidf_genre_features).select("prediction", "genre")
tfidf_res = tfidf_predictions.groupBy('prediction', 'genre').count().collect()
print("Confusion matrix for TFIDF:")
toPrint(tfidf_res)
print()

#######################################################################
##                  Vocabulary Exploration - Part B                  ##
#######################################################################

# Features from the pretrained embeddings, computed per partition by emb().
pretrained_genre_features = genre_and_sentences_after_flatmap.mapPartitions(emb)
pretrained_dataFrame = pretrained_genre_features.map(toList).toDF(
    ["genre", "features"])

# Identity UDF whose declared return type is a non-nullable array of doubles.
new_schema = ArrayType(DoubleType(), containsNull=False)
udf_foo = udf(lambda x: x, new_schema)
# MAGIC %md We can now pass this new DataFrame to the `KMeans` model and ask it to categorize the rows of our data into two different classes (`setK(2)`). We place the model in a variable named `model`.
# MAGIC
# MAGIC **Note:** This command runs multiple Spark jobs (one job per iteration of the KMeans algorithm). You will see the progress bar starting over and over again.

# COMMAND ----------

from pyspark.ml.clustering import KMeans
model = KMeans().setK(2).fit(trainingData)

# COMMAND ----------

# MAGIC %md To see the result of our clustering, we produce a scatter plot matrix that shows the interaction between the input variables and the learned clusters. To get that, we apply the model to the original data and pick four columns: `prediction` and the original features (`duration`, `tempo`, and `loudness`).

# COMMAND ----------

transformed = model.transform(trainingData).select("duration", "tempo", "loudness", "prediction")

# COMMAND ----------

# MAGIC %md To comfortably visualize the data we produce a random sample.
# MAGIC Remember the `display()` function? We can use it to produce a nicely rendered table of the transformed DataFrame.

# COMMAND ----------

display(transformed.sample(False, fraction = 0.005))

# COMMAND ----------

# MAGIC %md To generate a scatter plot matrix, click on the plot button below the table and select `scatter`. That will transform your table into a scatter plot matrix. It automatically picks all numeric columns as values. To include the predicted clusters, click on `Plot Options` and drag `prediction` to the list of Keys. You will get the following plot. On the diagonal panels you see the PDF of the marginal distribution of each variable. Non-diagonal panels show a scatter plot between the two variables of the row and column. For example, the top right panel shows the scatter plot between duration and loudness. Each point is colored according to the cluster it is assigned to.

# COMMAND ----------
# Validate the requested algorithm before any Spark work is started.
k = args.k_clusters
if algorithm not in ('kmeans', 'gmm', 'lda', 'spectral'):
    raise ValueError('Not a valid algorithm')

# Load the CSV, preprocess it and keep a parquet copy of the result.
ss = SparkSession.builder.getOrCreate()
df = ss.read.csv(path, header=True, inferSchema=True)
df_preprocessed = preprocessing(df, num_pca=num_pca_features)
df_preprocessed.write.parquet("preprocessed", mode="Overwrite")

if algorithm == 'spectral':
    # Spectral clustering has its own entry point instead of fit/transform.
    model = SpectralClustering(k=k, k_nearest=7)
    predictions = model.cluster(df_preprocessed, ss,
                                repartition_num=num_nodes)
else:
    # The remaining algorithms share the estimator fit/transform protocol.
    if algorithm == 'kmeans':
        model = KMeans(k=k).setSeed(1).fit(df_preprocessed)
    elif algorithm == 'lda':
        model = LDA(k=k, maxIter=10).fit(df_preprocessed)
    elif algorithm == 'gmm':
        model = GaussianMixture(k=k).fit(df_preprocessed)
    predictions = model.transform(df_preprocessed)

# Write everything except the feature vector to stdout as CSV.
output_columns = [name for name in predictions.columns if name != 'features']
predictions.select(output_columns).toPandas().to_csv(sys.stdout)
from sklearn.grid_search import GridSearchCV


def f(x):
    """Convert one split line of iris.txt into the keyword arguments of a Row.

    Fields 1-4 of ``x`` become the dense "features" vector; field 5, with
    surrounding double quotes stripped, becomes "label".
    """
    rel = {}
    rel['features'] = Vectors.dense(float(x[1]), float(x[2]), float(x[3]),
                                    float(x[4]))
    rel['label'] = str(x[5]).strip("\"")
    return rel


spark = SparkSession.builder.appName("logistic_regression").getOrCreate()

# One Row(features, label) per comma-separated line of iris.txt.
df = spark.sparkContext.textFile("iris.txt").map(
    lambda line: line.split(",")).map(lambda p: Row(**f(p))).toDF()

# Create the Estimator and call its fit() method to produce the matching
# Transformer: here KMeans is the Estimator, and KMeansModel, which holds the
# trained model, is the Transformer.
"""创建Estimator并调用其fit()方法来生成相应的Transformer对象, 很显然,在这里KMeans类是Estimator,而用于保存训练后模型的KMeansModel类则属于Transformer"""
kmeans_model = KMeans().setK(3).setFeaturesCol("features").setPredictionCol(
    "prediction").fit(df)

results = kmeans_model.transform(df).collect()
for item in results:
    # Read the fields by name: position 1 is the "label" column, not the
    # prediction, so indexing printed the true label as the predicted cluster.
    print(str(item.features) + " is predicted as cluster " +
          str(item.prediction))

# The cluster centres of the model are available from KMeansModel.
"""有可以通过KMeansModel类自带的clusterCenter属性获取到模型的所有聚类中心情况"""
results2 = kmeans_model.clusterCenters()
for item in results2:
    print(item)

# As in MLlib, KMeansModel can compute the Within Set Sum of Squared Error
# (WSSSE) to measure clustering quality; when the true K is unknown, how this
# value changes is a useful guide for choosing K.
"""与MLLib下的实现相同,KMeansModel类也提供了计算集合内误差平方和(Within Set Sum of Squared Error, WSSSE) 的方法来度量聚类的有效性,在真实K值未知的情况下,该值的变化可以作为选取合适K值的一个重要参考"""
print(kmeans_model.computeCost(df))
# Reference https://chrisalbon.com/machine_learning/trees_and_forests/random_forest_classifier_example/
# Fits a 3-cluster KMeans model on the iris libsvm sample and prints the
# score computed by ClusteringEvaluator.
import time

from pyspark.ml.clustering import KMeans
from pyspark.ml.evaluation import ClusteringEvaluator
from pyspark import SparkConf, SparkContext, SQLContext

conf = SparkConf().setMaster("local[*]")
sc = SparkContext(conf=conf)
spark = SQLContext(sc)

# Raw string: the Windows path contains backslashes that are not valid
# escape sequences (\O, \S, \d, \m, \i). The resulting value is unchanged.
data = spark.read.format("libsvm").load(
    r"D:\Outils\Spark\data\mllib\iris_libsvm.txt")

model = KMeans().setK(3)
model = model.fit(data)
predictions = model.transform(data)

# ClusteringEvaluator's default metric is the silhouette score; clustering
# has no labels to compare against, so this is not an accuracy.
evaluator = ClusteringEvaluator()
score = evaluator.evaluate(predictions)
print('Silhouette score: ', score)
print("Within Set Sum of Squared Errors = {}".format(wssse))

# Show the cluster centres of the fitted model.
centers = model.clusterCenters()
print("Cluster Centers: ")
for center in centers:
    print(center)

# COMMAND ----------

from pyspark.ml.feature import PCA as PCAml
from pyspark.ml.linalg import Vectors  # Pre 2.0 pyspark.mllib.linalg

# Project the "features" column onto its first two principal components.
pca = PCAml(k=2, inputCol="features", outputCol="pca")
model = pca.fit(df_)
df_pca = model.transform(df_)

# COMMAND ----------

model.explainedVariance

# COMMAND ----------

display(df_pca)

# COMMAND ----------

from pyspark.ml.clustering import KMeans

# Train a 20-cluster k-means model on the PCA output column.
model = KMeans(featuresCol="pca", k=20, seed=1).fit(df_pca)
from pyspark.ml.clustering import KMeans

# Search for the best k: print the training cost for k = 2..10.
for i in range(2, 11):
    km = (KMeans()
          .setK(i)
          .setSeed(4603)
          .setFeaturesCol('feature')
          .setPredictionCol('prediction'))
    res_kmval = km.fit(clsdata_model).summary.trainingCost
    print(i, ': ', res_kmval)

# NOTE(review): the original note here read "k = 4", but the model below is
# fitted with k = 3 -- confirm which value is intended.
model = (KMeans()
         .setK(3)
         .setSeed(4603)
         .setFeaturesCol('feature')
         .setPredictionCol('prediction')
         .fit(clsdata_model))
res_km = model.transform(clsdata_model)

# Pasted interpreter output is kept as comments below; the bare
# ">>> 7632810.723481619" line was a syntax error when run as a script.
summary = model.summary
summary.clusterSizes
# recorded output: [739011, 463649, 807578]
summary.trainingCost
# recorded output: 7632810.723481619
model.clusterCenters()
model.save('kmeans3_model')

# Attach each row's predicted cluster to the original data by id.
clsdata_vecform.createOrReplaceTempView('clsdata')
res_km.createOrReplaceTempView('reskm')
res4 = spark.sql('select c.*, r.prediction as prediction from clsdata c, reskm r where c.id = r.id').drop('feature')
print('Wrong arguments number') sys.exit(-1) spark = SparkSession.builder.appName('part2').getOrCreate() df=spark.read.csv(sys.argv[1], inferSchema=True, header=True)\ .select(['Street Code1','Street Code2','Street Code3','Vehicle Color']) assembler = VectorAssembler( inputCols=['Street Code1', 'Street Code2', 'Street Code3'], outputCol='features') data = assembler.transform(df) # Setup KMeans model and train model = KMeans(featuresCol='features', k=int(sys.argv[2])).fit(data) prediction = model.transform(data) # out of sample data newdf = spark.createDataFrame( [(34510, 10030, 34050, 'BLK')], ('Street Code1', 'Street Code2', 'Street Code3', 'Vehicle Color')) newpred = model.transform(assembler.transform(newdf)) # cluster of the out of sample data and its color cluster = newpred.select('prediction').collect()[0].prediction color = newpred.select('Vehicle Color').collect()[0]['Vehicle Color'] colors = prediction.filter( prediction['prediction'] == cluster).select('Vehicle Color').collect()