def __train_model(self): """Train the ALS model with the current dataset """ logger.info("Training model 1...") kmeans_1 = KMeans().setK(9).setSeed(1) model_1 = kmeans_1.fit(self.df_cbg1) logger.info("Model 1 built!") logger.info("Evaluating the model 1...") self.predictions_1 = model_1.transform(self.df_cbg1) logger.info("Model 1 Done !") logger.info("Training model 2...") kmeans_2 = KMeans().setK(9).setSeed(1) model_2 = kmeans_2.fit(self.df_cbg2) logger.info("Model 2 built!") logger.info("Evaluating the model 2...") self.predictions_2 = model_2.transform(self.df_cbg2) logger.info("Model 2 Done !") logger.info("Training model 3...") kmeans_3 = KMeans().setK(9).setSeed(1) model_3 = kmeans_3.fit(self.df_cbg3) logger.info("Model 3 built!") logger.info("Evaluating the model 3...") self.predictions_3 = model_3.transform(self.df_cbg3) logger.info("Model 3 Done !")
def __train_model(self):
    logger.info("Training model 1...")
    kmeans_1 = KMeans().setK(9).setSeed(1)
    model_1 = kmeans_1.fit(self.df_crime1)
    logger.info("Model 1 built!")
    logger.info("Evaluating the model 1...")
    self.predictions_1 = model_1.transform(self.df_crime1)
    logger.info("Model 1 Done !")

    logger.info("Training model 2...")
    kmeans_2 = KMeans().setK(9).setSeed(1)
    model_2 = kmeans_2.fit(self.df_crime2)
    logger.info("Model 2 built!")
    logger.info("Evaluating the model 2...")
    self.predictions_2 = model_2.transform(self.df_crime2)
    logger.info("Model 2 Done !")

    logger.info("Training model 3...")
    kmeans_3 = KMeans().setK(9).setSeed(1)
    model_3 = kmeans_3.fit(self.df_crime3)
    logger.info("Model 3 built!")
    logger.info("Evaluating the model 3...")
    self.predictions_3 = model_3.transform(self.df_crime3)
    logger.info("Model 3 Done !")
def __train_model(self): """Train the model with the current dataset """ logger.info("Splitting dataset into 3...") # Model 0: 1/3 data pertama. # Model 1: 1/3 data pertama + 1/3 data kedua. # Model 2: semua data self.df0 = self.dforiginal.limit(int(self.dataset_count / 3)) self.df1 = self.dforiginal.limit(int(self.dataset_count * 2 / 3)) self.df2 = self.dforiginal print('df 0 count = ' + str(self.df0.count())) print('df 1 count = ' + str(self.df1.count())) print('df 2 count = ' + str(self.df2.count())) logger.info("Dataset Splitted !") logger.info("Training model 0...") kmeans_0 = KMeans().setK(5).setSeed(1) model_0 = kmeans_0.fit(self.df0) self.predictions_0 = model_0.transform(self.df0) logger.info("Model 0 built!") logger.info("Evaluating the model 0...") evaluator_0 = ClusteringEvaluator() silhouette_0 = evaluator_0.evaluate(self.predictions_0) logger.info("Silhouette with squared euclidean distance = " + str(silhouette_0)) self.centers_0 = model_0.clusterCenters() logger.info("Model 0 Done !") logger.info("Training model 1...") kmeans_1 = KMeans().setK(5).setSeed(1) model_1 = kmeans_1.fit(self.df1) self.predictions_1 = model_1.transform(self.df1) logger.info("Model 1 built!") logger.info("Evaluating the model 1...") evaluator_1 = ClusteringEvaluator() silhouette_1 = evaluator_1.evaluate(self.predictions_1) logger.info("Silhouette with squared euclidean distance = " + str(silhouette_1)) self.centers_1 = model_1.clusterCenters() logger.info("Model 1 Done !") logger.info("Training model 2...") kmeans_2 = KMeans().setK(5).setSeed(1) model_2 = kmeans_2.fit(self.df2) self.predictions_2 = model_2.transform(self.df2) logger.info("Model 2 built!") logger.info("Evaluating the model 2...") evaluator_2 = ClusteringEvaluator() silhouette_2 = evaluator_2.evaluate(self.predictions_2) logger.info("Silhouette with squared euclidean distance = " + str(silhouette_2)) self.centers_2 = model_2.clusterCenters() logger.info("Model 2 Done !")
def train(bucket_name, feature_path, feature_name, output_path, plot_path):
    sc = SparkContext.getOrCreate()
    sqlCtx = SQLContext(sc)

    # read csv from s3 and store it locally
    path = feature_path + feature_name  # used both locally and remotely: features/pca.csv
    s3 = boto3.resource('s3')
    s3.Object(bucket_name, path).download_file(path)
    df_spark = sqlCtx.read.csv(path, header=True, inferSchema=True)

    # Dataframe to rdd
    vecAssembler = VectorAssembler(inputCols=df_spark.columns, outputCol="features")
    df_spark = vecAssembler.transform(df_spark)
    rdd = df_spark.rdd.map(lambda x: array(x["features"]))
    print(rdd.take(10))

    # From here: K-means specific
    # Pick k
    cost = np.zeros(20)
    for k in range(2, 20):
        kmeans = KMeans().setK(k).setSeed(1).setFeaturesCol("features")
        model = kmeans.fit(df_spark.sample(False, 0.5, seed=42))
        cost[k] = model.computeCost(df_spark)

    plt.figure(1)
    fig, ax = plt.subplots(1, 1, figsize=(8, 6))
    ax.plot(range(2, 20), cost[2:20])
    ax.set_xlabel('k')
    ax.set_ylabel('cost')
    plt.savefig(plot_path + "k-means vary-k.png")

    # Train and upload model to s3
    k = 8
    kmeans = KMeans().setK(k).setSeed(1).setFeaturesCol("features")
    model = kmeans.fit(df_spark)
    model.write().overwrite().save(output_path + "k-means.model")  # save the model to s3
    data = model.transform(df_spark).toPandas()
    print(data.info())
    data.to_csv(output_path + "/transformed.csv")

    # Plotting
    fig = plt.figure(2, figsize=(5, 5))
    plt.scatter(data["pca1"], data["pca2"], c=data["prediction"], s=30, cmap='viridis')
    plt.title("K Means (K=%d)" % k, fontsize=14)
    plt.xlabel("PC1")
    plt.ylabel("PC2")
    plt.savefig(plot_path + "k-means-cluster.png")
def hacker_test(spark, resources_folder):
    data = spark.read.csv(resources_folder + 'hack_data.csv', header=True, inferSchema=True)
    data.printSchema()
    data.show()
    print(data.columns)
    assembler = VectorAssembler(inputCols=[
        'Session_Connection_Time', 'Bytes Transferred', 'Kali_Trace_Used',
        'Servers_Corrupted', 'Pages_Corrupted', 'WPM_Typing_Speed'
    ], outputCol='features')
    data_assembled = assembler.transform(data)
    data_assembled.show()

    scaler = StandardScaler(inputCol='features', outputCol='scaledfeatures')
    scaler_model = scaler.fit(data_assembled)
    data_assembled_scaled = scaler_model.transform(data_assembled)
    data_assembled_scaled.show()
    data_assembled = data_assembled_scaled.select('scaledfeatures').withColumn(
        'features', data_assembled_scaled['scaledfeatures'])
    data_assembled.show()

    print("************************************* with three clusters *************************************")
    kmeans3 = KMeans(featuresCol='features', k=3, seed=10)
    model3 = kmeans3.fit(data_assembled)
    wssse3 = model3.summary.trainingCost
    print(wssse3)
    print(model3.clusterCenters())
    model3.summary.predictions.show()
    predictions3 = model3.summary.predictions
    predictions3.groupBy('prediction').count().show()
    # predictions3.agg({'prediction': 'count'}).show()

    print("************************************* with two clusters *************************************")
    kmeans2 = KMeans(featuresCol='features', k=2, seed=10)
    model2 = kmeans2.fit(data_assembled)
    wssse2 = model2.summary.trainingCost
    print(wssse2)
    print(model2.clusterCenters())
    model2.summary.predictions.show()
    predictions2 = model2.summary.predictions
    predictions2.groupBy('prediction').count().show()
def cluster_kmeans(self, k=22):
    # cost = np.zeros(20)
    # for k in range(2, 20):
    #     kmeans = KMeans().setK(k).setSeed(1).setFeaturesCol("features")
    #     model = kmeans.fit(self.dataframe.sample(False, 0.1, seed=42))
    #     cost[k] = model.computeCost(self.dataframe)
    # fig, ax = plt.subplots(1, 1, figsize=(8, 6))
    # ax.plot(range(2, 20), cost[2:20])
    # ax.set_xlabel('k')
    # ax.set_ylabel('cost')
    # fig.show()
    # time.sleep(20)
    kmeans = KMeans().setK(k).setSeed(1).setFeaturesCol(
        "features").setPredictionCol("kmeans_prediction")
    model = kmeans.fit(self.dataframe)
    centers = model.clusterCenters()
    try:
        model.save("kmeans-model" + str(k))
    except:
        model.write().overwrite().save("kmeans-model" + str(k))
    # print("Cluster Centers: ")
    # plt.plot(centers, '-o')
    # plt.show()
    self.dataframe = model.transform(self.dataframe)
def feature_engineering(class_balancedDf):
    # N-Gram
    ngram = NGram(n=2, inputCol="lemmatized", outputCol="ngrams")
    ngramDataFrame = ngram.transform(class_balancedDf)

    # Hashing TF
    hashingTF = HashingTF(inputCol="ngrams", outputCol="rawFeatures", numFeatures=20)
    featurizedData = hashingTF.transform(ngramDataFrame)

    # IDF
    idf = IDF(inputCol="rawFeatures", outputCol="features")
    idfModel = idf.fit(featurizedData)
    rescaledData = idfModel.transform(featurizedData)

    # K-Means
    kmeans = KMeans().setK(6).setSeed(1)
    kmodel = kmeans.fit(rescaledData).transform(rescaledData)

    # LDA
    lda = LDA(k=10, maxIter=10)
    ldamodel = lda.fit(kmodel).transform(kmodel)

    # changing label column to int
    data = ldamodel.withColumn(
        "label", ldamodel.label.cast("Integer")).drop("prediction")
    return data
def kmeans(df):
    kmeans = KMeans(k=2, seed=1)
    model = kmeans.fit(df)
    centers = model.clusterCenters()
    print(len(centers))
    kmFeatures = model.transform(df).select("features", "prediction")
    dfwrite(kmFeatures, 'kmFeatures')
def cluster(self):
    from pyspark.ml.clustering import KMeans
    from pyspark.ml.evaluation import ClusteringEvaluator

    # Loads data.
    dataset = self.read.format("libsvm").load(self.dataDir + "data/mllib/sample_kmeans_data.txt")

    # Trains a k-means model.
    kmeans = KMeans().setK(2).setSeed(1)
    model = kmeans.fit(dataset)

    # Make predictions
    predictions = model.transform(dataset)

    # Evaluate clustering by computing Silhouette score
    evaluator = ClusteringEvaluator()
    silhouette = evaluator.evaluate(predictions)
    print("Silhouette with squared euclidean distance = " + str(silhouette))

    # Shows the result.
    centers = model.clusterCenters()
    print("Cluster Centers: ")
    for center in centers:
        print(center)
def kmeans_scan(_data, _k_min=2, _k_max=6, _tmp_dir='tmp_models'):
    """Scan different k-means models within the specified k range.

    The function assumes that the input data are ready to be used and
    already contain the features column.
    """
    # Define the evaluator used to find the optimal k. The evaluator computes the Silhouette score.
    evaluator = ClusteringEvaluator()

    # Dictionaries used to save the results obtained for the different k considered.
    silhuette_scores = {}
    centers = {}

    # If the temporary directory already exists it will be removed to create a fresh one.
    # Other ways of handling this case are possible but they won't be considered here;
    # the extension to these cases is straightforward.
    if os.path.exists(_tmp_dir):
        shutil.rmtree(_tmp_dir)
    os.mkdir(_tmp_dir)

    # Fit and save the model for each specified k
    for k in range(_k_min, _k_max + 1):
        kmeans = KMeans().setK(k).setSeed(1).setFeaturesCol('features')
        model = kmeans.fit(_data)
        transformed = model.transform(_data)
        silhuette_scores[k] = evaluator.evaluate(transformed)
        centers[k] = model.clusterCenters()
        model.save(os.path.join(_tmp_dir, "model_w_k_{}".format(k)))

    return centers, silhuette_scores
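# A minimal usage sketch for kmeans_scan (hedged): `assembled_df` is a hypothetical
# DataFrame that already contains a 'features' column. It picks the k with the highest
# Silhouette score from the returned dictionary.
centers, scores = kmeans_scan(assembled_df, _k_min=2, _k_max=6)
best_k = max(scores, key=scores.get)
print("Best k by silhouette:", best_k, "score:", scores[best_k])
print("Centers for best k:", centers[best_k])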
def main():
    spark = SparkSession.builder.getOrCreate()

    # load dataset
    # datapath = os.path.dirname(os.path.dirname(os.path.abspath(sys.argv[0])))
    # dataset = spark.read.format('libsvm').json(datapath+'/data/business.json')
    filename = '/Users/nicolasg-chausseau/Downloads/yelp_dataset/business_MTL_ONLY.json'
    # filename = '/Users/nicolasg-chausseau/Downloads/yelp_dataset/review_MTL_ONLY.json'
    dataset = spark.read.json(filename)
    print(dataset)

    # get longitude and latitude
    ll = dataset.select(dataset.categories[0], dataset.longitude, dataset.latitude)
    ll = ll.withColumnRenamed('categories[0]', 'categories')
    ll.show()
    print(ll.schema.names)
    # for item in ll.schema.names:
    #     print(item)
    #     for item2 in item:
    #         print(item2)
    sys.exit()

    # convert ll to dense vectors
    # data = ll.rdd.map(lambda x: (Vectors.dense(float(x[0]), float(x[1])),)).collect()
    assembler = VectorAssembler(inputCols=['longitude', 'latitude'], outputCol='features')
    df = assembler.transform(ll)

    # set KMeans k and seed
    kmeans = KMeans(k=4, seed=1)

    # generate model
    model = kmeans.fit(df)

    # Make predictions
    predictions = model.transform(df)
    predictions.show(20)

    # Evaluate clustering by computing Silhouette score
    evaluator = ClusteringEvaluator()
    silhouette = evaluator.evaluate(predictions)
    print("Silhouette with squared euclidean distance = " + str(silhouette))

    # number of locations in each cluster
    print('Number of businesses in each cluster: ')
    predictions.groupBy('prediction').count().sort(desc('count')).show()

    # show in which cluster we have the most restaurants
    print('Number of restaurants per cluster')
    predictions.where(predictions.categories == 'Restaurants').groupBy(
        'prediction').count().sort(desc('count')).show()

    # Shows the result.
    centers = model.clusterCenters()
    print("Cluster Centers: ")
    for center in centers:
        print(center)
def train_cluster(df, k):
    evaluator = ClusteringEvaluator(predictionCol='cluster', featuresCol='final_features_scaled',
                                    metricName='silhouette', distanceMeasure='squaredEuclidean')
    kmeans = KMeans() \
        .setK(k) \
        .setFeaturesCol("final_features_scaled") \
        .setPredictionCol("cluster")
    kmeans_model = kmeans.fit(df)
    output = kmeans_model.transform(df)
    score = evaluator.evaluate(output)
    print("k: {}, silhouette score: {}".format(k, score))

    expr_mean = [F.avg(col).alias(col + '_mean') for col in final_features]

    # @pandas_udf(FloatType(), functionType=PandasUDFType.GROUPED_AGG)
    # def _func_median(v):
    #     return v.median()
    # expr_median = [_func_median(output[col]).alias(col+'_median') for col in numeric_features]
    # df_median = output.groupBy('cluster').agg(*expr_median).toPandas()

    df_mean = output.groupBy('cluster').agg(
        F.count(F.lit(1)).alias("audience_num"), *expr_mean).toPandas()
    # result = pd.merge(df_mean, df_median, on='cluster')
    return output, df_mean
def ClusterWords(w2v, seqs):
    # To force each word to be a cluster center we use a trick:
    # we train a k-means model such that the number of clusters equals the number of words.
    words = w2v.getVectors()
    words = words.join(broadcast(seqs), words.word == seqs.word) \
                 .select(words.word.alias('word'), 'vector')
    words.cache()
    nwords = words.count()
    km = KMeans(featuresCol='vector', predictionCol='cluster', k=nwords)
    centers = km.fit(words)

    # create a dictionary of the words
    d = MakeDict(words, 'word', 'vector')
    old_words = words
    words = centers.transform(words) \
        .dropDuplicates(subset=['cluster']) \
        .withColumnRenamed('vector', 'centerVector')
    words.cache()
    words.show(10, False)
    return (words, d, centers)
def kmeans(features, num_clusters):
    """Does clustering on the features dataset using KMeans clustering.

    Params:
    - features (pyspark.sql.DataFrame): The data frame containing the features to be used for clustering
    - num_clusters (int): The number of clusters to be used

    Returns:
    - clustered (pyspark.sql.DataFrame): The data frame, with the predicted clusters in a 'cluster' column
    """
    kmeans = KMeans(k=num_clusters, featuresCol='features', predictionCol='cluster')
    kmeans_model = kmeans.fit(features)
    clustered = kmeans_model.transform(features)
    clustered.show()
    cluster_centers = kmeans_model.clusterCenters()
    clustered = clustered.rdd.map(
        lambda row: Row(distance=Vectors.squared_distance(
            cluster_centers[row['cluster']], row['features']),
            **row.asDict())).toDF()
    clustered.show()
    print("=====Clustering Results=====")
    print("Clustering cost = ", kmeans_model.computeCost(features))
    print("Cluster sizes = ", kmeans_model.summary.clusterSizes)
    return clustered
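# KMeansModel.computeCost (used above) is deprecated as of Spark 3.0. A minimal hedged
# sketch of an alternative, reusing the `kmeans_model` and `clustered` names from the
# function above (assumes Spark 3.0+):
from pyspark.ml.evaluation import ClusteringEvaluator

print("Training cost (WSSSE) = ", kmeans_model.summary.trainingCost)
evaluator = ClusteringEvaluator(featuresCol='features', predictionCol='cluster')
print("Silhouette = ", evaluator.evaluate(clustered))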
def train_cluster(df, k):
    evaluator = ClusteringEvaluator(predictionCol='cluster', featuresCol='final_features_scaled',
                                    metricName='silhouette', distanceMeasure='squaredEuclidean')
    kmeans = KMeans() \
        .setK(k) \
        .setFeaturesCol("final_features_scaled") \
        .setPredictionCol("cluster")
    kmeans_model = kmeans.fit(df)
    output = kmeans_model.transform(df)
    score = evaluator.evaluate(output)
    print("k: {}, silhouette score: {}".format(k, score))

    expr_mean = [F.avg(col).alias(col + '_mean') for col in final_features]
    expr_median = [
        F.expr('percentile({}, array(0.5))'.format(col))[0].alias(col + '_median')
        for col in final_features
    ]
    df_median = output.groupBy('cluster').agg(*expr_median).toPandas()
    df_mean = output.groupBy('cluster').agg(
        F.count(F.lit(1)).alias("audience_num"), *expr_mean).toPandas()
    return output, df_median, df_mean
def q6(df):
    import pandas as pd
    from pyspark.ml.clustering import KMeans
    from pyspark.ml.feature import VectorAssembler
    from pyspark.ml.evaluation import ClusteringEvaluator

    vectors = VectorAssembler(inputCols=['start_lat', 'start_long'],
                              outputCol='features', handleInvalid='skip')
    df_ = vectors.transform(df)
    kmeans = KMeans(k=308, seed=1)
    model = kmeans.fit(df_.select('features'))
    predictions = model.transform(df_)
    centers = model.clusterCenters()
    predictions.centers = pd.Series(centers)
    # evaluator = ClusteringEvaluator()
    # silhouette = evaluator.evaluate(predictions)
    # print(f'Silhouette with squared euclidean distance = {str(silhouette)}')
    print('Cluster Centers: ')
    for center in centers:
        print(center)
    return predictions, centers
def frequency_vector_DataFrame(trainDF, cluster_count):
    regTokenizer = RegexTokenizer(inputCol="reviewText", outputCol="words", pattern="[^a-z]")
    dfTokenizer = regTokenizer.transform(trainDF)
    remover = StopWordsRemover(inputCol="words", outputCol="filtered")
    df_remover = remover.transform(dfTokenizer)

    # feature extraction using Word2Vec
    word2Vec = Word2Vec(vectorSize=3, minCount=0, inputCol="filtered", outputCol="word2vec")
    vectors = word2Vec.fit(df_remover).getVectors()
    vectors_DF = vectors.select(vectors.word, vectors.vector.alias("features"))

    # DataFrame-based k-means
    kmeans = KMeans().setK(cluster_count).setSeed(1)
    km_model = kmeans.fit(vectors_DF)

    # Broadcast operation after getting the words and predictions
    vocabDF = km_model.transform(vectors_DF).select("word", "prediction")
    vocabDict = dict(vocabDF.rdd.collect())
    vocab_dict = sc.broadcast(vocabDict)

    # Cluster vector is in RDD form
    reviewsDF = df_remover.select(df_remover.filtered, df_remover.label).rdd
    clusterVectorRdd = reviewsDF.map(partial(word_to_cluster, vocab_dict=vocab_dict))
    cluster_frequency_feature_Rdd = clusterVectorRdd.map(
        partial(cluster_frequency_vector, cluster_count=cluster_count))
    cluster_freqDF = cluster_frequency_feature_Rdd.map(lambda xy: Row(xy[0], xy[1])).toDF()
    cluster_freq_featureDF = cluster_freqDF.select(cluster_freqDF._1.alias("features"),
                                                   cluster_freqDF._2.alias("label"))
    return cluster_freq_featureDF
def calculate_WSS(kmax, training):
    sse = []
    for k in range(2, kmax):
        kmeans = KMeans().setK(k).setSeed(1)
        model = kmeans.fit(training)
        centroids = model.clusterCenters()
        transformed = model.transform(training).select("features", "prediction")
        curr_sse = 0
        train_col = training.collect()
        trans_col = transformed.collect()
        for i in range(len(train_col)):
            curr_center = centroids[trans_col[i].prediction]
            val = 0.0
            for cont_fet in range(len(train_col[i].features)):
                val += (train_col[i].features[cont_fet] - curr_center[cont_fet]) ** 2
            curr_sse += val
        sse.append(curr_sse)
    return sse
def Kmeans(self, dataframe):
    # We use the Elbow method to determine the best number of clusters.
    # K-means clustering model
    # In pyspark.ml, KMeans needs a 'features' vector column
    assembler = VectorAssembler(inputCols=['latitude', 'longitude'], outputCol="features")
    dataframe = assembler.transform(dataframe)
    cost = np.zeros(20)
    for k in range(2, 20):
        kmeans = KMeans().setK(k).setSeed(1)
        model = kmeans.fit(dataframe)
        cost[k] = model.computeCost(dataframe)
    fig, ax = plot.subplots(1, 1, figsize=(8, 6))
    ax.plot(range(2, 20), cost[2:20])
    ax.set_xlabel('Number of Clusters')
    ax.set_ylabel('Score')
    ax.set_title("Elbow curve")
    centers = model.clusterCenters()
    print("Cluster Centers: ")
    for center in centers:
        print(center)
    return centers
def kmeans(dictionary_path, filename_corpus, filename_gl, filename_label, num_of_species):
    dictionary = load_dictionary(dictionary_path)
    corpus = read_corpus(filename_corpus)
    GL = read_group(filename_gl)
    corpus_m = gensim.matutils.corpus2dense(corpus, len(dictionary.keys())).T
    SL = []
    kmer_group_dist = compute_dist(corpus_m, GL, SL, only_seed=False)
    df = pd.DataFrame(kmer_group_dist)

    spark = SparkSession.builder.appName("kmeans").getOrCreate()
    group_dist_df = spark.createDataFrame(df)
    df_columns = group_dist_df.schema.names
    vecAssembler = VectorAssembler(inputCols=df_columns, outputCol="features")
    new_df = vecAssembler.transform(group_dist_df)

    kmeans = KMeans(k=num_of_species, seed=1)  # one cluster per species
    model = kmeans.fit(new_df.select('features'))
    transformed = model.transform(new_df)
    transformed.select(["features", "prediction"]).show()

    y_pred = transformed.select("prediction").rdd.flatMap(lambda x: x).collect()
    y_kmer_grp_cl = assign_cluster_2_reads(GL, y_pred)
    labels = read_labels(filename_label)
    prec, rcal = evalQuality(labels, y_kmer_grp_cl, n_clusters=num_of_species)
    return prec, rcal, spark, y_kmer_grp_cl
def kmeans_usecase():
    spark = getSparkSession()
    schema = ''
    for i in range(65):
        schema = schema + '_c' + str(i) + ' DOUBLE' + ','
    schema = schema[:len(schema) - 1]
    df_train = spark.read.csv('../data/optdigits.tra', schema=schema)
    df_test = spark.read.csv('../data/optdigits.tes', schema=schema)

    cols = []
    for i in range(65):
        cols.append("_c" + str(i))
    df_train.head = cols
    df_test.head = cols

    assembler = VectorAssembler(inputCols=cols[:-1], outputCol="features")
    train_output = assembler.transform(df_train)
    test_output = assembler.transform(df_test)
    train_features = train_output.select("features").toDF('features')
    test_features = test_output.select("features").toDF('features')
    train_features.show(truncate=False)
    test_features.show(truncate=False)

    kmeans = KMeans().setK(10).setSeed(1)
    model = kmeans.fit(train_features)
    predictions = model.transform(test_features)
    evaluator = ClusteringEvaluator()
    silhouette = evaluator.evaluate(predictions)
    print("Silhouette with squared euclidean distance = " + str(silhouette))
def kmeans(coordinates_list, spark):
    coordinates_list = [
        [float(coordinates[0]), float(coordinates[1])]
        for coordinates in coordinates_list
    ]
    df = spark.createDataFrame(coordinates_list, ["Longitude", "Latitude"])
    vecAssembler = VectorAssembler(
        inputCols=["Longitude", "Latitude"], outputCol="features"
    )
    new_df = vecAssembler.transform(df)

    silhouettes = []
    for k in range(2, 10):
        kmeans = KMeans().setK(k).setSeed(1)
        model = kmeans.fit(new_df.select("features"))
        predictions = model.transform(new_df)
        evaluator = ClusteringEvaluator()
        silhouette = evaluator.evaluate(predictions)
        silhouettes.append([silhouette, predictions, k])

    _, predictions, k = max(silhouettes, key=lambda x: x[0])
    predictions.show()
    print(k)
    return predictions
def get_clusters(df, num_clusters, max_iterations, initialization_mode, seed):
    # Use the given data and the cluster parameters to train a K-Means model.
    # Find the cluster id corresponding to each data point (a car).
    # Return a list of lists of the titles which belong to the same cluster.
    # For example, if the output is [["Mercedes", "Audi"], ["Honda", "Hyundai"]],
    # then "Mercedes" and "Audi" should have the same cluster id, and "Honda" and
    # "Hyundai" should have the same cluster id.
    vecAssembler = VectorAssembler(inputCols=[
        "count1", "count2", "count3", "count4", "count5", "count6",
        "count7", "count8", "count9", "count10", "count11"
    ], outputCol="features")
    new_df = vecAssembler.transform(df)
    # new_df.show()
    kmeans = KMeans(k=num_clusters, seed=seed, maxIter=max_iterations,
                    initMode=initialization_mode)
    model = kmeans.fit(new_df.select('features'))
    transformed = model.transform(new_df)
    # transformed.show()
    grouped = transformed.groupby('prediction').agg(F.collect_list('id'))
    mvv = grouped.select("collect_list(id)").rdd.flatMap(lambda x: x).collect()
    return mvv
def get_uber_data():
    spark = SparkSession\
        .builder\
        .appName("Uber Dataset")\
        .getOrCreate()
    cluster_count = int(request.args.get('cluster_count'))
    dataset = spark.read.csv('uberdata.csv', inferSchema=True, header="True")
    assembler = VectorAssembler(inputCols=["Lat", "Lon"], outputCol="features")
    dataset = assembler.transform(dataset)
    (training, testdata) = dataset.randomSplit([0.7, 0.3], seed=5043)

    kmeans = KMeans().setK(cluster_count)
    model = kmeans.fit(dataset)
    transformed = model.transform(testdata).withColumnRenamed(
        "prediction", "cluster_id")
    transformed.createOrReplaceTempView("data_table")
    transformed.cache()

    centerList = list()
    cluster_centers = model.clusterCenters()
    count = 0
    for center in cluster_centers:
        centersIndList = list()
        centersIndList.append(format(center[0], '.8f'))  # Lat
        centersIndList.append(format(center[1], '.8f'))  # Lon
        centersIndList.append(count)
        centerList.append(centersIndList)
        count = count + 1
    centers = spark.createDataFrame(centerList)
    centers.createOrReplaceTempView("centers")
    resultsDFF = spark.sql(
        "SELECT centers._1 as Latitude, centers._2 as Longitude FROM data_table, centers "
        "WHERE data_table.cluster_id=centers._3"
    )
    data = resultsDFF.groupBy("Latitude", "Longitude").count()
    return jsonify(data.toJSON().collect())
def k_means_transform(book_at, k=100, load_model=True):
    '''
    input: attribute feature matrix of all books
    output: transformed matrix including cluster assignment

    This function is used to cluster all books for faster calculation for knn later
    '''
    if load_model == False:
        ### k-means clustering ###
        # Since the data is too big to do knn directly, first cluster it
        from pyspark.ml.clustering import KMeans
        kmeans = KMeans(k=k, seed=42)  # divide all books into k clusters (less computation for knn)
        model = kmeans.fit(book_at.select('features'))
        # model.save('k-means_model_001_10')
    else:
        from pyspark.ml.clustering import KMeansModel
        model = KMeansModel.load('hdfs:/user/yw2115/k-means_model_001')

    # add the cluster col to the original attribute matrix
    transformed = model.transform(book_at)
    transformed = transformed.withColumnRenamed("prediction", "cluster")
    # transformed.show(3)
    return transformed
def basic_example(spark, resources_folder):
    data = spark.read.format('libsvm').load(resources_folder + 'sample_kmeans_data.txt')
    data.printSchema()
    data.show()
    final_data = data.select(data['features'])
    kmeans = KMeans().setK(2).setSeed(1)
    model = kmeans.fit(final_data)
    print(type(model))

    # Within Set Sum of Squared Errors
    # ClusteringEvaluator
    # computeCost is deprecated and now we have the values on summary
    wssse = model.summary
    print(type(wssse))
    wssse.predictions.show()
    print("Training Costs!!!!!")
    print(wssse.trainingCost)  # this replaces model.computeCost(final_data)
    print(model.clusterCenters())

    data = spark.read.format('libsvm').load(resources_folder + 'sample_kmeans_data.txt')
    data.printSchema()
    data.show()
def train(k, path):
    # fileSave = "/home/hadoop/data_school/sparkMlib/KMeans"
    # # Male: 1, Female: 2
    # df = spark.read.format('csv').option('header', 'true').load(fileSave).fillna('0')
    df = createDataframeKMeans(path).fillna('0')
    df = df.where(df.TotalFee != '0').where(df.DiseaseCode == '13104')
    df = df.withColumn("Age", df.Age.cast(IntegerType())) \
        .withColumn("TotalFee", df.TotalFee.cast(FloatType()))
    # vecAss = VectorAssembler(inputCols=df.columns[2:], outputCol='feature')
    # data = vecAss.transform(df).select("feature")
    # data.show()
    data = df.drop("DiseaseCode")
    data.show()

    # Assemble the feature vector
    featureCreator = VectorAssembler(inputCols=data.columns[1:], outputCol='feature')
    data = featureCreator.transform(data)

    # Estimator
    kmeans = KMeans(k=k, featuresCol='feature')
    # Fit the model
    model = kmeans.fit(data)
    # Cluster the data
    test = model.transform(data)
    test.show()

    points = []
    for i in test.select("Age", "TotalFee", "prediction", "HosRegisterCode").collect():
        temp = [float(i['Age']), float(i['TotalFee']), int(i['prediction']), i['HosRegisterCode']]
        points.append(temp)
    centers = model.clusterCenters()
    model.save("/home/hadoop/PycharmProjects/SparkMlib/model/kmeans")
class KMeansTrainer:
    def __init__(self, k=100):
        self.kmeans = KMeans().setK(k).setSeed(42).setFeaturesCol('features')

    @staticmethod
    def make_features(user_master: SparkDataFrame):
        """
        This method receives the user_master table with features 1 to 6 and returns a copy
        of the dataframe with an extra column called `features`, which is a column of sparse
        vectors with the features 1 to 6 one-hot encoded.
        """
        df = user_master.select([f'feature{i}' for i in range(1, 7)] + ["user_id"])
        cols = df.columns
        categoricalColumns = [f'feature{i}' for i in range(1, 7)]
        stages = []
        for categoricalCol in categoricalColumns:
            stringIndexer = StringIndexer(inputCol=categoricalCol,
                                          outputCol=categoricalCol + 'Index')
            encoder = OneHotEncoder(inputCols=[stringIndexer.getOutputCol()],
                                    outputCols=[categoricalCol + "classVec"])
            stages += [stringIndexer, encoder]
        # label_stringIdx = StringIndexer(inputCol='item_id', outputCol='label')
        # stages += [label_stringIdx]
        assemblerInputs = [c + "classVec" for c in categoricalColumns]
        assembler = VectorAssembler(inputCols=assemblerInputs, outputCol="features")
        stages += [assembler]

        pipeline = Pipeline(stages=stages)
        pipelineModel = pipeline.fit(df)
        df = pipelineModel.transform(df)
        selectedCols = ['features'] + cols
        df = df.select(selectedCols)
        # df.printSchema()
        return df

    def silhouete_score(self, data):
        """
        Returns the silhouette score of data.
        """
        predictions = self.model.transform(data)
        evaluator = ClusteringEvaluator()
        silhouette = evaluator.evaluate(predictions)
        print(f"silhouette score: {silhouette:.4f}")
        return silhouette

    def fit(self, train, silhouette_score=False):
        """
        Returns the silhouette score of the fitted data if silhouette_score is True.
        Default False.
        """
        self.model = self.kmeans.fit(train)
        if silhouette_score:
            print("Done Fitting", end="\r")
            return self.silhouete_score(train)
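# A minimal usage sketch for KMeansTrainer (hedged): `user_master_df` is a hypothetical
# DataFrame with columns feature1..feature6 and user_id; k=50 is an arbitrary choice.
featurized = KMeansTrainer.make_features(user_master_df)
trainer = KMeansTrainer(k=50)
trainer.fit(featurized, silhouette_score=True)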
def __find_cluster_split_kmeans_sparkdf(cls, feature_col, df_norm, n_iterations, kmeans_method, sc):
    from pyspark.ml.clustering import KMeans

    start_time = time.time()

    # convert to spark df
    sqlContext = SQLContext(sc)
    spark_df = sqlContext.createDataFrame(df_norm)

    # assemble vector
    vecAssembler = VectorAssembler(inputCols=feature_col, outputCol="features")
    spark_df_clustering = vecAssembler.transform(spark_df).select('features')

    n_components_list = []
    n_range = np.arange(2, 20)
    for iteration in np.arange(n_iterations):
        cost = []
        for k in n_range:
            if kmeans_method == 'kmeans':
                print("Kmeans Elbow Method K = ", k)
                kmeans = KMeans().setK(k).setSeed(1).setFeaturesCol("features")
                model = kmeans.fit(spark_df_clustering)
            elif kmeans_method == 'bisecting_kmeans':
                print("Bisecting Kmeans Elbow Method K = ", k)
                bkm = BisectingKMeans().setK(k).setSeed(1).setFeaturesCol("features")
                model = bkm.fit(spark_df_clustering)
            cost.append(model.computeCost(spark_df_clustering))  # requires Spark 2.0 or later

        print('Cluster List: ', n_range)
        print('Within Set Sum of Squared Errors: ', cost)
        n_split_knee = cls.__knee_locator(n_range, cost, 'convex', 'decreasing', 'sum_of_square_error')
        print("Recommended no. of components by knee locator: " + str(n_split_knee))
        n_components_list.append(n_split_knee)

    n_components = int(np.median(n_components_list).round(0))
    print('Recommended median number of splits: ', n_components)
    print("elbow method time: ", time.time() - start_time, "(sec)")
    return n_components
def trainALS(self, ranks, iterations):
    for rank in ranks:
        als = ALS(rank=rank, maxIter=iterations, regParam=0.1, userCol="UserID",
                  itemCol="MovieID", ratingCol="label")
        paramGrid = ParamGridBuilder().addGrid(als.rank, [rank]).build()
        crossval = CrossValidator(estimator=als,
                                  estimatorParamMaps=paramGrid,
                                  evaluator=Remove_nan(metricName="rmse", labelCol="label",
                                                       predictionCol="prediction"),
                                  numFolds=5)
        self.trainDf.show()
        cvModel = crossval.fit(self.trainDf)
        predictions = cvModel.transform(self.testDf)
        rmse = Remove_nan(metricName="rmse", labelCol="label",
                          predictionCol="prediction").evaluate(predictions)
        print("****RMSE VALUE IS :*****", rmse)

        movieFactors = cvModel.bestModel.itemFactors.orderBy('id').cache()
        movieFactors.show(truncate=False)
        convertToVectors = udf(lambda features: Vectors.dense(features), VectorUDT())
        movieFactors = movieFactors.withColumn("features", convertToVectors(movieFactors.features))

        kmeans = KMeans(k=50, seed=1)
        kModel = kmeans.fit(movieFactors)
        kmeansDF = kModel.transform(movieFactors)
        clusters = [1, 2]
        kmeansDF = kmeansDF.join(self.movieDf, kmeansDF.id == self.movieDf.MovieID).drop('MovieID')
        for cluster in clusters:
            movieNamesDf = kmeansDF.where(col("prediction") == cluster).select("MovieName")
            movieNamesDf.rdd.map(lambda row: row[0]).saveAsTextFile(
                outputDir + "Rank" + str(rank) + "Cluster" + str(cluster))


if __name__ == "__main__":
    mr = movieRecALS(inputDir + "/MovieLens100K_train.txt",
                     inputDir + "/MovieLens100K_test.txt",
                     inputDir + "/u.item")
    ranks = [2, 4, 8, 16, 32, 64, 128, 256]
    iterations = 20
    mr.trainALS(ranks, iterations)
def test_kmeans_cosine_distance(self):
    data = [(Vectors.dense([1.0, 1.0]),), (Vectors.dense([10.0, 10.0]),),
            (Vectors.dense([1.0, 0.5]),), (Vectors.dense([10.0, 4.4]),),
            (Vectors.dense([-1.0, 1.0]),), (Vectors.dense([-100.0, 90.0]),)]
    df = self.spark.createDataFrame(data, ["features"])
    kmeans = KMeans(k=3, seed=1, distanceMeasure="cosine")
    model = kmeans.fit(df)
    result = model.transform(df).collect()
    self.assertTrue(result[0].prediction == result[1].prediction)
    self.assertTrue(result[2].prediction == result[3].prediction)
    self.assertTrue(result[4].prediction == result[5].prediction)
def clustering(input_df, input_col_name, n):
    """KMeans and PCA"""
    input_df = input_df.select('state', 'categories', 'stars', input_col_name)
    norm = Normalizer(inputCol=input_col_name, outputCol="features", p=1.0)
    df = norm.transform(input_df)

    kmeans = KMeans(k=n, seed=2)
    KMmodel = kmeans.fit(df)
    predicted = KMmodel.transform(df).cache()

    pca = PCA(k=2, inputCol='features', outputCol="pc")
    df = pca.fit(predicted).transform(predicted).cache()
    return df
def elbow(elbowset, clusters):
    wsseList = []
    for k in clusters:
        print("Training for cluster size {} ".format(k))
        kmeans = KM(k=k, seed=1)
        model = kmeans.fit(elbowset)
        transformed = model.transform(elbowset)
        featuresAndPrediction = transformed.select("features", "prediction")

        W = computeCost(featuresAndPrediction, model)
        print("......................WSSE = {} ".format(W))

        wsseList.append(W)
    return wsseList
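# elbow() above relies on an external computeCost(featuresAndPrediction, model) helper
# that is not shown here. A minimal sketch of what such a helper might look like
# (an assumption, not the original implementation): the sum of squared distances of
# each point to its assigned cluster center.
import numpy as np

def computeCost(featuresAndPrediction, model):
    centers = model.clusterCenters()
    return (featuresAndPrediction.rdd
            .map(lambda row: float(np.sum((np.array(row["features"]) - centers[row["prediction"]]) ** 2)))
            .sum())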
def test_kmeans_summary(self):
    data = [(Vectors.dense([0.0, 0.0]),), (Vectors.dense([1.0, 1.0]),),
            (Vectors.dense([9.0, 8.0]),), (Vectors.dense([8.0, 9.0]),)]
    df = self.spark.createDataFrame(data, ["features"])
    kmeans = KMeans(k=2, seed=1)
    model = kmeans.fit(df)
    self.assertTrue(model.hasSummary)
    s = model.summary
    self.assertTrue(isinstance(s.predictions, DataFrame))
    self.assertEqual(s.featuresCol, "features")
    self.assertEqual(s.predictionCol, "prediction")
    self.assertTrue(isinstance(s.cluster, DataFrame))
    self.assertEqual(len(s.clusterSizes), 2)
    self.assertEqual(s.k, 2)
    self.assertEqual(s.numIter, 1)
def test_kmean_pmml_basic(self):
    # Most of the validation is done on the Scala side; here we just check
    # that we output text rather than parquet (e.g. that the format flag
    # was respected).
    data = [(Vectors.dense([0.0, 0.0]),), (Vectors.dense([1.0, 1.0]),),
            (Vectors.dense([9.0, 8.0]),), (Vectors.dense([8.0, 9.0]),)]
    df = self.spark.createDataFrame(data, ["features"])
    kmeans = KMeans(k=2, seed=1)
    model = kmeans.fit(df)
    path = tempfile.mkdtemp()
    km_path = path + "/km-pmml"
    model.write().format("pmml").save(km_path)
    pmml_text_list = self.sc.textFile(km_path).collect()
    pmml_text = "\n".join(pmml_text_list)
    self.assertIn("Apache Spark", pmml_text)
    self.assertIn("PMML", pmml_text)
def kmeans(inputdir, df, alg, k):
    from pyspark.ml.clustering import KMeans
    from numpy import array
    from math import sqrt

    kmeans = KMeans(k=int(k), seed=1, initSteps=5, tol=1e-4, maxIter=20,
                    initMode="k-means||", featuresCol="features")
    model = kmeans.fit(df)
    kmFeatures = model.transform(df).select("labels", "prediction")
    erFeatures = model.transform(df).select("features", "prediction")

    ### Evaluation
    rows = erFeatures.collect()
    WSSSE = 0
    for i in rows:
        WSSSE += sqrt(sum([x ** 2 for x in (model.clusterCenters()[i[1]] - i[0])]))
    print("Within Set Sum of Squared Error = " + str(WSSSE))

    output_data = writeOutClu(inputdir, kmFeatures, alg, k, WSSSE)
    return output_data
def cluster():
    ld = load(open(DATAP + '\\temp\\olangdict.json', 'r', encoding='UTF-8'))

    spark = SparkSession.builder \
        .master("local") \
        .appName("Word Count") \
        .config("spark.some.config.option", "some-value") \
        .getOrCreate()

    df = spark.createDataFrame([["0"], ["1"], ["2"], ["3"], ["4"]], ["id"])
    df.show()
    # NOTE: the assembler below expects numeric 'feat1' and 'feat2' columns to be present on df.
    vecAssembler = VectorAssembler(inputCols=["feat1", "feat2"], outputCol="features")
    new_df = vecAssembler.transform(df)
    kmeans = KMeans(k=2, seed=1)  # 2 clusters here
    model = kmeans.fit(new_df.select('features'))
    transformed = model.transform(new_df)
    print(transformed.show())
t0 = time.time()
word2Vec = Word2Vec(vectorSize=100, minCount=5, stepSize=0.025, inputCol="text", outputCol="result")
modelW2V = word2Vec.fit(twDF)
wordVectorsDF = modelW2V.getVectors()
timeW2V = time.time() - t0

## Train K-means on top of the Word2Vec matrix:
t0 = time.time()
vocabSize = wordVectorsDF.count()
K = int(math.floor(math.sqrt(float(vocabSize) / 2)))
# K ~ sqrt(n/2) is a rule of thumb for choosing K,
# where n is the number of words in the model;
# feel free to choose K with a fancier algorithm
dfW2V = wordVectorsDF.select('vector').withColumnRenamed('vector', 'features')
kmeans = KMeans(k=K, seed=1)
modelK = kmeans.fit(dfW2V)
labelsDF = modelK.transform(dfW2V).select('prediction').withColumnRenamed('prediction', 'labels')
vocabSize = wordVectorsDF.count()
timeKmeans = time.time() - t0

sc.stop()

## Print Some Results
printResults = 1  # set to 1 to print the results below
if (printResults):
    ## Read Tweets
    print("=" * 80)
    print("Read Tweets...")
    print("Elapsed time (seconds) to read tweets as a data frame: ", timeReadTweets)
from pyspark.ml.linalg import Vectors
from pyspark.ml.clustering import KMeans
from pyspark import SparkContext
from pyspark.sql import SQLContext

# sc = SparkContext(appName="test")
# sqlContext = SQLContext(sc)

data = [(Vectors.dense([0.0, 0.0]),), (Vectors.dense([1.0, 1.0]),),
        (Vectors.dense([9.0, 8.0]),), (Vectors.dense([8.0, 9.0]),)]
df = sqlContext.createDataFrame(data, ["features"])
kmeans = KMeans(k=2, seed=1)
model = kmeans.fit(df)
centers = model.clusterCenters()
model.transform(df).select("features", "prediction").collect()
.option("header", "true") .option("inferSchema", "true") .load("/data/retail-data/by-day/*.csv") .limit(50) .coalesce(1) .where("Description IS NOT NULL")) sales.cache() # COMMAND ---------- from pyspark.ml.clustering import KMeans km = KMeans().setK(5) print km.explainParams() kmModel = km.fit(sales) # COMMAND ---------- summary = kmModel.summary print summary.clusterSizes # number of points kmModel.computeCost(sales) centers = kmModel.clusterCenters() print("Cluster Centers: ") for center in centers: print(center) # COMMAND ----------
def assign_cluster(data):
    """Train kmeans on rescaled data and then label the rescaled data."""
    kmeans = KMeans(k=2, seed=1, featuresCol="features_scaled", predictionCol="label")
    model = kmeans.fit(data)
    label_df = model.transform(data)
    return label_df
        newdf = onehotenc.transform(newdf).drop(c)
        newdf = newdf.withColumnRenamed(c + "-onehot", c)
    return newdf

dfhot = oneHotEncodeColumns(dfnumeric, ["Take-out", "GoodFor_lunch", "GoodFor_dinner", "GoodFor_breakfast"])
dfhot.show(5)

# Training set
assembler = VectorAssembler(inputCols=list(set(dfhot.columns) | set(['stars', 'review_count'])),
                            outputCol="features")
train = assembler.transform(dfhot)

# KMeans set for 5 clusters
knum = 5
kmeans = KMeans(featuresCol=assembler.getOutputCol(), predictionCol="cluster", k=knum, seed=0)
model = kmeans.fit(train)
print("Model Created!")

# See cluster centers:
centers = model.clusterCenters()
print("Cluster Centers: ")
for center in centers:
    print(center)

# Apply the clustering model to our data:
prediction = model.transform(train)
prediction.groupBy("cluster").count().orderBy("cluster").show()

# Look at the features of each cluster
customerCluster = {}
for i in range(0, knum):
df0 = tfs.analyze(df).cache()
mllib_df.count()
df0.count()

np.random.seed(2)
init_centers = np.random.randn(k, num_features)
start_centers = init_centers
dataframe = df0

ta_0 = time.time()
kmeans = KMeans().setK(k).setSeed(1).setFeaturesCol(FEATURES_COL).setInitMode(
    "random").setMaxIter(num_iters)
mod = kmeans.fit(mllib_df)
ta_1 = time.time()

tb_0 = time.time()
(centers, agg_distances) = kmeanstf(df0, init_centers, num_iters=num_iters, tf_aggregate=False)
tb_1 = time.time()

tc_0 = time.time()
(centers, agg_distances) = kmeanstf(df0, init_centers, num_iters=num_iters, tf_aggregate=True)
tc_1 = time.time()

mllib_dt = ta_1 - ta_0
tf_dt = tb_1 - tb_0
tf2_dt = tc_1 - tc_0
print("mllib:", mllib_dt, "tf+spark:", tf_dt, "tf:", tf2_dt)
# COMMAND ----------

display(transformed)

# COMMAND ----------

# MAGIC %md
# MAGIC #### K-Means Visualized

# COMMAND ----------

modelCenters = []
iterations = [0, 2, 4, 7, 10, 20]
for i in iterations:
    kmeans = KMeans(k=3, seed=5, maxIter=i, initSteps=1)
    model = kmeans.fit(irisTwoFeatures)
    modelCenters.append(model.clusterCenters())

# COMMAND ----------

print('modelCenters:')
for centroids in modelCenters:
    print(centroids)

# COMMAND ----------

import matplotlib.pyplot as plt
import matplotlib.cm as cm
import numpy as np

def prepareSubplot(xticks, yticks, figsize=(10.5, 6), hideLabels=False, gridColor='#999999',
from pyspark.sql import SparkSession
from pyspark.ml.clustering import KMeans
from pyspark.ml.evaluation import ClusteringEvaluator

if __name__ == "__main__":
    spark = SparkSession\
        .builder\
        .appName("KMeansExample")\
        .getOrCreate()

    # $example on$
    # Loads data.
    dataset = spark.read.format("libsvm").load("data/mllib/sample_kmeans_data.txt")

    # Trains a k-means model.
    kmeans = KMeans().setK(2).setSeed(1)
    model = kmeans.fit(dataset)

    # Make predictions
    predictions = model.transform(dataset)

    # Evaluate clustering by computing Silhouette score
    evaluator = ClusteringEvaluator()
    silhouette = evaluator.evaluate(predictions)
    print("Silhouette with squared euclidean distance = " + str(silhouette))

    # Shows the result.
    centers = model.clusterCenters()
    print("Cluster Centers: ")
    for center in centers:
        print(center)
# Place the means and std.dev values in a broadcast variable
bcMeans = sc.broadcast(colMeans)
bcStdDev = sc.broadcast(colStdDev)

csAuto = autoVector.map(centerAndScale)
# csAuto.collect()
# csAuto.foreach(println)
print(csAuto)

# Create Spark Data Frame
autoRows = csAuto.map(lambda f: Row(features=f))
autoDf = sqlContext.createDataFrame(autoRows)  # assumes an existing SQLContext instance named sqlContext
autoDf.select("features").show(10)

kmeans = KMeans(k=3, seed=1)
model = kmeans.fit(autoDf)
predictions = model.transform(autoDf)
predictions.collect()
predictions.foreach(print)

# Plot the results in a scatter plot
unstripped = predictions.rdd.map(unstripData)
predList = unstripped.collect()
predPd = pd.DataFrame(predList)

# preparing to save the clustered data
list_current_gni_final_maped = current_gni_final_maped.collect()
list_current_gni_rdd = current_gni_rdd.collect()
list_predictions_pandas = predictions.toPandas()
list_predictions_temp = list_predictions_pandas.as_matrix()
# COMMAND ----------

transformedTraining = fittedPipeline.transform(trainDataFrame)


# COMMAND ----------

from pyspark.ml.clustering import KMeans
kmeans = KMeans()\
  .setK(20)\
  .setSeed(1)


# COMMAND ----------

kmModel = kmeans.fit(transformedTraining)


# COMMAND ----------

transformedTest = fittedPipeline.transform(testDataFrame)


# COMMAND ----------

from pyspark.sql import Row

spark.sparkContext.parallelize([Row(1), Row(2), Row(3)]).toDF()


# COMMAND ----------
initMode="k-means||", maxIter=20) type(firstMlKMeans) # `pyspark.ml` paketo modelių klasės turi `explainParams` metodą, kuruo išvedami modelio parametrų paaiškinimai. # In[63]: print(firstMlKMeans.explainParams()) # Apmokykime modelį. # In[64]: firstMlModel = firstMlKMeans.fit(ca1mlFeaturizedDF) type(firstMlModel) # In[65]: firstMlModel.clusterCenters() # Sudarome `Pipeline` žingsnių seką iš `vecAssembler` ir `kmeans` komponentų. # In[66]: from pyspark.ml.pipeline import Pipeline firstPipeline = Pipeline(stages=[vecAssembler, firstMlKMeans])