def create_kmeans_dendogram(input_csv, num_clusters):
    """Cluster the price columns of a hotels CSV with bisecting k-means
    and display the resulting dendrogram.

    The first CSV column is treated as an identifier and excluded from
    the feature set; ``num_clusters`` is capped at the feature count.
    """
    spark = SparkSession.builder.appName(
        'HotelsPriceDataGeneratorSession').getOrCreate()
    # Lazy op - load the CSV; note a fully-null frame results if the file
    # does not fit the expected schema.
    print('Reading CSV from ' + input_csv)
    generated_hotels_df = spark.read.csv(input_csv, header=True, inferSchema=True)
    # Never ask for more clusters than there are feature columns.
    num_clusters = min(num_clusters, len(generated_hotels_df.columns[1:]))
    # Pack every column but the first into a single features vector.
    assembler = VectorAssembler(inputCols=generated_hotels_df.columns[1:],
                                outputCol="features")
    vector_df = assembler.transform(generated_hotels_df)
    # Hierarchical clusters via bisecting k-means (fixed seed for repeatability).
    model = BisectingKMeans().setK(num_clusters).setSeed(42).fit(vector_df)
    # Link the cluster centres to expose inter-cluster relations.
    z = hc.linkage(model.clusterCenters(), method='average', metric='correlation')
    # Plot the dendrogram.
    hc.dendrogram(z)
    plt.show()
def __find_cluster_split_kmeans_sparkdf(cls, feature_col, df_norm, n_iterations, kmeans_method, sc):
    """Pick a recommended cluster count via the elbow/knee method.

    For each of ``n_iterations`` passes, fits the chosen Spark clusterer for
    k = 2..19, records the within-set sum of squared errors, locates the knee,
    and finally returns the median knee over all passes.

    :param feature_col: list of feature column names in ``df_norm``.
    :param df_norm: pandas DataFrame of normalised features.
    :param n_iterations: how many elbow passes to run.
    :param kmeans_method: 'kmeans' or 'bisecting_kmeans'.
    :param sc: active SparkContext.
    :return: recommended number of splits (int).
    :raises ValueError: if ``kmeans_method`` is not recognised.
    """
    from pyspark.ml.clustering import KMeans
    start_time = time.time()
    # BUGFIX: previously an unknown method fell through both branches and
    # left `model` unbound, raising a confusing NameError on computeCost.
    if kmeans_method not in ('kmeans', 'bisecting_kmeans'):
        raise ValueError("unknown kmeans_method: " + str(kmeans_method))
    # Convert to a Spark dataframe.
    sqlContext = SQLContext(sc)
    spark_df = sqlContext.createDataFrame(df_norm)
    # Assemble the feature vector column.
    vecAssembler = VectorAssembler(inputCols=feature_col, outputCol="features")
    spark_df_clustering = vecAssembler.transform(spark_df).select('features')
    n_components_list = []
    n_range = np.arange(2, 20)
    for iteration in np.arange(n_iterations):
        cost = []
        for k in n_range:
            if kmeans_method == 'kmeans':
                print("Kmeans Elbow Method K = ", k)
                kmeans = KMeans().setK(k).setSeed(1).setFeaturesCol("features")
                model = kmeans.fit(spark_df_clustering)
            else:  # bisecting_kmeans
                print("Bisecting Kmeans Elbow Method K = ", k)
                bkm = BisectingKMeans().setK(k).setSeed(1).setFeaturesCol("features")
                model = bkm.fit(spark_df_clustering)
            cost.append(model.computeCost(spark_df_clustering))  # requires Spark 2.0 or later
        print('Cluster List: ', n_range)
        print('Within Set Sum of Squared Errors: ', cost)
        n_split_knee = cls.__knee_locator(n_range, cost, 'convex', 'decreasing',
                                          'sum_of_square_error')
        print("Recommended no. of components by knee locator: " + str(n_split_knee))
        n_components_list.append(n_split_knee)
    # Median across passes smooths out seed-to-seed knee variation.
    n_components = int(np.median(n_components_list).round(0))
    print('Recommended median number of splits: ', n_components)
    print("elbow method time: ", time.time() - start_time, "(sec)")
    return n_components
def getTopClusters(startDate, endDate, startTime, endTime, category):
    """Return the 10 bisecting-k-means cluster centres for the filtered
    points, each paired with the number of points assigned to it.

    Output is a list of dicts: {"c": (x, y), "o": count}.
    """
    filteredDF = applyFilter(startDate, endDate, startTime, endTime, category).cache()
    # Pack the X/Y coordinates into a feature vector.
    assembler = VectorAssembler()
    assembler.setInputCols(["X", "Y"])
    assembler.setOutputCol("features")
    pointsDF = assembler.transform(filteredDF).cache()
    # Hierarchical k-means with a fixed seed and 10 clusters.
    model = BisectingKMeans().setK(10).setSeed(7).setMaxIter(7).fit(pointsDF)
    # Count points per cluster index: RDD of (clusterIndex, size).
    sizes = (model.transform(pointsDF)
             .select("prediction").rdd
             .map(lambda r: (r["prediction"], 1))
             .reduceByKey(lambda a, b: a + b)
             .collect())
    centers = model.clusterCenters()
    return [{"c": (centers[index][0], centers[index][1]), "o": count}
            for index, count in sizes]
def get_clusters(self, parameters: dict, urls_and_vectors: DataFrame) -> DataFrame:
    """Assign each URL vector to a bisecting-k-means cluster.

    :param parameters: dict with 'k' and 'distance_measure' keys.
    :param urls_and_vectors: frame with a 'vector' feature column.
    :return: input frame with an added 'cluster_id' prediction column.
    """
    urls_and_vectors = urls_and_vectors.cache()
    model = (BisectingKMeans()
             .setK(parameters['k'])
             .setDistanceMeasure(parameters['distance_measure'])
             .setFeaturesCol("vector")
             .setPredictionCol("cluster_id")
             .fit(urls_and_vectors))
    clustered = model.transform(urls_and_vectors)
    # Release the cached input once both fit and transform are done.
    urls_and_vectors.unpersist()
    return clustered
def clustering(df_kmeans, n):
    """Fit bisecting k-means with ``n`` clusters on ``df_kmeans`` and
    print the resulting cluster centres."""
    estimator = BisectingKMeans().setK(n).setSeed(1).setFeaturesCol("features")
    print('kmeans ', estimator)
    fitted = estimator.fit(df_kmeans)
    print("Cluster Centers: ")
    for centre in fitted.clusterCenters():
        print(centre)
def bisecting_k_means(self, k):
    """Fit bisecting k-means with ``k`` clusters on self.df, show the
    per-cluster point counts, and print the centres."""
    print('\nBisecting K-Means - ' + str(k))
    fitted = BisectingKMeans().setK(k).setSeed(1).fit(self.df.select('features'))
    # Show how many rows landed in each cluster.
    fitted.transform(self.df).groupBy("prediction").count().show()
    self.print_centers(fitted.clusterCenters())
def bisect_model(data):
    """Fit a 2-way bisecting k-means on ``data``, log the WSSSE and the
    cluster centres, and return the frame with predictions attached.

    :param data: DataFrame with a default 'features' column.
    :return: ``data`` with a 'prediction' column added.
    """
    # TODO grid search best parameters
    bkm = BisectingKMeans().setK(2).setSeed(1)
    model = bkm.fit(data)
    cost = model.computeCost(data)
    # BUGFIX: was `prilog.infont(...)` — a garbled `log.info` that raised
    # NameError at runtime.
    log.info("Within Set Sum of Squared Errors = " + str(cost))
    log.info("Cluster Centers: ")
    centers = model.clusterCenters()
    for center in centers:
        log.info(center)
    predictions_bi = model.transform(data)
    return predictions_bi
def model_list():
    """Fit bisecting k-means for k = 2..19 on the module-level ``df1``
    and collect the WSSSE per k.

    :return: list of dicts ``{k: WSSSE, 'model': fitted_model}``.
    """
    clist = []
    df2 = df1.select('features')
    # BUGFIX: cache/unpersist are methods; the original wrote `df2.cache`
    # (no call), so nothing was ever cached or released.
    df2.cache()
    df1.cache()
    for i in range(2, 20):
        kmeans = BisectingKMeans(k=i, minDivisibleClusterSize=1.0)
        model = kmeans.fit(df2)
        WSSSE = model.computeCost(df1)
        # print("Within Set Sum of Squared Error, k = " + str(i) + ": " + str(WSSSE))
        clist.append({i: WSSSE, 'model': model})
    df1.unpersist()
    df2.unpersist()
    return clist
def test_bisecting_kmeans_summary(self):
    """A fitted BisectingKMeans model exposes a complete training summary."""
    rows = [(Vectors.dense(1.0), ),
            (Vectors.dense(5.0), ),
            (Vectors.dense(10.0), ),
            (Vectors.sparse(1, [], []), )]
    df = self.spark.createDataFrame(rows, ["features"])
    model = BisectingKMeans(k=2).fit(df)
    self.assertTrue(model.hasSummary)
    summary = model.summary
    # The summary mirrors the estimator's column configuration ...
    self.assertTrue(isinstance(summary.predictions, DataFrame))
    self.assertEqual(summary.featuresCol, "features")
    self.assertEqual(summary.predictionCol, "prediction")
    # ... and its clustering results.
    self.assertTrue(isinstance(summary.cluster, DataFrame))
    self.assertEqual(len(summary.clusterSizes), 2)
    self.assertEqual(summary.k, 2)
    self.assertEqual(summary.numIter, 20)
def __bisecting_k_mean(cls, k_clusters, xnorm, feature_col, sc):
    """Cluster normalised rows into ``k_clusters`` groups with Spark
    bisecting k-means and return the per-row labels as a plain list.

    :param k_clusters: cluster count (typically the elbow point).
    :param xnorm: 2-D array of normalised feature values.
    :param feature_col: column names matching xnorm's columns.
    :param sc: active SparkContext.
    """
    # k_clusters = elbow point
    start_time = time.time()
    # Pandas frame -> Spark frame.
    pandas_frame = pd.DataFrame(data=xnorm, columns=feature_col)
    spark_frame = SQLContext(sc).createDataFrame(pandas_frame)
    # Pack the feature columns into one vector column.
    assembler = VectorAssembler(inputCols=feature_col, outputCol="features")
    features_only = assembler.transform(spark_frame).select('features')
    model = (BisectingKMeans()
             .setK(k_clusters)
             .setSeed(1)
             .setFeaturesCol("features")
             .fit(features_only))
    predicted_rows = model.transform(features_only).select('prediction').collect()
    return [row.prediction for row in predicted_rows]
def test_bisecting_kmeans_summary(self):
    """Training summary of BisectingKMeans carries predictions, column
    names, cluster assignments, sizes, k and iteration count."""
    points = [(Vectors.dense(1.0),),
              (Vectors.dense(5.0),),
              (Vectors.dense(10.0),),
              (Vectors.sparse(1, [], []),)]
    frame = self.spark.createDataFrame(points, ["features"])
    fitted = BisectingKMeans(k=2).fit(frame)
    self.assertTrue(fitted.hasSummary)
    s = fitted.summary
    self.assertTrue(isinstance(s.predictions, DataFrame))
    self.assertEqual(s.featuresCol, "features")
    self.assertEqual(s.predictionCol, "prediction")
    self.assertTrue(isinstance(s.cluster, DataFrame))
    self.assertEqual(len(s.clusterSizes), 2)
    self.assertEqual(s.k, 2)
    self.assertEqual(s.numIter, 20)
def main(sc, spark):
    """Vectorise the corpus with Word2Vec, cluster with bisecting k-means,
    and print a per-cluster summary of sizes and representative terms."""
    corpus = load_corpus(sc, spark)
    # Tokenise -> embed -> cluster, as one pipeline.
    pipeline = Pipeline(stages=[
        Tokenizer(inputCol="text", outputCol="tokens"),
        Word2Vec(vectorSize=7, minCount=0, inputCol="tokens", outputCol="vecs"),
        BisectingKMeans(k=10, featuresCol="vecs", maxIter=10),
    ])
    model = pipeline.fit(corpus)
    corpus = model.transform(corpus)
    # Clustering diagnostics from the final pipeline stage.
    bkm = model.stages[-1]
    cost = bkm.computeCost(corpus)
    sizes = bkm.summary.clusterSizes
    # TODO: compute cost of each cluster individually
    # Describe each cluster by the words closest to its centre.
    wvec = model.stages[-2]
    table = [["Cluster", "Size", "Terms"]]
    for index, center in enumerate(bkm.clusterCenters()):
        synonyms = wvec.findSynonyms(center, 7)
        terms = " ".join([row.word for row in synonyms.take(7)])
        table.append([index, sizes[index], terms])
    # Print results.
    print(tabulate(table))
    print("Sum of square distance to center: {:0.3f}".format(cost))
def main():
    """CLI entry point: parse arguments, train the selected clustering
    algorithm on the preprocessed data, and write the evaluation outputs."""
    parser = argparse.ArgumentParser(description='Clustering with pyspark.')
    parser.add_argument('--data-file', type=str, default='enwiki.json')
    parser.add_argument('--num-clusters', type=int, default=4)
    parser.add_argument('--seed', type=int, default=23)
    parser.add_argument('--algorithm', default='kmeans',
                        choices=['kmeans', 'hier', 'gmm'])
    parser.add_argument('--output-groundtruth', type=str, default='groundtruth.csv')
    parser.add_argument('--output-cluster', type=str, default='cluster.csv')
    args = parser.parse_args()
    spark_session = SparkSession.builder.appName('clustering').getOrCreate()
    data = preprocess(spark_session, args.data_file)
    # Dispatch table; argparse's `choices` guarantees the key exists.
    estimators = {
        'kmeans': KMeans,
        'hier': BisectingKMeans,
        'gmm': GaussianMixture,
    }
    alg = estimators[args.algorithm]()
    model = train(alg, data, args.num_clusters, seed=args.seed)
    evaluate(data, model, args.algorithm, args.num_clusters,
             args.output_groundtruth, args.output_cluster)
def search_opt_k(df_kmeans):
    """Elbow-method scan: fit bisecting k-means for k = 2..19 on a 10%
    sample, score the full frame, and plot cost against k."""
    df_kmeans.show()
    # Find a good k with the elbow method.
    cost = np.zeros(20)
    for k in range(2, 20):
        estimator = BisectingKMeans().setK(k).setSeed(1).setFeaturesCol("features")
        print('kmeans ', estimator)
        # Fit on a fixed 10% sample to keep each fit cheap.
        model = estimator.fit(df_kmeans.sample(False, 0.1, seed=42))
        cost[k] = model.computeCost(df_kmeans)  # requires Spark 2.0 or later
    # print(cost)
    # Visualise the elbow.
    fig, ax = plt.subplots(1, 1, figsize=(8, 6))
    ax.plot(range(2, 20), cost[2:20])
    ax.set_xlabel('k')
    ax.set_ylabel('cost')
    plt.show()
def train(df, hiperparameter):
    """Fit a BisectingKMeans model configured from a hyper-parameter dict.

    input:
    - df: training DataFrame
    - hiperparameter: dict holding featuresCol, predictionCol, maxIter,
      seed, k and minDivisibleClusterSize
    return: fitted BisectingKMeans model
    """
    estimator = BisectingKMeans(
        featuresCol=hiperparameter['featuresCol'],
        predictionCol=hiperparameter['predictionCol'],
        maxIter=hiperparameter['maxIter'],
        seed=hiperparameter['seed'],
        k=hiperparameter['k'],
        minDivisibleClusterSize=hiperparameter['minDivisibleClusterSize'])
    return estimator.fit(df)
def main(argv):
    # Entry point: read video statistics from MySQL over JDBC, cluster them
    # with bisecting k-means (k=8), and show each video's cluster assignment.
    spark = SparkSession.builder \
        .appName('VIDEO_CLUSTERING') \
        .master('spark://{}:{}'.format(SPARK_MASTER_ADDR, SPARK_MASTER_PORT)) \
        .getOrCreate()
    # NOTE(review): driver/executor memory and cores set via spark.conf AFTER
    # the session exists likely have no effect -- confirm these are applied
    # through spark-submit/session config instead.
    spark.conf.set('spark.sql.execution.arrow.enabled', 'true')
    spark.conf.set('spark.driver.maxResultSize', '0')
    spark.conf.set('spark.driver.cores', '4')
    spark.conf.set('spark.driver.memory', '4g')
    spark.conf.set('spark.executor.memory', '4g')
    spark.conf.set('spark.executor.cores', '4')
    # Presumably selects which video type to process -- see handle_params.
    video_type_code = handle_params(argv)
    # Load the full statistics table over JDBC.
    video_df = spark.read.format('jdbc')\
        .option('url', 'jdbc:mysql://192.168.174.133:3306/big_data')\
        .option('driver', 'com.mysql.cj.jdbc.Driver')\
        .option('dbtable', 'VIDEO_STATISTIC')\
        .option('user', 'root').option('password', 'root').load()
    # Pack the four engagement metrics into one 'features' vector column.
    assembler = VectorAssembler()\
        .setInputCols(['play_count', 'favorite_count', 'comment_count', 'barrage_count'])\
        .setOutputCol('features')
    # Train on only the first 1000 rows to keep the fit cheap.
    video_vector = assembler.transform(video_df.select(
        'play_count', 'favorite_count', 'comment_count', 'barrage_count'
    ).limit(1000))
    bkm = BisectingKMeans(k=8, minDivisibleClusterSize=1.0)
    model = bkm.fit(video_vector)
    centers = model.clusterCenters()
    # Score the WHOLE table with the model trained on the sample.
    video_vector = assembler.transform(video_df.select(
        'play_count', 'favorite_count', 'comment_count', 'barrage_count'
    ))
    transformed = model.transform(video_vector).select('features', 'prediction')
    transformed.show()
def bisecting_kmeans(features, num_clusters):
    """Does clustering on the features dataset using Bisecting KMeans clustering.

    Params:
    - features (pyspark.sql.DataFrame): The data frame containing the features
      to be used for clustering
    - num_clusters (int): The number of clusters to be used

    Returns:
    - clustered (pyspark.sql.DataFrame): The data frame, with the predicted
      clusters in a 'cluster' column
    """
    model = BisectingKMeans(k=num_clusters, featuresCol='features',
                            predictionCol='cluster').fit(features)
    clustered = model.transform(features)
    clustered.show()
    # Summarise the fit quality and cluster balance.
    print("=====Clustering Results=====")
    print("Clustering cost = ", model.computeCost(features))
    print("Cluster sizes = ", model.summary.clusterSizes)
    return clustered
def bisecting_k_means():
    # Minimal bisecting k-means walkthrough on four 2-D points.
    spark = SparkSession \
        .builder \
        .appName("Python Spark SQL basic example") \
        .config("spark.some.config.option", "some-value") \
        .getOrCreate()
    # Two natural clusters: (0,0)/(1,1) and (9,8)/(8,9).
    data = [(Vectors.dense([0.0, 0.0]), ),
            (Vectors.dense([1.0, 1.0]), ),
            (Vectors.dense([9.0, 8.0]), ),
            (Vectors.dense([8.0, 9.0]), )]
    df = spark.createDataFrame(data, ["features"])
    bkm = BisectingKMeans(k=2, minDivisibleClusterSize=1.0)
    model = bkm.fit(df)
    centers = model.clusterCenters()
    # The bare expressions below are REPL-style probes; when run as a
    # script their results are computed and discarded.
    len(centers)
    model.computeCost(df)
    model.hasSummary
    summary = model.summary
    summary.k
    summary.clusterSizes
    # Predict: rows in the same cluster share a prediction value.
    transformed = model.transform(df).select("features", "prediction")
    rows = transformed.collect()
    rows[0].prediction == rows[1].prediction
    rows[2].prediction == rows[3].prediction
def main(args):
    """Train a 2-cluster bisecting k-means on a LIBSVM dataset and score
    the held-out split; records elapsed time via appendTime.

    args[1] = application name, args[2] = master URL, args[3] = data path.
    """
    spark = SparkSession\
        .builder\
        .master(args[2])\
        .appName(args[1])\
        .getOrCreate()
    start_computing_time = time.time()
    # Load the data stored in LIBSVM format as a DataFrame.
    data = spark.read.format("libsvm").load(args[3])
    trainingData, testData = data.randomSplit([0.7, 0.3], seed=1234)
    # Fit the bisecting k-means on the training split only.
    model = BisectingKMeans().setK(2).setSeed(1).fit(trainingData)
    # Make predictions on the held-out split.
    predictions = model.transform(testData)
    appendTime(sys.argv, start_computing_time)
    spark.stop()
def compute_clusters(addons_df, num_clusters, random_seed):
    """Performs user clustering by using add-on ids as features.

    :param addons_df: frame with 'client_id' and 'addon_ids' columns.
    :param num_clusters: number of bisecting-k-means clusters.
    :param random_seed: optional integer seed (None for Spark's default).
    :return: frame of (client_id, prediction) pairs.
    """
    # Build the stages of the pipeline. We need hashing to make the next
    # steps work.
    hashing_stage = HashingTF(inputCol="addon_ids", outputCol="hashed_features")
    idf_stage = IDF(inputCol="hashed_features", outputCol="features", minDocFreq=1)
    # As a future improvement, we may add a sane value for the minimum cluster size
    # to BisectingKMeans (e.g. minDivisibleClusterSize). For now, just make sure
    # to pass along the random seed if needed for tests.
    # BUGFIX: `if random_seed` silently dropped a legitimate seed of 0;
    # compare against None so every integer seed is honoured.
    kmeans_kwargs = {"seed": random_seed} if random_seed is not None else {}
    bkmeans_stage = BisectingKMeans(k=num_clusters, **kmeans_kwargs)
    pipeline = Pipeline(stages=[hashing_stage, idf_stage, bkmeans_stage])

    # Run the pipeline and compute the results.
    model = pipeline.fit(addons_df)
    return model.transform(addons_df).select(["client_id", "prediction"])
def return_correct_clustering_algorithm(_type, _cluster_number, _max_iter):
    """
    This method returns an instance of the clustering algorithm selected by
    the user.

    :param _type: the name of the algorithm we want to use
        ("kmeans" or "b-kmeans").
    :param _cluster_number: the number of clusters (defaults to 10 when falsy).
    :param _max_iter: the maximum number of iterations (defaults to 20 when falsy).
    :return: a configured estimator.
    :raises ValueError: if _type is not a valid algorithm name.
    """
    cluster_number = int(_cluster_number) if _cluster_number else 10
    max_iter = int(_max_iter) if _max_iter else 20
    if _type == "kmeans":
        return KMeans().setK(cluster_number).setMaxIter(max_iter).setSeed(1)
    elif _type == "b-kmeans":
        return BisectingKMeans().setK(cluster_number).setMaxIter(
            max_iter).setSeed(1)
    # ValueError is more precise than a bare Exception and stays
    # backward-compatible for callers catching Exception.
    raise ValueError(
        "The clustering algorithm requested {} is not available".format(
            _type))
cluster_centers = kmeans_model.clusterCenters()
print(cluster_centers)

# COMMAND ----------

# MAGIC %md #####Hierarchial Clustering via Bisecting K-means

# COMMAND ----------

from pyspark.ml.clustering import BisectingKMeans
from pyspark.ml.evaluation import ClusteringEvaluator

retail_features = spark.read.table("retail_features")
train_df = retail_features.selectExpr("selected_features as features")

bkmeans = BisectingKMeans(k=3, featuresCol='features')
# BUGFIX: was `kmeans.fit(train_df)`, which refitted the earlier plain
# k-means estimator; the bisecting model was never actually trained.
bkmeans_model = bkmeans.fit(train_df)
predictions = bkmeans_model.transform(train_df)

evaluator = ClusteringEvaluator()
silhouette = evaluator.evaluate(predictions)
print("Silhouette measure using squared euclidean distance = " + str(silhouette))

# BUGFIX: report the bisecting model's centres, not the earlier k-means ones.
cluster_centers = bkmeans_model.clusterCenters()
print(cluster_centers)

# COMMAND ----------
bin/spark-submit examples/src/main/python/ml/bisecting_k_means_example.py
"""

if __name__ == "__main__":
    # One SparkSession per example run.
    spark = SparkSession\
        .builder\
        .appName("PythonBisectingKMeansExample")\
        .getOrCreate()

    # $example on$
    # Loads data.
    dataset = spark.read.format("libsvm").load(
        "data/mllib/sample_kmeans_data.txt")

    # Trains a bisecting k-means model.
    bkm = BisectingKMeans().setK(2).setSeed(1)
    model = bkm.fit(dataset)

    # Evaluate clustering: within-set sum of squared errors over the
    # same data the model was trained on.
    cost = model.computeCost(dataset)
    print("Within Set Sum of Squared Errors = " + str(cost))

    # Shows the result.
    print("Cluster Centers: ")
    centers = model.clusterCenters()
    for center in centers:
        print(center)
    # $example off$

    spark.stop()
"""
A simple example demonstrating a bisecting k-means clustering.
"""

if __name__ == "__main__":
    # Pre-Spark-2.0 style entry point: SparkContext + SQLContext.
    sc = SparkContext(appName="PythonBisectingKMeansExample")
    sqlContext = SQLContext(sc)

    # $example on$
    # Parse each whitespace-separated line into a dense feature vector.
    data = sc.textFile("data/mllib/kmeans_data.txt")
    parsed = data.map(lambda l: Row(features=Vectors.dense([float(x) for x in l.split(' ')])))
    training = sqlContext.createDataFrame(parsed)

    kmeans = BisectingKMeans().setK(2).setSeed(1).setFeaturesCol("features")
    model = kmeans.fit(training)

    # Evaluate clustering
    cost = model.computeCost(training)
    print("Bisecting K-means Cost = " + str(cost))

    centers = model.clusterCenters()
    print("Cluster Centers: ")
    for center in centers:
        print(center)
    # $example off$

    sc.stop()
from pyspark.ml.clustering import BisectingKMeans
# $example off$
from pyspark.sql import SparkSession

if __name__ == "__main__":
    # Build (or reuse) the session for this example run.
    spark = (SparkSession
             .builder
             .appName("BisectingKMeansExample")
             .getOrCreate())

    # $example on$
    # Loads data.
    dataset = spark.read.format("libsvm").load("data/mllib/sample_kmeans_data.txt")

    # Trains a bisecting k-means model.
    bkm = BisectingKMeans().setK(2).setSeed(1)
    model = bkm.fit(dataset)

    # Evaluate clustering: within-set sum of squared errors.
    cost = model.computeCost(dataset)
    print("Within Set Sum of Squared Errors = " + str(cost))

    # Shows the result.
    print("Cluster Centers: ")
    centers = model.clusterCenters()
    for centre in centers:
        print(centre)
    # $example off$

    spark.stop()
# NOTE(review): this chunk begins mid-list; the opening `labels = [` with the
# earlier (name, type) pairs sits above this excerpt.
    ('HYP_TENS_GEST', typ.IntegerType()),
    ('PREV_BIRTH_PRETERM', typ.IntegerType())]

births_transformed = "file:///home/yuty/yangzz/births_transformed.csv"
# Build the schema from the (name, type) label pairs; no column is nullable.
schema = typ.StructType([typ.StructField(e[0], e[1], False) for e in labels])
births = spark.read.csv(births_transformed, header=True, schema=schema)
# Assemble every column after the first (the label) into a feature vector.
# NOTE(review): the collected result is never used below -- confirm intent.
featuresCreator = ft.VectorAssembler(
    inputCols=[col[0] for col in labels[1:]],
    outputCol='features').transform(births).select('features').collect()

from pyspark.ml.linalg import Vectors
from pyspark.ml.clustering import BisectingKMeans

# Toy 2-D points with two obvious clusters.
data = [(Vectors.dense([10, 10]), ),
        (Vectors.dense([3.0, 5.0]), ),
        (Vectors.dense([0.0, 0.0]), ),
        (Vectors.dense([1.0, 1.0]), ),
        (Vectors.dense([9.0, 8.0]), ),
        (Vectors.dense([8.0, 9.0]), )]
df = spark.createDataFrame(data, ["features"])
bkm = BisectingKMeans(k=2, minDivisibleClusterSize=1.0)
model = bkm.fit(df)
centers = model.clusterCenters()
# REPL-style probes: values are computed and discarded when run as a script.
len(centers)
model.computeCost(df)
model.hasSummary
summary = model.summary
summary.k
summary.clusterSizes
transformed = model.transform(df).select("features", "prediction")
rows = transformed.collect()
rows[0].prediction
A simple example demonstrating a bisecting k-means clustering.
"""

if __name__ == "__main__":
    spark = SparkSession\
        .builder\
        .appName("PythonBisectingKMeansExample")\
        .getOrCreate()

    # $example on$
    # Each input line holds space-separated floats; wrap each in Row(features=...).
    data = spark.read.text("data/mllib/kmeans_data.txt").rdd
    parsed = data\
        .map(lambda row: Row(features=Vectors.dense([float(x) for x in row.value.split(' ')])))
    training = spark.createDataFrame(parsed)

    kmeans = BisectingKMeans().setK(2).setSeed(1).setFeaturesCol("features")
    model = kmeans.fit(training)

    # Evaluate clustering
    cost = model.computeCost(training)
    print("Bisecting K-means Cost = " + str(cost))

    centers = model.clusterCenters()
    print("Cluster Centers: ")
    for center in centers:
        print(center)
    # $example off$

    spark.stop()
# COMMAND ---------- summary = kmModel.summary print summary.clusterSizes # number of points kmModel.computeCost(sales) centers = kmModel.clusterCenters() print("Cluster Centers: ") for center in centers: print(center) # COMMAND ---------- from pyspark.ml.clustering import BisectingKMeans bkm = BisectingKMeans().setK(5).setMaxIter(5) bkmModel = bkm.fit(sales) # COMMAND ---------- summary = bkmModel.summary print summary.clusterSizes # number of points kmModel.computeCost(sales) centers = kmModel.clusterCenters() print("Cluster Centers: ") for center in centers: print(center) # COMMAND ----------
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.clustering import KMeans

# Load the toy clustering data set (three numeric columns col1..col3).
cluster_df = spark.read.csv('clustering_dataset.csv', header=True, inferSchema=True)
cluster_df.show()

# Pack col1..col3 into a single 'features' vector column.
vectorAssembler = VectorAssembler(inputCols=['col1', 'col2', 'col3'], outputCol='features')
vcluster_df = vectorAssembler.transform(cluster_df)
vcluster_df.show()

# Plain k-means with 3 clusters and a fixed seed.
kmeans = KMeans().setK(3)
kmeans = kmeans.setSeed(1)
kmodel = kmeans.fit(vcluster_df)
centers = kmodel.clusterCenters()

# hierarchical clustering
vcluster_df.show()
from pyspark.ml.clustering import BisectingKMeans

# Bisecting (hierarchical) k-means on the same vectorised frame.
bkmeans = BisectingKMeans().setK(3)
bkmeans = bkmeans.setSeed(1)
bkmodel = bkmeans.fit(vcluster_df)
bkcenters = bkmodel.clusterCenters()
# NOTE(review): this chunk starts inside a timing loop (by analogy with the
# loops below, presumably `for i in range(1, 5):`); its header is above
# this excerpt.
    start = time.time()
    kmeans = KMeans(k=8, seed=int(np.random.randint(100, size=1)), initMode="k-means||")
    modelKmeans = kmeans.fit(tsneDataFrame.select("features"))
    predictions = modelKmeans.transform(tsneDataFrame)
    end = time.time()
    times.append(end - start)
# Average wall-clock fit+transform time over the runs.
kmeansTime = average(times)

########### BISECTING K-MEANS ################
from pyspark.ml.clustering import BisectingKMeans

times = []
for i in range(1, 5):
    start = time.time()
    bkm = BisectingKMeans(k=8, seed=int(np.random.randint(100, size=1)))
    modelBkm = bkm.fit(tsneDataFrame.select("features"))
    transformedBkm = modelBkm.transform(tsneDataFrame)
    end = time.time()
    times.append(end - start)
bisectingKmeansTime = average(times)

############## GMM #################
from pyspark.ml.clustering import GaussianMixture

times = []
for i in range(1, 5):
    start = time.time()
    gmm = GaussianMixture(k=8, seed=int(np.random.randint(100, size=1)))
    modelGmm = gmm.fit(tsneDataFrame.select("features"))
    transformedGmm = modelGmm.transform(tsneDataFrame)
    end = time.time()
    # NOTE(review): the excerpt is cut here; `times.append(end - start)`
    # presumably follows as in the loops above.
kmModel = km.fit(sales) # COMMAND ---------- summary = kmModel.summary print summary.clusterSizes # number of points kmModel.computeCost(sales) centers = kmModel.clusterCenters() print("Cluster Centers: ") for center in centers: print(center) # COMMAND ---------- from pyspark.ml.clustering import BisectingKMeans bkm = BisectingKMeans().setK(5).setMaxIter(5) bkmModel = bkm.fit(sales) # COMMAND ---------- summary = bkmModel.summary print summary.clusterSizes # number of points kmModel.computeCost(sales) centers = kmModel.clusterCenters() print("Cluster Centers: ") for center in centers: print(center) # COMMAND ---------- from pyspark.ml.clustering import GaussianMixture
# Silhouette scan for plain k-means over k = 4..7.
kmeansScores = []
for k in range(4, 8):
    kmeans = KMeans().setK(k).setSeed(216)
    model = kmeans.fit(trainingData)
    prediction = model.transform(testData)
    evaluator = ClusteringEvaluator()
    score = evaluator.evaluate(prediction)
    kmeansScores.append(score)
plt.plot(range(4, 8), kmeansScores, 'ro')
plt.savefig('kmeansScores.pdf')

# The same scan for bisecting k-means.
bisectScores = []
for k in range(4, 8):
    bisection = BisectingKMeans().setK(k).setSeed(216)
    model = bisection.fit(trainingData)
    prediction = model.transform(testData)
    evaluator = ClusteringEvaluator()
    score = evaluator.evaluate(prediction)
    bisectScores.append(score)
plt.plot(range(4, 8), bisectScores, 'g^')
plt.savefig('bisectScores.pdf')
plt.clf()

# Pick the k with the best silhouette; +4 offsets the scan's start at k=4.
kmeansK = np.argmax(kmeansScores) + 4
bisectK = np.argmax(bisectScores) + 4
evaluator = ClusteringEvaluator()
# Refit at the winning k (the excerpt is cut shortly after this line).
kmeans = KMeans().setK(kmeansK).setSeed(216)
sqlContext = SQLContext(sc) # Loading required packages from pyspark.ml.linalg import Vectors from pyspark.ml.feature import VectorAssembler from pyspark.ml.clustering import KMeans # Reading the data set cluster_df = spark.read.csv("./Exercise_Files/Ch03/03_02/clustering_dataset.csv", header = True, inferSchema = True) # Examining the data frame print(cluster_df.schema) cluster_df.printSchema() print(cluster_df.columns) # VectorAssembler for transformation vectorAssembler = VectorAssembler(inputCols = ["col1", "col2", "col3"], outputCol = "features") vectorized_cluster_df = vectorAssembler.transform(cluster_df) print(vectorized_cluster_df.select(["features"]).show()) print(vectorized_cluster_df.take(1)) # K-means clustering - not working for some reason! # kmeans = KMeans().setK(3).setSeed(1) # kmeans_model = KMeans.fit(vectorized_cluster_df.select("features"), 3, maxIterations = 10, initializationMode = "random") # km_centers = kmeans_model.clusterCenters() # Hierarchical clustering (Bisecting K-means) - not working for same reason as KMeans from pyspark.ml.clustering import BisectingKMeans bkmeans = BisectingKMeans().setK(3).setSeed(1) # bk_model = bkmeans.fit(vectorized_cluster_df) # bk_centers = bk_model.fit(vectorized_cluster_df)
# Elbow scan: WSSSE for k = 2..8 with plain k-means.
dataset = outputFeatureDf
kValues = [2, 3, 4, 5, 6, 7, 8]
wssse = []
for k in kValues:
    kmeans = KMeans().setK(k).setSeed(122)
    model = kmeans.fit(dataset)
    wssse.append(model.computeCost(dataset))
for i in wssse:
    print(i)

# In[29]:

from pyspark.ml.clustering import BisectingKMeans

# Trains a bisecting k-means model.
bkm = BisectingKMeans().setK(2).setSeed(1222)
model = bkm.fit(outputFeatureDf)

# Evaluate clustering (dataset is the same frame the model was fitted on).
cost = model.computeCost(dataset)
print("Within Set Sum of Squared Errors = " + str(cost))

# Shows the result.
print("Cluster Centers: ")
centers = model.clusterCenters()
for center in centers:
    print(center)

# In[30]:

from sklearn.metrics.cluster import completeness_score

transformed = model.transform(dataset)
# NOTE(review): the excerpt is cut here; `labels` presumably feeds
# completeness_score against the predicted clusters below this chunk.
labels = labeldf.collect()