import matplotlib.pyplot as plt
import scipy.cluster.hierarchy as hc
from pyspark.ml.clustering import BisectingKMeans
from pyspark.ml.feature import VectorAssembler
from pyspark.sql import SparkSession


def create_kmeans_dendrogram(input_csv, num_clusters):
    spark = SparkSession.builder.appName('HotelsPriceDataGeneratorSession').getOrCreate()

    # Lazy op - load the data. Note: if an explicit schema were supplied and the
    # CSV did not fit it, the result would be a fully null DataFrame.
    print('Reading CSV from ' + input_csv)
    generated_hotels_df = spark.read.csv(input_csv, header=True, inferSchema=True)

    # Cap the number of clusters at the number of feature columns
    num_clusters = min(num_clusters, len(generated_hotels_df.columns[1:]))

    # Assemble the feature columns into a single vector column
    vecAssembler = VectorAssembler(inputCols=generated_hotels_df.columns[1:], outputCol="features")
    vector_df = vecAssembler.transform(generated_hotels_df)

    # Run BisectingKMeans to find hierarchical clusters
    kmeans = BisectingKMeans().setK(num_clusters).setSeed(42)
    model = kmeans.fit(vector_df)

    # Link the cluster centers to find relations between the clusters
    z = hc.linkage(model.clusterCenters(), method='average', metric='correlation')

    # Plot the dendrogram
    hc.dendrogram(z)
    plt.show()
def getTopClusters(startDate, endDate, startTime, endTime, category):
    filteredDF = applyFilter(startDate, endDate, startTime, endTime, category).cache()

    # Extract X, Y into a feature vector
    vectorizer = VectorAssembler()
    vectorizer.setInputCols(["X", "Y"])
    vectorizer.setOutputCol("features")
    pointsDF = vectorizer.transform(filteredDF).cache()

    # Hierarchical (bisecting) k-means
    bkm = BisectingKMeans().setK(10).setSeed(7).setMaxIter(7)
    model = bkm.fit(pointsDF)

    # RDD of (clusterIndex, size)
    clustersRDD = (model.transform(pointsDF)
                   .select("prediction").rdd
                   .map(lambda row: (row["prediction"], 1))
                   .reduceByKey(lambda a, c: a + c))

    clusters = model.clusterCenters()
    clusterRV = clustersRDD.collect()
    rv = []
    for ind, num in clusterRV:
        val = {"c": (clusters[ind][0], clusters[ind][1]), "o": num}
        rv.append(val)
    return rv
def __find_cluster_split_kmeans_sparkdf(cls, feature_col, df_norm, n_iterations, kmeans_method, sc):
    from pyspark.ml.clustering import KMeans, BisectingKMeans

    start_time = time.time()

    # Convert the pandas DataFrame to a Spark DataFrame
    sqlContext = SQLContext(sc)
    spark_df = sqlContext.createDataFrame(df_norm)

    # Assemble the feature vector
    vecAssembler = VectorAssembler(inputCols=feature_col, outputCol="features")
    spark_df_clustering = vecAssembler.transform(spark_df).select('features')

    n_components_list = []
    n_range = np.arange(2, 20)
    for iteration in np.arange(n_iterations):
        cost = []
        for k in n_range:
            if kmeans_method == 'kmeans':
                print("Kmeans Elbow Method K = ", k)
                kmeans = KMeans().setK(k).setSeed(1).setFeaturesCol("features")
                model = kmeans.fit(spark_df_clustering)
            elif kmeans_method == 'bisecting_kmeans':
                print("Bisecting Kmeans Elbow Method K = ", k)
                bkm = BisectingKMeans().setK(k).setSeed(1).setFeaturesCol("features")
                model = bkm.fit(spark_df_clustering)
            cost.append(model.computeCost(spark_df_clustering))  # requires Spark 2.0 or later
        print('Cluster List: ', n_range)
        print('Within Set Sum of Squared Errors: ', cost)
        n_split_knee = cls.__knee_locator(n_range, cost, 'convex', 'decreasing', 'sum_of_square_error')
        print("Recommended no. of components by knee locator: " + str(n_split_knee))
        n_components_list.append(n_split_knee)

    n_components = int(np.median(n_components_list).round(0))
    print('Recommended median number of splits: ', n_components)
    print("elbow method time: ", time.time() - start_time, "(sec)")
    return n_components
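# --- Hedged sketch, not the source's __knee_locator: one common way to pick
# the elbow from the (k, WSSSE) curve above is the third-party `kneed`
# package; whether the class's helper actually wraps it is an assumption ---
from kneed import KneeLocator

def _find_elbow(n_range, cost):
    # A falling WSSSE-vs-k curve is convex and decreasing
    return KneeLocator(n_range, cost, curve='convex', direction='decreasing').knee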
def get_clusters(self, parameters: dict, urls_and_vectors: DataFrame) -> DataFrame:
    urls_and_vectors = urls_and_vectors.cache()
    bisecting_kmeans = (BisectingKMeans()
                        .setK(parameters['k'])
                        .setDistanceMeasure(parameters['distance_measure'])
                        .setFeaturesCol("vector")
                        .setPredictionCol("cluster_id"))
    model = bisecting_kmeans.fit(urls_and_vectors)
    clustered_url_vectors = model.transform(urls_and_vectors)
    urls_and_vectors.unpersist()
    return clustered_url_vectors
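# --- Hedged usage sketch for get_clusters above (assumptions: a live
# SparkSession named `spark`; the sample URLs and vectors are illustrative,
# not from the source; setDistanceMeasure requires Spark 2.4+) ---
from pyspark.ml.linalg import Vectors
from pyspark.ml.clustering import BisectingKMeans

def _get_clusters_demo(spark):
    # Two tight groups of 2-D points in the "vector" column that
    # get_clusters reads as its features column
    urls_and_vectors = spark.createDataFrame(
        [("a", Vectors.dense([0.0, 0.0])), ("b", Vectors.dense([0.1, 0.1])),
         ("c", Vectors.dense([9.0, 9.0])), ("d", Vectors.dense([9.1, 9.1]))],
        ["url", "vector"])
    bkm = (BisectingKMeans().setK(2).setDistanceMeasure("euclidean")
           .setFeaturesCol("vector").setPredictionCol("cluster_id"))
    return bkm.fit(urls_and_vectors).transform(urls_and_vectors)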
def clustering(df_kmeans, n):
    kmeans = BisectingKMeans().setK(n).setSeed(1).setFeaturesCol("features")
    print('kmeans ', kmeans)
    model = kmeans.fit(df_kmeans)
    centers = model.clusterCenters()
    print("Cluster Centers: ")
    for center in centers:
        print(center)
def bisecting_k_means(self, k):
    print('\nBisecting K-Means - ' + str(k))
    kmeans = BisectingKMeans().setK(k).setSeed(1)
    model = kmeans.fit(self.df.select('features'))
    transformed = model.transform(self.df)
    transformed.groupBy("prediction").count().show()
    centers = model.clusterCenters()
    self.print_centers(centers)
def bisect_model(data):
    # TODO: grid-search the best parameters
    bkm = BisectingKMeans().setK(2).setSeed(1)
    model = bkm.fit(data)
    cost = model.computeCost(data)
    log.info("Within Set Sum of Squared Errors = " + str(cost))
    log.info("Cluster Centers: ")
    centers = model.clusterCenters()
    for center in centers:
        log.info(center)
    predictions_bi = model.transform(data)
    return predictions_bi
def model_list():
    clist = []
    df2 = df1.select('features')
    df2.cache()
    df1.cache()
    for i in range(2, 20):
        kmeans = BisectingKMeans(k=i, minDivisibleClusterSize=1.0)
        model = kmeans.fit(df2)
        WSSSE = model.computeCost(df1)
        # print("Within Set Sum of Squared Error, k = " + str(i) + ": " + str(WSSSE))
        clist.append({i: WSSSE, 'model': model})
    df1.unpersist()
    df2.unpersist()
    return clist
def test_bisecting_kmeans_summary(self):
    data = [(Vectors.dense(1.0),), (Vectors.dense(5.0),), (Vectors.dense(10.0),),
            (Vectors.sparse(1, [], []),)]
    df = self.spark.createDataFrame(data, ["features"])
    bkm = BisectingKMeans(k=2)
    model = bkm.fit(df)
    self.assertTrue(model.hasSummary)
    s = model.summary
    self.assertTrue(isinstance(s.predictions, DataFrame))
    self.assertEqual(s.featuresCol, "features")
    self.assertEqual(s.predictionCol, "prediction")
    self.assertTrue(isinstance(s.cluster, DataFrame))
    self.assertEqual(len(s.clusterSizes), 2)
    self.assertEqual(s.k, 2)
    self.assertEqual(s.numIter, 20)
def __bisecting_k_mean(cls, k_clusters, xnorm, feature_col, sc):
    # k_clusters = elbow point
    start_time = time.time()

    # Convert the normalized array to a Spark DataFrame
    sqlContext = SQLContext(sc)
    df_norm = pd.DataFrame(data=xnorm, columns=feature_col)
    spark_df = sqlContext.createDataFrame(df_norm)

    # Assemble the feature vector
    vecAssembler = VectorAssembler(inputCols=feature_col, outputCol="features")
    spark_df_clustering = vecAssembler.transform(spark_df).select('features')

    bkm = BisectingKMeans().setK(k_clusters).setSeed(1).setFeaturesCol("features")
    model = bkm.fit(spark_df_clustering)
    prediction = model.transform(spark_df_clustering).select('prediction').collect()
    labels = [p.prediction for p in prediction]
    return labels
def search_opt_k(df_kmeans):
    # Trains a k-means model.
    df_kmeans.show()

    # Find the optimal k with the elbow method
    cost = np.zeros(20)
    for k in range(2, 20):
        kmeans = BisectingKMeans().setK(k).setSeed(1).setFeaturesCol("features")
        print('kmeans ', kmeans)
        model = kmeans.fit(df_kmeans.sample(False, 0.1, seed=42))
        cost[k] = model.computeCost(df_kmeans)  # requires Spark 2.0 or later
    # print(cost)

    # Visualize the elbow
    fig, ax = plt.subplots(1, 1, figsize=(8, 6))
    ax.plot(range(2, 20), cost[2:20])
    ax.set_xlabel('k')
    ax.set_ylabel('cost')
    plt.show()
def train(df, hiperparameter):
    '''
    KMeans training, returning a fitted bisecting k-means model.
    Input:
        - df: DataFrame with an assembled features column
        - hiperparameter: dict of hyperparameter configuration
    Returns: bisecting k-means model
    '''
    bs_kmeans = BisectingKMeans(
        featuresCol=hiperparameter['featuresCol'],
        predictionCol=hiperparameter['predictionCol'],
        maxIter=hiperparameter['maxIter'],
        seed=hiperparameter['seed'],
        k=hiperparameter['k'],
        minDivisibleClusterSize=hiperparameter['minDivisibleClusterSize'])
    model = bs_kmeans.fit(df)
    return model
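# --- Hedged usage sketch for train above: the dict keys match the lookups
# made inside train; the concrete values are illustrative assumptions ---
example_hiperparameter = {
    'featuresCol': 'features',
    'predictionCol': 'prediction',
    'maxIter': 20,
    'seed': 1,
    'k': 4,
    'minDivisibleClusterSize': 1.0,
}
# model = train(assembled_df, example_hiperparameter)
# where assembled_df is any DataFrame with a 'features' vector column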
def main(argv):
    spark = SparkSession.builder \
        .appName('VIDEO_CLUSTERING') \
        .master('spark://{}:{}'.format(SPARK_MASTER_ADDR, SPARK_MASTER_PORT)) \
        .getOrCreate()
    # Note: driver/executor resources normally must be configured before the
    # session is created; setting them on spark.conf here may have no effect.
    spark.conf.set('spark.sql.execution.arrow.enabled', 'true')
    spark.conf.set('spark.driver.maxResultSize', '0')
    spark.conf.set('spark.driver.cores', '4')
    spark.conf.set('spark.driver.memory', '4g')
    spark.conf.set('spark.executor.memory', '4g')
    spark.conf.set('spark.executor.cores', '4')

    video_type_code = handle_params(argv)

    video_df = spark.read.format('jdbc')\
        .option('url', 'jdbc:mysql://192.168.174.133:3306/big_data')\
        .option('driver', 'com.mysql.cj.jdbc.Driver')\
        .option('dbtable', 'VIDEO_STATISTIC')\
        .option('user', 'root').option('password', 'root').load()

    assembler = VectorAssembler()\
        .setInputCols(['play_count', 'favorite_count', 'comment_count', 'barrage_count'])\
        .setOutputCol('features')

    # Fit on a 1000-row sample, then apply the model to the full table
    video_vector = assembler.transform(video_df.select(
        'play_count', 'favorite_count', 'comment_count', 'barrage_count'
    ).limit(1000))
    bkm = BisectingKMeans(k=8, minDivisibleClusterSize=1.0)
    model = bkm.fit(video_vector)
    centers = model.clusterCenters()

    video_vector = assembler.transform(video_df.select(
        'play_count', 'favorite_count', 'comment_count', 'barrage_count'
    ))
    transformed = model.transform(video_vector).select('features', 'prediction')
    transformed.show()
def bisecting_kmeans(features, num_clusters):
    """Clusters the features dataset using bisecting k-means.

    Params:
    - features (pyspark.sql.DataFrame): The data frame containing the features
      to be used for clustering
    - num_clusters (int): The number of clusters to be used

    Returns:
    - clustered (pyspark.sql.DataFrame): The data frame, with the predicted
      clusters in a 'cluster' column
    """
    kmeans = BisectingKMeans(k=num_clusters, featuresCol='features', predictionCol='cluster')
    kmeans_model = kmeans.fit(features)
    clustered = kmeans_model.transform(features)
    clustered.show()
    print("=====Clustering Results=====")
    print("Clustering cost = ", kmeans_model.computeCost(features))  # deprecated/removed in newer Spark
    print("Cluster sizes = ", kmeans_model.summary.clusterSizes)
    return clustered
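# --- Hedged usage sketch for bisecting_kmeans above (assumptions: a live
# SparkSession named `spark`; the toy rows and column names are illustrative;
# the computeCost call inside requires a pre-3.x Spark) ---
from pyspark.ml.feature import VectorAssembler

def _bisecting_kmeans_demo(spark):
    raw = spark.createDataFrame(
        [(0.0, 0.0), (0.2, 0.1), (8.0, 9.0), (9.0, 8.5)], ["x", "y"])
    features = VectorAssembler(inputCols=["x", "y"],
                               outputCol="features").transform(raw)
    return bisecting_kmeans(features, num_clusters=2)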
def main(args):
    spark = SparkSession\
        .builder\
        .master(args[2])\
        .appName(args[1])\
        .getOrCreate()

    start_computing_time = time.time()

    # Load the data stored in LIBSVM format as a DataFrame.
    data = spark.read.format("libsvm").load(args[3])
    (trainingData, testData) = data.randomSplit([0.7, 0.3], seed=1234)

    # Trains a bisecting k-means model.
    bkm = BisectingKMeans().setK(2).setSeed(1)
    model = bkm.fit(trainingData)

    # Make predictions
    predictions = model.transform(testData)

    appendTime(sys.argv, start_computing_time)
    spark.stop()
def bisecting_k_means():
    spark = SparkSession \
        .builder \
        .appName("Python Spark SQL basic example") \
        .config("spark.some.config.option", "some-value") \
        .getOrCreate()

    data = [(Vectors.dense([0.0, 0.0]),), (Vectors.dense([1.0, 1.0]),),
            (Vectors.dense([9.0, 8.0]),), (Vectors.dense([8.0, 9.0]),)]
    df = spark.createDataFrame(data, ["features"])
    bkm = BisectingKMeans(k=2, minDivisibleClusterSize=1.0)
    model = bkm.fit(df)
    centers = model.clusterCenters()
    len(centers)
    model.computeCost(df)
    model.hasSummary
    summary = model.summary
    summary.k
    summary.clusterSizes

    # Predict
    transformed = model.transform(df).select("features", "prediction")
    rows = transformed.collect()
    rows[0].prediction == rows[1].prediction
    rows[2].prediction == rows[3].prediction
""" A simple example demonstrating a bisecting k-means clustering. """ if __name__ == "__main__": sc = SparkContext(appName="PythonBisectingKMeansExample") sqlContext = SQLContext(sc) # $example on$ data = sc.textFile("data/mllib/kmeans_data.txt") parsed = data.map(lambda l: Row(features=Vectors.dense([float(x) for x in l.split(' ')]))) training = sqlContext.createDataFrame(parsed) kmeans = BisectingKMeans().setK(2).setSeed(1).setFeaturesCol("features") model = kmeans.fit(training) # Evaluate clustering cost = model.computeCost(training) print("Bisecting K-means Cost = " + str(cost)) centers = model.clusterCenters() print("Cluster Centers: ") for center in centers: print(center) # $example off$ sc.stop()
from pyspark.ml.clustering import BisectingKMeans
# $example off$
from pyspark.sql import SparkSession

if __name__ == "__main__":
    spark = SparkSession\
        .builder\
        .appName("BisectingKMeansExample")\
        .getOrCreate()

    # $example on$
    # Loads data.
    dataset = spark.read.format("libsvm").load("data/mllib/sample_kmeans_data.txt")

    # Trains a bisecting k-means model.
    bkm = BisectingKMeans().setK(2).setSeed(1)
    model = bkm.fit(dataset)

    # Evaluate clustering.
    cost = model.computeCost(dataset)
    print("Within Set Sum of Squared Errors = " + str(cost))

    # Shows the result.
    print("Cluster Centers: ")
    centers = model.clusterCenters()
    for center in centers:
        print(center)
    # $example off$

    spark.stop()
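# --- Hedged note on the computeCost call above: it was deprecated in Spark
# 3.0 and later removed; a version-safe alternative scores the clustering
# with ClusteringEvaluator (silhouette), sketched here ---
from pyspark.ml.evaluation import ClusteringEvaluator

def _silhouette(model, dataset):
    # Silhouette with squared Euclidean distance (the evaluator's default)
    return ClusteringEvaluator().evaluate(model.transform(dataset))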
for i in col_name:
    data = data.withColumn(i, data[i].cast(DoubleType()))

dataWithFeatures = VectorAssembler(inputCols=col_name, outputCol="features")
data = dataWithFeatures.transform(data)
trainingData = data
testData = data

'''RFC = RandomForestClassifier(featuresCol="features", labelCol="label", predictionCol="prediction",
                               probabilityCol="probability", rawPredictionCol="rawPrediction",
                               maxDepth=14, maxBins=32, minInstancesPerNode=2, minInfoGain=0.0,
                               maxMemoryInMB=256, cacheNodeIds=True, checkpointInterval=10,
                               impurity="gini", numTrees=2000, featureSubsetStrategy="auto",
                               seed=None, subsamplingRate=0.8)'''

BKM = BisectingKMeans(featuresCol="features", predictionCol="prediction", maxIter=20,
                      seed=1, k=4, minDivisibleClusterSize=1.0)

# Train model.
# paramMap1 = {gbt.stepSize: 0.05, gbt.minInstancesPerNode: 2, gbt.maxDepth: 6,
#              gbt.cacheNodeIds: True, gbt.subsamplingRate: 1, gbt.maxIter: 200}
BKM_model = BKM.fit(trainingData)
cost = BKM_model.computeCost(trainingData)
print("Within Set Sum of Squared Errors = " + str(cost))
centers = BKM_model.clusterCenters()
summary = BKM_model.summary
print(summary.k)
print(summary.clusterSizes)

# Make predictions.
# "features", "label", "mobile", "prediction"
prediction_result = BKM_model.transform(testData)
# prediction_result.take(5)
prediction_result.select('mobile', 'label', 'prediction').repartition(1).write.csv(
    '/user/wangkang/qiche/word2vec/Kmeans', mode='overwrite')

'''conf = SparkConf()
conf.setAppName('a')
sc = SparkContext(conf=conf)
    ('HYP_TENS_GEST', typ.IntegerType()),
    ('PREV_BIRTH_PRETERM', typ.IntegerType())]

births_transformed = "file:///home/yuty/yangzz/births_transformed.csv"
schema = typ.StructType([typ.StructField(e[0], e[1], False) for e in labels])
births = spark.read.csv(births_transformed, header=True, schema=schema)

featuresCreator = ft.VectorAssembler(
    inputCols=[col[0] for col in labels[1:]],
    outputCol='features').transform(births).select('features').collect()

from pyspark.ml.linalg import Vectors
from pyspark.ml.clustering import BisectingKMeans

data = [(Vectors.dense([10, 10]),), (Vectors.dense([3.0, 5.0]),),
        (Vectors.dense([0.0, 0.0]),), (Vectors.dense([1.0, 1.0]),),
        (Vectors.dense([9.0, 8.0]),), (Vectors.dense([8.0, 9.0]),)]
df = spark.createDataFrame(data, ["features"])
bkm = BisectingKMeans(k=2, minDivisibleClusterSize=1.0)
model = bkm.fit(df)
centers = model.clusterCenters()
len(centers)
model.computeCost(df)
model.hasSummary
summary = model.summary
summary.k
summary.clusterSizes
transformed = model.transform(df).select("features", "prediction")
rows = transformed.collect()
rows[0].prediction
A simple example demonstrating a bisecting k-means clustering.
"""
if __name__ == "__main__":
    spark = SparkSession\
        .builder\
        .appName("PythonBisectingKMeansExample")\
        .getOrCreate()

    # $example on$
    data = spark.read.text("data/mllib/kmeans_data.txt").rdd
    parsed = data\
        .map(lambda row: Row(features=Vectors.dense([float(x) for x in row.value.split(' ')])))
    training = spark.createDataFrame(parsed)

    kmeans = BisectingKMeans().setK(2).setSeed(1).setFeaturesCol("features")
    model = kmeans.fit(training)

    # Evaluate clustering
    cost = model.computeCost(training)
    print("Bisecting K-means Cost = " + str(cost))

    centers = model.clusterCenters()
    print("Cluster Centers: ")
    for center in centers:
        print(center)
    # $example off$

    spark.stop()
# Apply the above expression
df_vector = df_limit.select(*expr)

# Transform the DataFrame based on the vector assembler
df_trans = vectorAssembler.transform(df_vector)

# Create an id that can be used to correlate each observation to its feature vector
df_trans = df_trans.withColumn("id", monotonically_increasing_id())
df_limit = df_limit.withColumn("id", monotonically_increasing_id()).drop("Latitude").drop("Longitude")

# Drop one of the id columns after joining
df_joined = df_limit.join(df_trans, "id", "inner").drop("id")
df_joined.cache()

bkm = BisectingKMeans(k=num_of_Clusters, minDivisibleClusterSize=minDivisSize, featuresCol="Features")
model = bkm.fit(df_joined)

log("Model was trained using the following parameters:")
log("")
log(model.extractParamMap())
log("")

centers = model.clusterCenters()
log("The coordinates of each cluster center:")
log(centers)

summary = model.summary
log("Size of each identified cluster:")
log(summary.clusterSizes)

# DataFrame of predicted cluster centers for each training data point
kmeans = KMeans(k=8, seed=int(np.random.randint(100, size=1)), initMode="k-means||")
modelKmeans = kmeans.fit(tsneDataFrame.select("features"))
predictions = modelKmeans.transform(tsneDataFrame)
end = time.time()
times.append(end - start)
kmeansTime = average(times)

########### BISECTING K-MEANS ################
from pyspark.ml.clustering import BisectingKMeans

times = []
for i in range(1, 5):
    start = time.time()
    bkm = BisectingKMeans(k=8, seed=int(np.random.randint(100, size=1)))
    modelBkm = bkm.fit(tsneDataFrame.select("features"))
    transformedBkm = modelBkm.transform(tsneDataFrame)
    end = time.time()
    times.append(end - start)
bisectingKmeansTime = average(times)

############## GMM #################
from pyspark.ml.clustering import GaussianMixture

times = []
for i in range(1, 5):
    start = time.time()
    gmm = GaussianMixture(k=8, seed=int(np.random.randint(100, size=1)))
    modelGmm = gmm.fit(tsneDataFrame.select("features"))
    transformedGmm = modelGmm.transform(tsneDataFrame)
    end = time.time()
    times.append(end - start)
kmeansScores = []
for k in range(4, 8):
    kmeans = KMeans().setK(k).setSeed(216)
    model = kmeans.fit(trainingData)
    prediction = model.transform(testData)
    evaluator = ClusteringEvaluator()
    score = evaluator.evaluate(prediction)
    kmeansScores.append(score)
plt.plot(range(4, 8), kmeansScores, 'ro')
plt.savefig('kmeansScores.pdf')

bisectScores = []
for k in range(4, 8):
    bisection = BisectingKMeans().setK(k).setSeed(216)
    model = bisection.fit(trainingData)
    prediction = model.transform(testData)
    evaluator = ClusteringEvaluator()
    score = evaluator.evaluate(prediction)
    bisectScores.append(score)
plt.plot(range(4, 8), bisectScores, 'g^')
plt.savefig('bisectScores.pdf')
plt.clf()

kmeansK = np.argmax(kmeansScores) + 4
bisectK = np.argmax(bisectScores) + 4

evaluator = ClusteringEvaluator()
kmeans = KMeans().setK(kmeansK).setSeed(216)
kmModel = kmeans.fit(trainingData)
dataset = outputFeatureDf
kValues = [2, 3, 4, 5, 6, 7, 8]
wssse = []
for k in kValues:
    kmeans = KMeans().setK(k).setSeed(122)
    model = kmeans.fit(dataset)
    wssse.append(model.computeCost(dataset))
for i in wssse:
    print(i)

# In[29]:

from pyspark.ml.clustering import BisectingKMeans

# Trains a bisecting k-means model.
bkm = BisectingKMeans().setK(2).setSeed(1222)
model = bkm.fit(outputFeatureDf)

# Evaluate clustering.
cost = model.computeCost(dataset)
print("Within Set Sum of Squared Errors = " + str(cost))

# Shows the result.
print("Cluster Centers: ")
centers = model.clusterCenters()
for center in centers:
    print(center)

# In[30]:

from sklearn.metrics.cluster import completeness_score

transformed = model.transform(dataset)
labels = labeldf.collect()
label_array = [int(i[0]) for i in labels]
data = sc.textFile("practice5.data")
data_label = data.map(parsePoint)
data_label = np.array(data_label.collect())
trainData, testData = sort_by_target(data_label)

trainData = map(lambda x: (int(x[-1]), Vectors.dense(x[:-1])), trainData)
testData = map(lambda x: (int(x[-1]), Vectors.dense(x[:-1])), testData)

trainData = sqlContext.createDataFrame(trainData, schema=["label", "features"])
trFeat = trainData.select([c for c in trainData.columns if c in ["features"]])
trLab = trainData.select([c for c in trainData.columns if c in ["label"]])

testData = sqlContext.createDataFrame(testData, schema=["label", "features"])
tsFeat = testData.select([c for c in testData.columns if c in ["features"]])
tsLab = testData.select([c for c in testData.columns if c in ["label"]])

bkm = BSK(k=10, minDivisibleClusterSize=1.0)
model = bkm.fit(trFeat)
predict = model.transform(tsFeat).select("prediction")
predict = predict.rdd.flatMap(lambda x: x).collect()
Label = [int(row['label']) for row in tsLab.collect()]

f = open('result.txt', 'w')
f.write('NMI of hierarchical clustering\n')
f.write('{:.4f}'.format(NMI(Label, predict)))
sc.stop()
# COMMAND ----------

summary = kmModel.summary
print(summary.clusterSizes)  # number of points
kmModel.computeCost(sales)
centers = kmModel.clusterCenters()
print("Cluster Centers: ")
for center in centers:
    print(center)

# COMMAND ----------

from pyspark.ml.clustering import BisectingKMeans

bkm = BisectingKMeans().setK(5).setMaxIter(5)
bkmModel = bkm.fit(sales)

# COMMAND ----------

summary = bkmModel.summary
print(summary.clusterSizes)  # number of points
bkmModel.computeCost(sales)
centers = bkmModel.clusterCenters()
print("Cluster Centers: ")
for center in centers:
    print(center)

# COMMAND ----------

from pyspark.ml.clustering import GaussianMixture

gmm = GaussianMixture().setK(5)
stand_scaled = StandardScaler(inputCol="DenseVector", outputCol="features",
                              withStd=True, withMean=True)
'''
The outputCol is named "features" because Spark's KMeans/BisectingKMeans
read from the column named by featuresCol, which defaults to "features".
'''
scaled_model = stand_scaled.fit(train_df)
train_df = scaled_model.transform(train_df)

bkmeans = BisectingKMeans().setK(2)
bkmeans = bkmeans.setSeed(1)
bkmodel = bkmeans.fit(train_df)
bkcenters = bkmodel.clusterCenters()
if bkmodel.hasSummary:
    print(bkmodel.summary.clusterSizes)
print(bkmodel.clusterCenters())

predict_df = bkmodel.transform(train_df)
predict_df = predict_df.select("avgMeasuredTime", "avgSpeed", "vehicleCount", "prediction")
predict_df.show(2)

c1 = mpatches.Patch(color="green", label="No Traffic")
from pyspark.ml.evaluation import ClusteringEvaluator

# SETTING UP SPARK CONTEXT AND SESSION
conf = SparkConf()
sc = SparkContext(conf=conf)
sc.setLogLevel("ERROR")
spark = SparkSession(sc)

# READING THE GIVEN DATA FILE FROM LIBSVM FORMAT INTO A DATAFRAME
dataset = spark.read.format("libsvm").load(
    "/home/vmalapati1/data/kmeans_input.txt")

# INITIALIZING THE BISECTING K-MEANS ALGORITHM
kmeans = BisectingKMeans(k=2, seed=1)  # 2 clusters here

# TRAINING THE MODEL WITH THE ABOVE DATA FRAME
model = kmeans.fit(dataset)

# PREDICTING THE RESULTS BASED ON THE INPUT OF THE MODEL
transformed = model.transform(dataset)
transformed.show(200)
# dataset.show()

# COMPUTING THE COST OF THE MODEL
cost = model.computeCost(dataset)
print("Within Set Sum of Squared Errors = " + str(cost))

# FINDING THE CENTERS OF THE CLUSTERS
centers = model.clusterCenters()

# PRINTING THE CENTERS OF THE CLUSTERS
for center in centers:
    print(center)
bin/spark-submit examples/src/main/python/ml/bisecting_k_means_example.py
"""
if __name__ == "__main__":
    spark = SparkSession\
        .builder\
        .appName("PythonBisectingKMeansExample")\
        .getOrCreate()

    # $example on$
    # Loads data.
    dataset = spark.read.format("libsvm").load(
        "data/mllib/sample_kmeans_data.txt")

    # Trains a bisecting k-means model.
    bkm = BisectingKMeans().setK(2).setSeed(1)
    model = bkm.fit(dataset)

    # Evaluate clustering.
    cost = model.computeCost(dataset)
    print("Within Set Sum of Squared Errors = " + str(cost))

    # Shows the result.
    print("Cluster Centers: ")
    centers = model.clusterCenters()
    for center in centers:
        print(center)
    # $example off$

    spark.stop()
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.clustering import KMeans

cluster_df = spark.read.csv('clustering_dataset.csv', header=True, inferSchema=True)
cluster_df.show()

vectorAssembler = VectorAssembler(inputCols=['col1', 'col2', 'col3'],
                                  outputCol='features')
vcluster_df = vectorAssembler.transform(cluster_df)
vcluster_df.show()

kmeans = KMeans().setK(3)
kmeans = kmeans.setSeed(1)
kmodel = kmeans.fit(vcluster_df)
centers = kmodel.clusterCenters()

# Hierarchical clustering
vcluster_df.show()

from pyspark.ml.clustering import BisectingKMeans

bkmeans = BisectingKMeans().setK(3)
bkmeans = bkmeans.setSeed(1)
bkmodel = bkmeans.fit(vcluster_df)
bkcenters = bkmodel.clusterCenters()
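# --- Hedged follow-up sketch: comparing the two fitted models above with
# ClusteringEvaluator (silhouette); assumes the kmodel/bkmodel/vcluster_df
# names from the snippet above ---
from pyspark.ml.evaluation import ClusteringEvaluator

evaluator = ClusteringEvaluator(featuresCol='features')
print('k-means silhouette:', evaluator.evaluate(kmodel.transform(vcluster_df)))
print('bisecting silhouette:', evaluator.evaluate(bkmodel.transform(vcluster_df)))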
def train_model(dataset):
    # Trains a bisecting k-means model.
    bkm = BisectingKMeans().setK(2).setSeed(1)
    model = bkm.fit(dataset)
    return model
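# --- Hedged usage sketch for train_model above (assumptions: a live
# SparkSession named `spark`; the toy vectors are illustrative) ---
from pyspark.ml.linalg import Vectors

def _train_model_demo(spark):
    dataset = spark.createDataFrame(
        [(Vectors.dense([0.0, 0.0]),), (Vectors.dense([1.0, 1.0]),),
         (Vectors.dense([9.0, 8.0]),), (Vectors.dense([8.0, 9.0]),)],
        ["features"])
    return train_model(dataset).clusterCenters()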