def perform_training(sc: SparkContext, params_dict: dict):
    normal_ekg_data_path = params_dict.get('normal_ekg_data_path')
    min_num_of_clusters = int(params_dict.get('min_num_of_clusters', 5))
    max_num_of_clusters = int(params_dict.get('max_num_of_clusters', 20))
    boundary_ratio = float(params_dict.get('boundary_ratio', 0.8))
    ekg_rdd_data = sc.textFile(normal_ekg_data_path).map(
        lambda line: np.array([float(val) for val in line.split(',')]))
    # ekg_rdd_data.foreach(Plotter.plot_signal_window)
    k_range = range(min_num_of_clusters, max_num_of_clusters)
    prev_cost = float(np.inf)
    final_km = KMeansModel(ekg_rdd_data.takeSample(False, 1))
    cost_ratios = []
    found_best = False
    for k in k_range:
        km = KMeans.train(ekg_rdd_data, k)
        # cost is the sum of squared distances of samples to the nearest cluster centre
        cost = km.computeCost(ekg_rdd_data)
        ratio = cost / prev_cost
        prev_cost = cost
        cost_ratios.append(ratio)
        # keep the first model whose cost improves by less than (1 - boundary_ratio) over the previous k
        if ratio > boundary_ratio and not found_best:
            final_km = km
            found_best = True
    Plotter.plot_elbow(cost_ratios, k_range)
    return final_km
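# A minimal usage sketch for perform_training (not part of the original source). It assumes the
# pyspark imports used above and a CSV at 'data/normal_ekg.csv' (a placeholder path) containing
# one comma-separated EKG window per line.
sc = SparkContext(appName="EKGClustering")
params = {
    'normal_ekg_data_path': 'data/normal_ekg.csv',  # placeholder path
    'min_num_of_clusters': 5,
    'max_num_of_clusters': 20,
    'boundary_ratio': 0.8,
}
best_model = perform_training(sc, params)
print("Selected %d cluster centres" % len(best_model.clusterCenters))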
def kmeansInitialClusters(dataset):
    model = KMeansModel(CENTER_VECTORS)
    vectorsRdd = dataset.rdd.map(lambda data: Vectors.parse(Vectors.stringify(data['features'])))
    trainedModel = KMeans.train(vectorsRdd, 4, maxIterations=1000, initialModel=model)
    result = []
    for d in dataset.collect():
        entry = {}
        entry["features"] = d["features"]
        entry["prediction"] = trainedModel.predict(Vectors.parse(Vectors.stringify(d['features'])))
        entry["label"] = d['label']
        result.append(entry)
    plotDiversitySizeClustering(result, CENTERS, "Size", "Diversity",
                                "Song Analysis by Size and Diversity with Initial Clusters")
    centroidArtistSongCount(result, CENTERS)
def assign_pooling(data):
    image_name, feature_matrix = data[0]
    clusterCenters = data[1]
    feature_matrix = np.array(feature_matrix)
    model = KMeansModel(clusterCenters)
    bow = np.zeros(len(clusterCenters))
    for x in feature_matrix:
        k = model.predict(x)
        dist = distance.euclidean(clusterCenters[k], x)
        bow[k] = max(bow[k], dist)
    clusters = bow.tolist()
    group = clusters.index(min(clusters)) + 1
    return [image_name, group]
def assign_pooling(row, clusterCenters, pooling):
    image_name = row['fileName']
    feature_matrix = np.array(row['features'])
    clusterCenters = clusterCenters.value  # unwrap the broadcast variable
    model = KMeansModel(clusterCenters)
    bow = np.zeros(len(clusterCenters))
    for x in feature_matrix:
        k = model.predict(x)
        dist = distance.euclidean(clusterCenters[k], x)
        if pooling == "max":
            bow[k] = max(bow[k], dist)
        elif pooling == "sum":
            bow[k] = bow[k] + dist
    clusters = bow.tolist()
    group = clusters.index(min(clusters)) + 1
    # print(image_name + " in group: " + str(group))
    return [(image_name, group)]
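# A minimal usage sketch for the broadcast-based assign_pooling variant above (not part of the
# original source). It assumes a DataFrame 'features_df' with 'fileName' and 'features' columns,
# a trained 'kmeans_model', and an active SparkContext 'sc'; flatMap is used because the
# function returns a one-element list of (image_name, group) tuples.
centers_broadcast = sc.broadcast(kmeans_model.clusterCenters)
image_groups = features_df.rdd.flatMap(
    lambda row: assign_pooling(row, centers_broadcast, pooling="max")).collect()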
def assign_pooling(data):
    row = data[0]
    clusterCenters = data[1]
    pooling = data[2]
    image_name = row['fileName']
    feature_matrix = np.array(row['features'])
    model = KMeansModel(clusterCenters)
    bow = np.zeros(len(clusterCenters))
    for x in feature_matrix:
        k = model.predict(x)
        dist = distance.euclidean(clusterCenters[k], x)
        if pooling == "max":
            bow[k] = max(bow[k], dist)
        elif pooling == "sum":
            bow[k] = bow[k] + dist
    clusters = bow.tolist()
    group = clusters.index(min(clusters)) + 1
    return [image_name, group]
def train_rotations(sc, split_vecs, M, Cs):
    """
    Compute rotations for each split of the data using the given coarse quantizers.
    """
    Rs = []
    mus = []
    counts = []
    for split in range(2):
        print('Starting rotation fitting for split %d' % split)
        # Get the data for this split
        data = split_vecs.map(lambda x: x[split])
        # Get kmeans model
        model = KMeansModel(Cs[split])
        R, mu, count = compute_local_rotations(sc, data, model, M // 2)
        Rs.append(R)
        mus.append(mu)
        counts.append(count)
    return Rs, mus, counts
currTime = strftime("%Y-%m-%d-%H-%M-%S")
sc = SparkContext(appName="KMeans")
lines = sc.textFile("hdfs://masterNode:9000/user/spark/dataset_observatory/initial_centroids.csv")
dataset = sc.textFile("hdfs://masterNode:9000/user/spark/dataset_observatory/training_data.csv")
predict_data = sc.textFile("hdfs://masterNode:9000/user/spark/dataset_observatory/predict_data/Semestres/Semestre1-2016.csv")
average_per_year = average_year(lines)  # 2014 and 2015
average_per_month = average_month(average_per_year)
data = parseDataset(dataset)
k = int(sys.argv[1])
initial_centroids = generate_initial_centroids(average_per_month.collect(), k)

# KMeans
start = time()
kmeans_model = KMeans.train(data, k, maxIterations=100, initialModel=KMeansModel(initial_centroids))
end = time()
elapsed_time = end - start
kmeans_output = [
    "====================== KMeans ====================\n",
    "Final centers: " + str(kmeans_model.clusterCenters),
    "Total Cost: " + str(kmeans_model.computeCost(data)),
    "Value of K: " + str(k),
    "Elapsed time: %0.10f seconds." % elapsed_time
]

# Predicting
points = parseDataset(predict_data)
count_lines = float(len(points.collect()))
probabilities = generate_probabilities(points, k, kmeans_model, count_lines)
print("Prob: ", probabilities)
"hdfs://masterNode:9000/user/spark/dataset_observatory/predict_data/Semestres/Semestre1-2016.csv" ) average_per_year = average_year(lines) # 2014 and 2015 average_per_month = average_month(average_per_year) data = parseDataset(dataset) k = int(sys.argv[1]) initial_centroids = generate_initial_centroids(average_per_month.collect(), k) # KMeans start = time() kmeans_model = KMeans.train(data, k, maxIterations=100, initialModel=KMeansModel(initial_centroids)) end = time() elapsed_time = end - start kmeans_output = [ "====================== KMeans ====================\n", "Final centers: " + str(kmeans_model.clusterCenters), "Total Cost: " + str(kmeans_model.computeCost(data)), "Value of K: " + str(k), "Elapsed time: %0.10f seconds." % elapsed_time ] # Predicting points = parseDataset(predict_data) count_lines = float(len(points.collect())) probabilities = generate_probabilities(points, k, kmeans_model, count_lines)
from pyspark.mllib.clustering import KMeansModel
from pyspark.mllib.linalg import Vectors
from pyspark import SparkContext
from itertools import permutations
import csv
import operator

# Load the cluster centres saved by the training job (one bracketed vector per line)
centers = []
with open('/home/ronald/kmeansModel', 'r') as f:
    line = f.readline()
    while line:
        points = line[1:len(line) - 2].split(",")
        centers.append([float(i) for i in points])
        line = f.readline()

model = KMeansModel(centers)
modelCenters = model.clusterCenters

# Load the reference ("real") centres from CSV
realCenters = []
with open('/home/ronald/centers.csv', 'r') as f:
    csvReader = csv.DictReader(f)
    for row in csvReader:
        center = []
        for i in row:
            center.append(row[i])
        realCenters.append(Vectors.dense(center))

# Prepare to compare every permutation of the 8 centre indices against the reference centres
perm = list(permutations([i for i in range(8)]))
totalDist = []
for i in perm:
# Take the latitude and longitude columns for clustering.
rddLoc = rdd.map(lambda line: (float(line[20]), float(line[21])))
arr = []

# Find the optimum K value for the clusters using the elbow method.
for k in range(10, 160, 10):
    # Take a sample of the data as the initial centroids
    samplelist = sc.parallelize(rdd.take(k))
    list = samplelist.map(lambda line: (line[20], line[21])).collect()
    sample_centroidlist = np.array(list).astype('float')
    # The trained model gives the final centroids, which are used to evaluate the clustering.
    model = KMeans.train(rddLoc, k, maxIterations=30, initialModel=KMeansModel(sample_centroidlist))

    # Evaluate the clustering by computing the Within Set Sum of Squared Errors (WSSSE)
    def error(point):
        center = model.centers[model.predict(point)]
        return sqrt(sum([x ** 2 for x in (point - center)]))

    WSSSE = rddLoc.map(lambda point: error(point)).reduce(lambda x, y: x + y)
    arr.append(WSSSE)

print("Within Set Sum of Squared Error = ")
print(arr)
# Plot the WSSSE values against the corresponding K values. The optimum value of K is where
# there is an elbow in the curve; for this dataset the elbow was at K = 60.
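# A minimal plotting sketch for the elbow curve described above (not part of the original
# source). It assumes matplotlib is available and that 'arr' holds one WSSSE value per K.
import matplotlib.pyplot as plt

k_values = list(range(10, 160, 10))
plt.plot(k_values, arr, marker='o')
plt.xlabel('Number of clusters (K)')
plt.ylabel('Within Set Sum of Squared Errors')
plt.title('Elbow curve for choosing K')
plt.show()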