def test_kmeans_deterministic(self):
    from pyspark.mllib.clustering import KMeans
    X = range(0, 100, 10)
    Y = range(0, 100, 10)
    data = [[x, y] for x, y in zip(X, Y)]
    clusters1 = KMeans.train(self.sc.parallelize(data), 3,
                             initializationMode="k-means||", seed=42)
    clusters2 = KMeans.train(self.sc.parallelize(data), 3,
                             initializationMode="k-means||", seed=42)
    centers1 = clusters1.centers
    centers2 = clusters2.centers
    for c1, c2 in zip(centers1, centers2):
        # TODO: Allow small numeric difference.
        self.assertTrue(array_equal(c1, c2))
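A possible way to address the TODO above, sketched here rather than taken from the original test suite: compare the centers with a small numeric tolerance instead of exact equality (assumes numpy is imported as np).

    for c1, c2 in zip(centers1, centers2):
        # np.allclose tolerates tiny floating-point differences between runs
        self.assertTrue(np.allclose(c1, c2, atol=1e-6))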
def clusterKMeanSpark(matrix, k):
    m = transformInRealMatrix(matrix)
    sc = SparkContext(appName="Jsonizer: Remove stop words")
    parsedData = sc.parallelize(m)
    y = []
    x = []
    clustersControl = range(k, k + 1)
    for kc in clustersControl:
        clusters = KMeans.train(parsedData, kc, maxIterations=50000, runs=200,
                                initializationMode="k-means||", epsilon=0.0001)
        clu = []

        def error(point, clust):
            center = clust.centers[clust.predict(point)]
            return sqrt(sum([x ** 2 for x in (point - center)]))

        WSSSE = parsedData.map(lambda point: error(point, clusters)).reduce(lambda x, y: x + y)
        for n in m:
            clu += [clusters.predict(np.array(n))]
        x += [kc]
        y += [WSSSE]
        # print(kc, WSSSE)

    # plt.plot(x, y)
    # plt.ylabel('some numbers')
    # plt.show()
    ret = [[] for i in range(0, max(clu) + 1)]
    for i in range(0, len(clu)):
        ret[clu[i]] += [i]
    sc.stop()
    return ret
def train_subquantizers(sc, split_vecs, M, subquantizer_clusters, model, seed=None):
    """
    Project each data point into its local space and compute subquantizers by
    clustering each fine split of the locally projected data.
    """
    b = sc.broadcast(model)

    def project_local(x):
        x = np.concatenate(x)
        coarse = b.value.predict_coarse(x)
        return b.value.project(x, coarse)

    projected = split_vecs.map(project_local)

    # Split the vectors into the subvectors
    split_vecs = projected.map(lambda x: np.split(x, M))
    split_vecs.cache()

    subquantizers = []
    for split in xrange(M):
        data = split_vecs.map(lambda x: x[split])
        data.cache()
        sub = KMeans.train(data, subquantizer_clusters, initializationMode='random',
                           maxIterations=10, seed=seed)
        data.unpersist()
        subquantizers.append(np.vstack(sub.clusterCenters))

    return (subquantizers[:len(subquantizers) / 2], subquantizers[len(subquantizers) / 2:])
def main(sc):
    stopset = set(stopwords.words('english'))
    tweets = sc.textFile('hdfs:/adi/sample.txt')
    words = tweets.map(lambda word: word.split(" "))
    wordArr = []
    for wArr in words.collect():
        tempArr = []
        for w in wArr:
            if w not in stopset:
                tempArr.append(w)
        wordArr.append(tempArr)
    # print wordArr
    # tokens = sc.textFile("hdfs:/adi/tokens1.txt")

    # Load documents (one per line).
    documents = sc.textFile("hdfs:/adi/tokens1.txt").map(lambda line: line.split(" "))
    numDims = 100000
    hashingTF = HashingTF(numDims)
    tf = hashingTF.transform(documents)
    tf.cache()
    idf = IDF().fit(tf)
    tfidf = idf.transform(tf)
    tfidf.count()
    model = KMeans.train(tfidf, 5)
    model.save(sc, "tweetModel1")
    print("Final centers: " + str(model.clusterCenters))
    # print("Total Cost: " + str(model.computeCost(data)))
    sc.stop()
def main():
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)
    logger.info('Loading pickled noun to vector dictionary')
    # Load noun to vector dictionary
    with open(NOUN_TO_VECT_DICT_FILE_LOC, 'rb') as pickled:
        noun_to_vect_dict = pickle.load(pickled)
    # Create vector array from mapping
    vectors = np.array(noun_to_vect_dict.values())
    max_k = int(sqrt(len(vectors) / 2.0))
    # Define search space for k
    numbers_of_clusters = reversed(range(MIN_K, max_k))
    # For each k
    for i, k in enumerate(numbers_of_clusters):
        # Initialize Spark Context
        sc = ps.SparkContext()
        # Load data
        data = sc.parallelize(vectors, 1024)
        logger.info('Trial %i of %i, %i clusters', (i + 1), max_k - 1, k)
        # Calculate cluster
        kmeans_model = KMeans.train(data, k, maxIterations=10, runs=10,
                                    initializationMode='k-means||')
        logger.info('Calculating WSSSE')
        # Calculate WSSSE
        WSSSE = data.map(lambda point: error(kmeans_model, point)) \
                    .reduce(lambda x, y: x + y)
        logger.info('Writing WSSSE')
        # Write k and WSSSE
        with open(path.join(OUT_FILES_LOC, 'elbow_data.txt'), 'a') as elbow_data:
            elbow_data.write(str(k) + '\t' + str(WSSSE) + '\n')
        sc.stop()
def train_model(self, dataframe, k, model_name):
    '''
    Use the data to train a model.
    :param dataframe: all columns used for training
    :param k: the k value
    :param model_name: name under which to save the trained model
    :return: None
    '''
    data = self.prepare_data(dataframe)
    # train to get the model
    model = KMeans.train(data, k)
    # create the model saving path
    path = self.base + model_name
    # try to delete the old model if it exists
    try:
        import subprocess
        subprocess.call(["hadoop", "fs", "-rm", "-f", path])
    except:
        pass
    # save the new model on HDFS
    model.save(self.sc, path)
    # print all cluster centers of the model
    for c in model.clusterCenters:
        l = []
        for i in c:
            i = decimal.Decimal(i).quantize(decimal.Decimal('0.01'))
            l.append(float(i))
        print(l)
def main():
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)
    logger.info('Loading pickled noun to vector dictionary')
    # Load noun to vector dictionary
    with open(NOUN_TO_VECT_DICT_FILE_LOC, 'rb') as f:
        noun_to_vect_dict = pickle.load(f)
    # Create vectors array
    vectors = noun_to_vect_dict.values()
    # Initialize Spark Context
    sc = ps.SparkContext('local[*]')
    # Load data
    data = sc.parallelize(vectors, 1024)
    # Create and fit a KMeans model to the data
    logger.info('Fitting KMeans model')
    kmeans_model = KMeans.train(data, N_CLUSTERS, maxIterations=10, runs=10,
                                initializationMode='k-means||')
    # Create a list of labels corresponding to vectors
    logger.info('Labeling vectors')
    labels = [kmeans_model.predict(vector) for vector in vectors]
    # Write to text file
    logger.info('Writing labels to file')
    with open(path.join(OUT_FILE_LOC, 'labels.txt'), 'w') as f:
        for label in labels:
            f.write(str(label) + '\n')
def test_kmeans(self):
    from pyspark.mllib.clustering import KMeans
    data = [[0, 1.1], [0, 1.2], [1.1, 0], [1.2, 0]]
    clusters = KMeans.train(self.sc.parallelize(data), 2,
                            initializationMode="k-means||")
    self.assertEqual(clusters.predict(data[0]), clusters.predict(data[1]))
    self.assertEqual(clusters.predict(data[2]), clusters.predict(data[3]))
def fit(self, Z):
    """Compute k-means clustering.

    Parameters
    ----------
    Z : ArrayRDD or DictRDD containing array-like or sparse matrix
        Train data.

    Returns
    -------
    self
    """
    X = Z[:, 'X'] if isinstance(Z, DictRDD) else Z
    check_rdd(X, (np.ndarray, sp.spmatrix))
    if self.init == 'k-means||':
        self._mllib_model = MLlibKMeans.train(
            X.unblock(),
            self.n_clusters,
            maxIterations=self.max_iter,
            initializationMode="k-means||")
        self.cluster_centers_ = self._mllib_model.centers
    else:
        models = X.map(lambda X: super(SparkKMeans, self).fit(X))
        models = models.map(lambda model: model.cluster_centers_).collect()
        return super(SparkKMeans, self).fit(np.concatenate(models))
def kMeans(vecs, clusterNum):
    clusters = KMeans.train(vecs, clusterNum, maxIterations=10, runs=10,
                            initializationMode="random")

    if pv.outputDebugMsg:
        Utils.logMessage("\nKmean cluster finished")

    return clusters
def KMeansModel(dataPath, label, k, character, master):
    sc = SparkContext(master)
    data = sc.textFile(dataPath).map(lambda line: line.replace(character, ','))

    if label == 0:
        label_sum = data.map(lambda line: line.split(',')) \
                        .map(lambda data: (float(data[0]), 1)) \
                        .reduceByKey(add).collect()
        label = data.map(lambda line: line.split(',')) \
                    .map(lambda data: float(data[0])).collect()
        train_data = data.map(lambda line: line.split(',')) \
                         .map(lambda x: map(lambda part: float(part), x[1:len(x)]))
    else:
        label_sum = data.map(lambda line: line.split(',')) \
                        .map(lambda data: (float(data[-1]), 1)) \
                        .reduceByKey(add).collect()
        label = data.map(lambda line: line.split(',')) \
                    .map(lambda data: float(data[-1])).collect()
        train_data = data.map(lambda line: line.split(',')) \
                         .map(lambda x: map(lambda part: float(part) if part is not None else '',
                                            x[:len(x) - 1]))

    model = km.train(train_data, k)
    predict_data = train_data.collect()
    train = len(predict_data)
    acc = 0

    for i in range(len(label_sum)):
        ksum = np.zeros(k, dtype=int)
        cur_label = label_sum[i][0]
        for j in range(train):
            if label[j] == cur_label:
                ksum[model.predict(predict_data[j])] += 1
        acc += max(ksum)

    string = "KMeans Result: \n"
    center = model.centers
    for i in range(k):
        cur = str(i) + ":" + str(center[i]) + '\n'
        string += cur
    string = string + "Acc: " + str((float(acc) / train) * 100) + "%"
    sc.stop()
    return string
def kmeans(iterations, theRdd):
    # Note: the second positional argument to KMeans.train is the number of
    # clusters, so `iterations` here is actually the cluster count k.
    def error(point):
        center = clusters.centers[clusters.predict(point)]
        return sqrt(sum([x**2 for x in (point - center)]))

    clusters = KMeans.train(theRdd, iterations, maxIterations=10, runs=10,
                            initializationMode="random")

    WSSSE = theRdd.map(lambda point: error(point)).reduce(lambda x, y: x + y)
    return WSSSE, clusters
def main(arg1, arg2):
    sc = SparkContext(appName="KMeans")
    lines = sc.textFile(arg1)
    data = lines.map(parseVector)
    k = int(arg2)
    model = KMeans.train(data, k)
    print("Final centers: " + str(model.clusterCenters))
    print("Total Cost: " + str(model.computeCost(data)))
    sc.stop()
def spark_KMeans(train_data):
    maxIterations = 10
    runs = 20
    numClusters = [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14]
    errors = []
    for k in numClusters:
        model = KMeans.train(train_data, k, maxIterations=maxIterations, runs=runs,
                             initializationMode='random', seed=10,
                             initializationSteps=5, epsilon=1e-4)
        WSSSE = model.computeCost(train_data)
        errors.append(WSSSE)

    plt.plot(numClusters, errors, 'ro')
    plt.xlabel(r'k')
    plt.ylabel(r'inertia')
    plt.title(r'inertia v.s. k')
    plt.savefig('kmeans_cross_validation.png')

    bestModel = KMeans.train(train_data, 6, maxIterations=maxIterations, runs=runs,
                             initializationMode='random', seed=10,
                             initializationSteps=5, epsilon=1e-4)
    return bestModel
def cluster_data(sc, qc):
    drivers = read_file_path(BASE_PATH)
    print "Number of drivers: %d" % len(drivers)

    # Load and parse the data
    for i, dr in enumerate(drivers):
        # extract driver number from path
        dr_num = re.search("[0-9]+$", dr.strip())
        if dr_num:
            dr_num = dr_num.group(0)
            if dr_num == '1018':
                continue
        else:
            print 'driver number error for %s' % dr
            continue
        dr_data = sc.textFile("hdfs://" + dr + "/" + dr_num + "_all_trips.txt")
        data = dr_data.map(lambda row: [float(x) for x in row.split(',')])
        if i == 0:
            all_data = data
        else:
            all_data = all_data.union(data)
        data.unpersist()

    print 'Total number of records: %d' % all_data.count()

    # Build the model (cluster the data), k = number of clusters
    k = 5
    t = time()
    clusters = KMeans.train(all_data, k, maxIterations=100, runs=100,
                            initializationMode="random")
    print 'KMeans took %.2f seconds' % (time() - t)

    # Compute cost
    WSSSE_map = all_data.map(lambda point: error(point, clusters))

    # Join cluster ID to original data
    all_data_w_cluster = all_data.map(
        lambda point: np.hstack((point, get_cluster_id(clusters, point))))
    # all_data_w_cluster.saveAsTextFile("hdfs:///usr/local/spark/kmeans/results.txt")

    for i in xrange(0, k):
        subset = all_data_w_cluster.filter(lambda x: x[-1] == i)
        print "Number of items in cluster %d: %d" % (i, subset.count())

        # Compute functions on different features:
        all_features_average = subset.sum() / subset.count()
        print 'Average of all features'
        print all_features_average

    WSSSE = all_data.map(lambda point: error(point, clusters)).reduce(lambda x, y: x + y)
    print("Within set sum of squared error: " + str(WSSSE))
def test_clustering(self):
    from pyspark.mllib.clustering import KMeans
    data = [
        self.scipy_matrix(3, {1: 1.0}),
        self.scipy_matrix(3, {1: 1.1}),
        self.scipy_matrix(3, {2: 1.0}),
        self.scipy_matrix(3, {2: 1.1})
    ]
    clusters = KMeans.train(self.sc.parallelize(data), 2,
                            initializationMode="k-means||")
    self.assertEqual(clusters.predict(data[0]), clusters.predict(data[1]))
    self.assertEqual(clusters.predict(data[2]), clusters.predict(data[3]))
def k_means(loadTrainingFilePath, sc):
    # Load and parse the data
    loadTrainingFilePath = "../data/kmeans_data.txt"
    data = sc.textFile(loadTrainingFilePath)
    parsedData = data.map(lambda line: array([float(x) for x in line.split(' ')]))

    # Build the model (cluster the data)
    clusters = KMeans.train(parsedData, 3, maxIterations=10, runs=30,
                            initializationMode="random")

    # relies on a module-level error(point) helper that measures the distance
    # from a point to its nearest cluster center
    WSSSE = parsedData.map(lambda point: error(point)).reduce(lambda x, y: x + y)
    print("Within Set Sum of Squared Error = " + str(WSSSE))
def build_cluster_model(tfidf_vectors_rdd, num_clusters, max_iterations, runs):
    """Perform the clustering of vectors using K-means.

    Returns:
        K-means model learned from the training data in tfidf_vectors_rdd.
    """
    # Build the model (cluster the training data)
    return KMeans.train(tfidf_vectors_rdd, num_clusters,
                        maxIterations=max_iterations, runs=runs)
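A hedged usage sketch, not part of the original source; `tfidf_rdd` is a hypothetical RDD of numeric TF-IDF vectors:

# Cluster an existing TF-IDF RDD into 8 clusters and inspect the centers.
model = build_cluster_model(tfidf_rdd, num_clusters=8, max_iterations=20, runs=5)
print(model.clusterCenters)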
def main(noun_file_loc, model_file_loc, percent, n_trials, out_files_loc):
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)
    logger.info('Loading Word2Vec model')
    # Load trained Word2Vec model
    model = Word2Vec.load(model_file_loc)
    logger.info('Reading in list of nouns')
    # Read in list of sorted nouns, one per line
    sorted_nouns = []
    with open(noun_file_loc, 'r') as f:
        for line in f:
            sorted_nouns.append(line.strip())
    # Count number of nouns
    n_nouns = len(sorted_nouns)
    # Create dictionary to map nouns to vectors
    noun_to_vect_dict = {}
    # Calculate index to stop slice as percentage of total nouns
    n_nouns_to_keep = int(n_nouns * percent / 100.)
    logger.info('Keeping %i nouns, %i percent of %i',
                n_nouns_to_keep, percent, n_nouns)
    # Add nouns and vectors to dictionary
    for noun in sorted_nouns[0:n_nouns_to_keep]:
        noun_to_vect_dict[noun] = model[noun]
    vectors = np.array(noun_to_vect_dict.values())

    # Initialize Spark Context
    sc = ps.SparkContext('local[4]')
    # Load data
    data = sc.parallelize(vectors)
    # Define search space for k
    ns_clusters = [int(x) for x in np.linspace(2, n_nouns, n_trials)]
    # Open WSSSEs output file
    with open(path.join(out_files_loc, 'elbow_data.txt'), 'w') as elbow_data:
        # For each k
        for i, k in enumerate(ns_clusters):
            logger.info('Trial %i of %i, %i clusters', (i + 1), n_trials, k)
            # Calculate cluster
            kmeans_model = KMeans.train(data, k, maxIterations=10, runs=10,
                                        initializationMode='k-means||')
            # Calculate WSSSE
            WSSSE = data.map(lambda point: error(kmeans_model, point)) \
                        .reduce(lambda x, y: x + y)
            # Save centroids
            with open(path.join(out_files_loc, '_%i.pkl' % k), 'w') as f:
                pickle.dump(kmeans_model.clusterCenters, f)
            # Write k and WSSSE
            elbow_data.write('%i, %f\n' % (k, WSSSE))
def main():
    sc = SparkContext()
    filename = sys.argv[1]
    clusters = int(sys.argv[2])
    outmodelname = sys.argv[3]

    dataset = gdal.Open(filename, GA_ReadOnly)
    driver = dataset.GetDriver().ShortName
    x, y, data = tiff_to_array(dataset, weights)
    print "after change to array"

    clusterdata = sc.parallelize(data)
    print "parallelize done"
    kmeanmodel = KMeans.train(clusterdata, clusters, maxIterations=50, runs=10)
    kmeanmodel.save(sc, outmodelname)
    print kmeanmodel.clusterCenters
def train_coarse(sc, split_vecs, V, seed=None):
    """
    Perform KMeans on each split of the data with V clusters each.
    """
    # Cluster first split
    first = split_vecs.map(lambda x: x[0])
    first.cache()
    print 'Total training set size: %d' % first.count()
    print 'Starting training coarse quantizer...'
    C0 = KMeans.train(first, V, initializationMode='random', maxIterations=10, seed=seed)
    print '... done training coarse quantizer.'
    first.unpersist()

    # Cluster second split
    second = split_vecs.map(lambda x: x[1])
    second.cache()
    print 'Starting training coarse quantizer...'
    C1 = KMeans.train(second, V, initializationMode='random', maxIterations=10, seed=seed)
    print '... done training coarse quantizer.'
    second.unpersist()

    return np.vstack(C0.clusterCenters), np.vstack(C1.clusterCenters)
def kmeans(k=2):
    """kmeans"""
    # Load and parse training data
    data = getTrainData(dataFilename)
    parsedData = data.map(lambda line: array([float(x) for x in line.split(' ')]))  # pyspark.rdd.PipelinedRDD

    # Build the model (cluster the data)
    # KMeans.train(cls, data, k, maxIterations=100, runs=1, initializationMode="k-means||")
    clf = KMeans.train(parsedData, k, maxIterations=10, runs=10,
                       initializationMode="random")  # pyspark.mllib.clustering.KMeansModel

    WSSSE = parsedData.map(lambda point: error(point, clf)).reduce(lambda x, y: x + y)  # float
    print("Within Set Sum of Squared Error = " + str(WSSSE))
    print "### cluster centers ###:"
    print clf.centers
    return clf
def bagofwords(imtrain, imtest=None, features=_features, outdir=None):
    cache = Cache(cacheroot=outdir)

    # Unique labels
    labels = imtrain.map(lambda x: x.category).distinct().collect()
    print labels

    # Features: each returns a row array of features
    X = imtrain.map(features)

    # Clustering: k-means clustering to generate words
    # http://spark.apache.org/docs/0.9.0/mllib-guide.html
    model = KMeans.train(X, 2, maxIterations=10, runs=30, initializationMode='random')

    # Construct bag-of-words representation
    print model.clusterCenters
def kmeans_CSV():
    try:
        # creating a parsedData RDD to do kmeans on
        servernum = sys.argv[1]
        serverpath = "hdfs://10.0.0.4:8020/opentsdb/" + servernum
        print "Attempting to create SparkContext"
        sconf = SparkConf().setAppName("Kmeans for files")
        print "Sconf set..."
        sc = SparkContext(conf=sconf)
        print "SparkContext created"

        # making parsedData RDD: [ array([filewrites, filereads, CPU, diskIOBW, net bytes]), array([...]), ... ]
        # kmeans iteratively passes over the data multiple times - cache parsedData
        if len(sys.argv) == 2:
            # user just specified server - do full server kmeans
            filepaths = get_file_paths(serverpath)  # array of string file paths to all files within the folder
            parsedData = compile_RDD(sc, filepaths).cache()
            CSV_filename = make_name(filepaths) + "_" + servernum
        elif len(sys.argv) == 3:
            # user put in server and single timeframe - do single file kmeans
            timeframe = sys.argv[2]  # ex: 2014-07-09
            filepaths = get_singlefile_path(timeframe, serverpath)
            parsedData = compile_RDD(sc, filepaths).cache()
            CSV_filename = str(timeframe) + "_" + servernum
        else:
            # user put in server and start/end timeframe - do timeframe kmeans
            start_timeframe = sys.argv[2]
            end_timeframe = sys.argv[3]
            filepaths = get_timeframefile_paths(start_timeframe, end_timeframe, serverpath)
            parsedData = compile_RDD(sc, filepaths).cache()
            CSV_filename = make_name(filepaths) + "_" + servernum

        k = findk(parsedData.count())
        clusters = KMeans.train(parsedData, k, maxIterations=10, runs=10,
                                initializationMode="random")
        centers = clusters.clusterCenters

        # Creating two CSVs (one has data points, one has centers) for later visualization
        compile_CSV(CSV_filename, parsedData)
        compile_centers_CSV(CSV_filename, centers)
        print "SUCCESS: Kmeans done"
    except:
        print "---------------------------------"
        print "Usage: ./bin/spark-submit kmeans_CSV.py <servername> <start_timeframe> <end_timeframe>"
        print "<servername> must be specified. EX: sense0"
        print "Timeframes are optional. Specify just one timeframe for single file kmeans. Specify start and end for kmeans over a timeframe."
        print "Timeframes must be in format yyyy-mm-DD"
        print "---------------------------------"
        raise
def choose_k(sub_df):
    wssse_list = []
    for i in range(1, 11):
        clusters = KMeans.train(sub_df, i, maxIterations=10,
                                initializationMode="random")
        # sd (per-feature standard deviation) is assumed defined in the enclosing scope
        WSSSE = sub_df.map(lambda point: sqrt(
            sum([x**2 for x in (point - clusters.centers[clusters.predict(point)]) / sd])
        )).reduce(add)
        wssse_list.append(WSSSE)

    # relative drop in WSSSE between consecutive k values
    # (zip over wssse_list[:-1] pairs each value with its successor)
    wssse_minus = [(x - y) / (x + 0.001)
                   for x, y in zip(wssse_list[:-1], wssse_list[1:])]
    zipped = zip(range(1, 10), np.abs(wssse_minus))
    k = sorted(zipped, key=itemgetter(1), reverse=True)[0][0]
    return k
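A hedged usage sketch, not from the original source: choose_k expects an RDD of numeric vectors, and the free variable `sd` must already exist where choose_k is defined. `vectors_rdd` is a hypothetical RDD.

# Hypothetical setup around choose_k: define sd, then pick k.
sd = np.std(np.array(vectors_rdd.collect()), axis=0)  # per-feature std dev
best_k = choose_k(vectors_rdd)
print('Chosen k: %d' % best_k)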
def test_kmeans(self):
    from pyspark.mllib.clustering import KMeans
    data = [
        [0, 1.1],
        [0, 1.2],
        [1.1, 0],
        [1.2, 0],
    ]
    clusters = KMeans.train(
        self.sc.parallelize(data),
        2,
        initializationMode="k-means||",
        initializationSteps=7,
        epsilon=1e-4,
    )
    self.assertEqual(clusters.predict(data[0]), clusters.predict(data[1]))
    self.assertEqual(clusters.predict(data[2]), clusters.predict(data[3]))
def get_cluster_ids(data_to_cluster, K):
    """Gets cluster ids for the data to be clustered."""
    # print('Training KMeans')
    model = KMeans.train(data_to_cluster, K, maxIterations=10,
                         initializationMode='random')
    # print('Finished Training')
    # print('Predicting Cluster IDs')
    cluster_ids = model.predict(data_to_cluster)
    # print('Finished Prediction')
    # print('10 Sample Cluster IDs:')
    # print(cluster_ids.takeSample(False, 10))
    # print('10 Sample Data (nutrient-nutrient) Clusters')
    # print(data_to_cluster.takeSample(False, 10))
    # print('Fetched Cluster IDs')
    return cluster_ids, data_to_cluster
def __kmeans_clustering(self):
    # get the whole population without fitness values, then flatten it
    rdd_aux = self.__rdd.flatMap(lambda x: x.get_population(fitness=False))

    # train the kmeans
    kmeans_cluster = KMeans.train(
        rdd_aux,
        self.__colonies,
        maxIterations=self.__cluster_iterations,
        initializationMode="random")  # "k-means||" is also accepted

    # create a new rdd with the labels
    rdd_labels = kmeans_cluster.predict(rdd_aux)

    # zip each result with its class
    rdd_aux = rdd_labels.zip(rdd_aux)

    # input serialization
    cols = self.__colonies
    self.__sc.broadcast(cols)

    # divide into partitions
    rdd_aux = rdd_aux.partitionBy(cols, partitionFunc=lambda x: x).glom()

    # remove the index of each element
    rdd_aux = rdd_aux.map(lambda x: [y[1] for y in x])

    # input serialization
    evaluation = self.__evaluation
    generation = self.__generation
    cross = self.__cross
    mutation = self.__mutation
    selection = self.__selection
    survival = self.__survival
    mut_ratio = self.__mut_ratio
    survival_ratio = self.__survival_ratio
    control_obj = self.__control_obj

    # create the new colonies
    self.__rdd = rdd_aux.map(
        lambda x: Colony(evaluation, generation, cross=cross, mutation=mutation,
                         selection=selection, mut_ratio=mut_ratio,
                         survival_ratio=survival_ratio, survival=survival,
                         control_obj=control_obj, population=x))
def detect(self, k, t):
    # Encoding categorical features using one-hot.
    df1 = self.cat2Num(self.rawDF, [0, 1]).cache()
    df1.show(n=2, truncate=False)

    # Clustering points using KMeans
    features = df1.select("features").rdd.map(lambda row: row[0]).cache()
    model = KMeans.train(features, k, maxIterations=40, runs=10,
                         initializationMode="random", seed=20)

    # Adding the prediction column to df1
    modelBC = sparkCt.broadcast(model)
    predictUDF = udf(lambda x: modelBC.value.predict(x), StringType())
    df2 = df1.withColumn("prediction", predictUDF(df1.features)).cache()
    df2.show(n=3, truncate=False)

    # Adding the score column to df2; the higher the score, the more likely it is an anomaly
    df3 = self.addScore(df2).cache()
    df3.show(n=3, truncate=False)

    return df3.where(df3.score > t)
def get_clusters(data_rdd, num_clusters=NUM_CLUSTERS, max_iterations=MAX_ITERATIONS,
                 initialization_mode=INITIALIZATION_MODE, seed=SEED):
    # TODO:
    # Use the given data and the cluster parameters to train a K-Means model.
    # Find the cluster id corresponding to each data point (a car).
    # Return a list of lists of the titles which belong to the same cluster.
    # For example, if the output is [["Mercedes", "Audi"], ["Honda", "Hyundai"]],
    # then "Mercedes" and "Audi" should have the same cluster id, and "Honda"
    # and "Hyundai" should have the same cluster id.
    features = data_rdd.map(lambda line: array([float(x) for x in line.split(',')[1:]]))
    clusters = KMeans.train(features, num_clusters, maxIterations=max_iterations,
                            initializationMode=initialization_mode, seed=seed)
    res = data_rdd.map(lambda line: (
        clusters.predict(array([float(x) for x in line.split(',')[1:]])),
        [line.split(',')[0]])).reduceByKey(lambda a, b: a + b)

    result = [[]]
    res = res.collect()
    for c in res:
        result.append(c[1])
    if [] in result:
        result.remove([])
    return result
def calculate_wssse(data_to_cluster):
    """Calculates the Within Set Sum of Squared Error (WSSSE)."""
    K = []
    wssse = []
    for k in range(2, 12):
        print('Computing WSSSE for {}'.format(k))
        K.append(k)
        model = KMeans.train(data_to_cluster, k, maxIterations=10,
                             initializationMode='random')
        wssse_value = model.computeCost(data_to_cluster)
        wssse.append(wssse_value)
    # Plot the WSSSE for different values of k
    plt.plot(K, wssse)
    plt.show()
def detect(self, k, t):
    # Encoding categorical features using one-hot.
    df1 = self.cat2Num(self.rawDF, [0, 1])
    df1.show()

    # Clustering points using KMeans
    features = df1.select("features").rdd.map(lambda row: row[0]).cache()
    model = KMeans.train(features, k, maxIterations=40, runs=10,
                         initializationMode="random", seed=20)

    # Adding the prediction column to df1
    modelBC = sc.broadcast(model)
    predictUDF = udf(lambda x: modelBC.value.predict(x), StringType())
    df2 = df1.withColumn("prediction", predictUDF(df1.features))
    df2.show()

    # Adding the score column to df2; the higher the score, the more likely it is an anomaly
    df3 = self.addScore(df2)
    df3.show()

    return df3.where(df3.score > t)
def run():
    # Set up
    sc = SparkContext()
    records = sc.textFile(os.path.realpath(__file__ + '/..') + '/data-scraper/data')

    # Build clusters
    kvpairs = records.map(keyAndParse)
    cts = kvpairs.groupByKey().map(lambda (name, statList): (name, len(statList))).collectAsMap()
    kvpairs = kvpairs.reduceByKey(combine)

    # Filter outliers with too few records
    kvpairs = kvpairs.filter(lambda (k, v): cts[k] > 2)
    kvpairs = kvpairs.map(lambda (name, statline): (name, normalize(statline, cts[name])))

    numClusters = 20
    clusters = KMeans.train(kvpairs.map(lambda (k, v): v), numClusters, 10)
    groupedClusters = kvpairs.groupBy(lambda (k, v): clusters.predict(v)) \
                             .map(lambda x: (x[0], getNames(list(x[1])))).collect()

    # Rank clusters
    centers = avg(clusters.clusterCenters)
    centers.sort(key=lambda x: x['score'], reverse=True)

    # Save
    save(groupedClusters, centers)
def cluster(filename, k, indices):
    stat, data = format_input("input/" + filename, indices)
    model = KMeans.train(data, k)
    # print(model)
    # pickle.dump(model, open(filename + ".p", "wb"))
    P = dict()
    cluster_centers = model.clusterCenters
    with file("output/" + filename, "w") as f:
        for x in stat.collect():
            name = str(x[0])
            num = str(model.predict(x[1]))
            centers = str(' '.join(
                '{:.3f}'.format(i) for i in cluster_centers[model.predict(x[1])]))
            P[name] = num
            f.write(name + "," + num + "\n")
    pickle.dump(P, open(filename + ".p", "wb"))
def kmeans_demo(self):
    file = self.sc.textFile(self.base + 'k_data.csv')
    # transform each CSV line into a numeric vector
    data = file.map(lambda line: [float(x) for x in line.split(',')]).cache()
    print(type(data))
    # train on the data to get the model
    model = KMeans.train(data, k=3)
    # print all cluster centers to check them
    cluster = model.clusterCenters
    for c in cluster:
        print(c)
    # predict which cluster a new point belongs to (returns the cluster index)
    predict = model.predict([1.3, .1, 1.1])
    print(predict)
def clustering_score(data, k):
    model = KMeans.train(data, k=k, maxIterations=200)

    def distance(v1, v2):
        s = 0
        # [1,2,3], [4,5,6] --> [(1,4), (2,5), (3,6)]
        pairs = zip(v1, v2)
        for p in pairs:
            sub = float(p[0]) - float(p[1])
            s = s + sub * sub
        return math.sqrt(s)

    def dist_to_centroid(datum):
        # predict the datum's cluster
        cluster = model.predict(datum)
        # get that cluster's centroid (center point)
        centroid = model.clusterCenters[cluster]
        # distance from the datum to its centroid
        return distance(centroid, datum)

    return data.map(dist_to_centroid).mean()
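A hedged usage sketch, not from the original source; `parsed_rdd` is a hypothetical RDD of numeric feature vectors:

# Scan candidate k values and print the mean distance to centroid for each,
# using clustering_score defined above.
for k in range(2, 11):
    score = clustering_score(parsed_rdd, k)
    print('k=%d, mean distance to centroid: %.4f' % (k, score))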
def build_classifier(self, dataset, kmeans_dataset, feature_keys):
    self.logger.info('building classifier')

    kmeans_train_set = []
    for item in kmeans_dataset:
        features = [item[column] for column in feature_keys]
        kmeans_train_set.append(array(features))
    self.logger.debug("kmeans_train_set %d", len(kmeans_train_set))

    kmeans_train_set = sc.parallelize(kmeans_train_set)
    clusters = KMeans.train(kmeans_train_set, 100, maxIterations=500, runs=10,
                            initializationMode="random")
    del kmeans_dataset
    del kmeans_train_set

    data = []
    for item in dataset:
        features = [item[column] for column in feature_keys]
        data.append(LabeledPoint(int(item['classifier_label']), features))
    del dataset

    data = sc.parallelize(data)
    (trainingData, testData) = data.randomSplit([0.7, 0.3])
    del data

    model = RandomForest.trainClassifier(
        trainingData,
        numClasses=self.total_splits,
        categoricalFeaturesInfo={},
        numTrees=self.rfc_config['num_trees'],
        featureSubsetStrategy=self.rfr_config['feature_subset_strategy'],  # "all"
        impurity='gini',
        maxDepth=self.rfc_config['max_depth'],
        maxBins=32)

    predictions = model.predict(testData.map(lambda x: x.features))
    labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
    testErr = labelsAndPredictions.filter(
        lambda (v, p): v != p).count() / float(testData.count())

    self.logger.info('classifier build finished')
    return model, clusters, testErr
def kmeans_w2v():
    df_path = "hdfs:///user/rmusters/lambert_w2v_data_jan"
    df = sqlContext.read.parquet(df_path)
    data = df.select("vectors")
    parsedData = data.dropna().map(lambda line: line[0])

    errors = []
    cluster_sizes = []
    for n_clusters in range(10, 1000, 50):
        # Build the model (cluster the data)
        clusters = KMeans.train(parsedData, n_clusters, maxIterations=10, runs=10,
                                initializationMode="random")

        # Evaluate clustering by computing Within Set Sum of Squared Errors
        def error(point):
            center = clusters.centers[clusters.predict(point)]
            return sqrt(sum([x**2 for x in (point - center)]))

        WSSSE = parsedData.map(lambda point: error(point)).reduce(lambda x, y: x + y)
        errors.append(WSSSE)
        cluster_sizes.append(n_clusters)
        logger.info("Within Set Sum of Squared Error = " + str(n_clusters) + "&" + str(WSSSE))

        # Save the model at this cluster size
        if n_clusters == 520:
            clusters.save(sc, "hdfs:///user/rmusters/lambert_kmeans_w2v_jan")

    df = sc.parallelize(errors).map(lambda x: (x,)).toDF().withColumnRenamed("_1", "error")
    df2 = sc.parallelize(cluster_sizes).map(lambda x: (x,)).toDF().withColumnRenamed("_1", "n_cluster")
    res = df.join(df2).dropDuplicates(["n_cluster"])
    res.write.format("com.databricks.spark.csv").mode("overwrite").save("errors_kmeans.csv")
def main():
    # set up environment
    conf = SparkConf() \
        .setAppName("kMeans") \
        .set("spark.executor.memory", "2g")
    sc = SparkContext(conf=conf)

    # Load and parse the data
    data = sc.textFile("data/kmeans_data.txt")
    parsedData = data.map(
        lambda line: array([float(x) for x in line.split(' ')]))

    # Build the model (cluster the data)
    clusters = KMeans.train(parsedData, 2, maxIterations=10,
                            runs=10, initializationMode="random")

    WSSSE = parsedData.map(lambda point: error(clusters, point)) \
                      .reduce(lambda x, y: x + y)
    print("Within Set Sum of Squared Error = " + str(WSSSE))
def kmeans_train(self, data_rdd, n_clusters):
    """Train the model, searching a small hyper-parameter grid."""
    data_splits = data_rdd.randomSplit([.50, .25, .25], seed=0)
    training_set = data_splits[0].repartition(numPartitions=4).cache()
    validation_set = data_splits[1].repartition(numPartitions=4).cache()
    test_set = data_splits[2].repartition(numPartitions=4).cache()

    max_iter_arr = [50, 60, 80]
    max_runs = [50, 60, 80]
    k_list = n_clusters
    best_model = None
    best_rmse = float("inf")
    best_run = 0
    best_iter = 0
    best_k = 0

    for itera, run, k in itertools.product(max_iter_arr, max_runs, k_list):
        try:
            model = KMeans.train(training_set, k, itera, run, "random")
            validation_rmse = model.computeCost(validation_set)
            print("# of clusters k %d\n" % (k))
            if validation_rmse < best_rmse:
                best_model = model
                best_rmse = validation_rmse
                best_run = run
                best_iter = itera
                best_k = k
        except Exception as e:
            print(e)
            continue

    # test_preds = best_model.predict(test_set.first())
    print("K-means results...")
    print(str(best_rmse))
    print(str(best_k))
    return best_model
def main():
    # Reading the json file
    reviews_data = sqlContext.read.json(input)
    reviews = reviews_data.select('reviewText')
    rdd_data = reviews.rdd.map(lambda line: str(line.reviewText))
    transformed_data = rdd_data.map(transform_data).cache()

    # Transforming the words
    model = word2vec.fit(transformed_data)

    # Finding distinct words
    unique_words = transformed_data.flatMap(lambda l: l).map(lambda l: str(l)).distinct()
    # print unique_words.collect()
    dict1 = {}
    for a in unique_words.collect():
        try:
            dict1[a] = model.transform(a)
        except Exception:
            pass

    # Saving the word2vec model
    pickle.dump(dict1, open(output + '/output_vector_sample.txt', "wb"))
    # dict2 = pickle.load(open(output + '/output4.txt', "rb"))

    # Finding synonyms
    # synonyms = model.findSynonyms('happy', 10)
    # print synonyms

    feature_vectors = dict1.values()
    feature_vectors_rdd = sc.parallelize(feature_vectors)
    clusters = KMeans.train(feature_vectors_rdd, 2000, maxIterations=1, runs=1,
                            initializationMode="random")
    # WSSSE = feature_vectors_rdd.map(lambda point: error(clusters, point)).reduce(lambda x, y: x + y)
    # print("Within Set Sum of Squared Error = " + str(WSSSE))

    cluster_predictions = {}
    for key in dict1.keys():
        cluster_predictions[key] = clusters.predict(dict1[key])

    # Saving the word-to-cluster-index model
    pickle.dump(cluster_predictions, open(output + '/cluster_data.txt', "wb"))
def kmeans_check(self, T, k=3, normalize=True):
    '''
    :param T: threshold on the normalized distance to the cluster center;
        if T < 1, at least one vector in each group would be removed
    :return: outlier list
    '''
    if not self.transform:
        self.column_datatype(self._column)
    trans_df = self._df.select(self._column).rdd.map(lambda x: np.array(x))
    clusters = KMeans.train(trans_df.map(lambda x: x[:-1]), k, maxIterations=10,
                            runs=1, initializationMode='random')
    # maximum distance to the cluster center within each group
    maxIngroup = trans_df.map(lambda x: (clusters.predict(x[:-1]),
                                         np.linalg.norm(clusters.centers[clusters.predict(x[:-1])] - x[:-1]))) \
                         .reduceByKey(lambda x, y: x if x > y else y).collect()
    maxIngroup = sorted(maxIngroup)
    # distance of each row to its cluster center, normalized by the group maximum
    distForAll = trans_df.map(lambda x: (x[-1],
                                         np.linalg.norm(clusters.centers[clusters.predict(x[:-1])] - x[:-1]) /
                                         maxIngroup[clusters.predict(x[:-1])][1]))
    outlier_index = distForAll.filter(lambda x: x[1] > T).map(lambda x: int(x[0])).collect()
    print('Around %.2f of rows are outliers.' % (len(outlier_index) / self.rownum))
    self.transform = False
    return outlier_index
def my_test(sc, util, data):
    dat = tcg.tc_gen(100)
    train_data = [np.array(sf.softmax(x)) for x in dat]
    clusters = KMeans.train(sc.parallelize(train_data), 20, maxIterations=10,
                            initializationMode="random")

    def error(point):
        center = clusters.centers[clusters.predict(point)]
        return sqrt(sum([x**2 for x in (point - center)]))

    WSSSE = map(lambda point: error(point), train_data)
    WSSSE = reduce(lambda x, y: x + y, WSSSE)
    print("Within Set Sum of Squared Error = " + str(WSSSE))

    clustered = collections.defaultdict(list)
    for i, point in enumerate(train_data):
        clustered[clusters.predict(point)].append(dat[i][0])

    # print len(train_data)
    print clustered.keys()
    return clustered
def kmeans_model(file_path, file_out):
    global SPARK_MASTER
    conf = pyspark.SparkConf()
    conf.setMaster(SPARK_MASTER)
    # conf.setSparkHome('/usr/local/spark')
    print file_path
    print conf.getAll()
    sc = pyspark.SparkContext(conf=conf)
    # print sc.pythonExec
    # print sc.pythonVer

    textfile = sc.textFile(file_path)
    print textfile.collect()
    print textfile.count()

    y = textfile.map(lambda each: each.split(' ')[1:])
    p = re.compile('\d:')
    z = y.map(lambda x: transform(x, p))
    z = z.map(lambda x: [float(each) for each in x])
    print z.collect()

    model = KMeans.train(z, 2)
    print model.clusterCenters
    # textfile.saveAsTextFile(file_out)
    model.save(sc, file_out)
    sc.stop()
rdd_split_int = rdd_split.map(lambda x: [int(x[0]), int(x[1])])

## Count the number of rows in the RDD
print("There are {} rows in the rdd_split_int dataset".format(rdd_split_int.count()))

##### K-Means Training #####

## Error function
def error(point):
    center = model.centers[model.predict(point)]
    return sqrt(sum([x ** 2 for x in (point - center)]))

## Train the model with cluster counts from 1 to 20 and compute WSSSE
clusters_wssse = []
for clst in range(1, 21):
    model = KMeans.train(rdd_split_int, clst, seed=1)
    WSSSE = rdd_split_int.map(lambda point: error(point)).reduce(lambda x, y: x + y)
    clusters_wssse.append([clst, WSSSE])
    print("The cluster {} has Within Set Sum of Squared Error {}".format(clst, WSSSE))

## Train the model again with the best k
model = KMeans.train(rdd_split_int, k=15, seed=1)

## Get cluster centers
cluster_centers = model.clusterCenters

##### Visualize the Clusters #####

## Convert rdd_split_int RDD into a Spark DataFrame
rdd_split_int_df = spark.createDataFrame(rdd_split_int, schema=["col1", "col2"])

## Convert the Spark DataFrame into a Pandas DataFrame
# iris.csv is from https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data
# this file is uploaded to the S3 bucket used in sc.textFile() below
# FORMAT:
# 5.1,3.5,1.4,0.2,setosa
# 4.9,3.0,1.4,0.2,setosa
# 4.7,3.2,1.3,0.2,setosa
# 4.6,3.1,1.5,0.2,setosa

sc = SparkContext()
# data = sc.textFile("s3://com.lifetech.ampliseq.dev.transfer/iris.csv")  # publicly accessible file
data = sc.textFile(input_file)  # local file or publicly accessible file in S3
p = data.map(lambda line: array([float(x) for x in line.split(',')[0:4]]))

# print RDD
for x in p.collect():
    print x

clusters = KMeans.train(p, 3, maxIterations=100, initializationMode="random")

# Get within-cluster sum-of-squares
def error(point):
    center = clusters.centers[clusters.predict(point)]
    return sqrt(sum([x**2 for x in (point - center)]))

WSSSE = p.map(lambda point: error(point)).reduce(lambda x, y: x + y)
print("Within Set Sum of Squared Error (WSSSE) = " + str(WSSSE))
#*******************************************************************************************************#
if __name__ == "__main__":
    if len(sys.argv) != 1:
        print("Incorrect number of arguments", file=sys.stderr)
        exit(-1)

    #********* Initiating the Spark context for the KMeans application *****************#
    sc = SparkContext(appName="KMeansApp")

    #************ Partitioning the data into an RDD with .textFile and parsing it ***************#
    inplines = sc.textFile('input.csv')
    inpdata = inplines.map(ParseAndDrop)

    #********** k defines the number of clusters ********************************************#
    k = int(2)
    model = KMeans.train(inpdata, k)

    #*************** Reading the data points for calculating the accuracy and testing the trained model **********#
    with open('Input.csv') as file:
        rows = file.readlines()

    #************* Initialization of the lists *******************************************#
    DProw = []
    PredVal_arrays = []
    ActVal_arrays = []
    #***********************************************************************************#
    for row in rows:
        row = row.rstrip("\n")
        #*********************** Initializing an output list *************************************#
        Out = []
# Path to log input file
logFile = "/user/root/src/Project - Developer - apache-access-log (4).txt.gz"

# Read the log text file and parse it based on the Apache log standard
parsed_logs, access_logs = parseLogs(sc, logFile)

# Process data into feature columns to be used in training
df4 = dataProcessing(access_logs)
df4.show()

# Format DataFrame rows into dense vectors for MLlib K-means clustering
data7 = df4.rdd.map(lambda row: Vectors.dense(row[2], row[3]))
data7.cache()

# Train the k-means model
kmeans = KMeans.train(data7, 3, 10)

# Print the centers to check
centers = kmeans.clusterCenters
for center in centers:
    print(center)

WSSSE = data7.map(lambda point: error(point, kmeans)).reduce(lambda x, y: x + y)
print("Within Set Sum of Squared Error = " + str(WSSSE))

# Convert the DataFrame to an RDD to add cluster predictions
rowsRDD = df4.rdd.map(lambda r: (r[0], r[1], r[2], r[3], r[4]))
rowsRDD.cache()
predictions = rowsRDD.map(lambda r: (r[0], r[1], r[2], r[3], r[4],
                                     kmeans.predict(Vectors.dense(r[2], r[3]))))
predDF = predictions.toDF()
predDF.show()
# Spark initialization
conf = pyspark.SparkConf().setAppName("kmeans").setMaster("local")
sc = pyspark.SparkContext(conf=conf)

# Argument parsing
file_name = sys.argv[1]
k = int(sys.argv[2])
output_file_name = sys.argv[3]

# Initialization
points = sc.textFile(file_name).map(lambda x: x.split(" ")) \
           .map(lambda (x, y): (float(x), float(y)))

clusters = KMeans.train(points, k, maxIterations=100, initializationMode="k-means||")

points.map(lambda x: "{0} {1} {2}".format(clusters.predict(x), x[0], x[1])) \
      .saveAsTextFile(output_file_name)
write_centroids(clusters.centers, os.path.join(output_file_name, "centroids_final.txt"))

def error(point):
    center = clusters.centers[clusters.predict(point)]
    return sqrt(sum([x**2 for x in (point - center)]))

wsse = points.map(lambda point: error(point)).reduce(lambda x, y: x + y)
from pyspark import SparkContext
from pyspark.mllib.clustering import KMeans
from numpy import array
from math import sqrt

sc = SparkContext()

# 4 data points: (0.0, 0.0), (1.0, 1.0), (9.0, 8.0), (8.0, 9.0)
data = array([0.0, 0.0, 1.0, 1.0, 9.0, 8.0, 8.0, 9.0]).reshape(4, 2)

# Generate the k-means model
model = KMeans.train(sc.parallelize(data), 2, maxIterations=10, runs=30,
                     initializationMode="random")

# Print out the cluster of each data point
print(model.predict(array([0.0, 0.0])))
print(model.predict(array([1.0, 1.0])))
print(model.predict(array([8.0, 0.0])))
print(model.predict(array([9.0, 8.0])))
print(model.predict(array([8.0, 9.0])))
print(model.predict(array([8.0, 7.0])))
# NOT MY CODE, modified from the Apache Spark Python example of MLlib - Clustering
# http://spark.apache.org/docs/latest/mllib-clustering.html
###############################################################################

from pyspark.mllib.clustering import KMeans, KMeansModel
from numpy import array
#from math import sqrt

# Load and parse the data
#fileName = "data/mllib/kmeans_data.txt"
fileName = "data.txt"
data = sc.textFile(fileName, 8)  # partition count goes here
parsedData = data.map(lambda line: array([float(x) for x in line.split(' ')])).cache()

# Build the models with different seeders: random or farthest spots
c1_clusters = KMeans.train(parsedData, 10, maxIterations=20, runs=1,
                           initializationMode="random")
c2_clusters = KMeans.train(parsedData, 10, maxIterations=20, runs=1,
                           initializationMode='k-means||')
#c1_initials = sc.textFile('c1.txt').map(lambda line: array([float(x) for x in line.split(' ')]))
#c1_preset_clusters = KMeans.train(parsedData, 10, maxIterations=20, initialModel=c1_initials)  # new parameter in Spark v1.5.0

# Evaluate clustering by computing Within Set Sum of Squared Errors
def error(point, model):
    center = model.centers[model.predict(point)]
    return sum([x**2 for x in (point - center)])**0.5

def wssse(dataRDD, model):
    return dataRDD.map(lambda point: error(point, model)).reduce(lambda x, y: x + y)

c1_WSSSE = wssse(parsedData, c1_clusters)
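As a hedged follow-up, not in the original snippet: the same metric can be computed for the k-means|| model so the two initialization modes can be compared.

# compute WSSSE for the k-means|| model as well (follow-up, not original code)
c2_WSSSE = wssse(parsedData, c2_clusters)
print("WSSSE random init: %f, k-means|| init: %f" % (c1_WSSSE, c2_WSSSE))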
# Convert to a distributed row matrix
total_outlier_matrix = RowMatrix(total_outlier_sample)

# Summary statistics over the DenseVectors
desc_total_outlier_matrix = MultivariateStatisticalSummary(total_outlier_matrix.rows)
# print('outlier factors mean:', desc_total_outlier_matrix.mean())
# print('outlier factors variance:', desc_total_outlier_matrix.variance())

# Training
num_clusters = 9
num_iterations = 20
num_runs = 3
outlier_cluster_model = KMeans.train(total_outlier_sample, num_clusters,
                                     num_iterations, num_runs)
outlier_predictions = outlier_cluster_model.predict(total_outlier_sample)
print('Predicted labels for the first ten outlier samples: ' +
      ",".join([str(i) for i in outlier_predictions.take(10)]))

# Evaluate the model
# Internal metric: WCSS (within-cluster sum of squares)
outlier_cost = outlier_cluster_model.computeCost(total_outlier_sample)
print("WCSS for outlier_sample: %f" % outlier_cost)

# External metrics require labeled data (classification metrics)
# Tune k via cross-validation
train_test_split_outlier = total_outlier_sample.randomSplit([0.6, 0.4], 123)
spark = SparkSession \
    .builder \
    .appName("KMeans") \
    .config("spark.some.config.option", "Angadpreet-KMeans") \
    .getOrCreate()
today = dt.datetime.today()
spark_df = sc.parallelize(spark.read.json("Data/yelp_academic_dataset_business.json")
                          .select("stars", "review_count", "is_open").take(1700))
scaler = MinMaxScaler(inputCol="_1", outputCol="scaled_1")
trial_df = spark_df.map(lambda x: pyspark.ml.linalg.Vectors.dense(x)).map(lambda x: (x, )).toDF()
scalerModel = scaler.fit(trial_df)
vector_df = scalerModel.transform(trial_df).select("scaled_1").rdd.map(lambda x: Vectors.dense(x))

num_clusters = 3

# Input into the algorithm
km = KMeans()
kme = km.train(vector_df, k=num_clusters, maxIterations=10, seed=2018)
centers = kme.clusterCenters
err = vector_df.map(lambda x: (x[0], findCenter(x[0], centers))).collect()

# Silhouette value comparison
ag = 0
agi = 0
for er in err:
    avg = [0] * num_clusters
    avgi = [0] * num_clusters
    for e in err:
        avg[e[1]] += Vectors.squared_distance(er[0], e[0])
        avgi[e[1]] += 1
    a = avg[er[1]] / avgi[er[1]]
    b = sys.maxint
            X.append([random.normal(incomecentroid, 10000.0),
                      random.normal(agecentroid, 2.0)])
    X = array(X)
    return X

def error(point):
    center = clusters.centers[clusters.predict(point)]
    return sqrt(sum([x**2 for x in (point - center)]))

# Load data and normalize it with scale
data = sc.parallelize(scale(createClusteredData(100, K)))

clusters = KMeans.train(data, K, maxIterations=10, runs=10, initializationMode="random")

resultRDD = data.map(lambda point: clusters.predict(point)).cache()

print "Counts by value"
counts = resultRDD.countByValue()
print counts

print "Actual assignments"
result = resultRDD.collect()
print result

# within set sum of squared errors
WSSE = data.map(lambda point: error(point)).reduce(lambda x, y: x + y)
print str(WSSE)
currTime = strftime("%Y-%m-%d-%H-%M-%S")
sc = SparkContext(appName="KMeans")
lines = sc.textFile("hdfs://masterNode:9000/user/spark/dataset_observatory/initial_centroids.csv")
dataset = sc.textFile("hdfs://masterNode:9000/user/spark/dataset_observatory/training_data.csv")
predict_data = sc.textFile("hdfs://masterNode:9000/user/spark/dataset_observatory/predict_data/Semestres/Semestre1-2016.csv")
average_per_year = average_year(lines)  # 2014 and 2015
average_per_month = average_month(average_per_year)
data = parseDataset(dataset)
k = int(sys.argv[1])
initial_centroids = generate_initial_centroids(average_per_month.collect(), k)

# KMeans
start = time()
kmeans_model = KMeans.train(data, k, maxIterations=100,
                            initialModel=KMeansModel(initial_centroids))
end = time()
elapsed_time = end - start
kmeans_output = [
    "====================== KMeans ====================\n",
    "Final centers: " + str(kmeans_model.clusterCenters),
    "Total Cost: " + str(kmeans_model.computeCost(data)),
    "Value of K: " + str(k),
    "Elapsed time: %0.10f seconds." % elapsed_time
]

# Predicting
points = parseDataset(predict_data)
count_lines = float(len(points.collect()))
probabilities = generate_probabilities(points, k, kmeans_model, count_lines)
print("Prob: ", probabilities)
from pyspark.mllib.clustering import KMeans, KMeansModel
from pyspark import SparkContext
import sys
import json

# Load and parse the data
sc = SparkContext("local", "Python K-Means Amazon Reviews")

# First arg must be the filename
filename = sys.argv[1]
data = sc.textFile(filename)
parsedData = data.map(lambda line: json.loads(line)) \
                 .map(lambda line: (float(line['overall']),
                                    float(len(line['reviewText'])),
                                    float(line['unixReviewTime'])))

# Build the model (cluster the data)
clusters = KMeans.train(parsedData, 2, maxIterations=10, runs=10,
                        initializationMode="random")

# Evaluate clustering by computing Within Set Sum of Squared Errors
WSSSE = clusters.computeCost(parsedData)
print("Within Set Sum of Squared Error = " + str(WSSSE))
# print clusters.centers

# Save and load model
# clusters.save(sc, "myModelPath")
# sameModel = KMeansModel.load(sc, "myModelPath")
from numpy import array
from math import sqrt
import json

from pyspark import SparkContext, SparkConf
from pyspark.mllib.clustering import KMeans

conf = SparkConf().setAppName("KMeans WSSSE")
sc = SparkContext(conf=conf)

coordinates = sc.textFile(
    "hdfs:///user/emilojkovic/data/az_businesses_kmeans/part-00000")

def error(point):
    center = clusters.centers[clusters.predict(point)]
    return sqrt(sum([x**2 for x in (point - center)]))

errors = []
# Build a model (cluster the data) for each k and record its WSSSE
for i in range(1, 15):
    clusters = KMeans.train(coordinates, i, maxIterations=300, runs=10,
                            initializationMode="k-means||")
    WSSSE = coordinates.map(lambda point: error(point)).reduce(lambda x, y: x + y)
    errors.append((i, str(WSSSE)))

sc.parallelize(errors).coalesce(1).saveAsTextFile(
    'hdfs:///user/emilojkovic/kmeans_wssse')
    if l[2] == '-':
        l[2] = 0
    return np.array([float(l[1]), float(l[2])])


if __name__ == "__main__":
    if len(sys.argv) != 3:
        print("Usage: kmeans <file> <k>", file=sys.stderr)
        exit(-1)

    fp = open('ballout.csv', 'w')
    writer = csv.writer(fp)
    sc = SparkContext(appName="KMeans")
    lines = sc.textFile(sys.argv[1])
    data = lines.map(parseVector)
    k = int(sys.argv[2])

    # batsmen clustered on average and strike rate
    model = KMeans.train(data, k)
    # model = KMeans.train(sc.parallelize(data), k, maxIterations=10, runs=30, initializationMode="random")
    print("labels : ", data.map(model.predict))

    # bowlers clustered on average and number of wickets
    print("Final centers: " + str(model.clusterCenters))
    cluster_ind = model.predict(data)
    lis = []
    f = open('bowl.csv', 'r')
    st = f.read()
    some = st.split('\n')
    i = 0
    for x in cluster_ind.collect():
        print(x)
        l1 = []
        l1.append(x)
    observation_group_1.append(randrange(5, 8))

observation_group_2 = []
for i in range(n_in_each_group * n_of_feature):
    observation_group_2.append(randrange(55, 58))

observation_group_3 = []
for i in range(n_in_each_group * n_of_feature):
    observation_group_3.append(randrange(105, 108))

data = array([observation_group_1, observation_group_2,
              observation_group_3]).reshape(n_in_each_group * 3, 5)
data = sc.parallelize(data)

# Run the K-Means algorithm -----------------------------------------------------

# Build the K-Means model
# the initializationMode can also be "k-means||" or set by users
clusters = KMeans.train(data, 3, maxIterations=10, initializationMode="random")

# Collect the clustering result
result = data.map(lambda point: clusters.predict(point)).collect()
print result

# Evaluate clustering by computing Within Set Sum of Squared Errors
def error(point):
    center = clusters.centers[clusters.predict(point)]
    return sqrt(sum([x**2 for x in (point - center)]))

WSSSE = data.map(lambda point: error(point)).reduce(lambda x, y: x + y)
print("Within Set Sum of Squared Error = " + str(WSSSE))