def initializeModels(self):
    try:
        if self.kmeansDF:
            logger.info("Already loaded this DataFrame")
            pass
    except AttributeError:
        self.kmeansDF = None

    commandsDF = self.bashDF.map(
        lambda row: Row(
            date=row.date,
            source=row.source,
            username=row.username,
            exec_as=row.exec_as,
            srcip=row.srcip,
            command=row.command.split(" ")
        )
    ).toDF()
    commandsDF.cache()

    word2Vec = Word2Vec(vectorSize=100, minCount=1,
                        inputCol="command", outputCol="features")
    w2model = word2Vec.fit(commandsDF)
    resultDF = w2model.transform(commandsDF)
    resultDF.cache()

    kmeans = KMeans(k=650, seed=42, featuresCol="features",
                    predictionCol="prediction", maxIter=10, initSteps=3)
    kmodel = kmeans.fit(resultDF)

    kmeansDF = kmodel.transform(resultDF)
    kmeansDF.cache()
    kmeansDF.coalesce(1).write.parquet('/user/jleaniz/ml/kmeans', mode='append')

    outliers = kmeansDF.groupBy("prediction").count().filter('count < 10') \
        .withColumnRenamed("prediction", "cluster")
    self.outlierCmds = outliers.join(kmeansDF, kmeansDF.prediction == outliers.cluster)
def test_kmeans_deterministic(self):
    from pyspark.mllib.clustering import KMeans
    X = range(0, 100, 10)
    Y = range(0, 100, 10)
    data = [[x, y] for x, y in zip(X, Y)]
    clusters1 = KMeans.train(self.sc.parallelize(data), 3,
                             initializationMode="k-means||", seed=42)
    clusters2 = KMeans.train(self.sc.parallelize(data), 3,
                             initializationMode="k-means||", seed=42)
    centers1 = clusters1.centers
    centers2 = clusters2.centers
    for c1, c2 in zip(centers1, centers2):
        # TODO: Allow small numeric difference.
        self.assertTrue(array_equal(c1, c2))
def fit(self, Z):
    """Compute k-means clustering.

    Parameters
    ----------
    Z : ArrayRDD or DictRDD containing array-like or sparse matrix
        Train data.

    Returns
    -------
    self
    """
    X = Z[:, 'X'] if isinstance(Z, DictRDD) else Z
    check_rdd(X, (np.ndarray, sp.spmatrix))
    if self.init == 'k-means||':
        self._mllib_model = MLlibKMeans.train(
            X.unblock(),
            self.n_clusters,
            maxIterations=self.max_iter,
            initializationMode="k-means||")
        self.cluster_centers_ = self._mllib_model.centers
        return self
    else:
        models = X.map(lambda X: super(SparkKMeans, self).fit(X))
        models = models.map(lambda model: model.cluster_centers_).collect()
        return super(SparkKMeans, self).fit(np.concatenate(models))
def train_model(self, dataframe, k, model_name):
    '''
    Use data to train a model.
    :param dataframe: all columns used for training
    :param k: k value
    :param model_name: name under which the trained model is saved
    :return: None
    '''
    data = self.prepare_data(dataframe)
    # train to get the model
    model = KMeans.train(data, k)
    # create the model saving path
    path = self.base + model_name
    # try to delete the old model if it exists
    try:
        import subprocess
        subprocess.call(["hadoop", "fs", "-rm", "-f", path])
    except:
        pass
    # save the new model on HDFS
    model.save(self.sc, path)
    # print all cluster centers of the model
    for c in model.clusterCenters:
        l = []
        for i in c:
            i = decimal.Decimal(i).quantize(decimal.Decimal('0.01'))
            l.append(float(i))
        print(l)
def main(sc):
    stopset = set(stopwords.words('english'))
    tweets = sc.textFile('hdfs:/adi/sample.txt')
    words = tweets.map(lambda word: word.split(" "))
    wordArr = []
    for wArr in words.collect():
        tempArr = []
        for w in wArr:
            if w not in stopset:
                tempArr.append(w)
        wordArr.append(tempArr)
    # print wordArr
    # tokens = sc.textFile("hdfs:/adi/tokens1.txt")

    # Load documents (one per line).
    documents = sc.textFile("hdfs:/adi/tokens1.txt").map(lambda line: line.split(" "))
    numDims = 100000
    hashingTF = HashingTF(numDims)
    tf = hashingTF.transform(documents)
    tf.cache()
    idf = IDF().fit(tf)
    tfidf = idf.transform(tf)
    tfidf.count()

    model = KMeans.train(tfidf, 5)
    model.save(sc, "tweetModel1")
    print("Final centers: " + str(model.clusterCenters))
    # print("Total Cost: " + str(model.computeCost(data)))
    sc.stop()
def train_subquantizers(sc, split_vecs, M, subquantizer_clusters, model, seed=None):
    """
    Project each data point into its local space and compute subquantizers by clustering
    each fine split of the locally projected data.
    """
    b = sc.broadcast(model)

    def project_local(x):
        x = np.concatenate(x)
        coarse = b.value.predict_coarse(x)
        return b.value.project(x, coarse)

    projected = split_vecs.map(project_local)

    # Split the vectors into the subvectors
    split_vecs = projected.map(lambda x: np.split(x, M))
    split_vecs.cache()

    subquantizers = []
    for split in xrange(M):
        data = split_vecs.map(lambda x: x[split])
        data.cache()
        sub = KMeans.train(data, subquantizer_clusters, initializationMode='random',
                           maxIterations=10, seed=seed)
        data.unpersist()
        subquantizers.append(np.vstack(sub.clusterCenters))

    return (subquantizers[:len(subquantizers) / 2], subquantizers[len(subquantizers) / 2:])
def test_kmeans(self):
    from pyspark.mllib.clustering import KMeans
    data = [[0, 1.1], [0, 1.2], [1.1, 0], [1.2, 0]]
    clusters = KMeans.train(self.sc.parallelize(data), 2,
                            initializationMode="k-means||")
    self.assertEquals(clusters.predict(data[0]), clusters.predict(data[1]))
    self.assertEquals(clusters.predict(data[2]), clusters.predict(data[3]))
def main():
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)
    # Load in pickled noun to vector dictionary
    logger.info('Loading pickled noun to vector dictionary')
    with open(NOUN_TO_VECT_DICT_FILE_LOC, 'rb') as f:
        noun_to_vect_dict = pickle.load(f)
    # Create vectors array
    vectors = noun_to_vect_dict.values()
    # Initialize Spark Context
    sc = ps.SparkContext('local[*]')
    # Load data
    data = sc.parallelize(vectors, 1024)
    # Create and fit a KMeans model to the data
    logger.info('Fitting KMeans model')
    kmeans_model = KMeans.train(data, N_CLUSTERS, maxIterations=10, runs=10,
                                initializationMode='k-means||')
    # Create a list of labels corresponding to vectors
    logger.info('Labeling vectors')
    labels = [kmeans_model.predict(vector) for vector in vectors]
    # Write to text file
    logger.info('Writing labels to file')
    with open(path.join(OUT_FILE_LOC, 'labels.txt'), 'w') as f:
        for label in labels:
            f.write(str(label) + '\n')
def main():
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)
    logger.info('Loading pickled noun to vector dictionary')
    # Load noun to vector dictionary
    with open(NOUN_TO_VECT_DICT_FILE_LOC, 'rb') as pickled:
        noun_to_vect_dict = pickle.load(pickled)
    # Create vector array from mapping
    vectors = np.array(noun_to_vect_dict.values())
    max_k = int(sqrt(len(vectors) / 2.0))
    # Define search space for k
    numbers_of_clusters = reversed(range(MIN_K, max_k))
    # For each k
    for i, k in enumerate(numbers_of_clusters):
        # Initialize Spark Context
        sc = ps.SparkContext()
        # Load data
        data = sc.parallelize(vectors, 1024)
        logger.info('Trial %i of %i, %i clusters', (i + 1), max_k - 1, k)
        # Calculate cluster
        kmeans_model = KMeans.train(data, k, maxIterations=10, runs=10,
                                    initializationMode='k-means||')
        logger.info('Calculating WSSSE')
        # Calculate WSSSE
        WSSSE = data.map(lambda point: error(kmeans_model, point)) \
                    .reduce(lambda x, y: x + y)
        logger.info('Writing WSSSE')
        # Write k and WSSSE
        with open(path.join(OUT_FILES_LOC, 'elbow_data.txt'), 'a') as elbow_data:
            elbow_data.write(str(k) + '\t' + str(WSSSE) + '\n')
        sc.stop()
def KMeansModel(dataPath, label, k, character, master):
    sc = SparkContext(master)
    data = sc.textFile(dataPath).map(lambda line: line.replace(character, ','))

    if label == 0:
        label_sum = data.map(lambda line: line.split(',')) \
                        .map(lambda data: (float(data[0]), 1)) \
                        .reduceByKey(add).collect()
        label = data.map(lambda line: line.split(',')) \
                    .map(lambda data: float(data[0])).collect()
        train_data = data.map(lambda line: line.split(',')) \
                         .map(lambda x: map(lambda part: float(part), x[1:len(x)]))
    else:
        label_sum = data.map(lambda line: line.split(',')) \
                        .map(lambda data: (float(data[-1]), 1)) \
                        .reduceByKey(add).collect()
        label = data.map(lambda line: line.split(',')) \
                    .map(lambda data: float(data[-1])).collect()
        train_data = data.map(lambda line: line.split(',')) \
                         .map(lambda x: map(lambda part: float(part) if part is not None else '',
                                            x[:len(x) - 1]))

    model = km.train(train_data, k)
    predict_data = train_data.collect()
    train = len(predict_data)
    acc = 0

    for i in range(len(label_sum)):
        ksum = np.zeros(k, dtype=int)
        cur_label = label_sum[i][0]
        for j in range(train):
            if label[j] == cur_label:
                ksum[model.predict(predict_data[j])] += 1
        acc += max(ksum)

    string = "KMeans Result: \n"
    center = model.centers
    for i in range(k):
        cur = str(i) + ":" + str(center[i]) + '\n'
        string += cur
    string = string + "Acc: " + str((float(acc) / train) * 100) + "%"
    sc.stop()
    return string
def kMeans(vecs, clusterNum):
    clusters = KMeans.train(vecs, clusterNum, maxIterations=10, runs=10,
                            initializationMode="random")
    if pv.outputDebugMsg:
        Utils.logMessage("\nKmean cluster finished")
    return clusters
def clusterKMeanSpark(matrix, k):
    m = transformInRealMatrix(matrix)
    sc = SparkContext(appName="Jsonizer: Remove stop words")
    parsedData = sc.parallelize(m)
    y = []
    x = []
    clustersControl = range(k, k + 1)

    for kc in clustersControl:
        clusters = KMeans.train(parsedData, kc, maxIterations=50000, runs=200,
                                initializationMode="k-means||", epsilon=0.0001)
        clu = []

        def error(point, clust):
            center = clust.centers[clust.predict(point)]
            return sqrt(sum([x**2 for x in (point - center)]))

        WSSSE = parsedData.map(lambda point: error(point, clusters)).reduce(lambda x, y: x + y)
        for n in m:
            clu += [clusters.predict(np.array(n))]
        x += [kc]
        y += [WSSSE]
        # print(kc, WSSSE)

    # plt.plot(x, y)
    # plt.ylabel('some numbers')
    # plt.show()

    ret = [[] for i in range(0, max(clu) + 1)]
    for i in range(0, len(clu)):
        ret[clu[i]] += [i]
    sc.stop()
    return ret
def kmeans(iterations, theRdd):
    def error(point):
        center = clusters.centers[clusters.predict(point)]
        return sqrt(sum([x**2 for x in (point - center)]))

    clusters = KMeans.train(theRdd, iterations, maxIterations=10, runs=10,
                            initializationMode="random")
    WSSSE = theRdd.map(lambda point: error(point)).reduce(lambda x, y: x + y)
    return WSSSE, clusters
def main(arg1, arg2):
    sc = SparkContext(appName="KMeans")
    lines = sc.textFile(arg1)
    data = lines.map(parseVector)
    k = int(arg2)
    model = KMeans.train(data, k)
    print("Final centers: " + str(model.clusterCenters))
    print("Total Cost: " + str(model.computeCost(data)))
    sc.stop()
def spark_KMeans(train_data):
    maxIterations = 10
    runs = 20
    numClusters = [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14]
    errors = []
    for k in numClusters:
        model = KMeans.train(train_data, k, maxIterations=maxIterations, runs=runs,
                             initializationMode='random', seed=10,
                             initializationSteps=5, epsilon=1e-4)
        WSSSE = model.computeCost(train_data)
        errors.append(WSSSE)

    plt.plot(numClusters, errors, 'ro')
    plt.xlabel(r'k')
    plt.ylabel(r'inertia')
    plt.title(r'inertia v.s. k')
    plt.savefig('kmeans_cross_validation.png')

    bestModel = KMeans.train(train_data, 6, maxIterations=maxIterations, runs=runs,
                             initializationMode='random', seed=10,
                             initializationSteps=5, epsilon=1e-4)
    return bestModel
def cluster_data(sc, qc):
    drivers = read_file_path(BASE_PATH)
    print "Number of drivers: %d" % len(drivers)

    # Load and parse the data
    for i, dr in enumerate(drivers):
        # extract driver number from path
        dr_num = re.search("[0-9]+$", dr.strip())
        if dr_num:
            dr_num = dr_num.group(0)
            if dr_num == '1018':
                continue
        else:
            print 'driver number error for %s' % dr
            continue
        dr_data = sc.textFile("hdfs://" + dr + "/" + dr_num + "_all_trips.txt")
        data = dr_data.map(lambda row: [float(x) for x in row.split(',')])
        if i == 0:
            all_data = data
        else:
            all_data = all_data.union(data)
        data.unpersist()

    print 'Total number of records: %d' % all_data.count()

    # Build the model (cluster the data), k = Number of clusters
    k = 5
    t = time()
    clusters = KMeans.train(all_data, k, maxIterations=100, runs=100,
                            initializationMode="random")
    print 'KMeans took %.2f seconds' % (time() - t)

    # Compute cost
    WSSSE_map = all_data.map(lambda point: error(point, clusters))

    # Join cluster ID to original data
    all_data_w_cluster = all_data.map(
        lambda point: np.hstack((point, get_cluster_id(clusters, point))))
    # all_data_w_cluster.saveAsTextFile("hdfs:///usr/local/spark/kmeans/results.txt")

    for i in xrange(0, k):
        subset = all_data_w_cluster.filter(lambda x: x[-1] == i)
        print "Number of items in cluster %d: %d" % (i, subset.count())
        # Compute functions on different features:
        all_features_average = subset.sum() / subset.count()
        print 'Average of all features'
        print all_features_average

    WSSSE = all_data.map(lambda point: error(point, clusters)).reduce(lambda x, y: x + y)
    print("Within set sum of squared error: " + str(WSSSE))
def k_means(loadTrainingFilePath, sc):
    # Load and parse the data
    loadTrainingFilePath = "../data/kmeans_data.txt"
    data = sc.textFile(loadTrainingFilePath)
    parsedData = data.map(lambda line: array([float(x) for x in line.split(' ')]))

    # Build the model (cluster the data)
    clusters = KMeans.train(parsedData, 3, maxIterations=10, runs=30,
                            initializationMode="random")

    WSSSE = parsedData.map(lambda point: error(point)).reduce(lambda x, y: x + y)
    print("Within Set Sum of Squared Error = " + str(WSSSE))
def build_cluster_model(tfidf_vectors_rdd, num_clusters, max_iterations, runs):
    """Perform the clustering of vectors using K-means.

    Returns:
        k-means model learned from the training data in tfidf_vectors_rdd
    """
    # Build the model (cluster the training data)
    return KMeans.train(tfidf_vectors_rdd, num_clusters,
                        maxIterations=max_iterations, runs=runs)
def test_clustering(self):
    from pyspark.mllib.clustering import KMeans
    data = [
        self.scipy_matrix(3, {1: 1.0}),
        self.scipy_matrix(3, {1: 1.1}),
        self.scipy_matrix(3, {2: 1.0}),
        self.scipy_matrix(3, {2: 1.1})
    ]
    clusters = KMeans.train(self.sc.parallelize(data), 2,
                            initializationMode="k-means||")
    self.assertEqual(clusters.predict(data[0]), clusters.predict(data[1]))
    self.assertEqual(clusters.predict(data[2]), clusters.predict(data[3]))
def main(noun_file_loc, model_file_loc, percent, n_trials, out_files_loc):
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)
    logger.info('Loading Word2Vec model')
    # Load trained Word2Vec model from the given path
    model = Word2Vec.load(model_file_loc)
    logger.info('Reading in list of nouns')
    # Read in list of sorted nouns, one per line
    sorted_nouns = []
    with open(noun_file_loc, 'r') as f:
        for line in f:
            sorted_nouns.append(line.strip())
    # Count number of nouns
    n_nouns = len(sorted_nouns)
    # Create dictionary to map nouns to vectors
    noun_to_vect_dict = {}
    # Calculate index to stop slice as percentage of total nouns
    n_nouns_to_keep = int(n_nouns * percent / 100.)
    logger.info('Keeping %i nouns, %i percent of %i',
                n_nouns_to_keep, percent, n_nouns)
    # Add nouns and vectors to dictionary
    for noun in sorted_nouns[0:n_nouns_to_keep]:
        noun_to_vect_dict[noun] = model[noun]
    vectors = np.array(noun_to_vect_dict.values())
    # Initialize Spark Context
    sc = ps.SparkContext('local[4]')
    # Load data
    data = sc.parallelize(vectors)
    # Define search space for k
    ns_clusters = [int(x) for x in np.linspace(2, n_nouns, n_trials)]
    # Open WSSSEs output file
    with open(path.join(out_files_loc, 'elbow_data.txt'), 'w') as elbow_data:
        # For each k
        for i, k in enumerate(ns_clusters):
            logger.info('Trial %i of %i, %i clusters', (i + 1), n_trials, k)
            # Calculate cluster
            kmeans_model = KMeans.train(data, k, maxIterations=10, runs=10,
                                        initializationMode='k-means||')
            # Calculate WSSSE
            WSSSE = data.map(lambda point: error(kmeans_model, point)) \
                        .reduce(lambda x, y: x + y)
            # Save centroids
            with open(path.join(out_files_loc, '_%i.pkl' % k), 'w') as f:
                pickle.dump(kmeans_model.clusterCenters, f)
            # Write k and WSSSE
            elbow_data.write('%i, %f\n' % (k, WSSSE))
def main():
    sc = SparkContext()
    filename = sys.argv[1]
    clusters = int(sys.argv[2])
    outmodelname = sys.argv[3]

    dataset = gdal.Open(filename, GA_ReadOnly)
    driver = dataset.GetDriver().ShortName
    x, y, data = tiff_to_array(dataset, weights)
    print "after change to array"

    clusterdata = sc.parallelize(data)
    print "parallelize done"

    kmeanmodel = KMeans.train(clusterdata, clusters, maxIterations=50, runs=10)
    kmeanmodel.save(sc, outmodelname)
    print kmeanmodel.clusterCenters
def train_coarse(sc, split_vecs, V, seed=None):
    """
    Perform KMeans on each split of the data with V clusters each.
    """
    # Cluster first split
    first = split_vecs.map(lambda x: x[0])
    first.cache()
    print 'Total training set size: %d' % first.count()
    print 'Starting training coarse quantizer...'
    C0 = KMeans.train(first, V, initializationMode='random', maxIterations=10, seed=seed)
    print '... done training coarse quantizer.'
    first.unpersist()

    # Cluster second split
    second = split_vecs.map(lambda x: x[1])
    second.cache()
    print 'Starting training coarse quantizer...'
    C1 = KMeans.train(second, V, initializationMode='random', maxIterations=10, seed=seed)
    print '... done training coarse quantizer.'
    second.unpersist()

    return np.vstack(C0.clusterCenters), np.vstack(C1.clusterCenters)
def bagofwords(imtrain, imtest=None, features=_features, outdir=None):
    cache = Cache(cacheroot=outdir)

    # Unique labels
    labels = imtrain.map(lambda x: x.category).distinct().collect()
    print labels

    # Features: each returns a row array of features
    X = imtrain.map(features)

    # Clustering: kmeans clustering to generate words
    # http://spark.apache.org/docs/0.9.0/mllib-guide.html
    model = KMeans.train(X, 2, maxIterations=10, runs=30, initializationMode='random')

    # construct bag of words representation
    print model.clusterCenters
def kmeans(k=2):
    """ kmeans """
    # Load and parse training data
    data = getTrainData(dataFilename)
    # pyspark.rdd.PipelinedRDD
    parsedData = data.map(lambda line: array([float(x) for x in line.split(' ')]))

    # Build the model (cluster the data)
    # KMeans.train(cls, data, k, maxIterations=100, runs=1, initializationMode="k-means||")
    clf = KMeans.train(parsedData, k, maxIterations=10, runs=10,
                       initializationMode="random")  # pyspark.mllib.clustering.KMeansModel

    WSSSE = parsedData.map(lambda point: error(point, clf)).reduce(lambda x, y: x + y)  # float
    print("Within Set Sum of Squared Error = " + str(WSSSE))
    print "### cluster centers ###:"
    print clf.centers
    return clf
def kmeans_CSV():
    try:
        # creating a parsedData RDD to do kmeans on
        servernum = sys.argv[1]
        serverpath = "hdfs://10.0.0.4:8020/opentsdb/" + servernum
        print "Attempting to create SparkContext"
        sconf = SparkConf().setAppName("Kmeans for files")
        print "Sconf set..."
        sc = SparkContext(conf=sconf)
        print "SparkContext created"

        # making parsedData RDD: [ array([filewrites, filereads, CPU, diskIOBW, net bytes]), array([...]), ... ]
        # kmeans iteratively passes over data multiple times - cache parsedData
        if len(sys.argv) == 2:
            # user just specified server - do full server kmeans
            filepaths = get_file_paths(serverpath)  # Array of string file paths to all files within folder
            parsedData = compile_RDD(sc, filepaths).cache()
            CSV_filename = make_name(filepaths) + "_" + servernum
        elif len(sys.argv) == 3:
            # user put in server and single timeframe - do single file kmeans
            timeframe = sys.argv[2]  # ex: 2014-07-09
            filepaths = get_singlefile_path(timeframe, serverpath)
            parsedData = compile_RDD(sc, filepaths).cache()
            CSV_filename = str(timeframe) + "_" + servernum
        else:
            # user put in server and start/end timeframe - do timeframe kmeans
            start_timeframe = sys.argv[2]
            end_timeframe = sys.argv[3]
            filepaths = get_timeframefile_paths(start_timeframe, end_timeframe, serverpath)
            parsedData = compile_RDD(sc, filepaths).cache()
            CSV_filename = make_name(filepaths) + "_" + servernum

        k = findk(parsedData.count())
        clusters = KMeans.train(parsedData, k, maxIterations=10, runs=10,
                                initializationMode="random")
        centers = clusters.clusterCenters

        # Creating two CSVs (one has data points, one has centers) for later visualization
        compile_CSV(CSV_filename, parsedData)
        compile_centers_CSV(CSV_filename, centers)
        print "SUCCESS: Kmeans done"
    except:
        print "---------------------------------"
        print "Usage: ./bin/spark-submit kmeans_CSV.py <servername> <start_timeframe> <end_timeframe>"
        print "<servername> must be specified. EX: sense0 "
        print "Timeframes are optional. Specify just one timeframe for single file kmeans. Specify start and end for kmeans over timeframe."
        print "Timeframes must be in format yyyy-mm-DD"
        print "---------------------------------"
        raise
def detect(self, k, t):
    # Encoding categorical features using one-hot.
    df1 = self.cat2Num(self.rawDF, [0, 1])
    df1.show()

    # Clustering points using KMeans
    features = df1.select("features").rdd.map(lambda row: row[0]).cache()
    model = KMeans.train(features, k, maxIterations=40, runs=10,
                         initializationMode="random", seed=20)

    # Adding the prediction column to df1
    modelBC = sc.broadcast(model)
    predictUDF = udf(lambda x: modelBC.value.predict(x), StringType())
    df2 = df1.withColumn("prediction", predictUDF(df1.features))
    df2.show()

    # Adding the score column to df2; the higher the score, the more likely it is an anomaly
    df3 = self.addScore(df2)
    df3.show()

    return df3.where(df3.score > t)
def kmeans_demo(self):
    file = self.sc.textFile(self.base + 'k_data.csv')
    # transform to rdd
    data = file.map(lambda line: line.split(',')).cache()
    print(type(data))
    # train on the data to get the model
    model = KMeans.train(data, k=3)
    # print to check all clusters
    cluster = model.clusterCenters
    for c in cluster:
        print(c)
    # predict which cluster a new data point belongs to (returns the index of the cluster)
    predict = model.predict([1.3, .1, 1.1])
    print(predict)
def clustering_score(data, k):
    model = KMeans.train(data, k=k, maxIterations=200)

    def distance(v1, v2):
        s = 0
        # [1,2,3] [4,5,6] --> [(1,4),(2,5),(3,6)]
        pairs = zip(v1, v2)
        for p in pairs:
            sub = float(p[0]) - float(p[1])
            s = s + sub * sub
        return math.sqrt(s)

    def dist_to_centroid(datum):
        # predict the data
        cluster = model.predict(datum)
        # get the current centroid --> means center point
        centroid = model.clusterCenters[cluster]
        # call distance method
        return distance(centroid, datum)

    return data.map(dist_to_centroid).mean()
def run():
    # Set up
    sc = SparkContext()
    records = sc.textFile(os.path.realpath(__file__ + '/..') + '/data-scraper/data')

    # Build clusters
    kvpairs = records.map(keyAndParse)
    cts = kvpairs.groupByKey().map(lambda (name, statList): (name, len(statList))).collectAsMap()
    kvpairs = kvpairs.reduceByKey(combine)

    # Filter outliers with too few records
    kvpairs = kvpairs.filter(lambda (k, v): cts[k] > 2)
    kvpairs = kvpairs.map(lambda (name, statline): (name, normalize(statline, cts[name])))

    numClusters = 20
    clusters = KMeans.train(kvpairs.map(lambda (k, v): v), numClusters, 10)
    groupedClusters = kvpairs.groupBy(lambda (k, v): clusters.predict(v)) \
                             .map(lambda x: (x[0], getNames(list(x[1])))).collect()

    # Rank clusters
    centers = avg(clusters.clusterCenters)
    centers.sort(key=lambda x: x['score'], reverse=True)

    # Save
    save(groupedClusters, centers)
def kmeans_train(self, data_rdd, n_clusters):
    """ This method is used to train the model """
    data_splits = data_rdd.randomSplit([.50, .25, .25], seed=0)
    training_set = data_splits[0].repartition(numPartitions=4).cache()
    validation_set = data_splits[1].repartition(numPartitions=4).cache()
    test_set = data_splits[2].repartition(numPartitions=4).cache()

    max_iter_arr = [50, 60, 80]
    max_runs = [50, 60, 80]
    k_list = n_clusters
    best_model = None
    best_rmse = float("inf")
    best_run = 0
    best_k = 0

    for itera, run, k in itertools.product(max_iter_arr, max_runs, k_list):
        try:
            model = KMeans.train(training_set, k, itera, run, "random")
            validation_rmse = model.computeCost(validation_set)
            print("#of clusters k %d\n" % (k))
            if validation_rmse < best_rmse:
                best_model = model
                best_rmse = validation_rmse
                best_run = run
                best_iter = itera
                best_k = k
        except Exception as e:
            print(e)
            continue

    # test_preds = best_model.predict(test_set.first())
    print("K-means results...")
    print(str(best_rmse))
    print(str(best_k))
    return best_model
def anomaly_detection_by_KMeans(self, columns, k=3, threshold=4, normalize=False):
    '''
    Detect anomalous combinations of features through K-Means.

    columns: list of columns to be detected
    k: the number of clusters
    threshold: if (distance - mean_distance) > threshold*std, the point is considered an anomaly
    normalize: whether to normalize the data before fitting the clusters

    Output (in self.out):
        output[0]: Index of outliers (list of int)
        output[1]: DataFrame of outliers
    '''
    def error(point):
        center = clusters.centers[clusters.predict(point)]
        return sqrt(sum([x**2 for x in (point - center)]))

    def addclustercols(x):
        point = np.array(x[1:])
        center = clusters.centers[0]
        mindist = sqrt(sum([y**2 for y in (point - center)]))
        cl = 0
        for i in range(1, len(clusters.centers)):
            center = clusters.centers[i]
            distance = sqrt(sum([y**2 for y in (point - center)]))
            if distance < mindist:
                cl = i
                mindist = distance
        clcenter = clusters.centers[cl]
        # return [x[0]] + list(clcenter) + [mindist]
        result = list(clcenter) + [mindist]
        return [x[0], cl] + [float(x) for x in result]

    def featurize(df, col_name):
        df_stats = df.select(
            F.mean(F.col(col_name)).alias('mean'),
            F.stddev(F.col(col_name)).alias('std')).collect()
        mean = df_stats[0]['mean']
        std = df_stats[0]['std']
        data = df.withColumn(col_name, (df[col_name] - mean) / std)
        data_stats = data.select(
            F.mean(F.col(col_name)).alias('mean'),
            F.stddev(F.col(col_name)).alias('std')).collect()
        new_mean = data_stats[0]['mean']
        new_std = data_stats[0]['std']
        return data

    def featurize_all(df, columns):
        for i in columns:
            df = featurize(df, i)
        data = df
        return data

    data = self.data
    if 'index' not in data.columns:
        print('Please create index first')
        return

    new_cols_len = len(columns)
    number_type = [
        "BinaryType",
        "DecimalType",
        "DoubleType",
        "FloatType",
        "IntegerType",
        "LongType",
        "ShortType"
    ]
    all_number_type = True
    new_columns_name = ['index', 'cluster_number']
    for column in columns:
        if column in data.columns:
            all_number_type = (str(data.schema[column].dataType) in number_type) and (all_number_type)
            if not all_number_type:
                print('The type of ' + column + " is " + str(data.schema[column].dataType))
                print("Only numerical types are accepted")
                return
            else:
                new_columns_name.append(column + "_cluster")
        else:
            print(column, "doesn't exist")
            return
    new_columns_name.append('distance_to_cluster')

    origin_data = data.cache()
    data = data.select(['index'] + columns)
    data = data.dropna()
    if normalize:
        data = featurize_all(data, columns)

    target_numpy = data.select(columns).rdd.map(lambda x: np.array(x))
    clusters = KMeans.train(target_numpy, k, maxIterations=20)

    result_data = data.rdd.map(lambda x: addclustercols(x)).toDF(new_columns_name)
    full_data = origin_data.join(result_data, 'index', how='inner')
    stat = full_data.groupBy('cluster_number').agg(
        F.mean('distance_to_cluster').alias('distance_mean'),
        F.stddev('distance_to_cluster').alias('distance_std'))
    anomaly_data = full_data.join(stat, 'cluster_number', 'inner').rdd.filter(
        lambda x: x['distance_to_cluster'] > (x['distance_mean'] + threshold * x['distance_std']))
    try:
        anomaly_data = anomaly_data.toDF()
        anomaly_indices = anomaly_data.select('index')
    except:
        print("No anomaly data based on your setting")
        return
    else:
        self.out = ([int(i['index']) for i in anomaly_indices.collect()], anomaly_data)
        return
import sys
from pyspark import SparkContext
from pyspark.mllib.feature import HashingTF, IDF
from pyspark.mllib.clustering import KMeans

if __name__ == "__main__":
    dirs = "hdfs:///user/clondo46/datasets/gutenberg"
    k = 5
    maxIters = 20
    sc = SparkContext(appName="Proyecto04")

    # Read the documents
    documentos = sc.wholeTextFiles(dirs)
    nombreDocumentos = documentos.keys().collect()
    docs = documentos.values().map(lambda doc: doc.split(" "))

    # Use TF-IDF
    hashingTF = HashingTF()
    tf = hashingTF.transform(docs)
    idf = IDF().fit(tf)
    tfidf = idf.transform(tf)

    # Build the k-means model and create the clusters
    clusters = KMeans.train(tfidf, k, maxIterations=maxIters)
    clustersid = clusters.predict(tfidf).collect()
    diccionario = dict(zip(nombreDocumentos, clustersid))
    d = sc.parallelize(diccionario.items())
    d.coalesce(1).saveAsTextFile("hdfs:///user/clondo46/gut5")

    sc.stop()  # SparkContext stopped
from numpy import array
from math import sqrt

from pyspark import SparkContext
from pyspark.mllib.clustering import KMeans, KMeansModel

# Load and parse the data
sc = SparkContext()
data = sc.textFile("dataframe.txt")
parsedData = data.map(lambda line: array([float(x) for x in line.split(' ')]))

# Build the model (cluster the data)
clusters = KMeans.train(parsedData, 2, maxIterations=10, initializationMode="random")

# Evaluate clustering by computing Within Set Sum of Squared Errors
def error(point):
    center = clusters.centers[clusters.predict(point)]
    return sqrt(sum([x**2 for x in (point - center)]))

WSSSE = parsedData.map(lambda point: error(point)).reduce(lambda x, y: x + y)
print("Within Set Sum of Squared Error = " + str(WSSSE))

# Save and load model
clusters.save(sc, "target/org/apache/spark/PythonKMeansExample/KMeansModel")
sameModel = KMeansModel.load(
    sc, "target/org/apache/spark/PythonKMeansExample/KMeansModel")
rdd1 = sc.textFile("5000_points.txt")
rdd2 = rdd1.map(lambda x: x.split())
rdd3 = rdd2.map(lambda x: [int(x[0]), int(x[1])])

from pyspark.mllib.clustering import KMeans

for clusters in range(1, 30):
    model = KMeans.train(rdd3, clusters)
    print(clusters, model.computeCost(rdd3))

for trials in range(10):                # Try ten times to find best result
    for clusters in range(12, 16):      # Only look in interesting range
        model = KMeans.train(rdd3, clusters)
        cost = model.computeCost(rdd3)
        centers = model.clusterCenters  # Let's grab cluster centers
        if cost < 1e+13:                # If result is good, print it out
            print(clusters, cost)
            for coords in centers:
                print(int(coords[0]), int(coords[1]))
            break
# Reduce returns a single value result containing the count of 2xx and 3xx against an IP
rawTrainingData = rawTrainingData.reduceByKey(extract_features)
print("training dataset after reduce: ", rawTrainingData.collect())
print('total training lines after reduce by key : ', rawTrainingData.count())

# K-means accepts data in the form of [a, b]; this is called a feature vector.
# Use VectorAssembler or a map function.
# Converts to map of count of 2xx and 3xx
training_dataset = rawTrainingData.map(lambda data: data[1])
print("TRAINING DATASET for Kmean cluster: ", training_dataset.collect())
print('total training lines after reformat : ', rawTrainingData.count())

# set cluster count equal to 2
cluster_count = 2

# train the k-means algo to get the model
trained_model = KMeans.train(training_dataset, cluster_count)

# print the cluster centroids from trained model
for center in range(cluster_count):
    print('centre ', center, trained_model.centers[center])

# streamingData = KafkaUtils.createStream(ssc, "localhost:2181", "test-consumer-group", {"test" : 1})
# lines = streamingData.map(lambda x:x[1])
# df.selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)")
stream_data_init = KafkaUtils.createDirectStream(
    ssc, [topic], {"metadata.broker.list": brokers})
# stream_data_init = KafkaUtils.createStream(
#     ssc,
#     zk,
    exit(-1)

infilenm = sys.argv[1]   # input file name (in s3)
k = int(sys.argv[2])     # number of clusters to use
outfilenm = sys.argv[3]

# Read the main data file
lines = sc.textFile(infilenm)
alldata = lines.map(parse_vector)

# Only want kmeans run on columns fst:lst
# For weekend only: .filter(lambda arr: np.array(arr[incols['dow']]) == 0
#                           or np.array(arr[incols['dow']] == 6))
datasub = alldata.map(lambda arr: np.array(arr[fst:lst])) \
                 .filter(lambda x: np.count_nonzero(x) > 0)

clusters = KMeans.train(datasub, k)

# For each point: figure out the closest cluster center
# Add each cluster center as additional columns to the original input
closestcenter = alldata.map(lambda cc: pt_pred_arr(cc))

# For M.distance calc: need inverted covariance matrix as part of inputs.
# So: For each cluster 'c', calculate the covariance matrix.
inv_covmat = []
for c in range(0, k):
    # Get the actual data columns (subset of the whole line)
    data = closestcenter.filter(lambda arr: np.array(arr[clstrcol]) == c) \
                        .map(lambda arr: np.array(arr[fst:lst]))
    # Calc the covariance matrix, and invert
    # Convert from RDD to list, so numpy stats will run against it
    # OR - could write a function to calc the covariance matrix against this RDD ...
    datacol = data.collect()
@author: lnunno
'''
import numpy as np
from pyspark import SparkContext
from pyspark.mllib.linalg import SparseVector
from pyspark.mllib.clustering import KMeans

TOTAL_DOCS = 39944
NUM_CLUSTERS = 20


def parseVector(line):
    _, indices_tuple_ls = line.split('\t')
    indices_tuple_ls = eval(indices_tuple_ls)  # Convert to a real python list.
    return SparseVector(TOTAL_DOCS, indices_tuple_ls)


if __name__ == '__main__':
    sc = SparkContext(appName="KMeans")
    lines = sc.textFile('../../data/spark_tf_idf_vectors.tsv')
    data = lines.map(parseVector)
    np.set_printoptions(threshold='nan')
    n = NUM_CLUSTERS
    while n >= 2:
        model = KMeans.train(data, n)
        centers = model.clusterCenters
        with open('../../data/clusters_%d.txt' % (n), 'w') as f:
            for c in centers:
                # Format in exponential notation.
                s = ','.join([('%e' % x) for x in c])
                f.write('%s\n' % (s))
        n -= 2
# let's generate random class data, add in a cluster center to random 2D points
# use default num of partitions, or use a definite number to make it so that the union
# will have samples across clusters
c1_v = RandomRDDs.normalVectorRDD(sc, 20, 2, numPartitions=2, seed=1L).map(lambda v: np.add([1, 5], v))
c2_v = RandomRDDs.normalVectorRDD(sc, 16, 2, numPartitions=2, seed=2L).map(lambda v: np.add([5, 1], v))
c3_v = RandomRDDs.normalVectorRDD(sc, 12, 2, numPartitions=2, seed=3L).map(lambda v: np.add([4, 6], v))

# concatenate 2 RDDs with .union(other) function
c12 = c1_v.union(c2_v)
my_data = c12.union(c3_v)  # this now has all points, as RDD

my_kmmodel = KMeans.train(my_data, k=1, maxIterations=20, runs=1,
                          initializationMode='k-means||', seed=10L)
# try: help(KMeans.train) to see parameter options
# k is the number of desired clusters.
# maxIterations is the maximum number of iterations to run.
# initializationMode specifies either random initialization or initialization via k-means||.
# runs is the number of times to run the k-means algorithm (k-means is not guaranteed to find a
#   globally optimal solution, and when run multiple times on a given dataset, the algorithm
#   returns the best clustering result).
# initializationSteps determines the number of steps in the k-means|| algorithm.
# epsilon determines the distance threshold within which we consider k-means to have converged.

# type dir(my_kmmodel) to see functions available on the cluster results object
# The computeCost function might not be available on your cloudera vm spark mllib;
# it computes the Sum Squared Error:
my_kmmodel.computeCost(my_data)
'''
USES SPARK MLLIB LIBRARY TO RUN KMEANS IN A HADOOP CLUSTER
AUTOMATICALLY USES ALL RESOURCES AVAILABLE ACROSS NODES
'''
from pyspark.mllib.clustering import KMeans
from numpy import array
import time

luteo_data = sc.textFile('/final_project/luteo_clean.csv')
parsed_data = luteo_data.map(
    lambda line: array([float(x) for x in line.split(',')])).cache()

with open('/usr/local/kmeans_spark_times.txt', 'w') as out_file:
    for n_clusters in range(1, 30):
        start_time = time.time()
        clusters = KMeans.train(parsed_data, n_clusters, maxIterations=100)
        end_time = time.time()
        out_file.write('{0} {1}\n'.format(n_clusters, end_time - start_time))
        print('{0} {1}\n'.format(n_clusters, end_time - start_time))
data = sc.textFile("s3://ccdatauvamsds2017/YearPredictionMSD.txt") #sc.addFile("YearPredictionMSD.txt") #sc.addFile("YearPredictionMSD") #data = SparkFiles.get('YearPredictionMSD.txt') parsedData = data.map( lambda line: array([float(x) for x in line.split(',')])) #K-Means Code #Sampling with replacement data_sample = parsedData.sample(True, 100, 1234) start_time = timeit.default_timer() clusters = KMeans.train(data_sample, 2, maxIterations=10, initializationMode="random") time_Kmeans = (timeit.default_timer() - start_time) print(timeit.default_timer() - start_time) time_Kmeans_string = str(time_Kmeans) file = open("s3://ccdatauvamsds2017/output/Time_logs.txt", "w") file.write(time_Kmeans_string) file.close() #Random Forest Code data = MLUtils.loadLibSVMFile( sc, sc.textFile("s3://ccdatauvamsds2017/YearPredictionMSD")) start_time = timeit.default_timer()
cv_error_storage = []
for w in range(num_folds):
    # new train/validation split
    train = data[0:i] + data[j:]
    val = data[i:j]
    train = sc.parallelize(train)
    val = sc.parallelize(val)

    minError = float("inf")
    bestModel = None
    bestK = None
    test_values = [80, 90, 100, 110, 120, 130, 140]
    # test_values = [120]
    error_storage = []

    for x in test_values:
        model = KMeans.train(train.values(), x, maxIterations=10, runs=10, epsilon=.00001)
        error = model.computeCost(val.values())
        error_storage.append(error)
        print "****** model with " + str(x) + " clusters done in validation fold " + str(w + 1) + " ***********"
        print "with error: " + str(error)
        if error < minError:
            bestModel = model
            minError = error
            bestK = x

    cv_error_storage.append(error_storage)
    i = i + partitionSize
    j = j + partitionSize

# get CVerrors (mean of the errors from the 10 cross validated samples)
CVerrors = []
sc = SparkContext(appName="kmeans") def myVec(line): from pyspark.mllib.linalg import SparseVector return eval("SparseVector" + line) # Load and parse the data data = sc.textFile(fname).map(myVec) # Build the model (cluster the data) clusters = KMeans.train(data, k, maxIterations=max_iter, runs=runs, initializationMode="random") # # Evaluate clustering by computing Within Set Sum of Squared Errors # def error(point): # center = clusters.centers[clusters.predict(point)] # return sqrt(sum([x**2 for x in (point - center)])) #WSSSE = parsedData.map(lambda point: error(point)).reduce(lambda x, y: x + y) # print("Within Set Sum of Squared Error = " + str(WSSSE)) f = open(args.output, "w") for c in clusters.clusterCenters: f.write("[") for i in range(len(c)):
data = sc.textFile(input_data)
parsedData = data.map(
    lambda line: np.array([float(x) for x in line.split('\t')[2:]])).cache()
print("\n number of keys is {} \n".format(parsedData.count()))

for K in [2, 5, 10, 50, 100, 150, 200, 250, 300, 350, 400, 450, 500, 600, 700, 800, 900] + \
        list(range(1000, 10001, 500)):
    ts = time()
    print("\n start to train model, K = {} \n".format(K))
    model = KMeans.train(parsedData, K, maxIterations=max_iteration,
                         initializationMode="k-means||",
                         initializationSteps=2, epsilon=1e-6)

    def error(point):
        center = model.centers[model.predict(point)]
        return sum([x**2 for x in (point - center)])

    WSSSE = parsedData.map(lambda point: error(point)).reduce(
        lambda x, y: x + y)
    print("\nK = {}; WSSSE = {}; elapsed time = {} minutes \n".format(
        K, WSSSE, (time() - ts) / 60))

    model.save(sc, join(output, str(K), 'model'))
    # sameModel = KMeansModel.load(sc, join(output, 'model'))
def calKGroup(self, high, sc):
    # conf = SparkConf().setAppName("SparkSQLKmeans")
    # sc = SparkContext()
    sqlsc = SQLContext(sc)
    MYSQL_USERNAME = ""
    MYSQL_PWD = ""
    # Original URL (credentials were redacted in the source; they are rebuilt here
    # from the MYSQL_USERNAME / MYSQL_PWD variables defined above)
    MYSQL_CONNECTION_URL = ("jdbc:mysql://1.0.0.127:3306/telegramdb?autoReconnect=true&useSSL=false"
                            "&user=" + MYSQL_USERNAME + "&password=" + MYSQL_PWD)
    info_df = sqlsc.read.format("jdbc").options(
        url=MYSQL_CONNECTION_URL, dbtable="information",
        driver="com.mysql.jdbc.Driver").load()
    tag_df = sqlsc.read.format("jdbc").options(
        url=MYSQL_CONNECTION_URL, dbtable="tags",
        driver="com.mysql.jdbc.Driver").load()

    col_num = tag_df.filter(tag_df.high == high).count()
    tags = tag_df.filter(tag_df.high == high).map(lambda list: list.low).collect()
    cols = {}
    for tag in tags:
        cols[tag] = 0
        # print(tag)
    print(cols)

    # results = info.map(lambda line: array([x[1:-1].replace("{", "").replace("}", "")
    #                                        for x in line.low.split(",")])).collect()
    # for temp in results:
    #     print(temp)

    pks = info_df.filter(info_df.high == high).map(lambda line: line.PK_aid).collect()
    repos = info_df.filter(info_df.high == high).map(
        lambda line: {line.PK_aid: json.loads(line.low, encoding="utf-8")}).collect()
    rows = info_df.filter(info_df.high == high).map(
        lambda line: {line.PK_aid: np.zeros(col_num, dtype=np.int)}).collect()
    row_num = info_df.filter(info_df.high == high).count()
    # print(row_num)
    print(row_num)
    print(col_num)
    # print(data)
    # print(rows)

    for index, repo in enumerate(repos):
        # print("[%d] : " % (index) + str(temps))
        for temp in repo:
            print("[%d] : " % (index) + str(repo.get(temp)))
            for element in repo.get(temp):
                t = element.items()
                print("->" + str(t) + ", ")  # + str(element.get(element)))
        # for temp in cols:
        #     print("[%d] : " % (index) + str(temp))

    check = {}
    key = ""
    for index, repo in enumerate(repos):
        for pk_aids in repo:
            elements = repo.get(pk_aids)
            for element in elements:
                for col_index, col in enumerate(cols):
                    if element.get(col) is not None:
                        rows[index].get(pk_aids)[col_index] = element.get(col) + 3
                        key = str(element.keys()).strip().replace(
                            "dict_keys([\'", "").replace("\'])", "")
                        if key in check:
                            check[key] += element.get(col)
                        else:
                            check[key] = element.get(col)
                    else:
                        rows[index].get(pk_aids)[col_index] = random.randrange(2)

    for index, row in enumerate(rows):
        for pk_aids in row:
            if rows[index].get(pk_aids) is not None:
                # print(rows[index].get(pk_aids))
                if index == 0:
                    data = rows[index].get(pk_aids)
                else:
                    data = np.append(data, rows[index].get(pk_aids))

    print(str(np.resize(data, (row_num, col_num)).shape))
    # data = np.resize(data, (row_num, col_num))
    clusterdata_1 = sc.parallelize(np.resize(data, (row_num, col_num)))
    model = KMeans.train(clusterdata_1, 10, maxIterations=100, runs=30,
                         initializationMode="random", seed=10,
                         initializationSteps=10, epsilon=1e-4)
    # model = GaussianMixture.train(clusterdata_1, 3, convergenceTol=0.9, maxIterations=100, seed=10)
    # for i in range(3):
    #     print("weight = ", model.weights[i], "mu = ", model.gaussians[i].mu,
    #           "sigma = ", model.gaussians[i].sigma.toArray())

    labels = model.predict(clusterdata_1).collect()
    temps = []
    for pk in pks:
        temps.append([pk, 0])
    print(labels)

    count = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
    for index, label in enumerate(labels):
        temps[index][1] = label + 1
        count[label] += 1
    for index, c in enumerate(count):
        print("Group %d : %d" % (index + 1, c))

    for col in cols.keys():
        for ch in check.keys():
            # print("col : %s, ch : %s" % (col, ch))
            if col == ch:
                cols[col] = check[ch]

    zeros = ""
    zero_count = 0
    nozeros = ""
    nozero_count = 0
    for col in cols.keys():
        if cols[col] == 0:
            zeros += col + ", "
            zero_count += 1
        else:
            nozeros += col + ", "
            nozero_count += 1

    print("Detected tags [%d]: %s" % (nozero_count, nozeros[:-2]))
    print("")
    print("Undetected tags [%d]: %s" % (zero_count, zeros[:-2]))

    return temps
inputNum_min = float(finalDF1.select(min('inputNum').alias('min_inputNum')).collect()[0]['min_inputNum'])
inputNum_max = float(finalDF1.select(max('inputNum').alias('max_inputNum')).collect()[0]['max_inputNum'])
Min_v = inputNum_min
Max_v = inputNum_max
Norm_inputNum_function = udf(lambda v: (float(v) - Min_v) / (Max_v - Min_v), DoubleType())
finalDF2 = finalDF2.withColumn('Norm_inputNum', Norm_inputNum_function(finalDF2.inputNum))

%pyspark
# conduct OneHotEncoder for project_index column
from pyspark.ml.feature import OneHotEncoder

finalDF2.registerTempTable("dfData")
finalDF2 = spark.sql("SELECT name, Norm_views, Norm_bytes, Norm_inputNum, project_index FROM dfData")
encoder = OneHotEncoder(dropLast=False, inputCol="project_index", outputCol="project_Vec")
encoded = encoder.transform(finalDF2)

%pyspark
encoded.registerTempTable("dfData")
finalDF3 = spark.sql("SELECT Norm_views, Norm_bytes, Norm_inputNum, project_Vec FROM dfData")

from pyspark.mllib.linalg import SparseVector
import numpy as np

# 824 should be revised according to your one-hot encoding result
RDD = finalDF3.rdd.map(lambda line: SparseVector(
    824,
    line["project_Vec"].indices.tolist() + [821, 822, 823],
    line["project_Vec"].values.tolist() + [line["Norm_views"], line["Norm_bytes"], line["Norm_inputNum"]]
)).cache()

%pyspark
from pyspark.mllib.clustering import KMeans

clusters = KMeans.train(RDD, 2, maxIterations=10, runs=10, initializationMode="k-means||")
# then we calculate word count and get (docId, count) pairs
docCountTotal = allWords.map(lambda x: (x[1], 1)).reduceByKey(add)

# we should get (docID, (dicPos, occur), docCount) pairs and get (docId, (dicPos, Freq)) pairs
docWithCount = wordsInDocSorted.join(docCountTotal).flatMap(
    lambda x: ((x[0], j, x[1][1]) for j in x[1][0]))
# print(docWithCount.take(1))
docWithFreq = docWithCount.map(
    lambda x: [x[0], (x[1][0], float(x[1][1]) / float(x[2]))]).groupByKey().map(
    lambda x: (x[0], sorted(x[1])))

# get the feature vector for each doc
docWithFreqVect = docWithFreq.map(lambda x: (x[0], featureVec(x[1])))
# print(docWithFreqVect.take(1))

# task2
# using k-means to cluster these data points
parsedData = docWithFreqVect.map(lambda x: x[1])

# Build the model (cluster the data)
model = KMeans.train(parsedData, 3, maxIterations=10, initializationMode="random")

# get class and frequency vector
regenum = re.compile('[^0-9]')
# keyWithClass = keyAndText.map(lambda x: (x[1], x[0]))
# classWithFreq = keyWithClass.join(docWithFreqVect).map(lambda x: x[1]).map(lambda x: (regenum.sub('', x[0]), x[1]))

# testing
# testLines = sc.textFile("testdata.csv")
testLines = sc.textFile(sys.argv[2], 1)

# filter data and transform it into (docId, txt) pairs
testValidLines = testLines.map(lambda x: x.split(',')).filter(lambda p: len(p) == 6)
testKeyAndText = testValidLines.map(lambda x: (x[0], x[1], x[5]))

# use a regular expression to turn the document text into a list of words
# remove all non-letter words
testKeyAndListOfWords = testKeyAndText.map(
    lambda x: (x[1], regex.sub(' ', x[2]).lower().split()))
        i += 1
    return result


# take seed data and convert to double
trainingRaw = sc.textFile("/FileStore/tables/ghwlpxtt1499907037815/seeds.txt")
trainingData = trainingRaw.map(lambda x: x.split('\t')).map(todouble)
trainingData.collect()

# COMMAND ----------

maxClus = [2, 5, 7, 10, 20, 40, 60, 100, 200, 400]
least = sys.maxint
c = 0
for val in maxClus:
    clusters = KMeans.train(trainingData, val, maxIterations=20, initializationMode="random")
    WSSSE = clusters.computeCost(trainingData)
    if least > WSSSE:
        least = WSSSE
        c = val
    print("Within Set Sum of Squared Error for " + str(val) + " is " + str(WSSSE))

print("least WSSSE is " + str(least) + " cluster size of " + str(c))

# COMMAND ----------
results_list2 = len(results[1][1])
results_list3 = len(results[2][1])
min_len = min(result_list1, results_list2, results_list3)

results_list = [[], [], []]
results_list[0] = list(results[0][1])[:(min_len - 1)]
results_list[1] = list(results[1][1])[:(min_len - 1)]
results_list[2] = list(results[2][1])[:(min_len - 1)]
datenow = (results[0][0].split(','))[1]

mat = sc.parallelize(np.column_stack(results_list))
# summary = Statistics.colStats(mat)
# print(summary.mean())

clusters = KMeans.train(mat, 3, maxIterations=10)
transformeds = clusters.predict(mat).collect()

first_group, second_group, third_group = 0, 0, 0
for transformed in transformeds:
    predict_value = int(transformed)
    if predict_value == 0:
        first_group = first_group + 1
    if predict_value == 1:
        second_group = second_group + 1
    if predict_value == 2:
        third_group = third_group + 1

cluster_nums = [first_group, second_group, third_group]
print("{} length of trans".format(len(transformeds)))
if __name__ == "__main__": sparkConf = SparkConf() sparkContext = SparkContext(conf=sparkConf) data = sparkContext\ .textFile("data/clusteringData.txt") parsed_data = data\ .map(lambda line: [float(x) for x in line.split(' ')])\ .cache() number_of_clusters = 4 number_of_iterations = 20 clusters = KMeans.train(parsed_data, number_of_clusters, number_of_iterations, initializationMode="random") def error(point): center = clusters.centers[clusters.predict(point)] return sqrt(sum([x**2 for x in (point - center)])) WSSSE = parsed_data.map(lambda point: error(point)).reduce( lambda x, y: x + y) print("Within Set Sum of Squared Error = " + str(WSSSE)) centers = clusters.clusterCenters print("Cluster Centers: ") for center in centers: print(center)
conf = SparkConf()
conf.set("spark.master", "local")
sc = SparkContext(conf=conf)

data = sc.textFile("practice6_train.csv")
trData = data.map(parseFeat)

data = sc.textFile("practice6_test.csv")
tsData = data.map(parseFeat)
tsLabel = data.map(parseLabel)

kmeans_list = []
for i in range(30):
    kmeans_list.append(KMeans.train(trData, k=10, maxIterations=100, seed=i))

obj_list = []
for i in range(30):
    obj_list.append(
        trData.map(lambda point: error(point, kmeans_list[i])).reduce(
            lambda x, y: x + y))

kmeans = kmeans_list[obj_list.index(min(obj_list))]
tsPredict = kmeans.predict(tsData)
nmi_score = NMI(list(tsPredict.collect()), list(tsLabel.collect()))

f = open('result.txt', 'w')
f.write('NMI of K-Means clustering\n')
f.write('{:.4f}'.format(nmi_score))
def buildModel():
    model = KMeans.train(features, 3, maxIterations=5, initializationMode="random")
    return model
parser.add_argument('--iterations', help='number of iterations in each training run (default=32)',
                    type=int, default=32)
parser.add_argument('--runs', help='number of training runs (default=10)',
                    type=int, default=10)
parser.add_argument('--clusters', help='number of cluster centers to find (default=128)',
                    type=int, default=128)
parser.add_argument('--config', metavar="KEY=VAL", help="add KEY=VAL to Spark's configuration",
                    action='append', default=[], dest='config')

if __name__ == "__main__":
    args = parser.parse_args()
    print(args)

    protospark = SparkSession.builder.appName("k-means-app").master(args.master)
    spark = reduce(lambda x, y: x.config(*y.split("=")), args.config, protospark).getOrCreate()

    runs = args.runs
    iterations = args.iterations
    partitions = args.partitions
    clusters = args.clusters

    sc = spark.sparkContext
    rdd = sc.textFile(args.infile).map(lambda line: fromstring(line, sep=",")).repartition(partitions)

    logger = sc._jvm.org.apache.log4j
    logger.LogManager.getLogger("org").setLevel(logger.Level.ERROR)

    start_time = clock()
    for run in range(runs):
        KMeans.train(rdd, clusters, iterations)
    end_time = clock()

    sc.stop()
    print("completed %d run%s in %f seconds" %
          (runs, (runs > 1 and "s" or ""), end_time - start_time))
import sys

import numpy as np
from pyspark import SparkContext
from pyspark.mllib.clustering import KMeans


def parseVector(line):
    vector = eval(line[1])
    return np.array([vector.get(x, 0.0) for x in range(max(vector) + 1)])


if __name__ == "__main__":
    if len(sys.argv) != 4:
        print >> sys.stderr, "Usage: kmeans <file> <k> <max_iteration>"
        exit(-1)
    sc = SparkContext(appName="PythonKMeans")
    lines = sc.sequenceFile(
        sys.argv[1],
        "org.apache.hadoop.io.LongWritable",
        "org.apache.mahout.math.VectorWritable",
        valueConverter="com.intel.sparkbench.datagen.pythonconverter.MahoutVectorToStringConverter"
    )
    data = lines.map(parseVector)
    k = int(sys.argv[2])
    max_iterations = int(sys.argv[3])
    model = KMeans.train(data, k, max_iterations)
    print "Final centers: " + str(model.clusterCenters)
from pyspark import SparkConf, SparkContext
from pyspark.mllib.clustering import KMeans
import numpy as np
from operator import add

conf = SparkConf().setMaster("local").setAppName("RatingsHistogram")
sc = SparkContext(conf=conf)

data = np.array([0.0, 0.0, 1.0, 1.0, 9.0, 8.0, 8.0, 9.0]).reshape(4, 2)
model = KMeans.train(sc.parallelize(data), 2, maxIterations=10, runs=30,
                     initializationMode="random", seed=50,
                     initializationSteps=5, epsilon=1e-4)

# silhouette score spark
labels = model.predict(sc.parallelize(data))
with_labels = sc.parallelize(data).zip(labels)
errors_in_clusters = with_labels.map(
    lambda (x, cluster_index): (cluster_index, (x - model.clusterCenters[cluster_index])**2))
cluster_counts = with_labels.map(lambda (x, y): (y, x)).countByValue()
errors_in_clusters.reduceByKey(add)
# zipByKey, divide, done
mse_array = with_cluster_centres.map(lambda (x, y): (x - y)**2).reduce(add)
    X = []
    for i in range(k):
        incomeCentroid = random.uniform(20000.0, 200000.0)
        ageCentroid = random.uniform(20.0, 70.0)
        for j in range(int(pointsPerCluster)):
            X.append([random.normal(incomeCentroid, 10000.0),
                      random.normal(ageCentroid, 2.0)])
    X = array(X)
    return X


random.seed(0)

# Load the data; note I am normalizing it with scale() - very important!
data = sc.parallelize(scale(createClusteredData(100, K)))

# Build the model (cluster the data)
clusters = KMeans.train(data, K, maxIterations=10, runs=10, initializationMode="random")

# Print out the cluster assignments
resultRDD = data.map(lambda point: clusters.predict(point)).cache()

print("Counts by value:")
counts = resultRDD.countByValue()
print(counts)

print("Cluster assignments:")
results = resultRDD.collect()
print(results)


# Evaluate clustering by computing Within Set Sum of Squared Errors
def error(point):
idf = IDF(minDocFreq=1).fit(tf)

# In[83]:

tfidf = idf.transform(tf)

# In[84]:

# tfidf.collect()

# In[85]:

if algorithm == "K":
    clusters = KMeans.train(tfidf, 8, maxIterations=20,
                            initializationMode="random", seed=42)
else:
    clusters = BisectingKMeans.train(tfidf, 8, maxIterations=20, seed=42)

clusterCenters = clusters.clusterCenters

# In[ ]:

# In[86]:

documentModel = documents1.zip(tfidf)
# cluster_broadcast = sc.broadcast(clusters)

# In[87]:
max_n_clusters = args.Max_n_components
filname = "likelihood_" + str(max_n_clusters)
if os.path.exists(filname):
    os.remove(filname)

print("Data being unloaded in numpy array")
print("data unloaded")

myfile = open(filname, "a")
z = 10
while z * 100 < max_n_clusters + 1:
    n_clusters = z * 100
    print(n_clusters)
    print("THE VALUE OF NUMBER OF CLUSTERS IS ABOVE")
    model = KMeans.train(data, n_clusters, initializationMode="k-means||", seed=50,
                         initializationSteps=5, epsilon=1e-3, maxIterations=10000)
    wssse = model.computeCost(data)
    print("Within Set Sum of Squared Errors = " + str(wssse))

    # Shows the result.
    centers = model.clusterCenters
    # print("Cluster Centers: ")
    # for center in centers:
    #     print(center)

    # responsibility_matrix, cluster_labels, loglikelihood, cluster_probability = GMMModel.resultPredict(
    #     model, data)
    # responsibility_matrix_a = responsibility_matrix.take(data_size)
import numpy as np
from pyspark import SparkContext
from pyspark.mllib.clustering import KMeans, KMeansModel

sc = SparkContext("local", "My Simple App")

data = sc.textFile("/home/macuser/train.csv")
# skip header
data = data.filter(lambda line: line[0] != 'l')
parsed = data.map(
    lambda line: np.array([float(x) for x in line.split(',')[1:]]))

clusters = KMeans.train(parsed, 10, maxIterations=10, runs=1,
                        initializationMode="random")


def error(pt):
    center = clusters.centers[clusters.predict(pt)]
    return np.sqrt(sum([x**2 for x in (pt - center)]))


wssse = parsed.map(lambda pt: error(pt)).reduce(lambda x, y: x + y)
print("Within set sum of squared error: %s" % wssse)
    .builder \
    .master("local") \
    .appName("Python Spark SQL basic example") \
    .config("spark.default.parallelism", "80") \
    .config("spark.driver.memory", "8g") \
    .config("spark.executor.memory", "8g") \
    .config("spark.speculation", "true") \
    .config("spark.local.dir", "/opt/tmp") \
    .getOrCreate()

sc = spark.sparkContext

# data = sc.textFile("file:///opt/workspace/tgtag0528.csv")
df = spark.read.csv("file:///opt/workspace/tgtag0528.csv")
df3 = df.select("_c1", "_c4", "_c8").dropna().rdd.map(lambda x: array(list(x)))

clusters = KMeans.train(df3, 3, maxIterations=10, initializationMode="random")
print clusters.clusterCenters

# Evaluate clustering by computing Within Set Sum of Squared Errors
# def error(point):
#     center = clusters.centers[clusters.predict(point)]
#     return sqrt(sum([x**2 for x in (point - center)]))
# WSSSE = df3.map(lambda point: error(point)).reduce(lambda x, y: x + y)
print("Within Set Sum of Squared Error = " + str(clusters.computeCost(df3)))

k = 100
for i in range(2, k):
    clusters = KMeans.train(df3, i, maxIterations=100)
    print("%d class: Within Set Sum of Squared Error = " % (i) + str(clusters.computeCost(df3)))
import numpy as np
from pyspark.mllib.clustering import KMeans

images = td.images.frombinary('/user/ds/neuro/fish-long', order='F', engine=sc)
series = images.toseries()
normalized = series.normalize(method='mean')

stddevs = (normalized.map(lambda s: s.std()).sample(1000))
plt.hist(stddevs.values, bins=20)
plt.plot(normalized.filter(lambda s: s.std() >= 0.1).sample(50).values.T)

# perform k-means on the normalized series
ks = [5, 10, 15, 20, 30, 50, 100, 200]
models = []
for k in ks:
    models.append(KMeans.train(normalized.values._rdd.values(), k))


# define a couple of functions to score the clustering quality
def model_error_1(model):
    def series_error(series):
        cluster_id = model.predict(series)
        center = model.centers[cluster_id]
        diff = center - series
        return diff.dot(diff)**0.5

    return normalized.map(series_error).toarray().sum()


def model_error_2(model):
    return model.computeCost(normalized.values._rdd.values())