def perform_training(sc: SparkContext, params_dict: dict):
    normal_ekg_data_path = params_dict.get('normal_ekg_data_path')
    min_num_of_clusters = int(params_dict.get('min_num_of_clusters', 5))
    max_num_of_clusters = int(params_dict.get('max_num_of_clusters', 20))
    # boundary_ratio is a ratio in (0, 1], so parse it as a float, not an int
    boundary_ratio = float(params_dict.get('boundary_ratio', 0.8))

    ekg_rdd_data = sc.textFile(normal_ekg_data_path).map(
        lambda line: np.array([float(val) for val in line.split(',')]))
    # ekg_rdd_data.foreach(Plotter.plot_signal_window)

    k_range = range(min_num_of_clusters, max_num_of_clusters)
    prev_cost = float(np.inf)
    final_km = KMeansModel(ekg_rdd_data.takeSample(False, 1))
    cost_ratios = []
    found_best = False
    for k in k_range:
        km = KMeans.train(ekg_rdd_data, k)
        # cost is the sum of squared distances of samples to the nearest cluster center
        cost = km.computeCost(ekg_rdd_data)
        ratio = cost / prev_cost
        prev_cost = cost
        cost_ratios.append(ratio)
        # keep the first model whose cost improvement flattens out (elbow heuristic)
        if ratio > boundary_ratio and not found_best:
            final_km = km
            found_best = True
    Plotter.plot_elbow(cost_ratios, k_range)
    return final_km
def kmeans():
    """
    Run K-means (via MLlib) on the sample dataset shipped with the Spark
    installation. Because KMeans.train expects
    "Training points as an `RDD` of `Vector` or convertible",
    the dataset must be parsed first:
        raw dataset    --> ['0.0 0.0 0.0', '0.1 0.1 0.1', '0.2 0.2 0.2']
        parsed dataset --> [array([0., 0., 0.]), array([0.1, 0.1, 0.1]), array([0.2, 0.2, 0.2])]
    :return: the trained model, reloaded from disk
    """
    data_rdd = sc.textFile('{}/mllib/kmeans_data.txt'.format(current_dir))
    parsed_data_rdd = data_rdd.map(lambda line: array([float(x) for x in line.split(' ')]))

    # Build the clustering model
    clusters = KMeans.train(parsed_data_rdd, 2, maxIterations=10,
                            initializationMode="random")

    # Evaluate clustering by computing Within Set Sum of Squared Errors
    def error(point):
        center = clusters.centers[clusters.predict(point)]
        return sqrt(sum([x ** 2 for x in (point - center)]))

    WSSSE = parsed_data_rdd.map(lambda point: error(point)).reduce(lambda x, y: x + y)
    print("Within Set Sum of Squared Error = " + str(WSSSE))

    # Save the trained model, then load it back
    model_path = "{}/kmeans_model".format(current_dir)
    if not os.path.exists(model_path):
        clusters.save(sc, model_path)
    trained_model = KMeansModel.load(sc, model_path)
    return trained_model
def logo_feature_cluster(train_feature_list, train_name_list, clusternum):
    # Training
    model = KMeans.train(sc.parallelize(train_feature_list), clusternum,
                         maxIterations=10, initializationMode="random",
                         seed=50, initializationSteps=5, epsilon=1e-4)
    model_path = tempfile.mkdtemp()
    model.save(sc, model_path)
    model = KMeansModel.load(sc, model_path)
    # Prediction
    predict = model.predict(sc.parallelize(train_feature_list))
    # print(predict.collect())
    try:
        rmtree(model_path)
    except OSError:
        pass
    logo_result_path = os.path.join(
        result_path, "logo_image_result" + str(clusternum) + ".txt")
    writeResultTofile(logo_result_path, train_name_list, predict.collect())
    # Calinski-Harabasz clustering evaluation metric
    # evaluationCH = metrics.calinski_harabaz_score(train_feature_list, predict.collect())
    # ch = str(round(evaluationCH, 2))
    # print("Calinski-Harabasz score: " + ch)
    # with open(result_path + "Calinski-Harabasz.txt", 'a') as a:
    #     a.write(str(clusternum) + ":" + ch + "\n")
    # Silhouette-Coefficient clustering evaluation metric
def loadModel():
    clusterModel = KMeansModel.load(sc, pv.clusterModelPath)
    classificationModel = DecisionTreeModel.load(sc, pv.classificationModelPath)
    if pv.outputDebugMsg:
        Utils.logMessage("\nLoad cluster & classification model finished")
    return clusterModel, classificationModel
def kmeans_w2v_predict():
    # appName = 'kmeans_w2v_predict'
    # sc = SparkContext(appName=appName)
    # from pyspark.sql import SQLContext
    from pyspark.mllib.clustering import KMeans, KMeansModel
    # sqlContext = SQLContext(sc)
    data = sqlContext.read.parquet("hdfs:///user/rmusters/lambert_w2v_data_jan")
    # df = data.toDF("text", "filtered_text", "split_text", "vectors", "id")
    df = data.toDF("tokens", "vectors", "id")
    df = df.where(df.vectors.isNotNull())
    data = df.rdd
    model = KMeansModel.load(sc, "hdfs:///user/rmusters/lambert_kmeans_w2v_jan")
    # data = data.map(lambda (text, filtered_text, split_text, vectors, id): (text, filtered_text, split_text, vectors, model.predict(vectors), id))
    # df = data.toDF(["text", "filtered_text", "split_text", "vectors", "cluster", "id"])
    # Python 2 tuple-parameter lambda: predict a cluster for every row
    data = data.map(lambda (tokens, vectors, id): (tokens, vectors, model.predict(vectors), id))
    df = data.toDF(["tokens", "vectors", "cluster", "id"])
    df = df.select("cluster", "id")
    df = df.sort(df.cluster.asc())
    df.write.format("com.databricks.spark.csv").mode("overwrite").save("lambert_w2v_data_cluster.csv")
    # df.save("hdfs:///user/rmusters/lambert_w2v_data_cluster.csv", "com.databricks.spark.csv")
    df.write.parquet("hdfs:///user/rmusters/lambert_w2v_data_cluster", mode="overwrite")
def model_instream(sc, **params):
    fs = sc._jvm.org.apache.hadoop.fs.FileSystem.get(sc._jsc.hadoopConfiguration())
    model_path = HDFS_PATH + str(g_cache.user) + '/model/' + params['path']
    if not fs.exists(sc._jvm.org.apache.hadoop.fs.Path(model_path)):
        raise Exception("Invalid file path, path does not exist!")
    if params['type'] == 'kmeans':
        model = KMeansModel.load(sc, model_path)
    elif params['type'] == 'fpgrowth':
        model = FPGrowthModel.load(sc, model_path)
    elif params['type'] == 'logistic-regression':
        model = LogisticRegressionModel.load(sc, model_path)
    elif params['type'] == 'word2vec':
        model = Word2VecModel.load(sc, model_path)
    elif params['type'] == 'decision-tree':
        model = DecisionTreeModel.load(sc, model_path)
    else:
        raise Exception("Invalid model type!")
    return True, model
def getCluster(price, crime, male, female, white, black, asian, hispanic,
               young, mid_age, senior):
    KModel = KMeansModel.load(sc, "project/data/output/KMeansModel")
    cluster = KModel.predict([price, crime, male, female, white, black, asian,
                              hispanic, young, mid_age, senior])
    return cluster
def KMeans_Processing(self, columns):
    data_point = np.array(self.df_PD[columns])
    model = KMeansModel.load(self.sc, self.baseDir + '/fraudModel/Model/' + 'KMeans')
    result = np.array(model.predict(self.sc.parallelize(data_point)).collect())
    self.df_PD.insert(len(list(self.df_PD.columns)), 'KMeans_feature', result)
def assign_pooling(data):
    image_name, feature_matrix = data[0]
    clusterCenters = data[1]
    feature_matrix = np.array(feature_matrix)
    model = KMeansModel(clusterCenters)
    bow = np.zeros(len(clusterCenters))
    # max-pool the distance to the assigned center per visual word
    for x in feature_matrix:
        k = model.predict(x)
        dist = distance.euclidean(clusterCenters[k], x)
        bow[k] = max(bow[k], dist)
    clusters = bow.tolist()
    group = clusters.index(min(clusters)) + 1
    return [image_name, group]
def assign_pooling(row, clusterCenters, pooling):
    image_name = row['fileName']
    feature_matrix = np.array(row['features'])
    clusterCenters = clusterCenters.value  # clusterCenters is a Spark broadcast variable
    model = KMeansModel(clusterCenters)
    bow = np.zeros(len(clusterCenters))
    for x in feature_matrix:
        k = model.predict(x)
        dist = distance.euclidean(clusterCenters[k], x)
        if pooling == "max":
            bow[k] = max(bow[k], dist)
        elif pooling == "sum":
            bow[k] = bow[k] + dist
    clusters = bow.tolist()
    group = clusters.index(min(clusters)) + 1
    # print(image_name + " in group: " + str(group))
    return [(image_name, group)]
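A minimal usage sketch for the assign_pooling variant above; the names sc, rows_rdd, and centers are placeholders (not from the original snippet). The centers are broadcast once so the function's clusterCenters.value lookup works on every executor.

# Hypothetical driver-side usage of assign_pooling (assumes an existing
# SparkContext `sc`, an RDD `rows_rdd` of dicts with 'fileName'/'features',
# and `centers`, a list of numpy arrays from a trained k-means model)
centers_bc = sc.broadcast(centers)
image_groups = rows_rdd.map(lambda row: assign_pooling(row, centers_bc, pooling="max"))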
def main():
    sc = SparkContext(appName="tileMapper")
    print("I do all the input output jazz")
    ###########################################################################
    big_image = sc.binaryFiles("Reference/108103_sm.jpg")
    tile_avgs = big_image.flatMap(extract_opencv_tiles())
    # buckets = tile_avgs.collect()
    # print("Bucket", buckets)
    tileMap = tile_avgs.map(lambda l: [item for sublist in l for item in sublist])
    tileList = tileMap.collect()
    print("Tile List", tileList)
    print("Tile List type", type(tileList))
    ############################################################################
    clusterIndex = getIndex()
    kmModel = KMeansModel.load(sc, "myModelPath")
    readyToCombine = []
    currentRow = None
    noOfRow = 0
    noOfCol = 0
    firstTile = tileList[0]
    tileSize = firstTile[1]
    # Pick a small image for each tile via the k-means match
    for tile in tileList:
        if tile[0] == currentRow:
            noOfCol = noOfCol + 1
        else:
            currentRow = tile[0]
            noOfCol = 1
            noOfRow = noOfRow + 1
        smallImg = findSmallImage(kmModel, [tile[4], tile[5], tile[6]],
                                  tileSize, clusterIndex)
        readyToCombine.append(smallImg)
    # Paste the small images onto the big image canvas
    canvas = np.zeros((noOfRow * tileSize, noOfCol * tileSize, 3), np.uint8)
    print("No. of Col", noOfCol)
    print("No. of Row", noOfRow)
    # print("Before Print, Check Once again", readyToCombine)
    mosaicImage = printImage(readyToCombine, canvas, noOfCol, noOfRow, tileSize)
    print("Finished processing of image")
    cv2.imwrite('mosaicImageYeah.jpg', mosaicImage)
def run_kmeans(sc):
    cpu_count = multiprocessing.cpu_count()
    # Load the model
    sameModel = KMeansModel.load(sc, INPUT_MODEL)
    centers = sameModel.clusterCenters
    print("Cluster Centers: ")
    # Write each center to its own CSV file
    for n, center in enumerate(centers):
        out_f = OUTPUT_DATA + str(n) + "Cluster.csv"
        numpy.savetxt(out_f, center, newline=";")
        print(center)
def main():
    modelname = sys.argv[1]
    tiffname = sys.argv[2]
    outputname = sys.argv[3]
    sc = SparkContext()
    model = KMeansModel.load(sc, modelname)
    dataset = gdal.Open(tiffname, GA_ReadOnly)
    x, y, data = train.tiff_to_array(dataset, train.weights)
    driver = dataset.GetDriver().ShortName
    clusterdata = sc.parallelize(data)
    result = np.array(clusterdata.map(lambda point: model.predict(point)).collect())
    write_to_tif(outputname, x, y, result, driver)
def assign_pooling(data):
    row = data[0]
    clusterCenters = data[1]
    pooling = data[2]
    image_name = row['fileName']
    feature_matrix = np.array(row['features'])
    model = KMeansModel(clusterCenters)
    bow = np.zeros(len(clusterCenters))
    for x in feature_matrix:
        k = model.predict(x)
        dist = distance.euclidean(clusterCenters[k], x)
        if pooling == "max":
            bow[k] = max(bow[k], dist)
        elif pooling == "sum":
            bow[k] = bow[k] + dist
    clusters = bow.tolist()
    group = clusters.index(min(clusters)) + 1
    return [image_name, group]
def kmeans_classification(sc, c_tag, util):
    print 'data retrieving'
    _data_ = data_retriever(c_tag)
    print len(_data_)
    # rids = [x[0] for x in _data_]
    blc = nlpb.nlpblockbase()
    __ans = [train_feature_extraction(x, c_tag, blc, util) for x in _data_]
    ans = [[float(k) for k in x] for x in __ans]
    # print ans
    train_data = [np.array(sf.softmax(x)) for x in ans]
    # train_data = ans
    print 'dataprep done'
    assert_len(train_data)
    brotrain = sc.broadcast(train_data)
    clusters = KMeans.train(sc.parallelize(brotrain.value), 200,
                            maxIterations=10, initializationMode="random")

    def error(point):
        center = clusters.centers[clusters.predict(point)]
        return sqrt(sum([x**2 for x in (point - center)]))

    # WSSSE is computed locally here rather than on the RDD
    WSSSE = map(lambda point: error(point), train_data)
    WSSSE = reduce(lambda x, y: x + y, WSSSE)
    print("Within Set Sum of Squared Error = " + str(WSSSE))

    clustered = collections.defaultdict(list)
    print len(_data_)
    cnt = 0
    for row in _data_:
        clustered[clusters.predict(train_data[cnt])].append(
            [row[0], row[2], row[3], row[4]])
        cnt += 1
    print len(clustered)
    for k in clustered.keys():
        print len(clustered[k])

    clusters.save(sc, "MovieModel")
    sameModel = KMeansModel.load(sc, "MovieModel")
    ref = collections.defaultdict(list)
    for point in train_data:
        ref[sameModel.predict(point)].append(point)
    # sanity check: the reloaded model reproduces the original assignments
    for x, y in zip(ref.keys(), clustered.keys()):
        assert len(ref[x]) == len(clustered[y])
    return clustered
def kmeansInitialClusters(dataset):
    model = KMeansModel(CENTER_VECTORS)
    # coerce the stored features into mllib Vectors via a stringify/parse round trip
    vectorsRdd = dataset.rdd.map(lambda data: Vectors.parse(Vectors.stringify(data['features'])))
    trainedModel = KMeans.train(vectorsRdd, 4, maxIterations=1000, initialModel=model)
    result = []
    for d in dataset.collect():
        entry = {}
        entry["features"] = d["features"]
        entry["prediction"] = trainedModel.predict(Vectors.parse(Vectors.stringify(d['features'])))
        entry["label"] = d['label']
        result.append(entry)
    plotDiversitySizeClustering(result, CENTERS, "Size", "Diversity",
                                "Song Analysis by Size and Diversity with Initial Clusters")
    centroidArtistSongCount(result, CENTERS)
def main(sc):
    data = [[1.0, 1.0], [1.0, 0.8], [-1.0, 1.0], [-1.0, -1.0]]
    parsedData = sc.parallelize(data)
    kmeansModel = KMeans.train(parsedData, 2, maxIterations=10, runs=10,
                               initializationMode="random")
    print(kmeansModel.predict([1.0, 1.0]))
    print(kmeansModel.predict([1.0, -2.0]))
    # Save and load model
    kmeansModel.save(sc, "KMeansModel")
    model = KMeansModel.load(sc, "KMeansModel")
    print(model.predict([1.0, 1.0]))
    print(model.predict([1.0, -2.0]))
def print_model(self, model_name):
    # try to load the specified model
    path = self.base + model_name
    try:
        model = KMeansModel.load(self.sc, path)
    except Exception:
        raise Exception('No such model found on hdfs!')
    # print the raw centers, then the centers rounded to two decimals
    for c in model.clusterCenters:
        print(c)
    for c in model.clusterCenters:
        l = []
        for i in c:
            i = decimal.Decimal(i).quantize(decimal.Decimal('0.01'))
            l.append(float(i))
        print(l)
def run_kmeans(sc):
    cpu_count = multiprocessing.cpu_count()
    # Load data
    dataset = sc.textFile(INPUT_DATA, cpu_count)
    dataset = dataset.map(lambda line: array([float(x) for x in line.split(';')]))
    # Load model
    sameModel = KMeansModel.load(sc, INPUT_MODEL)
    # Predict a cluster label per row
    labels = sameModel.predict(dataset).collect()
    # Save labels to a JSON file
    with open(OUTPUT_LABEL, 'w') as out_f:
        json.dump(labels, out_f)
def predict():
    from pyspark.mllib.clustering import KMeans, KMeansModel
    from pyspark.mllib.linalg import Vectors
    data = sqlContext.read.format("com.databricks.spark.csv").option(
        "header", "true").load("w2v_vector.csv")
    data = data.map(lambda x: [float(a) for a in x])
    df = data.toDF()
    columns = df.columns
    vectors = df.select(columns[1:71])
    vectors = vectors.map(lambda x: Vectors.dense(x))
    for n_clusters in _range:
        model = KMeansModel.load(sc, "hdfs:///user/rmusters/w2v_model_kmeans_" + str(n_clusters))
        predicted = model.predict(vectors)
        result = predicted.map(lambda x: (x,)).toDF()
        result.save("clusters_" + str(n_clusters))
def kmeans_lda_predict():
    appName = 'kmeans_lda_predict'
    from pyspark.mllib.clustering import KMeans, KMeansModel
    from pyspark.mllib.clustering import BisectingKMeans, BisectingKMeansModel
    sc = SparkContext(appName=appName)
    from pyspark.sql import SQLContext
    sqlContext = SQLContext(sc)
    data = sqlContext.read.parquet("hdfs:///user/rmusters/lda_doc_topic")  # lda_data_jan
    # data = df.rdd
    model = KMeansModel.load(sc, "hdfs:///user/rmusters/kmeans_lda_jan")
    # Python 2 tuple-parameter lambda: attach the predicted cluster to each row
    data = data.map(lambda (id, vectors): (id, vectors, model.predict(vectors)))
    df = data.toDF(["id", "vectors", "cluster"])
    df = df.sort(df.cluster.asc())
    df.write.parquet("hdfs:///user/rmusters/lda_data_cluster", mode="overwrite")
    logger.info(appName)
def predict(self, model_name, data):
    '''
    Predict the cluster for an unseen data point.
    :param model_name: the name of a trained model saved on HDFS
    :param data: the unseen data point
    :return: (cluster_index, cluster_center)
    '''
    # try to load the specified model
    path = self.base + model_name
    try:
        model = KMeansModel.load(self.sc, path)
    except Exception:
        raise Exception('No such model found on hdfs!')
    # predict: which cluster the point belongs to
    index = model.predict(data)
    print('Data:%s belongs to cluster:%s. The index is %s'
          % (data, model.clusterCenters[index], index))
    return index, model.clusterCenters[index]
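A quick usage sketch for the predict method above; the instance name clusterer, the model name 'my_kmeans', and the sample point are hypothetical.

# Hypothetical call: `clusterer` is an instance of the class defining
# predict(); 'my_kmeans' must exist under clusterer.base on HDFS.
index, center = clusterer.predict('my_kmeans', [0.5, 1.2, -0.3])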
def train_rotations(sc, split_vecs, M, Cs):
    """
    Compute a rotation for each split of the data, using the given coarse quantizers.
    """
    Rs = []
    mus = []
    counts = []
    for split in xrange(2):
        print 'Starting rotation fitting for split %d' % split
        # Get the data for this split
        data = split_vecs.map(lambda x: x[split])
        # Build a k-means model from the coarse quantizer's centers
        model = KMeansModel(Cs[split])
        R, mu, count = compute_local_rotations(sc, data, model, M / 2)
        Rs.append(R)
        mus.append(mu)
        counts.append(count)
    return Rs, mus, counts
def parseOwnership(line):
    fields = line.split(',')
    # print len(fields)
    owner = fields[0]
    taxes = int(fields[1])
    lat = float(fields[2])
    lon = float(fields[3])
    cluster = int(fields[4])
    return (owner, taxes, lat, lon, cluster)

conf = SparkConf().setMaster("local").setAppName("tugasbigdata")
sc = SparkContext(conf=conf)
clusters = KMeansModel.load(sc, "C:/SparkCourse/FP_model")
lines = sc.textFile("file:///SparkCourse/data_center.csv")
parsedLines = lines.map(parseLine)
# clean up rows with missing fields
reserveddata = parsedLines.filter(lambda x: x is not None)
reserveddata1 = reserveddata.filter(lambda x: x[1] is not None)
reserveddata2 = reserveddata1.filter(lambda x: x[2] is not None)
temp = reserveddata2.filter(lambda x: "San Francisco" in x[0])
tempdata = temp.map(lambda x: (x[1], x[2]))
data = tempdata.map(lambda x: (float(x[0]), float(x[1])))
data_local = data.collect()
ownershipData = lines.map(parseOwnership)  # was parseOwner, which is undefined
# more cleanup
def main():
    data_rdd = load_data(project["data_file"])
    print(data_rdd.count())
    listed_data_rdd = data_rdd.map(data_extractor)

    # Filter unwanted data rows
    elect_filtered_rdd = listed_data_rdd.filter(electric_vehicles_filter)
    filtered_rdd = elect_filtered_rdd.filter(empty_cost_filter)

    # Map related data to a format convenient for clustering
    cost_tx_rdd = filtered_rdd.map(cost_transform_mapper)
    feature_mapped_rdd = cost_tx_rdd.map(feature_mapper)

    estimated_clusters = KMeansModel.load(spark_context, "identified_clusters")
    optimum_cluster = 4
    optimum_points_rdd = feature_mapped_rdd.filter(
        lambda filtered_data_feature_vector:
        estimated_clusters.predict(filtered_data_feature_vector[-1]) == optimum_cluster
    )
    print(optimum_points_rdd.count())

    sample_data = optimum_points_rdd.take(100)
    for data in sample_data:
        feature_vector = data[-1]
        pyplot.scatter(feature_vector[0], feature_vector[1])

    optimum_cluster_manufactures_rad = optimum_points_rdd.map(manufactures_mapper)
    optimum_points_rdd.persist()  # hold the previously calculated data set in memory
    individual_manufactures_count_rdd = optimum_cluster_manufactures_rad.reduceByKey(operator.add)
    sorted_manufactures_count_rdd = individual_manufactures_count_rdd.sortBy(
        lambda manufactures_set: manufactures_set[1], ascending=False
    )

    top_ten = 10
    vehicle_count = []
    manufactures_name = []
    for manufacture in sorted_manufactures_count_rdd.take(top_ten):
        vehicle_count.append(manufacture[1])
        manufactures_name.append(manufacture[0])
        print(manufacture)

    pyplot.title("Best Vehicle Cluster")
    pyplot.xlabel("Europe Rating")
    pyplot.ylabel("Feature Normalized")
    pyplot.show()

    number_of_manufactures = top_ten  # sorted_manufactures_count_rdd.count()
    index = np.arange(number_of_manufactures)
    bar_width = 0.5
    opacity = 0.4
    error_config = {"ecolor": "0.3"}
    chart = pyplot.bar(index, vehicle_count, bar_width, alpha=opacity,
                       color="b", error_kw=error_config, label="manufactures")
    pyplot.xticks(index + bar_width, manufactures_name)
    pyplot.title("Top performing vehicles")
    pyplot.xlabel("manufactures")
    pyplot.ylabel("Vehicles count")
    pyplot.show()
# print np.shape(parsedData)

# Build the model (cluster the data) and evaluate the clustering by computing
# the Within Set Sum of Squared Errors for k = 2..6
def error(point):
    center = clusters.centers[clusters.predict(point)]
    # print center
    return sqrt(sum([x**2 for x in (point - center)]))

# WSSE = parsedData.map(lambda point: error(point)).reduce(lambda x, y: x + y)
WSSE = np.zeros(7)  # indexed by k (2..6); np.zeros(5) would overflow at k=5
import time
for i in range(2, 7):
    t = time.time()
    clusters = KMeans.train(parsedData, i, maxIterations=100, runs=100,
                            initializationMode="random")
    WSSE[i] = parsedData.map(lambda point: error(point)).reduce(lambda x, y: x + y)
    print str(WSSE[i]) + " " + str(i) + " WITH TIME = " + str(time.time() - t)

# WSSSE = parsedData.map(lambda point: error(point)).reduce(lambda x, y: x + y)
# print("Within Set Sum of Squared Error = " + str(WSSSE))

# Save and load model
clusters.save(sc, "./mymodel1")
sameModel = KMeansModel.load(sc, "./mymodel1")
# print clusters
from numpy import array
from math import sqrt

from pyspark import SparkContext
# $example on$
from pyspark.mllib.clustering import KMeans, KMeansModel
# $example off$

if __name__ == "__main__":
    sc = SparkContext(appName="KMeansExample")  # SparkContext

    # $example on$
    # Load and parse the data
    data = sc.textFile("data/mllib/kmeans_data.txt")
    parsedData = data.map(lambda line: array([float(x) for x in line.split(' ')]))

    # Build the model (cluster the data)
    clusters = KMeans.train(parsedData, 2, maxIterations=10, initializationMode="random")

    # Evaluate clustering by computing Within Set Sum of Squared Errors
    def error(point):
        center = clusters.centers[clusters.predict(point)]
        return sqrt(sum([x**2 for x in (point - center)]))

    WSSSE = parsedData.map(lambda point: error(point)).reduce(lambda x, y: x + y)
    print("Within Set Sum of Squared Error = " + str(WSSSE))

    # Save and load model
    clusters.save(sc, "target/org/apache/spark/PythonKMeansExample/KMeansModel")
    sameModel = KMeansModel.load(sc, "target/org/apache/spark/PythonKMeansExample/KMeansModel")
    # $example off$

    sc.stop()
import sys
from collections import OrderedDict
from numpy import array
from math import sqrt

from pyspark import SparkConf, SparkContext
from pyspark.mllib.clustering import KMeansModel

if __name__ == "__main__":
    if len(sys.argv) != 2:
        print "Usage: /path/to/spark/bin/spark-submit --driver-memory 2g " + \
            "predict.py kddcup.data.file"
        sys.exit(1)

    data_file = sys.argv[1]
    # a trailing "\" followed by a comment is a syntax error, so keep the option inline
    conf = SparkConf().setAppName("KDDCup99")  # .set("spark.executor.memory", "2g")
    sc = SparkContext(conf=conf)

    model = KMeansModel.load(sc, "best_model")
    clusters = model.clusterCenters

    with open(data_file) as file:
        for line in file:
            line_split = line.split(",")
            clean_line_split = [line_split[0]] + line_split[4:]
            clusterIndex = model.predict(array([float(x) for x in clean_line_split]))
            print clusterIndex

    print
    print "DONE!"
from numpy import array
from math import sqrt
from pyspark import SparkContext, SparkConf
from pyspark.mllib.clustering import KMeans, KMeansModel

# Load and parse the data
conf = SparkConf().setMaster("local").setAppName("Test")
sc = SparkContext(conf=conf)
data = sc.textFile("data/mllib/kmeans_data.txt")
parsedData = data.map(lambda line: array([float(x) for x in line.split(' ')]))

# Build the model (cluster the data)
clusters = KMeans.train(parsedData, 2, maxIterations=20, runs=10,
                        initializationMode="random")

# Evaluate clustering by computing Within Set Sum of Squared Errors
def error(point):
    center = clusters.centers[clusters.predict(point)]
    return sqrt(sum([x**2 for x in (point - center)]))

WSSSE = parsedData.map(lambda point: error(point)).reduce(lambda x, y: x + y)
print("Within Set Sum of Squared Error = " + str(WSSSE))

# Save and load model
clusters.save(sc, "myModelPath")
sameModel = KMeansModel.load(sc, "myModelPath")
def process(id, content):
    kmeansDoc = DocumentKmeans(id, '')
    print(type(content))
    kmeansDoc.content = content
    kmeansDoc.doc2vec()
    kmeansDoc.printVec()
    kmeansDoc.kmeansVec()
    kmeansDoc.cluster_id = kmeansPredict.predict(kmeansDoc)
    return kmeansDoc

utils = Utils(45000)
sc = SparkContext(appName="PythonKafkaConsumerKmeans")
kmeansModel = KMeansModel.load(sc, '../KmeansModel')
kmeansPredict = KmeansPredict(kmeansModel)

consumer = KafkaConsumer('test', group_id='kafka_consumer_group',
                         bootstrap_servers=['localhost:9092'])
for message in consumer:
    value = message.value
    spl = value.split(':')
    id = spl[0]
    content = utils.normalizeString(spl[1])
    docu = process(id, content)
    print(docu.cluster_id)

# extra consumer configuration examples (not wired to the loop above)
KafkaConsumer(auto_offset_reset='earliest', enable_auto_commit=False)
KafkaConsumer(value_deserializer=lambda m: json.loads(unicode(m, "utf8")))
from numpy import array
from math import sqrt

# Load and parse input data
data = sc.textFile("data/kmeans_data.txt")
parsedData = data.map(lambda line: array([float(x) for x in line.split(' ')]))

# Build and train the model: K=2, 10 iterations
clusters = KMeans.train(parsedData, 2, 10)

# Evaluate the clustering
def error(point):
    center = clusters.centers[clusters.predict(point)]
    return sqrt(sum([x**2 for x in (point - center)]))

WSSSE = parsedData.map(lambda point: error(point)).reduce(lambda x, y: x + y)
print("Within Set Sum of Squared Error = " + str(WSSSE))

# Saving and loading the model (the bare expressions below are REPL inspections)
clusters.save(sc, "MyModels")
sameModel = KMeansModel.load(sc, "MyModels")
sameModel
sameModel.k
sameModel.clusterCenters
def kmeans_model_load(self, sc, path):
    return KMeansModel.load(sc, path)
def __init__(self, spark_context, model_path):
    self.model = KMeansModel.load(spark_context, model_path)
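A minimal sketch of the thin-wrapper pattern the constructor above belongs to; the class name ClusterPredictor, the model path, and the sample point are illustrative only, not from the source.

# Hypothetical wrapper class built around the constructor above
from pyspark.mllib.clustering import KMeansModel

class ClusterPredictor(object):
    def __init__(self, spark_context, model_path):
        self.model = KMeansModel.load(spark_context, model_path)

    def predict(self, point):
        # delegate straight to the loaded MLlib model
        return self.model.predict(point)

# predictor = ClusterPredictor(sc, "hdfs:///models/kmeans")
# print(predictor.predict([1.0, 2.0]))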
input_living_index = sys.argv[1]

# Read the parquet data and convert to an RDD
parquet_living_index = sqlContext.read.parquet(input_living_index)
parquet_living_index.registerTempTable("living_index_table")
living_index_table = sqlContext.sql("SELECT * FROM living_index_table")
living_index_rdd = living_index_table.map(lambda colName: (
    str(colName.Community_Code) + "," + str(colName.Crime_Frequency) + "," +
    str(colName.Housing_Crowded) + "," + str(colName.Household_BPL) + "," +
    str(colName.Unemployed) + "," + str(colName.Without_Diploma) + "," +
    str(colName.Age_Bar) + "," + str(colName.Per_Capita_Income) + "," +
    str(colName.Hardship_Index)))

# K-means does multiple runs to find optimal cluster centers, so cache its input
cluster_input = living_index_rdd.map(lambda line: array([float(x) for x in line.split(',')])).cache()

# Perform K-means clustering
clusters = KMeans.train(cluster_input, 20, maxIterations=5, runs=5,
                        initializationMode="random")

# Sum each point's distance to its cluster center
def squared_error(point):
    center = clusters.centers[clusters.predict(point)]
    return sqrt(sum([x**2 for x in (point - center)]))

error = cluster_input.map(lambda point: squared_error(point)).reduce(lambda x, y: x + y)
print("Squared error for a cluster = " + str(error))

# Save the cluster model, then load it back
clusters.save(sc, "myModel/living-index")
sameModel = KMeansModel.load(sc, "myModel/living-index")
from pyspark.mllib.clustering import KMeans, KMeansModel
from numpy import array
from math import sqrt
from pyspark import SparkContext

sc = SparkContext(appName="AA_kmeans")

# Load and parse the data
data = sc.textFile("hdfs://namenode/kmeans_data.txt")
# print(data.take(5))
# print data.map(lambda line: array([x for x in line.split(' ')])).collect()
parsedData = data.map(lambda line: array([float(x) for x in line.split(' ')]))

# Build the model (cluster the data)
clusters = KMeans.train(parsedData, 2, maxIterations=10, runs=10,
                        initializationMode="random")

# Evaluate clustering by computing Within Set Sum of Squared Errors
def error(point):
    center = clusters.centers[clusters.predict(point)]
    return sqrt(sum([x**2 for x in (point - center)]))

WSSSE = parsedData.map(lambda point: error(point)).reduce(lambda x, y: x + y)
print("Within Set Sum of Squared Error = " + str(WSSSE))

# Save and load model
clusters.save(sc, "hdfs://namenode/myModelPath")
sameModel = KMeansModel.load(sc, "hdfs://namenode/myModelPath")
from numpy import array
from math import sqrt
from pyspark.streaming import StreamingContext
from pyspark.streaming.kafka import KafkaUtils
from pyspark_cassandra import streaming

def predict(line):
    spl = line.split(':')
    doc = DocumentKmeans(spl[0], spl[1], spl[2])
    doc.kmeansVec()
    doc.cluster_id = kmeansPredict.predict(doc)
    return doc

sc = SparkContext()
sameModel = KMeansModel.load(sc, "../KmeansModel")
kmeansPredict = KmeansPredict(sameModel)

sc.textFile('hdfs://localhost:8020/user/manh/vector') \
    .filter(lambda x: len(x) > 2000) \
    .map(lambda x: predict(x)) \
    .map(lambda x: {
        'id': x.id,
        'cluster_id': int(x.cluster_id),
        'timestamps': long(x.timestamps),
        'vector': x.vector
    }).saveToCassandra('reishi', 'dockmeans')
from pyspark.mllib.clustering import KMeans, KMeansModel
from numpy import array
from math import sqrt

# Load and parse the data
data = sc.textFile("/home/grijesh/sampleData/k-means-data.txt")
parsedData = data.map(lambda line: array([float(x) for x in line.split(' ')]))

# Build the model (cluster the data)
clusters = KMeans.train(parsedData, 2, maxIterations=10, runs=10,
                        initializationMode="random")

# Evaluate clustering by computing Within Set Sum of Squared Errors
def error(point):
    center = clusters.centers[clusters.predict(point)]
    return sqrt(sum([x**2 for x in (point - center)]))

WSSSE = parsedData.map(lambda point: error(point)).reduce(lambda x, y: x + y)
print("Within Set Sum of Squared Error = " + str(WSSSE))

# Save and load model
clusters.save(sc, "myModelPath")
sameModel = KMeansModel.load(sc, "myModelPath")
if __name__ == "__main__":
    sc = SparkContext(appName="KMeansExample")  # SparkContext

    # $example on$
    # Load and parse the data
    data = sc.textFile("k_means_data.txt")
    parsedData = data.map(lambda line: array([float(x) for x in line.split(' ')]))

    # Build the model (cluster the data)
    clusters = KMeans.train(parsedData, 2, maxIterations=10, initializationMode="random")

    # Evaluate clustering by computing Within Set Sum of Squared Errors
    def error(point):
        center = clusters.centers[clusters.predict(point)]
        return sqrt(sum([x**2 for x in (point - center)]))

    WSSSE = parsedData.map(lambda point: error(point)).reduce(lambda x, y: x + y)
    print("Within Set Sum of Squared Error = " + str(WSSSE))

    # Save and load model
    clusters.save(sc, "KMeansModel")
    sameModel = KMeansModel.load(sc, "KMeansModel")
    # $example off$

    sc.stop()
# Tail of the streaming handler `detect` (its def line was cut off in the
# source); `arrays` holds the batch's parsed feature vectors and `count`
# their number.
    print arrays.collect()
    indx = 0
    while indx < count:
        vec = Vectors.dense(arrays.collect()[indx])
        indx += 1
        clusternum = model.predict(vec)
        print "Cluster -> ", clusternum, vec
    return

# Create a local StreamingContext with two working threads and a batch interval of 10 seconds
conf = SparkConf().setAppName("Fraud Detector")
conf = conf.setMaster("local[2]")
sc = SparkContext(conf=conf)
ssc = StreamingContext(sc, 10)

# Create a DStream that will connect to hostname:port, like localhost:9999
lines = ssc.socketTextStream("localhost", 8999)

model = KMeansModel.load(sc, "kmeansmodel01")
print model.clusterCenters
print "************************** Loaded the model *********************"

# Split each line into words
words = lines.flatMap(lambda line: line.split(" "))
lines.foreachRDD(detect)

ssc.start()             # Start the computation
ssc.awaitTermination()  # Wait for the computation to terminate
from pyspark import SparkContext

# Load and parse the data
sc = SparkContext()
data = sc.textFile("/user/hduser/venkat/iris.txt")
print data.first()
parsedData = data.map(lambda line: array([float(x) for x in line.split(' ')]))

# Build the model (cluster the data)
clusters = KMeans.train(parsedData, 2, maxIterations=10, initializationMode="random")
prediction = clusters.predict(parsedData)
print clusters.centers

# Evaluate clustering by computing Within Set Sum of Squared Errors
def error(point):
    center = clusters.centers[clusters.predict(point)]
    return sqrt(sum([x**2 for x in (point - center)]))

WSSSE = parsedData.map(lambda point: error(point)).reduce(lambda x, y: x + y)
print("Within Set Sum of Squared Error = " + str(WSSSE))

# Save and load model
myModelPath = "/user/hduser/output/kmeans_output"
clusters.save(sc, myModelPath)
sameModel = KMeansModel.load(sc, myModelPath)
# check input db
input_db = os.path.join(args.input_root_dir, "dbs", "db", "out.parquet")
if not os.path.isdir(input_db):
    raise Exception("missing db parquet directory")

# check output dir
# logger.debug("Create new codebook dir...")
output_dir = os.path.join(args.input_root_dir, 'features', 'feature')
if os.path.isdir(output_dir):
    new_name = output_dir + time.strftime("%Y-%m-%d-%H-%M-%S", time.gmtime())
    logger.info("backup old output-dir in %s" % new_name)
    os.rename(output_dir, os.path.join(args.input_root_dir, new_name))
os.makedirs(output_dir)

model = KMeansModel.load(sc, input_codebook)
model = sc.broadcast(model)  # broadcast the model to the executors
pooling = "max"
feature_name = "SIFT"  # note: the map below passes "SURF" instead of this variable

df = sqc.read.parquet(input_db)
print df.count()

features_bow = df.map(functools.partial(compute_global_feature,
                                        feature_name="SURF",
                                        model=model,
                                        pooling=pooling))
print features_bow.first()
# Load and parse the data
# conf = SparkConf()
sc = SparkContext()
data = sc.textFile("./business_gps.csv")
parsedData = data.map(parse_line)

# Build the model (cluster the data);
# "k-means" is not a valid initializationMode, so use "k-means||"
clusters = KMeans.train(parsedData, 10, maxIterations=1000, runs=10,
                        initializationMode="k-means||")

# Evaluate clustering by computing Within Set Sum of Squared Errors
def error(point):
    center = clusters.centers[clusters.predict(point)]
    return sqrt(sum([x**2 for x in (point - center)]))

WSSSE = parsedData.map(lambda point: error(point)).reduce(lambda x, y: x + y)
print("Within Set Sum of Squared Error = " + str(WSSSE))

# Save and load model
clusters.save(sc, "kmeans_model")
sameModel = KMeansModel.load(sc, "kmeans_model")
print("Cluster centers", sameModel.clusterCenters)
conf = SparkConf().setAppName('KMeans').setMaster('local[2]')
sc = SparkContext(conf=conf)

# load and parse data
data = sc.textFile('../data/kmeans_data.txt')
parseData = data.map(lambda line: np.array([float(x) for x in line.split(' ')]))

# build the model
clusters = KMeans.train(parseData, 2, maxIterations=10, runs=10,
                        initializationMode='random')

# evaluate clustering by computing Within Set Sum of Squared Errors
def error(point):
    center = clusters.centers[clusters.predict(point)]
    return math.sqrt(sum([x**2 for x in (point - center)]))

WSSSE = parseData.map(lambda p: error(p)).reduce(lambda x, y: x + y)
print('Within Set Sum of Squared Error: ' + str(WSSSE))

# save and load model
clusters.save(sc, '../model/KMeansModel')
sameModel = KMeansModel.load(sc, '../model/KMeansModel')
sc.stop()
"hdfs://masterNode:9000/user/spark/dataset_observatory/predict_data/Semestres/Semestre1-2016.csv" ) average_per_year = average_year(lines) # 2014 and 2015 average_per_month = average_month(average_per_year) data = parseDataset(dataset) k = int(sys.argv[1]) initial_centroids = generate_initial_centroids(average_per_month.collect(), k) # KMeans start = time() kmeans_model = KMeans.train(data, k, maxIterations=100, initialModel=KMeansModel(initial_centroids)) end = time() elapsed_time = end - start kmeans_output = [ "====================== KMeans ====================\n", "Final centers: " + str(kmeans_model.clusterCenters), "Total Cost: " + str(kmeans_model.computeCost(data)), "Value of K: " + str(k), "Elapsed time: %0.10f seconds." % elapsed_time ] # Predicting points = parseDataset(predict_data) count_lines = float(len(points.collect())) probabilities = generate_probabilities(points, k, kmeans_model, count_lines)
if ascontext.isComputeDataModelOnly():
    ascontext.setSparkOutputSchema(output_schema)
    sys.exit(0)
else:
    modelpath = ascontext.getModelContentToPath("model")
    model_metadata = json.loads(ascontext.getModelContentToString("model.metadata"))

# create a DataModelTools to handle data model and data conversions
datamodel = model_metadata["datamodel"]
dmt = DataModelTools(datamodel)
predictors = model_metadata["predictors"]
DataModelTools.checkPredictors(datamodel, predictors, df)

from pyspark.mllib.clustering import KMeansModel
model = KMeansModel.load(sc, modelpath)

# to score the model we need an RDD of DenseVector (the numeric encoded
# values of the predictors); use DataModelTools to build it
dv = dmt.extractDenseVector(df, predictors, setToFlag=1.0)

def rowToList(row):
    result = []
    for idx in range(0, len(row)):
        result.append(row[idx])
    return result

mapFn = lambda (x, y): rowToList(x) + [y]  # Python 2 tuple-parameter lambda
rdd2 = dv.map(lambda x: rowToList(x[0]) + [model.predict(x[1])])
def main():
    k_input_model = sys.argv[1]   # read kmeans model from this location
    w_input_model = sys.argv[2]   # read word2vec model from this location
    input_file = sys.argv[3]      # read input file

    conf = SparkConf().setAppName('Clustering')
    sc = SparkContext(conf=conf)
    assert sc.version >= '1.5.1'

    sqlContext = SQLContext(sc)

    '''sbaronia - load both kmeans and Word2Vec models'''
    kmean_model = KMeansModel.load(sc, k_input_model)
    word2vec_model = Word2VecModel.load(sc, w_input_model)

    '''sbaronia - select fields from json and make data frame zipped with index'''
    review = sqlContext.read.json(input_file).select('reviewText', 'overall', 'reviewTime').cache()
    review_df = review.filter(review.reviewText != "").cache()

    rating_rdd = rdd_zip(review_df.map(lambda line: float(line.overall)).cache()).cache()
    rating_df = sqlContext.createDataFrame(rating_rdd, ['rating', 'index']).cache()

    year_rdd = rdd_zip(review_df.map(extract_year).cache()).cache()
    year_df = sqlContext.createDataFrame(year_rdd, ['year', 'index']).cache()

    clean_words_rdd = review_df.map(lambda review: clean_string_to_words(review.reviewText)).cache()
    clean_list = clean_words_rdd.collect()

    '''sbaronia - make a set of all words in our model (set membership is O(1))'''
    keys = sqlContext.read.parquet(w_input_model + "/data")
    keys_list = set(keys.rdd.map(lambda line: line.word).collect())

    '''sbaronia - here we create one vector per review, where the vector counts
    how many times each cluster is assigned to a word in the review, in a
    SparseVector-compatible format'''
    features = []
    for i in range(len(clean_list)):
        histogram = [0] * 2000
        for word in clean_list[i]:
            if word in keys_list:
                vec = word2vec_model.transform(word)
                clust = kmean_model.predict(vec)
                histogram[clust] = histogram[clust] + 1
        features.append((2000, range(2000), histogram))

    '''sbaronia - create a normalized SparseVector rdd'''
    nor = Normalizer(1)
    features_rdd = rdd_zip(sc.parallelize(features) \
                           .map(lambda line: nor.transform(SparseVector.parse(line))) \
                           .cache()).cache()

    '''sbaronia - make a dataframe with rating, year and vector per review'''
    features_df = sqlContext.createDataFrame(features_rdd, ['feature', 'index']).cache()

    year_rating_df = rating_df.join(year_df, rating_df.index == year_df.index, 'outer').drop(rating_df.index).cache()
    featyearrate_df = features_df.join(year_rating_df, features_df.index == year_rating_df.index, 'inner') \
                                 .drop(features_df.index).cache()

    '''sbaronia - create training and testing data based on year'''
    train_rdd = featyearrate_df.filter(featyearrate_df.year < 2014) \
                               .select('rating', 'feature') \
                               .map(lambda line: (LabeledPoint(line.rating, line.feature))) \
                               .coalesce(1) \
                               .cache()

    test_rdd = featyearrate_df.filter(featyearrate_df.year == 2014) \
                              .select('rating', 'feature') \
                              .map(lambda line: (LabeledPoint(line.rating, line.feature))) \
                              .coalesce(1) \
                              .cache()

    '''sbaronia - find the best step size using validation, then run
    LinearRegressionWithSGD with that step and report the final RMSE'''
    step_best_norm = validation(train_rdd)
    RMSE_norm = regression_and_error(train_rdd, test_rdd, step_best_norm)
    print("Final RMSE(Normalization) = " + str(RMSE_norm) + " Best Step size = " + str(step_best_norm))
from pyspark import SparkContext, SparkConf
from pyspark.mllib.clustering import KMeans, KMeansModel
from math import sqrt

def parse_line(ln):
    split_ln = ln.split(',')
    ln_coord = [float(split_ln[1]), float(split_ln[2])]
    # append the label of the predicted cluster to the input line
    new_line = ln + ',' + cluster_labels[KMeans_model.predict(ln_coord)]
    return new_line

# load and parse the data
# conf = SparkConf()
sc = SparkContext()

# load previously generated k-means model
KMeans_model = KMeansModel.load(sc, "kmeans_model")

# define cluster label array
cluster_labels = ["Pheonix-AZ", "Edinburgh-UK", "Charlotte-NC", "Madison-WI",
                  "Montreal-Canada", "Waterloo-Canada", "Las Vegas-NV",
                  "Urbana-Champaign-IL", "Pittsburgh-PA", "Karlsruhe-Germany"]

# read the file which has business_ids, latitude, longitude
data = sc.textFile("./business_gps.csv")

# get labelled rows
parsedData = data.map(parse_line)

# save labelled businesses in the output folder
parsedData.saveAsTextFile("./output")
from numpy import array
from math import sqrt

from pyspark import SparkContext
from pyspark.mllib.clustering import KMeans, KMeansModel

if __name__ == "__main__":
    sc = SparkContext(appName="KMeansApp")  # SparkContext

    # Load and parse the data
    data = sc.textFile("s3://irm238FinalProject/input/citibike*")
    parsedData = data.map(lambda line: array([float(x) for x in line.split(' ')]))

    # Build the model (cluster the data)
    clusters = KMeans.train(parsedData, 2, maxIterations=10, runs=10,
                            initializationMode="random")

    # Evaluate clustering by computing Within Set Sum of Squared Errors
    def error(point):
        center = clusters.centers[clusters.predict(point)]
        return sqrt(sum([x**2 for x in (point - center)]))

    WSSSE = parsedData.map(lambda point: error(point)).reduce(lambda x, y: x + y)
    print("Within Set Sum of Squared Error = " + str(WSSSE))

    # Save and load model (the save and load paths must match;
    # the original saved "KmeansModel" but loaded "KMeansModel")
    clusters.save(sc, "KMeansModel")
    sameModel = KMeansModel.load(sc, "KMeansModel")

    sc.stop()
import sys
import json
from numpy import array

from pyspark import SparkContext
from pyspark.mllib.clustering import KMeansModel

def mapper(line):
    # Format the line
    line = line.replace("(", "").replace(")", "").replace("[", "").replace("]", "")
    elements = line.split(",")
    stock_name = elements.pop(0)
    percent_changes = map(lambda x: float(x), elements)
    return stock_name, percent_changes

if __name__ == "__main__":
    sc = SparkContext(appName="ComputeResults")
    model = KMeansModel.load(sc, sys.argv[2])
    mapred_results = sc.textFile(sys.argv[1])

    # Group stock names by their predicted cluster
    clusters = mapred_results.map(mapper)\
        .reduceByKey(lambda a, b: a + b)\
        .map(lambda stock: (model.predict(array(stock[1])), [stock[0]]))\
        .reduceByKey(lambda a, b: a + b)\
        .collectAsMap()

    with open('result.json', 'w') as fp:
        json.dump(clusters, fp)
sqlContext = SQLContext(sc)

# Read the input parquet
input_crime = sys.argv[1]

# Read the parquet data and convert to an RDD
parquet_crime = sqlContext.read.parquet(input_crime)
parquet_crime.registerTempTable("crime_table")
crime_table = sqlContext.sql("SELECT * FROM crime_table")
crime_rdd = crime_table.map(lambda line: str(line.Year) + "," +
                            str(line.Latitude) + "," +
                            str(line.Longitude) + "," +
                            str(line.Crime_Frequency))

# K-means does multiple runs to find optimal cluster centers, so cache its input
cluster_input = crime_rdd.map(lambda line: array([float(x) for x in line.split(',')])).cache()

# Perform K-means clustering
clusters = KMeans.train(cluster_input, 20, maxIterations=5, runs=5,
                        initializationMode="random")

# Sum each point's distance to its cluster center
def squared_error(point):
    center = clusters.centers[clusters.predict(point)]
    return sqrt(sum([x**2 for x in (point - center)]))

error = cluster_input.map(lambda point: squared_error(point)).reduce(lambda x, y: x + y)
print("Squared error for a cluster = " + str(error))

# Save the cluster model, then load it back
clusters.save(sc, "myModel_crime")
sameModel = KMeansModel.load(sc, "myModel_crime")
if __name__ == "__main__":
    b = open("name", 'wb')
    sc = SparkContext("local[*]", "kmeans")

    print("data being loaded.....")
    # file:///dev/desc_hdfs
    data = sc.textFile(sys.argv[1]).map(
        lambda row: map(lambda x: float(x), row.split(',')))
    print("data loaded!")

    D = 128
    print("loading and counting")
    data_size = data.count()
    print("count done")

    print("model being loaded.....")
    model = KMeansModel.load(sc, sys.argv[2])
    print("model loaded!")
    centers = model.clusterCenters

    # ################ SAMPLING #################################################
    # total_sampled_points = int(sys.argv[3])
    cluster = {}
    samples = {}
    print("data being stored in array....")
    # da = data.collect()
    print("data stored")
    n_clusters = model.k
    for j in range(n_clusters):
        cluster[j] = []
currTime = strftime("%Y-%m-%d-%H-%M-%S")
sc = SparkContext(appName="KMeans")
lines = sc.textFile("hdfs://masterNode:9000/user/spark/dataset_observatory/initial_centroids.csv")
dataset = sc.textFile("hdfs://masterNode:9000/user/spark/dataset_observatory/training_data.csv")
predict_data = sc.textFile("hdfs://masterNode:9000/user/spark/dataset_observatory/predict_data/Semestres/Semestre1-2016.csv")

average_per_year = average_year(lines)  # 2014 and 2015
average_per_month = average_month(average_per_year)
data = parseDataset(dataset)
k = int(sys.argv[1])
initial_centroids = generate_initial_centroids(average_per_month.collect(), k)

# KMeans
start = time()
kmeans_model = KMeans.train(data, k, maxIterations=100,
                            initialModel=KMeansModel(initial_centroids))
end = time()
elapsed_time = end - start
kmeans_output = [
    "====================== KMeans ====================\n",
    "Final centers: " + str(kmeans_model.clusterCenters),
    "Total Cost: " + str(kmeans_model.computeCost(data)),
    "Value of K: " + str(k),
    "Elapsed time: %0.10f seconds." % elapsed_time
]

# Predicting
points = parseDataset(predict_data)
count_lines = float(len(points.collect()))
probabilities = generate_probabilities(points, k, kmeans_model, count_lines)
print("Prob: ", probabilities)
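The snippet above seeds training by wrapping raw centroids in a KMeansModel and passing it as initialModel. A minimal self-contained sketch of that pattern (the toy points and centers are illustrative, not from the source):

# Minimal sketch of seeding KMeans.train with explicit initial centers;
# the toy data below is illustrative only.
import numpy as np
from pyspark import SparkContext
from pyspark.mllib.clustering import KMeans, KMeansModel

sc = SparkContext(appName="SeededKMeansSketch")
points = sc.parallelize([np.array([0.0, 0.0]), np.array([0.1, 0.1]),
                         np.array([9.0, 9.0]), np.array([9.1, 9.1])])
# wrap hand-picked centers in a KMeansModel to use them as the starting point
seed_model = KMeansModel([np.array([0.0, 0.0]), np.array([9.0, 9.0])])
model = KMeans.train(points, 2, maxIterations=10, initialModel=seed_model)
print(model.clusterCenters)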
# -*- coding: utf-8 -*-
from konlpy.tag import Twitter
from pyspark import SparkContext
from pyspark.sql import SQLContext
from pyspark.ml.feature import HashingTF, IDF, Tokenizer
from pyspark.ml.feature import Normalizer
from pyspark.mllib.clustering import KMeans, KMeansModel
from numpy import array
from math import sqrt

sc = SparkContext()
sqlContext = SQLContext(sc)

normData = sc.pickleFile('idf_normalized')
# KMeansModel.load needs the SparkContext as its first argument
clusters = KMeansModel.load(sc, 'KMeasModel')

text = normData.map(lambda x: (x.no, x.eval_content))
data = normData.map(lambda x: (x.no, clusters.predict(x.idf_norm)))
result = text.join(data).map(lambda (k, (left, right)): (right, left.encode('utf-8')))

for i in range(10):
    result.filter(lambda (x, y): x == i).map(
        lambda (x, y): y).saveAsTextFile("KMeansOutput/cluster_" + str(i))