def test_gmm_deterministic(self):
    from pyspark.mllib.clustering import GaussianMixture
    x = range(0, 100, 10)
    y = range(0, 100, 10)
    data = self.sc.parallelize([[a, b] for a, b in zip(x, y)])
    clusters1 = GaussianMixture.train(data, 5, convergenceTol=0.001,
                                      maxIterations=10, seed=63)
    clusters2 = GaussianMixture.train(data, 5, convergenceTol=0.001,
                                      maxIterations=10, seed=63)
    for c1, c2 in zip(clusters1.weights, clusters2.weights):
        self.assertEqual(round(c1, 7), round(c2, 7))
def test_gmm_with_initial_model(self):
    from pyspark.mllib.clustering import GaussianMixture
    data = self.sc.parallelize([(-10, -5), (-9, -4), (10, 5), (9, 4)])
    gmm1 = GaussianMixture.train(data, 2, convergenceTol=0.001,
                                 maxIterations=10, seed=63)
    gmm2 = GaussianMixture.train(data, 2, convergenceTol=0.001,
                                 maxIterations=10, seed=63, initialModel=gmm1)
    self.assertAlmostEqual((gmm1.weights - gmm2.weights).sum(), 0.0)
def test_gmm_deterministic(self):
    from pyspark.mllib.clustering import GaussianMixture
    x = range(0, 100, 10)
    y = range(0, 100, 10)
    data = self.sc.parallelize([[a, b] for a, b in zip(x, y)])
    clusters1 = GaussianMixture.train(data, 5, convergenceTol=0.001,
                                      maxIterations=100, seed=63)
    clusters2 = GaussianMixture.train(data, 5, convergenceTol=0.001,
                                      maxIterations=100, seed=63)
    for c1, c2 in zip(clusters1.weights, clusters2.weights):
        # assertEquals is a deprecated alias; assertEqual is the supported name
        self.assertEqual(round(c1, 7), round(c2, 7))
def gmm_spark(sc, X=None, clusters=3):
    if X is None:
        X = users_as_parallelizable_sparse_data(users)
    X = sc.parallelize(X)
    gmm = GaussianMixture.train(X, k=clusters)
    # print the parameters of every fitted component, not just the first two
    for i in range(clusters):
        print("weight = ", gmm.weights[i],
              "mu = ", gmm.gaussians[i].mu,
              "sigma = ", gmm.gaussians[i].sigma.toArray())
def test_gmm(self):
    from pyspark.mllib.clustering import GaussianMixture
    data = self.sc.parallelize([[1, 2], [8, 9], [-4, -3], [-6, -7]])
    clusters = GaussianMixture.train(data, 2, convergenceTol=0.001,
                                     maxIterations=100, seed=56)
    labels = clusters.predict(data).collect()
    self.assertEqual(labels[0], labels[1])
    self.assertEqual(labels[2], labels[3])
def main():
    sc = SparkContext(master="local", appName="K-Means")
    try:
        # csv = sc.textFile(sys.argv[1]) if input via cmd
        csv = sc.textFile("kmeans_data.csv")
    except IOError:
        print('No such file')
        exit(1)
    parsedData = csv.map(parseLine)
    trueValue = csv.map(getTrueValue)
    # print for debugging
    print("number of features: ", len(parsedData.collect()[0]))

    # Build the model (cluster the data), K = 2
    clusters = KMeans.train(parsedData, 2, maxIterations=50,
                            initializationMode="random")
    g_clusters = GaussianMixture.train(parsedData, 2)
    centers = clusters.clusterCenters
    # g_centers = g_clusters.clusterCenters
    print("Final k centers:", centers)  # print for debugging purpose
    # print("Final k centers for expectation maximization:", g_centers)

    # for each data point, generate its cluster label:
    predictedLabels = parsedData.map(
        lambda point: closestCluster(point, centers))
    # g_predictedLabels = parsedData.map(lambda point: closestCluster(point, g_centers))
    g_predictedLabels = g_clusters.predict(parsedData)

    results = predictedLabels.collect()
    g_results = g_predictedLabels.collect()
    true = trueValue.collect()

    accuracy_count = 0  # count how many data points having correct labels
    g_accuracy_count = 0
    # output in results.txt: i-th row: true label, predicted label for i-th data point
    with open("results.txt", "w") as f:
        f.write("true\tpredicted\n")
        for i in range(len(results)):
            f.write(str(true[i]) + "\t" + str(results[i]) + "\n")
            if int(true[i]) == int(results[i]):
                accuracy_count += 1
            if int(true[i]) == int(g_results[i]):
                g_accuracy_count += 1

    accuracy = accuracy_count / len(results)
    g_accuracy = g_accuracy_count / len(results)
    if accuracy < 0.5:  # our predicted label IDs might be opposite
        accuracy = 1 - accuracy
    if g_accuracy < 0.5:
        g_accuracy = 1 - g_accuracy
    print("accuracy is :", accuracy)
    print("EM accuracy is : ", g_accuracy)
    sc.stop()
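The helpers parseLine, getTrueValue, and closestCluster are not part of the snippet above; a minimal sketch consistent with how they are used (assuming a CSV whose last column holds the true label) might look like this:

# Hypothetical helpers, inferred from usage; the original definitions are not shown.
import numpy as np

def parseLine(line):
    # all columns except the last are numeric features
    return np.array([float(x) for x in line.split(',')[:-1]])

def getTrueValue(line):
    # last column holds the true label
    return line.split(',')[-1]

def closestCluster(point, centers):
    # index of the nearest center by Euclidean distance
    distances = [np.linalg.norm(point - np.array(c)) for c in centers]
    return int(np.argmin(distances))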
def cluster(sc, sample):
    sample = sc.parallelize(sample)
    # a single 2-D test point, wrapped in a list so the RDD holds one vector
    testdata = sc.parallelize([[5, 5]])
    ######
    # kmeansmodel = KMeans.train(sample, 3)
    # print kmeansmodel.centers
    # print kmeansmodel.predict([5, 5])
    gmmmodel = GaussianMixture.train(sample, 3, maxIterations=10)
    # print gmmmodel.weights
    # predict() on an RDD returns an RDD, so collect it before printing
    print gmmmodel.predict(testdata).collect()
def find_outliers_Gaussian(sequence, distance_factor=6):
    df = sequence
    df_vector = df.map(lambda x: np.array(float(x)))
    gmm = GaussianMixture.train(df_vector, 1)
    mu, sigma = list(zip(*[(g.mu, g.sigma) for g in gmm.gaussians]))
    m = mu[0].values
    s = sqrt(sigma[0].values)
    l = np.array(df_vector.collect())
    d = abs(l - m)
    outliers = list(set(list(l[d >= distance_factor * s])))
    filtered = sequence.filter(lambda x: x not in outliers)
    return outliers, filtered
def find_collective_outliers_KGaussians(sequence, k=5, proportion=0.1, ratio=10):
    df = sequence
    df_vector = df.map(lambda x: np.array(float(x)))
    gmm = GaussianMixture.train(df_vector, k)
    labels = gmm.predict(df_vector)
    w = gmm.weights
    l = []
    point_label = df_vector.zip(labels)
    mus, sigmas = list(zip(*[(g.mu, g.sigma) for g in gmm.gaussians]))
    m = []
    for i in range(k):
        m.append(float(mus[i].values))
    m1 = m[:]
    removed = []
    not_removed = []
    for i in range(len(w)):
        w_i = w[i]
        if w_i < proportion / k:
            removed.append(i)
        else:
            not_removed.append(i)
    for e in removed:
        l = l + point_label.filter(lambda x: x[1] == e).map(
            lambda x: float(x[0])).collect()
    m = np.array(m)
    if not_removed:
        m = m[not_removed]
    n = list(m).index(max(m))
    try:
        p = not_removed[n]
        m = sorted(m)
        a = m[0]
        b = m[-2]
        c = m[-1]
        if ratio * b / a < c / b:
            l = l + point_label.filter(lambda x: x[1] == p).map(
                lambda x: float(x[0])).collect()
        # return l + m1 + list(w)
        return l
    except IndexError:
        return []
def __gauss_clustering(self):
    # get the whole population without fitness value, then flat it
    rdd_aux = self.__rdd.flatMap(lambda x: x.get_population(fitness=False))

    # train the gauss cluster
    gauss_cluster = GaussianMixture.train(
        rdd_aux, self.__colonies, maxIterations=self.__cluster_iterations)

    # create a new rdd with the labels
    rdd_labels = gauss_cluster.predict(rdd_aux)

    # zip each result with its class
    rdd_aux = rdd_labels.zip(rdd_aux)

    # input serialization
    cols = self.__colonies
    self.__sc.broadcast(cols)

    # divide into partitions
    rdd_aux = rdd_aux.partitionBy(cols, partitionFunc=lambda x: x).glom()

    # remove the index of each element
    rdd_aux = rdd_aux.map(lambda x: [y[1] for y in x])

    # input serialization
    evaluation = self.__evaluation
    generation = self.__generation
    cross = self.__cross
    mutation = self.__mutation
    selection = self.__selection
    survival = self.__survival
    mut_ratio = self.__mut_ratio
    survival_ratio = self.__survival_ratio
    control_obj = self.__control_obj

    # create the new colonies
    self.__rdd = rdd_aux.map(
        lambda x: Colony(evaluation, generation, cross=cross, mutation=mutation,
                         selection=selection, mut_ratio=mut_ratio,
                         survival_ratio=survival_ratio, survival=survival,
                         control_obj=control_obj, population=x))
def gaussian_mixture(unclustered_data, number_of_clusters, max_iterations=100,
                     seed=None, initial_model=None):
    if number_of_clusters < 1:
        raise ValueError("While clustering with GaussianMixture, "
                         "the given number of clusters is not positive")
    gmm = GaussianMixture.train(rdd=unclustered_data,
                                k=number_of_clusters,
                                maxIterations=max_iterations,
                                seed=seed,
                                initialModel=initial_model)
    parameters = []
    for i in range(number_of_clusters):
        parameters.append({
            "weight": gmm.weights[i],
            "mu": gmm.gaussians[i].mu,
            "sigma": gmm.gaussians[i].sigma.toArray()
        })
    return [gmm, parameters]
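A minimal usage sketch for this wrapper, assuming an existing SparkContext named sc; the sample points and parameter values are illustrative only:

# Hypothetical call; the RDD contents, k, and seed are made up for illustration.
points = sc.parallelize([[1.0, 2.0], [1.1, 1.9], [8.0, 9.0], [8.2, 9.1]])
model, params = gaussian_mixture(points, number_of_clusters=2, max_iterations=50, seed=42)
for p in params:
    print(p["weight"], p["mu"], p["sigma"])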
def find_outliers_KGuaussians(sequence, k=2, proportion=0.95, distance_factor=3):
    # currently please take input as a list and return a list
    # for now, k=2
    df = sequence
    df_vector = df.map(lambda x: np.array(float(x)))
    gmm = GaussianMixture.train(df_vector, k)
    labels = gmm.predict(df_vector).collect()
    labels = np.array(labels)
    n = len(labels)
    c0 = len(labels[labels == 0])
    c1 = n - c0
    mus, sigmas = list(zip(*[(g.mu, g.sigma) for g in gmm.gaussians]))
    m = []
    s = []
    for i in range(k):
        m.append(float(mus[i].values))
        s.append(float(sigmas[i].values))
    m = np.array(m)
    s = np.array(s)  # was np.array(m), which overwrote the variances with the means
    l = df_vector.collect()
    l = np.array(l)
    if abs(m[0] - m[1]) > distance_factor * (sqrt(s[0]) + sqrt(s[1])):
        if c0 / n > proportion:
            return list(l[labels == 1])
        elif c1 / n > proportion:
            return list(l[labels == 0])
        else:
            return []
# -*- coding:utf-8 -*-
"""
Program: GMM
Description: Example of calling Spark's built-in GMM algorithm
Author: zhenglei - [email protected]
Date: 2016-01-14 13:38:58
Last modified: 2016-01-14 13:50:11
Python release: 2.7
"""
# Use Spark's built-in clustering to redo the Chapter 10 example from
# "Machine Learning in Action"
from numpy import array

from pyspark import SparkContext
from pyspark.mllib.clustering import GaussianMixture

if __name__ == '__main__':
    sc = SparkContext()
    datas = sc.textFile('testSet.txt')
    clusters_num = 4
    parseData = datas.map(lambda x: array([float(y) for y in x.split('\t')]))
    model = GaussianMixture.train(parseData, clusters_num, maxIterations=10)

    clusters = [[] for i in range(clusters_num)]
    labels = model.predict(parseData).collect()
    points = parseData.collect()  # collect once instead of inside the loop
    nums = len(labels)
    for i in xrange(nums):
        clusters[labels[i]].append(points[i])
    print clusters
    sc.stop()
def train(row_id_str, ds_id, hdfs_feat_dir, local_out_dir, ml_opts_jstr,
          sp_master, spark_rdd_compress, spark_driver_maxResultSize,
          sp_exe_memory, sp_core_max,
          zipout_dir, zipcode_dir, zip_file_name,
          mongo_tuples, labelnameflag, fromweb, src_filename,
          jobname):

    # create zip files for Spark workers =================================
    zip_file_path = ml_build_zip_file(zipout_dir, zipcode_dir, zip_file_name,
                                      prefix='zip_feature_util')
    print "INFO: zip_file_path=", zip_file_path

    # data_folder = hdfs_feat_dir + "/"
    # local_out_dir = local_out_dir + "/"
    # if os.path.exists(local_out_dir):
    #     shutil.rmtree(local_out_dir)  # to keep samplelist file
    if not os.path.exists(local_out_dir):
        os.makedirs(local_out_dir)

    # init Spark context ====
    sc = ml_util.ml_get_spark_context(sp_master, spark_rdd_compress,
                                      spark_driver_maxResultSize,
                                      sp_exe_memory, sp_core_max,
                                      jobname, [zip_file_path])

    # start here =========================================================
    t0 = time()

    ### Need to check if PCA available here ==============================
    libsvm_data_file = os.path.join(hdfs_feat_dir, src_filename)  # need to set k numb in filename somehow
    print "INFO: libsvm_data_file=", libsvm_data_file
    # samples_rdd = MLUtils.loadLibSVMFile(sc, libsvm_data_file).cache()  # load sample RDD from text file
    # format (LabeledPoint, hash) from str2LabeledPoint_hash()
    feature_count = 0
    samples_rdd, feature_count = zip_feature_util.get_sample_rdd(sc, libsvm_data_file, feature_count, '')

    # get label as a list
    labels_list_all = samples_rdd.map(lambda p: int(p[0].label)).collect()
    total_sample_count = len(labels_list_all)
    parsedData = samples_rdd.map(lambda p: p[0].features).cache()
    # for i in parsedData.collect():  # p.features: pyspark.mllib.linalg.SparseVector
    #     print "pd=", type(i), ",i=", i

    t1 = time()
    print 'INFO: running time: %f' % (t1 - t0)
    t0 = t1

    ###############################################
    ########## build learning model ###############
    ###############################################

    ### get the parameters ###
    print "INFO: ============Learning Algorithm and Parameters============="
    para_dict = json.loads(ml_opts_jstr)
    flag_model = para_dict['learning_algorithm']  # kmeans
    iteration_num = eval(para_dict['iterations'])
    k = 2
    if 'k' in para_dict:
        k = eval(para_dict['k'])
    print "INFO: Learning Algorithm:", flag_model
    print "INFO: iterations=", iteration_num
    # print "training_sample_number=", training_sample_number

    ### generate label names (family names) #####
    ### connect to database to get the column list which contains all column number of the corresponding feature ####
    if labelnameflag == 1:
        key = "dic_name_label"
        jstr_filter = '{"rid":' + row_id_str + ',"key":"' + key + '"}'
        jstr_proj = '{"value":1}'
        # get parent dataset's data
        if ds_id != row_id_str:
            jstr_filter = '{"rid":' + ds_id + ',"key":"' + key + '"}'
        doc = query_mongo.find_one_t(mongo_tuples, jstr_filter, jstr_proj)
        dic_list = doc['value']
        label_dic = {}
        for i in range(0, len(dic_list)):
            for key in dic_list[i]:
                label_dic[dic_list[i][key]] = key.encode('UTF8')
        print "INFO: label_dic:", label_dic
    else:
        label_dic = {}
        label_set = set(labels_list_all)
        for label_value in label_set:
            label_dic[int(label_value)] = str(int(label_value))
        print "INFO: generated label_dic:", label_dic

    labels_list = []
    for key in sorted(label_dic):
        labels_list.append(label_dic[key])
    print "INFO: labels_list=", labels_list

    ### build model ###
    if flag_model == "kmeans":
        print "=================== Kmeans ============"
        model = KMeans.train(parsedData, k, maxIterations=iteration_num)
        t_cost = model.computeCost(parsedData)
        print "INFO: cost for training set =", str(t_cost)
        clusterCenters = model.clusterCenters
        print "INFO: clusterCenters t=", type(clusterCenters)  # list
    elif flag_model == "gaussian_mixture_model":
        # didn't work: some native lib issue
        print "=================== Gaussian_Mixture_Model ============"
        model = GaussianMixture.train(parsedData, k, maxIterations=iteration_num)
        print "INFO: model.weights =", model.weights
    else:
        print "INFO: Training model selection error: no valid ML model selected!"
        return

    ### Save model
    save_dir = config.get('app', 'HADOOP_MASTER') + config.get('app', 'HDFS_MODEL_DIR') + '/' + row_id_str
    try:
        hdfs.ls(save_dir)
        # print "find hdfs folder"
        hdfs.rmr(save_dir)
        # print "all files removed"
    except IOError as e:
        print "ERROR: I/O error({0}): {1}".format(e.errno, e.strerror)
    except:
        print "ERROR: Unexpected error:", sys.exc_info()[0]
    print "INFO: model saved at hdfs=", save_dir
    print "INFO: model type=", type(model), " model=", model
    model.save(sc, save_dir)

    ### load model if needed
    # sameModel = SVMModel.load(sc, save_dir)

    # (true label, kmeans label, features list, hash)
    all_data = samples_rdd.map(lambda t: (t[0].label,
                                          model.predict(t[0].features),
                                          t[0].features,
                                          t[1])).collect()
    true_label_arr = np.asarray([int(x) for x, _, _, _ in all_data])
    labels_kmeans = np.asarray([int(x) for _, x, _, _ in all_data])
    hash_list = np.asarray([x for _, _, _, x in all_data])
    print "INFO: all_data len=", len(all_data), "all_data t=", type(labels_list_all)
    print "INFO: true_label_arr.shape=", true_label_arr.shape, "labels_kmeans.shape=", labels_kmeans.shape
    print "INFO: true_label_arr t=", type(true_label_arr), "labels_kmeans t=", type(labels_kmeans)
    mtx_center = np.asarray(clusterCenters)
    features_array_reduced = np.asarray([x.toArray() for _, _, x, _ in all_data])
    print "INFO: mtx_center t=", type(mtx_center), "mtx_center.shape=", mtx_center.shape
    print "INFO: features_array_reduced t=", type(features_array_reduced), "features_array_reduced.shape", features_array_reduced.shape

    # Adjusted Mutual Information between two clusterings
    amis = adjusted_mutual_info_score(labels_list_all, labels_kmeans)
    print "INFO: Adjusted_mutual_info_score=", amis
    # Similarity measure between two clusterings
    ars = adjusted_rand_score(labels_list_all, labels_kmeans)
    print "INFO: Adjusted_rand_score=", ars

    accuracy = 0.0

    t1 = time()
    print 'INFO: training run time: %f' % (t1 - t0)
    t0 = t1

    ###############################################
    ########## plot histogram #####################
    ###############################################
    n_clusters = k
    plot_col_num = int(math.ceil(math.sqrt(n_clusters)))
    figsize = (4 * plot_col_num, 3 * int(math.ceil(n_clusters * 1.0 / plot_col_num)))
    print "INFO: n_clusters=", n_clusters, ",label_dic=", label_dic
    print "INFO: plot_col_num=", plot_col_num, ",figsize=", figsize, ",local_out_dir=", local_out_dir

    # kmeans histogram
    _, p_true = ml_plot_kmeans_histogram_subfigures(true_label_arr, labels_kmeans, n_clusters,
                                                    names=label_dic, plot_col_num=plot_col_num,
                                                    figsize=figsize, folder=local_out_dir, rid=row_id_str)
    # normalized kmeans histogram
    _, p_true_norm = ml_plot_kmeans_histogram_subfigures(true_label_arr, labels_kmeans, n_clusters,
                                                         names=label_dic, plot_col_num=plot_col_num,
                                                         figsize=figsize, normalize=True,
                                                         folder=local_out_dir, rid=row_id_str)
    #### plot "reverse" histogram with labels ####
    num_bars = max(true_label_arr) + 1
    figsize = (4 * plot_col_num, 3 * int(math.ceil(num_bars * 1.0 / plot_col_num)))
    _, p_cluster = ml_plot_kmeans_histogram_subfigures(labels_kmeans, true_label_arr, num_bars,
                                                       names=label_dic, plot_col_num=plot_col_num,
                                                       figsize=figsize, reverse=True,
                                                       folder=local_out_dir, rid=row_id_str)

    #### plot dot figures ####
    # dot plot for Kmeans ===========
    filename = os.path.join(local_out_dir, row_id_str + '_cluster.png')
    filename_3d = os.path.join(local_out_dir, row_id_str + '_cluster_3d.json')
    ml_plot_kmeans_dot_graph_save_file(features_array_reduced, labels_kmeans, mtx_center, n_clusters,
                                       figsize=(10, 7), filename=filename,
                                       title='KMeans', filename_3d=filename_3d)

    # dot plot for True Labels ===========
    filename = os.path.join(local_out_dir, row_id_str + '_cluster_tl.png')
    filename_3d = os.path.join(local_out_dir, row_id_str + '_cluster_3d_tl.json')
    ml_plot_kmeans_dot_graph_save_file(features_array_reduced, true_label_arr, mtx_center, n_clusters,
                                       figsize=(10, 7), filename=filename,
                                       title='True Labels', filename_3d=filename_3d)

    dataset_info = {"training_fraction": 1, "class_count": n_clusters,
                    "dataset_count": total_sample_count}

    # only update db for web request
    if fromweb == "1":
        # print "database update"
        str_sql = "UPDATE atdml_document set " + "accuracy = '" \
            + "', status = 'learned', processed_date ='" + str(datetime.datetime.now()) \
            + "', total_feature_numb='" + str(feature_count) \
            + "', perf_measures='{}" \
            + "', dataset_info='" + json.dumps(dataset_info) \
            + "' where id=" + row_id_str
        ret = exec_sqlite.exec_sql(str_sql)
        print "INFO: Data update done! ret=", str(ret)
    else:
        print "INFO: accuracy = '" + str(accuracy * 100) + "%"

    print 'INFO: Finished!'
    return 0
" opp_score " \ "FROM team_avgs" query = "SELECT " \ " team_id, " \ " team_name, " \ " AVG(t1_rush), " \ " AVG(t1_pass), " \ " AVG(t2_rush), " \ " AVG(t2_pass) " \ "FROM full_game_stats " \ "JOIN team ON 1=1 " \ " AND full_game_stats.t1_id = team.team_id " \ "GROUP BY team_id, team_name" curs.execute(query) sql_dat = curs.fetchall() team_ids = [row[0] for row in sql_dat] team_names = [row[1] for row in sql_dat] features = [row[2:] for row in sql_dat] data = sc.parallelize(features, 1) model = GaussianMixture.train(data, k=10) cluster_labels = model.predict(data).collect() labels = zip(team_ids,team_names, cluster_labels) df = spark.createDataFrame( labels, ["team_id", "team_name", "cluster_id"] ) df.createOrReplaceTempView("model") for k in range(10): spark.sql("SELECT * FROM model WHERE cluster_id = {}".format(k)).show()
                           maxIterations=100, initialModel=KMeansModel(initial_centroids))
end = time()
elapsed_time = end - start
kmeans_output = [
    "====================== KMeans ====================\n",
    "Final centers: " + str(kmeans_model.clusterCenters),
    "Total Cost: " + str(kmeans_model.computeCost(data)),
    "Value of K: " + str(k),
    "Elapsed time: %0.10f seconds." % elapsed_time
]
# path = "hdfs://masterNode:9000/user/spark/MODELOS-marcelo/KMEANS-2"
# kmeans_model.save(sc, path)

# Gauss KMeans
start = time()
gauss_model = GaussianMixture.train(data, k, maxIterations=20)
end = time()
elapsed_time = end - start
gauss_output = [
    "====================== Gauss KMeans ====================\n"
]
for i in range(k):
    v1 = ("weight = ", gauss_model.weights[i])
    v2 = ("mu = ", gauss_model.gaussians[i].mu)
    v3 = ("sigma = ", gauss_model.gaussians[i].sigma.toArray())
    gauss_output.append((v1, v2, v3))
tiempo = "Tiempo: " + str(elapsed_time)
gauss_output.append(tiempo)

kmeans_info = sc.parallelize(kmeans_output)
gauss_info = sc.parallelize(gauss_output)
from pyspark.mllib.clustering import GaussianMixture
from pyspark import SparkContext
from scipy.stats import mvn
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import time

DIR = "/home/adrianj/Desktop/MachineLearning/Resources/"
FILE_PATH = DIR + "atemporalTest.txt"
NUM_GAUSSIANS = 500

sc = SparkContext(appName="GMM Trainer")
data = sc.textFile(FILE_PATH)
parsedData = data.map(lambda line: np.array([float(x) for x in line.strip().split(' ')]))
gmm = GaussianMixture.train(parsedData, NUM_GAUSSIANS, seed=10)

print("Dumping to " + DIR + "GMMA/...")
# fig = plt.figure()
# ax = fig.gca(projection='3d')

# Record the model
gmm.save(sc, DIR + "GMMA/")
'''
for i in range(NUM_GAUSSIANS):
    mu = gmm.gaussians[i].mu
    sigma = (gmm.gaussians[i].sigma).toArray()
    weight = gmm.weights[i]
    #a, b = np.random.multivariate_normal(mu, sigma, 5000).T
    #surf = ax.scatter(a, b, c, zdir='z')
    #plt.plot(a, b, "x")
from pyspark.mllib.clustering import GaussianMixture, GaussianMixtureModel
# $example off$

if __name__ == "__main__":
    sc = SparkContext(appName="GaussianMixtureExample")  # SparkContext

    # $example on$
    # Load and parse the data
    data = sc.textFile("data/mllib/gmm_data.txt")
    parsedData = data.map(lambda line: array([float(x) for x in line.strip().split(' ')]))

    # Build the model (cluster the data)
    gmm = GaussianMixture.train(parsedData, 2)

    # Save and load model
    gmm.save(sc, "target/org/apache/spark/PythonGaussianMixtureExample/GaussianMixtureModel")
    sameModel = GaussianMixtureModel\
        .load(sc, "target/org/apache/spark/PythonGaussianMixtureExample/GaussianMixtureModel")

    # output parameters of model
    for i in range(2):
        print("weight = ", gmm.weights[i],
              "mu = ", gmm.gaussians[i].mu,
              "sigma = ", gmm.gaussians[i].sigma.toArray())
    # $example off$
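The example above only prints the mixture parameters; if per-point cluster assignments are also wanted, the trained model can be applied to the same RDD. A minimal sketch, assuming the gmm and parsedData variables from the example above:

# Sketch only: reuses gmm and parsedData defined in the preceding example.
hard_labels = gmm.predict(parsedData).collect()       # most likely component per point
soft_labels = gmm.predictSoft(parsedData).collect()   # per-component membership probabilities
print("first few assignments:", hard_labels[:5])
print("first membership vector:", soft_labels[0])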
from numpy import array  # needed by the parsing lambda below

from pyspark import SparkContext
from pyspark.mllib.clustering import GaussianMixture, GaussianMixtureModel

if __name__ == "__main__":
    sc = SparkContext(appName="GaussianMixtureExample")  # SparkContext

    ### Local default options
    k = 2                    # "k" (int) Set the number of Gaussians in the mixture model. Default: 2
    convergenceTol = 0.001   # "convergenceTol" (double) Set the largest change in log-likelihood at which convergence is considered to have occurred.
    maxIterations = 150      # "maxIterations" (int) Set the maximum number of iterations to run. Default: 100
    seed = None              # "seed" (long) Set the random seed

    # Load and parse the data
    data = sc.textFile("/var/mdp-cloud/gmm_data.txt")
    parsedData = data.map(lambda line: array([float(x) for x in line.strip().split(' ')]))
    # filteredData = data.filter(lambda arr: int(arr[1]) != 0)

    # Build and save the model (cluster the data)
    gmm = GaussianMixture.train(parsedData, k, convergenceTol=0.001, maxIterations=150, seed=None)
    # gmm.save(sc, "target/org/apache/spark/PythonGaussianMixtureExample/GaussianMixtureModel")
    # gmm.save(sc, "GaussianMixtureModel_CV")
    # The following line would load the model
    # sameModel = GaussianMixtureModel.load(sc, "target/org/apache/spark/PythonGaussianMixtureExample/GaussianMixtureModel")

    # output parameters of model
    for i in range(k):
        print("weight = ", gmm.weights[i],
              "mu = ", gmm.gaussians[i].mu,
              "sigma = ", gmm.gaussians[i].sigma.toArray())

    sc.stop()
spark_df = sc.parallelize(
    spark.read.json("Data/yelp_academic_dataset_user.json")
    .select("review_count", "average_stars", "yelping_since").rdd
    .map(lambda x: (x[0], x[1], (today - par.parse(x[2])).days))
    .collect()[:1200])

scaler = MinMaxScaler(inputCol="_1", outputCol="scaled_1")

# Getting the input data
trial_df = spark_df.map(lambda x: pyspark.ml.linalg.Vectors.dense(x)).map(lambda x: (x, )).toDF()
scalerModel = scaler.fit(trial_df)
vector_df = scalerModel.transform(trial_df).select("scaled_1").rdd.map(lambda x: Vectors.dense(x))

# Initialize GMM
start = timer()
gmm = GaussianMixture.train(vector_df, k=4, maxIterations=20, seed=2018)
end = timer()
print(end - start)

df = pandas.DataFrame({'features': [], 'cluster': []})
i = 0
for v in vector_df.collect():
    df.loc[i] = [[float(v[0]), float(v[1]), float(v[2])], int(gmm.predict(v))]
    i += 1
print(df)

err = spark.createDataFrame(df).rdd.map(lambda x: (x[0], int(x[1]))).collect()
num_clusters = 4
per_clus = [0] * num_clusters
per_clus_num = [0] * num_clusters
        gmm.gaussians[i].mu, gmm.gaussians[i].sigma.toArray()).pdf(x)
    # prob_x = gmm.predictSoft([x])
    # rs = np.prod(prob_x)
    return rs


if __name__ == "__main__":
    sc = SparkContext(appName="GaussianMixtureExample")  # SparkContext

    # $example on$
    # Load and parse the data
    data = sc.textFile(sys.argv[1])
    parsedData = data.map(lambda line: array(
        ([float(x) for x in line.strip().split(",")])[index]))

    # Build the model (cluster the data)
    gmm = GaussianMixture.train(parsedData, n_clusters)

    # Save and load model
    if os.path.isdir('GMMResult'):
        shutil.rmtree('GMMResult')
    gmm.save(sc, "GMMResult")
    sameModel = GaussianMixtureModel.load(sc, "GMMResult")

    # output parameters of model
    for i in range(n_clusters):
        print("weight = ", gmm.weights[i],
              "mu = ", gmm.gaussians[i].mu,
              "sigma = ", gmm.gaussians[i].sigma.toArray())

    datfull = data.map(lambda line: array(
        [float(x) for x in line.strip().split(",")]))
    dat = datfull.take(datfull.count())
import numpy as np


def parse(data):
    list = []
    for i in range(len(data)):
        value = float(data[i][1:-1])
        list.append(value)
    return (list)


parsedata = outdata.map(lambda line: line.encode('utf-8').split(",")).map(
    lambda l: parse(l))

start_time = time.time()
gmm = GaussianMixture.train(parsedata, 80)
# GaussianMixture.train already fits the model; MLlib models have no fit() method
print time.time() - start_time

# testing Gaussian mixture model for python
start_time = time.time()
# print sample1
gmix = mixture.GMM(n_components=90, covariance_type='full')
gmix.fit(parsedata)
# gmix.predict(parsedInSample1)
end_time = time.time()
gmpython = end_time - start_time
print gmpython
# print data1.take(5)

# Without converting the features into dense vectors, transformation with zero mean will raise
# exception on sparse vector.
# data2 will be unit variance and zero mean.
data2 = label.zip(scaler1.transform(features.map(lambda x: Vectors.dense(x.toArray()))))
parsedData = data2.map(lambda x: x[1])
parsedData.cache()

modelList = []
d = dict()

noClusters = 5
convergenceTol = 1e-3
maxIterations = 1000
seed = random.getrandbits(19)

# Build the model (cluster the data)
gmm = GaussianMixture.train(parsedData, noClusters, convergenceTol, maxIterations, seed)

# output parameters of model
for i in range(noClusters):  # was noOfClusters, which is not defined here
    print("weight = ", gmm.weights[i],
          "mu = ", gmm.gaussians[i].mu,
          "sigma = ", gmm.gaussians[i].sigma.toArray())

"""
for clusterSize in range(2, 21, 2):
    # Build the model (cluster the data)
    clusters = KMeans.train(parsedData, clusterSize, maxIterations=10, runs=10,
                            initializationMode="random")
    modelList.append(clusters)

    # Evaluate clustering by computing Within Set Sum of Squared Errors
    def error(point):
        center = clusters.centers[clusters.predict(point)]
        return sqrt(sum([x**2 for x in (point - center)]))
:param convergenceTol: Convergence threshold. Default to 1e-3
:param maxIterations: Number of EM iterations to perform. Default to 100
:param seed: Random seed
"""
parser = argparse.ArgumentParser()
parser.add_argument('inputFile', help='Input File')
parser.add_argument('k', type=int, help='Number of clusters')
parser.add_argument('--convergenceTol', default=1e-3, type=float, help='convergence threshold')
parser.add_argument('--maxIterations', default=100, type=int, help='Number of iterations')
parser.add_argument('--seed', default=random.getrandbits(19), type=long, help='Random seed')
args = parser.parse_args()

conf = SparkConf().setAppName("GMM")
sc = SparkContext(conf=conf)

lines = sc.textFile(args.inputFile)
data = lines.map(parseVector)
model = GaussianMixture.train(data, args.k, args.convergenceTol,
                              args.maxIterations, args.seed)
for i in range(args.k):
    print(("weight = ", model.weights[i],
           "mu = ", model.gaussians[i].mu,
           "sigma = ", model.gaussians[i].sigma.toArray()))
print("\n")
print(("The membership value of each vector to all mixture components (first 100): ",
       model.predictSoft(data).take(100)))
print("\n")
print(("Cluster labels (first 100): ", model.predict(data).take(100)))
sc.stop()
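parseVector is referenced above but its definition is not part of this fragment; a minimal version consistent with how it is used here (one space-separated point per input line) might look like the following sketch.

# Hypothetical parseVector, inferred from usage; the original definition is not shown in the fragment.
import numpy as np

def parseVector(line):
    # turn a line such as "1.0 2.0 3.0" into array([1.0, 2.0, 3.0])
    return np.array([float(x) for x in line.split(' ')])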
row_num = info_df.filter(info_df.high == 'IT').count()

for index, repo in enumerate(repos):
    for pk_aids in repo:
        elements = repo.get(pk_aids)
        for element in elements:
            for col_index, col in enumerate(cols):
                if element.get(col) is not None:
                    rows[index].get(pk_aids)[col_index] = element.get(col)
                    print(element.get(col))

for index, row in enumerate(rows):
    for pk_aids in row:
        if rows[index].get(pk_aids) is not None:
            if index == 0:
                data = rows[index].get(pk_aids)
            else:
                data = np.concatenate((data, rows[index].get(pk_aids)), axis=0)
print(data)

# Parameters:
#   data – RDD of data points
#   k – Number of components
#   convergenceTol – Threshold value to check the convergence criteria. Defaults to 1e-3
#   maxIterations – Number of iterations. Default to 100
#   seed – Random Seed
#   initialModel – GaussianMixtureModel for initializing learning
model = GaussianMixture.train(data, 10, convergenceTol=0.0001, maxIterations=50)
labels = model.predict(data).collect()
print
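The comment block above lists the train() parameters; a small sketch of a call that exercises them, including initialModel, could look like this (the toy 2-D data, parameter values, and the SparkContext sc are assumptions, not taken from the original code):

# Illustrative only: made-up data, assumes an existing SparkContext named sc.
import numpy as np
from pyspark.mllib.clustering import GaussianMixture

toy = sc.parallelize([np.array([0.0, 0.1]), np.array([0.2, -0.1]),
                      np.array([5.0, 5.2]), np.array([4.8, 5.1])])
first_pass = GaussianMixture.train(toy, 2, convergenceTol=1e-3, maxIterations=20, seed=7)
# warm-start a second run from the first model via initialModel (k must match)
refined = GaussianMixture.train(toy, 2, maxIterations=50, initialModel=first_pass)
print(refined.weights)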
df = pd.DataFrame(l,
                  index=['gp1_P', 'gp2_P', 'gp3_P', 'gp4_P', 'gp5_P', 'gp6_P'],
                  columns=['gp1_R', 'gp2_R', 'gp3_R', 'gp4_R', 'gp5_R', 'gp6_R'])
df

# ### Interpretation (to be finished)
# With KMeans, two groups stand out: 4 and 6.
# Group gp1_P gathers 123 of the individuals and clearly mixes gp1_R / gp2_R / gp3_R.

# ## Gaussian Mixture

# In[12]:

from pyspark.mllib.clustering import GaussianMixture

# Build the model with the same dataTrain as KMeans
gmm = GaussianMixture.train(dataTrain, 6)

# output the parameters of all 6 components (the original loop only printed the first two)
for i in range(6):
    print("weight = ", gmm.weights[i],
          "mu = ", gmm.gaussians[i].mu,
          "sigma = ", gmm.gaussians[i].sigma.toArray())

# ### Interpretation (to be finished)

# # Evaluation measures (in progress)

# In[30]:

from pyspark.mllib.evaluation import MultilabelMetrics
from numpy import array
from pyspark import SparkContext
from pyspark.mllib.clustering import GaussianMixture  # needed for GaussianMixture.train below
import matplotlib.pyplot as plt
import numpy as np

# plt.figure()
sc = SparkContext()
data = sc.textFile("./coord.txt")
# test_plot = np.genfromtxt("./coord.txt", delimiter=',', dtype=float)
# plt.plot(test_plot[:, 1], test_plot[:, 0], 'ro')
# plt.show()
parsedData = data.map(lambda line: array([float(x) for x in line.strip().split(',')]))
l = 3
gmm = GaussianMixture.train(parsedData, l)
# x = np.zeros(90000)
# y = np.zeros(90000)
# for i in range(0, l):
#     print "w= ", gmm.weights[i]
#     print "sigma= ", gmm.gaussians[i].sigma.toArray()
#     print "mu= ", gmm.gaussians[i].mu
# x1 = gmm.weights[0]*np.random.multivariate_normal(gmm.gaussians[0].mu, gmm.gaussians[0].sigma.toArray(), 90000)
# x2 = gmm.weights[1]*np.random.multivariate_normal(gmm.gaussians[1].mu, gmm.gaussians[1].sigma.toArray(), 90000)
file = open("./GMM.txt", 'w')
for j in range(0, l):
    file.write(str(gmm.weights[j]) + '\n')