def training_batched_som(map_min_size, map_max_size, nb_models, X_train):
    """Train a batch of SOMs with random map sizes and plot their errors.

    Each model is trained on ``X_train`` with a map whose two dimensions are
    drawn uniformly from ``[map_min_size, map_max_size)``, then dumped to
    ``path + "batched_model_<i>.joblib"``.  Afterwards every saved model is
    reloaded and its topographic / quantization errors are scatter-plotted so
    the best model can be selected visually.

    :param map_min_size: inclusive lower bound for each map dimension
    :param map_max_size: exclusive upper bound for each map dimension
    :param nb_models: number of models to train
    :param X_train: training data matrix (rows = samples)
    """
    for i in range(nb_models):
        sm = SOMFactory().build(
            X_train,
            # random.randrange(a, b) is the idiomatic, allocation-free
            # equivalent of random.choice(list(range(a, b))).
            mapsize=[random.randrange(map_min_size, map_max_size),
                     random.randrange(map_min_size, map_max_size)],
            normalization='var',
            initialization='random',
            component_names=names,  # NOTE(review): module-level global — confirm it is defined
            lattice="hexa")
        sm.train(n_job=1, verbose=False,
                 train_rough_len=30, train_finetune_len=100)
        joblib.dump(sm, path + "batched_model_{}.joblib".format(i))
        print("end of training model n°" + str(i))

    # Study the models trained and plot the errors obtained
    # in order to select the best one.
    models_pool = glob.glob(path + "batched_model*")
    errors = []
    for model_filepath in models_pool:
        sm = joblib.load(model_filepath)
        topographic_error = sm.calculate_topographic_error()
        quantization_error = sm.calculate_quantization_error()
        errors.append((topographic_error, quantization_error))
    e_top, e_q = zip(*errors)

    plt.scatter(e_top, e_q)
    plt.xlabel("Topographic error")
    plt.ylabel("Quantization error")
    plt.title("Topographic and quantization errors of the models trained")
    plt.show()
def sm_training(self):
    """Train a SOM with the parameters read from the GUI widgets.

    Prompts the user for a CSV file, builds a SOM from its contents using
    the map size / lattice / normalization / initialization entries of the
    dialog, trains it with the rough- and fine-tune settings, pickles the
    model to ``Models/sm_model`` and prints the two quality errors.
    """
    file = askopenfilename(initialdir=dir_name, title="Select Data",
                           filetypes=[("csv files", "*.csv")])
    # askopenfilename returns an empty string (not None) when the dialog is
    # cancelled; also abort instead of falling through with no file chosen.
    if not file:
        tk.messagebox.showerror("Error", "your chosen file is not valid. \n Please choose again.")
        return
    # Close the file handle deterministically.
    with open(file, "rb") as content:
        data = pd.read_csv(content)
    comp_names = [name for name in data.columns]
    df = data.fillna(0).values  # SOM cannot handle NaNs; impute with 0
    # initialize the build
    sm = SOMFactory().build(
        data=df,
        mapsize=(int(self.Mapsize_x.get()), int(self.Mapsize_y.get())),
        mask=None,
        mapshape='planar',
        lattice=self.Lattice_ent.get(),
        normalization=self.Normalization_ent.get(),
        initialization=self.Initialization_ent.get(),
        neighborhood='gaussian',
        training='batch',
        name='sompy',
        # Bug fix: the local comp_names computed above was never used and
        # self.comp_names is not set anywhere in this method.
        component_names=comp_names)
    # start training
    sm.train(n_job=int(self.n_job_ent.get()),
             shared_memory=self.shared_memory_ent.get(),
             verbose=self.verbose_ent.get(),
             train_rough_len=int(self.train_rough_len_ent.get()),
             train_rough_radiusin=int(self.train_rough_rin_ent.get()),
             train_rough_radiusfin=int(self.train_rough_rfin_ent.get()),
             train_finetune_len=int(self.train_ft_len_ent.get()),
             train_finetune_radiusin=int(self.train_ft_rin_ent.get()),
             train_finetune_radiusfin=int(self.train_ft_rfin_ent.get()),
             train_len_factor=int(self.train_len_factor_ent.get()),
             # np.Inf was removed in NumPy 2.0; np.inf is the canonical name.
             maxtrainlen=np.inf)
    # errors calculation
    topographic_error = sm.calculate_topographic_error()
    quantization_error = np.mean(sm._bmu[1])
    # if multiple runs are required
    # joblib.dump(sm, "model_{}.joblib".format(i))
    with open("Models/sm_model", "wb") as out:
        pickle.dump(sm, out)
    # print errors on the cmd prompt
    print("the topographic error is %s " % topographic_error)
    print("the quantization error is %s " % quantization_error)
def _prediction(self):
    """SOM function.

    Trains a SOM on the recorded traffic data, projects the most recent
    sample from ``pdata1.txt`` onto it, logs the prediction and either
    triggers congestion handling (pred <= 0.5) or records the prediction
    for the next interval.
    """
    try:
        data = np.loadtxt('/home/mininet/testmininet/trainingdata1.txt',
                          delimiter=',')
        names = [
            'Interval', 'Throughput(Mbits/0.5sec)', 'Bandwidth(Mbits/sec)',
            'Jitter(ms)', 'Loss', 'Decision'
        ]
        sm = SOMFactory().build(data, normalization='var',
                                initialization='random',
                                component_names=names)
        sm.train(n_job=1, verbose='info',
                 train_rough_len=15, train_finetune_len=15)
        topographic_error = sm.calculate_topographic_error()
        quantization_error = np.mean(sm._bmu[1])
        # Fix: close the file handle instead of leaking it via
        # open(...).readlines().
        with open('/home/mininet/testmininet/pdata1.txt') as f:
            line = f.readlines()
        log.debug(line)
        comp = line[0].split(",")
        # Drop the trailing field produced by the line's final comma.
        del comp[-1]
        data2 = np.array([[
            float(comp[0]),
            float(comp[1]),
            float(comp[2]),
            float(comp[3]),
            float(comp[4])
        ]])
        sm.cluster(5)
        pred = np.absolute(sm.predict_by(data2, 5))
        self.details.write(comp[4] + "\t" + comp[1] + "\t" + str(pred[0]) +
                           "\t" + str(topographic_error) + "\n")
        print(pred)
        if pred <= 0.5:
            print("No congestion")
            self._congdelay(pred)
        elif pred > 0.5:
            print("Congestion there for next 5 seconds atleast")
            self.prevpred = pred
    except IndexError:
        # pdata1.txt may be empty before the first measurement arrives.
        print("ERROR")
def training_specific_som(map_x_size, map_y_size, X_train):
    """Train a single SOM with an explicit map size, persist it, and return it.

    The model is saved to ``path + "batched_model_specific0.joblib"`` and its
    topographic / quantization errors are printed before returning.
    """
    model = SOMFactory().build(X_train,
                               mapsize=[map_x_size, map_y_size],
                               normalization='var',
                               initialization='random',
                               component_names=names,
                               lattice='hexa')
    model.train(n_job=1, verbose=False,
                train_rough_len=30, train_finetune_len=100)
    joblib.dump(model, path + "batched_model_specific{}.joblib".format(0))
    topo = model.calculate_topographic_error()
    quant = model.calculate_quantization_error()
    print("Topographic error: " + str(topo) +
          ", Quantization error: " + str(quant) + "\n")
    return model
def build_som(self, X):
    """Fit a 15x15 PCA-initialised SOM on X, report errors, return the model."""
    print('Building SOM...')
    som = SOMFactory().build(X,
                             normalization='var',
                             mapsize=(15, 15),
                             initialization='pca')
    som.train(n_job=1, verbose='info',
              train_rough_len=200, train_finetune_len=100)
    # Average distance to each sample's BMU approximates the quantization error.
    topo_err = som.calculate_topographic_error()
    quant_err = np.mean(som._bmu[1])
    print("Topographic error = {}; Quantization error = {}"
          .format(topo_err, quant_err))
    return som
def soms(data):
    """Train a 5x5 SOM on a stacked space-time field.

    Input: 3-D array (nt, ny, nx); each time slice is flattened
    (Fortran order) into one row before training.
    """
    nt, ny, nx = data.shape
    flattened = np.reshape(data, [nt, ny * nx], order='F')
    model = SOMFactory().build(flattened,
                               mapsize=(5, 5),
                               normalization=None,
                               initialization='pca')
    model.train(n_job=-1, verbose=False,
                train_rough_len=20, train_finetune_len=10)
    return model
def self_organizing_map(normalized_df,
                        normalization='var',
                        initialization='pca',
                        n_job=1,
                        train_rough_len=2,
                        train_finetune_len=5,
                        verbose=None):
    """Build and train a SOM on a (pre-normalized) dataframe.

    You can experiment with different normalizations and initializations.
    After training, two quality measures are printed:
      * quantization error — average distance between each data vector
        and its BMU;
      * topographic error — proportion of data vectors whose first and
        second BMUs are not adjacent units.
    """
    network = SOMFactory().build(normalized_df.values,
                                 normalization=normalization,
                                 initialization=initialization,
                                 component_names=normalized_df.columns)
    network.train(n_job=n_job,
                  train_rough_len=train_rough_len,
                  train_finetune_len=train_finetune_len,
                  verbose=verbose)
    topo_err = network.calculate_topographic_error()
    quant_err = np.mean(network._bmu[1])
    print("Topographic error = %s; Quantization error = %s" %
          (topo_err, quant_err))
    return network
def cluster_category_data(df, scale_data='minmax', dim_red_method='som',
                          use_elbow_method='True', cluster_method='hierarchical',
                          n_clusters=None, verbose=1, perplexity=None):
    """
    :param df: dataframe containing all the columns belonging to a category to be used in clustering
    :param scale_data: method to be used to scale the dataset
    :param dim_red_method: options are 'som', 'umap', 'tsne', None. If None, do clustering directly.
    :param use_elbow_method: if True, elbow method is used to find the optimum number of clusters. If False, n_clusters needs to be specified
    :param cluster_method: options are 'kmeans' and 'hierarchical'. In either case kmeans is used for the elbow method(because of the time required).
    :param n_clusters: If use_elbow_method is False, n_clusters needs to be given.
    :param verbose: If True, output the progress in clustering process
    :param perplexity: If method used is TSNE, perplexity nedds to be specified
    """
    # NOTE(review): use_elbow_method defaults to the *string* 'True', which is
    # always truthy; callers must pass the boolean False to disable it.
    t = time.time()  # wall-clock start, reported at the end when verbose
    if scale_data == 'minmax':
        X = MinMaxScaler().fit_transform(df)
    elif scale_data == 'standard':
        X = StandardScaler().fit_transform(df)
    else:
        # any other value means "use the raw values unscaled"
        X = df.values
    if verbose:
        print(f'number of features = {df.shape[1]}')
    if dim_red_method == 'som':
        if verbose:
            print(
                'Self Organising Maps is being used for dimensionality reduction...'
            )
        opt_k = 2
        max_s = -1  # best silhouette score seen so far across map sizes
        f = 0  # NOTE(review): never used — leftover variable
        for mapsize in [(30, 30)]:
            if verbose:
                print(f'map size = {mapsize}')
            sm = SOMFactory().build(X, normalization='var',
                                    initialization='pca', mapsize=mapsize)
            sm.train(n_job=1, verbose=False, train_rough_len=100,
                     train_finetune_len=500)
            if use_elbow_method:
                # Elbow is computed on the SOM codebook (much smaller than X).
                model = KElbowVisualizer(KMeans(), k=20, timings=False)
                elbow = model.fit(sm.codebook.matrix).elbow_value_
                if elbow and verbose:
                    print(f'elbow value = {elbow}')
                if not elbow:
                    if verbose:
                        print('elbow not found')
                    # Fallback: pick k by silhouette score on the codebook.
                    ms = -1
                    for k in range(2, 20):
                        km_labels = KMeans(k).fit_predict(sm.codebook.matrix)
                        s = silhouette_score(sm.codebook.matrix, km_labels)
                        # NOTE(review): ms is never updated, so every k with
                        # s > -1 overwrites elbow and the loop effectively
                        # selects the *last* k, not the best one. Suspected
                        # bug — probably should be `ms = s` alongside
                        # `elbow = k`. Left unchanged here.
                        if s > ms:
                            elbow = k
            else:
                elbow = n_clusters
            # Map every sample to its BMU, then to that BMU's cluster label.
            x = sm.project_data(X)
            labels, _, _ = sm.cluster(opt=elbow, cl_type=cluster_method)
            clabels = []
            for i in range(X.shape[0]):
                clabels.append(labels[x[i]])
            s_score = silhouette_score(X, clabels)
            if verbose:
                print(f'silhouette score = {round(s_score, 3)}')
            max_s = max(s_score, max_s)
            if (max_s == s_score):
                # This map size is the best so far; remember its outcome.
                opt_k = elbow
                opt_labels = clabels
                opt_size = mapsize
            if (max_s > s_score):
                # Score got worse — stop trying larger maps.
                break
        if verbose:
            print(f'optimum mapsize = {opt_size}')
            print(
                f'optimum number of clusters = {opt_k} & silhouette score = {round(max_s,3)}'
            )
            print(f'time taken = {round(time.time()-t,1)}')
        return opt_labels, opt_k
    elif dim_red_method:
        # 2-D embedding first (UMAP or t-SNE), then cluster the embedding.
        if dim_red_method == 'umap':
            print('UMAP is being used for dimensionality reduction...')
            embedding = umap.UMAP(n_components=2, n_neighbors=5,
                                  min_dist=0.0001, metric='euclidean',
                                  random_state=1, spread=0.5,
                                  n_epochs=1000).fit_transform(X)
            print('UMAP embedding done...')
        elif dim_red_method == 'tsne':
            print('t-SNE is being used for dimensionality reduction...')
            embedding = TSNE(perplexity=perplexity).fit_transform(X)
            print('t-SNE embedding is done...')
        if use_elbow_method:
            model = KElbowVisualizer(KMeans(), k=20, timings=False)
            elbow = model.fit(embedding).elbow_value_
        else:
            elbow = n_clusters
        if cluster_method == 'kmeans':
            opt_labels = KMeans(elbow).fit_predict(embedding)
        elif cluster_method == 'hierarchical':
            opt_labels = AgglomerativeClustering(elbow).fit_predict(embedding)
        if verbose:
            # Silhouette is evaluated in the original feature space, not
            # on the 2-D embedding.
            s_score = silhouette_score(X, opt_labels)
            print(
                f'number of clusters = {elbow} and silhouette_score = {s_score}'
            )
        return opt_labels, elbow
    else:
        # No dimensionality reduction: cluster the (scaled) data directly.
        if use_elbow_method:
            model = KElbowVisualizer(KMeans(), k=20, timings=False)
            elbow = model.fit(X).elbow_value_
        else:
            elbow = n_clusters
        if cluster_method == 'kmeans':
            opt_labels = KMeans(elbow).fit_predict(X)
        elif cluster_method == 'hierarchical':
            opt_labels = AgglomerativeClustering(elbow).fit_predict(X)
        print(f'silhouette score = {round(silhouette_score(X,opt_labels),3)}')
        return opt_labels, elbow
class MySOM:
    """Convenience wrapper around sompy: build, train, visualize and cluster a SOM."""

    def __init__(self, df, mapsize, initialization='random'):
        """
        :param df: input data frame
        :param mapsize: output-layer dimensions, normally 2-D, given as e.g. (20, 20)
        :param initialization: "PCA" or "random" — method used to initialize the weights
            - PCA initializes weights from the variables' principal components,
              see sompy.codebool.pca_linear_initialization
            - random initializes with random numbers
        """
        self.data = np.array(df)
        self.sm = SOMFactory().build(self.data,
                                     mapsize=mapsize,
                                     initialization=initialization,
                                     component_names=df.columns)
        # Train immediately so the instance is ready to use after construction.
        self.train()

    def train(self):
        # Fixed short training schedule; adjust here if more epochs are needed.
        self.sm.train(n_job=1,
                      verbose=False,
                      train_rough_len=2,
                      train_finetune_len=5)

    def print_error(self):
        # Topographic error: fraction of samples whose 1st/2nd BMUs are not adjacent.
        # Quantization error: mean distance between each sample and its BMU.
        topographic_error = self.sm.calculate_topographic_error()
        quantization_error = np.mean(self.sm._bmu[1])
        print("Topographic error = %s; Quantization error = %s" %
              (topographic_error, quantization_error))

    def draw_input_weights(self):
        # Component planes: one heatmap per input variable.
        from sompy.visualization.mapview import View2D
        view2D = View2D(10, 10, "rand data", text_size=10)
        view2D.show(self.sm, col_sz=4, which_dim="all", desnormalize=True)
        plt.show()

    def draw_hit_map(self):
        # Hit map: number of samples mapped to each neuron.
        from sompy.visualization.bmuhits import BmuHitsView
        vhts = BmuHitsView(4, 4, "Hits Map", text_size=12)
        vhts.show(self.sm,
                  anotate=True,
                  onlyzeros=False,
                  labelsize=12,
                  cmap="Greys",
                  logaritmic=False)
        plt.show()

    def draw_cluster_map(self):
        # Cluster map: neuron-level cluster assignment (requires cluster() first).
        from sompy.visualization.hitmap import HitMapView
        hits = HitMapView(20, 20, "Clustering", text_size=12)
        hits.show(self.sm)
        plt.show()

    def cluster(self, n):
        self.sm.cluster(n)

    def get_cluster_label(self):
        # length equals mapsize[0] * mapsize[1]
        return self.sm.cluster_labels

    def get_neurons(self):
        """
        Get the BMU neuron of each sample in the original data; the upstream
        package does not expose this, so it is implemented here.
        :return: array, length = self.df.shape[0]
        """
        return self.sm._bmu[0]

    def get_label(self):
        """
        Get the cluster label of each sample in the original data; the upstream
        package does not expose this, so it is implemented here.
        :return: array, length = self.df.shape[0]
        """
        neurons_label_dict = {
            i: j
            for i, j in enumerate(self.sm.cluster_labels)
        }
        return np.array([neurons_label_dict[i] for i in self.sm._bmu[0]])

    def predict(self, x):
        """
        Using the cluster label as y, apply a machine-learning algorithm of choice.
        :param x: feature matrix to predict on
        :return: not implemented yet
        """
        pass
# 14. SOM # 14.1. SOM FOR CONSUMPTION # Define SOM grid size mapsize_consump = 9 # Create algorithm, define parameters and train the grid sm_consump = SOMFactory().build(data=std_cons.values, mapsize=(mapsize_consump, mapsize_consump), normalization='var', initialization='random', component_names=Consumption.columns, lattice='rect', training='batch') sm_consump.train(n_job=6, verbose='info', train_rough_len=35, train_finetune_len=80) # 'final_clusters_consump' is a dataframe similar to df but including a column 'Labels' which indicates the closest neuron to each obs final_clusters_consump = pd.DataFrame(sm_consump._data, columns=Consumption.columns).set_index(Consumption.index) my_labels_c = pd.DataFrame(sm_consump._bmu[0], columns=['Labels']).set_index(Consumption.index) final_clusters_consump = pd.concat([final_clusters_consump, my_labels_c], axis=1) # Plot the number of observations associated to each neuron vhts_c = BmuHitsView(12, 12, "Hits Map", text_size=7) vhts_c.show(sm_consump, anotate=True, onlyzeros=False, labelsize=10, cmap="summer", logaritmic=False) plt.show() # Visualization of the value of the grid neurons in each variable view2D_c = View2D(9, 9, "", text_size=7) view2D_c.show(sm_consump, col_sz=5, what='codebook')
# -*- coding: utf-8 -*- """ Created on Sat Oct 7 15:09:18 2017 @author: Ethan """ import numpy as np from matplotlib import pyplot as plt from sompy.sompy import SOMFactory data = np.random.randint(0, 255, (100, 3)) dims = np.array([5, 5]) iterations = 2000 learningRate = 0.01 # normalize data = data / data.max() sm = SOMFactory().build(data, normalization = 'var', initialization='random', component_names=['r', 'g', 'b']) sm.train(n_job=1, verbose=False, train_rough_len=2, train_finetune_len=5) topographic_error = sm.calculate_topographic_error() quantization_error = np.mean(sm._bmu[1])
# NOTE(review): this `if` appears to sit inside a loop over samples (index i)
# that starts before this chunk — confirm against the full file.
if meta['AGE'][i] < 0.3:
    ##  Z.append([mags[i] for i in gunn + isubaru])
    # NOTE(review): the comprehension variable i shadows the outer loop index i
    # (here it iterates over the gunn band names) — confirm this is intended.
    Z.append([mags[i] for i in gunn])
##
Z = np.array(Z)

print(Z)
print('\n\n')

sm = SOMFactory().build(Z, normalization='var', initialization='random',
                        component_names=gunn)
sm.train(n_job=1, verbose=False, train_rough_len=2, train_finetune_len=5)

# Topographic error: fraction of samples whose 1st/2nd BMUs are not adjacent.
topographic_error = sm.calculate_topographic_error()
# Quantization error: mean distance between each sample and its BMU.
quantization_error = np.mean(sm._bmu[1])

print("Topographic error = %s; Quantization error = %s" %
      (topographic_error, quantization_error))

# Hit map: number of samples assigned to each neuron.
vhts = BmuHitsView(10, 10, 'Hits Map', text_size=7)
vhts.show(sm, anotate=True, onlyzeros=False, labelsize=12, cmap='Greys',
          logaritmic=False)
ap_kmeans = {"cc": cc_kmeans, "sil_score": silhouette_avg, "sizes": sizes} ### 2. Approach: SOM followed by K-Means scaler = StandardScaler() cust_norm = scaler.fit_transform(df[customer_related_num]) df_cust_norm = pd.DataFrame(cust_norm, columns=customer_related_num) X = df_cust_norm.values sm = SOMFactory().build(data=X, mapsize=(8, 8), normalization='var', initialization="pca", component_names=customer_related_num, lattice="hexa", training="batch") sm.train(n_job=5, verbose='info', train_rough_len=40, train_finetune_len=100) final_clusters = pd.DataFrame(sm._data, columns=customer_related_num) my_labels = pd.DataFrame(sm._bmu[0]) final_clusters = pd.concat([final_clusters, my_labels], axis=1) cluster_cols = customer_related_num + ["Labels"] final_clusters.columns = cluster_cols som_cluster = final_clusters.groupby("Labels").mean() #create_elbowgraph(10, som_cluster) kmeans = KMeans(n_clusters=3, random_state=1).fit(som_cluster) som_cluster["somk_cluster"] = kmeans.labels_ k_cluster = som_cluster.groupby("somk_cluster").mean() k_cluster = pd.DataFrame(scaler.inverse_transform(X=k_cluster), columns=customer_related_num) final_clusters["somk_cluster"] = [ som_cluster.loc[i, "somk_cluster"] for i in final_clusters["Labels"].values ]
# NOTE(review): these first statements appear to belong inside a loop over
# columns c (as in the similar cell later in this file) — confirm against
# the full file.
encoder = OrdinalEncoder()
try:
    df[c] = encoder.fit_transform(df[c].values.reshape(-1, 1))
except TypeError:
    # Column cannot be ordinally encoded — drop it ("apagar" = delete).
    print('apagar ', c)
    df = df.drop(columns=c)

sm = SOMFactory().build(df.values, [50, 50], mask=None, mapshape='planar',
                        lattice='rect', normalization='var',
                        initialization='pca',
                        component_names=list(df.columns))
sm.train(n_job=2, verbose='info', train_rough_len=30, train_finetune_len=20)

# Persist the trained model to Google Drive.
with open(
        '/content/drive/My Drive/IC_Cristine/SOM/som_primeiro.pkl',
        'wb') as arq:
    pickle.dump(sm, arq)

# Component planes: one heatmap per input variable.
view2D = sompy.mapview.View2D(100, 100, "rand data", text_size=14)
view2D.show(sm, col_sz=5, which_dim="all", denormalize=True)

# Topographic error: fraction of samples whose 1st/2nd BMUs are not adjacent.
topographic_error = sm.calculate_topographic_error()
# Quantization error: mean distance between each sample and its BMU.
quantization_error = np.mean(sm._bmu[1])
print("Topographic error = {0}; Quantization error = {1}".format(
    topographic_error, quantization_error))

# NOTE(review): list literal continues beyond this chunk.
colunas_apagar = [
# Re-number both splits from 0 so positional and label indexing agree.
training_split_0.reset_index(inplace=True, drop=True)
validation_split_0.reset_index(inplace=True, drop=True)
print(training_split_0.head())

# Train the data
mapSize = [20, 20]
sm = SOMFactory().build(training_split_0.values, mapSize,
                        normalization="var",
                        lattice="rect",
                        initialization="random",
                        component_names=training_split_0.columns)
sm.train(
    n_job=1, verbose=None, train_rough_len=2,
    train_finetune_len=100)  # I left some of the codes as the example provided

# plot the results, components map
from sompy.visualization.mapview import View2D
view2D = View2D(20, 20, "", text_size=12)
view2D.show(sm, col_sz=3, which_dim="all", denormalize=False)

# Hit maps
from sompy.visualization.bmuhits import BmuHitsView
vhts = BmuHitsView(15, 10, "Hits Map", text_size=12)
# NOTE(review): this call continues beyond this chunk.
vhts.show(sm, anotate=False, onlyzeros=False,
df = df.dropna(how='any')
# Resetting index so it goes in order, since I only selected april, a standard
# index is all messed up april 2008 to april 2009 skips a ton of index values,
# and this makes it impossible to combine cluster output index with df
df = df.reset_index(level=0)
del df['index']
unique = df

##Investigating
# 5*sqrt(row*column)
som = SOMFactory().build(df.values, mapsize=[20, 20], normalization='var',
                         initialization='pca', component_names=names,
                         neighborhood='gaussian', lattice='rect')
som.train(n_job=1, verbose='info', train_rough_len=30, train_finetune_len=30)

# The quantization error: average distance between each data vector and its BMU.
# The topographic error: the proportion of all data vectors for which first and second BMUs are not adjacent units.
topographic_error = som.calculate_topographic_error()
quantization_error = np.mean(som._bmu[1])
# Fix: Python 2 `print` statement converted to the function form
# (valid on both Python 2 and 3 with a single argument).
print("Topographic error = %s; Quantization error = %s" %
      (topographic_error, quantization_error))

from sompy.visualization.mapview import View2D
view2D = View2D(4, 4, "rand data", text_size=16)
view2D.show(som, col_sz=2, which_dim="all", denormalize=True)

# U-matrix plot
##Investigating
# 5*sqrt(row*column)
df.info()
dfdrop = df.drop_duplicates()
dfdrop.info()

# Sweep square map sizes 10x10 .. 24x24 and record both error measures.
topo = []
quant = []
array = np.arange(10, 25, 1)
for i in array:
    som = SOMFactory().build(df.values, mapsize=[i, i], normalization='var',
                             initialization='pca', component_names=names,
                             neighborhood='gaussian', lattice='rect')
    som.train(n_job=1, verbose='info', train_rough_len=50,
              train_rough_radiusin=4, train_finetune_radiusin=1,
              train_finetune_len=50)
    topo.append(som.calculate_topographic_error())
    quant.append(np.mean(som._bmu[1]))
    # Fix: Python 2 `print i` statement converted to the function form.
    print(i)

# Trade-off plot: each point is one grid size, coloured by n.
plt.scatter(topo, quant, c=array, s=50)
plt.title('Self Organizing Map')
plt.xlabel('Topographic Error')
plt.ylabel('Quantization Error')
plt.colorbar(label='grid size nxn')

# Final model with the chosen 20x20 grid.
som = SOMFactory().build(df.values, mapsize=[20, 20], normalization='var',
                         initialization='pca', component_names=names,
                         neighborhood='gaussian', lattice='rect')
# %% for c in df: if df[c].dtype == 'object': encoder = OrdinalEncoder() df[c] = encoder.fit_transform(df[c].values.reshape(-1, 1)) # %% sm = SOMFactory().build(df.values, [50, 50], mask=None, mapshape='planar', lattice='rect', normalization='var', initialization='pca', component_names=list(df.columns)) sm.train(n_job=-1, verbose='info', train_rough_len=50, train_finetune_len=30) topographic_error = sm.calculate_topographic_error() quantization_error = np.mean(sm._bmu[1]) print("Topographic error = {0}; Quantization error = {1}".format( topographic_error, quantization_error)) view2D = sompy.mapview.View2D(100, 100, "rand data", text_size=14) view2D.show(sm, col_sz=5, which_dim="all", denormalize=True) # %% colunas_apagar = [ 'TIPOBITO', 'data_obito', 'data_nasc', 'res_MSAUDCOD',