while iter_ < int(np.floor(x_valid.shape[0] / batch_size)):
    batch_x = x_valid[iter_ * batch_size:(iter_ + 1) * batch_size, :].T.reshape(1, sequence_len, batch_size)
    batch_y = y_valid[np.newaxis, iter_ * batch_size:(iter_ + 1) * batch_size]
    errors_valid[iter_] = sess.run(prediction - batch_y,
                                   feed_dict={input_: batch_x, target: batch_y})
    iter_ += 1

# Estimate the mean and standard deviation of the error vector.
# Since the batch size may be larger than 1 and we keep the error of each
# last batch_y, we need to cut off the trailing zero values.
errors_valid = errors_valid[:iter_].flatten()
gaussian_mixture = mixture.GaussianMixture(n_components=n_mixtures)
gm = gaussian_mixture.fit(errors_valid.reshape(-1, 1))
means_valid = gm.means_[:, 0]
stds_valid = gm.covariances_[:, 0, 0] ** .5  # take the square root: covariances_ holds variances
weights_valid = gm.weights_

# test
predictions = np.zeros(shape=(int(np.floor(x_test.shape[0] / batch_size)), batch_size))
y_test = y_test[:x_test.shape[0]]

# anomalies' statistics
gaussian_error_statistics = np.zeros(shape=(len(predictions), batch_size))
errors_test = np.zeros(shape=(len(predictions), batch_size))
threshold = [scistats.norm.pdf(mean - sigma_threshold * std, mean, std)
             for (mean, std) in zip(means_valid, stds_valid)]
anomalies = np.array([np.array([False for _ in range(batch_size)])
                      for _ in range(len(y_test))])
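# A minimal, self-contained sketch of the thresholding idea above, assuming
# 1-D prediction errors and a fixed sigma_threshold. Names mirror the snippet,
# the data is synthetic, and the anomaly rule (density below every
# per-component threshold) is one plausible reading, not the original author's code.
import numpy as np
import scipy.stats as scistats
from sklearn import mixture

rng = np.random.default_rng(0)
errors_valid = np.concatenate([rng.normal(0.0, 0.1, 500),      # mostly small errors
                               rng.normal(1.5, 0.3, 50)])       # a few large ones

n_mixtures, sigma_threshold = 2, 3
gm = mixture.GaussianMixture(n_components=n_mixtures).fit(errors_valid.reshape(-1, 1))
means = gm.means_[:, 0]
stds = gm.covariances_[:, 0, 0] ** 0.5                           # covariances_ stores variances
thresholds = [scistats.norm.pdf(m - sigma_threshold * s, m, s)
              for m, s in zip(means, stds)]

# Flag a test error as anomalous when its density under every component falls
# below that component's density at mean - sigma_threshold * std.
errors_test = rng.normal(0.0, 0.1, 20)
densities = np.array([scistats.norm.pdf(errors_test, m, s) for m, s in zip(means, stds)])
anomalies = np.all(densities < np.array(thresholds)[:, None], axis=0)
print(anomalies)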
def make_graphical_experiments(algorithms=[], n_samples=1500, run_scikit_algorithms=True, SAVE_PLOTS=False, results_file_name=''): noisy_circles = make_circles(n_samples=n_samples, factor=.5, noise=.05) noisy_moons = make_moons(n_samples=n_samples, noise=.05) noisy_square = np.random.rand(n_samples, 2), None blobs = make_blobs(n_samples=n_samples, random_state=8) random_state = 170 X, y = make_blobs(n_samples=n_samples, random_state=random_state) transformation = [[0.6, -0.6], [-0.4, 0.8]] X_aniso = np.dot(X, transformation) anisotropic_blobs = (X_aniso, y) varied_blobs = make_blobs(n_samples=n_samples, cluster_std=[1.0, 2.5, 0.5], random_state=random_state) if run_scikit_algorithms: scikit_algorithms = range(9) else: scikit_algorithms = [] plt.figure(figsize=((len(scikit_algorithms) + len(algorithms)) * 2 + 3, 12.5)) plt.subplots_adjust(left=.02, right=.98, bottom=.001, top=.96, wspace=.05, hspace=.01) plot_num = 1 default_base = { 'quantile': .3, 'eps': .3, 'damping': .9, 'preference': -200, 'n_neighbors': 50, 'n_clusters': 3 } datasets = [(noisy_circles, { 'damping': .77, 'preference': -240, 'quantile': .2, 'n_clusters': 2 }), (noisy_moons, { 'damping': .75, 'preference': -220, 'n_clusters': 2 }), (varied_blobs, { 'eps': .18, 'n_neighbors': 2 }), (anisotropic_blobs, { 'eps': .15, 'n_neighbors': 2 }), (blobs, {}), (noisy_square, {})] for i_dataset, (dataset, algo_params) in enumerate(datasets): params = default_base.copy() params.update(algo_params) X, y = dataset X = StandardScaler().fit_transform(X) # estimate bandwidth for mean shift bandwidth = cluster.estimate_bandwidth(X, quantile=params['quantile']) # connectivity matrix for structured Ward G, pos, labels = generate_dataset_from_euclidean_points( X, similarity_measure=lambda p, q: np.exp(-(np.linalg.norm(p - q) / 1. )**2), threshold=.8) G, pos, labels = connect_dataset_connected_components(G, pos, labels) connectivity = nx.to_scipy_sparse_matrix(G) print("Dataset: ", i_dataset) if run_scikit_algorithms: ms = cluster.MeanShift(bandwidth=bandwidth, bin_seeding=True) two_means = cluster.MiniBatchKMeans( n_clusters=params['n_clusters']) ward = cluster.AgglomerativeClustering( n_clusters=params['n_clusters'], linkage='ward', connectivity=connectivity) spectral = cluster.SpectralClustering( n_clusters=params['n_clusters'], eigen_solver='arpack', affinity="nearest_neighbors") dbscan = cluster.DBSCAN(eps=params['eps']) affinity_propagation = cluster.AffinityPropagation( damping=params['damping'], preference=params['preference']) average_linkage = cluster.AgglomerativeClustering( linkage="average", affinity="cityblock", n_clusters=params['n_clusters'], connectivity=connectivity) birch = cluster.Birch(n_clusters=params['n_clusters']) gmm = mixture.GaussianMixture(n_components=params['n_clusters'], covariance_type='full') scikit_algorithms = [('MiniBatchKMeans', two_means), ('AffinityProp', affinity_propagation), ('MeanShift', ms), ('SpectralClustering', spectral), ('Ward', ward), ('AggloClustering', average_linkage), ('DBSCAN', dbscan), ('Birch', birch), ('GaussianMixture', gmm)] for name, algorithm in scikit_algorithms: t0 = time() with warnings.catch_warnings(): warnings.filterwarnings( "ignore", message="the number of connected components of the " + "connectivity matrix is [0-9]{1,2}" + " > 1. 
Completing it to avoid stopping the tree early.", category=UserWarning) warnings.filterwarnings( "ignore", message= "Graph is not fully connected, spectral embedding" + " may not work as expected.", category=UserWarning) algorithm.fit(X) t1 = time() if hasattr(algorithm, 'labels_'): y_pred = algorithm.labels_.astype(np.int) else: y_pred = algorithm.predict(X) plt.subplot(len(datasets), len(scikit_algorithms) + len(algorithms), plot_num) if i_dataset == 0: plt.title(name, size=18) colors = np.array( list( islice( cycle([ '#377eb8', '#ff7f00', '#4daf4a', '#f781bf', '#a65628', '#984ea3', '#999999', '#e41a1c', '#dede00' ]), int(max(y_pred) + 1)))) plt.scatter(X[:, 0], X[:, 1], s=10, color=colors[y_pred]) plt.xlim(-2.5, 2.5) plt.ylim(-2.5, 2.5) plt.xticks(()) plt.yticks(()) # plt.text(.99, .01, ('%.2fs' % (t1 - t0)).lstrip('0'), # transform=plt.gca().transAxes, size=15, # horizontalalignment='right') plot_num += 1 for name, algorithm in algorithms: t0 = time() clusters = algorithm(G) t1 = time() y_pred = clusters_list2clusters_dict(clusters).values() plt.subplot(len(datasets), len(scikit_algorithms) + len(algorithms), plot_num) if i_dataset == 0: plt.title(name, size=18) colors = np.array( list( islice( cycle([ '#377eb8', '#ff7f00', '#4daf4a', '#f781bf', '#a65628', '#984ea3', '#999999', '#e41a1c', '#dede00' ]), int(max(y_pred) + 1)))) plt.scatter(X[:, 0], X[:, 1], s=10, color=colors[y_pred]) plt.xlim(-2.5, 2.5) plt.ylim(-2.5, 2.5) plt.xticks(()) plt.yticks(()) # plt.text(.99, .01, ('%.2fs' % (t1 - t0)).lstrip('0'), # transform=plt.gca().transAxes, size=15, # horizontalalignment='right') plot_num += 1 if SAVE_PLOTS: plt.savefig(results_file_name + ".pdf", bbox_inches='tight') plt.savefig(results_file_name + ".png", bbox_inches='tight') else: plt.show()
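# The comparison scaffold above, cut down to its core as a rough sketch:
# generate a toy dataset, standardize it, fit a few scikit-learn clusterers
# plus a GaussianMixture, and plot each labelling in a row of subplots. The
# graph-based algorithms and the custom helpers
# (generate_dataset_from_euclidean_points, clusters_list2clusters_dict, ...)
# are left out.
import matplotlib.pyplot as plt
from sklearn import cluster, mixture
from sklearn.datasets import make_moons
from sklearn.preprocessing import StandardScaler

X, _ = make_moons(n_samples=500, noise=.05)
X = StandardScaler().fit_transform(X)

algorithms = [
    ('MiniBatchKMeans', cluster.MiniBatchKMeans(n_clusters=2, n_init=3)),
    ('DBSCAN', cluster.DBSCAN(eps=.3)),
    ('GaussianMixture', mixture.GaussianMixture(n_components=2, covariance_type='full')),
]

fig, axes = plt.subplots(1, len(algorithms), figsize=(9, 3))
for ax, (name, algorithm) in zip(axes, algorithms):
    algorithm.fit(X)
    y_pred = (algorithm.labels_.astype(int) if hasattr(algorithm, 'labels_')
              else algorithm.predict(X))
    ax.scatter(X[:, 0], X[:, 1], s=10, c=y_pred)
    ax.set_title(name, size=10)
    ax.set_xticks(())
    ax.set_yticks(())
plt.savefig("cluster_comparison.png", bbox_inches='tight')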
M=64, num_gpus=arguments.num_gpus, arguments=arguments) if arguments.cuda: mdl.cuda() mdl.load_state_dict(torch.load(path)) # train the base distribution if 0 & os.path.exists(path2 + 'Kmog{}.gmm'.format(Kcomps)): GMM = pickle.load( open(path2 + 'Kmog{}.gmm'.format(Kcomps), 'rb')) else: if use_gmms[0]: GMM = mix.GaussianMixture(n_components=Kcomps, verbose=1, n_init=3, max_iter=200, covariance_type='diag') GMM.fit(all_hhats.data.cpu().numpy()) pickle.dump( GMM, open(path2 + 'Kmog{}.gmm'.format(Kcomps), 'wb')) mdl.initialize_GMMparams(GMM=GMM) if use_gmms[1]: BGMM = mix.GaussianMixture(n_components=Kcomps, verbose=1, n_init=3, max_iter=200, covariance_type='full') BGMM.fit(all_hhats.data.cpu().numpy()) pickle.dump(
y.append(int(l.split(",")[-2]))

X = np.array(X, dtype=np.float32)

scaler = StandardScaler()
scaler.fit(X)
X = scaler.transform(X)

pca = PCA(n_components=2)
pca.fit(X)
dr_X = pca.transform(X)

# plot_bic(X)
gmm = mixture.GaussianMixture(n_components=2, covariance_type='tied')
gmm.fit(dr_X)

newX = []
for pt in dr_X:
    newX.append(gmm.predict(pt.reshape(1, -1))[0])
newX = np.array(newX)
newX = to_categorical(newX)
y = to_categorical(y)

INIT_LR = 5E-4
EPOCHS = 500
BS = 50
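# A compact sketch of the PCA -> GMM -> one-hot pipeline above, on stand-in
# numeric data. gmm.predict accepts the whole array at once, so the per-point
# loop is not strictly needed; np.eye indexing plays the role of to_categorical.
import numpy as np
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn import mixture

rng = np.random.default_rng(0)
X = rng.normal(size=(200, 10)).astype(np.float32)     # stand-in feature matrix

X = StandardScaler().fit_transform(X)
dr_X = PCA(n_components=2).fit_transform(X)

gmm = mixture.GaussianMixture(n_components=2, covariance_type='tied').fit(dr_X)
labels = gmm.predict(dr_X)                             # vectorised, one call
one_hot = np.eye(2, dtype=np.float32)[labels]          # same effect as to_categorical
print(one_hot[:5])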
def init_std(self, x, gmm_mu=None, gmm_cv=None, weights=None, inv_maxstd=1e-1, beta_constant=0.5, component_overwrite=None, beta_override=None, n_samples=2, z_override=None, sigma=None): if component_overwrite is not None: self.num_components = component_overwrite if z_override is None: with torch.no_grad(): mu, lv = torch.chunk(self.encoder(x.to(self.device)), chunks=2, dim=-1) z = td.Normal(loc=mu, scale=lv.mul(0.5).exp() + 1e-10).sample( [n_samples]) z = z.reshape(int(x.shape[0] * n_samples), z.shape[-1]) else: z = z_override N, D = x.shape d = z.shape[1] inv_maxstd = inv_maxstd # 1.0 / x.std(dim=0).mean() # x.std(dim=0).mean() #D*x.var(dim=0).mean() if gmm_mu is None and gmm_cv is None and weights is None: from sklearn import mixture clf = mixture.GaussianMixture(n_components=self.num_components, covariance_type='spherical') clf.fit(z.cpu().numpy()) self.gmm_means = clf.means_ self.gmm_covariances = clf.covariances_ self.clf_weights = clf.weights_ else: print('loading weights...') self.gmm_means = gmm_mu self.gmm_covariances = gmm_cv self.clf_weights = weights if beta_override is None: beta = beta_constant.cpu() / torch.tensor( self.gmm_covariances, dtype=torch.float, requires_grad=False) else: beta = beta_override self.beta = beta.to(self.device) self.dec_std = nnj.Sequential( nnj.RBF(d, self.num_components, points=torch.tensor(self.gmm_means, dtype=torch.float, requires_grad=False), beta=self.beta), # d --> num_components nnj.PosLinear(self.num_components, 1, bias=False), # num_components --> 1 nnj.Reciprocal(inv_maxstd), # 1 --> 1 nnj.PosLinear(1, D)).to(self.device) # 1 --> D if sigma is not None: self.dec_std[0] = nnj.RBF_variant( d, self.gmm_means.shape[0], points=torch.tensor(self.gmm_means, dtype=torch.float, requires_grad=False), beta=self.beta.requires_grad_(False), boxwidth=sigma).to(self.device) with torch.no_grad(): self.dec_std[1].weight[:] = ( (torch.tensor(self.clf_weights, dtype=torch.float).exp() - 1.0).log()).to(self.device)
def gmm(k):
    model = mixture.GaussianMixture(n_components=k,
                                    covariance_type='full',
                                    random_state=100)
    return model
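# Hypothetical usage of a factory like the one above: pick k on synthetic data
# by BIC. The factory is restated here so the example is self-contained.
import numpy as np
from sklearn import mixture

def gmm(k):
    return mixture.GaussianMixture(n_components=k, covariance_type='full', random_state=100)

X = np.vstack([np.random.default_rng(1).normal(c, 0.3, size=(100, 2))
               for c in (0.0, 3.0, 6.0)])
best_k = min(range(1, 7), key=lambda k: gmm(k).fit(X).bic(X))
print("best k by BIC:", best_k)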
ward = cluster.AgglomerativeClustering(n_clusters=params['n_clusters'],
                                       linkage='ward',
                                       connectivity=connectivity)
dbscan = cluster.DBSCAN(eps=params['eps'])
optics = cluster.OPTICS(min_samples=params['min_samples'],
                        xi=params['xi'],
                        min_cluster_size=params['min_cluster_size'])
affinity_propagation = cluster.AffinityPropagation(
    damping=params['damping'], preference=params['preference'])
average_linkage = cluster.AgglomerativeClustering(
    linkage="average",
    affinity="cityblock",
    n_clusters=params['n_clusters'],
    connectivity=connectivity)
birch = cluster.Birch(n_clusters=params['n_clusters'])
gmm = mixture.GaussianMixture(n_components=params['n_clusters'],
                              covariance_type='full')

clustering_algorithms = (('My_KMeans', my_kmeans),
                         ('My_GMM', my_gmm),
                         ('My_SpectralClustering', my_spectral),
                         ('MiniBatchKMeans', two_means),
                         ('AffinityPropagation', affinity_propagation),
                         ('MeanShift', ms),
                         ('Ward', ward),
                         ('AgglomerativeClustering', average_linkage),
                         ('DBSCAN', dbscan),
                         ('OPTICS', optics),
                         ('Birch', birch),
                         ('GaussianMixture', gmm))

# Inner loop: iterate over each clustering algorithm
for name, algorithm in clustering_algorithms:
    t0 = time.time()
def mixture_gaussian(param, n_samples, components=0, name=None, analyze=False): if path.exists(f'{base_dir}/gm_{name}_samples.pkl'): best_gmm = load_mixture_gaussian(name) if not analyze and global_data.seed == 0: print(f'Load samples from file {name}') pickle_in = open(f'{base_dir}/gm_{name}_samples.pkl', "rb") dict = pickle.load(pickle_in) samples = dict['samples'] if samples.shape[0] == n_samples: return samples else: name = f'{name}_{n_samples}' if path.exists(f'{base_dir}/gm_{name}_samples.pkl'): print(f'Load samples from file {name}') pickle_in = open(f'{base_dir}/gm_{name}_samples.pkl', "rb") dict = pickle.load(pickle_in) return dict['samples'] else: print('Load distribution') best_gmm = load_mixture_gaussian(name) else: bic = [] lowest_bic = np.infty max_components = param.shape[1] if param.shape[1] < 15 else 15 if components != 0: gmm = mixture.GaussianMixture(n_components=components, covariance_type='full', max_iter=5000, tol=1e-15, n_init=20) gmm.fit(param) print( f'Lowest bic with number of components {components}: {gmm.bic(param)}' ) best_gmm = gmm else: for n_components in range(1, max_components): # Fit a Gaussian mixture with EM gmm = mixture.GaussianMixture(n_components=n_components, covariance_type='full', max_iter=5000, tol=1e-15, n_init=20) gmm.fit(param) bic.append(gmm.bic(param)) if bic[-1] < lowest_bic: components = n_components lowest_bic = bic[-1] print( f'Lowest bic with number of components {n_components}: {lowest_bic}' ) best_gmm = gmm samples = best_gmm.sample(n_samples)[0] if name is not None and not analyze and global_data.seed == 0: print(f'Save samples and mixture gaussian in file {name}') dict = {'samples': samples} pickle_out = open(f'{base_dir}/gm_{name}_samples.pkl', "wb") pickle.dump(dict, pickle_out) dict = { 'comp': components, 'weights': best_gmm.weights_, 'means': best_gmm.means_, 'cov': best_gmm.covariances_, 'precision': best_gmm.precisions_cholesky_ } pickle_out = open(f'{base_dir}/gm_{name}_dist.pkl', "wb") pickle.dump(dict, pickle_out) if analyze: centers = best_gmm.means_ if centers.shape[-1] == 9: centers = centers.reshape(centers.shape[0], 3, 3) centers = centers.reshape(1, centers.shape[0], 3, 3) print(best_gmm.weights_) # plot_weights(centers, name) # mixture_analysis(best_gmm.weights_, best_gmm.means_, best_gmm.covariances_, name) return best_gmm # return samples return samples
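# A stripped-down version of the selection loop above: sweep n_components,
# keep the fit with the lowest BIC, then draw samples from it. The file
# caching, `base_dir` and `global_data` parts are left out, and the data is a
# random stand-in for `param`.
import numpy as np
from sklearn import mixture

param = np.random.default_rng(0).normal(size=(300, 4))

lowest_bic, best_gmm = np.inf, None
for n_components in range(1, 6):
    gmm = mixture.GaussianMixture(n_components=n_components,
                                  covariance_type='full', n_init=3).fit(param)
    bic = gmm.bic(param)
    if bic < lowest_bic:
        lowest_bic, best_gmm = bic, gmm

samples, _ = best_gmm.sample(200)     # sample() returns (samples, component_labels)
print(samples.shape, lowest_bic)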
# np.savetxt("Data_train_rand_seed="+str(0)+".csv", complete_D_train,delimiter=",") ##Stack the test complete_D_test = np.zeros([len(test_idx), num_stacked * n]) len_test = len(test_idx) for i in range(len(sorted_test_idx)): idx = sorted_test_idx[i] idx_left = idx - 1 while idx_left not in sorted_training_idx: idx_left -= 1 point_tr = sorted_training_idx.index(idx_left) complete_D_test[i] = complete_D_train[point_tr] complete_D_test[i][0:n] = Data[idx][0:n] # np.savetxt("Data_test_rand_seed="+str(0)+".csv", complete_D_test,delimiter=",") #####INITIALIZATION!!! gmm = mixture.GaussianMixture(n_components=num_clusters, covariance_type="full") gmm.fit(complete_D_train) clustered_points = gmm.predict(complete_D_train) clustered_points_test = gmm.predict(complete_D_test) gmm_clustered_pts_test = gmm.predict(complete_D_test) gmm_clustered_pts = clustered_points + 0 gmm_covariances = gmm.covariances_ gmm_means = gmm.means_ ##USE K-means kmeans = KMeans(n_clusters=num_clusters, random_state=0).fit(complete_D_train) clustered_points_kmeans = kmeans.labels_ clustered_points_test_kmeans = kmeans.predict(complete_D_test)
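# The initialization pattern above in isolation: fit a full-covariance GMM and
# a KMeans model on the same stacked training matrix and keep both label sets.
# Random data stands in for `complete_D_train`.
import numpy as np
from sklearn import mixture
from sklearn.cluster import KMeans
from sklearn.metrics import adjusted_rand_score

num_clusters = 3
complete_D_train = np.random.default_rng(0).normal(size=(500, 8))

gmm = mixture.GaussianMixture(n_components=num_clusters, covariance_type="full")
clustered_points = gmm.fit(complete_D_train).predict(complete_D_train)

kmeans = KMeans(n_clusters=num_clusters, random_state=0, n_init=10).fit(complete_D_train)
clustered_points_kmeans = kmeans.labels_

# Cluster ids are arbitrary, so compare the two assignments with a
# permutation-invariant score rather than elementwise equality.
print("ARI(GMM, KMeans):", adjusted_rand_score(clustered_points, clustered_points_kmeans))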
def fit(self, input_file): """ Main method for TICC solver. Parameters: - input_file: location of the data file """ assert self.maxIters > 0 # must have at least one iteration self.log_parameters() # Get data into proper format times_series_arr, time_series_rows_size, time_series_col_size = self.load_data( input_file) ############ # The basic folder to be created str_NULL = self.prepare_out_directory() # Train test split training_indices = getTrainTestSplit( time_series_rows_size, self.num_blocks, self.window_size) # indices of the training samples num_train_points = len(training_indices) # Stack the training data complete_D_train = self.stack_training_data(times_series_arr, time_series_col_size, num_train_points, training_indices) print("here") # Initialization # Gaussian Mixture gmm = mixture.GaussianMixture(n_components=self.number_of_clusters, covariance_type="full") print("here maybe") gmm.fit(complete_D_train) print("here past gmmfit") clustered_points = gmm.predict(complete_D_train) gmm_clustered_pts = clustered_points + 0 # K-means print("here at kmeans") kmeans = KMeans(n_clusters=self.number_of_clusters, random_state=0).fit(complete_D_train) clustered_points_kmeans = kmeans.labels_ # todo, is there a difference between these two? kmeans_clustered_pts = kmeans.labels_ print("here again") train_cluster_inverse = {} log_det_values = {} # log dets of the thetas computed_covariance = {} cluster_mean_info = {} cluster_mean_stacked_info = {} old_clustered_points = None # points from last iteration empirical_covariances = {} # PERFORM TRAINING ITERATIONS pool = Pool(processes=self.num_proc) # multi-threading for iters in range(self.maxIters): print("\n\n\nITERATION ###", iters) # Get the train and test points train_clusters_arr = collections.defaultdict( list) # {cluster: [point indices]} for point, cluster_num in enumerate(clustered_points): train_clusters_arr[cluster_num].append(point) len_train_clusters = { k: len(train_clusters_arr[k]) for k in range(self.number_of_clusters) } # train_clusters holds the indices in complete_D_train # for each of the clusters opt_res = self.train_clusters( cluster_mean_info, cluster_mean_stacked_info, complete_D_train, empirical_covariances, len_train_clusters, time_series_col_size, pool, train_clusters_arr) self.optimize_clusters(computed_covariance, len_train_clusters, log_det_values, opt_res, train_cluster_inverse) # update old computed covariance old_computed_covariance = computed_covariance print("UPDATED THE OLD COVARIANCE") self.trained_model = { 'cluster_mean_info': cluster_mean_info, 'computed_covariance': computed_covariance, 'cluster_mean_stacked_info': cluster_mean_stacked_info, 'complete_D_train': complete_D_train, 'time_series_col_size': time_series_col_size } clustered_points = self.predict_clusters() # recalculate lengths new_train_clusters = collections.defaultdict( list) # {cluster: [point indices]} for point, cluster in enumerate(clustered_points): new_train_clusters[cluster].append(point) len_new_train_clusters = { k: len(new_train_clusters[k]) for k in range(self.number_of_clusters) } before_empty_cluster_assign = clustered_points.copy() if iters != 0: cluster_norms = [(np.linalg.norm( old_computed_covariance[self.number_of_clusters, i]), i) for i in range(self.number_of_clusters)] norms_sorted = sorted(cluster_norms, reverse=True) # clusters that are not 0 as sorted by norm valid_clusters = [ cp[1] for cp in norms_sorted if len_new_train_clusters[cp[1]] != 0 ] # Add a point to the empty clusters # assuming more non empty clusters 
than empty ones counter = 0 for cluster_num in range(self.number_of_clusters): if len_new_train_clusters[cluster_num] == 0: cluster_selected = valid_clusters[ counter] # a cluster that is not len 0 counter = (counter + 1) % len(valid_clusters) print("cluster that is zero is:", cluster_num, "selected cluster instead is:", cluster_selected) start_point = np.random.choice( new_train_clusters[cluster_selected] ) # random point number from that cluster for i in range(0, self.cluster_reassignment): # put cluster_reassignment points from point_num in this cluster point_to_move = start_point + i if point_to_move >= len(clustered_points): break clustered_points[point_to_move] = cluster_num computed_covariance[ self.number_of_clusters, cluster_num] = old_computed_covariance[ self.number_of_clusters, cluster_selected] cluster_mean_stacked_info[ self.number_of_clusters, cluster_num] = complete_D_train[ point_to_move, :] cluster_mean_info[self.number_of_clusters, cluster_num] \ = complete_D_train[point_to_move, :][ (self.window_size - 1) * time_series_col_size:self.window_size * time_series_col_size] for cluster_num in range(self.number_of_clusters): print("length of cluster #", cluster_num, "-------->", sum([x == cluster_num for x in clustered_points])) self.write_plot(clustered_points, str_NULL, training_indices) # TEST SETS STUFF # LLE + swtiching_penalty # Segment length # Create the F1 score from the graphs from k-means and GMM # Get the train and test points train_confusion_matrix_EM = compute_confusion_matrix( self.number_of_clusters, clustered_points, training_indices) train_confusion_matrix_GMM = compute_confusion_matrix( self.number_of_clusters, gmm_clustered_pts, training_indices) train_confusion_matrix_kmeans = compute_confusion_matrix( self.number_of_clusters, kmeans_clustered_pts, training_indices) ###compute the matchings matching_EM, matching_GMM, matching_Kmeans = self.compute_matches( train_confusion_matrix_EM, train_confusion_matrix_GMM, train_confusion_matrix_kmeans) print("\n\n\n") if np.array_equal(old_clustered_points, clustered_points): print("\n\n\n\nCONVERGED!!! BREAKING EARLY!!!") break old_clustered_points = before_empty_cluster_assign # end of training if pool is not None: pool.close() pool.join() train_confusion_matrix_EM = compute_confusion_matrix( self.number_of_clusters, clustered_points, training_indices) train_confusion_matrix_GMM = compute_confusion_matrix( self.number_of_clusters, gmm_clustered_pts, training_indices) train_confusion_matrix_kmeans = compute_confusion_matrix( self.number_of_clusters, clustered_points_kmeans, training_indices) self.compute_f_score(matching_EM, matching_GMM, matching_Kmeans, train_confusion_matrix_EM, train_confusion_matrix_GMM, train_confusion_matrix_kmeans) if self.compute_BIC: bic = computeBIC(self.number_of_clusters, time_series_rows_size, clustered_points, train_cluster_inverse, empirical_covariances) print("this is the val,", bic) if iters > 998: bic = 999999999 return clustered_points, train_cluster_inverse, bic return clustered_points, train_cluster_inverse, bic return clustered_points, train_cluster_inverse
def __init__(self, data, n_labels, beta_init=1, stencil=None, normalize=True): """ Args: data (:obj:`np.ndarray`): Multidimensional data array containing all observations (features) in the following shape: 1D = (Y, F) 2D = (Y, X, F) 3D = (Y, X, Z, F) n_labels (int): Number of labels representing the number of clusters to be segmented. beta_init (float): Initial penalty value for Gibbs energy calculation. stencil (int): Number specifying the stencil of the neighborhood system used in the Gibbs energy calculation. """ # TODO: [DOCS] Main object description # store initial data self.data = data # get shape for physical and feature dimensions self.shape = np.shape(data) self.phys_shp = np.array(self.shape[:-1]) # get number of features self.n_feat = self.shape[-1] # GRAPH COLORING self.stencil = stencil self.colors = pseudocolor(self.shape, self.stencil) # ************************************************************************************************ # fetch dimensionality, coordinate and feature vector from input data # 1D if len(self.shape) == 2: # 1d case self.dim = 1 # create coordinate vector # self.coords = np.array([np.arange(self.shape[0])]).T # feature vector self.feat = self.data # 2D elif len(self.shape) == 3: # 2d case self.dim = 2 # create coordinate vector # y, x = np.indices(self.shape[:-1]) # print(y, x) # self.coords = np.array([y.flatten(), x.flatten()]).T # feature vector self.feat = np.array( [self.data[:, :, f].ravel() for f in range(self.n_feat)]).T # 3D elif len(self.shape) == 4: # 3d case raise Exception("3D segmentation not yet supported.") # mismatch else: raise Exception( "Data format appears to be wrong (neither 1-, 2- or 3-D).") if normalize: self.normalize_feature_vectors() # ************************************************************************************************ # INIT GAUSSIAN MIXTURE MODEL self.n_labels = n_labels self.gmm = mixture.GaussianMixture(n_components=n_labels, covariance_type="full") self.gmm.fit(self.feat) # do initial prediction based on fit and observations, store as first entry in labels # ************************************************************************************************ # INIT LABELS, MU and COV based on GMM # TODO: [GENERAL] storage variables from lists to numpy ndarrays self.labels = np.array([self.gmm.predict(self.feat)]) # INIT MU (mean from initial GMM) self.mus = np.array([self.gmm.means_]) # INIT COV (covariances from initial GMM) self.covs = np.array([self.gmm.covariances_]) self.labels_probability = np.zeros( (1, self.labels.shape[1], self.n_labels)) self.storage_gibbs_e = np.zeros( [1, self.labels.shape[1], self.n_labels]) self.storage_like_e = np.zeros( [1, self.labels.shape[1], self.n_labels]) self.storage_te = np.zeros([1, self.labels.shape[1], self.n_labels]) self.beta_acc_ratio = np.array([]) self.cov_acc_ratio = np.array([]) self.mu_acc_ratio = np.array([]) # ************************************************************************************************ # Initialize PRIOR distributions for beta, mu and covariance # BETA if self.dim == 1: self.prior_beta = norm(beta_init, np.eye(1) * 100) self.betas = [beta_init] elif self.dim == 2: if self.stencil == "4p": beta_dim = 2 elif self.stencil == "8p" or self.stencil is None: beta_dim = 4 self.betas = [[beta_init for i in range(beta_dim)]] self.prior_beta = multivariate_normal( [beta_init for i in range(beta_dim)], np.eye(beta_dim) * 100) elif self.dim == 3: raise Exception("3D not yet supported.") # MU # generate distribution means for each label prior_mu_means 
= [self.mus[0][label] for label in range(self.n_labels)] # generate distribution covariances for each label prior_mu_stds = [ np.eye(self.n_feat) * 100 for label in range(self.n_labels) ] # use the above to generate multivariate normal distributions for each label self.priors_mu = [ multivariate_normal(prior_mu_means[label], prior_mu_stds[label]) for label in range(self.n_labels) ] # COV # generate b_sigma self.b_sigma = np.zeros((self.n_labels, self.n_feat)) for l in range(self.n_labels): self.b_sigma[l, :] = np.log( np.sqrt(np.diag(self.gmm.covariances_[l, :, :]))) # generate kesi self.kesi = np.ones((self.n_labels, self.n_feat)) * 100 # generate nu self.nu = self.n_feat + 1
def fit_gaus(masked_array, ras_fn, ncomp, sampleStep):
    # http://stackoverflow.com/questions/10143905/python-two-curve-gaussian-fitting-with-non-linear-least-squares/19182915#19182915
    X_compress = masked_array.compressed()
    X_reshape = np.reshape(X_compress, (masked_array.compressed().size, 1))

    clf = mixture.GaussianMixture(n_components=ncomp, covariance_type='full')
    clf.fit(X_reshape)

    ml = clf.means_
    wl = clf.weights_
    cl = clf.covariances_
    ms = [m[0] for m in ml]
    cs = [np.sqrt(c[0][0]) for c in cl]  # standard deviations from the covariance matrices
    ws = [w for w in wl]

    sampleStep_str = "%03d" % (sampleStep)
    histo = matplotlib.pyplot.hist(masked_array.compressed(), 300, normed=True,
                                   color='gray', alpha=0.5)
    fig_name = ras_fn.split('/')[-1].strip('.tif') + "_" + str(ncomp) + "_" + sampleStep_str + '.png'

    # Delete out_peaks_csv if it exists
    out_dir = os.path.split(ras_fn)[0]
    out_peaks_csv = os.path.join(out_dir, fig_name.strip('.png') + '.csv')
    if os.path.isfile(out_peaks_csv):
        os.remove(out_peaks_csv)
    print "\tOutput gaussian peaks csv: %s" % (out_peaks_csv)

    with open(out_peaks_csv, 'w') as outpk:
        # Write header if new
        outpk.write('ras_fn,gaus1_mean,gaus1_sd,gaus2_mean,gaus2_sd,gaus3_mean,gaus3_sd\n')
        i = 0
        gauss_num = ''
        outpk.write(ras_fn)  # Start writing the line
        for w, m, c in zip(ws, ms, cs):
            i += 1
            matplotlib.pyplot.plot(histo[1],
                                   w * matplotlib.mlab.normpdf(histo[1], m, c),  # c is already a std dev
                                   linewidth=3)
            matplotlib.pyplot.axis([-5, 15, 0, 1])
            gauss_num = 'Gaussian peak #%s' % (i)
            print '\t' + gauss_num + ' mean: ', m, ' std dev:', c
            outpk.write(',' + str(m) + ',' + str(c))
            # Finish writing the line
            if i == ncomp:
                outpk.write('\n')

    matplotlib.pyplot.savefig(os.path.join(out_dir, fig_name))
    matplotlib.pyplot.clf()
    return (out_peaks_csv)
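# On a current matplotlib, mlab.normpdf and the normed= keyword no longer
# exist; a small sketch of the same histogram-plus-components overlay using
# scipy.stats.norm.pdf and density=True, on synthetic data:
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import norm
from sklearn import mixture

x = np.concatenate([np.random.default_rng(0).normal(0, 1, 1000),
                    np.random.default_rng(1).normal(5, 0.5, 500)]).reshape(-1, 1)

clf = mixture.GaussianMixture(n_components=2, covariance_type='full').fit(x)
grid = np.linspace(x.min(), x.max(), 300)

plt.hist(x.ravel(), bins=100, density=True, color='gray', alpha=0.5)
for w, m, c in zip(clf.weights_, clf.means_[:, 0], clf.covariances_[:, 0, 0]):
    plt.plot(grid, w * norm.pdf(grid, m, np.sqrt(c)), linewidth=3)  # sqrt(variance) = std
plt.savefig("gmm_hist_overlay.png")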
if pc_name == "vision-pc26-Ubuntu":
    data_path = "/home/z2228wan/data/BSDS300/images"
else:
    data_path = "BSDS300/images"

# load data
print("[*] Loading data ...\t", end="")
start = time.time()
train_data = read_data(os.path.join(data_path, "train"), num_samples)
test_data = read_data(os.path.join(data_path, "test"), num_samples)
np.save(f"train_gmm/train_data_{num_samples}.npy", train_data)
np.save(f"train_gmm/test_data_{num_samples}.npy", test_data)
sio.savemat(f"train_gmm/data_{num_samples}.mat",
            dict(train_data=train_data, test_data=test_data))
print(f"{time.time() - start:.3f} s")

# fit a GMM model with EM
gmm = mixture.GaussianMixture(n_components=n_components,
                              covariance_type='full',
                              max_iter=500,
                              tol=1e-6,
                              verbose=2,
                              verbose_interval=1)
if is_train:
    print("[*] Fitting ...")
    start = time.time()
    gmm.fit(train_data)
    print(f"Fitting takes {time.time() - start:.3f} s")
    joblib.dump(gmm, f"train_gmm/gmm_{n_components}.joblib")
else:
    gmm = joblib.load(f"train_gmm/gmm_{n_components}.joblib")

# evaluate the model on the test set
log_prob = gmm.score_samples(test_data)
neg_log_prob = -np.mean(log_prob)
print(f"[*] Negative log likelihood on test set: {neg_log_prob:.3f}")
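# Note that gmm.score(X) already returns the mean per-sample log-likelihood,
# so the manual average of score_samples above can be written either way.
# A tiny check on synthetic data:
import numpy as np
from sklearn import mixture

rng = np.random.default_rng(0)
train, test = rng.normal(size=(1000, 3)), rng.normal(size=(200, 3))

gmm = mixture.GaussianMixture(n_components=4, covariance_type='full').fit(train)
neg_log_prob = -np.mean(gmm.score_samples(test))
assert np.isclose(neg_log_prob, -gmm.score(test))
print(f"negative log likelihood on test set: {neg_log_prob:.3f}")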
print(df.shape, df.dtype) dat = df[:, 0:64] tar1 = df[:, 64] X = dat y = tar1 x1 = [] a1 = [] a2 = [] t1 = [] t2 = [] i1 = [] for k in range(2, 16, 2): gmm = mixture.GaussianMixture(n_components=k, covariance_type='full', random_state=777) gmm.fit(X) cluster_labels = gmm.predict(X) l1 = np.reshape(cluster_labels, (5619, 1)) print(cluster_labels.shape, l1.shape, X.shape) print(cluster_labels) X1 = np.hstack((l1, X)) X_train, X_test, y_train, y_test = train_test_split(X1, y, test_size=0.3, random_state=20) clf = MLPClassifier(solver='sgd', activation='relu', alpha=0.03, momentum=0.9,
trimData = np.array(trimData[1:],dtype=np.float) # %% # %% #labels.remove('STID') labels = np.array(labels) labels = np.delete(labels, 0) # remove first element in array #labels = labels.astype(np.float) # %% View data thus far for i in range(3): print "data " + str(data[i+1]) + "\n" print "labels " + str(labels[i]) + "\n" print "trim" + str(trimData[i]) + "\n" # %% start = time.time() gmix = mixture.GaussianMixture(n_components=6, covariance_type='full') gmix.fit(trimData) end = time.time() print(end - start) print gmix.means_ predictions = gmix.predict(trimData) # %%view data kylabels = pd.DataFrame({'a':labels}) kylabels.head(10) kylabels.tail(10) kylabels['a'].value_counts() # %% # %% cllabels = pd.DataFrame({'a':predictions})
if Fs == all_emotion_Fs: features = extract_MFCCs(x, Fs, window * Fs, window_overlap * Fs, voiced_threshold_mul, voiced_threshold_range, calc_deltas) all_emotion_data.append(features) else: print sample_file + " skipped due to mismatch in frame rate" all_emotion_data = np.concatenate(all_emotion_data, 0) #print all_emotion_data.shape try: gmm = mixture.GaussianMixture( n_components=n_mixtures, covariance_type='diag', max_iter=max_iterations).fit(all_emotion_data) except: print "ERROR : Error while training model for file " + emotion try: joblib.dump(gmm, 'train_models/' + emotion_name + '.pkl') except: print "ERROR : Error while saving model for " + emotion_name spct += 1 print "Training Completed" confusion_matrix = np.zeros((total_sp, total_sp)) tct = 0
def cal(algo, labels_true, labels_pred):
    print('%-30s\t%.3f\t%.3f\t%.3f' % (
        algo,
        metrics.normalized_mutual_info_score(labels_true, labels_pred,
                                             average_method='arithmetic'),
        metrics.homogeneity_score(labels_true, labels_pred),
        metrics.completeness_score(labels_true, labels_pred),
    ))

labels_pred = cluster.KMeans(n_clusters=np.unique(tar).shape[0],
                             random_state=30).fit_predict(Data)
cal('K-Means', labels, labels_pred)

labels_pred = cluster.AffinityPropagation(damping=0.6,
                                          preference=-2000).fit_predict(Data)
cal('AffinityPropagation', labels, labels_pred)

labels_pred = cluster.MeanShift(bandwidth=0.0005, bin_seeding=True).fit_predict(Data)
cal('Mean-Shift', labels, labels_pred)

labels_pred = cluster.SpectralClustering(
    n_clusters=np.unique(tar).shape[0]).fit_predict(Data)
cal('SpectralClustering', labels, labels_pred)

labels_pred = cluster.AgglomerativeClustering(
    n_clusters=np.unique(tar).shape[0]).fit_predict(Data)
cal('AgglomerativeClustering', labels, labels_pred)

labels_pred = cluster.DBSCAN(eps=0.004, min_samples=6).fit_predict(Data)
cal('Dbscan', labels, labels_pred)

labels_pred = mixture.GaussianMixture(
    n_components=np.unique(tar).shape[0]).fit_predict(Data)
cal('GaussianMixtures', labels, labels_pred)
angle = 180. * angle / np.pi # convert to degrees ell = mpl.patches.Ellipse(mean, v[0], v[1], 180. + angle, color=color) ell.set_clip_box(splot.bbox) ell.set_alpha(0.5) splot.add_artist(ell) plt.xticks(()) plt.yticks(()) plt.title(title) compnum = [2, 3, 4, 6, 8, 10] for each in compnum: t0= time.clock() # Fit a Gaussian mixture with EM using n components gmm = mixture.GaussianMixture(n_components= each, covariance_type='full') gmm = gmm.fit(traindata) # print(gmm.means_) print(gmm.converged_) print("Lower Bound: ") print(gmm.lower_bound_) t1= time.clock() timetaken = str(t1-t0) print("Computation Time: " + timetaken) plot_results(traindata, gmm.predict(traindata), gmm.means_, gmm.covariances_, 0, 'Gaussian Mixture') dpgmm = mixture.BayesianGaussianMixture(n_components=each, covariance_type='full').fit(traindata) plot_results(X, dpgmm.predict(X), dpgmm.means_, dpgmm.covariances_, 1,
def create_graph_with_weight(points, normCount): ''' Returns a graph created from cell coordiantes. edge weights set by normalized counts. :param points: shape (n,2); normCount: shape (n) :rtype: ndarray shape (n ,3) ''' edges = {} var = normCount.var() delauny = Delaunay(points) # cellGraph = np.zeros((delauny.simplices.shape[0]*delauny.simplices.shape[1], 4)) cellGraph = np.zeros((points.shape[0]*10, 4)) for simplex in delauny.simplices: simplex.sort() edge0 = str(simplex[0]) + " " + str(simplex[1]) edge1 = str(simplex[0]) + " " + str(simplex[2]) edge2 = str(simplex[1]) + " " + str(simplex[2]) edges[edge0] = 1 edges[edge1] = 1 edges[edge2] = 1 ## remove repetitives edges among triangle i = 0 for kk in edges.keys(): node0 = int(kk.split(sep=" ")[0]) node1 = int(kk.split(sep=" ")[1]) edgeDiff = normCount[node0] - normCount[node1] energy = np.exp((0 - edgeDiff**2)/(2*var)) dist = distance.euclidean(points[node0,:], points[node1,:]) cellGraph[i] = [node0, node1, energy, dist] i = i + 1 tempGraph = cellGraph[0:i] n_components_range = range(1,5) best_component = 1 lowest_bic=np.infty temp_data = tempGraph[:,3].reshape(-1,1) ## GMM of dist for n_components in n_components_range: gmm = mixture.GaussianMixture(n_components = n_components) gmm.fit(temp_data) gmm_bic = gmm.bic(temp_data) if gmm_bic < lowest_bic: best_gmm = gmm lowest_bic = gmm_bic best_component = n_components mIndex = np.where(best_gmm.weights_ == max(best_gmm.weights_))[0] cutoff = best_gmm.means_[mIndex] + 2*np.sqrt(best_gmm.covariances_[mIndex]) for simplex in delauny.simplices: simplex.sort() dist0 = distance.euclidean(points[simplex[0],:], points[simplex[1],:]) dist1 = distance.euclidean(points[simplex[0],:], points[simplex[2],:]) dist2 = distance.euclidean(points[simplex[1],:], points[simplex[2],:]) tempArray = np.array((dist0, dist1, dist2)) badIndex = np.where(tempArray == max(tempArray))[0][0] ## remove longest edges among simplex taiangle. if tempArray[badIndex] > cutoff: edge0 = str(simplex[0]) + " " + str(simplex[1]) edge1 = str(simplex[0]) + " " + str(simplex[2]) edge2 = str(simplex[1]) + " " + str(simplex[2]) edgeCount = 0 if edge0 in edges and edge1 in edges and edge2 in edges: if badIndex == 0: del edges[edge0] elif badIndex == 1: del edges[edge1] elif badIndex == 2: del edges[edge2] ## remove longest edges from edges i = 0 for kk in edges.keys(): ## recrete cellGraph with new edges node0 = int(kk.split(sep=" ")[0]) node1 = int(kk.split(sep=" ")[1]) edgeDiff = normCount[node0] - normCount[node1] energy = np.exp((0 - edgeDiff**2)/(2*var)) dist = distance.euclidean(points[node0,:], points[node1,:]) cellGraph[i] = [node0, node1, energy, dist] i = i + 1 tempGraph = cellGraph[0:i] temp_data = tempGraph[:,3].reshape(-1,1) gmm = mixture.GaussianMixture(n_components = 1) gmm.fit(temp_data) cutoff = gmm.means_[0] + 2*np.sqrt(gmm.covariances_[0]) finalGraph = tempGraph.copy() j=0 for i in np.arange(tempGraph.shape[0]): if tempGraph[i, 3] < cutoff: ### re-test all edges' dist have similar distribution. finalGraph[j] = tempGraph[i] j = j + 1 return finalGraph
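# The cutoff rule used above, in isolation: fit a 1-D GMM to edge lengths,
# pick the number of components by BIC, take the component with the largest
# weight, and prune edges longer than its mean + 2 standard deviations.
# Synthetic distances stand in for the Delaunay edge lengths.
import numpy as np
from sklearn import mixture

dists = np.concatenate([np.random.default_rng(0).gamma(2.0, 1.0, 500),
                        np.random.default_rng(1).uniform(15, 30, 20)]).reshape(-1, 1)

lowest_bic, best_gmm = np.inf, None
for n_components in range(1, 5):
    gmm = mixture.GaussianMixture(n_components=n_components).fit(dists)
    bic = gmm.bic(dists)
    if bic < lowest_bic:
        lowest_bic, best_gmm = bic, gmm

m_index = np.argmax(best_gmm.weights_)
cutoff = best_gmm.means_[m_index, 0] + 2 * np.sqrt(best_gmm.covariances_[m_index, 0, 0])
kept = dists[dists[:, 0] < cutoff]
print(f"cutoff={cutoff:.2f}, kept {kept.shape[0]} of {dists.shape[0]} edges")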
myML.plotML.plotparam_cluster(X, labels_true, "cluster.AgglomerativeClustering()",
                              drawParam=1, n_clusters=nums)

# Test how the AgglomerativeClustering result changes with the linkage method
nums = range(1, 50)
linkages = ['ward', 'complete', 'average']
myML.plotML.plotparam_cluster(X, labels_true, "cluster.AgglomerativeClustering()",
                              drawParam=2, n_clusters=nums, linkage=linkages)

# --- GMM
centers = [[1, 1], [2, 2], [1, 2], [10, 20]]  # centers used to generate the clusters
X, labels_true = myML.DataPre.make_datasets("blobs", n_samples=1000,
                                            centers=centers, cluster_std=0.5)
from sklearn import mixture
from sklearn.metrics import adjusted_rand_score

# Basic GMM usage
clst = mixture.GaussianMixture()
clst.fit(X)
predicted_labels = clst.predict(X)
print("ARI:%s" % adjusted_rand_score(labels_true, predicted_labels))

# Test how the GMM clustering result changes with the n_components parameter
nums = range(1, 20)
myML.plotML.plotparam_cluster(X, labels_true, "mixture.GaussianMixture()",
                              n_components=nums)

# Test how the GMM clustering result changes with the covariance type
nums = range(1, 20)
cov_types = ['spherical', 'tied', 'diag', 'full']
myML.plotML.plotparam_cluster(X, labels_true, "mixture.GaussianMixture()",
                              drawParam=2, n_components=nums, covariance_type=cov_types)
def LSTM_MYAP_TRAIN(Xtrain,ytrain): everya, everyb = add_data(Xtrain,ytrain) num_class = 2 num_features = 10 n_epoch = 20 n_batch = 10 look_back = 2 gmm1 = mixture.GaussianMixture(n_components = 2,covariance_type='full').fit(Xtrain) nm1 = gmm1.predict(Xtrain) #kmeans = KMeans(n_clusters=2, random_state=0).fit(Xtrain) #nm1 = kmeans.labels_ nm1 = nm1.reshape(len(nm1),1) Xtrain = np.concatenate((Xtrain, nm1),axis = 1); Xtrainn = Xtrain ### ytrainn = Xtrain[:,10 ] Xtrain = Xtrain[:,0:Xtrain.shape[1]-1] ytraina = ytrain[ytrainn==0] ytrainb = ytrain[ytrainn==1] ################################################################ Xtraina = Xtrainn[Xtrainn[:,10]==0]; Xtraina = Xtraina[:,0:Xtraina.shape[1]-1] Xtrainb = Xtrainn[Xtrainn[:,10]==1]; Xtrainb = Xtrainb[:,0:Xtrainb.shape[1]-1] Xtraina = np.concatenate((everya,Xtraina),axis = 0) Xtrainb = np.concatenate((everya,Xtrainb),axis = 0) ytraina = np.concatenate((everyb,ytraina),axis = 0) ytrainb = np.concatenate((everyb,ytrainb),axis = 0) num_class = 4 num_features = 10 n_epoch = 20 n_batch = 10 look_back = 2 nb_samples = Xtraina.shape[0] - look_back Xtrain2 = np.zeros((nb_samples,look_back,num_features)) y_train_reshaped2 = np.zeros((nb_samples,1,num_class)) one_hot_labels2 = np.zeros((nb_samples,1,num_class)) ytra = np.array(pd.get_dummies(np.array(ytraina.astype(int).reshape(-1)))) for i in range(nb_samples): y_position = i + look_back Xtrain2[i] = Xtraina[i:y_position] one_hot_labels2[i] = ytra[y_position,:4] model = Sequential() opt = Adam(lr=0.001) model.add(LSTM(4,input_shape=(None, num_features), return_sequences=True)) model.add(TimeDistributed(Dense(num_class,activation = 'tanh'))) model.add(Activation('softmax')) model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['categorical_accuracy']) filepath="weights-improvement1-{epoch:02d}-{categorical_accuracy:.2f}.hdf5" checkpoint = ModelCheckpoint(filepath, monitor='categorical_accuracy', verbose=2, save_best_only=True, mode='max') callbacks_list = [checkpoint] cm1 = model.fit(Xtrain2,one_hot_labels2,epochs=n_epoch,batch_size=n_batch,verbose=2) clf1 = model nb_samples = Xtrainb.shape[0] - look_back Xtrain2 = np.zeros((nb_samples,look_back,num_features)) y_train_reshaped2 = np.zeros((nb_samples,1,num_class)) one_hot_labels2 = np.zeros((nb_samples,1,num_class)) ytra = np.array(pd.get_dummies(np.array(ytrainb.astype(int).reshape(-1)))) for i in range(nb_samples): y_position = i + look_back Xtrain2[i] = Xtrainb[i:y_position] one_hot_labels2[i] = ytra[y_position,:4] model = Sequential() opt = Adam(lr=0.001) model.add(LSTM(4, input_shape=(None, num_features), return_sequences=True,kernel_initializer='random_uniform')) model.add(TimeDistributed(Dense(num_class,activation = 'tanh'))) model.add(Activation('softmax')) model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['categorical_accuracy']) filepath="weights-improvement1-{epoch:02d}-{categorical_accuracy:.2f}.hdf5" checkpoint = ModelCheckpoint(filepath, monitor='categorical_accuracy', verbose=2, save_best_only=True, mode='max') callbacks_list = [checkpoint] n_epoch = 20 cm2 = model.fit(Xtrain2,one_hot_labels2, epochs=n_epoch, batch_size=n_batch, verbose=2) clf2 = model po = ([len(list(group)) for key, group in groupby(np.sort(ytraina))]) pn = ([len(list(group)) for key, group in groupby(np.sort(ytrainb))]) return (clf1, clf2, po, pn, Xtraina, ytraina, Xtrainb, ytrainb)
def __init__(self, K, random_state=42):
    # pass the seed through so the constructor argument actually takes effect
    self.gmm = mixture.GaussianMixture(n_components=K,
                                       covariance_type='full',
                                       random_state=random_state)
def estimate(self, experiment, subset=None): """ Estimate the Gaussian mixture model parameters """ if not experiment: raise util.CytoflowOpError("No experiment specified") if self.xchannel not in experiment.data: raise util.CytoflowOpError( "Column {0} not found in the experiment".format(self.xchannel)) if self.ychannel not in experiment.data: raise util.CytoflowOpError( "Column {0} not found in the experiment".format(self.ychannel)) for b in self.by: if b not in experiment.data: raise util.CytoflowOpError("Aggregation metadata {0} not found" " in the experiment".format(b)) if len(experiment.data[b].unique()) > 100: #WARNING - magic number raise util.CytoflowOpError( "More than 100 unique values found for" " aggregation metadata {0}. Did you" " accidentally specify a data channel?".format(b)) if self.num_components == 1 and self.posteriors: raise util.CytoflowOpError( "If num_components == 1, all posteriors are 1.") if subset: try: experiment = experiment.query(subset) except: raise util.CytoflowViewError( "Subset string '{0}' isn't valid".format(subset)) if len(experiment) == 0: raise util.CytoflowViewError( "Subset string '{0}' returned no events".format(subset)) if self.by: groupby = experiment.data.groupby(self.by) else: # use a lambda expression to return a group that contains # all the events groupby = experiment.data.groupby(lambda x: True) # get the scale. estimate the scale params for the ENTIRE data set, # not subsets we get from groupby(). And we need to save it so that # the data is transformed the same way when we apply() self._xscale = util.scale_factory(self.xscale, experiment, channel=self.xchannel) self._yscale = util.scale_factory(self.yscale, experiment, channel=self.ychannel) gmms = {} for group, data_subset in groupby: if len(data_subset) == 0: raise util.CytoflowOpError( "Group {} had no data".format(group)) x = data_subset.loc[:, [self.xchannel, self.ychannel]] x[self.xchannel] = self._xscale(x[self.xchannel]) x[self.ychannel] = self._yscale(x[self.ychannel]) # drop data that isn't in the scale range x = x[~(np.isnan(x[self.xchannel]) | np.isnan(x[self.ychannel]))] x = x.values gmm = mixture.GaussianMixture(n_components=self.num_components, covariance_type="full", random_state=1) gmm.fit(x) if not gmm.converged_: raise util.CytoflowOpError("Estimator didn't converge" " for group {0}".format(group)) # in the 1D version, we sort the components by the means -- so # the first component has the lowest mean, the second component # has the next-lowest mean, etc. that doesn't work in a 2D area, # obviously. # instead, we assume that the clusters are likely (?) to be # arranged along *one* of the axes, so we take the |norm| of the # x,y mean of each cluster and sort that way. norms = (gmm.means_[:, 0]**2 + gmm.means_[:, 1]**2)**0.5 sort_idx = np.argsort(norms) gmm.means_ = gmm.means_[sort_idx] gmm.weights_ = gmm.weights_[sort_idx] gmm.covariances_ = gmm.covariances_[sort_idx] gmms[group] = gmm self._gmms = gmms
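# The component-sorting trick above on its own: after fitting, reorder means_,
# weights_ and covariances_ by the Euclidean norm of each 2-D mean so that
# component indices are comparable across groups. Toy 2-D data only.
import numpy as np
from sklearn import mixture

X = np.vstack([np.random.default_rng(i).normal(c, 0.2, size=(200, 2))
               for i, c in enumerate([(0, 0), (2, 2), (4, 1)])])

gmm = mixture.GaussianMixture(n_components=3, covariance_type="full",
                              random_state=1).fit(X)

norms = np.linalg.norm(gmm.means_, axis=1)
sort_idx = np.argsort(norms)
gmm.means_ = gmm.means_[sort_idx]
gmm.weights_ = gmm.weights_[sort_idx]
gmm.covariances_ = gmm.covariances_[sort_idx]
# Note: predict()/predict_proba() also use precisions_cholesky_, so permute
# precisions_ and precisions_cholesky_ as well if posteriors are needed after
# reordering.
print(gmm.means_)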
MFCC_GIRL = mfcc(SIG_GIRL, RATE_GIRL, numcep=16) DELTA1_GIRL = delta(MFCC_GIRL, 2) DELTA2_GIRL = delta(DELTA1_GIRL, 2) GIRL_FEATURES = pd.concat([ pd.DataFrame(MFCC_GIRL), pd.DataFrame(DELTA1_GIRL), pd.DataFrame(DELTA2_GIRL) ], axis=1) GIRL_FEATURES = preprocessing.scale(GIRL_FEATURES) ########################################################################################################################################## BOY_MODEL = mixture.GaussianMixture(n_components=20, max_iter=1000, tol=.01, warm_start=True, covariance_type='diag') BOY_MODEL.fit(BOY_FEATURES) GIRL_MODEL = mixture.GaussianMixture(n_components=20, max_iter=1000, tol=.01, warm_start=True, covariance_type='diag') GIRL_MODEL.fit(GIRL_FEATURES) ########################################################################################################################################## (RATE_INPUT, SIG_INPUT) = wav.read( "/Users/abhishaikemahajan/Documents/VOICE/WORKSHOPTEST/SeshaTest.wav")
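# The decision rule implied above, assuming one diagonal-covariance GMM per
# class trained on MFCC frames: score the test frames under each model and
# pick the class with the higher total log-likelihood. Random features stand
# in for real MFCC/delta features here.
import numpy as np
from sklearn import mixture

rng = np.random.default_rng(0)
BOY_FEATURES = rng.normal(0.0, 1.0, size=(2000, 48))
GIRL_FEATURES = rng.normal(0.5, 1.0, size=(2000, 48))

BOY_MODEL = mixture.GaussianMixture(n_components=20, max_iter=1000, tol=.01,
                                    covariance_type='diag').fit(BOY_FEATURES)
GIRL_MODEL = mixture.GaussianMixture(n_components=20, max_iter=1000, tol=.01,
                                     covariance_type='diag').fit(GIRL_FEATURES)

test_frames = rng.normal(0.5, 1.0, size=(300, 48))
scores = {"boy": BOY_MODEL.score_samples(test_frames).sum(),
          "girl": GIRL_MODEL.score_samples(test_frames).sum()}
print("predicted class:", max(scores, key=scores.get))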
splot.add_artist(ell) plt.xlim(-9., 5.) plt.ylim(-3., 6.) plt.xticks(()) plt.yticks(()) plt.title(title) # Number of samples per component n_samples = 500 # Generate random sample, two components np.random.seed(0) C = np.array([[0., -0.1], [1.7, .4]]) X = np.r_[np.dot(np.random.randn(n_samples, 2), C), .7 * np.random.randn(n_samples, 2) + np.array([-6, 3])] # Fit a Gaussian mixture with EM using five components gmm = mixture.GaussianMixture(n_components=5, covariance_type='full').fit(X) plot_results(X, gmm.predict(X), gmm.means_, gmm.covariances_, 0, 'Gaussian Mixture') # Fit a Dirichlet process Gaussian mixture using five components dpgmm = mixture.BayesianGaussianMixture(n_components=5, covariance_type='full').fit(X) plot_results(X, dpgmm.predict(X), dpgmm.means_, dpgmm.covariances_, 1, 'Bayesian Gaussian Mixture with a Dirichlet process prior') plt.show()
plt.suptitle(("Silhouette analysis for KMeans clustering on tic tac toe dataset " "with n_clusters = %d" % n_clusters), fontsize=14, fontweight='bold') plt.show() ##em lowest_bic = np.infty bic = [] n_components_range = range(2, 14,2) cv_types = ['spherical', 'tied', 'full'] for cv_type in cv_types: for n_components in n_components_range: # Fit a Gaussian mixture with EM gmm = mixture.GaussianMixture(n_components=n_components, covariance_type=cv_type, random_state=777) gmm.fit(X12) gmm_labels = gmm.predict(X12) bic.append(gmm.bic(X12)) if abs(bic[-1]) < lowest_bic: lowest_bic = abs(bic[-1]) best_gmm = gmm bic = np.array(bic) color_iter = itertools.cycle(['navy', 'turquoise', 'cornflowerblue', 'darkorange']) clf = best_gmm bars = [] # Plot the BIC scores plt.figure(figsize=(8, 6))
# In[6]: from sklearn import mixture import itertools Xgmm = Xmat lowest_bic = np.infty bic = [] n_components_range = range(1, 6) cv_types = ['tied'] for cv_type in cv_types: for n_components in n_components_range: # Fit a Gaussian mixture with EM gmm = mixture.GaussianMixture(n_components=n_components, covariance_type=cv_type) gmm.fit(Xgmm) bic.append(gmm.bic(Xgmm)) if n_components == 5: bic[-1] = bic[-2] + 1000 if bic[-1] < lowest_bic: lowest_bic = bic[-1] best_gmm = gmm bic = np.array(bic) color_iter = itertools.cycle(['cornflowerblue']) clf = best_gmm bars = [] # Plot the BIC scores
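# For reference, the usual BIC selection loop keeps the smallest raw BIC
# across covariance types and component counts; a minimal version on toy data
# (without the caps or adjustments used in the snippets above):
import itertools
import numpy as np
from sklearn import mixture

X = np.vstack([np.random.default_rng(i).normal(m, 0.5, size=(200, 2))
               for i, m in enumerate([0.0, 3.0, 6.0])])

lowest_bic, best_gmm, bic = np.inf, None, []
for cv_type, n_components in itertools.product(['spherical', 'tied', 'diag', 'full'],
                                               range(1, 7)):
    gmm = mixture.GaussianMixture(n_components=n_components,
                                  covariance_type=cv_type, random_state=0).fit(X)
    bic.append(gmm.bic(X))
    if bic[-1] < lowest_bic:          # BIC is compared as-is; lower is better
        lowest_bic, best_gmm = bic[-1], gmm

print(best_gmm.covariance_type, best_gmm.n_components, lowest_bic)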
def maintask(task): data = np.load('lab2_data.npz')['data'] phoneHMMs = np.load('lab2_models_onespkr.npz')['phoneHMMs'].item() phoneHMMs_all = np.load('lab2_models_all.npz')['phoneHMMs'].item() if task == '4': hmm1 = phoneHMMs['ah'] hmm2 = phoneHMMs['ao'] twohmm = concatTwoHMMs(hmm1, hmm2) """5 HMM Likelihood and Recognition""" example = np.load('lab2_example.npz')['example'].item() isolated = {} for digit in prondict.keys(): isolated[digit] = ['sil'] + prondict[digit] + ['sil'] wordHMMs = {} wordHMMs_all = {} for digit in prondict.keys(): wordHMMs[digit] = concatHMMs(phoneHMMs, isolated[digit]) # for 11 digits for digit in prondict.keys(): wordHMMs_all[digit] = concatHMMs(phoneHMMs_all, isolated[digit]) # example lpr = log_multivariate_normal_density_diag(example['lmfcc'], wordHMMs['o']['means'], wordHMMs['o']['covars']) diff = example['obsloglik'] - lpr # 0 # same digit 'o' lpr_o = log_multivariate_normal_density_diag(data[22]['lmfcc'], wordHMMs_all['o']['means'], wordHMMs_all['o']['covars']) if task == '5.1': plt.figure() plt.subplot(2, 1, 1) plt.pcolormesh(lpr.T) plt.title('example "o" ') plt.colorbar() plt.subplot(2, 1, 2) plt.pcolormesh(lpr_o.T) plt.title('test "o" from data22') plt.colorbar() plt.show() """ 5.2 """ lalpha = forward(lpr, np.log(wordHMMs['o']['startprob']), np.log(wordHMMs['o']['transmat'])) diff1 = example['logalpha'] - lalpha # 0 # log-likelihood loglike = logsumexp(lalpha[-1]) diff0 = example['loglik'] - loglike # 0 # score all the 44 utterances in the data array with each of the 11 HMM # models in wordHMMs. scores_1 = np.zeros((44, 11)) scores_2 = np.zeros((44, 11)) labels_ori = [] labels_pre = [] labels_pre2 = [] keys = list(prondict.keys()) acc_1 = 0 acc_2 = 0 if task == '5.2': for i in range(44): for j, key in enumerate(keys): lpr = log_multivariate_normal_density_diag( data[i]['lmfcc'], wordHMMs_all[key]['means'], wordHMMs_all[key]['covars']) alpha = forward(lpr, np.log(wordHMMs_all[key]['startprob']), np.log(wordHMMs_all[key]['transmat'])) scores_2[i, j] = logsumexp(alpha[-1]) lpr_1 = log_multivariate_normal_density_diag( data[i]['lmfcc'], wordHMMs[key]['means'], wordHMMs[key]['covars']) alpha_1 = forward(lpr_1, np.log(wordHMMs[key]['startprob']), np.log(wordHMMs[key]['transmat'])) scores_1[i, j] = logsumexp(alpha_1[-1]) ori = data[i]['digit'] pre_1 = keys[int(np.argmax(scores_1[i, :]))] pre_2 = keys[int(np.argmax(scores_2[i, :]))] #labels_ori.append(ori) labels_pre.append(pre_1) labels_pre2.append(pre_2) if ori == pre_1: acc_1 += 1 if ori == pre_2: acc_2 += 1 print( "Accuracy(trained on all speakers): {0}; Accuracy(trained on one speaker):{1} " .format(acc_2, acc_1)) print(labels_pre, labels_pre2) """ 5.3 Viterbi """ viterbi_loglik, viterbi_path = viterbi(lpr, np.log(wordHMMs['o']['startprob']), np.log(wordHMMs['o']['transmat'])) if task == '5.3': plt.pcolormesh(lalpha.T) plt.plot(viterbi_path, 'r') plt.title( 'alpha array overlaid with best path obtained by Viterbi decoding ' ) plt.colorbar() plt.show() diff3 = example['vloglik'] - viterbi_loglik.T # 0 # Score all 44 utterances in the data with each of the 11 HMM models in wordHMMs for i in range(44): for j, key in enumerate(keys): lpr = log_multivariate_normal_density_diag( data[i]['lmfcc'], wordHMMs_all[key]['means'], wordHMMs_all[key]['covars']) viterbi_2, viterbi_path_2 = viterbi( lpr, np.log(wordHMMs_all[key]['startprob']), np.log(wordHMMs_all[key]['transmat'])) scores_2[i, j] = viterbi_2 lpr_1 = log_multivariate_normal_density_diag( data[i]['lmfcc'], wordHMMs[key]['means'], wordHMMs[key]['covars']) 
viterbi_1, viterbi_path_1 = viterbi( lpr_1, np.log(wordHMMs[key]['startprob']), np.log(wordHMMs[key]['transmat'])) scores_1[i, j] = viterbi_1 ori = data[i]['digit'] pre_1 = keys[int(np.argmax(scores_1[i, :]))] pre_2 = keys[int(np.argmax(scores_2[i, :]))] #labels_ori.append(ori) labels_pre.append(pre_1) labels_pre2.append(pre_2) if ori == pre_1: acc_1 += 1 if ori == pre_2: acc_2 += 1 print( "Accuracy(trained on all speakers): {0}; Accuracy(trained on one speaker):{1} " .format(acc_2, acc_1)) print(labels_pre, labels_pre2) """ 5.4 """ lbeta = backward(lpr, np.log(wordHMMs['o']['startprob']), np.log(wordHMMs['o']['transmat'])) diff2 = example['logbeta'] - lbeta # log-likelihood loglike = logsumexp(lbeta[0]) diff4 = example['loglik'] - loglike # 0 if task == '5.4': plt.figure() plt.subplot(1, 3, 1) plt.pcolormesh(lbeta) plt.title('log-beta') plt.subplot(1, 3, 2) plt.pcolormesh(example['logbeta']) plt.title('example') plt.subplot(1, 3, 3) plt.pcolormesh(example['logalpha']) plt.title('log-alpha') plt.show() """6 HMM Retraining(emission probability distributions)""" """ 6.1 """ lgamma = statePosteriors(lalpha, lbeta) N = lgamma.shape[0] K = 9 lgamma_gmm = np.zeros((N, K)) total = log_multivariate_normal_density_diag(example['lmfcc'], wordHMMs['o']['means'], wordHMMs['o']['covars']) if task == '6.1': print('HMM posteriors') print('each time step sum along state axis', np.sum(np.exp(lgamma), axis=1)) #=1 print('each state sum along time axis', np.sum(np.exp(lgamma) / 71, axis=0)) print('sum over both states and time steps', np.sum(np.sum( np.exp(lgamma)))) # =length of obs sequence/time steps print('length of observation sequence', lalpha.shape[0]) print('GMM posteriors') # for k in range(K): #lgamma_gmm[:, k] = 1 / K * total[:, k] / np.sum(total[:, k]) gmm = mixture.GaussianMixture(n_components=9) gmm.fit(example['lmfcc']) gmm_post = gmm.predict_proba(example['lmfcc']) plt.subplot(2, 1, 1) plt.pcolormesh(gmm_post.T) plt.title('GMM posteriors') plt.colorbar() plt.subplot(2, 1, 2) plt.pcolormesh(lgamma.T) plt.title('HMM posteriors') plt.colorbar() plt.show() """6.2""" if task == '6.2': plt.figure() L = {} for d in prondict: # initialization log_pi = np.log(wordHMMs_all[d]['startprob']) log_tr = np.log(wordHMMs_all[d]['transmat']) means = wordHMMs_all[d]['means'] covars = wordHMMs_all[d]['covars'] l = [] # repitation: for i in range(20): lpr = log_multivariate_normal_density_diag( data[10]['lmfcc'], means, covars) # Expectation lalpha = forward(lpr, log_pi, log_tr) lbeta = backward(lpr, log_pi, log_tr) log_gamma = statePosteriors(lalpha, lbeta) # Maximization means, covars = updateMeanAndVar(data[10]['lmfcc'], log_gamma) # Estimate likelihood log_like = logsumexp(lalpha[-1]) if i > 2 and log_like - l[-1] < 0.1: l.append(log_like) L[d] = l break else: l.append(log_like) L[d] = l plt.plot(l, label=d) plt.legend() plt.title('log-likelihood (data[10] with different wordHMMs)') plt.show()
def compute_paa(self, targets, target_labels, anchors, labels_all, loss_all, matched_idx_all): """ criteria: 'PAA' or 'GMM' Args: labels_all (batch_size x num_anchors): assigned labels loss_all (batch_size x numanchors): calculated loss """ cls_labels = [] reg_targets = [] matched_gts = [] for im_i in range(len(targets)): targets_per_im = targets[im_i].tensor bboxes_per_im = targets_per_im labels_per_im = target_labels[im_i] anchors_per_im = Boxes.cat(anchors[im_i]).tensor num_gt = bboxes_per_im.shape[0] labels = labels_all[im_i] loss = loss_all[im_i] matched_idx = matched_idx_all[im_i] assert labels.shape == matched_idx.shape num_anchors_per_level = [ len(anchors_per_level) for anchors_per_level in anchors[im_i] ] # select candidates based on IoUs between anchors and GTs candidate_idxs = [] for gt in range(num_gt): candidate_idxs_per_gt = [] star_idx = 0 for level, anchors_per_level in enumerate(anchors[im_i]): end_idx = star_idx + num_anchors_per_level[level] loss_per_level = loss[star_idx:end_idx] labels_per_level = labels[star_idx:end_idx] matched_idx_per_level = matched_idx[star_idx:end_idx] match_idx = ((matched_idx_per_level == gt) & (labels_per_level > 0)).nonzero()[:, 0] if match_idx.numel() > 0: _, topk_idxs = loss_per_level[match_idx].topk( min(match_idx.numel(), self.cfg.MODEL.PAA.TOPK), largest=False) topk_idxs_per_level_per_gt = match_idx[topk_idxs] candidate_idxs_per_gt.append( topk_idxs_per_level_per_gt + star_idx) star_idx = end_idx if candidate_idxs_per_gt: candidate_idxs.append(torch.cat(candidate_idxs_per_gt)) else: candidate_idxs.append(None) # fit 2-mode GMM per GT box n_labels = anchors_per_im.shape[0] cls_labels_per_im = torch.zeros(n_labels, dtype=torch.long).cuda() matched_gts_per_im = torch.zeros_like(anchors_per_im) fg_inds = matched_idx >= 0 matched_gts_per_im[fg_inds] = bboxes_per_im[matched_idx[fg_inds]] is_grey = None for gt in range(num_gt): if candidate_idxs[gt] is not None: if candidate_idxs[gt].numel() > 1: candidate_loss = loss[candidate_idxs[gt]] candidate_loss, inds = candidate_loss.sort() candidate_loss = candidate_loss.view(-1, 1).cpu().numpy() min_loss, max_loss = candidate_loss.min( ), candidate_loss.max() means_init = [[min_loss], [max_loss]] weights_init = [0.5, 0.5] precisions_init = [[[1.0]], [[1.0]]] gmm = skm.GaussianMixture( 2, weights_init=weights_init, means_init=means_init, precisions_init=precisions_init) gmm.fit(candidate_loss) components = gmm.predict(candidate_loss) scores = gmm.score_samples(candidate_loss) components = torch.from_numpy(components).to("cuda") scores = torch.from_numpy(scores).to("cuda") fgs = components == 0 bgs = components == 1 if fgs.nonzero().numel() > 0: fg_max_score = scores[fgs].max().item() fg_max_idx = ( fgs & (scores == fg_max_score)).nonzero().min() is_neg = inds[fgs | bgs] is_pos = inds[:fg_max_idx + 1] else: # just treat all samples as positive for high recall. 
is_pos = inds is_neg = is_grey = None else: is_pos = 0 is_neg = None is_grey = None if is_grey is not None: grey_idx = candidate_idxs[gt][is_grey] cls_labels_per_im[grey_idx] = -1 if is_neg is not None: neg_idx = candidate_idxs[gt][is_neg] cls_labels_per_im[neg_idx] = 0 pos_idx = candidate_idxs[gt][is_pos] cls_labels_per_im[pos_idx] = labels_per_im[gt].view(-1, 1) matched_gts_per_im[pos_idx] = bboxes_per_im[gt].view(-1, 4) reg_targets_per_im = self.box_coder.get_deltas( anchors_per_im, matched_gts_per_im) cls_labels.append(cls_labels_per_im) reg_targets.append(reg_targets_per_im) matched_gts.append(matched_gts_per_im) return cls_labels, reg_targets, matched_gts
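# The 2-mode GMM step above, loosely reduced to numpy/sklearn as a sketch:
# sort candidate losses, initialise the two components at the min and max
# loss, and treat component 0 (the low-loss mode) as positives up to its
# highest-scoring sample. The torch/anchor bookkeeping is left out.
import numpy as np
from sklearn import mixture as skm

rng = np.random.default_rng(0)
candidate_loss = np.sort(np.concatenate([rng.normal(0.2, 0.05, 15),
                                         rng.normal(1.0, 0.2, 15)])).reshape(-1, 1)

gmm = skm.GaussianMixture(2,
                          weights_init=[0.5, 0.5],
                          means_init=[[candidate_loss.min()], [candidate_loss.max()]],
                          precisions_init=[[[1.0]], [[1.0]]])
gmm.fit(candidate_loss)

components = gmm.predict(candidate_loss)
scores = gmm.score_samples(candidate_loss)
fgs = components == 0
if fgs.any():
    fg_max_idx = np.argmax(np.where(fgs, scores, -np.inf))   # best-scoring foreground sample
    is_pos = np.arange(len(candidate_loss)) <= fg_max_idx
else:
    is_pos = np.ones(len(candidate_loss), dtype=bool)        # fall back to all-positive
print(is_pos.sum(), "candidates kept as positives")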
for i in range(0, len(idx_positive[0])): positive_features[i] = gen_features[idx_positive[0][i]] print("positive features: ", type(positive_features), positive_features.shape) for i in range(0, len(idx_negative[0])): negative_features[i] = gen_features[idx_negative[0][i]] print("negative features: ", type(negative_features), negative_features.shape) n_components = np.arange(1, 10) models = [ mixture.GaussianMixture(n, covariance_type='full', random_state=0) for n in n_components ] aics = [ model.fit(positive_features).aic(positive_features) for model in models ] bics = [ model.fit(positive_features).bic(positive_features) for model in models ] plt.plot(n_components, aics, label="AIC-positive") plt.plot(n_components, bics, label="BIC-positive") plt.legend(loc='best') plt.xlabel('n_components') plt.savefig("/path/to/save/results/positive_data.png") print("Plot saved") gmm_positive = mixture.GaussianMixture(4,