def main():
    """Run one SPSA-clustering experiment on a synthetic sparse-GMM stream.

    Samples N points from the mixture returned by get_sparse_gmm_model,
    fits ClusteringSPSA online (one point at a time), saves the generated
    dataset under datasets/<name>/ for later replay (see load_experiment),
    then reports ARI / mean center distance and shows plots.
    """
    clust_num = 3
    data_shape = 2
    mu_list, sigma_list, w_list = get_sparse_gmm_model(clust_num, data_shape)

    # SPSA gain sequences: step size alpha and perturbation size beta both
    # decay polynomially; gamma = 1/6 is the tuning used across this project.
    spsa_gamma = 1. / 6

    def spsa_alpha(x):
        return 0.25 / (x ** spsa_gamma)

    def spsa_beta(x):
        return 15. / (x ** (spsa_gamma / 4))

    clustering = spsa_clustering.ClusteringSPSA(n_clusters=clust_num, data_shape=data_shape, Gammas=None,
                                                alpha=spsa_alpha, beta=spsa_beta, norm_init=False, verbose=False,
                                                sparse=False, eta=None)

    # Sample the data stream and fit the clustering online, point by point.
    N = 3000
    data_set = []
    true_labels = []
    for _ in range(N):
        mix_ind = np.random.choice(len(w_list), p=w_list)
        data_point = np.random.multivariate_normal(mu_list[mix_ind],
                                                   np.identity(data_shape) * sigma_list[mix_ind])
        data_set.append(data_point)
        true_labels.append(mix_ind)
        clustering.fit(data_point)
    data_set = np.array(data_set)

    # Persist the generated dataset so the same sample can be replayed later.
    dataset_name = 'good'
    dataset_dir = os.path.join('datasets', dataset_name)
    # makedirs(..., exist_ok=True) also creates the parent 'datasets'
    # directory, which the previous bare os.mkdir would fail on.
    os.makedirs(dataset_dir, exist_ok=True)
    np.save(os.path.join(dataset_dir, 'data.npy'), data_set)
    np.save(os.path.join(dataset_dir, 'true.npy'), np.array(true_labels))
    param = {'mu': mu_list, 'sigma': sigma_list, 'w': w_list}
    with open(os.path.join(dataset_dir, 'param.pickle'), 'wb') as f:
        pickle.dump(param, f)

    # Align learned centers with the true ones before scoring/plotting.
    utils.order_clust_centers(np.array(mu_list), clustering)
    clustering.clusters_fill(data_set)
    ari_spsa = metrics.adjusted_rand_score(true_labels, clustering.labels_)
    print('ARI: {}'.format(ari_spsa))
    print('Mean centers dist: {}'.format(utils.mean_cent_dist(np.array(mu_list), clustering)))
    utils.plot_centers(np.array(mu_list), clustering)
    utils.plot_clustering(data_set, clustering.labels_, 'SPSA clustering partition')
    utils.plot_clustering(data_set, true_labels, 'True partition')
    plt.show()
def load_experiment(name='bad'):
    """Replay a previously saved synthetic dataset through SPSA clustering.

    Loads data/labels/parameters from datasets/<name>/, streams the points
    in a random order through ClusteringSPSA, then reports ARI and the mean
    distance between the learned and the true cluster centers.
    """
    dataset_dir = os.path.join('datasets', name)
    data_set = np.load(os.path.join(dataset_dir, 'data.npy'))
    true_labels = np.load(os.path.join(dataset_dir, 'true.npy'))
    with open(os.path.join(dataset_dir, 'param.pickle'), 'rb') as fh:
        saved = pickle.load(fh)
    mu_list = saved['mu']
    sigma_list = saved['sigma']
    w_list = saved['w']
    clust_num = len(mu_list)
    data_shape = data_set[0].shape[0]

    # SPSA gain sequences (same tuning as the generating experiment).
    spsa_gamma = 1. / 6

    def spsa_alpha(x):
        return 0.25 / (x ** spsa_gamma)

    def spsa_beta(x):
        return 15. / (x ** (spsa_gamma / 4))

    clustering = spsa_clustering.ClusteringSPSA(n_clusters=clust_num, data_shape=data_shape, Gammas=None,
                                                alpha=spsa_alpha, beta=spsa_beta, norm_init=False, verbose=False,
                                                sparse=False, eta=None, spsa_sigma=False)

    # Feed the stored points to the online estimator in a random order.
    shuffle_idx = np.random.permutation(data_set.shape[0])
    for idx in shuffle_idx:
        clustering.fit(data_set[idx])
    clustering.clusters_fill(data_set[shuffle_idx])

    ari_spsa = metrics.adjusted_rand_score(true_labels[shuffle_idx], clustering.labels_)
    print('ARI: {}'.format(ari_spsa))
    print('Mean centers dist: {}'.format(utils.mean_cent_dist(np.array(mu_list), clustering)))
    utils.plot_centers(np.array(mu_list), clustering)
    plt.show()
# NOTE(review): fragment of a larger benchmarking routine — n_run, noises, j,
# spsa_alpha, spsa_beta, N, mix_prob, clust_means and clust_gammas are all
# defined outside this excerpt, and the inner loop appears truncated here.
# Per-run metric accumulators for each compared clustering method.
ari_kmeans = np.zeros(n_run)
ari_spsa = np.zeros(n_run)
ari_spsa_cov = np.zeros(n_run)
ari_gmm = np.zeros(n_run)
ari_pam = np.zeros(n_run)
centers_dist_kmeans = np.zeros(n_run)
centers_dist_spsa = np.zeros(n_run)
centers_dist_spsa_cov = np.zeros(n_run)
centers_dist_gmm = np.zeros(n_run)
for i in range(n_run):
    print('Run {0}'.format(i))
    # Plain SPSA clustering estimator for this run.
    clustering = spsa_clustering.ClusteringSPSA(n_clusters=clust_means.shape[0], data_shape=2, Gammas=None,
                                                alpha=spsa_alpha, beta=spsa_beta, norm_init=False, verbose=False,
                                                noise=noises[j])
    # Variant with eta=3000 — presumably enables covariance estimation;
    # TODO confirm against ClusteringSPSA's definition.
    clustering_cov = spsa_clustering.ClusteringSPSA(n_clusters=clust_means.shape[0], data_shape=2, Gammas=None,
                                                    alpha=spsa_alpha, beta=spsa_beta, norm_init=False,
                                                    noise=noises[j], eta=3000, verbose=False)
    # Sample a fresh synthetic mixture dataset for this run.
    data_set = []
    true_labels = []
    for _ in range(N):
        mix_ind = np.random.choice(len(mix_prob), p=mix_prob)
        data_point = np.random.multivariate_normal(clust_means[mix_ind], clust_gammas[mix_ind])
        data_set.append(data_point)
        true_labels.append(mix_ind)
        # clustering.fit(data_point)
# data_generator.save_example() train_generator = data_generator.generate('train') centers_fname = '/home/a.boiarov/Projects/spsa_clustering_gmm_log/centers.npy' if not args.only_clf: spsa_gamma = 1. / 6 spsa_alpha = lambda x: 0.25 / (x**spsa_gamma) spsa_beta = lambda x: 15. / (x**(spsa_gamma / 4)) n_filters = 500 clustering = spsa_clustering.ClusteringSPSA(n_clusters=n_filters, data_shape=patch_size * patch_size, Gammas=None, alpha=spsa_alpha, beta=spsa_beta, norm_init=False, eta=900) # spsa_train_num = data_generator.train_number spsa_train_num = 1500 num = 0 for _ in range(spsa_train_num): print(num) num += 1 train_data = next(train_generator) for patch in train_data[0]: patch = patch.flatten() clustering.fit(patch)
# NOTE(review): fragment — noise_0 and df are defined outside this excerpt.
# Noise models injected into the clustering objective (random, irregular,
# constant).
noise_5 = spsa_clustering.Noise(func=lambda x: 10 * (np.random.rand() * 4 - 2), name='random')
noise_6 = spsa_clustering.Noise(func=lambda x: 0.1 * np.sin(x) + 19 * np.sign(50 - x % 100), name='irregular')
noise_7 = spsa_clustering.Noise(func=lambda x: 20, name='constant')
experiment_noise = noise_0
# SPSA gain sequences.
spsa_gamma = 1. / 6
spsa_alpha = lambda x: 0.25 / (x ** spsa_gamma)
spsa_beta = lambda x: 15. / (x ** (spsa_gamma / 4))
# spsa_alpha = lambda x: 0.0001
# spsa_beta = lambda x: 0.0001
# 10 clusters over 784-dim vectors — presumably 28x28 MNIST digits from df;
# TODO confirm against the dataframe's schema.
clustering = spsa_clustering.ClusteringSPSA(n_clusters=10, data_shape=784, Gammas=None, alpha=spsa_alpha,
                                            beta=spsa_beta, norm_init=False, noise=experiment_noise)
data_set = []
true_labels = []
# init_ind = []
# for label in range(10):
#     ind = np.random.choice(df.index[df['label'] == label].tolist(), 1)
#     row = df.loc[ind[0], :]
#     true_labels.append(row[0])
#     data_point = np.array(row[1:].tolist(), dtype=float)
#     data_set.append(data_point)
#     clustering.fit(data_point)
#     init_ind.append(ind)
index = list(range(df.shape[0]))
# NOTE(review): fragment — N, mix_prob, spsa_clustering and utils come from
# the surrounding script.
# Ground-truth mixture: three 2-D Gaussians with distinct covariances.
clust_means = np.array([[0, 0], [2, 2], [-3, 6]])
clust_gammas = np.array([[[1, -0.7], [-0.7, 1]], np.eye(2), [[1, 0.8], [0.8, 1]]])
data_set = []
true_labels = []
# SPSA gain sequences.
spsa_gamma = 1. / 6
spsa_alpha = lambda x: 0.25 / (x**spsa_gamma)
spsa_beta = lambda x: 15. / (x**(spsa_gamma / 4))
# spsa_alpha = lambda x: 0.001
# spsa_beta = lambda x: 0.001
clustering = spsa_clustering.ClusteringSPSA(n_clusters=clust_means.shape[0], data_shape=2, Gammas=None,
                                            alpha=spsa_alpha, beta=spsa_beta, norm_init=False)
# Sample the stream and fit the clustering one point at a time.
for _ in range(N):
    mix_ind = np.random.choice(len(mix_prob), p=mix_prob)
    data_point = np.random.multivariate_normal(clust_means[mix_ind], clust_gammas[mix_ind])
    data_set.append(data_point)
    true_labels.append(mix_ind)
    clustering.fit(data_point)
data_set = np.array(data_set)
# Align learned centers with the true ones, then assign final labels.
utils.order_clust_centers(clust_means, clustering)
clustering.clusters_fill(data_set)
# NOTE(review): fragment — noise_3, N, mix_prob and spsa_clustering are
# defined outside this excerpt; the sampling loop is truncated here.
# Constant vector-valued noise model.
noise_7 = spsa_clustering.Noise(func=lambda x: [20] * x.shape[0], name='constant')
experiment_noise = noise_3
# SPSA gain sequences.
spsa_gamma = 1. / 6
spsa_alpha = lambda x: 0.25 / (x**spsa_gamma)
spsa_beta = lambda x: 15. / (x**(spsa_gamma / 4))
# spsa_alpha = lambda x: 0.001
# spsa_beta = lambda x: 0.001
clustering = spsa_clustering.ClusteringSPSA(n_clusters=3, data_shape=2, Gammas=None, alpha=spsa_alpha,
                                            beta=spsa_beta, norm_init=False, noise=experiment_noise, eta=None)
# Variant with eta=1000 — presumably enables covariance estimation;
# TODO confirm against ClusteringSPSA's definition.
clustering_cov = spsa_clustering.ClusteringSPSA(n_clusters=3, data_shape=2, Gammas=None, alpha=spsa_alpha,
                                                beta=spsa_beta, norm_init=False, noise=experiment_noise, eta=1000)
for _ in range(N):
    mix_ind = np.random.choice(len(mix_prob), p=mix_prob)
def stat():
    """Benchmark baseline clustering methods over n_run random sparse-GMM datasets.

    For each run a fresh dataset of N points is sampled; k-means, single-pass
    mini-batch k-means (an online baseline) and PAM are fitted and scored with
    ARI plus the mean L2 distance between recovered and true centers.
    NOTE: the SPSA branch is currently disabled (commented out), so the
    ari_spsa / cent_dist accumulators stay zero and the first print reports 0.
    """
    clust_num = 3
    data_shape = 2
    mu_list, sigma_list, w_list = get_sparse_gmm_model(clust_num, data_shape)

    # SPSA gain sequences (only used by the currently-disabled SPSA branch).
    spsa_gamma = 1. / 6

    def spsa_alpha(x):
        return 0.25 / (x ** spsa_gamma)

    def spsa_beta(x):
        return 15. / (x ** (spsa_gamma / 4))

    n_run = 10
    N = 3000
    # Per-run metric accumulators.
    ari_spsa = np.zeros(n_run)
    ari_kmeans = np.zeros(n_run)
    ari_mb_kmeans = np.zeros(n_run)
    ari_pam = np.zeros(n_run)
    cent_dist = np.zeros(n_run)
    cent_dist_kmeans = np.zeros(n_run)
    cent_dist_mb_kmeans = np.zeros(n_run)
    cent_dist_pam = np.zeros(n_run)

    for i in tqdm(range(n_run)):
        clustering = spsa_clustering.ClusteringSPSA(n_clusters=clust_num, data_shape=data_shape, Gammas=None,
                                                    alpha=spsa_alpha, beta=spsa_beta, norm_init=False, verbose=False,
                                                    sparse=True, eta=700, spsa_sigma=False)
        kmeans = cluster.KMeans(n_clusters=clust_num)
        # max_iter=1 / batch_size=1 makes MiniBatchKMeans behave as a
        # one-pass online baseline.
        mb_kmeans = cluster.MiniBatchKMeans(n_clusters=clust_num, n_init=1, init='random', max_iter=1, batch_size=1,
                                            max_no_improvement=None)

        # Sample a fresh synthetic mixture dataset for this run.
        data_set = []
        true_labels = []
        for _ in range(N):
            mix_ind = np.random.choice(len(w_list), p=w_list)
            data_point = np.random.multivariate_normal(mu_list[mix_ind],
                                                       np.identity(data_shape) * sigma_list[mix_ind])
            data_set.append(data_point)
            true_labels.append(mix_ind)
            # clustering.fit(data_point)
        data_set = np.array(data_set)
        # utils.order_clust_centers(np.array(mu_list), clustering)
        # clustering.clusters_fill(data_set)

        # Renamed from the original misspelled 'labels_pred_kmenas'.
        labels_pred_kmeans = kmeans.fit_predict(data_set)
        labels_pred_mb_kmeans = mb_kmeans.fit_predict(data_set)
        dist = pairwise_distances(data_set)
        labels_pred_pam, pam_med = pam.cluster(dist, k=clust_num)

        # ari_spsa[i] = metrics.adjusted_rand_score(true_labels, clustering.labels_)
        # cent_dist[i] = utils.mean_cent_dist(np.array(mu_list), clustering)
        ari_kmeans[i] = metrics.adjusted_rand_score(true_labels, labels_pred_kmeans)
        ari_mb_kmeans[i] = metrics.adjusted_rand_score(true_labels, labels_pred_mb_kmeans)
        ari_pam[i] = metrics.adjusted_rand_score(true_labels, labels_pred_pam)
        cent_dist_kmeans[i] = utils.mean_cent_dist_(np.array(mu_list), kmeans.cluster_centers_)
        cent_dist_mb_kmeans[i] = utils.mean_cent_dist_(np.array(mu_list), mb_kmeans.cluster_centers_)
        # pam_med holds medoid indices into data_set.
        cent_dist_pam[i] = utils.mean_cent_dist_(np.array(mu_list), data_set[pam_med])

    # First line reports the (disabled) SPSA accumulators — all zeros.
    print(ari_spsa.mean(), cent_dist.mean())
    print('\nMean ARI k-means: {:f}, Mean L2: {:f}'.format(ari_kmeans.mean(), cent_dist_kmeans.mean()))
    print('Mean ARI online k-means: {:f}, Mean L2: {:f}'.format(ari_mb_kmeans.mean(), cent_dist_mb_kmeans.mean()))
    print('\nMean ARI PAM: {:f}, Mean L2: {:f}'.format(ari_pam.mean(), cent_dist_pam.mean()))