def visualize_clusters(self, x, y, dataset):
    """Visualize clusters with PCA and t-SNE.

    Args:
        x (ndarray): data.
        y (ndarray): true labels.
        dataset (string): dataset, WDBC or MNIST.

    Returns:
        None.
    """
    # Declare PCA and reduce data
    pca = PCA(n_components=2, random_state=self.random_seed)
    x_pca = pca.fit_transform(x)

    # Declare t-SNE and reduce data
    tsne = TSNE(n_components=2, random_state=self.random_seed)
    x_tsne = tsne.fit_transform(x)

    n_classes = len(np.unique(y))  # compute number of classes
    print('\nBenchmark Model with k = n classes = {}'.format(n_classes))

    # Benchmark the model with number of clusters (k) = number of classes
    model = clone(self.model)
    model_params = self.model.get_params()
    model_params[self.name_param] = n_classes
    model.set_params(**model_params)
    clusters = model.fit_predict(x)
    self.benchmark(x, y, clusters)

    # Create dataframe for visualization
    df = pd.DataFrame(x_tsne, columns=['tsne1', 'tsne2'])
    df['pca1'] = x_pca[:, 0]
    df['pca2'] = x_pca[:, 1]
    df['y'] = y
    df['c'] = self.clusters

    # Create one row of two subplots and plot clusters with PCA and t-SNE
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 8))
    utils.plot_clusters(ax1, 'pca1', 'pca2', df, self.name)
    utils.plot_clusters(ax2, 'tsne1', 'tsne2', df, self.name)

    # Save figure
    utils.save_figure_tight('{}_{}_clusters'.format(dataset, self.name))
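# For reference, a minimal sketch of what the utils.plot_clusters(ax, x_col, y_col,
# df, name) helper used above might look like. The signature is inferred from the
# call sites; the project's real helper may style or save things differently.
def plot_clusters(ax, x_col, y_col, df, name):
    """Scatter df[x_col] vs. df[y_col], colored by the 'c' cluster column."""
    for label, group in df.groupby('c'):
        ax.scatter(group[x_col], group[y_col], s=10,
                   label='cluster {}'.format(label))
    ax.set_xlabel(x_col)
    ax.set_ylabel(y_col)
    ax.set_title('{} ({} vs {})'.format(name, x_col, y_col))
    ax.legend()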
def main():
    VECTORIZER_TYPE = "tf-idf"
    MAX_FEATURES = 50000
    SVM_TYPE = "linear"
    C = 1.1

    X, y = utils.read_corpus(c.FAKE_CORPUS, c.TRUTH_CORPUS)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, shuffle=True, stratify=y)

    model1, vectorizer = train_svm(X_train, y_train, VECTORIZER_TYPE,
                                   max_features=MAX_FEATURES, type=SVM_TYPE,
                                   c=C, max_iter=10000)
    evaluation.evaluate_linear(model1, vectorizer, X_test, y_test)
    utils.plot_clusters(X_test, vectorizer)
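# A minimal sketch of what train_svm might do under the settings above, assuming
# "tf-idf" maps to TfidfVectorizer and "linear" to LinearSVC; the project's real
# helper (and its handling of the `type` argument) may differ.
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC

def train_svm_sketch(X_train, y_train, vectorizer_type, max_features, type, c, max_iter):
    vectorizer = TfidfVectorizer(max_features=max_features)  # vectorizer_type == "tf-idf"
    X_vec = vectorizer.fit_transform(X_train)
    model = LinearSVC(C=c, max_iter=max_iter)  # type == "linear"
    model.fit(X_vec, y_train)
    return model, vectorizer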
def kmeans_run_all():
    pd.set_option('display.expand_frame_repr', True)
    pd.set_option('display.max_rows', 100)
    np.set_printoptions(precision=3, floatmode='fixed')

    for fn in c.ALL:
        k = c.ks[fn]
        t = 1
        df, class_id = parse_csv(fn)
        clusters, centroids = kmeans(df, k, t)

        results = evaluate_clusters(clusters, centroids, verbose=False)
        totals = results.sum()
        totals.name = c.TOTALS
        # DataFrame.append was removed in pandas 2.0; concat the totals row instead
        results = pd.concat([results, totals.to_frame().T])

        sfn = strip_file_path(fn)
        print(f'\nSummary - {sfn}')
        print(results)

        for idx, (cluster, centroid) in enumerate(zip(clusters, centroids)):
            print(f'\nCluster {idx + 1}')
            print(f'Centroid: {centroid}')
            print(cluster)

        # Only plot when the data is 2- or 3-dimensional
        if 2 <= clusters[0].shape[1] <= 3:
            plot_clusters([df], np.array([df.mean().values]), f'kmeans {sfn}')
            plot_clusters(clusters, centroids, f'kmeans clustered {sfn}')
def _run_unsupervised_clustering(self, visualize: bool = False):
    num_clusters = self._detect_num_clusters()
    (_, _, _, training_messages) = self._generate_funcs_contexts_messages(1000)
    k_means = cluster.KMeans(n_clusters=num_clusters)
    training_labels = k_means.fit_predict(training_messages)
    if visualize:
        utils.plot_clusters(training_messages, training_labels, "Messages clusters")

    # Align cluster ids with function/message ids:
    # generate messages for each function, then pair each cluster id
    # with the function that occurs most often in that cluster.
    (
        alignment_func_selectors,
        _,
        _,
        alignment_messages,
    ) = self._generate_funcs_contexts_messages(1000)
    alignment_func_idxs = alignment_func_selectors.argmax(dim=1)
    alignment_labels = k_means.predict(alignment_messages)

    func_counts_per_cluster = collections.defaultdict(collections.Counter)
    for i, cluster_label in enumerate(alignment_labels):
        function_idx = alignment_func_idxs[i]
        func_counts_per_cluster[cluster_label][function_idx] += 1

    cluster_label_to_func_idx = {
        cluster_label: func_counts.most_common(1)[0][0]
        for cluster_label, func_counts in func_counts_per_cluster.items()
    }
    assert len(cluster_label_to_func_idx) == num_clusters

    self.clustering_model = k_means
    self.cluster_label_to_func_idx = cluster_label_to_func_idx
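# A hypothetical inference helper (not part of the original class) showing how the
# two attributes stored above would typically be combined: predict a message's
# cluster, then look up the function id that cluster was aligned to.
def predict_function_idx(self, message):
    # Assumes `message` is a 1-D feature vector compatible with the fitted KMeans
    cluster_label = self.clustering_model.predict(message.reshape(1, -1))[0]
    return self.cluster_label_to_func_idx[cluster_label]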
try:
    opts, args = getopt.getopt(sys.argv[1:], "f:k:t:", ["file=", "threshold="])
except getopt.GetoptError:
    print(error_msg)
    sys.exit(2)

input_filename = None
K = 0
threshold = 0.01
for opt, arg in opts:
    if opt in ('-f', '--file'):
        input_filename = arg
    elif opt == '-k':
        K = int(arg)
    elif opt in ('-t', '--threshold'):
        threshold = float(arg)

if input_filename is None or K == 0:
    print(error_msg)
    sys.exit(2)

input_points = utils.read_points(input_filename)
centroids, clusters = lloyd_kmeans(input_points, K, threshold)
print("centroids:\n {}".format(centroids))
utils.plot_clusters(centroids, clusters)
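# For context, a minimal numpy sketch of the Lloyd iteration that lloyd_kmeans
# presumably implements (the name, seeding, and return convention here are
# assumptions; the real version may return grouped points rather than labels).
import numpy as np

def lloyd_kmeans_sketch(points, k, threshold, seed=0):
    """Alternate assignment and update steps until centroids move < threshold."""
    points = np.asarray(points, dtype=float)
    rng = np.random.default_rng(seed)
    centroids = points[rng.choice(len(points), size=k, replace=False)]
    while True:
        # Assignment step: nearest centroid for every point
        dists = np.linalg.norm(points[:, None, :] - centroids[None, :, :], axis=2)
        labels = dists.argmin(axis=1)
        # Update step: centroid = mean of its assigned points (keep old if empty)
        new_centroids = np.array([
            points[labels == j].mean(axis=0) if np.any(labels == j) else centroids[j]
            for j in range(k)
        ])
        if np.linalg.norm(new_centroids - centroids) < threshold:
            return new_centroids, labels
        centroids = new_centroids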
# Task 1: generate
easy_y = scatter_clusters([[0.5, 0.7], [1.5, 0.7], [1, 1.7]], [0.2, 0.2], N_POINTS)
medium_y = scatter_clusters([[0.5, 0.7], [1.5, 0.7], [1, 1.7]], [0.55, 0.55], N_POINTS)
hard_y = scatter_clusters([[0.5, 0.7], [1.5, 0.7], [1, 1.7]], [0.75, 0.75], N_POINTS)
y_true = {
    i: [idx for idx in range(i * N_POINTS, N_POINTS + i * N_POINTS)]
    for i in range(3)
}

# Task 1: plot
fig = plt.figure(figsize=FIG_SIZE)
easy_plot = plot_clusters(fig, easy_y, y_true)
easy_plot.savefig("plots/easy_true.pdf", bbox_inches='tight')

fig = plt.figure(figsize=FIG_SIZE)
medium_plot = plot_clusters(fig, medium_y, y_true)
medium_plot.savefig("plots/medium_true.pdf", bbox_inches='tight')

fig = plt.figure(figsize=FIG_SIZE)
hard_plot = plot_clusters(fig, hard_y, y_true)
hard_plot.savefig("plots/hard_true.pdf", bbox_inches='tight')

# plt.show()  # plots only the last problem; move up to see the others
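# A minimal sketch of the scatter_clusters generator assumed above: n_points
# samples per center with the given per-dimension spread, stacked in center order
# (which is what makes the y_true index ranges above line up). The Gaussian
# choice is an assumption; a uniform spread would also fit these calls.
import numpy as np

def scatter_clusters_sketch(centers, spread, n_points):
    return np.vstack([
        np.random.normal(loc=center, scale=spread, size=(n_points, len(center)))
        for center in centers
    ])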
importlib.reload(utils)

# Load the data
path = os.getcwd() + "/data/"
data_train = pd.read_table(path + "EMGaussian.data", header=None, sep=" ")
data_test = pd.read_table(path + "EMGaussian.test", header=None, sep=" ")
x = data_train.values.T
xtest = data_test.values.T
xall = np.concatenate((x, xtest), axis=1)

# Run k-means
k = 4
mus, z = kmeans.iterate_kmeans(x, k, nits=100, epsilon=0.001)

# Plot clusters and centers
fig1, ax1 = plt.subplots()
utils.plot_clusters(x, mus, z, ax1)
plt.title("K-means clustering on training data")

# Compare several runs of k-means with different random initializations
centers, objectives = kmeans.compare_several_runs(x, k, nsims=100, nits=100, epsilon=0.001)

# Plot the different centers obtained
kmeans.plot_centroids(centers, k)

# Plot histogram of distortion values
plt.hist(objectives)

# Run EM with covariance matrices proportional to the identity matrix
# Initialization with k-means
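# For reference, the distortion (k-means objective) whose histogram is plotted
# above is J = sum_i ||x_i - mu_{z_i}||^2. A sketch assuming x is (d, n), mus is
# (d, k), and z holds the cluster index of each column, matching the shapes here.
def distortion(x, mus, z):
    return sum(np.sum((x[:, z == j] - mus[:, j:j + 1]) ** 2)
               for j in range(mus.shape[1]))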
def main():
    args = parse_arguments(sys.argv[1:])
    print("Parameters:")
    for arg_ in args.sys_args:
        print(arg_)
    print()

    # read data
    # =========
    hapt_data = data.HAPT()
    hapt_data.load_all_data()
    hapt_data.aggregate_groups()

    exp_data = hapt_data.get_train_data()
    exp_labs = hapt_data.get_train_labels()
    exp_labels_map = hapt_data.get_labels_map()
    exp_centroids_num = len(hapt_data.get_labels_map())
    if args.data == "test":
        exp_data = hapt_data.get_test_data()
        exp_labs = hapt_data.get_test_labels()
        exp_centroids_num = len(hapt_data.get_labels_map())
    if args.aggregate:
        exp_labs = hapt_data.get_aggregated_train_labels()
        exp_labels_map = hapt_data.get_aggregated_labels_map()
        exp_centroids_num = len(hapt_data.get_aggregated_labels_map())
        if args.data == "test":
            exp_labs = hapt_data.get_aggregated_test_labels()

    # Show experiment data
    # ====================
    if args.showdata:
        utils.plot_clusters(exp_data, exp_labs, exp_labels_map, True)
        return

    # evolution
    # =========
    iterations_list, scores_list, populations_list = [], [], []
    total_time_list, log_dir_list, best_indiv_idx_list = [], [], []
    # best score, experiment, generation (iteration), individual
    best_overall = (-1, 0, 0, 0)
    for exp_i in range(args.repeat):
        iterations, scores, populations, total_time, log_dir, best_indiv_idx = \
            evolution.run_SGA(args.iter_num, exp_data, exp_labs, args.pop_num,
                              args.prob_cross, args.prob_mutation,
                              exp_centroids_num, args.adapt_function,
                              args.dist_measure, log_dir="logs",
                              loggin_pref="exp {}/{}: ".format(exp_i + 1, args.repeat))
        cur_best_score = scores[best_indiv_idx[0], best_indiv_idx[1]]
        if best_overall[0] < cur_best_score:
            best_overall = (cur_best_score, exp_i, best_indiv_idx[0], best_indiv_idx[1])
        iterations_list.append(iterations)
        scores_list.append(scores)
        populations_list.append(populations)
        total_time_list.append(total_time)
        log_dir_list.append(log_dir)
        best_indiv_idx_list.append(best_indiv_idx)
        # save plot
        plot_tuple = ("pop:" + str(args.pop_num),
                      "p_c:" + str(args.prob_cross),
                      "p_m:" + str(args.prob_mutation),
                      "data size:" + str(len(exp_labs)),
                      args.adapt_function, args.dist_measure)
        utils.plot_scores(iterations, scores, args.adapt_function, plot_tuple,
                          to_file=True, out_dir=log_dir)

    # visualize
    # =========
    if 1 < args.repeat:
        plot_tuple = ("pop:" + str(args.pop_num),
                      "p_c:" + str(args.prob_cross),
                      "p_m:" + str(args.prob_mutation),
                      "data size:" + str(len(exp_labs)),
                      args.adapt_function, args.dist_measure)
        utils.plot_avg_scores(iterations_list, scores_list, args.adapt_function,
                              best_indiv_idx_list, plot_tuple, to_file=True,
                              out_dirs=log_dir_list)
# --------------------------------
# Visualizing the data
# --------------------------------
if __name__ == '__main__':
    from utils import plot_clusters

    blobs_data, blobs_clusters = blobs(600, n_blobs=4, surplus=500)
    moons_data, moons_clusters = two_moons(600)
    point_circle_data, point_circle_clusters = point_and_circle(600)
    worst_blobs_data, worst_blobs_clusters = worst_case_blob(600, 5.0)

    print(blobs_data.shape)
    # print((blobs_clusters == 0).sum())
    # print((blobs_clusters == 1).sum())
    # print((blobs_clusters == 2).sum())
    # print((blobs_clusters == 3).sum())

    plot_clusters(blobs_data, blobs_clusters, 'blobs', show=True)
    plot_clusters(moons_data, moons_clusters, 'moons', show=False)
    plot_clusters(point_circle_data, point_circle_clusters, 'point and circle', show=False)
    plot_clusters(worst_blobs_data, worst_blobs_clusters, 'worst case blob', show=True)
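# A minimal sketch of the plot_clusters helper imported above, with the signature
# inferred from the calls (data, labels, title, show flag); the real utils version
# may also save the figure or style it differently.
import matplotlib.pyplot as plt
import numpy as np

def plot_clusters_sketch(data, clusters, title, show=True):
    fig, ax = plt.subplots()
    for label in np.unique(clusters):
        mask = clusters == label
        ax.scatter(data[mask, 0], data[mask, 1], s=10,
                   label='cluster {}'.format(label))
    ax.set_title(title)
    ax.legend()
    if show:
        plt.show()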
    plot_feature_importance, pr_curve, print_classfication_report,
    read_data, write_data, score_model)
from sklearn import metrics

batch_size = 128
epochs = 50

X_train, y_train, X_test = read_data()

# Feature diagnostics
plot_feature_corr(X_train)
plot_feature_corr(np.vstack(X_test), stem='test')
plot_pca(X_train, y_train)
plot_clusters(X_train, y_train)
indices_ci = plot_feature_importance(X_train, y_train)

# Legacy (pre-0.18) sklearn API: StratifiedKFold took the labels directly
skf = StratifiedKFold(y_train, n_folds=4)
train_index, dev_index = next(iter(skf))
X_dev = X_train[dev_index]
y_dev = y_train[dev_index]
X_train = X_train[train_index]
y_train = y_train[train_index]

# Since GMM works well, transform the data to an alternate space
kpca = KernelPCA(kernel="rbf", fit_inverse_transform=True)
kp = kpca.fit(X_train)
X_train = kp.transform(X_train)
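# Presumably the held-out split is projected with the same fitted transform before
# being used for evaluation; a one-line sketch of that step (not in the original):
X_dev = kp.transform(X_dev)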
def main():
    args = parse_arguments(sys.argv[1:])

    # read params
    # ===========
    # possible params:
    # iter_num, pop_num, centers_num, prob_cross, prob_mutation, data shape, labs shape,
    # adapt_function, dist_measure, log_dir, best score, best score (index), total_time
    exp_params = {}
    text_file = [f for f in os.listdir(args.path) if f.endswith(".txt")][0]
    with open(os.path.join(args.path, text_file), "r") as text_f:
        for line in text_f:
            line = line.replace("\t", "").strip().split(":")
            if len(line) == 2 and line[0] != "" and line[1] != "":
                if line[0] in ("iter_num", "pop_num", "centers_num"):
                    exp_params[line[0].replace(" ", "_")] = int(line[1])
                elif line[0] in ("prob_cross", "prob_mutation", "best score"):
                    exp_params[line[0].replace(" ", "_")] = float(line[1])
                elif line[0] in ("data shape", "labs shape"):
                    exp_params[line[0].replace(" ", "_")] = make_tuple(line[1])
                elif line[0] == "best score (index)":
                    # e.g. "best score (index): generation 95, individual 99"
                    line[1] = line[1].strip().split(",")
                    exp_params["best_index"] = (
                        int(line[1][0].strip().split(" ")[1]),
                        int(line[1][1].strip().split(" ")[1]))
                else:
                    exp_params[line[0].replace(" ", "_")] = line[1]

    print("\nexperiment parameters were:")
    for k, v in exp_params.items():
        print("{:20}: {}".format(k, v))

    # read results
    # ============
    generations = np.load(os.path.join(args.path, "generations.npy"))
    iterations = np.load(os.path.join(args.path, "iterations.npy"))
    scores = np.load(os.path.join(args.path, "scores.npy"))
    best_centers = generations[exp_params["best_index"][0],
                               exp_params["best_index"][1]]

    print("\nobtained results are:")
    print("generations (total num, pop size, centrs num, feats num): {}".format(
        generations.shape))
    print("iterations (iterations num,): {}".format(iterations.shape))
    print("scores (total num, pop size): {}".format(scores.shape))
    print("generations total num, iterations num and scores total num must be equal!")
) print("generations pop size and scores pop size must be equal too!") plot_tuple = ("pop:" + str(exp_params["pop_num"]), "p_c:" + str(exp_params["prob_cross"]), "p_m:" + str(exp_params["prob_mutation"]), "data size:" + str(len(exp_params["data_shape"])), exp_params["adapt_function"], exp_params["dist_measure"], "best score:" + str(exp_params["best_score"])[:9] + " at " + str(exp_params["best_index"])) utils.plot_scores(iterations, scores, exp_params["adapt_function"], plot_tuple, not args.nooutput, out_dir=args.outdir) # read data # ========= print("reading data...") hapt_data = data.HAPT() hapt_data.load_all_data() hapt_data.aggregate_groups() test_data = hapt_data.get_test_data() test_labs = hapt_data.get_test_labels() train_data = hapt_data.get_train_data() train_labs = hapt_data.get_train_labels() labs_map = hapt_data.get_labels_map() if exp_params["centers_num"] == 3: test_labs = hapt_data.get_aggregated_test_labels() train_labs = hapt_data.get_aggregated_train_labels() labs_map = hapt_data.get_aggregated_labels_map() centroids_num = len(labs_map) assert exp_params["centers_num"] == centroids_num # do clusterizations # ================== print("clustering...") labels_names = list(labs_map.values()) # train data train_clust_labs = cluster.Centroids.cluster( train_data, best_centers, dist_func=exp_params["dist_measure"]) train_clust_labs = cluster.Utils.adjust_labels(train_clust_labs, train_labs) train_silh = cluster.Evaluate.silhouette(train_data, train_clust_labs, exp_params["dist_measure"]) train_silh_normalized = (train_silh + 1) / 2 train_info_gain = cluster.Evaluate.information_gain( train_labs, train_clust_labs) mapped_train_clust_labs = [labs_map[l] for l in train_clust_labs] mapped_train_labs = [labs_map[l] for l in train_labs] train_conf_mtx = confusion_matrix(mapped_train_labs, mapped_train_clust_labs, labels=labels_names) print("train set\tsilh: {:.6}, silh normalized: {:.6}, info gain: {:.6}". format(train_silh, train_silh_normalized, train_info_gain)) # test data test_clust_labs = cluster.Centroids.cluster( test_data, best_centers, dist_func=exp_params["dist_measure"]) test_clust_labs = cluster.Utils.adjust_labels(test_clust_labs, test_labs) test_silh = cluster.Evaluate.silhouette(test_data, test_clust_labs, exp_params["dist_measure"]) test_silh_normalized = (test_silh + 1) / 2 test_info_gain = cluster.Evaluate.information_gain(test_labs, test_clust_labs) mapped_test_clust_labs = [labs_map[l] for l in test_clust_labs] mapped_test_labs = [labs_map[l] for l in test_labs] test_conf_mtx = confusion_matrix(mapped_test_labs, mapped_test_clust_labs, labels=labels_names) print("test set\tsilh: {:.6}, silh normalized: {:.6}, info gain: {:.6}". 
    # Show data
    # =========
    print("creating plots...")

    # clusters
    utils.plot_clusters(train_data, train_labs, labs_map, True,
                        out_dir=args.outdir, filename="train_orig_clusters")
    utils.plot_clusters(train_data, train_clust_labs, labs_map, True,
                        out_dir=args.outdir, filename="train_obtained_clusters")
    utils.plot_clusters(test_data, test_labs, labs_map, True,
                        out_dir=args.outdir, filename="test_orig_clusters")
    utils.plot_clusters(test_data, test_clust_labs, labs_map, True,
                        out_dir=args.outdir, filename="test_obtained_clusters")

    # confusion matrices
    utils.plot_confusion_matrix(
        train_conf_mtx, labels_names, normalize=False,
        title='Confusion matrix\ntrain set\n'
              '(silh: {:.6}, silh normalized: {:.6}, info gain: {:.6})'.format(
                  train_silh, train_silh_normalized, train_info_gain),
        cmap=plt.cm.Blues, out_dir=args.outdir,
        filename="train_conf_matr_silh_info_gain")
    utils.plot_confusion_matrix(
        test_conf_mtx, labels_names, normalize=False,
        title='Confusion matrix\ntest set\n'
              '(silh: {:.6}, silh normalized: {:.6}, info gain: {:.6})'.format(
                  test_silh, test_silh_normalized, test_info_gain),
        cmap=plt.cm.Blues, out_dir=args.outdir,
        filename="test_conf_matr_silh_info_gain")

    print("inference ended")