def create_tree_result(dir_path: str, tree: DpNode, pred_labels: np.ndarray,
                       ground_truth: np.ndarray, col_node_id_map):
    """Write a plain-text evaluation report to ``{dir_path}/results.txt``.

    The report contains, in this order: the NMI between ``pred_labels`` and
    ``ground_truth``, the clustering accuracy together with its confusion
    matrix, the dendrogram purity and leaf purity of ``tree``, the entries of
    ``col_node_id_map`` (``key:value`` pairs, tab separated, sorted by key),
    the string rendering of the tree and finally its label distribution.

    Args:
        dir_path: Directory that receives ``results.txt``; it is created
            (including parents) when it does not exist yet.
        tree: Root node handed to ``dendrogram_purity``, ``leaf_purity``,
            ``tree2string`` and ``_label_distribution``.
        pred_labels: Predicted cluster labels.
        ground_truth: Ground-truth labels.
        col_node_id_map: Mapping whose items are written as ``key:value``.
            Keys must be mutually comparable because the items are sorted
            by key.
    """
    os.makedirs(dir_path, exist_ok=True)

    # Compute everything before touching the file: opening with "wt"
    # truncates an existing report, so a failing metric must not leave an
    # empty or half-written results.txt behind.
    nmi_value = nmi(pred_labels, ground_truth, average_method='arithmetic')
    acc, acc_confusion = cluster_acc(pred_labels, ground_truth)
    purity = dendrogram_purity(tree, ground_truth)
    lp = leaf_purity(tree, ground_truth)
    col_nod_str = "\t".join(
        f"{key}:{value}"
        for key, value in sorted(col_node_id_map.items(),
                                 key=lambda item: item[0]))
    tree_str = tree2string(tree, "children", "node_id")
    label_distribution = _label_distribution(tree, ground_truth)

    with open(f"{dir_path}/results.txt", "wt") as f:
        f.write(f"NMI: {nmi_value}\n")
        # The escaped backslash emits the literal "\ " of the original header.
        f.write(f"ACC: {acc} confusion (ground truth \\ prediction):\n")
        f.write(f"{acc_confusion}\n")
        f.write("\n\n")
        f.write(f"Dendrogram Purity: {purity}\n")
        f.write(f"Leaf Purity: Avg:{lp[0]:1.3} std:{lp[1]:1.3}\n")
        f.write(f"{col_nod_str}\n")
        f.write("\n\n")
        f.write(f"{tree_str} \n")
        f.write("\n\n")
        f.write(label_distribution)
def run_experiment(ae_model_path):
    """Evaluate k-means on the embedding of a pre-trained autoencoder.

    Loads the autoencoder weights stored at ``ae_model_path``, embeds the
    module-level ``pt_data`` with it, clusters the embedding with ``k_means``
    into ``n_clusters`` clusters and appends one tab-separated line
    (model name, NMI, ACC) to
    ``{result_dir}/results_ae_kmeans_{dataset_name}.txt``. A header line is
    written first when that file did not exist before.

    Relies on the module-level names ``logger``, ``pt_data``, ``gold_labels``,
    ``n_clusters``, ``result_dir`` and ``dataset_name``, and moves the model
    and every batch to the GPU via ``.cuda()``.

    Args:
        ae_model_path: Path-like object with a ``.name`` attribute pointing to
            a state dict loadable with ``torch.load``.

    Raises:
        ValueError: If ``pt_data`` yields no batches (nothing to concatenate).
    """
    banner = "++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++"
    logger.info(banner)
    logger.info(banner)
    logger.info(f"Working now on {ae_model_path.name}")
    logger.info(banner)
    logger.info(banner)

    new_seed = random.randint(0, 1000)
    logger.info(f"Seed value for this is: {new_seed}")
    set_random_seed(new_seed)

    ae_module = stacked_ae(pt_data.shape[1], [500, 500, 2000, 10],
                           weight_initalizer=torch.nn.init.xavier_normal_,
                           activation_fn=lambda x: F.relu(x),
                           loss_fn=None,
                           optimizer_fn=None)
    model_data = torch.load(ae_model_path, map_location='cpu')
    ae_module.load_state_dict(model_data)
    ae_module = ae_module.cuda()

    # Get embedded data. Batches are collected and concatenated once instead
    # of re-copying the growing array on every iteration; no_grad avoids
    # building an autograd graph that would be detached right away anyway.
    embedded_batches = []
    with torch.no_grad():
        for batch_data in torch.utils.data.DataLoader(pt_data,
                                                      batch_size=256,
                                                      shuffle=False):
            # The first element of the forward output is used as the embedding.
            embedded_batch = ae_module.forward(batch_data.cuda())[0]
            embedded_batches.append(embedded_batch.detach().cpu().numpy())
    embedded_data = np.concatenate(embedded_batches, 0)
    del ae_module

    # Perform k-means
    k_means_labels = k_means(embedded_data, n_clusters, n_init=20)[1]
    k_means_nmi_value = nmi(gold_labels, k_means_labels,
                            average_method='arithmetic')
    k_means_acc_value = cluster_acc(gold_labels, k_means_labels)[0]

    result_file = Path(f"{result_dir}/results_ae_kmeans_{dataset_name}.txt")
    # Must be checked before opening: "a+" creates the file.
    result_file_exists = result_file.exists()
    with open(result_file, "a+") as f:
        if not result_file_exists:
            f.write("#\"ae_model_name\"\t\"NMI\"\t\"ACC\"\n")
        f.write(
            f"{ae_model_path.name}\t{k_means_nmi_value}\t{k_means_acc_value}\n")
def evaluate(train_round_idx, ae_module, cluster_module):
    """Predict hard cluster labels for ``pt_data`` and score them.

    The module-level ``pt_data`` is pushed through ``ae_module`` in batches of
    256 on the GPU; ``cluster_module.prediction_hard_np`` turns each embedded
    batch into labels. The labels are compared against the module-level
    ``gold_labels`` and the scores are logged via ``logger``.

    Args:
        train_round_idx: Value used as prefix of the log message.
        ae_module: Model whose ``forward`` returns a pair; the first element
            is used as the embedding.
        cluster_module: Object providing ``prediction_hard_np(embedded)``.

    Returns:
        Tuple ``(nmi_value, acc_value)``.
    """
    test_loader = torch.utils.data.DataLoader(
        torch.utils.data.TensorDataset(pt_data), batch_size=256)
    # ``np.int`` was only an alias of the builtin ``int`` and no longer exists
    # in current NumPy releases; ``int`` yields the identical dtype.
    pred_labels = np.zeros(pt_data.shape[0], dtype=int)
    index = 0
    # Pure inference: no autograd graph is needed for the predictions.
    with torch.no_grad():
        for batch in test_loader:
            # TensorDataset wraps every sample in a tuple, hence the [0].
            batch_data = batch[0].cuda()
            batch_size = batch_data.shape[0]
            embedded_data, _reconstructed = ae_module.forward(batch_data)
            labels = cluster_module.prediction_hard_np(embedded_data)
            pred_labels[index:index + batch_size] = labels
            index += batch_size

    nmi_value = nmi(gold_labels, pred_labels, average_method='arithmetic')
    acc_value = cluster_acc(gold_labels, pred_labels)[0]
    logger.info(
        f"{train_round_idx} Evaluation: NMI: {nmi_value} ACC: {acc_value}")
    return nmi_value, acc_value
def run_experiment(ae_model_path):
    """Evaluate bisecting k-means on the embedding of a pre-trained autoencoder.

    Loads the autoencoder weights stored at ``ae_model_path``, embeds the
    module-level ``pt_data`` with it, builds a tree with ``bisection`` and
    appends one tab-separated line (model name, NMI, ACC, dendrogram purity,
    leaf purity mean and std) to
    ``{result_dir}/results_ae_biseckm_{dataset_name}.txt``. A header line is
    written first when that file did not exist before.

    Relies on the module-level names ``logger``, ``pt_data``, ``gold_labels``,
    ``n_clusters``, ``n_leaf_nodes_final``, ``result_dir`` and
    ``dataset_name``, and moves the model and every batch to the GPU via
    ``.cuda()``.

    Args:
        ae_model_path: Path-like object with a ``.name`` attribute pointing to
            a state dict loadable with ``torch.load``.

    Raises:
        ValueError: If ``pt_data`` yields no batches (nothing to concatenate).
    """
    banner = "++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++"
    logger.info(banner)
    logger.info(banner)
    logger.info(f"Working now on {ae_model_path.name}")
    logger.info(banner)
    logger.info(banner)

    new_seed = random.randint(0, 1000)
    logger.info(f"Seed value for this is: {new_seed}")
    set_random_seed(new_seed)

    ae_module = stacked_ae(pt_data.shape[1], [500, 500, 2000, 10],
                           weight_initalizer=torch.nn.init.xavier_normal_,
                           activation_fn=lambda x: F.relu(x),
                           loss_fn=None,
                           optimizer_fn=None)
    model_data = torch.load(ae_model_path, map_location='cpu')
    ae_module.load_state_dict(model_data)
    ae_module = ae_module.cuda()

    # Get embedded data. Batches are collected and concatenated once instead
    # of re-copying the growing array on every iteration; no_grad avoids
    # building an autograd graph that would be detached right away anyway.
    embedded_batches = []
    with torch.no_grad():
        for batch_data in torch.utils.data.DataLoader(pt_data,
                                                      batch_size=256,
                                                      shuffle=False):
            # The first element of the forward output is used as the embedding.
            embedded_batch = ae_module.forward(batch_data.cuda())[0]
            embedded_batches.append(embedded_batch.detach().cpu().numpy())
    embedded_data = np.concatenate(embedded_batches, 0)
    del ae_module

    # bisecting k-means:
    tree = bisection(n_leaf_nodes_final, embedded_data)
    bisec_labels = predict_by_tree(tree, embedded_data, n_clusters)
    bisec_tree = predict_id_tree(tree, embedded_data)
    bisec_km_nmi = nmi(gold_labels, bisec_labels, average_method='arithmetic')
    bisec_km_acc = cluster_acc(bisec_labels, gold_labels)[0]
    bisec_km_purity = dendrogram_purity(bisec_tree, gold_labels)
    lp = leaf_purity(bisec_tree, gold_labels)
    leaf_purity_value = f"{lp[0]:1.3}\t({lp[1]:1.3})"

    result_file = Path(f"{result_dir}/results_ae_biseckm_{dataset_name}.txt")
    # Must be checked before opening: "a+" creates the file.
    result_file_exists = result_file.exists()
    with open(result_file, "a+") as f:
        if not result_file_exists:
            f.write(
                "#\"ae_model_name\"\t\"NMI\"\t\"ACC\"\t\"Dendrogram_Purity\"\t\"Leaf_Purity\t(Std)\"\n"
            )
        f.write(
            f"{ae_model_path.name}\t{bisec_km_nmi}\t{bisec_km_acc}\t{bisec_km_purity}\t{leaf_purity_value}\n"
        )
def run_experiment(ae_model_path):
    """Evaluate agglomerative clustering on a pre-trained autoencoder embedding.

    Loads the autoencoder weights stored at ``ae_model_path``, embeds the
    module-level ``pt_data`` with it and then runs ``AgglomerativeClustering``
    twice, first with single and then with complete linkage. For each linkage
    one tab-separated line (model name, NMI, ACC, dendrogram purity, leaf
    purity mean and std) is appended to
    ``{result_dir}/results_ae_agglo_{linkage}_{dataset_name}.txt``. A header
    line is written first when that file did not exist before.

    Relies on the module-level names ``logger``, ``pt_data``, ``gold_labels``,
    ``n_clusters``, ``n_leaf_nodes_final``, ``result_dir`` and
    ``dataset_name``, and moves the model and every batch to the GPU via
    ``.cuda()``.

    Args:
        ae_model_path: Path-like object with a ``.name`` attribute pointing to
            a state dict loadable with ``torch.load``.

    Raises:
        ValueError: If ``pt_data`` yields no batches (nothing to concatenate).
    """
    banner = "++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++"
    logger.info(banner)
    logger.info(banner)
    logger.info(f"Working now on {ae_model_path.name}")
    logger.info(banner)
    logger.info(banner)

    new_seed = random.randint(0, 1000)
    logger.info(f"Seed value for this is: {new_seed}")
    set_random_seed(new_seed)

    ae_module = stacked_ae(pt_data.shape[1], [500, 500, 2000, 10],
                           weight_initalizer=torch.nn.init.xavier_normal_,
                           activation_fn=lambda x: F.relu(x),
                           loss_fn=None,
                           optimizer_fn=None)
    model_data = torch.load(ae_model_path, map_location='cpu')
    ae_module.load_state_dict(model_data)
    ae_module = ae_module.cuda()

    # Get embedded data. Batches are collected and concatenated once instead
    # of re-copying the growing array on every iteration; no_grad avoids
    # building an autograd graph that would be detached right away anyway.
    embedded_batches = []
    with torch.no_grad():
        for batch_data in torch.utils.data.DataLoader(pt_data,
                                                      batch_size=256,
                                                      shuffle=False):
            # The first element of the forward output is used as the embedding.
            embedded_batch = ae_module.forward(batch_data.cuda())[0]
            embedded_batches.append(embedded_batch.detach().cpu().numpy())
    embedded_data = np.concatenate(embedded_batches, 0)
    del ae_module

    # Both linkages share the exact same pipeline; only the linkage name and
    # the matching part of the result file name differ.
    for linkage in ("single", "complete"):
        clustering = AgglomerativeClustering(
            compute_full_tree=True,
            n_clusters=n_clusters,
            linkage=linkage).fit(embedded_data)
        labels = clustering.labels_
        purity_tree = prune_dendrogram_purity_tree(
            to_dendrogram_purity_tree(clustering.children_),
            n_leaf_nodes_final)
        nmi_value = nmi(gold_labels, labels, average_method='arithmetic')
        acc_value = cluster_acc(labels, gold_labels)[0]
        purity_value = dendrogram_purity(purity_tree, gold_labels)
        lp = leaf_purity(purity_tree, gold_labels)
        leaf_purity_value = f"{lp[0]:1.3}\t({lp[1]:1.3})"

        result_file = Path(
            f"{result_dir}/results_ae_agglo_{linkage}_{dataset_name}.txt")
        # Must be checked before opening: "a+" creates the file.
        result_file_exists = result_file.exists()
        with open(result_file, "a+") as f:
            if not result_file_exists:
                f.write(
                    "#\"ae_model_name\"\t\"NMI\"\t\"ACC\"\t\"Dendrogram_Purity\"\t\"Leaf_Purity\t(Std)\"\n"
                )
            f.write(
                f"{ae_model_path.name}\t{nmi_value}\t{acc_value}\t{purity_value}\t{leaf_purity_value}\n"
            )
        # Drop the references before the next linkage is fitted so the
        # previous model and tree can be reclaimed.
        del clustering, labels, purity_tree