def run_experiment(ae_model_path):
    logger.info(
        "++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++"
    )
    logger.info(
        "++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++"
    )
    logger.info(f"Working now on {ae_model_path.name}")
    logger.info(
        "++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++"
    )
    logger.info(
        "++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++"
    )
    new_seed = random.randint(0, 1000)
    logger.info(f"Seed value for this run is: {new_seed}")
    set_random_seed(new_seed)

    ae_module = stacked_ae(pt_data.shape[1], [500, 500, 2000, 10],
                           weight_initalizer=torch.nn.init.xavier_normal_,
                           activation_fn=lambda x: F.relu(x),
                           loss_fn=None,
                           optimizer_fn=None)
    model_data = torch.load(ae_model_path, map_location='cpu')
    ae_module.load_state_dict(model_data)
    ae_module = ae_module.cuda()

    # Get embedded data (batch-wise, to avoid exhausting GPU memory)
    embedded_data = None
    for batch_data in torch.utils.data.DataLoader(pt_data, batch_size=256,
                                                  shuffle=False):
        embedded_batch_np = ae_module.forward(
            batch_data.cuda())[0].detach().cpu().numpy()
        if embedded_data is None:
            embedded_data = embedded_batch_np
        else:
            embedded_data = np.concatenate([embedded_data, embedded_batch_np], 0)
    del ae_module

    # Perform k-means in the embedded space
    k_means_labels = k_means(embedded_data, n_clusters, n_init=20)[1]
    k_means_nmi_value = nmi(gold_labels, k_means_labels,
                            average_method='arithmetic')
    k_means_acc_value = cluster_acc(gold_labels, k_means_labels)[0]

    result_file = Path(f"{result_dir}/results_ae_kmeans_{dataset_name}.txt")
    result_file_exists = result_file.exists()
    with open(result_file, "a+") as f:
        if not result_file_exists:
            f.write("#\"ae_model_name\"\t\"NMI\"\t\"ACC\"\n")
        f.write(f"{ae_model_path.name}\t{k_means_nmi_value}\t{k_means_acc_value}\n")
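# Hedged usage sketch (an assumption, not from the original script): the
# experiment is typically repeated over every pretrained checkpoint. The
# "*.model" glob and the Path(ae_dir, dataset_name) layout mirror paths used
# elsewhere in this repository but are not confirmed by this file.
for ae_model_path in sorted(Path(ae_dir, dataset_name).glob("*.model")):
    run_experiment(ae_model_path)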
def init_data_and_ae():
    ae_path = Path(Path(__file__).parent, "ae.model")
    data, gold_labels = make_blobs(n_samples=1000, centers=3, n_features=2,
                                   random_state=42)
    min_max_scaler = preprocessing.MinMaxScaler((0.01, 0.99))
    data = np.float32(min_max_scaler.fit_transform(data))
    n_features = data.shape[1]
    pt_data = torch.from_numpy(data).cuda()
    train_ds = torch.utils.data.TensorDataset(pt_data)
    train_loader = torch.utils.data.DataLoader(train_ds, batch_size=64,
                                               shuffle=True)
    plot_data(data, gold_labels, "Original data")

    ae_reconstruction_loss_fn = lambda x, y: torch.mean((x - y) ** 2)
    ae_module = stacked_ae(n_features, [50, 50, 200, 2],
                           weight_initalizer=torch.nn.init.xavier_normal_,
                           activation_fn=lambda x: F.leaky_relu(x),
                           loss_fn=ae_reconstruction_loss_fn,
                           optimizer_fn=lambda parameters: torch.optim.Adam(
                               parameters, lr=0.001))
    if ae_path.exists():
        model_data = torch.load(ae_path, map_location='cpu')
        ae_module.load_state_dict(model_data)
        ae_module = ae_module.cuda()
    else:
        print("Warning: training a new AE because no pretrained model was "
              "found; results may differ from those reported in the paper.")
        ae_module = ae_module.cuda()
        ae_module.pretrain(train_loader, 1000)
        ae_module.refine_training(train_loader, 5000)
        torch.save(ae_module.state_dict(), ae_path)

    embedded_data = ae_module.forward(pt_data)[0]
    embedded_data_np = embedded_data.data.cpu().numpy()
    plot_data(embedded_data_np, gold_labels, "Embedded data (initial)")
    return ae_module, pt_data, gold_labels, train_ds, train_loader, ae_reconstruction_loss_fn
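# Hedged usage sketch (not part of the original file): the helper above is the
# entry point of this demo; everything the subsequent training code needs is
# returned in one tuple.
ae_module, pt_data, gold_labels, train_ds, train_loader, ae_reconstruction_loss_fn = \
    init_data_and_ae()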
def run_experiment(ae_model_path, seed):
    logger.info(
        "++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++"
    )
    logger.info(
        "++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++"
    )
    logger.info(f"Working now on {ae_model_path.name}")
    logger.info(
        "++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++"
    )
    logger.info(
        "++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++"
    )
    logger.info(f"Seed value for this run is: {seed}")
    set_random_seed(seed)

    train = torch.utils.data.TensorDataset(pt_data)
    train_loader = torch.utils.data.DataLoader(train, batch_size=256,
                                               shuffle=True)
    n_features = data.shape[1]
    ae_reconstruction_loss_fn = lambda x, y: torch.mean((x - y) ** 2)
    ae_module = stacked_ae(n_features, [500, 500, 2000, 10],
                           weight_initalizer=torch.nn.init.xavier_normal_,
                           activation_fn=lambda x: F.relu(x),
                           loss_fn=None,
                           optimizer_fn=None)
    model_data = torch.load(ae_model_path, map_location='cpu')
    ae_module.load_state_dict(model_data)
    ae_module = ae_module.cuda()

    optimizer = torch.optim.Adam(list(ae_module.parameters()), lr=0.0001)
    # Zero-argument factory: each call yields a fresh, shuffled iterator of
    # embedded batches drawn from the split data.
    embedded_split_data_loader = lambda: map(
        lambda x: ae_module.forward(x.cuda())[0],
        torch.utils.data.DataLoader(pt_split_data, batch_size=256, shuffle=True))
    cluster_module = ECTree(optimizer, embedded_split_data_loader).cuda()

    def evaluate(train_round_idx, ae_module, cluster_module):
        test_loader = torch.utils.data.DataLoader(
            torch.utils.data.TensorDataset(
                pt_data, torch.tensor(range(pt_data.shape[0]))),
            batch_size=256)
        pred_labels = np.zeros(pt_data.shape[0], dtype=int)
        pred_tree = None
        index = 0
        n_batches = 0
        print("start evaluation")
        for batch_data_id in test_loader:
            batch_data, batch_ids = batch_data_id
            batch_data = batch_data.cuda()
            n_batches += 1
            batch_size = batch_data.shape[0]
            embedded_data, reconstructed_data = ae_module.forward(batch_data)
            labels = cluster_module.prediction_labels_np(
                min(cluster_module.n_leaf_nodes, n_clusters), embedded_data)[0]
            pred_labels[index:index + batch_size] = labels
            new_pred_tree = cluster_module.predict_tree(embedded_data, batch_ids)
            if pred_tree is None:
                pred_tree = new_pred_tree
            else:
                pred_tree = combine_to_trees(pred_tree, new_pred_tree)
            index = index + batch_size
        lp = leaf_purity(pred_tree, gold_labels)
        nmi_best_prun_tree = as_flat_clustering_pruned_for_highest_measure(
            pred_tree, n_clusters, gold_labels,
            lambda x, y: nmi(x, y, average_method='arithmetic'))
        acc_best_prun_tree = as_flat_clustering_pruned_for_highest_measure(
            pred_tree, n_clusters, gold_labels,
            lambda x, y: cluster_acc(x, y)[0])
        nmi_value = nmi(gold_labels, pred_labels, average_method='arithmetic')
        acc_value = cluster_acc(gold_labels, pred_labels)[0]
        dp_value = dendrogram_purity(pred_tree, gold_labels)
        leaf_purity_value = f"{lp[0]:1.3}\t({lp[1]:1.3})"
        logger.info(
            f"{train_round_idx} leaf_purity: {leaf_purity_value}, D-purity: {dp_value}, "
            f"NMI: {nmi_value} ACC: {acc_value} NMI(best p-tree): {nmi_best_prun_tree} "
            f"ACC (best p-tree): {acc_best_prun_tree}")
        return (nmi_value, acc_value, dp_value, leaf_purity_value,
                nmi_best_prun_tree, acc_best_prun_tree)

    evaluate("init", ae_module, cluster_module)
    n_rounds = 40000
    train_round_idx = 0
    while True:  # each full pass over train_loader is one epoch
        for batch_data in train_loader:
            train_round_idx += 1
            if train_round_idx > n_rounds:
                break
            batch_data = batch_data[0].cuda()
            # Grow the cluster tree every 500 steps until the leaf budget is reached.
            if train_round_idx % 500 == 0 and cluster_module.n_leaf_nodes < n_leaf_nodes_final:
                cluster_module.split_highest_sse_node()

            embedded_data, reconstructed_data = ae_module.forward(batch_data)
            ae_loss = ae_reconstruction_loss_fn(batch_data, reconstructed_data)
            dp_loss, center_losses = cluster_module.loss(embedded_data,
                                                         is_training=True)
            total_loss = dp_loss + center_losses + ae_loss

            if train_round_idx <= 10 or train_round_idx % 100 == 0:
                logger.info(
                    f"{train_round_idx} - loss in this batch: dp_loss:{dp_loss.item()} "
                    f"center_losses:{center_losses.item()} ae_loss:{ae_loss.item()} "
                    f"total_loss: {total_loss.item()}")

            # Backward pass
            optimizer.zero_grad()
            total_loss.backward()
            optimizer.step()

            if train_round_idx % 2000 == 0:
                evaluate(train_round_idx, ae_module, cluster_module)
        else:
            # The else branch runs only if the for-loop finished without break;
            # in that case we start the next epoch, otherwise we leave the
            # while-loop as well.
            continue
        break  # reached only via the inner break

    # Write the final evaluation
    (nmi_value, acc_value, dp_value, leaf_purity_value,
     nmi_best_prun_tree, acc_best_prun_tree) = evaluate("", ae_module,
                                                        cluster_module)
    result_file = Path(result_dir, f"results_{dataset_name}.txt")
    result_file_exists = result_file.exists()
    with open(result_file, "a+") as f:
        if not result_file_exists:
            f.write("#\"ae_model_name\"\t\"NMI\"\t\"ACC\"\t\"Dendrogram_Purity\"\t"
                    "\"Leaf_Purity\"\t\"(Std)\"\t\"NMI(best-p-tree)\"\t\"ACC(best-p-tree)\"\n")
        f.write(f"{ae_model_path.name}\t{nmi_value}\t{acc_value}\t{dp_value}\t"
                f"{leaf_purity_value}\t{nmi_best_prun_tree}\t{acc_best_prun_tree}\n")
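# Side note (a generic Python sketch, not repository code): the `for ... else`
# construct above is what lets the inner batch loop stop the outer
# `while True` epoch loop once n_rounds is reached. `else` on a for-loop runs
# only when the loop finished without `break`:
def bounded_training(loader, n_rounds):
    round_idx = 0
    while True:
        for batch in loader:
            round_idx += 1
            if round_idx > n_rounds:
                break  # stops only the inner for-loop
            # ... process the batch here ...
        else:
            continue  # loader exhausted normally: start the next epoch
        break  # reached only via the inner break: leave the while-loop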
torch.manual_seed(np.random.randint(10000))
pt_data = torch.from_numpy(data)
pt_split_data = torch.from_numpy(data[split_idx, :])
ae_model_path = Path(ae_dir, dataset_name, "ae_reuters_5.model")  # with partial splits

train = torch.utils.data.TensorDataset(pt_data)
train_loader = torch.utils.data.DataLoader(train, batch_size=256, shuffle=True)
n_features = data.shape[1]
ae_reconstruction_loss_fn = lambda x, y: torch.mean((x - y) ** 2)
ae_module = stacked_ae(n_features, [500, 500, 2000, 10],
                       weight_initalizer=torch.nn.init.xavier_normal_,
                       activation_fn=lambda x: F.relu(x),
                       loss_fn=None,
                       optimizer_fn=None)
model_data = torch.load(ae_model_path, map_location='cpu')
ae_module.load_state_dict(model_data)
ae_module = ae_module.cuda()
optimizer = torch.optim.Adam(list(ae_module.parameters()), lr=0.0001)
embedded_split_data_loader = lambda: map(
    lambda x: ae_module.forward(x.cuda())[0],
    torch.utils.data.DataLoader(pt_split_data, batch_size=256, shuffle=True))
cluster_module = ECTree(optimizer, embedded_split_data_loader).cuda()
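# Hedged sketch (an assumption, not repository code): `embedded_split_data_loader`
# above is a zero-argument factory, so each call produces a fresh, freshly
# shuffled iterator of embedded batches. A named, self-contained equivalent:
import torch

def make_embedded_loader(encoder, tensor_data, batch_size=256):
    def loader():
        dl = torch.utils.data.DataLoader(tensor_data, batch_size=batch_size,
                                         shuffle=True)
        # encoder(...)[0] mirrors stacked_ae.forward, which returns
        # (embedding, reconstruction)
        return map(lambda x: encoder(x.cuda())[0], dl)
    return loader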
def run_experiment(ae_model_path):
    logger.info(
        "++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++"
    )
    logger.info(
        "++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++"
    )
    logger.info(f"Working now on {ae_model_path.name}")
    logger.info(
        "++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++"
    )
    logger.info(
        "++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++"
    )
    new_seed = random.randint(0, 1000)
    logger.info(f"Seed value for this run is: {new_seed}")
    set_random_seed(new_seed)

    ae_module = stacked_ae(pt_data.shape[1], [500, 500, 2000, 10],
                           weight_initalizer=torch.nn.init.xavier_normal_,
                           activation_fn=lambda x: F.relu(x),
                           loss_fn=None,
                           optimizer_fn=None)
    model_data = torch.load(ae_model_path, map_location='cpu')
    ae_module.load_state_dict(model_data)
    ae_module = ae_module.cuda()

    # Get embedded data
    embedded_data = None
    for batch_data in torch.utils.data.DataLoader(pt_data, batch_size=256,
                                                  shuffle=False):
        embedded_batch_np = ae_module.forward(
            batch_data.cuda())[0].detach().cpu().numpy()
        if embedded_data is None:
            embedded_data = embedded_batch_np
        else:
            embedded_data = np.concatenate([embedded_data, embedded_batch_np], 0)
    del ae_module

    # Bisecting k-means on the embedded data
    tree = bisection(n_leaf_nodes_final, embedded_data)
    bisec_labels = predict_by_tree(tree, embedded_data, n_clusters)
    bisec_tree = predict_id_tree(tree, embedded_data)
    bisec_km_nmi = nmi(gold_labels, bisec_labels, average_method='arithmetic')
    bisec_km_acc = cluster_acc(bisec_labels, gold_labels)[0]
    bisec_km_purity = dendrogram_purity(bisec_tree, gold_labels)
    lp = leaf_purity(bisec_tree, gold_labels)
    leaf_purity_value = f"{lp[0]:1.3}\t({lp[1]:1.3})"

    result_file = Path(f"{result_dir}/results_ae_biseckm_{dataset_name}.txt")
    result_file_exists = result_file.exists()
    with open(result_file, "a+") as f:
        if not result_file_exists:
            f.write("#\"ae_model_name\"\t\"NMI\"\t\"ACC\"\t\"Dendrogram_Purity\"\t"
                    "\"Leaf_Purity\t(Std)\"\n")
        f.write(f"{ae_model_path.name}\t{bisec_km_nmi}\t{bisec_km_acc}\t"
                f"{bisec_km_purity}\t{leaf_purity_value}\n")
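# Hedged sketch of plain bisecting k-means (the repository's `bisection` helper
# may use a different splitting criterion; `bisecting_kmeans` below is a
# hypothetical name): repeatedly 2-means-split the leaf with the largest sum of
# squared errors until the leaf budget is reached.
import numpy as np
from sklearn.cluster import KMeans

def bisecting_kmeans(X, n_leaves):
    leaves = [np.arange(X.shape[0])]  # one index set per leaf

    def sse(idx):
        center = X[idx].mean(axis=0)
        return ((X[idx] - center) ** 2).sum()

    while len(leaves) < n_leaves:
        # Split the leaf with the highest SSE into two via 2-means.
        worst = max(range(len(leaves)), key=lambda i: sse(leaves[i]))
        idx = leaves.pop(worst)
        split = KMeans(n_clusters=2, n_init=10).fit_predict(X[idx])
        leaves.append(idx[split == 0])
        leaves.append(idx[split == 1])
    return leaves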
def run_experiment(ae_model_path):
    logger.info(
        "++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++"
    )
    logger.info(
        "++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++"
    )
    logger.info(f"Working now on {ae_model_path.name}")
    logger.info(
        "++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++"
    )
    logger.info(
        "++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++"
    )
    new_seed = random.randint(0, 1000)
    logger.info(f"Seed value for this run is: {new_seed}")
    set_random_seed(new_seed)

    train = torch.utils.data.TensorDataset(pt_data)
    train_loader = torch.utils.data.DataLoader(train, batch_size=256,
                                               shuffle=True)
    n_features = pt_data.shape[1]
    # Same loss as in the DEC implementation
    ae_reconstruction_loss_fn = lambda x, y: torch.mean((x - y) ** 2)
    ae_module = stacked_ae(n_features, [500, 500, 2000, 10],
                           weight_initalizer=torch.nn.init.xavier_normal_,
                           activation_fn=lambda x: F.relu(x),
                           loss_fn=None,
                           optimizer_fn=None)
    model_data = torch.load(ae_model_path, map_location='cpu')
    ae_module.load_state_dict(model_data)
    ae_module = ae_module.cuda()

    # Initialize the cluster centers with k-means on a smaller embedded sample
    node_data = None
    for batch_data in torch.utils.data.DataLoader(pt_init_sample,
                                                  batch_size=256,
                                                  shuffle=True):
        embedded_batch_np = ae_module.forward(
            batch_data.cuda())[0].detach().cpu().numpy()
        if node_data is None:
            node_data = embedded_batch_np
        else:
            node_data = np.concatenate([node_data, embedded_batch_np], 0)
    init_centers = k_means(node_data, n_clusters, n_init=20)[0]

    cluster_module = DEC(init_centers).cuda()
    optimizer = torch.optim.Adam(list(ae_module.parameters()) +
                                 list(cluster_module.parameters()), lr=0.001)

    def evaluate(train_round_idx, ae_module, cluster_module):
        test_loader = torch.utils.data.DataLoader(
            torch.utils.data.TensorDataset(pt_data), batch_size=256)
        pred_labels = np.zeros(pt_data.shape[0], dtype=int)
        index = 0
        n_batches = 0
        for batch_data in test_loader:
            batch_data = batch_data[0].cuda()
            n_batches += 1
            batch_size = batch_data.shape[0]
            embedded_data, reconstructed_data = ae_module.forward(batch_data)
            labels = cluster_module.prediction_hard_np(embedded_data)
            pred_labels[index:index + batch_size] = labels
            index = index + batch_size
        pred_tree = dendrogram_purity_tree_from_clusters(
            cluster_module, pred_labels, 'single')
        pred_tree2 = dendrogram_purity_tree_from_clusters(
            cluster_module, pred_labels, 'complete')
        lp = leaf_purity(pred_tree, gold_labels)
        leaf_purity_value = f"{lp[0]:1.3}\t({lp[1]:1.3})"
        dp_value_single = dendrogram_purity(pred_tree, gold_labels)
        dp_value_complete = dendrogram_purity(pred_tree2, gold_labels)
        logger.info(
            f"{train_round_idx} Evaluation: leaf_purity: {leaf_purity_value}, "
            f"purity_single: {dp_value_single}, purity_complete: {dp_value_complete}")
        return dp_value_single, dp_value_complete, leaf_purity_value

    evaluate("init", ae_module, cluster_module)
    n_rounds = 40000
    train_round_idx = 0
    while True:  # each full pass over train_loader is one epoch
        for batch_data in train_loader:
            train_round_idx += 1
            if train_round_idx > n_rounds:
                break
            batch_data = batch_data[0].cuda()
            embedded_data, reconstructed_data = ae_module.forward(batch_data)
            ae_loss = ae_reconstruction_loss_fn(batch_data, reconstructed_data)
            cluster_loss = cluster_module.loss_dec_compression(embedded_data)
            loss = cluster_loss + 0.1 * ae_loss
            if train_round_idx == 1 or train_round_idx % 100 == 0:
                logger.info(
                    f"{train_round_idx} - loss in this batch: cluster_loss:{cluster_loss.item()} "
                    f"ae_loss:{ae_loss.item()} total_loss: {loss.item()}")
            # Backward pass
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            if train_round_idx % 2000 == 0:
                evaluate(train_round_idx, ae_module, cluster_module)
        else:
            # The else branch runs only if the for-loop finished without break;
            # in that case we start the next epoch, otherwise we leave the
            # while-loop as well.
            continue
        break  # reached only via the inner break

    # Write the final evaluation
    dp_value_single, dp_value_complete, leaf_purity_value = evaluate(
        "", ae_module, cluster_module)
    result_file = Path(result_dir, f"results_{dataset_name}.txt")
    result_file_exists = result_file.exists()
    with open(result_file, "a+") as f:
        if not result_file_exists:
            f.write("#\"ae_model_name\"\t\"Dendrogram_Purity Single\"\t"
                    "\"Dendrogram_Purity Complete\"\t\"Leaf_Purity\t(Std)\"\n")
        f.write(f"{ae_model_path.name}\t{dp_value_single}\t{dp_value_complete}\t"
                f"{leaf_purity_value}\n")
def run_experiment(ae_model_path):
    logger.info(
        "++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++"
    )
    logger.info(
        "++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++"
    )
    logger.info(f"Working now on {ae_model_path.name}")
    logger.info(
        "++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++"
    )
    logger.info(
        "++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++"
    )
    new_seed = random.randint(0, 1000)
    logger.info(f"Seed value for this run is: {new_seed}")
    set_random_seed(new_seed)

    ae_module = stacked_ae(pt_data.shape[1], [500, 500, 2000, 10],
                           weight_initalizer=torch.nn.init.xavier_normal_,
                           activation_fn=lambda x: F.relu(x),
                           loss_fn=None,
                           optimizer_fn=None)
    model_data = torch.load(ae_model_path, map_location='cpu')
    ae_module.load_state_dict(model_data)
    ae_module = ae_module.cuda()

    # Get embedded data
    embedded_data = None
    for batch_data in torch.utils.data.DataLoader(pt_data, batch_size=256,
                                                  shuffle=False):
        embedded_batch_np = ae_module.forward(
            batch_data.cuda())[0].detach().cpu().numpy()
        if embedded_data is None:
            embedded_data = embedded_batch_np
        else:
            embedded_data = np.concatenate([embedded_data, embedded_batch_np], 0)
    del ae_module

    # Agglomerative clustering with single linkage
    sl_cl = AgglomerativeClustering(compute_full_tree=True,
                                    n_clusters=n_clusters,
                                    linkage="single").fit(embedded_data)
    sl_labels = sl_cl.labels_
    sl_purity_tree = prune_dendrogram_purity_tree(
        to_dendrogram_purity_tree(sl_cl.children_), n_leaf_nodes_final)
    sl_nmi = nmi(gold_labels, sl_labels, average_method='arithmetic')
    sl_acc = cluster_acc(sl_labels, gold_labels)[0]
    sl_purity = dendrogram_purity(sl_purity_tree, gold_labels)
    sl_lp = leaf_purity(sl_purity_tree, gold_labels)
    sl_leaf_purity_value = f"{sl_lp[0]:1.3}\t({sl_lp[1]:1.3})"

    result_file_sl = Path(f"{result_dir}/results_ae_agglo_single_{dataset_name}.txt")
    result_file_sl_exists = result_file_sl.exists()
    with open(result_file_sl, "a+") as f:
        if not result_file_sl_exists:
            f.write("#\"ae_model_name\"\t\"NMI\"\t\"ACC\"\t\"Dendrogram_Purity\"\t"
                    "\"Leaf_Purity\t(Std)\"\n")
        f.write(f"{ae_model_path.name}\t{sl_nmi}\t{sl_acc}\t{sl_purity}\t"
                f"{sl_leaf_purity_value}\n")
    del sl_cl, sl_labels, sl_purity_tree

    # Agglomerative clustering with complete linkage
    cl_cl = AgglomerativeClustering(compute_full_tree=True,
                                    n_clusters=n_clusters,
                                    linkage="complete").fit(embedded_data)
    cl_labels = cl_cl.labels_
    cl_purity_tree = prune_dendrogram_purity_tree(
        to_dendrogram_purity_tree(cl_cl.children_), n_leaf_nodes_final)
    cl_nmi = nmi(gold_labels, cl_labels, average_method='arithmetic')
    cl_acc = cluster_acc(cl_labels, gold_labels)[0]
    cl_purity = dendrogram_purity(cl_purity_tree, gold_labels)
    cl_lp = leaf_purity(cl_purity_tree, gold_labels)
    cl_leaf_purity_value = f"{cl_lp[0]:1.3}\t({cl_lp[1]:1.3})"

    result_file_cl = Path(f"{result_dir}/results_ae_agglo_complete_{dataset_name}.txt")
    result_file_cl_exists = result_file_cl.exists()
    with open(result_file_cl, "a+") as f:
        if not result_file_cl_exists:
            f.write("#\"ae_model_name\"\t\"NMI\"\t\"ACC\"\t\"Dendrogram_Purity\"\t"
                    "\"Leaf_Purity\t(Std)\"\n")
        f.write(f"{ae_model_path.name}\t{cl_nmi}\t{cl_acc}\t{cl_purity}\t"
                f"{cl_leaf_purity_value}\n")
    del cl_cl, cl_labels, cl_purity_tree
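# Hedged sketch (an assumption about what `to_dendrogram_purity_tree` consumes):
# sklearn's AgglomerativeClustering exposes its merge history in `children_`,
# an array of shape (n_samples - 1, 2). Entries below n_samples are sample
# indices; an entry i >= n_samples refers to the merge created at row
# i - n_samples, and the root is node 2 * n_samples - 2.
def leaves_of(children, n_samples, node):
    # Collect the sample indices below `node` in the agglomerative tree.
    if node < n_samples:
        return [node]
    left, right = children[node - n_samples]
    return (leaves_of(children, n_samples, left) +
            leaves_of(children, n_samples, right))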
for index in range(0, 10):
    logging.info(f"Start training ae {index}")
    train = torch.utils.data.TensorDataset(pt_data)
    train_loader = torch.utils.data.DataLoader(train, batch_size=256,
                                               shuffle=True, pin_memory=True)
    # Autoencoder layout from the original DEC paper
    ae = stacked_ae(n_features, ae_layout,
                    weight_initalizer=torch.nn.init.xavier_normal_,
                    activation_fn=lambda x: F.relu(x),
                    loss_fn=loss_fn,
                    optimizer_fn=lambda parameters: torch.optim.Adam(
                        parameters, lr=0.0001)).cuda()

    def add_noise(batch):
        # Keep each entry with probability 0.8 and zero it otherwise
        # (masking corruption for denoising pretraining).
        mask = torch.empty(batch.shape, device=batch.device).bernoulli_(0.8)
        return batch * mask

    ae.pretrain(train_loader, rounds_per_layer=steps_per_layer,
                dropout_rate=0.2, corruption_fn=add_noise)
    logging.info(f"Complete data loss after pretraining {get_total_loss()}")
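# Hedged illustration (not in the original file): bernoulli_(0.8) keeps each
# entry with probability 0.8, so add_noise zeroes roughly 20% of the inputs,
# the classic masking corruption of a denoising autoencoder.
example = torch.ones(2, 8)
mask = torch.empty(example.shape).bernoulli_(0.8)
print(example * mask)  # on average about one in five entries is zeroed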