Code example #1
def run_experiment(ae_model_path):
    separator = "+" * 82
    logger.info(separator)
    logger.info(separator)
    logger.info(f"Working now on {ae_model_path.name}")
    logger.info(separator)
    logger.info(separator)
    new_seed = random.randint(0, 1000)
    logger.info(f"Seed value for this is: {new_seed}")
    set_random_seed(new_seed)

    ae_module = stacked_ae(pt_data.shape[1], [500, 500, 2000, 10],
                           weight_initalizer=torch.nn.init.xavier_normal_,
                           activation_fn=lambda x: F.relu(x),
                           loss_fn=None,
                           optimizer_fn=None)

    model_data = torch.load(ae_model_path, map_location='cpu')
    ae_module.load_state_dict(model_data)
    ae_module = ae_module.cuda()

    # Get embedded data
    embedded_data = None
    for batch_data in torch.utils.data.DataLoader(pt_data,
                                                  batch_size=256,
                                                  shuffle=False):
        embedded_batch_np = ae_module.forward(
            batch_data.cuda())[0].detach().cpu().numpy()
        if embedded_data is None:
            embedded_data = embedded_batch_np
        else:
            embedded_data = np.concatenate([embedded_data, embedded_batch_np],
                                           0)
    del ae_module

    # Perform k-means
    k_means_labels = k_means(embedded_data, n_clusters, n_init=20)[1]

    k_means_nmi_value = nmi(gold_labels,
                            k_means_labels,
                            average_method='arithmetic')
    k_means_acc_value = cluster_acc(gold_labels, k_means_labels)[0]

    result_file = Path(f"{result_dir}/results_ae_kmeans_{dataset_name}.txt")
    result_file_exists = result_file.exists()
    f = open(result_file, "a+")
    if not result_file_exists:
        f.write("#\"ae_model_name\"\t\"NMI\"\t\"ACC\"\n")
    f.write(
        f"{ae_model_path.name}\t{k_means_nmi_value}\t{k_means_acc_value}\n")
    f.close()
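The helper set_random_seed is not shown in these snippets. A minimal sketch of what it presumably does, seeding every RNG the experiments touch (the exact body in the repository may differ):

import random

import numpy as np
import torch


def set_random_seed(seed):
    # Seed Python's, NumPy's, and PyTorch's generators so a run is reproducible.
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)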
Code example #2
File: common_stuff.py Project: Mcpaeis/Liger
def init_data_and_ae():
    ae_path = Path(Path(__file__).parent, "ae.model")

    data, gold_labels = make_blobs(n_samples=1000, centers=3, n_features=2, random_state=42)

    min_max_scaler = preprocessing.MinMaxScaler((0.01, 0.99))
    data = np.float32(min_max_scaler.fit_transform(data))
    n_features = data.shape[1]
    pt_data = torch.from_numpy(data).cuda()
    train_ds = torch.utils.data.TensorDataset(pt_data)
    train_loader = torch.utils.data.DataLoader(train_ds, batch_size=64, shuffle=True)

    plot_data(data, gold_labels, "Original data")

    ae_reconstruction_loss_fn = lambda x, y: torch.mean((x - y) ** 2)
    ae_module = stacked_ae(n_features, [50, 50, 200, 2],
                           weight_initalizer=torch.nn.init.xavier_normal_,
                           activation_fn=lambda x: F.leaky_relu(x),
                           loss_fn=ae_reconstruction_loss_fn,
                           optimizer_fn=lambda parameters: torch.optim.Adam(parameters, lr=0.001))

    if ae_path.exists():
        model_data = torch.load(ae_path, map_location='cpu')
        ae_module.load_state_dict(model_data)
        ae_module = ae_module.cuda()
    else:
        print(
            "Warning: training a new AE because no pretrained model was found; "
            "results may differ from the paper.")
        ae_module = ae_module.cuda()
        ae_module.pretrain(train_loader, 1000)
        ae_module.refine_training(train_loader, 5000)
        torch.save(ae_module.state_dict(), ae_path)

    embedded_data = ae_module.forward(pt_data)[0]
    embedded_data_np = embedded_data.data.cpu().numpy()
    plot_data(embedded_data_np, gold_labels, "Embedded data (initial)")

    return ae_module, pt_data, gold_labels, train_ds, train_loader, ae_reconstruction_loss_fn
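plot_data is another helper that is not part of the snippet. Assuming 2-d inputs as produced by make_blobs, a minimal matplotlib sketch could look like this (hypothetical; the repository's version may add styling):

import matplotlib.pyplot as plt


def plot_data(data, labels, title):
    # Scatter the 2-d points, colored by their (gold or predicted) label.
    plt.figure()
    plt.scatter(data[:, 0], data[:, 1], c=labels, s=5, cmap="tab10")
    plt.title(title)
    plt.show()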
Code example #3
File: ect_vanilla.py Project: Mcpaeis/Liger
def run_experiment(ae_model_path, seed):
    separator = "+" * 82
    logger.info(separator)
    logger.info(separator)
    logger.info(f"Working now on {ae_model_path.name}")
    logger.info(separator)
    logger.info(separator)
    logger.info(f"Seed value for this is: {seed}")
    set_random_seed(seed)

    train = torch.utils.data.TensorDataset(pt_data)
    train_loader = torch.utils.data.DataLoader(train,
                                               batch_size=256,
                                               shuffle=True)

    n_features = data.shape[1]
    ae_reconstruction_loss_fn = lambda x, y: torch.mean((x - y)**2)

    ae_module = stacked_ae(n_features, [500, 500, 2000, 10],
                           weight_initalizer=torch.nn.init.xavier_normal_,
                           activation_fn=lambda x: F.relu(x),
                           loss_fn=None,
                           optimizer_fn=None)

    model_data = torch.load(ae_model_path, map_location='cpu')
    ae_module.load_state_dict(model_data)
    ae_module = ae_module.cuda()

    optimizer = torch.optim.Adam(list(ae_module.parameters()), lr=0.0001)

    embedded_split_data_loader = lambda: map(
        lambda x: ae_module.forward(x.cuda())[0],
        torch.utils.data.DataLoader(
            pt_split_data, batch_size=256, shuffle=True))

    cluster_module = ECTree(optimizer, embedded_split_data_loader).cuda()

    def evaluate(train_round_idx, ae_module, cluster_module):
        test_loader = torch.utils.data.DataLoader(
            torch.utils.data.TensorDataset(
                pt_data, torch.tensor(range(pt_data.shape[0]))),
            batch_size=256)

        pred_labels = np.zeros(pt_data.shape[0], dtype=int)  # np.int was removed in NumPy 1.24
        pred_tree = None
        index = 0
        n_batches = 0
        print("start evaluation")
        for batch_data_id in test_loader:
            batch_data, batch_ids = batch_data_id
            batch_data = batch_data.cuda()
            n_batches += 1
            batch_size = batch_data.shape[0]
            embedded_data, reconstructed_data = ae_module.forward(batch_data)
            labels = cluster_module.prediction_labels_np(
                min(cluster_module.n_leaf_nodes, n_clusters), embedded_data)[0]
            pred_labels[index:index + batch_size] = labels
            new_pred_tree = cluster_module.predict_tree(
                embedded_data, batch_ids)
            if pred_tree is None:
                pred_tree = new_pred_tree
            else:
                pred_tree = combine_to_trees(pred_tree, new_pred_tree)
            index = index + batch_size

        lp = leaf_purity(pred_tree, gold_labels)
        nmi_best_prun_tree = as_flat_clustering_pruned_for_highest_measure(
            pred_tree, n_clusters, gold_labels,
            lambda x, y: nmi(x, y, 'arithmetic'))
        acc_best_prun_tree = as_flat_clustering_pruned_for_highest_measure(
            pred_tree, n_clusters, gold_labels,
            lambda x, y: cluster_acc(x, y)[0])
        nmi_value = nmi(gold_labels, pred_labels, average_method='arithmetic')
        acc_value = cluster_acc(gold_labels, pred_labels)[0]
        dp_value = dendrogram_purity(pred_tree, gold_labels)
        leaf_purity_value = f"{lp[0]:1.3}\t({lp[1]:1.3})"
        logger.info(
            f"{train_round_idx}  leaf_purity: {leaf_purity_value}, D-purity: {dp_value}, NMI: {nmi_value} ACC: {acc_value}  NMI(best p-tree): {nmi_best_prun_tree} ACC (best p-tree): {acc_best_prun_tree}"
        )
        return nmi_value, acc_value, dp_value, leaf_purity_value, nmi_best_prun_tree, acc_best_prun_tree

    evaluate("init", ae_module, cluster_module)

    n_rounds = 40000
    train_round_idx = 0
    while True:  # each iteration is equal to an epoch
        for batch_data in train_loader:
            train_round_idx += 1
            if train_round_idx > n_rounds:
                break
            batch_data = batch_data[0].cuda()

            if train_round_idx % 500 == 0 and cluster_module.n_leaf_nodes < n_leaf_nodes_final:
                cluster_module.split_highest_sse_node()

            embedded_data, reconstructed_data = ae_module.forward(batch_data)
            ae_loss = ae_reconstruction_loss_fn(batch_data, reconstructed_data)

            dp_loss, center_losses = cluster_module.loss(embedded_data,
                                                         is_training=True)

            total_loss = dp_loss + center_losses + ae_loss

            if train_round_idx <= 10 or train_round_idx % 100 == 0:
                logger.info(
                    f"{train_round_idx} - loss in this batch: dp_loss:{dp_loss.item()} "
                    f"center_losses:{center_losses.item()} ae_loss:{ae_loss.item()} total_loss: {total_loss.item()}"
                )

            # Backward pass
            optimizer.zero_grad()
            total_loss.backward()
            optimizer.step()

            if train_round_idx % 2000 == 0:
                evaluate(train_round_idx, ae_module, cluster_module)

        else:  # The for-else runs only when no break occurred; continue with the next epoch.
            continue
        break  # A break inside the for loop lands here and ends training.

    # Write last evaluation
    nmi_value, acc_value, dp_value, leaf_purity_value, nmi_best_prun_tree, acc_best_prun_tree = evaluate(
        "", ae_module, cluster_module)
    result_file = Path(result_dir, f"results_{dataset_name}.txt")
    write_header = not result_file.exists()
    with open(result_file, "a+") as f:
        if write_header:
            f.write(
                "#\"ae_model_name\"\t\"NMI\"\t\"ACC\"\t\"Dendrogram_Purity\"\t\"Leaf_Purity\"\t\"(Std)\"\t\"NMI(best-p-tree)\"\t\"ACC(best-p-tree)\"\n"
            )
        f.write(
            f"{ae_model_path.name}\t{nmi_value}\t{acc_value}\t{dp_value}\t{leaf_purity_value}\t{nmi_best_prun_tree}\t{acc_best_prun_tree}\n"
        )
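cluster_acc is used throughout but never defined here. Its first return value is consumed as an accuracy, which matches the standard unsupervised clustering accuracy: find the one-to-one matching between predicted and gold labels that maximizes agreement, via the Hungarian algorithm. A self-contained sketch of that metric, assuming non-negative integer labels (the repository's implementation may return additional values):

import numpy as np
from scipy.optimize import linear_sum_assignment


def cluster_acc_sketch(y_true, y_pred):
    # Contingency matrix: w[i, j] counts points with predicted label i and gold label j.
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    d = max(y_pred.max(), y_true.max()) + 1
    w = np.zeros((d, d), dtype=np.int64)
    for p, t in zip(y_pred, y_true):
        w[p, t] += 1
    # Hungarian matching on -w maximizes the total agreement.
    row_ind, col_ind = linear_sum_assignment(-w)
    return w[row_ind, col_ind].sum() / y_pred.size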
Code example #4
torch.manual_seed(np.random.randint(10000))

pt_data = torch.from_numpy(data)
pt_split_data = torch.from_numpy(data[split_idx, :])

ae_model_path = Path(ae_dir, dataset_name,
                     "ae_reuters_5.model")  # with partial splits

train = torch.utils.data.TensorDataset(pt_data)
train_loader = torch.utils.data.DataLoader(train, batch_size=256, shuffle=True)

n_features = data.shape[1]
ae_reconstruction_loss_fn = lambda x, y: torch.mean((x - y)**2)
ae_module = stacked_ae(n_features, [500, 500, 2000, 10],
                       weight_initalizer=torch.nn.init.xavier_normal_,
                       activation_fn=lambda x: F.relu(x),
                       loss_fn=None,
                       optimizer_fn=None)

model_data = torch.load(ae_model_path, map_location='cpu')
ae_module.load_state_dict(model_data)
ae_module = ae_module.cuda()

optimizer = torch.optim.Adam(list(ae_module.parameters()), lr=0.0001)

embedded_split_data_loader = lambda: map(
    lambda x: ae_module.forward(x.cuda())[0],
    torch.utils.data.DataLoader(pt_split_data, batch_size=256, shuffle=True))

cluster_module = ECTree(optimizer, embedded_split_data_loader).cuda()
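Wrapping the loader in a lambda matters here: a DataLoader iterator is exhausted after one pass, so ECTree is handed a factory it can call whenever it needs a fresh pass over the split data, embedded with the encoder's current weights. A short usage sketch, assuming the setup above has run:

# Each call builds a new DataLoader and lazily maps the encoder over it.
n_points = 0
for embedded_batch in embedded_split_data_loader():
    n_points += embedded_batch.shape[0]
print(f"embedded {n_points} points with the current encoder weights")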
Code example #5
def run_experiment(ae_model_path):
    separator = "+" * 82
    logger.info(separator)
    logger.info(separator)
    logger.info(f"Working now on {ae_model_path.name}")
    logger.info(separator)
    logger.info(separator)
    new_seed = random.randint(0, 1000)
    logger.info(f"Seed value for this is: {new_seed}")
    set_random_seed(new_seed)

    ae_module = stacked_ae(pt_data.shape[1], [500, 500, 2000, 10],
                           weight_initalizer=torch.nn.init.xavier_normal_,
                           activation_fn=lambda x: F.relu(x),
                           loss_fn=None,
                           optimizer_fn=None)

    model_data = torch.load(ae_model_path, map_location='cpu')
    ae_module.load_state_dict(model_data)
    ae_module = ae_module.cuda()

    # Get embedded data
    embedded_data = None
    for batch_data in torch.utils.data.DataLoader(pt_data,
                                                  batch_size=256,
                                                  shuffle=False):
        embedded_batch_np = ae_module.forward(
            batch_data.cuda())[0].detach().cpu().numpy()
        if embedded_data is None:
            embedded_data = embedded_batch_np
        else:
            embedded_data = np.concatenate([embedded_data, embedded_batch_np],
                                           0)
    del ae_module

    # bisecting k-means:
    tree = bisection(n_leaf_nodes_final, embedded_data)
    bisec_labels = predict_by_tree(tree, embedded_data, n_clusters)
    bisec_tree = predict_id_tree(tree, embedded_data)
    bisec_km_nmi = nmi(gold_labels, bisec_labels, average_method='arithmetic')
    bisec_km_acc = cluster_acc(bisec_labels, gold_labels)[0]
    bisec_km_purity = dendrogram_purity(bisec_tree, gold_labels)
    lp = leaf_purity(bisec_tree, gold_labels)
    leaf_purity_value = f"{lp[0]:1.3}\t({lp[1]:1.3})"

    result_file = Path(f"{result_dir}/results_ae_biseckm_{dataset_name}.txt")
    result_file_exists = result_file.exists()
    f = open(result_file, "a+")
    if not result_file_exists:
        f.write(
            "#\"ae_model_name\"\t\"NMI\"\t\"ACC\"\t\"Dendrogram_Purity\"\t\"Leaf_Purity\t(Std)\"\n"
        )
    f.write(
        f"{ae_model_path.name}\t{bisec_km_nmi}\t{bisec_km_acc}\t{bisec_km_purity}\t{leaf_purity_value}\n"
    )
    f.close()
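The bisection helper is not shown. Bisecting k-means itself is straightforward: keep all points in one cluster, then repeatedly split the cluster with the largest sum of squared errors using 2-means until the desired number of leaves is reached. A minimal sketch of the idea (the repository's bisection additionally records the split tree, and a robust version would guard against clusters of fewer than two points):

import numpy as np
from sklearn.cluster import KMeans


def bisecting_kmeans_sketch(data, n_leaves):
    clusters = [np.arange(data.shape[0])]
    while len(clusters) < n_leaves:
        # Split the cluster with the highest total squared error.
        sse = [((data[idx] - data[idx].mean(axis=0)) ** 2).sum()
               for idx in clusters]
        idx = clusters.pop(int(np.argmax(sse)))
        labels = KMeans(n_clusters=2, n_init=10).fit_predict(data[idx])
        clusters.append(idx[labels == 0])
        clusters.append(idx[labels == 1])
    return clusters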
Code example #6
File: idec_center_linkage.py Project: Mcpaeis/Liger
def run_experiment(ae_model_path):
    separator = "+" * 82
    logger.info(separator)
    logger.info(separator)
    logger.info(f"Working now on {ae_model_path.name}")
    logger.info(separator)
    logger.info(separator)
    new_seed = random.randint(0, 1000)
    logger.info(f"Seed value for this is: {new_seed}")
    set_random_seed(new_seed)
    train = torch.utils.data.TensorDataset(pt_data)
    train_loader = torch.utils.data.DataLoader(train,
                                               batch_size=256,
                                               shuffle=True)

    n_features = pt_data.shape[1]
    # Same loss as in the DEC implementation
    ae_reconstruction_loss_fn = lambda x, y: torch.mean((x - y)**2)
    ae_module = stacked_ae(n_features, [500, 500, 2000, 10],
                           weight_initalizer=torch.nn.init.xavier_normal_,
                           activation_fn=lambda x: F.relu(x),
                           loss_fn=None,
                           optimizer_fn=None)

    model_data = torch.load(ae_model_path, map_location='cpu')
    ae_module.load_state_dict(model_data)
    ae_module = ae_module.cuda()

    node_data = None
    for batch_data in torch.utils.data.DataLoader(pt_init_sample,
                                                  batch_size=256,
                                                  shuffle=True):
        embedded_batch_np = ae_module.forward(
            batch_data.cuda())[0].detach().cpu().numpy()
        if node_data is None:
            node_data = embedded_batch_np
        else:
            node_data = np.concatenate([node_data, embedded_batch_np], 0)
    init_centers = k_means(node_data, n_clusters, n_init=20)[0]

    # Initialize cluster centers based on a smaller sample
    cluster_module = DEC(init_centers).cuda()
    optimizer = torch.optim.Adam(list(ae_module.parameters()) +
                                 list(cluster_module.parameters()),
                                 lr=0.001)

    def evaluate(train_round_idx, ae_module, cluster_module):
        test_loader = torch.utils.data.DataLoader(
            torch.utils.data.TensorDataset(pt_data), batch_size=256)

        pred_labels = np.zeros(pt_data.shape[0], dtype=int)  # np.int was removed in NumPy 1.24
        index = 0
        n_batches = 0
        for batch_data in test_loader:
            batch_data = batch_data[0].cuda()
            n_batches += 1
            batch_size = batch_data.shape[0]
            embedded_data, reconstructed_data = ae_module.forward(batch_data)
            labels = cluster_module.prediction_hard_np(embedded_data)
            pred_labels[index:index + batch_size] = labels
            index = index + batch_size
        pred_tree = dendrogram_purity_tree_from_clusters(
            cluster_module, pred_labels, 'single')
        pred_tree2 = dendrogram_purity_tree_from_clusters(
            cluster_module, pred_labels, 'complete')
        lp = leaf_purity(pred_tree, gold_labels)
        leaf_purity_value = f"{lp[0]:1.3}\t({lp[1]:1.3})"
        dp_value_single = dendrogram_purity(pred_tree, gold_labels)
        dp_value_complete = dendrogram_purity(pred_tree2, gold_labels)
        logger.info(
            f"{train_round_idx} Evaluation:  leaf_purity: {leaf_purity_value}, purity_single: {dp_value_single}, purity_complete: {dp_value_complete}"
        )
        return dp_value_single, dp_value_complete, leaf_purity_value

    evaluate("init", ae_module, cluster_module)

    n_rounds = 40000
    train_round_idx = 0
    while True:  # each iteration is equal to an epoch
        for batch_data in train_loader:
            train_round_idx += 1
            if train_round_idx > n_rounds:
                break
            batch_data = batch_data[0].cuda()

            embedded_data, reconstructed_data = ae_module.forward(batch_data)
            ae_loss = ae_reconstruction_loss_fn(batch_data, reconstructed_data)

            cluster_loss = cluster_module.loss_dec_compression(embedded_data)
            loss = cluster_loss + 0.1 * ae_loss
            if train_round_idx == 1 or train_round_idx % 100 == 0:
                logger.info(
                    f"{train_round_idx} - loss in this batch: cluster_loss:{cluster_loss.item()} "
                    f"ae_loss:{ae_loss.item()} total_loss: {loss.item()}"
                )

            # Backward pass
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            if train_round_idx % 2000 == 0:
                evaluate(train_round_idx, ae_module, cluster_module)
        else:  # The for-else runs only when no break occurred; continue with the next epoch.
            continue
        break  # A break inside the for loop lands here and ends training.

    # Write last evaluation

    dp_value_single, dp_value_complete, leaf_purity_value = evaluate(
        "", ae_module, cluster_module)
    result_file = Path(result_dir, f"results_{dataset_name}.txt")
    write_header = not result_file.exists()
    with open(result_file, "a+") as f:
        if write_header:
            f.write(
                "#\"ae_model_name\"\t\"Dendrogram_Purity Single\"\t\"Dendrogram_Purity Complete\"\t\"Leaf_Purity\t(Std)\"\n"
            )
        f.write(
            f"{ae_model_path.name}\t{dp_value_single}\t{dp_value_complete}\t{leaf_purity_value}\n"
        )
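loss_dec_compression follows the DEC objective of Xie et al. (2016): soft assignments under a Student's t kernel, sharpened into a target distribution, trained with a KL divergence. A sketch of that published formulation, assuming `centers` is the (n_clusters, embedding_dim) parameter held by the cluster module (the repository's exact implementation may differ):

import torch


def dec_compression_loss_sketch(embedded, centers, alpha=1.0):
    # Soft assignment q_ij: Student's t kernel between points and centers.
    dist_sq = torch.cdist(embedded, centers) ** 2
    q = (1.0 + dist_sq / alpha) ** (-(alpha + 1.0) / 2.0)
    q = q / q.sum(dim=1, keepdim=True)
    # Target distribution p_ij sharpens confident assignments; no gradient
    # should flow through the target.
    weight = q ** 2 / q.sum(dim=0)
    p = (weight.t() / weight.sum(dim=1)).t().detach()
    # KL(P || Q) pulls points toward the centers they already lean to.
    return torch.sum(p * torch.log(p / q))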
Code example #7
def run_experiment(ae_model_path):
    separator = "+" * 82
    logger.info(separator)
    logger.info(separator)
    logger.info(f"Working now on {ae_model_path.name}")
    logger.info(separator)
    logger.info(separator)
    new_seed = random.randint(0, 1000)
    logger.info(f"Seed value for this is: {new_seed}")
    set_random_seed(new_seed)

    ae_module = stacked_ae(pt_data.shape[1], [500, 500, 2000, 10],
                           weight_initalizer=torch.nn.init.xavier_normal_,
                           activation_fn=lambda x: F.relu(x),
                           loss_fn=None,
                           optimizer_fn=None)

    model_data = torch.load(ae_model_path, map_location='cpu')
    ae_module.load_state_dict(model_data)
    ae_module = ae_module.cuda()

    # Get embedded data
    embedded_data = None
    for batch_data in torch.utils.data.DataLoader(pt_data,
                                                  batch_size=256,
                                                  shuffle=False):
        embedded_batch_np = ae_module.forward(
            batch_data.cuda())[0].detach().cpu().numpy()
        if embedded_data is None:
            embedded_data = embedded_batch_np
        else:
            embedded_data = np.concatenate([embedded_data, embedded_batch_np],
                                           0)
    del ae_module

    sl_cl = AgglomerativeClustering(compute_full_tree=True,
                                    n_clusters=n_clusters,
                                    linkage="single").fit(embedded_data)
    sl_labels = sl_cl.labels_
    sl_purity_tree = prune_dendrogram_purity_tree(
        to_dendrogram_purity_tree(sl_cl.children_), n_leaf_nodes_final)
    sl_nmi = nmi(gold_labels, sl_labels, average_method='arithmetic')
    sl_acc = cluster_acc(sl_labels, gold_labels)[0]
    sl_purity = dendrogram_purity(sl_purity_tree, gold_labels)
    sl_lp = leaf_purity(sl_purity_tree, gold_labels)
    sl_leaf_purity_value = f"{sl_lp[0]:1.3}\t({sl_lp[1]:1.3})"

    result_file_sl = Path(
        f"{result_dir}/results_ae_agglo_single_{dataset_name}.txt")
    write_header = not result_file_sl.exists()
    with open(result_file_sl, "a+") as f:
        if write_header:
            f.write(
                "#\"ae_model_name\"\t\"NMI\"\t\"ACC\"\t\"Dendrogram_Purity\"\t\"Leaf_Purity\t(Std)\"\n"
            )
        f.write(
            f"{ae_model_path.name}\t{sl_nmi}\t{sl_acc}\t{sl_purity}\t{sl_leaf_purity_value}\n"
        )
    del sl_cl, sl_labels, sl_purity_tree

    cl_cl = AgglomerativeClustering(compute_full_tree=True,
                                    n_clusters=n_clusters,
                                    linkage="complete").fit(embedded_data)
    cl_labels = cl_cl.labels_
    cl_purity_tree = prune_dendrogram_purity_tree(
        to_dendrogram_purity_tree(cl_cl.children_), n_leaf_nodes_final)
    cl_nmi = nmi(gold_labels, cl_labels, average_method='arithmetic')
    cl_acc = cluster_acc(cl_labels, gold_labels)[0]
    cl_purity = dendrogram_purity(cl_purity_tree, gold_labels)
    cl_lp = leaf_purity(cl_purity_tree, gold_labels)
    cl_leaf_purity_value = f"{cl_lp[0]:1.3}\t({cl_lp[1]:1.3})"

    result_file_cl = Path(
        f"{result_dir}/results_ae_agglo_complete_{dataset_name}.txt")
    write_header = not result_file_cl.exists()
    with open(result_file_cl, "a+") as f:
        if write_header:
            f.write(
                "#\"ae_model_name\"\t\"NMI\"\t\"ACC\"\t\"Dendrogram_Purity\"\t\"Leaf_Purity\t(Std)\"\n"
            )
        f.write(
            f"{ae_model_path.name}\t{cl_nmi}\t{cl_acc}\t{cl_purity}\t{cl_leaf_purity_value}\n"
        )
    del cl_cl, cl_labels, cl_purity_tree
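leaf_purity returns a (mean, std) pair, consumed above as lp[0] and lp[1]. The metric itself: for every leaf of the tree, take the fraction of its points that share the leaf's majority gold label, then aggregate over leaves. A sketch under the assumption that each leaf is represented as an array of point indices and that the mean is size-weighted (the repository may represent the tree and aggregate differently):

import numpy as np


def leaf_purity_sketch(leaf_index_lists, gold_labels):
    purities, sizes = [], []
    for indices in leaf_index_lists:
        labels = gold_labels[indices]
        # Fraction of points carrying the leaf's majority label.
        purities.append(np.bincount(labels).max() / labels.size)
        sizes.append(labels.size)
    purities, sizes = np.array(purities), np.array(sizes)
    mean = float((purities * sizes).sum() / sizes.sum())
    return mean, float(purities.std())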
Code example #8
File: pretrain_ae.py Project: Mcpaeis/Liger

for index in range(0, 10):
    logging.info(f"Start training ae {index}")

    train = torch.utils.data.TensorDataset(pt_data)
    train_loader = torch.utils.data.DataLoader(train,
                                               batch_size=256,
                                               shuffle=True,
                                               pin_memory=True)

    # Original DEC paper AE
    ae = stacked_ae(n_features,
                    ae_layout,
                    weight_initalizer=torch.nn.init.xavier_normal_,
                    activation_fn=lambda x: F.relu(x),
                    loss_fn=loss_fn,
                    optimizer_fn=lambda parameters: torch.optim.Adam(
                        parameters, lr=0.0001)).cuda()

    def add_noise(batch):
        # Denoising-AE corruption: randomly zero out ~20% of the inputs
        # (each entry is kept with probability 0.8).
        mask = torch.empty(batch.shape, device=batch.device).bernoulli_(0.8)
        return batch * mask

    ae.pretrain(train_loader,
                rounds_per_layer=steps_per_layer,
                dropout_rate=0.2,
                corruption_fn=add_noise)

    logging.info(f"Complete data loss after pretraining {get_total_loss()}")