Esempio n. 1
0
def main():
    update_config(base_config)

    results = {"test_name": [], "acc": []}

    for seed in seeds:
        for train_amount in train_amounts:
            for unlab_amount in unlab_amounts:
                for test_amount in test_amounts:
                    info_to_save = {
                        "seed": seed,
                        "train_amount": train_amount,
                        "unlab_amount": unlab_amount,
                        "test_amount": test_amount
                    }

                    indices_name = str(seed) + "_" + str(
                        train_amount) + "train" + str(
                            unlab_amount) + "unlab" + str(test_amount) + "test"
                    update_config({"indices": indices_name})
                    docs, labels, tvt_idx = load_data()

                    for i in range(repeats_per_seed):
                        results = test_all_models(results, docs, labels,
                                                  tvt_idx, info_to_save)
Esempio n. 2
0
def main():
    """Run all models for every seed and language on the Reuters datasets.

    ``train_amount``, ``unlab_amount`` and ``test_amount`` are not defined in
    this function; they are read from the enclosing (module) scope. For each
    (seed, language) pair the ``dataset`` and ``indices`` config entries are
    updated, the data is loaded, and ``test_all_models`` is called
    ``repeats_per_seed`` times with the accumulated ``results`` dict.
    """
    update_config(base_config)

    results = {"test_name": [], "acc": []}

    for seed in seeds:
        # Depends only on the seed and the fixed split sizes, so it is built
        # once per seed rather than once per language.
        indices_name = "".join([
            str(seed), "_",
            str(train_amount), "train",
            str(unlab_amount), "unlab",
            str(test_amount), "test",
        ])

        for language in languages:
            info_to_save = {
                "seed": seed,
                "train_amount": train_amount,
                "unlab_amount": unlab_amount,
                "test_amount": test_amount,
                "language": language,
            }

            data_name = "reuters_%s_FULL_min5" % language
            update_config({"dataset": data_name, "indices": indices_name})
            docs, labels, tvt_idx = load_data()

            for _ in range(repeats_per_seed):
                results = test_all_models(
                    results, docs, labels, tvt_idx, info_to_save)
Esempio n. 3
0
def main():
    """Compare unique vs. shared document embeddings across data splits.

    After applying ``transductive_settings``, every combination of seed,
    unlabeled amount and labeled amount selects its index split through the
    ``indices`` config entry and loads the data. Each repeat then runs
    ``test_all_models`` twice: first with ``unique_document_embeddings`` set
    to ``True``, then with it set to ``False``.
    """
    results = {
        "test_name": [],
        "acc": [],
        "ductive": [],
        "unique_embeddings": [],
    }
    update_config(transductive_settings)

    for seed in seeds:
        for unlabeled_amount in unlabeled_amounts:
            for labeled_amount in labeled_amounts:
                info_to_save = {
                    "seed": seed,
                    "unlabeled_amount": unlabeled_amount,
                    "labeled_amount": labeled_amount,
                }

                # "<seed>_<labeled>lab<unlabeled>unlab"
                indices_name = "".join([
                    str(seed), "_",
                    str(labeled_amount), "lab",
                    str(unlabeled_amount), "unlab",
                ])
                update_config({"indices": indices_name})
                docs, labels, tvt_idx = load_data()

                for _ in range(repeats_per_seed):
                    # Same order as before: unique embeddings first.
                    for unique_embeddings in (True, False):
                        update_config(
                            {"unique_document_embeddings": unique_embeddings})
                        results = test_all_models(
                            results, docs, labels, tvt_idx, info_to_save)
Esempio n. 4
0
def main():
    """Evaluate every config on every data split and save results as it goes.

    ``results`` holds one list per column: the fixed columns ``seed``,
    ``unlabeled_amount``, ``labeled_amount`` and ``acc``, plus one column per
    name in ``variables_to_follow`` (filled from the global ``config``).
    After each successful evaluation a row is appended and ``save_dic`` is
    called with the whole dict. A failing evaluation is reported on stdout
    and skipped; ``KeyboardInterrupt`` terminates the program via ``exit()``.
    """
    results = {
        column: []
        for column in ("seed", "unlabeled_amount", "labeled_amount", "acc")
    }
    results.update((var, []) for var in variables_to_follow)

    for seed in seeds:
        for unlabeled_amount in unlabeled_amounts:
            for labeled_amount in labeled_amounts:
                # Select the index split and load the data for it.
                indices_name = "".join([
                    str(seed), "_",
                    str(labeled_amount), "lab",
                    str(unlabeled_amount), "unlab",
                ])
                update_config({"indices": indices_name})
                docs, labels, tvt_idx = load_data()

                for cfg in configs:
                    update_config(cfg)

                    for _ in range(repeats_per_seed):
                        # Progress line: "var:value," for each followed
                        # variable; the trailing comma is cut only for the
                        # "Working on" message.
                        summary_string = "".join(
                            var + ":" + str(config[var]) + ","
                            for var in variables_to_follow)
                        print("Working on {" + summary_string[:-1] + "}")

                        try:
                            result = evaluate_current_config(
                                docs, labels, tvt_idx, verbose=True)

                            # Append one row, column by column.
                            fixed_columns = (
                                ("seed", seed),
                                ("unlabeled_amount", unlabeled_amount),
                                ("labeled_amount", labeled_amount),
                                ("acc", result),
                            )
                            for column, value in fixed_columns:
                                results[column].append(value)
                            for var in variables_to_follow:
                                results[var].append(config[var])

                            save_dic(results)
                        except KeyboardInterrupt:
                            exit()
                        except Exception as e:
                            print(summary_string, "gone wrong! Error:", e)
Esempio n. 5
0
def main():
    """Train on train+val+half of the test set and compare four test graphs.

    The test indices are split in half. The model is trained on a dataset
    built from the train, validation and first-half test documents. After
    every epoch the trainer's data is swapped to report test accuracy on:

    * the training dataset itself (first test half, training docs included),
    * the first test half without training docs,
    * the second test half without training docs,
    * the second test half together with the training docs.

    All secondary datasets reuse the vocabulary of the training dataset.
    Training stops early when ``config["terminate_early"]`` is set and the
    validation loss has not improved for ``config["terminate_patience"]``
    epochs.
    """
    docs, labels, (t_idx, v_idx, test_idx) = load_data()

    half = len(test_idx) // 2
    in_training_test = test_idx[:half]
    out_training_test = test_idx[half:]

    train_val = t_idx + v_idx
    n_train_val = len(t_idx) + len(v_idx)

    # train on (train, val, in_training_test)
    train_selection = train_val + in_training_test
    train_docs = docs[train_selection]
    train_labels = labels[train_selection]
    train_and_intest_indices = (
        t_idx, v_idx, range(n_train_val, len(train_labels)))

    # Number of train / val documents kept in the "only test" graphs.
    a = 0
    b = 0
    kept_train_val = t_idx[:a] + v_idx[:b]

    # NOTE(review): the train/val entries below are the kept slices wrapped
    # in an extra list (``[[]]`` when a == b == 0), unlike the flat index
    # lists used for the other datasets -- confirm this is what
    # DocumentGraphDataset expects.

    # test on (in_training_test)
    only_in_selection = kept_train_val + in_training_test
    only_in_training_docs = docs[only_in_selection]
    only_in_training_labels = labels[only_in_selection]
    only_in_training_indices = (
        [t_idx[:a]], [v_idx[:b]],
        range(a + b, len(only_in_training_labels)))

    # test on (out_training_test)
    only_out_selection = kept_train_val + out_training_test
    only_out_training_docs = docs[only_out_selection]
    only_out_training_labels = labels[only_out_selection]
    only_out_training_indices = (
        [t_idx[:a]], [v_idx[:b]],
        range(a + b, len(only_out_training_labels)))

    # test on (train, val, out_training_test)
    outtest_selection = train_val + out_training_test
    train_and_outtest_docs = docs[outtest_selection]
    train_and_outtest_labels = labels[outtest_selection]
    train_and_outtest_indices = (
        t_idx, v_idx, range(n_train_val, len(train_and_outtest_labels)))

    train_and_intest_dataset = DocumentGraphDataset(
        train_docs, train_labels, train_and_intest_indices)
    shared_vocab = train_and_intest_dataset.vocab
    only_in_training_dataset = DocumentGraphDataset(
        only_in_training_docs,
        only_in_training_labels,
        only_in_training_indices,
        force_vocab=shared_vocab)
    only_out_training_dataset = DocumentGraphDataset(
        only_out_training_docs,
        only_out_training_labels,
        only_out_training_indices,
        force_vocab=shared_vocab)
    train_and_outtest_dataset = DocumentGraphDataset(
        train_and_outtest_docs,
        train_and_outtest_labels,
        train_and_outtest_indices,
        force_vocab=shared_vocab)

    # Evaluated after each training epoch, in this order.
    extra_eval_datasets = (
        only_in_training_dataset,
        only_out_training_dataset,
        train_and_outtest_dataset,
    )

    model = None

    best_val_loss = float('inf')
    time_since_best = 0

    for epoch in range(config["epochs"]):
        # The model and trainer are created lazily on the first epoch; later
        # epochs only switch the trainer back to the training dataset.
        if model is None:
            model = create_model(train_and_intest_dataset)
            trainer = Trainer(train_and_intest_dataset, model)
        else:
            trainer.update_data(train_and_intest_dataset)

        train_loss, val_loss = trainer.train_epoch()
        test_acc_train_and_intest = trainer.test()

        extra_accs = []
        for eval_dataset in extra_eval_datasets:
            trainer.update_data(eval_dataset)
            extra_accs.append(trainer.test())
        (test_acc_only_in_training, test_acc_only_out_training,
         test_acc_train_and_outtest) = extra_accs

        print("[epoch %02d] Train loss %.4f, Val loss %.4f" %
              (epoch, train_loss, val_loss))
        report = (
            (" acc on in training test, with training docs in graph: %.4f",
             test_acc_train_and_intest),
            (" acc on in training test, WITHOUT training docs in graph: %.4f",
             test_acc_only_in_training),
            (" acc on out of training test, WITHOUT training docs in graph: %.4f",
             test_acc_only_out_training),
            (" acc on out of training test, with training docs in graph: %.4f",
             test_acc_train_and_outtest),
        )
        for template, accuracy in report:
            print(template % accuracy)

        # Early stopping: count epochs since the last validation improvement.
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            time_since_best = 0
        else:
            time_since_best += 1

        if (config["terminate_early"]
                and time_since_best >= config["terminate_patience"]):
            print("\n[RESULT!] Final test score: see above")
            break
Esempio n. 6
0
def main():
    """Train a single model with the quick config and report its test score.

    The model is trained for up to ``config["epochs"]`` epochs. When both
    ``config["sampled_training"]`` and ``config["unsupervised_loss"]`` are
    set, ``trainer.train_epoch()`` / ``trainer.test()`` return the extra
    unsupervised loss terms, a detailed loss breakdown is printed, and the
    validation loss used for early stopping is the sum of the supervised and
    unsupervised validation losses. Otherwise a one-line summary including
    the best test accuracy seen so far is printed.

    The reported final score is the test accuracy measured at the epoch with
    the lowest validation loss. It is printed exactly once, after the loop
    (previously it was printed twice when early stopping triggered in the
    sampled/unsupervised mode).
    """
    update_config(quick_config)
    docs, labels, tvt_idx = load_data()

    dataset = DocumentGraphDataset(docs, labels, tvt_idx)

    model = create_model(dataset)

    trainer = Trainer(dataset, model)

    # trainer.save_initial_reps()

    best_val_loss = float('inf')
    time_since_best = 0
    best_val_loss_acc = 0  # test accuracy at the best validation loss
    high_score = 0  # best test accuracy seen (plain mode only)
    for i in range(config["epochs"]):

        # split for special debug printing
        if config["sampled_training"] and config["unsupervised_loss"]:
            # trainer.save_sage_reps()
            (train_loss, val_loss, unsup_train_loss_pos, unsup_train_loss_neg,
             unsup_val_loss_pos, unsup_val_loss_neg, unsup_test_pos,
             unsup_test_neg) = trainer.train_epoch()

            # Only the accuracy is reported; the test-loss terms are unused.
            (test_acc, _test_loss, _unsup_test_loss_pos,
             _unsup_test_loss_neg) = trainer.test()

            total_train = train_loss + unsup_train_loss_pos + unsup_train_loss_neg
            total_val = val_loss + unsup_val_loss_pos + unsup_val_loss_neg
            print("[epoch %02d] Test Acc %.4f (Trained %s-supervised)" %
                  (i, test_acc, config['sup_mode']))
            print("\t Train Loss: %.4f (%.4f / %.4f / %.4f) (%.0f%% sup)" %
                  (total_train, train_loss, unsup_train_loss_pos,
                   unsup_train_loss_neg, train_loss / total_train * 100))
            print("\t Val Loss: %.4f (%.4f / %.4f / %.4f) (%.0f%% sup)" %
                  (total_val, val_loss, unsup_val_loss_pos, unsup_val_loss_neg,
                   val_loss / total_val * 100))
            print("\t Training on test Losses: %.4f, %.4f, %.1f%% of total" %
                  (unsup_test_pos, unsup_test_neg,
                   (unsup_test_pos + unsup_test_neg) /
                   (unsup_test_pos + unsup_test_neg + total_train) * 100))

            # Early stopping tracks the combined validation loss.
            val_loss = total_val
        else:
            train_loss, val_loss = trainer.train_epoch()
            test_acc = trainer.test()
            high_score = max(test_acc, high_score)
            print(
                "[epoch %02d] Train loss %.4f, Val loss %.4f, Test Acc %.4f, Highscore: %.4f"
                % (i, train_loss, val_loss, test_acc, high_score))

        # Early stopping
        time_since_best += 1

        if val_loss < best_val_loss:
            best_val_loss = val_loss
            time_since_best = 0
            best_val_loss_acc = test_acc

        if config["terminate_early"] and time_since_best >= config[
                "terminate_patience"]:
            # NOTE(review): a final test pass was run here before stopping
            # and its result was never used. The call is kept in case
            # trainer.test() has side effects -- confirm and drop if not.
            trainer.test()
            break

    print("\n[RESULT!] Final test score: ", best_val_loss_acc)
Esempio n. 7
0
def main():
    """Train on random sub-splits of the data and test on the full graph.

    A model and trainer are created once from the full dataset. In every
    epoch a random number of splits (1 to 40) is drawn, the train, validation
    and test index lists are shuffled in place, and the trainer runs one
    training epoch plus a test pass on a dataset built from each split.
    Afterwards the trainer is switched back to the full dataset for a test
    pass. Per-epoch averages and the full-dataset accuracy are appended to
    ``results`` and the whole table is rewritten to
    ``./results/<experiment_name>.csv`` after every epoch.

    Split sizes use integer division, so up to ``number_of_splits - 1``
    indices per list are left out of an epoch, and a list shorter than
    ``number_of_splits`` yields empty splits.
    """
    # load data
    docs, labels, tvt_idx = load_data()
    train_idx, val_idx, test_idx = tvt_idx

    results = {
        "epochs": [],
        "average_train_loss": [],
        "average_val_loss": [],
        "average_test_acc": [],
        "split_amount": [],
        "acc": []
    }

    # create model with vocab for entire dataset
    dataset = DocumentGraphDataset(docs, labels, tvt_idx)
    model = create_model(dataset)
    trainer = Trainer(dataset, model)

    for i in range(config["epochs"]):
        number_of_splits = random.randint(1, 40)

        # split in x random divisions
        split_train_amount = len(train_idx) // number_of_splits
        split_val_amount = len(val_idx) // number_of_splits
        split_test_amount = len(test_idx) // number_of_splits

        print("Splitting %i segments, %i %i %i" %
              (number_of_splits, split_train_amount, split_val_amount,
               split_test_amount))

        # train on different random splits
        # (shuffles in place, so the lists held by tvt_idx are reordered too)
        random.shuffle(train_idx)
        random.shuffle(val_idx)
        random.shuffle(test_idx)

        train_losses = []
        val_losses = []
        test_accs = []

        for split_i in range(number_of_splits):
            split_train_idx = train_idx[split_i *
                                        split_train_amount:(split_i + 1) *
                                        split_train_amount]
            split_val_idx = val_idx[split_i * split_val_amount:(split_i + 1) *
                                    split_val_amount]
            split_test_idx = test_idx[split_i *
                                      split_test_amount:(split_i + 1) *
                                      split_test_amount]

            # NOTE(review): the original author marked this per-split dataset
            # construction with "this ain't right"; the concern is not
            # stated -- confirm before relying on these numbers.
            dataset = DocumentGraphDataset(
                docs, labels, (split_train_idx, split_val_idx, split_test_idx))

            # The model and trainer always exist at this point (created above
            # from the full dataset), so only the trainer's data is swapped.
            trainer.update_data(dataset)

            train_loss, val_loss = trainer.train_epoch()
            test_acc = trainer.test()

            print("split %02d: (%.4f, %.4f, ! %.4f !), " %
                  (split_i, train_loss, val_loss, test_acc),
                  end="")
            train_losses.append(train_loss)
            val_losses.append(val_loss)
            test_accs.append(test_acc)

        # test on entire graph
        dataset = DocumentGraphDataset(docs, labels, tvt_idx)
        trainer.update_data(dataset)
        test_acc = trainer.test()

        print("\n\n[epoch %02d] Test Acc on entire dataset %.4f\n\n" %
              (i, test_acc))

        # summary
        results["epochs"].append(i)
        results["average_train_loss"].append(
            float(sum(train_losses) / len(train_losses)))
        results["average_val_loss"].append(
            float(sum(val_losses) / len(val_losses)))
        results["average_test_acc"].append(
            float(sum(test_accs) / len(test_accs)))
        results["split_amount"].append(number_of_splits)
        results["acc"].append(test_acc)

        # Rewritten every epoch so partial results survive an interruption.
        df = pd.DataFrame(results)
        df.to_csv('./results/' + config['experiment_name'] + '.csv')