Esempio n. 1
0
def get_classifier(path, clf):
    """Wrap ``clf`` in a Flair-based ``BasicClassifier`` and try to load it.

    The classifier is built on stacked ``news-forward-fast`` /
    ``news-backward-fast`` Flair embeddings, the module-level ``wiki_file``
    and ``lowercase=True``.

    Args:
        path: Passed to ``load_model`` as the location of a saved model.
        clf: Estimator instance handed to ``BasicClassifier``.

    Returns:
        The ``BasicClassifier`` instance. NOTE: if ``load_model`` raises
        ``FileNotFoundError`` the instance is returned exactly as constructed;
        the fallback training and saving calls are currently disabled even
        though the printed messages still mention training.
    """
    flair_directions = [
        FlairEmbeddings('news-forward-fast'),
        FlairEmbeddings('news-backward-fast'),
    ]
    embeddings = StackedEmbeddings(flair_directions)
    classifier = BasicClassifier(
        clf,
        embeddings,
        emb_type="flair",
        wiki_file=wiki_file,
        lowercase=True,
    )

    try:
        classifier.load_model(path=path)
    except FileNotFoundError:
        print("No saved model was found for the given path. Training one instead...")
        # Disabled fallback: train_model(data_files=train_sets + dev_set)
        print("Done training")
        # Disabled fallback: save_model(path)

    return classifier
    def test_model(train_ds, train_string, dev_ds, dev_string):
        """Train each classifier on ``train_ds + dev_ds`` and record test scores.

        For every (classifier, name) pair the training time and the
        recall/precision/F1 returned by ``BasicClassifier.test_model`` on the
        ITAC, CoNLL and RSICS test files are appended to the result lists that
        live outside this function (``model_list``, ``time_list``, ``itac_*``,
        ``conll_*``, ``rsics_*`` ...). ``embeddings``, ``clf_names``,
        ``wiki_file`` and the test file names are likewise taken from the
        surrounding scope.

        Args:
            train_ds: Training data files; concatenated with ``dev_ds``.
            train_string: Label stored in the ``train_sets`` column.
            dev_ds: Development data files; concatenated with ``train_ds``.
            dev_string: Label stored in the ``dev_sets`` column.
        """
        estimators = [
            AdaBoostClassifier(),
            LogisticRegression(class_weight="balanced"),
            SGDClassifier(class_weight="balanced"),
            LinearSVC(class_weight="balanced"),
            RandomForestClassifier(class_weight="balanced"),
            GradientBoostingClassifier(),
        ]

        for estimator, name in zip(estimators, clf_names):
            model_list.append(name)
            train_list.append(train_string)
            dev_list.append(dev_string)

            classifier = BasicClassifier(
                estimator, embeddings, emb_type="flair", wiki_file=wiki_file)

            started = time.time()
            classifier.train_model(data_files=train_ds + dev_ds)
            time_list.append(time.time() - started)

            # One evaluation per test file: (file, f1 list, precision list, recall list).
            for test_file, f1_scores, precisions, recalls in (
                (itac_test, itac_f, itac_p, itac_r),
                (conll_test, conll_f, conll_p, conll_r),
                (rsics_test, rsics_f, rsics_p, rsics_r),
            ):
                recall, precision, f1 = classifier.test_model(test_file)
                f1_scores.append(f1)
                precisions.append(precision)
                recalls.append(recall)

            # The CSV is rewritten after every classifier, so results gathered
            # so far are on disk even if a later classifier fails.
            result_df = pd.DataFrame()
            for column, values in (
                ("clf_type", model_list),
                ("train_sets", train_list),
                ("dev_sets", dev_list),
                ("train_time", time_list),
                ("precision_itac", itac_p),
                ("recall_itac", itac_r),
                ("f1_itac", itac_f),
                ("precision_conll", conll_p),
                ("recall_conll", conll_r),
                ("f1_conll", conll_f),
                ("precision_rsics", rsics_p),
                ("recall_rsics", rsics_r),
                ("f1_rsics", rsics_f),
            ):
                result_df[column] = values
            result_df.to_csv("results/19_06_06/bc_flair_datasets.csv")
    def test_models(embeddings, emb_type):
        """Train each classifier on ``train_sets`` with the given embeddings.

        Training time and the recall/precision/F1 on the ITAC and CoNLL test
        files are appended to result lists owned by the surrounding scope
        (``embedding_list``, ``model_list``, ``time_list``, ``itac_*``,
        ``conll_*``). Once all classifiers have run, the accumulated lists are
        written to ``results/19_06_06/bc_flair_embeddings2.csv``.

        Args:
            embeddings: Embedding object passed to ``BasicClassifier``.
            emb_type: Label stored in the ``embedding_type`` column. The
                ``BasicClassifier`` itself is always created with
                ``emb_type="flair"``.
        """
        # BayesianGaussianMixture and GaussianNB were part of an earlier
        # version of this list and are currently left out.
        estimators = [
            AdaBoostClassifier(),
            LogisticRegression(class_weight="balanced"),
            SGDClassifier(class_weight="balanced"),
            LinearSVC(class_weight="balanced"),
            RandomForestClassifier(class_weight="balanced"),
            GradientBoostingClassifier(),
        ]

        for estimator, name in zip(estimators, clf_names):
            embedding_list.append(emb_type)
            model_list.append(name)

            classifier = BasicClassifier(
                estimator, embeddings, emb_type="flair", wiki_file=wiki_file)
            started = time.time()
            classifier.train_model(data_files=train_sets)
            time_list.append(time.time() - started)

            # One evaluation per test file: (file, f1 list, precision list, recall list).
            for test_file, f1_scores, precisions, recalls in (
                (itac_test, itac_f, itac_p, itac_r),
                (conll_test, conll_f, conll_p, conll_r),
            ):
                recall, precision, f1 = classifier.test_model(test_file)
                f1_scores.append(f1)
                precisions.append(precision)
                recalls.append(recall)

        result_df = pd.DataFrame()
        for column, values in (
            ("embedding_type", embedding_list),
            ("clf_type", model_list),
            ("train_time", time_list),
            ("precision_itac", itac_p),
            ("recall_itac", itac_r),
            ("f1_itac", itac_f),
            ("precision_conll", conll_p),
            ("recall_conll", conll_r),
            ("f1_conll", conll_f),
        ):
            result_df[column] = values
        result_df.to_csv("results/19_06_06/bc_flair_embeddings2.csv")
Esempio n. 4
0
def test_depth_n_trees():
    """Sweep random-forest ``max_depth`` and ``n_estimators`` and log scores.

    For every depth in 5..19 and every tree count in (10, 20, 50, 100) a
    ``BasicClassifier`` backed by 50-dimensional GloVe embeddings is trained
    on ``train_sets`` and evaluated on the ITAC and CoNLL test files. Training
    time and recall/precision/F1 per run are written to
    ``results/random_forest/n_trees_depth_search2.csv`` after the sweep.
    """
    depths, tree_counts, train_times = [], [], []
    itac_scores = {"precision": [], "recall": [], "f1": []}
    conll_scores = {"precision": [], "recall": [], "f1": []}

    for max_depth in range(5, 20):  # previously tried: [5, 10, 20, 50, None]
        for n_trees in (10, 20, 50, 100):
            depths.append(max_depth)
            tree_counts.append(n_trees)
            print("\nTraining {} trees with max_depth {}".format(n_trees, max_depth))

            glove = GloveEmbeddings(path="embeddings/glove/glove.6B.50d.txt",
                                    dim=50)
            forest = RandomForestClassifier(n_estimators=n_trees,
                                            max_depth=max_depth)
            classifier = BasicClassifier(model=forest,
                                         emb_man=glove,
                                         wiki_file=wiki_file)

            started = time.time()
            classifier.train_model(data_files=train_sets)
            train_times.append(time.time() - started)

            for test_file, scores in ((itac_test, itac_scores),
                                      (conll_test, conll_scores)):
                recall, precision, f1 = classifier.test_model(test_file)
                scores["f1"].append(f1)
                scores["precision"].append(precision)
                scores["recall"].append(recall)

    result_df = pd.DataFrame()
    for column, values in (
        ("n_estimators", tree_counts),
        ("max_depth", depths),
        ("train_time", train_times),
        ("precision_itac", itac_scores["precision"]),
        ("recall_itac", itac_scores["recall"]),
        ("f1_itac", itac_scores["f1"]),
        ("precision_conll", conll_scores["precision"]),
        ("recall_conll", conll_scores["recall"]),
        ("f1_conll", conll_scores["f1"]),
    ):
        result_df[column] = values
    result_df.to_csv("results/random_forest/n_trees_depth_search2.csv")
Esempio n. 5
0
# Training hyper-parameters. Only LR_DECAY is used in the script below (as
# ``lr_factor``); the others are presumably consumed elsewhere - TODO confirm.
LR_DECAY = 0.5
EPOCH_PATIENCE = 6
MAX_EPOCHS = 20
TEACHER_FORCING = 0.5


if __name__ == '__main__':
    # Dump misclassifications of a saved Flair pointer model and of a freshly
    # trained random-forest classifier on the ITAC and CoNLL test files.
    embeddings = StackedEmbeddings([
        FlairEmbeddings('news-forward-fast'),
        FlairEmbeddings('news-backward-fast'),
    ])
    model_file = "pointer/models/19_06_06/flair_itac+conll.pt"

    man = FlairPointerManager(
        embeddings,
        learning_rate=START_LR,
        lr_factor=LR_DECAY,
        lr_patience=LR_PATIENCE,
        n_encoder_layers=1,
        n_decoder_layers=1,
        cuda_device=0,
    )
    man.load_model(model_file)
    for test_file, report_file in (
        ("data/standardized/itac_test.txt",
         "results/19_06_06/flair_ptr_misclassifications_itac.csv"),
        ("data/standardized/conll_test.txt",
         "results/19_06_06/flair_ptr_misclassifications_conll.csv"),
    ):
        man.test_misclassifications([test_file], report_file)

    print("Testing and training Random Forest")

    clf = RandomForestClassifier(n_estimators=100, class_weight="balanced")
    model = BasicClassifier(clf, embeddings, emb_type="flair")
    model.train_model(data_files=[
        "data/standardized/conll_train.txt",
        "data/standardized/itac_dev.txt",
        "data/standardized/itac_train0.txt",
    ])
    for test_file, report_file in (
        ("data/standardized/itac_test.txt",
         "results/19_06_06/flair_bc_misclassifications_itac.csv"),
        ("data/standardized/conll_test.txt",
         "results/19_06_06/flair_bc_misclassifications_conll.csv"),
    ):
        model.test_misclassifications([test_file], report_file)

Esempio n. 6
0
def bc_test_case_sensitivity():
    """Compare classifiers trained with ``lowercase=True`` vs ``lowercase=False``.

    Each of the six classifiers is trained on ``train_sets + dev_set`` using
    stacked Flair embeddings, first with lowercasing enabled and then, with
    freshly constructed estimators, with it disabled. Training time and
    recall/precision/F1 on the ITAC, CoNLL and RSICS test files are collected
    and ``results/19_07_02/bc_case_sensitivity.csv`` is rewritten after every
    single classifier run.
    """
    embeddings = StackedEmbeddings([
        FlairEmbeddings('news-forward-fast'),
        FlairEmbeddings('news-backward-fast'),
    ])

    names_log, case_log, train_times = [], [], []
    itac = {"precision": [], "recall": [], "f1": []}
    conll = {"precision": [], "recall": [], "f1": []}
    rsics = {"precision": [], "recall": [], "f1": []}

    estimator_names = [
        "AdaBoostClassifier", "LogisticRegression", "SGDClassifier",
        "LinearSVC", "RandomForest", "GradientBoosting"
    ]

    for lowercase in (True, False):
        # Fresh, unfitted estimators for each pass.
        estimators = [
            AdaBoostClassifier(),
            LogisticRegression(class_weight="balanced"),
            SGDClassifier(class_weight="balanced"),
            LinearSVC(class_weight="balanced"),
            RandomForestClassifier(class_weight="balanced"),
            GradientBoostingClassifier(),
        ]

        for estimator, name in zip(estimators, estimator_names):
            names_log.append(name)
            case_log.append(str(lowercase))  # stored as "True" / "False"

            classifier = BasicClassifier(estimator,
                                         embeddings,
                                         emb_type="flair",
                                         wiki_file=wiki_file,
                                         lowercase=lowercase)

            started = time.time()
            classifier.train_model(data_files=train_sets + dev_set)
            train_times.append(time.time() - started)

            for test_file, scores in ((itac_test, itac),
                                      (conll_test, conll),
                                      (rsics_test, rsics)):
                recall, precision, f1 = classifier.test_model(test_file)
                scores["f1"].append(f1)
                scores["precision"].append(precision)
                scores["recall"].append(recall)

            # Rewritten after every run so partial results are kept on disk.
            result_df = pd.DataFrame()
            for column, values in (
                ("clf_type", names_log),
                ("lowercase", case_log),
                ("train_time", train_times),
                ("precision_itac", itac["precision"]),
                ("recall_itac", itac["recall"]),
                ("f1_itac", itac["f1"]),
                ("precision_conll", conll["precision"]),
                ("recall_conll", conll["recall"]),
                ("f1_conll", conll["f1"]),
                ("precision_rsics", rsics["precision"]),
                ("recall_rsics", rsics["recall"]),
                ("f1_rsics", rsics["f1"]),
            ):
                result_df[column] = values
            result_df.to_csv("results/19_07_02/bc_case_sensitivity.csv")
def binary_classification_bpemb_glove():
    """Evaluate classifiers on combined GloVe + BPEmb embeddings.

    Iterates over every ``(path, dim)`` pair in ``glove_embeddings`` and every
    dimension in ``bpemb_dims``; the BPEmb vocabulary size is fixed to 50000.
    For each combination all eight estimators are trained on ``train_sets``
    and evaluated on the ITAC and CoNLL test files. Training time and
    recall/precision/F1 are written to
    ``results/19_05_11b/bc_bpemb_glove_search.csv`` after the whole sweep.
    """
    estimator_names = [
        "AdaBoostClassifier", "LogisticRegression", "SGDClassifier",
        "BayesianGaussianMixture", "GaussianNB", "LinearSVC", "RandomForest",
        "GradientBoosting"
    ]
    glove_dim_log, bpemb_dim_log, train_times, names_log = [], [], [], []
    itac = {"precision": [], "recall": [], "f1": []}
    conll = {"precision": [], "recall": [], "f1": []}

    for glove_path, glove_dim in glove_embeddings:
        for bpemb_dim in bpemb_dims:
            # Fresh, unfitted estimators for each embedding combination.
            estimators = [
                AdaBoostClassifier(),
                LogisticRegression(class_weight="balanced"),
                SGDClassifier(class_weight="balanced"),
                BayesianGaussianMixture(),
                GaussianNB(),
                LinearSVC(class_weight="balanced"),
                RandomForestClassifier(class_weight="balanced"),
                GradientBoostingClassifier(),
            ]

            glove = GloveEmbeddings(path=glove_path, dim=glove_dim)
            bpemb = BPEmbeddings(dim=bpemb_dim, bp_vocab_size=50000)
            combined = CombinedEmbeddings([glove, bpemb])

            for estimator, name in zip(estimators, estimator_names):
                glove_dim_log.append(glove_dim)
                bpemb_dim_log.append(bpemb_dim)
                names_log.append(name)
                classifier = BasicClassifier(model=estimator,
                                             emb_man=combined,
                                             wiki_file=wiki_file)

                started = time.time()
                classifier.train_model(data_files=train_sets)
                train_times.append(time.time() - started)

                for test_file, scores in ((itac_test, itac),
                                          (conll_test, conll)):
                    recall, precision, f1 = classifier.test_model(test_file)
                    scores["f1"].append(f1)
                    scores["precision"].append(precision)
                    scores["recall"].append(recall)

    result_df = pd.DataFrame()
    for column, values in (
        ("clf_type", names_log),
        ("glove_dim", glove_dim_log),
        ("bpemb dim", bpemb_dim_log),
        ("train_time", train_times),
        ("precision_itac", itac["precision"]),
        ("recall_itac", itac["recall"]),
        ("f1_itac", itac["f1"]),
        ("precision_conll", conll["precision"]),
        ("recall_conll", conll["recall"]),
        ("f1_conll", conll["f1"]),
    ):
        result_df[column] = values
    result_df.to_csv("results/19_05_11b/bc_bpemb_glove_search.csv")
def binary_classification_pure_bpemb():
    """Evaluate classifiers on BPEmb embeddings alone.

    Iterates over every vocabulary size in ``bpemb_vocab_sizes`` and every
    dimension in ``bpemb_dims``. For each combination all eight estimators are
    trained on ``train_sets`` and evaluated on the ITAC and CoNLL test files.
    Training time and recall/precision/F1 are written to
    ``results/19_05_11b/bc_pure_bpemb_grid_search.csv`` after the whole sweep.
    """
    estimator_names = [
        "AdaBoostClassifier", "LogisticRegression", "SGDClassifier",
        "BayesianGaussianMixture", "GaussianNB", "LinearSVC", "RandomForest",
        "GradientBoosting"
    ]
    vocab_size_log, dim_log, train_times, names_log = [], [], [], []
    itac = {"precision": [], "recall": [], "f1": []}
    conll = {"precision": [], "recall": [], "f1": []}

    for vocab_size in bpemb_vocab_sizes:
        for dim in bpemb_dims:
            # Fresh, unfitted estimators for each embedding configuration.
            estimators = [
                AdaBoostClassifier(),
                LogisticRegression(class_weight="balanced"),
                SGDClassifier(class_weight="balanced"),
                BayesianGaussianMixture(),
                GaussianNB(),
                LinearSVC(class_weight="balanced"),
                RandomForestClassifier(class_weight="balanced"),
                GradientBoostingClassifier(),
            ]
            bpemb = BPEmbeddings(bp_vocab_size=vocab_size, dim=dim)

            for estimator, name in zip(estimators, estimator_names):
                vocab_size_log.append(vocab_size)
                dim_log.append(dim)
                names_log.append(name)
                classifier = BasicClassifier(model=estimator,
                                             emb_man=bpemb,
                                             wiki_file=wiki_file)

                started = time.time()
                classifier.train_model(data_files=train_sets)
                train_times.append(time.time() - started)

                for test_file, scores in ((itac_test, itac),
                                          (conll_test, conll)):
                    recall, precision, f1 = classifier.test_model(test_file)
                    scores["f1"].append(f1)
                    scores["precision"].append(precision)
                    scores["recall"].append(recall)

    result_df = pd.DataFrame()
    for column, values in (
        ("clf_type", names_log),
        ("vocabulary_size", vocab_size_log),
        ("bpemb dim", dim_log),
        ("train_time", train_times),
        ("precision_itac", itac["precision"]),
        ("recall_itac", itac["recall"]),
        ("f1_itac", itac["f1"]),
        ("precision_conll", conll["precision"]),
        ("recall_conll", conll["recall"]),
        ("f1_conll", conll["f1"]),
    ):
        result_df[column] = values
    result_df.to_csv("results/19_05_11b/bc_pure_bpemb_grid_search.csv")