Example 1
import argparse
import pickle

import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import confusion_matrix

# Project-local helpers (Loader, FeatureExtractor, Model, print_cm) are
# assumed to be importable from the surrounding package.


def identity(x):
    # Module-level identity function so the vectorizer below stays picklable
    # (lambdas cannot be serialized with the standard pickle module)
    return x

def main():

    parser = argparse.ArgumentParser()
    parser.add_argument("--pmb")
    parser.add_argument("--sick", help="path to the SICK split to predict on")
    parser.add_argument("--sick2pd")
    parser.add_argument("--out", help="path for the prediction output file")
    args = parser.parse_args()

    # Final model: train on the combined train + trial splits
    data = Loader.load_data("../NLI2FOLI/SICK/SICK_train.txt")
    data = pd.concat([data, Loader.load_data("../NLI2FOLI/SICK/SICK_trial.txt")],
                     ignore_index=True)
    test = Loader.load_data(args.sick)

    data["postags"] = FeatureExtractor.postag_tokenizer(data["tokens"])
    test["postags"] = FeatureExtractor.postag_tokenizer(test["tokens"])

    # CountVectorizer features over tokens with their POS tag appended; the
    # identity tokenizer/preprocessor leave the pre-tokenized lists unchanged
    # (a plain function instead of lambdas so the model can be pickled below)
    bag_of_words_plus_pos = ColumnTransformer([
        ("POS", CountVectorizer(tokenizer=identity,
                                preprocessor=identity), "postags")
    ])

    m = Model(data)

    m.add_feature(bag_of_words_plus_pos, "Feature")

    m.train_model(
        RandomForestClassifier(n_estimators=900,
                               criterion="entropy",
                               max_depth=729))

    # Persist the trained model, then reload it so the steps below run
    # against the pickled copy
    with open('model.pkl', 'wb') as fid:
        pickle.dump(m, fid)

    with open('model.pkl', 'rb') as fid:
        m = pickle.load(fid)

    try:
        # If the test split carries gold labels, evaluate and print a
        # confusion matrix
        m.test_model(test, test["entailment_judgment"])

        labels = m.model.classes_
        cm = confusion_matrix(test["entailment_judgment"], m.prediction)

        print_cm(cm, labels)
    except KeyError:
        # No gold labels in the test data: just generate predictions
        m.test_model(test)

        with open(args.out, "w") as file:
            for idx, pred in enumerate(m.prediction):
                pid = test.iloc[idx]['pair_ID']
                file.write("{}:{}\n".format(pid, pred))

        print("Successfully generated prediction on test data.")
Example 2
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC

# Project-local helpers (Loader, FeatureExtractor, POSTAGTransformer,
# NEGTransformer, DumbTransfromer, search) are assumed to be importable
# from the surrounding package.

def test():

    data = Loader.load_data("../NLI2FOLI/SICK/SICK_train.txt")
    test = Loader.load_data("../NLI2FOLI/SICK/SICK_trial.txt")

    encoder = FeatureExtractor.generate_postag_onehot(data["tokens"])

    data["postags"] = FeatureExtractor.postag_tokenizer(data["tokens"])
    test["postags"] = FeatureExtractor.postag_tokenizer(test["tokens"])

    data["antons"] = FeatureExtractor.antonym_relations(data["pair_ID"])
    test["antons"] = FeatureExtractor.antonym_relations(test["pair_ID"])

    data["synons"] = FeatureExtractor.synonym_relations(
        data["tokens"], data["pair_ID"])
    test["synons"] = FeatureExtractor.synonym_relations(
        test["tokens"], test["pair_ID"])

    # Features
    # You don't have to do anything here
    # These are all the features we have

    # The TF-IDF features with the two sentences vectorized separately
    bag_of_words = ColumnTransformer([("A", TfidfVectorizer(), "sentence_A"),
                                      ("B", TfidfVectorizer(), "sentence_B")])

    # The CountVectorizer features with the POS tag appended to each token
    bag_of_words_plus_pos = ColumnTransformer([
        ("POS", CountVectorizer(tokenizer=lambda x: x,
                                preprocessor=lambda x: x), "postags")
    ])

    # The One-hot-encoded postag features for both sentences
    postags_A = POSTAGTransformer(encoder, "sentence_A", maxlen=800)
    postags_B = POSTAGTransformer(encoder, "sentence_B", maxlen=800)

    # The negation features for both sentences
    negation_A = NEGTransformer("sentence_A")
    negation_B = NEGTransformer("sentence_B")

    # The antonyms and synonyms features
    antons = DumbTransfromer("antons")
    synons = DumbTransfromer("synons")

    # Classifiers
    # Each classifier goes in a tuple with its display name on the right-hand side
    # You may tweak the hyperparameters

    nb = (MultinomialNB(alpha=0.1), "Naive Bayes")
    knn = (KNeighborsClassifier(), "KNN")
    svm = (SVC(kernel="linear", C=0.7), "SVM")
    forest = (RandomForestClassifier(n_estimators=1000,
                                     max_depth=128), "Random Forest")
    mlp = (MLPClassifier(hidden_layer_sizes=(1000,)), "Multi-layer Perceptron")

    classifiers = [nb, knn, svm, forest, mlp]

    # for n in range(5, 10):
    #     hyperforest = (RandomForestClassifier(
    #         n_estimators=n*100, max_depth=n**3),
    #         f"Random Forest with {n*100} estimators and a max depth of {n**3}")
    #     classifiers.append(hyperforest)

    # Feature combinations
    # Each combination is a (list of transformers, name) tuple
    combs = [([bag_of_words], "TFIDF"),
             ([bag_of_words_plus_pos], "Combined + Postagging"),
             ([bag_of_words, postags_A, postags_B], "TFIDF + One hot postags"),
             ([postags_A, postags_B], "OneHotPosTag only"),
             ([bag_of_words, negation_A, negation_B], "TFIDF + NEGATION"),
             ([negation_A, negation_B], "NEGATION_ONLY"),
             ([bag_of_words, antons, synons], "TFIDF + ANTONYMS + SYNONYMS"),
             ([bag_of_words, antons], "TFIDF + ANTONYMS"),
             ([bag_of_words, synons], "TFIDF + SYNONYMS"),
             ([
                 bag_of_words, postags_B, postags_A, negation_A, negation_B,
                 antons, synons
             ], "All features")]

    # The search function takes the feature combinations, the classifier list
    # and the train/test data. The results are printed and exported to a CSV
    # file called "search result"; it also returns a dictionary of all the
    # accuracies. A sketch of a possible implementation follows this example.
    search(combs, classifiers, data, test)
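search itself is not part of this excerpt. Below is a minimal sketch consistent with the comment above, assuming the gold labels live in an entailment_judgment column (as in Example 1) and that every transformer follows the usual scikit-learn fit/transform interface; the out_path parameter and the search_result.csv file name are assumptions.

import csv

from sklearn.metrics import accuracy_score
from sklearn.pipeline import FeatureUnion, Pipeline


def search(combs, classifiers, train, test, out_path="search_result.csv"):
    """Hypothetical sketch: fit every feature combination with every
    classifier and collect the test accuracies."""
    accuracies = {}
    for features, feat_name in combs:
        for clf, clf_name in classifiers:
            # Stack the transformers of this combination side by side
            pipe = Pipeline([
                ("features", FeatureUnion(
                    [(f"f{i}", f) for i, f in enumerate(features)])),
                ("clf", clf),
            ])
            pipe.fit(train, train["entailment_judgment"])
            acc = accuracy_score(test["entailment_judgment"],
                                 pipe.predict(test))
            accuracies[(feat_name, clf_name)] = acc
            print(f"{feat_name} / {clf_name}: {acc:.4f}")

    # Export all results to a CSV file and hand the dictionary back
    with open(out_path, "w", newline="") as f:
        writer = csv.writer(f)
        writer.writerow(["features", "classifier", "accuracy"])
        for (feat_name, clf_name), acc in accuracies.items():
            writer.writerow([feat_name, clf_name, acc])

    return accuracies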
Example 3
        # Each line maps an ID to a comma-separated list of synset names;
        # convert them to WordNet Synset objects
        for line in ss_file:
            line = line.split("|")
            nr = line[0]
            # the trailing comma yields an empty last element; drop it
            synsets = line[1].rstrip().split(",")[:-1]
            wn_synsets = []
            for ss in synsets:
                ss = wn.synset(ss)
                wn_synsets.append(ss)

            wordnet_ss[nr] = wn_synsets

    return wordnet_ss
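The parsing above implies an input format of one pair per line, ID|synset1,synset2,..., with a trailing comma. A hypothetical line would be handled like this:

from nltk.corpus import wordnet as wn

# Hypothetical input line in the format implied by the loop above
line = "211|dog.n.01,cat.n.01,\n".split("|")
nr = line[0]                                # "211"
synsets = line[1].rstrip().split(",")[:-1]  # ["dog.n.01", "cat.n.01"]
print([wn.synset(ss) for ss in synsets])    # [Synset('dog.n.01'), Synset('cat.n.01')]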


if __name__ == "__main__":
    data = Loader.load_data("../NLI2FOLI/SICK/SICK_train.txt")
    test = Loader.load_data("../NLI2FOLI/SICK/SICK_trial.txt")

    model = Model(data)

    encoder = FeatureExtractor.generate_postag_onehot(data["tokens"])

    data["postags"] = FeatureExtractor.postag_tokenizer(data["tokens"])
    test["postags"] = FeatureExtractor.postag_tokenizer(test["tokens"])

    data["antons"] = FeatureExtractor.antonym_relations(data["pair_ID"])
    test["antons"] = FeatureExtractor.antonym_relations(test["pair_ID"])

    data["synons"] = FeatureExtractor.synonym_relations(
        data["tokens"], data["pair_ID"])
    test["synons"] = FeatureExtractor.synonym_relations(