def main():
    """Train the final entailment model and write predictions for a test set.

    Trains a random forest on the combined SICK train + trial data using
    bag-of-words-with-POS features, round-trips the model through a pickle
    file, evaluates it when gold labels are available, and writes one
    ``pair_ID:prediction`` line per test example to ``--out``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--pmb")
    parser.add_argument("--sick")
    parser.add_argument("--sick2pd")
    parser.add_argument("--out")
    args = parser.parse_args()

    # Local import: DataFrame.append is deprecated (removed in pandas 2.x),
    # so the train/trial frames are combined with pd.concat instead.
    import pandas as pd

    # Final model: train on SICK train + trial combined.
    data = Loader.load_data("../NLI2FOLI/SICK/SICK_train.txt")
    data = pd.concat([data, Loader.load_data("../NLI2FOLI/SICK/SICK_trial.txt")])
    test = Loader.load_data(args.sick)

    data["postags"] = FeatureExtractor.postag_tokenizer(data["tokens"])
    test["postags"] = FeatureExtractor.postag_tokenizer(test["tokens"])

    # CountVectorizer features with the POS tag appended to each token.
    # The "postags" column is already tokenized, so tokenizer and
    # preprocessor are identity functions.
    bag_of_words_plus_pos = ColumnTransformer([
        ("POS",
         CountVectorizer(tokenizer=lambda x: x, preprocessor=lambda x: x),
         "postags")
    ])

    m = Model(data)
    m.add_feature(bag_of_words_plus_pos, "Feature")
    m.train_model(
        RandomForestClassifier(n_estimators=900,
                               criterion="entropy",
                               max_depth=729))

    # Round-trip through pickle so the artifact we evaluate is exactly the
    # serialized model that would be shipped.
    with open('model.pkl', 'wb') as fid:
        pickle.dump(m, fid)
    with open('model.pkl', 'rb') as fid:
        m = pickle.load(fid)

    try:
        # Labeled test data: evaluate and print a confusion matrix.
        m.test_model(test, test["entailment_judgment"])
        labels = m.model.classes_
        cm = confusion_matrix(test["entailment_judgment"], m.prediction)
        print_cm(cm, labels)
    except KeyError:
        # No "entailment_judgment" column: unlabeled data, predict only.
        # (Was a bare ``except:``, which also swallowed KeyboardInterrupt
        # and genuine bugs.)
        m.test_model(test)

    with open(args.out, "w") as file:
        for idx, pred in enumerate(m.prediction):
            pid = test.iloc[idx]['pair_ID']
            file.write("{}:{}\n".format(pid, pred))
    print("Successfully generated prediction on test data.")
def test():
    """Grid-search feature combinations x classifiers on SICK train/trial.

    Builds derived feature columns on both splits, assembles every feature
    transformer and candidate classifier, and hands them to ``search``,
    which prints the accuracies, exports them to a csv file, and returns
    them as a dictionary.
    """
    train_df = Loader.load_data("../NLI2FOLI/SICK/SICK_train.txt")
    trial_df = Loader.load_data("../NLI2FOLI/SICK/SICK_trial.txt")

    pos_encoder = FeatureExtractor.generate_postag_onehot(train_df["tokens"])

    # Derived columns consumed by the transformers below.
    train_df["postags"] = FeatureExtractor.postag_tokenizer(train_df["tokens"])
    trial_df["postags"] = FeatureExtractor.postag_tokenizer(trial_df["tokens"])
    train_df["antons"] = FeatureExtractor.antonym_relations(train_df["pair_ID"])
    trial_df["antons"] = FeatureExtractor.antonym_relations(trial_df["pair_ID"])
    train_df["synons"] = FeatureExtractor.synonym_relations(
        train_df["tokens"], train_df["pair_ID"])
    trial_df["synons"] = FeatureExtractor.synonym_relations(
        trial_df["tokens"], trial_df["pair_ID"])

    # --- Feature transformers -------------------------------------------
    # Tf-idf over each sentence column separately.
    tfidf_both = ColumnTransformer([("A", TfidfVectorizer(), "sentence_A"),
                                    ("B", TfidfVectorizer(), "sentence_B")])
    # Bag of words over tokens with their POS tag appended; the "postags"
    # column is already tokenized, so tokenizer/preprocessor are identities.
    bow_pos = ColumnTransformer([
        ("POS",
         CountVectorizer(tokenizer=lambda x: x, preprocessor=lambda x: x),
         "postags")
    ])
    # One-hot encoded POS tags, one transformer per sentence.
    onehot_pos_a = POSTAGTransformer(pos_encoder, "sentence_A", maxlen=800)
    onehot_pos_b = POSTAGTransformer(pos_encoder, "sentence_B", maxlen=800)
    # Negation features, one transformer per sentence.
    neg_a = NEGTransformer("sentence_A")
    neg_b = NEGTransformer("sentence_B")
    # Antonym / synonym relation features.
    anton_feat = DumbTransfromer("antons")
    synon_feat = DumbTransfromer("synons")

    # --- Classifiers: (estimator, display name) -------------------------
    # Hyperparameters may be tweaked freely here.
    classifiers = [
        (MultinomialNB(alpha=0.1), "Naive Bayes"),
        (KNeighborsClassifier(), "KNN"),
        (SVC(kernel="linear", C=0.7), "SVM"),
        (RandomForestClassifier(n_estimators=1000, max_depth=128),
         "Random Forest"),
        (MLPClassifier(1000), "Multi layer Perceptrons"),
    ]

    # --- Feature combinations: (transformer list, display name) ---------
    combs = [
        ([tfidf_both], "TFIDF"),
        ([bow_pos], "Combined + Postagging"),
        ([tfidf_both, onehot_pos_a, onehot_pos_b], "TFIDF + One hot postags"),
        ([onehot_pos_a, onehot_pos_b], "OneHotPosTag only"),
        ([tfidf_both, neg_a, neg_b], "TFIDF + NEGATION"),
        ([neg_a, neg_b], "NEGATION_ONLY"),
        ([tfidf_both, anton_feat, synon_feat], "TFIDF + ANTONYMS + SYNONYMS"),
        ([tfidf_both, anton_feat], "TFIDF + ANTONYMS"),
        ([tfidf_both, synon_feat], "TFIDF + SYNONYMS"),
        ([
            tfidf_both, onehot_pos_b, onehot_pos_a, neg_a, neg_b, anton_feat,
            synon_feat
        ], "All features"),
    ]

    # Evaluate every (combination, classifier) pair; results are printed,
    # exported to a csv file, and returned as a dict of accuracies.
    search(combs, classifiers, train_df, trial_df)
for line in ss_file: line = line.split("|") nr = line[0] synsets = line[1].rstrip().split(",")[:-1] wn_synsets = [] for ss in synsets: ss = wn.synset(ss) wn_synsets.append(ss) wordnet_ss[nr] = wn_synsets return wordnet_ss if __name__ == "__main__": data = Loader.load_data("../NLI2FOLI/SICK/SICK_train.txt") test = Loader.load_data("../NLI2FOLI/SICK/SICK_trial.txt") model = Model(data) encoder = FeatureExtractor.generate_postag_onehot(data["tokens"]) data["postags"] = FeatureExtractor.postag_tokenizer(data["tokens"]) test["postags"] = FeatureExtractor.postag_tokenizer(test["tokens"]) data["antons"] = FeatureExtractor.antonym_relations(data["pair_ID"]) test["antons"] = FeatureExtractor.antonym_relations(test["pair_ID"]) data["synons"] = FeatureExtractor.synonym_relations( data["tokens"], data["pair_ID"]) test["synons"] = FeatureExtractor.synonym_relations(