def rte_classifier():
    """Train a linear SVM on the RTE dev sets and print its test accuracy.

    The RTE1-3 development pairs are used for training and the RTE1 test
    pairs for evaluation. Both sets are featurized with ``rte_featurize``,
    a ``LinearSVC`` wrapped in ``SklearnClassifier`` is trained, and the
    accuracy is printed. Nothing is returned.
    """
    train_set = rte_corpus.pairs(
        ['rte1_dev.xml', 'rte2_dev.xml', 'rte3_dev.xml'])
    test_set = rte_corpus.pairs(['rte1_test.xml'])
    featurized_train_set = rte_featurize(train_set, True)
    featurized_test_set = rte_featurize(test_set, False, test_id=0)

    print('Training classifier...')
    svm = SklearnClassifier(LinearSVC())
    clf_svm = svm.train(featurized_train_set)
    # Alternative classifiers, left disabled:
    # clf_nb = nltk.NaiveBayesClassifier.train(featurized_train_set)
    # clf_gis = MaxentClassifier.train(featurized_train_set, 'GIS')
    # clf_iis = MaxentClassifier.train(featurized_train_set, 'IIS')
    # clf_dt = SklearnClassifier(RandomForestClassifier(random_state=0)).train(featurized_train_set)
    # clf_dt = DecisionTreeClassifier.train(featurized_train_set)

    print('Testing classifier...')
    # acc = m_accuracy(clf, featurized_test_set, test_set)
    # acc_dt = accuracy(clf_dt, featurized_test_set)
    # acc_gis = accuracy(clf_gis, featurized_test_set)
    # acc_iis = accuracy(clf_iis, featurized_test_set)
    acc_svm = accuracy(clf_svm, featurized_test_set)
    # acc_nb = accuracy(clf_nb, featurized_test_set)

    # print('rf Accuracy: %8.4f' % acc_dt)
    print('svm Accuracy: %8.4f' % acc_svm)
    # print('nb Accuracy: %8.4f' % acc_nb)
    # print('gis Accuracy: %8.4f' % acc_gis)
    # print('iis Accuracy: %8.4f' % acc_iis)
    # Call syntax (not a print statement) so the function parses on both
    # Python 2 and Python 3, like every other print in this function.
    print('===================================')
def rte_classifier(algorithm, sample_N=None):
    """Train a MaxEnt classifier on the RTE dev sets and score it on the test sets.

    :param algorithm: training algorithm name; one of ``"megam"``, ``"GIS"``
        or ``"IIS"``. It is passed unchanged to ``MaxentClassifier.train``.
    :param sample_N: if not ``None``, only the first ``sample_N`` training
        pairs and the first ``sample_N`` test pairs are used.
    :return: the trained classifier.
    :raise Exception: if ``algorithm`` is not one of the supported names.
    """
    from nltk.corpus import rte as rte_corpus

    dev_files = ["rte1_dev.xml", "rte2_dev.xml", "rte3_dev.xml"]
    eval_files = ["rte1_test.xml", "rte2_test.xml", "rte3_test.xml"]
    train_pairs = rte_corpus.pairs(dev_files)
    test_pairs = rte_corpus.pairs(eval_files)

    if sample_N is not None:
        train_pairs = train_pairs[:sample_N]
        test_pairs = test_pairs[:sample_N]

    train_features = rte_featurize(train_pairs)
    test_features = rte_featurize(test_pairs)

    # Train the classifier
    print("Training classifier...")
    if algorithm not in ("megam", "GIS", "IIS"):
        raise Exception(
            "RTEClassifier only supports these algorithms:\n "
            "'megam', 'GIS', 'IIS'.\n"
        )
    # MEGAM and the default GIS/IIS algorithms share the same entry point.
    classifier = MaxentClassifier.train(train_features, algorithm)

    print("Testing classifier...")
    score = accuracy(classifier, test_features)
    print("Accuracy: %6.4f" % score)
    return classifier
def rte_classifier(algorithm):
    """Train a MaxEnt classifier on the RTE dev sets and score it on the test sets.

    :param algorithm: one of ``'megam'``, ``'BFGS'``, ``'GIS'`` or ``'IIS'``;
        passed unchanged to ``MaxentClassifier.train``.
    :return: the trained classifier.
    :raise Exception: if ``algorithm`` is not one of the supported names.
    """
    from nltk.corpus import rte as rte_corpus

    train_set = rte_corpus.pairs(['rte1_dev.xml', 'rte2_dev.xml', 'rte3_dev.xml'])
    test_set = rte_corpus.pairs(['rte1_test.xml', 'rte2_test.xml', 'rte3_test.xml'])
    featurized_train_set = rte_featurize(train_set)
    featurized_test_set = rte_featurize(test_set)

    # Train the classifier
    print('Training classifier...')
    if algorithm in ['megam', 'BFGS']:  # MEGAM based algorithms.
        # Ensure that MEGAM is configured first.
        check_megam_config()
        # Train eagerly so that ``clf`` is a trained classifier, exactly as
        # in the GIS/IIS branch, rather than a deferred callable.
        clf = MaxentClassifier.train(featurized_train_set, algorithm)
    elif algorithm in ['GIS', 'IIS']:  # Use default GIS/IIS MaxEnt algorithm
        clf = MaxentClassifier.train(featurized_train_set, algorithm)
    else:
        err_msg = str(
            "RTEClassifier only supports these algorithms:\n "
            "'megam', 'BFGS', 'GIS', 'IIS'.\n"
        )
        raise Exception(err_msg)

    print('Testing classifier...')
    acc = accuracy(clf, featurized_test_set)
    print('Accuracy: %6.4f' % acc)
    return clf
def rte_classifier(algorithm):
    """Train a MaxEnt classifier on the RTE dev sets and score it on the test sets.

    :param algorithm: one of ``'megam'``, ``'BFGS'``, ``'GIS'`` or ``'IIS'``;
        passed unchanged to ``MaxentClassifier.train``.
    :return: the trained classifier.
    :raise Exception: if ``algorithm`` is not one of the supported names.
    """
    from nltk.corpus import rte as rte_corpus

    train_set = rte_corpus.pairs(
        ['rte1_dev.xml', 'rte2_dev.xml', 'rte3_dev.xml'])
    test_set = rte_corpus.pairs(
        ['rte1_test.xml', 'rte2_test.xml', 'rte3_test.xml'])
    featurized_train_set = rte_featurize(train_set)
    featurized_test_set = rte_featurize(test_set)

    # Train the classifier
    print('Training classifier...')
    if algorithm in ['megam', 'BFGS']:  # MEGAM based algorithms.
        # Ensure that MEGAM is configured first.
        check_megam_config()
        # Bind the trained classifier itself; a lambda here would hand a
        # plain function to accuracy() and to the caller.
        clf = MaxentClassifier.train(featurized_train_set, algorithm)
    elif algorithm in ['GIS', 'IIS']:  # Use default GIS/IIS MaxEnt algorithm
        clf = MaxentClassifier.train(featurized_train_set, algorithm)
    else:
        err_msg = str("RTEClassifier only supports these algorithms:\n "
                      "'megam', 'BFGS', 'GIS', 'IIS'.\n")
        raise Exception(err_msg)

    print('Testing classifier...')
    acc = accuracy(clf, featurized_test_set)
    print('Accuracy: %6.4f' % acc)
    return clf
def test_feature_extractor_object(self):
    """Check RTEFeatureExtractor on pair 33 of the RTE3 dev set."""
    dev_pairs = rte_corpus.pairs(['rte3_dev.xml'])
    extractor = RTEFeatureExtractor(dev_pairs[33])

    # Words of the hypothesis.
    hyp_words = extractor.hyp_words
    self.assertEqual(hyp_words, {'member', 'China', 'SCO.'})

    # Overlap between text and hypothesis, by token kind.
    word_overlap = extractor.overlap('word')
    self.assertEqual(word_overlap, set())
    ne_overlap = extractor.overlap('ne')
    self.assertEqual(ne_overlap, {'China'})

    # Hypothesis-only words.
    extra_words = extractor.hyp_extra('word')
    self.assertEqual(extra_words, {'member'})
def test_rte_feature_extraction(self):
    """Compare formatted rte_features output for six RTE1 dev pairs with the expected text."""
    pairs = rte_corpus.pairs(['rte1_dev.xml'])[:6]

    test_output = []
    for pair in pairs:
        # Extract the features once per pair instead of once per key.
        features = rte_features(pair)
        test_output.extend(
            "%-15s => %s" % (key, features[key]) for key in sorted(features)
        )

    # Remove null strings.
    expected_output = [
        line
        for line in expected_from_rte_feature_extration.strip().split('\n')
        if line
    ]
    self.assertEqual(test_output, expected_output)
def tf_idf(which):
    """Fit a fresh TF-IDF vectorizer on one RTE split and return the matrix.

    :param which: ``'train'`` selects the RTE1-3 dev files; any other value
        selects the RTE1-3 test files.
    :return: the result of ``fit_transform`` on the selected split.
    """
    vectorizer = TfidfVectorizer(binary=False,
                                 decode_error='ignore',
                                 stop_words='english')

    if which == 'train':
        xml_files = ['rte1_dev.xml', 'rte2_dev.xml', 'rte3_dev.xml']
    else:
        xml_files = ['rte1_test.xml', 'rte2_test.xml', 'rte3_test.xml']

    documents = pairs_to_list(rte_corpus.pairs(xml_files))
    return vectorizer.fit_transform(documents)
def test_feature_extractor_object(self):
    """Check RTEFeatureExtractor on pair 33 of the RTE3 dev set."""
    dev_pairs = rte_corpus.pairs(["rte3_dev.xml"])
    extractor = RTEFeatureExtractor(dev_pairs[33])

    # Words of the hypothesis.
    hyp_words = extractor.hyp_words
    assert hyp_words == {"member", "China", "SCO."}

    # Overlap between text and hypothesis, by token kind.
    word_overlap = extractor.overlap("word")
    assert word_overlap == set()
    ne_overlap = extractor.overlap("ne")
    assert ne_overlap == {"China"}

    # Hypothesis-only words.
    extra_words = extractor.hyp_extra("word")
    assert extra_words == {"member"}
def _summed_word_vectors(phrases, word_vectors):
    """Return, for each phrase, the sum of the vectors of its in-vocabulary tokens.

    A phrase with no in-vocabulary token contributes ``0`` (the value of
    ``sum([])``).
    """
    summed = []
    for phrase in phrases:
        known = []
        for token in nltk.word_tokenize(phrase):
            try:
                known.append(word_vectors[token])
            except KeyError:
                # Out-of-vocabulary token: skip it.
                continue
        summed.append(sum(known))
    return summed


def ne_word2vec_sim(pretrain_model, train=True):
    """Compute a word2vec cosine similarity for every RTE text/hypothesis pair.

    The sentences of the selected split are passed to ``mpipeline.detect_v``;
    its output is read as alternating entries (text at even indices,
    hypothesis at the following odd index).
    # NOTE(review): the function name suggests named entities while the
    # output file name says "verb" -- confirm what ``detect_v`` extracts.

    :param pretrain_model: model exposing a ``wv`` mapping from word to
        vector that raises ``KeyError`` for unknown words.
    :param train: if true use the RTE1-3 dev files, otherwise the test files.
    :return: list with one similarity per pair; ``0.0`` when either side of
        the pair has no extracted items.

    Side effect: writes one value per line to ``./w2v_verb_sim_train.txt``
    (the same file name regardless of ``train``), writing ``0.0`` for any
    entry that is a NumPy array rather than a scalar.
    """
    lst = ['rte1_test.xml', 'rte2_test.xml', 'rte3_test.xml']
    if train:
        lst = ['rte1_dev.xml', 'rte2_dev.xml', 'rte3_dev.xml']
    sents = pairs_to_list(rte_corpus.pairs(lst))
    nes = mpipeline.detect_v(sents)

    w2v_sim = []
    for i in range(0, len(nes), 2):
        text_items = nes[i]
        hyp_items = nes[i + 1]
        if text_items == [] or hyp_items == []:
            w2v_sim.append(0.0)
            continue
        total_text = _summed_word_vectors(text_items, pretrain_model.wv)
        total_hyp = _summed_word_vectors(hyp_items, pretrain_model.wv)
        # One similarity per pair, between the summed vectors of each side.
        w2v_sim.append(vec_cosine_sim(sum(total_text), sum(total_hyp)))

    # ``open`` in a ``with`` block works on Python 2 and 3 and guarantees
    # the file is closed (the ``file`` builtin is Python 2 only).
    with open('./w2v_verb_sim_train.txt', 'w') as fo:
        for value in w2v_sim:
            if isinstance(value, np.ndarray):
                fo.write("0.0\n")
            else:
                fo.write(str(value) + '\n')
    return w2v_sim
def test_rte_feature_extraction(self):
    """Compare formatted rte_features output for six RTE1 dev pairs with the expected text."""
    pairs = rte_corpus.pairs(["rte1_dev.xml"])[:6]

    test_output = []
    for pair in pairs:
        # Extract the features once per pair instead of once per key.
        features = rte_features(pair)
        test_output.extend(
            f"{key:<15} => {features[key]}" for key in sorted(features)
        )

    # Remove null strings.
    expected_output = [
        line
        for line in expected_from_rte_feature_extration.strip().split("\n")
        if line
    ]
    assert test_output == expected_output
def read_rte_from_nltk(version=3):
    """Load one RTE dataset as train/test DataFrames, caching them as CSV.

    If both ``./data/raw-rte<version>-train.csv`` and the matching
    ``-test.csv`` exist they are read back directly. Otherwise the
    ``rte<version>_dev.xml`` / ``rte<version>_test.xml`` pairs are read via
    ``rte.pairs``, converted with ``get_sentence_sample`` into
    ``text`` / ``hypothesis`` / ``label`` columns, and written to those CSV
    files.

    :param version: RTE dataset number substituted into the file names.
    :return: ``RTEData(train_frame, test_frame)``.
    """
    train_csv = './data/raw-rte{0}-train.csv'.format(version)
    test_csv = './data/raw-rte{0}-test.csv'.format(version)

    cache_present = os.path.isfile(train_csv) and os.path.isfile(test_csv)
    if not cache_present:
        dev_name = 'rte{0}_dev.xml'.format(version)
        test_name = 'rte{0}_test.xml'.format(version)
        dev_pairs = rte.pairs(dev_name)
        eval_pairs = rte.pairs(test_name)

        dev_texts, dev_hyps, dev_labels = get_sentence_sample(dev_pairs)
        eval_texts, eval_hyps, eval_labels = get_sentence_sample(eval_pairs)

        train_frame = pd.DataFrame(
            data=dict(text=dev_texts, hypothesis=dev_hyps, label=dev_labels)
        )
        test_frame = pd.DataFrame(
            data=dict(text=eval_texts, hypothesis=eval_hyps, label=eval_labels)
        )

        # Persist both splits so later calls can skip the XML parsing.
        train_frame.to_csv(train_csv, index=False, encoding='utf-8')
        test_frame.to_csv(test_csv, index=False, encoding='utf-8')
        return RTEData(train_frame, test_frame)

    train_frame = pd.read_csv(train_csv)
    test_frame = pd.read_csv(test_csv)
    return RTEData(train_frame, test_frame)
def tf_idf(which):
    """Fit a fresh TF-IDF vectorizer on one data split and return the matrix.

    :param which: ``'train'`` selects the module-level ``train_set``; any
        other value selects the RTE1-3 test pairs followed by the
        module-level ``new_1`` and ``new_2`` pairs.
    :return: the result of ``fit_transform`` on the selected pairs.
    """
    vectorizer = TfidfVectorizer(binary=False,
                                 decode_error='ignore',
                                 stop_words='english')

    if which == 'train':
        selected_pairs = train_set
    else:
        # The standard test files are extended with the two extra test sets.
        standard_test = rte_corpus.pairs(
            ['rte1_test.xml', 'rte2_test.xml', 'rte3_test.xml'])
        selected_pairs = standard_test + new_1 + new_2

    return vectorizer.fit_transform(pairs_to_list(selected_pairs))
def demo():
    """
    Demo of the random guesser for RTE

    Prints the accuracy (as a percentage) of ``RTEGuesser`` and then of
    ``RTEBoWTagger`` on the RTE1-3 test pairs.
    """
    gold = rte.pairs(('rte1_test.xml', 'rte2_test.xml', 'rte3_test.xml'))

    # Single-argument print calls behave identically on Python 2 and 3,
    # unlike the print statement, which is a syntax error on Python 3.
    tagger = RTEGuesser()
    print("=" * 20)
    print("Random guessing:")
    print("%0.3f" % (accuracy(tagger, gold) * 100))

    tagger = RTEBoWTagger()
    print("")  # blank separator line
    print("=" * 20)
    print("Bag of Words overlap:")
    print("%0.3f" % (accuracy(tagger, gold) * 100))
import nltk
from nltk.classify.maxent import MaxentClassifier
from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import rte as rte_corpus
import numpy as np
import Assignment1.DataPreprocessing as mpipeline
# from gensim.models import KeyedVectors
# from gensim.models import Word2Vec
from nltk.corpus.reader.rte import RTECorpusReader

# Training pairs: the three standard RTE development sets.
train_set = rte_corpus.pairs(['rte1_dev.xml', 'rte2_dev.xml', 'rte3_dev.xml'])

# Extra gold-labelled test sets read from a local directory. The path is a
# raw string so the backslashes are never parsed as escape sequences.
rte_newtest = RTECorpusReader(
    r'D:\workplace_py\TestWN\AssignmentB',
    ['COMP6751-RTE-10_TEST-SET_gold.xml',
     'COMP6751-RTE-30_TEST-SET_gold.xml'])

test_set_1 = rte_corpus.pairs(['rte1_test.xml'])
test_set_2 = rte_corpus.pairs(['rte2_test.xml'])
test_set_3 = rte_corpus.pairs(['rte3_test.xml'])
new_1 = rte_newtest.pairs(['COMP6751-RTE-10_TEST-SET_gold.xml'])
new_2 = rte_newtest.pairs(['COMP6751-RTE-30_TEST-SET_gold.xml'])


def rte_featurize(rte_pairs, training, test_id=0):
    """Build the ``(features, label)`` list for a sequence of RTE pairs.

    :param rte_pairs: iterable of pairs exposing a ``value`` attribute.
    :param training: forwarded unchanged to ``rte_features``.
    :param test_id: forwarded unchanged to ``rte_features``.
    :return: list of ``(rte_features(pair, index, training, test_id),
        pair.value)`` tuples, where ``index`` is the zero-based position of
        the pair in ``rte_pairs``.
    """
    return [
        (rte_features(pair, index, training, test_id), pair.value)
        for index, pair in enumerate(rte_pairs)
    ]
#!/usr/bin/python
# -*- coding: utf-8 -*-

# The RTE (Recognizing Textual Entailment) corpus was derived from the RTE1,
# RTE2 and RTE3 datasets (dev and test data), and consists of a list of
# XML-formatted 'text'/'hypothesis' pairs.

from nltk.corpus import rte

# List the files shipped with the corpus.
print(rte.fileids())  # doctest: +ELLIPSIS

# Load the pairs of two test files at once.
rtepairs = rte.pairs(['rte2_test.xml', 'rte3_test.xml'])
print(rtepairs)  # doctest: +ELLIPSIS

# Look at a single pair and its text, hypothesis and value attributes.
rtepairs[5]
rtepairs[5].text  # doctest: +NORMALIZE_WHITESPACE
rtepairs[5].hyp
rtepairs[5].value

# The raw XML tree of a file is also available.
xmltree = rte.xml('rte3_dev.xml')
xmltree  # doctest: +SKIP
xmltree[7].findtext('t')  # doctest: +NORMALIZE_WHITESPACE
from pyspark.mllib.tree import GradientBoostedTrees, GradientBoostedTreesModel
from pyspark.mllib.util import MLUtils


# Defining a function for lemmatizing the words in text and hypothesis
def lemmatize(word):
    """Return the WordNet verb lemma of ``word``, or ``word`` itself if morphy finds none."""
    lemma = nltk.corpus.wordnet.morphy(word, pos=nltk.corpus.wordnet.VERB)
    if lemma is not None:
        return lemma
    return word


# Creating a variable for english stopwords and named it as stop
stop = stopwords.words('english')

# Parse the XML
print("parsing the XML")
rte_te = rte.pairs(['/Users/cbrandenburg/documents/ie/courses/term3/nlp/textualentailmentdata.xml'])

# Parallel lists: entry i of every list describes pair i of ``rte_te``.
text_ls = []
hyp_ls = []
val_ls = []
text_tokens_ls = []
h_tokens_ls = []
text_punc_ls = []

# Iterate over the pairs directly instead of indexing by position.
for pair in rte_te:
    text_ls.append(pair.text)
    hyp_ls.append(pair.hyp)
    val_ls.append(pair.value)
    text_tokens_ls.append(word_tokenize(pair.text))
    h_tokens_ls.append(word_tokenize(pair.hyp))

# Put values into a DataFrame
from nltk.corpus import stopwords


def lemmatize(word):
    """Return the WordNet verb lemma of ``word``, or ``word`` itself if morphy finds none."""
    lemma = nltk.corpus.wordnet.morphy(word, pos=nltk.corpus.wordnet.VERB)
    if lemma is not None:
        return lemma
    return word


# English stopword list.
stop = stopwords.words('english')

# parse the XML
print("parsing the XML")
rte_te = rte.pairs(['/Users/rahulmehra/Downloads/xml_NLP.xml'])

# Parallel lists: entry i of every list describes pair i of ``rte_te``.
text_ls = []
hyp_ls = []
val_ls = []
text_tokens_ls = []
h_tokens_ls = []
text_punc_ls = []

# Iterate over the pairs directly instead of indexing by position.
for pair in rte_te:
    text_ls.append(pair.text)
    hyp_ls.append(pair.hyp)
    val_ls.append(pair.value)
    text_tokens_ls.append(word_tokenize(pair.text))
    h_tokens_ls.append(word_tokenize(pair.hyp))

#print(text_ls)
#print(hyp_ls)
import nltk
from nltk.corpus import rte as rte_corpus
import math
from nltk.tokenize import RegexpTokenizer
import test as treedis

#######################################################
# Readers and pairs for the additional local test files.
# NOTE(review): both readers are given the path of rte10.xml as their first
# argument, and rte_30 also points at rte10.xml -- confirm this is intended.
rte_10 = nltk.corpus.reader.rte.RTECorpusReader(
    "/Users/yuhaomao/Downloads/rte/rte10.xml", "rte_10.xml")
rte_30 = nltk.corpus.reader.rte.RTECorpusReader(
    "/Users/yuhaomao/Downloads/rte/rte10.xml", "rte_30.xml")
test_pair_rte10 = rte_corpus.pairs(['/Users/yuhaomao/Downloads/rte/rte10.xml'])
test_pair_rte30 = rte_corpus.pairs(['/Users/yuhaomao/Downloads/rte/rte30.xml'])
#######################################################

# Standard corpus splits: RTE1 test for evaluation, RTE1-3 dev for training.
test_pair = rte_corpus.pairs(['rte1_test.xml'])
rte_pair = rte_corpus.pairs(['rte1_dev.xml', 'rte2_dev.xml', 'rte3_dev.xml'])

for pair in rte_pair:
    # NOTE(review): both lists are re-created on every iteration, so after
    # the loop they only hold the words of the last pair -- confirm intent.
    text_tokenize = []
    hyp_token = []
    extractor = nltk.RTEFeatureExtractor(pair)
    text_tokenize.append(list(extractor.text_words))
    hyp_token.append(extractor.hyp_words)

# Raw string so the regex backslashes are not parsed as (invalid) string
# escape sequences; the resulting pattern is unchanged.
tokenizer = RegexpTokenizer(r'[\w.@:/]+|\w+|\$[\d.]+')