Code Example #1
def rte_classifier():
    train_set = rte_corpus.pairs(
        ['rte1_dev.xml', 'rte2_dev.xml', 'rte3_dev.xml'])
    test_set = rte_corpus.pairs(['rte1_test.xml'])
    featurized_train_set = rte_featurize(train_set, True)
    featurized_test_set = rte_featurize(test_set, False, test_id=0)
    print('Training classifier...')
    svm = SklearnClassifier(LinearSVC())
    clf_svm = svm.train(featurized_train_set)
    # clf_nb = nltk.NaiveBayesClassifier.train(featurized_train_set)
    # clf_gis = MaxentClassifier.train(featurized_train_set, 'GIS')
    # clf_iis = MaxentClassifier.train(featurized_train_set, 'IIS')
    # clf_dt = SklearnClassifier(RandomForestClassifier(random_state=0)).train(featurized_train_set)
    # clf_dt = DecisionTreeClassifier.train(featurized_train_set)
    print('Testing classifier...')
    # acc = m_accuracy(clf, featurized_test_set, test_set)

    # acc_dt = accuracy(clf_dt, featurized_test_set)
    # acc_gis = accuracy(clf_gis, featurized_test_set)
    # acc_iis = accuracy(clf_iis, featurized_test_set)
    acc_svm = accuracy(clf_svm, featurized_test_set)
    # acc_nb = accuracy(clf_nb, featurized_test_set)
    # print('rf Accuracy: %8.4f' % acc_dt)
    print('svm Accuracy: %8.4f' % acc_svm)
    # print('nb Accuracy: %8.4f' % acc_nb)
    # print('gis Accuracy: %8.4f' % acc_gis)
    # print('iis Accuracy: %8.4f' % acc_iis)
    print('===================================')
Code Example #2
def rte_classifier(algorithm, sample_N=None):
    from nltk.corpus import rte as rte_corpus

    train_set = rte_corpus.pairs(
        ["rte1_dev.xml", "rte2_dev.xml", "rte3_dev.xml"])
    test_set = rte_corpus.pairs(
        ["rte1_test.xml", "rte2_test.xml", "rte3_test.xml"])

    if sample_N is not None:
        train_set = train_set[:sample_N]
        test_set = test_set[:sample_N]

    featurized_train_set = rte_featurize(train_set)
    featurized_test_set = rte_featurize(test_set)

    # Train the classifier
    print("Training classifier...")
    if algorithm in ["megam"]:  # MEGAM based algorithms.
        clf = MaxentClassifier.train(featurized_train_set, algorithm)
    elif algorithm in ["GIS", "IIS"]:  # Use default GIS/IIS MaxEnt algorithm
        clf = MaxentClassifier.train(featurized_train_set, algorithm)
    else:
        err_msg = str("RTEClassifier only supports these algorithms:\n "
                      "'megam', 'GIS', 'IIS'.\n")
        raise Exception(err_msg)
    print("Testing classifier...")
    acc = accuracy(clf, featurized_test_set)
    print("Accuracy: %6.4f" % acc)
    return clf
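
For reference, both examples above depend on an rte_featurize helper; in the NLTK source it simply pairs each feature dict with the gold label:

def rte_featurize(rte_pairs):
    # Pair each pair's feature dict with its gold entailment label (0 or 1).
    return [(rte_features(pair), pair.value) for pair in rte_pairs]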
Code Example #3
File: rte_classify.py Project: prz3m/kind2anki
def rte_classifier(algorithm):
    from nltk.corpus import rte as rte_corpus

    train_set = rte_corpus.pairs(['rte1_dev.xml', 'rte2_dev.xml', 'rte3_dev.xml'])
    test_set = rte_corpus.pairs(['rte1_test.xml', 'rte2_test.xml', 'rte3_test.xml'])
    featurized_train_set = rte_featurize(train_set)
    featurized_test_set = rte_featurize(test_set)
    # Train the classifier
    print('Training classifier...')
    if algorithm in ['megam', 'BFGS']:  # MEGAM based algorithms.
        # Ensure that MEGAM is configured first.
        check_megam_config()
        clf = MaxentClassifier.train(featurized_train_set, algorithm)
    elif algorithm in ['GIS', 'IIS']:  # Use default GIS/IIS MaxEnt algorithm
        clf = MaxentClassifier.train(featurized_train_set, algorithm)
    else:
        err_msg = str(
            "RTEClassifier only supports these algorithms:\n "
            "'megam', 'BFGS', 'GIS', 'IIS'.\n"
        )
        raise Exception(err_msg)
    print('Testing classifier...')
    acc = accuracy(clf, featurized_test_set)
    print('Accuracy: %6.4f' % acc)
    return clf
Code Example #4
def rte_classifier(algorithm):
    from nltk.corpus import rte as rte_corpus

    train_set = rte_corpus.pairs(
        ['rte1_dev.xml', 'rte2_dev.xml', 'rte3_dev.xml'])
    test_set = rte_corpus.pairs(
        ['rte1_test.xml', 'rte2_test.xml', 'rte3_test.xml'])
    featurized_train_set = rte_featurize(train_set)
    featurized_test_set = rte_featurize(test_set)
    # Train the classifier
    print('Training classifier...')
    if algorithm in ['megam', 'BFGS']:  # MEGAM based algorithms.
        # Ensure that MEGAM is configured first.
        check_megam_config()
        clf = MaxentClassifier.train(featurized_train_set, algorithm)
    elif algorithm in ['GIS', 'IIS']:  # Use default GIS/IIS MaxEnt algorithm
        clf = MaxentClassifier.train(featurized_train_set, algorithm)
    else:
        err_msg = str("RTEClassifier only supports these algorithms:\n "
                      "'megam', 'BFGS', 'GIS', 'IIS'.\n")
        raise Exception(err_msg)
    print('Testing classifier...')
    acc = accuracy(clf, featurized_test_set)
    print('Accuracy: %6.4f' % acc)
    return clf
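
A minimal usage sketch for the function above, assuming the NLTK RTE corpus data has already been downloaded:

# Train and evaluate a MaxEnt entailment classifier with the IIS trainer.
clf = rte_classifier('IIS')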
Code Example #5
File: test_rte_classify.py Project: wardbradt/nltk
def test_feature_extractor_object(self):
    rtepair = rte_corpus.pairs(['rte3_dev.xml'])[33]
    extractor = RTEFeatureExtractor(rtepair)
    self.assertEqual(extractor.hyp_words, {'member', 'China', 'SCO.'})
    self.assertEqual(extractor.overlap('word'), set())
    self.assertEqual(extractor.overlap('ne'), {'China'})
    self.assertEqual(extractor.hyp_extra('word'), {'member'})
Code Example #6
File: test_rte_classify.py Project: muneson/nltk
def test_feature_extractor_object(self):
    rtepair = rte_corpus.pairs(['rte3_dev.xml'])[33]
    extractor = RTEFeatureExtractor(rtepair)
    self.assertEqual(extractor.hyp_words, {'member', 'China', 'SCO.'})
    self.assertEqual(extractor.overlap('word'), set())
    self.assertEqual(extractor.overlap('ne'), {'China'})
    self.assertEqual(extractor.hyp_extra('word'), {'member'})
Code Example #7
File: test_rte_classify.py Project: muneson/nltk
def test_rte_feature_extraction(self):
    pairs = rte_corpus.pairs(['rte1_dev.xml'])[:6]
    test_output = ["%-15s => %s" % (key, rte_features(pair)[key])
                   for pair in pairs for key in sorted(rte_features(pair))]
    expected_output = expected_from_rte_feature_extration.strip().split('\n')
    # Remove null strings.
    expected_output = list(filter(None, expected_output))
    self.assertEqual(test_output, expected_output)
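
For reference, the rte_features function these tests exercise lives in nltk.classify.rte_classify and looks roughly like this (word and named-entity overlap plus hypothesis-extra counts, plus negation cues):

def rte_features(rtepair):
    extractor = RTEFeatureExtractor(rtepair)
    features = {}
    features['alwayson'] = True
    features['word_overlap'] = len(extractor.overlap('word'))
    features['word_hyp_extra'] = len(extractor.hyp_extra('word'))
    features['ne_overlap'] = len(extractor.overlap('ne'))
    features['ne_hyp_extra'] = len(extractor.hyp_extra('ne'))
    features['neg_txt'] = len(extractor.negwords & extractor.text_words)
    features['neg_hyp'] = len(extractor.negwords & extractor.hyp_words)
    return features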
Code Example #8
def tf_idf(which):
    cv = TfidfVectorizer(binary=False,
                         decode_error='ignore',
                         stop_words='english')
    if which == 'train':
        vec = cv.fit_transform(
            pairs_to_list(
                rte_corpus.pairs(
                    ['rte1_dev.xml', 'rte2_dev.xml', 'rte3_dev.xml'])))
        # print vec.toarray()
        return vec
    else:
        vec = cv.fit_transform(
            pairs_to_list(
                rte_corpus.pairs(
                    ['rte1_test.xml', 'rte2_test.xml', 'rte3_test.xml'])))
        return vec
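
pairs_to_list is project code that is not shown here; a plausible, purely hypothetical implementation that matches how the callers index the result (text at even positions, hypothesis at odd ones):

def pairs_to_list(rte_pairs):
    # Hypothetical helper: flatten pairs into alternating
    # text/hypothesis sentence strings.
    sents = []
    for pair in rte_pairs:
        sents.append(pair.text)
        sents.append(pair.hyp)
    return sents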
Code Example #9
File: test_rte_classify.py Project: wardbradt/nltk
def test_rte_feature_extraction(self):
    pairs = rte_corpus.pairs(['rte1_dev.xml'])[:6]
    test_output = ["%-15s => %s" % (key, rte_features(pair)[key])
                   for pair in pairs for key in sorted(rte_features(pair))]
    expected_output = expected_from_rte_feature_extration.strip().split('\n')
    # Remove null strings.
    expected_output = list(filter(None, expected_output))
    self.assertEqual(test_output, expected_output)
Code Example #10
    def test_feature_extractor_object(self):
        rtepair = rte_corpus.pairs(["rte3_dev.xml"])[33]
        extractor = RTEFeatureExtractor(rtepair)

        assert extractor.hyp_words == {"member", "China", "SCO."}
        assert extractor.overlap("word") == set()
        assert extractor.overlap("ne") == {"China"}
        assert extractor.hyp_extra("word") == {"member"}
Code Example #11
def ne_word2vec_sim(pretrain_model, train=True):
    lst = ['rte1_test.xml', 'rte2_test.xml', 'rte3_test.xml']
    if train:
        lst = ['rte1_dev.xml', 'rte2_dev.xml', 'rte3_dev.xml']
    sents = pairs_to_list(rte_corpus.pairs(lst))
    nes = mpipeline.detect_v(sents)
    w2v_sim = []
    for i in range(0, len(nes), 2):
        if nes[i] == [] or nes[i + 1] == []:
            w2v_sim.append(0.0)
        else:
            total_text = []
            total_hyp = []
            mdict = {}
            for ne in nes[i]:
                # print ne.encode('gbk', "ignore")
                tks = nltk.word_tokenize(ne)
                for word in tks:
                    try:
                        mdict[word] = pretrain_model.wv[word]
                    except KeyError:
                        pass
                        # print "sth not in vocabulary"
                ne_vec = sum(mdict[word] for word in tks if word in mdict)
                total_text.append(ne_vec)  # get an entity vector and append it
            # print total_text
            for ne in nes[i + 1]:  # entities in hyp
                # print ne.encode('gbk', "ignore")
                # print nes[i+1]
                # print "======================="
                tks = nltk.word_tokenize(ne)
                for word in tks:
                    try:
                        mdict[word] = pretrain_model.wv[word]
                    except KeyError:
                        pass
                        # print "sth not in vocabulary"
                ne_vec = sum(mdict[word] for word in tks if word in mdict)
                # print("append in ", ne_vec)
                total_hyp.append(ne_vec)  # get an entity vector and append it
                # print 'TOTAL ne_vec: ', total_hyp
            # print "total a hyp of a pair", total_hyp
            # print "total a text of a pair", total_text
            # print "SUM: ", sum(total_hyp)
            sim = vec_cosine_sim(sum(total_text), sum(total_hyp))  # a pair
            # print sim
            w2v_sim.append(sim)
    with open('./w2v_verb_sim_train.txt', 'w') as fo:
        for i in w2v_sim:
            if isinstance(i, np.ndarray):
                fo.write("0.0\n")
            else:
                fo.write(str(i) + '\n')
    return w2v_sim
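
vec_cosine_sim is likewise project-specific and not shown; a minimal sketch, assuming both arguments are dense NumPy vectors:

def vec_cosine_sim(v1, v2):
    # Hypothetical helper: cosine similarity of two dense vectors.
    denom = np.linalg.norm(v1) * np.linalg.norm(v2)
    return float(np.dot(v1, v2) / denom) if denom else 0.0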
Code Example #12
def test_rte_feature_extraction(self):
    pairs = rte_corpus.pairs(["rte1_dev.xml"])[:6]
    test_output = [
        f"{key:<15} => {rte_features(pair)[key]}" for pair in pairs
        for key in sorted(rte_features(pair))
    ]
    expected_output = expected_from_rte_feature_extration.strip().split(
        "\n")
    # Remove null strings.
    expected_output = list(filter(None, expected_output))
    assert test_output == expected_output
Code Example #13
File: rte_utils.py Project: junfenglx/skip-thoughts
def read_rte_from_nltk(version=3):
    train_saved_path = './data/raw-rte{0}-train.csv'.format(version)
    test_saved_path = './data/raw-rte{0}-test.csv'.format(version)
    if os.path.isfile(train_saved_path) and os.path.isfile(test_saved_path):
        rte_train = pd.read_csv(train_saved_path)
        rte_test = pd.read_csv(test_saved_path)
        return RTEData(rte_train, rte_test)

    train_xml = 'rte{0}_dev.xml'.format(version)
    test_xml = 'rte{0}_test.xml'.format(version)
    train_pairs = rte.pairs(train_xml)
    test_pairs = rte.pairs(test_xml)
    train_ts, train_hs, train_labels = get_sentence_sample(train_pairs)
    test_ts, test_hs, test_labels = get_sentence_sample(test_pairs)
    rte_train = pd.DataFrame(
            data=dict(text=train_ts, hypothesis=train_hs, label=train_labels)
    )
    rte_test = pd.DataFrame(
            data=dict(text=test_ts, hypothesis=test_hs, label=test_labels)
    )
    rte_train.to_csv(train_saved_path, index=False, encoding='utf-8')
    rte_test.to_csv(test_saved_path, index=False, encoding='utf-8')
    return RTEData(rte_train, rte_test)
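
get_sentence_sample is not included in the snippet; a hypothetical version that unpacks pairs into parallel text/hypothesis/label lists:

def get_sentence_sample(pairs):
    # Hypothetical helper: split RTE pairs into parallel lists.
    texts = [p.text for p in pairs]
    hyps = [p.hyp for p in pairs]
    labels = [p.value for p in pairs]
    return texts, hyps, labels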
Code Example #14
def tf_idf(which):  # construct tf-idf vectors
    cv = TfidfVectorizer(binary=False,
                         decode_error='ignore',
                         stop_words='english')
    if which == 'train':
        vec = cv.fit_transform(pairs_to_list(train_set))
        return vec
    else:
        # vec = cv.fit_transform(pairs_to_list(rte_corpus.pairs(['rte1_test.xml', 'rte2_test.xml', 'rte3_test.xml'])))
        vec = cv.fit_transform(
            pairs_to_list(
                rte_corpus.pairs(
                    ['rte1_test.xml', 'rte2_test.xml', 'rte3_test.xml']) +
                new_1 + new_2))
        return vec
Code Example #15
File: bow.py Project: steven-cutting/icsisumm
def demo():
    """
    Demo of the random guesser for RTE
    """
    gold = rte.pairs(('rte1_test.xml', 'rte2_test.xml', 'rte3_test.xml'))

    tagger = RTEGuesser()
    print "=" * 20
    print "Random guessing:"
    print "%0.3f" % (accuracy(tagger, gold) * 100)

    tagger = RTEBoWTagger()
    print()
    print("=" * 20)
    print("Bag of Words overlap:")
    print("%0.3f" % (accuracy(tagger, gold) * 100))
Code Example #16
File: bow.py Project: DrDub/icsisumm
def demo():
    """
    Demo of the random guesser for RTE
    """
    gold = rte.pairs(('rte1_test.xml', 'rte2_test.xml', 'rte3_test.xml'))

    tagger = RTEGuesser()
    print "=" * 20
    print "Random guessing:"
    print "%0.3f" % (accuracy(tagger, gold) * 100)
    
    tagger = RTEBoWTagger()
    print 
    print "=" * 20
    print "Bag of Words overlap:"
    print "%0.3f" % (accuracy(tagger, gold) * 100)
Code Example #17
import nltk
from nltk.classify.maxent import MaxentClassifier
from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import rte as rte_corpus
import numpy as np
import Assignment1.DataPreprocessing as mpipeline
# from gensim.models import KeyedVectors
# from gensim.models import Word2Vec
from nltk.corpus.reader.rte import RTECorpusReader

train_set = rte_corpus.pairs(['rte1_dev.xml', 'rte2_dev.xml', 'rte3_dev.xml'])
rte_newtest = RTECorpusReader(
    r'D:\workplace_py\TestWN\AssignmentB',
    ['COMP6751-RTE-10_TEST-SET_gold.xml', 'COMP6751-RTE-30_TEST-SET_gold.xml'])
test_set_1 = rte_corpus.pairs(['rte1_test.xml'])
test_set_2 = rte_corpus.pairs(['rte2_test.xml'])
test_set_3 = rte_corpus.pairs(['rte3_test.xml'])
new_1 = rte_newtest.pairs(['COMP6751-RTE-10_TEST-SET_gold.xml'])
new_2 = rte_newtest.pairs(['COMP6751-RTE-30_TEST-SET_gold.xml'])


def rte_featurize(rte_pairs, training, test_id=0):  # construct feature list
    # Note: rte_features here is this project's own extractor (defined
    # elsewhere in the project), not nltk.classify.rte_features.
    return [(rte_features(pair, pair_id, training, test_id), pair.value)
            for pair_id, pair in enumerate(rte_pairs)]
Code Example #18
File: corpus-howto-rte.py Project: Lingwars/GAPLEN
#!/usr/bin/python
# -*- coding: utf-8 -*-
# The RTE (Recognizing Textual Entailment) corpus was derived from the
# RTE1, RTE2 and RTE3 datasets (dev and test data), and consists of a
# list of XML-formatted 'text'/'hypothesis' pairs.

from nltk.corpus import rte
print(rte.fileids())  # doctest: +ELLIPSIS
rtepairs = rte.pairs(['rte2_test.xml', 'rte3_test.xml'])
print(rtepairs)  # doctest: +ELLIPSIS
print(rtepairs[5])
print(rtepairs[5].text)  # doctest: +NORMALIZE_WHITESPACE
print(rtepairs[5].hyp)
print(rtepairs[5].value)
xmltree = rte.xml('rte3_dev.xml')
print(xmltree)  # doctest: +SKIP
print(xmltree[7].findtext('t'))  # doctest: +NORMALIZE_WHITESPACE
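
Each pair also carries its gold label in the value attribute (1 for entailment, 0 otherwise), so the label distribution of the loaded pairs can be tallied with, for example:

from collections import Counter

# Count how many loaded pairs are labeled entailment vs. not.
print(Counter(pair.value for pair in rtepairs))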
Code Example #19
import nltk
from nltk.corpus import rte, stopwords
from nltk.tokenize import word_tokenize
from pyspark.mllib.tree import GradientBoostedTrees, GradientBoostedTreesModel
from pyspark.mllib.util import MLUtils

# Defining a function for lemmatizing the words in text and hypothesis
def lemmatize(word):
    lemma = nltk.corpus.wordnet.morphy(word, pos=nltk.corpus.wordnet.VERB)
    if lemma is not None:
        return lemma
    return word

# English stopword list.
stop = stopwords.words('english')

# Parse the XML
print("parsing the XML")
rte_te = rte.pairs(['/Users/cbrandenburg/documents/ie/courses/term3/nlp/textualentailmentdata.xml'])
text_ls = []
hyp_ls = []
val_ls = []
text_tokens_ls = []
h_tokens_ls = []
text_punc_ls = []
for pair in rte_te:
    text_ls.append(pair.text)
    hyp_ls.append(pair.hyp)
    val_ls.append(pair.value)
    text_tokens_ls.append(word_tokenize(pair.text))
    h_tokens_ls.append(word_tokenize(pair.hyp))


# Put values into a DataFrame
Code Example #20
import nltk
from nltk.corpus import rte, stopwords
from nltk.tokenize import word_tokenize


# Lemmatize a word with WordNet, falling back to the word itself.
def lemmatize(word):
    lemma = nltk.corpus.wordnet.morphy(word, pos=nltk.corpus.wordnet.VERB)
    if lemma is not None:
        return lemma
    return word


# English stopword list.
stop = stopwords.words('english')
# parse the XML
print("parsing the XML")
rte_te = rte.pairs(['/Users/rahulmehra/Downloads/xml_NLP.xml'])
text_ls = []
hyp_ls = []
val_ls = []
text_tokens_ls = []
h_tokens_ls = []
text_punc_ls = []
for pair in rte_te:
    text_ls.append(pair.text)
    hyp_ls.append(pair.hyp)
    val_ls.append(pair.value)
    text_tokens_ls.append(word_tokenize(pair.text))
    h_tokens_ls.append(word_tokenize(pair.hyp))

#print(text_ls)
#print(hyp_ls)
Code Example #21
import nltk
from nltk.corpus import rte as rte_corpus
import math
from nltk.tokenize import RegexpTokenizer
import test as treedis
#######################################################
# RTECorpusReader takes a corpus root plus the fileids to read.
rte_10 = nltk.corpus.reader.rte.RTECorpusReader(
    "/Users/yuhaomao/Downloads/rte/rte10.xml", "rte_10.xml")
rte_30 = nltk.corpus.reader.rte.RTECorpusReader(
    "/Users/yuhaomao/Downloads/rte/rte10.xml", "rte_30.xml")
test_pair_rte10 = rte_corpus.pairs(['/Users/yuhaomao/Downloads/rte/rte10.xml'])
test_pair_rte30 = rte_corpus.pairs(['/Users/yuhaomao/Downloads/rte/rte30.xml'])

#######################################################

test_pair = rte_corpus.pairs(['rte1_test.xml'])
rte_pair = rte_corpus.pairs(['rte1_dev.xml', 'rte2_dev.xml', 'rte3_dev.xml'])
for pair in rte_pair:
    # print "2222222222",pair.text
    text_tokenize = []
    hyp_token = []
    extractor = nltk.RTEFeatureExtractor(pair)
    text_tokenize.append(list(extractor.text_words))
    hyp_token.append(extractor.hyp_words)
# print "1111111",type(text_tokenize)
# print text_tokenize
# print "2222222",type(hyp_token)
# print hyp_token
# print "......................",rte_corpus.pairs(['rte1_dev.xml'])[8].text
# print "......................",rte_corpus.pairs(['rte1_dev.xml'])[8].hyp
tokenizer = RegexpTokenizer(r'[\w.@:/]+|\w+|\$[\d.]+')
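
A quick illustration of what that tokenizer pattern keeps intact: URL-like tokens and dollar amounts survive as single tokens.

print(tokenizer.tokenize("Paid $9.99 at http://nltk.org via info@nltk.org today"))
# ['Paid', '$9.99', 'at', 'http://nltk.org', 'via', 'info@nltk.org', 'today']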