def load_data(self, percentage):
        print("Started Loading the Data")
        # Get the complete data
        data_set = treebank.fileids()
        # Partition the data into train and test data sets
        training_data_fileIds = [file for file in data_set if "wsj_00" in str(file)]
        testing_data_fileIds = [file for file in data_set if "wsj_01" in str(file)]

        # What fraction of the training files should be used?
        index = int(percentage*len(training_data_fileIds))
        training_data_fileIds = training_data_fileIds[:index]

        tagged_training_data = treebank.tagged_sents(fileids=training_data_fileIds)
        tagged_testing_data = treebank.tagged_sents(fileids=testing_data_fileIds)

        tagged_training_words = treebank.tagged_words(fileids=training_data_fileIds)
        tagged_testing_words = treebank.tagged_words(fileids=testing_data_fileIds)

        # Untag the data for other uses
        untagged_training_data = [untag(item) for item in tagged_training_data]
        untagged_testing_data = [untag(item) for item in tagged_testing_data]

        print("Data Loaded Successfully. Stats are")
        print("Training Data Sentences: ", len(tagged_training_data))
        print("Testing Data  Sentences: ", len(tagged_testing_data))

        return tagged_training_data, tagged_testing_data, tagged_training_words, tagged_testing_words, untagged_training_data, untagged_testing_data
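A minimal usage sketch for this loader (hedged: DataLoader is a hypothetical name for the enclosing class, and the Penn Treebank sample must be downloaded first):

import nltk
nltk.download('treebank')

loader = DataLoader()  # hypothetical class exposing load_data()
(tagged_train, tagged_test, train_words,
 test_words, untagged_train, untagged_test) = loader.load_data(percentage=0.5)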
Example #2
def _untag_sequence(tagged):
    try:
        # a single tagged sentence looks like [(word, tag), ...]: its first
        # element's first item is a string; otherwise assume a list of sentences
        if isinstance(tagged[0][0], str):
            return tuple(untag(tagged))
        else:
            return [tuple(untag(t)) for t in tagged]
    except IndexError:
        return []  # empty input
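A quick demonstration of the two input shapes this helper accepts, assuming _untag_sequence and nltk.tag.untag are in scope:

print(_untag_sequence([('a', 'DT'), ('dog', 'NN')]))      # ('a', 'dog')
print(_untag_sequence([[('a', 'DT')], [('dog', 'NN')]]))  # [('a',), ('dog',)]
print(_untag_sequence([]))                                # []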
Example #3
 def trainUniTnT(self):
     """train unigram and tnt seperatly without DefaultTagger"""
     self.split_into_folds()
     for k in range(1, (self.folds + 1)):
         train_sents = sum(self.foldlist[: (self.folds - 1)], [])
         tnt_tagger = tnt.TnT(N=100)
         tnt_tagger.train(train_sents)
         print(str(k) + " fold: tnt trained")
         unigram = UnigramTagger(train_sents)
         print(str(k) + " fold: unigram trained")
         to_tag = [untag(i) for i in self.foldlist[self.folds - 1]]
         self.tnt_tagged += tnt_tagger.tag_sents(to_tag)
         self.uni_tagged += unigram.tag_sents(to_tag)
         self.org_tagged += self.foldlist[self.folds - 1]
         self.foldlist = [self.foldlist[self.folds - 1]] + self.foldlist[: (self.folds - 1)]
     self.tnt = tnt_tagger
     self.unigram = unigram
     self.tnt_avg_acc = accuracy(sum(self.org_tagged, []), sum(self.tnt_tagged, []))
     self.uni_avg_acc = accuracy(sum(self.org_tagged, []), sum(self.uni_tagged, []))
     print("Accuracy of concatenated tnt-tagged sentences: ", self.tnt_avg_acc)
     print("Accuracy of concatenated unigram-tagged sentences: ", self.uni_avg_acc)
     (self.tnt_tagprecision, self.tnt_tagrecall) = self.tagprecision_recall(
         tnt_tagger, self.tnt_tagged, self.org_tagged
     )
     (self.unigram_tagprecision, self.unigram_tagrecall) = self.tagprecision_recall(
         unigram, self.uni_tagged, self.org_tagged
     )
     # delete the following values so that trainRegexp starts from the initial values
     self.org_tagged = []
     self.foldlist = []
     for i in range(1, self.folds + 1):
         self.foldlist.append(self.create_fold(i))
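The fold rotation at the end of each iteration is what drives this cross-validation; a toy sketch of the same trick, assuming three folds:

folds = 3
foldlist = [['f1'], ['f2'], ['f3']]
for k in range(1, folds + 1):
    train = sum(foldlist[:folds - 1], [])  # concatenate all but the last fold
    held_out = foldlist[folds - 1]         # the last fold gets tagged and scored
    foldlist = [foldlist[folds - 1]] + foldlist[:folds - 1]  # rotate for the next round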
Example #4
def update_category_by_pos():
    from nltk.corpus import brown
    from nltk import NaiveBayesClassifier
    from nltk import classify
    from nltk.tag import untag
    from nltk import DecisionTreeClassifier

    def pos_features(sentence, i):
        features = {'suffix(1)':sentence[i][-1:],
                    'suffix(2)':sentence[i][-2:],
                    'suffix(3)':sentence[i][-3:]
                    }
        features['prev-word'] = '<start>' if i==0 else sentence[i-1]
        return features

    print(pos_features(brown.sents()[0], 8))

    tagged_sents = brown.tagged_sents(categories='news')
    featuresets = []

    for tagged_sent in tagged_sents:
        untagged_sent = untag(tagged_sent)
        for i, (word, tag) in enumerate(tagged_sent):
            featuresets.append((pos_features(untagged_sent, i), tag))

    size = int(len(featuresets) * 0.1)
    train_set, test_set = featuresets[size:], featuresets[:size]
#    classifier = NaiveBayesClassifier.train(train_set)
    classifier = DecisionTreeClassifier.train(train_set)
    print('DecisionTree accuracy: %f' % classify.accuracy(classifier, test_set))
Example #5
    def entropy_of_words(self, tagged_data):
        """
        Takes tagged words as input and returns the tag entropy of each word.
        Assumes: from collections import Counter; from math import log2; import operator
        """
        tagged_words_fdist = Counter(tagged_data)
        total_no_of_tagged_words = sum(tagged_words_fdist.values())

        untagged_data = untag(tagged_data)
        untagged_words_fdist = Counter(untagged_data)
        total_no_of_words = sum(untagged_words_fdist.values())

        # Create Word Tags dictionary as shown below in format
        word_tags = dict()  # {word: {(word, tag1), ..., (word, tagN)}, ...}
        for tagged_word in tagged_words_fdist.keys():
            word_tags.setdefault(tagged_word[0], set()).add(tagged_word)

        # Compute the entropies of the words
        entropies = dict()
        for word in untagged_words_fdist.keys():
            entropies[word] = 0
            tagged_words = word_tags[word]
            for tagged_word in tagged_words:
                yi = tagged_words_fdist[tagged_word]/untagged_words_fdist[word]
                entropies[word] += -(yi*log2(yi))

        entropies = sorted(entropies.items(), key=operator.itemgetter(1), reverse=True)
        return entropies, word_tags, tagged_words_fdist
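A worked example of the entropy computed above: a word seen four times, twice as 'VB' and twice as 'NN', gets exactly one bit of tag entropy.

from math import log2

counts = {('permit', 'VB'): 2, ('permit', 'NN'): 2}
total = sum(counts.values())
entropy = -sum((c / total) * log2(c / total) for c in counts.values())
assert abs(entropy - 1.0) < 1e-9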
Example #6
 def trainRegexp(self, backoff):
     self.split_into_folds()
     for k in range(1, (self.folds + 1)):
         train_sents = sum(self.foldlist[: (self.folds - 1)], [])
         if self.option_tone == "tonal" and self.option_tag == "Affixes":
             regex = RegexpTonalSA(backoff)
         elif self.option_tone == "tonal" and self.option_tag == "POS":
             regex = RegexpTonal(backoff)
         elif self.option_tone == "nontonal" and self.option_tag == "Affixes":
             regex = RegexpSA(backoff)
         elif self.option_tone == "nontonal" and self.option_tag == "POS":
             regex = Regexp(backoff)
         to_tag = [untag(i) for i in self.foldlist[self.folds - 1]]
         self.regex_tagged += regex.tag_sents(to_tag)
         self.org_tagged += self.foldlist[self.folds - 1]
         self.foldlist = [self.foldlist[self.folds - 1]] + self.foldlist[: (self.folds - 1)]
     self.regex = regex
     self.regex_avg_acc = accuracy(sum(self.org_tagged, []), sum(self.regex_tagged, []))
     print("Accuracy of concatenated regexp-tagged sentences: ", self.regex_avg_acc)
     (self.regex_tagprecision, self.regex_tagrecall) = self.tagprecision_recall(
         regex, self.regex_tagged, self.org_tagged
     )
     self.org_tagged = []
     self.foldlist = []
     for i in range(1, self.folds + 1):
         self.foldlist.append(self.create_fold(i))
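A hedged alternative to the if/elif chain above: dispatch on the option pair (RegexpTonalSA and friends are the project's own tagger classes, assumed importable):

REGEX_BY_OPTION = {
    ("tonal", "Affixes"): RegexpTonalSA,
    ("tonal", "POS"): RegexpTonal,
    ("nontonal", "Affixes"): RegexpSA,
    ("nontonal", "POS"): Regexp,
}
regex = REGEX_BY_OPTION[(self.option_tone, self.option_tag)](backoff)

This variant also fails loudly (KeyError) on an unknown option pair instead of leaving regex unbound.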
Example #7
 def demo(self, test_sents):
     tagger = CRFTagger(feature_func=self.feature_detector)
     tagger.set_model_file(self.modelpath)
     for sent in test_sents:
         tagged = tagger.tag(untag(sent))
         for s in self._to_sentence(tagged):
             print(s)
     print(tagger.evaluate(test_sents))
Example #8
def calc_switched_words(bambara, tagger, tagpairs):
    '''iterates over the switched tag-pairs to find all the words which are responsible
    for these switches'''
    untag_testsents = [untag(i) for i in bambara.test_sents]
    tagger_tagged_sents = tagger.tag_sents(untag_testsents)
    compareTags = list(zip(sum(tagger_tagged_sents,[]), sum(bambara.test_sents,[])))
    word_tag_list = sum(bambara.reader.tagged_sents, [])
    switch_list = [(i[0],i[1]) for i in tagpairs]
    for i in switch_list:
        calc_one_switched_word(i[0], i[1], compareTags, word_tag_list)
Example #9
 def test_MLT(self, model, tagged_testing_data):
     """
     Tags the testing data SENTENCES with the most-likely-tag model and scores them.
     :param model: dictionary mapping each word to its most likely tag
     :param tagged_testing_data: tagged testing sentences (the gold standard)
     :return: accuracy
     """
     untagged_testing_data = [untag(item) for item in tagged_testing_data]
     correct = total = 0
     for tagged_sent, sent in zip(tagged_testing_data, untagged_testing_data):
         for (word, gold_tag), token in zip(tagged_sent, sent):
             # unknown words yield None, which never matches a gold tag
             if model.get(token) == gold_tag:
                 correct += 1
             total += 1
     return correct / total if total else 0.0
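A hedged usage sketch, assuming the model is a plain word -> most-likely-tag dict built from training counts, and obj is an instance of the enclosing class:

model = {'the': 'DT', 'dog': 'NN', 'barks': 'VBZ'}
gold = [[('the', 'DT'), ('dog', 'NN'), ('barks', 'VBZ')]]
print(obj.test_MLT(model, gold))  # 1.0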
Example #10
    def train(cls, train_sents, feature_extractor, classifier_cls, **kwargs):
        train_set = []

        for tagged_sent in train_sents:
            untagged_sent = untag(tagged_sent)
            history = []

            for i, (word, tag) in enumerate(tagged_sent):
                featureset = feature_extractor(untagged_sent, i, history)
                train_set.append((featureset, tag))
                history.append(tag)
        classifier = classifier_cls.train(train_set, **kwargs)
        return cls(feature_extractor, classifier)
Example #11
    def train(cls, train_sents, feature_extractor, classifier_cls, **kwargs):
        train_set = []

        for tagged_sent in train_sents:
            untagged_sent = untag(tagged_sent)
            history = []

            for i, (word, tag) in enumerate(tagged_sent):
                featureset = feature_extractor(untagged_sent, i, history)
                train_set.append((featureset, tag))
                history.append(tag)

        classifier = classifier_cls.train(train_set, **kwargs)
        return cls(feature_extractor, classifier)
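A minimal sketch of invoking a classmethod like this one (hedged: ClassifierTagger is a hypothetical name for the enclosing class, and the feature extractor is deliberately simple):

from nltk import NaiveBayesClassifier
from nltk.corpus import treebank

def simple_features(tokens, index, history):
    return {'word': tokens[index].lower(),
            'prev-tag': history[-1] if history else '<start>'}

tagger = ClassifierTagger.train(treebank.tagged_sents()[:50],
                                simple_features, NaiveBayesClassifier)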
Example #12
   def __init__(self):

      boundary = int(len(brown.tagged_sents()) * 0.8)
      # simplify_tags was removed in NLTK 3; tagset='universal' is the modern equivalent
      train_naive = brown.tagged_sents(tagset='universal')[:boundary]
      temp_train_data = []
      for sentence in train_naive:
         untagged_sent = untag(sentence)
         history = []
         for i, (word, tag) in enumerate(sentence):
            temp_train_data.append(
                (self.featextract(untagged_sent, i, history), tag))
            history.append(tag)
      self.bayes = naivebayes.NaiveBayesClassifier.train(temp_train_data)
Example #13
    def evaluate(self, gold):
        """
        Score the accuracy of the tagger against the gold standard.
        Strip the tags from the gold standard text, retag it using
        the tagger, then compute the accuracy score.

        :type gold: list(list(tuple(str, str)))
        :param gold: The list of tagged sentences to score the tagger on.
        :rtype: float
        """
        from nltk.tag import untag

        tagged_sents = self.tag_sents(untag(sent) for sent in gold)
        gold_tokens = sum(gold, [])
        test_tokens = sum(tagged_sents, [])
        return self.accuracy(gold_tokens, test_tokens)
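A short usage sketch: any classic nltk tagger inherits this evaluate() from TaggerI (newer NLTK releases rename it to accuracy()):

from nltk.tag import DefaultTagger
from nltk.corpus import treebank

gold = treebank.tagged_sents()[3000:3010]
print(DefaultTagger('NN').evaluate(gold))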
Example #14
def demo():
    myTagger = Tagger()
    
    from nltk.tag import untag
    # Load the brown corpus.
    from nltk.corpus import brown
    
    #brown_train = brown.tagged_sents()[100:]
    brown_test = brown.tagged_sents()[:100]
    test_sent = untag(brown_test[1])
    
    score = myTagger.evaluate(brown_test)
    print "Score: ", score
    result = myTagger.tag(test_sent)
    print result
    
    print myTagger.tag(['drink', 'some', 'water'])
Example #15
def read_and_write():

    flist = []
    path = "C:\\Users\\gsree\\OneDrive\\Desktop\\Book2.xlsx"
    wb_obj = openpyxl.load_workbook(path)
    sheet_obj = wb_obj.active
    m_row = sheet_obj.max_row
    for i in range(2, m_row + 1):
        cell_obj = sheet_obj.cell(row=i, column=1)

        result = summarization(cell_obj.value)
        ci = sheet_obj.cell(row=i, column=8)
        ci.value = result
        tok = word_tokenize(result)
        tagged = pos_tag(tok)
        nn_vb_tagged = [(word, tag) for word, tag in tagged
                        if tag in ('NN', 'JJ', 'NNS', 'VBN', 'RB', 'VBG', 'VB')
                        ]
        mylist = untag(nn_vb_tagged)
        cj = sheet_obj.cell(row=i, column=9)
        joined = ""  # avoid shadowing the built-in str
        for l in mylist:
            joined = joined + l + ","
        cj.value = joined
        mylist = list(dict.fromkeys(mylist))  # deduplicate while preserving order
        #print(mylist)
        ck = sheet_obj.cell(row=i, column=10)
        string_story = create_user_story(mylist)
        #print(string_story)

        st = "".join(string_story)
        ck.value = st

        flist.append(string_story)
        #print(flist)

        # NOTE: saving once per row works but is slow; the save could move after the loop
        wb_obj.save("C:\\Users\\gsree\\OneDrive\\Desktop\\Book2.xlsx")

    generate_html(final_user_story(flist))
Example #16
    def load_data(self, data_set):
        """
        Loads the given data set. Makes the data set case insensitive.
        Remove words that appear less than 5 times.
        :return updated data set
        """
        print("Started Loading the Data")
        tagged_tokens = data_set.tagged_words()
        tokens = untag(tagged_tokens)

        # Get the words that appear fewer than 5 times in the corpus
        print("Get LT5's")
        tokens = [token.lower() for token in tokens]  # convert to lower case
        freq_dist = FreqDist(tokens)  # compute the frequency distribution
        tokens_lt_5 = {word for word, count in freq_dist.items() if count < 5}  # set: O(1) lookups below

        # Drop the rare words and lower-case the rest
        # (the original deleted from an OrderedDict while iterating it, which
        # raises RuntimeError in Python 3; building a new list avoids that)
        print("Making data case insensitive")
        updated_tagged_tokens = []
        for word, tag in tagged_tokens:
            if word.lower() in tokens_lt_5:
                continue  # drop words seen fewer than 5 times
            updated_tagged_tokens.append((word.lower(), tag))
        tagged_tokens = updated_tagged_tokens

        # Pickle the data for later use
        print("Pickling the Updated Corpus")
        if data_set == brown:
            file_name = "q5_brown_updated.pkl"
        else:
            file_name = "q5_treebank_updated.pkl"
        with open(file_name, 'wb') as f:
            pkl.dump((tagged_tokens, tokens_lt_5), f)

        return tagged_tokens, tokens_lt_5
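A hedged usage sketch, assuming this method sits on a preprocessing class (prep is a hypothetical instance) and the Brown corpus is downloaded:

from nltk.corpus import brown
tagged_tokens, rare_words = prep.load_data(brown)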
Example #17
def get_statistics_per_tag(corpus_test, tagger):
    from nltk.tag import untag

    possible_tags = get_all_tags(corpus_test)
    untagged_test = [untag(x) for x in corpus_test]
    tagged_sents = tagger.tag_sents(untagged_test)
    ref_words = sum(corpus_test, [])
    test_words = sum(tagged_sents, [])
    best_fmeasure = 0
    best_tag = "Chtulhu"
    worst_tag_fmeasure = 100
    worst_tag = "cthulhu"
    for tag in possible_tags:
        f_measure = evaluate_tag(ref_words, test_words, tag)
        if f_measure > best_fmeasure:
            best_tag = tag
            best_fmeasure = f_measure
        if f_measure < worst_tag_fmeasure:
            worst_tag = tag
            worst_tag_fmeasure = f_measure
    print "best tag: ", best_tag
    print "worst tag: ", worst_tag
Example #18
    def calculate_contingenz_with_sets(self, tagger):
        """
        Compares the original tags with the tags created by the tagger.
        """
        tagger_tagged = tagger.tag_sents([untag(i) for i in self.test_sents])
        tagger_words = sum(tagger_tagged,[])
        original_tagged = self.test_sents
        original_words = sum(original_tagged,[])
        tagged_org_zip = zip([i[1] for i in original_words], [i[1] for i in tagger_words])
        contingenzliste = []
        orig_tags = []
        tag_tags = []
        for orig_tag, tagger_tag in tagged_org_zip:
            if orig_tag != tagger_tag:
                if tagger_tag is None:
                    tagger_tag = "None"
                contingenzliste.append(tagger_tag + "   :   " + orig_tag + "\n")
                orig_tags.append(orig_tag)
                tag_tags.append(tagger_tag)

        self.contingenzliste = self.contingenzliste + contingenzliste
        self.reference_tags = self.reference_tags + orig_tags
        self.test_tags = self.test_tags + tag_tags
Example #19
        print "unique words that are None: ", len(self.distinct_nones)
        print "precentage of none tokesns to overall tokes: ", float(self.overall_nones) / float(
            self.overall_tokens_tagged)
        print "precentage of unique nones to overall unique ", float(len(self.distinct_nones)) / float(
            self.get_overall_distinct())


if __name__ == "__main__":
    # split the brown corpus into train, dev, and test sets
    all_words = corpus.brown.tagged_sents(tagset='universal')
    ds_length = len(all_words)
    train = all_words[int(0.2 * ds_length):]
    dev = all_words[:int(0.1 * ds_length)]
    test = all_words[int(0.1 * ds_length):int(0.2 * ds_length)]

    untagged_dev = [untag(item) for item in dev]
    words_in_dev = 0
    for item in untagged_dev:
        words_in_dev += len(item)
    print "overall words in dev : ", words_in_dev
    u1 = SimpleUnigramTagger(train)
    tagged_dev = u1.tag_sents(untagged_dev)
    print(len(tagged_dev))
    none_count = 0
    for sent in tagged_dev:
        for tagged_word in sent:
            if tagged_word[1] is None:
                none_count += 1

    print "Number of Nones in dev is: ", none_count
    print "number of options per (token, word) is:"
Example #20
# every tagger has a tag() method.
# DefaultTagger is a subclass of SequentialBackoffTagger which has a choose_tag() method.
from nltk.tag import DefaultTagger
from nltk.corpus import treebank

tagger = DefaultTagger('NN')
print(tagger.tag(['Hello', 'World']))

# though it's too simple, we can try to evaluate it
test_sents = treebank.tagged_sents()[3000:]
print(tagger.evaluate(test_sents))

# for sentences
print(tagger.tag_sents([['Hello', 'World', '.'], ['How', 'are', 'you', '?']]))

# untagging
from nltk.tag import untag

print(untag([('Hello', 'NN'), ('World', 'NN')]))
Example #21
    def train(self, train_sents, max_rules=200, min_score=2, min_acc=None):
        """
        Trains the Brill tagger on the corpus *train_sents*,
        producing at most *max_rules* transformations, each of which
        reduces the net number of errors in the corpus by at least
        *min_score*, and each of which has accuracy not lower than
        *min_acc*.

        #imports
        >>> from nltk.tbl.template import Template
        >>> from nltk.tag.brill import Pos, Word
        >>> from nltk.tag import untag, RegexpTagger, BrillTaggerTrainer

        #some data
        >>> from nltk.corpus import treebank
        >>> training_data = treebank.tagged_sents()[:100]
        >>> baseline_data = treebank.tagged_sents()[100:200]
        >>> gold_data = treebank.tagged_sents()[200:300]
        >>> testing_data = [untag(s) for s in gold_data]

        >>> backoff = RegexpTagger([
        ... (r'^-?[0-9]+(.[0-9]+)?$', 'CD'),   # cardinal numbers
        ... (r'(The|the|A|a|An|an)$', 'AT'),   # articles
        ... (r'.*able$', 'JJ'),                # adjectives
        ... (r'.*ness$', 'NN'),                # nouns formed from adjectives
        ... (r'.*ly$', 'RB'),                  # adverbs
        ... (r'.*s$', 'NNS'),                  # plural nouns
        ... (r'.*ing$', 'VBG'),                # gerunds
        ... (r'.*ed$', 'VBD'),                 # past tense verbs
        ... (r'.*', 'NN')                      # nouns (default)
        ... ])

        >>> baseline = backoff #see NOTE1

        >>> baseline.evaluate(gold_data) #doctest: +ELLIPSIS
        0.2450142...

        #templates
        >>> Template._cleartemplates() #clear any templates created in earlier tests
        >>> templates = [Template(Pos([-1])), Template(Pos([-1]), Word([0]))]

        #construct a BrillTaggerTrainer
        >>> tt = BrillTaggerTrainer(baseline, templates, trace=3)

        >>> tagger1 = tt.train(training_data, max_rules=10)
        TBL train (fast) (seqs: 100; tokens: 2417; tpls: 2; min score: 2; min acc: None)
        Finding initial useful rules...
            Found 845 useful rules.
        <BLANKLINE>
                   B      |
           S   F   r   O  |        Score = Fixed - Broken
           c   i   o   t  |  R     Fixed = num tags changed incorrect -> correct
           o   x   k   h  |  u     Broken = num tags changed correct -> incorrect
           r   e   e   e  |  l     Other = num tags changed incorrect -> incorrect
           e   d   n   r  |  e
        ------------------+-------------------------------------------------------
         132 132   0   0  | AT->DT if Pos:NN@[-1]
          85  85   0   0  | NN->, if Pos:NN@[-1] & Word:,@[0]
          69  69   0   0  | NN->. if Pos:NN@[-1] & Word:.@[0]
          51  51   0   0  | NN->IN if Pos:NN@[-1] & Word:of@[0]
          47  63  16 161  | NN->IN if Pos:NNS@[-1]
          33  33   0   0  | NN->TO if Pos:NN@[-1] & Word:to@[0]
          26  26   0   0  | IN->. if Pos:NNS@[-1] & Word:.@[0]
          24  24   0   0  | IN->, if Pos:NNS@[-1] & Word:,@[0]
          22  27   5  24  | NN->-NONE- if Pos:VBD@[-1]
          17  17   0   0  | NN->CC if Pos:NN@[-1] & Word:and@[0]

        >>> tagger1.rules()[1:3]
        (Rule('001', 'NN', ',', [(Pos([-1]),'NN'), (Word([0]),',')]), Rule('001', 'NN', '.', [(Pos([-1]),'NN'), (Word([0]),'.')]))

        >>> train_stats = tagger1.train_stats()
        >>> [train_stats[stat] for stat in ['initialerrors', 'finalerrors', 'rulescores']]
        [1775, 1269, [132, 85, 69, 51, 47, 33, 26, 24, 22, 17]]

        >>> tagger1.print_template_statistics(printunused=False)
        TEMPLATE STATISTICS (TRAIN)  2 templates, 10 rules)
        TRAIN (   2417 tokens) initial  1775 0.2656 final:  1269 0.4750
        #ID | Score (train) |  #Rules     | Template
        --------------------------------------------
        001 |   305   0.603 |   7   0.700 | Template(Pos([-1]),Word([0]))
        000 |   201   0.397 |   3   0.300 | Template(Pos([-1]))
        <BLANKLINE>
        <BLANKLINE>

        >>> tagger1.evaluate(gold_data) # doctest: +ELLIPSIS
        0.43996...

        >>> tagged, test_stats = tagger1.batch_tag_incremental(testing_data, gold_data)

        >>> tagged[33][12:] == [('foreign', 'IN'), ('debt', 'NN'), ('of', 'IN'), ('$', 'NN'), ('64', 'CD'),
        ... ('billion', 'NN'), ('*U*', 'NN'), ('--', 'NN'), ('the', 'DT'), ('third-highest', 'NN'), ('in', 'NN'),
        ... ('the', 'DT'), ('developing', 'VBG'), ('world', 'NN'), ('.', '.')]
        True

        >>> [test_stats[stat] for stat in ['initialerrors', 'finalerrors', 'rulescores']]
        [1855, 1376, [100, 85, 67, 58, 27, 36, 27, 16, 31, 32]]

        # a high-accuracy tagger
        >>> tagger2 = tt.train(training_data, max_rules=10, min_acc=0.99)
        TBL train (fast) (seqs: 100; tokens: 2417; tpls: 2; min score: 2; min acc: 0.99)
        Finding initial useful rules...
            Found 845 useful rules.
        <BLANKLINE>
                   B      |
           S   F   r   O  |        Score = Fixed - Broken
           c   i   o   t  |  R     Fixed = num tags changed incorrect -> correct
           o   x   k   h  |  u     Broken = num tags changed correct -> incorrect
           r   e   e   e  |  l     Other = num tags changed incorrect -> incorrect
           e   d   n   r  |  e
        ------------------+-------------------------------------------------------
         132 132   0   0  | AT->DT if Pos:NN@[-1]
          85  85   0   0  | NN->, if Pos:NN@[-1] & Word:,@[0]
          69  69   0   0  | NN->. if Pos:NN@[-1] & Word:.@[0]
          51  51   0   0  | NN->IN if Pos:NN@[-1] & Word:of@[0]
          36  36   0   0  | NN->TO if Pos:NN@[-1] & Word:to@[0]
          26  26   0   0  | NN->. if Pos:NNS@[-1] & Word:.@[0]
          24  24   0   0  | NN->, if Pos:NNS@[-1] & Word:,@[0]
          19  19   0   6  | NN->VB if Pos:TO@[-1]
          18  18   0   0  | CD->-NONE- if Pos:NN@[-1] & Word:0@[0]
          18  18   0   0  | NN->CC if Pos:NN@[-1] & Word:and@[0]

        >>> tagger2.evaluate(gold_data)  # doctest: +ELLIPSIS
        0.44159544...
        >>> tagger2.rules()[2:4]
        (Rule('001', 'NN', '.', [(Pos([-1]),'NN'), (Word([0]),'.')]), Rule('001', 'NN', 'IN', [(Pos([-1]),'NN'), (Word([0]),'of')]))

        # NOTE1: (!!FIXME) A far better baseline uses nltk.tag.UnigramTagger,
        # with a RegexpTagger only as backoff. For instance,
        # >>> baseline = UnigramTagger(baseline_data, backoff=backoff)
        # However, as of Nov 2013, nltk.tag.UnigramTagger does not yield consistent results
        # between python versions. The simplistic backoff above is a workaround to make doctests
        # get consistent input.

        :param train_sents: training data
        :type train_sents: list(list(tuple))
        :param max_rules: output at most max_rules rules
        :type max_rules: int
        :param min_score: stop training when no rules better than min_score can be found
        :type min_score: int
        :param min_acc: discard any rule with lower accuracy than min_acc
        :type min_acc: float or None
        :return: the learned tagger
        :rtype: BrillTagger

        """
        # FIXME: several tests are a bit too dependent on tracing format
        # FIXME: tests in trainer.fast and trainer.brillorig are exact duplicates

        # Basic idea: Keep track of the rules that apply at each position.
        # And keep track of the positions to which each rule applies.

        # Create a new copy of the training corpus, and run the
        # initial tagger on it.  We will progressively update this
        # test corpus to look more like the training corpus.
        test_sents = [list(self._initial_tagger.tag(untag(sent)))
                      for sent in train_sents]

        # Collect some statistics on the training process
        trainstats = {}
        trainstats['min_acc'] = min_acc
        trainstats['min_score'] = min_score
        trainstats['tokencount'] = sum(len(t) for t in test_sents)
        trainstats['sequencecount'] = len(test_sents)
        trainstats['templatecount'] = len(self._templates)
        trainstats['rulescores'] = []
        trainstats['initialerrors'] = sum(
            tag[1] != truth[1]
            for paired in zip(test_sents, train_sents)
            for (tag, truth) in zip(*paired)
        )
        trainstats['initialacc'] = 1 - trainstats['initialerrors']/trainstats['tokencount']
        if self._trace > 0:
            print("TBL train (fast) (seqs: {sequencecount}; tokens: {tokencount}; "
                  "tpls: {templatecount}; min score: {min_score}; min acc: {min_acc})".format(**trainstats))

        # Initialize our mappings.  This will find any errors made
        # by the initial tagger, and use those to generate repair
        # rules, which are added to the rule mappings.
        if self._trace:
            print("Finding initial useful rules...")
        self._init_mappings(test_sents, train_sents)
        if self._trace:
            print(("    Found %d useful rules." % len(self._rule_scores)))

        # Let the user know what we're up to.
        if self._trace > 2:
            self._trace_header()
        elif self._trace == 1:
            print("Selecting rules...")

        # Repeatedly select the best rule, and add it to `rules`.
        rules = []
        try:
            while (len(rules) < max_rules):
                # Find the best rule, and add it to our rule list.
                rule = self._best_rule(train_sents, test_sents, min_score, min_acc)
                if rule:
                    rules.append(rule)
                    score = self._rule_scores[rule]
                    trainstats['rulescores'].append(score)
                else:
                    break  # No more good rules left!

                # Report the rule that we found.
                if self._trace > 1:
                    self._trace_rule(rule)

                # Apply the new rule at the relevant sites
                self._apply_rule(rule, test_sents)

                # Update _tag_positions[rule.original_tag] and
                # _tag_positions[rule.replacement_tag] for the affected
                # positions (i.e., self._positions_by_rule[rule]).
                self._update_tag_positions(rule)

                # Update rules that were affected by the change.
                self._update_rules(rule, train_sents, test_sents)

        # The user can cancel training manually:
        except KeyboardInterrupt:
            print("Training stopped manually -- %d rules found" % len(rules))

        # Discard our tag position mapping & rule mappings.
        self._clean()
        trainstats['finalerrors'] = trainstats['initialerrors'] - sum(trainstats['rulescores'])
        trainstats['finalacc'] = 1 - trainstats['finalerrors']/trainstats['tokencount']
        # Create and return a tagger from the rules we found.
        return BrillTagger(self._initial_tagger, rules, trainstats)
Example #22
from nltk.tag import pos_tag, untag
from nltk.tokenize import word_tokenize

sentence = 'Emma refused to permit us to obtain the refuse permit"'  # the stray trailing quote becomes the "''" token below

# Tokenize the sentence and tag each word with its part of speech
tagged_list = pos_tag(word_tokenize(sentence))
print(tagged_list)
#    t[0]   t[1]      t[0]      t[1]     t[0]  t[1]
#[('Emma', 'NNP'), ('refused', 'VBD'), ('to', 'TO'), ('permit', 'VB'), ('us', 'PRP'), ('to', 'TO'), ('obtain', 'VB'), ('the', 'DT'), ('refuse', 'NN'), ('permit', 'NN'), ("''", "''")]
noun_list = [t[0] for t in tagged_list if t[1] == 'NN']
# keep only the nouns, i.e. tokens whose tag is 'NN'
print(noun_list)

# strip the POS tags
print(untag(tagged_list))


# Join each word and its tag into a single 'word/tag' string
def tokenizer(doc):
    return ['/'.join(p) for p in doc]  # join each (word, tag) pair with '/'


print(tokenizer(tagged_list))
# ['Emma/NNP', 'refused/VBD', 'to/TO', 'permit/VB', 'us/PRP', 'to/TO', 'obtain/VB', 'the/DT', 'refuse/NN', 'permit/NN', "''/''"]
Example #23
def FeatureExtractor(tree):
    rc = []
    for subtree in tree.subtrees():
        if subtree.label() == 'NP':
            # a flat chunk's children are (word, tag) pairs, so untag() applies
            rc.append(str(untag(subtree)))
    return rc
Example #24
    print("wrong input")
    last = input("Unigram or Bigram+Affix+Dictionary+Regexp+Default? UNIGRAM/BIGRAM-> ")



print("\nCalculating Brill & Wu Complementarity\n\nAfter the programme finished, please look for \
'Results\CompareBrillWu_"+last+tone+tag+".txt'  and 'Results\Disagreement_BrillWuHtml_"+last+tone+tag+".txt' in your current working directory.\n")



bambara = create_reader(tone, tag)          


if last == "BIGRAM":
    lasttagger = backoff_tagger([5,6,8,1],bambara,option_tones=tone, option_tag=tag)
    lasttaggertagged = lasttagger.tag_sents([untag(i) for i in bambara.test_sents])
    lasttagger_acc = lasttagger.evaluate(bambara.test_sents)
    lasttagger_err = round((1-(lasttagger_acc)),4)
if last == "UNIGRAM":
    lasttagger = indivUnigram(bambara, backoff)
    lasttaggertagged = lasttagger.tag_sents([untag(i) for i in bambara.test_sents])
    lasttagger_acc = lasttagger.evaluate(bambara.test_sents)
    lasttagger_err = round((1-(lasttagger_acc)),4)

hmm = indivHMM(bambara)
hmmtagged = hmm.tag_sents([untag(i) for i in bambara.test_sents])
hmm_acc = hmm.evaluate(bambara.test_sents)
hmm_err = round((1 - (hmm_acc)),4)

crf = indivCRF(bambara, tone, tag)
crftagged = crf.tag_sents([untag(i) for i in bambara.test_sents]) 
Example #25
File: data.py Project: zinaaa/hazm
	def retag_trees(trees, sents):
		tagged_sents = tagger.tag_sents([untag(sent) for sent in sents])
		for tree, sentence in zip(trees, tagged_sents):
			for (n, word) in zip(tree.treepositions('leaves'), sentence):
				tree[n] = word
Example #26
def NounExtractor(tree):
    rc1 = []
    for subtree in tree.subtrees():
        if subtree.label() == 'Noun':
            rc1.append(str(untag(subtree)))
    return rc1
Example #27
import nltk
nltk.download('brown')

from nltk.corpus import brown

brown_news_tagged = brown.tagged_sents(categories='news', tagset='universal')
brown_news_words = brown.tagged_words(categories='news', tagset='universal')

brown_train = brown_news_tagged[100:]
brown_test = brown_news_tagged[:100]

from nltk.tag import untag
test_sent = untag(brown_test[0])
print("Tagged: ", brown_test[0])
print()
print("Untagged: ", test_sent)

# A default tagger assigns the same tag to all words
from nltk import DefaultTagger
default_tagger = DefaultTagger('NOUN')
default_tagger.tag('This is a test'.split())
Example #28
def apply_tagger(tagger, corpus):
    return [tagger.tag(untag(sent)) for sent in corpus]
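A quick usage sketch for apply_tagger with a trivial tagger and a small gold corpus:

from nltk.tag import DefaultTagger, untag
from nltk.corpus import treebank

corpus = treebank.tagged_sents()[:2]
retagged = apply_tagger(DefaultTagger('NN'), corpus)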
Example #29
[lm.lemmatize(w) for w in words]  # lm, words: defined in earlier notebook cells (not shown)

lm.lemmatize("dying", pos="v")

nltk.help.upenn_tagset('VB')

from nltk.tag import pos_tag
from nltk.tokenize import word_tokenize

sentence = "Emma refused to permit us to obtain the refuse permit"
tagged_list = pos_tag(word_tokenize(sentence))
tagged_list

nouns_list = [t[0] for t in tagged_list if t[1] == "NN"]
nouns_list

from nltk.tag import untag
untag(tagged_list)

def tokenizer(doc):
    return ["/".join(p) for p in doc]

tokenizer(tagged_list)

from nltk import Text
import matplotlib.pyplot as plt

text = Text(retokenize.tokenize(emma_raw), name="Emma")

text.plot(20)
plt.show()

text.dispersion_plot(["Emma", "Knightley", "Frank", "Jane", "Harriet", "Robert"])
Example #30
tag = input("POS/Affixes? -> ")

backoff = input("DefaultTagger as backoff? J/N-> ")
if backoff == "J":
    backoff = DefaultTagger('n')
else:
    backoff = None

print("\nCalculating Brill & Wu Complementarity\n\nAfter the programme finished, please look for \
'Results\CompareBrillWu_Regex"+tone+tag+".txt'  and 'Results\Disagreement_BrillWuHtml_Regex"+tone+tag+".txt' in your current working directory.\n")


bambara = create_reader(tone, tag)          

hmm = indivHMM(bambara)
hmmtagged = hmm.tag_sents([untag(i) for i in bambara.test_sents])
hmm_acc = hmm.evaluate(bambara.test_sents)
hmm_err = round((1 - (hmm_acc)),4)

crf = indivCRF(bambara, tone, tag)
crftagged = crf.tag_sents([untag(i) for i in bambara.test_sents]) 
crf_acc = crf.evaluate(bambara.test_sents)
crf_err = round((1-(crf_acc)),4)

tnt = indivTnT(bambara, backoff)
tnttagged = tnt.tag_sents([untag(i) for i in bambara.test_sents])
tnt_acc = tnt.evaluate(bambara.test_sents)
tnt_err = round((1-(tnt_acc)),4)

unigram = indivUnigram(bambara, backoff)
unitagged = unigram.tag_sents([untag(i) for i in bambara.test_sents])
Example #31
    # print affix_ugram_backoff.evaluate(test)
    # print unigram_affix_backoff.evaluate(test)
    # cutoffs = [x*0.1 for x in range(20)]
    # for c in cutoffs:
    # tagger = EntropyVotingTagger(taggers, c)
    # print "Accuracy of entropy voting = ", tagger.evaluate(test)


    affix_tagger = EntropyAffixTagger(train)
    unigram_tagger = EntropyUnigramTagger(train)
    taggers = [unigram_tagger, affix_tagger]
    tagger = EntropyVotingTagger(taggers, max_entropy=80)

    from nltk.tag import untag

    untagged_test = [untag(x) for x in dev]
    tagged_sents_uni_affix = unigram_affix_backoff.tag_sents(untagged_test)
    tagged_sents_entr = tagger.tag_sents(untagged_test)
    affix_mistake = 0
    unigram_mistake = 0
    overall_mistakes = 0
    print "len of dev: ", len(dev)
    # izip is Python 2 only; the built-in zip does the same job in Python 3
    for tagged_reference_sent, tagged_uni_affix_sent, tagged_entropy_sent in zip(
            dev, tagged_sents_uni_affix, tagged_sents_entr):
        for tagged_reference, tagged_uni_affix, tagged_entropy in zip(
                tagged_reference_sent, tagged_uni_affix_sent, tagged_entropy_sent):
            if tagged_uni_affix[1] != tagged_entropy[1]:
                overall_mistakes += 1

                print("the taggers disagree here!")
def VerbExtractor(tree):
    rc2 = []
    for subtree in tree.subtrees():
        if subtree.label() == 'Verb':
            rc2.append(str(untag(subtree)))
    return rc2
Example #33
def AdjExtractor(tree):
    rc3 = []
    for subtree in tree.subtrees():
        if subtree.label() == 'Adj':
            rc3.append(str(untag(subtree)))
    return rc3
Example #34
import nltk
from nltk.tag import untag
print(untag([('beautiful', 'NN'), ('morning', 'NN')]))

Example #35
def PhraseExtractor(tree):
    rc4 = []
    for subtree in tree.subtrees():
        if subtree.label() == 'Phrase':
            rc4.append(str(untag(subtree)))
    return rc4
Example #36
def PosExtractor(tree):
    ps = []
    for subtree in tree.subtrees():
        if subtree.label() == 'PWD':
            ps.append(str(untag(subtree)))
    return ps
Example #37
    def train(self, train_sents, max_rules=200, min_score=2, min_acc=None):
        """
        Trains the Brill tagger on the corpus *train_sents*,
        producing at most *max_rules* transformations, each of which
        reduces the net number of errors in the corpus by at least
        *min_score*, and each of which has accuracy not lower than
        *min_acc*.

        #imports
        >>> from nltk.tbl.template import Template
        >>> from nltk.tag.brill import Pos, Word
        >>> from nltk.tag import untag, RegexpTagger, BrillTaggerTrainer

        #some data
        >>> from nltk.corpus import treebank
        >>> training_data = treebank.tagged_sents()[:100]
        >>> baseline_data = treebank.tagged_sents()[100:200]
        >>> gold_data = treebank.tagged_sents()[200:300]
        >>> testing_data = [untag(s) for s in gold_data]

        >>> backoff = RegexpTagger([
        ... (r'^-?[0-9]+(.[0-9]+)?$', 'CD'),   # cardinal numbers
        ... (r'(The|the|A|a|An|an)$', 'AT'),   # articles
        ... (r'.*able$', 'JJ'),                # adjectives
        ... (r'.*ness$', 'NN'),                # nouns formed from adjectives
        ... (r'.*ly$', 'RB'),                  # adverbs
        ... (r'.*s$', 'NNS'),                  # plural nouns
        ... (r'.*ing$', 'VBG'),                # gerunds
        ... (r'.*ed$', 'VBD'),                 # past tense verbs
        ... (r'.*', 'NN')                      # nouns (default)
        ... ])

        >>> baseline = backoff #see NOTE1

        >>> baseline.evaluate(gold_data) #doctest: +ELLIPSIS
        0.2450142...

        #templates
        >>> Template._cleartemplates() #clear any templates created in earlier tests
        >>> templates = [Template(Pos([-1])), Template(Pos([-1]), Word([0]))]

        #construct a BrillTaggerTrainer
        >>> tt = BrillTaggerTrainer(baseline, templates, trace=3)

        >>> tagger1 = tt.train(training_data, max_rules=10)
        TBL train (fast) (seqs: 100; tokens: 2417; tpls: 2; min score: 2; min acc: None)
        Finding initial useful rules...
            Found 845 useful rules.
        <BLANKLINE>
                   B      |
           S   F   r   O  |        Score = Fixed - Broken
           c   i   o   t  |  R     Fixed = num tags changed incorrect -> correct
           o   x   k   h  |  u     Broken = num tags changed correct -> incorrect
           r   e   e   e  |  l     Other = num tags changed incorrect -> incorrect
           e   d   n   r  |  e
        ------------------+-------------------------------------------------------
         132 132   0   0  | AT->DT if Pos:NN@[-1]
          85  85   0   0  | NN->, if Pos:NN@[-1] & Word:,@[0]
          69  69   0   0  | NN->. if Pos:NN@[-1] & Word:.@[0]
          51  51   0   0  | NN->IN if Pos:NN@[-1] & Word:of@[0]
          47  63  16 161  | NN->IN if Pos:NNS@[-1]
          33  33   0   0  | NN->TO if Pos:NN@[-1] & Word:to@[0]
          26  26   0   0  | IN->. if Pos:NNS@[-1] & Word:.@[0]
          24  24   0   0  | IN->, if Pos:NNS@[-1] & Word:,@[0]
          22  27   5  24  | NN->-NONE- if Pos:VBD@[-1]
          17  17   0   0  | NN->CC if Pos:NN@[-1] & Word:and@[0]

        >>> tagger1.rules()[1:3]
        (Rule('001', 'NN', ',', [(Pos([-1]),'NN'), (Word([0]),',')]), Rule('001', 'NN', '.', [(Pos([-1]),'NN'), (Word([0]),'.')]))

        >>> train_stats = tagger1.train_stats()
        >>> [train_stats[stat] for stat in ['initialerrors', 'finalerrors', 'rulescores']]
        [1775, 1269, [132, 85, 69, 51, 47, 33, 26, 24, 22, 17]]

        >>> tagger1.print_template_statistics(printunused=False)
        TEMPLATE STATISTICS (TRAIN)  2 templates, 10 rules)
        TRAIN (   2417 tokens) initial  1775 0.2656 final:  1269 0.4750
        #ID | Score (train) |  #Rules     | Template
        --------------------------------------------
        001 |   305   0.603 |   7   0.700 | Template(Pos([-1]),Word([0]))
        000 |   201   0.397 |   3   0.300 | Template(Pos([-1]))
        <BLANKLINE>
        <BLANKLINE>

        >>> tagger1.evaluate(gold_data) # doctest: +ELLIPSIS
        0.43996...

        >>> tagged, test_stats = tagger1.batch_tag_incremental(testing_data, gold_data)

        >>> tagged[33][12:] == [('foreign', 'IN'), ('debt', 'NN'), ('of', 'IN'), ('$', 'NN'), ('64', 'CD'),
        ... ('billion', 'NN'), ('*U*', 'NN'), ('--', 'NN'), ('the', 'DT'), ('third-highest', 'NN'), ('in', 'NN'),
        ... ('the', 'DT'), ('developing', 'VBG'), ('world', 'NN'), ('.', '.')]
        True

        >>> [test_stats[stat] for stat in ['initialerrors', 'finalerrors', 'rulescores']]
        [1855, 1376, [100, 85, 67, 58, 27, 36, 27, 16, 31, 32]]

        # a high-accuracy tagger
        >>> tagger2 = tt.train(training_data, max_rules=10, min_acc=0.99)
        TBL train (fast) (seqs: 100; tokens: 2417; tpls: 2; min score: 2; min acc: 0.99)
        Finding initial useful rules...
            Found 845 useful rules.
        <BLANKLINE>
                   B      |
           S   F   r   O  |        Score = Fixed - Broken
           c   i   o   t  |  R     Fixed = num tags changed incorrect -> correct
           o   x   k   h  |  u     Broken = num tags changed correct -> incorrect
           r   e   e   e  |  l     Other = num tags changed incorrect -> incorrect
           e   d   n   r  |  e
        ------------------+-------------------------------------------------------
         132 132   0   0  | AT->DT if Pos:NN@[-1]
          85  85   0   0  | NN->, if Pos:NN@[-1] & Word:,@[0]
          69  69   0   0  | NN->. if Pos:NN@[-1] & Word:.@[0]
          51  51   0   0  | NN->IN if Pos:NN@[-1] & Word:of@[0]
          36  36   0   0  | NN->TO if Pos:NN@[-1] & Word:to@[0]
          26  26   0   0  | NN->. if Pos:NNS@[-1] & Word:.@[0]
          24  24   0   0  | NN->, if Pos:NNS@[-1] & Word:,@[0]
          19  19   0   6  | NN->VB if Pos:TO@[-1]
          18  18   0   0  | CD->-NONE- if Pos:NN@[-1] & Word:0@[0]
          18  18   0   0  | NN->CC if Pos:NN@[-1] & Word:and@[0]

        >>> tagger2.evaluate(gold_data)  # doctest: +ELLIPSIS
        0.44159544...
        >>> tagger2.rules()[2:4]
        (Rule('001', 'NN', '.', [(Pos([-1]),'NN'), (Word([0]),'.')]), Rule('001', 'NN', 'IN', [(Pos([-1]),'NN'), (Word([0]),'of')]))

        # NOTE1: (!!FIXME) A far better baseline uses nltk.tag.UnigramTagger,
        # with a RegexpTagger only as backoff. For instance,
        # >>> baseline = UnigramTagger(baseline_data, backoff=backoff)
        # However, as of Nov 2013, nltk.tag.UnigramTagger does not yield consistent results
        # between python versions. The simplistic backoff above is a workaround to make doctests
        # get consistent input.

        :param train_sents: training data
        :type train_sents: list(list(tuple))
        :param max_rules: output at most max_rules rules
        :type max_rules: int
        :param min_score: stop training when no rules better than min_score can be found
        :type min_score: int
        :param min_acc: discard any rule with lower accuracy than min_acc
        :type min_acc: float or None
        :return: the learned tagger
        :rtype: BrillTagger

        """
        # FIXME: several tests are a bit too dependent on tracing format
        # FIXME: tests in trainer.fast and trainer.brillorig are exact duplicates

        # Basic idea: Keep track of the rules that apply at each position.
        # And keep track of the positions to which each rule applies.

        # Create a new copy of the training corpus, and run the
        # initial tagger on it.  We will progressively update this
        # test corpus to look more like the training corpus.
        test_sents = [
            list(self._initial_tagger.tag(untag(sent))) for sent in train_sents
        ]

        # Collect some statistics on the training process
        trainstats = {}
        trainstats['min_acc'] = min_acc
        trainstats['min_score'] = min_score
        trainstats['tokencount'] = sum(len(t) for t in test_sents)
        trainstats['sequencecount'] = len(test_sents)
        trainstats['templatecount'] = len(self._templates)
        trainstats['rulescores'] = []
        trainstats['initialerrors'] = sum(
            tag[1] != truth[1] for paired in zip(test_sents, train_sents)
            for (tag, truth) in zip(*paired))
        trainstats['initialacc'] = 1 - trainstats[
            'initialerrors'] / trainstats['tokencount']
        if self._trace > 0:
            print(
                "TBL train (fast) (seqs: {sequencecount}; tokens: {tokencount}; "
                "tpls: {templatecount}; min score: {min_score}; min acc: {min_acc})"
                .format(**trainstats))

        # Initialize our mappings.  This will find any errors made
        # by the initial tagger, and use those to generate repair
        # rules, which are added to the rule mappings.
        if self._trace:
            print("Finding initial useful rules...")
        self._init_mappings(test_sents, train_sents)
        if self._trace:
            print(("    Found %d useful rules." % len(self._rule_scores)))

        # Let the user know what we're up to.
        if self._trace > 2:
            self._trace_header()
        elif self._trace == 1:
            print("Selecting rules...")

        # Repeatedly select the best rule, and add it to `rules`.
        rules = []
        try:
            while (len(rules) < max_rules):
                # Find the best rule, and add it to our rule list.
                rule = self._best_rule(train_sents, test_sents, min_score,
                                       min_acc)
                if rule:
                    rules.append(rule)
                    score = self._rule_scores[rule]
                    trainstats['rulescores'].append(score)
                else:
                    break  # No more good rules left!

                # Report the rule that we found.
                if self._trace > 1:
                    self._trace_rule(rule)

                # Apply the new rule at the relevant sites
                self._apply_rule(rule, test_sents)

                # Update _tag_positions[rule.original_tag] and
                # _tag_positions[rule.replacement_tag] for the affected
                # positions (i.e., self._positions_by_rule[rule]).
                self._update_tag_positions(rule)

                # Update rules that were affected by the change.
                self._update_rules(rule, train_sents, test_sents)

        # The user can cancel training manually:
        except KeyboardInterrupt:
            print("Training stopped manually -- %d rules found" % len(rules))

        # Discard our tag position mapping & rule mappings.
        self._clean()
        trainstats['finalerrors'] = trainstats['initialerrors'] - sum(
            trainstats['rulescores'])
        trainstats['finalacc'] = 1 - trainstats['finalerrors'] / trainstats[
            'tokencount']
        # Create and return a tagger from the rules we found.
        return BrillTagger(self._initial_tagger, rules, trainstats)
Example #38
def NegExtractor(tree):
    ns = []
    for subtree in tree.subtrees():
        if subtree.label() == 'NWD':
            ns.append(str(untag(subtree)))
    return ns
Example #39
######### DEFAULT TAGGER ###############

# Assigning the default tag
from nltk.tag import DefaultTagger, untag
tagger = DefaultTagger('NN')
sent = ['Hello', 'World']
sents = [['Hello', 'World'], ['How', 'are', 'you', '?']]
print(tagger.tag(sent))         # tag() expects a single token list
print(tagger.tag_sents(sents))  # tag_sents() expects a list of sentences

# Untagging
tagged = tagger.tag(sent)
print(untag(tagged))

# Evaluating the tagger accuracy
from nltk.corpus import treebank
test_sents = treebank.tagged_sents()[3000:]
print(tagger.evaluate(test_sents))
Example #40
 def trainALL(self, last):
     self.split_into_folds()
     for k in range(1, (self.folds + 1)):
         train_sents = sum(self.foldlist[: (self.folds - 1)], [])
         crf = CRFTagger(training_opt={"max_iterations": 100, "max_linesearch": 10, "c1": 0.0001, "c2": 1.0})
         crf_trained = crf.train(
             train_sents,
             "Models/model.crfCrossValidation1" + str(k) + self.option_tone + self.option_tag + ".tagger",
         )
         print(str(k) + " fold: crf")
         tnt_tagger = tnt.TnT(unk=DefaultTagger("n"), Trained=True, N=100)
         tnt_tagger.train(train_sents)
         print(str(k) + " fold: tnt")
         tag_set = set()
         symbols = set()
         for i in train_sents:
             for j in i:
                 tag_set.add(j[1])
                 symbols.add(j[0])
         trainer = HiddenMarkovModelTrainer(list(tag_set), list(symbols))
         hmm = trainer.train_supervised(train_sents, estimator=lambda fd, bins: LidstoneProbDist(fd, 0.1, bins))
         print(str(k) + " fold: hmm")
         if last == "U":
             lasttagger = UnigramTagger(train_sents, backoff=DefaultTagger("n"))
             print(str(k) + " fold: unigram")
         if last == "B":
             if self.option_tone == "tonal" and self.option_tag == "Affixes":
                 regex = RegexpTonalSA(DefaultTagger("n"))
             if self.option_tone == "tonal" and self.option_tag == "POS":
                 regex = RegexpTonal(DefaultTagger("n"))
             if self.option_tone == "nontonal" and self.option_tag == "Affixes":
                 regex = RegexpSA(DefaultTagger("n"))
             if self.option_tone == "nontonal" and self.option_tag == "POS":
                 regex = Regexp(DefaultTagger("n"))
             dic = dictionary_backoff(self.option_tone, regex)
             affix = AffixTagger(train_sents, min_stem_length=0, affix_length=-4, backoff=dic)
             lasttagger = BigramTagger(train_sents, backoff=affix)
             print(str(k) + " fold: bigram")
         to_tag = [untag(i) for i in self.foldlist[self.folds - 1]]
         self.crf_tagged += crf.tag_sents(to_tag)
         self.tnt_tagged += tnt_tagger.tag_sents(to_tag)
         self.hmm_tagged += hmm.tag_sents(to_tag)
         self.lasttagger_tagged += lasttagger.tag_sents(to_tag)
         self.org_tagged += self.foldlist[self.folds - 1]
         self.foldlist = [self.foldlist[self.folds - 1]] + self.foldlist[: (self.folds - 1)]
     self.crf = crf
     self.tnt = tnt_tagger
     self.hmm = hmm
     self.lasttagger = lasttagger
     org_words = sum(self.org_tagged, [])
     self.crf_avg_acc = accuracy(org_words, sum(self.crf_tagged, []))
     self.tnt_avg_acc = accuracy(org_words, sum(self.tnt_tagged, []))
     self.hmm_avg_acc = accuracy(org_words, sum(self.hmm_tagged, []))
     self.lasttagger_avg_acc = accuracy(org_words, sum(self.lasttagger_tagged, []))
     print("Accuracy of concatenated crf-tagged sentences: ", self.crf_avg_acc)
     print("Accuracy of concatenated tnt-tagged sentences: ", self.tnt_avg_acc)
     print("Accuracy of concatenated hmm-tagged sentences: ", self.hmm_avg_acc)
     print("Accuracy of concatenated " + last + "-tagged sentences: ", self.lasttagger_avg_acc)
     (self.crf_tagprecision, self.crf_tagrecall) = self.tagprecision_recall(crf, self.crf_tagged, self.org_tagged)
     (self.tnt_tagprecision, self.tnt_tagrecall) = self.tagprecision_recall(
         tnt_tagger, self.tnt_tagged, self.org_tagged
     )
     (self.hmm_tagprecision, self.hmm_tagrecall) = self.tagprecision_recall(hmm, self.hmm_tagged, self.org_tagged)
     (self.lasttagger_tagprecision, self.lasttagger_tagrecall) = self.tagprecision_recall(
         lasttagger, self.lasttagger_tagged, self.org_tagged
     )
     self.org_tagged = []
     self.foldlist = []
     for i in range(1, self.folds + 1):
         self.foldlist.append(self.create_fold(i))