Code Example #1
# Assumes NLTK 3; these are the standard import locations for the classes used below.
from nltk.tag import DefaultTagger, UnigramTagger
from nltk.tag.brill import Pos, Word
from nltk.tag.brill_trainer import BrillTaggerTrainer
from nltk.tbl.template import Template


def train(train_sentences):
    print("- Default Tagger")
    default_tagger = DefaultTagger('NC')

    print("- Unigram Tagger")
    unigram_tagger = UnigramTagger(train_sentences, backoff=default_tagger)

    print("- Templates")
    # These templates define the features used by the Brill tagger,
    # relative to the word position.
    Template._cleartemplates()
    templates = [
        Template(Pos([-1])),
        Template(Pos([-1]), Word([0])),
        Template(Pos([-2])),
        Template(Pos([-2]), Word([0])),
        Template(Pos([1])),
    ]
    print("- Brill Tagger")
    tt = BrillTaggerTrainer(unigram_tagger, templates, trace=1)
    tagger = tt.train(train_sentences, max_rules=1000)

    print("- Done.")

    return tagger
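A minimal usage sketch, assuming NLTK 3 with its bundled treebank corpus; the sample token list is illustrative:

from nltk.corpus import treebank

# train on the first 3000 treebank sentences, then tag a toy sentence
tagger = train(treebank.tagged_sents()[:3000])
print(tagger.tag(['This', 'is', 'a', 'test']))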
Code Example #2
    def train(self, sentence_list):
        """Trains the tagger from the tagged sentences provided.

        Note: the brill.* template classes below come from the NLTK 2.x
        brill module (see the sketch after this example for NLTK 3).
        """
        noun_fallback = DefaultTagger('NN')
        affix_fallback = AffixTagger(sentence_list, backoff=noun_fallback)
        unigram_fallback = UnigramTagger(sentence_list, backoff=affix_fallback)
        bigram_fallback = BigramTagger(sentence_list, backoff=unigram_fallback)
        trigram_fallback = TrigramTagger(sentence_list,
                                         backoff=bigram_fallback)
        templates = [
            brill.SymmetricProximateTokensTemplate(brill.ProximateTagsRule,
                                                   (1, 1)),
            brill.SymmetricProximateTokensTemplate(brill.ProximateTagsRule,
                                                   (2, 2)),
            brill.SymmetricProximateTokensTemplate(brill.ProximateTagsRule,
                                                   (1, 2)),
            brill.SymmetricProximateTokensTemplate(brill.ProximateTagsRule,
                                                   (1, 3)),
            brill.SymmetricProximateTokensTemplate(brill.ProximateWordsRule,
                                                   (1, 1)),
            brill.SymmetricProximateTokensTemplate(brill.ProximateWordsRule,
                                                   (2, 2)),
            brill.SymmetricProximateTokensTemplate(brill.ProximateWordsRule,
                                                   (1, 2)),
            brill.SymmetricProximateTokensTemplate(brill.ProximateWordsRule,
                                                   (1, 3)),
            brill.ProximateTokensTemplate(brill.ProximateTagsRule, (-1, -1),
                                          (1, 1)),
            brill.ProximateTokensTemplate(brill.ProximateWordsRule, (-1, -1),
                                          (1, 1))
        ]

        trainer = brill.FastBrillTaggerTrainer(trigram_fallback, templates)
        self.tagger = trainer.train(sentence_list, max_rules=100, min_score=3)
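This example targets the pre-NLTK-3 brill module: SymmetricProximateTokensTemplate, ProximateTagsRule, ProximateWordsRule, and FastBrillTaggerTrainer were removed in NLTK 3. A rough NLTK 3 equivalent, a sketch assuming the same trigram_fallback chain, swaps in one of the bundled template sets:

from nltk.tag.brill import brill24
from nltk.tag.brill_trainer import BrillTaggerTrainer

# brill24() returns a list of 24 ready-made templates over Pos/Word features
trainer = BrillTaggerTrainer(trigram_fallback, brill24(), trace=1)
tagger = trainer.train(sentence_list, max_rules=100, min_score=3)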
Code Example #3
File: pos_tagger.py  Project: jkussmann/ner_tagger
def train_tagger():
    """
    This function trains the tagger.
    """
    print("Training POS tagger...")
    # https://github.com/japerk/nltk3-cookbook/blob/master/chapter4.py

    tagged_sentences = treebank.tagged_sents()
    size = int(len(tagged_sentences) * 0.9)
    train_sents = tagged_sentences[:size]
    # hold out the remaining 10% so the test set does not overlap the training slice
    test_sents = tagged_sentences[size:]

    default = DefaultTagger("NN")
    tagger = ClassifierBasedPOSTagger(
        train=train_sents, backoff=default, cutoff_prob=0.3
    )
    print(tagger.evaluate(test_sents))  # ~0.96 with the cookbook's 3000-sentence split

    # save model to pickle file as binary
    file_name = MODEL_PATH + "tag_model.pkl"
    with open(file_name, "wb") as fout:
        pickle.dump(tagger, fout)

    print("model written to: " + file_name)
    print("")

    return tagger
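To reuse the pickled model later, a minimal sketch (MODEL_PATH as defined by the surrounding project; the sample tokens are illustrative):

import pickle

with open(MODEL_PATH + "tag_model.pkl", "rb") as fin:
    tagger = pickle.load(fin)
print(tagger.tag(["The", "cat", "sat"]))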
Code Example #4
    def tag(self, sent, tagregex=True, deftag='XX', verbose=False):
        # str is already Unicode in Python 3, so no encode/decode round-trip is needed
        text = self.regexTokenizer(sent.lower().strip())

        ## IMPORTANT:
        ## For some use cases the default tag must be left as 'XX'
        ## so that entities can be identified later.
        backoff_tagger = DefaultTagger(deftag)

        if tagregex:
            regexp_tagger = RegexpTagger(patterns.regex_patterns, backoff=backoff_tagger)
            unigram_tagger = UnigramTagger(self.reader_train, backoff=regexp_tagger)
        else:
            unigram_tagger = UnigramTagger(self.reader_train, backoff=backoff_tagger)

        bigram_tagger = BigramTagger(self.reader_train, backoff=unigram_tagger)
        trigram_tagger = TrigramTagger(self.reader_train, backoff=bigram_tagger)

        # Using the Indonesian PAN Localization dataset "UI-1M-tagged.txt",
        # the tagging chain above reaches roughly:
        #     with the regex tagger:    77%
        #     without the regex tagger: > 90%
        if verbose:
            # Accuracy computation gets slower as documents grow;
            # recommended for testing only.
            print("Calculating Tagger Accuracy...")
            self.tagAccuracy = trigram_tagger.evaluate(self.test_sent)
            print("Accuracy is: %s" % self.tagAccuracy)

        return trigram_tagger.tag(text)
Code Example #5
def train():
    # `load` and `dump` are assumed to come from pickle
    try:
        input = open('tagger.pkl', 'rb')
        print("Found tagger")

        tagger = load(input)
        input.close()
    except IOError:
        print('Training:')

        train_sents = brown.tagged_sents()[:50000]
        test_sents = brown.tagged_sents()[50000:]

        tagger_classes = [UnigramTagger, BigramTagger, TrigramTagger]

        tagger = backoff_tagger(train_sents,
                                tagger_classes,
                                backoff=DefaultTagger('unseen'))
        print('Finished training, tagger accuracy:')
        print(tagger.evaluate(test_sents))

        output = open('tagger.pkl', 'wb')
        dump(tagger, output, -1)
        output.close()

    return tagger
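Note: the backoff_tagger helper called here is not part of this snippet; it matches the chain-building function shown in Code Example #23 below.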
Code Example #6
File: q2.py  Project: kyajpauley/ling-539
def getDefaultTaggerAccuracy(testingSet):
    # gets the accuracy of the DefaultTagger

    # get untagged sentences and gold POS tags
    untaggedSentences = [[taggedWord[0] for taggedWord in sentence]
                         for sentence in testingSet]
    goldPOSTags = [[taggedWord[1] for taggedWord in sentence]
                   for sentence in testingSet]

    # declare tagger; honestly this is unnecessary, as every tag is going to be 'NN', so we could
    # really just skip this altogether
    # I went with NN as it was the default value shown in the nltk DefaultTagger documentation; completely arbitrary
    defaultTagger = DefaultTagger('NN')
    defaultTaggedSentences = defaultTagger.tag_sents(untaggedSentences)

    # calculate accuracy
    totalTags = 0
    matches = 0
    # iterate through sentences
    for sentencePOSTags in goldPOSTags:
        # iterate through tags
        for individualPOSTag in sentencePOSTags:
            totalTags += 1
            # if the gold tag is NN, then match
            if individualPOSTag == 'NN':
                matches += 1

    accuracy = (matches / totalTags) * 100
    return accuracy
Code Example #7
def find_accuracy(train_set, test_set):
    # should everything here be the test set?
    train_words = [word for sent in train_set for word in sent]
    train_set_tags = [tag for (word, tag) in train_words]
    train_set_most_frequent_tag = FreqDist(train_set_tags).max()
    default_tagger = DefaultTagger(train_set_most_frequent_tag)
    accuracy_result = default_tagger.evaluate(test_set)
    return accuracy_result
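A usage sketch, assuming NLTK's brown corpus; FreqDist and DefaultTagger are the imports the function body relies on:

from nltk.corpus import brown
from nltk.probability import FreqDist
from nltk.tag import DefaultTagger

sents = brown.tagged_sents(categories='news')
print(find_accuracy(sents[:4000], sents[4000:]))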
Code Example #8
File: app.py  Project: irfanandratama/ChatbotJurusan
def ngramtagging(train):  # POS tagging pipeline
    train_data = []
    train_data.append(train)
    backoff_tagger = DefaultTagger('nn')
    unigram_tagger = UnigramTagger(train_data, backoff=backoff_tagger)
    bigram_tagger = BigramTagger(train_data, backoff=unigram_tagger)
    trigram_tagger = TrigramTagger(train_data, backoff=bigram_tagger)
    return trigram_tagger
Code Example #9
    def test_default_tagger(self):
        test_list = make_sentence_list(path.join(self.test_dir, 'test.tsv'))
        tagger = DefaultTagger('N')
        split = int(len(test_list) * .90)
        train_data = test_list[:split]
        test_data = test_list[split:]
        print(tagger.evaluate(train_data))
        print(tagger.evaluate(test_data))
Code Example #10
    def __init__(self):
        self.backoff = self.backoff_tagger(backoff=DefaultTagger('NN'))
        self.st = StanfordNERTagger(
            'stanfordNERJars/classifiers/english.all.3class.distsim.crf.ser.gz',
            'stanfordNERJars/stanford-ner.jar',
            encoding='utf-8')
        if os.path.exists("out/"):
            shutil.rmtree('out/')
Code Example #11
File: tagger.py  Project: tgem/crowdre-glossary
def train_tagger(tagger_name):
    train_sents = treebank.tagged_sents()[:5000]
    if tagger_name == "TnT" or tagger_name == 'tagger':
        trained_tagger = tnt.TnT()
        trained_tagger.train(train_sents)
    else:
        tagger1 = DefaultTagger('NN')
        tagger2 = TrigramTagger(train_sents, backoff=tagger1)
        tagger3 = BigramTagger(train_sents, backoff=tagger2)
        trained_tagger = UnigramTagger(train_sents, backoff=tagger3)
    return trained_tagger
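Both branches return a tagger exposing the standard tag() interface; a quick sketch with an illustrative token list:

tagger = train_tagger('TnT')
print(tagger.tag(['The', 'quick', 'brown', 'fox']))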
Code Example #12
def baseline(tagged_sentences):
    from nltk.tag import UnigramTagger
    from nltk.tag import DefaultTagger
    from collections import Counter

    # lowercase everything
    # remove all instances of non-universal tags for proper comparison with
    # the other methods
    new_tagged_sentences = []
    for sent in tagged_sentences:
        sent = [(x[0].lower(), x[1]) for x in sent]
        sent = [x for x in sent if x[1] in _UNI]
        new_tagged_sentences.append(sent)
    tagged_sentences = new_tagged_sentences

    # size of corpus
    corpus_size = sum([len(sent) for sent in tagged_sentences])
    print('Corpus size: {} docs'.format(len(tagged_sentences)))
    print('Corpus size: {} tokens'.format(corpus_size))
    
    # train/test split
    test_pct = 0.3
    test_len = int(len(tagged_sentences) * test_pct)
    test_idx = len(tagged_sentences) - test_len
    train_set = tagged_sentences[:test_idx]
    test_set = tagged_sentences[test_idx:]
    print('Train set: {} docs'.format(len(train_set)))
    print('Test set: {} docs'.format(len(test_set)))

    # calculate test set size in tokens
    test_size = sum([len(sent) for sent in test_set])
    print('Test set: {} tokens'.format(test_size))

    # calculate the most common tag in the train set
    # this should be 'NOUN'
    tag_dist = []
    for sent in train_set:
        tag_dist += [x[1] for x in sent]
    counts = Counter()
    counts.update(tag_dist)
    most_common = counts.most_common(1)[0][0]
    print('Most common tag: {}'.format(most_common))

    # Create model
    backoff = DefaultTagger(most_common)
    tagger = UnigramTagger(train=train_set, backoff=backoff, cutoff=5)

    # Evaluate
    acc = tagger.evaluate(test_set)
    print('Baseline: {}'.format(acc))
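Note: _UNI is assumed to be a project-level collection of the universal POS tags (e.g. 'NOUN', 'VERB') used above to filter out non-universal tags.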
Code Example #13
File: aeb_tagging.py  Project: karenlmcneil/Final
def evaluate_nltk_pos_taggers(gold_standard_filename, num_folds=10, loo=False):
    """
    Evaluates the NLTK backoff taggers on the corpus data. Uses cross-validation.
    :param gold_standard_filename: tsv file of format: word \t POS \n
    :param num_folds: int: number of folds for cross-validation
    :param loo: bool: whether to use Leave One Out cross-validation
    :return:
    """
    tagged_sents = make_sentence_list(gold_standard_filename)
    backoff = DefaultTagger('N')
    tagger_classes = [UnigramTagger, BigramTagger, TrigramTagger]
    scores = {
        'DefaultTagger': [],
        'UnigramTagger': [],
        'BigramTagger': [],
        'TrigramTagger': [],
        'BrillTagger': [],
    }

    # k-fold cross-validation
    if loo:  # Leave One Out cross-validation
        num_folds = len(tagged_sents)-1
    subset_size = int(len(tagged_sents) / num_folds)
    for i in range(num_folds):

        # training and testing data for this round
        X_test = tagged_sents[i * subset_size:][:subset_size]
        X_train = tagged_sents[:i * subset_size] + tagged_sents[(i + 1) * subset_size:]

        # compute score for taggers
        default_score = backoff.evaluate(X_train)
        trigram, tagger_scores = backoff_tagger(X_train, X_test,
                                                tagger_classes, backoff=backoff)
        uni_score, bi_score, tri_score = tagger_scores
        brill_tagger = train_brill_tagger(trigram, X_train)
        brill_score = brill_tagger.evaluate(X_test)
        brill_tagger.print_template_statistics(printunused=False)

        # save scores
        scores['DefaultTagger'].append(default_score)
        scores['UnigramTagger'].append(uni_score)
        scores['BigramTagger'].append(bi_score)
        scores['TrigramTagger'].append(tri_score)
        scores['BrillTagger'].append(brill_score)

    for k, v in scores.items():  # average scores across folds
        if v:
            scores[k] = sum(v)/len(v)
            print(k, ": {:2.2%}".format(scores[k]))
    return scores
Code Example #14
def find_combined_taggers_accuracy(train_set, test_set):
    # finding most used tag
    train_words = [word for sent in train_set for word in sent]
    train_set_tags = [tag for (word, tag) in train_words]
    most_frequent_tag = FreqDist(train_set_tags).max()
    default_tagger = DefaultTagger(most_frequent_tag)

    # default tagger
    default_tagger_result = default_tagger.evaluate(test_set)
    print("Default Tagger accuracy: ", default_tagger_result)

    # regex tagger
    patterns = [
        (r'.*ing$', 'VBG'),  # gerunds
        (r'.*ed$', 'VBD'),  # simple past
        (r'.*es$', 'VBZ'),  # 3rd singular present
        (r'.*ould$', 'MD'),  # modals
        (r'.*\'s$', 'NN$'),  # possessive nouns
        (r'.*s$', 'NNS'),  # plural nouns
        (r'^-?[0-9]+(\.[0-9]+)?$', 'CD'),  # cardinal numbers
        (r'.*', 'NN')  # nouns (default)
    ]
    regex_tagger = RegexpTagger(patterns)
    regex_tagger_result = regex_tagger.evaluate(test_set)
    print("Regex Tagger Accuracy: ", regex_tagger_result)

    # unigram tagger with default tagger as backoff
    unigram_tagger = UnigramTagger(train_set, backoff=default_tagger)
    unigram_tagger_result = unigram_tagger.evaluate(test_set)
    print("Unigram Tagger accuracy (Backoff = Default Tagger): ",
          unigram_tagger_result)

    # bigram tagger with different backoffs
    bigram_tagger = BigramTagger(train_set)
    bigram_tagger_backoff_unigram = BigramTagger(train_set,
                                                 backoff=unigram_tagger)
    bigram_tagger_backoff_regex = BigramTagger(train_set, backoff=regex_tagger)

    bigram_tagger_result = bigram_tagger.evaluate(test_set)
    bigram_tagger_backoff_regex_result = bigram_tagger_backoff_regex.evaluate(
        test_set)
    bigram_tagger_backoff_unigram_result = bigram_tagger_backoff_unigram.evaluate(
        test_set)

    print("Bigram Tagger Accuracy: ", bigram_tagger_result)
    print("Bigram Tagger Accuracy (Backoff = Regex Tagger): ",
          bigram_tagger_backoff_regex_result)
    print("Bigram Tagger Accuracy (Backoff = Unigram Tagger): ",
          bigram_tagger_backoff_unigram_result)
Code Example #15
def tag_words(words, tag):
    """
    Associates a tag with words.

    Parameters
    ----------
    words: A list of strings.
    tag: A str.

    Returns
    -------
    A list of tuples of (str, str)
    """

    default_tagger = DefaultTagger(tag)
    tags = default_tagger.tag(words)

    return tags
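Since DefaultTagger assigns the same tag to every token, the output is fully predictable; an illustrative call:

>>> tag_words(['the', 'cat', 'sat'], 'NN')
[('the', 'NN'), ('cat', 'NN'), ('sat', 'NN')]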
Code Example #16
    def wordTagger(self, wordlist, number):
        train_sents = treebank.tagged_sents()[:3000]
        if number == 1:
            taglist = nltk.pos_tag(wordlist)
        elif number == 2:
            tagger = DefaultTagger('NN')
            taglist = tagger.tag(wordlist)
        elif number == 3:
            tagger = UnigramTagger(train_sents)
            taglist = tagger.tag(wordlist)
        elif number == 4:
            tnt_tagger = tnt.TnT()
            tnt_tagger.train(train_sents)
            taglist = tnt_tagger.tag(wordlist)
        elif number == 5:
            tagger = ClassifierBasedPOSTagger(train=train_sents)
            taglist = tagger.tag(wordlist)
        else:
            # guard against `taglist` being unbound for out-of-range values
            raise ValueError("number must be between 1 and 5")
        return taglist
Code Example #17
def lexical(tokens):
    print("\n")
    print("Step 2: Lexical Analysis\n")
    print("Essentially refers to the dictionary and obtains the properties of the word")
    print("Part-Of-Speech tagging")
    print("The tagset is:\n")

    tag = DefaultTagger('NN')
    tagg = UnigramTagger(train_sent, backoff=tag)
    tagger = BigramTagger(train_sent, backoff=tagg)

    tagtokens = tagger.tag(tokens)
    for token, tag in tagtokens:
        print(token + "->" + tag)
    print("\n")
    print("The accuracy of the trained POS tagger is:")
    print(tagger.evaluate(test_sents))

    return tagtokens
Code Example #18
def tag_linked(words, default_tag='INFO'):
    """
    Tags words with a UnigramTagger trained on the Penn Treebank tagged sentences.
    Uses DefaultTagger to assign `default_tag` to any token the treebank
    tagger misses.

    Parameters
    ----------
    words: A list of strings.
    default_tag: The fallback tag (default 'INFO').

    Returns
    -------
    A list of tuples of (str, str)
    """

    default_tagger = DefaultTagger(default_tag)
    pt_tagger = UnigramTagger(treebank.tagged_sents())

    # Splices the default tagger in as a fallback by writing to the private
    # _taggers attribute (see the note after this example).
    pt_tagger._taggers = [pt_tagger, default_tagger]

    tags = pt_tagger.tag(words)

    return tags
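The _taggers assignment above reaches into a private attribute to splice in the fallback; the documented route, as a minimal sketch, is the backoff constructor parameter:

default_tagger = DefaultTagger(default_tag)
pt_tagger = UnigramTagger(treebank.tagged_sents(), backoff=default_tagger)
tags = pt_tagger.tag(words)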
Code Example #19
def train_tagger():
    '''
    An example of training a POS tagger using a probability-based
    trigram model.

    A POS tagger identifies the class of each word.
    E.g.: Isso é um teste = Isso-PROSUB é-V um-ART teste-N
    (preposition, verb, article, noun)
    '''

    # Load a Portuguese dataset with manually tagged sentences,
    # keeping only the first part of each composite tag
    data = [
        [(w, re.split('[|-]', tag)[0]) for w, tag in sent]
        for sent in mac_morpho.tagged_sents()]

    # Default syntactic class; N means noun/substantive
    tagger0 = DefaultTagger('N')
    print('training unigram')
    tagger1 = UnigramTagger(data, backoff=tagger0)
    print('training bigram')
    tagger2 = BigramTagger(data, backoff=tagger1)
    print('training trigram')
    return TrigramTagger(data, backoff=tagger2)
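A usage sketch; the sample sentence is the one from the docstring above:

tagger = train_tagger()
print(tagger.tag('Isso é um teste'.split()))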
Code Example #20
from nltk.metrics import *
import string
'''import replacer
from replacer import RegexpReplacer
from replacer import RepeatReplacer'''
import linecache
import matplotlib.pyplot as plt
'''
Train Tagger
'''
from nltk.tag import DefaultTagger
from nltk.tag import UnigramTagger
from nltk.tag import BigramTagger
from nltk.corpus import treebank
train = treebank.tagged_sents()[:10000]
t0 = DefaultTagger('NN')
t1 = UnigramTagger(train, backoff=t0)
t2 = BigramTagger(train, backoff=t1)
'''
Initialize
'''
# `web` is assumed to be nltk.corpus.webtext, which ships 'firefox.txt'
from nltk.corpus import webtext as web
my_corp = web.sents(fileids='firefox.txt')
sent_count = 0
ques_count = 0
All_count = 1
NN_count = 0
NNS_count = 0
NNP_count = 0
VB_count = 0
VBN_count = 0
VBG_count = 0
Code Example #21
    ('BASIS', 'BASIS'),
    ('CORPORATE', 'CORPORATE'),
    ('OTHER|other', 'OTHER'),
    ('LAST|last', 'LAST'),
    (r'POS', 'POINT_OF_SALE'),
    (r'AFTERCOMPANY|CORPORATE|COMPANY', 'CORPORATE'),
    (r'DISCOUNT|COMMISSIO|COMMISSIONS|COMISSION|discount|discounts|Commissionable|commissionable',
     'DISCOUNT'),
    (r'ASIA|Asia|asia|AISA|亚洲', 'ASIA'),
    (r'NORTH|North|north', 'NORTH'),
    (r'SOUTH|South|south', 'SOUTH')

    #            ('TICKET','VALIDITY')
]  # add learning loop here for tags

def_tagger = DefaultTagger('NN')
prelim_def_tagger = DefaultTagger(None)

backoff = RegexpTagger(
    [
        (r'^-?[0-9]+(.[0-9]+)?$', 'CD'),  # cardinal numbers
        (r'(The|the|A|a|An|an)$', 'AT'),  # articles
        (r'.*able$', 'JJ'),  # adjectives
        (r'.*ness$', 'NN'),  # nouns formed from adjectives
        (r'.*ly$', 'RB'),  # adverbs
        (r'.*s$', 'NNS'),  # plural nouns
        (r'.*ing$', 'VBG'),  # gerunds
        (r'.*ed$', 'VBD'),  # past tense verbs
        (r'is|was|are|were', 'VBZ'),  # verb to be
        (r'"', 'QT'),  # quote
        (r'.*', 'NN')  # nouns (default)
Code Example #22
def Tagger():
    # Unigram tagger over mac_morpho with a noun default as backoff
    etiq1 = DefaultTagger('N')
    sentencas_treinadoras = mac_morpho.tagged_sents()[::]
    etiq2 = UnigramTagger(sentencas_treinadoras, backoff=etiq1)
    return etiq2
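Usage sketch with an illustrative (hypothetical) Portuguese sentence:

etiquetador = Tagger()
print(etiquetador.tag('o gato comeu o peixe'.split()))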
Code Example #23
train_sents = treebank.tagged_sents()[:3000]
test_sents = treebank.tagged_sents()[3000:]

train_brown = nltk.corpus.brown.tagged_sents()[0:5000]
test_brown = nltk.corpus.brown.tagged_sents()[5000:]

from nltk.tag import DefaultTagger, UnigramTagger, BigramTagger, TrigramTagger


def backoff_tagger(train_sents, tagger_classes, backoff=None):
    for cls in tagger_classes:
        backoff = cls(train_sents, backoff=backoff)
    # return after the loop so the whole backoff chain is built,
    # not just the first tagger class
    return backoff


backoff = DefaultTagger('NN')
btagger = backoff_tagger(train_sents,
                         [UnigramTagger, BigramTagger, TrigramTagger],
                         backoff=backoff)

# print(btagger.evaluate(test_sents))

tnt_tagger = nltk.tag.tnt.TnT()
tnt_tagger.train(train_sents)

t_tagger_brown = nltk.tag.tnt.TnT()
t_tagger_brown.train(train_brown)


def readEssays(filename):
    infile = open(filename, 'r')
Code Example #24
def backoff_tagger(train_sents, tagger_classes):
    backoff = DefaultTagger('NN')
    for cls in tagger_classes:
        backoff = cls(train_sents, backoff=backoff)
    return backoff
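A usage sketch with the treebank corpus, assuming the standard NLTK imports:

from nltk.corpus import treebank
from nltk.tag import DefaultTagger, UnigramTagger, BigramTagger, TrigramTagger

tagger = backoff_tagger(treebank.tagged_sents()[:3000],
                        [UnigramTagger, BigramTagger, TrigramTagger])
print(tagger.evaluate(treebank.tagged_sents()[3000:]))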
Code Example #25

train_sents = update_tags(train_sents)
val_sents = update_tags(val_sents)
test_sents = update_tags(test_sents)
"""
# =============================================================================
# finalise a sequential tagger
# =============================================================================
"""
""" 1. run tagger with different corpus size (50% and 100%) """
# backoff tagger
tag1_eval = dict()
# train with backoff and Brill
tic()
tag1_tagger = DefaultTagger('NO')
tag1_tagger = AffixTagger(train_sents, affix_length=-1, backoff=tag1_tagger)
tag1_tagger = AffixTagger(train_sents, affix_length=-2, backoff=tag1_tagger)
tag1_tagger = AffixTagger(train_sents, affix_length=-3, backoff=tag1_tagger)
tag1_tagger = AffixTagger(train_sents, affix_length=-4, backoff=tag1_tagger)
tag1_tagger = AffixTagger(train_sents, affix_length=-5, backoff=tag1_tagger)
tag1_tagger = UnigramTagger(train_sents, cutoff=3, backoff=tag1_tagger)
tag1_tagger = BigramTagger(train_sents, backoff=tag1_tagger)
tag1_tagger = TrigramTagger(train_sents, backoff=tag1_tagger)
tag1b_tagger = train_brill_tagger(tag1_tagger,
                                  train_sents,
                                  True,
                                  max_rules=100)
tag1_eval['train_time'] = toc()
# test
tic()
Code Example #26
from nltk.tag import tnt, DefaultTagger
import pickle

datas = open('Indonesian_Manually_Tagged_Corpus.tsv', 'r').read()
datas = datas.split('\n\n')

train_sents = []

for data in datas:
    train_sents.append(list(tuple(i.split('\t')) for i in data.split('\n')))

unk = DefaultTagger('NN')
tnt_tagger = tnt.TnT(unk=unk, Trained=True)
tnt_tagger.train(train_sents)
tagger_file = open("indonesian_tnt_pos_tag.pickle", "wb")
pickle.dump(tnt_tagger, tagger_file)
tagger_file.close()
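Reloading the pickled TnT tagger later, as a minimal sketch (the Indonesian sample sentence is illustrative):

import pickle

with open("indonesian_tnt_pos_tag.pickle", "rb") as f:
    tagger = pickle.load(f)
print(tagger.tag("saya makan nasi".split()))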
Code Example #27
File: phase2.py  Project: RGaonkar/bookmark_me
f.write("\n".join(caps))

f.close()

#adding the tagger

import nltk

from nltk.corpus import treebank

from nltk.tag import DefaultTagger

from nltk.tag.sequential import ClassifierBasedPOSTagger

default = DefaultTagger('NN')

train_sents = treebank.tagged_sents()[:3000]

test_sents = treebank.tagged_sents()[3000:]

tagger = ClassifierBasedPOSTagger(train=train_sents,
                                  backoff=default,
                                  cutoff_prob=0.3)

print(tagger.evaluate(test_sents))

#token = nltk.word_tokenize(title)  #title string tokenized

#removing all the punctuation  marks
Code Example #28
for page in list(root):
    l = []
    # ElementTree's .text is already str in Python 3, so no decode is needed
    text = page.find('text').text
    language = page.find('language').text
    pos = page.find('pos_tags').text
    splitText = text.split(" ")[1:-1]
    posText = pos.split(" ")[1:-1]
    for i in range(len(splitText)):
        l.append((splitText[i], posText[i]))
    data.append(l)
    count = count + 1
shuffle(data)

# Divide data into train (90%) and test (10%) sets
trainSize = count * 0.9
training_set = data[0:int(trainSize)]
test_set = data[int(trainSize):]

# Train
train_data = training_set
tag1 = DefaultTagger('NN')
tag2 = UnigramTagger(train_data, backoff=tag1)
tag3 = BigramTagger(train_data, backoff=tag2)
tag4 = TrigramTagger(train_data, backoff=tag3)

# Accuracy
# print(tag4.tag('open a start up'.split()))
# print(tag4.tag('OUT nahi KARDO ISSE BAHUT HOGAYA aaj Salman'.split()))
gold_sentences = test_set
print(tag4.evaluate(gold_sentences))
Code Example #29
    def setUp(self):
        self.corpus = brown.tagged_sents()[:35]
        self.decoder = JSONTaggedDecoder()
        self.encoder = JSONTaggedEncoder()
        self.default_tagger = DefaultTagger("NN")
Code Example #30
from nltk.tag import DefaultTagger, UnigramTagger, BigramTagger, TrigramTagger
from nltk.corpus import treebank
from nltk.corpus import wordnet as wn
from os.path import isfile, join
from os import listdir
from pprint import pprint
import gensim.downloader as api
import re
import nltk
import os

TEST_PATH = '../test/untagged'
COMMON_WORDS_PATH = '../resources/1-1000.txt'

TRAINING_SENTS = treebank.tagged_sents()
UNIGRAM = UnigramTagger(TRAINING_SENTS, backoff=DefaultTagger('NN'))
BIGRAM = BigramTagger(TRAINING_SENTS, backoff=UNIGRAM)
TRIGRAM = TrigramTagger(TRAINING_SENTS, backoff=BIGRAM)

STOPWORDS = set(nltk.corpus.stopwords.words('english'))
WORD_VECTORS = api.load("glove-wiki-gigaword-100")
TEST_FILES = [f for f in listdir(TEST_PATH) if isfile(join(TEST_PATH, f))]

# Manual list of words to be considered "irrelevant"
IRRELEVANT_WORDS = ["talk", "seminar", "lecture"]

# manually created ontology tree, which is later extended
TREE = {"science": {}, "maths": {}, "engineering": {}, "medicine": {}}

# code to convert POS tags into the right form for lemmatization
# https://stackoverflow.com/questions/25534214/nltk-wordnet-lemmatizer-shouldnt-it-lemmatize-all-inflections-of-a-word