Example #1
# Imports assumed by this snippet.
from nltk.tag import DefaultTagger, UnigramTagger
from nltk.tag.brill import Template, Pos, Word
from nltk.tag.brill_trainer import BrillTaggerTrainer


def train(train_sentences):
    print("- Default Tagger")
    default_tagger = DefaultTagger('NC')

    print "- Unigram Tagger"
    unigram_tagger = UnigramTagger(train_sentences, backoff=default_tagger)

    print "- Templates"
    #These templates define the features to be used for the brill tagger
    # relatively to the word position.
    Template._cleartemplates()
    templates = [
        Template(Pos([-1])),
        Template(Pos([-1]), Word([0])),
        Template(Pos([-2])),
        Template(Pos([-2]), Word([0])),
        Template(Pos([1])),
    ]
    print "- Brill Tagger"
    tt = BrillTaggerTrainer(unigram_tagger, templates, trace=1)
    tagger = tt.train(train_sentences, max_rules=1000)

    print "- Done."

    return tagger
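A minimal usage sketch (the treebank corpus and the 3000-sentence split are assumptions, not part of the original):

# Hypothetical usage: train on NLTK's Penn Treebank sample, then tag a sentence.
from nltk.corpus import treebank

tagger = train(treebank.tagged_sents()[:3000])
print(tagger.tag(["The", "cat", "sat"]))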
Example #2
    def train(self, sentence_list):
        """Trains the tagger from the tagged sentences provided
        """
        noun_fallback = DefaultTagger('NN')
        affix_fallback = AffixTagger(sentence_list, backoff=noun_fallback)
        unigram_fallback = UnigramTagger(sentence_list, backoff=affix_fallback)
        bigram_fallback = BigramTagger(sentence_list, backoff=unigram_fallback)
        trigram_fallback = TrigramTagger(sentence_list,
                                         backoff=bigram_fallback)
        templates = [
            brill.SymmetricProximateTokensTemplate(brill.ProximateTagsRule,
                                                   (1, 1)),
            brill.SymmetricProximateTokensTemplate(brill.ProximateTagsRule,
                                                   (2, 2)),
            brill.SymmetricProximateTokensTemplate(brill.ProximateTagsRule,
                                                   (1, 2)),
            brill.SymmetricProximateTokensTemplate(brill.ProximateTagsRule,
                                                   (1, 3)),
            brill.SymmetricProximateTokensTemplate(brill.ProximateWordsRule,
                                                   (1, 1)),
            brill.SymmetricProximateTokensTemplate(brill.ProximateWordsRule,
                                                   (2, 2)),
            brill.SymmetricProximateTokensTemplate(brill.ProximateWordsRule,
                                                   (1, 2)),
            brill.SymmetricProximateTokensTemplate(brill.ProximateWordsRule,
                                                   (1, 3)),
            brill.ProximateTokensTemplate(brill.ProximateTagsRule, (-1, -1),
                                          (1, 1)),
            brill.ProximateTokensTemplate(brill.ProximateWordsRule, (-1, -1),
                                          (1, 1))
        ]

        trainer = brill.FastBrillTaggerTrainer(trigram_fallback, templates)
        self.tagger = trainer.train(sentence_list, max_rules=100, min_score=3)
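Note that SymmetricProximateTokensTemplate, ProximateTagsRule and FastBrillTaggerTrainer belong to the NLTK 2 brill API, which was removed in NLTK 3. A sketch of an approximate NLTK 3 equivalent of the trainer call above (brill24() is a stock 24-template set, not a faithful copy of the template list):

# Rough NLTK 3 port of the trainer above; assumes the same trigram_fallback chain.
from nltk.tag.brill import brill24
from nltk.tag.brill_trainer import BrillTaggerTrainer

trainer = BrillTaggerTrainer(trigram_fallback, brill24())
tagger = trainer.train(sentence_list, max_rules=100, min_score=3)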
Example #3
def train_tagger():
    """
    This function trains the tagger.
    """
    print("Training POS tagger...")
    # https://github.com/japerk/nltk3-cookbook/blob/master/chapter4.py

    tagged_sentences = treebank.tagged_sents()
    size = int(len(tagged_sentences) * 0.9)
    train_sents = tagged_sentences[:size]
    test_sents = tagged_sentences[size:]  # held-out 10%, disjoint from the training slice

    default = DefaultTagger("NN")
    tagger = ClassifierBasedPOSTagger(
        train=train_sents, backoff=default, cutoff_prob=0.3
    )
    print(tagger.evaluate(test_sents))  # roughly 0.96 on the treebank sample

    # save model to pickle file as binary
    file_name = MODEL_PATH + "tag_model.pkl"
    with open(file_name, "wb") as fout:
        pickle.dump(tagger, fout)

    print("model written to: " + file_name)
    print("")

    return tagger
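Reloading the pickled model later is symmetric (a sketch; MODEL_PATH as assumed above):

# Load the tagger back from the pickle file and tag a sentence.
import pickle

with open(MODEL_PATH + "tag_model.pkl", "rb") as fin:
    tagger = pickle.load(fin)
print(tagger.tag(["This", "is", "a", "test"]))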
Example #4
    def tag(self, sent, tagregex=True, deftag='XX', verbose=False):
        kalimat = sent.encode('utf-8')

        text = self.regexTokenizer(kalimat.lower().strip())

        ## :> --___<<IMPORTANT>>___--
        ##      For some purposes the default tag must be left as 'XX',
        ##      for the purpose of entity identification.
        backoff_tagger = DefaultTagger(deftag)

        if tagregex:
           regexp_tagger = RegexpTagger(patterns.regex_patterns,backoff=backoff_tagger)
           unigram_tagger = UnigramTagger(self.reader_train, backoff=regexp_tagger)
        else:
           unigram_tagger = UnigramTagger(self.reader_train, backoff=backoff_tagger)
           
        bigram_tagger = BigramTagger(self.reader_train, backoff=unigram_tagger)
        trigram_tagger = TrigramTagger(self.reader_train, backoff=bigram_tagger)
        
        """
        # Menggunakan dataset pan localization bahasa indonesia "UI-1M-tagged.txt"
        # kombinasi proses tagging diatas menghasilkan tingkat akurasi:
        #      dengan regextagger: 77%
        #      tanpa regextagger : > 90%
        """
        if verbose:
           # The bigger the document, the longer the accuracy computation takes;
           # recommended for testing only.
           print("Calculating Tagger Accuracy...")
           self.tagAccuracy = trigram_tagger.evaluate(self.test_sent)
           print("Accuracy is: %s" % (self.tagAccuracy))
        
        return trigram_tagger.tag(text)
Example #5
def train():
    try:
        input = open('tagger.pkl', 'rb')
        print("Found tagger")

        tagger = load(input)
        input.close()
    except IOError:
        print('Training:')

        train_sents = brown.tagged_sents()[:50000]
        test_sents = brown.tagged_sents()[50000:]

        tagger_classes = [UnigramTagger, BigramTagger, TrigramTagger]

        tagger = backoff_tagger(train_sents,
                                tagger_classes,
                                backoff=DefaultTagger('unseen'))
        print('Finished training, tagger accuracy:')
        print(tagger.evaluate(test_sents))

        output = open('tagger.pkl', 'wb')
        dump(tagger, output, -1)
        output.close()

    return tagger
Example #6
def getDefaultTaggerAccuracy(testingSet):
    # gets the accuracy of the DefaultTagger

    # get untagged sentences and gold POS tags
    untaggedSentences = [[taggedWord[0] for taggedWord in sentence]
                         for sentence in testingSet]
    goldPOSTags = [[taggedWord[1] for taggedWord in sentence]
                   for sentence in testingSet]

    # declare tagger; honestly this is unnecessary, as every tag is going to be
    # 'NN', so we could really just skip this altogether.
    # I went with NN as it was the default value shown in the nltk DefaultTagger
    # documentation; the choice is completely arbitrary.
    defaultTagger = DefaultTagger('NN')
    defaultTaggedSentences = defaultTagger.tag_sents(untaggedSentences)

    # calculate accuracy
    totalTags = 0
    matches = 0
    # iterate through sentences
    for sentencePOSTags in goldPOSTags:
        # iterate through tags
        for individualPOSTag in sentencePOSTags:
            totalTags += 1
            # if the gold tag is NN, then match
            if individualPOSTag == 'NN':
                matches += 1

    accuracy = (matches / totalTags) * 100
    return accuracy
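The manual count above never actually uses defaultTaggedSentences; it reproduces what NLTK's built-in scorer computes directly. A one-line equivalent under the same 'NN' assumption:

# Equivalent accuracy via the built-in scorer (returns a fraction, hence * 100).
accuracy = DefaultTagger('NN').evaluate(testingSet) * 100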
Example #7
def find_accuracy(train_set, test_set):
    # should everything here be the test set?
    train_words = [word for sent in train_set for word in sent]
    train_set_tags = [tag for (word, tag) in train_words]
    train_set_most_frequent_tag = FreqDist(train_set_tags).max()
    default_tagger = DefaultTagger(train_set_most_frequent_tag)
    accuracy_result = default_tagger.evaluate(test_set)
    return accuracy_result
Example #8
def ngramtagging(train):  # POS tagging process
    train_data = []
    train_data.append(train)
    backoff_tagger = DefaultTagger('nn')
    unigram_tagger = UnigramTagger(train_data, backoff=backoff_tagger)
    bigram_tagger = BigramTagger(train_data, backoff=unigram_tagger)
    trigram_tagger = TrigramTagger(train_data, backoff=bigram_tagger)
    return trigram_tagger
Example #9
 def test_default_tagger(self):
     test_list = make_sentence_list(path.join(self.test_dir, 'test.tsv'))
     tagger = DefaultTagger('N')
     split = int(len(test_list) * .90)
     train_data = test_list[:split]
     test_data = test_list[split:]
     print(tagger.evaluate(train_data))
     print(tagger.evaluate(test_data))
Example #10
 def __init__(self):
     self.backoff = self.backoff_tagger(backoff=DefaultTagger('NN'))
     self.st = StanfordNERTagger(
         'stanfordNERJars/classifiers/english.all.3class.distsim.crf.ser.gz',
         'stanfordNERJars/stanford-ner.jar',
         encoding='utf-8')
     if os.path.exists("out/"):
         shutil.rmtree('out/')
Example #11
def train_tagger(tagger_name):
    train_sents = treebank.tagged_sents()[:5000]
    if tagger_name == "TnT" or tagger_name == 'tagger':
        trained_tagger = tnt.TnT()
        trained_tagger.train(train_sents)
    else:
        tagger1 = DefaultTagger('NN')
        tagger2 = TrigramTagger(train_sents, backoff=tagger1)
        tagger3 = BigramTagger(train_sents, backoff=tagger2)
        trained_tagger = UnigramTagger(train_sents, backoff=tagger3)
    return trained_tagger
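Note that this backoff chain is ordered unigram -> bigram -> trigram -> default, the reverse of the usual arrangement; a sketch of the more common ordering with the same classes:

# Conventional ordering: the most context-specific tagger sits on top.
t0 = DefaultTagger('NN')
t1 = UnigramTagger(train_sents, backoff=t0)
t2 = BigramTagger(train_sents, backoff=t1)
trained_tagger = TrigramTagger(train_sents, backoff=t2)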
Example #12
def baseline(tagged_sentences):
    from nltk.tag import UnigramTagger
    from nltk.tag import DefaultTagger
    from collections import Counter

    # lowercase everything
    # remove all instances of non-universal tags for proper comparison with
    # the other methods
    new_tagged_sentences = []
    for sent in tagged_sentences:
        sent = [(x[0].lower(), x[1]) for x in sent]
        sent = [x for x in sent if x[1] in _UNI]
        new_tagged_sentences.append(sent)
    tagged_sentences = new_tagged_sentences

    # size of corpus
    corpus_size = sum([len(sent) for sent in tagged_sentences])
    print('Corpus size: {} docs'.format(len(tagged_sentences)))
    print('Corpus size: {} tokens'.format(corpus_size))
    
    # train/test split
    test_pct = 0.3
    test_len = int(len(tagged_sentences) * test_pct)
    test_idx = len(tagged_sentences) - test_len
    train_set = tagged_sentences[:test_idx]
    test_set = tagged_sentences[test_idx:]
    print('Train set: {} docs'.format(len(train_set)))
    print('Test set: {} docs'.format(len(test_set)))

    # calculate test set size in tokens
    test_size = sum([len(sent) for sent in test_set])
    print('Test set: {} tokens'.format(test_size))

    # calculate the most common tag in the train set
    # this should be 'NOUN'
    tag_dist = []
    for sent in train_set:
        tag_dist += [x[1] for x in sent]
    counts = Counter()
    counts.update(tag_dist)
    most_common = counts.most_common(1)[0][0]
    print('Most common tag: {}'.format(most_common))

    # Create model
    backoff = DefaultTagger(most_common)
    tagger = UnigramTagger(train=train_set, backoff=backoff, cutoff=5)

    # Evaluate
    acc = tagger.evaluate(test_set)
    print('Baseline: {}'.format(acc))
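_UNI is referenced but never defined in this snippet; presumably it holds the universal POS tagset, along these lines:

# Assumed definition of _UNI: the 12-tag universal tagset of Petrov et al.
_UNI = {'ADJ', 'ADP', 'ADV', 'CONJ', 'DET', 'NOUN',
        'NUM', 'PRON', 'PRT', 'VERB', 'X', '.'}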
Example #13
def evaluate_nltk_pos_taggers(gold_standard_filename, num_folds=10, loo=False):
    """
    Evaluates the NLTK backoff taggers on the corpus data. Uses cross-validation.
    :param gold_standard_filename: tsv file of format: word \t POS \n
    :param num_folds: int: number of folds for cross-validation
    :param loo: bool: whether to use Leave One Out cross-validation
    :return:
    """
    tagged_sents = make_sentence_list(gold_standard_filename)
    backoff = DefaultTagger('N')
    tagger_classes = [UnigramTagger, BigramTagger, TrigramTagger]
    scores = {
        'DefaultTagger': [],
        'UnigramTagger': [],
        'BigramTagger': [],
        'TrigramTagger': [],
        'BrillTagger': [],
    }

    # k-fold cross-validation
    if loo:  # Leave One Out cross-validation
        num_folds = len(tagged_sents)
    subset_size = int(len(tagged_sents) / num_folds)
    for i in range(num_folds):

        # training and testing data for this round
        X_test = tagged_sents[i * subset_size:][:subset_size]
        X_train = tagged_sents[:i * subset_size] + tagged_sents[(i + 1) * subset_size:]

        # compute score for taggers
        default_score = backoff.evaluate(X_test)
        trigram, tagger_scores = backoff_tagger(X_train, X_test,
                                                tagger_classes, backoff=backoff)
        uni_score, bi_score, tri_score = tagger_scores
        brill_tagger = train_brill_tagger(trigram, X_train)
        brill_score = brill_tagger.evaluate(X_test)
        brill_tagger.print_template_statistics(printunused=False)

        # save scores
        scores['DefaultTagger'].append(default_score)
        scores['UnigramTagger'].append(uni_score)
        scores['BigramTagger'].append(bi_score)
        scores['TrigramTagger'].append(tri_score)
        scores['BrillTagger'].append(brill_score)

    for k, v in scores.items():  # average scores across folds
        if v:
            scores[k] = sum(v)/len(v)
            print(k, ": {:2.2%}".format(scores[k]))
    return scores
Example #14
def find_combined_taggers_accuracy(train_set, test_set):
    # finding most used tag
    train_words = [word for sent in train_set for word in sent]
    train_set_tags = [tag for (word, tag) in train_words]
    most_frequent_tag = FreqDist(train_set_tags).max()
    default_tagger = DefaultTagger(most_frequent_tag)

    # default tagger
    default_tagger_result = default_tagger.evaluate(test_set)
    print("Default Tagger accuracy: ", default_tagger_result)

    # regex tagger
    patterns = [
        (r'.*ing$', 'VBG'),  # gerunds
        (r'.*ed$', 'VBD'),  # simple past
        (r'.*es$', 'VBZ'),  # 3rd singular present
        (r'.*ould$', 'MD'),  # modals
        (r'.*\'s$', 'NN$'),  # possessive nouns
        (r'.*s$', 'NNS'),  # plural nouns
        (r'^-?[0-9]+(\.[0-9]+)?$', 'CD'),  # cardinal numbers
        (r'.*', 'NN')  # nouns (default)
    ]
    regex_tagger = RegexpTagger(patterns)
    regex_tagger_result = regex_tagger.evaluate(test_set)
    print("Regex Tagger Accuracy: ", regex_tagger_result)

    # unigram tagger with default tagger as backoff
    unigram_tagger = UnigramTagger(train_set, backoff=default_tagger)
    unigram_tagger_result = unigram_tagger.evaluate(test_set)
    print("Unigram Tagger accuracy (Backoff = Default Tagger): ",
          unigram_tagger_result)

    # bigram tagger with different backoffs
    bigram_tagger = BigramTagger(train_set)
    bigram_tagger_backoff_unigram = BigramTagger(train_set,
                                                 backoff=unigram_tagger)
    bigram_tagger_backoff_regex = BigramTagger(train_set, backoff=regex_tagger)

    bigram_tagger_result = bigram_tagger.evaluate(test_set)
    bigram_tagger_backoff_regex_result = bigram_tagger_backoff_regex.evaluate(
        test_set)
    bigram_tagger_backoff_unigram_result = bigram_tagger_backoff_unigram.evaluate(
        test_set)

    print("Bigram Tagger Accuracy: ", bigram_tagger_result)
    print("Bigram Tagger Accuracy (Backoff = Regex Tagger): ",
          bigram_tagger_backoff_regex_result)
    print("Bigram Tagger Accuracy (Backoff = Unigram Tagger): ",
          bigram_tagger_backoff_unigram_result)
Example #15
def tag_words(words, tag):
    """
    Associates a tag with words.

    Parameters
    ----------
    words: A list of strings.
    tag: A str.

    Returns
    -------
    A list of tuples of (str, str)
    """

    default_tagger = DefaultTagger(tag)
    tags = default_tagger.tag(words)

    return tags
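A quick usage sketch:

# DefaultTagger assigns the same tag to every word.
tag_words(['the', 'quick', 'fox'], 'NN')
# -> [('the', 'NN'), ('quick', 'NN'), ('fox', 'NN')]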
Example #16
    def wordTagger(self, wordlist, number):
        train_sents = treebank.tagged_sents()[:3000]
        if number == 1:
            taglist = nltk.pos_tag(wordlist)
        elif number == 2:
            tagger = DefaultTagger('NN')
            taglist = tagger.tag(wordlist)
        elif number == 3:
            tagger = UnigramTagger(train_sents)
            taglist = tagger.tag(wordlist)
        elif number == 4:
            tnt_tagger = tnt.TnT()
            tnt_tagger.train(train_sents)
            taglist = tnt_tagger.tag(wordlist)
        elif number == 5:
            tagger = ClassifierBasedPOSTagger(train=train_sents)
            taglist = tagger.tag(wordlist)
        return taglist
Example #17
def lexical(tokens):
    print("\n")
    print("Step 2: Lexical Analysis\n")
    print("Essentially refers to the dictionary and obtains the properties of the word")
    print("Part-Of-Speech tagging")
    print("The tagset is:\n")

    tag = DefaultTagger('NN')
    tagg = UnigramTagger(train_sent, backoff=tag)
    tagger = BigramTagger(train_sent, backoff=tagg)

    tagtokens = tagger.tag(tokens)
    for token, tag in tagtokens:
        print(token + "->" + tag)
    print("\n")
    print("The accuracy of the trained pos tagger is:")
    print(tagger.evaluate(test_sents))

    return tagtokens
Example #18
def tag_linked(words, default_tag='INFO'):
    """
    Tokenizes text by using a Penn Treebank tagged sentence and word tokenizers.
    Uses DefaultTagger to assign "default_tag" to any element missed by Penn Treebank tagger.

    Parameters
    ----------
    words: A list of strings.

    Returns
    -------
    A list of tuples of (str, str)
    :param default_tag:
    """

    default_tagger = DefaultTagger(default_tag)
    pt_tagger = UnigramTagger(treebank.tagged_sents())

    pt_tagger._taggers = [pt_tagger, default_tagger]

    tags = pt_tagger.tag(words)

    return tags
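Overwriting the private _taggers attribute works, but the documented route is the backoff constructor argument; an equivalent sketch:

# Same chain via the public backoff parameter instead of _taggers.
default_tagger = DefaultTagger(default_tag)
pt_tagger = UnigramTagger(treebank.tagged_sents(), backoff=default_tagger)
tags = pt_tagger.tag(words)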
Example #19
def train_tagger():
    '''
    An example of training a POS tagger using a
    probability-based trigram model.

    A POS tagger identifies the class of each word.
    E.g.: Isso é um teste = Isso-PROSUB é-V um-ART teste-N
    (Preposition  Verb  Article  Noun)
    '''

    # Loading a Portuguese dataset whose sentences have been
    # manually tagged
    data = [
        [(w, re.split('[|-]', tag)[0]) for w, tag in sent]
        for sent in mac_morpho.tagged_sents()]

    # Default POS class. N means Name/Noun
    tagger0 = DefaultTagger('N')
    print('training unigram')
    tagger1 = UnigramTagger(data, backoff=tagger0)
    print('training bigram')
    tagger2 = BigramTagger(data, backoff=tagger1)
    print('training trigram')
    return TrigramTagger(data, backoff=tagger2)
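A usage sketch (the example sentence is arbitrary):

# Train the chain once, then tag a tokenized Portuguese sentence.
tagger = train_tagger()
print(tagger.tag('o rato roeu a roupa do rei'.split()))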
Example #20
from nltk.metrics import *
import string
'''import replacer
from replacer import RegexpReplacer
from replacer import RepeatReplacer'''
import linecache
import matplotlib.pyplot as plt
'''
Train Tagger
'''
from nltk.tag import DefaultTagger
from nltk.tag import UnigramTagger
from nltk.tag import BigramTagger
from nltk.corpus import treebank
train = treebank.tagged_sents()[:10000]
t0 = DefaultTagger('NN')
t1 = UnigramTagger(train, backoff=t0)
t2 = BigramTagger(train, backoff=t1)
'''
Initialize
'''
my_corp = web.sents(fileids='firefox.txt')  # 'web' is presumably nltk.corpus.webtext (import not shown)
sent_count = 0
ques_count = 0
All_count = 1
NN_count = 0
NNS_count = 0
NNP_count = 0
VB_count = 0
VBN_count = 0
VBG_count = 0
Example #21
# This snippet begins mid-list in the source; the variable name and opening
# bracket below are a reconstruction (hypothetical name).
tag_patterns = [
    ('BASIS', 'BASIS'),
    ('CORPORATE', 'CORPORATE'),
    ('OTHER|other', 'OTHER'),
    ('LAST|last', 'LAST'),
    (r'POS', 'POINT_OF_SALE'),
    (r'AFTERCOMPANY|CORPORATE|COMPANY', 'CORPORATE'),
    (r'DISCOUNT|COMMISSIO|COMMISSIONS|COMISSION|discount|discounts|Commissionable|commissionable',
     'DISCOUNT'),
    (r'ASIA|Asia|asia|AISA|亚洲', 'ASIA'),
    (r'NORTH|North|north', 'NORTH'),
    (r'SOUTH|South|south', 'SOUTH')

    #            ('TICKET','VALIDITY')
]  # add learning loop here for tags

def_tagger = DefaultTagger('NN')
prelim_def_tagger = DefaultTagger(None)

backoff = RegexpTagger(
    [
        (r'^-?[0-9]+(.[0-9]+)?$', 'CD'),  # cardinal numbers
        (r'(The|the|A|a|An|an)$', 'AT'),  # articles
        (r'.*able$', 'JJ'),  # adjectives
        (r'.*ness$', 'NN'),  # nouns formed from adjectives
        (r'.*ly$', 'RB'),  # adverbs
        (r'.*s$', 'NNS'),  # plural nouns
        (r'.*ing$', 'VBG'),  # gerunds
        (r'.*ed$', 'VBD'),  # past tense verbs
        (r'is|was|are|were', 'VBZ'),  # verb to be
        (r'"', 'QT'),  # quote
        (r'.*', 'NN')  # nouns (default)
    ])
Example #22
def Tagger():
    #Tagger
    etiq1 = DefaultTagger('N')
    sentencas_treinadoras = mac_morpho.tagged_sents()[::]
    etiq2 = UnigramTagger(sentencas_treinadoras, backoff=etiq1)
    return etiq2
Example #23
train_sents = treebank.tagged_sents()[:3000]
test_sents = treebank.tagged_sents()[3000:]

train_brown = nltk.corpus.brown.tagged_sents()[0:5000]
test_brown = nltk.corpus.brown.tagged_sents()[5000:]

from nltk.tag import DefaultTagger, UnigramTagger, BigramTagger, TrigramTagger


def backoff_tagger(train_sents, tagger_classes, backoff=None):
    for cls in tagger_classes:
        backoff = cls(train_sents, backoff=backoff)
    return backoff


backoff = DefaultTagger('NN')
btagger = backoff_tagger(train_sents,
                         [UnigramTagger, BigramTagger, TrigramTagger],
                         backoff=backoff)

# print(btagger.evaluate(test_sents))

tnt_tagger = nltk.tag.tnt.TnT()
tnt_tagger.train(train_sents)

t_tagger_brown = nltk.tag.tnt.TnT()
t_tagger_brown.train(train_brown)


def readEssays(filename):
    infile = open(filename, 'r')
Example #24
def backoff_tagger(train_sents, tagger_classes):
    backoff = DefaultTagger('NN')
    for cls in tagger_classes:
        backoff = cls(train_sents, backoff=backoff)
    return backoff
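A usage sketch (the treebank split is an assumption):

from nltk.corpus import treebank
tagger = backoff_tagger(treebank.tagged_sents()[:3000],
                        [UnigramTagger, BigramTagger, TrigramTagger])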
Example #25

train_sents = update_tags(train_sents)
val_sents = update_tags(val_sents)
test_sents = update_tags(test_sents)
"""
# =============================================================================
# finalise a sequential tagger
# =============================================================================
"""
""" 1. run tagger with different corpus size (50% and 100%) """
# backoff tagger
tag1_eval = dict()
# train with backoff and Brill
tic()  # tic()/toc() are timing helpers defined elsewhere in the author's code
tag1_tagger = DefaultTagger('NO')
tag1_tagger = AffixTagger(train_sents, affix_length=-1, backoff=tag1_tagger)
tag1_tagger = AffixTagger(train_sents, affix_length=-2, backoff=tag1_tagger)
tag1_tagger = AffixTagger(train_sents, affix_length=-3, backoff=tag1_tagger)
tag1_tagger = AffixTagger(train_sents, affix_length=-4, backoff=tag1_tagger)
tag1_tagger = AffixTagger(train_sents, affix_length=-5, backoff=tag1_tagger)
tag1_tagger = UnigramTagger(train_sents, cutoff=3, backoff=tag1_tagger)
tag1_tagger = BigramTagger(train_sents, backoff=tag1_tagger)
tag1_tagger = TrigramTagger(train_sents, backoff=tag1_tagger)
tag1b_tagger = train_brill_tagger(tag1_tagger,
                                  train_sents,
                                  True,
                                  max_rules=100)
tag1_eval['train_time'] = toc()
# test
tic()
Example #26
from nltk.tag import tnt, DefaultTagger
import pickle

datas = open('Indonesian_Manually_Tagged_Corpus.tsv', 'r').read()
datas = datas.strip().split('\n\n')  # strip() guards against an empty trailing block

train_sents = []

for data in datas:
    train_sents.append(list(tuple(i.split('\t')) for i in data.split('\n')))

unk = DefaultTagger('NN')
tnt_tagger = tnt.TnT(unk=unk, Trained=True)
tnt_tagger.train(train_sents)
tagger_file = open("indonesian_tnt_pos_tag.pickle", "wb")
pickle.dump(tnt_tagger, tagger_file)
tagger_file.close()
Example #27
f.write("\n".join(caps))

f.close()

# adding the tagger

import nltk

from nltk.corpus import treebank

from nltk.tag import DefaultTagger

from nltk.tag.sequential import ClassifierBasedPOSTagger

default = DefaultTagger('NN')

train_sents = treebank.tagged_sents()[:3000]

test_sents = treebank.tagged_sents()[3000:]

tagger = ClassifierBasedPOSTagger(train=train_sents,
                                  backoff=default,
                                  cutoff_prob=0.3)

tagger.evaluate(test_sents)

#token = nltk.word_tokenize(title)  #title string tokenized

# removing all the punctuation marks
Example #28
for page in list(root):
    l = []
    text = page.find('text').text
    language = page.find('language').text
    pos = page.find('pos_tags').text
    splitText = text.split(" ")[1:-1]
    posText = pos.split(" ")[1:-1]
    for i in range(len(splitText)):
        l.append((splitText[i], posText[i]))
    data.append(l)
    count = count + 1
shuffle(data)

# Divide data into train and test sets (90/10 split)
ninetyPercent = count * 0.9
training_set = data[0:int(ninetyPercent)]
test_set = data[int(ninetyPercent):]

# Train
train_data = training_set
tag1 = DefaultTagger('NN')
tag2 = UnigramTagger(train_data, backoff = tag1)
tag3 = BigramTagger(train_data, backoff = tag2)
tag4 = TrigramTagger(train_data, backoff = tag3)

# Accuracy
# print tag4.tag('open a start up'.encode('utf-8').decode('utf-8').split())
# print tag4.tag('OUT nahi KARDO ISSE BAHUT HOGAYA aaj Salman'.encode('utf-8').decode('utf-8').split())
gold_sentences = test_set
print(tag4.evaluate(gold_sentences))
Example #29
 def setUp(self):
     self.corpus = brown.tagged_sents()[:35]
     self.decoder = JSONTaggedDecoder()
     self.encoder = JSONTaggedEncoder()
     self.default_tagger = DefaultTagger("NN")
Example #30
from nltk.tag import DefaultTagger, UnigramTagger, BigramTagger, TrigramTagger
from nltk.corpus import treebank
from nltk.corpus import wordnet as wn
from os.path import isfile, join
from os import listdir
from pprint import pprint
import gensim.downloader as api
import re
import nltk
import os

TEST_PATH = '../test/untagged'
COMMON_WORDS_PATH = '../resources/1-1000.txt'

TRAINING_SENTS = treebank.tagged_sents()
UNIGRAM = UnigramTagger(TRAINING_SENTS, backoff=DefaultTagger('NN'))
BIGRAM = BigramTagger(TRAINING_SENTS, backoff=UNIGRAM)
TRIGRAM = TrigramTagger(TRAINING_SENTS, backoff=BIGRAM)

STOPWORDS = set(nltk.corpus.stopwords.words('english'))
WORD_VECTORS = api.load("glove-wiki-gigaword-100")
TEST_FILES = [f for f in listdir(TEST_PATH) if isfile(join(TEST_PATH, f))]

# Manual list of words to be considered "irrelevant"
IRRELEVANT_WORDS = ["talk", "seminar", "lecture"]

# manually created ontology tree, which is later extended
TREE = {"science": {}, "maths": {}, "engineering": {}, "medicine": {}}

# code to convert POS tags into the right form for lemmatization
# https://stackoverflow.com/questions/25534214/nltk-wordnet-lemmatizer-shouldnt-it-lemmatize-all-inflections-of-a-word
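The snippet is cut off at that comment; the conversion it refers to is conventionally written along these lines (a sketch, not the original author's code):

def get_wordnet_pos(treebank_tag):
    # Map a Penn Treebank tag prefix onto WordNet's POS constants;
    # WordNetLemmatizer defaults to NOUN for anything else.
    if treebank_tag.startswith('J'):
        return wn.ADJ
    elif treebank_tag.startswith('V'):
        return wn.VERB
    elif treebank_tag.startswith('R'):
        return wn.ADV
    return wn.NOUN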