Ejemplo n.º 1
0
def ap(train_path, test_path):
    """Train or load a perceptron tagger, then print an evaluation report.

    The model file name is ``ap-<md5>.pickle``, where the digest is taken
    over ``'ap///' + train_path``.  If that file already exists the model is
    loaded from it; otherwise a new tagger is trained on the sentences read
    from ``train_path`` (with ``save_loc`` pointing at that file) and the
    training time is printed.

    The tagger is then run over every sentence read from ``test_path``; the
    elapsed tagging time and the ``classification_report`` of gold versus
    predicted tags are printed.
    """
    digest = md5(('ap///' + train_path).encode()).hexdigest()
    modelref = 'ap-{}.pickle'.format(digest)

    # The test corpus is read up front, before the model is trained/loaded.
    test_sentences = list(gen_corpus(test_path))

    if isfile(modelref):
        ap_model = PerceptronTagger(load=False)
        ap_model.load(modelref)
        print('Model loaded from file.')
    else:
        # Reading the training corpus is deliberately inside the timed span.
        train_started = perf_counter()
        training_sentences = list(gen_corpus(train_path))

        ap_model = PerceptronTagger(load=False)
        zipped_sentences = list(convert_sents_to_zipped(training_sentences))
        ap_model.train(zipped_sentences, save_loc=modelref)
        train_ms = int((perf_counter() - train_started) * 1000)
        print('Training took {} ms.'.format(train_ms))

    # Evaluation
    eval_started = perf_counter()
    y_true = []
    y_pred = []
    for words, gold_tags in test_sentences:
        predicted_tags = [tag for _word, tag in ap_model.tag(words)]
        y_pred.extend(predicted_tags)
        y_true.extend(gold_tags)
    eval_ms = int((perf_counter() - eval_started) * 1000)
    print('Testing took {} ms.'.format(eval_ms))

    report = classification_report(y_true, y_pred)
    for report_line in report.split('\n'):
        print(report_line)
Ejemplo n.º 2
0
def _get_tagger(lang=None):
    """Return a ``PerceptronTagger`` for the requested language.

    For ``lang == "rus"`` the tagger is constructed with ``False`` as its
    first argument and the model located by ``find(RUS_PICKLE)`` is loaded
    into it.  Any other value (including ``None``) yields a tagger built
    with the constructor's defaults.
    """
    if lang != "rus":
        return PerceptronTagger()

    russian_tagger = PerceptronTagger(False)
    model_url = "file:" + str(find(RUS_PICKLE))
    russian_tagger.load(model_url)
    return russian_tagger
Ejemplo n.º 3
0
def _get_tagger(lang=None):
    """Build the perceptron tagger matching ``lang``.

    ``'rus'`` selects the model found via ``find(RUS_PICKLE)``, loaded into
    a tagger created with ``False`` as its first argument; every other value
    returns a default-constructed ``PerceptronTagger``.
    """
    wants_russian = (lang == 'rus')
    # The Russian tagger is created with ``False`` and filled in below.
    result = PerceptronTagger(False) if wants_russian else PerceptronTagger()
    if wants_russian:
        result.load('file:' + str(find(RUS_PICKLE)))
    return result
Ejemplo n.º 4
0
 def _get_tagger():
     """Load the Dutch perceptron tagger from the ``Data`` directory.

     Returns the loaded tagger, or ``None`` when an ``IndexError`` or
     ``FileNotFoundError`` is raised while changing directory, constructing
     the tagger or loading the model file.

     NOTE(review): the ``os.chdir`` is never undone, so once it succeeds the
     process stays inside ``Data`` (and a second call would look for
     ``Data/Data``) -- confirm whether callers rely on this.
     """
     # TODO: Instead of manually downloading the dutch_tagger, download it from an external source if it isn't installed at Data/
     try:
         os.chdir(r"Data")
         dutch_tagger = PerceptronTagger(load=False)
         dutch_tagger.load('model.perc.dutch_tagger_small.pickle')
     except (IndexError, FileNotFoundError):
         return None
     else:
         return dutch_tagger
Ejemplo n.º 5
0
def _get_tagger(lang=None):
    """Return a ``PerceptronTagger`` for the requested language.

    Args:
        lang: Language code.  ``"rus"`` selects the Russian model located by
            ``find(RUS_PICKLE)``; ``"eng"``, ``None`` and any other value
            all fall back to the default-constructed tagger.

    Returns:
        A ``PerceptronTagger`` instance.
    """
    if lang == "rus":
        # Skip the constructor's own model loading; the Russian pickle is
        # loaded explicitly instead.
        tagger = PerceptronTagger(load=False)
        ap_russian_model_loc = "file:" + str(find(RUS_PICKLE))
        tagger.load(ap_russian_model_loc)
    else:
        # "eng" previously had its own branch that was identical to this
        # fallback, so the two are merged.
        tagger = PerceptronTagger()
    return tagger
Ejemplo n.º 6
0
def _get_tagger(lang=None):
    """Return a ``PerceptronTagger`` for the requested language.

    Args:
        lang: Language code.  ``'rus'`` selects the Russian model located by
            ``find(RUS_PICKLE)``; ``'eng'``, ``None`` and any other value
            all fall back to the default-constructed tagger.

    Returns:
        A ``PerceptronTagger`` instance.
    """
    if lang == 'rus':
        # Skip the constructor's own model loading; the Russian pickle is
        # loaded explicitly instead.
        tagger = PerceptronTagger(load=False)
        ap_russian_model_loc = 'file:' + str(find(RUS_PICKLE))
        tagger.load(ap_russian_model_loc)
    else:
        # 'eng' previously had its own branch that was identical to this
        # fallback, so the two are merged.
        tagger = PerceptronTagger()
    return tagger
Ejemplo n.º 7
0
https://github.com/evanmiltenburg/Dutch-tagger

We have a POS tagger and a chunk tagger. Combined, they can be used to perform named-entity recognition (NER)...


"""

import os
import nltk

from nltk.tag.perceptron import PerceptronTagger
from nltk.corpus import alpino as alp  # Trained on ALP data.

# Load the Dutch perceptron model.  The pickle is referenced by bare file
# name, so the working directory is switched to its folder first.
tagger = PerceptronTagger(load=False)
os.chdir(r'D:\nlp_lib')
tagger.load('model.perc.dutch_tagger_small.pickle'
            )  # I don't know the source of training data

# Tag a sentence.
tagger.tag('Alle vogels zijn nesten begonnen , behalve ik en jij .'.split())

# Train three taggers on the Alpino tagged sentences for comparison.
training_corpus = alp.tagged_sents()
unitagger = nltk.tag.UnigramTagger(training_corpus)
bitagger = nltk.tag.BigramTagger(training_corpus, backoff=unitagger)
# load=False starts from an empty model, which is the form NLTK documents for
# training.  load=True would load NLTK's bundled pre-trained model at
# construction, which is unwanted when training on Dutch data from scratch.
perctagger = PerceptronTagger(load=False)
perctagger.train(training_corpus)

# Run the same (deliberately misspelled) sentence through each tagger.
sent = 'NLTK is een goeda taal voor NLP'.split()
bitagger.tag(sent)
unitagger.tag(sent)
perctagger.tag(sent)
Ejemplo n.º 8
0
import nltk
from nltk.tag.perceptron import PerceptronTagger

# nltk.download()

# Tokenize a sample sentence and tag it with the averaged perceptron model
# loaded explicitly from a local file URL.
sentence = """At eight o'clock on Thursday morning... Arthur didn't feel very good."""
tokens = nltk.word_tokenize(sentence)

# load=False (passed positionally): the model is loaded by hand just below.
tagger = PerceptronTagger(False)
tagger.load('file:///C:/Users/yarov/nltk_data/taggers/averaged_perceptron_tagger/averaged_perceptron_tagger.pickle')
tagged = tagger.tag(tokens)
# Call form works on Python 3 and prints the same on Python 2 for one argument.
print(tagged)
# 17 February 2017

import sys
import re
from bs4 import BeautifulSoup
from urllib.parse import unquote
from nltk import word_tokenize, sent_tokenize
import nltk
import glob

from nltk.tag.perceptron import PerceptronTagger

# Dutch POS tagger model, published at
# https://github.com/evanmiltenburg/Dutch-tagger
# The pickle below is referenced by bare file name, so make sure
# model.perc.dutch_tagger_small.pickle is in the same directory.
DUTCH_TAGGER_PICKLE = 'model.perc.dutch_tagger_small.pickle'

tagger = PerceptronTagger(load=False)
tagger.load(DUTCH_TAGGER_PICKLE)

def _unwrap_uri(value, prefix):
    """Return ``value`` without the leading ``prefix`` and trailing ``'>'``.

    ``str.lstrip(prefix)`` must not be used for this: it strips any leading
    characters that occur in ``prefix`` (a character set), so it would also
    eat the first letters of names such as ``iPhone``.
    """
    if value.startswith(prefix):
        value = value[len(prefix):]
    return value.rstrip('>')


# Maps a Dutch Wikipedia page title to its DBpedia ontology type, built from
# the tab-separated file (column 1: Wikipedia URI, column 2: type URI).
dbpedia_types = {}
with open('dbpedia-wikipedia-type.tsv', 'r') as f:
    for line in f:
        line = line.rstrip()
        elems = line.split('\t')
        if len(elems) < 3:
            # Blank or malformed line: nothing to record.
            continue
        elems[1] = _unwrap_uri(elems[1], '<http://nl.wikipedia.org/wiki/')
        # Only types from the DBpedia namespace are kept.
        if 'dbpedia' in elems[2]:
            elems[2] = _unwrap_uri(elems[2], '<http://dbpedia.org/ontology/')
            dbpedia_types[elems[1]] = elems[2]


def analyse_and_store_links(text):
Ejemplo n.º 10
0
def tokenize(sentence, suffixe, prefixe):
    """Rebuild ``sentence`` from its whitespace-separated tokens.

    Tokens containing a hyphen are replaced by the result of
    ``tokenize_word(token, suffixe, prefixe)``; all other tokens are kept
    unchanged.  The pieces are joined with single spaces and the result is
    stripped of surrounding whitespace.
    """
    pieces = [
        tokenize_word(token, suffixe, prefixe) if '-' in token else token
        for token in sentence.split()  # mots
    ]
    return ' '.join(pieces).strip()


# Location of the custom-trained perceptron model (file URL).
trained_model = "file:///c:/tal/trained_model12.pickle"
# load=False: the default model would only be loaded to be replaced by the
# custom one right away, so skip it.
tagger = PerceptronTagger(load=False)
tagger.load(trained_model)

# Output files (tokenized / tagged text) and the raw input text.
# NOTE(review): none of these handles is closed in the visible code --
# confirm they are closed after the loop.
f = open("c:/tal/tokenized_text.txt", "w+", encoding='utf-8')
h = open("c:/tal/tagged_text.txt", "w+", encoding='utf-8')
g = open("c:/tal/brut_text.txt", encoding='utf-8')
for line in g:
    # Surround every punctuation mark with spaces so that the
    # whitespace-based tokenize() sees it as a separate token.  The former
    # special case for '.' did exactly the same as the general case.
    for i in Ponctuation:
        line = line.replace(i, ' ' + i + ' ')
    # Drop any U+FEFF (byte-order mark) characters.
    line = line.replace("\ufeff", "")

    ligne = tokenize(line, suffixe, prefixe)
Ejemplo n.º 11
0
import nltk
from nltk.tag.perceptron import PerceptronTagger

# nltk.download()

# Tokenize a sample sentence and tag it with the averaged perceptron model
# loaded explicitly from a local file URL.
sentence = """At eight o'clock on Thursday morning... Arthur didn't feel very good."""
tokens = nltk.word_tokenize(sentence)

# load=False (passed positionally): the model is loaded by hand just below.
tagger = PerceptronTagger(False)
tagger.load(
    'file:///C:/Users/yarov/nltk_data/taggers/averaged_perceptron_tagger/averaged_perceptron_tagger.pickle'
)
tagged = tagger.tag(tokens)
# Call form works on Python 3 and prints the same on Python 2 for one argument.
print(tagged)