def ap(train_path, test_path):
    """Train (or reuse a cached) averaged-perceptron POS tagger and print test metrics.

    The model pickle is cached on disk under a name derived from the
    training-corpus path, so repeated runs skip training.
    """
    # Cache key depends only on the training corpus location.
    modelref = 'ap-' + md5(('ap///' + train_path).encode()).hexdigest() + '.pickle'
    test_sentences = list(gen_corpus(test_path))

    if isfile(modelref):
        # Cached model available — load instead of retraining.
        ap_model = PerceptronTagger(load=False)
        ap_model.load(modelref)
        print('Model loaded from file.')
    else:
        t0 = perf_counter()
        training_sentences = list(gen_corpus(train_path))
        ap_model = PerceptronTagger(load=False)
        ap_model.train(list(convert_sents_to_zipped(training_sentences)), save_loc=modelref)
        t1 = perf_counter()
        print('Training took {} ms.'.format(int((t1 - t0) * 1000)))

    # Evaluation: tag every test sentence and collect predicted vs. gold tags.
    t0 = perf_counter()
    predicted, gold = [], []
    for sent_words, sent_tags in test_sentences:
        predicted.extend(tag for _word, tag in ap_model.tag(sent_words))
        gold.extend(sent_tags)
    t1 = perf_counter()
    print('Testing took {} ms.'.format(int((t1 - t0) * 1000)))

    for line in classification_report(gold, predicted).split('\n'):
        print(line)
def _get_tagger(lang=None):
    """Return a POS tagger for *lang*.

    ``"rus"`` loads the pre-trained Russian averaged-perceptron model;
    any other value falls back to the default English tagger.
    """
    if lang != "rus":
        return PerceptronTagger()
    tagger = PerceptronTagger(False)
    tagger.load("file:" + str(find(RUS_PICKLE)))
    return tagger
def _get_tagger(lang=None):
    """Build a POS tagger: the Russian model when *lang* is 'rus', English otherwise."""
    if lang == 'rus':
        rus_tagger = PerceptronTagger(False)
        rus_tagger.load('file:' + str(find(RUS_PICKLE)))
        return rus_tagger
    return PerceptronTagger()
def _get_tagger():
    """Return the small Dutch perceptron tagger, or None when it is unavailable.

    NOTE(review): os.chdir permanently changes the process working directory
    as a side effect of resolving the pickle path — confirm callers expect that.
    """
    # TODO: Instead of manually downloading the dutch_tagger, download it from an external source if it isn't installed at Data/
    try:
        os.chdir(r"Data")
        dutch_tagger = PerceptronTagger(load=False)
        dutch_tagger.load('model.perc.dutch_tagger_small.pickle')
    except (IndexError, FileNotFoundError):
        return None
    return dutch_tagger
def _get_tagger(lang=None):
    """Return a POS tagger for *lang*.

    ``"rus"`` loads the pre-trained Russian averaged-perceptron model; every
    other value — including ``"eng"`` and ``None`` — uses the default
    English tagger.
    """
    if lang == "rus":
        tagger = PerceptronTagger(False)
        ap_russian_model_loc = "file:" + str(find(RUS_PICKLE))
        tagger.load(ap_russian_model_loc)
    else:
        # The former `elif lang == "eng"` branch was identical to this
        # default, so the duplicate branch was collapsed (behavior unchanged).
        tagger = PerceptronTagger()
    return tagger
def _get_tagger(lang=None):
    """Build a POS tagger for the requested language.

    'rus' loads the pre-trained Russian model; anything else (incl. 'eng'
    and None) gets the stock English averaged perceptron.
    """
    if lang == 'rus':
        tagger = PerceptronTagger(False)
        ap_russian_model_loc = 'file:' + str(find(RUS_PICKLE))
        tagger.load(ap_russian_model_loc)
    else:
        # Removed a redundant `elif lang == 'eng'` branch that constructed
        # the exact same default tagger as this else — behavior unchanged.
        tagger = PerceptronTagger()
    return tagger
https://github.com/evanmiltenburg/Dutch-tagger
We got a POS tagger and a CHUNK tagger. In combination they can be used to apply NER...
"""
import os
import nltk
from nltk.tag.perceptron import PerceptronTagger
from nltk.corpus import alpino as alp  # Alpino treebank of tagged Dutch sentences

# Trained on ALP data.
# Load the pre-trained small Dutch perceptron model from a local directory.
tagger = PerceptronTagger(load=False)
os.chdir(r'D:\nlp_lib')  # NOTE(review): changes the process-wide cwd just to resolve the pickle path
tagger.load('model.perc.dutch_tagger_small.pickle' )  # I don't know the source of training data

# Tag a sentence.
tagger.tag('Alle vogels zijn nesten begonnen , behalve ik en jij .'.split())

# Train three taggers of increasing sophistication on the Alpino corpus.
training_corpus = alp.tagged_sents()
unitagger = nltk.tag.UnigramTagger(training_corpus)
bitagger = nltk.tag.BigramTagger(training_corpus, backoff=unitagger)  # backs off to the unigram tagger
perctagger = PerceptronTagger(load=True)  # What does load=True mean??
# NOTE(review): load=True loads the default English model first, so train()
# continues from those weights — presumably load=False was intended; confirm.
perctagger.train(training_corpus)

# Tag the same sample sentence with each tagger (results are discarded).
sent = 'NLTK is een goeda taal voor NLP'.split()
bitagger.tag(sent)
unitagger.tag(sent)
perctagger.tag(sent)
import nltk
from nltk.tag.perceptron import PerceptronTagger

# nltk.download()

# Sample text to tokenize and tag.
sentence = """At eight o'clock on Thursday morning... Arthur didn't feel very good."""
tokens = nltk.word_tokenize(sentence)

# Load the averaged-perceptron model from an explicit local file URL.
tagger = PerceptronTagger(False)
tagger.load('file:///C:/Users/yarov/nltk_data/taggers/averaged_perceptron_tagger/averaged_perceptron_tagger.pickle')
tagged = tagger.tag(tokens)
# Fixed: `print tagged` is Python 2-only syntax (SyntaxError on Python 3);
# the call form is valid and behaves identically on both interpreters.
print(tagged)
# 17 February 2017 import sys import re from bs4 import BeautifulSoup from urllib.parse import unquote from nltk import word_tokenize, sent_tokenize import nltk import glob from nltk.tag.perceptron import PerceptronTagger # Dutch POS tagger from https://github.com/evanmiltenburg/Dutch-tagger # Make sure the model.perc.dutch_tagger_small.pickle is in the same directory tagger = PerceptronTagger(load=False) tagger.load('model.perc.dutch_tagger_small.pickle') dbpedia_types = {} with open('dbpedia-wikipedia-type.tsv', 'r') as f: for line in f: line = line.rstrip() elems = line.split('\t') elems[1] = elems[1].lstrip('<http://nl.wikipedia.org/wiki/') elems[1] = elems[1].rstrip('>') if 'dbpedia' in elems[2]: elems[2] = elems[2].lstrip('<http://dbpedia.org/ontology/') elems[2] = elems[2].rstrip('>') dbpedia_types[elems[1]] = elems[2] def analyse_and_store_links(text):
def tokenize(sentence, suffixe, prefixe):
    """Rebuild *sentence* word by word, expanding hyphenated words.

    Words containing '-' are split via tokenize_word() using the given
    suffix/prefix lists; all other words pass through unchanged. Returns
    the rebuilt, whitespace-stripped sentence.
    """
    parts = []
    for word in sentence.split():  # mots
        if '-' in word:  # clearer than the old `find('-') < 0` test
            parts.append(tokenize_word(word, suffixe, prefixe))
        else:
            parts.append(word)
    # Single join instead of quadratic string concatenation; the final
    # strip() matches the original's sentence1.strip() exactly.
    return ' '.join(parts).strip()


trained_model = "file:///c:/tal/trained_model12.pickle"
tagger = PerceptronTagger()
tagger.load(trained_model)

f = open("c:/tal/tokenized_text.txt", "w+", encoding='utf-8')
h = open("c:/tal/tagged_text.txt", "w+", encoding='utf-8')
g = open("c:/tal/brut_text.txt", encoding='utf-8')

for line in g:
    # print (line)
    # Pad every punctuation mark with spaces so it tokenizes on its own.
    # (The old if/else applied the identical replacement in both branches,
    # so the special-casing of '.' was collapsed — behavior unchanged.)
    for mark in Ponctuation:
        line = line.replace(mark, ' ' + mark + ' ')
    line = line.replace("\ufeff", "")  # drop a stray UTF-8 BOM
    ligne = tokenize(line, suffixe, prefixe)
import nltk
from nltk.tag.perceptron import PerceptronTagger

# nltk.download()

# Sample text to tokenize and tag.
sentence = """At eight o'clock on Thursday morning... Arthur didn't feel very good."""
tokens = nltk.word_tokenize(sentence)

# Load the averaged-perceptron model from an explicit file URL.
tagger = PerceptronTagger(False)
tagger.load(
    'file:///C:/Users/yarov/nltk_data/taggers/averaged_perceptron_tagger/averaged_perceptron_tagger.pickle'
)
tagged = tagger.tag(tokens)
# Fixed: the Python 2 `print tagged` statement is a SyntaxError on Python 3;
# print() as a call works identically on both interpreters.
print(tagged)