from hashlib import md5
from os.path import isfile
from time import perf_counter

from nltk.tag.perceptron import PerceptronTagger
from sklearn.metrics import classification_report


def ap(train_path, test_path):
    # Cache the trained model under a filename derived from the training path.
    modelref = 'ap-' + md5(('ap///' + train_path).encode()).hexdigest() + '.pickle'
    test_sentences = list(gen_corpus(test_path))
    if not isfile(modelref):
        start = perf_counter()
        training_sentences = list(gen_corpus(train_path))
        ap_model = PerceptronTagger(load=False)
        ap_model.train(list(convert_sents_to_zipped(training_sentences)), save_loc=modelref)
        end = perf_counter()
        print('Training took {} ms.'.format(int((end - start) * 1000)))
    else:
        ap_model = PerceptronTagger(load=False)
        ap_model.load(modelref)
        print('Model loaded from file.')

    # Evaluation
    start = perf_counter()
    y_pred, y_true = [], []
    for words, tags in test_sentences:
        y_pred.extend(y for x, y in ap_model.tag(words))
        y_true.extend(tags)
    end = perf_counter()
    print('Testing took {} ms.'.format(int((end - start) * 1000)))
    for line in classification_report(y_true, y_pred).split('\n'):
        print(line)
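# The helpers gen_corpus and convert_sents_to_zipped are not shown above. A minimal
# sketch of what the evaluation loop assumes they do; the word/TAG file format and
# both function bodies are assumptions, not the original implementation:
def gen_corpus(path):
    # Yield (words, tags) pairs, one sentence per line of word/TAG tokens.
    with open(path, encoding='utf-8') as f:
        for line in f:
            pairs = [tok.rsplit('/', 1) for tok in line.split()]
            yield [w for w, _ in pairs], [t for _, t in pairs]


def convert_sents_to_zipped(sentences):
    # PerceptronTagger.train() expects [[(word, tag), ...], ...].
    for words, tags in sentences:
        yield list(zip(words, tags))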
import os

import dill
from nltk.tag.perceptron import PerceptronTagger

# AbstractTagger is the surrounding project's base class.


class Tagger(AbstractTagger):
    def __init__(self):
        self._tagger = PerceptronTagger(load=False)
        self._name = 'nltkperceptron'
        self._model_name = "nltkperceptron"
        self._result = None
        super().__init__()

    def _save_model(self, fpath):
        with open(fpath, 'wb') as f:
            dill.dump(self._tagger, f)

    def load(self, path=''):
        if path == '':
            self._load_model(path)
        else:
            mpath = os.path.join(path, self.model_name)
            self._load_model(mpath)

    def _load_model(self, fpath):
        if fpath == '':
            # No path given: fall back to NLTK's pretrained English model.
            self._tagger = PerceptronTagger(load=True)
        else:
            with open(fpath, 'rb') as f:
                self._tagger = dill.load(f)

    def tag(self, data):
        res = self._tagger.tag(data)
        return [x[1] for x in res]

    def train(self, data):
        # Reset tagger before training from scratch.
        self._tagger = PerceptronTagger(load=False)
        self._tagger.train(data)

    @property
    def produces_temp_data(self):
        return False

    @property
    def requires_additional_params(self):
        return False

    def set_additional_params(self, options):
        pass

    def add_temp_dir(self, options):
        pass

    @property
    def model_name(self):
        return self._model_name

    @property
    def name(self):
        return self._name
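# A possible round trip with this wrapper. The tiny training set is an illustrative
# assumption; tag() returns only the tag sequence by design.
t = Tagger()
t.train([[('today', 'NN'), ('is', 'VBZ'), ('good', 'JJ')]])
t._save_model(os.path.join('.', t.model_name))  # save under the model's name
t.load('.')                                     # reload from that directory
print(t.tag('today is good'.split()))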
def nltk_perceptron_pos_tagger(input_dict):
    name = 'PerceptronPosTagger'
    if not input_dict['training_corpus']:
        perceptron_tagger = PerceptronTagger()
        name += '-pretrained'
    else:
        perceptron_tagger = PerceptronTagger(load=False)
        chunk = input_dict['training_corpus']['chunk']
        corpus = input_dict['training_corpus']['corpus']
        training_corpus = corpus_reader(corpus, chunk)
        perceptron_tagger.train(list(training_corpus))
    return {'pos_tagger': {
        'function': 'tag_sents',
        'object': perceptron_tagger,
        'name': name,
    }}
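# The expected shape of input_dict can be read off the code above. Both calls below
# are illustrative; corpus_reader and the 'corpus'/'chunk' values belong to the
# surrounding project and are placeholders here.
out = nltk_perceptron_pos_tagger({'training_corpus': None})  # pretrained English model
out = nltk_perceptron_pos_tagger({'training_corpus': {'corpus': my_corpus,
                                                      'chunk': my_chunk}})
pos_tagger = out['pos_tagger']['object']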
def _get_pretrain_model():
    # train_sents, test_sents and PICKLE are module-level: the tagged
    # training/test sentences and the path the model is pickled to.
    pos_tagger = PerceptronTagger(load=False)
    pos_tagger.train(sentences=train_sents, save_loc=PICKLE)
    print('Accuracy : ', pos_tagger.evaluate(test_sents))
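# One way to supply the module-level names the helper expects. The treebank split
# and pickle path are illustrative assumptions (requires nltk.download('treebank')).
from nltk.corpus import treebank

tagged = treebank.tagged_sents()
train_sents = tagged[:3000]
test_sents = tagged[3000:]
PICKLE = 'averaged_perceptron_tagger.pickle'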
def get_tagger(dfs_corpus, word_key="word", pos_key="universal_dependency"):
    '''
    This function instantiates a tagger trained with corpus annotations taken
    from one or more Pandas DataFrames.

    Args:
        dfs_corpus: one (or a list of) Pandas DataFrame(s) with annotated corpus data
        word_key: (default 'word') column name for wordforms in dfs_corpus
        pos_key: (default 'universal_dependency') column name for parts-of-speech in dfs_corpus

    Returns:
        a PerceptronTagger instance

    >>> # get a tagger, trained with df_corpus: a Pandas DataFrame with lots of corpus data
    >>> tagger = get_tagger(df_corpus)
    >>> # tag a sentence now
    >>> sentence = 'Here is some beautiful sentence'
    >>> tagged_sentence = tagger.tag(sentence.split())
    >>> print(tagged_sentence)
    '''
    sentences = []

    # The algorithm expects a list of DataFrames, so make sure we have just that.
    if isinstance(dfs_corpus, pd.DataFrame):
        dfs_corpus = [dfs_corpus]

    for df_corpus in dfs_corpus:
        # The corpus DataFrame consists of a number of sentences (rows) with a fixed
        # number of tokens. Each token has a fixed number of layers holding info like
        # lemma, wordform or part-of-speech, so the number of columns of each row is
        # [number of tokens] x [number of layers]. The tagger expects one array per
        # sentence with length = [number of tokens], so we first compute the number
        # of layers in order to infer how many tokens each sentence holds.

        # Determine how many layers (lemma, pos, wordform, ...) we have by removing
        # the numbers at the end of the layer names (lemma 1, lemma 2, ..., pos 1, ...).
        column_names = list(df_corpus.columns.values)
        for n, val in enumerate(column_names):
            column_names[n] = val.split(' ')[0]
        number_of_layers = len(set(column_names))

        # The standard sentence length is the number of columns of the corpus
        # DataFrame divided by the number of layers we just computed.
        nr_of_words_per_sentence = int(df_corpus.shape[1] / number_of_layers)

        # Build training data for the tagger in the right format:
        # [ [('today', 'NN'), ('is', 'VBZ'), ('good', 'JJ'), ('day', 'NN')], [...] ]
        for index, row in df_corpus.iterrows():
            one_sentence = []
            wrong = False
            for i in range(0, nr_of_words_per_sentence, 1):
                word_idx = word_key + ' ' + str(i)
                pos_idx = pos_key + ' ' + str(i)
                try:
                    pair = (row[word_idx], _cut_off_features(row[pos_idx]))
                    one_sentence.append(pair)
                    if row[word_idx] is None or row[pos_idx] is None:
                        wrong = True
                except KeyError:
                    raise ValueError(
                        "function get_tagger() expects corpus data with columns '%s' and '%s', "
                        "but those columns could not be found. Please call the function with "
                        "these extra parameters to declare which columns your corpus data has "
                        "instead: get_tagger(word_key='...', pos_key='...')." % (word_key, pos_key))
            if wrong is False:
                sentences.append(one_sentence)

    # Instantiate and train the tagger now.
    tagger = PerceptronTagger(load=False)
    tagger.train(sentences)
    return tagger
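# A tiny illustration of the column layout get_tagger() expects: per-token columns
# named '<layer> <i>'. The data is made up, and _cut_off_features is assumed to be
# the module's helper for stripping morphological features from the POS value.
import pandas as pd

df_corpus = pd.DataFrame([{
    'word 0': 'today', 'universal_dependency 0': 'NOUN',
    'word 1': 'is',    'universal_dependency 1': 'AUX',
    'word 2': 'good',  'universal_dependency 2': 'ADJ',
}])
tagger = get_tagger(df_corpus)
print(tagger.tag('today is good'.split()))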
https://github.com/evanmiltenburg/Dutch-tagger
We got a POS tagger and a CHUNK tagger. In combination they can be used to apply NER...
"""
import os

import nltk
from nltk.tag.perceptron import PerceptronTagger
from nltk.corpus import alpino as alp

# Load a small pretrained Dutch perceptron model.
tagger = PerceptronTagger(load=False)
os.chdir(r'D:\nlp_lib')
tagger.load('model.perc.dutch_tagger_small.pickle')  # I don't know the source of the training data.

# Tag a sentence.
tagger.tag('Alle vogels zijn nesten begonnen , behalve ik en jij .'.split())

# Train unigram, bigram and perceptron taggers on ALP (Alpino) data.
training_corpus = alp.tagged_sents()
unitagger = nltk.tag.UnigramTagger(training_corpus)
bitagger = nltk.tag.BigramTagger(training_corpus, backoff=unitagger)
perctagger = PerceptronTagger(load=False)  # load=False trains from scratch; load=True would first load the pretrained English model.
perctagger.train(training_corpus)

sent = 'NLTK is een goede taal voor NLP'.split()
bitagger.tag(sent)
unitagger.tag(sent)
perctagger.tag(sent)
from nltk.tag.perceptron import PerceptronTagger

tagger = PerceptronTagger(load=False)
aḍris = []

# Build the training data from the Kabyle corpus.
amenzu = 0
for izirig in open("c:/tal/corpus-kab.txt", encoding='utf-8'):
    if amenzu != 0:
        izirig = izirig.replace("\n", "")
        kab_tags_words1 = []  # one sentence of (word, tag) pairs
        for i in izirig.split(" "):
            b = i.split("/")
            try:
                kab_tags_words1.append((b[0].lower(), b[1]))
            except IndexError:
                print(b, 'error')
                exit()
        aḍris.append(kab_tags_words1)
    else:
        # Skip the first line of the file.
        amenzu = 1

# Train the algorithm on the corpus and save the model.
trained_model = "c:/tal/trained_model15.pickle"
tagger.train(aḍris, save_loc=trained_model, nr_iter=20)
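# To reuse the saved model later, restore the pickle written by train(save_loc=...).
# This assumes NLTK's save format, a (weights, tagdict, classes) triple.
import pickle

tagger = PerceptronTagger(load=False)
with open("c:/tal/trained_model15.pickle", 'rb') as f:
    tagger.model.weights, tagger.tagdict, tagger.classes = pickle.load(f)
tagger.model.classes = tagger.classes
print(tagger.tag(kabyle_tokens))  # kabyle_tokens: a tokenized Kabyle sentence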
import numpy as np

import nltk
from nltk import TnT
from nltk.tag import hmm
from nltk.tag import CRFTagger
from nltk.tag.perceptron import PerceptronTagger

# data_batch, training_data and testing_data are assumed to be defined earlier.
comb_results = np.zeros((5, 4))
ind_results = np.zeros((5, 4))

for ki in range(data_batch):
    # Train all four taggers on this batch.
    perc_tagger = PerceptronTagger(load=False)
    tnt_tagger = TnT()
    crf_tagger = CRFTagger()
    tnt_tagger.train(training_data[ki])
    hmm_tagger = nltk.HiddenMarkovModelTagger.train(training_data[ki])
    perc_tagger.train(training_data[ki])
    crf_tagger.train(training_data[ki], 'model.crf.tagger')

    # t.tagdata(test_data[800:])

    # Collect each tagger's predictions on the test batch.
    perc_pred = []
    hmm_pred = []
    for i in testing_data[ki]:
        perc_pred.append(perc_tagger.tag(i))
        hmm_pred.append(hmm_tagger.tag(i))
    crf_pred = crf_tagger.tag_sents(testing_data[ki])
    tnt_pred = tnt_tagger.tagdata(testing_data[ki])
    pred = {'p': perc_pred, 'h': hmm_pred, 'c': crf_pred, 't': tnt_pred}


def most_frequent(List):
    # Return the most common element of List.
    return max(set(List), key=List.count)
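# The pred dict and most_frequent() suggest the loop goes on to combine the four
# taggers by majority vote; the excerpt cuts off before that, so this per-token
# combination is an assumption:
def majority_vote(pred, sent_idx, tok_idx):
    # One vote per tagger for this token; keep the most frequent tag.
    votes = [pred[key][sent_idx][tok_idx][1] for key in ('p', 'h', 'c', 't')]
    return most_frequent(votes)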
import cowparser as cp
from nltk.tag.perceptron import PerceptronTagger

train_sents = []
test_sents = []

gen = cp.sentences_for_dir(separate=False)

# The first ~2M sentences become the training set.
for i, (metadata, data) in enumerate(gen):
    train_sents.append([(a, b) for a, b, c in data])
    if i == 2000000:
        break

# The generator keeps its position, so the next ~5000 sentences become the test set.
for i, (metadata, data) in enumerate(gen):
    test_sents.append([(a, b) for a, b, c in data])
    if i == 5000:
        break

pt = PerceptronTagger(load=False)
pt.train(train_sents, 'model2.perc.dutch_tagger')  # second positional argument is save_loc
print(pt.evaluate(test_sents))