Example #1
from hashlib import md5
from os.path import isfile
from time import perf_counter

from nltk.tag.perceptron import PerceptronTagger
from sklearn.metrics import classification_report

# gen_corpus and convert_sents_to_zipped are project-local helpers (a sketch of the
# latter follows the example).
def ap(train_path, test_path):
    modelref = 'ap-' + md5(('ap///' + train_path).encode()).hexdigest() + '.pickle'

    test_sentences = list(gen_corpus(test_path))

    if not isfile(modelref):
        start = perf_counter()
        training_sentences = list(gen_corpus(train_path))

        ap_model = PerceptronTagger(load=False)
        ap_model.train(list(convert_sents_to_zipped(training_sentences)), save_loc=modelref)
        end = perf_counter()
        print('Training took {} ms.'.format(int((end - start) * 1000)))
    else:
        ap_model = PerceptronTagger(load=False)
        ap_model.load(modelref)
        print('Model loaded from file.')

    # Evaluation
    start = perf_counter()
    y_pred, y_true = [], []
    for words, tags in test_sentences:
        y_pred.extend(y for x, y in ap_model.tag(words))
        y_true.extend(tags)

    end = perf_counter()
    print('Testing took {} ms.'.format(int((end - start) * 1000)))

    print(classification_report(y_true, y_pred))
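A plausible sketch of the project-local helper this example assumes: PerceptronTagger.train() wants each sentence as a list of (word, tag) pairs, while gen_corpus appears to yield parallel (words, tags) lists, so the conversion is a per-sentence zip.

def convert_sents_to_zipped(sentences):
    # Turn (['a', 'b'], ['DT', 'NN']) into [('a', 'DT'), ('b', 'NN')].
    for words, tags in sentences:
        yield list(zip(words, tags))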
Example #2
import os

import dill
from nltk.tag.perceptron import PerceptronTagger

# AbstractTagger is a project-local base class (not shown in this excerpt).
class Tagger(AbstractTagger):
    def __init__(self):
        self._tagger = PerceptronTagger(load=False)
        self._name = 'nltkperceptron'
        self._model_name = "nltkperceptron"
        self._result = None
        super().__init__()

    def _save_model(self, fpath):
        with open(fpath, 'wb') as f:
            dill.dump(self._tagger, f)

    def load(self, path=''):
        if path == '':
            self._load_model(path)
        else:
            mpath = os.path.join(path, self.model_name)
            self._load_model(mpath)

    def _load_model(self, fpath):
        if fpath == '':
            self._tagger = PerceptronTagger(load=True)
        else:
            with open(fpath, 'rb') as f:
                self._tagger = dill.load(f)

    def tag(self, data):
        res = self._tagger.tag(data)
        return [x[1] for x in res]

    def train(self, data):
        # Reset tagger.
        self._tagger = PerceptronTagger(load=False)
        self._tagger.train(data)

    @property
    def produces_temp_data(self):
        return False

    @property
    def requires_additional_params(self):
        return False

    def set_additional_params(self, options):
        pass

    def add_temp_dir(self, options):
        pass

    @property
    def model_name(self):
        return self._model_name

    @property
    def name(self):
        return self._name
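A hypothetical usage sketch, assuming the project-local AbstractTagger base class is importable; the training sentence and file name are made up for illustration.

t = Tagger()
t.train([[('the', 'DT'), ('cat', 'NN'), ('sleeps', 'VBZ')]])
print(t.tag(['the', 'dog']))          # a list of predicted tags, e.g. ['DT', 'NN']
t._save_model('nltkperceptron.dill')  # persist the trained tagger with dill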
Example #3
def nltk_perceptron_pos_tagger(input_dict):
    name = 'PerceptronPosTagger'
    if not input_dict['training_corpus']:
        perceptron_tagger = PerceptronTagger()
        name += '-pretrained'
    else:
        perceptron_tagger = PerceptronTagger(load=False)
        chunk = input_dict['training_corpus']['chunk']
        corpus = input_dict['training_corpus']['corpus']
        training_corpus = corpus_reader(corpus, chunk)
        perceptron_tagger.train(list(training_corpus))

    return {'pos_tagger': {
                'function':'tag_sents',
                'object': perceptron_tagger,
                'name': name
            }
    }
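A hedged usage sketch of the returned dictionary: the 'function' key names the method to call on 'object' (presumably a convention of the surrounding workflow system), so tagging goes through getattr. The input values here are made up.

out = nltk_perceptron_pos_tagger({'training_corpus': None})  # falls back to the pretrained model
pt = out['pos_tagger']
print(getattr(pt['object'], pt['function'])([['This', 'is', 'a', 'test']]))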
Example #4
def _get_pretrain_model():
    # Train on train_sents and pickle the model to PICKLE (module-level globals).
    pos_tagger = PerceptronTagger(load=False)
    pos_tagger.train(sentences=train_sents, save_loc=PICKLE)
    print('Accuracy : ', pos_tagger.evaluate(test_sents))
Example #5
def get_tagger(dfs_corpus, word_key="word", pos_key="universal_dependency"):
    '''
    This function instantiates a tagger trained with some corpus annotations (out of a DataFrame)

    Args:
        dfs_corpus: one (or a list of) Pandas DataFrame(s) with annotated corpus data
        word_key: (default 'word') column name for wordforms in dfs_corpus
        pos_key: (default 'universal_dependency') column name for parts-of-speech in dfs_corpus
    
    Returns:
        a PerceptronTagger instance 
    
    >>> # get a tagger, trained with df_corpus: a Pandas DataFrame with lots of corpus data
    >>> tagger = get_tagger(df_corpus)  
    >>> # tag a sentence now
    >>> sentence = 'Here is some beautiful sentence'
    >>> tagged_sentence = tagger.tag( sentence.split() )
    >>> print(tagged_sentence) 
    
    '''
    
    sentences = []
    
    # The loop below expects a list of DataFrames, so wrap a single DataFrame in one.
    if isinstance(dfs_corpus, pd.DataFrame):
        dfs_corpus = [dfs_corpus]
        
    for df_corpus in dfs_corpus:
    
        # The corpus DataFrame consists of a number of sentences (rows) with a fixed number of tokens.
        # Each token has a fixed number of layers holding info like: lemma, wordform or part-of-speech. 
        # As a result, the number of columns of each row = [number of tokens] x [number of layers]

        # To feed the tagger correctly we need the number of layers, so we can infer
        # how many tokens each sentence holds: the tagger expects one list per sentence,
        # each holding one (word, tag) pair per token (see below).

        # So, determine how many layers (lemma, pos, wordform) we have 
        column_names = list(df_corpus.columns.values)
        for n, val in enumerate(column_names):
            # Strip the numbers at the end of the layer names (lemma 1, lemma 2, ..., pos 1, pos 2, ...)
            # so we end up with clean layer names only.
            column_names[n] = val.split(' ')[0]
        number_of_layers = len(set(column_names))

        # Now we can determine the standard length of our corpus sentences: that can be computed 
        # by dividing the number of columns of the corpus DataFrame by the number of layers
        # we just computed.

        nr_of_words_per_sentence = df_corpus.shape[1] // number_of_layers

        # Build training data for the tagger in the right format
        # The input must be like: [ [('today','NN'),('is','VBZ'),('good','JJ'),('day','NN')], [...] ]
        for index, row in df_corpus.iterrows():
            one_sentence = []
            wrong = False
            for i in range(nr_of_words_per_sentence):
                word_idx = word_key + ' ' + str(i)
                pos_idx = pos_key + ' ' + str(i)
                try:
                    pair = (row[word_idx], _cut_off_features(row[pos_idx]))
                    one_sentence.append(pair)
                    if row[word_idx] is None or row[pos_idx] is None:
                        wrong = True
                except KeyError:
                    raise ValueError("function get_tagger() expects corpus data with columns '%s' and '%s', but those columns could not be found. Please call the function with these extra parameters to declare which columns your corpus data has instead: get_tagger(word_key='...', pos_key='...')." % (word_key, pos_key))
            if wrong is False:
                sentences.append(one_sentence)
                
    # Instantiate and train the tagger now
    tagger = PerceptronTagger(load=False)
    tagger.train(sentences)
    
    return tagger
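A toy illustration of the column layout get_tagger() expects, assuming two layers ('word', 'universal_dependency') and two tokens per sentence, hence four columns; the pass-through _cut_off_features below is a hypothetical stand-in for the project-local helper the excerpt calls.

import pandas as pd

def _cut_off_features(pos):  # hypothetical stand-in for the project-local helper
    return pos

df_corpus = pd.DataFrame(
    [['Here', 'ADV', 'is', 'VERB'],
     ['Nice', 'ADJ', 'work', 'NOUN']],
    columns=['word 0', 'universal_dependency 0',
             'word 1', 'universal_dependency 1'])
# 4 columns / 2 layers = 2 tokens per sentence, exactly as the function infers.
tagger = get_tagger(df_corpus)
print(tagger.tag('Here is'.split()))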
Example #6
"""
https://github.com/evanmiltenburg/Dutch-tagger

We have a POS tagger and a CHUNK tagger. In combination they can be used to apply NER...
"""

import os
import nltk

from nltk.tag.perceptron import PerceptronTagger
from nltk.corpus import alpino as alp  # Trained on ALP data.

tagger = PerceptronTagger(load=False)
os.chdir(r'D:\nlp_lib')
tagger.load('model.perc.dutch_tagger_small.pickle')  # I don't know the source of the training data.

# Tag a sentence.
tagger.tag('Alle vogels zijn nesten begonnen , behalve ik en jij .'.split())

training_corpus = alp.tagged_sents()
unitagger = nltk.tag.UnigramTagger(training_corpus)
bitagger = nltk.tag.BigramTagger(training_corpus, backoff=unitagger)
perctagger = PerceptronTagger(load=True)  # load=True first loads NLTK's pre-trained English model, so train() below continues from those weights; load=False would start from scratch.
perctagger.train(training_corpus)

sent = 'NLTK is een goeda taal voor NLP'.split()
bitagger.tag(sent)
unitagger.tag(sent)
perctagger.tag(sent)
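The docstring's POS-plus-chunking idea, sketched with NLTK's English pipeline (the Dutch chunker it mentions is not part of this excerpt): nltk.ne_chunk() consumes POS-tagged tokens and labels named-entity chunks. Requires the punkt, averaged_perceptron_tagger, maxent_ne_chunker and words resources.

import nltk

tokens = nltk.word_tokenize('Guido van Rossum created Python in Amsterdam')
tagged = nltk.pos_tag(tokens)  # POS layer
tree = nltk.ne_chunk(tagged)   # chunk layer with NE labels such as PERSON and GPE
print(tree)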
Example #7
from nltk.tag.perceptron import PerceptronTagger

aḍris = []
amenzu = 0  # flag used to skip the first line of the file
tagger = PerceptronTagger(load=False)

# Build the training data from the Kabyle corpus (word/TAG tokens, one sentence per line).
for izirig in open("c:/tal/corpus-kab.txt", encoding='utf-8'):
    if amenzu != 0:
        izirig = izirig.replace("\n", "")
        a = izirig.split(" ")

        kab_tags_words1 = []  # a fresh list per sentence (the original reused one list)
        for i in a:
            b = i.split("/")
            try:
                # The original branched on b[1] != 'NMP' but lowercased either way.
                b[0] = b[0].lower()
                kab_tags_words1.append((b[0], b[1]))
            except IndexError:
                print(b, 'error')  # token without a /TAG separator
                exit()
        aḍris.append(kab_tags_words1)
    else:
        amenzu = 1

# Train the algorithm on the corpus and save the model.
trained_model = "c:/tal/trained_model15.pickle"
tagger.train(aḍris, save_loc=trained_model, nr_iter=20)
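A hypothetical follow-up showing that the pickled model can be reloaded the same way Example #6 does it; the sample tokens are made up for illustration.

tagger2 = PerceptronTagger(load=False)
tagger2.load("c:/tal/trained_model15.pickle")
print(tagger2.tag("azul fell-awen".split()))  # illustrative Kabyle tokens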
Example #8
import nltk
import numpy as np
from nltk import TnT
from nltk.tag import hmm, CRFTagger
from nltk.tag.perceptron import PerceptronTagger

comb_results = np.zeros((5, 4))
ind_results = np.zeros((5, 4))
# data_batch, training_data and testing_data are assumed to be defined elsewhere.
for ki in range(data_batch):
    perc_tagger = PerceptronTagger(load=False)
    tnt_tagger = TnT()
    crf_tagger = CRFTagger()

    tnt_tagger.train(training_data[ki])
    hmm_tagger = nltk.HiddenMarkovModelTagger.train(training_data[ki])
    perc_tagger.train(training_data[ki])
    crf_tagger.train(training_data[ki], 'model.crf.tagger')

    # t.tagdata(test_data[800:])

    perc_pred = []
    hmm_pred = []

    for i in testing_data[ki]:
        perc_pred.append(perc_tagger.tag(i))
        hmm_pred.append(hmm_tagger.tag(i))
    crf_pred = crf_tagger.tag_sents(testing_data[ki])
    tnt_pred = tnt_tagger.tagdata(testing_data[ki])
    pred = {'p': perc_pred, 'h': hmm_pred, 'c': crf_pred, 't': tnt_pred}

    def most_frequent(List):
        # Truncated in the original; a plausible completion returning the most
        # common element (e.g. for majority voting over the taggers above).
        return max(set(List), key=List.count)
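A hedged sketch of where the truncated snippet seems headed: per-token majority voting over the four taggers' predictions, using the most_frequent helper. The indexing assumptions (sentence-major, (word, tag) pairs) are mine.

def vote_tag(pred, sent_idx, tok_idx):
    # Collect the tag each tagger predicted for this token and take the majority.
    votes = [pred[k][sent_idx][tok_idx][1] for k in ('p', 'h', 'c', 't')]
    return most_frequent(votes)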
Example #9
import cowparser as cp

train_sents = []
test_sents = []

gen = cp.sentences_for_dir(separate=False)
for i, (metadata, data) in enumerate(gen):
    train_sents.append([(a, b) for a, b, c in data])
    if i == 2000000:
        break

# The generator resumes where the first loop stopped, so the test sentences are unseen.
for i, (metadata, data) in enumerate(gen):
    test_sents.append([(a, b) for a, b, c in data])
    if i == 5000:
        break

from nltk.tag.perceptron import PerceptronTagger
pt = PerceptronTagger(load=False)
pt.train(train_sents, 'model2.perc.dutch_tagger')  # the second positional argument is save_loc
print(pt.evaluate(test_sents))