Exemple #1
0
class Chunker(nltk.chunk.ChunkParserI):
    '''
    Chunker for SCLE. Only chunks NP for now.

    Wraps a ClassifierBasedTagger that is trained on the CoNLL-2000
    training split (NP chunks only) with ``npchunk_features`` as its
    feature detector. The tagger assigns a chunk tag to every
    (word, pos) token; the tagged tokens are then turned back into a
    chunk tree.
    '''
    def __init__(self):
        '''
        Load the CoNLL-2000 train/test splits and train the chunk tagger.

        The test split is kept on the instance so that evaluate() can
        score the trained tagger later.
        '''
        # Load both splits before training, so a missing corpus file is
        # reported before any time is spent on training.
        ctagged_sents = self._load_tagged_sents('train.txt')
        self._test_sents = self._load_tagged_sents('test.txt')
        self._tagger = ClassifierBasedTagger(train=ctagged_sents, feature_detector=npchunk_features)

    @staticmethod
    def _load_tagged_sents(fileid):
        '''
        Return the NP-chunked sentences of a CoNLL-2000 file as lists of
        ((word, pos), chunk_tag) pairs, the shape the tagger is trained
        and evaluated on.
        '''
        chunked = conll2000.chunked_sents(fileid, chunk_types=['NP'])
        return [[((w, t), c) for (w, t, c) in nltk.chunk.tree2conlltags(sent)]
                for sent in chunked]

    def parse(self, tokens):
        '''
        Chunk a single sentence.

        Implements the ChunkParserI interface method so that the
        inherited chunk-parser API can be used with this class.

        :param tokens: one sentence as a list of (word, pos) tuples.
        :return: the chunk tree built by nltk.chunk.conlltags2tree from
            the tagger's output.
        '''
        c_sent = self._tagger.tag(tokens)
        # The tagger yields ((word, pos), chunk_tag); flatten to the
        # (word, pos, chunk_tag) triples conlltags2tree expects.
        conlltags = [(w, t, c) for ((w, t), c) in c_sent]
        return nltk.chunk.conlltags2tree(conlltags)

    def chunk(self, sentences):
        '''
        Chunk several sentences.

        :param sentences: iterable of sentences, each a list of
            (word, pos) tuples.
        :return: list with one chunk tree per input sentence, in order.
        '''
        return [self.parse(sent) for sent in sentences]

    def evaluate(self):
        '''
        Evaluate the chunker.

        Prints the score the underlying tagger obtains on the stored
        CoNLL-2000 test sentences. Nothing is returned.
        '''
        # Function-call form works on Python 3 and, with a single
        # argument, prints the same thing on Python 2.
        print(self._tagger.evaluate(self._test_sents))
Exemple #2
0
        read_ud_pos_data(r'C:\UD_English-EWT-master\en_ewt-ud-train.conllu'))
    # Materialise the dev split into a list (the matching train_data
    # assignment starts above this excerpt).
    test_data = list(
        read_ud_pos_data(r'C:\UD_English-EWT-master\en_ewt-ud-dev.conllu'))
    # Debug output: prints the whole training set.
    print("train_data", train_data)
    print("Data loaded .")
    # Wall-clock timing brackets only the tagger construction below.
    start_time = time.time()
    print("Starting training ...")
    # Only the first 100 training items are passed to the tagger.
    tagger = ClassifierBasedTagger(
        feature_detector=pos_features,
        train=train_data[:100],
        classifier_builder=train_scikit_classifier,
    )
    end_time = time.time()
    print("Training complete. Time={0:.2f}s".format(end_time - start_time))
    print("Computing test set accuracy ...")
    # NOTE(review): the figure in the trailing comment is presumably a score
    # recorded from an earlier run -- confirm it matches the 100-item slice.
    print(tagger.evaluate(test_data))  # 0.8949021790997296

import time
import itertools
from sklearn.feature_extraction import FeatureHasher
from sklearn.linear_model import Perceptron


def incremental_train_scikit_classifier(sentences, feature_detector,
                                        batch_size, max_iterations):

    initial_corpus_iterator, sentences = itertools.tee(sentences)

    # compute all labels
    ALL_LABELS = set([])
Exemple #3
0
# Report the metrics collected for the previous tagger (tag1_eval is built
# outside this excerpt).
display_training_metrics(tag1_eval)
"""
# =============================================================================
# finalise a classification-based tagger
# =============================================================================
"""
""" 1. Naive Bayes classifier tagger with features and Brill """
# NOTE(review): the label above mentions Brill, but no Brill step is visible
# in this excerpt -- confirm whether it happens elsewhere.
# Metrics for this tagger are stored under 'train_time', 'test_accuracy'
# and 'test_time'.
nb_eval = dict()
# train
# tic()/toc() bracket the step being timed; presumably toc() returns the
# time elapsed since the matching tic() -- confirm against their definitions.
tic()
# No classifier_builder is passed, so the tagger uses its default classifier
# (Naive Bayes, assuming this is NLTK's ClassifierBasedTagger).
nb_tagger = ClassifierBasedTagger(train=train_sents,
                                  feature_detector=add_features)
nb_eval['train_time'] = toc()
# test
tic()
# Scored on val_sents, although the result is stored under 'test_accuracy'.
nb_eval['test_accuracy'] = nb_tagger.evaluate(val_sents)
nb_eval['test_time'] = toc()
# display results
display_training_metrics(nb_eval)
"""
# =============================================================================
# finalise a deep learning tagger
# =============================================================================
"""
""" 1. prepare the data """
# for train, test and validation
# create_observation returns a pair that is unpacked into inputs (X) and
# targets (y); presumably X holds per-token feature dicts, given the
# DictVectorizer below -- confirm against its definition.
# Only the train and validation sets are converted in this excerpt.
train_X, train_y = create_observation(train_sents)
val_X, val_y = create_observation(val_sents)

# convert features to vectors
# sparse=True is passed explicitly; presumably this is scikit-learn's
# DictVectorizer, which would then produce sparse matrices -- confirm import.
dict_vectorizer = DictVectorizer(sparse=True)
Exemple #4
0
    return classifier


import time
from nltk.tag import ClassifierBasedTagger
from utils import read_ud_pos_data
from tag import pos_features


if __name__ == "__main__":
    # Read both splits fully into lists so they can be sliced and reused.
    print("Loading data ...")
    train_data = list(
        read_ud_pos_data('../../../data/en-ud-train.conllu')
    )
    test_data = list(
        read_ud_pos_data('../../../data/en-ud-dev.conllu')
    )
    print("train_data", train_data)
    print("Data loaded .")

    # Time the tagger construction only; training uses the first 2000 items.
    start_time = time.time()
    print("Starting training ...")
    tagger = ClassifierBasedTagger(
        train=train_data[:2000],
        feature_detector=pos_features,
        classifier_builder=train_scikit_classifier,
    )
    end_time = time.time()
    elapsed_seconds = end_time - start_time
    print("Training complete. Time={0:.2f}s".format(elapsed_seconds))

    # Score the trained tagger on the full dev split.
    print("Computing test set accuracy ...")
    dev_score = tagger.evaluate(test_data)
    print(dev_score)  # 0.8949021790997296

    # Smoke test: tag one whitespace-tokenised sentence.
    demo_tokens = "This is a test".split()
    print(tagger.tag(demo_tokens))