class Chunker(nltk.chunk.ChunkParserI):
    """Noun-phrase chunker for SCLE, trained on the CoNLL-2000 corpus.

    Only NP chunks are produced for now.  Chunking is treated as a tagging
    problem: a ``ClassifierBasedTagger`` assigns an IOB chunk tag to every
    ``(word, pos)`` token, using ``npchunk_features`` as feature detector.
    """

    def __init__(self):
        """Train the IOB tagger on ``train.txt`` and keep ``test.txt`` for
        :meth:`evaluate`.

        Training happens eagerly here, so constructing a ``Chunker`` reads
        the CoNLL-2000 corpus and fits the classifier.
        """
        train_sents = conll2000.chunked_sents('train.txt', chunk_types=['NP'])
        ctagged_sents = self._to_tagger_format(train_sents)

        test_sents = conll2000.chunked_sents('test.txt', chunk_types=['NP'])
        self._test_sents = self._to_tagger_format(test_sents)

        self._tagger = ClassifierBasedTagger(train=ctagged_sents,
                                             feature_detector=npchunk_features)

    @staticmethod
    def _to_tagger_format(chunked_sents):
        """Convert chunk trees to lists of ``((word, pos), iob_tag)`` pairs.

        The tagger treats the ``(word, pos)`` pair as the token and the IOB
        chunk tag as the label to predict.
        """
        return [[((w, t), c) for (w, t, c) in nltk.chunk.tree2conlltags(sent)]
                for sent in chunked_sents]

    def chunk(self, sentences):
        """Chunk every sentence in *sentences*.

        :param sentences: iterable of sentences, each a list of
            ``(word, pos)`` tuples (the token shape the tagger was trained on).
        :return: list with one chunk tree per input sentence, built with
            ``nltk.chunk.conlltags2tree``.
        """
        chunked_sents = []
        for sent in sentences:
            # The tagger yields ((word, pos), iob); flatten each item to the
            # (word, pos, iob) triple that conlltags2tree expects.
            conlltags = [(w, t, c) for ((w, t), c) in self._tagger.tag(sent)]
            chunked_sents.append(nltk.chunk.conlltags2tree(conlltags))
        return chunked_sents

    def evaluate(self):
        """Evaluate the chunker on the held-out CoNLL-2000 test sentences.

        Prints the score reported by the underlying tagger and also returns
        it so callers can use the value programmatically.
        """
        score = self._tagger.evaluate(self._test_sents)
        # Parenthesised single-argument form prints identically on Python 2
        # and Python 3; the bare ``print x`` statement fails on Python 3.
        print(score)
        return score
read_ud_pos_data(r'C:\UD_English-EWT-master\en_ewt-ud-train.conllu')) test_data = list( read_ud_pos_data(r'C:\UD_English-EWT-master\en_ewt-ud-dev.conllu')) print("train_data", train_data) print("Data loaded .") start_time = time.time() print("Starting training ...") tagger = ClassifierBasedTagger( feature_detector=pos_features, train=train_data[:100], classifier_builder=train_scikit_classifier, ) end_time = time.time() print("Training complete. Time={0:.2f}s".format(end_time - start_time)) print("Computing test set accuracy ...") print(tagger.evaluate(test_data)) # 0.8949021790997296 import time import itertools from sklearn.feature_extraction import FeatureHasher from sklearn.linear_model import Perceptron def incremental_train_scikit_classifier(sentences, feature_detector, batch_size, max_iterations): initial_corpus_iterator, sentences = itertools.tee(sentences) # compute all labels ALL_LABELS = set([])
# Report the metrics gathered for the previous tagger; tag1_eval is defined
# outside this excerpt.
display_training_metrics(tag1_eval)

""" # ============================================================================= # finalise a classification-based tagger # ============================================================================= """

""" 1. Naive Bayes classifier tagger with features and Brill """
# NOTE(review): no Brill tagger is constructed in the statements below --
# confirm whether the "and Brill" part of the heading is stale.

# Metrics for this tagger, keyed by metric name.
nb_eval = dict()

# train
# tic()/toc() are defined elsewhere; presumably tic() starts a timer and
# toc() returns the elapsed time since the last tic() -- TODO confirm.
tic()
# No classifier_builder is passed, so the tagger falls back to its default
# classifier (Naive Bayes, per the heading above).
nb_tagger = ClassifierBasedTagger(train=train_sents, feature_detector=add_features)
nb_eval['train_time'] = toc()

# test
tic()
# NOTE: the result is stored under 'test_accuracy' but is measured on
# val_sents (the validation sentences).
nb_eval['test_accuracy'] = nb_tagger.evaluate(val_sents)
nb_eval['test_time'] = toc()

# display results
display_training_metrics(nb_eval)

""" # ============================================================================= # finalise a deep learning tagger # ============================================================================= """

""" 1. prepare the data """
# for train, test and validation
# create_observation is defined elsewhere; each call is unpacked into a
# (features, labels) style pair -- presumably X holds per-token feature
# dicts and y the tags; verify against its definition.
train_X, train_y = create_observation(train_sents)
val_X, val_y = create_observation(val_sents)

# convert features to vectors
# Only the vectorizer is created here; fitting/transforming is not part of
# this excerpt.
dict_vectorizer = DictVectorizer(sparse=True)
# Tail of a function whose definition starts above this excerpt; kept as-is.
    return classifier


import time

from nltk.tag import ClassifierBasedTagger
from utils import read_ud_pos_data
from tag import pos_features

# Script entry point: train a classifier-based POS tagger on Universal
# Dependencies data and report its accuracy.
if __name__ == "__main__":
    print("Loading data ...")
    # The reader's output is materialised with list() so it can be sliced
    # and reused below.
    train_data = list(read_ud_pos_data('../../../data/en-ud-train.conllu'))
    # The data used for evaluation is the *dev* split of the treebank.
    test_data = list(read_ud_pos_data('../../../data/en-ud-dev.conllu'))
    # NOTE(review): this prints the entire training set -- looks like
    # leftover debugging output; confirm it is still wanted.
    print("train_data", train_data)
    print("Data loaded .")

    start_time = time.time()
    print("Starting training ...")
    # Only the first 2000 training sentences are used for training.
    # train_scikit_classifier is not imported here; presumably it is the
    # function defined earlier in this file -- TODO confirm.
    tagger = ClassifierBasedTagger(
        feature_detector=pos_features,
        train=train_data[:2000],
        classifier_builder=train_scikit_classifier,
    )
    end_time = time.time()
    # Wall-clock training time in seconds, two decimal places.
    print("Training complete. Time={0:.2f}s".format(end_time - start_time))

    print("Computing test set accuracy ...")
    print(tagger.evaluate(test_data))  # 0.8949021790997296 (value noted by the original author)

    # Smoke test: tag a whitespace-tokenised sample sentence.
    print(tagger.tag("This is a test".split()))