def test_tagger(tagger_name, tagger_input, test_data, **kwargs):
    # initialise results
    tagger_eval = dict()
    # train
    tic()
    tagger_tagger = tagger_name(tagger_input, **kwargs)
    tagger_eval['train_time'] = toc()
    # test
    tic()
    tagger_eval['test_accuracy'] = tagger_tagger.evaluate(test_data)
    tagger_eval['test_time'] = toc()
    # show results
    display_training_metrics(tagger_eval)
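
# Example use of the helper above (a hypothetical call; assumes UnigramTagger
# is imported from nltk.tag and the corpus splits loaded below are available):
# test_tagger(UnigramTagger, train_sents, val_sents, cutoff=3)
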
train_sents, val_sents, test_sents = read_corpus('INTERA',
                                                 role='train',
                                                 proportion=PROPORTION,
                                                 tag_length=TAG_LENGTH)
"""
# =============================================================================
# investigate NLTK classification tagging options
# =============================================================================
"""
""" 1. TNT tagger """
tnt_eval = dict()
# train
tic()
tnt_tagger = tnt.TnT()
tnt_tagger.train(train_sents)
tnt_eval['train_time'] = toc()
# test
tic()
tnt_eval['test_accuracy'] = tnt_tagger.evaluate(val_sents)
tnt_eval['test_time'] = toc()
# display results
display_training_metrics(tnt_eval)
""" 2. Naive Bayes classifier tagger """
nb_eval = dict()
# train
tic()
nb_tagger = ClassifierBasedPOSTagger(train=train_sents)
nb_eval['train_time'] = toc()
# test
tic()
nb_eval['test_accuracy'] = nb_tagger.evaluate(val_sents)
nb_eval['test_time'] = toc()
# display results
display_training_metrics(nb_eval)
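
# The deep-learning tagger below wraps a Keras network with KerasClassifier.
# Its build_fn is not included in this excerpt; a minimal sketch consistent
# with the model_params used here might look like the following (the layer
# sizes, dropout rate and optimiser are assumptions):
from keras.models import Sequential
from keras.layers import Dense, Dropout

def build_model(input_dim, hidden_neurons, output_dim):
    """Feed-forward network mapping word-feature vectors to POS-tag scores."""
    model = Sequential()
    model.add(Dense(hidden_neurons, input_dim=input_dim, activation='relu'))
    model.add(Dropout(0.2))
    model.add(Dense(hidden_neurons, activation='relu'))
    model.add(Dense(output_dim, activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam',
                  metrics=['accuracy'])
    return model
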
model_params = {
    'build_fn': build_model,
    'input_dim': train_X.shape[1],
    'hidden_neurons': 32,
    'output_dim': train_y.shape[1],
    'epochs': 5,
    'batch_size': 256,
    'verbose': 1,
    'validation_data': (val_X, val_y),
    'shuffle': True
}
pos_model = KerasClassifier(**model_params)

# train the model
tic()
pos_model_history = pos_model.fit(train_X, train_y)
deeplearn_eval = dict()
deeplearn_eval['train_time'] = toc()
print(deeplearn_eval['train_time'])

# review training results
plot_model_performance(pos_model_history)
plot_model(pos_model.model, to_file=RESULTS_DIR+'Greek_POS_deep_model.png', 
           show_shapes=True)


""" 4. save and test the model """

# temporarily save the trained model, history and details
pos_model.model.save(RESOURCES_DIR+'Greek_POS_DL.h5')
save_tagger(RESULTS_DIR+'Greek_POS_DL_History.pkl', pos_model_history.history)   
save_tagger(RESOURCES_DIR+'Greek_POS_DL_DictVectorizer.pkl', dict_vectorizer)   
save_tagger(RESOURCES_DIR+'Greek_POS_DL_LabelEncoder.pkl', label_encoder)   
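
# A sketch of how the saved artefacts could be reloaded later to tag new data
# (the function name and word_feature_dicts argument are hypothetical; it is
# assumed that load_tagger is the pickle-loading counterpart of save_tagger
# and that the feature dicts match those used to fit dict_vectorizer):
def tag_with_saved_model(word_feature_dicts):
    from keras.models import load_model
    model = load_model(RESOURCES_DIR+'Greek_POS_DL.h5')
    vectorizer = load_tagger(RESOURCES_DIR+'Greek_POS_DL_DictVectorizer.pkl')
    encoder = load_tagger(RESOURCES_DIR+'Greek_POS_DL_LabelEncoder.pkl')
    features = vectorizer.transform(word_feature_dicts)
    predictions = model.predict(features)
    # map each highest-scoring class index back to its POS-tag string
    return encoder.inverse_transform(predictions.argmax(axis=1))
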
# train with backoff and Brill
tag1_eval = dict()
tic()
tag1_tagger = DefaultTagger('NO')
tag1_tagger = AffixTagger(train_sents, affix_length=-1, backoff=tag1_tagger)
tag1_tagger = AffixTagger(train_sents, affix_length=-2, backoff=tag1_tagger)
tag1_tagger = AffixTagger(train_sents, affix_length=-3, backoff=tag1_tagger)
tag1_tagger = AffixTagger(train_sents, affix_length=-4, backoff=tag1_tagger)
tag1_tagger = AffixTagger(train_sents, affix_length=-5, backoff=tag1_tagger)
tag1_tagger = UnigramTagger(train_sents, cutoff=3, backoff=tag1_tagger)
tag1_tagger = BigramTagger(train_sents, backoff=tag1_tagger)
tag1_tagger = TrigramTagger(train_sents, backoff=tag1_tagger)
tag1b_tagger = train_brill_tagger(tag1_tagger,
                                  train_sents,
                                  True,
                                  max_rules=100)
tag1_eval['train_time'] = toc()
# test
tic()
tag1_eval['test_accuracy'] = tag1b_tagger.evaluate(val_sents)
tag1_eval['test_time'] = toc()
# display results
display_training_metrics(tag1_eval)
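
# train_brill_tagger() is a project helper that is not shown in this excerpt.
# A sketch of what it might do with NLTK's Brill machinery (the helper name
# below is hypothetical, and the third positional argument above is assumed
# to be a deterministic/trace flag):
from nltk.tag.brill import fntbl37
from nltk.tag.brill_trainer import BrillTaggerTrainer

def train_brill_tagger_sketch(initial_tagger, train_data, deterministic=True,
                              max_rules=100):
    """Learn transformation rules on top of the backoff chain's output."""
    trainer = BrillTaggerTrainer(initial_tagger, fntbl37(),
                                 deterministic=deterministic)
    return trainer.train(train_data, max_rules=max_rules)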
"""
# =============================================================================
# finalise a classification-based tagger
# =============================================================================
"""
""" 1. Naive Bayes classifier tagger with features and Brill """
nb_eval = dict()
# train
tic()
"""
# =============================================================================
# compound taggers using sequential taggers and backoff
# =============================================================================
"""
""" 1. create a tagger utilising: 
       n-gram, unigram, regexp and default taggers """
tag2_eval = dict()
# train with backoff
tic()
tag2_input = create_regexp_list('Open_Word_Patterns.xlsx', RESOURCES_DIR)
tag2_tagger = DefaultTagger('NO')
tag2_tagger = RegexpTagger(tag2_input, backoff=tag2_tagger)
tag2_tagger = UnigramTagger(train_sents, cutoff=3, backoff=tag2_tagger)
tag2_tagger = BigramTagger(train_sents, backoff=tag2_tagger)
tag2_tagger = TrigramTagger(train_sents, backoff=tag2_tagger)
tag2_eval['train_time'] = toc()
# test
tic()
tag2_eval['test_accuracy'] = tag2_tagger.evaluate(val_sents)
tag2_eval['test_time'] = toc()
# display results
display_training_metrics(tag2_eval)
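
# create_regexp_list() is a project helper not shown in this excerpt.  A
# sketch of what it presumably does, assuming the spreadsheet stores one
# (regular expression, tag) pair per row with the pattern in the first
# column (the helper name and column order are assumptions); RegexpTagger
# expects exactly such a list of pairs:
import pandas as pd

def build_regexp_list_sketch(filename, directory):
    patterns = pd.read_excel(directory + filename)
    return [(str(regexp), str(tag))
            for regexp, tag in zip(patterns.iloc[:, 0], patterns.iloc[:, 1])]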
""" 2. create a tagger utilising: 
       n-gram, unigram, affix and default taggers """
tag1_eval = dict()
# train with backoff
tic()
tag1_tagger = DefaultTagger('NO')
tag1_tagger = AffixTagger(train_sents, affix_length=-1, backoff=tag1_tagger)
tag1_tagger = AffixTagger(train_sents, affix_length=-2, backoff=tag1_tagger)
tag1_tagger = AffixTagger(train_sents, affix_length=-3, backoff=tag1_tagger)

# load validation and held-out test sets for the final evaluation
_, val_70, _ = read_corpus('INTERA', role='train',
                           proportion=70, tag_length=TAG_LENGTH)
_, val_100, test_int = read_corpus('INTERA', role='train',
                                   proportion=PROPORTION,
                                   tag_length=TAG_LENGTH)
_, _, test_ud = read_corpus('UDGreek')
_, _, test_tt = read_corpus('tagged_texts')


""" 1. sequential tagger """
seq_eval = dict()
seq_tag = load_tagger(RESOURCES_DIR+'Greek_POS_seq.pkl')
# word level
seq_eval['verification'] = seq_tag.evaluate(val_70)
tic()
seq_eval['evaluate'] = seq_tag.evaluate(test_int)
seq_eval['evaluate_time'] = toc()
seq_eval['ud_greek'] = seq_tag.evaluate(test_ud)
seq_eval['tagged_text'] = seq_tag.evaluate(test_tt)
# sentence level
pred_int = [seq_tag.tag(s) for s in untag(test_int)]
seq_eval['sent_evaluate'] = compute_sent_acc(test_int, pred_int)
pred_ud = [seq_tag.tag(s) for s in untag(test_ud)]
seq_eval['sent_ud_greek'] = compute_sent_acc(test_ud, pred_ud)
pred_tt = [seq_tag.tag(s) for s in untag(test_tt)]
seq_eval['sent_tagged_text'] = compute_sent_acc(test_tt, pred_tt)
print('\n')
print(seq_eval)
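
# compute_sent_acc() is a project helper not shown in this excerpt.  The
# 'sent_*' figures above appear to be sentence-level accuracy; a sketch of
# such a metric (the name and exact definition are assumptions):
def sentence_accuracy_sketch(gold_sents, pred_sents):
    """Share of sentences whose every token received the correct tag."""
    exact = sum(1 for gold, pred in zip(gold_sents, pred_sents)
                if gold == pred)
    return exact / len(gold_sents)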


""" 2. classification tagger """
class_eval = dict()