def test_update(self):
    """Check the result of ``update`` against the expected fixture values.

    A single tagger is built from the fixture dataset. For every fixture
    row (sentence, predicted tags, gold tags, expected result) ``update``
    is called on that same tagger and its return value is compared,
    element by element, with the expected sequence.
    """
    perceptron = TaggingPerceptron(vocabulary(self.dataset),
                                   tags(self.dataset))
    fixture_rows = zip(self.sents, self.predicted_tags, self.tags,
                       self.expected_ws)
    for words, predicted, gold, wanted in fixture_rows:
        result = perceptron.update(words, predicted, gold)
        # Both sides are converted to plain lists before comparing.
        self.assertSequenceEqual(list(result), list(wanted))
def test_feature_vector(self):
    """Check ``feature_vector`` output for each (sentence, tags) fixture pair.

    The expected vector for the pair at position ``i`` is read from
    ``self.expected_train_ftr_vectos[i]``.
    """
    perceptron = TaggingPerceptron(vocabulary(self.dataset),
                                   tags(self.dataset))
    pairs = zip(self.sents, self.tags)
    for position, pair in enumerate(pairs):
        words, gold = pair
        actual = list(perceptron.feature_vector(words, gold))
        # Indexed lookup (not zip) so a too-short fixture list still fails loudly.
        wanted = list(self.expected_train_ftr_vectos[position])
        self.assertSequenceEqual(actual, wanted)
def test_decode(self):
    """Check ``decode`` output after training against the two-iteration fixture.

    The tagger is trained with the fixture sentences and tags, then every
    fixture sentence is decoded and compared with the matching entry of
    ``self.expected_tags_two_itr``.
    """
    perceptron = TaggingPerceptron(vocabulary(self.dataset),
                                   tags(self.dataset))
    # Two iterations; the same sentences/tags are passed for both the
    # training arguments and the trailing evaluation arguments.
    perceptron.train(2, self.sents, self.tags, self.sents, self.tags)
    fixture_rows = zip(self.sents, self.expected_tags_two_itr)
    for words, wanted_tags in fixture_rows:
        decoded = perceptron.decode(words)
        self.assertSequenceEqual(decoded, wanted_tags)
if self._n % report_every == 1: test_accuracy, confusion, word_errors = \ self.accuracy(test_sents, test_tags) print("\t".join(ss)) print("\t".join(tt)) print("\t".join(pred)) print("After %i sents, accuracy is %f, nonzero feats %i" % (self._n, test_accuracy, sum(1 for x in self._w if x != 0.0))) self.finalize() test_accuracy, confusion, word_errors = \ self.accuracy(test_sents, test_tags) print("---------------") print("Final accuracy: %f" % test_accuracy) if __name__ == "__main__": #CoNLL conll_train = CoNLL2003_Train() conll_valid = CoNLL2003_Valid() train_sents, train_tags = dataset_to_sents_and_tags(conll_train) valid_sents, valid_tags = dataset_to_sents_and_tags(conll_valid) tp = TaggingPerceptron(vocabulary(conll_train), tags(conll_train)) itrs = 5 tp.train(itrs, train_sents, train_tags, valid_sents, valid_tags)