def tag_freq(self, t):
    """Return the value stored for tag t in self.tagsAppearances."""
    appearances = self.tagsAppearances
    return appearances[t]


def tag_word_dict(self, t):
    """Return a new dict holding the words and counts recorded for tag t."""
    words_for_tag = self.tagDict[t]
    return dict(words_for_tag)


if __name__ == '__main__':
    opts = docopt(__doc__)

    # Read the tagged sentences from the corpus path given on the command line.
    corpus = SimpleAncoraCorpusReader(opts['<path>'])
    sents = corpus.tagged_sents()

    # Build the statistics object over those sentences.
    stats = POSStats(sents)

    print('Basic Statistics')
    print('================')

    sent_total = stats.sent_count()
    print('sents: {}'.format(sent_total))

    # token_count and word_count stay as script-level names so that any
    # later part of the script can keep using them.
    token_count = stats.token_count()
    print('tokens: {}'.format(token_count))

    word_count = stats.word_count()
    print('words: {}'.format(word_count))

    tag_total = stats.tag_count()
    print('tags: {}'.format(tag_total))
    print('')

    print('Most Frequent POS Tags')
== ground_truth[unknown]).sum() / unknown.sum() * 100 print("Accuracy for unknown words: {:2.2f}%".format(unknown_acc)) if show_confusion_matrix: top = 5 top_tags = np.argsort(-counts)[:top] labels = labels[top_tags] cm = cm.astype('float') / cm.sum() cm = cm[top_tags][:, top_tags] plot_confusion_matrix(cm, labels) if __name__ == '__main__': opts = docopt(__doc__) # load the model filename = opts['-i'] f = open(filename, 'rb') model = pickle.load(f) f.close() # load the data files = '3LB-CAST/.*\.tbf\.xml' corpus = SimpleAncoraCorpusReader(ANCORA_CORPUS_PATH, files) sents = list(corpus.tagged_sents()) # tag and evaluate print_results(model, sents, opts['-c'])