Example #1
0
    def tag_freq(self, t):
        """Look up the frequency stored for tag t.

        t -- tag used as the key into self.tagsAppearances.
        """
        appearances = self.tagsAppearances
        return appearances[t]

    def tag_word_dict(self, t):
        """Build a fresh dict from the word data stored for tag t.

        t -- tag used as the key into self.tagDict.
        """
        words_for_tag = self.tagDict[t]
        return dict(words_for_tag)


if __name__ == '__main__':
    # Script entry point: build the options mapping from the module
    # docstring via docopt.
    opts = docopt(__doc__)

    # load the data
    # The corpus location comes from the '<path>' command-line option.
    corpus = SimpleAncoraCorpusReader(opts['<path>'])
    sents = corpus.tagged_sents()

    # compute the statistics
    stats = POSStats(sents)

    # Report the overall counts, one labelled line per statistic.
    print('Basic Statistics')
    print('================')
    print('sents: {}'.format(stats.sent_count()))
    # NOTE(review): token_count and word_count are kept in locals rather than
    # inlined; presumably they are reused further down -- confirm.
    token_count = stats.token_count()
    print('tokens: {}'.format(token_count))
    word_count = stats.word_count()
    print('words: {}'.format(word_count))
    print('tags: {}'.format(stats.tag_count()))
    print('')

    # Heading for the next report section.
    print('Most Frequent POS Tags')
Example #2
0
                   == ground_truth[unknown]).sum() / unknown.sum() * 100
    print("Accuracy for unknown words: {:2.2f}%".format(unknown_acc))

    if show_confusion_matrix:
        top = 5
        top_tags = np.argsort(-counts)[:top]
        labels = labels[top_tags]

        cm = cm.astype('float') / cm.sum()
        cm = cm[top_tags][:, top_tags]

        plot_confusion_matrix(cm, labels)


if __name__ == '__main__':
    # Script entry point: build the options mapping from the module
    # docstring via docopt.
    opts = docopt(__doc__)

    # load the model
    # NOTE: pickle.load can execute arbitrary code while unpickling; only
    # point '-i' at model files from a trusted source.
    filename = opts['-i']
    # The context manager closes the file even if unpickling raises.
    with open(filename, 'rb') as f:
        model = pickle.load(f)

    # load the data
    # Raw string: the pattern keeps its backslashes without relying on
    # invalid escape sequences in a normal string literal.
    files = r'3LB-CAST/.*\.tbf\.xml'
    corpus = SimpleAncoraCorpusReader(ANCORA_CORPUS_PATH, files)
    sents = list(corpus.tagged_sents())

    # tag and evaluate
    print_results(model, sents, opts['-c'])