Example #1
"""
from docopt import docopt
import pickle

from tagging.ancora import SimpleAncoraCorpusReader
from tagging.baseline import BaselineTagger, BadBaselineTagger
from tagging.hmm import MLHMM

models = {'badbase': BadBaselineTagger, 'base': BaselineTagger, 'mlhmm': MLHMM}

if __name__ == '__main__':
    opts = docopt(__doc__)

    # load the data
    files = r'CESS-CAST-(A|AA|P)/.*\.tbf\.xml'  # regex selecting the training files
    corpus = SimpleAncoraCorpusReader('ancora/ancora-3.0.1es/', files)
    sents = corpus.tagged_sents()

    # train the model
    model_class = models[opts['-m']]
    print(opts['-m'])  # echo the selected model name
    if opts['-m'] == 'mlhmm':
        model = model_class(int(opts['-n']), sents)
    else:
        model = model_class(sents)

    # save it
    filename = opts['-o']
    with open(filename, 'wb') as f:
        pickle.dump(model, f)
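As a complement to this training script, here is a minimal sketch (not part of the original example) of how a model pickled this way could be loaded back and used. The pickle name is hypothetical and must match whatever was passed as -o; model.tag is the tagging call the evaluation examples below rely on.

import pickle

# hypothetical file name; must match the -o argument used at training time
with open('mlhmm.pickle', 'rb') as f:
    model = pickle.load(f)

# tag a tokenized sentence (the same toy sentence appears commented out in Example #9)
sent = 'El gato come pescado .'.split()
print(model.tag(sent))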
Example #2
        return len(self.tags())

    def tag_freq(self, t):
        """Frequency of tag t."""
        return self.tagsAppearances[t]

    def tag_word_dict(self, t):
        """Dictionary of words and their counts for tag t."""
        return dict(self.tagDict[t])


if __name__ == '__main__':
    opts = docopt(__doc__)

    # load the data
    corpus = SimpleAncoraCorpusReader(opts['<path>'])
    sents = corpus.tagged_sents()

    # compute the statistics
    stats = POSStats(sents)

    print('Basic Statistics')
    print('================')
    print('sents: {}'.format(stats.sent_count()))
    token_count = stats.token_count()
    print('tokens: {}'.format(token_count))
    word_count = stats.word_count()
    print('words: {}'.format(word_count))
    print('tags: {}'.format(stats.tag_count()))
    print('')
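The example is cut off after the basic counts; a small, hedged continuation using only the per-tag methods defined above could look like this (the tag 'vmip3s0' is just an illustrative guess at an Ancora tag):

    # illustrative follow-up, not part of the original snippet
    tag = 'vmip3s0'  # any tag actually present in the corpus would do
    print('occurrences of {}: {}'.format(tag, stats.tag_freq(tag)))
    words = stats.tag_word_dict(tag)
    print('distinct words tagged {}: {}'.format(tag, len(words)))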
Example #3
    print('\b' * width + msg, end='')
    sys.stdout.flush()


if __name__ == '__main__':
    opts = docopt(__doc__)

    # load the model
    filename = opts['-i']
    with open(filename, 'rb') as f:
        model = pickle.load(f)

    # load the data
    files = r'3LB-CAST/.*\.tbf\.xml'  # regex selecting the evaluation files
    corpus = SimpleAncoraCorpusReader(opts['-c'], files)
    sents = list(corpus.tagged_sents())

    # tag and evaluate
    hits, total = 0, 0
    unk_hits, unk_total = 0, 0
    error_count = defaultdict(lambda: defaultdict(int))
    error_sents = defaultdict(lambda: defaultdict(set))
    n = len(sents)
    for i, sent in enumerate(sents):
        word_sent, gold_tag_sent = zip(*sent)
        model_tag_sent = model.tag(word_sent)
        assert len(model_tag_sent) == len(gold_tag_sent), i

        # global score
        hits_sent = [m == g for m, g in zip(model_tag_sent, gold_tag_sent)]
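The loop is truncated at this point; a plausible continuation, assuming overall accuracy is simply hits over total, might be:

        # accumulate per-sentence hits into the global counters (assumed continuation)
        hits += sum(hits_sent)
        total += len(sent)

    # after the loop: report overall accuracy
    acc = float(hits) / total
    print('Accuracy: {:2.2f}%'.format(acc * 100))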
Example #4
                   == ground_truth[unknown]).sum() / unknown.sum() * 100
    print("Accuracy for unknown words: {:2.2f}%".format(unknown_acc))

    if show_confusion_matrix:
        top = 5
        top_tags = np.argsort(-counts)[:top]
        labels = labels[top_tags]

        cm = cm.astype('float') / cm.sum()
        cm = cm[top_tags][:, top_tags]

        plot_confusion_matrix(cm, labels)


if __name__ == '__main__':
    opts = docopt(__doc__)

    # load the model
    filename = opts['-i']
    with open(filename, 'rb') as f:
        model = pickle.load(f)

    # load the data
    files = r'3LB-CAST/.*\.tbf\.xml'  # regex selecting the evaluation files
    corpus = SimpleAncoraCorpusReader(ANCORA_CORPUS_PATH, files)
    sents = list(corpus.tagged_sents())

    # tag and evaluate
    print_results(model, sents, opts['-c'])
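For readers wondering where a matrix like cm above comes from, here is a self-contained sketch (not taken from the original script) of building a gold-vs-predicted confusion matrix with plain numpy; the tag arrays are made up:

import numpy as np

gold = np.array(['nc0s000', 'vmip3s0', 'nc0s000', 'sps00'])  # hypothetical gold tags
pred = np.array(['nc0s000', 'nc0s000', 'nc0s000', 'sps00'])  # hypothetical predicted tags

labels, counts = np.unique(gold, return_counts=True)  # tag inventory and gold-tag counts
index = {t: i for i, t in enumerate(labels)}

cm = np.zeros((len(labels), len(labels)), dtype=int)
for g, p in zip(gold, pred):
    cm[index[g], index[p]] += 1  # rows: gold tag, columns: predicted tag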
Example #5
        return len(self.tags())

    def tag_freq(self, t):
        """Frequency of tag t."""
        return sum(self._tag_word_dict[t].values())

    def tag_word_dict(self, t):
        """Dictionary of words and their counts for tag t."""
        return dict(self._tag_word_dict[t])


if __name__ == '__main__':
    opts = docopt(__doc__)

    # load the data
    corpus = SimpleAncoraCorpusReader('./ancora-dataset/ancora-3.0.1es/')
    sents = corpus.tagged_sents()

    # compute the statistics
    stats = POSStats(sents)

    print('Basic Statistics')
    print('================')
    print('sents: {}'.format(stats.sent_count()))
    token_count = stats.token_count()
    print('tokens: {}'.format(token_count))
    word_count = stats.word_count()
    print('words: {}'.format(word_count))
    print('tags: {}'.format(stats.tag_count()))
    print('')
Example #6
        """Frequency of tag t."""
        return self._countTag[t]

    def tag_word_dict(self, t):
        """Dictionary of words and their counts for tag t."""
        return dict(self._tcount[t])


if __name__ == '__main__':
    opts = docopt(__doc__)

    # load the data

    # corpus = SimpleAncoraCorpusReader(opts['-c'])  # I don't know why it doesn't load this way
    # corpus = SimpleAncoraCorpusReader("ancora-3.0.1es")
    corpus = SimpleAncoraCorpusReader(opts['<path>'])  # per the documentation I found, I changed it to '<path>' to avoid hardcoding
    sents = corpus.tagged_sents()

    count = defaultdict(int)

    # compute the statistics
    stats = POSStats(sents)

    print('Basic Statistics')
    print('================')
    print('sents: {}'.format(stats.sent_count()))
    token_count = stats.token_count()
    print('tokens: {}'.format(token_count))
Example #7
    def tag_freq(self, t):
        """Frequency of tag t."""
        #        return self.tag_dict[t] / self.tokenCount
        return self.freq_tag_dict[t]

    def tag_word_dict(self, t):
        """Dictionary of words and their counts for tag t."""
        return dict(self.tag_dict[t])


# This runs the program
if __name__ == '__main__':
    opts = docopt(__doc__)

    # load the data
    # changed opts['-c'] to opts['<path>']; it doesn't work otherwise
    corpus = SimpleAncoraCorpusReader(opts['<path>'])
    sents = corpus.tagged_sents()

    # compute the statistics
    stats = POSStats(sents)

    print('Basic Statistics')
    print('================')
    print('sents: {}'.format(stats.sent_count()))
    token_count = stats.token_count()
    print('tokens: {}'.format(token_count))
    word_count = stats.word_count()
    print('words: {}'.format(word_count))
    print('tags: {}'.format(stats.tag_count()))
    print('')
    print('Example of word frequency')
Example #8
        return len(self._tags)

    def tag_freq(self, t):
        """Frequency of tag t."""
        return self._tag_freq[t]

    def tag_word_dict(self, t):
        """Dictionary of words and their counts for tag t."""
        return dict(self._tcount[t])


if __name__ == '__main__':
    opts = docopt(__doc__)

    # load the data
    corpus = SimpleAncoraCorpusReader('corpus/ancora-3.0.1es/')
    sents = corpus.tagged_sents()

    # compute the statistics
    stats = POSStats(sents)

    print('Basic Statistics')
    print('================')
    print('sents: {}'.format(stats.sent_count()))
    token_count = stats.token_count()
    print('tokens: {}'.format(token_count))
    word_count = stats.word_count()
    print('words: {}'.format(word_count))
    print('tags: {}'.format(stats.tag_count()))
    print('')
Example #9
from tagging.classifier import *
import time

models = {
    'badbase': BadBaselineTagger,
    'base': BaselineTagger,
    'classifier': ClassifierTagger,
}

path = 'ancora-3.0.1es'  # path to the corpus
filename = 'classifierLR'  # name of the pickle file
selectedModel = 'classifier'

# load the data
files = r'CESS-CAST-(A|AA|P)/.*\.tbf\.xml'  # regex selecting the training files
corpus = SimpleAncoraCorpusReader(path, files)
sents = list(corpus.tagged_sents())

# train the model
model_class = models[selectedModel]

start = time.time()
model = model_class(sents, 'lr')  # 'lr' presumably selects a logistic-regression classifier (cf. the 'classifierLR' file name)
end = time.time()
print(end - start)  # training time in seconds
print((end - start) / 60)  # training time in minutes

#sent = 'El gato come pescado .'.split()
#model.tag(sent)

# save it
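The snippet stops right after the '# save it' comment; mirroring the pickling step of Example #1, a plausible continuation would be:

import pickle  # not among the imports shown above, but needed for the dump

with open(filename, 'wb') as f:
    pickle.dump(model, f)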