Esempio n. 1
0
    sys.stdout.flush()


if __name__ == '__main__':
    opts = docopt(__doc__)

    # Load the pickled model named by the -i option. The context manager
    # closes the file even when unpickling raises.
    # NOTE(review): pickle can execute arbitrary code; only load trusted files.
    filename = opts['-i']
    with open(filename, 'rb') as f:
        model = pickle.load(f)

    # Load the data. The file pattern contains backslash escapes, so it is
    # written as a raw string (same value, no invalid-escape warning).
    files = r'3LB-CAST/.*\.tbf\.xml'
    PATH = "./../../ancora-3.0.1es"
    corpus = SimpleAncoraCorpusReader(PATH, files)
    sents = list(corpus.tagged_sents())

    # Global tagging counters
    hits = 0
    total = 0

    # Counters for known words
    hits_known_word = 0
    total_known_word = 0

    # Counters for unknown words
    hits_unknown_word = 0
    total_unknown_word = 0

    # For the confusion matrix
Esempio n. 2
0
    print('\b' * width + msg, end='')
    sys.stdout.flush()


if __name__ == '__main__':
    opts = docopt(__doc__)

    # Load the pickled model named by the -i option. The context manager
    # closes the file even when unpickling raises.
    # NOTE(review): pickle can execute arbitrary code; only load trusted files.
    filename = opts['-i']
    with open(filename, 'rb') as f:
        model = pickle.load(f)

    # Load the data. The file pattern contains backslash escapes, so it is
    # written as a raw string (same value, no invalid-escape warning).
    files = r'3LB-CAST/.*\.tbf\.xml'
    corpus = SimpleAncoraCorpusReader('ancora/ancora-3.0.1es/', files)
    sents = list(corpus.tagged_sents())

    # Tag every sentence and accumulate the global hit count.
    hits, total = 0, 0
    n = len(sents)
    for i, sent in enumerate(sents):
        # Split the (word, tag) pairs into parallel sequences.
        word_sent, gold_tag_sent = zip(*sent)

        model_tag_sent = model.tag(word_sent)
        # The sentence index is attached so a length mismatch can be traced.
        assert len(model_tag_sent) == len(gold_tag_sent), i

        # global score
        hits_sent = [m == g for m, g in zip(model_tag_sent, gold_tag_sent)]
        hits += sum(hits_sent)
        total += len(sent)
Esempio n. 3
0
if __name__ == '__main__':
    opts = docopt(__doc__)

    # Load the pickled model named by the -i option. The context manager
    # closes the file even when unpickling raises.
    # NOTE(review): pickle can execute arbitrary code; only load trusted files.
    print("\nLoading the model...")
    filename = opts['-i']
    with open(filename, 'rb') as f:
        model = pickle.load(f)
    print("Model type: %s" % type(model))

    # Load the data. The file pattern contains backslash escapes, so it is
    # written as a raw string (same value, no invalid-escape warning).
    print("Loading corpus data...")
    files = r'3LB-CAST/.*\.tbf\.xml'
    corpus = SimpleAncoraCorpusReader('corpus/ancora-2.0/', files)
    sents = list(corpus.tagged_sents())

    # compute statistics
    print("Computing results...")
    # Global accuracy of the model (percentage of right tagging)
    acc, hits, total = 0.0, 0, 0
    # Accuracy over known (k) and unknown (u) words for the model
    hits_k, total_k, hits_u, total_u = 0, 0, 0, 0
    y_true, y_pred = [], []

    # Data for Confusion Matrix
    tagset = set()
    for t_sent in sents:
        for _, tag in t_sent:
Esempio n. 4
0
    print('\b' * width + msg, end='')
    sys.stdout.flush()


if __name__ == '__main__':
    opts = docopt(__doc__)

    # Load the pickled model named by the -i option. The context manager
    # closes the file even when unpickling raises.
    # NOTE(review): pickle can execute arbitrary code; only load trusted files.
    print('Loading model...')
    filename = opts['-i']
    with open(filename, 'rb') as f:
        model = pickle.load(f)

    # The file pattern contains backslash escapes, so it is written as a raw
    # string (same value, no invalid-escape warning).
    print('Loading corpus...')
    files = r'3LB-CAST/.*\.tbf\.xml'
    corpus = SimpleAncoraCorpusReader('ancora/ancora-2.0/', files)
    parsed_sents = list(corpus.parsed_sents())

    print('Parsing...')
    hits, total_gold, total_model = 0, 0, 0
    n = len(parsed_sents)
    # Progress line: percentage done, sentence counter, then P/R/F1.
    format_str = '{:3.1f}% ({}/{}) (P={:2.2f}%, R={:2.2f}%, F1={:2.2f}%)'
    progress(format_str.format(0.0, 0, n, 0.0, 0.0, 0.0))
    for i, gold_parsed_sent in enumerate(parsed_sents):
        # The model is fed the gold (word, tag) pairs of the sentence.
        tagged_sent = gold_parsed_sent.pos()

        # parse
        model_parsed_sent = model.parse(tagged_sent)

        # compute labeled scores
        gold_spans = spans(gold_parsed_sent, unary=False)
Esempio n. 5
0
# Maps the value of the -m option to the parser class to train.
models = {
    'flat': Flat,
    'rbranch': RBranch,
    'lbranch': LBranch,
    'upcfg': UPCFG,
}


if __name__ == '__main__':
    opts = docopt(__doc__)

    # The file pattern contains backslash escapes, so it is written as a raw
    # string (same value, no invalid-escape warning).
    print('Loading corpus ...')
    PATH = "./../../ancora-3.0.1es"
    files = r'CESS-CAST-(A|AA|P)/.*\.tbf\.xml'
    corpus = SimpleAncoraCorpusReader(PATH, files)

    print('Training model ...')
    m = opts['-m']  # chosen model
    n = opts['-n']  # horizontal Markovization order
    # Only the UPCFG model is given a Markovization order, and only when
    # one was supplied on the command line.
    if (n is not None) and (m == "upcfg"):
        model = models[m](corpus.parsed_sents(), horzMarkov=int(n))
    else:
        model = models[m](corpus.parsed_sents())

    print('Saving ...')
    filename = opts['-o']
Esempio n. 6
0
from docopt import docopt
import pickle

from corpus.ancora import SimpleAncoraCorpusReader

from parsing.baselines import Flat, RBranch, LBranch


# Maps the value of the -m option to the baseline parser class to train.
models = {
    'flat': Flat,
    'rbranch': RBranch,
    'lbranch': LBranch,
}


if __name__ == '__main__':
    opts = docopt(__doc__)

    # The file pattern contains backslash escapes, so it is written as a raw
    # string (same value, no invalid-escape warning).
    print('Loading corpus...')
    files = r'CESS-CAST-(A|AA|P)/.*\.tbf\.xml'
    corpus = SimpleAncoraCorpusReader('ancora/ancora-2.0/', files)

    print('Training model...')
    model = models[opts['-m']](corpus.parsed_sents())

    # Pickle the trained model to the path given by -o. The context manager
    # flushes and closes the file even when pickling raises.
    print('Saving...')
    filename = opts['-o']
    with open(filename, 'wb') as f:
        pickle.dump(model, f)
Esempio n. 7
0

# Maps the value of the -m option to the parser class to train.
models = {
    'flat': Flat,
    'rbranch': RBranch,
    'lbranch': LBranch,
    'upcfg': UPCFG,
}


if __name__ == '__main__':
    opts = docopt(__doc__)

    # The file pattern contains backslash escapes, so it is written as a raw
    # string (same value, no invalid-escape warning).
    print('\nLoading corpus...')
    files = r'CESS-CAST-(A|AA|P)/.*\.tbf\.xml'
    corpus = SimpleAncoraCorpusReader('corpus/ancora-2.0/', files)

    print('Training model...')
    om, on = opts['-m'], opts['-n']
    if om == 'upcfg':
        # UPCFG is the only model that takes a Markovization order; None is
        # passed through when -n was not given.
        n = None if on is None else int(on)
        print('UPCFG model selected n={}.'.format(n))
        model = models[om](corpus.parsed_sents(), horzMarkov=n)
    elif om in models:
        # Every remaining entry of the table is built from the corpus alone.
        print(om + ' model selected.')
        model = models[om](corpus.parsed_sents())
    else:
        print('Bad model type.')
        # Exit with a failure status; unlike exit(), SystemExit does not
        # depend on the helpers installed by the site module.
        raise SystemExit(1)

    print('Saving...\n')
Esempio n. 8
0
"""Print corpus statistics.

Usage:
  stats.py
  stats.py -h | --help

Options:
  -h --help     Show this screen.
"""
from docopt import docopt

from corpus.ancora import SimpleAncoraCorpusReader

if __name__ == '__main__':
    # Parse the command line against the module docstring.
    opts = docopt(__doc__)

    # load the data
    # Materialize the tagged sentences so they can be counted.
    corpus = SimpleAncoraCorpusReader('ancora/ancora-2.0/')
    sents = list(corpus.tagged_sents())

    # compute the statistics
    # Report the number of tagged sentences returned by the reader.
    print('sents: {}'.format(len(sents)))
Esempio n. 9
0
Options:
  -h --help     Show this screen.
"""

from operator import itemgetter as elem
from docopt import docopt
from collections import Counter, defaultdict
from corpus.ancora import SimpleAncoraCorpusReader

if __name__ == '__main__':
    # Parse the command line against the module docstring.
    opts = docopt(__doc__)

    # load the data
    # NOTE(review): absolute path into one user's home directory; the script
    # only runs where this directory exists.
    path = '/home/alangb/Escritorio/ancora-3.0.1es/'
    corpus = SimpleAncoraCorpusReader(path)
    sents = list(corpus.tagged_sents())

    # compute the statistics

    # get words and tags
    # Flatten the sentences into one list of items, then split that list
    # into two parallel tuples (assumes each item is a (word, tag) pair).
    words_tags = [word_tag for sent in sents for word_tag in sent]
    words, tags = zip(*words_tags)
    # Distinct words and distinct tags.
    word_types = set(words)
    tag_types = set(tags)

    # calculate 10 most common tags
    # most_common returns (tag, count) pairs, most frequent first.
    common_tags = Counter(tags).most_common(10)

    # calculate 5 most common words
    # for each one of the most common tags
Esempio n. 10
0
    'ct': ClassifierTagger,
}

# Maps a classifier name to the classifier class it selects.
clasifiers = {
    'multinomial': MultinomialNB,
    'linear': LinearSVC,
    'LogisticRegression': LogisticRegression
}

if __name__ == '__main__':
    opts = docopt(__doc__)

    # Load the data. The file pattern contains backslash escapes, so it is
    # written as a raw string (same value, no invalid-escape warning).
    files = r'CESS-CAST-(A|AA|P)/.*\.tbf\.xml'
    # The corpus directory is resolved relative to this script's location.
    actual_dir = os.path.dirname(os.path.abspath(__file__))
    corpus = SimpleAncoraCorpusReader(actual_dir + '/corpus/ancora/', files)
    sents = list(corpus.tagged_sents())

    # train the model
    # n is always bound (None when -n was not given) so that later uses
    # cannot fail with a NameError.
    n = int(opts['-n']) if opts['-n'] is not None else None
    m = opts['-m']
    c = opts['-c']
    wv_file = opts['-i']
    # -b is a 'y'/'n' flag; it is passed on as is_bin.
    b = opts['-b'] == 'y'
    if m == 'ct':
        print("Model", m, "Training")
        model = models[m](wv_file=wv_file, is_bin=b)
        model.fit(sents)
    # save it
Esempio n. 11
0
if __name__ == '__main__':
    opts = docopt(__doc__)

    # Load the pickled model named by the -i option. The context manager
    # closes the file even when unpickling raises.
    # NOTE(review): pickle can execute arbitrary code; only load trusted files.
    print('Loading model...')
    filename = opts['-i']
    with open(filename, 'rb') as f:
        model = pickle.load(f)

    m = opts['-m']
    n = opts['-n']

    # The file pattern contains backslash escapes, so it is written as a raw
    # string (same value, no invalid-escape warning).
    print('Loading corpus...')
    files = r'3LB-CAST/.*\.tbf\.xml'
    corpus = SimpleAncoraCorpusReader('ancora-3.0.1es/', files)
    parsed_sents = list(corpus.parsed_sents())

    # -n keeps only the first n sentences; it is applied before -m.
    if n is not None:
        n = int(n)
        parsed_sents = parsed_sents[:n]
    # -m keeps only the sentences with at most m leaves.
    if m is not None:
        m = int(m)
        parsed_sents = [
            tree for tree in parsed_sents if len(tree.leaves()) <= m
        ]

    print('Parsing...')
    # Labeled counters, then their unlabeled ("un_") counterparts.
    hits, total_gold, total_model = 0, 0, 0
    un_hits, un_total_gold, un_total_model = 0, 0, 0
    # From here on n is the number of sentences actually evaluated.
    n = len(parsed_sents)
    format_str = '{:3.1f}% ({}/{}) (P={:2.2f}%, R={:2.2f}%, F1={:2.2f}%)'
    progress(format_str.format(0.0, 0, n, 0.0, 0.0, 0.0))
Esempio n. 12
0
    return (2 * precision * recall) / (precision + recall)


if __name__ == '__main__':
    opts = docopt(__doc__)

    # Load the pickled model named by the -i option. The context manager
    # closes the file even when unpickling raises.
    # NOTE(review): pickle can execute arbitrary code; only load trusted files.
    print('Loading model ...')
    filename = opts['-i']
    with open(filename, 'rb') as f:
        model = pickle.load(f)

    # The file pattern contains backslash escapes, so it is written as a raw
    # string (same value, no invalid-escape warning).
    print('Loading corpus ...')
    PATH = "./../../ancora-3.0.1es"
    files = r'3LB-CAST/.*\.tbf\.xml'
    corpus = SimpleAncoraCorpusReader(PATH, files)
    parsed_sents = list(corpus.parsed_sents())

    # Option to select only the first n sentences
    n = opts["-n"]
    if n is not None:
        n = int(n)
        parsed_sents = parsed_sents[:n]

    # Option to select only the sentences of length <= m
    m = opts["-m"]
    if m is not None:
        m = int(m)
        new_parsed_sents = []
        for parsed_sent in parsed_sents:
            if len(parsed_sent.leaves()) <= m:
Esempio n. 13
0
def evaluate(model=None, matrix='n'):
    '''
    Tag the 3LB-CAST part of the corpus with a model and print its total,
    known-word and unknown-word accuracy plus the running time.

    model --   The trained model to evaluate. If None, the command line is
               parsed with docopt and the model is unpickled from
               'Models/' + the value of the -i option.
    matrix --  'y' (or True) to also build and plot the confusion matrix,
               'n' (or False) to skip it. When model is None this argument
               is replaced by the -m command line option.

    The confusion matrix image is written next to the loaded model file
    (same name up to the first dot, '.png' extension), or to
    'confusion_matrix.png' when the model was passed in directly.
    '''
    start = time()
    # Base name of the confusion matrix image; replaced by the model path
    # when the model is loaded from disk below.
    filename = 'confusion_matrix'
    if model is None:
        opts = docopt(__doc__)
        matrix = opts['-m'] == 'y'

        # load the model
        filename = 'Models/' + opts['-i']
        # NOTE(review): pickle can execute arbitrary code; only load
        # trusted files.
        with open(filename, 'rb') as f:
            model = pickle.load(f)
    else:
        # The documented values are 'y'/'n'; a plain truthiness test would
        # treat the default 'n' as a request for the matrix.
        matrix = matrix is True or matrix == 'y'

    # load the data (raw string: the pattern contains backslash escapes)
    files = r'3LB-CAST/.*\.tbf\.xml'
    actual_dir = os.path.dirname(os.path.abspath(__file__))

    corpus = SimpleAncoraCorpusReader(actual_dir + '/corpus/ancora/', files)
    sents = list(corpus.tagged_sents())
    n = len(sents)

    # tag
    hits, total = 0, 0
    hits_known, hits_unknown = 0, 0
    total_known, total_unknown = 0, 0

    # confusion matrix: gold tags and predicted tags, in corpus order
    test = []
    prediction = []

    for i, sent in enumerate(sents):
        word_sent, gold_tag_sent = zip(*sent)
        model_tag_sent = model.tag(word_sent).tolist()
        # The sentence index is attached so a length mismatch can be traced.
        assert len(model_tag_sent) == len(gold_tag_sent), i
        # For confusion matrix
        test.extend(gold_tag_sent)
        prediction.extend(model_tag_sent)

        # Global, known-word and unknown-word scores in a single pass.
        for word, model_tag, gold_tag in zip(word_sent, model_tag_sent,
                                             gold_tag_sent):
            hit = model_tag == gold_tag
            hits += hit
            if model.unknown(word):
                total_unknown += 1
                hits_unknown += hit
            else:
                total_known += 1
                hits_known += hit
        total += len(sent)
        total_acc = float(hits) / total if total else 0.0

        progress('{:3.1f}% (Total: {:2.2f}%)'.format(
            float(i) * 100 / n, total_acc * 100))

    # Compute accuracy; an empty category counts as 0% rather than raising
    # ZeroDivisionError.
    total_acc = float(hits) / total if total else 0.0
    known_acc = float(hits_known) / total_known if total_known else 0.0
    unknown_acc = (float(hits_unknown) / total_unknown
                   if total_unknown else 0.0)
    finish = time() - start
    print('')
    print('Total accuracy: {:2.2f}%'.format(total_acc * 100))
    print('Known accuracy: {:2.2f}%'.format(known_acc * 100))
    print('Unknown accuracy: {:2.2f}%'.format(unknown_acc * 100))
    print('Time running: {:2.2f}seconds'.format(finish))

    if matrix:
        conf_matrix = confusion_matrix(test, prediction)
        # Every tag seen in the gold or the predicted sequence, sorted.
        classes = sorted(set(test) | set(prediction))
        plot_confusion_matrix(conf_matrix, classes,
                              filename.split('.')[0] + '.png')
Esempio n. 14
0
from docopt import docopt
import pickle

from corpus.ancora import SimpleAncoraCorpusReader
from tagging.baseline import BaselineTagger
from tagging.hmm import MLHMM
from tagging.memm import MEMM

if __name__ == '__main__':
    opts = docopt(__doc__)

    # Load the data. The file pattern contains backslash escapes, so it is
    # written as a raw string (same value, no invalid-escape warning).
    print("Loading corpus data...")
    files = r'CESS-CAST-(A|AA|P)/.*\.tbf\.xml'
    corpus = SimpleAncoraCorpusReader('corpus/ancora-2.0/', files)
    sents = list(corpus.tagged_sents())

    # Name of the model to train, taken from the -m option.
    m = str(opts['-m'])
    # Value of the -o option.
    filename = opts['-o']

    # train the model
    if m == "base":
        print("Baseline Model selected")
        model = BaselineTagger(tagged_sents=sents)
    elif m == "mlhmm":
        # -n is read only for models that take an order.
        n = int(opts['-n'])
        print("Maximum Likelihood Hidden Markov Model selected, n=%d" % n)
        model = MLHMM(n=n, tagged_sents=sents, addone=True)
    elif m == 'memm':