Exemple #1
0
if __name__ == '__main__':
    # Script entry point: parse the command line, load a pickled model and
    # the corpus, and initialise the accumulators used for the statistics.
    opts = docopt(__doc__)

    # Load the model.
    print("\nLoading the model...")
    filename = opts['-i']
    # NOTE(review): unpickling can execute arbitrary code; only pass model
    # files that come from a trusted source.
    # The context manager closes the file even if unpickling fails.
    with open(filename, 'rb') as f:
        model = pickle.load(f)
    print("Model type: %s" % type(model))

    # Load the data.
    print("Loading corpus data...")
    # Raw string: '\.' is a regex escape, not a Python string escape.
    files = r'3LB-CAST/.*\.tbf\.xml'
    corpus = SimpleAncoraCorpusReader('corpus/ancora-2.0/', files)
    sents = list(corpus.tagged_sents())

    # Compute statistics.
    print("Computing results...")
    # Global accuracy of the model (percentage of correctly tagged words).
    acc, hits, total = 0.0, 0, 0
    # Accuracy over words known (k) and unknown (u) to the model.
    hits_k, total_k, hits_u, total_u = 0, 0, 0, 0
    y_true, y_pred = [], []

    # Data for the confusion matrix: every gold tag present in the corpus.
    tagset = set()
    for t_sent in sents:
        for _, tag in t_sent:
            tagset.add(tag)
Exemple #2
0

if __name__ == '__main__':
    # Script entry point: parse the command line, load a pickled model and
    # the corpus, and initialise the counters used by the evaluation.
    opts = docopt(__doc__)

    # Load the model.
    filename = opts['-i']
    # NOTE(review): unpickling can execute arbitrary code; only pass model
    # files that come from a trusted source.
    # The context manager closes the file even if unpickling fails.
    with open(filename, 'rb') as f:
        model = pickle.load(f)

    # Load the data.
    # Raw string: '\.' is a regex escape, not a Python string escape.
    files = r'3LB-CAST/.*\.tbf\.xml'
    PATH = "./../../ancora-3.0.1es"
    corpus = SimpleAncoraCorpusReader(PATH, files)
    sents = list(corpus.tagged_sents())

    # Overall tagging counters.
    hits = 0
    total = 0

    # Counters for known words.
    hits_known_word = 0
    total_known_word = 0

    # Counters for unknown words.
    hits_unknown_word = 0
    total_unknown_word = 0

    # For the confusion matrix.
    tags_gold = []  # Correct (gold) tags
Exemple #3
0
def evaluate(model=None, matrix='n'):
    '''
    Evaluate a tagging model against the 3LB-CAST files of the corpus and
    print total, known-word and unknown-word accuracy plus the running time.

    model --   The trained model to evaluate. When None, the command line is
               parsed with docopt and the model is unpickled from
               'Models/<value of -i>'.
    matrix --  If you want to generate the confusion matrix ('y' or True) or
               not ('n' or any other value). When model is None the '-m'
               command line option is used instead.

    An accuracy whose denominator is zero (for instance when the corpus has
    no unknown words) is reported as 0 instead of raising.
    '''
    start = time()

    # Base name (without extension) of the confusion matrix image. Only the
    # command line path has a model file name to derive it from, so a fixed
    # name is used when the model is passed in directly.
    plot_basename = 'confusion_matrix'

    if model is None:
        opts = docopt(__doc__)
        matrix = opts['-m'] == 'y'

        # load the model
        # NOTE(review): unpickling can execute arbitrary code; only pass
        # model files that come from a trusted source.
        filename = 'Models/' + opts['-i']
        with open(filename, 'rb') as f:
            model = pickle.load(f)
        plot_basename = filename.split('.')[0]
    else:
        # A bare truthiness test would treat the documented 'n' as "yes",
        # because any non-empty string is true.
        matrix = matrix is True or matrix == 'y'

    # load the data
    # Raw string: '\.' is a regex escape, not a Python string escape.
    files = r'3LB-CAST/.*\.tbf\.xml'
    actual_dir = os.path.dirname(os.path.abspath(__file__))

    corpus = SimpleAncoraCorpusReader(actual_dir + '/corpus/ancora/', files)
    sents = list(corpus.tagged_sents())
    n = len(sents)

    # tag
    hits, total = 0, 0
    hits_known, hits_unknown = 0, 0
    total_known, total_unknown = 0, 0

    # confusion matrix: gold tags and predicted tags, token by token
    test = []
    prediction = []

    for i, sent in enumerate(sents):
        if not sent:
            # Nothing to tag; zip(*sent) cannot be unpacked for it either.
            continue
        word_sent, gold_tag_sent = zip(*sent)
        model_tag_sent = model.tag(word_sent).tolist()
        assert len(model_tag_sent) == len(gold_tag_sent), i

        # For confusion matrix
        test.extend(gold_tag_sent)
        prediction.extend(model_tag_sent)

        # Score every token once, split by whether the model knows the word.
        for word, model_tag, gold_tag in zip(word_sent, model_tag_sent,
                                             gold_tag_sent):
            hit = int(model_tag == gold_tag)
            if model.unknown(word):
                total_unknown += 1
                hits_unknown += hit
            else:
                total_known += 1
                hits_known += hit

        # global score
        hits = hits_known + hits_unknown
        total = total_known + total_unknown
        total_acc = _ratio(hits, total)

        progress('{:3.1f}% (Total: {:2.2f}%)'.format(
            float(i) * 100 / n, total_acc * 100))

    # Compute accuracy
    total_acc = _ratio(hits, total)
    known_acc = _ratio(hits_known, total_known)
    unknown_acc = _ratio(hits_unknown, total_unknown)
    finish = time() - start
    print('')
    print('Total accuracy: {:2.2f}%'.format(total_acc * 100))
    print('Known accuracy: {:2.2f}%'.format(known_acc * 100))
    print('Unknown accuracy: {:2.2f}%'.format(unknown_acc * 100))
    print('Time running: {:2.2f}seconds'.format(finish))

    if matrix:
        conf_matrix = confusion_matrix(test, prediction)
        classes = sorted(set(test) | set(prediction))
        plot_confusion_matrix(conf_matrix, classes, plot_basename + '.png')


def _ratio(part, whole):
    '''Return part / whole as a float, or 0.0 when whole is zero.'''
    return float(part) / whole if whole else 0.0