Beispiel #1
0
def evaluate_classification(true_labels, pred_labels):
    """Collect per-class scores for a set of predictions.

    The two label sequences are handed to ``MLToolkit.get_score_report``
    (predictions first, ground truth second) and the per-label values it
    returns are copied into a plain dict.

    Returns a dict with one entry per class name ('entity', 'matrix',
    'relational', 'list', 'nondata'), each a dict with the keys 'p', 'r',
    'f', 'tp', 'fp' and 'fn', plus an 'acc' entry holding the ``f_micro``
    value of the score report. Classes the report does not mention keep
    their initial zeros.
    """
    toolkit = MLToolkit()

    # Start from an all-zero skeleton so every class is always present.
    summary = {}
    for class_name in ('entity', 'matrix', 'relational', 'list', 'nondata'):
        summary[class_name] = {'p': 0, 'r': 0, 'f': 0, 'tp': 0, 'fp': 0, 'fn': 0}
    summary['acc'] = 0

    report = toolkit.get_score_report(pred_labels, true_labels)
    fscore, precision, recall, f_micro, p_micro, r_micro, tp, fp, fn = report

    for label in fscore.keys():
        summary[label].update(
            p=precision[label],
            r=recall[label],
            f=fscore[label],
            tp=tp[label],
            fp=fp[label],
            fn=fn[label],
        )

    summary['acc'] = f_micro
    return summary
def evaluate_classification(true_labels, pred_labels):
    """Score predictions per class and attach a confusion matrix.

    ``MLToolkit.get_score_report`` is called with the predictions first and
    the ground truth second; its per-label values are copied into the
    result. The labels are then converted to their positions in a fixed
    class list and passed to ``MLToolkit.calc_conf_matrix``.

    Returns a dict holding, for each of 'entity', 'matrix', 'relational',
    'list' and 'nondata', a dict with the keys 'p', 'r', 'f', 'tp', 'fp',
    'fn'; an 'acc' entry set to the report's ``f_micro`` value; and a 'conf'
    entry with the ``.tolist()`` form of the confusion matrix.

    A label that is not one of the five class names raises ValueError when
    it is converted to an index.
    """
    mltoolkit = MLToolkit()

    metric_keys = ('p', 'r', 'f', 'tp', 'fp', 'fn')
    result = {}
    for name in ('entity', 'matrix', 'relational', 'list', 'nondata'):
        result[name] = dict((key, 0) for key in metric_keys)
    result['acc'] = 0

    (fscore, precision, recall,
     f_micro, p_micro, r_micro,
     tp, fp, fn) = mltoolkit.get_score_report(pred_labels, true_labels)

    for label in fscore.keys():
        per_class = result[label]
        per_class['p'] = precision[label]
        per_class['r'] = recall[label]
        per_class['f'] = fscore[label]
        per_class['tp'] = tp[label]
        per_class['fp'] = fp[label]
        per_class['fn'] = fn[label]
    result['acc'] = f_micro

    # The confusion matrix works on integer class ids; the position in this
    # list is the id.
    classes = ['relational', 'entity', 'matrix', 'list', 'nondata']
    true_ids = list(map(classes.index, true_labels))
    pred_ids = list(map(classes.index, pred_labels))
    conf_matrix = mltoolkit.calc_conf_matrix(pred_ids, true_ids, classes)
    result['conf'] = conf_matrix.tolist()
    return result
            # print 'fingerprint not found {}'.format(x['fingerprint'])
            count +=1
    print '{} tables not found out of {}'.format(count, len(gt))
    # exit(0)
    new_gt = []
    res = []
    for x in gt:
        if x['cdr_id'] in tables and x['fingerprint'] in tables[x['cdr_id']]:
            new_gt.append(x)
            res.append(tables[x['cdr_id']][x['fingerprint']])
        else:
            res.append(dict(tableType='NON-DATA'))
    return res, new_gt

if __name__ == '__main__':
    # Script entry point. Command-line arguments, as read below:
    #   argv[1] - input file, parsed as one JSON document per line
    #   argv[2] - ground-truth location handed to load_GT
    #   argv[3] - output file, opened for writing
    mltoolkit = MLToolkit()
    # NOTE(review): infile and outfile are not closed anywhere in the visible
    # part of this block - confirm they are closed further down.
    infile = open(sys.argv[1])
    gt = load_GT(sys.argv[2])
    outfile = open(sys.argv[3], 'w')

    # Each line of the input file is decoded as its own JSON object.
    tables_in = [json.loads(x) for x in infile]

    # Both names are rebound to the pair returned by get_GT_tables.
    tables_in, gt = get_GT_tables(tables_in, gt)
    # Predicted label: the table's 'tableType' translated through wc_mapping.
    pred_labels = [wc_mapping[x['tableType']] for x in tables_in]
    # Reference label: the 'label' field of each ground-truth record.
    true_labels = [x['label'] for x in gt]
    # All-zero result skeleton: one p/r/f/tp/fp/fn dict per class plus 'acc'.
    eval_res = dict(entity=dict(p=0,r=0,f=0,tp=0,fp=0,fn=0),
                        matrix=dict(p=0, r=0, f=0,tp=0,fp=0,fn=0),
                        relational=dict(p=0, r=0, f=0,tp=0,fp=0,fn=0),
                        list=dict(p=0, r=0, f=0,tp=0,fp=0,fn=0),
                        nondata=dict(p=0, r=0, f=0,tp=0,fp=0,fn=0),
                        acc=0)
Beispiel #4
0
        ann_tables.append(t)
    return ann_tables

def get_subsets(GT, train_indices):
    """Split annotated records into a training set and a held-out test set.

    Args:
        GT: sequence of records. Each record is indexed with 'vector' (the
            feature values) and 'labels'; ``labels[2]`` is used as the
            target value.
        train_indices: positions in ``GT`` that make up the training set.
            Their order (and any duplicates) is kept in the output.

    Returns:
        A tuple ``(Xtrain, ytrain, Xtest, ytest)``. ``Xtrain`` and ``Xtest``
        are numpy arrays built from the 'vector' entries; ``ytrain`` and
        ``ytest`` are plain lists of ``labels[2]``. The test set contains, in
        ``GT`` order, every record whose position is not a training index
        and whose 'labels' do not contain 'THROW'. Training records are not
        filtered on 'THROW'.
    """
    Xtrain = [np.array(GT[i]['vector']) for i in train_indices]
    ytrain = [GT[i]['labels'][2] for i in train_indices]

    # Membership is tested once per record; a set keeps that O(1) instead of
    # rescanning the whole index list every time.
    train_lookup = set(train_indices)

    Xtest = []
    ytest = []
    for position, record in enumerate(GT):
        if position in train_lookup:
            continue
        labels = record['labels']
        if 'THROW' in labels:
            continue
        Xtest.append(np.array(record['vector']))
        ytest.append(labels[2])

    return np.array(Xtrain), ytrain, np.array(Xtest), ytest

if __name__ == '__main__':
    # Driver script: for every domain / cv_name / reg combination, load the
    # matching JSON-lines file and split it with get_subsets, using the
    # collected "easy" indices as the training set.
    mlToolkit = MLToolkit()
    vizToolkit = VizToolkit()
    do_hard = False
    # Input and output roots, relative to the working directory.
    GT_path = '../../../result/all_output/'
    res_path = '../../../result/output_easy/'
    for domain in ['HT']:
        # Flatten all lists stored under easy_samples[domain] into one list.
        easy = []
        for l in easy_samples[domain].values():
            easy += l
        for cv_name in sentences.keys():
            for reg in regularize:
                # One JSON record per line of '<domain>/<reg>.<cv_name>.out.jl'.
                # NOTE(review): the file handle opened here is never closed explicitly.
                GT = [json.loads(x) for x in open(GT_path+'{}/{}.{}.out.jl'.format(domain, reg, cv_name))]
                Xtrain, ytrain, Xtest, ytest = get_subsets(GT, easy)


                # Prefix for this configuration's result path.
                out_path = res_path + '{}/{}.{}'.format(domain, reg, cv_name)
Beispiel #5
0
__author__ = 'majid'
import json
import os
import sys
import re
import numpy as np
from jsonpath_rw import jsonpath, parse
if __name__ == '__main__' and __package__ is None:
    sys.path.append(os.path.join(os.path.dirname(__file__), '..'))
    from baselines.nnet.tablenet import TCM
    from toolkit import MLToolkit
    from eval_classification import wc_mapping, evaluate_classification
mltoolkit = MLToolkit()


def tokenize_cell(text, i, j):
    """Turn the raw text of one table cell into a list of lowercase tokens.

    Args:
        text: cell content, possibly containing markup.
        i: value inserted into the leading 'row{}' token.
        j: value inserted into the leading 'col{}' token.

    Returns:
        A list starting with 'row<i>' and 'col<j>', followed by the
        whitespace-separated, lowercased pieces of the cleaned text. When
        nothing is left after cleaning, the list ends with a single empty
        string (the result of splitting an empty string).
    """
    # Raw strings keep the regex escapes (\w, \s, \., \-) out of Python's own
    # string-escape processing; the patterns themselves are unchanged.
    text = re.sub(r'[<>]', ' ', text)
    # Drop attribute-like fragments of the form name="value".
    text = re.sub(r'[\w\-]+="[\w/\.\s\-_]+"', ' ', text)
    text = re.sub(r'"', ' ', text)
    # Collapse every run of whitespace into one space before trimming.
    text = re.sub(r'\s+', ' ', text)

    text = text.strip()
    text = text.lower()

    return ['row{}'.format(i), 'col{}'.format(j)] + re.split(r'\s+', text)


def tokenize_table(t, num_token=50):
    tok_tarr = []
    num_col = 0
    for i, r in enumerate(t['rows']):
        ll,
        colors=['blue', 'green', 'crimson', 'purple', 'black'],
        markers=['o', '*', 'v', 'X', 'P'],
        classes=['RELATIONAL', 'ENTITY', 'MATRIX', 'LIST', 'NON-DATA'],
        save_to_file='figs/scatter_HT.pdf')


if __name__ == '__main__':
    # Path templates; each '{}' is a placeholder to be filled with str.format.
    # NOTE(review): these are absolute paths on one developer's machine.
    gtfile = '/Users/majid/DIG/data/{}_annotated.jl'
    gttables = '/Users/majid/DIG/data/{}_annotated_tables_cl.jl'

    infile = '/Users/majid/DIG/tabvec/output/evaluation/{}.json'
    infile2 = '/Users/majid/DIG/tabvec/output/evaluation/{}/{}_result.json'

    viztk = VizToolkit()
    mltk = MLToolkit()

    # exit(0)

    # Parameter grids; presumably embedding dimensions (dd) and a second
    # numeric setting (nn) - TODO confirm against the code that consumes them.
    dd = [20, 50, 100, 200, 400]
    nn = [4, 6, 8, 10, 12, 14]

    # Combinations of context types, each given as a list of names.
    sentences = [['text'], ['cell'], ['text', 'cell'],
                 ['cell', 'hrow', 'hcol'], ['cell', 'hrow', 'hcol', 'adjcell'],
                 ['text', 'hrow', 'cell', 'adjcell', 'hcol']]
    # Domain identifiers.
    domains = ['ATF', 'HT', 'SEC', 'WCC']

    # num_plots = len(dd)* len(nn)
    # for i, n in enumerate(nn):
    #     for j, d in enumerate(dd):
    #         # get the data