def evaluate_classification(true_labels, pred_labels):
    """Score predicted table-type labels against ground truth.

    Returns a dict keyed by class name ('entity', 'matrix', 'relational',
    'list', 'nondata'), each holding precision/recall/F plus raw tp/fp/fn
    counts, and an 'acc' entry carrying the micro-averaged F score.
    """
    toolkit = MLToolkit()
    class_names = ('entity', 'matrix', 'relational', 'list', 'nondata')
    eval_res = {name: dict(p=0, r=0, f=0, tp=0, fp=0, fn=0) for name in class_names}
    eval_res['acc'] = 0
    (fscore, precision, recall, f_micro, p_micro, r_micro,
     tp, fp, fn) = toolkit.get_score_report(pred_labels, true_labels)
    # Copy every metric table into the matching per-class slot.
    metric_tables = (('p', precision), ('r', recall), ('f', fscore),
                     ('tp', tp), ('fp', fp), ('fn', fn))
    for label in fscore:
        for key, table in metric_tables:
            eval_res[label][key] = table[label]
    eval_res['acc'] = f_micro  # micro-F doubles as the accuracy figure here
    return eval_res
def evaluate_classification(true_labels, pred_labels):
    """Score predictions per class and attach a confusion matrix.

    Same per-class p/r/f/tp/fp/fn layout as the basic variant, plus 'acc'
    (micro-averaged F) and 'conf' (confusion matrix as nested lists).
    """
    toolkit = MLToolkit()
    class_names = ('entity', 'matrix', 'relational', 'list', 'nondata')
    eval_res = {name: dict(p=0, r=0, f=0, tp=0, fp=0, fn=0) for name in class_names}
    eval_res['acc'] = 0
    (fscore, precision, recall, f_micro, p_micro, r_micro,
     tp, fp, fn) = toolkit.get_score_report(pred_labels, true_labels)
    metric_tables = (('p', precision), ('r', recall), ('f', fscore),
                     ('tp', tp), ('fp', fp), ('fn', fn))
    for label in fscore:
        for key, table in metric_tables:
            eval_res[label][key] = table[label]
    eval_res['acc'] = f_micro
    # The confusion-matrix helper wants integer class ids in this fixed order.
    classes = ['relational', 'entity', 'matrix', 'list', 'nondata']
    true_ids = [classes.index(x) for x in true_labels]
    pred_ids = [classes.index(x) for x in pred_labels]
    eval_res['conf'] = toolkit.calc_conf_matrix(pred_ids, true_ids, classes).tolist()
    return eval_res
# print 'fingerprint not found {}'.format(x['fingerprint']) count +=1 print '{} tables not found out of {}'.format(count, len(gt)) # exit(0) new_gt = [] res = [] for x in gt: if x['cdr_id'] in tables and x['fingerprint'] in tables[x['cdr_id']]: new_gt.append(x) res.append(tables[x['cdr_id']][x['fingerprint']]) else: res.append(dict(tableType='NON-DATA')) return res, new_gt if __name__ == '__main__': mltoolkit = MLToolkit() infile = open(sys.argv[1]) gt = load_GT(sys.argv[2]) outfile = open(sys.argv[3], 'w') tables_in = [json.loads(x) for x in infile] tables_in, gt = get_GT_tables(tables_in, gt) pred_labels = [wc_mapping[x['tableType']] for x in tables_in] true_labels = [x['label'] for x in gt] eval_res = dict(entity=dict(p=0,r=0,f=0,tp=0,fp=0,fn=0), matrix=dict(p=0, r=0, f=0,tp=0,fp=0,fn=0), relational=dict(p=0, r=0, f=0,tp=0,fp=0,fn=0), list=dict(p=0, r=0, f=0,tp=0,fp=0,fn=0), nondata=dict(p=0, r=0, f=0,tp=0,fp=0,fn=0), acc=0)
# NOTE(review): chunk starts mid-function -- `ann_tables` and `t` belong to a
# def outside this view; indentation is a best-guess reconstruction.
        ann_tables.append(t)
    return ann_tables


def get_subsets(GT, train_indices):
    # Split GT (a list of dicts carrying 'vector' and 'labels') into
    # train/test feature matrices and label lists.  Test rows are every
    # index NOT in train_indices whose labels do not contain 'THROW'.
    # NOTE(review): `i not in train_indices` is O(len(train_indices)) per
    # row since train_indices is a list; a set would be faster.
    # labels[2] is used as the target -- presumably the class label at a
    # fixed position; TODO confirm against the annotation format.
    Xtrain = [np.array(GT[i]['vector']) for i in train_indices]
    ytrain = [GT[i]['labels'][2] for i in train_indices]
    Xtest = [np.array(GT[i]['vector']) for i in range(len(GT)) if i not in train_indices and 'THROW' not in GT[i]['labels']]
    ytest = [GT[i]['labels'][2] for i in range(len(GT)) if i not in train_indices and 'THROW' not in GT[i]['labels']]
    Xtest = np.array(Xtest)
    Xtrain = np.array(Xtrain)
    return Xtrain, ytrain, Xtest, ytest


if __name__ == '__main__':
    mlToolkit = MLToolkit()
    vizToolkit = VizToolkit()
    do_hard = False
    GT_path = '../../../result/all_output/'
    res_path = '../../../result/output_easy/'
    for domain in ['HT']:
        # Collect the "easy" sample indices for this domain into one list.
        easy = []
        for l in easy_samples[domain].values():
            easy += l
        for cv_name in sentences.keys():
            for reg in regularize:
                GT = [json.loads(x) for x in open(GT_path+'{}/{}.{}.out.jl'.format(domain, reg, cv_name))]
                Xtrain, ytrain, Xtest, ytest = get_subsets(GT, easy)
                out_path = res_path + '{}/{}.{}'.format(domain, reg, cv_name)
                # NOTE(review): chunk is truncated here; the remainder of the
                # loop body is outside this view.
__author__ = 'majid'
import json
import os
import sys
import re
import numpy as np
from jsonpath_rw import jsonpath, parse

# Allow running this file directly from inside the package tree by putting
# the parent directory on sys.path before the package imports below.
if __name__ == '__main__' and __package__ is None:
    sys.path.append(os.path.join(os.path.dirname(__file__), '..'))
from baselines.nnet.tablenet import TCM
from toolkit import MLToolkit
from eval_classification import wc_mapping, evaluate_classification

mltoolkit = MLToolkit()


def tokenize_cell(text, i, j):
    """Strip markup from one table cell's text and tokenize it.

    Returns positional marker tokens ('row{i}', 'col{j}') followed by the
    lower-cased, whitespace-split words of the cleaned text.
    """
    text = re.sub('[<>]', ' ', text)  # break apart HTML-ish tags
    text = re.sub('[\w\-]+="[\w/\.\s\-_]+"', ' ', text)  # drop attr="value" pairs
    text = re.sub('"', ' ', text)
    text = re.sub('\s+', ' ', text)  # collapse runs of whitespace
    text = text.strip()
    text = text.lower()
    return ['row{}'.format(i), 'col{}'.format(j)] + re.split('\s+', text)


def tokenize_table(t, num_token=50):
    # NOTE(review): this function is truncated here -- the rest of its body
    # is outside this view.
    tok_tarr = []
    num_col = 0
    for i, r in enumerate(t['rows']):
# NOTE(review): chunk starts mid-call -- these are trailing keyword arguments
# to a plotting helper whose call opens outside this view.
        ll, colors=['blue', 'green', 'crimson', 'purple', 'black'],
        markers=['o', '*', 'v', 'X', 'P'],
        classes=['RELATIONAL', 'ENTITY', 'MATRIX', 'LIST', 'NON-DATA'],
        save_to_file='figs/scatter_HT.pdf')


if __name__ == '__main__':
    # Input/output locations are hard-coded to the author's machine.
    gtfile = '/Users/majid/DIG/data/{}_annotated.jl'
    gttables = '/Users/majid/DIG/data/{}_annotated_tables_cl.jl'
    infile = '/Users/majid/DIG/tabvec/output/evaluation/{}.json'
    infile2 = '/Users/majid/DIG/tabvec/output/evaluation/{}/{}_result.json'
    viztk = VizToolkit()
    mltk = MLToolkit()
    # exit(0)
    # Hyper-parameter grids: presumably embedding dimensions (dd) and a
    # neighbor/window count (nn) -- TODO confirm against the experiment code.
    dd = [20, 50, 100, 200, 400]
    nn = [4, 6, 8, 10, 12, 14]
    # Sentence-construction variants used when building table vectors.
    sentences = [['text'], ['cell'], ['text', 'cell'], ['cell', 'hrow', 'hcol'],
                 ['cell', 'hrow', 'hcol', 'adjcell'],
                 ['text', 'hrow', 'cell', 'adjcell', 'hcol']]
    domains = ['ATF', 'HT', 'SEC', 'WCC']
    # num_plots = len(dd)* len(nn)
    # for i, n in enumerate(nn):
    #     for j, d in enumerate(dd):
    #         # get the data
    # NOTE(review): chunk is truncated here; the remainder of the main block
    # is outside this view.