def report(self, dset, pred):
    """Score predictions for ``dset`` and log the ``conlleval`` report.

    Every raw prediction is first passed, together with its sentence,
    through ``self.tdecoder.decode``.  The flattened decoded labels are
    compared with the encoded gold labels (``sent['y']``) to obtain an
    error rate; the decoded labels are then mapped back through
    ``self.feat.yenc.inverse_transform`` and ``self.tfunc`` to build the
    tag sequences handed to ``conlleval``.

    Args:
        dset: sequence of sentence dicts; the keys ``'y'``, ``'ts'`` and
            ``'wiseq'`` are read here.
        pred: one raw prediction per sentence, aligned with ``dset``.

    Returns:
        Tuple ``(yerr, pre, recall, f1)``: the fraction of mismatching
        labels, followed by three of the four scores returned by
        ``conlleval`` (presumably precision, recall and F1 -- confirm
        against ``conlleval``).
    """
    decoded = [
        self.tdecoder.decode(sent, raw) for sent, raw in zip(dset, pred)
    ]

    # Label-level error rate over the whole data set.
    gold_labels = [label for sent in dset for label in sent['y']]
    y_true = self.feat.yenc.transform(gold_labels)
    y_pred = list(chain.from_iterable(decoded))
    mismatches = np.sum(y_true != y_pred)
    yerr = mismatches / float(len(y_true))

    # Gold and predicted tag sequences, one entry per sentence.
    lts = [sent['ts'] for sent in dset]
    lts_pred = []
    for sent, labels in zip(dset, decoded):
        tseq_pred = self.feat.yenc.inverse_transform(labels)
        lts_pred.append(self.tfunc(sent['wiseq'], tseq_pred))

    scores, conll_print = conlleval(lts, lts_pred)
    _, pre, recall, f1 = scores  # the first score (wacc) is not returned

    # Blank debug records frame the multi-line report in the log.
    logging.debug('')
    logging.debug(conll_print)
    logging.debug('')

    return yerr, pre, recall, f1
def report(self, dset, pred, epoch, index):
    """Score predictions, dump gold/predicted tags to a file and log the report.

    Raw predictions are decoded with ``self.tdecoder.decode``, compared
    label by label with the encoded gold labels (``sent['y']``), and
    converted to tag sequences via ``self.feat.yenc.inverse_transform``
    and ``self.tfunc``.  The gold and predicted tags are written side by
    side to ``'tmp/' + index + str(epoch)`` and then scored with
    ``conlleval``.

    Args:
        dset: sequence of sentence dicts; the keys ``'y'``, ``'ts'`` and
            ``'wiseq'`` are read here.
        pred: one raw prediction per sentence, aligned with ``dset``.
        epoch: value appended (via ``str``) to the output file name.
        index: string prefix of the output file name inside ``tmp/``.

    Returns:
        Tuple ``(yerr, pre, recall, f1)``: the fraction of mismatching
        labels, followed by three of the four scores returned by
        ``conlleval`` (presumably precision, recall and F1 -- confirm
        against ``conlleval``).

    Raises:
        IOError/OSError: if the output file cannot be opened or written,
            e.g. when the ``tmp`` directory does not exist.
    """
    pred = [self.tdecoder.decode(s, p) for s, p in zip(dset, pred)]

    y_true = self.feat.yenc.transform([t for sent in dset for t in sent['y']])
    y_pred = list(chain.from_iterable(pred))
    # Debug output of both lengths.  A single parenthesised argument
    # prints identically under Python 2 and Python 3.
    print(len(y_true))
    print(len(y_pred))
    yerr = np.sum(y_true != y_pred) / float(len(y_true))

    lts = [sent['ts'] for sent in dset]
    lts_pred = []
    for sent, ipred in zip(dset, pred):
        tseq_pred = self.feat.yenc.inverse_transform(ipred)
        lts_pred.append(self.tfunc(sent['wiseq'], tseq_pred))

    # One "x x x <gold> <predicted>" line per token and a blank line after
    # every sentence.  The context manager closes the file even when a
    # write fails; the manual open()/close() pair leaked the handle then.
    with open('tmp/' + index + str(epoch), 'w') as out:
        for gold_sent, pred_sent in zip(lts, lts_pred):
            for gold_tag, pred_tag in zip(gold_sent, pred_sent):
                out.write('x x x %s %s\n' % (gold_tag, pred_tag))
            out.write('\n')

    (wacc, pre, recall, f1), conll_print = conlleval(lts, lts_pred)

    # Blank debug records frame the multi-line report in the log.
    logging.debug('')
    logging.debug(conll_print)
    logging.debug('')

    return yerr, pre, recall, f1
def io_ideal(dev, tst):
    """Print ``conlleval`` scores of IO-converted gold tags for dev and tst.

    For each data set the gold tag sequences (``sent['ts']``) are
    converted with ``encoding.any2io`` and scored against the unconverted
    gold tags.  Presumably this is the best score an IO tagging scheme can
    reach on the data -- confirm against ``encoding.any2io``.

    Args:
        dev: sequence of sentence dicts with a ``'ts'`` entry.
        tst: sequence of sentence dicts with a ``'ts'`` entry.

    Returns:
        None.  One tab-separated line per data set is printed: the data
        set name followed by the first element of ``conlleval``'s result.
    """
    from score import conlleval

    # Single-argument print(...) produces the same output on Python 2 and 3.
    print('io tagging ideal scores')
    for dset_str, dset in (('dev', dev), ('tst', tst)):
        ts_gold = [sent['ts'] for sent in dset]
        ts_pred = [encoding.any2io(tags) for tags in ts_gold]
        scores, _ = conlleval(ts_gold, ts_pred)
        # A list comprehension instead of map(): on Python 3 map() returns
        # an iterator, which cannot be concatenated to a list.
        print('\t'.join([dset_str] + [str(score) for score in scores]))
def _safe_percent(part, whole):
    """Return ``part / whole`` as a percentage, or 0.0 when ``whole`` is zero."""
    if not whole:
        return 0.0
    return part / float(whole) * 100


def _length_stat_rows(data, langs, dsetnames, measure):
    """Build min/max/mean/std rows of ``measure(sent)`` for every language split.

    A ``'...'`` separator row follows the rows of each language.  Empty
    splits yield zeros instead of calling the numpy reductions.
    """
    rows = []
    for lang in langs:
        for dname in dsetnames:
            sizes = [measure(sent) for sent in data[lang][dname]]
            stats = [
                int(f(sizes)) if sizes else 0
                for f in (np.min, np.max, np.mean, np.std)
            ]
            rows.append(['{}-{}'.format(lang, dname)] + stats)
        rows.append(['...'] * 5)
    return rows


def main():
    """Print corpus statistics tables for every configured language.

    Data is loaded with ``get_sents(lang)``, whose results are paired in
    order with the split names ``'trn'``, ``'dev'`` and ``'tst'``.  The
    following is printed to stdout, in this order:

    * the sorted tag inventory of each training split,
    * the number of training sentences longer than 500 characters,
    * sentence, token and character counts per split,
    * min/max/mean/std of words and of characters per sentence,
    * vocabulary sizes (``get_vocab``),
    * percentages of dev/tst words missing from the training vocabulary,
    * ``conlleval`` scores of gold tags converted by ``encoding.any2io``.

    All output uses single-argument ``print(...)`` so that it is identical
    under Python 2 and Python 3; ``print('')`` emits the blank separator
    lines.
    """
    langs = ['eng', 'deu', 'spa', 'ned', 'tr', 'cze', 'ger', 'arb0', 'ita']
    dsetnames = ['trn', 'dev', 'tst']

    data = {lang: dict(zip(dsetnames, get_sents(lang))) for lang in langs}

    # Tag inventory of each training split.
    for lang in langs:
        tags = sorted(set(t for sent in data[lang]['trn'] for t in sent['ts']))
        print('{} {}'.format(lang, tags))
    print('')

    # Number of training sentences longer than 500 characters.
    table = [
        [lang,
         sum(1 for sent in data[lang]['trn']
             if len(' '.join(sent['ws'])) > 500)]
        for lang in langs
    ]
    print(tabulate(table))

    # Sentences per split.  A list comprehension replaces map(), whose
    # Python 3 iterator result cannot be concatenated to a list.
    table = [
        [dname] + [len(data[lang][dname]) for lang in langs]
        for dname in dsetnames
    ]
    print(tabulate(table, headers=['#sent'] + langs, tablefmt='latex'))
    print('')

    # Tokens per split.
    table = [
        [dname] + [sum(len(sent['ws']) for sent in data[lang][dname])
                   for lang in langs]
        for dname in dsetnames
    ]
    print(tabulate(table, headers=['#token'] + langs))
    print('')

    # Characters per split, excluding the spaces between words; kept as
    # floats so that floatfmt applies.
    table = [
        [dname] + [
            float(sum(sum(len(w) for w in sent['ws'])
                      for sent in data[lang][dname]))
            for lang in langs
        ]
        for dname in dsetnames
    ]
    print(tabulate(table, headers=['#char'] + langs, floatfmt='.1e'))
    print('')

    stat_headers = ['min', 'max', 'mean', 'std']

    # Words per sentence.
    table = _length_stat_rows(
        data, langs, dsetnames, lambda sent: len(sent['ws']))
    print(tabulate(table, headers=['#word per sent'] + stat_headers))
    print('')

    # Characters per sentence, counting the joining spaces.
    table = _length_stat_rows(
        data, langs, dsetnames, lambda sent: len(' '.join(sent['ws'])))
    print(tabulate(table, headers=['#char per sent'] + stat_headers))
    print('')

    # Vocabulary size per split.
    table = [
        [dname] + [len(get_vocab(data[lang][dname])) for lang in langs]
        for dname in dsetnames
    ]
    print(tabulate(table, headers=['size(vocab)'] + langs))
    print('')

    # Share of dev/tst words that never occur in the training vocabulary:
    # over unique words, over occurrences tagged other than 'O', and over
    # all occurrences.  Percentages fall back to 0.0 for an empty
    # denominator, matching the zero fallback of the per-sentence tables.
    table = []
    for lang, dname in product(langs, ('dev', 'tst')):
        dset = data[lang][dname]
        vdst = get_vocab(dset)
        vsrc = get_vocab(data[lang]['trn'])
        vdiff = vdst.difference(vsrc)
        uperc = _safe_percent(len(vdiff), len(vdst))

        tagged = Counter(
            w for sent in dset
            for w, t in zip(sent['ws'], sent['ts']) if t != 'O')
        pperc = _safe_percent(
            sum(tagged[w] for w in vdiff), sum(tagged.values()))

        every = Counter(w for sent in dset for w in sent['ws'])
        cperc = _safe_percent(
            sum(every[w] for w in vdiff), sum(every.values()))

        table.append([lang + '-' + dname, uperc, pperc, cperc])
    print(tabulate(table, headers=['unk', 'unique', 'phrase', 'corpus'],
                   floatfmt='.2f'))

    # conlleval scores of gold tags converted with encoding.any2io.
    table = []
    for lang, dname in product(langs, ('dev', 'tst')):
        ts_gold = [sent['ts'] for sent in data[lang][dname]]
        ts_pred = [encoding.any2io(tags) for tags in ts_gold]
        scores, _ = conlleval(ts_gold, ts_pred)
        table.append([lang + '-' + dname] + [str(s) for s in scores])
    print(tabulate(table, headers=['io-ideal', 'wacc', 'pre', 'rec', 'f1']))
    print('')