Beispiel #1
0
    def report(self, dset, pred):
        """Score predictions for ``dset`` and log the CoNLL evaluation text.

        Every raw prediction is first run through ``self.tdecoder.decode``
        together with its sentence.  The error rate is the fraction of
        positions at which the decoded labels differ from the ``'y'`` labels
        encoded with ``self.feat.yenc``.  Precision, recall and F1 come from
        ``conlleval``, which compares the gold ``'ts'`` sequences with the
        sequences rebuilt by ``self.tfunc`` from the decoded labels.

        Args:
            dset: sequence of sentence dicts (iterated several times); the
                keys ``'y'``, ``'ts'`` and ``'wiseq'`` are read.
            pred: one raw prediction per sentence, aligned with ``dset``.

        Returns:
            The tuple ``(yerr, pre, recall, f1)``.  The first score returned
            by ``conlleval`` is not part of the result.
        """
        decoded = [
            self.tdecoder.decode(sentence, raw)
            for sentence, raw in zip(dset, pred)
        ]

        # Flatten gold and predicted labels so they can be compared
        # position by position.
        gold_labels = []
        for sentence in dset:
            gold_labels.extend(sentence['y'])
        y_true = self.feat.yenc.transform(gold_labels)
        y_pred = list(chain.from_iterable(decoded))
        mismatches = np.sum(y_true != y_pred)
        yerr = mismatches / float(len(y_true))

        gold_tags = [sentence['ts'] for sentence in dset]
        predicted_tags = []
        for sentence, labels in zip(dset, decoded):
            # Map encoded labels back through the label encoder, then let
            # self.tfunc combine them with the sentence's 'wiseq' entry.
            decoded_labels = self.feat.yenc.inverse_transform(labels)
            predicted_tags.append(
                self.tfunc(sentence['wiseq'], decoded_labels))

        scores, conll_print = conlleval(gold_tags, predicted_tags)
        _, pre, recall, f1 = scores

        # The report is framed by empty debug records, as before.
        for message in ('', conll_print, ''):
            logging.debug(message)
        return yerr, pre, recall, f1
Beispiel #2
0
    def report(self, dset, pred, epoch, index):
        """Score predictions for ``dset`` and dump gold/predicted tag pairs.

        Every raw prediction is first run through ``self.tdecoder.decode``
        together with its sentence.  The error rate is the fraction of
        positions at which the decoded labels differ from the ``'y'`` labels
        encoded with ``self.feat.yenc``.  Precision, recall and F1 come from
        ``conlleval`` on the gold ``'ts'`` sequences versus the sequences
        rebuilt by ``self.tfunc``.

        As a side effect the tag pairs are written to the file
        ``'tmp/' + index + str(epoch)``: one ``x x x <gold> <pred>`` line per
        token and an empty line after each sentence.  The ``tmp`` directory
        is not created here, so it must already exist.

        Args:
            dset: sequence of sentence dicts (iterated several times); the
                keys ``'y'``, ``'ts'`` and ``'wiseq'`` are read.
            pred: one raw prediction per sentence, aligned with ``dset``.
            epoch: value appended (via ``str``) to the dump file name.
            index: string prefix of the dump file name; it is concatenated
                directly, so it must be a string.

        Returns:
            The tuple ``(yerr, pre, recall, f1)``.
        """
        pred = [self.tdecoder.decode(s, p) for s, p in zip(dset, pred)]
        y_true = self.feat.yenc.transform(
            [t for sent in dset for t in sent['y']])
        y_pred = list(chain.from_iterable(pred))
        # Single-argument print(...) prints the same text on Python 2 and 3.
        print(len(y_true))
        print(len(y_pred))
        yerr = np.sum(y_true != y_pred) / float(len(y_true))

        lts = [sent['ts'] for sent in dset]
        lts_pred = []
        for sent, ipred in zip(dset, pred):
            tseq_pred = self.feat.yenc.inverse_transform(ipred)
            ts_pred = self.tfunc(sent['wiseq'], tseq_pred)
            lts_pred.append(ts_pred)

        # The context manager closes the dump file even if a write raises;
        # the previous open()/close() pair leaked the handle in that case.
        with open('tmp/' + index + str(epoch), 'w') as f:
            for sent1, sent2 in zip(lts, lts_pred):
                for word1, word2 in zip(sent1, sent2):
                    f.write('x x x %s %s\n' % (word1, word2))
                f.write('\n')

        (wacc, pre, recall, f1), conll_print = conlleval(lts, lts_pred)
        logging.debug('')
        logging.debug(conll_print)
        logging.debug('')
        return yerr, pre, recall, f1
Beispiel #3
0
def io_ideal(dev, tst):
    """Print the scores reached when gold tags pass through ``any2io``.

    For each of the two sets, the gold ``'ts'`` sequences are compared by
    ``conlleval`` against the same sequences converted with
    ``encoding.any2io``.  The first element of ``conlleval``'s result is
    printed as one tab-separated line prefixed with the set name (``dev`` or
    ``tst``); the second element is ignored.

    Args:
        dev: sequence of sentence dicts providing a ``'ts'`` entry.
        tst: sequence of sentence dicts providing a ``'ts'`` entry.
    """
    from score import conlleval

    # Single-argument print(...) prints the same text on Python 2 and 3.
    print('io tagging ideal scores')
    for dset, dset_str in zip((dev, tst), ('dev', 'tst')):
        ts_gold = [sent['ts'] for sent in dset]
        ts_pred = [encoding.any2io(sent['ts']) for sent in dset]
        scores, _ = conlleval(ts_gold, ts_pred)
        # A list comprehension replaces ``list + map(...)``, which only
        # works while map() returns a list (Python 2).
        print('\t'.join([dset_str] + [str(value) for value in scores]))
Beispiel #4
0
def main():
    """Print corpus statistics tables for every language in ``langs``.

    The splits returned by ``get_sents(lang)`` are paired positionally with
    the names ``trn``, ``dev`` and ``tst``.  The following is then printed,
    in order:

    * the sorted set of ``'ts'`` tags seen in each training split,
    * the number of training sentences longer than 500 characters,
    * sentence, token and character counts per split,
    * min/max/mean/std of words and of characters per sentence,
    * vocabulary sizes as reported by ``get_vocab``,
    * the percentage of dev/tst words missing from the training vocabulary,
      measured over unique words, over non-``'O'`` tokens and over all
      tokens,
    * the ``conlleval`` scores of gold tags against their
      ``encoding.any2io`` conversion.

    Percentages whose denominator is zero (an empty split, or a split
    without non-``'O'`` tokens) are reported as ``0.0`` instead of raising
    ``ZeroDivisionError``.
    """
    langs = ['eng', 'deu', 'spa', 'ned', 'tr', 'cze', 'ger', 'arb0', 'ita']
    # langs = ['eng', 'deu']
    dsetnames = ['trn', 'dev', 'tst']

    data = {lang: dict(zip(dsetnames, get_sents(lang))) for lang in langs}

    def percent(part, whole):
        """Return ``part`` as a percentage of ``whole``; 0.0 if ``whole`` is 0."""
        return (part / float(whole) * 100) if whole else 0.0

    def per_sentence_rows(measure):
        """Build min/max/mean/std rows of ``measure(sent)`` per lang and split."""
        rows = []
        for lang in langs:
            for dname in dsetnames:
                sizes = [measure(sent) for sent in data[lang][dname]]
                # The numpy reductions are skipped for an empty split.
                rows.append(['{}-{}'.format(lang, dname)] + [
                    int(f(sizes)) if sizes else 0
                    for f in (np.min, np.max, np.mean, np.std)
                ])
            # Visual separator row between languages.
            rows.append(['...'] * 5)
        return rows

    # Single-argument print(...) prints the same text on Python 2 and 3, so
    # multi-item prints are formatted into one string first and blank lines
    # are printed as print('').
    for lang in langs:
        tags = sorted(
            set(t for sent in data[lang]['trn'] for t in sent['ts']))
        print('{} {}'.format(lang, tags))
    print('')

    # Training sentences whose space-joined text exceeds 500 characters.
    table = [
        [lang,
         sum(1 for sent in data[lang]['trn']
             if len(' '.join(sent['ws'])) > 500)]
        for lang in langs
    ]
    print(tabulate(table))

    table = [
        [dname] + [len(data[lang][dname]) for lang in langs]
        for dname in dsetnames
    ]
    print(tabulate(table, headers=['#sent'] + langs, tablefmt='latex'))
    print('')

    table = [
        [dname] +
        [sum(len(sent['ws']) for sent in data[lang][dname]) for lang in langs]
        for dname in dsetnames
    ]
    print(tabulate(table, headers=['#token'] + langs))
    print('')

    # Character counts exclude the separating spaces; kept as floats so that
    # floatfmt applies.
    table = [
        [dname] + [
            float(sum(len(w) for sent in data[lang][dname]
                      for w in sent['ws']))
            for lang in langs
        ]
        for dname in dsetnames
    ]
    print(tabulate(table, headers=['#char'] + langs, floatfmt='.1e'))
    print('')

    stat_names = ['min', 'max', 'mean', 'std']

    table = per_sentence_rows(lambda sent: len(sent['ws']))
    print(tabulate(table, headers=['#word per sent'] + stat_names))
    print('')

    # Here the character count includes the spaces joining the words.
    table = per_sentence_rows(lambda sent: len(' '.join(sent['ws'])))
    print(tabulate(table, headers=['#char per sent'] + stat_names))
    print('')

    table = [
        [dname] + [len(get_vocab(data[lang][dname])) for lang in langs]
        for dname in dsetnames
    ]
    print(tabulate(table, headers=['size(vocab)'] + langs))
    print('')

    table = []
    for lang in langs:
        # The training vocabulary is the same for dev and tst: compute once.
        vsrc = get_vocab(data[lang]['trn'])
        for dname in ('dev', 'tst'):
            vdst = get_vocab(data[lang][dname])
            vdiff = vdst.difference(vsrc)
            uperc = percent(len(vdiff), len(vdst))

            # Share of non-'O' tokens whose word is unseen in training.
            tagged = Counter(w for sent in data[lang][dname]
                             for w, t in zip(sent['ws'], sent['ts'])
                             if t != 'O')
            pperc = percent(sum(tagged[w] for w in vdiff),
                            sum(tagged.values()))

            # Share of all tokens whose word is unseen in training.
            every = Counter(w for sent in data[lang][dname]
                            for w in sent['ws'])
            cperc = percent(sum(every[w] for w in vdiff),
                            sum(every.values()))

            table.append([lang + '-' + dname, uperc, pperc, cperc])
    print(tabulate(table,
                   headers=['unk', 'unique', 'phrase', 'corpus'],
                   floatfmt='.2f'))

    table = []
    for lang, dname in product(langs, ('dev', 'tst')):
        dset = data[lang][dname]
        ts_gold = [sent['ts'] for sent in dset]
        ts_pred = [encoding.any2io(sent['ts']) for sent in dset]
        scores, _ = conlleval(ts_gold, ts_pred)
        table.append([lang + '-' + dname] + [str(value) for value in scores])
    print(tabulate(table, headers=['io-ideal', 'wacc', 'pre', 'rec', 'f1']))
    print('')