Example #1
0
def main():
    from utils import get_sents
    from dataset import Dset
    import featchar, rep

    trn, dev, tst = get_sents('toy')
    dset = Dset('toy')

    r = rep.Repstd()

    for sent in trn:
        sent.update({
            'cseq': r.get_cseq(sent),
            'wiseq': r.get_wiseq(sent),
            'tseq': r.get_tseq(sent)
        })
    r.pprint(trn[0])
    print
    r.pprint(trn[1])

    print rep.get_ts_bio(trn[0]['wiseq'], trn[0]['tseq'])

    feat = featchar.Feat('basic')
    feat.fit(dset)

    vdecoder = ViterbiDecoder(trn, feat)
    vdecoder.pprint()
    sent = trn[0]
    vdecoder.decode(sent, randlogprob(sent, feat.NC), debug=True)
    """
Example #2
0
    def __init__(self, model_file):
        """Restore a saved tagger from an np.load-able archive and re-run
        validation on the dev/tst splits.

        The archive must contain 'argsd' (the training-argument dict) and
        'rnn_param_values' (the saved network parameter arrays).
        """
        dat = np.load(model_file)
        args = dat['argsd'].tolist()
        rnn_param_values = dat['rnn_param_values'].tolist()

        trn, dev, tst = get_sents(args['lang'])

        # Resolve the representation class by name, e.g. 'std' -> rep.Repstd.
        repclass = getattr(rep, 'Rep' + args['rep'])
        repobj = repclass()
        for d in (trn, dev, tst):
            for sent in d:
                sent.update({
                    'cseq': repobj.get_cseq(sent),
                    'wiseq': repobj.get_wiseq(sent),
                    'tseq': repobj.get_tseq(sent)
                })

        # Sort every split by character-sequence length before batching.
        trn = sorted(trn, key=lambda sent: len(sent['cseq']))
        dev = sorted(dev, key=lambda sent: len(sent['cseq']))
        tst = sorted(tst, key=lambda sent: len(sent['cseq']))

        self.feat = featchar.Feat(args['feat'])
        self.feat.fit(trn, dev, tst)

        self.vdecoder = decoder.ViterbiDecoder(trn, self.feat)

        batcher = Batcher(args['n_batch'], self.feat)  # batch size 1
        devdat = batcher.get_batches(dev)
        tstdat = batcher.get_batches(tst)

        # NOTE(review): predict() runs here, BEFORE the saved weights are
        # loaded below -- these cached predictions come from the freshly
        # initialized network. Confirm this ordering is intended.
        rdnn = RNN(self.feat.NC, self.feat.NF, args)
        cost, dev_predictions = rdnn.predict(devdat)
        cost, tst_predictions = rdnn.predict(tstdat)

        self.predictions = {}
        self.predictions['dev'] = dev_predictions
        self.predictions['tst'] = tst_predictions

        self.dset = {}
        self.dset['dev'] = dev
        self.dset['tst'] = tst
        self.repobj = repobj

        self.reporter = exper.Reporter(self.feat, rep.get_ts_bio)

        # Load the saved parameter values into the network (only as many
        # arrays as the current architecture exposes).
        print rdnn.l_soft_out.get_params()
        print rdnn.blayers[0][0].get_params()
        params = lasagne.layers.get_all_param_values(rdnn.layers[-1])
        print map(np.shape, params)
        lasagne.layers.set_all_param_values(rdnn.layers[-1],
                                            rnn_param_values[:len(params)])

        # Send INFO-level log records to stderr so Validator output shows.
        logger = logging.getLogger()
        logger.setLevel(logging.INFO)
        shandler = logging.StreamHandler()
        shandler.setLevel(logging.INFO)
        logger.addHandler(shandler)

        validator = Validator(trn, dev, tst, batcher, self.reporter)
        validator.validate(rdnn, args, self.vdecoder)
Example #3
0
def tez_datasets_pos():
    langs = ['eng-pos', 'fin-pos', 'deu-pos', 'spa-pos', 'pos', 'chu']

    dsetnames = ['trn', 'dev', 'tst']

    data = dict((lang,
                 dict((dname, dset)
                      for dname, dset in zip(dsetnames, get_sents(lang))))
                for lang in langs)

    table = []
    for l in langs:
        table.append([l] + map(len, [data[l][dname] for dname in dsetnames]))
    print tabulate(np.array(table).T,
                   headers=['#sent'] + dsetnames,
                   tablefmt='latex')
    print

    table = []
    for dname in dsetnames:
        table.append(
            [dname] +
            [sum(len(sent['ws']) for sent in data[l][dname]) for l in langs])
    print tabulate(table, headers=['#token'] + langs)
    print

    table = []
    for l in langs:
        char_set = set(c for sent in data[l]['trn'] for w in sent['ws']
                       for c in w)
        tag_set = set(t for dname in dsetnames for sent in data[l][dname]
                      for t in encoding.any2io(sent['ts']))
        table.append(['%s' % l, len(char_set), len(tag_set)])
    print tabulate(table,
                   headers=['i/o'] + ['input', 'output'],
                   tablefmt='latex')
    print

    table = []
    # for l, dname in product(langs,('dev','tst')):
    for l in langs:
        dname = 'tst'
        vdst = get_vocab(data[l][dname])
        vsrc = get_vocab(data[l]['trn'])
        vdiff = vdst.difference(vsrc)
        uperc = len(vdiff) / float(len(vdst)) * 100

        cnt = Counter(w for sent in data[l][dname]
                      for w, t in zip(sent['ws'], sent['ts']) if t != 'O')
        pperc = sum(cnt[w] for w in vdiff) / float(sum(cnt.values())) * 100

        cnt = Counter(w for sent in data[l][dname] for w in sent['ws'])
        cperc = sum(cnt[w] for w in vdiff) / float(sum(cnt.values())) * 100

        table.append([l + '-' + dname] + [uperc, pperc, cperc])
    print tabulate(np.array(table).T,
                   headers=['unk', 'unique', 'phrase', 'corpus'],
                   tablefmt='latex',
                   floatfmt='.2f')
Example #4
0
    def __init__(self,
                 lang='eng',
                 level='char',
                 tagging='bio',
                 breaktrn=False,
                 captrn=500,
                 sample=0,
                 charrep='std',
                 sort=True,
                 **kwargs):
        """Load the corpus for `lang`, attach char/word representations,
        and expose trn/dev/tst splits with 'x'/'y' keys chosen by `level`
        ('char' -> cseq/tseq, otherwise words/tags).

        captrn caps training-sentence length (in words); sample > 0 keeps
        only sample*1000 training sentences; sort orders each split by
        input length.
        """
        # NOTE(review): breaktrn is accepted but never used here -- confirm.
        self.level = level
        self.tagging = tagging
        trn, dev, tst = utils.get_sents(lang)

        # Representation class chosen by name, e.g. 'std' -> rep.Repstd.
        repobj = getattr(rep, 'Rep' + charrep)()

        for part in (trn, dev, tst):
            for sent in part:
                sent['cseq'] = repobj.get_cseq(sent)
                sent['wiseq'] = repobj.get_wiseq(sent)
                sent['tseq'] = repobj.get_tseq(sent)
                if level == 'char':
                    sent['x'], sent['y'] = sent['cseq'], sent['tseq']
                else:
                    sent['x'], sent['y'] = sent['ws'], sent['ts']

        if captrn:
            trn = [sent for sent in trn if len(sent['ws']) < captrn]

        if sample > 0:
            trn = utils.sample_sents(trn, sample * 1000)

        if sort:
            by_len = lambda sent: len(sent['x'])
            trn, dev, tst = [sorted(part, key=by_len)
                             for part in (trn, dev, tst)]

        ntrnsent, ndevsent, ntstsent = len(trn), len(dev), len(tst)
        logging.info('# of sents trn, dev, tst: {} {} {}'.format(
            ntrnsent, ndevsent, ntstsent))

        # Per-split input-length statistics.
        for dset, dname in zip((trn, dev, tst), ('trn', 'dev', 'tst')):
            slens = [len(sent['x']) for sent in dset]
            logging.info(
                'input: {}\tmaxlen: {} minlen: {} avglen: {:.2f} stdlen: {:.2f}'
                .format(dname, max(slens), min(slens), np.mean(slens),
                        np.std(slens)))
        self.trn, self.dev, self.tst = trn, dev, tst
Example #5
0
def main():
    """Print corpus statistics for the NER datasets: tagsets, sentence /
    token / char counts, sentence-length stats, vocabulary sizes,
    unknown-word rates, and an IO-encoding performance upper bound."""
    langs = ['eng', 'deu', 'spa', 'ned', 'tr', 'cze', 'ger', 'arb0', 'ita']
    # langs = ['eng', 'deu']
    dsetnames = ['trn', 'dev', 'tst']

    # lang -> dsetname -> list of sentence dicts.
    data = dict((lang,
                 dict((dname, dset)
                      for dname, dset in zip(dsetnames, get_sents(lang))))
                for lang in langs)

    # Tagset of each language's training split.
    for l in langs:
        print l, sorted(set(t for sent in data[l]['trn'] for t in sent['ts']))
    print

    # Count of training sentences longer than 500 chars when space-joined.
    table = []
    for l in langs:
        table.append([
            l,
            sum(1 for sent in data[l]['trn']
                if len(' '.join(sent['ws'])) > 500)
        ])
    print tabulate(table)

    # Sentence counts per split.
    table = []
    for dname in dsetnames:
        table.append([dname] + map(len, [data[l][dname] for l in langs]))
    print tabulate(table, headers=['#sent'] + langs, tablefmt='latex')
    print

    # Token counts per split.
    table = []
    for dname in dsetnames:
        table.append(
            [dname] +
            [sum(len(sent['ws']) for sent in data[l][dname]) for l in langs])
    print tabulate(table, headers=['#token'] + langs)
    print

    # Character counts per split (as floats so tabulate can format '.1e').
    table = []
    for dname in dsetnames:
        table.append([dname] + [
            float(
                sum(
                    len([c for w in sent['ws'] for c in w])
                    for sent in data[l][dname])) for l in langs
        ])
    print tabulate(table, headers=['#char'] + langs, floatfmt='.1e')
    print

    # Words-per-sentence min/max/mean/std for every lang-split pair.
    table = []
    for l in langs:
        # nchar_sents = [sum(1 for w in sent['ws']) for sent in chain(*data[l].values())]
        for dname in dsetnames:
            nchar_sents = [
                sum(1 for w in sent['ws']) for sent in data[l][dname]
            ]
            table.append(['{}-{}'.format(l, dname)] + [
                int(f(nchar_sents)) if len(nchar_sents) else 0
                for f in (np.min, np.max, np.mean, np.std)
            ])
        table.append(['...'] * 5)
    print tabulate(table,
                   headers=['#word per sent'] + ['min', 'max', 'mean', 'std'])
    print

    # Chars-per-sentence stats (words space-joined before counting).
    table = []
    for l in langs:
        # nchar_sents = [sum(1 for c in ' '.join(sent['ws'])) for sent in chain(*data[l].values())]
        for dname in dsetnames:
            nchar_sents = [
                sum(1 for c in ' '.join(sent['ws'])) for sent in data[l][dname]
            ]
            table.append(['{}-{}'.format(l, dname)] + [
                int(f(nchar_sents)) if len(nchar_sents) else 0
                for f in (np.min, np.max, np.mean, np.std)
            ])
        table.append(['...'] * 5)
    print tabulate(table,
                   headers=['#char per sent'] + ['min', 'max', 'mean', 'std'])
    print

    # Vocabulary size per split.
    table = []
    for dname in dsetnames:
        table.append([dname] + [len(get_vocab(data[l][dname])) for l in langs])
    print tabulate(table, headers=['size(vocab)'] + langs)
    print

    # Unknown-word rates vs the training vocabulary:
    #   uperc: % of unique dev/tst types unseen in trn,
    #   pperc: % of non-'O'-tagged tokens that are unseen types,
    #   cperc: % of all tokens that are unseen types.
    table = []
    for l, dname in product(langs, ('dev', 'tst')):
        vdst = get_vocab(data[l][dname])
        vsrc = get_vocab(data[l]['trn'])
        vdiff = vdst.difference(vsrc)
        uperc = len(vdiff) / float(len(vdst)) * 100

        cnt = Counter(w for sent in data[l][dname]
                      for w, t in zip(sent['ws'], sent['ts']) if t != 'O')
        pperc = sum(cnt[w] for w in vdiff) / float(sum(cnt.values())) * 100

        cnt = Counter(w for sent in data[l][dname] for w in sent['ws'])
        cperc = sum(cnt[w] for w in vdiff) / float(sum(cnt.values())) * 100

        table.append([l + '-' + dname] + [uperc, pperc, cperc])
    print tabulate(table,
                   headers=['unk', 'unique', 'phrase', 'corpus'],
                   floatfmt='.2f')

    # Score the gold tags converted to IO encoding against the original
    # gold: the ceiling an IO-encoded tagger can reach under conlleval.
    table = []
    for l, dname in product(langs, ('dev', 'tst')):
        dset = data[l][dname]
        ts_gold = [sent['ts'] for sent in dset]
        ts_pred = [encoding.any2io(sent['ts']) for sent in dset]
        r1, r2 = conlleval(ts_gold, ts_pred)
        table.append([l + '-' + dname] + map(str, r1))
    print tabulate(table, headers=['io-ideal', 'wacc', 'pre', 'rec', 'f1'])
    print
Example #6
0
def paper():
    """Print the LaTeX dataset-statistics tables used in the paper."""
    # langs = ['eng', 'deu', 'spa', 'ned', 'tr', 'cze', 'ger', 'arb', 'ita']
    # langs = ['arb0', 'cze', 'ned', 'eng', 'deu', 'spa', 'tr']
    langs = ['cze-pos', 'eng-pos', 'deu-pos', 'spa-pos', 'pos', 'chu']
    dsetnames = ['trn', 'dev', 'tst']

    # lang -> dsetname -> list of sentence dicts.
    data = dict((lang,
                 dict((dname, dset)
                      for dname, dset in zip(dsetnames, get_sents(lang))))
                for lang in langs)

    # Sentence counts, transposed so each language is a column.
    table = []
    for l in langs:
        table.append([l] + map(len, [data[l][dname] for dname in dsetnames]))
    print tabulate(np.array(table).T,
                   headers=['#sent'] + dsetnames,
                   tablefmt='latex')
    print

    # Mean/std sentence length in chars, pooled over all three splits.
    table = []
    for l in langs:
        # nchar_sents = [sum(1 for c in ' '.join(sent['ws'])) for sent in chain(*data[l].values())]
        # for dname in dsetnames:
        nchar_sents = [
            sum(1 for c in ' '.join(sent['ws'])) for dname in dsetnames
            for sent in data[l][dname]
        ]
        # table.append(['%s'%l]+[int(f(nchar_sents)) for f in (np.min,np.max,np.mean,np.std)])
        table.append(['%s' % l] +
                     [int(f(nchar_sents)) for f in (np.mean, np.std)])
    print tabulate(table,
                   headers=['#char per sent'] + ['mean', 'std'],
                   tablefmt='latex')
    print

    # Alphabet (chars seen in trn+dev) and IO tagset sizes.
    table = []
    for l in langs:
        # char_set = set(c for dname in dsetnames for sent in data[l][dname] for c in ''.join(sent['ws']))
        char_set = set(c for dname in ('trn', 'dev') for sent in data[l][dname]
                       for w in sent['ws'] for c in w)
        # char_set = set(c for sent in data[l]['trn'] for w in sent['ws'] for c in w)
        tag_set = set(t for dname in dsetnames for sent in data[l][dname]
                      for t in encoding.any2io(sent['ts']))
        # table.append(['%s'%l]+[int(f(nchar_sents)) for f in (np.min,np.max,np.mean,np.std)])
        # NOTE(review): the +1 presumably reserves a slot for one extra
        # symbol (unknown/padding) beyond the observed alphabet -- confirm.
        table.append(['%s' % l, len(char_set) + 1, len(tag_set)])
    print tabulate(table,
                   headers=['i/o'] + ['input', 'output'],
                   tablefmt='latex')
    print

    # Same table but counting only chars observed in the training split,
    # without the extra symbol.
    table = []
    for l in langs:
        # char_set = set(c for dname in dsetnames for sent in data[l][dname] for c in ''.join(sent['ws']))
        # char_set = set(c for dname in dsetnames for sent in data[l][dname] for w in sent['ws'] for c in w)
        char_set = set(c for sent in data[l]['trn'] for w in sent['ws']
                       for c in w)
        tag_set = set(t for dname in dsetnames for sent in data[l][dname]
                      for t in encoding.any2io(sent['ts']))
        # table.append(['%s'%l]+[int(f(nchar_sents)) for f in (np.min,np.max,np.mean,np.std)])
        table.append(['%s' % l, len(char_set), len(tag_set)])
    print tabulate(table,
                   headers=['i/o'] + ['input', 'output'],
                   tablefmt='latex')
    print

    # Unknown-word rates on tst vs the training vocabulary:
    #   uperc: % of unique types unseen in trn,
    #   pperc: % of non-'O'-tagged tokens that are unseen types,
    #   cperc: % of all tokens that are unseen types.
    table = []
    # for l, dname in product(langs,('dev','tst')):
    for l in langs:
        dname = 'tst'
        vdst = get_vocab(data[l][dname])
        vsrc = get_vocab(data[l]['trn'])
        vdiff = vdst.difference(vsrc)
        uperc = len(vdiff) / float(len(vdst)) * 100

        cnt = Counter(w for sent in data[l][dname]
                      for w, t in zip(sent['ws'], sent['ts']) if t != 'O')
        pperc = sum(cnt[w] for w in vdiff) / float(sum(cnt.values())) * 100

        cnt = Counter(w for sent in data[l][dname] for w in sent['ws'])
        cperc = sum(cnt[w] for w in vdiff) / float(sum(cnt.values())) * 100

        table.append([l + '-' + dname] + [uperc, pperc, cperc])
    print tabulate(np.array(table).T,
                   headers=['unk', 'unique', 'phrase', 'corpus'],
                   tablefmt='latex',
                   floatfmt='.2f')
Example #7
0
    #                     )
    # parser.add_argument("--name",
    #                     default=None,
    #                     type=str,
    #                     required=True,
    #                     )
    # parser.add_argument("--n_samples",
    #                     default=None,
    #                     type=int)

    # args = parser.parse_args()

    # For every dataset and each sentiment polarity: read the raw review
    # file, split it into sentences, and write a 90/10 train/dev split
    # under a sibling "split_data" directory.
    for data in datasets:
        myprint(data)
        for key in ["positive", "negative"]:
            myprint(key)
            reviews = utils.read_file(data[key]["data_filepath"])
            parent_dir = os.path.dirname(data[key]["data_filepath"])  # NOTE(review): unused below
            sents = utils.get_sents(reviews)
            train_size = int(0.9 * len(sents))  # first 90% -> train
            split_data_dir = os.path.join(
                os.path.dirname(data[key]["data_filepath"]), "split_data")
            Path(split_data_dir).mkdir(parents=True, exist_ok=True)
            sents_filename_train = os.path.join(split_data_dir,
                                                key + "_reviews_train_sents")
            sents_filename_dev = os.path.join(split_data_dir,
                                              key + "_reviews_dev_sents")
            # sents_filename = data[key]["data_filepath"]+"_sents"
            utils.write_file(sents[:train_size], sents_filename_train)
            utils.write_file(sents[train_size:], sents_filename_dev)
Example #8
0
        # Write one "word<TAB>tag" line per token, utf-8 encoded, with a
        # blank line after each sentence (CoNLL-style layout).
        for sent in dset_part:
            for w, t in zip(sent['ws'], sent['ts']):
                src.write(('%s\t%s\n' % (w, t)).encode('utf-8'))
            src.write('\n')


def get_sample(l, k):
    """Return k distinct elements of l, chosen uniformly at random.

    Samples indices rather than elements, so l only needs len() and
    indexing. Uses `range` instead of the Python-2-only `xrange`:
    random.sample draws the same index stream for any sequence of a given
    length, so results are unchanged on Python 2 while the function now
    also runs on Python 3.

    Raises ValueError (from random.sample) if k > len(l).
    """
    rand_indices = random.sample(range(len(l)), k)
    return [l[i] for i in rand_indices]


if __name__ == '__main__':
    # Build a down-sampled copy of a dataset: draw args['nums'][i] * 1000
    # sentences from each of trn/dev/tst, then drop dev/tst sentences whose
    # tags never occur in the sampled training set.
    random.seed(7)  # fixed seed so the sample is reproducible
    args = get_args()
    print args
    trn, dev, tst = get_sents(args['dset'])
    dset_parts = (trn, dev, tst)
    print map(len, (trn, dev, tst))
    strn, sdev, stst = map(get_sample, dset_parts,
                           map(lambda x: x * 1000, args['nums']))
    print map(len, (strn, sdev, stst))

    # filter out sents in sdev & stst if they contain a tag that is not in strn
    trn_tags = set(t for sent in strn for t in sent['ts'])
    sdev = filter(lambda sent: all(t in trn_tags for t in sent['ts']), sdev)
    stst = filter(lambda sent: all(t in trn_tags for t in sent['ts']), stst)
    print map(len, (strn, sdev, stst))

    dir_name = 'data/%s-sample' % args['dset']
    os.mkdir(dir_name)  # raises OSError if the directory already exists
Example #9
0
        if j in 'BS':  # 'B' or 'S' tag: start a new word
            result.append(i)
        else:  # any other tag: append the char to the current word
            result[-1] += i
    return result


if __name__ == '__main__':
    # Choose the demo mode: segment one hard-coded sentence ('TestOne')
    # or a file of sentences ('TestBatch').
    method = 'TestOne'

    if method == 'TestOne':
        s = '造成交通事故后逃逸被吊销机动车驾驶证的'
        data_name = './data/train.utf8'
        model_name = 'tmp_crflstm.model.h5'
        modelType = 'lstmcrf'
        sentences, words = get_sents(datasets=data_name)
        vocab_size = len(words)
        max_len = 75
        id2char = {i + 1: j for i, j in enumerate(words)}  # id -> character map (ids start at 1)
        char2id = {j: i for i, j in id2char.items()}  # character -> id map (inverse of id2char)
        res = cutTest(s,
                      filename=model_name,
                      batch_size=1,
                      modelType=modelType)
        print("------------------分词结果为:------------------")
        print(res)

    if method == 'TestBatch':
        # Put the sentences to test in data.test.txt, then run the code below.
        with open('data.test.txt', 'r') as fr:
            lines = fr.readlines()
Example #10
0
                                                 for i, wi in enumerate(wiseq)
                                                 if wi > -1), lambda x: x[0])
    ]
    ts = []
    # Emit one word-level tag per index in windxs (presumably the char
    # index of each word start -- confirm with the comprehension above):
    # 'o' -> 'O'; otherwise B-/I- on the upper-cased tag type, with I-
    # only when the char tag is unchanged from the previous char.
    for i in windxs:
        if tseq[i] == 'o':
            ts.append('O')
        else:
            ttype = tseq[i].split('-')[1]
            if i == 0:
                ts.append('B-{}'.format(ttype.upper()))
            else:
                if tseq[i - 1] == tseq[i]:
                    ts.append('I-{}'.format(ttype.upper()))
                else:
                    ts.append('B-{}'.format(ttype.upper()))
    return ts


def is_consec(sent):
    """Return True if sent['ts'] contains two adjacent, same-type
    entities: a tag starting 'I-' immediately followed by one starting
    'B-' with the same type suffix."""
    tags = sent['ts']
    for prev, cur in zip(tags, tags[1:]):
        if prev.startswith('I-') and cur.startswith('B-'):
            if prev.split('-')[1] == cur.split('-')[1]:
                return True
    return False


if __name__ == '__main__':
    # Print the frequency of every representation character over the
    # English training split.
    trn, dev, tst = utils.get_sents('eng')
    rep = Repstd()
    print Counter(c for sent in trn for c in rep.get_cseq(sent))