Ejemplo n.º 1
0
def _collect_replies(ct, answers, qu, tag):
    """Build ``F.Reply`` objects for answers found at their recorded offset.

    Args:
        ct: Normalized context text the answers refer to.
        answers: Iterable of dicts with ``'text'`` and ``'answer_start'`` keys.
        qu: Question id, used as the prefix of each reply uid.
        tag: Letter inserted in the uid (``'r'`` or ``'p'``), giving
            ``f'{qu}-{tag}{i}'`` where ``i`` is the answer's position in
            ``answers`` (skipped answers still consume an index).

    Returns:
        List of ``F.Reply``; answers whose normalized text does not occur in
        ``ct`` exactly at ``answer_start`` are reported on stdout and skipped.
    """
    replies = []
    for i, answer in enumerate(answers):
        text = utils.normalize(answer['text'])
        start = answer['answer_start']
        # The recorded offset must point exactly at the answer text.
        if ct.find(text, start) == start:
            span = F.Span(start, start + len(text))
            replies.append(F.Reply(text, span, qu + f'-{tag}{i}'))
        else:
            print('Mismatched', ct[:20], text[:20])
    return replies


def reader(ps, kind):
    """Yield one ``F.Topic`` per entry of every ``.json.xz`` file of a split.

    Files are looked up as ``<ps.dir_data>/<ps.dset>/<name>.json.xz`` for each
    ``name`` in ``registry[kind]``. Each file is expected to hold a JSON object
    with a ``'data'`` list whose items carry ``'title'`` and ``'paragraphs'``;
    paragraphs carry ``'context'`` and ``'qas'``; questions carry ``'id'``,
    ``'question'`` and optionally ``'answers'``, ``'plausible_answers'`` and
    ``'is_impossible'`` (this looks like the SQuAD 2.0 layout; confirm against
    the dataset actually used).

    Args:
        ps: Parameter object providing ``dir_data`` and ``dset``.
        kind: Key into ``registry`` selecting the file names to read.

    Yields:
        ``F.Topic(title, contexts)`` with all text passed through
        ``utils.normalize``.
    """
    # The directory gets its own name that no inner loop rebinds: it is needed
    # again for every file, so shadowing it breaks all files after the first.
    root = pth.Path(ps.dir_data) / ps.dset
    for name in registry[kind]:
        # json.load consumes the whole stream, so the file can be closed
        # before any topic is handed to the (possibly slow) consumer.
        with lzma.open(root / (name + '.json.xz'), mode='rt') as f:
            entries = json.load(f)['data']
        for data in entries:
            contexts = []
            for para in data['paragraphs']:
                ct = utils.normalize(para['context'])
                queries = []
                for q in para['qas']:
                    qu = q['id']
                    rs = _collect_replies(ct, q.get('answers', ()), qu, 'r')
                    plausible = _collect_replies(
                        ct, q.get('plausible_answers', ()), qu, 'p')
                    qt = utils.normalize(q['question'])
                    qv = q.get('is_impossible', False)
                    queries.append(F.Query(qt, qv, qu, rs, plausible))
                contexts.append(F.Context(ct, queries))
            tt = utils.normalize(data['title'])
            yield F.Topic(tt, contexts)
Ejemplo n.º 2
0
def test_encoders():
    """Check that every encoder reproduces its input after encode + decode.

    The char and word encoders are first checked on a short punctuation-heavy
    string. All four encoders (char, word, BERT, GPT-2) are then checked on
    200 consecutive 100-word chunks read from ``.data/text8/text8.zip``.
    The vocabulary sizes are printed at the end.
    """

    def check_roundtrip(enc, text):
        # Each encoder yields triples; only the first two fields feed decode.
        tokens, offsets, _ = zip(*enc(text))
        decoded = enc.decode(tokens, offsets)
        assert decoded == text

    sample = "sf!fg dfg'sdf?dfg xcxb'sdfg!sdg 324sdf.sdfa"
    char_enc = encoder.CharE(ps)
    check_roundtrip(char_enc, sample)
    word_enc = encoder.WordE(ps)
    check_roundtrip(word_enc, sample)
    bert_enc = encoder.BertE(ps)
    gpt2_enc = encoder.Gpt2E(ps)
    all_encoders = (char_enc, word_enc, bert_enc, gpt2_enc)
    with zipfile.ZipFile('.data/text8/text8.zip') as z, z.open('text8') as f:
        corpus = utils.normalize(f.read().decode().strip())
        # Normalization is applied a second time before splitting into words.
        words = utils.normalize(corpus).split()
        for start in range(0, 200 * 100, 100):
            chunk = ' '.join(words[start:start + 100])
            for enc in all_encoders:
                check_roundtrip(enc, chunk)
    print(*(len(enc.vocab) for enc in all_encoders))
Ejemplo n.º 3
0
def reader(ps, kind):
    """Yield fixed-length word windows from one split of the enwik8 archive.

    The archive ``<ps.dir_data>/<ps.dset>/enwik8.zip`` is read completely,
    decoded, stripped, normalized with ``utils.normalize`` and split on
    whitespace. The last ``2 * n`` words are reserved for validation and test
    (``n`` words each, validation first), where ``n`` is
    ``ps.test_train_split`` percent of the corpus (10 when that setting is
    falsy). Everything before them is the training split.

    Args:
        ps: Parameter object providing ``dir_data``, ``dset``,
            ``test_train_split`` and ``len_words``.
        kind: ``'train'``, ``'valid'`` or ``'test'``. Any other value leaves
            the word list unsplit, so the whole corpus is emitted.

    Yields:
        ``F.Context(words, uid=...)`` for each consecutive, non-overlapping
        window of ``ps.len_words`` words; a trailing partial window is
        dropped. The uid is the zero-padded 9-digit window index followed by
        ``'0'``.
    """
    assert not ps.dset or ps.dset == 'enwik8'
    p = pth.Path(ps.dir_data) / ps.dset
    # The whole member is read into memory here, so the archive can be closed
    # before the generator starts handing out contexts.
    with zipfile.ZipFile(p / 'enwik8.zip') as z:
        with z.open('enwik8') as f:
            ws = utils.normalize(f.read().decode().strip()).split()
    split = ps.test_train_split or 10
    total = len(ws)
    n = total * split // 100
    # Explicit non-negative boundaries: with negative indices, n == 0 would
    # turn ws[:-0] into an empty training split and ws[-0:] into a test split
    # containing the whole corpus.
    valid_start = max(total - 2 * n, 0)
    test_start = max(total - n, 0)
    if kind == 'train':
        ws = ws[:valid_start]
    elif kind == 'valid':
        ws = ws[valid_start:test_start]
    elif kind == 'test':
        ws = ws[test_start:]
    wl = ps.len_words
    for i in range(len(ws) // wl):
        cu = '{:0>9d}0'.format(i)
        yield F.Context(ws[i * wl:(i + 1) * wl], uid=cu)
Ejemplo n.º 4
0
def reader(ps, kind):
    """Yield one single-context ``F.Topic`` per data row of a split's CSV files.

    Files are looked up as ``<ps.dir_data>/<ps.dset>/<name>.csv.xz`` for each
    ``name`` in ``registry[kind]``. The first row of every file is skipped,
    and each remaining row is passed through ``utils.normalize`` before its
    columns are read.

    For ``kind == 'train'`` column 0 is the id, column 1 the title, columns
    2-5 are joined into the context and column 6 becomes a single query
    flagged ``True``. For any other kind there is no title: columns 1-4 form
    the context, columns 5 and 6 become two queries, and the integer in the
    last column (1 or 2) decides which of them is flagged ``True``.

    Args:
        ps: Parameter object providing ``dir_data`` and ``dset``.
        kind: Key into ``registry``; also selects the row layout.

    Yields:
        ``F.Topic(title, [F.Context(context, queries)])``.
    """
    base = pth.Path(ps.dir_data) / ps.dset
    for name in registry[kind]:
        with lzma.open(base / (name + '.csv.xz'), mode='rt') as f:
            for row_no, row in enumerate(csv.reader(f)):
                if not row_no:
                    # Row 0 is skipped (presumably the header).
                    continue
                row = utils.normalize(row)
                if kind == 'train':
                    title = row[1].strip()
                    context = ' '.join([cell.strip() for cell in row[2:6]])
                    queries = [F.Query(row[6].strip(), True, row[0].strip())]
                else:
                    title = ''
                    context = ' '.join([cell.strip() for cell in row[1:5]])
                    uid = row[0].strip()
                    correct = int(row[-1])
                    # Candidate k (0 or 1) sits in column 5 + k and is the
                    # right one when the last column equals k + 1.
                    queries = [
                        F.Query(row[5 + k].strip(), correct == k + 1,
                                uid + f'-r{k}')
                        for k in range(2)
                    ]
                yield F.Topic(title, [F.Context(context, queries)])