Exemple #1
0
 def load(self, v):
     """Fill the word/index tables from ``v`` and mark the vocab as fixed.

     ``v`` is either an iterable of strings or a ``pth.Path``.  A path is
     first read through ``R.load`` using a single variable-length string
     feature named ``'toks'``; whatever that call returns is then iterated
     like any other word sequence (presumably it yields the stored
     tokens -- confirm against ``R.load``).

     Every item is stripped of surrounding whitespace and mapped to its
     position in the iteration order.  The assertion fails if a stripped
     word is already present in ``self.by_word``.
     """
     words = v
     if isinstance(words, pth.Path):
         spec = {'toks': tf.VarLenFeature(tf.string)}
         words = R.load(words, spec)
     position = 0
     for raw in words:
         word = raw.strip()
         assert word not in self.by_word
         self.by_word[word] = position
         self.by_idx.append(word)
         position += 1
     self.fixed = True
Exemple #2
0
from qnarre.feeds.prep import records as R


def dset(ps, kind):
    """Return ``(dataset, feats)`` for the MNIST split named by ``kind``.

    The records live under ``<ps.dir_data>/<ps.dset>/<kind>/<ps.dset>``.
    When the ``<kind>`` directory does not exist yet, the samples from
    ``reader(ps, kind)`` are materialised into a tuple and written out via
    ``R.dump`` with ``recorder`` before the dataset is opened.

    The assertion requires ``ps.dset`` to start with ``'mnist'``.
    """
    assert ps.dset.startswith('mnist')
    name = ps.dset
    root = pth.Path(ps.dir_data) / name / kind
    target = root / name
    if not root.exists():
        # Materialise once so the deferred recorder call sees a stable tuple.
        samples = tuple(reader(ps, kind))
        R.dump(target, lambda: recorder(samples))
    return R.dataset(target), feats


# Parsing spec for the MNIST records; the keys mirror the ones emitted by
# `recorder`: a fixed 28*28 int64 image, a variable-length float32 image,
# a scalar int64 label and a scalar string label.
feats = dict(
    int_img=tf.FixedLenFeature([28 * 28], tf.int64),
    flt_img=tf.VarLenFeature(tf.float32),
    int_lbl=tf.FixedLenFeature([], tf.int64),
    str_lbl=tf.FixedLenFeature([], tf.string),
)


def recorder(vals):
    """Yield one ``R.example`` per 4-tuple in ``vals``.

    Each tuple is unpacked, in order, into the values stored under the
    ``'int_img'``, ``'flt_img'``, ``'int_lbl'`` and ``'str_lbl'`` keys.
    """
    for int_img, flt_img, int_lbl, str_lbl in vals:
        features = {
            'int_img': R.ints_feat(int_img),
            'flt_img': R.floats_feat(flt_img),
            'int_lbl': R.one_int_feat(int_lbl),
            'str_lbl': R.bytes_feat(str_lbl),
        }
        yield R.example(features)

Exemple #3
0
    pv = p / ps.vocab_path
    p = p / kind
    if not p.exists():
        tokenizer = encoder.tokenizer_for(ps)
        ts = F.Topics(tokenizer(reader(ps, kind)))
        for n in registry['all']:
            R.dump(p / n, lambda: registry[n](ts))
        if kind == 'train' and not pv.exists():
            R.dump(pv, lambda: [tokenizer.vocab.record()])
    ds = R.dataset(p / ps.dset_subset)
    return ds, feats[ps.dset_subset]


feats = {
    'query_valid': {
        'title': tf.VarLenFeature(tf.int64),
        'context': tf.VarLenFeature(tf.int64),
        'query': tf.VarLenFeature(tf.int64),
        'valid': tf.FixedLenFeature([], tf.int64),
        'uid': tf.FixedLenFeature([], tf.string),
    },
    'reply_spans': {
        'title': tf.VarLenFeature(tf.int64),
        'context': tf.VarLenFeature(tf.int64),
        'query': tf.VarLenFeature(tf.int64),
        'reply': tf.VarLenFeature(tf.int64),
        'begin': tf.FixedLenFeature([], tf.int64),
        'end': tf.FixedLenFeature([], tf.int64),
        'uid': tf.FixedLenFeature([], tf.string),
    },
    'possibles': {
Exemple #4
0
    assert ps.dset == 'enwik8'
    p = pth.Path(ps.dir_data) / ps.dset
    pv = p / ps.vocab_path
    p = p / kind
    if not p.exists():
        tokenizer = encoder.tokenizer_for(ps)
        tp = F.Topic(ps.dset, tokenizer(reader(ps, kind)))
        R.dump(p / ps.dset, lambda: recorder(tp))
        if kind == 'train' and not pv.exists():
            R.dump(pv, lambda: [tokenizer.vocab.record()])
    ds = R.dataset(p / ps.dset)
    return ds, feats


# Parsing spec for the records emitted by `recorder`: a variable-length
# int64 'context' plus a scalar string 'uid'.
feats = dict(
    context=tf.VarLenFeature(tf.int64),
    uid=tf.FixedLenFeature([], tf.string),
)


def recorder(topic):
    """Yield one ``R.example`` for every context from ``topic.contexts()``.

    ``topic.contexts()`` is iterated as pairs; the first element of each
    pair is ignored.  The context's ``toks`` are copied into a list for the
    ``'context'`` feature and its ``uid`` is stored under ``'uid'``.
    """
    for _, ctx in topic.contexts():
        features = {
            'context': R.ints_feat(list(ctx.toks)),
            'uid': R.bytes_feat(ctx.uid),
        }
        yield R.example(features)


def reader(ps, kind):
    assert not ps.dset or ps.dset == 'enwik8'
    p = pth.Path(ps.dir_data) / ps.dset
Exemple #5
0
    assert ps.dset == 'roc'
    p = pth.Path(ps.dir_data) / ps.dset
    pv = p / ps.vocab_path
    p = p / kind
    if not p.exists():
        tokenizer = encoder.tokenizer_for(ps)
        ts = F.Topics(tokenizer(reader(ps, kind)))
        R.dump(p / ps.dset, lambda: recorder(ts))
        if kind == 'train' and not pv.exists():
            R.dump(pv, lambda: [tokenizer.vocab.record()])
    ds = R.dataset(p / ps.dset)
    return ds, feats


# Parsing spec for the records emitted by `recorder`: variable-length int64
# 'title', 'context' and 'query' features, a scalar int64 'valid' flag and
# a scalar string 'uid'.
#
# NOTE: this must be a plain dict.  A trailing comma after the closing brace
# turns the assignment into a 1-tuple wrapping the dict, so callers would
# receive `({...},)` instead of the feature mapping.
feats = {
    'title': tf.VarLenFeature(tf.int64),
    'context': tf.VarLenFeature(tf.int64),
    'query': tf.VarLenFeature(tf.int64),
    'valid': tf.FixedLenFeature([], tf.int64),
    'uid': tf.FixedLenFeature([], tf.string),
}


def recorder(topics):
    for t, c, q in topics.queries():
        yield R.example({
            'title': R.ints_feat([*t.title.toks]),
            'context': R.ints_feat([*c.toks]),
            'query': R.ints_feat([*q.toks]),
            'valid': R.one_int_feat(1 if q.valid else 0),
            'uid': R.bytes_feat(q.uid),