def load(self, v):
    # Accept either a dumped records file on disk or an in-memory
    # iterable of words.
    if isinstance(v, pth.Path):
        v = R.load(v, {
            'toks': tf.VarLenFeature(tf.string),
        })
    for i, w in enumerate(v):
        w = w.strip()
        assert w not in self.by_word  # duplicates would corrupt the index
        self.by_word[w] = i
        self.by_idx.append(w)
    # An explicitly loaded vocab is frozen; no words are added lazily later.
    self.fixed = True
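# Hedged sketch: a minimal stand-in for the class that `load` above belongs
# to, to make the word<->index invariant concrete. The attribute names come
# from the method body; the class name `Vocab` and everything else here are
# assumptions for illustration only.
class Vocab:
    def __init__(self):
        self.by_word = {}  # word -> index
        self.by_idx = []   # index -> word
        self.fixed = False

    def load(self, words):
        # Same loop as above, minus the on-disk records branch.
        for i, w in enumerate(words):
            w = w.strip()
            assert w not in self.by_word
            self.by_word[w] = i
            self.by_idx.append(w)
        self.fixed = True


v = Vocab()
v.load(['<pad>', '<unk>', 'the'])
assert v.by_word['the'] == 2 and v.by_idx[2] == 'the' and v.fixed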
import pathlib as pth

import tensorflow as tf

from qnarre.feeds.prep import records as R


def dset(ps, kind):
    assert ps.dset.startswith('mnist')
    p = pth.Path(ps.dir_data) / ps.dset / kind
    # Materialize the records on first use, then serve them from disk.
    if not p.exists():
        vs = tuple(reader(ps, kind))
        R.dump(p / ps.dset, lambda: recorder(vs))
    ds = R.dataset(p / ps.dset)
    return ds, feats


feats = {
    'int_img': tf.FixedLenFeature([28 * 28], tf.int64),
    'flt_img': tf.VarLenFeature(tf.float32),
    'int_lbl': tf.FixedLenFeature([], tf.int64),
    'str_lbl': tf.FixedLenFeature([], tf.string),
}


def recorder(vals):
    for iis, fis, il, sl in vals:
        yield R.example({
            'int_img': R.ints_feat(iis),
            'flt_img': R.floats_feat(fis),
            'int_lbl': R.one_int_feat(il),
            'str_lbl': R.bytes_feat(sl),
        })
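# Hedged usage sketch: one way the (dataset, features) pair returned by
# `dset` might be consumed. It assumes R.dataset yields serialized
# tf.train.Example protos, which matches the recorder above; `parsed_mnist`
# is a hypothetical helper, not part of the module.
def parsed_mnist(ps):
    ds, fs = dset(ps, 'train')
    # tf.parse_single_example maps each raw record through the feature spec.
    return ds.map(lambda rec: tf.parse_single_example(rec, fs))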
pv = p / ps.vocab_path
p = p / kind
if not p.exists():
    tokenizer = encoder.tokenizer_for(ps)
    ts = F.Topics(tokenizer(reader(ps, kind)))
    # Bind n at definition time (n=n): a plain closure would late-bind and,
    # if the dump factories were ever called after the loop, they would all
    # see the loop's final value of n.
    for n in registry['all']:
        R.dump(p / n, lambda n=n: registry[n](ts))
    # The vocab is derived from the training split only.
    if kind == 'train' and not pv.exists():
        R.dump(pv, lambda: [tokenizer.vocab.record()])
ds = R.dataset(p / ps.dset_subset)
return ds, feats[ps.dset_subset]


feats = {
    'query_valid': {
        'title': tf.VarLenFeature(tf.int64),
        'context': tf.VarLenFeature(tf.int64),
        'query': tf.VarLenFeature(tf.int64),
        'valid': tf.FixedLenFeature([], tf.int64),
        'uid': tf.FixedLenFeature([], tf.string),
    },
    'reply_spans': {
        'title': tf.VarLenFeature(tf.int64),
        'context': tf.VarLenFeature(tf.int64),
        'query': tf.VarLenFeature(tf.int64),
        'reply': tf.VarLenFeature(tf.int64),
        'begin': tf.FixedLenFeature([], tf.int64),
        'end': tf.FixedLenFeature([], tf.int64),
        'uid': tf.FixedLenFeature([], tf.string),
    },
    'possibles': {
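# Hedged aside on the n=n default above: Python closures late-bind loop
# variables, so deferred, unbound lambdas would all see the final value of n.
# A self-contained demonstration:
fns = [lambda: n for n in ('a', 'b')]
assert [f() for f in fns] == ['b', 'b']    # late binding: both see 'b'
fns = [lambda n=n: n for n in ('a', 'b')]
assert [f() for f in fns] == ['a', 'b']    # default arg binds per iteration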
assert ps.dset == 'enwik8'
p = pth.Path(ps.dir_data) / ps.dset
pv = p / ps.vocab_path
p = p / kind
if not p.exists():
    tokenizer = encoder.tokenizer_for(ps)
    tp = F.Topic(ps.dset, tokenizer(reader(ps, kind)))
    R.dump(p / ps.dset, lambda: recorder(tp))
    if kind == 'train' and not pv.exists():
        R.dump(pv, lambda: [tokenizer.vocab.record()])
ds = R.dataset(p / ps.dset)
return ds, feats


feats = {
    'context': tf.VarLenFeature(tf.int64),
    'uid': tf.FixedLenFeature([], tf.string),
}


def recorder(topic):
    for _, c in topic.contexts():
        yield R.example({
            'context': R.ints_feat([*c.toks]),
            'uid': R.bytes_feat(c.uid),
        })


def reader(ps, kind):
    assert not ps.dset or ps.dset == 'enwik8'
    p = pth.Path(ps.dir_data) / ps.dset
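# Hedged sketch of what the R.example / R.ints_feat / R.bytes_feat helpers
# likely wrap, using the plain tf.train.Example proto API. Their real
# signatures are only inferred from the call sites above.
import tensorflow as tf


def ints_feat(vs):
    return tf.train.Feature(int64_list=tf.train.Int64List(value=list(vs)))


def bytes_feat(b):
    if isinstance(b, str):
        b = b.encode('utf-8')  # Example protos carry bytes, not str
    return tf.train.Feature(bytes_list=tf.train.BytesList(value=[b]))


def example(d):
    return tf.train.Example(features=tf.train.Features(feature=d))


ex = example({'context': ints_feat([1, 2, 3]), 'uid': bytes_feat('c-0')})
serialized = ex.SerializeToString()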
assert ps.dset == 'roc'
p = pth.Path(ps.dir_data) / ps.dset
pv = p / ps.vocab_path
p = p / kind
if not p.exists():
    tokenizer = encoder.tokenizer_for(ps)
    ts = F.Topics(tokenizer(reader(ps, kind)))
    R.dump(p / ps.dset, lambda: recorder(ts))
    if kind == 'train' and not pv.exists():
        R.dump(pv, lambda: [tokenizer.vocab.record()])
ds = R.dataset(p / ps.dset)
return ds, feats


feats = {
    'title': tf.VarLenFeature(tf.int64),
    'context': tf.VarLenFeature(tf.int64),
    'query': tf.VarLenFeature(tf.int64),
    'valid': tf.FixedLenFeature([], tf.int64),
    'uid': tf.FixedLenFeature([], tf.string),
}


def recorder(topics):
    for t, c, q in topics.queries():
        yield R.example({
            'title': R.ints_feat([*t.title.toks]),
            'context': R.ints_feat([*c.toks]),
            'query': R.ints_feat([*q.toks]),
            'valid': R.one_int_feat(1 if q.valid else 0),
            'uid': R.bytes_feat(q.uid),
        })
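# Hedged usage sketch: VarLenFeature entries parse to tf.SparseTensor, so a
# consumer typically densifies them before padding/batching. `densify` is a
# hypothetical helper; `rec` is assumed to be a serialized example as dumped
# by the recorder above.
def densify(rec):
    ex = tf.parse_single_example(rec, feats)
    return {
        k: tf.sparse.to_dense(v) if isinstance(v, tf.SparseTensor) else v
        for k, v in ex.items()
    }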