def get(ddir: str, ft_path: str, split: str):
    """Assemble the zipped (quality, sent1, sent2) dataset for one split.

    Seeds the global ``random`` module with 1111, loads a fastText model
    from ``ft_path`` and wraps it in ``SWEM``. Three line-oriented text
    files are then read from ``ddir``: ``quality.<split>.txt``,
    ``sent1.<split>.txt`` and ``sent2.<split>.txt``.

    Args:
        ddir: Directory containing the three ``*.<split>.txt`` files.
        ft_path: Path handed to ``fastText.load_model``.
        split: Split name interpolated into the file names.

    Returns:
        The result of ``lf.zip`` over the quality, first-sentence and
        second-sentence datasets, in that order.
    """
    random.seed(1111)
    root = Path(ddir)

    swem = SWEM(fastText.load_model(ft_path))

    # Each quality line is converted with int().
    quality_lines = lf.TextDataset(str(root / f'quality.{split}.txt'))
    labels = quality_lines.map(int)

    # Each sentence file gets its own sent_preprocess(swem) callable.
    # NOTE(review): sent_preprocess is defined elsewhere; presumably it
    # turns a raw line into SWEM features -- confirm.
    first_lines = lf.TextDataset(str(root / f'sent1.{split}.txt'))
    first = first_lines.map(sent_preprocess(swem))

    second_lines = lf.TextDataset(str(root / f'sent2.{split}.txt'))
    second = second_lines.map(sent_preprocess(swem))

    return lf.zip(labels, first, second)
Exemple #2
0
def build(datapath='./data/example.txt', savedir='./'):
    """Build a vocabulary from a text file and save the processed dataset.

    Every line of ``datapath`` is passed through ``preprocess`` and paired
    with its line index. The second element of each pair is flattened into
    a lazy stream that is given to ``build_vocab`` together with the path
    ``<savedir>/vocab.pkl``. The paired dataset is then mapped with
    ``postprocess(t2i, unk_index)`` and saved to
    ``<savedir>/dataset.token.pkl``.

    Args:
        datapath: Path of the input text file, one document per line.
        savedir: Directory used for ``vocab.pkl`` and ``dataset.token.pkl``.

    Returns:
        None.
    """
    source = Path(datapath)
    out_dir = Path(savedir)

    raw_docs = lf.TextDataset(str(source))
    indices = lf.Dataset(range(len(raw_docs)))
    indexed = lf.zip(indices, raw_docs.map(preprocess))

    # Lazily flatten the per-document element (position 1 of each pair).
    # NOTE(review): assumes preprocess returns an iterable of tokens -- confirm.
    token_stream = lf.flat_map(lambda pair: pair[1], indexed, lazy=True)
    vocab_path = str(out_dir / 'vocab.pkl')
    # The second value returned by build_vocab is not used here.
    token_to_index, _words = build_vocab(token_stream, vocab_path)

    unk_index = token_to_index[UNK_TOKEN]

    # NOTE(review): postprocess is defined elsewhere; presumably it maps
    # tokens to indices with unk_index as the fallback -- confirm.
    encoded = indexed.map(postprocess(token_to_index, unk_index))
    encoded.save(str(out_dir / 'dataset.token.pkl'))
def test_get(ddir: str, savedir: str, bsize: int, ft_path: str):
    """Create a DataLoader over the test split.

    Loads a fastText model from ``ft_path`` and wraps it in ``SWEM``, then
    reads ``quality.test.txt``, ``sent1.test.txt`` and ``sent2.test.txt``
    from ``ddir`` and zips them. The zipped dataset is passed through
    ``save`` with the path ``<savedir>/swem.test.cache`` before being
    wrapped in a ``DataLoader``.

    Args:
        ddir: Directory containing the three ``*.test.txt`` files.
        savedir: Directory for the ``swem.test.cache`` file.
        bsize: Batch size given to the ``DataLoader``.
        ft_path: Path handed to ``fastText.load_model``.

    Returns:
        A ``DataLoader`` with ``shuffle=False``, ``num_workers=4`` and the
        collate function returned by ``get_collate_fn()``.
    """
    data_root = Path(ddir)
    cache_root = Path(savedir)

    swem = SWEM(fastText.load_model(ft_path))

    # Each quality line is converted with int().
    quality_lines = lf.TextDataset(str(data_root / 'quality.test.txt'))
    labels = quality_lines.map(int)

    # Each sentence file gets its own sent_preprocess(swem) callable.
    first_lines = lf.TextDataset(str(data_root / 'sent1.test.txt'))
    first = first_lines.map(sent_preprocess(swem))

    second_lines = lf.TextDataset(str(data_root / 'sent2.test.txt'))
    second = second_lines.map(sent_preprocess(swem))

    zipped = lf.zip(labels, first, second)

    # The cache path is handed to save() as a Path object, not a str.
    cached = zipped.save(cache_root / 'swem.test.cache')
    collate = get_collate_fn()

    return DataLoader(
        cached,
        batch_size=bsize,
        shuffle=False,
        num_workers=4,
        collate_fn=collate,
    )
Exemple #4
0
 def setUp(self):
     """Prepare the fixture: a base range, a width, and a zipped dataset."""
     self.base = range(100)
     self.n = 5
     # One Dataset instance is built and repeated n times, so every
     # column passed to lineflow.zip is the same object.
     shared = Dataset(self.base)
     columns = [shared] * self.n
     self.data = lineflow.zip(*columns)