def get(savepath: str,
        bsize: int = 32,
        vocab_size: int = 5000) -> Tuple[DataLoader, DataLoader, Dict, Dict]:
    savepath = Path(savepath)

    print('Reading...')
    train = lfds.SmallParallelEnJa('train')
    validation = lfds.SmallParallelEnJa('dev')
    train = train.map(preprocess)
    validation = validation.map(preprocess)

    # Collect tokens lazily over both splits.
    src_tokens = lf.flat_map(lambda x: x[0], train + validation, lazy=True)  # en
    tgt_tokens = lf.flat_map(lambda x: x[1], train + validation, lazy=True)  # ja

    print('Building vocabulary...')
    src_t2i, _ = build_vocab(src_tokens, savepath / 'src.vocab', vocab_size)
    tgt_t2i, _ = build_vocab(tgt_tokens, savepath / 'tgt.vocab', vocab_size)
    print(f'Source vocab size: {len(src_t2i)}')
    print(f'Target vocab size: {len(tgt_t2i)}')

    src_pad_idx = src_t2i[PAD_TOKEN]
    tgt_pad_idx = tgt_t2i[PAD_TOKEN]
    src_unk_idx = src_t2i[UNK_TOKEN]
    tgt_unk_idx = tgt_t2i[UNK_TOKEN]

    print('Postprocessing...')
    train_loader = DataLoader(
        train
        .map(postprocess(src_t2i, src_unk_idx, tgt_t2i, tgt_unk_idx))
        .save(savepath / 'enja.train.cache'),
        batch_size=bsize,
        shuffle=True,
        num_workers=4,
        collate_fn=get_collate_fn(src_pad_idx, tgt_pad_idx))
    validation_loader = DataLoader(
        validation
        .map(postprocess(src_t2i, src_unk_idx, tgt_t2i, tgt_unk_idx))
        .save(savepath / 'enja.validation.cache'),
        batch_size=bsize,
        shuffle=False,
        num_workers=4,
        collate_fn=get_collate_fn(src_pad_idx, tgt_pad_idx))

    return train_loader, validation_loader, src_t2i, tgt_t2i
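# A hedged usage sketch for get() above; the path and hyperparameters are
# placeholders, not values from the original repo, and the batch layout
# assumes the (src, tgt) tensor pair produced by get_collate_fn.
if __name__ == '__main__':
    train_loader, validation_loader, src_t2i, tgt_t2i = get(
        './cache', bsize=64, vocab_size=8000)
    for src_batch, tgt_batch in train_loader:
        ...  # feed padded batches to a seq2seq model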
def test_returns_flat_mapped_data_lazily(self):
    result = lineflow.flat_map(lambda x: [x] * 3, self.data, lazy=True)
    self.assertIsInstance(result, itertools.chain)
    expected = list(itertools.chain.from_iterable(
        [[x] * 3 for x in self.data]))
    for x, y in zip(result, expected):
        self.assertEqual(x, y)
def build(dpath, savedir):
    '''
    1. Read the csv files under dpath.
    2. Preprocess (tokenize, strip).
    3. Build the vocabulary.
    4. Replace tokens with ids.
    5. Save.
    '''
    dpath = Path(dpath)
    savedir = Path(savedir)

    # Preprocess.
    tokenizer = Tokenizer()
    train = lf.CsvDataset(str(dpath / 'train.csv'),
                          header=True).map(get_preprocess(tokenizer))
    test = lf.CsvDataset(str(dpath / 'test.csv'),
                         header=True).map(get_preprocess(tokenizer))

    # Collect all tokens lazily.
    tokens = lf.flat_map(lambda x: x['tokens'], train, lazy=True)

    # Build vocab.
    words, t2i = build_vocab(tokens)

    # Save vocab.
    with open(savedir / 'vocab.pkl', 'wb') as f:
        pickle.dump((t2i, words), f)

    # Save datasets.
    train.map(get_postprocess(t2i, t2i[UNK_TOKEN])).save(
        str(savedir / 'dataset.train.token.pkl'))
    test.map(get_postprocess(t2i, t2i[UNK_TOKEN])).save(
        str(savedir / 'dataset.test.token.pkl'))
def build(datapath='./data/example.txt', savedir='./'):
    datapath = Path(datapath)
    savedir = Path(savedir)

    docs = lf.TextDataset(str(datapath))
    ids = lf.Dataset(range(len(docs)))
    docs = docs.map(preprocess)
    ds = lf.zip(ids, docs)

    # Collect tokens lazily from the token field of each (id, tokens) pair.
    tokens = lf.flat_map(lambda x: x[1], ds, lazy=True)
    t2i, words = build_vocab(tokens, str(savedir / 'vocab.pkl'))

    unk_index = t2i[UNK_TOKEN]
    ds.map(postprocess(t2i, unk_index)) \
        .save(str(savedir / 'dataset.token.pkl'))
def test_returns_flat_mapped_data_eagerly(self):
    result = lineflow.flat_map(lambda x: [x] * 3, self.data)
    expected = [[x] * 3 for x in self.data]
    expected = [x for xs in expected for x in xs]
    self.assertListEqual(result, expected)
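# A minimal sketch of the behavior the two tests above pin down, assuming
# lineflow.flat_map accepts any iterable (the tests use plain sequence data):
# an eager call returns a materialized list, while lazy=True returns an
# itertools.chain that yields the same elements on demand.
import itertools
import lineflow

data = [1, 2, 3]
eager = lineflow.flat_map(lambda x: [x] * 3, data)
lazy = lineflow.flat_map(lambda x: [x] * 3, data, lazy=True)
assert eager == [1, 1, 1, 2, 2, 2, 3, 3, 3]
assert isinstance(lazy, itertools.chain)
assert list(lazy) == eager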
        padded_src = [x + [pad_index] * (src_max_length - len(x)) for x in src]
        padded_tgt = [y + [IGNORE_INDEX] * (tgt_max_length - len(y)) for y in tgt]
        return torch.LongTensor(padded_src), torch.LongTensor(padded_tgt)
    return f


if __name__ == '__main__':
    print('Reading...')
    train = lfds.SmallParallelEnJa('train')
    validation = lfds.SmallParallelEnJa('dev')
    train = train.map(preprocess)
    validation = validation.map(preprocess)

    en_tokens = lf.flat_map(lambda x: x[0], train + validation, lazy=True)
    ja_tokens = lf.flat_map(lambda x: x[1], train + validation, lazy=True)

    print('Building vocabulary...')
    en_token_to_index, _ = build_vocab(en_tokens, 'en.vocab')
    ja_token_to_index, _ = build_vocab(ja_tokens, 'ja.vocab')
    print(f'English vocab size: {len(en_token_to_index)}')
    print(f'Japanese vocab size: {len(ja_token_to_index)}')

    pad_index = en_token_to_index[PAD_TOKEN]
    en_unk_index = en_token_to_index[UNK_TOKEN]
    ja_unk_index = ja_token_to_index[UNK_TOKEN]

    loader = DataLoader(
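# Padding targets with IGNORE_INDEX instead of the vocabulary pad id lets the
# loss skip padded positions entirely. A minimal sketch, assuming
# IGNORE_INDEX = -100 (PyTorch's default ignore_index; the actual constant is
# defined elsewhere in the example):
import torch
import torch.nn as nn

IGNORE_INDEX = -100
criterion = nn.CrossEntropyLoss(ignore_index=IGNORE_INDEX)
logits = torch.randn(2, 4, 10)  # (batch, seq_len, vocab)
target = torch.tensor([[1, 2, IGNORE_INDEX, IGNORE_INDEX],
                       [3, 4, 5, IGNORE_INDEX]])
# CrossEntropyLoss expects (batch, vocab, seq_len) for sequence targets.
loss = criterion(logits.transpose(1, 2), target)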
def __iter__(self) -> Iterator[Any]:
    # Lazily chain the per-item iterables produced by the map function.
    yield from lf.flat_map(self._map_func, self._dataset, lazy=True)
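# The __iter__ above presumably belongs to a flat-mapped dataset view. A
# hypothetical minimal wrapper, for illustration only (the real class in
# lineflow carries more machinery, such as indexing and caching):
import lineflow as lf

class FlatMapView:
    def __init__(self, map_func, dataset):
        self._map_func = map_func
        self._dataset = dataset

    def __iter__(self):
        # Chain the per-item iterables without materializing them.
        yield from lf.flat_map(self._map_func, self._dataset, lazy=True)

view = FlatMapView(lambda s: s.split(), ['a b', 'c d'])
assert list(view) == ['a', 'b', 'c', 'd']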
def get_collate_fn(pad_index):
    def f(batch):
        indices, labels = zip(*batch)
        max_length = max(len(x) for x in indices)
        # Right-pad every sequence to the longest one in the batch.
        padded = [x + [pad_index] * (max_length - len(x)) for x in indices]
        return torch.LongTensor(padded), torch.LongTensor(labels)
    return f


if __name__ == '__main__':
    print('Reading...')
    train = lfds.Imdb('train').map(preprocess)
    tokens = lf.flat_map(lambda x: x[0], train, lazy=True)

    print('Building vocabulary...')
    token_to_index, _ = build_vocab(tokens, 'vocab.pkl')
    print(f'Vocab size: {len(token_to_index)}')

    pad_index = token_to_index[PAD_TOKEN]
    unk_index = token_to_index[UNK_TOKEN]

    loader = DataLoader(
        train
        .map(postprocess(token_to_index, unk_index))
        .save('imdb.train.cache'),
        batch_size=32,
        num_workers=4,
        collate_fn=get_collate_fn(pad_index))
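# A toy batch showing what the collate function above produces, assuming the
# get_collate_fn defined above is in scope and pad_index is 0: each (ids,
# label) pair is split apart, and ids are right-padded to the batch maximum.
collate = get_collate_fn(pad_index=0)
indices, labels = collate([([5, 6, 7], 1), ([8], 0)])
# indices -> tensor([[5, 6, 7],
#                    [8, 0, 0]])
# labels  -> tensor([1, 0])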
print('Reading...')
train = Seq2SeqDataset(
    source_file_path='./cnndm/train.txt.src',
    target_file_path='./cnndm/train.txt.tgt.tagged') \
    .to_dict(source_field_name=SOURCE_FIELD, target_field_name=TARGET_FIELD)
validation = Seq2SeqDataset(
    source_file_path='./cnndm/val.txt.src',
    target_file_path='./cnndm/val.txt.tgt.tagged') \
    .to_dict(source_field_name=SOURCE_FIELD, target_field_name=TARGET_FIELD)

train = train.map(preprocess)
validation = validation.map(preprocess)

# Build a single shared vocabulary over source and target tokens.
tokens = lf.flat_map(lambda x: x[SOURCE_FIELD] + x[TARGET_FIELD],
                     train + validation,
                     lazy=True)

print('Building vocabulary...')
token_to_index, words = build_vocab(tokens)
print(f'Vocab size: {len(token_to_index)}')

pad_index = token_to_index[PAD_TOKEN]
unk_index = token_to_index[UNK_TOKEN]

loader = DataLoader(
    train.map(postprocess(token_to_index, unk_index)).save('cnndm.preprocessed'),
    batch_size=32,
    num_workers=4,
    collate_fn=collate(pad_index))

for batch in tqdm(loader):