def setUp(self): d = mock_dict() tokens_1 = torch.LongTensor([i for i in range(1, 5000, 2)]).view(1, -1) tokens_ds1 = TokenBlockDataset( tokens_1, sizes=[tokens_1.size(-1)], block_size=1, pad=0, eos=1, include_targets=False, ) self.dataset_1 = LanguagePairDataset(tokens_ds1, tokens_ds1.sizes, d, shuffle=False) tokens_2 = torch.LongTensor([i for i in range(0, 5000, 2)]).view(1, -1) tokens_ds2 = TokenBlockDataset( tokens_2, sizes=[tokens_2.size(-1)], block_size=1, pad=0, eos=1, include_targets=False, ) self.dataset_2 = LanguagePairDataset(tokens_ds2, tokens_ds2.sizes, d, shuffle=False)
def build_dataset_for_inference(self, src_tokens, src_lengths, **kwargs): assert not self.cfg.include_src or len(src_tokens[0]) == 2 input_src = None if self.cfg.include_src: input_src = TokenBlockDataset( [t[0] for t in src_tokens], [l[0] for l in src_lengths], block_size=None, # ignored for "eos" break mode pad=self.source_dictionary.pad(), eos=self.source_dictionary.eos(), break_mode="eos", ) input_src = PrependTokenDataset(input_src, self.dictionary.bos()) input_src = TruncateDataset(input_src, self.cfg.max_positions) input_tgt = TokenBlockDataset( [t[-1] for t in src_tokens], [l[-1] for l in src_lengths], block_size=None, # ignored for "eos" break mode pad=self.source_dictionary.pad(), eos=self.source_dictionary.eos(), break_mode="eos", ) input_tgt = TruncateDataset(input_tgt, self.cfg.max_positions) if self.cfg.include_src: src_tokens = ConcatSentencesDataset(input_src, input_tgt) src_lengths = NumelDataset(input_src, reduce=False) else: input_tgt = PrependTokenDataset(input_tgt, self.dictionary.bos()) src_tokens = input_tgt src_lengths = NumelDataset(src_tokens, reduce=False) dataset = { "id": IdDataset(), "net_input": { "src_tokens": RightPadDataset( src_tokens, pad_idx=self.source_dictionary.pad(), ), "src_lengths": src_lengths, }, "nsentences": NumSamplesDataset(), "ntokens": NumelDataset(src_tokens, reduce=True), } return NestedDictionaryDataset( dataset, sizes=[src_tokens.sizes], )
def get_prepended_token_block_dataset(args, dataset_path, vocab, combine=False): dataset = data_utils.load_indexed_dataset( dataset_path, vocab, args.dataset_impl, combine=combine, ) if dataset is None: raise FileNotFoundError('Dataset not found: ({})'.format(dataset_path)) if not args.apply_ptb: print("| [I] ptb not applied.", flush=True) return dataset dataset = TruncateDataset(dataset, args.tokens_per_sample - 1) dataset = TokenBlockDataset( dataset, dataset.sizes, args.tokens_per_sample - 1, # one less for <s> pad=vocab.pad(), eos=vocab.eos(), break_mode=args.sample_break_mode, ) print('| loaded {} blocks from: {}'.format(len(dataset), dataset_path), flush=True) dataset = PrependTokenDataset(dataset, vocab.bos()) return dataset
def main(args): tokenizer = build_tokenizer(args) indices = [] with open(args.input) as fp: for line in tqdm(fp): line = line.strip() indices.append(tokenizer.encode(line)) print("tokenize finished.") for i in range(5): print("example[%d]:" % i) input_ids = indices[i] print(input_ids) tokens = tokenizer.convert_ids_to_tokens(input_ids) print(tokens) dataset = IndexDataset(indices) dataset = TruncateDataset(dataset, args.tokens_per_sample - 1) dataset = TokenBlockDataset( dataset, dataset.sizes, args.tokens_per_sample - 1, # one less for <s> pad=tokenizer.pad_token_id, eos=tokenizer.sep_token_id, break_mode=args.sample_break_mode, ) print('| loaded {} blocks from: {}'.format(len(dataset), args.input), flush=True) dataset = PrependTokenDataset(dataset, tokenizer.cls_token_id) print("| get all items ...") items = [i for i in tqdm(dataset)] print("| writing binary file ...") prefix = os.path.join(args.output, "train.0") save_items(items, prefix, len(tokenizer))
def build_dataset_for_inference(self, src_tokens, src_lengths, sort=True): src_dataset = PadDataset( TokenBlockDataset( src_tokens, src_lengths, self.args.tokens_per_sample - 1, # one less for <s> pad=self.source_dictionary.pad(), eos=self.source_dictionary.eos(), break_mode='eos', ), pad_idx=self.source_dictionary.pad(), left_pad=False, ) src_dataset = PrependTokenDataset(src_dataset, self.source_dictionary.bos()) src_dataset = NestedDictionaryDataset( { 'id': IdDataset(), 'net_input': { 'src_tokens': src_dataset, 'src_lengths': NumelDataset(src_dataset, reduce=False), }, }, sizes=src_lengths, ) if sort: src_dataset = SortDataset(src_dataset, sort_order=[src_lengths]) return src_dataset
def build_dataset_for_inference(self, src_tokens, src_lengths, sort=True): src_dataset = RightPadDataset( TokenBlockDataset( src_tokens, src_lengths, self.args.tokens_per_sample - 1, # one less for <s> pad=self.source_dictionary.pad(), eos=self.source_dictionary.eos(), break_mode="eos", ), pad_idx=self.source_dictionary.pad(), ) src_dataset = PrependTokenDataset(src_dataset, self.source_dictionary.bos()) src_dataset = NestedDictionaryDataset( { "id": IdDataset(), "net_input": { "src_tokens": src_dataset, "src_lengths": NumelDataset(src_dataset, reduce=False), }, }, sizes=src_lengths, ) if sort: src_dataset = SortDataset(src_dataset, sort_order=[src_lengths]) return src_dataset
def build_dataset_for_inference(self, src_tokens, src_lengths, **kwargs): """ Generate batches for inference. We assume that the input begins with a bos symbol (`<s>`) and ends with an eos symbol (`</s>`). """ pad = self.source_dictionary.pad() eos = self.source_dictionary.eos() src_dataset = TokenBlockDataset( src_tokens, src_lengths, block_size=self.args.tokens_per_sample - 2, # for <s> and </s> pad=pad, eos=eos, break_mode=self.args.sample_break_mode, document_sep_len=0, ) prev_output_tokens = PrependTokenDataset( StripTokenDataset(src_dataset, eos), eos) src_dataset = PadDataset(src_dataset, pad_idx=pad, left_pad=False) return NestedDictionaryDataset( { "id": IdDataset(), "net_input": { "src_tokens": src_dataset, "src_lengths": NumelDataset(src_dataset, reduce=False), "prev_output_tokens": PadDataset(prev_output_tokens, pad_idx=pad, left_pad=False), }, "target": src_dataset, }, sizes=[np.array(src_lengths)], )
def build_dataset_for_inference(self, src_tokens: List[torch.Tensor], src_lengths: List[int], **kwargs) -> torch.utils.data.Dataset: eos = self.source_dictionary.eos() dataset = TokenBlockDataset( src_tokens, src_lengths, block_size=None, # ignored for "eos" break mode pad=self.source_dictionary.pad(), eos=eos, break_mode="eos", ) class Dataset(torch.utils.data.Dataset): def __getitem__(self, i): item = dataset[i] if item[-1] == eos: # remove eos to support generating with a prefix item = item[:-1] return (i, [item]) def __len__(self): return len(dataset) return Dataset()
def build_dataset_for_inference(self, src_tokens, src_lengths, sort=True): src_dataset = RightPadDataset( TokenBlockDataset( src_tokens, src_lengths, self.args.tokens_per_sample, pad=self.source_dictionary.pad(), eos=self.source_dictionary.eos(), break_mode='eos', ), pad_idx=self.source_dictionary.pad(), ) # remove tail src_dataset = RemoveTailDataset(src_dataset) src_dataset = NestedDictionaryDataset( { 'id': IdDataset(), 'net_input': { 'src_tokens': src_dataset, 'src_lengths': NumelDataset(src_dataset, reduce=False), }, }, sizes=src_lengths, ) if sort: src_dataset = SortDataset(src_dataset, sort_order=[src_lengths]) return src_dataset
def load_dataset(self, split, epoch=1, combine=False, **kwargs): """Load a given dataset split (e.g., train, valid, test)""" # support sharded datasets paths = utils.split_paths(self.cfg.data) assert len(paths) > 0 data_path = paths[(epoch - 1) % len(paths)] split_path = os.path.join(data_path, split) # each element of *data* will be a tensorized line from the original # text dataset, similar to ``open(split_path).readlines()`` data = data_utils.load_indexed_dataset(split_path, self.dictionary, combine=combine) if data is None: raise FileNotFoundError("Dataset not found: {} ({})".format( split, split_path)) # this is similar to ``data.view(-1).split(tokens_per_sample)`` data = TokenBlockDataset( data, data.sizes, block_size=self.cfg.tokens_per_sample, pad=None, # unused eos=None, # unused break_mode="none", ) self.datasets[split] = TruncatedBPTTDataset( data=data, bsz_per_shard=self.cfg.batch_size, shard_id=self.cfg.data_parallel_rank, num_shards=self.cfg.data_parallel_size, )
def load_dataset(self, split, epoch=0, combine=False, **kwargs): """Load a given dataset split. Args: split (str): name of the split (e.g., train, valid, test) """ paths = self.args.data.split(':') assert len(paths) > 0 data_path = paths[epoch % len(paths)] split_path = os.path.join(data_path, split) dataset = data_utils.load_indexed_dataset( split_path, self.dictionary, self.args.dataset_impl, combine=combine, ) if dataset is None: raise FileNotFoundError('Dataset not found: {} ({})'.format(split, split_path)) dataset = TokenBlockDataset( dataset, dataset.sizes, self.args.tokens_per_sample, pad=self.dictionary.pad(), eos=self.dictionary.eos(), break_mode=self.args.sample_break_mode, include_targets=True, ) add_eos_for_other_targets = self.args.sample_break_mode is not None and self.args.sample_break_mode != 'none' self.datasets[split] = MonolingualDataset( dataset, dataset.sizes, self.dictionary, self.output_dictionary, add_eos_for_other_targets=add_eos_for_other_targets, shuffle=True, targets=self.targets, add_bos_token=self.args.add_bos_token, )
def load_mask_data( path, mydict): #一个大列表,每个item是一个文档矩阵,矩阵里面每个item是一个node的数值 ,for token_id 和 #print('???',path) #from fairseq.data.indexed_dataset import MMapIndexedDataset #print('???', MMapIndexedDataset(path) ) dataset = data_utils.load_indexed_dataset( path, mydict, 'mmap', combine=False, ) #print(dataset.__getitem__(0),dataset.__getitem__(0).shape,len(dataset)) dataset = TokenBlockDataset( dataset, dataset.sizes, 512 - 1, pad=mydict.pad(), eos=mydict.eos(), break_mode='complete_doc', ) #print(dataset.__getitem__(0),dataset.__getitem__(0).shape,len(dataset)) dataset = PrependTokenDataset(dataset, mydict.bos()) #print(dataset.__getitem__(0),dataset.__getitem__(0).shape,len(dataset)) return dataset
def load_sentence(self, split, sentence): loaded_datasets = [] words = sentence.split(' ') ds = IndexedRawTextDataset(words, self.dictionary) loaded_datasets.append( TokenBlockDataset( ds, ds.sizes, self.args.tokens_per_sample, pad=self.dictionary.pad(), eos=self.dictionary.eos(), break_mode=self.args.sample_break_mode, include_targets=True, )) if len(loaded_datasets) == 1: dataset = loaded_datasets[0] sizes = dataset.sizes else: dataset = ConcatDataset(loaded_datasets) sizes = np.concatenate([ds.sizes for ds in loaded_datasets]) add_eos_for_other_targets = self.args.sample_break_mode is not None and self.args.sample_break_mode != 'none' self.datasets[split] = MonolingualDataset( dataset, sizes, self.dictionary, self.output_dictionary, add_eos_for_other_targets=add_eos_for_other_targets, shuffle=True, targets=self.targets, )
def load_dataset(self, split, combine=False): """Load a given dataset split. Args: split (str): name of the split (e.g., train, valid, test) """ loaded_datasets = [] loaded_labels = [] for k in itertools.count(): split_k = split + (str(k) if k > 0 else '') path = os.path.join(self.args.data, split_k) if self.args.raw_text and IndexedRawTextDataset.exists(path): ds = IndexedRawTextDataset(path, self.dictionary) elif not self.args.raw_text and IndexedInMemoryDataset.exists( path): ds = IndexedDataset(path, fix_lua_indexing=False) else: if k > 0: break else: raise FileNotFoundError( 'Dataset not found: {} ({})'.format( split, self.args.data)) loaded_datasets.append( TokenBlockDataset( ds, 0, pad=self.dictionary.pad(), break_mode='eos', include_targets=False, )) with open(path + '.lbl', 'r') as lbl_f: lines = lbl_f.readlines() loaded_labels.extend(int(l) for l in lines) print('| {} {} {} examples'.format(self.args.data, split_k, len(loaded_datasets[-1]))) if not combine: break if len(loaded_datasets) == 1: dataset = loaded_datasets[0] sizes = dataset.sizes else: dataset = ConcatDataset(loaded_datasets) sizes = np.concatenate([ds.sizes for ds in loaded_datasets]) self.datasets[split] = SentenceClassificationDataset( dataset, loaded_labels, sizes, self.dictionary, )
def load_dataset(self, split: str, epoch=1, combine=False, **kwargs) -> MonolingualDataset: """Load a given dataset split. Args: split (str): name of the split (e.g., train, valid, test) """ paths = utils.split_paths(self.args.data) assert len(paths) > 0 data_path = paths[(epoch - 1) % len(paths)] split_path = os.path.join(data_path, split) # each process has its own copy of the raw data (likely to be an np.memmap) dataset = data_utils.load_indexed_dataset(split_path, self.dictionary, self.args.dataset_impl, combine=combine) if dataset is None: raise FileNotFoundError( f"Dataset not found: {split} ({split_path})") dataset = maybe_shorten_dataset( dataset, split, self.args.shorten_data_split_list, self.args.shorten_method, self.args.tokens_per_sample, self.args.seed, ) dataset = TokenBlockDataset( dataset, dataset.sizes, self.args.tokens_per_sample, pad=self.dictionary.pad(), eos=self.dictionary.eos(), break_mode=self.args.sample_break_mode, include_targets=True, use_plasma_view=self.args.use_plasma_view, split_path=split_path, plasma_path=self.args.plasma_path, ) add_eos_for_other_targets = (self.args.sample_break_mode is not None and self.args.sample_break_mode != "none") self.datasets[split] = MonolingualDataset( dataset=dataset, sizes=dataset.sizes, src_vocab=self.dictionary, tgt_vocab=self.output_dictionary, add_eos_for_other_targets=add_eos_for_other_targets, shuffle=True, targets=self.targets, add_bos_token=self.args.add_bos_token, )
def build_dataset_for_inference(self, src_tokens, src_lengths, language="en_XX", **kwargs): """ Generate batches for inference. We prepend an eos token to src_tokens (or bos if `--add-bos-token` is set) and we append a <pad> to target. This is convenient both for generation with a prefix and LM scoring. """ dataset = StripTokenDataset( TokenBlockDataset( src_tokens, src_lengths, block_size=None, # ignored for "eos" break mode pad=self.source_dictionary.pad(), eos=self.source_dictionary.eos(), break_mode="eos", ), # remove eos from (end of) target sequence self.source_dictionary.eos(), ) src_lang_idx = self.dictionary.index(lang_token(language)) src_dataset = PrependTokenDataset( dataset, token=((src_lang_idx or self.source_dictionary.bos()) if getattr( self.args, "add_bos_token", False) else self.source_dictionary.eos()), ) max_seq_len = max(src_lengths) + 1 tgt_dataset = AppendTokenDataset(dataset, token=self.source_dictionary.pad()) return NestedDictionaryDataset( { "id": IdDataset(), "net_input": { "src_tokens": PadDataset( src_dataset, pad_idx=self.source_dictionary.pad(), left_pad=False, pad_length=max_seq_len, ), "src_lengths": NumelDataset(src_dataset, reduce=False), }, "target": PadDataset( tgt_dataset, pad_idx=self.source_dictionary.pad(), left_pad=False, pad_length=max_seq_len, ), }, sizes=[np.array(src_lengths)], )
def load_dataset(self, split, epoch=0, combine=False, **kwargs): """Load a given dataset split. Args: split (str): name of the split (e.g., train, valid, test) """ use_ctx_dataset = getattr(self.vqvae_args, 'use_context_dataset', 0) paths = self.vqvae_args.data.split(":") assert len(paths) > 0 data_path = paths[epoch % len(paths)] split_path = os.path.join(data_path, split) dataset = data_utils.load_indexed_dataset( split_path, self.dictionary, self.vqvae_args.dataset_impl, combine=combine ) if dataset is None: raise FileNotFoundError( "Dataset not found: {} ({})".format(split, split_path) ) if use_ctx_dataset: dataset = DocBlockDataset( dataset, dataset.sizes, self.vqvae_args.tokens_per_sample, pad=self.dictionary.pad(), eos=self.dictionary.eos(), break_mode=self.vqvae_args.sample_break_mode, include_targets=True, context_mode=self.vqvae_args.context_mode, window_size=self.vqvae_args.window_size, ) else: dataset = TokenBlockDataset( dataset, dataset.sizes, self.vqvae_args.tokens_per_sample, pad=self.dictionary.pad(), eos=self.dictionary.eos(), break_mode=self.vqvae_args.sample_break_mode, include_targets=True, ) add_eos_for_other_targets = ( self.vqvae_args.sample_break_mode is not None and self.vqvae_args.sample_break_mode != "none" ) self.datasets[split] = MonolingualDataset( dataset, dataset.sizes, self.dictionary, self.output_dictionary, add_eos_for_other_targets=add_eos_for_other_targets, shuffle=True, targets=self.targets, add_bos_token=self.vqvae_args.add_bos_token, )
def load_dataset(self, split, combine=False, **kwargs): """Load a given dataset split. Args: split (str): name of the split (e.g., train, valid, test) """ loaded_datasets = [] for k in itertools.count(): split_k = split + (str(k) if k > 0 else '') path = os.path.join(self.args.data, split_k) if self.args.raw_text and IndexedRawTextDataset.exists(path): ds = IndexedRawTextDataset(path, self.dictionary) elif not self.args.raw_text and IndexedDataset.exists(path): ds = IndexedDataset(path, fix_lua_indexing=True) else: if k > 0: break else: raise FileNotFoundError( 'Dataset not found: {} ({})'.format( split, self.args.data)) loaded_datasets.append( TokenBlockDataset( ds, self.args.tokens_per_sample, pad=self.dictionary.pad(), eos=self.dictionary.eos(), break_mode=self.args.sample_break_mode, include_targets=True, )) print('| {} {} {} examples'.format(self.args.data, split_k, len(loaded_datasets[-1]))) if not combine: break if len(loaded_datasets) == 1: dataset = loaded_datasets[0] sizes = dataset.sizes else: dataset = ConcatDataset(loaded_datasets) sizes = np.concatenate([ds.sizes for ds in loaded_datasets]) add_eos_for_other_targets = self.args.sample_break_mode is not None and self.args.sample_break_mode != 'none' self.datasets[split] = MonolingualDataset( dataset, sizes, self.dictionary, self.output_dictionary, add_eos_for_other_targets=add_eos_for_other_targets, shuffle=True, targets=self.targets, )
def load_dataset(self, split, epoch=1, combine=False, **kwargs): """Load a given dataset split. Args: split (str): name of the split (e.g., train, valid, test) """ paths = utils.split_paths(self.args.data) assert len(paths) > 0 data_path = paths[(epoch - 1) % len(paths)] split_path = os.path.join(data_path, split) dataset = data_utils.load_indexed_dataset( split_path, self.dictionary, self.args.dataset_impl, combine=combine, ) if dataset is None: raise FileNotFoundError("Dataset not found: {} ({})".format( split, split_path)) dataset = StripTokenDataset(dataset, self.dictionary.eos()) # create continuous blocks of tokens dataset = TokenBlockDataset( dataset, dataset.sizes, self.args.tokens_per_sample - 2, # one less for <s> and one for </s> pad=self.dictionary.pad(), eos=self.dictionary.eos(), break_mode=self.args.sample_break_mode, document_sep_len=0, ) # prepend beginning-of-sentence token (<s>, equiv. to [CLS] in BERT) dataset = PrependTokenDataset(dataset, self.source_dictionary.bos()) dataset = AppendTokenDataset(dataset, self.source_dictionary.eos()) mask_whole_words = (get_whole_word_mask(self.args, self.source_dictionary) if self.args.mask_length != "subword" else None) self.datasets[split] = DenoisingDataset( dataset, dataset.sizes, self.dictionary, self.mask_idx, mask_whole_words, shuffle=self.args.shuffle_instance, seed=self.seed, args=self.args, ) logger.info( "Split: {0}, Loaded {1} samples of denoising_dataset".format( split, len(self.datasets[split]), ))
def load_dataset(self, split, combine=False): """Load a dataset split.""" loaded_datasets = [] for k in itertools.count(): split_k = split + (str(k) if k > 0 else '') path = os.path.join(self.args.data, split_k) if self.args.raw_text and IndexedRawTextDataset.exists(path): ds = IndexedRawTextDataset(path, self.dictionary) tokens = [t for l in ds.tokens_list for t in l] elif not self.args.raw_text and IndexedInMemoryDataset.exists( path): ds = IndexedInMemoryDataset(path, fix_lua_indexing=True) tokens = ds.buffer else: if k > 0: break else: raise FileNotFoundError( 'Dataset not found: {} ({})'.format( split, self.args.data)) cbt_booktitle_idx = None if self.args.sample_break_mode == 'cbt_booktitle': if self.dictionary.index( '_BOOK_TITLE_') != self.dictionary.unk(): cbt_booktitle_idx = self.dictionary.index('_BOOK_TITLE_') loaded_datasets.append( TokenBlockDataset( tokens, ds.sizes, self.args.tokens_per_sample, self.args.sample_break_mode, include_targets=True, cbt_booktitle_idx=cbt_booktitle_idx, )) print('| {} {} {} examples'.format(self.args.data, split_k, len(loaded_datasets[-1]))) if not combine: break if len(loaded_datasets) == 1: dataset = loaded_datasets[0] sizes = dataset.sizes else: dataset = ConcatDataset(loaded_datasets) sizes = np.concatenate([ds.sizes for ds in loaded_datasets]) self.datasets[split] = MonolingualDataset(dataset, sizes, self.dictionary, shuffle=False)
def load_dataset(self, split, epoch=1, combine=False, **kwargs): """Load a given dataset split. Args: split (str): name of the split (e.g., train, valid, test) """ paths = utils.split_paths(self.args.data) assert len(paths) > 0 data_path = paths[(epoch - 1) % len(paths)] split_path = os.path.join(data_path, split) dataset = data_utils.load_indexed_dataset( split_path, self.dictionary, self.args.dataset_impl, combine=combine ) if dataset is None: raise FileNotFoundError( "Dataset not found: {} ({})".format(split, split_path) ) dataset = maybe_shorten_dataset( dataset, split, self.args.shorten_data_split_list, self.args.shorten_method, self.args.tokens_per_sample, self.args.seed, ) dataset = TokenBlockDataset( dataset, dataset.sizes, self.args.tokens_per_sample, pad=self.dictionary.pad(), eos=self.dictionary.eos(), break_mode=self.args.sample_break_mode, include_targets=True, ) add_eos_for_other_targets = ( self.args.sample_break_mode is not None and self.args.sample_break_mode != "none" ) self.datasets[split] = self._initialize_dataset( dataset=dataset, sizes=dataset.sizes, src_vocab=self.dictionary, tgt_vocab=self.output_dictionary, add_eos_for_other_targets=add_eos_for_other_targets, shuffle=True, targets=self.targets, add_bos_token=self.args.add_bos_token, )
def load_dataset(self, split, combine=False): """Load a given dataset split. Args: split (str): name of the split (e.g., train, valid, test) """ dataset_map = OrderedDict() for lang in self.langs2id.keys(): if self.default_key is None: self.default_key = lang # Datasets are expected to be in "split.lang" format (Eg: train.en) language_split = '{}.{}'.format(split, lang) path = os.path.join(self.args.data, language_split) if self.args.raw_text and IndexedRawTextDataset.exists(path): ds = IndexedRawTextDataset(path, self.dictionary) elif not self.args.raw_text and IndexedDataset.exists(path): if self.args.lazy_load: ds = IndexedDataset(path, fix_lua_indexing=True) else: ds = IndexedCachedDataset(path, fix_lua_indexing=True) else: raise FileNotFoundError('Dataset not found: {} ({})'.format( language_split, self.args.data)) # Since we append each block with the classification_token, # we need to effectively create blocks of length # tokens_per_sample-1 block_dataset = TokenBlockDataset( dataset=ds, sizes=ds.sizes, block_size=self.args.tokens_per_sample - 1, pad=self.dictionary.pad(), eos=self.dictionary.eos()) dataset_map[lang] = MaskedLMDataset( dataset=block_dataset, sizes=block_dataset.sizes, vocab=self.dictionary, pad_idx=self.dictionary.pad(), mask_idx=self.dictionary.mask(), classif_token_idx=self.dictionary.eos(), sep_token_idx=self.dictionary.eos(), shuffle=getattr(self.args, 'shuffle', False), has_pairs=False, segment_id=self.langs2id[lang], seed=self.seed, ) self.datasets[split] = MultiCorpusSampledDataset( dataset_map, default_key=self.default_key) print('| {} {} {} examples'.format(self.args.data, split, len(self.datasets[split])))
def load_dataset(self, split, epoch=0, combine=False, **kwargs): """Load a given dataset split. Args: split (str): name of the split (e.g., train, valid, test) """ paths = utils.split_paths(self.args.data) assert len(paths) > 0 data_path = paths[epoch % len(paths)] split_path = os.path.join(data_path, split) dataset = data_utils.load_indexed_dataset( split_path, self.dictionary, self.args.dataset_impl, combine=combine, ) if dataset is None: raise FileNotFoundError('Dataset not found: {} ({})'.format( split, split_path)) dataset = maybe_shorten_dataset( dataset, split, self.args.shorten_data_split_list, self.args.shorten_method, self.args.tokens_per_sample, self.args.seed, ) # create continuous blocks of tokens. block_size=511或者512 dataset = TokenBlockDataset( dataset, dataset.sizes, self.args.tokens_per_sample, pad=self.source_dictionary.pad(), eos=self.source_dictionary.eos(), break_mode=self.args.sample_break_mode, ) logger.info('loaded {} blocks from: {}'.format(len(dataset), split_path)) s2s_dataset = MaskedLanguagePairDataset.apply_mask( dataset, dataset.sizes, self.source_dictionary, shuffle=True, mask_prob=self.args.mask_prob, leave_unmasked_prob=self.args.leave_unmasked_prob, random_token_prob=self.args.random_token_prob, ) self.datasets[split] = s2s_dataset
def _load_single_lang_dataset(self, split, epoch): loaded_datasets = [] paths = self.args.data.split(":") assert len(paths) > 0 data_path = paths[epoch % len(paths)] for k in itertools.count(): split_k = split + (str(k) if k > 0 else "") path = os.path.join(data_path, split_k) ds = data_utils.load_indexed_dataset( path, self.dictionary, self.args.dataset_impl ) if ds is None: if k > 0: break else: raise FileNotFoundError( "Dataset not found: {} ({})".format(split, data_path) ) # Since we append each block with the classification_token, # we need to effectively create blocks of length # tokens_per_sample-1 loaded_datasets.append( TokenBlockDataset( ds, ds.sizes, self.args.tokens_per_sample - 1, pad=self.dictionary.pad(), eos=self.dictionary.eos(), ) ) print( "| {} {} {} examples".format( data_path, split_k, len(loaded_datasets[-1]) ) ) if len(loaded_datasets) == 1: dataset = loaded_datasets[0] sizes = dataset.sizes else: dataset = ConcatDataset(loaded_datasets) sizes = np.concatenate([ds.sizes for ds in loaded_datasets]) return dataset, sizes
def load_dataset_ordering(self, input_ordered_file, input_shuffled_file): """Load a given dataset split. Args: split (str): name of the split (e.g., train, valid, test) """ loaded_datasets = [] assert self.args.raw_text and IndexedRawTextDataset.exists( input_shuffled_file) ds = IndexedRawTextDataset(input_shuffled_file, self.dictionary) tokens = [t for l in ds.tokens_list for t in l] loaded_datasets.append( TokenBlockDataset( tokens, ds.sizes, self.args.tokens_per_sample, pad=self.dictionary.pad(), eos=self.dictionary.eos(), break_mode=self.args.sample_break_mode, include_targets=True, )) print('| {} {} examples'.format(input_shuffled_file, len(loaded_datasets[-1]))) # if not combine: # break assert len(loaded_datasets) == 1 dataset = loaded_datasets[0] sizes = dataset.sizes add_eos_for_other_targets = self.args.sample_break_mode is not None and self.args.sample_break_mode != 'none' self.datasets['test'] = MonolingualDataset( dataset, sizes, self.dictionary, self.output_dictionary, add_eos_for_other_targets=add_eos_for_other_targets, shuffle=False, targets=self.targets, )
def load_dataset(self, split, epoch=0, combine=False, **kwargs): """Load a given dataset split. Args: split (str): name of the split (e.g., train, valid, test) """ print("This is the split", split) from fairseq.data.cvit.utils import monoling_select dataset = monoling_select(self.data['corpora'], split) from ilmulti.sentencepiece import SentencePieceTokenizer hard_code_dict = self.data['hard_coded_dict'] tokenizer = SentencePieceTokenizer(hard_code_dict) dataset = CVITIndexedRawTextDataset(dataset, tokenizer, self.dictionary) if dataset is None: raise FileNotFoundError('Dataset not found: {} ({})'.format( split, split_path)) dataset = TokenBlockDataset( dataset, dataset.sizes, self.args.tokens_per_sample, pad=self.dictionary.pad(), eos=self.dictionary.eos(), break_mode=self.args.sample_break_mode, include_targets=True, ) add_eos_for_other_targets = self.args.sample_break_mode is not None and self.args.sample_break_mode != 'none' self.datasets[split] = MonolingualDataset( dataset, dataset.sizes, self.dictionary, self.output_dictionary, add_eos_for_other_targets=add_eos_for_other_targets, shuffle=True, targets=self.targets, add_bos_token=self.args.add_bos_token, )
def load_dataset(self, split): """Load a dataset split.""" path = os.path.join(self.args.data, split) if self.args.raw_text and IndexedRawTextDataset.exists(path): ds = IndexedRawTextDataset(path, self.dictionary) tokens = ds.tokens_list elif not self.args.raw_text and IndexedInMemoryDataset.exists(path): ds = IndexedInMemoryDataset(path, fix_lua_indexing=True) tokens = ds.buffer else: raise FileNotFoundError('Dataset not found: {} ({})'.format(split, self.args.data)) dataset = TokenBlockDataset( tokens, ds.sizes, self.args.tokens_per_sample, self.args.sample_break_mode, include_targets=True, # return next tokens as targets ) self.datasets[split] = MonolingualDataset(dataset, dataset.sizes, self.dictionary, shuffle=False)
def _load_dataset_split(self, split, epoch, combine): paths = utils.split_paths(self.cfg.data) assert len(paths) > 0 data_path = paths[(epoch - 1) % len(paths)] split_path = os.path.join(data_path, split) dataset = data_utils.load_indexed_dataset( split_path, self.dictionary, self.cfg.dataset_impl, combine=combine, ) if dataset is None: raise FileNotFoundError( "Dataset not found: {} ({})".format(split, split_path) ) dataset = StripTokenDataset(dataset, self.dictionary.eos()) dataset = maybe_shorten_dataset( dataset, split, self.cfg.shorten_data_split_list, self.cfg.shorten_method, self.cfg.tokens_per_sample, self.cfg.seed, ) # create continuous blocks of tokens dataset = TokenBlockDataset( dataset, dataset.sizes, self.cfg.tokens_per_sample - 2, # one less for <s> and one for </s> pad=self.dictionary.pad(), eos=self.dictionary.eos(), break_mode=self.cfg.sample_break_mode, document_sep_len=0, ) logger.info("loaded {} blocks from: {}".format(len(dataset), split_path)) # prepend beginning-of-sentence token (<s>, equiv. to [CLS] in BERT) dataset = PrependTokenDataset(dataset, self.source_dictionary.bos()) dataset = AppendTokenDataset(dataset, self.source_dictionary.eos()) return dataset
def _load_single_lang_dataset(self, split): loaded_datasets = [] for k in itertools.count(): split_k = split + (str(k) if k > 0 else '') path = os.path.join(self.args.data, split_k) if self.args.raw_text and IndexedRawTextDataset.exists(path): ds = IndexedRawTextDataset(path, self.dictionary) elif not self.args.raw_text and IndexedDataset.exists(path): if self.args.lazy_load: ds = IndexedDataset(path, fix_lua_indexing=True) else: ds = IndexedCachedDataset(path, fix_lua_indexing=True) else: if k > 0: break else: raise FileNotFoundError( 'Dataset not found: {} ({})'.format( split, self.args.data)) # Since we append each block with the classification_token, # we need to effectively create blocks of length # tokens_per_sample-1 loaded_datasets.append( TokenBlockDataset( ds, ds.sizes, self.args.tokens_per_sample - 1, pad=self.dictionary.pad(), eos=self.dictionary.eos(), )) print('| {} {} {} examples'.format(self.args.data, split_k, len(loaded_datasets[-1]))) if len(loaded_datasets) == 1: dataset = loaded_datasets[0] sizes = dataset.sizes else: dataset = ConcatDataset(loaded_datasets) sizes = np.concatenate([ds.sizes for ds in loaded_datasets]) return dataset, sizes
def build_s2s_dataset(self, dataset): dataset = TokenBlockDataset( dataset, dataset.sizes, self.args.tokens_per_sample, pad=self.source_dictionary.pad(), eos=self.source_dictionary.eos(), break_mode=self.args.sample_break_mode, ) pred_probs = torch.FloatTensor([float(x) for x in self.args.mask_s2s_mask_keep_rand.split(',')]) s2s_dataset = MaskedLanguagePairDataset( dataset, dataset.sizes, self.source_dictionary, shuffle=True, mask_prob=self.args.mask_s2s_prob, pred_probs=pred_probs, ) return s2s_dataset