def get_batch_iterator(
    self,
    dataset,
    max_tokens=None,
    max_sentences=None,
    max_positions=None,
    ignore_invalid_inputs=False,
    required_batch_size_multiple=1,
    seed=1,
    num_shards=1,
    shard_id=0,
    num_workers=0,
    epoch=0,
):
    """
    Build an ``EpochBatchIterator`` over *dataset*.

    Indices are taken from ``dataset.ordered_indices()`` (drawn under
    ``data_utils.numpy_seed(seed)``), filtered with
    ``data_utils.filter_by_size`` and grouped with
    ``data_utils.batch_by_size``.

    Args:
        dataset (FairseqDataset): dataset to batch.
        max_tokens (int, optional): forwarded to ``batch_by_size``
            (default: None).
        max_sentences (int, optional): forwarded to ``batch_by_size``
            (default: None).
        max_positions (optional): accepted but not used; the value is
            replaced by ``(10240, 1024)`` before filtering.
        ignore_invalid_inputs (bool, optional): when False the size filter
            is asked to raise on invalid examples (default: False).
        required_batch_size_multiple (int, optional): forwarded to
            ``batch_by_size`` (default: 1).
        seed (int, optional): seed used while ordering indices and handed
            to the iterator (default: 1).
        num_shards (int, optional): forwarded to the iterator (default: 1).
        shard_id (int, optional): forwarded to the iterator (default: 0).
        num_workers (int, optional): forwarded to the iterator (default: 0).
        epoch (int, optional): forwarded to the iterator (default: 0).

    Returns:
        iterators.EpochBatchIterator: iterator over the batched dataset.
    """
    assert isinstance(dataset, FairseqDataset)

    # NOTE(review): the caller-supplied ``max_positions`` is discarded and
    # replaced by this fixed limit -- confirm this override is intentional.
    max_positions = (10240, 1024)

    # Ordering may be randomized by the dataset, so pin the numpy seed.
    with data_utils.numpy_seed(seed):
        ordered = dataset.ordered_indices()

    # Drop (or raise on) examples exceeding the fixed position limit.
    raise_on_invalid = not ignore_invalid_inputs
    kept = data_utils.filter_by_size(
        ordered,
        dataset.size,
        max_positions,
        raise_exception=raise_on_invalid,
    )

    # Group the surviving indices into mini-batches.
    batches = data_utils.batch_by_size(
        kept,
        dataset.num_tokens,
        max_tokens=max_tokens,
        max_sentences=max_sentences,
        required_batch_size_multiple=required_batch_size_multiple,
    )

    iterator_kwargs = dict(
        dataset=dataset,
        collate_fn=dataset.collater,
        batch_sampler=batches,
        seed=seed,
        num_shards=num_shards,
        shard_id=shard_id,
        num_workers=num_workers,
        epoch=epoch,
    )
    return iterators.EpochBatchIterator(**iterator_kwargs)
def make_dynamic_sampler(self, max_tokens_per_batch=1024, **kwargs):
    """
    Build a shuffled list of token-budgeted batches of example indices.

    Indices come from ``self.make_sortish_sampler(1024, shuffle=False)`` and
    are grouped by the externally provided ``batch_by_size`` so that each
    batch respects ``max_tokens_per_batch``. The batches are shuffled with
    ``np.random.permutation`` and the batch with the largest approximate
    padded size is swapped to position 0.

    Args:
        max_tokens_per_batch (int, optional): token budget passed to
            ``batch_by_size`` as ``max_tokens`` (default: 1024).
        **kwargs: accepted and ignored.

    Returns:
        list: batches (each a collection of example indices), largest first.
    """
    assert FAIRSEQ_AVAILABLE, "Dynamic batch size requires `pip install fairseq`"
    assert not self.used_char_len, "You must call python make_len_file.py before calling make_dynamic_sampler"

    length_sorted = list(self.make_sortish_sampler(1024, shuffle=False))

    def capped_source_length(idx):
        # NOTE(review): source lengths are capped by ``max_target_length``
        # here -- confirm that this is the intended bound.
        return min(self.src_lens[idx], self.max_target_length)

    # Delegates to the fairseq (cython) batching routine.
    batches = batch_by_size(
        length_sorted,
        num_tokens_fn=capped_source_length,
        max_tokens=max_tokens_per_batch,
        required_batch_size_multiple=64,
    )

    order = np.random.permutation(range(len(batches)))
    shuffled = [batches[pos] for pos in order]

    # Approximate padded size of a batch: longest source times batch size.
    padded_sizes = []
    for batch in shuffled:
        longest = max(self.src_lens[idx] for idx in batch)
        padded_sizes.append(longest * len(batch))

    # Put the heaviest batch first so an out-of-memory failure shows up early.
    heaviest = np.argmax(padded_sizes)
    shuffled[0], shuffled[heaviest] = shuffled[heaviest], shuffled[0]
    return shuffled
def construct_batch_sampler(dataset, epoch):
    """
    Order, optionally filter, and batch *dataset*, logging step timings.

    This function relies on names from its enclosing scope that are not
    defined here: ``self``, ``max_positions``, ``ignore_invalid_inputs``,
    ``max_tokens``, ``max_sentences``, ``logger`` and ``get_time_gap``.

    Args:
        dataset: dataset to batch; ``set_epoch`` is called on it when
            *epoch* is not None.
        epoch: epoch passed to ``dataset.set_epoch``, or None to skip it.

    Returns:
        The value returned by ``data_utils.batch_by_size``.
    """
    # Name of the first split in ``self.datasets`` equal to this dataset;
    # used only to tag the debug messages.
    matches = [name for name, candidate in self.datasets.items() if candidate == dataset]
    split = matches[0] if matches else None

    if epoch is not None:
        dataset.set_epoch(epoch)

    start_time = time.time()

    indices = dataset.ordered_indices()
    order_gap = get_time_gap(start_time, time.time())
    logger.debug(f'[{split}] @batch_sampler order indices time: {order_gap}')

    # Filtering is skipped entirely when no position limit is configured.
    if max_positions is not None:
        filter_start = time.time()
        indices = data_utils.filter_by_size(
            indices,
            dataset,
            max_positions,
            raise_exception=(not ignore_invalid_inputs),
        )
        filter_gap = get_time_gap(filter_start, time.time())
        logger.debug(f'[{split}] @batch_sampler filter_by_size time: {filter_gap}')

    batch_start = time.time()
    batch_sampler = data_utils.batch_by_size(
        indices,
        dataset.num_tokens,
        max_tokens=max_tokens,
        max_sentences=max_sentences,
    )
    batch_gap = get_time_gap(batch_start, time.time())
    logger.debug(f'[{split}] @batch_sampler batch_by_size time: {batch_gap}')

    total_gap = get_time_gap(start_time, time.time())
    logger.debug(f'[{split}] per epoch batch_sampler set-up time: {total_gap}')
    return batch_sampler
def get_batch_iterator(
    self,
    dataset,
    max_tokens=None,
    max_sentences=None,
    max_positions=None,
    ignore_invalid_inputs=False,
    required_batch_size_multiple=1,
    seed=1,
    num_shards=1,
    shard_id=0,
):
    """
    Create a sharded iterator that yields batches from *dataset*.

    Args:
        dataset (~fairseq.data.FairseqDataset): dataset to batch
        max_tokens (int, optional): upper bound on tokens per batch.
            Default: ``None``
        max_sentences (int, optional): upper bound on sentences per batch.
            Default: ``None``
        max_positions (optional): longest sentence the model supports;
            passed to the size filter. Default: ``None``
        ignore_invalid_inputs (bool, optional): if set, over-long sentences
            do not raise an Exception. Default: ``False``
        required_batch_size_multiple (int, optional): batch sizes must be
            a multiple of N. Default: ``1``
        seed (int, optional): random seed used for reproducibility.
            Default: ``1``
        num_shards (int, optional): number of shards to split the iterator
            into. Default: ``1``
        shard_id (int, optional): shard of the iterator to return.
            Default: ``0``

    Returns:
        ~fairseq.iterators.EpochBatchIterator: a batched iterator over the
        given dataset split
    """
    assert isinstance(dataset, FairseqDataset)

    # Index ordering is produced under a fixed numpy seed.
    with data_utils.numpy_seed(seed):
        ordered = dataset.ordered_indices()

    # Remove (or raise on) examples that exceed ``max_positions``.
    raise_on_invalid = not ignore_invalid_inputs
    kept = data_utils.filter_by_size(
        ordered,
        dataset.size,
        max_positions,
        raise_exception=raise_on_invalid,
    )

    # Form mini-batches under the token / sentence constraints.
    batches = data_utils.batch_by_size(
        kept,
        dataset.num_tokens,
        max_tokens=max_tokens,
        max_sentences=max_sentences,
        required_batch_size_multiple=required_batch_size_multiple,
    )

    iterator_kwargs = dict(
        dataset=dataset,
        collate_fn=dataset.collater,
        batch_sampler=batches,
        seed=seed,
        num_shards=num_shards,
        shard_id=shard_id,
    )
    return iterators.EpochBatchIterator(**iterator_kwargs)
def get_epoch_iterator(task, dataset, max_tokens=None, max_sentences=None,
                       max_positions=None, ignore_invalid_inputs=False,
                       required_batch_size_multiple=1, num_workers=0,
                       seed=215, num_shards=1, shard_id=0, epoch=0):
    """
    Get (and cache on *task*) an iterator that yields batches of *dataset*.

    The iterator is stored in ``task.dataset_to_epoch_iter`` keyed by the
    dataset; later calls with the same dataset return the stored iterator
    and ignore every other argument.

    Args:
        task: object exposing a ``dataset_to_epoch_iter`` mapping.
        dataset (FairseqDataset): dataset to batch.
        max_tokens (int, optional): max number of tokens in each batch
            (default: None).
        max_sentences (int, optional): max number of sentences in each
            batch (default: None).
        max_positions (optional): size limit passed to the size filter;
            filtering is skipped when None (default: None).
        ignore_invalid_inputs (bool, optional): don't raise for examples
            rejected by the size filter (default: False).
        required_batch_size_multiple (int, optional): require batch size to
            be a multiple of N (default: 1).
        num_workers (int, optional): forwarded to the iterator (default: 0).
        seed (int, optional): seed used while ordering indices and handed
            to the iterator (default: 215).
        num_shards (int, optional): forwarded to the iterator (default: 1).
        shard_id (int, optional): forwarded to the iterator (default: 0).
        epoch (int, optional): epoch set on the dataset and handed to the
            iterator (default: 0).

    Returns:
        iterators.EpochBatchIterator: the cached or newly built iterator.
    """
    if dataset in task.dataset_to_epoch_iter:
        return task.dataset_to_epoch_iter[dataset]

    # Validate before touching the dataset so a wrong type fails cleanly.
    assert isinstance(dataset, FairseqDataset)

    # initialize the dataset with the correct starting epoch
    dataset.set_epoch(epoch)

    # get indices ordered by example size (computed once; the ordering is
    # drawn under a fixed numpy seed)
    with data_utils.numpy_seed(seed):
        indices = dataset.ordered_indices()

    # filter examples that are too large
    if max_positions is not None:
        indices = data_utils.filter_by_size(
            indices,
            dataset,
            max_positions,
            raise_exception=(not ignore_invalid_inputs),
        )

    # create mini-batches with given size constraints
    batch_sampler = data_utils.batch_by_size(
        indices,
        dataset.num_tokens,
        max_tokens=max_tokens,
        max_sentences=max_sentences,
        required_batch_size_multiple=required_batch_size_multiple,
    )

    # NOTE(review): ``collate`` is resolved from the enclosing module, not
    # from the dataset -- confirm it is defined where this function lives.
    epoch_iter = iterators.EpochBatchIterator(
        dataset=dataset,
        collate_fn=collate,
        batch_sampler=batch_sampler,
        seed=seed,
        num_shards=num_shards,
        shard_id=shard_id,
        num_workers=num_workers,
        epoch=epoch,
    )
    task.dataset_to_epoch_iter[dataset] = epoch_iter
    return epoch_iter
def batch_by_size(
    self,
    indices,
    max_tokens=None,
    max_sentences=None,
    required_batch_size_multiple=1,
):
    """
    Group an ordered set of indices into batches.

    Batches are formed by ``data_utils.batch_by_size`` according to
    *max_tokens*, *max_sentences* and *required_batch_size_multiple*. When
    ``self.get_batch_shapes()`` returns shapes, each ``(bsz, num_tokens)``
    entry has its batch size resolved first and the result is passed on as
    ``fixed_shapes``.
    """
    from fairseq.data import data_utils

    fixed_shapes = self.get_batch_shapes()
    if fixed_shapes is not None:

        def resolve_bsz(bsz, num_tokens):
            # A missing batch size is derived from the token budget.
            if bsz is None:
                assert max_tokens is not None, "Must specify --max-tokens"
                bsz = max_tokens // num_tokens
            # A sentence cap takes precedence over rounding to a multiple.
            if max_sentences is not None:
                return min(bsz, max_sentences)
            if (
                bsz >= required_batch_size_multiple
                and bsz % required_batch_size_multiple != 0
            ):
                # Round down to the nearest required multiple.
                bsz -= bsz % required_batch_size_multiple
            return bsz

        resolved = [
            [resolve_bsz(bsz, num_tokens), num_tokens]
            for (bsz, num_tokens) in fixed_shapes
        ]
        fixed_shapes = np.array(resolved)

    # The vectorized token counts are optional; fall back to None when the
    # dataset does not implement them.
    try:
        num_tokens_vec = self.num_tokens_vec(indices).astype('int64')
    except NotImplementedError:
        num_tokens_vec = None

    return data_utils.batch_by_size(
        indices,
        num_tokens_fn=self.num_tokens,
        num_tokens_vec=num_tokens_vec,
        max_tokens=max_tokens,
        max_sentences=max_sentences,
        required_batch_size_multiple=required_batch_size_multiple,
        fixed_shapes=fixed_shapes,
    )
def get_batch_iterator(
    self,
    dataset,
    max_tokens=None,
    max_sentences=None,
    max_positions=None,
    ignore_invalid_inputs=False,
    required_batch_size_multiple=1,
    seed=1,
    num_shards=1,
    shard_id=0,
    num_workers=0,
):
    """
    Create a sharded iterator that yields batches from *dataset*.

    Args:
        dataset (~fairseq.data.FairseqDataset): dataset to batch
        max_tokens (int, optional): upper bound on tokens per batch
            (default: None).
        max_sentences (int, optional): upper bound on sentences per batch
            (default: None).
        max_positions (optional): longest sentence the model supports;
            passed to the size filter (default: None).
        ignore_invalid_inputs (bool, optional): if set, over-long sentences
            do not raise an Exception (default: False).
        required_batch_size_multiple (int, optional): batch sizes must be
            a multiple of N (default: 1).
        seed (int, optional): random seed used for reproducibility
            (default: 1).
        num_shards (int, optional): number of shards to split the iterator
            into (default: 1).
        shard_id (int, optional): shard of the iterator to return
            (default: 0).
        num_workers (int, optional): number of data-loading subprocesses;
            0 loads data in the main process (default: 0).

    Returns:
        ~fairseq.iterators.EpochBatchIterator: a batched iterator over the
        given dataset split
    """
    assert isinstance(dataset, FairseqDataset)

    # Index ordering is produced under a fixed numpy seed.
    with data_utils.numpy_seed(seed):
        ordered = dataset.ordered_indices()

    # Remove (or raise on) examples that exceed ``max_positions``.
    raise_on_invalid = not ignore_invalid_inputs
    kept = data_utils.filter_by_size(
        ordered,
        dataset.size,
        max_positions,
        raise_exception=raise_on_invalid,
    )

    # Form mini-batches under the token / sentence constraints.
    batches = data_utils.batch_by_size(
        kept,
        dataset.num_tokens,
        max_tokens=max_tokens,
        max_sentences=max_sentences,
        required_batch_size_multiple=required_batch_size_multiple,
    )

    iterator_kwargs = dict(
        dataset=dataset,
        collate_fn=dataset.collater,
        batch_sampler=batches,
        seed=seed,
        num_shards=num_shards,
        shard_id=shard_id,
        num_workers=num_workers,
    )
    return iterators.EpochBatchIterator(**iterator_kwargs)
def get_batch_iterator(
    self,
    dataset,
    max_tokens=None,
    max_sentences=None,
    max_positions=None,
    ignore_invalid_inputs=False,
    required_batch_size_multiple=1,
    seed=1,
    num_shards=1,
    shard_id=0,
    num_workers=0,
    epoch=0,
):
    """
    Create (and cache) a sharded iterator that yields batches of *dataset*.

    Progress is reported on stdout at each stage. The iterator is stored in
    ``self.dataset_to_epoch_iter`` keyed by the dataset, and later calls
    with the same dataset return the stored iterator.

    Args:
        dataset (~fairseq.data.FairseqDataset): dataset to batch
        max_tokens (int, optional): upper bound on tokens per batch
            (default: None).
        max_sentences (int, optional): upper bound on sentences per batch
            (default: None).
        max_positions (optional): longest sentence the model supports;
            size filtering is skipped when None (default: None).
        ignore_invalid_inputs (bool, optional): if set, over-long sentences
            do not raise an Exception (default: False).
        required_batch_size_multiple (int, optional): batch sizes must be
            a multiple of N (default: 1).
        seed (int, optional): random seed used for reproducibility
            (default: 1).
        num_shards (int, optional): number of shards to split the iterator
            into (default: 1).
        shard_id (int, optional): shard of the iterator to return
            (default: 0).
        num_workers (int, optional): number of data-loading subprocesses;
            0 loads data in the main process (default: 0).
        epoch (int, optional): epoch the iterator starts from (default: 0).

    Returns:
        ~fairseq.iterators.EpochBatchIterator: a batched iterator over the
        given dataset split
    """
    print("| At task.get_batch_iterator ...", flush=True)

    # The default task reuses one iterator per dataset across epochs because
    # its datasets are not dynamic; task subclasses may override this.
    cache = self.dataset_to_epoch_iter
    if dataset in cache:
        return cache[dataset]

    assert isinstance(dataset, FairseqDataset)

    # Tell the dataset which epoch it starts from.
    dataset.set_epoch(epoch)

    # Index ordering is produced under a fixed numpy seed.
    with data_utils.numpy_seed(seed):
        indices = dataset.ordered_indices()
    print("| At task.get_batch_iterator, indices ordered ... ", flush=True)

    # Remove (or raise on) examples that exceed ``max_positions``.
    if max_positions is not None:
        indices = data_utils.filter_by_size(
            indices,
            dataset,
            max_positions,
            raise_exception=(not ignore_invalid_inputs),
        )
    print("| At task.get_batch_iterator, examples filtered ... ", flush=True)

    # Form mini-batches under the token / sentence constraints.
    batches = data_utils.batch_by_size(
        indices,
        dataset.num_tokens,
        max_tokens=max_tokens,
        max_sentences=max_sentences,
        required_batch_size_multiple=required_batch_size_multiple,
    )
    print("| At task.get_batch_iterator, batch_sampler created ... ", flush=True)

    built = iterators.EpochBatchIterator(
        dataset=dataset,
        collate_fn=dataset.collater,
        batch_sampler=batches,
        seed=seed,
        num_shards=num_shards,
        shard_id=shard_id,
        num_workers=num_workers,
        epoch=epoch,
    )
    self.dataset_to_epoch_iter[dataset] = built
    print("| At task.get_batch_iterator, iterator created ... ", flush=True)
    return built
def get_batch_iterator(
    self,
    dataset,
    max_tokens=None,
    max_sentences=None,
    max_positions=None,
    ignore_invalid_inputs=False,
    required_batch_size_multiple=1,
    seed=1,
    num_shards=1,
    shard_id=0,
    num_workers=0,
    epoch=1,
    data_buffer_size=0,
    disable_iterator_cache=False,
):
    """
    Build a ``MultidatasetEpochBatchIterator`` over several datasets.

    Each dataset in *dataset* is ordered, optionally filtered and batched
    independently; the per-key batch samplers are handed to the iterator.

    Args:
        dataset (OrderedDict): non-empty mapping of key to dataset; the
            first value must be a ``FairseqDataset``.
        max_tokens (int, optional): forwarded to ``batch_by_size``
            (default: None).
        max_sentences (int, optional): forwarded to ``batch_by_size``
            (default: None).
        max_positions (optional): limit passed to each dataset's
            ``filter_indices_by_size``; filtering is skipped when None
            (default: None).
        ignore_invalid_inputs (bool, optional): not used by this
            implementation (default: False).
        required_batch_size_multiple (int, optional): forwarded to
            ``batch_by_size`` (default: 1).
        seed (int, optional): combined with *epoch* to seed the ordering,
            and forwarded to the iterator (default: 1).
        num_shards (int, optional): forwarded to the iterator (default: 1).
        shard_id (int, optional): forwarded to the iterator (default: 0).
        num_workers (int, optional): forwarded to the iterator (default: 0).
        epoch (int, optional): epoch set on every dataset and forwarded to
            the iterator (default: 1).
        data_buffer_size (int, optional): not used by this implementation
            (default: 0).
        disable_iterator_cache (bool, optional): not used by this
            implementation (default: False).

    Returns:
        MultidatasetEpochBatchIterator: iterator over all datasets.
    """
    assert isinstance(dataset, OrderedDict)
    assert len(dataset)
    first_key = next(iter(dataset))
    assert isinstance(dataset[first_key], FairseqDataset)

    # Tell every dataset which epoch it starts from.
    for dt in dataset.values():
        dt.set_epoch(epoch)

    indices = OrderedDict()
    batch_sampler = OrderedDict()

    # The ordering seed changes with the epoch.
    with data_utils.numpy_seed(seed + epoch):
        for key, dt in dataset.items():
            logger.info(f"\t ordered_indices {key}")
            indices[key] = dt.ordered_indices()

    # Drop examples that are too large; the rejected indices are discarded.
    if max_positions is not None:
        for key, dt in dataset.items():
            logger.info(f"\t filter_by_size {key}")
            kept, _rejected = dt.filter_indices_by_size(indices[key], max_positions)
            indices[key] = kept

    for key, dt in dataset.items():
        logger.info(f"\t batch_by_size {key}")
        batch_sampler[key] = data_utils.batch_by_size(
            indices[key],
            dt.num_tokens,
            max_tokens=max_tokens,
            max_sentences=max_sentences,
            required_batch_size_multiple=required_batch_size_multiple,
        )

    return MultidatasetEpochBatchIterator(
        dataset=dataset,
        batch_sampler=batch_sampler,
        seed=seed,
        num_shards=num_shards,
        shard_id=shard_id,
        num_workers=num_workers,
        epoch=epoch,
    )
def get_batch_iterator(self, dataset, assistant=None, max_tokens=None,
                       max_sentences=None, max_positions=None,
                       ignore_invalid_inputs=False,
                       required_batch_size_multiple=1, seed=1, num_shards=1,
                       shard_id=0, batch_method='sentences'):
    """
    Create a sharded iterator that yields batches from *dataset*.

    Without an *assistant*, batches are precomputed and wrapped in an
    ``EpochBatchIterator``. With one, the filtered indices are registered
    through ``assistant.associate_data`` and an
    ``AssistantEpochBatchIterator`` is returned instead.

    Args:
        dataset (~fairseq.data.FairseqDataset): dataset to batch
        assistant (optional): object exposing
            ``associate_data(dataset, indices)``; selects the
            assistant-driven iterator when given. Default: ``None``
        max_tokens (int, optional): upper bound on tokens per batch.
            Default: ``None``
        max_sentences (int, optional): upper bound on sentences per batch.
            Default: ``None``
        max_positions (optional): longest sentence the model supports;
            passed to the size filter. Default: ``None``
        ignore_invalid_inputs (bool, optional): if set, over-long sentences
            do not raise an Exception. Default: ``False``
        required_batch_size_multiple (int, optional): batch sizes must be
            a multiple of N. Default: ``1``
        seed (int, optional): random seed used for reproducibility.
            Default: ``1``
        num_shards (int, optional): number of shards to split the iterator
            into. Default: ``1``
        shard_id (int, optional): shard of the iterator to return.
            Default: ``0``
        batch_method (str, optional): forwarded to
            ``AssistantEpochBatchIterator``; unused without an assistant.
            Default: ``'sentences'``

    Returns:
        ~fairseq.iterators.EpochBatchIterator or
        ~fairseq.iterators.AssistantEpochBatchIterator: a batched iterator
        over the given dataset split
    """
    assert isinstance(dataset, FairseqDataset)

    # Index ordering is produced under a fixed numpy seed.
    with data_utils.numpy_seed(seed):
        indices = dataset.ordered_indices()

    # Remove (or raise on) examples that exceed ``max_positions``.
    indices = data_utils.filter_by_size(
        indices,
        dataset.size,
        max_positions,
        raise_exception=(not ignore_invalid_inputs),
    )

    if assistant is None:
        # Plain path: precompute the mini-batches up front.
        batch_sampler = data_utils.batch_by_size(
            indices,
            dataset.num_tokens,
            max_tokens=max_tokens,
            max_sentences=max_sentences,
            required_batch_size_multiple=required_batch_size_multiple,
        )
        return iterators.EpochBatchIterator(
            dataset=dataset,
            collate_fn=dataset.collater,
            batch_sampler=batch_sampler,
            seed=seed,
            num_shards=num_shards,
            shard_id=shard_id,
        )

    # Assistant path: hand over the indices; the size constraints are passed
    # to the iterator rather than applied here.
    assistant.associate_data(dataset, indices)
    return iterators.AssistantEpochBatchIterator(
        dataset=dataset,
        collate_fn=dataset.collater,
        assistant=assistant,
        max_tokens=max_tokens,
        max_sentences=max_sentences,
        required_batch_size_multiple=required_batch_size_multiple,
        shard_num=num_shards,
        shard_id=shard_id,
        batch_method=batch_method,
        seed=seed,
    )
def get_batch_iterator(
    self,
    dataset,
    max_tokens=None,
    max_sentences=None,
    max_positions=None,
    ignore_invalid_inputs=False,
    required_batch_size_multiple=1,
    seed=1,
    num_shards=1,
    shard_id=0,
    num_workers=0,
    epoch=0,
):
    """
    Create a sharded iterator over *dataset* and log how long setup took.

    No size filtering is performed here: *max_positions* and
    *ignore_invalid_inputs* are accepted but not used.

    Args:
        dataset (~fairseq.data.FairseqDataset): dataset to batch
        max_tokens (int, optional): upper bound on tokens per batch
            (default: None).
        max_sentences (int, optional): upper bound on sentences per batch
            (default: None).
        max_positions (optional): unused by this implementation
            (default: None).
        ignore_invalid_inputs (bool, optional): unused by this
            implementation (default: False).
        required_batch_size_multiple (int, optional): batch sizes must be
            a multiple of N (default: 1).
        seed (int, optional): random seed used for reproducibility
            (default: 1).
        num_shards (int, optional): number of shards to split the iterator
            into (default: 1).
        shard_id (int, optional): shard of the iterator to return
            (default: 0).
        num_workers (int, optional): number of data-loading subprocesses;
            0 loads data in the main process (default: 0).
        epoch (int, optional): epoch the iterator starts from (default: 0).

    Returns:
        ~fairseq.iterators.EpochBatchIterator: a batched iterator over the
        given dataset split
    """
    assert isinstance(dataset, FairseqDataset)

    # Stage 1: tell the dataset which epoch it starts from.
    overall_start = time.time()
    dataset.set_epoch(epoch)
    set_epoch_time = time.time() - overall_start

    # Stage 2: order the indices; seed and epoch are both given to the
    # seeding context.
    sort_start = time.time()
    with data_utils.numpy_seed(seed, epoch):
        indices = dataset.ordered_indices()
    sort_time = time.time() - sort_start

    # Stage 3: form mini-batches under the size constraints.
    batch_start = time.time()
    batches = data_utils.batch_by_size(
        indices,
        dataset.num_tokens,
        max_tokens=max_tokens,
        max_sentences=max_sentences,
        required_batch_size_multiple=required_batch_size_multiple,
    )
    batch_by_size_time = time.time() - batch_start

    timing_fields = (
        seed,
        epoch,
        num_shards,
        time.time() - overall_start,
        set_epoch_time,
        sort_time,
        batch_by_size_time,
    )
    logger.info(
        'get batch iterator [seed=%d, epoch=%d, num_shards=%d] is done in %.3f seconds '
        '(set epoch=%.3f, sorting=%.3f, batch by size=%.3f)' % timing_fields
    )

    return iterators.EpochBatchIterator(
        dataset=dataset,
        collate_fn=dataset.collater,
        batch_sampler=batches,
        seed=seed,
        num_shards=num_shards,
        shard_id=shard_id,
        num_workers=num_workers,
        epoch=epoch,
    )
def get_batch_iterator(
    self,
    dataset,
    max_tokens=None,
    max_sentences=None,
    max_positions=None,
    ignore_invalid_inputs=False,
    required_batch_size_multiple=1,
    seed=1,
    num_shards=1,
    shard_id=0,
    num_workers=0,
    epoch=0,
):
    """
    Create (and cache) a sharded iterator that yields batches of *dataset*.

    The iterator is stored in ``self.dataset_to_epoch_iter`` keyed by the
    dataset, and later calls with the same dataset return the stored
    iterator. Diagnostic information is printed to stdout while building.

    Args:
        dataset (~fairseq.data.FairseqDataset): dataset to batch
        max_tokens (int, optional): upper bound on tokens per batch
            (default: None).
        max_sentences (int, optional): upper bound on sentences per batch
            (default: None).
        max_positions (optional): longest sentence the model supports;
            size filtering is skipped when None (default: None).
        ignore_invalid_inputs (bool, optional): if set, over-long sentences
            do not raise an Exception (default: False).
        required_batch_size_multiple (int, optional): batch sizes must be
            a multiple of N (default: 1).
        seed (int, optional): random seed used for reproducibility
            (default: 1).
        num_shards (int, optional): number of shards to split the iterator
            into (default: 1).
        shard_id (int, optional): shard of the iterator to return
            (default: 0).
        num_workers (int, optional): number of data-loading subprocesses;
            0 loads data in the main process (default: 0).
        epoch (int, optional): epoch the iterator starts from (default: 0).

    Returns:
        ~fairseq.iterators.EpochBatchIterator: a batched iterator over the
        given dataset split
    """
    # The default task reuses one iterator per dataset across epochs because
    # its datasets are not dynamic; task subclasses may override this.
    # If the data is not swapped every epoch, the previously built epoch
    # iterator can be reused directly. The downside is that the same data is
    # then not re-randomized from epoch to epoch, which reduces global
    # randomness.
    cache = self.dataset_to_epoch_iter
    if dataset in cache:
        return cache[dataset]

    assert isinstance(dataset, FairseqDataset)

    # Tell the dataset which epoch it starts from.
    dataset.set_epoch(epoch)

    # Index ordering is produced under a fixed numpy seed; data is read in
    # this order to build the batches.
    with data_utils.numpy_seed(seed):
        indices = dataset.ordered_indices()

    # Remove indices whose length violates the limit (judged through the
    # dataset's size function).
    if max_positions is not None:
        indices = data_utils.filter_by_size(
            indices,
            dataset,
            max_positions,
            raise_exception=(not ignore_invalid_inputs),
        )
    print("indices length and type", len(indices), type(indices))

    # Form mini-batches under the size constraints. ``batches`` is expected
    # to be a list of lists, each holding the sentence ids of one batch.
    batches = data_utils.batch_by_size(
        indices,
        dataset.num_tokens,
        max_tokens=max_tokens,
        max_sentences=max_sentences,
        required_batch_size_multiple=required_batch_size_multiple,
    )
    print("batch_sampler length and type", len(batches), type(batches))

    built = iterators.EpochBatchIterator(
        dataset=dataset,
        collate_fn=dataset.collater,
        batch_sampler=batches,
        seed=seed,
        num_shards=num_shards,
        shard_id=shard_id,
        num_workers=num_workers,
        epoch=epoch,
    )
    self.dataset_to_epoch_iter[dataset] = built
    print(self.dataset_to_epoch_iter)
    return built
def get_batch_iterator(
    self,
    dataset,
    max_tokens=None,
    max_sentences=None,
    max_positions=None,
    ignore_invalid_inputs=False,
    required_batch_size_multiple=1,
    seed=1,
    num_shards=1,
    shard_id=0,
    num_workers=0,
    epoch=0,
    noskip=False,
    source_lang=None,
    target_lang=None,
    data_actor=None,
    trainer=None,
    data_filter_percentage=-1,
    filtered_maxpos_indices=None,
    dev_grad_dotprod=None,
):
    """
    Create a sharded iterator over *dataset*, optionally sub-selecting data.

    Args:
        dataset (~fairseq.data.FairseqDataset): dataset to batch
        max_tokens (int, optional): upper bound on tokens per batch
            (default: None).
        max_sentences (int, optional): upper bound on sentences per batch
            (default: None).
        max_positions (optional): longest sentence the model supports;
            size filtering is skipped when None (default: None).
        ignore_invalid_inputs (bool, optional): if set, over-long sentences
            do not raise an Exception (default: False).
        required_batch_size_multiple (int, optional): batch sizes must be
            a multiple of N (default: 1).
        seed (int, optional): random seed used for reproducibility
            (default: 1).
        num_shards (int, optional): number of shards to split the iterator
            into (default: 1).
        shard_id (int, optional): shard of the iterator to return
            (default: 0).
        num_workers (int, optional): number of data-loading subprocesses;
            0 loads data in the main process (default: 0).
        epoch (int, optional): epoch the iterator starts from (default: 0).
        noskip (bool, optional): forwarded to ``data_utils.filter_by_size``
            (default: False).
        source_lang (optional): not used by this implementation.
        target_lang (optional): not used by this implementation.
        data_actor (optional): passed to ``data_utils.filter_by_data_actor``
            when data selection is enabled (default: None).
        trainer (optional): passed to ``data_utils.filter_by_data_actor``
            (default: None).
        data_filter_percentage (optional): data selection runs only when
            this is greater than 0 (default: -1).
        filtered_maxpos_indices (optional): previously size-filtered
            indices; when given (and *max_positions* is set) they replace
            the freshly ordered indices and the size filter is not re-run
            (default: None).
        dev_grad_dotprod (optional): not used by this implementation.

    Returns:
        tuple: ``(epoch_iterator, filtered_maxpos_indices)`` where the first
        item is a ~fairseq.iterators.EpochBatchIterator over the dataset
        split and the second is the size-filtered index set (the value
        passed in, or the one computed here; None if no filtering ran).
    """
    assert isinstance(dataset, FairseqDataset)

    # Index ordering is produced under a fixed numpy seed.
    with data_utils.numpy_seed(seed):
        indices = dataset.ordered_indices()

    # Size filtering: reuse the caller's cached result when available,
    # otherwise compute it and remember it for the caller.
    if max_positions is not None:
        if filtered_maxpos_indices is not None:
            indices = filtered_maxpos_indices
        else:
            indices = data_utils.filter_by_size(
                indices,
                dataset,
                max_positions,
                raise_exception=(not ignore_invalid_inputs),
                noskip=noskip,
            )
            filtered_maxpos_indices = indices

    # Data selection: keep only a subset chosen through the data actor.
    if data_filter_percentage > 0:
        indices = data_utils.filter_by_data_actor(
            indices,
            dataset,
            data_actor,
            data_filter_percentage,
            trainer=trainer,
        )

    # Form mini-batches under the token / sentence constraints.
    batches = data_utils.batch_by_size(
        indices,
        dataset.num_tokens,
        max_tokens=max_tokens,
        max_sentences=max_sentences,
        required_batch_size_multiple=required_batch_size_multiple,
    )

    epoch_iter = iterators.EpochBatchIterator(
        dataset=dataset,
        collate_fn=dataset.collater,
        batch_sampler=batches,
        seed=seed,
        num_shards=num_shards,
        shard_id=shard_id,
        num_workers=num_workers,
        epoch=epoch,
    )
    return epoch_iter, filtered_maxpos_indices