def __init__(
    self,
    dataset,
    batch_sampler,
    seed=1,
    num_shards=1,
    shard_id=0,
    num_workers=0,
    epoch=1,
):
    """Create one ``EpochBatchIterator`` per entry of an ordered dataset mapping.

    Args:
        dataset (OrderedDict): non-empty mapping of key -> dataset. Only the
            first value is checked to be a ``FairseqDataset``.
        batch_sampler: container indexed by the same keys as *dataset*; the
            entry for a key becomes the batch sampler of that dataset.
        seed (int, optional): forwarded to every iterator (default: 1).
        num_shards (int, optional): forwarded to every iterator (default: 1).
        shard_id (int, optional): forwarded to every iterator (default: 0).
        num_workers (int, optional): accepted but not forwarded; every
            iterator is created with ``num_workers=0`` (default: 0).
        epoch (int, optional): stored on ``self.epoch`` and forwarded to
            every iterator (default: 1).
    """
    assert isinstance(dataset, OrderedDict)
    assert len(dataset)
    first_key = next(iter(dataset))
    assert isinstance(dataset[first_key], FairseqDataset)

    self.epoch = epoch

    # Options shared by all per-dataset iterators.  NOTE: num_workers is
    # pinned to 0 here regardless of the constructor argument.
    shared_options = dict(
        seed=seed,
        num_shards=num_shards,
        shard_id=shard_id,
        num_workers=0,
        epoch=epoch,
    )
    self.iterators = [
        iterators.EpochBatchIterator(
            dataset=split,
            collate_fn=split.collater,
            batch_sampler=batch_sampler[name],
            **shared_options,
        )
        for name, split in dataset.items()
    ]
def get_batch_iterator(
    self,
    dataset,
    max_tokens=None,
    max_sentences=None,
    max_positions=None,
    ignore_invalid_inputs=False,
    required_batch_size_multiple=1,
    seed=1,
    num_shards=1,
    shard_id=0,
    num_workers=0,
    epoch=0,
):
    """Order, filter and batch *dataset*, then wrap it in an ``EpochBatchIterator``.

    Args:
        dataset (FairseqDataset): dataset to batch.
        max_tokens (int, optional): forwarded to ``data_utils.batch_by_size``
            (default: None).
        max_sentences (int, optional): forwarded to
            ``data_utils.batch_by_size`` (default: None).
        max_positions (optional): ignored; the size filter always uses the
            fixed limit ``(10240, 1024)`` (default: None).
        ignore_invalid_inputs (bool, optional): when False the size filter is
            called with ``raise_exception=True`` (default: False).
        required_batch_size_multiple (int, optional): forwarded to
            ``data_utils.batch_by_size`` (default: 1).
        seed (int, optional): seeds the index ordering and the iterator
            (default: 1).
        num_shards (int, optional): forwarded to the iterator (default: 1).
        shard_id (int, optional): forwarded to the iterator (default: 0).
        num_workers (int, optional): forwarded to the iterator (default: 0).
        epoch (int, optional): forwarded to the iterator (default: 0).

    Returns:
        iterators.EpochBatchIterator: iterator over the batched dataset.
    """
    assert isinstance(dataset, FairseqDataset)

    # The caller-supplied max_positions is discarded in favour of this limit.
    max_positions = (10240, 1024)

    # Ordering is computed under a fixed numpy seed.
    with data_utils.numpy_seed(seed):
        ordered = dataset.ordered_indices()

    # Drop examples exceeding max_positions.
    must_raise = not ignore_invalid_inputs
    kept = data_utils.filter_by_size(
        ordered, dataset.size, max_positions, raise_exception=must_raise
    )

    # Group the surviving indices into mini-batches.
    batches = data_utils.batch_by_size(
        kept,
        dataset.num_tokens,
        max_tokens=max_tokens,
        max_sentences=max_sentences,
        required_batch_size_multiple=required_batch_size_multiple,
    )

    iterator_options = dict(
        dataset=dataset,
        collate_fn=dataset.collater,
        batch_sampler=batches,
        seed=seed,
        num_shards=num_shards,
        shard_id=shard_id,
        num_workers=num_workers,
        epoch=epoch,
    )
    return iterators.EpochBatchIterator(**iterator_options)
def get_epoch_iterator(task, dataset, max_tokens=None, max_sentences=None,
                       max_positions=None, ignore_invalid_inputs=False,
                       required_batch_size_multiple=1, num_workers=0, seed=215,
                       num_shards=1, shard_id=0, epoch=0):
    """
    Get an iterator that yields batches of data from the given dataset.

    The iterator is cached on ``task.dataset_to_epoch_iter``; a later call
    with the same *dataset* returns the cached iterator and ignores every
    other argument.

    Args:
        task: object exposing a ``dataset_to_epoch_iter`` mapping used as cache.
        dataset (FairseqDataset): dataset to batch.
        max_tokens (int, optional): forwarded to ``data_utils.batch_by_size``
            (default: None).
        max_sentences (int, optional): forwarded to
            ``data_utils.batch_by_size`` (default: None).
        max_positions (optional): size limit for ``data_utils.filter_by_size``;
            filtering is skipped when None (default: None).
        ignore_invalid_inputs (bool, optional): when False the size filter is
            called with ``raise_exception=True`` (default: False).
        required_batch_size_multiple (int, optional): forwarded to
            ``data_utils.batch_by_size`` (default: 1).
        num_workers (int, optional): forwarded to the iterator (default: 0).
        seed (int, optional): seeds the index ordering and the iterator
            (default: 215).
        num_shards (int, optional): forwarded to the iterator (default: 1).
        shard_id (int, optional): forwarded to the iterator (default: 0).
        epoch (int, optional): passed to ``dataset.set_epoch`` and to the
            iterator (default: 0).

    Returns:
        iterators.EpochBatchIterator: the (possibly cached) batch iterator.
    """
    if dataset in task.dataset_to_epoch_iter:
        return task.dataset_to_epoch_iter[dataset]

    # Validate before touching the dataset (previously this ran after use).
    assert isinstance(dataset, FairseqDataset)

    # initialize the dataset with the correct starting epoch
    dataset.set_epoch(epoch)

    # get indices ordered by example size (computed once; the original
    # repeated this identical seeded call twice)
    with data_utils.numpy_seed(seed):
        indices = dataset.ordered_indices()

    # filter examples that are too large
    if max_positions is not None:
        indices = data_utils.filter_by_size(
            indices, dataset, max_positions,
            raise_exception=(not ignore_invalid_inputs))

    # create mini-batches with given size constraints
    batch_sampler = data_utils.batch_by_size(
        indices, dataset.num_tokens, max_tokens=max_tokens,
        max_sentences=max_sentences,
        required_batch_size_multiple=required_batch_size_multiple)

    # NOTE(review): `collate` is not defined in this function; it is assumed
    # to be a collate function available in the enclosing module - confirm.
    epoch_iter = iterators.EpochBatchIterator(
        dataset=dataset, collate_fn=collate, batch_sampler=batch_sampler,
        seed=seed, num_shards=num_shards, shard_id=shard_id,
        num_workers=num_workers, epoch=epoch)
    task.dataset_to_epoch_iter[dataset] = epoch_iter
    return epoch_iter
def _get_epoch_batch_itr(ref, bsz, skip_remainder_batch):
    """Wrap *ref* in a ``ListDataset`` and return its next epoch iterator.

    Batches are consecutive slices of ``range(len(ref))`` taken every *bsz*
    positions, so the last one may be shorter than *bsz*.

    Args:
        ref: sized collection of items to iterate over.
        bsz (int): step between batch starts and width of each batch slice.
        skip_remainder_batch: forwarded to ``EpochBatchIterator``.

    Returns:
        The result of ``EpochBatchIterator.next_epoch_itr()``.
    """
    all_indices = range(len(ref))
    batch_sampler = [
        all_indices[first:first + bsz] for first in all_indices[::bsz]
    ]

    list_dataset = ListDataset(ref)
    epoch_batch_itr = iterators.EpochBatchIterator(
        dataset=list_dataset,
        collate_fn=list_dataset.collater,
        batch_sampler=batch_sampler,
        skip_remainder_batch=skip_remainder_batch,
    )
    return epoch_batch_itr.next_epoch_itr()
def get_batch_iterator(
    self, dataset, num_workers=0, epoch=1, data_buffer_size=0, **kwargs
):
    """Return an ``EpochBatchIterator`` that yields one dataset item per step.

    Args:
        dataset: sized dataset whose items are each treated as a whole batch.
        num_workers (int, optional): forwarded to the iterator (default: 0).
        epoch (int, optional): forwarded to the iterator (default: 1).
        data_buffer_size (int, optional): forwarded as ``buffer_size``
            (default: 0).
        **kwargs: accepted and ignored.

    Returns:
        iterators.EpochBatchIterator: iterator with shuffling disabled.
    """
    # The batching machinery of EpochBatchIterator is bypassed: every item of
    # *dataset* already is a full batch, so each "batch" is a single index.
    one_item_batches = [[position] for position in range(len(dataset))]

    return iterators.EpochBatchIterator(
        dataset=dataset,
        collate_fn=self._collate_fn,
        batch_sampler=one_item_batches,
        num_workers=num_workers,
        epoch=epoch,
        buffer_size=data_buffer_size,
        disable_shuffling=True,
    )
def get_batch_iterator(
    self,
    dataset,
    max_tokens=None,
    max_sentences=None,
    max_positions=None,
    ignore_invalid_inputs=False,
    required_batch_size_multiple=1,
    seed=1,
    num_shards=1,
    shard_id=0,
    num_workers=0,
    epoch=1,
    data_buffer_size=0,
    disable_iterator_cache=False,
):
    """
    Build, or fetch from the cache, a batch iterator over *dataset*.

    Args:
        dataset (~fairseq.data.FairseqDataset): dataset to batch
        max_tokens (int, optional): max number of tokens in each batch
            (default: None).
        max_sentences (int, optional): max number of sentences in each
            batch (default: None).
        max_positions (optional): size limit passed to
            ``self.filter_indices_by_size``; filtering is skipped when None
            (default: None).
        ignore_invalid_inputs (bool, optional): forwarded to
            ``self.filter_indices_by_size`` (default: False).
        required_batch_size_multiple (int, optional): forwarded to
            ``dataset.batch_by_size`` (default: 1).
        seed (int, optional): seeds the index ordering and the iterator
            (default: 1).
        num_shards (int, optional): forwarded to the iterator (default: 1).
        shard_id (int, optional): forwarded to the iterator (default: 0).
        num_workers (int, optional): forwarded to the iterator (default: 0).
        epoch (int, optional): passed to ``dataset.set_epoch`` and to the
            iterator (default: 1).
        data_buffer_size (int, optional): forwarded as ``buffer_size``
            (default: 0).
        disable_iterator_cache (bool, optional): when True the cache is
            neither read nor written and ``self.can_reuse_epoch_itr`` is not
            consulted (default: False).

    Returns:
        ~fairseq.iterators.EpochBatchIterator: a batched iterator over the
            given dataset split
    """
    cache = self.dataset_to_epoch_iter
    cacheable = not disable_iterator_cache and self.can_reuse_epoch_itr(
        dataset
    )

    # Fast path: hand back the iterator built by an earlier call.
    if cacheable and dataset in cache:
        logger.debug("reusing EpochBatchIterator for epoch {}".format(epoch))
        return cache[dataset]

    assert isinstance(dataset, FairseqDataset)

    # Put the dataset at the requested starting epoch.
    dataset.set_epoch(epoch)

    # Ordering is computed under a fixed numpy seed.
    with data_utils.numpy_seed(seed):
        ordered = dataset.ordered_indices()

    # Optional size filter.
    if max_positions is not None:
        ordered = self.filter_indices_by_size(
            ordered, dataset, max_positions, ignore_invalid_inputs
        )

    # Group indices into mini-batches.
    batches = dataset.batch_by_size(
        ordered,
        max_tokens=max_tokens,
        max_sentences=max_sentences,
        required_batch_size_multiple=required_batch_size_multiple,
    )

    new_iter = iterators.EpochBatchIterator(
        dataset=dataset,
        collate_fn=dataset.collater,
        batch_sampler=batches,
        seed=seed,
        num_shards=num_shards,
        shard_id=shard_id,
        num_workers=num_workers,
        epoch=epoch,
        buffer_size=data_buffer_size,
    )

    if cacheable:
        cache[dataset] = new_iter
    return new_iter
def get_batch_iterator(
    self,
    dataset,
    max_tokens=None,
    max_sentences=None,
    max_positions=None,
    ignore_invalid_inputs=False,
    required_batch_size_multiple=1,
    seed=1,
    num_shards=1,
    shard_id=0,
    num_workers=0,
    epoch=1,
    data_buffer_size=0,
    disable_iterator_cache=False,
):
    """
    Get a batch iterator, choosing the strategy from ``self.args.sampling_method``.

    A cached iterator for *dataset* is returned whenever one exists. With
    ``"RoundRobin"`` sampling the parent implementation builds the iterator
    and the result is cached here; otherwise the iterator is built around the
    object returned by ``self.create_batch_sampler_func`` and is not cached.

    Args:
        dataset (~fairseq.data.FairseqDataset): dataset to batch
        max_tokens (int, optional): max number of tokens in each batch
            (default: None).
        max_sentences (int, optional): max number of sentences in each
            batch (default: None).
        max_positions (optional): max sentence length supported by the
            model (default: None).
        ignore_invalid_inputs (bool, optional): don't raise Exception for
            sentences that are too long (default: False).
        required_batch_size_multiple (int, optional): require batch size to
            be a multiple of N (default: 1).
        seed (int, optional): seed for random number generator for
            reproducibility (default: 1).
        num_shards (int, optional): forwarded to the iterator (default: 1).
        shard_id (int, optional): forwarded to the iterator (default: 0).
        num_workers (int, optional): forwarded to the iterator (default: 0).
        epoch (int, optional): the epoch to start the iterator from
            (default: 1).
        data_buffer_size (int, optional): only forwarded on the RoundRobin
            path (default: 0).
        disable_iterator_cache (bool, optional): only forwarded on the
            RoundRobin path; the cache lookup in this method happens
            regardless of its value (default: False).

    Returns:
        ~fairseq.iterators.EpochBatchIterator: a batched iterator over the
            given dataset split
    """
    assert isinstance(dataset, FairseqDataset)

    cache = self.dataset_to_epoch_iter
    if dataset in cache:
        return cache[dataset]

    if self.args.sampling_method == "RoundRobin":
        # Delegate entirely to the parent task, then remember the result.
        round_robin_iter = super().get_batch_iterator(
            dataset,
            max_tokens=max_tokens,
            max_sentences=max_sentences,
            max_positions=max_positions,
            ignore_invalid_inputs=ignore_invalid_inputs,
            required_batch_size_multiple=required_batch_size_multiple,
            seed=seed,
            num_shards=num_shards,
            shard_id=shard_id,
            num_workers=num_workers,
            epoch=epoch,
            data_buffer_size=data_buffer_size,
            disable_iterator_cache=disable_iterator_cache,
        )
        cache[dataset] = round_robin_iter
        return round_robin_iter

    # Any other sampling method: the object returned below is handed to the
    # iterator in place of a precomputed list of batches.
    sampler_factory = self.create_batch_sampler_func(
        max_positions,
        ignore_invalid_inputs,
        max_tokens,
        max_sentences,
        required_batch_size_multiple=required_batch_size_multiple,
        seed=seed,
    )

    return iterators.EpochBatchIterator(
        dataset=dataset,
        collate_fn=dataset.collater,
        batch_sampler=sampler_factory,
        seed=seed,
        num_shards=num_shards,
        shard_id=shard_id,
        num_workers=num_workers,
        epoch=epoch,
    )
def filter_by_data_actor(indices, dataset, data_actor,
                         data_filter_percentage=-1, trainer=None):
    """
    Drop a fraction of *indices*, at random or by data-actor score.

    When ``trainer.args.random_data_filter`` is truthy the indices are
    shuffled and the first ``int(len(indices) * data_filter_percentage)`` of
    them are discarded. Otherwise every example is scored with *data_actor*
    and the lowest-scoring fraction is discarded.

    Args:
        indices (List[int]): ordered list of dataset indices; not modified.
        dataset (FairseqDataset): dataset the indices refer to (only used on
            the scoring path).
        data_actor (callable): called with a prepared sample; its result must
            expose ``.data.cpu().numpy()`` (only used on the scoring path).
        data_filter_percentage (float, optional): fraction to discard. With
            the default of -1 the slice start becomes ``-len(...)``, which
            keeps every entry (default: -1).
        trainer: object providing ``args.random_data_filter`` and
            ``_prepare_sample``. Required despite the ``None`` default.

    Returns:
        On the random path, a sorted ``numpy.ndarray`` of kept indices.
        On the scoring path, the result of ``batch_by_size`` applied to the
        kept indices (batches, not a flat index array).
    """
    bins = 50
    # The length-binned random filter below is switched off in this version;
    # flip this flag to re-enable it.
    random_filter = False

    if trainer.args.random_data_filter:
        orig_data_size = len(indices)
        indices = np.array(indices)  # copy, so the caller's list is untouched
        np.random.shuffle(indices)
        indices = indices[int(len(indices) * data_filter_percentage):]
        print("Orignial data size={}; filtered data size={}".format(
            orig_data_size, len(indices)))
        indices.sort()
        return indices

    if random_filter:
        orig_data_size = len(indices)
        indices = np.array(indices)
        selected = []
        # At least one element per bin, otherwise the loop cannot advance.
        interval = max(1, int(len(indices) / bins))
        start_idx, end_idx = 0, 0
        while end_idx < len(indices):
            end_idx = min(len(indices), start_idx + interval)
            current_indices = indices[start_idx:end_idx]
            np.random.shuffle(current_indices)
            selected.extend(current_indices[
                int(len(current_indices) * data_filter_percentage):].tolist())
            start_idx = end_idx
        indices = np.array(selected)
        indices.sort()
        print("Orignial data size={}; filtered data size={}".format(
            orig_data_size, len(indices)))
        return indices

    # Scoring path: batch the data, score every example, keep the best ones.
    print("Use RL agent to filter data")
    max_tokens = 600
    max_sentences = 100
    batch_sampler = batch_by_size(
        indices, dataset.num_tokens,
        max_tokens=max_tokens, max_sentences=max_sentences,
    )
    itr = iterators.EpochBatchIterator(
        dataset=dataset,
        collate_fn=dataset.collater,
        batch_sampler=batch_sampler).next_epoch_itr(shuffle=False)

    idx_end = 0
    scores = np.zeros(len(indices))
    ids = np.zeros(len(indices), dtype=np.int64)
    for sample in itr:
        sample = trainer._prepare_sample(sample)
        # Only the first value of the prepared sample mapping is scored.
        sample = list(sample.values())[0]
        # The actor receives the whole batch and yields one score per example.
        score = data_actor(sample).data.cpu().numpy()
        idx_start = idx_end
        idx_end = idx_start + score.shape[0]
        scores[idx_start:idx_end] = score.ravel()
        ids[idx_start:idx_end] = sample['id'].data.cpu().numpy().ravel()
    print(scores)

    # argsort is ascending, so the lowest scores come first; sort only once.
    order = np.argsort(scores)
    preserved_indices = order[int(len(indices) * data_filter_percentage):]
    worst_id = ids[order[:10]]
    best_id = ids[order[-10:]]
    print("worst sentence id: ", worst_id)
    print("best sentence id: ", best_id)

    filter_indices = ids[preserved_indices]  # fancy indexing returns a copy
    filter_indices.sort()
    # Keep only ids that were part of the input (vectorised membership test
    # instead of a per-item linear scan).
    filter_indices = filter_indices[np.isin(filter_indices, indices)]
    print("Orignial data size={}; filtered data size={}".format(
        len(ids), len(filter_indices)))
    return batch_by_size(filter_indices, dataset.num_tokens,
                         max_tokens=max_tokens, max_sentences=max_sentences)
def get_batch_iterator(
    self,
    dataset,
    max_tokens=None,
    max_sentences=None,
    max_positions=None,
    ignore_invalid_inputs=False,
    required_batch_size_multiple=1,
    seed=1,
    num_shards=1,
    shard_id=0,
    num_workers=0,
    epoch=0,
):
    """
    Get an iterator that yields batches of data from the given dataset.

    Unlike the stock implementation, no size filtering is performed and
    batches are bounded by *max_sentences* only.

    Args:
        dataset (~fairseq.data.FairseqDataset): dataset to batch
        max_tokens (int, optional): not used by this implementation
            (default: None).
        max_sentences (int, optional): max number of sentences in each
            batch; the only constraint passed to ``batch_by_size``
            (default: None).
        max_positions (optional): not used by this implementation
            (default: None).
        ignore_invalid_inputs (bool, optional): not used by this
            implementation (default: False).
        required_batch_size_multiple (int, optional): not used by this
            implementation (default: 1).
        seed (int, optional): seeds the index ordering and the iterator
            (default: 1).
        num_shards (int, optional): forwarded to the iterator (default: 1).
        shard_id (int, optional): forwarded to the iterator (default: 0).
        num_workers (int, optional): forwarded to the iterator (default: 0).
        epoch (int, optional): forwarded to the iterator (default: 0).

    Returns:
        ~fairseq.iterators.EpochBatchIterator: a batched iterator over the
            given dataset split
    """
    assert isinstance(dataset, FairseqDataset)

    # The dataset supplies its own ordering; it is queried under a fixed
    # numpy seed.
    with data_utils.numpy_seed(seed):
        ordered = dataset.ordered_indices()

    # filter_by_size was removed on purpose (Christine, 18-12-2019).
    # Batching uses the unqualified `batch_by_size` in scope, not
    # data_utils.batch_by_size; per the original note it still needs to be
    # tested against the framework (Christine, 18-12-2019).
    batches = batch_by_size(ordered, max_sentences=max_sentences)

    iterator_options = dict(
        dataset=dataset,
        collate_fn=dataset.collater,
        batch_sampler=batches,
        seed=seed,
        num_shards=num_shards,
        shard_id=shard_id,
        num_workers=num_workers,
        epoch=epoch,
    )
    return iterators.EpochBatchIterator(**iterator_options)
def get_batch_iterator(
    self,
    dataset,
    max_tokens=None,
    max_sentences=None,
    max_positions=None,
    ignore_invalid_inputs=False,
    required_batch_size_multiple=1,
    seed=1,
    num_shards=1,
    shard_id=0,
    num_workers=0,
    epoch=1,
    data_buffer_size=0,
    disable_iterator_cache=False,
):
    """
    Build, or fetch from the cache, a batch iterator over *dataset*.

    Args:
        dataset (~fairseq.data.FairseqDataset): dataset to batch
        max_tokens (int, optional): max number of tokens in each batch
            (default: None).
        max_sentences (int, optional): max number of sentences in each
            batch (default: None).
        max_positions (optional): size limit passed to
            ``self.filter_indices_by_size``; filtering is skipped when None
            (default: None).
        ignore_invalid_inputs (bool, optional): forwarded to
            ``self.filter_indices_by_size`` (default: False).
        required_batch_size_multiple (int, optional): forwarded to
            ``dataset.batch_by_size`` (default: 1).
        seed (int, optional): seeds the index ordering and the iterator
            (default: 1).
        num_shards (int, optional): forwarded to the iterator (default: 1).
        shard_id (int, optional): forwarded to the iterator (default: 0).
        num_workers (int, optional): forwarded to the iterator (default: 0).
        epoch (int, optional): passed to ``dataset.set_epoch`` and to the
            iterator (default: 1).
        data_buffer_size (int, optional): forwarded as ``buffer_size``
            (default: 0).
        disable_iterator_cache (bool, optional): when True the cache is
            neither read nor written (default: False).

    Returns:
        ~fairseq.iterators.EpochBatchIterator: a batched iterator over the
            given dataset split
    """
    can_reuse_epoch_itr = not disable_iterator_cache and self.can_reuse_epoch_itr(
        dataset
    )
    if can_reuse_epoch_itr and dataset in self.dataset_to_epoch_iter:
        logger.debug("reusing EpochBatchIterator for epoch {}".format(epoch))
        return self.dataset_to_epoch_iter[dataset]

    assert isinstance(dataset, FairseqDataset)

    # initialize the dataset with the correct starting epoch
    dataset.set_epoch(epoch)

    # get indices ordered by example size
    with data_utils.numpy_seed(seed):
        indices = dataset.ordered_indices()

    # filter examples that are too large
    if max_positions is not None:
        indices = self.filter_indices_by_size(
            indices, dataset, max_positions, ignore_invalid_inputs
        )

    logger.info("the max number of tokens are {}".format(max_tokens))
    # Diagnostics go through the logger at debug level. The former print()
    # calls dumped the full index and size arrays to stdout on every call and
    # required the dataset to expose `src_sizes`.
    logger.debug(
        "batching %d examples with max_tokens=%s", len(indices), max_tokens
    )

    # create mini-batches with given size constraints
    batch_sampler = dataset.batch_by_size(
        indices,
        max_tokens=max_tokens,
        max_sentences=max_sentences,
        required_batch_size_multiple=required_batch_size_multiple,
    )

    # return a reusable, sharded iterator
    epoch_iter = iterators.EpochBatchIterator(
        dataset=dataset,
        collate_fn=dataset.collater,
        batch_sampler=batch_sampler,
        seed=seed,
        num_shards=num_shards,
        shard_id=shard_id,
        num_workers=num_workers,
        epoch=epoch,
        buffer_size=data_buffer_size,
    )

    if can_reuse_epoch_itr:
        self.dataset_to_epoch_iter[dataset] = epoch_iter

    return epoch_iter
def filter_by_data_actor(indices, dataset, data_actor,
                         data_filter_percentage=-1, trainer=None):
    """
    Drop a fraction of *indices*, at random or by data-actor score.

    Three strategies, selected from ``trainer.args``:

    * ``random_data_filter``: shuffle all indices and discard the first
      ``int(len(indices) * data_filter_percentage)`` of them.
    * ``random_data_filter_by_len``: split the indices, in their given order,
      into consecutive bins and discard that fraction of each bin at random.
    * otherwise: score every example with *data_actor* and discard the
      lowest-scoring fraction.

    Args:
        indices (List[int]): ordered list of dataset indices; not modified.
        dataset (FairseqDataset): dataset the indices refer to (only used on
            the scoring path).
        data_actor (callable): called as ``data_actor(src_tokens, target)``;
            its result must expose ``.data.cpu().numpy()`` (only used on the
            scoring path).
        data_filter_percentage (float, optional): fraction to discard. With
            the default of -1 the slice start becomes ``-len(...)``, which
            keeps every entry (default: -1).
        trainer: object providing ``args.random_data_filter``,
            ``args.random_data_filter_by_len`` and ``_prepare_sample``.
            Required despite the ``None`` default.

    Returns:
        numpy.ndarray: the kept indices, sorted ascending.
    """
    bins = 50

    if trainer.args.random_data_filter:
        orig_data_size = len(indices)
        indices = np.array(indices)  # copy, so the caller's list is untouched
        np.random.shuffle(indices)
        indices = indices[int(len(indices) * data_filter_percentage):]
        print("Orignial data size={}; filtered data size={}".format(
            orig_data_size, len(indices)))
        indices.sort()
        return indices

    if trainer.args.random_data_filter_by_len:
        orig_data_size = len(indices)
        indices = np.array(indices)
        selected = []
        # With fewer than `bins` indices the integer division yields 0 and the
        # loop below would never advance; use at least one element per bin.
        interval = max(1, len(indices) // bins)
        start_idx, end_idx = 0, 0
        while end_idx < len(indices):
            end_idx = min(len(indices), start_idx + interval)
            current_indices = indices[start_idx:end_idx]
            np.random.shuffle(current_indices)
            cut = int(len(current_indices) * data_filter_percentage)
            selected.extend(current_indices[cut:].tolist())
            start_idx = end_idx
        indices = np.array(selected)
        indices.sort()
        print("Orignial data size={}; filtered data size={}".format(
            orig_data_size, len(indices)))
        return indices

    # Scoring path: batch the data, score every example, keep the best ones.
    max_tokens = 4800
    max_sentences = 100
    batch_sampler = batch_by_size(
        indices, dataset.num_tokens,
        max_tokens=max_tokens, max_sentences=max_sentences,
    )
    itr = iterators.EpochBatchIterator(
        dataset=dataset,
        collate_fn=dataset.collater,
        batch_sampler=batch_sampler
    ).next_epoch_itr(shuffle=False)

    idx_end = 0
    scores = np.zeros(len(indices))
    ids = np.zeros(len(indices), dtype=np.int64)
    for sample in itr:
        sample = trainer._prepare_sample(sample)
        # Only the first value of the prepared sample mapping is scored.
        sample = list(sample.values())[0]
        score = data_actor(
            sample['net_input']['src_tokens'], sample['target']
        ).data.cpu().numpy()
        idx_start = idx_end
        idx_end = idx_start + score.shape[0]
        scores[idx_start:idx_end] = score.ravel()
        ids[idx_start:idx_end] = sample['id'].data.cpu().numpy().ravel()

    # argsort is ascending, so the lowest-scoring examples are cut first.
    preserved_indices = np.argsort(scores)[
        int(len(indices) * data_filter_percentage):]
    indices = np.array(ids)[preserved_indices]
    indices.sort()
    print("Orignial data size={}; filtered data size={}".format(
        len(ids), len(indices)))
    return indices
def get_batch_iterator(
    self,
    dataset,
    max_tokens=None,
    max_sentences=None,
    max_positions=None,
    ignore_invalid_inputs=False,
    required_batch_size_multiple=1,
    seed=1,
    num_shards=1,
    shard_id=0,
    num_workers=0,
    epoch=0,
):
    """
    Get a batch iterator over *dataset* and log how long each stage took.

    No size filtering is performed here; *max_positions* and
    *ignore_invalid_inputs* are accepted but unused.

    Args:
        dataset (~fairseq.data.FairseqDataset): dataset to batch
        max_tokens (int, optional): max number of tokens in each batch
            (default: None).
        max_sentences (int, optional): max number of sentences in each
            batch (default: None).
        max_positions (optional): unused (default: None).
        ignore_invalid_inputs (bool, optional): unused (default: False).
        required_batch_size_multiple (int, optional): require batch size to
            be a multiple of N (default: 1).
        seed (int, optional): combined with *epoch* to seed the index
            ordering; also forwarded to the iterator (default: 1).
        num_shards (int, optional): forwarded to the iterator (default: 1).
        shard_id (int, optional): forwarded to the iterator (default: 0).
        num_workers (int, optional): forwarded to the iterator (default: 0).
        epoch (int, optional): passed to ``dataset.set_epoch``, to the numpy
            seed and to the iterator (default: 0).

    Returns:
        ~fairseq.iterators.EpochBatchIterator: a batched iterator over the
            given dataset split
    """
    assert isinstance(dataset, FairseqDataset)

    # Stage 1: move the dataset to the requested epoch.
    overall_begin = time.time()
    dataset.set_epoch(epoch)
    set_epoch_time = time.time() - overall_begin

    # Stage 2: index ordering, seeded with both seed and epoch.
    stage_begin = time.time()
    with data_utils.numpy_seed(seed, epoch):
        ordered = dataset.ordered_indices()
    sort_time = time.time() - stage_begin

    # Stage 3: group indices into mini-batches.
    stage_begin = time.time()
    batches = data_utils.batch_by_size(
        ordered,
        dataset.num_tokens,
        max_tokens=max_tokens,
        max_sentences=max_sentences,
        required_batch_size_multiple=required_batch_size_multiple,
    )
    batch_by_size_time = time.time() - stage_begin

    total_time = time.time() - overall_begin
    timing_report = (
        'get batch iterator [seed=%d, epoch=%d, num_shards=%d] is done in %.3f seconds '
        '(set epoch=%.3f, sorting=%.3f, batch by size=%.3f)'
    ) % (
        seed,
        epoch,
        num_shards,
        total_time,
        set_epoch_time,
        sort_time,
        batch_by_size_time,
    )
    logger.info(timing_report)

    return iterators.EpochBatchIterator(
        dataset=dataset,
        collate_fn=dataset.collater,
        batch_sampler=batches,
        seed=seed,
        num_shards=num_shards,
        shard_id=shard_id,
        num_workers=num_workers,
        epoch=epoch,
    )
def get_batch_iterator(
    self,
    dataset,
    max_tokens=None,
    max_sentences=None,
    max_positions=None,
    ignore_invalid_inputs=False,
    required_batch_size_multiple=1,
    seed=1,
    num_shards=1,
    shard_id=0,
    num_workers=0,
    epoch=0,
):
    """
    Build, or fetch from the cache, a batch iterator over *dataset*.

    The method prints diagnostic information (index count, batch count and
    the cache contents) to stdout whenever a new iterator is built.

    Args:
        dataset (~fairseq.data.FairseqDataset): dataset to batch
        max_tokens (int, optional): max number of tokens in each batch
            (default: None).
        max_sentences (int, optional): max number of sentences in each
            batch (default: None).
        max_positions (optional): size limit for
            ``data_utils.filter_by_size``; filtering is skipped when None
            (default: None).
        ignore_invalid_inputs (bool, optional): when False the size filter
            is called with ``raise_exception=True`` (default: False).
        required_batch_size_multiple (int, optional): require batch size to
            be a multiple of N (default: 1).
        seed (int, optional): seeds the index ordering and the iterator
            (default: 1).
        num_shards (int, optional): forwarded to the iterator (default: 1).
        shard_id (int, optional): forwarded to the iterator (default: 0).
        num_workers (int, optional): forwarded to the iterator (default: 0).
        epoch (int, optional): passed to ``dataset.set_epoch`` and to the
            iterator (default: 0).

    Returns:
        ~fairseq.iterators.EpochBatchIterator: a batched iterator over the
            given dataset split
    """
    # For the default task the same iterator is returned across epochs, since
    # datasets are not dynamic; task-specific code may override this.
    # Original author's note: if the data is not swapped every epoch, the
    # previously built epoch_itr can be reused directly. The drawback is that
    # the same data is then not re-randomised between epochs, which lowers
    # global randomness.
    cache = self.dataset_to_epoch_iter
    if dataset in cache:
        return cache[dataset]

    assert isinstance(dataset, FairseqDataset)

    # Put the dataset at the requested starting epoch.
    dataset.set_epoch(epoch)

    # Batches are generated by reading the data in this order.
    with data_utils.numpy_seed(seed):
        ordered = dataset.ordered_indices()

    if max_positions is not None:
        # Remove the indices whose length violates the limits (original note:
        # decided via dataset.size).
        must_raise = not ignore_invalid_inputs
        ordered = data_utils.filter_by_size(
            ordered, dataset, max_positions, raise_exception=must_raise
        )
    print("indices length and type", len(ordered), type(ordered))

    # Original note: batch_sampler should be one big list whose elements are
    # lists holding the sentence IDs contained in each batch.
    batches = data_utils.batch_by_size(
        ordered,
        dataset.num_tokens,
        max_tokens=max_tokens,
        max_sentences=max_sentences,
        required_batch_size_multiple=required_batch_size_multiple,
    )
    print("batch_sampler length and type", len(batches), type(batches))

    new_iter = iterators.EpochBatchIterator(
        dataset=dataset,
        collate_fn=dataset.collater,
        batch_sampler=batches,
        seed=seed,
        num_shards=num_shards,
        shard_id=shard_id,
        num_workers=num_workers,
        epoch=epoch,
    )
    cache[dataset] = new_iter
    print(cache)
    return new_iter
def get_batch_iterator(
    self,
    dataset,
    max_tokens=None,
    max_sentences=None,
    max_positions=None,
    ignore_invalid_inputs=False,
    required_batch_size_multiple=1,
    seed=1,
    num_shards=1,
    shard_id=0,
    num_workers=0,
    epoch=0,
):
    """
    Build, or fetch from the cache, a batch iterator, printing progress markers.

    A flushed progress line is printed on entry and after each stage
    (ordering, filtering, batching, iterator creation).

    Args:
        dataset (~fairseq.data.FairseqDataset): dataset to batch
        max_tokens (int, optional): max number of tokens in each batch
            (default: None).
        max_sentences (int, optional): max number of sentences in each
            batch (default: None).
        max_positions (optional): size limit for
            ``data_utils.filter_by_size``; filtering is skipped when None
            (default: None).
        ignore_invalid_inputs (bool, optional): when False the size filter
            is called with ``raise_exception=True`` (default: False).
        required_batch_size_multiple (int, optional): require batch size to
            be a multiple of N (default: 1).
        seed (int, optional): seeds the index ordering and the iterator
            (default: 1).
        num_shards (int, optional): forwarded to the iterator (default: 1).
        shard_id (int, optional): forwarded to the iterator (default: 0).
        num_workers (int, optional): forwarded to the iterator (default: 0).
        epoch (int, optional): passed to ``dataset.set_epoch`` and to the
            iterator (default: 0).

    Returns:
        ~fairseq.iterators.EpochBatchIterator: a batched iterator over the
            given dataset split
    """
    # For the default task the same iterator is returned across epochs, since
    # datasets are not dynamic; task-specific code may override this.
    print("| At task.get_batch_iterator ...", flush=True)
    cache = self.dataset_to_epoch_iter
    if dataset in cache:
        return cache[dataset]

    assert isinstance(dataset, FairseqDataset)

    # Put the dataset at the requested starting epoch.
    dataset.set_epoch(epoch)

    # Ordering is computed under a fixed numpy seed.
    with data_utils.numpy_seed(seed):
        ordered = dataset.ordered_indices()
    print("| At task.get_batch_iterator, indices ordered ... ", flush=True)

    # Optional size filter; the progress line is printed either way.
    if max_positions is not None:
        must_raise = not ignore_invalid_inputs
        ordered = data_utils.filter_by_size(
            ordered, dataset, max_positions, raise_exception=must_raise
        )
    print("| At task.get_batch_iterator, examples filtered ... ", flush=True)

    # Group indices into mini-batches.
    batches = data_utils.batch_by_size(
        ordered,
        dataset.num_tokens,
        max_tokens=max_tokens,
        max_sentences=max_sentences,
        required_batch_size_multiple=required_batch_size_multiple,
    )
    print("| At task.get_batch_iterator, batch_sampler created ... ", flush=True)

    new_iter = iterators.EpochBatchIterator(
        dataset=dataset,
        collate_fn=dataset.collater,
        batch_sampler=batches,
        seed=seed,
        num_shards=num_shards,
        shard_id=shard_id,
        num_workers=num_workers,
        epoch=epoch,
    )
    cache[dataset] = new_iter
    print("| At task.get_batch_iterator, iterator created ... ", flush=True)
    return new_iter
def get_batch_iterator(self, dataset, assistant=None, max_tokens=None,
                       max_sentences=None, max_positions=None,
                       ignore_invalid_inputs=False,
                       required_batch_size_multiple=1, seed=1, num_shards=1,
                       shard_id=0, batch_method='sentences'):
    """
    Get a batch iterator over *dataset*, optionally driven by an assistant.

    Without an assistant, batches are precomputed with
    ``data_utils.batch_by_size`` and wrapped in an ``EpochBatchIterator``.
    With an assistant, the filtered indices are registered through
    ``assistant.associate_data`` and an ``AssistantEpochBatchIterator`` is
    returned instead.

    Args:
        dataset (~fairseq.data.FairseqDataset): dataset to batch
        assistant (optional): object exposing ``associate_data``; selects the
            assistant-driven iterator when given. Default: ``None``
        max_tokens (int, optional): max number of tokens in each batch.
            Default: ``None``
        max_sentences (int, optional): max number of sentences in each
            batch. Default: ``None``
        max_positions (optional): size limit passed to
            ``data_utils.filter_by_size``. Default: ``None``
        ignore_invalid_inputs (bool, optional): when False the size filter
            is called with ``raise_exception=True``. Default: ``False``
        required_batch_size_multiple (int, optional): require batch size to
            be a multiple of N. Default: ``1``
        seed (int, optional): seeds the index ordering and the iterator.
            Default: ``1``
        num_shards (int, optional): forwarded as ``num_shards`` to
            ``EpochBatchIterator`` or as ``shard_num`` to
            ``AssistantEpochBatchIterator``. Default: ``1``
        shard_id (int, optional): which shard of the data iterator to
            return. Default: ``0``
        batch_method (str, optional): only forwarded to
            ``AssistantEpochBatchIterator``. Default: ``'sentences'``

    Returns:
        ``iterators.EpochBatchIterator`` when *assistant* is None, otherwise
        ``iterators.AssistantEpochBatchIterator``.
    """
    assert isinstance(dataset, FairseqDataset)

    # Ordering is computed under a fixed numpy seed.
    with data_utils.numpy_seed(seed):
        ordered = dataset.ordered_indices()

    # Drop examples exceeding max_positions.
    must_raise = not ignore_invalid_inputs
    kept = data_utils.filter_by_size(
        ordered, dataset.size, max_positions, raise_exception=must_raise
    )

    if assistant is None:
        # Plain path: precompute the mini-batches up front.
        batches = data_utils.batch_by_size(
            kept,
            dataset.num_tokens,
            max_tokens=max_tokens,
            max_sentences=max_sentences,
            required_batch_size_multiple=required_batch_size_multiple,
        )
        return iterators.EpochBatchIterator(
            dataset=dataset,
            collate_fn=dataset.collater,
            batch_sampler=batches,
            seed=seed,
            num_shards=num_shards,
            shard_id=shard_id,
        )

    # Assistant path: no batches are precomputed here; the assistant gets the
    # indices and the size constraints are handed to the iterator.
    assistant.associate_data(dataset, kept)
    return iterators.AssistantEpochBatchIterator(
        dataset=dataset,
        collate_fn=dataset.collater,
        assistant=assistant,
        max_tokens=max_tokens,
        max_sentences=max_sentences,
        required_batch_size_multiple=required_batch_size_multiple,
        shard_num=num_shards,
        shard_id=shard_id,
        batch_method=batch_method,
        seed=seed,
    )
def get_batch_iterator(
    self,
    dataset,
    max_tokens=None,
    max_sentences=None,
    max_positions=None,
    ignore_invalid_inputs=False,
    required_batch_size_multiple=1,
    seed=1,
    num_shards=1,
    shard_id=0,
    num_workers=0,
):
    """
    Build a batched, sharded iterator over ``dataset``.

    The dataset's ordered indices are filtered by size, grouped into
    mini-batches, and wrapped in an ``EpochBatchIterator``.

    Args:
        dataset (~fairseq.data.FairseqDataset): dataset to batch
        max_tokens (int, optional): max number of tokens in each batch
            (default: None).
        max_sentences (int, optional): max number of sentences in each
            batch (default: None).
        max_positions (optional): max sentence length supported by the
            model (default: None).
        ignore_invalid_inputs (bool, optional): don't raise Exception for
            sentences that are too long (default: False).
        required_batch_size_multiple (int, optional): require batch size to
            be a multiple of N (default: 1).
        seed (int, optional): seed for random number generator for
            reproducibility (default: 1).
        num_shards (int, optional): shard the data iterator into N
            shards (default: 1).
        shard_id (int, optional): which shard of the data iterator to
            return (default: 0).
        num_workers (int, optional): how many subprocesses to use for data
            loading. 0 means the data will be loaded in the main process
            (default: 0).

    Returns:
        ~fairseq.iterators.EpochBatchIterator: a batched iterator over the
        given dataset split
    """
    assert isinstance(dataset, FairseqDataset)

    # Ask the dataset for its example ordering under a fixed numpy seed.
    with data_utils.numpy_seed(seed):
        ordered = dataset.ordered_indices()

    # Filter examples that are too large; raising is suppressed only when
    # the caller opted into ignoring invalid inputs.
    raise_on_invalid = not ignore_invalid_inputs
    kept = data_utils.filter_by_size(
        ordered,
        dataset.size,
        max_positions,
        raise_exception=raise_on_invalid,
    )

    # Group the surviving indices into mini-batches under the size limits.
    sampler = data_utils.batch_by_size(
        kept,
        dataset.num_tokens,
        max_tokens=max_tokens,
        max_sentences=max_sentences,
        required_batch_size_multiple=required_batch_size_multiple,
    )

    # Wrap everything in a reusable, sharded iterator.
    epoch_iterator = iterators.EpochBatchIterator(
        dataset=dataset,
        collate_fn=dataset.collater,
        batch_sampler=sampler,
        seed=seed,
        num_shards=num_shards,
        shard_id=shard_id,
        num_workers=num_workers,
    )
    return epoch_iterator
def get_batch_iterator(
    self,
    dataset,
    max_tokens=None,
    max_sentences=None,
    max_positions=None,
    ignore_invalid_inputs=False,
    required_batch_size_multiple=1,
    seed=1,
    num_shards=1,
    shard_id=0,
    num_workers=0,
    epoch=0,
    noskip=False,
    source_lang=None,
    target_lang=None,
    data_actor=None,
    trainer=None,
    data_filter_percentage=-1,
    filtered_maxpos_indices=None,
    dev_grad_dotprod=None,
):
    """
    Build a batched, sharded iterator over ``dataset``, optionally reusing
    cached size-filtered indices and applying data-actor selection.

    Args:
        dataset (~fairseq.data.FairseqDataset): dataset to batch
        max_tokens (int, optional): max number of tokens in each batch
            (default: None).
        max_sentences (int, optional): max number of sentences in each
            batch (default: None).
        max_positions (optional): max sentence length supported by the
            model; size filtering is skipped entirely when this is None
            (default: None).
        ignore_invalid_inputs (bool, optional): don't raise Exception for
            sentences that are too long (default: False).
        required_batch_size_multiple (int, optional): require batch size to
            be a multiple of N (default: 1).
        seed (int, optional): seed for random number generator for
            reproducibility (default: 1).
        num_shards (int, optional): shard the data iterator into N
            shards (default: 1).
        shard_id (int, optional): which shard of the data iterator to
            return (default: 0).
        num_workers (int, optional): how many subprocesses to use for data
            loading. 0 means the data will be loaded in the main process
            (default: 0).
        epoch (int, optional): the epoch to start the iterator from
            (default: 0).
        noskip (bool, optional): forwarded to ``data_utils.filter_by_size``
            (default: False).
        source_lang (optional): accepted but not used by this method
            (default: None).
        target_lang (optional): accepted but not used by this method
            (default: None).
        data_actor (optional): passed to
            ``data_utils.filter_by_data_actor`` to filter out data; only
            consulted when ``data_filter_percentage > 0`` (default: None).
        trainer (optional): forwarded to
            ``data_utils.filter_by_data_actor`` (default: None).
        data_filter_percentage (optional): data-actor filtering is applied
            only when this is greater than 0 (default: -1).
        filtered_maxpos_indices (optional): previously size-filtered
            indices; when given and ``max_positions`` is not None they are
            used instead of filtering again (default: None).
        dev_grad_dotprod (optional): accepted but not used by this method
            (default: None).

    Returns:
        tuple: ``(epoch_iterator, filtered_maxpos_indices)`` where
        ``epoch_iterator`` is a ~fairseq.iterators.EpochBatchIterator over
        the given dataset split and ``filtered_maxpos_indices`` holds the
        size-filtered indices (the value passed in, unchanged, when no
        size filtering took place).
    """
    assert isinstance(dataset, FairseqDataset)

    # Ask the dataset for its example ordering under a fixed numpy seed.
    with data_utils.numpy_seed(seed):
        indices = dataset.ordered_indices()

    # Filter examples that are too large, computing the filtered index list
    # only when the caller did not supply one from an earlier call.
    if max_positions is not None:
        if filtered_maxpos_indices is None:
            filtered_maxpos_indices = data_utils.filter_by_size(
                indices,
                dataset,
                max_positions,
                raise_exception=not ignore_invalid_inputs,
                noskip=noskip,
            )
        indices = filtered_maxpos_indices

    # Data selection: keep only a subset chosen through the data actor.
    if data_filter_percentage > 0:
        indices = data_utils.filter_by_data_actor(
            indices,
            dataset,
            data_actor,
            data_filter_percentage,
            trainer=trainer,
        )

    # Group the surviving indices into mini-batches under the size limits.
    sampler = data_utils.batch_by_size(
        indices,
        dataset.num_tokens,
        max_tokens=max_tokens,
        max_sentences=max_sentences,
        required_batch_size_multiple=required_batch_size_multiple,
    )

    # Wrap everything in a reusable, sharded iterator and hand back the
    # size-filtered indices alongside it so the caller can reuse them.
    epoch_iterator = iterators.EpochBatchIterator(
        dataset=dataset,
        collate_fn=dataset.collater,
        batch_sampler=sampler,
        seed=seed,
        num_shards=num_shards,
        shard_id=shard_id,
        num_workers=num_workers,
        epoch=epoch,
    )
    return epoch_iterator, filtered_maxpos_indices