def __init__(
    self,
    dataset,
    sizes,
    block_size,
    pad,
    eos,
    break_mode=None,
    include_targets=False,
    document_sep_len=1,
):
    try:
        from fairseq.data.token_block_utils_fast import (
            _get_slice_indices_fast,
            _get_block_to_dataset_index_fast,
        )
    except ImportError:
        raise ImportError(
            'Please build Cython components with: `pip install --editable .` '
            'or `python setup.py build_ext --inplace`'
        )

    super().__init__()
    self.dataset = dataset
    self.pad = pad
    self.eos = eos
    self.include_targets = include_targets

    assert len(dataset) == len(sizes)
    assert len(dataset) > 0

    if isinstance(sizes, list):
        sizes = np.array(sizes, dtype=np.int64)
    else:
        sizes = sizes.astype(np.int64)

    break_mode = break_mode if break_mode is not None else 'none'

    # For the "eos" break mode, block_size is not a required parameter.
    if break_mode == "eos" and block_size is None:
        block_size = 0

    slice_indices = _get_slice_indices_fast(
        sizes, break_mode, block_size, document_sep_len
    )
    self._sizes = slice_indices[:, 1] - slice_indices[:, 0]

    # build index mapping block indices to the underlying dataset indices
    if break_mode == "eos":
        # much faster version for eos break mode
        block_to_dataset_index = np.stack(
            [
                np.arange(len(sizes)),  # starting index in dataset
                np.zeros(len(sizes), dtype=np.int64),  # starting offset within starting index
                np.arange(len(sizes)),  # ending index in dataset
            ],
            1,
        )
    else:
        block_to_dataset_index = _get_block_to_dataset_index_fast(
            sizes,
            slice_indices,
        )

    self._slice_indices = plasma_utils.PlasmaArray(slice_indices)
    self._sizes = plasma_utils.PlasmaArray(self._sizes)
    self._block_to_dataset_index = plasma_utils.PlasmaArray(block_to_dataset_index)
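# NOTE (illustration): a minimal, pure-NumPy sketch of what
# `_get_slice_indices_fast` computes for break_mode='none': the concatenated
# token stream is chopped into fixed-size blocks, and each row of the result
# is a [start, end) pair into that stream. The real implementation is the
# Cython module imported above; this reference version is for clarity only.
import numpy as np

def _slice_indices_none_sketch(sizes, block_size):
    total_size = int(np.sum(sizes))
    starts = np.arange(0, total_size, block_size, dtype=np.int64)
    ends = np.minimum(starts + block_size, total_size)
    return np.stack([starts, ends], axis=1)

# e.g. three sentences of 5, 3 and 7 tokens with block_size=4:
# [[0 4] [4 8] [8 12] [12 15]] -- the last block is allowed to be short.
print(_slice_indices_none_sketch(np.array([5, 3, 7]), 4))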
def __init__(
    self,
    dataset,
    sizes,
    block_size,
    pad=0,
    eos=2,
    include_targets=False,
    break_mode=None,
    document_sep_len=1,
    two_inputs=False,
):
    try:
        from fairseq.data.token_block_utils_fast import (
            _get_slice_indices_fast,
            _get_block_to_dataset_index_fast,
        )
    except ImportError:
        raise ImportError(
            'Please build Cython components with: `pip install --editable .` '
            'or `python setup.py build_ext --inplace`'
        )

    super().__init__()
    self.dataset = dataset
    self.two_inputs = two_inputs

    assert len(dataset) == len(sizes)
    assert len(dataset) > 0

    if isinstance(sizes, list):
        sizes = np.array(sizes, dtype=np.int64)
    else:
        if torch.is_tensor(sizes):
            sizes = sizes.numpy()
        sizes = sizes.astype(np.int64)

    break_mode = break_mode if break_mode is not None else 'none'

    slice_indices = _get_slice_indices_fast(
        sizes, break_mode, block_size, document_sep_len
    )
    self._sizes = slice_indices[:, 1] - slice_indices[:, 0]

    # build index mapping block indices to the underlying dataset indices
    block_to_dataset_index = _get_block_to_dataset_index_fast(
        sizes,
        slice_indices,
    )

    self._slice_indices = plasma_utils.PlasmaArray(slice_indices)
    self._sizes = plasma_utils.PlasmaArray(self._sizes)
    self._block_to_dataset_index = plasma_utils.PlasmaArray(block_to_dataset_index)

    self.pad = pad
    self.eos = eos
    self.include_targets = include_targets
def set_epoch(self, epoch):
    logger.debug('ResamplingDataset.set_epoch: {}'.format(epoch))
    super().set_epoch(epoch)

    if epoch == self._cur_epoch:
        return

    self._cur_epoch = epoch

    # Generate a weighted sample of indices as a function of the
    # random seed and the current epoch.
    rng = np.random.RandomState(
        [
            42,  # magic number
            self.seed % (2 ** 32),  # global seed
            self._cur_epoch,  # epoch index
        ]
    )
    self._cur_indices = plasma_utils.PlasmaArray(
        rng.choice(
            len(self.dataset),
            self.actual_size,
            replace=self.replace,
            p=(None if self.weights is None else self.weights.array),
        )
    )
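# NOTE (illustration): np.random.RandomState accepts a sequence of ints as a
# composite seed, so the (magic number, global seed, epoch) triple above
# deterministically reproduces the same resampled indices in every worker,
# while a new epoch yields a fresh sample. Standalone demonstration:
import numpy as np

rng_a = np.random.RandomState([42, 7, 3])
rng_b = np.random.RandomState([42, 7, 3])
assert (rng_a.choice(100, 10) == rng_b.choice(100, 10)).all()  # reproducible

rng_c = np.random.RandomState([42, 7, 4])  # only the epoch differs -> new sample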
def set_epoch(self, epoch):
    logger.info("SubsampleLanguagePairDataset.set_epoch: {}".format(epoch))
    super().set_epoch(epoch)

    if epoch == self._cur_epoch:
        return

    self._cur_epoch = epoch

    # Generate a weighted sample of indices as a function of the
    # random seed and the current epoch.
    rng = np.random.RandomState([
        42,  # magic number
        self.seed % (2**32),  # global seed
        self._cur_epoch,  # epoch index
    ])
    self._cur_indices = plasma_utils.PlasmaArray(
        rng.choice(
            len(self.dataset),
            self.actual_size,
            replace=self.replace,
            p=(None if self.weights is None else self.weights.array),
        ))
    logger.info(
        "Dataset is sub-sampled: {} -> {}, first 3 ids are: {}".format(
            len(self.dataset),
            self.actual_size,
            ",".join([str(_i) for _i in self._cur_indices.array[:3]])))
def __init__(self, dataset, size_ratio, weights=None, replace=False, seed=0, epoch=1):
    super().__init__(dataset)

    assert size_ratio <= 1
    self.actual_size = np.ceil(len(dataset) * size_ratio).astype(int)
    logger.info("subsampled dataset from {} to {} (ratio={})".format(
        len(self.dataset), self.actual_size, size_ratio))

    self.src_dict = self.dataset.src_dict
    self.tgt_dict = self.dataset.tgt_dict
    self.left_pad_source = self.dataset.left_pad_source
    self.left_pad_target = self.dataset.left_pad_target

    self.seed = seed
    self._cur_epoch = None
    self._cur_indices = None
    self.replace = replace

    if weights is None:
        self.weights = None
    else:
        assert len(weights) == len(dataset)
        weights_arr = np.array(weights, dtype=np.float64)
        weights_arr /= weights_arr.sum()
        self.weights = plasma_utils.PlasmaArray(weights_arr)

    self.set_epoch(epoch)
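# NOTE (illustration): `actual_size` uses np.ceil, so subsampling always keeps
# at least one example and rounds up rather than down. Quick arithmetic check
# with hypothetical sizes:
import numpy as np

assert np.ceil(1000 * 0.25).astype(int) == 250
assert np.ceil(7 * 0.5).astype(int) == 4      # 3.5 rounds up to 4
assert np.ceil(100 * 0.001).astype(int) == 1  # never drops to zero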
def _next_global_indices(self, epoch):
    rng = np.random.RandomState([
        int(
            hashlib.sha1(
                str(self.__class__.__name__).encode('utf-8')
            ).hexdigest(), 16) % (2**32),
        self.seed % (2**32),  # global seed
        epoch,  # epoch index
    ])
    del self._random_global_indices
    self._random_global_indices = plasma_utils.PlasmaArray(
        rng.choice(self.virtual_size, self.virtual_size, replace=False))

    if self.load_next_shard is None:
        self.load_next_shard = False
    else:
        # increase shard epoch for next loading
        self.shard_epoch += 1
        self.load_next_shard = True
        # a hack to avoid a possible out-of-sync shard epoch number
        # TODO: confirm whether this is needed; without it, a CUDA event
        # error is occasionally observed
        synced_shard_epoch = self._sync_shard_epoch(self.shard_epoch)
        logger.info(
            'to load next epoch/shard in next load_dataset: '
            f'epoch={epoch}/shard_epoch={self.shard_epoch}[synced={synced_shard_epoch}]'
        )
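# NOTE (illustration): the first seed component above hashes the class name,
# so two dataset classes that share the same global seed and epoch still draw
# independent permutations. A standalone sketch of that derivation:
import hashlib

def _class_seed_component(cls_name):
    return int(hashlib.sha1(cls_name.encode('utf-8')).hexdigest(), 16) % (2**32)

# distinct class names give distinct components (with overwhelming probability)
assert _class_seed_component('SampledMultiEpochDataset') != \
    _class_seed_component('ResamplingDataset')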
def __init__(
    self,
    dataset,
    sizes,
    block_size,
    pad,
    eos,
    break_mode=None,
    include_targets=False,
    document_sep_len=1,
    use_plasma_view=False,
    split_path=None,
    plasma_path=None,
):
    super().__init__()
    self.dataset = dataset
    self.pad = pad
    self.eos = eos
    self.include_targets = include_targets

    assert len(dataset) > 0
    assert len(dataset) == len(sizes)

    _sizes, block_to_dataset_index, slice_indices = self._build_slice_indices(
        sizes, break_mode, document_sep_len, block_size
    )
    if use_plasma_view:
        plasma_id = (block_size, document_sep_len, str(break_mode), len(dataset))
        self._slice_indices = plasma_utils.PlasmaView(
            slice_indices, split_path, (plasma_id, 0), plasma_path=plasma_path
        )
        self._sizes = plasma_utils.PlasmaView(
            _sizes, split_path, (plasma_id, 1), plasma_path=plasma_path
        )
        self._block_to_dataset_index = plasma_utils.PlasmaView(
            block_to_dataset_index,
            split_path,
            (plasma_id, 2),
            plasma_path=plasma_path,
        )
    else:
        self._slice_indices = plasma_utils.PlasmaArray(slice_indices)
        self._sizes = plasma_utils.PlasmaArray(_sizes)
        self._block_to_dataset_index = plasma_utils.PlasmaArray(
            block_to_dataset_index
        )
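# NOTE (illustration): the point of `plasma_id` above is to act as a
# deterministic cache key: builds with identical (block_size,
# document_sep_len, break_mode, dataset length) and the same split_path can
# attach to one shared-memory object instead of materializing per-worker
# copies. The hashing below is an assumption for illustration, not
# necessarily what PlasmaView does internally:
import hashlib

def _plasma_object_key_sketch(plasma_id, split_path, field_idx):
    raw = repr((plasma_id, split_path, field_idx)).encode('utf-8')
    return hashlib.sha1(raw).hexdigest()[:20]

# identical inputs -> identical key, in any process
assert _plasma_object_key_sketch((512, 1, 'complete', 10000), '/data/train', 0) == \
    _plasma_object_key_sketch((512, 1, 'complete', 10000), '/data/train', 0)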
def _establish_virtual_datasets(self):
    if self.sample_ratios is None and self._cur_indices is not None:
        # not a sampling dataset; no need to resample if indices are already established
        return

    self._reset_cached_properties()

    start_time = time.time()
    # Generate a weighted sample of indices as a function of the
    # random seed and the current epoch.
    rng = np.random.RandomState([
        int(
            hashlib.sha1(
                str(self.__class__.__name__).encode('utf-8')
            ).hexdigest(), 16) % (2**32),
        self.seed % (2**32),  # global seed
        self._cur_epoch,  # epoch index
    ])

    indices, cumulated_sizes, virtual_size_per_dataset = self.get_virtual_indices(
        rng, self.datasets, self.sample_ratios, self.virtual_size)
    self._clean_if_not_none(
        [self.cumulated_sizes, self.virtual_size_per_dataset])
    self._cur_indices = plasma_utils.PlasmaArray(indices)
    self.cumulated_sizes = plasma_utils.PlasmaArray(cumulated_sizes)
    self.virtual_size_per_dataset = plasma_utils.PlasmaArray(
        virtual_size_per_dataset)

    raw_sizes = [len(d) for d in self.datasets]
    sampled_sizes = self.virtual_size_per_dataset.array
    logger.info(
        f'[{self.split}] Raw sizes: {str(dict(zip(self.keys, raw_sizes)))}; '
        f'raw total size: {sum(raw_sizes)}')
    logger.info(
        f'[{self.split}] Resampled sizes: {str(dict(zip(self.keys, sampled_sizes)))}; '
        f'resampled total size: {sum(sampled_sizes)}')
    if self.sample_ratios is not None:
        logger.info(
            f'[{self.split}] Upsampling ratios: {str(dict(zip(self.keys, self.sample_ratios.array)))}'
        )
    else:
        logger.info(f'[{self.split}] A concat dataset')
    logger.debug(
        f'[{self.split}] virtual dataset established time: {get_time_gap(start_time, time.time())}'
    )
def setup_sampling(self, sample_ratios, virtual_size):
    sizes = [len(d) for d in self.datasets]
    if sample_ratios is None:
        # default back to concatenating datasets
        self.sample_ratios = None
        self.virtual_size = sum(sizes)
    else:
        if not isinstance(sample_ratios, np.ndarray):
            sample_ratios = np.array(sample_ratios)
        self.sample_ratios = plasma_utils.PlasmaArray(sample_ratios)
        virtual_size = default_virtual_size_func if virtual_size is None else virtual_size
        self.virtual_size = (
            virtual_size(self.datasets, self.sample_ratios.array)
            if callable(virtual_size)
            else virtual_size
        )
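# NOTE (illustration): `virtual_size` may be a number or a callable taking
# (datasets, ratios). The body below is a hedged sketch of one plausible
# callable, not the actual `default_virtual_size_func`: anchor on the largest
# dataset and scale every dataset by its sampling ratio.
import numpy as np

def _example_virtual_size_func(datasets, ratios):
    sizes = np.array([len(d) for d in datasets], dtype=np.float64)
    largest = np.argmax(sizes)
    # each dataset contributes in proportion to its ratio, normalized so the
    # largest dataset is seen roughly once per virtual epoch
    virtual_sizes = (ratios / ratios[largest]) * sizes[largest]
    return int(virtual_sizes.sum())

# e.g. datasets of 1000 and 100 items with ratios [1, 2]: the small dataset
# is upsampled to 2000 virtual items, for a virtual size of 3000.
assert _example_virtual_size_func([range(1000), range(100)], np.array([1.0, 2.0])) == 3000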
def ordered_indices(self):
    if self._epoch_ordered_indices is not None:
        return self._epoch_ordered_indices.array

    if self.batch_by_size:
        # No need to shuffle, as the data items are already randomized
        indices = np.arange(len(self))
        sizes = self.sizes
        tgt_sizes = sizes[:, 1] if len(sizes.shape) > 1 and sizes.shape[1] > 1 else None
        src_sizes = sizes[:, 0] if len(sizes.shape) > 1 and sizes.shape[1] > 1 else sizes
        # sort by target length, then source length
        if tgt_sizes is not None:
            indices = indices[np.argsort(tgt_sizes[indices], kind='mergesort')]
        sort_indices = indices[np.argsort(src_sizes[indices], kind='mergesort')]
    else:
        sort_indices = np.arange(len(self))

    self._epoch_ordered_indices = plasma_utils.PlasmaArray(sort_indices)
    return self._epoch_ordered_indices.array
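# NOTE (illustration): the two chained argsorts above depend on mergesort
# being stable: sorting by target length first and source length second
# yields indices ordered by (source length, target length), with ties in
# source length preserving the target order. Standalone check:
import numpy as np

src = np.array([5, 3, 5, 3])
tgt = np.array([9, 2, 1, 8])
idx = np.arange(4)
idx = idx[np.argsort(tgt[idx], kind='mergesort')]  # by target length
idx = idx[np.argsort(src[idx], kind='mergesort')]  # then by source (stable)
assert list(idx) == [1, 3, 2, 0]  # (3,2), (3,8), (5,1), (5,9)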
def sizes(self):
    if self._epoch_sizes is not None:
        return self._epoch_sizes.array

    start_time = time.time()
    size_cache = self._size_cache
    ret = []
    for i in range(len(self)):
        index = self._map_epoch_index_to_global(i)
        ds_idx, ds_sample_idx = self._get_dataset_and_index(index)
        if (ds_idx, ds_sample_idx) in size_cache:
            ret.append(size_cache[(ds_idx, ds_sample_idx)])
        else:
            s = self.datasets[ds_idx].size(ds_sample_idx)
            s = (s, s) if not isinstance(s, tuple) else s
            size_cache[(ds_idx, ds_sample_idx)] = s
            ret.append(s)

    self._epoch_sizes = plasma_utils.PlasmaArray(np.array(ret, np.int64))
    logger.info(f'sizes() calling time: {get_time_gap(start_time, time.time())}')
    return self._epoch_sizes.array
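# NOTE (illustration): `size()` may return a scalar (monolingual data) or a
# (src, tgt) tuple (paired data); the loop above normalizes both to tuples so
# np.array(ret, np.int64) is rectangular. The normalization in isolation:
s = 17
s = (s, s) if not isinstance(s, tuple) else s
assert s == (17, 17)
t = (17, 23)
t = (t, t) if not isinstance(t, tuple) else t
assert t == (17, 23)  # tuples pass through unchanged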
def __init__(
    self,
    dataset,
    weights=None,
    replace=True,
    size_ratio=1.0,
    batch_by_size=True,
    seed=0,
    epoch=1,
):
    super().__init__(dataset)

    if weights is None:
        self.weights = None
    else:
        assert len(weights) == len(dataset)
        weights_arr = np.array(weights, dtype=np.float64)
        weights_arr /= weights_arr.sum()
        self.weights = plasma_utils.PlasmaArray(weights_arr)

    self.replace = replace

    assert size_ratio > 0.0
    if not self.replace:
        assert size_ratio < 1.0
    self.size_ratio = float(size_ratio)
    self.actual_size = np.ceil(len(dataset) * self.size_ratio).astype(int)

    self.batch_by_size = batch_by_size
    self.seed = seed

    self._cur_epoch = None
    self._cur_indices = None

    self.set_epoch(epoch)
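# NOTE (illustration): the `size_ratio < 1.0` assertion when replace=False
# mirrors NumPy's own constraint: without replacement a sample can never be
# larger than the pool, while replace=True permits upsampling. Standalone:
import numpy as np

rng = np.random.RandomState(0)
assert len(rng.choice(100, 150, replace=True)) == 150  # upsampling is fine
try:
    rng.choice(100, 150, replace=False)
except ValueError:
    pass  # cannot take a larger sample than the population without replacement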
def __init__(
    self,
    dataset,
    sizes,
    block_size,
    pad,
    eos,
    break_mode=None,
    include_targets=False,
    document_sep_len=1,
):
    try:
        from fairseq.data.token_block_utils_fast import (
            _get_slice_indices_fast,
            _get_block_to_dataset_index_fast,
        )
    except ImportError:
        raise ImportError(
            "Please build Cython components with: `pip install --editable .` "
            "or `python setup.py build_ext --inplace`")

    super().__init__()
    self.dataset = dataset
    self.pad = pad
    self.eos = eos
    self.include_targets = include_targets

    assert len(dataset) == len(sizes)
    assert len(dataset) > 0

    if isinstance(sizes, list):
        sizes = np.array(sizes, dtype=np.int64)
    else:
        if torch.is_tensor(sizes):
            sizes = sizes.numpy()
        sizes = sizes.astype(np.int64)

    break_mode = break_mode if break_mode is not None else "none"

    # For the "eos" break mode, block_size is not a required parameter.
    if break_mode == "eos" and block_size is None:
        block_size = 0

    slice_indices = _get_slice_indices_fast(
        sizes, str(break_mode), block_size, document_sep_len)
    self._sizes = slice_indices[:, 1] - slice_indices[:, 0]

    # build index mapping block indices to the underlying dataset indices
    if break_mode == "eos":
        # much faster version for eos break mode
        block_to_dataset_index = np.stack(
            [
                np.arange(len(sizes)),  # starting index in dataset
                np.zeros(len(sizes), dtype=np.int64),  # starting offset within starting index
                np.arange(len(sizes)),  # ending index in dataset
            ],
            1,
        )
    else:
        block_to_dataset_index = _get_block_to_dataset_index_fast(
            sizes,
            slice_indices,
        )

    size_dtype = np.uint16 if block_size < 65535 else np.uint32
    slice_indices_dtype = best_fitting_int_dtype(slice_indices[-1].max())

    self._slice_indices = plasma_utils.PlasmaArray(
        slice_indices.astype(slice_indices_dtype))
    self._sizes = plasma_utils.PlasmaArray(self._sizes.astype(size_dtype))
    self._block_to_dataset_index = plasma_utils.PlasmaArray(
        block_to_dataset_index.astype(slice_indices_dtype))
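# NOTE (illustration): the dtype downcasts above trade range for memory:
# every block length fits in uint16 whenever block_size < 65535, and
# `best_fitting_int_dtype` picks the narrowest integer type that can hold the
# largest slice offset. A hedged sketch of that selection (the real helper
# lives elsewhere in fairseq and may differ in its exact thresholds):
import numpy as np

def _best_fitting_int_dtype_sketch(max_int_to_represent):
    if max_int_to_represent is None:
        return np.uint32  # safe default
    elif max_int_to_represent < np.iinfo(np.uint16).max:
        return np.uint16
    elif max_int_to_represent < np.iinfo(np.uint32).max:
        return np.uint32
    else:
        return np.int64

assert _best_fitting_int_dtype_sketch(1_000) == np.uint16
assert _best_fitting_int_dtype_sketch(100_000) == np.uint32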
def __init__(
    self,
    dataset,
    sizes,
    block_size,
    pad,
    eos,
    break_mode=None,
    include_targets=False,
    document_sep_len=1,
):
    super().__init__()
    self.dataset = dataset
    self.pad = pad
    self.eos = eos
    self.include_targets = include_targets
    slice_indices = []

    assert len(dataset) == len(sizes)
    assert len(dataset) > 0
    sizes = np.array(sizes, dtype=int)

    if break_mode is None or break_mode == 'none':
        total_size = sum(sizes)
        length = math.ceil(total_size / block_size)

        def block_at(i):
            start = i * block_size
            end = min(start + block_size, total_size)
            return (start, end)

        slice_indices = [block_at(i) for i in range(length)]
    elif break_mode == 'complete':
        tok_idx = 0
        sz_idx = 0
        curr_size = 0
        while sz_idx < len(sizes):
            if curr_size + sizes[sz_idx] <= block_size or curr_size == 0:
                curr_size += sizes[sz_idx]
                sz_idx += 1
            else:
                slice_indices.append((tok_idx, tok_idx + curr_size))
                tok_idx += curr_size
                curr_size = 0
        if curr_size > 0:
            slice_indices.append((tok_idx, tok_idx + curr_size))
    elif break_mode == 'complete_doc':
        tok_idx = 0
        sz_idx = 0
        curr_size = 0
        while sz_idx < len(sizes):
            if (
                (curr_size + sizes[sz_idx] <= block_size or curr_size == 0)
                # an empty sentence indicates end-of-document:
                and sizes[sz_idx] != document_sep_len
            ):
                curr_size += sizes[sz_idx]
                sz_idx += 1
            else:
                slice_indices.append((tok_idx, tok_idx + curr_size))
                tok_idx += curr_size
                curr_size = 0
                if sizes[sz_idx] == document_sep_len:
                    tok_idx += sizes[sz_idx]
                    sz_idx += 1
        if curr_size > 0:
            slice_indices.append((tok_idx, tok_idx + curr_size))
    elif break_mode == 'eos':
        slice_indices = np.empty((len(sizes), 2), dtype=int)
        if not torch.is_tensor(sizes):
            sizes = torch.tensor(sizes)
        cumsum = torch.cumsum(sizes, dim=0)
        slice_indices[0] = [0, sizes[0]]
        if len(cumsum) > 1:
            slice_indices[1:] = cumsum.unfold(0, 2, 1)
    else:
        raise ValueError('Invalid break_mode: ' + break_mode)

    slice_indices = np.array(slice_indices, dtype=int)
    self._sizes = slice_indices[:, 1] - slice_indices[:, 0]

    # build index mapping block indices to the underlying dataset indices
    if break_mode == 'eos':
        # much faster version for eos break mode
        block_to_dataset_index = np.stack(
            [
                np.arange(len(sizes)),  # starting index in dataset
                np.zeros(len(sizes), dtype=np.int64),  # starting offset within starting index
                np.arange(len(sizes)),  # ending index in dataset
            ],
            1,
        )
    else:
        ds = DatasetSearcher(sizes)
        block_to_dataset_index = np.empty((len(slice_indices), 3), dtype=int)
        for i, (s, e) in enumerate(slice_indices):
            ds.seek(s)
            start_ds_idx = ds.current_index
            start_offset = ds.current_offset
            if e <= s:
                continue
            ds.seek(e - 1)
            end_ds_idx = ds.current_index
            block_to_dataset_index[i] = (
                start_ds_idx,  # starting index in dataset
                start_offset,  # starting offset within starting index
                end_ds_idx,  # ending index in dataset
            )

    self._slice_indices = plasma_utils.PlasmaArray(slice_indices)
    self._sizes = plasma_utils.PlasmaArray(self._sizes)
    self._block_to_dataset_index = plasma_utils.PlasmaArray(block_to_dataset_index)
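# NOTE (illustration): the 'eos' branch above derives one block per sentence
# from the cumulative sum: row i is [cumsum[i-1], cumsum[i]], and
# unfold(0, 2, 1) views the cumsum as overlapping pairs, i.e. the
# (start, end) of every sentence after the first. Standalone check:
import numpy as np
import torch

sizes = torch.tensor([5, 3, 7])
slice_indices = np.empty((len(sizes), 2), dtype=int)
cumsum = torch.cumsum(sizes, dim=0)             # [5, 8, 15]
slice_indices[0] = [0, sizes[0]]
if len(cumsum) > 1:
    slice_indices[1:] = cumsum.unfold(0, 2, 1)  # [[5, 8], [8, 15]]
assert slice_indices.tolist() == [[0, 5], [5, 8], [8, 15]]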
def __init__(
    self,
    dataset,
    sizes,
    block_size,
    pad,
    eos,
    break_mode='complete_doc',
    include_targets=False,
    document_sep_len=1,
    context_mode='doc',
    window_size=3,
):
    try:
        from fairseq.data.token_block_utils_fast import (
            _get_slice_indices_fast,
            _get_block_to_dataset_index_fast,
        )
    except ImportError:
        raise ImportError(
            'Please build Cython components with: `pip install --editable .` '
            'or `python setup.py build_ext --inplace`')

    super().__init__()
    self.dataset = dataset
    self.pad = pad
    self.eos = eos
    self.include_targets = include_targets

    assert len(dataset) == len(sizes)
    assert len(dataset) > 0

    if isinstance(sizes, list):
        sizes = np.array(sizes, dtype=np.int64)
    else:
        sizes = sizes.astype(np.int64)

    break_mode = break_mode if break_mode is not None else 'none'

    # For the "eos" break mode, block_size is not a required parameter.
    if break_mode == "eos" and block_size is None:
        block_size = 0

    slice_indices = _get_slice_indices_fast(
        sizes, break_mode, block_size, document_sep_len)

    # build index mapping block indices to the underlying dataset indices
    if break_mode == "eos":
        # much faster version for eos break mode
        block_to_dataset_index = np.stack(
            [
                np.arange(len(sizes)),  # starting index in dataset
                np.zeros(len(sizes), dtype=np.int64),  # starting offset within starting index
                np.arange(len(sizes)),  # ending index in dataset
            ],
            1,
        )
    else:
        block_to_dataset_index = _get_block_to_dataset_index_fast(
            sizes,
            slice_indices,
        )

    self._slice_indices = plasma_utils.PlasmaArray(slice_indices)
    self._block_to_dataset_index = plasma_utils.PlasmaArray(block_to_dataset_index)

    self.context_mode = context_mode
    self.window_size = window_size
    context_index, _sizes = self.rebuild_index()
    self._context_index = plasma_utils.PlasmaArray(np.array(context_index))
    self._sizes = plasma_utils.PlasmaArray(np.array(_sizes))
def __init__(
    self,
    dataset,
    sizes,
    block_sizes,
    pad,
    eos,
    document_sep_len=1,
):
    try:
        from fairseq.data.token_block_utils_fast import (
            _get_slice_indices_fast,
            _get_block_to_dataset_index_fast,
        )
    except ImportError:
        raise ImportError(
            'Please build Cython components with: `pip install --editable .` '
            'or `python setup.py build_ext --inplace`')

    super().__init__()
    self.dataset = dataset
    self.pad = pad
    self.eos = eos

    assert len(dataset) == len(sizes)
    assert len(dataset) > 0

    if isinstance(sizes, list):
        sizes = np.array(sizes, dtype=np.int64)
    else:
        if torch.is_tensor(sizes):
            sizes = sizes.numpy()
        sizes = sizes.astype(np.int64)

    assert min(block_sizes) > 0
    block_sizes = [0] + block_sizes

    slice_indices_list = []
    sizes_list = []
    block_to_dataset_index_list = []
    number_of_inst_in_block = []
    for block_size in block_sizes:
        break_mode = "eos" if block_size == 0 else "complete"
        slice_indices = _get_slice_indices_fast(
            sizes, break_mode, block_size, document_sep_len)
        slice_indices_list.append(slice_indices)
        sizes_list.append(slice_indices[:, 1] - slice_indices[:, 0])
        number_of_inst_in_block.append(len(slice_indices))

        # build index mapping block indices to the underlying dataset indices
        if break_mode == "eos":
            # much faster version for eos break mode
            block_to_dataset_index = np.stack(
                [
                    np.arange(len(sizes)),  # starting index in dataset
                    np.zeros(len(sizes), dtype=np.int64),  # starting offset within starting index
                    np.arange(len(sizes)),  # ending index in dataset
                ],
                1,
            )
        else:
            block_to_dataset_index = _get_block_to_dataset_index_fast(
                sizes, slice_indices)
        block_to_dataset_index_list.append(block_to_dataset_index)

    self._sizes = np.concatenate(sizes_list)
    self._slice_indices = np.concatenate(slice_indices_list, axis=0)
    self._block_to_dataset_index = np.concatenate(
        block_to_dataset_index_list, axis=0)
    self._number_of_inst_in_block = np.array(number_of_inst_in_block, dtype=np.int64)

    self._slice_indices = plasma_utils.PlasmaArray(self._slice_indices)
    self._sizes = plasma_utils.PlasmaArray(self._sizes)
    self._block_to_dataset_index = plasma_utils.PlasmaArray(self._block_to_dataset_index)
    self._number_of_inst_in_block = plasma_utils.PlasmaArray(self._number_of_inst_in_block)
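# NOTE (illustration): because the per-block-size indices are concatenated,
# one flat index space covers blocks of every granularity: the first
# number_of_inst_in_block[0] entries are the sentence-level ('eos') blocks,
# followed by one contiguous run per original block size. A sketch of a
# hypothetical helper recovering the block size of a flat index:
import numpy as np

def _block_size_of_index(idx, number_of_inst_in_block, block_sizes):
    # block_sizes includes the prepended 0 for 'eos' mode, as in __init__
    boundaries = np.cumsum(number_of_inst_in_block)
    group = int(np.searchsorted(boundaries, idx, side='right'))
    return block_sizes[group]

# e.g. counts [10, 4, 2] for block_sizes [0, 128, 512]:
assert _block_size_of_index(9, [10, 4, 2], [0, 128, 512]) == 0     # 'eos'
assert _block_size_of_index(10, [10, 4, 2], [0, 128, 512]) == 128
assert _block_size_of_index(15, [10, 4, 2], [0, 128, 512]) == 512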