def __init__(
    self,
    annotated_text,
    max_positions,
    pad,
    eos,
    document_sep_len,
    seed,
):
    super().__init__()
    self.dataset = annotated_text
    self.pad = pad
    self.eos = eos
    self.max_positions = max_positions

    assert len(self.dataset) > 0

    sizes = annotated_text.sizes.astype(np.int64)
    slice_indices = _get_slice_indices_fast(
        sizes=sizes,
        break_mode='complete_doc',
        block_size=self.max_positions,
        document_sep_len=document_sep_len,
    )
    _sizes = slice_indices[:, 1] - slice_indices[:, 0]

    self._slice_indices = maybe_move_to_plasma(slice_indices)
    self._sizes = maybe_move_to_plasma(_sizes)

    self.seed = seed
    self.epoch = 0

def _build_slice_indices(
    sizes, break_mode, document_sep_len, block_size
) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
    """Use token_block_utils_fast to build arrays for indexing into self.dataset."""
    try:
        from fairseq.data.token_block_utils_fast import (
            _get_slice_indices_fast,
            _get_block_to_dataset_index_fast,
        )
    except ImportError:
        raise ImportError(
            "Please build Cython components with: `pip install --editable .` "
            "or `python setup.py build_ext --inplace`"
        )

    if isinstance(sizes, list):
        sizes = np.array(sizes, dtype=np.int64)
    else:
        if torch.is_tensor(sizes):
            sizes = sizes.numpy()
        sizes = sizes.astype(np.int64)

    break_mode = break_mode if break_mode is not None else "none"

    # For the "eos" break-mode, block_size is not a required parameter.
    if break_mode == "eos" and block_size is None:
        block_size = 0

    slice_indices = _get_slice_indices_fast(
        sizes, str(break_mode), block_size, document_sep_len
    )
    _sizes = slice_indices[:, 1] - slice_indices[:, 0]

    # build index mapping block indices to the underlying dataset indices
    if break_mode == "eos":
        # much faster version for eos break mode
        block_to_dataset_index = np.stack(
            [
                np.arange(len(sizes)),  # starting index in dataset
                np.zeros(
                    len(sizes), dtype=np.int64
                ),  # starting offset within starting index
                np.arange(len(sizes)),  # ending index in dataset
            ],
            1,
        )
    else:
        block_to_dataset_index = _get_block_to_dataset_index_fast(
            sizes,
            slice_indices,
        )

    # shrink arrays to the smallest integer dtypes that can represent them
    size_dtype = np.uint16 if block_size < 65535 else np.uint32
    num_tokens = slice_indices[-1].max()
    slice_indices_dtype = best_fitting_int_dtype(num_tokens)

    slice_indices = slice_indices.astype(slice_indices_dtype)
    _sizes = _sizes.astype(size_dtype)
    block_to_dataset_index = block_to_dataset_index.astype(slice_indices_dtype)

    return _sizes, block_to_dataset_index, slice_indices

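# --- Usage sketch (added for illustration; not part of the original source) ---
# A minimal example of how the helper above might be called, assuming numpy is
# imported as `np`, the Cython extension is built, and `best_fitting_int_dtype`
# is in scope (in fairseq it is defined alongside the indexed-dataset utilities;
# treat the exact import path as an assumption). The sizes and block_size values
# are made up.
#
# token_sizes = np.array([13, 7, 21, 5], dtype=np.int64)  # tokens per sentence
# _sizes, block_to_dataset_index, slice_indices = _build_slice_indices(
#     sizes=token_sizes,
#     break_mode="complete",   # pack whole sentences up to block_size tokens
#     document_sep_len=1,
#     block_size=512,
# )
# # slice_indices[i]         = [start, end) token offsets of block i in the flattened stream
# # _sizes[i]                = number of tokens in block i
# # block_to_dataset_index[i] = (first sentence idx, offset within it, last sentence idx)
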
def __init__(
    self,
    dataset,
    sizes,
    block_size,
    pad,
    eos,
    break_mode=None,
    include_targets=False,
    document_sep_len=1,
):
    try:
        from fairseq.data.token_block_utils_fast import (
            _get_slice_indices_fast,
            _get_block_to_dataset_index_fast,
        )
    except ImportError:
        raise ImportError(
            'Please build Cython components with: `pip install --editable .` '
            'or `python setup.py build_ext --inplace`'
        )

    super().__init__()
    self.dataset = dataset
    self.pad = pad
    self.eos = eos
    self.include_targets = include_targets

    assert len(dataset) == len(sizes)
    assert len(dataset) > 0

    if isinstance(sizes, list):
        sizes = np.array(sizes, dtype=np.int64)
    else:
        sizes = sizes.astype(np.int64)

    break_mode = break_mode if break_mode is not None else 'none'

    # For the "eos" break-mode, block_size is not a required parameter.
    if break_mode == "eos" and block_size is None:
        block_size = 0

    slice_indices = _get_slice_indices_fast(
        sizes, break_mode, block_size, document_sep_len
    )
    self._sizes = slice_indices[:, 1] - slice_indices[:, 0]

    # build index mapping block indices to the underlying dataset indices
    if break_mode == "eos":
        # much faster version for eos break mode
        block_to_dataset_index = np.stack(
            [
                np.arange(len(sizes)),  # starting index in dataset
                np.zeros(
                    len(sizes), dtype=np.int64
                ),  # starting offset within starting index
                np.arange(len(sizes)),  # ending index in dataset
            ],
            1,
        )
    else:
        block_to_dataset_index = _get_block_to_dataset_index_fast(
            sizes,
            slice_indices,
        )

    self._slice_indices = plasma_utils.PlasmaArray(slice_indices)
    self._sizes = plasma_utils.PlasmaArray(self._sizes)
    self._block_to_dataset_index = plasma_utils.PlasmaArray(
        block_to_dataset_index
    )

def __init__(
    self,
    dataset,
    sizes,
    block_size,
    pad=0,
    eos=2,
    include_targets=False,
    break_mode=None,
    document_sep_len=1,
    two_inputs=False,
):
    try:
        from fairseq.data.token_block_utils_fast import (
            _get_slice_indices_fast,
            _get_block_to_dataset_index_fast,
        )
    except ImportError:
        raise ImportError(
            'Please build Cython components with: `pip install --editable .` '
            'or `python setup.py build_ext --inplace`'
        )

    super().__init__()
    self.dataset = dataset
    self.two_inputs = two_inputs

    assert len(dataset) == len(sizes)
    assert len(dataset) > 0

    if isinstance(sizes, list):
        sizes = np.array(sizes, dtype=np.int64)
    else:
        if torch.is_tensor(sizes):
            sizes = sizes.numpy()
        sizes = sizes.astype(np.int64)

    break_mode = break_mode if break_mode is not None else 'none'

    slice_indices = _get_slice_indices_fast(
        sizes, break_mode, block_size, document_sep_len
    )
    self._sizes = slice_indices[:, 1] - slice_indices[:, 0]

    # build index mapping block indices to the underlying dataset indices
    block_to_dataset_index = _get_block_to_dataset_index_fast(
        sizes,
        slice_indices,
    )

    self._slice_indices = plasma_utils.PlasmaArray(slice_indices)
    self._sizes = plasma_utils.PlasmaArray(self._sizes)
    self._block_to_dataset_index = plasma_utils.PlasmaArray(
        block_to_dataset_index
    )

    self.pad = pad
    self.eos = eos
    self.include_targets = include_targets

def __init__(
    self,
    dataset,
    sizes,
    block_size,
    pad,
    eos,
    break_mode=None,
    include_targets=False,
    document_sep_len=1,
):
    try:
        from fairseq.data.token_block_utils_fast import (
            _get_slice_indices_fast,
            _get_block_to_dataset_index_fast,
        )
    except ImportError:
        raise ImportError(
            "Please build Cython components with: `pip install --editable .` "
            "or `python setup.py build_ext --inplace`"
        )

    super().__init__()
    self.dataset = dataset
    self.pad = pad
    self.eos = eos
    self.include_targets = include_targets

    assert len(dataset) == len(sizes)
    assert len(dataset) > 0

    if isinstance(sizes, list):
        sizes = np.array(sizes, dtype=np.int64)
    else:
        if torch.is_tensor(sizes):
            sizes = sizes.numpy()
        sizes = sizes.astype(np.int64)

    break_mode = break_mode if break_mode is not None else "none"

    # For the "eos" break-mode, block_size is not a required parameter.
    if break_mode == "eos" and block_size is None:
        block_size = 0

    slice_indices = _get_slice_indices_fast(
        sizes, str(break_mode), block_size, document_sep_len
    )
    self._sizes = slice_indices[:, 1] - slice_indices[:, 0]

    # build index mapping block indices to the underlying dataset indices
    if break_mode == "eos":
        # much faster version for eos break mode
        block_to_dataset_index = np.stack(
            [
                np.arange(len(sizes)),  # starting index in dataset
                np.zeros(
                    len(sizes), dtype=np.int64
                ),  # starting offset within starting index
                np.arange(len(sizes)),  # ending index in dataset
            ],
            1,
        )
    else:
        block_to_dataset_index = _get_block_to_dataset_index_fast(
            sizes,
            slice_indices,
        )

    # store arrays in the smallest integer dtypes that can represent them
    size_dtype = np.uint16 if block_size < 65535 else np.uint32
    slice_indices_dtype = best_fitting_int_dtype(slice_indices[-1].max())

    self._slice_indices = plasma_utils.PlasmaArray(
        slice_indices.astype(slice_indices_dtype)
    )
    self._sizes = plasma_utils.PlasmaArray(self._sizes.astype(size_dtype))
    self._block_to_dataset_index = plasma_utils.PlasmaArray(
        block_to_dataset_index.astype(slice_indices_dtype)
    )

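# --- Usage sketch (added; an assumption, not from the original source) ---
# The variant above closely matches fairseq's TokenBlockDataset, so this shows a
# minimal, self-contained construction through the public fairseq API. The toy
# sentences, block_size=6, and break_mode="complete" are illustrative choices.
import numpy as np
import torch
from fairseq.data import Dictionary, TokenBlockDataset

vocab = Dictionary()                      # default specials: pad()=1, eos()=2
for w in ["hello", "world"]:
    vocab.add_symbol(w)                   # new symbols get indices 4, 5

# underlying "dataset": a list of eos-terminated token tensors
sents = [
    torch.LongTensor([4, 5, vocab.eos()]),
    torch.LongTensor([5, 4, 4, vocab.eos()]),
    torch.LongTensor([4, vocab.eos()]),
]
sizes = np.array([len(s) for s in sents], dtype=np.int64)

blocks = TokenBlockDataset(
    sents,
    sizes,
    block_size=6,                         # max tokens packed into one block
    pad=vocab.pad(),
    eos=vocab.eos(),
    break_mode="complete",                # keep sentences whole within a block
    include_targets=False,
)
print(len(blocks), blocks[0])             # number of blocks and the first block's tokens
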
def __init__(
    self,
    dataset,
    sizes,
    block_size,
    pad,
    eos,
    break_mode='complete_doc',
    include_targets=False,
    document_sep_len=1,
    context_mode='doc',
    window_size=3,
):
    try:
        from fairseq.data.token_block_utils_fast import (
            _get_slice_indices_fast,
            _get_block_to_dataset_index_fast,
        )
    except ImportError:
        raise ImportError(
            'Please build Cython components with: `pip install --editable .` '
            'or `python setup.py build_ext --inplace`'
        )

    super().__init__()
    self.dataset = dataset
    self.pad = pad
    self.eos = eos
    self.include_targets = include_targets

    assert len(dataset) == len(sizes)
    assert len(dataset) > 0

    if isinstance(sizes, list):
        sizes = np.array(sizes, dtype=np.int64)
    else:
        sizes = sizes.astype(np.int64)

    break_mode = break_mode if break_mode is not None else 'none'

    # For the "eos" break-mode, block_size is not a required parameter.
    if break_mode == "eos" and block_size is None:
        block_size = 0

    slice_indices = _get_slice_indices_fast(
        sizes, break_mode, block_size, document_sep_len
    )

    # build index mapping block indices to the underlying dataset indices
    if break_mode == "eos":
        # much faster version for eos break mode
        block_to_dataset_index = np.stack(
            [
                np.arange(len(sizes)),  # starting index in dataset
                np.zeros(
                    len(sizes), dtype=np.int64
                ),  # starting offset within starting index
                np.arange(len(sizes)),  # ending index in dataset
            ],
            1,
        )
    else:
        block_to_dataset_index = _get_block_to_dataset_index_fast(
            sizes,
            slice_indices,
        )

    self._slice_indices = plasma_utils.PlasmaArray(slice_indices)
    self._block_to_dataset_index = plasma_utils.PlasmaArray(
        block_to_dataset_index
    )

    self.context_mode = context_mode
    self.window_size = window_size
    context_index, _sizes = self.rebuild_index()
    self._context_index = plasma_utils.PlasmaArray(np.array(context_index))
    self._sizes = plasma_utils.PlasmaArray(np.array(_sizes))

def __init__(
    self,
    dataset,
    sizes,
    block_sizes,
    pad,
    eos,
    document_sep_len=1,
):
    try:
        from fairseq.data.token_block_utils_fast import (
            _get_slice_indices_fast,
            _get_block_to_dataset_index_fast,
        )
    except ImportError:
        raise ImportError(
            'Please build Cython components with: `pip install --editable .` '
            'or `python setup.py build_ext --inplace`'
        )

    super().__init__()
    self.dataset = dataset
    self.pad = pad
    self.eos = eos

    assert len(dataset) == len(sizes)
    assert len(dataset) > 0

    if isinstance(sizes, list):
        sizes = np.array(sizes, dtype=np.int64)
    else:
        if torch.is_tensor(sizes):
            sizes = sizes.numpy()
        sizes = sizes.astype(np.int64)

    assert min(block_sizes) > 0
    # block size 0 is handled via the "eos" break mode (one sentence per block)
    block_sizes = [0] + block_sizes

    slice_indices_list = []
    sizes_list = []
    block_to_dataset_index_list = []
    number_of_inst_in_block = []

    for block_size in block_sizes:
        break_mode = "eos" if block_size == 0 else "complete"
        slice_indices = _get_slice_indices_fast(
            sizes, break_mode, block_size, document_sep_len
        )
        slice_indices_list.append(slice_indices)
        sizes_list.append(slice_indices[:, 1] - slice_indices[:, 0])
        number_of_inst_in_block.append(len(slice_indices))

        # build index mapping block indices to the underlying dataset indices
        if break_mode == "eos":
            # much faster version for eos break mode
            block_to_dataset_index = np.stack(
                [
                    np.arange(len(sizes)),  # starting index in dataset
                    np.zeros(
                        len(sizes), dtype=np.int64
                    ),  # starting offset within starting index
                    np.arange(len(sizes)),  # ending index in dataset
                ],
                1,
            )
        else:
            block_to_dataset_index = _get_block_to_dataset_index_fast(
                sizes, slice_indices
            )
        block_to_dataset_index_list.append(block_to_dataset_index)

    self._sizes = np.concatenate(sizes_list)
    self._slice_indices = np.concatenate(slice_indices_list, axis=0)
    self._block_to_dataset_index = np.concatenate(
        block_to_dataset_index_list, axis=0
    )
    self._number_of_inst_in_block = np.array(number_of_inst_in_block, dtype=np.int64)

    self._slice_indices = plasma_utils.PlasmaArray(self._slice_indices)
    self._sizes = plasma_utils.PlasmaArray(self._sizes)
    self._block_to_dataset_index = plasma_utils.PlasmaArray(
        self._block_to_dataset_index
    )
    self._number_of_inst_in_block = plasma_utils.PlasmaArray(
        self._number_of_inst_in_block
    )
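
# --- Note on the multi-block-size variant above (added; hedged illustration) ---
# One index set is built per entry of [0] + block_sizes and then concatenated, so
# _number_of_inst_in_block records how many blocks each pass produced. A consumer
# could map a flat block index back to its block-size bucket with a cumulative
# sum; how the original class actually uses this array is not shown here, so
# treat the sketch below as an assumption.
#
# boundaries = np.cumsum(self._number_of_inst_in_block.array)
# bucket = int(np.searchsorted(boundaries, flat_index, side="right"))
# # bucket 0     -> the "eos" pass (one sentence per block)
# # bucket k > 0 -> block_sizes[k - 1] from the caller's list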