コード例 #1
0
    def __init__(
        self,
        annotated_text,
        max_positions,
        pad,
        eos,
        document_sep_len,
        seed,
    ):
        """Wrap an annotated-text dataset and pre-compute block boundaries.

        Blocks are cut in ``complete_doc`` mode, i.e. only at document
        boundaries, with at most ``max_positions`` tokens per block.
        """
        super().__init__()
        self.dataset = annotated_text
        self.pad = pad
        self.eos = eos
        self.max_positions = max_positions
        self.seed = seed
        self.epoch = 0

        assert len(self.dataset) > 0

        # [start, end) token offsets of each block over the flattened dataset.
        doc_sizes = annotated_text.sizes.astype(np.int64)
        boundaries = _get_slice_indices_fast(
            sizes=doc_sizes,
            break_mode='complete_doc',
            block_size=self.max_positions,
            document_sep_len=document_sep_len,
        )
        block_lengths = boundaries[:, 1] - boundaries[:, 0]

        # Optionally park the (potentially large) index arrays in plasma
        # shared memory so dataloader workers don't each hold a copy.
        self._slice_indices = maybe_move_to_plasma(boundaries)
        self._sizes = maybe_move_to_plasma(block_lengths)
コード例 #2
0
    def _build_slice_indices(
        sizes, break_mode, document_sep_len, block_size
    ) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
        """Use token_block_utils_fast to build arrays for indexing into self.dataset.

        Args:
            sizes: per-item token counts (list, numpy array, or torch tensor).
            break_mode: chunking strategy ("none", "eos", "complete", ...);
                ``None`` is treated as "none".
            document_sep_len: number of separator tokens between documents.
            block_size: max tokens per block (may be ``None`` for "eos" mode).

        Returns:
            Tuple of (_sizes, block_to_dataset_index, slice_indices), each a
            numpy array downcast to a compact integer dtype.

        Raises:
            ImportError: if the fairseq Cython extensions are not built.
        """
        try:
            from fairseq.data.token_block_utils_fast import (
                _get_slice_indices_fast,
                _get_block_to_dataset_index_fast,
            )
        except ImportError:
            raise ImportError(
                "Please build Cython components with: `pip install --editable .` "
                "or `python setup.py build_ext --inplace`"
            )

        # Normalize sizes to an int64 numpy array regardless of input container.
        if isinstance(sizes, list):
            sizes = np.array(sizes, dtype=np.int64)
        else:
            if torch.is_tensor(sizes):
                sizes = sizes.numpy()
            sizes = sizes.astype(np.int64)

        break_mode = break_mode if break_mode is not None else "none"

        # For "eos" break-mode, block_size is not a required parameter.
        if break_mode == "eos" and block_size is None:
            block_size = 0

        slice_indices = _get_slice_indices_fast(
            sizes, str(break_mode), block_size, document_sep_len
        )
        _sizes = slice_indices[:, 1] - slice_indices[:, 0]

        # build index mapping block indices to the underlying dataset indices
        if break_mode == "eos":
            # much faster version for eos break mode: block i maps 1:1 to item i
            block_to_dataset_index = np.stack(
                [
                    np.arange(len(sizes)),  # starting index in dataset
                    np.zeros(
                        # np.compat.long was removed in NumPy >= 1.25; int64 is
                        # the equivalent concrete dtype.
                        len(sizes), dtype=np.int64
                    ),  # starting offset within starting index
                    np.arange(len(sizes)),  # ending index in dataset
                ],
                1,
            )
        else:
            block_to_dataset_index = _get_block_to_dataset_index_fast(
                sizes,
                slice_indices,
            )
        # Downcast to the smallest dtypes that fit, to reduce memory footprint.
        size_dtype = np.uint16 if block_size < 65535 else np.uint32
        num_tokens = slice_indices[-1].max()
        slice_indices_dtype = best_fitting_int_dtype(num_tokens)
        slice_indices = slice_indices.astype(slice_indices_dtype)
        _sizes = _sizes.astype(size_dtype)
        block_to_dataset_index = block_to_dataset_index.astype(slice_indices_dtype)
        return _sizes, block_to_dataset_index, slice_indices
コード例 #3
0
ファイル: token_block_dataset.py プロジェクト: yyht/fairseq
    def __init__(
        self,
        dataset,
        sizes,
        block_size,
        pad,
        eos,
        break_mode=None,
        include_targets=False,
        document_sep_len=1,
    ):
        """Chunk ``dataset`` into token blocks of up to ``block_size`` tokens.

        Args:
            dataset: underlying indexed dataset of token sequences.
            sizes: per-item token counts (list or numpy array).
            block_size: max tokens per block (unused for "eos" break mode).
            pad: padding token id.
            eos: end-of-sentence token id.
            break_mode: chunking strategy; ``None`` is treated as 'none'.
            include_targets: whether __getitem__ should also return targets.
            document_sep_len: number of separator tokens between documents.
        """
        super().__init__()
        self.dataset = dataset
        self.pad = pad
        self.eos = eos
        self.include_targets = include_targets

        assert len(dataset) == len(sizes)
        assert len(dataset) > 0

        if isinstance(sizes, list):
            sizes = np.array(sizes, dtype=np.int64)
        else:
            sizes = sizes.astype(np.int64)

        break_mode = break_mode if break_mode is not None else 'none'

        # For "eos" break-mode, block_size is not a required parameter.
        if break_mode == "eos" and block_size is None:
            block_size = 0

        slice_indices = _get_slice_indices_fast(sizes, break_mode, block_size,
                                                document_sep_len)
        self._sizes = slice_indices[:, 1] - slice_indices[:, 0]

        # build index mapping block indices to the underlying dataset indices
        if break_mode == "eos":
            # much faster version for eos break mode
            block_to_dataset_index = np.stack(
                [
                    np.arange(len(sizes)),  # starting index in dataset
                    # np.long was removed in NumPy 1.24; int64 is equivalent.
                    np.zeros(len(sizes), dtype=np.int64
                             ),  # starting offset within starting index
                    np.arange(len(sizes)),  # ending index in dataset
                ],
                1,
            )
        else:
            block_to_dataset_index = _get_block_to_dataset_index_fast(
                sizes,
                slice_indices,
            )
        # Store index arrays in plasma shared memory so dataloader workers
        # don't each hold a private copy.
        self._slice_indices = plasma_utils.PlasmaArray(slice_indices)
        self._sizes = plasma_utils.PlasmaArray(self._sizes)
        self._block_to_dataset_index = plasma_utils.PlasmaArray(
            block_to_dataset_index)
コード例 #4
0
    def __init__(
        self,
        dataset,
        sizes,
        block_size,
        pad=0,
        eos=2,
        include_targets=False,
        break_mode=None,
        document_sep_len=1,
        two_inputs=False,
    ):
        """Chunk ``dataset`` into token blocks; supports a two-input variant."""
        try:
            from fairseq.data.token_block_utils_fast import (
                _get_slice_indices_fast,
                _get_block_to_dataset_index_fast,
            )
        except ImportError:
            raise ImportError(
                'Please build Cython components with: `pip install --editable .` '
                'or `python setup.py build_ext --inplace`'
            )

        super().__init__()
        self.dataset = dataset
        self.two_inputs = two_inputs
        self.pad = pad
        self.eos = eos
        self.include_targets = include_targets

        assert len(dataset) > 0
        assert len(dataset) == len(sizes)

        # Coerce sizes into an int64 numpy array whatever the input container.
        if isinstance(sizes, list):
            sizes = np.array(sizes, dtype=np.int64)
        elif torch.is_tensor(sizes):
            sizes = sizes.numpy().astype(np.int64)
        else:
            sizes = sizes.astype(np.int64)

        if break_mode is None:
            break_mode = 'none'

        slice_indices = _get_slice_indices_fast(
            sizes, break_mode, block_size, document_sep_len)
        block_lengths = slice_indices[:, 1] - slice_indices[:, 0]

        # Map each block to (start item, offset within start item, end item)
        # in the underlying dataset.
        block_to_dataset_index = _get_block_to_dataset_index_fast(
            sizes,
            slice_indices,
        )

        # Keep index arrays in plasma shared memory to avoid per-worker copies.
        self._slice_indices = plasma_utils.PlasmaArray(slice_indices)
        self._sizes = plasma_utils.PlasmaArray(block_lengths)
        self._block_to_dataset_index = plasma_utils.PlasmaArray(
            block_to_dataset_index)
コード例 #5
0
    def __init__(
        self,
        dataset,
        sizes,
        block_size,
        pad,
        eos,
        break_mode=None,
        include_targets=False,
        document_sep_len=1,
    ):
        """Chunk ``dataset`` into token blocks, storing indices in plasma.

        Args:
            dataset: underlying indexed dataset of token sequences.
            sizes: per-item token counts (list, numpy array, or torch tensor).
            block_size: max tokens per block (may be ``None`` for "eos" mode).
            pad: padding token id.
            eos: end-of-sentence token id.
            break_mode: chunking strategy; ``None`` is treated as "none".
            include_targets: whether __getitem__ should also return targets.
            document_sep_len: number of separator tokens between documents.

        Raises:
            ImportError: if the fairseq Cython extensions are not built.
        """
        try:
            from fairseq.data.token_block_utils_fast import (
                _get_slice_indices_fast,
                _get_block_to_dataset_index_fast,
            )
        except ImportError:
            raise ImportError(
                "Please build Cython components with: `pip install --editable .` "
                "or `python setup.py build_ext --inplace`")

        super().__init__()
        self.dataset = dataset
        self.pad = pad
        self.eos = eos
        self.include_targets = include_targets

        assert len(dataset) == len(sizes)
        assert len(dataset) > 0

        if isinstance(sizes, list):
            sizes = np.array(sizes, dtype=np.int64)
        else:
            if torch.is_tensor(sizes):
                sizes = sizes.numpy()
            sizes = sizes.astype(np.int64)

        break_mode = break_mode if break_mode is not None else "none"

        # For "eos" break-mode, block_size is not a required parameter.
        if break_mode == "eos" and block_size is None:
            block_size = 0

        slice_indices = _get_slice_indices_fast(sizes, str(break_mode),
                                                block_size, document_sep_len)
        self._sizes = slice_indices[:, 1] - slice_indices[:, 0]

        # build index mapping block indices to the underlying dataset indices
        if break_mode == "eos":
            # much faster version for eos break mode
            block_to_dataset_index = np.stack(
                [
                    np.arange(len(sizes)),  # starting index in dataset
                    # np.compat.long was removed in NumPy >= 1.25; int64 is
                    # the equivalent concrete dtype.
                    np.zeros(len(sizes), dtype=np.int64
                             ),  # starting offset within starting index
                    np.arange(len(sizes)),  # ending index in dataset
                ],
                1,
            )
        else:
            block_to_dataset_index = _get_block_to_dataset_index_fast(
                sizes,
                slice_indices,
            )
        # Downcast to the smallest dtypes that fit, to reduce memory footprint.
        size_dtype = np.uint16 if block_size < 65535 else np.uint32
        slice_indices_dtype = best_fitting_int_dtype(slice_indices[-1].max())

        self._slice_indices = plasma_utils.PlasmaArray(
            slice_indices.astype(slice_indices_dtype))
        self._sizes = plasma_utils.PlasmaArray(self._sizes.astype(size_dtype))
        self._block_to_dataset_index = plasma_utils.PlasmaArray(
            block_to_dataset_index.astype(slice_indices_dtype))
コード例 #6
0
    def __init__(
        self,
        dataset,
        sizes,
        block_size,
        pad,
        eos,
        break_mode='complete_doc',
        include_targets=False,
        document_sep_len=1,
        context_mode='doc',
        window_size=3,
    ):
        """Chunk ``dataset`` into token blocks with a context index.

        Args:
            dataset: underlying indexed dataset of token sequences.
            sizes: per-item token counts (list or numpy array).
            block_size: max tokens per block (may be ``None`` for "eos" mode).
            pad: padding token id.
            eos: end-of-sentence token id.
            break_mode: chunking strategy; ``None`` is treated as 'none'.
            include_targets: whether __getitem__ should also return targets.
            document_sep_len: number of separator tokens between documents.
            context_mode: how context is gathered (see ``rebuild_index``).
            window_size: context window size used by ``rebuild_index``.

        Raises:
            ImportError: if the fairseq Cython extensions are not built.
        """
        try:
            from fairseq.data.token_block_utils_fast import (
                _get_slice_indices_fast,
                _get_block_to_dataset_index_fast,
            )
        except ImportError:
            raise ImportError(
                'Please build Cython components with: `pip install --editable .` '
                'or `python setup.py build_ext --inplace`')

        super().__init__()
        self.dataset = dataset
        self.pad = pad
        self.eos = eos
        self.include_targets = include_targets

        assert len(dataset) == len(sizes)
        assert len(dataset) > 0

        if isinstance(sizes, list):
            sizes = np.array(sizes, dtype=np.int64)
        else:
            sizes = sizes.astype(np.int64)

        break_mode = break_mode if break_mode is not None else 'none'

        # For "eos" break-mode, block_size is not a required parameter.
        if break_mode == "eos" and block_size is None:
            block_size = 0

        slice_indices = _get_slice_indices_fast(sizes, break_mode, block_size,
                                                document_sep_len)

        # build index mapping block indices to the underlying dataset indices
        if break_mode == "eos":
            # much faster version for eos break mode
            block_to_dataset_index = np.stack(
                [
                    np.arange(len(sizes)),  # starting index in dataset
                    # np.long was removed in NumPy 1.24; int64 is equivalent.
                    np.zeros(len(sizes), dtype=np.int64
                             ),  # starting offset within starting index
                    np.arange(len(sizes)),  # ending index in dataset
                ],
                1,
            )
        else:
            block_to_dataset_index = _get_block_to_dataset_index_fast(
                sizes,
                slice_indices,
            )
        self._slice_indices = plasma_utils.PlasmaArray(slice_indices)
        self._block_to_dataset_index = plasma_utils.PlasmaArray(
            block_to_dataset_index)

        self.context_mode = context_mode
        self.window_size = window_size

        # rebuild_index derives context indices and per-item sizes from the
        # block index built above.
        context_index, _sizes = self.rebuild_index()
        self._context_index = plasma_utils.PlasmaArray(np.array(context_index))
        self._sizes = plasma_utils.PlasmaArray(np.array(_sizes))
コード例 #7
0
    def __init__(
        self,
        dataset,
        sizes,
        block_sizes,
        pad,
        eos,
        document_sep_len=1,
    ):
        """Chunk ``dataset`` at several block sizes and concatenate the indices.

        An implicit block size of 0 ("eos" break mode, one block per item) is
        prepended to ``block_sizes``; every other size uses "complete" mode.

        Args:
            dataset: underlying indexed dataset of token sequences.
            sizes: per-item token counts (list, numpy array, or torch tensor).
            block_sizes: list of positive block sizes to chunk with.
            pad: padding token id.
            eos: end-of-sentence token id.
            document_sep_len: number of separator tokens between documents.

        Raises:
            ImportError: if the fairseq Cython extensions are not built.
        """
        try:
            from fairseq.data.token_block_utils_fast import (
                _get_slice_indices_fast,
                _get_block_to_dataset_index_fast,
            )
        except ImportError:
            raise ImportError(
                'Please build Cython components with: `pip install --editable .` '
                'or `python setup.py build_ext --inplace`')

        super().__init__()
        self.dataset = dataset
        self.pad = pad
        self.eos = eos

        assert len(dataset) == len(sizes)
        assert len(dataset) > 0

        if isinstance(sizes, list):
            sizes = np.array(sizes, dtype=np.int64)
        else:
            if torch.is_tensor(sizes):
                sizes = sizes.numpy()
            sizes = sizes.astype(np.int64)

        assert min(block_sizes) > 0
        # Block size 0 stands for "eos" mode (one block per dataset item).
        block_sizes = [0] + block_sizes
        slice_indices_list = []
        sizes_list = []
        block_to_dataset_index_list = []
        number_of_inst_in_block = []
        for block_size in block_sizes:
            break_mode = "eos" if block_size == 0 else "complete"
            slice_indices = _get_slice_indices_fast(sizes, break_mode,
                                                    block_size,
                                                    document_sep_len)
            slice_indices_list.append(slice_indices)
            sizes_list.append(slice_indices[:, 1] - slice_indices[:, 0])
            number_of_inst_in_block.append(len(slice_indices))

            # build index mapping block indices to the underlying dataset indices
            if break_mode == "eos":
                # much faster version for eos break mode
                block_to_dataset_index = np.stack(
                    [
                        np.arange(len(sizes)),  # starting index in dataset
                        # np.long was removed in NumPy 1.24; int64 is equivalent.
                        np.zeros(len(sizes), dtype=np.int64
                                 ),  # starting offset within starting index
                        np.arange(len(sizes)),  # ending index in dataset
                    ],
                    1,
                )
            else:
                block_to_dataset_index = _get_block_to_dataset_index_fast(
                    sizes, slice_indices)
            block_to_dataset_index_list.append(block_to_dataset_index)

        # Concatenate the per-block-size index arrays into flat arrays;
        # _number_of_inst_in_block records where each segment starts.
        self._sizes = np.concatenate(sizes_list)
        self._slice_indices = np.concatenate(slice_indices_list, axis=0)
        self._block_to_dataset_index = np.concatenate(
            block_to_dataset_index_list, axis=0)
        self._number_of_inst_in_block = np.array(number_of_inst_in_block,
                                                 dtype=np.int64)

        # Keep index arrays in plasma shared memory to avoid per-worker copies.
        self._slice_indices = plasma_utils.PlasmaArray(self._slice_indices)
        self._sizes = plasma_utils.PlasmaArray(self._sizes)
        self._block_to_dataset_index = plasma_utils.PlasmaArray(
            self._block_to_dataset_index)
        self._number_of_inst_in_block = plasma_utils.PlasmaArray(
            self._number_of_inst_in_block)