Example no. 1
    def __init__(
        self,
        cfg: dict,
        name: str,
        indexed_dataset: MMapIndexedDataset,
        data_prefix: str,
        num_epochs: Optional[int],
        max_num_samples: int,
        masked_lm_prob: float,
        max_seq_length: int,
        short_seq_prob: float,
        seed: int,
        binary_head: bool,
        tokenizer: Any,
    ):

        # Params to store.
        self.name = name
        self.seed = seed
        self.masked_lm_prob = masked_lm_prob
        self.max_seq_length = max_seq_length
        self.binary_head = binary_head

        # Dataset.
        self.indexed_dataset = indexed_dataset

        # save index mappings to a configurable dir
        self.index_mapping_dir = cfg.data.get('index_mapping_dir', None)

        # create index_mapping_dir on rank 0
        if torch.distributed.is_available() and torch.distributed.is_initialized():
            if torch.distributed.get_rank() == 0:
                if self.index_mapping_dir is not None and not os.path.isdir(self.index_mapping_dir):
                    os.makedirs(self.index_mapping_dir)
            torch.distributed.barrier()

        # Build the samples mapping.
        self.samples_mapping = get_samples_mapping(
            self.indexed_dataset,
            data_prefix,
            num_epochs,
            max_num_samples,
            self.max_seq_length - 3,  # account for added tokens
            short_seq_prob,
            self.seed,
            self.name,
            self.binary_head,
            index_mapping_dir=self.index_mapping_dir,
        )

        # Vocab stuff.
        self.vocab_id_list = list(tokenizer.ids_to_tokens.keys())
        self.vocab_id_to_token_dict = tokenizer.ids_to_tokens
        self.cls_id = tokenizer.cls_token_id
        self.sep_id = tokenizer.sep_token_id
        self.mask_id = tokenizer.mask_token_id
        self.pad_id = tokenizer.pad_token_id
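
The block above that creates index_mapping_dir follows a common "rank 0 creates, everyone waits" pattern. A minimal standalone sketch of that pattern, assuming a single helper function; the function name and the non-distributed fallback are illustrative additions, not part of the original class:

    import os

    import torch


    def make_dir_on_rank_0(path):
        """Create `path` on rank 0 only, then synchronize all ranks."""
        if torch.distributed.is_available() and torch.distributed.is_initialized():
            if torch.distributed.get_rank() == 0:
                if path is not None and not os.path.isdir(path):
                    os.makedirs(path)
            # every rank blocks here until rank 0 has created the directory
            torch.distributed.barrier()
        elif path is not None and not os.path.isdir(path):
            # single-process fallback (an assumption; the original only handles the distributed case)
            os.makedirs(path)


    # Usage in a single process: simply creates the directory if it is missing.
    make_dir_on_rank_0("/tmp/index_mappings")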
Example no. 2
    def __init__(
        self,
        name,
        indexed_dataset,
        data_prefix,
        num_epochs,
        max_num_samples,
        masked_lm_prob,
        max_seq_length,
        max_seq_length_dec,
        short_seq_prob,
        seed,
    ):

        # Params to store.
        self.name = name
        self.seed = seed
        self.masked_lm_prob = masked_lm_prob
        self.max_seq_length = max_seq_length
        self.max_seq_length_dec = max_seq_length_dec

        # Dataset.
        self.indexed_dataset = indexed_dataset

        # Build the samples mapping.
        self.samples_mapping = get_samples_mapping(
            self.indexed_dataset,
            data_prefix,
            num_epochs,
            max_num_samples,
            self.max_seq_length - 2,  # account for added tokens
            short_seq_prob,
            self.seed,
            self.name,
            False,
        )

        # Vocab stuff.
        tokenizer = get_tokenizer()
        self.vocab_id_list = list(tokenizer.inv_vocab.keys())
        self.vocab_id_to_token_dict = tokenizer.inv_vocab
        self.cls_id = tokenizer.cls
        self.sep_id = tokenizer.sep
        self.mask_id = tokenizer.mask
        self.pad_id = tokenizer.pad
        self.bos_id = tokenizer.bos_token_id
        self.eos_id = tokenizer.eos_token_id
        self.sentinel_tokens = tokenizer.additional_special_tokens_ids
        assert len(self.sentinel_tokens) > 0, "Provide the argument --vocab-extra-ids 100 to the script"
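
The final assertion expects the tokenizer to have been built with extra vocabulary ids that serve as T5 sentinel tokens (hence the hint about --vocab-extra-ids 100). A toy, self-contained illustration of that idea; the <extra_id_k> naming follows the T5 convention and is an assumption about the real tokenizer:

    # Appending N extra ids to a base vocabulary yields the sentinel token ids
    # that a T5-style dataset uses to mark masked spans.
    base_vocab = ["[PAD]", "[CLS]", "[SEP]", "[MASK]", "the", "cat", "sat"]
    num_extra_ids = 100  # what --vocab-extra-ids 100 would request

    vocab = base_vocab + [f"<extra_id_{i}>" for i in range(num_extra_ids)]
    sentinel_token_ids = list(range(len(base_vocab), len(vocab)))

    assert len(sentinel_token_ids) > 0  # mirrors the check in the snippet above
    print(vocab[sentinel_token_ids[0]])  # -> <extra_id_0>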
Example no. 3
    def __init__(
        self,
        name,
        indexed_dataset,
        data_prefix,
        num_epochs,
        max_num_samples,
        masked_lm_prob,
        max_seq_length,
        short_seq_prob,
        seed,
        binary_head,
        tokenizer,
    ):

        # Params to store.
        self.name = name
        self.seed = seed
        self.masked_lm_prob = masked_lm_prob
        self.max_seq_length = max_seq_length
        self.binary_head = binary_head

        # Dataset.
        self.indexed_dataset = indexed_dataset

        # Build the samples mapping.
        self.samples_mapping = get_samples_mapping(
            self.indexed_dataset,
            data_prefix,
            num_epochs,
            max_num_samples,
            self.max_seq_length - 3,  # account for added tokens
            short_seq_prob,
            self.seed,
            self.name,
            self.binary_head,
        )

        # Vocab stuff.
        self.vocab_id_list = list(tokenizer.inv_vocab.keys())
        self.vocab_id_to_token_dict = tokenizer.inv_vocab
        self.cls_id = tokenizer.cls_id
        self.sep_id = tokenizer.sep_id
        self.mask_id = tokenizer.mask_id
        self.pad_id = tokenizer.pad_id
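
The `self.max_seq_length - 3` passed to get_samples_mapping here and in the other BERT-style constructors reserves room for the special tokens added when a sample is assembled; for a BERT-style sentence pair that is typically [CLS] A [SEP] B [SEP], i.e. three tokens, while the T5-style constructors reserve two. A trivial sketch of that budget arithmetic with a hypothetical helper name:

    def text_token_budget(max_seq_length, num_special_tokens):
        """Tokens left for raw text once special tokens are reserved."""
        return max_seq_length - num_special_tokens


    # BERT-style pair: [CLS] sentence_a [SEP] sentence_b [SEP] -> 3 special tokens.
    assert text_token_budget(512, 3) == 509
    # The T5-style constructors in the other examples reserve 2 instead.
    assert text_token_budget(512, 2) == 510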
Example no. 4
    def __init__(
        self,
        name: str,
        indexed_dataset: MMapIndexedDataset,
        data_prefix: str,
        num_epochs: Optional[int],
        max_num_samples: int,
        masked_lm_prob: float,
        max_seq_length: int,
        short_seq_prob: float,
        seed: int,
        binary_head: bool,
        tokenizer: Any,
    ):

        # Params to store.
        self.name = name
        self.seed = seed
        self.masked_lm_prob = masked_lm_prob
        self.max_seq_length = max_seq_length
        self.binary_head = binary_head

        # Dataset.
        self.indexed_dataset = indexed_dataset

        # Build the samples mapping.
        self.samples_mapping = get_samples_mapping(
            self.indexed_dataset,
            data_prefix,
            num_epochs,
            max_num_samples,
            self.max_seq_length - 3,  # account for added tokens
            short_seq_prob,
            self.seed,
            self.name,
            self.binary_head,
        )

        # Vocab stuff.
        self.vocab_id_list = list(tokenizer.ids_to_tokens.keys())
        self.vocab_id_to_token_dict = tokenizer.ids_to_tokens
        self.cls_id = tokenizer.cls_token_id
        self.sep_id = tokenizer.sep_token_id
        self.mask_id = tokenizer.mask_token_id
        self.pad_id = tokenizer.pad_token_id
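
Note that Examples 1 and 4 read Hugging Face-style tokenizer attributes (ids_to_tokens, cls_token_id, ...), while Example 3 reads Megatron-style ones (inv_vocab, cls_id, ...). A pair of toy stubs showing just the attribute surface each constructor expects; both classes and their tiny vocabularies are purely illustrative:

    from dataclasses import dataclass, field


    @dataclass
    class ToyHFStyleTokenizer:
        # id -> token mapping, as read by Examples 1 and 4
        ids_to_tokens: dict = field(default_factory=lambda: {0: "[PAD]", 1: "[CLS]", 2: "[SEP]", 3: "[MASK]"})
        pad_token_id: int = 0
        cls_token_id: int = 1
        sep_token_id: int = 2
        mask_token_id: int = 3


    @dataclass
    class ToyMegatronStyleTokenizer:
        # id -> token mapping, as read by Example 3
        inv_vocab: dict = field(default_factory=lambda: {0: "[PAD]", 1: "[CLS]", 2: "[SEP]", 3: "[MASK]"})
        pad_id: int = 0
        cls_id: int = 1
        sep_id: int = 2
        mask_id: int = 3


    # Either stub satisfies the corresponding constructor's "Vocab stuff" block.
    print(list(ToyHFStyleTokenizer().ids_to_tokens.keys()), ToyMegatronStyleTokenizer().cls_id)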
Example no. 5
    def __init__(
        self,
        cfg,
        trainer,
        tokenizer,
        name,
        indexed_dataset,
        data_prefix,
        num_epochs,
        max_num_samples,
        max_seq_length,
        max_seq_length_dec,
        seed,
        masked_lm_prob=0.15,
        short_seq_prob=0.1,
        max_ngram_size=10,
        mean_ngram_size=None,
        geometric_dist=True,
        permutation=False,
        whole_word_masking=True,
        favor_long_ngrams=False,
    ):
        super().__init__(cfg, trainer=trainer)

        # Params to store.
        self.name = name
        self.seed = seed
        self.masked_lm_prob = masked_lm_prob
        self.max_seq_length = max_seq_length
        self.max_seq_length_dec = max_seq_length_dec
        self.short_seq_prob = short_seq_prob
        self.max_ngram_size = max_ngram_size
        self.mean_ngram_size = mean_ngram_size
        self.geometric_dist = geometric_dist
        self.permutation = permutation
        self.whole_word_masking = whole_word_masking
        self.favor_long_ngrams = favor_long_ngrams

        # Dataset.
        self.indexed_dataset = indexed_dataset

        # save index mappings to a configurable dir
        self.index_mapping_dir = cfg.data.get('index_mapping_dir', None)

        # create index_mapping_dir on rank 0
        if torch.distributed.is_available() and torch.distributed.is_initialized():
            if torch.distributed.get_rank() == 0:
                if self.index_mapping_dir is not None and not os.path.isdir(self.index_mapping_dir):
                    os.makedirs(self.index_mapping_dir)
            torch.distributed.barrier()

        # Build the samples mapping.
        self.samples_mapping = get_samples_mapping(
            indexed_dataset=self.indexed_dataset,
            data_prefix=data_prefix,
            num_epochs=num_epochs,
            max_num_samples=max_num_samples,
            max_seq_length=self.max_seq_length - 2,  # account for added tokens
            short_seq_prob=self.short_seq_prob,
            seed=self.seed,
            name=self.name,
            binary_head=False,
            index_mapping_dir=self.index_mapping_dir,
        )

        self.tokenizer = tokenizer
        self.tokenizer_type = 'wordpiece'  # TODO: better checks for tokenizer types. How do we do this for HF tokenizers that are not BERT?
        if isinstance(self.tokenizer, YouTokenToMeTokenizer):
            raise ValueError(
                "YTTM does not support special tokens and cannot be used with T5 datasets."
            )

        if isinstance(self.tokenizer, SentencePieceTokenizer):
            if not self.tokenizer.legacy:
                raise ValueError(
                    "Sentencepiece Tokenizer must have legacy = True to add special tokens."
                )
            self.tokenizer_type = 'sentencepiece'
            if whole_word_masking:
                raise ValueError(
                    "Whole word masking is only supported with wordpiece tokenizers, not with sentencepiece tokenizers. Please set it to False."
                )

        self.cls_id = tokenizer.cls_id
        self.sep_id = tokenizer.sep_id
        self.mask_id = tokenizer.mask_id
        self.pad_id = tokenizer.pad_id
        self.bos_id = tokenizer.bos_id
        self.eos_id = tokenizer.eos_id

        self.vocab_id_list = self.tokenizer.vocab
        self.vocab_id_to_token_dict = {
            idx: token
            for idx, token in enumerate(self.vocab_id_list)
        }

        self._build()
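
Unlike the earlier examples, this constructor assumes tokenizer.vocab is a positional list of token strings, so a token's id is simply its index; the dict comprehension then rebuilds an id -> token mapping from it. A minimal equivalent sketch with a toy vocabulary (an assumption, for illustration only):

    # `tokenizer.vocab` is assumed to be a list of tokens ordered by id.
    vocab = ["[PAD]", "[CLS]", "[SEP]", "[MASK]", "hello", "world"]

    # Equivalent to the comprehension in the constructor above.
    vocab_id_to_token_dict = dict(enumerate(vocab))

    assert vocab_id_to_token_dict[4] == "hello"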
Example no. 6
    def __init__(
        self,
        cfg,
        trainer,
        tokenizer,
        name,
        indexed_dataset,
        data_prefix,
        num_epochs,
        max_num_samples,
        masked_lm_prob,
        max_seq_length,
        max_seq_length_dec,
        short_seq_prob,
        seed,
    ):
        super().__init__(cfg, trainer=trainer)

        # Params to store.
        self.name = name
        self.seed = seed
        self.masked_lm_prob = masked_lm_prob
        self.max_seq_length = max_seq_length
        self.max_seq_length_dec = max_seq_length_dec

        # Dataset.
        self.indexed_dataset = indexed_dataset

        # Build the samples mapping.
        self.samples_mapping = get_samples_mapping(
            self.indexed_dataset,
            data_prefix,
            num_epochs,
            max_num_samples,
            self.max_seq_length - 2,  # account for added tokens
            short_seq_prob,
            self.seed,
            self.name,
            False,
        )

        self.tokenizer = tokenizer

        if isinstance(self.tokenizer, YouTokenToMeTokenizer):
            raise ValueError(
                "YTTM does not support special tokens and cannot be used with T5 datasets."
            )

        if isinstance(self.tokenizer, SentencePieceTokenizer):
            if not self.tokenizer.legacy:
                raise ValueError(
                    "Sentencepiece Tokenizer must have legacy = True to add special tokens."
                )

        self.cls_id = tokenizer.cls_id
        self.sep_id = tokenizer.sep_id
        self.mask_id = tokenizer.mask_id
        self.pad_id = tokenizer.pad_id
        self.bos_id = tokenizer.bos_id
        self.eos_id = tokenizer.eos_id

        self.vocab_id_list = self.tokenizer.vocab
        self.vocab_id_to_token_dict = {
            idx: token
            for idx, token in enumerate(self.vocab_id_list)
        }

        self.sentinel_tokens = tokenizer.additional_special_tokens_ids
        assert len(self.sentinel_tokens) > 0, "The tokenizer must provide additional special tokens (e.g. via --vocab-extra-ids 100)"