def __init__(
    self,
    cfg: dict,
    name: str,
    indexed_dataset: MMapIndexedDataset,
    data_prefix: str,
    num_epochs: Optional[int],
    max_num_samples: int,
    masked_lm_prob: float,
    max_seq_length: int,
    short_seq_prob: float,
    seed: int,
    binary_head: bool,
    tokenizer: Any,
):
    # Params to store.
    self.name = name
    self.seed = seed
    self.masked_lm_prob = masked_lm_prob
    self.max_seq_length = max_seq_length
    self.binary_head = binary_head

    # Dataset.
    self.indexed_dataset = indexed_dataset

    # save index mappings to a configurable dir
    self.index_mapping_dir = cfg.data.get('index_mapping_dir', None)

    # create index_mapping_dir on rank 0
    if torch.distributed.is_available() and torch.distributed.is_initialized():
        if torch.distributed.get_rank() == 0:
            if self.index_mapping_dir is not None and not os.path.isdir(self.index_mapping_dir):
                os.makedirs(self.index_mapping_dir)
        torch.distributed.barrier()

    # Build the samples mapping.
    self.samples_mapping = get_samples_mapping(
        self.indexed_dataset,
        data_prefix,
        num_epochs,
        max_num_samples,
        self.max_seq_length - 3,  # account for added tokens
        short_seq_prob,
        self.seed,
        self.name,
        self.binary_head,
        index_mapping_dir=self.index_mapping_dir,
    )

    # Vocab stuff.
    self.vocab_id_list = list(tokenizer.ids_to_tokens.keys())
    self.vocab_id_to_token_dict = tokenizer.ids_to_tokens
    self.cls_id = tokenizer.cls_token_id
    self.sep_id = tokenizer.sep_token_id
    self.mask_id = tokenizer.mask_token_id
    self.pad_id = tokenizer.pad_token_id
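# A minimal, self-contained sketch of the rank-0 directory-creation pattern
# used above, pulled out for clarity: only rank 0 creates the directory, and
# every rank then meets at the barrier so no rank tries to read mapping files
# from a directory that does not exist yet. The function name is hypothetical.
import os

import torch


def make_index_mapping_dir(index_mapping_dir):
    if torch.distributed.is_available() and torch.distributed.is_initialized():
        if torch.distributed.get_rank() == 0:
            if index_mapping_dir is not None and not os.path.isdir(index_mapping_dir):
                os.makedirs(index_mapping_dir)
        # The barrier sits outside the rank-0 check on purpose: all ranks,
        # not just rank 0, must synchronize here.
        torch.distributed.barrier()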
def __init__(
    self,
    name,
    indexed_dataset,
    data_prefix,
    num_epochs,
    max_num_samples,
    masked_lm_prob,
    max_seq_length,
    max_seq_length_dec,
    short_seq_prob,
    seed,
):
    # Params to store.
    self.name = name
    self.seed = seed
    self.masked_lm_prob = masked_lm_prob
    self.max_seq_length = max_seq_length
    self.max_seq_length_dec = max_seq_length_dec

    # Dataset.
    self.indexed_dataset = indexed_dataset

    # Build the samples mapping.
    self.samples_mapping = get_samples_mapping(
        self.indexed_dataset,
        data_prefix,
        num_epochs,
        max_num_samples,
        self.max_seq_length - 2,  # account for added tokens
        short_seq_prob,
        self.seed,
        self.name,
        False,
    )

    # Vocab stuff.
    tokenizer = get_tokenizer()
    self.vocab_id_list = list(tokenizer.inv_vocab.keys())
    self.vocab_id_to_token_dict = tokenizer.inv_vocab
    self.cls_id = tokenizer.cls
    self.sep_id = tokenizer.sep
    self.mask_id = tokenizer.mask
    self.pad_id = tokenizer.pad
    self.bos_id = tokenizer.bos_token_id
    self.eos_id = tokenizer.eos_token_id
    self.sentinel_tokens = tokenizer.additional_special_tokens_ids
    assert len(self.sentinel_tokens) > 0, "Provide the argument --vocab-extra-ids 100 to the script"
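# Hedged illustration of where `sentinel_tokens` come from: Hugging Face
# tokenizers expose `additional_special_tokens_ids`, which for T5-style
# tokenizers returns the ids of the <extra_id_*> sentinel tokens used for
# span corruption. The checkpoint name below is an assumption chosen for
# illustration only.
from transformers import T5Tokenizer

tok = T5Tokenizer.from_pretrained("t5-small")
sentinel_tokens = tok.additional_special_tokens_ids
assert len(sentinel_tokens) > 0  # t5-small ships with 100 <extra_id_*> sentinels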
def __init__(
    self,
    name,
    indexed_dataset,
    data_prefix,
    num_epochs,
    max_num_samples,
    masked_lm_prob,
    max_seq_length,
    short_seq_prob,
    seed,
    binary_head,
    tokenizer,
):
    # Params to store.
    self.name = name
    self.seed = seed
    self.masked_lm_prob = masked_lm_prob
    self.max_seq_length = max_seq_length
    self.binary_head = binary_head

    # Dataset.
    self.indexed_dataset = indexed_dataset

    # Build the samples mapping.
    self.samples_mapping = get_samples_mapping(
        self.indexed_dataset,
        data_prefix,
        num_epochs,
        max_num_samples,
        self.max_seq_length - 3,  # account for added tokens
        short_seq_prob,
        self.seed,
        self.name,
        self.binary_head,
    )

    # Vocab stuff.
    self.vocab_id_list = list(tokenizer.inv_vocab.keys())
    self.vocab_id_to_token_dict = tokenizer.inv_vocab
    self.cls_id = tokenizer.cls_id
    self.sep_id = tokenizer.sep_id
    self.mask_id = tokenizer.mask_id
    self.pad_id = tokenizer.pad_id
def __init__(
    self,
    name: str,
    indexed_dataset: MMapIndexedDataset,
    data_prefix: str,
    num_epochs: Optional[int],
    max_num_samples: int,
    masked_lm_prob: float,
    max_seq_length: int,
    short_seq_prob: float,
    seed: int,
    binary_head: bool,
    tokenizer: Any,
):
    # Params to store.
    self.name = name
    self.seed = seed
    self.masked_lm_prob = masked_lm_prob
    self.max_seq_length = max_seq_length
    self.binary_head = binary_head

    # Dataset.
    self.indexed_dataset = indexed_dataset

    # Build the samples mapping.
    self.samples_mapping = get_samples_mapping(
        self.indexed_dataset,
        data_prefix,
        num_epochs,
        max_num_samples,
        self.max_seq_length - 3,  # account for added tokens
        short_seq_prob,
        self.seed,
        self.name,
        self.binary_head,
    )

    # Vocab stuff.
    self.vocab_id_list = list(tokenizer.ids_to_tokens.keys())
    self.vocab_id_to_token_dict = tokenizer.ids_to_tokens
    self.cls_id = tokenizer.cls_token_id
    self.sep_id = tokenizer.sep_token_id
    self.mask_id = tokenizer.mask_token_id
    self.pad_id = tokenizer.pad_token_id
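# Hedged sketch of the tokenizer interface this constructor expects: a
# Hugging Face WordPiece tokenizer provides `ids_to_tokens` (an id -> token
# mapping) plus the special-token id properties read above. The checkpoint
# name is an assumption for illustration only.
from transformers import BertTokenizer

tok = BertTokenizer.from_pretrained("bert-base-uncased")
vocab_id_list = list(tok.ids_to_tokens.keys())
vocab_id_to_token_dict = tok.ids_to_tokens
print(tok.cls_token_id, tok.sep_token_id, tok.mask_token_id, tok.pad_token_id)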
def __init__(
    self,
    cfg,
    trainer,
    tokenizer,
    name,
    indexed_dataset,
    data_prefix,
    num_epochs,
    max_num_samples,
    max_seq_length,
    max_seq_length_dec,
    seed,
    masked_lm_prob=0.15,
    short_seq_prob=0.1,
    max_ngram_size=10,
    mean_ngram_size=None,
    geometric_dist=True,
    permutation=False,
    whole_word_masking=True,
    favor_long_ngrams=False,
):
    super().__init__(cfg, trainer=trainer)

    # Params to store.
    self.name = name
    self.seed = seed
    self.masked_lm_prob = masked_lm_prob
    self.max_seq_length = max_seq_length
    self.max_seq_length_dec = max_seq_length_dec
    self.short_seq_prob = short_seq_prob
    self.max_ngram_size = max_ngram_size
    self.mean_ngram_size = mean_ngram_size
    self.geometric_dist = geometric_dist
    self.permutation = permutation
    self.whole_word_masking = whole_word_masking
    self.favor_long_ngrams = favor_long_ngrams

    # Dataset.
    self.indexed_dataset = indexed_dataset

    # save index mappings to a configurable dir
    self.index_mapping_dir = cfg.data.get('index_mapping_dir', None)

    # create index_mapping_dir on rank 0
    if torch.distributed.is_available() and torch.distributed.is_initialized():
        if torch.distributed.get_rank() == 0:
            if self.index_mapping_dir is not None and not os.path.isdir(self.index_mapping_dir):
                os.makedirs(self.index_mapping_dir)
        torch.distributed.barrier()

    # Build the samples mapping.
    self.samples_mapping = get_samples_mapping(
        indexed_dataset=self.indexed_dataset,
        data_prefix=data_prefix,
        num_epochs=num_epochs,
        max_num_samples=max_num_samples,
        max_seq_length=self.max_seq_length - 2,  # account for added tokens
        short_seq_prob=self.short_seq_prob,
        seed=self.seed,
        name=self.name,
        binary_head=False,
        index_mapping_dir=self.index_mapping_dir,
    )

    self.tokenizer = tokenizer
    self.tokenizer_type = 'wordpiece'  # TODO: better checks for tokenizer types. How do we do this for HF tokenizers that are not BERT?
    if isinstance(self.tokenizer, YouTokenToMeTokenizer):
        raise ValueError(
            "YTTM does not support special tokens and cannot be used with T5 datasets."
        )
    if isinstance(self.tokenizer, SentencePieceTokenizer):
        if not self.tokenizer.legacy:
            raise ValueError(
                "SentencePiece tokenizer must have legacy=True to add special tokens."
            )
        self.tokenizer_type = 'sentencepiece'
        if whole_word_masking:
            raise ValueError(
                "Whole word masking is only supported with wordpiece tokenizers, not sentencepiece tokenizers. Please set it to False."
            )

    self.cls_id = tokenizer.cls_id
    self.sep_id = tokenizer.sep_id
    self.mask_id = tokenizer.mask_id
    self.pad_id = tokenizer.pad_id
    self.bos_id = tokenizer.bos_id
    self.eos_id = tokenizer.eos_id
    self.vocab_id_list = self.tokenizer.vocab
    self.vocab_id_to_token_dict = {idx: token for idx, token in enumerate(self.vocab_id_list)}

    self._build()
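# A minimal sketch (not the library's implementation) of what the ngram
# arguments above typically control in SpanBERT/T5-style span masking:
# span lengths are drawn from 1..max_ngram_size, either uniformly or from a
# truncated geometric distribution whose decay is steered by mean_ngram_size.
# The helper name and the fallback mean of 3 are assumptions for illustration.
import numpy as np


def sample_ngram_length(rng, max_ngram_size=10, geometric_dist=True, mean_ngram_size=None):
    lengths = np.arange(1, max_ngram_size + 1)
    if geometric_dist:
        p = 1.0 / (mean_ngram_size or 3)  # success probability of the geometric
        probs = p * (1.0 - p) ** (lengths - 1)
        probs = probs / probs.sum()  # renormalize after truncating at max_ngram_size
    else:
        probs = np.full(len(lengths), 1.0 / len(lengths))
    return int(rng.choice(lengths, p=probs))


rng = np.random.default_rng(1234)
print([sample_ngram_length(rng) for _ in range(5)])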
def __init__(
    self,
    cfg,
    trainer,
    tokenizer,
    name,
    indexed_dataset,
    data_prefix,
    num_epochs,
    max_num_samples,
    masked_lm_prob,
    max_seq_length,
    max_seq_length_dec,
    short_seq_prob,
    seed,
):
    super().__init__(cfg, trainer=trainer)

    # Params to store.
    self.name = name
    self.seed = seed
    self.masked_lm_prob = masked_lm_prob
    self.max_seq_length = max_seq_length
    self.max_seq_length_dec = max_seq_length_dec

    # Dataset.
    self.indexed_dataset = indexed_dataset

    # Build the samples mapping.
    self.samples_mapping = get_samples_mapping(
        self.indexed_dataset,
        data_prefix,
        num_epochs,
        max_num_samples,
        self.max_seq_length - 2,  # account for added tokens
        short_seq_prob,
        self.seed,
        self.name,
        False,
    )

    self.tokenizer = tokenizer
    if isinstance(self.tokenizer, YouTokenToMeTokenizer):
        raise ValueError(
            "YTTM does not support special tokens and cannot be used with T5 datasets."
        )
    if isinstance(self.tokenizer, SentencePieceTokenizer):
        if not self.tokenizer.legacy:
            raise ValueError(
                "SentencePiece tokenizer must have legacy=True to add special tokens."
            )

    self.cls_id = tokenizer.cls_id
    self.sep_id = tokenizer.sep_id
    self.mask_id = tokenizer.mask_id
    self.pad_id = tokenizer.pad_id
    self.bos_id = tokenizer.bos_id
    self.eos_id = tokenizer.eos_id
    self.vocab_id_list = self.tokenizer.vocab
    self.vocab_id_to_token_dict = {idx: token for idx, token in enumerate(self.vocab_id_list)}

    self.sentinel_tokens = tokenizer.additional_special_tokens_ids
    assert len(self.sentinel_tokens) > 0, "The tokenizer must define at least one sentinel (additional special) token."