def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, config_file, pytorch_dump_path, base_model):
    # Initialise PyTorch model
    config = FunnelConfig.from_json_file(config_file)
    print(f"Building PyTorch model from configuration: {config}")
    model = FunnelBaseModel(config) if base_model else FunnelModel(config)

    # Load weights from tf checkpoint
    load_tf_weights_in_funnel(model, config, tf_checkpoint_path)

    # Save pytorch-model
    print(f"Save PyTorch model to {pytorch_dump_path}")
    torch.save(model.state_dict(), pytorch_dump_path)
def prepare_config_and_inputs(self):
    input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)

    input_mask = None
    if self.use_input_mask:
        input_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2)

    token_type_ids = None
    if self.use_token_type_ids:
        token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size)

    sequence_labels = None
    token_labels = None
    choice_labels = None
    if self.use_labels:
        sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size)
        token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels)
        choice_labels = ids_tensor([self.batch_size], self.num_choices)
    fake_token_labels = ids_tensor([self.batch_size, self.seq_length], 1)

    config = FunnelConfig(
        vocab_size=self.vocab_size,
        block_sizes=self.block_sizes,
        num_decoder_layers=self.num_decoder_layers,
        d_model=self.d_model,
        n_head=self.n_head,
        d_head=self.d_head,
        d_inner=self.d_inner,
        hidden_act=self.hidden_act,
        hidden_dropout=self.hidden_dropout,
        attention_dropout=self.attention_dropout,
        activation_dropout=self.activation_dropout,
        max_position_embeddings=self.max_position_embeddings,
        type_vocab_size=self.type_vocab_size,
        return_dict=True,
    )

    return (
        config,
        input_ids,
        token_type_ids,
        input_mask,
        sequence_labels,
        token_labels,
        choice_labels,
        fake_token_labels,
    )
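# A minimal sketch (not part of the original test file) of how the tuple returned by
# prepare_config_and_inputs above is typically consumed. It assumes `tester` is an instance
# of the model-tester class this method belongs to, with batch_size/seq_length/d_model
# attributes, and that torch and FunnelModel are importable.
import torch
from transformers import FunnelModel


def check_model_forward(tester):
    (
        config,
        input_ids,
        token_type_ids,
        input_mask,
        sequence_labels,
        token_labels,
        choice_labels,
        fake_token_labels,
    ) = tester.prepare_config_and_inputs()

    model = FunnelModel(config)
    model.eval()
    with torch.no_grad():
        outputs = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids)

    # FunnelModel upsamples back to the full sequence length in its decoder,
    # so the hidden states keep the original (batch, seq, d_model) shape.
    assert outputs.last_hidden_state.shape == (tester.batch_size, tester.seq_length, tester.d_model)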
def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, config_file, pytorch_dump_path):
    # Initialise PyTorch model
    config = FunnelConfig.from_json_file(config_file)
    print("Building PyTorch model from configuration: {}".format(str(config)))
    model = FunnelForPreTraining(config)

    # Load weights from tf checkpoint
    load_tf_weights_in_funnel(model, config, tf_checkpoint_path)

    # Save pytorch-model
    print("Save PyTorch model to {}".format(pytorch_dump_path))
    torch.save(model.state_dict(), pytorch_dump_path)
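# A sketch of how conversion functions like the two above are usually exposed on the
# command line. The argparse wrapper and its flag names are illustrative, not taken from
# the original scripts.
import argparse

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--tf_checkpoint_path", type=str, required=True, help="Path to the TensorFlow checkpoint.")
    parser.add_argument("--config_file", type=str, required=True, help="Path to the FunnelConfig JSON file.")
    parser.add_argument("--pytorch_dump_path", type=str, required=True, help="Where to write the PyTorch weights.")
    args = parser.parse_args()
    convert_tf_checkpoint_to_pytorch(args.tf_checkpoint_path, args.config_file, args.pytorch_dump_path)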
def get_config(self):
    return FunnelConfig(
        vocab_size=self.vocab_size,
        block_sizes=self.block_sizes,
        num_decoder_layers=self.num_decoder_layers,
        d_model=self.d_model,
        n_head=self.n_head,
        d_head=self.d_head,
        d_inner=self.d_inner,
        hidden_act=self.hidden_act,
        hidden_dropout=self.hidden_dropout,
        attention_dropout=self.attention_dropout,
        activation_dropout=self.activation_dropout,
        max_position_embeddings=self.max_position_embeddings,
        type_vocab_size=self.type_vocab_size,
    )
def define_config(name):
    if name in [
        "bert-base-multilingual-cased",
        "sangrimlee/bert-base-multilingual-cased-korquad",
        "kykim/bert-kor-base",
        "monologg/kobert",
    ]:
        return BertConfig.from_pretrained(name)
    elif name in [
        "monologg/koelectra-base-v3-discriminator",
        "kykim/electra-kor-base",
    ]:
        return ElectraConfig.from_pretrained(name)
    elif name in ["xlm-roberta-large"]:
        return XLMRobertaConfig.from_pretrained(name)
    elif name in ["kykim/funnel-kor-base"]:
        return FunnelConfig.from_pretrained(name)
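# Usage sketch for define_config (checkpoint name must be one handled above; loading the
# config requires network access or a local cache). Note that the function implicitly
# returns None for any name it does not recognise.
config = define_config("kykim/funnel-kor-base")
assert isinstance(config, FunnelConfig)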
class Funnel_T5_VAE_Config(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of :class:`~transformer_vae.T5_VAE_Model`.
    It is used to instantiate a Funnel-T5-VAE model according to the specified arguments, defining the model
    architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of
    the `funnel-t5-vae-base` architecture.

    To be able to use `transformer.trainer.Trainer` we need some specific training logic & config in the model.

    Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used to control the
    model outputs. Read the documentation from :class:`~transformers.PretrainedConfig` for more information.

    Arguments:
        latent_size (:obj:`int`, `optional`, defaults to 1,000):
            Number of dimensions to use for the sequence's latent code.
        funnel_name (:obj:`str`, `optional`, defaults to funnel-transformer/intermediate):
            Name of the Transformer model to use as encoder & decoder.
        vae_encoder_model (:obj:`str`, `optional`, defaults to None):
            Name of the model to encode T5 hidden states into latent codes.
        vae_decoder_model (:obj:`str`, `optional`, defaults to None):
            Name of the model to decode latent codes into T5 hidden states.
        set_seq_size (:obj:`int`, `optional`, defaults to 60):
            NOTE: Every input sequence must be padded to be equal to this length.
        t5_name (:obj:`str`, `optional`, defaults to t5-base):
            Name of the Transformer model to use as a decoder.
        transformer_critic_name (:obj:`str`, `optional`, defaults to None):
            Name of the Transformer model to use as an adviser on interpolations.

        *** Training Args ***

        reg_schedule_k (:obj:`float`, `optional`, defaults to 0.0025):
            Multiplied by global_step in a sigmoid to more gradually increase the regulariser loss weight.
        reg_schedule_b (:obj:`float`, `optional`, defaults to 6.25):
            Added to global_step in the sigmoid, further delaying the increase in regulariser loss weight.
        use_extra_logs (:obj:`bool`, `optional`, defaults to False):
            Store extra logs during each training inference.
        gradient_checkpoint (:obj:`bool`, `optional`, defaults to False):
            Checkpoint gradients in the model. Currently just checkpoints after the encoder + VAE.
        funnel_block_sizes (:obj:`str`, `optional`, defaults to ''):
            Size of each Funnel encoder block; the sequence is halved between each block.
            Example specification: 1_1_1

        *** End ***

    TODO: Add extra models to condition on the latent
    """

    model_type = "transformer_vae"
    is_composition = True

    def __init__(
        self,
        latent_size=1_000,
        funnel_name="funnel-transformer/intermediate",
        t5_name="t5-base",
        vae_encoder_model='',
        vae_decoder_model='',
        critic_type='',
        critic_name='',
        set_seq_size=60,
        decoder_start_token_id=0,
        dont_use_reg_loss=False,
        reg_schedule_k=0.0025,
        reg_schedule_b=6.25,
        use_extra_logs=False,
        cache_dir=None,
        n_latent_tokens=5,  # set to -1 for full sequence
        funnel_block_sizes='',
        num_decoder_layers=0,
        num_decoder_heads=0,
        attention_window_size=0,
        attention_window_overlap=0,
        gradient_checkpoint_encoder=False,
        decoder_grad_chk_pnt_rate=0,
        skip_upsample=False,
        **kwargs,
    ):
        assertIn(vae_encoder_model, VAE_ENCODER_MODELS.keys(), "Unexpected VAE encoder.")
        assertIn(vae_decoder_model, VAE_DECODER_MODELS.keys(), "Unexpected VAE decoder.")

        super().__init__(**kwargs)

        self.set_seq_size = set_seq_size

        # VAE
        self.vae_encoder_model = vae_encoder_model
        self.vae_decoder_model = vae_decoder_model

        if set_seq_size < n_latent_tokens:
            logger.warning(
                f'set_seq_size is smaller than n_latent_tokens, now using n_latent_tokens={set_seq_size} instead of {n_latent_tokens}'
            )
            n_latent_tokens = set_seq_size
        self.latent_size = latent_size
        self.n_latent_tokens = n_latent_tokens
        self.skip_upsample = skip_upsample

        # funnel encoder model
        if 'funnel' not in kwargs:
            self.funnel = AutoConfig.from_pretrained(funnel_name, cache_dir=cache_dir)
            if funnel_block_sizes:
                self.funnel.block_sizes = [int(i) for i in funnel_block_sizes.split('_')]
            self.funnel.decoder_start_token_id = decoder_start_token_id
            self.funnel.n_positions = set_seq_size
        else:
            self.funnel = FunnelConfig(**kwargs.pop('funnel'))
        pooling_division = 2 ** (len(self.funnel.block_sizes) - 1)
        self.encoded_seq_size = math.ceil(self.funnel.n_positions / pooling_division)
        self.gradient_checkpoint_encoder = gradient_checkpoint_encoder

        # T5 decoder model
        if 't5' not in kwargs:
            self.t5 = AutoConfig.from_pretrained(t5_name, cache_dir=cache_dir)
            if num_decoder_layers:
                self.t5.num_layers = num_decoder_layers
            if num_decoder_heads:
                self.t5.num_heads = num_decoder_heads
            self.t5.decoder_start_token_id = decoder_start_token_id
            self.t5.n_positions = self.funnel.n_positions
            assertEqual(self.t5.model_type, "t5", "Need t5 model type for transformer_decoder.")
        else:
            self.t5 = T5Config(**kwargs.pop('t5'))
        assertEqual(self.funnel.d_model, self.t5.d_model, "Funnel & T5 transformers have different dimensions.")
        self.decoder_grad_chk_pnt_rate = decoder_grad_chk_pnt_rate

        assert attention_window_size < set_seq_size, 'Attention window must be smaller than set sequence size.'
        self.attention_window_size = attention_window_size
        self.attention_window_overlap = attention_window_overlap
        if attention_window_size:
            assert (
                set_seq_size % attention_window_size != 0
            ), 'When doing an alternating attention pattern the sequence size cannot be divisible by the window size as no alternations will be possible.'
            self.attention_window_overlap = set_seq_size % attention_window_size

        # extra training losses
        self.use_reg_loss = not dont_use_reg_loss
        if dont_use_reg_loss:
            logger.warning(
                "Regularisation loss is turned off, you are training an Autoencoder (not a VAE)."
            )
        self.reg_schedule_k = reg_schedule_k
        self.reg_schedule_b = reg_schedule_b
        self.use_extra_logs = use_extra_logs

        # critic model
        self.critic = None
        if critic_name:
            self.critic_type = critic_type
            if 'critic' not in kwargs:
                self.critic = AutoConfig.from_pretrained(critic_name, cache_dir=cache_dir)
            else:
                self.critic = FunnelConfig(**kwargs.pop('critic'))
            assertEqual(self.t5.d_model, self.critic.d_model, "Critic & T5 transformers have different dimensions.")

        # misc
        self.use_cache = getattr(self.funnel, "use_cache", False)
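# A hedged instantiation sketch for Funnel_T5_VAE_Config. The keyword values below are
# illustrative; it assumes the module-level VAE_ENCODER_MODELS / VAE_DECODER_MODELS
# registries accept the empty-string defaults, and that the referenced funnel/t5 configs
# can be downloaded (their d_model values of 768 are assumed to match).
config = Funnel_T5_VAE_Config(
    latent_size=1_000,
    funnel_name="funnel-transformer/intermediate",
    t5_name="t5-base",
    set_seq_size=60,
    n_latent_tokens=5,
    funnel_block_sizes="1_1_1",  # shrink each Funnel encoder block to a single layer
)
# Sequence length after Funnel pooling: ceil(60 / 2**(3 - 1)) == 15
print(config.encoded_seq_size)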
# create data objects
dataset_gen = LineByLineTextDataset(
    tokenizer=bpe_tokenizer, file_path=input_path, block_size=block_size
)
dataset_gen_val = LineByLineTextDataset(
    tokenizer=bpe_tokenizer, file_path=input_path_val, block_size=block_size
)

data_collator = DataCollatorForLanguageModeling(
    tokenizer=bpe_tokenizer, mlm=True, mlm_probability=mlm_probability
)

# create model
config = FunnelConfig(
    vocab_size=bpe_tokenizer.vocab_size,
    max_position_embeddings=max_len + 10,
    n_head=num_attention_heads,
    block_sizes=block_sizes,
    type_vocab_size=1,
)
model = FunnelForMaskedLM(config=config)
_pretty_print(f"Number of model parameters : {model.num_parameters()}")

model_path = os.path.join(output_path, "lm")
training_args = TrainingArguments(
    output_dir=model_path,
    overwrite_output_dir=True,
    num_train_epochs=epochs,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=val_batch_size,
    evaluation_strategy="steps",
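    # NOTE: the original snippet is truncated inside this TrainingArguments call. A hedged
    # continuation might close it and hand everything to Trainer roughly as below;
    # `save_steps` and `logging_steps` are hypothetical variables named in the same style
    # as the ones used above.
    save_steps=save_steps,
    eval_steps=save_steps,
    logging_steps=logging_steps,
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=dataset_gen,
    eval_dataset=dataset_gen_val,
)
trainer.train()
trainer.save_model(model_path)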