def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, config_file, pytorch_dump_path, discriminator_or_generator):
    # Initialise PyTorch model
    config = ElectraConfig.from_json_file(config_file)
    print("Building PyTorch model from configuration: {}".format(str(config)))

    if discriminator_or_generator == "discriminator":
        model = ElectraForPreTraining(config)
    elif discriminator_or_generator == "generator":
        model = ElectraForMaskedLM(config)
    else:
        raise ValueError(
            "The discriminator_or_generator argument should be either 'discriminator' or 'generator'"
        )

    # Load weights from tf checkpoint
    load_tf_weights_in_electra(
        model, config, tf_checkpoint_path, discriminator_or_generator=discriminator_or_generator
    )

    # Save pytorch-model
    print("Save PyTorch model to {}".format(pytorch_dump_path))
    torch.save(model.state_dict(), pytorch_dump_path)
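A minimal sketch of how this converter might be exposed from the command line; the flag names below are illustrative assumptions, not necessarily the script's real interface.

# Hypothetical CLI wrapper for the converter above; flag names are assumptions.
import argparse

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--tf_checkpoint_path", required=True, help="Path to the TensorFlow checkpoint.")
    parser.add_argument("--config_file", required=True, help="Path to the ElectraConfig JSON file.")
    parser.add_argument("--pytorch_dump_path", required=True, help="Where to write the converted PyTorch weights.")
    parser.add_argument("--discriminator_or_generator", required=True, choices=["discriminator", "generator"])
    args = parser.parse_args()
    convert_tf_checkpoint_to_pytorch(
        args.tf_checkpoint_path, args.config_file, args.pytorch_dump_path, args.discriminator_or_generator
    )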
class ElectraForLanguageModelingModel(PreTrainedModel):
    def __init__(self, config, **kwargs):
        super(ElectraForLanguageModelingModel, self).__init__(config, **kwargs)
        if "generator_config" in kwargs:
            generator_config = kwargs["generator_config"]
        else:
            generator_config = config
        self.generator_model = ElectraForMaskedLM(generator_config)
        if "discriminator_config" in kwargs:
            discriminator_config = kwargs["discriminator_config"]
        else:
            discriminator_config = config
        self.discriminator_model = ElectraForPreTraining(discriminator_config)
        self.vocab_size = generator_config.vocab_size
        if kwargs.get("tie_generator_and_discriminator_embeddings", True):
            self.tie_generator_and_discriminator_embeddings()

    def tie_generator_and_discriminator_embeddings(self):
        self.discriminator_model.set_input_embeddings(self.generator_model.get_input_embeddings())

    def forward(self, inputs, masked_lm_labels, attention_mask=None, token_type_ids=None):
        d_inputs = inputs.clone()

        # run masked LM.
        g_out = self.generator_model(
            inputs, masked_lm_labels=masked_lm_labels, attention_mask=attention_mask, token_type_ids=token_type_ids
        )

        # get samples from masked LM.
        sample_probs = torch.softmax(g_out[1], dim=-1, dtype=torch.float32)
        sample_probs = sample_probs.view(-1, self.vocab_size)

        sampled_tokens = torch.multinomial(sample_probs, 1).view(-1)
        sampled_tokens = sampled_tokens.view(d_inputs.shape[0], -1)

        # labels have a -100 value to mask out loss from unchanged tokens.
        mask = masked_lm_labels.ne(-100)

        # replace the masked out tokens of the input with the generator predictions.
        d_inputs[mask] = sampled_tokens[mask]

        # turn mask into new target labels. 1 (True) for corrupted, 0 otherwise.
        # if the prediction was correct, mark it as uncorrupted.
        correct_preds = sampled_tokens == masked_lm_labels
        d_labels = mask.long()
        d_labels[correct_preds] = 0

        # run token classification, predict whether each token was corrupted.
        d_out = self.discriminator_model(
            d_inputs, labels=d_labels, attention_mask=attention_mask, token_type_ids=token_type_ids
        )

        g_loss = g_out[0]
        d_loss = d_out[0]
        g_scores = g_out[1]
        d_scores = d_out[1]
        return g_loss, d_loss, g_scores, d_scores, d_labels
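A minimal usage sketch for the combined generator/discriminator module above, assuming the older transformers API this snippet targets (where ElectraForMaskedLM still accepts masked_lm_labels); the tiny config, random ids, and mask id 103 are illustrative only, not from the original repo.

# Illustrative only: tiny config, random token ids, hard-coded mask id.
import torch
from transformers import ElectraConfig

config = ElectraConfig(vocab_size=200, embedding_size=16, hidden_size=32,
                       num_hidden_layers=2, num_attention_heads=2, intermediate_size=64)
model = ElectraForLanguageModelingModel(config)

input_ids = torch.randint(5, 200, (2, 16))
masked_lm_labels = torch.full_like(input_ids, -100)   # -100 = ignored by the masked-LM loss
mask = torch.rand(input_ids.shape) < 0.15             # corrupt roughly 15% of positions
masked_lm_labels[mask] = input_ids[mask]
inputs = input_ids.clone()
inputs[mask] = 103                                    # stand-in [MASK] id

g_loss, d_loss, g_scores, d_scores, d_labels = model(inputs, masked_lm_labels)
loss = g_loss + 50.0 * d_loss                         # the ELECTRA paper weights the discriminator loss by lambda=50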
def __init__(self, config: ElectraConfig, embeddings):
    super().__init__()
    self.embed_layer = nn.Embedding(num_embeddings=config.vocab_size,
                                    embedding_dim=config.embedding_size,
                                    padding_idx=config.vocab_size - 1)
    self.embed_layer.weight = nn.Parameter(embeddings)
    self.discriminator = ElectraForPreTraining(config)
    self.sigmoid = nn.Sigmoid()
def main(train_cfg='config/electra_pretrain.json',
         model_cfg='config/electra_small.json',
         data_file='../tbc/books_large_all.txt',
         model_file=None,
         data_parallel=True,
         vocab='../uncased_L-12_H-768_A-12/vocab.txt',
         log_dir='../exp/electra/pretrain/runs',
         save_dir='../exp/electra/pretrain',
         max_len=128,
         max_pred=20,
         mask_prob=0.15,
         quantize=False):
    check_dirs_exist([log_dir, save_dir])

    train_cfg = ElectraConfig().from_json_file(train_cfg)
    model_cfg = ElectraConfig().from_json_file(model_cfg)

    set_seeds(train_cfg.seed)

    tokenizer = tokenization.FullTokenizer(vocab_file=vocab, do_lower_case=True)
    tokenize = lambda x: tokenizer.tokenize(tokenizer.convert_to_unicode(x))

    pipeline = [
        Preprocess4Pretrain(max_pred, mask_prob, list(tokenizer.vocab.keys()),
                            tokenizer.convert_tokens_to_ids, max_len)
    ]
    data_iter = SentPairDataLoader(data_file, train_cfg.batch_size, tokenize, max_len, pipeline=pipeline)

    # Get distilled-electra and quantized-distilled-electra
    generator = ElectraForMaskedLM.from_pretrained('google/electra-small-generator')
    t_discriminator = ElectraForPreTraining.from_pretrained('google/electra-base-discriminator')
    s_discriminator = QuantizedElectraForPreTraining(model_cfg) if quantize else ElectraForPreTraining
    s_discriminator = s_discriminator.from_pretrained(
        'google/electra-small-discriminator', config=model_cfg)

    # model
    # config is used for model "QuantizedElectraForPreTraining"
    model = DistillElectraForPreTraining(generator, t_discriminator, s_discriminator, model_cfg)

    optimizer = optim.optim4GPU(train_cfg, model)
    writer = SummaryWriter(log_dir=log_dir)  # for tensorboardX

    base_trainer_args = (train_cfg, model_cfg, model, data_iter, None, optimizer, save_dir, get_device())
    trainer = QuantizedDistillElectraTrainer(writer, *base_trainer_args)
    trainer.train(model_file, None, data_parallel)
    trainer._eval()
def create_and_check_electra_for_pretraining(
    self,
    config,
    input_ids,
    token_type_ids,
    input_mask,
    sequence_labels,
    token_labels,
    choice_labels,
    fake_token_labels,
):
    config.num_labels = self.num_labels
    model = ElectraForPreTraining(config=config)
    model.to(torch_device)
    model.eval()
    loss, logits = model(
        input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=fake_token_labels
    )
    result = {
        "loss": loss,
        "logits": logits,
    }
    self.parent.assertListEqual(list(result["logits"].size()), [self.batch_size, self.seq_length])
    self.check_loss_output(result)
def __init__(self, config, **kwargs):
    super(ElectraForLanguageModelingModel, self).__init__(config, **kwargs)
    if "generator_config" in kwargs:
        generator_config = kwargs["generator_config"]
    else:
        generator_config = config
    self.generator_model = ElectraForMaskedLM(generator_config)
    if "discriminator_config" in kwargs:
        discriminator_config = kwargs["discriminator_config"]
    else:
        discriminator_config = config
    self.discriminator_model = ElectraForPreTraining(discriminator_config)
    self.vocab_size = config.vocab_size
def test_inference_no_head_absolute_embedding(self):
    model = ElectraForPreTraining.from_pretrained("google/electra-small-discriminator")
    input_ids = torch.tensor([[0, 345, 232, 328, 740, 140, 1695, 69, 6078, 1588, 2]])
    output = model(input_ids)[0]
    expected_shape = torch.Size((1, 11))
    self.assertEqual(output.shape, expected_shape)
    expected_slice = torch.tensor(
        [[-8.9253, -4.0305, -3.9306, -3.8774, -4.1873, -4.1280, 0.9429, -4.1672, 0.9281, 0.0410, -3.4823]]
    )
    self.assertTrue(torch.allclose(output, expected_slice, atol=1e-4))
def load_huggingface_weights_in_electra(model, config, pytorch_model_path):
    try:
        from transformers import ElectraForPreTraining
    except ImportError:
        raise ImportError("cannot import transformers, please install transformers first")

    hf_model = ElectraForPreTraining.from_pretrained(pytorch_model_path)

    model.electra.embeddings.token_embeddings.weight = (
        hf_model.electra.embeddings.word_embeddings.weight)
    model.electra.embeddings.position_embeddings.weight = (
        hf_model.electra.embeddings.position_embeddings.weight)
    model.electra.embeddings.token_type_embeddings.weight = (
        hf_model.electra.embeddings.token_type_embeddings.weight)
    model.electra.embeddings.layer_norm.weight = hf_model.electra.embeddings.LayerNorm.weight
    model.electra.embeddings.layer_norm.bias = hf_model.electra.embeddings.LayerNorm.bias

    if config.embedding_size != config.hidden_size:
        model.electra.embeddings_project.weight = hf_model.electra.embeddings_project.weight

    for layer_idx in range(config.num_hidden_layers):
        layer = model.electra.encoder.layers[layer_idx]
        hf_layer = hf_model.electra.encoder.layer[layer_idx]

        layer.self.query.weight = hf_layer.attention.self.query.weight
        layer.self.query.bias = hf_layer.attention.self.query.bias
        layer.self.key.weight = hf_layer.attention.self.key.weight
        layer.self.key.bias = hf_layer.attention.self.key.bias
        layer.self.value.weight = hf_layer.attention.self.value.weight
        layer.self.value.bias = hf_layer.attention.self.value.bias
        layer.self.dense.weight = hf_layer.attention.output.dense.weight
        layer.self.dense.bias = hf_layer.attention.output.dense.bias

        layer.feed_forward.intermediate.weight = hf_layer.intermediate.dense.weight
        layer.feed_forward.intermediate.bias = hf_layer.intermediate.dense.bias
        layer.feed_forward.output.weight = hf_layer.output.dense.weight
        layer.feed_forward.output.bias = hf_layer.output.dense.bias

        layer.add_norm[0].layer_norm.weight = hf_layer.attention.output.LayerNorm.weight
        layer.add_norm[0].layer_norm.bias = hf_layer.attention.output.LayerNorm.bias
        layer.add_norm[1].layer_norm.weight = hf_layer.output.LayerNorm.weight
        layer.add_norm[1].layer_norm.bias = hf_layer.output.LayerNorm.bias
def detect_error_demo():
    tokenizer = ElectraTokenizer.from_pretrained(D_model_dir)
    discriminator = ElectraForPreTraining.from_pretrained(D_model_dir)

    sentence = '今天新情很好'  # intentionally misspelled ("新情" instead of "心情") so the detector has an error to flag
    fake_tokens = tokenizer.tokenize(sentence)
    print(fake_tokens)
    fake_inputs = tokenizer.encode(sentence, return_tensors="pt")
    discriminator_outputs = discriminator(fake_inputs)
    predictions = torch.round((torch.sign(discriminator_outputs[0]) + 1) / 2)
    # discriminator_outputs[0] has shape (1, seq_len); take the first row so the
    # per-token flags line up with the tokens.
    predictions = predictions[0]

    print(list(zip(fake_tokens, predictions.tolist())))
    print("fixed " + '*' * 42)
    # encode() adds [CLS]/[SEP] while tokenize() does not, so drop the first and last flag.
    print(list(zip(fake_tokens, predictions.tolist()[1:-1])))
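The sign/round arithmetic above is just a threshold at logit 0; a small equivalent rewrite, reusing discriminator_outputs and fake_tokens from the demo above, purely for illustration.

# Equivalent thresholding, illustrative only; reuses variables from detect_error_demo().
logits = discriminator_outputs[0]                     # (1, seq_len) raw per-token logits
probs = torch.sigmoid(logits)                         # probability that each token was corrupted
flags = (logits > 0).long()[0]                        # 1 = flagged as corrupted, 0 = looks original
print(list(zip(fake_tokens, flags.tolist()[1:-1])))   # drop [CLS]/[SEP] added by encode()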
def __init__(self,
             d_model_dir=os.path.join(pwd_path, "../data/electra_models/chinese_electra_base_discriminator_pytorch/"),
             g_model_dir=os.path.join(pwd_path, "../data/electra_models/chinese_electra_base_generator_pytorch/"),
             ):
    super(ElectraCorrector, self).__init__()
    self.name = 'electra_corrector'
    t1 = time.time()
    self.g_model = pipeline("fill-mask",
                            model=g_model_dir,
                            tokenizer=g_model_dir)
    self.d_model = ElectraForPreTraining.from_pretrained(d_model_dir)
    if self.g_model:
        self.mask = self.g_model.tokenizer.mask_token
        logger.debug('Loaded electra model: %s, spend: %.3f s.' % (g_model_dir, time.time() - t1))
def __init__(self,
             d_model_dir=config.electra_D_model_dir,
             g_model_dir=config.electra_G_model_dir,
             device=device_id):
    super(ElectraCorrector, self).__init__()
    self.name = 'electra_corrector'
    t1 = time.time()
    self.g_model = pipeline(
        "fill-mask",
        model=g_model_dir,
        tokenizer=g_model_dir,
        device=device,  # gpu device id
    )
    self.d_model = ElectraForPreTraining.from_pretrained(d_model_dir)
    if self.g_model:
        self.mask = self.g_model.tokenizer.mask_token
        logger.debug('Loaded electra model: %s, spend: %.3f s.' % (g_model_dir, time.time() - t1))
def create_and_check_electra_for_pretraining(
    self,
    config,
    input_ids,
    token_type_ids,
    input_mask,
    sequence_labels,
    token_labels,
    choice_labels,
    fake_token_labels,
):
    config.num_labels = self.num_labels
    model = ElectraForPreTraining(config=config)
    model.to(torch_device)
    model.eval()
    result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=fake_token_labels)
    self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length))
    disc_pred = disc_logits > 0
    return gen_pred, generated, disc_pred, is_replaced

# %% [markdown]
# # 5. Train

# %%
# Generator and Discriminator
if c.my_model:
    gen_hparam['tie_in_out_embedding'] = c.tie_gen_in_out_embedding
    generator = ModelForGenerator(gen_hparam)
    discriminator = ModelForDiscriminator(disc_hparam)
    discriminator.electra.embedding = generator.electra.embedding
else:
    generator = ElectraForMaskedLM(gen_config)
    discriminator = ElectraForPreTraining(disc_config)
    discriminator.electra.embeddings = generator.electra.embeddings
    if c.tie_gen_in_out_embedding:
        generator.generator_predictions.dense.weight = generator.electra.embeddings.word_embeddings.weight

# ELECTRA training loop
electra_model = ELECTRAModel(generator, discriminator, hf_tokenizer)
electra_loss_func = ELECTRALoss(gen_label_smooth=c.gen_smooth_label, disc_label_smooth=c.disc_smooth_label)

# jit (Haven't figured out how to make it work)
# input_ids, sentA_lenths = dls.one_batch()
# masked_inputs, labels, is_mlm_applied = mlm_cb.mask_tokens(input_ids)
# electra_jit_model = torch.jit.trace(electra_model, (masked_inputs, sentA_lenths, is_mlm_applied, labels))

# Optimizer
if c.adam_bias_correction:
    opt_func = partial(Adam, eps=1e-6, mom=0.9, sqr_mom=0.999, wd=0.01)
def __init__(
    self,
    model_type,
    model_name,
    generator_name=None,
    discriminator_name=None,
    train_files=None,
    args=None,
    use_cuda=True,
    cuda_device=-1,
    **kwargs,
):
    """
    Initializes a LanguageModelingModel.

    Args:
        model_type: The type of model (gpt2, openai-gpt, bert, roberta, distilbert, camembert)
        model_name: Default Transformer model name or path to a directory containing Transformer model file (pytorch_model.bin).
        generator_name (optional): A pretrained model name or path to a directory containing an ELECTRA generator model.
        discriminator_name (optional): A pretrained model name or path to a directory containing an ELECTRA discriminator model.
        args (optional): Default args will be used if this parameter is not provided. If provided, it should be a dict containing the args that should be changed in the default args.
        train_files (optional): List of files to be used when training the tokenizer.
        use_cuda (optional): Use GPU if available. Setting to False will force model to use CPU only.
        cuda_device (optional): Specific GPU that should be used. Will use the first available GPU by default.
        **kwargs (optional): For providing proxies, force_download, resume_download, cache_dir and other options specific to the 'from_pretrained' implementation where this will be supplied.
    """  # noqa: ignore flake8

    if args and "manual_seed" in args:
        random.seed(args["manual_seed"])
        np.random.seed(args["manual_seed"])
        torch.manual_seed(args["manual_seed"])
        if "n_gpu" in args and args["n_gpu"] > 0:
            torch.cuda.manual_seed_all(args["manual_seed"])

    if use_cuda:
        if torch.cuda.is_available():
            if cuda_device == -1:
                self.device = torch.device("cuda")
            else:
                self.device = torch.device(f"cuda:{cuda_device}")
        else:
            raise ValueError(
                "'use_cuda' set to True when cuda is unavailable."
                " Make sure CUDA is available or set use_cuda=False."
            )
    else:
        self.device = "cpu"

    self.results = {}

    self.args = {
        "dataset_type": "None",
        "dataset_class": None,
        "custom_tokenizer": None,
        "block_size": -1,
        "mlm": True,
        "mlm_probability": 0.15,
        "max_steps": -1,
        "config_name": None,
        "tokenizer_name": None,
        "min_frequency": 2,
        "special_tokens": ["<s>", "<pad>", "</s>", "<unk>", "<mask>"],
        "sliding_window": False,
        "stride": 0.8,
        "generator_config": {},
        "discriminator_config": {},
        "vocab_size": None,
    }

    self.args.update(global_args)

    if not use_cuda:
        self.args["fp16"] = False

    if args:
        self.args.update(args)

    self.args["model_name"] = model_name
    self.args["model_type"] = model_type

    config_class, model_class, tokenizer_class = MODEL_CLASSES[model_type]
    self.tokenizer_class = tokenizer_class
    new_tokenizer = False

    if self.args["tokenizer_name"]:
        self.tokenizer = tokenizer_class.from_pretrained(
            self.args["tokenizer_name"], cache_dir=self.args["cache_dir"]
        )
    elif self.args["model_name"]:
        self.tokenizer = tokenizer_class.from_pretrained(model_name, cache_dir=self.args["cache_dir"], **kwargs)
        self.args["tokenizer_name"] = self.args["model_name"]
    else:
        if not train_files:
            raise ValueError(
                "model_name and tokenizer_name are not specified."
                "You must specify train_files to train a Tokenizer."
            )
        else:
            self.train_tokenizer(train_files)
            new_tokenizer = True

    if self.args["config_name"]:
        self.config = config_class.from_pretrained(self.args["config_name"], cache_dir=self.args["cache_dir"])
    elif self.args["model_name"]:
        self.config = config_class.from_pretrained(model_name, cache_dir=self.args["cache_dir"], **kwargs)
    else:
        self.config = config_class(**self.args["config"], **kwargs)
    if self.args["vocab_size"]:
        self.config.vocab_size = self.args["vocab_size"]
    if new_tokenizer:
        self.config.vocab_size = len(self.tokenizer)

    if self.args["model_type"] == "electra":
        if generator_name:
            self.generator_config = ElectraConfig.from_pretrained(generator_name)
        elif self.args["model_name"]:
            self.generator_config = ElectraConfig.from_pretrained(
                os.path.join(self.args["model_name"], "generator_config"), **kwargs,
            )
        else:
            self.generator_config = ElectraConfig(**self.args["generator_config"], **kwargs)
            if new_tokenizer:
                self.generator_config.vocab_size = len(self.tokenizer)

        if discriminator_name:
            self.discriminator_config = ElectraConfig.from_pretrained(discriminator_name)
        elif self.args["model_name"]:
            self.discriminator_config = ElectraConfig.from_pretrained(
                os.path.join(self.args["model_name"], "discriminator_config"), **kwargs,
            )
        else:
            self.discriminator_config = ElectraConfig(**self.args["discriminator_config"], **kwargs)
            if new_tokenizer:
                self.discriminator_config.vocab_size = len(self.tokenizer)

    if self.args["block_size"] <= 0:
        self.args["block_size"] = min(self.args["max_seq_length"], self.tokenizer.max_len)
    else:
        self.args["block_size"] = min(self.args["block_size"], self.tokenizer.max_len, self.args["max_seq_length"])

    if self.args["model_name"]:
        if self.args["model_type"] == "electra":
            self.model = model_class.from_pretrained(
                model_name,
                config=self.config,
                cache_dir=self.args["cache_dir"],
                generator_config=self.generator_config,
                discriminator_config=self.discriminator_config,
                **kwargs,
            )
            self.model.load_state_dict(torch.load(os.path.join(self.args["model_name"], "pytorch_model.bin")))
        else:
            self.model = model_class.from_pretrained(
                model_name,
                config=self.config,
                cache_dir=self.args["cache_dir"],
                **kwargs,
            )
    else:
        logger.info(" Training language model from scratch")
        if self.args["model_type"] == "electra":
            generator_model = ElectraForMaskedLM(config=self.generator_config)
            discriminator_model = ElectraForPreTraining(config=self.discriminator_config)
            self.model = ElectraForLanguageModelingModel(
                config=self.config,
                generator_model=generator_model,
                discriminator_model=discriminator_model,
                generator_config=self.generator_config,
                discriminator_config=self.discriminator_config,
            )
            model_to_resize = (
                self.model.generator_model.module
                if hasattr(self.model.generator_model, "module")
                else self.model.generator_model
            )
            model_to_resize.resize_token_embeddings(len(self.tokenizer))

            model_to_resize = (
                self.model.discriminator_model.module
                if hasattr(self.model.discriminator_model, "module")
                else self.model.discriminator_model
            )
            model_to_resize.resize_token_embeddings(len(self.tokenizer))
        else:
            self.model = model_class(config=self.config)
            model_to_resize = self.model.module if hasattr(self.model, "module") else self.model
            model_to_resize.resize_token_embeddings(len(self.tokenizer))

    if model_type in ["camembert", "xlmroberta"]:
        warnings.warn(
            f"use_multiprocessing automatically disabled as {model_type}"
            " fails when using multiprocessing for feature conversion."
        )
        self.args["use_multiprocessing"] = False

    if self.args["wandb_project"] and not wandb_available:
        warnings.warn("wandb_project specified but wandb is not available. Wandb disabled.")
        self.args["wandb_project"] = None
import os
import sys

import torch
from fio import read
from transformers import ElectraTokenizer, ElectraForPreTraining

"""
python ppl.py [hypothesis file]
"""

hyp_file = sys.argv[1]
assert os.path.exists(hyp_file)

device = torch.device("cuda:0")

tokenizer = ElectraTokenizer.from_pretrained('google/electra-small-discriminator')
model = ElectraForPreTraining.from_pretrained('google/electra-small-discriminator')
model.to(device)
model.eval()

sigmoid = torch.nn.Sigmoid()
pad_id = tokenizer.pad_token_id

src_tsf_pairs = [line.split("\t")[:2] for line in read(hyp_file)]
src_tsf_pairs = [(tokenizer.encode(src), tokenizer.encode(tsf)) for src, tsf in src_tsf_pairs]


def format_samples(pairs):
    src_bat, tsf_bat = zip(*pairs)
    max_l_src, max_l_tsf = max([len(src) for src in src_bat]), max([len(tsf) for tsf in tsf_bat])
def __init__(
    self,
    model_type,
    model_name,
    generator_name=None,
    discriminator_name=None,
    train_files=None,
    args=None,
    use_cuda=True,
    cuda_device=-1,
    **kwargs,
):
    """
    Initializes a LanguageModelingModel.

    Args:
        model_type: The type of model (gpt2, openai-gpt, bert, roberta, distilbert, camembert)
        model_name: Default Transformer model name or path to a directory containing Transformer model file (pytorch_model.bin).
        generator_name (optional): A pretrained model name or path to a directory containing an ELECTRA generator model.
        discriminator_name (optional): A pretrained model name or path to a directory containing an ELECTRA discriminator model.
        args (optional): Default args will be used if this parameter is not provided. If provided, it should be a dict containing the args that should be changed in the default args.
        train_files (optional): List of files to be used when training the tokenizer.
        use_cuda (optional): Use GPU if available. Setting to False will force model to use CPU only.
        cuda_device (optional): Specific GPU that should be used. Will use the first available GPU by default.
        **kwargs (optional): For providing proxies, force_download, resume_download, cache_dir and other options specific to the 'from_pretrained' implementation where this will be supplied.
    """  # noqa: ignore flake8

    self.args = self._load_model_args(model_name)

    if isinstance(args, dict):
        self.args.update_from_dict(args)
    elif isinstance(args, LanguageModelingArgs):
        self.args = args

    if "sweep_config" in kwargs:
        sweep_config = kwargs.pop("sweep_config")
        sweep_values = {key: value["value"] for key, value in sweep_config.as_dict().items() if key != "_wandb"}
        self.args.update_from_dict(sweep_values)

    if self.args.manual_seed:
        random.seed(self.args.manual_seed)
        np.random.seed(self.args.manual_seed)
        torch.manual_seed(self.args.manual_seed)
        if self.args.n_gpu > 0:
            torch.cuda.manual_seed_all(self.args.manual_seed)

    if self.args.local_rank != -1:
        logger.info(f"local_rank: {self.args.local_rank}")
        torch.distributed.init_process_group(backend="nccl")
        cuda_device = self.args.local_rank

    if use_cuda:
        if torch.cuda.is_available():
            if cuda_device == -1:
                self.device = torch.device("cuda")
            else:
                self.device = torch.device(f"cuda:{cuda_device}")
        else:
            raise ValueError(
                "'use_cuda' set to True when cuda is unavailable."
                " Make sure CUDA is available or set use_cuda=False."
            )
    else:
        self.device = "cpu"

    self.results = {}

    if not use_cuda:
        self.args.fp16 = False

    self.args.model_name = model_name
    self.args.model_type = model_type

    config_class, model_class, tokenizer_class = MODEL_CLASSES[model_type]
    self.tokenizer_class = tokenizer_class
    new_tokenizer = False

    if self.args.tokenizer_name:
        self.tokenizer = tokenizer_class.from_pretrained(self.args.tokenizer_name, cache_dir=self.args.cache_dir)
    elif self.args.model_name:
        if self.args.model_name == "electra":
            self.tokenizer = tokenizer_class.from_pretrained(
                generator_name, cache_dir=self.args.cache_dir, **kwargs
            )
            self.args.tokenizer_name = self.args.model_name
        else:
            self.tokenizer = tokenizer_class.from_pretrained(model_name, cache_dir=self.args.cache_dir, **kwargs)
            self.args.tokenizer_name = self.args.model_name
    else:
        if not train_files:
            raise ValueError(
                "model_name and tokenizer_name are not specified."
                "You must specify train_files to train a Tokenizer."
            )
        else:
            self.train_tokenizer(train_files)
            new_tokenizer = True

    if self.args.config_name:
        self.config = config_class.from_pretrained(self.args.config_name, cache_dir=self.args.cache_dir)
    elif self.args.model_name and self.args.model_name != "electra":
        self.config = config_class.from_pretrained(model_name, cache_dir=self.args.cache_dir, **kwargs)
    else:
        self.config = config_class(**self.args.config, **kwargs)
    if self.args.vocab_size:
        self.config.vocab_size = self.args.vocab_size
    if new_tokenizer:
        self.config.vocab_size = len(self.tokenizer)

    if self.args.model_type == "electra":
        if generator_name:
            self.generator_config = ElectraConfig.from_pretrained(generator_name)
        elif self.args.model_name:
            self.generator_config = ElectraConfig.from_pretrained(
                os.path.join(self.args.model_name, "generator_config"), **kwargs,
            )
        else:
            self.generator_config = ElectraConfig(**self.args.generator_config, **kwargs)
            if new_tokenizer:
                self.generator_config.vocab_size = len(self.tokenizer)

        if discriminator_name:
            self.discriminator_config = ElectraConfig.from_pretrained(discriminator_name)
        elif self.args.model_name:
            self.discriminator_config = ElectraConfig.from_pretrained(
                os.path.join(self.args.model_name, "discriminator_config"), **kwargs,
            )
        else:
            self.discriminator_config = ElectraConfig(**self.args.discriminator_config, **kwargs)
            if new_tokenizer:
                self.discriminator_config.vocab_size = len(self.tokenizer)

    if self.args.block_size <= 0:
        self.args.block_size = min(self.args.max_seq_length, self.tokenizer.max_len)
    else:
        self.args.block_size = min(self.args.block_size, self.tokenizer.max_len, self.args.max_seq_length)

    if self.args.model_name:
        if self.args.model_type == "electra":
            if self.args.model_name == "electra":
                generator_model = ElectraForMaskedLM.from_pretrained(generator_name)
                discriminator_model = ElectraForPreTraining.from_pretrained(discriminator_name)
                self.model = ElectraForLanguageModelingModel(
                    config=self.config,
                    generator_model=generator_model,
                    discriminator_model=discriminator_model,
                    generator_config=self.generator_config,
                    discriminator_config=self.discriminator_config,
                    tie_generator_and_discriminator_embeddings=self.args.tie_generator_and_discriminator_embeddings,
                )
                model_to_resize = (
                    self.model.generator_model.module
                    if hasattr(self.model.generator_model, "module")
                    else self.model.generator_model
                )
                model_to_resize.resize_token_embeddings(len(self.tokenizer))

                model_to_resize = (
                    self.model.discriminator_model.module
                    if hasattr(self.model.discriminator_model, "module")
                    else self.model.discriminator_model
                )
                model_to_resize.resize_token_embeddings(len(self.tokenizer))
                self.model.generator_model = generator_model
                self.model.discriminator_model = discriminator_model
            else:
                self.model = model_class.from_pretrained(
                    model_name,
                    config=self.config,
                    cache_dir=self.args.cache_dir,
                    generator_config=self.generator_config,
                    discriminator_config=self.discriminator_config,
                    **kwargs,
                )
                self.model.load_state_dict(torch.load(os.path.join(self.args.model_name, "pytorch_model.bin")))
        else:
            self.model = model_class.from_pretrained(
                model_name,
                config=self.config,
                cache_dir=self.args.cache_dir,
                **kwargs,
            )
    else:
        logger.info(" Training language model from scratch")
        if self.args.model_type == "electra":
            generator_model = ElectraForMaskedLM(config=self.generator_config)
            discriminator_model = ElectraForPreTraining(config=self.discriminator_config)
            self.model = ElectraForLanguageModelingModel(
                config=self.config,
                generator_model=generator_model,
                discriminator_model=discriminator_model,
                generator_config=self.generator_config,
                discriminator_config=self.discriminator_config,
                tie_generator_and_discriminator_embeddings=self.args.tie_generator_and_discriminator_embeddings,
            )
            model_to_resize = (
                self.model.generator_model.module
                if hasattr(self.model.generator_model, "module")
                else self.model.generator_model
            )
            model_to_resize.resize_token_embeddings(len(self.tokenizer))

            model_to_resize = (
                self.model.discriminator_model.module
                if hasattr(self.model.discriminator_model, "module")
                else self.model.discriminator_model
            )
            model_to_resize.resize_token_embeddings(len(self.tokenizer))
        else:
            self.model = model_class(config=self.config)
            model_to_resize = self.model.module if hasattr(self.model, "module") else self.model
            model_to_resize.resize_token_embeddings(len(self.tokenizer))

    if model_type in ["camembert", "xlmroberta"]:
        warnings.warn(
            f"use_multiprocessing automatically disabled as {model_type}"
            " fails when using multiprocessing for feature conversion."
        )
        self.args.use_multiprocessing = False

    if self.args.wandb_project and not wandb_available:
        warnings.warn("wandb_project specified but wandb is not available. Wandb disabled.")
        self.args.wandb_project = None
def get_glue_learner(task, run_name=None, inference=False):

    # Num_epochs
    if task in ['rte', 'stsb']:
        num_epochs = 10
    else:
        num_epochs = 3

    # Dataloaders
    dls = glue_dls[task]
    if isinstance(c.device, str):
        dls.to(torch.device(c.device))
    elif isinstance(c.device, list):
        dls.to(torch.device('cuda', c.device[0]))
    else:
        dls.to(torch.device('cuda:0'))

    # Load pretrained model
    if not c.pretrained_checkpoint:
        discriminator = ElectraForPreTraining.from_pretrained(f"google/electra-{c.size}-discriminator")
    else:
        discriminator = ModelForDiscriminator(hparam) if c.my_model else ElectraForPreTraining(electra_config)
        load_part_model(c.pretrained_ckp_path, discriminator, 'discriminator')

    # Create finetuning model
    if task == 'wnli' and c.wsc_trick:
        model = ELECTRAWSCTrickModel(discriminator, hf_tokenizer.pad_token_id)
    else:
        model = SentencePredictor(discriminator.electra, electra_config.hidden_size, num_class=NUM_CLASS[task])

    # Discriminative learning rates
    splitter = partial(hf_electra_param_splitter, wsc_trick=(task == 'wnli' and c.wsc_trick))
    layer_lrs = get_layer_lrs(lr=c.lr,
                              decay_rate=c.layer_lr_decay,
                              num_hidden_layers=electra_config.num_hidden_layers,)

    # Optimizer
    if c.adam_bias_correction:
        opt_func = partial(Adam, eps=1e-6, mom=0.9, sqr_mom=0.999, wd=0.01)
    else:
        opt_func = partial(Adam_no_bias_correction, eps=1e-6, mom=0.9, sqr_mom=0.999, wd=0.01)

    # Learner
    learn = Learner(dls, model,
                    loss_func=LOSS_FUNC[task],
                    opt_func=opt_func,
                    metrics=[eval(f'{metric}()') for metric in METRICS[task]],
                    splitter=splitter if not inference else trainable_params,
                    lr=layer_lrs if not inference else defaults.lr,
                    path='./checkpoints',
                    model_dir='glue',)

    # Multi gpu
    if isinstance(c.device, list) or c.device is None:
        learn.model = nn.DataParallel(learn.model, device_ids=c.device)

    # Gradient clip
    learn.add_cb(GradientClipping(1.0))

    # Logging
    if run_name and not inference:
        neptune.create_experiment(name=run_name, params={'task': task, **c, **hparam_update})
        learn.add_cb(SimplerNeptuneCallback(False))

    # Learning rate schedule
    if c.schedule == 'one_cycle':
        return learn, partial(learn.fit_one_cycle, n_epoch=num_epochs, lr_max=layer_lrs)
    elif c.schedule == 'adjusted_one_cycle':
        return learn, partial(learn.fit_one_cycle, n_epoch=num_epochs, lr_max=layer_lrs, div=1e5, pct_start=0.1)
    else:
        lr_shed_func = linear_warmup_and_then_decay if c.schedule == 'separate_linear' else linear_warmup_and_decay
        lr_shedule = ParamScheduler({'lr': partial(lr_shed_func,
                                                   lr_max=np.array(layer_lrs),
                                                   warmup_pct=0.1,
                                                   total_steps=num_epochs * (len(dls.train)))})
        return learn, partial(learn.fit, n_epoch=num_epochs, cbs=[lr_shedule])
def get_glue_learner(task, run_name=None, inference=False):
    is_wsc_trick = task == "wnli" and c.wsc_trick

    # Num_epochs
    if task in ["rte", "stsb"]:
        num_epochs = 10
    else:
        num_epochs = 3

    # Dataloaders
    dls = glue_dls[task]
    if isinstance(c.device, str):
        dls.to(torch.device(c.device))
    elif isinstance(c.device, list):
        dls.to(torch.device("cuda", c.device[0]))
    else:
        dls.to(torch.device("cuda:0"))

    # Load pretrained model
    if not c.pretrained_checkpoint:
        discriminator = ElectraForPreTraining.from_pretrained(f"google/electra-{c.size}-discriminator")
    else:
        discriminator = (ModelForDiscriminator(hparam) if c.my_model else ElectraForPreTraining(electra_config))
        load_part_model(c.pretrained_ckp_path, discriminator, "discriminator")

    # Seeds & PyTorch benchmark
    torch.backends.cudnn.benchmark = True
    if c.seeds:
        dls[0].rng = random.Random(c.seeds[i])  # for fastai dataloader
        random.seed(c.seeds[i])
        np.random.seed(c.seeds[i])
        torch.manual_seed(c.seeds[i])

    # Create finetuning model
    if is_wsc_trick:
        model = ELECTRAWSCTrickModel(discriminator, hf_tokenizer.pad_token_id)
    else:
        model = SentencePredictor(discriminator.electra, electra_config.hidden_size, num_class=NUM_CLASS[task])

    # Discriminative learning rates
    splitter = partial(hf_electra_param_splitter, wsc_trick=is_wsc_trick)
    layer_lrs = get_layer_lrs(
        lr=c.lr,
        decay_rate=c.layer_lr_decay,
        num_hidden_layers=electra_config.num_hidden_layers,
    )

    # Optimizer
    if c.adam_bias_correction:
        opt_func = partial(Adam, eps=1e-6, mom=0.9, sqr_mom=0.999, wd=c.weight_decay)
    else:
        opt_func = partial(Adam_no_bias_correction, eps=1e-6, mom=0.9, sqr_mom=0.999, wd=c.weight_decay)

    # Learner
    learn = Learner(
        dls,
        model,
        loss_func=LOSS_FUNC[task],
        opt_func=opt_func,
        metrics=METRICS[task],
        splitter=splitter if not inference else trainable_params,
        lr=layer_lrs if not inference else defaults.lr,
        path="./checkpoints/glue",
        model_dir=c.group_name,
    )

    # Multi gpu
    if isinstance(c.device, list) or c.device is None:
        learn.create_opt()
        learn.model = nn.DataParallel(learn.model, device_ids=c.device)

    # Mixed precision
    learn.to_native_fp16(init_scale=2.0**14)

    # Gradient clip
    learn.add_cb(GradientClipping(1.0))

    # Logging
    if run_name and not inference:
        if c.logger == "neptune":
            neptune.create_experiment(name=run_name, params={"task": task, **c, **hparam_update})
            learn.add_cb(LightNeptuneCallback(False))
        elif c.logger == "wandb":
            wandb_run = wandb.init(
                name=run_name,
                project="electra_glue",
                config={"task": task, **c, **hparam_update},
                reinit=True,
            )
            learn.add_cb(LightWandbCallback(wandb_run))

    # Learning rate schedule
    if c.schedule == "one_cycle":
        return learn, partial(learn.fit_one_cycle, n_epoch=num_epochs, lr_max=layer_lrs)
    elif c.schedule == "adjusted_one_cycle":
        return learn, partial(
            learn.fit_one_cycle,
            n_epoch=num_epochs,
            lr_max=layer_lrs,
            div=1e5,
            pct_start=0.1,
        )
    else:
        lr_shed_func = (linear_warmup_and_then_decay if c.schedule == "separate_linear" else linear_warmup_and_decay)
        lr_shedule = ParamScheduler({
            "lr": partial(
                lr_shed_func,
                lr_max=np.array(layer_lrs),
                warmup_pct=0.1,
                total_steps=num_epochs * (len(dls.train)),
            )
        })
        return learn, partial(learn.fit, n_epoch=num_epochs, cbs=[lr_shedule])
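Both versions of get_glue_learner return a (learner, fit-callable) pair; a hedged sketch of how that pair might be consumed (task and run names are placeholders, and the global config object c is assumed to be set up as in the snippets above).

# Illustrative call pattern for the returned (learner, fit function) pair; names are placeholders.
learn, fit_fn = get_glue_learner("cola", run_name="electra_small_cola_run0")
fit_fn()                        # runs fit / fit_one_cycle with whichever schedule was selected above
learn.save("cola_finetuned")    # fastai saves under learn.path / learn.model_dir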
def train(rank, args):

    #######################
    ## distributed

    if args.distributed_enabled:
        torch.distributed.init_process_group(
            backend='nccl',
            init_method='env://',
            world_size=args.distributed_world_size,
            rank=rank)
    if args.gpu_enabled:
        device = torch.device('cuda:{}'.format(rank))
    else:
        device = torch.device('cpu')

    is_master = True if not args.distributed_enabled else args.distributed_enabled and rank == 0

    #######################
    ## preamble

    set_gpus(rank)
    set_seed(rank)
    set_cuda(deterministic=args.gpu_deterministic)

    output_dir = f'{args.output_dir}/{rank}'
    os.makedirs(output_dir, exist_ok=False)

    setup_logging(filename=f'{output_dir}/output.log', console=is_master)

    #######################
    ## dataset

    tokenizer = new_tokenizer(vocab_file=args.data_vocab_file)
    vocab_size = len(tokenizer.vocab)
    ds_train = wrap_example_builder(
        dataset=load_owt(owt_dir=args.data_dir, n_tensors_per_file=args.data_n_tensors_per_file),
        vocab=tokenizer.vocab,
        max_length=args.data_max_seq_length)

    pad_token_id = tokenizer.vocab['[PAD]']
    mask_token_id = tokenizer.vocab['[MASK]']
    cls_token_id = tokenizer.vocab['[CLS]']
    sep_token_id = tokenizer.vocab['[SEP]']

    assert pad_token_id == 0
    assert cls_token_id == 101
    assert sep_token_id == 102
    assert mask_token_id == 103

    def collate_batch(examples):
        input_ids = torch.nn.utils.rnn.pad_sequence(
            [example['input_ids'] for example in examples],
            batch_first=True,
            padding_value=pad_token_id)
        input_mask = torch.nn.utils.rnn.pad_sequence(
            [example['input_mask'] for example in examples],
            batch_first=True,
            padding_value=pad_token_id)
        segment_ids = torch.nn.utils.rnn.pad_sequence(
            [example['segment_ids'] for example in examples],
            batch_first=True,
            padding_value=pad_token_id)
        return input_ids, input_mask, segment_ids

    def cycle(iterable):
        while True:
            for x in iterable:
                yield x

    ds_train_loader = iter(
        cycle(DataLoader(ds_train, batch_size=args.opt_batch_size, collate_fn=collate_batch)))

    #######################
    ## model

    def to_distributed_model(model):
        return model if not args.distributed_enabled else torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[rank], find_unused_parameters=True)

    def tie_weights(generator, discriminator):
        generator.electra.embeddings.word_embeddings = discriminator.electra.embeddings.word_embeddings
        generator.electra.embeddings.position_embeddings = discriminator.electra.embeddings.position_embeddings
        generator.electra.embeddings.token_type_embeddings = discriminator.electra.embeddings.token_type_embeddings

    class LogitsAdapter(torch.nn.Module):
        def __init__(self, adaptee):
            super().__init__()
            self.adaptee = adaptee

        def forward(self, *args, **kwargs):
            return self.adaptee(*args, **kwargs)[0]

    from transformers import AutoConfig, ElectraForMaskedLM, ElectraForPreTraining

    generator = ElectraForMaskedLM(AutoConfig.from_pretrained(args.model_generator))
    discriminator = ElectraForPreTraining(AutoConfig.from_pretrained(args.model_discriminator))

    tie_weights(generator, discriminator)

    model = to_distributed_model(
        Electra(
            LogitsAdapter(generator),
            LogitsAdapter(discriminator),
            num_tokens=vocab_size,
            mask_token_id=mask_token_id,
            pad_token_id=pad_token_id,
            mask_prob=args.model_mask_prob,
            mask_ignore_token_ids=[tokenizer.vocab['[CLS]'], tokenizer.vocab['[SEP]']],
            random_token_prob=0.0).to(device))

    #######################
    ## optimizer

    def get_linear_schedule_with_warmup(optimizer, num_warmup_steps, num_training_steps, last_epoch=-1):
        def lr_lambda(current_step):
            learning_rate = max(0.0, 1. - (float(current_step) / float(num_training_steps)))
            learning_rate *= min(1.0, float(current_step) / float(num_warmup_steps))
            return learning_rate

        return LambdaLR(optimizer, lr_lambda, last_epoch)

    def get_params_without_weight_decay_ln(named_params, weight_decay):
        no_decay = ['bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [
            {
                'params': [p for n, p in named_params if not any(nd in n for nd in no_decay)],
                'weight_decay': weight_decay,
            },
            {
                'params': [p for n, p in named_params if any(nd in n for nd in no_decay)],
                'weight_decay': 0.0,
            },
        ]
        return optimizer_grouped_parameters

    optimizer = torch.optim.AdamW(
        get_params_without_weight_decay_ln(model.named_parameters(), weight_decay=0.1),
        lr=args.opt_lr,
        betas=(0.9, 0.999),
        eps=1e-08)
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=args.opt_warmup_steps,
        num_training_steps=args.opt_num_training_steps)
    scaler = torch.cuda.amp.GradScaler(enabled=args.gpu_mixed_precision)

    #######################
    ## train

    t, steps_s, eta_m = time(), 0., 0

    for step in range(args.opt_num_training_steps + 1):
        input_ids, input_mask, segment_ids = next(ds_train_loader)

        input_ids = input_ids.to(device)
        input_mask = input_mask.to(device)
        segment_ids = segment_ids.to(device)

        assert input_ids.shape[1] <= args.data_max_seq_length

        optimizer.zero_grad()

        with torch.cuda.amp.autocast(enabled=args.gpu_mixed_precision):
            loss, loss_mlm, loss_disc, acc_gen, acc_disc, disc_labels, disc_pred = model(
                input_ids, attention_mask=input_mask, token_type_ids=segment_ids)

        scaler.scale(loss).backward()
        scaler.unscale_(optimizer)
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        scaler.step(optimizer)
        scaler.update()
        scheduler.step()

        metrics = {
            'step': (step, '{:8d}'),
            'loss': (loss.item(), '{:8.5f}'),
            'loss_mlm': (loss_mlm.item(), '{:8.5f}'),
            'loss_disc': (loss_disc.item(), '{:8.5f}'),
            'acc_gen': (acc_gen.item(), '{:5.3f}'),
            'acc_disc': (acc_disc.item(), '{:5.3f}'),
            'lr': (scheduler.get_last_lr()[0], '{:8.7f}'),
            'steps': (steps_s, '{:4.1f}/s'),
            'eta': (eta_m, '{:4d}m'),
        }

        if step % args.step_log == 0:
            sep = ' ' * 2
            logger.info(sep.join([f'{k}: {v[1].format(v[0])}' for (k, v) in metrics.items()]))

        if step > 0 and step % 100 == 0:
            t2 = time()
            steps_s = 100. / (t2 - t)
            eta_m = int(((args.opt_num_training_steps - step) / steps_s) // 60)
            t = t2

        if step % 200 == 0:
            logger.info(np.array2string(disc_labels[0].cpu().numpy(), threshold=sys.maxsize, max_line_width=sys.maxsize))
            logger.info(np.array2string(disc_pred[0].cpu().numpy(), threshold=sys.maxsize, max_line_width=sys.maxsize))

        if step > 0 and step % args.step_ckpt == 0 and is_master:
            discriminator.electra.save_pretrained(f'{args.output_dir}/ckpt/{step}')
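Since the checkpointing line saves only the discriminator backbone (discriminator.electra), a plausible way to pick it back up for fine-tuning is sketched below; the directory and step number are placeholders, and a "weights not initialised" warning from transformers is expected when a fresh task head is attached.

# Illustrative reload of the saved backbone; the path/step are placeholders.
from transformers import ElectraModel, ElectraForSequenceClassification

ckpt_dir = "output/ckpt/100000"                     # written by discriminator.electra.save_pretrained(...)
backbone = ElectraModel.from_pretrained(ckpt_dir)   # exactly what was saved above

# Or attach a new (randomly initialised) classification head on top of the saved encoder.
classifier = ElectraForSequenceClassification.from_pretrained(ckpt_dir, num_labels=2)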