def main(train_cfg='config/electra_pretrain.json',
         model_cfg='config/electra_small.json',
         data_file='../tbc/books_large_all.txt',
         model_file=None,
         data_parallel=True,
         vocab='../uncased_L-12_H-768_A-12/vocab.txt',
         log_dir='../exp/electra/pretrain/runs',
         save_dir='../exp/electra/pretrain',
         max_len=128,
         max_pred=20,
         mask_prob=0.15,
         quantize=False):
    check_dirs_exist([log_dir, save_dir])

    train_cfg = ElectraConfig().from_json_file(train_cfg)
    model_cfg = ElectraConfig().from_json_file(model_cfg)
    set_seeds(train_cfg.seed)

    tokenizer = tokenization.FullTokenizer(vocab_file=vocab, do_lower_case=True)
    tokenize = lambda x: tokenizer.tokenize(tokenizer.convert_to_unicode(x))

    pipeline = [Preprocess4Pretrain(max_pred, mask_prob,
                                    list(tokenizer.vocab.keys()),
                                    tokenizer.convert_tokens_to_ids, max_len)]
    data_iter = SentPairDataLoader(data_file, train_cfg.batch_size, tokenize,
                                   max_len, pipeline=pipeline)

    # Build distilled-electra (and optionally quantized-distilled-electra).
    generator = ElectraForMaskedLM.from_pretrained('google/electra-small-generator')
    t_discriminator = ElectraForPreTraining.from_pretrained('google/electra-base-discriminator')
    # Pick the student class first, then load weights; the explicit `config`
    # is required by QuantizedElectraForPreTraining.
    s_discriminator_cls = QuantizedElectraForPreTraining if quantize else ElectraForPreTraining
    s_discriminator = s_discriminator_cls.from_pretrained(
        'google/electra-small-discriminator', config=model_cfg)
    model = DistillElectraForPreTraining(generator, t_discriminator, s_discriminator, model_cfg)

    optimizer = optim.optim4GPU(train_cfg, model)
    writer = SummaryWriter(log_dir=log_dir)  # for tensorboardX
    base_trainer_args = (train_cfg, model_cfg, model, data_iter, None, optimizer,
                         save_dir, get_device())
    trainer = QuantizedDistillElectraTrainer(writer, *base_trainer_args)
    trainer.train(model_file, None, data_parallel)
    trainer._eval()
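# A hedged sketch of how this entrypoint could be invoked from the command
# line; it assumes the python-fire package (not shown elsewhere in this file),
# which maps main()'s keyword arguments onto CLI flags.
if __name__ == '__main__':
    import fire
    fire.Fire(main)
    # e.g. python pretrain.py --quantize=True --max_len=128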
def test_inference_no_head_absolute_embedding(self):
    model = ElectraForPreTraining.from_pretrained("google/electra-small-discriminator")
    input_ids = torch.tensor([[0, 345, 232, 328, 740, 140, 1695, 69, 6078, 1588, 2]])
    output = model(input_ids)[0]
    expected_shape = torch.Size((1, 11))
    self.assertEqual(output.shape, expected_shape)
    expected_slice = torch.tensor([[-8.9253, -4.0305, -3.9306, -3.8774, -4.1873, -4.1280,
                                    0.9429, -4.1672, 0.9281, 0.0410, -3.4823]])
    self.assertTrue(torch.allclose(output, expected_slice, atol=1e-4))
def load_huggingface_weights_in_electra(model, config, pytorch_model_path):
    try:
        from transformers import ElectraForPreTraining
    except ImportError:
        raise ImportError("cannot import transformers, please install transformers first")
    hf_model = ElectraForPreTraining.from_pretrained(pytorch_model_path)

    # Embeddings.
    model.electra.embeddings.token_embeddings.weight = (
        hf_model.electra.embeddings.word_embeddings.weight)
    model.electra.embeddings.position_embeddings.weight = (
        hf_model.electra.embeddings.position_embeddings.weight)
    model.electra.embeddings.token_type_embeddings.weight = (
        hf_model.electra.embeddings.token_type_embeddings.weight)
    model.electra.embeddings.layer_norm.weight = hf_model.electra.embeddings.LayerNorm.weight
    model.electra.embeddings.layer_norm.bias = hf_model.electra.embeddings.LayerNorm.bias
    # When the embedding size differs from the hidden size (as in ELECTRA-small),
    # a projection sits between the embeddings and the encoder.
    if config.embedding_size != config.hidden_size:
        model.electra.embeddings_project.weight = hf_model.electra.embeddings_project.weight

    # Encoder layers.
    for layer_idx in range(config.num_hidden_layers):
        layer = model.electra.encoder.layers[layer_idx]
        hf_layer = hf_model.electra.encoder.layer[layer_idx]
        # Self-attention.
        layer.self.query.weight = hf_layer.attention.self.query.weight
        layer.self.query.bias = hf_layer.attention.self.query.bias
        layer.self.key.weight = hf_layer.attention.self.key.weight
        layer.self.key.bias = hf_layer.attention.self.key.bias
        layer.self.value.weight = hf_layer.attention.self.value.weight
        layer.self.value.bias = hf_layer.attention.self.value.bias
        layer.self.dense.weight = hf_layer.attention.output.dense.weight
        layer.self.dense.bias = hf_layer.attention.output.dense.bias
        # Feed-forward.
        layer.feed_forward.intermediate.weight = hf_layer.intermediate.dense.weight
        layer.feed_forward.intermediate.bias = hf_layer.intermediate.dense.bias
        layer.feed_forward.output.weight = hf_layer.output.dense.weight
        layer.feed_forward.output.bias = hf_layer.output.dense.bias
        # Residual layer norms.
        layer.add_norm[0].layer_norm.weight = hf_layer.attention.output.LayerNorm.weight
        layer.add_norm[0].layer_norm.bias = hf_layer.attention.output.LayerNorm.bias
        layer.add_norm[1].layer_norm.weight = hf_layer.output.LayerNorm.weight
        layer.add_norm[1].layer_norm.bias = hf_layer.output.LayerNorm.bias
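# A hedged usage sketch for the loader above: `MyElectraModel` is a
# hypothetical custom implementation with the attribute layout the loader
# targets; only the HuggingFace checkpoint name and ElectraConfig are real.
config = ElectraConfig.from_pretrained('google/electra-small-discriminator')
model = MyElectraModel(config)  # hypothetical model with layers/add_norm naming
load_huggingface_weights_in_electra(model, config, 'google/electra-small-discriminator')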
def detect_error_demo():
    tokenizer = ElectraTokenizer.from_pretrained(D_model_dir)
    discriminator = ElectraForPreTraining.from_pretrained(D_model_dir)
    # Deliberate typo: 新情 should be 心情 ("Today my mood is great").
    sentence = '今天新情很好'
    fake_tokens = tokenizer.tokenize(sentence)
    print(fake_tokens)
    fake_inputs = tokenizer.encode(sentence, return_tensors="pt")
    discriminator_outputs = discriminator(fake_inputs)
    # Map logits to 0/1: 1 marks a token the discriminator thinks was replaced.
    predictions = torch.round((torch.sign(discriminator_outputs[0]) + 1) / 2)
    print(list(zip(fake_tokens, predictions[0].tolist())))
    # `encode` adds [CLS]/[SEP], so drop the first and last predictions to
    # realign with the bare token list.
    print("fixed " + '*' * 42)
    print(list(zip(fake_tokens, predictions[0].tolist()[1:-1])))
def __init__(self,
             d_model_dir=os.path.join(pwd_path, "../data/electra_models/chinese_electra_base_discriminator_pytorch/"),
             g_model_dir=os.path.join(pwd_path, "../data/electra_models/chinese_electra_base_generator_pytorch/")):
    super(ElectraCorrector, self).__init__()
    self.name = 'electra_corrector'
    t1 = time.time()
    self.g_model = pipeline("fill-mask", model=g_model_dir, tokenizer=g_model_dir)
    self.d_model = ElectraForPreTraining.from_pretrained(d_model_dir)
    if self.g_model:
        self.mask = self.g_model.tokenizer.mask_token
        logger.debug('Loaded electra model: %s, spend: %.3f s.' % (g_model_dir, time.time() - t1))
def __init__(self,
             d_model_dir=config.electra_D_model_dir,
             g_model_dir=config.electra_G_model_dir,
             device=device_id):
    super(ElectraCorrector, self).__init__()
    self.name = 'electra_corrector'
    t1 = time.time()
    self.g_model = pipeline(
        "fill-mask",
        model=g_model_dir,
        tokenizer=g_model_dir,
        device=device,  # gpu device id
    )
    self.d_model = ElectraForPreTraining.from_pretrained(d_model_dir)
    if self.g_model:
        self.mask = self.g_model.tokenizer.mask_token
        logger.debug('Loaded electra model: %s, spend: %.3f s.' % (g_model_dir, time.time() - t1))
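# A minimal sketch of how the discriminator and fill-mask generator above can
# combine into a corrector. It uses only standard HuggingFace/torch APIs; the
# `correct` function itself is illustrative, not the library's actual method,
# and assumes `torch`, a tokenizer, and the two loaded models are in scope.
def correct(sentence, tokenizer, d_model, g_model, mask_token):
    # 1) Detection: the discriminator flags tokens that look replaced.
    inputs = tokenizer.encode(sentence, return_tensors="pt")
    logits = d_model(inputs)[0]
    flags = torch.round((torch.sign(logits) + 1) / 2)[0].tolist()[1:-1]  # strip [CLS]/[SEP]
    tokens = tokenizer.tokenize(sentence)
    # 2) Correction: mask each flagged token and let the generator fill it.
    for i, flagged in enumerate(flags):
        if flagged:
            masked = tokens[:i] + [mask_token] + tokens[i + 1:]
            candidates = g_model("".join(masked))  # fill-mask pipeline output
            tokens[i] = candidates[0]["token_str"]  # take the top candidate
    return "".join(tokens)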
def get_glue_learner(task, run_name=None, inference=False):
    is_wsc_trick = task == "wnli" and c.wsc_trick

    # Number of epochs
    if task in ["rte", "stsb"]:
        num_epochs = 10
    else:
        num_epochs = 3

    # Dataloaders
    dls = glue_dls[task]
    if isinstance(c.device, str):
        dls.to(torch.device(c.device))
    elif isinstance(c.device, list):
        dls.to(torch.device("cuda", c.device[0]))
    else:
        dls.to(torch.device("cuda:0"))

    # Load pretrained model
    if not c.pretrained_checkpoint:
        discriminator = ElectraForPreTraining.from_pretrained(
            f"google/electra-{c.size}-discriminator")
    else:
        discriminator = (ModelForDiscriminator(hparam)
                         if c.my_model else ElectraForPreTraining(electra_config))
        load_part_model(c.pretrained_ckp_path, discriminator, "discriminator")

    # Seeds & PyTorch benchmark
    torch.backends.cudnn.benchmark = True
    if c.seeds:
        # `i` indexes the current run; it is expected to be set by the caller's loop.
        dls[0].rng = random.Random(c.seeds[i])  # for fastai dataloader
        random.seed(c.seeds[i])
        np.random.seed(c.seeds[i])
        torch.manual_seed(c.seeds[i])

    # Create finetuning model
    if is_wsc_trick:
        model = ELECTRAWSCTrickModel(discriminator, hf_tokenizer.pad_token_id)
    else:
        model = SentencePredictor(discriminator.electra, electra_config.hidden_size,
                                  num_class=NUM_CLASS[task])

    # Discriminative learning rates
    splitter = partial(hf_electra_param_splitter, wsc_trick=is_wsc_trick)
    layer_lrs = get_layer_lrs(
        lr=c.lr,
        decay_rate=c.layer_lr_decay,
        num_hidden_layers=electra_config.num_hidden_layers,
    )

    # Optimizer
    if c.adam_bias_correction:
        opt_func = partial(Adam, eps=1e-6, mom=0.9, sqr_mom=0.999, wd=c.weight_decay)
    else:
        opt_func = partial(Adam_no_bias_correction, eps=1e-6, mom=0.9,
                           sqr_mom=0.999, wd=c.weight_decay)

    # Learner
    learn = Learner(
        dls,
        model,
        loss_func=LOSS_FUNC[task],
        opt_func=opt_func,
        metrics=METRICS[task],
        splitter=splitter if not inference else trainable_params,
        lr=layer_lrs if not inference else defaults.lr,
        path="./checkpoints/glue",
        model_dir=c.group_name,
    )

    # Multi gpu
    if isinstance(c.device, list) or c.device is None:
        learn.create_opt()
        learn.model = nn.DataParallel(learn.model, device_ids=c.device)

    # Mixed precision
    learn.to_native_fp16(init_scale=2.0**14)

    # Gradient clip
    learn.add_cb(GradientClipping(1.0))

    # Logging
    if run_name and not inference:
        if c.logger == "neptune":
            neptune.create_experiment(name=run_name,
                                      params={"task": task, **c, **hparam_update})
            learn.add_cb(LightNeptuneCallback(False))
        elif c.logger == "wandb":
            wandb_run = wandb.init(
                name=run_name,
                project="electra_glue",
                config={"task": task, **c, **hparam_update},
                reinit=True,
            )
            learn.add_cb(LightWandbCallback(wandb_run))

    # Learning rate schedule
    if c.schedule == "one_cycle":
        return learn, partial(learn.fit_one_cycle, n_epoch=num_epochs, lr_max=layer_lrs)
    elif c.schedule == "adjusted_one_cycle":
        return learn, partial(
            learn.fit_one_cycle,
            n_epoch=num_epochs,
            lr_max=layer_lrs,
            div=1e5,
            pct_start=0.1,
        )
    else:
        lr_sched_func = (linear_warmup_and_then_decay
                         if c.schedule == "separate_linear" else linear_warmup_and_decay)
        lr_schedule = ParamScheduler({
            "lr": partial(
                lr_sched_func,
                lr_max=np.array(layer_lrs),
                warmup_pct=0.1,
                total_steps=num_epochs * len(dls.train),
            )
        })
        return learn, partial(learn.fit, n_epoch=num_epochs, cbs=[lr_schedule])
def get_glue_learner(task, run_name=None, inference=False):
    # Number of epochs
    if task in ['rte', 'stsb']:
        num_epochs = 10
    else:
        num_epochs = 3

    # Dataloaders
    dls = glue_dls[task]
    if isinstance(c.device, str):
        dls.to(torch.device(c.device))
    elif isinstance(c.device, list):
        dls.to(torch.device('cuda', c.device[0]))
    else:
        dls.to(torch.device('cuda:0'))

    # Load pretrained model
    if not c.pretrained_checkpoint:
        discriminator = ElectraForPreTraining.from_pretrained(
            f"google/electra-{c.size}-discriminator")
    else:
        discriminator = (ModelForDiscriminator(hparam)
                         if c.my_model else ElectraForPreTraining(electra_config))
        load_part_model(c.pretrained_ckp_path, discriminator, 'discriminator')

    # Create finetuning model
    if task == 'wnli' and c.wsc_trick:
        model = ELECTRAWSCTrickModel(discriminator, hf_tokenizer.pad_token_id)
    else:
        model = SentencePredictor(discriminator.electra, electra_config.hidden_size,
                                  num_class=NUM_CLASS[task])

    # Discriminative learning rates
    splitter = partial(hf_electra_param_splitter,
                       wsc_trick=(task == 'wnli' and c.wsc_trick))
    layer_lrs = get_layer_lrs(lr=c.lr,
                              decay_rate=c.layer_lr_decay,
                              num_hidden_layers=electra_config.num_hidden_layers)

    # Optimizer
    if c.adam_bias_correction:
        opt_func = partial(Adam, eps=1e-6, mom=0.9, sqr_mom=0.999, wd=0.01)
    else:
        opt_func = partial(Adam_no_bias_correction, eps=1e-6, mom=0.9,
                           sqr_mom=0.999, wd=0.01)

    # Learner
    learn = Learner(dls, model,
                    loss_func=LOSS_FUNC[task],
                    opt_func=opt_func,
                    metrics=[eval(f'{metric}()') for metric in METRICS[task]],
                    splitter=splitter if not inference else trainable_params,
                    lr=layer_lrs if not inference else defaults.lr,
                    path='./checkpoints',
                    model_dir='glue')

    # Multi gpu
    if isinstance(c.device, list) or c.device is None:
        learn.model = nn.DataParallel(learn.model, device_ids=c.device)

    # Gradient clip
    learn.add_cb(GradientClipping(1.0))

    # Logging
    if run_name and not inference:
        neptune.create_experiment(name=run_name, params={'task': task, **c, **hparam_update})
        learn.add_cb(SimplerNeptuneCallback(False))

    # Learning rate schedule
    if c.schedule == 'one_cycle':
        return learn, partial(learn.fit_one_cycle, n_epoch=num_epochs, lr_max=layer_lrs)
    elif c.schedule == 'adjusted_one_cycle':
        return learn, partial(learn.fit_one_cycle, n_epoch=num_epochs, lr_max=layer_lrs,
                              div=1e5, pct_start=0.1)
    else:
        lr_sched_func = (linear_warmup_and_then_decay
                         if c.schedule == 'separate_linear' else linear_warmup_and_decay)
        lr_schedule = ParamScheduler({'lr': partial(lr_sched_func,
                                                    lr_max=np.array(layer_lrs),
                                                    warmup_pct=0.1,
                                                    total_steps=num_epochs * len(dls.train))})
        return learn, partial(learn.fit, n_epoch=num_epochs, cbs=[lr_schedule])
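# A hedged usage sketch for either variant of get_glue_learner above: the task
# and run names are illustrative, and `fit_fn` is whatever partial the
# function returned for the configured schedule.
learn, fit_fn = get_glue_learner('cola', run_name='electra_small_cola')
fit_fn()  # trains for the task's number of epochs with the chosen LR schedule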
def __init__(
    self,
    model_type,
    model_name,
    generator_name=None,
    discriminator_name=None,
    train_files=None,
    args=None,
    use_cuda=True,
    cuda_device=-1,
    **kwargs,
):
    """
    Initializes a LanguageModelingModel.

    Args:
        model_type: The type of model (gpt2, openai-gpt, bert, roberta, distilbert, camembert)
        model_name: Default Transformer model name or path to a directory containing
            Transformer model file (pytorch_model.bin).
        generator_name (optional): A pretrained model name or path to a directory
            containing an ELECTRA generator model.
        discriminator_name (optional): A pretrained model name or path to a directory
            containing an ELECTRA discriminator model.
        args (optional): Default args will be used if this parameter is not provided.
            If provided, it should be a dict containing the args that should be changed
            in the default args.
        train_files (optional): List of files to be used when training the tokenizer.
        use_cuda (optional): Use GPU if available. Setting to False will force model
            to use CPU only.
        cuda_device (optional): Specific GPU that should be used. Will use the first
            available GPU by default.
        **kwargs (optional): For providing proxies, force_download, resume_download,
            cache_dir and other options specific to the 'from_pretrained' implementation
            where this will be supplied.
    """  # noqa: ignore flake8

    self.args = self._load_model_args(model_name)

    if isinstance(args, dict):
        self.args.update_from_dict(args)
    elif isinstance(args, LanguageModelingArgs):
        self.args = args

    if "sweep_config" in kwargs:
        sweep_config = kwargs.pop("sweep_config")
        sweep_values = {key: value["value"] for key, value in sweep_config.as_dict().items()
                        if key != "_wandb"}
        self.args.update_from_dict(sweep_values)

    if self.args.manual_seed:
        random.seed(self.args.manual_seed)
        np.random.seed(self.args.manual_seed)
        torch.manual_seed(self.args.manual_seed)
        if self.args.n_gpu > 0:
            torch.cuda.manual_seed_all(self.args.manual_seed)

    if self.args.local_rank != -1:
        logger.info(f"local_rank: {self.args.local_rank}")
        torch.distributed.init_process_group(backend="nccl")
        cuda_device = self.args.local_rank

    if use_cuda:
        if torch.cuda.is_available():
            if cuda_device == -1:
                self.device = torch.device("cuda")
            else:
                self.device = torch.device(f"cuda:{cuda_device}")
        else:
            raise ValueError(
                "'use_cuda' set to True when cuda is unavailable."
                " Make sure CUDA is available or set use_cuda=False."
            )
    else:
        self.device = "cpu"

    self.results = {}

    if not use_cuda:
        self.args.fp16 = False

    self.args.model_name = model_name
    self.args.model_type = model_type

    config_class, model_class, tokenizer_class = MODEL_CLASSES[model_type]
    self.tokenizer_class = tokenizer_class
    new_tokenizer = False

    if self.args.tokenizer_name:
        self.tokenizer = tokenizer_class.from_pretrained(self.args.tokenizer_name,
                                                         cache_dir=self.args.cache_dir)
    elif self.args.model_name:
        if self.args.model_name == "electra":
            self.tokenizer = tokenizer_class.from_pretrained(
                generator_name, cache_dir=self.args.cache_dir, **kwargs)
            self.args.tokenizer_name = self.args.model_name
        else:
            self.tokenizer = tokenizer_class.from_pretrained(
                model_name, cache_dir=self.args.cache_dir, **kwargs)
            self.args.tokenizer_name = self.args.model_name
    else:
        if not train_files:
            raise ValueError(
                "model_name and tokenizer_name are not specified."
                " You must specify train_files to train a Tokenizer."
            )
        else:
            self.train_tokenizer(train_files)
            new_tokenizer = True

    if self.args.config_name:
        self.config = config_class.from_pretrained(self.args.config_name,
                                                   cache_dir=self.args.cache_dir)
    elif self.args.model_name and self.args.model_name != "electra":
        self.config = config_class.from_pretrained(model_name,
                                                   cache_dir=self.args.cache_dir, **kwargs)
    else:
        self.config = config_class(**self.args.config, **kwargs)

    if self.args.vocab_size:
        self.config.vocab_size = self.args.vocab_size
    if new_tokenizer:
        self.config.vocab_size = len(self.tokenizer)

    if self.args.model_type == "electra":
        if generator_name:
            self.generator_config = ElectraConfig.from_pretrained(generator_name)
        elif self.args.model_name:
            self.generator_config = ElectraConfig.from_pretrained(
                os.path.join(self.args.model_name, "generator_config"), **kwargs)
        else:
            self.generator_config = ElectraConfig(**self.args.generator_config, **kwargs)
            if new_tokenizer:
                self.generator_config.vocab_size = len(self.tokenizer)

        if discriminator_name:
            self.discriminator_config = ElectraConfig.from_pretrained(discriminator_name)
        elif self.args.model_name:
            self.discriminator_config = ElectraConfig.from_pretrained(
                os.path.join(self.args.model_name, "discriminator_config"), **kwargs)
        else:
            self.discriminator_config = ElectraConfig(**self.args.discriminator_config, **kwargs)
            if new_tokenizer:
                self.discriminator_config.vocab_size = len(self.tokenizer)

    if self.args.block_size <= 0:
        self.args.block_size = min(self.args.max_seq_length, self.tokenizer.max_len)
    else:
        self.args.block_size = min(self.args.block_size, self.tokenizer.max_len,
                                   self.args.max_seq_length)

    if self.args.model_name:
        if self.args.model_type == "electra":
            if self.args.model_name == "electra":
                generator_model = ElectraForMaskedLM.from_pretrained(generator_name)
                discriminator_model = ElectraForPreTraining.from_pretrained(discriminator_name)
                self.model = ElectraForLanguageModelingModel(
                    config=self.config,
                    generator_model=generator_model,
                    discriminator_model=discriminator_model,
                    generator_config=self.generator_config,
                    discriminator_config=self.discriminator_config,
                    tie_generator_and_discriminator_embeddings=self.args.tie_generator_and_discriminator_embeddings,
                )
                model_to_resize = (
                    self.model.generator_model.module
                    if hasattr(self.model.generator_model, "module")
                    else self.model.generator_model
                )
                model_to_resize.resize_token_embeddings(len(self.tokenizer))
                model_to_resize = (
                    self.model.discriminator_model.module
                    if hasattr(self.model.discriminator_model, "module")
                    else self.model.discriminator_model
                )
                model_to_resize.resize_token_embeddings(len(self.tokenizer))
                self.model.generator_model = generator_model
                self.model.discriminator_model = discriminator_model
            else:
                self.model = model_class.from_pretrained(
                    model_name,
                    config=self.config,
                    cache_dir=self.args.cache_dir,
                    generator_config=self.generator_config,
                    discriminator_config=self.discriminator_config,
                    **kwargs,
                )
                self.model.load_state_dict(
                    torch.load(os.path.join(self.args.model_name, "pytorch_model.bin")))
        else:
            self.model = model_class.from_pretrained(
                model_name,
                config=self.config,
                cache_dir=self.args.cache_dir,
                **kwargs,
            )
    else:
        logger.info(" Training language model from scratch")
        if self.args.model_type == "electra":
            generator_model = ElectraForMaskedLM(config=self.generator_config)
            discriminator_model = ElectraForPreTraining(config=self.discriminator_config)
            self.model = ElectraForLanguageModelingModel(
                config=self.config,
                generator_model=generator_model,
                discriminator_model=discriminator_model,
                generator_config=self.generator_config,
                discriminator_config=self.discriminator_config,
                tie_generator_and_discriminator_embeddings=self.args.tie_generator_and_discriminator_embeddings,
            )
            model_to_resize = (
                self.model.generator_model.module
                if hasattr(self.model.generator_model, "module")
                else self.model.generator_model
            )
            model_to_resize.resize_token_embeddings(len(self.tokenizer))
            model_to_resize = (
                self.model.discriminator_model.module
                if hasattr(self.model.discriminator_model, "module")
                else self.model.discriminator_model
            )
            model_to_resize.resize_token_embeddings(len(self.tokenizer))
        else:
            self.model = model_class(config=self.config)
            model_to_resize = self.model.module if hasattr(self.model, "module") else self.model
            model_to_resize.resize_token_embeddings(len(self.tokenizer))

    if model_type in ["camembert", "xlmroberta"]:
        warnings.warn(
            f"use_multiprocessing automatically disabled as {model_type}"
            " fails when using multiprocessing for feature conversion."
        )
        self.args.use_multiprocessing = False

    if self.args.wandb_project and not wandb_available:
        warnings.warn("wandb_project specified but wandb is not available. Wandb disabled.")
        self.args.wandb_project = None
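# A hedged usage sketch for the constructor above: passing model_name=None
# takes the train-from-scratch branch, so a tokenizer is trained from
# train_files. The argument values and "train.txt" corpus are illustrative.
model = LanguageModelingModel(
    "electra",                   # model_type
    None,                        # model_name=None -> train from scratch
    args={"vocab_size": 30000},
    train_files=["train.txt"],   # also used to train the new tokenizer
    use_cuda=False,
)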
import os
import sys

import torch
from fio import read
from transformers import ElectraTokenizer, ElectraForPreTraining

"""
python ppl.py [hypothesis file]
"""

hyp_file = sys.argv[1]
assert os.path.exists(hyp_file)

device = torch.device("cuda:0")
tokenizer = ElectraTokenizer.from_pretrained('google/electra-small-discriminator')
model = ElectraForPreTraining.from_pretrained('google/electra-small-discriminator')
model.to(device)
model.eval()
sigmoid = torch.nn.Sigmoid()
pad_id = tokenizer.pad_token_id

# Each line holds a source sentence and its transferred version, tab-separated.
src_tsf_pairs = [line.split("\t")[:2] for line in read(hyp_file)]
src_tsf_pairs = [(tokenizer.encode(src), tokenizer.encode(tsf))
                 for src, tsf in src_tsf_pairs]


def format_samples(pairs):
    src_bat, tsf_bat = zip(*pairs)
    max_l_src = max(len(src) for src in src_bat)
    max_l_tsf = max(len(tsf) for tsf in tsf_bat)