def convert_tf_checkpoint_to_pytorch_albert(tf_checkpoint_path, albert_config_file, pytorch_dump_path):
    # Initialise PyTorch model
    config = AlbertConfig.from_json_file(albert_config_file)
    print("Building PyTorch model from configuration: {}".format(str(config)))
    model = AlbertForPreTraining(config)

    # Load weights from tf checkpoint
    load_tf_weights_in_albert(model, config, tf_checkpoint_path)

    # Save pytorch-model
    print("Save PyTorch model to {}".format(pytorch_dump_path))
    torch.save(model.state_dict(), pytorch_dump_path)
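# Usage sketch (not part of the original snippet; the flag names below are illustrative
# assumptions): the converter above could be driven by a small CLI like this.
if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("--tf_checkpoint_path", required=True,
                        help="Path to the TensorFlow ALBERT checkpoint.")
    parser.add_argument("--albert_config_file", required=True,
                        help="JSON file holding the AlbertConfig of the pretrained model.")
    parser.add_argument("--pytorch_dump_path", required=True,
                        help="Where to write the converted PyTorch state dict.")
    args = parser.parse_args()
    convert_tf_checkpoint_to_pytorch_albert(
        args.tf_checkpoint_path, args.albert_config_file, args.pytorch_dump_path
    )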
def __init__(self, coordinator_args: CoordinatorArguments,
             collab_optimizer_args: CollaborativeOptimizerArguments,
             averager_args: AveragerArguments, dht: hivemind.DHT):
    self.save_checkpoint_step_interval = coordinator_args.save_checkpoint_step_interval
    self.repo_path = coordinator_args.repo_path
    self.upload_interval = coordinator_args.upload_interval
    self.previous_step = -1

    config = AlbertConfig.from_pretrained(coordinator_args.model_config_path)
    self.model = AlbertForPreTraining(config)

    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [p for n, p in self.model.named_parameters()
                       if not any(nd in n for nd in no_decay)],
            "weight_decay": 0.01,
        },
        {
            "params": [p for n, p in self.model.named_parameters()
                       if any(nd in n for nd in no_decay)],
            "weight_decay": 0.0,
        },
    ]

    opt = Lamb(
        optimizer_grouped_parameters,
        lr=0.00176,
        weight_decay=0.01,
        clamp_value=10000.0,
        debias=True,
    )

    adjusted_target_batch_size = (collab_optimizer_args.target_batch_size
                                  - collab_optimizer_args.batch_size_lead)

    self.collaborative_optimizer = hivemind.CollaborativeOptimizer(
        opt=opt,
        dht=dht,
        prefix=experiment_prefix,
        compression_type=hivemind.utils.CompressionType.Value(collab_optimizer_args.compression),
        throughput=collab_optimizer_args.bandwidth,
        target_batch_size=adjusted_target_batch_size,
        client_mode=collab_optimizer_args.client_mode,
        verbose=True,
        start=True,
        **asdict(averager_args),
    )

    self.previous_timestamp = time.time()
def get_model(training_args, config, tokenizer):
    # Find the latest checkpoint in output_dir
    output_dir = Path(training_args.output_dir)
    logger.info(f'Checkpoint dir {output_dir}, contents {list(output_dir.glob("checkpoint*"))}')
    latest_checkpoint_dir = max(output_dir.glob('checkpoint*'), default=None, key=os.path.getctime)

    if latest_checkpoint_dir is not None:
        logger.info(f'Loading model from {latest_checkpoint_dir}')
        model = AlbertForPreTraining.from_pretrained(latest_checkpoint_dir)
    else:
        logger.info('Training from scratch')
        model = AlbertForPreTraining(config)
        model.resize_token_embeddings(len(tokenizer))

    return model
def create_and_check_albert_for_pretraining(self, config, input_ids, token_type_ids, input_mask,
                                            sequence_labels, token_labels, choice_labels):
    model = AlbertForPreTraining(config=config)
    model.to(torch_device)
    model.eval()
    loss, prediction_scores, sop_scores = model(
        input_ids,
        attention_mask=input_mask,
        token_type_ids=token_type_ids,
        labels=token_labels,
        sentence_order_label=sequence_labels,
    )
    result = {
        "loss": loss,
        "prediction_scores": prediction_scores,
        "sop_scores": sop_scores,
    }
    self.parent.assertListEqual(
        list(result["prediction_scores"].size()),
        [self.batch_size, self.seq_length, self.vocab_size],
    )
    self.parent.assertListEqual(
        list(result["sop_scores"].size()),
        [self.batch_size, config.num_labels],
    )
    self.check_loss_output(result)
def test_inference_no_head_absolute_embedding(self):
    model = AlbertForPreTraining.from_pretrained("albert-base-v2")
    input_ids = torch.tensor([[0, 345, 232, 328, 740, 140, 1695, 69, 6078, 1588, 2]])
    output = model(input_ids)[0]
    expected_shape = torch.Size((1, 11, 30000))
    self.assertEqual(output.shape, expected_shape)
    expected_slice = torch.tensor(
        [[[4.6061, 0.7321, -1.7725],
          [4.6061, 0.7323, -1.7727],
          [4.6061, 0.7323, -1.7727]]]
    )
    self.assertTrue(torch.allclose(output[:, :3, :3], expected_slice, atol=1e-4))
def create_and_check_for_pretraining(
    self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
):
    model = AlbertForPreTraining(config=config)
    model.to(torch_device)
    model.eval()
    result = model(
        input_ids,
        attention_mask=input_mask,
        token_type_ids=token_type_ids,
        labels=token_labels,
        sentence_order_label=sequence_labels,
    )
    self.parent.assertEqual(result.prediction_logits.shape, (self.batch_size, self.seq_length, self.vocab_size))
    self.parent.assertEqual(result.sop_logits.shape, (self.batch_size, config.num_labels))
class CheckpointHandler:
    def __init__(self, coordinator_args: CoordinatorArguments,
                 collab_optimizer_args: CollaborativeOptimizerArguments,
                 averager_args: AveragerArguments, dht: hivemind.DHT):
        self.save_checkpoint_step_interval = coordinator_args.save_checkpoint_step_interval
        self.repo_path = coordinator_args.repo_path
        self.upload_interval = coordinator_args.upload_interval
        self.previous_step = -1

        config = AlbertConfig.from_pretrained(coordinator_args.model_config_path)
        self.model = AlbertForPreTraining(config)

        no_decay = ["bias", "LayerNorm.weight"]
        optimizer_grouped_parameters = [
            {
                "params": [p for n, p in self.model.named_parameters()
                           if not any(nd in n for nd in no_decay)],
                "weight_decay": 0.01,
            },
            {
                "params": [p for n, p in self.model.named_parameters()
                           if any(nd in n for nd in no_decay)],
                "weight_decay": 0.0,
            },
        ]

        opt = Lamb(
            optimizer_grouped_parameters,
            lr=0.00176,
            weight_decay=0.01,
            clamp_value=10000.0,
            debias=True,
        )

        adjusted_target_batch_size = (collab_optimizer_args.target_batch_size
                                      - collab_optimizer_args.batch_size_lead)

        self.collaborative_optimizer = hivemind.CollaborativeOptimizer(
            opt=opt,
            dht=dht,
            prefix=experiment_prefix,
            compression_type=hivemind.utils.CompressionType.Value(collab_optimizer_args.compression),
            throughput=collab_optimizer_args.bandwidth,
            target_batch_size=adjusted_target_batch_size,
            client_mode=collab_optimizer_args.client_mode,
            verbose=True,
            start=True,
            **asdict(averager_args),
        )

        self.previous_timestamp = time.time()

    def is_time_to_save_state(self, cur_step):
        if self.save_checkpoint_step_interval is None:
            return False
        elif cur_step - self.previous_step >= self.save_checkpoint_step_interval:
            return True
        else:
            return False

    def save_state(self, cur_step):
        self.collaborative_optimizer.load_state_from_peers()
        self.previous_step = cur_step

    def is_time_to_upload(self):
        if self.repo_path is None:
            return False
        elif time.time() - self.previous_timestamp >= self.upload_interval:
            return True
        else:
            return False

    def upload_checkpoint(self, current_loss):
        self.model.save_pretrained(self.repo_path)
        torch.save(self.collaborative_optimizer.opt.state_dict(), f"{self.repo_path}/optimizer_state.pt")
        self.previous_timestamp = time.time()
        try:
            subprocess.run("git add --all", shell=True, check=True, cwd=self.repo_path)
            current_step = self.collaborative_optimizer.collaboration_state.optimizer_step
            subprocess.run(f"git commit -m 'Step {current_step}, loss {current_loss:.3f}'",
                           shell=True, check=True, cwd=self.repo_path)
            subprocess.run("git push", shell=True, check=True, cwd=self.repo_path)
        except subprocess.CalledProcessError as e:
            logger.warning(f"Error while uploading model: {e.output}")
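# Usage sketch (an assumption, not taken from the original file): a coordinator process
# could poll the handler roughly like this. `get_current_step()` and `get_current_loss()`
# are hypothetical stand-ins for however the real coordinator tracks training progress.
def run_checkpoint_loop(checkpoint_handler, poll_interval=60):
    while True:
        cur_step = get_current_step()  # hypothetical helper
        if checkpoint_handler.is_time_to_save_state(cur_step):
            checkpoint_handler.save_state(cur_step)
        if checkpoint_handler.is_time_to_upload():
            checkpoint_handler.upload_checkpoint(get_current_loss())  # hypothetical helper
        time.sleep(poll_interval)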
def main():
    parser = HfArgumentParser((AlbertTrainingArguments, DatasetArguments, CollaborationArguments))
    training_args, dataset_args, collaboration_args = parser.parse_args_into_dataclasses()

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO if is_main_process(training_args.local_rank) else logging.WARN,
    )

    # Log a short summary on each process:
    logger.warning(
        f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, "
        + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}"
    )
    # Set the verbosity of the Transformers logger to info (on main process only):
    if is_main_process(training_args.local_rank):
        transformers.utils.logging.set_verbosity_info()
        transformers.utils.logging.enable_default_handler()
        transformers.utils.logging.enable_explicit_format()
    logger.info("Training/evaluation parameters %s", training_args)

    # Set seed before initializing model.
    set_seed(training_args.seed)

    config = AlbertConfig.from_pretrained(dataset_args.config_path, cache_dir=dataset_args.cache_dir)
    tokenizer = AlbertTokenizerFast.from_pretrained(dataset_args.tokenizer_path, cache_dir=dataset_args.cache_dir)

    # Find the latest checkpoint in output_dir
    output_dir = Path(training_args.output_dir)
    logger.info(f'Checkpoint dir {output_dir}, contents {list(output_dir.glob("checkpoint*"))}')
    latest_checkpoint_dir = max(output_dir.glob('checkpoint*'), default=None, key=os.path.getctime)

    if latest_checkpoint_dir is not None:
        logger.info(f'Loading model from {latest_checkpoint_dir}')
        model = AlbertForPreTraining.from_pretrained(latest_checkpoint_dir)
    else:
        logger.info('Training from scratch')
        model = AlbertForPreTraining(config)
        model.resize_token_embeddings(len(tokenizer))

    tokenized_dataset_path = Path(dataset_args.dataset_path)
    tokenized_datasets = load_from_disk(tokenized_dataset_path)

    # Data collator
    # This one will take care of randomly masking the tokens.
    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer)

    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
            "weight_decay": training_args.weight_decay,
        },
        {
            "params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
            "weight_decay": 0.0,
        },
    ]

    optimizer = FusedLAMB(
        optimizer_grouped_parameters,
        lr=training_args.learning_rate,
        betas=(training_args.adam_beta1, training_args.adam_beta2),
        eps=training_args.adam_epsilon,
    )

    lr_scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=training_args.warmup_steps, num_training_steps=training_args.max_steps
    )

    trainer = CollaborativeTrainer(
        model=model,
        args=training_args,
        collaboration_args=collaboration_args,
        train_dataset=tokenized_datasets["train"] if training_args.do_train else None,
        eval_dataset=tokenized_datasets["validation"] if training_args.do_eval else None,
        tokenizer=tokenizer,
        data_collator=data_collator,
        optimizers=(optimizer, lr_scheduler),
    )

    # Training
    if training_args.do_train:
        trainer.train(model_path=latest_checkpoint_dir)
def main():
    # my dice shows 777 only. period.
    random.seed(EXPCONF.seed)
    np.random.seed(EXPCONF.seed)
    torch.manual_seed(EXPCONF.seed)
    torch.cuda.manual_seed_all(EXPCONF.seed)

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    trainloader, vocab, _trainds = get_loader(EXPCONF, getdev=False)
    devloader, _, _devds = get_loader(EXPCONF, getdev=True)
    assert len(trainloader) > 0, "trainloader is empty!"
    assert len(devloader) > 0, "devloader is empty!"

    # this is disgraceful.... but just specify things below
    albertconf = AlbertConfig.from_pretrained(f'albert-{EXPCONF.albert_scale}-v2')
    if EXPCONF.smaller:
        # originally 4H is used for the FFN, but to save memory we use 1H
        albertconf.hidden_size = EXPCONF.hidden_size
        albertconf.num_hidden_layers = EXPCONF.num_hidden_layers
        albertconf.num_attention_heads = EXPCONF.num_attention_heads
        albertconf.intermediate_size = albertconf.hidden_size
    albertconf.vocab_size = len(vocab.itos)
    albertconf.bos_token_id = vocab.stoi['BOS']
    albertconf.eos_token_id = vocab.stoi['EOS']
    albertconf.pad_token_id = vocab.stoi['PAD']
    albertconf.max_position_embeddings = 40

    model = AlbertForPreTraining(albertconf).to(device)

    # the huggingface example does this for language modeling:
    # https://github.com/huggingface/transformers/blob/v2.6.0/examples/run_language_modeling.py
    no_decay = ['bias', "LayerNorm.weight"]
    grouped_parameters = [
        {
            "params": [p for n, p in model.named_parameters()
                       if not any(nd in n for nd in no_decay)],
            "weight_decay": EXPCONF.weight_decay,
        },
        {
            "params": [p for n, p in model.named_parameters()
                       if any(nd in n for nd in no_decay)],
            "weight_decay": 0.0,
        },
    ]
    optimizer = AdamW(grouped_parameters, lr=EXPCONF.lr)  # otherwise, use default
    getsch = get_cosine_schedule_with_warmup if EXPCONF.scheduler == 'cosine' else get_linear_schedule_with_warmup
    scheduler = getsch(optimizer, EXPCONF.warmups, EXPCONF.numep * len(trainloader))

    global_step = 0
    L = len(trainloader)
    bsz = len(trainloader[0])

    for ep in tqdm(range(1, EXPCONF.numep + 1), desc="epoch progress"):
        lossep_mlm = 0
        lossep_pp = 0
        accep_pp = 0
        model.train()
        for i, (b, l, datasetids) in enumerate(tqdm(trainloader, desc="iterations progress"), 1):
            '''
            b.input_ids / token_type_ids / attention_mask .shape == (bsz, seqmaxlen)
            l.shape == (bsz,)

            ## BERT-family models, when they do MLM together with NSP (or a similar
            ## sentence-level task), use the masked input for the sentence representation
            ## encoding as well, not the unmasked one. It can be seen as a kind of dropout,
            ## but at first it looked quite irregular to me.
            ## --> see transformers/examples/run_language_modeling.py (v2.1.0)
            ## --> modeling_albert.py (class AlbertModel.forward())
            '''
            outputs = model(**b, sentence_order_label=l, return_dict=True)
            global_step += 1

            vsz = outputs.prediction_logits.shape[-1]
            lossmlm = F.cross_entropy(outputs.prediction_logits.view(-1, vsz).contiguous(),
                                      b['labels'].view(-1))
            losspp = F.cross_entropy(outputs.sop_logits, l)
            lossppval = losspp.item()
            acc = accuracy(outputs.sop_logits.clone().detach(), l)

            if EXPCONF.alpha_pp == 1 and not EXPCONF.alpha_warmup:
                outputs.loss.backward()
            else:
                del outputs.loss
                torch.cuda.empty_cache()
                losspp *= EXPCONF.alpha_pp
                if EXPCONF.alpha_warmup:
                    grow = min(global_step / EXPCONF.warmups, 1.0)
                    losspp *= grow
                loss = lossmlm + losspp
                loss.backward()

            wandb.log({
                'step': (i + ep * L) * bsz if EXPCONF.see_bsz_effect else global_step,
                'train_step/learning_rate': get_lr_from_optim(optimizer),
                'train_step/alpha_pp': EXPCONF.alpha_pp * (grow if EXPCONF.alpha_warmup else 1),
                'train_step/mlm_loss': lossmlm.item(),
                'train_step/pp_loss': lossppval,
                'train_step/pp_acc': acc,
            })

            optimizer.step()
            scheduler.step()
            model.zero_grad()

            lossep_mlm += lossmlm.item()
            lossep_pp += lossppval
            accep_pp += acc

        lossep_mlm /= L
        lossep_pp /= L
        accep_pp /= L
        wandb.log({
            'step': ep,
            'train_ep/mlm_loss': lossep_mlm,
            'train_ep/pp_loss': lossep_pp,
            'train_ep/pp_acc': accep_pp,
        })
        print(f"ep:{ep}: losspp = {lossep_pp}, lossmlm={lossep_mlm}")

        devmlm_loss, devpp_loss, devpp_acc = evaldev(EXPCONF, model, devloader, ep)
        if devpp_acc > EXPCONF.savethld:
            savemodel(EXPCONF, model, vocab, ep, mlm=devmlm_loss, pp=devpp_loss, acc=devpp_acc)

    return None
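# The training loop above relies on an `accuracy` helper that is not shown in this snippet.
# A minimal sketch of what it presumably computes (argmax match rate over the batch):
def accuracy(logits, labels):
    # logits: (batch, num_classes) scores, labels: (batch,) integer class ids
    preds = logits.argmax(dim=-1)
    return (preds == labels).float().mean().item()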
def main():
    # my dice shows 777 only. period.
    random.seed(EXPCONF.seed)
    np.random.seed(EXPCONF.seed)
    torch.manual_seed(EXPCONF.seed)
    torch.cuda.manual_seed_all(EXPCONF.seed)

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    tempconf = EXPCONF.copy()
    tempconf.datamode = 'test'
    testloader, ___, _____ = get_loader(tempconf)

    trainloader, __, _trainds = get_loader(EXPCONF, getdev=False)
    devloader, _, _devds = get_loader(EXPCONF, getdev=True)
    assert len(trainloader) > 0, "trainloader is empty!"
    assert len(devloader) > 0, "devloader is empty!"

    # this is disgraceful.... but just specify things below
    model_weight, vocab, trained_condition = loadmodel_info(EXPCONF)
    albertconf = retrieve_conf(trained_condition, vocab)
    albert = AlbertForPreTraining(albertconf)
    albert.load_state_dict(model_weight)
    albert = albert.to(device)

    global_step = 0
    L = len(trainloader)
    bsz = len(trainloader[0])

    if not EXPCONF.infer_now:
        albert = albert.albert
        albert.eval()  # freeze

        cls = MLP(EXPCONF, albertconf.hidden_size, 2).to(device)
        cls.train()
        for p in cls.parameters():
            if p.dim() > 1:
                nn.init.xavier_uniform_(p)

        # the huggingface example does this for language modeling:
        # https://github.com/huggingface/transformers/blob/v2.6.0/examples/run_language_modeling.py
        optimizer = AdamW(cls.parameters(), lr=EXPCONF.cls_lr)  # otherwise, use default
        getsch = get_cosine_schedule_with_warmup if EXPCONF.cls_sch == 'cosine' else get_linear_schedule_with_warmup
        scheduler = getsch(optimizer, EXPCONF.cls_warmups, EXPCONF.cls_numsteps)

        ## train cls only!
        while global_step < EXPCONF.cls_numsteps:
            lossep_pp = 0
            accep_pp = 0
            cls.train()
            for i, (b, l, datasetids) in enumerate(tqdm(trainloader, desc="iterations progress"), 1):
                outputs = albert(**b, return_dict=True)
                global_step += 1

                logits = cls(outputs.pooler_output)
                losspp = F.cross_entropy(logits, l)
                lossppval = losspp.item()
                acc = accuracy(logits.clone().detach(), l)
                losspp.backward()  # compute gradients for the classifier head before optimizer.step()

                wandb.log({
                    'step': global_step,
                    'cls.train_step/learning_rate': get_lr_from_optim(optimizer),
                    'cls.train_step/pp_loss': lossppval,
                    'cls.train_step/pp_acc': acc,
                })

                optimizer.step()
                scheduler.step()
                cls.zero_grad()

                lossep_pp += lossppval
                accep_pp += acc

                if global_step % EXPCONF.logevery == 0:
                    lossep_pp /= L
                    accep_pp /= L
                    wandb.log({
                        'cls.train_ep/pp_loss': lossep_pp,
                        'cls.train_ep/pp_acc': accep_pp,
                    })
                    devpp_loss, devpp_acc = evaldev(EXPCONF, albert, cls, devloader, global_step)
                    if devpp_acc > EXPCONF.savethld:
                        savemodel(EXPCONF, albert, cls, vocab, global_step, acc=devpp_acc)
                        write_sub(EXPCONF, albert, cls, global_step, acc=devpp_acc, testloader=testloader)
    else:  # infer now
        cls = None
        devpp_loss, devpp_acc = evaldev(EXPCONF, albert, cls, devloader, global_step,
                                        infernow=EXPCONF.infer_now)
        write_sub(EXPCONF, albert, cls, global_step, acc=devpp_acc, testloader=testloader,
                  infernow=EXPCONF.infer_now)

    return None