Example 1
def convert_tf_checkpoint_to_pytorch_albert(tf_checkpoint_path, albert_config_file, pytorch_dump_path):
    # Initialise PyTorch model
    config = AlbertConfig.from_json_file(albert_config_file)
    print("Building PyTorch model from configuration: {}".format(str(config)))
    model = AlbertForPreTraining(config)
    # Load weights from tf checkpoint
    load_tf_weights_in_albert(model, config, tf_checkpoint_path)
    # Save the PyTorch model
    print("Save PyTorch model to {}".format(pytorch_dump_path))
    torch.save(model.state_dict(), pytorch_dump_path)
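As a usage note, here is a minimal sketch of how this conversion helper might be wired to a command line; the flag names and paths below are illustrative assumptions, not part of the original example.

import argparse

if __name__ == "__main__":
    # Hypothetical CLI wrapper around the conversion function above (sketch only).
    parser = argparse.ArgumentParser()
    parser.add_argument("--tf_checkpoint_path", type=str, required=True,
                        help="Path to the TensorFlow ALBERT checkpoint.")
    parser.add_argument("--albert_config_file", type=str, required=True,
                        help="Path to the ALBERT config JSON file.")
    parser.add_argument("--pytorch_dump_path", type=str, required=True,
                        help="Where to write the converted PyTorch weights.")
    args = parser.parse_args()
    convert_tf_checkpoint_to_pytorch_albert(args.tf_checkpoint_path,
                                            args.albert_config_file,
                                            args.pytorch_dump_path)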
Example 2
    def __init__(self, coordinator_args: CoordinatorArguments,
                 collab_optimizer_args: CollaborativeOptimizerArguments,
                 averager_args: AveragerArguments, dht: hivemind.DHT):
        self.save_checkpoint_step_interval = coordinator_args.save_checkpoint_step_interval
        self.repo_path = coordinator_args.repo_path
        self.upload_interval = coordinator_args.upload_interval
        self.previous_step = -1

        config = AlbertConfig.from_pretrained(
            coordinator_args.model_config_path)
        self.model = AlbertForPreTraining(config)

        no_decay = ["bias", "LayerNorm.weight"]
        optimizer_grouped_parameters = [
            {
                "params": [
                    p for n, p in self.model.named_parameters()
                    if not any(nd in n for nd in no_decay)
                ],
                "weight_decay": 0.01,
            },
            {
                "params": [
                    p for n, p in self.model.named_parameters()
                    if any(nd in n for nd in no_decay)
                ],
                "weight_decay": 0.0,
            },
        ]

        opt = Lamb(
            optimizer_grouped_parameters,
            lr=0.00176,
            weight_decay=0.01,
            clamp_value=10000.0,
            debias=True,
        )

        adjusted_target_batch_size = collab_optimizer_args.target_batch_size - collab_optimizer_args.batch_size_lead

        self.collaborative_optimizer = hivemind.CollaborativeOptimizer(
            opt=opt,
            dht=dht,
            prefix=experiment_prefix,
            compression_type=hivemind.utils.CompressionType.Value(
                collab_optimizer_args.compression),
            throughput=collab_optimizer_args.bandwidth,
            target_batch_size=adjusted_target_batch_size,
            client_mode=collab_optimizer_args.client_mode,
            verbose=True,
            start=True,
            **asdict(averager_args))
        self.previous_timestamp = time.time()
Example 3
def get_model(training_args, config, tokenizer):
    # Find latest checkpoint in output_dir
    output_dir = Path(training_args.output_dir)
    logger.info(
        f'Checkpoint dir {output_dir}, contents {list(output_dir.glob("checkpoint*"))}'
    )
    latest_checkpoint_dir = max(output_dir.glob('checkpoint*'),
                                default=None,
                                key=os.path.getctime)

    if latest_checkpoint_dir is not None:
        logger.info(f'Loading model from {latest_checkpoint_dir}')
        model = AlbertForPreTraining.from_pretrained(latest_checkpoint_dir)
    else:
        logger.info(f'Training from scratch')
        model = AlbertForPreTraining(config)
        model.resize_token_embeddings(len(tokenizer))

    return model
Example 4
    def create_and_check_albert_for_pretraining(self, config, input_ids,
                                                token_type_ids, input_mask,
                                                sequence_labels,
                                                token_labels,
                                                choice_labels):
        model = AlbertForPreTraining(config=config)
        model.to(torch_device)
        model.eval()
        loss, prediction_scores, sop_scores = model(
            input_ids,
            attention_mask=input_mask,
            token_type_ids=token_type_ids,
            labels=token_labels,
            sentence_order_label=sequence_labels,
        )
        result = {
            "loss": loss,
            "prediction_scores": prediction_scores,
            "sop_scores": sop_scores,
        }
        self.parent.assertListEqual(
            list(result["prediction_scores"].size()),
            [self.batch_size, self.seq_length, self.vocab_size])
        self.parent.assertListEqual(list(result["sop_scores"].size()),
                                    [self.batch_size, config.num_labels])
        self.check_loss_output(result)
Example 5
    def test_inference_no_head_absolute_embedding(self):
        model = AlbertForPreTraining.from_pretrained("albert-base-v2")
        input_ids = torch.tensor(
            [[0, 345, 232, 328, 740, 140, 1695, 69, 6078, 1588, 2]])
        output = model(input_ids)[0]
        expected_shape = torch.Size((1, 11, 30000))
        self.assertEqual(output.shape, expected_shape)
        expected_slice = torch.tensor([[[4.6061, 0.7321, -1.7725],
                                        [4.6061, 0.7323, -1.7727],
                                        [4.6061, 0.7323, -1.7727]]])

        self.assertTrue(
            torch.allclose(output[:, :3, :3], expected_slice, atol=1e-4))
Example 6
    def create_and_check_for_pretraining(
        self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
    ):
        model = AlbertForPreTraining(config=config)
        model.to(torch_device)
        model.eval()
        result = model(
            input_ids,
            attention_mask=input_mask,
            token_type_ids=token_type_ids,
            labels=token_labels,
            sentence_order_label=sequence_labels,
        )
        self.parent.assertEqual(result.prediction_logits.shape, (self.batch_size, self.seq_length, self.vocab_size))
        self.parent.assertEqual(result.sop_logits.shape, (self.batch_size, config.num_labels))
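The two test snippets above exercise the model's pretraining heads: with a dict-style return, AlbertForPreTraining exposes prediction_logits of shape (batch_size, seq_len, vocab_size) and sop_logits of shape (batch_size, config.num_labels). A minimal standalone sketch of that forward pass, assuming the public albert-base-v2 checkpoint and tokenizer can be downloaded:

import torch
from transformers import AlbertTokenizerFast, AlbertForPreTraining

tokenizer = AlbertTokenizerFast.from_pretrained("albert-base-v2")
model = AlbertForPreTraining.from_pretrained("albert-base-v2")
model.eval()

inputs = tokenizer("The quick brown fox jumps over the lazy dog", return_tensors="pt")
with torch.no_grad():
    outputs = model(**inputs)

# Masked-LM scores over the 30000-token vocabulary and sentence-order-prediction scores.
print(outputs.prediction_logits.shape)  # (1, seq_len, 30000)
print(outputs.sop_logits.shape)         # (1, 2) with the default two-way SOP head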
Example 7
class CheckpointHandler:
    def __init__(self, coordinator_args: CoordinatorArguments,
                 collab_optimizer_args: CollaborativeOptimizerArguments,
                 averager_args: AveragerArguments, dht: hivemind.DHT):
        self.save_checkpoint_step_interval = coordinator_args.save_checkpoint_step_interval
        self.repo_path = coordinator_args.repo_path
        self.upload_interval = coordinator_args.upload_interval
        self.previous_step = -1

        config = AlbertConfig.from_pretrained(
            coordinator_args.model_config_path)
        self.model = AlbertForPreTraining(config)

        no_decay = ["bias", "LayerNorm.weight"]
        optimizer_grouped_parameters = [
            {
                "params": [
                    p for n, p in self.model.named_parameters()
                    if not any(nd in n for nd in no_decay)
                ],
                "weight_decay": 0.01,
            },
            {
                "params": [
                    p for n, p in self.model.named_parameters()
                    if any(nd in n for nd in no_decay)
                ],
                "weight_decay": 0.0,
            },
        ]

        opt = Lamb(
            optimizer_grouped_parameters,
            lr=0.00176,
            weight_decay=0.01,
            clamp_value=10000.0,
            debias=True,
        )

        adjusted_target_batch_size = collab_optimizer_args.target_batch_size - collab_optimizer_args.batch_size_lead

        self.collaborative_optimizer = hivemind.CollaborativeOptimizer(
            opt=opt,
            dht=dht,
            prefix=experiment_prefix,
            compression_type=hivemind.utils.CompressionType.Value(
                collab_optimizer_args.compression),
            throughput=collab_optimizer_args.bandwidth,
            target_batch_size=adjusted_target_batch_size,
            client_mode=collab_optimizer_args.client_mode,
            verbose=True,
            start=True,
            **asdict(averager_args))
        self.previous_timestamp = time.time()

    def is_time_to_save_state(self, cur_step):
        if self.save_checkpoint_step_interval is None:
            return False
        elif cur_step - self.previous_step >= self.save_checkpoint_step_interval:
            return True
        else:
            return False

    def save_state(self, cur_step):
        self.collaborative_optimizer.load_state_from_peers()
        self.previous_step = cur_step

    def is_time_to_upload(self):
        if self.repo_path is None:
            return False
        elif time.time() - self.previous_timestamp >= self.upload_interval:
            return True
        else:
            return False

    def upload_checkpoint(self, current_loss):
        self.model.save_pretrained(self.repo_path)
        torch.save(self.collaborative_optimizer.opt.state_dict(),
                   f"{self.repo_path}/optimizer_state.pt")
        self.previous_timestamp = time.time()
        try:
            subprocess.run("git add --all",
                           shell=True,
                           check=True,
                           cwd=self.repo_path)
            current_step = self.collaborative_optimizer.collaboration_state.optimizer_step
            subprocess.run(
                f"git commit -m 'Step {current_step}, loss {current_loss:.3f}'",
                shell=True,
                check=True,
                cwd=self.repo_path)
            subprocess.run("git push",
                           shell=True,
                           check=True,
                           cwd=self.repo_path)
        except subprocess.CalledProcessError as e:
            logger.warning("Error while uploading model:", e.output)
Example 8
def main():
    parser = HfArgumentParser((AlbertTrainingArguments, DatasetArguments, CollaborationArguments))
    training_args, dataset_args, collaboration_args = parser.parse_args_into_dataclasses()

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO if is_main_process(training_args.local_rank) else logging.WARN,
    )

    # Log a short summary on each process:
    logger.warning(
        f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, "
        + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}"
    )
    # Set the Transformers logger verbosity to INFO (on the main process only):
    if is_main_process(training_args.local_rank):
        transformers.utils.logging.set_verbosity_info()
        transformers.utils.logging.enable_default_handler()
        transformers.utils.logging.enable_explicit_format()
    logger.info("Training/evaluation parameters %s", training_args)

    # Set seed before initializing model.
    set_seed(training_args.seed)

    config = AlbertConfig.from_pretrained(dataset_args.config_path, cache_dir=dataset_args.cache_dir)

    tokenizer = AlbertTokenizerFast.from_pretrained(dataset_args.tokenizer_path, cache_dir=dataset_args.cache_dir)

    # find latest checkpoint in output_dir
    output_dir = Path(training_args.output_dir)
    logger.info(f'Checkpoint dir {output_dir}, contents {list(output_dir.glob("checkpoint*"))}')
    latest_checkpoint_dir = max(output_dir.glob('checkpoint*'), default=None, key=os.path.getctime)

    if latest_checkpoint_dir is not None:
        logger.info(f'Loading model from {latest_checkpoint_dir}')
        model = AlbertForPreTraining.from_pretrained(latest_checkpoint_dir)
    else:
        logger.info(f'Training from scratch')
        model = AlbertForPreTraining(config)
        model.resize_token_embeddings(len(tokenizer))

    tokenized_dataset_path = Path(dataset_args.dataset_path)

    tokenized_datasets = load_from_disk(tokenized_dataset_path)

    # Data collator
    # This one will take care of randomly masking the tokens.
    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer)

    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
            "weight_decay": training_args.weight_decay,
        },
        {
            "params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
            "weight_decay": 0.0,
        },
    ]

    optimizer = FusedLAMB(
        optimizer_grouped_parameters,
        lr=training_args.learning_rate,
        betas=(training_args.adam_beta1, training_args.adam_beta2),
        eps=training_args.adam_epsilon,
    )

    lr_scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=training_args.warmup_steps, num_training_steps=training_args.max_steps
    )

    trainer = CollaborativeTrainer(
        model=model, args=training_args, collaboration_args=collaboration_args,
        train_dataset=tokenized_datasets["train"] if training_args.do_train else None,
        eval_dataset=tokenized_datasets["validation"] if training_args.do_eval else None,
        tokenizer=tokenizer,
        data_collator=data_collator,
        optimizers=(optimizer, lr_scheduler)
    )

    # Training
    if training_args.do_train:
        trainer.train(model_path=latest_checkpoint_dir)
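The bias/LayerNorm.weight grouping used here also appears verbatim in several of the other examples; as a side note, that recurring pattern can be factored into a small helper. This is only a sketch, and make_grouped_parameters is an illustrative name rather than an existing API.

def make_grouped_parameters(model, weight_decay, no_decay=("bias", "LayerNorm.weight")):
    # Apply weight decay to everything except biases and LayerNorm weights.
    decay_params = [p for n, p in model.named_parameters()
                    if not any(nd in n for nd in no_decay)]
    no_decay_params = [p for n, p in model.named_parameters()
                       if any(nd in n for nd in no_decay)]
    return [
        {"params": decay_params, "weight_decay": weight_decay},
        {"params": no_decay_params, "weight_decay": 0.0},
    ]

# e.g. optimizer_grouped_parameters = make_grouped_parameters(model, training_args.weight_decay)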
Example 9
def main():
    # my dice shows 777 only. period.
    random.seed(EXPCONF.seed)
    np.random.seed(EXPCONF.seed)
    torch.manual_seed(EXPCONF.seed)
    torch.cuda.manual_seed_all(EXPCONF.seed)
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    trainloader, vocab, _trainds = get_loader(EXPCONF, getdev=False)
    devloader, _, _devds = get_loader(EXPCONF, getdev=True)

    assert len(trainloader) > 0, f"trainloader is empty!"
    assert len(devloader) > 0, f"devloader is empty!"

    # this is disgraceful.... but just specify things below
    albertconf = AlbertConfig.from_pretrained(
        f'albert-{EXPCONF.albert_scale}-v2')
    if EXPCONF.smaller:  # originally used 4H for the FFN, but due to memory issues use 1H for the FFN
        albertconf.hidden_size = EXPCONF.hidden_size
        albertconf.num_hidden_layers = EXPCONF.num_hidden_layers
        albertconf.num_attention_heads = EXPCONF.num_attention_heads

        albertconf.intermediate_size = albertconf.hidden_size

    albertconf.vocab_size = len(vocab.itos)
    albertconf.bos_token_id = vocab.stoi['BOS']
    albertconf.eos_token_id = vocab.stoi['EOS']
    albertconf.pad_token_id = vocab.stoi['PAD']
    albertconf.max_position_embeddings = 40

    model = AlbertForPreTraining(albertconf).to(device)

    # huggingface example is doing this for language modeling...
    # https://github.com/huggingface/transformers/blob/v2.6.0/examples/run_language_modeling.py
    no_decay = ['bias', "LayerNorm.weight"]
    grouped_parameters = [
        {
            "params": [
                p for n, p in model.named_parameters()
                if not any(nd in n for nd in no_decay)
            ],
            "weight_decay": EXPCONF.weight_decay,
        },
        {
            "params": [
                p for n, p in model.named_parameters()
                if any(nd in n for nd in no_decay)
            ],
            "weight_decay": 0.0,
        },
    ]

    optimizer = AdamW(grouped_parameters,
                      lr=EXPCONF.lr)  # otherwise, use default
    getsch = get_cosine_schedule_with_warmup if EXPCONF.scheduler == 'cosine' else get_linear_schedule_with_warmup
    scheduler = getsch(optimizer, EXPCONF.warmups,
                       EXPCONF.numep * len(trainloader))

    global_step = 0
    L = len(trainloader)
    bsz = len(trainloader[0])

    for ep in tqdm(range(1, EXPCONF.numep + 1), desc="epoch progress"):
        lossep_mlm = 0
        lossep_pp = 0
        accep_pp = 0
        model.train()
        for i, (b, l, datasetids) in enumerate(
                tqdm(trainloader, desc="iterations progress"), 1):
            '''
            b.input_ids / token_type_ids / attention_mask .shape == (bsz, seqmaxlen,)
            l.shape == (bsz,)

            ## BERT-family models, when doing MLM together with NSP (or other similar sentence-level tasks),
            ## just use the masked input for their sentence-representation encoding, not the unmasked one.
            ## It could be considered a kind of dropout, but at first it looked quite irregular to me.

            ## --> referred to transformers/examples/run_language_modeling.py (v2.1.0)
            ## --> modeling_albert.py ( class AlbertModel.forward() )
            '''

            outputs = model(**b, sentence_order_label=l, return_dict=True)
            global_step += 1

            vsz = outputs.prediction_logits.shape[-1]

            lossmlm = F.cross_entropy(
                outputs.prediction_logits.view(-1, vsz).contiguous(),
                b['labels'].view(-1))
            losspp = F.cross_entropy(outputs.sop_logits, l)
            lossppval = losspp.item()
            acc = accuracy(outputs.sop_logits.clone().detach(), l)

            if EXPCONF.alpha_pp == 1 and not EXPCONF.alpha_warmup:
                outputs.loss.backward()
            else:
                del outputs.loss
                torch.cuda.empty_cache()

                losspp *= EXPCONF.alpha_pp

                if EXPCONF.alpha_warmup:
                    grow = min(global_step / EXPCONF.warmups, 1.0)
                    losspp *= grow

                loss = lossmlm + losspp
                loss.backward()

            wandb.log({
                'step': (i + ep * L) * bsz if EXPCONF.see_bsz_effect else global_step,
                'train_step/learning_rate': get_lr_from_optim(optimizer),
                'train_step/alpha_pp': EXPCONF.alpha_pp * (grow if EXPCONF.alpha_warmup else 1),
                'train_step/mlm_loss': lossmlm.item(),
                'train_step/pp_loss': lossppval,
                'train_step/pp_acc': acc,
            })

            optimizer.step()
            scheduler.step()
            model.zero_grad()

            lossep_mlm += lossmlm.item()
            lossep_pp += lossppval
            accep_pp += acc

        lossep_mlm /= L
        lossep_pp /= L
        accep_pp /= L

        wandb.log({
            'step': ep,
            'train_ep/mlm_loss': lossep_mlm,
            'train_ep/pp_loss': lossep_pp,
            'train_ep/pp_acc': accep_pp,
        })
        print(f"ep:{ep}: losspp = {lossep_pp}, lossmlm={lossep_mlm}")
        devmlm_loss, devpp_loss, devpp_acc = evaldev(EXPCONF, model, devloader,
                                                     ep)
        if devpp_acc > EXPCONF.savethld:
            savemodel(EXPCONF,
                      model,
                      vocab,
                      ep,
                      mlm=devmlm_loss,
                      pp=devpp_loss,
                      acc=devpp_acc)
    return None
Example 10
def main():
    # my dice shows 777 only. period.
    random.seed(EXPCONF.seed)
    np.random.seed(EXPCONF.seed)
    torch.manual_seed(EXPCONF.seed)
    torch.cuda.manual_seed_all(EXPCONF.seed)
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    tempconf = EXPCONF.copy()
    tempconf.datamode = 'test'

    testloader, ___, _____ = get_loader(tempconf)
    trainloader, __, _trainds = get_loader(EXPCONF, getdev=False)
    devloader, _, _devds = get_loader(EXPCONF, getdev=True)

    assert len(trainloader) > 0, f"trainloader is empty!"
    assert len(devloader) > 0, f"devloader is empty!"

    # this is disgraceful.... but just specify things below
    model_weight, vocab, trained_condition = loadmodel_info(EXPCONF)

    albertconf = retrieve_conf(trained_condition, vocab)
    albert = AlbertForPreTraining(albertconf)
    albert.load_state_dict(model_weight)
    albert = albert.to(device)

    global_step = 0
    L = len(trainloader)
    bsz = len(trainloader[0])

    if not EXPCONF.infer_now:
        albert = albert.albert
        albert.eval()  # freeze

        cls = MLP(EXPCONF, albertconf.hidden_size, 2).to(device)
        cls.train()
        for p in cls.parameters():
            if p.dim() > 1:
                nn.init.xavier_uniform_(p)

        # huggingface example is doing this for language modeling...
        # https://github.com/huggingface/transformers/blob/v2.6.0/examples/run_language_modeling.py
        optimizer = AdamW(cls.parameters(),
                          lr=EXPCONF.cls_lr)  # otherwise, use default
        getsch = get_cosine_schedule_with_warmup if EXPCONF.cls_sch == 'cosine' else get_linear_schedule_with_warmup
        scheduler = getsch(optimizer, EXPCONF.cls_warmups,
                           EXPCONF.cls_numsteps)

        ## train cls only!
        while global_step < EXPCONF.cls_numsteps:
            lossep_pp = 0
            accep_pp = 0
            cls.train()
            for i, (b, l, datasetids) in enumerate(
                    tqdm(trainloader, desc="iterations progress"), 1):
                outputs = albert(**b, return_dict=True)
                global_step += 1

                logits = cls(outputs.pooler_output)
                losspp = F.cross_entropy(logits, l)

                lossppval = losspp.item()
                acc = accuracy(logits.clone().detach(), l)

                wandb.log({
                    'step': global_step,
                    'cls.train_step/learning_rate': get_lr_from_optim(optimizer),
                    'cls.train_step/pp_loss': lossppval,
                    'cls.train_step/pp_acc': acc,
                })

                optimizer.step()
                scheduler.step()
                cls.zero_grad()

                lossep_pp += lossppval
                accep_pp += acc
                if global_step % EXPCONF.logevery == 0:
                    lossep_pp /= L
                    accep_pp /= L

                    wandb.log({
                        'cls.train_ep/pp_loss': lossep_pp,
                        'cls.train_ep/pp_acc': accep_pp,
                    })
                    devpp_loss, devpp_acc = evaldev(EXPCONF, albert, cls,
                                                    devloader, global_step)
                    if devpp_acc > EXPCONF.savethld:
                        savemodel(EXPCONF,
                                  albert,
                                  cls,
                                  vocab,
                                  global_step,
                                  acc=devpp_acc)
                        write_sub(EXPCONF,
                                  albert,
                                  cls,
                                  global_step,
                                  acc=devpp_acc,
                                  testloader=testloader)

    else:  # infer now
        cls = None
        devpp_loss, devpp_acc = evaldev(EXPCONF,
                                        albert,
                                        cls,
                                        devloader,
                                        global_step,
                                        infernow=EXPCONF.infer_now)
        write_sub(EXPCONF,
                  albert,
                  cls,
                  global_step,
                  acc=devpp_acc,
                  testloader=testloader,
                  infernow=EXPCONF.infer_now)

    return None