Example #1
0
def train(index):
    """Run one training pass over the Wikipedia pre-training data.

    The function builds a ``PreTrainingDataset`` and its loader, creates a
    schedule with one entry per micro-batch (every dataset batch is repeated
    ``gradient_accumulation_steps`` times), runs forward/backward for each
    scheduled step, and applies an optimizer update each time
    ``gradient_accumulation_steps`` steps have been accumulated.

    It relies on module-level state that is defined elsewhere in the file
    (``model``, ``optimizer``, ``args``, ``logger``, ``job_config``,
    ``summary_writer``, ``device``, ``n_gpu``, ``fp16`` and friends) and
    increments the module-level ``global_step`` on every optimizer update.

    Args:
        index: Forwarded to ``PreTrainingDataset`` and to
            ``pretrain_validation``.

    Returns:
        The result of ``pretrain_validation(index)`` when ``max_seq_length``
        is 512, otherwise ``None``.
    """
    global global_step

    model.train()
    dataloaders = {}
    batchs_per_dataset = []
    dataset_id = 0

    # Pretraining datasets
    wiki_pretrain_dataset = PreTrainingDataset(
        tokenizer=tokenizer,
        folder=args.train_path,
        logger=logger,
        max_seq_length=max_seq_length,
        index=index,
        data_type=PretrainDataType.WIKIPEDIA,
        max_predictions_per_seq=max_predictions_per_seq,
        masked_lm_prob=masked_lm_prob)

    # The loader is consumed with next() below, so get_dataloader is expected
    # to hand back an iterator.
    dataloaders[dataset_id] = get_dataloader(wiki_pretrain_dataset)

    num_batches_in_dataset = get_effective_batch(len(wiki_pretrain_dataset))
    logger.info(
        'Wikpedia data file: Number of samples {}, number of batches required to process these samples: {}'
        .format(len(wiki_pretrain_dataset), num_batches_in_dataset))

    batchs_per_dataset.append(num_batches_in_dataset)
    dataset_id += 1

    logger.info("Training on Wikipedia dataset")

    # One entry per batch, tagged with the id of the dataset it comes from.
    dataset_batches = []
    for batch_dataset_id, batch_count in enumerate(batchs_per_dataset):
        dataset_batches.extend([batch_dataset_id] * batch_count)
    logger.info(
        'Number of batches to process *all* data samples in this epoch: {}'.
        format(len(dataset_batches)))
    # shuffle
    random.shuffle(dataset_batches)

    # We don't want the datasets to be interleaved micro-batch by micro-batch
    # if we have more than one dataset type; each accumulated batch should be
    # a contiguous chunk of a single data type, hence every entry is repeated
    # gradient_accumulation_steps times.
    dataset_picker = []
    for dataset_batch_type in dataset_batches:
        dataset_picker.extend([dataset_batch_type] *
                              gradient_accumulation_steps)

    logger.info(
        'Number of steps to process all batches in this epoch: {}'.format(
            len(dataset_picker)))

    # Counter of sequences in an "epoch"
    sequences_counter = 0
    global_step_loss = 0

    for step, dataset_type in enumerate(dataset_picker):
        # Only the fetch is guarded: a StopIteration raised anywhere else
        # (model, optimizer) is a real error and must not be swallowed.
        try:
            batch = next(dataloaders[dataset_type])
        except StopIteration:
            # The loader ran dry before the schedule did; skip this slot.
            continue

        # NOTE(review): len(batch) counts the top-level elements of the
        # batch; if the batch is a tuple of tensors this is not the number
        # of sequences -- TODO confirm against get_dataloader.
        sequences_counter += len(batch)

        if n_gpu == 1:
            batch = tuple(t.to(device) for t in batch)  # Move to GPU

        if step > 1 and step % args.log_steps == 0:
            logger.info(
                "{} Number of sequences processed so far: {} (cumulative in {} steps)"
                .format(datetime.utcnow(), sequences_counter, step))

        # Calculate forward pass
        loss = model.network(batch)

        if n_gpu > 1:
            # this is to average loss for multi-gpu. In DistributedDataParallel
            # setting, we get tuple of losses from all processes
            loss = loss.mean()

        if gradient_accumulation_steps > 1:
            loss = loss / gradient_accumulation_steps

        is_update_step = (step + 1) % gradient_accumulation_steps == 0

        # Enabling optimized reduction: reduction only happens in backward
        # if enable_need_reduction() is called before it when using the
        # distributed module.
        if accumulate_gradients:
            if use_multigpu_with_single_device_per_process and is_update_step:
                model.network.enable_need_reduction()
            else:
                model.network.disable_need_reduction()

        if fp16:
            optimizer.backward(loss)
        else:
            loss.backward()

        global_step_loss += loss

        if not is_update_step:
            continue

        if fp16:
            # modify learning rate with special warm up BERT uses
            # if fp16 is False, BertAdam is used that handles this automatically
            lr_this_step = job_config.get_learning_rate() * warmup_linear_decay_exp(
                global_step,
                job_config.get_decay_rate(),
                job_config.get_decay_step(),
                job_config.get_total_training_steps(),
                job_config.get_warmup_proportion())
            for param_group in optimizer.param_groups:
                param_group['lr'] = lr_this_step

            # Record the LR against global_step on tensorboard
            if check_write_log():
                summary_writer.add_scalar('Train/lr', lr_this_step,
                                          global_step)
        else:
            lr_this_step = job_config.get_learning_rate()

        optimizer.step()
        optimizer.zero_grad()
        global_step += 1

        if check_write_log() and (global_step % args.log_steps == 0):
            # Builtin float() replaces the np.float alias, which NumPy has
            # removed; the converted values are the same.
            accumulated_loss = float(global_step_loss)
            current_lr = float(lr_this_step)
            logger.info("training_loss %f" % accumulated_loss)
            logger.info("lr_this_step %f" % current_lr)
            logger.info("loss over steps=%f, loss=%f" %
                        (global_step, accumulated_loss))
            logger.info("lr over steps=%f, lr=%f" %
                        (global_step, current_lr))
        global_step_loss = 0

    # Report the number of scheduled steps. The loop variable itself is the
    # last zero-based index (one short) and is unbound for an empty schedule.
    logger.info("Completed {} steps".format(len(dataset_picker)))
    logger.info("Completed processing {} sequences".format(sequences_counter))

    # Run Validation Loss
    if max_seq_length == 512:
        logger.info(f"TRAIN BATCH SIZE: {train_batch_size}")
        return pretrain_validation(index)
    return None
Example #2
0
def _reset_checkpoint_dir(path):
    """Remove *path* if it is an existing directory, then create it empty."""
    # rmtree raises FileNotFoundError on a missing directory, which is the
    # normal situation the first time a checkpoint is written.
    if os.path.isdir(path):
        shutil.rmtree(path)
    os.makedirs(path, exist_ok=True)


def train():
    """Train on the Midea dataset, validating and checkpointing every epoch.

    Iterates epochs ``start_epoch`` .. ``args.epochs - 1``. For every batch it
    runs forward/backward and applies an optimizer + scheduler update each
    time ``gradient_accumulation_steps`` batches have been accumulated. After
    each epoch it calls ``pretrain_validation`` and, when ``check_write_log()``
    is true, saves:

    * an epoch checkpoint under ``saved_model_path`` (and a copy of the
      encoder in ``args.best_cp_dir``, if set) when the validation loss
      improved by more than 1% or no usable loss is available;
    * the latest training state and encoder in ``args.latest_cp_dir``, if set.

    It relies on module-level state that is defined elsewhere in the file
    (``model``, ``optimizer``, ``scheduler``, ``args``, ``logger``,
    ``job_config``, ``summary_writer``, ``device``, ``n_gpu``, ``fp16`` and
    friends) and increments the module-level ``global_step`` on every
    optimizer update.

    Returns:
        None.
    """
    global global_step

    model.train()
    # Pretraining datasets
    batchs_per_dataset = []
    shuffle_numbers = 10

    midea_dataset = MideaDataset(
        tokenizer=tokenizer,
        folder=args.train_path,
        max_seq_length=max_seq_length,
        shuffle_numbers=shuffle_numbers,
        max_predictions_per_seq=max_predictions_per_seq,
        masked_lm_prob=masked_lm_prob)
    num_batches = get_effective_batch(len(midea_dataset))
    # NOTE(review): the message still says "Wikpedia" although this is the
    # Midea dataset; left untouched in case log consumers match on it.
    logger.info('Wikpedia data file: Number of samples {}'.format(
        len(midea_dataset)))
    batchs_per_dataset.append(num_batches)

    logger.info("Training on Midea dataset")
    dataset_batches = []
    for dataset_id, batch_count in enumerate(batchs_per_dataset):
        dataset_batches.extend([dataset_id] * batch_count)
    # The schedule below is only used for the size printout, but the shuffle
    # is kept because it advances the global `random` state.
    random.shuffle(dataset_batches)

    # We don't want the datasets to be interleaved micro-batch by micro-batch
    # if we have more than one dataset type; each accumulated batch should be
    # a contiguous chunk of a single data type, hence every entry is repeated
    # gradient_accumulation_steps times.
    dataset_picker = []
    for dataset_batch_type in dataset_batches:
        dataset_picker.extend([dataset_batch_type] *
                              gradient_accumulation_steps)
    print("dataset_picker", len(dataset_picker))

    # Counter of sequences in an "epoch"
    sequences_counter = 0
    dataloaders = get_dataloader(midea_dataset)
    step = 0  # batches processed so far, across all epochs
    best_loss = None
    for index in range(start_epoch, args.epochs):
        logger.info(f"Training epoch: {index + 1}")
        # Re-assert training mode every epoch: validation runs at the end of
        # each epoch and presumably switches the model to eval mode (its body
        # is not visible here). The call is a no-op if it does not.
        model.train()
        for batch in tqdm(dataloaders):
            sequences_counter += batch[1].shape[0]

            batch = tuple(t.cuda(device, non_blocking=True) for t in batch)

            loss = model.network(batch)

            if n_gpu > 1:
                # this is to average loss for multi-gpu. In DistributedDataParallel
                # setting, we get tuple of losses from all processes
                loss = loss.mean()

            if gradient_accumulation_steps > 1:
                loss = loss / gradient_accumulation_steps

            is_update_step = (step + 1) % gradient_accumulation_steps == 0

            # Enabling optimized reduction: reduction only happens in
            # backward if enable_need_reduction() is called before it when
            # using the distributed module.
            if accumulate_gradients:
                if use_multigpu_with_single_device_per_process and is_update_step:
                    model.network.enable_need_reduction()
                else:
                    model.network.disable_need_reduction()
            if fp16:
                optimizer.backward(loss)
            else:
                loss.backward()

            if is_update_step:
                if fp16:
                    # modify learning rate with special warm up BERT uses
                    # if fp16 is False, BertAdam is used that handles this automatically
                    lr_this_step = job_config.get_learning_rate(
                    ) * warmup_linear_decay_exp(
                        global_step, job_config.get_decay_rate(),
                        job_config.get_decay_step(),
                        job_config.get_total_training_steps(),
                        job_config.get_warmup_proportion())
                    for param_group in optimizer.param_groups:
                        param_group['lr'] = lr_this_step

                    # Record the LR against global_step on tensorboard
                    if check_write_log():
                        summary_writer.add_scalar('Train/lr', lr_this_step,
                                                  global_step)

                optimizer.step()
                scheduler.step()
                optimizer.zero_grad()
                global_step += 1

            # Count every batch. Incrementing only inside the update branch
            # left `step` stuck at 0 whenever gradient_accumulation_steps > 1,
            # so the optimizer never stepped.
            step += 1

        logger.info("Completed {} steps".format(step))
        logger.info(
            "Completed processing {} sequences".format(sequences_counter))
        eval_loss = pretrain_validation(index)
        if check_write_log():
            # Save when there is no baseline yet, when validation produced no
            # loss, or when the loss improved by more than 1%.
            if best_loss is None or eval_loss is None or eval_loss < best_loss * 0.99:
                best_loss = eval_loss
                epoch_ckp_path = os.path.join(
                    saved_model_path,
                    "bert_encoder_epoch_{0:04d}.pt".format(index + 1))
                checkpoint_model(
                    os.path.join(
                        saved_model_path,
                        "training_state_checkpoint_{0:04d}.tar".format(index +
                                                                       1)),
                    model, optimizer, index, global_step)
                logger.info(
                    f"Saving checkpoint of the model from epoch {index + 1} at {epoch_ckp_path}"
                )
                model.save_bert(epoch_ckp_path)

                # save best checkpoint in separate directory
                if args.best_cp_dir:
                    best_ckp_path = os.path.join(
                        args.best_cp_dir,
                        "bert_encoder_epoch_{0:04d}.pt".format(index + 1))
                    _reset_checkpoint_dir(args.best_cp_dir)
                    model.save_bert(best_ckp_path)

            if args.latest_cp_dir:
                _reset_checkpoint_dir(args.latest_cp_dir)
                checkpoint_model(
                    os.path.join(
                        args.latest_cp_dir,
                        "training_state_checkpoint_{0:04d}.tar".format(index +
                                                                       1)),
                    model, optimizer, index, global_step)
                latest_ckp_path = os.path.join(
                    args.latest_cp_dir,
                    "bert_encoder_epoch_{0:04d}.pt".format(index + 1))
                model.save_bert(latest_ckp_path)