Example 1
def pretrain_validation(index):
    model.eval()
    dataset = PreTrainingDataset(
        tokenizer=tokenizer,
        folder=args.validation_path,
        logger=logger,
        max_seq_length=max_seq_length,
        index=index,
        data_type=PretrainDataType.VALIDATION,
        max_predictions_per_seq=max_predictions_per_seq,
        masked_lm_prob=masked_lm_prob)
    data_batches = get_dataloader(dataset, eval_set=True)
    eval_loss = 0
    nb_eval_steps = 0

    for batch in data_batches:
        batch = tuple(t.to(device) for t in batch)
        tmp_eval_loss = model.network(batch, log=False)
        # Reduce to collect the loss from all GPUs, then average over ranks
        dist.reduce(tmp_eval_loss, 0)
        tmp_eval_loss = tmp_eval_loss / dist.get_world_size()
        eval_loss += tmp_eval_loss.mean().item()
        nb_eval_steps += 1
    eval_loss = eval_loss / nb_eval_steps
    logger.info(f"Validation Loss for epoch {index + 1} is: {eval_loss}")
    if check_write_log():
        summary_writer.add_scalar('Validation/Loss', eval_loss, index + 1)
        logger.info("validation_loss %f" % float(eval_loss))
        logger.info("validation_loss over epochs: epoch=%d, val_loss=%f" %
                    (index + 1, float(eval_loss)))
    return eval_loss
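
This validation routine leans on helper functions defined elsewhere in the same script, most notably get_dataloader and check_write_log. Below is a minimal sketch of plausible implementations, assuming a standard torch.distributed setup; the DistributedSampler wiring, worker count, and use of train_batch_size are assumptions rather than the script's actual code.

import torch.distributed as dist
from torch.utils.data import DataLoader, DistributedSampler

def get_dataloader(dataset, eval_set=False):
    # Shard the dataset across ranks so every GPU sees a disjoint slice (assumed behavior).
    sampler = DistributedSampler(dataset, shuffle=not eval_set)
    loader = DataLoader(dataset,
                        batch_size=train_batch_size,  # module-level setting (assumed)
                        sampler=sampler,
                        num_workers=4,
                        pin_memory=True)
    # The training loop pulls batches with next(), so return an iterator.
    return iter(loader)

def check_write_log():
    # Only rank 0 writes TensorBoard summaries and aggregate log lines.
    return dist.get_rank() == 0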
Example 2
def train(index):
    model.train()
    dataloaders = {}
    i = 0
    global global_step
    datalengths = []
    batchs_per_dataset = []

    # Pretraining datasets
    wiki_pretrain_dataset = PreTrainingDataset(
        tokenizer=tokenizer,
        folder=args.train_path,
        logger=logger,
        max_seq_length=max_seq_length,
        index=index,
        data_type=PretrainDataType.WIKIPEDIA,
        max_predictions_per_seq=max_predictions_per_seq,
        masked_lm_prob=masked_lm_prob)

    datalengths.append(len(wiki_pretrain_dataset))
    dataloaders[i] = get_dataloader(wiki_pretrain_dataset)

    num_batches_in_dataset = get_effective_batch(len(wiki_pretrain_dataset))
    logger.info(
        'Wikipedia data file: Number of samples {}, number of batches required to process these samples: {}'
        .format(len(wiki_pretrain_dataset), num_batches_in_dataset))

    batchs_per_dataset.append(num_batches_in_dataset)
    i += 1

    logger.info("Training on Wikipedia dataset")

    total_length = sum(datalengths)

    dataset_batches = []
    for i, batch_count in enumerate(batchs_per_dataset):
        dataset_batches.extend([i] * batch_count)
    logger.info(
        'Number of batches to process *all* data samples in this epoch: {}'.
        format(len(dataset_batches)))
    # shuffle
    random.shuffle(dataset_batches)

    # When there is more than one dataset type, we don't want micro-batches within a
    # single gradient-accumulation window to alternate between datasets. Repeating each
    # entry gradient_accumulation_steps times keeps every accumulation window on one dataset.
    dataset_picker = []
    for dataset_batch_type in dataset_batches:
        dataset_picker.extend([dataset_batch_type] *
                              gradient_accumulation_steps)

    logger.info(
        'Number of steps to process all batches in this epoch: {}'.format(
            len(dataset_picker)))
    model.train()

    # Counter of sequences in an "epoch"
    sequences_counter = 0
    global_step_loss = 0

    for step, dataset_type in enumerate(dataset_picker):
        try:
            #logger.info('Step#: %d'%(step))
            batch = next(dataloaders[dataset_type])

            # Count sequences in this micro-batch (assumes batch is a tuple of
            # tensors whose first dimension is the batch dimension).
            sequences_counter += batch[0].size(0)

            if n_gpu == 1:
                batch = tuple(t.to(device) for t in batch)  # Move to GPU

            if step > 1 and step % args.log_steps == 0:
                logger.info(
                    "{} Number of sequences processed so far: {} (cumulative in {} steps)"
                    .format(datetime.utcnow(), sequences_counter, step))
            # Calculate forward pass
            loss = model.network(batch)

            if n_gpu > 1:
                # Average the loss for multi-GPU runs. In the DistributedDataParallel
                # setting we get a tuple of losses from all processes.
                loss = loss.mean()

            if gradient_accumulation_steps > 1:
                loss = loss / gradient_accumulation_steps


            # Enable optimized reduction: when using the distributed module, the
            # reduction only happens in backward() if this method is called beforehand.
            if accumulate_gradients:
                if use_multigpu_with_single_device_per_process and (
                        step + 1) % gradient_accumulation_steps == 0:
                    model.network.enable_need_reduction()
                else:
                    model.network.disable_need_reduction()
            if fp16:
                optimizer.backward(loss)
            else:
                loss.backward()

            global_step_loss += loss.item()  # accumulate as a Python float rather than a tensor
            if (step + 1) % gradient_accumulation_steps == 0:
                if fp16:
                    # Modify the learning rate with the special warmup schedule BERT uses.
                    # When fp16 is False, BertAdam handles this automatically.
                    lr_this_step = \
                        job_config.get_learning_rate() * warmup_linear_decay_exp(global_step,
                                                                                 job_config.get_decay_rate(),
                                                                                 job_config.get_decay_step(),
                                                                                 job_config.get_total_training_steps(),
                                                                                 job_config.get_warmup_proportion())
                    for param_group in optimizer.param_groups:
                        param_group['lr'] = lr_this_step

                    # Record the LR against global_step on tensorboard
                    if check_write_log():
                        summary_writer.add_scalar('Train/lr', lr_this_step,
                                                  global_step)
                else:
                    lr_this_step = job_config.get_learning_rate()
                optimizer.step()
                optimizer.zero_grad()
                global_step += 1
                if check_write_log() and (global_step % args.log_steps == 0):
                    logger.info("training_loss %f" %
                                np.float(global_step_loss))
                    logger.info("lr_this_step %f" % np.float(lr_this_step))
                    logger.info("loss over steps=%f, loss=%f" %
                                (global_step, np.float(global_step_loss)))
                    logger.info("lr over steps=%f, lr=%f" %
                                (global_step, np.float(lr_this_step)))
                global_step_loss = 0
        except StopIteration:
            continue

    logger.info("Completed {} steps".format(step))
    logger.info("Completed processing {} sequences".format(sequences_counter))

    # Run Validation Loss
    if max_seq_length == 512:
        logger.info(f"TRAIN BATCH SIZE: {train_batch_size}")
        return pretrain_validation(index)
    else:
        return None
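
In the fp16 branch above, the learning rate is recomputed at every optimizer step via warmup_linear_decay_exp, whose definition is not shown. The sketch below is one plausible warmup-then-exponential-decay multiplier that matches the call signature; the real implementation in the job's codebase may differ.

def warmup_linear_decay_exp(global_step, decay_rate, decay_steps,
                            total_steps, warmup=0.002):
    # Fraction of total training completed so far.
    progress = global_step / total_steps
    if progress < warmup:
        # Linear warmup: scale from 0 up to 1 over the first `warmup` fraction of training.
        return progress / warmup
    # After warmup, decay exponentially: multiply by decay_rate once for every
    # decay_steps optimizer steps taken past the warmup boundary.
    warmup_end = warmup * total_steps
    return decay_rate ** ((global_step - warmup_end) / decay_steps)

The value returned is a multiplier on job_config.get_learning_rate(), which is how the call site in train() uses it.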