Example #1
            optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
        else:
            optimizer = FP16_Optimizer(optimizer, static_loss_scale=loss_scale)
    else:
        optimizer = BertAdam(optimizer_grouped_parameters,
                             lr=job_config.get_learning_rate(),
                             warmup=job_config.get_warmup_proportion(),
                             t_total=job_config.get_total_training_steps())

    global_step = 0
    start_epoch = 0

    # if args.load_training_checkpoint is not None:
    if load_training_checkpoint != 'False':
        logger.info(f"Looking for previous training checkpoint.")
        latest_checkpoint_path = latest_checkpoint_file(
            args.load_training_checkpoint, no_cuda)

        logger.info(
            f"Restoring previous training checkpoint from {latest_checkpoint_path}"
        )
        start_epoch, global_step = load_checkpoint(model, optimizer,
                                                   latest_checkpoint_path)
        logger.info(
            f"Restored model from the last checkpoint at epoch {start_epoch}, global step {global_step}"
        )

    logger.info("Training the model")

    best_loss = None
    for index in range(start_epoch, args.epochs):
        logger.info(f"Training epoch: {index + 1}")
Example #2
            optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
        else:
            optimizer = FP16_Optimizer(optimizer, static_loss_scale=loss_scale)
    else:
        optimizer = BertAdam(optimizer_grouped_parameters,
                             lr=job_config.get_learning_rate(),
                             warmup=job_config.get_warmup_proportion(),
                             t_total=job_config.get_total_training_steps())

    global_step = 0
    start_epoch = 0

    # if args.load_training_checkpoint is not None:
    if load_training_checkpoint != 'False':
        logger.info(f"Looking for previous training checkpoint.")
        latest_checkpoint_path = latest_checkpoint_file(parent_dir, no_cuda)

        logger.info(
            f"Restoring previous training checkpoint from {latest_checkpoint_path}"
        )
        start_epoch, global_step = load_checkpoint(model, optimizer,
                                                   latest_checkpoint_path)
        logger.info(
            f"Restored model from the last checkpoint at epoch {start_epoch}, global step {global_step}"
        )

    logger.info("Training the model")

    for index in range(start_epoch, job_config.get_total_epoch_count()):
        logger.info(f"Training epoch: {index + 1}")