Example #1
import torch.distributed as dist
import smdistributed.modelparallel.torch as smp


def get_world_size():
    # Default to a single process when torch.distributed is unavailable
    # or has not been initialized.
    if not dist.is_available():
        return 1
    if not dist.is_initialized():
        return 1
    # Prefer the SageMaker model-parallel (SMP) view of the world size.
    if smp.is_initialized():
        return smp.size()
    return dist.get_world_size()
Example #2
import torch.distributed as dist
import smdistributed.modelparallel.torch as smp


def get_rank():
    # Default to rank 0 when torch.distributed is unavailable
    # or has not been initialized.
    if not dist.is_available():
        return 0
    if not dist.is_initialized():
        return 0
    # Prefer the SageMaker model-parallel (SMP) view of the rank.
    if smp.is_initialized():
        return smp.rank()
    return dist.get_rank()
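
A minimal usage sketch (assumed, not part of the original examples): with these helpers, rank-dependent code runs unchanged whether the job is single-process, torch.distributed, or SMP. For instance, sharding a file list across workers:

# Hypothetical illustration using get_world_size()/get_rank() from above;
# `files` stands in for any sorted list of shard paths.
files = sorted(["part-0.hdf5", "part-1.hdf5", "part-2.hdf5", "part-3.hdf5"])
shard = files[get_rank()::get_world_size()]
print(f"rank {get_rank()}/{get_world_size()} owns {shard}")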
Example #3
import logging
import os
import random
import signal
import time
import warnings
from concurrent.futures import ProcessPoolExecutor

import numpy as np
import torch
import torch.backends.cudnn as cudnn
import torch.distributed as dist
import torch.multiprocessing as mp
from torch.utils.data import DataLoader, RandomSampler
from tqdm import tqdm

from apex import amp
import smdistributed.dataparallel.torch.distributed as sdp
import smdistributed.modelparallel.torch as smp

logger = logging.getLogger(__name__)

# get_world_size() / get_rank() used below are the helpers from Examples #1 and #2.

# Flipped to True by a SIGTERM handler when the scheduler preempts the job.
timeout_sent = False


def dist_init(fn, args):
    if args.seed is not None:
        random.seed(args.seed)
        torch.manual_seed(args.seed)
        np.random.seed(args.seed)
        torch.cuda.manual_seed_all(args.seed)
        cudnn.deterministic = True
        warnings.warn('You have chosen to seed training. '
                      'This will turn on the CUDNN deterministic setting, '
                      'which can slow down your training considerably! '
                      'You may see unexpected behavior when restarting '
                      'from checkpoints.')

    args.is_distributed = len(args.hosts) > 1 and args.backend is not None
    args.is_multigpus = args.num_gpus > 1
    args.multigpus_distributed = (args.is_distributed or args.is_multigpus)

    logger.debug("multigpus_distributed - {}".format(
        args.multigpus_distributed))
    logger.debug("Number of gpus available - {}".format(args.num_gpus))

    if args.multigpus_distributed:
        if args.apex:
            # Initialize the distributed environment.
            mp.spawn(fn, nprocs=args.num_gpus, args=(args, ))
        else:
            if args.data_parallel and not sdp.is_initialized():
                sdp.init_process_group()
            elif args.model_parallel and not smp.is_initialized():
                smp.init()

            fn(None, args)

            if args.model_parallel:
                smp.barrier()
    else:
        fn(0, args)
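

# Assumed usage (not shown in the original example): dist_init is the launch
# shim and is typically invoked from the entry point with the parsed CLI args
# and a per-process worker, e.g.
#
#     args = parse_arguments()
#     dist_init(train_worker, args)
#
# where train_worker(local_rank, args) runs one training process.
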
def main():
    global timeout_sent

    args = parse_arguments()

    random.seed(args.seed + args.local_rank)
    np.random.seed(args.seed + args.local_rank)
    torch.manual_seed(args.seed + args.local_rank)
    torch.cuda.manual_seed(args.seed + args.local_rank)
    worker_init = WorkerInitObj(args.seed + args.local_rank)

    device, args = setup_training(args)

    # Prepare optimizer
    (
        model,
        optimizer,
        lr_scheduler,
        checkpoint,
        global_step,
        criterion,
    ) = prepare_model_and_optimizer(args, device)

    raw_train_start = None
    most_recent_ckpts_paths = []
    average_loss = 0.0  # averaged loss every args.log_freq steps
    epoch = 0
    training_steps = 0
    test_losses = []

    pool = ProcessPoolExecutor(1)

    # Note: We loop infinitely over epochs, termination is handled via iteration count
    while True:
        thread = None
        restored_data_loader = None
        if (not args.resume_from_checkpoint or epoch > 0
                or (args.phase2 and global_step < 1) or args.init_checkpoint):
            files = [
                os.path.join(args.input_dir, f)
                for f in os.listdir(args.input_dir)
                if os.path.isfile(os.path.join(args.input_dir, f))
                and "training" in f
            ]
            files.sort()
            num_files = len(files)
            random.Random(args.seed + epoch).shuffle(files)
            f_start_id = 0
        else:
            f_start_id = checkpoint["files"][0]
            files = checkpoint["files"][1:]
            args.resume_from_checkpoint = False
            num_files = len(files)
            # may not exist in all checkpoints
            epoch = checkpoint.get("epoch", 0)
            restored_data_loader = checkpoint.get("data_loader", None)

        shared_file_list = {}

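        # Resolve the data-parallel world size/rank: prefer SMP's
        # data-parallel group, then torch.distributed, else fall back
        # to a single process.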
        if smp.is_initialized():
            dpsize = smp.dp_size()
            dprank = smp.dp_rank()
        elif torch.distributed.is_initialized():
            dpsize = get_world_size()
            dprank = get_rank()
        else:
            dpsize = 1
            dprank = 0
        dparallel = dpsize > 1
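        # Map this rank to its starting shard file; the remainder term
        # staggers assignments when there are more ranks than files.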
        if dparallel and dpsize > num_files:
            remainder = dpsize % num_files
            data_file = files[(f_start_id * dpsize + dprank +
                               remainder * f_start_id) % num_files]
        else:
            data_file = files[(f_start_id * dpsize + dprank) % num_files]

        previous_file = data_file

        if restored_data_loader is None:
            train_data = pretraining_dataset(data_file,
                                             args.max_predictions_per_seq)
            train_sampler = RandomSampler(train_data)
            train_dataloader = DataLoader(
                train_data,
                sampler=train_sampler,
                batch_size=args.train_batch_size * args.n_gpu,
                num_workers=4,
                worker_init_fn=worker_init,
                pin_memory=True,
                drop_last=True,
            )
            # shared_file_list["0"] = (train_dataloader, data_file)
        else:
            train_dataloader = restored_data_loader
            restored_data_loader = None

        overflow_buf = None
        if args.allreduce_post_accumulation:
            overflow_buf = torch.cuda.IntTensor([0])

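        # Iterate over the remaining shard files, prefetching the next
        # dataset in a worker process while training on the current one.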
        for f_id in range(f_start_id + 1, len(files)):
            # Keep shard selection consistent with the data-parallel
            # size/rank computed above (this also guarantees `remainder`
            # is defined whenever this branch needs it).
            if dpsize > num_files:
                data_file = files[(f_id * dpsize + dprank +
                                   remainder * f_id) % num_files]
            else:
                data_file = files[(f_id * dpsize + dprank) % num_files]

            previous_file = data_file

            dataset_future = pool.submit(
                create_pretraining_dataset,
                data_file,
                args.max_predictions_per_seq,
                shared_file_list,
                args,
                worker_init,
            )

            train_iter = (tqdm(train_dataloader,
                               desc="Iteration",
                               disable=args.disable_progress_bar)
                          if is_main_process() else train_dataloader)

            if raw_train_start is None:
                raw_train_start = time.time()

            for step, batch in enumerate(train_iter):
                training_steps += 1
                batch = [t.to(device) for t in batch]
                (input_ids, segment_ids, input_mask, masked_lm_labels,
                 next_sentence_labels) = batch
                if args.do_train:
                    model.train()
                    if args.smp > 0:
                        loss_mbs = smp_step(
                            args,
                            device,
                            input_ids,
                            segment_ids,
                            input_mask,
                            masked_lm_labels,
                            next_sentence_labels,
                            model,
                            optimizer,
                            criterion,
                            step,
                        )
                        loss = loss_mbs.reduce_mean()
                        if smp.rank() == 0:
                            print("Loss:", loss.item())
                    else:
                        loss = train_step(
                            args,
                            device,
                            input_ids,
                            segment_ids,
                            input_mask,
                            masked_lm_labels,
                            next_sentence_labels,
                            model,
                            optimizer,
                            criterion,
                            step,
                        )
                    divisor = 1
                    average_loss += loss.item()

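                    # Step the LR scheduler and optimizer once per
                    # gradient-accumulation window.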
                    if training_steps % args.gradient_accumulation_steps == 0:
                        lr_scheduler.step()  # learning rate warmup
                        global_step = take_optimizer_step(
                            args, optimizer, model, overflow_buf, global_step)

                    if global_step >= args.steps_this_run or timeout_sent:
                        train_time_raw = time.time() - raw_train_start
                        last_num_steps = (int(
                            training_steps / args.gradient_accumulation_steps)
                                          % args.log_freq)
                        last_num_steps = args.log_freq if last_num_steps == 0 else last_num_steps
                        average_loss = torch.tensor(
                            average_loss, dtype=torch.float32).cuda()
                        average_loss = average_loss / (last_num_steps *
                                                       divisor)
                        if torch.distributed.is_initialized():
                            average_loss /= get_world_size()
                            torch.distributed.all_reduce(average_loss)
                        final_loss = average_loss.item()
                    elif training_steps % (
                            args.log_freq *
                            args.gradient_accumulation_steps) == 0:
                        average_loss = 0

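                    # Checkpoint at the save interval, at the final step,
                    # or when a scheduler timeout was signalled.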
                    if (global_step >= args.steps_this_run or training_steps %
                        (args.num_steps_per_checkpoint *
                         args.gradient_accumulation_steps) == 0
                            or timeout_sent):
                        if smp.dp_rank() == 0 and not args.skip_checkpoint:
                            if args.resume_step < 0 or not args.phase2:
                                output_save_file = os.path.join(
                                    args.output_dir,
                                    "ckpt_{}.pt".format(global_step))
                            else:
                                output_save_file = os.path.join(
                                    args.output_dir,
                                    "ckpt_{}.pt".format(global_step +
                                                        args.phase1_end_step),
                                )
                            if args.do_train:
                                save_dict = {
                                    "model": model.local_state_dict(),
                                    "optimizer": optimizer.local_state_dict(),
                                    "files": [f_id] + files,
                                    "epoch": epoch,
                                    "data_loader": (None if global_step >=
                                                    args.steps_this_run else
                                                    train_dataloader),
                                }
                                if args.fp16:
                                    save_dict["master params"] = list(
                                        amp.master_params(optimizer))
                                # SMP: Checkpoint mp_rank specific state
                                smp.save(save_dict,
                                         output_save_file,
                                         partial=True)

                                most_recent_ckpts_paths.append(output_save_file)
                                # Keep only the three most recent partial
                                # checkpoints on disk.
                                if len(most_recent_ckpts_paths) > 3 and (
                                        args.smp == 0 or smp.dp_rank() == 0):
                                    ckpt_to_be_removed = most_recent_ckpts_paths.pop(0)
                                    os.remove(ckpt_to_be_removed +
                                              f"_{smp.mp_rank()}")

                        # Exiting the training due to hitting max steps, or being sent a
                        # timeout from the cluster scheduler
                        if global_step >= args.steps_this_run or timeout_sent:
                            del train_dataloader
                            # thread.join()
                            if smp.dp_rank() == 0 and args.save_full:
                                output_save_file = os.path.join(
                                    args.output_dir,
                                    "ckpt_{}.pt".format(global_step))
                                save_dict = {
                                    "model": model.local_state_dict(),
                                    "optimizer": optimizer.local_state_dict(),
                                    "files": [f_id] + files,
                                    "epoch": epoch,
                                    "data_loader": (None if global_step >=
                                                    args.steps_this_run else
                                                    train_dataloader),
                                }
                                if args.fp16:
                                    save_dict["master params"] = list(
                                        amp.master_params(optimizer))
                                # SMP: Save a single checkpoint containing entire model parameters
                                smp.save(save_dict,
                                         output_save_file,
                                         partial=False)
                            smp.barrier()
                            if smp.local_rank() == 0:
                                print(f"Start syncing model checkpoints to s3")
                                base_s3_path = os.path.dirname(
                                    os.path.dirname(
                                        os.getenv("SM_MODULE_DIR", "")))
                                curr_host = os.getenv("SM_CURRENT_HOST")
                                full_s3_path = f"{base_s3_path}/checkpoints/{curr_host}/"
                                sync_local_checkpoints_to_s3(
                                    local_path=args.output_dir,
                                    s3_path=full_s3_path)
                                print("Finished syncing model checkpoints to s3")
                            return args, final_loss, train_time_raw, global_step
                else:
                    model.eval()
                    with torch.no_grad():
                        loss = test_step(
                            args,
                            device,
                            input_ids,
                            segment_ids,
                            input_mask,
                            masked_lm_labels,
                            next_sentence_labels,
                            model,
                            criterion,
                            step,
                        )
                        print(f"global_step {global_step} Test Loss:", loss)
                        test_losses.append(loss)
                    global_step += 1
                    if global_step >= args.steps_this_run:
                        return sum(test_losses) / len(test_losses)

            del train_dataloader
            # thread.join()
            # Make sure pool has finished and switch train_dataloader
            # NOTE: Will block until complete
            train_dataloader, data_file = dataset_future.result(timeout=None)
        epoch += 1
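

# Minimal entry-point sketch (an assumption, not shown in the original
# example): a SIGTERM handler flips `timeout_sent`, letting the loop above
# checkpoint and exit cleanly when the cluster scheduler preempts the job.
def _handle_timeout(signum, frame):
    global timeout_sent
    timeout_sent = True


if __name__ == "__main__":
    signal.signal(signal.SIGTERM, _handle_timeout)
    main()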