Code Example #1
# Imports assumed by this example; project-specific modules (dataset, utils,
# build_model, DotDict, the MODEL_TYPE_* constants) come from the competition
# repository and are not shown here.
import os
import pprint

import numpy as np
import torch
import torch.optim as optim
from adamp import AdamP
from torch.utils.data import DataLoader
from torch.utils.tensorboard import SummaryWriter
from tqdm import tqdm


def train(experiment_name, distributed=False, continue_epoch=-1):
    model_str = experiment_name

    cfg = load_config_data(experiment_name)
    pprint.pprint(cfg)

    model_type = cfg["model_params"]["model_type"]
    train_params = DotDict(cfg["train_params"])

    checkpoints_dir = f"./checkpoints/{model_str}"
    tensorboard_dir = f"./tensorboard/{model_type}/{model_str}"
    oof_dir = f"./oof/{model_str}"
    os.makedirs(checkpoints_dir, exist_ok=True)
    os.makedirs(tensorboard_dir, exist_ok=True)
    os.makedirs(oof_dir, exist_ok=True)
    print("\n", experiment_name, "\n")

    logger = SummaryWriter(log_dir=tensorboard_dir)

    scaler = torch.cuda.amp.GradScaler()

    with utils.timeit_context("load train"):
        dataset_train = dataset.LyftDatasetPrerendered(
            dset_name=dataset.LyftDataset.DSET_TRAIN_XXL, cfg_data=cfg)

    with utils.timeit_context("load validation"):
        dataset_valid = dataset.LyftDatasetPrerendered(
            dset_name=dataset.LyftDataset.DSET_VALIDATION, cfg_data=cfg)

    batch_size = dataset_train.dset_cfg["batch_size"]

    data_loaders = {
        "train":
        DataLoader(dataset_train,
                   num_workers=16,
                   shuffle=True,
                   batch_size=batch_size),
        "val":
        DataLoader(
            dataset_valid,
            shuffle=False,
            num_workers=16,
            batch_size=dataset_valid.dset_cfg["batch_size"],
        ),
    }
    model_info = DotDict(cfg["model_params"])
    model = build_model(model_info, cfg)
    model = model.cuda()

    model.train()

    initial_lr = float(train_params.initial_lr)
    if train_params.optimizer == "adamp":
        optimizer = AdamP(model.parameters(), lr=initial_lr)
    elif train_params.optimizer == "adam":
        optimizer = optim.Adam(model.parameters(), lr=initial_lr)
    elif train_params.optimizer == "sgd":
        if model_type == MODEL_TYPE_REGRESSION_MULTI_MODE_EMB:
            # Double the learning rate for the head, base LR for the backbone
            # and a much larger LR for the embedding layer.
            optimizer = optim.SGD(
                [
                    {
                        "params": [
                            v for n, v in model.named_parameters()
                            if not n.startswith("emb.")
                            and not n.startswith("backbone.")
                        ],
                        "lr": initial_lr * 2,
                    },
                    {"params": model.backbone.parameters(), "lr": initial_lr},
                    {"params": model.emb.parameters(), "lr": initial_lr * 20},
                ],
                lr=initial_lr,
                momentum=0.9,
                nesterov=True,
            )
        else:
            optimizer = optim.SGD(model.parameters(),
                                  lr=initial_lr,
                                  momentum=0.9,
                                  nesterov=True)
    else:
        raise RuntimeError("Invalid optimizer: " + train_params.optimizer)

    if continue_epoch > 0:
        checkpoint = torch.load(f"{checkpoints_dir}/{continue_epoch:03}.pt")
        model.load_state_dict(checkpoint["model_state_dict"])
        optimizer.load_state_dict(checkpoint["optimizer_state_dict"])

    nb_epochs = train_params.nb_epochs
    if train_params.scheduler == "steps":
        scheduler = optim.lr_scheduler.MultiStepLR(
            optimizer,
            milestones=train_params.optimiser_milestones,
            gamma=0.2,
            last_epoch=continue_epoch,
        )
    elif train_params.scheduler == "CosineAnnealingLR":
        scheduler = optim.lr_scheduler.CosineAnnealingLR(
            optimizer,
            T_max=nb_epochs,
            eta_min=initial_lr / 1000,
            last_epoch=continue_epoch,
        )
    elif train_params.scheduler == "CosineAnnealingWarmRestarts":
        scheduler = utils.CosineAnnealingWarmRestarts(
            optimizer,
            T_0=train_params.scheduler_period,
            T_mult=train_params.get('scheduler_t_mult', 1),
            eta_min=initial_lr / 1000.0,
            last_epoch=-1)
        for i in range(continue_epoch + 1):
            scheduler.step()
    else:
        raise RuntimeError("Invalid scheduler name")

    grad_clip_value = train_params.get("grad_clip", 2.0)
    print("grad clip:", grad_clip_value)

    print(
        f"Num training agents: {len(dataset_train)} validation agents: {len(dataset_valid)}"
    )

    for epoch_num in range(continue_epoch + 1, nb_epochs + 1):
        for phase in ["train", "val"]:
            model.train(phase == "train")
            epoch_loss_segmentation = []
            epoch_loss_regression = []
            epoch_loss_regression_aux = []
            data_loader = data_loaders[phase]

            optimizer.zero_grad()

            if phase == "train":
                nb_steps_per_epoch = train_params.epoch_size // batch_size
                data_iter = tqdm(
                    utils.LoopIterable(data_loader,
                                       max_iters=nb_steps_per_epoch),
                    total=nb_steps_per_epoch,
                    ncols=250,
                )
            else:
                if epoch_num % 2 > 0:  # validate only on even epochs for speed
                    continue

                data_iter = tqdm(data_loader, ncols=250)

            for data in data_iter:
                with torch.set_grad_enabled(phase == "train"):
                    # torch.set_anomaly_enabled(True)
                    inputs = data["image"].float().cuda()
                    # agent_state = data["agent_state"].float().cuda()
                    agent_state = None
                    target_availabilities = data["target_availabilities"].cuda()

                    targets = data["target_positions"].cuda()

                    pos_scale = 1.0

                    optimizer.zero_grad()

                    loss_segmentation = 0
                    loss_regression = 0
                    loss_regression_aux = 0

                    if model_type == MODEL_TYPE_ATTENTION:
                        all_agents_state = data["all_agents_state"].float().cuda()
                        image_blocks_positions_agent = (
                            data["image_blocks_positions_agent"].cuda())

                        with torch.cuda.amp.autocast():
                            pred, confidences = model(
                                inputs, image_blocks_positions_agent,
                                all_agents_state)

                            loss_regression = utils.pytorch_neg_multi_log_likelihood_batch(
                                gt=targets.float() * pos_scale,
                                pred=pred.float() * pos_scale,
                                confidences=confidences.float(),
                                avails=target_availabilities.float(),
                            )

                    if model_type == MODEL_TYPE_REGRESSION_MULTI_MODE_WITH_OTHER_AGENTS_INPUTS:
                        all_agents_state = data["all_agents_state"].float().cuda()

                        with torch.cuda.amp.autocast():
                            pred, confidences = model(inputs, all_agents_state)

                            loss_regression = utils.pytorch_neg_multi_log_likelihood_batch(
                                gt=targets.float(),
                                pred=pred.float(),
                                confidences=confidences.float(),
                                avails=target_availabilities.float(),
                            )

                    if model_type == MODEL_TYPE_REGRESSION_MULTI_MODE:
                        with torch.cuda.amp.autocast():
                            pred, confidences = model(inputs, agent_state)

                            loss_regression = utils.pytorch_neg_multi_log_likelihood_batch_from_log_sm(
                                gt=targets.float() * pos_scale,
                                pred=pred.float() * pos_scale,
                                confidences=confidences.float(),
                                avails=target_availabilities.float(),
                            )

                    if model_type == MODEL_TYPE_REGRESSION_MULTI_MODE_AUX_OUT:
                        with torch.cuda.amp.autocast():
                            pred, confidences, pred_aux, confidences_aux = model(
                                inputs, agent_state,
                                data["image_4x"].float().cuda())

                            loss_regression = utils.pytorch_neg_multi_log_likelihood_batch_from_log_sm(
                                gt=targets.float(),
                                pred=pred.float(),
                                confidences=confidences.float(),
                                avails=target_availabilities.float(),
                            )

                            loss_regression_aux = utils.pytorch_neg_multi_log_likelihood_batch_from_log_sm(
                                gt=targets.float(),
                                pred=pred_aux.float(),
                                confidences=confidences_aux.float(),
                                avails=target_availabilities.float(),
                            )

                    if model_type == MODEL_TYPE_REGRESSION_MULTI_MODE_I4X:
                        with torch.cuda.amp.autocast():
                            pred, confidences = model(
                                inputs, agent_state,
                                data["image_4x"].float().cuda())

                            loss_regression = utils.pytorch_neg_multi_log_likelihood_batch(
                                gt=targets.float() * pos_scale,
                                pred=pred.float() * pos_scale,
                                confidences=confidences.float(),
                                avails=target_availabilities.float(),
                            )

                    if model_type == MODEL_TYPE_REGRESSION_MULTI_MODE_WITH_MASKS:
                        with torch.cuda.amp.autocast():
                            pred, confidences = model(
                                inputs, agent_state,
                                data["other_agents_masks"].float().cuda())

                            loss_regression = utils.pytorch_neg_multi_log_likelihood_batch(
                                gt=targets.float() * pos_scale,
                                pred=pred.float() * pos_scale,
                                confidences=confidences.float(),
                                avails=target_availabilities.float(),
                            )

                    if model_type == MODEL_TYPE_REGRESSION_MULTI_MODE_EMB:
                        with torch.cuda.amp.autocast():
                            pred, confidences = model(
                                inputs, agent_state,
                                data["corners"].float().cuda())

                            loss_regression = utils.pytorch_neg_multi_log_likelihood_batch(
                                gt=targets.float() * pos_scale,
                                pred=pred.float() * pos_scale,
                                confidences=confidences.float(),
                                avails=target_availabilities.float(),
                            )
                    elif model_type == MODEL_TYPE_SEGMENTATION:
                        target_mask = data["output_mask"].cuda()
                        l2_cls, l1_cls = model(inputs, agent_state)
                        loss_segmentation = (
                            torch.nn.functional.binary_cross_entropy_with_logits(
                                l2_cls, target_mask) * 1000
                            + torch.nn.functional.binary_cross_entropy_with_logits(
                                l1_cls, target_mask) * 100)
                    elif model_type == MODEL_TYPE_SEGMENTATION_AND_REGRESSION:
                        target_mask = data["output_mask"].cuda()
                        segmentation, pred, confidences = model(
                            inputs, agent_state)
                        loss_segmentation = (
                            torch.nn.functional.binary_cross_entropy_with_logits(
                                segmentation, target_mask) * 1000)

                        loss_regression = utils.pytorch_neg_multi_log_likelihood_batch(
                            gt=targets.float() * pos_scale,
                            pred=pred.float() * pos_scale,
                            confidences=confidences.float(),
                            avails=target_availabilities.float(),
                        )

                    loss = loss_segmentation + loss_regression + loss_regression_aux

                    if phase == "train":
                        scaler.scale(loss).backward()

                        # Unscales the gradients of optimizer's assigned params in-place
                        scaler.unscale_(optimizer)
                        torch.nn.utils.clip_grad_norm_(model.parameters(),
                                                       grad_clip_value)
                        # optimizer's gradients are already unscaled, so scaler.step does not unscale them,
                        # although it still skips optimizer.step() if the gradients contain infs or NaNs.
                        scaler.step(optimizer)
                        scaler.update()

                    if phase == "val":
                        # save predictions visualisation
                        pass

                    epoch_loss_segmentation.append(float(loss_segmentation))
                    epoch_loss_regression.append(float(loss_regression))
                    epoch_loss_regression_aux.append(
                        float(loss_regression_aux))
                    loss_segmentation = None
                    loss_regression = None
                    loss_regression_aux = None
                    del loss

                    data_iter.set_description(
                        f"{epoch_num} {phase[0]}"
                        f" Loss r {np.mean(epoch_loss_regression):1.4f} "
                        f" r aux {np.mean(epoch_loss_regression_aux):1.4f} "
                        f"s {np.mean(epoch_loss_segmentation):1.4f}")

            logger.add_scalar(f"loss_{phase}", np.mean(epoch_loss_regression),
                              epoch_num)
            if epoch_loss_segmentation[-1] > 0:
                logger.add_scalar(f"loss_segmentation_{phase}",
                                  np.mean(epoch_loss_segmentation), epoch_num)

            if epoch_loss_regression_aux[-1] > 0:
                logger.add_scalar(f"loss_regression_aux_{phase}",
                                  np.mean(epoch_loss_regression_aux),
                                  epoch_num)

            if phase == "train":
                logger.add_scalar("lr", optimizer.param_groups[0]["lr"],
                                  epoch_num)
            logger.flush()

            if phase == "train":
                scheduler.step()
                if (epoch_num % train_params.save_period == 0
                        or epoch_num == nb_epochs):
                    torch.save(
                        {
                            "epoch": epoch_num,
                            "model_state_dict": model.module.state_dict()
                            if distributed else model.state_dict(),
                            "optimizer_state_dict": optimizer.state_dict(),
                        },
                        f"{checkpoints_dir}/{epoch_num:03}.pt",
                    )
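
# A minimal usage sketch (hypothetical experiment name; the YAML config read by
# load_config_data and the pre-rendered Lyft datasets must exist on disk):
# if __name__ == "__main__":
#     train("my_experiment", distributed=False, continue_epoch=-1)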
Code Example #2
# Imports assumed by this example; `toptim` is taken to be the
# `torch_optimizer` package.
import torch
import torch.optim as optim
import torch_optimizer as toptim
from torch.optim import lr_scheduler


def init_optimizer(optimizer_name,
                   model,
                   optimizer_state,
                   lr,
                   wd,
                   lr_restart_step=1,
                   lr_decay_gamma=0.9,
                   scheduler="step",
                   nesterov=False,
                   num_epochs=None,
                   steps_per_epoch=None):
    if optimizer_name == "sgd":
        optimizer_ft = optim.SGD(model.parameters(),
                                 lr=lr,
                                 momentum=0.9,
                                 weight_decay=wd,
                                 nesterov=nesterov)
    elif optimizer_name == "adam":
        optimizer_ft = optim.Adam(model.parameters(), lr=lr, weight_decay=wd)
    elif optimizer_name == "adamp":
        from adamp import AdamP
        optimizer_ft = AdamP(model.parameters(),
                             lr=0.001,
                             betas=(0.9, 0.999),
                             weight_decay=1e-2)
    elif optimizer_name == "sgdp":
        from adamp import SGDP
        optimizer_ft = SGDP(model.parameters(),
                            lr=0.1,
                            weight_decay=1e-5,
                            momentum=0.9,
                            nesterov=nesterov)
    else:
        opt_attr = getattr(toptim, optimizer_name, None)
        if opt_attr:
            optimizer_ft = opt_attr(model.parameters())
        else:
            raise Exception("unknown optimizer name", optimizer_name)

    if optimizer_state is not None:
        optimizer_ft.load_state_dict(optimizer_state)

    if scheduler == "cosine":
        exp_lr_scheduler = lr_scheduler.CosineAnnealingWarmRestarts(
            optimizer_ft, lr_restart_step)
        use_lr_schedule_steps = True
    elif scheduler == "cycle":
        exp_lr_scheduler = torch.optim.lr_scheduler.OneCycleLR(
            optimizer_ft,
            max_lr=lr,
            steps_per_epoch=steps_per_epoch,
            epochs=num_epochs,
            pct_start=0.1)
        use_lr_schedule_steps = False
    elif scheduler == "step":
        exp_lr_scheduler = lr_scheduler.StepLR(optimizer_ft,
                                               step_size=lr_restart_step,
                                               gamma=lr_decay_gamma)
        use_lr_schedule_steps = False
    else:
        raise ValueError("unknown scheduler name: " + scheduler)

    return optimizer_ft, exp_lr_scheduler, use_lr_schedule_steps
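
# A minimal usage sketch (hypothetical hyper-parameters; the stand-in model is
# for illustration only). The returned flag tells the caller which stepping
# scheme the chosen scheduler expects.
_model = torch.nn.Linear(8, 2)
optimizer, scheduler, use_lr_schedule_steps = init_optimizer(
    "sgd", _model, optimizer_state=None, lr=0.01, wd=1e-4,
    lr_restart_step=10, scheduler="cosine", nesterov=True)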