import itertools

import torch
import torch.nn as nn
import torch.optim as optim

# project-local components assumed by this snippet: Generator, Discriminator,
# ImagePool, init_weights
LR = 2e-4  # base learning rate; the standard CycleGAN value, assumed here
           # because the snippet references LR without defining it
N_BLOCKS = 9
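# constant LR for the first 100 epochs, then linear decay to 0 over the next 100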
LR_LAMBDA = lambda epoch: min(1, 1 - (epoch - 100) / 100)
IMG_SIZE = 286
INPUT_SIZE = 256
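
# How these two sizes are typically used in the CycleGAN pipeline: resize to
# IMG_SIZE, then random-crop to INPUT_SIZE as jitter augmentation. A minimal
# sketch with torchvision (the transform itself is an assumption, not part of
# the original snippet):
from torchvision import transforms

train_transform = transforms.Compose([
    transforms.Resize((IMG_SIZE, IMG_SIZE)),      # resize to 286x286
    transforms.RandomCrop(INPUT_SIZE),            # random 256x256 crop
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),  # map to [-1, 1]
])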

# device
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

# Networks
G_A2B = Generator(input_dim=3, n_blocks=N_BLOCKS).to(device)
G_B2A = Generator(input_dim=3, n_blocks=N_BLOCKS).to(device)
D_A = Discriminator(input_dim=3).to(device)
D_B = Discriminator(input_dim=3).to(device)
G_A2B.apply(init_weights)
G_B2A.apply(init_weights)
D_A.apply(init_weights)
D_B.apply(init_weights)

# ImagePool
fake_A_pool = ImagePool(size=50)
fake_B_pool = ImagePool(size=50)
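
# In the training loop, the pools typically sit between the generators and the
# discriminators: each discriminator is scored on a mix of the current fake and
# a randomly recalled older one, which damps training oscillation. A sketch,
# assuming an ImagePool.query method as in the reference CycleGAN code:
#   fake_B = G_A2B(real_A)
#   pred_fake = D_B(fake_B_pool.query(fake_B.detach()))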

# losses: least-squares adversarial loss (LSGAN) and L1 cycle-consistency loss
Loss_GAN = nn.MSELoss()
Loss_cyc = nn.L1Loss()

# optimizers (Adam with betas=(0.5, 0.999), as in the CycleGAN paper)
optimizer_G = optim.Adam(itertools.chain(G_A2B.parameters(),
                                         G_B2A.parameters()),
                         lr=LR,
                         betas=(0.5, 0.999))
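
# A matching discriminator optimizer and linear-decay schedulers complete the
# usual recipe (a sketch finishing the truncated example, not original code):
optimizer_D = optim.Adam(itertools.chain(D_A.parameters(), D_B.parameters()),
                         lr=LR, betas=(0.5, 0.999))
scheduler_G = optim.lr_scheduler.LambdaLR(optimizer_G, lr_lambda=LR_LAMBDA)
scheduler_D = optim.lr_scheduler.LambdaLR(optimizer_D, lr_lambda=LR_LAMBDA)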

Example #2

import logging
import os
import pickle
from collections import defaultdict

import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Subset
from omegaconf import DictConfig
from l5kit.data import ChunkedDataset, LocalDataManager
from l5kit.dataset import AgentDataset
from l5kit.rasterization import build_rasterizer

# project-local helpers assumed here: Generator, Discriminator, TransformDataset,
# f_get_raster_image, weights_init, evaluate, print_statistics, plot_traj_on_map,
# get_results_plot; illustrative sketches of gradient_penalty and l2_loss_kmin
# appear after the function


def trainer(cfg: DictConfig) -> None:
    os.environ["L5KIT_DATA_FOLDER"] = cfg.l5kit_data_folder
    dm = LocalDataManager(None)

    logger = logging.getLogger(__name__)

    logger.info("Working directory : {}".format(os.getcwd()))

    logger.info("Load dataset...")

    train_cfg = cfg["train_data_loader"]
    valid_cfg = cfg["valid_data_loader"]

    # rasterizer
    rasterizer = build_rasterizer(cfg, dm)

    train_path = train_cfg["key"]
    train_zarr = ChunkedDataset(dm.require(train_path)).open(cached=False)

    logger.info(f"train_zarr {type(train_zarr)}")

    # loading custom mask (we mask static agents)
    logger.info(f"Loading mask in path {train_cfg['mask_path']}")
    custom_mask = np.load(train_cfg['mask_path'])
    logger.info(f"Length of training mask is: {custom_mask.sum()}")

    train_agent_dataset = AgentDataset(cfg, train_zarr, rasterizer, agents_mask=custom_mask)

    # transform dataset to the proper frame of reference
    train_dataset = TransformDataset(train_agent_dataset, cfg)

    if train_cfg['subset'] != -1:
        train_dataset = Subset(train_dataset, np.arange(train_cfg['subset']))

    train_loader = DataLoader(train_dataset,
                              shuffle=train_cfg["shuffle"],
                              batch_size=train_cfg["batch_size"],
                              num_workers=train_cfg["num_workers"])

    logger.info(train_agent_dataset)

    # loading custom mask for validation dataset
    logger.info(f"Loading val mask in path {valid_cfg['mask_path']}")
    val_custom_mask = np.load(valid_cfg['mask_path'])
    logger.info(f"Length of validation mask is: {val_custom_mask.sum()}")

    valid_path = valid_cfg["key"]
    valid_zarr = ChunkedDataset(dm.require(valid_path)).open(cached=False)

    logger.info(f"valid_zarr {type(train_zarr)}")

    valid_agent_dataset = AgentDataset(cfg, valid_zarr, rasterizer, agents_mask=val_custom_mask)

    # transform validation dataset to the proper frame of reference
    valid_dataset = TransformDataset(valid_agent_dataset, cfg)

    if valid_cfg['subset'] != -1:
        valid_dataset = Subset(valid_dataset, np.arange(valid_cfg['subset']))

    valid_loader = DataLoader(
        valid_dataset,
        shuffle=valid_cfg["shuffle"],
        batch_size=valid_cfg["batch_size"],
        num_workers=valid_cfg["num_workers"]
    )

    logger.info(valid_agent_dataset)
    logger.info(f"# Full AgentDataset train: {len(train_agent_dataset)} #valid: {len(valid_agent_dataset)}")
    logger.info(f"# Actual AgentDataset train: {len(train_dataset)} #valid: {len(valid_dataset)}")

    n_epochs = cfg['train_params']['num_epochs']

    d_steps = cfg['train_params']['num_d_steps']
    g_steps = cfg['train_params']['num_g_steps']

    noise_dim = cfg['gan_params']['noise_dim']
    g_learning_rate = cfg['train_params']['g_learning_rate']
    d_learning_rate = cfg['train_params']['d_learning_rate']

    if cfg['gan_params']['gan_type'] == 'vanilla':
        cross_entropy = nn.BCELoss()
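    # the Wasserstein variants need no fixed criterion; their critic losses
    # are computed inline in the training loop below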

    generator = Generator(input_dim=cfg['gan_params']['input_dim'],
                          embedding_dim=cfg['gan_params']['embedding_dim'],
                          decoder_dim=cfg['gan_params']['decoder_dim'],
                          trajectory_dim=cfg['model_params']['future_num_frames'],
                          noise_dim=noise_dim,
                          backbone_type=cfg['gan_params']['backbone_type'],
                          embedding_type=cfg['gan_params']['embedding_type']
                          )

    generator.to(cfg['device'])
    generator.train()  # train mode
    
    W = cfg['raster_params']['raster_size'][0]
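    # l5kit's ego_center is given as a fraction of the raster, so multiplying
    # by the raster width W converts it to pixel coordinates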
    discriminator = Discriminator(width=W,
                                  h_0=cfg['raster_params']['ego_center'][0]*W,
                                  w_0=cfg['raster_params']['ego_center'][1]*W,
                                  r=cfg['raster_params']['pixel_size'][0],
                                  sigma=cfg['gan_params']['sigma'],
                                  channels_num=cfg['model_params']['future_num_frames']+3,
                                  num_disc_feats=cfg['gan_params']['num_disc_feats'],
                                  input_dim=cfg['gan_params']['input_dim'],
                                  device=cfg['device'],
                                  gan_type=cfg['gan_params']['gan_type'],
                                  embedding_type=cfg['gan_params']['embedding_type'],
                                  lstm_embedding_dim=cfg['gan_params']['embedding_dim']
                                  )

    discriminator.to(cfg['device'])
    discriminator.apply(weights_init)
    discriminator.train()  # train mode

    if cfg['gan_params']['gan_type'] == 'wasserstein':
        optimizer_g = optim.RMSprop(generator.parameters(), lr=g_learning_rate)
        optimizer_d = optim.RMSprop(discriminator.parameters(), lr=d_learning_rate)
    elif cfg['gan_params']['gan_type'] == 'wasserstein_gp':
        betas = (0.0, 0.9)
        optimizer_g = optim.Adam(generator.parameters(), lr=g_learning_rate, betas=betas)
        optimizer_d = optim.Adam(discriminator.parameters(), lr=d_learning_rate, betas=betas)
    else:
        optimizer_g = optim.Adam(generator.parameters(), lr=g_learning_rate)
        optimizer_d = optim.Adam(discriminator.parameters(), lr=d_learning_rate)

    d_steps_left = d_steps
    g_steps_left = g_steps

    # variables for statistics
    d_full_loss = []
    g_full_loss = []
    gp_values = []
    l2_variety_values = []
    metric_vals = []

    # checkpoint dictionary
    checkpoint = {
        'G_losses': defaultdict(list),
        'D_losses': defaultdict(list),
        'counters': {
            't': None,
            'epoch': None,
        },
        'g_state': None,
        'g_optim_state': None,
        'd_state': None,
        'd_optim_state': None
    }

    id_batch = 0

    # total number of batches
    len_of_epoch = len(train_loader)

    for epoch in range(n_epochs):
        for batch in train_loader:
            batch = [tensor.to(cfg['device']) for tensor in batch]

            # Creates single raster image from sequence of images from l5kit's AgentDataset
            batch[0] = f_get_raster_image(cfg=cfg,
                                          images=batch[0],
                                          history_weight=cfg['model_params']['history_fading_weight'])

            (image, target_positions, target_availabilities,
             history_positions, history_yaws, centroid, world_to_image) = batch

            actor_state = (history_positions, history_yaws)

            batch_size = image.shape[0]

            # noise for generator
            noise = torch.normal(size=(batch_size, noise_dim),
                                 mean=0.0,
                                 std=1.0,
                                 dtype=torch.float32,
                                 device=cfg['device'])

            #######################################
            #       TRAIN DISCRIMINATOR
            #######################################

            # alternate: train the discriminator for d_steps consecutive batches,
            # then the generator for g_steps consecutive batches

            if d_steps_left > 0:
                d_steps_left -= 1

                for pd in discriminator.parameters():  # reset requires_grad
                    pd.requires_grad = True  # they are set to False below in generator update

                # freeze generator while training discriminator
                for pg in generator.parameters():
                    pg.requires_grad = False

                discriminator.zero_grad()

                # generate fake trajectories (batch_size, target_size, 2) for current batch
                fake_trajectory = generator(image, actor_state, noise)

                # discriminator predictions (batch_size, 1) on real and fake trajectories
                d_real_pred = discriminator(target_positions, image, actor_state)
                d_g_pred = discriminator(fake_trajectory, image, actor_state)

                # loss
                if cfg['gan_params']['gan_type'] == 'vanilla':
                    # real/fake label tensors shaped like the discriminator outputs
                    real_labels = torch.ones_like(d_real_pred)
                    fake_labels = torch.zeros_like(d_g_pred)

                    real_loss = cross_entropy(d_real_pred, real_labels)
                    fake_loss = cross_entropy(d_g_pred, fake_labels)

                    total_loss = real_loss + fake_loss
                elif cfg['gan_params']['gan_type'] == 'wasserstein':  # D(fake) - D(real)
                    total_loss = torch.mean(d_g_pred) - torch.mean(d_real_pred)
                elif cfg['gan_params']['gan_type'] == 'wasserstein_gp':
                    gp_loss = gradient_penalty(discrim=discriminator,
                                               real_trajectory=target_positions,
                                               fake_trajectory=fake_trajectory,
                                               in_image=image,
                                               in_actor_state=actor_state,
                                               lambda_gp=cfg['losses']['lambda_gp'],
                                               device=cfg['device'])

                    total_loss = torch.mean(d_g_pred) - torch.mean(d_real_pred) + gp_loss
                else:
                    raise NotImplementedError

                # calculate gradients for this batch
                total_loss.backward()
                optimizer_d.step()

                # weight clipping for discriminator in pure Wasserstein GAN
                if cfg['gan_params']['gan_type'] == 'wasserstein':
                    c = cfg['losses']['weight_clip']
                    for p in discriminator.parameters():
                        p.data.clamp_(-c, c)

                d_full_loss.append(total_loss.item())

                if cfg['gan_params']['gan_type'] == 'wasserstein_gp':
                    gp_values.append(gp_loss.item())

            #######################################
            #         TRAIN GENERATOR
            #######################################

            elif g_steps_left > 0:  # we either train generator or discriminator on current batch
                g_steps_left -= 1

                for pd in discriminator.parameters():
                    pd.requires_grad = False  # avoid discriminator training

                # unfreeze generator
                for pg in generator.parameters():
                    pg.requires_grad = True

                generator.zero_grad()

                if cfg['losses']['use_variety_l2']:
                    l2_variety_loss, fake_trajectory = l2_loss_kmin(traj_real=target_positions,
                                                                    generator_=generator,
                                                                    image=image,
                                                                    actor_state=actor_state,
                                                                    cfg=cfg,
                                                                    kmin=cfg['losses']['k_min'],
                                                                    return_best_traj=True)
                else:
                    fake_trajectory = generator(image, actor_state, noise)

                d_g_pred = discriminator(fake_trajectory, image, actor_state)

                if cfg['gan_params']['gan_type'] == 'vanilla':
                    # the generator update scores fake samples against *real*
                    # labels: the standard non-saturating GAN generator loss
                    real_labels = torch.ones_like(d_g_pred)
                    fake_loss = cross_entropy(d_g_pred, real_labels)
                elif cfg['gan_params']['gan_type'] in ['wasserstein', 'wasserstein_gp']:  # -D(fake)
                    fake_loss = -torch.mean(d_g_pred)
                else:
                    raise NotImplementedError

                if cfg['losses']['use_variety_l2']:
                    fake_loss += cfg['losses']['weight_variety_l2'] * l2_variety_loss

                    l2_variety_values.append(l2_variety_loss.item())

                fake_loss.backward()
                optimizer_g.step()

                g_full_loss.append(fake_loss.item())

            # reset d_steps_left / g_steps_left at the end of a full discriminator-generator cycle
            if d_steps_left == 0 and g_steps_left == 0:
                d_steps_left = d_steps
                g_steps_left = g_steps

            # print current model state on train dataset
            if (id_batch > 0) and (id_batch % cfg['train_params']['print_every_n_steps'] == 0):

                print_statistics(logger=logger,
                                 cfg=cfg,
                                 epoch=epoch,
                                 len_of_epoch=len_of_epoch,
                                 id_batch=id_batch,
                                 d_full_loss=d_full_loss,
                                 g_full_loss=g_full_loss,
                                 gp_values=gp_values,
                                 l2_variety_values=l2_variety_values,
                                 print_over_n_last=1000)

                # save rasterized image of 0th element of current batch
                plot_traj_on_map(cfg, 0, batch, generator, save_name=str(id_batch),
                                 save_directory=cfg['train_params']['image_sample_dir'])

            # Save checkpoint and evaluate the model
            if (id_batch > 0) and (id_batch % cfg['train_params']['checkpoint_every_n_steps'] == 0):
                checkpoint['counters']['t'] = id_batch
                checkpoint['counters']['epoch'] = epoch

                # Check stats on the validation set
                logger.info('Checking stats on val ...')
                metrics_val = evaluate(cfg, generator, valid_loader)
                metric_vals.append(metrics_val)

                with open('metric_vals_list.pkl', 'wb') as handle:
                    pickle.dump(metric_vals, handle, protocol=pickle.HIGHEST_PROTOCOL)

                for k, v in sorted(metrics_val.items()):
                    logger.info('  [val] {}: {:.3f}'.format(k, v))

                checkpoint['g_state'] = generator.state_dict()
                checkpoint['g_optim_state'] = optimizer_g.state_dict()
                checkpoint['d_state'] = discriminator.state_dict()
                checkpoint['d_optim_state'] = optimizer_d.state_dict()
                checkpoint_path = os.path.join(os.getcwd(), f"{cfg['model_name']}_{id_batch}.pt")
                logger.info('Saving checkpoint to {}'.format(checkpoint_path))
                torch.save(checkpoint, checkpoint_path)
                logger.info('Done.')

                results_df, metric_df = get_results_plot(d_full_loss,
                                                         g_full_loss,
                                                         metric_vals,
                                                         train_window_size=100,
                                                         val_window_size=10,
                                                         is_save=True)

                results_df.to_excel('results.xlsx', index=False)
                metric_df.to_excel('val_metrics.xlsx', index=False)

            id_batch += 1
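
# The two helpers below are illustrative sketches of the losses referenced in
# the loop above; their signatures mirror the call sites, but the bodies are
# assumptions, not this repository's implementations.

def gradient_penalty(discrim, real_trajectory, fake_trajectory,
                     in_image, in_actor_state, lambda_gp, device):
    """WGAN-GP penalty (Gulrajani et al., 2017) on trajectory interpolates."""
    batch_size = real_trajectory.size(0)
    # one interpolation coefficient per sample, broadcast over (time, xy)
    alpha = torch.rand(batch_size, 1, 1, device=device)
    interp = alpha * real_trajectory + (1 - alpha) * fake_trajectory.detach()
    interp.requires_grad_(True)
    d_interp = discrim(interp, in_image, in_actor_state)
    grads = torch.autograd.grad(outputs=d_interp,
                                inputs=interp,
                                grad_outputs=torch.ones_like(d_interp),
                                create_graph=True,
                                retain_graph=True)[0]
    grads = grads.view(batch_size, -1)
    # penalize deviation of the per-sample gradient norm from 1
    return lambda_gp * ((grads.norm(2, dim=1) - 1) ** 2).mean()


def l2_loss_kmin(traj_real, generator_, image, actor_state, cfg, kmin,
                 return_best_traj=False):
    """Variety ('best-of-k') L2 loss in the spirit of Social GAN."""
    batch_size = traj_real.size(0)
    noise_dim = cfg['gan_params']['noise_dim']
    losses, samples = [], []
    for _ in range(kmin):
        noise = torch.randn(batch_size, noise_dim, device=cfg['device'])
        traj_fake = generator_(image, actor_state, noise)
        samples.append(traj_fake)
        # per-sample squared L2 error summed over time steps and coordinates
        losses.append(((traj_fake - traj_real) ** 2).flatten(1).sum(dim=1))
    losses = torch.stack(losses, dim=1)        # (batch, k)
    min_losses, best_idx = losses.min(dim=1)   # keep only the closest sample
    loss = min_losses.mean()
    if return_best_traj:
        samples = torch.stack(samples, dim=1)  # (batch, k, T, 2)
        return loss, samples[torch.arange(batch_size), best_idx]
    return loss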