def experiment(logdir, device) -> None:
    """Experiment function

    Args:
        logdir (Path): directory where logs should be placed
        device (str): device name to use
    """
    tb_dir = logdir / "tensorboard"
    main_metric = "loss"
    minimize_metric = True

    seed_all()

    history_n_frames = cfg["model_params"]["history_num_frames"]
    future_n_frames = cfg["model_params"]["future_num_frames"]
    n_trajectories = 3
    model = ModelWithConfidence(
        backbone=resnet18(
            pretrained=True,
            in_channels=3 + 2 * (history_n_frames + 1),
            num_classes=2 * future_n_frames * n_trajectories + n_trajectories,
        ),
        future_num_frames=future_n_frames,
        num_trajectories=n_trajectories,
    )
    # model = nn.DataParallel(model)
    model = model.to(device)
    optimizer = optim.Adam(model.parameters(), lr=1e-3)
    criterion = neg_multi_log_likelihood_batch
    scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=100)

    with TensorboardLogger(tb_dir) as tb:
        stage = "stage_0"
        n_epochs = 1
        print(f"Stage - {stage}")

        checkpointer = CheckpointManager(
            logdir=logdir / stage,
            metric=main_metric,
            metric_minimization=minimize_metric,
            save_n_best=5,
        )

        train_loader, valid_loader = get_loaders(
            train_batch_size=32, valid_batch_size=32
        )

        for epoch in range(1, n_epochs + 1):
            epoch_start_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
            print(f"[{epoch_start_time}]\n[Epoch {epoch}/{n_epochs}]")

            train_metrics = train_fn(model, train_loader, device, criterion, optimizer)
            log_metrics(stage, train_metrics, tb, "train", epoch)

            valid_metrics = valid_fn(model, valid_loader, device, criterion)
            log_metrics(stage, valid_metrics, tb, "valid", epoch)

            checkpointer.process(
                metric_value=valid_metrics[main_metric],
                epoch=epoch,
                checkpoint=make_checkpoint(
                    stage,
                    epoch,
                    model,
                    optimizer,
                    scheduler,
                    metrics={"train": train_metrics, "valid": valid_metrics},
                ),
            )

            scheduler.step()
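
# ---------------------------------------------------------------------------
# Illustrative sketch, not the project's implementation: the idea behind the
# `neg_multi_log_likelihood_batch` criterion used above. It is the negative
# log-likelihood of the ground-truth trajectory under a mixture of K predicted
# trajectories, combined across modes with logsumexp for numerical stability.
# The exact shapes and the use of raw confidence logits are assumptions.
# ---------------------------------------------------------------------------
import torch


def _nll_mixture_sketch(gt, pred, confidence_logits, avails):
    """gt: (B, T, 2), pred: (B, K, T, 2), confidence_logits: (B, K), avails: (B, T)."""
    # Squared error per mode and timestep, masking out unavailable future frames.
    error = ((gt.unsqueeze(1) - pred) * avails[:, None, :, None]) ** 2  # (B, K, T, 2)
    error = error.sum(dim=(2, 3))  # (B, K)
    # log pi_k + log p(gt | mode k), reduced over modes with logsumexp.
    log_prob = torch.log_softmax(confidence_logits, dim=1) - 0.5 * error
    return -torch.logsumexp(log_prob, dim=1).mean()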
model = ModelWithConfidence(
    backbone=resnet34_accel(
        pretrained=True,
        in_channels=6,
        num_classes=2 * future_n_frames * n_trajectories + n_trajectories,
        in_accel_features=(history_n_frames - 1) * 2,
        num_accel_features=32,
    ),
    future_num_frames=future_n_frames,
    num_trajectories=n_trajectories,
)
load_checkpoint(checkpoint_path, model)
model = model.eval()

device = torch.device("cuda:0")
model = model.to(device)

valid_mask = np.load(f"{DATA_DIR}/scenes/validate_chopped_100/mask.npz")["arr_0"]

dm = LocalDataManager(DATA_DIR)

# ====== INIT TEST DATASET ======
rasterizer = build_rasterizer(cfg, dm)
test_zarr = ChunkedDataset(dm.require("scenes/test.zarr")).open()
test_mask = np.load(f"{DATA_DIR}/scenes/mask.npz")["arr_0"]
test_dataset = AccelAgentDataset(cfg, test_zarr, rasterizer, agents_mask=test_mask)
test_dataloader = DataLoader(
    test_dataset,
    batch_size=32,  # assumed: the original value was cut off in this snippet
    shuffle=False,
    num_workers=8,  # assumed
)
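
# ---------------------------------------------------------------------------
# Sketch (assumption): how the test_dataloader above would typically be drained
# for submission-style inference. The batch keys follow the usual l5kit
# AgentDataset convention; the model's forward signature is an assumption, and
# real AccelAgentDataset batches presumably carry acceleration features too.
# ---------------------------------------------------------------------------
predictions, confidences, timestamps, track_ids = [], [], [], []
with torch.no_grad():
    for batch in test_dataloader:
        image = batch["image"].to(device)
        pred, confs = model(image)  # assumed to return (trajectories, confidences)
        predictions.append(pred.cpu().numpy())
        confidences.append(confs.cpu().numpy())
        timestamps.append(batch["timestamp"].numpy())
        track_ids.append(batch["track_id"].numpy())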
def experiment(logdir, device) -> None:
    """Experiment function

    Args:
        logdir (Path): directory where logs should be placed
        device (str): device name to use
    """
    tb_dir = logdir / "tensorboard"
    main_metric = "score"  # the checkpointer below ranks by the validation "score"
    minimize_metric = True

    seed_all()

    history_n_frames = cfg["model_params"]["history_num_frames"]
    future_n_frames = cfg["model_params"]["future_num_frames"]
    n_trajectories = 3
    model = ModelWithConfidence(
        backbone=resnet34_accel(
            pretrained=True,
            in_channels=3 + 3,
            num_classes=2 * future_n_frames * n_trajectories + n_trajectories,
            in_accel_features=(history_n_frames - 1) * 2,
            num_accel_features=32,
        ),
        future_num_frames=future_n_frames,
        num_trajectories=n_trajectories,
    )
    load_checkpoint(
        "./logs/resnet34_frast_fulldata_confidence_25hist_accel/epoch_1/train_689999.pth",
        model,
    )
    model = model.to(device)
    optimizer = optim.Adam(model.parameters(), lr=1e-3)
    criterion = neg_multi_log_likelihood_batch
    scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=100)

    with TensorboardLogger(tb_dir) as tb:
        stage = "stage_0"
        n_epochs = 1
        print(f"Stage - {stage}")

        checkpointer = CheckpointManager(
            logdir=logdir / stage,
            metric=main_metric,
            metric_minimization=minimize_metric,
            save_n_best=5,
        )

        train_loader, (valid_loader, valid_gt_path) = get_loaders(
            train_batch_size=32, valid_batch_size=32
        )
        valid_func = partial(
            valid_fn,
            loader=valid_loader,
            ground_truth_file=valid_gt_path,
            logdir=logdir,
            verbose=True,
        )

        for epoch in range(1, n_epochs + 1):
            epoch_start_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
            print(f"[{epoch_start_time}]\n[Epoch {epoch}/{n_epochs}]")

            train_metrics = train_fn(
                model,
                train_loader,
                device,
                criterion,
                optimizer,
                tensorboard_logger=tb,
                logdir=logdir / f"epoch_{epoch}",
                validation_fn=valid_func,
            )
            log_metrics(stage, train_metrics, tb, "train", epoch)

            valid_metrics = valid_fn(model, valid_loader, device, valid_gt_path, logdir)
            log_metrics(stage, valid_metrics, tb, "valid", epoch)

            checkpointer.process(
                metric_value=valid_metrics[main_metric],
                epoch=epoch,
                checkpoint=make_checkpoint(
                    stage,
                    epoch,
                    model,
                    optimizer,
                    scheduler,
                    metrics={"train": train_metrics, "valid": valid_metrics},
                ),
            )
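
# ---------------------------------------------------------------------------
# Sketch (assumption): the keep-n-best behaviour that CheckpointManager.process
# above presumably implements. A minimal stand-in to show the contract, not the
# project's actual class; file naming and pruning details are hypothetical.
# ---------------------------------------------------------------------------
import torch


class _CheckpointManagerSketch:
    def __init__(self, logdir, metric="score", metric_minimization=True, save_n_best=5):
        self.logdir = logdir
        self.metric = metric
        self.sign = 1 if metric_minimization else -1
        self.save_n_best = save_n_best
        self._best = []  # (signed metric value, checkpoint path)

    def process(self, metric_value, epoch, checkpoint):
        self.logdir.mkdir(parents=True, exist_ok=True)
        path = self.logdir / f"epoch_{epoch}.pth"
        torch.save(checkpoint, path)
        self._best.append((self.sign * metric_value, path))
        self._best.sort()  # best (smallest signed value) first
        for _, stale in self._best[self.save_n_best:]:
            stale.unlink(missing_ok=True)  # prune checkpoints outside the top-n
        self._best = self._best[: self.save_n_best]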
def experiment(logdir, device) -> None:
    """Experiment function

    Args:
        logdir (Path): directory where logs should be placed
        device (str): device name to use
    """
    tb_dir = logdir / "tensorboard"
    main_metric = "score"
    minimize_metric = True

    seed_all()

    history_n_frames = cfg["model_params"]["history_num_frames"]
    future_n_frames = cfg["model_params"]["future_num_frames"]
    n_trajectories = 3
    model = ModelWithConfidence(
        backbone=resnet18(
            pretrained=True,
            in_channels=3 + 2 * (history_n_frames + 1),
            num_classes=2 * future_n_frames * n_trajectories + n_trajectories,
        ),
        future_num_frames=future_n_frames,
        num_trajectories=n_trajectories,
    )
    # model = nn.DataParallel(model)
    model = model.to(device)
    # optimizer = optim.Adam(model.parameters(), lr=1e-3)
    optimizer = optim.SGD(model.parameters(), lr=1e-4)
    scheduler = optim.lr_scheduler.CyclicLR(
        optimizer,
        base_lr=1e-4,
        max_lr=1e-3,
        step_size_up=120_000,
        cycle_momentum=True,
        mode="triangular2",
    )
    load_checkpoint(
        "./logs/resnet18_bigerimages_continue4_chopped/epoch_1/train_25868.pth",
        model,
        # optimizer,  # optimizer state intentionally not restored
    )
    criterion = neg_multi_log_likelihood_batch

    with TensorboardLogger(tb_dir) as tb:
        stage = "stage_0"
        n_epochs = 1
        print(f"Stage - {stage}")

        checkpointer = CheckpointManager(
            logdir=logdir / stage,
            metric=main_metric,
            metric_minimization=minimize_metric,
            save_n_best=5,
        )

        train_loader, (valid_loader, valid_gt_path) = get_loaders(
            train_batch_size=32, valid_batch_size=32
        )
        valid_func = partial(
            valid_fn,
            loader=valid_loader,
            ground_truth_file=valid_gt_path,
            logdir=logdir,
            verbose=True,
        )

        for epoch in range(1, n_epochs + 1):
            epoch_start_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
            print(f"[{epoch_start_time}]\n[Epoch {epoch}/{n_epochs}]")

            try:
                train_metrics = train_fn(
                    model,
                    train_loader,
                    device,
                    criterion,
                    optimizer,
                    scheduler=scheduler,
                    tensorboard_logger=tb,
                    logdir=logdir / f"epoch_{epoch}",
                    validation_fn=valid_func,
                )
                log_metrics(stage, train_metrics, tb, "train", epoch)
            except BaseException:
                train_metrics = {"message": "An exception occurred!"}

            valid_metrics = valid_fn(model, valid_loader, device, valid_gt_path, logdir)
            log_metrics(stage, valid_metrics, tb, "valid", epoch)

            checkpointer.process(
                metric_value=valid_metrics[main_metric],
                epoch=epoch,
                checkpoint=make_checkpoint(
                    stage,
                    epoch,
                    model,
                    optimizer,
                    scheduler,
                    metrics={"train": train_metrics, "valid": valid_metrics},
                ),
            )
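
# ---------------------------------------------------------------------------
# Note: unlike the CosineAnnealingLR runs, CyclicLR counts iterations
# (step_size_up=120_000), so it has to be stepped once per batch; that is why
# the scheduler is passed into train_fn here instead of being stepped per
# epoch. A minimal sketch of such an inner loop under that assumption; the
# batch keys and forward signature mirror the usual l5kit convention and are
# not taken from the project's actual train_fn.
# ---------------------------------------------------------------------------
def _train_one_epoch_sketch(model, loader, device, criterion, optimizer, scheduler=None):
    model.train()
    for batch in loader:
        image = batch["image"].to(device)
        target = batch["target_positions"].to(device)
        avails = batch["target_availabilities"].to(device)
        optimizer.zero_grad()
        pred, confs = model(image)  # assumed to return (trajectories, confidences)
        loss = criterion(target, pred, confs, avails)
        loss.backward()
        optimizer.step()
        if scheduler is not None:
            scheduler.step()  # advance the cyclic LR once per batch, not per epoch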