Esempio n. 1
0
def main() -> None:
    args = get_arguments()

    # configuration
    config = get_config(args.config)

    # save log files in the directory which contains config file.
    result_path = os.path.dirname(args.config)
    experiment_name = os.path.basename(result_path)

    # cpu or cuda
    device = get_device(allow_only_gpu=True)

    # Dataloader
    train_transform = Compose([
        RandomResizedCrop(size=(config.height, config.width)),
        RandomHorizontalFlip(),
        ColorJitter(brightness=0.4, contrast=0.4, saturation=0.4, hue=0.1),
        ToTensor(),
        Normalize(mean=get_mean(), std=get_std()),
    ])

    val_transform = Compose(
        [ToTensor(), Normalize(mean=get_mean(), std=get_std())])

    train_loader = get_dataloader(
        config.train_csv,
        batch_size=config.batch_size,
        shuffle=True,
        num_workers=config.num_workers,
        pin_memory=True,
        drop_last=True,
        transform=train_transform,
    )

    val_loader = get_dataloader(
        config.val_csv,
        batch_size=1,
        shuffle=False,
        num_workers=config.num_workers,
        pin_memory=True,
        transform=val_transform,
    )

    # the number of classes
    n_classes = len(get_cls2id_map())

    # define a model
    model = get_model(config.model, n_classes, pretrained=config.pretrained)

    # send the model to cuda/cpu
    model.to(device)

    optimizer = optim.Adam(model.parameters(), lr=config.learning_rate)

    # keep training and validation log
    begin_epoch = 0
    best_loss = float("inf")
    log = pd.DataFrame(columns=[
        "epoch",
        "lr",
        "train_time[sec]",
        "train_loss",
        "train_acc@1",
        "train_f1s",
        "val_time[sec]",
        "val_loss",
        "val_acc@1",
        "val_f1s",
    ])

    # resume if you want
    if args.resume:
        resume_path = os.path.join(result_path, "checkpoint.pth")
        begin_epoch, model, optimizer, best_loss = resume(
            resume_path, model, optimizer)

        log_path = os.path.join(result_path, "log.csv")
        assert os.path.exists(
            log_path), "there is no checkpoint at the result folder"
        log = pd.read_csv(log_path)

    # criterion for loss
    criterion = get_criterion(config.use_class_weight, config.train_csv,
                              device)

    # Weights and biases
    if not args.no_wandb:
        wandb.init(
            name=experiment_name,
            config=config,
            project="image_classification_template",
            job_type="training",
            dirs="./wandb_result/",
        )
        # Magic
        wandb.watch(model, log="all")

    # train and validate model
    print("---------- Start training ----------")

    for epoch in range(begin_epoch, config.max_epoch):
        # training
        start = time.time()
        train_loss, train_acc1, train_f1s = train(train_loader, model,
                                                  criterion, optimizer, epoch,
                                                  device)
        train_time = int(time.time() - start)

        # validation
        start = time.time()
        val_loss, val_acc1, val_f1s, c_matrix = evaluate(
            val_loader, model, criterion, device)
        val_time = int(time.time() - start)

        # save a model if top1 acc is higher than ever
        if best_loss > val_loss:
            best_loss = val_loss
            torch.save(
                model.state_dict(),
                os.path.join(result_path, "best_model.prm"),
            )

        # save checkpoint every epoch
        save_checkpoint(result_path, epoch, model, optimizer, best_loss)

        # write logs to dataframe and csv file
        tmp = pd.Series(
            [
                epoch,
                optimizer.param_groups[0]["lr"],
                train_time,
                train_loss,
                train_acc1,
                train_f1s,
                val_time,
                val_loss,
                val_acc1,
                val_f1s,
            ],
            index=log.columns,
        )

        log = log.append(tmp, ignore_index=True)
        log.to_csv(os.path.join(result_path, "log.csv"), index=False)

        # save logs to wandb
        if not args.no_wandb:
            wandb.log(
                {
                    "lr": optimizer.param_groups[0]["lr"],
                    "train_time[sec]": train_time,
                    "train_loss": train_loss,
                    "train_acc@1": train_acc1,
                    "train_f1s": train_f1s,
                    "val_time[sec]": val_time,
                    "val_loss": val_loss,
                    "val_acc@1": val_acc1,
                    "val_f1s": val_f1s,
                },
                step=epoch,
            )

        print("""epoch: {}\tepoch time[sec]: {}\tlr: {}\ttrain loss: {:.4f}\t\
            val loss: {:.4f} val_acc1: {:.5f}\tval_f1s: {:.5f}
            """.format(
            epoch,
            train_time + val_time,
            optimizer.param_groups[0]["lr"],
            train_loss,
            val_loss,
            val_acc1,
            val_f1s,
        ))

    # save models
    torch.save(model.state_dict(), os.path.join(result_path,
                                                "final_model.prm"))

    # delete checkpoint
    os.remove(os.path.join(result_path, "checkpoint.pth"))

    print("Done")
Esempio n. 2
0
def main() -> None:
    # argparser
    args = get_arguments()

    # configuration
    config = get_config(args.config)

    result_path = os.path.dirname(args.config)

    seed = args.seed
    random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True

    # cpu or cuda
    device = "cuda" if torch.cuda.is_available() else "cpu"
    if device == "cuda":
        torch.backends.cudnn.benchmark = True

    # Dataloader
    # Temporal downsampling is applied to only videos in 50Salads
    downsamp_rate = 2 if config.dataset == "50salads" else 1

    train_data = ActionSegmentationDataset(
        config.dataset,
        transform=Compose([ToTensor(), TempDownSamp(downsamp_rate)]),
        mode="trainval" if not config.param_search else "training",
        split=config.split,
        dataset_dir=config.dataset_dir,
        csv_dir=config.csv_dir,
    )

    train_loader = DataLoader(
        train_data,
        batch_size=config.batch_size,
        shuffle=True,
        num_workers=config.num_workers,
        drop_last=True if config.batch_size > 1 else False,
        collate_fn=collate_fn,
    )

    # if you do validation to determine hyperparams
    if config.param_search:
        val_data = ActionSegmentationDataset(
            config.dataset,
            transform=Compose([ToTensor(),
                               TempDownSamp(downsamp_rate)]),
            mode="validation",
            split=config.split,
            dataset_dir=config.dataset_dir,
            csv_dir=config.csv_dir,
        )

        val_loader = DataLoader(
            val_data,
            batch_size=1,
            shuffle=False,
            num_workers=config.num_workers,
            collate_fn=collate_fn,
        )

    # load model
    print("---------- Loading Model ----------")

    n_classes = get_n_classes(config.dataset, dataset_dir=config.dataset_dir)

    model = models.ActionSegmentRefinementFramework(
        in_channel=config.in_channel,
        n_features=config.n_features,
        n_classes=n_classes,
        n_stages=config.n_stages,
        n_layers=config.n_layers,
        n_stages_asb=config.n_stages_asb,
        n_stages_brb=config.n_stages_brb,
    )

    # send the model to cuda/cpu
    model.to(device)

    optimizer = get_optimizer(
        config.optimizer,
        model,
        config.learning_rate,
        momentum=config.momentum,
        dampening=config.dampening,
        weight_decay=config.weight_decay,
        nesterov=config.nesterov,
    )

    # resume if you want
    columns = ["epoch", "lr", "train_loss"]

    # if you do validation to determine hyperparams
    if config.param_search:
        columns += ["val_loss", "cls_acc", "edit"]
        columns += [
            "segment f1s@{}".format(config.iou_thresholds[i])
            for i in range(len(config.iou_thresholds))
        ]
        columns += ["bound_acc", "precision", "recall", "bound_f1s"]

    begin_epoch = 0
    best_loss = float("inf")
    log = pd.DataFrame(columns=columns)

    if args.resume:
        if os.path.exists(os.path.join(result_path, "checkpoint.pth")):
            checkpoint = resume(result_path, model, optimizer)
            begin_epoch, model, optimizer, best_loss = checkpoint
            log = pd.read_csv(os.path.join(result_path, "log.csv"))
            print("training will start from {} epoch".format(begin_epoch))
        else:
            print("there is no checkpoint at the result folder")

    # criterion for loss
    if config.class_weight:
        class_weight = get_class_weight(
            config.dataset,
            split=config.split,
            dataset_dir=config.dataset_dir,
            csv_dir=config.csv_dir,
            mode="training" if config.param_search else "trainval",
        )
        class_weight = class_weight.to(device)
    else:
        class_weight = None

    criterion_cls = ActionSegmentationLoss(
        ce=config.ce,
        focal=config.focal,
        tmse=config.tmse,
        gstmse=config.gstmse,
        weight=class_weight,
        ignore_index=255,
        ce_weight=config.ce_weight,
        focal_weight=config.focal_weight,
        tmse_weight=config.tmse_weight,
        gstmse_weight=config.gstmse,
    )

    pos_weight = get_pos_weight(
        dataset=config.dataset,
        split=config.split,
        csv_dir=config.csv_dir,
        mode="training" if config.param_search else "trainval",
    ).to(device)

    criterion_bound = BoundaryRegressionLoss(pos_weight=pos_weight)

    # train and validate model
    print("---------- Start training ----------")

    for epoch in range(begin_epoch, config.max_epoch):
        # training
        train_loss = train(
            train_loader,
            model,
            criterion_cls,
            criterion_bound,
            config.lambda_b,
            optimizer,
            epoch,
            device,
        )

        # if you do validation to determine hyperparams
        if config.param_search:
            (
                val_loss,
                cls_acc,
                edit_score,
                segment_f1s,
                bound_acc,
                precision,
                recall,
                bound_f1s,
            ) = validate(
                val_loader,
                model,
                criterion_cls,
                criterion_bound,
                config.lambda_b,
                device,
                config.dataset,
                config.dataset_dir,
                config.iou_thresholds,
                config.boundary_th,
                config.tolerance,
            )

            # save a model if top1 acc is higher than ever
            if best_loss > val_loss:
                best_loss = val_loss
                torch.save(
                    model.state_dict(),
                    os.path.join(result_path, "best_loss_model.prm"),
                )

        # save checkpoint every epoch
        save_checkpoint(result_path, epoch, model, optimizer, best_loss)

        # write logs to dataframe and csv file
        tmp = [epoch, optimizer.param_groups[0]["lr"], train_loss]

        # if you do validation to determine hyperparams
        if config.param_search:
            tmp += [
                val_loss,
                cls_acc,
                edit_score,
            ]
            tmp += segment_f1s
            tmp += [
                bound_acc,
                precision,
                recall,
                bound_f1s,
            ]

        tmp_df = pd.Series(tmp, index=log.columns)

        log = log.append(tmp_df, ignore_index=True)
        log.to_csv(os.path.join(result_path, "log.csv"), index=False)

        if config.param_search:
            # if you do validation to determine hyperparams
            print(
                "epoch: {}\tlr: {:.4f}\ttrain loss: {:.4f}\tval loss: {:.4f}\tval_acc: {:.4f}\tedit: {:.4f}"
                .format(
                    epoch,
                    optimizer.param_groups[0]["lr"],
                    train_loss,
                    val_loss,
                    cls_acc,
                    edit_score,
                ))
        else:
            print("epoch: {}\tlr: {:.4f}\ttrain loss: {:.4f}".format(
                epoch, optimizer.param_groups[0]["lr"], train_loss))

    # delete checkpoint
    os.remove(os.path.join(result_path, "checkpoint.pth"))

    # save models
    torch.save(model.state_dict(), os.path.join(result_path,
                                                "final_model.prm"))

    print("Done!")
Esempio n. 3
0
File: train.py Progetto: jo-kwsm/ssd
def main():
    args = get_arguments()
    config = get_config(args.config)

    result_path = os.path.dirname(args.config)
    experiment_name = os.path.basename(result_path)

    if os.path.exists(os.path.join(result_path, "final_model.prm")):
        print("Already done.")
        return

    device = get_device(allow_only_gpu=True)

    transform = DataTransform(config.size, get_mean())
    voc_classes = [k for k in get_cls2id_map().keys()]

    train_loader = get_dataloader(
        config.train_csv,
        phase="train",
        batch_size=config.batch_size,
        shuffle=True,
        num_workers=config.num_workers,
        pin_memory=True,
        drop_last=True,
        transform=transform,
        transform_anno=Anno_xml2list(voc_classes),
    )

    val_loader = get_dataloader(
        config.val_csv,
        phase="val",
        batch_size=1,
        shuffle=True,
        num_workers=config.num_workers,
        pin_memory=True,
        drop_last=True,
        transform=transform,
        transform_anno=Anno_xml2list(voc_classes),
    )

    n_classes = len(voc_classes) + 1
    model = get_model(
        input_size=config.size,
        n_classes=n_classes,
        phase="train",
        pretrained=config.pretrained,
    )
    model.to(device)

    optimizer = optim.SGD(
        model.parameters(),
        lr=config.learning_rate,
        momentum=0.9,
        weight_decay=5e-4
    )

    begin_epoch = 0
    best_loss = float("inf")
    # TODO 評価指標の検討
    log = pd.DataFrame(
        columns=[
            "epoch",
            "lr",
            "train_time[sec]",
            "train_loss",
            "val_time[sec]",
            "val_loss",
        ]
    )

    if args.resume:
        resume_path = os.path.join(result_path, "checkpoint.pth")
        begin_epoch, model, optimizer, best_loss = resume(resume_path, model, optimizer)

        log_path = os.path.join(result_path, "log.csv")
        assert os.path.exists(log_path), "there is no checkpoint at the result folder"
        log = pd.read_csv(log_path)

    criterion = get_criterion(device=device)

    print("---------- Start training ----------")

    for epoch in range(begin_epoch, config.max_epoch):
        start = time.time()
        train_loss = train(
            train_loader,
            model,
            criterion,
            optimizer,
            epoch,
            device,
            interval_of_progress=10,
        )
        train_time = int(time.time() - start)

        start = time.time()
        val_loss = evaluate(
            val_loader,
            model,
            criterion,
            device,
        )
        val_time = int(time.time() - start)

        if best_loss > val_loss:
            best_loss = val_loss
            torch.save(
                model.state_dict(),
                os.path.join(result_path, "best_model.prm"),
            )

        save_checkpoint(result_path, epoch, model, optimizer, best_loss)

        tmp = pd.Series(
            [
                epoch,
                optimizer.param_groups[0]["lr"],
                train_time,
                train_loss,
                val_time,
                val_loss,
            ],
            index=log.columns,
        )

        log = log.append(tmp, ignore_index=True)
        log.to_csv(os.path.join(result_path, "log.csv"), index=False)
        make_graphs(os.path.join(result_path, "log.csv"))

        print(
            """epoch: {}\tepoch time[sec]: {}\tlr: {}\ttrain loss: {:.4f}\t\
            val loss: {:.4f}
            """.format(
                epoch,
                train_time + val_time,
                optimizer.param_groups[0]["lr"],
                train_loss,
                val_loss,
            )
        )

    torch.save(model.state_dict(), os.path.join(result_path, "final_model.prm"))

    os.remove(os.path.join(result_path, "checkpoint.pth"))

    print("Done")
Esempio n. 4
0
def main():
    args = get_arguments()
    config = get_config(args.config)

    result_path = os.path.dirname(args.config)
    experiment_name = os.path.basename(result_path)

    if os.path.exists(os.path.join(result_path, "final_model_G.prm")):
        print("Already done.")
        return

    device = get_device(allow_only_gpu=True)

    train_loader = get_dataloader(
        csv_file=config.train_csv,
        batch_size=config.batch_size,
        shuffle=True,
        num_workers=config.num_workers,
        pin_memory=True,
        drop_last=True,
        transform=ImageTransform(mean=get_mean(), std=get_std()),
    )

    model = get_model(config.model, z_dim=config.z_dim, image_size=config.size)
    for v in model.values():
        v.to(device)

    g_optimizer = torch.optim.Adam(
        model["G"].parameters(),
        config.g_lr,
        [config.beta1, config.beta2],
    )
    d_optimizer = torch.optim.Adam(
        model["D"].parameters(),
        config.d_lr,
        [config.beta1, config.beta2],
    )
    optimizer = {
        "G": g_optimizer,
        "D": d_optimizer,
    }

    begin_epoch = 0
    best_loss = float("inf")
    # TODO 評価指標の検討
    log = pd.DataFrame(
        columns=[
            "epoch",
            "d_lr",
            "g_lr",
            "train_time[sec]",
            "train_loss",
            "train_d_loss",
            "train_g_loss",
        ]
    )

    if args.resume:
        resume_path = os.path.join(result_path, "checkpoint_%s.pth")
        begin_epoch, model, optimizer, best_loss = resume(resume_path, model, optimizer)

        log_path = os.path.join(result_path, "log.csv")
        assert os.path.exists(log_path), "there is no checkpoint at the result folder"
        log = pd.read_csv(log_path)

    criterion = nn.BCEWithLogitsLoss(reduction="mean")

    print("---------- Start training ----------")

    for epoch in range(begin_epoch, config.max_epoch):
        start = time.time()
        train_d_loss, train_g_loss,  = train(
            train_loader,
            model,
            config.model,
            criterion,
            optimizer,
            epoch,
            config.z_dim,
            device,
            interval_of_progress=1,
        )
        train_time = int(time.time() - start)

        if best_loss > train_d_loss + train_g_loss:
            best_loss = train_d_loss + train_g_loss
            for k in model.keys():
                torch.save(
                    model[k].state_dict(),
                    os.path.join(result_path, "best_model_%s.prm" % k),
                )

        save_checkpoint(result_path, epoch, model, optimizer, best_loss)

        tmp = pd.Series(
            [
                epoch,
                optimizer["D"].param_groups[0]["lr"],
                optimizer["G"].param_groups[0]["lr"],
                train_time,
                train_d_loss + train_g_loss,
                train_d_loss,
                train_g_loss,
            ],
            index=log.columns,
        )

        log = log.append(tmp, ignore_index=True)
        log.to_csv(os.path.join(result_path, "log.csv"), index=False)
        make_graphs(os.path.join(result_path, "log.csv"))

        print(
            "epoch: {}\tepoch time[sec]: {}\tD_lr: {}\tG_lr: {}\ttrain loss: {:.4f}\ttrain d_loss: {:.4f}\ttrain g_loss: {:.4f}".format(
                epoch,
                train_time,
                optimizer["D"].param_groups[0]["lr"],
                optimizer["G"].param_groups[0]["lr"],
                train_d_loss + train_g_loss,
                train_d_loss,
                train_g_loss,
            )
        )

    for k in model.keys():
        torch.save(
            model[k].state_dict(),
            os.path.join(result_path, "final_model_%s.prm" % k),
        )

    for k in model.keys():
        os.remove(os.path.join(result_path, "checkpoint_%s.pth" % k))

    print("Done")