Exemple #1
0
def main_worker(opt):
    """Run a single validation pass and print the resulting accuracy.

    The model is built from ``opt``, optionally restored from
    ``opt.resume_path``, wrapped by ``make_data_parallel`` and then passed
    once to ``val_epoch`` (epoch index 0, no TensorBoard writer).

    Args:
        opt: Option namespace; this function reads ``resume_path``, ``arch``,
            ``distributed`` and ``device`` from it.
    """
    net = generate_model(opt)
    if opt.resume_path is not None:
        net = resume_model(opt.resume_path, opt.arch, net)
    net = make_data_parallel(net, opt.distributed, opt.device)

    loader, logger = get_val_utils(opt)
    loss_fn = CrossEntropyLoss().to(opt.device)

    # val_epoch yields a pair here; only the second element (accuracy) is
    # reported, the first one is discarded.
    val_result = val_epoch(0, loader, net, loss_fn, opt.device, logger, None,
                           opt.distributed)
    _, accuracy = val_result
    print('Acc ({acc:.3f})'.format(acc=accuracy))
Exemple #2
0
def main_worker(index, opt):
    """Train, validate and optionally run inference in one worker process.

    Args:
        index: Worker/GPU index on this node. When it is non-negative and
            ``opt.device`` is a CUDA device, the worker is pinned to
            ``cuda:<index>``.
        opt: Option namespace. It is mutated in place: ``device``,
            ``dist_rank``, ``batch_size``, ``n_threads``, ``is_master_node``
            and (when resuming training) ``begin_epoch`` are overwritten.

    Raises:
        ValueError: If training uses the 'plateau' scheduler while validation
            is disabled. That scheduler is stepped with the validation loss,
            which would stay ``None`` in that configuration.
    """
    # Fail fast instead of crashing in scheduler.step(None) after the first
    # training epoch has already been spent.
    if not opt.no_train and opt.no_val and opt.lr_scheduler == 'plateau':
        raise ValueError(
            "lr_scheduler='plateau' is stepped with the validation loss and "
            "cannot be combined with no_val.")

    random.seed(opt.manual_seed)
    np.random.seed(opt.manual_seed)
    torch.manual_seed(opt.manual_seed)

    if index >= 0 and opt.device.type == 'cuda':
        opt.device = torch.device('cuda:{}'.format(index))

    if opt.distributed:
        # Global rank = node rank * GPUs per node + local index.
        opt.dist_rank = opt.dist_rank * opt.ngpus_per_node + index
        dist.init_process_group(backend='nccl',
                                init_method=opt.dist_url,
                                world_size=opt.world_size,
                                rank=opt.dist_rank)
        # Split the batch and the loader threads across this node's workers
        # (thread count is rounded up).
        opt.batch_size = opt.batch_size // opt.ngpus_per_node
        opt.n_threads = ((opt.n_threads + opt.ngpus_per_node - 1) //
                         opt.ngpus_per_node)
    opt.is_master_node = not opt.distributed or opt.dist_rank == 0

    model = generate_model(opt)
    if opt.batchnorm_sync:
        assert opt.distributed, 'SyncBatchNorm only supports DistributedDataParallel.'
        model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model)
    if opt.pretrain_path:
        model = load_pretrained_model(model, opt.pretrain_path, opt.model,
                                      opt.n_finetune_classes, opt.strg)

    if opt.strg:
        # Wrap the backbone in STRG and build a separate RPN that is handed
        # to the train/val epochs.
        model = STRG(model, nclass=opt.n_classes, nrois=opt.nrois)
        rpn = RPN(nrois=opt.nrois)
        rpn = make_data_parallel(rpn, opt.distributed, opt.device)
    else:
        rpn = None

    if opt.resume_path is not None:
        model = resume_model(opt.resume_path, opt.arch, model)

    model = make_data_parallel(model, opt.distributed, opt.device)

    # All parameters are optimized; fine-tuning parameter selection is not
    # used in this variant.
    parameters = model.parameters()

    if opt.is_master_node:
        print(model)

    criterion = CrossEntropyLoss().to(opt.device)

    if not opt.no_train:
        (train_loader, train_sampler, train_logger, train_batch_logger,
         optimizer, scheduler) = get_train_utils(opt, parameters)
        if opt.resume_path is not None:
            opt.begin_epoch, optimizer, scheduler = resume_train_utils(
                opt.resume_path, opt.begin_epoch, optimizer, scheduler)
            if opt.overwrite_milestones:
                scheduler.milestones = opt.multistep_milestones
    if not opt.no_val:
        val_loader, val_logger = get_val_utils(opt)

    if opt.tensorboard and opt.is_master_node:
        from tensorboardX import SummaryWriter
        if opt.begin_epoch == 1:
            tb_writer = SummaryWriter(log_dir=opt.result_path)
        else:
            # Resuming: purge events logged at or after the restart step.
            tb_writer = SummaryWriter(log_dir=opt.result_path,
                                      purge_step=opt.begin_epoch)
    else:
        tb_writer = None

    if opt.wandb:
        name = str(opt.result_path)
        wandb.init(project='strg',
                   name=name,
                   config=opt,
                   dir=name,
                   sync_tensorboard=True)

    prev_val_loss = None
    for i in range(opt.begin_epoch, opt.n_epochs + 1):
        if not opt.no_train:
            if opt.distributed:
                train_sampler.set_epoch(i)
            current_lr = get_lr(optimizer)
            train_epoch(i,
                        train_loader,
                        model,
                        criterion,
                        optimizer,
                        opt.device,
                        current_lr,
                        train_logger,
                        train_batch_logger,
                        tb_writer,
                        opt.distributed,
                        rpn=rpn,
                        det_interval=opt.det_interval,
                        nrois=opt.nrois)

            if i % opt.checkpoint == 0 and opt.is_master_node:
                save_file_path = opt.result_path / 'save_{}.pth'.format(i)
                save_checkpoint(save_file_path, i, opt.arch, model, optimizer,
                                scheduler)

        if not opt.no_val:
            prev_val_loss = val_epoch(i,
                                      val_loader,
                                      model,
                                      criterion,
                                      opt.device,
                                      val_logger,
                                      tb_writer,
                                      opt.distributed,
                                      rpn=rpn,
                                      det_interval=opt.det_interval,
                                      nrois=opt.nrois)

        # The scheduler is stepped once per epoch, after validation, so the
        # plateau variant sees this epoch's validation loss.
        if not opt.no_train:
            if opt.lr_scheduler == 'multistep':
                scheduler.step()
            elif opt.lr_scheduler == 'plateau':
                scheduler.step(prev_val_loss)

    # Flush and release the event file; inference below does not log to it.
    if tb_writer is not None:
        tb_writer.close()

    if opt.inference:
        inference_loader, inference_class_names = get_inference_utils(opt)
        inference_result_path = opt.result_path / '{}.json'.format(
            opt.inference_subset)

        inference.inference(inference_loader, model, inference_result_path,
                            inference_class_names, opt.inference_no_average,
                            opt.output_topk)
Exemple #3
0
def main_worker(index, opt):
    """Debug entry point: print model details and render its autograd graph.

    The function seeds the RNGs, builds (and optionally resumes) the model,
    prints the shape of its ``fc`` layer at several points, prints a model
    summary, runs one training batch through the model and renders the
    resulting graph with ``make_dot``. It then returns; no training,
    validation or inference is performed. (The training pipeline that used
    to follow an unconditional ``return`` was unreachable and has been
    removed.)

    Args:
        index: Worker/GPU index. When it is non-negative and ``opt.device``
            is a CUDA device, the worker is pinned to ``cuda:<index>``.
        opt: Option namespace; ``device`` and ``is_master_node`` are
            overwritten in place.
    """
    random.seed(opt.manual_seed)
    np.random.seed(opt.manual_seed)
    torch.manual_seed(opt.manual_seed)

    if index >= 0 and opt.device.type == 'cuda':
        opt.device = torch.device(f'cuda:{index}')

    opt.is_master_node = not opt.distributed or opt.dist_rank == 0

    model = generate_model(opt)
    print('after generating model:', model.fc.in_features, ':',
          model.fc.out_features)
    print('feature weights:', model.fc.weight.shape, ':', model.fc.bias.shape)

    if opt.resume_path is not None:
        model = resume_model(opt.resume_path, opt.arch, model)
    print('after resume model:', model.fc.in_features, ':',
          model.fc.out_features)
    print('feature weights:', model.fc.weight.shape, ':', model.fc.bias.shape)

    # NOTE: no pretrained weights are loaded in this debug variant, so the
    # values printed below are the same as after the resume step.
    print('after pretrained  model:', model.fc.in_features, ':',
          model.fc.out_features)
    print('feature weights:', model.fc.weight.shape, ':', model.fc.bias.shape)
    print(torch_summarize(model))

    # Only the loader is needed; the sampler, loggers, optimizer and
    # scheduler returned alongside it are discarded.
    (train_loader, _, _, _, _, _) = get_train_utils(opt, model.parameters())

    # Push a single batch through the model to obtain a graph to render.
    for inputs, targets in train_loader:
        print('input shape:', inputs.shape)
        print('targets shape:', targets.shape)
        outputs = model(inputs)
        print("output shape", outputs.shape)
        model_arch = make_dot(outputs, params=dict(model.named_parameters()))
        print(model_arch)
        # NOTE(review): graphviz normally appends the format extension to
        # the given filename, so the image likely ends up at
        # '/apollo/data/model.png.png' - confirm the intended path.
        model_arch.render("/apollo/data/model.png", format="png")
        break
Exemple #4
0
def main_worker(index, opt):
    """Train, validate and optionally run inference in one worker process.

    Args:
        index: Worker/GPU index on this node. When it is non-negative and
            ``opt.device`` is a CUDA device, the worker is pinned to
            ``cuda:<index>``.
        opt: Option namespace. It is mutated in place: ``device``,
            ``dist_rank``, ``batch_size``, ``n_threads``, ``is_master_node``
            and (when resuming) ``begin_epoch`` are overwritten.

    Raises:
        ValueError: If training uses the 'plateau' scheduler while validation
            is disabled. That scheduler is stepped with the validation loss,
            which would stay ``None`` in that configuration.
    """
    # Fail fast instead of crashing in scheduler.step(None) after the first
    # training epoch has already been spent.
    if not opt.no_train and opt.no_val and opt.lr_scheduler == 'plateau':
        raise ValueError(
            "lr_scheduler='plateau' is stepped with the validation loss and "
            "cannot be combined with no_val.")

    random.seed(opt.manual_seed)
    np.random.seed(opt.manual_seed)
    torch.manual_seed(opt.manual_seed)

    if index >= 0 and opt.device.type == 'cuda':
        opt.device = torch.device(f'cuda:{index}')

    if opt.distributed:
        # Global rank = node rank * GPUs per node + local index.
        opt.dist_rank = opt.dist_rank * opt.ngpus_per_node + index
        dist.init_process_group(backend='nccl',
                                init_method=opt.dist_url,
                                world_size=opt.world_size,
                                rank=opt.dist_rank)
        # Split the batch and the loader threads across this node's workers
        # (thread count is rounded up).
        opt.batch_size = opt.batch_size // opt.ngpus_per_node
        opt.n_threads = ((opt.n_threads + opt.ngpus_per_node - 1) //
                         opt.ngpus_per_node)
    opt.is_master_node = not opt.distributed or opt.dist_rank == 0

    model = generate_model(opt)
    if opt.pretrain_path:
        model = load_pretrained_model(model, opt.pretrain_path, opt.model,
                                      opt.n_finetune_classes)

    model = make_data_parallel(model, opt.distributed, opt.device)

    if opt.pretrain_path:
        parameters = get_fine_tuning_parameters(model, opt.ft_begin_module)
    else:
        parameters = model.parameters()

    if opt.is_master_node:
        print(model)

    criterion = CrossEntropyLoss().to(opt.device)

    if not opt.no_train:
        (train_loader, train_sampler, train_logger, train_batch_logger,
         optimizer, scheduler) = get_train_utils(opt, parameters)
    if not opt.no_val:
        val_loader, val_logger = get_val_utils(opt)

    # In this variant the checkpoint is restored after the model has been
    # wrapped by make_data_parallel.
    if opt.resume_path is not None:
        if not opt.no_train:
            opt.begin_epoch, model, optimizer, scheduler = resume(
                opt.resume_path, opt.arch, opt.begin_epoch, model, optimizer,
                scheduler)
            if opt.overwrite_milestones:
                scheduler.milestones = opt.multistep_milestones
        else:
            # No optimizer/scheduler exist when training is disabled; only
            # the model and the start epoch are taken from the checkpoint.
            opt.begin_epoch, model, _, _ = resume(opt.resume_path, opt.arch,
                                                  opt.begin_epoch, model)

    if opt.tensorboard and opt.is_master_node:
        from torch.utils.tensorboard import SummaryWriter
        if opt.begin_epoch == 1:
            tb_writer = SummaryWriter(log_dir=opt.result_path)
        else:
            # Resuming: purge events logged at or after the restart step.
            tb_writer = SummaryWriter(log_dir=opt.result_path,
                                      purge_step=opt.begin_epoch)
    else:
        tb_writer = None

    prev_val_loss = None
    for i in range(opt.begin_epoch, opt.n_epochs + 1):
        if not opt.no_train:
            if opt.distributed:
                train_sampler.set_epoch(i)
            current_lr = get_lr(optimizer)
            train_epoch(i, train_loader, model, criterion, optimizer,
                        opt.device, current_lr, train_logger,
                        train_batch_logger, tb_writer, opt.distributed)

            if i % opt.checkpoint == 0 and opt.is_master_node:
                save_file_path = opt.result_path / 'save_{}.pth'.format(i)
                save_checkpoint(save_file_path, i, opt.arch, model, optimizer,
                                scheduler)

        if not opt.no_val:
            prev_val_loss = val_epoch(i, val_loader, model, criterion,
                                      opt.device, val_logger, tb_writer,
                                      opt.distributed)

        # The scheduler is stepped once per epoch, after validation, so the
        # plateau variant sees this epoch's validation loss.
        if not opt.no_train:
            if opt.lr_scheduler == 'multistep':
                scheduler.step()
            elif opt.lr_scheduler == 'plateau':
                scheduler.step(prev_val_loss)

    # Flush and release the event file; inference below does not log to it.
    if tb_writer is not None:
        tb_writer.close()

    if opt.inference:
        inference_loader, inference_class_names = get_inference_utils(opt)
        inference_result_path = opt.result_path / '{}.json'.format(
            opt.inference_subset)

        inference.inference(inference_loader, model, inference_result_path,
                            inference_class_names, opt.inference_no_average,
                            opt.output_topk)
def main_worker(index, opt):
    """Train, validate and optionally run inference in one worker process.

    This variant additionally supports replacing the final layer with a
    dropout-equipped one, a label-smoothing loss, a learning-rate range test
    (which returns early without training) and a 'cosineannealing'
    scheduler.

    Args:
        index: Worker/GPU index on this node. When it is non-negative and
            ``opt.device`` is a CUDA device, the worker is pinned to
            ``cuda:<index>``.
        opt: Option namespace. It is mutated in place: ``device``,
            ``dist_rank``, ``batch_size``, ``n_threads``, ``is_master_node``
            and (when resuming training) ``begin_epoch`` are overwritten.

    Raises:
        ValueError: If training uses the 'plateau' scheduler while validation
            is disabled. That scheduler is stepped with the validation loss,
            which would stay ``None`` in that configuration.
    """
    # Fail fast instead of crashing in scheduler.step(None) after the first
    # training epoch has already been spent.
    if not opt.no_train and opt.no_val and opt.lr_scheduler == 'plateau':
        raise ValueError(
            "lr_scheduler='plateau' is stepped with the validation loss and "
            "cannot be combined with no_val.")

    random.seed(opt.manual_seed)
    np.random.seed(opt.manual_seed)
    torch.manual_seed(opt.manual_seed)

    if index >= 0 and opt.device.type == 'cuda':
        opt.device = torch.device(f'cuda:{index}')

    if opt.distributed:
        # Global rank = node rank * GPUs per node + local index.
        opt.dist_rank = opt.dist_rank * opt.ngpus_per_node + index
        dist.init_process_group(backend='nccl',
                                init_method=opt.dist_url,
                                world_size=opt.world_size,
                                rank=opt.dist_rank)
        # Split the batch and the loader threads across this node's workers
        # (thread count is rounded up).
        opt.batch_size = opt.batch_size // opt.ngpus_per_node
        opt.n_threads = ((opt.n_threads + opt.ngpus_per_node - 1) //
                         opt.ngpus_per_node)
    opt.is_master_node = not opt.distributed or opt.dist_rank == 0

    model = generate_model(opt)
    if opt.batchnorm_sync:
        assert opt.distributed, 'SyncBatchNorm only supports DistributedDataParallel.'
        model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model)
    if opt.pretrain_path:
        model = load_pretrained_model(model, opt.pretrain_path, opt.model,
                                      opt.n_finetune_classes)
    if opt.dropout:
        # The replacement head is sized for the fine-tuning classes whenever
        # a pretrain path is set, otherwise for the regular class count.
        n_classes = opt.n_classes
        if opt.pretrain_path is not None:
            n_classes = opt.n_finetune_classes
        model = replace_fc_layer(model=model,
                                 dropout_factor=opt.dropout_factor,
                                 n_classes=n_classes)
    if opt.resume_path is not None:
        model = resume_model(opt.resume_path, opt.arch, model)

    model = make_data_parallel(model, opt.distributed, opt.device)
    if opt.pretrain_path:
        parameters = get_fine_tuning_parameters(model, opt.ft_begin_module)
    else:
        parameters = model.parameters()

    if opt.is_master_node:
        print(model)

    if opt.labelsmoothing:
        criterion = LabelSmoothingCrossEntropy().to(opt.device)
    else:
        criterion = CrossEntropyLoss().to(opt.device)

    if not opt.no_train:
        (train_loader, train_sampler, train_logger, train_batch_logger,
         optimizer, scheduler) = get_train_utils(opt, parameters)
        if opt.resume_path is not None:
            opt.begin_epoch, optimizer, scheduler = resume_train_utils(
                opt.resume_path, opt.begin_epoch, optimizer, scheduler)
            if opt.overwrite_milestones:
                scheduler.milestones = opt.multistep_milestones
    if not opt.no_val:
        val_loader, val_logger = get_val_utils(opt)

    if opt.tensorboard and opt.is_master_node:
        from torch.utils.tensorboard import SummaryWriter
        if opt.begin_epoch == 1:
            tb_writer = SummaryWriter(log_dir=opt.result_path)
        else:
            # Resuming: purge events logged at or after the restart step.
            tb_writer = SummaryWriter(log_dir=opt.result_path,
                                      purge_step=opt.begin_epoch)
    else:
        tb_writer = None

    # The range test needs both the training and the validation loader; it
    # replaces the normal run entirely.
    if opt.lr_finder and not opt.no_train and not opt.no_val:
        print(
            "Performing Learning Rate Search\nWith Leslie Smith's approach...")
        lr_finder = LRFinder(model, optimizer, criterion, device=opt.device)
        lr_finder.range_test(train_loader,
                             val_loader=val_loader,
                             start_lr=opt.learning_rate,
                             end_lr=opt.lrf_end_lr,
                             num_iter=opt.lrf_num_it,
                             step_mode=opt.lrf_mode)
        lr_finder.plot(log_lr=False)
        with (opt.result_path / 'lr_search.json').open('w') as results_file:
            json.dump(lr_finder.history, results_file, default=json_serial)
        lr_finder.reset()
        if tb_writer is not None:
            tb_writer.close()
        return

    prev_val_loss = None
    for i in range(opt.begin_epoch, opt.n_epochs + 1):
        if not opt.no_train:
            if opt.distributed:
                train_sampler.set_epoch(i)
            # Unlike the other variants, train_epoch receives the scheduler
            # and its kind instead of the current learning rate.
            train_epoch(i, train_loader, model, criterion, optimizer,
                        opt.device, train_logger, train_batch_logger,
                        scheduler, opt.lr_scheduler, tb_writer,
                        opt.distributed)

            if i % opt.checkpoint == 0 and opt.is_master_node:
                save_file_path = opt.result_path / 'save_{}.pth'.format(i)
                save_checkpoint(save_file_path, i, opt.arch, model, optimizer,
                                scheduler)

        if not opt.no_val:
            prev_val_loss = val_epoch(i, val_loader, model, criterion,
                                      opt.device, val_logger, tb_writer,
                                      opt.distributed)

        # The scheduler is stepped once per epoch, after validation, so the
        # plateau variant sees this epoch's validation loss.
        if not opt.no_train:
            if opt.lr_scheduler in ('multistep', 'cosineannealing'):
                scheduler.step()
            elif opt.lr_scheduler == 'plateau':
                scheduler.step(prev_val_loss)

    # Flush and release the event file; inference below does not log to it.
    if tb_writer is not None:
        tb_writer.close()

    if opt.inference:
        inference_loader, inference_class_names = get_inference_utils(opt)
        inference_result_path = opt.result_path / '{}.json'.format(
            opt.inference_subset)

        inference.inference(inference_loader, model, inference_result_path,
                            inference_class_names, opt.inference_no_average,
                            opt.output_topk)
Exemple #6
0
def main_worker(index, opt):
    """Train, validate and optionally run inference, with MLflow tracking.

    Besides the usual pipeline this variant can tag the MLflow run, replace
    the model by a version loaded from the MLflow model registry, log the
    model at every checkpoint and log the validation loss every epoch.

    Args:
        index: Worker/GPU index on this node. When it is non-negative and
            ``opt.device`` is a CUDA device, the worker is pinned to
            ``cuda:<index>``.
        opt: Option namespace. It is mutated in place: ``device``,
            ``dist_rank``, ``batch_size``, ``n_threads``, ``is_master_node``
            and (when resuming training) ``begin_epoch`` are overwritten.

    Raises:
        ValueError: If training uses the 'plateau' scheduler while validation
            is disabled. That scheduler is stepped with the validation loss,
            which would stay ``None`` in that configuration.
    """
    # Fail fast instead of crashing in scheduler.step(None) after the first
    # training epoch has already been spent.
    if not opt.no_train and opt.no_val and opt.lr_scheduler == 'plateau':
        raise ValueError(
            "lr_scheduler='plateau' is stepped with the validation loss and "
            "cannot be combined with no_val.")

    random.seed(opt.manual_seed)
    np.random.seed(opt.manual_seed)
    torch.manual_seed(opt.manual_seed)

    if index >= 0 and opt.device.type == 'cuda':
        opt.device = torch.device(f'cuda:{index}')

    if opt.distributed:
        # Global rank = node rank * GPUs per node + local index.
        opt.dist_rank = opt.dist_rank * opt.ngpus_per_node + index
        dist.init_process_group(backend='nccl',
                                init_method=opt.dist_url,
                                world_size=opt.world_size,
                                rank=opt.dist_rank)
        # Split the batch and the loader threads across this node's workers
        # (thread count is rounded up).
        opt.batch_size = opt.batch_size // opt.ngpus_per_node
        opt.n_threads = ((opt.n_threads + opt.ngpus_per_node - 1) //
                         opt.ngpus_per_node)
    opt.is_master_node = not opt.distributed or opt.dist_rank == 0

    model = generate_model(opt)
    if opt.batchnorm_sync:
        assert opt.distributed, 'SyncBatchNorm only supports DistributedDataParallel.'
        model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model)
    if opt.pretrain_path:
        model = load_pretrained_model(model, opt.pretrain_path, opt.model,
                                      opt.n_finetune_classes)
    if opt.resume_path is not None:
        model = resume_model(opt.resume_path, opt.arch, model)
        print('resume model from ', opt.resume_path)

    print('model after resume:', model)

    # Tag the active MLflow run when both a tag name and a value are given.
    if opt.ml_tag_name != '' and opt.ml_tag_value != '':
        mlflow.set_tag(opt.ml_tag_name, opt.ml_tag_value)

    # A registered model version, when requested, replaces whatever model
    # was built/resumed above.
    if opt.ml_model_name != '' and opt.ml_model_version != '':
        model_uri = "models:/{}/{}".format(opt.ml_model_name,
                                           opt.ml_model_version)
        model = mlflow.pytorch.load_model(model_uri)

    model = make_data_parallel(model, opt.distributed, opt.device)

    if opt.pretrain_path:
        parameters = get_fine_tuning_parameters(model, opt.ft_begin_module)
    else:
        parameters = model.parameters()

    if opt.is_master_node:
        print(model)

    criterion = CrossEntropyLoss().to(opt.device)

    if not opt.no_train:
        (train_loader, train_sampler, train_logger, train_batch_logger,
         optimizer, scheduler) = get_train_utils(opt, parameters)
        if opt.resume_path is not None:
            opt.begin_epoch, optimizer, scheduler = resume_train_utils(
                opt.resume_path, opt.begin_epoch, optimizer, scheduler)
            if opt.overwrite_milestones:
                scheduler.milestones = opt.multistep_milestones
    if not opt.no_val:
        val_loader, val_logger = get_val_utils(opt)

    if opt.tensorboard and opt.is_master_node:
        from torch.utils.tensorboard import SummaryWriter
        if opt.begin_epoch == 1:
            tb_writer = SummaryWriter(log_dir=opt.result_path)
        else:
            # Resuming: purge events logged at or after the restart step.
            tb_writer = SummaryWriter(log_dir=opt.result_path,
                                      purge_step=opt.begin_epoch)
    else:
        tb_writer = None

    prev_val_loss = None
    for i in range(opt.begin_epoch, opt.n_epochs + 1):
        if not opt.no_train:
            if opt.distributed:
                train_sampler.set_epoch(i)
            current_lr = get_lr(optimizer)
            train_epoch(i, train_loader, model, criterion, optimizer,
                        opt.device, current_lr, train_logger,
                        train_batch_logger, tb_writer, opt.distributed)

            if i % opt.checkpoint == 0 and opt.is_master_node:
                save_file_path = opt.result_path / 'save_{}.pth'.format(i)
                save_checkpoint(save_file_path, i, opt.arch, model, optimizer,
                                scheduler)
                # Also store the checkpointed model as an MLflow artifact.
                if opt.ml_model_name != '':
                    mlflow.pytorch.log_model(model, opt.ml_model_name)

        if not opt.no_val:
            prev_val_loss = val_epoch(i, val_loader, model, criterion,
                                      opt.device, val_logger, tb_writer,
                                      opt.distributed)
            mlflow.log_metric("loss", prev_val_loss)

        # The scheduler is stepped once per epoch, after validation, so the
        # plateau variant sees this epoch's validation loss.
        if not opt.no_train:
            if opt.lr_scheduler == 'multistep':
                scheduler.step()
            elif opt.lr_scheduler == 'plateau':
                scheduler.step(prev_val_loss)

    # Flush and release the event file; inference below does not log to it.
    if tb_writer is not None:
        tb_writer.close()

    if opt.inference:
        inference_loader, inference_class_names = get_inference_utils(opt)
        inference_result_path = opt.result_path / '{}.json'.format(
            opt.inference_subset)

        inference.inference(inference_loader, model, inference_result_path,
                            inference_class_names, opt.inference_no_average,
                            opt.output_topk)
Exemple #7
0
def main_worker(index, opt):
    """Train with focal loss, validate, optionally infer, and dump confusion data.

    Differences from the baseline pipeline: the criterion is ``FocalLoss``,
    the learning-rate scheduler is created and checkpointed but never
    stepped, and a dictionary passed to every ``val_epoch`` call is written
    to ``conf_mtxs.csv`` (one ``key, value`` row per entry) at the end.

    Args:
        index: Worker/GPU index on this node. When it is non-negative and
            ``opt.device`` is a CUDA device, the worker is pinned to
            ``cuda:<index>``.
        opt: Option namespace. It is mutated in place: ``device``,
            ``dist_rank``, ``batch_size``, ``n_threads``, ``is_master_node``
            and (when resuming training) ``begin_epoch`` are overwritten.
    """
    random.seed(opt.manual_seed)
    np.random.seed(opt.manual_seed)
    torch.manual_seed(opt.manual_seed)

    if index >= 0 and opt.device.type == 'cuda':
        opt.device = torch.device(f'cuda:{index}')

    if opt.distributed:
        # Global rank = node rank * GPUs per node + local index.
        opt.dist_rank = opt.dist_rank * opt.ngpus_per_node + index
        dist.init_process_group(backend='nccl',
                                init_method=opt.dist_url,
                                world_size=opt.world_size,
                                rank=opt.dist_rank)
        # Split the batch and the loader threads across this node's workers
        # (thread count is rounded up).
        opt.batch_size = opt.batch_size // opt.ngpus_per_node
        opt.n_threads = ((opt.n_threads + opt.ngpus_per_node - 1) //
                         opt.ngpus_per_node)
    opt.is_master_node = not opt.distributed or opt.dist_rank == 0

    model = generate_model(opt)
    if opt.batchnorm_sync:
        assert opt.distributed, 'SyncBatchNorm only supports DistributedDataParallel.'
        model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model)
    if opt.pretrain_path:
        model = load_pretrained_model(model, opt.pretrain_path, opt.model,
                                      opt.n_finetune_classes)
    if opt.resume_path is not None:
        model = resume_model(opt.resume_path, opt.arch, model)
    model = make_data_parallel(model, opt.distributed, opt.device)

    if opt.pretrain_path:
        parameters = get_fine_tuning_parameters(model, opt.ft_begin_module)
    else:
        parameters = model.parameters()

    if opt.is_master_node:
        print(model)

    # ADDED for 231n: focal loss replaces the default cross-entropy loss.
    #criterion = CrossEntropyLoss().to(opt.device)
    criterion = FocalLoss().to(opt.device)

    if not opt.no_train:
        (train_loader, train_sampler, train_logger, train_batch_logger,
         optimizer, scheduler) = get_train_utils(opt, parameters)
        if opt.resume_path is not None:
            opt.begin_epoch, optimizer, scheduler = resume_train_utils(
                opt.resume_path, opt.begin_epoch, optimizer, scheduler)
            if opt.overwrite_milestones:
                scheduler.milestones = opt.multistep_milestones
    if not opt.no_val:
        val_loader, val_logger = get_val_utils(opt)

    if opt.tensorboard and opt.is_master_node:
        from torch.utils.tensorboard import SummaryWriter
        if opt.begin_epoch == 1:
            tb_writer = SummaryWriter(log_dir=opt.result_path)
        else:
            # Resuming: purge events logged at or after the restart step.
            tb_writer = SummaryWriter(log_dir=opt.result_path,
                                      purge_step=opt.begin_epoch)
    else:
        tb_writer = None

    # ADDED for CS231n: handed to val_epoch every epoch and written to CSV
    # at the end; presumably val_epoch fills it in - verify against val_epoch.
    conf_mtx_dict = {}

    prev_val_loss = None
    for i in range(opt.begin_epoch, opt.n_epochs + 1):
        if not opt.no_train:
            if opt.distributed:
                train_sampler.set_epoch(i)
            current_lr = get_lr(optimizer)
            train_epoch(i, train_loader, model, criterion, optimizer,
                        opt.device, current_lr, train_logger,
                        train_batch_logger, tb_writer, opt.distributed)

            if i % opt.checkpoint == 0 and opt.is_master_node:
                save_file_path = opt.result_path / 'save_{}.pth'.format(i)
                save_checkpoint(save_file_path, i, opt.arch, model, optimizer,
                                scheduler)

        if not opt.no_val:
            prev_val_loss = val_epoch(i, val_loader, model, criterion,
                                      opt.device, val_logger, tb_writer,
                                      opt.distributed,
                                      conf_mtx_dict)  # ADDED for CS231n

        # ADDED for 231n - uncomment if using cross entropy loss
        # (while this stays commented out the scheduler is never stepped)
        #if not opt.no_train and opt.lr_scheduler == 'multistep':
        #    scheduler.step()
        #elif not opt.no_train and opt.lr_scheduler == 'plateau':
        #    scheduler.step(prev_val_loss)

    # Flush and release the event file; nothing below logs to it.
    if tb_writer is not None:
        tb_writer.close()

    if opt.inference:
        inference_loader, inference_class_names = get_inference_utils(opt)
        inference_result_path = opt.result_path / '{}.json'.format(
            opt.inference_subset)

        inference.inference(inference_loader, model, inference_result_path,
                            inference_class_names, opt.inference_no_average,
                            opt.output_topk)

    # ADDED for CS231n: dump the collected entries. The context manager
    # guarantees the file is flushed and closed; newline='' is what the csv
    # module expects so that it controls line endings itself.
    with open("conf_mtxs.csv", "w+", newline="") as conf_mtx_fh:
        conf_mtx_file = csv.writer(conf_mtx_fh)
        for key, val in conf_mtx_dict.items():
            conf_mtx_file.writerow([key, val])
Exemple #8
0
def _aux_checkpoint_path(resume_path, tag):
    """Return the checkpoint path of an auxiliary head stored next to ``resume_path``.

    The auxiliary file lives in the same directory and is named
    ``<stem>_<tag>.pth`` (for example ``save_10.pth`` -> ``save_10_audio.pth``),
    which mirrors the names written by the checkpointing code in
    ``main_worker``.

    Args:
        resume_path: ``pathlib.Path`` of the main model checkpoint.
        tag: modality suffix, ``'audio'`` or ``'image'``.
    """
    # Path.stem strips the real extension instead of assuming it is exactly
    # four characters long.
    return resume_path.with_name('{}_{}.pth'.format(resume_path.stem, tag))


def _build_joint_prediction_head(opt, tag):
    """Create, optionally restore, and place the joint prediction head for ``tag``.

    Args:
        opt: run options; reads ``pretrain_path``, ``n_finetune_classes``,
            ``n_classes``, ``resume_path``, ``arch``, ``distributed`` and
            ``device``.
        tag: modality suffix (``'audio'`` or ``'image'``) selecting which
            auxiliary checkpoint is loaded when resuming.

    Returns:
        The head as returned by ``make_data_parallel``.
    """
    feature_dim = 512 * 2
    if opt.pretrain_path is not None:
        n_classes = opt.n_finetune_classes
    else:
        n_classes = opt.n_classes
    head = generate_prediction(feature_dim, n_classes, normalization=True)

    if opt.resume_path is not None:
        head = resume_model(_aux_checkpoint_path(opt.resume_path, tag),
                            opt.arch, head)

    head = make_data_parallel(head, opt.distributed, opt.device)
    # Honour the configured device: an unconditional .cuda() fails on
    # CPU-only runs and can disagree with where the main model was placed.
    head.to(opt.device)
    return head


def main_worker(index, opt):
    """Run training, validation and/or inference for one worker process.

    Steps, in order:
      1. Seed ``random``, ``numpy`` and ``torch`` with ``opt.manual_seed``.
      2. Select ``cuda:<index>`` when running on CUDA and, if
         ``opt.distributed``, join the NCCL process group and split
         ``opt.batch_size`` / ``opt.n_threads`` across the GPUs of the node.
      3. Build the model (``use_features=True`` unless ``opt.inference``),
         apply pretrained / resumed weights and wrap it for data parallelism.
      4. Unless ``opt.inference``: build the losses, the optional audio and
         image teacher heads, and the training utilities.
      5. Loop over epochs: train, checkpoint, validate and step the scheduler.
      6. If ``opt.inference``: write predictions to
         ``<result_path>/<inference_subset>.json``.

    Args:
        index: index of this process on the node; used as the CUDA device
            index and to derive the global distributed rank.
        opt: options namespace. It is mutated in place (``device``,
            ``dist_rank``, ``batch_size``, ``n_threads``, ``is_master_node``
            and, when resuming, ``begin_epoch``).

    Returns:
        None.
    """
    random.seed(opt.manual_seed)
    np.random.seed(opt.manual_seed)
    torch.manual_seed(opt.manual_seed)

    if index >= 0 and opt.device.type == 'cuda':
        opt.device = torch.device(f'cuda:{index}')

    if opt.distributed:
        # Global rank = node rank * GPUs per node + local process index.
        opt.dist_rank = opt.dist_rank * opt.ngpus_per_node + index
        dist.init_process_group(backend='nccl',
                                init_method=opt.dist_url,
                                world_size=opt.world_size,
                                rank=opt.dist_rank)
        # Each process handles its share of the batch and of the loader
        # threads (thread count rounded up).
        opt.batch_size = int(opt.batch_size / opt.ngpus_per_node)
        opt.n_threads = int(
            (opt.n_threads + opt.ngpus_per_node - 1) / opt.ngpus_per_node)
    opt.is_master_node = not opt.distributed or opt.dist_rank == 0

    if opt.inference:
        model = generate_model(opt)
    else:
        model = generate_model(opt, use_features=True)

    if opt.batchnorm_sync:
        assert opt.distributed, 'SyncBatchNorm only supports DistributedDataParallel.'
        model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model)
    if opt.pretrain_path:
        model = load_pretrained_model(model, opt.pretrain_path,
                                      opt.n_finetune_classes)
    if opt.resume_path is not None:
        model = resume_model(opt.resume_path, opt.arch, model)
    model = make_data_parallel(model, opt.distributed, opt.device)

    if opt.pretrain_path:
        parameters = get_fine_tuning_parameters(model, opt.ft_begin_module)
    else:
        parameters = model.parameters()

    if opt.is_master_node:
        print(model)

    # Training-only setup: losses, teacher heads, optimizers and scheduler.
    # NOTE(review): every name bound in this branch is unbound when
    # opt.inference is set, so an inference run must also set opt.no_train
    # (or have an empty epoch range); otherwise the epoch loop below raises
    # NameError.
    # Truthiness (rather than `is False`) keeps this gate consistent with the
    # model-variant selection above.
    if not opt.inference:
        criterion = CrossEntropyLoss().to(opt.device)

        if opt.use_audio or opt.use_image:
            criterion_jsd = JSDLoss(weight=0.5)

        if opt.use_audio:
            criterion_ct_av = NCELoss(temperature=0.5)
            joint_prediction_aud = _build_joint_prediction_head(opt, 'audio')
            aud_para = joint_prediction_aud.parameters()
        else:
            aud_para = None

        if opt.use_image:
            criterion_ct_iv = NCELoss(temperature=0.1)
            # Image teacher: ImageNet-pretrained ResNet-34 without its final
            # fc layer, so only image features are produced.
            image_model = torchvision.models.resnet34(pretrained=True)
            image_model = torch.nn.Sequential(
                *list(image_model.children())[:-1])
            image_model = make_data_parallel(image_model, opt.distributed,
                                             opt.device)
            joint_prediction_img = _build_joint_prediction_head(opt, 'image')
            img_para = joint_prediction_img.parameters()
        else:
            img_para = None

        (train_loader, train_sampler, train_logger, train_batch_logger,
         optimizer, optimizer_av, optimizer_iv,
         scheduler) = get_train_utils(opt,
                                      model_parameters=parameters,
                                      av_parameters=aud_para,
                                      iv_parameters=img_para)

        if opt.resume_path is not None:
            opt.begin_epoch, optimizer, scheduler = resume_train_utils(
                opt.resume_path, opt.begin_epoch, optimizer, scheduler)
            if opt.overwrite_milestones:
                scheduler.milestones = opt.multistep_milestones

    if not opt.no_val:
        val_loader, val_logger = get_val_utils(opt)

    if opt.tensorboard and opt.is_master_node:
        from torch.utils.tensorboard import SummaryWriter
        if opt.begin_epoch == 1:
            tb_writer = SummaryWriter(log_dir=opt.result_path)
        else:
            # Resuming: drop events logged at or after the restart epoch.
            tb_writer = SummaryWriter(log_dir=opt.result_path,
                                      purge_step=opt.begin_epoch)
    else:
        tb_writer = None

    prev_val_loss = None
    pre_val_acc = 0.0
    # image_size is only forwarded to the image-teacher training functions and
    # only when it exceeds the video sample size.
    if opt.image_size > opt.sample_size:
        image_size = opt.image_size
    else:
        image_size = None

    for i in range(opt.begin_epoch, opt.n_epochs + 1):
        # NOTE(review): checkpointing, validation and scheduler stepping are
        # all part of the training branch, so nothing happens per epoch when
        # opt.no_train is set (validation-only runs are not performed).
        if opt.no_train:
            continue

        if opt.distributed:
            train_sampler.set_epoch(i)
        current_lr = get_lr(optimizer)

        # Keyword arguments shared by every training variant.
        common_kwargs = dict(epoch=i,
                             data_loader=train_loader,
                             model=model,
                             criterion=criterion,
                             optimizer=optimizer,
                             device=opt.device,
                             current_lr=current_lr,
                             epoch_logger=train_logger,
                             batch_logger=train_batch_logger,
                             tb_writer=tb_writer,
                             distributed=opt.distributed)

        # The training variant is chosen by which auxiliary optimizers
        # get_train_utils returned.
        if optimizer_av is None and optimizer_iv is None:
            train_epoch(**common_kwargs)
        elif optimizer_av is not None and optimizer_iv is None:
            train_a_epoch(joint_prediction_aud=joint_prediction_aud,
                          criterion_jsd=criterion_jsd,
                          criterion_ct_av=criterion_ct_av,
                          optimizer_av=optimizer_av,
                          **common_kwargs)
        elif optimizer_av is None and optimizer_iv is not None:
            train_i_epoch(image_model=image_model,
                          joint_prediction_img=joint_prediction_img,
                          criterion_jsd=criterion_jsd,
                          criterion_ct_iv=criterion_ct_iv,
                          optimizer_iv=optimizer_iv,
                          image_size=image_size,
                          **common_kwargs)
        else:
            train_ai_epoch(image_model=image_model,
                           joint_prediction_aud=joint_prediction_aud,
                           joint_prediction_img=joint_prediction_img,
                           criterion_jsd=criterion_jsd,
                           criterion_ct_av=criterion_ct_av,
                           criterion_ct_iv=criterion_ct_iv,
                           optimizer_av=optimizer_av,
                           optimizer_iv=optimizer_iv,
                           image_size=image_size,
                           loss_weight=opt.loss_weight,
                           **common_kwargs)

        if i % opt.checkpoint == 0 and opt.is_master_node:
            save_file_path = opt.result_path / 'save_{}.pth'.format(i)
            save_checkpoint(save_file_path, i, opt.arch, model, optimizer,
                            scheduler)
            # The teacher heads are stored next to the main checkpoint so a
            # later resume can find them by name.
            if opt.use_audio:
                save_file_path = opt.result_path / 'save_{}_audio.pth'.format(
                    i)
                save_checkpoint(save_file_path, i, opt.arch,
                                joint_prediction_aud, optimizer, scheduler)
            if opt.use_image:
                save_file_path = opt.result_path / 'save_{}_image.pth'.format(
                    i)
                save_checkpoint(save_file_path, i, opt.arch,
                                joint_prediction_img, optimizer, scheduler)

        if not opt.no_val and i % opt.val_freq == 0:
            prev_val_loss, val_acc = val_epoch(i, val_loader, model,
                                               criterion, opt.device,
                                               val_logger, tb_writer,
                                               opt.distributed)
            # Keep a separate copy of the best model seen so far.
            if pre_val_acc < val_acc:
                pre_val_acc = val_acc
                save_file_path = opt.result_path / 'save_model.pth'
                save_checkpoint(save_file_path, i, opt.arch, model,
                                optimizer, scheduler)

        if opt.lr_scheduler == 'multistep':
            scheduler.step()
        elif opt.lr_scheduler == 'plateau':
            # The plateau scheduler needs a validation loss; skip the step
            # until one is available.
            if prev_val_loss is not None:
                scheduler.step(prev_val_loss)

    # Flush and release the event file; nothing below logs to TensorBoard.
    if tb_writer is not None:
        tb_writer.close()

    if opt.inference:
        inference_loader, inference_class_names = get_inference_utils(opt)
        inference_result_path = opt.result_path / '{}.json'.format(
            opt.inference_subset)
        inference.inference(inference_loader, model, inference_result_path,
                            inference_class_names, opt.inference_no_average,
                            opt.output_topk)