def main_worker(opt):
    model = generate_model(opt)
    if opt.resume_path is not None:
        model = resume_model(opt.resume_path, opt.arch, model)
    model = make_data_parallel(model, opt.distributed, opt.device)

    val_loader, val_logger = get_val_utils(opt)
    criterion = CrossEntropyLoss().to(opt.device)

    prev_val_loss, val_acc = val_epoch(0, val_loader, model, criterion,
                                       opt.device, val_logger, None,
                                       opt.distributed)
    print('Acc ({acc:.3f})'.format(acc=val_acc))
def main_worker(index, opt):
    random.seed(opt.manual_seed)
    np.random.seed(opt.manual_seed)
    torch.manual_seed(opt.manual_seed)

    if index >= 0 and opt.device.type == 'cuda':
        # opt.device = torch.device(f'cuda:{index}')
        opt.device = torch.device('cuda:{}'.format(index))

    if opt.distributed:
        opt.dist_rank = opt.dist_rank * opt.ngpus_per_node + index
        dist.init_process_group(backend='nccl',
                                init_method=opt.dist_url,
                                world_size=opt.world_size,
                                rank=opt.dist_rank)
        opt.batch_size = int(opt.batch_size / opt.ngpus_per_node)
        opt.n_threads = int(
            (opt.n_threads + opt.ngpus_per_node - 1) / opt.ngpus_per_node)
    opt.is_master_node = not opt.distributed or opt.dist_rank == 0

    model = generate_model(opt)
    if opt.batchnorm_sync:
        assert opt.distributed, 'SyncBatchNorm only supports DistributedDataParallel.'
        model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model)
    if opt.pretrain_path:
        model = load_pretrained_model(model, opt.pretrain_path, opt.model,
                                      opt.n_finetune_classes, opt.strg)

    if opt.strg:
        model = STRG(model, nclass=opt.n_classes, nrois=opt.nrois)
        rpn = RPN(nrois=opt.nrois)
        rpn = make_data_parallel(rpn, opt.distributed, opt.device)
    else:
        rpn = None

    if opt.resume_path is not None:
        model = resume_model(opt.resume_path, opt.arch, model)
    model = make_data_parallel(model, opt.distributed, opt.device)

    # if opt.pretrain_path:
    #     parameters = get_fine_tuning_parameters(model, opt.ft_begin_module)
    # else:
    parameters = model.parameters()

    if opt.is_master_node:
        print(model)

    criterion = CrossEntropyLoss().to(opt.device)

    if not opt.no_train:
        (train_loader, train_sampler, train_logger, train_batch_logger,
         optimizer, scheduler) = get_train_utils(opt, parameters)
        if opt.resume_path is not None:
            opt.begin_epoch, optimizer, scheduler = resume_train_utils(
                opt.resume_path, opt.begin_epoch, optimizer, scheduler)
            if opt.overwrite_milestones:
                scheduler.milestones = opt.multistep_milestones
    if not opt.no_val:
        val_loader, val_logger = get_val_utils(opt)

    if opt.tensorboard and opt.is_master_node:
        # from torch.utils.tensorboard import SummaryWriter
        from tensorboardX import SummaryWriter
        if opt.begin_epoch == 1:
            tb_writer = SummaryWriter(log_dir=opt.result_path)
        else:
            tb_writer = SummaryWriter(log_dir=opt.result_path,
                                      purge_step=opt.begin_epoch)
    else:
        tb_writer = None

    if opt.wandb:
        name = str(opt.result_path)
        wandb.init(
            project='strg',
            name=name,
            config=opt,
            dir=name,
            # resume=str(opt.resume_path) != '',
            sync_tensorboard=True)

    prev_val_loss = None
    for i in range(opt.begin_epoch, opt.n_epochs + 1):
        if not opt.no_train:
            if opt.distributed:
                train_sampler.set_epoch(i)
            current_lr = get_lr(optimizer)
            train_epoch(i, train_loader, model, criterion, optimizer,
                        opt.device, current_lr, train_logger,
                        train_batch_logger, tb_writer, opt.distributed,
                        rpn=rpn, det_interval=opt.det_interval,
                        nrois=opt.nrois)

            if i % opt.checkpoint == 0 and opt.is_master_node:
                save_file_path = opt.result_path / 'save_{}.pth'.format(i)
                save_checkpoint(save_file_path, i, opt.arch, model, optimizer,
                                scheduler)

        if not opt.no_val:
            prev_val_loss = val_epoch(i, val_loader, model, criterion,
                                      opt.device, val_logger, tb_writer,
                                      opt.distributed, rpn=rpn,
                                      det_interval=opt.det_interval,
                                      nrois=opt.nrois)

        if not opt.no_train and opt.lr_scheduler == 'multistep':
            scheduler.step()
        elif not opt.no_train and opt.lr_scheduler == 'plateau':
            scheduler.step(prev_val_loss)

    if opt.inference:
        inference_loader, inference_class_names = get_inference_utils(opt)
        inference_result_path = opt.result_path / '{}.json'.format(
            opt.inference_subset)

        inference.inference(inference_loader, model, inference_result_path,
                            inference_class_names, opt.inference_no_average,
                            opt.output_topk)
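# The worker above expects its GPU index as the first argument, which implies
# it is launched via torch.multiprocessing.spawn. A minimal launcher sketch is
# given below; it is not part of the original code, and `parse_opts` is a
# hypothetical option parser producing the `opt` namespace the workers read.
import torch
import torch.multiprocessing as mp


def main():
    opt = parse_opts()  # hypothetical; must populate the opt.* fields used above
    if opt.distributed:
        opt.ngpus_per_node = torch.cuda.device_count()
        # one process per GPU; spawn passes the process index as the first arg
        opt.world_size = opt.ngpus_per_node * opt.world_size
        mp.spawn(main_worker, nprocs=opt.ngpus_per_node, args=(opt,))
    else:
        main_worker(-1, opt)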
def main_worker(index, opt):
    random.seed(opt.manual_seed)
    np.random.seed(opt.manual_seed)
    torch.manual_seed(opt.manual_seed)

    if index >= 0 and opt.device.type == 'cuda':
        opt.device = torch.device(f'cuda:{index}')

    opt.is_master_node = not opt.distributed or opt.dist_rank == 0

    model = generate_model(opt)
    print('after generating model:', model.fc.in_features, ':',
          model.fc.out_features)
    print('feature weights:', model.fc.weight.shape, ':', model.fc.bias.shape)

    if opt.resume_path is not None:
        model = resume_model(opt.resume_path, opt.arch, model)
        print('after resume model:', model.fc.in_features, ':',
              model.fc.out_features)
        print('feature weights:', model.fc.weight.shape, ':',
              model.fc.bias.shape)
    # summary(model, input_size=(3, 112, 112))

    # if opt.pretrain_path:
    #     model = load_pretrained_model(model, opt.pretrain_path, opt.model,
    #                                   opt.n_finetune_classes)
    print('after pretrained model:', model.fc.in_features, ':',
          model.fc.out_features)
    print('feature weights:', model.fc.weight.shape, ':', model.fc.bias.shape)
    print(torch_summarize(model))

    # parameters = model.parameters()
    # for name, param in model.named_parameters():
    #     if param.requires_grad:
    #         print(name, param.data)
    # summary(model, (3, 112, 112))
    # return
    # print('model parameters shape', parameters.shape)

    (train_loader, train_sampler, train_logger, train_batch_logger,
     optimizer, scheduler) = get_train_utils(opt, model.parameters())
    # Render the model graph from one batch, then stop.
    for i, (inputs, targets) in enumerate(train_loader):
        print('input shape:', inputs.shape)
        print('targets shape:', targets.shape)
        outputs = model(inputs)
        print("output shape", outputs.shape)
        model_arch = make_dot(outputs, params=dict(model.named_parameters()))
        print(model_arch)
        model_arch.render("/apollo/data/model.png", format="png")
        # Source(model_arch).render('/apollo/data/model.png')
        # print("generating /apollo/data/model.png")
        break
    # make_dot(yhat, params=dict(list(model.named_parameters()))).render("rnn_torchviz", format="png")
    return

    # NOTE: the early return above makes everything below unreachable; the
    # original training path is kept here for reference.
    if opt.batchnorm_sync:
        assert opt.distributed, 'SyncBatchNorm only supports DistributedDataParallel.'
        model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model)
    if opt.pretrain_path:
        model = load_pretrained_model(model, opt.pretrain_path, opt.model,
                                      opt.n_finetune_classes)
    if opt.resume_path is not None:
        model = resume_model(opt.resume_path, opt.arch, model)
    model = make_data_parallel(model, opt.distributed, opt.device)

    if opt.pretrain_path:
        parameters = get_fine_tuning_parameters(model, opt.ft_begin_module)
    else:
        parameters = model.parameters()

    if opt.is_master_node:
        print(model)

    criterion = CrossEntropyLoss().to(opt.device)

    if not opt.no_train:
        (train_loader, train_sampler, train_logger, train_batch_logger,
         optimizer, scheduler) = get_train_utils(opt, parameters)
        if opt.resume_path is not None:
            opt.begin_epoch, optimizer, scheduler = resume_train_utils(
                opt.resume_path, opt.begin_epoch, optimizer, scheduler)
            if opt.overwrite_milestones:
                scheduler.milestones = opt.multistep_milestones
    if not opt.no_val:
        val_loader, val_logger = get_val_utils(opt)

    if opt.tensorboard and opt.is_master_node:
        from torch.utils.tensorboard import SummaryWriter
        if opt.begin_epoch == 1:
            tb_writer = SummaryWriter(log_dir=opt.result_path)
        else:
            tb_writer = SummaryWriter(log_dir=opt.result_path,
                                      purge_step=opt.begin_epoch)
    else:
        tb_writer = None

    prev_val_loss = None
    for i in range(opt.begin_epoch, opt.n_epochs + 1):
        if not opt.no_train:
            if opt.distributed:
                train_sampler.set_epoch(i)
            current_lr = get_lr(optimizer)
            train_epoch(i, train_loader, model, criterion, optimizer,
                        opt.device, current_lr, train_logger,
                        train_batch_logger, tb_writer, opt.distributed)

            if i % opt.checkpoint == 0 and opt.is_master_node:
                save_file_path = opt.result_path / 'save_{}.pth'.format(i)
                save_checkpoint(save_file_path, i, opt.arch, model, optimizer,
                                scheduler)

        if not opt.no_val:
            prev_val_loss = val_epoch(i, val_loader, model, criterion,
                                      opt.device, val_logger, tb_writer,
                                      opt.distributed)

        if not opt.no_train and opt.lr_scheduler == 'multistep':
            scheduler.step()
        elif not opt.no_train and opt.lr_scheduler == 'plateau':
            scheduler.step(prev_val_loss)

    if opt.inference:
        inference_loader, inference_class_names = get_inference_utils(opt)
        inference_result_path = opt.result_path / '{}.json'.format(
            opt.inference_subset)

        inference.inference(inference_loader, model, inference_result_path,
                            inference_class_names, opt.inference_no_average,
                            opt.output_topk)
def main_worker(index, opt):
    random.seed(opt.manual_seed)
    np.random.seed(opt.manual_seed)
    torch.manual_seed(opt.manual_seed)

    if index >= 0 and opt.device.type == 'cuda':
        opt.device = torch.device(f'cuda:{index}')

    if opt.distributed:
        opt.dist_rank = opt.dist_rank * opt.ngpus_per_node + index
        dist.init_process_group(backend='nccl',
                                init_method=opt.dist_url,
                                world_size=opt.world_size,
                                rank=opt.dist_rank)
        opt.batch_size = int(opt.batch_size / opt.ngpus_per_node)
        opt.n_threads = int(
            (opt.n_threads + opt.ngpus_per_node - 1) / opt.ngpus_per_node)
    opt.is_master_node = not opt.distributed or opt.dist_rank == 0

    model = generate_model(opt)
    if opt.pretrain_path:
        model = load_pretrained_model(model, opt.pretrain_path, opt.model,
                                      opt.n_finetune_classes)
    model = make_data_parallel(model, opt.distributed, opt.device)

    if opt.pretrain_path:
        parameters = get_fine_tuning_parameters(model, opt.ft_begin_module)
    else:
        parameters = model.parameters()

    if opt.is_master_node:
        print(model)

    criterion = CrossEntropyLoss().to(opt.device)

    if not opt.no_train:
        (train_loader, train_sampler, train_logger, train_batch_logger,
         optimizer, scheduler) = get_train_utils(opt, parameters)
    if not opt.no_val:
        val_loader, val_logger = get_val_utils(opt)

    if opt.resume_path is not None:
        if not opt.no_train:
            opt.begin_epoch, model, optimizer, scheduler = resume(
                opt.resume_path, opt.arch, opt.begin_epoch, model, optimizer,
                scheduler)
            if opt.overwrite_milestones:
                scheduler.milestones = opt.multistep_milestones
        else:
            opt.begin_epoch, model, _, _ = resume(opt.resume_path, opt.arch,
                                                  opt.begin_epoch, model)

    if opt.tensorboard and opt.is_master_node:
        from torch.utils.tensorboard import SummaryWriter
        if opt.begin_epoch == 1:
            tb_writer = SummaryWriter(log_dir=opt.result_path)
        else:
            tb_writer = SummaryWriter(log_dir=opt.result_path,
                                      purge_step=opt.begin_epoch)
    else:
        tb_writer = None

    prev_val_loss = None
    for i in range(opt.begin_epoch, opt.n_epochs + 1):
        if not opt.no_train:
            if opt.distributed:
                train_sampler.set_epoch(i)
            current_lr = get_lr(optimizer)
            train_epoch(i, train_loader, model, criterion, optimizer,
                        opt.device, current_lr, train_logger,
                        train_batch_logger, tb_writer, opt.distributed)

            if i % opt.checkpoint == 0 and opt.is_master_node:
                save_file_path = opt.result_path / 'save_{}.pth'.format(i)
                save_checkpoint(save_file_path, i, opt.arch, model, optimizer,
                                scheduler)

        if not opt.no_val:
            prev_val_loss = val_epoch(i, val_loader, model, criterion,
                                      opt.device, val_logger, tb_writer,
                                      opt.distributed)

        if not opt.no_train and opt.lr_scheduler == 'multistep':
            scheduler.step()
        elif not opt.no_train and opt.lr_scheduler == 'plateau':
            scheduler.step(prev_val_loss)

    if opt.inference:
        inference_loader, inference_class_names = get_inference_utils(opt)
        inference_result_path = opt.result_path / '{}.json'.format(
            opt.inference_subset)

        inference.inference(inference_loader, model, inference_result_path,
                            inference_class_names, opt.inference_no_average,
                            opt.output_topk)
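# The combined `resume()` helper called in the variant above is not shown in
# this snippet. Below is a minimal sketch of one plausible implementation,
# assuming checkpoints are dicts written by `save_checkpoint` with 'epoch',
# 'arch', 'state_dict', 'optimizer' and 'scheduler' entries (the key names
# are assumptions, not taken from the original code):
import torch


def resume(resume_path, arch, begin_epoch, model, optimizer=None,
           scheduler=None):
    print('loading checkpoint {}'.format(resume_path))
    checkpoint = torch.load(resume_path, map_location='cpu')
    assert arch == checkpoint['arch']

    begin_epoch = checkpoint['epoch'] + 1
    # unwrap (Distributed)DataParallel before loading weights
    if hasattr(model, 'module'):
        model.module.load_state_dict(checkpoint['state_dict'])
    else:
        model.load_state_dict(checkpoint['state_dict'])
    if optimizer is not None and 'optimizer' in checkpoint:
        optimizer.load_state_dict(checkpoint['optimizer'])
    if scheduler is not None and 'scheduler' in checkpoint:
        scheduler.load_state_dict(checkpoint['scheduler'])
    return begin_epoch, model, optimizer, scheduler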
def main_worker(index, opt):
    random.seed(opt.manual_seed)
    np.random.seed(opt.manual_seed)
    torch.manual_seed(opt.manual_seed)

    if index >= 0 and opt.device.type == 'cuda':
        opt.device = torch.device(f'cuda:{index}')

    if opt.distributed:
        opt.dist_rank = opt.dist_rank * opt.ngpus_per_node + index
        dist.init_process_group(backend='nccl',
                                init_method=opt.dist_url,
                                world_size=opt.world_size,
                                rank=opt.dist_rank)
        opt.batch_size = int(opt.batch_size / opt.ngpus_per_node)
        opt.n_threads = int(
            (opt.n_threads + opt.ngpus_per_node - 1) / opt.ngpus_per_node)
    opt.is_master_node = not opt.distributed or opt.dist_rank == 0

    model = generate_model(opt)
    if opt.batchnorm_sync:
        assert opt.distributed, 'SyncBatchNorm only supports DistributedDataParallel.'
        model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model)
    if opt.pretrain_path:
        model = load_pretrained_model(model, opt.pretrain_path, opt.model,
                                      opt.n_finetune_classes)
    if opt.dropout:
        n_classes = opt.n_classes
        if opt.pretrain_path is not None:
            n_classes = opt.n_finetune_classes
        model = replace_fc_layer(model=model,
                                 dropout_factor=opt.dropout_factor,
                                 n_classes=n_classes)
    if opt.resume_path is not None:
        model = resume_model(opt.resume_path, opt.arch, model)
    model = make_data_parallel(model, opt.distributed, opt.device)

    if opt.pretrain_path:
        parameters = get_fine_tuning_parameters(model, opt.ft_begin_module)
    else:
        parameters = model.parameters()

    if opt.is_master_node:
        print(model)

    if opt.labelsmoothing:
        criterion = LabelSmoothingCrossEntropy().to(opt.device)
    else:
        criterion = CrossEntropyLoss().to(opt.device)

    if not opt.no_train:
        (train_loader, train_sampler, train_logger, train_batch_logger,
         optimizer, scheduler) = get_train_utils(opt, parameters)
        if opt.resume_path is not None:
            opt.begin_epoch, optimizer, scheduler = resume_train_utils(
                opt.resume_path, opt.begin_epoch, optimizer, scheduler)
            if opt.overwrite_milestones:
                scheduler.milestones = opt.multistep_milestones
    if not opt.no_val:
        val_loader, val_logger = get_val_utils(opt)

    if opt.tensorboard and opt.is_master_node:
        from torch.utils.tensorboard import SummaryWriter
        if opt.begin_epoch == 1:
            tb_writer = SummaryWriter(log_dir=opt.result_path)
        else:
            tb_writer = SummaryWriter(log_dir=opt.result_path,
                                      purge_step=opt.begin_epoch)
    else:
        tb_writer = None

    if opt.lr_finder and not opt.no_train and not opt.no_val:
        print(
            "Performing Learning Rate Search\nWith Leslie Smith's approach...")
        lr_finder = LRFinder(model, optimizer, criterion, device=opt.device)
        lr_finder.range_test(train_loader,
                             val_loader=val_loader,
                             start_lr=opt.learning_rate,
                             end_lr=opt.lrf_end_lr,
                             num_iter=opt.lrf_num_it,
                             step_mode=opt.lrf_mode)
        lr_finder.plot(log_lr=False)

        with (opt.result_path / 'lr_search.json').open('w') as results_file:
            json.dump(lr_finder.history, results_file, default=json_serial)

        lr_finder.reset()
        return

    prev_val_loss = None
    for i in range(opt.begin_epoch, opt.n_epochs + 1):
        if not opt.no_train:
            if opt.distributed:
                train_sampler.set_epoch(i)
            # current_lr = get_lr(optimizer)
            train_epoch(i, train_loader, model, criterion, optimizer,
                        opt.device, train_logger, train_batch_logger,
                        scheduler, opt.lr_scheduler, tb_writer,
                        opt.distributed)

            if i % opt.checkpoint == 0 and opt.is_master_node:
                save_file_path = opt.result_path / 'save_{}.pth'.format(i)
                save_checkpoint(save_file_path, i, opt.arch, model, optimizer,
                                scheduler)

        if not opt.no_val:
            prev_val_loss = val_epoch(i, val_loader, model, criterion,
                                      opt.device, val_logger, tb_writer,
                                      opt.distributed)

        if not opt.no_train and opt.lr_scheduler == 'multistep':
            scheduler.step()
        elif not opt.no_train and opt.lr_scheduler == 'plateau':
            scheduler.step(prev_val_loss)
        elif not opt.no_train and opt.lr_scheduler == 'cosineannealing':
            scheduler.step()

    if opt.inference:
        inference_loader, inference_class_names = get_inference_utils(opt)
        inference_result_path = opt.result_path / '{}.json'.format(
            opt.inference_subset)

        inference.inference(inference_loader, model, inference_result_path,
                            inference_class_names, opt.inference_no_average,
                            opt.output_topk)
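# The 'cosineannealing' branch above implies that get_train_utils can build a
# cosine schedule alongside the multistep and plateau ones. A minimal sketch
# of such a scheduler selection is given below; `make_scheduler` and
# `opt.plateau_patience` are hypothetical names, while the
# torch.optim.lr_scheduler classes are real API:
from torch.optim import lr_scheduler


def make_scheduler(opt, optimizer):
    if opt.lr_scheduler == 'multistep':
        return lr_scheduler.MultiStepLR(optimizer,
                                        milestones=opt.multistep_milestones)
    if opt.lr_scheduler == 'plateau':
        return lr_scheduler.ReduceLROnPlateau(optimizer, 'min',
                                              patience=opt.plateau_patience)
    if opt.lr_scheduler == 'cosineannealing':
        # anneal over the full run; the T_max choice is an assumption
        return lr_scheduler.CosineAnnealingLR(optimizer, T_max=opt.n_epochs)
    raise ValueError('unknown lr_scheduler: {}'.format(opt.lr_scheduler))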
def main_worker(index, opt):
    random.seed(opt.manual_seed)
    np.random.seed(opt.manual_seed)
    torch.manual_seed(opt.manual_seed)

    if index >= 0 and opt.device.type == 'cuda':
        opt.device = torch.device(f'cuda:{index}')

    if opt.distributed:
        opt.dist_rank = opt.dist_rank * opt.ngpus_per_node + index
        dist.init_process_group(backend='nccl',
                                init_method=opt.dist_url,
                                world_size=opt.world_size,
                                rank=opt.dist_rank)
        opt.batch_size = int(opt.batch_size / opt.ngpus_per_node)
        opt.n_threads = int(
            (opt.n_threads + opt.ngpus_per_node - 1) / opt.ngpus_per_node)
    opt.is_master_node = not opt.distributed or opt.dist_rank == 0

    model = generate_model(opt)
    if opt.batchnorm_sync:
        assert opt.distributed, 'SyncBatchNorm only supports DistributedDataParallel.'
        model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model)
    if opt.pretrain_path:
        model = load_pretrained_model(model, opt.pretrain_path, opt.model,
                                      opt.n_finetune_classes)
    if opt.resume_path is not None:
        model = resume_model(opt.resume_path, opt.arch, model)
        print('resume model from ', opt.resume_path)
        print('model after resume:', model)

    # save model to current running id
    # mlflow.pytorch.log_model(model, "action_model")
    # model_path = mlflow.get_artifact_uri("action_model")
    # print('mlflow action model path: ', model_path)
    # model = mlflow.pytorch.load_model(model_path)

    if opt.ml_tag_name != '' and opt.ml_tag_value != '':
        # mlflow.set_tag("test_tag", 'inference_test')
        mlflow.set_tag(opt.ml_tag_name, opt.ml_tag_value)

    # load from previous published model version
    if opt.ml_model_name != '' and opt.ml_model_version != '':
        # model_name = 'action_model'
        # model_version = '1'
        model_uri = "models:/{}/{}".format(opt.ml_model_name,
                                           opt.ml_model_version)
        model = mlflow.pytorch.load_model(model_uri)

    model = make_data_parallel(model, opt.distributed, opt.device)

    if opt.pretrain_path:
        parameters = get_fine_tuning_parameters(model, opt.ft_begin_module)
    else:
        parameters = model.parameters()

    if opt.is_master_node:
        print(model)

    criterion = CrossEntropyLoss().to(opt.device)

    if not opt.no_train:
        (train_loader, train_sampler, train_logger, train_batch_logger,
         optimizer, scheduler) = get_train_utils(opt, parameters)
        if opt.resume_path is not None:
            opt.begin_epoch, optimizer, scheduler = resume_train_utils(
                opt.resume_path, opt.begin_epoch, optimizer, scheduler)
            if opt.overwrite_milestones:
                scheduler.milestones = opt.multistep_milestones
    if not opt.no_val:
        val_loader, val_logger = get_val_utils(opt)

    if opt.tensorboard and opt.is_master_node:
        from torch.utils.tensorboard import SummaryWriter
        if opt.begin_epoch == 1:
            tb_writer = SummaryWriter(log_dir=opt.result_path)
        else:
            tb_writer = SummaryWriter(log_dir=opt.result_path,
                                      purge_step=opt.begin_epoch)
    else:
        tb_writer = None

    prev_val_loss = None
    for i in range(opt.begin_epoch, opt.n_epochs + 1):
        if not opt.no_train:
            if opt.distributed:
                train_sampler.set_epoch(i)
            current_lr = get_lr(optimizer)
            train_epoch(i, train_loader, model, criterion, optimizer,
                        opt.device, current_lr, train_logger,
                        train_batch_logger, tb_writer, opt.distributed)

            if i % opt.checkpoint == 0 and opt.is_master_node:
                save_file_path = opt.result_path / 'save_{}.pth'.format(i)
                save_checkpoint(save_file_path, i, opt.arch, model, optimizer,
                                scheduler)
                if opt.ml_model_name != '':
                    mlflow.pytorch.log_model(model, opt.ml_model_name)

        if not opt.no_val:
            prev_val_loss = val_epoch(i, val_loader, model, criterion,
                                      opt.device, val_logger, tb_writer,
                                      opt.distributed)
            mlflow.log_metric("loss", prev_val_loss)

        if not opt.no_train and opt.lr_scheduler == 'multistep':
            scheduler.step()
        elif not opt.no_train and opt.lr_scheduler == 'plateau':
            scheduler.step(prev_val_loss)

    if opt.inference:
        inference_loader, inference_class_names = get_inference_utils(opt)
        inference_result_path = opt.result_path / '{}.json'.format(
            opt.inference_subset)

        inference.inference(inference_loader, model, inference_result_path,
                            inference_class_names, opt.inference_no_average,
                            opt.output_topk)
def main_worker(index, opt):
    random.seed(opt.manual_seed)
    np.random.seed(opt.manual_seed)
    torch.manual_seed(opt.manual_seed)

    if index >= 0 and opt.device.type == 'cuda':
        opt.device = torch.device(f'cuda:{index}')

    if opt.distributed:
        opt.dist_rank = opt.dist_rank * opt.ngpus_per_node + index
        dist.init_process_group(backend='nccl',
                                init_method=opt.dist_url,
                                world_size=opt.world_size,
                                rank=opt.dist_rank)
        opt.batch_size = int(opt.batch_size / opt.ngpus_per_node)
        opt.n_threads = int(
            (opt.n_threads + opt.ngpus_per_node - 1) / opt.ngpus_per_node)
    opt.is_master_node = not opt.distributed or opt.dist_rank == 0

    model = generate_model(opt)
    if opt.batchnorm_sync:
        assert opt.distributed, 'SyncBatchNorm only supports DistributedDataParallel.'
        model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model)
    if opt.pretrain_path:
        model = load_pretrained_model(model, opt.pretrain_path, opt.model,
                                      opt.n_finetune_classes)
    if opt.resume_path is not None:
        model = resume_model(opt.resume_path, opt.arch, model)
    model = make_data_parallel(model, opt.distributed, opt.device)

    if opt.pretrain_path:
        parameters = get_fine_tuning_parameters(model, opt.ft_begin_module)
    else:
        parameters = model.parameters()

    if opt.is_master_node:
        print(model)

    # criterion = CrossEntropyLoss().to(opt.device)
    # ADDED for 231n
    criterion = FocalLoss().to(opt.device)

    if not opt.no_train:
        (train_loader, train_sampler, train_logger, train_batch_logger,
         optimizer, scheduler) = get_train_utils(opt, parameters)
        if opt.resume_path is not None:
            opt.begin_epoch, optimizer, scheduler = resume_train_utils(
                opt.resume_path, opt.begin_epoch, optimizer, scheduler)
            if opt.overwrite_milestones:
                scheduler.milestones = opt.multistep_milestones
    if not opt.no_val:
        val_loader, val_logger = get_val_utils(opt)

    if opt.tensorboard and opt.is_master_node:
        from torch.utils.tensorboard import SummaryWriter
        if opt.begin_epoch == 1:
            tb_writer = SummaryWriter(log_dir=opt.result_path)
        else:
            tb_writer = SummaryWriter(log_dir=opt.result_path,
                                      purge_step=opt.begin_epoch)
    else:
        tb_writer = None

    conf_mtx_dict = {}  # ADDED for CS231n
    prev_val_loss = None
    for i in range(opt.begin_epoch, opt.n_epochs + 1):
        if not opt.no_train:
            if opt.distributed:
                train_sampler.set_epoch(i)
            current_lr = get_lr(optimizer)
            train_epoch(i, train_loader, model, criterion, optimizer,
                        opt.device, current_lr, train_logger,
                        train_batch_logger, tb_writer, opt.distributed)

            if i % opt.checkpoint == 0 and opt.is_master_node:
                save_file_path = opt.result_path / 'save_{}.pth'.format(i)
                save_checkpoint(save_file_path, i, opt.arch, model, optimizer,
                                scheduler)

        if not opt.no_val:
            prev_val_loss = val_epoch(i, val_loader, model, criterion,
                                      opt.device, val_logger, tb_writer,
                                      opt.distributed,
                                      conf_mtx_dict)  # ADDED for CS231n

        # ADDED for 231n - uncomment if using cross entropy loss
        # if not opt.no_train and opt.lr_scheduler == 'multistep':
        #     scheduler.step()
        # elif not opt.no_train and opt.lr_scheduler == 'plateau':
        #     scheduler.step(prev_val_loss)

    if opt.inference:
        inference_loader, inference_class_names = get_inference_utils(opt)
        inference_result_path = opt.result_path / '{}.json'.format(
            opt.inference_subset)

        inference.inference(inference_loader, model, inference_result_path,
                            inference_class_names, opt.inference_no_average,
                            opt.output_topk)

    # ADDED for CS231n (use a with-block so the csv file is actually closed)
    with open("conf_mtxs.csv", "w+", newline='') as f:
        conf_mtx_file = csv.writer(f)
        for key, val in conf_mtx_dict.items():
            conf_mtx_file.writerow([key, val])
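# `FocalLoss` is not defined in the snippet above. A common formulation
# (Lin et al., 2017) that would drop in for CrossEntropyLoss is sketched
# below; the gamma default is an assumption, not taken from the original code.
import torch
import torch.nn as nn
import torch.nn.functional as F


class FocalLoss(nn.Module):
    def __init__(self, gamma=2.0):
        super().__init__()
        self.gamma = gamma

    def forward(self, logits, targets):
        # per-sample cross entropy, no reduction yet
        ce = F.cross_entropy(logits, targets, reduction='none')
        pt = torch.exp(-ce)  # probability assigned to the true class
        # down-weight easy examples by (1 - pt)^gamma, then average
        return ((1 - pt) ** self.gamma * ce).mean()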
def main_worker(index, opt):
    random.seed(opt.manual_seed)
    np.random.seed(opt.manual_seed)
    torch.manual_seed(opt.manual_seed)

    if index >= 0 and opt.device.type == 'cuda':
        opt.device = torch.device(f'cuda:{index}')

    if opt.distributed:
        opt.dist_rank = opt.dist_rank * opt.ngpus_per_node + index
        dist.init_process_group(backend='nccl',
                                init_method=opt.dist_url,
                                world_size=opt.world_size,
                                rank=opt.dist_rank)
        opt.batch_size = int(opt.batch_size / opt.ngpus_per_node)
        opt.n_threads = int(
            (opt.n_threads + opt.ngpus_per_node - 1) / opt.ngpus_per_node)
    opt.is_master_node = not opt.distributed or opt.dist_rank == 0

    if opt.inference:
        model = generate_model(opt)
    else:
        model = generate_model(opt, use_features=True)
    if opt.batchnorm_sync:
        assert opt.distributed, 'SyncBatchNorm only supports DistributedDataParallel.'
        model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model)
    if opt.pretrain_path:
        model = load_pretrained_model(model, opt.pretrain_path,
                                      opt.n_finetune_classes)
    if opt.resume_path is not None:
        model = resume_model(opt.resume_path, opt.arch, model)
    model = make_data_parallel(model, opt.distributed, opt.device)

    if opt.pretrain_path:
        parameters = get_fine_tuning_parameters(model, opt.ft_begin_module)
    else:
        parameters = model.parameters()

    if opt.is_master_node:
        print(model)

    #####################################################################
    ### here add a classifier to predict videos and audios
    if opt.inference is False:
        ### define loss
        criterion = CrossEntropyLoss().to(opt.device)
        if opt.use_audio or opt.use_image:
            criterion_jsd = JSDLoss(weight=0.5)

    #####################################################################
    if opt.use_audio:
        ### define loss
        criterion_ct_av = NCELoss(temperature=0.5)
        ### audio teacher model
        feature_dim = 512 * 2
        if opt.pretrain_path is not None:
            joint_prediction_aud = generate_prediction(
                feature_dim, opt.n_finetune_classes, normalization=True)
        else:
            joint_prediction_aud = generate_prediction(feature_dim,
                                                       opt.n_classes,
                                                       normalization=True)
        if opt.resume_path is not None:
            aux_checkpoint = Path(
                os.path.join(str(opt.resume_path.parent),
                             str(opt.resume_path.name[:-4] + '_audio.pth')))
            joint_prediction_aud = resume_model(aux_checkpoint, opt.arch,
                                                joint_prediction_aud)
        joint_prediction_aud = make_data_parallel(joint_prediction_aud,
                                                  opt.distributed, opt.device)
        aud_para = joint_prediction_aud.parameters()
        joint_prediction_aud.cuda()
    else:
        aud_para = None

    #####################################################################
    if opt.use_image:
        ### define loss
        criterion_ct_iv = NCELoss(temperature=0.1)
        ### image teacher model
        image_model = torchvision.models.resnet34(pretrained=True)
        # remove the fc layers (only use the image features)
        image_model = torch.nn.Sequential(*list(image_model.children())[:-1])
        image_model = make_data_parallel(image_model, opt.distributed,
                                         opt.device)
        feature_dim = 512 * 2
        if opt.pretrain_path is not None:
            joint_prediction_img = generate_prediction(
                feature_dim, opt.n_finetune_classes, normalization=True)
        else:
            joint_prediction_img = generate_prediction(feature_dim,
                                                       opt.n_classes,
                                                       normalization=True)
        if opt.resume_path is not None:
            aux_checkpoint = Path(
                os.path.join(str(opt.resume_path.parent),
                             str(opt.resume_path.name[:-4] + '_image.pth')))
            joint_prediction_img = resume_model(aux_checkpoint, opt.arch,
                                                joint_prediction_img)
        joint_prediction_img = make_data_parallel(joint_prediction_img,
                                                  opt.distributed, opt.device)
        img_para = joint_prediction_img.parameters()
        joint_prediction_img.cuda()
    else:
        img_para = None

    #####################################################################
    (train_loader, train_sampler, train_logger, train_batch_logger,
     optimizer, optimizer_av, optimizer_iv, scheduler) = \
        get_train_utils(opt, model_parameters=parameters,
                        av_parameters=aud_para, iv_parameters=img_para)
    if opt.resume_path is not None:
        opt.begin_epoch, optimizer, scheduler = resume_train_utils(
            opt.resume_path, opt.begin_epoch, optimizer, scheduler)
        if opt.overwrite_milestones:
            scheduler.milestones = opt.multistep_milestones
    if not opt.no_val:
        val_loader, val_logger = get_val_utils(opt)

    if opt.tensorboard and opt.is_master_node:
        from torch.utils.tensorboard import SummaryWriter
        if opt.begin_epoch == 1:
            tb_writer = SummaryWriter(log_dir=opt.result_path)
        else:
            tb_writer = SummaryWriter(log_dir=opt.result_path,
                                      purge_step=opt.begin_epoch)
    else:
        tb_writer = None

    prev_val_loss = None
    pre_val_acc = 0.0
    if opt.image_size > opt.sample_size:
        image_size = opt.image_size
    else:
        image_size = None
    for i in range(opt.begin_epoch, opt.n_epochs + 1):
        if not opt.no_train:
            if opt.distributed:
                train_sampler.set_epoch(i)
            current_lr = get_lr(optimizer)
            if optimizer_av is None and optimizer_iv is None:
                train_epoch(epoch=i,
                            data_loader=train_loader,
                            model=model,
                            criterion=criterion,
                            optimizer=optimizer,
                            device=opt.device,
                            current_lr=current_lr,
                            epoch_logger=train_logger,
                            batch_logger=train_batch_logger,
                            tb_writer=tb_writer,
                            distributed=opt.distributed)
            elif optimizer_av is not None and optimizer_iv is None:
                train_a_epoch(epoch=i,
                              data_loader=train_loader,
                              model=model,
                              joint_prediction_aud=joint_prediction_aud,
                              criterion=criterion,
                              criterion_jsd=criterion_jsd,
                              criterion_ct_av=criterion_ct_av,
                              optimizer=optimizer,
                              optimizer_av=optimizer_av,
                              device=opt.device,
                              current_lr=current_lr,
                              epoch_logger=train_logger,
                              batch_logger=train_batch_logger,
                              tb_writer=tb_writer,
                              distributed=opt.distributed)
            elif optimizer_av is None and optimizer_iv is not None:
                train_i_epoch(epoch=i,
                              data_loader=train_loader,
                              model=model,
                              image_model=image_model,
                              joint_prediction_img=joint_prediction_img,
                              criterion=criterion,
                              criterion_jsd=criterion_jsd,
                              criterion_ct_iv=criterion_ct_iv,
                              optimizer=optimizer,
                              optimizer_iv=optimizer_iv,
                              device=opt.device,
                              current_lr=current_lr,
                              epoch_logger=train_logger,
                              batch_logger=train_batch_logger,
                              tb_writer=tb_writer,
                              distributed=opt.distributed,
                              image_size=image_size)
            else:
                train_ai_epoch(epoch=i,
                               data_loader=train_loader,
                               model=model,
                               image_model=image_model,
                               joint_prediction_aud=joint_prediction_aud,
                               joint_prediction_img=joint_prediction_img,
                               criterion=criterion,
                               criterion_jsd=criterion_jsd,
                               criterion_ct_av=criterion_ct_av,
                               criterion_ct_iv=criterion_ct_iv,
                               optimizer=optimizer,
                               optimizer_av=optimizer_av,
                               optimizer_iv=optimizer_iv,
                               device=opt.device,
                               current_lr=current_lr,
                               epoch_logger=train_logger,
                               batch_logger=train_batch_logger,
                               tb_writer=tb_writer,
                               distributed=opt.distributed,
                               image_size=image_size,
                               loss_weight=opt.loss_weight)

            if i % opt.checkpoint == 0 and opt.is_master_node:
                save_file_path = opt.result_path / 'save_{}.pth'.format(i)
                save_checkpoint(save_file_path, i, opt.arch, model, optimizer,
                                scheduler)
                if opt.use_audio:
                    save_file_path = opt.result_path / 'save_{}_audio.pth'.format(i)
                    save_checkpoint(save_file_path, i, opt.arch,
                                    joint_prediction_aud, optimizer,
                                    scheduler)
                if opt.use_image:
                    save_file_path = opt.result_path / 'save_{}_image.pth'.format(i)
                    save_checkpoint(save_file_path, i, opt.arch,
                                    joint_prediction_img, optimizer,
                                    scheduler)

        if not opt.no_val and i % opt.val_freq == 0:
            prev_val_loss, val_acc = val_epoch(i, val_loader, model,
                                               criterion, opt.device,
                                               val_logger, tb_writer,
                                               opt.distributed)
            if pre_val_acc < val_acc:
                pre_val_acc = val_acc
                save_file_path = opt.result_path / 'save_model.pth'
                save_checkpoint(save_file_path, i, opt.arch, model, optimizer,
                                scheduler)

        if not opt.no_train and opt.lr_scheduler == 'multistep':
            scheduler.step()
        elif not opt.no_train and opt.lr_scheduler == 'plateau':
            if prev_val_loss is not None:
                scheduler.step(prev_val_loss)

    if opt.inference:
        inference_loader, inference_class_names = get_inference_utils(opt)
        inference_result_path = opt.result_path / '{}.json'.format(
            opt.inference_subset)

        inference.inference(inference_loader, model, inference_result_path,
                            inference_class_names, opt.inference_no_average,
                            opt.output_topk)
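# `NCELoss` in the variant above is not defined in this snippet. A standard
# InfoNCE-style contrastive loss over paired (video, teacher) embeddings is
# sketched here as one plausible reading; the in-batch-negatives design and
# the normalization step are assumptions, not taken from the original code.
import torch
import torch.nn as nn
import torch.nn.functional as F


class NCELoss(nn.Module):
    def __init__(self, temperature=0.5):
        super().__init__()
        self.temperature = temperature

    def forward(self, z_a, z_b):
        # z_a, z_b: (batch, dim) paired embeddings; positives on the diagonal,
        # all other in-batch pairs act as negatives
        z_a = F.normalize(z_a, dim=1)
        z_b = F.normalize(z_b, dim=1)
        logits = z_a @ z_b.t() / self.temperature
        targets = torch.arange(z_a.size(0), device=z_a.device)
        return F.cross_entropy(logits, targets)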