def setup_model(config):
    """Create the DCGAN generator/discriminator and their Adam optimizers.

    Args:
        config: nested mapping. ``config['model']`` supplies ``in_ch``,
            ``out_ch``, ``depth``, ``detach`` and a ``pretrained`` mapping
            with ``generator`` / ``discriminator`` entries;
            ``config['optimizer']`` supplies keyword arguments for ``Adam``
            under the same two keys.  A ``None`` entry skips the
            corresponding weight loading / optimizer creation.

    Returns:
        dict with the keys ``'generator'``, ``'discriminator'``,
        ``'gen_optimizer'`` and ``'dis_optimizer'`` (optimizers may be
        ``None``).
    """
    model_cfg = config['model']
    in_ch = model_cfg['in_ch']
    out_ch = model_cfg['out_ch']
    depth = model_cfg['depth']
    detach = model_cfg['detach']

    networks = {
        'generator': DCGanGenerator(in_ch, out_ch, depth=depth,
                                    detach=detach),
        'discriminator': DCGanDiscriminator(out_ch, depth=depth,
                                            detach=detach),
    }

    # Optionally restore weights, generator first.
    pretrained_cfg = model_cfg['pretrained']
    for role, network in networks.items():
        weights = pretrained_cfg[role]
        if weights is not None:
            load_pretrained_model(network, weights)

    # One optimizer per network, or None when no settings were given.
    optimizer_cfg = config['optimizer']
    optimizers = {}
    for role, network in networks.items():
        adam_kwargs = optimizer_cfg[role]
        if adam_kwargs is None:
            optimizers[role] = None
        else:
            optimizers[role] = Adam(network.parameters(), **adam_kwargs)

    return {
        'generator': networks['generator'],
        'discriminator': networks['discriminator'],
        'gen_optimizer': optimizers['generator'],
        'dis_optimizer': optimizers['discriminator'],
    }
def run_test():
    """Run the AGEM experiment on the manifold test set and print results.

    Loads the pretrained auto-encoder from ``model/dae.pth``, reads the test
    samples from ``data/manifold_test.pkl`` and, for each ground-truth noise
    level in ``[0.01, 0.02, 0.03, 0.04]``, calls ``solve_agem`` and prints
    the reported RMSE and estimated-noise statistics.
    """
    model = AutoEncoder(n=2, dim_x=50, dim_hidden=2000)
    device_id = 0
    model_file = 'model/dae.pth'
    net = load_pretrained_model(model, model_file, device_id)

    # Use a context manager so the file handle is closed deterministically.
    # NOTE(review): pickle can execute arbitrary code while loading; only
    # point this at trusted data files.
    with open('data/manifold_test.pkl', 'rb') as test_file:
        x_test = pickle.load(test_file)
    x_true = x_test[:]
    # Per-sample shape: everything except the leading axis.
    noise_shape = x_true.shape[1:]

    sigma_train = 0.01
    sigma_proposal = 0.01
    sigma_noise_hat_init = 0.01

    for noise in [0.01, 0.02, 0.03, 0.04]:
        # Same noise level for every element of a sample.
        sigma_noise = np.full(noise_shape, noise)

        # The fifth return value is not used here.
        rmse_mean, rmse_std, noise_mean, noise_std, _ = \
            solve_agem(net=net, x_true=x_true, sigma_noise=sigma_noise,
                       sigma_train=sigma_train,
                       sigma_noise_hat_init=sigma_noise_hat_init,
                       sigma_proposal=sigma_proposal, type_proposal='mala',
                       candidate='mean', em_epochs=10, sample_epochs=1000)

        print('[AGEM] noise_gt: %.2f | rmse %.4f (%.4f), noise_est: %.4f (%.4f)' % (
            noise, rmse_mean, rmse_std, noise_mean, noise_std
        ))
def get_top_birds_classification(y):
    """Return the five bird species the PANN model scores highest for ``y``.

    The model's predictions are restricted to the ``BIRDS`` columns and
    summed per species; the five largest totals are kept.

    Returns:
        A frame with columns ``ebird`` and ``p``, sorted by ascending ``p``
        (the most likely species is the last row), with a fresh index.
    """
    # TODO: Add a plot.
    pann_model = load_pretrained_model()
    clip_predictions = get_model_predictions_for_clip(y, pann_model)

    species_totals = clip_predictions[BIRDS].sum().reset_index()
    species_totals.columns = ["ebird", "p"]

    ranked = species_totals.sort_values(by="p")
    return ranked.tail(5).reset_index(drop=True)
def main_worker(index, opt):
    """Per-process entry point: build the model, then train/validate/infer.

    Args:
        index: process/GPU index on this node; when ``>= 0`` and the
            configured device is CUDA, the process is pinned to
            ``cuda:<index>``.
        opt: options namespace.  It is mutated in place (``device``,
            ``dist_rank``, ``batch_size``, ``n_threads``,
            ``is_master_node``, ``begin_epoch``).

    With ``opt.strg`` set, the backbone is wrapped in ``STRG`` and an
    ``RPN`` is created and passed to the train/val epochs; otherwise
    ``rpn`` is ``None``.
    """
    # Seed every RNG from the same configured value.
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(opt.manual_seed)

    if index >= 0 and opt.device.type == 'cuda':
        # opt.device = torch.device(f'cuda:{index}')
        opt.device = torch.device('cuda:{}'.format(index))

    if opt.distributed:
        gpus_per_node = opt.ngpus_per_node
        opt.dist_rank = opt.dist_rank * gpus_per_node + index
        dist.init_process_group(backend='nccl',
                                init_method=opt.dist_url,
                                world_size=opt.world_size,
                                rank=opt.dist_rank)
        # Split the batch and the loader threads across this node's GPUs.
        opt.batch_size = int(opt.batch_size / gpus_per_node)
        opt.n_threads = int(
            (opt.n_threads + gpus_per_node - 1) / gpus_per_node)
    opt.is_master_node = not opt.distributed or opt.dist_rank == 0

    model = generate_model(opt)
    if opt.batchnorm_sync:
        assert opt.distributed, 'SyncBatchNorm only supports DistributedDataParallel.'
        model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model)
    if opt.pretrain_path:
        model = load_pretrained_model(model, opt.pretrain_path, opt.model,
                                      opt.n_finetune_classes, opt.strg)

    rpn = None
    if opt.strg:
        model = STRG(model, nclass=opt.n_classes, nrois=opt.nrois)
        rpn = make_data_parallel(RPN(nrois=opt.nrois), opt.distributed,
                                 opt.device)

    if opt.resume_path is not None:
        model = resume_model(opt.resume_path, opt.arch, model)
    model = make_data_parallel(model, opt.distributed, opt.device)

    # Fine-tuning parameter selection is disabled in this variant; all
    # parameters are always optimized.
    parameters = model.parameters()

    if opt.is_master_node:
        print(model)

    criterion = CrossEntropyLoss().to(opt.device)

    if not opt.no_train:
        (train_loader, train_sampler, train_logger, train_batch_logger,
         optimizer, scheduler) = get_train_utils(opt, parameters)
        if opt.resume_path is not None:
            opt.begin_epoch, optimizer, scheduler = resume_train_utils(
                opt.resume_path, opt.begin_epoch, optimizer, scheduler)
            if opt.overwrite_milestones:
                scheduler.milestones = opt.multistep_milestones
    if not opt.no_val:
        val_loader, val_logger = get_val_utils(opt)

    # Only the master process writes tensorboard events.
    tb_writer = None
    if opt.tensorboard and opt.is_master_node:
        # from torch.utils.tensorboard import SummaryWriter
        from tensorboardX import SummaryWriter
        writer_kwargs = {'log_dir': opt.result_path}
        if opt.begin_epoch != 1:
            writer_kwargs['purge_step'] = opt.begin_epoch
        tb_writer = SummaryWriter(**writer_kwargs)

    if opt.wandb:
        run_name = str(opt.result_path)
        wandb.init(
            project='strg',
            name=run_name,
            config=opt,
            dir=run_name,
            # resume=str(opt.resume_path) != '',
            sync_tensorboard=True)

    prev_val_loss = None
    for epoch in range(opt.begin_epoch, opt.n_epochs + 1):
        if not opt.no_train:
            if opt.distributed:
                train_sampler.set_epoch(epoch)
            current_lr = get_lr(optimizer)
            train_epoch(epoch, train_loader, model, criterion, optimizer,
                        opt.device, current_lr, train_logger,
                        train_batch_logger, tb_writer, opt.distributed,
                        rpn=rpn, det_interval=opt.det_interval,
                        nrois=opt.nrois)

            if epoch % opt.checkpoint == 0 and opt.is_master_node:
                save_file_path = opt.result_path / 'save_{}.pth'.format(epoch)
                save_checkpoint(save_file_path, epoch, opt.arch, model,
                                optimizer, scheduler)

        if not opt.no_val:
            prev_val_loss = val_epoch(epoch, val_loader, model, criterion,
                                      opt.device, val_logger, tb_writer,
                                      opt.distributed, rpn=rpn,
                                      det_interval=opt.det_interval,
                                      nrois=opt.nrois)

        # The scheduler only exists (and only steps) when training.
        if not opt.no_train:
            if opt.lr_scheduler == 'multistep':
                scheduler.step()
            elif opt.lr_scheduler == 'plateau':
                scheduler.step(prev_val_loss)

    if opt.inference:
        inference_loader, inference_class_names = get_inference_utils(opt)
        inference_result_path = opt.result_path / '{}.json'.format(
            opt.inference_subset)

        inference.inference(inference_loader, model, inference_result_path,
                            inference_class_names, opt.inference_no_average,
                            opt.output_topk)
def main_worker(index, opt):
    """Debug variant of the worker: dump model shapes and the autograd graph.

    Seeds the RNGs, builds (and optionally resumes) the model, prints the
    ``fc`` layer dimensions, pushes the first training batch through the
    model and renders the resulting graph with ``make_dot`` to
    ``/apollo/data/model.png``, then returns.

    NOTE(review): the source layout was collapsed, so the nesting below is
    reconstructed.  As reconstructed, the unconditional ``return`` after the
    batch loop makes the whole regular training/validation/inference
    pipeline below it unreachable -- confirm this is intentional debugging
    scaffolding before relying on this function for training.
    """
    random.seed(opt.manual_seed)
    np.random.seed(opt.manual_seed)
    torch.manual_seed(opt.manual_seed)

    if index >= 0 and opt.device.type == 'cuda':
        opt.device = torch.device(f'cuda:{index}')

    # No process group is initialised in this variant; only the flag is set.
    opt.is_master_node = not opt.distributed or opt.dist_rank == 0

    model = generate_model(opt)
    print('after generating model:', model.fc.in_features, ':', model.fc.out_features)
    print('feature weights:', model.fc.weight.shape, ':', model.fc.bias.shape)
    if opt.resume_path is not None:
        model = resume_model(opt.resume_path, opt.arch, model)
    # NOTE(review): assumed to print whether or not a checkpoint was resumed.
    print('after resume model:', model.fc.in_features, ':', model.fc.out_features)
    print('feature weights:', model.fc.weight.shape, ':', model.fc.bias.shape)
    # summary(model, input_size=(3, 112, 112))
    # if opt.pretrain_path:
    #     model = load_pretrained_model(model, opt.pretrain_path, opt.model,
    #                                   opt.n_finetune_classes)
    # Pretrained loading is disabled above, so this prints the same model.
    print('after pretrained model:', model.fc.in_features, ':', model.fc.out_features)
    print('feature weights:', model.fc.weight.shape, ':', model.fc.bias.shape)
    print(torch_summarize(model))
    # parameters = model.parameters()
    # for name, param in model.named_parameters():
    #     if param.requires_grad:
    #         print(name, param.data)
    # summary(model, (3, 112, 112))
    # return
    # print('model parameters shape', parameters.shape)
    (train_loader, train_sampler, train_logger, train_batch_logger,
     optimizer, scheduler) = get_train_utils(opt, model.parameters())
    # Only the first batch is used: it is enough to trace the graph.
    for i, (inputs, targets) in enumerate(train_loader):
        print('input shape:', inputs.shape)
        print('targets shape:', targets.shape)
        outputs = model(inputs)
        print("output shape", outputs.shape)
        model_arch = make_dot(outputs, params=dict(model.named_parameters()))
        print(model_arch)
        model_arch.render("/apollo/data/model.png", format="png")
        # Source(model_arch).render('/apollo/data/model.png')
        # print("generating /apollo/data/model.png")
        break
    # make_dot(yhat, params=dict(list(model.named_parameters()))).render("rnn_torchviz", format="png")
    # NOTE(review): early exit -- everything below is dead code as written.
    return

    if opt.batchnorm_sync:
        assert opt.distributed, 'SyncBatchNorm only supports DistributedDataParallel.'
        model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model)
    if opt.pretrain_path:
        model = load_pretrained_model(model, opt.pretrain_path, opt.model,
                                      opt.n_finetune_classes)
    if opt.resume_path is not None:
        model = resume_model(opt.resume_path, opt.arch, model)
    model = make_data_parallel(model, opt.distributed, opt.device)

    if opt.pretrain_path:
        parameters = get_fine_tuning_parameters(model, opt.ft_begin_module)
    else:
        parameters = model.parameters()

    if opt.is_master_node:
        print(model)

    criterion = CrossEntropyLoss().to(opt.device)

    if not opt.no_train:
        (train_loader, train_sampler, train_logger, train_batch_logger,
         optimizer, scheduler) = get_train_utils(opt, parameters)
        if opt.resume_path is not None:
            opt.begin_epoch, optimizer, scheduler = resume_train_utils(
                opt.resume_path, opt.begin_epoch, optimizer, scheduler)
            if opt.overwrite_milestones:
                scheduler.milestones = opt.multistep_milestones
    if not opt.no_val:
        val_loader, val_logger = get_val_utils(opt)

    if opt.tensorboard and opt.is_master_node:
        from torch.utils.tensorboard import SummaryWriter
        if opt.begin_epoch == 1:
            tb_writer = SummaryWriter(log_dir=opt.result_path)
        else:
            tb_writer = SummaryWriter(log_dir=opt.result_path,
                                      purge_step=opt.begin_epoch)
    else:
        tb_writer = None

    prev_val_loss = None
    for i in range(opt.begin_epoch, opt.n_epochs + 1):
        if not opt.no_train:
            if opt.distributed:
                train_sampler.set_epoch(i)
            current_lr = get_lr(optimizer)
            train_epoch(i, train_loader, model, criterion, optimizer,
                        opt.device, current_lr, train_logger,
                        train_batch_logger, tb_writer, opt.distributed)

            if i % opt.checkpoint == 0 and opt.is_master_node:
                save_file_path = opt.result_path / 'save_{}.pth'.format(i)
                save_checkpoint(save_file_path, i, opt.arch, model, optimizer,
                                scheduler)

        if not opt.no_val:
            prev_val_loss = val_epoch(i, val_loader, model, criterion,
                                      opt.device, val_logger, tb_writer,
                                      opt.distributed)

        if not opt.no_train and opt.lr_scheduler == 'multistep':
            scheduler.step()
        elif not opt.no_train and opt.lr_scheduler == 'plateau':
            scheduler.step(prev_val_loss)

    if opt.inference:
        inference_loader, inference_class_names = get_inference_utils(opt)
        inference_result_path = opt.result_path / '{}.json'.format(
            opt.inference_subset)

        inference.inference(inference_loader, model, inference_result_path,
                            inference_class_names, opt.inference_no_average,
                            opt.output_topk)
def main_worker(index, opt):
    """Per-process entry point: build the model, then train/validate/infer.

    Args:
        index: process/GPU index on this node; when ``>= 0`` and the
            configured device is CUDA, the process is pinned to
            ``cuda:<index>``.
        opt: options namespace.  It is mutated in place (``device``,
            ``dist_rank``, ``batch_size``, ``n_threads``,
            ``is_master_node``, ``begin_epoch``).

    In this variant a checkpoint is restored with a single ``resume`` call
    made after the model has been wrapped for (distributed) data
    parallelism and after the train/val utilities have been created.
    """
    # Seed every RNG from the same configured value.
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(opt.manual_seed)

    if index >= 0 and opt.device.type == 'cuda':
        opt.device = torch.device(f'cuda:{index}')

    if opt.distributed:
        gpus_per_node = opt.ngpus_per_node
        opt.dist_rank = opt.dist_rank * gpus_per_node + index
        dist.init_process_group(backend='nccl',
                                init_method=opt.dist_url,
                                world_size=opt.world_size,
                                rank=opt.dist_rank)
        # Split the batch and the loader threads across this node's GPUs.
        opt.batch_size = int(opt.batch_size / gpus_per_node)
        opt.n_threads = int(
            (opt.n_threads + gpus_per_node - 1) / gpus_per_node)
    opt.is_master_node = not opt.distributed or opt.dist_rank == 0

    model = generate_model(opt)
    if opt.pretrain_path:
        model = load_pretrained_model(model, opt.pretrain_path, opt.model,
                                      opt.n_finetune_classes)
    model = make_data_parallel(model, opt.distributed, opt.device)

    # With a pretrained backbone only the fine-tuning subset is optimized.
    parameters = (get_fine_tuning_parameters(model, opt.ft_begin_module)
                  if opt.pretrain_path else model.parameters())

    if opt.is_master_node:
        print(model)

    criterion = CrossEntropyLoss().to(opt.device)

    if not opt.no_train:
        (train_loader, train_sampler, train_logger, train_batch_logger,
         optimizer, scheduler) = get_train_utils(opt, parameters)
    if not opt.no_val:
        val_loader, val_logger = get_val_utils(opt)

    if opt.resume_path is not None:
        if opt.no_train:
            # Evaluation only: restore the model and ignore optimizer state.
            opt.begin_epoch, model, _, _ = resume(opt.resume_path, opt.arch,
                                                  opt.begin_epoch, model)
        else:
            opt.begin_epoch, model, optimizer, scheduler = resume(
                opt.resume_path, opt.arch, opt.begin_epoch, model, optimizer,
                scheduler)
            if opt.overwrite_milestones:
                scheduler.milestones = opt.multistep_milestones

    # Only the master process writes tensorboard events.
    tb_writer = None
    if opt.tensorboard and opt.is_master_node:
        from torch.utils.tensorboard import SummaryWriter
        writer_kwargs = {'log_dir': opt.result_path}
        if opt.begin_epoch != 1:
            writer_kwargs['purge_step'] = opt.begin_epoch
        tb_writer = SummaryWriter(**writer_kwargs)

    prev_val_loss = None
    for epoch in range(opt.begin_epoch, opt.n_epochs + 1):
        if not opt.no_train:
            if opt.distributed:
                train_sampler.set_epoch(epoch)
            current_lr = get_lr(optimizer)
            train_epoch(epoch, train_loader, model, criterion, optimizer,
                        opt.device, current_lr, train_logger,
                        train_batch_logger, tb_writer, opt.distributed)

            if epoch % opt.checkpoint == 0 and opt.is_master_node:
                save_file_path = opt.result_path / 'save_{}.pth'.format(epoch)
                save_checkpoint(save_file_path, epoch, opt.arch, model,
                                optimizer, scheduler)

        if not opt.no_val:
            prev_val_loss = val_epoch(epoch, val_loader, model, criterion,
                                      opt.device, val_logger, tb_writer,
                                      opt.distributed)

        # The scheduler only exists (and only steps) when training.
        if not opt.no_train:
            if opt.lr_scheduler == 'multistep':
                scheduler.step()
            elif opt.lr_scheduler == 'plateau':
                scheduler.step(prev_val_loss)

    if opt.inference:
        inference_loader, inference_class_names = get_inference_utils(opt)
        inference_result_path = opt.result_path / '{}.json'.format(
            opt.inference_subset)

        inference.inference(inference_loader, model, inference_result_path,
                            inference_class_names, opt.inference_no_average,
                            opt.output_topk)
def main_worker(index, opt):
    """Per-process entry point: build the model, then train/validate/infer.

    Args:
        index: process/GPU index on this node; when ``>= 0`` and the
            configured device is CUDA, the process is pinned to
            ``cuda:<index>``.
        opt: options namespace.  It is mutated in place (``device``,
            ``dist_rank``, ``batch_size``, ``n_threads``,
            ``is_master_node``, ``begin_epoch``).

    Variant-specific behavior: optional replacement of the FC layer
    (``opt.dropout``), optional label-smoothing loss
    (``opt.labelsmoothing``), and an optional learning-rate range test
    (``opt.lr_finder``) that writes ``lr_search.json`` into the result
    directory and returns without training.
    """
    # Seed every RNG from the same configured value.
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(opt.manual_seed)

    if index >= 0 and opt.device.type == 'cuda':
        opt.device = torch.device(f'cuda:{index}')

    if opt.distributed:
        gpus_per_node = opt.ngpus_per_node
        opt.dist_rank = opt.dist_rank * gpus_per_node + index
        dist.init_process_group(backend='nccl',
                                init_method=opt.dist_url,
                                world_size=opt.world_size,
                                rank=opt.dist_rank)
        # Split the batch and the loader threads across this node's GPUs.
        opt.batch_size = int(opt.batch_size / gpus_per_node)
        opt.n_threads = int(
            (opt.n_threads + gpus_per_node - 1) / gpus_per_node)
    opt.is_master_node = not opt.distributed or opt.dist_rank == 0

    model = generate_model(opt)
    if opt.batchnorm_sync:
        assert opt.distributed, 'SyncBatchNorm only supports DistributedDataParallel.'
        model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model)
    if opt.pretrain_path:
        model = load_pretrained_model(model, opt.pretrain_path, opt.model,
                                      opt.n_finetune_classes)
    if opt.dropout:
        # The new head is sized for the fine-tuning classes whenever a
        # pretrain path is configured.
        if opt.pretrain_path is not None:
            head_classes = opt.n_finetune_classes
        else:
            head_classes = opt.n_classes
        model = replace_fc_layer(model=model,
                                 dropout_factor=opt.dropout_factor,
                                 n_classes=head_classes)
    if opt.resume_path is not None:
        model = resume_model(opt.resume_path, opt.arch, model)
    model = make_data_parallel(model, opt.distributed, opt.device)

    # With a pretrained backbone only the fine-tuning subset is optimized.
    parameters = (get_fine_tuning_parameters(model, opt.ft_begin_module)
                  if opt.pretrain_path else model.parameters())

    if opt.is_master_node:
        print(model)

    criterion_cls = (LabelSmoothingCrossEntropy
                     if opt.labelsmoothing else CrossEntropyLoss)
    criterion = criterion_cls().to(opt.device)

    if not opt.no_train:
        (train_loader, train_sampler, train_logger, train_batch_logger,
         optimizer, scheduler) = get_train_utils(opt, parameters)
        if opt.resume_path is not None:
            opt.begin_epoch, optimizer, scheduler = resume_train_utils(
                opt.resume_path, opt.begin_epoch, optimizer, scheduler)
            if opt.overwrite_milestones:
                scheduler.milestones = opt.multistep_milestones
    if not opt.no_val:
        val_loader, val_logger = get_val_utils(opt)

    # Only the master process writes tensorboard events.
    tb_writer = None
    if opt.tensorboard and opt.is_master_node:
        from torch.utils.tensorboard import SummaryWriter
        writer_kwargs = {'log_dir': opt.result_path}
        if opt.begin_epoch != 1:
            writer_kwargs['purge_step'] = opt.begin_epoch
        tb_writer = SummaryWriter(**writer_kwargs)

    # The range test needs both the training and the validation loader.
    if opt.lr_finder and not opt.no_train and not opt.no_val:
        print(
            "Performing Learning Rate Search\nWith Leslie Smith's approach...")
        finder = LRFinder(model, optimizer, criterion, device=opt.device)
        finder.range_test(train_loader,
                          val_loader=val_loader,
                          start_lr=opt.learning_rate,
                          end_lr=opt.lrf_end_lr,
                          num_iter=opt.lrf_num_it,
                          step_mode=opt.lrf_mode)
        finder.plot(log_lr=False)
        history_path = opt.result_path / 'lr_search.json'
        with history_path.open('w') as results_file:
            json.dump(finder.history, results_file, default=json_serial)
        finder.reset()
        return

    prev_val_loss = None
    for epoch in range(opt.begin_epoch, opt.n_epochs + 1):
        if not opt.no_train:
            if opt.distributed:
                train_sampler.set_epoch(epoch)
            # The scheduler and its type are handed to train_epoch here
            # instead of a precomputed learning rate.
            train_epoch(epoch, train_loader, model, criterion, optimizer,
                        opt.device, train_logger, train_batch_logger,
                        scheduler, opt.lr_scheduler, tb_writer,
                        opt.distributed)

            if epoch % opt.checkpoint == 0 and opt.is_master_node:
                save_file_path = opt.result_path / 'save_{}.pth'.format(epoch)
                save_checkpoint(save_file_path, epoch, opt.arch, model,
                                optimizer, scheduler)

        if not opt.no_val:
            prev_val_loss = val_epoch(epoch, val_loader, model, criterion,
                                      opt.device, val_logger, tb_writer,
                                      opt.distributed)

        # The scheduler only exists (and only steps) when training.
        if not opt.no_train:
            if opt.lr_scheduler == 'multistep':
                scheduler.step()
            elif opt.lr_scheduler == 'plateau':
                scheduler.step(prev_val_loss)
            elif opt.lr_scheduler == 'cosineannealing':
                scheduler.step()

    if opt.inference:
        inference_loader, inference_class_names = get_inference_utils(opt)
        inference_result_path = opt.result_path / '{}.json'.format(
            opt.inference_subset)

        inference.inference(inference_loader, model, inference_result_path,
                            inference_class_names, opt.inference_no_average,
                            opt.output_topk)
def main(opt):
    """Train, validate and/or run inference inside a Paddle dygraph guard.

    Args:
        opt: options namespace.  ``opt.begin_epoch`` is overwritten when
            training resumes from ``opt.resume_path``.

    Checkpoints are written inside ``opt.result_path``: one every
    ``opt.checkpoint`` epochs while training, plus a ``best_val_acc``
    checkpoint whenever validation accuracy exceeds the best value seen so
    far.
    """
    place = fluid.CPUPlace() if opt.no_cuda else fluid.CUDAPlace(0)
    with fluid.dygraph.guard(place):
        print(place)
        random.seed(opt.manual_seed)
        np.random.seed(opt.manual_seed)
        prog = fluid.default_main_program()
        prog.global_seed(opt.manual_seed)
        os.environ['PYTHONHASHSEED'] = str(opt.manual_seed)

        model = generate_model(opt)
        if opt.pretrain_path:
            model = load_pretrained_model(model, opt.pretrain_path,
                                          opt.model, opt.n_finetune_classes)

        if opt.resume_path is not None:
            model = resume_model(opt.resume_path, model)

        # With a pretrained backbone only the fine-tuning subset is optimized.
        if opt.pretrain_path:
            parameters = get_fine_tuning_parameters(model,
                                                    opt.ft_begin_module)
        else:
            parameters = model.parameters()

        if not opt.no_train:
            (train_loader, train_logger, train_batch_logger, optimizer,
             scheduler) = get_train_utils(opt, parameters)
            if opt.resume_path is not None:
                opt.begin_epoch, optimizer, scheduler = resume_train_utils(
                    opt.resume_path, optimizer, scheduler)
                if opt.overwrite_milestones:
                    scheduler.milestones = opt.multistep_milestones

        if not opt.no_val:
            val_loader, val_logger = get_val_utils(opt)

        # A "best" checkpoint is only written once validation accuracy
        # exceeds this starting threshold.
        best_acc = 0.88
        for epoch in range(opt.begin_epoch, opt.n_epochs + 1):
            if not opt.no_train:
                train_epoch(epoch, train_loader, model, optimizer, scheduler,
                            train_logger, train_batch_logger)

                if epoch % opt.checkpoint == 0:
                    # Join with the path operator: plain string
                    # concatenation dropped the separator and wrote the
                    # checkpoint next to, not inside, the result directory.
                    save_file_path = str(
                        opt.result_path / 'save_{}_{}_{}'.format(
                            epoch, opt.train_crop, opt.batch_size))
                    save_checkpoint(save_file_path, model, optimizer)

            if not opt.no_val:
                prev_val_loss, val_acc = val_epoch(epoch, val_loader, model,
                                                   val_logger)

            # NOTE(review): the 'plateau' branch needs a validation loss, so
            # it still requires validation to be enabled.
            if not opt.no_train and opt.lr_scheduler == 'multistep':
                scheduler.epoch()
            elif not opt.no_train and opt.lr_scheduler == 'plateau':
                scheduler.step(prev_val_loss)

            if not opt.no_val:
                if val_acc > best_acc:
                    best_acc = val_acc
                    save_file_path = str(
                        opt.result_path / 'save_{}_{}_best_val_acc'.format(
                            epoch, opt.train_crop))
                    save_checkpoint(save_file_path, model, optimizer)

            # The log line needs both a learning rate (training) and a
            # validation loss; skip it when either stage is disabled rather
            # than touching an unbound name.
            if not opt.no_train and not opt.no_val:
                current_lr = optimizer.current_step_lr()
                print("current val_loss is %s, current lr is %s" %
                      (prev_val_loss.numpy()[0], current_lr))

        if opt.inference:
            inference_loader, inference_class_names = get_inference_utils(opt)
            inference_result_path = opt.result_path / '{}_{}.json'.format(
                opt.inference_subset, opt.train_crop)

            inference.inference(inference_loader, model,
                                inference_result_path, inference_class_names,
                                opt.inference_no_average, opt.output_topk)
def main_worker(index, opt):
    """Per-process entry point with mlflow tagging, model loading and logging.

    Args:
        index: process/GPU index on this node; when ``>= 0`` and the
            configured device is CUDA, the process is pinned to
            ``cuda:<index>``.
        opt: options namespace.  It is mutated in place (``device``,
            ``dist_rank``, ``batch_size``, ``n_threads``,
            ``is_master_node``, ``begin_epoch``).

    NOTE(review): the source layout was collapsed, so the nesting of the
    mlflow calls below is reconstructed -- verify against the original file.
    """
    random.seed(opt.manual_seed)
    np.random.seed(opt.manual_seed)
    torch.manual_seed(opt.manual_seed)

    if index >= 0 and opt.device.type == 'cuda':
        opt.device = torch.device(f'cuda:{index}')

    if opt.distributed:
        opt.dist_rank = opt.dist_rank * opt.ngpus_per_node + index
        dist.init_process_group(backend='nccl',
                                init_method=opt.dist_url,
                                world_size=opt.world_size,
                                rank=opt.dist_rank)
        # Split the batch and the loader threads across this node's GPUs.
        opt.batch_size = int(opt.batch_size / opt.ngpus_per_node)
        opt.n_threads = int(
            (opt.n_threads + opt.ngpus_per_node - 1) / opt.ngpus_per_node)
    opt.is_master_node = not opt.distributed or opt.dist_rank == 0

    model = generate_model(opt)
    if opt.batchnorm_sync:
        assert opt.distributed, 'SyncBatchNorm only supports DistributedDataParallel.'
        model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model)
    if opt.pretrain_path:
        model = load_pretrained_model(model, opt.pretrain_path, opt.model,
                                      opt.n_finetune_classes)
    if opt.resume_path is not None:
        model = resume_model(opt.resume_path, opt.arch, model)
        print('resume model from ', opt.resume_path)
        print('model after resume:', model)

        # save model to current running id
        # mlflow.pytorch.log_model(model, "action_model")
        # model_path = mlflow.get_artifact_uri("action_model")
        # print('mlflow action model path: ', model_path)
        # model = mlflow.pytorch.load_model(model_path)

    # Tag the run only when both a tag name and a tag value are given.
    if opt.ml_tag_name != '' and opt.ml_tag_value != '':
        # mlflow.set_tag("test_tag", 'inference_test')
        mlflow.set_tag(opt.ml_tag_name, opt.ml_tag_value)

    # load from previous published model version
    # This replaces whatever model was generated/pretrained/resumed above.
    if opt.ml_model_name != '' and opt.ml_model_version != '':
        # model_name = 'action_model'
        # model_version = '1'
        model_uri = "models:/{}/{}".format(opt.ml_model_name,
                                           opt.ml_model_version)
        model = mlflow.pytorch.load_model(model_uri)

    model = make_data_parallel(model, opt.distributed, opt.device)

    # With a pretrained backbone only the fine-tuning subset is optimized.
    if opt.pretrain_path:
        parameters = get_fine_tuning_parameters(model, opt.ft_begin_module)
    else:
        parameters = model.parameters()

    if opt.is_master_node:
        print(model)

    criterion = CrossEntropyLoss().to(opt.device)

    if not opt.no_train:
        (train_loader, train_sampler, train_logger, train_batch_logger,
         optimizer, scheduler) = get_train_utils(opt, parameters)
        if opt.resume_path is not None:
            opt.begin_epoch, optimizer, scheduler = resume_train_utils(
                opt.resume_path, opt.begin_epoch, optimizer, scheduler)
            if opt.overwrite_milestones:
                scheduler.milestones = opt.multistep_milestones
    if not opt.no_val:
        val_loader, val_logger = get_val_utils(opt)

    # Only the master process writes tensorboard events.
    if opt.tensorboard and opt.is_master_node:
        from torch.utils.tensorboard import SummaryWriter
        if opt.begin_epoch == 1:
            tb_writer = SummaryWriter(log_dir=opt.result_path)
        else:
            tb_writer = SummaryWriter(log_dir=opt.result_path,
                                      purge_step=opt.begin_epoch)
    else:
        tb_writer = None

    prev_val_loss = None
    for i in range(opt.begin_epoch, opt.n_epochs + 1):
        if not opt.no_train:
            if opt.distributed:
                train_sampler.set_epoch(i)
            current_lr = get_lr(optimizer)
            train_epoch(i, train_loader, model, criterion, optimizer,
                        opt.device, current_lr, train_logger,
                        train_batch_logger, tb_writer, opt.distributed)

            if i % opt.checkpoint == 0 and opt.is_master_node:
                save_file_path = opt.result_path / 'save_{}.pth'.format(i)
                save_checkpoint(save_file_path, i, opt.arch, model, optimizer,
                                scheduler)
                # NOTE(review): assumed to run only alongside a checkpoint
                # save on the master node -- confirm the intended nesting.
                if opt.ml_model_name != '':
                    mlflow.pytorch.log_model(model, opt.ml_model_name)

        if not opt.no_val:
            prev_val_loss = val_epoch(i, val_loader, model, criterion,
                                      opt.device, val_logger, tb_writer,
                                      opt.distributed)
            # NOTE(review): assumed to be logged once per validated epoch.
            mlflow.log_metric("loss", prev_val_loss)

        if not opt.no_train and opt.lr_scheduler == 'multistep':
            scheduler.step()
        elif not opt.no_train and opt.lr_scheduler == 'plateau':
            scheduler.step(prev_val_loss)

    if opt.inference:
        inference_loader, inference_class_names = get_inference_utils(opt)
        inference_result_path = opt.result_path / '{}.json'.format(
            opt.inference_subset)

        inference.inference(inference_loader, model, inference_result_path,
                            inference_class_names, opt.inference_no_average,
                            opt.output_topk)
def score(self):
    """Classify the frames in ``opt.video_jpgs_dir_path`` and print the top-k.

    Builds the spatial/temporal preprocessing from ``self.opt``, loads the
    pretrained model, runs every sampled clip through it, averages the
    per-clip softmax scores and prints the result of
    ``inference.get_video_results`` for the averaged scores.

    Returns:
        None; the class mapping and the inference result are printed.
    """
    opt = self.opt

    normalize = get_normalize_method(opt.mean, opt.std, opt.no_mean_norm,
                                     opt.no_std_norm)
    spatial_transform = Compose([
        Resize(opt.sample_size),
        CenterCrop(opt.sample_size),
        ToTensor(),
        ScaleValue(opt.value_scale),
        normalize,
    ])

    temporal_transform = []
    if opt.sample_t_stride > 1:
        temporal_transform.append(TemporalSubsampling(opt.sample_t_stride))
    temporal_transform.append(
        TemporalEvenCrop(opt.sample_duration, opt.n_val_samples))
    temporal_transform = TemporalCompose(temporal_transform)

    frame_count = get_n_frames(opt.video_jpgs_dir_path)
    frame_indices = temporal_transform(list(range(0, frame_count)))

    spatial_transform.randomize_parameters()

    image_name_formatter = lambda x: f'image_{x:05d}.jpg'
    loader = VideoLoader(image_name_formatter)

    print('frame_indices', frame_indices)

    model = generate_model(opt)
    model = load_pretrained_model(model, opt.pretrain_path, opt.model,
                                  opt.n_finetune_classes)
    # Modules start out in training mode; switch to evaluation mode so that
    # any BatchNorm/Dropout layers behave deterministically during scoring.
    model.eval()

    video_outputs = []
    for clip_number, frame_indice in enumerate(frame_indices):
        print("%d indice: %s" % (clip_number, str(frame_indice)))
        clip = loader(opt.video_jpgs_dir_path, frame_indice)
        clip = [spatial_transform(img) for img in clip]
        # Stack the frames, then swap the first two axes
        # (presumably (T, C, H, W) -> (C, T, H, W) -- TODO confirm).
        clip = torch.stack(clip, 0).permute(1, 0, 2, 3)

        with torch.no_grad():
            print(clip.shape)
            # Add a leading batch axis of size one for the forward pass.
            output = model(torch.unsqueeze(clip, 0))
            output = F.softmax(output, dim=1).cpu()
            video_outputs.append(output[0])

    # Average the per-clip class probabilities over all clips.
    video_outputs = torch.stack(video_outputs)
    average_scores = torch.mean(video_outputs, dim=0)

    with opt.annotation_path.open('r') as f:
        data = json.load(f)
    class_to_idx = get_class_labels(data)
    idx_to_class = {label: name for name, label in class_to_idx.items()}
    print(idx_to_class)

    inference_result = inference.get_video_results(
        average_scores, idx_to_class, opt.output_topk)
    print(inference_result)
print() time_elapsed = time.time() - since print('Training complete in {:.0f}m {:.0f}s'.format( time_elapsed // 60, time_elapsed % 60)) print('Best val Acc: {:.4f}'.format(best_acc)) model.load_state_dict(best_model_wts) torch.save(model.state_dict(), os.path.join(config.MODEL_PATH, config.MODEL_NAME)) return model if __name__ == '__main__': model_ft = model.load_pretrained_model() criterion = nn.CrossEntropyLoss() optimizer_ft = optim.SGD(model_ft.parameters(), lr=config.LEARNING_RATE, momentum=config.MOMENTUM) exp_lr_scheduler = lr_scheduler.StepLR(optimizer_ft, step_size=7, gamma=0.1) model_ft = train_model(model_ft, criterion, optimizer_ft, exp_lr_scheduler, num_epochs=config.NUM_EPOCHS) predict.visualize_model(model_ft)
def main_worker(index, opt):
    """Per-process entry point using focal loss and confusion-matrix export.

    Args:
        index: process/GPU index on this node; when ``>= 0`` and the
            configured device is CUDA, the process is pinned to
            ``cuda:<index>``.
        opt: options namespace.  It is mutated in place (``device``,
            ``dist_rank``, ``batch_size``, ``n_threads``,
            ``is_master_node``, ``begin_epoch``).

    ``val_epoch`` receives ``conf_mtx_dict``; after the epoch loop (and the
    optional inference pass) its items are written to ``conf_mtxs.csv`` in
    the current working directory, one ``key, value`` row per entry.
    """
    random.seed(opt.manual_seed)
    np.random.seed(opt.manual_seed)
    torch.manual_seed(opt.manual_seed)

    if index >= 0 and opt.device.type == 'cuda':
        opt.device = torch.device(f'cuda:{index}')

    if opt.distributed:
        opt.dist_rank = opt.dist_rank * opt.ngpus_per_node + index
        dist.init_process_group(backend='nccl',
                                init_method=opt.dist_url,
                                world_size=opt.world_size,
                                rank=opt.dist_rank)
        # Split the batch and the loader threads across this node's GPUs.
        opt.batch_size = int(opt.batch_size / opt.ngpus_per_node)
        opt.n_threads = int(
            (opt.n_threads + opt.ngpus_per_node - 1) / opt.ngpus_per_node)
    opt.is_master_node = not opt.distributed or opt.dist_rank == 0

    model = generate_model(opt)
    if opt.batchnorm_sync:
        assert opt.distributed, 'SyncBatchNorm only supports DistributedDataParallel.'
        model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model)
    if opt.pretrain_path:
        model = load_pretrained_model(model, opt.pretrain_path, opt.model,
                                      opt.n_finetune_classes)
    if opt.resume_path is not None:
        model = resume_model(opt.resume_path, opt.arch, model)
    model = make_data_parallel(model, opt.distributed, opt.device)

    # With a pretrained backbone only the fine-tuning subset is optimized.
    if opt.pretrain_path:
        parameters = get_fine_tuning_parameters(model, opt.ft_begin_module)
    else:
        parameters = model.parameters()

    if opt.is_master_node:
        print(model)

    # criterion = CrossEntropyLoss().to(opt.device)
    # ADDED for 231n
    criterion = FocalLoss().to(opt.device)

    if not opt.no_train:
        (train_loader, train_sampler, train_logger, train_batch_logger,
         optimizer, scheduler) = get_train_utils(opt, parameters)
        if opt.resume_path is not None:
            opt.begin_epoch, optimizer, scheduler = resume_train_utils(
                opt.resume_path, opt.begin_epoch, optimizer, scheduler)
            if opt.overwrite_milestones:
                scheduler.milestones = opt.multistep_milestones
    if not opt.no_val:
        val_loader, val_logger = get_val_utils(opt)

    # Only the master process writes tensorboard events.
    if opt.tensorboard and opt.is_master_node:
        from torch.utils.tensorboard import SummaryWriter
        if opt.begin_epoch == 1:
            tb_writer = SummaryWriter(log_dir=opt.result_path)
        else:
            tb_writer = SummaryWriter(log_dir=opt.result_path,
                                      purge_step=opt.begin_epoch)
    else:
        tb_writer = None

    conf_mtx_dict = {}  # ADDED for CS231n

    prev_val_loss = None
    for i in range(opt.begin_epoch, opt.n_epochs + 1):
        if not opt.no_train:
            if opt.distributed:
                train_sampler.set_epoch(i)
            current_lr = get_lr(optimizer)
            train_epoch(i, train_loader, model, criterion, optimizer,
                        opt.device, current_lr, train_logger,
                        train_batch_logger, tb_writer, opt.distributed)

            if i % opt.checkpoint == 0 and opt.is_master_node:
                save_file_path = opt.result_path / 'save_{}.pth'.format(i)
                save_checkpoint(save_file_path, i, opt.arch, model, optimizer,
                                scheduler)

        if not opt.no_val:
            prev_val_loss = val_epoch(i, val_loader, model, criterion,
                                      opt.device, val_logger, tb_writer,
                                      opt.distributed,
                                      conf_mtx_dict)  # ADDED for CS231n

        # ADDED for 231n - the learning-rate scheduler is deliberately not
        # stepped in this variant; uncomment if using cross entropy loss.
        # if not opt.no_train and opt.lr_scheduler == 'multistep':
        #     scheduler.step()
        # elif not opt.no_train and opt.lr_scheduler == 'plateau':
        #     scheduler.step(prev_val_loss)

    if opt.inference:
        inference_loader, inference_class_names = get_inference_utils(opt)
        inference_result_path = opt.result_path / '{}.json'.format(
            opt.inference_subset)

        inference.inference(inference_loader, model, inference_result_path,
                            inference_class_names, opt.inference_no_average,
                            opt.output_topk)

    # ADDED for CS231n
    # NOTE(review): placed at function level (the dict is filled by
    # val_epoch, not by inference) -- confirm the intended nesting.
    # The context manager guarantees the rows are flushed and the handle is
    # closed; newline='' is what the csv module expects for writer targets.
    with open("conf_mtxs.csv", "w", newline="") as conf_mtx_handle:
        conf_mtx_file = csv.writer(conf_mtx_handle)
        for key, val in conf_mtx_dict.items():
            conf_mtx_file.writerow([key, val])
def _build_joint_prediction_head(opt, checkpoint_suffix):
    """Create, optionally resume, and parallelize one teacher prediction head.

    Args:
        opt: Options namespace; reads ``pretrain_path``,
            ``n_finetune_classes``, ``n_classes``, ``resume_path``,
            ``arch``, ``distributed`` and ``device``.
        checkpoint_suffix: Text inserted before ``.pth`` in the name of
            ``opt.resume_path`` to locate this head's checkpoint, e.g.
            ``'_audio'``.

    Returns:
        A ``(head, parameters)`` tuple, where ``parameters`` is the result
        of ``head.parameters()``.
    """
    feature_dim = 512 * 2
    if opt.pretrain_path is not None:
        n_classes = opt.n_finetune_classes
    else:
        n_classes = opt.n_classes
    head = generate_prediction(feature_dim, n_classes, normalization=True)

    if opt.resume_path is not None:
        # The head's checkpoint sits next to the main one:
        # save_10.pth -> save_10_audio.pth / save_10_image.pth.
        aux_checkpoint = opt.resume_path.with_name(
            opt.resume_path.stem + checkpoint_suffix + '.pth')
        head = resume_model(aux_checkpoint, opt.arch, head)

    head = make_data_parallel(head, opt.distributed, opt.device)
    head_parameters = head.parameters()
    head.cuda()
    return head, head_parameters


def main_worker(index, opt):
    """Run training/validation or inference for one worker process.

    When ``opt.inference`` is ``False`` the model is built with
    ``use_features=True`` and trained, optionally with an audio and/or an
    image teacher head (``opt.use_audio`` / ``opt.use_image``). When
    ``opt.inference`` is truthy only inference is run.

    Args:
        index: Worker index on this node. When it is non-negative and
            ``opt.device`` is a CUDA device, the worker is bound to
            ``cuda:{index}``. In distributed mode it is also combined with
            ``opt.dist_rank`` and ``opt.ngpus_per_node`` to form the
            process rank.
        opt: Options namespace. It is mutated in place: ``device``,
            ``dist_rank``, ``batch_size``, ``n_threads``,
            ``is_master_node`` and ``begin_epoch`` may be overwritten.

    Side effects:
        Writes checkpoints (periodic ``save_{epoch}*.pth`` and best
        ``save_model.pth``) under ``opt.result_path``.
    """
    # Seed every RNG in use so that runs are repeatable.
    random.seed(opt.manual_seed)
    np.random.seed(opt.manual_seed)
    torch.manual_seed(opt.manual_seed)

    if index >= 0 and opt.device.type == 'cuda':
        opt.device = torch.device(f'cuda:{index}')

    if opt.distributed:
        opt.dist_rank = opt.dist_rank * opt.ngpus_per_node + index
        dist.init_process_group(backend='nccl',
                                init_method=opt.dist_url,
                                world_size=opt.world_size,
                                rank=opt.dist_rank)
        # Split the configured batch size and loader threads across the
        # processes of this node (threads are rounded up).
        opt.batch_size = int(opt.batch_size / opt.ngpus_per_node)
        opt.n_threads = int(
            (opt.n_threads + opt.ngpus_per_node - 1) / opt.ngpus_per_node)
    opt.is_master_node = not opt.distributed or opt.dist_rank == 0

    if opt.inference:
        model = generate_model(opt)
    else:
        model = generate_model(opt, use_features=True)

    if opt.batchnorm_sync:
        assert opt.distributed, 'SyncBatchNorm only supports DistributedDataParallel.'
        model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model)
    if opt.pretrain_path:
        model = load_pretrained_model(model, opt.pretrain_path,
                                      opt.n_finetune_classes)
    if opt.resume_path is not None:
        model = resume_model(opt.resume_path, opt.arch, model)
    model = make_data_parallel(model, opt.distributed, opt.device)

    if opt.pretrain_path:
        parameters = get_fine_tuning_parameters(model, opt.ft_begin_module)
    else:
        parameters = model.parameters()

    if opt.is_master_node:
        print(model)

    ##########################################################################
    ### here add a classifier to predict videos and audios
    # NOTE(review): everything up to and including the epoch loop is kept
    # under this branch because it depends on names that are only defined
    # here (criterion, optimizer_av, ...) - confirm against the original
    # layout.
    if opt.inference is False:
        ### define loss
        criterion = CrossEntropyLoss().to(opt.device)

        if opt.use_audio or opt.use_image:
            criterion_jsd = JSDLoss(weight=0.5)

        ######################################################################
        if opt.use_audio:
            ### define loss
            criterion_ct_av = NCELoss(temperature=0.5)
            ### audio teacher model
            joint_prediction_aud, aud_para = _build_joint_prediction_head(
                opt, '_audio')
        else:
            aud_para = None

        ######################################################################
        if opt.use_image:
            ### define loss
            criterion_ct_iv = NCELoss(temperature=0.1)
            ### image teacher model
            image_model = torchvision.models.resnet34(pretrained=True)
            # remove the fc layers (only use the image features)
            image_model = torch.nn.Sequential(
                *list(image_model.children())[:-1])
            image_model = make_data_parallel(image_model, opt.distributed,
                                             opt.device)
            joint_prediction_img, img_para = _build_joint_prediction_head(
                opt, '_image')
        else:
            img_para = None

        ######################################################################
        (train_loader, train_sampler, train_logger, train_batch_logger,
         optimizer, optimizer_av, optimizer_iv, scheduler) = \
            get_train_utils(opt,
                            model_parameters=parameters,
                            av_parameters=aud_para,
                            iv_parameters=img_para)

        if opt.resume_path is not None:
            opt.begin_epoch, optimizer, scheduler = resume_train_utils(
                opt.resume_path, opt.begin_epoch, optimizer, scheduler)
            if opt.overwrite_milestones:
                scheduler.milestones = opt.multistep_milestones

        if not opt.no_val:
            val_loader, val_logger = get_val_utils(opt)

        if opt.tensorboard and opt.is_master_node:
            from torch.utils.tensorboard import SummaryWriter
            if opt.begin_epoch == 1:
                tb_writer = SummaryWriter(log_dir=opt.result_path)
            else:
                # Resuming: pass the starting epoch as purge_step.
                tb_writer = SummaryWriter(log_dir=opt.result_path,
                                          purge_step=opt.begin_epoch)
        else:
            tb_writer = None

        prev_val_loss = None
        pre_val_acc = 0.0  # best validation accuracy seen so far
        # image_size is only forwarded when it exceeds the sample size.
        if opt.image_size > opt.sample_size:
            image_size = opt.image_size
        else:
            image_size = None

        for i in range(opt.begin_epoch, opt.n_epochs + 1):
            if not opt.no_train:
                if opt.distributed:
                    train_sampler.set_epoch(i)
                current_lr = get_lr(optimizer)

                # Pick the training routine from which auxiliary optimizers
                # get_train_utils returned.
                if optimizer_av is None and optimizer_iv is None:
                    train_epoch(epoch=i,
                                data_loader=train_loader,
                                model=model,
                                criterion=criterion,
                                optimizer=optimizer,
                                device=opt.device,
                                current_lr=current_lr,
                                epoch_logger=train_logger,
                                batch_logger=train_batch_logger,
                                tb_writer=tb_writer,
                                distributed=opt.distributed)
                elif optimizer_av is not None and optimizer_iv is None:
                    train_a_epoch(epoch=i,
                                  data_loader=train_loader,
                                  model=model,
                                  joint_prediction_aud=joint_prediction_aud,
                                  criterion=criterion,
                                  criterion_jsd=criterion_jsd,
                                  criterion_ct_av=criterion_ct_av,
                                  optimizer=optimizer,
                                  optimizer_av=optimizer_av,
                                  device=opt.device,
                                  current_lr=current_lr,
                                  epoch_logger=train_logger,
                                  batch_logger=train_batch_logger,
                                  tb_writer=tb_writer,
                                  distributed=opt.distributed)
                elif optimizer_av is None and optimizer_iv is not None:
                    train_i_epoch(epoch=i,
                                  data_loader=train_loader,
                                  model=model,
                                  image_model=image_model,
                                  joint_prediction_img=joint_prediction_img,
                                  criterion=criterion,
                                  criterion_jsd=criterion_jsd,
                                  criterion_ct_iv=criterion_ct_iv,
                                  optimizer=optimizer,
                                  optimizer_iv=optimizer_iv,
                                  device=opt.device,
                                  current_lr=current_lr,
                                  epoch_logger=train_logger,
                                  batch_logger=train_batch_logger,
                                  tb_writer=tb_writer,
                                  distributed=opt.distributed,
                                  image_size=image_size)
                else:
                    train_ai_epoch(epoch=i,
                                   data_loader=train_loader,
                                   model=model,
                                   image_model=image_model,
                                   joint_prediction_aud=joint_prediction_aud,
                                   joint_prediction_img=joint_prediction_img,
                                   criterion=criterion,
                                   criterion_jsd=criterion_jsd,
                                   criterion_ct_av=criterion_ct_av,
                                   criterion_ct_iv=criterion_ct_iv,
                                   optimizer=optimizer,
                                   optimizer_av=optimizer_av,
                                   optimizer_iv=optimizer_iv,
                                   device=opt.device,
                                   current_lr=current_lr,
                                   epoch_logger=train_logger,
                                   batch_logger=train_batch_logger,
                                   tb_writer=tb_writer,
                                   distributed=opt.distributed,
                                   image_size=image_size,
                                   loss_weight=opt.loss_weight)

                # Only the master node writes periodic checkpoints.
                if i % opt.checkpoint == 0 and opt.is_master_node:
                    save_file_path = opt.result_path / 'save_{}.pth'.format(i)
                    save_checkpoint(save_file_path, i, opt.arch, model,
                                    optimizer, scheduler)
                    # NOTE(review): the teacher heads are saved together
                    # with the main optimizer/scheduler, not optimizer_av /
                    # optimizer_iv - confirm this is intended.
                    if opt.use_audio:
                        save_file_path = (
                            opt.result_path / 'save_{}_audio.pth'.format(i))
                        save_checkpoint(save_file_path, i, opt.arch,
                                        joint_prediction_aud, optimizer,
                                        scheduler)
                    if opt.use_image:
                        save_file_path = (
                            opt.result_path / 'save_{}_image.pth'.format(i))
                        save_checkpoint(save_file_path, i, opt.arch,
                                        joint_prediction_img, optimizer,
                                        scheduler)

            if not opt.no_val and i % opt.val_freq == 0:
                prev_val_loss, val_acc = val_epoch(i, val_loader, model,
                                                   criterion, opt.device,
                                                   val_logger, tb_writer,
                                                   opt.distributed)
                if pre_val_acc < val_acc:
                    # Track the best accuracy on every process, but let only
                    # the master node write the file so that distributed
                    # ranks do not race on save_model.pth (same rule as the
                    # periodic checkpoints above).
                    pre_val_acc = val_acc
                    if opt.is_master_node:
                        save_file_path = opt.result_path / 'save_model.pth'
                        save_checkpoint(save_file_path, i, opt.arch, model,
                                        optimizer, scheduler)

            if not opt.no_train and opt.lr_scheduler == 'multistep':
                scheduler.step()
            elif not opt.no_train and opt.lr_scheduler == 'plateau':
                # Step only once a validation loss is available.
                if prev_val_loss is not None:
                    scheduler.step(prev_val_loss)

    if opt.inference:
        inference_loader, inference_class_names = get_inference_utils(opt)
        inference_result_path = opt.result_path / '{}.json'.format(
            opt.inference_subset)

        inference.inference(inference_loader, model, inference_result_path,
                            inference_class_names, opt.inference_no_average,
                            opt.output_topk)