def runExperiment():
    """Train the configured model, checkpointing every epoch and tracking the best pivot metric.

    Reads all settings from the global ``cfg``; the RNG seed is the leading
    underscore-separated field of ``cfg['model_tag']``.
    """
    seed = int(cfg['model_tag'].split('_')[0])
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    dataset = fetch_dataset(cfg['data_name'], cfg['subset'])
    process_dataset(dataset['train'])
    data_loader = make_data_loader(dataset)
    # NOTE(review): eval() on a config-derived string — safe only if cfg is trusted.
    model = eval('models.{}().to(cfg["device"])'.format(cfg['model_name']))
    optimizer = make_optimizer(model)
    scheduler = make_scheduler(optimizer)
    if cfg['resume_mode'] == 1:
        # Full resume: restore training state and the existing logger.
        last_epoch, model, optimizer, scheduler, logger = resume(
            model, cfg['model_tag'], optimizer, scheduler)
    elif cfg['resume_mode'] == 2:
        # Weights-only resume: restart epoch counting with a fresh logger.
        last_epoch = 1
        _, model, _, _, _ = resume(model, cfg['model_tag'])
        current_time = datetime.datetime.now().strftime('%b%d_%H-%M-%S')
        logger_path = 'output/runs/{}_{}'.format(cfg['model_tag'], current_time)
        logger = Logger(logger_path)
    else:
        # Fresh run.
        last_epoch = 1
        current_time = datetime.datetime.now().strftime('%b%d_%H-%M-%S')
        logger_path = 'output/runs/train_{}_{}'.format(cfg['model_tag'], current_time)
        logger = Logger(logger_path)
    if cfg['world_size'] > 1:
        model = torch.nn.DataParallel(model, device_ids=list(range(cfg['world_size'])))
    for epoch in range(last_epoch, cfg['num_epochs'] + 1):
        logger.safe(True)
        train(data_loader['train'], model, optimizer, logger, epoch)
        # NOTE(review): evaluation is fed data_loader['train'] although the
        # metric is logged under 'test/...' and drives best-model selection —
        # confirm whether data_loader['test'] was intended.
        test(data_loader['train'], model, logger, epoch)
        if cfg['scheduler_name'] == 'ReduceLROnPlateau':
            scheduler.step(
                metrics=logger.mean['test/{}'.format(cfg['pivot_metric'])])
        else:
            scheduler.step()
        logger.safe(False)
        # Unwrap DataParallel before saving so the checkpoint loads in either mode.
        model_state_dict = model.module.state_dict(
        ) if cfg['world_size'] > 1 else model.state_dict()
        save_result = {
            'cfg': cfg,
            'epoch': epoch + 1,
            'model_dict': model_state_dict,
            'optimizer_dict': optimizer.state_dict(),
            'scheduler_dict': scheduler.state_dict(),
            'logger': logger
        }
        save(save_result, './output/model/{}_checkpoint.pt'.format(cfg['model_tag']))
        # Lower pivot metric is treated as better here ('>' comparison) —
        # presumably a loss-like metric; verify against cfg['pivot'] init.
        if cfg['pivot'] > logger.mean['test/{}'.format(cfg['pivot_metric'])]:
            cfg['pivot'] = logger.mean['test/{}'.format(cfg['pivot_metric'])]
            shutil.copy(
                './output/model/{}_checkpoint.pt'.format(cfg['model_tag']),
                './output/model/{}_best.pt'.format(cfg['model_tag']))
        logger.reset()
    logger.safe(False)
    return
def runExperiment():
    """Evaluate the best federated checkpoint on the test split and save merged results.

    Loads the globally-rated model from the 'best' checkpoint, runs the test
    pass, then recovers the training logger from the 'checkpoint' tag so both
    histories are persisted together under ./output/result/.
    """
    # Test-time evaluation uses the test batch size for the (unused) train loader too.
    cfg['batch_size']['train'] = cfg['batch_size']['test']
    # Seed CPU and CUDA RNGs from the leading field of the model tag.
    seed = int(cfg['model_tag'].split('_')[0])
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    dataset = fetch_dataset(cfg['data_name'], cfg['subset'])
    process_dataset(dataset)
    # Fixed: the original eval() string chained .to(cfg["device"]) twice;
    # getattr also avoids eval() on a config-derived format string.
    model = getattr(models, cfg['model_name'])(
        model_rate=cfg['global_model_rate']).to(cfg['device'])
    last_epoch, data_split, label_split, model, _, _, _ = resume(
        model, cfg['model_tag'], load_tag='best', strict=False)
    current_time = datetime.datetime.now().strftime('%b%d_%H-%M-%S')
    logger_path = 'output/runs/test_{}_{}'.format(cfg['model_tag'], current_time)
    test_logger = Logger(logger_path)
    test_logger.safe(True)
    test(dataset['test'], model, test_logger, last_epoch)
    test_logger.safe(False)
    # Recover the training history from the periodic checkpoint.
    _, _, _, _, _, _, train_logger = resume(model, cfg['model_tag'],
                                            load_tag='checkpoint', strict=False)
    save_result = {
        'cfg': cfg,
        'epoch': last_epoch,
        'logger': {
            'train': train_logger,
            'test': test_logger
        }
    }
    save(save_result, './output/result/{}.pt'.format(cfg['model_tag']))
    return
def runExperiment():
    """Run federated training: split data across users, aggregate via Federation, checkpoint per round."""
    seed = int(cfg['model_tag'].split('_')[0])
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    dataset = fetch_dataset(cfg['data_name'], cfg['subset'])
    process_dataset(dataset)
    # Global model instantiated at the full (global) width multiplier.
    model = eval('models.{}(model_rate=cfg["global_model_rate"]).to(cfg["device"])'.format(cfg['model_name']))
    optimizer = make_optimizer(model, cfg['lr'])
    scheduler = make_scheduler(optimizer)
    if cfg['resume_mode'] == 1:
        # Full resume: training state, data partition, and logger all restored.
        last_epoch, data_split, label_split, model, optimizer, scheduler, logger = resume(model, cfg['model_tag'],
                                                                                          optimizer, scheduler)
    elif cfg['resume_mode'] == 2:
        # Weights + partition resume with a fresh logger and epoch counter.
        last_epoch = 1
        _, data_split, label_split, model, _, _, _ = resume(model, cfg['model_tag'])
        current_time = datetime.datetime.now().strftime('%b%d_%H-%M-%S')
        logger_path = 'output/runs/{}_{}'.format(cfg['model_tag'], current_time)
        logger = Logger(logger_path)
    else:
        # Fresh run: create the per-user data partition now.
        last_epoch = 1
        data_split, label_split = split_dataset(dataset, cfg['num_users'], cfg['data_split_mode'])
        current_time = datetime.datetime.now().strftime('%b%d_%H-%M-%S')
        logger_path = 'output/runs/train_{}_{}'.format(cfg['model_tag'], current_time)
        logger = Logger(logger_path)
    # Older checkpoints may lack a partition; regenerate it if missing.
    if data_split is None:
        data_split, label_split = split_dataset(dataset, cfg['num_users'], cfg['data_split_mode'])
    global_parameters = model.state_dict()
    federation = Federation(global_parameters, cfg['model_rate'], label_split)
    for epoch in range(last_epoch, cfg['num_epochs']['global'] + 1):
        logger.safe(True)
        train(dataset['train'], data_split['train'], label_split, federation, model, optimizer, logger, epoch)
        # Recompute batch-norm statistics on train data before evaluating.
        test_model = stats(dataset['train'], model)
        test(dataset['test'], data_split['test'], label_split, test_model, logger, epoch)
        if cfg['scheduler_name'] == 'ReduceLROnPlateau':
            # NOTE(review): plateau scheduler watches the TRAIN pivot metric
            # while best-model selection below uses the TEST metric — confirm intended.
            scheduler.step(metrics=logger.mean['train/{}'.format(cfg['pivot_metric'])])
        else:
            scheduler.step()
        logger.safe(False)
        model_state_dict = model.state_dict()
        save_result = {
            'cfg': cfg,
            'epoch': epoch + 1,
            'data_split': data_split,
            'label_split': label_split,
            'model_dict': model_state_dict,
            'optimizer_dict': optimizer.state_dict(),
            'scheduler_dict': scheduler.state_dict(),
            'logger': logger}
        save(save_result, './output/model/{}_checkpoint.pt'.format(cfg['model_tag']))
        # Higher pivot metric is better here ('<' comparison), unlike the
        # non-federated script above — presumably accuracy-like; verify.
        if cfg['pivot'] < logger.mean['test/{}'.format(cfg['pivot_metric'])]:
            cfg['pivot'] = logger.mean['test/{}'.format(cfg['pivot_metric'])]
            shutil.copy('./output/model/{}_checkpoint.pt'.format(cfg['model_tag']),
                        './output/model/{}_best.pt'.format(cfg['model_tag']))
        logger.reset()
    logger.safe(False)
    return
def runExperiment():
    """Load the best trained model and generate samples from it.

    For PixelCNN-family models the pretrained autoencoder (``cfg['ae_name']``)
    is also restored, since sampling happens in its latent space; otherwise no
    autoencoder is used.
    """
    # Seed CPU and CUDA RNGs from the leading field of the model tag.
    seed = int(cfg['model_tag'].split('_')[0])
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    dataset = fetch_dataset(cfg['data_name'], cfg['subset'])
    process_dataset(dataset['train'])
    if 'pixelcnn' in cfg['model_name']:
        # getattr replaces the original eval() on a config-derived string.
        ae = getattr(models, cfg['ae_name'])().to(cfg['device'])
        _, ae, _, _, _ = resume(ae, cfg['ae_tag'], load_tag='best')
    else:
        ae = None
    model = getattr(models, cfg['model_name'])().to(cfg['device'])
    _, model, _, _, _ = resume(model, cfg['model_tag'], load_tag='best')
    generate(model, ae)
    return
def run_test(args_dict):
    """Restore the checkpointed classifier and run one validation pass over the test loader."""
    print("Start test of model...")
    # Choose the compute device and announce which one is in use.
    on_gpu = torch.cuda.is_available()
    args_dict.device = torch.device("cuda:0" if on_gpu else "cpu")
    print("Running on the GPU" if on_gpu else "Running on the CPU")
    # Testing always restores weights from the checkpoint.
    args_dict.resume = True
    # Build the requested architecture (ResNet is the default fallback).
    if args_dict.model == "covidnet":
        net = CovidNet(args_dict.n_classes)
    else:
        net = ResNet(args_dict.n_classes)
    net.to(args_dict.device)
    opt = torch.optim.Adam(net.parameters(), lr=args_dict.lr)
    best_sensit, net, opt = utils.resume(args_dict, net, opt)
    dl_test = calculateDataLoaderTest(args_dict)
    valEpoch(args_dict, dl_test, net)
def resume(id):
    """Populate the shared ``config`` with checkpoint paths looked up via ``utils.resume``.

    Mutates the module-level ``config`` in place; returns nothing.
    NOTE: the parameter name ``id`` shadows the builtin — kept unchanged so
    keyword callers (``resume(id=...)``) keep working.
    """
    fpath = utils.resume(id)
    # One entry per checkpointed component: generator, its optimizer, and the
    # a/b model pairs with their optimizers (replaces six copy-paste assignments).
    for key in ("g_dir", "goptim_dir", "a_dir", "aoptim_dir", "b_dir", "boptim_dir"):
        config[key] = fpath[key]
def runExperiment():
    """Restore the best checkpoint of the configured model and hand it to ``transit``."""
    # The RNG seed is encoded as the leading field of the model tag.
    rng_seed = int(cfg['model_tag'].split('_')[0])
    torch.manual_seed(rng_seed)
    torch.cuda.manual_seed(rng_seed)
    # Fetch the dataset and preprocess its train split.
    dataset = fetch_dataset(cfg['data_name'], cfg['subset'])
    process_dataset(dataset['train'])
    # Instantiate the configured model on the configured device.
    model = eval('models.{}().to(cfg["device"])'.format(cfg['model_name']))
    # Only the restored model is needed; remaining checkpoint fields are discarded.
    _, model, _, _, _ = resume(model, cfg['model_tag'], load_tag='best')
    transit(model)
    return
def runExperiment():
    """Evaluate the best model together with its autoencoder and persist the logged metrics."""
    # The RNG seed is the leading field of the model tag.
    tag_seed = int(cfg['model_tag'].split('_')[0])
    torch.manual_seed(tag_seed)
    torch.cuda.manual_seed(tag_seed)
    # Prepare the data pipeline.
    dataset = fetch_dataset(cfg['data_name'], cfg['subset'])
    process_dataset(dataset['train'])
    data_loader = make_data_loader(dataset)
    # Restore the pretrained autoencoder from its best checkpoint.
    ae = eval('models.{}().to(cfg["device"])'.format(cfg['ae_name']))
    _, ae, _, _, _ = resume(ae, cfg['ae_tag'], load_tag='best')
    # Restore the model from its best checkpoint.
    model = eval('models.{}().to(cfg["device"])'.format(cfg['model_name']))
    last_epoch, model, _, _, _ = resume(model, cfg['model_tag'], load_tag='best')
    run_stamp = datetime.datetime.now().strftime('%b%d_%H-%M-%S')
    logger = Logger('output/runs/train_{}_{}'.format(cfg['model_tag'], run_stamp))
    logger.safe(True)
    test(data_loader['train'], ae, model, logger, last_epoch)
    logger.safe(False)
    save({'cfg': cfg, 'epoch': last_epoch, 'logger': logger},
         './output/result/{}.pt'.format(cfg['model_tag']))
    return
def run_gradcam(args_dict):
    """Produce a Grad-CAM heatmap for a single test image and display it.

    Returns (heatmap, image, output) for the selected batch.
    """
    # call this function to get the gradcam pictures and output - only for one picture
    print("Start test of model...")
    args_dict.batch = 1
    # Set up device
    if torch.cuda.is_available():
        args_dict.device = torch.device(
            "cuda:0"
        )  # you can continue going on here, like cuda:1 cuda:2....etc.
        print("Running on the GPU")
    else:
        args_dict.device = torch.device("cpu")
        print("Running on the CPU")
    args_dict.resume = True
    # Define model
    if args_dict.model == "covidnet":
        model = CovidNet(args_dict.n_classes)
    else:
        model = ResNet(args_dict.n_classes)
    model.to(args_dict.device)
    optimizer = torch.optim.Adam(model.parameters(), lr=args_dict.lr)
    best_sensit, model, optimizer = utils.resume(args_dict, model, optimizer)
    dl_test = eval.calculateDataLoaderTest(args_dict)
    # NOTE(review): this loop consumes batches until batch_idx == 2 and breaks,
    # so the THIRD batch is the one analyzed below — confirm that is intended.
    for batch_idx, (x_batch, y_batch, _) in enumerate(dl_test):
        x_batch, y_batch = x_batch.to(args_dict.device), y_batch.to(
            args_dict.device)
        if batch_idx == 2:
            break
    output = model(x_batch)
    pred = np.argmax(output.cpu().data.numpy(), axis=1)
    # Dispatch to the architecture-specific Grad-CAM implementation.
    if args_dict.model == 'resnet':
        heatmap, image = grad_cam(model, x_batch)
    elif args_dict.model == 'covidnet':
        heatmap, image = grad_cam_covid(model, x_batch, output, pred)
    # plt.imshow(image, interpolation='nearest')
    print(heatmap.shape)
    print(image.shape)
    plt.imshow(heatmap)
    plt.show()
    return heatmap, image, output
def run_calibration(args_dict):
    """Apply temperature scaling for calibration and save the calibrated model."""
    print("Start calibration of model...")
    args_dict.resume = True
    # Define model
    # NOTE(review): if args_dict.model is neither "covidnet" nor "resnet",
    # ``model`` is never bound and model.to(...) below raises NameError.
    if args_dict.model == "covidnet":
        model = CovidNet(args_dict.n_classes)
    elif args_dict.model == "resnet":
        model = ResNet(args_dict.n_classes)
    # Set up device
    if torch.cuda.is_available():
        args_dict.device = torch.device(
            "cuda:0"
        )  # you can continue going on here, like cuda:1 cuda:2....etc.
        print("Running on the GPU")
    else:
        args_dict.device = torch.device("cpu")
        print("Running on the CPU")
    model.to(args_dict.device)
    optimizer = torch.optim.Adam(model.parameters(), lr=args_dict.lr)
    dl_test = eval.calculateDataLoaderTest(args_dict)
    _, model, _ = utils.resume(args_dict, model, optimizer)
    # model.eval()
    # Wrap the model and fit a single temperature parameter on the test loader.
    scaled_model = ModelWithTemperature(model)
    scaled_model.set_temperature(dl_test, args_dict.device)
    print("saving calibrated model")
    utils.save_model(
        args_dict, {
            'epoch': args_dict.start_epoch,
            'state_dict': scaled_model.state_dict(),
            'optimizer': optimizer.state_dict()
        })
    plot_calibration(args_dict)
def main_train_loop(save_dir, model, args):
    """Train a class-conditioned point-cloud VAE, checkpointing and tracking the best dev metric.

    Saves periodic checkpoints under ``save_dir``, keeps ``checkpoint-best.pt``
    for the lowest dev reconstruction loss, and finally renders a visualization
    from the last training batch and re-evaluates the best checkpoint.
    """
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    n_class = len(args.cates)
    # resume checkpoint
    start_epoch = 0
    optimizer = initilize_optimizer(model, args)
    if args.resume_checkpoint is None and os.path.exists(
            os.path.join(save_dir, 'checkpoint-latest.pt')):
        args.resume_checkpoint = os.path.join(
            save_dir, 'checkpoint-latest.pt')  # use the latest checkpoint
    if args.resume_checkpoint is not None:
        if args.resume_optimizer:
            model, optimizer, start_epoch = resume(
                args.resume_checkpoint,
                model,
                optimizer,
                strict=(not args.resume_non_strict))
        else:
            model, _, start_epoch = resume(args.resume_checkpoint,
                                           model,
                                           optimizer=None,
                                           strict=(not args.resume_non_strict))
        print('Resumed from: ' + args.resume_checkpoint)
    # initialize dataset and loaders
    tr_dataset, te_dataset = get_datasets(args)
    train_sampler = None  # for non distributed training
    # NOTE(review): worker_init_fn=np.random.seed(args.seed) CALLS seed() here
    # and passes None as the init fn (with num_workers=0 it is unused anyway) —
    # likely meant to pass a callable; confirm.
    train_loader = torch.utils.data.DataLoader(dataset=tr_dataset,
                                               batch_size=args.batch_size,
                                               shuffle=(train_sampler is None),
                                               num_workers=0,
                                               pin_memory=True,
                                               sampler=train_sampler,
                                               drop_last=True,
                                               worker_init_fn=np.random.seed(
                                                   args.seed))
    test_loader = torch.utils.data.DataLoader(dataset=te_dataset,
                                              batch_size=args.batch_size,
                                              shuffle=False,
                                              num_workers=0,
                                              pin_memory=True,
                                              drop_last=False,
                                              worker_init_fn=np.random.seed(
                                                  args.seed))
    # initialize the learning rate scheduler
    if args.scheduler == 'exponential':
        scheduler = optim.lr_scheduler.ExponentialLR(optimizer, args.exp_decay)
    elif args.scheduler == 'step':
        scheduler = optim.lr_scheduler.StepLR(optimizer,
                                              step_size=args.epochs // 2,
                                              gamma=0.1)
    elif args.scheduler == 'linear':
        # Linearly decay the LR to 0 over the second half of training.
        def lambda_rule(ep):
            lr_l = 1.0 - max(0, ep - 0.5 * args.epochs) / float(
                0.5 * args.epochs)
            return lr_l
        scheduler = optim.lr_scheduler.LambdaLR(optimizer,
                                                lr_lambda=lambda_rule)
    else:
        assert 0, "args.schedulers should be either 'exponential' or 'linear'"
    # training starts from here
    tot_nelbo = []
    tot_kl_loss = []
    tot_x_reconst = []
    best_eval_metric = float('+inf')
    for epoch in range(start_epoch, args.epochs):
        # adjust the learning rate
        # NOTE(review): scheduler.step(epoch=...) is deprecated in torch and
        # only fires every exp_decay_freq epochs — confirm intended schedule.
        if (epoch + 1) % args.exp_decay_freq == 0:
            scheduler.step(epoch=epoch)
        # train for one epoch
        model.train()
        for bidx, data in enumerate(train_loader):
            idx_batch, tr_batch, te_batch = data['idx'], data[
                'train_points'], data['test_points']
            obj_type = data['cate_idx']
            # One-hot class label built on the same device/dtype family as obj_type.
            y_one_hot = obj_type.new(
                np.eye(n_class)[obj_type]).to(device).float()
            step = bidx + len(train_loader) * epoch
            if args.random_rotate:
                tr_batch, _, _ = apply_random_rotation(
                    tr_batch, rot_axis=train_loader.dataset.gravity_axis)
            inputs = tr_batch.to(device)
            y_one_hot = y_one_hot.to(device)
            optimizer.zero_grad()
            inputs_dict = {'x': inputs, 'y_class': y_one_hot}
            ret = model(inputs_dict)
            loss, nelbo, kl_loss, x_reconst, cl_loss = ret['loss'], ret[
                'nelbo'], ret['kl_loss'], ret['x_reconst'], ret['cl_loss']
            loss.backward()
            optimizer.step()
            cur_loss = loss.cpu().item()
            cur_nelbo = nelbo.cpu().item()
            cur_kl_loss = kl_loss.cpu().item()
            cur_x_reconst = x_reconst.cpu().item()
            cur_cl_loss = cl_loss.cpu().item()
            tot_nelbo.append(cur_nelbo)
            tot_kl_loss.append(cur_kl_loss)
            tot_x_reconst.append(cur_x_reconst)
            if step % args.log_freq == 0:
                print(
                    "Epoch {0:6d} Step {1:12d} Loss {2:12.6f} Nelbo {3:12.6f} KL Loss {4:12.6f} Reconst Loss {5:12.6f} CL_Loss{6:12.6f}"
                    .format(epoch, step, cur_loss, cur_nelbo, cur_kl_loss,
                            cur_x_reconst, cur_cl_loss))
        # save checkpoint
        if (epoch + 1) % args.save_freq == 0:
            save(model, optimizer, epoch + 1,
                 os.path.join(save_dir, 'checkpoint-%d.pt' % epoch))
            save(model, optimizer, epoch + 1,
                 os.path.join(save_dir, 'checkpoint-latest.pt'))
            eval_metric = evaluate_model(model, te_dataset, args)
            train_metric = evaluate_model(model, tr_dataset, args)
            print('Checkpoint: Dev Reconst Loss:{0}, Train Reconst Loss:{1}'.
                  format(eval_metric, train_metric))
            if eval_metric < best_eval_metric:
                best_eval_metric = eval_metric
                save(model, optimizer, epoch + 1,
                     os.path.join(save_dir, 'checkpoint-best.pt'))
                print('new best model found!')
    save(model, optimizer, args.epochs,
         os.path.join(save_dir, 'checkpoint-latest.pt'))
    # save final visualization of samples (uses the LAST training batch)
    model.eval()
    with torch.no_grad():
        samples_A = model.reconstruct_input(inputs)
        # sample_point(5)
        results = []
        for idx in range(5):
            res = visualize_point_clouds(
                samples_A[idx],
                tr_batch[idx],
                idx,
                pert_order=train_loader.dataset.display_axis_order)
            results.append(res)
        res = np.concatenate(results, axis=1)
        imsave(os.path.join(save_dir, 'images', '_epoch%d.png' % (epoch)),
               res.transpose((1, 2, 0)))
    # load the best model and compute eval metric:
    best_model_path = os.path.join(save_dir, 'checkpoint-best.pt')
    ckpt = torch.load(best_model_path)
    model.load_state_dict(ckpt['model'], strict=True)
    eval_metric = evaluate_model(model, te_dataset, args)
    train_metric = evaluate_model(model, tr_dataset, args)
    print(
        'Best model at epoch:{2} Dev Reconst Loss:{0}, Train Reconst Loss:{1}'.
        format(eval_metric, train_metric, ckpt['epoch']))
def main(
        batch_size,
        nworkers,
        outdir,
        num_epochs,
        snapshot,
        finetune,
        lr,
        lradapt,
        experiment,
        labelimage,
        smoketest=False,
        trainpath=None,
        validpath=None):
    """Set up data loaders, network, optimizer, and trainer for one Houston segmentation experiment.

    ``experiment`` selects the architecture; ``snapshot`` resumes full training
    state while ``finetune`` loads weights only.
    """
    np.random.seed(0)
    torch.manual_seed(0)
    # Visdom environment
    visdom_environment = experiment + "_" + labelimage.replace(".tif", "")
    outdir = os.path.join(outdir, visdom_environment)
    # Fall back to environment variables when paths are not given explicitly.
    if validpath is None:
        validpath = os.environ[VALIDATA_ENVIRONMENT_VARIABLE]
    if trainpath is None:
        trainpath = os.environ[TRAINDATA_ENVIRONMENT_VARIABLE]
    train = train_houston_data_loader(trainpath, batch_size=batch_size,
                                      num_workers=nworkers, shuffle=True,
                                      validation=False, labelimage=labelimage)
    val = val_houston_data_loader(validpath, batch_size=batch_size,
                                  num_workers=nworkers, shuffle=True,
                                  validation=True, labelimage=labelimage)
    # One network per experiment id (sensor combination).
    if experiment == "vhr":
        network = pspnet_10m()
    elif experiment == "s1":
        network = input_keep_res_net_34_s1_all()
    elif experiment == "s2":
        network = input_keep_res_net_34_s2_all()
    elif experiment == "vhrs1":
        network = pspnet_fused_s1_10m()
    elif experiment == "vhrs2":
        network = pspnet_fused_s2_10m()
    elif experiment == "s1s2":
        network = psp34_sentinel1_and_sentinel2()
    elif experiment == "vhrs1s2":
        network = pspnet_fused_s1s2_10m()
    else:
        raise ValueError("Please insert a valid experiment id. Valid experiments are 'vhr', 's1', 's2', 'vhrs1, 'vhrs2', 'vhrs1s2'")
    network = nn.DataParallel(network)
    if torch.cuda.is_available():
        network = network.cuda()
    # Weights-only restore for either finetune or snapshot path.
    if finetune or snapshot:
        resume(finetune or snapshot, network, None)
    optimizer = optim.Adam(network.parameters(), lr=lr)
    scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=3, gamma=lradapt)
    # A snapshot additionally restores the optimizer and iteration counter.
    if snapshot:
        state = resume(snapshot, None, optimizer)
        train.iterations = state['iteration']
    # NOTE(review): nn.NLLLoss2d is deprecated in modern torch (nn.NLLLoss
    # handles the 2d case) — works only on older versions; confirm pinning.
    loss = nn.NLLLoss2d()
    if torch.cuda.is_available():
        loss = loss.cuda()
    trainer = Trainer(
        network, optimizer, scheduler, loss, train, val, outdir,
        visdom_environment, smoketest
    )
    trainer.train(num_epochs, start_epoch=0)
def main():
    """Configure and train a PointFlow model on ShapeNet15k with hard-coded hyper-parameters.

    Builds the args object inline, optionally resumes from ``args.resume_path``,
    then runs the epoch loop, saving a numbered milestone and a rolling
    'latest' milestone every ``save_freq`` epochs.
    """
    torch.backends.cudnn.benchmark = True
    # hyper-params initializing
    args = dictobj()
    args.gpu = torch.device('cuda:%d' % (6))
    timestamp = '%d-%d-%d-%d-%d-%d-%d-%d-%d' % time.localtime(time.time())
    args.log_name = '%s-pointflow' % timestamp
    writer = SummaryWriter(comment=args.log_name)
    args.use_latent_flow, args.prior_w, args.entropy_w, args.recon_w = True, 1., 1., 1.
    args.fin, args.fz = 3, 128
    args.use_deterministic_encoder = True
    args.distributed = False
    args.optimizer = optim.Adam
    args.batch_size = 16
    args.lr, args.beta1, args.beta2, args.weight_decay = 1e-3, 0.9, 0.999, 1e-4
    args.T, args.train_T, args.atol, args.rtol = 1., False, 1e-5, 1e-5
    args.layer_type = diffop.CoScaleLinear
    args.solver = 'dopri5'
    args.use_adjoint, args.bn = True, False
    args.dims, args.num_blocks = (512, 512), 1  # originally (512 * 3)
    args.latent_dims, args.latent_num_blocks = (256, 256), 1
    args.resume, args.resume_path = False, None
    args.end_epoch = 2000
    args.scheduler, args.scheduler_step_size = optim.lr_scheduler.StepLR, 20
    args.random_rotation = True
    args.save_freq = 10
    args.dataset_type = 'shapenet15k'
    args.cates = ['airplane']  # 'all' for all categories training
    args.tr_max_sample_points, args.te_max_sample_points = 2048, 2048
    args.dataset_scale = 1.0
    args.normalize_per_shape = False
    args.normalize_std_per_axis = False
    args.num_workers = 4
    args.data_dir = "/data/ShapeNetCore.v2.PC15k"
    torch.cuda.set_device(args.gpu)
    model = PointFlow(**args).cuda(args.gpu)
    # load milestone
    epoch = 0
    optimizer = model.get_optimizer(**args)
    if args.resume:
        model, optimizer, epoch = resume(args.resume_path, model, optimizer,
                                         strict=True)
        print("Loaded model from %s" % args.resume_path)
    # load data
    train_dataset, test_dataset = get_datasets(args)
    train_loader = torch.utils.data.DataLoader(dataset=train_dataset,
                                               batch_size=args.batch_size,
                                               shuffle=True,
                                               pin_memory=True,
                                               sampler=None,
                                               drop_last=True)
    test_loader = torch.utils.data.DataLoader(dataset=test_dataset,
                                              batch_size=args.batch_size,
                                              shuffle=False,
                                              pin_memory=True,
                                              sampler=None,
                                              drop_last=False)
    if args.scheduler == optim.lr_scheduler.StepLR:
        scheduler = optim.lr_scheduler.StepLR(
            optimizer, step_size=args.scheduler_step_size, gamma=0.65)
    else:
        raise NotImplementedError("Only StepLR supported")
    ent_rec, latent_rec, recon_rec = Averager(), Averager(), Averager()
    for e in trange(epoch, args.end_epoch):
        # record lr (scheduler.get_lr() is deprecated in newer torch in favor
        # of get_last_lr(); kept for compatibility with the pinned version)
        if writer is not None:
            writer.add_scalar('lr/optimizer', scheduler.get_lr()[0], e)
        # feed a batch, train
        for idx, data in enumerate(tqdm(train_loader)):
            idx_batch, tr_batch, te_batch = data['idx'], data[
                'train_points'], data['test_points']
            model.train()
            if args.random_rotation:
                tr_batch, _, _ = apply_random_rotation(
                    tr_batch, rot_axis=train_loader.dataset.gravity_axis)
            inputs = tr_batch.cuda(args.gpu, non_blocking=True)
            step = idx + len(train_loader) * e  # batch step
            out = model(inputs, optimizer, step, writer, sample_gpu=args.gpu)
            entropy, prior_nats, recon_nats = out['entropy'], out[
                'prior_nats'], out['recon_nats']
            ent_rec.update(entropy)
            recon_rec.update(recon_nats)
            latent_rec.update(prior_nats)
        # update lr
        scheduler.step(epoch=e)
        # save milestones
        if e % args.save_freq == 0 and e != 0:
            save(model, optimizer, e, path='milestone-%d.save' % e)
            # Fixed: the original wrote "'milestone-latest.save' % e", which
            # raises TypeError (no % placeholder in the string) on the first save.
            save(model, optimizer, e, path='milestone-latest.save')  # save as latest model
options = interface.gui() input_files = arguments.get_input_files(options.input) stoichiometry = None if options.stoichiometry is not None: stoichiometry = arguments.parse_stoichiometry(options.stoichiometry) if options.verbose: utils.options = options sys.stderr.write("Input correctly parsed.\nFiles used as input:\n") for file in input_files: sys.stderr.write("\t" + file + "\n") sys.stderr.write("\n") # Step 2: get possible structures for macrocomplex construction and skip others if options.resume: (chains, pairs, similar_chains, structures) = utils.resume(options) else: (chains, pairs, similar_chains, structures) = utils.get_information(input_files, options) complexes_found = [] if options.verbose: sys.stderr.write("\n# Beginning to construct the complex\n\n") # STEP4: Begin Macrocomplex reconstruction! def construct_complex(current_complex_real, similar_chains, stoichiometry, structures, used_pairs_real, clashing_real, old_complex_real): # bruteforce ending! current_complex = copy.deepcopy(current_complex_real)
def main_worker(gpu, save_dir, ngpus_per_node, init_data, args):
    """Per-process training worker for SoftPointFlow (single- or multi-GPU/distributed).

    Handles checkpoint resume, Actnorm data-dependent init (via ``init_data``),
    the noisy-input training loop, periodic validation/checkpointing, and
    reconstruction/sample visualizations.
    """
    # basic setup
    cudnn.benchmark = True
    args.gpu = gpu
    if args.gpu is not None:
        print("Use GPU: {} for training".format(args.gpu))
    if args.distributed:
        args.rank = args.rank * ngpus_per_node + gpu
        dist.init_process_group(backend=args.dist_backend,
                                init_method=args.dist_url,
                                world_size=args.world_size,
                                rank=args.rank)
    # resume training!!!
    #################################
    if args.resume_checkpoint is None and os.path.exists(os.path.join(save_dir, 'checkpoint-latest.pt')):
        args.resume_checkpoint = os.path.join(save_dir, 'checkpoint-latest.pt')  # use the latest checkpoint
        print('Checkpoint is set to the latest one.')
    #################################
    # multi-GPU setup
    model = SoftPointFlow(args)
    if args.distributed:  # Multiple processes, single GPU per process
        if args.gpu is not None:
            def _transform_(m):
                return nn.parallel.DistributedDataParallel(
                    m, device_ids=[args.gpu], output_device=args.gpu,
                    check_reduction=True)
            torch.cuda.set_device(args.gpu)
            model.cuda(args.gpu)
            model.multi_gpu_wrapper(_transform_)
            # Per-process batch size when the global batch is split across GPUs.
            args.batch_size = int(args.batch_size / ngpus_per_node)
            args.workers = 0
        else:
            assert 0, "DistributedDataParallel constructor should always set the single device scope"
    else:  # Single process, multiple GPUs per process
        def _transform_(m):
            return nn.DataParallel(m)
        model = model.cuda()
        model.multi_gpu_wrapper(_transform_)
    start_epoch = 1
    valid_loss_best = 987654321  # sentinel "infinity" before any validation
    optimizer = model.make_optimizer(args)
    scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=args.step_size, gamma=args.gamma)
    if args.resume_checkpoint is not None:
        model, optimizer, scheduler, start_epoch, valid_loss_best, log_dir = resume(
            args.resume_checkpoint, model, optimizer, scheduler)
        # Resumed weights already carry initialized Actnorm statistics.
        model.set_initialized(True)
        print('Resumed from: ' + args.resume_checkpoint)
    else:
        log_dir = save_dir + "/runs/" + str(time.strftime('%Y-%m-%d_%H:%M:%S'))
        # Data-dependent Actnorm initialization with one provided batch.
        with torch.no_grad():
            inputs, inputs_noisy, std_in = init_data
            inputs = inputs.to(args.gpu, non_blocking=True)
            inputs_noisy = inputs_noisy.to(args.gpu, non_blocking=True)
            std_in = std_in.to(args.gpu, non_blocking=True)
            _ = model(inputs, inputs_noisy, std_in, optimizer, None, None, init=True)
            del inputs, inputs_noisy, std_in
            print('Actnorm is initialized')
    # Only rank-0 (or the single process) writes TensorBoard logs.
    if not args.distributed or (args.rank % ngpus_per_node == 0):
        writer = SummaryWriter(logdir=log_dir)
    else:
        writer = None
    # initialize datasets and loaders
    tr_dataset = get_trainset(args)
    te_dataset = get_testset(args)
    if args.distributed:
        train_sampler = torch.utils.data.distributed.DistributedSampler(tr_dataset)
        test_sampler = torch.utils.data.distributed.DistributedSampler(te_dataset)
    else:
        train_sampler = None
        test_sampler = None
    train_loader = torch.utils.data.DataLoader(
        dataset=tr_dataset, batch_size=args.batch_size,
        shuffle=(train_sampler is None), num_workers=0, pin_memory=True,
        sampler=train_sampler, drop_last=True, worker_init_fn=init_np_seed)
    test_loader = torch.utils.data.DataLoader(
        dataset=te_dataset, batch_size=args.batch_size,
        shuffle=(test_sampler is None), num_workers=0, pin_memory=True,
        sampler=test_sampler, drop_last=True, worker_init_fn=init_np_seed)
    # save dataset statistics
    if not args.distributed or (args.rank % ngpus_per_node == 0):
        np.save(os.path.join(save_dir, "train_set_mean.npy"), tr_dataset.all_points_mean)
        np.save(os.path.join(save_dir, "train_set_std.npy"), tr_dataset.all_points_std)
        np.save(os.path.join(save_dir, "train_set_idx.npy"), np.array(tr_dataset.shuffle_idx))
    # main training loop
    if args.distributed:
        print("[Rank %d] World size : %d" % (args.rank, dist.get_world_size()))
    # Fixed batches kept for visualization; test_loader is only needed for this.
    seen_inputs = next(iter(train_loader))['train_points'].cuda(args.gpu, non_blocking=True)
    unseen_inputs = next(iter(test_loader))['test_points'].cuda(args.gpu, non_blocking=True)
    del test_loader
    print("Start epoch: %d End epoch: %d" % (start_epoch, args.epochs))
    for epoch in range(start_epoch, args.epochs+1):
        start_time = time.time()
        if args.distributed:
            train_sampler.set_epoch(epoch)
        if writer is not None:
            # get_lr() is deprecated in newer torch (get_last_lr()); kept as-is.
            writer.add_scalar('lr/optimizer', scheduler.get_lr()[0], epoch)
        model.train()
        # train for one epoch
        for bidx, data in enumerate(train_loader):
            step = bidx + len(train_loader) * (epoch - 1)
            tr_batch = data['train_points']
            if args.random_rotate:
                tr_batch, _, _ = apply_random_rotation(
                    tr_batch, rot_axis=train_loader.dataset.gravity_axis)
            inputs = tr_batch.cuda(args.gpu, non_blocking=True)
            B, N, D = inputs.shape
            # Per-point noise std drawn uniformly from [std_min, std_max).
            std = (args.std_max - args.std_min) * torch.rand_like(inputs[:,:,0]).view(B,N,1) + args.std_min
            eps = torch.randn_like(inputs) * std
            std_in = std / args.std_max * args.std_scale
            inputs_noisy = inputs + eps
            out = model(inputs, inputs_noisy, std_in, optimizer, step, writer)
            entropy, prior_nats, recon_nats, loss = out['entropy'], out['prior_nats'], out['recon_nats'], out['loss']
            if step % args.log_freq == 0:
                duration = time.time() - start_time
                start_time = time.time()
                if writer is not None:
                    writer.add_scalar('train/avg_time', duration, step)
                print("[Rank %d] Epoch %d Batch [%2d/%2d] Time [%3.2fs] Entropy %2.5f LatentNats %2.5f PointNats %2.5f loss %2.5f"
                      % (args.rank, epoch, bidx, len(train_loader), duration, entropy, prior_nats, recon_nats, loss))
            del inputs, inputs_noisy, std_in, out, eps
            gc.collect()
        if epoch < args.stop_scheduler:
            scheduler.step()
        if epoch % args.valid_freq == 0:
            with torch.no_grad():
                model.eval()
                valid_loss = 0.0
                valid_entropy = 0.0
                valid_prior = 0.0
                valid_prior_nats = 0.0
                valid_recon = 0.0
                valid_recon_nats = 0.0
                # NOTE(review): validation iterates TRAIN_loader but uses each
                # shape's held-out 'test_points' (test_loader was deleted above)
                # — looks deliberate, but confirm against the paper/repo.
                for bidx, data in enumerate(train_loader):
                    step = bidx + len(train_loader) * epoch
                    tr_batch = data['test_points']
                    if args.random_rotate:
                        tr_batch, _, _ = apply_random_rotation(
                            tr_batch, rot_axis=train_loader.dataset.gravity_axis)
                    inputs = tr_batch.cuda(args.gpu, non_blocking=True)
                    B, N, D = inputs.shape
                    std = (args.std_max - args.std_min) * torch.rand_like(inputs[:,:,0]).view(B,N,1) + args.std_min
                    eps = torch.randn_like(inputs) * std
                    std_in = std / args.std_max * args.std_scale
                    inputs_noisy = inputs + eps
                    out = model(inputs, inputs_noisy, std_in, optimizer, step, writer, valid=True)
                    valid_loss += out['loss'] / len(train_loader)
                    valid_entropy += out['entropy'] / len(train_loader)
                    valid_prior += out['prior'] / len(train_loader)
                    valid_prior_nats += out['prior_nats'] / len(train_loader)
                    valid_recon += out['recon'] / len(train_loader)
                    valid_recon_nats += out['recon_nats'] / len(train_loader)
                    del inputs, inputs_noisy, std_in, out, eps
                    gc.collect()
                if writer is not None:
                    writer.add_scalar('valid/entropy', valid_entropy, epoch)
                    writer.add_scalar('valid/prior', valid_prior, epoch)
                    writer.add_scalar('valid/prior(nats)', valid_prior_nats, epoch)
                    writer.add_scalar('valid/recon', valid_recon, epoch)
                    writer.add_scalar('valid/recon(nats)', valid_recon_nats, epoch)
                    writer.add_scalar('valid/loss', valid_loss, epoch)
                duration = time.time() - start_time
                start_time = time.time()
                print("[Valid] Epoch %d Time [%3.2fs] Entropy %2.5f LatentNats %2.5f PointNats %2.5f loss %2.5f loss_best %2.5f"
                      % (epoch, duration, valid_entropy, valid_prior_nats, valid_recon_nats, valid_loss, valid_loss_best))
                if valid_loss < valid_loss_best:
                    valid_loss_best = valid_loss
                    if not args.distributed or (args.rank % ngpus_per_node == 0):
                        save(model, optimizer, epoch + 1, scheduler, valid_loss_best, log_dir,
                             os.path.join(save_dir, 'checkpoint-best.pt'))
                        print('best model saved!')
        if epoch % args.save_freq == 0 and (not args.distributed or (args.rank % ngpus_per_node == 0)):
            save(model, optimizer, epoch + 1, scheduler, valid_loss_best, log_dir,
                 os.path.join(save_dir, 'checkpoint-%d.pt' % epoch))
            save(model, optimizer, epoch + 1, scheduler, valid_loss_best, log_dir,
                 os.path.join(save_dir, 'checkpoint-latest.pt'))
            print('model saved!')
        # save visualizations
        if epoch % args.viz_freq == 0:
            with torch.no_grad():
                # reconstructions
                model.eval()
                samples = model.reconstruct(unseen_inputs)
                results = []
                for idx in range(min(16, unseen_inputs.size(0))):
                    res = visualize_point_clouds(samples[idx], unseen_inputs[idx], idx,
                                                 pert_order=train_loader.dataset.display_axis_order)
                    results.append(res)
                res = np.concatenate(results, axis=1)
                imageio.imwrite(os.path.join(save_dir, 'images', 'SPF_epoch%d-gpu%s_recon_unseen.png' % (epoch, args.gpu)),
                                res.transpose(1, 2, 0))
                if writer is not None:
                    writer.add_image('tr_vis/conditioned', torch.as_tensor(res), epoch)
                samples = model.reconstruct(seen_inputs)
                results = []
                for idx in range(min(16, seen_inputs.size(0))):
                    res = visualize_point_clouds(samples[idx], seen_inputs[idx], idx,
                                                 pert_order=train_loader.dataset.display_axis_order)
                    results.append(res)
                res = np.concatenate(results, axis=1)
                imageio.imwrite(os.path.join(save_dir, 'images', 'SPF_epoch%d-gpu%s_recon_seen.png' % (epoch, args.gpu)),
                                res.transpose(1, 2, 0))
                if writer is not None:
                    writer.add_image('tr_vis/conditioned', torch.as_tensor(res), epoch)
                # unconditional samples
                num_samples = min(16, unseen_inputs.size(0))
                num_points = unseen_inputs.size(1)
                _, samples = model.sample(num_samples, num_points)
                results = []
                for idx in range(num_samples):
                    res = visualize_point_clouds(samples[idx], unseen_inputs[idx], idx,
                                                 pert_order=train_loader.dataset.display_axis_order)
                    results.append(res)
                res = np.concatenate(results, axis=1)
                imageio.imwrite(os.path.join(save_dir, 'images', 'SPF_epoch%d-gpu%s_sample.png' % (epoch, args.gpu)),
                                res.transpose((1, 2, 0)))
                if writer is not None:
                    writer.add_image('tr_vis/sampled', torch.as_tensor(res), epoch)
                print('image saved!')
def main_worker(gpu, save_dir, ngpus_per_node, args):
    """Per-process training entry point for the point-cloud flow model.

    Sets up (optionally distributed) training on one GPU, builds the model,
    optimizer, data loaders and LR scheduler, then runs the training loop
    with periodic logging, visualization and checkpointing.

    Args:
        gpu: GPU index assigned to this worker (or None).
        save_dir: directory for checkpoints and rendered images.
        ngpus_per_node: number of GPUs per node (used for rank math and
            per-process batch-size division).
        args: parsed experiment arguments (distributed flags, scheduler
            choice, logging/viz/save frequencies, ...).
    """
    # basic setup
    cudnn.benchmark = True
    args.gpu = gpu
    if args.gpu is not None:
        print("Use GPU: {} for training".format(args.gpu))
    if args.distributed:
        if args.dist_url == "env://" and args.rank == -1:
            args.rank = int(os.environ["RANK"])
        if args.distributed:
            # Global rank = node rank * gpus-per-node + local gpu index.
            args.rank = args.rank * ngpus_per_node + gpu
            dist.init_process_group(backend=args.dist_backend,
                                    init_method=args.dist_url,
                                    world_size=args.world_size,
                                    rank=args.rank)

    if args.log_name is not None:
        log_dir = "runs/%s" % args.log_name
    else:
        log_dir = "runs/time-%d" % time.time()

    # Only rank-0 of each node writes TensorBoard logs.
    if not args.distributed or (args.rank % ngpus_per_node == 0):
        writer = SummaryWriter(logdir=log_dir)
    else:
        writer = None

    if not args.use_latent_flow:  # auto-encoder only
        args.prior_weight = 0
        args.entropy_weight = 0

    # multi-GPU setup
    model = PointFlow(args)
    if args.distributed:  # Multiple processes, single GPU per process
        if args.gpu is not None:
            def _transform_(m):
                # NOTE(review): check_reduction is deprecated in newer
                # PyTorch releases — confirm against the pinned version.
                return nn.parallel.DistributedDataParallel(
                    m, device_ids=[args.gpu], output_device=args.gpu,
                    check_reduction=True)

            torch.cuda.set_device(args.gpu)
            model.cuda(args.gpu)
            model.multi_gpu_wrapper(_transform_)
            # Divide the global batch size among the per-GPU processes.
            args.batch_size = int(args.batch_size / ngpus_per_node)
            args.workers = 0
        else:
            assert 0, "DistributedDataParallel constructor should always set the single device scope"
    elif args.gpu is not None:  # Single process, single GPU per process
        torch.cuda.set_device(args.gpu)
        model = model.cuda(args.gpu)
    else:  # Single process, multiple GPUs per process
        def _transform_(m):
            return nn.DataParallel(m)

        model = model.cuda()
        model.multi_gpu_wrapper(_transform_)

    # resume checkpoints
    start_epoch = 0
    optimizer = model.make_optimizer(args)
    if args.resume_checkpoint is None and os.path.exists(
            os.path.join(save_dir, 'checkpoint-latest.pt')):
        args.resume_checkpoint = os.path.join(
            save_dir, 'checkpoint-latest.pt')  # use the latest checkpoint
    if args.resume_checkpoint is not None:
        if args.resume_optimizer:
            model, optimizer, start_epoch = resume(
                args.resume_checkpoint, model, optimizer,
                strict=(not args.resume_non_strict))
        else:
            model, _, start_epoch = resume(args.resume_checkpoint, model,
                                           optimizer=None,
                                           strict=(not args.resume_non_strict))
        print('Resumed from: ' + args.resume_checkpoint)

    # initialize datasets and loaders
    tr_dataset = MyDataset(args.data_dir, istest=False)
    te_dataset = MyDataset(args.data_dir, istest=True)
    if args.distributed:
        train_sampler = torch.utils.data.distributed.DistributedSampler(
            tr_dataset)
    else:
        train_sampler = None
    train_loader = torch.utils.data.DataLoader(dataset=tr_dataset,
                                               batch_size=args.batch_size,
                                               shuffle=(train_sampler is None),
                                               num_workers=0,
                                               pin_memory=True,
                                               sampler=train_sampler,
                                               drop_last=True,
                                               worker_init_fn=init_np_seed)
    test_loader = torch.utils.data.DataLoader(dataset=te_dataset,
                                              batch_size=args.batch_size,
                                              shuffle=False,
                                              num_workers=0,
                                              pin_memory=True,
                                              drop_last=False,
                                              worker_init_fn=init_np_seed)

    # save dataset statistics
    # if not args.distributed or (args.rank % ngpus_per_node == 0):
    #     np.save(os.path.join(save_dir, "train_set_mean.npy"), tr_dataset.all_points_mean)
    #     np.save(os.path.join(save_dir, "train_set_std.npy"), tr_dataset.all_points_std)
    #     np.save(os.path.join(save_dir, "train_set_idx.npy"), np.array(tr_dataset.shuffle_idx))
    #     np.save(os.path.join(save_dir, "val_set_mean.npy"), te_dataset.all_points_mean)
    #     np.save(os.path.join(save_dir, "val_set_std.npy"), te_dataset.all_points_std)
    #     np.save(os.path.join(save_dir, "val_set_idx.npy"), np.array(te_dataset.shuffle_idx))

    # load classification dataset if needed
    if args.eval_classification:
        from datasets import get_clf_datasets

        def _make_data_loader_(dataset):
            # Plain sequential loader used only for evaluation.
            return torch.utils.data.DataLoader(dataset=dataset,
                                               batch_size=args.batch_size,
                                               shuffle=False,
                                               num_workers=0,
                                               pin_memory=True,
                                               drop_last=False,
                                               worker_init_fn=init_np_seed)

        clf_datasets = get_clf_datasets(args)
        clf_loaders = {
            k: [_make_data_loader_(ds) for ds in ds_lst]
            for k, ds_lst in clf_datasets.items()
        }
    else:
        clf_loaders = None

    # initialize the learning rate scheduler
    if args.scheduler == 'exponential':
        scheduler = optim.lr_scheduler.ExponentialLR(optimizer, args.exp_decay)
    elif args.scheduler == 'step':
        scheduler = optim.lr_scheduler.StepLR(optimizer,
                                              step_size=args.epochs // 2,
                                              gamma=0.1)
    elif args.scheduler == 'linear':
        def lambda_rule(ep):
            # Constant LR for the first half, then linear decay to 0.
            lr_l = 1.0 - max(0, ep - 0.5 * args.epochs) / float(
                0.5 * args.epochs)
            return lr_l

        scheduler = optim.lr_scheduler.LambdaLR(optimizer,
                                                lr_lambda=lambda_rule)
    else:
        assert 0, "args.schedulers should be either 'exponential' or 'linear'"

    # main training loop
    start_time = time.time()
    entropy_avg_meter = AverageValueMeter()
    latent_nats_avg_meter = AverageValueMeter()
    point_nats_avg_meter = AverageValueMeter()
    if args.distributed:
        print("[Rank %d] World size : %d" % (args.rank, dist.get_world_size()))

    print("Start epoch: %d End epoch: %d" % (start_epoch, args.epochs))
    for epoch in range(start_epoch, args.epochs):
        if args.distributed:
            train_sampler.set_epoch(epoch)

        # adjust the learning rate
        if (epoch + 1) % args.exp_decay_freq == 0:
            scheduler.step(epoch=epoch)
            if writer is not None:
                writer.add_scalar('lr/optimizer', scheduler.get_lr()[0], epoch)

        # train for one epoch
        for bidx, data in enumerate(train_loader):
            idx_batch, tr_batch, te_batch = data['idx'], data[
                'train_points'], data['test_points']
            step = bidx + len(train_loader) * epoch
            model.train()
            inputs = tr_batch.cuda(args.gpu, non_blocking=True)
            # Model performs its own optimizer step internally.
            out = model(inputs, optimizer, step, writer)
            entropy, prior_nats, recon_nats = out['entropy'], out[
                'prior_nats'], out['recon_nats']
            entropy_avg_meter.update(entropy)
            point_nats_avg_meter.update(recon_nats)
            latent_nats_avg_meter.update(prior_nats)
            if step % args.log_freq == 0:
                duration = time.time() - start_time
                start_time = time.time()
                print(
                    "[Rank %d] Epoch %d Batch [%2d/%2d] Time [%3.2fs] Entropy %2.5f LatentNats %2.5f PointNats %2.5f"
                    % (args.rank, epoch, bidx, len(train_loader), duration,
                       entropy_avg_meter.avg, latent_nats_avg_meter.avg,
                       point_nats_avg_meter.avg))

        # evaluate on the validation set
        # if not args.no_validation and (epoch + 1) % args.val_freq == 0:
        #     from utils import validate
        #     validate(test_loader, model, epoch, writer, save_dir, args, clf_loaders=clf_loaders)

        # save visualizations
        if (epoch + 1) % args.viz_freq == 0:
            # reconstructions (uses the last training batch)
            model.eval()
            samples = model.reconstruct(inputs)
            results = []
            for idx in range(min(10, inputs.size(0))):
                res = visualize_point_clouds(samples[idx], inputs[idx], idx)
                results.append(res)
            res = np.concatenate(results, axis=1)
            # NOTE(review): scipy.misc.imsave was removed in modern SciPy;
            # confirm the pinned SciPy version still provides it.
            scipy.misc.imsave(
                os.path.join(
                    save_dir, 'images',
                    'tr_vis_conditioned_epoch%d-gpu%s.png' %
                    (epoch, args.gpu)), res.transpose((1, 2, 0)))
            if writer is not None:
                writer.add_image('tr_vis/conditioned', torch.as_tensor(res),
                                 epoch)

            # samples
            if args.use_latent_flow:
                num_samples = min(10, inputs.size(0))
                num_points = inputs.size(1)
                _, samples = model.sample(num_samples, num_points)
                results = []
                for idx in range(num_samples):
                    res = visualize_point_clouds(samples[idx], inputs[idx],
                                                 idx)
                    results.append(res)
                res = np.concatenate(results, axis=1)
                # BUG FIX: previously written to the same
                # 'tr_vis_conditioned_...' filename as the reconstruction
                # image above, overwriting it every viz epoch.
                scipy.misc.imsave(
                    os.path.join(
                        save_dir, 'images',
                        'tr_vis_sampled_epoch%d-gpu%s.png' %
                        (epoch, args.gpu)), res.transpose((1, 2, 0)))
                if writer is not None:
                    writer.add_image('tr_vis/sampled', torch.as_tensor(res),
                                     epoch)

        # save checkpoints (rank-0 of each node only)
        if not args.distributed or (args.rank % ngpus_per_node == 0):
            if (epoch + 1) % args.save_freq == 0:
                save(model, optimizer, epoch + 1,
                     os.path.join(save_dir, 'checkpoint-%d.pt' % epoch))
                save(model, optimizer, epoch + 1,
                     os.path.join(save_dir, 'checkpoint-latest.pt'))
def read_main_(config_file, logger, kfold_index=None, return_uAUG=True):
    """Build the experiment state for one training run.

    Reads the config, constructs dataloaders, the model (optionally from a
    pretrained checkpoint or cross-stitch backbones), the optimizer and the
    loss scheduler, and resumes/fixes parameters as configured.

    Args:
        config_file: path to the experiment config parsed by Auto_popen.
        logger: logger used for experiment bookkeeping messages.
        kfold_index: fold index; required when the config enables k-fold CV.
        return_uAUG: if True, adds 'with_uAUG' to the auxiliary task columns.

    Returns:
        dict with keys "popen" (config object), "model", "loader_ls".

    Raises:
        NotImplementedError: if k-fold CV is enabled but kfold_index is None.
    """
    POPEN = Auto_popen(config_file)
    POPEN.kfold_index = kfold_index
    if POPEN.kfold_cv:
        if kfold_index is None:
            raise NotImplementedError(
                "please specify the kfold index to perform K fold cross validation")
        # Suffix the log / checkpoint paths with the fold index.
        POPEN.vae_log_path = POPEN.vae_log_path.replace(".log", "_cv%d.log" % kfold_index)
        POPEN.vae_pth_path = POPEN.vae_pth_path.replace(".pth", "_cv%d.pth" % kfold_index)

    # device = 'cuda' if torch.cuda.is_available() else 'cpu'
    # cuda2 = torch.device('cuda:2')

    # Run name
    if POPEN.run_name is None:
        run_name = POPEN.model_type + time.strftime("__%Y_%m_%d_%H:%M")
    else:
        run_name = POPEN.run_name

    # log dir
    # logger = utils.setup_logs(POPEN.vae_log_path)

    # built model dir or check resume
    POPEN.check_experiment(logger)

    #  |=====================================|
    #  |=========== setup part ==========|
    #  |=====================================|
    # read data
    if return_uAUG:
        POPEN.aux_task_columns += ['with_uAUG']
    loader_ls = reader.get_dataloader(POPEN)

    # ===========  setup model  ===========
    # -- pretrain --
    if POPEN.pretrain_pth is not None:
        # load pretrain model; fall back to a raw torch.load of the
        # checkpoint's state_dict if instantiation + load_model fails.
        pretrain_popen = Auto_popen(POPEN.pretrain_pth)
        try:
            pretrain_model = pretrain_popen.Model_Class(*pretrain_popen.model_args)
            utils.load_model(pretrain_popen, pretrain_model, logger)
        except Exception:
            # Narrowed from a bare `except:` so Ctrl-C / SystemExit still
            # propagate; any loading failure takes the fallback path.
            pretrain_model = torch.load(pretrain_popen.vae_pth_path)['state_dict']

        if POPEN.Model_Class == pretrain_popen.Model_Class:
            # Same architecture: continue training the pretrained model.
            model = pretrain_model
            del pretrain_model
        else:
            # Different downstream architecture: wrap encoder + downstream.
            downstream_model = POPEN.Model_Class(*POPEN.model_args)
            model = MTL_models.Enc_n_Down(pretrain_model, downstream_model)

    # -- end2end --
    elif POPEN.path_category == "CrossStitch":
        # One pretrained backbone per task, merged by the cross-stitch model.
        backbone = {}
        for t in POPEN.tasks:
            task_popen = Auto_popen(POPEN.backbone_config[t])
            task_model = task_popen.Model_Class(*task_popen.model_args)
            load_model(task_popen, task_model, logger)
            backbone[t] = task_model
        POPEN.model_args = [backbone] + POPEN.model_args
        model = POPEN.Model_Class(*POPEN.model_args)
    else:
        Model_Class = POPEN.Model_Class  # e.g. DL_models.LSTM_AE
        model = Model_Class(*POPEN.model_args)

    # =========== set optimizer ===========
    if POPEN.optimizer == 'Schedule':
        optimizer = ScheduledOptim(optim.Adam(
            filter(lambda p: p.requires_grad, model.parameters()),
            betas=(0.9, 0.98),
            eps=1e-09,
            weight_decay=1e-4,
            amsgrad=True),
            n_warmup_steps=20)
    elif isinstance(POPEN.optimizer, dict):
        # NOTE(review): eval() on a template filled from the config — safe
        # only for trusted config files; consider an explicit factory.
        optimizer = eval(scheduleoptim_dict_str.format(**POPEN.optimizer))
    else:
        optimizer = optim.Adam(
            filter(lambda p: p.requires_grad, model.parameters()),
            lr=POPEN.lr,
            betas=(0.9, 0.98),
            eps=1e-09,
            weight_decay=POPEN.l2)

    # loss-weighting scheduler ('schedualer' spelling is a project attribute)
    if POPEN.loss_schema == 'DTP':
        POPEN.loss_schedualer = Dynamic_Task_Priority(POPEN.tasks, POPEN.gamma, POPEN.chimerla_weight)
    elif POPEN.loss_schema == 'DWA':
        POPEN.loss_schedualer = Dynamic_Weight_Averaging(POPEN.tasks, POPEN.tau, POPEN.chimerla_weight)

    # =========== resume ===========
    best_loss = np.inf
    best_acc = 0
    best_epoch = 0
    previous_epoch = 0
    if POPEN.Resumable:
        model, previous_epoch, best_loss, best_acc = resume(POPEN, model, optimizer, logger)

    # =========== fix parameters ===========
    if POPEN.modual_to_fix in dir(model):
        model = fix_parameter(model, POPEN.modual_to_fix)
        logger.info(' \t \t ==============<<< %s part is fixed>>>============== \t \t \n' % POPEN.modual_to_fix)

    return {"popen": POPEN, "model": model, "loader_ls": loader_ls}
def main(
    batch_size,
    num_mini_batches,
    nworkers,
    datadir,
    outdir,
    num_epochs,
    snapshot,
    finetune,
    lr,
    n_classes,
    loadvgg,
    network_type,
    fusion,
    data,
):
    """Evaluate a damage-estimation network on the validation split and dump
    per-tile metrics plus prediction/target images.

    NOTE(review): `n_classes`, `network_type` and `finetune` parameters are
    overwritten below with hard-coded values, so the corresponding arguments
    are effectively ignored — presumably debug leftovers; confirm intent.
    """
    n_classes = 2
    tile_size = 960
    # channels per input modality
    channel_basis = {
        'pre_img10': 3,
        'post_img10': 3,
        'pre_sar': 1,
        'post_sar': 1,
        'vhr': 3
    }
    channel_dict = dict()
    np.random.seed(0)
    network_type = 'baseline_vhr'
    for item in data:
        channel_dict['{}'.format(item)] = channel_basis[item]

    # Build the requested architecture; each variant is initialized from the
    # ImageNet ResNet-50 weights.
    if network_type == 'baseline_vhr':
        network = damage_net_vhr(n_classes=n_classes)
        network.load_state_dict(
            model_zoo.load_url(
                'https://download.pytorch.org/models/resnet50-19c8e357.pth'))
    elif network_type == 'baseline_s1':
        network = damage_net_s1(n_classes=n_classes)
        network.load_state_dict(
            model_zoo.load_url(
                'https://download.pytorch.org/models/resnet50-19c8e357.pth'))
    elif network_type == 'baseline_s2':
        network = damage_net_s2(n_classes=n_classes)
        network.load_state_dict(
            model_zoo.load_url(
                'https://download.pytorch.org/models/resnet50-19c8e357.pth'))
    elif network_type == 'damagenet_fusion_simple':
        network = damage_net_vhr_fusion_simple(n_classes=n_classes)
        network.load_state_dict(
            model_zoo.load_url(
                'https://download.pytorch.org/models/resnet50-19c8e357.pth'))

    # Pick the checkpoint path by the unix user running the script
    # (developer-machine vs. container layout).
    if pwd.getpwuid(os.getuid())[0] == "jf330":
        finetune = "/Users/jf330/Downloads/results/epoch_{:02}_classes_{:02}.pth".format(
            num_epochs, n_classes)
    elif pwd.getpwuid(os.getuid())[0] == "timrudner":
        finetune = "/Volumes/Data/Google_Drive/AYA_Google_Drive/Git/fdl-eo/code/damage-density-estimation/src/results/epoch_{:02}_classes_{:02}.pth".format(
            num_epochs, n_classes)
    else:
        finetune = "/results/epoch_{:02}_classes_{:02}.pth".format(
            num_epochs, n_classes)

    # validation loader (fixed order, no shuffling)
    val = val_houston_data_loader(batch_size=batch_size,
                                  num_workers=nworkers,
                                  channels=channel_dict,
                                  tile_size=tile_size,
                                  n_classes=n_classes,
                                  shuffle=False,
                                  validation=True)
    metric = classmetric()

    if torch.cuda.is_available():
        network = network.cuda()

    if loadvgg == True:
        network.load_vgg16_weights()

    if torch.cuda.is_available():
        network = nn.DataParallel(network).cuda()
    # else:
    #     network = nn.DataParallel(network)

    param_groups = [{'params': network.parameters(), 'lr': lr}]

    # Load evaluation weights (finetune is always set above, so this runs).
    if finetune or snapshot:
        state = resume(finetune or snapshot, network, None)

    loss_str_list = []
    network.eval()
    for iteration, data in enumerate(val):
        # data tuple layout: (id, input tensor, annotations dict) —
        # presumably; inferred from the indices used below.
        input = data[1]
        input_id = data[0]
        upsample = nn.Upsample(size=(int(tile_size / 1.25),
                                     int(tile_size / 1.25)),
                               mode='bilinear',
                               align_corners=True)  # Harvey
        target = upsample(data[2]["label"])
        if torch.cuda.is_available():
            target = Variable(target.float()).cuda()
        else:
            target = Variable(target.float())

        output_raw = network.forward(input)

        # Normalize
        if n_classes == 1:
            output = output_raw
        else:
            soft = nn.Softmax2d()
            output = soft(output_raw)

        output = upsample(output)
        train_metric = metric(target, output)
        loss_str_list.append("Input ID: {}; Metric: {} ".format(
            input_id, str(train_metric)))

        # convert zo W x H x C
        if torch.cuda.is_available():
            prediction = output.data.cuda()[0].permute(1, 2, 0)
            target = target.data.cuda()[0].permute(1, 2, 0)
        else:
            prediction = output.data[0].permute(1, 2, 0)
            target = target.data[0].permute(1, 2, 0)

        if not os.path.exists(RESULTS_PATH + "/img"):
            os.makedirs(RESULTS_PATH + "/img")

        # Remove extra dim
        if n_classes == 1:
            prediction_img = prediction.cpu().numpy()
        else:
            # NOTE(review): second positional argument of np.argmax is the
            # axis — this only selects the channel axis because n_classes
            # happens to equal 2 (the channel index after permute). Likely
            # intended as argmax over dim 2; confirm.
            prediction_img = np.argmax(prediction, n_classes).cpu().numpy()
        target_img = target.cpu().numpy()

        # Write input image (only first 3 bands)
        # input_img = input.squeeze(0).cpu().numpy()
        #
        # if input_img[:, 0, 0].size >= 3:
        #     input_img = cv2.merge((input_img[0], input_img[1], input_img[2]))
        # else:
        #     input_img = input_img[0]
        #upsample = nn.Upsample(size=(int(tile_size/1.25), int(tile_size/1.25)), mode='bilinear', align_corners=True) # Harvey
        # cv2.imwrite(RESULTS_PATH+"/img/{}_input_class_{:02}.png".format(iteration, n_classes), input_img*255)
        cv2.imwrite(
            RESULTS_PATH +
            "/img/{}_prediction_class_{:02}.png".format(iteration, n_classes),
            prediction_img * 255)
        cv2.imwrite(
            RESULTS_PATH +
            "/img/{}_target_class_{:02}.png".format(iteration, n_classes),
            target_img * 255)
        #exit(0)

    # Dump one metric line per tile; note `val` and `output` reuse names from
    # above (loader / network output) — harmless here since both are done.
    with open(RESULTS_PATH + "/MSEloss.csv", "w") as output:
        writer = csv.writer(output, delimiter=';', lineterminator='\n')
        for val in loss_str_list:
            writer.writerow([val])
def main_worker(save_dir, args):
    """DeepSpeed training entry point for the SetVAE model.

    Builds the model and data pipeline, wraps them with deepspeed.initialize,
    optionally resumes from a checkpoint, then runs the epoch loop with
    rank-0-only validation, checkpointing and visualization.

    Args:
        save_dir: directory for checkpoints and dataset statistics.
        args: experiment arguments (local_rank, scheduler choice,
            val/save/viz frequencies, resume options, ...).
    """
    # basic setup
    cudnn.benchmark = True
    if args.log_name is not None:
        log_dir = "runs/%s" % args.log_name
    else:
        log_dir = f"runs/{datetime.datetime.now().strftime('%m-%d-%H-%M-%S')}"

    # Only the head process writes TensorBoard logs.
    if args.local_rank == 0:
        logger = SummaryWriter(log_dir)
    else:
        logger = None
    deepspeed.init_distributed(dist_backend='nccl')
    torch.cuda.set_device(args.local_rank)

    model = SetVAE(args)
    parameters = model.parameters()
    # NOTE(review): model.parameters() returns a generator; the sum below
    # consumes it, yet `parameters` is later passed to deepspeed.initialize
    # as model_parameters — verify deepspeed ignores it when an optimizer
    # object is supplied, otherwise it receives an exhausted iterator.
    n_parameters = sum(p.numel() for p in parameters if p.requires_grad)
    print(f'number of params: {n_parameters}')
    try:
        # Generator-side parameter count; AttributeError means this model
        # variant lacks some of these submodules, in which case we skip it.
        n_gen_parameters = sum(p.numel() for p in model.init_set.parameters() if p.requires_grad) + \
            sum(p.numel() for p in model.pre_decoder.parameters() if p.requires_grad) + \
            sum(p.numel() for p in model.decoder.parameters() if p.requires_grad) + \
            sum(p.numel() for p in model.post_decoder.parameters() if p.requires_grad) + \
            sum(p.numel() for p in model.output.parameters() if p.requires_grad)
        print(f'number of generator params: {n_gen_parameters}')
    except AttributeError:
        pass

    optimizer, criterion = model.make_optimizer(args)

    # initialize datasets and loaders
    train_dataset, val_dataset, train_loader, val_loader = get_datasets(args)

    # initialize the learning rate scheduler
    if args.scheduler == 'exponential':
        assert not (args.warmup_epochs > 0)
        scheduler = torch.optim.lr_scheduler.ExponentialLR(
            optimizer, args.exp_decay)
    elif args.scheduler == 'step':
        assert not (args.warmup_epochs > 0)
        scheduler = torch.optim.lr_scheduler.StepLR(optimizer,
                                                    step_size=args.epochs // 2,
                                                    gamma=0.1)
    elif args.scheduler == 'linear':
        def lambda_rule(ep):
            # Linear warmup (if configured) times linear decay after the
            # halfway point.
            lr_w = min(1., ep / args.warmup_epochs) if (args.warmup_epochs > 0) else 1.
            lr_l = 1.0 - max(0, ep - 0.5 * args.epochs) / float(
                0.5 * args.epochs)
            return lr_l * lr_w

        scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer,
                                                      lr_lambda=lambda_rule)
    elif args.scheduler == 'cosine':
        assert not (args.warmup_epochs > 0)
        scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
            optimizer, T_max=args.epochs)
    else:
        # Fake SCHEDULER: constant learning rate.
        def lambda_rule(ep):
            return 1.0

        scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer,
                                                      lr_lambda=lambda_rule)

    # extract collate_fn before deepspeed rebuilds the train loader
    if args.distributed:
        collate_fn = deepcopy(train_loader.collate_fn)
        model, optimizer, train_loader, scheduler = deepspeed.initialize(
            args=args,
            model=model,
            optimizer=optimizer,
            model_parameters=parameters,
            training_data=train_dataset,
            collate_fn=collate_fn,
            lr_scheduler=scheduler)

    # resume checkpoints
    start_epoch = 0
    if args.resume_checkpoint is None and Path(
            Path(save_dir) / 'checkpoint-latest.pt').exists():
        args.resume_checkpoint = os.path.join(
            save_dir, 'checkpoint-latest.pt')  # use the latest checkpoint
        print('Resumed from: ' + args.resume_checkpoint)
    if args.resume_checkpoint is not None:
        if args.distributed:
            # DeepSpeed engine: restore into the wrapped module / engine
            # optimizer / engine lr_scheduler in place.
            if args.resume_optimizer:
                model.module, model.optimizer, model.lr_scheduler, start_epoch = resume(
                    args.resume_checkpoint,
                    model.module,
                    model.optimizer,
                    scheduler=model.lr_scheduler,
                    strict=(not args.resume_non_strict))
            else:
                model.module, _, _, start_epoch = resume(
                    args.resume_checkpoint,
                    model.module,
                    optimizer=None,
                    strict=(not args.resume_non_strict))
        else:
            if args.resume_optimizer:
                model, optimizer, scheduler, start_epoch = resume(
                    args.resume_checkpoint,
                    model,
                    optimizer,
                    scheduler=scheduler,
                    strict=(not args.resume_non_strict))
            else:
                model, _, _, start_epoch = resume(
                    args.resume_checkpoint,
                    model,
                    optimizer=None,
                    strict=(not args.resume_non_strict))

    # save dataset statistics
    if args.local_rank == 0:
        train_dataset.save_statistics(save_dir)
        val_dataset.save_statistics(save_dir)

    # main training loop
    avg_meters = {
        'kl_avg_meter': AverageValueMeter(),
        'l2_avg_meter': AverageValueMeter()
    }
    assert args.distributed
    # Ensure `epoch` is defined for the final validation even if the loop
    # body never executes.
    epoch = start_epoch
    print("Start epoch: %d End epoch: %d" % (start_epoch, args.epochs))
    for epoch in range(start_epoch, args.epochs):
        if args.local_rank == 0:
            # evaluate on the validation set (skip the very first epoch)
            if epoch % args.val_freq == 0 and epoch != 0:
                model.eval()
                with torch.no_grad():
                    val_res = validate(model.module, args, val_loader, epoch,
                                       logger, save_dir)
                for k, v in val_res.items():
                    v = v.cpu().detach().item()
                    send_slack(f'{k}:{v}, Epoch {epoch - 1}')
                    if logger is not None and v is not None:
                        logger.add_scalar(f'val_sample/{k}', v, epoch - 1)

        # train for one epoch
        train_one_epoch(epoch, model, criterion, optimizer, args,
                        train_loader, avg_meters, logger)

        # Only on HEAD process
        if args.local_rank == 0:
            # save checkpoints
            if (epoch + 1) % args.save_freq == 0:
                if args.eval:
                    validate_reconstruct_l2(epoch, val_loader, model,
                                            criterion, args, logger)
                save(model.module, model.optimizer, model.lr_scheduler,
                     epoch + 1, Path(save_dir) / f'checkpoint-{epoch}.pt')
                save(model.module, model.optimizer, model.lr_scheduler,
                     epoch + 1, Path(save_dir) / 'checkpoint-latest.pt')

            # save visualizations
            if (epoch + 1) % args.viz_freq == 0:
                with torch.no_grad():
                    visualize(model.module, args, val_loader, epoch, logger)

        # adjust the learning rate
        model.lr_scheduler.step()
        if logger is not None and args.local_rank == 0:
            logger.add_scalar('train lr', model.lr_scheduler.get_last_lr()[0],
                              epoch)

    # final validation after training completes
    model.eval()
    if args.local_rank == 0:
        with torch.no_grad():
            val_res = validate(model.module, args, val_loader, epoch, logger,
                               save_dir)
        for k, v in val_res.items():
            v = v.cpu().detach().item()
            send_slack(f'{k}:{v}, Epoch {epoch}')
            if logger is not None and v is not None:
                logger.add_scalar(f'val_sample/{k}', v, epoch)
        if logger is not None:
            logger.flush()
            logger.close()
def main_worker(gpu, save_dir, args):
    """Single-GPU training loop for HyperRegression on the toy ExampleData.

    Trains the model, periodically plots decoded samples over the input
    range [0, kk], and writes numbered + latest checkpoints.

    Args:
        gpu: GPU index for this worker.
        save_dir: directory for checkpoints and plot images.
        args: experiment arguments (batch_size, epochs, log/viz/save
            frequencies, resume options, ...).
    """
    # basic setup
    cudnn.benchmark = True
    args.gpu = gpu
    if args.gpu is not None:
        print("Use GPU: {} for training".format(args.gpu))
    model = HyperRegression(args)

    torch.cuda.set_device(args.gpu)
    model = model.cuda(args.gpu)

    # resume checkpoints
    start_epoch = 0
    optimizer = model.make_optimizer(args)
    if args.resume_checkpoint is None and os.path.exists(
            os.path.join(save_dir, 'checkpoint-latest.pt')):
        args.resume_checkpoint = os.path.join(
            save_dir, 'checkpoint-latest.pt')  # use the latest checkpoint
    if args.resume_checkpoint is not None:
        if args.resume_optimizer:
            model, optimizer, start_epoch = resume(
                args.resume_checkpoint, model, optimizer,
                strict=(not args.resume_non_strict))
        else:
            model, _, start_epoch = resume(args.resume_checkpoint, model,
                                           optimizer=None,
                                           strict=(not args.resume_non_strict))
        print('Resumed from: ' + args.resume_checkpoint)

    # main training loop
    start_time = time.time()
    point_nats_avg_meter = AverageValueMeter()
    if args.distributed:
        print("[Rank %d] World size : %d" % (args.rank, dist.get_world_size()))

    print("Start epoch: %d End epoch: %d" % (start_epoch, args.epochs))
    for epoch in range(start_epoch, args.epochs):
        print("Epoch starts:")
        # Dataset/loader rebuilt every epoch (ExampleData is cheap to create).
        data = ExampleData()
        train_loader = torch.utils.data.DataLoader(dataset=data,
                                                   batch_size=args.batch_size,
                                                   shuffle=True,
                                                   num_workers=0,
                                                   pin_memory=True)
        # train for one epoch
        for bidx, data in enumerate(train_loader):
            x, y = data
            # x: (B, 1); y: (B, 1, 1) — shaped for the hypernetwork input.
            x = x.float().to(args.gpu).unsqueeze(1)
            y = y.float().to(args.gpu).unsqueeze(1).unsqueeze(2)
            step = bidx + len(train_loader) * epoch
            model.train()
            # Model performs its own optimizer step internally.
            recon_nats = model(x, y, optimizer, step, None)
            point_nats_avg_meter.update(recon_nats.item())
            if step % args.log_freq == 0:
                duration = time.time() - start_time
                start_time = time.time()
                print(
                    "[Rank %d] Epoch %d Batch [%2d/%2d] Time [%3.2fs] PointNats %2.5f"
                    % (args.rank, epoch, bidx, len(train_loader), duration,
                       point_nats_avg_meter.avg))

        # save visualizations
        kk = 3  # upper bound of the plotted x-range
        if (epoch + 1) % args.viz_freq == 0:
            # reconstructions: decode 100 samples per x over [0, kk]
            model.eval()
            x = torch.from_numpy(np.linspace(0, kk, num=100)).float().to(
                args.gpu).unsqueeze(1)
            _, y = model.decode(x, 100)
            x = x.cpu().detach().numpy()
            y = y.cpu().detach().numpy()
            # Repeat each x 100 times so x/y pair up for the scatter plot.
            x = np.expand_dims(x, 1).repeat(100, axis=1).flatten()
            y = y.flatten()
            figs, axs = plt.subplots(1, 1, figsize=(12, 12))
            plt.xlim([0, kk])
            plt.ylim([-2, 2])
            plt.scatter(x, y)
            plt.savefig(
                os.path.join(
                    save_dir, 'images',
                    'tr_vis_sampled_epoch%d-gpu%s.png' % (epoch, args.gpu)))
            plt.clf()

        # save checkpoints
        if (epoch + 1) % args.save_freq == 0:
            save(model, optimizer, epoch + 1,
                 os.path.join(save_dir, 'checkpoint-%d.pt' % epoch))
            save(model, optimizer, epoch + 1,
                 os.path.join(save_dir, 'checkpoint-latest.pt'))
def main_worker(gpu, save_dir, ngpus_per_node, args):
    """Single-GPU training loop for HyperRegression on the SDD dataset.

    Trains on noisy, repeated target points, periodically renders hypothesis
    images for the test scenes, and writes numbered + latest checkpoints.

    Args:
        gpu: GPU index for this worker.
        save_dir: directory for checkpoints and rendered images.
        ngpus_per_node: kept for signature parity with the spawn API
            (unused in this single-GPU path).
        args: experiment arguments (batch_size, epochs, scheduler choice,
            log/viz/save frequencies, resume options, ...).
    """
    # basic setup
    cudnn.benchmark = True
    normalize = False  # raw (unnormalized) SDD coordinates throughout
    args.gpu = gpu
    if args.gpu is not None:
        print("Use GPU: {} for training".format(args.gpu))
    model = HyperRegression(args)

    torch.cuda.set_device(args.gpu)
    model = model.cuda(args.gpu)

    # resume checkpoints
    start_epoch = 0
    optimizer = model.make_optimizer(args)
    if args.resume_checkpoint is None and os.path.exists(
            os.path.join(save_dir, 'checkpoint-latest.pt')):
        args.resume_checkpoint = os.path.join(
            save_dir, 'checkpoint-latest.pt')  # use the latest checkpoint
    if args.resume_checkpoint is not None:
        if args.resume_optimizer:
            model, optimizer, start_epoch = resume(
                args.resume_checkpoint, model, optimizer,
                strict=(not args.resume_non_strict))
        else:
            model, _, start_epoch = resume(args.resume_checkpoint, model,
                                           optimizer=None,
                                           strict=(not args.resume_non_strict))
        print('Resumed from: ' + args.resume_checkpoint)

    # initialize datasets and loaders
    # initialize the learning rate scheduler
    if args.scheduler == 'exponential':
        scheduler = optim.lr_scheduler.ExponentialLR(optimizer, args.exp_decay)
    elif args.scheduler == 'step':
        scheduler = optim.lr_scheduler.StepLR(optimizer,
                                              step_size=args.epochs // 2,
                                              gamma=0.1)
    elif args.scheduler == 'linear':
        def lambda_rule(ep):
            # Constant LR for the first half, then linear decay to 0.
            lr_l = 1.0 - max(0, ep - 0.5 * args.epochs) / float(
                0.5 * args.epochs)
            return lr_l

        scheduler = optim.lr_scheduler.LambdaLR(optimizer,
                                                lr_lambda=lambda_rule)
    else:
        assert 0, "args.schedulers should be either 'exponential' or 'linear'"

    # main training loop
    start_time = time.time()
    entropy_avg_meter = AverageValueMeter()
    latent_nats_avg_meter = AverageValueMeter()
    point_nats_avg_meter = AverageValueMeter()
    if args.distributed:
        print("[Rank %d] World size : %d" % (args.rank, dist.get_world_size()))

    print("Start epoch: %d End epoch: %d" % (start_epoch, args.epochs))
    data = SDDData(split='train', normalize=normalize, root=args.data_dir)
    data_test = SDDData(split='test', normalize=normalize, root=args.data_dir)
    train_loader = torch.utils.data.DataLoader(dataset=data,
                                               batch_size=args.batch_size,
                                               shuffle=True,
                                               num_workers=0,
                                               pin_memory=True)
    test_loader = torch.utils.data.DataLoader(dataset=data_test,
                                              batch_size=1,
                                              shuffle=False,
                                              num_workers=0,
                                              pin_memory=True)
    for epoch in range(start_epoch, args.epochs):
        # adjust the learning rate
        if (epoch + 1) % args.exp_decay_freq == 0:
            # NOTE(review): passing epoch= to step() is deprecated in newer
            # PyTorch; kept for the pinned version's behavior.
            scheduler.step(epoch=epoch)

        # train for one epoch
        print("Epoch starts:")
        for bidx, data in enumerate(train_loader):
            # if bidx < 2:
            x, y = data
            #y = y.float().to(args.gpu).unsqueeze(1).repeat(1, 10).unsqueeze(2)
            x = x.float().to(args.gpu)
            y = y.float().to(args.gpu).unsqueeze(1)
            # Replicate each target 20 times and jitter with unit Gaussian
            # noise (in place) to get multiple noisy hypotheses per sample.
            y = y.repeat(1, 20, 1)
            y += torch.randn(y.shape[0], y.shape[1], y.shape[2]).to(args.gpu)
            step = bidx + len(train_loader) * epoch
            model.train()
            # Model performs its own optimizer step internally.
            recon_nats = model(x, y, optimizer, step, None)
            point_nats_avg_meter.update(recon_nats.item())
            if step % args.log_freq == 0:
                duration = time.time() - start_time
                start_time = time.time()
                print(
                    "[Rank %d] Epoch %d Batch [%2d/%2d] Time [%3.2fs] PointNats %2.5f"
                    % (args.rank, epoch, bidx, len(train_loader), duration,
                       point_nats_avg_meter.avg))
            # print("Memory")
            # print(process.memory_info().rss / (1024.0 ** 3))

        # save visualizations
        if (epoch + 1) % args.viz_freq == 0:
            # reconstructions: draw 100 decoded hypotheses per test image
            model.eval()
            for bidx, data in enumerate(test_loader):
                x, _ = data
                x = x.float().to(args.gpu)
                _, y_pred = model.decode(x, 100)
                y_pred = y_pred.cpu().detach().numpy().squeeze()
                # y_pred[y_pred < 0] = 0
                # y_pred[y_pred >= 0.98] = 0.98
                # Test loader is unshuffled with batch_size=1, so bidx
                # indexes the sequences of the test scene directly.
                testing_sequence = data_test.dataset.scenes[
                    data_test.test_id].sequences[bidx]
                objects_list = []
                for k in range(3):
                    objects_list.append(
                        decode_obj(testing_sequence.objects[k],
                                   testing_sequence.id))
                objects = np.stack(objects_list, axis=0)
                gt_object = decode_obj(testing_sequence.objects[-1],
                                       testing_sequence.id)
                drawn_img_hyps = draw_hyps(testing_sequence.imgs[-1], y_pred,
                                           gt_object, objects, normalize)
                cv2.imwrite(
                    os.path.join(save_dir, 'images',
                                 str(bidx) + '-' + str(epoch) + '-hyps.jpg'),
                    drawn_img_hyps)

        # save checkpoints
        if (epoch + 1) % args.save_freq == 0:
            save(model, optimizer, epoch + 1,
                 os.path.join(save_dir, 'checkpoint-%d.pt' % epoch))
            save(model, optimizer, epoch + 1,
                 os.path.join(save_dir, 'checkpoint-latest.pt'))
optimizer = optim.Adam(filter(lambda p: p.requires_grad, model.parameters()), lr=POPEN.lr, betas=(0.9, 0.98), eps=1e-09, weight_decay=POPEN.l2) if POPEN.loss_schema == 'DTP': POPEN.loss_schedualer = Dynamic_Task_Priority(POPEN.tasks,POPEN.gamma,POPEN.chimerla_weight) elif POPEN.loss_schema == 'DWA': POPEN.loss_schedualer = Dynamic_Weight_Averaging(POPEN.tasks,POPEN.tau,POPEN.chimerla_weight) # =========== resume =========== best_loss = np.inf best_acc = 0 best_epoch = 0 previous_epoch = 0 if POPEN.Resumable: previous_epoch,best_loss,best_acc = utils.resume(POPEN, optimizer,logger) # |=====================================| # |========== training part ==========| # |=====================================| for epoch in range(POPEN.max_epoch-previous_epoch+1): epoch += previous_epoch # logger.info("===============================| epoch {} |===============================".format(epoch)) train_val.iter_train(loader_set,model=model,optimizer=optimizer,popen=POPEN,epoch=epoch) # -----------| validate |-----------
def train_model(args_dict):
    """Train a COVID classifier and checkpoint the best-sensitivity model.

    Trains for args_dict.epochs epochs; after each epoch evaluates on the
    test loader and, once accuracy reaches 0.80, saves the model whenever
    COVID sensitivity improves.

    Args:
        args_dict: namespace with model choice ('covidnet'/'resnet'),
            n_classes, device, lr, class_weights, scheduler factor/patience
            and epochs.
    """
    # Define model
    if args_dict.model == "covidnet":
        model = CovidNet(args_dict.n_classes)
    elif args_dict.model == "resnet":
        model = ResNet(args_dict.n_classes)
    print("model selected: {}".format(args_dict.model))
    model.to(args_dict.device)

    # Loss and optimizer (class-weighted cross entropy)
    optimizer = torch.optim.Adam(model.parameters(), lr=args_dict.lr)
    criterion = nn.CrossEntropyLoss(
        weight=torch.Tensor(args_dict.class_weights).to(args_dict.device))

    # Resume training if needed
    best_sensit, model, optimizer = utils.resume(args_dict, model, optimizer)

    # LR is reduced when validation accuracy plateaus
    scheduler = ReduceLROnPlateau(optimizer,
                                  factor=args_dict.factor,
                                  patience=args_dict.patience,
                                  verbose=True)

    # Load data
    dl_non_covid, dl_covid = calculateDataLoaderTrain(args_dict)
    # Data loading for test
    # NOTE(review): `eval` here is a project module (shadows the builtin).
    dl_test = eval.calculateDataLoaderTest(args_dict)

    # Now, let's start the training process!
    print('Start training...')
    # NOTE(review): pat_track is initialized but never updated below, so the
    # saved 'valtrack' field is always 0 — presumably leftover patience
    # tracking; confirm intent.
    pat_track = 0
    for epoch in range(args_dict.epochs):

        # Compute a training epoch
        trainEpoch(args_dict, dl_non_covid, dl_covid, model, criterion,
                   optimizer, epoch)

        # Compute a validation epoch
        sensitivity_covid, accuracy = eval.valEpoch(args_dict, dl_test, model)
        scheduler.step(accuracy)

        # save if it is the best model
        if accuracy >= 0.80:
            # only compare sensitivity if we have a minimum accuracy of 0.8
            is_best = sensitivity_covid > best_sensit
            if is_best:
                print("BEST MODEL FOUND!")
                best_sensit = max(sensitivity_covid, best_sensit)
                utils.save_model(
                    args_dict, {
                        'epoch': epoch + 1,
                        'state_dict': model.state_dict(),
                        'best_sensit': best_sensit,
                        'optimizer': optimizer.state_dict(),
                        'valtrack': pat_track,
                        # 'freeVision': args_dict.freeVision,
                        'curr_val': accuracy,
                    })
        print(
            '** Validation: %f (best_sensitivity) - %f (current acc) - %d (patience)'
            % (best_sensit, accuracy, pat_track))

        # Plot
        plotter.plot('Sensitivity', 'test', 'sensitivity covid', epoch,
                     sensitivity_covid)
        plotter.plot('Accuracy', 'test', 'Accuracy', epoch, accuracy)
def main():
    """Train a DARTS-style CIFAR-10 network from a named genotype.

    Reads the module-level ``args`` namespace; seeds numpy/torch, builds the
    model (optionally resuming weights), then runs the train/validate loop,
    saving weights whenever validation accuracy improves.
    """
    # Restrict CUDA to the requested device(s); training is GPU-only.
    os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu
    if not torch.cuda.is_available():
        logging.info('no gpu device available')
        sys.exit(1)

    # Seed all RNGs for reproducibility; cudnn.benchmark trades exact
    # determinism for autotuned conv kernels.
    np.random.seed(args.seed)
    # torch.cuda.set_device(args.gpu)
    cudnn.benchmark = True
    torch.manual_seed(args.seed)
    cudnn.enabled = True
    torch.cuda.manual_seed(args.seed)
    logging.info('gpu device = %s' % args.gpu)
    logging.info("args = %s", args)

    # NOTE(review): eval() on args.arch executes arbitrary text from the CLI;
    # acceptable for a research script, but do not expose to untrusted input.
    genotype = eval("genotypes.%s" % args.arch)
    model = Network(args.init_channels, CIFAR_CLASSES, args.layers,
                    args.auxiliary, genotype)
    model.drop_path_prob = args.drop_path_prob
    model = model.cuda()
    model = torch.nn.DataParallel(model)

    # Resume from a saved checkpoint if a directory was given; utils.resume
    # returns the epoch to restart from (0 when training from scratch).
    if args.resume != "":
        start = utils.resume(model, os.path.join(args.resume, 'weights.pt'))
    else:
        start = 0
    logging.info("param size = %fMB", utils.count_parameters_in_MB(model))

    criterion = nn.CrossEntropyLoss()
    criterion = criterion.cuda()
    optimizer = torch.optim.SGD(model.parameters(),
                                args.learning_rate,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)

    # Standard CIFAR-10 pipelines (augmented train / plain valid transforms).
    train_transform, valid_transform = utils._data_transforms_cifar10(args)
    train_data = dset.CIFAR10(root=args.data,
                              train=True,
                              download=True,
                              transform=train_transform)
    valid_data = dset.CIFAR10(root=args.data,
                              train=False,
                              download=True,
                              transform=valid_transform)
    train_queue = torch.utils.data.DataLoader(train_data,
                                              batch_size=args.batch_size,
                                              shuffle=True,
                                              pin_memory=True,
                                              num_workers=2)
    valid_queue = torch.utils.data.DataLoader(valid_data,
                                              batch_size=args.batch_size,
                                              shuffle=False,
                                              pin_memory=True,
                                              num_workers=2)

    # Cosine LR decay over the full training horizon.
    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
        optimizer, float(args.epochs))

    valid_acc_best = 0.
    # Fast-forward the scheduler so a resumed run continues with the LR it
    # would have had at `start` (the optimizer state itself is not restored).
    for epoch in range(start):
        scheduler.step()
    for epoch in range(start, args.epochs):
        # NOTE(review): stepping the scheduler at the top of the epoch (before
        # optimizer steps) matches the legacy pre-1.1 PyTorch ordering this
        # script was written for; reordering would shift the LR schedule.
        scheduler.step()
        logging.info('epoch %d lr %e', epoch, scheduler.get_lr()[0])
        # Linearly ramp drop-path regularization over training.
        # NOTE(review): after DataParallel wrapping this sets the attribute on
        # the wrapper, not the underlying module — confirm train() reads it
        # via model.module or that the wrapper forwards it.
        model.drop_path_prob = args.drop_path_prob * epoch / args.epochs
        train_acc, train_obj = train(train_queue, model, criterion, optimizer)
        logging.info('train_acc %f', train_acc)
        valid_acc, valid_obj = infer(valid_queue, model, criterion)
        logging.info('valid_acc %f', valid_acc)
        # Keep only the best-so-far weights by validation accuracy.
        if valid_acc > valid_acc_best:
            utils.save(model, epoch, os.path.join(args.save, 'weights.pt'))
            valid_acc_best = valid_acc
def plot_calibration(args_dict):
    """Plot per-class reliability diagrams for the raw vs calibrated model.

    Loads the best checkpoint of the base model and its temperature-scaled
    counterpart, computes class probabilities on the test set, and saves a
    figure ``calibration_<model>.png`` with one reliability diagram per class.

    Args:
        args_dict: project config namespace; reads `model`, `n_classes`,
            `lr`, `dir_model` (and is mutated: `resume` and `device` are set).

    Raises:
        ValueError: if ``args_dict.model`` is not a known architecture.
    """
    # Force checkpoint loading inside utils.resume below.
    args_dict.resume = True
    dl_test = eval.calculateDataLoaderTest(args_dict)

    # Define model
    if args_dict.model == "covidnet":
        model_normal = CovidNet(args_dict.n_classes)
    elif args_dict.model == "resnet":
        model_normal = ResNet(args_dict.n_classes)
    else:
        # Fail fast with a clear message instead of a NameError further down.
        raise ValueError("unknown model: {}".format(args_dict.model))

    # Set up device
    if torch.cuda.is_available():
        args_dict.device = torch.device(
            "cuda:0"
        )  # you can continue going on here, like cuda:1 cuda:2....etc.
    else:
        args_dict.device = torch.device("cpu")

    # load normal model (optimizer is only needed to satisfy utils.resume's
    # signature; its state is discarded here).
    model_normal.to(args_dict.device)
    optimizer = torch.optim.Adam(model_normal.parameters(), lr=args_dict.lr)
    _, model_normal, _ = utils.resume(args_dict, model_normal, optimizer)

    # load calibrated model (temperature-scaling wrapper around the base net)
    model_calib = ModelWithTemperature(model_normal)
    calib_model_path = args_dict.dir_model + "calibrated_" + args_dict.model \
        + '_best_model.pth.tar'
    checkpoint_calib = torch.load(
        calib_model_path, map_location=torch.device(args_dict.device))
    model_calib.load_state_dict(checkpoint_calib['state_dict'])

    print("Calculating probabilities for test set...")
    # valEpoch(..., calibration=True) returns raw logits; softmax converts
    # them to per-class probabilities.
    probs_normal, y_true = eval.valEpoch(args_dict,
                                         dl_test,
                                         model_normal,
                                         calibration=True)
    probs_normal = softmax(probs_normal, axis=1)
    probs_calib, y_true = eval.valEpoch(args_dict,
                                        dl_test,
                                        model_calib,
                                        calibration=True)
    probs_calib = softmax(probs_calib, axis=1)

    print("calibration graph...")
    idx2class = {0: 'normal', 1: 'pneumonia', 2: 'COVID19'}
    fig, axs = plt.subplots(1, args_dict.n_classes, figsize=(15, 5))
    for idx_class in range(args_dict.n_classes):
        # One-vs-rest labels for this class.
        y_class = y_true == idx_class

        # reliability diagram: fraction of positives vs mean predicted value
        fop_normal, mpv_normal = calibration_curve(y_class,
                                                   probs_normal[:, idx_class])
        fop_calib, mpv_calib = calibration_curve(y_class,
                                                 probs_calib[:, idx_class])

        # plot perfectly calibrated (diagonal reference)
        axs[idx_class].plot([0, 1], [0, 1], linestyle='--')
        # plot calibrated and uncalibrated reliability curves
        axs[idx_class].plot(mpv_calib, fop_calib, marker='.',
                            label='calibrated')
        axs[idx_class].plot(mpv_normal, fop_normal, marker='*',
                            label='normal')
        axs[idx_class].set(xlabel='confidence')
        axs[idx_class].set(ylabel='accuracy')
        # title
        axs[idx_class].set_title(idx2class[idx_class])

    # Hide redundant inner axis labels on the shared grid.
    for ax in axs.flat:
        ax.label_outer()
    fig.autofmt_xdate()
    plt.subplots_adjust(wspace=0.1)
    # All subplots share the same two series, so one axes' handles suffice.
    handles, labels = ax.get_legend_handles_labels()
    fig.legend(handles, labels, loc='lower center', ncol=args_dict.n_classes)
    fig.savefig('calibration_' + args_dict.model + '.png')