class GeneticAlgorithm():
    """Simple (mu + lambda)-style genetic algorithm over a flat weight vector.

    Each generation, the whole population is evaluated in parallel via
    `multiprocessing.Pool`, the best individual (by fitness) is checkpointed,
    and a new population is formed by mutating uniformly-sampled members of
    the top-N plus copying the top-N unchanged (elitism).
    """

    def __init__(self, weights, fitness_function, population_size, sigma, num_threads, folder, settings):
        """
        Args:
            weights: flat iterable whose length defines the genome size.
            fitness_function: picklable callable run in worker processes; it
                receives a tuple (see `run`) and returns a 14-element row.
            population_size: number of individuals per generation.
            sigma: std-dev scale of the Gaussian mutation noise.
            num_threads: worker-process count for the multiprocessing pool.
            folder: base folder for optional tensorboard logs.
            settings: dict of run options; keys used here include 'sigma',
                'seed', 'recurrent', 'target', 'N', 'data_read',
                'initial_noise', 'im_size', 'run_directory', 'run_name',
                'fig_output_rate'.
        """
        super(GeneticAlgorithm, self).__init__()
        self.weight_shape = len(weights)
        self.fitness_function = fitness_function
        self.pop_size = population_size
        self.sigma = sigma
        self.num_threads = num_threads
        self.folder = folder
        self.settings = settings
        # NOTE(review): this rebinds the local `sigma`, not `self.sigma` —
        # it has no effect and looks like a leftover; confirm intent.
        sigma = settings['sigma']
        np.random.seed(self.settings['seed'])
        filename = "run_" + str(self.settings['seed'])
        if self.settings['recurrent']:
            filename += "_recurrent"
        if LOG:
            self.writer = SummaryWriter(folder + "/" + filename + "/" + settings['target'])

    def run(self, generations, print_step):
        """Run the evolutionary loop for `generations` generations.

        `print_step` is currently unused (kept for interface compatibility).
        Side effects: writes checkpoints, fitness logs, and plots under
        settings['run_directory'].
        """
        population = []
        fitness_log = []
        fitness_all = []
        gen = []
        maxgenfit = []
        meanfit = []
        maxfit = []
        N = self.settings['N']  # number of top individuals used for selection
        elitism = N
        if self.settings['data_read']:
            # Resume from a previous run's checkpoints.
            population = torch.load('run1/model_stateFiles/weights_gen_1.pt')
            morphogens = np.load('run1/model_stateFiles/morphogens_gen_1.npy')
            hidden_states_batched_A = torch.load('run1/model_stateFiles/hidden_states_batched_A_gen_0.pt')
            hidden_states_batched_B = torch.load('run1/model_stateFiles/hidden_states_batched_B_gen_0.pt')
            # Extract every float from the logged fitness tuples.
            pattern = r'([+-]?\d+(?:\.\d+)?(?:[eE][+-]?\d+)?)'
            with open('run1/fitnessFiles/fitness_log_gen_1.txt') as f:
                fitness = f.read()
            fitness = re.findall(pattern, fitness)
            # FIX: wrap the map in list() — under Python 3, np.array(map(...))
            # builds a 0-d object array and the slice below raises.
            fitness = np.array(list(map(float, fitness)))
            # Log rows are 4-tuples; every 4th value (from the end) is the
            # running max fitness column.
            max_fitness = max(fitness[::-4])
        else:
            # Fresh random population, scaled by the configured initial noise.
            for i in range(self.pop_size):
                w = torch.from_numpy(
                    np.random.normal(0, 1, self.weight_shape) * 0.1 * self.settings['initial_noise']).float()
                population.append(w)
            max_fitness = -20
        pool = multiprocessing.Pool(self.num_threads)
        for epoch in range(0, generations):
            # Evaluate all individuals in parallel. Each worker returns a row:
            # (fitness, sim, env, _, individual_id, morphogens, hsA, hsB,
            #  dev_states, alpha, cutedgemorphogens, out, past_hsA, past_hsB)
            if self.settings['data_read']:
                results = pool.map(self.fitness_function, [
                    (it, epoch, self.settings, morphogens[it[0]],
                     hidden_states_batched_A[it[0]], hidden_states_batched_B[it[0]],
                     max_fitness) for it in enumerate(population)])
            else:
                results = pool.map(self.fitness_function,
                                   [(it, epoch, self.settings) for it in enumerate(population)])
            fitness = [row[0] for row in results]
            sim = [row[1] for row in results]
            env = [row[2] for row in results]
            individual_id = [row[4] for row in results]
            morphogens = [row[5] for row in results]
            hidden_states_batched_A = [row[6] for row in results]
            hidden_states_batched_B = [row[7] for row in results]
            dev_states = [row[8] for row in results]
            alpha = [row[9] for row in results]
            cutedgemorphogens = [row[10] for row in results]
            past_hidden_states_batched_A = [row[12] for row in results]
            past_hidden_states_batched_B = [row[13] for row in results]
            fitness_all.append(fitness)
            # Indices of individuals sorted by descending fitness.
            sort_idx = np.argsort([-f for f in fitness])
            max_gen_f = np.max(fitness)
            if (max_gen_f > max_fitness):
                # New all-time best: checkpoint it and render its development.
                max_fitness = max_gen_f
                write_voxelyze_file_fitness(sim[sort_idx[0]], env[sort_idx[0]], epoch,
                                            individual_id[sort_idx[0]], cutedgemorphogens[sort_idx[0]],
                                            max_fitness, self.settings['im_size'],
                                            self.settings['run_directory'], self.settings['run_name'])
                torch.save(population[sort_idx[0]],
                           "{0}/bestofFiles/weights_gen_{1}_{2}.pt".format(
                               self.settings['run_directory'], epoch, sort_idx[0]))
                np.save('{0}/bestofFiles/morphogens_gen_{1}_id_{2}'.format(
                    self.settings['run_directory'], epoch, sort_idx[0]), morphogens[sort_idx[0]])
                torch.save(past_hidden_states_batched_A[sort_idx[0]],
                           '{0}/bestofFiles/past_hidden_states_batched_A_gen_{1}_id_{2}.pt'.format(
                               self.settings['run_directory'], epoch, sort_idx[0]))
                torch.save(past_hidden_states_batched_B[sort_idx[0]],
                           '{0}/bestofFiles/past_hidden_states_batched_B_gen_{1}_id_{2}.pt'.format(
                               self.settings['run_directory'], epoch, sort_idx[0]))
                dev_states = np.asarray(dev_states)
                alpha = np.asarray(alpha)
                # Reshape to (pop, steps, x, y, z) voxel grids.
                dev_states = dev_states.reshape(len(population), len(dev_states[1]),
                                                self.settings['im_size'], self.settings['im_size'],
                                                self.settings['im_size'])
                alpha = alpha.reshape(len(population), len(alpha[1]),
                                      self.settings['im_size'], self.settings['im_size'],
                                      self.settings['im_size'])
                np.save('{0}/bestofFiles/dev_states_gen_{1}_id_{2}'.format(
                    self.settings['run_directory'], epoch, sort_idx[0]), dev_states[sort_idx[0]])
                np.save('{0}/bestofFiles/alpha_gen_{1}_id_{2}'.format(
                    self.settings['run_directory'], epoch, sort_idx[0]), alpha[sort_idx[0]])
                # Plot each developmental step of the best individual as a
                # 3D voxel grid (assumes a 2x5 grid, i.e. <= 10 dev steps —
                # TODO confirm against the fitness function).
                fig = plt.figure(figsize=(20, 10))
                for it in range(0, len(dev_states[1])):
                    voxels = dev_states[sort_idx[0]][it]
                    voxels = voxels.transpose((2, 1, 0))
                    alpha_temp = alpha[sort_idx[0]][it]
                    alpha_temp = alpha_temp.transpose((2, 1, 0))
                    ax = fig.add_subplot(2, 5, it + 1, projection='3d')
                    # RGB palette indexed by voxel material id; alpha channel
                    # comes from the development alpha map.
                    col = [[1, 1, 1], [0, 1, 1], [0, 0, 1], [1, 0, 0], [0, 1, 0]]
                    face_col = np.concatenate(
                        (np.array(col)[voxels.astype(int)], np.expand_dims(alpha_temp, axis=3)), axis=3)
                    ax.set_aspect(aspect=1)
                    ax.voxels(voxels, facecolors=face_col, edgecolor='k')
                plt.savefig('{0}/bestofFiles/gen{1}_id{2}.pdf'.format(
                    self.settings['run_directory'], epoch, sort_idx[0]))
                plt.close()
                if LOG:
                    # NOTE(review): `m` is never defined (the line computing it
                    # is commented out upstream) — this raises NameError if LOG
                    # is enabled. Left untouched pending confirmation.
                    self.writer.add_image("Image", m.transpose(2, 0, 1), epoch)
            # Per-generation bookkeeping: (epoch, best-of-gen, mean, best-ever).
            fitness_log.append((epoch, max_gen_f, np.mean(fitness), max_fitness))
            gen.append(fitness_log[epoch][0])
            maxgenfit.append(fitness_log[epoch][1])
            meanfit.append(fitness_log[epoch][2])
            maxfit.append(fitness_log[epoch][3])
            if epoch % self.settings['fig_output_rate'] == 0:
                # Periodic fitness-curve plot.
                fig = plt.figure()
                ax = fig.add_subplot(1, 1, 1)
                ax.plot(gen, meanfit, linestyle='-', color='r', label='Mean')
                ax.plot(gen, maxfit, linestyle='dotted', color='b', label='Max fitness')
                ax.set_xlabel('Generations')
                ax.set_ylabel('Fitness')
                ax.legend(loc='best')
                plt.savefig('{0}/epoch{1}.pdf'.format(self.settings['run_directory'], epoch))
            # Checkpoint the full population and fitness history every epoch.
            torch.save(population, "{0}/model_stateFiles/weights_gen_{1}.pt".format(
                self.settings['run_directory'], epoch))
            str_ = str(fitness_log)
            str_1 = str(fitness_all)
            with open("{0}/fitnessFiles/fitness_log_gen_{1}.txt".format(
                    self.settings['run_directory'], epoch), 'wt') as f:
                f.write(str_)
            with open("{0}/fitnessFiles/fitness_all_gen_{1}.txt".format(
                    self.settings['run_directory'], epoch), 'wt') as g:
                g.write(str_1)
            # NOTE(review): this reshape repeats the best-of branch above and
            # its result is not used by the reproduction step below; kept for
            # behavioral parity — confirm whether it can be dropped.
            dev_states = np.asarray(dev_states)
            alpha = np.asarray(alpha)
            dev_states = dev_states.reshape(len(population), len(dev_states[1]),
                                            self.settings['im_size'], self.settings['im_size'],
                                            self.settings['im_size'])
            alpha = alpha.reshape(len(population), len(alpha[1]),
                                  self.settings['im_size'], self.settings['im_size'],
                                  self.settings['im_size'])
            # Reproduction: mutate random members of the top-N, then copy the
            # top-N unchanged (elitism).
            new_pop = []
            for idx in range(self.pop_size - elitism):
                i = np.random.randint(0, N)
                p = population[sort_idx[i]]
                new_ind = p + torch.from_numpy(
                    np.random.normal(0, 1, self.weight_shape) * self.sigma).float()
                new_pop.append(new_ind)
            for idx in sort_idx[:elitism]:
                new_pop.append(population[idx])
            population = new_pop
def train_manipulator(model, data_loaders, args):
    """Train an emotion EBM (energy-based model).

    Positive samples are real images; negatives come from two sources: a
    provided negative batch and Langevin-updated masked copies of the
    positives. The loss combines an L2 energy regularizer with a
    maximum-likelihood-style energy gap term.

    Args:
        model: the moment/energy network; must expose `disable_batchnorm()`.
        data_loaders: dict with 'train' and 'test' DataLoaders yielding
            (images, _, emotions, neg_images) tuples.
        args: namespace with device, lr, wd, epochs, classifier_ckpnt,
            checkpoint, langevin_steps, langevin_step_size, ebm_log_fps.

    Returns:
        The trained model (or the loaded one, if the checkpoint says
        training already finished).
    """
    device = args.device
    optimizer = Adam(model.parameters(), lr=args.lr, weight_decay=args.wd)
    model, optimizer, _, start_epoch, is_trained = load_from_ckpnt(
        args.classifier_ckpnt, model, optimizer, scheduler=None
    )
    if is_trained:
        return model
    writer = SummaryWriter('runs/' + args.checkpoint.replace('.pt', ''))
    # Training loop
    for epoch in range(start_epoch, args.epochs):
        print("Epoch: %d/%d" % (epoch + 1, args.epochs))
        kbar = pkbar.Kbar(target=len(data_loaders['train']), width=25)
        model.train()
        model.disable_batchnorm()
        model.zero_grad()
        for step, ex in enumerate(data_loaders['train']):
            images, _, emotions, neg_images = ex
            # positive samples
            pos_samples = images.to(device)
            # prepare negative samples: randomly masked copies of positives...
            neg_samples, neg_masks = rand_mask(images.clone().to(device), device)
            # ...refined by Langevin dynamics on the energy landscape
            neg_ld_samples, neg_list = langevin_updates(
                model, torch.clone(neg_samples),
                args.langevin_steps, args.langevin_step_size, neg_masks
            )
            # Compute energy for positives and both negative sources
            pos_out = model(pos_samples)
            neg_img_out = model(neg_images.to(device))
            neg_ld_out = model(neg_ld_samples.to(device))
            # Loss: L2 regularizer on energies + energy gap (push positives
            # down, negatives up). (Removed: an unused `coeff` ratio that was
            # computed with detach/clone every step but never read.)
            loss_reg = (pos_out**2 + neg_ld_out**2 + neg_img_out**2).mean()
            loss_ml = 2*pos_out.mean() - neg_ld_out.mean() - neg_img_out.mean()
            loss = 0.5*loss_reg + loss_ml
            # Step
            optimizer.zero_grad()
            loss.backward()
            clip_grad(model.parameters(), optimizer)
            optimizer.step()
            kbar.update(step, [("loss", loss)])
            # Log loss
            writer.add_scalar('energy/energy_pos', pos_out.mean().item(),
                              epoch * len(data_loaders['train']) + step)
            writer.add_scalar('energy/energy_neg', neg_ld_out.mean().item(),
                              epoch * len(data_loaders['train']) + step)
            writer.add_scalar('loss/loss_reg', loss_reg.item(),
                              epoch * len(data_loaders['train']) + step)
            writer.add_scalar('loss/loss_ml', loss_ml.item(),
                              epoch * len(data_loaders['train']) + step)
            writer.add_scalar('loss/loss_total', loss.item(),
                              epoch * len(data_loaders['train']) + step)
            # Log image evolution every 50 steps
            if step % 50 != 0:
                continue
            writer.add_image(
                'random_image_sample',
                back2color(unnormalize_imagenet_rgb(pos_samples[0], device)),
                epoch * len(data_loaders['train']) + step
            )
            # Prepend a black frame so the video start is visible.
            neg_list = [
                back2color(unnormalize_imagenet_rgb(neg, device))
                for neg in neg_list
            ]
            neg_list = [torch.zeros_like(neg_list[0])] + neg_list
            vid_to_write = torch.stack(neg_list, dim=0).unsqueeze(0)
            writer.add_video(
                'ebm_evolution', vid_to_write, fps=args.ebm_log_fps,
                global_step=epoch * len(data_loaders['train']) + step
            )
        writer.add_scalar(
            'lr', optimizer.state_dict()['param_groups'][0]['lr'], epoch
        )
        # Save checkpoint: one rolling file plus one per-epoch snapshot
        torch.save(
            {
                "epoch": epoch + 1,
                "model_state_dict": model.state_dict(),
                "optimizer_state_dict": optimizer.state_dict()
            },
            args.classifier_ckpnt
        )
        torch.save(
            {
                "epoch": epoch + 1,
                "model_state_dict": model.state_dict(),
                "optimizer_state_dict": optimizer.state_dict()
            },
            "manipulator_%02d.pt" % (epoch+1)
        )
        print('\nValidation')
        print(eval_manipulator(model, data_loaders['test'], args))
    return model
class Writer():
    """Experiment logger: writes metrics (parquet), images, videos, envs and
    tables under an output folder, optionally mirroring to TensorBoard.

    Several `add_*` methods are gated by the project's `@selector` decorator
    (defined elsewhere); `self.meta` carries run context such as '_step',
    'mode', 'episode', 'episode_step' used to name and index outputs.
    """

    def __init__(self, output_folder, periods, flush_period, use_tb=False, **meta):
        """
        Args:
            output_folder: root folder; subfolders are created as needed.
            periods: logging periods (consumed by `@selector` — TODO confirm).
            flush_period: buffered trace count that triggers a metrics flush.
            use_tb: also log to TensorBoard under <output_folder>/tensorboard.
            **meta: initial run metadata (merged into every record).
        """
        self.meta = meta
        ensure_dir(output_folder)
        self.metric_rows = {}
        self.meta_rows = {}
        self.metrics_folder = f"{output_folder}/metrics"
        if use_tb:
            self.tensorboard_writer = SummaryWriter(
                log_dir=f'{output_folder}/tensorboard')
        else:
            self.tensorboard_writer = None
        self.image_folder = f"{output_folder}/images"
        self.env_folder = f"{output_folder}/envs"
        self.video_folder = f"{output_folder}/videos"
        self.model_folder = f"{output_folder}/models"
        self.df_folder = f"{output_folder}/df"
        ensure_dir(self.df_folder)
        ensure_dir(self.metrics_folder)
        ensure_dir(self.env_folder)
        self.periods = periods
        self.frames = {}
        self.traces = {}
        self.flush_idx = 0
        self.flush_period = flush_period

    def add_meta(self, **meta):
        """Merge new metadata keys into the current run metadata."""
        self.meta = {**self.meta, **meta}

    @selector
    def check_on(self):
        """Probe used with @selector to test whether logging is active."""
        return True

    @property
    def step(self):
        """Current global step, taken from the '_step' metadata key."""
        return self.meta['_step']

    @selector
    def add_table(self, **kwargs):
        """Write a DataFrame as CSV (see `_write_table` for arguments)."""
        self._write_table(**kwargs)

    @selector
    def add_env(self, env):
        """Dump an environment (via its `to_dict()`) as JSON, named by
        current mode and episode."""
        filename = os.path.join(
            self.env_folder,
            f"{self.meta['mode']}.{self.meta['episode']}.json")
        with open(filename, 'w') as outfile:
            json.dump(env.to_dict(), outfile)

    @selector
    def add_metrics2(self, scope, metrics):
        """Buffer per-step metric tensors under `scope`; flush to parquet
        once the buffer exceeds `flush_period` entries.

        `metrics` maps name -> tensor (moved to CPU/numpy here).
        """
        if scope not in self.traces:
            self.traces[scope] = {
                'values': [], 'episode': [], 'episode_step': [],
                'mode': [], 'name': []
            }
        metrics = {k: v.cpu().numpy() for k, v in metrics.items()}
        for k, v in metrics.items():
            self.traces[scope]['episode'].append(self.meta['episode'])
            self.traces[scope]['episode_step'].append(
                self.meta['episode_step'])
            self.traces[scope]['mode'].append(self.meta['mode'])
            self.traces[scope]['name'].append(k)
            self.traces[scope]['values'].append(v)
        if len(self.traces[scope]['values']) > self.flush_period:
            self.metrics2_flush()

    def metrics2_flush(self):
        """Write all buffered traces to per-scope parquet files and reset
        the buffer. Vector-valued entries become one 'agent_<i>' column
        each; scalars a single 'value' column."""
        for scope_name, traces in self.traces.items():
            values = traces.pop('values')
            index = pd.MultiIndex.from_frame(pd.DataFrame(traces))
            if values[0].size > 1:
                columns = pd.Series(
                    [f'agent_{i}' for i in range(len(values[0]))],
                    name='agents')
                df = pd.DataFrame(data=values, index=index, columns=columns)
            else:
                df = pd.DataFrame(data=values, index=index, columns=['value'])
            metrics_file = os.path.join(
                self.metrics_folder, f"{scope_name}.{self.flush_idx}.parquet")
            df.to_parquet(metrics_file)
        self.traces = {}
        self.flush_idx += 1

    @selector
    def add_metrics(self, name, metrics, meta, tf=()):
        """Record one row of metrics (plus merged metadata) under `name`;
        names listed in `tf` are also sent to TensorBoard.

        FIX: `tf` default was a mutable `[]`; it is only iterated, so an
        empty tuple is a safe, shared-state-free default.
        """
        if self.tensorboard_writer:
            for n in tf:
                self.tensorboard_writer.add_scalar(n, metrics[n], self.step)
        meta = {**self.meta, **meta}
        if name in self.meta_rows:
            assert name in self.metric_rows
            self.meta_rows[name].append(parse_dict(meta))
            self.metric_rows[name].append(parse_dict(metrics))
        else:
            self.meta_rows[name] = [parse_dict(meta)]
            self.metric_rows[name] = [parse_dict(metrics)]

    @selector
    def add_image(self, name, image):
        """Log an image to TensorBoard (if enabled) and write it as PNG."""
        name = name.format(**self.meta)
        if self.tensorboard_writer:
            self.tensorboard_writer.add_image(name, image, self.step)
        self._write_image(name, image)

    @selector
    def add_video(self, name, video):
        """Log a single video (batch dim must be 1) to TensorBoard and disk."""
        name = name.format(**self.meta)
        if self.tensorboard_writer:
            self.tensorboard_writer.add_video(name, video, self.step, fps=1)
        assert video.shape[0] == 1, 'Multiple videos are not yet supported'
        self._write_video(name, video[0], fps=1)

    @selector
    def add_frame(self, name, callback, flush=False):
        """Append a frame produced by `callback()` to the buffer for `name`;
        optionally flush all frame buffers into videos."""
        name = name.format(**self.meta)
        if name not in self.frames:
            self.frames[name] = [callback()]
        else:
            self.frames[name].append(callback())
        if flush:
            self.frames_flush()

    def frames_flush(self):
        """Concatenate buffered frames (along the time dim) into videos and
        clear the buffer."""
        for name, frames in self.frames.items():
            video = th.cat(frames, dim=1)
            self.add_video(name, video)
        self.frames = {}

    def _write_image(self, name, array):
        """Write a single image tensor (batch dim must be 1) as PNG."""
        ensure_dir(self.image_folder)
        file_name = os.path.join(
            self.image_folder, f'{name}.{self.step}.png')
        assert array.shape[0] == 1, 'Multiple images are not yet supported'
        imageio.imwrite(file_name, array[0].detach().numpy())

    def _write_video(self, name, array, fps=1):
        """Write a (T, C, H, W) tensor as an mp4.

        FIX: the `fps` parameter was accepted but hard-coded to 1 in the
        ImageSequenceClip call; it is now forwarded.
        """
        ensure_dir(self.video_folder)
        file_name = os.path.join(self.video_folder, f'{name}.{self.step}.mp4')
        # (T, C, H, W) -> (T, H, W, C) for moviepy
        array_np = array.transpose(1, 3).transpose(1, 2).cpu().numpy()
        clip = ImageSequenceClip([f for f in array_np], fps=fps)
        clip.write_videofile(file_name)

    def _write_table(self, df, name, sheet):
        """Write DataFrame `df` as <df_folder>/<name>.<sheet>.csv."""
        df.to_csv(f"{self.df_folder}/{name}.{sheet}.csv")

    def flush(self):
        """Flush all buffered rows and traces to disk.

        NOTE(review): `metrics2_flush` already bumps `flush_idx`, so a full
        `flush()` advances it twice (file indices skip a number) — confirm
        whether that is intentional before changing.
        """
        self.rows_flush()
        self.metrics2_flush()
        self.flush_idx += 1

    def __del__(self):
        # Best-effort final flush; may be skipped or partial at interpreter
        # shutdown.
        self.flush()

    def rows_flush(self):
        """Write buffered metric/meta rows to one parquet file per name.

        NOTE(review): the row buffers are not cleared afterwards, so a later
        flush rewrites (and the buffers re-accumulate) — confirm intent.
        """
        names = self.metric_rows.keys()
        for n in names:
            df_metrics = pd.DataFrame.from_records(self.metric_rows[n])
            df_meta = pd.DataFrame.from_records(
                self.meta_rows[n]).astype('category')
            df = pd.concat([df_meta, df_metrics], axis=1)
            metrics_file = os.path.join(self.metrics_folder, f"{n}.parquet")
            df.to_parquet(metrics_file)

    @selector
    def write_module(self, name, module):
        """Log histograms of all named parameters of a torch module."""
        name = name.format(**self.meta)
        if self.tensorboard_writer:
            for p_name, values in module.named_parameters():
                self.tensorboard_writer.add_histogram(f'{name}.{p_name}',
                                                      values, self.step)

    def set_details(self, details):
        """Attach free-form details to this writer."""
        self.details = details
# --- NOTE(review): the lines below are the tail of a CNN `forward(self, t)`
# method whose `def` header is not present in this chunk (concatenation
# artifact); left untouched rather than guessing at the missing part. ---
        # flatten conv features to (batch, 12*4*4) for the linear layers
        t = t.reshape(-1, 12 * 4 * 4)
        t = self.fc1(t)
        t = F.relu(t)
        # (5) hidden linear layer
        t = self.fc2(t)
        t = F.relu(t)
        # (6) output layer
        t = self.out(t)
        #t = F.softmax(t, dim=1)
        return t


# --- Top-level script: build the network, grab one batch, and log an image
# grid plus the model graph to TensorBoard. ---
# Gradients disabled globally: this script only does forward passes for
# visualization, no training step is taken (the optimizer is created but
# never stepped).
torch.set_grad_enabled(False)
network = Network()
train_loader = torch.utils.data.DataLoader(train_set, batch_size=1000, shuffle=True)
optimizer = optim.Adam(network.parameters(), lr=0.01)
# One batch of images for the grid and the graph trace.
images, labels = next(iter(train_loader))
grid = torchvision.utils.make_grid(images)
tb = SummaryWriter()
tb.add_image('images', grid)
tb.add_graph(network, images)
tb.close()
class TRAModel(Model):
    """
    TRA Model

    Args:
        model_config (dict): model config (will be used by RNN or Transformer)
        tra_config (dict): TRA config (will be used by TRA)
        model_type (str): which backbone model to use (RNN/Transformer)
        lr (float): learning rate
        n_epochs (int): number of total epochs
        early_stop (int): early stop when performance not improved at this step
        update_freq (int): gradient update frequency
        max_steps_per_epoch (int): maximum number of steps in one epoch
        lamb (float): regularization parameter
        rho (float): exponential decay rate for `lamb`
        alpha (float): fusion parameter for calculating transport loss matrix
        seed (int): random seed
        logdir (str): local log directory
        eval_train (bool): whether evaluate train set between epochs
        eval_test (bool): whether evaluate test set between epochs
        pretrain (bool): whether pretrain the backbone model before training TRA.
            Note that only TRA will be optimized after pretraining
        init_state (str): model init state path
        freeze_model (bool): whether freeze backbone model parameters
        freeze_predictors (bool): whether freeze predictors parameters
        transport_method (str): transport method, can be none/router/oracle
        memory_mode (str): memory mode, the same argument for MTSDatasetH
    """

    def __init__(
        self,
        model_config,
        tra_config,
        model_type="RNN",
        lr=1e-3,
        n_epochs=500,
        early_stop=50,
        update_freq=1,
        max_steps_per_epoch=None,
        lamb=0.0,
        rho=0.99,
        alpha=1.0,
        seed=None,
        logdir=None,
        eval_train=False,
        eval_test=False,
        pretrain=False,
        init_state=None,
        reset_router=False,
        freeze_model=False,
        freeze_predictors=False,
        transport_method="none",
        memory_mode="sample",
    ):
        self.logger = get_module_logger("TRA")

        assert memory_mode in ["sample", "daily"], "invalid memory mode"
        assert transport_method in [
            "none", "router", "oracle"
        ], f"invalid transport method {transport_method}"
        assert transport_method == "none" or tra_config[
            "num_states"] > 1, "optimal transport requires `num_states` > 1"
        assert (memory_mode != "daily" or tra_config["src_info"] ==
                "TPE"), "daily transport can only support TPE as `src_info`"

        if transport_method == "router" and not eval_train:
            self.logger.warning(
                "`eval_train` will be ignored when using TRA.router")

        if seed is not None:
            np.random.seed(seed)
            torch.manual_seed(seed)

        self.model_config = model_config
        self.tra_config = tra_config
        self.model_type = model_type
        self.lr = lr
        self.n_epochs = n_epochs
        self.early_stop = early_stop
        self.update_freq = update_freq
        self.max_steps_per_epoch = max_steps_per_epoch
        self.lamb = lamb
        self.rho = rho
        self.alpha = alpha
        self.seed = seed
        self.logdir = logdir
        self.eval_train = eval_train
        self.eval_test = eval_test
        self.pretrain = pretrain
        self.init_state = init_state
        self.reset_router = reset_router
        self.freeze_model = freeze_model
        self.freeze_predictors = freeze_predictors
        self.transport_method = transport_method
        self.use_daily_transport = memory_mode == "daily"
        self.transport_fn = transport_daily if self.use_daily_transport else transport_sample

        self._writer = None
        if self.logdir is not None:
            if os.path.exists(self.logdir):
                self.logger.warning(f"logdir {self.logdir} is not empty")
            os.makedirs(self.logdir, exist_ok=True)
            if SummaryWriter is not None:
                self._writer = SummaryWriter(log_dir=self.logdir)

        self._init_model()

    def _init_model(self):
        """Instantiate backbone + TRA head, optionally restore/freeze
        parameters, and build the optimizer."""
        self.logger.info("init TRAModel...")

        self.model = eval(self.model_type)(**self.model_config).to(device)
        print(self.model)

        self.tra = TRA(self.model.output_size, **self.tra_config).to(device)
        print(self.tra)

        if self.init_state:
            self.logger.warning(f"load state dict from `init_state`")
            state_dict = torch.load(self.init_state, map_location="cpu")
            self.model.load_state_dict(state_dict["model"])
            # TRA head may have a different shape: load non-strictly
            res = load_state_dict_unsafe(self.tra, state_dict["tra"])
            self.logger.warning(str(res))

        if self.reset_router:
            self.logger.warning(f"reset TRA.router parameters")
            self.tra.fc.reset_parameters()
            self.tra.router.reset_parameters()

        if self.freeze_model:
            self.logger.warning(f"freeze model parameters")
            for param in self.model.parameters():
                param.requires_grad_(False)

        if self.freeze_predictors:
            self.logger.warning(f"freeze TRA.predictors parameters")
            for param in self.tra.predictors.parameters():
                param.requires_grad_(False)

        self.logger.info("# model params: %d" % sum(
            [p.numel() for p in self.model.parameters() if p.requires_grad]))
        self.logger.info(
            "# tra params: %d" %
            sum([p.numel() for p in self.tra.parameters() if p.requires_grad]))

        self.optimizer = optim.Adam(list(self.model.parameters()) +
                                    list(self.tra.parameters()),
                                    lr=self.lr)

        self.fitted = False
        self.global_step = -1

    def train_epoch(self, epoch, data_set, is_pretrain=False):
        """Run one training epoch; returns the mean loss over steps."""
        self.model.train()
        self.tra.train()
        data_set.train()
        self.optimizer.zero_grad()

        P_all = []
        prob_all = []
        choice_all = []
        max_steps = len(data_set)
        if self.max_steps_per_epoch is not None:
            if epoch == 0 and self.max_steps_per_epoch < max_steps:
                self.logger.info(
                    f"max steps updated from {max_steps} to {self.max_steps_per_epoch}"
                )
            max_steps = min(self.max_steps_per_epoch, max_steps)

        cur_step = 0
        total_loss = 0
        total_count = 0
        for batch in tqdm(data_set, total=max_steps):
            cur_step += 1
            if cur_step > max_steps:
                break

            if not is_pretrain:
                self.global_step += 1

            data, state, label, count = batch["data"], batch["state"], batch[
                "label"], batch["daily_count"]
            index = batch[
                "daily_index"] if self.use_daily_transport else batch["index"]

            with torch.set_grad_enabled(not self.freeze_model):
                hidden = self.model(data)
            all_preds, choice, prob = self.tra(hidden, state)

            if is_pretrain or self.transport_method != "none":
                # NOTE: use oracle transport for pre-training
                loss, pred, L, P = self.transport_fn(
                    all_preds,
                    label,
                    choice,
                    prob,
                    state.mean(dim=1),
                    count,
                    self.transport_method if not is_pretrain else "oracle",
                    self.alpha,
                    training=True,
                )
                data_set.assign_data(index, L)  # save loss to memory
                if self.use_daily_transport:  # only save for daily transport
                    P_all.append(
                        pd.DataFrame(P.detach().cpu().numpy(), index=index))
                    prob_all.append(
                        pd.DataFrame(prob.detach().cpu().numpy(), index=index))
                    choice_all.append(
                        pd.DataFrame(choice.detach().cpu().numpy(),
                                     index=index))
                decay = self.rho**(self.global_step // 100
                                   )  # decay every 100 steps
                lamb = 0 if is_pretrain else self.lamb * decay
                reg = prob.log().mul(P).sum(
                    dim=1).mean()  # train router to predict OT assignment
                if self._writer is not None and not is_pretrain:
                    self._writer.add_scalar("training/router_loss",
                                            -reg.item(), self.global_step)
                    self._writer.add_scalar("training/reg_loss", loss.item(),
                                            self.global_step)
                    self._writer.add_scalar("training/lamb", lamb,
                                            self.global_step)
                    if not self.use_daily_transport:
                        P_mean = P.mean(axis=0).detach()
                        self._writer.add_scalar("training/P",
                                                P_mean.max() / P_mean.min(),
                                                self.global_step)
                loss = loss - lamb * reg
            else:
                pred = all_preds.mean(dim=1)
                loss = loss_fn(pred, label)

            (loss / self.update_freq).backward()
            if cur_step % self.update_freq == 0:
                self.optimizer.step()
                self.optimizer.zero_grad()

            if self._writer is not None and not is_pretrain:
                self._writer.add_scalar("training/total_loss", loss.item(),
                                        self.global_step)

            total_loss += loss.item()
            total_count += 1

        if self.use_daily_transport and len(P_all):
            P_all = pd.concat(P_all, axis=0)
            prob_all = pd.concat(prob_all, axis=0)
            choice_all = pd.concat(choice_all, axis=0)
            P_all.index = data_set.restore_daily_index(P_all.index)
            prob_all.index = P_all.index
            choice_all.index = P_all.index
            # FIX: also guard on `self._writer` — without it this crashed
            # with AttributeError whenever `logdir` was None (writer unset)
            # in daily-transport mode.
            if self._writer is not None and not is_pretrain:
                self._writer.add_image("P",
                                       plot(P_all),
                                       epoch,
                                       dataformats="HWC")
                self._writer.add_image("prob",
                                       plot(prob_all),
                                       epoch,
                                       dataformats="HWC")
                self._writer.add_image("choice",
                                       plot(choice_all),
                                       epoch,
                                       dataformats="HWC")

        total_loss /= total_count

        if self._writer is not None and not is_pretrain:
            self._writer.add_scalar("training/loss", total_loss, epoch)

        return total_loss

    def test_epoch(self,
                   epoch,
                   data_set,
                   return_pred=False,
                   prefix="test",
                   is_pretrain=False):
        """Evaluate on `data_set`; returns (metrics, preds, probs, P_all).

        `preds`/`probs`/`P_all` are populated only when `return_pred` is
        True; pass `epoch=-1` to skip TensorBoard metric logging (used for
        memory initialization and final inference).
        """
        self.model.eval()
        self.tra.eval()
        data_set.eval()

        preds = []
        probs = []
        P_all = []
        metrics = []
        for batch in tqdm(data_set):
            data, state, label, count = batch["data"], batch["state"], batch[
                "label"], batch["daily_count"]
            index = batch[
                "daily_index"] if self.use_daily_transport else batch["index"]

            with torch.no_grad():
                hidden = self.model(data)
                all_preds, choice, prob = self.tra(hidden, state)

            if is_pretrain or self.transport_method != "none":
                loss, pred, L, P = self.transport_fn(
                    all_preds,
                    label,
                    choice,
                    prob,
                    state.mean(dim=1),
                    count,
                    self.transport_method if not is_pretrain else "oracle",
                    self.alpha,
                    training=False,
                )
                data_set.assign_data(index, L)  # save loss to memory
                if P is not None and return_pred:
                    P_all.append(pd.DataFrame(P.cpu().numpy(), index=index))
            else:
                pred = all_preds.mean(dim=1)

            X = np.c_[pred.cpu().numpy(),
                      label.cpu().numpy(),
                      all_preds.cpu().numpy()]
            columns = ["score", "label"
                       ] + ["score_%d" % d for d in range(all_preds.shape[1])]
            pred = pd.DataFrame(X, index=batch["index"], columns=columns)

            metrics.append(evaluate(pred))

            if return_pred:
                preds.append(pred)
                if prob is not None:
                    columns = [
                        "prob_%d" % d for d in range(all_preds.shape[1])
                    ]
                    probs.append(
                        pd.DataFrame(prob.cpu().numpy(),
                                     index=index,
                                     columns=columns))

        metrics = pd.DataFrame(metrics)
        metrics = {
            "MSE": metrics.MSE.mean(),
            "MAE": metrics.MAE.mean(),
            "IC": metrics.IC.mean(),
            "ICIR": metrics.IC.mean() / metrics.IC.std(),
        }

        if self._writer is not None and epoch >= 0 and not is_pretrain:
            for key, value in metrics.items():
                self._writer.add_scalar(prefix + "/" + key, value, epoch)

        if return_pred:
            preds = pd.concat(preds, axis=0)
            preds.index = data_set.restore_index(preds.index)
            preds.index = preds.index.swaplevel()
            preds.sort_index(inplace=True)

            if probs:
                probs = pd.concat(probs, axis=0)
                if self.use_daily_transport:
                    probs.index = data_set.restore_daily_index(probs.index)
                else:
                    probs.index = data_set.restore_index(probs.index)
                    probs.index = probs.index.swaplevel()
                    probs.sort_index(inplace=True)

            if len(P_all):
                P_all = pd.concat(P_all, axis=0)
                if self.use_daily_transport:
                    P_all.index = data_set.restore_daily_index(P_all.index)
                else:
                    P_all.index = data_set.restore_index(P_all.index)
                    P_all.index = P_all.index.swaplevel()
                    P_all.sort_index(inplace=True)

        return metrics, preds, probs, P_all

    def _fit(self,
             train_set,
             valid_set,
             test_set,
             evals_result,
             is_pretrain=True):
        """Shared train loop with early stopping on validation IC; restores
        the best parameters before returning the best validation score."""
        best_score = -1
        best_epoch = 0
        stop_rounds = 0
        best_params = {
            "model": copy.deepcopy(self.model.state_dict()),
            "tra": copy.deepcopy(self.tra.state_dict()),
        }
        # train
        if not is_pretrain and self.transport_method != "none":
            self.logger.info("init memory...")
            self.test_epoch(-1, train_set)

        for epoch in range(self.n_epochs):
            self.logger.info("Epoch %d:", epoch)

            self.logger.info("training...")
            self.train_epoch(epoch, train_set, is_pretrain=is_pretrain)

            self.logger.info("evaluating...")
            # NOTE: during evaluating, the whole memory will be refreshed
            if not is_pretrain and (self.transport_method == "router"
                                    or self.eval_train):
                train_set.clear_memory()  # NOTE: clear the shared memory
                train_metrics = self.test_epoch(epoch,
                                                train_set,
                                                is_pretrain=is_pretrain,
                                                prefix="train")[0]
                evals_result["train"].append(train_metrics)
                self.logger.info("train metrics: %s" % train_metrics)

            valid_metrics = self.test_epoch(epoch,
                                            valid_set,
                                            is_pretrain=is_pretrain,
                                            prefix="valid")[0]
            evals_result["valid"].append(valid_metrics)
            self.logger.info("valid metrics: %s" % valid_metrics)

            if self.eval_test:
                test_metrics = self.test_epoch(epoch,
                                               test_set,
                                               is_pretrain=is_pretrain,
                                               prefix="test")[0]
                evals_result["test"].append(test_metrics)
                self.logger.info("test metrics: %s" % test_metrics)

            if valid_metrics["IC"] > best_score:
                best_score = valid_metrics["IC"]
                stop_rounds = 0
                best_epoch = epoch
                best_params = {
                    "model": copy.deepcopy(self.model.state_dict()),
                    "tra": copy.deepcopy(self.tra.state_dict()),
                }
                if self.logdir is not None:
                    torch.save(best_params, self.logdir + "/model.bin")
            else:
                stop_rounds += 1
                if stop_rounds >= self.early_stop:
                    self.logger.info("early stop @ %s" % epoch)
                    break

        self.logger.info("best score: %.6lf @ %d" % (best_score, best_epoch))
        self.model.load_state_dict(best_params["model"])
        self.tra.load_state_dict(best_params["tra"])

        return best_score

    def fit(self, dataset, evals_result=None):
        """Full training entry point: optional pretraining, main training,
        final inference over all segments, and (if `logdir`) artifact dump.

        FIX: `evals_result` previously defaulted to a shared mutable
        `dict()`, so results accumulated across `fit` calls; a fresh dict is
        now created per call (callers passing their own dict are unaffected).
        """
        if evals_result is None:
            evals_result = {}
        assert isinstance(
            dataset, MTSDatasetH
        ), "TRAModel only supports `qlib.contrib.data.dataset.MTSDatasetH`"

        train_set, valid_set, test_set = dataset.prepare(
            ["train", "valid", "test"])

        self.fitted = True
        self.global_step = -1

        evals_result["train"] = []
        evals_result["valid"] = []
        evals_result["test"] = []

        if self.pretrain:
            self.logger.info("pretraining...")
            # pretrain optimizes only backbone + predictors
            self.optimizer = optim.Adam(
                list(self.model.parameters()) +
                list(self.tra.predictors.parameters()),
                lr=self.lr)
            self._fit(train_set,
                      valid_set,
                      test_set,
                      evals_result,
                      is_pretrain=True)

            # reset optimizer
            self.optimizer = optim.Adam(list(self.model.parameters()) +
                                        list(self.tra.parameters()),
                                        lr=self.lr)

        self.logger.info("training...")
        best_score = self._fit(train_set,
                               valid_set,
                               test_set,
                               evals_result,
                               is_pretrain=False)

        self.logger.info("inference")
        train_metrics, train_preds, train_probs, train_P = self.test_epoch(
            -1, train_set, return_pred=True)
        self.logger.info("train metrics: %s" % train_metrics)

        valid_metrics, valid_preds, valid_probs, valid_P = self.test_epoch(
            -1, valid_set, return_pred=True)
        self.logger.info("valid metrics: %s" % valid_metrics)

        test_metrics, test_preds, test_probs, test_P = self.test_epoch(
            -1, test_set, return_pred=True)
        self.logger.info("test metrics: %s" % test_metrics)

        if self.logdir:
            self.logger.info("save model & pred to local directory")

            pd.concat(
                {
                    name: pd.DataFrame(evals_result[name])
                    for name in evals_result
                },
                axis=1).to_csv(self.logdir + "/logs.csv", index=False)

            torch.save(
                {
                    "model": self.model.state_dict(),
                    "tra": self.tra.state_dict()
                }, self.logdir + "/model.bin")

            train_preds.to_pickle(self.logdir + "/train_pred.pkl")
            valid_preds.to_pickle(self.logdir + "/valid_pred.pkl")
            test_preds.to_pickle(self.logdir + "/test_pred.pkl")

            if len(train_probs):
                train_probs.to_pickle(self.logdir + "/train_prob.pkl")
                valid_probs.to_pickle(self.logdir + "/valid_prob.pkl")
                test_probs.to_pickle(self.logdir + "/test_prob.pkl")

            if len(train_P):
                train_P.to_pickle(self.logdir + "/train_P.pkl")
                valid_P.to_pickle(self.logdir + "/valid_P.pkl")
                test_P.to_pickle(self.logdir + "/test_P.pkl")

            info = {
                "config": {
                    "model_config": self.model_config,
                    "tra_config": self.tra_config,
                    "model_type": self.model_type,
                    "lr": self.lr,
                    "n_epochs": self.n_epochs,
                    "early_stop": self.early_stop,
                    "max_steps_per_epoch": self.max_steps_per_epoch,
                    "lamb": self.lamb,
                    "rho": self.rho,
                    "alpha": self.alpha,
                    "seed": self.seed,
                    "logdir": self.logdir,
                    "pretrain": self.pretrain,
                    "init_state": self.init_state,
                    "transport_method": self.transport_method,
                    "use_daily_transport": self.use_daily_transport,
                },
                "best_eval_metric": -best_score,  # NOTE: -1 for minimize
                "metrics": {
                    "train": train_metrics,
                    "valid": valid_metrics,
                    "test": test_metrics
                },
            }
            with open(self.logdir + "/info.json", "w") as f:
                json.dump(info, f)

    def predict(self, dataset, segment="test"):
        """Run inference on one dataset segment and return the predictions
        DataFrame. Raises ValueError if called before `fit`."""
        assert isinstance(
            dataset, MTSDatasetH
        ), "TRAModel only supports `qlib.contrib.data.dataset.MTSDatasetH`"

        if not self.fitted:
            raise ValueError("model is not fitted yet!")

        test_set = dataset.prepare(segment)

        metrics, preds, _, _ = self.test_epoch(-1, test_set, return_pred=True)
        self.logger.info("test metrics: %s" % metrics)

        return preds
class Trainer:
    """Trainer for naive MoLM model.

    Alternates between fitting the moment network (``MoNet``), which defines
    discriminative moments, and fitting the generator ``G`` to match those
    moments on the observed data.
    """

    def __init__(
        self,
        generator,
        moment_network,
        train_set,
        training_params,
        device=None,
        scores=None,
        tensorboard=False,
        save_folder="runs/run",
        eval_generate_images=False,
    ):
        """
        generator: a nn.Module child class serving as a generator network
        moment_network: a nn.Module child class serving as the moment network
        train_set: the training dataset
        scores: None, or a dict of shape {'name': obj} with score objects
            with a __call__ function that returns a score
        training_params: dict of training parameters with:
            no: number of objectives
            nm: number of moment-network training steps
            ng: number of generator training steps
            lr: learning rate
            beta1 / beta2: Adam parameters
            activation_weight: activation weights
            alpha: the gradient-norm penalty parameter
            gen_batch_size: the batch size to train the generator
            mom_batch_size: the batch size to train the moment network
            eval_batch_size: the batch size to evaluate the generated samples
            eval_size: total number of generated samples on which to
                evaluate the scores
        tensorboard: whether to use tensorboard to save training information
        save_folder: root folder to save the training information
        eval_generate_images: generates images during training for evaluation
        """
        self.G = generator
        self.MoNet = moment_network
        self.train_set = train_set
        self.training_params = training_params
        self.nm = training_params["nm"]
        self.ng = training_params["ng"]
        self.no = training_params["no"]
        self.no_obj = 0  # current objective
        self.n_moments = training_params["n_moments"]
        self.gen_batch_size = training_params["gen_batch_size"]
        self.eval_batch_size = training_params["eval_batch_size"]
        self.learn_moments = training_params["learn_moments"]
        lr, beta1, beta2 = (
            self.training_params["lr"],
            self.training_params["beta1"],
            self.training_params["beta2"],
        )
        self.optimizerG = optim.Adam(self.G.parameters(), lr=lr,
                                     betas=(beta1, beta2))
        self.optimizerM = optim.Adam(self.MoNet.parameters(), lr=lr,
                                     betas=(beta1, beta2))
        self.LM = []  # moment-network loss history
        self.LG = []  # generator loss history
        self.iter = 0
        self.device = device
        self.cross_entropy = F.binary_cross_entropy
        self.mse = MSELoss(reduction="sum")
        # to track the evolution of generated samples from a single batch of noises
        self.fixed_z = torch.randn(20, self.G.dims[0], device=self.device)
        # saving training info
        self.run_folder = Path(save_folder)
        if not (self.run_folder / "results").exists():
            os.mkdir(self.run_folder / "results")
        self.save_path_img = self.run_folder / "results/images/"
        self.save_path_checkpoints = self.run_folder / "checkpoints/"
        if not self.save_path_checkpoints.exists():
            os.mkdir(self.save_path_checkpoints)
        self.eval_generate_images = eval_generate_images
        # monitoring the progress of the training with the evaluation scores
        self.scores = scores
        if scores is not None and not (self.run_folder / "scores.csv").exists():
            # Save scores header
            with open(self.run_folder / "scores.csv", "w") as f:
                f.write(f'Objective,{",".join(scores.keys())}\n')
        # monitoring through tensorboard
        if tensorboard:
            comment = "".join([
                "{}={} ".format(key, training_params[key])
                for key in training_params
            ])
            self.tb = SummaryWriter(self.run_folder, comment=comment)
            self.tb.add_graph(generator, self.fixed_z)
        else:
            self.tb = None
        # set up handler to file (logger/formatter are module-level)
        fh = logging.FileHandler(self.run_folder / "logging.txt")
        fh.setLevel(logging.INFO)
        fh.setFormatter(formatter)
        logger.addHandler(fh)

    def train_monet(self):
        """Solves one Moment Network objective."""
        # reshuffle training data
        loader = iter(
            torch.utils.data.DataLoader(
                self.train_set,
                shuffle=True,
                batch_size=self.training_params["mom_batch_size"],
            ))
        for i in range(self.nm):
            # FIX: `loader.next()` is Python-2-only; use the builtin next().
            batch = next(loader)
            samples = batch
            samples = samples.to(self.device)
            # samples = (samples * 2) - 1
            sample_size = samples.size(0)
            one_labels = torch.ones(sample_size, device=self.device)
            zero_labels = torch.zeros(sample_size, device=self.device)
            # generating latent vector
            # self.dims = [Z_dim, h1_dim, h2_dim, h3_dim, X_dim]
            z = torch.randn(sample_size, self.G.dims[0], device=self.device)
            res = self.G(z)
            # MoNet exposes its penultimate activations via `.output`
            prob_trues = self.MoNet(samples)
            output_trues = self.MoNet.output
            prob_gen = self.MoNet(res)
            output_gen = self.MoNet.output  # kept: reading .output may matter for MoNet state
            prob_trues, prob_gen = prob_trues.squeeze(), prob_gen.squeeze()
            LM_samples = self.cross_entropy(prob_trues, one_labels)
            LM_gen = self.cross_entropy(prob_gen, zero_labels)
            LM = LM_samples + LM_gen
            # We now need to compute the gradients to add the regularization term
            mean_output = output_trues.mean()
            self.optimizerM.zero_grad()
            grad_monet = self.MoNet.get_gradients(mean_output)
            grad_monet = grad_monet.squeeze()
            grad_norm = torch.dot(grad_monet, grad_monet)
            # gradient-norm penalty keeps the moment gradients near unit norm
            LM = (LM_samples + LM_gen +
                  self.training_params["alpha"] * ((grad_norm - 1)**2))
            # Add to tensorboard
            if self.tb:
                self.tb.add_scalar(
                    "LossMonet/objective_{}".format(self.no_obj + 1),
                    float(LM), i + 1)
            self.LM.append(float(LM))
            if i % 50 == 0:
                logger.info("Moment Network Iteration {}/{}: LM: {:.6}".format(
                    i + 1, self.nm, LM.item()))
            self.optimizerM.zero_grad()
            LM.backward()
            self.optimizerM.step()
            del grad_monet
            del batch

    def eval_true_moments(self):
        """Returns the value of moment vector on observed data."""
        loader = torch.utils.data.DataLoader(
            self.train_set,
            shuffle=True,
            batch_size=self.training_params["mom_batch_size"],
        )
        # Calculate the moment vector over the entire dataset:
        moments = torch.zeros(self.n_moments, device=self.device)
        for i, batch in enumerate(loader):
            samples = batch
            samples = samples.to(self.device)
            sample_size = samples.size(0)
            # NOT Scaling true images to tanh activation interval:
            # samples = (samples * 2) - 1
            self.optimizerM.zero_grad()
            moments_b = self.MoNet.get_moment_vector(
                samples,
                sample_size,
                weights=self.training_params["activation_weight"],
                detach=True,
            )
            # incremental (running) mean over batches
            moments = ((i) * moments + moments_b) / (i + 1)
            del batch
            del samples
            del moments_b
        return moments

    def train_generator(self, true_moments):
        """Solves one generator objective."""
        for i in range(self.ng):
            z = torch.randn(self.gen_batch_size, self.G.dims[0],
                            device=self.device)
            res = self.G(z)
            self.optimizerM.zero_grad()
            moments_gz = self.MoNet.get_moment_vector(
                res,
                self.gen_batch_size,
                weights=self.training_params["activation_weight"],
            )
            del z
            del res
            # squared L2 distance between true and generated moment vectors
            LG = torch.dot(true_moments - moments_gz,
                           true_moments - moments_gz)
            # LG = self.mse(true_moments, moments_gz)
            # Add to tensorboard
            if self.tb:
                self.tb.add_scalar(
                    "LossGenerator/objective_{}".format(self.no_obj + 1),
                    float(LG),
                    i + 1,
                )
            self.LG.append(float(LG))
            if i % 100 == 0:
                logger.info("Generator Iteration {}/{}: LG: {:.6}".format(
                    i + 1, self.ng, LG.item()))
            self.optimizerG.zero_grad()
            LG.backward()
            self.optimizerG.step()
            del moments_gz

    def generate_and_display(self, z, save=False, save_path=None):
        """Generates rows of images from latent variable z."""
        # Visualizing the generated images
        examples = self.G(z).detach().cpu()
        examples = examples.reshape(-1, 3, self.G.dims[-1], self.G.dims[-1])
        examples = (examples + 1) / 2  # map tanh range [-1, 1] to [0, 1]
        grid = torchvision.utils.make_grid(examples,
                                           nrow=10)  # 10 images per row
        # Add to tensorboard
        if self.tb:
            self.tb.add_image("generated images", grid, self.no_obj)
        fig = plt.figure(figsize=(15, 15))
        plt.imshow(np.transpose(grid, (1, 2, 0)))
        if save:
            plt.savefig(save_path)
        else:
            plt.show()
        plt.close(fig)

    def eval(self):
        """Evaluate generated batch with scores in self.scores."""
        logger.info(
            f"Evaluating generated samples with scores: {self.scores.keys()}")
        scores_dict = self.scores
        n_loops = self.training_params["eval_size"] // self.eval_batch_size
        results = dict(zip(scores_dict.keys(), [None] * len(scores_dict)))
        for score in scores_dict:
            results[score] = np.zeros(n_loops)
        for i in range(n_loops):
            with torch.no_grad():
                z = torch.randn(self.eval_batch_size, self.G.dims[0],
                                device=self.device)
                samples = self.G(z).cpu()
            if "IS" in scores_dict or "FID" in scores_dict:
                samples = InceptionScore.preprocess(samples)
            for score in scores_dict:
                value = scores_dict[score](samples)
                results[score][i] = value if value is not None else np.nan
        # average each score across evaluation batches, ignoring NaNs
        for score in scores_dict:
            results[score] = np.nanmean(results[score])
        return results

    def load_from_checkpoints(self, path):
        """Loads network parameters and training info from checkpoint.

        path: path to checkpoint
        Returns (last_objective, lossG, lossM).
        """
        logger.info(
            "Loading network parameters and training info from checkpoint...")
        checkpoint = torch.load(path)
        self.G.load_state_dict(checkpoint["generator_state_dict"])
        self.optimizerG.load_state_dict(checkpoint["optimizerG_state_dict"])
        self.G.train()
        if self.learn_moments:
            self.MoNet.load_state_dict(checkpoint["monet_state_dict"])
            self.optimizerM.load_state_dict(
                checkpoint["optimizerM_state_dict"])
            self.MoNet.train()
        last_objective = checkpoint["objective"]
        lossG = checkpoint["last_lossG"]
        lossM = checkpoint["last_lossM"]
        return last_objective, lossG, lossM

    def train(self, save_images=False, from_checkpoint=None):
        """Trains naive MoLM model made of generator self.G and moment
        network self.MoNet."""
        if save_images and not self.save_path_img.exists():
            os.mkdir(self.save_path_img)
        last_objective = 0
        if not self.learn_moments:
            # fixed moments: compute them once up front
            true_moments = self.eval_true_moments()
        if from_checkpoint:
            last_objective, lossG, lossM = self.load_from_checkpoints(
                from_checkpoint)
            logger.info(
                "Starting training from Objective: {}, lossG: {}, lossM: {}".
                format(last_objective, lossG, lossM))
        for i in range(last_objective, self.no):
            # Track the no of objectives solved
            self.no_obj = i
            start = time.time()
            if self.learn_moments:
                logger.info("Training Moment Network...")
                self.train_monet()
                logger.info("Evaluating true moments value...")
                true_moments = self.eval_true_moments()
            logger.info("Training Generator")
            self.train_generator(true_moments)
            self.iter += 1
            stop = time.time()
            duration = (stop - start) / 60
            if self.learn_moments:
                logger.info(
                    "Objective {}/{} - {:.2} minutes: LossMonet: {:.6} LossG: {:.6}"
                    .format(i + 1, self.no, duration, self.LM[-1],
                            self.LG[-1]))
            else:
                logger.info(
                    "Objective {}/{} - {:.2} minutes: LossG: {:.6}".format(
                        i + 1, self.no, duration, self.LG[-1]))
            if self.eval_generate_images:
                # FIX: pathlib.Path does not support `+` with str; join with `/`.
                self.generate_and_display(
                    self.fixed_z,
                    save=save_images,
                    save_path=self.save_path_img /
                    "generated_molm_iter{}.png".format(i),
                )
            if i % SAVING_FREQUENCY == 0:
                logger.info("Saving model ...")
                save_path_checkpoints = self.save_path_checkpoints / f"molm_iter{i}.pt"
                save_dict = {
                    "monet_state_dict": self.MoNet.state_dict(),
                    "generator_state_dict": self.G.state_dict(),
                    "optimizerG_state_dict": self.optimizerG.state_dict(),
                    "objective": i + 1,
                    "last_lossG": self.LG[-1],
                }
                if self.learn_moments:
                    save_dict["last_lossM"] = self.LM[-1]
                    save_dict[
                        "optimizerM_state_dict"] = self.optimizerM.state_dict()
                torch.save(save_dict, save_path_checkpoints)
            if self.scores:
                scores = self.eval()
                logger.info(f"{scores}")
                # Add to tensorboard
                if self.tb:
                    for score in scores:
                        self.tb.add_scalar("Scores/{}".format(score),
                                           scores[score], i + 1)
                # Save scores
                with open(self.run_folder / "scores.csv", "a") as f:
                    f.write(
                        f'{i+1},{",".join([str(metric) for metric in scores.values()])}\n'
                    )
            # Updating data on tensorboard
            if self.tb:
                for name, param in self.G.named_parameters():
                    self.tb.add_histogram("generator.{}".format(name), param,
                                          i + 1)
                    self.tb.add_histogram("generator.{}.grad".format(name),
                                          param.grad, i + 1)
                for name, param in self.MoNet.named_parameters():
                    self.tb.add_histogram("momentNetwork.{}".format(name),
                                          param, i + 1)
                    self.tb.add_histogram("momentNetwork.{}.grad".format(name),
                                          param.grad, i + 1)
# input_img = torch.randn(content_img.data.size(), device=device) # add the original input image to the figure: #plt.figure() #imshow(input_img, title='Input Image') start = time.time() output = run_style_transfer(cnn, normalization_mean, normalization_std, content_img, style_img, input_img, opt.content_layers_default, opt.style_layers_default, device, opt.num_epochs, opt.style_weight, opt.content_weight) timeSince(start) writer = SummaryWriter( log_dir='train_result') # tensorboard直接就能保存tensor,只要图片符合C x H x W就行。 writer.add_image('img', output.squeeze(0), 1) # 前面为什么要增一维,导致这里又要减一维 writer.close() # tensor只有一个元素,tensor.item();有很多元素tensor.data # transforms.ToTensor()与下面对应,相互转化吧(不说类型,数值的变化为0-255转化为0-1;下面则相反) # unloader = transforms.ToPILImage() # reconvert into PIL image def imshow(tensor, title=None): image = tensor.cpu().clone() # we clone the tensor to not do changes on it ''' 实验证明,clone()是为了不改变原来的变量; x = torch.Tensor(2,2).fill_(2);y = x.clone();y = x.clone().view(4); clone()可以重新开辟一块内存,x,y互不影响,克隆时也可以改变形状。 import copy x = torch.Tensor(2,2).fill_(2);y = copy.copy(x),此时x,y指向一个内存单元; y = copy.deepcopy(x)重新开辟内存,互不影响。
def train_epoch(net, datasets, optimizer, lr_scheduler, args):
    """Run the full train/val loop for the P/D/G denoising-GAN networks.

    net: dict of networks {'P': ..., 'D': denoiser, 'G': generator}
    datasets: dict of datasets keyed by phase in _modes ('train'/'val')
    optimizer / lr_scheduler: dicts keyed 'P', 'D', 'G'
    args: config dict (batch_size, epochs, num_critic, print_freq, dirs, ...)
    """
    batch_size = {'train': args['batch_size'], 'val': 4}
    data_loader = {
        phase: uData.DataLoader(datasets[phase],
                                batch_size=batch_size[phase],
                                shuffle=True,
                                num_workers=args['num_workers'],
                                pin_memory=True)
        for phase in _modes
    }
    num_data = {phase: len(datasets[phase]) for phase in _modes}
    num_iter_epoch = {
        phase: ceil(num_data[phase] / batch_size[phase])
        for phase in _modes
    }
    # global tensorboard counters, restored when resuming
    step = args['step'] if args['resume'] else 0
    step_img = args['step_img'] if args['resume'] else {x: 0 for x in _modes}
    writer = SummaryWriter(str(Path(args['log_dir'])))
    for epoch in range(args['epoch_start'], args['epochs']):
        loss_epoch = {x: 0 for x in ['PL', 'DL', 'GL']}
        subloss_epoch = {
            x: 0
            for x in [
                'Px', 'Pxg', 'Py', 'Pyg', 'Dx', 'DE', 'DAE', 'Gy', 'GMean',
                'GErr', 'TGErr'
            ]
        }
        mae_epoch = {'train': 0, 'val': 0}
        tic = time.time()
        # train stage
        net['D'].train()
        net['G'].train()
        net['P'].train()
        lr_D = optimizer['D'].param_groups[0]['lr']
        lr_G = optimizer['G'].param_groups[0]['lr']
        lr_P = optimizer['P'].param_groups[0]['lr']
        if lr_D < 1e-6:
            sys.exit('Reach the minimal learning rate')
        phase = 'train'
        iter_GD = 0  # counts D/G updates (only every num_critic-th batch)
        for ii, data in enumerate(data_loader[phase]):
            im_noisy, im_gt = [x.cuda() for x in data]
            # update the netP
            PL, Px, Pxg, Py, Pyg = train_step_P(net, im_gt, im_noisy,
                                                optimizer['P'], args)
            loss_epoch['PL'] += PL.item()
            subloss_epoch['Px'] += Px.item()
            subloss_epoch['Pxg'] += Pxg.item()
            subloss_epoch['Py'] += Py.item()
            subloss_epoch['Pyg'] += Pyg.item()
            # update the netD
            # NOTE(review): D and G updates are grouped under the num_critic
            # branch (iter_GD divides both DL and GL below) — confirm intent.
            if (ii + 1) % args['num_critic'] == 0:
                DL, Dx, DE, DAE, im_denoise = train_step_D(
                    net, im_gt, im_noisy, optimizer['D'], args)
                loss_epoch['DL'] += DL.item()
                subloss_epoch['Dx'] += Dx.item()
                subloss_epoch['DE'] += DE.item()
                subloss_epoch['DAE'] += DAE.item()
                mae_epoch[phase] += DAE.item()
                # update the netG
                GL, Gy, GMean, im_generate = train_step_G(
                    net, im_gt, im_noisy, optimizer['G'], args)
                loss_epoch['GL'] += GL.item()
                subloss_epoch['Gy'] += Gy.item()
                subloss_epoch['GMean'] += GMean.item()
                GErr = F.l1_loss(im_generate, im_gt, reduction='mean')
                subloss_epoch['GErr'] += GErr.item()
                TGErr = F.l1_loss(im_noisy, im_gt, reduction='mean')
                subloss_epoch['TGErr'] += TGErr.item()
                iter_GD += 1
            if (ii + 1) % args['print_freq'] == 0:
                template = '[Epoch:{:>2d}/{:<3d}] {:s}:{:0>5d}/{:0>5d}, PLx:{:>6.2f}/{:4.2f},'+\
                    ' PLy:{:>6.2f}/{:4.2f}, DL:{:>6.2f}/{:.1e}, DAE:{:.2e}, '+\
                    'GL:{:>6.2f}/{:<5.2f}, GErr:{:.1e}/{:.1e}'
                print(
                    template.format(epoch + 1, args['epochs'], phase, ii + 1,
                                    num_iter_epoch[phase], Px.item(),
                                    Pxg.item(), Py.item(), Pyg.item(),
                                    Dx.item(), DE.item(), DAE.item(),
                                    Gy.item(), GMean.item(), GErr.item(),
                                    TGErr.item()))
                writer.add_scalar('Train PNet Loss Iter', PL.item(), step)
                writer.add_scalar('Train DNet Loss Iter', DL.item(), step)
                writer.add_scalar('Train GNet Loss Iter', GL.item(), step)
                step += 1
            if (ii + 1) % (10 * args['print_freq']) == 0:
                # periodic image panels for visual inspection
                x1 = vutils.make_grid(im_noisy, normalize=True,
                                      scale_each=True)
                writer.add_image(phase + ' Noisy Image', x1, step_img[phase])
                x2 = vutils.make_grid(im_gt, normalize=True, scale_each=True)
                writer.add_image(phase + ' GroundTruth', x2, step_img[phase])
                x3 = vutils.make_grid(im_denoise.clamp_(0.0, 1.0),
                                      normalize=True,
                                      scale_each=True)
                writer.add_image(phase + ' Denoised images', x3,
                                 step_img[phase])
                x4 = vutils.make_grid(im_generate.clamp_(0.0, 1.0),
                                      normalize=True,
                                      scale_each=True)
                writer.add_image(phase + ' Generated images', x4,
                                 step_img[phase])
                step_img[phase] += 1
        # per-epoch averages (P over all batches; D/G over GD updates)
        loss_epoch['PL'] /= (ii + 1)
        subloss_epoch['Px'] /= (ii + 1)
        subloss_epoch['Pxg'] /= (ii + 1)
        subloss_epoch['Py'] /= (ii + 1)
        subloss_epoch['Pyg'] /= (ii + 1)
        loss_epoch['DL'] /= (iter_GD + 1)
        subloss_epoch['Dx'] /= (iter_GD + 1)
        subloss_epoch['DAE'] /= (iter_GD + 1)
        mae_epoch[phase] /= (iter_GD + 1)
        loss_epoch['GL'] /= (iter_GD + 1)
        subloss_epoch['Gy'] /= (iter_GD + 1)
        subloss_epoch['GMean'] /= (iter_GD + 1)
        subloss_epoch['GErr'] /= (iter_GD + 1)
        subloss_epoch['TGErr'] /= (iter_GD + 1)
        template = '{:s}: PL={:5.2f}, DL={:5.2f}, GL={:5.2f}, DAE:{:4.2e}, GMean:{:4.2e}, ' +\
            'GE:{:.2e}/{:.2e}, tauDG:{:.1e}/{:.1e}, lrDGP:{:.2e}/{:.2e}/{:.2e}'
        print(
            template.format(phase, loss_epoch['PL'], loss_epoch['DL'],
                            loss_epoch['GL'], subloss_epoch['DAE'],
                            subloss_epoch['GMean'], subloss_epoch['GErr'],
                            subloss_epoch['TGErr'], args['tau_D'],
                            args['tau_G'], lr_D, lr_G, lr_P))
        print('-' * 150)
        # test stage
        net['D'].eval()
        psnr_per_epoch = ssim_per_epoch = 0
        phase = 'val'
        for ii, data in enumerate(data_loader[phase]):
            im_noisy, im_gt = [x.cuda() for x in data]
            with torch.set_grad_enabled(False):
                # residual prediction: denoised = noisy - predicted noise
                im_denoise = im_noisy - net['D'](im_noisy)
            mae_iter = F.l1_loss(im_denoise, im_gt)
            im_denoise.clamp_(0.0, 1.0)
            mae_epoch[phase] += mae_iter
            psnr_iter = batch_PSNR(im_denoise, im_gt)
            psnr_per_epoch += psnr_iter
            ssim_iter = batch_SSIM(im_denoise, im_gt)
            ssim_per_epoch += ssim_iter
            # print statistics every log_interval mini_batches
            if (ii + 1) % 50 == 0:
                log_str = '[Epoch:{:>2d}/{:<2d}] {:s}:{:0>3d}/{:0>3d}, mae={:.2e}, ' + \
                    'psnr={:4.2f}, ssim={:5.4f}'
                print(
                    log_str.format(epoch + 1, args['epochs'], phase, ii + 1,
                                   num_iter_epoch[phase], mae_iter, psnr_iter,
                                   ssim_iter))
                # tensorboard summary
                x1 = vutils.make_grid(im_denoise, normalize=True,
                                      scale_each=True)
                writer.add_image(phase + ' Denoised images', x1,
                                 step_img[phase])
                x2 = vutils.make_grid(im_gt, normalize=True, scale_each=True)
                writer.add_image(phase + ' GroundTruth', x2, step_img[phase])
                x5 = vutils.make_grid(im_noisy, normalize=True,
                                      scale_each=True)
                writer.add_image(phase + ' Noisy Image', x5, step_img[phase])
                step_img[phase] += 1
        psnr_per_epoch /= (ii + 1)
        ssim_per_epoch /= (ii + 1)
        mae_epoch[phase] /= (ii + 1)
        print('{:s}: mae={:.3e}, PSNR={:4.2f}, SSIM={:5.4f}'.format(
            phase, mae_epoch[phase], psnr_per_epoch, ssim_per_epoch))
        print('-' * 150)
        # adjust the learning rate
        lr_scheduler['D'].step()
        lr_scheduler['G'].step()
        lr_scheduler['P'].step()
        # save model (full training state for resuming)
        save_path_model = str(
            Path(args['model_dir']) / ('model_' + str(epoch + 1)))
        torch.save(
            {
                'epoch': epoch + 1,
                'step': step + 1,
                'step_img': {x: step_img[x] + 1
                             for x in _modes},
                'model_state_dict':
                {x: net[x].state_dict()
                 for x in ['D', 'P', 'G']},
                'optimizer_state_dict':
                {x: optimizer[x].state_dict()
                 for x in ['D', 'P', 'G']},
                'lr_scheduler_state_dict':
                {x: lr_scheduler[x].state_dict()
                 for x in ['D', 'P', 'G']}
            }, save_path_model)
        # also save inference-only weights for D and G
        save_path_model = str(
            Path(args['model_dir']) / ('model_state_' + str(epoch + 1) +
                                       '.pt'))
        torch.save({x: net[x].state_dict()
                    for x in ['D', 'G']}, save_path_model)
        writer.add_scalars('MAE_epoch', mae_epoch, epoch)
        writer.add_scalar('Val PSNR epoch', psnr_per_epoch, epoch)
        writer.add_scalar('Val SSIM epoch', ssim_per_epoch, epoch)
        toc = time.time()
        print('This epoch take time {:.2f}'.format(toc - tic))
    writer.close()
    print('Reach the maximal epochs! Finish training')
def train(opt, device):
    """Train a DCGAN (generator gnet, discriminator dnet) per `opt` config."""
    dataloader = data_loader(opt)
    gnet = GNet(opt).to(device)
    dnet = DNet(opt).to(device)
    #writer.add_graph(gnet)  # experiment with the second argument
    #writer.add_graph(dnet)
    if device.type == 'cuda':
        # even if multiple GPUs are visible, without DataParallel only GPU 0 runs
        gnet = nn.DataParallel(gnet,
                               [0, 1, 2])  # list(range(ngpu)) did not work; only the first GPUs are usable
        dnet = nn.DataParallel(dnet, [0, 1, 2])
    gnet.apply(
        weight_init)  # initializes d/gnet.parameters(); otherwise the framework does one random init itself
    dnet.apply(weight_init)
    print('Generative NetWork:')
    print(gnet)
    print('')
    print('Discriminative NetWork:')
    print(dnet)
    criterion = nn.BCELoss()
    '''
    params (iterable): iterable of parameters to optimize or dicts defining parameter groups
    Besides assigning all params at once (below), the optimizer can be given
    per-module parameter groups with individual learning rates, e.g.:
    optimizer = optim.SGD([
        {'params': model.features.parameters(), 'lr': 0.1 * lr},
        {'params': model.sample_128.parameters(), 'lr': lr},
        {'params': model.sample_256.parameters(), 'lr': lr},
        {'params': model.fc_concat.parameters(), 'lr': lr}
    ], lr=1e-1, momentum=0.9, weight_decay=1e-5)
    '''
    g_optimizer = optim.Adam(gnet.parameters(),
                             lr=opt.lr1,
                             betas=(opt.beta1, 0.999))
    d_optimizer = optim.Adam(dnet.parameters(),
                             lr=opt.lr2,
                             betas=(opt.beta1, 0.999))
    # the optimizer is assigned once; afterwards params only change via backward updates
    print('g_optimizer:')
    print(g_optimizer)
    print('d_optimizer:')
    print(d_optimizer)
    writer = SummaryWriter(log_dir='train_result')
    # creating the writer generates the events file; tensorboard scans all subfolders for event files
    #dummy1_input = torch.rand(opt.batch_size, 3, 96,96)
    #dummy2_input = torch.rand(opt.batch_size, opt.nd,1,1)
    #writer.add_graph(dnet, dummy1_input)
    #writer.add_graph(gnet, dumm2_input
    # Training Loop
    # Lists to keep track of progress
    '''Lists are not strictly needed, but kept for possible later use.'''
    img_list = []
    G_losses = []
    D_losses = []
    iters = 0
    fixed_noise = torch.randn(opt.batch_size, opt.nd, 1, 1, device=device)
    print("Starting Training Loop...")
    dnet.train()
    gnet.train()
    # train() is the default mode; it matters when BN/dropout layers are present,
    # since eval() freezes their behavior while train() lets them vary
    # For each epoch
    for epoch in range(1, opt.max_epoch + 1):
        # For each batch in the dataloader
        print(len(dataloader))
        print(type(dataloader))
        for i, (imgs, _) in enumerate(dataloader, 1):
            # DataLoader yields pairs; these avatar images have no labels, the
            # second element prints as tensor([0, 0, ..., 0])
            ############################
            # (1) Update D network: maximize log(D(x)) + log(1 - D(G(z)))
            ############################
            ## Train with all-real batch
            dnet.zero_grad()
            '''Train the discriminator first, then the generator.'''
            # Format batch
            real_img = imgs.to(device)  # move each batch to the device
            # torch.full's size arg must be a tuple; a 1-element tuple needs a trailing comma
            label = torch.full((opt.batch_size, ),
                               opt.real_label,
                               device=device)
            # Forward pass real batch through D
            output = dnet(real_img)  # already flattened to 1-D inside the model module
            # Calculate loss on all-real batch
            d_err_real = criterion(output, label)  # mean loss
            # Calculate gradients for D in backward pass
            d_err_real.backward()
            D_x = output.mean().item()  # mean score on real images; closer to 1 is better
            ## Train with all-fake batch
            # Generate batch of latent vectors
            noise = torch.randn(opt.batch_size, opt.nd, 1, 1, device=device)
            # gnet generates opt.batch_size images: each (opt.nd, 1, 1) latent
            # yields one image; every entry of the latent controls one feature
            # Generate fake image batch with G
            fake = gnet(noise)
            label.fill_(opt.fake_label)
            # Classify all fake batch with D
            output = dnet(fake.detach())
            # Calculate D's loss on the all-fake batch
            d_err_fake = criterion(output, label)
            # Calculate the gradients for this batch
            d_err_fake.backward()
            D_G_z1 = output.mean().item()  # mean score on fakes; closer to 0 is better
            # Add the gradients from the all-real and all-fake batches
            d_err = d_err_real + d_err_fake
            # tensors add directly like scalars; no need to extract the values first
            # Update D
            d_optimizer.step()
            ############################
            # (2) Update G network: maximize log(D(G(z)))
            ###########################
            gnet.zero_grad()
            label.fill_(
                opt.real_label)  # fake labels are real for generator cost
            # Since we just updated D, perform another forward pass of all-fake batch through D
            output = dnet(
                fake
            )  # after one D update the same fakes should score lower, enlarging g_err below
            # Calculate G's loss based on this output
            g_err = criterion(output, label)
            '''The generator pushes fakes toward the real label: fake images +
            real labels compared, the smaller the loss the better.'''
            # Calculate gradients for G
            g_err.backward()
            D_G_z2 = output.mean().item(
            )  # since D was updated once, this mean should be closer to 0 than D_G_z1
            # Update G
            g_optimizer.step()
            # Output training stats
            if i % 50 == 0:
                print(
                    '[%d/%d][%d/%d]\tLoss_D: %.4f\tLoss_G: %.4f\treal_img_mean_score: %.4f\tfake_img_mean_score_1/2: %.4f / %.4f'
                    % (epoch, opt.max_epoch, i, len(dataloader), d_err.item(),
                       g_err.item(), D_x, D_G_z1, D_G_z2))
            # Save Losses for plotting later
            G_losses.append(g_err.item())
            D_losses.append(d_err.item())
            writer.add_scalars('dnet_gnet_loss', {
                'G_losses': G_losses[iters],
                'D_losses': D_losses[iters]
            }, iters)
            # Check how the generator is doing by saving G's output on fixed_noise
            if (iters % 500 == 0) or ((epoch == opt.max_epoch) and
                                      (i == len(dataloader))):
                with torch.no_grad(
                ):  # inside no_grad the forward pass skips grad bookkeeping, saving computation
                    fake = gnet(
                        fixed_noise
                    )  # .detach().cpu() would only copy the fakes to CPU; no effect here
                img_list.append(vutils.make_grid(fake, normalize=True))
                '''Not sure yet how many sub-images the grid contains.'''
                writer.add_image('fake%d' % (iters / 500),
                                 img_list[int(iters / 500)], int(iters / 500))
            iters += 1
    torch.save(dnet.state_dict(), 'dnet.pth')
    torch.save(gnet.state_dict(), 'gnet.pth')
    writer.close()
'''
x = self.pool(F.relu(self.conv1(x))) x = self.pool(F.relu(self.conv2(x))) x = x.view(-1, 16 * 4 * 4) x = F.relu(self.fc1(x)) x = F.relu(self.fc2(x)) x = self.fc3(x) return x net = Net() criterion = nn.CrossEntropyLoss() optimizer = optim.SGD(net.parameters(), lr=0.001, momentum=0.9) from torch.utils.tensorboard import SummaryWriter # default `log_dir` is "runs" - we'll be more specific here writer = SummaryWriter('runs/fashion_mnist_experiment_1') # get some random training images dataiter = iter(trainloader) images, labels = dataiter.next() # create grid of images img_grid = torchvision.utils.make_grid(images) # show images matplotlib_imshow(img_grid, one_channel=True) # write to tensorboard writer.add_image('four_fashion_mnist_images', img_grid)
class face_learner(object):
    """Wraps an ArcFace-style face-recognition backbone for training,
    evaluation and inference.

    In training mode (inference=False) it builds the dataloader, the Arcface
    head, an SGD optimizer with separate weight-decay groups, and a
    TensorBoard SummaryWriter; in inference mode it only keeps a match
    threshold.  Depends on helpers defined elsewhere in the project
    (MobileFaceNet, Backbone, Arcface, get_train_loader_VTT, get_val_data,
    separate_bn_paras, hflip_batch, l2_norm, evaluate, gen_plot).
    """

    def __init__(self, conf, inference=False):
        # conf: project config object (device, lr, momentum, paths, ...).
        print(conf)
        if conf.use_mobilfacenet:
            self.model = MobileFaceNet(conf.embedding_size).to(conf.device)
            print('MobileFaceNet model generated')
        else:
            self.model = Backbone(conf.net_depth, conf.drop_ratio,
                                  conf.net_mode).to(conf.device)
            print('{}_{} model generated'.format(conf.net_mode, conf.net_depth))
        if not inference:
            self.milestones = conf.milestones
            self.loader, self.class_num = get_train_loader_VTT(conf)
            self.writer = SummaryWriter(conf.log_path)
            # global step counter, incremented once per mini-batch in train()
            self.step = 0
            self.head = Arcface(embedding_size=conf.embedding_size,
                                classnum=self.class_num).to(conf.device)
            print('two model heads generated')
            # split parameters so that batch-norm params get no weight decay
            paras_only_bn, paras_wo_bn = separate_bn_paras(self.model)
            if conf.use_mobilfacenet:
                # last non-BN param joins the head kernel in a stronger
                # weight-decay group (4e-4 vs 4e-5)
                self.optimizer = optim.SGD(
                    [{
                        'params': paras_wo_bn[:-1],
                        'weight_decay': 4e-5
                    }, {
                        'params': [paras_wo_bn[-1]] + [self.head.kernel],
                        'weight_decay': 4e-4
                    }, {
                        'params': paras_only_bn
                    }],
                    lr=conf.lr,
                    momentum=conf.momentum)
            else:
                self.optimizer = optim.SGD(
                    [{
                        'params': paras_wo_bn + [self.head.kernel],
                        'weight_decay': 5e-4
                    }, {
                        'params': paras_only_bn
                    }],
                    lr=conf.lr,
                    momentum=conf.momentum)
            print(self.optimizer)
            # self.scheduler = optim.lr_scheduler.ReduceLROnPlateau(self.optimizer, patience=40, verbose=True)
            print('optimizers generated')
            # logging/eval/save cadences derived from epoch length
            self.board_loss_every = len(self.loader) // 10  #100
            self.evaluate_every = len(self.loader) // 10
            self.save_every = len(self.loader) // 5
            # NOTE(review): hard-coded Windows path with unescaped backslashes
            # ('\D', '\F' are not valid escapes) — confirm path on deployment.
            self.agedb_30, self.cfp_fp, self.lfw, self.agedb_30_issame, self.cfp_fp_issame, self.lfw_issame = get_val_data(
                'D:\Dataset\Face')
        else:
            # inference-only: distance threshold used by infer()
            self.threshold = conf.threshold

    def save_state(self, conf, accuracy, to_save_folder=False, extra=None,
                   model_only=False):
        """Persist backbone and head weights to fixed paths under ./models.

        The accuracy/step-stamped naming scheme is kept below, commented out.
        """
        # if to_save_folder:
        #     save_path = conf.save_path
        # else:
        #     save_path = conf.model_path
        # torch.save(
        #     self.model.state_dict(), save_path /
        #     ('model_{}_accuracy:{}_step:{}_{}.pth'.format(get_time(), accuracy, self.step, extra)))
        # if not model_only:
        #     torch.save(
        #         self.head.state_dict(), save_path /
        #         ('head_{}_accuracy:{}_step:{}_{}.pth'.format(get_time(), accuracy, self.step, extra)))
        #     torch.save(
        #         self.optimizer.state_dict(), save_path /
        #         ('optimizer_{}_accuracy:{}_step:{}_{}.pth'.format(get_time(), accuracy, self.step, extra)))
        torch.save(self.model.state_dict(), './models/backbone.pth')
        torch.save(self.head.state_dict(), './models/head.pth')

    def load_state(self, conf, fixed_str, from_save_folder=False,
                   model_only=False):
        """Load backbone (and optionally head) weights from ./models.

        save_path is computed but unused by the active code path — the
        commented lines show the original per-checkpoint naming scheme.
        """
        if from_save_folder:
            save_path = conf.save_path
        else:
            save_path = conf.model_path
        # self.model.load_state_dict(torch.load(save_path/'model_{}'.format(fixed_str)))
        self.model.load_state_dict(torch.load('./models/backbone.pth'))
        if not model_only:
            # self.head.load_state_dict(torch.load(save_path/'head_{}'.format(fixed_str)))
            self.head.load_state_dict(torch.load('./models/head.pth'))
            # self.optimizer.load_state_dict(torch.load(save_path/'optimizer_{}'.format(fixed_str)))

    def board_val(self, db_name, accuracy, best_threshold, roc_curve_tensor):
        """Log one validation set's accuracy, threshold and ROC image to TensorBoard."""
        self.writer.add_scalar('{}_accuracy'.format(db_name), accuracy,
                               self.step)
        self.writer.add_scalar('{}_best_threshold'.format(db_name),
                               best_threshold, self.step)
        self.writer.add_image('{}_roc_curve'.format(db_name),
                              roc_curve_tensor, self.step)
        # self.writer.add_scalar('{}_val:true accept ratio'.format(db_name), val, self.step)
        # self.writer.add_scalar('{}_val_std'.format(db_name), val_std, self.step)
        # self.writer.add_scalar('{}_far:False Acceptance Ratio'.format(db_name), far, self.step)

    def evaluate(self, conf, carray, issame, nrof_folds=5, tta=False):
        """Compute verification accuracy on a pre-packed image array.

        carray: array of images (batchable into conf.batch_size chunks);
        issame: pairwise same/different labels consumed by the external
        evaluate() helper.  With tta=True, embeddings of the image and its
        horizontal flip are summed and L2-normalised.
        Returns (mean accuracy, mean best threshold, ROC-curve image tensor).
        """
        self.model.eval()
        idx = 0
        embeddings = np.zeros([len(carray), conf.embedding_size])
        with torch.no_grad():
            # full batches first ...
            while idx + conf.batch_size <= len(carray):
                batch = torch.tensor(carray[idx:idx + conf.batch_size])
                if tta:
                    fliped = hflip_batch(batch)
                    emb_batch = self.model(batch.to(conf.device)) + self.model(
                        fliped.to(conf.device))
                    embeddings[idx:idx + conf.batch_size] = l2_norm(emb_batch)
                else:
                    embeddings[idx:idx + conf.batch_size] = self.model(
                        batch.to(conf.device)).cpu()
                idx += conf.batch_size
            # ... then the ragged tail, if any
            if idx < len(carray):
                batch = torch.tensor(carray[idx:])
                if tta:
                    fliped = hflip_batch(batch)
                    emb_batch = self.model(batch.to(conf.device)) + self.model(
                        fliped.to(conf.device))
                    embeddings[idx:] = l2_norm(emb_batch)
                else:
                    embeddings[idx:] = self.model(batch.to(conf.device)).cpu()
        # external evaluate() does k-fold verification on the embeddings
        tpr, fpr, accuracy, best_thresholds = evaluate(embeddings, issame,
                                                       nrof_folds)
        buf = gen_plot(fpr, tpr)
        roc_curve = Image.open(buf)
        roc_curve_tensor = trans.ToTensor()(roc_curve)
        return accuracy.mean(), best_thresholds.mean(), roc_curve_tensor

    def find_lr(self, conf, init_value=1e-8, final_value=10., beta=0.98,
                bloding_scale=3., num=None):
        """Learning-rate range test (Leslie Smith style).

        Sweeps lr geometrically from init_value to final_value over `num`
        batches, tracking an exponentially smoothed loss; aborts when the
        smoothed loss explodes past bloding_scale * best_loss.
        Returns (log10 lrs, smoothed losses), skipping the noisy edges when
        plotted.
        """
        if not num:
            num = len(self.loader)
        # per-step multiplicative lr factor so that lr reaches final_value
        # after `num` steps
        mult = (final_value / init_value)**(1 / num)
        lr = init_value
        for params in self.optimizer.param_groups:
            params['lr'] = lr
        self.model.train()
        avg_loss = 0.
        best_loss = 0.
        batch_num = 0
        losses = []
        log_lrs = []
        for i, (imgs, labels) in tqdm(enumerate(self.loader), total=num):
            imgs = imgs.to(conf.device)
            labels = labels.to(conf.device)
            batch_num += 1
            self.optimizer.zero_grad()
            embeddings = self.model(imgs)
            thetas = self.head(embeddings, labels)
            loss = conf.ce_loss(thetas, labels)
            # Compute the smoothed loss (bias-corrected EMA)
            avg_loss = beta * avg_loss + (1 - beta) * loss.item()
            self.writer.add_scalar('avg_loss', avg_loss, batch_num)
            smoothed_loss = avg_loss / (1 - beta**batch_num)
            self.writer.add_scalar('smoothed_loss', smoothed_loss, batch_num)
            # Stop if the loss is exploding
            if batch_num > 1 and smoothed_loss > bloding_scale * best_loss:
                print('exited with best_loss at {}'.format(best_loss))
                plt.plot(log_lrs[10:-5], losses[10:-5])
                return log_lrs, losses
            # Record the best loss
            if smoothed_loss < best_loss or batch_num == 1:
                best_loss = smoothed_loss
            # Store the values
            losses.append(smoothed_loss)
            log_lrs.append(math.log10(lr))
            self.writer.add_scalar('log_lr', math.log10(lr), batch_num)
            # Do the SGD step
            loss.backward()
            self.optimizer.step()
            # Update the lr for the next step
            lr *= mult
            for params in self.optimizer.param_groups:
                params['lr'] = lr
            if batch_num > num:
                plt.plot(log_lrs[10:-5], losses[10:-5])
                return log_lrs, losses

    def train(self, conf, epochs):
        """Main training loop: step-lr at milestones, periodic TensorBoard
        logging, periodic evaluation on agedb_30/lfw/cfp_fp, periodic saving.
        """
        self.model.train()
        running_loss = 0.
        for e in range(epochs):
            print('epoch {} started'.format(e))
            # lr /= 10 at each of the three configured milestone epochs
            if e == self.milestones[0]:
                self.schedule_lr()
            if e == self.milestones[1]:
                self.schedule_lr()
            if e == self.milestones[2]:
                self.schedule_lr()
            for imgs, labels in tqdm(iter(self.loader)):
                imgs = imgs.to(conf.device)
                labels = labels.to(conf.device)
                self.optimizer.zero_grad()
                embeddings = self.model(imgs)
                thetas = self.head(embeddings, labels)
                loss = conf.ce_loss(thetas, labels)  # cross entropy
                loss.backward()
                running_loss += loss.item()
                self.optimizer.step()
                if self.step % self.board_loss_every == 0 and self.step != 0:
                    loss_board = running_loss / self.board_loss_every
                    self.writer.add_scalar('train_loss', loss_board, self.step)
                    running_loss = 0.
                if self.step % self.evaluate_every == 0 and self.step != 0:
                    accuracy, best_threshold, roc_curve_tensor = self.evaluate(
                        conf, self.agedb_30, self.agedb_30_issame)
                    self.board_val('agedb_30', accuracy, best_threshold,
                                   roc_curve_tensor)
                    accuracy, best_threshold, roc_curve_tensor = self.evaluate(
                        conf, self.lfw, self.lfw_issame)
                    self.board_val('lfw', accuracy, best_threshold,
                                   roc_curve_tensor)
                    accuracy, best_threshold, roc_curve_tensor = self.evaluate(
                        conf, self.cfp_fp, self.cfp_fp_issame)
                    self.board_val('cfp_fp', accuracy, best_threshold,
                                   roc_curve_tensor)
                    self.model.train()
                if self.step % self.save_every == 0 and self.step != 0:
                    # NOTE(review): `accuracy` is only bound after the first
                    # evaluate block runs; if save_every triggers first this
                    # raises NameError — verify cadence (save_every is a
                    # multiple of evaluate_every here, so it happens to work).
                    self.save_state(conf, accuracy)
                self.step += 1
        self.save_state(conf, accuracy, to_save_folder=True, extra='final')

    def schedule_lr(self):
        """Divide the learning rate of every param group by 10."""
        for params in self.optimizer.param_groups:
            params['lr'] /= 10
        print(self.optimizer)

    def infer(self, conf, faces, target_embs, tta=False):
        '''
        faces : list of PIL Image
        target_embs : [n, 512] computed embeddings of faces in facebank
        names : recorded names of faces in facebank
        tta : test time augmentation (hfilp, that's all)

        Returns (min_idx, minimum): index of the closest facebank entry per
        face (-1 when the squared distance exceeds self.threshold) and the
        corresponding minimum distances.
        '''
        embs = []
        for img in faces:
            if tta:
                mirror = trans.functional.hflip(img)
                emb = self.model(
                    conf.test_transform(img).to(conf.device).unsqueeze(0))
                emb_mirror = self.model(
                    conf.test_transform(mirror).to(conf.device).unsqueeze(0))
                embs.append(l2_norm(emb + emb_mirror))
            else:
                embs.append(
                    self.model(
                        conf.test_transform(img).to(conf.device).unsqueeze(0)))
        source_embs = torch.cat(embs)
        # pairwise squared L2 distances: [n_faces, n_targets]
        diff = source_embs.unsqueeze(-1) - target_embs.transpose(
            1, 0).unsqueeze(0)
        dist = torch.sum(torch.pow(diff, 2), dim=1)
        minimum, min_idx = torch.min(dist, dim=1)
        min_idx[minimum > self.threshold] = -1  # if no match, set idx to -1
        return min_idx, minimum
# where the second option of maximizing doesn't suffer from # saturating gradients output = disc(fake).view(-1) lossG = criterion(output, torch.ones_like(output)) gen.zero_grad() lossG.backward() opt_gen.step() if batch_idx == 0: print( f"Epoch [{epoch}/{num_epochs}] Batch {batch_idx}/{len(loader)} \ Loss D: {lossD:.4f}, loss G: {lossG:.4f}") with torch.no_grad(): fake = gen(fixed_noise).reshape(-1, 1, 28, 28) data = real.reshape(-1, 1, 28, 28) img_grid_fake = torchvision.utils.make_grid(fake, normalize=True) img_grid_real = torchvision.utils.make_grid(data, normalize=True) writer_fake.add_image("Mnist Fake Images", img_grid_fake, global_step=step) writer_real.add_image("Mnist Real Images", img_grid_real, global_step=step) step += 1 #%%
# Script-level setup: dataloaders, a TensorBoard sample-image dump, model
# construction (ResNet-50 backbone + ArcMargin heads), a graph trace, and
# per-layer activation-statistics hooks.  Relies on names defined earlier in
# the file (train_dataset, test_dataset, batch_size, writer, device,
# num_classes, transform, trainset, models, ArcMarginProduct,
# ArcMarginForTest, matplotlib_imshow).

train_loader = torch.utils.data.DataLoader(dataset=train_dataset,
                                           batch_size=batch_size,
                                           shuffle=True)
test_loader = torch.utils.data.DataLoader(dataset=test_dataset,
                                          batch_size=batch_size,
                                          shuffle=False)

# TensorBoard: capture one batch from the dataloader
dataiter = iter(train_loader)
# NOTE(review): `.next()` is the pre-Python-3 iterator API removed in py3;
# works only if the loader iterator still exposes it — prefer next(dataiter).
images, labels = dataiter.next()
img_grid = torchvision.utils.make_grid(images)
# matplotlib_imshow(img_grid, one_channel=True)
writer.add_image('face_images', img_grid)

# ---------------------- #
''' define model'''
model = models.resnet50()
model.fc = nn.Linear(2048, 512)  # replace classifier with a 512-d embedding head
# model.load_state_dict(torch.load(path +'.pth'))
model.to(device)
margin = ArcMarginProduct(in_feature=512,
                          out_feature=num_classes,
                          easy_margin=True)
# margin.load_state_dict(torch.load(path+'Margin.pth'))
margin.to(device)
nomargin = ArcMarginForTest(in_feature=512, out_feature=num_classes,
                            transform=transform)
trainloader = torch.utils.data.DataLoader(trainset,
                                          batch_size=64,
                                          shuffle=True)
# fresh, untrained ResNet-50 adapted to single-channel input
model = torchvision.models.resnet50(False)
model.conv1 = torch.nn.Conv2d(1,
                              64,
                              kernel_size=7,
                              stride=2,
                              padding=3,
                              bias=False)
images, labels = next(iter(trainloader))
grid = torchvision.utils.make_grid(images)
writer.add_image('images', grid, 0)
writer.add_graph(model, images)


def send_stats(i, module, input, output):
    """Forward-hook callback: log mean/stddev of layer i's output tensor.

    NOTE(review): no global_step is passed to add_scalar, so successive calls
    for the same tag land on the default step — confirm intended.
    """
    writer.add_scalar(f"layer {i}-mean", output.data.mean())
    writer.add_scalar(f"layer {i}-stddev", output.data.std())


from functools import partial

# attach one stats hook per top-level child module, binding its index
for i, m in enumerate(model.children()):
    m.register_forward_hook(partial(send_stats, i))

# Now train the model and watch output in Tensorboard
class Trainer(object):
    """Trainer for a heatmap-based 2D keypoint model (stacked-hourglass style).

    Drives the epoch loop, per-batch heatmap generation with a schedule-
    dependent Gaussian kernel size, TensorBoard logging, checkpointing and
    validation with PCKh.  Depends on project helpers defined elsewhere
    (TrainParams, hm_kernel_size, gene_heatmap, spatial_soft_argmax2d,
    evalPCKh, meter, Bar, logger) and uses `t` as the torch alias.
    """

    # expose the params class on the trainer for convenience
    TrainParams = TrainParams

    def __init__(self, model, train_params, train_data, val_data=None):
        assert isinstance(train_params, TrainParams)
        self.params = train_params

        # Data loaders
        self.train_data = train_data
        self.val_data = val_data

        # Criterion, Optimizer, learning rate and heatmap type init
        self.last_epoch = 0
        self.hm_type = self.params.hm_type
        self.criterion = self.params.criterion
        self.optimizer = self.params.optimizer
        self.lr_scheduler = self.params.lr_scheduler
        logger.info('Set criterion to {}'.format(type(self.criterion)))
        logger.info('Set optimizer to {}'.format(type(self.optimizer)))
        logger.info('Set lr_scheduler to {}'.format(type(self.lr_scheduler)))
        # hm_type indexes one of the named kernel-refinement schedules
        logger.info('Set heatmap refine to <{}>'.format(
            ["static", "stage", "liner", "exp", "new"][int(self.hm_type)]))

        # load model
        self.model = model

        # set CUDA_VISIBLE_DEVICES and wrap in DataParallel when GPUs requested
        if len(self.params.gpus) > 0:
            gpus = ','.join([str(x) for x in self.params.gpus])
            os.environ['CUDA_VISIBLE_DEVICES'] = gpus
            # after masking, visible devices are renumbered from 0
            self.params.gpus = tuple(range(len(self.params.gpus)))
            logger.info('Set CUDA_VISIBLE_DEVICES to GPU[{}]'.format(gpus))
            self.model = nn.DataParallel(self.model,
                                         device_ids=self.params.gpus)
            self.model = self.model.cuda()

        logger.info('Set output dir to {}'.format(self.params.save_dir))
        if os.path.isdir(self.params.save_dir):
            pass
        else:
            os.makedirs(self.params.save_dir)

        # optionally resume from a checkpoint path
        ckpt = self.params.ckpt
        if ckpt is not None:
            self._load_ckpt(ckpt)
            logger.info('Load ckpt from {}'.format(ckpt))

        # meters
        self.loss_meter = meter.AverageValueMeter()
        # tensorboard
        self.writer = SummaryWriter()
        # train
        self.model.train()

    def train(self):
        """Run the full training schedule from last_epoch to max_epoch."""
        best_loss = np.inf
        for epoch in range(self.last_epoch, self.params.max_epoch):
            self.loss_meter.reset()
            # epoch is shifted to 1-based for logging/saving
            epoch += 1
            self.last_epoch += 1
            print(' ')
            logger.info('Start training epoch {}'.format(epoch))

            # calculate trainng time for one epoch
            start_time = time.time()
            self._train_one_epoch()
            total_time = time.time() - start_time

            # logger info: heatmap kernel sigma & training time
            # (self.sigma is set per-batch inside _train_one_epoch)
            logger.info('The heatmap kernel size = {:.2f} pixel'.format(
                self.sigma))
            logger.info('The training time = {:.2f} m {:.2f} s'.format(
                total_time // 60, total_time % 60))

            # save model
            # NOTE(review): `and` binds tighter than `or`, so the max_epoch-1
            # clause saves regardless of start_save_epoch — confirm intended.
            if (epoch >= self.params.start_save_epoch) and (
                    epoch % self.params.save_freq_epoch
                    == 0) or (epoch == self.params.max_epoch - 1):
                save_name = self.params.save_dir + 'ckpt_epoch_{}.pth'.format(
                    epoch)
                t.save(self.model.state_dict(), save_name)

            # validate and get average_err (sets self.PCkh / self.AveErr)
            logger.info('Val on validation set...')
            self._val_one_epoch()
            logger.info('Mean Per Joint 2D Error = {:.4f} pixel'.format(
                self.AveErr))

            # loss update (tracks the best mean training loss seen so far)
            if self.loss_meter.value()[0] < best_loss:
                logger.info('Found a better ckpt ({:.6f} -> {:.6f})'.format(
                    best_loss, self.loss_meter.value()[0]))
                best_loss = self.loss_meter.value()[0]

            # tensorboard
            self.writer.add_scalar('train/hm_kernel', self.sigma,
                                   self.last_epoch)
            self.writer.add_scalar('train/loss', self.loss_meter.value()[0],
                                   self.last_epoch)
            self.writer.add_scalar('train/ave_err', self.AveErr,
                                   self.last_epoch)
            self.writer.add_scalar('train/PCHk', self.PCkh, self.last_epoch)

            # adjust the lr on the plateau scheduler only
            if isinstance(self.lr_scheduler, ReduceLROnPlateau):
                self.lr_scheduler.step(self.loss_meter.value()[0])

    def _train_one_epoch(self):
        """One pass over train_data: build heatmap targets, forward/backward,
        update meters, and periodically log images to TensorBoard."""
        bar = Bar('Processing', max=len(self.train_data))
        for step, (data, label) in enumerate(self.train_data):
            # kernel sigma follows the configured refinement schedule
            self.sigma = hm_kernel_size(self.hm_type,
                                        self.last_epoch,
                                        threshold=4)
            target = gene_heatmap(label, self.sigma)
            inputs = Variable(data)
            target = Variable(t.from_numpy(target))
            if len(self.params.gpus) > 0:
                inputs = inputs.cuda()
                target = target.type(t.FloatTensor).cuda()

            # forward
            score = self.model(inputs)
            loss = 0
            # stack hourglass: average the loss over all intermediate outputs
            for s in range(len(score)):
                loss += self.criterion(score[s], target)
            loss = loss / len(score)
            # simple pose res
            # loss = self.criterion(score[1], target)

            # backward  (step(None) is equivalent to step() — None closure)
            self.optimizer.zero_grad()
            loss.backward()
            self.optimizer.step(None)

            # meters update
            self.loss_meter.add(loss.item())

            # evaluation: calculate PCKh on the last-stage output
            predictions = spatial_soft_argmax2d(score[len(score) - 1], 1000,
                                                False).cpu().numpy().reshape(
                                                    -1, 2)
            targetcoors = label.numpy().reshape(-1, 2)
            steppckh, steperr = evalPCKh(predictions,
                                         targetcoors,
                                         threshold=50,
                                         alpha=0.2)

            # tensorboard show (every 500 batches; collapse channel heatmaps
            # to a single HW image and clamp overlaps to 1)
            if step % 500 == 0:
                target_shows = t.sum(target[0], 0)
                target_shows[target_shows > 1] = 1
                self.writer.add_image('train/input', inputs[0],
                                      self.last_epoch)
                self.writer.add_image('train/taget',
                                      target_shows,
                                      self.last_epoch,
                                      dataformats='HW')
                self.writer.add_image('train/output',
                                      t.sum(score[1][0], 0),
                                      self.last_epoch,
                                      dataformats='HW')
            bar.suffix = 'Train: [%(index)d/%(max)d] | Epoch: [{0}/{1}]| Loss: {loss:6f} | PCKh: {pckh:4f} | AveErr: {err:.2f} pixel |'.format(
                self.last_epoch,
                self.params.max_epoch,
                loss=loss,
                pckh=steppckh,
                err=steperr)
            bar.next()
        bar.finish()

    def _val_one_epoch(self):
        """One pass over val_data without gradients; accumulates predictions
        and targets, logs sample images, and sets self.PCkh / self.AveErr."""
        bar = Bar('Validating', max=len(self.val_data))
        self.model.eval()
        predictions = np.empty((0, 2))
        targetcoors = np.empty((0, 2))
        for step, (data, label) in enumerate(self.val_data):
            with t.no_grad():
                inputs = data
                target = label.reshape(-1, 2)
                # target = label.type(t.FloatTensor)
                if len(self.params.gpus) > 0:
                    # NOTE(review): .cuda(1) pins validation inputs to device 1
                    # while training used default device — confirm intended.
                    inputs = inputs.cuda(1)
                    # target = target.cuda()
                score = self.model(inputs)
                coors = spatial_soft_argmax2d(score[len(score) - 1], 1000,
                                              False).cpu().numpy().reshape(
                                                  -1, 2)
                predictions = np.concatenate((predictions, coors), axis=0)
                targetcoors = np.concatenate((targetcoors, target), axis=0)
                # evaluation: calculate PCKh (running, over all seen samples)
                currentpckh, currenterr = evalPCKh(predictions,
                                                   targetcoors,
                                                   threshold=50,
                                                   alpha=0.2)
                # tensorboard visualization
                if step % 100 == 0:
                    self.writer.add_image('valid/img', inputs[0],
                                          self.last_epoch)
                    self.writer.add_image('valid/output',
                                          t.sum(score[1][0], 0),
                                          self.last_epoch,
                                          dataformats='HW')
                bar.suffix = 'Valid: [%(index)d/%(max)d] | PCKh: {pckh:6f} | AveErr: {err:.2f} pixel |'.format(
                    pckh=currentpckh, err=currenterr)
                bar.next()
        bar.finish()
        # final metrics over the whole validation set
        self.PCkh, self.AveErr = evalPCKh(predictions,
                                          targetcoors,
                                          threshold=50,
                                          alpha=0.2)
        self.model.train()

    def _load_ckpt(self, ckpt):
        """Load model weights from the given checkpoint path."""
        self.model.load_state_dict(t.load(ckpt))
with torch.no_grad(): running_losses.update(losses) last_iteration = global_step == len( dataset) // batch_size * n_epochs - 1 if global_step % 25 == 0 or last_iteration: average_losses = running_losses.get() for key, value in average_losses.items(): writer.add_scalar(key, value, global_step) running_losses.reset() if global_step % 100 == 0 or last_iteration: styled_test_image = stylize_image( Image.open("test_image.jpeg"), model) writer.add_image('test image', styled_test_image, global_step) for i in range(0, len(dataset), len(dataset) // 4): sample = dataset[i] styled_train_image_1 = stylize_image( sample["frame"], model) styled_train_image_2 = stylize_image( sample["previous_frame"], model) grid = torchvision.utils.make_grid( [styled_train_image_1, styled_train_image_2]) writer.add_image(f'train images {i}', grid, global_step) global_step += 1 torch.save(model.state_dict(), args.output_file)
def train_worker(rank, addr, port):
    """Per-GPU worker for distributed training of a MattingRefine model.

    rank: this worker's global rank (also used as its CUDA device index);
    addr/port: rendezvous address for torch.distributed NCCL init.
    Each worker trains on a disjoint slice of the dataset; rank 0 additionally
    owns validation, TensorBoard logging and checkpointing.  Depends on
    module-level names (args, distributed_num_gpus, DATA_PATH, dataset/aug
    helpers A, T, ZipDataset, ImagesDataset, SampleDataset, MattingRefine,
    load_matched_state_dict, random_crop, compute_loss, valid).
    """
    # Distributed Setup
    os.environ['MASTER_ADDR'] = addr
    os.environ['MASTER_PORT'] = port
    dist.init_process_group("nccl", rank=rank, world_size=distributed_num_gpus)

    # Training DataLoader: (alpha, foreground) pairs zipped with backgrounds,
    # with heavy paired augmentation on the fg/alpha and independent
    # augmentation on the background.
    dataset_train = ZipDataset([
        ZipDataset([
            ImagesDataset(DATA_PATH[args.dataset_name]['train']['pha'],
                          mode='L'),
            ImagesDataset(DATA_PATH[args.dataset_name]['train']['fgr'],
                          mode='RGB'),
        ],
                   transforms=A.PairCompose([
                       A.PairRandomAffineAndResize((2048, 2048),
                                                   degrees=(-5, 5),
                                                   translate=(0.1, 0.1),
                                                   scale=(0.3, 1),
                                                   shear=(-5, 5)),
                       A.PairRandomHorizontalFlip(),
                       A.PairRandomBoxBlur(0.1, 5),
                       A.PairRandomSharpen(0.1),
                       A.PairApplyOnlyAtIndices(
                           [1], T.ColorJitter(0.15, 0.15, 0.15, 0.05)),
                       A.PairApply(T.ToTensor())
                   ]),
                   assert_equal_length=True),
        ImagesDataset(DATA_PATH['backgrounds']['train'],
                      mode='RGB',
                      transforms=T.Compose([
                          A.RandomAffineAndResize((2048, 2048),
                                                  degrees=(-5, 5),
                                                  translate=(0.1, 0.1),
                                                  scale=(1, 2),
                                                  shear=(-5, 5)),
                          T.RandomHorizontalFlip(),
                          A.RandomBoxBlur(0.1, 5),
                          A.RandomSharpen(0.1),
                          T.ColorJitter(0.15, 0.15, 0.15, 0.05),
                          T.ToTensor()
                      ])),
    ])
    # manual sharding: each rank takes a contiguous 1/world_size slice
    dataset_train_len_per_gpu_worker = int(
        len(dataset_train) / distributed_num_gpus)
    dataset_train = Subset(
        dataset_train,
        range(rank * dataset_train_len_per_gpu_worker,
              (rank + 1) * dataset_train_len_per_gpu_worker))
    dataloader_train = DataLoader(dataset_train,
                                  shuffle=True,
                                  pin_memory=True,
                                  drop_last=True,
                                  batch_size=args.batch_size //
                                  distributed_num_gpus,
                                  num_workers=args.num_workers //
                                  distributed_num_gpus)

    # Validation DataLoader (rank 0 only): lighter augmentation, 50 samples
    if rank == 0:
        dataset_valid = ZipDataset([
            ZipDataset([
                ImagesDataset(DATA_PATH[args.dataset_name]['valid']['pha'],
                              mode='L'),
                ImagesDataset(DATA_PATH[args.dataset_name]['valid']['fgr'],
                              mode='RGB')
            ],
                       transforms=A.PairCompose([
                           A.PairRandomAffineAndResize((2048, 2048),
                                                       degrees=(-5, 5),
                                                       translate=(0.1, 0.1),
                                                       scale=(0.3, 1),
                                                       shear=(-5, 5)),
                           A.PairApply(T.ToTensor())
                       ]),
                       assert_equal_length=True),
            ImagesDataset(DATA_PATH['backgrounds']['valid'],
                          mode='RGB',
                          transforms=T.Compose([
                              A.RandomAffineAndResize((2048, 2048),
                                                      degrees=(-5, 5),
                                                      translate=(0.1, 0.1),
                                                      scale=(1, 1.2),
                                                      shear=(-5, 5)),
                              T.ToTensor()
                          ])),
        ])
        dataset_valid = SampleDataset(dataset_valid, 50)
        dataloader_valid = DataLoader(dataset_valid,
                                      pin_memory=True,
                                      drop_last=True,
                                      batch_size=args.batch_size //
                                      distributed_num_gpus,
                                      num_workers=args.num_workers //
                                      distributed_num_gpus)

    # Model: SyncBatchNorm across workers, DDP wrapper for gradient sync
    model = MattingRefine(args.model_backbone, args.model_backbone_scale,
                          args.model_refine_mode,
                          args.model_refine_sample_pixels,
                          args.model_refine_thresholding,
                          args.model_refine_kernel_size).to(rank)
    model = nn.SyncBatchNorm.convert_sync_batchnorm(model)
    model_distributed = nn.parallel.DistributedDataParallel(model,
                                                            device_ids=[rank])

    if args.model_last_checkpoint is not None:
        load_matched_state_dict(model, torch.load(args.model_last_checkpoint))

    # per-module learning rates: refiner fastest, backbone/aspp slowest
    optimizer = Adam([
        {'params': model.backbone.parameters(), 'lr': 5e-5},
        {'params': model.aspp.parameters(), 'lr': 5e-5},
        {'params': model.decoder.parameters(), 'lr': 1e-4},
        {'params': model.refiner.parameters(), 'lr': 3e-4},
    ])
    scaler = GradScaler()  # AMP loss scaling

    # Logging and checkpoints (rank 0 only)
    if rank == 0:
        if not os.path.exists(f'checkpoint/{args.model_name}'):
            os.makedirs(f'checkpoint/{args.model_name}')
        writer = SummaryWriter(f'log/{args.model_name}')

    # Run loop
    for epoch in range(args.epoch_start, args.epoch_end):
        for i, ((true_pha, true_fgr),
                true_bgr) in enumerate(tqdm(dataloader_train)):
            step = epoch * len(dataloader_train) + i

            true_pha = true_pha.to(rank, non_blocking=True)
            true_fgr = true_fgr.to(rank, non_blocking=True)
            true_bgr = true_bgr.to(rank, non_blocking=True)
            true_pha, true_fgr, true_bgr = random_crop(true_pha, true_fgr,
                                                       true_bgr)

            true_src = true_bgr.clone()

            # Augment with shadow: 30% of samples get a blurred, affine-warped
            # copy of the alpha subtracted from the background
            aug_shadow_idx = torch.rand(len(true_src)) < 0.3
            if aug_shadow_idx.any():
                aug_shadow = true_pha[aug_shadow_idx].mul(0.3 *
                                                          random.random())
                aug_shadow = T.RandomAffine(degrees=(-5, 5),
                                            translate=(0.2, 0.2),
                                            scale=(0.5, 1.5),
                                            shear=(-5, 5))(aug_shadow)
                aug_shadow = kornia.filters.box_blur(
                    aug_shadow, (random.choice(range(20, 40)),) * 2)
                true_src[aug_shadow_idx] = true_src[aug_shadow_idx].sub_(
                    aug_shadow).clamp_(0, 1)
                del aug_shadow
            del aug_shadow_idx

            # Composite foreground onto source
            true_src = true_fgr * true_pha + true_src * (1 - true_pha)

            # Augment with noise (40% of samples, on both src and bgr)
            aug_noise_idx = torch.rand(len(true_src)) < 0.4
            if aug_noise_idx.any():
                true_src[aug_noise_idx] = true_src[aug_noise_idx].add_(
                    torch.randn_like(true_src[aug_noise_idx]).mul_(
                        0.03 * random.random())).clamp_(0, 1)
                true_bgr[aug_noise_idx] = true_bgr[aug_noise_idx].add_(
                    torch.randn_like(true_bgr[aug_noise_idx]).mul_(
                        0.03 * random.random())).clamp_(0, 1)
            del aug_noise_idx

            # Augment background with jitter (80%) — simulates color mismatch
            # between the captured background and the composite
            aug_jitter_idx = torch.rand(len(true_src)) < 0.8
            if aug_jitter_idx.any():
                true_bgr[aug_jitter_idx] = kornia.augmentation.ColorJitter(
                    0.18, 0.18, 0.18, 0.1)(true_bgr[aug_jitter_idx])
            del aug_jitter_idx

            # Augment background with affine (30%) — simulates slight camera
            # misalignment
            aug_affine_idx = torch.rand(len(true_bgr)) < 0.3
            if aug_affine_idx.any():
                true_bgr[aug_affine_idx] = T.RandomAffine(
                    degrees=(-1, 1),
                    translate=(0.01, 0.01))(true_bgr[aug_affine_idx])
            del aug_affine_idx

            # mixed-precision forward/backward
            with autocast():
                pred_pha, pred_fgr, pred_pha_sm, pred_fgr_sm, pred_err_sm, _ = model_distributed(
                    true_src, true_bgr)
                loss = compute_loss(pred_pha, pred_fgr, pred_pha_sm,
                                    pred_fgr_sm, pred_err_sm, true_pha,
                                    true_fgr)
            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad()

            if rank == 0:
                if (i + 1) % args.log_train_loss_interval == 0:
                    writer.add_scalar('loss', loss, step)
                if (i + 1) % args.log_train_images_interval == 0:
                    writer.add_image('train_pred_pha',
                                     make_grid(pred_pha, nrow=5), step)
                    writer.add_image('train_pred_fgr',
                                     make_grid(pred_fgr, nrow=5), step)
                    writer.add_image('train_pred_com',
                                     make_grid(pred_fgr * pred_pha, nrow=5),
                                     step)
                    writer.add_image('train_pred_err',
                                     make_grid(pred_err_sm, nrow=5), step)
                    writer.add_image('train_true_src',
                                     make_grid(true_src, nrow=5), step)

                # free large tensors before validation to limit peak memory
                del true_pha, true_fgr, true_src, true_bgr
                del pred_pha, pred_fgr, pred_pha_sm, pred_fgr_sm, pred_err_sm

                if (i + 1) % args.log_valid_interval == 0:
                    valid(model, dataloader_valid, writer, step)

                if (step + 1) % args.checkpoint_interval == 0:
                    torch.save(
                        model.state_dict(),
                        f'checkpoint/{args.model_name}/epoch-{epoch}-iter-{step}.pth')

        if rank == 0:
            torch.save(model.state_dict(),
                       f'checkpoint/{args.model_name}/epoch-{epoch}.pth')

    # Clean up
    dist.destroy_process_group()
label = (torch.ones(batch_size) * 0.1).to(device) output = netD(fake.detach()).reshape(-1) lossD_fake = criterion(output, label) lossD = lossD_real + lossD_fake lossD.backward() optimizerD.step() # Train Generator: max_log(D(G(z))) netG.zero_grad() label = torch.ones(batch_size).to(device) output = netD(fake).reshape(-1) lossG = criterion(output, label) lossG.backward() optimizerG.step() if batch_idx % 100 == 0: print( f'Epoch [{epoch}/{num_epochs}] Batch {batch_idx}/{len(dataloader)} \ loss D: {lossD:.4f}, {lossG:.4f} D(x): {D_x:.4f}') with torch.no_grad(): fake = netG(fixed_noise) img_grid_real = torchvision.utils.make_grid(data[:32], normalize=True) img_grid_fake = torchvision.utils.make_grid(fake[:32], normalize=True) writer_real.add_image('MNIST real images', img_grid_real) writer_real.add_image('MNIST fake images', img_grid_fake)
# loop through all generated dataloaders with adversarial images results_dict = {} for attack_name in adv_dict: # measure attack success print("Testing performance of attack {}: ".format(attack_name)) for epsilon_attack, epsilon in zip(adv_dict[attack_name], epsilons): attacked_acc, _ = validate(epsilon_attack, model, criterion, 1, args) # save adv images for visualization purposes dataiter = iter(epsilon_attack) images, _ = dataiter.next() img_grid = utils.make_grid(images) summary.add_image( "Training Images Adversarially Attacked Using {} with eps {}". format(attack_name, epsilon), img_grid) print("Generating defences for attack {} with eps {}: ".format( attack_name, epsilon)) def_adv_dict = gen_defences(epsilon_attack, attack_name, defence_list) accuracies = { 'initial': initial_acc.item(), 'attacked': attacked_acc.item() } if 'adv_retraining' in args.defences: # evaluate retrained model acc1, val_loss = validate(epsilon_attack, robust_model,
class Evaluator(object):
    """Toy calibration experiment: learns a temperature weight matrix and bias
    over synthetic segmentation logits by minimizing a CCE (calibration)
    loss, logging tables, maps and histograms to TensorBoard.

    The actual segmentation model is bypassed (see commented code): per-batch
    "outputs" are a hand-built two-class probability map.  Depends on project
    helpers (cfg, get_segmentation_dataset, SegmentationMetric, CCELoss,
    perimageCCE) defined elsewhere.
    """

    def __init__(self, args):
        self.args = args
        self.device = torch.device(args.device)

        # image transform
        input_transform = transforms.Compose([
            transforms.ToTensor(),
            transforms.Normalize(cfg.DATASET.MEAN, cfg.DATASET.STD),
        ])

        # learning rate for the temperature parameters; prefix names the run
        self.lr = 2.5
        self.prefix = f"2_boxes_3_7={self.lr}"
        # self.prefix = f"overfit__count_toy_experiment_3class_7_2_1_conf_loss=total_xavier_weights_xavier_bias_lr={self.lr}"
        self.writer = SummaryWriter(log_dir=f"cce_toy_logs/{self.prefix}")
        # self.writer = SummaryWriter(log_dir= f"cce_cityscapes_logs/{self.prefix}")

        # dataset and dataloader
        val_dataset = get_segmentation_dataset(cfg.DATASET.NAME,
                                               split='val',
                                               mode='testval',
                                               transform=input_transform)
        # val_sampler = make_data_sampler(val_dataset, False, args.distributed)
        self.val_loader = data.DataLoader(dataset=val_dataset,
                                          shuffle=True,
                                          batch_size=cfg.TEST.BATCH_SIZE,
                                          drop_last=True,
                                          num_workers=cfg.DATASET.WORKERS,
                                          pin_memory=True)
        self.dataset = val_dataset
        self.classes = val_dataset.classes
        self.metric = SegmentationMetric(val_dataset.num_class,
                                         args.distributed)

        # model loading is disabled for the toy experiment:
        # self.model = get_segmentation_model().to(self.device)
        # if hasattr(self.model, 'encoder') and hasattr(self.model.encoder, 'named_modules') and \
        #     cfg.MODEL.BN_EPS_FOR_ENCODER:
        #     logging.info('set bn custom eps for bn in encoder: {}'.format(cfg.MODEL.BN_EPS_FOR_ENCODER))
        #     self.set_batch_norm_attr(self.model.encoder.named_modules(), 'eps', cfg.MODEL.BN_EPS_FOR_ENCODER)
        # if args.distributed:
        #     self.model = nn.parallel.DistributedDataParallel(self.model,
        #         device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True)
        # self.model.to(self.device)

    def set_batch_norm_attr(self, named_modules, attr, value):
        """Set `attr` to `value` on every (Sync)BatchNorm2d in named_modules."""
        for m in named_modules:
            if isinstance(m[1], nn.BatchNorm2d) or isinstance(
                    m[1], nn.SyncBatchNorm):
                setattr(m[1], attr, value)

    def eval(self):
        """Train temperature weights/bias for num_epochs over the val loader,
        logging per-class maps, CCE tables and losses to TensorBoard and
        periodically saving the learned parameters as .npy files."""
        self.metric.reset()

        print(f"Length of classes: {len(self.classes)}")
        # identity init is immediately overwritten by xavier below —
        # kept as in the experiment script
        temp_weights = torch.eye(len(self.classes), device="cuda")
        torch.nn.init.xavier_uniform_(temp_weights, gain=1.0)
        print(temp_weights)
        temp_weights.requires_grad = True
        # temp_weights.requires_grad= True
        temp_bias = torch.zeros(len(self.classes), device="cuda")
        # torch.nn.init.xavier_uniform_(temp_bias, gain=1.0)
        temp_bias.requires_grad = True
        # temp_weights = torch.rand(len(self.classes), len(self.classes), device="cuda", requires_grad=True)
        # temp_bias = torch.rand(len(self.classes), device="cuda", requires_grad=True)

        logging.info(
            "Start training of temprature weights, Total sample: {:d}".format(
                len(self.val_loader)))

        cce_criterion = CCELoss(len(self.classes)).to(self.device)
        cross_criterion = torch.nn.CrossEntropyLoss(ignore_index=-1)
        optimizer = torch.optim.SGD([temp_weights, temp_bias], lr=self.lr)

        import time
        time_start = time.time()
        num_epochs = 300
        for epoch in range(num_epochs):
            eceEvaluator_perimage = perimageCCE(n_classes=len(self.classes))
            epoch_loss_cce_total = 0
            epoch_loss_cross_entropy_total = 0
            epoch_loss_total = 0

            for i, (images, targets, filenames) in enumerate(self.val_loader):
                # import pdb; pdb.set_trace()
                optimizer.zero_grad()
                images = images.to(self.device)
                targets = targets.to(self.device)
                # print(image.shape)

                with torch.no_grad():
                    # synthetic two-class probability map replaces the model:
                    # left 200 columns favor class 0 (0.7), right favor class 1
                    # outputs = model.evaluate(images)
                    # outputs = torch.rand(1,3,300,400)
                    outputs = torch.ones(1, 2, 300, 400) * (torch.Tensor(
                        [0.3, 0.7]).reshape(1, -1, 1, 1))
                    # outputs = torch.ones(1,4,300,400)*(torch.Tensor([0.5,0.25,0.15, 0.1]).reshape(1,-1,1,1))
                    outputs = outputs.cuda()
                    outputs[0, 0, :, :200] = 0.7
                    outputs[0, 1, :, 200:] = 0.3
                    # outputs = torch.ones(1,3,300,400)*(torch.Tensor([0.7,0.2,0.1]).reshape(1,-1,1,1))
                    # # outputs = torch.ones(1,4,300,400)*(torch.Tensor([0.5,0.25,0.15, 0.1]).reshape(1,-1,1,1))
                    # outputs = outputs.cuda()
                    # outputs[0,0,100:200, 50:150] = 0.1
                    # outputs[0,0,100:150, 250:300] = 0.2
                    # outputs[0,1,100:200, 50:150] = 0.7
                    # outputs[0,1,100:150, 250:300] = 0.1
                    # outputs[0,2,100:200, 50:150] = 0.2
                    # outputs[0,2,100:150, 250:300] = 0.7

                # Converting back to logits, then apply the learnable affine
                # temperature transform per-pixel over the class dimension
                outputs = torch.log(outputs)
                outputs = outputs.permute(0, 2, 3, 1).contiguous()
                outputs = torch.matmul(outputs, temp_weights)
                outputs = outputs + temp_bias
                outputs = outputs.permute(0, 3, 1, 2).contiguous()

                # Add image stuff: render each class's probability map
                save_imgs = torch.softmax(outputs, dim=1).squeeze(0)
                # analyse(outputs = save_imgs.unsqueeze(0))
                # accuracy(outputs = outputs)
                for class_no, class_distri in enumerate(save_imgs):
                    plt.clf()
                    # pin two pixels to 0/1 so the colormap range is stable
                    class_distri[0][0] = 0
                    class_distri[0][1] = 1
                    im = plt.imshow(class_distri.detach().cpu().numpy(),
                                    cmap="Greens")
                    plt.colorbar(im)
                    plt.savefig("temp_files/temp.jpg")
                    plt.clf()
                    import cv2
                    # round-trip through a jpg on disk to get an HWC array
                    img_dif = cv2.imread("temp_files/temp.jpg")
                    self.writer.add_image(f"Class_{class_no}",
                                          img_dif,
                                          epoch,
                                          dataformats="HWC")

                loss_cce = cce_criterion.forward(outputs, targets)
                loss_cross_entropy = cross_criterion.forward(outputs, targets)
                # alpha=0 means only the calibration loss drives the update
                alpha = 0
                total_loss = loss_cce + alpha * loss_cross_entropy

                epoch_loss_cce_total += loss_cce.item()
                epoch_loss_cross_entropy_total += loss_cross_entropy.item()
                epoch_loss_total += total_loss.item()

                total_loss.backward()
                optimizer.step()

                with torch.no_grad():
                    for output, target in zip(outputs, targets.detach()):
                        # older ece requires softmax and size output=[class,w,h] target=[w,h]
                        eceEvaluator_perimage.update(output.softmax(dim=0),
                                                     target)
                # print(outputs.shape)
                # print(eceEvaluator_perimage.get_overall_CCELoss())
                print(
                    f"batch :{i+1}/{len(self.val_loader)}" +
                    "loss cce : {:.5f} | loss cls : {:.5f} | loss tot : {:.5f}"
                    .format(loss_cce, loss_cross_entropy, total_loss))

            print(temp_weights)
            print(temp_bias)

            epoch_loss_cce_total /= len(self.val_loader)
            epoch_loss_cross_entropy_total /= len(self.val_loader)
            epoch_loss_total /= len(self.val_loader)

            count_table_image, _ = eceEvaluator_perimage.get_count_table_img(
                self.classes)
            cce_table_image, dif_map = eceEvaluator_perimage.get_perc_table_img(
                self.classes)
            self.writer.add_image("CCE_table",
                                  cce_table_image,
                                  epoch,
                                  dataformats="HWC")
            self.writer.add_image("Count table",
                                  count_table_image,
                                  epoch,
                                  dataformats="HWC")
            self.writer.add_image("DifMap",
                                  dif_map,
                                  epoch,
                                  dataformats="HWC")

            self.writer.add_scalar(f"Cross EntropyLoss_LR",
                                   epoch_loss_cross_entropy_total, epoch)
            self.writer.add_scalar(f"CCELoss_LR", epoch_loss_cce_total, epoch)
            self.writer.add_scalar(f"Total Loss_LR", epoch_loss_total, epoch)
            self.writer.add_histogram("Weights", temp_weights, epoch)
            self.writer.add_histogram("Bias", temp_bias, epoch)

            # output = output/temp_weights
            # print(output.shape)
            # print(temp_weights, temp_bias)
            if epoch > 0 and epoch % 10 == 0:
                print("saving weights.")
                np.save("weights/toy/wt_{}_{}.npy".format(epoch, self.prefix),
                        temp_weights.cpu().detach().numpy())
                np.save("weights/toy/b{}_{}.npy".format(epoch, self.prefix),
                        temp_bias.cpu().detach().numpy())

            # print("epoch {} : loss {:.5f}".format(epoch, epoch_loss))
            # import pdb; pdb.set_trace()
        self.writer.close()
# Create the dataset object for example with the "NIC_v2 - 79 benchmark" # and assuming the core50 location in ~/core50/128x128/ dataset = CORE50(root='/home/akash/core50/data/core50_128x128', scenario="ni", task_type='segment') writer = SummaryWriter() # Get the fixed test set # test_x, test_y = dataset.get_test_set() # loop over the training incremental batches for i, train_batch in enumerate(dataset): # WARNING train_batch is NOT a mini-batch, but one incremental batch! # You can later train with SGD indexing train_x and train_y properly. train_x, train_y, t = train_batch print("----------- batch {0} -------------".format(i)) print("train_x shape: {}, train_y shape: {}, task: {}".format( train_x.shape, train_y['mask'].shape, t)) if (train_x.shape[0] > 0): print("TASK NOT EMPTY") img_1 = train_y['mask'][0, :, :] writer.add_image('task_img_' + str(t), train_x[0, :, :, :], dataformats='HWC') writer.add_image('task_seg_' + str(t), img_1, dataformats='HW') # use the data pass
class TeeTrainer(Trainer):
    """Trainer for a segmentation task.

    Wires up model construction, (optionally distributed) data loaders,
    loss/optimizer/scheduler, and a TensorBoard writer, then drives a
    train/validate loop with best-accuracy checkpointing.
    """

    def __init__(self, model, train_set, val_set, configs):
        """model: a model *class* (called with in_channels/num_classes);
        train_set/val_set: datasets; configs: dict of hyperparameters."""
        super().__init__()
        print("Start trainer..")

        # load config
        self._configs = configs
        self._lr = self._configs["lr"]
        self._batch_size = self._configs["batch_size"]
        self._momentum = self._configs["momentum"]
        self._weight_decay = self._configs["weight_decay"]
        self._distributed = self._configs["distributed"]
        self._num_workers = self._configs["num_workers"]
        self._device = torch.device(self._configs["device"])
        self._max_epoch_num = self._configs["max_epoch_num"]
        self._max_plateau_count = self._configs["max_plateau_count"]

        # load dataloader and model
        self._train_set = train_set
        self._val_set = val_set
        self._model = model(
            in_channels=configs["in_channels"],
            num_classes=configs["num_classes"],
        )
        # NOTE(review): checkpoint path is hard-coded; loading is unconditional,
        # so a missing "saved/checkpoints/mixed.test" makes construction fail.
        self._model.load_state_dict(
            torch.load("saved/checkpoints/mixed.test")["net"])
        print(self._configs)

        self._model = self._model.to(self._device)
        if self._distributed == 1:
            # Distributed training: NCCL backend, DDP-wrapped model, and
            # per-worker numpy seeding in the loader workers.
            torch.distributed.init_process_group(backend="nccl")
            self._model = nn.parallel.DistributedDataParallel(
                self._model, find_unused_parameters=True)
            self._train_loader = DataLoader(
                self._train_set,
                batch_size=self._batch_size,
                num_workers=self._num_workers,
                pin_memory=True,
                shuffle=True,
                worker_init_fn=lambda x: np.random.seed(x),
            )
            self._val_loader = DataLoader(
                self._val_set,
                batch_size=self._batch_size,
                num_workers=self._num_workers,
                pin_memory=True,
                shuffle=False,
                worker_init_fn=lambda x: np.random.seed(x),
            )
        else:
            # Single-process loaders (worker seeding left disabled here).
            self._train_loader = DataLoader(
                self._train_set,
                batch_size=self._batch_size,
                num_workers=self._num_workers,
                pin_memory=True,
                shuffle=True,
                # worker_init_fn=lambda x: np.random.seed(x)
            )
            self._val_loader = DataLoader(
                self._val_set,
                batch_size=self._batch_size,
                num_workers=self._num_workers,
                pin_memory=True,
                shuffle=False,
                # worker_init_fn=lambda x: np.random.seed(x)
            )

        # define loss function (criterion) and optimizer
        # class_weights = torch.FloatTensor(np.array([0.3, 0.7])).to(self._device)
        self._criterion = nn.CrossEntropyLoss().to(self._device)
        self._optimizer = RAdam(
            params=self._model.parameters(),
            lr=self._lr,
            weight_decay=self._weight_decay,
        )
        self._scheduler = ReduceLROnPlateau(
            self._optimizer,
            patience=self._configs["plateau_patience"],
            verbose=True)

        # training info (wall-clock start, TensorBoard run directory)
        self._start_time = datetime.datetime.now()
        self._start_time = self._start_time.replace(microsecond=0)
        log_dir = os.path.join(
            self._configs["cwd"],
            self._configs["log_dir"],
            "{}_{}".format(self._configs["model_name"],
                           str(self._start_time)),
        )
        self._writer = SummaryWriter(log_dir)

        # per-epoch histories and best-so-far tracking
        self._train_loss = []
        self._train_acc = []
        self._val_loss = []
        self._val_acc = []
        self._best_loss = 1e9
        self._best_acc = 0
        self._plateau_count = 0
        self._current_epoch_num = 0

    def reset(self):
        """reset trainer"""
        pass

    def _train(self):
        """Run one training epoch; appends mean loss/acc to the histories."""
        self._model.train()
        train_loss = 0.0
        train_acc = 0.0

        for i, (images, targets) in tqdm(enumerate(self._train_loader),
                                         total=len(self._train_loader),
                                         leave=False):
            images = images.cuda(non_blocking=True)
            targets = targets.cuda(non_blocking=True)

            # compute output, measure accuracy and record loss
            outputs = self._model(images)
            loss = self._criterion(outputs, targets)
            acc = accuracy(outputs, targets)[0]
            # acc = eval_metrics(targets, outputs, 2)[0]
            train_loss += loss.item()
            train_acc += acc.item()

            # compute gradient and do SGD step
            self._optimizer.zero_grad()
            loss.backward()
            self._optimizer.step()

            # log: first batch of every epoch goes to TensorBoard as a grid
            if i == 0:
                grid = torchvision.utils.make_grid(images)
                self._writer.add_image("images", grid, 0)
                # self._writer.add_graph(self._model, images)
                # self._writer.close()

            # "little" debug mode: dump channel 1 of the first output as PNG.
            # NOTE(review): assumes batch size 1 (squeeze of dim 0) — confirm.
            if self._configs["little"] == 1:
                mask = torch.squeeze(outputs, 0)
                mask = mask.detach().cpu().numpy() * 255
                mask = np.transpose(mask, (1, 2, 0)).astype(np.uint8)
                cv2.imwrite(
                    os.path.join("debug",
                                 "e{}.png".format(self._current_epoch_num)),
                    mask[..., 1],
                )

        # i is the last batch index; +1 turns it into the batch count.
        i += 1
        self._train_loss.append(train_loss / i)
        self._train_acc.append(train_acc / i)

    def _val(self):
        """Run one validation epoch; appends mean loss/acc to the histories.

        NOTE(review): no torch.no_grad() here, so validation builds autograd
        graphs (extra memory); also the cv2.imwrite path embeds a tensor —
        both look unintended, left unchanged.
        """
        self._model.eval()
        val_loss = 0.0
        val_acc = 0.0
        # wipe previous epoch's debug dumps
        os.system("rm -rf debug/*")

        for i, (images, targets) in tqdm(enumerate(self._val_loader),
                                         total=len(self._val_loader),
                                         leave=False):
            images = images.cuda(non_blocking=True)
            targets = targets.cuda(non_blocking=True)

            # compute output, measure accuracy and record loss
            outputs = self._model(images)
            loss = self._criterion(outputs, targets)
            acc = accuracy(outputs, targets)[0]
            # acc = eval_metrics(targets, outputs, 2)[0]
            val_loss += loss.item()
            val_acc += acc.item()

            # debug time
            outputs = torch.squeeze(outputs, dim=0)
            outputs = torch.argmax(outputs, dim=0)
            tmp_image = torch.squeeze(images, dim=0)
            print(tmp_image.shape)
            tmp_image = tmp_image.cpu().numpy()
            cv2.imwrite("debug/{}/{}.png".format(outputs, i), tmp_image)

        # i is the last batch index; +1 turns it into the batch count.
        i += 1
        self._val_loss.append(val_loss / i)
        self._val_acc.append(val_acc / i)

    def train(self):
        """make a training job"""
        while not self._is_stop():
            self._train()
            self._val()
            self._update_training_state()
            self._logging()
            self._increase_epoch_num()
        self._writer.close()  # be careful with this line of code

    def _update_training_state(self):
        """Checkpoint on a new best val accuracy, otherwise bump the
        plateau counter; always step the plateau LR scheduler."""
        if self._val_acc[-1] > self._best_acc:
            self._save_weights()
            self._plateau_count = 0
            self._best_acc = self._val_acc[-1]
            self._best_loss = self._val_loss[-1]
        else:
            self._plateau_count += 1
        self._scheduler.step(self._val_loss[-1])

    def _logging(self):
        # TODO: save message to log file, tensorboard then
        consume_time = str(datetime.datetime.now() - self._start_time)
        # one-line epoch summary: losses (train/val/best), accs, plateau count
        message = "\nE{:03d} {:.3f}/{:.3f}/{:.3f} {:.3f}/{:.3f}/{:.3f} | p{:02d} Time {}\n".format(
            self._current_epoch_num,
            self._train_loss[-1],
            self._val_loss[-1],
            self._best_loss,
            self._train_acc[-1],
            self._val_acc[-1],
            self._best_acc,
            self._plateau_count,
            consume_time[:-7],
        )
        self._writer.add_scalar("Accuracy/train", self._train_acc[-1],
                                self._current_epoch_num)
        self._writer.add_scalar("Accuracy/val", self._val_acc[-1],
                                self._current_epoch_num)
        self._writer.add_scalar("Loss/train", self._train_loss[-1],
                                self._current_epoch_num)
        self._writer.add_scalar("Loss/val", self._val_loss[-1],
                                self._current_epoch_num)
        print(message)

    def _is_stop(self):
        """check stop condition"""
        return (self._plateau_count > self._max_plateau_count
                or self._current_epoch_num > self._max_epoch_num)

    def _increase_epoch_num(self):
        self._current_epoch_num += 1

    def _store_trainer(self):
        """store config, training info and traning result to file"""
        pass

    def _save_weights(self):
        """save checkpoint"""
        # DDP wraps the model, so the raw weights live under .module
        if self._distributed == 0:
            state_dict = self._model.state_dict()
        else:
            state_dict = self._model.module.state_dict()
        state = {
            **self._configs,
            "net": state_dict,
            "best_loss": self._best_loss,
            "best_acc": self._best_acc,
        }
        checkpoint_dir = os.path.join(self._configs["cwd"],
                                      "saved/checkpoints")
        if not os.path.exists(checkpoint_dir):
            os.makedirs(checkpoint_dir, exist_ok=True)
        torch.save(state,
                   os.path.join(checkpoint_dir, self._configs["model_name"]))
def main():
    """MNIST training entry point.

    Parses CLI arguments, builds data loaders, model, loss, optimizer and
    LR scheduler, runs the train/test loop while logging to TensorBoard,
    and optionally saves the trained weights.
    """
    # Training settings
    parser = argparse.ArgumentParser(description="PyTorch MNIST Example")
    parser.add_argument(
        "--batch-size",
        type=int,
        default=64,
        metavar="N",
        help="input batch size for training (default: 64)",
    )
    parser.add_argument(
        "--test-batch-size",
        type=int,
        default=1000,
        metavar="N",
        help="input batch size for testing (default: 1000)",
    )
    parser.add_argument(
        "--epochs",
        type=int,
        default=10,
        metavar="N",
        help="number of epochs to train (default: 10)",
    )
    parser.add_argument(
        "--lr",
        type=float,
        default=0.01,
        metavar="LR",
        help="learning rate (default: 0.01)",
    )
    parser.add_argument(
        "--momentum",
        type=float,
        default=0.5,
        metavar="M",
        help="SGD momentum (default: 0.5)",
    )
    parser.add_argument(
        "--no-cuda", action="store_true", default=False, help="disables CUDA training"
    )
    parser.add_argument(
        "--seed", type=int, default=1, metavar="S", help="random seed (default: 1)"
    )
    parser.add_argument(
        "--log-interval",
        type=int,
        default=100,
        metavar="N",
        help="how many batches to wait before logging training status",
    )
    parser.add_argument(
        "--save-model",
        action="store_true",
        default=False,
        help="For Saving the current Model",
    )
    # BUGFIX: choices were misspelled ("raner", "ranerqh"), which made the
    # "ranger"/"rangerqh" branches below unreachable — argparse would reject
    # the only spellings the dispatch code checks for.
    parser.add_argument(
        "--optimizer",
        default="sgd",
        choices=["ranger", "rangerqh", "sgd"],
        help="choose optimizer from choices",
    )
    parser.add_argument(
        "--sa",
        action="store_true",
        help="use self attention module",
    )
    parser.add_argument(
        "--mish", action="store_true", help="use Mish activate function"
    )
    parser.add_argument("--smooth", default=None, help="put float to smooth or sce")
    parser.add_argument("--gp", action="store_true", help="use global pooling")
    parser.add_argument("--fpa", action="store_true", help="use fpa scheduler")
    args = parser.parse_args()

    # Tensorboard
    writer = SummaryWriter()

    torch.manual_seed(args.seed)
    use_cuda = not args.no_cuda and torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")

    kwargs = {"num_workers": 1, "pin_memory": True} if use_cuda else {}
    train_loader = torch.utils.data.DataLoader(
        datasets.MNIST(
            "../data",
            train=True,
            download=True,
            transform=transforms.Compose(
                [transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,))]
            ),
        ),
        batch_size=args.batch_size,
        shuffle=True,
        **kwargs
    )
    test_loader = torch.utils.data.DataLoader(
        datasets.MNIST(
            "../data",
            train=False,
            transform=transforms.Compose(
                [transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,))]
            ),
        ),
        batch_size=args.test_batch_size,
        shuffle=True,
        **kwargs
    )

    model = Net(args.sa, args.gp, args.mish).to(device)

    # choose loss function
    if args.smooth is None:
        print("use CrossEntropy Loss")
        criterion = torch.nn.CrossEntropyLoss()
    else:
        print("use LabelSmoothing Loss")
        criterion = LabelSmoothingLoss(smoothing=float(args.smooth))

    # choose optimizer
    if args.optimizer == "sgd":
        print("use Momentum SGD optimizer")
        optimizer = optim.SGD(model.parameters(), lr=args.lr, momentum=args.momentum)
    elif args.optimizer == "ranger":
        print("use Ranger optimizer")
        optimizer = Ranger(model.parameters(), lr=args.lr)
    elif args.optimizer == "rangerqh":
        print("use RangerQH optimizer")
        optimizer = RangerQH(model.parameters(), lr=args.lr)

    # choose LR scheduler
    if args.fpa:
        print("use FlatplusAnneal scheduler")
        scheduler = FlatplusAnneal(optimizer, max_iter=args.epochs, step_size=0.7)
    else:
        print("use StepLR scheduler")
        # BUGFIX: step_size must be >= 1; for --epochs < 3 the original
        # epochs // 3 gave 0 and StepLR divides by it.
        scheduler = optim.lr_scheduler.StepLR(
            optimizer, step_size=max(1, args.epochs // 3))

    for epoch in range(1, args.epochs + 1):
        train_loss = train(
            args, model, device, train_loader, optimizer, epoch, criterion
        )
        test_loss, test_acc = test(args, model, device, test_loader, criterion)
        scheduler.step()
        # NOTE(review): newer torch prefers scheduler.get_last_lr()[0] here;
        # get_lr() is kept for compatibility with the rest of this file.
        writer.add_scalar("lr", scheduler.get_lr()[0], epoch)
        writer.add_scalars("loss", {"train": train_loss, "test": test_loss}, epoch)
        writer.add_scalar("acc/test", test_acc, epoch)

    if args.save_model:
        torch.save(model.state_dict(), "mnist_cnn.pt")

    # tfboard: log a sample grid and the model graph
    images, labels = next(iter(train_loader))
    grid = utils.make_grid(images)
    writer.add_image("images", grid, 0)
    writer.add_graph(model, images)
    writer.close()
def train_gssoft(args):
    """Train a GSSOFT (Gumbel-softmax VQ) model on CIFAR-10.

    Builds the model and CIFAR-10 loaders, optionally resumes from a
    checkpoint, then alternates a training epoch (with periodic checkpoint
    saves) and an evaluation epoch, logging running-mean metrics and sample
    reconstructions to TensorBoard each epoch.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    model = GSSOFT(args.channels, args.latent_dim, args.num_embeddings,
                   args.embedding_dim)
    model.to(device)

    # Run name encodes the architecture hyperparameters.
    model_name = "{}_C_{}_N_{}_M_{}_D_{}".format(args.model, args.channels,
                                                 args.latent_dim,
                                                 args.num_embeddings,
                                                 args.embedding_dim)

    checkpoint_dir = Path(model_name)
    checkpoint_dir.mkdir(parents=True, exist_ok=True)

    writer = SummaryWriter(log_dir=Path("runs") / model_name)

    optimizer = optim.Adam(model.parameters(), lr=args.learning_rate)

    if args.resume is not None:
        # Resume model, optimizer and step counter from a saved checkpoint
        # (map_location keeps CPU-saved tensors loadable anywhere).
        print("Resume checkpoint from: {}:".format(args.resume))
        checkpoint = torch.load(args.resume,
                                map_location=lambda storage, loc: storage)
        model.load_state_dict(checkpoint["model"])
        optimizer.load_state_dict(checkpoint["optimizer"])
        global_step = checkpoint["step"]
    else:
        global_step = 0

    transform = transforms.Compose(
        [transforms.ToTensor(), transforms.Lambda(shift)])
    training_dataset = datasets.CIFAR10("./CIFAR10", train=True, download=True,
                                        transform=transform)
    test_dataset = datasets.CIFAR10("./CIFAR10", train=False, download=True,
                                    transform=transform)

    training_dataloader = DataLoader(training_dataset,
                                     batch_size=args.batch_size, shuffle=True,
                                     num_workers=args.num_workers,
                                     pin_memory=True)
    test_dataloader = DataLoader(test_dataset, batch_size=64, shuffle=True,
                                 drop_last=True, num_workers=args.num_workers,
                                 pin_memory=True)

    # Convert the step budget into whole epochs (and skip completed ones
    # when resuming).
    num_epochs = args.num_training_steps // len(training_dataloader) + 1
    start_epoch = global_step // len(training_dataloader) + 1

    # Number of dimensions per image (3x32x32) — normalizes the ELBO.
    N = 3 * 32 * 32

    for epoch in range(start_epoch, num_epochs + 1):
        model.train()
        average_logp = average_KL = average_elbo = average_bpd = average_perplexity = 0
        for i, (images, _) in enumerate(tqdm(training_dataloader), 1):
            images = images.to(device)

            dist, KL, perplexity = model(images)
            # Targets are the integer pixel values; `shift` above presumably
            # maps pixels to [-0.5, 0.5] — TODO confirm against `shift`.
            targets = (images + 0.5) * 255
            targets = targets.long()
            logp = dist.log_prob(targets).sum((1, 2, 3)).mean()
            loss = (KL - logp) / N
            elbo = (KL - logp) / N
            bpd = elbo / np.log(2)  # bits per dimension

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            global_step += 1

            if global_step % 25000 == 0:
                save_checkpoint(model, optimizer, global_step, checkpoint_dir)

            # Incremental running means (i starts at 1).
            average_logp += (logp.item() - average_logp) / i
            average_KL += (KL.item() - average_KL) / i
            average_elbo += (elbo.item() - average_elbo) / i
            average_bpd += (bpd.item() - average_bpd) / i
            average_perplexity += (perplexity.item() - average_perplexity) / i

        writer.add_scalar("logp/train", average_logp, epoch)
        writer.add_scalar("kl/train", average_KL, epoch)
        writer.add_scalar("elbo/train", average_elbo, epoch)
        writer.add_scalar("bpd/train", average_bpd, epoch)
        writer.add_scalar("perplexity/train", average_perplexity, epoch)

        model.eval()
        average_logp = average_KL = average_elbo = average_bpd = average_perplexity = 0
        for i, (images, _) in enumerate(test_dataloader, 1):
            images = images.to(device)

            with torch.no_grad():
                dist, KL, perplexity = model(images)
                targets = (images + 0.5) * 255
                targets = targets.long()
                logp = dist.log_prob(targets).sum((1, 2, 3)).mean()
                elbo = (KL - logp) / N
                bpd = elbo / np.log(2)

            average_logp += (logp.item() - average_logp) / i
            average_KL += (KL.item() - average_KL) / i
            average_elbo += (elbo.item() - average_elbo) / i
            average_bpd += (bpd.item() - average_bpd) / i
            average_perplexity += (perplexity.item() - average_perplexity) / i

        writer.add_scalar("logp/test", average_logp, epoch)
        writer.add_scalar("kl/test", average_KL, epoch)
        writer.add_scalar("elbo/test", average_elbo, epoch)
        writer.add_scalar("bpd/test", average_bpd, epoch)
        writer.add_scalar("perplexity/test", average_perplexity, epoch)

        # Reconstructions from the last eval batch: most likely pixel value
        # per position, rescaled to [0, 1] for image logging.
        samples = torch.argmax(dist.logits, dim=-1)
        grid = utils.make_grid(samples.float() / 255)
        writer.add_image("reconstructions", grid, epoch)

        print(
            "epoch:{}, logp:{:.3E}, KL:{:.3E}, elbo:{:.3f}, bpd:{:.3f}, perplexity:{:.3f}"
            .format(epoch, average_logp, average_KL, average_elbo,
                    average_bpd, average_perplexity))
# Fragment of a WGAN-GP training loop (iter_d, netD, netG, mone, writer,
# loader, valid_noise etc. are defined earlier in the full file).
# Log discriminator statistics once per critic cycle.
if iter_d == 0:
    # x-axis is a global step: iteration within epoch plus an epoch offset.
    writer.add_scalar('D/fake', D_fake,
                      iteration + int(len(loader)/5)*epoch)
    writer.add_scalar('D/GP', gradient_penalty,
                      iteration + int(len(loader)/5)*epoch)
    writer.add_scalar('D/real', D_real, iteration+int(len(loader)/5)*epoch)
    writer.add_scalar('D/cost', D_cost,
                      iteration + int(len(loader)/5)*epoch)
    writer.add_scalar('D/wasserstein', Wasserstein_D,
                      iteration + int(len(loader)/5)*epoch)

# Train generator network
for i in range(1):
    # Freeze the critic while updating the generator.
    for p in netD.parameters():
        p.requires_grad = False  # to avoid computation
    netG.zero_grad()

    fake = netG.generate_images(config.batchSize, device)
    G = netD(fake)
    G = G.mean()
    # Maximize the critic score: backprop with -1 (mone), cost is -G.
    G.backward(mone)
    G_cost = -G
    optimizerG.step()
    writer.add_scalar('G/cost', G_cost,
                      iteration + int(len(loader)/5)*epoch)

# Periodically render a fixed-noise sample grid for visual progress.
if iteration%20 == 0:
    valid_x = netG(valid_noise)
    writer.add_image('valid_image',
                     torchvision.utils.make_grid(valid_x, nrow=3),
                     global_step=iteration)
def train_latent(self, imgs, classes, model_dir, tensorboard_dir):
    """Train the latent (GLO-style) model on an image/class array pair.

    imgs: numpy array of images, channel-last (permuted to NCHW below);
    classes: per-image class labels. Saves the model and logs loss and
    sample grids to TensorBoard after every epoch.
    """
    self.latent_model = LatentModel(self.config)

    # Each image gets its own id so the content embedding can be indexed.
    data = dict(img=torch.from_numpy(imgs).permute(0, 3, 1, 2),
                img_id=torch.from_numpy(np.arange(imgs.shape[0])),
                class_id=torch.from_numpy(classes.astype(np.int64)))

    dataset = NamedTensorDataset(data)
    data_loader = DataLoader(dataset,
                             batch_size=self.config['train']['batch_size'],
                             shuffle=True,
                             sampler=None,
                             batch_sampler=None,
                             num_workers=1,
                             pin_memory=True,
                             drop_last=True)

    self.latent_model.init()
    self.latent_model.to(self.device)

    criterion = VGGDistance(self.config['perceptual_loss']['layers']).to(
        self.device)

    # Two parameter groups with separate learning rates:
    # generator/modulation vs. the learned embeddings.
    optimizer = Adam([{
        'params': itertools.chain(self.latent_model.modulation.parameters(),
                                  self.latent_model.generator.parameters()),
        'lr': self.config['train']['learning_rate']['generator']
    }, {
        'params': itertools.chain(
            self.latent_model.content_embedding.parameters(),
            self.latent_model.class_embedding.parameters()),
        'lr': self.config['train']['learning_rate']['latent']
    }], betas=(0.5, 0.999))

    # Cosine anneal over the total number of optimizer steps
    # (stepped once per batch below).
    scheduler = CosineAnnealingLR(
        optimizer,
        T_max=self.config['train']['n_epochs'] * len(data_loader),
        eta_min=self.config['train']['learning_rate']['min'])

    summary = SummaryWriter(log_dir=tensorboard_dir)

    train_loss = AverageMeter()
    for epoch in range(self.config['train']['n_epochs']):
        self.latent_model.train()
        train_loss.reset()

        pbar = tqdm(iterable=data_loader)
        for batch in pbar:
            batch = {
                name: tensor.to(self.device)
                for name, tensor in batch.items()
            }

            optimizer.zero_grad()
            out = self.latent_model(batch['img_id'], batch['class_id'])

            # L2 penalty keeps content codes small (regularization).
            content_penalty = torch.sum(out['content_code']**2, dim=1).mean()
            loss = criterion(
                out['img'], batch['img']
            ) + self.config['content_decay'] * content_penalty

            loss.backward()
            optimizer.step()
            scheduler.step()

            train_loss.update(loss.item())
            pbar.set_description_str('epoch #{}'.format(epoch))
            pbar.set_postfix(loss=train_loss.avg)

        pbar.close()

        # Checkpoint every epoch, then log loss and fixed/random samples.
        self.save(model_dir, latent=True, amortized=False)

        summary.add_scalar(tag='loss',
                           scalar_value=train_loss.avg,
                           global_step=epoch)

        fixed_sample_img = self.generate_samples(dataset, randomized=False)
        random_sample_img = self.generate_samples(dataset, randomized=True)

        summary.add_image(tag='sample-fixed',
                          img_tensor=fixed_sample_img,
                          global_step=epoch)
        summary.add_image(tag='sample-random',
                          img_tensor=random_sample_img,
                          global_step=epoch)

    summary.close()
class NetworkTrainer(object): def __init__(self, opt = 'adam', lr = 0.001, batch_size = 4, epochs = 10, dcm_loss = True, padding_center = False, experiment = 'TEST', gpu = '0', ): ## Set the default information self.info = OrderedDict() self.set_info(opt, lr, batch_size, epochs, dcm_loss, padding_center, experiment) self.set_default_info() self.device = torch.device("cuda:%s" % gpu if torch.cuda.is_available() else "cpu") ## Dataset if self.info['experiment'] == 'TEST': self.all_dataset = self.load_dataset(list_id = range(101)) else: self.all_dataset = self.load_dataset() ## Output folder path self.output_dir = os.path.join("./results", experiment) if not os.path.exists(self.output_dir): os.mkdir(self.output_dir) self.writer = SummaryWriter(log_dir = self.output_dir) ## initialize results dictionary self.results = self.intialize_results_dict() # Network self.generator = self.get_network('ce-net') self.discriminator = self.get_network('disc') #self.optimizer = self.get_optimizer() self.criteriaMSE = self.get_loss_fx('MSE') self.criteriaBCE = self.get_loss_fx('BCE') self.transform = self.get_transform() self.train_loader, self.val_loader = self.get_data_loader() print("Network initizliation:") print("Training Number: %s" % (len(self.train_loader) * self.info['batch_size'])) print("Validation Number: %s" % (len(self.val_loader) * self.info['batch_size'])) def set_info(self, opt, lr, batch_size, epochs, dcm_loss, padding_center, experiment): self.info['optimizer'] = opt self.info['learning_rate'] = lr self.info['batch_size'] = batch_size self.info['epochs'] = epochs self.info['dcm_loss'] = dcm_loss self.info['padding_center'] = padding_center self.info['experiment'] = experiment def set_default_info(self): self.info['Generator_adv_loss'] = 0.1 self.info['Generator_mse_loss'] = 0.9 self.info['Discriminator_adv_loss'] = 0.9 self.info['Discriminator_dcm_loss'] = 0.1 self.info['Sample_interval'] = 10 def update_info(key, value): self.info[key] = value def 
intialize_results_dict(self): results = OrderedDict() results['G_training_loss'] = [] results['D_training_loss'] = [] results['validation_loss'] = [] results['validation_mse_loss'] = [] results['validation_adv_loss'] = [] results['best_loss'] = float('inf') results['best_MSE'] = float('inf') return results def load_dataset(self, list_id = None, transform = None, inpaint = False): return SSIDataset(list_id = list_id, transform = transform, inpaint = inpaint) def data_split(self, validation_split = 0.2, random_seed = 123, shuffle_dataset = True): dataset_size = len(self.all_dataset) indices = list(range(dataset_size)) split = int(np.floor(validation_split * dataset_size)) if shuffle_dataset : np.random.seed(random_seed) np.random.shuffle(indices) train_indices, val_indices = indices[split:], indices[:split] return train_indices, val_indices def get_data_loader(self): train_indices, val_indices = self.data_split() train_dataset = self.load_dataset(list_id = train_indices, transform = self.transform['train'], inpaint = True) val_dataset = self.load_dataset(list_id = val_indices, transform = self.transform['val'], inpaint = True) train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=self.info['batch_size'], num_workers = 4) val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=self.info['batch_size'], num_workers = 4) return train_loader, val_loader def get_optimizer(self): opt, lr, reg = self.info['optimizer'], self.info['learning_rate'], self.info['regularization_weights'] if opt == 'adam': return optim.Adam(self.network.parameters(), lr=lr, weight_decay = reg) elif opt == 'rmsprop': return optim.RMSprop(self.network.parameters(), lr = lr, weight_decay = reg) else: return optim.SGD(self.network.parameters(), lr = lr, weight_decay = reg) def get_network(self, net = 'ce-net'): if net == 'ce-net': return CENet() elif net == 'disc': if self.info['dcm_loss']: return Discriminator(9) else: return Discriminator(1) def get_transform(self): return 
get_transformer_norm() def get_loss_fx(self, loss_fx): if loss_fx == 'MSE': return nn.MSELoss() elif loss_fx == 'BCE': return nn.BCELoss() def _get_output_labels(self, output): if self.info['dcm_loss']: # Discrimation labels pred_dlabel = output[:,0] # DICOM Labels pred_dicom = output[:,1:] else: pred_dlabel = output pred_dicom = None return pred_dlabel, pred_dicom def padding_center(self, imgs, centers): new_img = imgs.clone() new_img[:,:,64:192, 100:300] = centers return new_img def get_disc_input(self, imgs, centers): if self.info['padding_center']: disc_input = self.padding_center(imgs, centers) else: disc_input = centers return disc_input def sample_images(self, imgs, centers, pred_centers, epoch): true = self.padding_center(imgs, centers) true = make_grid(true, normalize= True) self.writer.add_image('true_images', true, epoch) pred = self.padding_center(imgs, pred_centers) pred = make_grid(pred, normalize= True) self.writer.add_image('pred_images', pred, epoch) def evaluate(self, dataloader, epoch, sample_p = 0.1): self.generator.eval() self.discriminator.eval() #dlabel = torch.FloatTensor(self.info['batch_size']) MSE_loss = 0 Adv_loss = 0 for i, (imgs, centers, dcm_labels) in enumerate(dataloader): batch_size = dcm_labels.clone().size(0) imgs, centers, dcm_labels = imgs.to(self.device), centers.to(self.device), dcm_labels.to(self.device) pred_centers = self.generator(imgs) if np.random.uniform(0,1) < sample_p: self.sample_images(imgs, centers, pred_centers, epoch) # Advasarial loss # Fake Image succesfully fool the discriminator dlabel = torch.FloatTensor(batch_size).fill_(1).to(self.device) disc_input = self.get_disc_input(imgs, pred_centers) output = self.discriminator(disc_input) pred_dlabel, _ = self._get_output_labels(output) lossAdv_Encoder = self.criteriaBCE(pred_dlabel, dlabel) # MSE Loss lossMSE_Encoder = self.criteriaMSE(pred_centers, centers) MSE_loss += lossMSE_Encoder.item() Adv_loss += lossAdv_Encoder.item() MSE_loss /= i Adv_loss /= i return 
MSE_loss, Adv_loss def train(self): self.generator = self.generator.to(self.device) self.discriminator = self.discriminator.to(self.device) epochs = self.info['epochs'] n_iter = 0 # Define the optimizer for the network optG = optim.Adam(self.generator.parameters(), lr = self.info['learning_rate']) optD = optim.Adam(self.discriminator.parameters(), lr = self.info['learning_rate']) for epoch in range(epochs): print('Starting epoch {}/{}.'.format(epoch + 1, epochs)) self.generator.train() self.discriminator.train() G_epoch_loss = 0 D_epoch_loss = 0 dlabel = torch.FloatTensor(self.info['batch_size']).to(self.device) for i, (imgs, centers, dcm_labels) in enumerate(self.train_loader): batch_size = imgs.size(0) imgs, centers, dcm_labels = imgs.to(self.device), centers.to(self.device), dcm_labels.to(self.device) # ----------------------- # Train Generator (Encoder) # ----------------------- optG.zero_grad() pred_centers = self.generator(imgs) # Advasarial loss #dlabel.data.resize_(batch_size).fill_(1) dlabel = torch.FloatTensor(batch_size).fill_(1).to(self.device) output = self.discriminator(pred_centers) pred_dlabel, _ = self._get_output_labels(output) lossAdv_Encoder = self.criteriaBCE(pred_dlabel, dlabel) # MSE Loss lossMSE_Encoder = self.criteriaMSE(pred_centers, centers) lossG = self.info['Generator_adv_loss'] * lossAdv_Encoder + self.info['Generator_mse_loss']* lossMSE_Encoder G_epoch_loss += lossG.item() # Write the loss to summary writer self.writer.add_scalar('Loss/train_ADV_G', lossAdv_Encoder.item(), n_iter) self.writer.add_scalar('Loss/train_MSE_G', lossMSE_Encoder.item(), n_iter) self.writer.add_scalar('Loss/train_G', lossG.item(), n_iter) lossG.backward() optG.step() # ----------------------- # Train Discriminator # ----------------------- # Discriminator - Train with real optD.zero_grad() #dlabel.data.resize_(batch_size).fill_(1) dlabel = torch.FloatTensor(batch_size).fill_(1).to(self.device) # Padding the original context as the input for discriminator if 
self.info['padding_center']: disc_input = self.padding_center(imgs, centers) else: disc_input = centers output = self.discriminator(disc_input) # Get the output labels for discriminator pred_dlabel, pred_dicom = self._get_output_labels(output) lossAdv_real = self.criteriaBCE(pred_dlabel, dlabel) if self.info['dcm_loss']: lossDCM = self.criteriaBCE(pred_dicom, dcm_labels.float()) else: lossDCM = torch.Tensor([0]).to(self.device) # Discriminator - Train with fake pred_centers = self.generator(imgs) #dlabel.data.resize_(batch_size).fill_(0) dlabel = torch.FloatTensor(batch_size).fill_(0).to(self.device) if self.info['padding_center']: disc_input = self.padding_center(imgs, pred_centers) else: disc_input = pred_centers output = self.discriminator(pred_centers) pred_dlabel = output[:, 0] lossAdv_fake = self.criteriaBCE(pred_dlabel, dlabel) lossD = self.info['Discriminator_adv_loss'] * (lossAdv_real + lossAdv_fake) + self.info['Discriminator_dcm_loss'] * lossDCM D_epoch_loss += lossD.item() self.writer.add_scalar('Loss/train_ADV_D', (lossAdv_real + lossAdv_fake).item(), n_iter) self.writer.add_scalar('Loss/train_DCM_D', lossDCM.item(), n_iter) self.writer.add_scalar('Loss/train_D', lossD.item(), n_iter) n_iter += 1 lossD.backward() optD.step() if i % 100 == 0: print('[%d/%d][%d/%d] Loss_D: %.4f Loss_G: %.4f' % (epoch, epochs, i, len(self.train_loader), lossD.item(), lossG.item())) D_epoch_loss /= i+1 G_epoch_loss /= i+1 self.results['G_training_loss'].append(D_epoch_loss) self.results['D_training_loss'].append(G_epoch_loss) print(f'Epoch finished ! 
D_Loss: {D_epoch_loss}, G_Loss: {G_epoch_loss}' ) # Validation with torch.set_grad_enabled(False): self.generator.eval() MSE_loss, Adv_loss = self.evaluate(self.val_loader, epoch = epoch) self.results['validation_mse_loss'].append(MSE_loss) self.results['validation_adv_loss'].append(Adv_loss) print('Validation MSE Loss: {}'.format(MSE_loss)) print('Validation Adv Loss: {}'.format(Adv_loss)) if MSE_loss + Adv_loss < self.results['best_loss']: self.results['best_loss'] = MSE_loss + Adv_loss self.results['best_MSE'] = MSE_loss self.results['best_epoch'] = epoch + 1 torch.save(self.generator.state_dict(), os.path.join(self.output_dir, "epoch{}.pth".format(epoch+1))) print("Best Validation MSE improved!") elif (epoch+1) % self.info['Sample_interval'] == 0: torch.save(self.generator.state_dict(), os.path.join(self.output_dir, "epoch{}.pth".format(epoch+1))) # Save the training dice score using the best weights #self.evaluate_train() def evaluate_train(self): weight_path = os.path.join(self.output_dir, "epoch{}.pth".format(self.results['best_epoch'])) self.network.load_state_dict(torch.load(weight_path)) with torch.set_grad_enabled(False): self.network.eval() train_loss, train_acc, train_precision, train_recall = eval_net(self.network, self.train_loader, self.criterion, self.device) self.results['train_accuracy'] = train_acc self.results['train_precision'] = train_precision self.results['train_recall'] = train_recall def plot_training(): pass def save_results(self): config = configparser.ConfigParser() config['INFO'] = self.info config['BEST RESULTS'] = {'val_mse': self.results['best_MSE'], 'best_epoch': self.results['best_epoch']} with open(os.path.join(self.output_dir, 'exp.ini'), 'w') as configfile: config.write(configfile) loss_history = pd.DataFrame({'generator_training_loss': self.results['G_training_loss'], 'discriminator_loss': self.results['D_training_loss']}) loss_history.to_csv(os.path.join(self.output_dir, 'loss_history.csv')) 
torch.save(self.generator.state_dict(), os.path.join(self.output_dir, "epoch_last.pth")) def load_weights(self, weight_path): self.network.load_state_dict(torch.load(weight_path))
def train_amortized(self, imgs, classes, model_dir, tensorboard_dir):
    """Second-stage training: fit amortized encoders to reproduce the codes
    learned by the latent model.

    Initializes the amortized model's modulation/generator from the trained
    latent model, then trains it to match both the reconstruction and the
    latent-model embeddings (content + class) via MSE. Requires
    ``self.latent_model`` to already be trained.
    """
    self.amortized_model = AmortizedModel(self.config)
    # Warm-start the shared decoder pieces from the latent model.
    self.amortized_model.modulation.load_state_dict(
        self.latent_model.modulation.state_dict())
    self.amortized_model.generator.load_state_dict(
        self.latent_model.generator.state_dict())

    # imgs is channel-last numpy; permute to NCHW for torch.
    data = dict(img=torch.from_numpy(imgs).permute(0, 3, 1, 2),
                img_id=torch.from_numpy(np.arange(imgs.shape[0])),
                class_id=torch.from_numpy(classes.astype(np.int64)))

    dataset = NamedTensorDataset(data)
    data_loader = DataLoader(dataset,
                             batch_size=self.config['train']['batch_size'],
                             shuffle=True,
                             sampler=None,
                             batch_sampler=None,
                             num_workers=1,
                             pin_memory=True,
                             drop_last=True)

    self.latent_model.to(self.device)
    self.amortized_model.to(self.device)

    reconstruction_criterion = VGGDistance(
        self.config['perceptual_loss']['layers']).to(self.device)
    embedding_criterion = nn.MSELoss()

    optimizer = Adam(
        params=self.amortized_model.parameters(),
        lr=self.config['train_encoders']['learning_rate']['max'],
        betas=(0.5, 0.999))

    # Cosine anneal over total steps (stepped once per batch below).
    scheduler = CosineAnnealingLR(
        optimizer,
        T_max=self.config['train_encoders']['n_epochs'] * len(data_loader),
        eta_min=self.config['train_encoders']['learning_rate']['min'])

    summary = SummaryWriter(log_dir=tensorboard_dir)

    train_loss = AverageMeter()
    for epoch in range(self.config['train_encoders']['n_epochs']):
        # Latent model only provides frozen targets; amortized model trains.
        self.latent_model.eval()
        self.amortized_model.train()
        train_loss.reset()

        pbar = tqdm(iterable=data_loader)
        for batch in pbar:
            batch = {
                name: tensor.to(self.device)
                for name, tensor in batch.items()
            }

            optimizer.zero_grad()

            # Targets: the embeddings the latent model learned per image/class.
            target_content_code = self.latent_model.content_embedding(
                batch['img_id'])
            target_class_code = self.latent_model.class_embedding(
                batch['class_id'])

            out = self.amortized_model(batch['img'])

            loss_reconstruction = reconstruction_criterion(
                out['img'], batch['img'])
            loss_content = embedding_criterion(out['content_code'],
                                               target_content_code)
            loss_class = embedding_criterion(out['class_code'],
                                             target_class_code)

            # Embedding-matching terms weighted 10x vs. reconstruction.
            loss = loss_reconstruction + 10 * loss_content + 10 * loss_class

            loss.backward()
            optimizer.step()
            scheduler.step()

            train_loss.update(loss.item())
            pbar.set_description_str('epoch #{}'.format(epoch))
            pbar.set_postfix(loss=train_loss.avg)

        pbar.close()
        self.save(model_dir, latent=False, amortized=True)

        # NOTE: these log the *last batch's* losses, not epoch averages.
        summary.add_scalar(tag='loss-amortized',
                           scalar_value=loss.item(),
                           global_step=epoch)
        summary.add_scalar(tag='rec-loss-amortized',
                           scalar_value=loss_reconstruction.item(),
                           global_step=epoch)
        summary.add_scalar(tag='content-loss-amortized',
                           scalar_value=loss_content.item(),
                           global_step=epoch)
        summary.add_scalar(tag='class-loss-amortized',
                           scalar_value=loss_class.item(),
                           global_step=epoch)

        fixed_sample_img = self.generate_samples_amortized(
            dataset, randomized=False)
        random_sample_img = self.generate_samples_amortized(
            dataset, randomized=True)

        summary.add_image(tag='sample-fixed-amortized',
                          img_tensor=fixed_sample_img,
                          global_step=epoch)
        summary.add_image(tag='sample-random-amortized',
                          img_tensor=random_sample_img,
                          global_step=epoch)

    summary.close()
# Fragment of a GAN training loop (net_discr, netG/gen_output_v, batch_v,
# labels, optimizers, iter_no, writer etc. are defined earlier in the file).
plt.show()

# --- Discriminator step: real batch scored against true labels, detached
# fakes against fake labels (detach keeps G out of the D backward pass).
dis_optimizer.zero_grad()
dis_output_true_v = net_discr(batch_v)
dis_output_fake_v = net_discr(gen_output_v.detach())
dis_loss = objective(dis_output_true_v, true_labels_v) + objective(
    dis_output_fake_v, fake_labels_v)
dis_loss.backward()
dis_optimizer.step()
dis_losses.append(dis_loss.item())

# --- Generator step: fakes should be classified as real.
gen_optimizer.zero_grad()
dis_output_v = net_discr(gen_output_v)
gen_loss_v = objective(dis_output_v, true_labels_v)
gen_loss_v.backward()
gen_optimizer.step()
gen_losses.append(gen_loss_v.item())

iter_no += 1
# Periodic scalar logging; the accumulators are flushed after each report.
if iter_no % REPORT_EVERY_ITER == 0:
    log.info("Iter %d: gen_loss=%.3e, dis_loss=%.3e", iter_no,
             np.mean(gen_losses), np.mean(dis_losses))
    writer.add_scalar("gen_loss", np.mean(gen_losses), iter_no)
    writer.add_scalar("dis_loss", np.mean(dis_losses), iter_no)
    gen_losses = []
    dis_losses = []
# Periodic image logging: up to 64 fake and real samples as grids.
if iter_no % SAVE_IMAGE_EVERY_ITER == 0:
    writer.add_image("fake",
                     vutils.make_grid(gen_output_v.data[:64]), iter_no)
    writer.add_image("real",
                     vutils.make_grid(batch_v.data[:64]), iter_no)
class UNetModel:
    '''Wrapper class for different Unet models to facilitate training, validation, logging etc.
    Args:
        exp_config: Experiment configuration file as given in the experiment folder
    '''

    def __init__(self, exp_config, logger=None, tensorboard=True):
        # Build the network from the experiment config; the config supplies the
        # model class itself plus all architecture hyperparameters.
        self.net = exp_config.model(input_channels=exp_config.input_channels,
                                    num_classes=exp_config.n_classes,
                                    num_filters=exp_config.filter_channels,
                                    latent_levels=exp_config.latent_levels,
                                    no_convs_fcomb=exp_config.no_convs_fcomb,
                                    beta=exp_config.beta,
                                    image_size=exp_config.image_size,
                                    reversible=exp_config.use_reversible
                                    )
        self.exp_config = exp_config
        self.batch_size = exp_config.batch_size
        self.logger = logger
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.net.to(self.device)
        self.optimizer = torch.optim.Adam(self.net.parameters(), lr=1e-3, weight_decay=1e-5)
        # patience=50000 effectively disables LR reduction for most runs.
        self.scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
            self.optimizer, 'min', min_lr=1e-4, verbose=True, patience=50000)
        if exp_config.pretrained_model is not None:
            self.logger.info('Loading pretrained model {}'.format(exp_config.pretrained_model))
            model_path = os.path.join(sys_config.project_root, 'models', exp_config.pretrained_model)
            model_name = self.exp_config.experiment_name + '_' + exp_config.pretrained_model + '.pth'
            log_dir = os.path.join(sys_config.log_root, exp_config.log_dir_name, exp_config.experiment_name)
            save_model_path = os.path.join(log_dir, model_name)
            # NOTE(review): existence is checked on model_path but the weights are
            # loaded from save_model_path — these are different files; confirm intent.
            if os.path.exists(model_path):
                self.net.load_state_dict(torch.load(save_model_path))
            else:
                self.logger.info('The file {} does not exist. Starting training without pretrained net.'
                                 .format(save_model_path))
        # Running accumulators updated during train(); reset at each logging step.
        self.mean_loss_of_epoch = 0
        self.tot_loss = 0
        self.kl_loss = 0
        self.reconstruction_loss = 0
        self.dice_mean = 0
        self.val_loss = 0
        # Last-validation metrics (read by _create_tensorboard_summary()).
        self.foreground_dice = 0
        self.val_recon_loss = 0
        self.val_elbo = 0
        self.val_kl_loss = 0
        self.avg_dice = 0
        self.avg_ged = -1
        self.avg_ncc = -1
        # Best-so-far trackers used for model-selection checkpoints.
        self.best_dice = -1
        self.best_loss = np.inf
        self.best_ged = np.inf
        self.best_ncc = -1
        if tensorboard:
            self.training_writer = SummaryWriter()
            self.validation_writer = SummaryWriter(comment='_validation')
        self.iteration = 0

    def train(self, data):
        """Run the main training loop over exp_config.iterations batches.

        Periodically validates (validation_frequency) and logs (logging_frequency).
        """
        self.net.train()
        self.logger.info('Starting training.')
        self.logger.info('Current filters: {}'.format(self.exp_config.filter_channels))
        self.logger.info('Batch size: {}'.format(self.batch_size))
        # The loop variable is deliberately the attribute self.iteration so that
        # validate()/summary code can read the current step.
        for self.iteration in range(1, self.exp_config.iterations):
            x_b, s_b = data.train.next_batch(self.batch_size)
            patch = torch.tensor(x_b, dtype=torch.float32).to(self.device)
            mask = torch.tensor(s_b, dtype=torch.float32).to(self.device)
            mask = torch.unsqueeze(mask, 1)
            # Kept on self for the tensorboard image summary.
            self.mask = mask
            self.patch = patch
            self.net.forward(patch, mask, training=True)
            self.loss = self.net.loss(mask)
            # Accumulate components for averaged tensorboard scalars.
            self.tot_loss += self.loss
            self.reconstruction_loss += self.net.reconstruction_loss
            self.kl_loss += self.net.kl_divergence_loss
            self.optimizer.zero_grad()
            self.loss.backward()
            self.optimizer.step()
            if self.iteration % self.exp_config.validation_frequency == 0:
                self.validate(data)
            if self.iteration % self.exp_config.logging_frequency == 0:
                self.logger.info('Iteration {} Loss {}'.format(self.iteration, self.loss))
                #self._create_tensorboard_summary()
                # Reset accumulators for the next logging window.
                self.tot_loss = 0
                self.kl_loss = 0
                self.reconstruction_loss = 0
            # NOTE(review): stepping the plateau scheduler on the per-batch training
            # loss (every iteration) is noisy; usually it is stepped on a validation
            # metric — confirm this placement is intended.
            self.scheduler.step(self.loss)
        self.logger.info('Finished training.')

    def validate(self, data):
        """Evaluate on the validation split: Dice, GED, NCC and ELBO components.

        Checkpoints 'validation_ckpt' every call and saves best_* checkpoints
        whenever a tracked metric improves. Restores net.train() on exit.
        """
        self.net.eval()
        with torch.no_grad():
            self.logger.info('Validation for step {}'.format(self.iteration))
            self.logger.info('Checkpointing model.')
            self.save_model('validation_ckpt')
            if self.device == torch.device('cuda'):
                allocated_memory = torch.cuda.max_memory_allocated(self.device)
                # NOTE(review): '{}{}' concatenates the two values with no separator.
                self.logger.info('Memory allocated in current iteration: {}{}'.format(allocated_memory, self.iteration))
            ged_list = []
            dice_list = []
            ncc_list = []
            elbo_list = []
            kl_list = []
            recon_list = []
            time_ = time.time()
            validation_set_size = data.validation.images.shape[0]\
                if self.exp_config.num_validation_images == 'all' else self.exp_config.num_validation_images
            for ii in range(validation_set_size):
                s_gt_arr = data.validation.labels[ii, ...]
                # from HW to NCHW
                x_b = data.validation.images[ii, ...]
                patch = torch.tensor(x_b, dtype=torch.float32).to(self.device)
                val_patch = patch.unsqueeze(dim=0).unsqueeze(dim=1)
                # Pick one random annotator's mask as the conditioning ground truth.
                s_b = s_gt_arr[:, :, np.random.choice(self.exp_config.annotator_range)]
                mask = torch.tensor(s_b, dtype=torch.float32).to(self.device)
                val_mask = mask.unsqueeze(dim=0).unsqueeze(dim=1)
                val_masks = torch.tensor(s_gt_arr, dtype=torch.float32).to(self.device)  # HWC
                val_masks = val_masks.transpose(0, 2).transpose(1, 2)  # CHW
                # Repeat the single image validation_samples times to draw that many
                # samples from the model in one batched forward pass.
                patch_arrangement = val_patch.repeat((self.exp_config.validation_samples, 1, 1, 1))
                mask_arrangement = val_mask.repeat((self.exp_config.validation_samples, 1, 1, 1))
                self.mask = mask_arrangement
                self.patch = patch_arrangement
                # training=True for constructing posterior as well
                s_out_eval_list = self.net.forward(patch_arrangement, mask_arrangement, training=False)
                s_prediction_softmax_arrangement = self.net.accumulate_output(s_out_eval_list, use_softmax=True)
                # sample N times
                self.val_loss = self.net.loss(mask_arrangement)
                elbo = self.val_loss
                kl = self.net.kl_divergence_loss
                recon = self.net.reconstruction_loss
                s_prediction_softmax_mean = torch.mean(s_prediction_softmax_arrangement, axis=0)
                s_prediction_arrangement = torch.argmax(s_prediction_softmax_arrangement, dim=1)
                ground_truth_arrangement = val_masks  # nlabels, H, W
                ged = utils.generalised_energy_distance(s_prediction_arrangement, ground_truth_arrangement,
                                                        nlabels=self.exp_config.n_classes - 1,
                                                        label_range=range(1, self.exp_config.n_classes))
                # num_gts, nlabels, H, W
                s_gt_arr_r = val_masks.unsqueeze(dim=1)
                ground_truth_arrangement_one_hot = utils.convert_batch_to_onehot(s_gt_arr_r,
                                                                                 nlabels=self.exp_config.n_classes)
                ncc = utils.variance_ncc_dist(s_prediction_softmax_arrangement, ground_truth_arrangement_one_hot)
                s_ = torch.argmax(s_prediction_softmax_mean, dim=0)  # HW
                s = val_mask.view(val_mask.shape[-2], val_mask.shape[-1])  # HW
                # Write losses to list
                per_lbl_dice = []
                for lbl in range(self.exp_config.n_classes):
                    binary_pred = (s_ == lbl) * 1
                    binary_gt = (s == lbl) * 1
                    # Dice convention: both empty -> 1.0; exactly one empty -> 0.0.
                    if torch.sum(binary_gt) == 0 and torch.sum(binary_pred) == 0:
                        per_lbl_dice.append(1.0)
                    elif torch.sum(binary_pred) > 0 and torch.sum(binary_gt) == 0 or torch.sum(binary_pred) == 0 and torch.sum(
                            binary_gt) > 0:
                        per_lbl_dice.append(0.0)
                    else:
                        per_lbl_dice.append(dc(binary_pred.detach().cpu().numpy(), binary_gt.detach().cpu().numpy()))
                dice_list.append(per_lbl_dice)
                elbo_list.append(elbo)
                kl_list.append(kl)
                recon_list.append(recon)
                ged_list.append(ged)
                ncc_list.append(ncc)
            dice_tensor = torch.tensor(dice_list)
            per_structure_dice = dice_tensor.mean(dim=0)
            elbo_tensor = torch.tensor(elbo_list)
            kl_tensor = torch.tensor(kl_list)
            recon_tensor = torch.tensor(recon_list)
            ged_tensor = torch.tensor(ged_list)
            ncc_tensor = torch.tensor(ncc_list)
            self.avg_dice = torch.mean(dice_tensor)
            # [1] picks the foreground structure's mean dice.
            self.foreground_dice = torch.mean(dice_tensor, dim=0)[1]
            self.val_elbo = torch.mean(elbo_tensor)
            self.val_recon_loss = torch.mean(recon_tensor)
            self.val_kl_loss = torch.mean(kl_tensor)
            self.avg_ged = torch.mean(ged_tensor)
            self.avg_ncc = torch.mean(ncc_tensor)
            self.logger.info(' - Foreground dice: %.4f' % torch.mean(self.foreground_dice))
            self.logger.info(' - Mean (neg.) ELBO: %.4f' % self.val_elbo)
            self.logger.info(' - Mean GED: %.4f' % self.avg_ged)
            self.logger.info(' - Mean NCC: %.4f' % self.avg_ncc)
            # Checkpoint whenever a tracked metric reaches a new best.
            if torch.mean(per_structure_dice) >= self.best_dice:
                self.best_dice = torch.mean(per_structure_dice)
                self.logger.info('New best validation Dice! (%.3f)' % self.best_dice)
                self.save_model(savename='best_dice')
            if self.val_elbo <= self.best_loss:
                self.best_loss = self.val_elbo
                self.logger.info('New best validation loss! (%.3f)' % self.best_loss)
                self.save_model(savename='best_loss')
            if self.avg_ged <= self.best_ged:
                self.best_ged = self.avg_ged
                self.logger.info('New best GED score! (%.3f)' % self.best_ged)
                self.save_model(savename='best_ged')
            if self.avg_ncc >= self.best_ncc:
                self.best_ncc = self.avg_ncc
                self.logger.info('New best NCC score! (%.3f)' % self.best_ncc)
                self.save_model(savename='best_ncc')
            self.logger.info('Validation took {} seconds'.format(time.time()-time_))
        self.net.train()

    def train_brats(self, trainDataLoader):
        """Epoch-based training over a BraTS-style DataLoader.

        NOTE(review): this loop looks incomplete/broken — it never calls
        optimizer.zero_grad() or optimizer.step(), so backward() accumulates
        gradients without ever applying them; `self.loss` is assigned a tensor
        in train() but is called here like a function; and `outputs` is deleted
        before loss.backward() runs. Confirm before using.
        """
        epoch = 1
        while epoch < 100:
            # set net up training
            self.net.train()
            for i, data in enumerate(trainDataLoader):
                # load data
                inputs, pid, labels = data
                inputs, labels = inputs.to(self.device), labels.to(self.device)
                # forward and backward pass
                outputs = self.net.forward(inputs, labels)
                loss = self.loss(outputs, labels)
                print('Current loss at iteration {} : {}'.format(i, loss))
                del inputs, outputs, labels
                loss.backward()
            epoch = epoch + 1

    def _create_tensorboard_summary(self, end_of_epoch=False):
        """Write averaged training scalars, last-validation scalars and a
        patch/GT/sample image grid to the tensorboard writers.
        (Currently commented out at its call site in train().)
        """
        self.net.eval()
        with torch.no_grad():
            # calculate the means since the last validation
            self.training_writer.add_scalar('Mean_loss', self.tot_loss/self.exp_config.validation_frequency,
                                            global_step=self.iteration)
            self.training_writer.add_scalar('KL_Divergence_loss', self.kl_loss/self.exp_config.validation_frequency,
                                            global_step=self.iteration)
            self.training_writer.add_scalar('Reconstruction_loss',
                                            self.reconstruction_loss/self.exp_config.validation_frequency,
                                            global_step=self.iteration)
            self.validation_writer.add_scalar('Dice_score_of_last_validation', self.foreground_dice,
                                              global_step=self.iteration)
            self.validation_writer.add_scalar('GED_score_of_last_validation', self.avg_ged,
                                              global_step=self.iteration)
            self.validation_writer.add_scalar('NCC_score_of_last_validation', self.avg_ncc,
                                              global_step=self.iteration)
            self.validation_writer.add_scalar('Mean_loss', self.val_elbo, global_step=self.iteration)
            self.validation_writer.add_scalar('KL_Divergence_loss', self.val_kl_loss, global_step=self.iteration)
            self.validation_writer.add_scalar('Reconstruction_loss', self.val_recon_loss, global_step=self.iteration)
            # plot images of current patch for summary
            sample = torch.softmax(self.net.sample(), dim=1)
            sample1 = torch.chunk(sample, 2, dim=1)[self.exp_config.n_classes-1]
            self.training_writer.add_image('Patch/GT/Sample',
                                           torch.cat([self.patch,
                                                      self.mask.view(-1, 1, self.exp_config.image_size[1],
                                                                     self.exp_config.image_size[2]),
                                                      sample1], dim=2),
                                           global_step=self.iteration, dataformats='NCHW')
            if self.device == torch.device('cuda'):
                allocated_memory = torch.cuda.max_memory_allocated(self.device)
                # NOTE(review): '{}{}' concatenates the two values with no separator.
                self.logger.info('Memory allocated in current iteration: {}{}'.format(allocated_memory, self.iteration))
                self.training_writer.add_scalar('Max_memory_allocated', allocated_memory, self.iteration)
        self.net.train()

    def test(self, data, sys_config):
        """Evaluate the best_loss checkpoint on the test split 10 times
        (10 samples each) and save GED/NCC tensors as .npz files.
        """
        self.net.eval()
        with torch.no_grad():
            model_selection = self.exp_config.experiment_name + '_best_loss.pth'
            self.logger.info('Testing {}'.format(model_selection))
            self.logger.info('Loading pretrained model {}'.format(model_selection))
            model_path = os.path.join(
                sys_config.log_root, self.exp_config.log_dir_name, self.exp_config.experiment_name, model_selection)
            if os.path.exists(model_path):
                self.net.load_state_dict(torch.load(model_path))
            else:
                self.logger.info('The file {} does not exist. Aborting test function.'.format(model_path))
                return
            # NOTE(review): these lists are never cleared between the 10 outer
            # iterations, so later iterations average over all prior results.
            ged_list = []
            dice_list = []
            ncc_list = []
            time_ = time.time()
            end_dice = 0.0
            end_ged = 0.0
            end_ncc = 0.0
            for i in range(10):
                self.logger.info('Doing iteration {}'.format(i))
                n_samples = 10
                for ii in range(data.test.images.shape[0]):
                    s_gt_arr = data.test.labels[ii, ...]
                    # from HW to NCHW
                    x_b = data.test.images[ii, ...]
                    patch = torch.tensor(x_b, dtype=torch.float32).to(self.device)
                    val_patch = patch.unsqueeze(dim=0).unsqueeze(dim=1)
                    # One random annotator's mask as conditioning ground truth.
                    s_b = s_gt_arr[:, :, np.random.choice(self.exp_config.annotator_range)]
                    mask = torch.tensor(s_b, dtype=torch.float32).to(self.device)
                    val_mask = mask.unsqueeze(dim=0).unsqueeze(dim=1)
                    val_masks = torch.tensor(s_gt_arr, dtype=torch.float32).to(self.device)  # HWC
                    val_masks = val_masks.transpose(0, 2).transpose(1, 2)  # CHW
                    patch_arrangement = val_patch.repeat((n_samples, 1, 1, 1))
                    mask_arrangement = val_mask.repeat((n_samples, 1, 1, 1))
                    self.mask = mask_arrangement
                    self.patch = patch_arrangement
                    # training=True for constructing posterior as well
                    s_out_eval_list = self.net.forward(patch_arrangement, mask_arrangement, training=False)
                    s_prediction_softmax_arrangement = self.net.accumulate_output(s_out_eval_list, use_softmax=True)
                    s_prediction_softmax_mean = torch.mean(s_prediction_softmax_arrangement, axis=0)
                    s_prediction_arrangement = torch.argmax(s_prediction_softmax_arrangement, dim=1)
                    ground_truth_arrangement = val_masks  # nlabels, H, W
                    ged = utils.generalised_energy_distance(s_prediction_arrangement, ground_truth_arrangement,
                                                            nlabels=self.exp_config.n_classes - 1,
                                                            label_range=range(1, self.exp_config.n_classes))
                    # num_gts, nlabels, H, W
                    s_gt_arr_r = val_masks.unsqueeze(dim=1)
                    ground_truth_arrangement_one_hot = utils.convert_batch_to_onehot(s_gt_arr_r,
                                                                                     nlabels=self.exp_config.n_classes)
                    ncc = utils.variance_ncc_dist(s_prediction_softmax_arrangement, ground_truth_arrangement_one_hot)
                    s_ = torch.argmax(s_prediction_softmax_mean, dim=0)  # HW
                    s = val_mask.view(val_mask.shape[-2], val_mask.shape[-1])  # HW
                    # Write losses to list
                    per_lbl_dice = []
                    for lbl in range(self.exp_config.n_classes):
                        binary_pred = (s_ == lbl) * 1
                        binary_gt = (s == lbl) * 1
                        # Same Dice edge-case convention as validate().
                        if torch.sum(binary_gt) == 0 and torch.sum(binary_pred) == 0:
                            per_lbl_dice.append(1.0)
                        elif torch.sum(binary_pred) > 0 and torch.sum(binary_gt) == 0 or torch.sum(
                                binary_pred) == 0 and torch.sum(
                                binary_gt) > 0:
                            per_lbl_dice.append(0.0)
                        else:
                            per_lbl_dice.append(dc(binary_pred.detach().cpu().numpy(),
                                                   binary_gt.detach().cpu().numpy()))
                    dice_list.append(per_lbl_dice)
                    ged_list.append(ged)
                    ncc_list.append(ncc)
                    if ii % 100 == 0:
                        self.logger.info(' - Mean GED: %.4f' % torch.mean(torch.tensor(ged_list)))
                        self.logger.info(' - Mean NCC: %.4f' % torch.mean(torch.tensor(ncc_list)))
                dice_tensor = torch.tensor(dice_list)
                per_structure_dice = dice_tensor.mean(dim=0)
                ged_tensor = torch.tensor(ged_list)
                ncc_tensor = torch.tensor(ncc_list)
                model_path = os.path.join(
                    sys_config.log_root, self.exp_config.log_dir_name, self.exp_config.experiment_name)
                np.savez(os.path.join(model_path, 'ged%s_%s_2.npz' % (str(n_samples), model_selection)),
                         ged_tensor.numpy())
                np.savez(os.path.join(model_path, 'ncc%s_%s_2.npz' % (str(n_samples), model_selection)),
                         ncc_tensor.numpy())
                self.avg_dice = torch.mean(dice_tensor)
                self.foreground_dice = torch.mean(dice_tensor, dim=0)[1]
                self.avg_ged = torch.mean(ged_tensor)
                self.avg_ncc = torch.mean(ncc_tensor)
                logging.info('-- GED: --')
                logging.info(torch.mean(ged_tensor))
                logging.info(torch.std(ged_tensor))
                logging.info('-- NCC: --')
                logging.info(torch.mean(ncc_tensor))
                logging.info(torch.std(ncc_tensor))
                self.logger.info(' - Foreground dice: %.4f' % torch.mean(self.foreground_dice))
                # NOTE(review): self.val_elbo is the last *validation* ELBO, not a
                # test-set quantity — this log line is likely misleading here.
                self.logger.info(' - Mean (neg.) ELBO: %.4f' % self.val_elbo)
                self.logger.info(' - Mean GED: %.4f' % self.avg_ged)
                self.logger.info(' - Mean NCC: %.4f' % self.avg_ncc)
                self.logger.info('Testing took {} seconds'.format(time.time() - time_))
                end_dice += self.avg_dice
                end_ged += self.avg_ged
                end_ncc += self.avg_ncc
            self.logger.info('Mean dice: {}'.format(end_dice/10))
            self.logger.info('Mean ged: {}'.format(end_ged / 10))
            self.logger.info('Mean ncc: {}'.format(end_ncc / 10))

    def generate_images(self, data, sys_config):
        """Generate and save sample segmentations for test images 31..99.

        NOTE(review): the checkpoint-loading branch is commented out, so this
        runs with whatever weights the net currently holds.
        """
        self.net.eval()
        with torch.no_grad():
            model_selection = self.exp_config.experiment_name + '_best_dice.pth'
            self.logger.info('Generating samples {}'.format(model_selection))
            self.logger.info('Loading pretrained model {}'.format(model_selection))
            model_path = os.path.join(
                sys_config.log_root, self.exp_config.log_dir_name, self.exp_config.experiment_name, model_selection)
            image_path = os.path.join(
                sys_config.log_root, self.exp_config.log_dir_name, self.exp_config.experiment_name, )
            # if os.path.exists(model_path):
            #     self.net.load_state_dict(torch.load(model_path))
            # else:
            #     self.logger.info('The file {} does not exist. Aborting test function.'.format(model_path))
            #     return
            n_samples = 10
            for ii in range(31, 100):
                s_gt_arr = data.test.labels[ii, ...]
                # from HW to NCHW
                x_b = data.test.images[ii, ...]
                patch = torch.tensor(x_b, dtype=torch.float32).to(self.device)
                val_patch = patch.unsqueeze(dim=0).unsqueeze(dim=1)
                s_b = s_gt_arr[:, :, np.random.choice(self.exp_config.annotator_range)]
                mask = torch.tensor(s_b, dtype=torch.float32).to(self.device)
                val_mask = mask.unsqueeze(dim=0).unsqueeze(dim=1)
                val_masks = torch.tensor(s_gt_arr, dtype=torch.float32).to(self.device)  # HWC
                val_masks = val_masks.transpose(0, 2).transpose(1, 2)  # CHW
                patch_arrangement = val_patch.repeat((n_samples, 1, 1, 1))
                mask_arrangement = val_mask.repeat((n_samples, 1, 1, 1))
                self.mask = mask_arrangement
                self.patch = patch_arrangement
                # training=True for constructing posterior as well
                s_out_eval_list = self.net.forward(patch_arrangement, mask_arrangement, training=False)
                s_prediction_softmax_arrangement = self.net.accumulate_output(s_out_eval_list, use_softmax=True)
                s_ = torch.argmax(s_prediction_softmax_arrangement, dim=1)
                self.logger.info('s_.shape{}'.format(s_.shape))
                # NOTE(review): 's_'.format(s_) has no placeholder — this logs the
                # literal string 's_' and discards the tensor.
                self.logger.info('s_'.format(s_))
                self.save_images(image_path, patch, val_masks, s_, ii)

    def save_images(self, save_location, image, ground_truth_labels, sample, iteration):
        """Save the input image, each annotator's ground-truth mask, and 10
        sampled segmentations as PNGs under save_location.
        """
        from torchvision.utils import save_image
        save_image(image, os.path.join(save_location, '{}image.png'.format(iteration)),
                   pad_value=1, scale_each=True, normalize=True)
        for i in range(self.exp_config.num_labels_per_subject):
            save_image(ground_truth_labels[i].float(),
                       os.path.join(save_location, '{}mask{}.png'.format(iteration, i)),
                       pad_value=1, scale_each=True, normalize=True)
        for i in range(10):
            save_image(sample[i].float(),
                       os.path.join(save_location, '{}sample{}.png'.format(iteration, i)),
                       pad_value=1, scale_each=True, normalize=True)

    def save_model(self, savename):
        """Save the net's state_dict as <experiment_name>_<savename>.pth in the log dir."""
        model_name = self.exp_config.experiment_name + '_' + savename + '.pth'
        # NOTE(review): uses module-level globals exp_config/sys_config rather than
        # self.exp_config — this raises NameError unless those globals exist; other
        # methods consistently use self.exp_config. Confirm and fix upstream.
        log_dir = os.path.join(sys_config.log_root, exp_config.log_dir_name, exp_config.experiment_name)
        save_model_path = os.path.join(log_dir, model_name)
        torch.save(self.net.state_dict(), save_model_path)
        self.logger.info('saved model to .pth file in {}'.format(save_model_path))