# Example 1
class GeneticAlgorithm():
    """Mutation-only genetic algorithm over flat weight vectors.

    Each individual is a 1-D float tensor of length ``len(weights)``.
    Fitness evaluation is farmed out to a ``multiprocessing.Pool``; every
    time the best-ever fitness improves, the winning individual (weights,
    morphogens, hidden states, development renders) is checkpointed under
    ``settings['run_directory']``.
    """

    def __init__(self, weights, fitness_function, population_size, sigma, num_threads, folder, settings):
        # `weights` is only used for its length: the genome dimensionality.
        self.weight_shape = len(weights)
        self.fitness_function = fitness_function
        self.pop_size = population_size
        self.sigma = sigma  # mutation std-dev used when producing offspring
        self.num_threads = num_threads
        self.folder = folder
        self.settings = settings
        # NOTE(review): this rebinds the *local* parameter `sigma`, not
        # `self.sigma`, so it has no effect — presumably meant to be
        # `self.sigma = settings['sigma']`; confirm which sigma is intended.
        sigma = settings['sigma']

        np.random.seed(self.settings['seed'])

        # Run name encodes the seed (and recurrence flag) for the log dir.
        filename="run_"+str(self.settings['seed'])
        if self.settings['recurrent']:
            filename+="_recurrent"

        if LOG:
            self.writer = SummaryWriter(folder+"/"+filename+"/"+settings['target'])


    def run(self, generations, print_step):
        """Evolve the population for ``generations`` epochs.

        Args:
            generations: number of generations to run.
            print_step: unused in this method; kept for interface
                compatibility with callers.
        """

        population = []
        fitness_log =[]
        fitness_all = []
        plot_log = []
        gen=[]
        maxgenfit=[]
        meanfit = []
        maxfit = []

        # Number of top individuals that survive unchanged (elitism) and
        # serve as parents for the mutated offspring.
        N = self.settings['N']

        elitism = N

        if self.settings['data_read']:
            # Resume from a previous run's checkpoints stored under ./run1.
            population = torch.load('run1/model_stateFiles/weights_gen_1.pt')
            morphogens = np.load('run1/model_stateFiles/morphogens_gen_1.npy')
            hidden_states_batched_A = torch.load('run1/model_stateFiles/hidden_states_batched_A_gen_0.pt')
            hidden_states_batched_B = torch.load('run1/model_stateFiles/hidden_states_batched_B_gen_0.pt')

            # Extract every numeric literal from the saved fitness log text.
            pattern=r'([+-]?\d+(?:\.\d+)?(?:[eE][+-]?\d+)?)'
            f = open('run1/fitnessFiles/fitness_log_gen_1.txt')
            fitness = f.read()
            f.close()
            fitness=re.findall(pattern,fitness)
            # NOTE(review): in Python 3 `map` returns an iterator, so
            # `np.array(fitness)` builds a 0-d object array and the
            # `fitness[::-4]` slice below will fail — this line would need
            # to be `fitness = list(map(float, fitness))` to work.
            fitness = map(float, fitness)
            fitness = np.array(fitness)
            # Every 4th entry from the end — presumably the `max_fitness`
            # field of the (epoch, maxgen, mean, max) tuples in the log;
            # TODO confirm against the log format written further below.
            max_fitness = max(fitness[::-4])

        else:
            # Fresh population: small Gaussian weight vectors.
            for i in range(self.pop_size):
                w = torch.from_numpy( np.random.normal(0, 1, self.weight_shape)*0.1*self.settings['initial_noise']).float()
                population.append( w )
            max_fitness = -20

        pool = multiprocessing.Pool(self.num_threads)

        for epoch in range(0, generations):
            # Evaluate the whole population in parallel. `it` is an
            # (index, individual) pair from enumerate(); each result row is
            # a tuple unpacked column-by-column below.
            if self.settings['data_read']:
                results = pool.map(self.fitness_function, [ (it, epoch, self.settings, morphogens[it[0]], hidden_states_batched_A[it[0]], hidden_states_batched_B[it[0]], max_fitness) for it in enumerate(population) ])
            else:
                results = pool.map(self.fitness_function, [ (it, epoch, self.settings) for it in enumerate(population) ])

            # Unpack result columns (row[3] is intentionally unused here).
            fitness = [row[0] for row in results]
            sim = [row[1] for row in results]
            env = [row[2] for row in results]
            individual_id = [row[4] for row in results]
            morphogens = [row[5] for row in results]
            hidden_states_batched_A = [row[6] for row in results]
            hidden_states_batched_B = [row[7] for row in results]
            dev_states = [row[8] for row in results]
            alpha = [row[9] for row in results]
            cutedgemorphogens = [row[10] for row in results]
            out = [row[11] for row in results]
            past_hidden_states_batched_A = [row[12] for row in results]
            past_hidden_states_batched_B = [row[13] for row in results]

            fitness_all.append(fitness)
            # Individual indices sorted by descending fitness.
            sort_idx = np.argsort([-f for f in fitness])
            max_gen_f = np.max(fitness)

            # New all-time best: checkpoint the champion and render its
            # developmental states.
            if (max_gen_f>max_fitness):
                max_fitness = max_gen_f

                write_voxelyze_file_fitness(sim[sort_idx[0]], env[sort_idx[0]], epoch, individual_id[sort_idx[0]], cutedgemorphogens[sort_idx[0]], max_fitness, self.settings['im_size'], self.settings['run_directory'], self.settings['run_name'])
                torch.save(population[sort_idx[0]], "{0}/bestofFiles/weights_gen_{1}_{2}.pt".format(self.settings['run_directory'],epoch,sort_idx[0]))
                np.save('{0}/bestofFiles/morphogens_gen_{1}_id_{2}'.format(self.settings['run_directory'],epoch, sort_idx[0]), morphogens[sort_idx[0]])

                torch.save(past_hidden_states_batched_A[sort_idx[0]],'{0}/bestofFiles/past_hidden_states_batched_A_gen_{1}_id_{2}.pt'.format(self.settings['run_directory'],epoch, sort_idx[0]))
                torch.save(past_hidden_states_batched_B[sort_idx[0]],'{0}/bestofFiles/past_hidden_states_batched_B_gen_{1}_id_{2}.pt'.format(self.settings['run_directory'],epoch, sort_idx[0]))

                # Reshape the flattened per-individual development states to
                # (pop, steps, x, y, z) voxel grids. `len(dev_states[1])`
                # presumably is the number of development steps — TODO
                # confirm why index 1 (not 0) is used as the reference.
                dev_states = np.asarray(dev_states)
                alpha = np.asarray(alpha)
                dev_states = dev_states.reshape(len(population),len(dev_states[1]),self.settings['im_size'],self.settings['im_size'],self.settings['im_size'])
                alpha = alpha.reshape(len(population),len(alpha[1]),self.settings['im_size'],self.settings['im_size'],self.settings['im_size'])
                np.save('{0}/bestofFiles/dev_states_gen_{1}_id_{2}'.format(self.settings['run_directory'],epoch, sort_idx[0]), dev_states[sort_idx[0]])
                np.save('{0}/bestofFiles/alpha_gen_{1}_id_{2}'.format(self.settings['run_directory'],epoch, sort_idx[0]), alpha[sort_idx[0]])

                # Render one 3-D voxel subplot per development step; alpha
                # channel comes from the per-voxel `alpha` states.
                mynorm = plt.Normalize(vmin=0, vmax=1)
                fig = plt.figure(figsize=(20,10))
                for it in range(0,len(dev_states[1])):
                    voxels = dev_states[sort_idx[0]][it]
                    voxels = voxels.transpose((2,1,0))
                    alpha_temp = alpha[sort_idx[0]][it]
                    alpha_temp = alpha_temp.transpose((2,1,0))

                    ax = fig.add_subplot(2, 5, it+1, projection= '3d')
                    # Material palette indexed by integer voxel type.
                    col = [[1, 1, 1], [0, 1, 1], [0, 0, 1], [1, 0, 0],[0, 1, 0]]
        
                    face_col = np.concatenate( (np.array(col)[voxels.astype(int)], np.expand_dims(alpha_temp, axis=3) ) , axis=3)
                    ax.set_aspect(aspect=1)
                    ax.voxels( voxels, facecolors=face_col,edgecolor='k')

                plt.savefig('{0}/bestofFiles/gen{1}_id{2}.pdf'.format(self.settings['run_directory'],epoch, sort_idx[0]))
                plt.close()

                if LOG:
                    # NOTE(review): `m` is never defined in this method —
                    # this raises NameError whenever LOG is set and a new
                    # best is found; it looks like a leftover from the
                    # commented-out `m = self.fitness_function(...)` call.
                    self.writer.add_image("Image", m.transpose(2, 0, 1), epoch)
        
            # Book-keeping for the fitness-over-generations plot.
            fitness_log.append((epoch, max_gen_f, np.mean(fitness), max_fitness))
            gen.append (fitness_log[epoch][0])
            maxgenfit.append(fitness_log[epoch][1])
            meanfit.append (fitness_log[epoch][2])
            maxfit.append (fitness_log[epoch][3])
            # Periodic progress plot + full-population checkpoint.
            if epoch % self.settings['fig_output_rate'] ==0:
                fig = plt.figure()
                ax = fig.add_subplot(1, 1, 1) 
                ax.plot(gen, meanfit, linestyle='-', color='r', label='Mean')
                ax.plot(gen, maxfit, linestyle='dotted', color = 'b', label='Max fitness')
                ax.set_xlabel('Generations')
                ax.set_ylabel('Fitness')
                ax.legend(loc='best')
                plt.savefig('{0}/epoch{1}.pdf'.format(self.settings['run_directory'],epoch))

                torch.save(population, "{0}/model_stateFiles/weights_gen_{1}.pt".format(self.settings['run_directory'],epoch))

                # Fitness logs are dumped as stringified Python literals.
                str_ = str(fitness_log)
                str_1 = str(fitness_all)
                with open("{0}/fitnessFiles/fitness_log_gen_{1}.txt".format(self.settings['run_directory'], epoch), 'wt') as f:
                    f.write(str_)
                with open("{0}/fitnessFiles/fitness_all_gen_{1}.txt".format(self.settings['run_directory'], epoch), 'wt') as g:
                    g.write(str_1)

                # Same reshape as in the best-of branch; if a new best was
                # found this epoch these are already ndarrays/reshaped.
                dev_states = np.asarray(dev_states)   
                alpha = np.asarray(alpha)
                dev_states = dev_states.reshape(len(population),len(dev_states[1]),self.settings['im_size'],self.settings['im_size'],self.settings['im_size'])

                alpha = alpha.reshape(len(population),len(alpha[1]),self.settings['im_size'],self.settings['im_size'],self.settings['im_size'])

            # Next generation: pop_size - elitism mutated offspring of
            # random top-N parents, plus the top-N elites unchanged.
            new_pop = []
            for idx in range(self.pop_size-elitism):

                # Select a parent uniformly from the top N individuals.
                i = np.random.randint(0, N )
                p = population[ sort_idx[i]]

                new_ind = p + torch.from_numpy( np.random.normal(0, 1, self.weight_shape) * self.sigma).float()

                new_pop.append(new_ind)

            for idx in sort_idx[:elitism]:
                new_pop.append(population[idx] )
            population = new_pop
# Example 2
def train_manipulator(model, data_loaders, args):
    """Train an emotion EBM with a contrastive-divergence-style loss.

    Positive samples are real images; negatives come from (a) dataset
    negatives and (b) Langevin-updated randomly-masked images. Energies
    are regularized with an L2 term. Checkpoints are written every epoch,
    both to ``args.classifier_ckpnt`` and to a per-epoch file.

    Args:
        model: energy model; must expose ``disable_batchnorm()``.
        data_loaders (dict): 'train' and 'test' loaders yielding
            (images, _, emotions, neg_images) tuples.
        args: namespace with device, lr, wd, epochs, classifier_ckpnt,
            checkpoint, langevin_steps, langevin_step_size, ebm_log_fps.

    Returns:
        The trained model (immediately, if the checkpoint says training
        already finished).
    """
    device = args.device
    optimizer = Adam(model.parameters(), lr=args.lr, weight_decay=args.wd)
    model, optimizer, _, start_epoch, is_trained = load_from_ckpnt(
        args.classifier_ckpnt, model, optimizer, scheduler=None
    )
    if is_trained:
        return model
    writer = SummaryWriter('runs/' + args.checkpoint.replace('.pt', ''))

    # Training loop
    steps_per_epoch = len(data_loaders['train'])
    for epoch in range(start_epoch, args.epochs):
        print("Epoch: %d/%d" % (epoch + 1, args.epochs))
        kbar = pkbar.Kbar(target=steps_per_epoch, width=25)
        model.train()
        model.disable_batchnorm()
        model.zero_grad()
        for step, ex in enumerate(data_loaders['train']):
            images, _, emotions, neg_images = ex
            global_step = epoch * steps_per_epoch + step
            # Positive samples: real images.
            pos_samples = images.to(device)
            # Negative sample seeds: randomly masked copies of the images.
            neg_samples, neg_masks = rand_mask(images.clone().to(device), device)
            # Refine the masked negatives with Langevin dynamics; neg_list
            # keeps the intermediate states for video logging.
            neg_ld_samples, neg_list = langevin_updates(
                model, torch.clone(neg_samples),
                args.langevin_steps, args.langevin_step_size,
                neg_masks
            )
            # Compute energies.
            pos_out = model(pos_samples)
            neg_img_out = model(neg_images.to(device))
            neg_ld_out = model(neg_ld_samples.to(device))
            # L2 term keeps energy magnitudes bounded; the ML term lowers
            # positive energy relative to both kinds of negatives.
            loss_reg = (pos_out**2 + neg_ld_out**2 + neg_img_out**2).mean()
            loss_ml = 2*pos_out.mean() - neg_ld_out.mean() - neg_img_out.mean()
            loss = 0.5*loss_reg + loss_ml
            # Optimization step with gradient clipping.
            optimizer.zero_grad()
            loss.backward()
            clip_grad(model.parameters(), optimizer)
            optimizer.step()
            kbar.update(step, [("loss", loss)])
            # Log scalar losses/energies every step.
            writer.add_scalar('energy/energy_pos', pos_out.mean().item(), global_step)
            writer.add_scalar('energy/energy_neg', neg_ld_out.mean().item(), global_step)
            writer.add_scalar('loss/loss_reg', loss_reg.item(), global_step)
            writer.add_scalar('loss/loss_ml', loss_ml.item(), global_step)
            writer.add_scalar('loss/loss_total', loss.item(), global_step)
            # Log image/video evolution only every 50 steps.
            if step % 50 != 0:
                continue
            writer.add_image(
                'random_image_sample',
                back2color(unnormalize_imagenet_rgb(pos_samples[0], device)),
                global_step
            )
            neg_list = [
                back2color(unnormalize_imagenet_rgb(neg, device))
                for neg in neg_list
            ]
            # Prepend a black frame so the video start is visible.
            neg_list = [torch.zeros_like(neg_list[0])] + neg_list
            vid_to_write = torch.stack(neg_list, dim=0).unsqueeze(0)
            writer.add_video(
                'ebm_evolution', vid_to_write, fps=args.ebm_log_fps,
                global_step=global_step
            )
        writer.add_scalar(
            'lr', optimizer.state_dict()['param_groups'][0]['lr'], epoch
        )
        # Save checkpoint: rolling (resume) + per-epoch snapshot.
        torch.save(
            {
                "epoch": epoch + 1,
                "model_state_dict": model.state_dict(),
                "optimizer_state_dict": optimizer.state_dict()
            },
            args.classifier_ckpnt
        )
        torch.save(
            {
                "epoch": epoch + 1,
                "model_state_dict": model.state_dict(),
                "optimizer_state_dict": optimizer.state_dict()
            },
            "manipulator_%02d.pt" % (epoch+1)
        )
        print('\nValidation')
        print(eval_manipulator(model, data_loaders['test'], args))
    return model
# Example 3
class Writer():
    """Experiment output writer for metrics, images, videos and tables.

    Everything is written below ``output_folder``; when ``use_tb`` is
    set, scalars/images/videos/histograms are mirrored to TensorBoard.
    The ``@selector`` decorator (defined elsewhere) gates the ``add_*``
    methods based on ``periods`` and the current metadata.

    Args:
        output_folder (str): root directory for all outputs.
        periods: period configuration consumed by ``@selector``.
        flush_period (int): flush metric traces to parquet once this
            many values have accumulated in a scope.
        use_tb (bool): also log to TensorBoard when True.
        **meta: initial metadata attached to every record ('_step' must
            be present before step-dependent methods are used).
    """
    def __init__(self,
                 output_folder,
                 periods,
                 flush_period,
                 use_tb=False,
                 **meta):
        self.meta = meta
        ensure_dir(output_folder)
        # Row buffers for add_metrics, keyed by table name.
        self.metric_rows = {}
        self.meta_rows = {}
        self.metrics_folder = f"{output_folder}/metrics"
        if use_tb:
            self.tensorboard_writer = SummaryWriter(
                log_dir=f'{output_folder}/tensorboard')
        else:
            self.tensorboard_writer = None
        self.image_folder = f"{output_folder}/images"
        self.env_folder = f"{output_folder}/envs"
        self.video_folder = f"{output_folder}/videos"
        self.model_folder = f"{output_folder}/models"
        self.df_folder = f"{output_folder}/df"
        ensure_dir(self.df_folder)
        ensure_dir(self.metrics_folder)
        ensure_dir(self.env_folder)
        self.periods = periods
        self.frames = {}  # name -> list of frame tensors (add_frame)
        self.traces = {}  # scope -> columnar metric buffers (add_metrics2)
        self.flush_idx = 0
        self.flush_period = flush_period

    def add_meta(self, **meta):
        """Merge ``meta`` into the writer-wide metadata (new keys win)."""
        self.meta = {**self.meta, **meta}

    @selector
    def check_on(self):
        """Return True when the selector lets this period through."""
        return True

    @property
    def step(self):
        """Current global step, taken from the '_step' metadata key."""
        return self.meta['_step']

    @selector
    def add_table(self, **kwargs):
        """Write a DataFrame as CSV (see ``_write_table``); API may change."""
        self._write_table(**kwargs)

    @selector
    def add_env(self, env):
        """Dump ``env.to_dict()`` as JSON, named by mode and episode."""
        filename = os.path.join(
            self.env_folder,
            f"{self.meta['mode']}.{self.meta['episode']}.json")
        with open(filename, 'w') as outfile:
            json.dump(env.to_dict(), outfile)

    @selector
    def add_metrics2(self, scope, metrics):
        """Buffer per-step metric tensors for ``scope``; auto-flush.

        ``metrics`` maps name -> tensor; values are moved to CPU numpy
        and stored column-wise together with episode/step/mode metadata.
        """
        if scope not in self.traces:
            self.traces[scope] = {
                'values': [],
                'episode': [],
                'episode_step': [],
                'mode': [],
                'name': []
            }
        metrics = {k: v.cpu().numpy() for k, v in metrics.items()}
        for k, v in metrics.items():
            self.traces[scope]['episode'].append(self.meta['episode'])
            self.traces[scope]['episode_step'].append(
                self.meta['episode_step'])
            self.traces[scope]['mode'].append(self.meta['mode'])
            self.traces[scope]['name'].append(k)
            self.traces[scope]['values'].append(v)

        if len(self.traces[scope]['values']) > self.flush_period:
            self.metrics2_flush()

    def metrics2_flush(self):
        """Write all buffered traces to parquet and clear the buffers."""
        for scope_name, traces in self.traces.items():
            values = traces.pop('values')
            index = pd.MultiIndex.from_frame(pd.DataFrame(traces))
            if values[0].size > 1:
                # Vector-valued metrics: one column per agent.
                columns = pd.Series(
                    [f'agent_{i}' for i in range(len(values[0]))],
                    name='agents')
                df = pd.DataFrame(data=values, index=index, columns=columns)
            else:
                df = pd.DataFrame(data=values, index=index, columns=['value'])
            metrics_file = os.path.join(
                self.metrics_folder, f"{scope_name}.{self.flush_idx}.parquet")
            df.to_parquet(metrics_file)
        self.traces = {}

        self.flush_idx += 1

    @selector
    def add_metrics(self, name, metrics, meta, tf=[]):
        """Buffer one metrics row under ``name``; mirror keys in ``tf``
        to TensorBoard scalars."""
        if self.tensorboard_writer:
            for n in tf:
                self.tensorboard_writer.add_scalar(n, metrics[n], self.step)
        meta = {**self.meta, **meta}
        if name in self.meta_rows:
            assert name in self.metric_rows
            self.meta_rows[name].append(parse_dict(meta))
            self.metric_rows[name].append(parse_dict(metrics))
        else:
            self.meta_rows[name] = [parse_dict(meta)]
            self.metric_rows[name] = [parse_dict(metrics)]

    @selector
    def add_image(self, name, image):
        """Write an image to disk (and TensorBoard, when enabled)."""
        name = name.format(**self.meta)
        if self.tensorboard_writer:
            self.tensorboard_writer.add_image(name, image, self.step)
        self._write_image(name, image)

    @selector
    def add_video(self, name, video):
        """Write a (1, T, C, H, W) video batch to disk / TensorBoard."""
        name = name.format(**self.meta)
        if self.tensorboard_writer:
            self.tensorboard_writer.add_video(name, video, self.step, fps=1)
        assert video.shape[0] == 1, 'Multiple videos are not yet supported'
        self._write_video(name, video[0], fps=1)

    @selector
    def add_frame(self, name, callback, flush=False):
        """Append ``callback()``'s frame to the ``name`` buffer; when
        ``flush`` is True, turn all buffered frames into videos."""
        name = name.format(**self.meta)
        if name not in self.frames:
            self.frames[name] = [callback()]
        else:
            self.frames[name].append(callback())
        if flush:
            self.frames_flush()

    def frames_flush(self):
        """Concatenate buffered frames per name into videos and emit them."""
        for name, frames in self.frames.items():
            video = th.cat(frames, dim=1)
            self.add_video(name, video)
        self.frames = {}

    def _write_image(self, name, array):
        """Write the single image in ``array`` (batch of 1) as PNG."""
        ensure_dir(self.image_folder)
        file_name = os.path.join(self.image_folder, f'{name}.{self.step}.png')
        assert array.shape[0] == 1, 'Multiple images are not yet supported'
        imageio.imwrite(file_name, array[0].detach().numpy())

    def _write_video(self, name, array, fps=1):
        """Write a (T, C, H, W) tensor as an mp4 at ``fps`` frames/sec."""
        ensure_dir(self.video_folder)
        file_name = os.path.join(self.video_folder, f'{name}.{self.step}.mp4')
        # (T, C, H, W) -> (T, H, W, C) for ImageSequenceClip.
        array_np = array.transpose(1, 3).transpose(1, 2).cpu().numpy()
        # Fix: honor the `fps` parameter instead of hard-coding 1.
        clip = ImageSequenceClip([f for f in array_np], fps=fps)
        clip.write_videofile(file_name)

    def _write_table(self, df, name, sheet):
        """Write ``df`` to ``<df_folder>/<name>.<sheet>.csv``."""
        df.to_csv(f"{self.df_folder}/{name}.{sheet}.csv")

    def flush(self):
        """Flush all buffered rows and metric traces to disk."""
        self.rows_flush()
        self.metrics2_flush()
        # NOTE(review): metrics2_flush() already advances flush_idx, so a
        # full flush bumps it twice; file names stay unique, but confirm
        # the double increment is intended.
        self.flush_idx += 1

    def __del__(self):
        self.flush()

    def rows_flush(self):
        """Write each buffered add_metrics table as a parquet file."""
        names = self.metric_rows.keys()
        for n in names:
            df_metrics = pd.DataFrame.from_records(self.metric_rows[n])
            df_meta = pd.DataFrame.from_records(
                self.meta_rows[n]).astype('category')
            df = pd.concat([df_meta, df_metrics], axis=1)
            metrics_file = os.path.join(self.metrics_folder, f"{n}.parquet")
            df.to_parquet(metrics_file)

    @selector
    def write_module(self, name, module):
        """Log histograms of ``module``'s parameters to TensorBoard."""
        name = name.format(**self.meta)
        if self.tensorboard_writer:
            for p_name, values in module.named_parameters():
                self.tensorboard_writer.add_histogram(f'{name}.{p_name}',
                                                      values, self.step)

    def set_details(self, details):
        """Attach free-form run details (stored, not written anywhere here)."""
        self.details = details
# Example 4
        t = t.reshape(-1, 12 * 4 * 4)
        t = self.fc1(t)
        t = F.relu(t)

        # (5) hidden linear layer
        t = self.fc2(t)
        t = F.relu(t)

        # (6) output layer
        t = self.out(t)
        #t = F.softmax(t, dim=1)

        return t


# Disable autograd globally: this snippet only runs a forward pass to log
# a sample grid and the model graph to TensorBoard — no training happens.
torch.set_grad_enabled(False)
network = Network()
train_loader = torch.utils.data.DataLoader(train_set,
                                           batch_size=1000,
                                           shuffle=True)
# NOTE(review): the optimizer is created but never stepped in this
# snippet (and grads are disabled above) — presumably a leftover from a
# training version of this script.
optimizer = optim.Adam(network.parameters(), lr=0.01)

# Take one batch and tile it into a single image grid for TensorBoard.
images, labels = next(iter(train_loader))
grid = torchvision.utils.make_grid(images)

tb = SummaryWriter()
tb.add_image('images', grid)
# Trace the network on a sample batch so TensorBoard can show the graph.
tb.add_graph(network, images)

tb.close()
# Example 5
class TRAModel(Model):
    """
    TRA Model

    Args:
        model_config (dict): model config (will be used by RNN or Transformer)
        tra_config (dict): TRA config (will be used by TRA)
        model_type (str): which backbone model to use (RNN/Transformer)
        lr (float): learning rate
        n_epochs (int): number of total epochs
        early_stop (int): early stop when performance not improved at this step
        update_freq (int): gradient update frequency
        max_steps_per_epoch (int): maximum number of steps in one epoch
        lamb (float): regularization parameter
        rho (float): exponential decay rate for `lamb`
        alpha (float): fusion parameter for calculating transport loss matrix
        seed (int): random seed
        logdir (str): local log directory
        eval_train (bool): whether evaluate train set between epochs
        eval_test (bool): whether evaluate test set between epochs
        pretrain (bool): whether pretrain the backbone model before training TRA.
            Note that only TRA will be optimized after pretraining
        init_state (str): model init state path
        freeze_model (bool): whether freeze backbone model parameters
        freeze_predictors (bool): whether freeze predictors parameters
        transport_method (str): transport method, can be none/router/oracle
        memory_mode (str): memory mode, the same argument for MTSDatasetH
    """
    def __init__(
        self,
        model_config,
        tra_config,
        model_type="RNN",
        lr=1e-3,
        n_epochs=500,
        early_stop=50,
        update_freq=1,
        max_steps_per_epoch=None,
        lamb=0.0,
        rho=0.99,
        alpha=1.0,
        seed=None,
        logdir=None,
        eval_train=False,
        eval_test=False,
        pretrain=False,
        init_state=None,
        reset_router=False,
        freeze_model=False,
        freeze_predictors=False,
        transport_method="none",
        memory_mode="sample",
    ):
        """Validate the configuration, store hyper-parameters, set up
        logging/seeding, and build the model (see the class docstring
        for argument semantics)."""
        self.logger = get_module_logger("TRA")

        # Fail fast on incompatible configuration combinations.
        assert memory_mode in ["sample", "daily"], "invalid memory mode"
        assert transport_method in [
            "none", "router", "oracle"
        ], f"invalid transport method {transport_method}"
        assert transport_method == "none" or tra_config[
            "num_states"] > 1, "optimal transport requires `num_states` > 1"
        assert (memory_mode != "daily" or tra_config["src_info"]
                == "TPE"), "daily transport can only support TPE as `src_info`"

        if transport_method == "router" and not eval_train:
            self.logger.warning(
                "`eval_train` will be ignored when using TRA.router")

        if seed is not None:
            np.random.seed(seed)
            torch.manual_seed(seed)

        self.model_config = model_config
        self.tra_config = tra_config
        self.model_type = model_type
        self.lr = lr
        self.n_epochs = n_epochs
        self.early_stop = early_stop
        self.update_freq = update_freq
        self.max_steps_per_epoch = max_steps_per_epoch
        self.lamb = lamb
        self.rho = rho
        self.alpha = alpha
        self.seed = seed
        self.logdir = logdir
        self.eval_train = eval_train
        self.eval_test = eval_test
        self.pretrain = pretrain
        self.init_state = init_state
        self.reset_router = reset_router
        self.freeze_model = freeze_model
        self.freeze_predictors = freeze_predictors
        self.transport_method = transport_method
        self.use_daily_transport = memory_mode == "daily"
        self.transport_fn = transport_daily if self.use_daily_transport else transport_sample

        self._writer = None
        if self.logdir is not None:
            # Fix: warn only when the directory actually has content —
            # the original warned on mere existence, even though an
            # existing (empty) dir is fine for makedirs(exist_ok=True).
            if os.path.isdir(self.logdir) and os.listdir(self.logdir):
                self.logger.warning(f"logdir {self.logdir} is not empty")
            os.makedirs(self.logdir, exist_ok=True)
            # SummaryWriter may be None when tensorboard is not installed.
            if SummaryWriter is not None:
                self._writer = SummaryWriter(log_dir=self.logdir)

        self._init_model()

    def _init_model(self):
        """Instantiate the backbone and TRA heads, optionally restore
        weights, apply freezing options, and build the optimizer."""

        self.logger.info("init TRAModel...")

        # NOTE(review): `eval` on `model_type` executes arbitrary code if
        # the config string is attacker-controlled; acceptable only while
        # model_type comes from trusted config (expected: "RNN" or
        # "Transformer" per the class docstring).
        self.model = eval(self.model_type)(**self.model_config).to(device)
        print(self.model)

        self.tra = TRA(self.model.output_size, **self.tra_config).to(device)
        print(self.tra)

        if self.init_state:
            # Restore backbone strictly, TRA leniently (architecture of
            # the heads may differ from the checkpoint).
            self.logger.warning(f"load state dict from `init_state`")
            state_dict = torch.load(self.init_state, map_location="cpu")
            self.model.load_state_dict(state_dict["model"])
            res = load_state_dict_unsafe(self.tra, state_dict["tra"])
            self.logger.warning(str(res))

        if self.reset_router:
            # Re-initialize only the routing layers, keeping predictors.
            self.logger.warning(f"reset TRA.router parameters")
            self.tra.fc.reset_parameters()
            self.tra.router.reset_parameters()

        if self.freeze_model:
            self.logger.warning(f"freeze model parameters")
            for param in self.model.parameters():
                param.requires_grad_(False)

        if self.freeze_predictors:
            self.logger.warning(f"freeze TRA.predictors parameters")
            for param in self.tra.predictors.parameters():
                param.requires_grad_(False)

        self.logger.info("# model params: %d" % sum(
            [p.numel() for p in self.model.parameters() if p.requires_grad]))
        self.logger.info(
            "# tra params: %d" %
            sum([p.numel() for p in self.tra.parameters() if p.requires_grad]))

        # One optimizer over both modules; frozen params have
        # requires_grad=False and receive no updates.
        self.optimizer = optim.Adam(list(self.model.parameters()) +
                                    list(self.tra.parameters()),
                                    lr=self.lr)

        self.fitted = False
        self.global_step = -1

    def train_epoch(self, epoch, data_set, is_pretrain=False):
        """Run one training epoch over ``data_set``.

        Args:
            epoch (int): current epoch index (0-based).
            data_set: MTSDatasetH-like iterable of batches with keys
                data/state/label/daily_count and (daily_)index; also
                provides assign_data/restore_daily_index.
            is_pretrain (bool): pretraining mode — forces oracle
                transport and disables TensorBoard logging and the
                `lamb` regularizer.

        Returns:
            float: average loss over the processed steps.
        """

        self.model.train()
        self.tra.train()
        data_set.train()
        self.optimizer.zero_grad()

        P_all = []
        prob_all = []
        choice_all = []
        max_steps = len(data_set)
        if self.max_steps_per_epoch is not None:
            if epoch == 0 and self.max_steps_per_epoch < max_steps:
                self.logger.info(
                    f"max steps updated from {max_steps} to {self.max_steps_per_epoch}"
                )
            max_steps = min(self.max_steps_per_epoch, max_steps)

        cur_step = 0
        total_loss = 0
        total_count = 0
        for batch in tqdm(data_set, total=max_steps):
            cur_step += 1
            if cur_step > max_steps:
                break

            if not is_pretrain:
                self.global_step += 1

            data, state, label, count = batch["data"], batch["state"], batch[
                "label"], batch["daily_count"]
            index = batch[
                "daily_index"] if self.use_daily_transport else batch["index"]

            # Skip backbone gradients entirely when the model is frozen.
            with torch.set_grad_enabled(not self.freeze_model):
                hidden = self.model(data)

            all_preds, choice, prob = self.tra(hidden, state)

            if is_pretrain or self.transport_method != "none":
                # NOTE: use oracle transport for pre-training
                loss, pred, L, P = self.transport_fn(
                    all_preds,
                    label,
                    choice,
                    prob,
                    state.mean(dim=1),
                    count,
                    self.transport_method if not is_pretrain else "oracle",
                    self.alpha,
                    training=True,
                )
                data_set.assign_data(index, L)  # save loss to memory
                if self.use_daily_transport:  # only save for daily transport
                    P_all.append(
                        pd.DataFrame(P.detach().cpu().numpy(), index=index))
                    prob_all.append(
                        pd.DataFrame(prob.detach().cpu().numpy(), index=index))
                    choice_all.append(
                        pd.DataFrame(choice.detach().cpu().numpy(),
                                     index=index))
                decay = self.rho**(self.global_step // 100
                                   )  # decay every 100 steps
                lamb = 0 if is_pretrain else self.lamb * decay
                reg = prob.log().mul(P).sum(
                    dim=1).mean()  # train router to predict OT assignment
                if self._writer is not None and not is_pretrain:
                    self._writer.add_scalar("training/router_loss",
                                            -reg.item(), self.global_step)
                    self._writer.add_scalar("training/reg_loss", loss.item(),
                                            self.global_step)
                    self._writer.add_scalar("training/lamb", lamb,
                                            self.global_step)
                    if not self.use_daily_transport:
                        # Track assignment imbalance across states.
                        P_mean = P.mean(axis=0).detach()
                        self._writer.add_scalar("training/P",
                                                P_mean.max() / P_mean.min(),
                                                self.global_step)
                loss = loss - lamb * reg
            else:
                # No transport: average the per-state predictions.
                pred = all_preds.mean(dim=1)
                loss = loss_fn(pred, label)

            # Gradient accumulation over `update_freq` steps.
            (loss / self.update_freq).backward()
            if cur_step % self.update_freq == 0:
                self.optimizer.step()
                self.optimizer.zero_grad()

            if self._writer is not None and not is_pretrain:
                self._writer.add_scalar("training/total_loss", loss.item(),
                                        self.global_step)

            total_loss += loss.item()
            total_count += 1

        if self.use_daily_transport and len(P_all):
            P_all = pd.concat(P_all, axis=0)
            prob_all = pd.concat(prob_all, axis=0)
            choice_all = pd.concat(choice_all, axis=0)
            P_all.index = data_set.restore_daily_index(P_all.index)
            prob_all.index = P_all.index
            choice_all.index = P_all.index
            # Fix: guard on self._writer like every other logging site —
            # the original dereferenced self._writer unconditionally here,
            # raising AttributeError when logdir is None.
            if self._writer is not None and not is_pretrain:
                self._writer.add_image("P",
                                       plot(P_all),
                                       epoch,
                                       dataformats="HWC")
                self._writer.add_image("prob",
                                       plot(prob_all),
                                       epoch,
                                       dataformats="HWC")
                self._writer.add_image("choice",
                                       plot(choice_all),
                                       epoch,
                                       dataformats="HWC")

        # NOTE(review): raises ZeroDivisionError if the dataset is empty
        # (total_count == 0); upstream callers appear to guarantee at
        # least one batch — confirm.
        total_loss /= total_count

        if self._writer is not None and not is_pretrain:
            self._writer.add_scalar("training/loss", total_loss, epoch)

        return total_loss

    def test_epoch(self,
                   epoch,
                   data_set,
                   return_pred=False,
                   prefix="test",
                   is_pretrain=False):
        """Evaluate the backbone + TRA head on `data_set` and refresh its loss memory.

        Args:
            epoch: epoch index for TensorBoard scalars; pass -1 to skip the
                per-epoch scalar logging (guarded by `epoch >= 0` below).
            data_set: iterable of batches with keys "data", "state", "label",
                "daily_count" and "index"/"daily_index".
            return_pred: if True, also collect per-sample predictions, router
                probabilities and transport matrices as DataFrames.
            prefix: tag prefix for TensorBoard scalars (e.g. "train"/"valid").
            is_pretrain: if True, force the "oracle" transport assignment and
                skip scalar logging.

        Returns:
            (metrics, preds, probs, P_all): `metrics` is a dict with
            MSE/MAE/IC/ICIR; the other three are DataFrames when
            `return_pred` is True, otherwise the (possibly empty) lists.
        """
        # switch every component to inference mode
        self.model.eval()
        self.tra.eval()
        data_set.eval()

        preds = []
        probs = []
        P_all = []
        metrics = []
        for batch in tqdm(data_set):
            data, state, label, count = batch["data"], batch["state"], batch[
                "label"], batch["daily_count"]
            # daily transport groups samples per day, hence the daily index
            index = batch[
                "daily_index"] if self.use_daily_transport else batch["index"]

            with torch.no_grad():
                hidden = self.model(data)
                all_preds, choice, prob = self.tra(hidden, state)

            if is_pretrain or self.transport_method != "none":
                # pretraining always evaluates with the "oracle" assignment
                loss, pred, L, P = self.transport_fn(
                    all_preds,
                    label,
                    choice,
                    prob,
                    state.mean(dim=1),
                    count,
                    self.transport_method if not is_pretrain else "oracle",
                    self.alpha,
                    training=False,
                )
                data_set.assign_data(index, L)  # save loss to memory
                if P is not None and return_pred:
                    P_all.append(pd.DataFrame(P.cpu().numpy(), index=index))
            else:
                pred = all_preds.mean(dim=1)

            # columns: aggregated score, label, then one column per predictor head
            X = np.c_[pred.cpu().numpy(),
                      label.cpu().numpy(),
                      all_preds.cpu().numpy()]
            columns = ["score", "label"
                       ] + ["score_%d" % d for d in range(all_preds.shape[1])]
            pred = pd.DataFrame(X, index=batch["index"], columns=columns)

            metrics.append(evaluate(pred))

            if return_pred:
                preds.append(pred)
                if prob is not None:
                    columns = [
                        "prob_%d" % d for d in range(all_preds.shape[1])
                    ]
                    probs.append(
                        pd.DataFrame(prob.cpu().numpy(),
                                     index=index,
                                     columns=columns))

        # aggregate per-batch metrics; ICIR = mean(IC) / std(IC)
        metrics = pd.DataFrame(metrics)
        metrics = {
            "MSE": metrics.MSE.mean(),
            "MAE": metrics.MAE.mean(),
            "IC": metrics.IC.mean(),
            "ICIR": metrics.IC.mean() / metrics.IC.std(),
        }

        if self._writer is not None and epoch >= 0 and not is_pretrain:
            for key, value in metrics.items():
                self._writer.add_scalar(prefix + "/" + key, value, epoch)

        if return_pred:
            # map positional indices back to the dataset's original MultiIndex
            preds = pd.concat(preds, axis=0)
            preds.index = data_set.restore_index(preds.index)
            preds.index = preds.index.swaplevel()
            preds.sort_index(inplace=True)

            if probs:
                probs = pd.concat(probs, axis=0)
                if self.use_daily_transport:
                    probs.index = data_set.restore_daily_index(probs.index)
                else:
                    probs.index = data_set.restore_index(probs.index)
                    probs.index = probs.index.swaplevel()
                    probs.sort_index(inplace=True)

            if len(P_all):
                P_all = pd.concat(P_all, axis=0)
                if self.use_daily_transport:
                    P_all.index = data_set.restore_daily_index(P_all.index)
                else:
                    P_all.index = data_set.restore_index(P_all.index)
                    P_all.index = P_all.index.swaplevel()
                    P_all.sort_index(inplace=True)

        return metrics, preds, probs, P_all

    def _fit(self,
             train_set,
             valid_set,
             test_set,
             evals_result,
             is_pretrain=True):
        """Run the epoch loop with early stopping on validation IC.

        Args:
            train_set / valid_set / test_set: prepared dataset segments.
            evals_result: dict with "train"/"valid"/"test" lists that are
                appended to in-place with per-epoch metric dicts.
            is_pretrain: whether this is the pretraining phase (affects
                memory initialization and which transport method is used).

        Returns:
            The best validation IC observed over all epochs.
        """
        best_score = -1  # IC is in [-1, 1], so -1 is a safe lower bound
        best_epoch = 0
        stop_rounds = 0
        # snapshot initial weights so we can always restore something
        best_params = {
            "model": copy.deepcopy(self.model.state_dict()),
            "tra": copy.deepcopy(self.tra.state_dict()),
        }
        # train
        if not is_pretrain and self.transport_method != "none":
            # populate the shared loss memory before the first training epoch
            self.logger.info("init memory...")
            self.test_epoch(-1, train_set)

        for epoch in range(self.n_epochs):
            self.logger.info("Epoch %d:", epoch)

            self.logger.info("training...")
            self.train_epoch(epoch, train_set, is_pretrain=is_pretrain)

            self.logger.info("evaluating...")
            # NOTE: during evaluating, the whole memory will be refreshed
            if not is_pretrain and (self.transport_method == "router"
                                    or self.eval_train):
                train_set.clear_memory()  # NOTE: clear the shared memory
                train_metrics = self.test_epoch(epoch,
                                                train_set,
                                                is_pretrain=is_pretrain,
                                                prefix="train")[0]
                evals_result["train"].append(train_metrics)
                self.logger.info("train metrics: %s" % train_metrics)

            valid_metrics = self.test_epoch(epoch,
                                            valid_set,
                                            is_pretrain=is_pretrain,
                                            prefix="valid")[0]
            evals_result["valid"].append(valid_metrics)
            self.logger.info("valid metrics: %s" % valid_metrics)

            if self.eval_test:
                test_metrics = self.test_epoch(epoch,
                                               test_set,
                                               is_pretrain=is_pretrain,
                                               prefix="test")[0]
                evals_result["test"].append(test_metrics)
                self.logger.info("test metrics: %s" % test_metrics)

            # model selection: keep the weights with the best validation IC
            if valid_metrics["IC"] > best_score:
                best_score = valid_metrics["IC"]
                stop_rounds = 0
                best_epoch = epoch
                best_params = {
                    "model": copy.deepcopy(self.model.state_dict()),
                    "tra": copy.deepcopy(self.tra.state_dict()),
                }
                if self.logdir is not None:
                    torch.save(best_params, self.logdir + "/model.bin")
            else:
                stop_rounds += 1
                if stop_rounds >= self.early_stop:
                    self.logger.info("early stop @ %s" % epoch)
                    break

        self.logger.info("best score: %.6lf @ %d" % (best_score, best_epoch))
        # restore the best checkpoint before returning
        self.model.load_state_dict(best_params["model"])
        self.tra.load_state_dict(best_params["tra"])

        return best_score

    def fit(self, dataset, evals_result=None):
        """Optionally pretrain, then train the model and dump artifacts.

        Args:
            dataset: an `MTSDatasetH` providing "train"/"valid"/"test".
            evals_result: optional dict, filled in-place with per-epoch
                train/valid/test metric lists. Defaults to a fresh dict.

        Side effects:
            Sets `self.fitted`; when `self.logdir` is set, writes logs.csv,
            model.bin, prediction/probability/transport pickles and info.json.
        """
        # BUG FIX: the previous signature used a mutable default
        # (`evals_result=dict()`), which is shared across calls; create a
        # fresh dict per call instead (call-compatible for all callers).
        if evals_result is None:
            evals_result = {}

        assert isinstance(
            dataset, MTSDatasetH
        ), "TRAModel only supports `qlib.contrib.data.dataset.MTSDatasetH`"

        train_set, valid_set, test_set = dataset.prepare(
            ["train", "valid", "test"])

        self.fitted = True
        self.global_step = -1

        evals_result["train"] = []
        evals_result["valid"] = []
        evals_result["test"] = []

        if self.pretrain:
            self.logger.info("pretraining...")
            # pretraining only updates the backbone and the TRA predictors
            self.optimizer = optim.Adam(list(self.model.parameters()) +
                                        list(self.tra.predictors.parameters()),
                                        lr=self.lr)
            self._fit(train_set,
                      valid_set,
                      test_set,
                      evals_result,
                      is_pretrain=True)

            # reset optimizer: main training also updates the router
            self.optimizer = optim.Adam(list(self.model.parameters()) +
                                        list(self.tra.parameters()),
                                        lr=self.lr)

        self.logger.info("training...")
        best_score = self._fit(train_set,
                               valid_set,
                               test_set,
                               evals_result,
                               is_pretrain=False)

        self.logger.info("inference")
        train_metrics, train_preds, train_probs, train_P = self.test_epoch(
            -1, train_set, return_pred=True)
        self.logger.info("train metrics: %s" % train_metrics)

        valid_metrics, valid_preds, valid_probs, valid_P = self.test_epoch(
            -1, valid_set, return_pred=True)
        self.logger.info("valid metrics: %s" % valid_metrics)

        test_metrics, test_preds, test_probs, test_P = self.test_epoch(
            -1, test_set, return_pred=True)
        self.logger.info("test metrics: %s" % test_metrics)

        if self.logdir:
            self.logger.info("save model & pred to local directory")

            pd.concat(
                {
                    name: pd.DataFrame(evals_result[name])
                    for name in evals_result
                },
                axis=1).to_csv(self.logdir + "/logs.csv", index=False)

            torch.save(
                {
                    "model": self.model.state_dict(),
                    "tra": self.tra.state_dict()
                }, self.logdir + "/model.bin")

            train_preds.to_pickle(self.logdir + "/train_pred.pkl")
            valid_preds.to_pickle(self.logdir + "/valid_pred.pkl")
            test_preds.to_pickle(self.logdir + "/test_pred.pkl")

            # probs/P are empty lists when the router/transport is disabled
            if len(train_probs):
                train_probs.to_pickle(self.logdir + "/train_prob.pkl")
                valid_probs.to_pickle(self.logdir + "/valid_prob.pkl")
                test_probs.to_pickle(self.logdir + "/test_prob.pkl")

            if len(train_P):
                train_P.to_pickle(self.logdir + "/train_P.pkl")
                valid_P.to_pickle(self.logdir + "/valid_P.pkl")
                test_P.to_pickle(self.logdir + "/test_P.pkl")

            info = {
                "config": {
                    "model_config": self.model_config,
                    "tra_config": self.tra_config,
                    "model_type": self.model_type,
                    "lr": self.lr,
                    "n_epochs": self.n_epochs,
                    "early_stop": self.early_stop,
                    "max_steps_per_epoch": self.max_steps_per_epoch,
                    "lamb": self.lamb,
                    "rho": self.rho,
                    "alpha": self.alpha,
                    "seed": self.seed,
                    "logdir": self.logdir,
                    "pretrain": self.pretrain,
                    "init_state": self.init_state,
                    "transport_method": self.transport_method,
                    "use_daily_transport": self.use_daily_transport,
                },
                "best_eval_metric": -best_score,  # NOTE: -1 for minimize
                "metrics": {
                    "train": train_metrics,
                    "valid": valid_metrics,
                    "test": test_metrics
                },
            }
            with open(self.logdir + "/info.json", "w") as f:
                json.dump(info, f)

    def predict(self, dataset, segment="test"):
        """Run inference on one segment of `dataset` and return predictions.

        Args:
            dataset: an `MTSDatasetH` instance.
            segment: which data segment to predict on (default "test").

        Returns:
            DataFrame of per-sample predictions from `test_epoch`.

        Raises:
            AssertionError: if `dataset` is not an `MTSDatasetH`.
            ValueError: if `fit` has not been called first.
        """
        assert isinstance(
            dataset, MTSDatasetH
        ), "TRAModel only supports `qlib.contrib.data.dataset.MTSDatasetH`"

        if not self.fitted:
            raise ValueError("model is not fitted yet!")

        eval_set = dataset.prepare(segment)
        metrics, preds, _, _ = self.test_epoch(-1, eval_set, return_pred=True)
        self.logger.info("test metrics: %s" % metrics)
        return preds
Esempio n. 6
0
class Trainer:
    """Trainer for naive MoLM model (generator + moment network)."""
    def __init__(
        self,
        generator,
        moment_network,
        train_set,
        training_params,
        device=None,
        scores=None,
        tensorboard=False,
        save_folder="runs/run",
        eval_generate_images=False,
    ):
        """
            generator: a nn.Module child class serving as a generator network
            moment_network: a nn.Module child class serving as the moment network
            train_set: the training dataset (wrapped in DataLoaders internally)
            scores: None, or a dict of shape {'name':obj} with score objects
                    with a __call__ function that returns a score

            training_params: dict of training parameters with:
                no: number of objectives
                nm: number of moment-network training steps
                ng: number of generator training steps
                lr: learning rate
                beta1 / beta2: Adam parameters
                activation_weight: activation weights
                alpha: the norm penalty parameter
                gen_batch_size: the batch size to train the generator
                mom_batch_size: the batch size to train the moment network
                eval_batch_size: the batch size to evaluate the generated samples
                eval_size: total number of generated samples on which to evaluate the scores
                n_moments: size of the moment vector
                learn_moments: whether the moment network itself is trained

            tensorboard: whether to use tensorboard to save training information
            save_folder: root folder to save the training information
            eval_generate_images: generates images during training for evaluation

        """
        self.G = generator
        self.MoNet = moment_network
        self.train_set = train_set
        self.training_params = training_params
        self.nm = training_params["nm"]
        self.ng = training_params["ng"]
        self.no = training_params["no"]
        self.no_obj = 0  # current objective
        self.n_moments = training_params["n_moments"]
        self.gen_batch_size = training_params["gen_batch_size"]
        self.eval_batch_size = training_params["eval_batch_size"]
        self.learn_moments = training_params["learn_moments"]

        lr, beta1, beta2 = (
            self.training_params["lr"],
            self.training_params["beta1"],
            self.training_params["beta2"],
        )
        self.optimizerG = optim.Adam(self.G.parameters(),
                                     lr=lr,
                                     betas=(beta1, beta2))
        self.optimizerM = optim.Adam(self.MoNet.parameters(),
                                     lr=lr,
                                     betas=(beta1, beta2))

        self.LM = []  # moment-network loss history
        self.LG = []  # generator loss history
        self.iter = 0
        self.device = device

        self.cross_entropy = F.binary_cross_entropy
        self.mse = MSELoss(reduction="sum")

        # to track the evolution of generated samples from a single batch of noises
        self.fixed_z = torch.randn(20, self.G.dims[0], device=self.device)

        # saving training info
        self.run_folder = Path(save_folder)
        if not (self.run_folder / "results").exists():
            os.mkdir(self.run_folder / "results")
        self.save_path_img = self.run_folder / "results/images/"
        self.save_path_checkpoints = self.run_folder / "checkpoints/"
        if not self.save_path_checkpoints.exists():
            os.mkdir(self.save_path_checkpoints)
        self.eval_generate_images = eval_generate_images

        # monitoring the progress of the training with the evaluation scores
        self.scores = scores
        if scores is not None and not (self.run_folder /
                                       "scores.csv").exists():
            # write the CSV header once
            with open(self.run_folder / "scores.csv", "w") as f:
                f.write(f'Objective,{",".join(scores.keys())}\n')

        # monitoring through tensorboard
        if tensorboard:
            comment = "".join([
                "{}={} ".format(key, training_params[key])
                for key in training_params
            ])
            self.tb = SummaryWriter(self.run_folder, comment=comment)
            self.tb.add_graph(generator, self.fixed_z)
        else:
            self.tb = None

        # set up handler to file
        fh = logging.FileHandler(self.run_folder / "logging.txt")
        fh.setLevel(logging.INFO)
        fh.setFormatter(formatter)
        logger.addHandler(fh)

    def train_monet(self):
        """Solves one Moment Network objective (discriminator-style update)."""
        # reshuffle training data
        loader = iter(
            torch.utils.data.DataLoader(
                self.train_set,
                shuffle=True,
                batch_size=self.training_params["mom_batch_size"],
            ))
        for i in range(self.nm):
            # BUG FIX: `loader.next()` does not exist in Python 3;
            # use the builtin next() on the iterator instead.
            batch = next(loader)
            samples = batch
            samples = samples.to(self.device)
            # samples = (samples * 2) - 1

            sample_size = samples.size(0)
            one_labels = torch.ones(sample_size, device=self.device)
            zero_labels = torch.zeros(sample_size, device=self.device)

            # generating latent vector
            # self.dims = [Z_dim, h1_dim, h2_dim, h3_dim, X_dim]
            z = torch.randn(sample_size, self.G.dims[0], device=self.device)
            res = self.G(z)
            prob_trues = self.MoNet(samples)
            output_trues = self.MoNet.output
            prob_gen = self.MoNet(res)
            output_gen = self.MoNet.output  # NOTE(review): unused; kept in case MoNet.output read has side effects — confirm

            prob_trues, prob_gen = prob_trues.squeeze(), prob_gen.squeeze()
            LM_samples = self.cross_entropy(prob_trues, one_labels)
            LM_gen = self.cross_entropy(prob_gen, zero_labels)

            # We now need to compute the gradients to add the regularization
            # term: penalize the squared gradient norm deviating from 1.
            mean_output = output_trues.mean()
            self.optimizerM.zero_grad()
            grad_monet = self.MoNet.get_gradients(mean_output)
            grad_monet = grad_monet.squeeze()
            grad_norm = torch.dot(grad_monet, grad_monet)
            # (the earlier `LM = LM_samples + LM_gen` dead store was removed:
            # it was always overwritten by the penalized loss below)
            LM = (LM_samples + LM_gen + self.training_params["alpha"] *
                  ((grad_norm - 1)**2))
            # Add to tensorboard
            if self.tb:
                self.tb.add_scalar(
                    "LossMonet/objective_{}".format(self.no_obj + 1),
                    float(LM), i + 1)
            self.LM.append(float(LM))
            if i % 50 == 0:
                logger.info("Moment Network Iteration {}/{}: LM: {:.6}".format(
                    i + 1, self.nm, LM.item()))

            self.optimizerM.zero_grad()
            LM.backward()
            self.optimizerM.step()

            del grad_monet
            del batch

    def eval_true_moments(self):
        """Returns the value of moment vector on observed data."""
        loader = torch.utils.data.DataLoader(
            self.train_set,
            shuffle=True,
            batch_size=self.training_params["mom_batch_size"],
        )
        # Calculate the moment vector over the entire dataset as a running
        # mean across batches:
        moments = torch.zeros(self.n_moments, device=self.device)
        for i, batch in enumerate(loader):
            samples = batch
            samples = samples.to(self.device)
            sample_size = samples.size(0)
            # NOT Scaling true images to tanh activation interval:
            # samples = (samples * 2) - 1
            self.optimizerM.zero_grad()
            moments_b = self.MoNet.get_moment_vector(
                samples,
                sample_size,
                weights=self.training_params["activation_weight"],
                detach=True,
            )
            # incremental mean: avoids storing all per-batch moment vectors
            moments = ((i) * moments + moments_b) / (i + 1)
            del batch
            del samples
            del moments_b
        return moments

    def train_generator(self, true_moments):
        """Solves one generator objective (match generated to true moments)."""
        for i in range(self.ng):

            z = torch.randn(self.gen_batch_size,
                            self.G.dims[0],
                            device=self.device)
            res = self.G(z)
            self.optimizerM.zero_grad()
            moments_gz = self.MoNet.get_moment_vector(
                res,
                self.gen_batch_size,
                weights=self.training_params["activation_weight"],
            )
            # moments_gz = ((i) * moments_gz + moments_z) / (i+1)

            del z
            del res

            LG = torch.dot(
                true_moments - moments_gz, true_moments -
                moments_gz)  # squared L2 distance between moment vectors
            # LG = self.mse(true_moments, moments_gz)
            # Add to tensorboard
            if self.tb:
                self.tb.add_scalar(
                    "LossGenerator/objective_{}".format(self.no_obj + 1),
                    float(LG),
                    i + 1,
                )
            self.LG.append(float(LG))
            if i % 100 == 0:
                logger.info("Generator Iteration {}/{}: LG: {:.6}".format(
                    i + 1, self.ng, LG.item()))
            self.optimizerG.zero_grad()
            LG.backward()
            self.optimizerG.step()

            del moments_gz

    def generate_and_display(self, z, save=False, save_path=None):
        """Generates rows of images from latent variable z."""
        # Visualizing the generated images (fixed the stray quadruple-quote
        # in the original docstring above)
        examples = self.G(z).detach().cpu()
        examples = examples.reshape(-1, 3, self.G.dims[-1], self.G.dims[-1])
        examples = (examples + 1) / 2  # map from [-1, 1] back to [0, 1]
        grid = torchvision.utils.make_grid(examples,
                                           nrow=10)  # 10 images per row
        # Add to tensorboard
        if self.tb:
            self.tb.add_image("generated images", grid, self.no_obj)
        fig = plt.figure(figsize=(15, 15))
        plt.imshow(np.transpose(grid, (1, 2, 0)))
        if save:
            plt.savefig(save_path)
        else:
            plt.show()
        plt.close(fig)

    def eval(self):
        """Evaluate generated batch with scores in self.scores"""
        logger.info(
            f"Evaluating generated samples with scores: {self.scores.keys()}")
        scores_dict = self.scores
        n_loops = self.training_params["eval_size"] // self.eval_batch_size
        results = dict(zip(scores_dict.keys(), [None] * len(scores_dict)))
        for score in scores_dict:
            results[score] = np.zeros(n_loops)
        for i in range(n_loops):
            with torch.no_grad():
                z = torch.randn(self.eval_batch_size,
                                self.G.dims[0],
                                device=self.device)
                samples = self.G(z).cpu()
            if "IS" in scores_dict or "FID" in scores_dict:
                samples = InceptionScore.preprocess(samples)
            for score in scores_dict:
                value = scores_dict[score](samples)
                results[score][i] = value if value is not None else np.nan
        # average over loops, ignoring scores that returned None
        for score in scores_dict:
            results[score] = np.nanmean(results[score])
        return results

    def load_from_checkpoints(self, path):
        """
        Loads network parameters and training info from checkpoint
            path: path to checkpoint

        Returns:
            (last_objective, lossG, lossM); lossM is None for checkpoints
            saved with learn_moments=False.
        """
        logger.info(
            "Loading network parameters and training info from checkpoint...")
        checkpoint = torch.load(path)
        self.G.load_state_dict(checkpoint["generator_state_dict"])
        self.optimizerG.load_state_dict(checkpoint["optimizerG_state_dict"])
        self.G.train()

        if self.learn_moments:
            self.MoNet.load_state_dict(checkpoint["monet_state_dict"])
            self.optimizerM.load_state_dict(
                checkpoint["optimizerM_state_dict"])
            self.MoNet.train()

        last_objective = checkpoint["objective"]
        lossG = checkpoint["last_lossG"]
        # BUG FIX: checkpoints saved with learn_moments=False do not contain
        # "last_lossM" (see train()); .get() avoids a KeyError on load.
        lossM = checkpoint.get("last_lossM")

        return last_objective, lossG, lossM

    def train(self, save_images=False, from_checkpoint=None):
        """Trains naive MoLM model made of generator self.G and
        moment network self.MoNet"""
        if save_images and not self.save_path_img.exists():
            os.mkdir(self.save_path_img)
        last_objective = 0
        if not self.learn_moments:
            # fixed moment network: true moments can be computed once
            true_moments = self.eval_true_moments()

        if from_checkpoint:
            last_objective, lossG, lossM = self.load_from_checkpoints(
                from_checkpoint)
            logger.info(
                "Starting training from Objective: {}, lossG: {}, lossM: {}".
                format(last_objective, lossG, lossM))

        for i in range(last_objective, self.no):
            # Track the no of objectives solved
            self.no_obj = i

            start = time.time()
            if self.learn_moments:
                logger.info("Training Moment Network...")
                self.train_monet()
                logger.info("Evaluating true moments value...")
                true_moments = self.eval_true_moments()
            logger.info("Training Generator")
            self.train_generator(true_moments)
            self.iter += 1
            stop = time.time()
            duration = (stop - start) / 60

            if self.learn_moments:
                logger.info(
                    "Objective {}/{} - {:.2} minutes: LossMonet: {:.6} LossG: {:.6}"
                    .format(i + 1, self.no, duration, self.LM[-1],
                            self.LG[-1]))
            else:
                logger.info(
                    "Objective {}/{} - {:.2} minutes: LossG: {:.6}".format(
                        i + 1, self.no, duration, self.LG[-1]))

            if self.eval_generate_images:
                # BUG FIX: `Path + str` is unsupported and raised TypeError;
                # join path components with the `/` operator instead.
                self.generate_and_display(
                    self.fixed_z,
                    save=save_images,
                    save_path=self.save_path_img /
                    "generated_molm_iter{}.png".format(i),
                )

            if i % SAVING_FREQUENCY == 0:
                logger.info("Saving model ...")
                save_path_checkpoints = self.save_path_checkpoints / f"molm_iter{i}.pt"
                save_dict = {
                    "monet_state_dict": self.MoNet.state_dict(),
                    "generator_state_dict": self.G.state_dict(),
                    "optimizerG_state_dict": self.optimizerG.state_dict(),
                    "objective": i + 1,
                    "last_lossG": self.LG[-1],
                }
                if self.learn_moments:
                    save_dict["last_lossM"] = self.LM[-1]
                    save_dict[
                        "optimizerM_state_dict"] = self.optimizerM.state_dict(
                        )

                torch.save(save_dict, save_path_checkpoints)

                if self.scores:
                    scores = self.eval()
                    logger.info(f"{scores}")
                    # Add to tensorboard
                    if self.tb:
                        for score in scores:
                            self.tb.add_scalar("Scores/{}".format(score),
                                               scores[score], i + 1)
                    # Save scores
                    with open(self.run_folder / "scores.csv", "a") as f:
                        f.write(
                            f'{i+1},{",".join([str(metric) for metric in scores.values()])}\n'
                        )

            # Updating data on tensorboard
            if self.tb:
                for name, param in self.G.named_parameters():
                    self.tb.add_histogram("generator.{}".format(name), param,
                                          i + 1)
                    # BUG FIX: param.grad is None until a backward pass has
                    # touched the parameter; add_histogram(None) would crash.
                    if param.grad is not None:
                        self.tb.add_histogram(
                            "generator.{}.grad".format(name), param.grad,
                            i + 1)
                for name, param in self.MoNet.named_parameters():
                    self.tb.add_histogram("momentNetwork.{}".format(name),
                                          param, i + 1)
                    # MoNet grads stay None when learn_moments is False
                    if param.grad is not None:
                        self.tb.add_histogram(
                            "momentNetwork.{}.grad".format(name), param.grad,
                            i + 1)
Esempio n. 7
0
# input_img = torch.randn(content_img.data.size(), device=device)

# add the original input image to the figure:
#plt.figure()
#imshow(input_img, title='Input Image')
start = time.time()
# run the optimization and time it; all layer/weight settings come from `opt`
output = run_style_transfer(cnn, normalization_mean, normalization_std,
                            content_img, style_img, input_img,
                            opt.content_layers_default,
                            opt.style_layers_default, device, opt.num_epochs,
                            opt.style_weight, opt.content_weight)

timeSince(start)
writer = SummaryWriter(
    log_dir='train_result')  # TensorBoard can save tensors directly, as long as the image is C x H x W
writer.add_image('img', output.squeeze(0), 1)  # a batch dim was added earlier, so squeeze it back off here
writer.close()


# a tensor with a single element: use tensor.item(); with many elements: use tensor.data
# transforms.ToTensor() corresponds to the inverse below (it maps values 0-255 to 0-1; the conversion below does the reverse)
# unloader = transforms.ToPILImage()  # reconvert into PIL image
def imshow(tensor, title=None):
    image = tensor.cpu().clone()  # we clone the tensor to not do changes on it
    '''
    实验证明,clone()是为了不改变原来的变量;
    x = torch.Tensor(2,2).fill_(2);y = x.clone();y = x.clone().view(4);
    clone()可以重新开辟一块内存,x,y互不影响,克隆时也可以改变形状。
    import copy
    x = torch.Tensor(2,2).fill_(2);y = copy.copy(x),此时x,y指向一个内存单元;
    y = copy.deepcopy(x)重新开辟内存,互不影响。
Esempio n. 8
0
def train_epoch(net, datasets, optimizer, lr_scheduler, args):
    """Full training driver: one train pass + one validation pass per epoch.

    Args:
        net: dict of networks keyed 'P', 'D', 'G'. ``net['D']`` predicts the
            noise residual (the val loop computes denoised = noisy - D(noisy)).
        datasets: dict of datasets keyed by phase (the module-level ``_modes``).
        optimizer: dict of optimizers keyed 'P', 'D', 'G'.
        lr_scheduler: dict of LR schedulers keyed the same way.
        args: dict of hyper-parameters and bookkeeping (batch_size, epochs,
            log_dir, model_dir, num_critic, print_freq, resume, step, ...).
    """
    # Training uses the configured batch size; validation always uses 4.
    batch_size = {'train': args['batch_size'], 'val': 4}
    data_loader = {
        phase: uData.DataLoader(datasets[phase],
                                batch_size=batch_size[phase],
                                shuffle=True,
                                num_workers=args['num_workers'],
                                pin_memory=True)
        for phase in _modes
    }
    num_data = {phase: len(datasets[phase]) for phase in _modes}
    num_iter_epoch = {
        phase: ceil(num_data[phase] / batch_size[phase])
        for phase in _modes
    }
    # Resume TensorBoard step counters when restarting from a checkpoint.
    step = args['step'] if args['resume'] else 0
    step_img = args['step_img'] if args['resume'] else {x: 0 for x in _modes}
    writer = SummaryWriter(str(Path(args['log_dir'])))
    for epoch in range(args['epoch_start'], args['epochs']):
        # Per-epoch running sums, divided by the iteration counts below.
        loss_epoch = {x: 0 for x in ['PL', 'DL', 'GL']}
        subloss_epoch = {
            x: 0
            for x in [
                'Px', 'Pxg', 'Py', 'Pyg', 'Dx', 'DE', 'DAE', 'Gy', 'GMean',
                'GErr', 'TGErr'
            ]
        }
        mae_epoch = {'train': 0, 'val': 0}
        tic = time.time()
        # train stage
        net['D'].train()
        net['G'].train()
        net['P'].train()
        lr_D = optimizer['D'].param_groups[0]['lr']
        lr_G = optimizer['G'].param_groups[0]['lr']
        lr_P = optimizer['P'].param_groups[0]['lr']
        if lr_D < 1e-6:
            sys.exit('Reach the minimal learning rate')
        phase = 'train'
        iter_GD = 0
        for ii, data in enumerate(data_loader[phase]):
            im_noisy, im_gt = [x.cuda() for x in data]
            # update the netP
            PL, Px, Pxg, Py, Pyg = train_step_P(net, im_gt, im_noisy,
                                                optimizer['P'], args)
            loss_epoch['PL'] += PL.item()
            subloss_epoch['Px'] += Px.item()
            subloss_epoch['Pxg'] += Pxg.item()
            subloss_epoch['Py'] += Py.item()
            subloss_epoch['Pyg'] += Pyg.item()
            # update the netD
            # D and G are only stepped every args['num_critic'] mini-batches
            # (WGAN-style critic scheduling); iter_GD counts those steps.
            if (ii + 1) % args['num_critic'] == 0:
                DL, Dx, DE, DAE, im_denoise = train_step_D(
                    net, im_gt, im_noisy, optimizer['D'], args)
                loss_epoch['DL'] += DL.item()
                subloss_epoch['Dx'] += Dx.item()
                subloss_epoch['DE'] += DE.item()
                subloss_epoch['DAE'] += DAE.item()
                mae_epoch[phase] += DAE.item()
                # update the netG
                GL, Gy, GMean, im_generate = train_step_G(
                    net, im_gt, im_noisy, optimizer['G'], args)
                loss_epoch['GL'] += GL.item()
                subloss_epoch['Gy'] += Gy.item()
                subloss_epoch['GMean'] += GMean.item()
                # L1 gaps: generated-vs-clean, and noisy-vs-clean as a baseline.
                GErr = F.l1_loss(im_generate, im_gt, reduction='mean')
                subloss_epoch['GErr'] += GErr.item()
                TGErr = F.l1_loss(im_noisy, im_gt, reduction='mean')
                subloss_epoch['TGErr'] += TGErr.item()
                iter_GD += 1

                if (ii + 1) % args['print_freq'] == 0:
                    template = '[Epoch:{:>2d}/{:<3d}] {:s}:{:0>5d}/{:0>5d}, PLx:{:>6.2f}/{:4.2f},'+\
                                         ' PLy:{:>6.2f}/{:4.2f}, DL:{:>6.2f}/{:.1e}, DAE:{:.2e}, '+\
                                                            'GL:{:>6.2f}/{:<5.2f}, GErr:{:.1e}/{:.1e}'
                    print(
                        template.format(epoch + 1, args['epochs'],
                                        phase, ii + 1, num_iter_epoch[phase],
                                        Px.item(), Pxg.item(), Py.item(),
                                        Pyg.item(), Dx.item(), DE.item(),
                                        DAE.item(), Gy.item(), GMean.item(),
                                        GErr.item(), TGErr.item()))
                    writer.add_scalar('Train PNet Loss Iter', PL.item(), step)
                    writer.add_scalar('Train DNet Loss Iter', DL.item(), step)
                    writer.add_scalar('Train GNet Loss Iter', GL.item(), step)
                    step += 1
                    # Image grids are logged an order of magnitude less often
                    # than scalars to keep the event files small.
                    if (ii + 1) % (10 * args['print_freq']) == 0:
                        x1 = vutils.make_grid(im_noisy,
                                              normalize=True,
                                              scale_each=True)
                        writer.add_image(phase + ' Noisy Image', x1,
                                         step_img[phase])
                        x2 = vutils.make_grid(im_gt,
                                              normalize=True,
                                              scale_each=True)
                        writer.add_image(phase + ' GroundTruth', x2,
                                         step_img[phase])
                        x3 = vutils.make_grid(im_denoise.clamp_(0.0, 1.0),
                                              normalize=True,
                                              scale_each=True)
                        writer.add_image(phase + ' Denoised images', x3,
                                         step_img[phase])
                        x4 = vutils.make_grid(im_generate.clamp_(0.0, 1.0),
                                              normalize=True,
                                              scale_each=True)
                        writer.add_image(phase + ' Generated images', x4,
                                         step_img[phase])
                        step_img[phase] += 1

        # Averages: P-losses over all mini-batches, D/G-losses over the
        # (less frequent) D/G update steps.
        # NOTE(review): subloss_epoch['DE'] is accumulated above but never
        # averaged here like its siblings — confirm that is intentional.
        loss_epoch['PL'] /= (ii + 1)
        subloss_epoch['Px'] /= (ii + 1)
        subloss_epoch['Pxg'] /= (ii + 1)
        subloss_epoch['Py'] /= (ii + 1)
        subloss_epoch['Pyg'] /= (ii + 1)
        loss_epoch['DL'] /= (iter_GD + 1)
        subloss_epoch['Dx'] /= (iter_GD + 1)
        subloss_epoch['DAE'] /= (iter_GD + 1)
        mae_epoch[phase] /= (iter_GD + 1)
        loss_epoch['GL'] /= (iter_GD + 1)
        subloss_epoch['Gy'] /= (iter_GD + 1)
        subloss_epoch['GMean'] /= (iter_GD + 1)
        subloss_epoch['GErr'] /= (iter_GD + 1)
        subloss_epoch['TGErr'] /= (iter_GD + 1)
        template = '{:s}: PL={:5.2f}, DL={:5.2f}, GL={:5.2f}, DAE:{:4.2e}, GMean:{:4.2e}, ' +\
                                 'GE:{:.2e}/{:.2e}, tauDG:{:.1e}/{:.1e}, lrDGP:{:.2e}/{:.2e}/{:.2e}'
        print(
            template.format(phase, loss_epoch['PL'], loss_epoch['DL'],
                            loss_epoch['GL'], subloss_epoch['DAE'],
                            subloss_epoch['GMean'], subloss_epoch['GErr'],
                            subloss_epoch['TGErr'], args['tau_D'],
                            args['tau_G'], lr_D, lr_G, lr_P))
        print('-' * 150)

        # test stage
        net['D'].eval()
        psnr_per_epoch = ssim_per_epoch = 0
        phase = 'val'
        for ii, data in enumerate(data_loader[phase]):
            im_noisy, im_gt = [x.cuda() for x in data]
            with torch.set_grad_enabled(False):
                # D predicts the noise residual, so subtract it to denoise.
                im_denoise = im_noisy - net['D'](im_noisy)

            # NOTE(review): mae_iter stays a tensor (no .item()) — the running
            # val MAE is therefore a tensor too; formatting still works.
            mae_iter = F.l1_loss(im_denoise, im_gt)
            im_denoise.clamp_(0.0, 1.0)
            mae_epoch[phase] += mae_iter
            psnr_iter = batch_PSNR(im_denoise, im_gt)
            psnr_per_epoch += psnr_iter
            ssim_iter = batch_SSIM(im_denoise, im_gt)
            ssim_per_epoch += ssim_iter
            # print statistics every log_interval mini_batches
            if (ii + 1) % 50 == 0:
                log_str = '[Epoch:{:>2d}/{:<2d}] {:s}:{:0>3d}/{:0>3d}, mae={:.2e}, ' + \
                                                                    'psnr={:4.2f}, ssim={:5.4f}'
                print(
                    log_str.format(epoch + 1, args['epochs'], phase, ii + 1,
                                   num_iter_epoch[phase], mae_iter, psnr_iter,
                                   ssim_iter))
                # tensorboard summary
                x1 = vutils.make_grid(im_denoise,
                                      normalize=True,
                                      scale_each=True)
                writer.add_image(phase + ' Denoised images', x1,
                                 step_img[phase])
                x2 = vutils.make_grid(im_gt, normalize=True, scale_each=True)
                writer.add_image(phase + ' GroundTruth', x2, step_img[phase])
                x5 = vutils.make_grid(im_noisy,
                                      normalize=True,
                                      scale_each=True)
                writer.add_image(phase + ' Noisy Image', x5, step_img[phase])
                step_img[phase] += 1

        psnr_per_epoch /= (ii + 1)
        ssim_per_epoch /= (ii + 1)
        mae_epoch[phase] /= (ii + 1)
        print('{:s}: mae={:.3e}, PSNR={:4.2f}, SSIM={:5.4f}'.format(
            phase, mae_epoch[phase], psnr_per_epoch, ssim_per_epoch))
        print('-' * 150)

        # adjust the learning rate
        lr_scheduler['D'].step()
        lr_scheduler['G'].step()
        lr_scheduler['P'].step()
        # save model
        # Full checkpoint (counters + nets + optimizers + schedulers) ...
        save_path_model = str(
            Path(args['model_dir']) / ('model_' + str(epoch + 1)))
        torch.save(
            {
                'epoch': epoch + 1,
                'step': step + 1,
                'step_img': {x: step_img[x] + 1
                             for x in _modes},
                'model_state_dict':
                {x: net[x].state_dict()
                 for x in ['D', 'P', 'G']},
                'optimizer_state_dict':
                {x: optimizer[x].state_dict()
                 for x in ['D', 'P', 'G']},
                'lr_scheduler_state_dict':
                {x: lr_scheduler[x].state_dict()
                 for x in ['D', 'P', 'G']}
            }, save_path_model)
        # ... plus a lightweight weights-only file for D and G.
        save_path_model = str(
            Path(args['model_dir']) /
            ('model_state_' + str(epoch + 1) + '.pt'))
        torch.save({x: net[x].state_dict()
                    for x in ['D', 'G']}, save_path_model)

        writer.add_scalars('MAE_epoch', mae_epoch, epoch)
        writer.add_scalar('Val PSNR epoch', psnr_per_epoch, epoch)
        writer.add_scalar('Val SSIM epoch', ssim_per_epoch, epoch)
        toc = time.time()
        print('This epoch take time {:.2f}'.format(toc - tic))
    writer.close()
    print('Reach the maximal epochs! Finish training')
Esempio n. 9
0
def train(opt, device):
    """DCGAN training loop: alternately update discriminator and generator.

    Args:
        opt: options namespace (lr1/lr2, beta1, nd, batch_size, max_epoch,
            real_label/fake_label, ...).
        device: torch device to train on.
    """

    dataloader = data_loader(opt)

    gnet = GNet(opt).to(device)
    dnet = DNet(opt).to(device)
    #writer.add_graph(gnet)  # TODO: experiment with the second argument of add_graph
    #writer.add_graph(dnet)
    if device.type == 'cuda':  # even with several GPUs visible on the device, only GPU 0 runs unless the work is distributed
        gnet = nn.DataParallel(gnet, [0, 1, 2])  # list(range(ngpu)) did not work; only the first few GPU ids are usable
        dnet = nn.DataParallel(dnet, [0, 1, 2])
    gnet.apply(
        weight_init)  # initialises d/gnet.parameters(); without an explicit init PyTorch performs its own random init
    dnet.apply(weight_init)
    print('Generative NetWork:')
    print(gnet)
    print('')
    print('Discriminative NetWork:')
    print(dnet)

    criterion = nn.BCELoss()
    '''
    params (iterable): iterable of parameters to optimize or dicts defining parameter groups
    除了下面的整体赋值,还可以通过迭代给优化器赋值,把模型中所有需要参数的过程都分别设置值;如学长代码:
    optimizer = optim.SGD([
                            {'params': model.features.parameters(), 'lr': 0.1 * lr},
                            {'params': model.sample_128.parameters(), 'lr': lr},
                            {'params': model.sample_256.parameters(), 'lr': lr},
                            {'params': model.fc_concat.parameters(), 'lr': lr}
                        ], lr=1e-1, momentum=0.9, weight_decay=1e-5)
    '''
    g_optimizer = optim.Adam(gnet.parameters(),
                             lr=opt.lr1,
                             betas=(opt.beta1, 0.999))
    d_optimizer = optim.Adam(dnet.parameters(),
                             lr=opt.lr2,
                             betas=(opt.beta1, 0.999))
    # the optimizer only takes these initial values once; everything afterwards is adjusted by backprop
    print('g_optimizer:')
    print(g_optimizer)
    print('d_optimizer:')
    print(d_optimizer)

    writer = SummaryWriter(log_dir='train_result')
    # the events file is created as soon as the writer is constructed; tensorboard scans every path under the log dir for the files it needs
    #dummy1_input = torch.rand(opt.batch_size, 3, 96,96)
    #dummy2_input = torch.rand(opt.batch_size, opt.nd,1,1)
    #writer.add_graph(dnet, dummy1_input)
    #writer.add_graph(gnet, dumm2_input

    # Training Loop
    # Lists to keep track of progress
    '''完全可以不用列表,但是为了以后可能有其他用,就保留了'''
    img_list = []
    G_losses = []
    D_losses = []
    iters = 0
    fixed_noise = torch.randn(opt.batch_size, opt.nd, 1, 1, device=device)
    print("Starting Training Loop...")
    dnet.train()
    gnet.train()
    # train mode is the default anyway; mode switching matters once there are BN/dropout layers, which behave differently in train() vs eval()
    # For each epoch
    for epoch in range(1, opt.max_epoch + 1):
        # For each batch in the dataloader
        print(len(dataloader))
        print(type(dataloader))
        for i, (imgs, _) in enumerate(dataloader, 1):
            # torch.utils.data.DataLoader() yields a special object of 2-tuples (not a list, so no slicing);
            # in MNIST: img, label = data; these anime faces have no labels — printing shows tensor([0, 0,...0, 0 0])
            ############################
            # (1) Update D network: maximize log(D(x)) + log(1 - D(G(z)))
            ############################
            ## Train with all-real batch
            dnet.zero_grad()
            '''先训练判别器,再训练生成器'''
            # Format batch
            real_img = imgs.to(device)  # move each batch .to(device)
            # torch.full((2,3), 1.2): the first argument must be a tuple, of any rank; a 1-D fill still needs a tuple, and a 1-tuple needs the trailing comma
            # NOTE(review): label is sized opt.batch_size — if the last batch is
            # smaller this mismatches output; confirm the loader drops partial batches.
            label = torch.full((opt.batch_size, ),
                               opt.real_label,
                               device=device)
            # Forward pass real batch through D
            output = dnet(real_img)  # already flattened to 1-D inside the model module
            # Calculate loss on all-real batch
            d_err_real = criterion(output, label)  # mean loss
            # Calculate gradients for D in backward pass
            d_err_real.backward()
            D_x = output.mean().item()  # mean score on real images — the closer to 1 the better

            ## Train with all-fake batch
            # Generate batch of latent vectors
            noise = torch.randn(opt.batch_size, opt.nd, 1, 1, device=device)
            # gnet produces opt.batch_size images, one per (opt.nd,1,1) vector; inside gnet each image has opt.nd feature maps
            # each of size 1 x 1, so every entry of the opt.nd vector controls one feature of the generated image
            # Generate fake image batch with G
            fake = gnet(noise)
            label.fill_(opt.fake_label)
            # Classify all fake batch with D
            output = dnet(fake.detach())
            # Calculate D's loss on the all-fake batch
            d_err_fake = criterion(output, label)
            # Calculate the gradients for this batch
            d_err_fake.backward()
            D_G_z1 = output.mean().item()  # score on fake images — the closer to 0 the better
            # Add the gradients from the all-real and all-fake batches
            d_err = d_err_real + d_err_fake  #tensor(1.272)+tensor(0.183) add directly; no need to extract the values first
            # tensor arithmetic between tensors works just like scalar arithmetic; tensors and plain scalars cannot simply be mixed
            # Update D
            d_optimizer.step()

            ############################
            # (2) Update G network: maximize log(D(G(z)))
            ###########################
            gnet.zero_grad()
            label.fill_(
                opt.real_label)  # fake labels are real for generator cost
            # Since we just updated D, perform another forward pass of all-fake batch through D
            output = dnet(
                fake
            )  # after one D update, the same fake batch should score lower than before, which enlarges g_err below
            # Calculate G's loss based on this output
            g_err = criterion(output, label)
            '''生成器就是要把假图片往真标签身上凑;所以假图片+真标签,进行比较后,损失越小越好'''
            # Calculate gradients for G
            g_err.backward()
            D_G_z2 = output.mean().item(
            )  # since D was updated once, this fake-batch mean should be closer to 0 than the one above if training is healthy
            # Update G
            g_optimizer.step()

            # Output training stats
            if i % 50 == 0:
                print(
                    '[%d/%d][%d/%d]\tLoss_D: %.4f\tLoss_G: %.4f\treal_img_mean_score: %.4f\tfake_img_mean_score_1/2: %.4f / %.4f'
                    % (epoch, opt.max_epoch, i, len(dataloader), d_err.item(),
                       g_err.item(), D_x, D_G_z1, D_G_z2))

            # Save Losses for plotting later
            G_losses.append(g_err.item())
            D_losses.append(d_err.item())

            writer.add_scalars('dnet_gnet_loss', {
                'G_losses': G_losses[iters],
                'D_losses': D_losses[iters]
            }, iters)

            # Check how the generator is doing by saving G's output on fixed_noise
            if (iters % 500 == 0) or ((epoch == opt.max_epoch) and
                                      (i == len(dataloader))):
                with torch.no_grad(
                ):  # context manager: tensors created inside won't need backward, so the forward pass skips building local gradients and saves compute
                    fake = gnet(
                        fixed_noise
                    )  #.detach().cpu() adds nothing special here; with or without it the effect is the same — it just copies the fakes to the CPU
                img_list.append(vutils.make_grid(fake, normalize=True))
                '''还不知道合成的图有多少个小图呢'''
                writer.add_image('fake%d' % (iters / 500),
                                 img_list[int(iters / 500)], int(iters / 500))

            iters += 1

    torch.save(dnet.state_dict(), 'dnet.pth')
    torch.save(gnet.state_dict(), 'gnet.pth')

    writer.close()
    '''
Esempio n. 10
0
        x = self.pool(F.relu(self.conv1(x)))
        x = self.pool(F.relu(self.conv2(x)))
        x = x.view(-1, 16 * 4 * 4)
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x


# Build the network, loss and optimizer, then seed TensorBoard with one
# image grid from the (externally defined) trainloader.
net = Net()
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(net.parameters(), lr=0.001, momentum=0.9)

from torch.utils.tensorboard import SummaryWriter

# default `log_dir` is "runs" - we'll be more specific here
writer = SummaryWriter('runs/fashion_mnist_experiment_1')

# get some random training images
dataiter = iter(trainloader)
# Use the builtin next(): the Python-2 style `dataiter.next()` method was
# removed from DataLoader iterators in modern PyTorch.
images, labels = next(dataiter)

# create grid of images
img_grid = torchvision.utils.make_grid(images)

# show images
matplotlib_imshow(img_grid, one_channel=True)

# write to tensorboard
writer.add_image('four_fashion_mnist_images', img_grid)
Esempio n. 11
0
class face_learner(object):
    """Face-recognition wrapper: backbone + ArcFace head, with training,
    LR-range search, evaluation and inference utilities."""

    def __init__(self, conf, inference=False):
        """Build the model from ``conf``; in training mode also set up the
        data loader, ArcFace head, SGD optimizer, TensorBoard writer and
        validation sets (agedb_30 / cfp_fp / lfw)."""
        print(conf)
        if conf.use_mobilfacenet:
            self.model = MobileFaceNet(conf.embedding_size).to(conf.device)
            print('MobileFaceNet model generated')
        else:
            self.model = Backbone(conf.net_depth, conf.drop_ratio,
                                  conf.net_mode).to(conf.device)
            print('{}_{} model generated'.format(conf.net_mode,
                                                 conf.net_depth))

        if not inference:
            self.milestones = conf.milestones
            self.loader, self.class_num = get_train_loader_VTT(conf)

            self.writer = SummaryWriter(conf.log_path)
            self.step = 0
            self.head = Arcface(embedding_size=conf.embedding_size,
                                classnum=self.class_num).to(conf.device)

            print('two model heads generated')

            paras_only_bn, paras_wo_bn = separate_bn_paras(self.model)

            # BatchNorm parameters get no weight decay; the ArcFace kernel
            # (and, for MobileFaceNet, the last non-BN group) gets a larger one.
            if conf.use_mobilfacenet:
                self.optimizer = optim.SGD(
                    [{
                        'params': paras_wo_bn[:-1],
                        'weight_decay': 4e-5
                    }, {
                        'params': [paras_wo_bn[-1]] + [self.head.kernel],
                        'weight_decay': 4e-4
                    }, {
                        'params': paras_only_bn
                    }],
                    lr=conf.lr,
                    momentum=conf.momentum)
            else:
                self.optimizer = optim.SGD(
                    [{
                        'params': paras_wo_bn + [self.head.kernel],
                        'weight_decay': 5e-4
                    }, {
                        'params': paras_only_bn
                    }],
                    lr=conf.lr,
                    momentum=conf.momentum)
            print(self.optimizer)
            #             self.scheduler = optim.lr_scheduler.ReduceLROnPlateau(self.optimizer, patience=40, verbose=True)

            print('optimizers generated')
            # Logging / evaluation / checkpoint cadences, in steps per epoch.
            self.board_loss_every = len(self.loader) // 10  #100
            self.evaluate_every = len(self.loader) // 10
            self.save_every = len(self.loader) // 5
            # NOTE(review): hard-coded Windows path; '\D' and '\F' are invalid
            # escape sequences — consider a raw string r'D:\Dataset\Face'.
            self.agedb_30, self.cfp_fp, self.lfw, self.agedb_30_issame, self.cfp_fp_issame, self.lfw_issame = get_val_data(
                'D:\Dataset\Face')
        else:
            self.threshold = conf.threshold

    def save_state(self,
                   conf,
                   accuracy,
                   to_save_folder=False,
                   extra=None,
                   model_only=False):
        """Save backbone and head weights.

        NOTE(review): the commented-out logic below used conf-based paths and
        the extra arguments; the active code ignores all arguments and always
        writes to the fixed ./models/ paths.
        """
        # if to_save_folder:
        #     save_path = conf.save_path
        # else:
        #     save_path = conf.model_path
        # torch.save(
        #     self.model.state_dict(), save_path /
        #     ('model_{}_accuracy:{}_step:{}_{}.pth'.format(get_time(), accuracy, self.step, extra)))
        # if not model_only:
        #     torch.save(
        #         self.head.state_dict(), save_path /
        #         ('head_{}_accuracy:{}_step:{}_{}.pth'.format(get_time(), accuracy, self.step, extra)))
        #     torch.save(
        #         self.optimizer.state_dict(), save_path /
        #         ('optimizer_{}_accuracy:{}_step:{}_{}.pth'.format(get_time(), accuracy, self.step, extra)))

        torch.save(self.model.state_dict(), './models/backbone.pth')
        torch.save(self.head.state_dict(), './models/head.pth')

    def load_state(self,
                   conf,
                   fixed_str,
                   from_save_folder=False,
                   model_only=False):
        """Load backbone (and optionally head) weights.

        Like save_state, the active code ignores the computed save_path and
        always reads the fixed ./models/ files.
        """
        if from_save_folder:
            save_path = conf.save_path
        else:
            save_path = conf.model_path
        # self.model.load_state_dict(torch.load(save_path/'model_{}'.format(fixed_str)))
        self.model.load_state_dict(torch.load('./models/backbone.pth'))
        if not model_only:
            # self.head.load_state_dict(torch.load(save_path/'head_{}'.format(fixed_str)))
            self.head.load_state_dict(torch.load('./models/head.pth'))
            # self.optimizer.load_state_dict(torch.load(save_path/'optimizer_{}'.format(fixed_str)))

    def board_val(self, db_name, accuracy, best_threshold, roc_curve_tensor):
        """Log one validation result (accuracy, threshold, ROC image) to TensorBoard."""
        self.writer.add_scalar('{}_accuracy'.format(db_name), accuracy,
                               self.step)
        self.writer.add_scalar('{}_best_threshold'.format(db_name),
                               best_threshold, self.step)
        self.writer.add_image('{}_roc_curve'.format(db_name), roc_curve_tensor,
                              self.step)
#         self.writer.add_scalar('{}_val:true accept ratio'.format(db_name), val, self.step)
#         self.writer.add_scalar('{}_val_std'.format(db_name), val_std, self.step)
#         self.writer.add_scalar('{}_far:False Acceptance Ratio'.format(db_name), far, self.step)

    def evaluate(self, conf, carray, issame, nrof_folds=5, tta=False):
        """Embed ``carray`` in batches and run verification.

        When ``tta`` is set, embeddings of the image and its horizontal flip
        are summed and L2-normalised. Returns (mean accuracy, mean best
        threshold, ROC-curve image tensor).
        """
        self.model.eval()
        idx = 0
        embeddings = np.zeros([len(carray), conf.embedding_size])
        with torch.no_grad():
            while idx + conf.batch_size <= len(carray):
                batch = torch.tensor(carray[idx:idx + conf.batch_size])
                if tta:
                    fliped = hflip_batch(batch)
                    emb_batch = self.model(batch.to(conf.device)) + self.model(
                        fliped.to(conf.device))
                    embeddings[idx:idx + conf.batch_size] = l2_norm(emb_batch)
                else:
                    embeddings[idx:idx + conf.batch_size] = self.model(
                        batch.to(conf.device)).cpu()
                idx += conf.batch_size
            # Remainder batch smaller than conf.batch_size.
            if idx < len(carray):
                batch = torch.tensor(carray[idx:])
                if tta:
                    fliped = hflip_batch(batch)
                    emb_batch = self.model(batch.to(conf.device)) + self.model(
                        fliped.to(conf.device))
                    embeddings[idx:] = l2_norm(emb_batch)
                else:
                    embeddings[idx:] = self.model(batch.to(conf.device)).cpu()
        tpr, fpr, accuracy, best_thresholds = evaluate(embeddings, issame,
                                                       nrof_folds)
        buf = gen_plot(fpr, tpr)
        roc_curve = Image.open(buf)
        roc_curve_tensor = trans.ToTensor()(roc_curve)
        return accuracy.mean(), best_thresholds.mean(), roc_curve_tensor

    def find_lr(self,
                conf,
                init_value=1e-8,
                final_value=10.,
                beta=0.98,
                bloding_scale=3.,
                num=None):
        """LR range test: grow the learning rate geometrically from
        ``init_value`` to ``final_value`` over ``num`` batches, tracking the
        smoothed loss; stop early when the loss explodes past
        ``bloding_scale`` times the best loss. Returns (log_lrs, losses)."""
        if not num:
            num = len(self.loader)
        # Per-step multiplicative factor so lr reaches final_value after num steps.
        mult = (final_value / init_value)**(1 / num)
        lr = init_value
        for params in self.optimizer.param_groups:
            params['lr'] = lr
        self.model.train()
        avg_loss = 0.
        best_loss = 0.
        batch_num = 0
        losses = []
        log_lrs = []
        for i, (imgs, labels) in tqdm(enumerate(self.loader), total=num):

            imgs = imgs.to(conf.device)
            labels = labels.to(conf.device)
            batch_num += 1

            self.optimizer.zero_grad()

            embeddings = self.model(imgs)
            thetas = self.head(embeddings, labels)
            loss = conf.ce_loss(thetas, labels)

            #Compute the smoothed loss
            # Exponential moving average with bias correction (1 - beta**batch_num).
            avg_loss = beta * avg_loss + (1 - beta) * loss.item()
            self.writer.add_scalar('avg_loss', avg_loss, batch_num)
            smoothed_loss = avg_loss / (1 - beta**batch_num)
            self.writer.add_scalar('smoothed_loss', smoothed_loss, batch_num)
            #Stop if the loss is exploding
            if batch_num > 1 and smoothed_loss > bloding_scale * best_loss:
                print('exited with best_loss at {}'.format(best_loss))
                plt.plot(log_lrs[10:-5], losses[10:-5])
                return log_lrs, losses
            #Record the best loss
            if smoothed_loss < best_loss or batch_num == 1:
                best_loss = smoothed_loss
            #Store the values
            losses.append(smoothed_loss)
            log_lrs.append(math.log10(lr))
            self.writer.add_scalar('log_lr', math.log10(lr), batch_num)
            #Do the SGD step
            #Update the lr for the next step

            loss.backward()
            self.optimizer.step()

            lr *= mult
            for params in self.optimizer.param_groups:
                params['lr'] = lr
            if batch_num > num:
                plt.plot(log_lrs[10:-5], losses[10:-5])
                return log_lrs, losses

    def train(self, conf, epochs):
        """Main training loop: cross-entropy on ArcFace logits, with periodic
        loss logging, validation on agedb_30/lfw/cfp_fp, and checkpointing.
        The LR is divided by 10 at each of the three configured milestones."""
        self.model.train()
        running_loss = 0.
        for e in range(epochs):
            print('epoch {} started'.format(e))
            if e == self.milestones[0]:
                self.schedule_lr()
            if e == self.milestones[1]:
                self.schedule_lr()
            if e == self.milestones[2]:
                self.schedule_lr()
            for imgs, labels in tqdm(iter(self.loader)):
                imgs = imgs.to(conf.device)
                labels = labels.to(conf.device)
                self.optimizer.zero_grad()
                embeddings = self.model(imgs)
                thetas = self.head(embeddings, labels)
                loss = conf.ce_loss(thetas, labels)  # cross entropy
                loss.backward()
                running_loss += loss.item()
                self.optimizer.step()

                if self.step % self.board_loss_every == 0 and self.step != 0:
                    loss_board = running_loss / self.board_loss_every
                    self.writer.add_scalar('train_loss', loss_board, self.step)
                    running_loss = 0.

                if self.step % self.evaluate_every == 0 and self.step != 0:
                    accuracy, best_threshold, roc_curve_tensor = self.evaluate(
                        conf, self.agedb_30, self.agedb_30_issame)
                    self.board_val('agedb_30', accuracy, best_threshold,
                                   roc_curve_tensor)
                    accuracy, best_threshold, roc_curve_tensor = self.evaluate(
                        conf, self.lfw, self.lfw_issame)
                    self.board_val('lfw', accuracy, best_threshold,
                                   roc_curve_tensor)
                    accuracy, best_threshold, roc_curve_tensor = self.evaluate(
                        conf, self.cfp_fp, self.cfp_fp_issame)
                    self.board_val('cfp_fp', accuracy, best_threshold,
                                   roc_curve_tensor)
                    self.model.train()
                # NOTE(review): `accuracy` is only bound after the first
                # evaluate block runs — relies on evaluate_every <= save_every.
                if self.step % self.save_every == 0 and self.step != 0:
                    self.save_state(conf, accuracy)

                self.step += 1

        self.save_state(conf, accuracy, to_save_folder=True, extra='final')

    def schedule_lr(self):
        """Divide the learning rate of every param group by 10."""
        for params in self.optimizer.param_groups:
            params['lr'] /= 10
        print(self.optimizer)

    def infer(self, conf, faces, target_embs, tta=False):
        '''
        Match faces against the facebank; returns (min_idx, minimum) where
        min_idx is -1 when no distance beats self.threshold.

        faces : list of PIL Image
        target_embs : [n, 512] computed embeddings of faces in facebank
        names : recorded names of faces in facebank
        tta : test time augmentation (hflip, that's all)
        '''
        embs = []
        for img in faces:
            if tta:
                mirror = trans.functional.hflip(img)
                emb = self.model(
                    conf.test_transform(img).to(conf.device).unsqueeze(0))
                emb_mirror = self.model(
                    conf.test_transform(mirror).to(conf.device).unsqueeze(0))
                embs.append(l2_norm(emb + emb_mirror))
            else:
                embs.append(
                    self.model(
                        conf.test_transform(img).to(conf.device).unsqueeze(0)))
        source_embs = torch.cat(embs)

        # Pairwise squared L2 distances between source and facebank embeddings.
        diff = source_embs.unsqueeze(-1) - target_embs.transpose(
            1, 0).unsqueeze(0)
        dist = torch.sum(torch.pow(diff, 2), dim=1)
        minimum, min_idx = torch.min(dist, dim=1)
        min_idx[minimum > self.threshold] = -1  # if no match, set idx to -1
        return min_idx, minimum
Esempio n. 12
0
        # where the second option of maximizing doesn't suffer from
        # saturating gradients
        output = disc(fake).view(-1)
        lossG = criterion(output, torch.ones_like(output))
        gen.zero_grad()
        lossG.backward()
        opt_gen.step()

        if batch_idx == 0:
            print(
                f"Epoch [{epoch}/{num_epochs}] Batch {batch_idx}/{len(loader)} \
                      Loss D: {lossD:.4f}, loss G: {lossG:.4f}")

            with torch.no_grad():
                fake = gen(fixed_noise).reshape(-1, 1, 28, 28)
                data = real.reshape(-1, 1, 28, 28)
                img_grid_fake = torchvision.utils.make_grid(fake,
                                                            normalize=True)
                img_grid_real = torchvision.utils.make_grid(data,
                                                            normalize=True)

                writer_fake.add_image("Mnist Fake Images",
                                      img_grid_fake,
                                      global_step=step)
                writer_real.add_image("Mnist Real Images",
                                      img_grid_real,
                                      global_step=step)
                step += 1

#%%
# Build train/test loaders, seed TensorBoard with one image grid, then
# define the ResNet-50 backbone (512-d embedding) and ArcFace margin head.
train_loader = torch.utils.data.DataLoader(dataset=train_dataset,
                                           batch_size=batch_size,
                                           shuffle=True)

test_loader = torch.utils.data.DataLoader(dataset=test_dataset,
                                          batch_size=batch_size,
                                          shuffle=False)

# Tensorboard: capture one batch from the dataloader

dataiter = iter(train_loader)
# Use the builtin next(): the Python-2 style `dataiter.next()` method was
# removed from DataLoader iterators in modern PyTorch.
images, labels = next(dataiter)
img_grid = torchvision.utils.make_grid(images)
# matplotlib_imshow(img_grid, one_channel=True)
writer.add_image('face_images', img_grid)

# ---------------------- #
''' define model'''

model = models.resnet50()
model.fc = nn.Linear(2048, 512)
# model.load_state_dict(torch.load(path +'.pth'))
model.to(device)
margin = ArcMarginProduct(in_feature=512,
                          out_feature=num_classes,
                          easy_margin=True)
# margin.load_state_dict(torch.load(path+'Margin.pth'))
margin.to(device)
nomargin = ArcMarginForTest(in_feature=512,
                            out_feature=num_classes,
Esempio n. 14
0
                          transform=transform)
# DataLoader over the (externally defined) trainset; one batch is used below
# to seed TensorBoard with an image grid and the model graph.
trainloader = torch.utils.data.DataLoader(trainset,
                                          batch_size=64,
                                          shuffle=True)
model = torchvision.models.resnet50(False)

# Replace the stem conv so the network accepts 1-channel (grayscale) input
# instead of the default 3-channel RGB.
model.conv1 = torch.nn.Conv2d(1,
                              64,
                              kernel_size=7,
                              stride=2,
                              padding=3,
                              bias=False)
images, labels = next(iter(trainloader))

grid = torchvision.utils.make_grid(images)
writer.add_image('images', grid, 0)
writer.add_graph(model, images)


def send_stats(i, module, inputs, output):
    """Forward-hook callback: log mean/stddev of a layer's output to TensorBoard.

    Registered per layer via ``partial(send_stats, i)``; PyTorch then supplies
    ``(module, inputs, output)`` positionally, so ``i`` identifies the layer.
    The third parameter was renamed ``input`` -> ``inputs`` to stop shadowing
    the builtin (positional invocation makes the rename backward-compatible).
    """
    # NOTE(review): no global_step is passed, so successive calls log at the
    # same default step for each tag — confirm this is intended.
    writer.add_scalar(f"layer {i}-mean", output.data.mean())
    writer.add_scalar(f"layer {i}-stddev", output.data.std())


from functools import partial

# Attach the stats-logging forward hook to every top-level child module,
# binding each hook to its own layer index via partial application.
for layer_idx, child in enumerate(model.children()):
    child.register_forward_hook(partial(send_stats, layer_idx))

# Now train the model and watch output in Tensorboard
# --- Esempio n. 15 (score 0): scraped separator between unrelated snippets ---
class Trainer(object):
    """Heatmap-based 2D keypoint training loop.

    Wraps a model plus a ``TrainParams`` bundle (criterion, optimizer,
    lr scheduler, GPU list, checkpoint/save settings). Trains with a
    heatmap kernel size that adapts per epoch (``hm_kernel_size``),
    evaluates PCKh / average pixel error on a validation set, and logs
    scalars and images to TensorBoard.
    """

    # Re-exported so callers can reach the params class as Trainer.TrainParams.
    TrainParams = TrainParams

    def __init__(self, model, train_params, train_data, val_data=None):
        """Wire up data, optimization objects, GPUs, output dir and logging.

        Args:
            model: network to train; wrapped in ``nn.DataParallel`` and moved
                to CUDA when ``train_params.gpus`` is non-empty.
            train_params (TrainParams): hyper-parameter / component bundle.
            train_data: training data loader (yields ``(data, label)``).
            val_data: optional validation data loader.
        """
        assert isinstance(train_params, TrainParams)
        self.params = train_params

        # Data loaders
        self.train_data = train_data
        self.val_data = val_data

        # Criterion, Optimizer, learning rate and heatmap type init
        self.last_epoch = 0
        self.hm_type = self.params.hm_type
        self.criterion = self.params.criterion
        self.optimizer = self.params.optimizer
        self.lr_scheduler = self.params.lr_scheduler

        logger.info('Set criterion to {}'.format(type(self.criterion)))
        logger.info('Set optimizer to {}'.format(type(self.optimizer)))
        logger.info('Set lr_scheduler to {}'.format(type(self.lr_scheduler)))
        logger.info('Set heatmap refine to <{}>'.format(
            ["static", "stage", "liner", "exp", "new"][int(self.hm_type)]))

        # load model
        self.model = model

        # set CUDA_VISIBLE_DEVICES
        if len(self.params.gpus) > 0:
            gpus = ','.join([str(x) for x in self.params.gpus])
            os.environ['CUDA_VISIBLE_DEVICES'] = gpus
            # After masking with CUDA_VISIBLE_DEVICES the visible devices are
            # renumbered from 0, so the stored GPU ids are remapped to 0..n-1.
            self.params.gpus = tuple(range(len(self.params.gpus)))
            logger.info('Set CUDA_VISIBLE_DEVICES to GPU[{}]'.format(gpus))
            self.model = nn.DataParallel(self.model,
                                         device_ids=self.params.gpus)
            self.model = self.model.cuda()

        logger.info('Set output dir to {}'.format(self.params.save_dir))
        if os.path.isdir(self.params.save_dir):
            pass
        else:
            os.makedirs(self.params.save_dir)

        ckpt = self.params.ckpt
        if ckpt is not None:
            self._load_ckpt(ckpt)
            logger.info('Load ckpt from {}'.format(ckpt))

        # meters
        self.loss_meter = meter.AverageValueMeter()

        # tensorboard
        self.writer = SummaryWriter()

        # train
        self.model.train()

    def train(self):
        """Run the full training schedule.

        For each epoch: train one epoch, periodically save a checkpoint,
        validate, track the best loss, push metrics to TensorBoard and let a
        ``ReduceLROnPlateau`` scheduler react to the epoch loss.

        Note: ``self.sigma``, ``self.AveErr`` and ``self.PCkh`` are set as
        side effects by ``_train_one_epoch`` / ``_val_one_epoch`` below.
        """
        best_loss = np.inf
        for epoch in range(self.last_epoch, self.params.max_epoch):

            self.loss_meter.reset()

            epoch += 1
            self.last_epoch += 1
            print(' ')
            logger.info('Start training epoch {}'.format(epoch))

            # calculate trainng time for one epoch
            start_time = time.time()
            self._train_one_epoch()
            total_time = time.time() - start_time

            # logger info: heatmap kernel sigma & training time
            logger.info('The heatmap kernel size = {:.2f} pixel'.format(
                self.sigma))
            logger.info('The training time = {:.2f} m {:.2f} s'.format(
                total_time // 60, total_time % 60))

            # save model
            # NOTE(review): ``and`` binds tighter than ``or`` here, so the
            # checkpoint is also saved whenever epoch == max_epoch - 1,
            # regardless of start_save_epoch — confirm this is intended.
            if (epoch >= self.params.start_save_epoch) and (
                    epoch % self.params.save_freq_epoch
                    == 0) or (epoch == self.params.max_epoch - 1):
                save_name = self.params.save_dir + 'ckpt_epoch_{}.pth'.format(
                    epoch)
                t.save(self.model.state_dict(), save_name)

            # validate and get average_err
            logger.info('Val on validation set...')
            self._val_one_epoch()
            logger.info('Mean Per Joint 2D Error = {:.4f} pixel'.format(
                self.AveErr))

            # loss update
            if self.loss_meter.value()[0] < best_loss:
                logger.info('Found a better ckpt ({:.6f} -> {:.6f})'.format(
                    best_loss,
                    self.loss_meter.value()[0]))
                best_loss = self.loss_meter.value()[0]

            # tensorboard
            self.writer.add_scalar('train/hm_kernel', self.sigma,
                                   self.last_epoch)
            self.writer.add_scalar('train/loss',
                                   self.loss_meter.value()[0], self.last_epoch)
            self.writer.add_scalar('train/ave_err', self.AveErr,
                                   self.last_epoch)
            self.writer.add_scalar('train/PCHk', self.PCkh, self.last_epoch)

            # adjust the lr
            if isinstance(self.lr_scheduler, ReduceLROnPlateau):
                self.lr_scheduler.step(self.loss_meter.value()[0])

    def _train_one_epoch(self):
        """Train over one pass of ``self.train_data``.

        Generates ground-truth heatmaps with the current adaptive kernel
        size, averages the criterion over all hourglass stage outputs, and
        logs per-step PCKh plus periodic input/target/output images.
        """
        bar = Bar('Processing', max=len(self.train_data))
        for step, (data, label) in enumerate(self.train_data):

            # Kernel size shrinks/changes with epoch according to hm_type.
            self.sigma = hm_kernel_size(self.hm_type,
                                        self.last_epoch,
                                        threshold=4)
            target = gene_heatmap(label, self.sigma)
            inputs = Variable(data)
            target = Variable(t.from_numpy(target))
            if len(self.params.gpus) > 0:
                inputs = inputs.cuda()
                target = target.type(t.FloatTensor).cuda()

            # forward
            score = self.model(inputs)
            loss = 0

            # stack hourglass
            # Intermediate supervision: every stage output is penalised,
            # then the loss is averaged over stages.
            for s in range(len(score)):
                loss += self.criterion(score[s], target)
            loss = loss / len(score)

            # simple pose res
            # loss = self.criterion(score[1], target)

            # backward
            self.optimizer.zero_grad()
            loss.backward()
            # step(None) is equivalent to step(): None is the optional
            # closure argument of torch.optim.Optimizer.step.
            self.optimizer.step(None)

            # meters update
            self.loss_meter.add(loss.item())

            # evaluation: calculate PCKh
            predictions = spatial_soft_argmax2d(score[len(score) - 1], 1000,
                                                False).cpu().numpy().reshape(
                                                    -1, 2)
            targetcoors = label.numpy().reshape(-1, 2)
            steppckh, steperr = evalPCKh(predictions,
                                         targetcoors,
                                         threshold=50,
                                         alpha=0.2)

            # tensorboard show
            if step % 500 == 0:
                # Collapse per-joint heatmaps into a single 2D image, clipped
                # to [0, 1] where joints overlap.
                target_shows = t.sum(target[0], 0)
                target_shows[target_shows > 1] = 1
                self.writer.add_image('train/input', inputs[0],
                                      self.last_epoch)
                # NOTE(review): tag 'train/taget' looks like a typo for
                # 'train/target' (runtime string left unchanged here).
                self.writer.add_image('train/taget',
                                      target_shows,
                                      self.last_epoch,
                                      dataformats='HW')
                self.writer.add_image('train/output',
                                      t.sum(score[1][0], 0),
                                      self.last_epoch,
                                      dataformats='HW')

            bar.suffix = 'Train: [%(index)d/%(max)d] | Epoch: [{0}/{1}]| Loss: {loss:6f} | PCKh: {pckh:4f} | AveErr: {err:.2f} pixel |'.format(
                self.last_epoch,
                self.params.max_epoch,
                loss=loss,
                pckh=steppckh,
                err=steperr)
            bar.next()
        bar.finish()

    def _val_one_epoch(self):
        """Validate on ``self.val_data`` and set ``self.PCkh`` / ``self.AveErr``.

        Accumulates predicted and ground-truth coordinates over the whole
        set, then computes the final PCKh / average error once at the end.
        """
        bar = Bar('Validating', max=len(self.val_data))
        self.model.eval()

        predictions = np.empty((0, 2))
        targetcoors = np.empty((0, 2))

        for step, (data, label) in enumerate(self.val_data):
            with t.no_grad():
                inputs = data
                target = label.reshape(-1, 2)
                # target = label.type(t.FloatTensor)
            if len(self.params.gpus) > 0:
                # NOTE(review): this moves inputs to GPU index 1 specifically,
                # unlike training which uses .cuda() — confirm this matches
                # the device the (DataParallel) model actually runs on.
                inputs = inputs.cuda(1)
                # target = target.cuda()

            score = self.model(inputs)
            coors = spatial_soft_argmax2d(score[len(score) - 1], 1000,
                                          False).cpu().numpy().reshape(-1, 2)
            predictions = np.concatenate((predictions, coors), axis=0)
            targetcoors = np.concatenate((targetcoors, target), axis=0)

            # evaluation: calculate PCKh
            # Recomputed over all samples seen so far, so per-step cost grows
            # as the epoch progresses; final metrics are computed below.
            currentpckh, currenterr = evalPCKh(predictions,
                                               targetcoors,
                                               threshold=50,
                                               alpha=0.2)

            # tensorboard visualization
            if step % 100 == 0:
                self.writer.add_image('valid/img', inputs[0], self.last_epoch)
                self.writer.add_image('valid/output',
                                      t.sum(score[1][0], 0),
                                      self.last_epoch,
                                      dataformats='HW')

            bar.suffix = 'Valid: [%(index)d/%(max)d] | PCKh: {pckh:6f} | AveErr: {err:.2f} pixel |'.format(
                pckh=currentpckh, err=currenterr)
            bar.next()
        bar.finish()

        self.PCkh, self.AveErr = evalPCKh(predictions,
                                          targetcoors,
                                          threshold=50,
                                          alpha=0.2)
        # Restore training mode for the next epoch.
        self.model.train()

    def _load_ckpt(self, ckpt):
        """Load model weights from the checkpoint file at path ``ckpt``."""
        self.model.load_state_dict(t.load(ckpt))
# --- Esempio n. 16 (score 0): scraped separator between unrelated snippets ---
                with torch.no_grad():
                    running_losses.update(losses)

                    last_iteration = global_step == len(
                        dataset) // batch_size * n_epochs - 1
                    if global_step % 25 == 0 or last_iteration:
                        average_losses = running_losses.get()
                        for key, value in average_losses.items():
                            writer.add_scalar(key, value, global_step)

                        running_losses.reset()

                    if global_step % 100 == 0 or last_iteration:
                        styled_test_image = stylize_image(
                            Image.open("test_image.jpeg"), model)
                        writer.add_image('test image', styled_test_image,
                                         global_step)

                        for i in range(0, len(dataset), len(dataset) // 4):
                            sample = dataset[i]
                            styled_train_image_1 = stylize_image(
                                sample["frame"], model)
                            styled_train_image_2 = stylize_image(
                                sample["previous_frame"], model)
                            grid = torchvision.utils.make_grid(
                                [styled_train_image_1, styled_train_image_2])
                            writer.add_image(f'train images {i}', grid,
                                             global_step)

                    global_step += 1

        torch.save(model.state_dict(), args.output_file)
# --- Esempio n. 17 (score 0): scraped separator between unrelated snippets ---
def train_worker(rank, addr, port):
    """Per-GPU worker for distributed matting-refinement training.

    Spawned once per GPU: joins the NCCL process group, builds a rank-local
    shard of the training data, runs the AMP training loop with heavy on-GPU
    augmentation, and (rank 0 only) handles validation, TensorBoard logging
    and checkpointing.

    Args:
        rank: this worker's process / GPU index.
        addr: master address for ``dist.init_process_group``.
        port: master port for ``dist.init_process_group``.
    """
    # Distributed Setup
    os.environ['MASTER_ADDR'] = addr
    os.environ['MASTER_PORT'] = port
    dist.init_process_group("nccl", rank=rank, world_size=distributed_num_gpus)
    
    # Training DataLoader
    # Each sample is ((alpha matte, foreground), background); the paired
    # transforms keep pha/fgr geometrically aligned while ColorJitter is
    # applied to the foreground only (index 1).
    dataset_train = ZipDataset([
        ZipDataset([
            ImagesDataset(DATA_PATH[args.dataset_name]['train']['pha'], mode='L'),
            ImagesDataset(DATA_PATH[args.dataset_name]['train']['fgr'], mode='RGB'),
        ], transforms=A.PairCompose([
            A.PairRandomAffineAndResize((2048, 2048), degrees=(-5, 5), translate=(0.1, 0.1), scale=(0.3, 1), shear=(-5, 5)),
            A.PairRandomHorizontalFlip(),
            A.PairRandomBoxBlur(0.1, 5),
            A.PairRandomSharpen(0.1),
            A.PairApplyOnlyAtIndices([1], T.ColorJitter(0.15, 0.15, 0.15, 0.05)),
            A.PairApply(T.ToTensor())
        ]), assert_equal_length=True),
        ImagesDataset(DATA_PATH['backgrounds']['train'], mode='RGB', transforms=T.Compose([
            A.RandomAffineAndResize((2048, 2048), degrees=(-5, 5), translate=(0.1, 0.1), scale=(1, 2), shear=(-5, 5)),
            T.RandomHorizontalFlip(),
            A.RandomBoxBlur(0.1, 5),
            A.RandomSharpen(0.1),
            T.ColorJitter(0.15, 0.15, 0.15, 0.05),
            T.ToTensor()
        ])),
    ])
    # Manual sharding: each rank trains on a disjoint contiguous slice.
    dataset_train_len_per_gpu_worker = int(len(dataset_train) / distributed_num_gpus)
    dataset_train = Subset(dataset_train, range(rank * dataset_train_len_per_gpu_worker, (rank + 1) * dataset_train_len_per_gpu_worker))
    dataloader_train = DataLoader(dataset_train,
                                  shuffle=True,
                                  pin_memory=True,
                                  drop_last=True,
                                  batch_size=args.batch_size // distributed_num_gpus,
                                  num_workers=args.num_workers // distributed_num_gpus)
    
    # Validation DataLoader
    # Validation objects exist only on rank 0; every use below is guarded
    # by the matching ``if rank == 0`` check.
    if rank == 0:
        dataset_valid = ZipDataset([
            ZipDataset([
                ImagesDataset(DATA_PATH[args.dataset_name]['valid']['pha'], mode='L'),
                ImagesDataset(DATA_PATH[args.dataset_name]['valid']['fgr'], mode='RGB')
            ], transforms=A.PairCompose([
                A.PairRandomAffineAndResize((2048, 2048), degrees=(-5, 5), translate=(0.1, 0.1), scale=(0.3, 1), shear=(-5, 5)),
                A.PairApply(T.ToTensor())
            ]), assert_equal_length=True),
            ImagesDataset(DATA_PATH['backgrounds']['valid'], mode='RGB', transforms=T.Compose([
                A.RandomAffineAndResize((2048, 2048), degrees=(-5, 5), translate=(0.1, 0.1), scale=(1, 1.2), shear=(-5, 5)),
                T.ToTensor()
            ])),
        ])
        dataset_valid = SampleDataset(dataset_valid, 50)
        dataloader_valid = DataLoader(dataset_valid,
                                      pin_memory=True,
                                      drop_last=True,
                                      batch_size=args.batch_size // distributed_num_gpus,
                                      num_workers=args.num_workers // distributed_num_gpus)
    
    # Model
    model = MattingRefine(args.model_backbone,
                          args.model_backbone_scale,
                          args.model_refine_mode,
                          args.model_refine_sample_pixels,
                          args.model_refine_thresholding,
                          args.model_refine_kernel_size).to(rank)
    # SyncBatchNorm keeps BN statistics consistent across ranks under DDP.
    model = nn.SyncBatchNorm.convert_sync_batchnorm(model)
    model_distributed = nn.parallel.DistributedDataParallel(model, device_ids=[rank])
    
    if args.model_last_checkpoint is not None:
        load_matched_state_dict(model, torch.load(args.model_last_checkpoint))

    # Per-module learning rates.
    optimizer = Adam([
        {'params': model.backbone.parameters(), 'lr': 5e-5},
        {'params': model.aspp.parameters(), 'lr': 5e-5},
        {'params': model.decoder.parameters(), 'lr': 1e-4},
        {'params': model.refiner.parameters(), 'lr': 3e-4},
    ])
    scaler = GradScaler()
    
    # Logging and checkpoints
    if rank == 0:
        if not os.path.exists(f'checkpoint/{args.model_name}'):
            os.makedirs(f'checkpoint/{args.model_name}')
        writer = SummaryWriter(f'log/{args.model_name}')
    
    # Run loop
    for epoch in range(args.epoch_start, args.epoch_end):
        for i, ((true_pha, true_fgr), true_bgr) in enumerate(tqdm(dataloader_train)):
            step = epoch * len(dataloader_train) + i

            true_pha = true_pha.to(rank, non_blocking=True)
            true_fgr = true_fgr.to(rank, non_blocking=True)
            true_bgr = true_bgr.to(rank, non_blocking=True)
            true_pha, true_fgr, true_bgr = random_crop(true_pha, true_fgr, true_bgr)
            
            true_src = true_bgr.clone()
            
            # Augment with shadow
            # ~30% of samples get a blurred, affine-jittered copy of the
            # matte subtracted from the background to fake a cast shadow.
            aug_shadow_idx = torch.rand(len(true_src)) < 0.3
            if aug_shadow_idx.any():
                aug_shadow = true_pha[aug_shadow_idx].mul(0.3 * random.random())
                aug_shadow = T.RandomAffine(degrees=(-5, 5), translate=(0.2, 0.2), scale=(0.5, 1.5), shear=(-5, 5))(aug_shadow)
                aug_shadow = kornia.filters.box_blur(aug_shadow, (random.choice(range(20, 40)),) * 2)
                true_src[aug_shadow_idx] = true_src[aug_shadow_idx].sub_(aug_shadow).clamp_(0, 1)
                del aug_shadow
            del aug_shadow_idx
            
            # Composite foreground onto source
            true_src = true_fgr * true_pha + true_src * (1 - true_pha)

            # Augment with noise
            aug_noise_idx = torch.rand(len(true_src)) < 0.4
            if aug_noise_idx.any():
                true_src[aug_noise_idx] = true_src[aug_noise_idx].add_(torch.randn_like(true_src[aug_noise_idx]).mul_(0.03 * random.random())).clamp_(0, 1)
                true_bgr[aug_noise_idx] = true_bgr[aug_noise_idx].add_(torch.randn_like(true_bgr[aug_noise_idx]).mul_(0.03 * random.random())).clamp_(0, 1)
            del aug_noise_idx
            
            # Augment background with jitter
            # Only the *reference* background is jittered/shifted below, so
            # the model learns robustness to bgr/src misalignment.
            aug_jitter_idx = torch.rand(len(true_src)) < 0.8
            if aug_jitter_idx.any():
                true_bgr[aug_jitter_idx] = kornia.augmentation.ColorJitter(0.18, 0.18, 0.18, 0.1)(true_bgr[aug_jitter_idx])
            del aug_jitter_idx
            
            # Augment background with affine
            aug_affine_idx = torch.rand(len(true_bgr)) < 0.3
            if aug_affine_idx.any():
                true_bgr[aug_affine_idx] = T.RandomAffine(degrees=(-1, 1), translate=(0.01, 0.01))(true_bgr[aug_affine_idx])
            del aug_affine_idx
            
            # AMP: forward under autocast, then scale-backward / step / update.
            with autocast():
                pred_pha, pred_fgr, pred_pha_sm, pred_fgr_sm, pred_err_sm, _ = model_distributed(true_src, true_bgr)
                loss = compute_loss(pred_pha, pred_fgr, pred_pha_sm, pred_fgr_sm, pred_err_sm, true_pha, true_fgr)

            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad()

            if rank == 0:
                if (i + 1) % args.log_train_loss_interval == 0:
                    writer.add_scalar('loss', loss, step)

                if (i + 1) % args.log_train_images_interval == 0:
                    writer.add_image('train_pred_pha', make_grid(pred_pha, nrow=5), step)
                    writer.add_image('train_pred_fgr', make_grid(pred_fgr, nrow=5), step)
                    writer.add_image('train_pred_com', make_grid(pred_fgr * pred_pha, nrow=5), step)
                    writer.add_image('train_pred_err', make_grid(pred_err_sm, nrow=5), step)
                    writer.add_image('train_true_src', make_grid(true_src, nrow=5), step)

                # NOTE(review): these ``del``s sit inside the ``rank == 0``
                # block, so non-zero ranks keep the batch tensors alive until
                # the next iteration rebinds them — confirm intended.
                del true_pha, true_fgr, true_src, true_bgr
                del pred_pha, pred_fgr, pred_pha_sm, pred_fgr_sm, pred_err_sm

                if (i + 1) % args.log_valid_interval == 0:
                    valid(model, dataloader_valid, writer, step)

                if (step + 1) % args.checkpoint_interval == 0:
                    # Saving the plain module's state_dict (not the DDP
                    # wrapper) keeps checkpoints free of the 'module.' prefix.
                    torch.save(model.state_dict(), f'checkpoint/{args.model_name}/epoch-{epoch}-iter-{step}.pth')
                    
        if rank == 0:
            torch.save(model.state_dict(), f'checkpoint/{args.model_name}/epoch-{epoch}.pth')
            
    # Clean up
    dist.destroy_process_group()
# --- Esempio n. 18 (score 0): scraped separator between unrelated snippets ---
        label = (torch.ones(batch_size) * 0.1).to(device)

        output = netD(fake.detach()).reshape(-1)
        lossD_fake = criterion(output, label)

        lossD = lossD_real + lossD_fake
        lossD.backward()
        optimizerD.step()

        #         Train Generator: max_log(D(G(z)))
        netG.zero_grad()
        label = torch.ones(batch_size).to(device)
        output = netD(fake).reshape(-1)
        lossG = criterion(output, label)
        lossG.backward()
        optimizerG.step()

        if batch_idx % 100 == 0:
            print(
                f'Epoch [{epoch}/{num_epochs}] Batch {batch_idx}/{len(dataloader)} \
                  loss D: {lossD:.4f}, {lossG:.4f} D(x): {D_x:.4f}')

            with torch.no_grad():
                fake = netG(fixed_noise)
                img_grid_real = torchvision.utils.make_grid(data[:32],
                                                            normalize=True)
                img_grid_fake = torchvision.utils.make_grid(fake[:32],
                                                            normalize=True)
                writer_real.add_image('MNIST real images', img_grid_real)
                writer_real.add_image('MNIST fake images', img_grid_fake)
    # loop through all generated dataloaders with adversarial images
    results_dict = {}
    for attack_name in adv_dict:

        # measure attack success
        print("Testing performance of attack {}: ".format(attack_name))
        for epsilon_attack, epsilon in zip(adv_dict[attack_name], epsilons):
            attacked_acc, _ = validate(epsilon_attack, model, criterion, 1,
                                       args)

            # save adv images for visualization purposes
            dataiter = iter(epsilon_attack)
            images, _ = dataiter.next()
            img_grid = utils.make_grid(images)
            summary.add_image(
                "Training Images Adversarially Attacked Using {} with eps {}".
                format(attack_name, epsilon), img_grid)

            print("Generating defences for attack {} with eps {}: ".format(
                attack_name, epsilon))

            def_adv_dict = gen_defences(epsilon_attack, attack_name,
                                        defence_list)
            accuracies = {
                'initial': initial_acc.item(),
                'attacked': attacked_acc.item()
            }

            if 'adv_retraining' in args.defences:
                # evaluate retrained model
                acc1, val_loss = validate(epsilon_attack, robust_model,
# --- Esempio n. 20 (score 0): scraped separator between unrelated snippets ---
class Evaluator(object):
    """Calibration toy experiment: learn per-class temperature weights.

    Instead of evaluating a segmentation model, ``eval`` fits a linear
    remapping (``temp_weights`` matrix + ``temp_bias``) of class logits by
    minimising a CCE calibration loss on synthetic, hard-coded outputs,
    logging tables/maps/histograms to TensorBoard along the way. The real
    model-evaluation path is left commented out throughout.
    """
    def __init__(self, args):
        # args must provide .device and .distributed; dataset/loader settings
        # come from the module-level ``cfg``.
        self.args = args
        self.device = torch.device(args.device)

        # image transform
        input_transform = transforms.Compose([
            transforms.ToTensor(),
            transforms.Normalize(cfg.DATASET.MEAN, cfg.DATASET.STD),
        ])
        # Learning rate for the temperature-weight SGD in eval(); also baked
        # into the run prefix used for the TensorBoard log dir.
        self.lr = 2.5
        self.prefix = f"2_boxes_3_7={self.lr}"
        # self.prefix = f"overfit__count_toy_experiment_3class_7_2_1_conf_loss=total_xavier_weights_xavier_bias_lr={self.lr}"
        self.writer = SummaryWriter(log_dir=f"cce_toy_logs/{self.prefix}")
        # self.writer = SummaryWriter(log_dir= f"cce_cityscapes_logs/{self.prefix}")
        # dataset and dataloader
        val_dataset = get_segmentation_dataset(cfg.DATASET.NAME,
                                               split='val',
                                               mode='testval',
                                               transform=input_transform)
        # val_sampler = make_data_sampler(val_dataset, False, args.distributed)
        self.val_loader = data.DataLoader(dataset=val_dataset,
                                          shuffle=True,
                                          batch_size=cfg.TEST.BATCH_SIZE,
                                          drop_last=True,
                                          num_workers=cfg.DATASET.WORKERS,
                                          pin_memory=True)

        self.dataset = val_dataset
        self.classes = val_dataset.classes
        self.metric = SegmentationMetric(val_dataset.num_class,
                                         args.distributed)

        # self.model = get_segmentation_model().to(self.device)

        # if hasattr(self.model, 'encoder') and hasattr(self.model.encoder, 'named_modules') and \
        #     cfg.MODEL.BN_EPS_FOR_ENCODER:
        #     logging.info('set bn custom eps for bn in encoder: {}'.format(cfg.MODEL.BN_EPS_FOR_ENCODER))
        #     self.set_batch_norm_attr(self.model.encoder.named_modules(), 'eps', cfg.MODEL.BN_EPS_FOR_ENCODER)

        # if args.distributed:
        #     self.model = nn.parallel.DistributedDataParallel(self.model,
        #         device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True)

        # self.model.to(self.device)

    def set_batch_norm_attr(self, named_modules, attr, value):
        """Set ``attr`` to ``value`` on every (Sync)BatchNorm2d in the modules."""
        for m in named_modules:
            if isinstance(m[1], nn.BatchNorm2d) or isinstance(
                    m[1], nn.SyncBatchNorm):
                setattr(m[1], attr, value)

    def eval(self):
        """Fit temperature weights/bias by SGD against the CCE loss.

        Runs ``num_epochs`` passes over the validation loader using
        synthetic two-class logits, logging per-class probability maps,
        calibration tables and loss scalars each epoch, and periodically
        saving the learned weights to disk.
        """
        self.metric.reset()
        print(f"Length of classes: {len(self.classes)}")
        temp_weights = torch.eye(len(self.classes), device="cuda")
        # xavier_uniform_ overwrites the identity initialisation in place.
        torch.nn.init.xavier_uniform_(temp_weights, gain=1.0)
        print(temp_weights)
        temp_weights.requires_grad = True
        # temp_weights.requires_grad= True
        temp_bias = torch.zeros(len(self.classes), device="cuda")
        # torch.nn.init.xavier_uniform_(temp_bias, gain=1.0)
        temp_bias.requires_grad = True
        # temp_weights = torch.rand(len(self.classes), len(self.classes), device="cuda", requires_grad=True)
        # temp_bias = torch.rand(len(self.classes), device="cuda", requires_grad=True)

        logging.info(
            "Start training of temprature weights, Total sample: {:d}".format(
                len(self.val_loader)))

        cce_criterion = CCELoss(len(self.classes)).to(self.device)
        cross_criterion = torch.nn.CrossEntropyLoss(ignore_index=-1)
        # The only trainable parameters are the temperature weights and bias.
        optimizer = torch.optim.SGD([temp_weights, temp_bias], lr=self.lr)
        import time
        time_start = time.time()
        num_epochs = 300
        for epoch in range(num_epochs):
            eceEvaluator_perimage = perimageCCE(n_classes=len(self.classes))
            epoch_loss_cce_total = 0
            epoch_loss_cross_entropy_total = 0
            epoch_loss_total = 0
            for i, (images, targets, filenames) in enumerate(self.val_loader):
                # import pdb; pdb.set_trace()
                optimizer.zero_grad()

                images = images.to(self.device)
                targets = targets.to(self.device)

                # print(image.shape)
                with torch.no_grad():
                    # outputs = model.evaluate(images)

                    # Synthetic toy outputs replace the real model: a fixed
                    # 2-class probability field split at column 200.
                    # outputs = torch.rand(1,3,300,400)
                    outputs = torch.ones(1, 2, 300, 400) * (torch.Tensor(
                        [0.3, 0.7]).reshape(1, -1, 1, 1))
                    # outputs = torch.ones(1,4,300,400)*(torch.Tensor([0.5,0.25,0.15, 0.1]).reshape(1,-1,1,1))
                    outputs = outputs.cuda()
                    outputs[0, 0, :, :200] = 0.7
                    outputs[0, 1, :, 200:] = 0.3

                    # outputs = torch.ones(1,3,300,400)*(torch.Tensor([0.7,0.2,0.1]).reshape(1,-1,1,1))
                    # # outputs = torch.ones(1,4,300,400)*(torch.Tensor([0.5,0.25,0.15, 0.1]).reshape(1,-1,1,1))
                    # outputs = outputs.cuda()
                    # outputs[0,0,100:200, 50:150] = 0.1
                    # outputs[0,0,100:150, 250:300] = 0.2
                    # outputs[0,1,100:200, 50:150] = 0.7
                    # outputs[0,1,100:150, 250:300] = 0.1
                    # outputs[0,2,100:200, 50:150] = 0.2
                    # outputs[0,2,100:150, 250:300] = 0.7

                    # Converting back to logits
                    outputs = torch.log(outputs)

                # Apply the learnable temperature map in channels-last layout,
                # then restore NCHW for the losses.
                outputs = outputs.permute(0, 2, 3, 1).contiguous()
                outputs = torch.matmul(outputs, temp_weights)
                outputs = outputs + temp_bias

                outputs = outputs.permute(0, 3, 1, 2).contiguous()

                # Add image stuff
                save_imgs = torch.softmax(outputs, dim=1).squeeze(0)
                # analyse(outputs = save_imgs.unsqueeze(0))
                # accuracy(outputs = outputs)
                # Heavy per-class logging: each class map is rendered with
                # matplotlib, written to disk, re-read with cv2 and sent to
                # TensorBoard — this runs for every batch of every epoch.
                for class_no, class_distri in enumerate(save_imgs):
                    plt.clf()
                    # Pin two corner pixels to 0/1 — presumably to fix the
                    # colormap scale across frames; TODO confirm.
                    class_distri[0][0] = 0
                    class_distri[0][1] = 1

                    im = plt.imshow(class_distri.detach().cpu().numpy(),
                                    cmap="Greens")
                    plt.colorbar(im)
                    plt.savefig("temp_files/temp.jpg")
                    plt.clf()
                    import cv2
                    img_dif = cv2.imread("temp_files/temp.jpg")

                    self.writer.add_image(f"Class_{class_no}",
                                          img_dif,
                                          epoch,
                                          dataformats="HWC")

                loss_cce = cce_criterion.forward(outputs, targets)
                loss_cross_entropy = cross_criterion.forward(outputs, targets)

                # alpha = 0 disables the cross-entropy term; only CCE trains.
                alpha = 0
                total_loss = loss_cce + alpha * loss_cross_entropy

                epoch_loss_cce_total += loss_cce.item()
                epoch_loss_cross_entropy_total += loss_cross_entropy.item()
                epoch_loss_total += total_loss.item()

                total_loss.backward()
                optimizer.step()

                with torch.no_grad():
                    for output, target in zip(outputs, targets.detach()):
                        # older ece requires softmax and size output=[class,w,h] target=[w,h]
                        eceEvaluator_perimage.update(output.softmax(dim=0),
                                                     target)
                # print(outputs.shape)
                # print(eceEvaluator_perimage.get_overall_CCELoss())
                print(
                    f"batch :{i+1}/{len(self.val_loader)}" +
                    "loss cce : {:.5f} | loss cls : {:.5f} | loss tot : {:.5f}"
                    .format(loss_cce, loss_cross_entropy, total_loss))

            print(temp_weights)
            print(temp_bias)
            epoch_loss_cce_total /= len(self.val_loader)
            epoch_loss_cross_entropy_total /= len(self.val_loader)
            epoch_loss_total /= len(self.val_loader)

            count_table_image, _ = eceEvaluator_perimage.get_count_table_img(
                self.classes)
            cce_table_image, dif_map = eceEvaluator_perimage.get_perc_table_img(
                self.classes)
            self.writer.add_image("CCE_table",
                                  cce_table_image,
                                  epoch,
                                  dataformats="HWC")
            self.writer.add_image("Count table",
                                  count_table_image,
                                  epoch,
                                  dataformats="HWC")
            self.writer.add_image("DifMap", dif_map, epoch, dataformats="HWC")

            # NOTE(review): these f-strings contain no placeholders — the tags
            # are effectively constant strings.
            self.writer.add_scalar(f"Cross EntropyLoss_LR",
                                   epoch_loss_cross_entropy_total, epoch)
            self.writer.add_scalar(f"CCELoss_LR", epoch_loss_cce_total, epoch)
            self.writer.add_scalar(f"Total Loss_LR", epoch_loss_total, epoch)
            self.writer.add_histogram("Weights", temp_weights, epoch)
            self.writer.add_histogram("Bias", temp_bias, epoch)
            # output = output/temp_weights
            # print(output.shape)
            # print(temp_weights, temp_bias)

            # Persist the learned calibration every 10 epochs.
            if epoch > 0 and epoch % 10 == 0:
                print("saving weights.")
                np.save("weights/toy/wt_{}_{}.npy".format(epoch, self.prefix),
                        temp_weights.cpu().detach().numpy())
                np.save("weights/toy/b{}_{}.npy".format(epoch, self.prefix),
                        temp_bias.cpu().detach().numpy())

            # print("epoch {} : loss {:.5f}".format(epoch, epoch_loss))
            # import pdb; pdb.set_trace()

        self.writer.close()
# --- Esempio n. 21 (score 0): scraped separator between unrelated snippets ---
    # Create the dataset object for example with the "NIC_v2 - 79 benchmark"
    # and assuming the core50 location in ~/core50/128x128/
    dataset = CORE50(root='/home/akash/core50/data/core50_128x128',
                     scenario="ni",
                     task_type='segment')
    writer = SummaryWriter()

    # Get the fixed test set
    # test_x, test_y = dataset.get_test_set()

    # loop over the training incremental batches
    for i, train_batch in enumerate(dataset):
        # WARNING train_batch is NOT a mini-batch, but one incremental batch!
        # You can later train with SGD indexing train_x and train_y properly.
        train_x, train_y, t = train_batch

        print("----------- batch {0} -------------".format(i))
        print("train_x shape: {}, train_y shape: {}, task: {}".format(
            train_x.shape, train_y['mask'].shape, t))
        if (train_x.shape[0] > 0):
            print("TASK NOT EMPTY")
            img_1 = train_y['mask'][0, :, :]
            writer.add_image('task_img_' + str(t),
                             train_x[0, :, :, :],
                             dataformats='HWC')
            writer.add_image('task_seg_' + str(t), img_1, dataformats='HW')

        # use the data
        pass
# --- Esempio n. 22 (score 0): scraped separator between unrelated snippets ---
class TeeTrainer(Trainer):
    """Trainer for a segmentation task.

    Handles model construction (with a hard-coded warm start), optional
    DistributedDataParallel setup, RAdam optimization with
    ReduceLROnPlateau scheduling, per-epoch train/validation loops,
    TensorBoard logging, and best-accuracy checkpointing.
    """
    def __init__(self, model, train_set, val_set, configs):
        # `model` is a class/factory, not an instance: it is instantiated
        # below with in_channels/num_classes taken from `configs`.
        super().__init__()
        print("Start trainer..")
        # load config
        self._configs = configs
        self._lr = self._configs["lr"]
        self._batch_size = self._configs["batch_size"]
        self._momentum = self._configs["momentum"]
        self._weight_decay = self._configs["weight_decay"]
        self._distributed = self._configs["distributed"]
        self._num_workers = self._configs["num_workers"]
        self._device = torch.device(self._configs["device"])
        self._max_epoch_num = self._configs["max_epoch_num"]
        self._max_plateau_count = self._configs["max_plateau_count"]

        # load dataloader and model
        self._train_set = train_set
        self._val_set = val_set
        self._model = model(
            in_channels=configs["in_channels"],
            num_classes=configs["num_classes"],
        )
        # NOTE(review): hard-coded warm-start checkpoint path; fails if the
        # file is absent — consider making it configurable.
        self._model.load_state_dict(
            torch.load("saved/checkpoints/mixed.test")["net"])

        print(self._configs)
        self._model = self._model.to(self._device)

        if self._distributed == 1:
            # Multi-GPU path: NCCL process group + DDP wrapper; workers are
            # re-seeded per worker id for reproducible augmentation.
            torch.distributed.init_process_group(backend="nccl")
            self._model = nn.parallel.DistributedDataParallel(
                self._model, find_unused_parameters=True)
            self._train_loader = DataLoader(
                self._train_set,
                batch_size=self._batch_size,
                num_workers=self._num_workers,
                pin_memory=True,
                shuffle=True,
                worker_init_fn=lambda x: np.random.seed(x),
            )
            self._val_loader = DataLoader(
                self._val_set,
                batch_size=self._batch_size,
                num_workers=self._num_workers,
                pin_memory=True,
                shuffle=False,
                worker_init_fn=lambda x: np.random.seed(x),
            )
        else:
            # Single-process path.
            self._train_loader = DataLoader(
                self._train_set,
                batch_size=self._batch_size,
                num_workers=self._num_workers,
                pin_memory=True,
                shuffle=True,
                # worker_init_fn=lambda x: np.random.seed(x)
            )
            self._val_loader = DataLoader(
                self._val_set,
                batch_size=self._batch_size,
                num_workers=self._num_workers,
                pin_memory=True,
                shuffle=False,
                # worker_init_fn=lambda x: np.random.seed(x)
            )

        # define loss function (criterion) and optimizer
        # class_weights = torch.FloatTensor(np.array([0.3, 0.7])).to(self._device)
        self._criterion = nn.CrossEntropyLoss().to(self._device)

        self._optimizer = RAdam(
            params=self._model.parameters(),
            lr=self._lr,
            weight_decay=self._weight_decay,
        )

        # LR is reduced when the validation loss plateaus.
        self._scheduler = ReduceLROnPlateau(
            self._optimizer,
            patience=self._configs["plateau_patience"],
            verbose=True)

        # training info
        self._start_time = datetime.datetime.now()
        self._start_time = self._start_time.replace(microsecond=0)

        # One TensorBoard run directory per (model name, start time).
        log_dir = os.path.join(
            self._configs["cwd"],
            self._configs["log_dir"],
            "{}_{}".format(self._configs["model_name"], str(self._start_time)),
        )

        self._writer = SummaryWriter(log_dir)
        # Per-epoch histories and early-stopping bookkeeping.
        self._train_loss = []
        self._train_acc = []
        self._val_loss = []
        self._val_acc = []
        self._best_loss = 1e9
        self._best_acc = 0
        self._plateau_count = 0
        self._current_epoch_num = 0

    def reset(self):
        """reset trainer"""
        pass

    def _train(self):
        """Run one training epoch; append averaged loss/acc to history."""
        self._model.train()
        train_loss = 0.0
        train_acc = 0.0

        for i, (images, targets) in tqdm(enumerate(self._train_loader),
                                         total=len(self._train_loader),
                                         leave=False):
            # NOTE(review): uses .cuda() directly even though the device is
            # configurable via configs["device"] — confirm CPU runs work.
            images = images.cuda(non_blocking=True)
            targets = targets.cuda(non_blocking=True)

            # compute output, measure accuracy and record loss
            outputs = self._model(images)

            loss = self._criterion(outputs, targets)
            acc = accuracy(outputs, targets)[0]
            # acc = eval_metrics(targets, outputs, 2)[0]

            train_loss += loss.item()
            train_acc += acc.item()

            # compute gradient and do SGD step
            self._optimizer.zero_grad()
            loss.backward()
            self._optimizer.step()

            # log
            if i == 0:
                # Log the first batch of input images once per epoch.
                grid = torchvision.utils.make_grid(images)
                self._writer.add_image("images", grid, 0)
                # self._writer.add_graph(self._model, images)
                # self._writer.close()

            if self._configs["little"] == 1:
                # Debug mode: dump channel 1 of the raw outputs as an image.
                mask = torch.squeeze(outputs, 0)
                mask = mask.detach().cpu().numpy() * 255
                mask = np.transpose(mask, (1, 2, 0)).astype(np.uint8)
                cv2.imwrite(
                    os.path.join("debug",
                                 "e{}.png".format(self._current_epoch_num)),
                    mask[..., 1],
                )

        # Convert the last enumerate index into a batch count before
        # averaging.  NOTE(review): raises NameError if the loader is empty.
        i += 1
        self._train_loss.append(train_loss / i)
        self._train_acc.append(train_acc / i)

    def _val(self):
        """Run one validation epoch; append averaged loss/acc to history."""
        self._model.eval()
        val_loss = 0.0
        val_acc = 0.0

        # Clear previous debug dumps (shell-dependent).
        os.system("rm -rf debug/*")
        for i, (images, targets) in tqdm(enumerate(self._val_loader),
                                         total=len(self._val_loader),
                                         leave=False):
            images = images.cuda(non_blocking=True)
            targets = targets.cuda(non_blocking=True)

            # compute output, measure accuracy and record loss
            # NOTE(review): no torch.no_grad() here, so autograd state is
            # built during validation — confirm this is intentional.
            outputs = self._model(images)

            loss = self._criterion(outputs, targets)
            acc = accuracy(outputs, targets)[0]
            # acc = eval_metrics(targets, outputs, 2)[0]

            val_loss += loss.item()
            val_acc += acc.item()

            # debug time
            # NOTE(review): the format below embeds a *tensor* in the
            # directory name and writes a CHW array with cv2 — looks like
            # leftover debug code; verify before relying on these files.
            outputs = torch.squeeze(outputs, dim=0)
            outputs = torch.argmax(outputs, dim=0)
            tmp_image = torch.squeeze(images, dim=0)
            print(tmp_image.shape)
            tmp_image = tmp_image.cpu().numpy()
            cv2.imwrite("debug/{}/{}.png".format(outputs, i), tmp_image)

        # Same count-from-index pattern as _train (NameError if loader empty).
        i += 1
        self._val_loss.append(val_loss / i)
        self._val_acc.append(val_acc / i)

    def train(self):
        """make a training job"""
        # Alternate train/val epochs until accuracy plateaus too long or the
        # epoch budget is exhausted (see _is_stop).
        while not self._is_stop():
            self._train()
            self._val()

            self._update_training_state()
            self._logging()
            self._increase_epoch_num()

        self._writer.close()  # be careful with this line of code

    def _update_training_state(self):
        """Checkpoint on new best val accuracy; otherwise count a plateau.

        The LR scheduler is stepped on validation *loss* every epoch.
        """
        if self._val_acc[-1] > self._best_acc:
            self._save_weights()
            self._plateau_count = 0
            self._best_acc = self._val_acc[-1]
            self._best_loss = self._val_loss[-1]
        else:
            self._plateau_count += 1
        self._scheduler.step(self._val_loss[-1])

    def _logging(self):
        """Print a one-line epoch summary and push scalars to TensorBoard."""
        # TODO: save message to log file, tensorboard then
        # [:-7] strips the microseconds from the elapsed-time string.
        consume_time = str(datetime.datetime.now() - self._start_time)

        message = "\nE{:03d}  {:.3f}/{:.3f}/{:.3f} {:.3f}/{:.3f}/{:.3f} | p{:02d}  Time {}\n".format(
            self._current_epoch_num,
            self._train_loss[-1],
            self._val_loss[-1],
            self._best_loss,
            self._train_acc[-1],
            self._val_acc[-1],
            self._best_acc,
            self._plateau_count,
            consume_time[:-7],
        )

        self._writer.add_scalar("Accuracy/train", self._train_acc[-1],
                                self._current_epoch_num)
        self._writer.add_scalar("Accuracy/val", self._val_acc[-1],
                                self._current_epoch_num)
        self._writer.add_scalar("Loss/train", self._train_loss[-1],
                                self._current_epoch_num)
        self._writer.add_scalar("Loss/val", self._val_loss[-1],
                                self._current_epoch_num)

        print(message)

    def _is_stop(self):
        """check stop condition"""
        return (self._plateau_count > self._max_plateau_count
                or self._current_epoch_num > self._max_epoch_num)

    def _increase_epoch_num(self):
        """Advance the epoch counter by one."""
        self._current_epoch_num += 1

    def _store_trainer(self):
        """store config, training info and traning result to file"""
        pass

    def _save_weights(self):
        """save checkpoint"""
        # Unwrap the DDP module so the saved state_dict has plain key names.
        if self._distributed == 0:
            state_dict = self._model.state_dict()
        else:
            state_dict = self._model.module.state_dict()
        # The checkpoint embeds the full config alongside weights and bests.
        state = {
            **self._configs,
            "net": state_dict,
            "best_loss": self._best_loss,
            "best_acc": self._best_acc,
        }

        checkpoint_dir = os.path.join(self._configs["cwd"],
                                      "saved/checkpoints")

        if not os.path.exists(checkpoint_dir):
            os.makedirs(checkpoint_dir, exist_ok=True)

        torch.save(state,
                   os.path.join(checkpoint_dir, self._configs["model_name"]))
Esempio n. 23
0
def main():
    """MNIST training entry point.

    Parses CLI arguments, builds the data loaders, model, loss,
    optimizer and LR scheduler, runs the train/test loop while logging
    to TensorBoard, and optionally saves the trained weights.

    Bug fix: the ``--optimizer`` choices were misspelled
    (``"raner"``/``"ranerqh"``) and did not match the names the
    dispatch below tests (``"ranger"``/``"rangerqh"``), making those
    optimizers unselectable and leaving ``optimizer`` unbound for any
    non-SGD choice.
    """
    # Training settings
    parser = argparse.ArgumentParser(description="PyTorch MNIST Example")
    parser.add_argument(
        "--batch-size",
        type=int,
        default=64,
        metavar="N",
        help="input batch size for training (default: 64)",
    )
    parser.add_argument(
        "--test-batch-size",
        type=int,
        default=1000,
        metavar="N",
        help="input batch size for testing (default: 1000)",
    )
    parser.add_argument(
        "--epochs",
        type=int,
        default=10,
        metavar="N",
        help="number of epochs to train (default: 10)",
    )
    parser.add_argument(
        "--lr",
        type=float,
        default=0.01,
        metavar="LR",
        help="learning rate (default: 0.01)",
    )
    parser.add_argument(
        "--momentum",
        type=float,
        default=0.5,
        metavar="M",
        help="SGD momentum (default: 0.5)",
    )
    parser.add_argument(
        "--no-cuda", action="store_true", default=False, help="disables CUDA training"
    )
    parser.add_argument(
        "--seed", type=int, default=1, metavar="S", help="random seed (default: 1)"
    )
    parser.add_argument(
        "--log-interval",
        type=int,
        default=100,
        metavar="N",
        help="how many batches to wait before logging training status",
    )

    parser.add_argument(
        "--save-model",
        action="store_true",
        default=False,
        help="For Saving the current Model",
    )

    # Fixed: choices now match the names checked below, so argparse
    # rejects anything the dispatch cannot handle.
    parser.add_argument(
        "--optimizer",
        default="sgd",
        choices=["ranger", "rangerqh", "sgd"],
        help="choose optimizer from choices",
    )

    parser.add_argument(
        "--sa", action="store_true", help="use self attention module",
    )

    parser.add_argument(
        "--mish", action="store_true", help="use Mish activate function"
    )

    parser.add_argument("--smooth", default=None, help="put float to smooth or sce")

    parser.add_argument("--gp", action="store_true", help="use global pooling")

    parser.add_argument("--fpa", action="store_true", help="use fpa scheduler")

    args = parser.parse_args()

    # Tensorboard
    writer = SummaryWriter()

    torch.manual_seed(args.seed)
    use_cuda = not args.no_cuda and torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")

    kwargs = {"num_workers": 1, "pin_memory": True} if use_cuda else {}
    train_loader = torch.utils.data.DataLoader(
        datasets.MNIST(
            "../data",
            train=True,
            download=True,
            transform=transforms.Compose(
                [transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,))]
            ),
        ),
        batch_size=args.batch_size,
        shuffle=True,
        **kwargs
    )
    test_loader = torch.utils.data.DataLoader(
        datasets.MNIST(
            "../data",
            train=False,
            transform=transforms.Compose(
                [transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,))]
            ),
        ),
        batch_size=args.test_batch_size,
        shuffle=True,
        **kwargs
    )

    model = Net(args.sa, args.gp, args.mish).to(device)

    # choose loss function
    if args.smooth is None:
        print("use CrossEntropy Loss")
        criterion = torch.nn.CrossEntropyLoss()
    else:
        print("use LabelSmoothing Loss")
        criterion = LabelSmoothingLoss(smoothing=float(args.smooth))

    # choose optimizer (argparse `choices` guarantees one branch matches)
    if args.optimizer == "sgd":
        print("use Momentum SGD optimizer")
        optimizer = optim.SGD(model.parameters(), lr=args.lr, momentum=args.momentum)
    elif args.optimizer == "ranger":
        print("use Ranger optimizer")
        optimizer = Ranger(model.parameters(), lr=args.lr)
    elif args.optimizer == "rangerqh":
        print("use RangerQH optimizer")
        optimizer = RangerQH(model.parameters(), lr=args.lr)
    else:
        raise ValueError("unknown optimizer: {}".format(args.optimizer))

    # choose LR scheduler
    if args.fpa:
        print("use FlatplusAnneal scheduler")
        scheduler = FlatplusAnneal(optimizer, max_iter=args.epochs, step_size=0.7)
    else:
        print("use StepLR scheduler")
        scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=args.epochs // 3)

    for epoch in range(1, args.epochs + 1):
        train_loss = train(
            args, model, device, train_loader, optimizer, epoch, criterion
        )
        test_loss, test_acc = test(args, model, device, test_loader, criterion)
        scheduler.step()
        # NOTE(review): get_lr() is deprecated in newer torch in favor of
        # get_last_lr(); kept for compatibility with the pinned version.
        writer.add_scalar("lr", scheduler.get_lr()[0], epoch)
        writer.add_scalars("loss", {"train": train_loss, "test": test_loss}, epoch)
        writer.add_scalar("acc/test", test_acc, epoch)

    if args.save_model:
        torch.save(model.state_dict(), "mnist_cnn.pt")

    # tfboard
    # NOTE(review): `images` stays on CPU while `model` may be on CUDA —
    # add_graph may fail on GPU runs; confirm.
    images, labels = next(iter(train_loader))
    grid = utils.make_grid(images)
    writer.add_image("images", grid, 0)
    writer.add_graph(model, images)
    writer.close()
Esempio n. 24
0
def train_gssoft(args):
    """Train a GSSOFT (Gumbel-Softmax discrete VAE) model on CIFAR-10.

    Builds the model from CLI args, optionally resumes from a
    checkpoint, then runs full train/eval epochs: optimizes the ELBO,
    logs per-epoch logp/KL/ELBO/bits-per-dim/perplexity to TensorBoard,
    checkpoints every 25k steps, and logs reconstructions after each
    eval pass.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    model = GSSOFT(args.channels, args.latent_dim, args.num_embeddings,
                   args.embedding_dim)
    model.to(device)

    # Run name encodes the architecture hyper-parameters.
    model_name = "{}_C_{}_N_{}_M_{}_D_{}".format(args.model, args.channels,
                                                 args.latent_dim,
                                                 args.num_embeddings,
                                                 args.embedding_dim)

    checkpoint_dir = Path(model_name)
    checkpoint_dir.mkdir(parents=True, exist_ok=True)

    writer = SummaryWriter(log_dir=Path("runs") / model_name)

    optimizer = optim.Adam(model.parameters(), lr=args.learning_rate)

    if args.resume is not None:
        # Resume both model and optimizer state; map to CPU first so the
        # checkpoint loads regardless of the device it was saved on.
        print("Resume checkpoint from: {}:".format(args.resume))
        checkpoint = torch.load(args.resume,
                                map_location=lambda storage, loc: storage)
        model.load_state_dict(checkpoint["model"])
        optimizer.load_state_dict(checkpoint["optimizer"])
        global_step = checkpoint["step"]
    else:
        global_step = 0

    # `shift` presumably recenters pixels (e.g. to [-0.5, 0.5]) — the
    # inverse mapping below assumes so; TODO confirm against its definition.
    transform = transforms.Compose(
        [transforms.ToTensor(),
         transforms.Lambda(shift)])
    training_dataset = datasets.CIFAR10("./CIFAR10",
                                        train=True,
                                        download=True,
                                        transform=transform)

    test_dataset = datasets.CIFAR10("./CIFAR10",
                                    train=False,
                                    download=True,
                                    transform=transform)

    training_dataloader = DataLoader(training_dataset,
                                     batch_size=args.batch_size,
                                     shuffle=True,
                                     num_workers=args.num_workers,
                                     pin_memory=True)

    test_dataloader = DataLoader(test_dataset,
                                 batch_size=64,
                                 shuffle=True,
                                 drop_last=True,
                                 num_workers=args.num_workers,
                                 pin_memory=True)

    # Translate the step budget into whole epochs (and skip already-run
    # epochs when resuming).
    num_epochs = args.num_training_steps // len(training_dataloader) + 1
    start_epoch = global_step // len(training_dataloader) + 1

    # Dimensionality of one image (3x32x32) — normalizes losses per-dim.
    N = 3 * 32 * 32

    for epoch in range(start_epoch, num_epochs + 1):
        model.train()
        average_logp = average_KL = average_elbo = average_bpd = average_perplexity = 0
        for i, (images, _) in enumerate(tqdm(training_dataloader), 1):
            images = images.to(device)

            dist, KL, perplexity = model(images)
            # Map the shifted float images back to integer pixel targets
            # in [0, 255] for the discretized likelihood.
            targets = (images + 0.5) * 255
            targets = targets.long()
            logp = dist.log_prob(targets).sum((1, 2, 3)).mean()
            # NOTE(review): `loss` and `elbo` are the identical expression;
            # the duplication is kept verbatim here.
            loss = (KL - logp) / N
            elbo = (KL - logp) / N
            # Convert nats/dim to bits/dim.
            bpd = elbo / np.log(2)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            global_step += 1

            if global_step % 25000 == 0:
                save_checkpoint(model, optimizer, global_step, checkpoint_dir)

            # Running means via the incremental-average update (i is 1-based).
            average_logp += (logp.item() - average_logp) / i
            average_KL += (KL.item() - average_KL) / i
            average_elbo += (elbo.item() - average_elbo) / i
            average_bpd += (bpd.item() - average_bpd) / i
            average_perplexity += (perplexity.item() - average_perplexity) / i

        writer.add_scalar("logp/train", average_logp, epoch)
        writer.add_scalar("kl/train", average_KL, epoch)
        writer.add_scalar("elbo/train", average_elbo, epoch)
        writer.add_scalar("bpd/train", average_bpd, epoch)
        writer.add_scalar("perplexity/train", average_perplexity, epoch)

        model.eval()
        average_logp = average_KL = average_elbo = average_bpd = average_perplexity = 0
        for i, (images, _) in enumerate(test_dataloader, 1):
            images = images.to(device)

            with torch.no_grad():
                dist, KL, perplexity = model(images)

            targets = (images + 0.5) * 255
            targets = targets.long()
            logp = dist.log_prob(targets).sum((1, 2, 3)).mean()
            elbo = (KL - logp) / N
            bpd = elbo / np.log(2)

            average_logp += (logp.item() - average_logp) / i
            average_KL += (KL.item() - average_KL) / i
            average_elbo += (elbo.item() - average_elbo) / i
            average_bpd += (bpd.item() - average_bpd) / i
            average_perplexity += (perplexity.item() - average_perplexity) / i

        writer.add_scalar("logp/test", average_logp, epoch)
        writer.add_scalar("kl/test", average_KL, epoch)
        writer.add_scalar("elbo/test", average_elbo, epoch)
        writer.add_scalar("bpd/test", average_bpd, epoch)
        writer.add_scalar("perplexity/test", average_perplexity, epoch)

        # Reconstructions from the *last* eval batch: most-likely pixel
        # value per position, rescaled to [0, 1] for display.
        samples = torch.argmax(dist.logits, dim=-1)
        grid = utils.make_grid(samples.float() / 255)
        writer.add_image("reconstructions", grid, epoch)

        print(
            "epoch:{}, logp:{:.3E}, KL:{:.3E}, elbo:{:.3f}, bpd:{:.3f}, perplexity:{:.3f}"
            .format(epoch, average_logp, average_KL, average_elbo, average_bpd,
                    average_perplexity))
            if iter_d == 0:

                writer.add_scalar('D/fake', D_fake, iteration + int(len(loader)/5)*epoch)
                writer.add_scalar('D/GP', gradient_penalty, iteration + int(len(loader)/5)*epoch)
                writer.add_scalar('D/real', D_real, iteration+int(len(loader)/5)*epoch)
                writer.add_scalar('D/cost', D_cost, iteration + int(len(loader)/5)*epoch)
                writer.add_scalar('D/wasserstein', Wasserstein_D, iteration + int(len(loader)/5)*epoch)

        # Train generator network
        for i in range(1):
            for p in netD.parameters():
                p.requires_grad = False  # to avoid computation
            netG.zero_grad()

            fake = netG.generate_images(config.batchSize, device)
            G = netD(fake)
            G = G.mean()
            G.backward(mone)
            G_cost = -G
            optimizerG.step()

            writer.add_scalar('G/cost', G_cost, iteration + int(len(loader)/5)*epoch)

            if iteration%20 == 0:

                valid_x = netG(valid_noise)
                writer.add_image('valid_image', torchvision.utils.make_grid(valid_x, nrow=3), global_step=iteration)


Esempio n. 26
0
    def train_latent(self, imgs, classes, model_dir, tensorboard_dir):
        """Train the latent (LORD-style) model on an in-memory image set.

        Args:
            imgs: numpy array of images; indexing below implies NHWC
                layout — TODO confirm.
            classes: per-image class ids (numpy array, cast to int64).
            model_dir: directory to save the model into after each epoch.
            tensorboard_dir: TensorBoard log directory.
        """
        self.latent_model = LatentModel(self.config)

        # Each sample carries its own id so per-image content embeddings
        # can be looked up by index.
        data = dict(img=torch.from_numpy(imgs).permute(0, 3, 1, 2),
                    img_id=torch.from_numpy(np.arange(imgs.shape[0])),
                    class_id=torch.from_numpy(classes.astype(np.int64)))

        dataset = NamedTensorDataset(data)
        data_loader = DataLoader(dataset,
                                 batch_size=self.config['train']['batch_size'],
                                 shuffle=True,
                                 sampler=None,
                                 batch_sampler=None,
                                 num_workers=1,
                                 pin_memory=True,
                                 drop_last=True)

        self.latent_model.init()
        self.latent_model.to(self.device)

        # Perceptual reconstruction loss over selected VGG layers.
        criterion = VGGDistance(self.config['perceptual_loss']['layers']).to(
            self.device)

        # Two parameter groups with separate learning rates:
        # generator/modulation vs the content/class embeddings.
        optimizer = Adam([{
            'params':
            itertools.chain(self.latent_model.modulation.parameters(),
                            self.latent_model.generator.parameters()),
            'lr':
            self.config['train']['learning_rate']['generator']
        }, {
            'params':
            itertools.chain(self.latent_model.content_embedding.parameters(),
                            self.latent_model.class_embedding.parameters()),
            'lr':
            self.config['train']['learning_rate']['latent']
        }],
                         betas=(0.5, 0.999))

        # Cosine annealing stepped per *batch* (T_max = epochs * batches).
        scheduler = CosineAnnealingLR(
            optimizer,
            T_max=self.config['train']['n_epochs'] * len(data_loader),
            eta_min=self.config['train']['learning_rate']['min'])

        summary = SummaryWriter(log_dir=tensorboard_dir)

        train_loss = AverageMeter()
        for epoch in range(self.config['train']['n_epochs']):
            self.latent_model.train()
            train_loss.reset()

            pbar = tqdm(iterable=data_loader)
            for batch in pbar:
                batch = {
                    name: tensor.to(self.device)
                    for name, tensor in batch.items()
                }

                optimizer.zero_grad()
                out = self.latent_model(batch['img_id'], batch['class_id'])

                # L2 penalty on content codes regularizes the latent space.
                content_penalty = torch.sum(out['content_code']**2,
                                            dim=1).mean()
                loss = criterion(
                    out['img'], batch['img']
                ) + self.config['content_decay'] * content_penalty

                loss.backward()
                optimizer.step()
                scheduler.step()

                train_loss.update(loss.item())
                pbar.set_description_str('epoch #{}'.format(epoch))
                pbar.set_postfix(loss=train_loss.avg)

            pbar.close()
            # Persist after every epoch (latent stage only, no amortization).
            self.save(model_dir, latent=True, amortized=False)

            summary.add_scalar(tag='loss',
                               scalar_value=train_loss.avg,
                               global_step=epoch)

            # Qualitative samples: fixed grid for comparability across
            # epochs plus a randomized grid.
            fixed_sample_img = self.generate_samples(dataset, randomized=False)
            random_sample_img = self.generate_samples(dataset, randomized=True)

            summary.add_image(tag='sample-fixed',
                              img_tensor=fixed_sample_img,
                              global_step=epoch)
            summary.add_image(tag='sample-random',
                              img_tensor=random_sample_img,
                              global_step=epoch)

        summary.close()
Esempio n. 27
0
class NetworkTrainer(object):

    def __init__(self,
                 opt = 'adam',
                 lr = 0.001,
                 batch_size = 4,
                 epochs = 10,
                 dcm_loss = True,
                 padding_center = False,
                 experiment = 'TEST',
                 gpu = '0',
                 ):
        """Set up a GAN inpainting trainer.

        Args:
            opt: optimizer name ('adam', 'rmsprop', else SGD).
            lr: learning rate for both generator and discriminator.
            batch_size: mini-batch size.
            epochs: number of training epochs.
            dcm_loss: enable the auxiliary DICOM-label loss
                (discriminator then outputs 9 values instead of 1).
            padding_center: feed the discriminator the full image with
                the predicted center pasted in, rather than the patch.
            experiment: experiment name; 'TEST' restricts the dataset
                to ids 0..100 for quick runs.
            gpu: CUDA device index as a string.
        """

        ## Set the default information
        self.info = OrderedDict()
        self.set_info(opt, lr, batch_size, epochs, dcm_loss, padding_center, experiment)
        self.set_default_info()
        self.device = torch.device("cuda:%s" % gpu if torch.cuda.is_available() else "cpu")


        ## Dataset
        if self.info['experiment'] == 'TEST':
            self.all_dataset = self.load_dataset(list_id = range(101))
        else:
            self.all_dataset = self.load_dataset()

        ## Output folder path
        # NOTE(review): os.mkdir fails if ./results does not already exist;
        # os.makedirs would be safer — confirm ./results is pre-created.
        self.output_dir = os.path.join("./results", experiment)
        if not os.path.exists(self.output_dir):
            os.mkdir(self.output_dir)

        self.writer = SummaryWriter(log_dir = self.output_dir)

        ## initialize results dictionary
        self.results = self.intialize_results_dict()

        # Network
        self.generator = self.get_network('ce-net')
        self.discriminator = self.get_network('disc')
        #self.optimizer = self.get_optimizer()
        self.criteriaMSE = self.get_loss_fx('MSE')
        self.criteriaBCE = self.get_loss_fx('BCE')
        self.transform = self.get_transform()
        self.train_loader, self.val_loader = self.get_data_loader()

        print("Network initizliation:")
        # These counts are approximate: len(loader) * batch_size overcounts
        # when the last batch is partial.
        print("Training Number: %s" % (len(self.train_loader) * self.info['batch_size']))
        print("Validation Number: %s" % (len(self.val_loader) * self.info['batch_size']))
        
    def set_info(self, opt, lr, batch_size, epochs, dcm_loss, padding_center, experiment):        
        self.info['optimizer'] = opt
        self.info['learning_rate'] = lr
        self.info['batch_size'] = batch_size
        self.info['epochs'] = epochs
        self.info['dcm_loss'] = dcm_loss
        self.info['padding_center'] = padding_center
        self.info['experiment'] = experiment
        
    def set_default_info(self):
        self.info['Generator_adv_loss'] = 0.1
        self.info['Generator_mse_loss'] = 0.9
        self.info['Discriminator_adv_loss'] = 0.9
        self.info['Discriminator_dcm_loss'] = 0.1
        self.info['Sample_interval'] = 10
        
    
    def update_info(key, value):
        self.info[key] = value    
    
    def intialize_results_dict(self):
        results = OrderedDict()
        results['G_training_loss'] = []
        results['D_training_loss'] = []
        results['validation_loss'] = []        
        
        results['validation_mse_loss'] = []                        
        results['validation_adv_loss'] = []
        results['best_loss'] = float('inf')
        results['best_MSE'] = float('inf')
        
        return results
    
        
    def load_dataset(self, list_id = None, transform = None, inpaint = False):
        """Build an SSIDataset restricted to ``list_id`` with the given
        transform and inpainting flag."""
        dataset = SSIDataset(list_id = list_id,
                             transform = transform,
                             inpaint = inpaint)
        return dataset
        
        
    def data_split(self, validation_split = 0.2, random_seed = 123, shuffle_dataset = True):
        dataset_size = len(self.all_dataset)
        indices = list(range(dataset_size))
        split = int(np.floor(validation_split * dataset_size))
        if shuffle_dataset :
            np.random.seed(random_seed)
            np.random.shuffle(indices)
        train_indices, val_indices = indices[split:], indices[:split]
        return train_indices, val_indices
        
    def get_data_loader(self):
        """Build (train_loader, val_loader) from the train/val index split."""
        train_idx, val_idx = self.data_split()
        loaders = []
        for ids, split in ((train_idx, 'train'), (val_idx, 'val')):
            dataset = self.load_dataset(list_id = ids, transform = self.transform[split], inpaint = True)
            loaders.append(torch.utils.data.DataLoader(dataset, batch_size=self.info['batch_size'], num_workers = 4))
        return loaders[0], loaders[1]
    
    def get_optimizer(self):
        """Return an optimizer for ``self.network`` per ``self.info``.

        NOTE(review): this reads ``self.info['regularization_weights']``
        and ``self.network``, neither of which is set anywhere in the
        visible code (``__init__`` builds ``self.generator`` /
        ``self.discriminator`` and its call to this method is commented
        out) — appears to be dead code from an earlier design; verify
        before reusing.
        """
        opt, lr, reg = self.info['optimizer'], self.info['learning_rate'], self.info['regularization_weights']
        if opt == 'adam':
            return optim.Adam(self.network.parameters(), lr=lr, weight_decay = reg)
        elif opt == 'rmsprop':
            return optim.RMSprop(self.network.parameters(), lr = lr, weight_decay = reg)
        else:
            # Any unrecognized name falls back to plain SGD.
            return optim.SGD(self.network.parameters(), lr = lr, weight_decay = reg)
    
    def get_network(self, net = 'ce-net'):                
        if net == 'ce-net':
            return CENet()
        elif net == 'disc':
            if self.info['dcm_loss']:
                return Discriminator(9)
            else:
                return Discriminator(1)
    
    def get_transform(self):
        """Return the project's normalization transforms (indexed by
        'train'/'val' in get_data_loader)."""
        transform_set = get_transformer_norm()
        return transform_set
    
    
    def get_loss_fx(self, loss_fx):
        if loss_fx == 'MSE':
            return nn.MSELoss()
        elif loss_fx == 'BCE':
            return nn.BCELoss()        
        
        
    def _get_output_labels(self, output):        
        if self.info['dcm_loss']:
            # Discrimation labels
            pred_dlabel = output[:,0]                
            # DICOM Labels
            pred_dicom = output[:,1:]
        else:
            pred_dlabel = output
            pred_dicom = None
        
        return pred_dlabel, pred_dicom
    
    def padding_center(self, imgs, centers):
        new_img = imgs.clone()
        new_img[:,:,64:192, 100:300] = centers
        return new_img
        
    def get_disc_input(self, imgs, centers):
        if self.info['padding_center']:
            disc_input = self.padding_center(imgs, centers)
        else:
            disc_input = centers
        return disc_input

    def sample_images(self, imgs, centers, pred_centers, epoch):
        """Log ground-truth and predicted composites to TensorBoard.

        Pastes the true and predicted center patches into the full
        images and writes normalized grids under 'true_images' and
        'pred_images' for this epoch.
        """
        for tag, patch in (('true_images', centers), ('pred_images', pred_centers)):
            composite = self.padding_center(imgs, patch)
            grid = make_grid(composite, normalize= True)
            self.writer.add_image(tag, grid, epoch)
        
    
    def evaluate(self, dataloader, epoch, sample_p = 0.1):
        """Evaluate generator/discriminator on a validation dataloader.

        Args:
            dataloader: yields (imgs, centers, dcm_labels) batches.
            epoch: current epoch index, used for tensorboard image tags.
            sample_p: per-batch probability of logging sample image grids.

        Returns:
            (mean MSE reconstruction loss, mean adversarial loss) over all
            batches.
        """
        self.generator.eval()
        self.discriminator.eval()

        MSE_loss = 0
        Adv_loss = 0
        num_batches = 0

        for imgs, centers, dcm_labels in dataloader:
            batch_size = dcm_labels.clone().size(0)

            imgs, centers, dcm_labels = imgs.to(self.device), centers.to(self.device), dcm_labels.to(self.device)

            pred_centers = self.generator(imgs)

            if np.random.uniform(0, 1) < sample_p:
                self.sample_images(imgs, centers, pred_centers, epoch)

            # Adversarial loss: generator succeeds when fakes are scored real (1).
            dlabel = torch.FloatTensor(batch_size).fill_(1).to(self.device)
            disc_input = self.get_disc_input(imgs, pred_centers)
            output = self.discriminator(disc_input)
            pred_dlabel, _ = self._get_output_labels(output)

            lossAdv_Encoder = self.criteriaBCE(pred_dlabel, dlabel)

            # Reconstruction loss between predicted and true centers.
            lossMSE_Encoder = self.criteriaMSE(pred_centers, centers)

            MSE_loss += lossMSE_Encoder.item()
            Adv_loss += lossAdv_Encoder.item()
            num_batches += 1

        # Fix: the original divided by the last enumerate index `i`
        # (count - 1), which over-weights the mean and raises
        # ZeroDivisionError for a single-batch loader; divide by the
        # actual batch count instead.
        if num_batches > 0:
            MSE_loss /= num_batches
            Adv_loss /= num_batches

        return MSE_loss, Adv_loss
    
    
    def train(self):
        """Run adversarial training of the generator and discriminator.

        Per batch: (1) update the generator with weighted adversarial + MSE
        losses; (2) update the discriminator on a real pass (label 1, plus an
        optional DICOM-label loss) and a fake pass (label 0). After each
        epoch, validate and checkpoint the generator when the combined
        validation loss improves (or at every Sample_interval epochs).
        """
        self.generator = self.generator.to(self.device)
        self.discriminator = self.discriminator.to(self.device)

        epochs = self.info['epochs']
        n_iter = 0

        # Both networks share the same learning rate.
        optG = optim.Adam(self.generator.parameters(), lr = self.info['learning_rate'])
        optD = optim.Adam(self.discriminator.parameters(), lr = self.info['learning_rate'])

        for epoch in range(epochs):
            print('Starting epoch {}/{}.'.format(epoch + 1, epochs))
            self.generator.train()
            self.discriminator.train()

            G_epoch_loss = 0
            D_epoch_loss = 0

            for i, (imgs, centers, dcm_labels) in enumerate(self.train_loader):
                batch_size = imgs.size(0)
                imgs, centers, dcm_labels = imgs.to(self.device), centers.to(self.device), dcm_labels.to(self.device)

                # -----------------------
                # Train Generator (Encoder)
                # -----------------------
                optG.zero_grad()

                pred_centers = self.generator(imgs)

                # Adversarial loss: generator wants its fakes labelled real (1).
                dlabel = torch.FloatTensor(batch_size).fill_(1).to(self.device)
                output = self.discriminator(pred_centers)
                pred_dlabel, _ = self._get_output_labels(output)

                lossAdv_Encoder = self.criteriaBCE(pred_dlabel, dlabel)

                # MSE reconstruction loss on the predicted centers.
                lossMSE_Encoder = self.criteriaMSE(pred_centers, centers)

                lossG = self.info['Generator_adv_loss'] * lossAdv_Encoder + self.info['Generator_mse_loss'] * lossMSE_Encoder

                G_epoch_loss += lossG.item()

                # Write the per-iteration generator losses to tensorboard.
                self.writer.add_scalar('Loss/train_ADV_G', lossAdv_Encoder.item(), n_iter)
                self.writer.add_scalar('Loss/train_MSE_G', lossMSE_Encoder.item(), n_iter)
                self.writer.add_scalar('Loss/train_G', lossG.item(), n_iter)

                lossG.backward()
                optG.step()

                # -----------------------
                # Train Discriminator
                # -----------------------

                # Real pass: true centers labelled 1.
                optD.zero_grad()
                dlabel = torch.FloatTensor(batch_size).fill_(1).to(self.device)

                # Padding the original context as the input for discriminator
                if self.info['padding_center']:
                    disc_input = self.padding_center(imgs, centers)
                else:
                    disc_input = centers

                output = self.discriminator(disc_input)

                # Get the output labels for discriminator
                pred_dlabel, pred_dicom = self._get_output_labels(output)

                lossAdv_real = self.criteriaBCE(pred_dlabel, dlabel)

                if self.info['dcm_loss']:
                    lossDCM = self.criteriaBCE(pred_dicom, dcm_labels.float())
                else:
                    lossDCM = torch.Tensor([0]).to(self.device)

                # Fake pass: generated centers labelled 0. detach() stops the
                # discriminator loss from accumulating gradients in the
                # generator (the original omitted it, wasting a full backward
                # pass through the generator each iteration).
                pred_centers = self.generator(imgs).detach()
                dlabel = torch.FloatTensor(batch_size).fill_(0).to(self.device)

                if self.info['padding_center']:
                    disc_input = self.padding_center(imgs, pred_centers)
                else:
                    disc_input = pred_centers

                # Fix: the original discarded disc_input here and scored the
                # raw pred_centers, so the 'padding_center' option had no
                # effect on the fake pass.
                output = self.discriminator(disc_input)
                pred_dlabel = output[:, 0]

                lossAdv_fake = self.criteriaBCE(pred_dlabel, dlabel)

                lossD = self.info['Discriminator_adv_loss'] * (lossAdv_real + lossAdv_fake) + self.info['Discriminator_dcm_loss'] * lossDCM

                D_epoch_loss += lossD.item()
                self.writer.add_scalar('Loss/train_ADV_D', (lossAdv_real + lossAdv_fake).item(), n_iter)
                self.writer.add_scalar('Loss/train_DCM_D', lossDCM.item(), n_iter)
                self.writer.add_scalar('Loss/train_D', lossD.item(), n_iter)

                n_iter += 1

                lossD.backward()
                optD.step()

                if i % 100 == 0:
                    print('[%d/%d][%d/%d] Loss_D: %.4f Loss_G: %.4f'
                          % (epoch, epochs, i, len(self.train_loader),
                             lossD.item(), lossG.item()))

            D_epoch_loss /= i+1
            G_epoch_loss /= i+1
            # Fix: the original appended D_epoch_loss under 'G_training_loss'
            # and G_epoch_loss under 'D_training_loss'; record each loss
            # under its own key.
            self.results['G_training_loss'].append(G_epoch_loss)
            self.results['D_training_loss'].append(D_epoch_loss)
            print(f'Epoch finished ! D_Loss: {D_epoch_loss}, G_Loss: {G_epoch_loss}' )

            # Validation
            with torch.set_grad_enabled(False):
                self.generator.eval()
                MSE_loss, Adv_loss  = self.evaluate(self.val_loader, epoch = epoch)

                self.results['validation_mse_loss'].append(MSE_loss)
                self.results['validation_adv_loss'].append(Adv_loss)

                print('Validation MSE Loss: {}'.format(MSE_loss))
                print('Validation Adv Loss: {}'.format(Adv_loss))

                # Checkpoint on improvement of the combined validation loss;
                # otherwise checkpoint periodically every Sample_interval epochs.
                if MSE_loss + Adv_loss < self.results['best_loss']:
                    self.results['best_loss'] = MSE_loss + Adv_loss
                    self.results['best_MSE'] = MSE_loss
                    self.results['best_epoch'] = epoch + 1
                    torch.save(self.generator.state_dict(), os.path.join(self.output_dir, "epoch{}.pth".format(epoch+1)))
                    print("Best Validation MSE improved!")

                elif (epoch+1) % self.info['Sample_interval'] == 0:
                    torch.save(self.generator.state_dict(), os.path.join(self.output_dir, "epoch{}.pth".format(epoch+1)))

        # Save the training dice score using the best weights
        #self.evaluate_train()

    def evaluate_train(self):
        """Reload the best-epoch weights and record training-set metrics."""
        best_weights = os.path.join(
            self.output_dir, "epoch{}.pth".format(self.results['best_epoch']))
        self.network.load_state_dict(torch.load(best_weights))

        with torch.set_grad_enabled(False):
            self.network.eval()
            train_loss, train_acc, train_precision, train_recall = eval_net(
                self.network, self.train_loader, self.criterion, self.device)

            self.results.update({'train_accuracy': train_acc,
                                 'train_precision': train_precision,
                                 'train_recall': train_recall})
            
            
        
    def plot_training(self):
        """Placeholder for plotting training curves; intentionally a no-op.

        Fix: the original signature omitted ``self``, so calling
        ``instance.plot_training()`` raised TypeError.
        """
        pass
    def save_results(self):
        """Persist experiment config, loss history and final generator weights."""
        # Experiment metadata + best metrics go into an .ini file.
        parser = configparser.ConfigParser()
        parser['INFO'] = self.info
        parser['BEST RESULTS'] = {'val_mse': self.results['best_MSE'],
                              'best_epoch': self.results['best_epoch']}

        ini_path = os.path.join(self.output_dir, 'exp.ini')
        with open(ini_path, 'w') as configfile:
            parser.write(configfile)

        # Per-epoch training losses as a CSV for later plotting.
        history = pd.DataFrame({'generator_training_loss': self.results['G_training_loss'],
                                'discriminator_loss': self.results['D_training_loss']})
        history.to_csv(os.path.join(self.output_dir, 'loss_history.csv'))

        torch.save(self.generator.state_dict(), os.path.join(self.output_dir, "epoch_last.pth"))
        
    def load_weights(self, weight_path):
        """Load a saved state dict from ``weight_path`` into the network."""
        state = torch.load(weight_path)
        self.network.load_state_dict(state)
# Esempio n. 28
# 0
    def train_amortized(self, imgs, classes, model_dir, tensorboard_dir):
        """Train the amortized model to imitate the trained latent model.

        The amortized model is initialised from the latent model's modulation
        and generator weights, then trained to reproduce the latent model's
        reconstructions and per-sample content/class codes.

        Args:
            imgs: numpy image batch; assumed channels-last (N, H, W, C) since
                it is permuted to NCHW below — TODO confirm against caller.
            classes: per-image class ids (numpy, cast to int64).
            model_dir: directory passed to self.save for checkpoints.
            tensorboard_dir: log directory for the SummaryWriter.
        """
        # Warm-start the amortized model from the latent model's weights.
        self.amortized_model = AmortizedModel(self.config)
        self.amortized_model.modulation.load_state_dict(
            self.latent_model.modulation.state_dict())
        self.amortized_model.generator.load_state_dict(
            self.latent_model.generator.state_dict())

        # img_id indexes into the latent model's per-sample content embedding.
        data = dict(img=torch.from_numpy(imgs).permute(0, 3, 1, 2),
                    img_id=torch.from_numpy(np.arange(imgs.shape[0])),
                    class_id=torch.from_numpy(classes.astype(np.int64)))

        dataset = NamedTensorDataset(data)
        data_loader = DataLoader(dataset,
                                 batch_size=self.config['train']['batch_size'],
                                 shuffle=True,
                                 sampler=None,
                                 batch_sampler=None,
                                 num_workers=1,
                                 pin_memory=True,
                                 drop_last=True)

        self.latent_model.to(self.device)
        self.amortized_model.to(self.device)

        # Perceptual (VGG-feature) loss for reconstructions, MSE for codes.
        reconstruction_criterion = VGGDistance(
            self.config['perceptual_loss']['layers']).to(self.device)
        embedding_criterion = nn.MSELoss()

        optimizer = Adam(
            params=self.amortized_model.parameters(),
            lr=self.config['train_encoders']['learning_rate']['max'],
            betas=(0.5, 0.999))

        # Cosine decay from max to min LR over the full training schedule.
        scheduler = CosineAnnealingLR(
            optimizer,
            T_max=self.config['train_encoders']['n_epochs'] * len(data_loader),
            eta_min=self.config['train_encoders']['learning_rate']['min'])

        summary = SummaryWriter(log_dir=tensorboard_dir)

        train_loss = AverageMeter()
        for epoch in range(self.config['train_encoders']['n_epochs']):
            # Latent model is frozen (eval); only the amortized model trains.
            self.latent_model.eval()
            self.amortized_model.train()

            train_loss.reset()

            pbar = tqdm(iterable=data_loader)
            for batch in pbar:
                batch = {
                    name: tensor.to(self.device)
                    for name, tensor in batch.items()
                }

                optimizer.zero_grad()

                # Targets: the latent model's learned per-sample codes.
                target_content_code = self.latent_model.content_embedding(
                    batch['img_id'])
                target_class_code = self.latent_model.class_embedding(
                    batch['class_id'])

                out = self.amortized_model(batch['img'])

                loss_reconstruction = reconstruction_criterion(
                    out['img'], batch['img'])
                loss_content = embedding_criterion(out['content_code'],
                                                   target_content_code)
                loss_class = embedding_criterion(out['class_code'],
                                                 target_class_code)

                # Code-matching terms weighted 10x relative to reconstruction.
                loss = loss_reconstruction + 10 * loss_content + 10 * loss_class

                loss.backward()
                optimizer.step()
                # Per-step LR update (T_max is measured in steps, not epochs).
                scheduler.step()

                train_loss.update(loss.item())
                pbar.set_description_str('epoch #{}'.format(epoch))
                pbar.set_postfix(loss=train_loss.avg)

            pbar.close()
            # Checkpoint the amortized model only after every epoch.
            self.save(model_dir, latent=False, amortized=True)

            # Scalars log the *last batch's* losses, not the epoch average.
            summary.add_scalar(tag='loss-amortized',
                               scalar_value=loss.item(),
                               global_step=epoch)
            summary.add_scalar(tag='rec-loss-amortized',
                               scalar_value=loss_reconstruction.item(),
                               global_step=epoch)
            summary.add_scalar(tag='content-loss-amortized',
                               scalar_value=loss_content.item(),
                               global_step=epoch)
            summary.add_scalar(tag='class-loss-amortized',
                               scalar_value=loss_class.item(),
                               global_step=epoch)

            fixed_sample_img = self.generate_samples_amortized(
                dataset, randomized=False)
            random_sample_img = self.generate_samples_amortized(
                dataset, randomized=True)

            summary.add_image(tag='sample-fixed-amortized',
                              img_tensor=fixed_sample_img,
                              global_step=epoch)
            summary.add_image(tag='sample-random-amortized',
                              img_tensor=random_sample_img,
                              global_step=epoch)

        summary.close()
# Esempio n. 29
# 0
        plt.show()

        # --- Discriminator update: score a real batch as true and the
        # current generator output as fake.
        dis_optimizer.zero_grad()
        dis_output_true_v = net_discr(batch_v)
        # detach() keeps the discriminator loss from backpropagating
        # into the generator.
        dis_output_fake_v = net_discr(gen_output_v.detach())
        dis_loss = objective(dis_output_true_v, true_labels_v) + objective(
            dis_output_fake_v, fake_labels_v)
        dis_loss.backward()
        dis_optimizer.step()
        dis_losses.append(dis_loss.item())

        # --- Generator update: the generator succeeds when the
        # discriminator labels its output as true.
        gen_optimizer.zero_grad()
        dis_output_v = net_discr(gen_output_v)
        gen_loss_v = objective(dis_output_v, true_labels_v)
        gen_loss_v.backward()
        gen_optimizer.step()
        gen_losses.append(gen_loss_v.item())

        iter_no += 1
        # Periodic logging; the running loss lists are reset after each report.
        if iter_no % REPORT_EVERY_ITER == 0:
            log.info("Iter %d: gen_loss=%.3e, dis_loss=%.3e", iter_no,
                     np.mean(gen_losses), np.mean(dis_losses))
            writer.add_scalar("gen_loss", np.mean(gen_losses), iter_no)
            writer.add_scalar("dis_loss", np.mean(dis_losses), iter_no)
            gen_losses = []
            dis_losses = []
        # Periodically dump image grids of the first 64 fake/real samples.
        if iter_no % SAVE_IMAGE_EVERY_ITER == 0:
            writer.add_image("fake", vutils.make_grid(gen_output_v.data[:64]),
                             iter_no)
            writer.add_image("real", vutils.make_grid(batch_v.data[:64]),
                             iter_no)
# Esempio n. 30
# 0
class UNetModel:
    '''Wrapper class for different Unet models to facilitate training, validation, logging etc.
        Args:
            exp_config: Experiment configuration file as given in the experiment folder
    '''
    def __init__(self, exp_config, logger=None, tensorboard=True):
        """Build the network, optimizer, scheduler and logging state.

        Args:
            exp_config: experiment configuration object supplying the model
                class, its hyperparameters, and optionally a pretrained
                model name.
            logger: logger for info messages. NOTE(review): it is
                dereferenced unconditionally when pretrained_model is set,
                so the default of None only works without one.
            tensorboard: when True, create training/validation
                SummaryWriters.
        """
        self.net = exp_config.model(input_channels=exp_config.input_channels,
                                    num_classes=exp_config.n_classes,
                                    num_filters=exp_config.filter_channels,
                                    latent_levels=exp_config.latent_levels,
                                    no_convs_fcomb=exp_config.no_convs_fcomb,
                                    beta=exp_config.beta,
                                    image_size=exp_config.image_size,
                                    reversible=exp_config.use_reversible
                                    )
        self.exp_config = exp_config
        self.batch_size = exp_config.batch_size
        self.logger = logger

        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.net.to(self.device)
        self.optimizer = torch.optim.Adam(self.net.parameters(), lr=1e-3, weight_decay=1e-5)
        # Very large patience (50000 steps) effectively disables LR reduction
        # for shorter runs; min_lr bounds the decay.
        self.scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
            self.optimizer, 'min', min_lr=1e-4, verbose=True, patience=50000)

        if exp_config.pretrained_model is not None:
            self.logger.info('Loading pretrained model {}'.format(exp_config.pretrained_model))

            model_path = os.path.join(sys_config.project_root, 'models', exp_config.pretrained_model)

            model_name = self.exp_config.experiment_name + '_' + exp_config.pretrained_model + '.pth'

            log_dir = os.path.join(sys_config.log_root, exp_config.log_dir_name, exp_config.experiment_name)
            save_model_path = os.path.join(log_dir, model_name)

            # NOTE(review): existence is checked on model_path but weights are
            # loaded from save_model_path — one of the two paths looks wrong;
            # confirm against the checkpointing convention.
            if os.path.exists(model_path):
                self.net.load_state_dict(torch.load(save_model_path))
            else:
                self.logger.info('The file {} does not exist. Starting training without pretrained net.'
                                 .format(save_model_path))

        # Running training accumulators (reset periodically in train()).
        self.mean_loss_of_epoch = 0
        self.tot_loss = 0
        self.kl_loss = 0
        self.reconstruction_loss = 0
        self.dice_mean = 0
        self.val_loss = 0
        self.foreground_dice = 0

        # Metrics of the most recent validation pass.
        self.val_recon_loss = 0
        self.val_elbo = 0
        self.val_kl_loss = 0
        self.avg_dice = 0
        self.avg_ged = -1
        self.avg_ncc = -1

        # Best-so-far trackers used for checkpoint selection in validate().
        self.best_dice = -1
        self.best_loss = np.inf
        self.best_ged = np.inf
        self.best_ncc = -1

        if tensorboard:
            self.training_writer = SummaryWriter()
            self.validation_writer = SummaryWriter(comment='_validation')
        self.iteration = 0

    def train(self, data):
        """Run the main training loop for the configured number of iterations.

        Args:
            data: dataset object exposing ``train.next_batch(batch_size)``
                returning (images, segmentations) numpy batches.
        """
        self.net.train()
        self.logger.info('Starting training.')
        self.logger.info('Current filters: {}'.format(self.exp_config.filter_channels))
        self.logger.info('Batch size: {}'.format(self.batch_size))

        # NOTE(review): range(1, iterations) executes iterations-1 steps —
        # confirm whether the off-by-one is intended.
        for self.iteration in range(1, self.exp_config.iterations):
            x_b, s_b = data.train.next_batch(self.batch_size)

            patch = torch.tensor(x_b, dtype=torch.float32).to(self.device)

            mask = torch.tensor(s_b, dtype=torch.float32).to(self.device)
            mask = torch.unsqueeze(mask, 1)

            # Kept on self so _create_tensorboard_summary can plot them.
            self.mask = mask
            self.patch = patch

            self.net.forward(patch, mask, training=True)
            self.loss = self.net.loss(mask)

            # Accumulators, reset every logging_frequency iterations below.
            self.tot_loss += self.loss

            self.reconstruction_loss += self.net.reconstruction_loss
            self.kl_loss += self.net.kl_divergence_loss

            self.optimizer.zero_grad()

            self.loss.backward()
            self.optimizer.step()

            if self.iteration % self.exp_config.validation_frequency == 0:
                self.validate(data)

            if self.iteration % self.exp_config.logging_frequency == 0:
                self.logger.info('Iteration {} Loss {}'.format(self.iteration, self.loss))
                #self._create_tensorboard_summary()
                self.tot_loss = 0
                self.kl_loss = 0
                self.reconstruction_loss = 0

            # Plateau scheduler keyed on the raw per-step loss.
            self.scheduler.step(self.loss)

        self.logger.info('Finished training.')

    def validate(self, data):
        """Run validation: compute Dice, (neg.) ELBO, GED and NCC metrics
        over the validation set and checkpoint the model whenever any
        best-so-far metric improves.

        Args:
            data: dataset object exposing ``validation.images`` /
                ``validation.labels`` numpy arrays; labels are assumed
                (H, W, num_annotators) — TODO confirm.
        """
        self.net.eval()
        with torch.no_grad():
            self.logger.info('Validation for step {}'.format(self.iteration))

            self.logger.info('Checkpointing model.')
            self.save_model('validation_ckpt')
            if self.device == torch.device('cuda'):
                allocated_memory = torch.cuda.max_memory_allocated(self.device)

                self.logger.info('Memory allocated in current iteration: {}{}'.format(allocated_memory, self.iteration))

            # Per-image metric accumulators.
            ged_list = []
            dice_list = []
            ncc_list = []
            elbo_list = []
            kl_list = []
            recon_list = []

            time_ = time.time()

            validation_set_size = data.validation.images.shape[0]\
                if self.exp_config.num_validation_images == 'all' else self.exp_config.num_validation_images

            for ii in range(validation_set_size):

                s_gt_arr = data.validation.labels[ii, ...]

                # from HW to NCHW
                x_b = data.validation.images[ii, ...]
                patch = torch.tensor(x_b, dtype=torch.float32).to(self.device)
                val_patch = patch.unsqueeze(dim=0).unsqueeze(dim=1)

                # Pick one random annotator's segmentation as the mask.
                s_b = s_gt_arr[:, :, np.random.choice(self.exp_config.annotator_range)]
                mask = torch.tensor(s_b, dtype=torch.float32).to(self.device)
                val_mask = mask.unsqueeze(dim=0).unsqueeze(dim=1)
                val_masks = torch.tensor(s_gt_arr, dtype=torch.float32).to(self.device)  # HWC
                val_masks = val_masks.transpose(0, 2).transpose(1, 2)  # CHW

                # Repeat the same image/mask to draw validation_samples
                # stochastic predictions in one forward pass.
                patch_arrangement = val_patch.repeat((self.exp_config.validation_samples, 1, 1, 1))

                mask_arrangement = val_mask.repeat((self.exp_config.validation_samples, 1, 1, 1))

                self.mask = mask_arrangement
                self.patch = patch_arrangement

                # training=True for constructing posterior as well
                s_out_eval_list = self.net.forward(patch_arrangement, mask_arrangement, training=False)
                s_prediction_softmax_arrangement = self.net.accumulate_output(s_out_eval_list, use_softmax=True)

                # sample N times
                self.val_loss = self.net.loss(mask_arrangement)
                elbo = self.val_loss
                kl = self.net.kl_divergence_loss
                recon = self.net.reconstruction_loss

                s_prediction_softmax_mean = torch.mean(s_prediction_softmax_arrangement, axis=0)
                s_prediction_arrangement = torch.argmax(s_prediction_softmax_arrangement, dim=1)

                ground_truth_arrangement = val_masks  # nlabels, H, W
                ged = utils.generalised_energy_distance(s_prediction_arrangement, ground_truth_arrangement,
                                                        nlabels=self.exp_config.n_classes - 1,
                                                        label_range=range(1, self.exp_config.n_classes))

                # num_gts, nlabels, H, W
                s_gt_arr_r = val_masks.unsqueeze(dim=1)
                ground_truth_arrangement_one_hot = utils.convert_batch_to_onehot(s_gt_arr_r, nlabels=self.exp_config.n_classes)
                ncc = utils.variance_ncc_dist(s_prediction_softmax_arrangement, ground_truth_arrangement_one_hot)

                s_ = torch.argmax(s_prediction_softmax_mean, dim=0) # HW
                s = val_mask.view(val_mask.shape[-2], val_mask.shape[-1]) #HW

                # Per-label Dice with the usual conventions: both empty -> 1,
                # exactly one empty -> 0, otherwise the dc() metric.
                per_lbl_dice = []
                for lbl in range(self.exp_config.n_classes):
                    binary_pred = (s_ == lbl) * 1
                    binary_gt = (s == lbl) * 1

                    if torch.sum(binary_gt) == 0 and torch.sum(binary_pred) == 0:
                        per_lbl_dice.append(1.0)
                    elif torch.sum(binary_pred) > 0 and torch.sum(binary_gt) == 0 or torch.sum(binary_pred) == 0 and torch.sum(
                            binary_gt) > 0:
                        per_lbl_dice.append(0.0)
                    else:
                        per_lbl_dice.append(dc(binary_pred.detach().cpu().numpy(), binary_gt.detach().cpu().numpy()))

                dice_list.append(per_lbl_dice)
                elbo_list.append(elbo)
                kl_list.append(kl)
                recon_list.append(recon)

                ged_list.append(ged)
                ncc_list.append(ncc)

            # Aggregate per-image metrics across the validation set.
            dice_tensor = torch.tensor(dice_list)
            per_structure_dice = dice_tensor.mean(dim=0)

            elbo_tensor = torch.tensor(elbo_list)
            kl_tensor = torch.tensor(kl_list)
            recon_tensor = torch.tensor(recon_list)

            ged_tensor = torch.tensor(ged_list)
            ncc_tensor = torch.tensor(ncc_list)

            self.avg_dice = torch.mean(dice_tensor)
            self.foreground_dice = torch.mean(dice_tensor, dim=0)[1]
            self.val_elbo = torch.mean(elbo_tensor)
            self.val_recon_loss = torch.mean(recon_tensor)
            self.val_kl_loss = torch.mean(kl_tensor)

            self.avg_ged = torch.mean(ged_tensor)
            self.avg_ncc = torch.mean(ncc_tensor)

            self.logger.info(' - Foreground dice: %.4f' % torch.mean(self.foreground_dice))
            self.logger.info(' - Mean (neg.) ELBO: %.4f' % self.val_elbo)
            self.logger.info(' - Mean GED: %.4f' % self.avg_ged)
            self.logger.info(' - Mean NCC: %.4f' % self.avg_ncc)

            # Independent checkpoints per best-so-far metric (Dice up,
            # loss down, GED down, NCC up).
            if torch.mean(per_structure_dice) >= self.best_dice:
                self.best_dice = torch.mean(per_structure_dice)
                self.logger.info('New best validation Dice! (%.3f)' % self.best_dice)
                self.save_model(savename='best_dice')
            if self.val_elbo <= self.best_loss:
                self.best_loss = self.val_elbo
                self.logger.info('New best validation loss! (%.3f)' % self.best_loss)
                self.save_model(savename='best_loss')
            if self.avg_ged <= self.best_ged:
                self.best_ged = self.avg_ged
                self.logger.info('New best GED score! (%.3f)' % self.best_ged)
                self.save_model(savename='best_ged')
            if self.avg_ncc >= self.best_ncc:
                self.best_ncc = self.avg_ncc
                self.logger.info('New best NCC score! (%.3f)' % self.best_ncc)
                self.save_model(savename='best_ncc')

            self.logger.info('Validation took {} seconds'.format(time.time()-time_))

        self.net.train()

    def train_brats(self, trainDataLoader):
        """Train self.net on BraTS batches for 99 epochs.

        Args:
            trainDataLoader: yields (inputs, patient_id, labels) batches.

        NOTE(review): self.loss is used here as a callable criterion, but
        train() assigns a loss *tensor* to the same attribute — confirm this
        method is only used when a criterion is set.
        """
        epoch = 1
        while epoch < 100:

            # set net up training
            self.net.train()

            for i, data in enumerate(trainDataLoader):

                # load data
                inputs, pid, labels = data
                inputs, labels = inputs.to(self.device), labels.to(self.device)

                # forward and backward pass
                outputs = self.net.forward(inputs, labels)
                loss = self.loss(outputs, labels)
                print('Current loss at iteration {} : {}'.format(i, loss))

                # Fix: the original called backward() without ever zeroing or
                # stepping the optimizer, so gradients accumulated forever
                # and the weights never changed.
                self.optimizer.zero_grad()
                loss.backward()
                self.optimizer.step()

                del inputs, outputs, labels

            epoch = epoch + 1

    def _create_tensorboard_summary(self, end_of_epoch=False):
        """Write training/validation scalars and a sample image to tensorboard.

        Args:
            end_of_epoch: currently unused; kept for interface compatibility.
        """
        self.net.eval()
        with torch.no_grad():
            # calculate the means since the last validation
            self.training_writer.add_scalar('Mean_loss', self.tot_loss/self.exp_config.validation_frequency, global_step=self.iteration)
            self.training_writer.add_scalar('KL_Divergence_loss', self.kl_loss/self.exp_config.validation_frequency, global_step=self.iteration)
            self.training_writer.add_scalar('Reconstruction_loss', self.reconstruction_loss/self.exp_config.validation_frequency, global_step=self.iteration)

            # Metrics computed by the most recent validate() call.
            self.validation_writer.add_scalar('Dice_score_of_last_validation', self.foreground_dice, global_step=self.iteration)
            self.validation_writer.add_scalar('GED_score_of_last_validation', self.avg_ged, global_step=self.iteration)
            self.validation_writer.add_scalar('NCC_score_of_last_validation', self.avg_ncc, global_step=self.iteration)

            self.validation_writer.add_scalar('Mean_loss', self.val_elbo, global_step=self.iteration)
            self.validation_writer.add_scalar('KL_Divergence_loss', self.val_kl_loss, global_step=self.iteration)
            self.validation_writer.add_scalar('Reconstruction_loss', self.val_recon_loss, global_step=self.iteration)

            # plot images of current patch for summary
            sample = torch.softmax(self.net.sample(), dim=1)
            # Keep only one class channel of the sampled segmentation.
            sample1 = torch.chunk(sample, 2, dim=1)[self.exp_config.n_classes-1]

            # Stack patch, ground-truth mask and sample vertically (dim=2).
            self.training_writer.add_image('Patch/GT/Sample',
                                          torch.cat([self.patch,
                                                     self.mask.view(-1, 1, self.exp_config.image_size[1],
                                                                    self.exp_config.image_size[2]), sample1],
                                                    dim=2), global_step=self.iteration, dataformats='NCHW')

            if self.device == torch.device('cuda'):
                allocated_memory = torch.cuda.max_memory_allocated(self.device)

                self.logger.info('Memory allocated in current iteration: {}{}'.format(allocated_memory, self.iteration))
                self.training_writer.add_scalar('Max_memory_allocated', allocated_memory, self.iteration)

        self.net.train()

    def test(self, data, sys_config):
        self.net.eval()
        with torch.no_grad():

            model_selection = self.exp_config.experiment_name + '_best_loss.pth'
            self.logger.info('Testing {}'.format(model_selection))

            self.logger.info('Loading pretrained model {}'.format(model_selection))

            model_path = os.path.join(
                sys_config.log_root,
                self.exp_config.log_dir_name,
                self.exp_config.experiment_name,
                model_selection)

            if os.path.exists(model_path):
                self.net.load_state_dict(torch.load(model_path))
            else:
                self.logger.info('The file {} does not exist. Aborting test function.'.format(model_path))
                return

            ged_list = []
            dice_list = []
            ncc_list = []

            time_ = time.time()

            end_dice = 0.0
            end_ged = 0.0
            end_ncc = 0.0

            for i in range(10):
                self.logger.info('Doing iteration {}'.format(i))
                n_samples = 10

                for ii in range(data.test.images.shape[0]):

                    s_gt_arr = data.test.labels[ii, ...]

                    # from HW to NCHW
                    x_b = data.test.images[ii, ...]
                    patch = torch.tensor(x_b, dtype=torch.float32).to(self.device)
                    val_patch = patch.unsqueeze(dim=0).unsqueeze(dim=1)

                    s_b = s_gt_arr[:, :, np.random.choice(self.exp_config.annotator_range)]
                    mask = torch.tensor(s_b, dtype=torch.float32).to(self.device)
                    val_mask = mask.unsqueeze(dim=0).unsqueeze(dim=1)
                    val_masks = torch.tensor(s_gt_arr, dtype=torch.float32).to(self.device)  # HWC
                    val_masks = val_masks.transpose(0, 2).transpose(1, 2)  # CHW

                    patch_arrangement = val_patch.repeat((n_samples, 1, 1, 1))

                    mask_arrangement = val_mask.repeat((n_samples, 1, 1, 1))

                    self.mask = mask_arrangement
                    self.patch = patch_arrangement

                    # training=True for constructing posterior as well
                    s_out_eval_list = self.net.forward(patch_arrangement, mask_arrangement, training=False)
                    s_prediction_softmax_arrangement = self.net.accumulate_output(s_out_eval_list, use_softmax=True)

                    s_prediction_softmax_mean = torch.mean(s_prediction_softmax_arrangement, axis=0)
                    s_prediction_arrangement = torch.argmax(s_prediction_softmax_arrangement, dim=1)

                    ground_truth_arrangement = val_masks  # nlabels, H, W
                    ged = utils.generalised_energy_distance(s_prediction_arrangement, ground_truth_arrangement,
                                                            nlabels=self.exp_config.n_classes - 1,
                                                            label_range=range(1, self.exp_config.n_classes))

                    # num_gts, nlabels, H, W
                    s_gt_arr_r = val_masks.unsqueeze(dim=1)
                    ground_truth_arrangement_one_hot = utils.convert_batch_to_onehot(s_gt_arr_r,
                                                                                     nlabels=self.exp_config.n_classes)
                    ncc = utils.variance_ncc_dist(s_prediction_softmax_arrangement, ground_truth_arrangement_one_hot)

                    s_ = torch.argmax(s_prediction_softmax_mean, dim=0)  # HW
                    s = val_mask.view(val_mask.shape[-2], val_mask.shape[-1])  # HW

                    # Write losses to list
                    per_lbl_dice = []
                    for lbl in range(self.exp_config.n_classes):
                        binary_pred = (s_ == lbl) * 1
                        binary_gt = (s == lbl) * 1

                        if torch.sum(binary_gt) == 0 and torch.sum(binary_pred) == 0:
                            per_lbl_dice.append(1.0)
                        elif torch.sum(binary_pred) > 0 and torch.sum(binary_gt) == 0 or torch.sum(
                                binary_pred) == 0 and torch.sum(
                                binary_gt) > 0:
                            per_lbl_dice.append(0.0)
                        else:
                            per_lbl_dice.append(dc(binary_pred.detach().cpu().numpy(), binary_gt.detach().cpu().numpy()))
                    dice_list.append(per_lbl_dice)

                    ged_list.append(ged)
                    ncc_list.append(ncc)

                    if ii % 100 == 0:
                        self.logger.info(' - Mean GED: %.4f' % torch.mean(torch.tensor(ged_list)))
                        self.logger.info(' - Mean NCC: %.4f' % torch.mean(torch.tensor(ncc_list)))


                dice_tensor = torch.tensor(dice_list)
                per_structure_dice = dice_tensor.mean(dim=0)

                ged_tensor = torch.tensor(ged_list)
                ncc_tensor = torch.tensor(ncc_list)

                model_path = os.path.join(
                    sys_config.log_root,
                    self.exp_config.log_dir_name,
                    self.exp_config.experiment_name)

                np.savez(os.path.join(model_path, 'ged%s_%s_2.npz' % (str(n_samples), model_selection)), ged_tensor.numpy())
                np.savez(os.path.join(model_path, 'ncc%s_%s_2.npz' % (str(n_samples), model_selection)), ncc_tensor.numpy())

                self.avg_dice = torch.mean(dice_tensor)
                self.foreground_dice = torch.mean(dice_tensor, dim=0)[1]

                self.avg_ged = torch.mean(ged_tensor)
                self.avg_ncc = torch.mean(ncc_tensor)

                logging.info('-- GED: --')
                logging.info(torch.mean(ged_tensor))
                logging.info(torch.std(ged_tensor))

                logging.info('-- NCC: --')
                logging.info(torch.mean(ncc_tensor))
                logging.info(torch.std(ncc_tensor))

                self.logger.info(' - Foreground dice: %.4f' % torch.mean(self.foreground_dice))
                self.logger.info(' - Mean (neg.) ELBO: %.4f' % self.val_elbo)
                self.logger.info(' - Mean GED: %.4f' % self.avg_ged)
                self.logger.info(' - Mean NCC: %.4f' % self.avg_ncc)

                self.logger.info('Testing took {} seconds'.format(time.time() - time_))

                end_dice += self.avg_dice
                end_ged += self.avg_ged
                end_ncc += self.avg_ncc
            self.logger.info('Mean dice: {}'.format(end_dice/10))
            self.logger.info('Mean ged: {}'.format(end_ged / 10))
            self.logger.info('Mean ncc: {}'.format(end_ncc / 10))

    def generate_images(self, data, sys_config):
        """Generate predicted segmentation samples for test images and save them to disk.

        For each test image in the (currently hard-coded) index range 31-99, draws
        ``n_samples`` segmentation samples from the network and writes the input
        patch, all ground-truth annotator masks, and the predicted samples as PNGs
        via :meth:`save_images`.

        Args:
            data: Dataset object exposing ``test.images`` (N, H, W) and
                ``test.labels`` (N, H, W, num_annotators) arrays.
            sys_config: System configuration providing ``log_root``; used to build
                the checkpoint and image output paths.
        """
        self.net.eval()
        with torch.no_grad():

            model_selection = self.exp_config.experiment_name + '_best_dice.pth'
            self.logger.info('Generating samples {}'.format(model_selection))

            self.logger.info('Loading pretrained model {}'.format(model_selection))

            model_path = os.path.join(
                sys_config.log_root,
                self.exp_config.log_dir_name,
                self.exp_config.experiment_name,
                model_selection)

            image_path = os.path.join(
                sys_config.log_root,
                self.exp_config.log_dir_name,
                self.exp_config.experiment_name,
            )

            # NOTE(review): checkpoint loading is intentionally disabled here, so
            # the currently-held (in-memory) weights are used for generation.
            # Re-enable to sample from the saved best-dice model instead.
            # if os.path.exists(model_path):
            #     self.net.load_state_dict(torch.load(model_path))
            # else:
            #     self.logger.info('The file {} does not exist. Aborting test function.'.format(model_path))
            #     return

            n_samples = 10

            # Hard-coded subset of the test set; presumably chosen manually for
            # qualitative inspection — TODO confirm / parameterize.
            for ii in range(31, 100):

                s_gt_arr = data.test.labels[ii, ...]

                # from HW to NCHW
                x_b = data.test.images[ii, ...]
                patch = torch.tensor(x_b, dtype=torch.float32).to(self.device)
                val_patch = patch.unsqueeze(dim=0).unsqueeze(dim=1)

                # Pick one random annotator's mask as the conditioning target.
                s_b = s_gt_arr[:, :, np.random.choice(self.exp_config.annotator_range)]
                mask = torch.tensor(s_b, dtype=torch.float32).to(self.device)
                val_mask = mask.unsqueeze(dim=0).unsqueeze(dim=1)
                val_masks = torch.tensor(s_gt_arr, dtype=torch.float32).to(self.device)  # HWC
                val_masks = val_masks.transpose(0, 2).transpose(1, 2)  # CHW

                # Replicate the single image/mask n_samples times so one forward
                # pass yields n_samples independent segmentation samples.
                patch_arrangement = val_patch.repeat((n_samples, 1, 1, 1))

                mask_arrangement = val_mask.repeat((n_samples, 1, 1, 1))

                self.mask = mask_arrangement
                self.patch = patch_arrangement

                # training=False: sample from the prior (no posterior construction)
                s_out_eval_list = self.net.forward(patch_arrangement, mask_arrangement, training=False)
                s_prediction_softmax_arrangement = self.net.accumulate_output(s_out_eval_list, use_softmax=True)
                s_ = torch.argmax(s_prediction_softmax_arrangement, dim=1)
                self.logger.info('s_.shape{}'.format(s_.shape))
                # BUG FIX: original was "'s_'.format(s_)" — no placeholder, so the
                # tensor was silently dropped from the log message.
                self.logger.info('s_ {}'.format(s_))

                self.save_images(image_path, patch, val_masks, s_, ii)

    def save_images(self, save_location, image, ground_truth_labels, sample,
                    iteration):
        """Write the input image, ground-truth masks and predicted samples as PNGs.

        Files are named ``{iteration}image.png``, ``{iteration}mask{i}.png`` and
        ``{iteration}sample{i}.png`` inside ``save_location``.

        Args:
            save_location: Directory the PNG files are written to.
            image: Input image tensor.
            ground_truth_labels: Stacked annotator masks, indexed
                ``[0, num_labels_per_subject)`` along dim 0.
            sample: Predicted segmentation samples, one per row along dim 0.
            iteration: Index used as the filename prefix.
        """
        from torchvision.utils import save_image

        save_image(image, os.path.join(save_location, '{}image.png'.format(iteration)), pad_value=1, scale_each=True,
                   normalize=True)

        for i in range(self.exp_config.num_labels_per_subject):
            save_image(ground_truth_labels[i].float(),
                       os.path.join(save_location, '{}mask{}.png'.format(iteration, i)),
                       pad_value=1,
                       scale_each=True,
                       normalize=True)
        # Generalized: iterate over however many samples were actually passed in
        # (original hard-coded range(10), which breaks for any other sample count).
        for i in range(sample.shape[0]):
            save_image(sample[i].float(),
                       os.path.join(save_location, '{}sample{}.png'.format(iteration, i)),
                       pad_value=1,
                       scale_each=True,
                       normalize=True)


    def save_model(self, savename):
        """Save the network's state dict to ``<experiment_name>_<savename>.pth``.

        The checkpoint is written under
        ``<log_root>/<log_dir_name>/<experiment_name>/``, matching the path
        layout the test/generate methods load from.

        Args:
            savename: Suffix identifying the checkpoint, e.g. ``'best_loss'``.
        """
        model_name = self.exp_config.experiment_name + '_' + savename + '.pth'

        # CONSISTENCY FIX: original mixed the module-level global ``exp_config``
        # with ``self.exp_config`` — use the instance config throughout so the
        # save path always matches the load path built in the test methods.
        # NOTE(review): ``sys_config`` is still the module-level global here,
        # as this method's signature takes no sys_config parameter.
        log_dir = os.path.join(sys_config.log_root,
                               self.exp_config.log_dir_name,
                               self.exp_config.experiment_name)
        save_model_path = os.path.join(log_dir, model_name)
        torch.save(self.net.state_dict(), save_model_path)
        self.logger.info('saved model to .pth file in {}'.format(save_model_path))