Exemple #1
0
    def train(x, y):
        """Run one optimizer step on D that maximises the `MI_P` estimate.

        `x` and `y` are split into chunks of at most `config['batch_size']`
        rows, and the first `config['num_D_accumulations']` chunks are fed
        through `D` together with a within-chunk permutation of the labels.
        The objective is ``MI_P = mean(tP) - log(mean(exp(tP_bar)))``: `tP` is
        averaged over the chunks, `tP_bar` is pooled across all chunks before
        the log-mean-exp. (This has the form of a MINE / Donsker-Varadhan
        style bound -- presumably `tP` scores matched (x, y) pairs and
        `tP_bar` shuffled ones; confirm against D's forward.)

        Args:
            x: input tensor, split along dim 0.
            y: label tensor aligned with `x`, split the same way.

        Returns:
            dict with the single key 'MI' holding
            `utils.get_tensor_item(MI_P)`.

        Raises:
            IndexError: if fewer than `num_D_accumulations` chunks exist.
        """
        # How many chunks to split x and y into?
        x = torch.split(x, config['batch_size'])
        y = torch.split(y, config['batch_size'])
        num_accumulations = config['num_D_accumulations']

        # When D is wrapped in DataParallel the optimizer lives on the
        # wrapped module.
        optim = D.module.optim if isinstance(D, nn.DataParallel) else D.optim
        optim.zero_grad()

        tP_mean = 0.
        tP_bar_list = []
        for chunk_index in range(num_accumulations):
            x_chunk, y_chunk = x[chunk_index], y[chunk_index]
            # Size the permutation from the chunk itself: torch.split may
            # return a final chunk shorter than config['batch_size'], and a
            # permutation over the nominal batch size would index past its end.
            y_bar = y_chunk[torch.randperm(y_chunk.size(0)), ...]
            # Only tP and tP_bar are needed here; the other outputs of D are
            # discarded.
            _, _, _, tP, tP_bar, _, _ = D(x_chunk,
                                          y_chunk,
                                          y_bar,
                                          add_bias=True)
            tP_mean += torch.mean(tP) / float(num_accumulations)
            tP_bar_list.append(tP_bar)

        # Numerically stable log(mean(exp(tP_bar))): subtract the (detached)
        # maximum before exponentiating and add it back afterwards.
        tP_bar = torch.cat(tP_bar_list)
        tP_bar_max = tP_bar.max().detach()
        log_mean_etP_bar = tP_bar_max + torch.log(
            torch.mean(torch.exp(tP_bar - tP_bar_max)))
        MI_P = tP_mean - log_mean_etP_bar
        # The optimizer minimises, so back-propagate the negated estimate.
        (-MI_P).backward()

        # Optionally apply ortho reg in D
        if config['D_ortho'] > 0.0:
            # Debug print to indicate we're using ortho reg in D.
            print('using modified ortho reg in D')
            utils.ortho(D, config['D_ortho'])

        optim.step()

        return {'MI': utils.get_tensor_item(MI_P)}
Exemple #2
0
    def OnPaintGL( self ):
        """Render one frame: optional background pass, optional foreground pass, then swap.

        Clears the colour and depth buffers, draws the background VBO with
        `self.bgshader` when `RENDER_BACKGROUND` is truthy, draws the
        foreground VBO with `self.fgshader` when `RENDER_FOREGROUND` is
        truthy, unbinds the shader program and swaps buffers.

        NOTE(review): the custom-node uniform arguments are produced by
        `eval(value[3])`, which runs in this method's local scope. The
        evaluated expressions may therefore depend on local names here
        (`self`, `width`, `height`, ...); renaming locals or moving those
        loops into another scope could change what they see.
        """
        glClear( GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT )

        width, height = self.GetGLExtents()
        
        if RENDER_BACKGROUND:
            # Orthographic projection spanning the GL extents.
            MVP = ortho( 0,width, 0, height, -1, 1 )
            
            shaders.glUseProgram( self.bgshader )
            
            # Third argument (True) asks GL to transpose the matrix on upload.
            glUniformMatrix4fv( glGetUniformLocation(self.bgshader, 'MVP'), 1, True, MVP )
            
            # Background uses the legacy client-state vertex array path.
            self.bgvbo.bind()
            glEnableClientState( GL_VERTEX_ARRAY );
            glVertexPointerf( self.bgvbo )
            
            glDrawArrays( GL_TRIANGLE_STRIP, 0, len( self.bgvbo ) )
            
            self.bgvbo.unbind()
            glDisableClientState( GL_VERTEX_ARRAY );
        
        if RENDER_FOREGROUND:
            shaders.glUseProgram( self.fgshader )
            
            # Two 3-component float attributes read from the same VBO with a
            # 24-byte stride; attribute 1 starts 12 bytes into each vertex.
            # (What the two attributes represent is not visible here.)
            self.fgvbo.bind()
            glEnableVertexAttribArray( 0 )
            glEnableVertexAttribArray( 1 )
            glVertexAttribPointer( 0, 3, GL_FLOAT, GL_FALSE, 24, self.fgvbo )
            glVertexAttribPointer( 1, 3, GL_FLOAT, GL_FALSE, 24, self.fgvbo+12 )
            
            # Graph-declared uniforms: ucount selects the upload function and
            # ufuncs() supplies the argument values for this frame.
            for uname, ucount, ufuncs in self.graph.uniforms.values():
                UNIFORM_FUNCTION[ucount]( glGetUniformLocation(self.fgshader, uname), *ufuncs() )
            
            # Custom vertex-shader nodes: value[4] is called as the upload
            # function and value[3] is a Python expression string.
            # SECURITY: eval() executes arbitrary code -- only safe if the
            # node definitions come from a trusted source.
            for name, value in custom_vs_nodes.items():
                value[4]( glGetUniformLocation(self.fgshader, name), *eval(value[3]) )
                
            # Custom fragment-shader nodes, handled the same way (same eval caveat).
            for name, value in custom_fs_nodes.items():
                value[4]( glGetUniformLocation(self.fgshader, name), *eval(value[3]) )
                
            glDrawArrays( GL_TRIANGLES, 0, len( self.fgvbo ) )
            
            self.fgvbo.unbind()
            glDisableVertexAttribArray( 0 )
            glDisableVertexAttribArray( 1 )
        
        # Unbind any shader program before presenting the frame.
        shaders.glUseProgram( 0 )
        
        self.SwapBuffers()
Exemple #3
0
    def train(x, y):
        """Run one GAN training iteration: D updates first, then one G update.

        D phase: for each of `config['num_D_steps']` steps, gradients are
        accumulated over `config['num_D_accumulations']` chunks of `x`/`y`
        (each of `config['batch_size']` rows) and `D.optim` is stepped once.
        The D loss is the Crammer-Singer criterion when `mh_csc_loss` or
        `mh_loss` is set, otherwise `losses.discriminator_loss`.

        G phase: gradients are accumulated over
        `config['num_G_accumulations']` passes and `G.optim` is stepped once.
        The G loss is chosen by `fm_loss`, `mh_csc_loss`, `mh_loss`, or falls
        back to `losses.generator_loss`.

        Args:
            x: real-data tensor, split along dim 0 into chunks.
            y: label tensor aligned with `x`.

        Returns:
            dict with float entries 'G_loss', 'D_loss_real' and
            'D_loss_fake', taken from the last accumulation pass of each
            phase.
        """
        G.optim.zero_grad()
        D.optim.zero_grad()
        # How many chunks to split x and y into?
        x = torch.split(x, config['batch_size'])
        y = torch.split(y, config['batch_size'])
        counter = 0

        # Class index used as the target for generated samples by the
        # multi-hinge criteria (index `n_classes`, i.e. one past the real
        # class indices). It is constant, so build it once up front: both the
        # D and the G phase read it, and building it here keeps it defined
        # even if the D loop runs zero times.
        lossy = torch.full((config['batch_size'],),
                           config['n_classes'],
                           dtype=torch.long,
                           device='cuda')

        # Optionally toggle D and G's "require_grad"
        if config['toggle_grads']:
            utils.toggle_grad(D, True)
            utils.toggle_grad(G, False)

        for step_index in range(config['num_D_steps']):
            # If accumulating gradients, loop multiple times before an optimizer step
            D.optim.zero_grad()
            for accumulation_index in range(config['num_D_accumulations']):
                z_.sample_()
                y_.sample_()

                D_fake, D_real = GD(z_[:config['batch_size']],
                                    y_[:config['batch_size']],
                                    x[counter],
                                    y[counter],
                                    train_G=False,
                                    split_D=config['split_D'])

                # Compute components of D's loss, average them, and divide by
                # the number of gradient accumulations
                if config['mh_csc_loss'] or config['mh_loss']:
                    D_loss_real = losses.crammer_singer_criterion(
                        D_real, y[counter])
                    D_loss_fake = losses.crammer_singer_criterion(
                        D_fake, lossy)
                else:
                    D_loss_real, D_loss_fake = losses.discriminator_loss(
                        D_fake, D_real)
                D_loss = (D_loss_real + D_loss_fake) / float(
                    config['num_D_accumulations'])
                D_loss.backward()
                # The chunk index keeps advancing across D steps, so every
                # accumulation pass sees a fresh chunk of real data.
                counter += 1

            # Optionally apply ortho reg in D
            if config['D_ortho'] > 0.0:
                # Debug print to indicate we're using ortho reg in D.
                print('using modified ortho reg in D')
                utils.ortho(D, config['D_ortho'])

            D.optim.step()

        # Optionally toggle "requires_grad"
        if config['toggle_grads']:
            utils.toggle_grad(D, False)
            utils.toggle_grad(G, True)

        # Zero G's gradients by default before training G, for safety
        G.optim.zero_grad()

        # If accumulating gradients, loop multiple times
        for accumulation_index in range(config['num_G_accumulations']):
            # Without `resampling`, the first pass reuses the noise/labels
            # left over from the last D pass; later passes always resample.
            if config['resampling'] or (accumulation_index > 0):
                z_.sample_()
                y_.sample_()

            if config['fm_loss']:
                # Feature matching: L1 distance between the batch-mean
                # features of generated data and of the last real chunk.
                D_feat_fake, D_feat_real = GD(z_,
                                              y_,
                                              x[-1],
                                              None,
                                              train_G=True,
                                              split_D=config['split_D'],
                                              feat=True)
                fm_loss = torch.mean(
                    torch.abs(
                        torch.mean(D_feat_fake, 0) -
                        torch.mean(D_feat_real, 0)))
                # NOTE(review): unlike the other branches this loss is not
                # divided by num_G_accumulations -- confirm that is intended.
                G_loss = fm_loss
            else:
                D_fake = GD(z_, y_, train_G=True, split_D=config['split_D'])
                if config['mh_csc_loss']:
                    G_loss = losses.crammer_singer_complement_criterion(
                        D_fake, lossy) / float(config['num_G_accumulations'])
                elif config['mh_loss']:
                    # Weighted mix of feature matching and the multi-hinge
                    # loss against the sampled labels.
                    D_feat_fake, D_feat_real = GD(z_,
                                                  y_,
                                                  x[-1],
                                                  None,
                                                  train_G=True,
                                                  split_D=config['split_D'],
                                                  feat=True)
                    fm_loss = torch.mean(
                        torch.abs(
                            torch.mean(D_feat_fake, 0) -
                            torch.mean(D_feat_real, 0)))
                    oth_loss = losses.mh_loss(D_fake,
                                              y_[:config['batch_size']])
                    G_loss = (config['mh_fmloss_weight'] * fm_loss +
                              config['mh_loss_weight'] * oth_loss) / float(
                                  config['num_G_accumulations'])
                else:
                    G_loss = losses.generator_loss(D_fake) / float(
                        config['num_G_accumulations'])
            G_loss.backward()

        # Optionally apply modified ortho reg in G
        if config['G_ortho'] > 0.0:
            print('using modified ortho reg in G'
                  )  # Debug print to indicate we're using ortho reg in G
            # Don't ortho reg shared, it makes no sense. Really we should blacklist any embeddings for this
            utils.ortho(G,
                        config['G_ortho'],
                        blacklist=[param for param in G.shared.parameters()])
        G.optim.step()

        # If we have an ema, update it, regardless of if we test with it or not
        if config['ema']:
            ema.update(state_dict['itr'])

        # Return G's loss and the components of D's loss.
        return {
            'G_loss': float(G_loss.item()),
            'D_loss_real': float(D_loss_real.item()),
            'D_loss_fake': float(D_loss_fake.item())
        }
Exemple #4
0
 def train(x, y):
     """Run one training iteration: D updates followed by a G optimizer step.

     D phase: for each of `config['num_D_steps']` steps, accumulate
     gradients over `config['num_D_accumulations']` chunks of real data and
     step `D.optim`. Each pass also obtains generator output from `GD3`
     (via `return_G_z=True`) and passes it twice to `discriminator_loss`.

     G phase: for each of `config['num_G_accumulations']` passes a G loss is
     computed from the last D-phase `D_fake`, fresh `GD3` generator output
     and a real chunk, then `G.optim` is stepped.

     NOTE(review): the G loss is never back-propagated in this function
     (there is no `.backward()` call on it), the D loss is not divided by
     the number of accumulations, and the G-phase `GD3` call uses
     `train_G=False`. All of that is preserved as found -- confirm intent.

     Returns:
         dict with float entries 'G_loss' and 'D_loss' from the last pass of
         each phase.
     """
     G.optim.zero_grad()
     D.optim.zero_grad()

     bs = config['batch_size']
     x = torch.split(x, bs)
     y = torch.split(y, bs)
     d_chunk = 0

     # Optionally make only D trainable during the D phase.
     if config['toggle_grads']:
         utils.toggle_grad(D, True)
         utils.toggle_grad(G, False)

     for _step in range(config['num_D_steps']):
         D.optim.zero_grad()
         for _acc in range(config['num_D_accumulations']):
             z_.sample_()
             if config['conditional']:
                 y_.sample_()
                 real_labels = y[d_chunk]
             else:
                 # Unconditional mode: all-zero labels for fake and real.
                 y_.zero_()
                 real_labels = torch.zeros_like(y[d_chunk]).to(
                     y_.device).long()
             real_images = x[d_chunk]

             d_fake, d_real = GD(z_[:bs],
                                 y_[:bs],
                                 real_images,
                                 real_labels,
                                 train_G=False,
                                 split_D=config['split_D'])
             # Second return value is the generator output (return_G_z=True).
             _, g_z_for_d = GD3(z_[:bs],
                                y_[:bs],
                                train_G=False,
                                return_G_z=True,
                                split_D=config['split_D'])
             d_loss = discriminator_loss(d_fake, d_real, g_z_for_d,
                                         g_z_for_d)
             d_loss.backward()
             # Keeps advancing across D steps: one real chunk per pass.
             d_chunk += 1

         if config['D_ortho'] > 0.0:
             print('using modified ortho reg in D')
             utils.ortho(D, config['D_ortho'])
         D.optim.step()

     # Swap trainability for the G phase.
     if config['toggle_grads']:
         utils.toggle_grad(D, False)
         utils.toggle_grad(G, True)

     G.optim.zero_grad()
     for g_chunk in range(config['num_G_accumulations']):
         z_.sample_()
         y_.sample_()
         if not config['conditional']:
             y_.zero_()
         _, g_z_for_g = GD3(z_[:bs],
                            y_[:bs],
                            train_G=False,
                            return_G_z=True,
                            split_D=config['split_D'])
         # The G phase walks the real chunks from the start again.
         real_chunk = x[g_chunk]
         g_loss = generator_loss(
             d_fake, g_z_for_g, real_chunk,
             g_z_for_g) / float(config['num_G_accumulations'])

     if config['G_ortho'] > 0.0:
         print('using modified ortho reg in G')
         shared_params = [param for param in G.shared.parameters()]
         utils.ortho(G, config['G_ortho'], blacklist=shared_params)
     G.optim.step()

     if config['ema']:
         ema.update(state_dict['itr'])

     return {'G_loss': float(g_loss.item()), 'D_loss': float(d_loss.item())}
Exemple #5
0
    def train(x, y, ratio):
        """Run one GAN iteration with a per-sample reweighted D loss.

        D phase: for each of `config['num_D_steps']` steps, gradients are
        accumulated over `config['num_D_accumulations']` chunks and
        `D.optim` is stepped once. The matching chunk of `ratio` and
        `config['alpha']` are forwarded to `discriminator_loss`.

        G phase: gradients of `generator_loss` are accumulated over
        `config['num_G_accumulations']` passes, then `G.optim` is stepped.

        Args:
            x: real-data tensor, split along dim 0 into batch-size chunks.
            y: label tensor aligned with `x`.
            ratio: tensor aligned with `x`, split the same way and passed to
                the discriminator loss.

        Returns:
            dict with float entries 'G_loss', 'D_loss_real', 'D_loss_fake'
            from the last accumulation pass of each phase.
        """
        G.optim.zero_grad()
        D.optim.zero_grad()

        # Cut every input into chunks of the configured batch size.
        bs = config['batch_size']
        x_chunks = torch.split(x, bs)
        y_chunks = torch.split(y, bs)
        ratio_chunks = torch.split(ratio, bs)
        chunk_idx = 0

        # Optionally make only D trainable during the D phase.
        if config['toggle_grads']:
            utils.toggle_grad(D, True)
            utils.toggle_grad(G, False)

        for _step in range(config['num_D_steps']):
            # Accumulate gradients over several chunks before each D step.
            D.optim.zero_grad()
            for _acc in range(config['num_D_accumulations']):
                z_.sample_()
                if config['conditional']:
                    y_.sample_()
                    real_labels = y_chunks[chunk_idx]
                else:
                    # "Unconditional": feed zeros as labels for fake and real.
                    y_.zero_()
                    real_labels = torch.zeros_like(y_chunks[chunk_idx]).to(
                        y_.device).long()

                d_fake, d_real = GD(z_[:bs],
                                    y_[:bs],
                                    x_chunks[chunk_idx],
                                    real_labels,
                                    train_G=False,
                                    split_D=config['split_D'])
                # Reweighted D loss: takes this chunk's ratios and alpha.
                d_loss_real, d_loss_fake = discriminator_loss(
                    d_fake,
                    d_real,
                    ratio_chunks[chunk_idx],
                    alpha=config['alpha'])
                d_loss = (d_loss_real + d_loss_fake) / float(
                    config['num_D_accumulations'])
                d_loss.backward()
                # Keeps advancing across D steps: one fresh chunk per pass.
                chunk_idx += 1

            # Optional orthogonal regularisation on D.
            if config['D_ortho'] > 0.0:
                print('using modified ortho reg in D')
                utils.ortho(D, config['D_ortho'])

            D.optim.step()

        # Swap trainability for the G phase.
        if config['toggle_grads']:
            utils.toggle_grad(D, False)
            utils.toggle_grad(G, True)

        # Start the G phase from clean gradients.
        G.optim.zero_grad()

        for _acc in range(config['num_G_accumulations']):
            z_.sample_()
            y_.sample_()
            # Zero labels when training as an unconditional model.
            if not config['conditional']:
                y_.zero_()
            d_fake = GD(z_, y_, train_G=True, split_D=config['split_D'])
            # The generator loss itself is not reweighted.
            g_loss = generator_loss(d_fake) / float(
                config['num_G_accumulations'])
            g_loss.backward()

        # Optional orthogonal regularisation on G, skipping G.shared.
        if config['G_ortho'] > 0.0:
            print('using modified ortho reg in G')
            shared_params = [param for param in G.shared.parameters()]
            utils.ortho(G, config['G_ortho'], blacklist=shared_params)
        G.optim.step()

        # Update the EMA whenever one is configured.
        if config['ema']:
            ema.update(state_dict['itr'])

        # G's loss and the two components of D's loss.
        return {
            'G_loss': float(g_loss.item()),
            'D_loss_real': float(d_loss_real.item()),
            'D_loss_fake': float(d_loss_fake.item())
        }
Exemple #6
0
    def train(x, y):
        """Run one GAN iteration using latents returned by `lat_opt_ngd`.

        `lat_opt_ngd(G, D, z_, G_batch_size, y_)` is called once up front and
        its result is used in place of `z_` for every D and G pass. This
        function does not call `z_.sample_()` / `y_.sample_()` itself
        (presumably sampling happens inside `lat_opt_ngd` -- confirm).

        D phase: `config['num_D_steps']` optimizer steps, each accumulating
        the hinge loss over `config['num_D_accumulations']` real chunks.
        G phase: accumulate `loss_hinge_gen` over
        `config['num_G_accumulations']` passes, then step `G.optim`.

        Returns:
            dict with float entries 'G_loss', 'D_loss_real', 'D_loss_fake'
            from the last accumulation pass of each phase.
        """
        bs = config['batch_size']
        G_batch_size = max(config['G_batch_size'], bs)
        G.optim.zero_grad()
        D.optim.zero_grad()

        # Latent optimisation: obtain the latents used for the whole iteration.
        z_prime = lat_opt_ngd(G, D, z_, G_batch_size, y_)

        # Cut the real data and labels into batch-size chunks.
        x_chunks = torch.split(x, bs)
        y_chunks = torch.split(y, bs)
        chunk_idx = 0

        # Optionally make only D trainable during the D phase.
        if config['toggle_grads']:
            toggle_grad(D, True)
            toggle_grad(G, False)

        for _step in range(config['num_D_steps']):
            # Accumulate gradients over several chunks before each D step.
            D.optim.zero_grad()
            for _acc in range(config['num_D_accumulations']):
                d_fake, d_real = GD(z_prime[:bs],
                                    y_[:bs],
                                    x_chunks[chunk_idx],
                                    y_chunks[chunk_idx],
                                    train_G=False,
                                    split_D=config['split_D'])

                # Hinge loss components, scaled by the accumulation count.
                d_loss_real, d_loss_fake = loss_hinge_dis(d_fake, d_real)
                d_loss = (d_loss_real + d_loss_fake) / float(
                    config['num_D_accumulations'])
                d_loss.backward()
                # Keeps advancing across D steps: one fresh chunk per pass.
                chunk_idx += 1

            # Optional orthogonal regularisation on D.
            if config['D_ortho'] > 0.0:
                ortho(D, config['D_ortho'])

            D.optim.step()

        # Swap trainability for the G phase.
        if config['toggle_grads']:
            toggle_grad(D, False)
            toggle_grad(G, True)

        # Start the G phase from clean gradients.
        G.optim.zero_grad()

        for _acc in range(config['num_G_accumulations']):
            d_fake = GD(z_prime, y_, train_G=True, split_D=config['split_D'])
            g_loss = loss_hinge_gen(d_fake) / float(
                config['num_G_accumulations'])
            g_loss.backward()

        # Optional orthogonal regularisation on G, skipping G.shared.
        if config['G_ortho'] > 0.0:
            shared_params = [param for param in G.shared.parameters()]
            ortho(G, config['G_ortho'], blacklist=shared_params)
        G.optim.step()

        # Update the EMA whenever one is configured.
        if config['ema']:
            ema.update(state_dict['itr'])

        # G's loss and the two components of D's loss.
        return {
            'G_loss': float(g_loss.item()),
            'D_loss_real': float(d_loss_real.item()),
            'D_loss_fake': float(d_loss_fake.item())
        }
Exemple #7
0
    def train(x, y):
        """Run one GAN iteration with auxiliary classifier / MINE / f-divergence terms.

        The auxiliary terms are selected by `config['loss_type']`, one of
        'fCGAN', 'MINE', 'Twin_AC' or 'AC'; any other value leaves only the
        plain adversarial losses.

        D phase: `config['num_D_steps']` optimizer steps, each accumulating
        gradients over `config['num_D_accumulations']` chunks. The D loss is
        the adversarial loss plus `AC_weight * C_loss` minus
        `MINE_weight * (MI_P + MI_Q)`.

        G phase: gradients are accumulated over
        `config['num_G_accumulations']` passes, then `G.optim` is stepped.
        The G loss is the adversarial loss plus weighted classifier, MI and
        f-divergence terms.

        Side effect: updates the moving averages `D.ma_etP_bar` and
        `D.ma_etQ_bar` in place.

        Returns:
            dict of scalars: 'G_loss', 'D_loss_real', 'D_loss_fake',
            'C_loss', 'MI_loss', 'f_div' (from the last G pass) and
            'MI_P', 'MI_Q' (from the last D pass).
        """
        # train one iteration
        G.optim.zero_grad()
        D.optim.zero_grad()
        # How many chunks to split x and y into?
        x = torch.split(x, config['batch_size'])
        y = torch.split(y, config['batch_size'])
        # D's per-sample outputs are sliced at this index below: [:half_size]
        # is paired with the generated labels y_ and [half_size:] with the
        # real labels y[counter], i.e. generated samples appear to come first.
        half_size = config['batch_size']
        counter = 0
        MINE_weight = config['MINE_weight'] if config[
            'weighted_MINE_loss'] else 1.0
        # Added inside the logs below to keep them away from log(0).
        EPSILON = config['magic_epsilon']

        # Optionally toggle D and G's "require_grad"
        if config['toggle_grads']:
            utils.toggle_grad(D, True)
            utils.toggle_grad(G, False)

        for step_index in range(config['num_D_steps']):
            # If accumulating gradients, loop multiple times before an optimizer step
            D.optim.zero_grad()
            for accumulation_index in range(config['num_D_accumulations']):
                z_.sample_()
                y_.sample_()
                # Shuffled copies of the generated / real labels, only built
                # when D has a TP or TQ head enabled.
                gy_bar = y_[torch.randperm(half_size),
                            ...] if D.TQ or D.TP else None
                dy_bar = y[counter][torch.randperm(half_size),
                                    ...] if D.TP or D.TQ else None
                D_fake, D_real, mi, c_cls, tP, tP_bar, tQ, tQ_bar = GD(
                    z_[:config['batch_size']],
                    y_[:config['batch_size']],
                    x[counter],
                    y[counter],
                    gy_bar,
                    dy_bar,
                    train_G=False,
                    split_D=config['split_D'],
                    add_bias=True)
                # Compute components of D's loss, average them, and divide by the number of gradient accumulations
                D_loss_real, D_loss_fake = discriminator_loss(D_fake, D_real)
                # Auxiliary terms default to zero and are filled in by the
                # branch matching config['loss_type'].
                C_loss = 0.
                MI_P = 0.
                MI_Q = 0.
                if config['loss_type'] == 'fCGAN':
                    # MINE-P on real
                    etP_bar = torch.mean(torch.exp(tP_bar[half_size:]))
                    # Moving average of mean(exp(tP_bar)): seeded on first
                    # use, then updated with rate config['ma_rate'].
                    if D.ma_etP_bar is None:
                        D.ma_etP_bar = etP_bar.detach().item()
                    D.ma_etP_bar += config['ma_rate'] * (
                        etP_bar.detach().item() - D.ma_etP_bar)
                    # The log term is rescaled by (current value / moving
                    # average); both factors are detached, so only the log
                    # carries gradient.
                    MI_P = torch.mean(tP[half_size:]) - torch.log(
                        etP_bar + EPSILON) * etP_bar.detach() / D.ma_etP_bar
                    # MINE-Q on fake
                    etQ_bar = torch.mean(torch.exp(tQ_bar[:half_size]))
                    if D.ma_etQ_bar is None:
                        D.ma_etQ_bar = etQ_bar.detach().item()
                    D.ma_etQ_bar += config['ma_rate'] * (
                        etQ_bar.detach().item() - D.ma_etQ_bar)
                    MI_Q = torch.mean(tQ[:half_size]) - torch.log(
                        etQ_bar + EPSILON) * etQ_bar.detach() / D.ma_etQ_bar
                if config['loss_type'] == 'MINE':
                    # AC
                    C_loss += F.cross_entropy(c_cls[half_size:], y[counter])
                    if config['train_AC_on_fake']:
                        C_loss += F.cross_entropy(c_cls[:half_size], y_)
                    # MINE-Q on fake
                    etQ_bar = torch.mean(torch.exp(tQ_bar[:half_size]))
                    if D.ma_etQ_bar is None:
                        D.ma_etQ_bar = etQ_bar.detach().item()
                    D.ma_etQ_bar += config['ma_rate'] * (
                        etQ_bar.detach().item() - D.ma_etQ_bar)
                    MI_Q = torch.mean(tQ[:half_size]) - torch.log(
                        etQ_bar + EPSILON) * etQ_bar.detach() / D.ma_etQ_bar
                if config['loss_type'] == 'Twin_AC':
                    # Classifier on the real half plus the `mi` head on the
                    # generated half.
                    C_loss += F.cross_entropy(c_cls[half_size:],
                                              y[counter]) + F.cross_entropy(
                                                  mi[:half_size], y_)
                    if config['train_AC_on_fake']:
                        C_loss += F.cross_entropy(c_cls[:half_size], y_)
                if config['loss_type'] == 'AC':
                    C_loss += F.cross_entropy(
                        c_cls[half_size:],
                        y[counter])  # AC should be trained on fake also
                    if config['train_AC_on_fake']:
                        C_loss += F.cross_entropy(c_cls[:half_size], y_)
                # D minimises the classifier loss and maximises the MI
                # estimates, hence the minus sign on (MI_P + MI_Q).
                D_loss = (D_loss_real + D_loss_fake +
                          C_loss * config['AC_weight'] -
                          (MI_P + MI_Q) * MINE_weight) / float(
                              config['num_D_accumulations'])
                D_loss.backward()
                # The chunk index keeps advancing across D steps.
                counter += 1

            # Optionally apply ortho reg in D
            if config['D_ortho'] > 0.0:
                # Debug print to indicate we're using ortho reg in D.
                print('using modified ortho reg in D')
                utils.ortho(D, config['D_ortho'])

            D.optim.step()

        # Optionally toggle "requires_grad"
        if config['toggle_grads']:
            utils.toggle_grad(D, False)
            utils.toggle_grad(G, True)

        # Zero G's gradients by default before training G, for safety
        G.optim.zero_grad()
        for accumulation_index in range(config['num_G_accumulations']):
            z_.sample_()
            y_.sample_()
            # NOTE(review): the permutation covers only half_size entries
            # while the full z_/y_ are passed to GD below -- assumes the
            # generator batch equals config['batch_size']; confirm.
            gy_bar = y_[torch.randperm(half_size), ...] if D.TQ else None
            D_fake, mi, c_cls, tP, tP_bar, tQ, tQ_bar = GD(
                z_,
                y_,
                gy_bar=gy_bar,
                train_G=True,
                split_D=config['split_D'],
                return_G_z=False,
                add_bias=config['loss_type'] != 'fCGAN')
            # Auxiliary G terms, zero unless the loss_type branch sets them.
            C_loss = 0.
            MI_loss = 0.
            MI_Q_loss = 0.
            f_div = 0.
            if config['loss_type'] == 'fCGAN':
                # f-div
                f_div += (tQ - tP).mean()  # rev-kl
            if config['loss_type'] == 'MINE':
                # AC
                C_loss += F.cross_entropy(c_cls, y_)
                # MINE-Q
                MI_Q_loss = torch.mean(tQ) - torch.log(
                    torch.mean(torch.exp(tQ_bar)) + EPSILON)
            if config['loss_type'] == 'AC' or config['loss_type'] == 'Twin_AC':
                C_loss += F.cross_entropy(c_cls, y_)
                if config['loss_type'] == 'Twin_AC':
                    MI_loss = F.cross_entropy(mi, y_)

            # Scale every term by the accumulation count so the summed
            # gradients correspond to an average over passes.
            G_loss = generator_loss(D_fake) / float(
                config['num_G_accumulations'])
            C_loss = C_loss / float(config['num_G_accumulations'])
            MI_loss = MI_loss / float(config['num_G_accumulations'])
            MI_Q_loss = MI_Q_loss / float(config['num_G_accumulations'])
            f_div = f_div / float(config['num_G_accumulations'])
            # NOTE(review): this uses config['MINE_weight'] directly, whereas
            # the D phase uses the local MINE_weight, which falls back to 1.0
            # when 'weighted_MINE_loss' is off -- confirm this is intended.
            (G_loss + (C_loss - MI_loss) * config['AC_weight'] +
             MI_Q_loss * config['MINE_weight'] +
             f_div * config['fCGAN_weight']).backward()

        # Optionally apply modified ortho reg in G
        if config['G_ortho'] > 0.0:
            print('using modified ortho reg in G'
                  )  # Debug print to indicate we're using ortho reg in G
            # Don't ortho reg shared, it makes no sense. Really we should blacklist any embeddings for this
            utils.ortho(G,
                        config['G_ortho'],
                        blacklist=[param for param in G.shared.parameters()])
        G.optim.step()

        # If we have an ema, update it, regardless of if we test with it or not
        if config['ema']:
            ema.update(state_dict['itr'])

        # C_loss, MI_loss and f_div are the G-phase values (already divided by
        # the accumulation count); MI_P and MI_Q come from the last D pass.
        out = {
            'G_loss': float(G_loss.item()),
            'D_loss_real': float(D_loss_real.item()),
            'D_loss_fake': float(D_loss_fake.item()),
            'C_loss': utils.get_tensor_item(C_loss),
            'MI_loss': utils.get_tensor_item(MI_loss),
            'f_div': utils.get_tensor_item(f_div),
            'MI_P': utils.get_tensor_item(MI_P),
            'MI_Q': utils.get_tensor_item(MI_Q)
        }
        # Return G's loss and the components of D's loss.
        return out
Exemple #8
0
    def train(x, y):
        G.optim.zero_grad()
        D.optim.zero_grad()
        # How many chunks to split x and y into?
        x = torch.split(x, config['batch_size'])
        y = torch.split(y, config['batch_size'])
        counter = 0
        lambda_D = config['lambda_D']
        lambda_G = config['lambda_G']

        # Optionally toggle D and G's "require_grad"
        if config['toggle_grads']:
            utils.toggle_grad(D, True)
            utils.toggle_grad(G, False)

        for step_index in range(config['num_D_steps']):
            # If accumulating gradients, loop multiple times before an optimizer step
            D.optim.zero_grad()
            for accumulation_index in range(config['num_D_accumulations']):
                z_.sample_()
                y_.sample_()
                D_scores, D_scores_rotate90, D_scores_rotate180, D_scores_rotate270, \
                D_scores_croptl, D_scores_croptr, D_scores_cropbl, D_scores_cropbr, \
                D_scores_translation, D_scores_cutout = GD(z_[:config['batch_size']], y_[:config['batch_size']],
                              x[counter], y[counter], train_G=False, policy=config['DiffAugment'],
                              CR=config['CR'] > 0, CR_augment=config['CR_augment'])

                D_loss_CR = 0
                if config['CR'] > 0:
                    D_fake, D_real, D_real_aug = D_scores
                    D_loss_CR = torch.mean(
                        (D_real_aug - D_real)**2) * config['CR']
                else:
                    D_fake, D_real = D_scores
                    # rotation
                    D_fake_rotate90, D_real_rotate90 = D_scores_rotate90
                    D_fake_rotate180, D_real_rotate180 = D_scores_rotate180
                    D_fake_rotate270, D_real_rotate270 = D_scores_rotate270
                    # cropping
                    D_fake_croptl, D_real_croptl = D_scores_croptl
                    D_fake_croptr, D_real_croptr = D_scores_croptr
                    D_fake_cropbl, D_real_cropbl = D_scores_cropbl
                    D_fake_cropbr, D_real_cropbr = D_scores_cropbr
                    # translation & cutout
                    D_fake_translation, D_real_translation = D_scores_translation
                    D_fake_cutout, D_real_cutout = D_scores_cutout

                # Compute components of D's loss, average them, and divide by
                # the number of gradient accumulations
                D_loss_real, D_loss_fake = losses.discriminator_loss(
                    D_fake, D_real)
                # rotation
                D_loss_real_rotate90, D_loss_fake_rotate90 = losses.discriminator_loss(
                    D_fake_rotate90, D_real_rotate90)
                D_loss_real_rotate180, D_loss_fake_rotate180 = losses.discriminator_loss(
                    D_fake_rotate180, D_real_rotate180)
                D_loss_real_rotate270, D_loss_fake_rotate270 = losses.discriminator_loss(
                    D_fake_rotate270, D_real_rotate270)
                # croping
                D_loss_real_croptl, D_loss_fake_croptl = losses.discriminator_loss(
                    D_fake_croptl, D_real_croptl)
                D_loss_real_croptr, D_loss_fake_croptr = losses.discriminator_loss(
                    D_fake_croptr, D_real_croptr)
                D_loss_real_cropbl, D_loss_fake_cropbl = losses.discriminator_loss(
                    D_fake_cropbl, D_real_cropbl)
                D_loss_real_cropbr, D_loss_fake_cropbr = losses.discriminator_loss(
                    D_fake_cropbr, D_real_cropbr)
                # translation and cutout
                D_loss_real_translation, D_loss_fake_translation = losses.discriminator_loss(
                    D_fake_translation, D_real_translation)
                D_loss_real_cutout, D_loss_fake_cutout = losses.discriminator_loss(
                    D_fake_cutout, D_real_cutout)

                D_loss = D_loss_real + D_loss_fake + D_loss_CR
                # rotation
                D_loss_rotate90 = D_loss_real_rotate90 + D_loss_fake_rotate90
                D_loss_rotate180 = D_loss_real_rotate180 + D_loss_fake_rotate180
                D_loss_rotate270 = D_loss_real_rotate270 + D_loss_fake_rotate270
                # cropping
                D_loss_croptl = D_loss_real_croptl + D_loss_fake_croptl
                D_loss_croptr = D_loss_real_croptr + D_loss_fake_croptr
                D_loss_cropbl = D_loss_real_cropbl + D_loss_fake_cropbl
                D_loss_cropbr = D_loss_real_cropbr + D_loss_fake_cropbr
                # translation and cutout
                D_loss_translation = D_loss_real_translation + D_loss_fake_translation
                D_loss_cutout = D_loss_real_cutout + D_loss_fake_cutout

                D_loss = D_loss + lambda_D/4*(D_loss + D_loss_rotate90 + D_loss_rotate180 + D_loss_rotate270) \
                                + lambda_D/5*(D_loss + D_loss_croptl + D_loss_croptr + D_loss_cropbl + D_loss_cropbr) \
                                + lambda_D/2*(D_loss + D_loss_translation) \
                                + lambda_D/2*(D_loss + D_loss_cutout)

                D_loss = D_loss / float(config['num_D_accumulations'])
                D_loss.backward(retain_graph=True)
                counter += 1

            # Optionally apply ortho reg in D
            if config['D_ortho'] > 0.0:
                # Debug print to indicate we're using ortho reg in D.
                print('using modified ortho reg in D')
                utils.ortho(D, config['D_ortho'])

            D.optim.step()

        # Optionally toggle "requires_grad"
        if config['toggle_grads']:
            utils.toggle_grad(D, False)
            utils.toggle_grad(G, True)

        # Zero G's gradients by default before training G, for safety
        G.optim.zero_grad()

        if not config['fix_G']:
            # If accumulating gradients, loop multiple times
            for accumulation_index in range(config['num_G_accumulations']):
                z_.sample_()
                y_.sample_()
                D_fake, D_fake_rotate90, D_fake_rotate180, D_fake_rotate270, \
                        D_fake_croptl, D_fake_croptr, D_fake_cropbl, D_fake_cropbr, D_fake_translation, D_fake_cutout = GD(z_, y_, train_G=True, policy=config['DiffAugment'])

                G_loss_rotate0 = losses.generator_loss(D_fake) / float(
                    config['num_G_accumulations'])
                # rotation
                G_loss_rotate90 = losses.generator_loss(
                    D_fake_rotate90) / float(config['num_G_accumulations'])
                G_loss_rotate180 = losses.generator_loss(
                    D_fake_rotate180) / float(config['num_G_accumulations'])
                G_loss_rotate270 = losses.generator_loss(
                    D_fake_rotate270) / float(config['num_G_accumulations'])
                # cropping
                G_loss_croptl = losses.generator_loss(D_fake_croptl) / float(
                    config['num_G_accumulations'])
                G_loss_croptr = losses.generator_loss(D_fake_croptr) / float(
                    config['num_G_accumulations'])
                G_loss_cropbl = losses.generator_loss(D_fake_cropbl) / float(
                    config['num_G_accumulations'])
                G_loss_cropbr = losses.generator_loss(D_fake_cropbr) / float(
                    config['num_G_accumulations'])
                # translation and cutout
                G_loss_translation = losses.generator_loss(
                    D_fake_translation) / float(config['num_G_accumulations'])
                G_loss_cutout = losses.generator_loss(D_fake_cutout) / float(
                    config['num_G_accumulations'])

                G_loss = G_loss_rotate0 + lambda_G/4.*(G_loss_rotate0 + G_loss_rotate90 + G_loss_rotate180 + G_loss_rotate270) \
                                        + lambda_G/5.*(G_loss_rotate0 + G_loss_croptl + G_loss_croptr + G_loss_cropbl + G_loss_cropbr) \
                                        + lambda_G/2.*(G_loss_rotate0 + G_loss_translation) \
                                        + lambda_G/2.*(G_loss_rotate0 + G_loss_cutout)

                G_loss.backward()

            # Optionally apply modified ortho reg in G
            if config['G_ortho'] > 0.0:
                # Debug print to indicate we're using ortho reg in G
                print('using modified ortho reg in G')
                # Don't ortho reg shared, it makes no sense. Really we should blacklist any embeddings for this
                utils.ortho(
                    G,
                    config['G_ortho'],
                    blacklist=[param for param in G.shared.parameters()])
            G.optim.step()

            # If we have an ema, update it, regardless of if we test with it or not
            if config['ema']:
                ema.update(state_dict['itr'])

        out = {
            'G_loss': float(G_loss.item()) if not config['fix_G'] else 0,
            'D_loss_real': float(D_loss_real.item()),
            'D_loss_fake': float(D_loss_fake.item()),
        }
        if config['CR'] > 0:
            out['D_loss_CR'] = float(D_loss_CR.item())
        # Return G's loss and the components of D's loss.
        return out
    def train_mode_seeing(x, y):
        """Run one GAN training iteration with a mode-seeking penalty on G.

        The incoming batch is cut into chunks of ``config['batch_size']``.
        D is stepped ``config['num_D_steps']`` times, each step accumulating
        gradients over ``config['num_D_accumulations']`` chunks.  G is then
        stepped once after ``config['num_G_accumulations']`` accumulations.
        In every G accumulation two latents are drawn for the same labels and
        ``1 / (lz + eps)`` is added to the adversarial loss, where ``lz`` is
        the mean absolute difference of the two generated outputs divided by
        the mean absolute difference of the two latents.

        Args:
            x: real samples; must be splittable with ``torch.split``.
            y: labels accompanying ``x``.

        Returns:
            dict with float entries ``'G_loss'``, ``'D_loss_real'`` and
            ``'D_loss_fake'``, each taken from the last accumulation that ran.
        """
        G.optim.zero_grad()
        D.optim.zero_grad()

        # Cut the batch into per-accumulation chunks.
        chunk_size = config['batch_size']
        x = torch.split(x, chunk_size)
        y = torch.split(y, chunk_size)
        chunk_idx = 0

        # ---------------- discriminator phase ----------------
        if config['toggle_grads']:
            utils.toggle_grad(D, True)
            utils.toggle_grad(G, False)

        for _step in range(config['num_D_steps']):
            D.optim.zero_grad()
            n_d_acc = config['num_D_accumulations']
            for _acc in range(n_d_acc):
                z_.sample_()
                y_.sample_()
                # GD also returns two feature outputs that are unused here.
                D_fake, _, D_real, _ = GD(z_[:chunk_size],
                                          y_[:chunk_size],
                                          x[chunk_idx],
                                          y[chunk_idx],
                                          train_G=False,
                                          split_D=config['split_D'])
                D_loss_real, D_loss_fake = discriminator_loss(D_fake, D_real)
                # Scale so the accumulated gradient averages over the chunks.
                D_loss = (D_loss_real + D_loss_fake) / float(n_d_acc)
                D_loss.backward()
                chunk_idx += 1

            # Optional orthogonal regularisation of D.
            d_ortho = config['D_ortho']
            if d_ortho > 0.0:
                utils.ortho(D, d_ortho)

            # Optional gradient-norm clipping before the optimizer step.
            d_clip = config['clip_norm']
            if d_clip is not None:
                torch.nn.utils.clip_grad_norm_(D.parameters(), d_clip)
            D.optim.step()

        # ---------------- generator phase ----------------
        if config['toggle_grads']:
            utils.toggle_grad(D, False)
            utils.toggle_grad(G, True)

        # Start G from clean gradients.
        G.optim.zero_grad()

        n_g_acc = config['num_G_accumulations']

        def _fake_pass(resample_labels):
            # Draw a new latent (and, if requested, new labels), push it
            # through GD and return (latent copy, generated sample, scaled
            # generator loss).  D_real is the last real logits from the D phase.
            z_.sample_()
            if resample_labels:
                y_.sample_()
            latent = z_.data.clone().detach()
            logits, _, sample = GD(latent,
                                   y_,
                                   train_G=True,
                                   split_D=config['split_D'],
                                   return_G_z=True)
            loss = generator_loss(logits, D_real.detach()) / float(n_g_acc)
            return latent, sample, loss

        eps = 1e-5
        for _acc in range(n_g_acc):
            # Two generations that share labels but use different latents.
            z1, fake_image1, G_loss1 = _fake_pass(True)
            z2, fake_image2, G_loss2 = _fake_pass(False)

            # Mode-seeking ratio: output distance over latent distance.
            lz = torch.mean(torch.abs(fake_image2 - fake_image1)) / torch.mean(
                torch.abs(z2 - z1))
            G_loss = G_loss1 + G_loss2 + 1 / (lz + eps)
            G_loss.backward()

        # Optional orthogonal regularisation of G; the shared module's
        # parameters are excluded.
        g_ortho = config['G_ortho']
        if g_ortho > 0.0:
            utils.ortho(G, g_ortho, blacklist=list(G.shared.parameters()))

        g_clip = config['clip_norm']
        if g_clip is not None:
            torch.nn.utils.clip_grad_norm_(G.parameters(), g_clip)
        G.optim.step()

        # The EMA copy is updated whenever it is enabled.
        if config['ema']:
            ema.update(state_dict['itr'])

        return {
            'G_loss': float(G_loss.item()),
            'D_loss_real': float(D_loss_real.item()),
            'D_loss_fake': float(D_loss_fake.item())
        }
    def train(x, y):
        """Run one training iteration using the WGAN losses from ``losses``.

        D is stepped ``config['num_D_steps']`` times.  For every accumulated
        chunk the Wasserstein terms come from
        ``losses.wgan_discriminator_loss`` and a penalty on the real inputs
        is requested from ``gan_losses.compute_grad2`` with ``backward=True``
        (presumably that call back-propagates the penalty itself -- confirm
        against its implementation).  ``config.train_fns_c`` toggles an
        adversarial term (``adv_train``) and a bounded loss (``use_bound``).
        G is then stepped once with ``losses.wgan_generator_loss``.

        Args:
            x: real samples; ``requires_grad_`` is enabled on it in place.
            y: labels accompanying ``x``.

        Returns:
            dict holding ``'wd'`` and ``'G_loss'`` as Python numbers, ``'gp'``
            as the tensor produced by ``gp.mean()`` and, when the bound is
            active, ``'bound'``.  Logit statistics are not returned; they are
            sent to ``myargs.textlogger``.
        """
        opts = config.train_fns_c
        summary = {}
        summary_D = {}
        G.optim.zero_grad()
        D.optim.zero_grad()

        # The penalty differentiates w.r.t. the real inputs, so gradient
        # tracking is switched on before the batch is cut into chunks.
        x.requires_grad_()
        chunk_size = config['batch_size']
        x = torch.split(x, chunk_size)
        y = torch.split(y, chunk_size)
        chunk_idx = 0

        # ---------------- discriminator phase ----------------
        if config['toggle_grads']:
            utils.toggle_grad(D, True)
            utils.toggle_grad(G, False)

        for _step in range(config['num_D_steps']):
            D.optim.zero_grad()
            n_d_acc = config['num_D_accumulations']
            for _acc in range(n_d_acc):
                z_.sample_()
                y_.sample_()
                real_x = x[chunk_idx]
                real_y = y[chunk_idx]
                # The generated batch returned by GD is not used here.
                D_fake, D_real, _ = GD(z_[:chunk_size],
                                       y_[:chunk_size],
                                       real_x,
                                       real_y,
                                       train_G=False,
                                       split_D=config['split_D'],
                                       return_G_z=True)

                r_logit_mean, f_logit_mean, wd, _ = \
                    losses.wgan_discriminator_loss(r_logit=D_real,
                                                   f_logit=D_fake)

                # Penalty on the real inputs; its weight is divided by the
                # number of accumulations.
                img_gp, gp = gan_losses.compute_grad2(
                    d_out=D_real,
                    x_in=real_x,
                    backward=True,
                    gp_lambda=10. / n_d_acc,
                    return_grad=True)

                if opts.adv_train:
                    summary_D['r_logit_mean_adv'] = losses.adv_loss(
                        netD=GD,
                        img=real_x,
                        y=real_y,
                        gp_img=img_gp,
                        adv_lr=0.01,
                        retain_graph=True)

                # Maximise wd; optionally add relu(wd - bound) on top.
                D_loss = -wd
                if opts.use_bound:
                    D_loss = D_loss + torch.relu(wd - float(config.bound))
                    summary['bound'] = config.bound
                D_loss = D_loss / float(n_d_acc)

                D_loss.backward(retain_graph=True)
                chunk_idx += 1

            # Statistics of the last accumulated chunk of this step.
            summary_D.update(r_logit_mean=r_logit_mean.item(),
                             f_logit_mean=f_logit_mean.item())
            summary.update(wd=wd.item(), gp=gp.mean())

            d_ortho = config['D_ortho']
            if d_ortho > 0.0:
                # Debug print to indicate we're using ortho reg in D.
                print('using modified ortho reg in D')
                utils.ortho(D, d_ortho)

            D.optim.step()

        # ---------------- generator phase ----------------
        if config['toggle_grads']:
            utils.toggle_grad(D, False)
            utils.toggle_grad(G, True)

        # Start G from clean gradients.
        G.optim.zero_grad()

        n_g_acc = config['num_G_accumulations']
        for _acc in range(n_g_acc):
            z_.sample_()
            y_.sample_()
            fake_logits = GD(z_, y_, train_G=True, split_D=config['split_D'])
            G_f_logit_mean, G_loss = losses.wgan_generator_loss(
                f_logit=fake_logits)
            G_loss = G_loss / float(n_g_acc)
            G_loss.backward()
            summary_D['G_f_logit_mean'] = G_f_logit_mean.item()
            summary['G_loss'] = G_loss.item()

        g_ortho = config['G_ortho']
        if g_ortho > 0.0:
            # Debug print to indicate we're using ortho reg in G
            print('using modified ortho reg in G')
            # The shared module's parameters are excluded from the regulariser.
            utils.ortho(G, g_ortho, blacklist=list(G.shared.parameters()))
        G.optim.step()

        # The EMA copy is updated whenever it is enabled.
        if config['ema']:
            ema.update(state_dict['itr'])

        myargs.textlogger.log(state_dict['itr'], **summary_D)
        return summary
    def train(x, y):
        """Run one training iteration for D, then jointly for G and encoder E.

        The batch is cut into chunks of ``config['batch_size']``.  D is
        stepped ``config['num_D_steps']`` times with gradient accumulation;
        G and E are then updated together from the adversarial loss plus the
        weighted VAE reconstruction and KLD terms (and an optional extra term
        when ``GDE`` returns a sixth output).  Setting
        ``config['stop_gradient']`` skips every optimizer step.

        Args:
            x: real samples; must be splittable with ``torch.split``.
            y: labels accompanying ``x``.

        Returns:
            dict of floats: ``'G_loss'``, ``'D_loss_real'``, ``'D_loss_fake'``,
            ``'VAE_recon_loss'`` and ``'VAE_KLD_loss'``, each from the last
            accumulation that ran.
        """
        G.optim.zero_grad()
        D.optim.zero_grad()
        E.optim.zero_grad()

        # How many chunks to split x and y into?
        x = torch.split(x, config['batch_size'])
        y = torch.split(y, config['batch_size'])
        print("split - x {}".format(len(x)))
        counter = 0

        # Optionally toggle D, G and E's "requires_grad"
        if config['toggle_grads']:
            utils.toggle_grad(D, True)
            utils.toggle_grad(G, False)
            utils.toggle_grad(E, False)

        for step_index in range(config['num_D_steps']):
            # If accumulating gradients, loop multiple times before an optimizer step
            D.optim.zero_grad()
            for accumulation_index in range(config['num_D_accumulations']):
                # Corner case for the last batch: it may hold fewer chunks
                # than the requested number of accumulations.
                if counter >= len(x):
                    break
                D_fake, D_real = GDE(x[counter], y[counter], config,
                                     state_dict['itr'], img_pool,
                                     train_G=False,
                                     split_D=config['split_D'])
                # Compute components of D's loss, average them, and divide by
                # the number of gradient accumulations
                D_loss_real, D_loss_fake = losses.discriminator_loss(
                    D_fake, D_real, config['clip'])
                D_loss = (D_loss_real + D_loss_fake) / \
                    float(config['num_D_accumulations'])
                print("D_loss: {}; D_fake {}, D_real {}".format(D_loss.item(), D_loss_fake.item(), D_loss_real.item()))
                D_loss.backward()
                counter += 1

            # Optionally apply ortho reg in D
            if config['D_ortho'] > 0.0:
                # Debug print to indicate we're using ortho reg in D.
                print('using modified ortho reg in D')
                utils.ortho(D, config['D_ortho'])

            # stop gradient for testing purpose
            if config['stop_gradient']:
                print("!!! D is not optimized since you turn on `stop_gradient`!!!!!!")
            else:
                D.optim.step()

        # Optionally toggle "requires_grad"
        if config['toggle_grads']:
            utils.toggle_grad(D, False)
            utils.toggle_grad(G, True)
            utils.toggle_grad(E, True)

        # Zero G/E's gradients by default before training G, for safety
        G.optim.zero_grad()
        E.optim.zero_grad()

        # The G/E phase walks the same chunks again from the start.
        counter = 0
        for accumulation_index in range(config['num_G_accumulations']):
            if counter >= len(x):
                break
            output = GDE(x[counter], y[counter], config, state_dict['itr'],
                         img_pool, train_G=True, split_D=config['split_D'],
                         return_G_z=True)
            # output[1] is not used here; a sixth element, when present, is
            # an additional generator penalty.
            D_fake = output[0]
            G_z = output[2]
            mu, log_var = output[3], output[4]
            if len(output) == 6:
                G_additional = output[5]

            G_loss = losses.generator_loss(
                D_fake) / float(config['num_G_accumulations'])
            VAE_recon_loss = losses.vae_recon_loss(G_z, x[counter])
            VAE_kld_loss = losses.vae_kld_loss(mu, log_var, config['clip'])
            GE_loss = G_loss + VAE_recon_loss * config['lambda_vae_recon'] + VAE_kld_loss * config['lambda_vae_kld']

            log_loss_str = f"GE_loss {GE_loss.item()}; VAE_recon_loss {VAE_recon_loss.item()}; VAE_kld_loss {VAE_kld_loss.item()} "

            # add G_additional loss
            if len(output) == 6:
                G_additional_loss = config['lambda_g_additional'] * G_additional.sum()
                GE_loss += G_additional_loss
                log_loss_str += f"G_additional {G_additional_loss.item()}"

            # print out loss
            print(log_loss_str)

            # optimization
            GE_loss.backward()
            counter += 1

        # Optionally apply modified ortho reg in G
        if config['G_ortho'] > 0.0:
            # Debug print to indicate we're using ortho reg in G
            print('using modified ortho reg in G')
            # Don't ortho reg shared, it makes no sense. Really we should blacklist any embeddings for this
            utils.ortho(G, config['G_ortho'],
                        blacklist=[param for param in G.shared.parameters()])

        # stop gradient for testing purpose
        if config['stop_gradient']:
            print("!!! G and E is not optimized since you turn on `stop_gradient`!!!!!!")
        else:
            G.optim.step()
            E.optim.step()

        # If we have an ema, update it, regardless of if we test with it or not
        if config['ema']:
            ema.update(state_dict['itr'])

        # 'VAE_KLD_loss' must come from the KLD term, not from a second copy
        # of the reconstruction loss.
        out = {'G_loss': float(G_loss.item()),
               'D_loss_real': float(D_loss_real.item()),
               'D_loss_fake': float(D_loss_fake.item()),
               'VAE_recon_loss': float(VAE_recon_loss.item()),
               'VAE_KLD_loss': float(VAE_kld_loss.item())}
        # Return G's loss, the components of D's loss and the VAE terms.
        return out
Exemple #12
0
 def train(x, y):
     """Run one GAN training iteration, conditional or unconditional.

     When ``config['conditional']`` is false the sampled labels ``y_`` are
     zeroed in place and the real labels handed to D are replaced by zeros
     of the same shape.  The D loss here is not divided by
     ``config['num_D_accumulations']``; the G loss is divided by
     ``config['num_G_accumulations']``.

     Args:
         x: real samples; must be splittable with ``torch.split``.
         y: labels accompanying ``x``.

     Returns:
         dict with float entries ``'G_loss'``, ``'D_loss_real'`` and
         ``'D_loss_fake'`` from the last accumulation of each phase.
     """
     G.optim.zero_grad()
     D.optim.zero_grad()

     # Cut the batch into per-accumulation chunks.
     chunk_size = config['batch_size']
     x = torch.split(x, chunk_size)
     y = torch.split(y, chunk_size)
     chunk_idx = 0

     # ---------------- discriminator phase ----------------
     if config['toggle_grads']:
         utils.toggle_grad(D, True)
         utils.toggle_grad(G, False)

     for _step in range(config['num_D_steps']):
         D.optim.zero_grad()
         for _acc in range(config['num_D_accumulations']):
             z_.sample_()
             has_chunk = chunk_idx < len(y)
             if config['conditional']:
                 y_.sample_()
                 y_counter = y[chunk_idx]
             else:
                 # Unconditional: every label becomes zero.
                 y_.zero_()
                 if has_chunk:
                     y_counter = torch.zeros_like(y[chunk_idx]).to(
                         y_.device).long()
             # Once the chunks run out, the previous real batch (and, in the
             # unconditional case, the previous labels) are reused.
             if has_chunk:
                 real_samples = x[chunk_idx]
             D_fake, D_real = GD(z_[:chunk_size],
                                 y_[:chunk_size],
                                 real_samples,
                                 y_counter,
                                 train_G=False,
                                 split_D=config['split_D'])
             D_loss_real, D_loss_fake = discriminator_loss(D_fake, D_real)
             (D_loss_real + D_loss_fake).backward()
             chunk_idx += 1

         d_ortho = config['D_ortho']
         if d_ortho > 0.0:
             print('using modified ortho reg in D')
             utils.ortho(D, d_ortho)
         D.optim.step()

     # ---------------- generator phase ----------------
     if config['toggle_grads']:
         utils.toggle_grad(D, False)
         utils.toggle_grad(G, True)

     G.optim.zero_grad()
     n_g_acc = config['num_G_accumulations']
     for _acc in range(n_g_acc):
         z_.sample_()
         y_.sample_()
         if not config['conditional']:
             y_.zero_()
         # The first real chunk is passed to the generator loss every time.
         reference = x[0]
         D_fake = GD(z_, y_, train_G=True, split_D=config['split_D'])
         generated = G.forward(z_, y_)
         G_loss = generator_loss(D_fake, reference, z_,
                                 generated) / float(n_g_acc)
         G_loss.backward()

     g_ortho = config['G_ortho']
     if g_ortho > 0.0:
         print('using modified ortho reg in G')
         # The shared module's parameters are excluded from the regulariser.
         utils.ortho(G, g_ortho, blacklist=list(G.shared.parameters()))
     G.optim.step()

     if config['ema']:
         ema.update(state_dict['itr'])

     return {
         'G_loss': float(G_loss.item()),
         'D_loss_real': float(D_loss_real.item()),
         'D_loss_fake': float(D_loss_fake.item())
     }
Exemple #13
0
  def train(x, y):
    """Run one training iteration with auxiliary-classifier losses.

    D is stepped ``config['num_D_steps']`` times and G
    ``config['num_G_steps']`` times, each step accumulating gradients over
    the configured number of accumulations.  ``config['loss_type']`` selects
    the auxiliary terms ('AC', 'Twin_AC' or 'Twin_AC_M'); any other value
    leaves only the adversarial loss.  ``requires_grad`` is toggled
    unconditionally in this trainer.

    Args:
      x: real samples; must be splittable with ``torch.split``.
      y: labels accompanying ``x``.

    Returns:
      dict with ``'G_loss'``, ``'D_loss_real'`` and ``'D_loss_fake'`` as
      floats, plus ``'C_loss'`` and ``'MI_loss'`` exactly as last computed in
      the G phase (tensors, or the number 0 when the term is inactive).
    """
    G.optim.zero_grad()
    D.optim.zero_grad()
    # How many chunks to split x and y into?
    x = torch.split(x, config['batch_size'])
    y = torch.split(y, config['batch_size'])
    counter = 0

    # Train D only: enable its gradients and freeze G.
    utils.toggle_grad(D, True)
    utils.toggle_grad(G, False)

    for step_index in range(config['num_D_steps']):
      # Reset per step so gradients already applied by the previous
      # optimizer step are not accumulated into this one.
      D.optim.zero_grad()
      # If accumulating gradients, loop multiple times before an optimizer step
      for accumulation_index in range(config['num_D_accumulations']):
        z_.sample_()
        y_.sample_()
        # Labels actually fed to G for this chunk; the auxiliary loss on the
        # fake half must use the same slice so logits and targets line up.
        y_fake = y_[:config['batch_size']]
        D_fake, D_real, mi, c_cls = GD(z_[:config['batch_size']], y_fake,
                                       x[counter], y[counter], train_G=False,
                                       split_D=config['split_D'])

        # Compute components of D's loss, average them, and divide by
        # the number of gradient accumulations
        D_loss_real, D_loss_fake = losses.discriminator_loss(D_fake, D_real)
        # The first n_fake rows of mi / c_cls are indexed as the fake half,
        # the remaining rows as the real half.
        n_fake = D_fake.shape[0]
        C_loss = 0
        if config['loss_type'] == 'Twin_AC':
          C_loss += F.cross_entropy(c_cls[n_fake:], y[counter]) + F.cross_entropy(mi[:n_fake], y_fake)
        if config['loss_type'] == 'Twin_AC_M':
          C_loss += hinge_multi(c_cls[n_fake:], y[counter]) + hinge_multi(mi[:n_fake], y_fake)
        if config['loss_type'] == 'AC':
          C_loss += F.cross_entropy(c_cls[n_fake:], y[counter])
        D_loss = (D_loss_real + D_loss_fake + C_loss*config['AC_weight']) / float(config['num_D_accumulations'])
        D_loss.backward()
        counter += 1

      # Optionally apply ortho reg in D
      if config['D_ortho'] > 0.0:
        # Debug print to indicate we're using ortho reg in D.
        print('using modified ortho reg in D')
        utils.ortho(D, config['D_ortho'])

      D.optim.step()

    # Train G only: freeze D and enable G's gradients.
    utils.toggle_grad(D, False)
    utils.toggle_grad(G, True)

    # Zero G's gradients by default before training G, for safety
    G.optim.zero_grad()
    for step_index in range(config['num_G_steps']):
      # Reset per step for the same reason as in the D phase.
      G.optim.zero_grad()
      for accumulation_index in range(config['num_G_accumulations']):
        z_.sample_()
        y_.sample_()
        D_fake, G_z, mi, c_cls = GD(z_, y_, train_G=True, split_D=config['split_D'], return_G_z=True)
        C_loss = 0
        MI_loss = 0
        if config['loss_type'] == 'AC' or config['loss_type'] == 'Twin_AC':
          C_loss = F.cross_entropy(c_cls, y_)
          if config['loss_type'] == 'Twin_AC':
            MI_loss = F.cross_entropy(mi, y_)
        if config['loss_type'] == 'Twin_AC_M':
          C_loss = hinge_multi(c_cls, y_, hinge=False)
          MI_loss = hinge_multi(mi, y_, hinge=False)

        G_loss = losses.generator_loss(D_fake) / float(config['num_G_accumulations'])
        C_loss = C_loss / float(config['num_G_accumulations'])
        MI_loss = MI_loss / float(config['num_G_accumulations'])
        # G minimises the classifier term and maximises the twin (mi) term.
        (G_loss + (C_loss - MI_loss)*config['AC_weight']).backward()

      # Optionally apply modified ortho reg in G
      if config['G_ortho'] > 0.0:
        print('using modified ortho reg in G')  # Debug print to indicate we're using ortho reg in G
        # Don't ortho reg shared, it makes no sense. Really we should blacklist any embeddings for this
        utils.ortho(G, config['G_ortho'],
                    blacklist=[param for param in G.shared.parameters()])
      G.optim.step()

    # If we have an ema, update it, regardless of if we test with it or not
    if config['ema']:
      ema.update(state_dict['itr'])

    out = {'G_loss': float(G_loss.item()),
           'D_loss_real': float(D_loss_real.item()),
           'D_loss_fake': float(D_loss_fake.item()),
           'C_loss': C_loss,
           'MI_loss': MI_loss}
    # Return G's loss and the components of D's loss.
    return out
  def train(x, y):
    """Run one training iteration using ``omni_loss`` for both D and G.

    ``global_cfg.omni_loss.mode`` ('only_p', 'p_and_n' or 'one_side') picks
    the ``default_label`` passed to ``omni_loss`` and, for the fake term of
    D, whether the sampled labels are passed as positives or negatives.
    'only_p' is rejected as deprecated when the real term of D is computed,
    and any other mode fails an assertion.  When ``val_loaders`` is not None
    one validation batch is scored with D under ``torch.no_grad()``.

    Args:
      x: real samples; must be splittable with ``torch.split``.
      y: labels accompanying ``x``.

    Returns:
      ``default_dict`` after it has been cleared and its ``'D_loss'`` entry
      updated with ``'D_real_loss'``, ``'D_fake_loss'``, ``'G_loss'`` and,
      if validation ran, ``'D_val_loss'``.
    """
    G.optim.zero_grad()
    D.optim.zero_grad()

    # Cut the batch into per-accumulation chunks.
    chunk_size = config['batch_size']
    x = torch.split(x, chunk_size)
    y = torch.split(y, chunk_size)
    chunk_idx = 0

    # ---------------- discriminator phase ----------------
    if config['toggle_grads']:
      utils.toggle_grad(D, True)
      utils.toggle_grad(G, False)

    for _step in range(config['num_D_steps']):
      D.optim.zero_grad()
      n_d_acc = config['num_D_accumulations']
      for _acc in range(n_d_acc):
        z_.sample_()
        y_.sample_()
        D_fake, D_real = GD(z_[:chunk_size], y_[:chunk_size],
                            x[chunk_idx], y[chunk_idx], train_G=False,
                            split_D=config['split_D'])

        mode = global_cfg.omni_loss.mode

        # Real term: positives are the true class and index n_classes.
        real_positive = [y[chunk_idx], config['n_classes']]
        if mode == 'only_p':
          assert 0, "deprecated"
          real_label = -1
        elif mode == 'p_and_n':
          real_label = 0
        elif mode == 'one_side':
          real_label = -1
        else:
          assert 0
        D_loss_real = omni_loss(pred=D_real, positive=real_positive,
                                default_label=real_label)

        # Fake term: index n_classes + 1 is the positive, except in
        # 'one_side' mode where the sampled class and index n_classes are
        # passed as negatives instead.
        fake_positive = (config['n_classes'] + 1,)
        if mode == 'only_p':
          fake_args = {'positive': fake_positive, 'default_label': -1}
        elif mode == 'p_and_n':
          fake_args = {'positive': fake_positive, 'default_label': 0}
        elif mode == 'one_side':
          fake_args = {'positive': None,
                       'negative': [y_[:chunk_size], config['n_classes']],
                       'default_label': -1}
        else:
          assert 0
        D_loss_fake = omni_loss(pred=D_fake, **fake_args)

        # Scale so the accumulated gradient averages over the chunks.
        D_loss = (D_loss_real + D_loss_fake) / float(n_d_acc)
        D_loss.backward()
        chunk_idx += 1

      d_ortho = config['D_ortho']
      if d_ortho > 0.0:
        # Debug print to indicate we're using ortho reg in D.
        print('using modified ortho reg in D')
        utils.ortho(D, d_ortho)

      D.optim.step()

    # D metrics come from the last accumulated chunk.
    out = {'D_real_loss': D_loss_real.item(),
           'D_fake_loss': D_loss_fake.item()}

    # ---------------- generator phase ----------------
    if config['toggle_grads']:
      utils.toggle_grad(D, False)
      utils.toggle_grad(G, True)

    # Start G from clean gradients.
    G.optim.zero_grad()

    n_g_acc = config['num_G_accumulations']
    for _acc in range(n_g_acc):
      z_.sample_()
      y_.sample_()
      D_fake = GD(z_, y_, train_G=True, split_D=config['split_D'])
      # G wants its samples scored as the sampled class and index n_classes.
      gen_positive = (y_, config['n_classes'])
      mode = global_cfg.omni_loss.mode
      if mode in ('only_p', 'one_side'):
        gen_label = -1
      elif mode == 'p_and_n':
        gen_label = 0
      else:
        assert 0
      G_loss = omni_loss(pred=D_fake, positive=gen_positive,
                         default_label=gen_label) / float(n_g_acc)
      G_loss.backward()

    g_ortho = config['G_ortho']
    if g_ortho > 0.0:
      print('using modified ortho reg in G') # Debug print to indicate we're using ortho reg in G
      # The shared module's parameters are excluded from the regulariser.
      utils.ortho(G, g_ortho, blacklist=list(G.shared.parameters()))
    G.optim.step()

    out['G_loss'] = G_loss.item()

    # The EMA copy is updated whenever it is enabled.
    if config['ema']:
      ema.update(state_dict['itr'])

    # Optional validation: score one held-out batch with D, no gradients.
    if val_loaders is not None:
      val_x, val_y = next(val_loaders)
      val_x = val_x.cuda()
      val_y = val_y.cuda()
      with torch.no_grad():
        D_val = D(val_x, val_y)

        val_positive = (val_y, config['n_classes'])
        mode = global_cfg.omni_loss.mode
        if mode in ('only_p', 'one_side'):
          val_label = -1
        elif mode == 'p_and_n':
          val_label = 0
        else:
          assert 0
        D_val_loss = omni_loss(pred=D_val, positive=val_positive,
                               default_label=val_label)
        out['D_val_loss'] = D_val_loss.item()

    # Results are published under the 'D_loss' key of the shared dict.
    default_dict.clear()
    default_dict['D_loss'].update(out)
    return default_dict
Exemple #15
0
    def train(x, y, this_iter):
        """Run one training iteration: D step(s), one G step, optional E step(s).

        Relies on names from the enclosing scope that are not visible here:
        ``G``, ``D``, ``E``, ``Prior``, ``GD``, ``config``, ``my_loss``,
        ``utils``, ``ema`` and ``state_dict``.

        Args:
            x: Real samples. Split into chunks of ``config['batch_size']``;
                one chunk is consumed per D gradient accumulation.
            y: Labels for ``x``, split the same way.
            this_iter: Iteration number. When the prior type is not
                ``'default'``, the prior is only updated on iterations where
                ``this_iter % config['update_GMM_every_n'] == 0``.

        Returns:
            dict of Python floats: ``'G_loss'``, ``'D_loss_real'`` and
            ``'D_loss_fake'`` (values from the last accumulation of each
            phase); ``'P_acc_samples'`` when the prior was updated during the
            encoder phase; ``'E_log_likelihood'`` and ``'E_MSE_loss'`` when
            ``E`` is not None.
        """
        # A non-default prior carries its own optimizer; it is only stepped
        # on every `update_GMM_every_n`-th iteration.
        learn_prior = config['prior_type'] != 'default'
        update_prior = (learn_prior
                        and not this_iter % config['update_GMM_every_n'])

        G.optim.zero_grad()
        D.optim.zero_grad()
        if E is not None:
            E.optim.zero_grad()
        if learn_prior:
            Prior.optim.zero_grad()

        # Split the real batch into per-accumulation chunks.
        x = torch.split(x, config['batch_size'])
        y = torch.split(y, config['batch_size'])
        counter = 0  # index of the next real chunk consumed by D

        ######### Discriminator ##############
        # Optionally toggle "requires_grad" so only D receives gradients.
        if config['toggle_grads']:
            if E is not None:
                utils.toggle_grad(E, False)
            if learn_prior:
                utils.toggle_grad(Prior, False)
            utils.toggle_grad(D, True)
            utils.toggle_grad(G, False)

        for step_index in range(config['num_D_steps']):
            # If accumulating gradients, loop multiple times before an
            # optimizer step.
            D.optim.zero_grad()
            for accumulation_index in range(config['num_D_accumulations']):
                z_, y_ = Prior.sample_()
                D_fake, D_real = GD(z_[:config['batch_size']],
                                    y_[:config['batch_size']],
                                    x[counter],
                                    y[counter],
                                    train_G=False,
                                    split_D=config['split_D'],
                                    is_Enc=False)

                # Compute components of D's loss, sum them, and divide by
                # the number of gradient accumulations.
                D_loss_real, D_loss_fake = my_loss.discriminator_loss(
                    D_fake, D_real)
                D_loss = (D_loss_real + D_loss_fake) / float(
                    config['num_D_accumulations'])
                D_loss.backward()
                counter += 1

            # Optionally apply ortho reg in D
            if config['D_ortho'] > 0.0:
                # Debug print to indicate we're using ortho reg in D.
                print('using modified ortho reg in D')
                utils.ortho(D, config['D_ortho'])

            D.optim.step()

        ########## Generator ################
        # Optionally toggle "requires_grad" so G (and, on update iterations,
        # the prior) receive gradients while D is frozen.
        if config['toggle_grads']:
            if update_prior:
                utils.toggle_grad(Prior, True)
            utils.toggle_grad(D, False)
            utils.toggle_grad(G, True)
        # Zero G's gradients by default before training G, for safety
        G.optim.zero_grad()
        if learn_prior:
            Prior.optim.zero_grad()
        # If accumulating gradients, loop multiple times
        for accumulation_index in range(config['num_G_accumulations']):
            z_, y_ = Prior.sample_()
            D_fake = GD(z_, y_, train_G=True, split_D=config['split_D'])
            G_loss = my_loss.generator_loss(D_fake)
            (G_loss / float(config['num_G_accumulations'])).backward()

        # Optionally apply modified ortho reg in G
        if config['G_ortho'] > 0.0:
            # Debug print to indicate we're using ortho reg in G
            print('using modified ortho reg in G')
            # Don't ortho reg shared, it makes no sense. Really we should
            # blacklist any embeddings for this
            utils.ortho(G,
                        config['G_ortho'],
                        blacklist=[param for param in G.shared.parameters()])
        G.optim.step()
        if update_prior:
            Prior.optim.step()

        ############# Encoder ##########
        # Stays None unless the prior is actually updated in the encoder
        # phase; previously the name could be read while unbound (E is None
        # or zero encoder steps), raising NameError when building `out`.
        acc_samples = None
        if E is not None:
            # Optionally toggle "requires_grad" so only E is trained here
            # (the prior's flag is left as set in the generator phase).
            if config['toggle_grads']:
                utils.toggle_grad(D, False)
                utils.toggle_grad(G, False)
                utils.toggle_grad(E, True)

            for step_index in range(config['num_E_steps']):
                # Zero E's gradients before each encoder step.
                E.optim.zero_grad()
                if learn_prior:
                    Prior.optim.zero_grad()

                # The prior losses are only added on the first encoder step
                # of a prior-update iteration.
                train_prior_now = update_prior and step_index == 0

                # If accumulating gradients, loop multiple times
                for accumulation_index in range(config['num_E_accumulations']):
                    z_, y_ = Prior.sample_()
                    z_mu, z_lv = GD(z_,
                                    y_,
                                    train_G=False,
                                    split_D=config['split_D'],
                                    is_Enc=True)
                    z_p = z_ if not config['is_latent_detach'] else z_.detach()
                    E_loss = my_loss.log_likelihood(z_p, z_mu, z_lv) / float(
                        config['lambda_encoder'])
                    total_loss = E_loss
                    if train_prior_now:
                        log_y_pred = Prior.latent_classification(z_)
                        # NOTE(review): Prior_loss is divided by
                        # num_E_accumulations here and again with total_loss
                        # below -- confirm the double scaling is intended.
                        Prior_loss = my_loss.classification_loss(
                            log_y_pred, y_) / float(
                                config['num_E_accumulations'])
                        # Out-of-place add: `total_loss += ...` would mutate
                        # the tensor shared with E_loss and corrupt the
                        # reported 'E_log_likelihood'.
                        total_loss = total_loss + Prior_loss
                        if config['is_loss3'] != 0:
                            if config['is_loss3'] == -1:
                                # -1 selects the 1 / lambda_encoder weighting.
                                loss3 = torch.sum(
                                    (1 / float(config['lambda_encoder'])) *
                                    my_loss.log_gaussian(Prior.lv_c) /
                                    Prior.n_classes)
                            else:
                                # Any other non-zero value is used directly
                                # as the weight.
                                loss3 = torch.sum(
                                    config['is_loss3'] *
                                    my_loss.log_gaussian(Prior.lv_c) /
                                    (Prior.n_classes * Prior.dim_z))
                            total_loss = total_loss + loss3
                    # Reported only; not part of the optimized loss.
                    MSE_loss = torch.mean(torch.sum((z_ - z_mu).pow(2), dim=1))
                    (total_loss /
                     float(config['num_E_accumulations'])).backward()

                # Optionally apply modified ortho reg in E
                if config['E_ortho'] > 0.0:
                    # Debug print to indicate we're using ortho reg in E
                    print('using modified ortho reg in E')
                    utils.ortho(E, config['E_ortho'])
                E.optim.step()
                if train_prior_now:
                    # Accuracy of the prior's latent classifier on the last
                    # sampled batch of this step.
                    acc_samples = torch.mean(
                        (y_ == log_y_pred.argmax(1)).float())
                    Prior.optim.step()

        # If we have an ema, update it, regardless of if we test with it or not
        if config['ema']:
            ema.update(state_dict['itr'])

        # Return G's loss and the components of D's loss.
        out = {
            'G_loss': float(G_loss.item()),
            'D_loss_real': float(D_loss_real.item()),
            'D_loss_fake': float(D_loss_fake.item())
        }
        if acc_samples is not None:
            out['P_acc_samples'] = float(acc_samples.item())
        if E is not None:
            out['E_log_likelihood'] = float(E_loss.item())
            out['E_MSE_loss'] = float(MSE_loss.item())
        return out
Exemple #16
0
    def train(x, y):
        """Run one training iteration with feedback-style D and G updates.

        Relies on names from the enclosing scope that are not visible here:
        ``G``, ``D``, ``GD``, ``zs_``, ``ys_``, ``config``, ``state_dict``,
        ``ema``, ``losses``, ``utils`` and ``time`` (used only to format
        output filenames; presumably a run timestamp -- confirm).

        When ``state_dict['epoch'] < 0`` a single plain D update and a single
        plain G update are performed. Otherwise D is updated
        ``config['num_feedback_iter_D']`` times and G
        ``config['num_feedback_iter']`` times, each pass feeding detached
        outputs of the previous pass back into the generator input.

        Args:
            x: Real samples, passed whole to ``GD``; its first three channels
                (``x[:, :3]``) seed the feedback input.
            y: Labels for ``x``, passed to ``GD`` as ``dy``.

        Returns:
            Tuple ``(out, partial_test_input)``. ``out`` maps ``'G_loss'``,
            ``'D_loss_real'`` and ``'D_loss_fake'`` to floats taken from the
            last pass of each phase. ``partial_test_input`` is the sum of the
            G feedback inputs collected on save/test iterations divided by
            ``count + 1e-9``; it is ``0.0`` when nothing was collected.
        """
        G.optim.zero_grad()
        D.optim.zero_grad()
        # Running sum / count of G feedback inputs, averaged before returning.
        inner_iter_count = 0
        partial_test_input = 0
        # NOTE: x and y are not split into chunks here; gradient accumulation
        # is disabled in this variant and the full batch is used every pass.
        # Optionally toggle D and G's "require_grad"
        if config['toggle_grads']:
            utils.toggle_grad(D, True)
            utils.toggle_grad(G, False)

        for step_index in range(config['num_D_steps']):
            # D's gradients are zeroed once per D step, not per feedback pass.
            D.optim.zero_grad()

            # d_reals carries D's detached real-branch output between
            # feedback passes; g_fakes is not read in the D phase.
            d_reals = None
            g_fakes = None
            if state_dict['epoch'] < 0:
                # Plain (no-feedback) D update.
                zs_.sample_()
                # NOTE(review): `[:,20]` selects the single channel 20 and
                # uses an 8x8 view, whereas every other branch uses a 32x32
                # view with `[:, :20]` -- confirm this is intended.
                z_ = zs_[:config['batch_size']].view(zs_.size(0), 24, 8, 8)[:,20]

                ys_.sample_()
                gy = ys_[:config['batch_size']]
                D_fake, D_real, G_fake = GD(z_,
                                            gy,
                                            x=x,
                                            dy=y,
                                            train_G=False,
                                            split_D=config['split_D'])

                # Every 1000 iterations dump the real batch and D's outputs
                # as image grids (hard-coded absolute output directory).
                if state_dict['itr'] % 1000 == 0:
                    print('saving img')
                    torchvision.utils.save_image(x.float().cpu(),
                                                 '/ubc/cs/research/shield/projects/cshen001/BigGAN-original/BigGAN-PyTorch/samples_new/{}_it{}_pre_xreal.jpg'.format(
                                                     time, state_dict['itr']),
                                                 nrow=int(D_fake.shape[0] ** 0.5), normalize=True)
                    torchvision.utils.save_image(D_fake.float().cpu(),
                                                 '/ubc/cs/research/shield/projects/cshen001/BigGAN-original/BigGAN-PyTorch/samples_new/{}_it{}_pre_dfake.jpg'.format(
                                                     time, state_dict['itr']),
                                                 nrow=int(D_fake.shape[0] ** 0.5), normalize=True)
                    torchvision.utils.save_image(D_real.float().cpu(),
                                                 '/ubc/cs/research/shield/projects/cshen001/BigGAN-original/BigGAN-PyTorch/samples_new/{}_it{}_pre_dreal.jpg'.format(
                                                     time, state_dict['itr']),
                                                 nrow=int(D_fake.shape[0] ** 0.5), normalize=True)

                # Sum the real and fake components of D's loss (no division
                # by an accumulation count in this variant).
                D_loss_real, D_loss_fake = losses.discriminator_loss(D_fake, D_real)
                D_loss = (D_loss_real + D_loss_fake)
                D_loss.backward()

                # Optionally apply ortho reg in D
                if config['D_ortho'] > 0.0:
                    # Debug print to indicate we're using ortho reg in D.
                    print('using modified ortho reg in D')
                    utils.ortho(D, config['D_ortho'])

                D.optim.step()
            else:
                # Feedback D updates. NOTE(review): D.optim.step() runs on
                # every pass but gradients are only zeroed once per D step
                # above, so gradients accumulate across feedback passes --
                # confirm this is intended.
                for fb_iter in range(config['num_feedback_iter_D']):
                    # Fresh noise (first 20 of 24 channels) and labels on
                    # every pass.
                    zs_.sample_()
                    z_ = zs_[:config['batch_size']].view(zs_.size(0), 24, 32, 32)[:, :20]
                    ys_.sample_()
                    gy = ys_[:config['batch_size']]

                    if fb_iter == 0:
                        # First pass: condition on the first three channels
                        # of the real batch plus one all-ones channel.
                        init_x = x[:, :3]

                        z_ = torch.cat([z_, init_x, torch.ones(zs_.size(0), 1, 32, 32).cuda()], 1)
                    else:
                        # Later passes: blend the previous generator output
                        # with the initial input (10% / 90%) and append D's
                        # previous real-branch output resized to 32.
                        g_fake = 0.1 * g_fake + 0.9 * init_x
                        z_ = torch.cat([z_, g_fake, nn.functional.interpolate(d_reals, 32, mode='bilinear')
                                           ,], 1)
                    D_fake, D_real, G_fake = GD(z_,
                                        gy,
                                        x=x,
                                        dy=y,
                                        train_G=False,

                                        split_D=config['split_D'])
                    # Every 1000 iterations dump the real batch and the
                    # generator output; the blended feedback input is only
                    # dumped for fb_iter > 1.
                    if state_dict['itr'] % 1000 == 0:
                        print('saving img')
                        torchvision.utils.save_image(x.float().cpu(),
                                                     '/ubc/cs/research/shield/projects/cshen001/BigGAN-original/BigGAN-PyTorch/samples_new/{}_it{}_fb{}_xreal.jpg'.format(
                                                         time, state_dict['itr'], fb_iter),
                                                     nrow=int(D_fake.shape[0] ** 0.5), normalize=True)
                        torchvision.utils.save_image(G_fake.float().cpu(),
                        '/ubc/cs/research/shield/projects/cshen001/BigGAN-original/BigGAN-PyTorch/samples_new/{}_it{}_fb{}_Gfake_d.jpg'.format(
                            time,state_dict['itr'],fb_iter),nrow=int(D_fake.shape[0] ** 0.5),normalize=True)
                        if fb_iter > 1:
                            torchvision.utils.save_image(g_fake.float().cpu(),
                            '/ubc/cs/research/shield/projects/cshen001/BigGAN-original/BigGAN-PyTorch/samples_new/{}_it{}_fb{}_gfake_d.jpg'.format(
                                time,state_dict['itr'],fb_iter),nrow=int(D_fake.shape[0] ** 0.5),normalize=True)


                    D_loss_real, D_loss_fake = losses.discriminator_loss(D_fake, D_real)
                    # Both branches currently compute the same loss; the
                    # extra "enforcement" term for later passes is disabled.
                    if not fb_iter == 0:
                        D_loss = (D_loss_real + D_loss_fake)
                    else:
                        D_loss = (D_loss_real + D_loss_fake)

                    # Detached copies fed back into the next pass.
                    d_reals = D_real.detach()

                    g_fake = G_fake.detach()

                    D_loss.backward()

                    # Optionally apply ortho reg in D
                    if config['D_ortho'] > 0.0:
                        utils.ortho(D, config['D_ortho'])

                    D.optim.step()

        # Optionally toggle "requires_grad" so only G is trained below.
        if config['toggle_grads']:
            utils.toggle_grad(D, False)
            utils.toggle_grad(G, True)

        # Zero G's gradients by default before training G, for safety
        G.optim.zero_grad()

        # Detached D / G outputs carried between G feedback passes.
        d_fakes = None
        g_fakes = None
        if state_dict['epoch'] < 0:
            # Plain (no-feedback) G update on flattened noise.
            zs_.sample_()
            z_ = zs_[:config['batch_size']].view(zs_.size(0), 24, 32, 32)[:, :20]
            ys_.sample_()
            gy = ys_

            z_ = z_.view(z_.size(0), -1)
            D_fake, G_z = GD(z=z_, gy=gy, train_G=True, split_D=config['split_D'], return_G_z=True)
            G_loss = losses.generator_loss(D_fake)
            G_loss.backward()

            # Every 1000 iterations dump D's output and the generated batch.
            if state_dict['itr'] % 1000 == 0:
                print('saving img')
                torchvision.utils.save_image(D_fake.float().cpu(),
                                             '/ubc/cs/research/shield/projects/cshen001/BigGAN-original/BigGAN-PyTorch/samples_new/{}_it{}_pre_dfake.jpg'.format(
                                                 time,
                                                 state_dict['itr'],),
                                             nrow=int(D_fake.shape[0] ** 0.5),
                                             normalize=True)
                torchvision.utils.save_image(G_z.float().cpu(),
                                             '/ubc/cs/research/shield/projects/cshen001/BigGAN-original/BigGAN-PyTorch/samples_new/{}_it{}_pre_G_z.jpg'.format(
                                                 time,
                                                 state_dict['itr'],),
                                             nrow=int(D_fake.shape[0] ** 0.5),
                                             normalize=True)

            # Optionally apply modified ortho reg in G
            if config['G_ortho'] > 0.0:
                print('using modified ortho reg in G')  # Debug print to indicate we're using ortho reg in G
                # Don't ortho reg shared, it makes no sense. Really we should blacklist any embeddings for this
                utils.ortho(G, config['G_ortho'],
                            blacklist=[param for param in G.shared.parameters()])
            G.optim.step()
        else:
            # Feedback G updates. NOTE(review): G.optim.step() runs on every
            # pass but gradients are only zeroed once before the loop, so
            # gradients accumulate across feedback passes -- confirm.
            for fb_iter in range(config['num_feedback_iter']):
                zs_.sample_()
                z_ = zs_[:config['batch_size']].view(zs_.size(0), 24, 32, 32)[:, :20]
                ys_.sample_()
                gy = ys_

                # NOTE(review): the first TWO passes (fb_iter <= 1) use the
                # real-data seed here, whereas the D phase only does so for
                # fb_iter == 0 -- confirm the asymmetry is intended.
                if fb_iter <= 1:
                    init_x = x[:, :3]
                    z_ = torch.cat([z_, init_x, torch.ones(zs_.size(0), 1, 32, 32).cuda()], 1)
                else:
                    # Blend the previous generator output with the seed
                    # (5% / 95%) and append D's previous output resized to 32.
                    g_fake = 0.05 * g_fakes + 0.95 * init_x
                    d_fakes = nn.functional.interpolate(d_fakes, 32, mode='bilinear')
                    z_ = torch.cat([z_, g_fake, d_fakes
                                       ,], 1)
                    # On save/test iterations, accumulate the feedback input
                    # so its mean can be returned to the caller.
                    if ((not (state_dict['itr'] % config['save_every'])) or (not (state_dict['itr'] % config['test_every']))):
                        partial_test_input = partial_test_input + torch.cat([g_fake, d_fakes], 1)
                        inner_iter_count = inner_iter_count + 1
                D_fake, G_z = GD(z=z_, gy=gy, train_G=True, split_D=config['split_D'], return_G_z=True)

                # Both branches currently compute the same loss; the extra
                # "enforcement" term for later passes is disabled.
                if not fb_iter == 0:
                    G_loss = (losses.generator_loss(D_fake))
                else:
                    G_loss = (losses.generator_loss(D_fake))

                G_loss.backward()

                # Every 1000 iterations dump the generated batch; the blended
                # feedback input is only dumped for fb_iter > 1.
                if state_dict['itr'] % 1000 == 0:
                    print('saving img')
                    torchvision.utils.save_image(G_z.float().cpu(),
                                               '/ubc/cs/research/shield/projects/cshen001/BigGAN-original/BigGAN-PyTorch/samples_new/{}_it{}_fb{}_G_z.jpg'.format(time,
                                                   state_dict['itr'], fb_iter),
                                               nrow=int(D_fake.shape[0] ** 0.5),
                                               normalize=True)
                    if fb_iter > 1:
                        torchvision.utils.save_image(g_fake.float().cpu(),
                                               '/ubc/cs/research/shield/projects/cshen001/BigGAN-original/BigGAN-PyTorch/samples_new/{}_it{}_fb{}_G_z_input.jpg'.format(time,
                                                   state_dict['itr'], fb_iter),
                                               nrow=int(D_fake.shape[0] ** 0.5),
                                               normalize=True)

                # Detached copies fed back into the next pass.
                g_fakes = G_z.detach()

                d_fakes = D_fake.detach()

                # Optionally apply modified ortho reg in G
                if config['G_ortho'] > 0.0:
                    print('using modified ortho reg in G') # Debug print to indicate we're using ortho reg in G
                    # Don't ortho reg shared, it makes no sense. Really we should blacklist any embeddings for this
                    utils.ortho(G, config['G_ortho'],
                                      blacklist=[param for param in G.shared.parameters()])
                G.optim.step()

        # If we have an ema, update it, regardless of if we test with it or not
        if config['ema']:
          ema.update(state_dict['itr'])

        # Report G's loss and the components of D's loss from the last pass.
        out = {'G_loss': float(G_loss.item()),
                'D_loss_real': float(D_loss_real.item()),
                'D_loss_fake': float(D_loss_fake.item())}

        # Mean of the collected feedback inputs; the 1e-9 avoids division by
        # zero when nothing was collected (the result is then 0.0).
        partial_test_input = partial_test_input / (inner_iter_count + 1e-9)
        return out, partial_test_input
Exemple #17
0
    def train(x, y, epoch, batch_size, target_map = None, r_mixup = 0.0):
        """Run one training iteration: the D step(s) followed by a single G step.

        Relies on names from the enclosing scope: ``G``, ``D``, ``GD``, ``z_``,
        ``y_``, ``config``, ``ema``, ``state_dict``, ``BCEloss`` and
        ``BCEfakeloss``.

        Args:
            x: Real data batch. Split along dim 0 into chunks of size
                ``int(x.size(0) / config['num_D_accumulations'])``.
            y: Labels for ``x``; split the same way as ``x``.
            epoch: Only used for the mixup warm-up coefficient
                ``min(1.0, epoch / config["warmup_epochs"])``.
            batch_size: Number of leading entries of ``z_`` / ``y_`` passed
                to ``GD`` during the D step.
            target_map: Forwarded to ``GD`` in the mixup branches and then
                overwritten by the value ``GD`` returns.
            r_mixup: With ``config["full_batch_mixup"]``, mixup is used in this
                call when a uniform random draw is below ``r_mixup``.

        Returns:
            dict mapping loss names to Python floats. Which keys are present
            depends on the config flags and on whether mixup was used.
        """
        G.optim.zero_grad()
        D.optim.zero_grad()

        # Targets for the mixup losses. Only fake_target is read later in this
        # function; real_target is not referenced again.
        if config["unet_mixup"]:
            real_target = torch.tensor([1.0]).cuda()
            fake_target = torch.tensor([0.0]).cuda()

        # Decide whether this call uses mixup:
        #  - unet_mixup without full_batch_mixup: always
        #  - unet_mixup with full_batch_mixup: with probability r_mixup
        #  - otherwise: never
        if config["unet_mixup"] and not config["full_batch_mixup"]:
            use_mixup_in_this_round = True
        elif config["unet_mixup"] and config["full_batch_mixup"]:
            use_mixup_in_this_round = torch.rand(1).detach().item()<r_mixup
        else:
            use_mixup_in_this_round = False

        out = {}

        # In a full-batch mixup round the plain real/fake D losses are skipped
        # (they are replaced by zero tensors further down).
        skip_normal_real_fake_loss = (use_mixup_in_this_round and config["full_batch_mixup"] )

        n_d_accu = config['num_D_accumulations']

        # One chunk of x / y per D gradient accumulation.
        split_size = int(x.size(0)/n_d_accu)

        x = torch.split(x, split_size)
        y = torch.split(y, split_size)

        d_real_target = torch.tensor([1.0]).cuda()
        d_fake_target = torch.tensor([0.0]).cuda()

        # Bind the targets once so the loss callables only take the D outputs.
        discriminator_loss = functools.partial(BCEloss, d_real_target=d_real_target, d_fake_target=d_fake_target)
        # The G loss is computed against target 1.0 for generated samples.
        mix_fake_target = torch.tensor([1.0]).cuda()
        fake_loss = functools.partial(BCEfakeloss, target = mix_fake_target)

        # Optionally toggle D and G's "require_grad"
        if config['toggle_grads']:
            utils.toggle_grad(D, True)
            utils.toggle_grad(G, False)

        ######################################
        # D-step(s)
        ######################################
        for step_index in range(config['num_D_steps']):
            # counter restarts every D step, so each step reuses the same
            # n_d_accu chunks of x / y.
            counter = 0
            # If accumulating gradients, loop multiple times before an optimizer step
            D.optim.zero_grad()

            for accumulation_index in range(n_d_accu):

                z_.sample_()
                y_.sample_()

                if use_mixup_in_this_round:

                    # Full forward (fake, real and mixed outputs) is needed
                    # unless this is a full-batch mixup round without any
                    # consistency loss; then only the mixed outputs are requested.
                    if (not config["full_batch_mixup"]) or (config["full_batch_mixup"] and (config["consistency_loss_and_augmentation"] or config["consistency_loss"]) ):

                        D_fake, D_real , D_mixed, G_z, mixed,  D_middle_fake, D_middle_real, D_middle_mixed, target_map   = GD(z_[:batch_size], y_[:batch_size],
                                                            x[counter], y[counter], train_G=False,
                                                            split_D=config['split_D'], mixup = True, target_map = target_map) # mixup can be true because weight is set to 0 when no mixup is used
                    else:
                        D_mixed, G_z, mixed, D_middle_mixed, target_map   = GD(z_[:batch_size], y_[:batch_size],
                                                            x[counter], y[counter], train_G=False, return_G_z = True,
                                                            split_D=config['split_D'], mixup = True, mixup_only = True, target_map = target_map)

                    # Linear warm-up of the mixup weight over warmup_epochs;
                    # only applied when not using full-batch mixup.
                    if config["slow_mixup"] and not config["full_batch_mixup"]:
                        mixup_coeff = min(1.0, epoch/config["warmup_epochs"] )#use without full batch mixup
                    else:
                        mixup_coeff = 1.0

                    if config["display_mixed_batch"]:
                        # This can help for debugging
                        # Shows image grids of: mixed batch, generated batch,
                        # real chunk and the target map.
                        # NOTE(review): the back-to-back plt.figure() calls below
                        # open additional empty figures - confirm this is intended.
                        plt.figure()
                        m = torchvision.utils.make_grid(mixed,nrow=5,padding=2,normalize = True)
                        m = m.permute(1,2,0)
                        m = m.cpu().numpy()
                        plt.imshow(m)
                        plt.figure()
                        plt.figure()
                        m = torchvision.utils.make_grid(G_z,nrow=5,padding=2,normalize = True)
                        m = m.permute(1,2,0)
                        m = m.cpu().numpy()
                        plt.imshow(m)
                        plt.figure()
                        plt.figure()
                        m = torchvision.utils.make_grid(x[counter],nrow=5,padding=2,normalize = True)
                        m = m.permute(1,2,0)
                        m = m.cpu().numpy()
                        plt.imshow(m)
                        plt.figure()
                        m = torchvision.utils.make_grid(target_map,nrow=5,padding=2)
                        m = m.permute(1,2,0)
                        m = m.cpu().numpy()
                        plt.imshow(m)
                        plt.title("mix")
                        plt.show()
                        plt.figure()

                else:
                    # No mixup: plain real/fake forward pass.
                    D_fake, D_real , G_z, D_middle_fake, D_middle_real   = GD(z_[:batch_size], y_[:batch_size],
                                                        x[counter], y[counter], train_G=False,
                                                        split_D=config['split_D'])



                # Real/fake loss on the flattened D_fake / D_real outputs.
                if not skip_normal_real_fake_loss:
                    D_loss_real_2d, D_loss_fake_2d = discriminator_loss(D_fake.view(-1), D_real.view(-1))
                    D_loss_real_2d_item = D_loss_real_2d.detach().item()
                    D_loss_fake_2d_item = D_loss_fake_2d.detach().item()

                # Consistency target: blend D's real and fake outputs with the
                # same target_map.
                if use_mixup_in_this_round  and (config["consistency_loss"] or config["consistency_loss_and_augmentation"]):
                    mix =  D_real*target_map + D_fake*(1-target_map)

                if use_mixup_in_this_round:

                    # NOTE(review): these two flattened tensors are not used
                    # again in this function.
                    D_mixed_flattened = D_mixed.view(-1)
                    target_map_flattend = target_map.view(-1)

                    # One BCE value per sample of the mixed batch, with the
                    # target map as the target.
                    mix_list = []
                    for i in range(D_mixed.size(0)):
                        # MIXUP LOSS 2D
                        mix2d_i= F.binary_cross_entropy_with_logits(D_mixed[i].view(-1),target_map[i].view(-1) )
                        mix_list.append(mix2d_i)

                    D_loss_mixed_2d = torch.stack(mix_list)
                    #-> D_loss_mixed_2d.mean() is taken later

                    D_loss_mixed_2d_item = D_loss_mixed_2d.mean().detach().item()
                    #D_loss_mixed_2d = D_loss_mixed_2d.view(D_mixed.size()).mean([2,3])

                # Real/fake loss on the "middle" D outputs.
                if not skip_normal_real_fake_loss:
                    D_loss_real_middle, D_loss_fake_middle = discriminator_loss(D_middle_fake, D_middle_real)

                    D_loss_real_middle_item = D_loss_real_middle.detach().item()
                    D_loss_fake_middle_item = D_loss_fake_middle.detach().item()

                if use_mixup_in_this_round and not config["consistency_loss"]:
                    # consistency loss is only concerned with segmenter output

                    #target for mixed encoder loss is fake
                    mix_bce = F.binary_cross_entropy_with_logits(D_middle_mixed, fake_target.expand_as(D_middle_mixed), reduction="none")

                    mixed_middle_loss = mixup_coeff*mix_bce
                    mixed_middle_loss_item = mixed_middle_loss.mean().detach().item()

                # Combine the two real/fake loss parts, or use zero tensors
                # when they were skipped.
                if skip_normal_real_fake_loss:
                    D_loss_real = torch.tensor([0.0]).cuda()
                    D_loss_fake = torch.tensor([0.0]).cuda()
                else:
                    D_loss_real = D_loss_real_2d + D_loss_real_middle
                    D_loss_fake = D_loss_fake_2d + D_loss_fake_middle

                D_loss_real_item = D_loss_real.detach().item()
                D_loss_fake_item = D_loss_fake.detach().item()

                D_loss = 0.5*D_loss_real + 0.5*D_loss_fake

                if use_mixup_in_this_round:
                    if config["consistency_loss"] or config["consistency_loss_and_augmentation"]:
                        consistency_loss = mixup_coeff*1.0*F.mse_loss(D_mixed, mix )
                        consistency_loss_item = consistency_loss.float().detach().item()

                    if not config["consistency_loss"]:
                        # GAN loss from cutmix augmentation (=/= consistency loss)
                        mix_loss = D_loss_mixed_2d + mixed_middle_loss
                        mix_loss = mix_loss.mean()
                    else:
                        mix_loss = 0.0

                    # consistency_loss replaces the mix loss entirely;
                    # consistency_loss_and_augmentation adds to it.
                    if config["consistency_loss"]:
                        mix_loss = consistency_loss
                    elif config["consistency_loss_and_augmentation"]:
                        mix_loss = mix_loss + consistency_loss

                    D_loss = D_loss + mix_loss

                # Scale so the accumulated gradient is an average over chunks.
                D_loss = D_loss / float(config['num_D_accumulations'])

                D_loss.backward()
                counter += 1

            # Optionally apply ortho reg in D
            if config['D_ortho'] > 0.0:
                # Debug print to indicate we're using ortho reg in D.
                print('using modified ortho reg in D')
                utils.ortho(D, config['D_ortho'])

            D.optim.step()
            del D_loss

        # Optionally toggle "requires_grad"
        if config['toggle_grads']:
            utils.toggle_grad(D, False)
            utils.toggle_grad(G, True)

        ######################################
        # G-step
        ######################################
        # Zero G's gradients by default before training G, for safety
        G.optim.zero_grad()
        counter = 0

        # Sample once, then split the noise / labels with the same chunk size
        # that was used for x and y.
        z_.sample_()
        y_.sample_()

        z__ = torch.split(z_, split_size) #batch_size)
        y__ = torch.split(y_, split_size) #batch_size)

        # If accumulating gradients, loop multiple times
        for accumulation_index in range(config['num_G_accumulations']):

            G_fake, G_fake_middle = GD(z__[counter], y__[counter], train_G=True, split_D=config['split_D'], reference_x = x[counter] )

            # Equal-weight average of the losses on the two D outputs.
            G_loss_fake_2d = fake_loss(G_fake)
            G_loss_fake_middle = fake_loss(G_fake_middle)
            G_loss = 0.5*G_loss_fake_middle + 0.5*G_loss_fake_2d
            G_loss = G_loss / float(config['num_G_accumulations'])

            G_loss_fake_middle_item = G_loss_fake_middle.detach().item()
            G_loss_fake_2d_item = G_loss_fake_2d.detach().item()
            G_loss_item = G_loss.detach().item()

            G_loss.backward()
            counter += 1

        # Optionally apply modified ortho reg in G
        if config['G_ortho'] > 0.0:
            print('using modified ortho reg in G') # Debug print to indicate we're using ortho reg in G
            # Don't ortho reg shared, it makes no sense. Really we should blacklist any embeddings for this
            utils.ortho(G, config['G_ortho'],
                                    blacklist=[param for param in G.shared.parameters()])


        G.optim.step()
        del G_loss

        # If we have an ema, update it, regardless of if we test with it or not
        if config['ema']:
            ema.update(state_dict['itr'])


        # save intermediate losses
        # Each *_item below holds the value from the last accumulation that
        # assigned it. The conditions mirror the ones under which the
        # corresponding variable was assigned above.
        if use_mixup_in_this_round and (config["consistency_loss"] or config["consistency_loss_and_augmentation"]) and config["num_D_steps"]>0:
            out["consistency"] = float(consistency_loss_item)

        out['G_loss'] = float(G_loss_item)
        if  not (config["full_batch_mixup"] and use_mixup_in_this_round) and config["num_D_steps"]>0:
            out['D_loss_real'] = float(D_loss_real_item)
            out['D_loss_fake'] = float(D_loss_fake_item)

        if use_mixup_in_this_round and not config["consistency_loss"] and config["num_D_steps"]>0:
            out["mixed_middle_loss"] = float(mixed_middle_loss_item)
            out["D_loss_mixed_2d"] = float(D_loss_mixed_2d_item)

        if  not (config["full_batch_mixup"] and use_mixup_in_this_round):
            if config["num_D_steps"]>0:
                out["D_loss_real_middle"] = float(D_loss_real_middle_item)
                out["D_loss_fake_middle"] = float(D_loss_fake_middle_item)
                out["D_loss_real_2d"] = float(D_loss_real_2d_item)
                out["D_loss_fake_2d"] = float(D_loss_fake_2d_item)
            out["G_loss_fake_middle"] = float(G_loss_fake_middle_item)
            out["G_loss_fake_2d"] = float(G_loss_fake_2d_item)

        return out
Exemple #18
0
    def train(x_s, y, yd):
        """Run one training iteration: D step(s) followed by G step(s).

        Relies on names from the enclosing scope: ``G``, ``D``, ``GD``, ``z_``,
        ``y_``, ``yd_``, ``config``, ``ema``, ``state_dict``, ``losses`` and
        ``utils``.

        Gradients are zeroed at the start of every D step and every G step, so
        each ``optim.step()`` only applies the gradients accumulated during
        its own step.

        Args:
            x_s: Real data batch; split along dim 0 into chunks of
                ``config['batch_size']``.
            y: Labels for ``x_s``; cast to long and split like ``x_s``.
            yd: Second label tensor for ``x_s``; cast to long and split like
                ``x_s``.

        Returns:
            dict with ``G_loss``, ``D_loss_real`` and ``D_loss_fake`` as
            floats, plus ``C_loss``, ``MI_loss``, ``CD_loss`` and ``MID_loss``
            exactly as left by the last G accumulation (the int ``0`` or a
            tensor, depending on ``config['loss_type']``).
        """
        G.optim.zero_grad()
        D.optim.zero_grad()
        # How many chunks to split x and y into?
        y = y.long()
        yd = yd.long()
        x_s = torch.split(x_s, config['batch_size'])
        y = torch.split(y, config['batch_size'])
        yd = torch.split(yd, config['batch_size'])
        # counter keeps increasing across D steps, so every accumulation of
        # every step consumes a fresh chunk.
        counter = 0

        # Train D only: enable its grads and freeze G.
        utils.toggle_grad(D, True)
        utils.toggle_grad(G, False)

        for step_index in range(config['num_D_steps']):
            # Start each D step from clean gradients; otherwise the gradients
            # of the previous step would be applied again by this step.
            D.optim.zero_grad()
            # If accumulating gradients, loop multiple times before an optimizer step
            for accumulation_index in range(config['num_D_accumulations']):
                z_.sample_()
                y_.sample_()
                yd_.sample_()

                D_fake, D_real, mi, c_cls, mid, c_clsd, G_z = GD(
                    z_,
                    y_,
                    yd_,
                    x_s[counter],
                    y[counter],
                    yd[counter],
                    train_G=False,
                    split_D=config['split_D'],
                    return_G_z=True)

                D_loss_real, D_loss_fake = losses.discriminator_loss(
                    D_fake, D_real)

                C_loss = 0

                if config['AC']:
                    # The classifier outputs are sliced at D_fake.shape[0]:
                    # the leading rows are paired with the sampled labels
                    # (y_, yd_), the remaining rows with the real labels.
                    n_fake = D_fake.shape[0]
                    fake_mi = mi[:n_fake]
                    fake_cls = c_cls[:n_fake]
                    c_cls_rs = c_cls[n_fake:]

                    fake_mid = mid[:n_fake]
                    c_clsd = c_clsd[n_fake:]

                    if config['loss_type'] == 'Twin_AC':
                        # Only real samples whose yd label is non-zero
                        # contribute to the real class loss.
                        real_mask = yd[counter] != 0
                        C_loss += F.cross_entropy(c_clsd, yd[counter]) + F.cross_entropy(fake_mid, yd_) + \
                                  0.5*F.cross_entropy(c_cls_rs[real_mask], y[counter][real_mask]) + 0.5*F.cross_entropy(fake_cls, y_) + 1.0*F.cross_entropy(fake_mi, y_)

                    if config['loss_type'] == 'AC':
                        # NOTE(review): c_cls_fs and y_f_s are not defined
                        # anywhere in this function, and yd is a tuple of
                        # chunks at this point (the Twin_AC branch uses
                        # yd[counter]). This branch will fail as written;
                        # the intended operands need to be confirmed.
                        C_loss += F.cross_entropy(
                            c_cls_fs, y_f_s) + F.cross_entropy(c_clsd, yd)

                # Compute components of D's loss, average them, and divide by
                # the number of gradient accumulations

                if config['Pac']:
                    # NOTE(review): x_t is not defined in this function; this
                    # branch only works if the enclosing scope provides it.
                    x_pack = torch.cat([x_s[counter], x_t[counter]], dim=0)
                    # Fold groups of 4 samples into the channel dimension
                    # before feeding them to D with pack=True.
                    T_img = x_pack.view(-1, 4 * x_pack.size()[1],
                                        x_pack.size()[2],
                                        x_pack.size()[3])
                    F_img = G_z.view(-1, 4 * G_z.size()[1],
                                     G_z.size()[2],
                                     G_z.size()[3])
                    pack_img = torch.cat([T_img, F_img], dim=0)
                    pack_out, _, _ = D(pack_img, pack=True)
                    D_real_pac = pack_out[:T_img.size()[0]]
                    D_fake_pac = pack_out[T_img.size()[0]:]
                    D_loss_real_pac, D_loss_fake_pac = losses.discriminator_loss(
                        D_fake_pac, D_real_pac)
                    D_loss_real += D_loss_real_pac
                    D_loss_fake += D_loss_fake_pac

                D_loss = (D_loss_real + D_loss_fake +
                          C_loss * config['AC_weight']) / float(
                              config['num_D_accumulations'])
                D_loss.backward()
                counter += 1

            # Optionally apply ortho reg in D
            if config['D_ortho'] > 0.0:
                # Debug print to indicate we're using ortho reg in D.
                print('using modified ortho reg in D')
                utils.ortho(D, config['D_ortho'])

            D.optim.step()

        # Train G only: freeze D and enable G's grads.
        utils.toggle_grad(D, False)
        utils.toggle_grad(G, True)

        # Zero G's gradients by default before training G, for safety
        G.optim.zero_grad()
        for step_index in range(config['num_G_steps']):
            # Start each G step from clean gradients (see the D loop above).
            G.optim.zero_grad()
            for accumulation_index in range(config['num_G_accumulations']):
                z_.sample_()
                y_.sample_()
                yd_.sample_()
                D_fake, mi, cls, mid, clsd, G_z = GD(z_,
                                                     y_,
                                                     yd_,
                                                     train_G=True,
                                                     split_D=config['split_D'],
                                                     return_G_z=True)

                C_loss = 0
                MI_loss = 0
                CD_loss = 0
                MID_loss = 0
                G_loss = losses.generator_loss(D_fake)
                if config['loss_type'] == 'AC' or config[
                        'loss_type'] == 'Twin_AC':
                    C_loss = 1.0 * F.cross_entropy(cls, y_)
                    CD_loss = F.cross_entropy(clsd, yd_)
                    if config['loss_type'] == 'Twin_AC':
                        MI_loss = 1.0 * F.cross_entropy(mi, y_)
                        MID_loss = F.cross_entropy(mid, yd_)

                if config['Pac']:
                    # Same 4-samples-per-row packing as in the D step.
                    F_img = G_z.view(-1, 4 * G_z.size()[1],
                                     G_z.size()[2],
                                     G_z.size()[3])
                    D_fake_pac, _, _ = D(F_img, pack=True)
                    G_loss_pac = losses.generator_loss(D_fake_pac)
                    G_loss += G_loss_pac

                G_loss = G_loss / float(config['num_G_accumulations'])
                C_loss = C_loss / float(config['num_G_accumulations'])
                MI_loss = MI_loss / float(config['num_G_accumulations'])
                CD_loss = CD_loss / float(config['num_G_accumulations'])
                MID_loss = MID_loss / float(config['num_G_accumulations'])
                # The MI terms enter G's objective with a negative sign.
                (G_loss + (C_loss - MI_loss + CD_loss - MID_loss) *
                 config['AC_weight']).backward()

            # Optionally apply modified ortho reg in G
            if config['G_ortho'] > 0.0:
                print('using modified ortho reg in G'
                      )  # Debug print to indicate we're using ortho reg in G
                # Don't ortho reg shared, it makes no sense. Really we should blacklist any embeddings for this
                utils.ortho(
                    G,
                    config['G_ortho'],
                    blacklist=[param for param in G.shared.parameters()])
            G.optim.step()

        # If we have an ema, update it, regardless of if we test with it or not
        if config['ema']:
            ema.update(state_dict['itr'])

        out = {
            'G_loss': float(G_loss.item()),
            'D_loss_real': float(D_loss_real.item()),
            'D_loss_fake': float(D_loss_fake.item()),
            'C_loss': C_loss,
            'MI_loss': MI_loss,
            'CD_loss': CD_loss,
            'MID_loss': MID_loss
        }
        # Return G's loss and the components of D's loss.
        return out
    def train(x, y):
        """Run one training iteration: D step(s) followed by an optional G step.

        Relies on names from the enclosing scope: ``G``, ``D``, ``GD``, ``z_``,
        ``y_``, ``config``, ``ema``, ``state_dict``, ``losses`` and ``utils``.

        Args:
            x: Real data batch; split along dim 0 into chunks of
                ``config['batch_size']``.
            y: Labels for ``x``; split like ``x``.

        Returns:
            dict with ``G_loss`` (``0`` when ``config['fix_G']`` is set),
            ``D_loss_real`` and ``D_loss_fake`` as Python numbers.

        Raises:
            NotImplementedError: if ``config['CR'] > 0``. The consistency
                regularisation loss was never implemented here; previously
                that setting skipped every D update, still stepped G, and
                then failed on an unbound ``D_loss_real``.
        """
        # Fail before touching any model state instead of half-running the
        # iteration and crashing while building the result dict.
        if config['CR'] > 0:
            raise NotImplementedError(
                "config['CR'] > 0 is not supported: the consistency "
                "regularisation loss is not implemented in train().")

        G.optim.zero_grad()
        D.optim.zero_grad()
        # How many chunks to split x and y into?
        x = torch.split(x, config['batch_size'])
        y = torch.split(y, config['batch_size'])
        counter = 0

        # Optionally toggle D and G's "require_grad"
        if config['toggle_grads']:
            utils.toggle_grad(D, True)
            utils.toggle_grad(G, False)

        for step_index in range(config['num_D_steps']):
            # If accumulating gradients, loop multiple times before an optimizer step
            D.optim.zero_grad()
            for accumulation_index in range(config['num_D_accumulations']):
                z_.sample_()
                y_.sample_()
                # CR is always False here because of the guard above.
                D_fake, D_real = GD(z_[:config['batch_size']],
                                    y_[:config['batch_size']],
                                    x[counter],
                                    y[counter],
                                    train_G=False,
                                    policy=config['DiffAugment'],
                                    CR=config['CR'] > 0,
                                    CR_augment=config['CR_augment'])

                # Compute components of D's loss, average them, and divide by
                # the number of gradient accumulations
                D_loss_real, D_loss_fake = losses.discriminator_loss(
                    D_fake, D_real)
                D_loss = D_loss_real + D_loss_fake
                D_loss = D_loss / float(config['num_D_accumulations'])
                D_loss.backward()
                counter += 1

            # Optionally apply ortho reg in D
            if config['D_ortho'] > 0.0:
                # Debug print to indicate we're using ortho reg in D.
                print('using modified ortho reg in D')
                utils.ortho(D, config['D_ortho'])

            D.optim.step()

        # Optionally toggle "requires_grad"
        if config['toggle_grads']:
            utils.toggle_grad(D, False)
            utils.toggle_grad(G, True)

        # Zero G's gradients by default before training G, for safety
        G.optim.zero_grad()

        # With fix_G set, G is left untouched and the EMA is not updated.
        if not config['fix_G']:
            # If accumulating gradients, loop multiple times
            for accumulation_index in range(config['num_G_accumulations']):
                z_.sample_()
                y_.sample_()
                D_fake = GD(z_, y_, train_G=True, policy=config['DiffAugment'])
                G_loss = losses.generator_loss(D_fake) / float(
                    config['num_G_accumulations'])
                G_loss.backward()

            # Optionally apply modified ortho reg in G
            if config['G_ortho'] > 0.0:
                # Debug print to indicate we're using ortho reg in G
                print('using modified ortho reg in G')
                # Don't ortho reg shared, it makes no sense. Really we should blacklist any embeddings for this
                utils.ortho(
                    G,
                    config['G_ortho'],
                    blacklist=[param for param in G.shared.parameters()])
            G.optim.step()

            # If we have an ema, update it, regardless of if we test with it or not
            if config['ema']:
                ema.update(state_dict['itr'])

        out = {
            'G_loss': float(G_loss.item()) if not config['fix_G'] else 0,
            'D_loss_real': float(D_loss_real.item()),
            'D_loss_fake': float(D_loss_fake.item()),
        }
        # Return G's loss and the components of D's loss.
        return out
Exemple #20
0
    def train(x, y, stage):
        """Run one training iteration for D, then for M (and G in stage 2).

        Relies on names from the enclosing scope: ``G``, ``D``, ``M``, ``GD``,
        ``z_``, ``y_``, ``config``, ``ema``, ``state_dict``, ``losses`` and
        ``utils``.

        Args:
            x: Real data batch; split along dim 0 into chunks of
                ``config['batch_size']``.
            y: Labels for ``x``; split like ``x``.
            stage: When ``1``, G's grads stay disabled (if grads are toggled)
                and only M is stepped; G's optimizer is stepped only when
                ``stage == 2``.

        Returns:
            dict with ``G_loss`` (the M/generator loss of the last
            accumulation), ``D_loss_real`` and ``D_loss_fake`` as floats.
        """
        G.optim.zero_grad()
        D.optim.zero_grad()
        M.optim.zero_grad()

        # Cut the incoming batch into per-accumulation chunks.
        bs = config['batch_size']
        x_chunks = torch.split(x, bs)
        y_chunks = torch.split(y, bs)
        chunk_idx = 0

        # Discriminator phase: only D receives gradients.
        if config['toggle_grads']:
            utils.toggle_grad(D, True)
            utils.toggle_grad(G, False)
            utils.toggle_grad(M, False)

        for _ in range(config['num_D_steps']):
            # Fresh gradients for every optimizer step of D.
            D.optim.zero_grad()
            for _ in range(config['num_D_accumulations']):
                z_.sample_()
                y_.sample_()
                fake_scores, real_scores = GD(
                    z_[:bs],
                    y_[:bs],
                    x_chunks[chunk_idx],
                    y_chunks[chunk_idx],
                    train_G=False,
                    split_D=config['split_D'])

                # Sum both loss parts and average over the accumulations.
                D_loss_real, D_loss_fake = losses.discriminator_loss(
                    fake_scores, real_scores)
                d_total = D_loss_real + D_loss_fake
                d_total = d_total / float(config['num_D_accumulations'])
                d_total.backward()
                chunk_idx += 1

            # Optional orthogonal regularisation on D.
            if config['D_ortho'] > 0.0:
                print('using modified ortho reg in D')
                utils.ortho(D, config['D_ortho'])

            D.optim.step()

        # Generator / M phase: D is frozen, M always trains, G trains
        # unless we are in stage 1.
        if config['toggle_grads']:
            utils.toggle_grad(D, False)
            utils.toggle_grad(G, not (stage == 1))
            utils.toggle_grad(M, True)

        # Clear any gradients left on G and M before their update.
        G.optim.zero_grad()
        M.optim.zero_grad()

        for _ in range(config['num_G_accumulations']):
            z_.sample_()
            y_.sample_()
            gen_scores, regu_term = GD(z_,
                                       y_,
                                       train_G=True,
                                       split_D=config['split_D'],
                                       train_M=True,
                                       M_regu=True)
            m_loss = losses.generator_loss(gen_scores, regu_term)
            m_loss = m_loss / float(config['num_G_accumulations'])
            m_loss.backward()

        # Optional orthogonal regularisation on G; the shared embedding
        # parameters are excluded.
        if config['G_ortho'] > 0.0:
            print('using modified ortho reg in G')
            shared_params = list(G.shared.parameters())
            utils.ortho(G, config['G_ortho'], blacklist=shared_params)

        # G's optimizer only moves in stage 2; M's always does.
        if stage == 2:
            G.optim.step()
        M.optim.step()

        # Keep the EMA in sync whether or not it is used for testing.
        if config['ema']:
            ema.update(state_dict['itr'])

        # Report the M/generator loss under the usual 'G_loss' key.
        out = {}
        out['G_loss'] = float(m_loss.item())
        out['D_loss_real'] = float(D_loss_real.item())
        out['D_loss_fake'] = float(D_loss_fake.item())
        return out
Exemple #21
0
    def train(x, y):
        """Run one training iteration (D step(s), then one G step) on XLA.

        Relies on names from the enclosing scope: ``G``, ``D``, ``GD``,
        ``sample``, ``config``, ``ema``, ``state_dict``, ``losses``, ``utils``
        and ``xm``. Optimizer steps go through ``xm.optimizer_step`` and
        debug messages through ``xm.master_print``.

        Args:
            x: Real data batch; split along dim 0 into chunks of
                ``config['batch_size']``.
            y: Labels for ``x``; split like ``x``.

        Returns:
            dict with ``G_loss``, ``D_loss_real`` and ``D_loss_fake``. The
            values are the loss tensors themselves from the last
            accumulation; they are not converted with ``.item()``.
        """
        G.optim.zero_grad()
        D.optim.zero_grad()
        # How many chunks to split x and y into?
        x = torch.split(x, config['batch_size'])
        y = torch.split(y, config['batch_size'])
        counter = 0

        # Optionally toggle D and G's "require_grad"
        if config['toggle_grads']:
            utils.toggle_grad(D, True)
            utils.toggle_grad(G, False)

        for step_index in range(config['num_D_steps']):
            # If accumulating gradients, loop multiple times before an
            # optimizer step
            D.optim.zero_grad()

            for accumulation_index in range(config['num_D_accumulations']):
                z_, y_ = sample()
                D_fake, D_real = GD(z_[:config['batch_size']], y_[:config['batch_size']],
                                    x[counter], y[counter], train_G=False,
                                    split_D=config['split_D'])
                # Compute components of D's loss, average them, and divide by
                # the number of gradient accumulations
                D_loss_real, D_loss_fake = losses.discriminator_loss(
                    D_fake, D_real)
                D_loss = (D_loss_real + D_loss_fake) / \
                    float(config['num_D_accumulations'])
                D_loss.backward()
                counter += 1

            # Optionally apply ortho reg in D
            if config['D_ortho'] > 0.0:
                # Debug print to indicate we're using ortho reg in D.
                xm.master_print('using modified ortho reg in D')
                utils.ortho(D, config['D_ortho'])

            xm.optimizer_step(D.optim)

        # Optionally toggle "requires_grad"
        if config['toggle_grads']:
            utils.toggle_grad(D, False)
            utils.toggle_grad(G, True)

        # Zero G's gradients by default before training G, for safety
        G.optim.zero_grad()

        # If accumulating gradients, loop multiple times
        for accumulation_index in range(config['num_G_accumulations']):

            z_, y_ = sample()
            D_fake = GD(z_, y_, train_G=True, split_D=config['split_D'])
            G_loss = losses.generator_loss(
                D_fake) / float(config['num_G_accumulations'])
            G_loss.backward()

        # Optionally apply modified ortho reg in G
        if config['G_ortho'] > 0.0:
            # Debug print to indicate we're using ortho reg in G. Uses
            # xm.master_print like the D branch so the message is not
            # repeated by every process.
            xm.master_print('using modified ortho reg in G')
            # Don't ortho reg shared, it makes no sense. Really we should
            # blacklist any embeddings for this
            utils.ortho(G, config['G_ortho'],
                        blacklist=[param for param in G.shared.parameters()])
        xm.optimizer_step(G.optim)

        # If we have an ema, update it, regardless of if we test with it or not
        if config['ema']:
            ema.update(state_dict['itr'])

        out = {'G_loss': G_loss,
               'D_loss_real': D_loss_real,
               'D_loss_fake': D_loss_fake}
        # Return G's loss and the components of D's loss.
        return out
Exemple #22
0
    def train(x, y, tensor_writer=None, iteration=None):
        print('Summation will be taken', config['D_hinge_loss_sum'],
              'D hinge loss')
        G.optim.zero_grad()
        D.optim.zero_grad()
        if config['no_Dv'] == False:
            Dv.optim.zero_grad()

        if tensor_writer != None and iteration % config[
                'log_results_every'] == 0:
            tensor_writer.add_video('Loaded Data', (x + 1) / 2, iteration)
            mean_pixel_val = torch.mean((x + 1) / 2, dim=[0, 1, 3, 4])
            tensor_writer.add_scalar(
                'Pixel vals/Mean Red Pixel values, real data',
                float(mean_pixel_val[0].item()), iteration)
            tensor_writer.add_scalar(
                'Pixel vals/Mean Green Pixel values, real data',
                float(mean_pixel_val[1].item()), iteration)
            tensor_writer.add_scalar(
                'Pixel vals/Mean Blue Pixel values, real data',
                float(mean_pixel_val[2].item()), iteration)

            y_text = []
            for yi in y:
                y_text.append(idx_to_classes[yi.item()])
            tensor_writer.add_text('Loaded Labels', ' | '.join(y_text),
                                   iteration)
        #Added by Xiaodan: prepare for avg pixel loss
        if config['no_avg_pixel_loss'] == False:
            mean_pixel_val_real = torch.mean((x + 1) / 2)
        # print('Range of loaded data:',x.min(),'--',x.max())
        # How many chunks to split x and y into?
        x = torch.split(x, config['batch_size'])
        y = torch.split(y, config['batch_size'])
        counter = 0

        # Optionally toggle D and G's "require_grad"
        if config['toggle_grads']:
            utils.toggle_grad(D, True)
            if config['no_Dv'] == False:
                utils.toggle_grad(Dv, True)
            utils.toggle_grad(G, False)

        for step_index in range(config['num_D_steps']):
            # If accumulating gradients, loop multiple times before an optimizer step
            D.optim.zero_grad()
            if config['no_Dv'] == False:
                Dv.optim.zero_grad()
            for accumulation_index in range(config['num_D_accumulations']):
                z_.sample_()
                y_.sample_()
                # print('z_ size in GAN tranining func:',z_.shape)
                # print('y_ size in GAN tranining func:',y_.shape)
                #xiaodan: D_fake, D_real [B*8,1]
                # print('hier and G_shared:',config['hier'],config['G_shared'])
                # print('Shape of z_[:config[batch_size]]:',z_[:config['batch_size']].shape)
                # print('config[batch_size]',config['batch_size'])
                if config['no_Dv'] == False:
                    D_fake, D_real, Dv_fake, Dv_real, G_z = GD(
                        z_[:config['batch_size']],
                        y_[:config['batch_size']],
                        x[counter],
                        y[counter],
                        train_G=False,
                        split_D=config['split_D'],
                        tensor_writer=tensor_writer,
                        iteration=iteration)
                else:
                    D_fake, D_real, G_z = GD(z_[:config['batch_size']],
                                             y_[:config['batch_size']],
                                             x[counter],
                                             y[counter],
                                             train_G=False,
                                             split_D=config['split_D'],
                                             tensor_writer=tensor_writer,
                                             iteration=iteration)
                # print('GD.k in train_fns line 49',GD.module.k) #GD.module because GD is now dataparallel class
                # D_fake & D_real shapes: [Bk,1], [Bk,1]
                # xiaodan: Make scores back to [B,k,1] for easier summation in discriminator_loss
                D_fake = D_fake.contiguous().view(-1, GD.module.k,
                                                  *D_fake.shape[1:])  #[B,k,1]
                D_real = D_real.contiguous().view(-1, GD.module.k,
                                                  *D_real.shape[1:])  #[B,k,1]
                if config['D_hinge_loss_sum'] == 'before':
                    D_fake = torch.sum(
                        D_fake, 1
                    )  #xiaodan: add k scores before doing hinge loss, according to the paper
                    D_real = torch.sum(D_real, 1)  #[B,1]
                # Compute components of D's loss, average them, and divide by
                # the number of gradient accumulations
                D_loss_real, D_loss_fake = losses.discriminator_loss(
                    D_fake, D_real, config['D_hinge_loss_sum'])

                # Dv_fake & Dv_real shapes: [BT*,1], [BT*,1] if T_into_B; [B,1], [B,1] if False
                if config['no_Dv'] == False:
                    # print('Dv_fake shape',Dv_fake.shape)
                    if config['T_into_B'] == True:
                        Dv_fake = Dv_fake.contiguous().view(
                            D_fake.shape[0], -1, *Dv_fake.shape[1:])  #[B,T*,1]
                        Dv_real = Dv_real.contiguous().view(
                            D_real.shape[0], -1, *Dv_real.shape[1:])  #[B,T*,1]
                        if config['Dv_hinge_loss_sum'] == 'before':
                            Dv_fake = torch.sum(
                                Dv_fake, 1
                            )  #xiaodan: add T* scores before doing hinge loss
                            Dv_real = torch.sum(Dv_real, 1)  #[B,1]
                        Dv_loss_real, Dv_loss_fake = losses.discriminator_loss(
                            Dv_fake, Dv_real, config['Dv_hinge_loss_sum'])
                    else:
                        #Xiaodan: If T_into_B is False, must use "before" for hinge loss.
                        Dv_loss_real, Dv_loss_fake = losses.discriminator_loss(
                            Dv_fake, Dv_real, 'before')
                    D_loss = (D_loss_real + D_loss_fake + Dv_loss_fake +
                              Dv_loss_real) / float(
                                  config['num_D_accumulations'])
                else:
                    D_loss = (D_loss_real + D_loss_fake) / float(
                        config['num_D_accumulations'])
                D_loss.backward()
                counter += 1

            # Optionally apply ortho reg in D
            if config['D_ortho'] > 0.0:
                # Debug print to indicate we're using ortho reg in D.
                if config['no_Dv'] == False:
                    print('using modified ortho reg in D and Dv')
                    utils.ortho(Dv, config['D_ortho'])
                else:
                    print('using modified ortho reg in D')
                    utils.ortho(D, config['D_ortho'])

            D.optim.step()
            if config['no_Dv'] == False:
                Dv.optim.step()

        # Optionally toggle "requires_grad"
        if config['toggle_grads']:
            utils.toggle_grad(D, False)
            if config['no_Dv'] == False:
                utils.toggle_grad(Dv, False)
            utils.toggle_grad(G, True)

        # Zero G's gradients by default before training G, for safety
        G.optim.zero_grad()

        # If accumulating gradients, loop multiple times
        for accumulation_index in range(config['num_G_accumulations']):
            z_.sample_()
            y_.sample_()
            # print('z_,y_ shapes before pass into GD:',z_.shape,y_.shape)
            if config['no_Dv'] == False:
                D_fake, Dv_fake, G_z = GD(z_,
                                          y_,
                                          train_G=True,
                                          split_D=config['split_D'],
                                          tensor_writer=tensor_writer,
                                          iteration=iteration)
            else:
                D_fake, G_z = GD(z_,
                                 y_,
                                 train_G=True,
                                 split_D=config['split_D'],
                                 tensor_writer=tensor_writer,
                                 iteration=iteration)

            D_fake = D_fake.contiguous().view(-1, GD.module.k,
                                              *D_fake.shape[1:])  #[B, k, 1]
            D_fake = torch.mean(
                D_fake,
                1)  # [B,1]  xiaodan: average k scores before doing hinge loss

            G_loss = config['D_loss_weight'] * losses.generator_loss(
                D_fake) / float(config['num_G_accumulations'])
            if config['no_Dv'] == False:
                if config['T_into_B'] == True:
                    Dv_fake = Dv_fake.contiguous().view(
                        D_fake.shape[0], -1, *Dv_fake.shape[1:])  #[B,T*,1]
                    Dv_fake = torch.mean(Dv_fake, 1)  # [B,1]
                G_loss += losses.generator_loss(Dv_fake) / float(
                    config['num_G_accumulations'])
            #Added by Xiaodan to take avg. pixel value into account as an additional loss
            # print(type(G_loss))
            if config['no_avg_pixel_loss'] == False:
                mean_pixel_val_fake = torch.mean((G_z + 1) / 2)
                mean_pixel_val_diff = abs(
                    float(mean_pixel_val_fake.item()) -
                    float(mean_pixel_val_real.item()))
                mean_pixel_loss = losses.avg_pixel_loss(
                    mean_pixel_val_diff,
                    config['avg_pixel_loss_weight']) / float(
                        config['num_G_accumulations'])
                if iteration >= config['pixel_loss_kicksin']:
                    G_loss += mean_pixel_loss
                else:
                    mean_pixel_loss = 0
            G_loss.backward()

        # Optionally apply modified ortho reg in G
        if config['G_ortho'] > 0.0:
            print('using modified ortho reg in G'
                  )  # Debug print to indicate we're using ortho reg in G
            # Don't ortho reg shared, it makes no sense. Really we should blacklist any embeddings for this
            utils.ortho(G,
                        config['G_ortho'],
                        blacklist=[param for param in G.shared.parameters()])
        if config['no_convgru'] == False:
            G_grad_gates = G.convgru.convgru.cell_list[
                0].conv_gates.weight.grad.abs().sum()
            G_grad_can = G.convgru.convgru.cell_list[
                0].conv_can.weight.grad.abs().sum()
            G_grad_first_layer = G.blocks[0][0].conv1.weight.grad.abs().sum()
            G_weight_gates = G.convgru.convgru.cell_list[
                0].conv_gates.weight.abs().mean()
            G_weight_can = G.convgru.convgru.cell_list[0].conv_can.weight.abs(
            ).mean()
            G_weight_first_layer = G.blocks[0][0].conv1.weight.abs().mean()
        G.optim.step()

        # If we have an ema, update it, regardless of if we test with it or not
        if config['ema']:
            ema.update(state_dict['itr'])
        if config['no_Dv'] == False:
            out = {
                'G_loss': float(G_loss.item()),
                'D_loss_real': float(D_loss_real.item()),
                'D_loss_fake': float(D_loss_fake.item()),
                'Dv_loss_real': float(Dv_loss_real.item()),
                'Dv_loss_fake': float(Dv_loss_fake.item())
            }
        else:
            out = {
                'G_loss': float(G_loss.item()),
                'D_loss_real': float(D_loss_real.item()),
                'D_loss_fake': float(D_loss_fake.item())
            }
        if tensor_writer != None and iteration % config[
                'log_results_every'] == 0:
            tensor_writer.add_video('Video Results', (G_z + 1) / 2, iteration)
            mean_pixel_val = torch.mean((G_z + 1) / 2, dim=[0, 1, 3, 4])
            tensor_writer.add_scalar(
                'Pixel vals/Mean Red Pixel values, fake data',
                float(mean_pixel_val[0].item()), iteration)
            tensor_writer.add_scalar(
                'Pixel vals/Mean Green Pixel values, fake data',
                float(mean_pixel_val[1].item()), iteration)
            tensor_writer.add_scalar(
                'Pixel vals/Mean Blue Pixel values, fake data',
                float(mean_pixel_val[2].item()), iteration)
            y_Gz_text = []
            for yi in y_:
                y_Gz_text.append(idx_to_classes[yi.item()])
            tensor_writer.add_text('Generated Labels', ' | '.join(y_Gz_text),
                                   iteration)

            # Log the avg. pixel loss, G's loss and the components of D's loss via tensor_writer.
            if config['no_avg_pixel_loss'] == False:
                tensor_writer.add_scalar('Loss/avg_pixel_loss',
                                         mean_pixel_loss, iteration)
            tensor_writer.add_scalar('Loss/G_loss', out['G_loss'], iteration)
            tensor_writer.add_scalar('Loss/D_loss_real', out['D_loss_real'],
                                     iteration)
            tensor_writer.add_scalar('Loss/D_loss_fake', out['D_loss_fake'],
                                     iteration)
            if config['no_Dv'] == False:
                tensor_writer.add_scalar('Loss/Dv_loss_fake',
                                         out['Dv_loss_fake'], iteration)
                tensor_writer.add_scalar('Loss/Dv_loss_real',
                                         out['Dv_loss_real'], iteration)
            if config['no_convgru'] == False:
                tensor_writer.add_scalar('Gradient/G_grad_gates', G_grad_gates,
                                         iteration)
                tensor_writer.add_scalar('Gradient/G_grad_can', G_grad_can,
                                         iteration)
                tensor_writer.add_scalar('Gradient/G_grad_first_layer',
                                         G_grad_first_layer, iteration)

                tensor_writer.add_scalar('Weight/G_weight_gates',
                                         G_weight_gates, iteration)
                tensor_writer.add_scalar('Weight/G_weight_can', G_weight_can,
                                         iteration)
                tensor_writer.add_scalar('Weight/G_weight_first_layer',
                                         G_weight_first_layer, iteration)
        return out