Example 1
    def train(self):

        # initializing the loss functions that will be used
        criterion = nn.MSELoss()
        l2_loss = nn.MSELoss()
        l1_loss = nn.L1Loss()
        softmax_criterion = nn.CrossEntropyLoss()  # hoisted out of the batch loop

        print('Training...')
        for epoch in range(self.num_epochs):
            for sample in self.data_loader:

                # unpacking the sample dictionary (each sample holds a face, its
                # one-hot identity, the raw audio, and a mismatched face)
                right_images = sample['face']
                onehot = sample['onehot']
                raw_wav = sample['audio']
                wrong_images = sample['wrong_face']
                id_labels = from_onehot_to_int(
                    onehot
                )  # indices of the YouTuber each audio clip belongs to
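                # a plausible sketch of the from_onehot_to_int helper, which is
                # not shown in this excerpt (assumption):
                #   def from_onehot_to_int(onehot):
                #       # index of the active class in each one-hot row
                #       return torch.argmax(onehot, dim=1).long()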

                # wrapping the inputs in Variables and moving them to the GPU
                right_images = Variable(right_images.float()).cuda()
                raw_wav = Variable(raw_wav.float()).cuda()
                wrong_images = Variable(wrong_images.float()).cuda()
                onehot = Variable(onehot.float()).cuda()
                id_labels = Variable(id_labels).cuda()

                # one target per sample in the batch: ones (real) and zeros (fake) used to compute the D loss
                real_labels = torch.ones(right_images.size(0))
                fake_labels = torch.zeros(right_images.size(0))

                # ======== One-sided label smoothing ==========
                # Helps prevent the discriminator from overpowering the
                # generator by penalizing over-confident predictions on
                # real samples.
                # =============================================
                smoothed_real_labels = torch.FloatTensor(
                    Utils.smooth_label(
                        real_labels.numpy(),
                        -0.1))  # so smoothed_real_labels is now 0.9 everywhere
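                # a plausible sketch of Utils.smooth_label, which is not shown
                # in this excerpt (assumption):
                #   def smooth_label(labels, offset):
                #       # shift every label by a constant, e.g. 1.0 + (-0.1) = 0.9
                #       return labels + offset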

                # moving the three label tensors to the GPU
                real_labels = Variable(real_labels).cuda()
                smoothed_real_labels = Variable(smoothed_real_labels).cuda()
                fake_labels = Variable(fake_labels).cuda()

                # ======= #
                # TRAIN D #
                # ======= #

                # setting all the gradients to 0
                self.discriminator.zero_grad()

                # feeding G only with the wav file; detaching its outputs so the
                # discriminator update below does not backpropagate into the generator
                fake_images, z_vector, _ = self.generator(raw_wav)
                fake_images, z_vector = fake_images.detach(), z_vector.detach()

                # feeding D with the generated images and the z vector, whose
                # dimensions are needed for the concatenation in the last hidden layer
                outputs, _ = self.discriminator(fake_images, z_vector)
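                # sketch of the assumed concatenation inside the discriminator
                # (its definition is not part of this excerpt):
                #   feats = self.conv_stack(images)                   # (B, C, H, W)
                #   z_rep = z.view(z.size(0), -1, 1, 1).expand(
                #       -1, -1, feats.size(2), feats.size(3))         # tile z spatially
                #   score = self.final_layers(torch.cat([feats, z_rep], 1))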

                # computing D loss when feeding fake images
                fake_score = outputs  # log file purposes
                fake_loss = criterion(outputs, fake_labels)

                # feeding D with the real images and the z vector again
                outputs, _ = self.discriminator(right_images, z_vector)

                # computing D loss when feeding real images
                real_score = outputs
                real_loss = criterion(outputs, smoothed_real_labels)

                # feeding D with real images that do not correspond to the wav under training
                outputs, _ = self.discriminator(wrong_images, z_vector)
                # computing D loss when feeding real images that do not match the input audio
                wrong_loss = criterion(outputs, fake_labels)
                wrong_score = outputs

                # the discriminator loss is the sum of the three terms
                d_loss = real_loss + fake_loss + wrong_loss
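                # spelled out, with a the input audio and x the images:
                #   L_D = MSE(D(x_real, z), 0.9)   real pair, smoothed target
                #       + MSE(D(G(a), z), 0)       generated images
                #       + MSE(D(x_wrong, z), 0)    real but mismatched images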

                d_loss.backward()

                self.optimD.step()

                # ======= #
                # TRAIN G #
                # ======= #

                # setting all the gradients to 0
                self.generator.zero_grad()

                # feeding G only with the wav file
                fake_images, z_vector, softmax_scores = self.generator(raw_wav)

                # feeding D with the generated images and the z vector, storing
                # intermediate layer activations for the feature-matching loss
                outputs, activation_fake = self.discriminator(
                    fake_images, z_vector)

                # feeding D with the real images and the z vector, again storing
                # intermediate layer activations
                _, activation_real = self.discriminator(right_images, z_vector)

                # batch-averaging the activations for the feature-matching loss
                activation_fake = torch.mean(activation_fake, 0)
                activation_real = torch.mean(activation_real, 0)

                # ======= Generator loss function ============
                # This is a customized loss function. The first term is the mean
                # squared error adversarial loss. The second term is the feature
                # matching loss, which measures the distance between the real and
                # generated image statistics by comparing intermediate layer
                # activations. The third term is the L1 distance between the
                # generated and real images; this is helpful for the conditional
                # case because it links the embedding feature vector directly to
                # certain pixel values. The fourth term is a cross-entropy loss
                # on the softmax identity classifier after the embedding.
                # ===========================================
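                # schematically:
                #   L_G = MSE(D(G(a), z), 1)                               adversarial term
                #       + l2_coef * MSE(mean(act_fake), mean(act_real))    feature matching
                #       + l1_coef * L1(G(a), x_real)                       pixel-wise
                #       + softmax_coef * CE(softmax_scores, id_labels)     identity classifier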

                # first computing the part of the loss related to the softmax
                # classifier after the embedding
                softmax_loss = softmax_criterion(softmax_scores, id_labels)

                g_loss = criterion(outputs, real_labels) \
                         + self.l2_coef * l2_loss(activation_fake, activation_real.detach()) \
                         + self.l1_coef * l1_loss(fake_images, right_images) \
                         + self.softmax_coef * softmax_loss  # softmax_loss starts around 2 while g_loss starts around 20, hence the extra scaling coefficient (e.g. 10)

                # applying backpropagation and updating parameters.
                g_loss.backward()
                self.optimG.step()

            # logging the last batch's losses and scores once per epoch
            self.logger.log_iteration_gan(epoch, d_loss, g_loss, real_score,
                                          fake_score, wrong_score)

            # saving a checkpoint every 10 epochs
            if epoch % 10 == 0:
                Utils.save_checkpoint(self.discriminator, self.generator,
                                      self.checkpoints_path, self.save_path,
                                      epoch)
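
The excerpt references attributes (self.generator, self.discriminator, self.optimD, self.optimG, the loss coefficients, and the checkpoint paths) that are configured outside train(). Below is a minimal, hypothetical sketch of the surrounding trainer wiring; the constructor signature, optimizer choice, and hyperparameter values are assumptions for illustration, not part of the original source.

import torch
import torch.nn as nn
from torch.autograd import Variable

class Trainer:
    def __init__(self, generator, discriminator, data_loader, logger,
                 num_epochs=200, lr=1e-4, l1_coef=50.0, l2_coef=100.0,
                 softmax_coef=10.0, checkpoints_path='checkpoints',
                 save_path='results'):
        # hypothetical wiring; attribute names mirror those used in train()
        self.generator = generator.cuda()
        self.discriminator = discriminator.cuda()
        self.data_loader = data_loader
        self.logger = logger
        self.num_epochs = num_epochs
        self.l1_coef = l1_coef
        self.l2_coef = l2_coef
        self.softmax_coef = softmax_coef
        self.checkpoints_path = checkpoints_path
        self.save_path = save_path
        # independent optimizers, stepped separately in train()
        self.optimD = torch.optim.Adam(self.discriminator.parameters(),
                                       lr=lr, betas=(0.5, 0.999))
        self.optimG = torch.optim.Adam(self.generator.parameters(),
                                       lr=lr, betas=(0.5, 0.999))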