def calc_supervised_speaker_loss(self, c, filename):
        """
        Calculates the loss for fully supervised training using the speaker labels
        encoded in the filenames.
        :param c: output of the layer to be trained
        :param filename: filenames of the current files in the batch
        :return: loss and accuracy
        """

        cur_device = utils.get_device(self.opt, c)

        # the speaker id is the first "-"-separated component of each filename
        targets = torch.tensor(
            [self.speaker_id_dict[f.split("-")[0]] for f in filename],
            dtype=torch.long,
            device=cur_device,
        )

        # forward pass: pool each sample's representations over time, then classify
        c = c.permute(0, 2, 1)  # (B, C, L)

        pooled_c = nn.functional.adaptive_avg_pool1d(c, self.label_num)  # (B, C, label_num)
        pooled_c = pooled_c.permute(0, 2, 1).reshape(-1, self.hidden_dim)

        speaker_out = self.linear_classifier(pooled_c)

        loss = self.speaker_loss(speaker_out, targets)

        accuracy = torch.zeros(1)
        # calculate accuracy
        if self.calc_accuracy:
            _, predicted = torch.max(speaker_out.data, 1)
            total = targets.size(0)
            correct = (predicted == targets).sum().item()
            accuracy[0] = correct / total

        return loss, accuracy
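
For reference, the pooling-and-classification pattern above reduces to the
following self-contained sketch; all sizes (hidden_dim = 512, label_num = 1,
251 speakers) are placeholders for illustration, not values taken from this code:

import torch
import torch.nn as nn

B, C, L = 8, 512, 128                             # batch, channels, timesteps
c = torch.randn(B, C, L)                          # layer output after permute(0, 2, 1)
pooled = nn.functional.adaptive_avg_pool1d(c, 1)  # (B, C, 1): one vector per file
pooled = pooled.permute(0, 2, 1).reshape(-1, C)   # (B, C)
classifier = nn.Linear(C, 251)                    # e.g. one class per speaker
logits = classifier(pooled)                       # (B, 251)
loss = nn.CrossEntropyLoss()(logits, torch.randint(0, 251, (B,)))
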
    def calc_InfoNCE_loss(self, Wc, z):
        """
        Calculate the InfoNCE loss from the model outputs Wc (the prediction) and
        z (the encoded future).

        :param Wc: output of the predictor, where W are the weights for the different
        timesteps and c the latent representation (from the autoregressor if
        use_autoregressor=True, from the encoder otherwise) -
        dimensions: (B, L, C * self.opt.prediction_step)
        :param z: encoded future - output of the encoder - dimensions: (B, L, C)
        :return: loss_per_sample - loss per sample, averaged over all timesteps and
                    prediction steps in the batch
                 cm - confusion matrix of the predictions, accumulated over all
                    prediction steps
        """
        seq_len = z.size(1)

        cur_device = utils.get_device(self.opt, Wc)

        # NaN is the only value that is not equal to itself
        assert (Wc == Wc).all(), "InfoNCE: NaN in Wc"
        assert (z == z).all(), "InfoNCE: NaN in z"

        # confusion matrix over the positive/negative predictions
        cm = np.zeros((2, 2))

        if self.opt.sampling_method in (1, 2):
            z_neg, _, _ = self.get_neg_z(z, cur_device)
        else:
            z_neg = None

        # per-sample loss, accumulated over all prediction steps
        loss_per_sample = torch.zeros((self.opt.batch_size,), device=cur_device)

        for k in range(1, self.opt.prediction_step + 1):
            z_k = z[:, k:, :]  # encodings k steps ahead
            Wc_k = Wc[:, :-k, (k - 1) * self.enc_hidden : k * self.enc_hidden]  # predictions for step k

            z_k = self.broadcast_batch_length(z_k)
            Wc_k = self.broadcast_batch_length(Wc_k)

            pos_samples = self.get_pos_sample_f(Wc_k, z_k)
            neg_samples = self.get_neg_samples_f(Wc_k, z_k, cur_device, z_neg, k)

            # concatenate positive and negative samples; the positive is at index 0
            results = torch.cat((pos_samples, neg_samples), 1)
            loss = self.loss(results)[:, 0]  # keep the score of the positive sample
            total_samples = (seq_len - k) * self.opt.batch_size

            # normalize over all timesteps (mean) and all prediction steps K
            loss_per_sample += (
                -loss.reshape(self.opt.batch_size, -1).mean(axis=1)
                / self.opt.prediction_step
            )

            # update the confusion matrix; a prediction is correct iff the
            # positive sample (index 0) receives the highest score
            if self.calc_accuracy:
                predicted = torch.argmax(results, 1) == 0
                cm += confusion_matrix(
                    np.ones(total_samples), predicted.cpu().numpy(), labels=[0, 1]
                )

        return loss_per_sample, cm
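
The loop above evaluates the InfoNCE objective once per prediction step k. Stripped
of the model specifics, and assuming a plain dot-product scoring function with
N = 10 negatives (the actual scoring is done by get_pos_sample_f /
get_neg_samples_f), the core computation per step is:

import torch

T, C, N = 100, 512, 10                                  # samples, channels, negatives
Wc_k = torch.randn(T, C)                                # predictions for step k
z_pos = torch.randn(T, C)                               # true encodings k steps ahead
z_neg = torch.randn(T, N, C)                            # negative samples

pos = (Wc_k * z_pos).sum(dim=1, keepdim=True)           # (T, 1) positive scores
neg = torch.bmm(z_neg, Wc_k.unsqueeze(2)).squeeze(2)    # (T, N) negative scores
results = torch.cat((pos, neg), dim=1)                  # positive at column 0
loss = -torch.log_softmax(results, dim=1)[:, 0].mean()  # InfoNCE loss
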
Example #3
    def calc_CPC_loss(self, Wc, z, full_z=None):
        """
        Calculate the CPC loss from the model outputs Wc (the prediction) and
        z (the encoded future).

        :param Wc: output of the predictor, where W are the weights for the different
        timesteps and c the latent representation (from the autoregressor if
        use_autoregressor=True, from the encoder otherwise) -
        dimensions: (B, L, C * self.opt.prediction_step)
        :param z: encoded future - output of the encoder - dimensions: (B, L, C)
        :param full_z: full sequence of encodings from which negative samples are drawn
        :return: total_loss - average loss over all samples, timesteps and prediction
                    steps in the batch
                 accuracies - average accuracy over all samples, timesteps and
                    prediction steps in the batch
        """
        seq_len = z.size(1)

        cur_device = utils.get_device(self.opt, Wc)

        total_loss = 0

        accuracies = torch.zeros(self.opt.prediction_step, 1)
        # the positive sample always sits at index 0 of the score matrix
        true_labels = torch.zeros(
            (seq_len * self.opt.batch_size,), device=cur_device
        ).long()

        if self.opt.sampling_method in (1, 2):
            z_neg, _, _ = self.get_neg_z(full_z, cur_device)
        else:
            z_neg = None

        for k in range(1, self.opt.prediction_step + 1):
            z_k = z[:, k:, :]  # encodings k steps ahead
            Wc_k = Wc[:, :-k, (k - 1) * self.enc_hidden : k * self.enc_hidden]  # predictions for step k

            z_k = self.broadcast_batch_length(z_k)
            Wc_k = self.broadcast_batch_length(Wc_k)

            pos_samples = self.get_pos_sample_f(Wc_k, z_k)
            neg_samples = self.get_neg_samples_f(Wc_k, z_k, cur_device, z_neg, k)

            # concatenate positive and negative samples; the positive is at index 0
            results = torch.cat((pos_samples, neg_samples), 1)
            loss = self.loss(results)[:, 0]  # keep the score of the positive sample

            total_samples = (seq_len - k) * self.opt.batch_size
            loss = -loss.sum() / total_samples
            total_loss += loss

            # calculate accuracy: a prediction is correct iff the positive
            # sample (index 0) receives the highest score
            if self.calc_accuracy:
                predicted = torch.argmax(results, 1)
                correct = (
                    (predicted == true_labels[: (seq_len - k) * self.opt.batch_size])
                    .sum()
                    .item()
                )
                accuracies[k - 1] = correct / total_samples

        total_loss /= self.opt.prediction_step
        accuracies = torch.mean(accuracies)

        return total_loss, accuracies
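
The accuracy bookkeeping above relies on the positive sample always occupying
column 0 of the score matrix, so a prediction counts as correct exactly when the
argmax over the 1 + N scores is 0. As a standalone sketch (sizes are placeholders):

import torch

results = torch.randn(100, 11)               # 100 samples, 1 positive + 10 negatives
predicted = torch.argmax(results, dim=1)     # index of the highest score
accuracy = (predicted == 0).float().mean()   # chance level here would be 1/11
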
Example #4
    def forward(self, input):
        cur_device = utils.get_device(self.opt, input)

        # initial hidden state: (num_layers, B, hidden_dim)
        regress_hidden_state = torch.zeros(
            1, input.size(0), self.hidden_dim, device=cur_device
        )
        self.gru.flatten_parameters()
        output, regress_hidden_state = self.gru(input, regress_hidden_state)

        return output
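
A usage sketch for this autoregressor forward pass, assuming a single-layer,
batch-first nn.GRU with made-up sizes (input_size = 512, hidden_dim = 256; the
real values come from the module's constructor):

import torch
import torch.nn as nn

gru = nn.GRU(input_size=512, hidden_size=256, batch_first=True)
x = torch.randn(8, 128, 512)           # (B, L, C)
h0 = torch.zeros(1, x.size(0), 256)    # (num_layers, B, hidden_dim)
output, hN = gru(x, h0)                # output: (B, L, 256)
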
Example #5
    def forward(self, x, filename=None, start_idx=None, n=6):
        model_input = x

        cur_device = utils.get_device(self.opt, x)

        # first dimension is used for concatenating results from different GPUs
        loss = torch.zeros(1, len(self.fullmodel), device=cur_device)
        accuracy = torch.zeros(1, len(self.fullmodel), device=cur_device)

        if n == 6:  # default: train all layers at once
            for idx, layer in enumerate(self.fullmodel):
                loss[:, idx], accuracy[:, idx], _, z = layer(
                    model_input, filename, start_idx
                )
                # detach so that no gradients flow into earlier modules
                model_input = z.permute(0, 2, 1).detach()
        else:
            """
            Forward up to the layer we want to train and output only that layer's
            loss (all other values stay at their zero initialization).
            This does not reap the memory benefits that would be possible if we
            trained the layers completely separately (training one layer and saving
            its output as the dataset for the next), but it lets us test the
            behaviour of the model under greedy iterative training.
            """
            assert (
                self.opt.model_splits == 5 or self.opt.model_splits == 6
            ), "Works only for GIM model training"

            for idx, layer in enumerate(self.fullmodel[: n + 1]):
                if idx == n:  # the layer that is being trained
                    loss[:, idx], accuracy[:, idx], _, _ = layer(
                        model_input, filename, start_idx
                    )
                else:  # earlier layers only provide (detached) inputs
                    _, z = layer.get_latents(model_input)
                    model_input = z.permute(0, 2, 1).detach()

        return loss
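
The detach() calls in both branches are what make the training greedy: the gradient
of a module's loss never reaches the modules before it. A minimal sketch of that
pattern, with two toy modules and a stand-in local loss (both are illustrative
assumptions, not this model's actual modules or objective):

import torch
import torch.nn as nn

modules = nn.ModuleList([nn.Linear(16, 16), nn.Linear(16, 16)])
optimizers = [torch.optim.Adam(m.parameters()) for m in modules]

x = torch.randn(4, 16)
for module, optimizer in zip(modules, optimizers):
    out = module(x)
    loss = out.pow(2).mean()  # stand-in for each module's local loss
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    x = out.detach()          # gradients stop here, as in the forward above
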
Example #6
    def forward(self, input):
        cur_device = utils.get_device(self.opt, input)

        if self.opt.remove_BPTT:
            """
            To remove backpropagation through time (BPTT), we loop over the
            sequence manually and detach the hidden state, restricting gradients
            to flow only within the current time-step.
            """

            input = input.permute(1, 0, 2)  # (L, B, C)

            regress_hidden_state = torch.zeros(
                input.size(1), self.hidden_dim, device=cur_device
            )
            output = torch.zeros(
                input.size(0), input.size(1), self.hidden_dim, device=cur_device
            )

            # step through time; detaching the hidden state cuts the gradient
            # at every time-step boundary
            for i in range(len(input)):
                regress_hidden_state = self.gru(
                    input[i], regress_hidden_state.detach()
                )
                output[i] = regress_hidden_state

            output = output.permute(1, 0, 2)  # back to (B, L, C)
        else:
            # standard full-BPTT forward pass through the GRU
            regress_hidden_state = torch.zeros(
                1, input.size(0), self.hidden_dim, device=cur_device
            )
            self.gru.flatten_parameters()
            output, regress_hidden_state = self.gru(input, regress_hidden_state)

        return output
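
In the remove_BPTT branch above, self.gru is called with 2-D inputs and returns a
single tensor, which implies a cell-style module (e.g. nn.GRUCell) rather than the
nn.GRU used in the else branch. A standalone sketch of the same truncated-gradient
idea (sizes are placeholders):

import torch
import torch.nn as nn

cell = nn.GRUCell(input_size=512, hidden_size=256)
x = torch.randn(128, 8, 512)                      # (L, B, C)
h = torch.zeros(8, 256)                           # (B, hidden_dim)
outputs = []
for t in range(x.size(0)):
    h = cell(x[t], h.detach())                    # detach: no gradient across time-steps
    outputs.append(h)
output = torch.stack(outputs).permute(1, 0, 2)    # (B, L, hidden_dim)
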