Example #1
    def get_example(self, i):
        rec, st, ed = self.chunk_indices[i]
        # Extract STFT features and frame-level speaker labels for this chunk
        Y, T = feature.get_labeledSTFT(
            self.data,
            rec,
            st,
            ed,
            self.frame_size,
            self.frame_shift,
            self.n_speakers)
        # Apply the configured input transform, splice each frame with its
        # surrounding context, and subsample along the time axis
        Y = feature.transform(Y, self.input_transform)
        Y_spliced = feature.splice(Y, self.context_size)
        Y_ss, T_ss = feature.subsample(Y_spliced, T, self.subsampling)
        return Y_ss, T_ss
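The shape comments in Example #2 below indicate what feature.splice and feature.subsample produce. The following is a minimal numpy sketch of that behavior, inferred from those shapes; the actual feature module may differ (e.g., in how it pads the context at the chunk edges):

import numpy as np

def splice(Y, context_size):
    # (frame, num_ceps) -> (frame, num_ceps * (2 * context_size + 1)):
    # stack each frame with its +-context_size neighbours
    Y_pad = np.pad(Y, ((context_size, context_size), (0, 0)), mode='edge')
    frames = [Y_pad[i:i + len(Y)] for i in range(2 * context_size + 1)]
    return np.concatenate(frames, axis=1)

def subsample(Y_spliced, T, subsampling):
    # Keep every `subsampling`-th frame of both features and labels
    return Y_spliced[::subsampling], T[::subsampling]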
Example #2
    def __getitem__(self, i):
        rec, st, ed = self.chunk_indices[i]
        Y, T = feature.get_labeledSTFT(self.data, rec, st, ed, self.frame_size,
                                       self.frame_shift, self.n_speakers)
        # Y: (frame, num_ceps)
        Y = feature.transform(Y, self.input_transform)
        # Y_spliced: (frame, num_ceps * (context_size * 2 + 1))
        Y_spliced = feature.splice(Y, self.context_size)
        # Y_ss: (frame / subsampling, num_ceps * (context_size * 2 + 1))
        Y_ss, T_ss = feature.subsample(Y_spliced, T, self.subsampling)

        Y_ss = torch.from_numpy(Y_ss).float()
        T_ss = torch.from_numpy(T_ss).float()
        return Y_ss, T_ss
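Because Example #2 returns torch tensors, the dataset can be wrapped directly in a DataLoader. The collate_fn below is a hypothetical sketch (not part of the original code) that zero-pads variable-length chunks along the time axis so they stack into a batch:

import torch
import torch.nn.functional as F
from torch.utils.data import DataLoader

def collate_fn(batch):
    # Zero-pad each (frames, dim) tensor up to the longest chunk in the batch
    Ys, Ts = zip(*batch)
    max_len = max(y.shape[0] for y in Ys)
    pad = lambda x: F.pad(x, (0, 0, 0, max_len - x.shape[0]))
    return torch.stack([pad(y) for y in Ys]), torch.stack([pad(t) for t in Ts])

# `dataset` is assumed to be an instance of the class defining __getitem__ above:
# loader = DataLoader(dataset, batch_size=8, shuffle=True, collate_fn=collate_fn)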
Example #3
    def get_example(self, i):
        rec, st, ed = self.chunk_indices[i]
        n_speakers, Y, T = feature.get_labeledSTFT(
            self.data, rec, st, ed, self.frame_size, self.frame_shift, None
        )  # pass None instead of self.n_speakers so the actual speaker count is returned
        Y = feature.transform(Y, self.input_transform)
        Y_spliced = feature.splice(Y, self.context_size)
        Y_ss, T_ss = feature.subsample(Y_spliced, T, self.subsampling)

        # Here self.n_speakers is the maximum number of speakers in the
        # training set; pad T_ss up to that width so chunks with different
        # speaker counts can be batched together by _convert
        T_ss = np.pad(T_ss, ((0, 0), (0, self.n_speakers - T_ss.shape[1])),
                      'constant',
                      constant_values=0)

        return n_speakers, Y_ss, T_ss
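A toy check of how the padding above widens the label matrix, assuming a hypothetical maximum of 4 speakers in place of self.n_speakers:

import numpy as np

T_ss = np.ones((6, 2), dtype=np.float32)   # 6 frames, 2 active speakers
max_speakers = 4                           # stand-in for self.n_speakers
T_pad = np.pad(T_ss, ((0, 0), (0, max_speakers - T_ss.shape[1])),
               'constant', constant_values=0)
assert T_pad.shape == (6, 4)               # speakers 3 and 4 are all-zero columns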
Example #4
    def get_example(self, i):
        rec, st, ed = self.chunk_indices[i]
        Y, T = feature.get_labeledSTFT(self.data, rec, st, ed, self.frame_size,
                                       self.frame_shift, self.n_speakers)
        Y = feature.transform(Y, self.input_transform)
        Y_spliced = feature.splice(Y, self.context_size)
        Y_ss, T_ss = feature.subsample(Y_spliced, T, self.subsampling)

        # If the sample contains more than self.n_speakers speakers,
        # keep only the self.n_speakers most active ones
        if self.n_speakers and T_ss.shape[1] > self.n_speakers:
            selected_speakers = np.argsort(
                T_ss.sum(axis=0))[::-1][:self.n_speakers]
            T_ss = T_ss[:, selected_speakers]

        # If self.shuffle is True, shuffle the frames along the time axis.
        # This operation improves the performance of EEND-EDA
        if self.shuffle:
            order = np.arange(Y_ss.shape[0])
            np.random.shuffle(order)
            Y_ss = Y_ss[order]
            T_ss = T_ss[order]

        return Y_ss, T_ss
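A toy check of the top-k speaker selection above: with three speakers and self.n_speakers == 2, the two columns with the most speech activity are kept:

import numpy as np

T_ss = np.array([[1, 0, 1],
                 [1, 0, 1],
                 [0, 1, 1]], dtype=np.float32)     # per-frame labels for 3 speakers
selected = np.argsort(T_ss.sum(axis=0))[::-1][:2]  # indices by descending activity
print(T_ss[:, selected])                            # keeps speakers 2 and 0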