def get_example(self, i):
    rec, st, ed = self.chunk_indices[i]
    # Extract STFT features Y and frame-level speaker labels T for this chunk
    Y, T = feature.get_labeledSTFT(
        self.data, rec, st, ed,
        self.frame_size, self.frame_shift, self.n_speakers)
    # Apply the configured input transform, then splice in temporal context
    Y = feature.transform(Y, self.input_transform)
    Y_spliced = feature.splice(Y, self.context_size)
    # Subsample both features and labels along the time axis
    Y_ss, T_ss = feature.subsample(Y_spliced, T, self.subsampling)
    return Y_ss, T_ss
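# The helpers above come from the repo's `feature` module. As a rough mental
# model only (a numpy-only sketch with assumed edge handling, not the module's
# actual implementation), splicing stacks each frame with its +-context_size
# neighbours and subsampling keeps every k-th frame:
import numpy as np

def splice_sketch(Y, context_size):
    # Stack each frame with its neighbours; edges repeat via index clipping
    n_frames, n_feats = Y.shape
    idx = np.clip(
        np.arange(-context_size, context_size + 1)[None, :]
        + np.arange(n_frames)[:, None],
        0, n_frames - 1)
    return Y[idx].reshape(n_frames, n_feats * (2 * context_size + 1))

def subsample_sketch(Y_spliced, T, subsampling):
    # Keep every `subsampling`-th frame of both features and labels
    return Y_spliced[::subsampling], T[::subsampling]

Y = np.random.randn(100, 23)                     # (frame, num_ceps)
T = np.zeros((100, 2))                           # (frame, n_speakers)
Y_spliced = splice_sketch(Y, context_size=7)     # -> (100, 23 * 15)
Y_ss, T_ss = subsample_sketch(Y_spliced, T, 10)  # -> (10, 345), (10, 2)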
def __getitem__(self, i):
    rec, st, ed = self.chunk_indices[i]
    Y, T = feature.get_labeledSTFT(
        self.data, rec, st, ed,
        self.frame_size, self.frame_shift, self.n_speakers)
    # Y: (frame, num_ceps)
    Y = feature.transform(Y, self.input_transform)
    # Y_spliced: (frame, num_ceps * (context_size * 2 + 1))
    Y_spliced = feature.splice(Y, self.context_size)
    # Y_ss: (frame / subsampling, num_ceps * (context_size * 2 + 1))
    Y_ss, T_ss = feature.subsample(Y_spliced, T, self.subsampling)
    # Convert to float tensors so the sample can be consumed by PyTorch
    Y_ss = torch.from_numpy(Y_ss).float()
    T_ss = torch.from_numpy(T_ss).float()
    return Y_ss, T_ss
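# A hypothetical usage sketch (`dataset` and the batching setup are assumed,
# not taken from the repo): since __getitem__ returns torch tensors and chunks
# may differ in length, the dataset can pair with a standard DataLoader via a
# padding collate function.
import torch
from torch.utils.data import DataLoader

def collate_fn(batch):
    # Pad variable-length (frame, feat) / (frame, speaker) pairs to batch max
    Ys, Ts = zip(*batch)
    Y_pad = torch.nn.utils.rnn.pad_sequence(Ys, batch_first=True)
    T_pad = torch.nn.utils.rnn.pad_sequence(Ts, batch_first=True)
    return Y_pad, T_pad

# loader = DataLoader(dataset, batch_size=8, shuffle=True, collate_fn=collate_fn)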
def get_example(self, i):
    rec, st, ed = self.chunk_indices[i]
    # Pass None instead of self.n_speakers so get_labeledSTFT also returns
    # the actual number of speakers in this chunk
    n_speakers, Y, T = feature.get_labeledSTFT(
        self.data, rec, st, ed,
        self.frame_size, self.frame_shift, None)
    Y = feature.transform(Y, self.input_transform)
    Y_spliced = feature.splice(Y, self.context_size)
    Y_ss, T_ss = feature.subsample(Y_spliced, T, self.subsampling)
    # Here self.n_speakers is the maximum number of speakers in the train set;
    # zero-pad T_ss along the speaker axis so chunks can be concatenated
    # during training via _convert
    T_ss = np.pad(T_ss, ((0, 0), (0, self.n_speakers - T_ss.shape[1])),
                  'constant', constant_values=0)
    return n_speakers, Y_ss, T_ss
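# Toy demonstration of the speaker-axis padding above: label matrices with
# different per-chunk speaker counts are zero-padded to a common maximum so
# they can be stacked into one batch (the values here are made up).
import numpy as np

max_n_speakers = 4
chunks = [np.ones((5, 2)), np.ones((5, 3))]    # (frame, n_speakers) per chunk
padded = [np.pad(T, ((0, 0), (0, max_n_speakers - T.shape[1])),
                 'constant', constant_values=0)
          for T in chunks]
batch = np.stack(padded)                       # (2, 5, 4): stacking now works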
def get_example(self, i):
    rec, st, ed = self.chunk_indices[i]
    Y, T = feature.get_labeledSTFT(
        self.data, rec, st, ed,
        self.frame_size, self.frame_shift, self.n_speakers)
    Y = feature.transform(Y, self.input_transform)
    Y_spliced = feature.splice(Y, self.context_size)
    Y_ss, T_ss = feature.subsample(Y_spliced, T, self.subsampling)
    # If the sample contains more than self.n_speakers speakers,
    # extract the top-(self.n_speakers) most active speakers
    if self.n_speakers and T_ss.shape[1] > self.n_speakers:
        selected_speakers = np.argsort(
            T_ss.sum(axis=0))[::-1][:self.n_speakers]
        T_ss = T_ss[:, selected_speakers]
    # If self.shuffle is True, shuffle the frame order along the time axis;
    # this operation improves the performance of EEND-EDA
    if self.shuffle:
        order = np.arange(Y_ss.shape[0])
        np.random.shuffle(order)
        Y_ss = Y_ss[order]
        T_ss = T_ss[order]
    return Y_ss, T_ss
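# Toy demonstration of the top-k selection above: speakers are ranked by
# total activity (column sums of the 0/1 label matrix) and only the k most
# active columns are kept (the values here are made up).
import numpy as np

T_ss = np.array([[1, 0, 1],
                 [1, 0, 0],
                 [1, 1, 0]])                       # (frame, speaker)
k = 2
selected = np.argsort(T_ss.sum(axis=0))[::-1][:k]  # sums [3, 1, 1] -> [0, 2]
T_ss = T_ss[:, selected]                           # keep the 2 most active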