import os

import numpy as np
import h5py
import chainer
from chainer import Variable, serializers
from scipy.ndimage import shift
# feature, kaldi_data, system_info, TransformerDiarization and use_single_gpu
# are assumed to be provided by the surrounding EEND codebase.


def infer(args):
    system_info.print_system_info()

    # Prepare model
    in_size = feature.get_input_dim(
        args.frame_size,
        args.context_size,
        args.input_transform)

    if args.model_type == 'Transformer':
        model = TransformerDiarization(
            in_size,
            n_units=args.hidden_size,
            n_heads=args.transformer_encoder_n_heads,
            n_layers=args.transformer_encoder_n_layers,
            dropout=0,
            alpha=0)
    else:
        raise ValueError('Unknown model type.')

    serializers.load_npz(args.model_file, model)

    if args.gpu >= 0:
        gpuid = use_single_gpu()
        model.to_gpu()

    kaldi_obj = kaldi_data.KaldiData(args.data_dir)
    for recid in kaldi_obj.wavs:
        data, rate = kaldi_obj.load_wav(recid)
        Y = feature.stft(data, args.frame_size, args.frame_shift)
        Y = feature.transform(Y, transform_type=args.input_transform)
        Y = feature.splice(Y, context_size=args.context_size)
        Y = Y[::args.subsampling]

        out_chunks = []
        with chainer.no_backprop_mode(), chainer.using_config('train', False):
            hs = None
            for start, end in _gen_chunk_indices(len(Y), args.chunk_size):
                Y_chunked = Variable(Y[start:end])
                if args.gpu >= 0:
                    Y_chunked.to_gpu(gpuid)
                hs, ys = model.estimate_sequential(hs, [Y_chunked])
                if args.gpu >= 0:
                    ys[0].to_cpu()
                out_chunks.append(ys[0].data)
                if args.save_attention_weight == 1:
                    att_fname = f"{recid}_{start}_{end}.att.npy"
                    att_path = os.path.join(args.out_dir, att_fname)
                    model.save_attention_weight(att_path)

        outfname = recid + '.h5'
        outpath = os.path.join(args.out_dir, outfname)
        if hasattr(model, 'label_delay'):
            outdata = shift(np.vstack(out_chunks), (-model.label_delay, 0))
        else:
            outdata = np.vstack(out_chunks)
        with h5py.File(outpath, 'w') as wf:
            wf.create_dataset('T_hat', data=outdata)
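
# Hedged sketch (not part of the original source): infer() above relies on a
# helper _gen_chunk_indices(data_len, chunk_size) that is not shown here.
# A minimal implementation consistent with how it is used, assuming
# non-overlapping chunks that cover the whole sequence, would be:
def _gen_chunk_indices(data_len, chunk_size):
    start = 0
    while start < data_len:
        end = min(data_len, start + chunk_size)
        yield start, end
        start = end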
def get_example(self, i):
    rec, st, ed = self.chunk_indices[i]
    Y, T = feature.get_labeledSTFT(
        self.data, rec, st, ed,
        self.frame_size, self.frame_shift,
        self.n_speakers)
    Y = feature.transform(Y, self.input_transform)
    Y_spliced = feature.splice(Y, self.context_size)
    Y_ss, T_ss = feature.subsample(Y_spliced, T, self.subsampling)
    return Y_ss, T_ss
def __getitem__(self, i):
    rec, st, ed = self.chunk_indices[i]
    Y, T = feature.get_labeledSTFT(
        self.data, rec, st, ed,
        self.frame_size, self.frame_shift,
        self.n_speakers)
    # Y: (frame, num_ceps)
    Y = feature.transform(Y, self.input_transform)
    # Y_spliced: (frame, num_ceps * (context_size * 2 + 1))
    Y_spliced = feature.splice(Y, self.context_size)
    # Y_ss: (frame / subsampling, num_ceps * (context_size * 2 + 1))
    Y_ss, T_ss = feature.subsample(Y_spliced, T, self.subsampling)
    Y_ss = torch.from_numpy(Y_ss).float()
    T_ss = torch.from_numpy(T_ss).float()
    return Y_ss, T_ss
def get_example(self, i):
    rec, st, ed = self.chunk_indices[i]
    # Pass None instead of self.n_speakers so that get_labeledSTFT also
    # returns the actual number of speakers in this chunk.
    n_speakers, Y, T = feature.get_labeledSTFT(
        self.data, rec, st, ed,
        self.frame_size, self.frame_shift,
        None)
    Y = feature.transform(Y, self.input_transform)
    Y_spliced = feature.splice(Y, self.context_size)
    Y_ss, T_ss = feature.subsample(Y_spliced, T, self.subsampling)
    # Here self.n_speakers is the maximum number of speakers in the train set.
    # Pad T_ss up to that width so that chunks with different speaker counts
    # can be concatenated by _convert during training.
    T_ss = np.pad(
        T_ss,
        ((0, 0), (0, self.n_speakers - T_ss.shape[1])),
        'constant', constant_values=0)
    return n_speakers, Y_ss, T_ss
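
# Hedged illustration (not part of the original source): why the padding in
# get_example() above is needed. The _convert collate step referenced there is
# not shown; it is assumed to stack label chunks into one batch array, which is
# approximated here with np.stack. Label matrices with different speaker
# counts cannot be stacked until they are padded to a common width.
import numpy as np

t_a = np.zeros((500, 2))               # chunk with 2 speakers
t_b = np.zeros((500, 3))               # chunk with 3 speakers
try:
    np.stack([t_a, t_b])               # fails: (500, 2) vs. (500, 3)
except ValueError as e:
    print("stack failed:", e)

max_speakers = 4                       # assumed maximum over the train set
t_a_pad = np.pad(t_a, ((0, 0), (0, max_speakers - t_a.shape[1])), 'constant')
t_b_pad = np.pad(t_b, ((0, 0), (0, max_speakers - t_b.shape[1])), 'constant')
print(np.stack([t_a_pad, t_b_pad]).shape)   # (2, 500, 4)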
def get_example(self, i):
    rec, st, ed = self.chunk_indices[i]
    Y, T = feature.get_labeledSTFT(
        self.data, rec, st, ed,
        self.frame_size, self.frame_shift,
        self.n_speakers)
    Y = feature.transform(Y, self.input_transform)
    Y_spliced = feature.splice(Y, self.context_size)
    Y_ss, T_ss = feature.subsample(Y_spliced, T, self.subsampling)
    # If the chunk contains more than self.n_speakers speakers,
    # keep only the self.n_speakers most active ones.
    if self.n_speakers and T_ss.shape[1] > self.n_speakers:
        selected_speakers = np.argsort(
            T_ss.sum(axis=0))[::-1][:self.n_speakers]
        T_ss = T_ss[:, selected_speakers]
    # If self.shuffle is True, shuffle the frame order along the time axis.
    # This operation improves the performance of EEND-EDA.
    if self.shuffle:
        order = np.arange(Y_ss.shape[0])
        np.random.shuffle(order)
        Y_ss = Y_ss[order]
        T_ss = T_ss[order]
    return Y_ss, T_ss
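
# Hedged illustration (not part of the original source): how the top-speaker
# selection in get_example() above behaves on a toy label matrix. Rows are
# frames, columns are speakers; the most active columns are kept.
import numpy as np

T_toy = np.array([[1, 0, 1, 0],
                  [1, 0, 1, 0],
                  [0, 0, 1, 1],
                  [0, 0, 1, 0]])       # per-speaker activity: [2, 0, 4, 1]
n_speakers = 2
selected = np.argsort(T_toy.sum(axis=0))[::-1][:n_speakers]
print(selected)                        # [2 0] -> the two most active speakers
print(T_toy[:, selected].shape)        # (4, 2)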