def generate_infer_batch():
    """Yield fixed-length log-mel windows for every wav in self.wav_list.

    Closure over the enclosing method's ``self``.  For each file it runs
    WebRTC VAD (aggressiveness 1, 30 ms frames, 300 ms window), computes
    40-filter log-mel features on the voiced audio, and slices them into
    overlapping windows of ``segment_length * 100`` frames.

    Side effects: appends each file's id to ``self.keys`` and its window
    count to ``self.fix_mel_lengths``.
    """
    num_frames = self.hparams.segment_length * 100  # mel frames per window (100 frames/s)
    num_overlap_frames = num_frames * self.hparams.overlap_ratio
    for wav_path in tqdm(self.wav_list):
        wav_id = self.get_save_path_from_filename(wav_path)
        audio, sample_rate = vad_ex.read_wave(wav_path)
        vad = webrtcvad.Vad(1)
        frames = list(vad_ex.frame_generator(30, audio, sample_rate))
        # NOTE: may yield no segments when too much of the buffer is unvoiced.
        segments = vad_ex.vad_collector(sample_rate, 30, 300, vad, frames)
        # join() instead of repeated bytes += (quadratic in the original).
        total_wav = b"".join(segments)
        wav_arr = np.frombuffer(total_wav, dtype=np.int16)
        if len(wav_arr) == 0:
            continue  # VAD removed everything; skip this file
        # Pad up to segment_length seconds plus a 15 ms framing margin.
        # BUGFIX: np.pad requires an integral pad width; the original passed
        # the raw float sample_rate*(segment_length+0.015)-len(wav_arr),
        # which raises TypeError whenever padding is actually needed.
        target_len = int(np.ceil(sample_rate * (self.hparams.segment_length + 0.015)))
        wav_arr = np.pad(wav_arr, (0, max(0, target_len - len(wav_arr))),
                         'constant', constant_values=(0, 0))
        logmel_feats = logfbank(wav_arr, samplerate=sample_rate, nfilt=40)
        total_len = logmel_feats.shape[0]
        num_fix_mels = int((total_len - num_overlap_frames) // (num_frames - num_overlap_frames))
        fix_mels = []
        for dvec_idx in range(num_fix_mels):
            start_idx = int((num_frames - num_overlap_frames) * dvec_idx)
            end_idx = int(start_idx + num_frames)
            fix_mels.append(logmel_feats[start_idx:end_idx, :])
        fix_mels = np.asarray(fix_mels)
        self.keys.append(wav_id)
        self.fix_mel_lengths.append(len(fix_mels))
        for fix_mel in fix_mels:
            yield fix_mel
def create_infer_batch(self):
    """Build d-vector input batches for a pair of wav files.

    Extracts VAD-filtered log-mel features for ``self.hparams.in_wav1`` and
    ``in_wav2``, slices each into overlapping fixed-length windows, and
    returns ``(wav1_data, wav2_data, match)`` where ``match`` is True when
    the two file names share the same speaker-id prefix (text before the
    first underscore).
    """
    # self.hparams.in_wav1, self.hparams.in_wav2 are full paths of the wav file
    # for ex) /home/hdd2tb/ninas96211/dev_wav_set/id10343_pCDWKHjQjso_00002.wav
    wavs_list = [self.hparams.in_wav1, self.hparams.in_wav2]
    # file_name for ex) id10343_pCDWKHjQjso_00002
    for wav_path in wavs_list:
        wav_id = os.path.splitext(os.path.basename(wav_path))[0]
        audio, sample_rate = vad_ex.read_wave(wav_path)
        vad = webrtcvad.Vad(1)
        frames = vad_ex.frame_generator(30, audio, sample_rate)
        frames = list(frames)
        segments = vad_ex.vad_collector(sample_rate, 30, 300, vad, frames)
        total_wav = b""
        for i, segment in enumerate(segments):
            total_wav += segment
            print(wav_id + " : " + str(i) + "th segment appended")
        # Without writing, unpack total_wav into numpy [N,1] array
        # dtype=np.int16 assumes 16-bit PCM input
        wav_arr = np.frombuffer(total_wav, dtype=np.int16)
        print("read audio data from byte string. np array of shape:" + str(wav_arr.shape))
        logmel_feats = logfbank(wav_arr, samplerate=sample_rate, nfilt=40)
        # NOTE(review): keyed by basename -- if both input wavs share a
        # basename, the second overwrites the first and the [1] index
        # below raises IndexError.  Worth guarding.
        self.save_dict[wav_id] = logmel_feats
    # Window geometry: segment_length seconds at 100 mel frames per second.
    num_frames = self.hparams.segment_length * 100
    num_overlap_frames = num_frames * self.hparams.overlap_ratio
    dvector_dict = {}
    match = False
    prev_wav_name = ""
    # Relies on dict insertion order (Python 3.7+): the second entry is
    # compared against the first entry's speaker-id prefix.
    for wav_name, feats in self.save_dict.items():
        if wav_name.split("_")[0] == prev_wav_name:
            print("spk_id" + wav_name.split("_")[0])
            match = True
        total_len = feats.shape[0]
        num_dvectors = int((total_len - num_overlap_frames) // (num_frames - num_overlap_frames))
        print("num dvec:" + str(num_dvectors))
        dvectors = []
        # Slide a num_frames-long window with num_overlap_frames of overlap.
        for dvec_idx in range(num_dvectors):
            start_idx = int((num_frames - num_overlap_frames) * dvec_idx)
            end_idx = int(start_idx + num_frames)
            print("wavname: " + wav_name + " start_idx: " + str(start_idx))
            print("wavname: " + wav_name + " end_idx: " + str(end_idx))
            dvectors.append(feats[start_idx:end_idx, :])
        dvectors = np.asarray(dvectors)
        dvector_dict[wav_name] = dvectors
        prev_wav_name = wav_name.split("_")[0]
    wav1_data = list(dvector_dict.values())[0]
    wav2_data = list(dvector_dict.values())[1]
    print("match: " + str(match))
    print("wav1_data.shape:" + str(wav1_data.shape))
    print("wav2_data.shape:" + str(wav2_data.shape))
    return wav1_data, wav2_data, match
def vad_process(self, path):
    """Read an audio file for the configured dataset, apply VAD, and
    return the voiced samples.

    Args:
        path: path to the audio file (format depends on self.data_type).

    Returns:
        (wav_arr, sample_rate): int16 numpy array of voiced samples and
        the file's sample rate.

    Raises:
        ValueError: if self.data_type is not a supported dataset name.
    """
    # Pick the reader matching the dataset's on-disk format.
    if self.data_type == "vox1":
        audio, sample_rate = vad_ex.read_wave(path)
    elif self.data_type == "vox2":
        audio, sample_rate = vad_ex.read_m4a(path)
    elif "libri" in self.data_type:
        audio, sample_rate = vad_ex.read_libri(path)
    elif self.data_type in ("emotional_actors", "darpa_timit"):
        audio, sample_rate = vad_ex.read_wave(path)
    else:
        # BUGFIX: previously fell through with `audio` unbound and later
        # died with a confusing NameError; fail fast and explicitly.
        raise ValueError("unsupported data_type: " + str(self.data_type))
    vad = webrtcvad.Vad(1)
    frames = list(vad_ex.frame_generator(30, audio, sample_rate))
    segments = vad_ex.vad_collector(sample_rate, 30, 300, vad, frames)
    # Concatenate voiced segments; dtype=np.int16 assumes 16-bit PCM.
    total_wav = b"".join(segments)
    wav_arr = np.frombuffer(total_wav, dtype=np.int16)
    print("read audio data from byte string. np array of shape:" + str(wav_arr.shape))
    return wav_arr, sample_rate
def vad_process(self, path):
    """VAD-filter a wav file and return its 40-filter log-mel features.

    Keeps only voiced audio (WebRTC VAD, aggressiveness 1, 30 ms frames,
    300 ms padding window) and computes log filterbank features on the
    concatenated voiced samples.

    Args:
        path: path to a wav file readable by vad_ex.read_wave.

    Returns:
        np.ndarray of shape [num_frames, 40] log-mel filterbank features.
    """
    # Removed: unused local `wav_id` (only referenced by commented-out
    # debug prints in the original) and the dead debug code itself.
    audio, sample_rate = vad_ex.read_wave(path)
    vad = webrtcvad.Vad(1)
    frames = list(vad_ex.frame_generator(30, audio, sample_rate))
    segments = vad_ex.vad_collector(sample_rate, 30, 300, vad, frames)
    # join() instead of repeated bytes += (quadratic in the original);
    # dtype=np.int16 assumes 16-bit PCM input.
    total_wav = b"".join(segments)
    wav_arr = np.frombuffer(total_wav, dtype=np.int16)
    return logfbank(wav_arr, samplerate=sample_rate, nfilt=40)
def infer_batch_generator(self):
    """Generator yielding batches of (fixed-length log-mel, out_path) pairs.

    Streams every wav in self.hparams.in_wav1 through VAD + log-mel
    extraction, queues fixed-length mel windows on self.dq, and yields
    them in batches of 640.  The final partial batch is padded with
    copies of the last window (with an empty out_path) up to 640 entries.

    Side effects: mutates self.dq / self.dq_size; counts processed files
    in self.cnt and files with no usable mel windows in self.bad_cnt.
    """
    batch_size = 640
    # NOTE(review): in_wav1 is iterated below, so it is presumably a list
    # of paths here rather than a single path -- confirm against callers.
    wavs_list = self.hparams.in_wav1
    for wav_path in wavs_list:
        # Get voiced wav_arr
        if self.hparams.dataset == 'libri':
            out_path = self.libri_spkid_outpath(wav_path)
        else:
            out_path = ''
        audio, sample_rate = vad_ex.read_wave(wav_path)
        vad = webrtcvad.Vad(1)
        frames = vad_ex.frame_generator(30, audio, sample_rate)
        frames = list(frames)
        segments = vad_ex.vad_collector(sample_rate, 30, 300, vad, frames)
        total_wav = b""
        for i, segment in enumerate(segments):
            total_wav += segment
        # Interpret concatenated voiced bytes as 16-bit PCM samples.
        wav_arr = np.frombuffer(total_wav, dtype=np.int16)
        if len(wav_arr) == 0:
            # VAD removed all audio; skip this file entirely.
            continue
        # Pad when less than 1.6s (25840 samples -- assumes 16 kHz, TODO confirm)
        wav_arr = np.pad(wav_arr, (0, max(0, 25840 - len(wav_arr))), 'constant', constant_values=(0, 0))
        # Get logmel
        logmel_feats = logfbank(wav_arr, samplerate=sample_rate, nfilt=40)
        # Get fixed length log mels: slide a num_frames-long window with
        # num_overlap_frames of overlap (100 mel frames per second).
        num_frames = self.hparams.segment_length * 100
        num_overlap_frames = num_frames * self.hparams.overlap_ratio
        total_len = logmel_feats.shape[0]
        num_fix_len_mels = int((total_len - num_overlap_frames) // (num_frames - num_overlap_frames))
        fix_len_mels = []
        for mel_idx in range(num_fix_len_mels):
            start_idx = int((num_frames - num_overlap_frames) * mel_idx)
            end_idx = int(start_idx + num_frames)
            fix_len_mels.append(logmel_feats[start_idx:end_idx, :])
        fix_len_mels = np.asarray(fix_len_mels)
        # Queue each window and pop a batch as soon as 640 are available.
        for fix_len_mel in fix_len_mels:
            last_item = (fix_len_mel, out_path)
            self.dq.append(last_item)
            self.dq_size += 1
        if len(fix_len_mels) == 0:
            self.bad_cnt += 1
        self.cnt += 1
        if self.dq_size >= batch_size:
            res = []
            for i in range(batch_size):
                res.append(self.dq.popleft())
                self.dq_size -= 1
            yield res
    # When remains a lot of 640 clusters: drain the remaining full batches.
    while self.dq_size >= batch_size:
        res = []
        for i in range(batch_size):
            res.append(self.dq.popleft())
            self.dq_size -= 1
        yield res
    # The last remaining cluster; if mod 640 == 0, append dummy 640 logmels.
    res = []
    while self.dq_size > 0:
        res.append(self.dq.popleft())
        self.dq_size -= 1
    remaining = (batch_size - len(res))
    # Dummy filler reuses the last mel window but clears its out_path.
    # NOTE(review): if every file was skipped (or wavs_list is empty),
    # `last_item` is unbound here and this line raises NameError -- guard?
    last_item = (last_item[0], '')
    if remaining != 0:
        for i in range(remaining):
            res.append(last_item)
        yield res
    else:
        # NOTE(review): appears unreachable -- after the drain loop above
        # len(res) < batch_size, so remaining != 0 always holds.
        yield res
        res = []
        for i in range(batch_size):
            res.append(last_item)
        yield res