# --- Example 1 (score: 0) ---
        def generate_infer_batch():
            """Yield fixed-length log-mel windows for every wav in self.wav_list.

            Each wav is VAD-filtered, padded so at least one window fits,
            converted to 40-dim log-mel features, and sliced into overlapping
            fixed-length windows which are yielded one at a time.

            Side effects: for each wav that survives VAD, appends its id to
            self.keys and its window count to self.fix_mel_lengths.
            """
            num_frames = self.hparams.segment_length * 100
            num_overlap_frames = num_frames * self.hparams.overlap_ratio

            for wav_path in tqdm(self.wav_list):
                wav_id = self.get_save_path_from_filename(wav_path)
                audio, sample_rate = vad_ex.read_wave(wav_path)
                vad = webrtcvad.Vad(1)
                frames = list(vad_ex.frame_generator(30, audio, sample_rate))
                # NOTE: may produce no voiced output when the buffer is
                # dominated by unvoiced frames.
                segments = vad_ex.vad_collector(sample_rate, 30, 300, vad, frames)
                total_wav = b"".join(segments)
                wav_arr = np.frombuffer(total_wav, dtype=np.int16)
                if len(wav_arr) == 0:
                    continue  # VAD removed everything; skip this file
                # Pad so at least one window (plus one 15 ms frame) fits.
                # BUG FIX: np.pad requires integer pad widths; the original
                # passed the float sample_rate * (segment_length + 0.015).
                min_samples = int(sample_rate * (self.hparams.segment_length + 0.015))
                wav_arr = np.pad(wav_arr, (0, max(0, min_samples - len(wav_arr))),
                                 'constant', constant_values=(0, 0))
                logmel_feats = logfbank(wav_arr, samplerate=sample_rate, nfilt=40)

                # Slice into fixed-length windows with the configured overlap.
                total_len = logmel_feats.shape[0]
                num_fix_mels = int((total_len - num_overlap_frames) //
                                   (num_frames - num_overlap_frames))
                fix_mels = []
                for dvec_idx in range(num_fix_mels):
                    start_idx = int((num_frames - num_overlap_frames) * dvec_idx)
                    end_idx = int(start_idx + num_frames)
                    fix_mels.append(logmel_feats[start_idx:end_idx, :])
                fix_mels = np.asarray(fix_mels)
                self.keys.append(wav_id)
                self.fix_mel_lengths.append(len(fix_mels))
                for fix_mel in fix_mels:
                    yield fix_mel
# --- Example 2 (score: 0) ---
    def create_infer_batch(self):
        """Build d-vector input batches for a two-wav verification pair.

        Reads self.hparams.in_wav1 / in_wav2, runs WebRTC VAD on each file,
        extracts 40-dim log-mel features, slices them into overlapping
        fixed-length windows, and returns the two window stacks plus a flag
        saying whether the two files share the same speaker-id prefix.

        Returns:
            (wav1_data, wav2_data, match): two numpy arrays of shape
            [num_windows, num_frames, 40] and a bool.
        """
        # self.hparams.in_wav1, self.hparams.in_wav2 are full paths of the wav file
        # for ex) /home/hdd2tb/ninas96211/dev_wav_set/id10343_pCDWKHjQjso_00002.wav

        wavs_list = [self.hparams.in_wav1, self.hparams.in_wav2]

        # file_name for ex) id10343_pCDWKHjQjso_00002
        for wav_path in wavs_list:
            wav_id = os.path.splitext(os.path.basename(wav_path))[0]
            audio, sample_rate = vad_ex.read_wave(wav_path)
            vad = webrtcvad.Vad(1)  # aggressiveness 1 (0 = least .. 3 = most)
            frames = vad_ex.frame_generator(30, audio, sample_rate)
            frames = list(frames)
            segments = vad_ex.vad_collector(sample_rate, 30, 300, vad, frames)
            total_wav = b""
            for i, segment in enumerate(segments):
                total_wav += segment
                print(wav_id + " : " + str(i) + "th segment appended")
            # Without writing, unpack total_wav into numpy [N,1] array
            # dtype=np.int16 because the input is 16-bit PCM
            wav_arr = np.frombuffer(total_wav, dtype=np.int16)
            print("read audio data from byte string. np array of shape:" +
                  str(wav_arr.shape))
            logmel_feats = logfbank(wav_arr, samplerate=sample_rate, nfilt=40)
            # file_name for ex, 'id10343_pCDWKHjQjso_00002'
            self.save_dict[wav_id] = logmel_feats

        # Window geometry: segment_length is presumably in seconds with
        # 100 feature frames per second (10 ms hop) — TODO confirm.
        num_frames = self.hparams.segment_length * 100
        num_overlap_frames = num_frames * self.hparams.overlap_ratio
        dvector_dict = {}

        match = False
        prev_wav_name = ""

        # Two iterations (one per wav, dict preserves insertion order).
        # `match` becomes True when the second file's speaker id (text before
        # the first '_') equals the first file's.
        for wav_name, feats in self.save_dict.items():
            if wav_name.split("_")[0] == prev_wav_name:
                print("spk_id" + wav_name.split("_")[0])
                match = True
            total_len = feats.shape[0]
            # Number of full windows that fit with the given overlap.
            num_dvectors = int((total_len - num_overlap_frames) //
                               (num_frames - num_overlap_frames))
            print("num dvec:" + str(num_dvectors))
            dvectors = []
            for dvec_idx in range(num_dvectors):
                start_idx = int((num_frames - num_overlap_frames) * dvec_idx)
                end_idx = int(start_idx + num_frames)
                print("wavname: " + wav_name + " start_idx: " + str(start_idx))
                print("wavname: " + wav_name + " end_idx: " + str(end_idx))
                dvectors.append(feats[start_idx:end_idx, :])
            dvectors = np.asarray(dvectors)
            dvector_dict[wav_name] = dvectors
            # Remember this file's speaker id for the next iteration's check.
            prev_wav_name = wav_name.split("_")[0]

        # Index 0/1 correspond to in_wav1/in_wav2 via dict insertion order.
        wav1_data = list(dvector_dict.values())[0]
        wav2_data = list(dvector_dict.values())[1]

        print("match: " + str(match))
        print("wav1_data.shape:" + str(wav1_data.shape))
        print("wav2_data.shape:" + str(wav2_data.shape))
        return wav1_data, wav2_data, match
# --- Example 3 (score: 0) ---
 def vad_process(self, path):
     """Read an audio file per self.data_type, keep only voiced audio, and
     return (wav_arr, sample_rate) where wav_arr is an int16 numpy array.

     Raises:
         ValueError: if self.data_type names no supported dataset.
     """
     # VAD Process: pick the reader matching the dataset's container format.
     if self.data_type == "vox1":
         audio, sample_rate = vad_ex.read_wave(path)
     elif self.data_type == "vox2":
         audio, sample_rate = vad_ex.read_m4a(path)
     elif "libri" in self.data_type:
         audio, sample_rate = vad_ex.read_libri(path)
     elif self.data_type == "emotional_actors":
         audio, sample_rate = vad_ex.read_wave(path)
     elif self.data_type == "darpa_timit":
         audio, sample_rate = vad_ex.read_wave(path)
     else:
         # BUG FIX: an unknown data_type previously fell through every branch
         # and crashed later with NameError on `audio`; fail fast instead.
         raise ValueError("unsupported data_type: " + str(self.data_type))
     vad = webrtcvad.Vad(1)
     frames = vad_ex.frame_generator(30, audio, sample_rate)
     frames = list(frames)
     segments = vad_ex.vad_collector(sample_rate, 30, 300, vad, frames)
     total_wav = b""
     for i, segment in enumerate(segments):
         total_wav += segment
     # Without writing, unpack total_wav into numpy [N,1] array
     # dtype=np.int16 because the readers return 16-bit PCM
     wav_arr = np.frombuffer(total_wav, dtype=np.int16)
     print("read audio data from byte string. np array of shape:" +
           str(wav_arr.shape))
     return wav_arr, sample_rate
# --- Example 4 (score: 0) ---
 def vad_process(self, path):
     """Run WebRTC VAD over a wav file and return its 40-dim log-mel features.

     The file is read, unvoiced stretches are dropped, and the remaining
     16-bit PCM samples are converted to a log filterbank feature matrix.
     """
     wav_id = os.path.splitext(os.path.basename(path))[0]
     audio, sample_rate = vad_ex.read_wave(path)
     detector = webrtcvad.Vad(1)
     voiced_frames = list(vad_ex.frame_generator(30, audio, sample_rate))
     voiced_segments = vad_ex.vad_collector(
         sample_rate, 30, 300, detector, voiced_frames)
     # Concatenate the voiced segments without writing to disk;
     # dtype=np.int16 because the input is 16-bit PCM.
     pcm_bytes = b"".join(voiced_segments)
     samples = np.frombuffer(pcm_bytes, dtype=np.int16)
     return logfbank(samples, samplerate=sample_rate, nfilt=40)
# --- Example 5 (score: 0) ---
    def infer_batch_generator(self):
        """Yield inference batches of (fixed-length log-mel, out_path) pairs.

        Each wav in self.hparams.in_wav1 is VAD-filtered, padded to at least
        1.6 s, converted to 40-dim log-mel features, and sliced into
        overlapping fixed-length windows.  Windows are queued on self.dq and
        emitted in batches of exactly 640; the final partial batch is padded
        with dummy copies of the last window (out_path '') so every yielded
        batch has the same size.

        Side effects: mutates self.dq / self.dq_size / self.cnt / self.bad_cnt.
        """
        batch_size = 640

        def _pop_full_batch():
            # Pop exactly batch_size queued items off the left of the deque.
            batch = []
            for _ in range(batch_size):
                batch.append(self.dq.popleft())
                self.dq_size -= 1
            return batch

        # BUG FIX: `last_item` was referenced after the loop without being
        # initialized, raising NameError when no wav produced any window.
        last_item = None
        wavs_list = self.hparams.in_wav1
        for wav_path in wavs_list:
            # Destination path for this wav's output (dataset-specific).
            if self.hparams.dataset == 'libri':
                out_path = self.libri_spkid_outpath(wav_path)
            else:
                out_path = ''
            # Keep only voiced audio.
            audio, sample_rate = vad_ex.read_wave(wav_path)
            vad = webrtcvad.Vad(1)
            frames = list(vad_ex.frame_generator(30, audio, sample_rate))
            segments = vad_ex.vad_collector(sample_rate, 30, 300, vad, frames)
            total_wav = b"".join(segments)
            wav_arr = np.frombuffer(total_wav, dtype=np.int16)
            if len(wav_arr) == 0:
                continue  # VAD removed everything; skip this file
            # Pad when less than 1.6s (25840 samples at the expected rate).
            wav_arr = np.pad(wav_arr, (0, max(0, 25840 - len(wav_arr))),
                             'constant',
                             constant_values=(0, 0))
            # Get logmel
            logmel_feats = logfbank(wav_arr, samplerate=sample_rate, nfilt=40)

            # Slice into fixed-length windows with the configured overlap.
            num_frames = self.hparams.segment_length * 100
            num_overlap_frames = num_frames * self.hparams.overlap_ratio
            total_len = logmel_feats.shape[0]
            num_fix_len_mels = int((total_len - num_overlap_frames) //
                                   (num_frames - num_overlap_frames))
            fix_len_mels = []
            for mel_idx in range(num_fix_len_mels):
                start_idx = int((num_frames - num_overlap_frames) * mel_idx)
                end_idx = int(start_idx + num_frames)
                fix_len_mels.append(logmel_feats[start_idx:end_idx, :])
            fix_len_mels = np.asarray(fix_len_mels)

            # Queue windows; emit a batch whenever 640 are available.
            for fix_len_mel in fix_len_mels:
                last_item = (fix_len_mel, out_path)
                self.dq.append(last_item)
                self.dq_size += 1
            if len(fix_len_mels) == 0:
                self.bad_cnt += 1
            self.cnt += 1
            if self.dq_size >= batch_size:
                yield _pop_full_batch()

        # Drain any remaining full batches.
        while self.dq_size >= batch_size:
            yield _pop_full_batch()
        # Flush the final partial batch, padding with dummy copies of the
        # last window so the batch size stays constant.
        res = []
        while self.dq_size > 0:
            res.append(self.dq.popleft())
            self.dq_size -= 1
        if last_item is None:
            # No windows were produced at all; nothing to pad with.
            return
        last_item = (last_item[0], '')
        remaining = batch_size - len(res)
        if remaining != 0:
            res.extend([last_item] * remaining)
            yield res
        else:
            # res is exactly full: yield it, then a full dummy batch so the
            # consumer always sees a final padded batch.
            yield res
            yield [last_item] * batch_size