def _get_wav_and_melspec(wav_file, length, is_training=True):
    """ The range of values of wav is [-1, 1]. """
    wav = read_wav(wav_file, sr=hp.signal.sr)
    wav = trim_wav(wav)

    # Divide the wav into chunks of the given length. A chunk is selected at random
    # in training, but the first chunk is used in generation.
    n_clips = math.ceil(len(wav) / length) if is_training else 1
    idx = random.randrange(n_clips)
    start, end = length * idx, length * (idx + 1)
    wav = wav[start:end]
    assert len(wav) <= length
    wav = fix_length(wav, length)  # padding in case of the last chunk.

    melspec = wav2melspec_db(wav, sr=hp.signal.sr, n_fft=hp.signal.n_fft, win_length=hp.signal.win_length,
                             hop_length=hp.signal.hop_length, n_mels=hp.signal.n_mels,
                             min_db=hp.signal.min_db, max_db=hp.signal.max_db)
    wav = np.expand_dims(wav, -1)
    return wav, melspec.astype(np.float32)
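
# Minimal usage sketch for _get_wav_and_melspec above. The `hp` object, the helper
# functions it calls, the file path, and the 2-second chunk length are
# project-specific placeholders, not part of the original code.
example_length = hp.signal.sr * 2  # assumed: 2-second chunks, in samples
wav, melspec = _get_wav_and_melspec('some_speaker.wav', example_length, is_training=True)
# wav has shape (example_length, 1); melspec has shape (n_frames, hp.signal.n_mels).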
def do_inference(num_tests, concurrency=1):
    channel = implementations.insecure_channel(host, int(port))
    stub = prediction_service_pb2.beta_create_PredictionService_stub(channel)
    coord = _Coordinator(num_tests, concurrency)

    for _ in range(num_tests):
        # dummy audio
        duration, sr, n_fft, win_length, hop_length, n_mels, max_db, min_db = 4, 16000, 512, 512, 128, 80, 35, -55
        filename = librosa.util.example_audio_file()
        wav = read_wav(filename, sr=sr, duration=duration)
        mel = wav2melspec_db(wav, sr, n_fft, win_length, hop_length, n_mels)
        mel = normalize_db(mel, max_db=max_db, min_db=min_db)
        mel = mel.astype(np.float32)
        mel = np.expand_dims(mel, axis=0)  # single batch
        n_timesteps = sr // hop_length * duration + 1  # integer division; a float shape breaks make_tensor_proto

        # build request
        request = predict_pb2.PredictRequest()
        request.model_spec.name = 'voice_vector'
        request.model_spec.signature_name = 'predict'
        request.inputs['x'].CopyFrom(
            tf.contrib.util.make_tensor_proto(mel, shape=[1, n_timesteps, n_mels]))

        coord.throttle()

        # send an asynchronous request (recommended)
        result_future = stub.Predict.future(request, 10.0)  # timeout in seconds
        result_future.add_done_callback(_create_rpc_callback(coord))

        # send a synchronous request (NOT recommended)
        # result = stub.Predict(request, 5.0)

    coord.wait_all_done()
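
# Hypothetical sketch of the `_create_rpc_callback` factory referenced above; the
# real implementation is not shown here. The output tensor key ('y') and the
# coordinator's bookkeeping method are assumptions made only for illustration.
def _create_rpc_callback(coord):
    def _callback(result_future):
        exception = result_future.exception()
        if exception:
            print(exception)
        else:
            # Convert the returned TensorProto back into a numpy array.
            embedding = tf.contrib.util.make_ndarray(result_future.result().outputs['y'])
            print(embedding.shape)
        coord.mark_done()  # hypothetical: count the finished request and release one slot
    return _callback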
def get_random_wav_and_label(self, tar_wavfiles, ntar_wavfiles):
    """
    :return: wav: raw wave. float32. shape=(t, ),
             melspec: mel-spectrogram. float32. shape=(t, n_mels),
             label: 1. if target, 0. otherwise. float32.
    """
    # Sample a target or non-target wav file according to the target ratio.
    wavfiles, label = (tar_wavfiles, self.tar_labels) if np.random.sample() <= self.tar_ratio \
        else (ntar_wavfiles, self.ntar_labels)
    wavfile = wavfiles[np.random.randint(0, len(wavfiles))]
    if isinstance(wavfile, bytes):
        wavfile = wavfile.decode()
    if wavfile.endswith('arr'):  # pyarrow format
        wav = read_wav_from_arr(wavfile)
    else:
        wav = read_wav(wavfile, sr=hp.signal.sr)
    wav = trim_wav(wav)
    wav = crop_random_wav(wav, self.length)
    wav = augment_volume(wav)
    wav = fix_length(wav, self.length)  # padding
    melspec = wav2melspec_db(wav, sr=hp.signal.sr, n_fft=hp.signal.n_fft, win_length=hp.signal.win_length,
                             hop_length=hp.signal.hop_length, n_mels=hp.signal.n_mels,
                             min_db=hp.signal.min_db, max_db=hp.signal.max_db)
    melspec = np.float32(melspec)
    label = np.float32(label)
    return wav, melspec, label
def get_data(self):
    while True:
        speaker_id = random.choice(list(self.speaker_dict.keys()))
        wav = self._load_random_wav(speaker_id)
        mel_spec = wav2melspec_db(wav, hp.signal.sr, hp.signal.n_fft, hp.signal.win_length,
                                  hp.signal.hop_length, hp.signal.n_mels)
        mel_spec = normalize_db(mel_spec, max_db=hp.signal.max_db, min_db=hp.signal.min_db)
        yield wav, mel_spec, speaker_id
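
# Optional sketch: feeding the generator above into tf.data (TF 1.x style). The
# dtypes and shapes are assumptions, e.g. speaker_id is taken to be a string key
# and the mel-spectrogram to have hp.signal.n_mels bins per frame.
def make_dataset(loader, batch_size):
    dataset = tf.data.Dataset.from_generator(
        loader.get_data,
        output_types=(tf.float32, tf.float32, tf.string),
        output_shapes=(tf.TensorShape([None]),
                       tf.TensorShape([None, hp.signal.n_mels]),
                       tf.TensorShape([])))
    # Pad variable-length waves and spectrograms within each batch.
    return dataset.padded_batch(batch_size, padded_shapes=([None], [None, hp.signal.n_mels], []))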
def get_random_wav(self, wavfile):
    """
    :param wavfile: a raw wave file.
    :return: wav: raw wave. float32. shape=(t, ),
             melspec: mel-spectrogram. float32. shape=(t, n_mels),
             wavfile: the raw wave file.
    """
    wav = read_wav(wavfile, sr=hp.signal.sr)
    wav = trim_wav(wav)
    wav = fix_length(wav, self.length)  # crop from the beginning.
    melspec = wav2melspec_db(wav, sr=hp.signal.sr, n_fft=hp.signal.n_fft, win_length=hp.signal.win_length,
                             hop_length=hp.signal.hop_length, n_mels=hp.signal.n_mels,
                             min_db=hp.signal.min_db, max_db=hp.signal.max_db)
    melspec = np.float32(melspec)
    return wav, melspec, wavfile
def _get_wav_and_melspec(wav_file, length=None, is_training=True):
    wav = read_wav(wav_file, sr=hp.signal.sr)
    wav = trim_wav(wav)

    if length:
        # Divide the wav into chunks of the given length. A chunk is selected at random
        # in training, but the first chunk is used otherwise.
        n_clips = math.ceil(len(wav) / length) if is_training else 1
        idx = random.randrange(n_clips)
        start, end = length * idx, length * (idx + 1)
        wav = wav[start:end]
        assert len(wav) <= length
        wav = fix_length(wav, length)  # padding

    melspec = wav2melspec_db(wav, sr=hp.signal.sr, n_fft=hp.signal.n_fft, win_length=hp.signal.win_length,
                             hop_length=hp.signal.hop_length, n_mels=hp.signal.n_mels,
                             min_db=hp.signal.min_db, max_db=hp.signal.max_db)
    wav = np.expand_dims(wav, -1)
    return wav, melspec.astype(np.float32)
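
# Usage sketch for the variant above: pass a chunk length for training, or
# length=None to keep the whole (trimmed) utterance at inference time. The file
# paths and the 2-second chunk length are placeholders.
train_wav, train_mel = _get_wav_and_melspec('train_speaker.wav', length=hp.signal.sr * 2, is_training=True)
full_wav, full_mel = _get_wav_and_melspec('test_speaker.wav', length=None, is_training=False)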