def process_files(files, duration=2):
    sr = 22050
    n_fft = 512
    win_length = 400
    hop_length = 80
    counter = 0
    data_x = []
    data_y = []
    for file in files:
        counter += 1
        if counter % 1000 == 0:
            print("Counter: " + str(counter))
        try:
            wav_x = read_wav(file, sr, duration)
            spec_x, _ = wav2spec(wav_x, n_fft, win_length, hop_length, False)
            data_x.append(np.swapaxes(spec_x, 0, 1))
            wav_y = read_wav(file.replace('wav_1', 'wav_6'), sr, duration)
            spec_y, _ = wav2spec(wav_y, n_fft, win_length, hop_length, False)
            data_y.append(np.swapaxes(spec_y, 0, 1))
        except Exception:
            print("error while processing: " + file)
    np.save("H:/cs230/spec_{}sec/data_x".format(duration), data_x)
    np.save("H:/cs230/spec_{}sec/data_y".format(duration), data_y)
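A minimal driver sketch for the function above (the glob pattern and the `audio`-module helpers are assumptions inferred from the other snippets here, not confirmed by the source):

import glob

# Hypothetical invocation: build the paired dataset from the wav_1 inputs;
# process_files derives the wav_6 targets via the path replacement above.
files = sorted(glob.glob("H:/cs230/wav_1/*.wav"))
process_files(files, duration=2)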
def get_mfccs_and_spectrogram(wav_file, trim=False, random_crop=False, isConverting=False):
    '''This is applied in `train2`, `test2` or `convert` phase.
    '''
    # Load
    wav = read_wav(wav_file, sr=hp.default.sr)

    # Trim
    if trim:
        wav, _ = librosa.effects.trim(wav, frame_length=hp.default.win_length,
                                      hop_length=hp.default.hop_length)

    if random_crop:
        wav = wav_random_crop(wav, hp.default.sr, hp.default.duration)

    # Pad or crop, unless we are converting
    if not isConverting:
        length = int(hp.default.sr * hp.default.duration)
        wav = librosa.util.fix_length(wav, length)

    return _get_mfcc_and_spec(wav, hp.default.preemphasis, hp.default.n_fft,
                              hp.default.win_length, hp.default.hop_length)
def get_random_wav_and_label(self, tar_wavfiles, ntar_wavfiles):
    """
    :return: wav: raw wave. float32. shape=(t, ),
             melspec: mel-spectrogram. float32. shape=(t, n_mels),
             label: 1 if target, 0 otherwise. float32.
    """
    wavfiles, label = (tar_wavfiles, self.tar_labels) if np.random.sample() <= self.tar_ratio \
        else (ntar_wavfiles, self.ntar_labels)
    wavfile = wavfiles[np.random.randint(0, len(wavfiles))]
    if isinstance(wavfile, bytes):
        wavfile = wavfile.decode()
    if wavfile.endswith('arr'):  # pyarrow format
        wav = read_wav_from_arr(wavfile)
    else:
        wav = read_wav(wavfile, sr=hp.signal.sr)
    wav = trim_wav(wav)
    wav = crop_random_wav(wav, self.length)
    wav = augment_volume(wav)
    wav = fix_length(wav, self.length)  # padding
    melspec = wav2melspec_db(wav, sr=hp.signal.sr, n_fft=hp.signal.n_fft,
                             win_length=hp.signal.win_length, hop_length=hp.signal.hop_length,
                             n_mels=hp.signal.n_mels, min_db=hp.signal.min_db,
                             max_db=hp.signal.max_db)
    melspec = np.float32(melspec)
    label = np.float32(label)
    return wav, melspec, label
def _get_wav_and_melspec(wav_file, length, is_training=True):
    '''The range of values of wav is [-1, 1].
    '''
    wav = read_wav(wav_file, sr=hp.signal.sr)
    wav = trim_wav(wav)

    # Divide the wav into chunks of the given length; a random chunk is selected
    # in training, but the first chunk in generation.
    n_clips = math.ceil(len(wav) / length) if is_training else 1
    idx = random.randrange(n_clips)
    start, end = length * idx, length * (idx + 1)
    wav = wav[start:end]
    assert len(wav) <= length
    wav = fix_length(wav, length)  # padding in case of the last chunk.

    melspec = wav2melspec_db(wav, sr=hp.signal.sr, n_fft=hp.signal.n_fft,
                             win_length=hp.signal.win_length, hop_length=hp.signal.hop_length,
                             n_mels=hp.signal.n_mels, min_db=hp.signal.min_db,
                             max_db=hp.signal.max_db)
    wav = np.expand_dims(wav, -1)
    return wav, melspec.astype(np.float32)
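To make the chunk selection concrete, here is a small worked example (the sample counts are illustrative, not from the source):

import math

# A 5 s clip at 16 kHz is 80000 samples; with length = 32000 (2 s chunks),
# training picks one of math.ceil(80000 / 32000) = 3 candidate chunks.
# The last chunk holds only 16000 samples, so fix_length pads it to 32000.
assert math.ceil(80000 / 32000) == 3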
def do_inference(num_tests, concurrency=1):
    channel = implementations.insecure_channel(host, int(port))
    stub = prediction_service_pb2.beta_create_PredictionService_stub(channel)
    coord = _Coordinator(num_tests, concurrency)

    for _ in range(num_tests):
        # dummy audio
        duration, sr, n_fft, win_length, hop_length, n_mels, max_db, min_db = \
            4, 16000, 512, 512, 128, 80, 35, -55
        filename = librosa.util.example_audio_file()
        wav = read_wav(filename, sr=sr, duration=duration)
        mel = wav2melspec_db(wav, sr, n_fft, win_length, hop_length, n_mels)
        mel = normalize_db(mel, max_db=max_db, min_db=min_db)
        mel = mel.astype(np.float32)
        mel = np.expand_dims(mel, axis=0)  # single batch
        n_timesteps = sr // hop_length * duration + 1  # integer division so the shape is an int

        # build request
        request = predict_pb2.PredictRequest()
        request.model_spec.name = 'voice_vector'
        request.model_spec.signature_name = 'predict'
        request.inputs['x'].CopyFrom(
            tf.contrib.util.make_tensor_proto(mel, shape=[1, n_timesteps, n_mels]))

        coord.throttle()

        # send asynchronous request (recommended. use this.)
        result_future = stub.Predict.future(request, 10.0)  # timeout
        result_future.add_done_callback(_create_rpc_callback(coord))

        # send synchronous request (NOT recommended)
        # result = stub.Predict(request, 5.0)

    coord.wait_all_done()
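The timestep count follows directly from the STFT framing with centered padding. For the dummy values above:

# n_samples // hop_length + 1 frames per clip:
# (16000 // 128) * 4 + 1 = 125 * 4 + 1 = 501 timesteps.
sr, hop_length, duration = 16000, 128, 4
n_timesteps = sr // hop_length * duration + 1
assert n_timesteps == 501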
def _load_random_wav(self, speaker_id):
    wavfile = self.audio_meta.get_random_audio(speaker_id)
    wav = read_wav(wavfile, hp.signal.sr)
    # wav = trim_wav(wav)
    length = int(hp.signal.duration * hp.signal.sr)
    wav = crop_random_wav(wav, length=length)
    wav = fix_length(wav, length, mode='reflect')
    return wav  # raw wave, shape=(t, )
def make_softer(audio_file):
    # the following function will play the .wav file
    audio.play(audio_file)

    # storing the audio as a list of floats
    samples = audio.read_wav(audio_file)
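The snippet above ends before any attenuation happens. A purely illustrative continuation (the scaling factor and the `audio.write_wav` helper are assumptions, not shown in the source):

def make_softer_sketch(audio_file, factor=0.5):
    # Hypothetical: scale every sample toward zero to reduce loudness.
    samples = audio.read_wav(audio_file)
    softer = [s * factor for s in samples]
    audio.write_wav(softer, audio_file)  # assumed helper, not in the source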
def wav_to_spec_inverted(file):
    # sr, duration, n_fft, win_length and hop_length are module-level settings here.
    wav_x = read_wav(file, sr, duration)
    spec_x, _ = wav2spec(wav_x, n_fft, win_length, hop_length, False)
    spec_x_padding = np.array(spec_x[:, 0:300])
    spec_x_padding /= np.max(spec_x_padding)
    spec_x_padding.resize((257, 300))
    return np.swapaxes(spec_x_padding, 0, 1)
def get_mfccs_and_phones(wav_file, trim=False, random_crop=True):
    '''This is applied in `train1` or `test1` phase.
    '''
    # Load
    wav = read_wav(wav_file, sr=16000)  # hp.sr
    mfccs, _, _ = _get_mfcc_and_spec(wav,
                                     0.97,  # hp.preemphasis
                                     512,   # hp.n_fft
                                     400,   # hp.win_length
                                     80)    # hp.hop_length

    # timesteps
    num_timesteps = mfccs.shape[0]

    # phones (targets)
    phn_file = wav_file.replace("WAV", "PHN").replace("wav", "PHN")
    phn2idx, idx2phn = load_vocab()
    phns = np.zeros(shape=(num_timesteps,))
    bnd_list = []
    for line in open(phn_file, 'r').read().splitlines():
        start_point, _, phn = line.split()
        bnd = int(start_point) // 80  # hp.hop_length
        phns[bnd:] = phn2idx[phn]
        bnd_list.append(bnd)

    # Trim
    if trim:
        start, end = bnd_list[1], bnd_list[-1]
        mfccs = mfccs[start:end]
        phns = phns[start:end]
        assert len(mfccs) == len(phns)

    # Random crop
    n_timesteps = (3 * 16000) // 80 + 1  # (hp.duration * hp.sr) // hp.hop_length + 1
    if random_crop:
        start = np.random.choice(range(np.maximum(1, len(mfccs) - n_timesteps)), 1)[0]
        end = start + n_timesteps
        mfccs = mfccs[start:end]
        phns = phns[start:end]
        assert len(mfccs) == len(phns)

    # Padding or crop
    mfccs = librosa.util.fix_length(mfccs, n_timesteps, axis=0)
    phns = librosa.util.fix_length(phns, n_timesteps, axis=0)

    return mfccs, phns
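For reference, a sketch of how one TIMIT .PHN line becomes frame labels under the hop length of 80 used above (the sample values are illustrative, not from the source):

# A .PHN line has the form "<start_sample> <end_sample> <phone>", e.g. "3040 4559 sh".
# Start sample 3040 maps to frame 3040 // 80 = 38, so phns[38:] is overwritten
# with the index of 'sh'; the next line's boundary then overwrites the frames
# after it in turn, leaving each segment labeled with its own phone.
assert 3040 // 80 == 38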
def get_mfccs_and_phones(wav_file, trim=False, random_crop=True):
    '''This is applied in `train1` or `test1` phase.
    '''
    # Load
    wav = read_wav(wav_file, sr=hp.default.sr)
    mfccs, _, _ = _get_mfcc_and_spec(wav, hp.default.preemphasis, hp.default.n_fft,
                                     hp.default.win_length, hp.default.hop_length)

    # timesteps
    num_timesteps = mfccs.shape[0]

    # phones (targets)
    # phn_file = wav_file.replace("WAV.wav", "PHN").replace("wav", "PHN")
    # phn2idx, idx2phn = load_vocab()
    phns = np.zeros(shape=(num_timesteps,))
    bnd_list = []
    # for line in open(phn_file, 'r').read().splitlines():
    #     start_point, _, phn = line.split()
    #     bnd = int(start_point) // hp.default.hop_length
    #     phns[bnd:] = phn2idx[phn]
    #     bnd_list.append(bnd)

    # Trim
    # if trim:
    #     start, end = bnd_list[1], bnd_list[-1]
    #     mfccs = mfccs[start:end]
    #     phns = phns[start:end]
    #     assert (len(mfccs) == len(phns))

    # Random crop
    # Ollin: aaah + 1
    n_timesteps = (hp.default.duration * hp.default.sr) // hp.default.hop_length + 1
    # if random_crop:
    #     start = np.random.choice(range(np.maximum(1, len(mfccs) - n_timesteps)), 1)[0]
    #     end = start + n_timesteps
    #     mfccs = mfccs[start:end]
    #     phns = phns[start:end]
    #     assert (len(mfccs) == len(phns))

    # Padding or crop
    mfccs = librosa.util.fix_length(mfccs, n_timesteps, axis=0)
    phns = librosa.util.fix_length(phns, n_timesteps, axis=0)

    return mfccs, phns
def get_random_wav(self, wavfile):
    """
    :param: wavfile: a raw wave file.
    :return: wav: raw wave. float32. shape=(t, ),
             melspec: mel-spectrogram. float32. shape=(t, n_mels),
             wavfile: the raw wave file.
    """
    wav = read_wav(wavfile, sr=hp.signal.sr)
    wav = trim_wav(wav)
    wav = fix_length(wav, self.length)  # crop from the beginning.
    melspec = wav2melspec_db(wav, sr=hp.signal.sr, n_fft=hp.signal.n_fft,
                             win_length=hp.signal.win_length, hop_length=hp.signal.hop_length,
                             n_mels=hp.signal.n_mels, min_db=hp.signal.min_db,
                             max_db=hp.signal.max_db)
    melspec = np.float32(melspec)
    return wav, melspec, wavfile
def _get_wav_and_melspec(wav_file, length=None, is_training=True):
    wav = read_wav(wav_file, sr=hp.signal.sr)
    wav = trim_wav(wav)
    if length:
        # A random chunk in training, the first chunk otherwise.
        n_clips = math.ceil(len(wav) / length) if is_training else 1
        idx = random.randrange(n_clips)
        start, end = length * idx, length * (idx + 1)
        wav = wav[start:end]
        assert len(wav) <= length
        wav = fix_length(wav, length)  # padding
    melspec = wav2melspec_db(wav, sr=hp.signal.sr, n_fft=hp.signal.n_fft,
                             win_length=hp.signal.win_length, hop_length=hp.signal.hop_length,
                             n_mels=hp.signal.n_mels, min_db=hp.signal.min_db,
                             max_db=hp.signal.max_db)
    wav = np.expand_dims(wav, -1)
    return wav, melspec.astype(np.float32)
def get_mfccs_and_phones(wav_file, trim=False):
    '''This is applied in `train1` or `test1` phase.
    '''
    # Load
    wav = read_wav(wav_file, sr=hp.default.sr)
    mfccs, _, _ = _get_mfcc_and_spec(wav, hp.default.preemphasis, hp.default.n_fft,
                                     hp.default.win_length, hp.default.hop_length)

    # timesteps
    num_timesteps = mfccs.shape[0]

    # phones (targets)
    phn_file = wav_file.replace("WAV", "PHN")
    phn2idx, idx2phn = load_vocab()
    phns = np.zeros(shape=(num_timesteps,))
    bnd_list = []
    for line in open(phn_file, 'r').read().splitlines():
        start_point, _, phn = line.split()
        bnd = int(start_point) // hp.default.hop_length
        phns[bnd:] = phn2idx[phn]
        bnd_list.append(bnd)

    # Trim
    if trim:
        start, end = bnd_list[1], bnd_list[-1]
        mfccs = mfccs[start:end]
        phns = phns[start:end]
        assert len(mfccs) == len(phns)

    n_timesteps = (hp.default.duration * hp.default.sr) // hp.default.hop_length + 1

    # Padding or crop
    mfccs = librosa.util.fix_length(mfccs, n_timesteps, axis=0)
    phns = librosa.util.fix_length(phns, n_timesteps, axis=0)

    return mfccs, phns
def get_mfccs_and_phones(wav_file, trim=False, random_shuffle=False):
    '''This is applied in `train1` or `test1` phase.
    '''
    # Load
    wav = read_wav(wav_file, sr=hp.default.sr)
    mfccs, _, _ = _get_mfcc_and_spec(wav, hp.default.preemphasis, hp.default.n_fft,
                                     hp.default.win_length, hp.default.hop_length)

    # timesteps
    num_timesteps = mfccs.shape[0]

    # phones (targets)
    phn_file = wav_file.replace("WAV", "PHN")
    phn2idx, idx2phn = load_vocab()
    phns = np.zeros(shape=(num_timesteps,))
    bnd_list = []
    for line in open(phn_file, 'r').read().splitlines():
        start_point, _, phn = line.split()
        bnd = int(start_point) // hp.default.hop_length
        phns[bnd:] = phn2idx[phn]
        bnd_list.append(bnd)

    # Trim
    if trim:
        start, end = bnd_list[1], bnd_list[-1]
        mfccs = mfccs[start:end]
        phns = phns[start:end]
        assert len(mfccs) == len(phns)

    # Shuffle the frames within each phone segment, in place.
    if random_shuffle:
        for i in range(len(bnd_list) - 1):
            start = bnd_list[i]
            end = bnd_list[i + 1] - 1
            np.random.shuffle(mfccs[start:end])

    return mfccs, phns
import numpy as np
import matplotlib.pyplot as plt

from audio import spec2wav, wav2spec, read_wav, write_wav

if __name__ == '__main__':
    sr = 22050
    n_fft = 512
    win_length = 400
    hop_length = 80
    duration = 2  # sec

    wav = read_wav("H:\\cs230\\wav_x\\1_1.wav", sr, duration)
    spec, _ = wav2spec(wav, n_fft, win_length, hop_length, False)
    converted_wav = spec2wav(spec, n_fft, win_length, hop_length, 600)
    write_wav(converted_wav, sr, 'a.wav')

    plt.pcolormesh(spec)
    plt.ylabel('Frequency')
    plt.xlabel('Time')
    plt.savefig("a.png")
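Under these settings the spectrogram shape can be sanity-checked directly (assuming wav2spec returns a (1 + n_fft // 2, n_frames) magnitude spectrogram, consistent with the 257-bin slice in wav_to_spec_inverted above):

# 2 s at 22050 Hz -> 44100 samples; with hop_length = 80 and centered frames,
# that is 44100 // 80 + 1 = 552 frames over 512 // 2 + 1 = 257 frequency bins.
assert 44100 // 80 + 1 == 552
assert 512 // 2 + 1 == 257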
ckpt = args.ckpt if args.ckpt else tf.train.latest_checkpoint(hp.logdir)

pred_conf = PredictConfig(
    model=model,
    input_names=['x'],
    output_names=['embedding/embedding', 'prediction'],
    session_init=SaverRestore(ckpt) if ckpt else None)
embedding_pred = OfflinePredictor(pred_conf)

embedding, pred_speaker_id = embedding_pred(mel_spec)

# get a random audio of the predicted speaker.
# (list comprehensions instead of bare map(), which np.array cannot consume in Python 3)
wavfile_pred_speaker = np.array(
    [audio_meta_train.get_random_audio(s) for s in pred_speaker_id])
length = int(hp.signal.duration * hp.signal.sr)
wav_pred_speaker = np.array(
    [fix_length(read_wav(w, hp.signal.sr, duration=hp.signal.duration), length)
     for w in wavfile_pred_speaker])

# write audio
tf.summary.audio('wav', wav, hp.signal.sr, max_outputs=10)
tf.summary.audio('wav_pred', wav_pred_speaker, hp.signal.sr, max_outputs=10)

# write prediction
speaker_name = [audio_meta.speaker_dict[sid] for sid in speaker_id]
pred_speaker_name = [audio_meta_train.speaker_dict[sid] for sid in pred_speaker_id]
meta = [tuple(audio_meta.meta_dict[sid][k] for k in audio_meta.target_meta_field())
        for sid in speaker_id] if hp.embed.meta_path else None
pred_meta = [tuple(audio_meta_train.meta_dict[sid][k] for k in audio_meta_train.target_meta_field())
             for sid in pred_speaker_id] if hp.train.meta_path else None
prediction = ['{} ({}) -> {} ({})'.format(s, s_meta, p, p_meta)
              for s, p, s_meta, p_meta in zip(speaker_name, pred_speaker_name, meta, pred_meta)]
tf.summary.text('prediction', tf.convert_to_tensor(prediction))