def _process_utterance(out_dir, index, wav_path, text, phone):
    '''Preprocesses a single utterance audio/text pair.

    This writes the spectrogram features to disk and returns a tuple to
    write to the train.txt file.

    Args:
        out_dir: The directory to write the spectrograms into
        index: The numeric index to use in the spectrogram filenames.
        wav_path: Path to the audio file containing the speech input
        text: The text spoken in the input audio file
        phone: The phone sequence corresponding to the text

    Returns:
        A (spectrogram_filename, encoded_filename, n_frames, text, phone)
        tuple to write to train.txt
    '''
    # Load the audio to a numpy array:
    wav = audio.load_wav(wav_path)

    if hparams.rescaling:
        wav = wav / np.abs(wav).max() * hparams.rescaling_max

    if hparams.vocoder == "world":
        spectrogram = audio.spectrogram(wav).astype(np.float32)
        # Extract WORLD vocoder features: F0, spectral envelope, aperiodicity
        f0, sp, ap = pw.wav2world(wav.astype(np.double), hparams.sample_rate)
        ap_coded = pw.code_aperiodicity(ap, hparams.sample_rate)
        sp_coded = pw.code_spectral_envelope(sp, hparams.sample_rate,
                                             hparams.coded_env_dim)
        world_spec = np.hstack([f0[:, np.newaxis], sp_coded, ap_coded])
        n_frames = world_spec.shape[0]
        spectrogram_filename = 'synpaflex-spec-%05d.npy' % index
        encoded_filename = 'synpaflex-world-%05d.npy' % index
        np.save(os.path.join(out_dir, spectrogram_filename), spectrogram.T,
                allow_pickle=False)
        np.save(os.path.join(out_dir, encoded_filename), world_spec,
                allow_pickle=False)
    else:
        # Compute the linear-scale spectrogram from the wav:
        spectrogram = audio.spectrogram(wav).astype(np.float32)
        n_frames = spectrogram.shape[1]

        # Compute a mel-scale spectrogram from the wav:
        mel_spectrogram = audio.melspectrogram(wav).astype(np.float32)

        # Write the spectrograms to disk:
        spectrogram_filename = 'synpaflex-spec-%05d.npy' % index
        encoded_filename = 'synpaflex-mel-%05d.npy' % index
        np.save(os.path.join(out_dir, spectrogram_filename), spectrogram.T,
                allow_pickle=False)
        np.save(os.path.join(out_dir, encoded_filename), mel_spectrogram.T,
                allow_pickle=False)

    # Return a tuple describing this training example:
    return (spectrogram_filename, encoded_filename, n_frames, text, phone)
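# ---------------------------------------------------------------------------
# The _process_utterance variants in this file are typically driven by a
# small build routine that fans utterances out to a process pool and writes
# the returned tuples to train.txt. The sketch below is a minimal example of
# that pattern; build_from_path, metadata.csv, and the pipe-delimited format
# are assumptions modeled on common Tacotron-style preprocessors, not part
# of the code above.
from concurrent.futures import ProcessPoolExecutor
from functools import partial
import os

def build_from_path(in_dir, out_dir, num_workers=4):
    executor = ProcessPoolExecutor(max_workers=num_workers)
    futures = []
    with open(os.path.join(in_dir, 'metadata.csv'), encoding='utf-8') as f:
        for index, line in enumerate(f, start=1):
            wav_name, text, phone = line.strip().split('|')
            wav_path = os.path.join(in_dir, 'wavs', wav_name + '.wav')
            futures.append(executor.submit(partial(
                _process_utterance, out_dir, index, wav_path, text, phone)))
    # Gather the results and write one pipe-delimited line per utterance
    metadata = [future.result() for future in futures]
    with open(os.path.join(out_dir, 'train.txt'), 'w', encoding='utf-8') as f:
        for m in metadata:
            f.write('|'.join(str(x) for x in m) + '\n')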
def prepare_data(audio_path):
    rates = [0.25, 0.5, 1, 1.5, 2, 2.5]
    enc_path = './data/emb/'

    # Create directory for encodings
    if not os.path.exists(enc_path):
        os.makedirs(enc_path)

    pattern = audio_path + '*' + '.npz'
    file_list = glob.glob(pattern)
    for item in file_list:
        print(os.path.splitext(os.path.basename(item))[0])
        item_ndname = enc_path + os.path.splitext(os.path.basename(item))[0][:4]
        item = np.load(item)
        spec, piece = item['spec'], item['piece']

        # Get the encoding of the original piece
        enc = encode(spec.T)

        for rate in rates:
            # Time-stretch the waveform, then encode its spectrogram ...
            s = librosa.effects.time_stretch(piece, rate)
            spec_s = audio.spectrogram(s).astype(np.float32)
            enc_s = encode(spec_s.T)
            # ... and time-stretch the original encoding by the inverse rate
            enc_o = timestretch(enc.T, (1 / rate))
            new_item = item_ndname + '_' + str(rate)
            np.savez(new_item, input=enc_o.T, target=enc_s)
def _process_utterance(out_dir, index, speaker_id, wav_path, text):
    # Load the audio to a numpy array:
    wav = audio.load_wav(wav_path)
    wav, _ = librosa.effects.trim(wav, top_db=15)

    if hparams.rescaling:
        wav = wav / np.abs(wav).max() * hparams.rescaling_max

    # Compute the linear-scale spectrogram from the wav:
    spectrogram = audio.spectrogram(wav).astype(np.float32)
    n_frames = spectrogram.shape[1]

    # Compute a mel-scale spectrogram from the wav:
    mel_spectrogram = audio.melspectrogram(wav).astype(np.float32)

    # Write the spectrograms to disk:
    spectrogram_filename = 'vctk-spec-%05d.npy' % index
    mel_filename = 'vctk-mel-%05d.npy' % index
    np.save(os.path.join(out_dir, spectrogram_filename), spectrogram.T,
            allow_pickle=False)
    np.save(os.path.join(out_dir, mel_filename), mel_spectrogram.T,
            allow_pickle=False)

    # Return a tuple describing this training example:
    return spectrogram_filename, mel_filename, n_frames, text, speaker_id
def _process_utterance(out_dir, index, wav_path, text):
    '''Preprocesses a single utterance audio/text pair.

    This writes the mel and linear scale spectrograms to disk and returns
    a tuple to write to the train.txt file.

    Args:
        out_dir: The directory to write the spectrograms into
        index: The numeric index to use in the spectrogram filenames.
        wav_path: Path to the audio file containing the speech input
        text: The text spoken in the input audio file

    Returns:
        A (spectrogram_filename, mel_filename, n_frames, text) tuple to
        write to train.txt
    '''
    # Load the audio to a numpy array:
    wav = audio.load_wav(wav_path)

    # Compute the linear-scale spectrogram from the wav:
    spectrogram = audio.spectrogram(wav).astype(np.float32)
    n_frames = spectrogram.shape[1]

    # Compute a mel-scale spectrogram from the wav:
    mel_spectrogram = audio.melspectrogram(wav).astype(np.float32)

    # Write the spectrograms to disk:
    spectrogram_filename = 'nikl-single-spec-%05d.npy' % index
    mel_filename = 'nikl-single-mel-%05d.npy' % index
    np.save(os.path.join(out_dir, spectrogram_filename), spectrogram.T,
            allow_pickle=False)
    np.save(os.path.join(out_dir, mel_filename), mel_spectrogram.T,
            allow_pickle=False)

    # Return a tuple describing this training example:
    return (spectrogram_filename, mel_filename, n_frames, text)
def gen_samples(out_dir, wav_path, n_samples):
    wav = audio.load_wav(wav_path)
    hop_size = hparams.hop_length
    seg_len = hparams.seg_len
    spec_len = hparams.spec_len

    # Truncate to a whole number of hops (minus 1 so the spectrogram has
    # exactly wav_len // hop_size frames)
    wav_len = wav.shape[0] // hop_size * hop_size - 1
    wav = wav[:wav_len]
    spec = audio.spectrogram(wav)
    mel = audio.melspectrogram(wav)

    max_val = spec.shape[1] - 1 - spec_len
    if max_val <= 0:  # randint needs a strictly positive upper bound
        return []
    idx = np.random.randint(0, max_val, size=n_samples)

    d = []
    for i, offset in enumerate(idx, start=1):
        w = wav[offset * hop_size:offset * hop_size + seg_len]
        s = spec[:, offset:offset + spec_len]
        m = mel[:, offset:offset + spec_len]
        wav_name = wav_path.split('/')[-1].split('.')[0]
        file_path = "{0}/{1}_{2:03d}.npz".format(out_dir, wav_name, i)
        np.savez(file_path, wav=w, spec=s, mel=m)
        d.append(file_path)
    return d
def _process_utterance(out_dir, in_dir, source_wav_name, target_wav_name,
                       emotion_id):
    '''Preprocesses a source/target utterance pair.

    This writes the target linear spectrogram and the source and target
    mel spectrograms to disk and returns a tuple to write to the train.txt
    file.

    Args:
        out_dir: The directory to write the spectrograms into
        in_dir: The directory containing the input wav files
        source_wav_name: Filename of the source audio
        target_wav_name: Filename of the target audio
        emotion_id: Identifier of the target emotion

    Returns:
        An (emotion_id, t_spectrogram_filename, smel_filename, tmel_filename,
        s_n_frames, t_n_frames) tuple to write to train.txt
    '''
    # Load the audio to a numpy array:
    source_wav = audio.load_wav(os.path.join(in_dir, source_wav_name))
    target_wav = audio.load_wav(os.path.join(in_dir, target_wav_name))

    if hparams.rescaling:
        source_wav = source_wav / np.abs(source_wav).max() * hparams.rescaling_max
        target_wav = target_wav / np.abs(target_wav).max() * hparams.rescaling_max

    # Compute the linear-scale spectrogram from the target wav:
    t_spectrogram = audio.spectrogram(target_wav).astype(np.float32)

    # Compute mel-scale spectrograms from both wavs:
    smel_spectrogram = audio.melspectrogram(source_wav).astype(np.float32)
    tmel_spectrogram = audio.melspectrogram(target_wav).astype(np.float32)
    s_n_frames = smel_spectrogram.shape[1]
    t_n_frames = tmel_spectrogram.shape[1]

    # Write the spectrograms to disk:
    t_spectrogram_filename = 'target-spec-{}.npy'.format(
        target_wav_name.replace('.wav', ''))
    smel_filename = 'source-mel-{}.npy'.format(
        source_wav_name.replace('.wav', ''))
    tmel_filename = 'target-mel-{}.npy'.format(
        target_wav_name.replace('.wav', ''))
    np.save(os.path.join(out_dir, t_spectrogram_filename), t_spectrogram.T,
            allow_pickle=False)
    np.save(os.path.join(out_dir, smel_filename), smel_spectrogram.T,
            allow_pickle=False)
    np.save(os.path.join(out_dir, tmel_filename), tmel_spectrogram.T,
            allow_pickle=False)

    # Return a tuple describing this training example:
    return (emotion_id, t_spectrogram_filename, smel_filename, tmel_filename,
            s_n_frames, t_n_frames)
def _process_utterance(out_dir, index, wav_path, text):
    '''Preprocesses a single utterance audio/text pair.

    This writes the mel and linear scale spectrograms plus the WORLD
    vocoder features to disk and returns a tuple to write to the train.txt
    file.

    Args:
        out_dir: The directory to write the spectrograms into
        index: The numeric index to use in the spectrogram filenames.
        wav_path: Path to the audio file containing the speech input
        text: The text spoken in the input audio file

    Returns:
        A (spectrogram_filename, mel_filename, n_frames, f0_filename,
        sp_filename, ap_filename, world_frames, text) tuple to write to
        train.txt
    '''
    # Load the audio to a numpy array:
    wav = audio.load_wav(wav_path)

    if hparams.rescaling:
        wav = wav / np.abs(wav).max() * hparams.rescaling_max

    # Compute the linear-scale spectrogram from the wav:
    spectrogram = audio.spectrogram(wav).astype(np.float32)
    n_frames = spectrogram.shape[1]

    # Compute a mel-scale spectrogram from the wav:
    mel_spectrogram = audio.melspectrogram(wav).astype(np.float32)

    # WORLD vocoder parameters
    f0, sp, ap = audio.world(wav, hparams.sample_rate)
    f0 = (f0 / hparams.f0_norm).astype(np.float32)  # normalize
    sp = audio._normalize(sp).astype(np.float32)
    # ap only takes values in [0, 1], so no normalization is needed
    ap = ap.astype(np.float32)
    world_frames = f0.shape[0]

    # Write the spectrograms to disk:
    spectrogram_filename = 'ljspeech-spec-%05d.npy' % index
    mel_filename = 'ljspeech-mel-%05d.npy' % index
    f0_filename = 'ljspeech-f0-%05d.npy' % index
    sp_filename = 'ljspeech-sp-%05d.npy' % index
    ap_filename = 'ljspeech-ap-%05d.npy' % index
    np.save(os.path.join(out_dir, spectrogram_filename), spectrogram.T,
            allow_pickle=False)
    np.save(os.path.join(out_dir, mel_filename), mel_spectrogram.T,
            allow_pickle=False)
    np.save(os.path.join(out_dir, f0_filename), f0, allow_pickle=False)
    np.save(os.path.join(out_dir, sp_filename), sp, allow_pickle=False)
    np.save(os.path.join(out_dir, ap_filename), ap, allow_pickle=False)

    # Return a tuple describing this training example:
    return (spectrogram_filename, mel_filename, n_frames, f0_filename,
            sp_filename, ap_filename, world_frames, text)
def _process_utterance(out_dir, out_path, wav_path, text, stft):
    '''Preprocesses a single utterance audio/text pair.

    This writes the mel spectrogram to disk and returns a tuple to write
    to the train.txt file.

    Args:
        out_dir: The directory to write the spectrograms into
        out_path: Slash-delimited output path whose components name the
            nested output directories and the mel filename
        wav_path: Path to the audio file containing the speech input
        text: The text spoken in the input audio file
        stft: STFT helper used to compute the mel spectrogram

    Returns:
        A (mel_filename, n_frames, text) tuple to write to train.txt
    '''
    # Load the audio to a numpy array:
    wav = audio.load_wav(wav_path)
    wav = wav / np.abs(wav).max() * 0.999

    # Trim the silence at the end of the audio file.
    wav = librosa.effects.trim(wav, top_db=23, frame_length=1024,
                               hop_length=256)[0]

    # Compute the linear-scale spectrogram from the wav:
    spectrogram = audio.spectrogram(wav).astype(np.float32)
    n_frames = spectrogram.shape[1]

    # Compute a mel-scale spectrogram from the wav:
    mel_spectrogram = audio.melspectrogram(wav, stft).numpy().astype(np.float32)

    # Write the mel spectrogram to disk, recreating the nested directory
    # layout encoded in out_path (components 4-6 are assumed to be the
    # nested directories and the filename).
    parts = out_path.strip().split('/')
    mel_filename = parts[4] + parts[5] + parts[6]
    o_path = os.path.join(parts[0], parts[1], parts[4])
    if not os.path.exists(o_path):
        os.mkdir(o_path)
    o_path = os.path.join(o_path, parts[5])
    if not os.path.exists(o_path):
        os.mkdir(o_path)
    o_path = os.path.join(o_path, parts[6])
    np.save(o_path, mel_spectrogram.T, allow_pickle=False)

    # Return a tuple describing this training example:
    return (mel_filename, n_frames, text)
def _process_utterance(out_dir, index, wav_path, text):
    '''Preprocesses a single utterance audio/text pair.

    This writes the mel and linear scale spectrograms to disk and returns
    a tuple to write to the train.txt file.

    Args:
        out_dir: The directory to write the spectrograms into
        index: The numeric index to use in the spectrogram filenames.
        wav_path: Path to the audio file containing the speech input
        text: The text spoken in the input audio file

    Returns:
        A (spectrogram_filename, mel_filename, n_frames, text) tuple to
        write to train.txt
    '''
    # Load the audio to a numpy array:
    wav = audio.load_wav(wav_path)

    if hparams.rescaling:
        wav = wav / np.abs(wav).max() * hparams.rescaling_max

    try:
        # Compute the linear-scale spectrogram from the wav:
        spectrogram = audio.spectrogram(wav).astype(np.float32)
        n_frames = spectrogram.shape[1]

        # Compute a mel-scale spectrogram from the wav:
        mel_spectrogram = audio.melspectrogram(wav).astype(np.float32)
    except Exception as e:
        print("Problem with:", wav_path)
        print(e)
        return None  # skip this utterance rather than fail below

    # Write the spectrograms to disk:
    spectrogram_filename = 'ljspeech-spec-%05d.npy' % index
    mel_filename = 'ljspeech-mel-%05d.npy' % index
    np.save(os.path.join(out_dir, spectrogram_filename), spectrogram.T,
            allow_pickle=False)
    np.save(os.path.join(out_dir, mel_filename), mel_spectrogram.T,
            allow_pickle=False)

    # mdda added START:
    wav_filename = mel_filename.replace('-mel-', '-audio-')
    # wav_samples = hparams.fft_size + (n_frames - 1) * hparams.hop_size
    # No: 3 extra frames added; don't bother chomping
    np.save(os.path.join(out_dir, wav_filename), wav.astype(np.float32),
            allow_pickle=False)

    spectrogramraw_filename = 'ljspeech-specraw-%05d.npy' % index
    np.save(os.path.join(out_dir, spectrogramraw_filename),
            spectrogram_raw(wav).T, allow_pickle=False)
    # mdda added END

    # Return a tuple describing this training example:
    return (spectrogram_filename, mel_filename, n_frames, text)
def _process_utterance_single(out_dir, text, wav_path, hparams=hparams):
    # modified version of LJSpeech _process_utterance
    audio.set_hparams(hparams)

    # Load the audio to a numpy array:
    wav = audio.load_wav(wav_path)
    sr = hparams.sample_rate

    # Added from the multispeaker version
    lab_path = wav_path.replace("wav48/", "lab/").replace(".wav", ".lab")
    if not exists(lab_path):
        lab_path = os.path.splitext(wav_path)[0] + '.lab'

    # Trim silence from hts labels if available
    if exists(lab_path):
        labels = hts.load(lab_path)
        wav = clean_by_phoneme(labels, wav, sr)
        wav, _ = librosa.effects.trim(wav, top_db=25)
    else:
        if hparams.process_only_htk_aligned:
            return None
        wav, _ = librosa.effects.trim(wav, top_db=15)
    # End added from the multispeaker version

    if hparams.rescaling:
        wav = wav / np.abs(wav).max() * hparams.rescaling_max

    if (hparams.max_audio_length != 0 and
            librosa.core.get_duration(y=wav, sr=sr) > hparams.max_audio_length):
        return None
    if (hparams.min_audio_length != 0 and
            librosa.core.get_duration(y=wav, sr=sr) < hparams.min_audio_length):
        return None

    # Compute the linear-scale spectrogram from the wav:
    spectrogram = audio.spectrogram(wav).astype(np.float32)
    n_frames = spectrogram.shape[1]

    # Compute a mel-scale spectrogram from the wav:
    mel_spectrogram = audio.melspectrogram(wav).astype(np.float32)

    # Write the spectrograms to disk, named after the wav file:
    wav_name = os.path.basename(wav_path)
    wav_name = os.path.splitext(wav_name)[0]
    spectrogram_filename = 'spec-{}.npy'.format(wav_name)
    mel_filename = 'mel-{}.npy'.format(wav_name)
    np.save(os.path.join(out_dir, spectrogram_filename), spectrogram.T,
            allow_pickle=False)
    np.save(os.path.join(out_dir, mel_filename), mel_spectrogram.T,
            allow_pickle=False)

    # Return a tuple describing this training example:
    return (spectrogram_filename, mel_filename, n_frames, text)
def _process_utterance(out_dir, text, wav_path, speaker_id=None):
    # Check whether we are in single-speaker mode
    if speaker_id is None:
        return _process_utterance_single(out_dir, text, wav_path)

    # modified version of VCTK _process_utterance
    sr = hparams.sample_rate

    # Load the audio to a numpy array:
    wav = audio.load_wav(wav_path)

    lab_path = wav_path.replace("wav48/", "lab/").replace(".wav", ".lab")
    if not exists(lab_path):
        lab_path = os.path.splitext(wav_path)[0] + '.lab'

    # Trim silence from hts labels if available
    if exists(lab_path):
        labels = hts.load(lab_path)
        b = int(start_at(labels) * 1e-7 * sr)
        e = int(end_at(labels) * 1e-7 * sr)
        wav = wav[b:e]
        wav, _ = librosa.effects.trim(wav, top_db=25)
    else:
        if hparams.process_only_htk_aligned:
            return None
        wav, _ = librosa.effects.trim(wav, top_db=15)

    if hparams.rescaling:
        wav = wav / np.abs(wav).max() * hparams.rescaling_max

    # Compute the linear-scale spectrogram from the wav:
    spectrogram = audio.spectrogram(wav).astype(np.float32)
    n_frames = spectrogram.shape[1]

    # Compute a mel-scale spectrogram from the wav:
    mel_spectrogram = audio.melspectrogram(wav).astype(np.float32)

    # Write the spectrograms to disk. Filenames are prefixed with the
    # speaker id, in case wav files across different speakers share the
    # same naming format (e.g. Recording0.wav).
    wav_name = os.path.basename(wav_path)
    wav_name = os.path.splitext(wav_name)[0]
    spectrogram_filename = 'spec-{}-{}.npy'.format(speaker_id, wav_name)
    mel_filename = 'mel-{}-{}.npy'.format(speaker_id, wav_name)
    np.save(os.path.join(out_dir, spectrogram_filename), spectrogram.T,
            allow_pickle=False)
    np.save(os.path.join(out_dir, mel_filename), mel_spectrogram.T,
            allow_pickle=False)

    # Return a tuple describing this training example:
    return (spectrogram_filename, mel_filename, n_frames, text, speaker_id)
def compute_spectrograms(batches, sample_rate, frame_len, fps, bins=None):
    """
    Computes spectrograms from the signals in `batches` at a given sample
    rate (in Hz), frame length (in samples) and frame rate (in Hz).
    """
    plans = audio.spectrogram_plans(frame_len, dtype=np.float32)
    for wavs, labels in batches:
        spects = [audio.spectrogram(np.asanyarray(wav).ravel(), sample_rate,
                                    frame_len, fps, dtype=np.float32,
                                    bins=bins, plans=plans)
                  for wav in wavs]
        yield spects, labels
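# compute_spectrograms is a generator, so it can be chained lazily onto any
# batch iterator. The snippet below is one plausible way to exercise it; the
# fake_batches helper and all parameter values are illustrative assumptions,
# not taken from the code above.
def fake_batches(sample_rate, batch_size=2, n_batches=2):
    # Yield batches of random 1-second mono signals with dummy labels
    for _ in range(n_batches):
        wavs = [np.random.randn(sample_rate).astype(np.float32)
                for _ in range(batch_size)]
        labels = np.zeros(batch_size, dtype=np.int64)
        yield wavs, labels

sample_rate, frame_len, fps = 22050, 1024, 70
for spects, labels in compute_spectrograms(fake_batches(sample_rate),
                                           sample_rate, frame_len, fps,
                                           bins=80):
    print(len(spects), spects[0].shape, labels.shape)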
def _process_utterance(out_dir, index, speaker_id, wav_path, text):
    sr = hparams.sample_rate
    filename = os.path.basename(wav_path).replace('.wav', '')

    # Load the audio to a numpy array:
    wav = audio.load_wav(wav_path)

    lab_path = wav_path.replace("wav48/", "lab/").replace(".wav", ".lab")
    # Trim silence from hts labels if available
    if exists(lab_path):
        labels = hts.load(lab_path)
        b = int(start_at(labels) * 1e-7 * sr)
        e = int(end_at(labels) * 1e-7 * sr)
        wav = wav[b:e]
        # librosa trim seems to cut off the ending part of speech
        wav, _ = librosa.effects.trim(wav, top_db=25)
    else:
        wav, _ = librosa.effects.trim(wav, top_db=15)

    if hparams.rescaling:
        wav = wav / np.abs(wav).max() * hparams.rescaling_max

    # Save the trimmed wav
    save_wav_path = re.sub('wav48', 'wav_trim_22050', wav_path)
    save_dir = os.path.dirname(save_wav_path)
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)
    audio.save_wav(wav, save_wav_path)

    # Compute the linear-scale spectrogram from the wav:
    spectrogram = audio.spectrogram(wav).astype(np.float32)
    n_frames = spectrogram.shape[1]

    # Compute a mel-scale spectrogram from the wav:
    mel_spectrogram = audio.melspectrogram(wav).astype(np.float32)

    # Write the spectrograms to disk:
    spectrogram_filename = '{}-spec.npy'.format(filename)
    mel_filename = '{}-mel.npy'.format(filename)
    np.save(os.path.join(out_dir, spectrogram_filename), spectrogram.T,
            allow_pickle=False)
    np.save(os.path.join(out_dir, mel_filename), mel_spectrogram.T,
            allow_pickle=False)

    # Return a tuple describing this training example:
    return (spectrogram_filename, mel_filename, n_frames, text, speaker_id)
def copy_synthesis(wav_file, out_path):
    """Perform copy synthesis on the wav file and write the synthesized
    wav to disk at out_path.
    """
    filename = os.path.splitext(os.path.basename(wav_file))[0]

    y = audio.load_wav(wav_file)
    if cfg.rescaling:
        y = y / np.abs(y).max() * cfg.rescaling_max

    mag = audio.spectrogram(y)
    y_hat = audio.inv_spectrogram(mag)

    out_path = os.path.join(out_path, filename + "_synthesized.wav")
    print(f"Writing {out_path} to disk")
    audio.save_wav(y_hat, out_path)
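# Copy synthesis (analysis followed immediately by resynthesis via spectrogram
# inversion) gives a quick upper bound on how much quality the inversion step
# itself costs, independent of any model. A minimal batch driver follows; the
# directory names are illustrative assumptions, not part of the code above.
import glob

in_dir, synth_dir = "data/wavs", "data/copy_synth"
os.makedirs(synth_dir, exist_ok=True)
for wav_file in sorted(glob.glob(os.path.join(in_dir, "*.wav"))):
    copy_synthesis(wav_file, synth_dir)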
def parse_line(line):
    # Parse a tab-separated line from the csv
    filename, sentence, duration = line.decode('ascii').split('\t')

    # Audio file
    wav_path = os.path.join(hyperparams.dataset_path, filename + '.wav')
    wave = audio.read_audio(wav_path, hyperparams.sample_rate)
    audio_length = wave.shape[0] / hyperparams.sample_rate

    # Calculate spectra
    mel, linear = audio.spectrogram(hyperparams, wave)

    # Encode sentence
    tokens = text.encode(sentence)

    return (mel.T, linear.T, tokens, np.int32(tokens.size),
            np.float32(audio_length))
def _process_utterance(out_dir, index, speaker_id, wav_path, text,
                       hparams=hparams):
    sr = hparams.sample_rate
    audio.set_hparams(hparams)

    # Load the audio to a numpy array:
    wav = audio.load_wav(wav_path)

    lab_path = wav_path.replace("wav48/", "lab/").replace(".wav", ".lab")
    # Trim silence from hts labels if available
    if exists(lab_path):
        labels = hts.load(lab_path)
        b = int(start_at(labels) * 1e-7 * sr)
        e = int(end_at(labels) * 1e-7 * sr)
        wav = wav[b:e]
        wav, _ = librosa.effects.trim(wav, top_db=25)
    else:
        wav, _ = librosa.effects.trim(wav, top_db=15)

    if hparams.rescaling:
        wav = wav / np.abs(wav).max() * hparams.rescaling_max

    # Compute the linear-scale spectrogram from the wav:
    spectrogram = audio.spectrogram(wav).astype(np.float32)
    n_frames = spectrogram.shape[1]

    # Compute a mel-scale spectrogram from the wav:
    mel_spectrogram = audio.melspectrogram(wav).astype(np.float32)

    # Write the spectrograms to disk:
    spectrogram_filename = 'vctk-spec-%05d.npy' % index
    mel_filename = 'vctk-mel-%05d.npy' % index
    np.save(os.path.join(out_dir, spectrogram_filename), spectrogram.T,
            allow_pickle=False)
    np.save(os.path.join(out_dir, mel_filename), mel_spectrogram.T,
            allow_pickle=False)

    # Return a tuple describing this training example:
    return (spectrogram_filename, mel_filename, n_frames, text, speaker_id)
def _process_utterance(out_dir, index, wav_path, pinyin):
    '''Preprocesses a single utterance audio/text pair.

    This writes the mel and linear scale spectrograms to disk and returns
    a tuple to write to the train.txt file.

    Args:
        out_dir: The directory to write the spectrograms into
        index: The numeric index to use in the spectrogram filenames.
        wav_path: Path to the audio file containing the speech input
        pinyin: The pinyin of the Chinese spoken in the input audio file

    Returns:
        A (spectrogram_filename, mel_filename, n_frames, pinyin) tuple to
        write to train.txt, or None if the clip is too long
    '''
    # Load the audio to a numpy array:
    wav = audio.load_wav(wav_path)

    # Rescale wav for a unified measure across all clips
    wav = wav / np.abs(wav).max() * 0.999

    # Trim silence
    wav = audio.trim_silence(wav)

    # Compute the linear-scale spectrogram from the wav:
    spectrogram = audio.spectrogram(wav).astype(np.float32)
    n_frames = spectrogram.shape[1]
    if n_frames > hp.max_frame_num:
        return None

    # Compute a mel-scale spectrogram from the wav:
    mel_spectrogram = audio.melspectrogram(wav).astype(np.float32)

    # Write the spectrograms to disk:
    spectrogram_filename = 'biaobei-spec-%05d.npy' % index
    mel_filename = 'biaobei-mel-%05d.npy' % index
    np.save(os.path.join(out_dir, spectrogram_filename), spectrogram.T,
            allow_pickle=False)
    np.save(os.path.join(out_dir, mel_filename), mel_spectrogram.T,
            allow_pickle=False)

    # Return a tuple describing this training example:
    return (spectrogram_filename, mel_filename, n_frames, pinyin)
def generate(model_path, model_name, generate_path, generate_name, piece):
    """Synthesize audio by passing a wav file through the autoencoder.

    Args:
        model_path: Directory containing the pretrained model.
        model_name: Name of the checkpoint to restore.
        generate_path: Directory to write the generated wav into.
        generate_name: Basename of the output wav file.
        piece: Path to the input wav file.
    """
    # Create directory for the generated audio
    if not os.path.exists(generate_path):
        os.makedirs(generate_path)

    net = AutoEncoder()
    net = load_model(net, model_path, model_name)

    cuda_available = torch.cuda.is_available()
    if cuda_available:
        net = net.cuda()
    net.eval()

    # Load audio and compute its spectrogram
    piece = audio.load_wav(piece)
    spec = audio.spectrogram(piece).astype(np.float32)
    spec = torch.from_numpy(spec.T)
    spec = torch.FloatTensor(spec)
    spec = torch.unsqueeze(spec, 0)
    # volatile=True is the legacy (pre-0.4) PyTorch way to disable autograd
    spec = Variable(spec, volatile=True).contiguous()
    if cuda_available:
        spec = spec.cuda()

    # Reconstruct the spectrogram and invert it back to a waveform
    generated_spec = net(spec)
    generated_spec = generated_spec.data.cpu().numpy()
    generated_spec = np.squeeze(generated_spec)
    waveform = audio.inv_spectrogram(generated_spec.T)

    wav_name = generate_path + generate_name + '.wav'
    audio.save_wav(waveform, wav_name)
def _process_utterance(wav_path):
    # Load the audio to a numpy array:
    wav = audio.load_wav(wav_path)
    wav, _ = librosa.effects.trim(wav, top_db=15)

    if hparams.rescaling:
        wav = wav / np.abs(wav).max() * hparams.rescaling_max

    # Compute the linear-scale spectrogram from the wav:
    spectrogram = audio.spectrogram(wav).astype(np.float32)

    # Compute a mel-scale spectrogram from the wav:
    mel_spectrogram = audio.melspectrogram(wav).astype(np.float32)

    # Return the transposed (frames-first) spectrograms:
    return spectrogram.T, mel_spectrogram.T
def _process_utterance(audio_path, data_dir, tokens, loss_coeff):
    audio_name = os.path.basename(audio_path)
    filename = audio_name.rsplit('.', 1)[0] + ".npz"
    numpy_path = os.path.join(data_dir, filename)

    if not os.path.exists(numpy_path):
        wav = load_audio(audio_path)

        try:
            linear_spectrogram = spectrogram(wav).astype(np.float32)
            mel_spectrogram = melspectrogram(wav).astype(np.float32)
        except Exception:
            return 0

        data = {
            "linear": linear_spectrogram.T,
            "mel": mel_spectrogram.T,
            "tokens": tokens,
            "loss_coeff": loss_coeff,
        }

        n_frame = linear_spectrogram.shape[1]

        if hparams.skip_inadequate:
            min_n_frame = hparams.reduction_factor * hparams.min_iters
            max_n_frame = (hparams.reduction_factor * hparams.max_iters -
                           hparams.reduction_factor)
            # Skip examples whose frame count or token count falls outside
            # the adequate range
            if not (min_n_frame <= n_frame <= max_n_frame) or \
                    len(tokens) < hparams.min_tokens:
                return None

        np.savez(numpy_path, **data, allow_pickle=False)
    else:
        try:
            data = np.load(numpy_path)
            n_frame = data["linear"].shape[0]
        except Exception:
            # Corrupt cache file: remove it and reprocess from scratch
            remove_file(numpy_path)
            return _process_utterance(audio_path, data_dir, tokens, loss_coeff)

    return n_frame
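# The adequacy bounds above are derived from the reduction factor and the
# decoder iteration limits. A worked example with illustrative hyperparameter
# values (assumptions, not taken from the code above):
reduction_factor, min_iters, max_iters = 5, 30, 200
min_n_frame = reduction_factor * min_iters                     # 150 frames
max_n_frame = reduction_factor * max_iters - reduction_factor  # 995 frames
# With these values an utterance is kept only if 150 <= n_frame <= 995 and
# it has at least hparams.min_tokens tokens.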
def _process_utterance(out_dir, in_dir, wav_name):
    '''Preprocesses a single utterance.

    This writes the mel and linear scale spectrograms to disk and returns
    a tuple to write to the train.txt file.

    Args:
        out_dir: The directory to write the spectrograms into
        in_dir: The directory containing the input wav files
        wav_name: Filename of the audio file containing the speech input

    Returns:
        A (spectrogram_filename, mel_filename, dur_filename, n_frames)
        tuple to write to train.txt
    '''
    # Load the audio to a numpy array:
    wav = audio.load_wav(os.path.join(in_dir, wav_name))

    if hparams.rescaling:
        wav = wav / np.abs(wav).max() * hparams.rescaling_max

    # Compute the linear-scale spectrogram from the wav:
    spectrogram = audio.spectrogram(wav).astype(np.float32)

    # Compute a mel-scale spectrogram from the wav:
    mel_spectrogram = audio.melspectrogram(wav).astype(np.float32)
    n_frames = mel_spectrogram.shape[1]

    # Write the spectrograms to disk:
    spectrogram_filename = 'spec-{}.npy'.format(wav_name.replace('.wav', ''))
    mel_filename = 'mel-{}.npy'.format(wav_name.replace('.wav', ''))
    # Note: the duration file is only named here; it is expected to be
    # written by a separate alignment step.
    dur_filename = 'dur-{}.npy'.format(wav_name.replace('.wav', ''))
    np.save(os.path.join(out_dir, spectrogram_filename), spectrogram.T,
            allow_pickle=False)
    np.save(os.path.join(out_dir, mel_filename), mel_spectrogram.T,
            allow_pickle=False)

    # Return a tuple describing this training example:
    return (spectrogram_filename, mel_filename, dur_filename, n_frames)
def get_mel(self, filename):
    if not self.load_mel_from_disk:
        wav, _ = librosa.load(filename, self.sampling_rate)
        wav = torch.from_numpy(wav).float().unsqueeze(0)
        audio_norm = torch.autograd.Variable(wav, requires_grad=False)
        melspec = self.stft.mel_spectrogram(audio_norm)
        melspec = torch.squeeze(melspec, 0)

        # Reload at the speaker-encoder sample rate and trim silence
        wav, sr = librosa.load(filename, sr=self.hparms.se_sample_rate)
        wav, _ = librosa.effects.trim(wav, top_db=20)
        audios = split_audio(wav, sr=self.hparms.se_sample_rate)
        mels = get_split_mels(audios, mel=self.hparms.num_mel)
        if len(mels) == 0:
            print(filename)
        mels = np.stack(mels)
        mels = torch.from_numpy(mels).float()
        mels = mels.permute(0, 2, 1)
        x, _ = self.speaker_encoder(mels, return_sim=False)
        speaker_encoder = x.mean(0)  # final speaker embedding for the audio

        # Reference spectrogram, as in GST
        spectrogram = audio.spectrogram(wav).astype(np.float32)
        spectrogram = spectrogram.transpose(1, 0)
    else:
        melspec = torch.from_numpy(np.load(filename))
        assert melspec.size(0) == self.stft.n_mel_channels, (
            'Mel dimension mismatch: given {}, expected {}'.format(
                melspec.size(0), self.stft.n_mel_channels))
        # Note: speaker_encoder and spectrogram are only computed in the
        # branch above, so this branch cannot reach the return statement
        # below without a NameError.

    return speaker_encoder, spectrogram, melspec
def _process_utterance(mag_dir, mel_dir, wav_path, text):
    """Preprocesses a single utterance audio/text pair.

    This writes the mel and linear scale spectrograms to disk and returns
    a tuple to write to the train.txt file.

    Args:
        mag_dir: The directory to write the log magnitude spectrograms into
        mel_dir: The directory to write the mel spectrograms into
        wav_path: Path to the audio file containing the speech input
        text: The text spoken in the input audio file

    Returns:
        A (filename, text, num_frames) tuple to write to train.txt
    """
    filename = os.path.splitext(os.path.basename(wav_path))[0]

    # Load the audio to a numpy array
    wav = audio.load_wav(wav_path)
    if cfg.rescaling:
        wav = wav / np.abs(wav).max() * cfg.rescaling_max

    # Compute the linear-scale spectrogram from the wav
    spectrogram = audio.spectrogram(wav).astype(np.float32)
    num_frames = spectrogram.shape[1]

    # Compute a mel-scale spectrogram from the wav
    mel_spectrogram = audio.melspectrogram(wav).astype(np.float32)

    # Write the spectrograms to disk
    np.save(os.path.join(mag_dir, filename + ".npy"), spectrogram.T,
            allow_pickle=False)
    np.save(os.path.join(mel_dir, filename + ".npy"), mel_spectrogram.T,
            allow_pickle=False)

    # Return a tuple describing this training example:
    return (filename, text, num_frames)
def _process_utterance(out_dir, index, wav_path, text):
    sr = hparams.sample_rate

    # Load the audio to a numpy array:
    wav = audio.load_wav(wav_path)

    lab_path = wav_path.replace("wav/", "lab/").replace(".wav", ".lab")
    # Trim silence from hts labels if available
    if exists(lab_path):
        labels = hts.load(lab_path)
        assert labels[0][-1] == "silB"
        assert labels[-1][-1] == "silE"
        b = int(labels[0][1] * 1e-7 * sr)
        e = int(labels[-1][0] * 1e-7 * sr)
        wav = wav[b:e]
    else:
        wav, _ = librosa.effects.trim(wav, top_db=30)

    # Compute the linear-scale spectrogram from the wav:
    spectrogram = audio.spectrogram(wav).astype(np.float32)
    n_frames = spectrogram.shape[1]

    # Compute a mel-scale spectrogram from the wav:
    mel_spectrogram = audio.melspectrogram(wav).astype(np.float32)

    # Write the spectrograms to disk:
    spectrogram_filename = 'jsut-spec-%05d.npy' % index
    mel_filename = 'jsut-mel-%05d.npy' % index
    np.save(os.path.join(out_dir, spectrogram_filename), spectrogram.T,
            allow_pickle=False)
    np.save(os.path.join(out_dir, mel_filename), mel_spectrogram.T,
            allow_pickle=False)

    # Return a tuple describing this training example:
    return (spectrogram_filename, mel_filename, n_frames, text)
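# The `1e-7 * sr` factor above converts HTS/HTK label times, which are in
# 100 ns units, into sample indices. A quick worked example with an
# illustrative label value (an assumption, not taken from the code above):
sr = 22050
silB_end = 4_500_000           # 100 ns units, i.e. 0.45 s
b = int(silB_end * 1e-7 * sr)  # 0.45 s * 22050 Hz
print(b)                       # 9922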
def _process_utterance(out_dir, index, wav_path, labels_path, text):
    # Load the wav file and trim silence from the ends:
    wav = audio.load_wav(wav_path)
    start_offset, end_offset = _parse_labels(labels_path)
    start = int(start_offset * hp.sr)
    end = int(end_offset * hp.sr) if end_offset is not None else -1
    wav = wav[start:end]

    # Skip utterances that are too long:
    max_samples = _max_out_length * hp.frame_shift * hp.sr
    if len(wav) > max_samples:
        return None

    spectrogram = audio.spectrogram(wav).astype(np.float32)
    n_frames = spectrogram.shape[1]
    mel_spectrogram = audio.melspectrogram(wav).astype(np.float32)

    spectrogram_filename = 'blizzard-spec-%05d.npy' % index
    mel_filename = 'blizzard-mel-%05d.npy' % index
    np.save(os.path.join(out_dir, spectrogram_filename), spectrogram.T,
            allow_pickle=False)
    np.save(os.path.join(out_dir, mel_filename), mel_spectrogram.T,
            allow_pickle=False)

    return (spectrogram_filename, mel_filename, n_frames, text)
def test_audio_conv(audio_path):
    # Audio file
    wav_path = os.path.join(hyperparams.dataset_path, audio_path + '.wav')
    wave = audio.read_audio(wav_path, hyperparams.sample_rate)
    audio_length = wave.shape[0] / hyperparams.sample_rate

    # Calculate spectra
    mel, linear = audio.spectrogram(hyperparams, wave)

    # Invert the mel back to linear scale and display both
    from_mel = audio.mel_to_linear(mel, (hyperparams.num_freq - 1) * 2,
                                   hyperparams.sample_rate,
                                   hyperparams.num_mels)
    plt.imshow(from_mel)
    plt.show()
    plt.imshow(linear)
    plt.show()

    # Reconstruct waveforms from the linear and mel spectra
    signal = audio.reconstruct(hyperparams, linear)
    audio.write_audio('test.wav', signal, hyperparams.sample_rate)
    signal = audio.reconstruct(hyperparams, mel, from_mel=True)
    audio.write_audio('test_mel.wav', signal, hyperparams.sample_rate)
def encode(model_name, piece, encoding_name):
    model_path = './restore/'
    encoding_path = './encoding/'

    # Create directory for encodings
    if not os.path.exists(encoding_path):
        os.makedirs(encoding_path)

    net = AutoEncoder()
    net = load_model(net, model_path, model_name)

    cuda_available = torch.cuda.is_available()
    if cuda_available:
        net = net.cuda()
    net.eval()

    # Load audio for encoding
    piece = audio.load_wav(piece)
    spec = audio.spectrogram(piece).astype(np.float32)
    spec = torch.from_numpy(spec.T)
    spec = torch.FloatTensor(spec)
    spec = torch.unsqueeze(spec, 0)
    # volatile=True is the legacy (pre-0.4) PyTorch way to disable autograd
    spec = Variable(spec, volatile=True).contiguous()
    if cuda_available:
        spec = spec.cuda()

    # Run the encoder half of the network
    encoding = net.encoder(spec)
    encoding = encoding.data.cpu().numpy()

    encoding_ndarray = encoding_path + encoding_name + '.npy'
    np.save(encoding_ndarray, encoding)
def preprocess(audio_dir, ndarray_dir, window_length):
    pattern = audio_dir + '*' + '.wav'
    file_list = glob.glob(pattern)
    for item in file_list:
        item_ndname = ndarray_dir + os.path.splitext(os.path.basename(item))[0]
        item = audio.load_wav(item)
        item_iter = 0
        # Slide a non-overlapping window over the waveform
        while len(item) > window_length:
            piece = item[:window_length - 1]
            spec = audio.spectrogram(piece).astype(np.float32)
            item = item[window_length:]
            # np.savez appends '.npz' if the name lacks it, so name the
            # archive '.npz' directly
            new_item_ndname = item_ndname + str(item_iter) + '.npz'
            np.savez(new_item_ndname, piece=piece, spec=spec)
            item_iter += 1
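# This stage chains with prepare_data earlier in this file: preprocess chops
# wavs into (piece, spec) archives, and prepare_data turns each archive into
# time-stretched (input, target) encoding pairs. A hedged sketch of the
# driver; the directory names and window length are illustrative assumptions.
audio_dir = './data/wav/'
ndarray_dir = './data/npz/'
window_length = 16384  # ~0.74 s at 22.05 kHz

preprocess(audio_dir, ndarray_dir, window_length)
prepare_data(ndarray_dir)  # writes (input, target) pairs under ./data/emb/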
def wav2linear_for_ppg_cbhg(wav_arr):
    return spectrogram(wav_arr)['magnitude']
#****************************
wav_path = './p225/1.wav'
wav = audio.load_wav(wav_path)

melspectrogram = audio.melspectrogram(wav).astype(np.float32)  # (80, 448)
n_frames = melspectrogram.shape[1]
print("melspectrogram.shape = ", melspectrogram.shape)
print("n_frames = ", n_frames)

mag = audio._mel_to_linear(melspectrogram)
print("mag.shape = ", mag.shape)      # mag.shape = (1025, 448)

orisp = audio.spectrogram(wav)
print("orisp.shape = ", orisp.shape)  # orisp.shape = (1025, 448)

wav = audio._griffin_lim(orisp)
audio.save_wav(wav, './ori-sp-to-wav.wav')

# wav = melspectrogram2wav(melspectrogram)
# audio.save_wav(wav, './hello-taco.wav')

# Expected output:
#   melspectrogram.shape = (80, 448)
#   n_frames = 448
#   mag.shape = (1025, 448)
#   orisp.shape = (1025, 448)