def write_lmdb(out_file_name, data_list):
    lmdb_output = lmdb.open(out_file_name, map_size=get_map_size(data_list))
    with lmdb_output.begin(write=True) as txn:  # txn is a Transaction object
        for audio_indx, audio_path in enumerate(tqdm(data_list)):
            if 'mixture' not in audio_path:
                continue
            # loop over mixtures and load the matching vocals track
            mixed_data = load_wav(audio_path).astype('float32')
            vocals_data = load_wav(
                audio_path.replace('mixture', 'vocals')).astype('float32')
            '''
            # to remove zeros from mixed and vocals based on vocals
            silent_set = get_silent_set(vocals_data)
            mixed_data = remove_silence(mixed_data, silent_set)
            vocals_data = remove_silence(vocals_data, silent_set)
            '''
            vocals_indices = get_sequence_with_singing_indices(
                vocals_data, 800)
            datum = datanum_pb2.DataNum()
            datum.mixture = mixed_data.tobytes()
            datum.vocals = vocals_data.tobytes()
            # used to store the indices having voice
            datum.vocals_indices = vocals_indices.tobytes()
            str_id = '{:08}'.format(audio_indx)
            txn.put(str_id.encode('ascii'), datum.SerializeToString())
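# A minimal read-back sketch for the LMDB written above (not from the source):
# it assumes the same generated datanum_pb2.DataNum schema that write_lmdb
# uses, with mixture and vocals stored as float32 bytes.
import lmdb
import numpy as np


def read_lmdb_entry(db_path, index):
    env = lmdb.open(db_path, readonly=True, lock=False)
    with env.begin() as txn:
        raw = txn.get('{:08}'.format(index).encode('ascii'))
    datum = datanum_pb2.DataNum()
    datum.ParseFromString(raw)
    mixture = np.frombuffer(datum.mixture, dtype='float32')
    vocals = np.frombuffer(datum.vocals, dtype='float32')
    return mixture, vocals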
def __getitem__(self, index):
    filename = self.files[index]
    filepath = os.path.join(config.TEST_DIR_PATH, filename)
    wave = load_wav(filepath)
    if self.transform:
        wave = self.transform(wave)
    return wave, filepath
def synthesize_one(text, speaker='Aiyue', model_path='', with_alignment=False):
    if _mellotron is None:
        load_model_mellotron(model_path)
    text_encoded = torch.LongTensor(
        transform_text(text, text_cleaners='hanzi'))[None, :].to(_device)
    speaker_id = torch.LongTensor(
        transform_speaker('', speaker_ids={})).to(_device)
    style_input = 0
    # pitch_contour = torch.ones(1, _hparams.prenet_f0_dim,
    #                            text_encoded.shape[1] * 5,
    #                            dtype=torch.float) * np.random.random()
    # pitch_contour = None
    wav = load_wav(str(speaker), sr=_hparams.sampling_rate)
    embed = transform_embed(wav, _encoder_model_fpath)
    embed = embed[::embed.shape[0] // _hparams.prenet_f0_dim]
    if embed.shape[0] != _hparams.prenet_f0_dim:
        embed = embed[:_hparams.prenet_f0_dim]
    f0 = np.tile(embed, (text_encoded.shape[1] * 5, 1)).T
    pitch_contour = torch.from_numpy(f0[None])
    with torch.no_grad():
        mel_outputs, mel_outputs_postnet, gate_outputs, alignments = \
            _mellotron.inference(
                (text_encoded, style_input, speaker_id, pitch_contour))
    out_mel = mel_outputs_postnet.data.cpu().numpy()[0]
    if with_alignment:
        return out_mel, alignments[0]
    return out_mel
def preprocess_from_path(dataset_dir, metadata_filename, output_dir,
                         num_workers=1, tqdm=lambda x: x):
    """Preprocess wavs step by step:
    Load -> Remove silences -> Divide into chunks -> Extract features

    Returns: list of metadata samples
    """
    print("Start preprocess_from_path...")
    executor = ProcessPoolExecutor(max_workers=num_workers)
    futures = []
    with open(os.path.join(dataset_dir, metadata_filename),
              encoding="utf-8") as f:
        for line in f:
            parts = line.strip().split('\t')
            wav_name = parts[0]
            target_class = get_label_number(parts[-1])
            wav_path = os.path.join(dataset_dir, 'audio', wav_name)  # test_audio
            if os.path.isfile(wav_path):
                wav = utils.remove_all_silence(utils.load_wav(wav_path))
                index = 0
                for (start, end) in utils.windows(wav, hparams.window_size):
                    chunk = wav[start:end]
                    if len(chunk) != hparams.window_size:
                        chunk = utils.pad_chunk(chunk, wav)
                    futures.append(
                        executor.submit(
                            partial(_process_utterance, output_dir, chunk,
                                    wav_name, target_class, index)))
                    index += 1
    results = [future.result() for future in tqdm(futures)]
    return [r for r in results if r is not None]
def process_wav(wav_path, audio_path, mel_path, params):
    wav = load_wav(wav_path,
                   sample_rate=params["preprocessing"]["sample_rate"])
    # peak-normalize to 0.999
    wav = wav / np.abs(wav).max() * 0.999
    mel = melspectrogram(wav,
                         sample_rate=params["preprocessing"]["sample_rate"],
                         num_mels=params["preprocessing"]["num_mels"],
                         num_fft=params["preprocessing"]["num_fft"],
                         preemph=params["preprocessing"]["preemph"],
                         min_level_db=params["preprocessing"]["min_level_db"],
                         hop_length=params["preprocessing"]["hop_length"],
                         win_length=params["preprocessing"]["win_length"],
                         fmin=params["preprocessing"]["fmin"])
    # pad the wav so it covers an exact number of mel frames
    length_diff = len(mel) * params["preprocessing"]["hop_length"] - len(wav)
    wav = np.pad(wav, (0, length_diff), "constant")
    pad = (params["vocoder"]["sample_frames"]
           - params["vocoder"]["audio_slice_frames"]) // 2
    mel = np.pad(mel, ((pad,), (0,)), "constant")
    wav = np.pad(wav, (pad * params["preprocessing"]["hop_length"],),
                 "constant")
    wav = mulaw_encode(wav, mu=2 ** params["preprocessing"]["bits"])
    speaker = os.path.splitext(os.path.split(wav_path)[-1])[0].split("_")[0]
    np.save(audio_path, wav)
    np.save(mel_path, mel)
    return speaker, audio_path, mel_path, len(mel)
def process_wav(dataset, wav_path, audio_path, mel_path, params):
    """Convert wav_path into a speaker id and save the processed audio and
    mel-spectrogram to the given paths.
    """
    # auto-resample based on params (internally, librosa)
    wav = load_wav(wav_path,
                   sample_rate=params["preprocessing"]["sample_rate"])
    # peak-normalize to 0.999
    wav = wav / np.abs(wav).max() * 0.999
    mel = melspectrogram(wav,
                         sample_rate=params["preprocessing"]["sample_rate"],
                         preemph=params["preprocessing"]["preemph"],
                         num_mels=params["preprocessing"]["num_mels"],
                         num_fft=params["preprocessing"]["num_fft"],
                         min_level_db=params["preprocessing"]["min_level_db"],
                         hop_length=params["preprocessing"]["hop_length"],
                         win_length=params["preprocessing"]["win_length"],
                         fmin=params["preprocessing"]["fmin"])
    length_diff = len(mel) * params["preprocessing"]["hop_length"] - len(wav)
    wav = np.pad(wav, (0, length_diff), "constant")
    pad = (params["vocoder"]["sample_frames"]
           - params["vocoder"]["audio_slice_frames"]) // 2
    mel = np.pad(mel, ((pad,), (0,)), "constant")
    wav = np.pad(wav, (pad * params["preprocessing"]["hop_length"],),
                 "constant")
    wav = mulaw_encode(wav, mu=2 ** params["preprocessing"]["bits"])
    # speaker id acquisition
    speaker = get_speakerid(wav_path, dataset)
    # save processed data
    np.save(audio_path, wav)
    np.save(mel_path, mel)
    return speaker, audio_path, mel_path, len(mel)
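# A reference sketch of the mu-law companding assumed by the mulaw_encode
# calls above (standard formula, quantized to integer classes). The actual
# implementation in the source's utils may differ in rounding and level count.
import numpy as np


def mulaw_encode_sketch(x, mu):
    # compand [-1, 1] audio, emphasizing small amplitudes
    y = np.sign(x) * np.log1p(mu * np.abs(x)) / np.log1p(mu)
    # quantize to integer classes in [0, mu]
    return np.floor((y + 1) / 2 * mu + 0.5).astype(np.int64)


def mulaw_decode_sketch(y, mu):
    # invert the quantization, then expand back to [-1, 1]
    x = 2 * y.astype(np.float64) / mu - 1
    return np.sign(x) * np.expm1(np.abs(x) * np.log1p(mu)) / mu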
def evaluate(args):
    x = tf.placeholder("float", [None, hparams.n_steps, hparams.n_input],
                       name="x")
    bias = tf.Variable(tf.random_normal([hparams.n_classes]), name="bias")
    weight = tf.Variable(
        tf.truncated_normal([hparams.n_hidden, hparams.n_classes],
                            stddev=0.1), name="weights")
    prediction = RNN(x, weight, bias)
    sess = tf.Session()
    sess.run(tf.global_variables_initializer())
    saver = tf.train.Saver()
    saver.restore(sess, args.path_to_model)
    list_of_files = get_files(args.test_data_dir)
    for file in list_of_files:
        wav = utils.remove_all_silence(
            utils.load_wav(os.path.join(args.test_data_dir, file)))
        features = preprocess_one(wav)
        # Prediction for one example
        pred = sess.run(prediction, feed_dict={x: features})
        join_pred = np.around(np.sum(pred, axis=0) / pred.shape[0],
                              decimals=3)
        # Write to file
        with open("result.txt", "a") as f:
            f.write(file + '\t' + '{0:.3f}'.format(np.max(join_pred)) + '\t'
                    + get_text_label(np.argmax(join_pred)) + '\n')
def transcribe(m, input, output, threshold):
    import magenta.music as mm
    import data
    wav = utils.load_wav(input, cfg.SAMPLE_RATE)
    frames, _ = data.audio2frame(wav, cfg.FRAME_SIZE, cfg.SPECTROGRAM)
    onset, _ = m.predict(frames, threshold)
    sequence = data.matrix2sequence(onset[0], onset=onset[0])
    mm.sequence_proto_to_midi_file(sequence, output)
def __getitem__(self, index):
    '''Loads the indexed wav file and returns the mixed signal, both
    sources, and the mixture length.'''
    wavfile = self.wavfiles[index]
    mixed, s1, s2 = load_wav(wavfile, sr=self.sr)
    return mixed, s1, s2, len(mixed)
def load_noise_waves():
    noise_waves = []
    noise_directory = os.path.join(config.TRAIN_DIR_PATH,
                                   '_background_noise_')
    for filepath in sorted(os.listdir(noise_directory)):
        if not filepath.endswith('.wav'):
            continue
        wave = load_wav(os.path.join(noise_directory, filepath))
        noise_waves.append(wave)
    return noise_waves
def __getitem__(self, index):
    i = self.dataset_index[index]
    if i == 'silence':
        label = label_to_idx['silence']
        wave = self.silence_wave
    else:
        filepath, label, user_id = self.data[i]
        wave = load_wav(filepath)
    if self.transform:
        wave = self.transform(wave)
    return wave, label
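# A hypothetical transform for the datasets above: mix a random crop of
# background noise (e.g. from load_noise_waves) into a training wave. The
# function name and the volume default are illustrative, not from the source.
import random
import numpy as np


def add_background_noise(wave, noise_waves, volume=0.1):
    noise = random.choice(noise_waves)
    start = random.randint(0, max(0, len(noise) - len(wave)))
    crop = noise[start:start + len(wave)]
    if len(crop) < len(wave):
        # pad in case the noise clip is shorter than the wave
        crop = np.pad(crop, (0, len(wave) - len(crop)), "constant")
    return wave + volume * crop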
def load_samples(self):
    """Loads samples from the folder ./drum_samples/, which should contain
    the files bass_drum.wav, snare_drum.wav and hi_hat.wav.
    """
    path = 'drum_samples/'
    fs_bass, bass_drum = load_wav(path + 'bass_drum.wav')
    self._bass_drum = stereo_to_mono(bass_drum)
    fs_snare, snare_drum = load_wav(path + 'snare_drum.wav')
    self._snare_drum = stereo_to_mono(snare_drum)
    fs_hi_hat, hi_hat = load_wav(path + 'hi_hat.wav')
    self._hi_hat = stereo_to_mono(hi_hat)
    # Check that the loaded samples match the configured sampling frequency
    error_msg = ' sample does not have matching sample frequency'
    assert fs_bass == self._fs, 'Bass drum' + error_msg
    assert fs_snare == self._fs, 'Snare drum' + error_msg
    assert fs_hi_hat == self._fs, 'Hi-hat' + error_msg
def augmantation_from_path(dataset_dir, metadata_filename, output_dir,
                           current_class, augmantation_amount,
                           num_workers=1, tqdm=lambda x: x):
    """Augment wavs step by step:
    Load -> Remove silences -> Random start -> Divide into chunks -> Extract features

    Returns: list of metadata samples
    """
    print("Start augmantation_from_path...")
    executor = ProcessPoolExecutor(max_workers=num_workers)
    futures = []
    aug_index = 0
    while_loop = True
    while while_loop:
        with open(os.path.join(dataset_dir, metadata_filename),
                  encoding="utf-8") as f:
            for line in f:
                parts = line.strip().split('\t')
                wav_name = parts[0]
                target_class = get_label_number(parts[-1])
                if target_class == current_class:
                    wav_path = os.path.join(dataset_dir, 'audio', wav_name)
                    if os.path.isfile(wav_path):
                        wav = utils.remove_all_silence(
                            utils.load_wav(wav_path))
                        # random start offset (up to 22050 samples)
                        wav = wav[np.random.randint(1, 22050):]
                        index = 0
                        for (start, end) in utils.windows(
                                wav, hparams.window_size):
                            if aug_index >= augmantation_amount:
                                while_loop = False
                                break
                            chunk = wav[start:end]
                            if len(chunk) != hparams.window_size:
                                chunk = utils.pad_chunk(chunk, wav)
                            futures.append(
                                executor.submit(
                                    partial(
                                        _process_utterance, output_dir, chunk,
                                        "aug-%s-%s" % (aug_index, wav_name),
                                        target_class, index)))
                            index += 1
                            aug_index += 1
    results = [future.result() for future in tqdm(futures)]
    return [r for r in results if r is not None]
def gen_from_wav(model, wav, output):
    utterance_id = os.path.basename(wav).split(".")[0]
    wav = load_wav(wav, params["preprocessing"]["sample_rate"], trim=False)
    wav = wav / np.abs(wav).max() * 0.999
    mel = melspectrogram(wav,
                         sample_rate=params["preprocessing"]["sample_rate"],
                         preemph=params["preprocessing"]["preemph"],
                         num_mels=params["preprocessing"]["num_mels"],
                         num_fft=params["preprocessing"]["num_fft"],
                         min_level_db=params["preprocessing"]["min_level_db"],
                         ref_level_db=params["preprocessing"]["ref_level_db"],
                         hop_length=params["preprocessing"]["hop_length"],
                         fmin=params["preprocessing"]["fmin"],
                         fmax=params["preprocessing"]["fmax"])
    gen_from_mel(model, mel, output)
def pad2drums(read_from_fname, save_to_fname):
    """Reads a .wav file recorded from a drum pad (with the mic about 10 cm
    away) from the folder "raw_audio" and converts it to a .wav file with
    drum sounds in place of the pad sounds. The created file is placed in
    the folder "results".
    """
    load_path = 'raw_audio/'
    fs, raw_audio = load_wav(load_path + read_from_fname)
    # Detect the pad hits in the raw audio
    hit_indices, hit_strengths = detect_sound(raw_audio, stereo=True)
    dg = DrumGenerator(fs=fs)
    drum_audio = dg.generate_drum_audio(hit_indices, hit_strengths,
                                        raw_audio.size)
    # Save drum_audio under the file name supplied by the user
    save_path = 'results/' + save_to_fname
    save_wav(save_path, drum_audio, fs)
def delete_error_audio(path):
    sounds_path = []
    for root, dirs, files in os.walk(path):
        for file in files:
            sound_path = os.path.join(root, file)
            if sound_path.endswith(('.wav', '.m4a')):
                sounds_path.append(sound_path)
    for audio_path in tqdm(sounds_path):
        try:
            wav = utils.load_wav(audio_path, sr=16000, mode='train')
            linear_spect = utils.lin_spectogram_from_wav(wav, 160, 400, 512)
            mag, _ = librosa.magphase(linear_spect)  # magnitude
            mag_T = mag.T
            freq, time = mag_T.shape
            if time <= 250:
                # os.remove(audio_path)
                print('Audio too short, deleting: %s' % audio_path)
        except Exception:
            # os.remove(audio_path)
            print('Audio error, deleting: %s' % audio_path)
def transform(ENV, args):
    train_wav_files, train_phn_files = load_wavPhn(ENV.train_data)
    test_wav_files = load_wav(ENV.test_data)
    train_output_path = os.path.join(ENV.output, 'train')
    test_output_path = os.path.join(ENV.output, 'test')
    if not os.path.exists(train_output_path):
        os.makedirs(train_output_path)
    if not os.path.exists(test_output_path):
        os.makedirs(test_output_path)
    for i in tqdm(range(len(train_wav_files))):
        transform_wav(train_wav_files[i], train_output_path)
        phn_file = os.path.join(
            os.path.dirname(train_wav_files[i]),
            os.path.basename(train_wav_files[i]).split('.')[0] + '.phn')
        copy_phn(phn_file, train_output_path)
    for i in tqdm(range(len(test_wav_files))):
        transform_wav(test_wav_files[i], test_output_path)
        phn_file = os.path.join(
            os.path.dirname(test_wav_files[i]),
            os.path.basename(test_wav_files[i]).split('.')[0] + '.phn')
        copy_phn(phn_file, test_output_path)
def load_from_browser(self, fpath=None):
    if fpath is None:
        fpath = Path(self.datasets_root, self.ui.current_dataset_name,
                     self.ui.current_src_spk, self.ui.current_utterance_name)
        name = str(fpath.relative_to(self.datasets_root))
        speaker_name = self.ui.current_dataset_name + '_' + \
            self.ui.current_src_spk
        # Select the next utterance
        if self.ui.auto_next_checkbox.isChecked():
            self.ui.browser_select_next()
    elif fpath == "":
        return
    else:
        name = fpath.name
        speaker_name = fpath.parent.name
    # Get the wav from the disk. We take the wav with the vocoder/synthesizer
    # format for playback, so as to have a fair comparison with the generated
    # audio
    wav = utils.load_wav(str(fpath))
    self.ui.log("Loaded %s" % name)
    self.add_real_utterance(wav, cfg.data.sample_rate, name, speaker_name)
all_wav_path = glob.glob(os.path.join(data_root, models[0], '*.wav'))
logf0_dict = {'m2m': [], 'm2f': [], 'f2m': [], 'f2f': []}
print(" [*] {} start!".format('GT'))
for wav_path in tqdm.tqdm(all_wav_path):
    wav_name = os.path.basename(wav_path)
    pattern = r"p[0-9]+_[0-9]+"
    src, trg = re.findall(pattern, wav_name)
    wav_path_gt = os.path.join(data_root, 'GT', trg + '.wav')
    wav_gt = load_wav(wav_path_gt, 22050)
    # frame_period: 256-sample hop expressed in milliseconds at 22050 Hz
    logf0_gt = get_logf0(wav_gt, 22050,
                         frame_period=(256 / (0.001 * 22050)))
    logf0_gt = speaker_norm(logf0_gt)
    src_spk = src.split('_')[0]
    trg_spk = trg.split('_')[0]
    each_dict = {'GT': logf0_gt}
    for m in models:
        temp = os.path.join(data_root, m, wav_name)
        wav = load_wav(temp, 22050)
        logf0 = get_logf0(wav, 22050, frame_period=(256 / (0.001 * 22050)))
        logf0 = speaker_norm(logf0)
        each_dict[m] = logf0
def main():
    parser = argparse.ArgumentParser('PreprocessingParser')
    parser.add_argument('--data_dir', type=str, help='data root directory')
    parser.add_argument('--save_dir', type=str,
                        help='extracted feature save directory')
    parser.add_argument('--dev_rate', type=float, help='dev set rate',
                        default=0.05)
    parser.add_argument('--test_rate', type=float, help='test set rate',
                        default=0.05)
    args = parser.parse_args()

    # args validation
    if args.dev_rate < 0 or args.dev_rate >= 1:
        raise ValueError('dev rate should be in [0, 1)')
    if args.test_rate < 0 or args.test_rate >= 1:
        raise ValueError('test rate should be in [0, 1)')
    if args.test_rate + args.dev_rate >= 1:
        raise ValueError('dev rate + test rate should not be >= 1.')
    if not os.path.isdir(args.data_dir):
        raise FileNotFoundError('Directory {} not found!'.format(
            args.data_dir))
    if not os.path.isdir(args.save_dir):
        os.makedirs(args.save_dir)
    mel_dir = os.path.join(args.save_dir, 'mels')
    os.makedirs(mel_dir, exist_ok=True)
    linear_dir = os.path.join(args.save_dir, 'linears')
    os.makedirs(linear_dir, exist_ok=True)
    f0_dir = os.path.join(args.save_dir, 'f0s')
    os.makedirs(f0_dir, exist_ok=True)
    ppg_dir = os.path.join(args.save_dir, 'ppgs')
    os.makedirs(ppg_dir, exist_ok=True)
    for mode in ['train', 'dev', 'test']:
        meta_path = os.path.join(args.save_dir, "{}_meta.csv".format(mode))
        if os.path.isfile(meta_path):
            os.remove(meta_path)
    wav_files = []
    for rootdir, subdir, files in os.walk(args.data_dir):
        for f in files:
            if f.endswith('.wav'):
                wav_files.append(os.path.join(rootdir, f))
    random.shuffle(wav_files)

    print('Set up PPGs extraction network')
    # Set up network
    ppg_extractor_hps = hps.PPGExtractor.CNNBLSTMClassifier
    mfcc_pl = tf.placeholder(dtype=tf.float32,
                             shape=[None, None, 3 * hps.Audio.n_mfcc],
                             name='mfcc_pl')
    ppg_extractor = CNNBLSTMClassifier(
        out_dims=hps.Audio.ppg_dim,
        n_cnn=ppg_extractor_hps.n_cnn,
        cnn_hidden=ppg_extractor_hps.cnn_hidden,
        cnn_kernel=ppg_extractor_hps.cnn_kernel,
        n_blstm=ppg_extractor_hps.n_blstm,
        lstm_hidden=ppg_extractor_hps.lstm_hidden)
    predicted_ppgs = ppg_extractor(inputs=mfcc_pl)['logits']
    # set up a session
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    sess.run(tf.global_variables_initializer())
    # load saved model
    saver = tf.train.Saver()
    print('Restoring ppgs extractor from {}'.format(ppg_extractor_hps.ckpt))
    saver.restore(sess, ppg_extractor_hps.ckpt)

    print('Extracting mel-spectrograms, spectrograms and log-f0s...')
    train_set = []
    dev_set = []
    test_set = []
    dev_start_idx = int(len(wav_files) * (1 - args.dev_rate - args.test_rate))
    test_start_idx = int(len(wav_files) * (1 - args.test_rate))
    for i, wav_f in tqdm(enumerate(wav_files)):
        try:
            wav_arr = load_wav(wav_f)
        except Exception:
            continue
        pre_emphasized_wav = _preemphasize(wav_arr)
        fid = '{}_{}'.format(
            wav_f.split('/')[-3].split('_')[2],
            wav_f.split('/')[-1].split('.')[0].split('_')[1])
        # extract mel-spectrograms
        mel_fn = os.path.join(mel_dir, '{}.npy'.format(fid))
        try:
            mel_spec = melspectrogram(pre_emphasized_wav).astype(np.float32).T
        except Exception:
            continue
        # extract spectrograms
        linear_fn = os.path.join(linear_dir, '{}.npy'.format(fid))
        try:
            linear_spec = spectrogram(pre_emphasized_wav).astype(np.float32).T
        except Exception:
            continue
        # extract log-f0s
        f0_fn = os.path.join(f0_dir, '{}.npy'.format(fid))
        log_f0 = logf0(wav_f)
        try:
            log_f0 = lf0_normailze(log_f0)
        except Exception:
            continue
        # extract ppgs
        mfcc_feats = wav2unnormalized_mfcc(wav_arr)
        ppg = sess.run(predicted_ppgs,
                       feed_dict={mfcc_pl: np.expand_dims(mfcc_feats,
                                                          axis=0)})
        ppg = softmax(np.squeeze(ppg, axis=0))
        ppg_fn = os.path.join(ppg_dir, '{}.npy'.format(fid))
        # save features to their respective directories
        mel_spec, linear_spec, log_f0, ppg = length_validate(
            (mel_spec, linear_spec, log_f0, ppg))
        np.save(mel_fn, mel_spec)
        np.save(linear_fn, linear_spec)
        np.save(f0_fn, log_f0)
        np.save(ppg_fn, ppg)
        # write to csv
        if i < dev_start_idx:
            train_set.append(fid)
            with open(os.path.join(args.save_dir, 'train_meta.csv'), 'a',
                      encoding='utf-8') as train_f:
                train_f.write(
                    '{}|ppgs/{}.npy|mels/{}.npy|linears/{}.npy|f0s/{}.npy\n'
                    .format(fid, fid, fid, fid, fid))
        elif i < test_start_idx:
            dev_set.append(fid)
            with open(os.path.join(args.save_dir, 'dev_meta.csv'), 'a',
                      encoding='utf-8') as dev_f:
                dev_f.write(
                    '{}|ppgs/{}.npy|mels/{}.npy|linears/{}.npy|f0s/{}.npy\n'
                    .format(fid, fid, fid, fid, fid))
        else:
            test_set.append(fid)
            with open(os.path.join(args.save_dir, 'test_meta.csv'), 'a',
                      encoding='utf-8') as test_f:
                test_f.write(
                    '{}|ppgs/{}.npy|mels/{}.npy|linears/{}.npy|f0s/{}.npy\n'
                    .format(fid, fid, fid, fid, fid))
    print('Done extracting features!')
    return
def load_utterance(self, spk_name, path):
    wav = utils.load_wav(path)
    return Utterance(wav, cfg.data.sample_rate, path=path, spk_name=spk_name)
def get_map_size(files):
    # rough LMDB map-size estimate: ten times the byte size of the first
    # wav, scaled by the number of files plus some headroom
    return load_wav(files[0]).nbytes * 10 * (len(files) + 2)
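# Usage sketch tying get_map_size to the write_lmdb function above; the
# MUSDB-style glob pattern and output path are assumptions, not from the
# source.
import glob

if __name__ == '__main__':
    mixture_files = sorted(glob.glob('musdb/train/*/mixture.wav'))
    write_lmdb('train_lmdb', mixture_files)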
    embedding_dim=params["vocoder"]["embedding_dim"],
    rnn_channels=params["vocoder"]["rnn_channels"],
    fc_channels=params["vocoder"]["fc_channels"],
    bits=params["preprocessing"]["bits"],
    hop_length=params["preprocessing"]["hop_length"],
    nc=args.nc,
    device=device)
model.to(device)
print("Load checkpoint from: {}:".format(args.checkpoint))
checkpoint = torch.load(args.checkpoint,
                        map_location=lambda storage, loc: storage)
model.load_state_dict(checkpoint["model"])
model_step = checkpoint["step"]
wav = load_wav(args.wav_path, params["preprocessing"]["sample_rate"])
utterance_id = os.path.basename(args.wav_path).split(".")[0]
wav = wav / np.abs(wav).max() * 0.999
mel = melspectrogram(wav,
                     sample_rate=params["preprocessing"]["sample_rate"],
                     preemph=params["preprocessing"]["preemph"],
                     num_mels=params["preprocessing"]["num_mels"],
                     num_fft=params["preprocessing"]["num_fft"],
                     min_level_db=params["preprocessing"]["min_level_db"],
                     hop_length=params["preprocessing"]["hop_length"],
                     win_length=params["preprocessing"]["win_length"],
                     fmin=params["preprocessing"]["fmin"])
mel = torch.FloatTensor(mel).unsqueeze(0).to(device)
output = model.generate(mel)
path = os.path.join(
    args.gen_dir,
m2m_gt = []
m2f_gt = []
f2m_gt = []
f2f_gt = []
print(" [*] {} start!".format('GT'))
for wav_path in tqdm.tqdm(all_wav_path):
    wav_name = os.path.basename(wav_path)
    pattern = r"p[0-9]+_[0-9]+"
    src, trg = re.findall(pattern, wav_name)
    wav_path_gt = os.path.join(data_root, 'GT', trg + '.wav')
    src_spk = src.split('_')[0]
    trg_spk = trg.split('_')[0]
    wav = load_wav(wav_path_gt, 22050)
    logf0 = get_logf0(wav, 22050, frame_period=(256 / (0.001 * 22050)))
    # keep only voiced frames (logf0 > 0)
    if src_spk in M and trg_spk in M:
        m2m_gt.append(logf0[logf0 > 0])
    elif src_spk in M and trg_spk in F:
        m2f_gt.append(logf0[logf0 > 0])
    elif src_spk in F and trg_spk in M:
        f2m_gt.append(logf0[logf0 > 0])
    elif src_spk in F and trg_spk in F:
        f2f_gt.append(logf0[logf0 > 0])
logf0_dict['m2m'] = np.concatenate(m2m_gt)
logf0_dict['m2f'] = np.concatenate(m2f_gt)
logf0_dict['f2m'] = np.concatenate(f2m_gt)
logf0_dict['f2f'] = np.concatenate(f2f_gt)
os.chdir(proj_dir + '/data/train/wav')
wavfiles = glob.glob('*.wav')
trainfiles = []
os.chdir(proj_dir + '/data/train/mid')
for wname in wavfiles:
    base_fname = wname.split('_')[0]
    trainfiles += [('data/train/wav/' + wname,
                    'data/train/mid/' + base_fname + '.mid')]

# preprocess test
xs, ys = [], []
os.chdir(proj_dir)
for wav, mid in testfiles:
    # do constant-q transform on the wav file
    wavdata = utils.load_wav(wav)
    # cqt_windows = utils.cqt_windows(wavdata, 7, hop_length=hop_len)
    cqt_windows = utils.cqt(wavdata)
    savefile = 'data/test/preprocessed/' + wav.split('/')[-1]
    np.save(savefile, cqt_windows)
    xs.append(cqt_windows)
    print('wrote {}.npy\n dimensions: {}'.format(savefile,
                                                 cqt_windows.shape),
          file=sys.stderr)
    pm = pmidi.PrettyMIDI(mid)
    t = librosa.frames_to_time(np.arange(cqt_windows.shape[1]),
                               sr=sample_rate, hop_length=hop_len)
    piano_roll = pm.get_piano_roll(fs=sample_rate, times=t)
    savefile = 'data/test/preprocessed/' + mid.split('/')[-1]
    np.save(savefile, piano_roll)
def example_wav():
    wav = load_wav(
        os.path.join(os.path.dirname(__file__),
                     "../datasets/test/example/example.wav"))
    assert len(wav.shape) == 1
    return wav
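# A pytest-style usage sketch, assuming example_wav above is registered as a
# fixture (its decorator is not shown in the source); the checks here are
# illustrative only.
def test_example_wav_is_mono(example_wav):
    assert example_wav.ndim == 1
    assert len(example_wav) > 0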
def main():
    hps = Hparams
    parser = argparse.ArgumentParser('VC inference')
    parser.add_argument('--src_wav', type=str, help='source wav file path')
    parser.add_argument('--ckpt', type=str, help='model ckpt path')
    parser.add_argument('--save_dir', type=str,
                        help='synthesized wav save directory')
    args = parser.parse_args()

    # 0. load the source wav
    src_wav_arr = load_wav(args.src_wav)
    pre_emphasized_wav = _preemphasize(src_wav_arr)

    # 1. extract ppgs
    ppg_extractor_hps = hps.PPGExtractor.CNNBLSTMClassifier
    mfcc_pl = tf.placeholder(dtype=tf.float32,
                             shape=[None, None, 3 * hps.Audio.n_mfcc],
                             name='mfcc_pl')
    ppg_extractor = CNNBLSTMClassifier(
        out_dims=hps.Audio.ppg_dim,
        n_cnn=ppg_extractor_hps.n_cnn,
        cnn_hidden=ppg_extractor_hps.cnn_hidden,
        cnn_kernel=ppg_extractor_hps.cnn_kernel,
        n_blstm=ppg_extractor_hps.n_blstm,
        lstm_hidden=ppg_extractor_hps.lstm_hidden)
    predicted_ppgs = ppg_extractor(inputs=mfcc_pl)['logits']
    # set up a session
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    sess.run(tf.global_variables_initializer())
    # load saved model
    saver = tf.train.Saver()
    print('Restoring ppgs extractor from {}'.format(ppg_extractor_hps.ckpt))
    saver.restore(sess, ppg_extractor_hps.ckpt)
    mfcc_feats = wav2unnormalized_mfcc(src_wav_arr)
    ppg = sess.run(predicted_ppgs,
                   feed_dict={mfcc_pl: np.expand_dims(mfcc_feats, axis=0)})
    sess.close()
    ppg = softmax(np.squeeze(ppg, axis=0))

    # 2. extract lf0 and mel-spectrogram
    log_f0 = logf0(args.src_wav)
    log_f0 = lf0_normailze(log_f0)
    # the mel-spectrogram is extracted for comparison
    mel_spec = melspectrogram(pre_emphasized_wav).astype(np.float32).T

    # 3. prepare inputs
    min_len = min(log_f0.shape[0], ppg.shape[0])
    vc_inputs = np.concatenate([ppg[:min_len, :], log_f0[:min_len, :]],
                               axis=1)
    vc_inputs = np.expand_dims(vc_inputs, axis=1)  # [time, batch, dim]

    # 4. set up the VC model and do the inference
    model = BLSTMConversionModel(
        in_channels=hps.Audio.ppg_dim + 2,
        out_channels=hps.Audio.num_mels,
        lstm_hidden=hps.BLSTMConversionModel.lstm_hidden)
    device = torch.device('cpu')
    model.load_state_dict(torch.load(args.ckpt, map_location=device))
    model.eval()
    predicted_mels = model(torch.tensor(vc_inputs))
    predicted_mels = np.squeeze(predicted_mels.detach().numpy(), axis=1)

    # 5. synthesize wav
    synthesized_wav = inv_preemphasize(inv_mel_spectrogram(predicted_mels.T))
    resynthesized_wav = inv_preemphasize(inv_mel_spectrogram(mel_spec.T))
    ckpt_name = args.ckpt.split('/')[-1].split('.')[0]
    wav_name = args.src_wav.split('/')[-1].split('.')[0]
    save_wav(synthesized_wav,
             os.path.join(args.save_dir,
                          '{}-{}-converted.wav'.format(wav_name, ckpt_name)))
    save_wav(resynthesized_wav,
             os.path.join(args.save_dir,
                          '{}-{}-src-resyn.wav'.format(wav_name, ckpt_name)))
    return
parser.add_argument(
    'weight_path',
    help="Path of checkpoint (ex: ./result/weights/wavenet_0800)")
args = parser.parse_args()


def synthesize(mel_sp, save_path, weight_path):
    wavenet = WaveNet(hparams.num_mels, hparams.upsample_scales)
    wavenet.load_weights(weight_path)
    mel_sp = tf.expand_dims(mel_sp, axis=0)
    outputs = wavenet.synthesis(mel_sp)
    outputs = np.squeeze(outputs)
    outputs = inv_mulaw_quantize(outputs)
    save_wav(outputs, save_path, hparams.sampling_rate)


if __name__ == '__main__':
    wav = load_wav(args.input_path, hparams.sampling_rate)
    wav = normalize(wav) * 0.95
    mel_sp = melspectrogram(wav, hparams.sampling_rate, hparams.num_mels,
                            n_fft=hparams.n_fft, hop_size=hparams.hop_size,
                            win_size=hparams.win_size)
    synthesize(mel_sp, args.output_path, args.weight_path)
def convert(src_wav_dir, trg_wav_file):
    all_src_wav_files = glob.glob(f'{src_wav_dir}/*.wav')
    # This regex for src_wav_files creates about 20 output files to get a
    # good sample without taking too much time or memory. It can be altered
    # (including setting to a single file or all_src_wav_files) to create
    # fewer/more output files.
    src_wav_files = glob.glob(f'{src_wav_dir}/p???_0[01][0-9].wav')
    src_wavs = [
        utils.load_wav(src_wav_file, utils.SAMPLING_RATE)
        for src_wav_file in src_wav_files
    ]
    trg_wav = utils.load_wav(trg_wav_file, utils.SAMPLING_RATE)
    trg_wav_name = splitext(basename(trg_wav_file))[0]
    converted_dir = VCTK_PATH.joinpath('converted_audio',
                                       'trg_' + trg_wav_name)
    os.makedirs(converted_dir, exist_ok=True)
    src_stats = get_stats(all_src_wav_files)
    trg_stats = get_stats([trg_wav_file])
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    G = get_model(device)
    _, _, trg_sp, _ = utils.world_decompose(wav=trg_wav,
                                            fs=utils.SAMPLING_RATE,
                                            frame_period=utils.FRAME_PERIOD)
    trg_coded_sp = utils.world_encode_spectral_envelop(
        sp=trg_sp, fs=utils.SAMPLING_RATE, dim=utils.NUM_MCEP)
    trg_coded_sp_norm = ((trg_coded_sp - trg_stats['coded_sps_mean'])
                         / trg_stats['coded_sps_std'])
    assert trg_coded_sp_norm.shape[0] >= 8192
    trg_coded_sp_norm = trg_coded_sp_norm[:8192, :]
    trg_coded_sp_norm_tensor = torch.FloatTensor(
        trg_coded_sp_norm.T).unsqueeze_(0).unsqueeze_(1).to(device)
    trg_embed = G.trg_downsample(trg_coded_sp_norm_tensor)
    with torch.no_grad():
        for i, src_wav in enumerate(tqdm(src_wavs)):
            f0, _, sp, ap = utils.world_decompose(
                wav=src_wav, fs=utils.SAMPLING_RATE,
                frame_period=utils.FRAME_PERIOD)
            coded_sp = utils.world_encode_spectral_envelop(
                sp=sp, fs=utils.SAMPLING_RATE, dim=utils.NUM_MCEP)
            f0_converted = utils.pitch_conversion(
                f0=f0,
                mean_log_src=src_stats['log_f0s_mean'],
                std_log_src=src_stats['log_f0s_std'],
                mean_log_target=trg_stats['log_f0s_mean'],
                std_log_target=trg_stats['log_f0s_std'])
            coded_sp_norm = ((coded_sp - src_stats['coded_sps_mean'])
                             / src_stats['coded_sps_std'])
            coded_sp_norm_tensor = torch.FloatTensor(
                coded_sp_norm.T).unsqueeze_(0).unsqueeze_(1).to(device)
            # coded_sp_converted_norm = G(coded_sp_norm_tensor,
            #                             trg_embed).data.cpu().numpy()
            coded_sp_converted_norm = G.forward_with_trg_embed(
                coded_sp_norm_tensor, trg_embed)
            coded_sp_converted_norm = \
                coded_sp_converted_norm.data.cpu().numpy()
            coded_sp_converted = np.squeeze(coded_sp_converted_norm).T
            coded_sp_converted = (coded_sp_converted
                                  * trg_stats['coded_sps_std']
                                  + trg_stats['coded_sps_mean'])
            coded_sp_converted = np.ascontiguousarray(coded_sp_converted)
            coded_sp_converted = coded_sp_converted.astype('double')
            wav_transformed = utils.world_speech_synthesis(
                f0=f0_converted, coded_sp=coded_sp_converted, ap=ap,
                fs=utils.SAMPLING_RATE, frame_period=utils.FRAME_PERIOD)
            output_path = converted_dir.joinpath(
                'src_' + os.path.basename(src_wav_files[i]))
            print(f'Saving to {output_path}')
            librosa.output.write_wav(output_path, wav_transformed,
                                     utils.SAMPLING_RATE)
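# Hypothetical invocation of convert above; the VCTK-style speaker directory
# and target utterance paths are illustrative, not from the source.
if __name__ == '__main__':
    convert('VCTK-Corpus/wav48/p225', 'VCTK-Corpus/wav48/p226/p226_003.wav')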
input_root = args.input_root
output_root = args.output_root
os.makedirs(output_root, exist_ok=True)
input_dir_names = sorted(os.listdir(input_root))
gpu = args.gpu

# ================ separation ================
for idx, f in enumerate(input_dir_names):
    input_dir = os.path.join(input_root, f)
    print("Processing {}...".format(input_dir), end="")
    save_dir = os.path.join(output_root, f)
    os.makedirs(save_dir, exist_ok=True)

    # Input data and resample
    mix = utils.load_wav(input_dir, STFTPara['fs'])
    ns = mix.shape[1]

    # STFT
    frames_ = np.floor((mix.shape[0] + 2 * STFTPara['window_shift'])
                       / STFTPara['window_shift'])  # to meet NOLA
    frames = int(np.ceil(frames_ / 8) * 8)
    X = np.zeros((int(STFTPara['window_size'] / 2 + 1), int(frames),
                  mix.shape[1]), dtype=complex)
    for n in range(mix.shape[1]):
        f, t, X[:, :int(frames_), n] = signal.stft(
            mix[:, n], nperseg=STFTPara['window_size'],
            window=STFTPara['type'],