def process_sounds(self):
    '''
    Processes downloaded files below self.root after running download_files().
    DEPRECATED Don't use this for the pretrained VGGish!
    TODO: this should go to preprocessing if kept at all
    '''
    self.info_df = self.df[['gen', 'id']].copy()
    for path, dirs, files in os.walk(self.root):
        for file in files:
            if file.endswith('.mp3'):
                y, sr = load(os.path.join(path, file))
                if self.convert_to_wav:
                    write_wav(
                        os.path.join(path, file.replace('.mp3', '.wav')),
                        y, self.input_sr)
                if self.make_mel_spec:
                    S = librosa.feature.melspectrogram(
                        y, sr=self.sr, n_mels=self.n_mels,
                        hop_length=self.hop_length)
                    log_S = librosa.amplitude_to_db(S, ref=np.max)
                    np.save(os.path.join(path, 'mel_spec.npy'), log_S)
                    if self.save_img:
                        scipy.misc.imsave(
                            os.path.join(path, 'mel_spec.jpg'), log_S)
                    if self.extract_chunks:
                        if log_S.shape[1] < self.len_chunks:
                            print('recording {} has length {} which is shorter '
                                  'than the required chunk length.'.format(
                                      file, log_S.shape[1]))
                            continue
                        self.spec_chunks(log_S, path=path)
    self.info_df.to_csv(os.path.join(self.root, 'info.csv'), sep='\t')
def reconstruct(spectrogram):
    # remove the padding from the speech
    spectrogram = spectrogram[:feature_size, :feature_size].transpose()
    # recombine the real and imaginary components
    spectrogram = spectrogram[:257, :] + 1j * spectrogram[257:, :]
    # re-construct audio from the spectrogram
    wav = istft(spectrogram)
    write_wav("target.wav", wav, sr=44100)
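# A minimal sketch of producing the stacked-spectrogram input that
# reconstruct() above expects: the real part of a 257-bin STFT (n_fft=512)
# stacked on top of the imaginary part, transposed to (frames, 514), then
# zero-padded to a (feature_size, feature_size) square. The helper name and
# the feature_size value are assumptions, not part of the original code.
import numpy as np
from librosa.core import load, stft

def make_model_input(path, feature_size=514):
    y, _ = load(path, sr=44100)
    spec = stft(y, n_fft=512)  # complex, shape (257, n_frames)
    stacked = np.vstack((spec.real, spec.imag)).transpose()  # (n_frames, 514)
    padded = np.zeros((feature_size, feature_size), dtype=np.float32)
    n_frames = min(feature_size, stacked.shape[0])
    padded[:n_frames, :] = stacked[:n_frames, :]
    return padded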
def __call__(self, n_samples, sample_length, cond, speaker):
    print('Generate', n_samples, 'of length', sample_length)
    samples = self.generate(n_samples, sample_length,
                            cond, speaker).cpu().numpy()
    for i in range(n_samples):
        print(self.filename)
        write_wav(self.filename, samples[i, :], sr=self.sample_rate)
def test(direction=direction, began=began, model_dir=model_dir,
         test_dir=test_dir, sr=sr, n_features=n_features,
         frame_period=frame_period):
    outputs_dir = "./sample"

    if began:
        model = CycleBeGAN(num_features=n_features, mode="test")
        model.load(os.path.join(model_dir, "Cycle_BeGan"))
    else:
        model = CycleGAN(num_features=n_features, mode="test")
        model.load(os.path.join(model_dir, "CycleGan"))

    mcep = np.load(os.path.join("./", 'mcep.npz'))
    mcep_mean_A = mcep['A_mean']
    mcep_std_A = mcep['A_std']
    mcep_mean_B = mcep['B_mean']
    mcep_std_B = mcep['B_std']

    logf0s = np.load(os.path.join("./", 'logf0s.npz'))
    logf0s_mean_A = logf0s['A_mean']
    logf0s_std_A = logf0s['A_std']
    logf0s_mean_B = logf0s['B_mean']
    logf0s_std_B = logf0s['B_std']

    if not os.path.exists(outputs_dir):
        os.mkdir(outputs_dir)

    file_list = librosa.util.find_files(test_dir, ext="wav")

    for file in file_list:
        wav, _ = load(file, sr=sr)
        wav = wav_padding(wav=wav, sr=sr, frame_period=frame_period, multiple=4)
        f0, timeaxis, sp, ap = world_decompose(wav=wav, fs=sr,
                                               frame_period=frame_period)
        coded_sp = world_encode_spectral_envelop(sp=sp, fs=sr, dim=n_features)
        coded_sp_transposed = coded_sp.T

        if direction == "A2B":
            f0_converted = pitch_conversion(
                f0=f0,
                mean_log_src=logf0s_mean_A, std_log_src=logf0s_std_A,
                mean_log_target=logf0s_mean_B, std_log_target=logf0s_std_B)
            #f0_converted = f0
            coded_sp_norm = (coded_sp_transposed - mcep_mean_A) / mcep_std_A
            coded_sp_converted_norm = model.test(
                inputs=np.array([coded_sp_norm]), direction=direction)[0]
            coded_sp_converted = coded_sp_converted_norm * mcep_std_B + mcep_mean_B
        else:  # B2A
            f0_converted = pitch_conversion(
                f0=f0,
                mean_log_src=logf0s_mean_B, std_log_src=logf0s_std_B,
                mean_log_target=logf0s_mean_A, std_log_target=logf0s_std_A)
            #f0_converted = f0
            coded_sp_norm = (coded_sp_transposed - mcep_mean_B) / mcep_std_B
            coded_sp_converted_norm = model.test(
                inputs=np.array([coded_sp_norm]), direction=direction)[0]
            coded_sp_converted = coded_sp_converted_norm * mcep_std_A + mcep_mean_A

        coded_sp_converted = coded_sp_converted.T
        coded_sp_converted = np.ascontiguousarray(coded_sp_converted)
        decoded_sp_converted = world_decode_spectral_envelop(
            coded_sp=coded_sp_converted, fs=sr)
        wav_transformed = world_speech_synthesis(
            f0=f0_converted, decoded_sp=decoded_sp_converted,
            ap=ap, fs=sr, frame_period=frame_period)
        write_wav(os.path.join(outputs_dir, os.path.basename(file)),
                  wav_transformed, sr)


if __name__ == "__main__":
    test(direction=direction)
    print("Done!")
def separate(PATH_INPUT, PATH_OUTPUT, MODEL, SR=16000, FFT_SIZE=1024, H=512):
    if os.path.isdir(PATH_INPUT):
        # if the input is a directory, build a file list
        filelist_mixdown = find_files(PATH_INPUT, ext="wav", case_sensitive=True)
    else:
        # the input is a single file
        filelist_mixdown = [PATH_INPUT]
    print('number of mixdown file', len(filelist_mixdown))

    # create the output directory if it does not exist
    _, path_output_ext = os.path.splitext(PATH_OUTPUT)
    print('path_output_ext', path_output_ext)
    if len(path_output_ext) == 0 and not os.path.exists(PATH_OUTPUT):
        os.mkdir(PATH_OUTPUT)

    # load the model
    unet = train.UNet()
    chainer.serializers.load_npz(MODEL, unet)
    config.train = False
    config.enable_backprop = False

    # load each mixture and try to separate the vocal (speech)
    for fmixdown in filelist_mixdown:
        # if audioread raises an error, fall back to scipy
        try:
            y_mixdown, _ = load(fmixdown, sr=SR, mono=True)
        except:
            sr_mixdown, y_mixdown = read(fmixdown)
            if not sr_mixdown == SR:
                y_mixdown = resample(y_mixdown, sr_mixdown, SR)

        # compute the short-time spectrum of the input and normalize it
        spec = stft(y_mixdown, n_fft=FFT_SIZE, hop_length=H, win_length=FFT_SIZE)
        mag = np.abs(spec)
        mag /= np.max(mag)
        phase = np.exp(1.j * np.angle(spec))
        print('mag.shape', mag.shape)
        start = 0
        # choose a value at most the number of input frames that suits
        # the network definition
        end = 128 * (mag.shape[1] // 128)

        # estimate the mask that separates the speech (vocal)
        mask = unet(mag[:, start:end][np.newaxis, np.newaxis, 1:, :]).data[0, 0, :, :]
        mask = np.vstack((np.zeros(mask.shape[1], dtype="float32"), mask))

        # apply the mask to the input spectrum and synthesize the waveform
        # with the inverse FFT
        mag2 = mag[:, start:end] * mask
        phase2 = phase[:, start:end]
        y = istft(mag2 * phase2, hop_length=H, win_length=FFT_SIZE)

        # save the separated speech (vocal) as an output file
        if len(path_output_ext) == 0:
            # output into the directory
            foutname, _ = os.path.splitext(os.path.basename(fmixdown))
            fname = os.path.join(PATH_OUTPUT, (foutname + '.wav'))
        else:
            # output to the specified file
            fname = PATH_OUTPUT
        print('saving... ', fname)
        write_wav(fname, y, SR, norm=True)
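# A minimal usage sketch for separate(); the paths and model file name are
# placeholders, and the checkpoint must match the train.UNet() architecture:
if __name__ == '__main__':
    separate('./mixdown_wavs', './separated', './unet.model',
             SR=16000, FFT_SIZE=1024, H=512)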
def epoch(self, epoch_index):
    samples = self.generate(self.n_samples, self.sample_length) \
                  .cpu().float().numpy()
    for i in range(self.n_samples):
        write_wav(os.path.join(self.samples_path,
                               self.pattern.format(epoch_index, i + 1)),
                  samples[i, :], sr=self.sample_rate, norm=True)
def process_single_file(self, file_name):
    mixture, _ = load(os.path.join(self.input_dir, file_name + '.wav'),
                      sr=self.samplerate_hz)
    speaker_signals = self.separate_single_mixture(mixture)
    write_wav(os.path.join(self.output_dir, 's1', file_name + '.wav'),
              speaker_signals[0, :], self.samplerate_hz, norm=True)
    write_wav(os.path.join(self.output_dir, 's2', file_name + '.wav'),
              speaker_signals[1, :], self.samplerate_hz, norm=True)
def generate_autodrive(vae, dataset, n_files=1, out=None, preprocessing=None,
                       transformOptions=None, start="random", n_start=10,
                       n_loops=10, projections=None):
    for j in range(n_files):
        check_dir('%s/autodrive' % out)
        #sequence_length = loaded_data['script_args'].sequence

        # get starting point
        device = next(vae.parameters()).device
        if start == "file":
            input_file = random.randrange(len(dataset))
            data_in, _ = dataset[input_file]
            data_in = vae.format_input_data(
                preprocessing(data_in[:n_start])).unsqueeze(0)
            z_in = vae.encode(data_in)[-1]['out_params'].mean
        elif start == "random":
            latent_in = vae.platent[-1]['dim']
            # draw random point
            z0 = torch.distributions.Normal(torch.zeros(1, latent_in),
                                            torch.ones(1, latent_in)).sample()
            # draw direction
            u = torch.distributions.Normal(torch.zeros(1, latent_in),
                                           torch.ones(1, latent_in)).sample()
            increments = torch.linspace(0, 1e-1, n_start).unsqueeze(0)
            z_in = (z0 + increments.t() @ u).unsqueeze(0).to(device=device)

        with torch.no_grad():
            for n in range(n_loops):
                prediction_out = vae.prediction_module({'z_enc': [z_in]})
                z_in = torch.cat([z_in, prediction_out['out']], 1)
            data_out = vae.decode(z_in)[0]["out_params"].mean
            data_out = data_out.squeeze().cpu()

        # plot things
        fig = plt.figure()
        if len(data_out.shape) == 1:
            plt.plot(data_out)
        else:
            plt.imshow(data_out, aspect="auto")
        fig.savefig('%s/autodrive/drive_%d.pdf' % (out, j), format="pdf")
        plt.close('all')

        signal_out = inverseTransform(
            preprocessing.invert(data_out.squeeze().cpu().detach().numpy()),
            'stft', {'transformParameters': transformOptions},
            iterations=10, method='griffin-lim')
        write_wav('%s/autodrive/drive_%d.wav' % (out, j), signal_out,
                  transformOptions.get('resampleTo', 22050), norm=True)
def main(infile, outfile):
    fftsize = 1024
    hopsize = 256
    data, sr = load(infile, sr=None)
    spec = stft(data, fftsize, hopsize, 'hanning')
    output = istft(spec, hopsize, 'hanning')
    write_wav(outfile, output, sr)
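# A quick round-trip sanity check for the snippet above, assuming the custom
# stft/istft helpers it imports (their positional (data, fftsize, hopsize,
# window) signatures are not librosa's); file names are placeholders:
if __name__ == '__main__':
    import sys
    main(sys.argv[1], sys.argv[2])  # e.g. python roundtrip.py in.wav out.wav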
def main(infile, outfile, dur):
    fftsize = 1024
    hopsize = 256
    data, sr = load(infile, sr=None, duration=dur)
    pv = pva(data, fftsize, hopsize, sr, 'hanning')
    y = pvs(pv, hopsize, sr, 'hanning')
    write_wav(outfile, y, sr)
def output_wav(self, folder, filename):
    '''
    Small function to output a Load or Slice to a wav file
    :param folder: (string) | the folder to output to
    :param filename: (string) | filename to output as
    :return:
    '''
    audio = os.path.join(folder, filename)
    output.write_wav(audio, self.y, self.sr)
def callOnFile(wav, commands, wavPath, melPath, scKwargs={}):
    '''write <wav> to <wavPath>, call <commands>,
    then collect results from <melPath>'''
    if not VERBOSE_OUT:
        scKwargs['stdout'] = subprocess.DEVNULL
    if not VERBOSE_ERR:
        scKwargs['stderr'] = subprocess.DEVNULL
    write_wav(wavPath, wav[0], wav[1])
    ret = subprocess.call(commands, **scKwargs)
    assert ret == 0, f'return value: {ret} != 0'
    times, pitches = load_time_series(melPath, delimiter=r'\s+|,')
    return times, pitches
def wav_writer(samples, sample_rate, suffix, orig,
               newdir=None, subdir=None, verbose=True):
    '''
    Saves a wav in the same place as the original .wav file

    Inputs:
        samples: new samples to save
        sample_rate: sample rate at which to save the new wav
        suffix: suffix for the new filename
        orig: original filename of the .wav file
        newdir: a new directory to use instead of the original wav file's path
        subdir: name of a subdirectory to make in the original or new directory
        verbose: whether or not to print the filename

    Returns:
        the new filename
    '''
    filesplit = os.path.split(orig)

    # Get the path in which to save the wav
    if newdir:
        base_path = newdir
    else:
        base_path = filesplit[0]  # same path as the original file
    if subdir:
        base_path = os.path.join(base_path, subdir)

    # Make path if necessary
    try:
        os.mkdir(base_path)
    except FileExistsError:
        pass

    # Get the name by which to save the wav
    file_name = filesplit[1]
    base_name = f'{os.path.splitext(file_name)[0]}_{suffix}.wav'

    # Full path & filename by which wav should be saved
    file_path = os.path.join(base_path, base_name)
    try:
        write_wav(file_path, np.array(samples), sample_rate)
        if verbose:
            print(f'Saved file to {file_path}')
    except ParameterError:  # librosa.util.exceptions.ParameterError
        print(f'Skipping {file_path} due to ParameterError')
    return file_path
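# A minimal usage sketch for wav_writer(); the tone is synthetic and all
# paths are placeholders. This writes /tmp/augmented/original_tone.wav:
import numpy as np
sr = 22050
tone = 0.5 * np.sin(2 * np.pi * 440 * np.arange(sr) / sr)  # 1 s of 440 Hz
wav_writer(tone, sr, suffix='tone', orig='/tmp/original.wav',
           subdir='augmented')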
def _get_batches_of_transformed_samples(self, index_array):
    print("Batch index:", self.batch_index)
    index_array.sort()

    # find max size in batch
    filtered_df = self.dataframe_data.loc[
        self.dataframe_data.index.isin(index_array)]
    bigfile_in_batch = filtered_df.loc[filtered_df[2].idxmax()]
    max_audiosize_in_batch = int(bigfile_in_batch[2])

    # when stretching slows the audio down, scale max_audiosize_in_batch
    # by the stretch rate
    if self.audio_data_generator.stretch and (
            self.audio_data_generator.stretch < 1):
        max_audiosize_in_batch = int(
            max_audiosize_in_batch * (1 + self.audio_data_generator.stretch))

    # when a shift is applied, adjust max_audiosize_in_batch accordingly
    if self.audio_data_generator.shift:
        _, max_sr = load_audio(bigfile_in_batch[1])
        max_audiosize_in_batch = int(
            max_audiosize_in_batch + (self.audio_data_generator.shift * max_sr))

    batch_x = np.zeros((len(index_array),) + (max_audiosize_in_batch,),
                       dtype=backend.floatx())
    batch_y = [0] * len(index_array)

    for i, j in enumerate(index_array):
        current_audiofile = self.dataframe_data.iloc[j]
        y = current_audiofile[0]
        x, sr = load_audio(current_audiofile[1])
        if len(x) < max_audiosize_in_batch:
            x = np.pad(x, (0, max(0, int(max_audiosize_in_batch - len(x)))),
                       "constant", constant_values=(self.stuffing))
        x, sr = self.audio_data_generator.transform(x, sr)

        # optionally save augmented audio to disk for debugging purposes
        if self.save_to_dir:
            fname = '{prefix}_{index}_{hash}.wav'.format(
                prefix=self.save_prefix, index=j,
                hash=np.random.randint(1e7))
            write_wav(os.path.join(self.save_to_dir, fname), x, sr)

        batch_x[i] = x
        batch_y[i] = y
    return batch_x, batch_y
def main(infile, outfile, pitch, scale, transpose, dur_ratio):
    fftsize = 1024
    hopsize = 256
    data, sr = load(infile, sr=None)
    pv = ifd(data, fftsize, hopsize, sr, 'hanning')
    if transpose:
        pv = np.flip(pv, 1)
    output = addsyn(pv, 0, pitch, scale, int(hopsize * dur_ratio), sr)
    write_wav(outfile, output, sr)
def epoch(self, epoch_index):
    samples = self.generate(self.n_samples, self.sample_length) \
                  .cpu().float().numpy()
    for i in range(self.n_samples):
        if self.save_raw:
            samples.tofile('debug_seq_{}.csv'.format(epoch_index),
                           sep=',', format='%10.5f')
        write_wav(os.path.join(self.samples_path,
                               self.pattern.format(epoch_index, i + 1)),
                  samples[i, :], sr=self.sample_rate, norm=True)
def guess(self):
    wav, _ = load(self.WAVE_OUTPUT_FILENAME, sr=self.sr)
    wav, _ = trim(wav, top_db=self.top_db)
    write_wav(self.WAVE_OUTPUT_FILENAME, wav, self.sr)
    print(">> save as", self.WAVE_OUTPUT_FILENAME)

    # DTW recognition
    x = self.getMfcc(wav, self.sr)
    res = self.recognition(x)
    print(res)

    self.audio_num = self.audio_num + 1
    self.WAVE_OUTPUT_FILENAME = "./saved/" + str(self.audio_num) + ".wav"
def separate_whole_audio_data():
    unet = analyze.UNet()
    mag, phase, length = analyze.load_audio('audio/wav/fhana.wav')
    data = [[], []]
    start = time()
    for i in range(0, mag.shape[1], 1024):
        mask = analyze.compute_mask(unet, mag[:, i:i+1024])
        data[0].extend(analyze.save_audio(mag[:, i:i+1024] * mask,
                                          phase[:, i:i+1024]))
        data[1].extend(analyze.save_audio(mag[:, i:i+1024] * (1 - mask),
                                          phase[:, i:i+1024]))
    from librosa.output import write_wav
    for i in range(2):
        write_wav('data{0}.wav'.format(i), np.array(data[i][:length]),
                  16000, norm=True)
    print(time() - start)
def create_sin_wave_data(data_dir, num_files, seq_len,
                         sample_rate=16000, save_raw=False):
    os.makedirs(data_dir, exist_ok=True)
    dataset = get_batch(num_files, seq_len)
    for i in range(num_files):
        if save_raw:
            dataset[i].tofile('sin_{}.csv'.format(i), sep=',', format='%7.5f')
        write_wav(
            os.path.join(data_dir, 'sin_{}.wav'.format(i)),
            dataset[i, :],
            sr=sample_rate,
            norm=False
        )
def save_sound(output_path, waveform, sample_rate, normalize=True):
    # save waveform to a .wav sound file
    # example:
    #   save_sound('../output/output.wav', waveform, sample_rate)
    # ensure that the output directory exists
    Path(output_path).parent.mkdir(parents=True, exist_ok=True)
    from librosa.output import write_wav
    write_wav(output_path, waveform, sr=sample_rate, norm=normalize)
    print(output_path, 'saved')
    return
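# Usage sketch for save_sound(); the waveform is synthetic and the output
# path is a placeholder:
import numpy as np
sr = 22050
waveform = 0.5 * np.sin(2 * np.pi * 220 * np.arange(sr) / sr)
save_sound('output/tone.wav', waveform, sr)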
def epoch(self, epoch_index):
    samples = self.generate(self.n_samples, self.sample_length) \
                  .cpu().float().numpy()
    print("__epoch__")
    print(self.trainer.stats)
    for i in range(self.n_samples):
        file_path = os.path.join(
            self.samples_path,
            sample_file_path(
                epoch_index,
                self.trainer.iterations,
                self.trainer.stats["training_loss"]["last"].tolist(),
                i))
        write_wav(file_path, samples[i, :], sr=self.sample_rate, norm=True)
        if self._upload is not None:
            self._upload(file_path)
def SaveStereoAudio(fname, mag, phase, norm=True, save_path=None):
    y_l = istft(mag[0] * phase[0], hop_length=C.H,
                win_length=C.FFT_SIZE, window=C.WINDOW)
    y_r = istft(mag[1] * phase[1], hop_length=C.H,
                win_length=C.FFT_SIZE, window=C.WINDOW)
    stereo = np.array((y_l, y_r))
    if save_path is None:
        write_wav(C.PATH_MUSIC / fname, stereo, C.SR, norm=norm)
    else:
        write_wav(save_path / fname, stereo, C.SR, norm=norm)
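# A sketch of building the (2, freq, frames) mag/phase pair that
# SaveStereoAudio() expects, assuming the C config object defined elsewhere
# (C.SR, C.H, C.FFT_SIZE, C.WINDOW) and a stereo input file; the file name
# is a placeholder:
import numpy as np
from librosa.core import load, stft

y, _ = load('mix.wav', sr=C.SR, mono=False)  # y.shape == (2, n_samples)
specs = [stft(ch, n_fft=C.FFT_SIZE, hop_length=C.H, window=C.WINDOW)
         for ch in y]
mag = np.array([np.abs(s) for s in specs])
phase = np.array([np.exp(1j * np.angle(s)) for s in specs])
SaveStereoAudio('roundtrip.wav', mag, phase)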
def epoch(self, epoch_index):
    samples = self.generate(self.n_samples, self.sample_length) \
                  .cpu().float().numpy()
    print("__epoch__")
    print(self.trainer.stats)
    for i in range(self.n_samples):
        write_wav(os.path.join(
                      self.samples_path,
                      self.pattern.format(
                          epoch_index,
                          self.trainer.iterations,
                          self.trainer.stats["training_loss"]["last"].tolist(),
                          i + 1)),
                  samples[i, :], sr=self.sample_rate, norm=True)
def test():
    vis = Visualizer(env='svs')
    model = getattr(models, 'Unet')().eval()
    # model.cuda()
    model.load_state_dict(
        t.load('G:/Unet_svs/check/epoch_219__0724_16_57_35.pth'))

    mix_wav, _ = load("C:/Users/lenovo/Music/c.mp3", sr=8192)
    mix_wav_mag, mix_wav_phase = magphase(
        stft(mix_wav, n_fft=1024, hop_length=768))

    START = 700
    END = START + 128
    mix_wav_mag = mix_wav_mag[:, START:END]
    mix_wav_phase = mix_wav_phase[:, START:END]
    print(mix_wav_mag.shape)

    gg = mix_wav_mag[1:]
    gg = t.from_numpy(gg)
    gg.unsqueeze_(0)
    gg.unsqueeze_(0)
    vis.img('a', gg)
    print(gg.shape)

    with t.no_grad():
        gg = Variable(gg)
        score = model(gg)

    predict = gg.data * score.data
    print(predict.shape)

    target_pred_mag = predict.view(512, 128).cpu().numpy()
    target_pred_mag = np.vstack((np.zeros((128)), target_pred_mag))
    vis.img('b', t.from_numpy(target_pred_mag))
    print(target_pred_mag.shape)

    write_wav(
        'C:/Users/lenovo/Music/pred_vocal.wav',
        istft(
            target_pred_mag * mix_wav_phase,
            # (mix_wav_mag * target_pred_mag) * mix_wav_phase
            win_length=1024,
            hop_length=768),
        8192, norm=True)
    write_wav('C:/Users/lenovo/Music/pred_mix.wav',
              istft(mix_wav_mag * mix_wav_phase,
                    win_length=1024, hop_length=768),
              8192, norm=True)
def speedyspeech_tts(text_str, device_str):
    print('Loading model checkpoints')
    m = SpeedySpeech(device=device_str).load('models/speedyspeech.pth',
                                             device_str)
    m.eval()

    checkpoint = torch.load('models/melgan.pth', device_str)
    hp = HParam("mikuai/speedyspeech/melgan/config/default.yaml")
    melgan = Generator(hp.audio.n_mel_channels).to(device_str)
    melgan.load_state_dict(checkpoint["model_g"])
    melgan.eval(inference=False)

    print('Processing text')
    txt_processor = TextProcessor(HPText.graphemes,
                                  phonemize=HPText.use_phonemes)
    text = [text_str]

    phonemes, plen = txt_processor(text)
    # append more zeros - avoid cutoff at the end of the longest sequence
    phonemes = torch.cat((phonemes, torch.zeros(len(phonemes), 5).long()),
                         dim=-1)
    phonemes = phonemes.to('cpu')

    print('Synthesizing')
    # generate spectrograms
    with torch.no_grad():
        spec, durations = m((phonemes, plen))

    # invert to log(mel-spectrogram)
    spec = m.collate.norm.inverse(spec)

    # mask with the pad value expected by MelGAN
    msk = mask(spec.shape, durations.sum(dim=-1).long(), dim=1).to('cpu')
    spec = spec.masked_fill(~msk, -11.5129)

    # append more pad frames to improve the end of the longest sequence
    spec = torch.cat((spec.transpose(2, 1),
                      -11.5129 * torch.ones(len(spec), HPStft.n_mel,
                                            5).to('cpu')),
                     dim=-1)

    # generate audio
    with torch.no_grad():
        audio = melgan(spec).squeeze(1)

    print('Saving audio')
    # TODO: cut audios to proper length
    for i, a in enumerate(audio.detach().cpu().numpy()):
        write_wav('output.wav', a, HPStft.sample_rate, norm=False)
def main(infile, outfile):
    fftsize = 1024
    hopsize = 256
    data, sr = load(infile, sr=None)
    spec = stft(data, fftsize, hopsize, 'hanning')
    fr = sr / hopsize
    twopi = np.pi * 2
    delay = 0.0055 + 5e-3 * np.cos(np.arange(spec.shape[1]) * twopi * 0.1 / fr)
    for i in range(spec.shape[1]):
        spec[:, i] = specomb(spec[:, i], 0.6, delay[i], 0.9, sr)
    output = istft(spec, hopsize, 'hanning')
    write_wav(outfile, output, sr)
def prepare_wav(track_list, sub_dirname, args):
    dirpath = os.path.join(args.dst_dir, sub_dirname)
    if not os.path.exists(dirpath):
        os.makedirs(dirpath)
    for track in track_list:
        name = track.name
        rate = track.rate
        vocal = monauralize(track.sources['vocals'].audio)
        mix = monauralize(track.audio)
        path = os.path.join(dirpath, name)
        print(path)
        write_wav(path + '.wav', np.stack((vocal, mix)), rate)
def make_tone(x, f1, f2):
    # note: `i`, `SR`, and `BASE_FN` are module-level globals here
    y = gen_tone(x, f1, f2)
    tmpfn = "tmp.wav"
    tone_fn = path.join('wavs', 'tone%i.wav' % i)
    write_wav(tone_fn, y, SR, norm=True)
    reverbed_fn = path.join('wavs', 'reverbed_tone%i.wav' % i)
    cmd = "sox {} {} gain -3 reverb".format(tone_fn, reverbed_fn)
    check_call(cmd, shell=True)
    combined_fn = path.join('wavs', 'combined_tone%i.wav' % i)
    cmd = "sox {} {} pad 1 0".format(tone_fn, tmpfn)
    check_call(cmd, shell=True)
    cmd = "sox -m {} {} {}".format(BASE_FN, tmpfn, combined_fn)
    check_call(cmd, shell=True)
def generate(output_path=OUTPUT_PATH, summary_path=SUMMARY_PATH,
             hparams=HPARAMS):
    tf.logging.set_verbosity(tf.logging.INFO)
    assert os.path.exists(summary_path), 'Summary directory does not exist...'
    if not os.path.exists(output_path):
        os.mkdir(output_path)

    tf.logging.log(tf.logging.INFO, 'Build model...')
    model = wave_net.WaveNet(hparams)
    inputs = tf.placeholder(dtype=tf.int32, shape=[1, model.receptive_field])
    model.build(inputs)
    saver = tf.train.Saver(
        var_list=tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES))

    tf.logging.log(tf.logging.INFO, 'Start session...')
    with tf.Session() as sess:
        ckpt = tf.train.latest_checkpoint(summary_path)
        saver.restore(sess, ckpt)

        num_samples = hparams.seconds_to_generate * 16000
        initial_input = np.random.randint(low=0, high=hparams.bin_size,
                                          size=[1, model.receptive_field])
        samples = np.zeros([1, num_samples])
        samples[0, 0:model.receptive_field] = initial_input

        start = time.time()
        for i in range(model.receptive_field, num_samples):
            tf.logging.log_every_n(
                tf.logging.INFO,
                'Generated sample %d/%d' % (i, num_samples), n=100)
            generated = sess.run(
                model.generated,
                feed_dict={inputs: samples[:, (i - model.receptive_field):i]})
            samples[0, i] = generated[0, -1].argmax(axis=-1)
        end = time.time()
        print('Generated %d in %.1f seconds...' %
              (hparams.seconds_to_generate, end - start))

        audio = dequantize(samples, hparams.bin_size).squeeze()
        print('Write file...')
        write_wav(os.path.join(output_path, 'audio.wav'), audio, 16000)
def saveAudioBatch(data, path, basename, sr=16000, latents=None,
                   overwrite=False):
    from librosa.util.utils import ParameterError
    # outdata = resizeAudioTensor(data, orig_sr, target_sr)
    # taudio.save(path, outdata, sample_rate=target_sr)
    data = list(data)  # LW: it was a map

    # if there are no (or mismatched) latents, zip with zeros so that looping
    # over the enumeration still works
    if latents is not None and len(latents) == len(data):
        zdata = zip(data, latents)
        print("saveAudioBatch: zipping audio with latents "
              "(and will write param files)")
    else:
        zdata = zip(data, [0] * len(data))
        print("saveAudioBatch: zipping audio with naughts for latents "
              "(and will not write param files)")

    try:
        for i, (audio, params) in enumerate(zdata):
            # for i, audio in enumerate(data):  # LW
            if type(audio) != np.ndarray:
                audio = np.array(audio, float)

            out_path = os.path.join(path, f'{basename}_{i}.wav')
            # also get path/file names for the parameter pytorch and text files
            param_out_path = os.path.join(path, f'{basename}_{i}.pt')
            txt_param_out_path = os.path.join(path, f'{basename}_{i}.txt')

            if not os.path.exists(out_path) or overwrite:
                write_wav(out_path, audio.astype(float), sr)
                if latents is not None:
                    torch.save(params, param_out_path)
                    np.savetxt(txt_param_out_path, params.cpu().numpy())
            else:
                print(f"saveAudioBatch: File {out_path} exists. Skipping...")
                continue
    except ParameterError as pe:
        print(pe)
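# A minimal usage sketch for saveAudioBatch() with synthetic audio; the
# output directory is a placeholder and must exist before writing:
import os
import numpy as np
os.makedirs('./samples', exist_ok=True)
batch = [np.random.uniform(-1, 1, 16000) for _ in range(3)]
saveAudioBatch(batch, './samples', 'noise', sr=16000)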
# -*- coding: utf-8 -*-
from machineLearningHelper import getLearningArrays
from librosa.output import write_wav
import numpy as np

samplerate = 44100

instances, classifications = getLearningArrays(useToySounds=True)

screams = []
notScreams = []
numInstances = len(instances)
for i in range(numInstances):
    if classifications[i]:
        screams.extend(instances[i])
    else:
        notScreams.extend(instances[i])

screams = np.array(screams)
notScreams = np.array(notScreams)

write_wav('./test_sounds/concatenated_screams.wav', screams,
          samplerate, norm=False)
write_wav('./test_sounds/concatenated_not_screams.wav', notScreams,
          samplerate, norm=False)
def SaveAudio(fname, mag, phase):
    y = istft(mag * phase, hop_length=C.H, win_length=C.FFT_SIZE)
    write_wav(fname, y, C.SR, norm=True)
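# A mono counterpart sketch: derive mag/phase with librosa's magphase and
# round-trip through SaveAudio(). File names are placeholders and the C
# config is assumed to match the one used at analysis time:
from librosa.core import load, stft, magphase

y, _ = load('input.wav', sr=C.SR)
mag, phase = magphase(stft(y, n_fft=C.FFT_SIZE, hop_length=C.H))
SaveAudio('roundtrip.wav', mag, phase)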