def _maybe_convert_wav(data_dir, original_data, converted_data):
    source_dir = os.path.join(data_dir, original_data)
    target_dir = os.path.join(data_dir, converted_data)

    # Conditionally convert sph files to wav files
    if os.path.exists(target_dir):
        print("skipping maybe_convert_wav")
        return

    # Create target_dir
    os.makedirs(target_dir)

    # Loop over sph files in source_dir and convert each to 16-bit PCM wav
    for root, dirnames, filenames in os.walk(source_dir):
        for filename in fnmatch.filter(filenames, "*.sph"):
            for channel in ['1', '2']:
                sph_file = os.path.join(root, filename)
                wav_filename = os.path.splitext(os.path.basename(sph_file))[0] + "-" + channel + ".wav"
                wav_file = os.path.join(target_dir, wav_filename)
                temp_wav_filename = os.path.splitext(os.path.basename(sph_file))[0] + "-" + channel + "-temp.wav"
                temp_wav_file = os.path.join(target_dir, temp_wav_filename)
                print("converting {} to {}".format(sph_file, temp_wav_file))
                subprocess.check_call(["sph2pipe", "-c", channel, "-p", "-f", "rif",
                                       sph_file, temp_wav_file])
                print("upsampling {} to {}".format(temp_wav_file, wav_file))
                audioData, frameRate = librosa.load(temp_wav_file, sr=16000, mono=True)
                soundfile.write(wav_file, audioData, frameRate, "PCM_16")
                os.remove(temp_wav_file)
def save(filename_audio, filename_jam, jam, strict=True, **kwargs):
    '''Save a muda jam to disk

    Parameters
    ----------
    filename_audio: str
        The path to store the audio file

    filename_jam: str
        The path to store the jams object

    strict: bool
        Strict safety checking for jams output

    kwargs
        Additional parameters to `soundfile.write`
    '''
    y = jam.sandbox.muda._audio['y']
    sr = jam.sandbox.muda._audio['sr']

    # First, dump the audio file
    psf.write(filename_audio, y, sr, **kwargs)

    # Then dump the jam
    jam.save(filename_jam, strict=strict)
def saveTo(self, file):
    with ZipFile(file, 'w') as zip:
        song_file = configparser.ConfigParser()
        song_file['DEFAULT'] = {'volume': self.volume,
                                'bpm': self.bpm,
                                'beat_per_bar': self.beat_per_bar,
                                'width': self.width,
                                'height': self.height}
        for clip in self.clips:
            clip_file = {'name': clip.name,
                         'volume': str(clip.volume),
                         'frame_offset': str(clip.frame_offset),
                         'beat_offset': str(clip.beat_offset),
                         'beat_diviser': str(clip.beat_diviser),
                         'audio_file': basename(clip.audio_file)}
            if clip_file['audio_file'] is None:
                clip_file['audio_file'] = 'no-sound'
            song_file["%s/%s" % (clip.x, clip.y)] = clip_file

        buffer = StringIO()
        song_file.write(buffer)
        zip.writestr('metadata.ini', buffer.getvalue())

        for member in self.data:
            buffer = BytesIO()
            # soundfile.write expects (file, data, samplerate)
            sf.write(buffer, self.data[member], self.samplerate[member],
                     subtype=sf.default_subtype('WAV'), format='WAV')
            zip.writestr(member, buffer.getvalue())

    self.file_name = file
def Save(cls, filename, data):
    d = numpy.transpose(data.GetChannels())
    soundfile.write(data=d,
                    file="%s.%s" % (filename, cls.ending),
                    samplerate=int(round(data.GetSamplingRate())),
                    subtype=cls.encoding,
                    format=cls.format)
def test_write_int_data_to_float_file(file_inmemory):
    """This is a very uncommon use case."""
    sf.write(file_inmemory, data_mono, 44100, format='WAV', subtype='FLOAT')
    file_inmemory.seek(0)
    read, fs = sf.read(file_inmemory, always_2d=False, dtype='float32')
    assert np.all(read == data_mono)
    assert fs == 44100
def play_message(in_msg_fn): """ This method opens a decrypted in_msg and converts the data to an audio stream. Then, it simply reads in the frames of the audio file and writes the data to an output stream. In other words, it plays the message for you. """ try: in_msg = open(in_msg_fn, 'rb') data = pickle.load(in_msg) in_msg.close() print('Data pickled') except IOError: print("ERROR: Failed to open message file.") return sf.write(DECR_OUTPUT_FILENAME, data, samplerate=RATE) ########################################################################## # For now, I just want to make sure the WAV file is written successfully.# # Until then, this playback stuff will be on the backlog.################# ########################################################################## # wf = wave.open(DECR_OUTPUT_FILENAME, 'rb') # p = pyaudio.PyAudio() # stream = p.open(format=p.get_format_from_width(wf.getsampwidth()), # channels=wf.getnchannels(), # rate=wf.getframerate(), # output=True) # data = wf.readframes(CHUNK) # while data != '': # stream.write(data) # data = wf.readframes(CHUNK) # stream.stop_stream() # stream.close() # p.terminate() return DECR_OUTPUT_FILENAME
def __rubberband(y, sr, **kwargs): '''Execute rubberband Parameters ---------- y : np.ndarray [shape=(n,) or (n, c)] Audio time series, either single or multichannel sr : int > 0 sampling rate of y **kwargs keyword arguments to rubberband Returns ------- y_mod : np.ndarray [shape=(n,) or (n, c)] `y` after rubberband transformation ''' assert sr > 0 # Get the input and output tempfile fd, infile = tempfile.mkstemp(suffix='.wav') os.close(fd) fd, outfile = tempfile.mkstemp(suffix='.wav') os.close(fd) # dump the audio sf.write(infile, y, sr) try: # Execute rubberband arguments = ['rubberband', '-q'] for key, value in six.iteritems(kwargs): arguments.append(str(key)) arguments.append(str(value)) arguments.extend([infile, outfile]) subprocess.check_call(arguments) # Load the processed audio. y_out, _ = sf.read(outfile, always_2d=True) # make sure that output dimensions matches input if y.ndim == 1: y_out = np.squeeze(y_out) finally: # Remove temp files os.unlink(infile) os.unlink(outfile) pass return y_out
def play(self):
    log.debug('Play %r', self)
    # FIXME change adhoc play to universal
    fragment_filename = '/tmp/fragment.wav'
    sub.check_call(['rm', '-rf', fragment_filename])
    # soundfile.write expects (file, data, samplerate)
    sf.write(fragment_filename, self.samples, self.samplerate)
    sub.check_call(['play', fragment_filename])
def compute_combination(args):
    snr, signal, noise, target_rate, new_name, storage_name = args
    noisy_signal = signal * snrdb2ratio(snr, signal, noise) + noise
    noisy_signal = noisy_signal / peak(noisy_signal)
    soundfile.write(storage_name, noisy_signal, target_rate)
    shutil.copyfile(storage_name, new_name)
    #soundfile = al.Sndfile(new_name, 'w', al.Format('flac'), 1, target_rate)
    #soundfile.write_frames(noisy_signal)
    #soundfile.sync()
    print("Wrote", new_name)
def _save_estimates(self, user_estimates, track, estimates_dir):
    track_estimate_dir = op.join(estimates_dir, track.subset, track.filename)
    if not os.path.exists(track_estimate_dir):
        os.makedirs(track_estimate_dir)

    # write out tracks to disk
    for target, estimate in list(user_estimates.items()):
        target_path = op.join(track_estimate_dir, target + '.wav')
        sf.write(target_path, estimate, track.rate)
def onExportClip(self):
    if self.last_clip and self.last_clip.audio_file:
        audio_file = self.last_clip.audio_file
        file_name, a = self.getSaveFileName(
            'Export Clip : %s' % self.last_clip.name, 'WAVE (*.wav)')

        if file_name:
            file_name = verify_ext(file_name, 'wav')
            # soundfile.write expects (file, data, samplerate)
            sf.write(file_name, self.song.data[audio_file],
                     self.song.samplerate[audio_file],
                     subtype=sf.default_subtype('WAV'), format='WAV')
def main(): logdir, ckpt = os.path.split(args.checkpoint) arch = tf.gfile.Glob(os.path.join(logdir, 'architecture*.json'))[0] # should only be 1 file with open(arch) as fp: arch = json.load(fp) normalizer = Tanhize( xmax=np.fromfile('./etc/xmax.npf'), xmin=np.fromfile('./etc/xmin.npf'), ) features = read_whole_features(args.file_pattern.format(args.src)) x = normalizer.forward_process(features['sp']) x = nh_to_nchw(x) y_s = features['speaker'] y_t_id = tf.placeholder(dtype=tf.int64, shape=[1,]) y_t = y_t_id * tf.ones(shape=[tf.shape(x)[0],], dtype=tf.int64) machine = MODEL(arch) z = machine.encode(x) x_t = machine.decode(z, y_t) # NOTE: the API yields NHWC format x_t = tf.squeeze(x_t) x_t = normalizer.backward_process(x_t) # For sanity check (validation) x_s = machine.decode(z, y_s) x_s = tf.squeeze(x_s) x_s = normalizer.backward_process(x_s) f0_s = features['f0'] f0_t = convert_f0(f0_s, args.src, args.trg) output_dir = get_default_output(args.output_dir) saver = tf.train.Saver() sv = tf.train.Supervisor(logdir=output_dir) with sv.managed_session() as sess: load(saver, sess, logdir, ckpt=ckpt) while True: try: feat, f0, sp = sess.run( [features, f0_t, x_t], feed_dict={y_t_id: np.asarray([SPEAKERS.index(args.trg)])} ) feat.update({'sp': sp, 'f0': f0}) y = pw2wav(feat) oFilename = make_output_wav_name(output_dir, feat['filename']) sf.write(oFilename, y, FS) except: break
def write_audio_file(filepath, v_signal, fs, norm=0.98):
    '''
    norm: If None, no normalisation is applied. If it is a float number,
          it is the target value (absolute) for the normalisation.
    '''
    # Normalisation:
    if norm is not None:
        v_signal = norm * v_signal / np.max(np.abs(v_signal))  # default

    # Write:
    sf.write(filepath, v_signal, fs)
    return
def test_write_float_data_to_pcm_file(file_inmemory):
    float_to_clipped_int16 = [
        (-1.0 - 2**-15, -2**15    ),
        (-1.0         , -2**15    ),
        (-1.0 + 2**-15, -2**15 + 1),
        ( 0.0         ,  0        ),
        ( 1.0 - 2**-14,  2**15 - 2),
        ( 1.0 - 2**-15,  2**15 - 1),
        ( 1.0         ,  2**15 - 1),
    ]
    written, expected = zip(*float_to_clipped_int16)
    sf.write(file_inmemory, written, 44100, format='WAV', subtype='PCM_16')
    file_inmemory.seek(0)
    read, fs = sf.read(file_inmemory, dtype='int16')
    assert np.all(read == expected)
    assert fs == 44100
def main(): args = get_args() if args.lin: cFreq = makeLinearCFs(args.band, args.space, args.low, args.high) else: cFreq = makeErbCFs(args.band, args.space, args.low, args.high) compTone = genComplex(cFreq, args.rate, args.time) ampTone = ampModulate(compTone, args.mod, args.rate) # -1 : balance to not go above '1'. # > 0 : balance to the specified value. if args.rms <= 0.0: ampTone *= ( 1 / np.max( np.abs(ampTone) ) ) else: ampTone *= (args.rms / rms(ampTone)) sf.write(args.save, ampTone, args.rate)
def sliceAudio(iFilename, names, times, verbose_en):
    # open audio
    data, fs = sf.read(iFilename)
    times.append(len(data) / fs)  # end of file, in seconds

    # calculate time laps
    for i in range(len(times) - 1):
        startPoint = int(times[i] * fs)
        endPoint = int(times[i + 1] * fs)
        # write slice audio file
        sf.write(names[i] + '.wav', data[startPoint:endPoint], fs)
        if verbose_en == True:
            print(names[i] + '.wav')
def output(self, filename, format=None):
    """
    Write the samples out to the given filename.

    Parameters
    ----------
    filename : str
        The path to write the audio on disk.
        This can be any format supported by `pysoundfile`, including
        `WAV`, `FLAC`, or `OGG` (but not `mp3`).

    format : str
        If provided, explicitly set the output encoding format.
        See `soundfile.available_formats`.
    """
    sf.write(filename, self.raw_samples.T, int(self.sample_rate), format=format)
def save(f, s, fs, subtype=None):
    '''Write waveform `s` (ndarray) with sample rate `fs` (int) to file `f`.'''
    from soundfile import write
    return write(f, s, fs, subtype=subtype)
def export(input, input_file, output_path, samplerate):
    if not os.path.exists(output_path):
        os.makedirs(output_path)

    basepath = os.path.join(
        output_path,
        os.path.splitext(os.path.basename(input_file))[0]
    )

    # Write out all components
    for i in range(input.shape[0]):
        sf.write(basepath + "_cpnt-" + str(i) + ".wav", input[i], samplerate)

    out_sum = np.sum(input, axis=0)
    sf.write(basepath + '_reconstruction.wav', out_sum, samplerate)
def stereo_to_mono_and_extreme_silence_cropping(source, target, subtype=None, print_progress=False): if os.path.isdir(source) and os.path.isdir(target): from glob import iglob if source[-1] != '/': source += '/' for i, filepath in enumerate(iglob(source + '*.wav')): filename = os.path.basename(filepath) if print_progress: printProgress("{}: {}".format(i, filename)) stereo_to_mono_and_extreme_silence_cropping( filepath, os.path.join(target, filename) ) else: wf, sr = wf_and_sr(source) wf = ensure_mono(wf) wf = crop_head_and_tail_silence(wf) sf.write(data=wf, file=target, samplerate=sr, subtype=subtype)
def unify_signals(self, obj): for signal_name in self.signal_names: if signal_name in obj._save_names: continue signal = getattr(obj, signal_name) if signal != []: name = hashlib.md5(signal).hexdigest() if name in self.signals: setattr(obj, signal_name, self.signals[name]) # signal = self.signals[name] else: self.signals.update({name:signal}) #signal = self.signals[name] setattr(obj, signal_name, self.signals[name]) sf.write(os.path.join(self.temp_path, name+'.wav'), self.signals[name], samplerate = obj.sample_rate) obj._save_names[signal_name] = name
def swingify(file_path, outfile, factor, sr=None, format=None):
    y, sr = librosa.load(file_path, mono=False, sr=sr)
    print(y.shape)
    anal_samples = librosa.to_mono(y)
    raw_samples = np.atleast_2d(y)

    # force stereo
    if raw_samples.shape[0] < 2:
        print('doubling mono signal to be stereo')
        raw_samples = np.vstack([raw_samples, raw_samples])

    beats = get_beats(anal_samples, sr, 512)
    output = synthesize(raw_samples, beats, factor)
    output = output * 0.7
    print(sr)
    sf.write(outfile, output.T, int(sr), format=format)
    # librosa.output.write_wav(outfile, output, sr, norm=True)
    return beats
def insertIntoDb(file, identifier, info):
    onlineDbId = info['source'].lower()
    id = '{}_{}'.format(onlineDbId, identifier)
    if id in rirDb:
        return False
    info['id'] = id
    info['filename'] = os.path.join(ImportDir, id + '.wav')
    rirDb[id] = info

    # copy file (or write as wav file)
    if isinstance(file, str):
        shutil.copyfile(file, info['filename'])
    else:
        assert len(file) == 2
        x, fs = file
        sf.write(info['filename'], x, fs)

    return True
def main(input_directory, data_directory, group, speaker, chapter): wav_file_count = 0 save_path = os.path.join(data_directory, group, speaker, chapter) idtag = speaker+'-'+chapter transcript_filename = idtag + '.trans.txt' os.makedirs(save_path, exist_ok=True) outfile = open(os.path.join(save_path, transcript_filename), 'w') save_path = os.path.join(data_directory, group, speaker, chapter) for file in os.listdir(input_directory): if file.endswith(".wav"): data, samplerate = sf.read(os.path.join(input_directory,file)) sf.write('testwavout.wav',data,samplerate) # save the file to its new place ident = idtag + '-' + '{:04d}'.format(wav_file_count) new_filename = ident+'.wav' print(ident) os.replace('testwavout.wav',os.path.join(save_path,new_filename)) wav_file_count += 1 outfile.write(ident+' \n') outfile.close()
def synthesize_direct(self, tract_params, glottis_params, duration_s, framerate_hz, wavfile=None): extra_frames = 1000 n_frames = int(duration_s * framerate_hz) assert len(tract_params) / self.n_vocaltract_params == n_frames assert len(glottis_params) / self.n_glottis_params == n_frames # Prep output c_int_ptr = ctypes.c_int * 1 # int* c_audio_ptr = ctypes.c_double * int( duration_s * self.audio_samplerate + extra_frames) audio = c_audio_ptr(0) n_audio_samples = c_int_ptr(0) c_tubeareas_ptr = ctypes.c_double * int( n_frames * self.n_tube_sections) tubeareas = c_tubeareas_ptr(0) c_tractsequence_ptr = ctypes.c_double * int( n_frames * self.n_vocaltract_params) tract_params_ptr = c_tractsequence_ptr(*tract_params) c_glottissequence_ptr = ctypes.c_double * int( n_frames * self.n_glottis_params) glottis_params_ptr = c_tractsequence_ptr(*glottis_params) # Call VTL self.lib.vtlInitialize(self.speaker) self.lib.vtlSynthBlock(tract_params_ptr, glottis_params_ptr, tubeareas, ctypes.c_int(n_frames), ctypes.c_double(framerate_hz), audio, n_audio_samples) # Process output out_audio = np.asarray(audio, dtype=np.float64) out_audio = np.int16(out_audio / np.max(np.abs(out_audio)) * 32767) self.lib.vtlClose() if wavfile is None: return out_audio, self.audio_samplerate sf.write(wavfile, out_audio, self.audio_samplerate)
def saveTo(self, file): with ZipFile(file, 'w') as zip: song_file = configparser.ConfigParser() port_list = list(self.outputsPorts) song_file['DEFAULT'] = {'volume': self.volume, 'bpm': self.bpm, 'beat_per_bar': self.beat_per_bar, 'width': self.width, 'height': self.height, 'outputs': json.dumps(port_list), 'scenes': json.dumps(self.scenes)} if self.initial_scene is not None: song_file['DEFAULT']['initial_scene'] = self.initial_scene for clip in self.clips: clip_file = {'name': clip.name, 'volume': str(clip.volume), 'frame_offset': str(clip.frame_offset), 'beat_offset': str(clip.beat_offset), 'beat_diviser': str(clip.beat_diviser), 'output': clip.output, 'mute_group': str(clip.mute_group), 'audio_file': basename( clip.audio_file)} if clip_file['audio_file'] is None: clip_file['audio_file'] = 'no-sound' song_file["%s/%s" % (clip.x, clip.y)] = clip_file buffer = StringIO() song_file.write(buffer) zip.writestr('metadata.ini', buffer.getvalue()) for member in self.data: buffer = BytesIO() sf.write(buffer, self.data[member], self.samplerate[member], subtype=sf.default_subtype('WAV'), format='WAV') zip.writestr(member, buffer.getvalue()) self.file_name = file
def main(dbFilename, targetFs, force=False): util.createDirectory(NormalizeDir) rirDb = json.load(open(dbFilename)) bar = util.ConsoleProgressBar() bar.start('Normalize RIRs') i = 0 for rirId, rir in rirDb.items(): targetFilename = join(NormalizeDir, rir['id'] + '.wav') if not force: if rir['filename'] == targetFilename and \ rir['fs'] == targetFs and \ targetFilename: continue x, fs_x = sf.read(join(ImportDir, rir['id'] + '.wav'), dtype='float32') y, fs_y = x, fs_x if fs_y != targetFs: y = resample(y, targetFs / fs_y, 'sinc_best') fs_y = targetFs rir['length_org'] = len(y) / fs_y y = util.trimSilence(y, 0.001, trimRight=False) y = util.normalizeAmplitude(y) sf.write(targetFilename, y, fs_y) rir['filename'] = targetFilename rir['fs'] = fs_y rir['length'] = len(y) / fs_y i += 1 bar.progress(i / len(rirDb)) bar.end() with open(dbFilename, 'w') as dbFile: json.dump(rirDb, dbFile, sort_keys=True, indent=4)
def test_read_int_data_from_float_file(file_inmemory):
    """This is a very uncommon use case."""
    unnormalized_float_to_clipped_int16 = [
        (-2.0**15 - 1, -2**15    ),
        (-2.0**15    , -2**15    ),
        (-2.0**15 + 1, -2**15 + 1),
        (-1.0        , -1        ),
        (-0.51       , -1        ),
        (-0.5        ,  0        ),
        ( 0.0        ,  0        ),
        ( 0.5        ,  0        ),
        ( 0.51       ,  1        ),
        ( 1.0        ,  1        ),
        ( 2.0**15 - 2,  2**15 - 2),
        ( 2.0**15 - 1,  2**15 - 1),
        ( 2.0**15    ,  2**15 - 1),
    ]
    file_data, expected = zip(*unnormalized_float_to_clipped_int16)
    sf.write(file_inmemory, file_data, 44100, format='WAV', subtype='FLOAT')
    file_inmemory.seek(0)
    read, fs = sf.read(file_inmemory, always_2d=False, dtype='int16')
    assert np.all(read == expected)
    assert fs == 44100
def get_pitch_marks(v_sig, fs):
    temp_wav = lu.ins_pid('temp.wav')
    temp_pm = lu.ins_pid('temp.pm')

    sf.write(temp_wav, v_sig, fs)
    reaper(temp_wav, temp_pm)
    v_pm = np.loadtxt(temp_pm, skiprows=7)
    v_pm = v_pm[:, 0]

    # Protection against REAPER bugs 1:
    vb_correct = np.hstack((True, np.diff(v_pm) > 0))
    v_pm = v_pm[vb_correct]

    # Protection against REAPER bugs 2 (maybe I need a better protection):
    if (v_pm[-1] * fs) >= (np.size(v_sig) - 1):
        v_pm = v_pm[:-1]

    # Removing temp files:
    os.remove(temp_wav)
    os.remove(temp_pm)

    return v_pm
def resample(sample_rate=None, dir=None, csv_path=None):
    clips = []
    start_time = time.time()

    # List all clips that appear on the csv (train, eval or test)
    if csv_path != 'test':
        with open(csv_path, 'r') as csvFile:
            reader = csv.reader(csvFile)
            for row in reader:
                clips.append(row[0])
        csvFile.close()
        clips.remove('fname')
    else:
        clips = os.listdir(dir)

    if os.path.exists(dir + '/resampled/'):
        shutil.rmtree(dir + '/resampled', ignore_errors=True)  # ignore errors with read-only files
    os.mkdir(dir + '/resampled')

    for clip in clips:
        # Audio clip is read
        data, sr = sf.read(dir + '/' + clip)
        data = data.T
        # Audio data is resampled to the desired sample_rate
        if sr != sample_rate:
            data_resampled = librosa.resample(data, sr, sample_rate)
            # Processed data is saved into a directory under train_clip_dir
            sf.write(dir + '/resampled/' + clip, data_resampled, sample_rate, subtype='PCM_16')

    print('Audio data has been resampled successfully')
    elapsed_time = time.time() - start_time
    print('Elapsed time ' + str(elapsed_time) + ' seconds')
def save_sound(dst, sound):
    """Save a sound to a file."""
    # Save without resampling
    sf.write(dst, sound[0], sound[1])
    return None
def save_wav(path, wav, sr):
    import soundfile as sf
    sf.write(path, wav, sr)
parser.add_argument('--batch-size', type=int, default=16) args, _ = parser.parse_known_args() train_dataset, valid_dataset, args = load_datasets(parser, args) # Iterate over training dataset total_training_duration = 0 for k in tqdm.tqdm(range(len(train_dataset))): x, y = train_dataset[k] total_training_duration += x.shape[1] / train_dataset.sample_rate if args.save: import soundfile as sf sf.write( "test/" + str(k) + 'x.wav', x.detach().numpy().T, 44100, ) sf.write( "test/" + str(k) + 'y.wav', y.detach().numpy().T, 44100, ) print("Total training duration (h): ", total_training_duration / 3600) print("Number of train samples: ", len(train_dataset)) print("Number of validation samples: ", len(valid_dataset)) # iterate over dataloader train_dataset.seq_duration = args.seq_dur train_dataset.random_chunks = True
# -*- coding: utf-8 -*-
"""
Created on Fri Mar 19 08:49:04 2021

@author: CS
"""
# Resample all files to a common rate of 44.1 kHz

import librosa
import numpy as np
import soundfile as sf
import os

file_path = 'D:/Project/DCASE_test/Data/Data_ShipsEar/'
sr_output = 44100
out_path = 'D:/Project/DCASE_test/Data/test/'

file_list = os.listdir(file_path)
for file in file_list:
    wav_path = file_path + file
    data, sr = librosa.load(wav_path, None)
    data_output = librosa.resample(data.astype(np.float32), sr, sr_output)
    out_name = out_path + file
    sf.write(out_name, data_output, sr_output)
To save a NumPy array as a WAV file, you can use wavio.write():

import wavio

wavio.write("myfile.wav", my_np_array, fs, sampwidth=2)

In this example, my_np_array is a NumPy array containing audio, fs is the sample rate of the recording (usually 44100 or 48000 Hz), and sampwidth is the sample width of the audio (the number of bytes per sample, typically 1 or 2 bytes).

soundfile

The soundfile library can read and write all file formats supported by libsndfile. Although it can't play back audio, it lets you convert audio to and from FLAC, AIFF, and a few less common formats. To convert a WAV file to FLAC, you can use the following code:

import soundfile as sf

# Extract audio data and sampling rate from file
data, fs = sf.read('myfile.wav')
# Save as FLAC file at correct sampling rate
sf.write('myfile.flac', data, fs)

Similar code will work for converting between other file formats supported by libsndfile.

pydub

pydub lets you save audio in any format that ffmpeg supports, which includes nearly all audio types you might encounter in your daily life. For example, you can convert your WAV file to MP3 with the following code:

from pydub import AudioSegment

sound = AudioSegment.from_wav('myfile.wav')
sound.export('myfile.mp3', format='mp3')

Using AudioSegment.from_file() is a more general way of loading audio files. For example, if you want to convert your file back from MP3 to WAV, you can do the following:

from pydub import AudioSegment

sound = AudioSegment.from_file('myfile.mp3', format='mp3')
sound.export('myfile.wav', format='wav')
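If you also need control over the sample format of the converted file, soundfile.write accepts a subtype argument (see soundfile.available_subtypes for what each container supports). As a minimal sketch, assuming the 'myfile.wav' from the example above exists, you could force 16-bit PCM samples in the converted file like this:

import soundfile as sf

# Read the source audio and write a FLAC copy,
# explicitly requesting 16-bit PCM samples
data, fs = sf.read('myfile.wav')
sf.write('myfile_16bit.flac', data, fs, subtype='PCM_16')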
sys.path.append(WAVERNN_FOLDER) from gen_wavernn import generate from utils import hparams as hp from models.fatchord_version import WaveRNN hp.configure(WAVERNN_FOLDER+'/hparams.py') model = WaveRNN(rnn_dims=hp.voc_rnn_dims, fc_dims=hp.voc_fc_dims, bits=hp.bits, pad=hp.voc_pad, upsample_factors=hp.voc_upsample_factors, feat_dims=hp.num_mels, compute_dims=hp.voc_compute_dims, res_out_dims=hp.voc_res_out_dims, res_blocks=hp.voc_res_blocks, hop_length=hp.hop_length, sample_rate=hp.sample_rate, mode=hp.voc_mode).to('cpu') model.load(CHECKPOINTS_FOLDER + "/" + wavernn_chpt) y = [] ix=1 while os.path.exists(CHR_FOLDER+"/"+str(ix)+".npy"): y.append(np.load(CHR_FOLDER+"/"+str(ix)+".npy")) ix+=1 idx=1 for s in y: waveform = generate(model, s, hp.voc_gen_batched, hp.voc_target, hp.voc_overlap) sf.write("wg-"+str(idx)+".wav", waveform, hp.sample_rate) idx+=1
def write_file(file_path, data, samplerate=16000):
    """
    write_file('test.wav', full_data, samplerate)
    """
    sf.write(file_path, data, samplerate)
from __future__ import division
import soundfile as sf
from scipy import signal
import numpy as np

x, fs1 = sf.read('Sound_Noise.wav')
h, fs2 = sf.read('transfer.wav')

# def dft(x):
#     N = len(x)
#     X = np.zeros((N,), dtype=complex)
#     for k in range(0, N):
#         for n in range(0, N):
#             X[k] = X[k] + x[n]*np.exp(-np.pi*2j*k*n/N)
#     return X

x1 = np.fft.fft(x)
h1 = np.fft.fft(h)
y = x1 * h1
y1 = np.fft.ifft(y)
y1 = y1.real
sf.write('fft.wav', y1, fs1)
def main(): args = parse_args() data_dir = None if args.vocals: data_dir = vocal_dir else: data_dir = novocal_dir if not os.path.isdir(data_dir): os.mkdir(data_dir) seq = 0 for sd in args.stem_dirs: for song in os.scandir(sd): for dir_name, _, file_list in os.walk(song): instruments = [ os.path.join(dir_name, f) for f in file_list if f.endswith(".wav") ] if instruments: print("Found directory containing wav files: %d" % seq) print(os.path.basename(dir_name).replace(" ", "_")) loaded_wavs = [None] * len(instruments) drum_track_index = -1 vocal_track_index = -1 mix_track_index = -1 for i, instrument in enumerate(instruments): if "drum" in instrument.lower(): drum_track_index = i elif "vocal" in instrument.lower(): vocal_track_index = i elif "mix" in instrument.lower(): mix_track_index = i # automatically resamples for us loaded_wavs[i] = MonoLoader( filename=instrument, sampleRate=args.sample_rate)() track_len = len(loaded_wavs[0]) # ensure all stems have the same length assert (len(loaded_wavs[i]) == track_len for i in range(1, len(loaded_wavs))) # first create the full mix harmonic_mix = sum([ l for i, l in enumerate(loaded_wavs) if i not in [ drum_track_index, vocal_track_index, mix_track_index, ] ]) full_mix = None if args.vocals: full_mix = (harmonic_mix + loaded_wavs[drum_track_index] + loaded_wavs[vocal_track_index]) else: full_mix = harmonic_mix + loaded_wavs[drum_track_index] seg_samples = int( numpy.floor(args.segment_size * args.sample_rate)) total_segs = int(numpy.floor(track_len / seg_samples)) seg_limit = min(total_segs - 1, args.segment_limit) for seg in range(seg_limit): if seg < args.segment_offset: continue seqstr = "%03d%04d" % (seq, seg) left = seg * seg_samples right = (seg + 1) * seg_samples harm_path = os.path.join( data_dir, "{0}_harmonic.wav".format(seqstr)) mix_path = os.path.join(data_dir, "{0}_mix.wav".format(seqstr)) perc_path = os.path.join( data_dir, "{0}_percussive.wav".format(seqstr)) vocal_path = os.path.join( data_dir, "{0}_vocal.wav".format(seqstr)) soundfile.write(harm_path, harmonic_mix[left:right], args.sample_rate) soundfile.write(mix_path, full_mix[left:right], args.sample_rate) # write the drum track soundfile.write( perc_path, loaded_wavs[drum_track_index][left:right], args.sample_rate, ) if args.vocals: # write the vocal track soundfile.write( vocal_path, loaded_wavs[vocal_track_index][left:right], args.sample_rate, ) seq += 1 if args.track_limit > -1: if seq == args.track_limit: return 0 return 0
import os
import time
import librosa
import soundfile

filename = 'test.mp3'
print('loading {}...'.format(filename))
start = time.time()
y, sr = librosa.load(filename, mono=True)
print('elapsed {} sec'.format(time.time() - start))
print('y.shape: ' + str(y.shape))
print('sr: ' + str(sr))
duration = librosa.get_duration(y, sr)
print('duration: ' + str(duration))
print('{:.4f} hours'.format(duration / 3600))

print('splitting...')
start = time.time()
intervals = librosa.effects.split(y, top_db=60)
print('elapsed {} sec'.format(time.time() - start))
print('intervals.shape: ' + str(intervals.shape))
# print(intervals)

print('saving...')
start = time.time()
for i, inter in enumerate(intervals):
    yt = y[inter[0]:inter[1]]
    filename = os.path.join('audios', 'clip_{}.wav'.format(i))
    soundfile.write(filename, yt, samplerate=sr)
print('elapsed {} sec'.format(time.time() - start))
print('done')
def write_wav(self, data, fs, filename, path='wav_out'):
    if not os.path.exists(path):
        os.makedirs(path)
    filepath = os.path.join(path, filename)
    print('Write to file: %s.' % filepath)
    sf.write(filepath, data.T, fs, subtype='PCM_16')
args.add_argument("--src", help="Source database file.") args.add_argument("--dst", help="New audio that maps to the source file.") args.add_argument("--out", default="output.wav", help="Name of output file.") return args.parse_args() if __name__ == "__main__": args = setup_args() a_audio = args.src b_audio = args.dst _, sr = sf.read(a_audio) a_segments = build_segment_db(a_audio) b_segments = build_segment_db(b_audio) new_segments = [] for s in b_segments: match = nearest(s.mfcc, a_segments) try: new_segments.extend(match.quarter) except AttributeError: pass sf.write(args.out, new_segments, sr)
import soundfile as sf
import os

for file in os.listdir('./'):
    if file.startswith('LA'):
        for wav in os.listdir(file):
            data, fs = sf.read(file + os.sep + wav)
            thre = max(abs(data))
            ndata = data / thre * 0.8
            sf.write(file + os.sep + wav, ndata, fs)
def write_wav_skip_existing(path, y, sr, norm=False):
    if not os.path.exists(path):
        soundfile.write(path, y, sr, "PCM_16")
    else:
        print("WARNING: Tried writing audio to " + path + ", but audio file exists already. Skipping file!")
def gpu_decode(feat_list, gpu): with torch.cuda.device(gpu): with torch.no_grad(): model_waveform = GRU_WAVE_DECODER_DUALGRU_COMPACT_MBAND( feat_dim=config.mcep_dim + config.excit_dim, upsampling_factor=config.upsampling_factor, hidden_units=config.hidden_units_wave, hidden_units_2=config.hidden_units_wave_2, kernel_size=config.kernel_size_wave, dilation_size=config.dilation_size_wave, n_quantize=config.n_quantize, causal_conv=config.causal_conv_wave, right_size=config.right_size, n_bands=config.n_bands, pad_first=True, lpc=config.lpc) logging.info(model_waveform) model_waveform.cuda() model_waveform.load_state_dict( torch.load(args.checkpoint)["model_waveform"]) model_waveform.remove_weight_norm() model_waveform.eval() for param in model_waveform.parameters(): param.requires_grad = False torch.backends.cudnn.benchmark = True # define generator if args.string_path is None: string_path = config.string_path else: string_path = args.string_path logging.info(string_path) generator = decode_generator( feat_list, batch_size=args.batch_size, upsampling_factor=config.upsampling_factor, excit_dim=config.excit_dim, string_path=string_path) # decode time_sample = [] n_samples = [] n_samples_t = [] count = 0 pqmf = PQMF(config.n_bands).cuda() print( f'{pqmf.subbands} {pqmf.A} {pqmf.taps} {pqmf.cutoff_ratio} {pqmf.beta}' ) for feat_ids, (batch_feat, n_samples_list) in generator: logging.info("decoding start") start = time.time() logging.info(batch_feat.shape) #batch_feat = F.pad(batch_feat.transpose(1,2), (model_waveform.pad_left,model_waveform.pad_right), "replicate").transpose(1,2) samples = model_waveform.generate(batch_feat) logging.info(samples.shape) # B x n_bands x T//n_bands samples = pqmf.synthesis( samples)[:, 0].cpu().data.numpy() # B x 1 x T --> B x T logging.info(samples.shape) samples_list = samples time_sample.append(time.time() - start) n_samples.append(max(n_samples_list)) n_samples_t.append( max(n_samples_list) * len(n_samples_list)) for feat_id, samples, samples_len in zip( feat_ids, samples_list, n_samples_list): #wav = np.clip(samples[:samples_len], -1, 1) wav = np.clip(samples[:samples_len], -1, 0.999969482421875) outpath = os.path.join(args.outdir, feat_id + ".wav") sf.write(outpath, wav, args.fs, "PCM_16") logging.info("wrote %s." % (outpath)) #break #figname = os.path.join(args.outdir, feat_id+"_wav.png") #plt.subplot(2, 1, 1) #plt.plot(wav_src) #plt.title("source wave") #plt.subplot(2, 1, 2) #plt.plot(wav) #plt.title("generated wave") #plt.tight_layout() #plt.savefig(figname) #plt.close() count += 1 #if count >= 3: #if count >= 6: #if count >= 1: # break logging.info("average time / sample = %.6f sec (%ld samples) [%.3f kHz/s]" % (\ sum(time_sample)/sum(n_samples), sum(n_samples), sum(n_samples)/(1000*sum(time_sample)))) logging.info("average throughput / sample = %.6f sec (%ld samples) [%.3f kHz/s]" % (\ sum(time_sample)/sum(n_samples_t), sum(n_samples_t), sum(n_samples_t)/(1000*sum(time_sample))))
total_length = onsets[-1] + samples[ (len(onsets) % round_robin - 1 + round_robin) % round_robin].shape[0] f = np.zeros([total_length, channels]) index = 0 for ost in onsets: f[ost:ost + samples[index].shape[0], :] += samples[index] index = (index + 1) % round_robin print('output file %s have %d channels' % (args.outfile, channels)) norm_factor = np.max(f) if norm_factor > 1: f /= norm_factor sf.write(args.outfile, np.squeeze(f), samplerate) else: ##output midi file = pretty_midi.PrettyMIDI(resolution=960, initial_tempo=bpm) drum_prog = pretty_midi.instrument_name_to_program('Steel Drums') trig_drum = pretty_midi.Instrument(program=drum_prog) for ost in onsets: time = float(ost) / samplerate note = pretty_midi.Note(velocity=127, pitch=36, start=time, end=time + 0.001) trig_drum.notes.append(note) file.instruments.append(trig_drum) file.write(args.outfile)
p.terminate() print('Finished recording') # Save the recorded data as a WAV file wf = wave.open(filename, 'wb') wf.setnchannels(channels) wf.setsampwidth(p.get_sample_size(sample_format)) wf.setframerate(fs) wf.writeframes(b''.join(frames)) wf.close() playsound(filename) # Read the audio data fs, data = wavfile.read(filename) data = data[:, 0] # Normalize the data data = normalize(data) # Preprocess the raw data # Filter requirements. order = 10 cutoff = 4000 # desired cutoff frequency of the filter, Hz y = butter_lowpass_filter(data, cutoff, fs, order) noise = y[0:20000] y_reduced_noise = nr.reduce_noise(audio_clip=y, noise_clip=noise, verbose=False) y_reduced_noise = normalize(y_reduced_noise) # Write to file sf.write(filename, y_reduced_noise, fs) playsound(filename)
waveglow.cuda().eval().half() for k in waveglow.convinv: k.float() denoiser = Denoiser(waveglow) text = "相对论直接和间接的催生了量子力学的诞生 也为研究微观世界的高速运动确立了全新的数学模型" text = pinyin.get(text, format="numerical", delimiter=" ") print(text) sequence = np.array(text_to_sequence(text))[None, :] sequence = torch.autograd.Variable(torch.from_numpy(sequence)).cuda().long() mel_outputs, mel_outputs_postnet, _, alignments = model.inference(sequence) plot_data((mel_outputs.float().data.cpu().numpy()[0], mel_outputs_postnet.float().data.cpu().numpy()[0], alignments.float().data.cpu().numpy()[0].T)) ensure_folder('images') plt.savefig('images/mel_spec.jpg') mel_outputs_postnet = mel_outputs_postnet.type(torch.float16) with torch.no_grad(): audio = waveglow.infer(mel_outputs_postnet, sigma=0.666) audio = audio[0].data.cpu().numpy() audio = audio.astype(np.float32) print('audio.shape: ' + str(audio.shape)) print(audio) sf.write('output.wav', audio, sampling_rate, 'PCM_24')
import sounddevice as sd
import soundfile as sf

samplerate = 44100  # Hertz
duration = 30  # seconds
filename = 'output.wav'

mydata = sd.rec(int(samplerate * duration), samplerate=samplerate,
                channels=2, blocking=True)
sf.write(filename, mydata, samplerate)
table_size = 1024 lfo_freq = 2 mod_amount = 70 lfo = generate_wave(samplerate, duration, lfo_freq) # phase = dry_phase(samplerate, duration, base_freq) phase = fm_phase(samplerate, base_freq, mod_amount, lfo) # phase = pm_phase(samplerate, base_freq, mod_amount, lfo) additive = additive_saw(samplerate, base_freq, phase) naive = naive_saw(samplerate, phase, table_size) linterp = yoshimi_saw_linterp(samplerate, base_freq, phase, table_size) cubic = yoshimi_saw_cubic(samplerate, base_freq, phase, table_size) sinc = sinc_saw(samplerate, base_freq, phase, table_size) # errors = calc_error(additive, naive, linterp, cubic, sinc) # pp = pprint.PrettyPrinter(indent=4) # pp.pprint(errors) # compare(additive, naive) # compare(additive, linterp) # compare(additive, cubic) # compare(additive, sinc) soundfile.write("snd/modAdditive.wav", normalize(additive), samplerate) soundfile.write("snd/modNaive.wav", normalize(naive), samplerate) soundfile.write("snd/modLinterp.wav", normalize(linterp), samplerate) soundfile.write("snd/modCubic.wav", normalize(cubic), samplerate) soundfile.write("snd/modSinc.wav", normalize(sinc), samplerate)
def write_audio(path, audio, sample_rate):
    soundfile.write(file=path, data=audio, samplerate=sample_rate)
torch.nn.Conv1d(W, W, 3, 1, 1, 1, bias=False), AdaptiveBatchNorm1d(W), torch.nn.LeakyReLU(0.2), torch.nn.Conv1d(W, 1, 1, 1, 0, bias=True), ) def forward(self, input): return self.conv(input) with torch.no_grad(): model = DenoiseNetwork() speech_data = torch.load("SPEECH.pt") noise_data = torch.load("NOISE.pt") speech_sample = speech_data[0:220500].view(1, 1, -1) noise_sample = noise_data[0:220500].view(1, 1, -1) w = torch.rand(1) noisy_sample = (w * speech_sample) + ((1 - w) * noise_sample) output = model(noisy_sample) print(output) print(torch.nn.functional.mse_loss(output, speech_sample)) soundfile.write("speech_sample.wav", speech_sample.view(-1).numpy(), 22050) soundfile.write("noise_sample.wav", noise_sample.view(-1).numpy(), 22050) soundfile.write("noisy_sample.wav", noisy_sample.view(-1).numpy(), 22050) soundfile.write("output_sample.wav", output.view(-1).numpy(), 22050)
def py_gen_decay_indices(limit, n): # generates indices for STM input that mimic a decay-ing behaiviour of memory result = [1] if n > 1: ratio = (float(limit) / result[-1])**(1.0 / (n - len(result))) while len(result) < n: next_value = result[-1] * ratio if next_value - result[-1] >= 1: result.append(next_value) else: result.append(result[-1] + 1) ratio = (float(limit) / result[-1])**(1.0 / (n - len(result))) indices = list(map(lambda x: -round(x) + limit, result[::-1])) return indices def tf_gen_decay_indices(limit, n): return tf.py_function(gen_decay_indices, [limit, n], tf.int32) if __name__ == "__main__": # Check preprocessing x = load_audio(HP.train_path, HP.sr) x = encode_mulaw(x, HP.bits) x = decode_mulaw(x, HP.bits) x = encode_16bit(x) sf.write("outputs/final/preprocess_check.wav", x, HP.sr)
os.makedirs("sounds") for octave in range(1, 4): freqs.extend((freqs_base * octave).tolist()) env = create_env_cos() t = np.arange(0, env.size) / sr #plt.plot(t, env) for f, freq in enumerate(freqs): fn_out = "sounds/%.2fHz.wav" % (freq) if Osc == "square": T = int(sr / freq) square = np.zeros((T, )) - 1 square[int(T / 4):int(-T / 4)] = 1 sample = np.tile(square, (int(np.ceil(sr * DUR / T)), )) sample = sample[:int(sr * DUR)] elif Osc == "saw": T = int(sr / freq) saw = np.zeros((T, )) saw[:int(T / 2)] = np.linspace(-1, 1, int(T / 2)) saw[int(T / 2):] = np.linspace(1, -1, T - int(T / 2)) sample = np.tile(saw, (int(np.ceil(sr * DUR / T)), )) sample = sample[:int(sr * DUR)] sample *= env sf.write(fn_out, 0.707 * sample / np.max(np.abs(sample)), sr) # %%
def main(): print "Generation started at:", datetime.strftime(datetime.now(), '%Y-%m-%d %H:%M') args = get_args() SEQ_LEN = args.seq_len CON_DIM = args.con_dim CON_FRAME_SIZE = args.con_frame_size BIG_FRAME_SIZE = args.big_frame_size FRAME_SIZE = args.frame_size WEIGHT_NORM = args.weight_norm EMB_SIZE = args.emb_size RNN_HIDDEN_DIM = args.rnn_hidden_dim RNN_TYPE = args.rnn_type DNN_HIDDEN_DIM = args.dnn_hidden_dim LEARN_H0 = args.learn_h0 Q_LEVELS = args.q_levels Q_TYPE = args.q_type BATCH_SIZE = args.batch_size WAV_OUT_PATH = args.wav_out_path RESTORE_FROM = args.restore_from H0_MULT = 2 if RNN_TYPE == 'LSTM' else 1 assert SEQ_LEN % CON_FRAME_SIZE == 0,\ 'seq_len should be divisible by con_frame_size' assert CON_FRAME_SIZE % BIG_FRAME_SIZE == 0,\ 'con_frame_size should be divisible by big_frame_size' assert BIG_FRAME_SIZE % FRAME_SIZE == 0,\ 'big_frame_size should be divisible by frame_size' if os.path.exists(TEST_WAV_SORTED_LIST) and os.path.exists( TEST_CON_SORTED_LIST): print 'Test sorted list already exists!' else: if not os.path.exists(SORTED_LIST_PATH): os.makedirs(SORTED_LIST_PATH) print 'generating test sorted list...' generate_sorted_list(TEST_WAV_PATH, TEST_CON_PATH, SAMPLE_RATE, TEST_WAV_SORTED_LIST, TEST_CON_SORTED_LIST) print 'Done.' if not os.path.exists(WAV_OUT_PATH): os.makedirs(WAV_OUT_PATH) data_feeder = load_data(TEST_WAV_SORTED_LIST, TEST_CON_SORTED_LIST, BATCH_SIZE, SEQ_LEN, CON_FRAME_SIZE, CON_DIM, Q_LEVELS, Q_TYPE, SAMPLE_RATE) con_ph = tf.placeholder(dtype=tf.float32, shape=[None, CON_DIM]) con_mask_ph = tf.placeholder(dtype=tf.int32, shape=[None, CON_FRAME_SIZE]) big_wav_ph = tf.placeholder(dtype=tf.int32, shape=[None, BIG_FRAME_SIZE]) big_mask_ph = tf.placeholder(dtype=tf.int32, shape=[None, BIG_FRAME_SIZE]) wav_ph = tf.placeholder(dtype=tf.int32, shape=[None, FRAME_SIZE]) mask_ph = tf.placeholder(dtype=tf.int32, shape=[None, FRAME_SIZE]) wav_t_ph = tf.placeholder(dtype=tf.int32, shape=[ None, ]) mask_t_ph = tf.placeholder(dtype=tf.int32, shape=[ None, ]) con_h0_ph = tf.placeholder(dtype=tf.float32, shape=[None, H0_MULT * sum(RNN_HIDDEN_DIM[0])]) big_h0_ph = tf.placeholder(dtype=tf.float32, shape=[None, H0_MULT * sum(RNN_HIDDEN_DIM[1])]) h0_ph = tf.placeholder(dtype=tf.float32, shape=[None, H0_MULT * sum(RNN_HIDDEN_DIM[2])]) con_frame_level_output_ph = tf.placeholder( dtype=tf.float32, shape=[None, 1, RNN_HIDDEN_DIM[0][-1]]) big_frame_level_output_ph = tf.placeholder( dtype=tf.float32, shape=[None, 1, RNN_HIDDEN_DIM[1][-1]]) frame_level_output_ph = tf.placeholder(dtype=tf.float32, shape=[None, RNN_HIDDEN_DIM[2][-1]]) prev_samples_ph = tf.placeholder(dtype=tf.int32, shape=[None, FRAME_SIZE]) sample_level_output_ph = tf.placeholder(dtype=tf.float32, shape=[None, Q_LEVELS]) reset_ph = tf.placeholder(dtype=tf.int32, shape=[]) argmax_ph = tf.placeholder(dtype=tf.int32, shape=[]) with tf.variable_scope('SampleRNNModel', reuse=None): net = SampleRNNModel(seq_len=SEQ_LEN, con_dim=CON_DIM, con_frame_size=CON_FRAME_SIZE, big_frame_size=BIG_FRAME_SIZE, frame_size=FRAME_SIZE, weight_norm=WEIGHT_NORM, emb_size=EMB_SIZE, rnn_hidden_dim=RNN_HIDDEN_DIM, dnn_hidden_dim=DNN_HIDDEN_DIM, rnn_type=RNN_TYPE, learn_h0=LEARN_H0, q_levels=Q_LEVELS) con_frame_level_output, con_h0 = net.con_frame_level_rnn( con_ph, con_mask_ph, con_h0_ph, reset_ph) big_frame_level_output, big_h0 = net.big_frame_level_rnn( big_wav_ph, big_mask_ph, con_frame_level_output_ph, big_h0_ph, reset_ph) frame_level_output, h0 = net.frame_level_rnn( wav_ph, mask_ph, big_frame_level_output_ph, h0_ph, reset_ph) 
sample_level_output = net.sample_level_predictor( frame_level_output_ph, prev_samples_ph) new_sample, ce_loss_t, accuracy_t = net.create_generator( sample_level_output_ph, wav_t_ph, mask_t_ph, argmax_ph) sess = tf.Session(config=tf.ConfigProto(log_device_placement=False)) saver = tf.train.Saver(var_list=tf.trainable_variables(), max_to_keep=400) load_model(saver, sess, RESTORE_FROM) try: ce_loss_list = [] accuracy_list = [] wav_mat_list = [] mask_mat_list = [] samples_number = 0 count = 0 print 'Generation!' start_time = time() for _wav_batch, _mask_batch, _con_batch, _reset_batch, _end_batch, _end_epoch in data_feeder: if _reset_batch == 1: CON_H0 = np.zeros( (BATCH_SIZE, H0_MULT * sum(RNN_HIDDEN_DIM[0]))) BIG_H0 = np.zeros( (BATCH_SIZE, H0_MULT * sum(RNN_HIDDEN_DIM[1]))) H0 = np.zeros((BATCH_SIZE, H0_MULT * sum(RNN_HIDDEN_DIM[2]))) samples = np.full((_wav_batch.shape[0], CON_FRAME_SIZE), np.int32((Q_LEVELS - 1) // 2), dtype='int32') cumulative_ce_loss = np.zeros((_wav_batch.shape[0], ), dtype='float32') cumulative_accuracy = np.zeros((_wav_batch.shape[0], ), dtype='float32') cumulative_mask = np.zeros((_wav_batch.shape[0], ), dtype='int32') mask_gen = _mask_batch[:, CON_FRAME_SIZE:] index = CON_FRAME_SIZE else: mask_gen = np.concatenate( [mask_gen, _mask_batch[:, CON_FRAME_SIZE:]], axis=1) for t in xrange(CON_FRAME_SIZE, SEQ_LEN + CON_FRAME_SIZE): if t % CON_FRAME_SIZE == 0: CON_FRAME_OUTPUT, CON_H0 = sess.run( [con_frame_level_output, con_h0], feed_dict={ con_ph: _con_batch[:, (t // CON_FRAME_SIZE) * CON_DIM:(t // CON_FRAME_SIZE + 1) * CON_DIM], con_mask_ph: _mask_batch[:, t:t + CON_FRAME_SIZE], con_h0_ph: CON_H0, reset_ph: _reset_batch }) if t % BIG_FRAME_SIZE == 0: BIG_FRAME_OUTPUT, BIG_H0 = sess.run( [big_frame_level_output, big_h0], feed_dict={ big_wav_ph: samples[:, index - BIG_FRAME_SIZE:index], big_mask_ph: _mask_batch[:, t:t + BIG_FRAME_SIZE], con_frame_level_output_ph: CON_FRAME_OUTPUT[:, (t / BIG_FRAME_SIZE) % (CON_FRAME_SIZE / BIG_FRAME_SIZE)].reshape( _wav_batch.shape[0], 1, -1), big_h0_ph: BIG_H0, reset_ph: _reset_batch }) if t % FRAME_SIZE == 0: FRAME_OUTPUT, H0 = sess.run( [frame_level_output, h0], feed_dict={ wav_ph: samples[:, index - FRAME_SIZE:index], mask_ph: _mask_batch[:, t:t + FRAME_SIZE], big_frame_level_output_ph: BIG_FRAME_OUTPUT[:, (t / FRAME_SIZE) % (BIG_FRAME_SIZE / FRAME_SIZE)].reshape( _wav_batch.shape[0], 1, -1), h0_ph: H0, reset_ph: _reset_batch }) SAMPLE_OUTPUT = sess.run(sample_level_output, feed_dict={ frame_level_output_ph: FRAME_OUTPUT[:, t % FRAME_SIZE], prev_samples_ph: samples[:, index - FRAME_SIZE:index] }) if index < ARGMAX_SAMPLES + CON_FRAME_SIZE: NEW_SAMPLE, CE_LOSS, ACCURACY = sess.run( [new_sample, ce_loss_t, accuracy_t], feed_dict={ sample_level_output_ph: SAMPLE_OUTPUT, wav_t_ph: _wav_batch[:, t], mask_t_ph: _mask_batch[:, t], argmax_ph: 1 }) else: NEW_SAMPLE, CE_LOSS, ACCURACY = sess.run( [new_sample, ce_loss_t, accuracy_t], feed_dict={ sample_level_output_ph: SAMPLE_OUTPUT, wav_t_ph: _wav_batch[:, t], mask_t_ph: _mask_batch[:, t], argmax_ph: 0 }) cumulative_ce_loss += CE_LOSS cumulative_accuracy += ACCURACY cumulative_mask += _mask_batch[:, t] samples = np.concatenate([samples, NEW_SAMPLE], axis=1) index += 1 if _end_batch == 1: ce_loss_list.extend(list(cumulative_ce_loss / cumulative_mask)) accuracy_list.extend( list(cumulative_accuracy / cumulative_mask)) wav_mat_list.append(samples[:, CON_FRAME_SIZE:]) mask_mat_list.append(mask_gen) ce_loss = np.mean(ce_loss_list) accuracy = np.mean(accuracy_list) fid = open(TEST_WAV_SORTED_LIST, 
'r') gen_id_list = fid.readlines() fid.close() for i in xrange(len(wav_mat_list)): samples_number += wav_mat_list[i].shape[0] * wav_mat_list[i].shape[ 1] for j in xrange(wav_mat_list[i].shape[0]): samplei = wav_mat_list[i][j] maski = mask_mat_list[i][j] samplei = samplei[0:len(np.where(maski == 1)[0])] if Q_TYPE == 'mu-law': from datasets.audio_reader import mu2linear samplei = mu2linear(samplei, Q_LEVELS) sf.write( WAV_OUT_PATH + os.sep + gen_id_list[count].split()[0].split('/')[-1], samplei, SAMPLE_RATE, 'PCM_16') count += 1 generation_time = time() - start_time log = "{} samples generated in {} hours.\nThe time of generating 1 second speech is {} seconds.\n" log += "Performance:\n\tloss:{:.4f}\taccuracy:{:.4f}%\n" log = log.format(len(gen_id_list), generation_time / 3600, generation_time / samples_number * SAMPLE_RATE, ce_loss, accuracy * 100) print log fid_log = open(LOG_FILE, 'a') fid_log.write(log) fid_log.close() print "Generation ended at:", datetime.strftime( datetime.now(), '%Y-%m-%d %H:%M') except KeyboardInterrupt: # Introduce a line break after ^C is displayed so save message # is on its own line. print()
def record(length=1, reclength=1, filename=None, thres=0): """ Merekam suara secara stream dan metode callback """ global cumulated_status, end_count, start_count, recording, magnitudo, audiodata, predicting, i_quit, listening predicting = False listening = True end_count = False start_count = 0 recording = False magnitudo = [] audiodata = [] try: import sounddevice as sd #samplerate = sd.query_devices(args.device, 'input')['default_samplerate'] samplerate = 16000.0 delta_f = (high - low) / screenwidth fftsize = np.ceil(samplerate / delta_f).astype(int) low_bin = int(np.floor(low / delta_f)) cumulated_status = sd.CallbackFlags() def callback(indata, frames, time, status): global cumulated_status, audiodata, magnitudo, end_count, start_count, recording, smodel, predicting, i_quit cumulated_status |= status if any(indata): magnitude = np.abs(np.fft.rfft(indata[:, 0], n=fftsize)) magnitude *= gain / fftsize rms = librosa.feature.rmse(S=indata) rms = int(rms * 32768) start_count += 1 if rms >= thres: if not recording: #and not end_count #print("Start record") recording = True start_count = 0 if recording: audiodata.extend(itertools.chain(indata.tolist())) magnitudo.append(magnitude) if start_count == int(samplerate / (samplerate * DURATION / 1000)): #print("End record") start_count = 0 end_count = True recording = False try: if not predicting: print("Predict") soundfile.write("temp.wav", audiodata, 16000) predict("temp.wav", model=smodel) predicting = False pass except: pass audiodata = [] with sd.InputStream(device=None, channels=1, callback=callback, blocksize=int(samplerate * DURATION / 1000), samplerate=samplerate): while True: #response = input() #if response in ('', 'q', 'Q'): if listening == False: #time.sleep(length) break if filename != None: soundfile.write(filename, audiodata, 16000) if cumulated_status: logging.warning(str(cumulated_status)) except Exception as e: print(e)
def main(fft_window_size, fft_window_step): """Generates the audio and PESQ vs SNR graphs for a given STFT setup. Saves the graphs and generated audio files to disk. Args: fft_window_size: The FFT window size. fft_window_step: The FFT window step. """ os.environ['CUDA_VISIBLE_DEVICES'] = '1' print(fft_window_size, ' ', fft_window_step, ' ', (fft_window_size // 2)) origonal_audio, _ = sf.read(WAVEFORM_PATH) origonal_audio = origonal_audio.astype(np.float32) for representation in REPRESENTATIONS: REPRESENTATIONS[representation]['perceptual_errors'] = [] REPRESENTATIONS[representation]['waveforms'] = [] for snr in SNRS: pb_i = utils.Progbar(len(REPRESENTATIONS) * N_REPEATS) print('SNR: ', snr) for representation in REPRESENTATIONS: all_perceptual_errors = [] for _ in range(N_REPEATS): perceptual_errors, audio_hats = process_representation_at_snr( representation, origonal_audio, snr, fft_window_size, fft_window_step) all_perceptual_errors.append(perceptual_errors) pb_i.add(1) print(' ', representation, ' -> ', np.mean(all_perceptual_errors, 0)) REPRESENTATIONS[representation]['perceptual_errors'].append( np.mean(all_perceptual_errors, 0)) REPRESENTATIONS[representation]['waveforms'].append(audio_hats) # Plot the graph for representation in REPRESENTATIONS: perceptual_errors = REPRESENTATIONS[representation][ 'perceptual_errors'] perceptual_errors = np.array(perceptual_errors) plot = plt.plot(SNRS, perceptual_errors[:, 0], label=representation) for i in range(perceptual_errors.shape[-1] - 1): plt.plot(SNRS, perceptual_errors[:, i + 1], color=plot[0].get_color(), linestyle=LINE_STYLES[i]) plt.xlabel('SNR') plt.ylabel('PESQ') plt.legend() file_name = 'pesq_vs_snr__{}ws_{}s'.format(fft_window_size, fft_window_step) plt.savefig(os.path.join(RESULTS_PATH, file_name), bbox_inches='tight', dpi=920) plt.clf() # Save the audio files setup = 'audio_{}ws_{}s'.format(fft_window_size, fft_window_step) base_audio_dir = os.path.join(RESULTS_PATH, setup) for representation in REPRESENTATIONS: audio_dir = os.path.join(base_audio_dir, representation) os.makedirs(audio_dir, exist_ok=True) for i, audio in enumerate( REPRESENTATIONS[representation]['waveforms']): for j, wav in enumerate(audio): file_path = os.path.join( audio_dir, '{}_{}db_{}.wav'.format(representation, SNRS[i], j)) sf.write(file_path, wav, SAMPLE_RATE)
def clean_getfirst3secs(audiofile):
    data, samplerate = sf.read(audiofile)
    os.remove(audiofile)
    data2 = data[0:samplerate * 3]
    sf.write(audiofile, data2, samplerate)
    return [audiofile]
sec = int(time.time())  # current Unix timestamp
ofname = 'R_' + str(sec) + '.wav'

try:
    data, fs = sf.read(args.filename, dtype='float32')
    #print("Play fs: %d\n", fs)
    # sd.play(data, fs, device=args.device)  # play 'data' array at fs sample rate
    recdata = sd.playrec(data, fs, channels=recChan,
                         device=args.device)  # play data array AND record recChan channels
    # recdata is a [n][recChan] array of float32 type (?)
    status = sd.wait()  # wait here until playback is done
    #pk1 = np.amax(recdata[:,0])  # peak recorded value (positive)
    #pk2 = np.amax(recdata[:,1])
    opath = outdir + "/" + ofname
    sf.write(opath, recdata, fs)  # save recorded data to file
    tstamp = ofname[2:-4]
    # print("%s %d %4.2f %4.2f " % (tstamp, int(fs/1000), pk1, pk2), end="")
    print("%s" % opath)
except KeyboardInterrupt:
    parser.exit('\nInterrupted by user')
except Exception as e:
    parser.exit(type(e).__name__ + ': ' + str(e))

if status:
    parser.exit('Error during playback: ' + str(status))
def test(self, args): with open(args.tt_list, 'r') as f: self.tt_list = [line.strip() for line in f.readlines()] self.model_file = args.model_file self.ckpt_dir = args.ckpt_dir self.est_path = args.est_path self.write_ideal = args.write_ideal self.gpu_ids = tuple(map(int, args.gpu_ids.split(','))) if len(self.gpu_ids) == 1 and self.gpu_ids[0] == -1: # cpu only self.device = torch.device('cpu') else: # gpu self.device = torch.device('cuda:{}'.format(self.gpu_ids[0])) if not os.path.isdir(self.ckpt_dir): os.makedirs(self.ckpt_dir) logger = getLogger(os.path.join(self.ckpt_dir, 'test.log'), log_file=True) # create a network net = Net() logger.info('Model summary:\n{}'.format(net)) net = net.to(self.device) # calculate model size param_count = numParams(net) logger.info('Trainable parameter count: {:,d} -> {:.2f} MB\n'.format(param_count, param_count*32/8/(2**20))) # training criterion and optimizer criterion = LossFunction() # net feeder feeder = NetFeeder(self.device, self.win_size, self.hop_size) # resynthesizer resynthesizer = Resynthesizer(self.device, self.win_size, self.hop_size) # load model logger.info('Loading model from {}'.format(self.model_file)) ckpt = CheckPoint() ckpt.load(self.model_file, self.device) net.load_state_dict(ckpt.net_state_dict) logger.info('model info: epoch {}, iter {}, cv_loss - {:.4f}\n'.format(ckpt.ckpt_info['cur_epoch']+1, ckpt.ckpt_info['cur_iter']+1, ckpt.ckpt_info['cv_loss'])) net.eval() for i in range(len(self.tt_list)): # create a data loader for testing tt_loader = AudioLoader(self.tt_list[i], self.sample_rate, unit='utt', segment_size=None, segment_shift=None, batch_size=1, buffer_size=10, in_norm=self.in_norm, mode='eval') logger.info('[{}/{}] Estimating on {}'.format(i+1, len(self.tt_list), self.tt_list[i])) est_subdir = os.path.join(self.est_path, self.tt_list[i].split('/')[-1].replace('.ex', '')) if not os.path.isdir(est_subdir): os.makedirs(est_subdir) accu_tt_loss = 0. accu_n_frames = 0 for k, egs in enumerate(tt_loader): mix = egs['mix'] sph = egs['sph'] n_samples = egs['n_samples'] n_frames = countFrames(n_samples, self.win_size, self.hop_size) mix = mix.to(self.device) sph = sph.to(self.device) feat, lbl = feeder(mix, sph) with torch.no_grad(): loss_mask = lossMask(shape=lbl.shape, n_frames=n_frames, device=self.device) est = net(feat) loss = criterion(est, lbl, loss_mask, n_frames) accu_tt_loss += loss.data.item() * sum(n_frames) accu_n_frames += sum(n_frames) sph_idl = resynthesizer(lbl, mix) sph_est = resynthesizer(est, mix) # save estimates mix = mix[0].cpu().numpy() sph = sph[0].cpu().numpy() sph_est = sph_est[0].cpu().numpy() sph_idl = sph_idl[0].cpu().numpy() mix, sph, sph_est, sph_idl = wavNormalize(mix, sph, sph_est, sph_idl) sf.write(os.path.join(est_subdir, '{}_mix.wav'.format(k)), mix, self.sample_rate) sf.write(os.path.join(est_subdir, '{}_sph.wav'.format(k)), sph, self.sample_rate) sf.write(os.path.join(est_subdir, '{}_sph_est.wav'.format(k)), sph_est, self.sample_rate) if self.write_ideal: sf.write(os.path.join(est_subdir, '{}_sph_idl.wav'.format(k)), sph_idl, self.sample_rate) avg_tt_loss = accu_tt_loss / accu_n_frames logger.info('loss: {:.4f}\n'.format(avg_tt_loss)) return