def recognise(self, mode='sphinx', marg=0.2):
    import speech_recognition as srec
    # use the audio file as the audio source
    r = srec.Recognizer()
    if mode == 'sphinx':
        recogniser = r.recognize_sphinx
    if mode == 'google':
        recogniser = r.recognize_google
    for ii, (ts, te, lab) in enumerate(zip(self.tst, self.tend, self.label)):
        tstart = ts - marg
        tend = te + marg
        wo = self.x[int(tstart * self.sr):int(tend * self.sr)]
        wavwrite('speech_sample.wav', self.sr, wo)
        with srec.AudioFile('speech_sample.wav') as source:
            audio = r.record(source)  # read the entire audio file
        try:
            # for testing purposes, we're just using the default API key
            # to use another API key, use `r.recognize_google(audio, key="GOOGLE_SPEECH_RECOGNITION_API_KEY")`
            # instead of `r.recognize_google(audio)`
            utt = recogniser(audio)
            #utt = r.recognize_sphinx(audio)
            self.label[ii] = utt
        except srec.UnknownValueError:
            print("Speech Recognition could not understand audio")
        except srec.RequestError as e:
            print("Could not request results {}".format(e))
def recognise(self, mode='sphinx', marg=0.2):
    import speech_recognition as srec
    # use the audio file as the audio source
    r = srec.Recognizer()
    if mode == 'sphinx':
        recogniser = r.recognize_sphinx
        sys.stderr.write('Doing speech recognition with sphinx\n')
    if mode == 'google':
        sys.stderr.write('Doing speech recognition with google\n')
        recogniser = r.recognize_google
    for ii, (ts, te, lab) in enumerate(zip(self.tst, self.tend, self.label)):
        tstart = ts - marg
        tend = te + marg
        wo = self.x[int(tstart * self.sr):int(tend * self.sr)]
        wavwrite('speech_sample.wav', self.sr, wo.astype('int16'))
        with srec.AudioFile('speech_sample.wav') as source:
            audio = r.record(source)  # read the entire audio file
        try:
            # for testing purposes, we're just using the default API key
            # to use another API key, use `r.recognize_google(audio, key="GOOGLE_SPEECH_RECOGNITION_API_KEY")`
            # instead of `r.recognize_google(audio)`
            utt = recogniser(audio)
            #utt = r.recognize_sphinx(audio)
            self.label[ii] = utt
            sys.stderr.write('{}\n'.format(utt))
        except srec.UnknownValueError:
            sys.stderr.write("Speech Recognition could not understand audio\n")
        except srec.RequestError as e:
            sys.stderr.write("Could not request results {}\n".format(e))
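# A minimal, self-contained sketch of the SpeechRecognition pattern the two
# recognise() methods above rely on (record an AudioFile, then decode it
# offline with PocketSphinx). The helper name and the file name
# 'speech_sample.wav' are illustrative assumptions, not part of the original.
import speech_recognition as srec

def transcribe_wav(wav_path='speech_sample.wav'):
    r = srec.Recognizer()
    with srec.AudioFile(wav_path) as source:
        audio = r.record(source)          # read the entire file into an AudioData object
    try:
        return r.recognize_sphinx(audio)  # offline decoding via pocketsphinx
    except srec.UnknownValueError:        # audio was unintelligible
        return None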
def main(): try: out_dir, in_dir = sys.argv[1], sys.argv[2] except: in_dir = '../../data/sc09' out_dir = '../../data/sc09_wav' if not os.path.isdir(out_dir): os.makedirs(out_dir) tfrecord_fps = glob.glob(os.path.join(in_dir, '*.tfrecord')) dataset = tf.data.TFRecordDataset(tfrecord_fps) dataset = dataset.map(_mapper) dataset = dataset.apply(tf.contrib.data.batch_and_drop_remainder(1)) x, y = dataset.make_one_shot_iterator().get_next() x, y = x[0], y[0] with tf.Session() as sess: i = 0 while True: try: _x, _y = sess.run([x, y]) except: break _x *= 32767. _x = np.clip(_x, -32767., 32767.) _x = _x.astype(np.int16) wavwrite(os.path.join(out_dir, '{}_{}.wav'.format(_y, str(i).zfill(5))), 16000, _x) i += 1
def main(argv): if str(argv[0]) == "help": print("python genwave.py FREQ AMP DURATION FS NUMCH BITS f/i OUTPATH") return freq = float(argv[0]) if len(argv) > 0 else 440 amp = float(argv[1]) if len(argv) > 1 else 0.7 duration = float(argv[2]) if len(argv) > 2 else 30 fs = int(argv[3]) if len(argv) > 3 else 44100 nch = int(argv[4]) if len(argv) > 4 else 2 bits = int(argv[5]) if len(argv) > 5 else 16 f_or_i = argv[6] if len(argv) > 6 else "i" outpath = str(argv[7]) if len(argv) > 7 else "out.wav" if bits == 16: sig = np.zeros([int(fs * duration), nch], dtype=np.int16) elif bits == 32: if f_or_i == "i": sig = np.zeros([int(fs * duration), nch], dtype=np.int32) else: sig = np.zeros([int(fs * duration), nch], dtype=np.float32) else: print("invalid bit-width: 16/32 required") return sinewave = amp * np.sin( np.arange(sig.shape[0]) * freq * 1.0 / fs * 2 * np.pi) for ich in range(sig.shape[1]): if f_or_i == "i": sig[:, ich] = np.round(sinewave * (2**(bits - 1) - 1)) else: sig[:, ich] = sinewave[:] wavwrite(outpath, fs, sig)
def recover_samples_from_spectrum(logspectrum_stft, spectrum_phase, save_to):
    abs_spectrum = np.exp(logspectrum_stft)
    spectrum = abs_spectrum * (np.exp(1j * spectrum_phase))
    istft_graph = tf.Graph()
    with istft_graph.as_default():
        num_fea = int(FLAGS.Fs * 0.025) // 2 + 1  # integer division so the placeholder shape is an int
        frame_length = int(FLAGS.Fs * 0.025)
        frame_step = int(FLAGS.Fs * 0.010)
        stft_ph = tf.placeholder(tf.complex64, shape=(None, num_fea))
        samples = tf.signal.inverse_stft(
            stft_ph, frame_length, frame_step, frame_length,
            window_fn=tf.signal.inverse_stft_window_fn(
                frame_step,
                forward_window_fn=functools.partial(tf.signal.hann_window, periodic=True)))
        istft_sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True))
        samples_ = istft_sess.run(samples, feed_dict={stft_ph: spectrum})
        wavwrite(save_to, FLAGS.Fs, samples_)
    return samples_
def getFilteredDataList(inputFileName, load=True, save=True):
    frequencyFilterWidthMap = getNs()
    result = []
    [originalSampleRate, original] = wavread(inputFileName)
    for cutOffFrequency in frequencyFilterWidthMap:
        outputFileName = inputFileName + '_noisedfiltered_' + str(cutOffFrequency) + '.wav'
        if os.path.isfile(outputFileName) and load:
            print("Loading file ", outputFileName, " from disk.")
            [sampleRate, x] = wavread(outputFileName)
            if sampleRate != originalSampleRate:
                raise ValueError("Sample rate of file ", outputFileName,
                                 " does not equal the sample rate of",
                                 " the original file ", inputFileName)
        else:
            windowSize = frequencyFilterWidthMap[cutOffFrequency]
            print('Generating noisedfiltered ', cutOffFrequency, ' data, with windowSize ', windowSize)
            x = running_mean(original, windowSize).astype('int16')
            x = x + np.random.normal(0, 10, len(x)).astype('int16')  # to add noise.
            if save:
                wavwrite(outputFileName, originalSampleRate, x)
                print("saved: ", outputFileName)
        if len(x) != len(original):
            raise ValueError("Filtering the wav file failed. Original input is ", len(original), " long",
                             "but the filtered data is ", len(x), " long.")
        result.append(x)
    return (result, originalSampleRate)
def test_signal_normalize(tmp_path): sig = np.sin(2*np.pi*np.linspace(0,1,50)) sig1 = sig * .8 sig2 = sig * .4 rms_out1 = 0.700 # Should be very close to full amplitude rms_out2 = 0.350 # Should be very close to half amplitude with tempfile.TemporaryDirectory() as tmpdirin: infile1 = os.path.join(tmpdirin, 'file1.wav') infile2 = os.path.join(tmpdirin, 'file2.wav') wavwrite(infile1, 50, sig1) wavwrite(infile2, 50, sig2) with tempfile.TemporaryDirectory() as tmpdirout: psylab.signal.normalize(tmpdirin, ext='wav', outdir=tmpdirout, relative=False) fs,out1 = wavread(os.path.join(tmpdirout, 'file1.wav')) fs,out2 = wavread(os.path.join(tmpdirout, 'file2.wav')) #print("sig: {:}".format(np.sqrt(np.mean(np.square(sig))))) #print("sig1: {:}".format(np.sqrt(np.mean(np.square(sig1))))) #print("sig2: {:}".format(np.sqrt(np.mean(np.square(sig2))))) #print("out1: {:}".format(np.sqrt(np.mean(np.square(out1))))) #print("out2: {:}".format(np.sqrt(np.mean(np.square(out2))))) np.testing.assert_allclose(rms_out1, np.sqrt(np.mean(np.square(out1))), rtol=1e-3) np.testing.assert_allclose(rms_out2, np.sqrt(np.mean(np.square(out2))), rtol=1e-3)
def gen(path, endian="little"): endian_tag = "<" if endian == "little" else ">" print("parse the file \"{}\"".format(path)) tags = path.split("_") tags = map(lambda s: s.replace(".pcm", ""), tags) fs_tag = filter(lambda s: s.startswith("sr"), tags) if len(fs_tag) > 0: fs = int(fs_tag[0][2:]) else: print("filename \"{}\" does not match the format.".format(path)) return ch_tag = filter(lambda s: s.startswith("ch"), tags) if len(ch_tag) > 0: num_ch = int(ch_tag[0][2:]) else: print("filename \"{}\" does not match the format.".format(path)) return format_tag = filter(lambda s: s.startswith("format"), tags) if len(format_tag) > 0: proc_format = int(format_tag[0][6:]) else: print("filename \"{}\" does not match the format.".format(path)) return if not proc_format in [1, 5, 6]: print("the format must be pcm16/pcm24/pcmfloat.") return with open(path, "rb") as f: raw_data = f.read() if proc_format == 1: signal = np.array(struct.unpack( "{}{}h".format(endian_tag, len(raw_data) / 2), raw_data), dtype=np.int16) elif proc_format == 5: signal = np.array(struct.unpack( "{}{}f".format(endian_tag, len(raw_data) / 4), raw_data), dtype=np.float32) elif proc_format == 6: data_len = len(raw_data) / 3 data = "" for x in range(data_len): data += raw_data[x * 3 + 1:x * 3 + 3] signal = np.array(struct.unpack( "{}{}h".format(endian_tag, len(data) / 2), data), dtype=np.int16) if num_ch > 1: signal = np.reshape(signal, (len(signal) / num_ch, num_ch)) output_path = "{}.wav".format(path) wavwrite(filename=output_path, rate=fs, data=signal) print("generate the file \"{}\"".format(output_path))
def main():
    sound_filename = sys.argv[1]
    frequency = float(sys.argv[2])
    duration = float(sys.argv[3])
    amplitude = float(sys.argv[4])
    sampling_rate = int(sys.argv[5])
    data = sine(frequency, duration, amplitude, sampling_rate)
    wavwrite(sound_filename, sampling_rate, data)
def _wav_write(wav_fp, fs, wav_f, normalize=False):
    if normalize:
        wav_f_max = wav_f.max()
        if wav_f_max != 0.0:
            wav_f /= wav_f_max  # scale so the positive peak sits at 1.0
    wav_f = np.clip(wav_f, -1.0, 1.0)
    wav = (wav_f * 32767.0).astype(np.int16)
    wavwrite(wav_fp, fs, wav)
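# Hedged usage sketch for the helper above: synthesise a quiet 1 kHz tone and
# let _wav_write normalise it before the int16 conversion. The file name
# 'tone.wav' is only an example.
import numpy as np

fs = 16000
t = np.arange(fs) / float(fs)                # one second of time stamps
tone = 0.1 * np.sin(2 * np.pi * 1000.0 * t)  # float signal well below full scale
_wav_write('tone.wav', fs, tone, normalize=True)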
def transcribe(self, input_audio, model_path=None, output="./"):
    """Transcribe frame-level fundamental frequency of vocal from the given audio.

    Parameters
    ----------
    input_audio: Path
        Path to the wav audio file.
    model_path: Path
        Path to the trained model or the transcription mode. If given a path, should be
        the folder that contains `arch.yaml`, `weights.h5`, and `configuration.yaml`.
    output: Path (optional)
        Path for writing out the extracted vocal f0. Default to current path.

    Returns
    -------
    f0: txt
        The transcribed f0 of the vocal contour in Hz.

    See Also
    --------
    omnizart.cli.vocal_contour.transcribe: The corresponding command line entry.
    """
    if not os.path.isfile(input_audio):
        raise FileNotFoundError(f"The given audio path does not exist. Path: {input_audio}")

    logger.info("Loading model...")
    model, model_settings = self._load_model(model_path)

    logger.info("Extracting feature...")
    feature = extract_cfp_feature(
        input_audio,
        hop=model_settings.feature.hop_size,
        win_size=model_settings.feature.window_size,
        down_fs=model_settings.feature.sampling_rate)

    logger.info("Predicting...")
    f0 = inference(feature[:, :, 0], model, timestep=model_settings.training.timesteps)
    agg_f0 = aggregate_f0_info(f0, t_unit=model_settings.feature.hop_size)

    timestamp = np.arange(len(f0)) * model_settings.feature.hop_size
    wav = sonify.pitch_contour(
        timestamp, f0, model_settings.feature.sampling_rate, amplitudes=0.5 * np.ones(len(f0)))

    output = self._output_midi(output, input_audio, verbose=False)
    if output is not None:
        write_agg_f0_results(agg_f0, f"{output}_f0.csv")
        wavwrite(f"{output}_trans.wav", model_settings.feature.sampling_rate, wav)
        logger.info("Text and Wav files have been written to %s",
                    os.path.abspath(os.path.dirname(output)))

    logger.info("Transcription finished")
    return agg_f0
def main(**kwargs):
    outfile = kwargs['outfile'][0]
    infile = kwargs['infile']
    # print "Filtering %s to %s" % (infile, outfile)
    data, rate = ffmpeg_load_audio(infile, 44100, True, dtype=np.float32)
    wavwrite('test.wav', 44100, data)
    filtered_data = butter_bandpass_filter(data, 100.0, 3000.0, 44100)
    wavwrite(outfile, 44100, filtered_data)
def process_filtering(self, sig_float_filtered, write=False, output_file_name=None):
    self.filtered = True
    self.sig_int = float2pcm(sig_float_filtered)
    self.sig_float = sig_float_filtered
    if write:
        wavwrite(output_file_name, self.sr, self.sig_int)
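# float2pcm is not shown in this snippet; a common implementation (assumed here,
# not taken from the original source) clips a [-1.0, 1.0] float signal and
# scales it to int16 full scale:
import numpy as np

def float2pcm(sig, dtype='int16'):
    sig = np.clip(np.asarray(sig, dtype='float64'), -1.0, 1.0)
    return (sig * np.iinfo(dtype).max).astype(dtype)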
def generate_audio(self, fake_r, fake_i, order):
    audio_dir = './audio/' + str(order) + '/'
    if not os.path.isdir(audio_dir):
        os.makedirs(audio_dir)
    for i in range(self.batch_size):
        audio_fp = os.path.join(audio_dir, '{}.wav'.format(str(i)))
        fake = fake_r[i, 0] + 1j * fake_i[i, 0]
        audio = spec_to_wav(fake)
        wavwrite(audio_fp, self.fs, audio)
    print("Done generating audio :)")
def resample(path_in, path_out=None, sampling_rate=16000):
    # load and resample
    y, sr = librosa.load(path_in, sr=sampling_rate)
    # save, adding '-16k' as suffix if path_out is None
    if path_out is None:
        root, ext = os.path.splitext(path_in)
        path_out = root + '-16k' + ext
    wavwrite(path_out, sr, (y * 2 ** 15).astype(np.int16))
    print('save wav file', path_out)
def get_wav(self, *, resample=True):
    '''Retrieves io.BytesIO() packed with `.wav` contents'''
    result = self.resample_fs(self.BULLSHITWAVNUMBER) if resample \
        else self.copy()
    data = result.normdata(dtype=np.int16)
    bytes_io = io.BytesIO()
    wavwrite(bytes_io, result._fs, data)
    return bytes_io
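# Hedged usage sketch: scipy.io.wavfile accepts file-like objects, so the
# BytesIO buffer returned above can be read back (or streamed) without touching
# disk. `obj` stands in for whatever instance exposes get_wav(); it is not
# defined in the snippet above.
from scipy.io.wavfile import read as wavread

buf = obj.get_wav()
buf.seek(0)              # rewind before handing the buffer to a reader
fs, data = wavread(buf)  # round-trips the int16 samples written by wavwrite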
def tx1_to_wav(tx1_fp, out_fp, midi_downsample_rate=None):
    if midi_downsample_rate == 0:
        midi_downsample_rate = None
    print('(Rate {}) {}->{}'.format(midi_downsample_rate, tx1_fp, out_fp))
    with open(tx1_fp, 'r') as f:
        tx1 = f.read()
    midi = tx1_to_midi(tx1)
    wav = nesmdb.convert.midi_to_wav(midi, midi_downsample_rate)
    wavwrite(out_fp, 44100, wav)
    print('Done: {}'.format(wav.shape))
    return True
def main(argv): del argv tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver( FLAGS.tpu, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project) config = tf.contrib.tpu.RunConfig( cluster=tpu_cluster_resolver, model_dir=FLAGS.model_dir, tpu_config=tf.contrib.tpu.TPUConfig( num_shards=FLAGS.num_shards, iterations_per_loop=FLAGS.iterations_per_loop)) # Set module-level global variable so that model_fn and input_fn can be # identical for each different kind of dataset and model global dataset, model dataset = bias_input model = bias_model # TPU-based estimator used for TRAIN and EVAL est = tf.contrib.tpu.TPUEstimator(model_fn=model_fn, use_tpu=FLAGS.use_tpu, config=config, train_batch_size=FLAGS.batch_size, eval_batch_size=FLAGS.batch_size) # CPU-based estimator used for PREDICT (generating images) cpu_est = tf.contrib.tpu.TPUEstimator(model_fn=model_fn, use_tpu=False, config=config, predict_batch_size=_NUM_VIZ_AUDIO) current_step = estimator._load_global_step_from_checkpoint_dir( FLAGS.model_dir) # pylint: disable=protected-access,line-too-long tf.logging.info('Starting training for %d steps, current step: %d' % (FLAGS.train_steps, current_step)) # Render some generated images G_z = cpu_est.predict(input_fn=noise_input_fn) G_z = [p['generated_audio'][:, :] for p in G_z] G_z = np.array(G_z) preview_dir = './preview' if not os.path.isdir(preview_dir): os.makedirs(preview_dir) for i in range(len(G_z)): audio = np.int16(G_z[i] / np.max(np.abs(G_z[i])) * 32767) preview_fp = os.path.join( preview_dir, '{}_{}_{}.wav'.format(str(i % 10), str(current_step), str(i))) wavwrite(preview_fp, _FS, audio) tf.logging.info('Finished generating images')
def recover_samples_from_spectrum(logspectrum_stft, spectrum_phase, save_to=None):
    abs_spectrum = np.exp(logspectrum_stft)
    spectrum = abs_spectrum * (np.exp(1j * spectrum_phase))
    istft_graph = tf.Graph()
    with istft_graph.as_default():
        stft_ph = tf.placeholder(tf.complex64, shape=(None, 201))
        samples = tf.contrib.signal.inverse_stft(stft_ph, 400, 160, 400)
        istft_sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True))
        samples_ = istft_sess.run(samples, feed_dict={stft_ph: spectrum})
        if save_to:
            wavwrite(save_to, 16000, samples_)
    return samples_
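# For reference, a hedged sketch of the forward transform that would produce the
# (logspectrum_stft, spectrum_phase) pair consumed above, using the same frame
# parameters (frame_length=400, frame_step=160, fft_length=400 at 16 kHz, i.e.
# 201 frequency bins). It mirrors the shapes, not any particular project's code.
import numpy as np
import tensorflow as tf

def spectrum_from_samples(samples):
    with tf.Graph().as_default():
        wav_ph = tf.placeholder(tf.float32, shape=(None,))
        stft = tf.signal.stft(wav_ph, frame_length=400, frame_step=160, fft_length=400)
        with tf.Session() as sess:
            stft_ = sess.run(stft, feed_dict={wav_ph: samples})
    # log-magnitude and phase, matching the inverse path above
    return np.log(np.abs(stft_) + 1e-8), np.angle(stft_)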
def save_wav(path0, data, sr=44100):
    # print('file ', path0)
    amplitude = np.iinfo(np.int16).max
    max_data = np.amax(np.abs(data))  # normalize, max level is 16bit full scale
    if max_data < (1.0 / amplitude):
        max_data = 1.0
    try:
        wavwrite(path0, sr, np.array((amplitude / max_data) * data, dtype=np.int16))
    except:
        print('error: wavwrite ', path0)
        sys.exit()
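# Hedged usage sketch: save_wav() peak-normalises to int16 full scale, and the
# small-signal guard keeps near-silent buffers from being blown up by the
# division. File names here are illustrative only.
import numpy as np

fs = 44100
t = np.arange(fs) / float(fs)
save_wav('tone_full_scale.wav', 0.05 * np.sin(2 * np.pi * 440.0 * t), sr=fs)  # rescaled to full scale
save_wav('near_silence.wav', np.zeros(fs), sr=fs)                             # left at zero, no divide blow-up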
def record_audio(self, out_path, time_allowance):
    self.ids.record_button.disabled = True
    self.ids.reading_speed_slider.disabled = True
    self.ids.goto_text_input.disabled = True
    recording = self.mic.record(samplerate=self.fs,
                                numframes=int(time_allowance * self.fs),
                                channels=1)
    recording = recording / numpy.max(numpy.abs(recording))
    wavwrite(out_path, self.fs, recording)  # Save as WAV file
    self.recording_indicator = ""
    self.ids.record_button.disabled = False
    self.ids.reading_speed_slider.disabled = False
    self.ids.goto_text_input.disabled = False
    self.load_next_sentence()
def test_signal_equate(tmp_path): sig1 = np.sin(2*np.pi*np.linspace(0,1,50)) sig2 = sig1 * .8 with tempfile.TemporaryDirectory() as tmpdirin: infile1 = os.path.join(tmpdirin, 'file1.wav') infile2 = os.path.join(tmpdirin, 'file2.wav') wavwrite(infile1, 50, sig1) wavwrite(infile2, 50, sig2) with tempfile.TemporaryDirectory() as tmpdirout: psylab.signal.equate(tmpdirin, ext='wav', outdir=tmpdirout, relative=False) fs,out1 = wavread(os.path.join(tmpdirout, 'file1.wav')) fs,out2 = wavread(os.path.join(tmpdirout, 'file2.wav')) np.testing.assert_allclose(sig2, out1) np.testing.assert_allclose(sig2, out2) np.testing.assert_allclose(np.sqrt(np.mean(np.square(sig1)))*.8, np.sqrt(np.mean(np.square(out2))))
def filter(input_path, output_path, lowcut=100.0, highcut=3000.0, rate=44100):
    """
    Filters a wav file to get rid of frequencies not present in human speech. Band pass filter.

    :param input_path: the input wav path
    :param output_path: the output wav path
    :param lowcut: the lowest frequency to accept
    :param highcut: the highest frequency to accept
    :param rate: the sampling frequency
    :type rate: Number
    :returns: -1 if a file does not exist
    """
    if check_file_paths([input_path]) == -1:
        return -1
    data, rate = ffmpeg_load_audio(input_path, 44100, True, dtype=np.float32)
    filtered_data = butter_bandpass_filter(data, lowcut, highcut, rate)
    wavwrite(output_path, 44100, filtered_data)
def epoch_save(self, samples, dir_name, epoch):
    """
    Makes a mag/phase figure using _get_image and plotimage from samples and saves it
    as a .png file. Samples are also transformed to audio signals and saved as .wav
    files. All files are saved into a subdirectory with the epoch number as its name.

    Parameters
    ----------
    samples : ndarray
        Array of mag/phase tensors you want to save. Needs to hold at least 64 tensors.
    dir_name : string
        Name of the target directory.
    epoch : int
        Specifies the name of the directory containing the saved figure and .wav files.
        It is advisable that it is at most 3 digits long, since the name is zero-padded
        to 3 digits.
    """
    # Create new epoch directory
    ep_dir_name = dir_name + '/%s' % str(epoch).zfill(3) + '/'
    if not os.path.exists(ep_dir_name):
        os.makedirs(ep_dir_name)

    # Create an 8x8 grid of mag/phase images
    batch_size, _, width, _ = samples.shape
    samples_magphase = np.zeros((batch_size, width, width))
    # samples_magphase[:, :width // 2, :] = np.squeeze(samples[:, :, :, 0])
    samples_magphase[:, :width // 2, :] = samples[:, :, :, 0]  # Spectrogram
    samples_magphase[:, width // 2:, :] = samples[:, :, :, 1]  # Phase

    # Save the image
    fig = self.plotimage(samples_magphase)
    plt.savefig(ep_dir_name + 'mag_phase.png', bbox_inches='tight')
    plt.close(fig)

    # Save sound samples
    for i, magphase in enumerate(samples):
        audio = self._image_to_audio(magphase)
        wavwrite(ep_dir_name + "%s.wav" % str(i).zfill(3), int(self.rate), audio / self.rate)
def handle_signals(mixedpath, noisepospath, noisenegpath): try: # Read Wavs mixedsamples = read_wav(mixedpath) noisepossamples = read_wav(noisepospath) noisenegsamples = read_wav(noisenegpath) # Normalize max_scale = max(abs(mixedsamples)+0.000001) mixedsamples = mixedsamples / max_scale wavwrite('/home/user/Desktop/N_HANS_Github/N_HANS___Selective_Noise/audio_examples/mixed_normalised', 16000, mixedsamples) noisepossamples = noisepossamples / max_scale noisenegsamples = noisenegsamples / max_scale mixedsamples = mixedsamples.astype(np.float32) noisepossamples = noisepossamples.astype(np.float32) noisenegsamples = noisenegsamples.astype(np.float32) # Cut the end to have an exact number of frames if (len(mixedsamples) - 400) % 160 != 0: mixedsamples = mixedsamples[:-((len(mixedsamples) - 400) % 160)] nsepos = noisepossamples nseneg = noisenegsamples while len(mixedsamples) - len(nsepos) > 0: # Make noise longer diff = len(mixedsamples) - len(nsepos) nsepos = np.concatenate([nsepos, noisepossamples[:diff]], axis=0) while len(mixedsamples) - len(nseneg) > 0: # Make noise longer diff = len(mixedsamples) - len(nseneg) nseneg = np.concatenate([nseneg, noisenegsamples[:diff]], axis=0) if len(mixedsamples) - len(noisepossamples) < 0: # Make noise shorter nsepos = noisepossamples[:len(mixedsamples)] if len(mixedsamples) - len(noisenegsamples) < 0: # Make noise shorter nseneg = noisenegsamples[:len(mixedsamples)] noisepossamples = nsepos noisenegsamples = nseneg return noisepossamples, noisenegsamples, mixedsamples except: print('error in threads') print(mixedpath, noisepospath, noisenegpath)
def filter(input_path, output_path, lowcut=100.0, highcut=3000.0, rate=44100):
    """
    Filters a wav file to get rid of frequencies not present in human speech. Band pass filter.

    :param input_path: the input wav path
    :param output_path: the output wav path
    :param lowcut: the lowest frequency to accept
    :param highcut: the highest frequency to accept
    :param rate: the sampling frequency
    :type rate: Number
    :returns: -1 if a file does not exist
    """
    if check_file_paths([input_path]) == -1:
        return -1
    data, rate = ffmpeg_load_audio(input_path, 44100, True, dtype=np.float32)
    filtered_data = butter_bandpass_filter(data, lowcut, highcut, rate)
    # return filtered_data
    wavwrite(output_path, 44100, filtered_data)
def runTest():
    dir = 'ressources/'
    files = [
        dir + 'test1.mpg',
        dir + 'test2.mpg',
        dir + 'test3.mpg',
        dir + 'test4.mpg',
        dir + 'test5.mpg'
    ]
    sg = SoundGenerator(sound)
    j = 0
    fileList = []
    for i in files:
        objs = k.kanadeHarris(i, sound)
        data, fs = sg.soundGenerationForVideoPurpose(objs)
        wavwrite("soundTest" + str(j) + ".wav", fs, data)
        genNewVideo('ressources/test' + str(j + 1) + '.mpg_out.avi', "soundTest" + str(j) + ".wav", j)
        fileList.append('res/output' + str(j) + '.avi')
        j += 1
    concatRes(fileList)
def reconstructFromSTFT(spectrum_phase, logspectrum_stft, save_to):
    abs_spectrum = np.exp(logspectrum_stft)
    spectrum_phase = np.array(spectrum_phase)
    spectrum = abs_spectrum * (np.exp(1j * spectrum_phase))
    istft_graph = tf.Graph()
    with istft_graph.as_default():
        num_fea = FFT_LENGTH // 2 + 1
        frame_length = WIN_SAMPLES
        frame_step = HOP_SAMPLES
        stft_ph = tf.placeholder(tf.complex64, shape=(None, num_fea))
        samples = tf.signal.inverse_stft(stft_ph, frame_length, frame_step, FFT_LENGTH)
        istft_sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True))
        samples_ = istft_sess.run(samples, feed_dict={stft_ph: spectrum})
        wavwrite(save_to, SAMPLING_RATE, samples_)
    return samples_
def average(*args):
    """
    Averages multiple wav files together. Accomplishes this by performing fast fourier
    transforms on the data, averaging those arrays, and then performing an inverse fast
    fourier transform.

    :param args: array of wav paths with the output being the first path and the rest being inputs
    :returns: -1 if it fails or if it cannot find the paths

    :Example:

    >>> import speechprocessing
    >>> speechprocessing.average('output.wav', 'input_one.wav', 'input_two.wav')
    """
    if len(args) < 2:
        print 'Invalid number of arguments'
        return -1
    output_path = args[0]
    input_paths = args[1:]
    processed_wav_data = []
    if check_file_paths(input_paths) == -1:
        return -1
    for path in input_paths:
        data, rate = ffmpeg_load_audio(path, 44100, True, dtype=np.float32)
        filtered_data = butter_bandpass_filter(data, 100.0, 3000.0, 44100)
        processed_wav_data.append(filtered_data)
    fft_data = []
    for data in processed_wav_data:
        fft_data.append(np.fft.rfft(data))
    # zip(*fft_data) transposes the list of spectra, so each tuple holds one
    # frequency bin across all files; mapping np.mean over it averages bin-wise
    zipped_data = zip(*fft_data)
    mean_data = map(np.mean, zipped_data)
    # Reverse real fft
    averaged = np.fft.irfft(mean_data)
    wavwrite(output_path, 44100, averaged)
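# The transpose-and-average above can be written more directly with numpy when
# all inputs have equal length; a hedged equivalent sketch:
import numpy as np

def average_spectra(signals):
    ffts = [np.fft.rfft(s) for s in signals]          # one spectrum per input
    mean_spectrum = np.mean(np.stack(ffts), axis=0)   # bin-wise average, same as zip(*)/map(np.mean, ...)
    return np.fft.irfft(mean_spectrum)                # back to the time domain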
def log_batch(self, pr_batch, t_batch, mix_batch, mixture_rec=None):
    """!
    :param pr_batch: Reconstructed wavs: Torch Tensor of size:
                     batch_size x num_sources x length_of_wavs
    :param t_batch: Target wavs: Torch Tensor of size:
                    batch_size x num_sources x length_of_wavs
    :param mix_batch: Batch of the mixtures: Torch Tensor of size:
                      batch_size x 1 x length_of_wavs
    :param mixture_rec: Batch of the reconstructed mixtures: Torch Tensor of size:
                        batch_size x 1 x length_of_wavs
    """
    mixture = mix_batch.detach().cpu().numpy()
    true_sources = t_batch.detach().cpu().numpy()
    pred_sources = pr_batch.detach().cpu().numpy()
    for b_ind in range(self.bs):
        self.log_example(pred_sources[b_ind],
                         true_sources[b_ind],
                         mixture[b_ind],
                         title='bind_{}'.format(b_ind))
    if mixture_rec is not None:
        mixture_rec_np = mixture_rec.detach().cpu().numpy()
        for b_ind in range(self.bs):
            rec_mix_name = "bind_{}_rec_mix.wav".format(b_ind)
            rec_mix_wav = (mixture_rec_np[b_ind][0] /
                           (np.max(np.abs(mixture_rec_np[b_ind][0])) + 10e-8))
            wavwrite(os.path.join(self.dirpath, rec_mix_name), self.fs, rec_mix_wav)
def main(**kwargs):
    outfile = kwargs['outfile'][0]
    infile = kwargs['infile']
    print "Filtering %s to %s" % (infile, outfile)
    rate, sound_samples = wavread(infile)
    mono = True
    if 'ndarray' in str(type(sound_samples[0])):
        mono = False
    # data,r = ffmpeg_load_audio('32but.wav', 44100, True, dtype=np.float32)
    sound_samples, rate = ffmpeg_load_audio(infile, rate, mono, dtype=np.float32)
    fs = 44100.0
    lowcut = 100.0
    highcut = 3000.0
    # b,a = butter_bandpass(lowcut, highcut, fs, 5)
    # filtered = lfilter(b, a, sound_samples)
    # filtered = butter_bandpass_filter(sound_samples, lowcut, highcut, fs, 5)
    # filtered = butter_bandpass_filter_two(sound_samples, lowcut, highcut, fs, 5)
    wavwrite(outfile, rate, sound_samples)
def main():
    sig = 3
    offset = 0
    rate, data, nrate, ndata = get_all_data()
    timelength = data.shape[0] / rate
    times = np.linspace(0., timelength, data.shape[0])
    kernel = signal.gaussian(data.size, sig)
    f, pxx, nf, npxx, convolved, nconvolved = get_power_spectra(kernel, sig, PLOT_FLAG)
    # Test smoothing
    gaussian_smoothing_test(PLOT_FLAG)
    # Apply custom low-pass filter
    filtered = low_pass(convolved, nconvolved, f, times, PLOT_FLAG)
    # Recover the final signal
    recovered_signal = recover(filtered, data, times, offset, PLOT_FLAG)
    # Save this signal into a wav file (astype returns a copy, so keep the result)
    recovered_signal = recovered_signal.real.astype(np.int16)
    wavwrite("reconstructed.wav", rate, recovered_signal)
def log_example(self, pred_sources, true_sources, mixture, title=''):
    mix_name = "{}_mix.wav".format(title)
    mix_wav = (mixture[0] / (np.max(np.abs(mixture[0])) + 10e-8))
    wavwrite(os.path.join(self.dirpath, mix_name), self.fs, mix_wav)
    for s_ind in range(self.n_sources):
        true_s_name = "{}_true_s{}.wav".format(title, s_ind)
        rec_s_name = "{}_rec_s{}.wav".format(title, s_ind)
        rec_wav = (pred_sources[s_ind] /
                   (np.max(np.abs(pred_sources[s_ind])) + 10e-8))
        true_wav = (true_sources[s_ind] /
                    (np.max(np.abs(true_sources[s_ind])) + 10e-8))
        wavwrite(os.path.join(self.dirpath, true_s_name), self.fs, true_wav)
        wavwrite(os.path.join(self.dirpath, rec_s_name), self.fs, rec_wav)
recons_vt_psd = lpc.lpc2psd(lpc_coef, g, fft_size) recons_psd *= recons_vt_psd recons_psds.append(recons_psd) recons_vt_psds.append(recons_vt_psd) recons_psds = np.array(recons_psds) recons_vt_psds = np.array(recons_vt_psds) df = pysptk.synthesis.AllZeroDF(ORDER) synthesizer = pysptk.synthesis.Synthesizer(df, int(fs * 0.005)) x_glottal_res_zerodf = synthesizer.synthesis(x, lpcs / gains) df = pysptk.synthesis.AllZeroDF(3) synthesizer = pysptk.synthesis.Synthesizer(df, int(fs * 0.005)) x_res_zerodf = synthesizer.synthesis(x_glottal_res_zerodf, glottal_lpcs / glottal_gains) wavwrite('x_glottal_res_zerodf.wav', fs, (x_glottal_res_zerodf * 2**15).astype(np.int16)) wavwrite('x_res_zerodf.wav', fs, (x_res_zerodf * 2**15).astype(np.int16)) y = synthesisRequiem.get_waveform(x_res_zerodf, np.transpose(recons_psds, [1, 0]), dat['temporal_positions'], dat['f0'], dat['fs']) y_from_glottal = synthesisRequiem.get_waveform( x_glottal_res_zerodf, np.transpose(recons_vt_psds, [1, 0]), dat['temporal_positions'], dat['f0'], dat['fs']) wavwrite('x_recons_zerodf.wav', fs, (y * 2**15).astype(np.int16)) wavwrite('x_recons_glottal_zerodf.wav', fs, (y_from_glottal * 2**15).astype(np.int16))
output_filepath = sys.argv[1]
input_filepaths = sys.argv[2:]

fs = 44100
lowcut = 100.0   # Low pass cutoff
highcut = 3000.0 # High pass cutoff

processed_wav_data = []
for path in input_filepaths:
    data, rate = ffmpeg_load_audio(path, 44100, True, dtype=np.float32)
    filtered_data = butter_bandpass_filter(data, lowcut, highcut, 44100)
    processed_wav_data.append(filtered_data)

fft_data = []
for data in processed_wav_data:
    fft_data.append(np.fft.rfft(data))

# zip(*fft_data) transposes the list of spectra so each tuple holds one
# frequency bin across all files; np.mean over each tuple averages bin-wise
zipped_data = zip(*fft_data)
mean_data = map(np.mean, zipped_data)

# Reverse real fft
f = np.fft.irfft(mean_data)
wavwrite(output_filepath, fs, f)
def save_vgmwav(wav_fp, wav):
    wav *= 32767.
    wav = np.clip(wav, -32768., 32767.)
    wav = wav.astype(np.int16)
    wavwrite(wav_fp, 44100, wav)
isIce=allsicRS>thresh isIce[nanMask]=np.nan probIce=np.nanmean(isIce, axis=-1) p25=[] p75=[] for yit in range(len(boxPos)): p25.append(np.percentile(nSIF[yit,:], 25)) p75.append(np.percentile(nSIF[yit,:], 75)) p25_1850= np.percentile(nSIF1850, 25) p75_1850= np.percentile(nSIF1850, 75) data1=(sic[:,0]>15)*1. data2=(sic1850>15)*1. wavwrite('/Volumes/Pitcairn/seaicePPF/northernHemisphere/figures/OWnoise1850-2100.'+key+'.wav', 365*8, data1) wavwrite('/Volumes/Pitcairn/seaicePPF/northernHemisphere/figures/OWnoise1850BG'+key+'.wav', 365*8, data2) fig8, ax8 = plt.subplots(1, 2, sharey=True, sharex=False, num=None, figsize=(7,3.5), dpi=300, facecolor='w', edgecolor='w') fig8.sca(ax8[0]) plt.tick_params(axis='both', which='major', labelsize=6) plt.tick_params(axis='both', which='minor', labelsize=6) fig8.sca(ax8[1]) plt.tick_params(axis='both', which='major', labelsize=6) plt.tick_params(axis='both', which='minor', labelsize=6) fig8.patch.set_alpha(0.0) #ax5.set_aspect('equal', 'datalim') fig8.sca(ax8[1]) plt.imshow(meanAllsic.T, vmin=0, vmax=100, cmap=cmap) #plt.colorbar()
# sound_time = f.nframes*1.0/f.samplerate # sound_data = f.read_frames(f.nframes) # samples_to_take = int(math.floor(sound_time * display_sample_rate)) # time_step_for_samples = f.samplerate*1.0/display_sample_rate # wave = [] # for i in xrange(samples_to_take): # frame_offset = i * time_step_for_samples # if num_channels == 1: # wave.append(sound_data[frame_offset]) # else: # wave.append(sound_data[frame_offset][0]) rate, wave = wavread(infile) wavwrite('test.wav', rate, wave) (freq, amp) = get_component_frequencies(wave) # print type(s) # with open('data.txt', 'a') as textOutputFile: # for line in amp: # textOutputFile.write(str(line)) # textOutputFile.write(',') # Only plot first 4000 Hz hz=4000 freq = freq[0:hz] amp = amp[0:hz] fig = pylab.figure()
def sound(var):
    from scipy.io.wavfile import write as wavwrite
    scaled = np.int16(var / np.max(np.abs(var)) * 32767)
    stmp = np.asarray(scaled, dtype=np.int16)
    wavwrite('stmp.wav', 8820, stmp)
    sysfileopen("stmp.wav")
import numpy as np import wave from scipy.io.wavfile import write as wavwrite import struct LENWAV = 20000 # Must be <= 40000 / SAMPLEFACTOR for one-second wav files SAMPLEFACTOR = 4 img = np.zeros((1, 1, LENWAV, 1), dtype=int) waveFile = wave.open("../data/pitbull.wav", "rb") for i in range(0, LENWAV * SAMPLEFACTOR): waveData = waveFile.readframes(1) if i % SAMPLEFACTOR == 0: sound = struct.unpack("<h", waveData) img[0, 0, i / SAMPLEFACTOR, 0] = sound[0] print "NEW SAMPLE RATE", waveFile.getframerate() / SAMPLEFACTOR wavwrite( "../output/test.wav", waveFile.getframerate() / SAMPLEFACTOR, img.flatten() / float(np.max(np.abs(img.flatten()), axis=0)), )
def deemphasis(signal): return lfilter([1, 0.70], 1, signal) def rms(signal): return sqrt(mean(power(signal, 2))) if __name__ == "__main__": fs, data = wavread('Mann.wav') data = array(data, dtype=double) data /= amax(absolute(data)) data = decimate(data, 4) fs = round(fs/4) block_len = 0.032 overlap = 0.5 order = 16 out = vocode(data, fs, block_len, overlap, order) wavwrite('vocoded.wav', fs, array(out/amax(absolute(out)) * (2**15-1), dtype=int16)) figure() plot(data) figure() plot(out) show() # ideas: # use reduce(ola, map(process, array)) # http://stackoverflow.com/questions/6657820/python-convert-an-iterable-to-a-stream
def process_filtering(self, sig_float_filtered, write=False, output_file_name=None):
    self.filtered = True
    self.sig_int = float2pcm(sig_float_filtered)
    self.sig_float = sig_float_filtered
    if write:
        wavwrite(output_file_name, self.sr, self.sig_int)
def main(): import argparse import os import sys import traceback from tqdm import tqdm parser = argparse.ArgumentParser() conversion_to_types = { # VGM simplifiers 'vgm_simplify': ('.vgm', '.simp.vgm'), 'vgm_shorten': ('.vgm', '.short.vgm'), # NES disassembly raw 'vgm_to_ndr': ('.vgm', '.ndr.pkl'), 'ndr_to_txt': ('.ndr.pkl', '.ndr.txt'), 'txt_to_ndr': ('.ndr.txt', '.ndr.pkl'), 'ndr_to_vgm': ('.ndr.pkl', '.ndr.vgm'), # NES disassembly functional 'vgm_to_ndf': ('.vgm', '.ndf.pkl'), 'ndf_to_txt': ('.ndf.pkl', '.ndf.txt'), 'txt_to_ndf': ('.ndf.txt', '.ndf.pkl'), 'ndf_to_vgm': ('.ndf.pkl', '.ndf.vgm'), # NES language modeling format 'vgm_to_nlm': ('.vgm', '.nlm.pkl'), 'nlm_to_vgm': ('.nlm.pkl', '.nlm.vgm'), # NES-MDB score formats 'ndf_to_exprsco': ('.ndf.pkl', '.exprsco.pkl'), 'ndf_to_midi': ('.ndf.pkl', '.mid'), 'exprsco_to_seprsco': ('.exprsco.pkl', '.seprsco.pkl'), 'exprsco_to_blndsco': ('.exprsco.pkl', '.blndsco.pkl'), # WAV converters 'vgm_to_wav': ('.vgm', '.wav'), 'ndr_to_wav': ('.ndr.pkl', '.wav'), 'ndf_to_wav': ('.ndf.pkl', '.wav'), 'nlm_to_wav': ('.nlm.pkl', '.wav'), 'midi_to_wav': ('.mid', '.mid.wav'), 'exprsco_to_wav': ('.exprsco.pkl', '.exprsco.wav'), 'seprsco_to_wav': ('.seprsco.pkl', '.seprsco.wav'), 'blndsco_to_wav': ('.blndsco.pkl', '.blndsco.wav'), } conversion_to_kwargs = { 'vgm_simplify': ['vgm_simplify_nop1', 'vgm_simplify_nop2', 'vgm_simplify_notr', 'vgm_simplify_nono'], 'vgm_shorten': ['vgm_shorten_start', 'vgm_shorten_nmax'], 'ndf_to_exprsco': ['ndf_to_exprsco_rate'], 'midi_to_wav': ['midi_to_wav_rate'], } parser.add_argument('conversion', type=str, choices=conversion_to_types.keys()) parser.add_argument('fps', type=str, nargs='+') parser.add_argument('--out_dir', type=str) parser.add_argument('--skip_verify', action='store_true', dest='skip_verify') parser.add_argument('--vgm_shorten_start', type=int) parser.add_argument('--vgm_shorten_nmax', type=int) parser.add_argument('--vgm_simplify_nop1', action='store_true', dest='vgm_simplify_nop1') parser.add_argument('--vgm_simplify_nop2', action='store_true', dest='vgm_simplify_nop2') parser.add_argument('--vgm_simplify_notr', action='store_true', dest='vgm_simplify_notr') parser.add_argument('--vgm_simplify_nono', action='store_true', dest='vgm_simplify_nono') parser.add_argument('--ndf_to_exprsco_rate', type=float) parser.add_argument('--midi_to_wav_rate', type=float) parser.set_defaults( conversion=None, fps=None, out_dir=None, skip_verify=False, vgm_shorten_start=None, vgm_shorten_nmax=1024, vgm_simplify_nop1=False, vgm_simplify_nop2=False, vgm_simplify_notr=False, vgm_simplify_nono=False, ndf_to_exprsco_rate=None, midi_to_wav_rate=None) args = parser.parse_args() in_type, out_type = conversion_to_types[args.conversion] fps = args.fps if len(fps) > 1 and args.out_dir is None: raise Exception('Must specify output directory for batch mode') if len(fps) == 1 and args.out_dir is None: out_fps = [fps[0].replace(in_type, out_type)] else: out_fns = [os.path.basename(fp).replace(in_type, out_type) for fp in fps] out_fps = [os.path.join(args.out_dir, fn) for fn in out_fns] if os.path.exists(args.out_dir): print 'WARNING: Output directory {} already exists'.format(args.out_dir) else: os.makedirs(args.out_dir) for in_fp, out_fp in tqdm(zip(fps, out_fps)): if not args.skip_verify: _verify_type(in_fp, in_type) _verify_type(out_fp, out_type) # Load input file in_ext = in_type.split('.')[-1] if in_ext == 'pkl': with open(in_fp, 'rb') as f: in_file = pickle.load(f) elif in_ext in ['mid', 'vgm']: with open(in_fp, 'rb') as f: in_file = 
f.read() elif in_ext == 'txt': with open(in_fp, 'r') as f: in_file = f.read() else: raise NotImplementedError('Input extension .{} not recognized'.format(in_ext)) kwargs = {} if args.conversion in conversion_to_kwargs: kwargs = {kw:getattr(args, kw) for kw in conversion_to_kwargs[args.conversion]} try: out_file = globals()[args.conversion](in_file, **kwargs) except: print '-' * 80 print in_fp traceback.print_exc() continue # Save output file out_ext = out_type.split('.')[-1] if out_ext == 'pkl': with open(out_fp, 'wb') as f: pickle.dump(out_file, f) elif out_ext in ['mid', 'vgm']: with open(out_fp, 'wb') as f: f.write(out_file) elif out_ext == 'txt': with open(out_fp, 'w') as f: f.write(out_file) elif out_ext == 'wav': wav = out_file wav *= 32767. wav = np.clip(wav, -32767., 32767.) wav = wav.astype(np.int16) wavwrite(out_fp, 44100, wav) else: raise NotImplementedError('Output extension .{} not recognized'.format(out_ext))
sigs = mixsounds_ica(g,dg) sigs = np.array(sigs,dtype = 'uint8') plot(sigs[0],len(sigs[0])) plot(sigs[1],len(sigs[1])) plot(sigs[2],len(sigs[2])) plot(sigs[3],len(sigs[3])) plot(sigs[4],len(sigs[4])) plot(sigs[5],len(sigs[5])) plot(sigs[6],len(sigs[6])) plot(sigs[7],len(sigs[7])) plot(sigs[8],len(sigs[8])) # write in a file if 'output' not in os.listdir('.'): os.mkdir('output') for i in range(sigs.shape[0]): wavwrite('output/unmixedsound'+`i`+'.wav',8000,sigs[i]) # used to test the quality of the functions (part 1) elif sys.argv[1] == 'convergence': mean = 0 N = 100 N_n = N for i in range(N): s1,s2,n = square_cos_ica(g,dg) if n >= 0: mean += n #if algo diverged then don't count it else: N_n = N_n - 1 print(mean) print("The mean of the number of steps untill convergence is " +`mean/N_n`)