def _load_sounds(self):
    self._bar_audio_data, self._bar_audio_fs = soundfile.read(self._bar_file)
    self._beat_audio_data, self._beat_audio_fs = soundfile.read(self._beat_file)
    if self._division_file:
        self._division_audio_data, self._division_audio_fs = soundfile.read(self._division_file)
    else:
        self._division_audio_data, self._division_audio_fs = None, None
def main():
    # fs, bg_signal = wavfile.read(sys.argv[1])
    if argv[1] == 'batch':
        files = []
        for f in os.listdir(argv[2]):
            if os.path.splitext(f)[1] == ".flac":
                files.append(f)
        args = [(f, argv[2], argv[3]) for f in files]
        pool = multiprocessing.Pool(12)
        r = pool.map_async(compute_vad, args)
        r.wait()
        pool.close()
        pool.join()
        # for a in args:
        #     compute_vad(a)
    else:
        bg_signal, fs = soundfile.read(argv[1])
        ltsd = LTSD_VAD()
        bg_signal = bg_signal[:2000]
        print(bg_signal)
        ltsd.init_params_by_noise(fs, bg_signal)
        signal, fs = soundfile.read(argv[1])
        # vaded_signal = ltsd.filter(signal)
        segments, sig_len = ltsd.segments(signal)
        print(segments[0])
def test_write_non_seekable_file(file_w):
    with sf.SoundFile(file_w, 'w', 44100, 1, format='XI') as f:
        assert not f.seekable()
        assert f.frames == 0
        f.write(data_mono)
        assert f.frames == len(data_mono)
        with pytest.raises(RuntimeError) as excinfo:
            f.seek(2)
        assert "unseekable" in str(excinfo.value)
    with sf.SoundFile(filename_new) as f:
        assert not f.seekable()
        assert f.frames == len(data_mono)
        data = f.read(3, dtype='int16')
        assert np.all(data == data_mono[:3])
        data = f.read(666, dtype='int16')
        assert np.all(data == data_mono[3:])
        with pytest.raises(RuntimeError) as excinfo:
            f.seek(2)
        assert "unseekable" in str(excinfo.value)
        with pytest.raises(ValueError) as excinfo:
            f.read()
        assert "frames" in str(excinfo.value)
    data, fs = sf.read(filename_new, dtype='int16')
    assert np.all(data == data_mono)
    assert fs == 44100
    with pytest.raises(ValueError) as excinfo:
        sf.read(filename_new, start=3)
    assert "start is only allowed for seekable files" in str(excinfo.value)
def test_read_into_non_contiguous_out(file_stereo_r):
    out = np.empty(data_stereo.shape[::-1], dtype='float64')
    if getattr(sys, 'pypy_version_info', (999,)) < (2, 6):
        # The test for C-contiguous doesn't work with PyPy 2.5.0
        sf.read(file_stereo_r, out=out.T)
    else:
        with pytest.raises(ValueError) as excinfo:
            sf.read(file_stereo_r, out=out.T)
        assert "C-contiguous" in str(excinfo.value)
def get_traindata(gesfile, audio_f, dt, audio_fargs=None, wavfile=None,
                  ignore_f0=True):
    """Get input, output pairs for supervised learning training or testing.

    Parameters
    ----------
    gesfile : str
        Path to a .ges gesture file (XML format).
    audio_f : function
        A function that will be applied to the audio stream.
    dt : float
        Sampling step size for the gesture and audio trajectories.
    audio_fargs : dict, optional
        Keyword arguments that will be provided to ``audio_f``.
        By default, audio, sampling rate, and dt will be provided.
    wavfile : str, optional
        A .wav file that corresponds to the ``gesfile``.
        If specified but the file does not exist, it will be generated.
        If not specified, audio will be synthesized but not saved.
    """
    gs = parse_ges(gesfile, ignore_f0=ignore_f0)
    y = gs.trajectory(dt=dt)
    if wavfile is None:
        audio, fs = synthesize(gesfile)
    elif not os.path.exists(wavfile):
        synthesize(gesfile, wavfile)
        audio, fs = sf.read(wavfile)
    else:
        audio, fs = sf.read(wavfile)
    audio_fargs = {} if audio_fargs is None else audio_fargs.copy()
    audio_fargs.update({'audio': audio, 'fs': fs, 'dt': dt})
    x = audio_f(**audio_fargs)

    # For some reason, the wav file size and the gesture trajectory size
    # are often off by one or two. Here, we lengthen or shorten ``y``,
    # assuming that VTL is doing it correctly.
    # Not sure if that assumption is correct.
    if x.shape[0] > y.shape[0]:
        # Extend y by n timesteps
        toadd = np.tile(y[np.newaxis, -1], (x.shape[0] - y.shape[0], 1))
        y = np.concatenate((y, toadd))
    if x.shape[0] < y.shape[0]:
        # Shorten y by n timesteps
        todelete = list(range(x.shape[0], y.shape[0]))
        y = np.delete(y, todelete, 0)
    assert x.shape[0] == y.shape[0], "Misaligned; %s %s" % (x.shape, y.shape)
    return x, y, fs
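# A minimal usage sketch for get_traindata above; the gesture file path and
# the feature function are hypothetical. get_traindata supplies `audio`, `fs`
# and `dt` as keyword arguments to the feature function:
def mfcc_features(audio, fs, dt):
    # placeholder: one feature frame per dt seconds of audio
    return np.zeros((int(len(audio) / (fs * dt)), 13))

x, y, fs = get_traindata('example.ges', mfcc_features, dt=0.01)
assert x.shape[0] == y.shape[0]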
def show_file_hystogram(filename):
    data, sample_rate = sf.read(filename)  # load the data
    # a = data.T[0]  # this is a two channel soundtrack, take the first track
    b = [(ele / 2 ** 8.) * 2 - 1 for ele in data]  # 8-bit track, b is now normalized on [-1, 1)
    c = np.fft.fft(b)  # calculate fourier transform (complex numbers list)
    d = len(c) // 2 - 1  # you only need half of the fft list (real signal symmetry)
    k = np.arange(d)
    fs = 8000  # 8 kHz
    T = d / fs
    frqLabel = k / T
    c = abs(c[:d])
    c = [round(i, 1) for i in c]
    print("vector dimensionality: {0}".format(d))
    print("min: {0} max: {1}".format(min(c), max(c)))
    plt.gca().set_ylim([min(c), 20])
    plt.plot(frqLabel, c, 'g')
    plt.show()
def experimental_random_segmentation(audio_input, segments, options, sr):
    """
    (mir-dev branch in Sonidos Mutantes)

    Segments the file with random values according to the options
    """
    outputPath = options['outputPath']
    min_dur, max_dur = options['duration']
    try:
        x = read(audio_input)[0]
        for i in range(segments):
            while True:
                pos = random.uniform(0., 1.)  # normalized position in the file
                dur = random.uniform(min_dur, max_dur)
                durSamples = int(dur * sr)
                posSamples = int(pos * len(x))
                if posSamples + durSamples < len(x):
                    break
            signalOut = x[posSamples:posSamples + durSamples]
            baseName = os.path.splitext(audio_input)[0].split('/')[-1]
            if not os.path.exists(outputPath):
                os.makedirs(outputPath)
                print("Creating samples directory")
                time.sleep(4)
            outputFilename = outputPath + '/' + baseName + '_sample' + str(i) + '.wav'
            write_file(outputFilename, signalOut, sr)
            print("File generated: %s" % outputFilename)
            time.sleep(1)
    except Exception as e:
        print("Error: %s" % e)
def test_buffer_write_with_bytes(sf_stereo_w):
    b = b"\x01\x00\xFF\xFF\xFF\x00\x00\xFF"
    sf_stereo_w.buffer_write(b, 'short')
    sf_stereo_w.close()
    data, fs = sf.read(filename_new, dtype='int16')
    assert np.all(data == [[1, -1], [255, -256]])
    assert fs == 44100
def test_process_multiple(self):
    keyword_file_names = [
        'alexa', 'americano', 'avocado', 'blueberry', 'bumblebee',
        'caterpillar', 'christina', 'dragonfly', 'flamingo', 'francesca',
        'grapefruit', 'grasshopper', 'iguana', 'picovoice', 'pineapple',
        'porcupine', 'raspberry', 'terminator', 'vancouver']
    keyword_file_paths = [
        self._abs_path('../../resources/keyword_files/%s_%s.ppn'
                       % (name, self._keyword_file_extension()))
        for name in keyword_file_names]
    porcupine = Porcupine(
        library_path=self._library_path(),
        model_file_path=self._abs_path('../../lib/common/porcupine_params.pv'),
        keyword_file_paths=keyword_file_paths,
        sensitivities=[0.5] * len(keyword_file_paths))
    audio, sample_rate = soundfile.read(
        self._abs_path('../../resources/audio_samples/multiple_keywords.wav'),
        dtype='int16')
    assert sample_rate == porcupine.sample_rate

    num_frames = len(audio) // porcupine.frame_length
    results = []
    for i in range(num_frames):
        frame = audio[i * porcupine.frame_length:(i + 1) * porcupine.frame_length]
        result = porcupine.process(frame)
        if result >= 0:
            results.append(result)

    self.assertEqual(
        results,
        [15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18])
    porcupine.delete()
def test_write_int_data_to_float_file(file_inmemory):
    """This is a very uncommon use case."""
    sf.write(file_inmemory, data_mono, 44100, format='WAV', subtype='FLOAT')
    file_inmemory.seek(0)
    read, fs = sf.read(file_inmemory, always_2d=False, dtype='float32')
    assert np.all(read == data_mono)
    assert fs == 44100
def test_rplus_append_data(sf_stereo_rplus):
    sf_stereo_rplus.seek(0, sf.SEEK_END)
    sf_stereo_rplus.write(data_stereo / 2)
    sf_stereo_rplus.close()
    data, fs = sf.read(filename_new)
    assert np.all(data[:len(data_stereo)] == data_stereo)
    assert np.all(data[len(data_stereo):] == data_stereo / 2)
def importRirs(downloadDir, insertIntoDbF):
    url = "http://www.openslr.org/resources/13/RWCP.tar.gz"
    filename = join(downloadDir, "rwcp.tar.gz")
    unpackDir = join(downloadDir, "rwcp")

    dl = util.FileDownloader(url, filename)
    dl.download()
    dl.unpackTo(unpackDir)

    files = []
    for root, dirnames, filenames in os.walk(join(unpackDir, "RWCP/micarray/MICARRAY/data1")):
        for filename in filenames:
            if filename[-2:] != ".1":
                continue  # we only use the front microphone
            files.append(join(root, filename))

    pattern = re.compile(r"(circle|cirline)/(\w{3})/imp(\d{3})")
    bar = util.ConsoleProgressBar()
    bar.start("Import RWCP")
    for i, file in enumerate(sorted(files)):  # we sort to get same identifiers cross-platform
        m = pattern.search(file)
        assert m, "Could not parse room from path ({})".format(file)
        room = m.group(2)
        identifier = "{:04d}_{}_{}".format(i, room.lower(), m.group(3))
        x, fs = sf.read(file, dtype="float32", **RawFormat)
        x /= max(abs(x))
        x = ((2 ** 15 - 1) * x).astype(np.int16)  # scale to the int16 range
        insertIntoDbF((x, fs), identifier, {"source": "RWCP", "room": room})
        bar.progress(i / len(files))
    bar.end()
def update_max_len(file_path_list, max_len):
    tmp_max_len = 0
    # Update the max length based on the given dataset
    signal_set = set()
    for file_path in file_path_list:
        file_list = open(file_path)
        for line in file_list:
            line = line.strip().split()
            if len(line) < 2:
                print('Wrong audio list file record in the line:', line)
                continue
            file_str = line[0]
            if file_str in signal_set:
                continue
            signal_set.add(file_str)
            signal, rate = sf.read(file_str)  # signal: sample values, rate: sample rate
            if len(signal.shape) > 1:
                signal = signal[:, 0]
            if rate != FRAME_RATE:
                # up-sample or down-sample to the predefined sample rate
                signal = resampy.resample(signal, rate, FRAME_RATE, filter='kaiser_fast')
            if len(signal) > tmp_max_len:
                tmp_max_len = len(signal)
        file_list.close()
    if tmp_max_len < max_len:
        max_len = tmp_max_len
    return max_len
def _process_function(self, track, user_function, estimates_dir, evaluate):
    # load estimates from disk instead of processing
    if user_function is None:
        track_estimate_dir = op.join(estimates_dir, track.subset, track.filename)
        user_results = {}
        for target_path in glob.glob(track_estimate_dir + '/*.wav'):
            target_name = op.splitext(os.path.basename(target_path))[0]
            try:
                target_audio, rate = sf.read(target_path, always_2d=True)
                user_results[target_name] = target_audio
            except RuntimeError:
                pass
    else:
        # call the user provided function
        user_results = user_function(track)
    if estimates_dir and not evaluate and user_function is not None:
        self._save_estimates(user_results, track, estimates_dir)
    if evaluate:
        self._evaluate_estimates(user_results, track)
def play(self, file_path):
    if self.convert:
        self.convert_mp3_to_wav(file_path_mp3=file_path)
    data, fs = sf.read(file_path)
    sd.play(data, fs)
    sd.wait()
def do_segmentation(audio_input, audio_input_from_filename=True,
                    audio_input_from_array=False, sec_len=6, save_file=True):
    length = int(sec_len) * 10
    if audio_input_from_filename:
        x = read(audio_input)[0]
    if (not audio_input_from_filename) and audio_input_from_array:
        x = audio_input
    retriever = MIR(x, 44100)
    frame_size = 4096
    hop_size = 1024
    segments = [len(frame) / 44100 for frame in retriever.FrameGenerator()]
    output = []
    for segment in segments:
        sample = int(segment * 44100)
        output.append(x[:sample * length])  # extend duration of segment
    output = choice(output)
    if save_file:
        baseName = os.path.splitext(audio_input)[0].split('/')[-1]
        outputFilename = 'samples' + '/' + baseName + '_sample' + '.wav'
        write_file(outputFilename, 44100, output)
        print("File generated: %s" % outputFilename)
    else:
        return output
def get_data(rootdir=TIMIT_main_dir):
    inputs = []
    targets = []
    for dir_path, sub_dirs, files in os.walk(rootdir):
        for file in files:
            if (os.path.join(dir_path, file)).endswith('.wav'):
                wav_file_name = os.path.join(dir_path, file)
                input_data, f_s = sf.read(wav_file_name)
                # mfcc_feat = MFCC_input(mfcc(input_data, f_s))
                mfcc_feat = mfcc(input_data, f_s)
                # Delta features
                delta_feat = mfcc_feat[:-1] - mfcc_feat[1:]
                # Delta-Delta features
                deltadelta_feat = delta_feat[:-1] - delta_feat[1:]
                # Removing the first two frames
                mfcc_feat = mfcc_feat[2:]
                delta_feat = delta_feat[1:]
                # Concatenating mfcc, delta and delta-delta features
                full_input = np.concatenate((mfcc_feat, delta_feat, deltadelta_feat), axis=1)
                # Rakeshvar wants one frame along each column but I am using Lasagne
                inputs.append(np.asarray(full_input, dtype=theano.config.floatX))
                text_file_name = wav_file_name[:-4] + '.txt'
                target_data_file = open(text_file_name)
                target_data = str(target_data_file.read()).lower().translate(
                    str.maketrans('', '', '!:,".;?'))
                target_data = target_data[8:-1]  # No '.' in lexfree dictionary
                targets.append(target_data)
    return inputs, targets
def test_buffer_write(sf_stereo_w):
    buf = np.array([[1, 2], [-1, -2]], dtype='int16')
    sf_stereo_w.buffer_write(buf, 'short')
    sf_stereo_w.close()
    data, fs = sf.read(filename_new, dtype='int16')
    assert np.all(data == buf)
    assert fs == 44100
def _transform(self, row):
    if len(row) == 7:
        path, channel, name, spkid, dataset, start_time, end_time = row
    else:
        path, channel, name, spkid, dataset = row[:5]
        start_time = None
        end_time = None
    # ====== read audio ====== #
    # for voxceleb1
    if dataset == 'voxceleb1':
        with open(path, 'rb') as f:
            y, sr = sf.read(f)
        y = pp.signal.resample(y, sr_orig=sr, sr_new=8000, best_algorithm=True)
        sr = 8000
    # for sre, fisher and swb
    elif (dataset[:3] == 'sre' or dataset == 'swb' or dataset == 'fisher'):
        with open(path, 'rb') as f:
            y, sr = sf.read(f)
        y = pp.signal.resample(y, sr_orig=sr, sr_new=8000, best_algorithm=True)
        if y.ndim == 2:
            y = y[:, int(channel)]
        sr = 8000
    # all other datasets: mix6, voxceleb2
    else:
        y, sr = pp.signal.anything2wav(inpath=path, outpath=None,
                                       channel=channel, dataset=dataset,
                                       start=start_time, end=end_time,
                                       sample_rate=Config.SAMPLE_RATE,
                                       return_data=True)
    # ====== error happened, ignore file ====== #
    if len(y) == 0:
        return None
    # ====== remove DC offset ====== #
    y = y - np.mean(y, 0)
    duration = max(y.shape) / sr
    ret = {'raw': y, 'sr': sr, 'duration': duration,  # in seconds
           'path': path, 'spkid': spkid, 'name': name,
           'dsname': dataset}
    return ret
def load_bgd_wav(file_path):
    signal, rate = sf.read(file_path)  # signal: sample values, rate: sample rate
    if len(signal.shape) > 1:
        signal = signal[:, 0]
    if rate != FRAME_RATE:
        # up-sample or down-sample to the predefined sample rate
        signal = resampy.resample(signal, rate, FRAME_RATE, filter='kaiser_fast')
    return signal
def open_sound_and_normalise(path):
    """ returns mono audio of given samplerate """
    orig_samples, orig_samplerate = soundfile.read(path)
    # crude decimation; assumes orig_samplerate is an integer multiple of
    # samplerate (a float ratio would fail as a slice step)
    ratio = orig_samplerate // samplerate
    samples = orig_samples[::ratio, 0]
    return samples
def getData(self, params):
    ticker = params['ticker']
    import soundfile
    sig, samplerate = soundfile.read(ticker + ".wav")
    df = pd.Series({"filename": ticker, "length": len(sig), "samplerate": samplerate})
    df = df.to_frame().transpose()
    return df
def __rubberband(y, sr, **kwargs):
    '''Execute rubberband

    Parameters
    ----------
    y : np.ndarray [shape=(n,) or (n, c)]
        Audio time series, either single or multichannel

    sr : int > 0
        sampling rate of y

    **kwargs
        keyword arguments to rubberband

    Returns
    -------
    y_mod : np.ndarray [shape=(n,) or (n, c)]
        `y` after rubberband transformation
    '''
    assert sr > 0

    # Get the input and output tempfiles
    fd, infile = tempfile.mkstemp(suffix='.wav')
    os.close(fd)
    fd, outfile = tempfile.mkstemp(suffix='.wav')
    os.close(fd)

    # dump the audio
    sf.write(infile, y, sr)

    try:
        # Execute rubberband
        arguments = ['rubberband', '-q']
        for key, value in six.iteritems(kwargs):
            arguments.append(str(key))
            arguments.append(str(value))
        arguments.extend([infile, outfile])
        subprocess.check_call(arguments)

        # Load the processed audio.
        y_out, _ = sf.read(outfile, always_2d=True)

        # make sure that output dimensions match input
        if y.ndim == 1:
            y_out = np.squeeze(y_out)
    finally:
        # Remove temp files
        os.unlink(infile)
        os.unlink(outfile)

    return y_out
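# A usage sketch for __rubberband above, assuming the `rubberband` CLI is
# installed and on PATH; '--tempo' (time-stretch ratio) and '--pitch'
# (semitones) are standard rubberband command-line options:
y, sr = sf.read('input.wav')
y_fast = __rubberband(y, sr, **{'--tempo': 2.0})  # twice as fast
y_up = __rubberband(y, sr, **{'--pitch': 3})      # three semitones up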
def getAllFeatures(featureType, wavFileList, samplerate=16000, winlen=0.0256,
                   winstep=0.01, nfilt=40, nfft=512, lowfreq=133.3333,
                   highfreq=6855.4976, preemph=0.97, winSzForDelta=2,
                   numcep=13, ceplifter=22, appendEnergy=True):
    '''
    Computes all features of a given numpy vector of file paths to .wav files.
    Reads the wav files specified in 'wavFileList' with the package
    'PySoundFile'. PySoundFile is able to read the format of the files from
    the TIMIT database.
    See: http://pysoundfile.readthedocs.org/en/0.7.0/ and
    https://github.com/bastibe/PySoundFile
    For other parameters see function getFeatures; once the signal is read
    from its path, the signal and other parameters are forwarded to
    'getFeatures'.

    :parameters:
        - featureType: 'mfcc', 'logFB' or 'FB'
        - wavFileList: list of file paths
        - samplerate
        - winlen
        - winstep
        - nfilt
        - nfft
        - lowfreq
        - highfreq
        - preemph
        - winSzForDelta

    :returns:
        - featureList: numpy vector of np.arrays
          list of same length as input wavFileList, dimensions of every
          element of the list specified by signal duration and winstep
          (1st dim), and number of filters (2nd dim)
    '''
    featureList = []
    for f in wavFileList:
        signal, _ = sf.read(f)
        # equalize rms --> same power in all speech signals. Note that later
        # features will be normalised to have zero mean and unit variance,
        # but that is w.r.t. all signals. Before that, make sure that signals
        # have the same energy.
        rms = np.sqrt(np.mean(np.square(signal)))
        signal = signal / rms
        if featureType == 'mfcc':
            featureList.append(mfccFeatures(
                signal=signal, samplerate=samplerate, winlen=winlen,
                winstep=winstep, nfilt=nfilt, nfft=nfft, lowfreq=lowfreq,
                highfreq=highfreq, preemph=preemph,
                winSzForDelta=winSzForDelta, numcep=numcep,
                ceplifter=ceplifter, appendEnergy=appendEnergy))
        elif featureType == 'logFB':
            featureList.append(logFilterbankFeatures(
                signal=signal, samplerate=samplerate, winlen=winlen,
                winstep=winstep, nfilt=nfilt, nfft=nfft, lowfreq=lowfreq,
                highfreq=highfreq, preemph=preemph,
                winSzForDelta=winSzForDelta))
        elif featureType == 'FB':
            featureList.append(filterbankFeatures(
                signal=signal, samplerate=samplerate, winlen=winlen,
                winstep=winstep, nfilt=nfilt, nfft=nfft, lowfreq=lowfreq,
                highfreq=highfreq, preemph=preemph,
                winSzForDelta=winSzForDelta))
        else:
            raise ValueError("featureType must be 'mfcc', 'logFB' or 'FB'")
    return np.array(featureList)
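# A minimal usage sketch for getAllFeatures above; the file list is
# hypothetical and all keyword arguments are left at their TIMIT-oriented
# defaults:
wavFileList = ['timit/train/dr1/fcjf0/sa1.wav',
               'timit/train/dr1/fcjf0/sa2.wav']
mfccs = getAllFeatures('mfcc', wavFileList)
logFBs = getAllFeatures('logFB', wavFileList)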
def compute_vad(args):
    filename, path, resultpath = args
    signame = os.path.basename(os.path.splitext(filename)[0])
    ids = signame.split("_")
    print("computing: " + path + filename)
    bg_signal, rate = soundfile.read(path + filename)
    ltsd = LTSD_VAD()
    bg_signal = bg_signal[:2000]
    print(bg_signal)
    ltsd.init_params_by_noise(rate, bg_signal)
    signal, rate = soundfile.read(path + filename)
    # vaded_signal = ltsd.filter(signal)
    segments, sig_len = ltsd.segments(signal)
    # seconds = float(len(sig)) / rate
    res_name = resultpath + "/ad-ltsd_" + signame + ".txt"
    segments = librosa.core.samples_to_time(segments, rate).tolist()
    len_s = librosa.core.samples_to_time(sig_len, rate)
    write_results(segments, res_name, len_s)
def Load(cls, filename):
    data, samplingrate = soundfile.read(file="%s.%s" % (filename, cls.ending))
    if numpy.size(data) == len(data):
        # single channel files are imported into a one dimensional row array,
        # so len and size are the same. These need not be transposed
        channels = (data,)
    else:
        channels = numpy.transpose(data)
    return sumpf.Signal(channels=channels,
                        samplingrate=samplingrate,
                        labels=[str(" ".join([filename.split(os.sep)[-1], str(c + 1)]))
                                for c in range(len(channels))])
def test_wplus_read_written_data(sf_stereo_wplus):
    sf_stereo_wplus.write(data_stereo)
    assert sf_stereo_wplus.seek(0, sf.SEEK_CUR) == len(data_stereo)
    sf_stereo_wplus.seek(0)
    assert np.all(sf_stereo_wplus.read() == data_stereo)
    assert sf_stereo_wplus.seek(0, sf.SEEK_CUR) == len(data_stereo)
    sf_stereo_wplus.close()
    data, fs = sf.read(filename_new)
    assert np.all(data == data_stereo)
def audio(self, audio_):
    if is_string(audio_):
        # Assuming this is a wav file
        audio_, fs = sf.read(audio_)
        self.fs = fs
    assert is_array(audio_)
    if audio_.ndim == 1:
        audio_ = audio_[:, np.newaxis]
    self.mfcc.audio = audio_
    self.periphery.sound_process = ArrayProcess(audio_)
def __init__(self, filename):
    data, self.sr = sf.read(filename)
    self.rawdata = np.array(data)
    if len(self.rawdata.shape) == 1:
        self.frames = self.rawdata.shape[0]
        self.data = np.array(self.rawdata)
    else:
        self.frames, self.channels = self.rawdata.shape
        self.data = self.rawdata[:, 0]
    self.length = self.frames / self.sr
def readFromAudioFile(filename, mono=False):
    '''
    Uses soundfile to generate a Sound object from wav and aiff files.
    If mono is true, returns the left channel only.
    '''
    data, fs = sf.read(filename)
    if len(data.shape) == 2 and mono:
        return Sound(data[:, 0], fs, filename)
    else:
        return Sound(data, fs, filename)
import numpy as np
import matplotlib.pyplot as plt
import soundfile as sf
from SingleMic import segment_overlap as s_o
from SingleMic import inverse_segment_overlap as i_s_o
import time

start_time = time.time()

# Variables
tsegment = 20e-3  # 20 ms segment
overlap = 0.5

# Import data & fs
data, fs = sf.read('Audio/clean.wav')

# Calc
s_segment = int(tsegment * fs)
s_overlap = int(overlap * s_segment)

# pad data with zeros
remainder = s_segment - (len(data) % s_segment)
data_extended = np.ravel(np.asmatrix(np.pad(data, (0, int(remainder)), 'constant')))

x_array = s_o.segment_overlap(data_extended, s_segment, s_overlap)
x_truncarray = i_s_o.inverse_segment_overlap(x_array, len(data_extended), s_segment, s_overlap)

# calculate difference between initial and reconstructed signals
residual = data_extended - x_truncarray
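# A quick sanity check on the segment/overlap round trip above: for a correct
# reconstruction the residual should be numerically zero.
print('max reconstruction error:', np.max(np.abs(residual)))
print('elapsed: {:.3f} s'.format(time.time() - start_time))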
def read_audio(path):
    filepath = get_abs_path(src_path, path)
    return soundfile.read(filepath)
    :param extract: (Function) extraction method to use
    :param multi: (bool) specify if several samples can be extracted from one audio file
    :param audio_dir: (str) directory where the audio files are located
    :return: (features:list, labels:list)
    """
    features = []
    speakers = []
    for index, row in data.iterrows():
        audio_name = row.loc[AUDIO]
        speaker = row.loc[SPEAKER_ID]
        audio, samplerate = sf.read(audio_dir + audio_name)
        audio_extracts = segment_audio(audio, samplerate)
        if not multi:
            audio_extracts = audio_extracts[0:1]
        for audio_extract in audio_extracts:
            # extract the features using the given extraction function
            features.append(extract(audio_extract, samplerate))
            speakers.append(speaker)
    return features, speakers


if __name__ == '__main__':
    audio, sp = sf.read("database/dev/audio/aahtm.flac")
    sp_audio = segment_audio(audio, sp)
    lpc = extract_with_lpc(audio, sp)
def wav_read(wav_file):
    wav_data, sr = sf.read(wav_file, dtype='int16')
    return wav_data, sr
import sounddevice as sd
import numpy as np
import soundfile as sf
import matplotlib.pyplot as plt
import time
import os
from scipy import signal

# In[2]:

files = os.listdir(r'C:\Users\Hp\Desktop\music\written')
s = 'C:/Users/Hp/Desktop/music/written/'

duration = 5  # seconds
fs = 44100
sd.default.device = 1
print('Started recording')
myrecording = sd.rec(int(duration * fs), samplerate=fs, channels=1)
sd.wait()
print('Stopped recording')

m = []
for i in range(len(files)):
    data, samplerate = sf.read(s + files[i])
    a = signal.correlate(data, myrecording[0::20, 0])
    m.append(np.max(abs(a)))

I = m.index(max(m))
print(files[I])
import soundfile as sf
import numpy as np
from utils import signals_to_string

filepath = 'result.wav'
data, _ = sf.read(filepath)
print(len(data))
size = len(data)
data = set(["{:.6f}".format(d) for d in data])

rev_data = []
for i in range(size):
    k = np.sin(i * 439.97 / 44100 * (2 * np.pi))
    ks = "{:.6f}".format(k)
    if ks in data:
        rev_data.append(k)
    else:
        rev_data.append("x")

signal = []
for i in range(0, len(rev_data), 2000):
    if rev_data[i:i + 2000].count("x") > 500:
        signal.append(0)
    else:
        signal.append(1)

print(len(rev_data))
print(signal)
print(signals_to_string(signal))
def find_peaks(ramec):
    t = np.zeros(100)
    N = len(ramec)
    index = 0
    for i in range(1, N - 1):
        prev_val = ramec[i - 1]
        curr_val = ramec[i]
        n_val = ramec[i + 1]
        # a peak is a sample larger than both of its neighbours
        if curr_val > prev_val and curr_val > n_val:
            t[index] = curr_val
            index += 1
    return t[:index]


znely_ramec = 45

s, fs = sf.read('xsiska16.wav')
print("Number of samples: ", s.size)
s_max = max(s)
s_min = min(s)
print("max ", s_max)
print("min ", s_min)

t = np.arange(s.size) / fs
time = s.size / fs
print("Recording length in seconds: ", time)
def playsound(filename):
    data, fs = soundfile.read(filename, dtype='float32')
    sounddevice.play(data, fs)
    status = sounddevice.wait()
def size_hours(self):
    return sum(
        soundfile.read(self.get(i)[0])[0].size / (16000 * 3600)
        for i in range(self.size()))
# From specs
samplingRateVar = rootgrp.createVariable('Data.SamplingRate', 'f8', ('I'))
samplingRateVar.Units = 'hertz'
samplingRateVar[:] = 44100

# No delay found
delayVar = rootgrp.createVariable('Data.Delay', 'f8', ('I', 'R', 'E'))
delay = np.zeros((I, R, E))
delayVar[:, :, :] = delay

# Parse the audio files...
dataIRVar = rootgrp.createVariable('Data.IR', 'f8', ('M', 'R', 'E', 'N'))
dataIRVar.ChannelOrdering = 'fuma'
dataIRVar.Normalization = 'fuma'

audioFilesPath = '/Volumes/Dinge/audio/S3A_original/MainChurch/Soundfield/'
for e in range(E):
    fileIdx = e + 1  # Numeration starts at 1
    fileName = 'ls' + str(fileIdx) + '.wav'
    # Open the audio file
    data, samplerate = sf.read(audioFilesPath + fileName)
    assert samplerate == 44100
    assert np.shape(data) == (65536, 4)
    dataIRVar[:, :, e, :] = data

# ----------Close it----------#
rootgrp.close()
        self.w)
                if (len(frame) != self.wl):
                    frame = np.concatenate((frame, np.zeros((self.wl - len(frame)))))
                # print('min ' + str(self.bw(self.gama[i])) + ' max ' + str(self.ew(self.gama[i])) +
                #       ' sigma min ' + str(self.bw(self.sigma[i])) + ' max ' + str(self.ew(self.sigma[i])))
                self.y[self.bw(self.gama[i]):self.ew(self.gama[i])] = \
                    self.y[self.bw(self.gama[i]):self.ew(self.gama[i])] + frame
        except:
            print('The scaling factor does not work')


Fs = 41000  # sampling frequency
f = 20
timeVector = np.arange(0, 1, 1 / Fs)
Audio, Fs = sf.read('guitarra.wav')
# Audio = (np.sin(2*pi*f*timeVector) + np.sin(2*50*pi*f*timeVector) + np.sin(2*100*pi*f*timeVector)) / 3
# Audio = np.sin(2*pi*f*timeVector)

abc = ola()
abc.run(Audio, 2)
sd.play(abc.y, Fs)
sd.wait()
# sf.write('speech_dobleDuracion.wav', abc.y, Fs)

n = len(Audio)
timeVector = np.arange(0, n * (1 / Fs), 1 / Fs)
n = len(Audio)
frecVector = fftfreq(n)
espectroVector = fft(Audio)
def file_to_text(self, filename):
    audio_input, samplerate = sf.read(filename)
    assert samplerate == 16000
    return self.buffer_to_text(audio_input)
'''demo for using sound device and sound file:
Taken from: https://python-sounddevice.readthedocs.io/en/0.2.1/examples.html
'''
import argparse
import logging

# To use, cd into helpers directory, run >> python demo/sound_card_demo.py "filename"
# Example: python demo/sound_card_demo.py "../static/sounds/chime.wav"

parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument("filename", help="audio file to be played back")
parser.add_argument("-d", "--device", type=int, help="device ID")
args = parser.parse_args()

try:
    import sounddevice as sd
    import soundfile as sf
    devices = sd.query_devices()
    print(devices)
    data, fs = sf.read(args.filename, dtype='float32')
    sd.play(data, fs, device=args.device, blocking=True)
    status = sd.get_status()
    if status:
        logging.warning(str(status))
except BaseException as e:
    # This avoids printing the traceback, especially if Ctrl-C is used.
    raise SystemExit(str(e))
def asr_worker(text_queue: Queue, run: Value, done_loading: Value):
    try:
        import sounddevice as sd
        import soundfile as sf

        # Initialise CUDA.
        cuda.init()
        device = cuda.Device(0)
        ctx = device.make_context()

        # Load the QuartzNet ASR model.
        logging.info('Loading QuartzNet model for ASR...')
        featurizer = MelFeaturizer()
        quartznet = QuartzNet()

        # Initialise the Decoder.
        logging.info('Loading CTC Beam Decoder...')
        decoder = Decoder(model_path='models/lm/3_gram_lm.trie', alpha=1, beta=0.5)

        with done_loading.get_lock():
            done_loading.value = 1

        chunk_size = 1 * 16000
        n_past_chunks = 5
        past_chunks_size = chunk_size * (n_past_chunks - 1)
        activation_words = ['jarvis', 'jervis']
        beep, _ = sf.read('assets/wav/beep.wav', dtype='float32')
        peeb = np.ascontiguousarray(np.flip(beep))
        activation_waveform = np.zeros((n_past_chunks * chunk_size, 1), dtype=np.float32)

        in_stream = sd.InputStream(samplerate=16000, channels=1)
        out_stream = sd.OutputStream(samplerate=44100, channels=2)
        in_stream.start()
        out_stream.start()

        while run.value:
            # Read waveform from the microphone and store in the rolling buffer.
            data, overflowed = in_stream.read(chunk_size)
            if overflowed:
                logging.warning('ASR process is skipping microphone frames!')
            activation_waveform = np.roll(activation_waveform, -chunk_size)
            activation_waveform[past_chunks_size:, 0] = data[:, 0]

            # Run ASR.
            token_probs = quartznet(featurizer(activation_waveform.T))
            decoded = decoder(token_probs)

            # If the keyword was said...
            if any([word in decoded for word in activation_words]):
                logging.info('ASR triggered!')
                # Play a beep sound.
                out_stream.write(beep)
                # Read waveform from the microphone.
                _data = in_stream.read(5 * 16000)[0]
                # Play a peeb sound.
                out_stream.write(peeb)
                # Run ASR.
                token_probs = quartznet(featurizer(_data.T))
                decoded = decoder(token_probs)
                # Add the recognised text to the text queue and reset the
                # activation waveform buffer.
                logging.info(f'ASR recognised: "{decoded}".')
                text_queue.put(decoded)
                activation_waveform *= 0
    except KeyboardInterrupt:
        pass

    ctx.pop()
    in_stream.stop()
    out_stream.stop()
def test_call(self):
    sample, sr = sf.read(f"{CWD}/tests/test_data/test-clean/61/70968/61-70968-0000.flac")
    path = f"{CWD}/tests/test_data/test-clean/61/70968/61-70968-0000.flac"
    res, sr = self.add_noise(path)
    self.assertEqual(res.shape, sample.shape)
    sf.write("foo.flac", res, sr)
def test_stereo_to_mono(self):
    sample, sr = sf.read("test_data/UrbanSound8K/audio/fold1/118279-8-0-5.wav")
    res = stereo_to_mono(sample)
    self.assertEqual(res.shape, (192000,))
def test_match_to_speech(self):
    sample, sr = sf.read("test_data/UrbanSound8K/audio/fold1/118279-8-0-5.wav")
    func = create_match_to_speech(noise_sr=44_100, speech_sr=16_000)
    res = func(sample)
    print(type(res))
def _trim(input_folder, sound_list, output_folder):
    for sound in sound_list:
        data, sample_rate = sf.read(os.path.join(input_folder, sound))
        sf.write(os.path.join(output_folder, sound),
                 data[:(sample_rate * 5)], sample_rate)
def sampler(video_1, video_2, rate=32, augment=False, precompute=False,
            include_metadata=False):
    """Sample one frame from video_file, with 50% chance sample one second
    from corresponding audio_file, 50% chance sample one second from another
    audio_file in the list of audio_files.

    Args:
        video_1: dict for candidate video to sample from
        video_2: dict for candidate video to sample from

    Keyword Args:
        rate: Poisson rate parameter. Used for precomputing samples
        augment: If True, perform data augmentation
        precompute: If True, precompute samples during initialization
                    so that memory can be discarded

    Returns:
        A generator that yields dictionary of video sample, audio sample,
        and label (0: not from corresponding files, 1: from corresponding files)
    """
    video_file_1 = video_1['video_filepath']
    video_file_2 = video_2['video_filepath']
    audio_file_1 = video_1['audio_filepath']
    audio_file_2 = video_2['audio_filepath']

    debug_msg = 'Initializing streamer with videos "{}" and "{}"'
    LOGGER.debug(debug_msg.format(video_file_1, video_file_2))

    # Hack: choose a number of samples such that with high probability we
    # won't run out of samples, but is also less than the entire length of
    # the video so we don't have to resize all of the frames
    num_samples = int(scipy.stats.poisson.ppf(0.999, rate))

    try:
        with LogTimer(LOGGER, 'Reading video'):
            video_data_1 = read_video(video_file_1)
    except Exception as e:
        warn_msg = 'Could not open video file {} - {}: {}; Skipping...'
        warn_msg = warn_msg.format(video_file_1, type(e), e)
        LOGGER.warning(warn_msg)
        warnings.warn(warn_msg)
        return

    try:
        with LogTimer(LOGGER, 'Reading video'):
            video_data_2 = read_video(video_file_2)
    except Exception as e:
        warn_msg = 'Could not open video file {} - {}: {}; Skipping...'
        warn_msg = warn_msg.format(video_file_2, type(e), e)
        LOGGER.warning(warn_msg)
        warnings.warn(warn_msg)
        return

    try:
        with LogTimer(LOGGER, 'Reading audio'):
            audio_data_1, sampling_frequency = sf.read(audio_file_1,
                                                       dtype='int16',
                                                       always_2d=True)
            audio_data_1 = audio_data_1.mean(axis=-1).astype('int16')
    except Exception as e:
        warn_msg = 'Could not open audio file {} - {}: {}; Skipping...'
        warn_msg = warn_msg.format(audio_file_1, type(e), e)
        LOGGER.warning(warn_msg)
        warnings.warn(warn_msg)
        return

    try:
        with LogTimer(LOGGER, 'Reading audio'):
            audio_data_2, sampling_frequency = sf.read(audio_file_2,
                                                       dtype='int16',
                                                       always_2d=True)
            audio_data_2 = audio_data_2.mean(axis=-1).astype('int16')
    except Exception as e:
        warn_msg = 'Could not open audio file {} - {}: {}; Skipping...'
        warn_msg = warn_msg.format(audio_file_2, type(e), e)
        LOGGER.warning(warn_msg)
        warnings.warn(warn_msg)
        return

    if precompute:
        samples = []
        for _ in range(num_samples):
            sample = generate_sample(
                audio_file_1, audio_data_1, audio_file_2, audio_data_2,
                video_file_1, video_data_1, video_file_2, video_data_2,
                sampling_frequency, augment=augment,
                include_metadata=include_metadata)
            samples.append(sample)

        # Clear the data from memory
        video_data_1 = None
        video_data_2 = None
        audio_data_1 = None
        audio_data_2 = None
        video_data = None
        audio_data = None
        del video_data_1
        del video_data_2
        del audio_data_1
        del audio_data_2
        del video_data
        del audio_data

        while samples:
            # Yield the sample, and remove from the list to free up some memory
            yield samples.pop()
    else:
        while True:
            yield generate_sample(
                audio_file_1, audio_data_1, audio_file_2, audio_data_2,
                video_file_1, video_data_1, video_file_2, video_data_2,
                sampling_frequency, augment=augment,
                include_metadata=include_metadata)

    # Ending a generator with `return` (rather than `raise StopIteration()`,
    # which PEP 479 turns into a RuntimeError) terminates iteration cleanly.
    return
print("Warning: cupy is not installed. 'gpu' argument should be set to -1. Switched to CPU.\n") import numpy as xp separater = AR_FastMNMF2( n_source=args.n_source, n_basis=args.n_basis, xp=xp, init_SCM=args.init_SCM, n_tap_AR=args.n_tap_AR, n_delay_AR=args.n_delay_AR, n_bit=args.n_bit, algo=args.algo, n_iter_init=args.n_iter_init ) wav, sample_rate = sf.read(args.input_fname) wav /= np.abs(wav).max() * 1.2 M = min(len(wav), args.n_mic) spec_FTM = MultiSTFT(wav[:, :M], n_fft=args.n_fft) separater.file_id = args.file_id separater.load_spectrogram(spec_FTM, sample_rate) separater.solve( n_iter=args.n_iter, save_dir="./", save_likelihood=False, save_param=False, save_wav=True, interval_save=5, )
def play_from_file(file):
    data, fs = soundfile.read(file)
    sd.play(data, fs, device=sd.default.device)
    status = sd.wait()
def process(args):
    f0_max = 1100.0
    f0_min = 50.0
    frame_shift = args.shift_size / 1000
    hop_length = int(args.sr * frame_shift)

    lab_list = os.listdir(args.labdir)
    phone_set = []
    idscp = {}
    index = 1
    for lab in lab_list:
        lab_id = lab[:-4]
        idscp[lab_id] = index
        segments, phone = load_label(
            os.path.join(args.labdir, lab),
            s_type=args.label_type,
            sr=args.sr,
            frame_shift=frame_shift,
            sil=args.sil,
        )
        for p in phone:
            if p not in phone_set:
                phone_set.append(p)
        wav_path = os.path.join(args.wavdir, lab_id + "." + args.wav_extention)
        if args.wav_extention == "raw":
            signal, osr = sf.read(
                wav_path,
                subtype="PCM_16",
                channels=1,
                samplerate=args.sr,
                endian="LITTLE",
            )
        else:
            signal, osr = librosa.load(wav_path, sr=None)
        if osr != args.sr:
            signal = librosa.resample(signal, osr, args.sr)

        song_align = os.path.join(args.outdir, "alignment")
        song_wav = os.path.join(args.outdir, "wav_info", str(index))
        song_pitch_beat = os.path.join(args.outdir, "pitch_beat_extraction", str(index))
        if not os.path.exists(song_align):
            os.makedirs(song_align)
        if not os.path.exists(song_wav):
            os.makedirs(song_wav)
        if not os.path.exists(song_pitch_beat):
            os.makedirs(song_pitch_beat)
        print("processing {}".format(song_wav))

        for seg in segments.keys():
            alignment = segments[seg]["alignment"]
            start = segments[seg]["start"]
            name = seg
            seg_signal = signal[int(start * hop_length):
                                int(start * hop_length + len(alignment) * hop_length)]

            # extract beats
            tempo, beats = librosa.beat.beat_track(y=seg_signal, sr=args.sr,
                                                   hop_length=hop_length)
            # times = librosa.frames_to_time(beats, sr=args.sr)
            # frames = librosa.time_to_frames(
            #     times, sr=args.sr, hop_length=hop_length, n_fft=n_fft
            # )
            np.save(os.path.join(song_pitch_beat, name) + "_beats", np.array(beats))

            # extract pitch
            seg_signal = seg_signal.astype("double")
            _f0, t = pw.harvest(
                seg_signal,
                args.sr,
                f0_floor=f0_min,
                f0_ceil=f0_max,
                frame_period=frame_shift * 1000,
            )
            _f0 = pw.stonemask(seg_signal, _f0, t, args.sr)
            np.save(os.path.join(song_pitch_beat, name) + "_pitch", np.array(_f0))

            alignment_id = np.zeros((len(alignment)))
            for i in range(len(alignment)):
                alignment_id[i] = phone_set.index(alignment[i])
            np.save(
                os.path.join(song_align, pack_zero(index) + name),
                np.array(alignment_id),
            )
            sf.write(os.path.join(song_wav, name) + ".wav", seg_signal,
                     samplerate=args.sr)
            print("saved {}".format(os.path.join(song_wav, name) + ".wav"))
        index += 1

    with open(os.path.join(args.outdir, "phone_set.txt"), "w") as f:
        for p_id, p in enumerate(phone_set):
            f.write(str(p_id) + " " + p)
            f.write("\n")
pl.show()
'''

# Plot Spectrogram for each detection, plot to screen and to file
DETECTION_THRESHOLD = 2.0
SPECTROGRAM_NFFT = 1024
SPECTROGRAM_STEP = 128
print("Events detected from statistical deviation:")
for d in detections_final:
    if d[2] > DETECTION_THRESHOLD:
        # start_time, duration, significance
        print('  {}: {:>3.1f} {:>5.1f}'.format(hhmmss(d[0]), d[1], d[2]))
        start_i = d[3] - DETECTION_CONTEXT_N
        stop_i = d[4] + 1 + DETECTION_CONTEXT_N
        wave = sf.read(wave_file, start=start_i * BLOCK_DURATION_N,
                       stop=stop_i * BLOCK_DURATION_N)[0]
        wave = fixup_wave(wave, f.samplerate)

        # Apply noise cancellation to signal
        wave = filter_signal(wave, f.samplerate)

        # Generate plot of wave, spectrogram, and power and save as image file
        pl.subplot(3, 1, 1)
        pl.plot(wave)
        # start_time, duration, significance
        pl.title('start={}, dur={:.1f}, sig={:.1f}'.format(hhmmss(d[0]), d[1], d[2]))
        pl.xticks([])
        pl.subplot(3, 1, 2)
        pl.specgram(wave, SPECTROGRAM_NFFT, f.samplerate,
os.makedirs(cropped_threshold_dir)

fc = 3000 / 22050
b, a = signal.butter(10, fc, 'low')

for file_name in os.listdir(files_dir):
    direction = re.findall(r"\d+", file_name)[0]
    paths = capsule_path_difference(polar, [1.5, 0, direction])
    max_channel, min_channel, max_onset, min_onset = 0, 0, 0, sys.maxsize
    onsets = []

    # Read wav file
    data, fs = sf.read(os.path.join(files_dir, file_name))
    closest = np.argmin(paths)

    # low pass filter at fc
    filtered = signal.filtfilt(b, a, data.T[closest])
    filter_max = max(filtered)
    closest_onset = next(x[0] for x in enumerate(filtered) if abs(x[1]) == filter_max)
    time = [n / fs for n in range(filtered.size)]
    onsets_distance = np.array([
        round(((paths[i] - paths[closest]) / c) * fs) + closest_onset
        for i in range(len(paths))
    ], dtype='int32')
import sys
import numpy as np
import math
import librosa
import soundfile as sf
import json
from librosa.core.spectrum import power_to_db
import scipy

file_path = sys.argv[1]
data, samplerate = sf.read(file_path)
# data = np.clip(data*3, -1, 1)

with open("MfccConfig.json", "r") as f:
    config = json.load(f)

frame_size = config['frame_size']
frame_step = config['frame_step']
n_fft = config['n_fft']
n_mels = config['mfcc_bank_cnt']
fmin = config['fmin']
fmax = config['fmax']
dtype = config.get('dtype', "int")
high_prec = config.get('use_high_prec', False) or dtype == "fix32_scal"
use_power = False
rad4 = round(math.log(n_fft // 2, 4)) == math.log(n_fft // 2, 4)
ndct = config.get('n_dct', False)

from librosa.filters import get_window
from librosa import util

librosa_fft_window = get_window("hann", frame_size, fftbins=True)
def load_audio_from_path(self, wav_path):
    assert os.path.isfile(wav_path) and wav_path.endswith('.wav')
    samples, _ = soundfile.read(wav_path, dtype="int16")
    self.samples = samples.tolist()
import soundfile as sf
import torch
from transformers import Wav2Vec2ForCTC, Wav2Vec2Tokenizer
from torch.utils.mobile_optimizer import optimize_for_mobile

tokenizer = Wav2Vec2Tokenizer.from_pretrained("facebook/wav2vec2-base-960h")
model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h")
model.eval()

audio_input, _ = sf.read("scent_of_a_woman_future.wav")
input_values = tokenizer(audio_input, return_tensors="pt").input_values
logits = model(input_values).logits
predicted_ids = torch.argmax(logits, dim=-1)
transcription = tokenizer.batch_decode(predicted_ids)[0]

model_dynamic_quantized = torch.quantization.quantize_dynamic(
    model, qconfig_spec={torch.nn.Linear}, dtype=torch.qint8)
traced_quantized_model = torch.jit.trace(model_dynamic_quantized,
                                         input_values, strict=False)
optimized_traced_quantized_model = optimize_for_mobile(traced_quantized_model)
optimized_traced_quantized_model.save("wav2vec2.pt")
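# A sketch of reloading the exported TorchScript file above for a quick
# desktop sanity check before bundling it into a mobile app; the structure of
# the traced module's output is assumed to mirror the eager model's:
reloaded = torch.jit.load("wav2vec2.pt")
out = reloaded(input_values)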
def map_to_array(batch):
    speech, _ = sf.read(batch["file"])
    batch["speech"] = speech
    return batch
def read_wav(self, wav_bytes):
    # renamed parameter so it no longer shadows the builtin `bytes`
    waveform, sample_rate = sf.read(BytesIO(wav_bytes), dtype="float32")
    return waveform, sample_rate
import playsound
import soundfile as sf
import numpy as np
# import matplotlib.pyplot as plt

x, fs = sf.read('carrie1.wav')
audio = x

# =================================================================================#
fc = 4000
M = 20
wc = (2 * np.pi * fc) / fs
# =================================================================================#
w = np.hamming(M)[:M - 1]
hd = []
for n in range(M - 1):
    hd.insert(n, (wc / np.pi) * np.sinc((wc / np.pi) * (n - (M / 2))))
# =================================================================================#
h = hd * w
# =================================================================================#
audio_filtrado = np.convolve(h, audio)
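# One way to audition the filtered result above: write it out with soundfile
# and play it back with the playsound import; the output file name is
# hypothetical:
sf.write('carrie1_lowpass.wav', audio_filtrado, fs)
playsound.playsound('carrie1_lowpass.wav')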