def test_save(tmpdir, audio):
    p = str(tmpdir.join('test.wav'))
    audio.save(p)

    # cannot overwrite an existing file
    with pytest.raises(ValueError) as err:
        audio.save(p)
    assert 'file already exist' in str(err)

    audio2 = Audio.load(p)
    assert audio == audio2

    # test with a float32 wav
    signal = np.zeros((1000,), dtype=np.float32)
    signal[10] = 1.0
    signal[20] = -1.0
    p = str(tmpdir.join('test2.wav'))
    audio = Audio(signal, 1000)
    audio.save(p)

    meta = Audio.scan(p)
    assert meta.nsamples == 1000
    assert meta.nchannels == 1

    audio2 = Audio.load(p)
    assert audio2 == audio
    assert audio2.data.min() == -1.0
    assert audio2.data.max() == 1.0
def extract_features_shennong(audio_path, save_path):
    audio = Audio.load(audio_path)

    # 40-dim filterbank, without the energy coefficient
    processor = FilterbankProcessor(
        sample_rate=audio.sample_rate, num_bins=40, use_energy=False)
    fbank = processor.process(audio).data

    # 3-dim pitch features from the Kaldi pitch tracker
    options = {
        'sample_rate': audio.sample_rate,
        'frame_shift': 0.01,
        'frame_length': 0.025,
        'min_f0': 20,
        'max_f0': 500}
    processor = PitchProcessor(**options)
    pitch = processor.process(audio)
    postprocessor = PitchPostProcessor()  # use default options
    postpitch = postprocessor.process(pitch).data  # 3 dim

    # fbank and pitch may differ by a few frames, truncate to the shortest
    nframes = min(fbank.shape[0], postpitch.shape[0])
    features = np.concatenate(
        (fbank[:nframes, :], postpitch[:nframes, :]), axis=-1)

    # name = os.path.basename(audio_path).split('.')[0] + '.npy'
    # np.save(os.path.join(save_path, name), features.data)
    return features
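# A usage sketch for extract_features_shennong ('utt1.wav' is a
# hypothetical path). With num_bins=40 and the 3 post-processed pitch
# dimensions, the result is a (nframes, 43) array:
#
#     feats = extract_features_shennong('utt1.wav', save_path=None)
#     assert feats.shape[1] == 43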
def wavs_to_feats_df(wavs_list, feats):
    assert feats in ['mfcc', 'bnf'], \
        "Unknown feature parameter for wavs_to_feats_df function: {}".format(feats)

    feats_list = []
    for wav_file in wavs_list:
        wav_data = Audio.load(wav_file).resample(8000)
        assert wav_data.sample_rate == 8000, \
            "Error. Could not resample file to 8000 Hz for MFCC/BNF feature extraction."
        assert wav_data.nchannels == 1, \
            "Unexpected non-mono file supplied: {}".format(wav_file)

        if feats == 'mfcc':
            mfcc_data = mfcc_processor.process(wav_data)
            mfcc_data = delta_processor.process(mfcc_data)
            feats_list.append(mfcc_data.data)
        elif feats == 'bnf':
            bnf_data = bnf_processor.process(wav_data)
            feats_list.append(bnf_data.data)

    feats_df = pd.DataFrame({
        # '.../filename.wav' => 'filename'
        "filename": [os.path.splitext(os.path.basename(f))[0] for f in wavs_list],
        "features": feats_list})
    return feats_df
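# wavs_to_feats_df assumes module-level mfcc_processor, delta_processor
# and bnf_processor objects. A plausible setup (a sketch, not necessarily
# the original configuration; import paths assume the shennong 1.0 layout):
#
#     from shennong.processor.mfcc import MfccProcessor
#     from shennong.processor.bottleneck import BottleneckProcessor
#     from shennong.postprocessor.delta import DeltaPostProcessor
#
#     mfcc_processor = MfccProcessor(sample_rate=8000)
#     delta_processor = DeltaPostProcessor(order=2)
#     bnf_processor = BottleneckProcessor(weights='BabelMulti')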
def test_check_wavs_bad(wav_file, wav_file_8k, tmpdir, capsys):
    def fun(utts):
        c = pipeline._init_config(
            pipeline.get_default_config('mfcc', with_cmvn=False))
        u = pipeline._init_utterances(utts)
        pipeline._Manager(c, u)
        return u

    # build a stereo file and make sure it is not supported by the pipeline
    audio = Audio.load(wav_file)
    stereo = Audio(
        np.asarray((audio.data, audio.data)).T, sample_rate=audio.sample_rate)
    assert stereo.nchannels == 2
    wav_file_2 = str(tmpdir.join('stereo.wav'))
    stereo.save(wav_file_2)
    with pytest.raises(ValueError) as err:
        fun([(wav_file_2, )])
    assert 'all wav files are not mono' in str(err)

    # ensure we catch differences in sample rates
    capsys.readouterr()  # clear buffer
    w = [(wav_file, ), (wav_file_8k, )]
    out = fun(w)
    err = capsys.readouterr().err
    assert 'several sample rates found in wav files' in err
    assert sorted(out.keys()) == ['utt_1', 'utt_2']

    # make sure timestamps are ordered
    with pytest.raises(ValueError) as err:
        fun([('1', wav_file, 1, 0)])
    assert 'timestamps are not in increasing order for' in str(err)
def get_plp_dd(wav_fn, norm):
    """Return the PLPs with deltas and delta-deltas for an audio file."""
    audio = Audio.load(wav_fn)
    processor = PlpProcessor(
        sample_rate=audio.sample_rate, window_type="hamming",
        frame_length=0.025, frame_shift=0.01, low_freq=0,
        vtln_low=60, vtln_high=7200, high_freq=audio.sample_rate / 2)
    plp_static = processor.process(audio, vtln_warp=1.0)

    d_processor = DeltaPostProcessor(order=2)
    plp_deltas = d_processor.process(plp_static)

    features = np.float64(plp_deltas.data)
    if norm == "cmvn":
        features = (features - np.mean(features, axis=0)) \
            / np.std(features, axis=0)
    return features
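# A usage sketch for get_plp_dd ('utt1.wav' is a hypothetical path). With
# DeltaPostProcessor(order=2) the output triples the static dimension
# (e.g. 13 PLP coefficients -> 39 columns):
#
#     feats = get_plp_dd('utt1.wav', norm='cmvn')  # per-file CMVN applied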
def transform_all_wavs(folder_wav, weights, folder_out):
    """Save bottleneck features, one (nframes, ndims) csv file per wav."""
    processor = BottleneckProcessor(weights=weights)
    count = 0
    for file in os.listdir(folder_wav):
        if count % 500 == 0:
            print(count)
        count += 1
        if not file.endswith('.wav'):
            continue
        audio = Audio.load(os.path.join(folder_wav, file))
        features = processor.process(audio)
        np.savetxt(
            fname=os.path.join(folder_out, file[:-4] + '.csv'),
            X=features.data)
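# A usage sketch (hypothetical directories); shennong ships 'BabelMulti',
# 'FisherMono' and 'FisherTri' bottleneck weights:
#
#     transform_all_wavs('wavs/', 'BabelMulti', 'feats/')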
def test_compare_kaldi(wav_file):
    a1 = Audio.load(wav_file).data

    with tempfile.NamedTemporaryFile('w+') as tfile:
        tfile.write('test {}\n'.format(wav_file))
        tfile.seek(0)

        with SequentialWaveReader('scp,t:' + tfile.name) as reader:
            for key, wave in reader:
                a2 = wave.data().numpy()

                assert a1.max() == a2.max()
                assert a1.min() == a2.min()
                assert len(a1) == len(a2.flatten()) == 22713
                assert a1.dtype == np.int16 and a2.dtype == np.float32
                assert a1.shape == (22713,) and a2.shape == (1, 22713)
                assert a1 == pytest.approx(a2.flatten())
def main():
    parser = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter)
    parser.add_argument('data_dir', help='input directory with wavs')
    parser.add_argument(
        'output_dir', default='/tmp', nargs='?',
        help='output directory (created files are deleted at exit)')
    args = parser.parse_args()

    # load audio data and compute total duration
    audio_data = {
        os.path.basename(f): Audio.load(f)
        for f in list_files_with_extension(args.data_dir, '.wav')}
    total_duration = datetime.timedelta(
        seconds=int(sum(a.duration for a in audio_data.values())))
    print('found {} wav files, total duration of {}'.format(
        len(audio_data), str(total_duration)))

    # compute the features (default MFCC)
    print('computing MFCC features...')
    t1 = datetime.datetime.now()
    processor = MfccProcessor()
    features = FeaturesCollection(
        **{k: processor.process(v) for k, v in audio_data.items()})
    t2 = datetime.datetime.now()
    print('took {}'.format(t2 - t1))

    # save the features in all the supported formats
    data = {
        'duration': total_duration,
        'data': {
            ext: analyze_serializer(features, ext, args.output_dir)
            for ext in supported_extensions().keys()}}
    print_results(data)
def get_features(sound_file, chosen_processor):
    """Compute the feature coefficients of a sound file.

    :param sound_file: path to a sound file in .wav format
    :param chosen_processor: name of the processor to use, can be
        'filterbank', 'plp', 'rastaplp' or 'bottleneck'
    :returns: a pandas DataFrame of feature coefficients, one row per
        frame (25 ms frames computed every 10 ms)
    """
    audio = Audio.load(sound_file)
    processors = {
        'filterbank': FilterbankProcessor(sample_rate=audio.sample_rate),
        'plp': PlpProcessor(sample_rate=audio.sample_rate),
        'rastaplp': RastaPlpProcessor(sample_rate=audio.sample_rate),
        'bottleneck': BottleneckProcessor(weights='BabelMulti')}

    features = processors[chosen_processor].process(audio)
    return pd.DataFrame(features.data)
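# A usage sketch for get_features ('utt1.wav' is a hypothetical path):
#
#     df = get_features('utt1.wav', 'bottleneck')
#     print(df.shape)  # (nframes, ndims), e.g. 80 bottleneck dimensions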
def get_mfcc_vtln(wav_fn, f, norm, lang):
    """Return the VTLN-warped MFCCs with deltas and delta-deltas for an audio file."""
    ref = os.path.basename(f).replace(".wav", "")

    # load the per-utterance warp factors, converting the plain text warps
    # file to a pickle the first time around
    if not os.path.isfile("warps_{}.pkl".format(lang)):
        if os.path.isfile('warps_{}.txt'.format(lang)):
            factors = {}
            with open('warps_{}.txt'.format(lang),
                      mode='r', encoding='utf-8') as opfile:
                wop = opfile.read().split('\n')
                for line in wop:
                    if len(line) > 1:
                        l_sp = line.split()
                        factors[l_sp[0]] = float(l_sp[1])
            print(factors)
            with open('warps_{}.pkl'.format(lang), mode='wb') as opfile:
                pickle.dump(factors, opfile)
        else:
            print('no warp factors found')
            exit()

    with open("warps_{}.pkl".format(lang), mode="rb") as op:
        factors = pickle.load(op)
    warp = float(factors[ref])

    audio = Audio.load(wav_fn)
    processor = MfccProcessor(
        sample_rate=audio.sample_rate, window_type="hamming",
        frame_length=0.025, frame_shift=0.01, cepstral_lifter=26.0,
        low_freq=0, vtln_low=60, vtln_high=7200,
        high_freq=audio.sample_rate / 2)
    d_processor = DeltaPostProcessor(order=2)

    mfcc_static = processor.process(audio, vtln_warp=warp)
    mfcc_deltas = d_processor.process(mfcc_static)

    features = np.float64(mfcc_deltas.data)
    if norm == "cmvn":
        features = (features - np.mean(features, axis=0)) \
            / np.std(features, axis=0)
    return features
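# get_mfcc_vtln expects a warps_<lang>.txt file with one
# '<utterance> <warp-factor>' pair per line, e.g. (illustrative values):
#
#     spkr01_utt1 0.95
#     spkr01_utt2 1.05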
def main():
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument('wav', help='wav file to compute features on')

    # load the wav file
    wav_file = parser.parse_args().wav
    audio = Audio.load(wav_file)

    # initialize features processors
    processors = {
        'spectrogram': SpectrogramProcessor(sample_rate=audio.sample_rate),
        'filterbank': FilterbankProcessor(sample_rate=audio.sample_rate),
        'mfcc': MfccProcessor(sample_rate=audio.sample_rate),
        'plp': PlpProcessor(sample_rate=audio.sample_rate),
        'rastaplp': RastaPlpProcessor(sample_rate=audio.sample_rate),
        'bottleneck': BottleneckProcessor(weights='BabelMulti')}

    # compute the features for all processors
    features = {k: v.process(audio) for k, v in processors.items()}

    # plot the audio signal and the resulting features
    fig, axes = plt.subplots(
        nrows=len(processors) + 1,
        gridspec_kw={'top': 0.95, 'bottom': 0.05, 'hspace': 0},
        subplot_kw={'xticks': [], 'yticks': []})

    time = np.arange(0.0, audio.nsamples) / audio.sample_rate
    axes[0].plot(time, audio.astype(np.float32).data)
    axes[0].set_xlim(0.0, audio.duration)
    axes[0].text(
        0.02, 0.8, 'audio',
        bbox={'boxstyle': 'round', 'alpha': 0.5, 'color': 'white'},
        transform=axes[0].transAxes)

    for n, (k, v) in enumerate(features.items(), start=1):
        axes[n].imshow(v.data.T, aspect='auto')
        axes[n].text(
            0.02, 0.8, k,
            bbox={'boxstyle': 'round', 'alpha': 0.5, 'color': 'white'},
            transform=axes[n].transAxes)

    plt.show()
def get_audio(self, utterance):
    """Returns the audio data for that `utterance`"""
    utt = self.utterances[utterance]
    audio = Audio.load(utt.file)
    if utt.tstart is not None:
        assert utt.tstop > utt.tstart
        audio = audio.segment([(utt.tstart, utt.tstop)])[0]

    if self.features == 'bottleneck':
        # resample the signal here (this avoids bugs if one part of the
        # pipeline runs at 8k and the other at 16k), then update the wav
        # metadata to be used by the rest of the pipeline
        self.log.debug(
            'resampling audio from %dHz@%db to %dHz@%db',
            audio.sample_rate, audio.dtype.itemsize * 8, 8000, 16)

        audio = audio.resample(8000).astype(np.int16)
        self._wavs_metadata[self.utterances[utterance].file] = (
            Audio._metawav(
                audio.nchannels, audio.sample_rate,
                audio.nsamples, audio.duration))

    return audio
def test_load_badfile():
    with pytest.raises(ValueError) as err:
        Audio.load('/spam/spam/with/eggs')
    assert 'file not found' in str(err)
@pytest.fixture
def audio_8k(wav_file_8k):
    return Audio.load(wav_file_8k)
def test_load_notwav():
    with pytest.raises(ValueError) as err:
        Audio.load(__file__)
    assert 'is it a wav?' in str(err)
    k = shortest_path_position[0][0]
    l = shortest_path_position[1][0]

    # divide the shortest distance by the length of the path
    average_distance = (
        distance_matrix[vector_1.shape[0] - 1][vector_2.shape[0] - 1]
        / path_length)
    return average_distance


all_features = {}

# get bottleneck features of all .wav files (stimuli); instantiate the
# processor once, its weights are costly to load
processor = BottleneckProcessor(weights='BabelMulti')
for root, dirs, files in os.walk(WAV_FOLDER):
    for wav_file in files:
        if wav_file.endswith(".wav"):
            audio = Audio.load(os.path.join(root, wav_file))
            features = processor.process(audio)
            vectors = features.data
            utterance = wav_file.split('.')[0]
            all_features[utterance] = vectors

for row in distance_list.itertuples():
    row_index = getattr(row, 'Index')
    trip_id = getattr(row, 'tripletid')

    bottle_oth = all_features[trip_id + "_OTH"]
    bottle_tgt = all_features[trip_id + "_TGT"]
    bottle_x = all_features[trip_id + "_X"]

    eucl_oth_x = calculate_distances_dtw(bottle_oth, bottle_x)
@pytest.fixture
def audio(wav_file):
    return Audio.load(wav_file)