def test_kaldi_audio(wav_file, audio, dtype):
    # make sure we get the same results when loading a wav file with
    # shennong.Audio and with the Kaldi code
    with tempfile.NamedTemporaryFile('w+') as tfile:
        tfile.write('test {}\n'.format(wav_file))
        tfile.seek(0)

        with SequentialWaveReader('scp,t:' + tfile.name) as reader:
            for key, wave in reader:
                audio_kaldi = Audio(
                    wave.data().numpy().reshape(audio.data.shape),
                    audio.sample_rate, validate=False)

    audio = audio.astype(dtype)
    assert audio.duration == audio_kaldi.duration
    assert audio.dtype == dtype
    assert audio.is_valid()
    assert audio_kaldi.dtype == np.float32
    assert not audio_kaldi.is_valid()  # not in [-1, 1] but [-2**15, 2**15-1]

    mfcc = MfccProcessor().process(audio)
    mfcc_kaldi = MfccProcessor().process(audio_kaldi)
    assert mfcc.shape == mfcc_kaldi.shape
    assert np.array_equal(mfcc.times, mfcc_kaldi.times)
    assert mfcc.properties == mfcc_kaldi.properties
    assert mfcc.dtype == mfcc_kaldi.dtype
    # pytest.approx(a, b) alone is always truthy, the comparison must
    # be explicit
    assert mfcc.data == pytest.approx(mfcc_kaldi.data)
def test_check_wavs_bad(wav_file, wav_file_8k, tmpdir, capsys):
    def fun(utts):
        c = pipeline._init_config(
            pipeline.get_default_config('mfcc', with_cmvn=False))
        u = pipeline._init_utterances(utts)
        pipeline._Manager(c, u)
        return u

    # build a stereo file and make sure it is not supported by the
    # pipeline
    audio = Audio.load(wav_file)
    stereo = Audio(
        np.asarray((audio.data, audio.data)).T,
        sample_rate=audio.sample_rate)
    assert stereo.nchannels == 2
    wav_file_2 = str(tmpdir.join('stereo.wav'))
    stereo.save(wav_file_2)
    with pytest.raises(ValueError) as err:
        fun([(wav_file_2, )])
    assert 'all wav files are not mono' in str(err)

    # ensure we catch differences in sample rates
    capsys.readouterr()  # clear buffer
    w = [(wav_file, ), (wav_file_8k, )]
    out = fun(w)
    err = capsys.readouterr().err
    assert 'several sample rates found in wav files' in err
    assert sorted(out.keys()) == ['utt_1', 'utt_2']

    # make sure timestamps are ordered
    with pytest.raises(ValueError) as err:
        fun([('1', wav_file, 1, 0)])
    assert 'timestamps are not in increasing order for' in str(err)
def test_scan_bad():
    with pytest.raises(ValueError) as err:
        Audio.scan(__file__)
    assert 'is it a wav?' in str(err)

    with pytest.raises(ValueError) as err:
        Audio.scan('/path/to/some/lost/place')
    assert 'file not found' in str(err)
def test_equal(audio):
    assert audio == audio

    audio2 = Audio(audio.data, audio.sample_rate)
    assert audio == audio2

    audio2 = Audio(audio.data, audio.sample_rate + 1)
    assert audio != audio2

    audio2 = Audio(audio.data * 2, audio.sample_rate)
    assert audio.duration == audio2.duration
    assert audio.sample_rate == audio2.sample_rate
    assert audio != audio2
def wavs_to_feats_df(wavs_list, feats):
    assert feats in ['mfcc', 'bnf'], \
        "Unknown feature parameter for wavs_to_feats_df function: {}".format(
            feats)

    feats_list = []
    for wav_file in wavs_list:
        wav_data = Audio.load(wav_file).resample(8000)
        assert wav_data.sample_rate == 8000, \
            "Could not resample file to 8000 Hz for MFCC/BNF " \
            "feature extraction."
        assert wav_data.nchannels == 1, \
            "Unexpected non-mono file supplied: {}".format(wav_file)

        if feats == 'mfcc':
            mfcc_data = mfcc_processor.process(wav_data)
            mfcc_data = delta_processor.process(mfcc_data)
            feats_list.append(mfcc_data.data)
        elif feats == 'bnf':
            bnf_data = bnf_processor.process(wav_data)
            feats_list.append(bnf_data.data)

    feats_df = pd.DataFrame({
        # '.../filename.wav' => 'filename'
        "filename": [
            os.path.splitext(os.path.basename(f))[0] for f in wavs_list],
        "features": feats_list})
    return feats_df
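# The module-level `mfcc_processor`, `delta_processor` and `bnf_processor`
# used by wavs_to_feats_df are not defined in this snippet. A minimal
# sketch of how they might be instantiated with shennong, assuming the
# 8000 Hz rate the function resamples to (these names and parameters are
# assumptions, not from the original):
mfcc_processor = MfccProcessor(sample_rate=8000)
delta_processor = DeltaPostProcessor(order=2)
bnf_processor = BottleneckProcessor(weights='BabelMulti')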
def test_segment(audio):
    d = audio.duration
    assert audio.segment([(0., d)])[0] == audio
    assert audio.segment([(0., d + 10)])[0] == audio

    chunks = audio.segment([(0, d/2), (d/2, d)])
    assert all(c.duration == pytest.approx(d/2, rel=1e-3) for c in chunks)
    assert sum(c.nsamples for c in chunks) == audio.nsamples
    assert Audio(
        np.concatenate([c.data for c in chunks]), audio.sample_rate) == audio

    chunks = audio.segment([(0, d/3), (d/3, 2*d/3), (2*d/3, d)])
    assert all(c.duration == pytest.approx(d/3, rel=1e-3) for c in chunks)
    assert sum(c.nsamples for c in chunks) == audio.nsamples
    assert Audio(
        np.concatenate([c.data for c in chunks]), audio.sample_rate) == audio
def get_features(self, y, sample_rate):
    """Feature extraction

    Parameters
    ----------
    y : (n_samples, 1) numpy array
        Waveform
    sample_rate : int
        Sample rate

    Returns
    -------
    data : (n_frames, n_dimensions) numpy array
        Features

    """
    # Scale the audio signal between -1 and 1 before creating the audio
    # object with shennong: when pyannote uses data augmentation it
    # normalizes the signal, but when loading the data without
    # augmentation it does not.
    y = y / np.max((-np.min(y), np.max(y)))

    # create the audio object for shennong
    audio = Audio(data=y, sample_rate=sample_rate)

    # create the processor and extract the bottleneck features
    processor = BottleneckProcessor(weights=self.weights)
    bottleneck = processor.process(audio)

    if self.with_pitch:
        # extract pitch and concatenate it to the bottleneck features;
        # sometimes Kaldi adds one frame to pitch, so give 2 frames of
        # tolerance
        pitch = self.get_pitch(audio, self.pitchFmin, self.pitchFmax)
        bottleneck = self.concatenate_with_pitch(bottleneck.data, pitch.data)

        # add 1 frame at the beginning and 1 frame at the end to ensure
        # we have the same length as MFCCs, etc.
        bottleneck = np.insert(
            bottleneck, 0, np.zeros((1, bottleneck.shape[1])), axis=0)
        bottleneck = np.insert(
            bottleneck, bottleneck.shape[0],
            np.zeros((1, bottleneck.shape[1])), axis=0)
    else:
        bottleneck = bottleneck.data

    return bottleneck
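# The `concatenate_with_pitch` helper used above is defined elsewhere in
# the class and not shown here. Given the comment about Kaldi sometimes
# emitting one extra pitch frame, a minimal sketch of what it could look
# like, assuming it simply truncates both arrays to the shorter one
# before stacking:
def concatenate_with_pitch(self, features, pitch):
    """Stack `pitch` columns onto `features`, frame-aligned"""
    # tolerate small frame count mismatches by truncating both arrays
    # to the shortest of the two
    nframes = min(features.shape[0], pitch.shape[0])
    return np.hstack((features[:nframes], pitch[:nframes]))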
def extract_features_shennong(audio_path, save_path):
    audio = Audio.load(audio_path)

    # 40-dim filterbank, energy disabled
    processor = FilterbankProcessor(
        sample_rate=audio.sample_rate, num_bins=40, use_energy=False)
    fbank = processor.process(audio).data

    # 3-dim pitch, post-processed with the default options
    options = {
        'sample_rate': audio.sample_rate,
        'frame_shift': 0.01,
        'frame_length': 0.025,
        'min_f0': 20,
        'max_f0': 500}
    processor = PitchProcessor(**options)
    pitch = processor.process(audio)
    postprocessor = PitchPostProcessor()  # use default options
    postpitch = postprocessor.process(pitch).data

    # truncate both features to the same number of frames, then
    # concatenate them along the feature dimension
    nframes = min(fbank.shape[0], postpitch.shape[0])
    features = np.concatenate(
        (fbank[:nframes, :], postpitch[:nframes, :]), axis=-1)

    # name = os.path.basename(audio_path).split('.')[0] + '.npy'
    # np.save(os.path.join(save_path, name), features)
    return features
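# A hypothetical call to the extractor above (the paths are placeholders);
# each output frame concatenates the 40 filterbank and 3 pitch dimensions:
feats = extract_features_shennong('utterance.wav', '/tmp/features')
print(feats.shape)  # expected (n_frames, 43)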
def test_save(tmpdir, audio):
    p = str(tmpdir.join('test.wav'))
    audio.save(p)

    # cannot overwrite an existing file
    with pytest.raises(ValueError) as err:
        audio.save(p)
    assert 'file already exist' in str(err)

    audio2 = Audio.load(p)
    assert audio == audio2

    # test with a float32 wav signal
    signal = np.zeros((1000,), dtype=np.float32)
    signal[10] = 1.0
    signal[20] = -1.0
    p = str(tmpdir.join('test2.wav'))
    audio = Audio(signal, 1000)
    audio.save(p)

    meta = Audio.scan(p)
    assert meta.nsamples == 1000
    assert meta.nchannels == 1

    audio2 = Audio.load(p)
    assert audio2 == audio
    assert audio2.data.min() == -1.0
    assert audio2.data.max() == 1.0
def get_features(self, y, sample_rate):
    """Feature extraction

    Parameters
    ----------
    y : (n_samples, 1) numpy array
        Waveform
    sample_rate : int
        Sample rate

    Returns
    -------
    data : (n_frames, n_dimensions) numpy array
        Features

    """
    # Scale the audio signal between -1 and 1 before creating the audio
    # object with shennong: when pyannote uses data augmentation it
    # normalizes the signal, but when loading the data without
    # augmentation it does not.
    y = y / np.max((-np.min(y), np.max(y)))

    # create the audio object for shennong
    audio = Audio(data=y, sample_rate=sample_rate)

    # create and parametrize the filterbank processor
    processor = FilterbankProcessor(sample_rate=sample_rate)
    processor.use_energy = self.e
    processor.frame_length = self.duration
    processor.frame_shift = self.step
    processor.window_type = self.fftWindow
    processor.low_freq = self.melLowFreq
    processor.high_freq = self.melHighFreq
    processor.num_bins = self.melNbFilters
    processor.snip_edges = False

    # process audio to get the filterbanks
    fbank = processor.process(audio)

    if self.with_pitch:
        # extract pitch and concatenate it to the filterbanks;
        # sometimes Kaldi adds one frame to pitch, so give 2 frames of
        # tolerance
        pitch = self.get_pitch(audio, self.pitchFmin, self.pitchFmax)
        fbank = self.concatenate_with_pitch(fbank.data, pitch.data)
    else:
        fbank = fbank.data

    return fbank
def test_silence():
    silence = Audio(np.zeros((100,)), 16000)
    with pytest.raises(RuntimeError) as err:
        BottleneckProcessor().process(silence)
    assert 'no voice detected in signal' in str(err.value)

    # on silence the VAD is all false
    vad = _compute_vad(silence.data, null_logger(), bugfix=True)
    assert not vad.any()
def get_features(self, y, sample_rate):
    """Feature extraction

    Parameters
    ----------
    y : (n_samples, 1) numpy array
        Waveform
    sample_rate : int
        Sample rate

    Returns
    -------
    data : (n_frames, n_dimensions) numpy array
        Features

    """
    # Scale the audio signal between -1 and 1 before creating the audio
    # object with shennong: when pyannote uses data augmentation it
    # normalizes the signal, but when loading the data without
    # augmentation it does not.
    y = y / np.max((-np.min(y), np.max(y)))

    # create the audio object for shennong
    audio = Audio(data=y, sample_rate=sample_rate)

    # spectrogram parameters
    processor = SpectrogramProcessor(sample_rate=sample_rate)
    processor.window_type = self.window_type
    processor.dither = self.dither
    processor.preemph_coeff = self.preemph_coeff
    processor.remove_dc_offset = self.remove_dc_offset
    processor.round_to_power_of_two = self.round_to_power_of_two
    processor.blackman_coeff = self.blackman_coeff
    processor.energy_floor = self.energy_floor
    processor.raw_energy = self.raw_energy
    processor.snip_edges = False  # end with correct number of frames

    # spectrogram extraction
    spect = processor.process(audio)

    if self.with_pitch:
        # extract pitch and concatenate it to the spectrogram;
        # sometimes Kaldi adds one frame to pitch, so give 2 frames of
        # tolerance
        pitch = self.get_pitch(audio, self.pitchFmin, self.pitchFmax)
        spect = self.concatenate_with_pitch(spect.data, pitch.data)
    else:
        spect = spect.data

    return spect
def test_bad_signal(audio):
    signal = Audio(np.random.random((10, 2)), 50)
    proc = SpectrogramProcessor(sample_rate=signal.sample_rate)
    with pytest.raises(ValueError) as err:
        proc.process(signal)
    assert 'signal must have one dimension' in str(err)

    with pytest.raises(ValueError) as err:
        proc = SpectrogramProcessor(sample_rate=signal.sample_rate + 1)
        proc.process(audio)
    assert 'mismatch in sample rates' in str(err)
def get_plp_dd(wav_fn, norm):
    """Return the PLPs with deltas and delta-deltas for an audio file."""
    audio = Audio.load(wav_fn)
    processor = PlpProcessor(
        sample_rate=audio.sample_rate, window_type="hamming",
        frame_length=0.025, frame_shift=0.01, low_freq=0,
        vtln_low=60, vtln_high=7200, high_freq=audio.sample_rate / 2)
    plp_static = processor.process(audio, vtln_warp=1.0)

    d_processor = DeltaPostProcessor(order=2)
    plp_deltas = d_processor.process(plp_static)

    features = np.float64(plp_deltas._to_dict()["data"])
    if norm == "cmvn":
        features = (
            features - np.mean(features, axis=0)) / np.std(features, axis=0)
    return features
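# Hypothetical usage of get_plp_dd (the wav path is a placeholder); with
# the default 13 PLP coefficients and order-2 deltas the output should
# have 3 * 13 = 39 columns per frame:
plp = get_plp_dd('utterance.wav', norm='cmvn')
print(plp.shape)  # expected (n_frames, 39)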
def test_shape():
    # it was a bug when audio data is shaped (n, 1): it must be
    # reshaped as (n,). The bug happens when converting audio data to a
    # pykaldi vector.
    d1 = np.random.random((100,))
    assert d1.shape == (100,)

    d2 = np.random.random((100, 1))
    assert d2.shape == (100, 1)

    for d in (d1, d2):
        a = Audio(d, 10)
        assert a.shape == (100,)
def get_audio(self, utterance):
    """Returns the audio data for that `utterance`"""
    utt = self.utterances[utterance]
    audio = Audio.load(utt.file)
    if utt.tstart is not None:
        assert utt.tstop > utt.tstart
        audio = audio.segment([(utt.tstart, utt.tstop)])[0]

    if self.features == 'bottleneck':
        # resample the signal here (this avoids bugs if one part of the
        # pipeline runs on 8k and the other on 16k), then update the
        # wav metadata to be used by the rest of the pipeline
        self.log.debug(
            'resampling audio from %dHz@%db to %dHz@%db',
            audio.sample_rate, audio.dtype.itemsize * 8, 8000, 16)

        audio = audio.resample(8000).astype(np.int16)
        self._wavs_metadata[self.utterances[utterance].file] = (
            Audio._metawav(
                audio.nchannels, audio.sample_rate,
                audio.nsamples, audio.duration))

    return audio
def transform_all_wavs(folder_wav, weights, folder_out):
    # outputs one [time x dim] array per wav file, saved as csv
    processor = BottleneckProcessor(weights=weights)
    count = 0
    for file in os.listdir(folder_wav):
        if count % 500 == 0:
            print(count)
        count += 1
        if not file.endswith('.wav'):
            continue
        audio = Audio.load(os.path.join(folder_wav, file))
        features = processor.process(audio)
        np.savetxt(
            fname=os.path.join(folder_out, file[:-4] + '.csv'),
            X=features._data)
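# Hypothetical usage (the directory names are placeholders); 'BabelMulti'
# is one of the pretrained weight sets used elsewhere in this code:
transform_all_wavs('wavs/', 'BabelMulti', 'features/')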
def test_output(audio):
    assert MfccProcessor(frame_shift=0.01).process(audio).shape == (140, 13)
    assert MfccProcessor(frame_shift=0.02).process(audio).shape == (70, 13)
    assert MfccProcessor(
        frame_shift=0.02, frame_length=0.05).process(audio).shape == (69, 13)

    # sample rate mismatch
    with pytest.raises(ValueError):
        MfccProcessor(sample_rate=8000).process(audio)

    # only mono signals are accepted
    with pytest.raises(ValueError):
        data = np.random.random((1000, 2))
        stereo = Audio(data, sample_rate=16000)
        MfccProcessor(sample_rate=stereo.sample_rate).process(stereo)
def test_compare_kaldi(wav_file):
    a1 = Audio.load(wav_file).data

    with tempfile.NamedTemporaryFile('w+') as tfile:
        tfile.write('test {}\n'.format(wav_file))
        tfile.seek(0)

        with SequentialWaveReader('scp,t:' + tfile.name) as reader:
            for key, wave in reader:
                a2 = wave.data().numpy()

    assert a1.max() == a2.max()
    assert a1.min() == a2.min()
    assert len(a1) == len(a2.flatten()) == 22713
    assert a1.dtype == np.int16 and a2.dtype == np.float32
    assert a1.shape == (22713,) and a2.shape == (1, 22713)
    # pytest.approx(a, b) alone is always truthy, the comparison must
    # be explicit
    assert a1 == pytest.approx(a2.flatten())
def __init__(self, config, utterances,
             log=get_logger('manager', 'warning')):
    self._config = config
    self._utterances = utterances
    self._warps = {}
    self.log = log

    self._check_utterances()

    # store the metadata because we need to access the sample rate for
    # processors instantiation
    audio_files = set(utt.audio_file for utt in utterances)
    self._audio_metadata = {}
    for audio in audio_files:
        log.debug('scanning %s', audio)
        self._audio_metadata[audio] = Audio.scan(audio)

    # make sure all the audio files are compatible with the pipeline
    log.info('scanning %s utterances...', len(self._utterances))
    self._check_audio_files()

    # the features type to be extracted
    self.features = [
        k for k in self.config.keys() if k in self.valid_features][0]

    # get some framing parameters constant for all processors
    # (retrieve them from a features processor instance)
    proc = self.get_features_processor(next(iter(self.utterances)))
    self.frame_length = proc.frame_length
    self.frame_shift = proc.frame_shift

    # if CMVN by speaker, instantiate a CMVN processor per speaker
    # here, else instantiate a processor per utterance
    if 'cmvn' in self.config:
        if self.config['cmvn']['by_speaker']:
            self._cmvn_processors = {
                spk: self.get_processor_class('cmvn')(proc.ndims)
                for spk in set(utt.speaker for utt in self.utterances)}
        else:
            self._cmvn_processors = {
                utt.name: self.get_processor_class('cmvn')(proc.ndims)
                for utt in self.utterances}
def test_output_shape(audio):
    assert EnergyProcessor(frame_shift=0.01).process(audio).shape == (140, 1)
    assert EnergyProcessor(frame_shift=0.02).process(audio).shape == (70, 1)
    assert EnergyProcessor(
        frame_shift=0.02, frame_length=0.05).process(audio).shape == (69, 1)

    # sample rate mismatch
    with pytest.raises(ValueError) as err:
        EnergyProcessor(sample_rate=8000).process(audio)
    assert 'mismatch in sample rate' in str(err)

    # only mono signals are accepted
    with pytest.raises(ValueError) as err:
        data = np.random.random((1000, 2))
        stereo = Audio(data, sample_rate=16000)
        EnergyProcessor(sample_rate=stereo.sample_rate).process(stereo)
    assert 'must have one dimension' in str(err)
def get_audio(self, utterance):
    """Returns the audio data for that `utterance`"""
    audio = utterance.load_audio()

    if self.features == 'bottleneck':
        # resample the signal here (this avoids bugs if one part of the
        # pipeline runs on 8k and the other on 16k), then update the
        # audio metadata to be used by the rest of the pipeline
        self.log.debug(
            'resampling audio from %dHz@%db to %dHz@%db',
            audio.sample_rate, audio.dtype.itemsize * 8, 8000, 16)

        audio = audio.resample(8000).astype(np.int16)
        self._audio_metadata[utterance.audio_file] = (
            Audio._metadata(
                audio.nchannels, audio.sample_rate,
                audio.nsamples, audio.duration))

    return audio
def main():
    parser = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter)
    parser.add_argument('data_dir', help='input directory with wavs')
    parser.add_argument(
        'output_dir', default='/tmp', nargs='?',
        help='output directory (created files are deleted at exit)')
    args = parser.parse_args()

    # load audio data and compute the total duration
    audio_data = {
        os.path.basename(f): Audio.load(f)
        for f in list_files_with_extension(args.data_dir, '.wav')}
    total_duration = datetime.timedelta(
        seconds=int(sum(a.duration for a in audio_data.values())))
    print('found {} wav files, total duration of {}'.format(
        len(audio_data), str(total_duration)))

    # compute the features (default MFCC)
    print('computing MFCC features...')
    t1 = datetime.datetime.now()
    processor = MfccProcessor()
    features = FeaturesCollection(
        **{k: processor.process(v) for k, v in audio_data.items()})
    t2 = datetime.datetime.now()
    print('took {}'.format(t2 - t1))

    # save the features in all the supported formats
    data = {
        'duration': total_duration,
        'data': {
            ext: analyze_serializer(features, ext, args.output_dir)
            for ext in supported_extensions().keys()}}
    print_results(data)
def get_features(sound_file, chosen_processor):
    # computes the feature coefficients of a sound file
    # :param sound_file: sound file in .wav format
    # :param chosen_processor: the processor to use, can be 'filterbank',
    #     'plp', 'rastaplp' or 'bottleneck'
    # :returns: feature coefficients per frame of 25ms, every 10ms
    # :rtype: a pandas DataFrame
    audio = Audio.load(sound_file)
    processors = {
        'filterbank': FilterbankProcessor(sample_rate=audio.sample_rate),
        'plp': PlpProcessor(sample_rate=audio.sample_rate),
        'rastaplp': RastaPlpProcessor(sample_rate=audio.sample_rate),
        'bottleneck': BottleneckProcessor(weights='BabelMulti')}

    # look up the chosen processor by name and wrap the resulting
    # features array in a DataFrame
    features = processors[chosen_processor].process(audio)
    features = pd.DataFrame(features.data)
    return features
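# Hypothetical usage of get_features (the wav path is a placeholder):
plp_feats = get_features('utterance.wav', 'plp')
bnf_feats = get_features('utterance.wav', 'bottleneck')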
def main():
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument('wav', help='wav file to compute features on')

    # load the wav file
    wav_file = parser.parse_args().wav
    audio = Audio.load(wav_file)

    # initialize the features processors
    processors = {
        'spectrogram': SpectrogramProcessor(sample_rate=audio.sample_rate),
        'filterbank': FilterbankProcessor(sample_rate=audio.sample_rate),
        'mfcc': MfccProcessor(sample_rate=audio.sample_rate),
        'plp': PlpProcessor(sample_rate=audio.sample_rate),
        'rastaplp': RastaPlpProcessor(sample_rate=audio.sample_rate),
        'bottleneck': BottleneckProcessor(weights='BabelMulti')}

    # compute the features for all processors
    features = {k: v.process(audio) for k, v in processors.items()}

    # plot the audio signal and the resulting features
    fig, axes = plt.subplots(
        nrows=len(processors) + 1,
        gridspec_kw={'top': 0.95, 'bottom': 0.05, 'hspace': 0},
        subplot_kw={'xticks': [], 'yticks': []})

    time = np.arange(0.0, audio.nsamples) / audio.sample_rate
    axes[0].plot(time, audio.astype(np.float32).data)
    axes[0].set_xlim(0.0, audio.duration)
    axes[0].text(
        0.02, 0.8, 'audio',
        bbox={'boxstyle': 'round', 'alpha': 0.5, 'color': 'white'},
        transform=axes[0].transAxes)

    for n, (k, v) in enumerate(features.items(), start=1):
        axes[n].imshow(v.data.T, aspect='auto')
        axes[n].text(
            0.02, 0.8, k,
            bbox={'boxstyle': 'round', 'alpha': 0.5, 'color': 'white'},
            transform=axes[n].transAxes)

    plt.show()
def __init__(self, config, utterances, log=get_logger()):
    self._config = config
    self._utterances = utterances
    self.log = log

    # the list of speakers
    self._speakers = set(u.speaker for u in self.utterances.values())
    if self._speakers == {None}:
        self._speakers = None
    self._check_speakers()

    # store the metadata because we need to access the sample rate for
    # processors instantiation
    wavs = set(u.file for u in utterances.values())
    self._wavs_metadata = {w: Audio.scan(w) for w in wavs}

    # make sure all the wavs are compatible with the pipeline
    log.info(f'scanning {len(self._utterances)} utterances...')
    self._check_wavs()

    # the features type to be extracted
    self.features = [
        k for k in self.config.keys() if k in self._valid_features][0]

    # get some framing parameters constant for all processors
    # (retrieve them from a features processor instance)
    p = self.get_features_processor(next(iter(self.utterances.keys())))
    self.frame_length = p.frame_length
    self.frame_shift = p.frame_shift

    # if CMVN by speaker, instantiate a CMVN processor per speaker
    # here, else instantiate a processor per utterance
    if 'cmvn' in self.config:
        if self.config['cmvn']['by_speaker']:
            self._cmvn_processors = {
                spk: self.get_processor_class('cmvn')(p.ndims)
                for spk in self.speakers}
        else:
            self._cmvn_processors = {
                utt: self.get_processor_class('cmvn')(p.ndims)
                for utt in self.utterances}
def get_mfcc_vtln(wav_fn, f, norm, lang):
    """Return the VTLN-warped MFCCs with deltas and delta-deltas for an audio file."""
    ref = os.path.basename(f).replace(".wav", "")

    # load the VTLN warp factors, converting them from text to pickle
    # format if needed
    if not os.path.isfile("warps_{}.pkl".format(lang)):
        if os.path.isfile('warps_{}.txt'.format(lang)):
            factors = {}
            with open('warps_{}.txt'.format(lang),
                      mode='r', encoding='utf-8') as opfile:
                wop = opfile.read().split('\n')
                for line in wop:
                    if len(line) > 1:
                        l_sp = line.split()
                        factors[l_sp[0]] = float(l_sp[1])
            print(factors)
            with open('warps_{}.pkl'.format(lang), mode='wb') as opfile:
                pickle.dump(factors, opfile)
        else:
            print('no warp factors found')
            exit()

    with open("warps_{}.pkl".format(lang), mode="rb") as op:
        factors = pickle.load(op)
    warp = float(factors[ref])

    audio = Audio.load(wav_fn)
    processor = MfccProcessor(
        sample_rate=audio.sample_rate, window_type="hamming",
        frame_length=0.025, frame_shift=0.01, cepstral_lifter=26.0,
        low_freq=0, vtln_low=60, vtln_high=7200,
        high_freq=audio.sample_rate / 2)
    d_processor = DeltaPostProcessor(order=2)

    mfcc_static = processor.process(audio, vtln_warp=warp)
    mfcc_deltas = d_processor.process(mfcc_static)

    features = np.float64(mfcc_deltas._to_dict()["data"])
    if norm == "cmvn":
        features = (
            features - np.mean(features, axis=0)) / np.std(features, axis=0)
    return features
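# The warps_{lang}.txt file parsed above is expected to hold one
# "<utterance-id> <warp-factor>" pair per line, for example
# (hypothetical values):
#
#   spk1_utt001 0.95
#   spk1_utt002 1.00
#   spk2_utt001 1.05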
def test_channels_stereo():
    data = np.random.random((1000, 2))
    audio2 = Audio(data, sample_rate=16000)
    assert audio2.nchannels == 2
    assert audio2.shape == (1000, 2)

    audio1 = audio2.channel(0)
    assert audio1.nchannels == 1
    assert audio1.shape == (1000,)
    assert all(np.equal(audio1.data, audio2.data[:, 0]))
    assert not all(np.equal(audio1.data, audio2.data[:, 1]))
    assert audio1.duration == audio2.duration

    audio1 = audio2.channel(1)
    assert audio1.nchannels == 1
    assert audio1.shape == (1000,)
    assert all(np.equal(audio1.data, audio2.data[:, 1]))
    assert not all(np.equal(audio1.data, audio2.data[:, 0]))

    with pytest.raises(ValueError):
        audio2.channel(2)
    k = shortest_path_position[0][0]
    l = shortest_path_position[1][0]

    # divide the shortest distance by the length of the path
    average_distance = (
        distance_matrix[vector_1.shape[0] - 1][vector_2.shape[0] - 1]
        / path_length)
    return average_distance


all_features = {}

# get bottleneck features of all .wav files (stimuli)
for root, dirs, files in os.walk(WAV_FOLDER):
    for wav_file in files:
        if wav_file.endswith(".wav"):
            audio = Audio.load(os.path.join(root, wav_file))
            processor = BottleneckProcessor(weights='BabelMulti')
            features = processor.process(audio)
            vectors = features.data
            utterance = wav_file.split('.')[0]
            all_features[utterance] = vectors

for row in distance_list.itertuples():
    row_index = getattr(row, 'Index')
    trip_id = getattr(row, 'tripletid')

    bottle_oth = all_features[trip_id + "_OTH"]
    bottle_tgt = all_features[trip_id + "_TGT"]
    bottle_x = all_features[trip_id + "_X"]

    eucl_oth_x = \
        calculate_distances_dtw(bottle_oth,\
def test_bad_signal():
    signal = Audio(np.random.random((10, 2)), 50)
    proc = RastaPlpProcessor(sample_rate=signal.sample_rate)
    with pytest.raises(ValueError) as err:
        proc.process(signal)
    assert 'signal must have one dimension' in str(err.value)