def get_features(self, y, sample_rate):
    """Feature extraction

    Parameters
    ----------
    y : (n_samples, 1) numpy array
        Waveform
    sample_rate : int
        Sample rate

    Returns
    -------
    data : (n_frames, n_dimensions) numpy array
        Features
    """
    # Scale the audio signal between -1 and 1 before creating the audio
    # object for shennong: when pyannote uses data augmentation it
    # normalizes the signal, but when loading the data without data
    # augmentation it does not, so normalize here for consistency.
    y = y / np.max((-np.min(y), np.max(y)))

    # create audio object for shennong
    audio = Audio(data=y, sample_rate=sample_rate)

    # create the bottleneck processor
    processor = BottleneckProcessor(weights=self.weights)

    # define parameters
    #processor.frame_length = self.duration
    #processor.frame_shift = self.step

    # extract features
    bottleneck = processor.process(audio)

    # compute pitch
    if self.with_pitch:
        # extract pitch
        pitch = self.get_pitch(audio, self.pitchFmin, self.pitchFmax)

        # concatenate bottleneck features with pitch - sometimes Kaldi
        # adds one frame to pitch, so give 2 frames of tolerance
        #bottleneck = bottleneck.concatenate(pitch, 2)
        bottleneck = self.concatenate_with_pitch(
            bottleneck.data, pitch.data)

        # add one frame at the beginning and one at the end to ensure
        # the same length as MFCCs etc.
        bottleneck = np.insert(
            bottleneck, 0, np.zeros((1, bottleneck.shape[1])), axis=0)
        bottleneck = np.insert(
            bottleneck, bottleneck.shape[0],
            np.zeros((1, bottleneck.shape[1])), axis=0)
    else:
        bottleneck = bottleneck.data

    return bottleneck
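# A minimal sketch (plain numpy, no shennong needed) of the peak
# normalization used above: dividing by the largest absolute sample maps
# the waveform into [-1, 1], matching the normalization pyannote applies
# during data augmentation. The example values are arbitrary.
import numpy as np

y = np.array([-32768.0, 0.0, 16384.0])
y = y / np.max((-np.min(y), np.max(y)))
print(y)  # [-1.   0.   0.5]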
def transform_all_wavs(folder_wav, weights, folder_out):
    """Extract bottleneck features for every .wav file in folder_wav and
    save each as a (n_frames x n_dims) .csv file in folder_out."""
    processor = BottleneckProcessor(weights=weights)
    count = 0
    for file in os.listdir(folder_wav):
        if count % 500 == 0:
            print(count)
        count += 1
        if not file.endswith('.wav'):
            continue
        audio = Audio.load(os.path.join(folder_wav, file))
        features = processor.process(audio)
        #print(features.shape)
        #print(features)
        np.savetxt(fname=os.path.join(folder_out, file[:-4] + '.csv'),
                   X=features.data)
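# Hypothetical invocation of the helper above; both folder names are
# placeholders, and 'BabelMulti' is one of the pretrained weight sets
# used with BottleneckProcessor elsewhere in this code.
transform_all_wavs('stimuli_wav/', 'BabelMulti', 'features_csv/')

# each output .csv then holds one (n_frames, n_dims) feature matrix
feats = np.loadtxt('features_csv/triplet001_OTH.csv')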
def test_process(capsys, audio, mfcc, weights):
    get_logger(level='debug')
    proc = BottleneckProcessor(weights=weights)
    feat = proc.process(audio)
    assert feat.shape == (140, 80)
    assert feat.shape[1] == proc.ndims
    assert np.allclose(feat.times, mfcc.times)
    assert proc.frame_length == 0.025
    assert proc.frame_shift == 0.01
    assert proc.sample_rate == 8000

    # check the log messages
    captured = capsys.readouterr().err
    assert 'resampling audio from 16000Hz@16b to 8000Hz@16b' in captured
    assert '{} frames of speech detected (on 140 total frames)'.format(
        '118' if audio._sox_binary else '121') in captured
    # divide the shortest distance by the length of the path
    average_distance = (
        distance_matrix[vector_1.shape[0] - 1][vector_2.shape[0] - 1]
        / path_length)
    return average_distance


all_features = {}

# get bottleneck features of all .wav files (stimuli)
processor = BottleneckProcessor(weights='BabelMulti')
for root, dirs, files in os.walk(WAV_FOLDER):
    for wav_file in files:
        if wav_file.endswith(".wav"):
            audio = Audio.load(os.path.join(root, wav_file))
            features = processor.process(audio)
            vectors = features.data
            utterance = wav_file.split('.')[0]
            all_features[utterance] = vectors

# distance_list is a pandas DataFrame defined elsewhere in the script,
# with one row per triplet and a 'tripletid' column
for row in distance_list.itertuples():
    row_index = getattr(row, 'Index')
    trip_id = getattr(row, 'tripletid')

    bottle_oth = all_features[trip_id + "_OTH"]
    bottle_tgt = all_features[trip_id + "_TGT"]
    bottle_x = all_features[trip_id + "_X"]

    eucl_oth_x = calculate_distances_dtw(bottle_oth, bottle_x)
    eucl_tgt_x = calculate_distances_dtw(bottle_tgt, bottle_x)
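# calculate_distances_dtw is called above but only its tail is shown. A
# minimal sketch of what it might look like, consistent with that tail
# (accumulated cost at the end of the alignment, normalized by the path
# length): plain DTW over frame-wise Euclidean distances. This is an
# assumption, not the original implementation.
import numpy as np
import scipy.spatial

def calculate_distances_dtw(vector_1, vector_2):
    n, m = vector_1.shape[0], vector_2.shape[0]
    cost = scipy.spatial.distance.cdist(vector_1, vector_2, 'euclidean')

    # accumulate the DTW cost matrix
    distance_matrix = np.full((n, m), np.inf)
    distance_matrix[0, 0] = cost[0, 0]
    for i in range(1, n):
        distance_matrix[i, 0] = distance_matrix[i - 1, 0] + cost[i, 0]
    for j in range(1, m):
        distance_matrix[0, j] = distance_matrix[0, j - 1] + cost[0, j]
    for i in range(1, n):
        for j in range(1, m):
            distance_matrix[i, j] = cost[i, j] + min(
                distance_matrix[i - 1, j - 1],
                distance_matrix[i - 1, j],
                distance_matrix[i, j - 1])

    # backtrack to count the number of steps on the optimal path
    i, j, path_length = n - 1, m - 1, 1
    while i > 0 or j > 0:
        candidates = []
        if i > 0 and j > 0:
            candidates.append((distance_matrix[i - 1, j - 1], i - 1, j - 1))
        if i > 0:
            candidates.append((distance_matrix[i - 1, j], i - 1, j))
        if j > 0:
            candidates.append((distance_matrix[i, j - 1], i, j - 1))
        _, i, j = min(candidates)
        path_length += 1

    # divide the shortest distance by the length of the path
    return distance_matrix[n - 1, m - 1] / path_length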
from shennong.audio import Audio
from shennong.features.processor.bottleneck import BottleneckProcessor
import numpy as np
import os
import sys

WAV_FOLDER = sys.argv[1]     # stimuli in .wav
OUT_NPZ_FILE = sys.argv[2]

processor = BottleneckProcessor(weights='BabelMulti')
all_features = {}

# get bottleneck features of all .wav files (stimuli)
for root, dirs, files in os.walk(WAV_FOLDER):
    for wav_file in files:
        if wav_file.endswith(".wav"):
            audio = Audio.load(os.path.join(root, wav_file))
            # process() takes a single Audio object; store the feature
            # matrix under the file name
            all_features[wav_file] = processor.process(audio).data

# save one array per utterance in a single compressed archive
np.savez_compressed(OUT_NPZ_FILE, **all_features)
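# Reading the archive back: the keys are the .wav file names used above
# (the path and utterance name here are placeholders).
archive = np.load('features.npz')
print(archive.files)                        # one entry per utterance
print(archive['triplet001_OTH.wav'].shape)  # (n_frames, n_dims)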