def audio_features(params, img_audio, audio_path, append_name, node_list):
    """Compute audio features for each node and append them to the output h5 file.

    Parameters:
        params: list indexed as used below — [0] alpha for the frequency
            spectrum, [1] number of filter banks, [2] window size in seconds,
            [3] frame shift in seconds, [4] feature type ('raw',
            'freq_spectrum', 'fbanks' or 'mfcc'), [5] pytables output file,
            [6] flag to add deltas and double deltas, [7] flag to add frame
            energy.
        img_audio: unused here; kept for interface compatibility with callers.
        audio_path: folder containing the .wav caption files.
        append_name: prefix used when the nodes were created; stripped from
            each node name to recover the caption base name.
        node_list: pytables nodes under which the feature groups are created.

    Nodes whose audio could not be read (empty or unrepairably broken wav
    files in the places database) are removed from the output file entirely
    and their names are printed at the end.
    """
    output_file = params[5]
    # pytable atom for the feature arrays
    f_atom = tables.Float32Atom()
    count = 1
    # keep track of the nodes for which no features could be made; the places
    # database contains some empty audio files
    invalid = []
    for node in node_list:
        print('processing file audio:' + str(count))
        count += 1
        # create a group for the desired feature type (e.g. a group called 'fbanks')
        audio_node = output_file.create_group(node, params[4])
        # get the base name of the node this feature will be appended to
        base_name = node._v_name.split(append_name)[1]
        # caption file names corresponding to the image of this node
        caption_files = [base_name + '.wav']
        for cap in caption_files:
            # basename for the caption file, i.e. cut off the file extension
            # as dots aren't allowed in pytables group names
            base_capt = cap.split('.')[0]
            # the places database splits the audio files over multiple
            # subfolders; those path prefixes appear in the caption names but
            # must not appear in the h5 group name
            if '/' in base_capt:
                base_capt = base_capt.split('/')[-1]
            # read audio samples
            try:
                input_data = read(os.path.join(audio_path, cap))
                # some places audio files are empty: break so the empty group
                # is detected below and the whole node removed
                if len(input_data[1]) == 0:
                    break
            except Exception:
                # try to repair the file; some files in places are so broken
                # they cannot be read at all — those nodes are removed below
                try:
                    fix_wav(os.path.join(audio_path, cap))
                    input_data = read(os.path.join(audio_path, cap))
                except Exception:
                    break
            # sampling frequency
            fs = input_data[0]
            # window and frameshift size in samples
            window_size = int(fs * params[2])
            frame_shift = int(fs * params[3])
            # create features (implemented are raw audio, the frequency
            # spectrum, fbanks and mfcc's)
            if params[4] == 'raw':
                [features, energy] = raw_frames(input_data, frame_shift, window_size)
            elif params[4] == 'freq_spectrum':
                [frames, energy] = raw_frames(input_data, frame_shift, window_size)
                features = get_freqspectrum(frames, params[0], fs, window_size)
            elif params[4] == 'fbanks':
                [frames, energy] = raw_frames(input_data, frame_shift, window_size)
                freq_spectrum = get_freqspectrum(frames, params[0], fs, window_size)
                features = get_fbanks(freq_spectrum, params[1], fs)
            elif params[4] == 'mfcc':
                [frames, energy] = raw_frames(input_data, frame_shift, window_size)
                freq_spectrum = get_freqspectrum(frames, params[0], fs, window_size)
                fbanks = get_fbanks(freq_spectrum, params[1], fs)
                features = get_mfcc(fbanks)
            else:
                # fail fast with a clear message instead of a NameError on
                # the undefined 'features' further down
                raise ValueError('unknown feature type: ' + str(params[4]))
            # optionally add the frame energy
            if params[7]:
                features = numpy.concatenate([energy[:, None], features], 1)
            # optionally add the deltas and double deltas
            if params[6]:
                single_delta = delta(features, 2)
                double_delta = delta(single_delta, 2)
                features = numpy.concatenate([features, single_delta, double_delta], 1)
            # create new leaf node in the feature group for the current audio file
            feature_shape = numpy.shape(features)[1]
            f_table = output_file.create_earray(audio_node,
                                                append_name + base_capt,
                                                f_atom,
                                                (0, feature_shape),
                                                expectedrows=5000)
            # append new data to the tables
            f_table.append(features)
        if audio_node._f_list_nodes() == []:
            # keep track of all the invalid nodes for which no features could be made
            invalid.append(node._v_name)
            # remove the top node including all other features if no caption
            # features could be created
            output_file.remove_node(node, recursive=True)
    print(invalid)
    return
def audio_features(params, audio_path, append_name, node_list):
    """Compute audio features for each node and append them to the output h5 file.

    Parameters:
        params: list indexed as used below — [0] alpha for the frequency
            spectrum, [1] number of filter banks, [2] window size in seconds,
            [3] frame shift in seconds, [4] feature type ('raw',
            'freq_spectrum', 'fbanks' or 'mfcc'), [5] pytables output file,
            [6] flag to add deltas and double deltas, [7] flag to add frame
            energy.
        audio_path: top folder containing the per-participant/emotion
            subfolders with the .wav files.
        append_name: prefix used when the nodes were created; stripped from
            each node name to recover the '<participant>_<emotion>_<word>'
            base name.
        node_list: pytables nodes under which the feature groups are created.

    Nodes for which the feature group ends up empty are removed from the
    output file and their names are printed at the end.
    """
    output_file = params[5]
    # pytable atom for the feature arrays
    f_atom = tables.Float32Atom()
    count = 1
    # keep track of the nodes for which no features could be made,
    # basically empty audio files
    invalid = []
    for node in node_list:
        print('processing file:' + str(count))
        count += 1
        # create a group for the desired feature type (e.g. a group called 'fbanks')
        audio_node = output_file.create_group(node, params[4])
        # get the base name of the node this feature will be appended to
        base_name = node._v_name.split(append_name)[1]
        # node names look like '<participant>_<emotion>_<word>'; the wav files
        # are stored as '<participant>_<emotion>/<participant>_<word>_<emotion>.wav'
        lis = base_name.split("_")
        participant = lis[0]
        emotion = lis[1]
        word = lis[2]
        audio_file = "_".join([participant, word, emotion]) + ".wav"
        audio_folder = "_".join([participant, emotion])
        # use os.path.join for the subfolder as well: the previous hard-coded
        # "\\" separator only worked on Windows
        input_data = read(os.path.join(audio_path, audio_folder, audio_file))
        if len(input_data[1]) == 0:
            # marker for empty audio files (kept from the original behaviour)
            print("$")
        # sampling frequency
        fs = input_data[0]
        # window and frameshift size in samples
        window_size = int(fs * params[2])
        frame_shift = int(fs * params[3])
        # create features (implemented are raw audio, the frequency spectrum,
        # fbanks and mfcc's)
        if params[4] == 'raw':
            [features, energy] = raw_frames(input_data, frame_shift, window_size)
        elif params[4] == 'freq_spectrum':
            [frames, energy] = raw_frames(input_data, frame_shift, window_size)
            features = get_freqspectrum(frames, params[0], fs, window_size)
        elif params[4] == 'fbanks':
            [frames, energy] = raw_frames(input_data, frame_shift, window_size)
            freq_spectrum = get_freqspectrum(frames, params[0], fs, window_size)
            features = get_fbanks(freq_spectrum, params[1], fs)
        elif params[4] == 'mfcc':
            [frames, energy] = raw_frames(input_data, frame_shift, window_size)
            freq_spectrum = get_freqspectrum(frames, params[0], fs, window_size)
            fbanks = get_fbanks(freq_spectrum, params[1], fs)
            features = get_mfcc(fbanks)
        else:
            # fail fast with a clear message instead of a NameError on the
            # undefined 'features' further down
            raise ValueError('unknown feature type: ' + str(params[4]))
        # optionally add the frame energy
        if params[7]:
            features = numpy.concatenate([energy[:, None], features], 1)
        # optionally add the deltas and double deltas
        if params[6]:
            single_delta = delta(features, 2)
            double_delta = delta(single_delta, 2)
            features = numpy.concatenate([features, single_delta, double_delta], 1)
        # create new leaf node in the feature group for the current audio file
        feature_shape = numpy.shape(features)[1]  # e.g. 39 for mfcc+energy+deltas
        f_table = output_file.create_earray(audio_node,
                                            append_name + base_name,
                                            f_atom,
                                            (0, feature_shape),
                                            expectedrows=5000)
        # append new data to the tables
        f_table.append(features)
        if audio_node._f_list_nodes() == []:
            # keep track of all the invalid nodes for which no features could be made
            invalid.append(node._v_name)
            # remove the top node including all other features if no caption
            # features could be created
            output_file.remove_node(node, recursive=True)
    print(invalid)
    return
# Fragment from a larger scope: 'fs', 'prime_data', 'target_data', 'params'
# and 'base' are defined outside the visible chunk.
# window size in samples for the configured window duration
window_size = int(fs * params['t_window'])
# find the smallest power of two that is >= the window size
# (fft_size is not used in this visible span — presumably consumed later;
# TODO confirm against the rest of the file)
exp = 1
while True:
    if np.power(2, exp) - window_size >= 0:
        fft_size = np.power(2, exp)
        break
    else:
        exp += 1
# window and frame shift size in samples (window_size recomputed to the
# same value as above)
window_size = int(fs * params['t_window'])
frame_shift = int(fs * params['t_shift'])
# MFCC pipeline for the prime signal: frames -> frequency spectrum ->
# filter banks -> mfcc, with the frame energy prepended as first column
[frames, energy] = raw_frames(prime_data, frame_shift, window_size)
freq_spectrum = get_freqspectrum(frames, params['alpha'], fs, window_size)
fbanks = get_fbanks(freq_spectrum, params['n_filters'], fs)
prime_features = get_mfcc(fbanks)
prime_features = np.concatenate([energy[:, None], prime_features], 1)
# same MFCC pipeline for the target signal
[frames, energy] = raw_frames(target_data, frame_shift, window_size)
freq_spectrum = get_freqspectrum(frames, params['alpha'], fs, window_size)
fbanks = get_fbanks(freq_spectrum, params['n_filters'], fs)
target_features = get_mfcc(fbanks)
target_features = np.concatenate([energy[:, None], target_features], 1)
# append deltas and double deltas to the prime features
single_delta = base.delta(prime_features, params['delta_n'])
double_delta = base.delta(single_delta, params['delta_n'])
prime_features = np.concatenate(
    [prime_features, single_delta, double_delta], 1)