Beispiel #1
0
def audio_features(params, img_audio, audio_path, append_name, node_list):
    """Create audio features for every node and store them in the output file.

    For each node in ``node_list`` a group named after the feature type is
    created, the matching ``.wav`` file is read from ``audio_path`` and the
    resulting feature matrix is stored as an extendable array inside that
    group. Nodes for which no features could be made (empty or unreadable
    audio) are removed from the output file and their names are printed once
    all nodes have been processed.

    Args:
        params: indexable collection of settings. As used here:
            ``params[0]`` is passed on to ``get_freqspectrum`` and
            ``params[1]`` to ``get_fbanks``; ``params[2]`` and ``params[3]``
            are the window size and frame shift, multiplied by the sampling
            frequency to get sizes in samples (so presumably in seconds);
            ``params[4]`` is the feature type (``'raw'``, ``'freq_spectrum'``,
            ``'fbanks'`` or ``'mfcc'``); ``params[5]`` is the output file the
            groups and arrays are created in; ``params[6]`` enables deltas and
            double deltas; ``params[7]`` enables the frame energy.
        img_audio: not used by this function; kept so existing callers keep
            working.
        audio_path: top folder the audio files are read from.
        append_name: string the node name is split on to obtain the base name
            of the audio file; also prefixed to the name of the created array.
        node_list: nodes of the output file to add audio features to.

    Raises:
        ValueError: if ``params[4]`` is not one of the implemented feature
            types.
    """
    output_file = params[5]
    feature_type = params[4]
    # create pytable atom for the features
    f_atom = tables.Float32Atom()
    # keep track of the nodes for which no features could be made, the places database
    # contains some empty audio files
    invalid = []
    for count, node in enumerate(node_list, start=1):
        print('processing file audio:' + str(count))
        # create a group for the desired feature type (e.g. a group called 'fbanks')
        audio_node = output_file.create_group(node, feature_type)
        # get the base name of the node this feature will be appended to
        base_name = node._v_name.split(append_name)[1]
        # get the caption file names corresponding to the image of this node
        caption_files = [base_name + '.wav']

        for cap in caption_files:
            # basename for the caption file: cut off the file extension, as dots aren't
            # allowed in pytables group names, and drop the subfolders the places database
            # spreads its audio files over.
            base_capt = cap.split('.')[0].split('/')[-1]
            wav_path = os.path.join(audio_path, cap)
            # read audio samples
            try:
                input_data = read(wav_path)
            except Exception:
                # try to repair the file, however some files in places are so broken that
                # they cannot be read at all. Skip those; the node is removed below if no
                # caption features could be made for it. Only Exception is caught so that
                # KeyboardInterrupt/SystemExit still stop the script.
                try:
                    fix_wav(wav_path)
                    input_data = read(wav_path)
                except Exception:
                    continue
            # in the places database some of the audio files are empty. Nothing can be done
            # with an empty file, so move on to the next caption (this also covers files
            # that only became readable after the repair).
            if len(input_data[1]) == 0:
                continue
            # sampling frequency
            fs = input_data[0]
            # get window and frameshift size in samples
            window_size = int(fs * params[2])
            frame_shift = int(fs * params[3])

            # create features (implemented are raw audio, the frequency spectrum, fbanks and
            # mfcc's)
            if feature_type == 'raw':
                features, energy = raw_frames(input_data, frame_shift, window_size)

            elif feature_type == 'freq_spectrum':
                frames, energy = raw_frames(input_data, frame_shift, window_size)
                features = get_freqspectrum(frames, params[0], fs, window_size)

            elif feature_type == 'fbanks':
                frames, energy = raw_frames(input_data, frame_shift, window_size)
                freq_spectrum = get_freqspectrum(frames, params[0], fs, window_size)
                features = get_fbanks(freq_spectrum, params[1], fs)

            elif feature_type == 'mfcc':
                frames, energy = raw_frames(input_data, frame_shift, window_size)
                freq_spectrum = get_freqspectrum(frames, params[0], fs, window_size)
                fbanks = get_fbanks(freq_spectrum, params[1], fs)
                features = get_mfcc(fbanks)

            else:
                # fail with a clear message instead of a NameError on `features`
                raise ValueError('unsupported feature type: ' + repr(feature_type))

            # optionally add the frame energy
            if params[7]:
                features = numpy.concatenate([energy[:, None], features], 1)
            # optionally add the deltas and double deltas
            if params[6]:
                single_delta = delta(features, 2)
                double_delta = delta(single_delta, 2)
                features = numpy.concatenate([features, single_delta, double_delta], 1)

            # create new leaf node in the feature node for the current audio file
            feature_shape = numpy.shape(features)[1]
            f_table = output_file.create_earray(audio_node, append_name + base_capt, f_atom,
                                                (0, feature_shape), expectedrows=5000)

            # append new data to the tables
            f_table.append(features)
        if not audio_node._f_list_nodes():
            # keep track of all the invalid nodes for which no features could be made
            invalid.append(node._v_name)
            # remove the top node including all other features if no captions features could
            # be created
            output_file.remove_node(node, recursive=True)
    print(invalid)
def audio_features(params, audio_path, append_name, node_list):
    """Create audio features for every node and store them in the output file.

    For each node in ``node_list`` a group named after the feature type is
    created, the matching ``.wav`` file is read and the resulting feature
    matrix is stored as an extendable array inside that group. Nodes whose
    audio file contains no samples are removed from the output file and their
    names are printed once all nodes have been processed.

    The part of the node name following ``append_name`` is expected to look
    like ``<participant>_<emotion>_<word>``; the audio file is then read from
    ``<audio_path>/<participant>_<emotion>/<participant>_<word>_<emotion>.wav``.

    Args:
        params: indexable collection of settings. As used here:
            ``params[0]`` is passed on to ``get_freqspectrum`` and
            ``params[1]`` to ``get_fbanks``; ``params[2]`` and ``params[3]``
            are the window size and frame shift, multiplied by the sampling
            frequency to get sizes in samples (so presumably in seconds);
            ``params[4]`` is the feature type (``'raw'``, ``'freq_spectrum'``,
            ``'fbanks'`` or ``'mfcc'``); ``params[5]`` is the output file the
            groups and arrays are created in; ``params[6]`` enables deltas and
            double deltas; ``params[7]`` enables the frame energy.
        audio_path: top folder the audio files are read from.
        append_name: string the node name is split on to obtain the base
            name; also prefixed to the name of the created array.
        node_list: nodes of the output file to add audio features to.

    Raises:
        ValueError: if ``params[4]`` is not one of the implemented feature
            types.
    """
    output_file = params[5]
    feature_type = params[4]
    # create pytable atom for the features
    f_atom = tables.Float32Atom()
    # keep track of the nodes for which no features could be made, basically empty audio files
    invalid = []
    for count, node in enumerate(node_list, start=1):
        print('processing file:' + str(count))
        # create a group for the desired feature type (e.g. a group called 'fbanks')
        audio_node = output_file.create_group(node, feature_type)
        # get the base name of the node this feature will be appended to
        base_name = node._v_name.split(append_name)[1]

        # the base name is ordered participant_emotion_word, the audio file on disk
        # is named participant_word_emotion and lives in a participant_emotion folder
        name_parts = base_name.split("_")
        participant = name_parts[0]
        emotion = name_parts[1]
        word = name_parts[2]

        audio_file = "_".join([participant, word, emotion]) + ".wav"
        audio_folder = "_".join([participant, emotion])

        # let os.path.join pick the separator so the path also works outside Windows
        input_data = read(os.path.join(audio_path, audio_folder, audio_file))

        if len(input_data[1]) == 0:
            print("$")
            # no features can be made from an empty audio file: keep track of the
            # invalid node and remove it, including all its other features
            invalid.append(node._v_name)
            output_file.remove_node(node, recursive=True)
            continue

        # sampling frequency
        fs = input_data[0]
        # get window and frameshift size in samples
        window_size = int(fs * params[2])
        frame_shift = int(fs * params[3])

        # create features (implemented are raw audio, the frequency spectrum, fbanks and
        # mfcc's)
        if feature_type == 'raw':
            features, energy = raw_frames(input_data, frame_shift, window_size)

        elif feature_type == 'freq_spectrum':
            frames, energy = raw_frames(input_data, frame_shift, window_size)
            features = get_freqspectrum(frames, params[0], fs, window_size)

        elif feature_type == 'fbanks':
            frames, energy = raw_frames(input_data, frame_shift, window_size)
            freq_spectrum = get_freqspectrum(frames, params[0], fs, window_size)
            features = get_fbanks(freq_spectrum, params[1], fs)

        elif feature_type == 'mfcc':
            frames, energy = raw_frames(input_data, frame_shift, window_size)
            freq_spectrum = get_freqspectrum(frames, params[0], fs, window_size)
            fbanks = get_fbanks(freq_spectrum, params[1], fs)
            features = get_mfcc(fbanks)

        else:
            # fail with a clear message instead of a NameError on `features`
            raise ValueError('unsupported feature type: ' + repr(feature_type))

        # optionally add the frame energy
        if params[7]:
            features = numpy.concatenate([energy[:, None], features], 1)
        # optionally add the deltas and double deltas
        if params[6]:
            single_delta = delta(features, 2)
            double_delta = delta(single_delta, 2)
            features = numpy.concatenate([features, single_delta, double_delta], 1)

        # create new leaf node in the feature node for the current audio file
        feature_shape = numpy.shape(features)[1]
        f_table = output_file.create_earray(audio_node, append_name + base_name, f_atom,
                                            (0, feature_shape), expectedrows=5000)

        # append new data to the tables
        f_table.append(features)

    print(invalid)
    # nothing is returned; the results live in the output file
    return
    window_size = int(fs * params['t_window'])
    exp = 1
    while True:
        if np.power(2, exp) - window_size >= 0:
            fft_size = np.power(2, exp)
            break
        else:
            exp += 1

    window_size = int(fs * params['t_window'])
    frame_shift = int(fs * params['t_shift'])

    [frames, energy] = raw_frames(prime_data, frame_shift, window_size)
    freq_spectrum = get_freqspectrum(frames, params['alpha'], fs, window_size)
    fbanks = get_fbanks(freq_spectrum, params['n_filters'], fs)
    prime_features = get_mfcc(fbanks)

    prime_features = np.concatenate([energy[:, None], prime_features], 1)

    [frames, energy] = raw_frames(target_data, frame_shift, window_size)
    freq_spectrum = get_freqspectrum(frames, params['alpha'], fs, window_size)
    fbanks = get_fbanks(freq_spectrum, params['n_filters'], fs)
    target_features = get_mfcc(fbanks)

    target_features = np.concatenate([energy[:, None], target_features], 1)

    single_delta = base.delta(prime_features, params['delta_n'])
    double_delta = base.delta(single_delta, params['delta_n'])
    prime_features = np.concatenate(
        [prime_features, single_delta, double_delta], 1)