Beispiel #1
0
    def get_features(self, y, sample_rate):
        """Extract bottleneck features, optionally stacked with pitch.

        Parameters
        ----------
        y : (n_samples, 1) numpy array
            Waveform
        sample_rate : int
            Sample rate

        Returns
        -------
        data : (n_frames, n_dimensions) numpy array
            Features. When `self.with_pitch` is false this is the data of
            the bottleneck features as returned by the processor. When it
            is true, it is the output of `self.concatenate_with_pitch`
            with one all-zero frame added before and after.
        """
        # Scale the audio signal between -1 and 1 before creating the
        # shennong audio object: pyannote normalizes the signal when it
        # uses "data augmentation" but not when loading the data without
        # it, so normalizing here makes both paths agree.
        peak = np.max((-np.min(y), np.max(y)))
        # An all-zero (silent) waveform has a zero peak; dividing by it
        # would turn the whole signal into NaN, so leave it untouched.
        if peak != 0:
            y = y / peak

        # create audio object for shennong
        audio = Audio(data=y, sample_rate=sample_rate)

        # The processor keeps its own default frame_length / frame_shift:
        # self.duration and self.step are not forwarded to it.
        processor = BottleneckProcessor(weights=self.weights)
        bottleneck = processor.process(audio)

        if not self.with_pitch:
            return bottleneck.data

        pitch = self.get_pitch(audio, self.pitchFmin, self.pitchFmax)

        # Stack bottleneck and pitch frames. Per the original note, Kaldi
        # sometimes adds one frame to the pitch; the helper is presumably
        # what reconciles the two lengths -- TODO confirm.
        stacked = self.concatenate_with_pitch(bottleneck.data, pitch.data)

        # Add 1 zero frame at the beginning and 1 at the end to ensure
        # that we have the same length as mfccs etc.
        return np.pad(stacked, ((1, 1), (0, 0)), mode='constant')
def transform_all_wavs(folder_wav, type, folder_out):
    """Extract bottleneck features for every .wav file found in a folder.

    Each entry of `folder_wav` whose name ends with ``.wav`` is loaded,
    processed, and its feature matrix is written with ``numpy.savetxt`` to
    ``<folder_out>/<name>.csv`` (per the original note, the saved matrix
    is time x dim). A progress counter is printed every 500 directory
    entries.

    Parameters
    ----------
    folder_wav : str
        Folder holding the .wav files; it is listed, not walked, so
        sub-folders are not visited.
    type : str
        Value forwarded as ``weights`` to ``BottleneckProcessor``. The
        name shadows the builtin but is kept so existing keyword callers
        keep working.
    folder_out : str
        Folder receiving the .csv files; created when it does not exist.
    """
    processor = BottleneckProcessor(weights=type)
    entries = os.listdir(folder_wav)

    # Make sure the destination exists before the first savetxt call
    # (an empty string means the current directory: nothing to create).
    if folder_out:
        os.makedirs(folder_out, exist_ok=True)

    for count, file_name in enumerate(entries):
        # Progress report: counts every directory entry, wav or not.
        if count % 500 == 0:
            print(count)
        if not file_name.endswith('.wav'):
            continue

        audio = Audio.load(os.path.join(folder_wav, file_name))
        features = processor.process(audio)

        stem = os.path.splitext(file_name)[0]
        # Use the public `data` accessor rather than the private `_data`.
        np.savetxt(fname=os.path.join(folder_out, stem + '.csv'),
                   X=features.data)
Beispiel #3
0
def test_process(capsys, audio, mfcc, weights):
    """Check the features and log output of a bottleneck extraction.

    The `audio` fixture is processed with the requested `weights`; the
    test then verifies the feature shape and timestamps (against the
    `mfcc` fixture), the processor's frame and sample-rate settings, and
    the messages written to stderr while processing.
    """
    get_logger(level='debug')

    processor = BottleneckProcessor(weights=weights)
    features = processor.process(audio)

    # shape and time axis of the extracted features
    assert features.shape == (140, 80)
    assert features.shape[1] == processor.ndims
    assert np.allclose(features.times, mfcc.times)

    # processor settings
    assert processor.frame_length == 0.025
    assert processor.frame_shift == 0.01
    assert processor.sample_rate == 8000

    # check the log messages
    stderr_text = capsys.readouterr().err
    assert 'resampling audio from 16000Hz@16b to 8000Hz@16b' in stderr_text

    # the number of detected speech frames depends on whether the audio
    # object reports a sox binary
    if audio._sox_binary:
        speech_frames = '118'
    else:
        speech_frames = '121'
    vad_message = '{} frames of speech detected (on 140 total frames)'.format(
        speech_frames)
    assert vad_message in stderr_text
Beispiel #4
0
    # divide the shortest distance by the length of the path
    average_distance = (distance_matrix[vector_1.shape[0]-1][vector_2.shape[0]-1]) \
                        / path_length
    return average_distance


# Maps utterance name (wav file name up to its first dot) to the feature
# data extracted from that file.
all_features = {}

# One processor is enough for every file; building it once avoids
# re-creating it for each .wav.
processor = BottleneckProcessor(weights='BabelMulti')

# get bottleneck features of all .wav files (stimuli)
for root, dirs, files in os.walk(WAV_FOLDER):
    for wav_file in files:
        if not wav_file.endswith(".wav"):
            continue
        # `root` carries no trailing separator for sub-folders (nor for the
        # top folder unless WAV_FOLDER was given with one), so the path
        # must be joined rather than concatenated.
        audio = Audio.load(os.path.join(root, wav_file))
        features = processor.process(audio)
        vectors = features.data
        utterance = wav_file.split('.')[0]
        all_features[utterance] = vectors

for row in distance_list.itertuples():
    row_index = getattr(row, 'Index')
    trip_id = getattr(row, 'tripletid')
    bottle_oth = all_features[trip_id + "_OTH"]
    bottle_tgt = all_features[trip_id + "_TGT"]
    bottle_x = all_features[trip_id + "_X"]

    eucl_oth_x = \
        calculate_distances_dtw(bottle_oth,\
                                bottle_x)
    eucl_tgt_x = \
Beispiel #5
0
import os
import sys

import numpy as np
import pandas as pd
import scipy.spatial
from shennong.audio import Audio
from shennong.features.processor.bottleneck import BottleneckProcessor


WAV_FOLDER = sys.argv[1]  # folder with the stimuli in .wav
OUT_NPZ_FILE = sys.argv[2]  # archive written by np.savez_compressed


# Maps each .wav file name to its loaded audio.
all_features = {}

# load all .wav files (stimuli) found under WAV_FOLDER
for root, dirs, files in os.walk(WAV_FOLDER):
    for wav_file in files:
        if wav_file.endswith(".wav"):
            # join instead of concatenating: `root` has no trailing
            # separator for sub-folders
            audio = Audio.load(os.path.join(root, wav_file))
            all_features[wav_file] = audio


processor = BottleneckProcessor(weights='BabelMulti')
# process() handles one audio signal at a time; a {name: audio} dict is
# processed with process_all()
features = processor.process_all(all_features)

# NOTE(review): `features` is passed positionally, so numpy stores it under
# the key 'arr_0'; a non-array object ends up pickled in the archive.
# Confirm this is the intended layout.
np.savez_compressed(OUT_NPZ_FILE, features)


# features.items()
# features['triplet001_OTH.wav']