Example #1
def convert_inputs_to_ctc_format(audio, fs, target_text):
    #print('convert_inputs_to_ctc_format target_text:' + target_text)
    inputs = mfcc(audio, samplerate=fs, numcep=num_features)
    # Transform into a 3D array
    train_inputs = np.asarray(inputs[np.newaxis, :])
    train_inputs = (train_inputs - np.mean(train_inputs)) / np.std(train_inputs)
    train_seq_len = [train_inputs.shape[1]]

    # Keep only lowercase words and strip punctuation
    original = ' '.join(target_text.strip().lower().split(' ')).\
               replace('.', '').\
               replace('?', '').\
               replace(',', '').\
               replace("'", '').\
               replace('!', '').\
               replace('-', '')
    #print('original:' + original)
    targets = original.replace(' ', '  ')
    targets = targets.split(' ')

    # Adding blank label
    targets = np.hstack([SPACE_TOKEN if x == '' else list(x) for x in targets])

    # Transform char into index
    targets = np.asarray([SPACE_INDEX if x == SPACE_TOKEN else ord(x) - FIRST_INDEX
                          for x in targets])

    # Creating sparse representation to feed the placeholder
    train_targets = sparse_tuple_from([targets])

    return train_inputs, train_targets, train_seq_len, original
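The example calls a sparse_tuple_from helper that is not shown above (it also assumes module-level constants such as num_features, SPACE_TOKEN, SPACE_INDEX and FIRST_INDEX). A minimal sketch of what such a helper typically looks like in CTC/TensorFlow preprocessing code, given as an assumption rather than the original implementation:

import numpy as np

def sparse_tuple_from(sequences, dtype=np.int32):
    # Build the (indices, values, shape) triple expected by a tf.SparseTensor
    # placeholder from a list of integer label sequences.
    indices, values = [], []
    for n, seq in enumerate(sequences):
        indices.extend(zip([n] * len(seq), range(len(seq))))
        values.extend(seq)
    indices = np.asarray(indices, dtype=np.int64)
    values = np.asarray(values, dtype=dtype)
    shape = np.asarray([len(sequences), indices[:, 1].max() + 1], dtype=np.int64)
    return indices, values, shape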
Example #2
def audiofile_to_input_vector(audio_filename, n_input, n_context):
    """
    Convert an audio file into MFCC features with symmetric context frames.
    :param audio_filename: path to the wav file
    :param n_input: number of MFCC coefficients per frame
    :param n_context: number of context frames on each side of the current frame
    :return: normalized feature matrix
    """
    # Load the wav file
    fs, audio = wav.read(audio_filename)

    # Compute the MFCC coefficients
    orig_inputs = mfcc(audio, samplerate=fs, numcep=n_input)
    # print(np.shape(orig_inputs))  #(277, 26)
    orig_inputs = orig_inputs[::2]  # (139, 26) keep every other frame

    # train_inputs = np.array([], np.float32)
    # print(orig_inputs.shape[0])
    train_inputs = np.zeros((orig_inputs.shape[0], n_input + 2 * n_input * n_context))
    # print(np.shape(train_inputs))#)(139, 494)
    # empty_mfcc = np.array([])
    empty_mfcc = np.zeros((n_input))

    # Build each input row from three parts concatenated in order: the previous
    # n_context frames, the current frame, and the next n_context frames.
    time_slices = range(train_inputs.shape[0])  # 139 time slices
    context_past_min = time_slices[0] + n_context
    context_future_max = time_slices[-1] - n_context  # last index that still has n_context future frames (129 here)
    for time_slice in time_slices:
        # Pad the past context with zeros where fewer than n_context frames exist
        need_empty_past = max(0, (context_past_min - time_slice))
        empty_source_past = list(empty_mfcc for empty_slots in range(need_empty_past))
        data_source_past = orig_inputs[max(0, time_slice - n_context):time_slice]

        # Pad the future context with zeros where fewer than n_context frames exist
        need_empty_future = max(0, (time_slice - context_future_max))
        empty_source_future = list(empty_mfcc for empty_slots in range(need_empty_future))
        data_source_future = orig_inputs[time_slice + 1:time_slice + n_context + 1]

        if need_empty_past:
            past = np.concatenate((empty_source_past, data_source_past))
        else:
            past = data_source_past

        if need_empty_future:
            future = np.concatenate((data_source_future, empty_source_future))
        else:
            future = data_source_future

        past = np.reshape(past, n_context * n_input)
        now = orig_inputs[time_slice]
        future = np.reshape(future, n_context * n_input)
        # 234, 26, 234
        # train_data = np.concatenate((past, now, future));
        train_inputs[time_slice] = np.concatenate((past, now, future))

    # Standardize the data: subtract the mean and divide by the standard deviation
    train_inputs = (train_inputs - np.mean(train_inputs)) / np.std(train_inputs)
    return train_inputs
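A minimal usage sketch for the function above, assuming the snippet's implied imports (numpy as np, scipy.io.wavfile as wav, python_speech_features.mfcc) and a placeholder file name:

# 26 cepstral coefficients with 9 context frames on each side, matching the
# shape comments above: 26 + 2 * 26 * 9 == 494 values per time step.
train_inputs = audiofile_to_input_vector('sample.wav', n_input=26, n_context=9)
print(train_inputs.shape)  # (num_down_sampled_frames, 494)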
    def featurize(self, audio_clip):
        """ For a given audio clip, calculate the corresponding feature
        Params:
            audio_clip (str): Path to the audio clip
        """
        if self.spectrogram:
            return spectrogram_from_file(
                audio_clip, step=self.step, window=self.window,
                max_freq=self.max_freq)
        else:
            (rate, sig) = wav.read(audio_clip)
            return mfcc(sig, rate, numcep=self.mfcc_dim)
Example #4
def getMEL(data, n_mfcc = 12, invCoeffOrder = False, winsize = 20, frames = 64):
    """ Function that goes through all samples of each syllable and extracts the
        mfccs for the 12 mel frequency channels.

    :param data: list of syllables with sample data
    :param n_mfcc: number of mel frequency cepstral coefficients to return (default = 12)
    :param invCoeffOrder: if True, extract last n mfcc instead of first n (default = False)
    :param winsize: size of the time window (ms) used for mfcc extraction
    :param frames: desired number of time frames in final mfcc data

    :returns syllables: list with mfccs for n_mfcc mel channels for each sample of each syllable
    """

    syllables = []
    i = 0
    for syllable in data:
        samples = []
        for sample in syllable:
            W = winsize/1000. * sample[1]
            winstep = (np.round(1 + (len(sample[0]) - W) / (frames - 1))) / float(sample[1])
            i += 1
            if invCoeffOrder:
                samples.append(
                    mfcc(sample[0],
                         samplerate = sample[1],
                         winlen = winsize/1000.,
                         winstep = winstep,
                         numcep = n_mfcc
                         )[:,-n_mfcc::])
            else:
                samples.append(
                    mfcc(sample[0],
                         samplerate = sample[1],
                         winlen = winsize/1000.,
                         winstep = winstep,
                         numcep = n_mfcc + 1
                         )[:,1::])
        syllables.append(samples)
    return syllables
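A small usage sketch illustrating the input layout the loop above expects (each sample is a (signal, samplerate) pair grouped per syllable); the synthetic data below is an assumption for illustration only:

import numpy as np

syllable_a = [(np.random.randn(16000), 16000), (np.random.randn(12000), 16000)]
syllable_b = [(np.random.randn(8000), 16000)]
mels = getMEL([syllable_a, syllable_b], n_mfcc=12, winsize=20, frames=64)
print(len(mels), len(mels[0]), mels[0][0].shape)  # 2 syllables, 2 samples, (~64, 12)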
Example #5
def audiofile_to_input_vector(audio_filename, numcep, numcontext):
    r"""
    Given a WAV audio file at ``audio_filename``, calculates ``numcep`` MFCC features
    at every 0.02s time step with a window length of 0.032s (Hamming window). Prepends
    and appends ``numcontext`` empty context frames to the feature matrix, and returns
    this data in a numpy array.
    """
    # Load wav files
    fs, audio = wav.read(audio_filename)

    # Get mfcc coefficients
    features = mfcc(audio, samplerate=fs, numcep=numcep, winlen=0.032, winstep=0.02, winfunc=np.hamming)

    # Add empty initial and final contexts
    empty_context = np.zeros((numcontext, numcep), dtype=features.dtype)
    features = np.concatenate((empty_context, features, empty_context))

    return features
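A minimal usage sketch for the function above (the file name is a placeholder):

# 26 MFCCs per 0.02 s step; 9 empty context frames are padded on each side,
# so the output has len(features) + 2 * 9 rows of 26 values each.
features = audiofile_to_input_vector('sample.wav', numcep=26, numcontext=9)
print(features.shape)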
def extract_features_and_targets(wav_file, txt_file):
    """
    Extract MFCC features from an audio file and target character annotations from
    a corresponding text transcription
    Args:
        wav_file: audio wav file
        txt_file: text file with transcription
    Returns:
        features, targets, sequence length, original text transcription
    """

    fs, audio = wav.read(wav_file)

    features = mfcc(audio, samplerate=fs, lowfreq=50)

    mean_scale = np.mean(features, axis=0)
    std_scale = np.std(features, axis=0)

    features = (features - mean_scale[np.newaxis, :]) / std_scale[np.newaxis, :]

    seq_len = features.shape[0]

    # Read the targets
    with open(txt_file, 'r') as f:
        for line in f.readlines():
            if line[0] == ';':
                continue

            # Keep only lowercase words and strip punctuation
            original = ' '.join(line.strip().lower().split(' ')).replace('.', '').replace("'", '').replace('-', '').replace(',','')
            targets = original.replace(' ', '  ')
            targets = targets.split(' ')

    # Adding blank label
    targets = np.hstack([Space_Token if x == '' else list(x) for x in targets])

    # Transform char into index
    targets = np.asarray([Space_Index if x == Space_Token else ord(x) - Index_Start
                          for x in targets])

    # shape (None, num_steps, num_features)
    features = np.asarray(features[np.newaxis, :])

    return features, targets, seq_len, original
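The snippet above references Space_Token, Space_Index and Index_Start without defining them. Typical definitions in comparable CTC preprocessing code look like the following (an assumption, not the original constants):

Space_Token = '<space>'
Space_Index = 0                # index reserved for the space character
Index_Start = ord('a') - 1     # so that 'a' maps to 1, 'b' to 2, ...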
Example #7
def extract_cepstrum(df, rate, mfcc_start=0, mfcc_end=-1, winlen = 0.025, winstep = 0.01,
                    numcep = 16, nfilt = 32, nfft=512, lowfreq = 400, highfreq = 12000, noise = None):
    '''
        Extracts the cepstrum features from the raw signal data 

            df : a dataframe where the indices are the timepoint for each 
                supposed key press 

            rate : The sample rate at which the sound file was read, e.g. by a call
                to spl.open_audio() or scipy.io.wavfile.read()

            mfcc_start/mfcc_end : indices to slice the feature vector

            remainder of args are passed into the mfcc function
    '''
    rate = float(rate)
    # Convert raw signal into list of numpy arrays 
    char_data = df[df.columns[list(df.columns).index('0'):]].values
    if noise:
        char_data += np.random.normal(0, noise, char_data.shape) 
    keypress_sigs = [np.nan_to_num(np.squeeze(l)) for l in np.split(char_data, char_data.shape[0], axis=0)]
    
    # Create the keypress features one by one
    keypress_feats = []
    for keypress_sig in keypress_sigs:

        mfcc_feat = mfcc(keypress_sig, samplerate= rate, winlen=winlen, 
                         winstep=winstep, numcep=numcep, nfilt=nfilt, 
                         lowfreq=lowfreq, nfft = nfft, highfreq=highfreq)
        keypress_feats.append(np.concatenate(mfcc_feat[mfcc_start:mfcc_end, :]).T)

    # Create cepstrum dataframe
    cepstrum_df = pd.DataFrame(np.vstack(keypress_feats))

    # Copy over true char labels
    if 'char' in cepstrum_df:
        cepstrum_df['char'] = df['char']

    # Put the char labels at the front
    cepstrum_df = cepstrum_df.reindex(columns = [cepstrum_df.columns[-1]] + list(cepstrum_df.columns[:-1]))

    return cepstrum_df
Example #8
    def audioToInputVector(audio, fs, numcep, numcontext):
        if DeprecationWarning.displayed is not True:
            DeprecationWarning.displayed = True
            print('------------------------------------------------------------------------', file=sys.stderr)
            print('WARNING: libdeepspeech failed to load, resorting to deprecated code',      file=sys.stderr)
            print('         Refer to README.md for instructions on installing libdeepspeech', file=sys.stderr)
            print('------------------------------------------------------------------------', file=sys.stderr)

        # Get mfcc coefficients
        features = mfcc(audio, samplerate=fs, numcep=numcep)

        # We only keep every second feature (BiRNN stride = 2)
        features = features[::2]

        # One stride per time step in the input
        num_strides = len(features)

        # Add empty initial and final contexts
        empty_context = np.zeros((numcontext, numcep), dtype=features.dtype)
        features = np.concatenate((empty_context, features, empty_context))

        # Create a view into the array with overlapping strides of size
        # numcontext (past) + 1 (present) + numcontext (future)
        window_size = 2*numcontext+1
        train_inputs = np.lib.stride_tricks.as_strided(
            features,
            (num_strides, window_size, numcep),
            (features.strides[0], features.strides[0], features.strides[1]),
            writeable=False)

        # Flatten the second and third dimensions
        train_inputs = np.reshape(train_inputs, [num_strides, -1])

        # Whiten inputs (TODO: Should we whiten?)
        # Copy the strided array so that we can write to it safely
        train_inputs = np.copy(train_inputs)
        train_inputs = (train_inputs - np.mean(train_inputs))/np.std(train_inputs)

        # Return results
        return train_inputs
def input_preprocess():
    
    ###audio_filename = maybe_download('LDC93S1.wav', 93638)
    ###target_filename = maybe_download('LDC93S1.txt', 62)

    fs, audio = wav.read(audio_filename)
    
    inputs = mfcc(audio, samplerate=fs)
    # Transform into a 3D array
    train_inputs = np.asarray(inputs[np.newaxis, :])
    train_inputs = (train_inputs - np.mean(train_inputs))/np.std(train_inputs)
    train_seq_len = [train_inputs.shape[1]]
    num_examples = 1
    
    with open(target_filename, 'r') as f:
        # Only the last line is necessary
        line = f.readlines()[-1]
        ## global original[num_examples]
        # Keep only the words and strip the period
        ###original = ' '.join(line.strip().lower().split(' ')[2:]).replace('.', '')
        targets = original.replace(' ', '  ')
        targets = targets.split(' ')

    # Adding blank label
    targets = np.hstack([Space_Token if x == '' else list(x) for x in targets])

    # Transform char into index
    targets = np.asarray([Space_Index if x == Space_Token else ord(x) - Index_Start
                      for x in targets])

    # Creating sparse representation to feed the placeholder
    train_targets = sparse_tuple_from([targets])

    # We don't have a validation dataset :(
    val_inputs, val_targets, val_seq_len = train_inputs, train_targets, \
                                       train_seq_len

    return inputs, train_inputs, train_targets, train_seq_len
Example #10
        data_lines = f.readlines()
        data_num = len(data_lines)
        training_num = int(data_num * TRAIN_FRACTION)
    with open(TEXT_FILE, 'r') as f:
        for i, line in enumerate(f):
            parts = line.split(',')

            # Get just the filename part of the audio file
            audio_file = parts[0]
            last_slash = audio_file.rfind('/') + 1
            audio_file = audio_file[last_slash:-4]

            text = parts[1]

            rate, signal = wav.read(AUDIO_PATH + "/" + audio_file + ".wav")
            mfcc_feat = mfcc(signal, rate, numcep=26)
            mfcc_feat = mfcc_feat.transpose()
            characters = np.array([])
            for character in text:
                characters = np.append(characters, ALPHABET.index(character))

            if i < training_num:
                np.save(TRAIN_INPUT_PATH + "/" + audio_file, mfcc_feat)
                np.save(TRAIN_TARGET_PATH + "/" + audio_file, characters)
            else:
                np.save(TEST_INPUT_PATH + "/" + audio_file, mfcc_feat)
                np.save(TEST_TARGET_PATH + "/" + audio_file, characters)

    inputs, outputs = load_batched_data(TRAIN_INPUT_PATH, TRAIN_TARGET_PATH)

    # 8 input samples:
Example #11
def extract_features(audio, rate):
    mfcc_feat = mfcc.mfcc(audio, rate, 0.025, 0.01, 20, appendEnergy=True)
    mfcc_feat = preprocessing.scale(mfcc_feat)
    delta = calculate_delta(mfcc_feat)
    combined = np.hstack((mfcc_feat, delta))
    return combined
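calculate_delta is not shown in this example. A minimal sketch of the implementation commonly paired with this feature extractor (an assumption, not the original helper), computing first-order deltas over a two-frame window:

import numpy as np

def calculate_delta(array):
    # Delta of each frame from its +/-1 and +/-2 neighbours, clamped at the edges.
    rows, cols = array.shape
    deltas = np.zeros((rows, cols))
    for i in range(rows):
        index = []
        j = 1
        while j <= 2:
            first = max(i - j, 0)
            second = min(i + j, rows - 1)
            index.append((second, first))
            j += 1
        deltas[i] = (array[index[0][0]] - array[index[0][1]] +
                     2 * (array[index[1][0]] - array[index[1][1]])) / 10
    return deltas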
Example #12
def readwav(audio_filename, n_input=26):
    # Read the audio file
    fs, audio = wav.read(audio_filename)
    # Extract the MFCC features
    orig_inputs = mfcc(audio, samplerate=fs, numcep=26)
Example #13
def get_speech_features(signal,
                        sample_freq,
                        num_features,
                        pad_to=8,
                        features_type='spectrogram',
                        window_size=20e-3,
                        window_stride=10e-3,
                        augmentation=None):
    """Function to convert raw audio signal to numpy array of features.

  Args:
    signal (np.array): np.array containing raw audio signal.
    sample_freq (float): frames per second.
    num_features (int): number of speech features in frequency domain.
    pad_to (int): if specified, the length will be padded to become divisible
        by ``pad_to`` parameter.
    features_type (string): 'mfcc' or 'spectrogram'.
    window_size (float): size of analysis window in seconds (e.g. 20e-3 for 20 ms).
    window_stride (float): stride of analysis window in seconds (e.g. 10e-3 for 10 ms).
    augmentation (dict, optional): dictionary of augmentation parameters. See
        :func:`get_speech_features_from_file` for specification and example.

  Returns:
    np.array: np.array of audio features with shape=[num_time_steps,
    num_features].
    audio_duration (float): duration of the signal in seconds
  """
    if augmentation is not None:
        if 'time_stretch_ratio' not in augmentation:
            raise ValueError(
                'time_stretch_ratio has to be included in augmentation '
                'when augmentation is not None')
        if 'noise_level_min' not in augmentation:
            raise ValueError(
                'noise_level_min has to be included in augmentation '
                'when augmentation is not None')
        if 'noise_level_max' not in augmentation:
            raise ValueError(
                'noise_level_max has to be included in augmentation '
                'when augmentation is not None')
        signal = augment_audio_signal(signal, sample_freq, augmentation)
    else:
        signal = (normalize_signal(signal.astype(np.float32)) *
                  32767.0).astype(np.int16)

    audio_duration = len(signal) * 1.0 / sample_freq

    n_window_size = int(sample_freq * window_size)
    n_window_stride = int(sample_freq * window_stride)

    # making sure length of the audio is divisible by 8 (fp16 optimization)
    length = 1 + int(
        math.ceil((1.0 * signal.shape[0] - n_window_size) / n_window_stride))
    if pad_to > 0:
        if length % pad_to != 0:
            pad_size = (pad_to - length % pad_to) * n_window_stride
            signal = np.pad(signal, (0, pad_size), mode='constant')

    if features_type == 'spectrogram':
        frames = psf.sigproc.framesig(sig=signal,
                                      frame_len=n_window_size,
                                      frame_step=n_window_stride,
                                      winfunc=np.hanning)

        # features = np.log1p(psf.sigproc.powspec(frames, NFFT=N_window_size))
        features = psf.sigproc.logpowspec(frames, NFFT=n_window_size)
        assert num_features <= n_window_size // 2 + 1, \
          "num_features for spectrogram should be <= (sample_freq * window_size // 2 + 1)"

        # cut high frequency part
        features = features[:, :num_features]

    elif features_type == 'mfcc':
        features = psf.mfcc(signal=signal,
                            samplerate=sample_freq,
                            winlen=window_size,
                            winstep=window_stride,
                            numcep=num_features,
                            nfilt=2 * num_features,
                            nfft=512,
                            lowfreq=0,
                            highfreq=None,
                            preemph=0.97,
                            ceplifter=2 * num_features,
                            appendEnergy=False)

    elif features_type == 'logfbank':
        features = psf.logfbank(signal=signal,
                                samplerate=sample_freq,
                                winlen=window_size,
                                winstep=window_stride,
                                nfilt=num_features,
                                nfft=512,
                                lowfreq=0,
                                highfreq=sample_freq / 2,
                                preemph=0.97)

    else:
        raise ValueError('Unknown features type: {}'.format(features_type))

    if pad_to > 0:
        assert features.shape[0] % pad_to == 0
    mean = np.mean(features)
    std_dev = np.std(features)
    features = (features - mean) / std_dev
    return features, audio_duration
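A minimal usage sketch for the function above, reading a placeholder wav file with scipy (an assumption about how the caller obtains the signal):

import scipy.io.wavfile as wav

fs, signal = wav.read('sample.wav')
features, duration = get_speech_features(signal, float(fs), num_features=64,
                                          features_type='logfbank')
print(features.shape, duration)  # (num_time_steps, 64), length in seconds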
Example #14
x= len(data)
p = 25000-x
l =0
tests = np.empty([200,4043])
new_data = np.empty([25000,])
y1 = np.empty([25000,])	

y = p // 2

for i in range(0, y):
    new_data[i] = y1[i]
for i in range(y, x + y):
    new_data[i] = data[i - y]
for i in range(x + y, 25000):
    new_data[i] = y1[i]
data1 = mfcc(new_data,samplerate)
data = data1
data = data.reshape(4043,)

nIn = 4043
nOut = 5
x = data


def sigmoid(x):
    x = np.array(x, dtype=np.float128)
    x = x.reshape(nOut, 1)
    for i in range(0, 5):
        if x[i] < -700:
            x[i] = 0
Example #15
def extract_mfcc(sound):
    (rate,sig) = wav.read(StringIO.StringIO(sound))
    mfcc_feat = features.mfcc(sig,rate)
    return numpy.asarray(mfcc_feat, dtype='float32')
Example #16
#!/usr/bin/env python

from python_speech_features import mfcc
from python_speech_features import delta
from python_speech_features import logfbank
import scipy.io.wavfile as wav

(rate,sig) = wav.read("english.wav")
mfcc_feat = mfcc(sig,rate)
d_mfcc_feat = delta(mfcc_feat, 2)
fbank_feat = logfbank(sig,rate)

print(fbank_feat[1:3,:])
Example #17
"""
demo04_mfcc.py  MFCC matrix
"""
import scipy.io.wavfile as wf
import python_speech_features as sf
import matplotlib.pyplot as mp

sample_rate, sigs = wf.read('../ml_data/speeches/training/banana/banana01.wav')
mfcc = sf.mfcc(sigs, sample_rate)

mp.matshow(mfcc.T, cmap='gist_rainbow')
mp.show()
Example #18
def pinzhen(file_path, n_input, n_context):
    wav_data, fs = read_wav_data(file_path)
    origin_inputs = mfcc(wav_data, samplerate=fs, numcep=n_input)
    '''
       Frame skipping could be used here to select the needed features;
    '''
    # print(origin_inputs)
    '''
       Frame skipping: take every other frame, one column of features per step;
    '''
    # origin_inputs = origin_inputs[::2]
    # print(origin_inputs)
    '''
       Initialize the dimensionality of the final extracted features;
    '''
    train_inputs = np.zeros(shape=(origin_inputs.shape[0],
                                   n_input + 2 * n_input * n_context))
    '''
       Initialize the empty MFCC frame used for padding;
    '''
    empty_mfcc = np.zeros((n_input))

    time_slices = range(train_inputs.shape[0])
    '''
       Set the first and last positions that need past/future context padding;
    '''
    context_past_min = time_slices[0] + n_context
    context_future_max = time_slices[-1] - n_context

    for time_slice in time_slices:
        need_empty_past = max(0, (context_past_min - time_slice))
        empty_source_past = list(empty_mfcc
                                 for empty_slots in range(need_empty_past))
        data_source_past = origin_inputs[max(0, time_slice -
                                             n_context):time_slice]
        need_empty_future = max(0, (time_slice - context_future_max))
        empty_source_future = list(empty_mfcc
                                   for empty_slots in range(need_empty_future))
        data_source_future = origin_inputs[time_slice + 1:time_slice +
                                           n_context + 1]

        if need_empty_past:
            past = np.concatenate((empty_source_past, data_source_past))
        else:
            past = data_source_past

        if need_empty_future:
            future = np.concatenate((data_source_future, empty_source_future))
        else:
            future = data_source_future

        past = np.reshape(past, n_context * n_input)
        now = origin_inputs[time_slice]
        future = np.reshape(future, n_context * n_input)

        train_inputs[time_slice] = np.concatenate((past, now, future))
    '''
       Mean normalization can be applied here: subtract the mean, then divide by the standard deviation
    '''
    train_inputs = (train_inputs -
                    np.mean(train_inputs)) / np.std(train_inputs)
    return train_inputs
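read_wav_data is not included in this example; a minimal compatible helper (an assumption) that returns the signal followed by the sample rate, matching the call above:

import scipy.io.wavfile as wav

def read_wav_data(file_path):
    # Return (signal, sample_rate) in the order expected by pinzhen().
    fs, wav_data = wav.read(file_path)
    return wav_data, fs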
Example #19
def makeData(from_file, to_file, core):
#    samples = np.zeros((0, num_cols))

    for counter, wp in enumerate(wavpaths[from_file:to_file + 1], 1):
        rate, sig = scipy.io.wavfile.read(wp)
        np_mfcc = python_speech_features.mfcc(sig, rate, winlen=window, winstep=step)
        np_mfcc_d = python_speech_features.delta(np_mfcc, 2)
        np_mfcc_dd = python_speech_features.delta(np_mfcc_d, 2)
        np_mfcc_all = np.append(np.append(np_mfcc, np_mfcc_d, axis=1), np_mfcc_dd, axis=1)
#        print(np_mfcc_all.shape)
        wn = wp.split("/")[-1]
        # get corpus info
        if wn[0] in ["F", "M"]:
            corpus = "ifa"
        elif wn[0] == "D":
            corpus = "ifadv"
        elif wn[0] == "p":
            corpus = "ecsd"
        else:
            corpus = "cgn-" + wn[0]
        #
        sent_id = ".".join(wn.split(".")[:-1])
        print(core, counter, "/", to_file - from_file, sent_id)
        tg_path = af_path + chunk_folder + sent_id + ".TextGrid"
        tg = textgrid.TextGrid()
        with makeTempFile(tg_path) as tempf:
            tg.read(tempf.name)
        intervals = tg.tiers[0].intervals
        end_time = round(intervals[-1].maxTime, 3)
        start_time = round(intervals[0].minTime, 3)
#        print(start_time, end_time)
        classes = np.zeros((0, num_cols_per_frame))
        int_i = 0
        num_frames = np_mfcc_all.shape[0]
        useable_frame_indices = []
        for frame in range(1, num_frames + 1):
            frame_s = round(start_time + (frame - 1) * step, 3)
            frame_e = frame_s + window
            if frame_e > end_time:  # because zero samples may be appended to sig so it divides into a whole number of frames
                frame_e = end_time
            intvl = intervals[int_i]
            if frame_s < round(intvl.minTime, 3):
                print(frame_s, round(intvl.minTime, 3))
            assert frame_s >= round(intvl.minTime, 3)
            if frame_e <= round(intvl.maxTime, 3):
                # calculate the proportion of the frame that is within the useable centre of the interval
                int_dur = round(intvl.duration(), 3)
                prop_dur = int_dur * prop_used
                used_s = round(intvl.minTime, 3) + (int_dur - prop_dur) / 2
                used_e = used_s + prop_dur
                x1 = used_s - frame_s
                x1 = 0 if x1 <= 0 else window if x1 > window else x1
                x2 = frame_e - used_e
                x2 = 0 if x2 <= 0 else window if x2 > window else x2
                prop_f_in_used_i = (window - (x1 + x2)) / window
#                print(prop_f_in_used_i, int_dur, used_s, used_e, frame_s, frame_e)
                if prop_f_in_used_i > 0.5:
                    useable_frame_indices.append(frame - 1)
                    label_list = getFeatureLabel(intvl.mark)
                else:
                    label_list = [99 for i in range(len(features))]
                row = np.array([np.append(np_mfcc_all[frame - 1, ], label_list)])
                classes = np.append(classes, row, axis=0)
            else:
                assert frame_e > round(intvl.maxTime, 3)
                proportions = [(round(intvl.maxTime, 3) - frame_s, int_i)]
                new_int = intvl
                new_int_i = int_i
                next_int_i = int_i
                while frame_e > round(new_int.maxTime, 3):
                    new_int_i += 1
                    new_int = intervals[new_int_i]
                    overlap = (frame_e - round(new_int.minTime, 3)) if frame_e <= round(new_int.maxTime, 3) else (round(new_int.maxTime, 3) - round(new_int.minTime, 3))
                    proportions.append((overlap, new_int_i))
                    if (frame_s + step) >= round(new_int.minTime, 3):
                        next_int_i = new_int_i
                best_int_i = max(proportions)[1]
                best_int = intervals[best_int_i]
                # calculate the proportion of the frame that is within the useable centre of the interval
                int_dur = round(best_int.duration(), 3)
                prop_dur = int_dur * prop_used
                used_s = round(best_int.minTime, 3) + (int_dur - prop_dur) / 2
                used_e = used_s + prop_dur
                x1 = used_s - frame_s
                x1 = 0 if x1 <= 0 else window if x1 > window else x1
                x2 = frame_e - used_e
                x2 = 0 if x2 <= 0 else window if x2 > window else x2
                prop_f_in_used_i = (window - (x1 + x2)) / window
                if prop_f_in_used_i > 0.5:
                    useable_frame_indices.append(frame - 1)
                    label_list = getFeatureLabel(best_int.mark)
                else:
                    label_list = [99 for i in range(len(features))]
                row = np.array([np.append(np_mfcc_all[frame - 1, ], label_list)])
                classes = np.append(classes, row, axis=0)
                int_i = next_int_i
#        print(useable_frame_indices)
        for old_row in range(classes.shape[0]):
            if (old_row >= 2 * frame_window) and ((old_row - frame_window) in useable_frame_indices):
                new_labels = classes[old_row - frame_window, num_cols_per_frame - len(features):]
                if new_labels[0] < 90:
                    new_feat = classes[old_row - (2 * frame_window):old_row + 1, :num_cols_per_frame - len(features)].flatten()
                    new_row = np.array([np.append(np.append(new_feat, corpora[corpus]), new_labels)])
                    # samples = np.append(samples, new_row, axis=0)
                    with open(af_path + "AF_s" + str(int(core) + running_cores) + ".csv", "a") as f:
#                    with open(scratch + "AF_s" + core + ".csv", "a") as f:
                        np.savetxt(f, new_row, fmt='%.5e', delimiter=",")
Example #20
    def evaluate(self, opt, videofile):

        self.__S__.eval()

        # ========== ==========
        # Load video
        # ========== ==========
        cap = cv2.VideoCapture(videofile)

        frame_num = 1
        images = []
        while frame_num:
            frame_num += 1
            ret, image = cap.read()
            if ret == 0:
                break

            images.append(image)

        im = numpy.stack(images, axis=3)
        im = numpy.expand_dims(im, axis=0)
        im = numpy.transpose(im, (0, 3, 4, 1, 2))

        imtv = torch.autograd.Variable(
            torch.from_numpy(im.astype(float)).float())

        # ========== ==========
        # Load audio
        # ========== ==========

        audiotmp = os.path.join(opt.tmp_dir, 'audio.wav')

        command = (
            "ffmpeg -y -i %s -async 1 -ac 1 -vn -acodec pcm_s16le -ar 16000 %s"
            % (videofile, audiotmp))
        output = subprocess.call(command, shell=True, stdout=None)

        sample_rate, audio = wavfile.read(audiotmp)
        mfcc = zip(*python_speech_features.mfcc(audio, sample_rate))
        mfcc = numpy.stack([numpy.array(i) for i in mfcc])

        cc = numpy.expand_dims(numpy.expand_dims(mfcc, axis=0), axis=0)
        cct = torch.autograd.Variable(
            torch.from_numpy(cc.astype(float)).float())

        # ========== ==========
        # Check audio and video input length
        # ========== ==========

        if (float(len(audio)) / 16000) < (float(len(images)) / 25):
            print(
                " *** WARNING: The audio (%.4fs) is shorter than the video (%.4fs). Type 'cont' to continue. *** "
                % (float(len(audio)) / 16000, float(len(images)) / 25))
            pdb.set_trace()

        # ========== ==========
        # Generate video and audio feats
        # ========== ==========

        lastframe = len(images) - 6
        im_feat = []
        cc_feat = []

        tS = time.time()
        for i in range(0, lastframe, opt.batch_size):

            im_batch = [
                imtv[:, :, vframe:vframe + 5, :, :]
                for vframe in range(i, min(lastframe, i + opt.batch_size))
            ]
            im_in = torch.cat(im_batch, 0)
            im_out = self.__S__.forward_lip(im_in.cuda())
            im_feat.append(im_out.data.cpu())

            cc_batch = [
                cct[:, :, :, vframe * 4:vframe * 4 + 20]
                for vframe in range(i, min(lastframe, i + opt.batch_size))
            ]
            cc_in = torch.cat(cc_batch, 0)
            cc_out = self.__S__.forward_aud(cc_in.cuda())
            cc_feat.append(cc_out.data.cpu())

        im_feat = torch.cat(im_feat, 0)
        cc_feat = torch.cat(cc_feat, 0)

        # ========== ==========
        # Compute offset
        # ========== ==========

        print('Compute time %.3f sec.' % (time.time() - tS))

        dists = calc_pdist(im_feat, cc_feat, vshift=opt.vshift)
        mdist = torch.mean(torch.stack(dists, 1), 1)

        minval, minidx = torch.min(mdist, 0)

        offset = opt.vshift - minidx
        conf = torch.median(mdist) - minval

        fdist = numpy.stack([dist[minidx].numpy() for dist in dists])
        # fdist   = numpy.pad(fdist, (3,3), 'constant', constant_values=15)
        fconf = torch.median(mdist).numpy() - fdist
        fconfm = signal.medfilt(fconf, kernel_size=9)

        numpy.set_printoptions(formatter={'float': '{: 0.3f}'.format})
        print('Framewise conf: ')
        print(fconfm)
        print('AV offset: \t%d \nMin dist: \t%.3f\nConfidence: \t%.3f' %
              (offset, minval, conf))

        dists_npy = numpy.array([dist.numpy() for dist in dists])
        return offset.numpy(), conf.numpy(), dists_npy
Example #21
#!/usr/bin/env python

import scipy.io.wavfile as w
from python_speech_features import mfcc
from python_speech_features import delta
from python_speech_features import logfbank

# Load waveform
audio_name = '/home/alanwuha/Documents/Projects/datasets/iemocap/IEMOCAP_full_release/Session1/sentences/wav/Ses01F_impro01/Ses01F_impro01_F000.wav'
sample_rate, waveform = w.read(audio_name)

# Compute MFCC
mfcc_feat = mfcc(waveform, sample_rate, preemph=0)
d_mfcc_feat = delta(mfcc_feat, 2)
fbank_feat = logfbank(waveform, sample_rate)

print(fbank_feat[1:3, :])
Example #22
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Fri Mar  8 09:53:24 2019

@author: mkumar
"""

from python_speech_features import mfcc
from scipy.io import wavfile
from matplotlib.pyplot import specgram

sample_rate, X = wavfile.read('./genres/rock/rock.00025.wav')
print(sample_rate, X.shape)
specgram(X, Fs=sample_rate, xextent=(0, 30))

ceps = mfcc(X)
print(ceps.shape)
    25000,
])
y1 = np.empty([
    25000,
])

y = p // 2

for i in range(0, y):
    new_data[i] = y1[i]
for i in range(y, x + y):
    new_data[i] = data[i - y]
for i in range(x + y, 25000):
    new_data[i] = y1[i]

data = (mfcc(y=new_data, sr=samplerate, n_mfcc=39).T)
data = data.reshape((1, data.shape[0], data.shape[1]))
print(data.shape)
nIn = 4043
nOut = 5

########################################################################################

# Def Loss function


# this is an implementation of the categorical cross-entropy loss from scratch
# the inputs are the predicted probabilities and a one-hot target vector
# it computes the loss by summing the expression -y*log(p) over each tuple,
# then sums over the batch to give the final loss
def categorical_cross_entropy(ytrue, ypred, axis=-1):
Example #24
 def calculates_mfcc(data):
     return mfcc(data, samplerate=SAMPLE_RATE, winlen=0.02, winstep=0.01)
Example #25
        print(video)

        video.download('./games')


        fn = './games/game_'+str(i) +'.mp4'
        new_fn = 'game_'+str(i)+'.wav'

        # get .wav
        os.system(cmd.format(fn,'./games_audio/' +new_fn))

        # delete video file
        os.system(rm.format('./games/game_'+str(i) +'.mp4'))

        # read wav
        fs,x = wav.read('./games_audio/' +new_fn)

        #mfcc coefs
        mel= mfcc(x[:,0],fs)

        #save mfcc
        np.save('./games_audio/game_'+str(i)+'.npy' , mel.astype(np.float32))

        #remove .wav
        os.system(rm.format( './games_audio/' +new_fn ) )

        j+=1

    except:
        pass
Example #26
    def mfccs(self):
        """Returns the Mel-Frequency Cepstral Coefficients for this segment."""
        return mfcc(self.signal[:int(0.6 * self.sample_rate)], self.sample_rate, winlen=0.05, winstep=0.05, numcep=40, nfilt=80)
Example #27
# -*- coding: utf-8 -*-

from scipy.io import wavfile
from python_speech_features import mfcc
# 1. Read the wav-format data
path = './music/data/1KHz-STERO.wav'
(rate,data) = wavfile.read(path)
print("Sample rate of the file: {}".format(rate))
print("Size of the file's data: {}".format(data.shape))
print(data[:10])
print('-'*100)

# Extract the feature information
mfcc_feat = mfcc(signal=data,samplerate=rate,numcep=26,nfft=2048)
print(type(mfcc_feat))
print(mfcc_feat)

#
def mffcRead(str):
    (rate, sig) = wav.read(str)
    mfcc_feat = mfcc(sig, rate)
    d_mfcc_feat = delta(mfcc_feat, 2)
    fbank_feat = logfbank(sig, rate)
    return fbank_feat
Example #29
def extract_ratio(train_ratio, test_ratio, audio_dir):
    """
    Extract audio in a given ratio from the directory for training and testing.
    Returns two numpy arrays, Xtrain and Xtest.
    audio_dir: should be of the form "wav/*/*.wav"
    """
    if test_ratio + train_ratio != 1:
        print("ratios should add up to 1\n")
        return

    all_music_files = glob.glob(audio_dir)
    all_music_files.sort()
    all_mfcc = np.array([])
    flag = True

    Testing = False
    Training = True

    #initialize the array
    all_mfcc = np.array([])

    count = 0; #training
    #count = test_ratio; 
    loop_count = -1
    flag = True
    

    for train_test_loop in range(2):      
        #extract mfcc features for the audio files
        for file_name in all_music_files: 
            #for training select only train_ratio songs from each
            if Training:
                loop_count += 1
                if loop_count % 100 == 0:
                    count = 0
                if count == train_ratio * 100:
                    continue    #selects only train_ratio songs in every 100 songs
                count += 1
            
            #for testing select last test_ratio songs from each genre
            if Testing:
                loop_count += 1
                if (loop_count + (test_ratio * 100)) % 100 == 0 and loop_count:
                    count = 0
                    print('--'*10)
            
                if count == test_ratio * 100:
                    continue
                count += 1

            if Training or Testing:
                (rate, data) = scipy.io.wavfile.read(file_name)
                mfcc_feat = mfcc(data,rate)
                #reducing mfcc dimension to 104
                mm = np.transpose(mfcc_feat)
                mf = np.mean(mm,axis=1)
                cf = np.cov(mm)
                ff=mf  

                #ff is a vector of size 104
                for i in range(mm.shape[0]):
                    ff = np.append(ff,np.diag(cf,i))

                #re initializing to size 104
                if flag:
                    all_mfcc = ff;
                    print('*'*20)
                    flag = False      
                else:
                    all_mfcc = np.vstack([all_mfcc,ff])
                
                print("loooping----",loop_count)
                print("all_mfcc.shape:",all_mfcc.shape)

        if train_test_loop == 0:
            print('\n'*10,'====Collected Training data===','\n'*20)
            print('\n','====Collecting Testing data===','\n')
            Xtrain = all_mfcc
            count = test_ratio * 100
            Testing = True
            Training = False
            loop_count = -1
            all_mfcc = np.array([])
            flag = True
            print("Xtrain.shape:", Xtrain.shape)

        if train_test_loop == 1:
            print('\n','====Collected Testing data===','\n')
            Xtest = all_mfcc
            print("Xtest.shape:", Xtest.shape)

    return Xtrain, Xtest
Example #30
from python_speech_features import mfcc
from python_speech_features import delta
from python_speech_features import logfbank
import scipy.io.wavfile as wav

(rate, sig) = wav.read("Rak_train.wav")
mfcc_feat = mfcc(sig, rate)
d_mfcc_feat = delta(mfcc_feat, 2)
fbank_feat1 = logfbank(sig, rate)
fbank_feat1 = fbank_feat1.sum()
print(fbank_feat1)

(rate, sig) = wav.read("Qwer2.wav")
mfcc_feat = mfcc(sig, rate)
d_mfcc_feat = delta(mfcc_feat, 2)
fbank_feat2 = logfbank(sig, rate)
fbank_feat2 = fbank_feat2.sum()
print(fbank_feat2)

if fbank_feat2 > fbank_feat1:
    likely = (fbank_feat1 / fbank_feat2) * 100
    print(likely)
else:
    likely = (fbank_feat2 / fbank_feat1) * 100
    print(likely)
def feature_extractor(sound_path):
    sampling_freq, audio = wavfile.read(sound_path)
    mfcc_features = mfcc(audio, sampling_freq, nfft=2048, numcep=13, nfilt=13)
    return mfcc_features
Example #32
#!/usr/bin/env python3

from python_speech_features import mfcc
from python_speech_features import delta
from python_speech_features import logfbank
import scipy.io.wavfile as wav
import numpy as np
import sys
# from keras.layers import GaussianNoise

# GUARD: Check if someone has supplied an argument
if len(sys.argv) < 2:
    raise Exception('No input file')

# Read an input file from the first argument on the commandline
inputFile = sys.argv[1]

# Calculate MFCC
(rate, sig) = wav.read(inputFile)
mfcc_features = mfcc(sig, rate, nfft=2048)
# d_mfcc_feat = delta(mfcc_feat, 2)
# fbank_feat = logfbank(sig,rate)

# Normalize MFCC by subtracting the mean and using standard deviation
# In the future, we should possibly do this only with the training data

# Print MFCC
print(mfcc_features)
sys.stdout.flush()
Example #33
# Name of the model (for saving and logs)
PREMODELNAME = "rnn_full_mfcc+chroma+time+spec_nopreemph_mixednoise_resnet_ws08_512"

os.chdir(PATH_SOURCE)

print("Generating features from validationsamples ...")

for aud in tqdm(glob.glob("*.wav")):
    [Fs, x] = audioBasicIO.read_audio_file(aud)
    F, f_names = frequencyandchromafeatures.feature_extraction(
        x, Fs, WINDOW_SIZE * Fs, WINDOW_STEP * Fs)
    (rate, sig) = wav.read(aud)
    mfcc_feat = mfcc(sig,
                     rate,
                     numcep=NUMCEP,
                     nfilt=NUMFILT,
                     winlen=WINDOW_SIZE,
                     winstep=WINDOW_STEP,
                     nfft=NFFT,
                     preemph=PREEMPH)
    emotion = "N"
    if "W" in aud:
        emotion = "W"
    elif "L" in aud:
        emotion = "L"
    elif "E" in aud:
        emotion = "E"
    elif "A" in aud:
        emotion = "A"
    elif "F" in aud:
        emotion = "F"
    elif "T" in aud:
            y_test.append(dataset[x][2])


# measuring run time
start_time = time.time()

directory = "C:/Users/rezaa/OneDrive/Desktop/Auburn Spring 2021/Machine Learning/Final Project/genres/"
f = open("my.dat", 'wb')
i = 0
for folder in os.listdir(directory):
    i += 1
    if i == 11:
        break
    for file in os.listdir(directory + folder):
        (rate, sig) = wav.read(directory + folder + "/" + file)
        mfcc_feat = mfcc(sig, rate, winlen=0.020, appendEnergy=False)
        covariance = np.cov(np.matrix.transpose(mfcc_feat))
        mean_matrix = mfcc_feat.mean(0)
        covariance_mean = covariance.mean(0)
        feature = (mean_matrix, covariance_mean, i)
        pickle.dump(feature, f)
f.close()

dataset = []
X_train = []
y_train = []
X_test = []
y_test = []
loadDataset("my.dat", 0.8, X_train, y_train, X_test, y_test)

X_train_np = np.asarray(X_train)
Example #35
def audiofile_to_input_vector(audio_filename, numcep, numcontext):

    # Load wav files
    fs, audio = wav.read(audio_filename)

    # Get mfcc coefficients
    orig_inputs = mfcc(audio, samplerate=fs, numcep=numcep)
    #print(np.shape(orig_inputs))#(277, 26)
    orig_inputs = orig_inputs[::2]  #(139, 26)

    train_inputs = np.array([], np.float32)
    train_inputs.resize(
        (orig_inputs.shape[0], numcep + 2 * numcep * numcontext))
    #print(np.shape(train_inputs))#)(139, 494)
    # Prepare pre-fix post fix context
    empty_mfcc = np.array([])
    empty_mfcc.resize((numcep))

    # Prepare train_inputs with past and future contexts
    time_slices = range(train_inputs.shape[0])  # 139 time slices
    context_past_min = time_slices[0] + numcontext
    context_future_max = time_slices[-1] - numcontext  # last index that still has numcontext future frames (129 here)
    for time_slice in time_slices:
        # Pad the past context with zeros where fewer than numcontext frames exist
        need_empty_past = max(0, (context_past_min - time_slice))
        empty_source_past = list(empty_mfcc
                                 for empty_slots in range(need_empty_past))
        data_source_past = orig_inputs[max(0, time_slice -
                                           numcontext):time_slice]
        assert (len(empty_source_past) + len(data_source_past) == numcontext)

        # Pad the future context with zeros where fewer than numcontext frames exist
        need_empty_future = max(0, (time_slice - context_future_max))
        empty_source_future = list(empty_mfcc
                                   for empty_slots in range(need_empty_future))
        data_source_future = orig_inputs[time_slice + 1:time_slice +
                                         numcontext + 1]
        assert (len(empty_source_future) +
                len(data_source_future) == numcontext)

        if need_empty_past:
            past = np.concatenate((empty_source_past, data_source_past))
        else:
            past = data_source_past

        if need_empty_future:
            future = np.concatenate((data_source_future, empty_source_future))
        else:
            future = data_source_future

        past = np.reshape(past, numcontext * numcep)
        now = orig_inputs[time_slice]
        future = np.reshape(future, numcontext * numcep)

        train_inputs[time_slice] = np.concatenate((past, now, future))
        assert (len(train_inputs[time_slice]) == numcep +
                2 * numcep * numcontext)

    # Standardize the data: subtract the mean and divide by the standard deviation
    train_inputs = (train_inputs -
                    np.mean(train_inputs)) / np.std(train_inputs)
    return train_inputs
Example #36
def get_speech_features(signal, fs, num_features, pad_to=8,
                        features_type='spectrogram',
                        window_size=20e-3,
                        window_stride=10e-3,
                        augmentation=None):
  """Function to convert raw audio signal to numpy array of features.

  Args:
    signal (np.array): np.array containing raw audio signal.
    fs (float): frames per second.
    num_features (int): number of speech features in frequency domain.
    pad_to (int): if specified, the length will be padded to become divisible
        by ``pad_to`` parameter.
    features_type (string): 'mfcc' or 'spectrogram'.
    window_size (float): size of analysis window in seconds (e.g. 20e-3 for 20 ms).
    window_stride (float): stride of analysis window in seconds (e.g. 10e-3 for 10 ms).
    augmentation (dict, optional): dictionary of augmentation parameters. See
        :func:`get_speech_features_from_file` for specification and example.
  Returns:
    np.array: np.array of audio features with shape=[num_time_steps, num_features].
  """
  if augmentation is not None:
    if 'time_stretch_ratio' not in augmentation:
      raise ValueError('time_stretch_ratio has to be included in augmentation '
                       'when augmentation is not None')
    if 'noise_level_min' not in augmentation:
      raise ValueError('noise_level_min has to be included in augmentation '
                       'when augmentation is not None')
    if 'noise_level_max' not in augmentation:
      raise ValueError('noise_level_max has to be included in augmentation '
                       'when augmentation is not None')
    signal = augment_audio_signal(signal, fs, augmentation)

  n_window_size = int(fs * window_size)
  n_window_stride = int(fs * window_stride)

  # making sure length of the audio is divisible by 8 (fp16 optimization)
  length = 1 + int(math.ceil(
    (1.0 * signal.shape[0] - n_window_size) / n_window_stride)
  )
  if pad_to > 0:
    if length % pad_to != 0:
      pad_size = (pad_to - length % pad_to) * n_window_stride
      signal = np.pad(signal, (0, pad_size), mode='reflect')

  if features_type == 'spectrogram':
    frames = psf.sigproc.framesig(sig=signal,
                                  frame_len=n_window_size,
                                  frame_step=n_window_stride,
                                  winfunc=np.hanning)

    # features = np.log1p(psf.sigproc.powspec(frames, NFFT=N_window_size))
    features = psf.sigproc.logpowspec(frames, NFFT=n_window_size)
    assert num_features <= n_window_size // 2 + 1, \
        "num_features for spectrogram should be <= (fs * window_size // 2 + 1)"

    # cut high frequency part
    features = features[:, :num_features]

  elif features_type == 'mfcc':
    features = psf.mfcc(signal=signal,
                        samplerate=fs,
                        winlen=window_size,
                        winstep=window_stride,
                        numcep=num_features,
                        nfilt=2*num_features,
                        nfft=512,
                        lowfreq=0, highfreq=None,
                        preemph=0.97,
                        ceplifter=2*num_features,
                        appendEnergy=False)
  else:
    raise ValueError('Unknown features type: {}'.format(features_type))

  if pad_to > 0:
    assert features.shape[0] % pad_to == 0
  m = np.mean(features)
  s = np.std(features)
  features = (features - m) / s
  return features
Example #37
def file2mfcc(fileName):
    (rate, sig) = wav.read(fileName)
    if len(sig) != 16000:
        return False, []
    mfcc_feat = mfcc(sig, rate)
    return True, mfcc_feat
def getMFCC(data, fs, ANALYSIS_WINDOW, HOPSIZE):
    coeficientes = mfcc(signal=data, samplerate=fs,
                        winlen=ANALYSIS_WINDOW*1./fs,
                        winstep=HOPSIZE*1./fs,
                        numcep=5)
    return coeficientes
Example #39
def _generate_mel_spectrogram(audio_clip, sample_rate):
    mfcc = zip(*python_speech_features.mfcc(audio_clip, sample_rate))
    audio_features = np.stack([np.array(i) for i in mfcc])
    audio_features = np.expand_dims(audio_features, axis=0)
    return audio_features
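A small usage sketch for the helper above (the synthetic 16 kHz clip is an assumption). Note that, despite its name, the function returns MFCCs transposed to (coefficients, frames) with a leading batch axis:

import numpy as np

sample_rate = 16000
audio_clip = (np.random.randn(sample_rate) * 1000).astype(np.int16)  # ~1 s of noise
feats = _generate_mel_spectrogram(audio_clip, sample_rate)
print(feats.shape)  # (1, 13, num_frames)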
Example #40
# ^_^ coding:utf-8 ^_^

import numpy as np
from scipy.io import wavfile
import matplotlib.pyplot as plt
from python_speech_features import mfcc, logfbank

# Read the input audio file
sampling_freq, audio = wavfile.read('input_freq.wav')

# Extract MFCC and filter bank features
mfcc_features = mfcc(audio, sampling_freq)
filterbank_features = logfbank(audio, sampling_freq)

# Print the parameters
print('MFCC: Number of windows = {}'.format(mfcc_features.shape[0]))
print('Length of each feature = {}'.format(mfcc_features.shape[1]))
print('Filter bank: Number of windows = {}'.format(filterbank_features.shape[0]))
print('Length of each feature = {}'.format(filterbank_features.shape[1]))

# Plot the MFCC features
mfcc_features = mfcc_features.T
plt.matshow(mfcc_features)
plt.title('MFCC')

# Visualize the filter bank features
filterbank_features = filterbank_features.T
plt.matshow(filterbank_features)
plt.title('Filter bank')
plt.show()
Example #41
def getMFCCFromFile(file_path='../recordings/all.ogg'):
	stereo_audio_data, sample_rate = sf.read(file_path)
	mono_audio_data = stereo_audio_data[:,1] #uses 2nd channel :) 
	data = list(mono_audio_data)

	#Convert data into matrix of frames
	#

	print(len(data))

	frames = []

	for i in range(0, len(data), 200):
		frames.append(data[i:i+200])


	lastFrame = len(frames) -1
	padder = 200 - len(frames[lastFrame]) % 200
	pprint(padder)
	frames[lastFrame].extend([0] * padder)
	print(len(frames))

	window = 1/200 #frame size  ==> w(n) = 1/2N

	def sgn(val):
		if(val >= 0):
			return 1;
		return -1;

	zcr = []

	for each_row in frames:
		zcr_inner = 0;
		for index, each_record in enumerate(each_row):
			if (index == 0):
				continue
			zcr_inner = zcr_inner + abs(sgn(each_record) - sgn(each_row[index-1]))/400
		zcr.append(zcr_inner);	

	ste = []

	for each_row in frames:
		ste_inner = 0;
		for index, each_record in enumerate(each_row):
			ste_inner = ste_inner + ((each_record*(0.54 - 0.46 * cos(2*pi*(index+1)/199)))**2)
		ste.append(ste_inner)


	#assert(len(ste) == len(zcr))


	#calculate multiplier
	#

	multiplier = [];

	for index, each_record in enumerate(zcr):
		if(each_record <= ste[index]):
			multiplier.append(1)
		else:
			multiplier.append(0)

	print(multiplier);

	total = []

	for each_multiplier in multiplier:
		total.extend([each_multiplier] * 200)

	padded_data = data;
	padded_data.extend([0]*padder)
	#assert(len(padded_data) == len(total))

	multiplied = [x * y for x, y in zip(padded_data, total)]


	#First plot the multiplier
	#

	plt.plot(numpy.asfarray(padded_data))
	plt.plot(numpy.asfarray(total))
	plt.savefig('amono_audio_data_all_processed_superimposed.png')
	plt.clf()

	plt.plot(numpy.asfarray(multiplied))
	plt.savefig('amono_audio_data_all_processed.png')
	plt.clf()

	sf.write('final.wav', multiplied, sample_rate)

	# Frame "multiplied" (which is already padded) with some overlap and calculate MFCC for each frame.
	# You get a vector of MFCC coefficients per frame
	# Store that in some db, look for how to build an HMM-based classifier based on those MFCCs
	# Tell others to find how to use the classifier to get MFCC as input and output a sequence of words
	# or phonemes
	# Build a phonetic dictionary
	# See YAHMM

	indexes = [ i for i, (x, y) in enumerate(zip(multiplier[:-1],multiplier[1:])) if x!=y]

	pprint(indexes)

	##Framing routine
	#
	#Do not need indexes, just do the thing in multiplied. 
	#
	#Use indexes for comparison of accuracy: total number of words recognized vs actual number of words
	#
	#

	#Framing, each frame starts from 80th sample, with size 200

	#Using MFCC library, we can eradicate the following code:
	#

	mfcc_feat = mfcc(numpy.asarray(multiplied), sample_rate);
	pprint(mfcc_feat);
	pprint(len(mfcc_feat[0]))
	return mfcc_feat
num_layers = 1
batch_size = 1
initial_learning_rate = 1e-2
momentum = 0.9

num_examples = 1
num_batches_per_epoch = int(num_examples/batch_size)

# Loading the data

audio_filename = maybe_download('LDC93S1.wav', 93638)
target_filename = maybe_download('LDC93S1.txt', 62)

fs, audio = wav.read(audio_filename)

inputs = mfcc(audio, samplerate=fs)
# Transform into a 3D array
train_inputs = np.asarray(inputs[np.newaxis, :])
train_inputs = (train_inputs - np.mean(train_inputs))/np.std(train_inputs)
train_seq_len = [train_inputs.shape[1]]

# Readings targets
with open(target_filename, 'r') as f:

    #Only the last line is necessary
    line = f.readlines()[-1]

    # Keep only the words and strip the period
    original = ' '.join(line.strip().lower().split(' ')[2:]).replace('.', '')
    targets = original.replace(' ', '  ')
    targets = targets.split(' ')
from python_speech_features import mfcc
from python_speech_features import logfbank
import scipy.io.wavfile as wav
import numpy as np

(rate, sig) = wav.read(
    "C:\\Users\\Admin\\Music\\dulieuhocmay\\test_tatden\\Recording (1009).wav")
mfcc_feat = mfcc(sig, rate, 400 / rate, 160 / rate)
fbank_feat = logfbank(sig, rate, 400 / rate, 160 / rate)

for i in sig:
    print(i)

# print(rate)
a = mfcc_feat[0:900]
# print(a.shape)
a = a.ravel()
a = a.tolist()
if 11700 > len(a):
    l = len(a)
    a = a + [0] * (11700 - l)
t = 0
# print(a)
Example #44
def get_audio_features(audio, feature_type, on_error=None, **kwargs):
	""" Returns audio features.

		# Arguments

		audio: dict or str. If dict, it should have keys, values as returned by
			`load_audio()`. If str, it should be a file that will be passed to
			`load_audio()`.
		feature_type. str. One of:
			- raw: Returns raw audio data (1-dimensional)
			- mfcc: Returns MFCC features
			- spec: Returns a spectrogram
		on_error: str or None (default: None). One of:
			- 'raise' or None: let the error propagate (no special catching)
			- 'suppress': catch the error and return None
		kwargs. Additional arguments that depend on `feature_type`:
			- For 'raw': no additional parameters
			- For 'mfcc':
				- features: int (default: 13). The number of MFCC features to
				  keep.
				- low_freq: int (default: None). The low-frequency cutoff.
				- high_freq: int (default: None). The high-frequency cutoff.
			- For 'spec':
				- low_freq: int (default: None). The low-frequency cutoff.
				- high_freq: int (default: None). The high-frequency cutoff.
	"""
	assert on_error in (None, 'suppress', 'raise')

	if isinstance(audio, str):
		original_path = audio
		try:
			audio = load_audio(audio)
		except Exception:						# pylint: disable=broad-except
			logger.exception('Failed to load audio file: %s', audio)
			if on_error == 'suppress':
				return None
			else:
				raise
	else:
		original_path = None

	if len(audio['signal']) < 1:
		logger.error('Failed to produce audio features while processing file '
			'%s. Length: %d. Sample rate: %d.', original_path,
			len(audio['signal']), audio['sample_rate'])
		if on_error == 'suppress':
			return None
		else:
			raise ValueError('Audio data is too short.')

	if feature_type == 'raw':
		return audio['signal']

	elif feature_type == 'mfcc':
		try:
			import python_speech_features
		except ImportError:
			logger.exception('"python_speech_features" is a required Python '
				'dependency for calculating MFCC features.')
			raise

		num_features = kwargs.get('features') or 13
		return python_speech_features.mfcc(
			audio['signal'],
			audio['sample_rate'],
			numcep=num_features,
			nfilt=num_features*2,
			lowfreq=kwargs.get('low_freq') or 0,
			highfreq=kwargs.get('high_freq') or None
		)

	elif feature_type == 'spec':
		# Window size, in seconds
		window_size = 0.020
		# Step size, in seconds
		step_size = 0.010

		signal = scale_signal(audio)

		hop_size = int(step_size * audio['sample_rate'])
		frame_size = int(window_size * audio['sample_rate'])

		if len(signal) < frame_size:
			logger.error('Failed to produce FFT while processing file '
				'%s. Original length: %d. Hop size: %d. Frame size: %d. '
				'Sample rate: %d.', original_path, len(signal), hop_size,
				frame_size, audio['sample_rate'])
			if on_error == 'suppress':
				return None
			else:
				raise ValueError('Audio data is too short.')

		# Cleave off any samples that do not cleanly fit into our step size.
		remove = (len(signal) - frame_size) % hop_size
		if remove:
			clean = signal[:-remove]
		else:
			clean = signal

		# Optimization: instead of doing a for loop or list comprehension to
		# apply the window to the signal, we can just create a new view into
		# the data with each window.
		num_frames = (len(clean) - frame_size) // hop_size + 1
		frames = numpy.lib.stride_tricks.as_strided(
			clean,
			shape=(frame_size, num_frames),
			strides=(clean.strides[0], clean.strides[0] * hop_size)
		)

		filter_window = numpy.hanning(frame_size)
		fft = numpy.fft.rfft(
			frames * numpy.expand_dims(filter_window, -1),
			axis=0
		)
		norm = numpy.absolute(fft)**2

		scale = numpy.sum(filter_window**2) * audio['sample_rate']
		scaled = norm
		scaled[1:-1] /= scale/2
		scaled[[0, -1]] /= scale

		spec = scaled
		# At this point, `spec` is shape (frequency, time).

		# Apply frequency cutoffs, if necessary
		low_freq = kwargs.get('low_freq')
		high_freq = kwargs.get('high_freq')
		if low_freq or high_freq:
			# Number of frequency bins
			num_bins = spec.shape[0]
			# Width of each frequency bin.
			delta_freq = 1 / window_size

			# Calculate the bin that a frequency would fall into.
			get_bin = lambda f, alt: \
				(
					min(
						max(int(f / delta_freq + 0.5), 0) + 1, num_bins
					)
					if f else alt
				)
			spec = spec[get_bin(low_freq, 0):get_bin(high_freq, num_bins)]

		spec = numpy.log(spec + 1e-14)
		# Format `spec` as (time, frequency)
		spec = spec.T
		return spec

	else:
		raise ValueError('Unsupported feature type: {}'.format(feature_type))
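A brief usage sketch for the function above; the file path is a placeholder and the keyword values simply follow the docstring:

mfcc_feat = get_audio_features('speech.wav', 'mfcc', features=13)
spec_feat = get_audio_features('speech.wav', 'spec', low_freq=100,
                               high_freq=8000, on_error='suppress')
if spec_feat is not None:
    # MFCCs come back as (time, 13); the spectrogram as (time, frequency).
    print(mfcc_feat.shape, spec_feat.shape)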
Ejemplo n.º 45
0
import glob
from scipy.io.wavfile import read
from python_speech_features import mfcc
from python_speech_features import delta
from python_speech_features import logfbank
import scipy.io.wavfile as wav
import numpy as np

for i in range(1, 10):
  for y in range(1, 10):
    wavs = []
    dirName = str(i) + '/' + str(y) + '/'
    for filename in glob.glob(dirName + '*.wav'):
        (rate, sig) = wav.read(filename)
        mfcc_feat = mfcc(sig, rate, 0.025, 0.01, 13, 26, 1200)
        # pad every utterance to a fixed (14352, 13) block of frames
        b = np.zeros((14352, 13))
        result = np.zeros(b.shape)
        result[:mfcc_feat.shape[0], :mfcc_feat.shape[1]] = mfcc_feat
        # end of padding
        wavs.append(result.ravel())
        # wavs.append(mfcc_feat)
    
    with open(dirName + str(i) + '_' + str(y) + '_MFCCfeatures.txt', 'w') as thefile:
      for x in range(len(wavs)):
        for item in wavs[x]:
          thefile.write("%s " % item)
        thefile.write("\n\n\n")

Ejemplo n.º 46
0
from python_speech_features import mfcc
from scipy.signal.windows import hamming  # assumed source of the window function
import scipy.io.wavfile as wav

def get_features(filepath):
    (rate, signal) = wav.read(filepath)
    mfcc_features = mfcc(signal=signal, samplerate=rate, winlen=0.025, winstep=0.01, winfunc=lambda m: hamming(m), appendEnergy=False)
    return mfcc_features
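Assumed usage of the helper above on one file (the path is illustrative):

features = get_features('sample.wav')
# One row per 25 ms frame (10 ms step), 13 cepstral coefficients per row.
print(features.shape)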
Ejemplo n.º 47
0
if __name__ == '__main__':
    # recording part
    # print("please, enter your name:")
    # label = input("")
    label = "on_inspection"
    print("прочитайте речення в мікрофон:")
    print(str(random_sentence))
    record_to_file('on_inspection.wav')
    # print("on_inspection: done - result written to on_inspection.wav")

    label = 'on_inspection'
    (rate, sig) = wav.read("on_inspection.wav")
    mfcc_feat = mfcc(sig,
                     rate,
                     winlen=0.094,
                     nfft=FFT_LENGTH,
                     numcep=numcep,
                     lowfreq=lowfreq,
                     highfreq=highfreq)
    # let's print the results
    print('\n\n')
    print(
        '============================================================================'
    )
    print(
        '================================results:===================================='
    )
    print(
        '============================================================================'
    )
    print('\n\n')
        for k in range(en_range + 1):
            if st_loop < end[k] and en_loop < end[k]:
                if phoneme[k] not in unvoiced:  # define unvoiced
                    label[i] = 1
                else:
                    label[i] = -1
                break
            if k != en_range:
                if st_loop < end[k] and en_loop > end[k]:
                    if ((phoneme[k] not in unvoiced)
                            or (phoneme[k + 1]
                                not in unvoiced)):  # define unvoiced
                        label[i] = 1
                    else:
                        label[i] = -1
                    break

    # check MFCC length
    from python_speech_features import mfcc
    import scipy.io.wavfile as wav

    filename = "newtrainwithnoisep12\\s" + str(j + 1) + ".wav"
    (Fs, data) = wav.read(filename)
    Obs = mfcc(data, samplerate=Fs, winlen=0.025, winstep=0.01, numcep=13)

    if len(Obs) > len(label):
        append_n = len(Obs) - len(label)
        for i in range(append_n):
            label[n + i] = 0
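The fragment above extends the label sequence with zeros whenever the MFCC frame count exceeds the number of labels. A small, hypothetical helper expressing the same check for a list-style label sequence:

def pad_labels_to_frames(labels, n_frames, pad_value=0):
    # Append pad_value until there is one label per MFCC frame.
    return list(labels) + [pad_value] * max(0, n_frames - len(labels))

# e.g. labels = pad_labels_to_frames(labels, len(Obs))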
hop_length = 512
win_length = 1024
# Window and hop expressed in time (seconds)
win_len = win_length / rate
win_hop = hop_length / rate
lifter = 22
fmin = 0
fmax = rate / 2
coef_pre_enfase = 0.97
append_energy = 0


attr = mfcc(
    signal=signal,
    samplerate=rate,
    winlen=win_len,
    winstep=win_hop,
    numcep=n_mfcc,
    nfilt=n_mels,
    nfft=n_fft,
    lowfreq=fmin,
    highfreq=fmax,
    preemph=coef_pre_enfase,
    ceplifter=lifter,
    appendEnergy=append_energy,
    winfunc=hann
)

Visualization.plot_cepstrals(
    attr, fig_name="./normal_40.png")
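python_speech_features takes winlen and winstep in seconds, which is why the sample counts above are divided by rate. A quick, assumed sanity check on the output, using the variables already defined in this snippet:

# Number of frames the library should produce, give or take one frame
# depending on how the final partial window is rounded.
expected_frames = 1 + max(0, len(signal) - win_length) // hop_length
print(attr.shape, expected_frames)  # attr is roughly (expected_frames, n_mfcc)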
def wav2feature(wav_paths, feature_type='logfbank', feature_dim=40,
                energy=True, delta1=True, delta2=True):
    """Read wav file & convert to MFCC or log mel filterbank features.
    Args:
        wav_paths (list): paths to wav files
        feature_type (string, optional): logfbank or fbank or mfcc
        feature_dim (int, optional): the dimension of each feature
        energy (bool, optional): if True, add energy
        delta1 (bool, optional): if True, add delta features
        delta2 (bool, optional): if True, add delta delta features
    Returns:
        inputs: A tensor of size `[B, T, input_size]`
        inputs_seq_len: A tensor of size `[B]`
    """
    if feature_type not in ['logmelfbank', 'logfbank', 'fbank', 'mfcc']:
        raise ValueError(
            'feature_type must be one of "logmelfbank", "logfbank", "fbank", or "mfcc".')
    if not isinstance(wav_paths, list):
        raise ValueError('wav_paths must be a list.')
    if delta2 and not delta1:
        delta1 = True

    batch_size = len(wav_paths)
    input_size = feature_dim
    if energy:
        input_size += 1
    if delta2:
        input_size *= 3
    elif delta1:
        input_size *= 2

    feat_list = []
    inputs_seq_len = np.zeros((batch_size,), dtype=np.int32)
    for i, wav_path in enumerate(wav_paths):
        # Read wav file
        fs, audio = scipy.io.wavfile.read(wav_path)
        if feature_type == 'mfcc':
            feat = mfcc(audio, samplerate=fs, numcep=feature_dim)
            if energy:
                energy_feat = fbank(audio, samplerate=fs, nfilt=feature_dim)[1]
                feat = np.c_[feat, energy_feat]
        else:
            fbank_feat, energy_feat = fbank(
                audio, samplerate=fs, nfilt=feature_dim)
            if feature_type in ['logfbank', 'logmelfbank']:
                fbank_feat = np.log(fbank_feat)
            feat = fbank_feat
            if energy:
                # logenergy = np.log(energy_feat)
                feat = np.c_[feat, energy_feat]

        if delta2:
            delta1_feat = _delta(feat, N=2)
            delta2_feat = _delta(delta1_feat, N=2)
            feat = np.c_[feat, delta1_feat, delta2_feat]
        elif delta1:
            delta1_feat = _delta(feat, N=2)
            feat = np.c_[feat, delta1_feat]

        # Normalize per wav
        feat = (feat - np.mean(feat)) / np.std(feat)

        feat_list.append(feat)
        inputs_seq_len[i] = len(feat)

    # Zero-pad every utterance to the longest one in the batch
    max_time = max(len(feat) for feat in feat_list)
    inputs = np.zeros((batch_size, max_time, input_size))
    for i, feat in enumerate(feat_list):
        inputs[i, :len(feat)] = feat

    return inputs, inputs_seq_len
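An assumed call to wav2feature on a two-file batch; the paths are placeholders and the shapes follow from the arguments:

inputs, inputs_seq_len = wav2feature(
    ['utt1.wav', 'utt2.wav'], feature_type='logfbank', feature_dim=40,
    energy=True, delta1=True, delta2=True)
# 40 filterbanks + 1 energy, tripled by the delta and delta-delta features.
print(inputs.shape)        # (2, max_time, 123)
print(inputs_seq_len)      # frame count of each utterance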
for c in classes:
    wav_file = df[df.label == c].iloc[0, 0]
    signal, rate = librosa.load('wavfiles/' + wav_file, sr=44100)
    mask = envelope(signal, rate, 0.0005)
    signal = signal[mask]
    signals[c] = signal
    fft[c] = calc_fft(signal, rate)

    # log filterbank energies from python_speech_features
    # nfft covers one 25 ms window: 44100 / 40 = 1102.5, rounded up to 1103
    bank = logfbank(signal[:rate], rate, nfilt=26, nfft=1103).T
    fbank[c] = bank  # store the filterbank values
    # use the first second of the signal; .T transposes to (coefficient, time)
    mel = mfcc(signal[:rate], rate, numcep=13, nfilt=26, nfft=1103).T
    mfccs[c] = mel

# plot the raw signals, FFTs, filterbank energies, and MFCCs

plot_signals(signals)
plt.show()

plot_fft(fft)
plt.show()

plot_fbank(fbank)
plt.show()

plot_mfccs(mfccs)
Ejemplo n.º 52
0
from keras.models import load_model
######################################

# load the data,
# then compute the MFCCs as a feature matrix of dimension (49, 39):
# 49 corresponds to the time steps and 39 to the features in each time step.
# librosa.effects.trim (commented out below) would trim the sound file to its useful portion.

data = []
label = []
for i in range(0, 79):
  for j in range(0, 24):

    back, sr = sf.read("back_" + str(i) + "_" + str(j) + ".wav")
#     back, index = librosa.effects.trim(back)
    x = mfcc(y=back, sr=sr, n_mfcc=39)
    pad = 49 - x.shape[1]  # pad the time axis to 49 frames (do not reuse the loop variable i)
    tp = np.zeros((39, pad))
    x = np.append(x, tp, axis=1)
    data.append(x.T)
    label.append(0)

    forward, sr = sf.read("forward_" + str(i) + "_" + str(j) + ".wav")
#     forward, index = librosa.effects.trim(forward)
    x = mfcc(y=forward, sr=sr, n_mfcc=39)
    pad = 49 - x.shape[1]
    tp = np.zeros((39, pad))
    x = np.append(x, tp, axis=1)
    data.append(x.T)
    label.append(1)
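If the loop above completes, every entry of data is a (49, 39) array. As an assumed follow-up (with numpy imported as np elsewhere in this snippet), the lists can be stacked into arrays for the Keras model loaded via load_model:

X = np.array(data)     # shape: (number of clips, 49, 39)
y = np.array(label)    # shape: (number of clips,)
print(X.shape, y.shape)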
    
    def process(self, y, sample_rate):
        # Scale float audio (typically in [-1, 1]) by 32768 so its amplitude matches
        # 16-bit PCM, keeping the features consistent with those computed directly
        # from int16 wav data, then compute MFCCs with the window duration, step,
        # and coefficient count stored on the instance.
        return python_speech_features.mfcc(
            32768 * y, samplerate=sample_rate, winlen=self.duration, winstep=self.step,
            numcep=self.coefs)