Example 1
def process_data(wav_files, phn_files):
    max_step_size = 0
    inputs = []
    targets = []
    for i in tqdm(range(len(wav_files))):
        # extract mfcc features from wav
        (rate, sig) = wav.read(wav_files[i])
        mfcc_feat = mfcc(sig, rate)
        fbank_feat = logfbank(sig, rate)
        acoustic_features = join_features(
            mfcc_feat, fbank_feat)  # time_stamp x n_features

        # extract label from phn
        phn_labels = []
        with open(phn_files[i], 'r') as csvfile:  # text mode: csv.reader expects str rows in Python 3
            phn_reader = csv.reader(csvfile, delimiter=' ')
            for row in phn_reader:
                if row[2] == 'q':
                    continue
                phn_labels.append(
                    phoneme_set_39[phoneme_48_39.get(row[2], row[2])] - 1)

        inputs.append(acoustic_features)
        targets.append(phn_labels)

    return lists_batches(inputs, targets)
Example 2
def get_mel_spec(sig, sample_rate):
    """
     Compute log Mel-filter bank energy features from an audio signal.

      Args:
         sig (array like) : Signal data. Must be real.
         sample_rate (int) : Sample rate.
     Return:
         Mel spectrum.
     """
    try:
        mel_spec = python_speech_features.logfbank(sig,
                                                   samplerate=sample_rate,
                                                   winlen=0.005,
                                                   winstep=0.005,
                                                   nfilt=21,
                                                   preemph=0.97)
        mel_spec = mel_spec.T
        mini = np.amin(mel_spec)
        maxi = np.amax(mel_spec)
        if mini == maxi:
            return None
        else:
            mel_spec = np.ceil((mel_spec - mini) * (5 / np.abs(mini - maxi)))
            return mel_spec
    except Exception as e:
        save_as_exception("Root", "Mel Spectrum", e)
        return None
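A minimal way to exercise get_mel_spec, assuming the imports used above (numpy as np, python_speech_features) and substituting a synthetic signal for a real recording:

import numpy as np

sig = np.random.randn(16000)      # 1 s of synthetic audio at 16 kHz
spec = get_mel_spec(sig, 16000)   # function defined above
if spec is not None:
    print(spec.shape)             # (21, 200): filters x frames after the transpose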
Example 3
def process_wav(wav_file):
    (rate, sig) = wav.read(wav_file)
    mfcc_feat = mfcc(sig, rate)
    fbank_feat = logfbank(sig, rate)
    acoustic_features = join_features(mfcc_feat,
                                      fbank_feat)  # time_stamp x n_features
    return acoustic_features
Example 4
    def gen_filtered_spec(self, filenames):
        x_data = []
        for filename in filenames:
            fs, wav = wavfiles.read(filename)
            # print(wav.shape, filename)
            if len(wav.shape) > 1:
                wav = wav[:, 0]
            if wav.shape[0] < 441000:
                pad_with = 441000 - wav.shape[0]
                wav = np.pad(wav, (0, pad_with),
                             'constant',
                             constant_values=(0))
            elif wav.shape[0] > 441000:
                wav = wav[0:441000]
            kernel = [-1, 2, -1]
            Sxx = logfbank(wav,
                           fs,
                           winlen=0.04,
                           winstep=0.02,
                           nfft=2048,
                           nfilt=40)
            delta = ndimage.convolve1d(Sxx,
                                       weights=kernel,
                                       axis=1,
                                       mode='nearest')
            delta_2 = ndimage.convolve1d(Sxx,
                                         weights=kernel,
                                         axis=0,
                                         mode='nearest')
            data = np.dstack((Sxx, delta, delta_2))
            x_data.append(
                data.reshape(1, data.shape[0], data.shape[1], data.shape[2]))

        return np.vstack(x_data)
Example 5
    def gen_delta_delta(self, filenames):
        x_data = []
        for filename in filenames:
            fs, wav = wavfiles.read(filename)
            # print(wav.shape, filename)
            if len(wav.shape) > 1:
                wav = wav[:, 0]
            if wav.shape[0] < 441000:
                pad_with = 441000 - wav.shape[0]
                wav = np.pad(wav, (0, pad_with),
                             'constant',
                             constant_values=(0))
            elif wav.shape[0] > 441000:
                wav = wav[0:441000]
            Sxx = logfbank(wav,
                           fs,
                           winlen=0.04,
                           winstep=0.02,
                           nfft=2048,
                           nfilt=40)
            delta = librosa.feature.delta(Sxx, order=1)
            delta_2 = librosa.feature.delta(Sxx, order=2)
            data = np.dstack((Sxx, delta, delta_2))
            x_data.append(
                data.reshape(1, data.shape[0], data.shape[1], data.shape[2]))

        return np.vstack(x_data)
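Examples 4 and 5 build the same [frames, nfilt, 3] stack but compute the delta channels differently: one convolves a [-1, 2, -1] kernel with scipy.ndimage, the other calls librosa.feature.delta (which differentiates along the last axis unless told otherwise). A small sketch, with random data standing in for logfbank output, showing both routes along the time axis:

import numpy as np
import librosa
from scipy import ndimage

Sxx = np.random.randn(100, 40)  # stand-in for logfbank output: [frames, nfilt]
d_conv = ndimage.convolve1d(Sxx, weights=[-1, 2, -1], axis=0, mode='nearest')
d_lib = librosa.feature.delta(Sxx, order=1, axis=0)  # axis=0 differentiates over time
print(d_conv.shape, d_lib.shape)  # both (100, 40)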
Example 6
    def add_data(self, data_bytes):
        """Process audio data."""
        if not data_bytes:
            return
        softmax_tensor = self.sess_.graph.get_tensor_by_name(self.output_name_)

        input_data = np.frombuffer(data_bytes, dtype='i2') / pow(2, 15)

        mels = logfbank(input_data,
                        16000,
                        lowfreq=50.0,
                        highfreq=4200.0,
                        nfilt=36,
                        nfft=1024,
                        winlen=0.020,
                        winstep=0.010)
        input = {
            'fingerprint_4d:0':
            np.reshape(mels, (1, mels.shape[0], mels.shape[1], 1))
        }

        #print('logfbank', mels.shape, time.time() - bt)

        bt = time.time()

        predictions, = self.sess_.run(softmax_tensor, input)
        return predictions[1]
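The frombuffer line above converts 16-bit PCM bytes to floats by dividing by 2**15. A quick check of that scaling on hand-built samples:

import numpy as np

raw = np.array([-32768, 0, 32767], dtype=np.int16).tobytes()
print(np.frombuffer(raw, dtype='i2') / 2 ** 15)  # [-1.  0.  0.99996948]: full scale maps to [-1, 1)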
Example 7
def features(df, opt, path):
    df.reset_index(inplace=True)
    sign = {}
    signal_env = {}
    ffts = {}
    fbanks = {}
    mfccs = {}
    stf_f = {}

    for c in list(np.unique(df.Gender)):
        wav = df[df.Gender == c].iloc[0, 0]
        s, r = librosa.load(path + wav)
        sign[c] = s

        mask = envelope(s, r, opt.threshold)
        s = s[mask]
        signal_env[c] = s
        ffts[c] = fft(s, r)

        _, _, Zxx = signal.stft(s, fs=r, window='hann', nperseg=256)
        stf_f[c] = Zxx

        fbanks[c] = logfbank(s[:r], r, nfilt=opt.nfilt, nfft=1103).T

        mfccs[c] = mfcc(s[:r], r, numcep=opt.nfeat, nfilt=opt.nfilt, nfft=opt.nfft).T

    return sign, signal_env, ffts, fbanks, mfccs, stf_f  # ffts dict, not the fft function
Example 8
def generate_train_data(text_filepath):

    fwb_train = open(mod_train_data_path, 'wb')
    fwb_test = open(mod_test_data_path, 'wb')

    train_data_list = list()
    test_data_list = list()

    train_label_list = list()
    test_label_list = list()

    with open(text_filepath, 'r', encoding='utf-8') as fr:
        while True:
            line = fr.readline()
            line = line.split()
            if not line: break

            samplerate, data = wavfile.read(line[0])
            # print(data)
            # print(len(data))
            logfb_feat = logfbank(data)
            # print(len(logfb_feat))
            # print(logfb_feat)

            # transformer = Normalizer().fit(logfb_feat)
            # logfb_feat = transformer.transform(logfb_feat)

            logfb_feat = util_module.standardization_func(logfb_feat)

            temp_path = line[0].split('\\\\')

            if temp_path[2] == 'test':
                test_data_list.append(logfb_feat)
                test_label_list.append(int(line[-1]))
            elif temp_path[2] == 'train':
                one_train = list()
                one_train.append(logfb_feat)
                one_train.append(int(line[-1]))
                train_data_list.append(one_train)

    random.shuffle(train_data_list)

    temp_train_data = list()
    temp_train_label = list()

    for one in train_data_list:
        # print(one)
        temp_train_data.append(one[0])
        temp_train_label.append(one[1])

    train_data = np.asarray(temp_train_data)
    train_label = np.asarray(temp_train_label)

    test_data = np.asarray(test_data_list)
    test_label = np.asarray(test_label_list)

    np.savez_compressed(fwb_train, label=train_label, data=train_data, rate=samplerate)
    np.savez_compressed(fwb_test, label=test_label, data=test_data, rate=samplerate)

    fwb_train.close()
    fwb_test.close()
    return
Example 9
def get_vector(sig, rate):
    vec = np.empty((0, 3))  # start with no rows; (1, 3) would keep an uninitialized first row
    start = 0
    end = 320

    while (sig.shape[0] >= end + 160):
        vad = webrtcvad.Vad()
        vad.set_mode(2)

        res = vad.is_speech(sig[start:end].tobytes(),
                            rate)  # binary speech/non-speech decision
        zero_crosses = np.nonzero(
            np.diff(sig[start:end] > 0))[0].shape[0] / 0.02  # zero crosses
        f = scipy.fft.fft(sig[start:end])  # scipy.fft is a module in modern SciPy
        f0 = min(np.absolute(f))  # f0 frequency

        start = start + 160
        end = end + 160

        vec = np.vstack((vec, np.array([res, zero_crosses, f0], ndmin=2)))

    mfcc_feat = mfcc(sig, rate, numcep=12,
                     winlen=0.020)[0:vec.shape[0], :]  # mfcc
    fbank = logfbank(sig, rate,
                     nfilt=5)[0:vec.shape[0], :]  # log filterbank energies
    mfcc_grad = np.gradient(mfcc_feat, axis=0)  # mfcc first derivative

    final_feature = np.hstack((mfcc_feat, mfcc_grad, fbank, vec))

    return final_feature
Example 10
def get_features_from_spectrogram_with_filterbank(filepath,
                                                  sample_rate,
                                                  N_FFT,
                                                  window_size=30,
                                                  step_size=10,
                                                  eps=1e-10):
    (rate, width, sig) = wavio.readwav(filepath)
    sig = sig.ravel()
    # nperseg: Length of each segment
    # noverlap: Number of points to overlap between segments
    nperseg = int(round(window_size * sample_rate / 1e3))
    noverlap = int(round(step_size * sample_rate / 1e3))
    freqs, times, spec = signal.spectrogram(sig,
                                            fs=sample_rate,
                                            window='hann',
                                            nperseg=nperseg,
                                            noverlap=noverlap,
                                            nfft=N_FFT,
                                            detrend=False)
    mean = np.mean(spec, axis=0)
    std = np.std(spec, axis=0)
    spec = (spec - mean) / std
    fbank_feat = logfbank(spec.T.astype(np.float32) + eps,
                          sample_rate,
                          winlen=0.030,
                          winstep=0.01,
                          nfilt=26,
                          nfft=512,
                          lowfreq=0,
                          highfreq=sample_rate / 4,
                          preemph=0.97)
    return freqs, times, fbank_feat
Example 11
def get_mfcc(audio_path):
    fs, audio = wav.read(audio_path)
    assert (fs == 16000)
    mfcc_feat = mfcc(audio, samplerate=fs, winlen=0.04, winstep=0.04)
    fbank_feat = logfbank(audio, samplerate=fs, winlen=0.04, winstep=0.04)
    ret = numpy.concatenate((mfcc_feat, fbank_feat), 1)
    return ret
Example 12
    def create_infer_batch(self):
        # self.hparams.in_wav1, self.hparams.in_wav2 are full paths of the wav file
        # for ex) /home/hdd2tb/ninas96211/dev_wav_set/id10343_pCDWKHjQjso_00002.wav

        wavs_list = [self.hparams.in_wav1, self.hparams.in_wav2]

        # file_name for ex) id10343_pCDWKHjQjso_00002
        for wav_path in wavs_list:
            wav_id = os.path.splitext(os.path.basename(wav_path))[0]
            audio, sample_rate = vad_ex.read_wave(wav_path)
            vad = webrtcvad.Vad(1)
            frames = vad_ex.frame_generator(30, audio, sample_rate)
            frames = list(frames)
            segments = vad_ex.vad_collector(sample_rate, 30, 300, vad, frames)
            total_wav = b""
            for i, segment in enumerate(segments):
                total_wav += segment
                print(wav_id + " : " + str(i) + "th segment appended")
            # Without writing, unpack total_wav into numpy [N,1] array
            # 16-bit PCM, so dtype=np.int16
            wav_arr = np.frombuffer(total_wav, dtype=np.int16)
            print("read audio data from byte string. np array of shape:" +
                  str(wav_arr.shape))
            logmel_feats = logfbank(wav_arr, samplerate=sample_rate, nfilt=40)
            # file_name for ex, 'id10343_pCDWKHjQjso_00002'
            self.save_dict[wav_id] = logmel_feats

        num_frames = self.hparams.segment_length * 100
        num_overlap_frames = num_frames * self.hparams.overlap_ratio
        dvector_dict = {}

        match = False
        prev_wav_name = ""

        for wav_name, feats in self.save_dict.items():
            if wav_name.split("_")[0] == prev_wav_name:
                print("spk_id" + wav_name.split("_")[0])
                match = True
            total_len = feats.shape[0]
            num_dvectors = int((total_len - num_overlap_frames) //
                               (num_frames - num_overlap_frames))
            print("num dvec:" + str(num_dvectors))
            dvectors = []
            for dvec_idx in range(num_dvectors):
                start_idx = int((num_frames - num_overlap_frames) * dvec_idx)
                end_idx = int(start_idx + num_frames)
                print("wavname: " + wav_name + " start_idx: " + str(start_idx))
                print("wavname: " + wav_name + " end_idx: " + str(end_idx))
                dvectors.append(feats[start_idx:end_idx, :])
            dvectors = np.asarray(dvectors)
            dvector_dict[wav_name] = dvectors
            prev_wav_name = wav_name.split("_")[0]

        wav1_data = list(dvector_dict.values())[0]
        wav2_data = list(dvector_dict.values())[1]

        print("match: " + str(match))
        print("wav1_data.shape:" + str(wav1_data.shape))
        print("wav2_data.shape:" + str(wav2_data.shape))
        return wav1_data, wav2_data, match
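The window arithmetic above relies on logfbank's default 10 ms winstep, so segment_length seconds correspond to segment_length * 100 frames. A standalone check of the overlap bookkeeping, with made-up values for segment_length, overlap_ratio, and the utterance length:

segment_length, overlap_ratio, total_len = 1.6, 0.5, 500  # hypothetical values
num_frames = int(segment_length * 100)                    # 160 frames per d-vector
num_overlap = int(num_frames * overlap_ratio)             # 80 overlapping frames
print((total_len - num_overlap) // (num_frames - num_overlap))  # 5 sliding windows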
Example 13
def _mel(y, sr, win_len, win_step, num_features, n_fft, f_min, f_max):
    """Convert a wav signal into a logarithmically scaled mel filterbank.

    Args:
        y (np.ndarray): Wav signal.
        sr (int):  Sampling rate.
        win_len (float): Window length in seconds.
        win_step (float): Window stride in seconds.
        num_features (int): Number of features to generate.
        n_fft (int): Number of Fast Fourier Transforms.
        f_min (float): Minimum frequency to consider.
        f_max (float): Maximum frequency to consider.

    Returns:
        np.ndarray: Mel-filterbank. Shape: [time, num_features]
    """
    mel = psf.logfbank(signal=y,
                       samplerate=sr,
                       winlen=win_len,
                       winstep=win_step,
                       nfilt=num_features,
                       nfft=n_fft,
                       lowfreq=f_min,
                       highfreq=f_max,
                       preemph=0.97)
    return mel
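A hypothetical invocation of _mel with common ASR settings (the parameter values below are illustrative, not taken from the surrounding project):

import numpy as np

y = np.random.randn(16000).astype(np.float32)  # 1 s of synthetic audio at 16 kHz
feats = _mel(y, sr=16000, win_len=0.025, win_step=0.010,
             num_features=80, n_fft=512, f_min=0.0, f_max=8000.0)
print(feats.shape)  # (99, 80): [time, num_features]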
Example 14
def extract_feature(sig, rate, feature_type="MFCC", cmvn=True, delta=True):
    """Extract features from an audio signal.

    Returns:
        Feature matrix of shape [num_frames, feature_dim].
    """
    if feature_type == "fbank":
        feature = psf.logfbank(sig, rate)
    elif feature_type == "MFCC":
        feature = psf.mfcc(sig, rate)
    else:
        raise ValueError("Unsupported feature type.")
    if cmvn:
        cmvn_stats = np.zeros((2, feature.shape[1] + 1))
        cmvn_stats[0, :-1] = feature.sum(axis=0)
        cmvn_stats[0, -1] = feature.shape[0]
        cmvn_stats[1, :-1] = (feature**2).sum(axis=0)
        norm_stats = np.zeros((2, feature.shape[1]))
        mean = cmvn_stats[0, :-1] / cmvn_stats[0, -1]
        var = cmvn_stats[1, :-1] / cmvn_stats[0, -1] - mean * mean
        var = np.maximum(var, 1e-20)
        scale = 1 / np.sqrt(var)
        norm_stats[0] = -(mean * scale)
        norm_stats[1] = scale
        feature = np.dot(feature, np.diag(norm_stats[1]))
        feature += norm_stats[0]

    if delta:
        delta_feat = psf.delta(feature, 2)
        delta_delta_feat = psf.delta(delta_feat, 2)
        feature = np.column_stack((feature, delta_feat, delta_delta_feat))

    return feature
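The accumulator-style CMVN above (sum and sum-of-squares rows, then a diagonal scale plus an offset) is algebraically the same as standardizing each feature column: scale = 1/sqrt(var) and offset = -mean*scale, so feature @ diag(scale) + offset equals (feature - mean)/std. A quick sanity check on random data:

import numpy as np

feature = np.random.randn(200, 13)
mean = feature.mean(axis=0)
std = np.sqrt(np.maximum(feature.var(axis=0), 1e-20))
normed = (feature - mean) / std  # one-line equivalent of the stats matrices
print(np.allclose(normed.mean(axis=0), 0), np.allclose(normed.std(axis=0), 1))  # True True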
Example 15
def build_rand_feat():
	tmp = check_data()
	if tmp:
		return tmp.data[0], tmp.data[1]
	X = []
	y = []
	_min, _max = float('inf'), -float('inf')
	for _ in tqdm(range(n_samples)):
		rand_class = np.random.choice(class_dist.index, p=prob_dist)
		file = np.random.choice(df[df.label==rand_class].index)
		rate, wav= wavfile.read('clean/'+file)
		label = df.at[file, 'label']
		rand_index = np.random.randint(0, wav.shape[0]-config.step)
		sample = wav[rand_index:rand_index+config.step]
		X_sample = logfbank(sample, rate, nfilt=config.nfilt, nfft=config.nfft)
		_min = min(np.amin(X_sample), _min)
		_max = max(np.amax(X_sample), _max)
		X.append(X_sample)
		y.append(classes.index(label))
	config.min = _min
	config.max = _max
	X, y = np.array(X), np.array(y)
	X = (X - _min) / (_max - _min)
	if config.mode == 'conv':
		X = X.reshape(X.shape[0], X.shape[1], X.shape[2], 1)
	elif config.mode == 'time':
		X = X.reshape(X.shape[0], X.shape[1], X.shape[2])
	y = to_categorical(y, num_classes=40)
	config.data = (X, y)

	with open(config.p_path, 'wb') as handle:
		pickle.dump(config, handle, protocol=2)
	return X, y
Example 16
def make_feature(y, sr):
    if FEATURE == 'fft':
        S = librosa.stft(y, n_fft=N_FFT, hop_length=HOP_LEN, window=hamming)
        feature, _ = librosa.magphase(S)
        feature = np.log1p(feature)
        feature = feature.transpose()
    else:
        if FEATURE == 'fbank':
            feature = logfbank(y,
                               sr,
                               nfilt=FEATURE_LEN,
                               winlen=WIN_LEN,
                               winstep=WIN_STEP)
            assert feature.shape[-1] == FEATURE_LEN, '{}'.format(
                feature.shape[-1])
        else:
            feature = mfcc(y,
                           sr,
                           nfilt=FEATURE_LEN,
                           winlen=WIN_LEN,
                           winstep=WIN_STEP)
            feature_d1 = delta(feature, N=1)
            feature_d2 = delta(feature, N=2)
            feature = np.hstack([feature, feature_d1, feature_d2])
    return normalize(feature).astype(np.float32)
Example 17
def draw_multiple_graph_with_one_data(data, sr, **kwargs):

    if "x_number" in kwargs.keys():
        x_num = kwargs['x_number']
    if "y_number" in kwargs.keys():
        y_num = kwargs['y_number']

    plt.subplot(x_num, y_num, 1)
    plt.plot(data)
    plt.title('Raw Signal')
    plt.xlabel('sample rate')
    plt.ylabel('amplitude')

    data = logfbank(data, sr)
    data = draw_single_graph.transpose_the_matrix(data)
    plt.subplot(x_num, y_num, 2)
    plt.pcolormesh(data)
    plt.title('Feature Vector')
    plt.xlabel('frame sequence')
    plt.ylabel('number of filters')
    plt.colorbar()

    data = draw_single_graph.new_minmax_normal([data])
    data = data[0]
    plt.subplot(x_num, y_num, 3)
    plt.pcolormesh(data)
    plt.title('Normalized Feature Vector')
    plt.xlabel('frame sequence')
    plt.ylabel('number of filters')
    plt.colorbar()

    return
Example 18
def generator():
    _wav_files, _labels = _shuffle(wav_files, labels)
    for wav_file, label in zip(_wav_files, _labels):
        signal, sample_rate, _ = read_audio(wav_file, desired_ms)
        if input_feature == 'fbank':
            feat, _ = fbank(signal,
                            sample_rate,
                            winlen=window_size_ms / 1000,
                            winstep=window_stride_ms / 1000,
                            nfilt=input_feature_dim)
        elif input_feature == 'logfbank':
            feat = logfbank(signal,
                            sample_rate,
                            winlen=window_size_ms / 1000,
                            winstep=window_stride_ms / 1000,
                            nfilt=input_feature_dim)
        elif input_feature == 'mfcc':
            feat = mfcc(signal,
                        sample_rate,
                        winlen=window_size_ms / 1000,
                        winstep=window_stride_ms / 1000,
                        nfilt=input_feature_dim,
                        numcep=input_feature_dim)
        elif input_feature == 'raw':
            feat = np.expand_dims(signal, 1)
        yield (feat, label)
Example 19
    def __call__(self, data):
        fname = data['fname']
        samples, sample_rate = librosa.load(fname, sr=self.sample_rate)  # pass sr by keyword (required in newer librosa)
        audio_duration = len(samples) * 1.0 / sample_rate

        # T, F
        features = psf.logfbank(signal=samples,
                                samplerate=sample_rate,
                                winlen=self.window_size,
                                winstep=self.window_stride,
                                nfilt=self.num_features,
                                nfft=512,
                                lowfreq=0,
                                highfreq=sample_rate / 2,
                                preemph=0.97)

        m = np.mean(features)
        s = np.std(features)
        features = (features - m) / s

        data = {
            'target': data['text'],
            'target_length': len(data['text']),
            'input': features.astype(np.float32),
            'input_length': features.shape[0]
        }
        # data['sample_rate'] = sample_rate
        # data['duration'] = audio_duration

        return data
Example 20
    def create_pickle(self, path, wav_arr, sample_rate):
        if round(
            (wav_arr.shape[0] / sample_rate), 1) > self.hparams.segment_length:
            save_dict = {}
            logmel_feats = logfbank(wav_arr,
                                    samplerate=sample_rate,
                                    nfilt=self.hparams.spectrogram_scale)
            print("created logmel feats from audio data. np array of shape:" +
                  str(logmel_feats.shape))
            save_dict["LogMel_Features"] = logmel_feats

            if self.hparams.data_type == "vox1" or self.hparams.data_type == "vox2":
                data_id = "_".join(path.split("/")[-3:])
                save_dict["SpkId"] = path.split("/")[-3]
                save_dict["ClipId"] = path.split("/")[-2]
                save_dict["WavId"] = path.split("/")[-1]
                if self.hparams.data_type == "vox1":
                    pickle_f_name = data_id.replace("wav", "pickle")
                elif self.hparams.data_type == "vox2":
                    pickle_f_name = data_id.replace("m4a", "pickle")

            elif self.hparams.data_type == "libri":
                data_id = "_".join(path.split("/")[-2:])
                save_dict["SpkId"] = path.split("/")[-2]
                save_dict["WavId"] = path.split("/")[-1]
                pickle_f_name = data_id.replace("wav", "pickle")
                print(pickle_f_name)

            with open(self.hparams.pk_dir + "/" + pickle_f_name, "wb") as f:
                pickle.dump(save_dict, f, protocol=3)
        else:
            print("wav length smaller than 1.6s: " + path)
Example 21
    def make_train_logfb(self, **kwarg):

        if "raw_filename" in kwarg.keys():
            raw_filename = kwarg["raw_filename"]
        if "logfb_filename" in kwarg.keys():
            logfb_filename = kwarg["logfb_filename"]

        num = 0
        for raw_f, logfb_f in zip(raw_filename, logfb_filename):
            raw_f = self.return_path+'\\'+raw_f

            loaded_data = np.load(raw_f, allow_pickle=True)

            label = loaded_data['label']
            raw_signal = loaded_data['data']
            sample_rate = loaded_data['rate']

            logfb_list = list()

            for sig, rate in zip(raw_signal, sample_rate):
                logfb_feat = logfbank(sig, rate)
                logfb_list.append(logfb_feat)
                num += 1
                print(num)

            logfb_data = np.asarray(logfb_list)

            fwb = open(logfb_f, 'wb')
            np.savez_compressed(fwb, label=label, data=logfb_data, rate=sample_rate)
            fwb.close()

        return
Example 22
def get_feature_vectors(file, directory, no_of_frames, start_frame):
    (rate, sig) = wav.read(os.path.join(directory, file))
    fbank_feat = logfbank(sig, rate, nfft=2048)
    mfcc_feat = mfcc(sig,
                     rate,
                     winlen=0.032,
                     winstep=0.016,
                     numcep=13,
                     nfft=2048)

    d_mfcc_feat = delta(mfcc_feat, 2)
    dd_mfcc_feat = delta(d_mfcc_feat, 2)

    mfcc_vectors = mfcc_feat[start_frame:start_frame +
                             no_of_frames, :no_of_features]
    dmfcc_vectors = d_mfcc_feat[start_frame:start_frame +
                                no_of_frames, :no_of_features]
    ddmfcc_vectors = dd_mfcc_feat[start_frame:start_frame +
                                  no_of_frames, :no_of_features]
    fbank_vectors = fbank_feat[start_frame:start_frame +
                               no_of_frames, :no_of_fbank_features]

    feature_vectors = numpy.hstack(
        (mfcc_vectors, dmfcc_vectors, ddmfcc_vectors, fbank_vectors))
    return feature_vectors
Example 23
def get_audio(audio_path, is_crop=True):
    try:
        # pdb.set_trace()
        wave_obj = wavio.read(audio_path)
        rate = wave_obj.rate
        sig = np.squeeze(wave_obj.data)
        # (rate,sig) = wav.read(ad)
    except TypeError:
        # print(ad)
        (rate,sig) = wav.read(audio_path)
    # only short than 10 seconds
    # if np.shape(sig)[0]/float(rate) > 10:
    #     sig = sig[0:rate*10]

    # Mel-filter bank
    sig = sig - np.mean(sig)
    fbank_feat = logfbank(sig, rate, winlen=0.025, winstep=0.01, nfilt=40,
                          nfft=512, lowfreq=0, highfreq=None, preemph=0.97)

    if is_crop:
        if fbank_feat.shape[0] < 1024:
            # pdb.set_trace()
            zero_pad = np.zeros((1024-fbank_feat.shape[0], 40))
            fbank_feat = np.concatenate([fbank_feat, zero_pad], 0)
        else:
            fbank_feat = fbank_feat[:1024]

    return fbank_feat
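The crop/pad above fixes the feature map at 1024 frames by 40 filters (roughly 10 s of audio at the 10 ms winstep). The same pad-or-truncate logic in isolation, on random features:

import numpy as np

feat = np.random.randn(700, 40)  # fewer than 1024 frames
if feat.shape[0] < 1024:
    feat = np.concatenate([feat, np.zeros((1024 - feat.shape[0], 40))], 0)
else:
    feat = feat[:1024]
print(feat.shape)  # (1024, 40)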
Example 24
def print_features():
    # Read the input audio file
    file = '/home/james/Documents/git/SpeechRecognition/hmm-speech-recognition-0.1/audio/apple/apple01.wav'
    sampling_freq, signal = wavfile.read(file)
    # Take the first 10,000 samples for analysis
    signal = signal[:10000]

    # Extract the MFCC features
    features_mfcc = mfcc(signal, sampling_freq)

    # Print the parameters for MFCC
    print('\nMFCC:\nNumber of windows =', features_mfcc.shape[0])
    print('Length of each feature =', features_mfcc.shape[1])

    # Plot the features
    features_mfcc = features_mfcc.T
    plt.matshow(features_mfcc)
    plt.title('MFCC')

    # Extract the Filter Bank features
    features_fb = logfbank(signal, sampling_freq)
    # Print the parameters for Filter Bank
    print('\nFilter bank:\nNumber of windows =', features_fb.shape[0])
    print('Length of each feature =', features_fb.shape[1])

    # Plot the features
    features_fb = features_fb.T
    plt.matshow(features_fb)
    plt.title('Filter bank')
    plt.show()
Example 25
def collect_features(samples, feature_type='mfcc', sr=48000, window_size_ms=20,
                     window_shift_ms=10, num_filters=40, num_mfcc=40,
                     window_function=None):
    '''Collects fbank and mfcc features.
    '''
    if not window_function:
        # default for python_speech_features:
        def window_function(x): return np.ones((x,))
    else:
        if 'hamming' in window_function:
            window_function = hamming
        elif 'hann' in window_function:
            window_function = hann
        else:
            # default for python_speech_features:
            def window_function(x): return np.ones((x,))
    if len(samples)/sr*1000 < window_size_ms:
        window_size_ms = len(samples)/sr*1000
    frame_length = calc_frame_length(window_size_ms, sr)
    if 'fbank' in feature_type:
        feats = logfbank(samples,
                         samplerate=sr,
                         winlen=window_size_ms * 0.001,
                         winstep=window_shift_ms * 0.001,
                         nfilt=num_filters,
                         nfft=frame_length)
    elif 'mfcc' in feature_type:
        feats = mfcc(samples,
                     samplerate=sr,
                     winlen=window_size_ms * 0.001,
                     winstep=window_shift_ms * 0.001,
                     nfilt=num_filters,
                     numcep=num_mfcc,
                     nfft=frame_length)
    return feats, frame_length, window_size_ms
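calc_frame_length is not shown in this snippet; a plausible sketch, assuming it simply returns the window length in samples (which would explain why it is reused as nfft, so the FFT covers exactly one frame):

def calc_frame_length(window_size_ms, sr):
    # Assumed behaviour: window length in samples.
    return int(window_size_ms * sr / 1000)

print(calc_frame_length(20, 48000))  # 960 samples, passed to logfbank/mfcc as nfft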
Example 26
def mffcRead(str):
    (rate, sig) = wav.read(str)
    mfcc_feat = mfcc(sig, rate)
    #mfcc_feat = mfcc(signal,samplerate=16000,winlen=0.025,winstep=0.01,numcep=13,nfilt=26,nfft=512,lowfreq=0,highfreq=None,preemph=0.97,ceplifter=22,appendEnergy=True)
    d_mfcc_feat = delta(mfcc_feat, 2)
    fbank_feat = logfbank(sig, rate)
    return fbank_feat
Example 27
def load_data(path):
    data = []
    label_index = np.array([], dtype=int)
    label_count = 0
    wav_files_count = 0

    for root, dirs, files in os.walk(path):
        # get all wav files in current dir
        wav_files = [file for file in files if file.endswith('.wav')]
        data_same_person = []
        # extract logfbank features from wav file
        for wav_file in wav_files:
            (rate, sig) = wav.read(root + "/" + wav_file)
            fbank_beats = logfbank(sig, rate, nfilt=40)
            # save logfbank features into same person array
            data_same_person.append(fbank_beats)

        # save all data of same person into the data array
        # the length of data array is number of speakers
        if wav_files:
            wav_files_count += len(wav_files)
            data.append(data_same_person)

    # return data, np.arange(len(data))
    return data
Example 28
def get_MFCC(sr, audio):

    features = mfcc.mfcc(audio, sr)

    #############################
    #                           #
    #      Noise Removal        #
    #                           #
    #############################

    features = mfcc.logfbank(
        audio)  #computes the filterbank energy from an audio signal
    features = mfcc.lifter(
        features)  #increases magnitude of high frequency DCT coefficients

    sum_of_squares = []
    index = -1

    for r in features:
        """
        Since signals can be either positive or negative, taking n**2 allows us to compare the magnitudes 
        """
        sum_of_squares.append(0)
        index = index + 1
        for n in r:
            sum_of_squares[index] = sum_of_squares[index] + n**2

    strongest_frame = sum_of_squares.index(max(sum_of_squares))
    hz = mfcc.mel2hz(features[strongest_frame]
                     )  # converts the strongest frame's mel-scale values to hertz

    max_hz = max(hz)
    min_hz = min(hz)

    speech_booster = AudioEffectsChain().lowshelf(
        frequency=min_hz * (-1), gain=20.0,
        slope=0.5)  #creates an audio booster that removes low hz
    y_speech_boosted = speech_booster(audio)  #apply booster to original audio

    #############################
    #                           #
    #  FINAL MFCC CALCULATION   #
    #                           #
    #############################

    features = mfcc.mfcc(y_speech_boosted,
                         sr,
                         0.025,
                         0.01,
                         16,
                         nfilt=40,
                         nfft=512,
                         appendEnergy=False,
                         winfunc=np.hamming)

    features = preprocessing.scale(
        features)  # standardize each column to zero mean and unit variance

    return features
Example 29
def load_and_fbank(filename, labels={}):
    X, y = [], []

    for i, line in enumerate(open(filename, 'r')):
        # if i >= 200:
        #     break

        line = line.strip()
        if line == '':
            continue
        if line.startswith('#'):
            continue

        row = line.split('\t')
        if len(row) < 2:
            sys.stderr.write('invalid record: {}\n'.format(line))
            continue

        wave_file, emotion = row

        if emotion not in labels:
            labels[emotion] = len(labels)

        with wave.open(os.path.join(root_dir, wave_file), 'rb') as file:
            params = file.getparams()
            nchannels, sampwidth, framerate, nframes, comptype, compname = params[:6]
            str_data = file.readframes(nframes)
            wavedata = np.frombuffer(str_data, dtype=np.short)  # np.fromstring is deprecated for binary data

        winlen = 0.025
        winstep = 0.01
        nfft = int(winlen * framerate)
        nfilt = 40

        mel_spec = ps.logfbank(wavedata,
                               samplerate=framerate,
                               winlen=winlen,
                               winstep=winstep,
                               nfilt=nfilt,
                               nfft=nfft)
        delta1 = ps.delta(mel_spec, 2)
        delta2 = ps.delta(delta1, 2)
        time = mel_spec.shape[0]

        # max_frames = 1024
        # max_frames = 300
        feature = np.empty((time, nfilt, 3), dtype=np.float32)
        feature[:, :, 0] = mel_spec
        feature[:, :, 1] = delta1
        feature[:, :, 2] = delta2

        X.append(feature)
        y.append(np.array([labels[emotion]], 'i'))

    logger.info('Loading dataset ... done.')
    sys.stdout.flush()

    return X, y, labels
Example 30
	def get_fbank_fea(self, wave_file):
		fs,audio = wav.read(wave_file)
		fbank_inputs = logfbank(audio, samplerate=fs)
		train_fbank_inputs = np.asarray(fbank_inputs[np.newaxis,:])
		train_fbank_inputs = (train_fbank_inputs-np.mean(train_fbank_inputs))/np.std(train_fbank_inputs)
		train_seq_len = [train_fbank_inputs.shape[1]]
		
		return train_fbank_inputs, train_seq_len
Example 31
def get_features(path):
    wav_name=path[path.rfind('/')+1:]
    # print 'wav_name:',wav_name
    label=wav_name[:3]

    # output_dir=sys.argv[2]
    output_dir='/home/sw/Shin/Codes/Deep-Rein4cement/One-shot-PGD/Omniglot/speech/dataset'
    output_dir=output_dir if output_dir[-1]=='/' else output_dir+'/'
    # print 'output_dir:',output_dir

    (rate,sig)=wav.read(path)
    logfbank0015_feat=logfbank(sig,rate,winstep=0.015,nfilt=40)
    feat_list=split_speech(logfbank0015_feat,100)
    return label,feat_list
Example 32
def extract_fbank(sound):
    (rate, sig) = wav.read(io.BytesIO(sound))  # binary buffer; StringIO.StringIO only worked on Python 2
    fbank_feat = features.logfbank(sig, rate)
    return fbank_feat
Example 33
    def frequencybank(self, window_size=0.05, step_size=0.05, truncate=0.6):
        """Returns a Mel-frequency filter bank for this segment."""
        x = self.signal
        if truncate:
            x = x[:int(truncate * self.sample_rate)]
        return logfbank(x, self.sample_rate, winlen=window_size, winstep=step_size)
Example 34
#!/usr/bin/env python

from python_speech_features import mfcc
from python_speech_features import delta
from python_speech_features import logfbank
import scipy.io.wavfile as wav

(rate,sig) = wav.read("english.wav")
mfcc_feat = mfcc(sig,rate)
d_mfcc_feat = delta(mfcc_feat, 2)
fbank_feat = logfbank(sig,rate)

print(fbank_feat[1:3,:])
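With python_speech_features defaults (25 ms window, 10 ms step, 26 filters), one second of 16 kHz audio yields 99 frames. A shape check using a synthetic signal in place of "english.wav":

import numpy as np
from python_speech_features import logfbank

sig = np.random.randn(16000)       # 1 s of synthetic audio at 16 kHz
print(logfbank(sig, 16000).shape)  # (99, 26)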
Example 35
# ^_^ coding:utf-8 ^_^

import numpy as np
from scipy.io import wavfile
import matplotlib.pyplot as plt
from python_speech_features import mfcc, logfbank

# Read the input audio file
sampling_freq, audio = wavfile.read('input_freq.wav')

# Extract MFCC and filter bank features
mfcc_features = mfcc(audio, sampling_freq)
filterbank_features = logfbank(audio, sampling_freq)

# Print the parameters
print('MFCC: Number of windows = {}'.format(mfcc_features.shape[0]))
print('Length of each feature = {}'.format(mfcc_features.shape[1]))
print('Filter bank: Number of windows = {}'.format(filterbank_features.shape[0]))
print('Length of each feature = {}'.format(filterbank_features.shape[1]))

# Plot the MFCC features
mfcc_features = mfcc_features.T
plt.matshow(mfcc_features)
plt.title('MFCC')

# Visualize the filter bank features
filterbank_features = filterbank_features.T
plt.matshow(filterbank_features)
plt.title('Filter bank')
plt.show()
Example 36
def calculates_log_fbank(data):
    return logfbank(data, samplerate=SAMPLE_RATE, winlen=0.02, winstep=0.01, nfilt=40)