Code example #1
File: speech.py  Project: ufal/neuralmonkey
    def preprocess(audio: Audio) -> np.ndarray:
        features = [FEATURE_TYPES[feature_type](
            audio.data, samplerate=audio.rate, **kwargs)]

        for _ in range(delta_order):
            features.append(delta(features[-1], delta_window))

        return np.concatenate(features, axis=1)
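The snippet above is a nested helper from neuralmonkey's speech.py: FEATURE_TYPES, feature_type, delta_order, delta_window, and kwargs all come from the enclosing scope. A minimal sketch of how such a factory might be wired up, assuming python_speech_features supplies the base features and an Audio object exposing .data and .rate (every name beyond the snippet is an assumption):

import numpy as np
from python_speech_features import mfcc, logfbank, delta

FEATURE_TYPES = {"mfcc": mfcc, "fbank": logfbank}  # assumed mapping

def make_preprocessor(feature_type="mfcc", delta_order=2, delta_window=2, **kwargs):
    def preprocess(audio):  # audio: object with .data (samples) and .rate (Hz)
        features = [FEATURE_TYPES[feature_type](
            audio.data, samplerate=audio.rate, **kwargs)]
        for _ in range(delta_order):
            features.append(delta(features[-1], delta_window))
        return np.concatenate(features, axis=1)
    return preprocess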
Code example #2
def mk_MFB(filename, sample_rate=c.SAMPLE_RATE, use_delta=c.USE_DELTA):
    audio, sr = librosa.load(filename, sr=sample_rate, mono=True)
    audio = audio.flatten()

    filter_banks, energies = fbank(audio,
                                   samplerate=sample_rate,
                                   nfilt=c.FILTER_BANK,
                                   winlen=0.025)
    delta_1 = delta(filter_banks, N=1)
    delta_2 = delta(delta_1, N=1)

    filter_banks = normalize_frames(filter_banks)
    delta_1 = normalize_frames(delta_1)
    delta_2 = normalize_frames(delta_2)

    if use_delta:
        frames_features = np.hstack([filter_banks, delta_1, delta_2])
    else:
        frames_features = filter_banks

    np.save(filename.replace('.wav', '.npy'), frames_features)

    return
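This and most of the later examples call python_speech_features.delta(feat, N), which fits a simple regression over the N preceding and N following frames, repeating the edge frames as padding. A minimal NumPy sketch of that standard formula, for illustration only:

import numpy as np

def delta_sketch(feat, N=2):
    # feat: (n_frames, n_coeffs); d_t = sum_{n=1..N} n*(c_{t+n} - c_{t-n}) / (2*sum_{n=1..N} n^2)
    denom = 2 * sum(n ** 2 for n in range(1, N + 1))
    padded = np.pad(feat, ((N, N), (0, 0)), mode="edge")  # repeat edge frames
    out = np.zeros_like(feat, dtype=float)
    for t in range(feat.shape[0]):
        out[t] = sum(n * (padded[t + N + n] - padded[t + N - n])
                     for n in range(1, N + 1)) / denom
    return out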
Code example #3
def create_mfcc(filename):
    """Perform standard preprocessing, as described by Alex Graves (2012)
	http://www.cs.toronto.edu/~graves/preprint.pdf
	Output consists of 12 MFCC and 1 energy, as well as the first derivative of these.
	[1 energy, 12 MFCC, 1 diff(energy), 12 diff(MFCC)
	"""

    (rate, sample) = wav.read(filename)

    mfcc = features.mfcc(sample,
                         rate,
                         winlen=0.025,
                         winstep=0.01,
                         numcep=13,
                         nfilt=26,
                         preemph=0.97,
                         appendEnergy=True)
    d_mfcc = features.delta(mfcc, 2)
    a_mfcc = features.delta(d_mfcc, 2)

    out = np.concatenate([mfcc, d_mfcc, a_mfcc], axis=1)

    return out, out.shape[0]
Code example #4
def extract_features(samples,
                     sample_rate,
                     win_len,
                     win_shift,
                     win_fun=np.hamming):
    """
    Computes 13 MFCC + delta + delta-delta features for an utterance.

    :param samples: samples of the utterance, numpy array of shape (n_samples,)
    :param sample_rate: sampling rate
    :param win_len: window length (in seconds)
    :param win_shift: window shift (in seconds)
    :param win_fun: window function
    :return: numpy array of shape (n_frames, n_features), where n_features=39
    """
    mfcc = pss.mfcc(samples,
                    sample_rate,
                    winlen=win_len,
                    winstep=win_shift,
                    winfunc=win_fun)
    delta = pss.delta(mfcc, 3)
    delta_delta = pss.delta(delta, 3)
    return np.concatenate((mfcc, delta, delta_delta), axis=1)
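A usage sketch for extract_features above (the wav path is illustrative); it assumes `import python_speech_features as pss`, as the function body expects:

import scipy.io.wavfile as wav
import python_speech_features as pss  # alias used inside extract_features

rate, samples = wav.read("utterance.wav")  # illustrative mono file
feats = extract_features(samples, rate, win_len=0.025, win_shift=0.01)
print(feats.shape)  # (n_frames, 39): 13 MFCC + 13 delta + 13 delta-delta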
Code example #5
def mfcc(wav_path):
    """ Grabs MFCC features with energy and derivates. """

    (rate, sig) = wav.read(wav_path)
    feat = python_speech_features.mfcc(sig, rate, appendEnergy=True)
    delta_feat = python_speech_features.delta(feat, 2)
    all_feats = [feat, delta_feat]
    all_feats = np.array(all_feats)
    # Make time the first dimension for easy length normalization padding later.
    all_feats = np.swapaxes(all_feats, 0, 1)
    all_feats = np.swapaxes(all_feats, 1, 2)

    feat_fn = wav_path[:-3] + "mfcc13_d.npy"
    np.save(feat_fn, all_feats)
Code example #6
def get_mfcc_v2(y,
                sr,
                n_mfcc=13,
                tgt_sr=16000,
                win_len=0.025,
                hop_len=0.010,
                n_fft=512,
                n_mels=22,
                fmin=0.0,
                fmax=None,
                cep_lifter=22,
                pre_emph=0.97,
                win_func=lambda x: np.ones((x, )),
                append_energy=True,
                delta=True,
                delta_delta=True):
    if sr != tgt_sr:
        y = librosa.core.resample(y, orig_sr=sr, target_sr=tgt_sr)
    mfccs = python_speech_features.mfcc(y,
                                        tgt_sr,
                                        winlen=win_len,
                                        winstep=hop_len,
                                        numcep=n_mfcc,
                                        nfilt=n_mels,
                                        nfft=n_fft,
                                        lowfreq=fmin,
                                        highfreq=fmax,
                                        preemph=pre_emph,
                                        ceplifter=cep_lifter,
                                        appendEnergy=append_energy,
                                        winfunc=win_func)
    features = [mfccs]
    if delta:
        features.append(python_speech_features.delta(mfccs, 1))
    if delta_delta:
        # note: computed from the raw MFCCs with a wider window (N=2),
        # not as the delta of the delta features
        features.append(python_speech_features.delta(mfccs, 2))
    return np.hstack(features)
Code example #7
    def extract_feature_for_audio(self, audio_file_path):
        # load the wav file to an array
        signal, sr = librosa.load(audio_file_path,
                                  mono=True,
                                  sr=self.sample_rate)

        # trim the leading and trailing silence
        signal_trimed, index = librosa.effects.trim(signal,
                                                    top_db=self.silence_cutoff)

        # extract the mfcc feature, for details about mfcc,
        # see: http://www.practicalcryptography.com/miscellaneous/machine-learning/guide-mel-frequency-cepstral-coefficients-mfccs/
        MFCC = mfcc(signal_trimed,
                    self.sample_rate,
                    winlen=self.winlen,
                    winstep=self.winlen / 2,
                    winfunc=numpy.hamming,
                    nfft=self.nfft,
                    numcep=self.numcep)

        # do not use the first mfcc coefficient
        features = MFCC[:, 1:self.numcep]

        # calculate the delta of the mfcc and add it to the features
        Delta = delta(MFCC, 2)
        features = numpy.column_stack((features, Delta))

        # calculate the delta of the delta of the mfcc and add it to the features
        Acc = delta(Delta, 2)
        features = numpy.column_stack((features, Acc))

        # total number of features would be the number of columns of the `features` array
        self.num_features = features.shape[1]

        # each audio file is transformed into a numpy array of shape (N, self.num_features),
        # where N is the number of frames extracted from the audio file by the mfcc function
        return features
Code example #8
    def process_mel(self, mel_input):

        #mel_input [80,344]
        mel_input = mel_input.T  #[344,80]
        delta1 = ps.delta(mel_input, 2)
        delta2 = ps.delta(delta1, 2)

        time = mel_input.shape[0]
        mel = np.pad(mel_input, ((0, 800 - time), (0, 0)),
                     'constant',
                     constant_values=0)  #[800,80]
        delta1 = np.pad(delta1, ((0, 800 - time), (0, 0)),
                        'constant',
                        constant_values=0)
        delta2 = np.pad(delta2, ((0, 800 - time), (0, 0)),
                        'constant',
                        constant_values=0)

        mel_output = np.zeros((3, 800, 80))
        mel_output[0, :, :] = mel
        mel_output[1, :, :] = delta1
        mel_output[2, :, :] = delta2

        return mel_output
Code example #9
    def get_mfcc(self, data, fs):

        wav_feature = mfcc(data,
                           fs,
                           numcep=self.numc,
                           winlen=c.FRAME_LEN,
                           winstep=c.FRAME_STEP,
                           nfilt=26,
                           nfft=c.NUM_FFT)
        #print(wav_feature.shape,"  before",type(wav_feature))

        reserve_length = wav_feature.shape[0] - wav_feature.shape[0] % 100
        d_wav_feature_1 = delta(wav_feature, 2)
        d_wav_feature_2 = delta(d_wav_feature_1, 2)
        mfcc_feat_normal = normalize_frames(wav_feature.T)
        d_mfcc_feat_1_normal = normalize_frames(d_wav_feature_1.T)
        d_mfcc_feat_2_normal = normalize_frames(d_wav_feature_2.T)
        mfcc_feature = [
            mfcc_feat_normal, d_mfcc_feat_1_normal, d_mfcc_feat_2_normal
        ]
        mfcc_feature = torch.tensor(mfcc_feature)
        length = (reserve_length / 100 - 1) / 2
        total_length = (int(length) * 2 + 1) * 2
        index = torch.randperm(total_length)
        if self.mode == "train":
            feature = torch.zeros([int(length), 3, self.numc, 300])
            for r in range(int(length)):
                for i in range(6):
                    feature[r, :, :, i * 50:(i + 1) * 50] =\
                        mfcc_feature[:, :, index[r * 4 + i] * 50:(index[r * 4 + i] + 1) * 50]
            feature = feature.permute(0, 1, 3, 2)
        else:
            feature = mfcc_feature[:, :, 0:reserve_length]
            feature = feature.unsqueeze(0)
            feature = feature.permute(0, 1, 3, 2)
        return feature
Code example #10
def audio_feature(signal,
                  samplerate=16000,
                  winlen=0.025,
                  winstep=0.01,
                  numcep=13,
                  nfilt=40,
                  nfft=512,
                  lowfreq=0,
                  highfreq=None,
                  preemph=0.97,
                  ceplifter=22,
                  appendEnergy=True,
                  winfunc=np.hamming):
    feat, energy = fbank(signal, samplerate, winlen, winstep, nfilt, nfft,
                         lowfreq, highfreq, preemph, winfunc)
    log_fbank = np.log(feat)
    # discard the 0-th dct coefficient
    mfcc = dct(log_fbank, type=2, axis=1, norm='ortho')[:, 1:numcep]
    mfcc = lifter(mfcc, ceplifter)
    d1_mfcc = delta(mfcc, 1)
    d2_mfcc = delta(d1_mfcc, 1)
    energy = np.reshape(np.log(energy), (energy.shape[0], 1))
    mixed = np.concatenate((mfcc, d1_mfcc, d2_mfcc, energy), axis=1)
    return mixed
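A usage sketch for audio_feature above; it assumes fbank, lifter, and delta come from python_speech_features and dct from scipy.fftpack, as the snippet's calls suggest (the wav path is illustrative):

import numpy as np
import scipy.io.wavfile as wav
from scipy.fftpack import dct
from python_speech_features import fbank, lifter, delta

rate, sig = wav.read("sample.wav")  # illustrative path
feats = audio_feature(sig, samplerate=rate)
print(feats.shape)  # (n_frames, 37): 12 MFCC + 12 delta + 12 delta-delta + 1 log energy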
Code example #11
def fbank(wav_path, flat=True):
    """ Currently grabs log Mel filterbank, deltas and double deltas."""

    (rate, sig) = wav.read(wav_path)
    if len(sig) == 0:
        logger.warning("Empty wav: {}".format(wav_path))
    fbank_feat = python_speech_features.logfbank(sig, rate, nfilt=40)
    energy = extract_energy(rate, sig)
    feat = np.hstack([energy, fbank_feat])
    delta_feat = python_speech_features.delta(feat, 2)
    delta_delta_feat = python_speech_features.delta(delta_feat, 2)
    all_feats = [feat, delta_feat, delta_delta_feat]
    if not flat:
        all_feats = np.array(all_feats)
        # Make time the first dimension for easy length normalization padding
        # later.
        all_feats = np.swapaxes(all_feats, 0, 1)
        all_feats = np.swapaxes(all_feats, 1, 2)
    else:
        all_feats = np.concatenate(all_feats, axis=1)

    # Log Mel Filterbank, with delta, and double delta
    feat_fn = wav_path[:-3] + "fbank.npy"
    np.save(feat_fn, all_feats)
Code example #12
    def mfccProc2(self, results_dict):
        (rate, sig) = audioBasicIO.readAudioFile(self.fname)
        # Create 2d array for MFCC features
        mfcc_feat = mfcc(sig, samplerate=44100, nfft=1103)
        # Create 2d array for the delta of MFCC features
        d_mfcc_feat = delta(mfcc_feat, 2)
        # Create 2d array for the log of fbank features
        fbank_feat = logfbank(sig, rate)
        dev_array = []
        for i in mfcc_feat:
            temp = stdev(i)
            dev_array.append(temp)
        tone = stdev(dev_array)
        results_dict["tone"] = tone
        return (mfcc_feat)
Code example #13
def log_mel_filterbank(audio, sample_rate, window_size, step_size):
    """Returns the log of the mel filterbank energies as well as the first and second order deltas.
    Hanning window used for parity with log_spectrogram function.

    Args:
        audio (np.ndarray): audio signal array
        sample_rate (int): sample_rate of signal
        window_size (int): window size
        step_size (int): step size

    Returns:
        np.ndarray: log mel filterbank, delta, and delta-deltas
    """
    delta_window = 1
    log_mel = python_speech_features.base.logfbank(audio,
                                                   sample_rate,
                                                   winlen=window_size / 1000,
                                                   winstep=step_size / 1000,
                                                   winfunc=np.hanning)
    delta = python_speech_features.delta(log_mel, N=delta_window)
    delta_delta = python_speech_features.delta(delta, N=delta_window)
    output = np.concatenate((log_mel, delta, delta_delta), axis=1)

    return output.astype(np.float32)
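A usage sketch for log_mel_filterbank, assuming a 16 kHz mono signal and the common 25 ms window / 10 ms step (logfbank defaults to 26 filters):

import numpy as np

audio = np.random.randn(16000)  # illustrative one-second signal
feats = log_mel_filterbank(audio, 16000, window_size=25, step_size=10)
print(feats.shape)  # (n_frames, 78): 26 log-mel + 26 delta + 26 delta-delta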
Code example #14
def mfcc(y, sr, numcep=13, delta=False, delta_delta=False, width=2, **kwargs):
    """
    Compute MFCCs of Audio Signal
    :param y: Audio signal
    :param sr: Original sample rate
    :param numcep: Number of MFCCs to compute
    :param delta: Whether deltas of the MFCCs are required
    :param delta_delta: Whether accelerations (delta-deltas) of the MFCCs are required
    :param width: Number of samples to consider for computing delta
    :param kwargs: Other parameters to pass on python_speech_features like hop length etc.
    :return: MFCCs (numpy array of shape n_frames * n_mfccs)
    """
    mfccs = python_speech_features.mfcc(signal=y,
                                        samplerate=sr,
                                        numcep=numcep,
                                        **kwargs)
    if delta:
        d1 = python_speech_features.delta(mfccs, N=width)
        mfccs = np.hstack((mfccs, d1))
    if delta_delta:
        d2 = python_speech_features.delta(mfccs[:, mfccs.shape[1] // 2:],
                                          N=width)
        mfccs = np.hstack((mfccs, d2))
    return mfccs
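A hedged usage sketch for the wrapper above, given a signal y at rate sr (illustrative values); extra keyword arguments such as winstep pass through to python_speech_features.mfcc:

import numpy as np

y, sr = np.random.randn(16000), 16000  # illustrative signal
feats = mfcc(y, sr, numcep=13, delta=True, delta_delta=True, width=2,
             winstep=0.01)
print(feats.shape)  # (n_frames, 39): 13 MFCC + 13 delta + 13 delta-delta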
Code example #15
def test_all_files(path, hmm_list):
    global false_list, word_list, false_number, correct_number
    file_list = []
    files = os.listdir(path)
    for file_name in files:
        for u in word_list:
            if u in file_name:
                file_list.append(file_name)

    for file in file_list:

        for labels in word_list:
            if labels in file:
                tested_label = labels
                break
        #tested_label = file[ :-6]
        print("Tested label : ", tested_label)

        audio = AudioSegment.from_file(path + "/" + file,
                                       format="wav",
                                       frame_rate=32000)
        audio = audio.set_frame_rate(16000)
        audio.export("filtered-talk1.wav", format="wav")

        (rate, sig) = wav.read("filtered-talk1.wav")
        # print "Read : " , ses_yol + word + "/" + word + str ( i / 10 ) + str ( i %10 ) +".wav"
        mfcc_feat = mfcc(sig, rate, nfft=1536)
        d_mfcc_feat = delta(mfcc_feat, 2)

        data = np.concatenate((mfcc_feat, d_mfcc_feat), axis=1).tolist()

        for label in word_list:
            if label in file:
                break

        data = scalers[label].transform(data)
        vector = get_score_vector(data, hmm_list)
        vector = nn_scaler.transform([vector])
        print("Vector : ", vector.shape)
        predicted = trained.predict(vector)
        print("type : ", type(predicted[0]))
        print("Res : ", str(predicted[0]))
        print("Prediction ", trained.predict_proba(vector))
        if predicted[0] != tested_label:
            false_list.append(file + " predicted answer : " + predicted[0])
            false_number += 1
        else:
            correct_number += 1
Code example #16
def convert_to_vector(filename):
    (rate, sig) = wav.read(filename)
    mfcc_feat = mfcc(sig, rate)
    d_mfcc_feat = delta(mfcc_feat, 2)
    fbank_feat = logfbank(sig, rate)
    #print(fbank_feat)
    print("######################")
    vector1 = (fbank_feat[1:3, :][1])
    #print(vector1)
    vector2 = (fbank_feat[1:3, :][0])

    #print vector2
    print("######################")
    z = np.hstack((vector1, vector2))
    #	vector.extend(list(fbank_feat[1:3,:][1]))
    return z
Code example #17
    def mfcc_delta_feature_extraction(self, data):
        vectorized_data = []
        hop_length = int(self.wnd_step * self.sample_rate)
        win_length = int(self.wnd_len * self.sample_rate)
        for d in data:
            mfcc_librosa = mfcc(d,
                                sr=self.sample_rate,
                                n_mfcc=self.num_features,
                                hop_length=hop_length,
                                win_length=win_length)
            # mfcc_librosa = mfcc(d, sr=self.sample_rate, n_mfcc=self.num_features)
            mfcc_f = np.transpose(mfcc_librosa)
            delta_f = delta(mfcc_f, 8)
            v_d = np.append(mfcc_f, delta_f, axis=1)
            vectorized_data.append(v_d)
        return vectorized_data
Code example #18
def get_24_coefficients(filenames,
                        data_dir=os.path.join(
                            'Support_CentraleDigitale_Lab_201920',
                            'Data_Submarin', 'Dataset_J1')):
    """Input : audio recordings filenames list of a same speaker
    For instance, filenames_MJPM = ['MJPM-1','MJPM-2','MJPM-3']
    Output : Array with 24 columns corresponding to the MFC Coefficients 2-13, 
    and the delta MFC Coefficients of these MFC Coefficients.
    It has I rows corresponding to speaking frames of the recordings."""
    # The 12 MFC Coefficients 2-13
    mfcc_speaker_speaking = mfcc_locuteur_speaking(filenames, data_dir)
    # The 12 delta MFC Coefficients corresponding to the previous coefficients
    d_mfcc_speaker_speaking = delta(mfcc_speaker_speaking, 2)
    mfcc_24_coeffs = np.hstack(
        (mfcc_speaker_speaking, d_mfcc_speaker_speaking))
    return mfcc_24_coeffs
Code example #19
def test_features(fname):

    #os.chdir(path)
    #feat=np.zeros((1,27))
    #fnames=[x for x in os.listdir(path) if x[-3:]=="wav"]
    (rate, sig) = wav.read(fname)
    #sig=sig[:,1]
    fr_l = math.floor(rate * 0.025)
    mfcc_feat = mfcc(sig, rate, nfft=fr_l + 1)
    d_mfcc_feat = delta(mfcc_feat, 2)
    feat = np.zeros((len(d_mfcc_feat), 26))
    feat[:, :13] = mfcc_feat
    feat[:, 13:] = d_mfcc_feat

    feat_std = preprocessing.scale(feat)

    return feat_std
Code example #20
def compute_mfcc(filename):
    (rate, sig) = wav.read(filename)
    m = mfcc(sig,
             samplerate=rate,
             winlen=0.025,
             winstep=0.01,
             numcep=13,
             nfilt=40,
             nfft=512,
             lowfreq=0,
             highfreq=None,
             preemph=0,
             ceplifter=22,
             appendEnergy=True)
    m = delta(m, 6)  # note: overwrites the MFCCs; only the delta features are returned
    #m = delta(m, 2)
    return m
Code example #21
def mfcc_with_delta(
        audio, samplerate, n_features, n_channels,
        **kwargs):  # when this function is called, it is attached to the preprocessor function above and invoked together with it
    """Calculate Mel-frequency cepstral coefficients, and calculate delta
  features if requested."""
    tmp = _features.mfcc(
        audio, samplerate, numcep=n_features, **kwargs
    )  # python_speech_features.mfcc; audio must be a 1-d array here... when does it get changed?
    # return a numpy array of size containing features.
    # Each row holds 1 feature vector.
    tmp -= _np.mean(tmp, axis=0) + 1e-8
    result = [tmp]
    for _ in range(1, n_channels):
        tmp = _features.delta(tmp, 2)
        result.append(tmp)
    result = _np.stack(result, axis=2)
    return result
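A hedged usage sketch for mfcc_with_delta: given a 1-D audio array at 16 kHz (names illustrative), three channels yield static, delta, and delta-of-delta planes stacked on the last axis:

import numpy as np

audio = np.random.randn(16000)  # one second of illustrative noise
out = mfcc_with_delta(audio, 16000, n_features=13, n_channels=3)
print(out.shape)  # (n_frames, 13, 3): mean-normalized MFCCs, delta, delta-delta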
Code example #22
    def featureList(self, path):
        # obj = Silence()
        # newpath = os.path.splitext(path)[0] + "_silenced" + os.path.splitext(path)[1]
        # obj.silencemain(path,newpath)

        (rate, sig) = wav.read(path)
        print "___________________path_________________"
        print sig.shape
        print rate
        print path
        mfcc_feat = mfcc(sig, rate)
        d_mfcc_feat = delta(mfcc_feat, 2)
        fbank_feat = logfbank(sig, rate)

        print "file:feature.py line:24"
        print fbank_feat.shape

        return fbank_feat
Code example #23
def extract_features(audio, rate):
    """extract 20 dim mfcc features from an audio, performs CMS and combines 
    delta to make it 40 dim feature vector"""

    mfcc_feature = mfcc.mfcc(audio,
                             rate,
                             0.025,
                             0.01,
                             26,
                             nfft=1200,
                             preemph=0.97,
                             appendEnergy=True)
    # mfcc_feature = preprocessing.scale(mfcc_feature)
    mfcc_feature1 = mfcc.logfbank(audio, rate, 0.025, 0.01, 26, nfft=1200)
    mfcc_feature2 = mfcc.ssc(audio, rate, 0.025, 0.01, 26, nfft=1200)
    delta = mfcc.delta(mfcc_feature, 26)
    combined = np.hstack((mfcc_feature, delta, mfcc_feature1, mfcc_feature2))
    return combined
Code example #24
    def extract_feature(self, path):
        fs, y = wavfile.read(path)
        y = y / np.max(abs(y))
        mfcc_feat = mfcc(y, fs)
        mfcc_feat = delta(mfcc_feat, 2)
        data = pad_sequences(mfcc_feat.T,
                             self.max_frames,
                             dtype=float,
                             padding='post',
                             truncating='post').T
        # Map the parent directory name to a class label (replaces a long elif chain).
        label_map = {
            '0.Background': 0, '1.Bat den': 1, '2.Tat den': 2,
            '3.Bat dieu hoa': 3, '4.Tat dieu hoa': 4, '5.Bat quat': 5,
            '6.Tat quat': 6, '7.Bat tivi': 7, '8.Tat tivi': 8,
            '9.Mo cua': 9, '10.Dong cua': 10, '11.Khoa cua': 11,
            '12.Mo cong': 12, '13.Dong cong': 13, '14.Khoa cong': 14,
            '15.Doremon': 15,
        }
        target = label_map[path.split('/')[-2]]

        return data, target
Code example #25
    def extract_features(self,
                         signal,
                         rate,
                         winlen=WINLEN,
                         winstep=WINSTEP,
                         nfft=NFFT,
                         n=N):
        # signal = self._scale_signal(signal)  # lowers accuracy by 4% !?
        # signal = self._cut_the_tips(signal)  # lowers accuracy by 1%

        mfcc_values = mfcc(signal=signal,
                           samplerate=rate,
                           winlen=winlen,
                           winstep=winstep,
                           nfft=nfft)
        dmfcc_values = delta(mfcc_values, n)
        result = np.append(mfcc_values, dmfcc_values, axis=1)

        return result
Code example #26
    def mfcc_features(self, audio, rate, numcep=20, nfft=2000, N=2):
        """
        Returns the MFCC and delta MFCC features of the given audio, stacked together horizontally
        Parameters:
        :audio: The audio signal for which MFCC features must be computed
        :rate: The sample rate of the audio file
        :numcep: The number of cepstra to return, default 20
        :nfft: The FFT size, default 2000
        :N: Calculate delta features based on the preceding and following N frames, default 2
        Return Value: A numpy array with the MFCC and delta MFCC features, stacked horizontally
        """
        self.mfcc = python_speech_features.mfcc(audio, rate, numcep=numcep, nfft=nfft)
        #self.mfcc = preprocessing.scale(self.mfcc)

        self.delta_mfcc = python_speech_features.delta(self.mfcc, N)

        self.mfcc_feature = np.hstack((self.mfcc, self.delta_mfcc))

        return self.mfcc_feature
Code example #27
def load_all_wav_into_csv():
    metadata = open_file_read('../accents_data/speakers_all.csv')
    count = 0
    for file in metadata:
        if file[FILE_MISSING_IDX] == "FALSE" and file[
                FIRST_LANGUAGE_IDX] in ACCEPTED_LANGUAGES:
            count += 1
            filename_wav = os.path.join(
                dirname, '../accents_data/recordings_wav/' +
                file[FILE_NAME_IDX] + '.wav')

            rate, sig = make_standard_length(filename_wav)
            mfcc_feat = mfcc(sig, rate, nfft=1200, nfilt=13)
            d_mfcc_feat = delta(mfcc_feat, 2)
            fbank_feat = logfbank(sig, rate, nfft=1200, nfilt=13)

            write_to_csv('../accents_data/recordings_csv/' + file[3] + '.csv',
                         fbank_feat)

            progress(count, NUMBER_OF_FILES, "Generating MFCCs")
Code example #28
def run(sec):
    n = int(rate * sec)  # multiply the leading non-speech interval by the frame rate, keeping only the range to process

    for x, pcmname in enumerate(pcmnames):
        data = read_pcm(pcmname)  # read the pcm file
        data = data[:n]           # keep only the leading interval, according to sec

        print("[%s]\nstart: open %s" % (ctime(), pcmname))

        # compute the MFCCs and their mean
        mfcc_feature = mfcc(data, rate, winlen=length, winstep=step, numcep=n_feature)
        mfcc_mean = np.mean(mfcc_feature.T, axis=1).astype(str)

        # compute the deltas from the MFCCs, and their mean
        d_mfcc_feat = delta(mfcc_feature, 2)
        d_mfcc_mean = np.mean(d_mfcc_feat.T, axis=1).astype(str)
        #print(d_mfcc_mean)

        # compute the delta-deltas
        #d2_mfcc_feat = delta(d_mfcc_feat, 2)
        #d2_mfcc_mean = np.mean(d2_mfcc_feat.T, axis=1).astype(str)

        # collect the results (pcm filename, MFCCs, deltas)
        rslt = np.array([pcmname], dtype=str)
        rslt = np.append(rslt, mfcc_mean)
        rslt = np.append(rslt, d_mfcc_mean)

        # stack the result onto one large output array
        if x == 0:
            out = np.array([rslt])
        else:
            out = np.append(out, np.array([rslt]), axis=0)  # fixed: was np.array([out])

        print("[%s]\ndone: get features from %s" % (ctime(), pcmname))

    # once all pcm files have been read, write the output in tsv format
    ms = sec * 1000
    outname = "mfcc_feature_%sms.tsv" % ms
    np.savetxt(outname, out, delimiter="\t", fmt="%s")
Code example #29
File: input_functions.py  Project: hellonlp/ctc-asr
def __mfcc(audio_data, sampling_rate, win_len, win_step, num_features, n_fft,
           f_min, f_max):
    """
    Convert a wav signal into Mel Frequency Cepstral Coefficients (MFCC).

    Args:
        audio_data (np.ndarray): Wav signal.
        sampling_rate (int):  Sampling rate.
        win_len (float): Window length in seconds.
        win_step (float): Window stride in seconds.
        num_features (int): Number of features to generate.
        n_fft (int): FFT size (number of FFT points).
        f_min (float): Minimum frequency to consider.
        f_max (float): Maximum frequency to consider.

    Returns:
        np.ndarray: MFCC feature vectors. Shape: [time, num_features]
    """
    if num_features % 2 != 0:
        raise ValueError('num_features is not a multiple of 2.')

    # Compute MFCC features.
    mfcc = psf.mfcc(signal=audio_data,
                    samplerate=sampling_rate,
                    winlen=win_len,
                    winstep=win_step,
                    numcep=num_features // 2,
                    nfilt=num_features,
                    nfft=n_fft,
                    lowfreq=f_min,
                    highfreq=f_max,
                    preemph=0.97,
                    ceplifter=22,
                    appendEnergy=True)

    # And the first-order differences (delta features).
    mfcc_delta = psf.delta(mfcc, 2)

    # Combine MFCC with MFCC_delta
    return np.concatenate([mfcc, mfcc_delta], axis=1)
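A usage sketch for __mfcc above (audio_data is an illustrative 1-D wav array); num_features is split between cepstra and filters, so num_features=26 yields 13 MFCCs plus their 13 deltas per frame:

import numpy as np

audio_data = np.random.randn(16000)  # illustrative one-second signal
feats = __mfcc(audio_data, 16000, win_len=0.025, win_step=0.01,
               num_features=26, n_fft=512, f_min=0.0, f_max=8000.0)
print(feats.shape)  # (time, 26)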
Code example #30
File: preprocess.py  Project: kashizui/bird-brain
def process_timit_psf(path, output_path):
    for dirpath, dirnames, filenames in os.walk(path):
        for filename in filenames:
            if filename[-4:] == ".wav":
                full_path = os.path.join(dirpath, filename)
                wave, sr = librosa.load(full_path, mono=True, sr=16000)
                mfcc_features = python_speech_features.mfcc(wave,
                                                            samplerate=sr,
                                                            numcep=13,
                                                            nfilt=26,
                                                            appendEnergy=True,
                                                            winlen=0.025,
                                                            winstep=0.01)
                delta_features = python_speech_features.delta(mfcc_features, 9)

                output_filename = os.path.join(dirpath,
                                               filename[:-4] + "_mfcc")
                print(output_filename)
                concat_features = np.concatenate(
                    (mfcc_features, delta_features), axis=1).T
                print(concat_features.shape)
                np.save(output_filename, concat_features, allow_pickle=False)
Code example #31
def MFCC(LOCATION, SAVELOCATION):
    # NOTE: Currently using a library for generating MFCCs, to make sure that
    # the implementation of the EM algorithm is based on correct values. Own
    # implementation of MFCCs can be found commented out below.
    data = genfromtxt(LOCATION, delimiter=",")
    signal = data[:64000].reshape(64000, 1)

    mfcc_raw = mfcc(signal, 16000)                  # (n_frames, 13)
    mfcc_feat = np.transpose(mfcc_raw)              # (13, n_frames)
    d_mfcc_feat = np.transpose(delta(mfcc_raw, 2))  # deltas along the time axis, then transposed


    # Concatenate matrices: the MFCC matrix sits on top of the delta matrix
    mfcc_deltas = np.vstack((mfcc_feat, d_mfcc_feat))
    np.savetxt(SAVELOCATION, mfcc_deltas, delimiter=",")
    return mfcc_deltas
Code example #32
#!/usr/bin/env python

from python_speech_features import mfcc
from python_speech_features import delta
from python_speech_features import logfbank
import scipy.io.wavfile as wav

(rate, sig) = wav.read("english.wav")
mfcc_feat = mfcc(sig, rate)
d_mfcc_feat = delta(mfcc_feat, 2)
fbank_feat = logfbank(sig, rate)

print(fbank_feat[1:3, :])