Example 1
 def from_wav_to_mfcc(self, k):
     """
       :param k: index of the sentence (wrt the list 'EMA_files'
       :return: the acoustic features( K,429); where K in the # of frames.
       calculations of the mfcc with librosa , + Delta and DeltaDelta, + 10 context frames
       # of acoustic features per frame: 13 ==> 13*3 = 39 ==> 39*11 = 429.
       parameters for mfcc calculation are defined in class_corpus
       """
     path_wav = os.path.join(self.path_files_brutes, "wav_cut",
                             self.EMA_files_2[k] + '.wav')
     data, sr = librosa.load(
         path_wav,
         sr=self.sampling_rate_wav_wanted)  # load the audio at the wanted sampling rate
     mfcc = librosa.feature.mfcc(y=data,
                                 sr=self.sampling_rate_wav_wanted,
                                 n_mfcc=self.n_coeff,
                                 n_fft=self.frame_length,
                                 hop_length=self.hop_length).T
     dyna_features = get_delta_features(mfcc)  # Delta
     dyna_features_2 = get_delta_features(dyna_features)  # DeltaDelta
     mfcc = np.concatenate((mfcc, dyna_features, dyna_features_2), axis=1)  # (K, 39)
     padding = np.zeros((self.window, mfcc.shape[1]))
     frames = np.concatenate([padding, mfcc, padding])  # zero-pad 'window' frames on each side
     full_window = 1 + 2 * self.window  # 11-frame context window
     mfcc = np.concatenate(
         [frames[j:j + len(mfcc)] for j in range(full_window)], axis=1)  # (K, 429)
     return mfcc
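
The arithmetic in the docstring (13 MFCCs, ×3 with Delta and DeltaDelta, ×11 with ±5 context frames = 429) follows from the zero-padding-and-stacking trick at the end of the method. A standalone sketch with dummy data (window = 5 and the shape values come from the snippets above; everything else is illustrative):

    import numpy as np

    K, n_feats, window = 100, 39, 5       # 39 = 13 MFCC + 13 Delta + 13 DeltaDelta
    mfcc = np.random.randn(K, n_feats)    # dummy (K, 39) feature matrix
    padding = np.zeros((window, n_feats))
    frames = np.concatenate([padding, mfcc, padding])   # (K + 10, 39)
    full_window = 1 + 2 * window          # 11 frames: center, 5 left, 5 right
    stacked = np.concatenate(
        [frames[j:j + K] for j in range(full_window)], axis=1)
    print(stacked.shape)                  # (100, 429) == (K, 39 * 11)

Row i of the stacked matrix is the concatenation of frames i-5 .. i+5 of the 39-dimensional features, with zeros wherever the window runs past either end of the utterance.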
Example 2

    def from_wav_to_mfcc(self, wav):
        mfcc = librosa.feature.mfcc(y=wav,
                                    sr=self.sampling_rate_wav,
                                    n_mfcc=self.n_coeff,
                                    n_fft=self.frame_length,
                                    hop_length=self.hop_length).T

        dyna_features = get_delta_features(mfcc)
        dyna_features_2 = get_delta_features(dyna_features)
        mfcc = np.concatenate((mfcc, dyna_features, dyna_features_2), axis=1)
        padding = np.zeros((self.window, mfcc.shape[1]))
        frames = np.concatenate([padding, mfcc, padding])
        full_window = 1 + 2 * self.window
        mfcc = np.concatenate(
            [frames[i:i + len(mfcc)] for i in range(full_window)], axis=1)
        return mfcc
Example 3
 def from_wav_to_mfcc(self, wav):
     """
     :param wav: list of intensity points of the wav file
     :return: the acoustic features( K,429); where K in the # of frames.
     calculations of the mfcc with librosa , + Delta and DeltaDelta, + 10 context frames
     # of acoustic features per frame: 13 ==> 13*3 = 39 ==> 39*11 = 429.
     parameters for mfcc calculation are defined in class_corpus
     """
     mfcc = librosa.feature.mfcc(y=wav,
                                 sr=self.sampling_rate_wav,
                                 n_mfcc=self.n_coeff,
                                 n_fft=self.frame_length,
                                 hop_length=self.hop_length).T
     dyna_features = get_delta_features(mfcc)
     dyna_features_2 = get_delta_features(dyna_features)
     mfcc = np.concatenate((mfcc, dyna_features, dyna_features_2), axis=1)
     padding = np.zeros((self.window, mfcc.shape[1]))
     frames = np.concatenate([padding, mfcc, padding])
     full_window = 1 + 2 * self.window
     mfcc = np.concatenate(
         [frames[i:i + len(mfcc)] for i in range(full_window)], axis=1)
     return mfcc
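
All of these examples rely on the helper get_delta_features, whose definition is not shown on this page. A minimal sketch, assuming it computes first-order deltas with librosa (the project's real helper may use a different regression window):

    import librosa

    def get_delta_features(features):
        # 'features' is (K, n_coeff); librosa.feature.delta differentiates
        # along the last axis by default, hence the transposes.
        return librosa.feature.delta(features.T).T

Applied once this yields the Delta features; applied twice, the DeltaDelta features, each with the same (K, 13) shape as the input.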
Example 4

def preprocess_my_wav_files(wav_folder, mfcc_folder, Nmax=0):
    """
    Read all the wav files in "my_wav_files_for_inversion" and preprocess them to extract their acoustic
    features, so that they can be used as input to the my_ac2art model.
    Save the mfcc in "my_mfcc_files_for_inversion", with the same filename as the corresponding wav.
    Warning: the acoustic features are usually normalized at the speaker level when enough data is
    available for the speaker.
    We let future users modify the code to apply this normalization (coeff = (coeff - meancoeff) / stdcoeff).
    """
    path_wav = os.path.join(root_folder, "Predictions_arti", wav_folder)
    if not os.path.exists(os.path.join(root_folder, "Predictions_arti", mfcc_folder)):
        os.mkdir(os.path.join(root_folder, "Predictions_arti", mfcc_folder))
    frame_time = 25 / 1000
    hop_time = 10 / 1000
    sampling_rate_wav_wanted = 16000
    hop_length = int(hop_time * sampling_rate_wav_wanted)
    frame_length = int(frame_time * sampling_rate_wav_wanted)
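    # at 16 kHz these give hop_length = 0.010 * 16000 = 160 samples and
    # frame_length = 0.025 * 16000 = 400 samples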
    window = 5
    n_coeff = 13
    wav_files = os.listdir(path_wav)
    if Nmax > 0:
        wav_files = wav_files[:Nmax]
    for filename in wav_files:
        if not filename.endswith('.wav'):
            continue
        filename = filename[:-4]  # remove the '.wav' extension
        wav, sr = librosa.load(os.path.join(path_wav, filename + ".wav"),
                               sr=sampling_rate_wav_wanted)  # load the audio at 16 kHz
        wav = 0.5 * wav / np.max(wav)  # rescale so the maximum sample value is 0.5
        mfcc = librosa.feature.mfcc(y=wav, sr=sr, n_mfcc=n_coeff, n_fft=frame_length, hop_length=hop_length).T
        dyna_features = get_delta_features(mfcc)
        dyna_features_2 = get_delta_features(dyna_features)
        mfcc = np.concatenate((mfcc, dyna_features, dyna_features_2), axis=1)
        padding = np.zeros((window, mfcc.shape[1]))
        frames = np.concatenate([padding, mfcc, padding])
        full_window = 1 + 2 * window
        mfcc = np.concatenate([frames[j:j + len(mfcc)] for j in range(full_window)], axis=1)  # add context
        # normalize each coefficient to zero mean and unit variance over this file
        mfcc = (mfcc - mfcc.mean(axis=0, keepdims=True)) / mfcc.std(axis=0, keepdims=True)
        np.save(os.path.join(root_folder, "Predictions_arti", mfcc_folder, filename), mfcc)
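
A hypothetical call, assuming root_folder is defined at module level (the function reads it as a global) and that the folder names match those in the docstring:

    preprocess_my_wav_files("my_wav_files_for_inversion",
                            "my_mfcc_files_for_inversion")           # all wav files
    preprocess_my_wav_files("my_wav_files_for_inversion",
                            "my_mfcc_files_for_inversion", Nmax=10)  # first 10 files only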
Example 5
    def read_ema_and_wav(self, k):
        """
        :param k: index wrt EMA_files list of the file to read
        :return: ema positions for 12 arti (K',12) , acoustic features (K,429); where K in the # of frames.
        read and reorganize the ema traj,
        calculations of the mfcc with librosa , + Delta and DeltaDelta, + 10 context frames
        # of acoustic features per frame: 13 ==> 13*3 = 39 ==> 39*11 = 429.
        parameters for mfcc calculation are defined in class_corpus
        """
        order_arti_haskins = [
            'td_x', 'td_y', 'tb_x', 'tb_y', 'tt_x', 'tt_y', 'ul_x', 'ul_y',
            "ll_x", "ll_y", "ml_x", "ml_y", "li_x", "li_y", "jl_x", "jl_y"
        ]

        order_arti = [
            'tt_x', 'tt_y', 'td_x', 'td_y', 'tb_x', 'tb_y', 'li_x', 'li_y',
            'ul_x', 'ul_y', 'll_x', 'll_y'
        ]

        data = sio.loadmat(
            os.path.join(self.path_files_brutes,
                         self.EMA_files[k] + ".mat"))[self.EMA_files[k]][0]
        ema = np.zeros((len(data[1][2]), len(order_arti_haskins)))

        for arti in range(1, len(data)):  # read each articulator's trajectory (x and z columns)
            ema[:, (arti - 1) * 2] = data[arti][2][:, 0]
            ema[:, arti * 2 - 1] = data[arti][2][:, 2]
        new_order_arti = [order_arti_haskins.index(col) for col in order_arti]
        ema = ema[:, new_order_arti]

        # We create wav files from the intensity data stored in the matlab files.
        # Note: librosa.output.write_wav was removed in librosa 0.8; on recent
        # versions, soundfile.write is the equivalent call.
        wav_data = data[0][2][:, 0]
        librosa.output.write_wav(
            os.path.join(self.root_path, "Raw_data", self.corpus, self.speaker,
                         "wav", self.EMA_files[k] + ".wav"), wav_data,
            self.sampling_rate_wav)
        wav, sr = librosa.load(os.path.join(self.root_path, "Raw_data",
                                            self.corpus, self.speaker, "wav",
                                            self.EMA_files[k] + ".wav"),
                               sr=self.sampling_rate_wav_wanted)
        # np.save(os.path.join(root_path, "Raw_data", corpus, speaker, "wav",
        #                      EMA_files[k]), wav)
        wav = 0.5 * wav / np.max(wav)  # rescale so the maximum sample value is 0.5
        mfcc = librosa.feature.mfcc(y=wav,
                                    sr=self.sampling_rate_wav_wanted,
                                    n_mfcc=self.n_coeff,
                                    n_fft=self.frame_length,
                                    hop_length=self.hop_length).T
        dyna_features = get_delta_features(mfcc)
        dyna_features_2 = get_delta_features(dyna_features)
        mfcc = np.concatenate((mfcc, dyna_features, dyna_features_2), axis=1)
        padding = np.zeros((self.window, mfcc.shape[1]))
        frames = np.concatenate([padding, mfcc, padding])
        full_window = 1 + 2 * self.window
        mfcc = np.concatenate(
            [frames[i:i + len(mfcc)] for i in range(full_window)], axis=1)

        marge = 0  # margin (in seconds) kept around the detected speech interval
        xtrm = detect_silence(data)
        xtrm = [max(xtrm[0] - marge, 0), xtrm[1] + marge]

        # convert the speech boundaries (in seconds) to EMA sample indices ...
        xtrm_temp_ema = [
            int(np.floor(xtrm[0] * self.sampling_rate_ema)),
            int(min(np.floor(xtrm[1] * self.sampling_rate_ema) + 1, len(ema)))
        ]
        # ... and to MFCC frame indices
        xtrm_temp_mfcc = [
            int(np.floor(xtrm[0] / self.hop_time)),
            int(np.ceil(xtrm[1] / self.hop_time))
        ]
        ema = ema[xtrm_temp_ema[0]:xtrm_temp_ema[1], :]
        mfcc = mfcc[xtrm_temp_mfcc[0]:xtrm_temp_mfcc[1]]

        n_frames_wanted = mfcc.shape[0]
        ema = scipy.signal.resample(ema, num=n_frames_wanted)
        return ema, mfcc
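
The last two lines are what align the two streams: after silence trimming, the EMA trajectories are resampled (Fourier-based scipy.signal.resample, which operates along axis 0 by default) so that exactly one EMA sample remains per MFCC frame. A standalone illustration with dummy shapes (the 500 Hz EMA rate and 10 ms hop are illustrative):

    import numpy as np
    import scipy.signal

    ema = np.random.randn(2500, 12)   # dummy: 5 s of EMA at 500 Hz, 12 channels
    n_frames_wanted = 500             # 5 s of MFCC frames at a 10 ms hop
    ema_aligned = scipy.signal.resample(ema, num=n_frames_wanted)
    print(ema_aligned.shape)          # (500, 12): one EMA sample per MFCC frame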