def process_utterance(self, audio_file, transcript):
    """Load, augment, featurize and normalize for speech data.

    :param audio_file: Filepath or file object of the audio file.
    :type audio_file: basestring | file
    :param transcript: Transcription text.
    :type transcript: basestring
    :return: Tuple of audio feature tensor and data of the transcription
             part, where the transcription part can be token ids or text.
    :rtype: tuple of (2darray, list)
    """
    try:
        # Python 2: filepaths may be str or unicode.
        is_str = isinstance(audio_file, basestring)
    except NameError:
        # Python 3: basestring no longer exists.
        is_str = isinstance(audio_file, str)
    if is_str and audio_file.startswith('tar:'):
        speech_segment = SpeechSegment.from_file(
            self._subfile_from_tar(audio_file), transcript)
    else:
        speech_segment = SpeechSegment.from_file(audio_file, transcript)
    self._augmentation_pipeline.transform_audio(speech_segment)
    specgram, transcript_part = self._speech_featurizer.featurize(
        speech_segment, self._keep_transcription_text)
    specgram = self._normalizer.apply(specgram)
    return specgram, transcript_part
def get_audio_mfcc_features(txt_files, wav_files, n_input, n_context,
                            word_num_map, txt_labels=None,
                            specgram_type='mfcc',
                            mean_std_filepath='data/aishell/mean_std.npz'):
    """Get MFCC or linear specgram features.

    The MFCC feature has 39 dims: 13 MFCC + 13 delta1 + 13 delta2.
    The linear specgram has 161 features across frequency bins.

    :param txt_files: List of transcription filepaths; if given, it is used
                      in place of ``txt_labels``.
    :param wav_files: List of audio filepaths.
    :param n_input: Feature dim of a single frame (39 for MFCC).
    :param n_context: Number of past/future frames stacked around each frame.
    :param word_num_map: Mapping from characters/words to token ids.
    :param txt_labels: List of transcription strings, used when ``txt_files``
                       is None.
    :param specgram_type: Feature type, 'mfcc' or 'linear'.
    :param mean_std_filepath: Filepath of the mean/std statistics used for
                              feature normalization.
    :return: Tuple of (audio_features, audio_features_len, text_vector,
             text_vector_len), each as a numpy array.
    """
    audio_features = []
    audio_features_len = []
    text_vector = []
    text_vector_len = []
    if txt_files is not None:
        txt_labels = txt_files
    get_feature = AudioFeaturizer(specgram_type)
    normalizer = FeatureNormalizer(mean_std_filepath)
    for txt_obj, wav_file in zip(txt_labels, wav_files):
        # Turn inputs into features.
        if specgram_type == 'mfcc':
            audio_data = audiofile_to_input_vector(
                wav_file, n_input, n_context)  # MFCC feature, shape (?, 741)
        elif specgram_type == 'linear':
            speech_segment = SpeechSegment.from_file(wav_file, "")
            specgram = get_feature.featurize(speech_segment)
            audio_data = normalizer.apply(specgram)
            audio_data = np.transpose(
                audio_data)  # linear specgram feature, shape (?, 161)
        audio_data = audio_data.astype('float32')
        audio_features.append(audio_data)
        audio_features_len.append(np.int32(len(audio_data)))
        if txt_files is not None:
            # txt_obj is a transcription file.
            target = trans_text_ch_to_vector(txt_obj, word_num_map)
        else:
            # txt_obj is a label string.
            target = trans_text_ch_to_vector(None, word_num_map, txt_obj)
        text_vector.append(target)
        text_vector_len.append(len(target))
    audio_features = np.asarray(audio_features)
    audio_features_len = np.asarray(audio_features_len)
    text_vector = np.asarray(text_vector)
    text_vector_len = np.asarray(text_vector_len)
    return audio_features, audio_features_len, text_vector, text_vector_len
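# Hedged usage sketch (not part of the original code): the file paths,
# n_input=39, n_context=9 and the vocabulary map below are illustrative
# assumptions; only the call signature of get_audio_mfcc_features above is
# taken from the source.
def _example_get_audio_mfcc_features():
    wav_files = ['data/aishell/wav/example_0001.wav']  # hypothetical path
    txt_files = ['data/aishell/txt/example_0001.txt']  # hypothetical path
    word_num_map = {u'<unk>': 0}                       # hypothetical vocab map
    feats, feats_len, texts, texts_len = get_audio_mfcc_features(
        txt_files, wav_files, n_input=39, n_context=9,
        word_num_map=word_num_map, specgram_type='mfcc')
    # feats[i] has shape (num_frames, 39 + 2 * 39 * 9) = (num_frames, 741);
    # texts[i] is the token-id vector produced by trans_text_ch_to_vector.
    return feats, feats_len, texts, texts_len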
def audiofile_to_input_vector(audio_filename, n_input, n_context):
    """Compute MFCC features with a symmetric context window.

    Each output frame is the concatenation of the past ``n_context`` frames,
    the current frame and the future ``n_context`` frames, so the output
    width is ``n_input + 2 * n_input * n_context``.

    :param audio_filename: Filepath of the audio file.
    :param n_input: Feature dim of a single MFCC frame (39 here:
                    13 MFCC + 13 delta1 + 13 delta2).
    :param n_context: Number of past/future frames stacked around each frame.
    :return: 2darray of shape (num_frames, n_input + 2 * n_input * n_context).
    """
    # Get MFCC features with dim 39, shape (39, num_frames).
    get_feature = AudioFeaturizer("mfcc")
    speech_segment = SpeechSegment.from_file(audio_filename, "")
    orig_inputs = get_feature.featurize(speech_segment)
    # Transpose to time-major, shape (num_frames, 39).
    orig_inputs = np.transpose(orig_inputs)
    train_inputs = np.zeros(
        (orig_inputs.shape[0], n_input + 2 * n_input * n_context))
    empty_mfcc = np.zeros(n_input)

    # Each output row consists of three parts:
    # past n_context frames + current frame + future n_context frames.
    time_slices = range(train_inputs.shape[0])
    context_past_min = time_slices[0] + n_context
    context_future_max = time_slices[-1] - n_context
    for time_slice in time_slices:
        # Zero-pad the past context for the first n_context frames.
        need_empty_past = max(0, (context_past_min - time_slice))
        empty_source_past = [empty_mfcc for _ in range(need_empty_past)]
        data_source_past = orig_inputs[
            max(0, time_slice - n_context):time_slice]

        # Zero-pad the future context for the last n_context frames.
        need_empty_future = max(0, (time_slice - context_future_max))
        empty_source_future = [empty_mfcc for _ in range(need_empty_future)]
        data_source_future = orig_inputs[time_slice + 1:time_slice +
                                         n_context + 1]

        if need_empty_past:
            past = np.concatenate((empty_source_past, data_source_past))
        else:
            past = data_source_past
        if need_empty_future:
            future = np.concatenate((data_source_future, empty_source_future))
        else:
            future = data_source_future

        past = np.reshape(past, n_context * n_input)
        now = orig_inputs[time_slice]
        future = np.reshape(future, n_context * n_input)
        train_inputs[time_slice] = np.concatenate((past, now, future))

    # Normalize: subtract the mean, then divide by the standard deviation.
    train_inputs = (train_inputs - np.mean(train_inputs)) / np.std(train_inputs)
    # Shape of train_inputs: (num_frames, n_input + 2 * n_input * n_context).
    return train_inputs
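# Standalone sketch (added for clarity, independent of the repo classes) of
# the same context-window stacking used in audiofile_to_input_vector: each
# output row is [past n_context frames | current frame | future n_context
# frames], zero-padded at the utterance boundaries. Small dims are chosen
# only for readability.
def _stack_context_example(frames, n_context):
    num_frames, feat_dim = frames.shape
    # Pad n_context zero-frames on each side of the utterance.
    padded = np.concatenate([np.zeros((n_context, feat_dim)),
                             frames,
                             np.zeros((n_context, feat_dim))])
    # For each frame, flatten the (2 * n_context + 1)-frame window into a row.
    return np.stack([padded[t:t + 2 * n_context + 1].reshape(-1)
                     for t in range(num_frames)])

# _stack_context_example(np.arange(18.).reshape(6, 3), n_context=2).shape
# evaluates to (6, 15), i.e. (num_frames, feat_dim + 2 * feat_dim * n_context).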
def process_utterance(self, filename, transcript):
    """Load, augment, featurize and normalize for speech data.

    :param filename: Audio filepath.
    :type filename: basestring
    :param transcript: Transcription text.
    :type transcript: basestring
    :return: Tuple of audio feature tensor and list of token ids for the
             transcription.
    :rtype: tuple of (2darray, list)
    """
    if filename.startswith('tar:'):
        speech_segment = SpeechSegment.from_file(
            self._subfile_from_tar(filename), transcript)
    else:
        speech_segment = SpeechSegment.from_file(filename, transcript)
    self._augmentation_pipeline.transform_audio(speech_segment)
    specgram, text_ids = self._speech_featurizer.featurize(speech_segment)
    specgram = self._normalizer.apply(specgram)
    return specgram, text_ids
def process_utterance(self, audio_file):
    """Load and preprocess speech data.

    :param audio_file: Filepath or file object of the audio file.
    :type audio_file: str | file
    :return: Preprocessed audio feature data.
    :rtype: 2darray
    """
    speech_segment = SpeechSegment.from_file(audio_file, "")
    specgram, _ = self._speech_featurizer.featurize(speech_segment, False)
    specgram = self._normalizer.apply(specgram)
    return specgram
def process_utterance(self, audio_file, transcript):
    """Load, augment, featurize and normalize speech data.

    :param audio_file: Filepath or file object of the audio file.
    :type audio_file: str | file
    :param transcript: Transcription text of the audio.
    :type transcript: str
    :return: Tuple of the normalized/preprocessed audio feature data and the
             token ids of the corresponding transcription.
    :rtype: tuple of (2darray, list)
    """
    speech_segment = SpeechSegment.from_file(audio_file, transcript)
    self._augmentation_pipeline.transform_audio(speech_segment)
    specgram, transcript_part = self._speech_featurizer.featurize(
        speech_segment, self._keep_transcription_text)
    specgram = self._normalizer.apply(specgram)
    return specgram, transcript_part
def process_utterance_from_bytes(self, bytes, transcript,
                                 **soundfile_options):
    """Load, augment, featurize and normalize for speech data.

    :param bytes: Raw audio bytes read from the file.
    :type bytes: byte string
    :param transcript: Transcription text.
    :type transcript: basestring
    :param soundfile_options: Options passed to the soundfile library when
                              opening the audio.
    :type soundfile_options: **kwargs
    :return: Tuple of audio feature tensor and data of the transcription
             part, where the transcription part can be token ids or text.
    :rtype: tuple of (2darray, list)
    """
    speech_segment = SpeechSegment.from_bytes(bytes, transcript,
                                              **soundfile_options)
    self._augmentation_pipeline.transform_audio(speech_segment)
    specgram, transcript_part = self._speech_featurizer.featurize(
        speech_segment, self._keep_transcription_text)
    specgram = self._normalizer.apply(specgram)
    return specgram, transcript_part
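# Hedged usage sketch (not from the source): assumes `preprocessor` is an
# already-constructed instance of the class that owns
# process_utterance_from_bytes; the wav path and transcript are supplied by
# the caller and purely illustrative.
def _example_process_utterance_from_bytes(preprocessor, wav_path, transcript):
    # Read the raw audio bytes, then let the preprocessor decode, augment,
    # featurize and normalize them exactly as in the method above.
    with open(wav_path, 'rb') as f:
        audio_bytes = f.read()
    specgram, transcript_part = preprocessor.process_utterance_from_bytes(
        audio_bytes, transcript)
    return specgram, transcript_part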