def mfcc_feature(self, file):
    """
    Extract MFCC features from an audio file.

    :param file: (str, mandatory) path of the audio file
    :return: (array) MFCC features, i.e. whatever ``mfcc`` returns when asked for
             ``self.mfcc_dim`` cepstral coefficients
    :raises AssertionError: if ``file`` is not an existing regular file
    """
    # NOTE: assert statements are skipped when Python runs with -O.
    assert os.path.isfile(file), FileNotFoundException(file)

    # The first value returned by wav.read is passed to mfcc as its second
    # positional argument, the second value as its first.
    # NOTE(review): presumably (sample rate, signal) - confirm against the wav module in use.
    sample_rate, signal = wav.read(file)
    return mfcc(signal, sample_rate, numcep=self.mfcc_dim)
def load(file):
    """
    Load a pickled dictionary object from disk.

    :param file: (str, mandatory) path of the dictionary file
    :return: (dictionary) the object stored in the file
    :raises AssertionError: if ``file`` is not an existing regular file
    """
    assert os.path.isfile(file), FileNotFoundException(file)

    # NOTE(review): unpickling can execute arbitrary code - only load trusted files.
    with open(file, 'rb') as stream:
        return pickle.load(stream)
def spectrogram(self, file):
    """
    Compute the log power spectrogram of a single-channel audio file.

    The signal is cut into frames of ``self.frame_length`` samples taken every
    ``self.frame_shift`` samples. Each frame is multiplied by a Hanning window and
    transformed with an FFT; the squared magnitude of the first
    ``self.spectrogram_dim`` bins is kept, offset by 1e-12 and log-scaled.

    :param file: (str, mandatory) path of the audio file
    :return: (array) log power spectrogram. shape: [frames, self.spectrogram_dim]
    :raises AssertionError: if ``file`` does not exist, if the decoded audio is not
                            1-dimensional, or if the audio is shorter than one frame
    """
    # NOTE: assert statements are skipped when Python runs with -O.
    assert os.path.isfile(file), FileNotFoundException(file)

    # 1. Read the signal as float32. The sample rate returned alongside it is not needed.
    # NOTE(review): sf is presumably the soundfile module - confirm.
    audio, _ = sf.read(file, dtype='float32')
    assert audio.ndim == 1, UnknownError("check input data")

    frame_length = self.frame_length
    frame_shift = self.frame_shift

    # 2. Drop the trailing samples that cannot fill a whole frame.
    #    Example: audio = [1..9], frame_length = 4, frame_shift = 2
    #    -> frames [1,2,3,4], [3,4,5,6], [5,6,7,8]; sample 9 is discarded.
    trunc = (audio.shape[0] - frame_length) % frame_shift
    audio = audio[:audio.shape[0] - trunc]

    # Number of frames. After truncation (len - frame_length) is a multiple of
    # frame_shift, and the frame starting at offset 0 has to be counted as well,
    # hence the "+ 1" (3 frames in the example above, not 2).
    sequential = (audio.shape[0] - frame_length) // frame_shift + 1
    assert sequential > 0, UnknownError(
        "spectrogram feature sequence less than 0, "
        "audio file:{}, audio shape:{}".format(file, audio.shape))

    # Hanning window. shape: [frame_length]
    window = np.hanning(frame_length)

    # 3. Framing + windowing + FFT, one output row per frame.
    feature = np.zeros((sequential, self.spectrogram_dim))
    for i in range(sequential):
        start = i * frame_shift
        frame = audio[start:start + frame_length] * window
        # Keep only the leading bins: the spectrum of a real signal is symmetric.
        spectrum = np.fft.fft(frame)[:self.spectrogram_dim]
        # Power spectrum (squared magnitude).
        feature[i] = np.abs(spectrum) ** 2

    # The small offset keeps log() finite for all-zero (silent) bins.
    return np.log(feature + 1.e-12)
def load(file):
    """
    Load a pickled model from disk.

    :param file: (str, mandatory) path of the model file
    :return: (MFSSegmentation) the segmentation object stored in the file
    :raises FileNotFoundException: if ``file`` is not an existing regular file
    """
    if os.path.isfile(file):
        # NOTE(review): unpickling can execute arbitrary code - only load trusted files.
        with open(file, "rb") as stream:
            return pickle.load(stream)

    raise FileNotFoundException(file)
def check_input_args(args):
    """
    Validate the input arguments, printing a message for the first problem found.

    :param args: (object, mandatory) object exposing the attributes ``audio``,
                 ``top`` and ``beam`` (presumably an argparse namespace - confirm)
    :return: (bool) True when ``args.audio`` names an existing file and
             ``args.top`` does not exceed ``args.beam``; False otherwise
    """
    # Truthiness instead of `is ''`: identity comparison against a literal is
    # implementation-dependent, and this also rejects None without crashing below.
    if not args.audio:
        print(Color.red("Please input audio file !"))
        return False

    if not os.path.isfile(args.audio):
        print(FileNotFoundException(args.audio))
        return False

    # More result paths than the beam width was requested.
    if args.top > args.beam:
        print(Color.red("requested top:{} than the beam:{}".format(args.top, args.beam)))
        return False

    return True
def predict(self, audio, beam_width=3, top_paths=1, greedy=True):
    """
    Run the model on one audio file and decode the predicted sequence(s).

    :param audio: (str, mandatory) path of the audio file
    :param beam_width: (int, optional, default=3) forwarded to ``self.model.predict``
    :param top_paths: (int, optional, default=1) forwarded to ``self.model.predict``
    :param greedy: (bool, optional, default=True) forwarded to ``self.model.predict``;
                   also selects the shape of the result
    :return: ``self.pinyin_dict.decoding`` applied to the predicted sequence when
             ``greedy`` is true, otherwise a list with one decoding per predicted sequence
    :raises AssertionError: if ``audio`` is not an existing regular file
    """
    # NOTE: assert statements are skipped when Python runs with -O.
    assert os.path.isfile(audio), FileNotFoundException(audio)

    features, input_length = self.audio_feature.feature(audio, alignment=False)

    # Prepend an axis of size 1 to the features and wrap the length as a [1, 1] tensor.
    batch = tf.convert_to_tensor(features[np.newaxis, :, :], dtype=tf.float32)
    length = tf.reshape(
        tf.convert_to_tensor([input_length], dtype=tf.int32),
        shape=[1, 1],
    )

    sequence, _ = self.model.predict(
        batch,
        length,
        beam_width=beam_width,
        top_paths=top_paths,
        greedy=greedy,
    )

    decode = self.pinyin_dict.decoding
    if not greedy:
        return [decode(seq) for seq in sequence]
    return decode(sequence)
def load_dataset(source_data_path, data_path):
    """
    Collect audio file paths together with their pinyin and Chinese-character labels.

    Every entry of ``data_path`` whose name ends with ``wav`` must have a sibling
    ``<name>.trn`` file. That file must contain exactly one entry: a path whose last
    ``/``-separated component names the real label file inside ``source_data_path``.
    The real label file is read with ``DataUtils.read_text_data`` (using ``handle``);
    element 0 of the result is taken as the Chinese words and element 1 as the pinyin.

    :param source_data_path: (str, mandatory) directory holding the real label files
    :param data_path: (str, mandatory) directory holding the audio and ``.trn`` files
    :return: (tuple of list) ``(audio_data, pinyin_data, chinese_data)``: audio file
             paths, pinyin sequences and single-character sequences, aligned by index
    :raises AssertionError: if a ``.trn`` file is missing, does not hold exactly one
                            entry, points to a missing label file, or a pinyin token
                            does not end with a digit
    """
    audio_data, pinyin_data, chinese_data = [], [], []

    data_file = os.listdir(data_path)
    # Set copy of the listing so each label look-up is O(1) instead of a list scan.
    available_files = set(data_file)

    for file in tqdm(data_file):
        # Only entries whose name ends with "wav" are treated as audio files.
        if not file.endswith('wav'):
            continue

        # The label file is named after the audio file and must exist.
        label_file = file + '.trn'
        assert label_file in available_files, FileNotFoundException(label_file)

        audio_file_path = os.path.join(data_path, file)
        label_file_path = os.path.join(data_path, label_file)

        # The .trn file only holds a pointer to the real label file.
        label_data_file_path = DataUtils.read_text_data(
            label_file_path, show_progress_bar=False)
        assert len(label_data_file_path) == 1, \
            'Get redundant data path: {}, label_file_path:{}'.format(label_data_file_path,
                                                                     label_file_path)

        # Re-anchor the pointed-to file name under source_data_path.
        label_data_file_path = os.path.join(
            source_data_path, label_data_file_path[0].split('/')[-1])
        assert os.path.isfile(label_data_file_path), FileNotFoundException(
            label_data_file_path)

        # Read the label data: Chinese words first, pinyin second.
        text_data = DataUtils.read_text_data(label_data_file_path,
                                             handle_func=handle,
                                             show_progress_bar=False)
        chinese, pinyin = text_data[0], text_data[1]

        # Every pinyin token must end with a digit.
        for py in pinyin:
            assert py[-1].isdigit(), "The last character:{} of Pinyin is not a number! \n" \
                                     "pinyin_str:{}, " \
                                     "label_data_file_path:{}, " \
                                     "label_file_path:{}".format(py, pinyin,
                                                                 label_data_file_path,
                                                                 label_file_path)

        # Split multi-character words into single characters.
        chinese = [char for word in chinese for char in word]

        # NOTE: the number of pinyin tokens is not checked against the number of characters.
        audio_data.append(audio_file_path)
        pinyin_data.append(pinyin)
        chinese_data.append(chinese)

    return audio_data, pinyin_data, chinese_data