Exemple #1
0
    def mfcc_feature(self, file):
        """
        Compute MFCC features for an audio file.

        :param file: (str, mandatory) path to the audio file
        :return: (array) MFCC features, as produced by ``mfcc`` with
            ``numcep=self.mfcc_dim``
        :raises FileNotFoundException: if ``file`` is not an existing file
        """
        # Raise explicitly instead of asserting: assertions are removed when
        # Python runs with -O, which would let a bad path reach the reader.
        if not os.path.isfile(file):
            raise FileNotFoundException(file)

        # NOTE(review): ``wav.read`` is presumably scipy's wavfile reader,
        # which yields (sample rate, samples) in that order -- confirm.
        sample_rate, signal = wav.read(file)
        return mfcc(signal, sample_rate, numcep=self.mfcc_dim)
Exemple #2
0
    def load(file):
        """
        Load a pickled dictionary object from disk.

        :param file: (str, mandatory) path to the dictionary file
        :return: (dictionary) the unpickled dictionary object
        :raises FileNotFoundException: if ``file`` is not an existing file
        """
        # Raise explicitly instead of asserting: assertions are removed when
        # Python runs with -O.
        if not os.path.isfile(file):
            raise FileNotFoundException(file)

        # SECURITY: unpickling can execute arbitrary code. Only load files
        # that come from a trusted source.
        with open(file, 'rb') as f_read:
            return pickle.load(f_read)
Exemple #3
0
    def spectrogram(self, file):
        """
        Compute the log power spectrogram of a single-channel audio file.

        The signal is cut into overlapping frames of ``self.frame_length``
        samples, one frame every ``self.frame_shift`` samples. Each frame is
        multiplied by a Hanning window and passed through an FFT; the squared
        magnitude of the first ``self.spectrogram_dim`` bins is kept and its
        natural logarithm is returned.

        Example: 9 samples with frame length 4 and frame shift 2 give three
        frames (samples 1-4, 3-6 and 5-8); the trailing sample is dropped.

        :param file: (str, mandatory) path to the audio file
        :return: (array) log power spectrogram,
            shape [frames, self.spectrogram_dim]
        :raises FileNotFoundException: if ``file`` is not an existing file
        :raises UnknownError: if the audio is not one-dimensional, or is too
            short to fill a single frame
        """
        # Explicit raises instead of asserts: assertions are removed when
        # Python runs with -O, which would silently disable these checks.
        if not os.path.isfile(file):
            raise FileNotFoundException(file)

        # 1. Read the signal. The second value returned by the reader is not
        # needed for the computation below.
        audio, _ = sf.read(file, dtype='float32')

        if audio.ndim != 1:
            raise UnknownError("check input data")

        # 2. Framing. Drop the trailing samples that cannot fill a whole
        # frame step.
        trunc = (audio.shape[0] - self.frame_length) % self.frame_shift
        audio = audio[:audio.shape[0] - trunc]

        # Number of frames. The "+ 1" counts the first frame, which starts at
        # offset 0; without it the last complete frame was never produced.
        sequential = (audio.shape[0] -
                      self.frame_length) // self.frame_shift + 1
        if sequential <= 0:
            raise UnknownError(
                "spectrogram feature sequence less than 0, "
                "audio file:{}, audio shape:{}".format(file, audio.shape))

        # Hanning window, shape [frame_length].
        window = np.hanning(self.frame_length)

        # 3. Window + FFT, one frame at a time.
        feature = np.zeros((sequential, self.spectrogram_dim))
        for i in range(sequential):
            start = i * self.frame_shift
            seq_frame = audio[start:start + self.frame_length] * window
            # Only the leading bins are kept: the spectrum of a real signal
            # is symmetric, so the rest carries no extra information.
            spectrum = np.fft.fft(seq_frame)[:self.spectrogram_dim]
            # Power spectrum.
            feature[i] = np.abs(spectrum)**2

        # Small offset so the logarithm never sees an exact zero.
        feature = feature + 1.e-12
        return np.log(feature)
Exemple #4
0
    def load(file):
        """
        Load a pickled model from disk.

        :param file: (str, mandatory) path to the model file
        :return: (MFSSegmentation) the unpickled segmentation object
        :raises FileNotFoundException: if ``file`` is not an existing file
        """
        if os.path.isfile(file):
            # NOTE: unpickling can execute arbitrary code; only load model
            # files that come from a trusted source.
            with open(file, "rb") as model_file:
                return pickle.load(model_file)

        raise FileNotFoundException(file)
Exemple #5
0
def check_input_args(args):
    """
    Validate the parsed command-line arguments.

    A message describing the first problem found is printed before
    returning False.

    :param args: object exposing ``audio``, ``top`` and ``beam`` attributes
    :return: (bool) True when every check passes, False otherwise
    """
    # Truthiness check rather than ``is ''``: identity comparison against a
    # literal only works through string interning and triggers a
    # SyntaxWarning on Python 3.8+. This also rejects None before it can
    # reach os.path.isfile.
    if not args.audio:
        print(Color.red("Please input audio file !"))
        return False

    if not os.path.isfile(args.audio):
        print(FileNotFoundException(args.audio))
        return False

    # The number of returned paths cannot exceed the beam width.
    if args.top > args.beam:
        print(Color.red("requested top:{} than the beam:{}".format(args.top, args.beam)))
        return False

    return True
Exemple #6
0
    def predict(self, audio, beam_width=3, top_paths=1, greedy=True):
        """
        Run the model on one audio file and decode the result.

        :param audio: (str, mandatory) path to the audio file
        :param beam_width: (int, optional, default=3) forwarded to
            ``self.model.predict``
        :param top_paths: (int, optional, default=1) forwarded to
            ``self.model.predict``
        :param greedy: (bool, optional, default=True) forwarded to
            ``self.model.predict``; also selects how the result is decoded
        :return: the decoding of the predicted sequence when ``greedy`` is
            True, otherwise a list with one decoding per returned sequence
        :raises FileNotFoundException: if ``audio`` is not an existing file
        """
        # Raise explicitly instead of asserting: assertions are removed when
        # Python runs with -O.
        if not os.path.isfile(audio):
            raise FileNotFoundException(audio)

        features, input_length = self.audio_feature.feature(audio,
                                                            alignment=False)

        # Add a leading axis so the single utterance forms a batch of one.
        features = tf.convert_to_tensor(features[np.newaxis, :, :],
                                        dtype=tf.float32)
        input_length = tf.reshape(tf.convert_to_tensor([input_length],
                                                       dtype=tf.int32),
                                  shape=[1, 1])

        # The second value returned by the model is not used here.
        sequence, _ = self.model.predict(features,
                                         input_length,
                                         beam_width=beam_width,
                                         top_paths=top_paths,
                                         greedy=greedy)

        if greedy:
            return self.pinyin_dict.decoding(sequence)

        return [self.pinyin_dict.decoding(seq) for seq in sequence]
Exemple #7
0
def load_dataset(source_data_path, data_path):
    """
    Collect audio file paths together with their pinyin and Chinese labels.

    Every ``*.wav`` file in ``data_path`` must have a sibling label file named
    ``<audio name>.trn``. That label file is expected to yield exactly one
    entry: a path whose last ``/``-separated component names the real
    transcript file, which is then looked up inside ``source_data_path``.

    :param source_data_path: (str, mandatory) directory holding the actual
        transcript files
    :param data_path: (str, mandatory) directory holding the audio files and
        their ``.trn`` label files
    :return: (tuple of list) ``(audio_data, pinyin_data, chinese_data)``;
        entries at the same index belong together. ``audio_data`` holds audio
        file paths, ``pinyin_data`` the pinyin tokens of each utterance and
        ``chinese_data`` its individual Chinese characters
    :raises FileNotFoundException: if a label file or transcript file is
        missing
    :raises ValueError: if a label file does not hold exactly one entry, or a
        pinyin token does not end with a digit
    """
    audio_data, pinyin_data, chinese_data = [], [], []

    data_file = os.listdir(data_path)
    # Set lookup keeps the per-audio label check O(1) instead of rescanning
    # the whole directory listing for every audio file.
    available = set(data_file)

    for file in tqdm(data_file):
        # Only audio files are processed; the dot guards against names that
        # merely end with the letters "wav".
        if not file.endswith('.wav'):
            continue

        # The label file name is derived from the audio file name.
        label_file = file + '.trn'
        if label_file not in available:
            raise FileNotFoundException(label_file)

        audio_file_path = os.path.join(data_path, file)
        label_file_path = os.path.join(data_path, label_file)

        # The label file points at the file that holds the real transcript.
        label_data_file_path = DataUtils.read_text_data(
            label_file_path, show_progress_bar=False)
        if len(label_data_file_path) != 1:
            raise ValueError(
                'Get redundant data path: {}, label_file_path:{}'.format(
                    label_data_file_path, label_file_path))

        # Re-anchor the transcript's base name under source_data_path.
        label_data_file_path = os.path.join(
            source_data_path, label_data_file_path[0].split('/')[-1])
        if not os.path.isfile(label_data_file_path):
            raise FileNotFoundException(label_data_file_path)

        # Read the transcript: Chinese text first, pinyin second.
        text_data = DataUtils.read_text_data(label_data_file_path,
                                             handle_func=handle,
                                             show_progress_bar=False)
        chinese = text_data[0]
        pinyin = text_data[1]

        # Every pinyin token must end with a digit. An empty token is
        # rejected here too instead of failing with an IndexError.
        for py in pinyin:
            if not py or not py[-1].isdigit():
                raise ValueError(
                    "The last character:{} of Pinyin is not a number! "
                    "pinyin_str:{}, "
                    "label_data_file_path:{}, "
                    "label_file_path:{}".format(py, pinyin,
                                                label_data_file_path,
                                                label_file_path))

        # Split multi-character words into single characters.
        chinese = [char for word in chinese for char in word]

        # NOTE: the number of characters is not checked against the number of
        # pinyin tokens.
        audio_data.append(audio_file_path)
        pinyin_data.append(pinyin)
        chinese_data.append(chinese)

    return audio_data, pinyin_data, chinese_data