Ejemplo n.º 1
0
 def __init__(self,
              vocab_filepath,
              mean_std_filepath,
              augmentation_config='{}',
              max_duration=float('inf'),
              min_duration=0.0,
              stride_ms=10.0,
              window_ms=20.0,
              max_freq=None,
              specgram_type='linear',
              use_dB_normalization=True,
              num_threads=multiprocessing.cpu_count() // 2,
              random_seed=0):
     """Store the configuration and build the feature-processing helpers.

     :param vocab_filepath: Forwarded to ``SpeechFeaturizer``.
     :param mean_std_filepath: Forwarded to ``FeatureNormalizer``.
     :param augmentation_config: Forwarded to ``AugmentationPipeline``.
     :param max_duration: Stored as ``self._max_duration``.
     :param min_duration: Stored as ``self._min_duration``.
     :param stride_ms: Forwarded to ``SpeechFeaturizer``.
     :param window_ms: Forwarded to ``SpeechFeaturizer``.
     :param max_freq: Forwarded to ``SpeechFeaturizer``.
     :param specgram_type: Forwarded to ``SpeechFeaturizer``.
     :param use_dB_normalization: Forwarded to ``SpeechFeaturizer``.
     :param num_threads: Stored as ``self._num_threads``. The default is
                         evaluated once, when the function is defined.
     :param random_seed: Seeds both the augmentation pipeline and this
                         object's private ``random.Random`` instance.
     """
     # Plain configuration values that need no further processing.
     self._min_duration, self._max_duration = min_duration, max_duration
     self._num_threads = num_threads
     self._epoch = 0

     # Collaborators, built in the same order as before.
     self._normalizer = FeatureNormalizer(mean_std_filepath)
     self._augmentation_pipeline = AugmentationPipeline(
         augmentation_config=augmentation_config, random_seed=random_seed)
     featurizer_options = dict(
         vocab_filepath=vocab_filepath,
         specgram_type=specgram_type,
         stride_ms=stride_ms,
         window_ms=window_ms,
         max_freq=max_freq,
         use_dB_normalization=use_dB_normalization)
     self._speech_featurizer = SpeechFeaturizer(**featurizer_options)
     self._rng = random.Random(random_seed)

     # Cache for tar file info (presumably a threading.local -- confirm
     # against the file's imports).
     tar_cache = local()
     tar_cache.tar2info = {}
     tar_cache.tar2object = {}
     self._local_data = tar_cache
Ejemplo n.º 2
0
def compute_mean_std(manifest_path, num_samples, output_path):
    """Build a ``FeatureNormalizer`` from a manifest and save it to a file.

    :param manifest_path: Forwarded to ``FeatureNormalizer`` as
                          ``manifest_path``.
    :param num_samples: Forwarded to ``FeatureNormalizer`` as ``num_samples``.
    :param output_path: Path handed to ``write_to_file``; also echoed in the
                        confirmation message.
    """
    normalizer_options = dict(mean_std_filepath=None,
                              manifest_path=manifest_path,
                              num_samples=num_samples)
    stats = FeatureNormalizer(**normalizer_options)
    # Save the computed result to the output file.
    stats.write_to_file(output_path)
    saved_message = '计算的均值和标准值已保存在 %s!' % output_path
    print(saved_message)
Ejemplo n.º 3
0
 def __init__(self,
              vocab_filepath,
              mean_std_filepath,
              augmentation_config='{}',
              max_duration=float('inf'),
              min_duration=0.0,
              stride_ms=10.0,
              window_ms=20.0,
              use_dB_normalization=True,
              random_seed=0,
              keep_transcription_text=False,
              place=paddle.CPUPlace(),
              is_training=True):
     """Store the configuration and build the feature-processing helpers.

     :param vocab_filepath: Forwarded to ``SpeechFeaturizer``.
     :param mean_std_filepath: Forwarded to ``FeatureNormalizer``.
     :param augmentation_config: Forwarded to ``AugmentationPipeline``.
     :param max_duration: Stored as ``self._max_duration``.
     :param min_duration: Stored as ``self._min_duration``.
     :param stride_ms: Forwarded to ``SpeechFeaturizer``.
     :param window_ms: Forwarded to ``SpeechFeaturizer``.
     :param use_dB_normalization: Forwarded to ``SpeechFeaturizer``.
     :param random_seed: Seeds both the augmentation pipeline and this
                         object's private ``random.Random`` instance.
     :param keep_transcription_text: Stored as
                                     ``self._keep_transcription_text``.
     :param place: Stored as ``self._place``. The default
                   ``paddle.CPUPlace()`` is created once, when the function
                   is defined.
     :param is_training: Stored as ``self._is_training``.
     """
     # Plain configuration values that need no further processing.
     self._min_duration, self._max_duration = min_duration, max_duration
     self._keep_transcription_text = keep_transcription_text
     self._is_training = is_training
     self._place = place
     self.epoch = 0

     # Collaborators, built in the same order as before.
     self._normalizer = FeatureNormalizer(mean_std_filepath)
     self._augmentation_pipeline = AugmentationPipeline(
         augmentation_config=augmentation_config, random_seed=random_seed)
     featurizer_options = dict(
         vocab_filepath=vocab_filepath,
         stride_ms=stride_ms,
         window_ms=window_ms,
         use_dB_normalization=use_dB_normalization)
     self._speech_featurizer = SpeechFeaturizer(**featurizer_options)
     self._rng = random.Random(random_seed)

     # Cache for tar file info (presumably a threading.local -- confirm
     # against the file's imports).
     tar_cache = local()
     tar_cache.tar2info = {}
     tar_cache.tar2object = {}
     self._local_data = tar_cache
Ejemplo n.º 4
0
def get_audio_mfcc_features(txt_files,
                            wav_files,
                            n_input,
                            n_context,
                            word_num_map,
                            txt_labels=None,
                            specgram_type='mfcc',
                            mean_std_filepath='data/aishell/mean_std.npz'):
    """Extract audio features and label vectors for a set of utterances.

    Depending on ``specgram_type`` the audio features are either MFCC
    features (via ``audiofile_to_input_vector``) or a normalized, transposed
    linear spectrogram (via ``AudioFeaturizer`` and ``FeatureNormalizer``).
    According to the original notes the MFCC is 39-dimensional (13 mfcc +
    13 delta1 + 13 delta2) and the linear spectrogram has 161 frequency
    features; these figures are not verifiable from this function alone.

    :param txt_files: Transcript sources. When not ``None`` they replace
                      ``txt_labels`` and each item is passed to
                      ``trans_text_ch_to_vector`` as its first argument.
    :param wav_files: Audio files, paired one-to-one (via ``zip``) with the
                      transcripts.
    :param n_input: Forwarded to ``audiofile_to_input_vector`` ('mfcc' only).
    :param n_context: Forwarded to ``audiofile_to_input_vector``
                      ('mfcc' only).
    :param word_num_map: Forwarded to ``trans_text_ch_to_vector``.
    :param txt_labels: Used only when ``txt_files`` is ``None``; each item is
                       passed to ``trans_text_ch_to_vector`` as its third
                       argument.
    :param specgram_type: ``'mfcc'`` or ``'linear'``.
    :param mean_std_filepath: Forwarded to ``FeatureNormalizer``; the
                              normalizer is applied only in 'linear' mode.
    :return: Tuple ``(audio_features, audio_features_len, text_vector,
             text_vector_len)``, each converted with ``np.asarray``.
    :raises ValueError: If ``specgram_type`` is not supported and there is at
                        least one utterance to process.
    """
    # Identity test instead of `!= None`: an equality comparison against a
    # numpy array is evaluated element-wise and is ambiguous inside `if`.
    labels_from_files = txt_files is not None
    if labels_from_files:
        txt_labels = txt_files

    get_feature = AudioFeaturizer(specgram_type)
    normalizer = FeatureNormalizer(mean_std_filepath)

    audio_features = []
    audio_features_len = []
    text_vector = []
    text_vector_len = []
    for txt_obj, wav_file in zip(txt_labels, wav_files):
        # Turn inputs into features
        if specgram_type == 'mfcc':
            audio_data = audiofile_to_input_vector(
                wav_file, n_input, n_context)  # MFCC features
        elif specgram_type == 'linear':
            speech_segment = SpeechSegment.from_file(wav_file, "")
            specgram = get_feature.featurize(speech_segment)
            audio_data = normalizer.apply(specgram)
            # Transposed linear spectrogram; presumably (frames, 161).
            audio_data = np.transpose(audio_data)
        else:
            # Previously this fell through and failed on an unbound
            # `audio_data`; report the real problem instead.
            raise ValueError(
                "Unsupported specgram_type: %r (expected 'mfcc' or 'linear')"
                % (specgram_type,))
        audio_data = audio_data.astype('float32')

        audio_features.append(audio_data)
        audio_features_len.append(np.int32(len(audio_data)))

        if labels_from_files:  # txt_obj is a transcript file
            target = trans_text_ch_to_vector(txt_obj, word_num_map)
        else:  # txt_obj is a label
            target = trans_text_ch_to_vector(None, word_num_map, txt_obj)
        text_vector.append(target)
        text_vector_len.append(len(target))

    # NOTE(review): utterances of different lengths produce ragged lists;
    # how np.asarray handles those depends on the installed NumPy version.
    audio_features = np.asarray(audio_features)
    audio_features_len = np.asarray(audio_features_len)
    text_vector = np.asarray(text_vector)
    text_vector_len = np.asarray(text_vector_len)
    return audio_features, audio_features_len, text_vector, text_vector_len
Ejemplo n.º 5
0
def compute_mean_std(manifest_path, num_samples, output_path):
    """Build a ``FeatureNormalizer`` from a manifest and save it to a file.

    The worker count is read from the module-level ``args.num_workers``.

    :param manifest_path: Forwarded to ``FeatureNormalizer`` as
                          ``manifest_path``.
    :param num_samples: Forwarded to ``FeatureNormalizer`` as ``num_samples``.
    :param output_path: Path handed to ``write_to_file``; also echoed in the
                        confirmation message.
    """
    # Randomly take the specified number of samples to compute the
    # mean/std normalization statistics.
    normalizer_options = dict(mean_std_filepath=None,
                              manifest_path=manifest_path,
                              num_samples=num_samples,
                              num_workers=args.num_workers)
    stats = FeatureNormalizer(**normalizer_options)
    # Save the computed result to the output file.
    stats.write_to_file(output_path)
    saved_message = '计算的均值和标准值已保存在 %s!' % output_path
    print(saved_message)
Ejemplo n.º 6
0
 def __init__(self,
              vocab_filepath,
              mean_std_filepath,
              stride_ms=10.0,
              window_ms=20.0,
              use_dB_normalization=True):
     """Build the feature normalizer and the speech featurizer.

     :param vocab_filepath: Forwarded to ``SpeechFeaturizer``.
     :param mean_std_filepath: Forwarded to ``FeatureNormalizer``.
     :param stride_ms: Forwarded to ``SpeechFeaturizer``.
     :param window_ms: Forwarded to ``SpeechFeaturizer``.
     :param use_dB_normalization: Forwarded to ``SpeechFeaturizer``.
     """
     self._normalizer = FeatureNormalizer(mean_std_filepath)
     featurizer_options = dict(
         vocab_filepath=vocab_filepath,
         stride_ms=stride_ms,
         window_ms=window_ms,
         use_dB_normalization=use_dB_normalization)
     self._speech_featurizer = SpeechFeaturizer(**featurizer_options)
    def __init__(self,
                 manifest,
                 vocab_filepath,
                 mean_std_filepath,
                 augmentation_config='{}',
                 max_duration=float('inf'),
                 min_duration=0.0,
                 stride_ms=10.0,
                 window_ms=20.0,
                 max_freq=None,
                 specgram_type='linear',
                 use_dB_normalization=True,
                 random_seed=0,
                 keep_transcription_text=False,
                 segmented=False):
        """Load the manifest, filter it by duration and build the helpers.

        :param manifest: Either a path to an existing CSV file (read with
                         ``pd.read_csv``) or a ``pd.DataFrame``. It must
                         expose a ``duration`` column.
        :param vocab_filepath: Forwarded to ``SpeechFeaturizer``.
        :param mean_std_filepath: Forwarded to ``FeatureNormalizer``.
        :param augmentation_config: Forwarded to ``AugmentationPipeline``.
        :param max_duration: Rows with a larger ``duration`` are dropped.
        :param min_duration: Rows with a smaller ``duration`` are dropped.
        :param stride_ms: Forwarded to ``SpeechFeaturizer``.
        :param window_ms: Forwarded to ``SpeechFeaturizer``.
        :param max_freq: Forwarded to ``SpeechFeaturizer``.
        :param specgram_type: Forwarded to ``SpeechFeaturizer``.
        :param use_dB_normalization: Forwarded to ``SpeechFeaturizer``.
        :param random_seed: Forwarded to ``AugmentationPipeline``.
        :param keep_transcription_text: Stored as
                                        ``self._keep_transcription_text``.
        :param segmented: Stored as ``self._segmented``.
        :raises ValueError: If ``manifest`` is neither an existing file path
                            nor a ``pd.DataFrame``.
        """
        self._max_duration = max_duration
        self._min_duration = min_duration
        self._segmented = segmented
        self._keep_transcription_text = keep_transcription_text

        if isinstance(manifest, str) and os.path.isfile(manifest):
            self.manifest = pd.read_csv(manifest)
        elif isinstance(manifest, pd.DataFrame):
            self.manifest = manifest
        else:
            # ValueError instead of a bare BaseException, so ordinary
            # `except Exception` handlers can catch it. ValueError still
            # derives from BaseException, so existing catchers keep working.
            raise ValueError(
                "{} is neither a valid path nor a pandas DataFrame object".
                format(manifest))

        # Duration filtering: keep rows inside [min_duration, max_duration].
        durations = self.manifest.duration
        within_bounds = ((durations >= self._min_duration)
                         & (durations <= self._max_duration))
        self.manifest = self.manifest[within_bounds]

        # Order the remaining rows from shortest to longest.
        self.manifest = self.manifest.sort_values(by=["duration"],
                                                  ascending=True)

        self._normalizer = FeatureNormalizer(mean_std_filepath)

        self._augmentation_pipeline = AugmentationPipeline(
            augmentation_config=augmentation_config, random_seed=random_seed)

        self._speech_featurizer = SpeechFeaturizer(
            vocab_filepath=vocab_filepath,
            specgram_type=specgram_type,
            stride_ms=stride_ms,
            window_ms=window_ms,
            max_freq=max_freq,
            use_dB_normalization=use_dB_normalization)
Ejemplo n.º 8
0
def main():
    """Compute normalization statistics from a manifest and save them.

    Everything is driven by the module-level ``args``: ``specgram_type``,
    ``manifest_path``, ``num_samples`` and ``output_path``.
    """
    print_arguments(args)

    featurizer = AudioFeaturizer(specgram_type=args.specgram_type)

    def featurize_only(audio_segment):
        # No augmentation in this variant: just featurize the segment.
        return featurizer.featurize(audio_segment)

    normalizer_options = dict(
        mean_std_filepath=None,
        manifest_path=args.manifest_path,
        featurize_func=featurize_only,
        num_samples=args.num_samples)
    stats = FeatureNormalizer(**normalizer_options)
    stats.write_to_file(args.output_path)
Ejemplo n.º 9
0
def main():
    """Compute normalization statistics from a manifest and save them.

    Everything is driven by the module-level ``args``: ``specgram_type``,
    ``manifest_path``, ``num_samples`` and ``output_path``. Each audio
    segment goes through an ``AugmentationPipeline`` built from the config
    string ``'{}'`` before it is featurized.
    """
    print_arguments(args)

    pipeline = AugmentationPipeline('{}')
    featurizer = AudioFeaturizer(specgram_type=args.specgram_type)

    def transform_then_featurize(audio_segment):
        # The return value of transform_audio is ignored, as before.
        pipeline.transform_audio(audio_segment)
        return featurizer.featurize(audio_segment)

    # Randomly take the specified number of samples to compute the
    # mean/std normalization statistics.
    normalizer_options = dict(mean_std_filepath=None,
                              manifest_path=args.manifest_path,
                              featurize_func=transform_then_featurize,
                              num_samples=args.num_samples)
    stats = FeatureNormalizer(**normalizer_options)
    # Save the computed result to the output file.
    stats.write_to_file(args.output_path)
Ejemplo n.º 10
0
        'ctc_beam_search',
        '结果解码方法',
        choices=['ctc_beam_search', 'ctc_greedy'])
add_arg('lang_model_path', str, 'lm/zh_giga.no_cna_cmn.prune01244.klm',
        "语言模型文件路径")
args = parser.parse_args()

print_arguments(args)
# Load the data dictionary (vocabulary) file.
# NOTE(review): eval() executes whatever the file contains. If the file holds
# a plain Python literal, ast.literal_eval would be a safer way to read it --
# confirm the file format before changing.
with open(args.dataset_vocab, 'r', encoding='utf-8') as f:
    labels = eval(f.read())
# Collect the entries in index order 0..len(labels)-1.
vocabulary = [labels[i] for i in range(len(labels))]

# Create the audio featurizer and the feature normalizer.
audio_featurizer = AudioFeaturizer()
normalizer = FeatureNormalizer(mean_std_filepath=args.mean_std_path)

# Create the model.
model = DeepSpeech2Model(feat_size=audio_featurizer.feature_dim(),
                         dict_size=len(vocabulary),
                         num_conv_layers=args.num_conv_layers,
                         num_rnn_layers=args.num_rnn_layers,
                         rnn_size=args.rnn_layer_size)
# Load the saved parameters from 'model.pdparams' under args.model_path.
model.set_state_dict(
    paddle.load(os.path.join(args.model_path, 'model.pdparams')))
model.eval()

# 集束搜索方法的处理
if args.decoder == "ctc_beam_search":
    try:
        from decoders.beam_search_decoder import BeamSearchDecoder