Example #1
def infer():
    """Inference for DeepSpeech2."""
    data_generator = DataGenerator(
        vocab_filepath=args.vocab_path,
        mean_std_filepath=args.mean_std_path,
        augmentation_config='{}',
        specgram_type=args.specgram_type,
        num_threads=1,
        keep_transcription_text=True)
    batch_reader = data_generator.batch_reader_creator(
        manifest_path=args.infer_manifest,
        batch_size=args.batch_size,
        min_batch_size=1,
        sortagrad=False,
        shuffle_method=None)
    ds2_model = DeepSpeech2Model(
        vocab_size=data_generator.vocab_size,
        num_conv_layers=args.num_conv_layers,
        num_rnn_layers=args.num_rnn_layers,
        rnn_layer_size=args.rnn_layer_size,
        use_gru=args.use_gru,
        pretrained_model_path=args.model_path,
        share_rnn_weights=args.share_rnn_weights)

    # decoders only accept string encoded in utf-8
    vocab_list = [chars.encode("utf-8") for chars in data_generator.vocab_list]
    for item in vocab_list[:10]:
        print(item)  # entries were already utf-8 encoded above
    if args.decoding_method == "ctc_greedy":
        ds2_model.logger.info("start inference ...")
        for infer_data in batch_reader():
            probs_split = ds2_model.infer_batch_probs(
                infer_data=infer_data,
                feeding_dict=data_generator.feeding)
            result_transcripts = ds2_model.decode_batch_greedy(
                probs_split=probs_split,
                vocab_list=vocab_list)
    else:
        ds2_model.init_ext_scorer(args.alpha, args.beta, args.lang_model_path,
                                  vocab_list)
        ds2_model.logger.info("start inference ...")
        for infer_data in batch_reader():
            probs_split = ds2_model.infer_batch_probs(
                infer_data=infer_data,
                feeding_dict=data_generator.feeding)
            result_transcripts = ds2_model.decode_batch_beam_search(
                probs_split=probs_split,
                beam_alpha=args.alpha,
                beam_beta=args.beta,
                beam_size=args.beam_size,
                cutoff_prob=args.cutoff_prob,
                cutoff_top_n=args.cutoff_top_n,
                vocab_list=vocab_list,
                num_processes=args.num_proc_bsearch)

            with open(args.output_file, 'a+') as f:
                for result in result_transcripts:
                    print("\nOutput Transcription: %s" %
                          result.encode('utf-8'))
                    f.write(result.encode('utf-8'))
                    f.write('\n')
        ds2_model.logger.info("finish inference")
Example #2
def SpeechRecognizer():
    """Evaluate on whole test data for DeepSpeech2."""
    paddle.init(use_gpu=True,
                rnn_use_batch=True,
                trainer_count=1)
    data_generator = DataGenerator(
        vocab_filepath='models/aishell/vocab.txt',
        mean_std_filepath='models/aishell/mean_std.npz',
        augmentation_config='{}',
        specgram_type='linear',
        num_threads=8,
        keep_transcription_text=True)
    batch_reader = data_generator.batch_reader_creator(
        manifest_path='data/cctv/manifest',
        batch_size=128,
        min_batch_size=1,
        sortagrad=False,
        shuffle_method=None)

    ds2_model = DeepSpeech2Model(
        vocab_size=data_generator.vocab_size,
        num_conv_layers=2,
        num_rnn_layers=3,
        rnn_layer_size=1024,
        use_gru=True,
        pretrained_model_path='models/aishell/params.tar.gz',
        share_rnn_weights=False)

    # decoders only accept string encoded in utf-8
    vocab_list = [chars.encode("utf-8") for chars in data_generator.vocab_list]

    #if args.decoding_method == "ctc_beam_search":
    ds2_model.init_ext_scorer(2.6, 5.0, 'models/lm/zh_giga.no_cna_cmn.prune01244.klm',
                                vocab_list)
    ds2_model.logger.info("start evaluation ...")
    transcript = []
    bar = progressbar.ProgressBar(widgets=[
        progressbar.Percentage(),
        progressbar.Bar(),
        ' (', progressbar.SimpleProgress(), ') ',
        ' (', progressbar.ETA(), ') ', ])
    for infer_data in bar(batch_reader()):
        probs_split = ds2_model.infer_batch_probs(
            infer_data=infer_data,
            feeding_dict=data_generator.feeding)

        result_transcripts = ds2_model.decode_batch_beam_search(
            probs_split=probs_split,
            beam_alpha=2.6,
            beam_beta=5.0,
            beam_size=300,
            cutoff_prob=0.99,
            cutoff_top_n=40,
            vocab_list=vocab_list,
            num_processes=8)
        transcript += result_transcripts
        time.sleep(0.01)
    return transcript
Example #3
def tune():
    """Tune parameters alpha and beta on one minibatch."""
    if not args.num_alphas >= 0:
        raise ValueError("num_alphas must be non-negative!")
    if not args.num_betas >= 0:
        raise ValueError("num_betas must be non-negative!")

    data_generator = DataGenerator(
        vocab_filepath=args.vocab_path,
        mean_std_filepath=args.mean_std_path,
        augmentation_config='{}',
        specgram_type=args.specgram_type,
        num_threads=1)
    batch_reader = data_generator.batch_reader_creator(
        manifest_path=args.tune_manifest,
        batch_size=args.num_samples,
        sortagrad=False,
        shuffle_method=None)
    tune_data = next(batch_reader())
    target_transcripts = [
        ''.join([data_generator.vocab_list[token] for token in transcript])
        for _, transcript in tune_data
    ]

    ds2_model = DeepSpeech2Model(
        vocab_size=data_generator.vocab_size,
        num_conv_layers=args.num_conv_layers,
        num_rnn_layers=args.num_rnn_layers,
        rnn_layer_size=args.rnn_layer_size,
        use_gru=args.use_gru,
        pretrained_model_path=args.model_path,
        share_rnn_weights=args.share_rnn_weights)

    # create grid for search
    cand_alphas = np.linspace(args.alpha_from, args.alpha_to, args.num_alphas)
    cand_betas = np.linspace(args.beta_from, args.beta_to, args.num_betas)
    params_grid = [(alpha, beta) for alpha in cand_alphas
                   for beta in cand_betas]

    # tune parameters in a loop
    for alpha, beta in params_grid:
        result_transcripts = ds2_model.infer_batch(
            infer_data=tune_data,
            decoding_method='ctc_beam_search',
            beam_alpha=alpha,
            beam_beta=beta,
            beam_size=args.beam_size,
            cutoff_prob=args.cutoff_prob,
            vocab_list=data_generator.vocab_list,
            language_model_path=args.lang_model_path,
            num_processes=args.num_proc_bsearch)
        wer_sum, num_ins = 0.0, 0
        for target, result in zip(target_transcripts, result_transcripts):
            wer_sum += wer(target, result)
            num_ins += 1
        print("alpha = %f\tbeta = %f\tWER = %f" %
              (alpha, beta, wer_sum / num_ins))
Example #4
def decode_all(manifests):
    data_generator = DataGenerator(
        vocab_filepath=args.vocab_path,
        mean_std_filepath=args.mean_std_path,
        augmentation_config='{}',
        specgram_type=args.specgram_type,
        num_threads=1,
        keep_transcription_text=True)

    ds2_model = DeepSpeech2Model(
        vocab_size=data_generator.vocab_size,
        num_conv_layers=args.num_conv_layers,
        num_rnn_layers=args.num_rnn_layers,
        rnn_layer_size=args.rnn_layer_size,
        use_gru=args.use_gru,
        pretrained_model_path=args.model_path,
        share_rnn_weights=args.share_rnn_weights)

    # decoders only accept string encoded in utf-8
    alphabet = Alphabet(args.vocab_path)
    ds2_model.logger.info("start decoding with extended output...")
    ds2_model.init_ext_scorer(args.alpha, args.beta,
                              args.lang_model_path, args.trie_path,
                              alphabet)

    for audioname, manifest_path, duration, offset in manifests:
        try:
            duration_f = float(duration)
            if duration_f < 1.:
                yield (audioname, manifest_path,
                       None, duration, offset)
                continue
        except (TypeError, ValueError):
            pass
        batch_reader = data_generator.batch_reader_creator(
            manifest_path=manifest_path,
            batch_size=args.num_samples,
            min_batch_size=1,
            sortagrad=False,
            shuffle_method=None)

        for decode_data in batch_reader():
            probs_split = ds2_model.infer_batch_probs(
                infer_data=decode_data,
                feeding_dict=data_generator.feeding)

            # note: we only perform single file decoding
            result_transcript = ds2_model.decode_beam_search(
                probs_split=probs_split,
                beam_size=args.beam_size,
                cutoff_prob=args.cutoff_prob,
                cutoff_top_n=args.cutoff_top_n,
                alphabet=alphabet)

            yield (audioname, manifest_path,
                   result_transcript, duration, offset)
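Example #5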
def infer(filenum):
    """Inference for DeepSpeech2."""
    data_generator = DataGenerator(
        vocab_filepath=args.vocab_path,
        mean_std_filepath=args.mean_std_path,
        augmentation_config='{}',
        specgram_type=args.specgram_type,
        num_threads=1,
        keep_transcription_text=True)
    batch_reader = data_generator.batch_reader_creator(
        manifest_path=args.infer_manifest,
        batch_size=args.batch_size,
        min_batch_size=1,
        sortagrad=False,
        shuffle_method=None)
    ds2_model = DeepSpeech2Model(
        vocab_size=data_generator.vocab_size,
        num_conv_layers=args.num_conv_layers,
        num_rnn_layers=args.num_rnn_layers,
        rnn_layer_size=args.rnn_layer_size,
        use_gru=args.use_gru,
        pretrained_model_path=args.model_path,
        share_rnn_weights=args.share_rnn_weights)
    # decoders only accept string encoded in utf-8
    vocab_list = [chars.encode("utf-8") for chars in data_generator.vocab_list]
    if args.decoding_method == "ctc_beam_search":
        ds2_model.init_ext_scorer(args.alpha, args.beta, args.lang_model_path, vocab_list)
    ds2_model.logger.info("start inference ...")
    transcript = []
    widgets = ["Start inference ...: ", Percentage(), ' ', Bar(), ' ', ETA()]
    # round up so that maxval covers a final partial batch
    pbar = ProgressBar(widgets=widgets,
                       maxval=(filenum + args.batch_size - 1) // args.batch_size).start()
    for i, infer_data in enumerate(batch_reader()):
        if args.decoding_method == "ctc_greedy":
            probs_split = ds2_model.infer_batch_probs(infer_data=infer_data, feeding_dict=data_generator.feeding)
            result_transcripts = ds2_model.decode_batch_greedy(probs_split=probs_split, vocab_list=vocab_list)
        else:
            probs_split = ds2_model.infer_batch_probs(infer_data=infer_data, feeding_dict=data_generator.feeding)
            result_transcripts = ds2_model.decode_batch_beam_search(
                probs_split=probs_split,
                beam_alpha=args.alpha,
                beam_beta=args.beta,
                beam_size=args.beam_size,
                cutoff_prob=args.cutoff_prob,
                cutoff_top_n=args.cutoff_top_n,
                vocab_list=vocab_list,
                num_processes=args.num_proc_bsearch)
        transcript += result_transcripts
        pbar.update(i)
    pbar.finish()
    print("finish inference")
    return transcript
Example #6
def evaluate():
    """Evaluate on whole test data for DeepSpeech2."""
    data_generator = DataGenerator(
        vocab_filepath=args.vocab_path,
        mean_std_filepath=args.mean_std_path,
        augmentation_config='{}',
        specgram_type=args.specgram_type,
        num_threads=args.num_proc_data)
    batch_reader = data_generator.batch_reader_creator(
        manifest_path=args.test_manifest,
        batch_size=args.batch_size,
        min_batch_size=1,
        sortagrad=False,
        shuffle_method=None)

    ds2_model = DeepSpeech2Model(
        vocab_size=data_generator.vocab_size,
        num_conv_layers=args.num_conv_layers,
        num_rnn_layers=args.num_rnn_layers,
        rnn_layer_size=args.rnn_layer_size,
        use_gru=args.use_gru,
        pretrained_model_path=args.model_path,
        share_rnn_weights=args.share_rnn_weights)

    error_rate_func = cer if args.error_rate_type == 'cer' else wer
    error_sum, num_ins = 0.0, 0
    for infer_data in batch_reader():
        result_transcripts = ds2_model.infer_batch(
            infer_data=infer_data,
            decoding_method=args.decoding_method,
            beam_alpha=args.alpha,
            beam_beta=args.beta,
            beam_size=args.beam_size,
            cutoff_prob=args.cutoff_prob,
            vocab_list=data_generator.vocab_list,
            language_model_path=args.lang_model_path,
            num_processes=args.num_proc_bsearch)
        target_transcripts = [
            ''.join([data_generator.vocab_list[token] for token in transcript])
            for _, transcript in infer_data
        ]
        for target, result in zip(target_transcripts, result_transcripts):
            error_sum += error_rate_func(target, result)
            num_ins += 1
        print("Error rate [%s] (%d/?) = %f" %
              (args.error_rate_type, num_ins, error_sum / num_ins))
    print("Final error rate [%s] (%d/%d) = %f" %
          (args.error_rate_type, num_ins, num_ins, error_sum / num_ins))
Example #7
    def _start_server(self):
        self.data_generator = DataGenerator(
            vocab_filepath=args.vocab_path,
            mean_std_filepath=args.mean_std_path,
            augmentation_config='{}',
            specgram_type=SPECGRAM_TYPE,
            place=self.place,
            keep_transcription_text=True)

        self.ds2_model = DeepSpeech2Model(
            vocab_size=self.data_generator.vocab_size,
            num_conv_layers=NUM_CONV_LAYERS,
            num_rnn_layers=NUM_RNN_LAYERS,
            rnn_layer_size=RNN_LAYER_SIZE,
            use_gru=USE_GRU,
            init_from_pretrained_model=args.model_path,
            place=self.place,
            share_rnn_weights=SHARE_RNN_WEIGHTS)

        self.vocab_list = list(self.data_generator.vocab_list)

        if args.decoding_method == "ctc_beam_search":
            self.ds2_model.init_ext_scorer(args.alpha, args.beta,
                                           args.lang_model_path,
                                           self.vocab_list)

        print('-----------------------------------------------------------')
        print('Warming up ...')
        self._warm_up_test(num_test_cases=3)
        print('-----------------------------------------------------------')
Example #8
def infer():
    """Inference for DeepSpeech2."""
    data_generator = DataGenerator(
        vocab_filepath=args.vocab_path,
        mean_std_filepath=args.mean_std_path,
        augmentation_config='{}',
        specgram_type=args.specgram_type,
        num_threads=1)
    batch_reader = data_generator.batch_reader_creator(
        manifest_path=args.infer_manifest,
        batch_size=args.num_samples,
        min_batch_size=1,
        sortagrad=False,
        shuffle_method=None)
    infer_data = next(batch_reader())

    ds2_model = DeepSpeech2Model(
        vocab_size=data_generator.vocab_size,
        num_conv_layers=args.num_conv_layers,
        num_rnn_layers=args.num_rnn_layers,
        rnn_layer_size=args.rnn_layer_size,
        use_gru=args.use_gru,
        pretrained_model_path=args.model_path,
        share_rnn_weights=args.share_rnn_weights)
    result_transcripts = ds2_model.infer_batch(
        infer_data=infer_data,
        decoding_method=args.decoding_method,
        beam_alpha=args.alpha,
        beam_beta=args.beta,
        beam_size=args.beam_size,
        cutoff_prob=args.cutoff_prob,
        vocab_list=data_generator.vocab_list,
        language_model_path=args.lang_model_path,
        num_processes=args.num_proc_bsearch)

    error_rate_func = cer if args.error_rate_type == 'cer' else wer
    target_transcripts = [
        ''.join([data_generator.vocab_list[token] for token in transcript])
        for _, transcript in infer_data
    ]
    for target, result in zip(target_transcripts, result_transcripts):
        print("\nTarget Transcription: %s\nOutput Transcription: %s" %
              (target, result))
        print("Current error rate [%s] = %f" %
              (args.error_rate_type, error_rate_func(target, result)))
Example #9
def start_server():
    """Start the ASR server"""
    # prepare data generator
    data_generator = DataGenerator(
        vocab_filepath=args.vocab_path,
        mean_std_filepath=args.mean_std_path,
        augmentation_config='{}',
        specgram_type=args.specgram_type,
        num_threads=1)
    # prepare ASR model
    ds2_model = DeepSpeech2Model(
        vocab_size=data_generator.vocab_size,
        num_conv_layers=args.num_conv_layers,
        num_rnn_layers=args.num_rnn_layers,
        rnn_layer_size=args.rnn_layer_size,
        use_gru=args.use_gru,
        pretrained_model_path=args.model_path,
        share_rnn_weights=args.share_rnn_weights)

    # prepare ASR inference handler
    def file_to_transcript(filename):
        feature = data_generator.process_utterance(filename, "")
        result_transcript = ds2_model.infer_batch(
            infer_data=[feature],
            decoding_method=args.decoding_method,
            beam_alpha=args.alpha,
            beam_beta=args.beta,
            beam_size=args.beam_size,
            cutoff_prob=args.cutoff_prob,
            vocab_list=data_generator.vocab_list,
            language_model_path=args.lang_model_path,
            num_processes=1)
        return result_transcript[0]

    # warming up with utterances sampled from Librispeech
    print('-----------------------------------------------------------')
    print('Warming up ...')
    warm_up_test(
        audio_process_handler=file_to_transcript,
        manifest_path=args.warmup_manifest,
        num_test_cases=3)
    print('-----------------------------------------------------------')

    # start the server
    server = AsrTCPServer(
        server_address=(args.host_ip, args.host_port),
        RequestHandlerClass=AsrRequestHandler,
        speech_save_dir=args.speech_save_dir,
        audio_process_handler=file_to_transcript)
    print("ASR Server Started.")
    server.serve_forever()
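Example #10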
def train():
    """DeepSpeech2 training."""

    # check if set use_gpu=True in paddlepaddle cpu version
    check_cuda(args.use_gpu)
    # check if paddlepaddle version is satisfied
    check_version()

    if args.use_gpu:
        place = fluid.CUDAPlace(0)
    else:
        place = fluid.CPUPlace()

    train_generator = DataGenerator(vocab_filepath=args.vocab_path,
                                    mean_std_filepath=args.mean_std_path,
                                    augmentation_config=io.open(args.augment_conf_path, mode='r', encoding='utf8').read(),
                                    max_duration=args.max_duration,
                                    min_duration=args.min_duration,
                                    specgram_type=args.specgram_type,
                                    place=place)
    dev_generator = DataGenerator(vocab_filepath=args.vocab_path,
                                  mean_std_filepath=args.mean_std_path,
                                  augmentation_config="{}",
                                  specgram_type=args.specgram_type,
                                  place=place)
    train_batch_reader = train_generator.batch_reader_creator(manifest_path=args.train_manifest,
                                                              batch_size=args.batch_size,
                                                              sortagrad=args.use_sortagrad if args.init_from_pretrained_model is None else False,
                                                              shuffle_method=args.shuffle_method)
    dev_batch_reader = dev_generator.batch_reader_creator(manifest_path=args.dev_manifest,
                                                          batch_size=args.batch_size,
                                                          sortagrad=False,
                                                          shuffle_method=None)

    ds2_model = DeepSpeech2Model(vocab_size=train_generator.vocab_size,
                                 num_conv_layers=args.num_conv_layers,
                                 num_rnn_layers=args.num_rnn_layers,
                                 rnn_layer_size=args.rnn_layer_size,
                                 use_gru=args.use_gru,
                                 share_rnn_weights=args.share_rnn_weights,
                                 place=place,
                                 init_from_pretrained_model=args.init_from_pretrained_model,
                                 output_model_dir=args.output_model_dir)

    ds2_model.train(train_batch_reader=train_batch_reader,
                    dev_batch_reader=dev_batch_reader,
                    learning_rate=args.learning_rate,
                    gradient_clipping=400,
                    batch_size=args.batch_size,
                    num_samples=args.num_samples,
                    num_epoch=args.num_epoch,
                    save_epoch=args.save_epoch,
                    num_iterations_print=args.num_iter_print,
                    test_off=args.test_off)
Example #11
def train():
    """DeepSpeech2 training."""
    train_generator = DataGenerator(
        vocab_filepath=args.vocab_path,
        mean_std_filepath=args.mean_std_path,
        augmentation_config=open(args.augment_conf_path, 'r').read(),
        max_duration=args.max_duration,
        min_duration=args.min_duration,
        specgram_type=args.specgram_type,
        num_threads=args.num_proc_data)
    dev_generator = DataGenerator(
        vocab_filepath=args.vocab_path,
        mean_std_filepath=args.mean_std_path,
        augmentation_config="{}",
        specgram_type=args.specgram_type,
        num_threads=args.num_proc_data)
    train_batch_reader = train_generator.batch_reader_creator(
        manifest_path=args.train_manifest,
        batch_size=args.batch_size,
        min_batch_size=args.trainer_count,
        sortagrad=args.use_sortagrad if args.init_model_path is None else False,
        shuffle_method=args.shuffle_method)
    dev_batch_reader = dev_generator.batch_reader_creator(
        manifest_path=args.dev_manifest,
        batch_size=args.batch_size,
        min_batch_size=1,  # must be 1; larger values can cause errors.
        sortagrad=False,
        shuffle_method=None)

    ds2_model = DeepSpeech2Model(
        vocab_size=train_generator.vocab_size,
        num_conv_layers=args.num_conv_layers,
        num_rnn_layers=args.num_rnn_layers,
        rnn_layer_size=args.rnn_layer_size,
        use_gru=args.use_gru,
        pretrained_model_path=args.init_model_path,
        share_rnn_weights=args.share_rnn_weights)
    ds2_model.train(
        train_batch_reader=train_batch_reader,
        dev_batch_reader=dev_batch_reader,
        feeding_dict=train_generator.feeding,
        learning_rate=args.learning_rate,
        gradient_clipping=400,
        num_passes=args.num_passes,
        num_iterations_print=args.num_iter_print,
        output_model_dir=args.output_model_dir,
        is_local=args.is_local,
        test_off=args.test_off)
Example #12
def load_model():
    # check if set use_gpu=True in paddlepaddle cpu version
    check_cuda(args.use_gpu)
    # check if paddlepaddle version is satisfied
    check_version()

    if args.use_gpu:
        place = fluid.CUDAPlace(0)
    else:
        place = fluid.CPUPlace()
    
    # Load model
    data_generator = DataGenerator(
        vocab_filepath=args.vocab_path,
        mean_std_filepath=args.mean_std_path,
        augmentation_config='{}',
        specgram_type=args.specgram_type,
        keep_transcription_text=True,
        place=place,
        is_training=False)
    
    ds2_model = DeepSpeech2Model(
        vocab_size=data_generator.vocab_size,
        num_conv_layers=args.num_conv_layers,
        num_rnn_layers=args.num_rnn_layers,
        rnn_layer_size=args.rnn_layer_size,
        use_gru=args.use_gru,
        share_rnn_weights=args.share_rnn_weights,
        place=place,
        init_from_pretrained_model=args.model_path)

    # vocabulary list for the decoders
    vocab_list = data_generator.vocab_list

    ds2_model.init_ext_scorer(args.alpha, args.beta, args.lang_model_path,
                              vocab_list)
            
    return ds2_model, data_generator, vocab_list
Example #13
def infer():
    """Inference for DeepSpeech2."""

    # check if set use_gpu=True in paddlepaddle cpu version
    check_cuda(args.use_gpu)
    # check if paddlepaddle version is satisfied
    check_version()

    if args.use_gpu:
        place = fluid.CUDAPlace(0)
    else:
        place = fluid.CPUPlace()

    data_generator = DataGenerator(
        vocab_filepath=args.vocab_path,
        mean_std_filepath=args.mean_std_path,
        augmentation_config='{}',
        specgram_type=args.specgram_type,
        keep_transcription_text=True,
        place=place,
        is_training=False)
    batch_reader = data_generator.batch_reader_creator(
        manifest_path=args.infer_manifest,
        batch_size=args.num_samples,
        sortagrad=False,
        shuffle_method=None)
    infer_data = next(batch_reader())

    ds2_model = DeepSpeech2Model(
        vocab_size=data_generator.vocab_size,
        num_conv_layers=args.num_conv_layers,
        num_rnn_layers=args.num_rnn_layers,
        rnn_layer_size=args.rnn_layer_size,
        use_gru=args.use_gru,
        share_rnn_weights=args.share_rnn_weights,
        place=place,
        init_from_pretrained_model=args.model_path)

    # decoders only accept string encoded in utf-8
    vocab_list = [chars.encode("utf-8") for chars in data_generator.vocab_list]

    if args.decoding_method == "ctc_greedy":
        ds2_model.logger.info("start inference ...")
        probs_split = ds2_model.infer_batch_probs(
            infer_data=infer_data,
            feeding_dict=data_generator.feeding)

        result_transcripts = ds2_model.decode_batch_greedy(
            probs_split=probs_split,
            vocab_list=vocab_list)
    else:
        ds2_model.init_ext_scorer(args.alpha, args.beta, args.lang_model_path,
                                  vocab_list)
        ds2_model.logger.info("start inference ...")
        probs_split = ds2_model.infer_batch_probs(
            infer_data=infer_data,
            feeding_dict=data_generator.feeding)

        result_transcripts = ds2_model.decode_batch_beam_search(
            probs_split=probs_split,
            beam_alpha=args.alpha,
            beam_beta=args.beta,
            beam_size=args.beam_size,
            cutoff_prob=args.cutoff_prob,
            cutoff_top_n=args.cutoff_top_n,
            vocab_list=vocab_list,
            num_processes=args.num_proc_bsearch)

    error_rate_func = cer if args.error_rate_type == 'cer' else wer
    target_transcripts = infer_data[1]
    for target, result in zip(target_transcripts, result_transcripts):
        print("\nTarget Transcription: %s\nOutput Transcription: %s" %
              (target, result))
        print("Current error rate [%s] = %f" %
              (args.error_rate_type, error_rate_func(target, result)))

    ds2_model.logger.info("finish inference")
Example #14
def tune():
    # tune the alpha and beta parameters incrementally
    if not args.num_alphas >= 0:
        raise ValueError("num_alphas must be non-negative!")
    if not args.num_betas >= 0:
        raise ValueError("num_betas must be non-negative!")

    # whether to use the GPU
    place = fluid.CUDAPlace(0) if args.use_gpu else fluid.CPUPlace()

    # create the data generator
    data_generator = DataGenerator(vocab_filepath=args.vocab_path,
                                   mean_std_filepath=args.mean_std_path,
                                   augmentation_config='{}',
                                   specgram_type=args.specgram_type,
                                   keep_transcription_text=True,
                                   place=place,
                                   is_training=False)
    # create the batch reader for the tuning data
    batch_reader = data_generator.batch_reader_creator(manifest_path=args.tune_manifest,
                                                       batch_size=args.batch_size,
                                                       shuffle_method=None)
    # build the DeepSpeech2 model and set it to inference mode
    ds2_model = DeepSpeech2Model(vocab_size=data_generator.vocab_size,
                                 num_conv_layers=args.num_conv_layers,
                                 num_rnn_layers=args.num_rnn_layers,
                                 rnn_layer_size=args.rnn_layer_size,
                                 use_gru=args.use_gru,
                                 place=place,
                                 init_from_pretrained_model=args.model_path,
                                 share_rnn_weights=args.share_rnn_weights,
                                 is_infer=True)

    # choose the error metric: character error rate or word error rate
    errors_func = char_errors if args.error_rate_type == 'cer' else word_errors
    # create the grid of candidate alphas and betas
    cand_alphas = np.linspace(args.alpha_from, args.alpha_to, args.num_alphas)
    cand_betas = np.linspace(args.beta_from, args.beta_to, args.num_betas)
    params_grid = [(alpha, beta) for alpha in cand_alphas for beta in cand_betas]

    err_sum = [0.0 for i in range(len(params_grid))]
    err_ave = [0.0 for i in range(len(params_grid))]
    num_ins, len_refs, cur_batch = 0, 0, 0
    # initialize the external scorer for beam search
    ds2_model.init_ext_scorer(args.alpha_from, args.beta_from, args.lang_model_path, data_generator.vocab_list)
    # incrementally tune the parameters over multiple batches
    ds2_model.logger.info("start tuning ...")
    for infer_data in batch_reader():
        if (args.num_batches >= 0) and (cur_batch >= args.num_batches):
            break
        # run inference
        probs_split = ds2_model.infer_batch_probs(infer_data=infer_data)
        target_transcripts = infer_data[1]

        num_ins += len(target_transcripts)
        # grid search over the alpha and beta candidates
        for index, (alpha, beta) in enumerate(tqdm(params_grid)):
            result_transcripts = ds2_model.decode_batch_beam_search(probs_split=probs_split,
                                                                    beam_alpha=alpha,
                                                                    beam_beta=beta,
                                                                    beam_size=args.beam_size,
                                                                    cutoff_prob=args.cutoff_prob,
                                                                    cutoff_top_n=args.cutoff_top_n,
                                                                    vocab_list=data_generator.vocab_list,
                                                                    num_processes=args.num_proc_bsearch)
            for target, result in zip(target_transcripts, result_transcripts):
                errors, len_ref = errors_func(target, result)
                err_sum[index] += errors
                if args.alpha_from == alpha and args.beta_from == beta:
                    len_refs += len_ref

            err_ave[index] = err_sum[index] / len_refs

        # print the result of the current batch
        err_ave_min = min(err_ave)
        min_index = err_ave.index(err_ave_min)
        print("\nBatch %d [%d/?], current opt (alpha, beta) = (%s, %s), "
              " min [%s] = %f" % (cur_batch, num_ins, "%.3f" % params_grid[min_index][0],
                                  "%.3f" % params_grid[min_index][1], args.error_rate_type, err_ave_min))
        cur_batch += 1

    # print the error rate at every (alpha, beta)
    print("\nFinal %s:\n" % args.error_rate_type)
    for index in range(len(params_grid)):
        print("(alpha, beta) = (%s, %s), [%s] = %f"
              % ("%.3f" % params_grid[index][0], "%.3f" % params_grid[index][1], args.error_rate_type, err_ave[index]))

    err_ave_min = min(err_ave)
    min_index = err_ave.index(err_ave_min)
    print("\n一共使用了 %d 批数据推理, 最优的参数为 (alpha, beta) = (%s, %s)"
          % (cur_batch, "%.3f" % params_grid[min_index][0], "%.3f" % params_grid[min_index][1]))

    ds2_model.logger.info("finish tuning")
# NOTE: this snippet is truncated at the top; the lines below are presumably
# the tail of an argument definition such as add_arg('specgram_type', str, ...):
add_arg('specgram_type', str,
        'linear',
        "Audio feature type. Options: linear, mfcc.",
        choices=['linear', 'mfcc'])
args = parser.parse_args()

# whether to use the GPU
if args.use_gpu:
    place = fluid.CUDAPlace(0)
else:
    place = fluid.CPUPlace()

# create the data generator, needed for data processing and the vocabulary
data_generator = DataGenerator(vocab_filepath=args.vocab_path,
                               mean_std_filepath=args.mean_std_path,
                               augmentation_config='{}',
                               specgram_type=args.specgram_type,
                               keep_transcription_text=True,
                               place=place,
                               is_training=False)
# build the DeepSpeech2 model and set it to inference mode
ds2_model = DeepSpeech2Model(vocab_size=data_generator.vocab_size,
                             num_conv_layers=args.num_conv_layers,
                             num_rnn_layers=args.num_rnn_layers,
                             rnn_layer_size=args.rnn_layer_size,
                             use_gru=args.use_gru,
                             init_from_pretrained_model=args.model_path,
                             place=place,
                             share_rnn_weights=args.share_rnn_weights,
                             is_infer=True)
# set up the beam search decoding method
if args.decoding_method == "ctc_beam_search":
    # body truncated in the source; the other examples initialize the
    # external scorer at this point:
    ds2_model.init_ext_scorer(args.alpha, args.beta, args.lang_model_path,
                              data_generator.vocab_list)
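Example #16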
def SpeechRecognizer():
    """Inference for DeepSpeech2."""
    paddle.init(use_gpu=True, rnn_use_batch=True, trainer_count=1)
    data_generator = DataGenerator(
        vocab_filepath='models/aishell/vocab.txt',
        mean_std_filepath='data/aishell/mean_std.npz',
        augmentation_config='{}',
        specgram_type='linear',
        num_threads=1,
        keep_transcription_text=True)
    batch_reader = data_generator.batch_reader_creator(
        manifest_path='data/cctv/manifest',
        batch_size=10,
        min_batch_size=1,
        sortagrad=False,
        shuffle_method=None)
    infer_data = next(batch_reader())

    ds2_model = DeepSpeech2Model(
        vocab_size=data_generator.vocab_size,
        num_conv_layers=2,
        num_rnn_layers=3,
        rnn_layer_size=1024,
        use_gru=True,
        pretrained_model_path='models/aishell/params.tar.gz',
        share_rnn_weights=False)

    # decoders only accept string encoded in utf-8
    vocab_list = [chars.encode("utf-8") for chars in data_generator.vocab_list]

    # if args.decoding_method == "ctc_greedy":
    #     ds2_model.logger.info("start inference ...")
    #     probs_split = ds2_model.infer_batch_probs(infer_data=infer_data,
    #         feeding_dict=data_generator.feeding)
    #     result_transcripts = ds2_model.decode_batch_greedy(
    #         probs_split=probs_split,
    #         vocab_list=vocab_list)
    # else:
    ds2_model.init_ext_scorer(2.6, 5.0,
                              'models/lm/zh_giga.no_cna_cmn.prune01244.klm',
                              vocab_list)
    ds2_model.logger.info("start inference ...")
    probs_split = ds2_model.infer_batch_probs(
        infer_data=infer_data, feeding_dict=data_generator.feeding)
    result_transcripts = ds2_model.decode_batch_beam_search(
        probs_split=probs_split,
        beam_alpha=2.6,
        beam_beta=5.0,
        beam_size=300,
        cutoff_prob=0.99,
        cutoff_top_n=40,
        vocab_list=vocab_list,
        num_processes=8)

    # for result in result_transcripts:
    #     print("\nOutput Transcription: %s" %
    #           result)
    transcript = result_transcripts[:]
    return transcript
Example #17
def infer(transcript_name):
    """Inference for DeepSpeech2."""

    # check if set use_gpu=True in paddlepaddle cpu version
    check_cuda(args.use_gpu)
    # check if paddlepaddle version is satisfied
    check_version()

    if args.use_gpu:
        place = fluid.CUDAPlace(0)
    else:
        place = fluid.CPUPlace()

    data_generator = DataGenerator(
        vocab_filepath=args.vocab_path,
        mean_std_filepath=args.mean_std_path,
        augmentation_config='{}',
        specgram_type=args.specgram_type,
        keep_transcription_text=True,
        place=place,
        is_training=False)
    batch_reader = data_generator.batch_reader_creator(
        manifest_path=args.infer_manifest,
        batch_size=args.num_samples,
        sortagrad=False,
        shuffle_method=None)
    infer_data = next(batch_reader())

    ds2_model = DeepSpeech2Model(
        vocab_size=data_generator.vocab_size,
        num_conv_layers=args.num_conv_layers,
        num_rnn_layers=args.num_rnn_layers,
        rnn_layer_size=args.rnn_layer_size,
        use_gru=args.use_gru,
        share_rnn_weights=args.share_rnn_weights,
        place=place,
        init_from_pretrained_model=args.model_path)

    # decoders only accept string encoded in utf-8
    vocab_list = [chars.encode("utf-8") for chars in data_generator.vocab_list]

    if args.decoding_method == "ctc_greedy":
        ds2_model.logger.info("start inference ...")
        probs_split = ds2_model.infer_batch_probs(
            infer_data=infer_data,
            feeding_dict=data_generator.feeding)

        result_transcripts = ds2_model.decode_batch_greedy(
            probs_split=probs_split,
            vocab_list=vocab_list)
    else:
        ds2_model.init_ext_scorer(args.alpha, args.beta, args.lang_model_path,
                                  vocab_list)
        ds2_model.logger.info("start inference ...")
        probs_split = ds2_model.infer_batch_probs(
            infer_data=infer_data,
            feeding_dict=data_generator.feeding)

        result_transcripts = ds2_model.decode_batch_beam_search(
            probs_split=probs_split,
            beam_alpha=args.alpha,
            beam_beta=args.beta,
            beam_size=args.beam_size,
            cutoff_prob=args.cutoff_prob,
            cutoff_top_n=args.cutoff_top_n,
            vocab_list=vocab_list,
            num_processes=args.num_proc_bsearch)

    transcription = result_transcripts[0].capitalize() + '.'
    print(transcription)

    with codecs.open('dataset/tap/transcription/'+transcript_name+'.txt', 'w', 'utf-8') as out_file:
        out_file.write(transcription)

    ds2_model.logger.info("finish inference")
Example #18
def infer():
    """Inference for DeepSpeech2."""
    # extract the utterance durations from the manifest
    timelist = []
    with open(args.infer_manifest) as f:
        for line in f:
            timelist.append(json.loads(line.strip())['duration'])
    data_generator = DataGenerator(
        vocab_filepath=args.vocab_path,
        mean_std_filepath=args.mean_std_path,
        augmentation_config='{}',
        specgram_type=args.specgram_type,
        num_threads=1,
        keep_transcription_text=True)
    batch_reader = data_generator.batch_reader_creator(
        manifest_path=args.infer_manifest,
        batch_size=args.batch_size,
        min_batch_size=1,
        sortagrad=False,
        shuffle_method=None) 
    ds2_model = DeepSpeech2Model(
        vocab_size=data_generator.vocab_size,
        num_conv_layers=args.num_conv_layers,
        num_rnn_layers=args.num_rnn_layers,
        rnn_layer_size=args.rnn_layer_size,
        use_gru=args.use_gru,
        pretrained_model_path=args.model_path,
        share_rnn_weights=args.share_rnn_weights)

    # decoders only accept string encoded in utf-8
    vocab_list = [chars.encode("utf-8") for chars in data_generator.vocab_list]
    if args.decoding_method == "ctc_greedy":
        ds2_model.logger.info("start inference ...")
        probs_split = ds2_model.infer_batch_probs(infer_data=infer_data,
            feeding_dict=data_generator.feeding)
        result_transcripts = ds2_model.decode_batch_greedy(
            probs_split=probs_split,
            vocab_list=vocab_list)
    else:
        ds2_model.init_ext_scorer(args.alpha, args.beta, args.lang_model_path,
                                  vocab_list)
        ds2_model.logger.info("start inference ...")


        with open(args.input_file, 'r') as f:
            lines = f.readlines()
            lines[8] = "ASR_01|CMN\n"
            start_time = lines[10].split('|')[0]
            end_time = lines[10].split('|')[1]
            time_now = str(datetime.datetime.now())[:16]  # get the current time
            lines[10] = "|".join(["ASR_01", time_now,
                                  "Source_Program=Baidu DeepSpeech2,infer.sh",
                                  "Source_Person=Zhaoqing Xu,Shuwei Xu",
                                  "Codebook=Chinese Speech to Text\n"])
            end_line = ""
            if lines[-1].startswith("END"):
                end_line = lines[-1]
            lines = lines[:11]

        with open(args.output_file, "w") as f:
            f.writelines(lines)

        index = 0
        for infer_data in batch_reader():
            probs_split = ds2_model.infer_batch_probs(
                infer_data=infer_data,
                feeding_dict=data_generator.feeding)
            result_transcripts = ds2_model.decode_batch_beam_search(
                probs_split=probs_split,
                beam_alpha=args.alpha,
                beam_beta=args.beta,
                beam_size=args.beam_size,
                cutoff_prob=args.cutoff_prob,
                cutoff_top_n=args.cutoff_top_n,
                vocab_list=vocab_list,
                num_processes=args.num_proc_bsearch)
            for result in result_transcripts:
                with open(args.output_file, 'a+') as f:
                    print("\nOutput Transcription: %s" %
                          result.encode('utf-8'))
                    # advance the timestamp by the duration of this utterance
                    time_format = '%Y%m%d%H%M%S.%f'
                    start, m_sec = start_time.split('.')
                    end = (datetime.datetime.strptime(start_time, time_format) +
                           datetime.timedelta(0, timelist[index])).strftime(time_format)
                    index += 1
                    prefix = start + '|' + end[:-3] + '|ASR_01|'
                    f.write(prefix)
                    f.write(result.encode('utf-8'))
                    f.write('\n')
                    start_time = end
        with open(args.output_file, 'a+') as f:
            f.write(end_line)
        ds2_model.logger.info("finish inference")
Example #19
def train():
    # check the PaddlePaddle environment
    check_cuda(args.use_gpu)
    check_version()

    # whether to use the GPU
    if args.use_gpu:
        place = fluid.CUDAPlace(0)
    else:
        place = fluid.CPUPlace()

    # create the training data generator
    train_generator = DataGenerator(vocab_filepath=args.vocab_path,
                                    mean_std_filepath=args.mean_std_path,
                                    augmentation_config=io.open(
                                        args.augment_conf_path,
                                        mode='r',
                                        encoding='utf8').read(),
                                    max_duration=args.max_duration,
                                    min_duration=args.min_duration,
                                    specgram_type=args.specgram_type,
                                    place=place)

    # create the test data generator
    test_generator = DataGenerator(vocab_filepath=args.vocab_path,
                                   mean_std_filepath=args.mean_std_path,
                                   augmentation_config="{}",
                                   specgram_type=args.specgram_type,
                                   keep_transcription_text=True,
                                   place=place,
                                   is_training=False)
    # create the training batch reader
    train_batch_reader = train_generator.batch_reader_creator(
        manifest_path=args.train_manifest,
        batch_size=args.batch_size,
        sortagrad=args.use_sortagrad
        if args.init_from_pretrained_model is None else False,
        shuffle_method=args.shuffle_method)
    # create the test batch reader
    test_batch_reader = test_generator.batch_reader_creator(
        manifest_path=args.dev_manifest,
        batch_size=args.batch_size,
        sortagrad=False,
        shuffle_method=None)
    # build the DeepSpeech2 model
    ds2_model = DeepSpeech2Model(
        vocab_size=train_generator.vocab_size,
        num_conv_layers=args.num_conv_layers,
        num_rnn_layers=args.num_rnn_layers,
        rnn_layer_size=args.rnn_layer_size,
        use_gru=args.use_gru,
        share_rnn_weights=args.share_rnn_weights,
        place=place,
        init_from_pretrained_model=args.init_from_pretrained_model,
        output_model_dir=args.output_model_dir,
        vocab_list=test_generator.vocab_list)
    # count the training samples
    num_samples = get_data_len(args.train_manifest, args.max_duration,
                               args.min_duration)
    print("[%s] 训练数据数量:%d\n" % (datetime.now(), num_samples))
    # start training
    ds2_model.train(train_batch_reader=train_batch_reader,
                    dev_batch_reader=test_batch_reader,
                    learning_rate=args.learning_rate,
                    gradient_clipping=400,
                    batch_size=args.batch_size,
                    num_samples=num_samples,
                    num_epoch=args.num_epoch,
                    save_epoch=args.save_epoch,
                    num_iterations_print=args.num_iter_print,
                    test_off=args.test_off)
Example #20
def start_server():
    """Start the ASR server"""
    # prepare data generator
    if args.use_gpu:
        place = fluid.CUDAPlace(0)
    else:
        place = fluid.CPUPlace()

    data_generator = DataGenerator(vocab_filepath=args.vocab_path,
                                   mean_std_filepath=args.mean_std_path,
                                   augmentation_config='{}',
                                   specgram_type=args.specgram_type,
                                   keep_transcription_text=True,
                                   place=place,
                                   is_training=False)
    # prepare ASR model
    ds2_model = DeepSpeech2Model(vocab_size=data_generator.vocab_size,
                                 num_conv_layers=args.num_conv_layers,
                                 num_rnn_layers=args.num_rnn_layers,
                                 rnn_layer_size=args.rnn_layer_size,
                                 use_gru=args.use_gru,
                                 init_from_pretrained_model=args.model_path,
                                 place=place,
                                 share_rnn_weights=args.share_rnn_weights,
                                 is_infer=True)

    if args.decoding_method == "ctc_beam_search":
        ds2_model.init_ext_scorer(args.alpha, args.beta, args.lang_model_path, data_generator.vocab_list)

    # prepare ASR inference handler
    def file_to_transcript(filename):
        feature = data_generator.process_utterance(filename, "")
        probs_split = ds2_model.infer(feature=feature)

        if args.decoding_method == "ctc_greedy":
            result_transcript = ds2_model.decode_batch_greedy(probs_split=probs_split,
                                                              vocab_list=data_generator.vocab_list)
        else:
            result_transcript = ds2_model.decode_batch_beam_search(probs_split=probs_split,
                                                                   beam_alpha=args.alpha,
                                                                   beam_beta=args.beta,
                                                                   beam_size=args.beam_size,
                                                                   cutoff_prob=args.cutoff_prob,
                                                                   cutoff_top_n=args.cutoff_top_n,
                                                                   vocab_list=data_generator.vocab_list,
                                                                   num_processes=1)
        return result_transcript[0]

    # warming up with utterances sampled from Librispeech
    print('-----------------------------------------------------------')
    print('Warming up ...')
    warm_up_test(audio_process_handler=file_to_transcript,
                 manifest_path=args.warmup_manifest,
                 num_test_cases=3)
    print('-----------------------------------------------------------')

    # start the server
    server = AsrTCPServer(server_address=(args.host_ip, args.host_port),
                          RequestHandlerClass=AsrRequestHandler,
                          speech_save_dir=args.speech_save_dir,
                          audio_process_handler=file_to_transcript)
    print("ASR Server Started.")
    server.serve_forever()
Example #21
def tune():
    """Tune parameters alpha and beta incrementally."""
    if not args.num_alphas >= 0:
        raise ValueError("num_alphas must be non-negative!")
    if not args.num_betas >= 0:
        raise ValueError("num_betas must be non-negative!")

    if args.use_gpu:
        place = fluid.CUDAPlace(0)
    else:
        place = fluid.CPUPlace()

    data_generator = DataGenerator(
        vocab_filepath=args.vocab_path,
        mean_std_filepath=args.mean_std_path,
        augmentation_config='{}',
        specgram_type=args.specgram_type,
        keep_transcription_text=True,
        place=place,
        is_training=False)

    batch_reader = data_generator.batch_reader_creator(
        manifest_path=args.tune_manifest,
        batch_size=args.batch_size,
        sortagrad=False,
        shuffle_method=None)

    ds2_model = DeepSpeech2Model(
        vocab_size=data_generator.vocab_size,
        num_conv_layers=args.num_conv_layers,
        num_rnn_layers=args.num_rnn_layers,
        rnn_layer_size=args.rnn_layer_size,
        use_gru=args.use_gru,
        place=place,
        init_from_pretrained_model=args.model_path,
        share_rnn_weights=args.share_rnn_weights)

    # decoders only accept string encoded in utf-8
    vocab_list = [chars.encode("utf-8") for chars in data_generator.vocab_list]
    errors_func = char_errors if args.error_rate_type == 'cer' else word_errors
    # create grid for search
    cand_alphas = np.linspace(args.alpha_from, args.alpha_to, args.num_alphas)
    cand_betas = np.linspace(args.beta_from, args.beta_to, args.num_betas)
    params_grid = [(alpha, beta) for alpha in cand_alphas
                   for beta in cand_betas]

    err_sum = [0.0 for i in range(len(params_grid))]
    err_ave = [0.0 for i in range(len(params_grid))]
    num_ins, len_refs, cur_batch = 0, 0, 0
    # initialize external scorer
    ds2_model.init_ext_scorer(args.alpha_from, args.beta_from,
                              args.lang_model_path, vocab_list)
    # incrementally tune parameters over multiple batches
    ds2_model.logger.info("start tuning ...")
    for infer_data in batch_reader():
        if (args.num_batches >= 0) and (cur_batch >= args.num_batches):
            break
        probs_split = ds2_model.infer_batch_probs(
            infer_data=infer_data,
            feeding_dict=data_generator.feeding)
        target_transcripts = infer_data[1]

        num_ins += len(target_transcripts)
        # grid search
        for index, (alpha, beta) in enumerate(params_grid):
            result_transcripts = ds2_model.decode_batch_beam_search(
                probs_split=probs_split,
                beam_alpha=alpha,
                beam_beta=beta,
                beam_size=args.beam_size,
                cutoff_prob=args.cutoff_prob,
                cutoff_top_n=args.cutoff_top_n,
                vocab_list=vocab_list,
                num_processes=args.num_proc_bsearch)
            for target, result in zip(target_transcripts, result_transcripts):
                errors, len_ref = errors_func(target, result)
                err_sum[index] += errors
                # accumulate the length of references of every batch
                # in the first iteration
                if args.alpha_from == alpha and args.beta_from == beta:
                    len_refs += len_ref

            err_ave[index] = err_sum[index] / len_refs
            if index % 2 == 0:
                sys.stdout.write('.')
                sys.stdout.flush()

        # output on-line tuning result at the end of current batch
        err_ave_min = min(err_ave)
        min_index = err_ave.index(err_ave_min)
        print("\nBatch %d [%d/?], current opt (alpha, beta) = (%s, %s), "
              " min [%s] = %f" %(cur_batch, num_ins,
              "%.3f" % params_grid[min_index][0],
              "%.3f" % params_grid[min_index][1],
              args.error_rate_type, err_ave_min))
        cur_batch += 1

    # output WER/CER at every (alpha, beta)
    print("\nFinal %s:\n" % args.error_rate_type)
    for index in range(len(params_grid)):
        print("(alpha, beta) = (%s, %s), [%s] = %f"
             % ("%.3f" % params_grid[index][0], "%.3f" % params_grid[index][1],
             args.error_rate_type, err_ave[index]))

    err_ave_min = min(err_ave)
    min_index = err_ave.index(err_ave_min)
    print("\nFinish tuning on %d batches, final opt (alpha, beta) = (%s, %s)"
            % (cur_batch, "%.3f" % params_grid[min_index][0],
              "%.3f" % params_grid[min_index][1]))

    ds2_model.logger.info("finish tuning")
Example #22
def evaluate():
    """Evaluate on whole test data for DeepSpeech2."""

    # check if set use_gpu=True in paddlepaddle cpu version
    check_cuda(args.use_gpu)
    # check if paddlepaddle version is satisfied
    check_version()

    if args.use_gpu:
        place = fluid.CUDAPlace(0)
    else:
        place = fluid.CPUPlace()

    data_generator = DataGenerator(
        vocab_filepath=args.vocab_path,
        mean_std_filepath=args.mean_std_path,
        augmentation_config='{}',
        specgram_type=args.specgram_type,
        keep_transcription_text=True,
        place=place,
        is_training=False)
    batch_reader = data_generator.batch_reader_creator(
        manifest_path=args.test_manifest,
        batch_size=args.batch_size,
        sortagrad=False,
        shuffle_method=None)

    ds2_model = DeepSpeech2Model(
        vocab_size=data_generator.vocab_size,
        num_conv_layers=args.num_conv_layers,
        num_rnn_layers=args.num_rnn_layers,
        rnn_layer_size=args.rnn_layer_size,
        use_gru=args.use_gru,
        share_rnn_weights=args.share_rnn_weights,
        place=place,
        init_from_pretrained_model=args.model_path)

    # vocabulary list for the decoders
    vocab_list = list(data_generator.vocab_list)

    if args.decoding_method == "ctc_beam_search":
        ds2_model.init_ext_scorer(args.alpha, args.beta, args.lang_model_path,
                                  vocab_list)
    errors_func = char_errors if args.error_rate_type == 'cer' else word_errors
    errors_sum, len_refs, num_ins = 0.0, 0, 0
    ds2_model.logger.info("start evaluation ...")
    for infer_data in batch_reader():
        probs_split = ds2_model.infer_batch_probs(
            infer_data=infer_data,
            feeding_dict=data_generator.feeding)

        if args.decoding_method == "ctc_greedy":
            result_transcripts = ds2_model.decode_batch_greedy(
                probs_split=probs_split,
                vocab_list=vocab_list)
        else:
            result_transcripts = ds2_model.decode_batch_beam_search(
                probs_split=probs_split,
                beam_alpha=args.alpha,
                beam_beta=args.beta,
                beam_size=args.beam_size,
                cutoff_prob=args.cutoff_prob,
                cutoff_top_n=args.cutoff_top_n,
                vocab_list=vocab_list,
                num_processes=args.num_proc_bsearch)
        target_transcripts = infer_data[1]

        for target, result in zip(target_transcripts, result_transcripts):
            errors, len_ref = errors_func(target, result)
            errors_sum += errors
            len_refs += len_ref
            num_ins += 1
        print("Error rate [%s] (%d/?) = %f" %
              (args.error_rate_type, num_ins, errors_sum / len_refs))
    print("Final error rate [%s] (%d/%d) = %f" %
          (args.error_rate_type, num_ins, num_ins, errors_sum / len_refs))

    ds2_model.logger.info("finish evaluation")
def evaluate():
    # check the PaddlePaddle environment
    check_cuda(args.use_gpu)
    check_version()

    # whether to use the GPU
    place = fluid.CUDAPlace(0) if args.use_gpu else fluid.CPUPlace()

    # create the data generator
    data_generator = DataGenerator(vocab_filepath=args.vocab_path,
                                   mean_std_filepath=args.mean_std_path,
                                   augmentation_config='{}',
                                   specgram_type=args.specgram_type,
                                   keep_transcription_text=True,
                                   place=place,
                                   is_training=False)
    # create the batch reader over the evaluation data
    batch_reader = data_generator.batch_reader_creator(
        manifest_path=args.test_manifest,
        batch_size=args.batch_size,
        shuffle_method=None)
    # build the DeepSpeech2 model and configure it for inference
    ds2_model = DeepSpeech2Model(vocab_size=data_generator.vocab_size,
                                 num_conv_layers=args.num_conv_layers,
                                 num_rnn_layers=args.num_rnn_layers,
                                 rnn_layer_size=args.rnn_layer_size,
                                 use_gru=args.use_gru,
                                 share_rnn_weights=args.share_rnn_weights,
                                 place=place,
                                 init_from_pretrained_model=args.model_path,
                                 is_infer=True)

    # count the number of utterances in the test manifest
    with open(args.test_manifest, 'r', encoding='utf-8') as f_m:
        test_len = len(f_m.readlines())

    # initialize the external scorer for beam-search decoding
    if args.decoding_method == "ctc_beam_search":
        ds2_model.init_ext_scorer(args.alpha, args.beta, args.lang_model_path,
                                  data_generator.vocab_list)

    # choose the metric: character error rate (cer) or word error rate (wer)
    errors_func = char_errors if args.error_rate_type == 'cer' else word_errors
    errors_sum, len_refs, num_ins = 0.0, 0, 0
    ds2_model.logger.info("开始评估 ...")
    start = time.time()
    # run the evaluation loop
    for infer_data in batch_reader():
        # get the output probabilities for one batch
        probs_split = ds2_model.infer_batch_probs(infer_data=infer_data)

        # decode the probabilities into transcripts
        if args.decoding_method == "ctc_greedy":
            # best-path (greedy) decoding
            result_transcripts = ds2_model.decode_batch_greedy(
                probs_split=probs_split, vocab_list=data_generator.vocab_list)
        else:
            # beam-search decoding
            result_transcripts = ds2_model.decode_batch_beam_search(
                probs_split=probs_split,
                beam_alpha=args.alpha,
                beam_beta=args.beta,
                beam_size=args.beam_size,
                cutoff_prob=args.cutoff_prob,
                cutoff_top_n=args.cutoff_top_n,
                vocab_list=data_generator.vocab_list,
                num_processes=args.num_proc_bsearch)
        target_transcripts = infer_data[1]

        # accumulate the error statistics
        for target, result in zip(target_transcripts, result_transcripts):
            errors, len_ref = errors_func(target, result)
            errors_sum += errors
            len_refs += len_ref
            num_ins += 1
        print("错误率:[%s] (%d/%d) = %f" %
              (args.error_rate_type, num_ins, test_len, errors_sum / len_refs))
    end = time.time()
    print("消耗时间:%ds, 总错误率:[%s] (%d/%d) = %f" %
          ((end - start), args.error_rate_type, num_ins, num_ins,
           errors_sum / len_refs))

    ds2_model.logger.info("完成评估!")
def infer():
    """Inference for DeepSpeech2."""

    """Start the ASR server"""
    # prepare data generator
    if args.use_gpu:
        place = fluid.CUDAPlace(0)
    else:
        place = fluid.CPUPlace()

    data_generator = DataGenerator(
        vocab_filepath=args.vocab_path,
        mean_std_filepath=args.mean_std_path,
        augmentation_config='{}',
        specgram_type=args.specgram_type,
        keep_transcription_text=True,
        place=place,
        is_training=False)
    # prepare ASR model
    ds2_model = DeepSpeech2Model(
        vocab_size=data_generator.vocab_size,
        num_conv_layers=args.num_conv_layers,
        num_rnn_layers=args.num_rnn_layers,
        rnn_layer_size=args.rnn_layer_size,
        use_gru=args.use_gru,
        init_from_pretrained_model=args.model_path,
        place=place,
        share_rnn_weights=args.share_rnn_weights)

    # decoders only accept string encoded in utf-8
    vocab_list = [chars.encode("utf-8") for chars in data_generator.vocab_list]

    if args.decoding_method == "ctc_beam_search":
        ds2_model.init_ext_scorer(args.alpha, args.beta, args.lang_model_path,
                                  vocab_list)

    # prepare ASR inference handler
    def file_to_transcript(filename):
        feature = data_generator.process_utterance(filename, "")
        audio_len = feature[0].shape[1]
        # the // 2 (frequency) and // 3 (time) reductions mirror the
        # downsampling applied by the model's convolution stack
        mask_shape0 = (feature[0].shape[0] - 1) // 2 + 1
        mask_shape1 = (feature[0].shape[1] - 1) // 3 + 1
        mask_max_len = (audio_len - 1) // 3 + 1
        # ones over valid frames, zeros over the padding
        mask_ones = np.ones((mask_shape0, mask_shape1))
        mask_zeros = np.zeros((mask_shape0, mask_max_len - mask_shape1))
        # repeat across the 32 convolution output channels
        mask = np.repeat(
            np.reshape(
                np.concatenate((mask_ones, mask_zeros), axis=1),
                (1, mask_shape0, mask_max_len)),
            32,
            axis=0)
        feature = (np.array([feature[0]]).astype('float32'),
                   None,
                   np.array([audio_len]).astype('int64').reshape([-1, 1]),
                   np.array([mask]).astype('float32'))
        probs_split = ds2_model.infer_batch_probs(
            infer_data=feature,
            feeding_dict=data_generator.feeding)

        if args.decoding_method == "ctc_greedy":
            result_transcript = ds2_model.decode_batch_greedy(
                probs_split=probs_split,
                vocab_list=vocab_list)
        else:
            result_transcript = ds2_model.decode_batch_beam_search(
                probs_split=probs_split,
                beam_alpha=args.alpha,
                beam_beta=args.beta,
                beam_size=args.beam_size,
                cutoff_prob=args.cutoff_prob,
                cutoff_top_n=args.cutoff_top_n,
                vocab_list=vocab_list,
                num_processes=1)
        return result_transcript[0]


    with open(args.infer_manifest) as fdata:
        data = fdata.readlines()

    result_transcripts = []
    for i, audio_data in enumerate(data, start=1):
        filename = json.loads(audio_data)["audio_filepath"]
        transcription = file_to_transcript(filename)
        print("DeepSpeech2 Translation - %d: %s" % (i, str(transcription)))
        result_transcripts.append(transcription)

    output_path = ("output/paddledeepspeech_translation_" +
                   str(datetime.now()) + ".txt")
    with open(output_path, "w+") as out_file:
        for i, result in enumerate(result_transcripts, start=1):
            out_file.write("%s, %d, %s\n" % ("tts_google", i, result))
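
The manifest consumed above is a JSON-lines file: one JSON object per line,
parsed with json.loads line by line rather than as a single document. A line
typically looks like the following (the values are made up for illustration):

{"audio_filepath": "data/wavs/utt_0001.wav", "duration": 3.72, "text": "hello world"}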
Beispiel #25
0
def start_server(args):
    """Start the ASR server."""
    print(args)

    if args.decoder in ["ctc_greedy", "ctc_beam_search"]:
        global decoding_method
        decoding_method = args.decoder
    # prepare data generator
    if use_gpu:
        place = fluid.CUDAPlace(0)
    else:
        place = fluid.CPUPlace()

    data_generator = DataGenerator(vocab_filepath=vocab_path,
                                   mean_std_filepath=mean_std_path,
                                   augmentation_config='{}',
                                   specgram_type=specgram_type,
                                   keep_transcription_text=True,
                                   place=place,
                                   is_training=False)
    # prepare ASR model
    ds2_model = DeepSpeech2Model(vocab_size=data_generator.vocab_size,
                                 num_conv_layers=num_conv_layers,
                                 num_rnn_layers=num_rnn_layers,
                                 rnn_layer_size=rnn_layer_size,
                                 use_gru=use_gru,
                                 init_from_pretrained_model=model_path,
                                 place=place,
                                 share_rnn_weights=share_rnn_weights)

    vocab_list = [chars.encode("utf-8") for chars in data_generator.vocab_list]

    if decoding_method == "ctc_beam_search":
        ds2_model.init_ext_scorer(alpha, beta, lang_model_path, vocab_list)
    # prepare ASR inference handler
    def file_to_transcript(filename):
        feature = data_generator.process_utterance(filename, "")
        audio_len = feature[0].shape[1]
        mask_shape0 = (feature[0].shape[0] - 1) // 2 + 1
        mask_shape1 = (feature[0].shape[1] - 1) // 3 + 1
        mask_max_len = (audio_len - 1) // 3 + 1
        mask_ones = np.ones((mask_shape0, mask_shape1))
        mask_zeros = np.zeros((mask_shape0, mask_max_len - mask_shape1))
        mask = np.repeat(np.reshape(
            np.concatenate((mask_ones, mask_zeros), axis=1),
            (1, mask_shape0, mask_max_len)),
                         32,
                         axis=0)
        feature = (np.array([feature[0]]).astype('float32'), None,
                   np.array([audio_len]).astype('int64').reshape([-1, 1]),
                   np.array([mask]).astype('float32'))
        probs_split = ds2_model.infer_batch_probs(
            infer_data=feature, feeding_dict=data_generator.feeding)

        tik = time.time()
        if decoding_method == "ctc_greedy":
            result_transcript = ds2_model.decode_batch_greedy(
                probs_split=probs_split, vocab_list=vocab_list)
        else:
            result_transcript = ds2_model.decode_batch_beam_search(
                probs_split=probs_split,
                beam_alpha=alpha,
                beam_beta=beta,
                beam_size=beam_size,
                cutoff_prob=cutoff_prob,
                cutoff_top_n=cutoff_top_n,
                vocab_list=vocab_list,
                num_processes=1)

        print(time.time() - tik)
        return result_transcript[0]

    # warm up the recognizer on the user's recorded audio
    print('-----------------------------------------------------------')
    print('Warming up ...')
    audio_file_name = "/home/Nishchith/audio_recording_" + args.user + ".wav"
    # audio_file_name = "/home/Nishchith/2722020-163399.wav"
    transcript = file_to_transcript(audio_file_name)
    # transcript = file_to_transcript("/home/Nishchith/audio_samples/test-pravar_2.wav")

    _file = open("/home/Nishchith/transcript_" + args.user + ".txt", "w")
    transcript = "\n".join(transcript.split(" "))
    _file.write(transcript + "\n")
    _file.close()

    try:
        msg = subprocess.check_output([
            "python",
            "-m",
            "aeneas.tools.execute_task",
            audio_file_name,
            "/home/Nishchith/transcript_" + args.user + ".txt",
            #"task_language=eng|os_task_file_format=json|is_text_type=mplain",
            "task_language=eng|os_task_file_format=json|is_text_type=plain|task_adjust_boundary_nonspeech_min=0.0100|task_adjust_boundary_nonspeech_string=(sil)|task_adjust_boundary_algorithm=auto",
            "/home/Nishchith/data_" + args.user + ".json",
            "--presets-word"
        ])
    except subprocess.CalledProcessError as e:
        msg = e.output.decode("utf-8")
        print(msg)

    with open("/home/Nishchith/data_" + args.user + ".json") as f:
        data = json.load(f)
    """
    [
            {
                "word":"in",
                "start_time ":0.0,
                "duration":1.06
            },
            {
                "word":"clustering",
                "start_time ":1.06,
                "duration":0.52
            }]
    """

    words_list = []
    for word in data.get("fragments"):
        word_item = dict()

        if word["lines"][0] == "(sil)":
            continue

        word_item["word"] = word["lines"][0]
        word_item["start_time"] = float(word["begin"])
        word_item["duration"] = float(word["end"]) - float(word["begin"])
        words_list.append(word_item)

    with open("/home/Nishchith/format_data_" + args.user + ".json", 'w') as f:
        json.dump(words_list, f)
    print('-----------------------------------------------------------')
Beispiel #26
0
def evaluate():
    """Evaluate on whole test data for DeepSpeech2."""
    data_generator = DataGenerator(
        vocab_filepath=args.vocab_path,
        mean_std_filepath=args.mean_std_path,
        augmentation_config='{}',
        specgram_type=args.specgram_type,
        num_threads=args.num_proc_data,
        keep_transcription_text=True)

    ds2_model = DeepSpeech2Model(
        vocab_size=data_generator.vocab_size,
        num_conv_layers=args.num_conv_layers,
        num_rnn_layers=args.num_rnn_layers,
        rnn_layer_size=args.rnn_layer_size,
        use_gru=args.use_gru,
        pretrained_model_path=args.model_path,
        share_rnn_weights=args.share_rnn_weights)

    # decoders only accept string encoded in utf-8
    vocab_list = [chars.encode("utf-8") for chars in data_generator.vocab_list]

    if args.decoding_method == "ctc_beam_search":
        ds2_model.init_ext_scorer(args.alpha, args.beta, args.lang_model_path,
                                  vocab_list)
    errors_func = char_errors if args.error_rate_type == 'cer' else word_errors

    # prepare ASR inference handler
    def file_to_transcript(filename):
        feature = data_generator.process_utterance(filename, "")
        probs_split = ds2_model.infer_batch_probs(
            infer_data=[feature],
            feeding_dict=data_generator.feeding)

        if args.decoding_method == "ctc_greedy":
            result_transcript = ds2_model.decode_batch_greedy(
                probs_split=probs_split,
                vocab_list=vocab_list)
        else:
            result_transcript = ds2_model.decode_batch_beam_search(
                probs_split=probs_split,
                beam_alpha=args.alpha,
                beam_beta=args.beta,
                beam_size=args.beam_size,
                cutoff_prob=args.cutoff_prob,
                cutoff_top_n=args.cutoff_top_n,
                vocab_list=vocab_list,
                num_processes=1)
        return result_transcript[0]

    parentdir = args.src_path
    manifest = read_manifest(manifest_path=args.manifest_path)
    transcripts = []
    for entry in manifest:
        fname = entry["audio_filepath"]
        transcript = file_to_transcript(fname)
        transcripts.append((fname, fname.split("/")[-1], transcript))

    df = pd.DataFrame(data=transcripts, columns=["wav_path", "wav_name", "transcripts"])
    df.sort_values("wav_name", inplace=True)
    try:
        with open(os.path.join(parentdir, 'transcripts_list_'+\
                               datetime.datetime.now().strftime("%H:%M:%S")+".b"), 'wb') as f:
            pickle.dump(transcripts, f)
    except Exception:
        # best-effort output; ignore write failures
        pass
    try:
        with open(os.path.join(parentdir, 'ds2_stt_complete.csv'), 'w') as f:
            df.to_csv(f, index=False)
    except Exception:
        pass
    try:
        with open(os.path.join(parentdir, 'ds2_stt.txt'), 'w') as f:
            for trans in df["transcripts"]:
                f.write(pre_process_srt(trans) + " ")
    except Exception:
        pass
    ds2_model.logger.info("finish evaluation")
def start_server():
    """Start the ASR server"""
    # prepare data generator
    if args.use_gpu:
        place = fluid.CUDAPlace(0)
    else:
        place = fluid.CPUPlace()

    data_generator = DataGenerator(
        vocab_filepath=args.vocab_path,
        mean_std_filepath=args.mean_std_path,
        augmentation_config='{}',
        specgram_type=args.specgram_type,
        keep_transcription_text=True,
        place=place,
        is_training=False)
    # prepare ASR model
    ds2_model = DeepSpeech2Model(
        vocab_size=data_generator.vocab_size,
        num_conv_layers=args.num_conv_layers,
        num_rnn_layers=args.num_rnn_layers,
        rnn_layer_size=args.rnn_layer_size,
        use_gru=args.use_gru,
        init_from_pretrained_model=args.model_path,
        place=place,
        share_rnn_weights=args.share_rnn_weights)

    vocab_list = list(data_generator.vocab_list)

    if args.decoding_method == "ctc_beam_search":
        ds2_model.init_ext_scorer(args.alpha, args.beta, args.lang_model_path,
                                  vocab_list)
    # prepare ASR inference handler
    def file_to_transcript(filename):
        feature = data_generator.process_utterance(filename, "")
        audio_len = feature[0].shape[1]
        mask_shape0 = (feature[0].shape[0] - 1) // 2 + 1
        mask_shape1 = (feature[0].shape[1] - 1) // 3 + 1
        mask_max_len = (audio_len - 1) // 3 + 1
        mask_ones = np.ones((mask_shape0, mask_shape1))
        mask_zeros = np.zeros((mask_shape0, mask_max_len - mask_shape1))
        mask = np.repeat(
            np.reshape(
                np.concatenate((mask_ones, mask_zeros), axis=1),
                (1, mask_shape0, mask_max_len)),
            32,
            axis=0)
        feature = (np.array([feature[0]]).astype('float32'),
                   None,
                   np.array([audio_len]).astype('int64').reshape([-1,1]),
                   np.array([mask]).astype('float32'))
        probs_split = ds2_model.infer_batch_probs(
            infer_data=feature,
            feeding_dict=data_generator.feeding)

        if args.decoding_method == "ctc_greedy":
            result_transcript = ds2_model.decode_batch_greedy(
                probs_split=probs_split,
                vocab_list=vocab_list)
        else:
            result_transcript = ds2_model.decode_batch_beam_search(
                probs_split=probs_split,
                beam_alpha=args.alpha,
                beam_beta=args.beta,
                beam_size=args.beam_size,
                cutoff_prob=args.cutoff_prob,
                cutoff_top_n=args.cutoff_top_n,
                vocab_list=vocab_list,
                num_processes=1)
        return result_transcript[0]

    # warming up with utterances sampled from Librispeech
    print('-----------------------------------------------------------')
    print('Warming up ...')
    warm_up_test(
        audio_process_handler=file_to_transcript,
        manifest_path=args.warmup_manifest,
        num_test_cases=3)
    print('-----------------------------------------------------------')

    # start the server
    server = AsrTCPServer(
        server_address=(args.host_ip, args.host_port),
        RequestHandlerClass=AsrRequestHandler,
        speech_save_dir=args.speech_save_dir,
        audio_process_handler=file_to_transcript)
    print("ASR Server Started.")
    server.serve_forever()
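
warm_up_test, used before starting the server, simply exercises the inference
handler on a few utterances from the warm-up manifest so the first real
request does not pay the full cold-start cost. A plausible sketch under that
assumption (the actual helper may sample and log timings differently):

import json
import random
import time

def warm_up_test(audio_process_handler, manifest_path, num_test_cases,
                 random_seed=0):
    """Run the recognizer on a few utterances to warm it up (sketch)."""
    with open(manifest_path, 'r', encoding='utf-8') as f:
        manifest = [json.loads(line) for line in f if line.strip()]
    random.seed(random_seed)
    for idx, sample in enumerate(random.sample(manifest, num_test_cases), 1):
        start = time.time()
        transcript = audio_process_handler(sample["audio_filepath"])
        print("Warm-up %d (%.2fs): %s" % (idx, time.time() - start, transcript))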
Beispiel #28
0
beta = 0.3
cutoff_prob = 1.0
cutoff_top_n = 40
decoding_method = 'ctc_beam_search'
error_rate_type = 'wer'
num_conv_layers = 2
num_rnn_layers = 3
rnn_layer_size = 2048
share_rnn_weights = True
specgram_type = 'linear'

paddle.init(use_gpu=USING_GPU, rnn_use_batch=True, trainer_count=trainer_count)

data_generator = DataGenerator(vocab_filepath=vocab_path,
                               mean_std_filepath=mean_std_path,
                               augmentation_config='{}',
                               specgram_type=specgram_type,
                               num_threads=num_proc_data,
                               keep_transcription_text=True)
batch_reader = data_generator.batch_reader_creator(manifest_path=test_manifest,
                                                   batch_size=batch_size,
                                                   min_batch_size=1,
                                                   sortagrad=False,
                                                   shuffle_method=None)

print('Loading inference model from files {}'.format(model_path))
log_file.write('Loading inference model from files {}\n'.format(model_path))
inf_model_load_start = timer()
ds2_model = DeepSpeech2Model(vocab_size=data_generator.vocab_size,
                             num_conv_layers=num_conv_layers,
                             num_rnn_layers=num_rnn_layers,
                             rnn_layer_size=rnn_layer_size,