def _start_server(self):
    """Build the data generator and DS2 model for the ASR server, then warm
    the model up before requests are accepted.

    Reads configuration from the module-level ``args`` and the *_UPPER_CASE*
    constants; stores the generator, model and vocabulary on ``self``.
    """
    # Feature pipeline; '{}' disables augmentation at serving time and
    # keep_transcription_text=True keeps raw text targets.
    self.data_generator = DataGenerator(
        vocab_filepath=args.vocab_path,
        mean_std_filepath=args.mean_std_path,
        augmentation_config='{}',
        specgram_type=SPECGRAM_TYPE,
        place=self.place,
        keep_transcription_text=True)
    self.ds2_model = DeepSpeech2Model(
        vocab_size=self.data_generator.vocab_size,
        num_conv_layers=NUM_CONV_LAYERS,
        num_rnn_layers=NUM_RNN_LAYERS,
        rnn_layer_size=RNN_LAYER_SIZE,
        use_gru=USE_GRU,
        init_from_pretrained_model=args.model_path,
        place=self.place,
        share_rnn_weights=SHARE_RNN_WEIGHTS)
    # Shallow copy of the vocabulary list.
    self.vocab_list = [chars for chars in self.data_generator.vocab_list]
    # The external LM scorer is only needed for beam-search decoding.
    if args.decoding_method == "ctc_beam_search":
        self.ds2_model.init_ext_scorer(args.alpha, args.beta,
                                       args.lang_model_path, self.vocab_list)
    print('-----------------------------------------------------------')
    print('Warming up ...')
    # Run a few decodes so the first real request is not slow.
    self._warm_up_test(num_test_cases=3)
    print('-----------------------------------------------------------')
def infer():
    """Inference for DeepSpeech2.

    Decodes every batch of ``args.infer_manifest`` with greedy or beam-search
    CTC decoding and appends each transcription to ``args.output_file``.
    """
    data_generator = DataGenerator(
        vocab_filepath=args.vocab_path,
        mean_std_filepath=args.mean_std_path,
        augmentation_config='{}',  # no augmentation at inference time
        specgram_type=args.specgram_type,
        num_threads=1,
        keep_transcription_text=True)
    batch_reader = data_generator.batch_reader_creator(
        manifest_path=args.infer_manifest,
        batch_size=args.batch_size,
        min_batch_size=1,
        sortagrad=False,
        shuffle_method=None)
    ds2_model = DeepSpeech2Model(
        vocab_size=data_generator.vocab_size,
        num_conv_layers=args.num_conv_layers,
        num_rnn_layers=args.num_rnn_layers,
        rnn_layer_size=args.rnn_layer_size,
        use_gru=args.use_gru,
        pretrained_model_path=args.model_path,
        share_rnn_weights=args.share_rnn_weights)
    # decoders only accept string encoded in utf-8
    vocab_list = [chars.encode("utf-8") for chars in data_generator.vocab_list]
    # Debug peek at the vocabulary.  Entries are already utf-8 encoded;
    # the original re-encoded them, which fails for non-ASCII vocabularies.
    for entry in vocab_list[:10]:
        print(entry)
    # The external LM scorer is only needed for non-greedy decoding.
    if args.decoding_method != "ctc_greedy":
        ds2_model.init_ext_scorer(args.alpha, args.beta, args.lang_model_path,
                                  vocab_list)
    ds2_model.logger.info("start inference ...")
    # BUG FIX: the greedy branch previously referenced `infer_data` without
    # ever iterating the batch reader (NameError at runtime) and never wrote
    # its results.  Both decoding methods now share one loop over the reader.
    for infer_data in batch_reader():
        probs_split = ds2_model.infer_batch_probs(
            infer_data=infer_data,
            feeding_dict=data_generator.feeding)
        if args.decoding_method == "ctc_greedy":
            result_transcripts = ds2_model.decode_batch_greedy(
                probs_split=probs_split,
                vocab_list=vocab_list)
        else:
            result_transcripts = ds2_model.decode_batch_beam_search(
                probs_split=probs_split,
                beam_alpha=args.alpha,
                beam_beta=args.beta,
                beam_size=args.beam_size,
                cutoff_prob=args.cutoff_prob,
                cutoff_top_n=args.cutoff_top_n,
                vocab_list=vocab_list,
                num_processes=args.num_proc_bsearch)
        # Append this batch's transcriptions to the output file.
        with open(args.output_file, 'a+') as f:
            for result in result_transcripts:
                print("\nOutput Transcription: %s" % result.encode('utf-8'))
                f.write(result.encode('utf-8'))
                f.write('\n')
    ds2_model.logger.info("finish inference")
def SpeechRecognizer():
    """Evaluate on whole test data for DeepSpeech2.

    Decodes every batch of the hard-coded ``data/cctv/manifest`` with
    beam search (aishell model + Chinese LM) and returns the list of
    transcriptions.
    """
    paddle.init(use_gpu=True, rnn_use_batch=True, trainer_count=1)
    data_generator = DataGenerator(
        vocab_filepath='models/aishell/vocab.txt',
        mean_std_filepath='models/aishell/mean_std.npz',
        augmentation_config='{}',
        specgram_type='linear',
        num_threads=8,
        keep_transcription_text=True)
    batch_reader = data_generator.batch_reader_creator(
        manifest_path='data/cctv/manifest',
        batch_size=128,
        min_batch_size=1,
        sortagrad=False,
        shuffle_method=None)
    ds2_model = DeepSpeech2Model(
        vocab_size=data_generator.vocab_size,
        num_conv_layers=2,
        num_rnn_layers=3,
        rnn_layer_size=1024,
        use_gru=True,
        pretrained_model_path='models/aishell/params.tar.gz',
        share_rnn_weights=False)
    # decoders only accept string encoded in utf-8
    vocab_list = [chars.encode("utf-8") for chars in data_generator.vocab_list]
    #if args.decoding_method == "ctc_beam_search":
    # Beam search is always used here, so the LM scorer is set up
    # unconditionally (alpha=2.6, beta=5.0 are hard-coded tuned values).
    ds2_model.init_ext_scorer(2.6, 5.0,
                              'models/lm/zh_giga.no_cna_cmn.prune01244.klm',
                              vocab_list)
    ds2_model.logger.info("start evaluation ...")
    transcript = []
    # Progress bar over the batch reader.
    bar = progressbar.ProgressBar(widgets=[
        progressbar.Percentage(),
        progressbar.Bar(),
        ' (', progressbar.SimpleProgress(), ') ',
        ' (', progressbar.ETA(), ') ',
    ])
    for infer_data in bar(batch_reader()):
        probs_split = ds2_model.infer_batch_probs(
            infer_data=infer_data,
            feeding_dict=data_generator.feeding)
        result_transcripts = ds2_model.decode_batch_beam_search(
            probs_split=probs_split,
            beam_alpha=2.6,
            beam_beta=5.0,
            beam_size=300,
            cutoff_prob=0.99,
            cutoff_top_n=40,
            vocab_list=vocab_list,
            num_processes=8)
        transcript += result_transcripts
        # presumably gives the progress bar a chance to redraw — TODO confirm
        time.sleep(0.01)
    return transcript
def tune():
    """Tune parameters alpha and beta on one minibatch.

    Runs beam-search decoding over a grid of (alpha, beta) pairs on a single
    minibatch of ``args.tune_manifest`` and prints the WER for each pair.
    """
    if not args.num_alphas >= 0:
        raise ValueError("num_alphas must be non-negative!")
    if not args.num_betas >= 0:
        raise ValueError("num_betas must be non-negative!")
    data_generator = DataGenerator(
        vocab_filepath=args.vocab_path,
        mean_std_filepath=args.mean_std_path,
        augmentation_config='{}',
        specgram_type=args.specgram_type,
        num_threads=1)
    batch_reader = data_generator.batch_reader_creator(
        manifest_path=args.tune_manifest,
        batch_size=args.num_samples,
        sortagrad=False,
        shuffle_method=None)
    # FIX: next() works on both Python 2 and 3; `.next()` is Python-2 only.
    tune_data = next(batch_reader())
    # Map token ids back to characters for the reference transcriptions.
    target_transcripts = [
        ''.join([data_generator.vocab_list[token] for token in transcript])
        for _, transcript in tune_data
    ]
    ds2_model = DeepSpeech2Model(
        vocab_size=data_generator.vocab_size,
        num_conv_layers=args.num_conv_layers,
        num_rnn_layers=args.num_rnn_layers,
        rnn_layer_size=args.rnn_layer_size,
        use_gru=args.use_gru,
        pretrained_model_path=args.model_path,
        share_rnn_weights=args.share_rnn_weights)
    # create grid for search
    cand_alphas = np.linspace(args.alpha_from, args.alpha_to, args.num_alphas)
    cand_betas = np.linspace(args.beta_from, args.beta_to, args.num_betas)
    params_grid = [(alpha, beta) for alpha in cand_alphas
                   for beta in cand_betas]
    # Evaluate each (alpha, beta) pair on the same minibatch.
    for alpha, beta in params_grid:
        result_transcripts = ds2_model.infer_batch(
            infer_data=tune_data,
            decoding_method='ctc_beam_search',
            beam_alpha=alpha,
            beam_beta=beta,
            beam_size=args.beam_size,
            cutoff_prob=args.cutoff_prob,
            vocab_list=data_generator.vocab_list,
            language_model_path=args.lang_model_path,
            num_processes=args.num_proc_bsearch)
        wer_sum, num_ins = 0.0, 0
        for target, result in zip(target_transcripts, result_transcripts):
            wer_sum += wer(target, result)
            num_ins += 1
        print("alpha = %f\tbeta = %f\tWER = %f" %
              (alpha, beta, wer_sum / num_ins))
def decode_all(manifests):
    """Decode every manifest in *manifests*, yielding one result per file.

    *manifests* is an iterable of (audioname, manifest_path, duration,
    offset) tuples.  Yields (audioname, manifest_path, transcript, duration,
    offset); the transcript is ``None`` for clips shorter than one second.
    """
    data_generator = DataGenerator(
        vocab_filepath=args.vocab_path,
        mean_std_filepath=args.mean_std_path,
        augmentation_config='{}',
        specgram_type=args.specgram_type,
        num_threads=1,
        keep_transcription_text=True)
    ds2_model = DeepSpeech2Model(
        vocab_size=data_generator.vocab_size,
        num_conv_layers=args.num_conv_layers,
        num_rnn_layers=args.num_rnn_layers,
        rnn_layer_size=args.rnn_layer_size,
        use_gru=args.use_gru,
        pretrained_model_path=args.model_path,
        share_rnn_weights=args.share_rnn_weights)
    # decoders only accept string encoded in utf-8
    alphabet = Alphabet(args.vocab_path)
    ds2_model.logger.info("start decoding with extended output...")
    ds2_model.init_ext_scorer(args.alpha, args.beta, args.lang_model_path,
                              args.trie_path, alphabet)
    for audioname, manifest_path, duration, offset in manifests:
        # Skip clips shorter than 1 s; a non-numeric/None duration falls
        # through to decoding.
        try:
            duration_f = float(duration)
            if duration_f < 1.:
                yield (audioname, manifest_path, None, duration, offset)
                continue
        except (TypeError, ValueError):
            pass
        batch_reader = data_generator.batch_reader_creator(
            manifest_path=manifest_path,
            batch_size=args.num_samples,
            min_batch_size=1,
            sortagrad=False,
            shuffle_method=None)
        for decode_data in batch_reader():
            probs_split = ds2_model.infer_batch_probs(
                infer_data=decode_data,
                feeding_dict=data_generator.feeding)
            # note: we only perform single file decoding
            result_transcript = ds2_model.decode_beam_search(
                probs_split=probs_split,
                beam_size=args.beam_size,
                cutoff_prob=args.cutoff_prob,
                cutoff_top_n=args.cutoff_top_n,
                alphabet=alphabet)
            yield (audioname, manifest_path, result_transcript, duration,
                   offset)
def train():
    """DeepSpeech2 training.

    Builds train/dev data generators and readers from ``args`` and launches
    ``DeepSpeech2Model.train`` on GPU or CPU.
    """
    # check if set use_gpu=True in paddlepaddle cpu version
    check_cuda(args.use_gpu)
    # check if paddlepaddle version is satisfied
    check_version()
    if args.use_gpu:
        place = fluid.CUDAPlace(0)
    else:
        place = fluid.CPUPlace()
    # FIX: read the augmentation config with a context manager so the file
    # handle is closed deterministically (the original leaked it).
    with io.open(args.augment_conf_path, mode='r', encoding='utf8') as conf_f:
        augmentation_config = conf_f.read()
    train_generator = DataGenerator(vocab_filepath=args.vocab_path,
                                    mean_std_filepath=args.mean_std_path,
                                    augmentation_config=augmentation_config,
                                    max_duration=args.max_duration,
                                    min_duration=args.min_duration,
                                    specgram_type=args.specgram_type,
                                    place=place)
    dev_generator = DataGenerator(vocab_filepath=args.vocab_path,
                                  mean_std_filepath=args.mean_std_path,
                                  augmentation_config="{}",
                                  specgram_type=args.specgram_type,
                                  place=place)
    # Sortagrad only makes sense when training from scratch.
    train_batch_reader = train_generator.batch_reader_creator(
        manifest_path=args.train_manifest,
        batch_size=args.batch_size,
        sortagrad=args.use_sortagrad
        if args.init_from_pretrained_model is None else False,
        shuffle_method=args.shuffle_method)
    dev_batch_reader = dev_generator.batch_reader_creator(
        manifest_path=args.dev_manifest,
        batch_size=args.batch_size,
        sortagrad=False,
        shuffle_method=None)
    ds2_model = DeepSpeech2Model(
        vocab_size=train_generator.vocab_size,
        num_conv_layers=args.num_conv_layers,
        num_rnn_layers=args.num_rnn_layers,
        rnn_layer_size=args.rnn_layer_size,
        use_gru=args.use_gru,
        share_rnn_weights=args.share_rnn_weights,
        place=place,
        init_from_pretrained_model=args.init_from_pretrained_model,
        output_model_dir=args.output_model_dir)
    ds2_model.train(train_batch_reader=train_batch_reader,
                    dev_batch_reader=dev_batch_reader,
                    learning_rate=args.learning_rate,
                    gradient_clipping=400,
                    batch_size=args.batch_size,
                    num_samples=args.num_samples,
                    num_epoch=args.num_epoch,
                    save_epoch=args.save_epoch,
                    num_iterations_print=args.num_iter_print,
                    test_off=args.test_off)
def start_server(): """Start the ASR server""" # prepare data generator data_generator = DataGenerator( vocab_filepath=args.vocab_path, mean_std_filepath=args.mean_std_path, augmentation_config='{}', specgram_type=args.specgram_type, num_threads=1) # prepare ASR model ds2_model = DeepSpeech2Model( vocab_size=data_generator.vocab_size, num_conv_layers=args.num_conv_layers, num_rnn_layers=args.num_rnn_layers, rnn_layer_size=args.rnn_layer_size, use_gru=args.use_gru, pretrained_model_path=args.model_path, share_rnn_weights=args.share_rnn_weights) # prepare ASR inference handler def file_to_transcript(filename): feature = data_generator.process_utterance(filename, "") result_transcript = ds2_model.infer_batch( infer_data=[feature], decoding_method=args.decoding_method, beam_alpha=args.alpha, beam_beta=args.beta, beam_size=args.beam_size, cutoff_prob=args.cutoff_prob, vocab_list=data_generator.vocab_list, language_model_path=args.lang_model_path, num_processes=1) return result_transcript[0] # warming up with utterrances sampled from Librispeech print('-----------------------------------------------------------') print('Warming up ...') warm_up_test( audio_process_handler=file_to_transcript, manifest_path=args.warmup_manifest, num_test_cases=3) print('-----------------------------------------------------------') # start the server server = AsrTCPServer( server_address=(args.host_ip, args.host_port), RequestHandlerClass=AsrRequestHandler, speech_save_dir=args.speech_save_dir, audio_process_handler=file_to_transcript) print("ASR Server Started.") server.serve_forever()
def infer(filenum):
    """Inference for DeepSpeech2.

    Decodes all batches of ``args.infer_manifest`` (roughly *filenum*
    utterances) and returns the accumulated list of transcriptions,
    displaying a progress bar while decoding.
    """
    generator = DataGenerator(
        vocab_filepath=args.vocab_path,
        mean_std_filepath=args.mean_std_path,
        augmentation_config='{}',
        specgram_type=args.specgram_type,
        num_threads=1,
        keep_transcription_text=True)
    reader = generator.batch_reader_creator(
        manifest_path=args.infer_manifest,
        batch_size=args.batch_size,
        min_batch_size=1,
        sortagrad=False,
        shuffle_method=None)
    model = DeepSpeech2Model(
        vocab_size=generator.vocab_size,
        num_conv_layers=args.num_conv_layers,
        num_rnn_layers=args.num_rnn_layers,
        rnn_layer_size=args.rnn_layer_size,
        use_gru=args.use_gru,
        pretrained_model_path=args.model_path,
        share_rnn_weights=args.share_rnn_weights)
    # decoders only accept string encoded in utf-8
    utf8_vocab = [ch.encode("utf-8") for ch in generator.vocab_list]
    # Beam search needs the external LM scorer.
    if args.decoding_method == "ctc_beam_search":
        model.init_ext_scorer(args.alpha, args.beta, args.lang_model_path,
                              utf8_vocab)
        model.logger.info("start inference ...")
    transcript = []
    widgets = ["Start inference ...: ", Percentage(), ' ', Bar(), ' ', ETA()]
    pbar = ProgressBar(widgets=widgets,
                       maxval=filenum / args.batch_size).start()
    for batch_idx, batch in enumerate(reader()):
        # Acoustic model probabilities are needed by both decoders.
        probs = model.infer_batch_probs(infer_data=batch,
                                        feeding_dict=generator.feeding)
        if args.decoding_method == "ctc_greedy":
            decoded = model.decode_batch_greedy(probs_split=probs,
                                                vocab_list=utf8_vocab)
        else:
            decoded = model.decode_batch_beam_search(
                probs_split=probs,
                beam_alpha=args.alpha,
                beam_beta=args.beta,
                beam_size=args.beam_size,
                cutoff_prob=args.cutoff_prob,
                cutoff_top_n=args.cutoff_top_n,
                vocab_list=utf8_vocab,
                num_processes=args.num_proc_bsearch)
        transcript += decoded
        pbar.update(batch_idx)
    pbar.finish()
    print("finish inference")
    return transcript
def train():
    """DeepSpeech2 training.

    Builds train/dev generators and batch readers from ``args`` and launches
    ``DeepSpeech2Model.train`` (paddle v1-style trainer).
    """
    # FIX: read the augmentation config with a context manager so the file
    # handle is closed deterministically (the original leaked it).
    with open(args.augment_conf_path, 'r') as conf_f:
        augmentation_config = conf_f.read()
    train_generator = DataGenerator(
        vocab_filepath=args.vocab_path,
        mean_std_filepath=args.mean_std_path,
        augmentation_config=augmentation_config,
        max_duration=args.max_duration,
        min_duration=args.min_duration,
        specgram_type=args.specgram_type,
        num_threads=args.num_proc_data)
    dev_generator = DataGenerator(
        vocab_filepath=args.vocab_path,
        mean_std_filepath=args.mean_std_path,
        augmentation_config="{}",
        specgram_type=args.specgram_type,
        num_threads=args.num_proc_data)
    # Sortagrad only makes sense when training from scratch.
    train_batch_reader = train_generator.batch_reader_creator(
        manifest_path=args.train_manifest,
        batch_size=args.batch_size,
        min_batch_size=args.trainer_count,
        sortagrad=args.use_sortagrad if args.init_model_path is None else False,
        shuffle_method=args.shuffle_method)
    dev_batch_reader = dev_generator.batch_reader_creator(
        manifest_path=args.dev_manifest,
        batch_size=args.batch_size,
        min_batch_size=1,  # must be 1, but will have errors.
        sortagrad=False,
        shuffle_method=None)
    ds2_model = DeepSpeech2Model(
        vocab_size=train_generator.vocab_size,
        num_conv_layers=args.num_conv_layers,
        num_rnn_layers=args.num_rnn_layers,
        rnn_layer_size=args.rnn_layer_size,
        use_gru=args.use_gru,
        pretrained_model_path=args.init_model_path,
        share_rnn_weights=args.share_rnn_weights)
    ds2_model.train(
        train_batch_reader=train_batch_reader,
        dev_batch_reader=dev_batch_reader,
        feeding_dict=train_generator.feeding,
        learning_rate=args.learning_rate,
        gradient_clipping=400,
        num_passes=args.num_passes,
        num_iterations_print=args.num_iter_print,
        output_model_dir=args.output_model_dir,
        is_local=args.is_local,
        test_off=args.test_off)
def evaluate():
    """Evaluate on whole test data for DeepSpeech2.

    Iterates over ``args.test_manifest`` batch by batch, decodes each batch,
    and prints the running and final character/word error rate.
    """
    generator = DataGenerator(
        vocab_filepath=args.vocab_path,
        mean_std_filepath=args.mean_std_path,
        augmentation_config='{}',
        specgram_type=args.specgram_type,
        num_threads=args.num_proc_data)
    reader = generator.batch_reader_creator(
        manifest_path=args.test_manifest,
        batch_size=args.batch_size,
        min_batch_size=1,
        sortagrad=False,
        shuffle_method=None)
    model = DeepSpeech2Model(
        vocab_size=generator.vocab_size,
        num_conv_layers=args.num_conv_layers,
        num_rnn_layers=args.num_rnn_layers,
        rnn_layer_size=args.rnn_layer_size,
        use_gru=args.use_gru,
        pretrained_model_path=args.model_path,
        share_rnn_weights=args.share_rnn_weights)
    # Pick the metric once, up front.
    error_rate_func = cer if args.error_rate_type == 'cer' else wer
    error_sum, num_ins = 0.0, 0
    for batch in reader():
        hypotheses = model.infer_batch(
            infer_data=batch,
            decoding_method=args.decoding_method,
            beam_alpha=args.alpha,
            beam_beta=args.beta,
            beam_size=args.beam_size,
            cutoff_prob=args.cutoff_prob,
            vocab_list=generator.vocab_list,
            language_model_path=args.lang_model_path,
            num_processes=args.num_proc_bsearch)
        # Rebuild reference strings from the token ids in the batch.
        references = [
            ''.join([generator.vocab_list[token] for token in tokens])
            for _, tokens in batch
        ]
        for reference, hypothesis in zip(references, hypotheses):
            error_sum += error_rate_func(reference, hypothesis)
            num_ins += 1
        print("Error rate [%s] (%d/?) = %f" %
              (args.error_rate_type, num_ins, error_sum / num_ins))
    print("Final error rate [%s] (%d/%d) = %f" %
          (args.error_rate_type, num_ins, num_ins, error_sum / num_ins))
def infer():
    """Inference for DeepSpeech2.

    Decodes one minibatch of ``args.infer_manifest`` and prints each target
    and output transcription together with its per-utterance error rate.
    """
    data_generator = DataGenerator(
        vocab_filepath=args.vocab_path,
        mean_std_filepath=args.mean_std_path,
        augmentation_config='{}',
        specgram_type=args.specgram_type,
        num_threads=1)
    batch_reader = data_generator.batch_reader_creator(
        manifest_path=args.infer_manifest,
        batch_size=args.num_samples,
        min_batch_size=1,
        sortagrad=False,
        shuffle_method=None)
    # FIX: next() works on both Python 2 and 3; `.next()` is Python-2 only.
    infer_data = next(batch_reader())
    ds2_model = DeepSpeech2Model(
        vocab_size=data_generator.vocab_size,
        num_conv_layers=args.num_conv_layers,
        num_rnn_layers=args.num_rnn_layers,
        rnn_layer_size=args.rnn_layer_size,
        use_gru=args.use_gru,
        pretrained_model_path=args.model_path,
        share_rnn_weights=args.share_rnn_weights)
    result_transcripts = ds2_model.infer_batch(
        infer_data=infer_data,
        decoding_method=args.decoding_method,
        beam_alpha=args.alpha,
        beam_beta=args.beta,
        beam_size=args.beam_size,
        cutoff_prob=args.cutoff_prob,
        vocab_list=data_generator.vocab_list,
        language_model_path=args.lang_model_path,
        num_processes=args.num_proc_bsearch)
    error_rate_func = cer if args.error_rate_type == 'cer' else wer
    # Rebuild reference strings from the token ids in the batch.
    target_transcripts = [
        ''.join([data_generator.vocab_list[token] for token in transcript])
        for _, transcript in infer_data
    ]
    for target, result in zip(target_transcripts, result_transcripts):
        print("\nTarget Transcription: %s\nOutput Transcription: %s" %
              (target, result))
        print("Current error rate [%s] = %f" %
              (args.error_rate_type, error_rate_func(target, result)))
def load_model():
    """Build the DS2 model, data generator and vocabulary for inference.

    Returns a ``(ds2_model, data_generator, vocab_list)`` tuple with the
    external LM scorer already initialized.
    """
    # check if set use_gpu=True in paddlepaddle cpu version
    check_cuda(args.use_gpu)
    # check if paddlepaddle version is satisfied
    check_version()
    if args.use_gpu:
        place = fluid.CUDAPlace(0)
    else:
        place = fluid.CPUPlace()
    # Load model
    data_generator = DataGenerator(
        vocab_filepath=args.vocab_path,
        mean_std_filepath=args.mean_std_path,
        augmentation_config='{}',
        specgram_type=args.specgram_type,
        keep_transcription_text=True,
        place=place,
        is_training=False)
    ds2_model = DeepSpeech2Model(
        vocab_size=data_generator.vocab_size,
        num_conv_layers=args.num_conv_layers,
        num_rnn_layers=args.num_rnn_layers,
        rnn_layer_size=args.rnn_layer_size,
        use_gru=args.use_gru,
        share_rnn_weights=args.share_rnn_weights,
        place=place,
        init_from_pretrained_model=args.model_path)
    # decoders only accept string encoded in utf-8
    vocab_list = data_generator.vocab_list
    # Scorer is initialized unconditionally here (beam search assumed).
    ds2_model.init_ext_scorer(args.alpha, args.beta, args.lang_model_path,
                              vocab_list)
    return ds2_model, data_generator, vocab_list
def tune():
    # Incrementally tune the alpha and beta decoder parameters.
    if not args.num_alphas >= 0:
        raise ValueError("num_alphas must be non-negative!")
    if not args.num_betas >= 0:
        raise ValueError("num_betas must be non-negative!")
    # Whether to use the GPU
    place = fluid.CUDAPlace(0) if args.use_gpu else fluid.CPUPlace()
    # Build the data generator
    data_generator = DataGenerator(vocab_filepath=args.vocab_path,
                                   mean_std_filepath=args.mean_std_path,
                                   augmentation_config='{}',
                                   specgram_type=args.specgram_type,
                                   keep_transcription_text=True,
                                   place=place,
                                   is_training=False)
    # Reader over the tuning data
    batch_reader = data_generator.batch_reader_creator(
        manifest_path=args.tune_manifest,
        batch_size=args.batch_size,
        shuffle_method=None)
    # Build the DeepSpeech2 model, configured for inference
    ds2_model = DeepSpeech2Model(vocab_size=data_generator.vocab_size,
                                 num_conv_layers=args.num_conv_layers,
                                 num_rnn_layers=args.num_rnn_layers,
                                 rnn_layer_size=args.rnn_layer_size,
                                 use_gru=args.use_gru,
                                 place=place,
                                 init_from_pretrained_model=args.model_path,
                                 share_rnn_weights=args.share_rnn_weights,
                                 is_infer=True)
    # Choose the metric: character or word error counts
    errors_func = char_errors if args.error_rate_type == 'cer' else word_errors
    # Build the (alpha, beta) search grid
    cand_alphas = np.linspace(args.alpha_from, args.alpha_to, args.num_alphas)
    cand_betas = np.linspace(args.beta_from, args.beta_to, args.num_betas)
    params_grid = [(alpha, beta) for alpha in cand_alphas
                   for beta in cand_betas]
    err_sum = [0.0 for i in range(len(params_grid))]
    err_ave = [0.0 for i in range(len(params_grid))]
    num_ins, len_refs, cur_batch = 0, 0, 0
    # Initialize the beam-search LM scorer
    ds2_model.init_ext_scorer(args.alpha_from, args.beta_from,
                              args.lang_model_path,
                              data_generator.vocab_list)
    # Incrementally tune the parameters over multiple batches
    ds2_model.logger.info("start tuning ...")
    for infer_data in batch_reader():
        if (args.num_batches >= 0) and (cur_batch >= args.num_batches):
            break
        # Run the acoustic model once; reuse probabilities for every grid point
        probs_split = ds2_model.infer_batch_probs(infer_data=infer_data)
        target_transcripts = infer_data[1]
        num_ins += len(target_transcripts)
        # Grid-search over the alpha and beta parameters
        for index, (alpha, beta) in enumerate(tqdm(params_grid)):
            result_transcripts = ds2_model.decode_batch_beam_search(
                probs_split=probs_split,
                beam_alpha=alpha,
                beam_beta=beta,
                beam_size=args.beam_size,
                cutoff_prob=args.cutoff_prob,
                cutoff_top_n=args.cutoff_top_n,
                vocab_list=data_generator.vocab_list,
                num_processes=args.num_proc_bsearch)
            for target, result in zip(target_transcripts, result_transcripts):
                errors, len_ref = errors_func(target, result)
                err_sum[index] += errors
                # Only accumulate the reference length once per utterance
                # (at the first grid point).
                if args.alpha_from == alpha and args.beta_from == beta:
                    len_refs += len_ref
            err_ave[index] = err_sum[index] / len_refs
        # Report the best parameters found so far for this batch
        err_ave_min = min(err_ave)
        min_index = err_ave.index(err_ave_min)
        print("\nBatch %d [%d/?], current opt (alpha, beta) = (%s, %s), "
              " min [%s] = %f" %
              (cur_batch, num_ins,
               "%.3f" % params_grid[min_index][0],
               "%.3f" % params_grid[min_index][1],
               args.error_rate_type, err_ave_min))
        cur_batch += 1
    # Print the final error rate for every (alpha, beta) pair
    print("\nFinal %s:\n" % args.error_rate_type)
    for index in range(len(params_grid)):
        print("(alpha, beta) = (%s, %s), [%s] = %f" %
              ("%.3f" % params_grid[index][0],
               "%.3f" % params_grid[index][1],
               args.error_rate_type, err_ave[index]))
    err_ave_min = min(err_ave)
    min_index = err_ave.index(err_ave_min)
    print("\n一共使用了 %d 批数据推理, 最优的参数为 (alpha, beta) = (%s, %s)" %
          (cur_batch,
           "%.3f" % params_grid[min_index][0],
           "%.3f" % params_grid[min_index][1]))
    ds2_model.logger.info("finish tuning")
place = fluid.CPUPlace() # 获取数据生成器,处理数据和获取字典需要 data_generator = DataGenerator(vocab_filepath=args.vocab_path, mean_std_filepath=args.mean_std_path, augmentation_config='{}', specgram_type=args.specgram_type, keep_transcription_text=True, place=place, is_training=False) # 获取DeepSpeech2模型,并设置为预测 ds2_model = DeepSpeech2Model(vocab_size=data_generator.vocab_size, num_conv_layers=args.num_conv_layers, num_rnn_layers=args.num_rnn_layers, rnn_layer_size=args.rnn_layer_size, use_gru=args.use_gru, init_from_pretrained_model=args.model_path, place=place, share_rnn_weights=args.share_rnn_weights, is_infer=True) # 定向搜索方法的处理 if args.decoding_method == "ctc_beam_search": ds2_model.init_ext_scorer(args.alpha, args.beta, args.lang_model_path, data_generator.vocab_list) # 开始预测 def predict(filename): # 加载音频文件,并进行预处理 feature = data_generator.process_utterance(filename, "") # 执行预测
def SpeechRecognizer():
    """Inference for DeepSpeech2.

    Decodes one minibatch of the hard-coded ``data/cctv/manifest`` with beam
    search (aishell model + Chinese LM) and returns the transcriptions.
    """
    paddle.init(use_gpu=True, rnn_use_batch=True, trainer_count=1)
    generator = DataGenerator(
        vocab_filepath='models/aishell/vocab.txt',
        mean_std_filepath='data/aishell/mean_std.npz',
        augmentation_config='{}',
        specgram_type='linear',
        num_threads=1,
        keep_transcription_text=True)
    reader = generator.batch_reader_creator(
        manifest_path='data/cctv/manifest',
        batch_size=10,
        min_batch_size=1,
        sortagrad=False,
        shuffle_method=None)
    # Take a single minibatch (Python-2 style reader).
    batch = reader().next()
    model = DeepSpeech2Model(
        vocab_size=generator.vocab_size,
        num_conv_layers=2,
        num_rnn_layers=3,
        rnn_layer_size=1024,
        use_gru=True,
        pretrained_model_path='models/aishell/params.tar.gz',
        share_rnn_weights=False)
    # decoders only accept string encoded in utf-8
    utf8_vocab = [ch.encode("utf-8") for ch in generator.vocab_list]
    # Beam search is always used here; set up the LM scorer with the
    # hard-coded tuned weights (alpha=2.6, beta=5.0).
    model.init_ext_scorer(2.6, 5.0,
                          'models/lm/zh_giga.no_cna_cmn.prune01244.klm',
                          utf8_vocab)
    model.logger.info("start inference ...")
    probs = model.infer_batch_probs(
        infer_data=batch,
        feeding_dict=generator.feeding)
    results = model.decode_batch_beam_search(
        probs_split=probs,
        beam_alpha=2.6,
        beam_beta=5.0,
        beam_size=300,
        cutoff_prob=0.99,
        cutoff_top_n=40,
        vocab_list=utf8_vocab,
        num_processes=8)
    # Return a copy of the decoded batch.
    return results[:]
def infer(transcript_name):
    """Inference for DeepSpeech2."""
    # check if set use_gpu=True in paddlepaddle cpu version
    check_cuda(args.use_gpu)
    # check if paddlepaddle version is satisfied
    check_version()
    if args.use_gpu:
        place = fluid.CUDAPlace(0)
    else:
        place = fluid.CPUPlace()
    data_generator = DataGenerator(
        vocab_filepath=args.vocab_path,
        mean_std_filepath=args.mean_std_path,
        augmentation_config='{}',
        specgram_type=args.specgram_type,
        keep_transcription_text=True,
        place=place,
        is_training=False)
    batch_reader = data_generator.batch_reader_creator(
        manifest_path=args.infer_manifest,
        batch_size=args.num_samples,
        sortagrad=False,
        shuffle_method=None)
    # Only the first minibatch is decoded.
    infer_data = next(batch_reader())
    ds2_model = DeepSpeech2Model(
        vocab_size=data_generator.vocab_size,
        num_conv_layers=args.num_conv_layers,
        num_rnn_layers=args.num_rnn_layers,
        rnn_layer_size=args.rnn_layer_size,
        use_gru=args.use_gru,
        share_rnn_weights=args.share_rnn_weights,
        place=place,
        init_from_pretrained_model=args.model_path)
    # decoders only accept string encoded in utf-8
    vocab_list = [chars.encode("utf-8") for chars in data_generator.vocab_list]
    if args.decoding_method == "ctc_greedy":
        ds2_model.logger.info("start inference ...")
        probs_split = ds2_model.infer_batch_probs(
            infer_data=infer_data,
            feeding_dict=data_generator.feeding)
        result_transcripts = ds2_model.decode_batch_greedy(
            probs_split=probs_split,
            vocab_list=vocab_list)
    else:
        # Beam search: initialize the external LM scorer first.
        ds2_model.init_ext_scorer(args.alpha, args.beta, args.lang_model_path,
                                  vocab_list)
        ds2_model.logger.info("start inference ...")
        probs_split = ds2_model.infer_batch_probs(
            infer_data=infer_data,
            feeding_dict=data_generator.feeding)
        result_transcripts = ds2_model.decode_batch_beam_search(
            probs_split=probs_split,
            beam_alpha=args.alpha,
            beam_beta=args.beta,
            beam_size=args.beam_size,
            cutoff_prob=args.cutoff_prob,
            cutoff_top_n=args.cutoff_top_n,
            vocab_list=vocab_list,
            num_processes=args.num_proc_bsearch)
    # Only the first utterance's transcription is kept and written out.
    transcription = result_transcripts[0].capitalize() + '.'
    print(transcription)
    with codecs.open('dataset/tap/transcription/'+transcript_name+'.txt',
                     'w', 'utf-8') as out_file:
        out_file.write(transcription)
    ds2_model.logger.info("finish inference")
def train():
    """DeepSpeech2 training (fluid backend)."""
    # Check the PaddlePaddle environment
    check_cuda(args.use_gpu)
    check_version()
    # Whether to use the GPU
    if args.use_gpu:
        place = fluid.CUDAPlace(0)
    else:
        place = fluid.CPUPlace()
    # Build the training data generator
    train_generator = DataGenerator(vocab_filepath=args.vocab_path,
                                    mean_std_filepath=args.mean_std_path,
                                    augmentation_config=io.open(
                                        args.augment_conf_path,
                                        mode='r',
                                        encoding='utf8').read(),
                                    max_duration=args.max_duration,
                                    min_duration=args.min_duration,
                                    specgram_type=args.specgram_type,
                                    place=place)
    # Build the test data generator
    test_generator = DataGenerator(vocab_filepath=args.vocab_path,
                                   mean_std_filepath=args.mean_std_path,
                                   augmentation_config="{}",
                                   specgram_type=args.specgram_type,
                                   keep_transcription_text=True,
                                   place=place,
                                   is_training=False)
    # Training data reader; sortagrad only when training from scratch
    train_batch_reader = train_generator.batch_reader_creator(
        manifest_path=args.train_manifest,
        batch_size=args.batch_size,
        sortagrad=args.use_sortagrad
        if args.init_from_pretrained_model is None else False,
        shuffle_method=args.shuffle_method)
    # Test data reader
    test_batch_reader = test_generator.batch_reader_creator(
        manifest_path=args.dev_manifest,
        batch_size=args.batch_size,
        sortagrad=False,
        shuffle_method=None)
    # Build the DeepSpeech2 model
    ds2_model = DeepSpeech2Model(
        vocab_size=train_generator.vocab_size,
        num_conv_layers=args.num_conv_layers,
        num_rnn_layers=args.num_rnn_layers,
        rnn_layer_size=args.rnn_layer_size,
        use_gru=args.use_gru,
        share_rnn_weights=args.share_rnn_weights,
        place=place,
        init_from_pretrained_model=args.init_from_pretrained_model,
        output_model_dir=args.output_model_dir,
        vocab_list=test_generator.vocab_list)
    # Count the training samples within the duration limits
    num_samples = get_data_len(args.train_manifest, args.max_duration,
                               args.min_duration)
    print("[%s] 训练数据数量:%d\n" % (datetime.now(), num_samples))
    # Start training
    ds2_model.train(train_batch_reader=train_batch_reader,
                    dev_batch_reader=test_batch_reader,
                    learning_rate=args.learning_rate,
                    gradient_clipping=400,
                    batch_size=args.batch_size,
                    num_samples=num_samples,
                    num_epoch=args.num_epoch,
                    save_epoch=args.save_epoch,
                    num_iterations_print=args.num_iter_print,
                    test_off=args.test_off)
def evaluate():
    """Evaluate on whole test data for DeepSpeech2."""
    data_generator = DataGenerator(
        vocab_filepath=args.vocab_path,
        mean_std_filepath=args.mean_std_path,
        augmentation_config='{}',
        specgram_type=args.specgram_type,
        num_threads=args.num_proc_data,
        keep_transcription_text=True)
    ds2_model = DeepSpeech2Model(
        vocab_size=data_generator.vocab_size,
        num_conv_layers=args.num_conv_layers,
        num_rnn_layers=args.num_rnn_layers,
        rnn_layer_size=args.rnn_layer_size,
        use_gru=args.use_gru,
        pretrained_model_path=args.model_path,
        share_rnn_weights=args.share_rnn_weights)
    # decoders only accept string encoded in utf-8
    vocab_list = [chars.encode("utf-8") for chars in data_generator.vocab_list]
    if args.decoding_method == "ctc_beam_search":
        ds2_model.init_ext_scorer(args.alpha, args.beta, args.lang_model_path,
                                  vocab_list)
    # NOTE(review): errors_func is assigned but never used in this function.
    errors_func = char_errors if args.error_rate_type == 'cer' else word_errors

    # prepare ASR inference handler
    def file_to_transcript(filename):
        # Decode a single audio file with the configured decoding method.
        feature = data_generator.process_utterance(filename, "")
        probs_split = ds2_model.infer_batch_probs(
            infer_data=[feature],
            feeding_dict=data_generator.feeding)
        if args.decoding_method == "ctc_greedy":
            result_transcript = ds2_model.decode_batch_greedy(
                probs_split=probs_split,
                vocab_list=vocab_list)
        else:
            result_transcript = ds2_model.decode_batch_beam_search(
                probs_split=probs_split,
                beam_alpha=args.alpha,
                beam_beta=args.beta,
                beam_size=args.beam_size,
                cutoff_prob=args.cutoff_prob,
                cutoff_top_n=args.cutoff_top_n,
                vocab_list=vocab_list,
                num_processes=1)
        return result_transcript[0]

    parentdir = os.path.join(args.src_path)
    manifest_path = args.manifest_path
    manifest = read_manifest(
        manifest_path=manifest_path)
    # Decode every file listed in the manifest.
    transcripts = []
    for entry in manifest:
        fname = entry["audio_filepath"]
        transcript = file_to_transcript(fname)
        transcripts.append((fname, fname.split("/")[-1], transcript))
    df = pd.DataFrame(data=transcripts,
                      columns=["wav_path", "wav_name", "transcripts"])
    df.sort_values("wav_name", inplace=True)
    # Best-effort output writing: each artifact is written independently and
    # failures are deliberately swallowed (bare except) so one bad path does
    # not lose the others.
    try:
        with open(os.path.join(parentdir, 'transcripts_list_' +
                               datetime.datetime.now().strftime("%H:%M:%S") +
                               ".b"), 'wb') as f:
            pickle.dump(transcripts, f)
    except:
        pass
    try:
        with open(os.path.join(parentdir, 'ds2_stt_complete.csv'), 'w') as f:
            df.to_csv(f, index=False)
    except:
        pass
    try:
        with open(os.path.join(parentdir, 'ds2_stt.txt'), 'w') as f:
            for trans in df["transcripts"]:
                f.write(pre_process_srt(trans) + " ")
    except:
        pass
    ds2_model.logger.info("finish evaluation")
def main(config):
    """Train a DeepSpeech2-style model from a YAML experiment config.

    Reads model/optimizer/scheduler settings from ``config``, creates a
    timestamped experiment directory under ``<exp_root>/exps``, snapshots the
    config there, builds the train/val datasets, and launches training with
    TensorBoard logging.
    """
    import warnings  # local import: only needed for the optional-key fallback

    model = networks[config["basic"]["model"]]
    pretrained_model_path = config["basic"]["pt_model_path"]
    device = config["basic"]["device"]
    exp_root_dir = config["basic"]["exp_root_path"]
    ds2_model_path = config["basic"]["ds2_model_path"]
    augmentation_config_name = config["basic"]["augmentation_config_name"]
    language_model_path = config["basic"]["language_model_path"]
    vocab_filepath = config["basic"]["vocab_filepath"]
    mean_std_filepath = config["basic"]["mean_std_filepath"]

    batch_size = config["train"]["batch_size"]
    max_duration = config["train"]["max_duration"]
    min_duration = config["train"]["min_duration"]
    segmented = config["train"]["segmented"]
    num_passes = config["train"]["num_total_epochs"]
    num_iterations_print = config["train"]["num_iterations_validate"]
    sortN_epoch = config["train"]["num_sorted_epoch"]
    num_workers = config["train"]["num_workers"]
    print(num_workers)

    train_csv = config["data"]["train_csv"]
    val_csv = config["data"]["val_csv"]
    test_csv = config["data"]["test_csv"]

    lr = config["optimizer"]["learning_rate"]
    included_lr_key = config["optimizer"]["included_layer_keywords"]
    excluded_lr_key = config["optimizer"]["excluded_layer_keywords"]
    # "specific_lr_dict" is optional.  FIX: the original built a ``Warning``
    # exception instance without raising it (a silent no-op) behind a bare
    # ``except``; emit a real warning and catch only the expected KeyError.
    try:
        specific_lr_dict = config["optimizer"]["specific_lr_dict"]
    except KeyError:
        specific_lr_dict = None
        warnings.warn("You miss the keyword specific_lr_dict")

    scheduler_gamma = config["scheduler"]["gamma"]

    # Load the augmentation pipeline config; "{}" means no augmentation.
    if augmentation_config_name:
        with open(os.path.join(exp_root_dir, "conf",
                               augmentation_config_name), 'r') as f:
            augmentation_config = f.read()
    else:
        augmentation_config = "{}"

    # Timestamped experiment directory + config snapshot for reproducibility.
    filename = datetime.now().strftime("%y%m%d-%H:%M:%S")
    output_dir = os.path.join(exp_root_dir, "exps", filename)
    mkpath(os.path.join(output_dir, "models"))
    mkpath(os.path.join(output_dir, "vals"))
    with open(os.path.join(output_dir, "experiment.yaml"), 'w') as f:
        yaml.safe_dump(config, stream=f)
    log_dir = os.path.join(exp_root_dir, "tensorboard", filename)

    train_dataset = SpecgramGenerator(
        manifest=os.path.join(exp_root_dir, "data", train_csv),
        vocab_filepath=vocab_filepath,
        mean_std_filepath=mean_std_filepath,
        augmentation_config=augmentation_config,
        max_duration=max_duration,
        min_duration=min_duration,
        segmented=segmented)
    # Validation data gets no augmentation (default augmentation config).
    val_dataset = SpecgramGenerator(
        manifest=os.path.join(exp_root_dir, "data", val_csv),
        vocab_filepath=vocab_filepath,
        mean_std_filepath=mean_std_filepath,
        max_duration=max_duration,
        min_duration=min_duration,
        segmented=segmented)

    vocab_list = ["'", ' ', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j',
                  'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v',
                  'w', 'x', 'y', 'z']
    ds2_model = DeepSpeech2Model(model=model,
                                 ds2_model_path=ds2_model_path,
                                 vocab_list=vocab_list,
                                 device=device)
    tensorboard_writer = SummaryWriter(log_dir=log_dir)
    with open(os.path.join(output_dir, "model_info.txt"), 'w') as f:
        f.write("DNN structure: \n{}\n".format(ds2_model.model))
    if pretrained_model_path:
        ds2_model.load_weights(pretrained_model_path)
    ds2_model.init_ext_scorer(1.4, 0.35, language_model_path)

    ds2_model.train(train_dataset=train_dataset,
                    train_batchsize=batch_size,
                    val_dataset=val_dataset,
                    val_batchsize=batch_size,
                    collate_fn=SpecgramGenerator.padding_batch,
                    lr_key=included_lr_key,
                    # (sic) keyword name is defined by DeepSpeech2Model.train
                    exclue_lr_key=excluded_lr_key,
                    learning_rate=lr,
                    scheduler_gamma=scheduler_gamma,
                    gradient_clipping=40,
                    num_passes=num_passes,
                    num_iterations_print=num_iterations_print,
                    writer=tensorboard_writer,
                    output_dir=output_dir,
                    sortN_epoch=sortN_epoch,
                    num_workers=num_workers,
                    specific_lr_dict=specific_lr_dict)
def tune():
    """Tune parameters alpha and beta incrementally.

    Grid-searches (alpha, beta) for the CTC beam-search decoder over the tune
    manifest, accumulating error rates batch by batch so an intermediate
    optimum can be printed after every batch.
    """
    if not args.num_alphas >= 0:
        raise ValueError("num_alphas must be non-negative!")
    if not args.num_betas >= 0:
        raise ValueError("num_betas must be non-negative!")
    if args.use_gpu:
        place = fluid.CUDAPlace(0)
    else:
        place = fluid.CPUPlace()
    data_generator = DataGenerator(
        vocab_filepath=args.vocab_path,
        mean_std_filepath=args.mean_std_path,
        augmentation_config='{}',
        specgram_type=args.specgram_type,
        keep_transcription_text=True,
        place = place,
        is_training = False)
    batch_reader = data_generator.batch_reader_creator(
        manifest_path=args.tune_manifest,
        batch_size=args.batch_size,
        sortagrad=False,
        shuffle_method=None)
    ds2_model = DeepSpeech2Model(
        vocab_size=data_generator.vocab_size,
        num_conv_layers=args.num_conv_layers,
        num_rnn_layers=args.num_rnn_layers,
        rnn_layer_size=args.rnn_layer_size,
        use_gru=args.use_gru,
        place=place,
        init_from_pretrained_model=args.model_path,
        share_rnn_weights=args.share_rnn_weights)
    # decoders only accept string encoded in utf-8
    vocab_list = [chars.encode("utf-8") for chars in data_generator.vocab_list]
    # char_errors for CER, word_errors for WER
    errors_func = char_errors if args.error_rate_type == 'cer' else word_errors
    # create grid for search
    cand_alphas = np.linspace(args.alpha_from, args.alpha_to, args.num_alphas)
    cand_betas = np.linspace(args.beta_from, args.beta_to, args.num_betas)
    params_grid = [(alpha, beta) for alpha in cand_alphas
                   for beta in cand_betas]

    err_sum = [0.0 for i in range(len(params_grid))]
    err_ave = [0.0 for i in range(len(params_grid))]
    num_ins, len_refs, cur_batch = 0, 0, 0
    # initialize external scorer
    ds2_model.init_ext_scorer(args.alpha_from, args.beta_from,
                              args.lang_model_path, vocab_list)
    ## incremental tuning parameters over multiple batches
    ds2_model.logger.info("start tuning ...")
    for infer_data in batch_reader():
        # a negative num_batches means "use every batch"
        if (args.num_batches >= 0) and (cur_batch >= args.num_batches):
            break
        probs_split = ds2_model.infer_batch_probs(
            infer_data=infer_data,
            feeding_dict=data_generator.feeding)
        # infer_data[1] holds the reference transcripts for the batch
        target_transcripts = infer_data[1]
        num_ins += len(target_transcripts)
        # grid search
        for index, (alpha, beta) in enumerate(params_grid):
            result_transcripts = ds2_model.decode_batch_beam_search(
                probs_split=probs_split,
                beam_alpha=alpha,
                beam_beta=beta,
                beam_size=args.beam_size,
                cutoff_prob=args.cutoff_prob,
                cutoff_top_n=args.cutoff_top_n,
                vocab_list=vocab_list,
                num_processes=args.num_proc_bsearch)
            for target, result in zip(target_transcripts, result_transcripts):
                errors, len_ref = errors_func(target, result)
                err_sum[index] += errors
                # accumulate the length of references of every batch
                # in the first iteration (reference length is the same for
                # every grid point, so count it only once per batch)
                if args.alpha_from == alpha and args.beta_from == beta:
                    len_refs += len_ref
            err_ave[index] = err_sum[index] / len_refs
            # lightweight progress marker: one dot per two grid points
            if index % 2 == 0:
                sys.stdout.write('.')
                sys.stdout.flush()

        # output on-line tuning result at the end of current batch
        err_ave_min = min(err_ave)
        min_index = err_ave.index(err_ave_min)
        print("\nBatch %d [%d/?], current opt (alpha, beta) = (%s, %s), "
              " min [%s] = %f"
              %(cur_batch, num_ins,
                "%.3f" % params_grid[min_index][0],
                "%.3f" % params_grid[min_index][1],
                args.error_rate_type, err_ave_min))
        cur_batch += 1

    # output WER/CER at every (alpha, beta)
    print("\nFinal %s:\n" % args.error_rate_type)
    for index in range(len(params_grid)):
        print("(alpha, beta) = (%s, %s), [%s] = %f"
              % ("%.3f" % params_grid[index][0],
                 "%.3f" % params_grid[index][1],
                 args.error_rate_type, err_ave[index]))

    err_ave_min = min(err_ave)
    min_index = err_ave.index(err_ave_min)
    print("\nFinish tuning on %d batches, final opt (alpha, beta) = (%s, %s)"
          % (cur_batch, "%.3f" % params_grid[min_index][0],
             "%.3f" % params_grid[min_index][1]))
    ds2_model.logger.info("finish tuning")
def evaluate():
    """Evaluate on whole test data for DeepSpeech2."""
    # check if set use_gpu=True in paddlepaddle cpu version
    check_cuda(args.use_gpu)
    # check if paddlepaddle version is satisfied
    check_version()
    if args.use_gpu:
        place = fluid.CUDAPlace(0)
    else:
        place = fluid.CPUPlace()
    data_generator = DataGenerator(
        vocab_filepath=args.vocab_path,
        mean_std_filepath=args.mean_std_path,
        augmentation_config='{}',
        specgram_type=args.specgram_type,
        keep_transcription_text=True,
        place = place,
        is_training = False)
    batch_reader = data_generator.batch_reader_creator(
        manifest_path=args.test_manifest,
        batch_size=args.batch_size,
        sortagrad=False,
        shuffle_method=None)
    ds2_model = DeepSpeech2Model(
        vocab_size=data_generator.vocab_size,
        num_conv_layers=args.num_conv_layers,
        num_rnn_layers=args.num_rnn_layers,
        rnn_layer_size=args.rnn_layer_size,
        use_gru=args.use_gru,
        share_rnn_weights=args.share_rnn_weights,
        place=place,
        init_from_pretrained_model=args.model_path)
    # decoders only accept string encoded in utf-8
    # NOTE(review): unlike sibling variants in this file, the vocab here is
    # NOT utf-8 encoded; presumably this code path's decoder accepts plain
    # strings (Python 3) -- confirm against the decoder implementation.
    vocab_list = [chars for chars in data_generator.vocab_list]
    if args.decoding_method == "ctc_beam_search":
        ds2_model.init_ext_scorer(args.alpha, args.beta,
                                  args.lang_model_path, vocab_list)
    # char_errors for CER, word_errors for WER
    errors_func = char_errors if args.error_rate_type == 'cer' else word_errors
    errors_sum, len_refs, num_ins = 0.0, 0, 0
    ds2_model.logger.info("start evaluation ...")
    for infer_data in batch_reader():
        probs_split = ds2_model.infer_batch_probs(
            infer_data=infer_data,
            feeding_dict=data_generator.feeding)
        if args.decoding_method == "ctc_greedy":
            result_transcripts = ds2_model.decode_batch_greedy(
                probs_split=probs_split,
                vocab_list=vocab_list)
        else:
            result_transcripts = ds2_model.decode_batch_beam_search(
                probs_split=probs_split,
                beam_alpha=args.alpha,
                beam_beta=args.beta,
                beam_size=args.beam_size,
                cutoff_prob=args.cutoff_prob,
                cutoff_top_n=args.cutoff_top_n,
                vocab_list=vocab_list,
                num_processes=args.num_proc_bsearch)
        # infer_data[1] holds the reference transcripts for the batch
        target_transcripts = infer_data[1]
        for target, result in zip(target_transcripts, result_transcripts):
            errors, len_ref = errors_func(target, result)
            errors_sum += errors
            len_refs += len_ref
            num_ins += 1
        # running error rate; total instance count is unknown, hence "?"
        print("Error rate [%s] (%d/?) = %f" %
              (args.error_rate_type, num_ins, errors_sum / len_refs))
    print("Final error rate [%s] (%d/%d) = %f" %
          (args.error_rate_type, num_ins, num_ins, errors_sum / len_refs))
    ds2_model.logger.info("finish evaluation")
def infer():
    """Inference for DeepSpeech2, writing timestamped transcriptions.

    Reads per-utterance durations from the manifest, rewrites the fixed
    header of ``args.input_file`` into ``args.output_file``, then appends
    each beam-search transcript prefixed with ``start|end|ASR_01|``
    timestamps derived from the running duration total.

    NOTE(review): this code targets Python 2 (``str.encode`` results are
    written to text-mode files).
    """
    # extract the duration from manifest
    f = open(args.infer_manifest)
    timelist = []
    for line in f:
        d = json.loads(line.strip())['duration']
        timelist.append(d)
    data_generator = DataGenerator(
        vocab_filepath=args.vocab_path,
        mean_std_filepath=args.mean_std_path,
        augmentation_config='{}',
        specgram_type=args.specgram_type,
        num_threads=1,
        keep_transcription_text=True)
    batch_reader = data_generator.batch_reader_creator(
        manifest_path=args.infer_manifest,
        batch_size=args.batch_size,
        min_batch_size=1,
        sortagrad=False,
        shuffle_method=None)
    ds2_model = DeepSpeech2Model(
        vocab_size=data_generator.vocab_size,
        num_conv_layers=args.num_conv_layers,
        num_rnn_layers=args.num_rnn_layers,
        rnn_layer_size=args.rnn_layer_size,
        use_gru=args.use_gru,
        pretrained_model_path=args.model_path,
        share_rnn_weights=args.share_rnn_weights)
    # decoders only accept string encoded in utf-8
    vocab_list = [chars.encode("utf-8") for chars in data_generator.vocab_list]
    if args.decoding_method == "ctc_greedy":
        # BUG(review): this branch references ``infer_data`` before any batch
        # has been read (NameError if taken) and writes no output.  Left
        # unchanged pending a decision on the greedy code path.
        ds2_model.logger.info("start inference ...")
        probs_split = ds2_model.infer_batch_probs(
            infer_data=infer_data,
            feeding_dict=data_generator.feeding)
        result_transcripts = ds2_model.decode_batch_greedy(
            probs_split=probs_split,
            vocab_list=vocab_list)
    else:
        ds2_model.init_ext_scorer(args.alpha, args.beta,
                                  args.lang_model_path, vocab_list)
        ds2_model.logger.info("start inference ...")
        # Rewrite the fixed header lines of the annotation file.
        with open(args.input_file, 'r') as f:
            l = f.readlines()
        l[8] = "ASR_01|CMN\n"
        start_time = l[10].split('|')[0]
        end_time = l[10].split('|')[1]
        time_now = str(datetime.datetime.now())[:16]  # get the current time
        l[10] = "|".join(["ASR_01", time_now,
                          "Source_Program=Baidu DeepSpeech2,infer.sh",
                          "Source_Person=Zhaoqing Xu,Shuwei Xu",
                          "Codebook=Chinese Speech to Text\n"])
        end_line = ""
        if l[-1].startswith("END"):
            end_line = l[-1]
        l = l[:11]
        with open(args.output_file, "w") as f:
            f.writelines(l)
        for infer_data in batch_reader():
            probs_split = ds2_model.infer_batch_probs(
                infer_data=infer_data,
                feeding_dict=data_generator.feeding)
            result_transcripts = ds2_model.decode_batch_beam_search(
                probs_split=probs_split,
                beam_alpha=args.alpha,
                beam_beta=args.beta,
                beam_size=args.beam_size,
                cutoff_prob=args.cutoff_prob,
                cutoff_top_n=args.cutoff_top_n,
                vocab_list=vocab_list,
                num_processes=args.num_proc_bsearch)
            # NOTE(review): ``index`` resets per batch while ``timelist`` is
            # global over the manifest -- only correct if all utterances fit
            # in one batch; confirm batch_size covers the manifest.
            index = 0
            for result in result_transcripts:
                with open(args.output_file, 'a+') as f:
                    print("\nOutput Transcription: %s" %
                          result.encode('utf-8'))
                    # FIX: ``start`` was undefined because its assignment had
                    # been commented out, raising NameError on first result;
                    # restore it (drop the fractional part of start_time).
                    start, m_sec = start_time.split('.')
                    time_format = '%Y%m%d%H%M%S.%f'
                    end = (datetime.datetime.strptime(start_time, time_format)
                           + datetime.timedelta(0, timelist[index])
                           ).strftime(time_format)
                    index += 1
                    # end[:-3] keeps milliseconds only on the end timestamp
                    prefix = start + '|' + end[:-3] + '|ASR_01|'
                    f.write(prefix)
                    f.write(result.encode('utf-8'))
                    f.write('\n')
                    start_time = end
        with open(args.output_file, 'a+') as f:
            f.write(end_line)
    ds2_model.logger.info("finish inference")
def test(config):
    """Run beam-search inference over the test split described by ``config``.

    Returns a DataFrame with one row per utterance: its id (``uttid``), the
    raw per-frame probabilities, the decoded transcript (``asr``), and the
    reference text (``text``).
    """
    model = networks[config["basic"]["model"]]
    pretrained_model_path = config["basic"]["pt_model_path"]
    device = config["basic"]["device"]
    exp_root_dir = config["basic"]["exp_root_path"]
    ds2_model_path = config["basic"]["ds2_model_path"]
    pt_model_path = config["basic"]["pt_model_path"]
    use_pt_model = config["basic"]["use_pt_model"]
    augmentation_config_name = config["basic"]["augmentation_config_name"]
    language_model_path = config["basic"]["language_model_path"]
    vocab_filepath = config["basic"]["vocab_filepath"]
    mean_std_filepath = config["basic"]["mean_std_filepath"]

    batch_size = config["test"]["batch_size"]
    # FIX: the original assignments ended with stray commas, which bound
    # max_duration/min_duration to 1-element tuples instead of numbers.
    max_duration = config["test"]["max_duration"]
    min_duration = config["test"]["min_duration"]
    segmented = config["test"]["segmented"]
    num_workers = config["test"]["num_workers"]
    test_csv = config["data"]["test_csv"]

    # No augmentation at test time ("{}" config).
    test_dataset = SpecgramGenerator(
        manifest=os.path.join(exp_root_dir, "data", test_csv),
        vocab_filepath=vocab_filepath,
        mean_std_filepath=mean_std_filepath,
        augmentation_config="{}",
        max_duration=max_duration,
        min_duration=min_duration,
        segmented=segmented)
    dataloader = DataLoader(test_dataset,
                            batch_size=batch_size,
                            shuffle=False,
                            num_workers=num_workers,
                            collate_fn=SpecgramGenerator.padding_batch)
    vocab_list = [
        "'", ' ', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l',
        'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z'
    ]
    ds2_model = DeepSpeech2Model(model=model,
                                 ds2_model_path=ds2_model_path,
                                 vocab_list=vocab_list,
                                 device=device)
    if use_pt_model and pretrained_model_path:
        ds2_model.load_weights(pt_model_path)
    ds2_model.init_ext_scorer(1.4, 0.35, language_model_path)

    outputs = defaultdict(list)
    beam_alpha = 1.1
    for i_batch, sample_batched in enumerate(dataloader):
        batch_results = ds2_model.infer_batch_probs(infer_data=sample_batched)
        batch_transcripts_beam = ds2_model.decode_batch_beam_search(
            probs_split=batch_results,
            beam_alpha=beam_alpha,
            beam_beta=0.35,
            beam_size=500,
            cutoff_prob=1.0,
            cutoff_top_n=40,
            num_processes=6)
        outputs["uttid"].extend(sample_batched["uttid"])
        outputs["probs"].extend(batch_results)
        outputs["asr"].extend(batch_transcripts_beam)
        outputs["text"].extend(sample_batched["trans"])
    df = pd.DataFrame.from_dict(outputs)
    return df
def start_server():
    """Start the ASR server"""
    # Select the execution device shared by feature extraction and the model.
    place = fluid.CUDAPlace(0) if args.use_gpu else fluid.CPUPlace()

    # Feature pipeline used both for warm-up and for live requests.
    data_generator = DataGenerator(vocab_filepath=args.vocab_path,
                                   mean_std_filepath=args.mean_std_path,
                                   augmentation_config='{}',
                                   specgram_type=args.specgram_type,
                                   keep_transcription_text=True,
                                   place=place,
                                   is_training=False)
    # Inference-only DeepSpeech2 model restored from a pretrained checkpoint.
    ds2_model = DeepSpeech2Model(vocab_size=data_generator.vocab_size,
                                 num_conv_layers=args.num_conv_layers,
                                 num_rnn_layers=args.num_rnn_layers,
                                 rnn_layer_size=args.rnn_layer_size,
                                 use_gru=args.use_gru,
                                 init_from_pretrained_model=args.model_path,
                                 place=place,
                                 share_rnn_weights=args.share_rnn_weights,
                                 is_infer=True)
    # The external LM scorer is only required for beam-search decoding.
    if args.decoding_method == "ctc_beam_search":
        ds2_model.init_ext_scorer(args.alpha, args.beta, args.lang_model_path,
                                  data_generator.vocab_list)

    def file_to_transcript(filename):
        """Decode a single audio file and return its transcript."""
        feature = data_generator.process_utterance(filename, "")
        probs_split = ds2_model.infer(feature=feature)
        if args.decoding_method == "ctc_greedy":
            decoded = ds2_model.decode_batch_greedy(
                probs_split=probs_split,
                vocab_list=data_generator.vocab_list)
        else:
            decoded = ds2_model.decode_batch_beam_search(
                probs_split=probs_split,
                beam_alpha=args.alpha,
                beam_beta=args.beta,
                beam_size=args.beam_size,
                cutoff_prob=args.cutoff_prob,
                cutoff_top_n=args.cutoff_top_n,
                vocab_list=data_generator.vocab_list,
                num_processes=1)
        return decoded[0]

    # warming up with utterrances sampled from Librispeech
    print('-----------------------------------------------------------')
    print('Warming up ...')
    warm_up_test(audio_process_handler=file_to_transcript,
                 manifest_path=args.warmup_manifest,
                 num_test_cases=3)
    print('-----------------------------------------------------------')

    # start the server
    server = AsrTCPServer(server_address=(args.host_ip, args.host_port),
                          RequestHandlerClass=AsrRequestHandler,
                          speech_save_dir=args.speech_save_dir,
                          audio_process_handler=file_to_transcript)
    print("ASR Server Started.")
    server.serve_forever()
def evaluate():
    """Evaluate a DeepSpeech2 model over the whole test manifest."""
    # Check the PaddlePaddle environment: no GPU request on a CPU-only build,
    # and the installed paddle version is recent enough.
    check_cuda(args.use_gpu)
    check_version()
    # Whether to run on GPU.
    place = fluid.CUDAPlace(0) if args.use_gpu else fluid.CPUPlace()
    # Data generator: the feature-extraction pipeline for evaluation.
    data_generator = DataGenerator(vocab_filepath=args.vocab_path,
                                   mean_std_filepath=args.mean_std_path,
                                   augmentation_config='{}',
                                   specgram_type=args.specgram_type,
                                   keep_transcription_text=True,
                                   place=place,
                                   is_training=False)
    # Batched reader over the evaluation manifest.
    batch_reader = data_generator.batch_reader_creator(
        manifest_path=args.test_manifest,
        batch_size=args.batch_size,
        shuffle_method=None)
    # DeepSpeech2 model configured for inference.
    ds2_model = DeepSpeech2Model(vocab_size=data_generator.vocab_size,
                                 num_conv_layers=args.num_conv_layers,
                                 num_rnn_layers=args.num_rnn_layers,
                                 rnn_layer_size=args.rnn_layer_size,
                                 use_gru=args.use_gru,
                                 share_rnn_weights=args.share_rnn_weights,
                                 place=place,
                                 init_from_pretrained_model=args.model_path,
                                 is_infer=True)
    # Count the test-set size (one manifest line per utterance) for progress
    # reporting.
    with open(args.test_manifest, 'r', encoding='utf-8') as f_m:
        test_len = len(f_m.readlines())
    # Beam search needs the external language-model scorer.
    if args.decoding_method == "ctc_beam_search":
        ds2_model.init_ext_scorer(args.alpha, args.beta, args.lang_model_path,
                                  data_generator.vocab_list)
    # Error metric: char_errors for CER, word_errors for WER.
    errors_func = char_errors if args.error_rate_type == 'cer' else word_errors
    errors_sum, len_refs, num_ins = 0.0, 0, 0
    ds2_model.logger.info("开始评估 ...")
    start = time.time()
    # Evaluate batch by batch.
    for infer_data in batch_reader():
        # Per-frame probabilities for the whole batch.
        probs_split = ds2_model.infer_batch_probs(infer_data=infer_data)
        # Decode.
        if args.decoding_method == "ctc_greedy":
            # Best-path (greedy) decoding.
            result_transcripts = ds2_model.decode_batch_greedy(
                probs_split=probs_split,
                vocab_list=data_generator.vocab_list)
        else:
            # Beam-search decoding.
            result_transcripts = ds2_model.decode_batch_beam_search(
                probs_split=probs_split,
                beam_alpha=args.alpha,
                beam_beta=args.beta,
                beam_size=args.beam_size,
                cutoff_prob=args.cutoff_prob,
                cutoff_top_n=args.cutoff_top_n,
                vocab_list=data_generator.vocab_list,
                num_processes=args.num_proc_bsearch)
        target_transcripts = infer_data[1]
        # Accumulate the error-rate statistics.
        for target, result in zip(target_transcripts, result_transcripts):
            errors, len_ref = errors_func(target, result)
            errors_sum += errors
            len_refs += len_ref
            num_ins += 1
        print("错误率:[%s] (%d/%d) = %f" %
              (args.error_rate_type, num_ins, test_len,
               errors_sum / len_refs))
    end = time.time()
    print("消耗时间:%ds, 总错误率:[%s] (%d/%d) = %f" %
          ((end - start), args.error_rate_type, num_ins, num_ins,
           errors_sum / len_refs))
    ds2_model.logger.info("完成评估!")
def infer():
    """Inference for DeepSpeech2."""
    # Fail fast if the paddle install cannot honour the requested device.
    check_cuda(args.use_gpu)
    check_version()
    place = fluid.CUDAPlace(0) if args.use_gpu else fluid.CPUPlace()

    data_generator = DataGenerator(
        vocab_filepath=args.vocab_path,
        mean_std_filepath=args.mean_std_path,
        augmentation_config='{}',
        specgram_type=args.specgram_type,
        keep_transcription_text=True,
        place=place,
        is_training=False)
    batch_reader = data_generator.batch_reader_creator(
        manifest_path=args.infer_manifest,
        batch_size=args.num_samples,
        sortagrad=False,
        shuffle_method=None)
    # Only the first batch (args.num_samples utterances) is decoded.
    infer_data = next(batch_reader())

    ds2_model = DeepSpeech2Model(
        vocab_size=data_generator.vocab_size,
        num_conv_layers=args.num_conv_layers,
        num_rnn_layers=args.num_rnn_layers,
        rnn_layer_size=args.rnn_layer_size,
        use_gru=args.use_gru,
        share_rnn_weights=args.share_rnn_weights,
        place=place,
        init_from_pretrained_model=args.model_path)
    # decoders only accept string encoded in utf-8
    vocab_list = [chars.encode("utf-8") for chars in data_generator.vocab_list]

    use_greedy = args.decoding_method == "ctc_greedy"
    if not use_greedy:
        # Beam search needs the external LM scorer set up first.
        ds2_model.init_ext_scorer(args.alpha, args.beta, args.lang_model_path,
                                  vocab_list)
    ds2_model.logger.info("start inference ...")
    probs_split = ds2_model.infer_batch_probs(
        infer_data=infer_data,
        feeding_dict=data_generator.feeding)
    if use_greedy:
        result_transcripts = ds2_model.decode_batch_greedy(
            probs_split=probs_split,
            vocab_list=vocab_list)
    else:
        result_transcripts = ds2_model.decode_batch_beam_search(
            probs_split=probs_split,
            beam_alpha=args.alpha,
            beam_beta=args.beta,
            beam_size=args.beam_size,
            cutoff_prob=args.cutoff_prob,
            cutoff_top_n=args.cutoff_top_n,
            vocab_list=vocab_list,
            num_processes=args.num_proc_bsearch)

    error_rate_func = cer if args.error_rate_type == 'cer' else wer
    target_transcripts = infer_data[1]
    for target, result in zip(target_transcripts, result_transcripts):
        print("\nTarget Transcription: %s\nOutput Transcription: %s" %
              (target, result))
        print("Current error rate [%s] = %f" %
              (args.error_rate_type, error_rate_func(target, result)))
    ds2_model.logger.info("finish inference")
def start_server(args):
    """Transcribe one recorded utterance and force-align it with aeneas.

    Decodes ``audio_recording_<user>.wav`` with DeepSpeech2, writes the
    transcript one word per line, runs aeneas forced alignment via a
    subprocess, and saves word-level timings to ``format_data_<user>.json``.
    """
    print(args)
    # Allow the CLI flag to override the module-level decoding method.
    if args.decoder in ["ctc_greedy", "ctc_beam_search"]:
        global decoding_method
        decoding_method = args.decoder
    """Start the ASR server"""
    # prepare data generator
    # NOTE(review): use_gpu, vocab_path, mean_std_path, specgram_type,
    # model_path, alpha, beta, etc. are module-level globals defined outside
    # this function -- confirm against the file header.
    if use_gpu:
        place = fluid.CUDAPlace(0)
    else:
        place = fluid.CPUPlace()
    data_generator = DataGenerator(vocab_filepath=vocab_path,
                                   mean_std_filepath=mean_std_path,
                                   augmentation_config='{}',
                                   specgram_type=specgram_type,
                                   keep_transcription_text=True,
                                   place=place,
                                   is_training=False)
    # prepare ASR model
    ds2_model = DeepSpeech2Model(vocab_size=data_generator.vocab_size,
                                 num_conv_layers=num_conv_layers,
                                 num_rnn_layers=num_rnn_layers,
                                 rnn_layer_size=rnn_layer_size,
                                 use_gru=use_gru,
                                 init_from_pretrained_model=model_path,
                                 place=place,
                                 share_rnn_weights=share_rnn_weights)
    # decoders only accept string encoded in utf-8
    vocab_list = [chars.encode("utf-8") for chars in data_generator.vocab_list]
    if decoding_method == "ctc_beam_search":
        ds2_model.init_ext_scorer(alpha, beta, lang_model_path, vocab_list)

    # prepare ASR inference handler
    def file_to_transcript(filename):
        # Extract features; second arg "" means no reference transcript.
        feature = data_generator.process_utterance(filename, "")
        audio_len = feature[0].shape[1]
        # Build a zero-padding mask mirroring the conv front-end's
        # downsampling (divisor 2 on axis 0, 3 on axis 1).
        # NOTE(review): the repeat count 32 presumably matches the conv
        # output channel count -- confirm against the model definition.
        mask_shape0 = (feature[0].shape[0] - 1) // 2 + 1
        mask_shape1 = (feature[0].shape[1] - 1) // 3 + 1
        mask_max_len = (audio_len - 1) // 3 + 1
        mask_ones = np.ones((mask_shape0, mask_shape1))
        mask_zeros = np.zeros((mask_shape0, mask_max_len - mask_shape1))
        mask = np.repeat(np.reshape(
            np.concatenate((mask_ones, mask_zeros), axis=1),
            (1, mask_shape0, mask_max_len)), 32, axis=0)
        feature = (np.array([feature[0]]).astype('float32'),
                   None,
                   np.array([audio_len]).astype('int64').reshape([-1, 1]),
                   np.array([mask]).astype('float32'))
        probs_split = ds2_model.infer_batch_probs(
            infer_data=feature,
            feeding_dict=data_generator.feeding)
        tik = time.time()
        if decoding_method == "ctc_greedy":
            result_transcript = ds2_model.decode_batch_greedy(
                probs_split=probs_split,
                vocab_list=vocab_list)
        else:
            result_transcript = ds2_model.decode_batch_beam_search(
                probs_split=probs_split,
                beam_alpha=alpha,
                beam_beta=beta,
                beam_size=beam_size,
                cutoff_prob=cutoff_prob,
                cutoff_top_n=cutoff_top_n,
                vocab_list=vocab_list,
                num_processes=1)
        print(time.time() - tik)  # decode latency in seconds
        return result_transcript[0]

    # warming up with utterrances sampled from Librispeech
    print('-----------------------------------------------------------')
    print('Warming up ...')
    audio_file_name = "/home/Nishchith/audio_recording_" + args.user + ".wav"
    # audio_file_name = "/home/Nishchith/2722020-163399.wav"
    transcript = file_to_transcript(audio_file_name)
    # transcript = file_to_transcript("/home/Nishchith/audio_samples/test-pravar_2.wav")
    # One word per line: the layout aeneas expects for "plain" text input.
    _file = open("/home/Nishchith/transcript_" + args.user + ".txt", "w")
    transcript = "\n".join(transcript.split(" "))
    _file.write(transcript + "\n")
    _file.close()
    # Run aeneas forced alignment as a subprocess to get word timings.
    try:
        msg = subprocess.check_output([
            "python", "-m", "aeneas.tools.execute_task",
            audio_file_name,
            "/home/Nishchith/transcript_" + args.user + ".txt",
            #"task_language=eng|os_task_file_format=json|is_text_type=mplain",
            "task_language=eng|os_task_file_format=json|is_text_type=plain|task_adjust_boundary_nonspeech_min=0.0100|task_adjust_boundary_nonspeech_string=(sil)|task_adjust_boundary_algorithm=auto",
            "/home/Nishchith/data_" + args.user + ".json",
            "--presets-word"
        ])
    except subprocess.CalledProcessError as e:
        msg = e.output.decode("utf-8")
    print(msg)
    with open("/home/Nishchith/data_" + args.user + ".json") as f:
        data = json.load(f)
    # Example of the target output format:
    """ [ { "word":"in", "start_time ":0.0, "duration":1.06 }, { "word":"clustering", "start_time ":1.06, "duration":0.52 }] """
    # Convert aeneas fragments into {word, start_time, duration} dicts,
    # dropping the "(sil)" silence markers.
    words_list = []
    for word in data.get("fragments"):
        word_item = dict()
        if word["lines"][0] == "(sil)":
            continue
        word_item["word"] = word["lines"][0]
        word_item["start_time"] = float(word["begin"])
        word_item["duration"] = float(word["end"]) - float(word["begin"])
        words_list.append(word_item)
    with open("/home/Nishchith/format_data_" + args.user + ".json", 'w') as f:
        json.dump(words_list, f)
    print('-----------------------------------------------------------')
def infer():
    """Inference for DeepSpeech2."""
    """Start the ASR server"""
    # prepare data generator
    if args.use_gpu:
        place = fluid.CUDAPlace(0)
    else:
        place = fluid.CPUPlace()
    data_generator = DataGenerator(
        vocab_filepath=args.vocab_path,
        mean_std_filepath=args.mean_std_path,
        augmentation_config='{}',
        specgram_type=args.specgram_type,
        keep_transcription_text=True,
        place=place,
        is_training=False)
    # prepare ASR model
    ds2_model = DeepSpeech2Model(
        vocab_size=data_generator.vocab_size,
        num_conv_layers=args.num_conv_layers,
        num_rnn_layers=args.num_rnn_layers,
        rnn_layer_size=args.rnn_layer_size,
        use_gru=args.use_gru,
        init_from_pretrained_model=args.model_path,
        place=place,
        share_rnn_weights=args.share_rnn_weights)
    # decoders only accept string encoded in utf-8
    vocab_list = [chars.encode("utf-8") for chars in data_generator.vocab_list]
    if args.decoding_method == "ctc_beam_search":
        ds2_model.init_ext_scorer(args.alpha, args.beta, args.lang_model_path,
                                  vocab_list)

    # prepare ASR inference handler
    def file_to_transcript(filename):
        # Extract features; second arg "" means no reference transcript.
        feature = data_generator.process_utterance(filename, "")
        audio_len = feature[0].shape[1]
        # Zero-padding mask mirroring the conv front-end's downsampling
        # (divisor 2 on axis 0, 3 on axis 1).
        # NOTE(review): repeat count 32 presumably equals the conv output
        # channel count -- confirm against the model definition.
        mask_shape0 = (feature[0].shape[0] - 1) // 2 + 1
        mask_shape1 = (feature[0].shape[1] - 1) // 3 + 1
        mask_max_len = (audio_len - 1) // 3 + 1
        mask_ones = np.ones((mask_shape0, mask_shape1))
        mask_zeros = np.zeros((mask_shape0, mask_max_len - mask_shape1))
        mask = np.repeat(
            np.reshape(
                np.concatenate((mask_ones, mask_zeros), axis=1),
                (1, mask_shape0, mask_max_len)),
            32,
            axis=0)
        feature = (np.array([feature[0]]).astype('float32'),
                   None,
                   np.array([audio_len]).astype('int64').reshape([-1, 1]),
                   np.array([mask]).astype('float32'))
        probs_split = ds2_model.infer_batch_probs(
            infer_data=feature,
            feeding_dict=data_generator.feeding)
        if args.decoding_method == "ctc_greedy":
            result_transcript = ds2_model.decode_batch_greedy(
                probs_split=probs_split,
                vocab_list=vocab_list)
        else:
            result_transcript = ds2_model.decode_batch_beam_search(
                probs_split=probs_split,
                beam_alpha=args.alpha,
                beam_beta=args.beta,
                beam_size=args.beam_size,
                cutoff_prob=args.cutoff_prob,
                cutoff_top_n=args.cutoff_top_n,
                vocab_list=vocab_list,
                num_processes=1)
        return result_transcript[0]

    # Decode every utterance listed in the manifest, in order.
    fdata = open(args.infer_manifest)
    data = fdata.readlines()
    fdata.close()
    result_transcripts = []
    i = 0
    for audio_data in data:
        i += 1
        filename = json.loads(audio_data)["audio_filepath"]
        transcription = file_to_transcript(filename)
        print("DeepSpeech2 Translation - %d: %s" % (i, str(transcription)))
        result_transcripts.append(transcription)
    # Write all transcripts to a timestamped results file.
    paddledeepspeech_translation = open(
        "output/paddledeepspeech_translation_" + str(datetime.now()) + ".txt",
        "w+")
    i = 0
    for result in result_transcripts:
        i += 1
        # print("%d - %s" % (i, result))
        paddledeepspeech_translation.write("%s, %d, %s\n" %
                                           ("tts_google", i, result))
    paddledeepspeech_translation.close()
def start_server():
    """Start the ASR server"""
    # prepare data generator
    place = fluid.CUDAPlace(0) if args.use_gpu else fluid.CPUPlace()
    data_generator = DataGenerator(
        vocab_filepath=args.vocab_path,
        mean_std_filepath=args.mean_std_path,
        augmentation_config='{}',
        specgram_type=args.specgram_type,
        keep_transcription_text=True,
        place=place,
        is_training=False)
    # prepare ASR model
    ds2_model = DeepSpeech2Model(
        vocab_size=data_generator.vocab_size,
        num_conv_layers=args.num_conv_layers,
        num_rnn_layers=args.num_rnn_layers,
        rnn_layer_size=args.rnn_layer_size,
        use_gru=args.use_gru,
        init_from_pretrained_model=args.model_path,
        place=place,
        share_rnn_weights=args.share_rnn_weights)
    vocab_list = list(data_generator.vocab_list)
    if args.decoding_method == "ctc_beam_search":
        ds2_model.init_ext_scorer(args.alpha, args.beta, args.lang_model_path,
                                  vocab_list)

    # prepare ASR inference handler
    def file_to_transcript(filename):
        """Run the model on a single audio file and return its transcript."""
        feature = data_generator.process_utterance(filename, "")
        audio_len = feature[0].shape[1]
        # Padding-mask dimensions after the conv front-end's downsampling.
        rows = (feature[0].shape[0] - 1) // 2 + 1
        cols = (feature[0].shape[1] - 1) // 3 + 1
        padded_cols = (audio_len - 1) // 3 + 1
        # Ones over the valid region, zeros over the padding, replicated
        # 32 times along a new leading axis.
        valid = np.ones((rows, cols))
        padding = np.zeros((rows, padded_cols - cols))
        mask = np.concatenate((valid, padding), axis=1)
        mask = np.reshape(mask, (1, rows, padded_cols))
        mask = np.repeat(mask, 32, axis=0)
        feature = (np.array([feature[0]]).astype('float32'),
                   None,
                   np.array([audio_len]).astype('int64').reshape([-1, 1]),
                   np.array([mask]).astype('float32'))
        probs_split = ds2_model.infer_batch_probs(
            infer_data=feature,
            feeding_dict=data_generator.feeding)
        if args.decoding_method == "ctc_greedy":
            transcript = ds2_model.decode_batch_greedy(
                probs_split=probs_split,
                vocab_list=vocab_list)
        else:
            transcript = ds2_model.decode_batch_beam_search(
                probs_split=probs_split,
                beam_alpha=args.alpha,
                beam_beta=args.beta,
                beam_size=args.beam_size,
                cutoff_prob=args.cutoff_prob,
                cutoff_top_n=args.cutoff_top_n,
                vocab_list=vocab_list,
                num_processes=1)
        return transcript[0]

    # warming up with utterrances sampled from Librispeech
    print('-----------------------------------------------------------')
    print('Warming up ...')
    warm_up_test(
        audio_process_handler=file_to_transcript,
        manifest_path=args.warmup_manifest,
        num_test_cases=3)
    print('-----------------------------------------------------------')

    # start the server
    server = AsrTCPServer(
        server_address=(args.host_ip, args.host_port),
        RequestHandlerClass=AsrRequestHandler,
        speech_save_dir=args.speech_save_dir,
        audio_process_handler=file_to_transcript)
    print("ASR Server Started.")
    server.serve_forever()
specgram_type=specgram_type, num_threads=num_proc_data, keep_transcription_text=True) batch_reader = data_generator.batch_reader_creator(manifest_path=test_manifest, batch_size=batch_size, min_batch_size=1, sortagrad=False, shuffle_method=None) print('Loading inference model from files {}'.format(model_path)) log_file.write('Loading inference model from files {}'.format(model_path)) inf_model_load_start = timer() ds2_model = DeepSpeech2Model(vocab_size=data_generator.vocab_size, num_conv_layers=num_conv_layers, num_rnn_layers=num_rnn_layers, rnn_layer_size=rnn_layer_size, use_gru=USING_GRU, pretrained_model_path=model_path, share_rnn_weights=share_rnn_weights) inf_model_load_end = timer() - inf_model_load_start print('Loaded inference model in {:.3}s.'.format(inf_model_load_end)) log_file.write('Loaded inference model in {:.3}s.'.format(inf_model_load_end)) summ_file.write( 'Loaded inference model in,{:.3}s \n'.format(inf_model_load_end)) # decoders only accept string encoded in utf-8 vocab_list = [chars.encode("utf-8") for chars in data_generator.vocab_list] if decoding_method == "ctc_beam_search": print('Loading language model (scorer) from files {}'.format( lang_model_path),