Beispiel #1
0
    def batch_ctc_prefix_beam_search_cpu(self, batch_log_probs_seq,
                                         batch_log_probs_idx,
                                         batch_len, batch_root,
                                         batch_start, beam_size,
                                         blank_id, space_id,
                                         cutoff_prob, num_processes,
                                         scorer):
        """Trim each utterance to its length and run the batched decoder.

        For every position ``i`` in ``batch_len``, the first
        ``int(batch_len[i])`` rows of ``batch_log_probs_seq[i]`` and
        ``batch_log_probs_idx[i]`` are converted with ``.tolist()``. The
        resulting lists are forwarded, with the remaining arguments, to
        ``ctc_beam_search_decoder_batch``.

        Return: the value returned by ``ctc_beam_search_decoder_batch``,
                originally documented as Batch x Beam_size elements, each
                element being a tuple (score, list of ids).
        """
        trimmed_seq = []
        trimmed_idx = []
        for utt, raw_len in enumerate(batch_len):
            n_frames = int(raw_len)
            # Drop the padded tail before handing the data to the decoder.
            trimmed_seq.append(batch_log_probs_seq[utt][:n_frames].tolist())
            trimmed_idx.append(batch_log_probs_idx[utt][:n_frames].tolist())

        return ctc_beam_search_decoder_batch(
            trimmed_seq,
            trimmed_idx,
            batch_root,
            batch_start,
            beam_size,
            num_processes,
            blank_id,
            space_id,
            cutoff_prob,
            scorer)
def ctc_beam_search_decoder_batch(probs_split,
                                  vocabulary,
                                  beam_size,
                                  num_processes,
                                  cutoff_prob=1.0,
                                  cutoff_top_n=40,
                                  ext_scoring_func=None):
    """Wrapper for the batched CTC beam search decoder.

    Every element of ``probs_split`` is converted with ``.tolist()`` and the
    batch is handed to ``swig_decoders.ctc_beam_search_decoder_batch``. Only
    the first two fields of each returned result are kept.

    :param probs_split: Iterable of per-utterance probability matrices; each
                        element must provide ``.tolist()``.
    :type probs_split: iterable
    :param vocabulary: Vocabulary list.
    :type vocabulary: list
    :param beam_size: Width for beam search.
    :type beam_size: int
    :param num_processes: Number of parallel processes.
    :type num_processes: int
    :param cutoff_prob: Cutoff probability in vocabulary pruning,
                        default 1.0, no pruning.
    :type cutoff_prob: float
    :param cutoff_top_n: Cutoff number in pruning, only top cutoff_top_n
                         characters with highest probs in vocabulary will be
                         used in beam search, default 40.
    :type cutoff_top_n: int
    :param ext_scoring_func: External scoring function for
                             partially decoded sentence, e.g. word count
                             or language model.
    :type ext_scoring_func: callable
    :return: One list per utterance of ``(res[0], res[1])`` tuples, in the
             order produced by the underlying decoder (originally documented
             as log probability and sentence, in descending order of the
             probability).
    :rtype: list
    """
    prob_lists = []
    for utt_probs in probs_split:
        prob_lists.append(utt_probs.tolist())

    raw_results = swig_decoders.ctc_beam_search_decoder_batch(
        prob_lists, vocabulary, beam_size, num_processes, cutoff_prob,
        cutoff_top_n, ext_scoring_func)

    decoded = []
    for utt_results in raw_results:
        pairs = []
        for entry in utt_results:
            # Keep only the first two fields of each decoder result.
            pairs.append((entry[0], entry[1]))
        decoded.append(pairs)
    return decoded
Beispiel #3
0
def ctc_beam_search_decoder_batch(probs_split,
                                  vocabulary,
                                  beam_size,
                                  num_processes,
                                  cutoff_prob=1.0,
                                  cutoff_top_n=40,
                                  blank_id=0,
                                  ext_scoring_func=None):
    """Wrapper for the batched CTC beam search decoder.

    :param probs_split: Iterable of per-utterance probability matrices, each
                        used as the 2-D probability list of
                        ctc_beam_search_decoder(); every element must
                        provide ``.tolist()``.
    :type probs_split: iterable
    :param vocabulary: Vocabulary list.
    :type vocabulary: list
    :param beam_size: Width for beam search.
    :type beam_size: int
    :param num_processes: Number of parallel decoding processes.
    :type num_processes: int
    :param cutoff_prob: Cutoff probability in pruning, default 1.0,
                        no pruning.
    :type cutoff_prob: float
    :param cutoff_top_n: Cutoff number in pruning, only the cutoff_top_n
                         characters with the highest probs in the vocabulary
                         are used in beam search, default 40.
    :type cutoff_top_n: int
    :param blank_id: Index of the blank symbol.
    :type blank_id: int
    :param ext_scoring_func: External scoring function for partially
                             decoded sentences, e.g. word count or
                             language model.
    :type ext_scoring_func: callable
    :return: One list per utterance of ``(res[0], res[1])`` tuples
             (originally documented as log probability and sentence, in
             descending order of the probability).
    :rtype: list
    """

    def _score_and_sentence(result):
        # Keep only the first two fields of a decoder result.
        return (result[0], result[1])

    prob_lists = list(map(lambda frames: frames.tolist(), probs_split))

    raw_results = swig_decoders.ctc_beam_search_decoder_batch(
        prob_lists, vocabulary, beam_size, num_processes, cutoff_prob,
        cutoff_top_n, blank_id, ext_scoring_func)

    return [list(map(_score_and_sentence, utt_results))
            for utt_results in raw_results]
Beispiel #4
0
    def execute(self, requests):
        """`execute` must be implemented in every Python model. `execute`
        function receives a list of pb_utils.InferenceRequest as the only
        argument. This function is called when an inference is requested
        for this model.

        This implementation gathers the "encoder_out", "encoder_out_lens",
        "batch_log_probs" and "batch_log_probs_idx" inputs of all requests,
        runs `ctc_beam_search_decoder_batch` over every utterance, pads each
        candidate list to `self.beam_size`, sends the padded hypotheses to
        the 'decoder' model and uses its 'best_index' output to pick one
        hypothesis per utterance. Each request gets one response holding an
        "OUTPUT0" tensor.

        Parameters
        ----------
        requests : list
          A list of pb_utils.InferenceRequest

        Returns
        -------
        list
          A list of pb_utils.InferenceResponse. The length of this list must
          be the same as `requests`

        Raises
        ------
        pb_utils.TritonModelException
          If the nested 'decoder' inference request reports an error.
        """

        responses = []

        # Every Python backend must iterate through list of requests and create
        # an instance of pb_utils.InferenceResponse class for each of them. You
        # should avoid storing any of the input Tensors in the class attributes
        # as they will be overridden in subsequent inference requests. You can
        # make a copy of the underlying NumPy array and store it if it is
        # required.

        batch_encoder_out, batch_encoder_lens = [], []
        batch_log_probs, batch_log_probs_idx = [], []
        batch_count = []
        batch_root = TrieVector()
        batch_start = []
        # NOTE(review): presumably keeps Python references to the PathTrie
        # objects alive while batch_root is in use -- confirm.
        root_dict = {}

        encoder_max_len = 0
        total = 0
        for request in requests:
            in_0 = pb_utils.get_input_tensor_by_name(request, "encoder_out")
            in_1 = pb_utils.get_input_tensor_by_name(request,
                                                     "encoder_out_lens")
            in_2 = pb_utils.get_input_tensor_by_name(request,
                                                     "batch_log_probs")
            in_3 = pb_utils.get_input_tensor_by_name(request,
                                                     "batch_log_probs_idx")

            batch_encoder_out.append(in_0.as_numpy())
            encoder_max_len = max(encoder_max_len,
                                  batch_encoder_out[-1].shape[1])

            cur_b_lens = in_1.as_numpy()
            batch_encoder_lens.append(cur_b_lens)
            cur_batch = cur_b_lens.shape[0]
            batch_count.append(cur_batch)

            cur_b_log_probs = in_2.as_numpy()
            cur_b_log_probs_idx = in_3.as_numpy()
            for i in range(cur_batch):
                # Keep only the valid frames of each utterance.
                cur_len = cur_b_lens[i]
                cur_probs = cur_b_log_probs[i][
                    0:cur_len, :].tolist()  # T X Beam
                cur_idx = cur_b_log_probs_idx[i][
                    0:cur_len, :].tolist()  # T x Beam
                batch_log_probs.append(cur_probs)
                batch_log_probs_idx.append(cur_idx)
                root_dict[total] = PathTrie()
                batch_root.append(root_dict[total])
                batch_start.append(True)
                total += 1

        score_hyps = ctc_beam_search_decoder_batch(
            batch_log_probs,
            batch_log_probs_idx,
            batch_root,
            batch_start,
            self.beam_size,
            min(total, self.num_processes),
            blank_id=self.blank_id,
            space_id=-2,
            cutoff_prob=self.cutoff_prob,
            ext_scorer=self.lm)

        # Flatten the candidates into utterance-major, beam-minor order.
        all_hyps = []
        all_ctc_score = []
        max_seq_len = 0
        for seq_cand in score_hyps:
            # if candidates less than beam size
            if len(seq_cand) != self.beam_size:
                seq_cand = list(seq_cand)
                seq_cand += (self.beam_size - len(seq_cand)) * [(-float("INF"),
                                                                 (0, ))]

            for score, hyps in seq_cand:
                all_hyps.append(list(hyps))
                all_ctc_score.append(score)
                max_seq_len = max(len(hyps), max_seq_len)

        beam_size = self.beam_size
        feature_size = self.feature_size
        hyps_max_len = max_seq_len + 2  # room for sos and eos
        in_ctc_score = np.zeros((total, beam_size), dtype=self.data_type)
        in_hyps_pad_sos_eos = np.ones(
            (total, beam_size, hyps_max_len), dtype=np.int64) * self.eos
        if self.bidecoder:
            in_r_hyps_pad_sos_eos = np.ones(
                (total, beam_size, hyps_max_len), dtype=np.int64) * self.eos

        in_hyps_lens_sos = np.ones((total, beam_size), dtype=np.int32)

        in_encoder_out = np.zeros((total, encoder_max_len, feature_size),
                                  dtype=self.data_type)
        in_encoder_out_lens = np.zeros(total, dtype=np.int32)

        # Walk the flattened candidate lists with one running index instead
        # of list.pop(0), which shifts the whole list on every call.
        k = 0
        st = 0
        for b, t, lens in zip(batch_count, batch_encoder_out,
                              batch_encoder_lens):
            in_encoder_out[st:st + b, 0:t.shape[1]] = t
            in_encoder_out_lens[st:st + b] = lens
            for i in range(b):
                for j in range(beam_size):
                    cur_hyp = all_hyps[k]
                    cur_len = len(cur_hyp) + 2
                    in_hyp = [self.sos] + cur_hyp + [self.eos]
                    in_hyps_pad_sos_eos[st + i][j][0:cur_len] = in_hyp
                    # Stored length counts sos but not eos.
                    in_hyps_lens_sos[st + i][j] = cur_len - 1
                    if self.bidecoder:
                        r_in_hyp = [self.sos] + cur_hyp[::-1] + [self.eos]
                        in_r_hyps_pad_sos_eos[st + i][j][0:cur_len] = r_in_hyp
                    in_ctc_score[st + i][j] = all_ctc_score[k]
                    k += 1
            st += b

        in_encoder_out_lens = np.expand_dims(in_encoder_out_lens, axis=1)
        in_tensor_0 = pb_utils.Tensor("encoder_out", in_encoder_out)
        in_tensor_1 = pb_utils.Tensor("encoder_out_lens", in_encoder_out_lens)
        in_tensor_2 = pb_utils.Tensor("hyps_pad_sos_eos", in_hyps_pad_sos_eos)
        in_tensor_3 = pb_utils.Tensor("hyps_lens_sos", in_hyps_lens_sos)
        input_tensors = [in_tensor_0, in_tensor_1, in_tensor_2, in_tensor_3]
        if self.bidecoder:
            in_tensor_4 = pb_utils.Tensor("r_hyps_pad_sos_eos",
                                          in_r_hyps_pad_sos_eos)
            input_tensors.append(in_tensor_4)
        in_tensor_5 = pb_utils.Tensor("ctc_score", in_ctc_score)
        input_tensors.append(in_tensor_5)

        inference_request = pb_utils.InferenceRequest(
            model_name='decoder',
            requested_output_names=['best_index'],
            inputs=input_tensors)

        inference_response = inference_request.exec()
        if inference_response.has_error():
            raise pb_utils.TritonModelException(
                inference_response.error().message())

        # Extract the output tensors from the inference response.
        best_index = pb_utils.get_output_tensor_by_name(
            inference_response, 'best_index')
        best_index = best_index.as_numpy()
        hyps = []
        for idx, (cands, cand_lens) in enumerate(
                zip(in_hyps_pad_sos_eos, in_hyps_lens_sos)):
            best_idx = best_index[idx][0]
            best_cand_len = cand_lens[best_idx] - 1  # remove sos
            best_cand = cands[best_idx][1:1 + best_cand_len].tolist()
            hyps.append(best_cand)

        hyps = map_batch(
            hyps, self.vocabulary,
            min(multiprocessing.cpu_count(), len(in_ctc_score)))

        # Split the flat result list back into one response per request.
        st = 0
        for b in batch_count:
            sents = np.array(hyps[st:st + b])
            out0 = pb_utils.Tensor("OUTPUT0",
                                   sents.astype(self.out0_dtype))
            inference_response = pb_utils.InferenceResponse(
                output_tensors=[out0])
            responses.append(inference_response)
            st += b
        return responses
Beispiel #5
0
def main():
    """Decode a test set with ONNX Runtime sessions and write the results.

    Reads the command-line arguments from ``get_args()``, builds the test
    dataset from the YAML config, runs the encoder ONNX session on every
    batch and decodes according to ``args.mode``:

    * ``ctc_greedy_search``: takes the first entry along the last axis of
      the encoder's ``beam_log_probs_idx`` output for every frame.
    * ``ctc_prefix_beam_search``: takes the first candidate returned by
      ``ctc_beam_search_decoder_batch`` for every utterance.
    * ``attention_rescoring``: rescores all beam candidates with the decoder
      ONNX session and keeps the best-scoring one.

    One ``"<key> <hypothesis>"`` line per utterance is logged and written to
    ``args.result_file``.
    """
    args = get_args()
    logging.basicConfig(level=logging.DEBUG,
                        format='%(asctime)s %(levelname)s %(message)s')
    os.environ['CUDA_VISIBLE_DEVICES'] = str(args.gpu)

    with open(args.config, 'r') as fin:
        # NOTE(review): FullLoader is not meant for untrusted input; only
        # pass trusted config files (consider yaml.safe_load if the configs
        # are plain data).
        configs = yaml.load(fin, Loader=yaml.FullLoader)
    if len(args.override_config) > 0:
        configs = override_config(configs, args.override_config)

    symbol_table = read_symbol_table(args.dict)
    test_conf = copy.deepcopy(configs['dataset_conf'])

    # Widen the filters and disable augmentation, shuffling and sorting for
    # the test configuration.
    test_conf['filter_conf']['max_length'] = 102400
    test_conf['filter_conf']['min_length'] = 0
    test_conf['filter_conf']['token_max_length'] = 102400
    test_conf['filter_conf']['token_min_length'] = 0
    test_conf['filter_conf']['max_output_input_ratio'] = 102400
    test_conf['filter_conf']['min_output_input_ratio'] = 0
    test_conf['speed_perturb'] = False
    test_conf['spec_aug'] = False
    test_conf['shuffle'] = False
    test_conf['sort'] = False
    test_conf['fbank_conf']['dither'] = 0.0
    test_conf['batch_conf']['batch_type'] = "static"
    test_conf['batch_conf']['batch_size'] = args.batch_size

    test_dataset = Dataset(args.data_type,
                           args.test_data,
                           symbol_table,
                           test_conf,
                           args.bpe_model,
                           partition=False)

    test_data_loader = DataLoader(test_dataset, batch_size=None, num_workers=0)

    # Init asr model from configs
    use_cuda = args.gpu >= 0 and torch.cuda.is_available()
    if use_cuda:
        EP_list = ['CUDAExecutionProvider', 'CPUExecutionProvider']
    else:
        EP_list = ['CPUExecutionProvider']

    encoder_ort_session = rt.InferenceSession(args.encoder_onnx,
                                              providers=EP_list)
    decoder_ort_session = None
    if args.mode == "attention_rescoring":
        decoder_ort_session = rt.InferenceSession(args.decoder_onnx,
                                                  providers=EP_list)

    # Load dict
    vocabulary = []
    char_dict = {}
    with open(args.dict, 'r') as fin:
        for line in fin:
            arr = line.strip().split()
            assert len(arr) == 2
            char_dict[int(arr[1])] = arr[0]
            vocabulary.append(arr[0])
    # sos and eos share the last id of the dictionary.
    eos = sos = len(char_dict) - 1
    with torch.no_grad(), open(args.result_file, 'w') as fout:
        for batch in test_data_loader:
            keys, feats, _, feats_lengths, _ = batch
            ort_inputs = {
                encoder_ort_session.get_inputs()[0].name: feats.numpy(),
                encoder_ort_session.get_inputs()[1].name:
                feats_lengths.numpy()
            }
            ort_outs = encoder_ort_session.run(None, ort_inputs)
            encoder_out, encoder_out_lens, ctc_log_probs, \
                beam_log_probs, beam_log_probs_idx = ort_outs
            beam_size = beam_log_probs.shape[-1]
            batch_size = beam_log_probs.shape[0]
            num_processes = min(multiprocessing.cpu_count(), batch_size)
            if args.mode == 'ctc_greedy_search':
                # Take the first entry of the beam axis for every frame.
                # Slicing unconditionally also covers beam_size == 1, where
                # log_probs_idx was previously left unassigned.
                log_probs_idx = beam_log_probs_idx[:, :, 0]
                batch_sents = []
                for idx, seq in enumerate(log_probs_idx):
                    batch_sents.append(seq[0:encoder_out_lens[idx]].tolist())
                hyps = map_batch(batch_sents, vocabulary, num_processes, True,
                                 0)
            elif args.mode in ('ctc_prefix_beam_search',
                               "attention_rescoring"):
                batch_log_probs_seq_list = beam_log_probs.tolist()
                batch_log_probs_idx_list = beam_log_probs_idx.tolist()
                batch_len_list = encoder_out_lens.tolist()
                batch_log_probs_seq = []
                batch_log_probs_ids = []
                batch_start = []  # only effective in streaming deployment
                batch_root = TrieVector()
                # NOTE(review): presumably keeps Python references to the
                # PathTrie objects alive while batch_root is used -- confirm.
                root_dict = {}
                for i, num_sent in enumerate(batch_len_list):
                    # Keep only the valid frames of each utterance.
                    batch_log_probs_seq.append(
                        batch_log_probs_seq_list[i][0:num_sent])
                    batch_log_probs_ids.append(
                        batch_log_probs_idx_list[i][0:num_sent])
                    root_dict[i] = PathTrie()
                    batch_root.append(root_dict[i])
                    batch_start.append(True)
                score_hyps = ctc_beam_search_decoder_batch(
                    batch_log_probs_seq, batch_log_probs_ids, batch_root,
                    batch_start, beam_size, num_processes, 0, -2, 0.99999)
                if args.mode == 'ctc_prefix_beam_search':
                    hyps = []
                    for cand_hyps in score_hyps:
                        hyps.append(cand_hyps[0][1])
                    hyps = map_batch(hyps, vocabulary, num_processes, False, 0)
            if args.mode == 'attention_rescoring':
                ctc_score, all_hyps = [], []
                max_len = 0
                for cands in score_hyps:
                    # Copy to a list so padding also works when the decoder
                    # returns a tuple of candidates.
                    cands = list(cands)
                    cur_len = len(cands)
                    if cur_len < beam_size:
                        cands += (beam_size - cur_len) * [(-float("INF"),
                                                           (0, ))]
                    for hyp in cands:
                        ctc_score.append(hyp[0])
                        all_hyps.append(list(hyp[1]))
                        if len(hyp[1]) + 1 > max_len:
                            max_len = len(hyp[1]) + 1
                assert len(ctc_score) == beam_size * batch_size
                hyps_pad_sos = np.ones((batch_size, beam_size, max_len),
                                       dtype=np.int64) * IGNORE_ID
                r_hyps_pad_sos = np.ones((batch_size, beam_size, max_len),
                                         dtype=np.int64) * IGNORE_ID
                hyps_lens_sos = np.ones((batch_size, beam_size),
                                        dtype=np.int32)
                k = 0
                for i in range(batch_size):
                    for j in range(beam_size):
                        cand = all_hyps[k]
                        hyps_pad_sos[i][j][0:len(cand) + 1] = [sos] + cand
                        r_hyps_pad_sos[i][j][0:len(cand) +
                                             1] = [sos] + cand[::-1]
                        hyps_lens_sos[i][j] = len(cand) + 1
                        k += 1
                decoder_ort_inputs = {
                    decoder_ort_session.get_inputs()[0].name: encoder_out,
                    decoder_ort_session.get_inputs()[1].name: encoder_out_lens,
                    decoder_ort_session.get_inputs()[2].name: hyps_pad_sos,
                    decoder_ort_session.get_inputs()[3].name: hyps_lens_sos,
                    decoder_ort_session.get_inputs()[4].name: r_hyps_pad_sos
                }
                decoder_out, r_decoder_out = decoder_ort_session.run(
                    None, decoder_ort_inputs)
                best_sents = []
                k = 0
                for d_o, r_d_o in zip(decoder_out, r_decoder_out):
                    # d_0 & r_d_o: beam x T x V
                    cur_best_sent = []
                    cur_best_score = -float("inf")
                    for sent_d_o, sent_r_d_o in zip(d_o, r_d_o):
                        cand = all_hyps[k] + [eos]
                        r_cand = all_hyps[k][::-1] + [eos]
                        score, r_score = 0, 0
                        # Sum the decoder outputs along the candidate tokens
                        # in both directions.
                        for t, (index, r_index) in enumerate(
                                zip(cand, r_cand)):
                            score += sent_d_o[t][index]
                            r_score += sent_r_d_o[t][r_index]
                        if args.reverse_weight > 0:
                            score = score * (1 - args.reverse_weight) + \
                                args.reverse_weight * r_score
                        score = score + args.ctc_weight * ctc_score[k]
                        if score > cur_best_score:
                            cur_best_sent = all_hyps[k]
                            cur_best_score = score
                        k += 1
                    best_sents.append(cur_best_sent)
                hyps = map_batch(best_sents, vocabulary, num_processes)

            for i, key in enumerate(keys):
                content = hyps[i]
                logging.info('{} {}'.format(key, content))
                fout.write('{} {}\n'.format(key, content))