Example #1
def get_feature_from_data(tokenizer,
                          maxlen,
                          input,
                          target=None,
                          ntarget=None,
                          reserved_len=0,
                          handle_exceed='start_slice',
                          add_end_tok=True,
                          **kwargs):
    feature_dict_list = []
    tokenized_target = tokenizer.tokenize(target) if target is not None else []
    t_input_list, _ = tok.handle_exceed(tokenizer, input,
                                        maxlen - 3 - len(tokenized_target),
                                        handle_exceed)
    for t_input in t_input_list:  # -3 for cls, sep and the prediction end sep
        row_dict = dict()
        tokenized_input = [
            tok.tok_begin(tokenizer)
        ] + t_input[:maxlen - reserved_len - 3] + [tok.tok_sep(tokenizer)]

        row_dict['target'] = [-1] * maxlen
        row_dict['target_once'] = [-1] * maxlen
        tokenized_input_id = tokenizer.convert_tokens_to_ids(tokenized_input)
        target_start = len(tokenized_input_id)
        target_end = maxlen
        target_length = target_end - target_start

        if target is not None:
            if add_end_tok:
                tokenized_target += [tok.tok_sep(tokenizer)]
            tokenized_target_id = []
            tokenized_target_once_id = [-1] * len(tokenized_input)
            target_ids = tokenizer.convert_tokens_to_ids(tokenized_target)
            target_length = len(target_ids)
            tokenized_target_id.extend(target_ids)
            tokenized_target_once_id.extend(target_ids)
            target_end = len(tokenized_target_id) - 1
            tokenized_target_id.extend([-1] *
                                       (maxlen - len(tokenized_target_id)))
            tokenized_target_once_id.extend(
                [-1] * (maxlen - len(tokenized_target_once_id)))
            row_dict['target'] = tokenized_target_id
            row_dict['target_once'] = tokenized_target_once_id

        input_length = min(maxlen, target_start * 3)
        tokenized_input_id.extend([tokenizer.mask_token_id] *
                                  (maxlen - len(tokenized_input_id)))
        mask_id = [1] * input_length
        mask_id.extend([0] * (maxlen - len(mask_id)))
        row_dict['input'] = tokenized_input_id
        row_dict['mask'] = mask_id
        row_dict['start'] = target_start
        row_dict['end'] = target_end
        row_dict['input_length'] = input_length
        row_dict['target_length'] = target_length
        feature_dict_list.append(row_dict)

    return feature_dict_list
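
A minimal usage sketch, assuming the function above and the `tok` helper module it calls are importable in the same scope, and using a standard Hugging Face tokenizer for illustration:

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
features = get_feature_from_data(tokenizer,
                                 maxlen=64,
                                 input="a short source sentence",
                                 target="a short target sentence")
for row in features:
    # 'input', 'mask', 'target' and 'target_once' are padded to maxlen;
    # 'start'/'end' and the *_length fields describe the target span.
    print(row['start'], row['end'], row['input_length'], row['target_length'])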
Example #2
def get_feature_from_data(tokenizer,
                          maxlen,
                          tasks,
                          task,
                          input,
                          target=None,
                          handle_exceed='slide',
                          **kwargs):
    feature_dict_list = []
    t_input_list, _ = tok.handle_exceed(tokenizer, input, maxlen - 2,
                                        handle_exceed)
    for t_input in t_input_list:  # -2 for cls and sep
        row_dict = dict()
        row_dict['task'] = task
        input_token = [tok.tok_begin(tokenizer)
                       ] + t_input + [tok.tok_sep(tokenizer)]
        tokenized_input_id = tokenizer.convert_tokens_to_ids(input_token)
        mask_id = [1] * len(tokenized_input_id)
        tokenized_input_id.extend([tokenizer.pad_token_id] *
                                  (maxlen - len(tokenized_input_id)))
        mask_id.extend([-1] * (maxlen - len(mask_id)))
        row_dict['input'] = tokenized_input_id
        row_dict['mask'] = mask_id
        row_dict['target'] = [-1]
        if target is not None:
            if 'multi_label' in task:
                mlb = MultiLabelBinarizer(classes=tasks[task])
                tar = mlb.fit_transform([target])
                tokenize_label = tar
            else:
                tokenize_label = [tasks[task].index(target[0])]
            row_dict['target'] = tokenize_label
        feature_dict_list.append(row_dict)
    return feature_dict_list
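
A hedged usage sketch for this classification variant: `tasks` maps a task name to its label list and `target` is a list of label strings; the task name and labels below are illustrative assumptions.

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
tasks = {"sentiment": ["negative", "neutral", "positive"]}  # hypothetical task and labels
features = get_feature_from_data(tokenizer,
                                 maxlen=32,
                                 tasks=tasks,
                                 task="sentiment",
                                 input="this movie was surprisingly good",
                                 target=["positive"])
print(features[0]['target'])  # -> [2], the index of "positive" in the label list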
Example #3
def get_feature_from_data(tokenizer,
                          maxlen,
                          input,
                          previous,
                          target=None,
                          ntarget=None,
                          reserved_len=0,
                          handle_exceed='noop',
                          **kwargs):
    feature_dict_list = []
    pred_len = len(tokenizer.convert_tokens_to_ids(
        target)) if target is not None else len(previous)
    t_input_list, _ = tok.handle_exceed(tokenizer, input,
                                        maxlen - 3 - pred_len - reserved_len,
                                        handle_exceed)
    for t_input in t_input_list:  # -3 for the special tokens and the prediction end sep
        row_dict = dict()
        t_input = [tok.tok_begin(tokenizer)
                   ] + t_input + [tok.tok_begin(tokenizer)]
        t_input.extend(previous)
        t_input_id = tokenizer.convert_tokens_to_ids(t_input)
        target_start = len(t_input_id) - 1

        row_dict['target'] = [-1] * maxlen
        row_dict['ntarget'] = [-1] * maxlen

        if target is not None:
            t_input_id.extend(tokenizer.convert_tokens_to_ids(target))
            tokenized_target_id = [-1] * target_start
            # tokenized_target_id = tokenizer.convert_tokens_to_ids(t_input[1:])
            tokenized_target_id.extend(
                tokenizer.convert_tokens_to_ids(target +
                                                [tok.tok_sep(tokenizer)]))
            tokenized_target_id.extend([-1] *
                                       (maxlen - len(tokenized_target_id)))
            row_dict['target'] = tokenized_target_id
        if ntarget is not None and len(tokenizer.tokenize(ntarget)) > 0:
            tokenized_ntarget = tokenizer.convert_tokens_to_ids(
                tokenizer.tokenize(ntarget))
            tokenized_ntarget_id = [-1] * target_start
            # tokenized_ntarget_id = tokenizer.convert_tokens_to_ids(t_input[1:])
            tokenized_ntarget_id.extend(tokenized_ntarget)
            tokenized_ntarget_id.extend([-1] *
                                        (maxlen - len(tokenized_ntarget_id)))
            if len(tokenized_ntarget_id) <= maxlen:
                row_dict['ntarget'] = tokenized_ntarget_id

        mask_id = [1] * len(t_input_id)
        t_input_id.extend(
            tokenizer.convert_tokens_to_ids([tok.tok_pad(tokenizer)]) *
            (maxlen - len(t_input_id)))
        mask_id.extend([0] * (maxlen - len(mask_id)))
        row_dict['input'] = t_input_id
        row_dict['mask'] = mask_id
        row_dict['start'] = target_start
        feature_dict_list.append(row_dict)

    return feature_dict_list
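
A hedged inference-style call for this one-by-one variant, where `previous` carries the tokens decoded so far and `target` stays None; the sentences are placeholders and the `tok` helpers are assumed to be in scope.

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
previous = tokenizer.tokenize("a partial")  # tokens generated so far
features = get_feature_from_data(tokenizer,
                                 maxlen=64,
                                 input="a source sentence",
                                 previous=previous)
row = features[-1]
print(row['start'], sum(row['mask']))  # next prediction position, unpadded length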
Example #4
def get_feature_from_data(tokenizer,
                          maxlen,
                          input,
                          previous,
                          target=None,
                          ntarget=None,
                          reserved_len=0,
                          handle_exceed='noop',
                          **kwargs):
    feature_dict_list = []
    t_input_list, _ = tok.handle_exceed(tokenizer, input,
                                        maxlen - 2 - len(previous) - 1,
                                        handle_exceed)
    for t_input in t_input_list:  # -2 for cls and sep, -1 for the trailing mask token
        row_dict = dict()
        t_input = [tok.tok_begin(tokenizer)] + \
                  t_input[:maxlen - reserved_len - 2] + \
                  [tok.tok_sep(tokenizer)]
        t_input.extend(previous)
        t_input.append(tok.tok_mask(tokenizer))
        t_input_id = tokenizer.convert_tokens_to_ids(t_input)
        mask_id = [1] * len(t_input)
        target_start = len(t_input_id) - 1
        target_end = maxlen
        t_input_id.extend([0] * (maxlen - len(t_input_id)))
        row_dict['target'] = [-1] * maxlen
        row_dict['ntarget'] = [-1] * maxlen
        tokenized_target_id = None
        if target is not None:
            tokenized_target_id = [-1] * target_start
            tokenized_target_id.append(
                tokenizer.convert_tokens_to_ids(target)[-1])
            target_end = len(tokenized_target_id) - 1
            tokenized_target_id.extend([-1] *
                                       (maxlen - len(tokenized_target_id)))
            row_dict['target'] = tokenized_target_id
        if ntarget is not None and len(tokenizer.tokenize(ntarget)) > 0:
            tokenized_ntarget = tokenizer.convert_tokens_to_ids(
                tokenizer.tokenize(ntarget))
            tokenized_ntarget_id = [-1] * target_start
            tokenized_ntarget_id.extend(tokenized_ntarget)
            tokenized_ntarget_id.extend([-1] *
                                        (maxlen - len(tokenized_ntarget_id)))
            if len(tokenized_ntarget_id) <= maxlen:
                row_dict['ntarget'] = tokenized_ntarget_id

        mask_id.extend([0] * (maxlen - len(mask_id)))
        type_id = [0] * len(t_input)
        type_id.extend([1] * (maxlen - len(type_id)))
        row_dict['input'] = t_input_id
        row_dict['type'] = type_id
        row_dict['mask'] = mask_id
        row_dict['start'] = target_start
        row_dict['end'] = target_end
        feature_dict_list.append(row_dict)

    return feature_dict_list
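
This variant appends a single mask token after `previous`, so the label is only the last token of `target`. A hedged training-style sketch, reusing the prefix / prefix-plus-one-token convention from the preprocessing example further below:

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
target_tokens = tokenizer.tokenize("a target sentence")
features = get_feature_from_data(tokenizer,
                                 maxlen=64,
                                 input="a source sentence",
                                 previous=target_tokens[:2],  # decoded prefix
                                 target=target_tokens[:3])    # prefix plus the next token
row = features[-1]
print(row['start'], row['input'][row['start']])  # the masked position and its mask token id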
Example #5
def get_feature_from_data(tokenizer, maxlen, input, target=None, handle_exceed='start_slice', **kwargs):
    feature_dict_list = []
    t_input_list, _ = tok.handle_exceed(tokenizer, input, maxlen - 2, handle_exceed)

    for t_input in t_input_list:  # -2 for cls and sep
        row_dict = dict()
        tokenized_input = [tok.tok_begin(tokenizer)] + t_input + [tok.tok_sep(tokenizer)]
        tokenized_input_id = tokenizer.convert_tokens_to_ids(tokenized_input)

        row_dict['target'] = [-1] * maxlen
        if target is not None:
            tokenized_target = []
            targets_pointer = 0
            for tok_pos, text in enumerate(tokenized_input):
                if text == tok.tok_mask(tokenizer):
                    if targets_pointer == int(target):
                        tok_target = 1
                    else:
                        tok_target = 0
                    tokenized_target.extend([tok_target])
                    targets_pointer += 1
                else:
                    tokenized_target.append(-1)
            tokenized_target.extend([-1] * (maxlen - len(tokenized_target)))
            row_dict['target'] = tokenized_target
        target_pos_list = []
        for tok_pos, text in enumerate(tokenized_input):
            if text == tok.tok_mask(tokenizer):
                target_pos_list.append(tok_pos)
        target_pos_list.extend([0] * (4 - len(target_pos_list)))
        if len(target_pos_list) != 4:
            continue
        row_dict['target_pos'] = target_pos_list

        mask_id = [1] * len(tokenized_input)
        type_id = [0] * len(tokenized_input)
        tokenized_input_id.extend(
            [tokenizer.convert_tokens_to_ids([tok.tok_pad(tokenizer)])[0]] * (maxlen - len(tokenized_input_id)))
        mask_id.extend([0] * (maxlen - len(mask_id)))
        type_id.extend([1] * (maxlen - len(type_id)))
        row_dict['input'] = tokenized_input_id
        row_dict['type'] = type_id
        row_dict['mask'] = mask_id
        feature_dict_list.append(row_dict)
    return feature_dict_list
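
A hedged sketch for this mask-selection variant: the input carries up to four mask tokens marking candidate positions, and `target` is the index of the correct one; the question and choices are placeholders.

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
mask = tokenizer.mask_token  # "[MASK]" for BERT-style tokenizers
text = f"the capital of france is {mask} paris {mask} rome {mask} berlin {mask} madrid"
features = get_feature_from_data(tokenizer, maxlen=64, input=text, target="0")
row = features[0]
print(row['target_pos'])  # positions of the four mask tokens
print([row['target'][p] for p in row['target_pos']])  # 1 at the correct choice, 0 elsewhere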
Example #6
def get_feature_from_data(tokenizer, maxlen, input, target=None, ntarget=None, reserved_len=0,
                          handle_exceed='start_slice', add_end_tok=True, **kwargs):
    feature_dict_list = []
    tokenized_target = tokenizer.tokenize(target) if target is not None else []
    t_input_list, _ = tok.handle_exceed(tokenizer, input, maxlen - 3 - len(tokenized_target), handle_exceed)
    for t_input in t_input_list:  # -3 for cls, sep and the prediction end sep
        row_dict = dict()
        tokenized_input = [tok.tok_begin(tokenizer)] + t_input[:maxlen - reserved_len - 3] + [tok.tok_sep(tokenizer)]
        mask_id = [1] * len(tokenized_input)
        type_id = [0] * len(tokenized_input)

        row_dict['target'] = [-1] * maxlen
        row_dict['ntarget'] = [-1] * maxlen

        tokenized_input_id = tokenizer.convert_tokens_to_ids(tokenized_input)
        target_start = len(tokenized_input_id)
        if target is not None:
            if add_end_tok:
                tokenized_target += [tok.tok_sep(tokenizer)]
            tokenized_target_id = [-1] * len(tokenized_input)
            tokenized_target_id.extend(tokenizer.convert_tokens_to_ids(tokenized_target))
            tokenized_target_id.extend([-1] * (maxlen - len(tokenized_target_id)))
            # leftover debug output, disabled:
            # print(len(tokenized_target_id), len(tokenized_input), len(t_input), tokenized_target_id)
            row_dict['target'] = tokenized_target_id

        if ntarget is not None:
            tokenized_ntarget = tokenizer.tokenize(ntarget)
            tokenized_ntarget_id = [-1] * target_start
            tokenized_ntarget_id.extend(tokenizer.convert_tokens_to_ids(tokenized_ntarget))
            tokenized_ntarget_id.extend([-1] * (maxlen - len(tokenized_ntarget_id)))
            row_dict['ntarget'] = tokenized_ntarget_id

        tokenized_input_id.extend([tokenizer.mask_token_id] * (maxlen - len(tokenized_input_id)))
        mask_id.extend([0] * (maxlen - len(mask_id)))
        type_id.extend([1] * (maxlen - len(type_id)))

        row_dict['input'] = tokenized_input_id
        row_dict['type'] = type_id
        row_dict['mask'] = mask_id
        row_dict['start'] = target_start
        feature_dict_list.append(row_dict)
    return feature_dict_list
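
A hedged sketch that also passes a negative target (ntarget), which this variant aligns right after the input span; all sentences are placeholders.

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
features = get_feature_from_data(tokenizer,
                                 maxlen=64,
                                 input="a short source sentence",
                                 target="a plausible target sentence",
                                 ntarget="an implausible target sentence")
row = features[0]
print(row['start'])  # where both target and ntarget labels begin
print(row['target'][:8], row['ntarget'][:8])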
Example #7
    def predict(self, input='', topK=1, topP=0.85, mode=['greedy', 'topK', 'topP'], decodenum=1, filtersim=True,
                reserved_len=0, task=None, handle_exceed='start_slice'):
        filtersim = json.loads(str(filtersim).lower())
        topK = int(topK)
        topP = float(topP)
        decodenum = int(decodenum)
        mode = (mode[0] if isinstance(mode, list) else mode).lower()

        self.eval()
        sequences = [[[], 1.0]]
        with torch.no_grad():
            while True:
                all_candidates = list()
                exceed = False
                for seq in sequences:
                    if tok.tok_sep(self.tokenizer) not in seq[0]:
                        tokens, score = seq
                        feature_dict = get_feature_from_data(self.tokenizer, self.maxlen, input, tokens,
                                                             reserved_len=reserved_len,
                                                             handle_exceed=handle_exceed)[-1]
                        # check input exceed
                        if len(tokens) >= self.maxlen or feature_dict['start'] >= self.maxlen:
                            exceed = True
                            all_candidates.append(seq)
                            continue

                        for k, v in feature_dict.items():
                            feature_dict[k] = [v]
                        predictions = self.forward(feature_dict, eval=True, use_prev=True)
                        token_prob_list = predictions['label_prob_all'][0]
                        # topK topP
                        if 'top' in mode:
                            prob_list = [prob for _, prob in token_prob_list]
                            if 'topk' in mode:
                                sample_list = prob_list[:topK]
                                decode_range = max(decodenum, topK)
                                prob_norm = [float(i) / sum(sample_list) for i in sample_list]
                                choice_list = np.random.choice(sample_list, p=prob_norm,
                                                               size=decode_range,
                                                               replace=False)
                            else:
                                topP_list = np.cumsum(prob_list)
                                index_overP = [i for i, x in enumerate(topP_list) if x > topP]
                                index_overP = 0 if len(index_overP) < 1 else index_overP[0]
                                sample_list = prob_list[:index_overP + 1]
                                prob_norm = [float(i) / sum(sample_list) for i in sample_list]
                                choice_list = np.random.choice(sample_list, p=prob_norm,
                                                               size=decodenum)
                            for idx in range(decodenum):
                                sampling_index = prob_list.index(choice_list[idx])
                                k, v = token_prob_list[sampling_index]
                                candidate = [tokens + [k], score + -log(v)]
                                all_candidates.append(candidate)

                        # greedy / beam search
                        else:
                            for k, v in token_prob_list[:50]:
                                if len(tokens) > 0 and tokens[-1] == k or len(k) < 1:
                                    continue
                                candidate = [tokens + [k], score + -log(v)]
                                all_candidates.append(candidate)
                    else:
                        all_candidates.append(seq)

                ordered = sorted(all_candidates, key=lambda tup: tup[1])
                if filtersim:
                    self._filterSimilar(ordered, decodenum)
                sequences = ordered[:decodenum]
                stop = 0
                for i in sequences:
                    # i[0] - sequence,i[1] - sequence score
                    if tok.tok_sep(self.tokenizer) in i[0] or i[1] > self.maxlen:
                        stop += 1
                if stop == len(sequences) or exceed:
                    break

            for i in range(len(sequences)):
                if tok.tok_sep(self.tokenizer) in sequences[i][0]:  # remove sep token
                    sequences[i][0] = sequences[i][0][:sequences[i][0].index(tok.tok_sep(self.tokenizer))]
                sequences[i][0] = "".join(self.tokenizer.convert_tokens_to_string(sequences[i][0]))

            result_dict = {
                'label_map': sequences
            }
            self.encoder_hidden = None
            return [i[0] for i in sequences], [result_dict]
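
A hedged usage sketch for the decoder, assuming `model` is a trained instance of the class that defines predict() above:

sentences, details = model.predict(input="a source sentence",
                                   mode="greedy",
                                   decodenum=3)
print(sentences)                # up to three decoded strings, best (lowest score) first
print(details[0]['label_map'])  # [decoded_text, accumulated score] pairs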
Example #8
def get_feature_from_data(tokenizer,
                          maxlen,
                          input,
                          previous,
                          target=None,
                          ntarget=None,
                          reserved_len=0,
                          handle_exceed='noop',
                          **kwargs):
    feature_dict_list = []

    pred_len = len(tokenizer.convert_tokens_to_ids(
        target)) + 1 if target is not None else len(previous) - 1
    t_input_list, _ = tok.handle_exceed(tokenizer, input,
                                        maxlen - 2 - pred_len, handle_exceed)
    for t_input in t_input_list:  # -2 for cls and sep
        row_dict = dict()
        t_input = [tok.tok_begin(tokenizer)] + \
                  t_input[:maxlen - reserved_len - 2] + \
                  [tok.tok_sep(tokenizer)]
        t_input_id = tokenizer.convert_tokens_to_ids(t_input)
        encoder_mask_id = [1] * (len(t_input))
        encoder_mask_id.extend([0] * (maxlen - len(encoder_mask_id)))
        t_input_id.extend(
            tokenizer.convert_tokens_to_ids([tok.tok_pad(tokenizer)]) *
            (maxlen - len(t_input_id)))

        if target is not None:
            tokenized_target_id = []
            tokenized_prev_id = []
            tokenized_prev_id.extend(
                tokenizer.convert_tokens_to_ids([tok.tok_begin(tokenizer)] +
                                                target))
            tokenized_target_id.extend(
                tokenizer.convert_tokens_to_ids(target +
                                                [tok.tok_sep(tokenizer)]))
            decoder_mask_id = [1] * (len(tokenized_prev_id))
            decoder_mask_id.extend([0] * (maxlen - len(decoder_mask_id)))
            tokenized_prev_id.extend(
                tokenizer.convert_tokens_to_ids([tok.tok_pad(tokenizer)]) *
                (maxlen - len(tokenized_prev_id)))
            tokenized_target_id.extend([-100] *
                                       (maxlen - len(tokenized_target_id)))
            row_dict['target'] = tokenized_target_id
            row_dict['prev'] = tokenized_prev_id
            if ntarget is not None and len(tokenizer.tokenize(ntarget)) > 0:
                tokenized_ntarget = tokenizer.convert_tokens_to_ids(
                    tokenizer.tokenize(ntarget))
                tokenized_ntarget_id = tokenized_ntarget
                tokenized_ntarget_id.extend(
                    [-100] * (maxlen - len(tokenized_ntarget_id)))
                if len(tokenized_ntarget_id) <= maxlen:
                    row_dict['ntarget'] = tokenized_ntarget_id
        else:
            tokenized_prev_id = [
                tokenizer.convert_tokens_to_ids(tok.tok_begin(tokenizer))
            ]
            tokenized_prev_id.extend(tokenizer.convert_tokens_to_ids(previous))
            target_start = len(tokenized_prev_id) - 1
            row_dict['start'] = target_start
            decoder_mask_id = [1] * (len(tokenized_prev_id))
            row_dict['prev'] = tokenized_prev_id

        row_dict['input'] = t_input_id
        row_dict['encoder_mask'] = encoder_mask_id
        row_dict['decoder_mask'] = decoder_mask_id
        feature_dict_list.append(row_dict)

    return feature_dict_list
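
A hedged training-style call for this encoder-decoder variant: `target` is a token list, `previous` is not used when a target is given, and the -100 padding matches the default ignore index of PyTorch's cross-entropy loss.

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
target_tokens = tokenizer.tokenize("a target sentence")
features = get_feature_from_data(tokenizer,
                                 maxlen=64,
                                 input="a source sentence",
                                 previous=[],          # ignored when target is given
                                 target=target_tokens)
row = features[0]
print(len(row['input']), len(row['prev']), len(row['target']))  # all padded to maxlen
print(sum(row['encoder_mask']), sum(row['decoder_mask']))       # unpadded lengths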
Example #9
def preprocessing_data(item,
                       tokenizer,
                       maxlen=512,
                       handle_exceed='start_slice',
                       likelihood=['none', 'pos', 'neg', 'both'],
                       reserved_len=0,
                       **kwargs):
    likelihood = likelihood[0] if isinstance(likelihood, list) else likelihood
    tasks, task, input, targets = item
    p_target, n_target = targets
    input = input.strip()
    tokenized_target = tokenizer.tokenize(" ".join(p_target))
    param_dict = {
        'tokenizer': tokenizer,
        'maxlen': maxlen,
        'handle_exceed': handle_exceed,
        'reserved_len': reserved_len
    }

    # each word in sentence
    for j in range(1, len(tokenized_target) + 1):
        if "neg" in likelihood or 'both' in likelihood:
            # formatting neg data in csv
            if n_target is None:
                ntext_arr = [
                    tokenizer.convert_tokens_to_string(tokenized_target[:j -
                                                                        1])
                ]
            elif "[SEP]" in n_target:
                ntext_arr = [
                    ntext.strip() for ntext in n_target.split("[SEP]")
                ]
            else:
                ntext_arr = [n_target.strip()]
            # adding neg data
            for neg_text in ntext_arr:
                yield once.get_feature_from_data, {
                    **{
                        'input': input + " " + " ".join(tokenized_target[:j - 1]),
                        'target': tokenized_target[:j][-1],
                        'ntarget': neg_text,
                        "add_end_tok": False
                    },
                    **param_dict
                }
        else:
            yield get_feature_from_data, {
                **{
                    'input': input,
                    'previous': tokenized_target[:j - 1],
                    'target': tokenized_target[:j],
                    'ntarget': None
                },
                **param_dict
            }

    # end of the last word
    if "neg" in likelihood or 'both' in likelihood:
        # formatting neg data in csv
        if n_target is None:
            ntext_arr = [
                tokenizer.convert_tokens_to_string(tokenized_target[:j - 1])
            ]
        elif "[SEP]" in n_target:
            ntext_arr = [ntext.strip() for ntext in n_target.split("[SEP]")]
        else:
            ntext_arr = [n_target.strip()]
        # adding neg data
        for neg_text in ntext_arr:
            yield get_feature_from_data, {
                **{
                    'input': input,
                    'previous': tokenized_target,
                    'target': [tok.tok_sep(tokenizer)],
                    'ntarget': neg_text
                },
                **param_dict
            }
    else:
        yield get_feature_from_data, {
            **{
                'input': input,
                'previous': tokenized_target,
                'target': [tok.tok_sep(tokenizer)],
                'ntarget': None
            },
            **param_dict
        }

    # whole sentence masking
    if 'pos' in likelihood:
        yield once.get_feature_from_data, {
            **{
                'input': input,
                'target': " ".join(p_target)
            },
            **param_dict
        }
    elif 'both' in likelihood or "neg" in likelihood:
        # formatting neg data in csv
        if n_target is None:
            ntext_arr = [
                tokenizer.convert_tokens_to_string(tokenized_target[:j - 1])
            ]
        elif "[SEP]" in n_target:
            ntext_arr = [ntext.strip() for ntext in n_target.split("[SEP]")]
        else:
            ntext_arr = [n_target.strip()]
        for neg_text in ntext_arr:
            yield once.get_feature_from_data, {
                **{
                    'input': input,
                    'target': " ".join(p_target),
                    'ntarget': neg_text
                },
                **param_dict
            }

    return get_feature_from_data, param_dict
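
A hedged sketch of driving this generator: `item` packs (tasks, task, input, (positive_target, negative_target)) and every yielded pair is a feature-builder function plus the keyword arguments to call it with; the values below are placeholders.

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
item = ({}, "gen", "a source sentence", (["a", "target", "sentence"], None))  # hypothetical item
for build_fn, kwargs in preprocessing_data(item, tokenizer, maxlen=64, likelihood='none'):
    features = build_fn(**kwargs)  # each builder returns a list of feature dicts
    print(build_fn.__name__, len(features))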
Example #10
def get_feature_from_data(tokenizer,
                          labels,
                          input,
                          target=None,
                          maxlen=512,
                          separator=" ",
                          handle_exceed='slide'):
    feature_dict_list = []

    mapping_index = []
    pos = 1  # cls as start 0
    for i in input.split(" "):
        for _ in range(len(tokenizer.tokenize(i))):
            if _ < 1:
                mapping_index.append({'char': i, 'pos': pos})
            pos += 1
    if target is not None:
        target = target.split(separator)

    t_input_list, t_pos_list = tok.handle_exceed(tokenizer,
                                                 input,
                                                 maxlen - 2,
                                                 mode=handle_exceed,
                                                 keep_after_sep=False)
    for t_input, t_pos in zip(t_input_list, t_pos_list):  # -2 for cls and sep
        # ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.
        row_dict = dict()
        tokenized_input = [tok.tok_begin(tokenizer)
                           ] + t_input + [tok.tok_sep(tokenizer)]
        input_id = tokenizer.convert_tokens_to_ids(tokenized_input)

        if target is not None:
            target_token = []
            pev = 0

            for tok_map, target_label in zip(mapping_index, target):
                if t_pos[0] < tok_map['pos'] <= t_pos[1]:
                    for _ in range(tok_map['pos'] - pev):
                        target_token += [labels.index(target_label)]
                pev = tok_map['pos']

            if "O" in labels:
                target_id = [labels.index("O")
                             ] + target_token + [labels.index("O")]
            else:
                target_id = [target_token[0]
                             ] + target_token + [target_token[-1]]

            if len(input_id) != len(target_id):
                print("input target len not equal ", len(input_id),
                      len(target_id))
            target_id.extend([0] * (maxlen - len(target_id)))
            row_dict['target'] = target_id

        map_start = 0
        map_end = len(mapping_index)
        for pos, tok_map in enumerate(mapping_index):
            if t_pos[0] == tok_map['pos']:
                map_start = pos
            elif t_pos[1] == tok_map['pos']:
                map_end = pos

        row_dict['mapping'] = mapping_index[map_start:map_end + 1]
        mask_id = [1] * len(input_id)
        mask_id.extend([0] * (maxlen - len(mask_id)))
        row_dict['mask'] = mask_id
        row_dict['end'] = len(input_id)
        input_id.extend([0] * (maxlen - len(input_id)))
        row_dict['input'] = input_id
        row_dict['pos'] = [map_start, map_end]
        feature_dict_list.append(row_dict)

    return feature_dict_list
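
A hedged sketch for the tagging variant: one label per input word, separated by spaces; the label set below is a hypothetical example and includes "O" so the cls/sep positions get the outside tag.

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
labels = ["O", "B-PER", "I-PER"]  # hypothetical tag set
features = get_feature_from_data(tokenizer,
                                 labels,
                                 input="john smith lives here",
                                 target="B-PER I-PER O O",
                                 maxlen=32)
row = features[0]
print(row['target'][:row['end']])  # label ids over the unpadded span (cls/sep tagged "O")
print(row['mapping'])              # word -> first wordpiece position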