Example #1
def get_special_ids(tokenizer: AutoTokenizer) -> tuple[int, ...]:
    """ Returns seperator id, close id, pad id, mask_id, and unk id """
    return tuple(
        tokenizer.convert_tokens_to_ids(t)
        for t in (tokenizer.sep_token, tokenizer.cls_token,
                  tokenizer.pad_token, tokenizer.mask_token,
                  tokenizer.unk_token))
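
A minimal usage sketch for get_special_ids, assuming a standard BERT checkpoint such as "bert-base-uncased" can be loaded; the concrete ids depend on the tokenizer's vocabulary.

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
sep_id, cls_id, pad_id, mask_id, unk_id = get_special_ids(tokenizer)
# For bert-base-uncased these are typically 102, 101, 0, 103 and 100.
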
def mask_tokens(inputs: torch.Tensor, tokenizer: AutoTokenizer, args) -> Tuple[torch.Tensor, torch.Tensor]:
    """ 
        Prepare masked tokens inputs/labels for masked language modeling: 80% MASK, 10% random, 10% original. 
        This is the standard script used in the HuggingFace library, with slight adjustments for pytorch-lightning.
        The only change is how tensors are cast to the device (e.g. probability_matrix = probability_matrix.to(inputs.device)).
    """

    labels = inputs.clone()

    # We sample a few tokens in each sequence for masked-LM training (with probability args.mlm_probability, which defaults to 0.15 for BERT/RoBERTa)
    probability_matrix = torch.full(labels.shape, args.mlm_probability)
    probability_matrix = probability_matrix.to(inputs.device)

    special_tokens_mask = [
        tokenizer.get_special_tokens_mask(val, already_has_special_tokens=True) for val in labels.tolist()
    ]
    special_tokens_mask_tensor = torch.tensor(
        special_tokens_mask, dtype=torch.bool)
    special_tokens_mask_tensor = special_tokens_mask_tensor.to(inputs.device)

    probability_matrix.masked_fill_(special_tokens_mask_tensor, value=0.0)
    if tokenizer._pad_token is not None:
        padding_mask = labels.eq(tokenizer.pad_token_id)
        probability_matrix.masked_fill_(padding_mask, value=0.0)
    masked_indices = torch.bernoulli(probability_matrix).bool()
    masked_indices = masked_indices.to(inputs.device)

    labels[~masked_indices] = -100  # We only compute loss on masked tokens

    # 80% of the time, we replace masked input tokens with tokenizer.mask_token ([MASK])
    full_tensor = torch.full(labels.shape, 0.8)
    full_tensor = full_tensor.to(inputs.device)

    indices_replaced = torch.bernoulli(full_tensor).bool() & masked_indices
    indices_replaced = indices_replaced.to(inputs.device)

    inputs[indices_replaced] = tokenizer.convert_tokens_to_ids(
        tokenizer.mask_token)

    # 10% of the time, we replace masked input tokens with random word
    other_full_tensor = torch.full(labels.shape, 0.5)
    other_full_tensor = other_full_tensor.to(inputs.device)

    indices_random = torch.bernoulli(
        other_full_tensor).bool() & masked_indices & ~indices_replaced
    indices_random = indices_random.to(inputs.device)

    random_words = torch.randint(
        len(tokenizer), labels.shape, dtype=torch.long)
    random_words = random_words.to(inputs.device)

    inputs[indices_random] = random_words[indices_random]

    # The rest of the time (10% of the time) we keep the masked input tokens unchanged
    return inputs, labels
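
A sketch of how mask_tokens might be driven, assuming a BERT tokenizer, a transformers version that still exposes the private tokenizer._pad_token attribute used above, and a plain namespace standing in for args (only an mlm_probability field is needed).

import torch
from argparse import Namespace
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
args = Namespace(mlm_probability=0.15)

# Encode a small batch to a fixed length so the tensors are rectangular.
batch = tokenizer(["The quick brown fox.", "Masked language modeling."],
                  padding="max_length", max_length=16, return_tensors="pt")
inputs, labels = mask_tokens(batch["input_ids"], tokenizer, args)
# labels is -100 everywhere except the ~15% of positions selected for the MLM loss;
# inputs now holds [MASK] ids, random ids, or the original ids at those positions.
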
def mask_tokens(inputs: torch.Tensor, tokenizer: AutoTokenizer,
                args) -> Tuple[torch.Tensor, torch.Tensor]:
    """ Prepare masked tokens inputs/labels for masked language modeling: 80% MASK, 10% random, 10% original. """

    labels = inputs.clone()

    probability_matrix = torch.full(labels.shape, args.mlm_probability)
    probability_matrix = probability_matrix.to(inputs.device)

    special_tokens_mask = [
        tokenizer.get_special_tokens_mask(val, already_has_special_tokens=True)
        for val in labels.tolist()
    ]
    special_tokens_mask_tensor = torch.tensor(special_tokens_mask,
                                              dtype=torch.bool)
    special_tokens_mask_tensor = special_tokens_mask_tensor.to(inputs.device)

    # print(special_tokens_mask_tensor.device)
    probability_matrix.masked_fill_(special_tokens_mask_tensor, value=0.0)
    if tokenizer._pad_token is not None:
        padding_mask = labels.eq(tokenizer.pad_token_id)
        probability_matrix.masked_fill_(padding_mask, value=0.0)
    masked_indices = torch.bernoulli(probability_matrix).bool()
    masked_indices = masked_indices.to(inputs.device)

    labels[~masked_indices] = -100  # We only compute loss on masked tokens

    # 80% of the time, we replace masked input tokens with tokenizer.mask_token ([MASK])
    full_tensor = torch.full(labels.shape, 0.8)
    full_tensor = full_tensor.to(inputs.device)

    indices_replaced = torch.bernoulli(full_tensor).bool() & masked_indices
    indices_replaced = indices_replaced.to(inputs.device)

    inputs[indices_replaced] = tokenizer.convert_tokens_to_ids(
        tokenizer.mask_token)

    # 10% of the time, we replace masked input tokens with random word
    other_full_tensor = torch.full(labels.shape, 0.5)
    other_full_tensor = other_full_tensor.to(inputs.device)

    indices_random = torch.bernoulli(
        other_full_tensor).bool() & masked_indices & ~indices_replaced
    indices_random = indices_random.to(inputs.device)

    random_words = torch.randint(len(tokenizer),
                                 labels.shape,
                                 dtype=torch.long)
    random_words = random_words.to(inputs.device)

    inputs[indices_random] = random_words[indices_random]

    # The rest of the time (10% of the time) we keep the masked input tokens unchanged
    return inputs, labels
Example #4
def preprocess_text(x: str, tokenizer: AutoTokenizer, max_sequence_len: int):
    cur_x = x
    if isinstance(tokenizer, BertTokenizer):
        cur_x = "[CLS] " + cur_x
    cur_x = cur_x.replace("\n", "")
    cur_x = cur_x.replace(" cannot ", " can not ")
    cur_x = tokenizer.tokenize(cur_x)
    cur_x = tokenizer.convert_tokens_to_ids(cur_x)
    cur_x = cur_x[:max_sequence_len]
    cur_x = cur_x + [0] * (max_sequence_len - len(cur_x))
    return cur_x
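
A short usage sketch for preprocess_text, assuming a BertTokenizer checkpoint; note that padding with 0 happens to match BERT's [PAD] id but would be wrong for a tokenizer whose pad id is not 0.

from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
ids = preprocess_text("I cannot attend the meeting.\n", tokenizer, max_sequence_len=12)
# ids starts with the [CLS] id (101) and is right-padded with zeros to length 12.
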
def convert_example_to_feature(
        example,
        tokenizer: AutoTokenizer,
        chineseandpunctuationextractor: ChineseAndPunctuationExtractor,
        label_map,
        max_length: Optional[int] = 512,
        pad_to_max_length: Optional[bool] = None):
    spo_list = example['spo_list'] if "spo_list" in example.keys() else None
    text_raw = example['text']

    sub_text = []  # holds single Chinese characters / punctuation marks
    buff = ""  # accumulates runs of non-Chinese characters
    for char in text_raw:
        if chineseandpunctuationextractor.is_chinese_or_punct(char):
            if buff != "":
                sub_text.append(buff)
                buff = ""
            sub_text.append(char)
        else:
            buff += char
    if buff != "":
        sub_text.append(buff)

    tok_to_orig_start_index = []
    tok_to_orig_end_index = []
    orig_to_tok_index = []
    tokens = []
    text_tmp = ''
    for (i, token) in enumerate(sub_text):
        orig_to_tok_index.append(len(tokens))
        sub_tokens = tokenizer.tokenize(token)
        text_tmp += token
        for sub_token in sub_tokens:
            tok_to_orig_start_index.append(len(text_tmp) - len(token))
            tok_to_orig_end_index.append(len(text_tmp) - 1)
            tokens.append(sub_token)
            if len(tokens) >= max_length - 2:
                break
        else:
            continue
        break
    # print("tok_to_orig_start_index: ", tok_to_orig_start_index)
    # print("tok_to_orig_end_index: ", tok_to_orig_end_index)
    # print("orig_to_tok_index: ", orig_to_tok_index)
    # print("tokens: ", tokens)
    seq_len = len(tokens)
    # 2 tags for each predicate + I tag + O tag
    num_labels = 2 * (len(label_map.keys()) - 2) + 2
    # initialize tag
    labels = [[0] * num_labels for i in range(seq_len)]  # one label vector per token, used for prediction

    if spo_list is not None:
        labels = parse_label(spo_list, label_map, tokens, tokenizer)

    # add [CLS] and [SEP] token, they are tagged into "O" for outside
    if seq_len > max_length - 2:
        tokens = tokens[0:(max_length - 2)]
        labels = labels[0:(max_length - 2)]
        tok_to_orig_start_index = tok_to_orig_start_index[0:(max_length - 2)]
        tok_to_orig_end_index = tok_to_orig_end_index[0:(max_length - 2)]
    tokens = ["[CLS]"] + tokens + ["[SEP]"]
    # "O" tag for [PAD], [CLS], [SEP] token
    outside_label = [[1] + [0] * (num_labels - 1)]

    labels = outside_label + labels + outside_label
    tok_to_orig_start_index = [-1] + tok_to_orig_start_index + [-1]
    tok_to_orig_end_index = [-1] + tok_to_orig_end_index + [-1]
    if seq_len < max_length:
        tokens = tokens + ["[PAD]"] * (max_length - seq_len - 2)
        labels = labels + outside_label * (max_length - len(labels))
        tok_to_orig_start_index = tok_to_orig_start_index + [-1] * (
            max_length - len(tok_to_orig_start_index))
        tok_to_orig_end_index = tok_to_orig_end_index + [-1] * (
            max_length - len(tok_to_orig_end_index))

    token_ids = tokenizer.convert_tokens_to_ids(tokens)

    return InputFeature(
        input_ids=np.array(token_ids),
        seq_len=np.array(seq_len),
        tok_to_orig_start_index=np.array(tok_to_orig_start_index),
        tok_to_orig_end_index=np.array(tok_to_orig_end_index),
        labels=np.array(labels),
    )
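
The first loop above groups consecutive non-Chinese characters into a single element of sub_text while keeping every Chinese character (or punctuation mark) as its own element, so tokenization and character-level offsets stay aligned. A standalone sketch of that step, using a simplified CJK range check as a stand-in for ChineseAndPunctuationExtractor.is_chinese_or_punct (an assumption; the real class covers full punctuation tables):

def is_chinese_or_punct(ch: str) -> bool:
    # Simplified stand-in: CJK Unified Ideographs plus a few common marks.
    return "\u4e00" <= ch <= "\u9fff" or ch in "，。！？、"

def split_text(text_raw: str) -> list:
    sub_text, buff = [], ""
    for char in text_raw:
        if is_chinese_or_punct(char):
            if buff:
                sub_text.append(buff)  # flush the pending non-Chinese run
                buff = ""
            sub_text.append(char)      # each Chinese character is its own unit
        else:
            buff += char
    if buff:
        sub_text.append(buff)
    return sub_text

print(split_text("王小明在2023年去了Paris。"))
# ['王', '小', '明', '在', '2023', '年', '去', '了', 'Paris', '。']
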
def convert_examples_to_features(
    examples: List[InputExample],
    label_list: List[str],
    max_seq_length: int,
    tokenizer: AutoTokenizer,
    cls_token="[CLS]",
    cls_token_segment_id=0,
    sep_token="[SEP]",
    pad_token=0,
    pad_token_segment_id=0,
    pad_token_label_id=-100,
    sequence_a_segment_id=0,
    sequence_b_segment_id=1,
    mask_padding_with_zero=True,
    verbose=False
) -> List[InputFeatures]:
    """ Loads a data file into a list of `InputFeatures`
    """
    label_map = {label: i for i, label in enumerate(label_list)}

    features = []
    for (ex_index, example) in enumerate(examples):
        if ex_index % 10_000 == 0:
            logger.info("Writing example %d of %d", ex_index, len(examples))

        tokens = []
        label_ids = []
        for word, label in zip(example.words, example.labels):
            word_tokens = tokenizer.tokenize(word)
            # word_tokens = word_tokens[:5]

            if len(word_tokens) > 0:
                tokens.extend(word_tokens)
                label_ids.extend([label_map[label]] + [pad_token_label_id] * (len(word_tokens) - 1))

        if len(tokens) > max_seq_length - 2:
            logger.warning("Sequence length exceed {} (cut).".format(max_seq_length))
            tokens = tokens[: (max_seq_length - 2)]
            label_ids = label_ids[: (max_seq_length - 2)]

        tokens += [sep_token]
        label_ids += [pad_token_label_id]
        segment_ids = [sequence_a_segment_id] * len(tokens)

        tokens = [cls_token] + tokens
        label_ids = [pad_token_label_id] + label_ids
        segment_ids = [cls_token_segment_id] + segment_ids

        input_ids = tokenizer.convert_tokens_to_ids(tokens)

        # The mask has 1 for real tokens and 0 for padding tokens. Only real
        # tokens are attended to.
        input_mask = [1 if mask_padding_with_zero else 0] * len(input_ids)

        # Zero-pad up to the sequence length.
        seq_length = len(input_ids)
        padding_length = max_seq_length - len(input_ids)
        input_ids += [pad_token] * padding_length
        input_mask += [0 if mask_padding_with_zero else 1] * padding_length
        segment_ids += [pad_token_segment_id] * padding_length
        label_ids += [pad_token_label_id] * padding_length

        decoder_mask = [(x != pad_token_label_id) for x in label_ids]

        # assert len(input_ids) == max_seq_length
        # assert len(input_mask) == max_seq_length
        # assert len(segment_ids) == max_seq_length
        # assert len(label_ids) == max_seq_length

        if verbose and ex_index < 1:
            logger.info("*** Example ***")
            logger.info("guid: {} (length: {})".format(example.guid, seq_length))
            logger.info("tokens: %s", " ".join([str(x) for x in tokens[:seq_length]]))
            logger.info("input_ids: %s", " ".join([str(x) for x in input_ids[:seq_length]]))
            # logger.info("input_mask: %s", " ".join([str(x) for x in input_mask]))
            # logger.info("segment_ids: %s", " ".join([str(x) for x in segment_ids]))
            logger.info("label_ids: %s", " ".join([str(x) for x in label_ids[:seq_length]]))
            logger.info("decode_mask: %s", " ".join([str(x) for x in decoder_mask[:seq_length]]))

        features.append(
            InputFeatures(
                input_ids=input_ids,
                attention_mask=input_mask,
                token_type_ids=segment_ids,
                label_ids=label_ids,
                decoder_mask=decoder_mask
            )
        )

    return features
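
The core alignment rule in this converter is that only the first subtoken of each word keeps the real label id, while continuation subtokens receive pad_token_label_id (-100) and are therefore ignored by the loss. A minimal standalone illustration of that rule, assuming a "bert-base-cased" tokenizer:

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
label_map = {"O": 0, "B-LOC": 1}
pad_token_label_id = -100

words, word_labels = ["Johannesburg", "airport"], ["B-LOC", "O"]
tokens, label_ids = [], []
for word, label in zip(words, word_labels):
    word_tokens = tokenizer.tokenize(word)
    tokens.extend(word_tokens)
    # real id once per word, -100 for the remaining subtokens
    label_ids.extend([label_map[label]] + [pad_token_label_id] * (len(word_tokens) - 1))

print(tokens)     # subword pieces; the exact splits depend on the vocabulary
print(label_ids)  # each word contributes one real label id followed by -100s
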
Example #7
def convert_examples_to_features(examples: List[InputExample],
                                 label_list: List[str],
                                 max_seq_length: int,
                                 tokenizer: AutoTokenizer,
                                 cls_token="[CLS]",
                                 cls_token_segment_id=0,
                                 sep_token="[SEP]",
                                 pad_token=0,
                                 pad_token_segment_id=0,
                                 pad_token_label_id=-100,
                                 sequence_a_segment_id=0,
                                 sequence_b_segment_id=1,
                                 mask_padding_with_zero=True,
                                 verbose=False) -> List[InputFeatures]:
    """ Loads a data file into a list of `InputFeatures`
    """
    label_map = {label: i for i, label in enumerate(label_list)}

    features = []
    for (ex_index, example) in enumerate(examples):
        if ex_index % 10_000 == 0:
            logger.info("Writing example %d of %d", ex_index, len(examples))

        tokens = []
        label_ids = []

        prod_start_index = prod_end_index = -1
        for wid, (word, label) in enumerate(zip(example.words,
                                                example.labels)):
            if label == "B-arm_description":
                prod_start_index = len(tokens)
                tokens.append(PROD_START_MARKER)
                label_ids.append(pad_token_label_id)
            elif prod_start_index >= 0 and prod_end_index < 0 and label != "I-arm_description":
                prod_end_index = len(tokens)
                tokens.append(PROD_END_MARKER)
                label_ids.append(pad_token_label_id)

            word_tokens = tokenizer.tokenize(word)
            word_tokens = word_tokens[:5]  # avoid long chemical names

            if len(word_tokens) > 0:
                tokens.extend(word_tokens)
                # Use the real label id for the first token of the word,
                # and padding ids for the remaining tokens
                # skip unknown labels (used by semi-supervised training with partial annotations)
                label_ids.extend([label_map.get(label, pad_token_label_id)] +
                                 [pad_token_label_id] * (len(word_tokens) - 1))

        # Product at the end of sequence
        if prod_start_index >= 0 and prod_end_index < 0:
            prod_end_index = len(tokens)
            tokens.append(PROD_END_MARKER)
            label_ids.append(pad_token_label_id)

        assert prod_start_index >= 0
        assert prod_end_index >= 0

        # Account for [CLS] and [SEP] with "- 2" and with "- 3" for RoBERTa.
        if len(tokens) > max_seq_length - 2:  # [CLS], [SEP]
            logger.info(
                "Sentence length exceeds max_seq_length: {} ({})".format(
                    " ".join(tokens), len(tokens)))
            # This will fail if PROD is cut
            tokens = tokens[:(max_seq_length - 2)]
            label_ids = label_ids[:(max_seq_length - 2)]

        tokens += [sep_token]
        label_ids += [pad_token_label_id]
        segment_ids = [sequence_a_segment_id] * len(tokens)

        tokens = [cls_token] + tokens
        label_ids = [pad_token_label_id] + label_ids
        segment_ids = [cls_token_segment_id] + segment_ids

        prod_start_index += 1  # cls_token added to the beginning
        prod_end_index += 1

        input_ids = tokenizer.convert_tokens_to_ids(tokens)

        # The mask has 1 for real tokens and 0 for padding tokens. Only real
        # tokens are attended to.
        input_mask = [1 if mask_padding_with_zero else 0] * len(input_ids)

        prod_start_mask = [0 for i in range(len(input_ids))]
        prod_start_mask[prod_start_index] = 1
        prod_end_mask = [0 for i in range(len(input_ids))]
        prod_end_mask[prod_end_index] = 1
        prod_mask = [0 for i in range(len(input_ids))]
        prod_mask[prod_start_index:prod_end_index +
                  1] = [1] * (prod_end_index + 1 - prod_start_index)

        # set segment ids for product
        # segment_ids[prod_start_index:prod_end_index+1] = [1] * (prod_end_index+1-prod_start_index)

        # Zero-pad up to the sequence length.
        seq_length = len(input_ids)
        padding_length = max_seq_length - seq_length
        input_ids += [pad_token] * padding_length
        input_mask += [0 if mask_padding_with_zero else 1] * padding_length
        prod_start_mask += ([0 if mask_padding_with_zero else 1] *
                            padding_length)
        prod_end_mask += ([0 if mask_padding_with_zero else 1] *
                          padding_length)
        prod_mask += ([0 if mask_padding_with_zero else 1] * padding_length)
        segment_ids += [pad_token_segment_id] * padding_length
        label_ids += [pad_token_label_id] * padding_length
        decoder_mask = [(x != pad_token_label_id) for x in label_ids]

        assert len(input_ids) == max_seq_length
        assert len(input_mask) == max_seq_length
        assert len(prod_start_mask) == max_seq_length
        assert len(prod_end_mask) == max_seq_length
        assert len(prod_mask) == max_seq_length
        assert len(segment_ids) == max_seq_length
        assert len(label_ids) == max_seq_length

        if verbose and ex_index < 1:
            logger.info("*** Example ***")
            logger.info("guid: {} (length: {})".format(example.guid,
                                                       seq_length))
            logger.info("tokens: " +
                        " ".join([str(x) for x in tokens[:seq_length]]))
            logger.info("input_ids: " +
                        " ".join([str(x) for x in input_ids[:seq_length]]))
            logger.info("label_ids: " +
                        " ".join([str(x) for x in label_ids[:seq_length]]))
            logger.info("decoder_mask: " +
                        " ".join([str(x) for x in decoder_mask[:seq_length]]))

        features.append(
            InputFeatures(input_ids=input_ids,
                          attention_mask=input_mask,
                          prod_start_mask=prod_start_mask,
                          prod_end_mask=prod_end_mask,
                          prod_mask=prod_mask,
                          token_type_ids=segment_ids,
                          label_ids=label_ids,
                          decoder_mask=decoder_mask))

    return features
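
Beyond the label alignment, this variant builds three position masks around the product span: one-hot masks for the PROD_START_MARKER and PROD_END_MARKER positions and a contiguous mask over the whole span. A small standalone sketch of that construction, with hypothetical indices:

seq_len = 10
prod_start_index, prod_end_index = 3, 6  # hypothetical marker positions

prod_start_mask = [0] * seq_len
prod_start_mask[prod_start_index] = 1
prod_end_mask = [0] * seq_len
prod_end_mask[prod_end_index] = 1
prod_mask = [0] * seq_len
prod_mask[prod_start_index:prod_end_index + 1] = [1] * (prod_end_index + 1 - prod_start_index)

print(prod_start_mask)  # [0, 0, 0, 1, 0, 0, 0, 0, 0, 0]
print(prod_end_mask)    # [0, 0, 0, 0, 0, 0, 1, 0, 0, 0]
print(prod_mask)        # [0, 0, 0, 1, 1, 1, 1, 0, 0, 0]
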
Example #8
class TorchTransformersNerPreprocessor(Component):
    """Takes tokens and splits them into bert subtokens, encodes subtokens with their indices.
    Creates a mask of subtokens (one for the first subtoken, zero for the others).

    If tags are provided, calculates tags for subtokens.

    Args:
        vocab_file: path to vocabulary
        do_lower_case: set True if lowercasing is needed
        max_seq_length: max sequence length in subtokens, including [SEP] and [CLS] tokens
        max_subword_length: replace a token with <unk> if its length is larger than this
            (defaults to None, which is equal to +infinity)
        token_masking_prob: probability of masking token while training
        provide_subword_tags: output tags for subwords or for words
        subword_mask_mode: subword to select inside word tokens, can be "first" or "last"
            (default="first")

    Attributes:
        max_seq_length: max sequence length in subtokens, including [SEP] and [CLS] tokens
        max_subword_length: max length of a BERT subtoken
        tokenizer: instance of Bert FullTokenizer
    """
    def __init__(self,
                 vocab_file: str,
                 do_lower_case: bool = False,
                 max_seq_length: int = 512,
                 max_subword_length: int = None,
                 token_masking_prob: float = 0.0,
                 provide_subword_tags: bool = False,
                 subword_mask_mode: str = "first",
                 **kwargs):
        self._re_tokenizer = re.compile(r"[\w']+|[^\w ]")
        self.provide_subword_tags = provide_subword_tags
        self.mode = kwargs.get('mode')
        self.max_seq_length = max_seq_length
        self.max_subword_length = max_subword_length
        self.subword_mask_mode = subword_mask_mode
        if Path(vocab_file).is_file():
            vocab_file = str(expand_path(vocab_file))
            self.tokenizer = AutoTokenizer(vocab_file=vocab_file,
                                           do_lower_case=do_lower_case)
        else:
            self.tokenizer = AutoTokenizer.from_pretrained(
                vocab_file, do_lower_case=do_lower_case)
        self.token_masking_prob = token_masking_prob

    def __call__(self,
                 tokens: Union[List[List[str]], List[str]],
                 tags: List[List[str]] = None,
                 **kwargs):
        if isinstance(tokens[0], str):
            tokens = [re.findall(self._re_tokenizer, s) for s in tokens]
        subword_tokens, subword_tok_ids, startofword_markers, subword_tags = [], [], [], []
        for i in range(len(tokens)):
            toks = tokens[i]
            ys = ['O'] * len(toks) if tags is None else tags[i]
            assert len(toks) == len(ys), \
                f"toks({len(toks)}) should have the same length as ys({len(ys)})"
            sw_toks, sw_marker, sw_ys = \
                self._ner_bert_tokenize(toks,
                                        ys,
                                        self.tokenizer,
                                        self.max_subword_length,
                                        mode=self.mode,
                                        subword_mask_mode=self.subword_mask_mode,
                                        token_masking_prob=self.token_masking_prob)
            if self.max_seq_length is not None:
                if len(sw_toks) > self.max_seq_length:
                    raise RuntimeError(
                        f"input sequence after bert tokenization"
                        f" shouldn't exceed {self.max_seq_length} tokens.")
            subword_tokens.append(sw_toks)
            subword_tok_ids.append(
                self.tokenizer.convert_tokens_to_ids(sw_toks))
            startofword_markers.append(sw_marker)
            subword_tags.append(sw_ys)
            assert len(sw_marker) == len(sw_toks) == len(subword_tok_ids[-1]) == len(sw_ys), \
                f"length of sow_marker({len(sw_marker)}), tokens({len(sw_toks)})," \
                f" token ids({len(subword_tok_ids[-1])}) and ys({len(ys)})" \
                f" for tokens = `{toks}` should match"

        subword_tok_ids = zero_pad(subword_tok_ids, dtype=int, padding=0)
        startofword_markers = zero_pad(startofword_markers,
                                       dtype=int,
                                       padding=0)
        attention_mask = Mask()(subword_tokens)

        if tags is not None:
            if self.provide_subword_tags:
                return tokens, subword_tokens, subword_tok_ids, \
                       attention_mask, startofword_markers, subword_tags
            else:
                nonmasked_tags = [[t for t in ts if t != 'X'] for ts in tags]
                for swts, swids, swms, ts in zip(subword_tokens,
                                                 subword_tok_ids,
                                                 startofword_markers,
                                                 nonmasked_tags):
                    if (len(swids) != len(swms)) or (len(ts) != sum(swms)):
                        log.warning(
                            'Not matching lengths of the tokenization!')
                        log.warning(
                            f'Tokens len: {len(swts)}\n Tokens: {swts}')
                        log.warning(
                            f'Markers len: {len(swms)}, sum: {sum(swms)}')
                        log.warning(f'Masks: {swms}')
                        log.warning(f'Tags len: {len(ts)}\n Tags: {ts}')
                return tokens, subword_tokens, subword_tok_ids, \
                       attention_mask, startofword_markers, nonmasked_tags
        return tokens, subword_tokens, subword_tok_ids, startofword_markers, attention_mask

    @staticmethod
    def _ner_bert_tokenize(
        tokens: List[str],
        tags: List[str],
        tokenizer: AutoTokenizer,
        max_subword_len: int = None,
        mode: str = None,
        subword_mask_mode: str = "first",
        token_masking_prob: float = None
    ) -> Tuple[List[str], List[int], List[str]]:
        do_masking = (mode == 'train') and (token_masking_prob is not None)
        do_cutting = (max_subword_len is not None)
        tokens_subword = ['[CLS]']
        startofword_markers = [0]
        tags_subword = ['X']
        for token, tag in zip(tokens, tags):
            token_marker = int(tag != 'X')
            subwords = tokenizer.tokenize(token)
            if not subwords or (do_cutting and
                                (len(subwords) > max_subword_len)):
                tokens_subword.append('[UNK]')
                startofword_markers.append(token_marker)
                tags_subword.append(tag)
            else:
                if do_masking and (random.random() < token_masking_prob):
                    tokens_subword.extend(['[MASK]'] * len(subwords))
                else:
                    tokens_subword.extend(subwords)
                if subword_mask_mode == "last":
                    startofword_markers.extend([0] * (len(subwords) - 1) +
                                               [token_marker])
                else:
                    startofword_markers.extend([token_marker] + [0] *
                                               (len(subwords) - 1))
                tags_subword.extend([tag] + ['X'] * (len(subwords) - 1))

        tokens_subword.append('[SEP]')
        startofword_markers.append(0)
        tags_subword.append('X')
        return tokens_subword, startofword_markers, tags_subword
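
A short sketch of what _ner_bert_tokenize produces, assuming the class above and its dependencies are importable and a "bert-base-uncased" tokenizer is used; only the static helper is called, so no vocab file or config is needed. The start-of-word markers carry a 1 on exactly one subtoken per real word (the first one by default), which is what later lets word-level tags be recovered from subtoken predictions.

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
tokens = ["New", "York", "marathon"]
tags = ["B-LOC", "I-LOC", "O"]

sw_toks, sw_markers, sw_tags = TorchTransformersNerPreprocessor._ner_bert_tokenize(
    tokens, tags, tokenizer)
print(sw_toks)     # ['[CLS]', 'new', 'york', 'marathon', '[SEP]'] (splits depend on the vocab)
print(sw_markers)  # 1 on the first subtoken of each word; [CLS] and [SEP] get 0
print(sw_tags)     # the original tag on the marked subtokens, 'X' elsewhere
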
Example #9
def convert_examples_to_features(
    examples: List[InputExample],
    max_seq_len: int,
    tokenizer: AutoTokenizer,
    pad_token_label_id: int = -100,
    cls_token_segment_id: int = 0,
    pad_token_segment_id: int = 0,
    sequence_a_segment_id: int = 0,
    mask_padding_with_zero: bool = True,
) -> List[InputFeatures]:
    # Setting based on the current model type
    cls_token = tokenizer.cls_token
    sep_token = tokenizer.sep_token
    unk_token = tokenizer.unk_token
    pad_token_id = tokenizer.pad_token_id

    features = []
    for (ex_index, example) in enumerate(examples):
        if ex_index % 5000 == 0:
            logging.debug("Processing example %d of %d", ex_index,
                          len(examples))

        # Tokenize word by word (for NER)
        tokens: List[str] = []
        slot_labels_ids = []
        pos_labels_ids = []
        np_labels_ids, vp_labels_ids, entity_labels_ids, acronym_labels_ids = (
            [],
            [],
            [],
            [],
        )

        for (
                word,
                slot_label,
                pos_label,
                np_label,
                vp_label,
                entity_label,
                acronym_label,
        ) in zip(
                example.words,
                example.slot_labels,
                example.pos_labels,
                example.np_labels,
                example.vp_labels,
                example.entity_labels,
                example.acronym_labels,
        ):
            word_tokens = tokenizer.tokenize(word)
            if not word_tokens:
                # For handling the bad-encoded word
                word_tokens = [unk_token]
            tokens.extend(word_tokens)

            # Use the real label ID for the first token of the word, and padding IDs for the
            # remaining tokens.
            slot_labels_ids.extend([int(slot_label)] + [pad_token_label_id] *
                                   (len(word_tokens) - 1))
            pos_labels_ids.extend([int(pos_label)] + [pad_token_label_id] *
                                  (len(word_tokens) - 1))
            np_labels_ids.extend([int(np_label)] + [pad_token_label_id] *
                                 (len(word_tokens) - 1))
            vp_labels_ids.extend([int(vp_label)] + [pad_token_label_id] *
                                 (len(word_tokens) - 1))
            entity_labels_ids.extend([int(entity_label)] +
                                     [pad_token_label_id] *
                                     (len(word_tokens) - 1))
            acronym_labels_ids.extend([int(acronym_label)] +
                                      [pad_token_label_id] *
                                      (len(word_tokens) - 1))

        # Account for [CLS] and [SEP].
        special_tokens_count = 2
        if len(tokens) > max_seq_len - special_tokens_count:
            tokens = tokens[:(max_seq_len - special_tokens_count)]
            slot_labels_ids = slot_labels_ids[:(max_seq_len -
                                                special_tokens_count)]
            pos_labels_ids = pos_labels_ids[:(max_seq_len -
                                              special_tokens_count)]

            np_labels_ids = np_labels_ids[:(max_seq_len -
                                            special_tokens_count)]
            vp_labels_ids = vp_labels_ids[:(max_seq_len -
                                            special_tokens_count)]
            entity_labels_ids = entity_labels_ids[:(max_seq_len -
                                                    special_tokens_count)]
            acronym_labels_ids = acronym_labels_ids[:(max_seq_len -
                                                      special_tokens_count)]

        # Add [SEP] token.
        tokens += [sep_token]
        slot_labels_ids += [pad_token_label_id]
        pos_labels_ids += [pad_token_label_id]
        np_labels_ids += [pad_token_label_id]
        vp_labels_ids += [pad_token_label_id]
        entity_labels_ids += [pad_token_label_id]
        acronym_labels_ids += [pad_token_label_id]
        token_type_ids = [sequence_a_segment_id] * len(tokens)

        # Add [CLS] token.
        tokens = [cls_token] + tokens
        slot_labels_ids = [pad_token_label_id] + slot_labels_ids
        pos_labels_ids = [pad_token_label_id] + pos_labels_ids
        np_labels_ids = [pad_token_label_id] + np_labels_ids
        vp_labels_ids = [pad_token_label_id] + vp_labels_ids
        entity_labels_ids = [pad_token_label_id] + entity_labels_ids
        acronym_labels_ids = [pad_token_label_id] + acronym_labels_ids
        token_type_ids = [cls_token_segment_id] + token_type_ids

        input_ids = tokenizer.convert_tokens_to_ids(tokens)

        # The mask has 1 for real tokens and 0 for padding tokens. Only real
        # tokens are attended to.
        attention_mask = [1 if mask_padding_with_zero else 0] * len(input_ids)

        # Zero-pad up to the sequence length.
        padding_length = max_seq_len - len(input_ids)
        input_ids = input_ids + ([pad_token_id] * padding_length)
        attention_mask = attention_mask + (
            [0 if mask_padding_with_zero else 1] * padding_length)
        token_type_ids = token_type_ids + ([pad_token_segment_id] *
                                           padding_length)
        slot_labels_ids = slot_labels_ids + ([pad_token_label_id] *
                                             padding_length)
        pos_labels_ids = pos_labels_ids + ([pad_token_label_id] *
                                           padding_length)

        np_labels_ids = np_labels_ids + ([pad_token_label_id] * padding_length)
        vp_labels_ids = vp_labels_ids + ([pad_token_label_id] * padding_length)
        entity_labels_ids = entity_labels_ids + ([pad_token_label_id] *
                                                 padding_length)
        acronym_labels_ids = acronym_labels_ids + ([pad_token_label_id] *
                                                   padding_length)

        assert len(input_ids
                   ) == max_seq_len, "Error with input length {} vs {}".format(
                       len(input_ids), max_seq_len)
        assert (len(attention_mask) == max_seq_len
                ), "Error with attention mask length {} vs {}".format(
                    len(attention_mask), max_seq_len)
        assert (len(token_type_ids) == max_seq_len
                ), "Error with token type length {} vs {}".format(
                    len(token_type_ids), max_seq_len)
        assert (len(slot_labels_ids) == max_seq_len
                ), "Error with slot labels length {} vs {}".format(
                    len(slot_labels_ids), max_seq_len)
        assert (len(pos_labels_ids) == max_seq_len
                ), "Error with pos labels length {} vs {}".format(
                    len(pos_labels_ids), max_seq_len)
        assert (len(np_labels_ids) == max_seq_len
                ), "Error with np labels length {} vs {}".format(
                    len(np_labels_ids), max_seq_len)
        assert (len(vp_labels_ids) == max_seq_len
                ), "Error with vp labels length {} vs {}".format(
                    len(vp_labels_ids), max_seq_len)
        assert (len(entity_labels_ids) == max_seq_len
                ), "Error with entity labels length {} vs {}".format(
                    len(entity_labels_ids), max_seq_len)
        assert (len(acronym_labels_ids) == max_seq_len
                ), "Error with acronym labels length {} vs {}".format(
                    len(acronym_labels_ids), max_seq_len)

        intent_label_id = int(example.intent_label)

        if ex_index < 3:
            logging.debug(  # pylint: disable=logging-not-lazy
                "Example created. guid: %s, tokens: %s, input_ids: %s, " +
                "attention_mask: %s, token_type_ids: %s, intent_label: %s (id = %d), "
                + "slot_labels: %s, POS_labels: %s, NP_labels: %s" +
                "VP_labels: %s, entity_labels, %s acronym_labels: %s",
                example.guid,
                " ".join([str(x) for x in tokens]),
                " ".join([str(x) for x in input_ids]),
                " ".join([str(x) for x in attention_mask]),
                " ".join([str(x) for x in token_type_ids]),
                example.intent_label,
                intent_label_id,
                " ".join([str(x) for x in slot_labels_ids]),
                " ".join([str(x) for x in pos_labels_ids]),
                " ".join([str(x) for x in np_labels_ids]),
                " ".join([str(x) for x in vp_labels_ids]),
                " ".join([str(x) for x in entity_labels_ids]),
                " ".join([str(x) for x in acronym_labels_ids]),
            )

        features.append(
            InputFeatures(
                input_ids=input_ids,
                attention_mask=attention_mask,
                token_type_ids=token_type_ids,
                intent_label_id=intent_label_id,
                slot_labels_ids=slot_labels_ids,
                pos_labels_ids=pos_labels_ids,
                np_labels_ids=np_labels_ids,
                vp_labels_ids=vp_labels_ids,
                entity_labels_ids=entity_labels_ids,
                acronym_labels_ids=acronym_labels_ids,
            ))
    return features
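
All of these converters build the attention mask the same way: with mask_padding_with_zero left at its default of True, real tokens get 1 and padding positions get 0, and setting the flag to False simply flips both values. A minimal illustration with hypothetical lengths:

max_seq_len, n_real, mask_padding_with_zero = 8, 5, True
attention_mask = [1 if mask_padding_with_zero else 0] * n_real
attention_mask += [0 if mask_padding_with_zero else 1] * (max_seq_len - n_real)
print(attention_mask)  # [1, 1, 1, 1, 1, 0, 0, 0]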