Example #1
    def __init__(
        self,
        examples: List[SequenceClassificationExample],
        tokenizer: PreTrainedTokenizerFast,
        label_to_id: Dict[str, int],
        tokens_per_batch: int = 32,
    ):
        self.features: List[InputFeatures] = []
        self.examples: List[SequenceClassificationExample] = examples
        texts: StrList = [ex.text for ex in self.examples]
        labels: StrList = [ex.label for ex in self.examples]

        # tokenize text into subwords with padding and truncation
        self.encodings: List[BatchEncoding] = [
            tokenizer.encode_plus(
                text,
                add_special_tokens=True,
                max_length=tokens_per_batch,
                return_token_type_ids=False,
                padding="max_length",
                return_attention_mask=True,
                return_tensors="np",
                truncation=True,
            ) for text in texts
        ]

        # register features
        self.features = [
            InputFeatures(
                input_ids=encoding.input_ids.flatten().tolist(),
                attention_mask=encoding.attention_mask.flatten().tolist(),
                label_ids=[label_to_id.get(label, 0)],
            ) for encoding, label in zip(self.encodings, labels)
        ]
        self._n_features = len(self.features)
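For context, a minimal sketch of what a single encode_plus call in this constructor returns, using bert-base-uncased as a stand-in tokenizer (SequenceClassificationExample and InputFeatures are project-specific types not shown here):

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased", use_fast=True)
encoding = tokenizer.encode_plus(
    "an example sentence",
    add_special_tokens=True,
    max_length=32,
    return_token_type_ids=False,
    padding="max_length",
    return_attention_mask=True,
    return_tensors="np",
    truncation=True,
)
# With return_tensors="np" the arrays have shape (1, 32), which is why the
# constructor flattens them before building InputFeatures.
print(encoding.input_ids.shape)       # (1, 32)
print(encoding.attention_mask.shape)  # (1, 32)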
Example #2
def get_adjusted_lengths(
    sentences: Sentences,
    tokenizer: PreTrainedTokenizerFast,
    max_sequence_length: int,
) -> Tuple[int, ...]:
    """Return adjusted lengths based on a tokenizer and model max length."""
    encodings = [tokenizer.encode_plus(" ".join(sentence), return_offsets_mapping=True) for sentence in sentences]
    # Create end-token masks: [CLS] Hauk ur er [SEP] -> [dropped, 0, 1, 1, dropped]
    # by taking the initial-token masks and shifting them one position to the left
    # (a token ends a word exactly when the next token starts one):
    # [CLS] Hauk ur er [SEP] -> [0, 1, 0, 1, 0]
    # -> drop the [CLS] and [SEP] positions, shift the middle left, append a final 1
    # -> [_, 0, 1, 1, _]
    end_token_masks = [get_initial_token_mask(encoded["offset_mapping"])[2:-1] + [1] for encoded in encodings]
    # We need to account for two special tokens (SEP and CLS) or (<s> and </s>) when finding the cuts
    max_sequence_length -= 2
    # And a small extra safety margin to absorb occasional length errors
    max_sequence_length -= 6
    lengths = []
    for end_token_mask in end_token_masks:
        while len(end_token_mask) != 0:
            prefix, end_token_mask = (
                end_token_mask[:max_sequence_length],
                end_token_mask[max_sequence_length:],
            )
            length = sum(prefix)
            lengths.append(length)

    return tuple(int(length) for length in lengths)
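The get_initial_token_mask helper is not shown above; the following is a minimal sketch of what it could look like, assuming the offsets come from a fast tokenizer run on the space-joined sentence, where special tokens map to (0, 0) and a subword continues the previous word exactly when its start offset equals the previous subword's end offset:

from typing import List, Tuple

def get_initial_token_mask(offset_mapping: List[Tuple[int, int]]) -> List[int]:
    """Mark word-initial subwords with 1; continuations and special tokens with 0."""
    mask = []
    prev_end = None
    for start, end in offset_mapping:
        if start == end:
            # special tokens such as [CLS]/[SEP] carry the empty span (0, 0)
            mask.append(0)
        elif prev_end is None or start != prev_end:
            # a gap before this subword (the joining space) means a new word starts
            mask.append(1)
            prev_end = end
        else:
            # contiguous with the previous subword: a non-initial word piece
            mask.append(0)
            prev_end = end
    return mask

# "[CLS] Hauk ur er [SEP]" with offsets [(0, 0), (0, 4), (4, 6), (7, 9), (0, 0)]
# -> [0, 1, 0, 1, 0], matching the example in the comments above.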
Example #3
def convert_instances_to_feature_tensors(
        instances: List[Instance], tokenizer: PreTrainedTokenizerFast,
        label2idx: Dict[str, int]) -> List[Feature]:
    features = []
    ## tokenize the word into word_piece / BPE
    ## NOTE: adding a leading space is important for BART/GPT/Roberta tokenization.
    ## Related GitHub issues:
    ##      https://github.com/huggingface/transformers/issues/1196
    ##      https://github.com/pytorch/fairseq/blob/master/fairseq/models/roberta/hub_interface.py#L38-L56
    ##      https://github.com/ThilinaRajapakse/simpletransformers/issues/458
    assert tokenizer.add_prefix_space  ## has to be true, in order to tokenize pre-tokenized input
    print(
        "[Data Info] We are not limiting the max length in the tokenizer; be aware that long sequences are not truncated."
    )
    for idx, inst in enumerate(instances):
        words = inst.ori_words
        orig_to_tok_index = []
        res = tokenizer.encode_plus(words, is_split_into_words=True)
        subword_idx2word_idx = res.word_ids(batch_index=0)
        prev_word_idx = -1
        for i, mapped_word_idx in enumerate(subword_idx2word_idx):
            """
            Note: by default, we use the first wordpiece/subword token to represent the word
            If you want to do something else (e.g., use the last wordpiece instead), modify it here.
            """
            if mapped_word_idx is None:  ## cls and sep token
                continue
            if mapped_word_idx != prev_word_idx:
                ## because we take the first subword to represent the whole word
                orig_to_tok_index.append(i)
                prev_word_idx = mapped_word_idx
        assert len(orig_to_tok_index) == len(words)
        labels = inst.labels
        label_ids = [label2idx[label]
                     for label in labels] if labels else [-100] * len(words)
        segment_ids = [0] * len(res["input_ids"])

        features.append(
            Feature(input_ids=res["input_ids"],
                    attention_mask=res["attention_mask"],
                    orig_to_tok_index=orig_to_tok_index,
                    token_type_ids=segment_ids,
                    word_seq_len=len(orig_to_tok_index),
                    label_ids=label_ids))
    return features
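For reference, a minimal sketch of the word_ids mapping this function relies on, using roberta-base as a stand-in tokenizer (Instance and Feature are project-specific classes):

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("roberta-base", use_fast=True, add_prefix_space=True)
words = ["John", "lives", "in", "Reykjavik"]
res = tokenizer.encode_plus(words, is_split_into_words=True)
print(res.word_ids(batch_index=0))
# Roughly [None, 0, 1, 2, 3, 3, None]: None marks <s>/</s>, repeated indices mark
# extra subwords of the same word; keeping only the position of the first subword
# of each word yields orig_to_tok_index.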
Example #4
import numpy as np
from tqdm import tqdm
from transformers import PreTrainedTokenizerFast

def preprocess(texts, tokenizer_path, max_len=32):
    input_ids, input_masks = [], []

    tokenizer = PreTrainedTokenizerFast(tokenizer_file=tokenizer_path)
    tokenizer.mask_token = '[MASK]'
    tokenizer.pad_token = "[PAD]"
    tokenizer.sep_token = "[SEP]"
    tokenizer.cls_token = "[CLS]"
    tokenizer.unk_token = "[UNK]"

    for text in tqdm(texts):
        encoded = tokenizer.encode_plus(text,
                                        max_length=max_len,
                                        padding="max_length",
                                        truncation=True)
        input_ids.append(encoded['input_ids'])
        input_masks.append(encoded['attention_mask'])

    return [np.array(input_ids), np.array(input_masks)]
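A hedged usage sketch; the tokenizer.json path is illustrative and assumes a tokenizers-library file whose vocabulary already contains the special tokens set above:

texts = ["first example sentence", "second example sentence"]
input_ids, input_masks = preprocess(texts, "tokenizer.json", max_len=32)
print(input_ids.shape, input_masks.shape)  # (2, 32) each: padded/truncated to max_len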