Example #1
def get_word_labels_from_token_labels(
    hf_arch: str,
    # A Hugging Face tokenizer
    hf_tokenizer: PreTrainedTokenizerBase,
    # A list of tuples, where each represents a token and its label (e.g., [('ĠHug', B-ORG), ('ging', B-ORG), ('ĠFace', I-ORG), ...])
    tok_labels,
) -> List[Tuple[str, str]]:
    """
    Given a list of tuples where each tuple defines a token and its label, return a list of tuples whereby each tuple defines the
    "word" and its label. Method assumes that model inputs are a list of words, and in conjunction with the `align_labels_with_tokens` method,
    allows the user to reconstruct the original raw inputs and labels.
    """
    # recreate raw words list (we assume for token classification that the input is a list of words)
    words = hf_tokenizer.convert_tokens_to_string(
        [tok_label[0] for tok_label in tok_labels]).split()

    if hf_arch == "canine":
        word_list = [f"{word} " for word in words]
    else:
        word_list = list(words)

    # align "words" with labels
    word_labels, idx = [], 0
    for word in word_list:
        word_labels.append((word, tok_labels[idx][1]))
        idx += len(hf_tokenizer.tokenize(word))

    return word_labels
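
A usage sketch (not part of the original snippet): the checkpoint, words, and labels are assumptions, and the (token, label) pairs are built from the tokenizer's own word_ids() so the sub-word split always lines up.

from transformers import AutoTokenizer

hf_tokenizer = AutoTokenizer.from_pretrained("roberta-base", add_prefix_space=True)
words, labels = ["Hugging", "Face", "rocks"], ["B-ORG", "I-ORG", "O"]

# build (token, label) pairs: every sub-token inherits its word's label
enc = hf_tokenizer(words, is_split_into_words=True)
toks = hf_tokenizer.convert_ids_to_tokens(enc["input_ids"])
tok_labels = [
    (tok, labels[word_id])
    for tok, word_id in zip(toks, enc.word_ids())
    if word_id is not None  # drop <s> and </s>
]

word_labels = get_word_labels_from_token_labels("roberta", hf_tokenizer, tok_labels)
print(word_labels)  # -> [('Hugging', 'B-ORG'), ('Face', 'I-ORG'), ('rocks', 'O')]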
Example #2
    def __init__(self,
                 vocab_file,
                 tokenizer_file=None,
                 eos_token="</s>",
                 unk_token="<unk>",
                 pad_token="<pad>",
                 extra_ids=100,
                 vis_extra_ids=100,
                 additional_special_tokens=None,
                 **kwargs):
        # Add extra_ids to the special token list
        if extra_ids > 0 and additional_special_tokens is None:
            additional_special_tokens = [
                "<extra_id_{}>".format(i) for i in range(extra_ids)
            ]
        elif extra_ids > 0 and additional_special_tokens is not None:
            # Check that we have the right number of extra_id special tokens
            extra_tokens = len(
                set(
                    filter(lambda x: bool("extra_id" in x),
                           additional_special_tokens)))
            if extra_tokens != extra_ids:
                raise ValueError(
                    f"Both extra_ids ({extra_ids}) and additional_special_tokens ({additional_special_tokens}) are provided to T5Tokenizer. "
                    "In this case the additional_special_tokens must include the extra_ids tokens"
                )

        if vis_extra_ids > 0:
            # Guard against `None` (e.g., when extra_ids == 0 and no tokens were passed in)
            if additional_special_tokens is None:
                additional_special_tokens = []
            additional_special_tokens.extend(
                ["<vis_extra_id_{}>".format(i) for i in range(vis_extra_ids)])

        slow_tokenizer = self.slow_tokenizer_class(
            vocab_file,
            tokenizer_file=tokenizer_file,
            eos_token=eos_token,
            unk_token=unk_token,
            pad_token=pad_token,
            extra_ids=extra_ids,
            vis_extra_ids=vis_extra_ids,
            # additional_special_tokens=additional_special_tokens,
            **kwargs)
        fast_tokenizer = convert_slow_vlt5tokenizer(slow_tokenizer)
        self._tokenizer = fast_tokenizer

        PreTrainedTokenizerBase.__init__(
            self,
            tokenizer_file=tokenizer_file,
            eos_token=eos_token,
            unk_token=unk_token,
            pad_token=pad_token,
            extra_ids=extra_ids,
            vis_extra_ids=vis_extra_ids,
            additional_special_tokens=additional_special_tokens,
            **kwargs,
        )

        self.vocab_file = vocab_file
        self._extra_ids = extra_ids
        self._vis_extra_ids = vis_extra_ids
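
A rough illustration of the same sentinel-token scheme, using the stock T5 fast tokenizer as a stand-in (the VL-T5 classes above are not shown here, so the checkpoint and counts are assumptions): the <extra_id_*> tokens ship with T5, and the visual <vis_extra_id_*> tokens are appended as additional special tokens.

from transformers import T5TokenizerFast

tokenizer = T5TokenizerFast.from_pretrained("t5-small")  # already carries 100 <extra_id_*> tokens
vis_extra_ids = 100
tokenizer.add_special_tokens({
    "additional_special_tokens": tokenizer.additional_special_tokens
    + ["<vis_extra_id_{}>".format(i) for i in range(vis_extra_ids)]
})
print(tokenizer.convert_tokens_to_ids("<vis_extra_id_0>"))  # id of a freshly added token
# if these tokens are used in training, also call model.resize_token_embeddings(len(tokenizer))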
Example #3
def batch_encode_pretokenized(tokenizer: transformers.PreTrainedTokenizerBase,
                              tokenized_inputs: List[List[str]],
                              tokenized_pair_inputs: Optional[List[
                                  List[str]]] = None,
                              tensor_type="tf",
                              **kw) -> transformers.BatchEncoding:
    """Batch encode pre-tokenized text, without further splitting.

  This is necessary because tokenizer(..., is_split_into_words=True) doesn't
  guarantee that tokens will stay intact - only that the final tokens will not
  span the given boundaries. If the tokenizer is called directly, you'll get
  things like: "foo" "##bar" -> "foo" "#" "#" "bar"

  Based on the implementation of batch_encode_plus in
  https://github.com/huggingface/transformers/blob/v4.1.1/src/transformers/tokenization_utils_base.py#L2489

  Args:
    tokenizer: Transformers tokenizer
    tokenized_inputs: list of tokenized inputs
    tokenized_pair_inputs: (optional) list of tokenized second-segment inputs
    tensor_type: tensor type to return
    **kw: additional args, forwarded to tokenizer.prepare_for_model

  Returns:
    BatchEncoding, suitable for model input
  """
    encoded_input = {}
    tokenized_pair_inputs = (tokenized_pair_inputs
                             or [None] * len(tokenized_inputs))
    for tokens, pair_tokens in zip(tokenized_inputs, tokenized_pair_inputs):
        ids = tokenizer.convert_tokens_to_ids(tokens)
        pair_ids = (tokenizer.convert_tokens_to_ids(pair_tokens)
                    if pair_tokens is not None else None)
        encoded = tokenizer.prepare_for_model(ids,
                                              pair_ids=pair_ids,
                                              add_special_tokens=True,
                                              padding="do_not_pad",
                                              truncation="longest_first",
                                              return_attention_mask=False,
                                              pad_to_multiple_of=False,
                                              **kw)
        for k, v in encoded.items():
            encoded_input.setdefault(k, []).append(v)

    encoded_input = tokenizer.pad(encoded_input,
                                  padding="longest",
                                  return_attention_mask=True)
    return transformers.BatchEncoding(encoded_input, tensor_type=tensor_type)
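
A usage sketch (the checkpoint is an assumption; tensor_type="np" keeps the example free of a TensorFlow dependency):

import transformers

tokenizer = transformers.AutoTokenizer.from_pretrained("bert-base-uncased")
tokenized_inputs = [
    tokenizer.tokenize("The quick brown fox jumps over the lazy dog."),
    tokenizer.tokenize("Tokenization should keep wordpieces intact."),
]
batch = batch_encode_pretokenized(tokenizer, tokenized_inputs, tensor_type="np")
print(batch["input_ids"].shape)  # (2, longest_sequence_length), padded after encoding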
Example #4
    def convert_to_features(
        examples: Any,
        tokenizer: PreTrainedTokenizerBase,
        padding: str,
        max_source_length: int,
        max_target_length: int,
        src_text_column_name: str,
        tgt_text_column_name: str,
    ):
        translations = examples[
            "translation"]  # Extract translations from dict

        def extract_text(lang):
            return [text[lang] for text in translations]

        src_texts = extract_text(src_text_column_name)
        src_texts = ["Translate from source text: " + src for src in src_texts]

        encoded_results = tokenizer.prepare_seq2seq_batch(
            src_texts=src_texts,
            tgt_texts=extract_text(tgt_text_column_name),
            max_length=max_source_length,
            max_target_length=max_target_length,
            padding=padding,
        )
        return encoded_results
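
A usage sketch with a hypothetical one-item batch; prepare_seq2seq_batch is deprecated in recent transformers releases, so this assumes a version where it still exists and that the method above is reachable in scope.

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("t5-small")
examples = {"translation": [{"en": "Hello world", "de": "Hallo Welt"}]}
features = convert_to_features(
    examples,
    tokenizer,
    padding="max_length",
    max_source_length=32,
    max_target_length=32,
    src_text_column_name="en",
    tgt_text_column_name="de",
)
print(list(features.keys()))  # typically ['input_ids', 'attention_mask', 'labels']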
Example #5
def get_token_labels_from_input_ids(
    # A Hugging Face tokenizer
    hf_tokenizer: PreTrainedTokenizerBase,
    # List of input_ids for the tokens in a single piece of processed text
    input_ids: List[int],
    # List of label indices for each token
    token_label_ids: List[int],
    # List of label names from which the `label` indices can be used to find the name of the label
    vocab: List[str],
    # The token ID that should be ignored when calculating the loss
    ignore_token_id: int = CrossEntropyLossFlat().ignore_index,
    # The token used to identify ignored tokens (default: [xIGNx])
    ignore_token: str = "[xIGNx]",
) -> List[Tuple[str, str]]:
    """
    Given a list of input IDs, the label ID associated to each, and the labels vocab, this method will return a list of tuples whereby
    each tuple defines the "token" and its label name. For example:
    [('ĠWay', B-PER), ('de', B-PER), ('ĠGill', I-PER), ('iam', I-PER), ('Ġloves', O), ('ĠHug', B-ORG), ('ging', B-ORG), ('ĠFace', I-ORG)]
    """
    # convert ids to tokens
    toks = hf_tokenizer.convert_ids_to_tokens(input_ids)
    # align "tokens" with labels
    tok_labels = [
        (tok, ignore_token if label_id == ignore_token_id else vocab[label_id])
        for tok_id, tok, label_id in zip(input_ids, toks, token_label_ids)
        if tok_id not in hf_tokenizer.all_special_ids
    ]
    return tok_labels
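
A usage sketch (checkpoint, label vocab, and the -100 ignore id are assumptions consistent with the defaults above); the word_ids()-based label construction mirrors the Example #1 sketch, and the output is exactly what get_word_labels_from_token_labels consumes.

from transformers import AutoTokenizer

hf_tokenizer = AutoTokenizer.from_pretrained("roberta-base", add_prefix_space=True)
vocab = ["O", "B-ORG", "I-ORG"]
word_label_ids = [1, 2, 0]  # labels for ["Hugging", "Face", "rocks"]

enc = hf_tokenizer(["Hugging", "Face", "rocks"], is_split_into_words=True)
# every sub-token gets its word's label id; special tokens get the ignore id
token_label_ids = [
    -100 if word_id is None else word_label_ids[word_id] for word_id in enc.word_ids()
]
tok_labels = get_token_labels_from_input_ids(
    hf_tokenizer, enc["input_ids"], token_label_ids, vocab, ignore_token_id=-100
)
print(tok_labels)  # e.g. [('ĠHug', 'B-ORG'), ('ging', 'B-ORG'), ('ĠFace', 'I-ORG'), ('Ġrocks', 'O')]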
Example #6
def get_tokens_and_offsets(
        text: str,
        tokenizer: PreTrainedTokenizerBase) -> List[Tuple[Any, int, int]]:
    tokens = tokenizer.tokenize(text)
    token_lens = [len(token) for token in tokens]
    token_lens[0] -= 1  # Ignore the leading "▁" marker on the first token
    token_ends = np.cumsum(token_lens)
    token_starts = [0] + token_ends[:-1].tolist()
    tokens_and_offsets = list(zip(tokens, token_starts, token_ends))
    return tokens_and_offsets
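
A usage sketch; the offset arithmetic assumes a sentencepiece tokenizer (T5 here) whose first token carries the leading "▁" marker, and numpy is imported because the function above relies on np.

import numpy as np  # required by get_tokens_and_offsets
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("t5-small")
text = "Hello world"
for token, start, end in get_tokens_and_offsets(text, tokenizer):
    print(token, repr(text[start:end]))  # e.g. ▁Hello 'Hello' / ▁world ' world'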
Example #7
def blurr_sort_func(
    example,
    # A Hugging Face tokenizer
    hf_tokenizer: PreTrainedTokenizerBase,
    # The `is_split_into_words` argument applied to your `hf_tokenizer` during tokenization. Set this to `True`
    # if your inputs are pre-tokenized (not numericalized)
    is_split_into_words: bool = False,
    # Any other keyword arguments you want to include during tokenization
    tok_kwargs: dict = {},
):
    """This method is used by the `SortedDL` to ensure your dataset is sorted *after* tokenization"""
    txt = example[0]["text"] if isinstance(example[0], dict) else example[0]
    return len(txt) if is_split_into_words else len(
        hf_tokenizer.tokenize(txt, **tok_kwargs))
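
A usage sketch (checkpoint and toy samples are assumptions); fastai's SortedDL would call this per example, but plain sorted() shows the effect.

from transformers import AutoTokenizer

hf_tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
samples = [
    ("a much, much longer piece of text than the one below",),
    ("short text",),
]
samples = sorted(samples, key=lambda ex: blurr_sort_func(ex, hf_tokenizer))
print([s[0] for s in samples])  # shortest tokenized example first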
Example #8
def get_hf_objects(
    pretrained_model_name_or_path: Optional[Union[str, os.PathLike]],
    model_cls: PreTrainedModel,
    config: Union[PretrainedConfig, str, os.PathLike] = None,
    tokenizer_cls: PreTrainedTokenizerBase = None,
    config_kwargs: dict = {},
    tokenizer_kwargs: dict = {},
    model_kwargs: dict = {},
    cache_dir: Union[str, os.PathLike] = None
) -> Tuple[str, PretrainedConfig, PreTrainedTokenizerBase, PreTrainedModel]:
    """
    Given at minimum a `pretrained_model_name_or_path` and a `model_cls` (such as
    `AutoModelForSequenceClassification`), this method returns all the Hugging Face objects you need to train
    a model using Blurr.
    """
    # config
    if config is None:
        config = AutoConfig.from_pretrained(pretrained_model_name_or_path,
                                            cache_dir=cache_dir,
                                            **config_kwargs)

    # tokenizer (gpt2, roberta, bart (and maybe others) tokenizers require a prefix space)
    if any(s in pretrained_model_name_or_path
           for s in ["gpt2", "roberta", "bart", "longformer"]):
        tokenizer_kwargs = {**{"add_prefix_space": True}, **tokenizer_kwargs}

    if tokenizer_cls is None:
        tokenizer = AutoTokenizer.from_pretrained(
            pretrained_model_name_or_path,
            cache_dir=cache_dir,
            **tokenizer_kwargs)
    else:
        tokenizer = tokenizer_cls.from_pretrained(
            pretrained_model_name_or_path,
            cache_dir=cache_dir,
            **tokenizer_kwargs)

    # model
    model = model_cls.from_pretrained(pretrained_model_name_or_path,
                                      config=config,
                                      cache_dir=cache_dir,
                                      **model_kwargs)

    # arch
    try:
        arch = model.__module__.split(".")[2]
    except Exception:
        arch = "unknown"

    return (arch, config, tokenizer, model)
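
A usage sketch (the checkpoint name and num_labels are assumptions):

from transformers import AutoModelForSequenceClassification

arch, config, hf_tokenizer, hf_model = get_hf_objects(
    "distilroberta-base",
    model_cls=AutoModelForSequenceClassification,
    config_kwargs={"num_labels": 2},
)
print(arch)  # e.g. "roberta"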
Example #9
    def convert_to_features(
        examples: Any,
        tokenizer: PreTrainedTokenizerBase,
        padding: str,
        max_source_length: int,
        max_target_length: int,
        src_text_column_name: str,
        tgt_text_column_name: str,
    ):
        encoded_results = tokenizer.prepare_seq2seq_batch(
            src_texts=examples[src_text_column_name],
            tgt_texts=examples[tgt_text_column_name],
            max_length=max_source_length,
            max_target_length=max_target_length,
            padding=padding,
        )
        return encoded_results
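
A usage sketch for this flat-column variant (column names and texts are assumptions; unlike Example #4, the source/target columns are read directly rather than from a nested "translation" dict, and the same prepare_seq2seq_batch deprecation caveat applies).

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("t5-small")
examples = {"src": ["Hello world"], "tgt": ["Hallo Welt"]}
features = convert_to_features(
    examples, tokenizer, padding="longest",
    max_source_length=32, max_target_length=32,
    src_text_column_name="src", tgt_text_column_name="tgt",
)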
Example #10
    def __init__(self,
                 tokenizer: PreTrainedTokenizerBase,
                 file_path: str,
                 block_size: int = 512,
                 overwrite_cache=False):
        super(TextDataset, self).__init__()
        self.path = file_path
        assert os.path.isfile(file_path)

        directory, filename = os.path.split(file_path)
        cached_features_file = os.path.join(
            directory,
            "cached_lm_{}_{}_{}".format(
                tokenizer.__class__.__name__,
                str(block_size),
                filename,
            ),
        )

        # Make sure only the first process in distributed training processes the dataset,
        # and the others will use the cache.
        lock_path = cached_features_file + ".lock"
        with FileLock(lock_path):
            if os.path.exists(cached_features_file) and not overwrite_cache:
                start = time.time()
                with open(cached_features_file, "rb") as handle:
                    self.data = pickle.load(handle)
                logger.info(
                    "Loading features from cached file %s [took %.3f s]",
                    cached_features_file,
                    time.time() - start)
            else:
                logger.info(
                    f"Creating features from dataset file at {directory}")

                self.data = []
                with open(file_path, encoding="utf-8") as f:
                    for each_line in f:
                        obj = json.loads(each_line)
                        tokenized_source = tokenizer.encode(
                            obj['source'],
                            truncation=True,
                            max_length=block_size,
                            padding=True)
                        tokenized_target = tokenizer.encode(
                            obj['target'],
                            truncation=True,
                            max_length=block_size,
                            padding=True)
                        self.data.append((tokenized_source, tokenized_target))

                # Note that we are losing the last truncated example here for the sake of simplicity (no padding)
                # If your dataset is small, first you should look for a bigger one :-) and second you
                # can change this behavior by adding (model specific) padding.

                start = time.time()
                with open(cached_features_file, "wb") as handle:
                    pickle.dump(self.data,
                                handle,
                                protocol=pickle.HIGHEST_PROTOCOL)
                logger.info(
                    "Saving features into cached file %s [took %.3f s]",
                    cached_features_file,
                    time.time() - start)
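
A usage sketch (the file path and its JSON-lines layout, one {"source": ..., "target": ...} object per line, are assumptions about the expected input; the surrounding module is assumed to define the full TextDataset class plus its imports such as FileLock, json, pickle, and logger).

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("facebook/bart-base")
dataset = TextDataset(tokenizer, file_path="data/train.jsonl", block_size=512)
source_ids, target_ids = dataset.data[0]  # one (source, target) pair of token id lists
print(len(dataset.data), len(source_ids))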