Example #1
from typing import List, Tuple

from transformers import PreTrainedTokenizerBase


def get_word_labels_from_token_labels(
    # The name of the Hugging Face transformer architecture being used (e.g., "bert", "canine")
    hf_arch: str,
    # A Hugging Face tokenizer
    hf_tokenizer: PreTrainedTokenizerBase,
    # A list of tuples, where each represents a token and its label (e.g., [('ĠHug', 'B-ORG'), ('ging', 'B-ORG'), ('ĠFace', 'I-ORG'), ...])
    tok_labels: List[Tuple[str, str]],
) -> List[Tuple[str, str]]:
    """
    Given a list of tuples where each tuple defines a token and its label, return a list of tuples whereby each tuple defines the
    "word" and its label. Method assumes that model inputs are a list of words, and in conjunction with the `align_labels_with_tokens` method,
    allows the user to reconstruct the orginal raw inputs and labels.
    """
    # recreate raw words list (we assume for token classification that the input is a list of words)
    words = hf_tokenizer.convert_tokens_to_string(
        [tok_label[0] for tok_label in tok_labels]).split()

    if hf_arch == "canine":
        # CANINE tokenizes at the character level, so keep each word's trailing space to account for the space tokens
        word_list = [f"{word} " for word in words]
    else:
        word_list = [word for word in words]

    # align "words" with labels: each word takes the label of its first token, then we skip past the rest of that word's subword tokens
    word_labels, idx = [], 0
    for word in word_list:
        word_labels.append((word, tok_labels[idx][1]))
        idx += len(hf_tokenizer.tokenize(word))

    return word_labels
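
A minimal, hypothetical usage sketch (the checkpoint name, the token/label pairs, and the expected output are illustrative assumptions, not taken from the original):

from transformers import AutoTokenizer

hf_tokenizer = AutoTokenizer.from_pretrained("roberta-base")  # assumed checkpoint, for illustration only
tok_labels = [("ĠHug", "B-ORG"), ("ging", "B-ORG"), ("ĠFace", "I-ORG"), ("Ġrocks", "O")]
word_labels = get_word_labels_from_token_labels("roberta", hf_tokenizer, tok_labels)
# word_labels should look roughly like [('Hugging', 'B-ORG'), ('Face', 'I-ORG'), ('rocks', 'O')],
# although the exact alignment depends on how the tokenizer re-tokenizes each reconstructed word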
Example #2
from typing import Any, List, Tuple

import numpy as np
from transformers import PreTrainedTokenizerBase


def get_tokens_and_offsets(
        text: str,
        tokenizer: PreTrainedTokenizerBase) -> List[Tuple[Any, int, int]]:
    """Tokenize `text` and return each token together with its (start, end) character offsets, computed from cumulative token lengths."""
    tokens = tokenizer.tokenize(text)
    token_lens = [len(token) for token in tokens]
    token_lens[0] -= 1  # the leading "▁" on the first token (the SentencePiece word-boundary marker) has no counterpart in `text`
    token_ends = np.cumsum(token_lens)
    token_starts = [0] + token_ends[:-1].tolist()
    tokens_and_offsets = list(zip(tokens, token_starts, token_ends))
    return tokens_and_offsets
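
An illustrative call, assuming a SentencePiece-based tokenizer (the checkpoint name is an assumption made for the example):

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-base")  # assumed SentencePiece-style tokenizer
for token, start, end in get_tokens_and_offsets("Hello world", tokenizer):
    print(token, start, end)
# likely output:
# ▁Hello 0 5
# ▁world 5 11
# after the first token, each "▁" accounts for the space preceding the word in `text`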
Example #3
from transformers import PreTrainedTokenizerBase


def blurr_sort_func(
    example,
    # A Hugging Face tokenizer
    hf_tokenizer: PreTrainedTokenizerBase,
    # The `is_split_into_words` argument applied to your `hf_tokenizer` during tokenization. Set this to `True`
    # if your inputs are pre-tokenized (not numericalized)
    is_split_into_words: bool = False,
    # Any other keyword arguments you want to include during tokenization
    tok_kwargs: dict = {},
):
    """This method is used by the `SortedDL` to ensure your dataset is sorted *after* tokenization"""
    txt = example[0]["text"] if isinstance(example[0], dict) else example[0]
    return len(txt) if is_split_into_words else len(
        hf_tokenizer.tokenize(txt, **tok_kwargs))
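
A sketch of how this sort function could be handed to fastai's `SortedDL` via `partial`; the tokenizer checkpoint and the tiny in-memory dataset are assumptions made for illustration:

from functools import partial

from fastai.text.all import SortedDL
from transformers import AutoTokenizer

hf_tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")  # assumed checkpoint
toy_items = [("a short example", 0), ("a considerably longer example sentence for sorting", 1)]  # stand-in dataset
dl = SortedDL(toy_items, sort_func=partial(blurr_sort_func, hf_tokenizer=hf_tokenizer), bs=2)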