from typing import Dict, Tuple

from allennlp.common.util import END_SYMBOL
from allennlp.data.fields import TextField
from allennlp.data.token_indexers import TokenIndexer
from allennlp.data.tokenizers import Token, Tokenizer


def string_to_fields(string: str, tokenizer: Tokenizer,
                     token_indexers: Dict[str, TokenIndexer]
                     ) -> Tuple[TextField, TextField]:
    tokenized_string = tokenizer.tokenize(string)
    # prepend the special end symbol as a sequence-boundary marker
    # (note: END_SYMBOL is used here, not a separate start symbol)
    tokenized_string.insert(0, Token(END_SYMBOL))
    field = TextField(tokenized_string, token_indexers)

    # TODO: always use a single-id token indexer and the default/BPE tokenizer
    # here, because BERT/ELMo will be passed in for the main string.
    # `golden_tokenizer` and `golden_token_indexers` are module-level globals.
    tokenized_golden_string = golden_tokenizer.tokenize(string)
    tokenized_golden_string.append(
        Token(END_SYMBOL))  # with EOS at the end for the loss computation
    field_golden = TextField(tokenized_golden_string, golden_token_indexers)

    return field, field_golden
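A minimal usage sketch (not part of the original module): it assumes AllenNLP's WhitespaceTokenizer and SingleIdTokenIndexer, and it supplies the module-level golden_tokenizer / golden_token_indexers that string_to_fields reads.

from allennlp.data.token_indexers import SingleIdTokenIndexer
from allennlp.data.tokenizers import WhitespaceTokenizer

# Assumed module-level setup for the globals used above.
golden_tokenizer = WhitespaceTokenizer()
golden_token_indexers = {"tokens": SingleIdTokenIndexer()}

field, field_golden = string_to_fields(
    "the quick brown fox",
    tokenizer=WhitespaceTokenizer(),
    token_indexers={"tokens": SingleIdTokenIndexer()},
)
print(len(field.tokens), len(field_golden.tokens))  # 5 5 (4 words + 1 special symbol each)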
Example n. 2
    def _check_start_end_tokens(self, start_symbol: str, end_symbol: str,
                                tokenizer: Tokenizer) -> None:
        """Check that `tokenizer` correctly appends `start_symbol` and `end_symbol` to the
        sequence without splitting them. Raises a `ValueError` if this is not the case.
        """

        tokens = tokenizer.tokenize(start_symbol + " " + end_symbol)
        err_msg = (
            f"Bad start or end symbol ('{start_symbol}', '{end_symbol}') "
            f"for tokenizer {self._source_tokenizer}")
        try:
            start_token, end_token = tokens[0], tokens[-1]
        except IndexError:
            raise ValueError(err_msg)
        if start_token.text != start_symbol or end_token.text != end_symbol:
            raise ValueError(err_msg)
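A small sketch of what this check guards against (assuming AllenNLP's WhitespaceTokenizer and CharacterTokenizer, not part of the original class): a whitespace tokenizer keeps the special symbols intact, while a character-level tokenizer splits them apart and would trigger the ValueError above.

from allennlp.common.util import END_SYMBOL, START_SYMBOL
from allennlp.data.tokenizers import CharacterTokenizer, WhitespaceTokenizer

# A whitespace tokenizer leaves '@start@' and '@end@' as single tokens ...
good = WhitespaceTokenizer().tokenize(START_SYMBOL + " " + END_SYMBOL)
assert good[0].text == START_SYMBOL and good[-1].text == END_SYMBOL

# ... while a character tokenizer splits them, which is exactly the
# situation _check_start_end_tokens raises ValueError for.
bad = CharacterTokenizer().tokenize(START_SYMBOL + " " + END_SYMBOL)
assert bad[0].text != START_SYMBOL  # '@' instead of '@start@'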
Example n. 3
from typing import List, Optional, Tuple, Union

from allennlp.data.tokenizers import Token, Tokenizer


def token_alignment(
    data_tokens: Union[List[str], List[Token]],
    model_tokenizer: Tokenizer,
    start_tokens: Optional[List[str]] = None,
    end_tokens: Optional[List[str]] = None,
) -> Tuple[List[Token], List[Tuple[int, int]]]:
    """Aligns word tokens (data_tokens) with sub-word tokens.

    The Tokens in data_tokens may or may not be split into sub-words by
    model_tokenizer, e.g. if it's a tokenizer for a model like BERT.

    This method returns:
    (a) the tokens produced by model_tokenizer
        with optional start_tokens (e.g. [CLS]) and end_tokens (e.g. [SEP])
    (b) a list of spans: (start, inclusive-end) pairs giving, for each word,
        the span of sub-word token indices it was split into
    """
    model_tokens = []
    data_to_model_map = []

    if start_tokens:
        model_tokens.extend([Token(t) for t in start_tokens])

    for token in data_tokens:
        # record where in model_tokens this data token's sub-words start
        data_to_model_map.append(len(model_tokens))
        # if data_tokens is a list of Tokens, get the text out by using the
        # .text attribute
        if hasattr(token, 'text'):
            token = token.text
        model_tokens.extend(model_tokenizer.tokenize(token))

    data_to_model_map.append(len(model_tokens))
    data_to_model_map = [(data_to_model_map[i], data_to_model_map[i+1]-1)
                         for i in range(len(data_to_model_map)-1)]

    if end_tokens:
        model_tokens.extend([Token(t) for t in end_tokens])

    return model_tokens, data_to_model_map
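A usage sketch (not from the original source) using AllenNLP's CharacterTokenizer as a stand-in for a real sub-word tokenizer, so the alignment is easy to read off:

from allennlp.data.tokenizers import CharacterTokenizer

tokens, spans = token_alignment(
    ["hi", "there"], CharacterTokenizer(),
    start_tokens=["[CLS]"], end_tokens=["[SEP]"],
)
# tokens: [CLS] h i t h e r e [SEP]
# spans:  [(1, 2), (3, 7)]  -- "hi" -> sub-tokens 1..2, "there" -> sub-tokens 3..7

Note that the span indices count the optional start tokens, so index 0 here is [CLS].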
Example n. 4
from typing import List
from allennlp.data.tokenizers import Tokenizer

def tokenize_to_string(text: str, tokenizer: Tokenizer) -> List[str]:
    """Tokenize `text` and return just the token strings."""
    return [token.text for token in tokenizer.tokenize(text)]
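For example (assuming AllenNLP's WhitespaceTokenizer; any Tokenizer implementation works):

from allennlp.data.tokenizers import WhitespaceTokenizer

print(tokenize_to_string("a simple test", WhitespaceTokenizer()))  # ['a', 'simple', 'test']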