sentence

# In[12]:

tokens = tokenizer(sentence)
tokens

# In[13]:

# wrap the raw token strings in allennlp Token objects
input_toks = [Token(w) for w in tokens]
input_toks

# In[14]:

token_indexer.tokens_to_indices(input_toks, global_vocab, "tokens")

# In[15]:

bert_input = token_indexer.tokens_to_indices(input_toks, global_vocab,
                                             "tokens")

# In[16]:

# left-align the wordpiece ids in a zero-padded tensor of length config.max_seq_len
token_ids = torch.zeros(1, config.max_seq_len, dtype=torch.long)
token_ids[0, :len(bert_input["tokens"])] = torch.LongTensor(
    bert_input["tokens"])

# In[17]:

token_ids
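
# In[ ]:

# Sanity check (a sketch, assuming `token_indexer` is the PretrainedBertIndexer
# from the earlier cells): invert its wordpiece vocab (token -> id) and map the
# non-padding ids back to wordpieces.
id_to_wordpiece = {v: k for k, v in token_indexer.vocab.items()}
[id_to_wordpiece[int(i)] for i in token_ids[0] if int(i) != 0]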

# In[ ]:

# The manual steps above, wrapped into a reusable preprocessor class. The
# imports below assume the allennlp 0.8.x/0.9.x API used throughout;
# `flatten`, `spacy_tok`, and `TokenizationError` are project-level helpers
# assumed to be defined in earlier cells.
from typing import Iterable, List

import torch
from allennlp.data.token_indexers import PretrainedBertIndexer
from allennlp.data.tokenizers import Token
from allennlp.data.vocabulary import Vocabulary


class BertPreprocessor:
    def __init__(self, model_type: str, max_seq_len: int = 128):
        self.model_type = model_type
        self.max_seq_len = max_seq_len
        self.token_indexer = PretrainedBertIndexer(
            pretrained_model=self.model_type,
            max_pieces=self.max_seq_len,
            do_lowercase=True,
        )
        self.vocab = Vocabulary()
        self.token_indexer._add_encoding_to_vocabulary(self.vocab)
        # reverse mapping: wordpiece id -> wordpiece string
        self.full_vocab = {v: k for k, v in self.token_indexer.vocab.items()}

    def tokenize(self, x: str) -> List[Token]:
        # spacy-tokenize, wordpiece-split each word, truncate to max_seq_len
        return [
            Token(w) for w in flatten([
                self.token_indexer.wordpiece_tokenizer(w) for w in spacy_tok(x)
            ])[:self.max_seq_len]
        ]

    def index_to_token(self, idx: int) -> str:
        return self.full_vocab[idx]

    def indices_to_tokens(self, indices: Iterable[int]) -> List[str]:
        return [self.index_to_token(x) for x in indices]

    def token_to_index(
        self,
        token: str,
        accept_wordpiece: bool = False,
    ) -> int:
        wordpieces = self.tokenize(token)
        if len(wordpieces) > 1 and not accept_wordpiece:
            raise TokenizationError(f"{token} is not a single wordpiece")
        else:
            token = wordpieces[0].text
        return self.token_indexer.vocab[token]

    # Position of `word` within the BERT input for `sentence`, accounting for
    # the [CLS] token prepended by the indexer.
    def get_index(self,
                  sentence: str,
                  word: str,
                  accept_wordpiece: bool = False,
                  last: bool = False) -> int:
        toks = self.tokenize(sentence)
        wordpieces = self.tokenize(word)
        if len(wordpieces) > 1 and not accept_wordpiece:
            raise TokenizationError(f"{word} is not a single wordpiece")
        else:
            word = wordpieces[0].text  # use first wordpiece

        if not last:
            for i, t in enumerate(toks):
                if t.text == word:
                    return i + 1  # take the [CLS] token into account
        else:
            for i, t in enumerate(reversed(toks)):
                if t.text == word:
                    return len(toks) - i  # len(toks) - 1 - i, plus 1 for [CLS]
        raise ValueError(f"Token {word} not found in tokens {toks}")

    def to_bert_model_input(self, input_sentence: str) -> torch.Tensor:
        input_toks = self.tokenize(input_sentence)
        batch = self.token_indexer.tokens_to_indices(input_toks, self.vocab,
                                                     "tokens")
        token_ids = torch.LongTensor(batch["tokens"]).unsqueeze(0)
        return token_ids
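
# In[ ]:

# Usage sketch: the model name and sentence are illustrative only; running it
# downloads the pretrained BERT vocabulary via allennlp and requires the
# `spacy_tok`/`flatten` helpers mentioned above.
processor = BertPreprocessor("bert-base-uncased", max_seq_len=64)
ids = processor.to_bert_model_input("The quick brown fox jumps over the lazy dog.")
ids.shape, processor.indices_to_tokens(ids[0].tolist())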