Example 1
 def text_to_tokens_mask(self, pair, Y=None, context=None):
     out_gen = self._text_to_ids(pair, pad_token=self.config.pad_token)
     for i, out in enumerate(out_gen):
         if context is None:
             feats = {"tokens": out.token_ids, "mask": out.mask}
         else:
             num_answers = len(out.tokens)
             tokenized_context = []
             for answer_idx in range(num_answers):
                 out_instance = ArrayEncodedOutput(
                     token_ids=out.token_ids[answer_idx],
                     tokens=out.token_ids[answer_idx],
                     labels=None,
                     char_locs=out.char_locs,
                     mask=out.mask[answer_idx],
                 )
                 context_instance = context[0] + context[answer_idx + 1]
                 tokenized_context.append(
                     tokenize_context(context_instance, out_instance,
                                      self.config))
             feats = {
                 "tokens": out.token_ids,
                 "mask": out.mask,
                 "context": tokenized_context
             }
         if Y is None:
             yield feats
         else:
             yield feats, self.label_encoder.transform([Y])[0]
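
A minimal sketch of the per-answer context pairing performed in the loop above: context[0] holds the question's context and context[answer_idx + 1] the context of the answer currently being encoded, and the two lists are concatenated per answer. The toy dicts below are made up for illustration and are not passed through the real tokenize_context.

question_context = [{"token": "Q1", "start": 0, "end": 2, "left": 0}]
answer_contexts = [
    [{"token": "A", "start": 3, "end": 4, "left": 10}],
    [{"token": "B", "start": 3, "end": 4, "left": 20}],
]

# context[0] is the question's context; context[answer_idx + 1] belongs to
# the answer currently being encoded.
context = [question_context] + answer_contexts
for answer_idx in range(len(answer_contexts)):
    context_instance = context[0] + context[answer_idx + 1]
    print(answer_idx, context_instance)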
Example 2
    def _array_format(self, encoded_output, pad_token=None):
        """
        Returns numpy array of token idxs and corresponding mask
        Returned `x` array contains two channels:
            0: byte-pair encoding embedding
            1: positional embedding
        """
        seq_length = len(encoded_output.token_ids)
        x = np.zeros((self.config.max_length, 2), dtype=np.int32)
        if self.config.base_model.__name__ == "RoBERTa":
            x += 1
        mask = np.zeros((self.config.max_length), dtype=np.float32)

        if encoded_output.labels is not None:
            labels_arr = np.empty((self.config.max_length), dtype="object")
            labels_arr.fill((pad_token or self.config.pad_token))
        else:
            labels_arr = None

        # BPE embedding
        x[:seq_length, 0] = encoded_output.token_ids
        # masking: value of 1 means "consider this in cross-entropy LM loss"
        mask[1:seq_length] = 1
        if encoded_output.labels:
            labels_arr[:seq_length] = encoded_output.labels

        # positional_embeddings
        x[:, 1] = np.arange(
            self.text_encoder.vocab_size,
            self.text_encoder.vocab_size + self.config.max_length,
        )

        # roberta uses different positional embedding structure
        if self.config.base_model.__name__ == "RoBERTa":
            mask = np.zeros((self.config.max_length), dtype=np.float32)
            mask[0:seq_length] = 1
            positions = np.cumsum(mask, dtype=np.int32)
            positions += 1  # add padding idx because RoBERTa's pos embeds depend on it
            positions += (
                self.text_encoder.vocab_size + 1
            )  # + 1 to include unused mask token in embedding layer
            x[:, 1] = positions

        output = ArrayEncodedOutput(
            token_ids=x,
            tokens=encoded_output.tokens,
            labels=labels_arr,
            char_locs=encoded_output.char_locs,
            mask=mask,
        )
        return output
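
A standalone sketch of the two-channel layout built by _array_format above. The constants (MAX_LENGTH, VOCAB_SIZE) and the toy token ids are assumptions; the point is only that channel 0 holds BPE ids padded to max_length and channel 1 holds position ids offset past the vocabulary.

import numpy as np

MAX_LENGTH = 8          # stand-in for config.max_length
VOCAB_SIZE = 40478      # stand-in for text_encoder.vocab_size
token_ids = [40478, 101, 102, 103, 40480]   # toy encoded sequence
seq_length = len(token_ids)

x = np.zeros((MAX_LENGTH, 2), dtype=np.int32)
x[:seq_length, 0] = token_ids                              # channel 0: BPE ids
x[:, 1] = np.arange(VOCAB_SIZE, VOCAB_SIZE + MAX_LENGTH)   # channel 1: positions

mask = np.zeros(MAX_LENGTH, dtype=np.float32)
mask[1:seq_length] = 1   # skip position 0, ignore padding

print(x)
print(mask)

In the RoBERTa branch of the method above, channel 1 is instead recomputed from a cumulative sum over the mask and shifted to account for the padding index and the unused mask token.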
Example 3
 def text_to_tokens_mask(self, pair, Y=None, context=None):
     out_gen = self._text_to_ids(pair, pad_token=self.config.pad_token)
     for i, out in enumerate(out_gen):
         if context is None:
             feats = {"tokens": out.token_ids, "mask": out.mask}
         else:
             out_forward = ArrayEncodedOutput(
                 token_ids=out.token_ids[0],
                 tokens=out.token_ids[0],
                 labels=None,
                 char_locs=out.char_locs,
                 mask=out.mask[0],
             )
             out_backward = ArrayEncodedOutput(
                 token_ids=out.token_ids[1],
                 tokens=out.token_ids[1],
                 labels=None,
                 char_locs=out.char_locs,
                 mask=out.mask[1],
             )
             tokenized_context_forward = tokenize_context(
                 context[0], out_forward, self.config)
             tokenized_context_backward = tokenize_context(
                 context[1], out_backward, self.config)
             tokenized_context = [
                 tokenized_context_forward, tokenized_context_backward
             ]
             feats = {
                 "tokens": out.token_ids,
                 "mask": out.mask,
                 "context": tokenized_context
             }
         if Y is None:
             yield feats
         else:
             yield feats, self.label_encoder.transform([Y])[0]
Example 4
    def _text_to_ids(self, Xs, Y=None, context=None, pad_token=None):
        """
        Format multi-question examples as a list of IDs
        """
        q, answer_list = Xs

        pairs = [[q, answer_list[idx]] for idx in range(len(answer_list))]
        arrays = []
        for pair in pairs:
            arrays.append(next(super()._text_to_ids(pair, Y=Y)))

        kwargs = arrays[0]._asdict()
        kwargs["tokens"] = [arr.tokens for arr in arrays]
        kwargs["token_ids"] = np.stack([arr.token_ids for arr in arrays], 0)
        kwargs["mask"] = np.stack([arr.mask for arr in arrays], 0)
        yield ArrayEncodedOutput(**kwargs)
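
A small sketch of the question/answer expansion performed above. The data is made up; the real method encodes each [question, answer] pair with the parent class and stacks the resulting token_ids and mask arrays along a new leading "answer" axis.

import numpy as np

question = "What color is the sky?"
answer_list = ["blue", "green", "red"]

pairs = [[question, answer] for answer in answer_list]

# Pretend each pair was encoded to a fixed-length id array of length 6.
encoded = [np.full(6, i, dtype=np.int32) for i, _ in enumerate(pairs)]
token_ids = np.stack(encoded, 0)   # shape: [num_answers, max_length]
print(pairs)
print(token_ids.shape)             # (3, 6)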
Example 5
    def _text_to_ids(self, pair, Y=None, pad_token=None):
        """
        Format comparison examples as a list of IDs

        pair: pair of texts to compare, shape [2]
        """
        assert self.config.chunk_long_sequences is False, "Chunk Long Sequences is not compatible with comparison"
        arr_forward = next(super()._text_to_ids(pair, Y=None))
        reversed_pair = pair[::-1]
        arr_backward = next(super()._text_to_ids(reversed_pair, Y=None))
        kwargs = arr_forward._asdict()
        kwargs['tokens'] = [arr_forward.tokens, arr_backward.tokens]
        kwargs['token_ids'] = np.stack(
            [arr_forward.token_ids, arr_backward.token_ids], 0)
        kwargs['mask'] = np.stack([arr_forward.mask, arr_backward.mask], 0)
        yield ArrayEncodedOutput(**kwargs)
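
A sketch of the forward/backward arrangement used for comparison tasks above. The encode() helper is a toy stand-in for the parent class's _text_to_ids; the real method stacks the two encodings so the model sees the pair in both orders.

import numpy as np

def encode(pair):
    # toy stand-in: hash each text into a fixed-length id vector
    return np.array([abs(hash(t)) % 100 for t in pair], dtype=np.int32)

pair = ["text A", "text B"]
arr_forward = encode(pair)
arr_backward = encode(pair[::-1])

token_ids = np.stack([arr_forward, arr_backward], 0)  # shape: [2, ...]
print(token_ids.shape)                                # (2, 2)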
Example 6
 def test_tokenize_context(self):
     encoded_output = ArrayEncodedOutput(
         token_ids=[[40478, 40481], [1180, 40482], [535, 40483],
                    [808, 40484], [289, 40485], [17164, 40486],
                    [40480, 40487]],
         tokens=[
             40478, 'everything</w>', "'s</w>", 'only</w>', '$</w>',
             '80</w>', 40480
         ],
         labels=[0] * 7,
         char_locs=[-1, 10, 12, 17, 19, 21, -1],
         mask=[0, 1, 1, 1, 1, 1, 0],
     )
     context = [
         {
             'token': "everything's",
             'start': 0,
             'end': 12,
             'left': 10,
             'bold': False
         },
         {
             'token': "only",
             'start': 13,
             'end': 17,
             'left': 20,
             'bold': False
         },
         {
             'token': "$80",
             'start': 18,
             'end': 21,
             'left': 30,
             'bold': True
         },
     ]
     config = get_config(**{'default_context': {'left': 0, 'bold': False}})
     expanded_context = tokenize_context(context, encoded_output, config)
     expected = [[False, 0], [False, 10], [False, 10], [False, 20],
                 [True, 30], [True, 30], [False, 0]]
     np.testing.assert_array_equal(expected, expanded_context)
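
A minimal re-implementation of the expansion checked by the test above, written from the expected output rather than from the library source, so treat it as an illustration of the idea, not finetune's tokenize_context. Each sub-token's character offset (char_locs) is matched to the context token that covers it; special tokens (char_loc == -1) fall back to the default context, and the auxiliary fields are emitted in sorted key order.

def expand_context(context, char_locs, default):
    keys = sorted(default)            # e.g. ['bold', 'left']
    rows = []
    for loc in char_locs:
        match = default
        if loc != -1:
            for tok in context:
                if tok["start"] < loc <= tok["end"]:
                    match = tok
                    break
        rows.append([match[k] for k in keys])
    return rows

context = [
    {"token": "everything's", "start": 0, "end": 12, "left": 10, "bold": False},
    {"token": "only", "start": 13, "end": 17, "left": 20, "bold": False},
    {"token": "$80", "start": 18, "end": 21, "left": 30, "bold": True},
]
char_locs = [-1, 10, 12, 17, 19, 21, -1]
print(expand_context(context, char_locs, {"left": 0, "bold": False}))
# [[False, 0], [False, 10], [False, 10], [False, 20], [True, 30], [True, 30], [False, 0]]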
Example 7
    def _array_format(self, encoded_output, pad_token=None):
        """
        Returns numpy array of token idxs and corresponding mask
        Returned `x` array contains two channels:
            0: byte-pair encoding embedding
            1: positional embedding
        """
        seq_length = len(encoded_output.token_ids)
        x = np.zeros((self.config.max_length, 2), dtype=np.int32)
        mask = np.zeros((self.config.max_length), dtype=np.float32)

        if encoded_output.labels is not None:
            labels_arr = np.empty((self.config.max_length), dtype='object')
            labels_arr.fill((pad_token or self.config.pad_token))
        else:
            labels_arr = None

        # BPE embedding
        x[:seq_length, 0] = encoded_output.token_ids
        # masking: value of 1 means "consider this in cross-entropy LM loss"
        mask[1:seq_length] = 1
        if encoded_output.labels:
            labels_arr[:seq_length] = encoded_output.labels
        # positional_embeddings
        x[:, 1] = np.arange(
            self.text_encoder.vocab_size,
            self.text_encoder.vocab_size + self.config.max_length,
        )

        output = ArrayEncodedOutput(
            token_ids=x,
            tokens=encoded_output.tokens,
            labels=labels_arr,
            char_locs=encoded_output.char_locs,
            mask=mask,
        )
        return output
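
Offsetting channel 1 by vocab_size suggests that token ids and position ids index a single embedding matrix of shape [vocab_size + max_length, n_embed], as in GPT-style models. The sketch below illustrates that assumption with made-up sizes; it is not the library's embedding code.

import numpy as np

VOCAB_SIZE, MAX_LENGTH, N_EMBED = 100, 8, 4
embedding = np.random.randn(VOCAB_SIZE + MAX_LENGTH, N_EMBED)

x = np.zeros((MAX_LENGTH, 2), dtype=np.int32)
x[:, 0] = np.arange(MAX_LENGTH)                           # toy token ids
x[:, 1] = np.arange(VOCAB_SIZE, VOCAB_SIZE + MAX_LENGTH)  # position ids

# summing the two lookups yields token + positional embeddings in one pass
h = embedding[x[:, 0]] + embedding[x[:, 1]]
print(h.shape)   # (8, 4)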