Code Example #1
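These snippets (here and in the examples below) are methods of BERT preprocessor classes and share the same helpers. They assume imports along the following lines; the bert_dp import path matches the package used by DeepPavlov's BERT preprocessors and is an assumption, not something the snippets themselves state:

    from typing import List, Optional

    # Assumed import path; adjust if these helpers live elsewhere in your codebase.
    from bert_dp.preprocessing import (InputExample, InputFeatures,
                                       convert_examples_to_features)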
    def __call__(self,
                 texts_a: List[str],
                 texts_b: Optional[List[str]] = None) -> List[InputFeatures]:
        """Call Bert convert_examples_to_features function to tokenize and create masks.

        texts_a and texts_b are separated by [SEP] token

        Args:
            texts_a: list of texts,
            texts_b: list of texts, it could be None, e.g. single sentence classification task

        Returns:
            batch of InputFeatures with subtokens, subtoken ids, subtoken mask, segment mask.

        """

        if texts_b is None:
            texts_b = [None] * len(texts_a)
        # unique_id is not used
        examples = [
            InputExample(unique_id=0, text_a=text_a, text_b=text_b)
            for text_a, text_b in zip(texts_a, texts_b)
        ]
        return convert_examples_to_features(examples, self.max_seq_length,
                                            self.tokenizer)
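A hypothetical usage sketch: it assumes the method above belongs to a preprocessor class (called BertPreprocessor here purely for illustration) built with a WordPiece tokenizer and a max_seq_length, and that the returned InputFeatures expose the attribute names of the original BERT extract_features.py; both are assumptions:

    # Illustrative only: the class name and constructor arguments are assumed.
    preprocessor = BertPreprocessor(vocab_file='vocab.txt',
                                    do_lower_case=True,
                                    max_seq_length=64)

    # Single-sentence task: texts_b defaults to None.
    features = preprocessor(['the quick brown fox'])

    # Sentence-pair task: texts_a[i] and texts_b[i] are joined with [SEP].
    features = preprocessor(['what is AI?'], ['AI is a field of CS.'])

    for f in features:
        print(f.tokens)      # subtokens, starting with [CLS]
        print(f.input_ids)   # subtoken ids, padded to max_seq_length
        print(f.input_mask)  # 1 for real subtokens, 0 for padding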
Code Example #2
    def __call__(self, batch: List[List[str]]) -> List[List[InputFeatures]]:
        """Call BERT convert_examples_to_features function to tokenize and create masks.

        Args:
            batch: list of elements where the first element is the batch of contexts
                and the remaining elements are the batches of response candidates

        Returns:
            list of feature batches with subtokens, subtoken ids, subtoken mask, and segment
            mask for the context and each of the response candidates separately.
        """

        # A bare list of strings is wrapped so it is handled as a batch of one element.
        if isinstance(batch[0], str):
            batch = [batch]

        # Transpose the batch: samples[i] gathers the i-th string from the
        # context batch and from every response candidate batch.
        samples = []
        for i in range(len(batch[0])):
            s = []
            for el in batch:
                s.append(el[i])
            samples.append(s)
        # Every text is tokenized on its own, so text_b is always empty.
        s_empt = [None] * len(samples[0])
        # TODO: add unique id
        examples = []
        for s in samples:
            ex = [InputExample(unique_id=0, text_a=text_a, text_b=text_b)
                  for text_a, text_b in zip(s, s_empt)]
            examples.append(ex)
        features = [convert_examples_to_features(el, self.max_seq_length, self.tokenizer)
                    for el in examples]

        return features
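The transpose is the heart of this method; here is a standalone illustration (no BERT required), assuming batch[0] holds the contexts and the remaining elements hold response candidate batches:

    batch = [
        ['ctx 1', 'ctx 2'],        # contexts
        ['cand A1', 'cand A2'],    # first batch of response candidates
        ['cand B1', 'cand B2'],    # second batch of response candidates
    ]
    # Equivalent to the explicit loops above (for equal-length batches).
    samples = [list(s) for s in zip(*batch)]
    # samples == [['ctx 1', 'cand A1', 'cand B1'],
    #             ['ctx 2', 'cand A2', 'cand B2']]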
Code Example #3
    def __call__(self, batch: List[List[str]]) -> List[List[InputFeatures]]:
        """Call BERT convert_examples_to_features function to tokenize and create masks.

        Args:
            batch: list of elements where the first element is the batch of contexts
                and the remaining elements are the batches of response candidates

        Returns:
            list of feature batches with subtokens, subtoken ids, subtoken mask, segment mask.
        """

        # A bare list of strings is wrapped so it is handled as a batch of one element.
        if isinstance(batch[0], str):
            batch = [batch]

        cont_resp_pairs = []
        if len(batch[0]) == 1:
            # Single-string elements carry no response candidates:
            # pair the contexts with empty (None) responses.
            contexts = batch[0]
            responses_empt = [None] * len(batch)
            cont_resp_pairs.append(zip(contexts, responses_empt))
        else:
            # Take the first string of every element as the context and pair it
            # with the element's i-th string, one pair batch per position i.
            contexts = [el[0] for el in batch]
            for i in range(1, len(batch[0])):
                responses = []
                for el in batch:
                    responses.append(el[i])
                cont_resp_pairs.append(zip(contexts, responses))
        examples = []
        for s in cont_resp_pairs:
            ex = [InputExample(unique_id=0, text_a=context, text_b=response)
                  for context, response in s]
            examples.append(ex)
        features = [convert_examples_to_features(el, self.max_seq_length, self.tokenizer)
                    for el in examples]

        return features
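A standalone sketch of the pairing in the else branch. It follows what the code actually computes, where each batch element reads as [context, candidate_1, candidate_2, ...]; note that the docstring above describes a transposed layout instead:

    batch = [
        ['ctx 1', 'cand A1', 'cand B1'],
        ['ctx 2', 'cand A2', 'cand B2'],
    ]
    contexts = [el[0] for el in batch]   # ['ctx 1', 'ctx 2']
    pairs_per_candidate = [
        list(zip(contexts, [el[i] for el in batch]))
        for i in range(1, len(batch[0]))
    ]
    # pairs_per_candidate == [
    #     [('ctx 1', 'cand A1'), ('ctx 2', 'cand A2')],
    #     [('ctx 1', 'cand B1'), ('ctx 2', 'cand B2')],
    # ]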
Code Example #4
    def __call__(self, batch):
        # A bare list of strings is wrapped so it is handled as a batch of one element.
        if isinstance(batch[0], str):
            batch = [batch]
        # Transpose the batch, as in Code Example #2: samples[i] gathers the
        # i-th string from every element of the batch.
        samples = []
        for i in range(len(batch[0])):
            s = []
            for el in batch:
                s.append(el[i])
            samples.append(s)
        # Every text is tokenized on its own, so text_b is always empty.
        s_dummy = [None] * len(samples[0])
        # TODO: add unique id
        examples = []
        for s in samples:
            ex = [
                InputExample(unique_id=0, text_a=text_a, text_b=text_b)
                for text_a, text_b in zip(s, s_dummy)
            ]
            examples.append(ex)
        features = [
            convert_examples_to_features(el, self.max_seq_length,
                                         self.tokenizer) for el in examples
        ]

        return features
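This variant repeats Code Example #2 without type hints or a docstring, so a shape check may be the most useful sketch; the lengths follow directly from the loops above:

    # With a context batch plus two candidate batches, each of size 2, the
    # transpose yields one sample per position in batch[0], and each sample
    # (hence each inner feature list) spans all elements of batch.
    batch = [['c1', 'c2'], ['a1', 'a2'], ['b1', 'b2']]
    samples = [list(s) for s in zip(*batch)]
    assert len(samples) == len(batch[0]) == 2
    assert all(len(s) == len(batch) == 3 for s in samples)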