Example #1
    def text_to_instance(self,  # type: ignore
                         example) -> Instance:

        fields: Dict[str, Field] = {}

        joint_tokens_seq = example['paired_c_tokens']
        assert len(joint_tokens_seq) <= 512  # BERT's maximum input length

        segments_ids = example['segment_ids']

        joint_tokens_ids = self.bert_tokenizer.convert_tokens_to_ids(joint_tokens_seq)
        assert len(joint_tokens_ids) == len(segments_ids)

        fields['paired_sequence'] = BertIndexField(np.asarray(joint_tokens_ids, dtype=np.int64))
        fields['paired_segments_ids'] = BertIndexField(np.asarray(segments_ids, dtype=np.int64))

        # The ground-truth span is inclusive on both ends (unlike the
        # intermediate text spans elsewhere in these readers, which are
        # begin-inclusive and end-exclusive).
        fields['gt_span'] = SpanField(example['start_position'], example['end_position'], fields['paired_sequence'])

        fields['fid'] = IdField(example['fid'])
        fields['uid'] = IdField(example['uid'])

        return Instance(fields)
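
Example #1 consumes a dict produced by an upstream preprocessing step. The sketch below shows one plausible way such a dict could be assembled, using HuggingFace's BertTokenizer; the key names mirror the snippet, but the tokenizer choice, sample texts, and answer-locating logic are illustrative assumptions, not the repository's actual pipeline.

# Hypothetical construction of the `example` dict consumed above.
# Assumes HuggingFace's BertTokenizer; key names mirror the snippet.
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

query_tokens = tokenizer.tokenize("who wrote hamlet?")
context_tokens = tokenizer.tokenize("hamlet was written by william shakespeare.")

paired = ['[CLS]'] + query_tokens + ['[SEP]'] + context_tokens + ['[SEP]']
segment_ids = [0] * (len(query_tokens) + 2) + [1] * (len(context_tokens) + 1)

# Locate the answer inside the joint sequence; both positions are
# inclusive, matching the comment above the gt_span field.
answer = tokenizer.tokenize("william shakespeare")
start = paired.index(answer[0])
end = start + len(answer) - 1

example = {
    'paired_c_tokens': paired,
    'segment_ids': segment_ids,
    'start_position': start,
    'end_position': end,
    'fid': 'f-0',  # illustrative ids
    'uid': 'u-0',
}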
Example #2
    def text_to_instance(
            self,  # type: ignore
            query: str,  # Important type information
            context: str,
            fid: str = None,
            qid: str = None,
            selection_label: str = None) -> Instance:

        fields: Dict[str, Field] = {}

        tokenized_text1 = self.bert_tokenizer.tokenize(query)
        tokenized_text2 = self.bert_tokenizer.tokenize(context)

        # Truncate each side independently to its configured budget.
        tokenized_text1 = tokenized_text1[:self.query_l]
        tokenized_text2 = tokenized_text2[:self.context_l]

        # Each side is encoded separately, each with its own leading [CLS].
        s1_tokens_seq = ['[CLS]'] + tokenized_text1
        s2_tokens_seq = ['[CLS]'] + tokenized_text2

        s1_tokens_ids = self.bert_tokenizer.convert_tokens_to_ids(s1_tokens_seq)
        s2_tokens_ids = self.bert_tokenizer.convert_tokens_to_ids(s2_tokens_seq)

        fields['s1_sequence'] = BertIndexField(np.asarray(s1_tokens_ids, dtype=np.int64))
        fields['s2_sequence'] = BertIndexField(np.asarray(s2_tokens_ids, dtype=np.int64))

        text1_span = (1, len(tokenized_text1))  # End is exclusive (important for later use)
        text2_span = (1, len(tokenized_text2))

        fields['bert_s1_span'] = SpanField(text1_span[0], text1_span[1], fields['s1_sequence'])
        fields['bert_s2_span'] = SpanField(text2_span[0], text2_span[1], fields['s2_sequence'])

        if selection_label:
            fields['label'] = LabelField(selection_label,
                                         label_namespace='labels')

        assert fid is not None
        assert qid is not None
        fields['fid'] = IdField(fid)
        fields['qid'] = IdField(qid)

        return Instance(fields)
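
Example #2 encodes the two sides separately (each with its own [CLS]) rather than as one joint pair, which suits models that score query and context independently. Below is a self-contained sketch of the same encoding outside the reader class; BertIndexField and IdField are repository-specific fields, so the sketch sticks to plain numpy arrays, and the budget values and helper name are assumptions.

import numpy as np
from transformers import BertTokenizer

def encode_unpaired(query, context, tokenizer, query_l=64, context_l=256):
    """Mirror of Example #2's encoding: two [CLS]-prefixed sequences."""
    t1 = tokenizer.tokenize(query)[:query_l]
    t2 = tokenizer.tokenize(context)[:context_l]
    s1_ids = np.asarray(tokenizer.convert_tokens_to_ids(['[CLS]'] + t1), dtype=np.int64)
    s2_ids = np.asarray(tokenizer.convert_tokens_to_ids(['[CLS]'] + t2), dtype=np.int64)
    # Content spans start at 1, skipping the [CLS] at position 0.
    return (s1_ids, (1, len(t1))), (s2_ids, (1, len(t2)))

tok = BertTokenizer.from_pretrained('bert-base-uncased')
(s1, span1), (s2, span2) = encode_unpaired("who wrote hamlet?",
                                           "hamlet was written by shakespeare.",
                                           tok)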
Example #3
    def text_to_instance(
            self,  # type: ignore
            sent1: str,  # Important type information
            sent2: str,
            pid: str = None,
            label: str = None) -> Instance:

        fields: Dict[str, Field] = {}

        tokenized_text1 = self.bert_tokenizer.tokenize(sent1)
        tokenized_text2 = self.bert_tokenizer.tokenize(sent2)

        # Truncate: sent1 keeps a fixed budget (query_l) and sent2 takes
        # whatever remains of max_l.
        tokenized_text1 = tokenized_text1[:self.query_l]
        tokenized_text2 = tokenized_text2[:(self.max_l - len(tokenized_text1))]

        joint_tokens_seq = ['[CLS]'] + tokenized_text1 + ['[SEP]'] + tokenized_text2 + ['[SEP]']
        text1_len = len(tokenized_text1) + 2  # [CLS] + tokens + [SEP]
        text2_len = len(tokenized_text2) + 1  # tokens + [SEP]
        segments_ids = [0] * text1_len + [1] * text2_len

        joint_tokens_ids = self.bert_tokenizer.convert_tokens_to_ids(joint_tokens_seq)
        assert len(joint_tokens_ids) == len(segments_ids)

        fields['paired_sequence'] = BertIndexField(np.asarray(joint_tokens_ids, dtype=np.int64))
        fields['paired_segments_ids'] = BertIndexField(np.asarray(segments_ids, dtype=np.int64))

        text1_span = (1, 1 + len(tokenized_text1))  # End is exclusive (important for later use)
        text2_span = (text1_span[1] + 1, text1_span[1] + 1 + len(tokenized_text2))

        fields['bert_s1_span'] = MetadataField(text1_span)
        fields['bert_s2_span'] = MetadataField(text2_span)

        if label:
            fields['label'] = LabelField(label, label_namespace='labels')

        if pid:
            fields['pid'] = IdField(pid)

        return Instance(fields)
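
The span arithmetic in Example #3 is easy to get off by one, so here is a small self-contained check of the bookkeeping: the segment ids must match the joint sequence token for token, and the end-exclusive spans must select exactly the content tokens of each side. The tokenizer and sample strings are assumptions for illustration.

from transformers import BertTokenizer

tok = BertTokenizer.from_pretrained('bert-base-uncased')
t1 = tok.tokenize("first sentence")
t2 = tok.tokenize("second, longer sentence")

joint = ['[CLS]'] + t1 + ['[SEP]'] + t2 + ['[SEP]']
segments = [0] * (len(t1) + 2) + [1] * (len(t2) + 1)
assert len(joint) == len(segments)

span1 = (1, 1 + len(t1))                        # end-exclusive
span2 = (span1[1] + 1, span1[1] + 1 + len(t2))  # skip the middle [SEP]
assert joint[span1[0]:span1[1]] == t1
assert joint[span2[0]:span2[1]] == t2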
Example #4
    def text_to_instance(
            self,  # type: ignore
            seq1: str,  # Important type information
            seq2: str,
            fid: str = None,
            oid: str = None,
            retain_item: str = None,
            selection_label: str = None) -> Instance:

        if self.is_paired:
            fields: Dict[str, Field] = {}

            tokenized_text1 = self.bert_tokenizer.tokenize(seq1)
            tokenized_text2 = self.bert_tokenizer.tokenize(seq2)

            if self.pair_order == 'qc':
                tokenized_text1 = tokenized_text1[:self.query_l]
                tokenized_text2 = tokenized_text2[:(self.max_l - len(tokenized_text1))]
            elif self.pair_order == 'cq':
                tokenized_text2 = tokenized_text2[:self.query_l]
                tokenized_text1 = tokenized_text1[:(self.max_l - len(tokenized_text2))]
            else:
                raise ValueError(f"Invalid pair ordering: {self.pair_order}")

            joint_tokens_seq = ['[CLS]'] + tokenized_text1 + ['[SEP]'] + tokenized_text2 + ['[SEP]']
            text1_len = len(tokenized_text1) + 2  # [CLS] + tokens + [SEP]
            text2_len = len(tokenized_text2) + 1  # tokens + [SEP]
            segments_ids = [0] * text1_len + [1] * text2_len

            joint_tokens_ids = self.bert_tokenizer.convert_tokens_to_ids(joint_tokens_seq)
            assert len(joint_tokens_ids) == len(segments_ids)

            fields['paired_sequence'] = BertIndexField(np.asarray(joint_tokens_ids, dtype=np.int64))
            fields['paired_segments_ids'] = BertIndexField(np.asarray(segments_ids, dtype=np.int64))

            text1_span = (1, 1 + len(tokenized_text1))  # End is exclusive (important for later use)
            text2_span = (text1_span[1] + 1, text1_span[1] + 1 + len(tokenized_text2))

            fields['bert_s1_span'] = SpanField(text1_span[0], text1_span[1], fields['paired_sequence'])
            fields['bert_s2_span'] = SpanField(text2_span[0], text2_span[1], fields['paired_sequence'])

            if selection_label:
                fields['label'] = LabelField(selection_label,
                                             label_namespace='labels')

            assert fid is not None
            assert oid is not None
            fields['fid'] = IdField(fid)
            fields['oid'] = IdField(oid)
            fields['item'] = IdField(retain_item)

            return Instance(fields)

        else:
            fields: Dict[str, Field] = {}

            tokenized_text1 = self.bert_tokenizer.tokenize(seq1)
            tokenized_text2 = self.bert_tokenizer.tokenize(seq2)

            # Truncate each side independently to its configured budget.
            tokenized_text1 = tokenized_text1[:self.query_l]
            tokenized_text2 = tokenized_text2[:self.context_l]

            # Unpaired variant: each side is encoded separately with its own [CLS].
            s1_tokens_seq = ['[CLS]'] + tokenized_text1
            s2_tokens_seq = ['[CLS]'] + tokenized_text2

            s1_tokens_ids = self.bert_tokenizer.convert_tokens_to_ids(s1_tokens_seq)
            s2_tokens_ids = self.bert_tokenizer.convert_tokens_to_ids(s2_tokens_seq)

            fields['s1_sequence'] = BertIndexField(np.asarray(s1_tokens_ids, dtype=np.int64))
            fields['s2_sequence'] = BertIndexField(np.asarray(s2_tokens_ids, dtype=np.int64))

            text1_span = (1, len(tokenized_text1))  # End is exclusive (important for later use)
            text2_span = (1, len(tokenized_text2))

            fields['bert_s1_span'] = SpanField(text1_span[0], text1_span[1], fields['s1_sequence'])
            fields['bert_s2_span'] = SpanField(text2_span[0], text2_span[1], fields['s2_sequence'])

            if selection_label:
                fields['label'] = LabelField(selection_label,
                                             label_namespace='labels')

            assert fid is not None
            assert oid is not None
            fields['fid'] = IdField(fid)
            fields['oid'] = IdField(oid)
            fields['item'] = IdField(retain_item)

            return Instance(fields)
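
Example #4's only twist over Example #3 is the pair_order switch: the query side always keeps the fixed query_l budget, the other side absorbs whatever remains of max_l, and 'qc'/'cq' decide which argument is the query. A standalone sketch of that policy, with hypothetical names and budget values:

def truncate_pair(t1, t2, pair_order, query_l, max_l):
    """Hypothetical standalone version of Example #4's truncation policy."""
    if pair_order == 'qc':    # t1 is the query, t2 the context
        t1 = t1[:query_l]
        t2 = t2[:max_l - len(t1)]
    elif pair_order == 'cq':  # t2 is the query, t1 the context
        t2 = t2[:query_l]
        t1 = t1[:max_l - len(t2)]
    else:
        raise ValueError(f"Invalid pair ordering: {pair_order}")
    return t1, t2

q, c = truncate_pair(['tok'] * 100, ['tok'] * 500, 'qc', query_l=64, max_l=300)
assert len(q) == 64 and len(c) == 236  # context absorbed the remaining budget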