Code example #1
    def convert_example_to_features(self,
                                    tokens_a,
                                    tokens_b,
                                    is_next_label,
                                    max_pos,
                                    short_seq_prob=0.1,
                                    masked_lm_prob=0.15):
        """
        Convert a raw sample (pair of sentences as tokenized strings) into a proper training sample with
        IDs, LM labels, input_mask, CLS and SEP tokens etc.
        :param tokens_a: str, example tokens.
        :param tokens_b: str, example next tokens.
        :param is_next_label: int, is next label.
        :param max_pos: int, maximum length of sequence.
        :param short_seq_prob: float, Probability of creating sequences which are shorter than the maximum length.
        :param masked_lm_prob: float, Masked LM probability.
        :return: features
        """

        # Reserve room for [CLS] and one or two [SEP] tokens.
        target_max_pos = max_pos - 3 if tokens_b else max_pos - 2

        tokens_a_ids = copy.copy(tokens_a)
        tokens_b_ids = copy.copy(tokens_b)
        # Occasionally (with probability short_seq_prob) use a shorter sequence
        # to minimize the mismatch between pre-training and fine-tuning.
        if random() < short_seq_prob:
            target_max_pos = randint(2, target_max_pos - 1)
        truncate_seq_pair(tokens_a_ids, tokens_b_ids, target_max_pos)

        # Add Special Tokens
        tokens_a_ids.insert(0, self.cls_id)
        tokens_a_ids.append(self.sep_id)
        if len(tokens_b_ids) != 0:
            tokens_b_ids.append(self.sep_id)

        tokens = copy.copy(tokens_a_ids)
        tokens.extend(copy.copy(tokens_b_ids))
        # Add next sentence segment
        segment_ids = [0] * len(tokens_a_ids) + [1] * len(tokens_b_ids)

        # pad_id marks positions without a masked-LM label.
        lm_label_ids = [self.pad_id] * len(tokens)

        # Number of positions to mask (~masked_lm_prob of the sequence).
        mask_prediction = int(round(len(tokens) * masked_lm_prob))

        mask_candidate_pos = [
            i for i, token in enumerate(tokens)
            if token != self.cls_id and token != self.sep_id
        ]
        # masked and random token
        shuffle(mask_candidate_pos)
        for pos in mask_candidate_pos[:mask_prediction]:
            if random() < 0.8:  # 80%: replace with [MASK]
                lm_label_ids[pos] = tokens[pos]
                tokens[pos] = self.mask_id
            elif random() < 0.5:  # 10%: half of the remaining 20%
                # replace with a random token
                lm_label_ids[pos] = tokens[pos]
                tokens[pos] = self.get_random_token_id()
            else:
                # 10%: keep the token unchanged (but still predict it)
                lm_label_ids[pos] = tokens[pos]

        input_ids = tokens
        input_mask = [1] * len(input_ids)

        # zero padding
        num_zero_pad = max_pos - len(input_ids)
        input_ids.extend([self.pad_id] * num_zero_pad)
        segment_ids.extend([0] * num_zero_pad)
        input_mask.extend([0] * num_zero_pad)
        lm_label_ids.extend([self.pad_id] * num_zero_pad)

        return [
            input_ids, segment_ids, input_mask, is_next_label, lm_label_ids
        ]
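
Both examples call a module-level truncate_seq_pair helper that is not shown, and assume imports along the lines of import copy, import itertools, import numpy as np, and from random import random, randint, shuffle. Below is a minimal sketch of the helper, modeled on the one in the original BERT codebase (which also picks the trim side at random; this sketch always trims the end):

def truncate_seq_pair(tokens_a, tokens_b, max_length):
    """Truncate a pair of token-id lists in place so their combined
    length does not exceed max_length, always trimming the longer list."""
    while len(tokens_a) + len(tokens_b) > max_length:
        longer = tokens_a if len(tokens_a) > len(tokens_b) else tokens_b
        longer.pop()

The nested random() tests in the masking loop realize the standard BERT 80/10/10 split: the first test selects [MASK] 80% of the time, and the second splits the remaining 20% evenly between a random token and the unchanged original. A self-contained check of those proportions:

from collections import Counter
from random import random, seed

seed(0)
counts = Counter()
for _ in range(100_000):
    if random() < 0.8:
        counts["mask"] += 1        # replaced with [MASK]
    elif random() < 0.5:
        counts["random"] += 1      # replaced with a random token
    else:
        counts["keep"] += 1        # kept unchanged
print({k: round(v / 100_000, 3) for k, v in counts.items()})
# roughly {'mask': 0.8, 'random': 0.1, 'keep': 0.1}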
Code example #2
    def convert_example_to_features(self,
                                    token_ids,
                                    max_pos,
                                    short_seq_prob=0.1,
                                    masked_lm_prob=0.15):
        """
        Convert a raw sample (pair of sentences as tokenized strings) into a proper training sample with
        IDs, LM labels, input_mask, CLS and SEP tokens etc.
        :param token_ids: str, example tokens.
        :param max_pos: int, maximum length of sequence.
        :param short_seq_prob: float, Probability of creating sequences which are shorter than the maximum length.
        :param masked_lm_prob: float, Masked LM probability.
        :return: features
        """

        # Reserve room for the special tokens ([CLS]/[SEP]).
        target_max_pos = max_pos - self.bert_ids_num
        # Occasionally (with probability short_seq_prob) use a shorter sequence
        # to minimize the mismatch between pre-training and fine-tuning.
        if random() < short_seq_prob:
            target_max_pos = randint(2, target_max_pos - 1)

        if self.is_sop:
            # Sentence Order Prediction (SOP): split the sentences into two
            # contiguous segments and swap their order half of the time.
            split_ids = copy.copy(token_ids)
            if len(split_ids) > 2:
                split_id = randint(1, len(split_ids) - 1)
                tokens_a = split_ids[:split_id]
                tokens_b = split_ids[split_id:]
            else:
                tokens_a = split_ids
                tokens_b = []

            if len(tokens_b) > 0 and random() < 0.5:
                is_random_next = 1
                tokens_a, tokens_b = tokens_b, tokens_a
            else:
                is_random_next = 0
            # Flatten the per-sentence lists into flat token-id sequences.
            tokens_a = list(itertools.chain.from_iterable(tokens_a))
            tokens_b = list(itertools.chain.from_iterable(tokens_b))

            truncate_seq_pair(tokens_a, tokens_b, target_max_pos)

            # Add Special Tokens
            tokens_a.insert(0, self.cls_id)
            tokens_a.append(self.sep_id)
            if len(tokens_b) > 0:
                tokens_b.append(self.sep_id)

            lm_label_ids = [self.pad_id] * len(tokens_a + tokens_b)

            bert_token_ids = copy.copy(tokens_a)
            bert_token_ids.extend(copy.copy(tokens_b))
            # Add next sentence segment
            segment_ids = [0] * len(tokens_a) + [1] * len(tokens_b)

        else:
            # Single-segment case: masked LM only, no sentence-order label.
            label_ids = copy.copy(token_ids)
            truncate_seq_pair(label_ids, [], target_max_pos)
            # Add Special Tokens
            label_ids = [self.cls_id] + label_ids + [self.sep_id]
            lm_label_ids = [self.pad_id] * len(label_ids)

            bert_token_ids = copy.copy(label_ids)
            segment_ids = [0] * len(label_ids)
            is_random_next = 0

        # Number of positions to mask (~masked_lm_prob of the non-special tokens).
        mask_prediction = round(
            (len(bert_token_ids) - self.bert_ids_num) * masked_lm_prob)
        mask_candidate_pos = [
            i for i, token in enumerate(bert_token_ids)
            if token != self.cls_id and token != self.sep_id
        ]
        # Sample a span length from a geometric distribution (p=0.4, so the
        # mean span is 1/0.4 = 2.5 tokens), then clip it to the configured
        # maximum and to the masking budget.
        mask_length = min(np.random.geometric(0.4), self.max_words_length,
                          mask_prediction)

        if mask_length > 0:
            # Group the candidate positions into contiguous chunks of roughly
            # mask_length tokens; each chunk is masked or modified as a span.
            mask_candidate_words = np.array_split(
                mask_candidate_pos, int(len(mask_candidate_pos) / mask_length))
            shuffle(mask_candidate_words)
            masked = 0
            # masked and random token
            for words in mask_candidate_words:
                for pos in words:
                    masked += 1
                    if random() < 0.8:  # 80%: replace with [MASK]
                        lm_label_ids[pos] = bert_token_ids[pos]
                        bert_token_ids[pos] = self.mask_id
                    elif random() < 0.5:  # 10%: half of the remaining 20%
                        # replace with a random token
                        lm_label_ids[pos] = bert_token_ids[pos]
                        bert_token_ids[pos] = self.get_random_token_id()
                    else:
                        # 10%: keep the token unchanged (but still predict it)
                        lm_label_ids[pos] = bert_token_ids[pos]
                    if masked == mask_prediction:
                        break

                if masked == mask_prediction:
                    break

        input_ids = bert_token_ids

        # zero padding
        num_zero_pad = max_pos - len(input_ids)
        input_mask = [1] * len(input_ids)
        if segment_ids:
            segment_ids.extend([0] * num_zero_pad)
        input_mask.extend([0] * num_zero_pad)
        input_ids.extend([self.pad_id] * num_zero_pad)
        lm_label_ids.extend([self.pad_id] * num_zero_pad)

        return [
            input_ids, segment_ids, input_mask, is_random_next, lm_label_ids
        ]
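
Code example #2 replaces token-by-token masking with span masking: a span length is drawn from a geometric distribution with p=0.4 (mean 1/p = 2.5 tokens), in the spirit of SpanBERT, then clipped to self.max_words_length and to the masking budget. A quick, self-contained sanity check of that distribution (the clip value 3 below is a hypothetical max_words_length):

import numpy as np

np.random.seed(0)
lengths = np.random.geometric(0.4, size=100_000)
print(round(float(lengths.mean()), 2))                 # ~2.5, i.e. 1 / 0.4
print(round(float(np.minimum(lengths, 3).mean()), 2))  # ~1.96 after clipping spans to 3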