    def __iter__(self):  # iterator to load data
        batch = []
        count = 0
        for line in itertools.islice(self.lines, 1, None):  # skip the first line
            is_next = int(line[0])
            tokens_a = self.tokenize(line[1])
            tokens_b = self.tokenize(line[2])
            truncate_tokens_pair(tokens_a, tokens_b, self.max_len)
            instance = (is_next, tokens_a, tokens_b)
            for proc in self.pipeline:
                instance = proc(instance)

            batch.append(instance)
            count += 1

            if count == self.batch_size:
                batch_tensors = [
                    torch.tensor(x, dtype=torch.long) for x in zip(*batch)
                ]
                yield batch_tensors

                count = 0
                batch = []

        self.f_pos.seek(0)
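A minimal, self-contained illustration of the zip(*batch) batching trick used above: it regroups a list of per-example tuples into per-field columns, one torch.tensor per field (the field values below are made up for the example).

import torch

# batch: one (is_next, input_ids, segment_ids) tuple per example (illustrative values)
batch = [
    (1, [101, 7592, 102], [0, 0, 0]),
    (0, [101, 2088, 102], [0, 0, 0]),
]
# zip(*batch) yields one column per field; each column becomes one LongTensor
batch_tensors = [torch.tensor(x, dtype=torch.long) for x in zip(*batch)]
# batch_tensors[0] -> tensor([1, 0])               (is_next for the whole batch)
# batch_tensors[1] -> tensor([[101, 7592,  102],
#                             [101, 2088,  102]])  (input_ids stacked row-wise)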
Example #2
    def __call__(self, instance):
        is_next, tokens_a, tokens_b = instance

        # -3  for special tokens [CLS], [SEP], [SEP]
        truncate_tokens_pair(tokens_a, tokens_b, self.max_len - 3)

        # Add Special Tokens
        tokens = ['[CLS]'] + tokens_a + ['[SEP]'] + tokens_b + ['[SEP]']
        segment_ids = [0] * (len(tokens_a) + 2) + [1] * (len(tokens_b) + 1)
        input_mask = [1] * len(tokens)

        # the number of predictions is sometimes less than max_pred when the sequence is short
        n_pred = min(self.max_pred,
                     max(1, int(round(len(tokens) * self.mask_prob))))

        # For masked Language Models
        masked_tokens, masked_pos, tokens = _sample_mask(
            tokens,
            self.mask_alpha,
            self.mask_beta,
            self.max_gram,
            goal_num_predict=n_pred)
        # masked_tokens -> original values of the masked tokens
        # masked_pos    -> indices of the masked tokens
        # tokens        -> the full masked token sequence ([CLS] + masked_sentence_A + [SEP] + masked_sentence_B + [SEP])
        masked_weights = [1] * len(masked_tokens)

        # Token Indexing
        input_ids = self.indexer(tokens)
        masked_ids = self.indexer(masked_tokens)
        """ Indexer
        def convert_tokens_to_ids(vocab, tokens):
            # Converts a sequence of tokens into ids using the vocab.
            ids = []
            for token in tokens:
                ids.append(vocab[token])
            return ids
        """

        # Zero Padding
        n_pad = self.max_len - len(input_ids)
        input_ids.extend([0] * n_pad)
        segment_ids.extend([0] * n_pad)
        input_mask.extend([0] * n_pad)

        # Zero Padding for masked target
        if self.max_pred > len(masked_ids):
            masked_ids.extend([0] * (self.max_pred - len(masked_ids)))
        if self.max_pred > len(masked_pos):
            masked_pos.extend([0] * (self.max_pred - len(masked_pos)))
        if self.max_pred > len(masked_weights):
            masked_weights.extend([0] * (self.max_pred - len(masked_weights)))

        return (input_ids, segment_ids, input_mask, masked_ids, masked_pos,
                masked_weights, is_next)
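Every pipeline above calls a truncate_tokens_pair helper that is not shown in these snippets. A minimal sketch consistent with how it is used (in-place truncation of the longer list until the pair fits within max_len); the original implementation may differ in detail:

def truncate_tokens_pair(tokens_a, tokens_b, max_len):
    # Sketch (assumption): drop the last token of the longer list, in place,
    # until the combined length fits within max_len.
    while len(tokens_a) + len(tokens_b) > max_len:
        if len(tokens_a) > len(tokens_b):
            tokens_a.pop()
        else:
            tokens_b.pop()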
    def __call__(self, instance):
        label, tokens_a, tokens_b = instance

        # -3 special tokens for [CLS] text_a [SEP] text_b [SEP]
        # -2 special tokens for [CLS] text_a [SEP]
        _max_len = self.max_len - 3 if tokens_b else self.max_len - 2
        truncate_tokens_pair(tokens_a, tokens_b, _max_len)

        # Add Special Tokens
        tokens_a = ['[CLS]'] + tokens_a + ['[SEP]']
        tokens_b = tokens_b + ['[SEP]'] if tokens_b else []

        return (label, tokens_a, tokens_b)
Example #4
    def __call__(self, instance):
        is_next, tokens_a, tokens_b = instance

        # -3  for special tokens [CLS], [SEP], [SEP]
        truncate_tokens_pair(tokens_a, tokens_b, self.max_len - 3)

        # Add Special Tokens
        tokens = ['[CLS]'] + tokens_a + ['[SEP]'] + tokens_b + ['[SEP]']
        segment_ids = [0] * (len(tokens_a) + 2) + [1] * (len(tokens_b) + 1)
        input_mask = [1] * len(tokens)

        # For masked Language Models
        masked_tokens, masked_pos = [], []
        # the number of predictions is sometimes less than max_pred when the sequence is short
        n_pred = min(self.max_pred,
                     max(1, int(round(len(tokens) * self.mask_prob))))
        # candidate positions of masked tokens
        cand_pos = [
            i for i, token in enumerate(tokens)
            if token != '[CLS]' and token != '[SEP]'
        ]
        shuffle(cand_pos)
        for pos in cand_pos[:n_pred]:
            masked_tokens.append(tokens[pos])
            masked_pos.append(pos)
            if rand() < 0.8:  # 80%
                tokens[pos] = '[MASK]'
            elif rand() < 0.5:  # 10%
                tokens[pos] = get_random_word(self.vocab_words)
        # when n_pred < max_pred, we only calculate loss within n_pred
        masked_weights = [1] * len(masked_tokens)

        # Token Indexing
        input_ids = self.indexer(tokens)
        masked_ids = self.indexer(masked_tokens)

        # Zero Padding
        n_pad = self.max_len - len(input_ids)
        input_ids.extend([0] * n_pad)
        segment_ids.extend([0] * n_pad)
        input_mask.extend([0] * n_pad)

        # Zero Padding for masked target
        if self.max_pred > n_pred:
            n_pad = self.max_pred - n_pred
            masked_ids.extend([0] * n_pad)
            masked_pos.extend([0] * n_pad)
            masked_weights.extend([0] * n_pad)

        return (input_ids, segment_ids, input_mask, masked_ids, masked_pos,
                masked_weights, is_next)
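The nested rand() calls above implement the usual BERT corruption scheme: the elif branch is only reached in the remaining 20% of cases, so rand() < 0.5 there fires 10% of the time overall, and the final 10% of selected positions keep their original token. A quick self-contained check of those branch probabilities (using random.random in place of the snippet's rand):

from random import random as rand
from collections import Counter

# Empirically verify the ~80% [MASK] / ~10% random word / ~10% keep split.
counts = Counter()
trials = 100_000
for _ in range(trials):
    if rand() < 0.8:
        counts['mask'] += 1
    elif rand() < 0.5:
        counts['random_word'] += 1
    else:
        counts['keep'] += 1
print({k: round(v / trials, 3) for k, v in counts.items()})
# roughly {'mask': 0.8, 'random_word': 0.1, 'keep': 0.1}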
Example #5
    def __call__(self, instance):
        label, tokens_a, tokens_b = instance

        # -3 special tokens for [CLS] text_a [SEP] text_b [SEP]
        # -2 special tokens for [CLS] text_a [SEP]
        _max_len = self.max_len - 3 if tokens_b else self.max_len - 2
        truncate_tokens_pair(tokens_a, tokens_b, _max_len)

        # Note: unlike the other examples, no special tokens are added here;
        # only the truncated tokens_a and the label are returned.
        return (label, tokens_a)
Example #6
    def __call__(self, instance):
        is_next, tokens_a, tokens_b = instance

        # -3  for special tokens [CLS], [SEP], [SEP]
        truncate_tokens_pair(tokens_a, tokens_b, self.max_len - 3)

        # Add Special Tokens
        tokens = ['[CLS]'] + tokens_a + ['[SEP]'] + tokens_b + ['[SEP]']
        segment_ids = [0] * (len(tokens_a) + 2) + [1] * (len(tokens_b) + 1)
        input_mask = [1] * len(tokens)

        # the number of predictions is sometimes less than max_pred when the sequence is short
        n_pred = min(self.max_pred,
                     max(1, int(round(len(tokens) * self.mask_prob))))

        original_ids = self.indexer(tokens)
        # For masked Language Models
        masked_tokens, masked_pos, tokens = _sample_mask(
            tokens,
            self.mask_alpha,
            self.mask_beta,
            self.max_gram,
            goal_num_predict=n_pred)

        masked_weights = [1] * len(masked_tokens)

        # Token Indexing

        input_ids = self.indexer(tokens)
        masked_ids = self.indexer(masked_tokens)

        # Zero Padding
        n_pad = self.max_len - len(input_ids)
        original_ids.extend([0] * n_pad)
        input_ids.extend([0] * n_pad)
        segment_ids.extend([0] * n_pad)
        input_mask.extend([0] * n_pad)

        # Zero Padding for masked target
        if self.max_pred > len(masked_ids):
            masked_ids.extend([0] * (self.max_pred - len(masked_ids)))
        if self.max_pred > len(masked_pos):
            masked_pos.extend([0] * (self.max_pred - len(masked_pos)))
        if self.max_pred > len(masked_weights):
            masked_weights.extend([0] * (self.max_pred - len(masked_weights)))

        # The author's implementation isn't exactly the same as the original BERT model,
        # since masked_ids only contains the un-masked (original) tokens
        return (input_ids, segment_ids, input_mask, masked_ids, masked_pos,
                masked_weights, is_next, original_ids)
    def __call__(self, data):
        is_next, tokens_a, tokens_b = data
        truncate_tokens_pair(tokens_a, tokens_b, self.max_len - 3)

        # Add Special Tokens
        tokens = ['[CLS]'] + tokens_a + ['[SEP]'] + tokens_b + ['[SEP]']
        segment_ids = [0] * (len(tokens_a) + 2) + [1] * (len(tokens_b) + 1)
        input_mask = [1] * len(tokens)

        # For masked Language Models
        masked_tokens, masked_pos = [], []
        n_pred = min(self.max_pred,
                     max(1, int(round(len(tokens) * self.mask_prob))))
        cand_pos = [
            i for i, token in enumerate(tokens)
            if token != '[CLS]' and token != '[SEP]'
        ]
        shuffle(cand_pos)
        for pos in cand_pos[:int(n_pred)]:
            masked_tokens.append(tokens[pos])
            masked_pos.append(pos)
            if rand() < 0.8:  # 80%
                tokens[pos] = '[MASK]'
            elif rand() < 0.5:  # 10%
                tokens[pos] = get_random_word(self.indexer.vocab)
        masked_weights = [1] * len(masked_tokens)

        # Token Indexing
        input_ids = self.indexer.convert_tokens_to_ids(tokens)
        masked_ids = self.indexer.convert_tokens_to_ids(masked_tokens)

        # Zero Padding
        n_pad = self.max_len - len(input_ids)
        input_ids.extend([0] * int(n_pad))
        segment_ids.extend([0] * int(n_pad))
        input_mask.extend([0] * int(n_pad))

        # Zero Padding for masked target
        if self.max_pred > n_pred:
            n_pad = self.max_pred - n_pred
            masked_ids.extend([0] * int(n_pad))
            masked_pos.extend([0] * int(n_pad))
            masked_weights.extend([0] * int(n_pad))

        return (input_ids, segment_ids, input_mask, masked_ids, masked_pos,
                masked_weights, is_next)
Example #8
    def __call__(self, tokens_a):
        # -2  for special tokens [CLS], [SEP]
        truncate_tokens_pair(tokens_a, [], self.max_len - 2)

        # Add Special Tokens
        tokens = ['[CLS]'] + tokens_a + ['[SEP]']
        token_type_ids = [0] * self.max_len
        attention_mask = [1] * len(tokens)
        original_attention_mask = attention_mask.copy()

        # Build the ElectraGenerator labels: -100 marks an unmasked token; masked positions hold the original token id
        g_label = [-100] * self.max_len

        # Get original input ids as ElectraDiscriminator labels
        original_input_ids = self.indexer(tokens)

        # For masked Language Models
        # The number of predictions is sometimes less than max_pred when the sequence is short
        n_pred = min(self.max_pred,
                     max(1, int(round(len(tokens) * self.mask_prob))))
        # candidate positions of masked tokens
        cand_pos = [
            i for i, token in enumerate(tokens)
            if token != '[CLS]' and token != '[SEP]'
        ]
        shuffle(cand_pos)
        for pos in cand_pos[:n_pred]:
            attention_mask[pos] = 0
            g_label[pos] = self.indexer(
                tokens[pos])[0]  # the indexer returns a list; take its single element
            tokens[pos] = '[MASK]'

        # Token Indexing
        input_ids = self.indexer(tokens)

        # Zero Padding
        n_pad = self.max_len - len(input_ids)
        input_ids.extend([0] * n_pad)
        attention_mask.extend([0] * n_pad)

        return input_ids, attention_mask, token_type_ids, g_label, original_input_ids, original_attention_mask
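The -100 sentinel used in g_label matches the default ignore_index of PyTorch's cross-entropy loss, so the generator loss is computed only at the masked positions. A hedged sketch of how such labels are typically consumed (the generator logits here are placeholder values, not part of the example above):

import torch
import torch.nn.functional as F

vocab_size, max_len = 30522, 128
g_logits = torch.randn(2, max_len, vocab_size)               # placeholder generator output
g_label = torch.full((2, max_len), -100, dtype=torch.long)   # -100 -> ignored by the loss
g_label[0, 5] = 2023                                         # one masked position with its original id
loss = F.cross_entropy(g_logits.view(-1, vocab_size),
                       g_label.view(-1),
                       ignore_index=-100)                    # loss only over masked positions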
    def __call__(self, instance):
        input_tokens, input_pos, input_dep, target_tokens, target_pos, target_dep = instance

        # -3  for special tokens [CLS], [SEP], [SEP]

        truncate_tokens_pair(input_tokens, target_tokens, self.max_len - 3)
        truncate_tokens_pair(input_pos, target_pos, self.max_len - 3)
        truncate_tokens_pair(input_dep, target_dep, self.max_len - 3)
        target_tokens = truncate_tokens(target_tokens, self.max_len)
        target_pos = truncate_tokens(target_pos, self.max_len)
        target_dep = truncate_tokens(target_dep, self.max_len)

        # Add Special Tokens
        origin_word_tokens = (['[CLS]'] + input_tokens + ['[SEP]'] +
                              target_tokens + ['[SEP]'])
        if rand() < 0.5:
            word_tokens = origin_word_tokens
        else:
            word_tokens = ['[CLS]'] + input_tokens + ['[SEP]'] + (
                ['[MASK]'] * len(target_tokens)) + ['[SEP]']

        #word_tokens = ['[CLS]'] + input_tokens + ['[SEP]'] + target_tokens + ['[SEP]']
        pos_tokens = ['[CLS]'] + input_pos + ['[SEP]'] + target_pos + ['[SEP]']
        dep_tokens = ['[CLS]'] + input_dep + ['[SEP]'] + target_dep + ['[SEP]']
        input_segment_ids = [0] * (len(input_tokens) + 2) + [1] * (len(target_tokens) + 1)
        input_mask = [1] * len(word_tokens)
        target_mask = [1] * (len(target_tokens) + 1)
        input_len = len(input_tokens) + 2
        target_len = len(target_tokens) + 1

        input_word_ids, input_pos_ids, input_dep_ids = self.indexer(
            word_tokens, pos_tokens, dep_tokens)
        origin_input_word_ids, _, _ = self.indexer(origin_word_tokens, [], [])
        target_word_ids, target_pos_ids, target_dep_ids = self.indexer(
            target_tokens + ['[SEP]'], target_pos + ['[SEP]'],
            target_dep + ['[SEP]'])

        # Zero Padding
        input_n_pad = self.max_len - len(input_word_ids)
        origin_input_word_ids.extend([0] * input_n_pad)
        input_word_ids.extend([0] * input_n_pad)
        input_pos_ids.extend([0] * input_n_pad)
        input_dep_ids.extend([0] * input_n_pad)
        input_segment_ids.extend([0] * input_n_pad)
        input_mask.extend([0] * input_n_pad)

        target_n_pad = self.max_len - len(target_word_ids)
        target_word_ids.extend([0] * target_n_pad)
        target_pos_ids.extend([0] * target_n_pad)
        target_dep_ids.extend([0] * target_n_pad)
        target_mask.extend([0] * target_n_pad)

        return (origin_input_word_ids, input_word_ids, input_pos_ids,
                input_dep_ids, input_segment_ids, input_mask, target_word_ids,
                target_pos_ids, target_dep_ids, target_mask, input_len,
                target_len)
    def __call__(self, instance):
        input_tokens, input_pos, input_dep, target_tokens, target_pos, target_dep = instance

        # -3  for special tokens [CLS], [SEP], [SEP]
        truncate_tokens_pair(input_tokens, target_tokens, self.max_len - 3)
        truncate_tokens_pair(input_pos, target_pos, self.max_len - 3)
        truncate_tokens_pair(input_dep, target_dep, self.max_len - 3)
        target_tokens = truncate_tokens(target_tokens, self.max_len)
        target_pos = truncate_tokens(target_pos, self.max_len)
        target_dep = truncate_tokens(target_dep, self.max_len)

        # Add Special Tokens
        word_tokens = (['[CLS]'] + input_tokens + ['[SEP]'] +
                       target_tokens + ['[SEP]'])
        pos_tokens = ['[CLS]'] + input_pos + ['[SEP]'] + target_pos + ['[SEP]']
        dep_tokens = ['[CLS]'] + input_dep + ['[SEP]'] + target_dep + ['[SEP]']
        input_segment_ids = [0] * (len(input_tokens) + 2) + [1] * (len(target_tokens) + 1)
        input_mask = [1] * len(word_tokens)

        target_mask = [1] * len(target_tokens)

        # For masked Language Models
        masked_word_tokens, masked_pos_tokens, masked_dep_tokens, masked_pos = [], [], [], []
        # the number of predictions is sometimes less than max_pred when the sequence is short
        n_pred = min(self.max_pred,
                     max(1, int(round(len(word_tokens) * self.mask_prob))))
        # candidate positions of masked tokens; unlike the other examples,
        # [SEP] is kept as a candidate so the summary side can also be masked
        cand_pos = [
            i for i, token in enumerate(word_tokens) if token != '[CLS]'
        ]
        shuffle(cand_pos)
        for pos in cand_pos[:n_pred]:
            masked_word_tokens.append(word_tokens[pos])
            masked_pos_tokens.append(pos_tokens[pos])
            masked_dep_tokens.append(dep_tokens[pos])
            masked_pos.append(pos)
            if rand() < 0.8:  # 80%
                word_tokens[pos] = '[MASK]'
                pos_tokens[pos] = '[MASK]'
                dep_tokens[pos] = '[MASK]'
            #elif rand() < 0.5: # 10%
            #    word_tokens[pos] = get_random_word(self.vocab_words)
            #    pos_tokens[pos] = get_random_word(self.vocab_pos)
            #    dep_tokens[pos] = get_random_word(self.vocab_dep)
        # when n_pred < max_pred, we only calculate loss within n_pred
        masked_weights = [1] * len(masked_pos_tokens)

        #replace right as mask for summary
        #if rand() < 0.1:
        #    word_tokens = word_tokens[:len(input_tokens)+2] + ['[MASK]']*len(self.max_len - len(input_tokens)+2)
        #    pos_tokens = pos_tokens[:len(input_pos)+2] + ['[MASK]']*len(self.max_len - len(input_pos)+2)
        #    dep_tokens = dep_tokens[:len(input_dep)+2] + ['[MASK]']*len(self.max_len - len(input_dep)+2)

        input_word_ids, input_pos_ids, input_dep_ids = self.indexer(
            word_tokens, pos_tokens, dep_tokens)
        masked_word_ids, masked_pos_ids, masked_dep_ids = self.indexer(
            masked_word_tokens, masked_pos_tokens, masked_dep_tokens)
        target_word_ids, target_pos_ids, target_dep_ids = self.indexer(
            target_tokens, target_pos, target_dep)

        # Zero Padding
        input_n_pad = self.max_len - len(input_word_ids)
        input_word_ids.extend([0] * input_n_pad)
        input_pos_ids.extend([0] * input_n_pad)
        input_dep_ids.extend([0] * input_n_pad)
        input_segment_ids.extend([0] * input_n_pad)
        input_mask.extend([0] * input_n_pad)

        target_n_pad = self.max_len - len(target_word_ids)
        target_word_ids.extend([0] * target_n_pad)
        target_pos_ids.extend([0] * target_n_pad)
        target_dep_ids.extend([0] * target_n_pad)
        target_mask.extend([0] * target_n_pad)

        # Zero Padding for masked target
        if self.max_pred > n_pred:
            n_pad = self.max_pred - n_pred
            masked_word_ids.extend([0] * n_pad)
            masked_pos_ids.extend([0] * n_pad)
            masked_dep_ids.extend([0] * n_pad)
            masked_pos.extend([0] * n_pad)
            masked_weights.extend([0] * n_pad)

        return (input_word_ids, input_pos_ids, input_dep_ids,
                input_segment_ids, input_mask, masked_word_ids, masked_pos_ids,
                masked_dep_ids, masked_pos, masked_weights, target_word_ids,
                target_pos_ids, target_dep_ids, target_mask)
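The last two examples also call a single-list truncate_tokens helper that is not shown. A minimal sketch consistent with its usage (return the token list cut down to max_len); the original helper may truncate differently:

def truncate_tokens(tokens, max_len):
    # Sketch (assumption): keep at most the first max_len tokens.
    return tokens[:max_len]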