Example #1
    def __iter__(self, random=False):
        batch_token_ids, batch_segment_ids, batch_labels = [], [], []
        for is_end, item in self.get_sample(random):
            # Each item is a list of (word, label) pairs; start with [CLS].
            token_ids, labels = [tokenizer._token_start_id], [0]
            for w, l in item:
                w_token_ids = tokenizer.encode(w)[0][1:-1]
                if len(token_ids) + len(w_token_ids) < maxlen:
                    token_ids += w_token_ids
                    if l == 'O':
                        labels += [0] * len(w_token_ids)
                    else:
                        # Label ids: 0 is 'O'; entity type k uses
                        # B = 3k + 1, I = 3k + 2, E = 3k + 3.
                        B = label2id[l] * 3 + 1
                        I = label2id[l] * 3 + 2
                        E = label2id[l] * 3 + 3
                        labels += ([B] + [I] * (len(w_token_ids) - 2) +
                                   [E] * int(len(w_token_ids) > 1))
                else:
                    break  # stop once the sequence would exceed maxlen
            token_ids += [tokenizer._token_end_id]  # [SEP]
            labels += [0]
            segment_ids = [0] * len(token_ids)
            batch_token_ids.append(token_ids)
            batch_segment_ids.append(segment_ids)
            batch_labels.append(labels)
            if len(batch_token_ids) == self.batch_size or is_end:
                batch_token_ids = pad_sequences(batch_token_ids)
                batch_segment_ids = pad_sequences(batch_segment_ids)
                batch_labels = pad_sequences(batch_labels)
                yield [batch_token_ids, batch_segment_ids], batch_labels
                batch_token_ids, batch_segment_ids, batch_labels = [], [], []
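The 3k + 1 / 3k + 2 / 3k + 3 arithmetic packs one B/I/E triple per entity type into a single id space, with 0 reserved for 'O'. A quick worked check, assuming a hypothetical label2id = {'PER': 0, 'LOC': 1}:

    # 'PER' -> B=1, I=2, E=3;  'LOC' -> B=4, I=5, E=6;  'O' -> 0
    # 1-token 'PER' word -> [1]            (the I and E terms vanish)
    # 2-token 'PER' word -> [1, 3]
    # 4-token 'LOC' word -> [4, 5, 5, 6]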
Example #2
    def __iter__(self, shuffle=False):
        batch_token_ids, batch_segment_ids, batch_target_ids, batch_is_masked = [], [], [], []

        for is_end, item in self.get_sample(shuffle):
            source_tokens, target_tokens, segment_ids = random_masking(item)

            is_masked = [0 if i == 0 else 1 for i in target_tokens]
            batch_token_ids.append(source_tokens)
            batch_segment_ids.append(segment_ids)
            batch_target_ids.append(target_tokens)
            batch_is_masked.append(is_masked)
            # batch_nsp.append([label])

            if is_end or len(batch_token_ids) == self.batch_size:
                batch_token_ids = pad_sequences(batch_token_ids)
                batch_segment_ids = pad_sequences(batch_segment_ids)
                batch_target_ids = pad_sequences(batch_target_ids)
                batch_is_masked = pad_sequences(batch_is_masked)
                # batch_nsp = pad_sequences(batch_nsp)

                yield [
                    batch_token_ids, batch_segment_ids, batch_target_ids,
                    batch_is_masked
                ], None

                batch_token_ids, batch_segment_ids, batch_target_ids, batch_is_masked = [], [], [], []
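Several of these generators lean on a random_masking helper that is not shown. Below is a minimal sketch of the two-value variant used in Examples #7 and #13 (this example's three-value variant presumably also tokenizes the item and returns segment ids); the 15% rate, the 80/10/10 split, and the tokenizer._token_mask_id / _token_dict attributes are assumptions modeled on BERT's masking recipe:

    def random_masking(token_ids):
        """Sketch: BERT-style masking. Returns (source, target), where target
        is 0 at unpredicted positions and the original id at masked ones."""
        rands = np.random.random(len(token_ids))
        source, target = [], []
        for r, t in zip(rands, token_ids):
            if r < 0.15 * 0.8:    # replace with [MASK]
                source.append(tokenizer._token_mask_id)
                target.append(t)
            elif r < 0.15 * 0.9:  # keep the token but still predict it
                source.append(t)
                target.append(t)
            elif r < 0.15:        # replace with a random vocabulary id
                source.append(np.random.randint(0, len(tokenizer._token_dict)))
                target.append(t)
            else:                 # untouched and not predicted
                source.append(t)
                target.append(0)
        return source, target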
Example #3
    def __iter__(self, random=False):
        # Labels: 0 = single-character word, 1 = start of a multi-character
        # word, 2 = middle, 3 = end.
        batch_tokens, batch_segs, batch_labels = [], [], []
        for is_end, item in self.get_sample(random):
            token_ids, labels = [tokenizer._token_start_id], [0]
            for word in item:
                token_id = tokenizer.encode(word)[0][1:-1]
                if len(token_ids) + len(token_id) > maxlen:
                    break
                if len(token_id) == 1:
                    labels += [0]
                else:
                    labels += [1] + [2] * (len(token_id) - 2) + [3]
                token_ids += token_id

            token_ids.append(tokenizer._token_end_id)
            labels.append(0)
            batch_tokens.append(token_ids)
            batch_segs.append([0] * len(token_ids))
            batch_labels.append(labels)

            if len(batch_tokens) >= self.batch_size or is_end:
                batch_tokens = pad_sequences(batch_tokens)
                batch_segs = pad_sequences(batch_segs)
                batch_labels = pad_sequences(batch_labels)
                yield [batch_tokens, batch_segs], batch_labels
                batch_tokens, batch_segs, batch_labels = [], [], []
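For instance, a single-token word contributes [0], a two-token word [1, 3], and a four-token word [1, 2, 2, 3]; with a hypothetical item:

    # item = ['我', '喜欢', '北京烤鸭']      (token lengths 1, 2, 4)
    # labels = [0]            [CLS]
    #        + [0]            我
    #        + [1, 3]         喜欢
    #        + [1, 2, 2, 3]   北京烤鸭
    #        + [0]            [SEP]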
Example #4
    def __iter__(self, shuffle=False):
        batch_token_ids, batch_segment_ids, batch_labels = [], [], []
        for is_end, (text, label, label_des) in self.get_sample(shuffle):
            if not self.seq2seq:
                token_ids, segment_ids = tokenizer.encode(text, maxlen=maxlen)
            else:
                text_token_ids = tokenizer.encode(text, maxlen=maxlen)[0]
                label_token_ids = tokenizer.encode(label_des,
                                                   maxlen=max_label + 2)[0][1:]
                token_ids = text_token_ids + label_token_ids
                segment_ids = ([0] * len(text_token_ids) +
                               [1] * len(label_token_ids))

            batch_token_ids.append(token_ids)
            batch_segment_ids.append(segment_ids)
            batch_labels.append([label])
            if len(batch_token_ids) == self.batch_size or is_end:
                batch_token_ids = pad_sequences(batch_token_ids)
                batch_segment_ids = pad_sequences(batch_segment_ids)
                batch_labels = pad_sequences(batch_labels)
                # if self.seq2seq:
                #     yield [batch_token_ids, batch_segment_ids], None
                # else:
                yield [batch_token_ids, batch_segment_ids], batch_labels
                batch_token_ids, batch_segment_ids, batch_labels = [], [], []
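In the seq2seq branch the label description rides along as a second segment, UniLM-style. Schematically:

    # token_ids   = [CLS] t1 t2 ... [SEP] | l1 l2 ... [SEP]
    # segment_ids =   0   0  0  ...   0   |  1  1 ...   1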
Example #5
    def __iter__(self, shuffle=False):
        batch_token_ids, batch_segment_ids, batch_labels = [], [], []
        for is_end, item in self.get_sample(shuffle):
            context, question, answers = item[1:]

            context = self.random_generator(context)

            token_ids, segment_ids = tokenizer.encode(question,
                                                      context,
                                                      maxlen=maxlen)
            qt = tokenizer.tokenize(question)
            token_ids = self.random_padding(token_ids, len(qt))

            a = np.random.choice(answers)
            a_token_ids = tokenizer.encode(a)[0][1:-1]
            start_index = search(a_token_ids, token_ids)
            if start_index != -1:
                labels = [[start_index], [start_index + len(a_token_ids) - 1]]
                batch_token_ids.append(token_ids)
                batch_segment_ids.append(segment_ids)
                batch_labels.append(labels)
                if len(batch_token_ids) == self.batch_size or is_end:
                    batch_token_ids = pad_sequences(batch_token_ids)
                    batch_segment_ids = pad_sequences(batch_segment_ids)
                    batch_labels = pad_sequences(batch_labels)
                    yield [batch_token_ids, batch_segment_ids], batch_labels
                    batch_token_ids, batch_segment_ids, batch_labels = [], [], []
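This example and Example #8 both call a search helper to locate the answer span inside the encoded passage. A minimal sketch of the usual naive subsequence search (returning -1 when the answer was truncated away, which is why both examples guard on start_index != -1):

    def search(pattern, sequence):
        """Return the first index where pattern occurs in sequence, else -1."""
        n = len(pattern)
        for i in range(len(sequence) - n + 1):
            if sequence[i:i + n] == pattern:
                return i
        return -1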
Example #6
    def __iter__(self, shuffle=False):
        batch_token_ids, batch_segment_ids, batch_target_ids, batch_is_masked, batch_nsp = [], [], [], [], []

        for is_end, item in self.get_sample(shuffle):
            # With probability 0.5, shuffle the reply to build a negative pair.
            label = 1
            p = np.random.random()
            if p < 0.5:
                label = 0
                item = shuffle_reply(item)

            source_tokens, target_tokens, segment_ids = random_masking(item)

            is_masked = [0 if i == 0 else 1 for i in target_tokens]
            batch_token_ids.append(source_tokens)
            batch_segment_ids.append(segment_ids)
            batch_target_ids.append(target_tokens)
            batch_is_masked.append(is_masked)
            batch_nsp.append([label])

            if is_end or len(batch_token_ids) == self.batch_size:
                batch_token_ids = pad_sequences(batch_token_ids, maxlen=maxlen)
                batch_segment_ids = pad_sequences(batch_segment_ids, maxlen=maxlen)
                batch_target_ids = pad_sequences(batch_target_ids, maxlen=maxlen)
                batch_is_masked = pad_sequences(batch_is_masked, maxlen=maxlen)
                batch_nsp = pad_sequences(batch_nsp)

                yield [batch_token_ids, batch_segment_ids, batch_target_ids, batch_is_masked, batch_nsp], None

                batch_token_ids, batch_segment_ids, batch_target_ids, batch_is_masked, batch_nsp = [], [], [], [], []
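Here the MLM targets double as the mask indicator: positions where target_tokens is 0 are simply not predicted (compare the random_masking sketch after Example #2), and batch_nsp carries the order-classification label. With hypothetical ids:

    # target_tokens = [0, 0, 742, 0, 8021, 0]
    # is_masked     = [0, 0, 1,   0, 1,    0]
    # nsp label     = 1 if the original order was kept, 0 if shuffled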
Example #7
    def __iter__(self, shuffle=False):
        batch_tokens, batch_segments, batch_targets = [], [], []

        for is_end, (text, _, label) in self.get_sample(shuffle):
            text = pattern + text
            token_ids, seg_ids = tokenizer.encode(text, maxlen=maxlen)
            # Apply random masking during training.
            if shuffle:
                source_tokens, target_tokens = self.random_masking(token_ids)
            else:
                source_tokens, target_tokens = token_ids[:], token_ids[:]

            # Write the verbalized label's token ids into the mask positions.
            if len(label) == 2:
                label_ids = tokenizer.encode(label)[0][1:-1]
                for m, l in zip(mask_idx, label_ids):
                    source_tokens[m] = tokenizer._token_mask_id
                    target_tokens[m] = l

            batch_tokens.append(source_tokens)
            batch_segments.append(seg_ids)
            batch_targets.append(target_tokens)

            if len(batch_tokens) == self.batch_size or is_end:
                batch_tokens = pad_sequences(batch_tokens)
                batch_segments = pad_sequences(batch_segments)
                batch_targets = pad_sequences(batch_targets)

                yield [batch_tokens, batch_segments, batch_targets], None

                batch_tokens, batch_segments, batch_targets = [], [], []
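This is a PET-style setup: a fixed pattern is prepended to every text, mask_idx marks where the two-character verbalizer sits, and the MLM head is trained to reconstruct it. Schematically, assuming mask_idx = [1, 2]:

    # source_tokens: [CLS] [MASK] [MASK] p3 p4 ... t1 t2 ...
    # target_tokens: [CLS]   y1     y2   p3 p4 ... t1 t2 ...
    # where (y1, y2) = tokenizer.encode(label)[0][1:-1], the 2-char label.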
Example #8
    def __iter__(self, random=False):
        batch_token_ids, batch_segment_ids, batch_labels = [], [], []
        for is_end, item in self.get_sample(random):
            context, questions, answers = item[1:]
            if not isinstance(questions, list):
                question = questions
            else:
                # Half the time take the first question, otherwise sample one.
                question = (questions[0] if np.random.random() > 0.5 else
                            np.random.choice(questions))

            token_ids, segment_ids = tokenizer.encode(question,
                                                      context,
                                                      maxlen=maxlen)
            a = np.random.choice(answers)
            a_token_ids = tokenizer.encode(a)[0][1:-1]
            start_index = search(a_token_ids, token_ids)
            if start_index != -1:
                labels = [[start_index], [start_index + len(a_token_ids) - 1]]
                batch_token_ids.append(token_ids)
                batch_segment_ids.append(segment_ids)
                batch_labels.append(labels)
                if len(batch_token_ids) == self.batch_size or is_end:
                    batch_token_ids = pad_sequences(batch_token_ids)
                    batch_segment_ids = pad_sequences(batch_segment_ids)
                    batch_labels = pad_sequences(batch_labels)
                    yield [batch_token_ids, batch_segment_ids], batch_labels
                    batch_token_ids, batch_segment_ids, batch_labels = [], [], []
Example #9
    def __iter__(self, shuffle=False):
        batch_token_ids, batch_segment_ids, batch_labels = [], [], []
        for is_end, (text, label) in self.get_sample(shuffle):
            token_ids, segment_ids = tokenizer.encode(text, maxlen=maxlen)
            batch_token_ids.append(token_ids)
            batch_segment_ids.append(segment_ids)
            batch_labels.append([label])
            if len(batch_token_ids) == self.batch_size or is_end:
                batch_token_ids = pad_sequences(batch_token_ids)
                batch_segment_ids = pad_sequences(batch_segment_ids)
                batch_labels = pad_sequences(batch_labels)
                yield [batch_token_ids, batch_segment_ids], batch_labels
                batch_token_ids, batch_segment_ids, batch_labels = [], [], []
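Every example here relies on a pad_sequences helper. A minimal sketch for the common 1-D case, assuming the usual right-pad-to-longest behavior (bert4keras names its equivalent sequence_padding; the maxlen and value parameters are assumptions):

    def pad_sequences(batch, maxlen=None, value=0):
        """Right-pad (or truncate) each sequence and stack into an array."""
        if maxlen is None:
            maxlen = max(len(x) for x in batch)
        padded = [list(x)[:maxlen] + [value] * max(maxlen - len(x), 0)
                  for x in batch]
        return np.array(padded)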
Example #10
    def __iter__(self):
        batch_token_ids, batch_segment_ids, batch_labels, batch_logits = [], [], [], []
        for is_end, (text, label, logits) in self.get_sample():
            token_ids, segment_ids = tokenizer.encode(text, maxlen=maxlen)
            batch_token_ids.append(token_ids)
            batch_segment_ids.append(segment_ids)

            batch_labels.append(label)
            batch_logits.append(logits)

            if len(batch_token_ids) == self.batch_size or is_end:
                batch_token_ids = pad_sequences(batch_token_ids)
                batch_segment_ids = pad_sequences(batch_segment_ids)
                batch_labels = pad_sequences(batch_labels)
                batch_logits = pad_sequences(batch_logits)
                yield [batch_token_ids, batch_segment_ids], [batch_labels, batch_logits]

                batch_token_ids, batch_segment_ids, batch_labels, batch_logits = [], [], [], []
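Yielding [batch_labels, batch_logits] as two targets suggests knowledge distillation: one head is fit to the gold labels, another to a teacher's logits. A hedged sketch of a matching two-output student (the dummy encoder, vocabulary size, class count, and loss choices are all assumptions):

    from tensorflow import keras

    num_classes = 2      # hypothetical
    tok = keras.Input(shape=(None,), dtype='int32')  # token_ids
    seg = keras.Input(shape=(None,), dtype='int32')  # segment_ids (unused by this stand-in encoder)
    h = keras.layers.GlobalAveragePooling1D()(
        keras.layers.Embedding(21128, 128)(tok))     # 21128 = BERT-Chinese vocab
    probs = keras.layers.Dense(num_classes, activation='softmax')(h)
    logits = keras.layers.Dense(num_classes)(h)
    student = keras.Model([tok, seg], [probs, logits])
    student.compile(optimizer='adam',
                    loss=['sparse_categorical_crossentropy',  # hard targets
                          'mse'])                             # soft teacher logits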
Example #11
    def __iter__(self, random=False):
        batch_token_ids, batch_segment_ids, batch_labels = [], [], []
        for is_end, (text, label, label_des) in self.get_sample(random):
            if self.data_augmentation:
                text = self.generate_text(text)
            token_ids, segment_ids = tokenizer.encode(text, maxlen=maxlen)

            batch_token_ids.append(token_ids)
            batch_segment_ids.append(segment_ids)
            batch_labels.append([label])
            if len(batch_token_ids) == self.batch_size or is_end:
                batch_token_ids = pad_sequences(batch_token_ids)
                batch_segment_ids = pad_sequences(batch_segment_ids)
                batch_labels = pad_sequences(batch_labels)
                if not self.transfer:
                    yield [batch_token_ids, batch_segment_ids], batch_labels
                else:
                    yield [batch_token_ids, batch_segment_ids,
                           batch_labels], None
                batch_token_ids, batch_segment_ids, batch_labels = [], [], []
Example #12
    def __iter__(self, shuffle=False):
        """[CLS]context[SEP]answer[SEP]question[SEP]"""
        batch_token_ids, batch_segment_ids = [], []
        for is_end, item in self.get_sample(shuffle):
            context, question, answer = item[1:]
            c_token_ids, _ = tokenizer.encode(context,
                                              maxlen=max_context_len + 1)
            q_token_ids, _ = tokenizer.encode(question,
                                              maxlen=max_question_len)
            a_token_ids, _ = tokenizer.encode(answer, maxlen=max_answer_len)

            token_ids = c_token_ids + a_token_ids[1:] + q_token_ids[1:]
            segment_ids = ([0] * len(c_token_ids) +
                           [1] * (len(token_ids) - len(c_token_ids)))

            batch_token_ids.append(token_ids)
            batch_segment_ids.append(segment_ids)
            if len(batch_token_ids) == self.batch_size or is_end:
                batch_token_ids = pad_sequences(batch_token_ids)
                batch_segment_ids = pad_sequences(batch_segment_ids)
                yield [batch_token_ids, batch_segment_ids], None
                batch_token_ids, batch_segment_ids = [], []
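The docstring's layout, made explicit (the question lives entirely in segment 1, so a UniLM-style attention mask can train the model to generate it from context plus answer):

    # token_ids   = [CLS] c1 ... cn [SEP] a1 ... am [SEP] q1 ... qk [SEP]
    # segment_ids =   0   0  ...  0   0   1  ...  1   1   1  ...  1   1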
Example #13
    def __iter__(self, shuffle=False):
        batch_token_ids, batch_segment_ids, batch_target_ids = [], [], []

        for is_end, (q_id, q, r_id, r, label) in self.get_sample(shuffle):
            label = int(label) if label is not None else None

            if label is not None or self.prefix:
                q = pattern + q

            token_ids, segment_ids = tokenizer.encode(q, r, maxlen=maxlen)

            if shuffle:
                source_tokens, target_tokens = random_masking(token_ids)
            else:
                source_tokens, target_tokens = token_ids[:], token_ids[:]

            # Write the verbalized label's token ids into the mask positions.
            if label is not None:
                label_ids = tokenizer.encode(id2label[label])[0][1:-1]
                for m, lb in zip(mask_idx, label_ids):
                    source_tokens[m] = tokenizer._token_mask_id
                    target_tokens[m] = lb
            elif self.prefix:
                for i in mask_idx:
                    source_tokens[i] = tokenizer._token_mask_id

            batch_token_ids.append(source_tokens)
            batch_segment_ids.append(segment_ids)
            batch_target_ids.append(target_tokens)

            if is_end or len(batch_token_ids) == self.batch_size:
                batch_token_ids = pad_sequences(batch_token_ids)
                batch_segment_ids = pad_sequences(batch_segment_ids)
                batch_target_ids = pad_sequences(batch_target_ids)

                yield [batch_token_ids, batch_segment_ids,
                       batch_target_ids], None

                batch_token_ids, batch_segment_ids, batch_target_ids = [], [], []
Example #14
    def __iter__(self, shuffle=False):
        batch_token_ids, batch_segment_ids, batch_original_token_ids = [], [], []
        for is_end, item in self.get_sample(shuffle):
            context, question, answers = item[1:]
            answer = np.random.choice(answers)
            token_ids, _ = tokenizer.encode(
                answer, context, maxlen=maxlen - max_question_len - 1)
            segment_ids = [0] * len(token_ids)

            question_token_ids = tokenizer.encode(question)[0][1:]
            token_ids = token_ids + question_token_ids
            segment_ids += [1] * len(question_token_ids)

            original_tokens = token_ids
            # Randomly replace decoder-side tokens to build a negative sample:
            # the sample is negative with probability 0.5, and within a
            # negative sample each segment-1 token is replaced with
            # probability 0.3.
            is_negative = np.random.random() > 0.5
            if is_negative:
                token_ids = [
                    token if seg == 0 or np.random.random() > 0.3 else
                    np.random.choice(token_ids)
                    for token, seg in zip(token_ids, segment_ids)
                ]
            batch_token_ids.append(token_ids)
            batch_segment_ids.append(segment_ids)
            batch_original_token_ids.append(original_tokens)

            if len(batch_token_ids) == self.batch_size or is_end:
                batch_token_ids = pad_sequences(batch_token_ids)
                batch_segment_ids = pad_sequences(batch_segment_ids)
                batch_original_token_ids = pad_sequences(
                    batch_original_token_ids)
                yield [
                    batch_token_ids, batch_segment_ids,
                    batch_original_token_ids
                ], None
                batch_token_ids, batch_segment_ids, batch_original_token_ids = [], [], []
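Working through the replacement expression: encoder-side (segment 0) tokens are never touched, and a decoder-side token survives unless both random draws fall the wrong way:

    # seg == 0                      -> kept (encoder side untouched)
    # seg == 1 and random() > 0.3   -> kept                       (p = 0.7)
    # seg == 1 and random() <= 0.3  -> replaced by a random token  (p = 0.3)
    # gated by the 0.5 negative-sample draw: P(replace) = 0.5 * 0.3 = 0.15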
Example #15
    def __iter__(self, shuffle=False):
        batch_token_ids, batch_segment_ids, batch_labels = [], [], []
        for is_end, (text, label, label_des) in self.get_sample(shuffle):
            if not self.sim:
                token_ids, segment_ids = tokenizer.encode(text, maxlen=maxlen)
                batch_token_ids.append(token_ids)
                batch_segment_ids.append(segment_ids)
                batch_labels.append([label])

            else:
                text_token_ids = tokenizer.encode(text, maxlen=maxlen)[0]
                label_token_ids = tokenizer.encode(label_des, maxlen=max_label + 2)[0][1:]
                token_ids = [text_token_ids, label_token_ids]
                segment_ids = [[0] * len(text_token_ids),
                               [0] * len(label_token_ids)]
                batch_token_ids.extend(token_ids)
                batch_segment_ids.extend(segment_ids)
                batch_labels.extend([[label]] * 2)

            if len(batch_token_ids) == self.batch_size or is_end:
                batch_token_ids = pad_sequences(batch_token_ids)
                batch_segment_ids = pad_sequences(batch_segment_ids)
                batch_labels = pad_sequences(batch_labels)
                yield [batch_token_ids, batch_segment_ids], batch_labels
                batch_token_ids, batch_segment_ids, batch_labels = [], [], []
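In sim mode each sample contributes two rows, the text and its label description, tagged with the same label, so batch_size effectively counts rows rather than samples (presumably for a similarity-style objective that pairs adjacent rows):

    # per sample in sim mode:
    # batch_token_ids += [text_token_ids, label_token_ids]   -> 2 rows
    # batch_labels    += [[label], [label]]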