Example no. 1
 def __iter__(self, random=False):
     """标签含义
     0: 单字词; 1: 多字词首字; 2: 多字词中间; 3: 多字词末字
     """
     batch_token_ids, batch_segment_ids, batch_labels = [], [], []
     for is_end, item in self.sample(random):
         token_ids, labels = [tokenizer._token_start_id], [0]
         for w in item:
             w_token_ids = tokenizer.encode(w)[0][1:-1]
             if len(token_ids) + len(w_token_ids) < maxlen:
                 token_ids += w_token_ids
                 if len(w_token_ids) == 1:
                     labels += [0]
                 else:
                     labels += [1] + [2] * (len(w_token_ids) - 2) + [3]
             else:
                 break
         token_ids += [tokenizer._token_end_id]
         labels += [0]
         segment_ids = [0] * len(token_ids)
         batch_token_ids.append(token_ids)
         batch_segment_ids.append(segment_ids)
         batch_labels.append(labels)
         if len(batch_token_ids) == self.batch_size or is_end:
             batch_token_ids = sequence_padding(batch_token_ids)
             batch_segment_ids = sequence_padding(batch_segment_ids)
             batch_labels = sequence_padding(batch_labels)
             yield [batch_token_ids, batch_segment_ids], batch_labels
             batch_token_ids, batch_segment_ids, batch_labels = [], [], []
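
A small stand-alone helper (hypothetical, not part of the original generator, and assuming one token per character) reproduces the 0/1/2/3 labelling described in the docstring above:

def word_seg_labels(words):
    """0: single-char word; 1: first char; 2: middle char; 3: last char."""
    labels = []
    for w in words:
        if len(w) == 1:
            labels += [0]
        else:
            labels += [1] + [2] * (len(w) - 2) + [3]
    return labels

# e.g. word_seg_labels(['我', '喜欢', '自然语言']) == [0, 1, 3, 1, 2, 2, 3]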
Example no. 2
    def __iter__(self, random=False):  # TODO: 'random' here means whether the original text should be randomly masked
        batch_token_ids, batch_segment_ids, batch_output_ids = [], [], []
        for is_end, (text, label) in self.sample(random):
            token_ids, segment_ids = tokenizer.encode(text, maxlen=maxlen)
            if label != 2:
                token_ids = token_ids[:1] + desc_ids + token_ids[1:]  # token_ids[:1] = [CLS]
                segment_ids = [0] * len(desc_ids) + segment_ids
            if random:  # not used for now
                source_ids, target_ids = random_masking(token_ids)
            else:
                source_ids, target_ids = token_ids[:], token_ids[:]
            # label_ids = tokenizer.encode(label)[0][1:-1]  # vocab ids of the label word, e.g. "财经" -> [1092, 752], "农业" -> [1093, 689]; tokenizer.encode(label) == ([101, 1093, 689, 102], [0, 0, 0, 0]), i.e. [CLS, 农, 业, SEP]
            # print("label_ids:", label_ids, "; tokenizer.token_to_id(label[0]):", tokenizer.token_to_id(label[0]))
            for i,mask_id in enumerate(mask_idxs):
                source_ids[mask_id] = tokenizer._token_mask_id
                target_ids[mask_id] = tokenizer.token_to_id(label[i])  # token_to_id gives the same result as slicing tokenizer.encode

            batch_token_ids.append(source_ids)
            batch_segment_ids.append(segment_ids)
            batch_output_ids.append(target_ids)
            if len(batch_token_ids) == self.batch_size or is_end:  # padding step
                batch_token_ids = sequence_padding(batch_token_ids)
                batch_segment_ids = sequence_padding(batch_segment_ids)
                batch_output_ids = sequence_padding(batch_output_ids)
                yield [
                    batch_token_ids, batch_segment_ids, batch_output_ids
                ], None
                batch_token_ids, batch_segment_ids, batch_output_ids = [], [], []
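
random_masking is called here and in several later generators but is not defined in these excerpts. A minimal sketch of what it is assumed to do, reusing the same module-level tokenizer as the snippets above; the real helper may instead use BERT's full 80/10/10 recipe or keep the original ids at unmasked positions (and Example no. 12 below uses a variant that returns a single masked sequence):

import numpy as np

def random_masking(token_ids, mask_rate=0.15):
    """Return (source_ids, target_ids): about mask_rate of the positions are replaced
    by [MASK] in source_ids; target_ids keeps the original id there and 0 elsewhere."""
    rands = np.random.random(len(token_ids))
    source, target = [], []
    for r, t in zip(rands, token_ids):
        if r < mask_rate:
            source.append(tokenizer._token_mask_id)  # assumes the same global tokenizer object
            target.append(t)
        else:
            source.append(t)
            target.append(0)  # 0 = position not predicted
    return source, target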
Example no. 3
 def __iter__(self, random=False):
     """Single-sample format: [CLS]passage[SEP]question[SEP]answer[SEP]
     """
     batch_token_ids, batch_segment_ids = [], []
     for is_end, D in self.sample(random):
         question = D['question']
         answers = [p['answer'] for p in D['passages'] if p['answer']]
         passage = np.random.choice(D['passages'])['passage']
         passage = re.sub(u' |、|;|,', ',', passage)
         final_answer = ''
         for answer in answers:
             if all(
                 [a in passage[:max_p_len - 2] for a in answer.split(' ')]):
                 final_answer = answer.replace(' ', ',')
                 break
         qa_token_ids, qa_segment_ids = tokenizer.encode(
             question, final_answer, max_length=max_qa_len + 1)
         p_token_ids, p_segment_ids = tokenizer.encode(passage,
                                                       max_length=max_p_len)
         token_ids = p_token_ids + qa_token_ids[1:]
         segment_ids = p_segment_ids + qa_segment_ids[1:]
         batch_token_ids.append(token_ids)
         batch_segment_ids.append(segment_ids)
         if len(batch_token_ids) == self.batch_size or is_end:
             batch_token_ids = sequence_padding(batch_token_ids)
             batch_segment_ids = sequence_padding(batch_segment_ids)
             yield [batch_token_ids, batch_segment_ids], None
             batch_token_ids, batch_segment_ids = [], []
Example no. 4
 def __iter__(self, random=False):
     """Label meanings:
     0: single-character word; 1: first char of a multi-char word; 2: middle char; 3: last char
     """
     idxs = list(range(len(self.data)))
     if random:
         np.random.shuffle(idxs)
     batch_token_ids, batch_segment_ids, batch_labels = [], [], []
     for i in idxs:
         token_ids, labels = [tokenizer._token_cls_id], [0]
         for w in self.data[i]:
             w_token_ids = tokenizer.encode(w)[0][1:-1]
             if len(token_ids) + len(w_token_ids) < maxlen:
                 token_ids += w_token_ids
                 if len(w_token_ids) == 1:
                     labels += [0]
                 else:
                     labels += [1] + [2] * (len(w_token_ids) - 2) + [3]
             else:
                 break
         token_ids += [tokenizer._token_sep_id]
         labels += [0]
         segment_ids = [0] * len(token_ids)
         batch_token_ids.append(token_ids)
         batch_segment_ids.append(segment_ids)
         batch_labels.append(labels)
         if len(batch_token_ids) == self.batch_size or i == idxs[-1]:
             batch_token_ids = sequence_padding(batch_token_ids)
             batch_segment_ids = sequence_padding(batch_segment_ids)
             batch_labels = sequence_padding(batch_labels)
             yield [batch_token_ids, batch_segment_ids], batch_labels
             batch_token_ids, batch_segment_ids, batch_labels = [], [], []
Example no. 5
    def __iter__(self, random=True):
        """
        Single-sample format: [cls]wrong text[sep][mask][mask]..[sep]
        :param random:
        :return:
        """
        batch_tokens_ids, batch_segment_ids, batch_right_token_ids = [], [], []
        for is_end, D in self.sample(random):
            wrong, right = D
            right_token_ids, _ = tokenizer.encode(first_text=right)
            wrong_token_ids, _ = tokenizer.encode(first_text=wrong)

            token_ids = wrong_token_ids
            token_ids += [tokenizer._token_mask_id] * max_len
            token_ids += [tokenizer._token_end_id]

            segment_ids = [0] * len(token_ids)

            batch_tokens_ids.append(token_ids)
            batch_segment_ids.append(segment_ids)
            batch_right_token_ids.append(right_token_ids[1:])

            if len(batch_tokens_ids) == self.batch_size or is_end:
                batch_tokens_ids = sequence_padding(batch_tokens_ids)
                batch_segment_ids = sequence_padding(batch_segment_ids)
                batch_right_token_ids = sequence_padding(
                    batch_right_token_ids, max_len)

                yield [batch_tokens_ids,
                       batch_segment_ids], batch_right_token_ids
                batch_tokens_ids, batch_segment_ids, batch_right_token_ids = [], [], []
Example no. 6
    def __iter__(self, random=False):
        batch_ori_sentence = []
        batch_token_ids, batch_segment_ids = [], []
        batch_start, batch_end, batch_insert_pos, batch_start_ner, batch_end_ner = [], [], [], [], []
        for is_end, d in self.sample(random):
            ori_sentence, sentence, token_type, pointer = d["ori_sentence"], d[
                "sentence"], d["token_type"], d["pointer"]
            token_ids, segment_ids = sentence, token_type
            batch_ori_sentence.append(ori_sentence)
            batch_token_ids.append(token_ids)
            batch_segment_ids.append(segment_ids)
            start, end, insert_pos, start_ner, end_ner = pointer
            batch_start.append(start)
            batch_end.append(end)
            batch_insert_pos.append(insert_pos)
            batch_start_ner.append(start_ner)
            batch_end_ner.append(end_ner)

            if len(batch_token_ids) == self.batch_size or is_end:
                batch_ori_sentence = np.array(batch_ori_sentence)
                batch_token_ids = sequence_padding(batch_token_ids)
                batch_segment_ids = sequence_padding(batch_segment_ids,
                                                     padding=1)
                batch_start = np.array(batch_start)
                batch_end = np.array(batch_end)
                batch_insert_pos = np.array(batch_insert_pos)
                batch_start_ner = np.array(batch_start_ner)
                batch_end_ner = np.array(batch_end_ner)
                yield [
                    batch_token_ids, batch_segment_ids, batch_start, batch_end,
                    batch_insert_pos, batch_start_ner, batch_end_ner
                ], batch_ori_sentence
                batch_ori_sentence = []
                batch_token_ids, batch_segment_ids = [], []
                batch_start, batch_end, batch_insert_pos, batch_start_ner, batch_end_ner = [], [], [], [], []
Example no. 7
def evaluate_report(df_data):
    model = tf.keras.models.load_model('{}-model.h5'.format(model_name))
    true_y_list = [i for i in df_data["new_label"].tolist()]
    pred_y_list = []
    tokenizer = Tokenizer(dict_path, do_lower_case=True)  # build the tokenizer once, outside the loop
    for text in df_data["text"].tolist():
        token_ids, segment_ids = tokenizer.encode(first_text=text,
                                                  maxlen=maxlen)
        token_list = sequence_padding([token_ids])
        segment_list = sequence_padding([segment_ids])
        label = model.predict([np.array(token_list),
                               np.array(segment_list)]).argmax(axis=1)
        pred_y_list.append(label[0])

    with open("label.json", "r", encoding="utf-8") as f:
        labels = json.loads(f.read())
    target_name_list = list(labels.values())
    report = classification_report(true_y_list,
                                   pred_y_list,
                                   target_names=target_name_list,
                                   digits=4,
                                   output_dict=True)
    print(report)
    df = pd.DataFrame(report).transpose()
    df.to_csv("{}-report.csv".format(model_type),
              encoding='utf_8_sig',
              index=True)
Example no. 8
def gen_synonyms(text, n=100, k=20):
    """"含义: 产生sent的n个相似句,然后返回最相似的k个。
    做法:用seq2seq生成,并用encoder算相似度并排序。
    效果:
        >>> gen_synonyms(u'微信和支付宝哪个好?')
        [
            u'微信和支付宝,哪个好?',
            u'微信和支付宝哪个好',
            u'支付宝和微信哪个好',
            u'支付宝和微信哪个好啊',
            u'微信和支付宝那个好用?',
            u'微信和支付宝哪个好用',
            u'支付宝和微信那个更好',
            u'支付宝和微信哪个好用',
            u'微信和支付宝用起来哪个好?',
            u'微信和支付宝选哪个好',
        ]
    """
    r = synonyms_generator.generate(text, n)
    r = [i for i in set(r) if i != text]
    r = [text] + r
    X, S = [], []
    for t in r:
        x, s = tokenizer.encode(t)
        X.append(x)
        S.append(s)
    X = sequence_padding(X)
    S = sequence_padding(S)
    Z = encoder.predict([X, S])
    Z /= (Z**2).sum(axis=1, keepdims=True)**0.5
    argsort = np.dot(Z[1:], -Z[0]).argsort()
    return [r[i + 1] for i in argsort[:k]]
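
The ranking at the end works because Z is L2-normalised, so np.dot(Z[1:], -Z[0]).argsort() orders the candidates by descending cosine similarity to the original sentence in row 0. A tiny numeric check with made-up vectors:

import numpy as np

Z = np.array([[1.0, 0.0], [0.9, 0.1], [0.0, 1.0]])
Z /= (Z ** 2).sum(axis=1, keepdims=True) ** 0.5   # L2-normalise each row
order = np.dot(Z[1:], -Z[0]).argsort()
print(order)  # [0 1]: the second row is more similar to the first row than the third is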
Example no. 9
 def __iter__(self, r=False):
     idxs = list(range(len(self.data)))
     np.random.shuffle(idxs)
     batch_token_ids, batch_segment_ids, batch_labels = [], [], []
     for i in idxs:
         line = self.data.loc[i]
         if random.random() < 0.5:  # 'random' here is the stdlib module, not the unused 'r' flag; swap s1/s2 half the time
             s1 = line['s1'].replace('***', '*')
             s2 = line['s2'].replace('***', '*')
         else:
             s2 = line['s1'].replace('***', '*')
             s1 = line['s2'].replace('***', '*')
         token_ids, segment_ids = tokenizer.encode(s1,
                                                   s2,
                                                   max_length=maxlen)
         batch_token_ids.append(token_ids)
         batch_segment_ids.append(segment_ids)
         batch_labels.append([line['label']])
         if len(batch_token_ids) == self.batch_size or i == idxs[-1]:
             batch_token_ids = sequence_padding(batch_token_ids)
             batch_segment_ids = sequence_padding(batch_segment_ids)
             batch_labels = sequence_padding(batch_labels)
             yield [batch_token_ids, batch_segment_ids,
                    batch_labels], None
             batch_token_ids, batch_segment_ids, batch_labels = [], [], []
Example no. 10
 def __iter__(self, _):
     batch_token_ids, batch_segment_ids = [], []
     fpI = open(self.data + '/in.txt', 'r', encoding='utf-8')
     fpO = open(self.data + '/out.txt', 'r', encoding='utf-8')
     for lineI in fpI:
         lineI = lineI.rstrip()
         lineO = fpO.readline().rstrip()
         token_ids, segment_ids = tokenizer.encode(lineI,
                                                   lineO,
                                                   maxlen=maxlen)
         batch_token_ids.append(token_ids)
         batch_segment_ids.append(segment_ids)
         token_ids, segment_ids = tokenizer.encode(lineO,
                                                   lineI,
                                                   maxlen=maxlen)
         batch_token_ids.append(token_ids)
         batch_segment_ids.append(segment_ids)
         if len(batch_token_ids) == self.batch_size:
             batch_token_ids = sequence_padding(batch_token_ids)
             batch_segment_ids = sequence_padding(batch_segment_ids)
             yield [batch_token_ids, batch_segment_ids], None
             batch_token_ids, batch_segment_ids = [], []
     if batch_token_ids:
         batch_token_ids = sequence_padding(batch_token_ids)
         batch_segment_ids = sequence_padding(batch_segment_ids)
         yield [batch_token_ids, batch_segment_ids], None
     fpI.close()
     fpO.close()
Example no. 11
 def __iter__(self, random=False):
     batch_token_ids, batch_segment_ids, batch_output_ids = [], [], []
     for is_end, (text, label) in self.sample(random):
         if len(label) == 2:
             text = prefix + text
         token_ids, segment_ids = tokenizer.encode(text, maxlen=maxlen)
         if random:
             source_ids, target_ids = random_masking(token_ids)
         else:
             source_ids, target_ids = token_ids[:], token_ids[:]
         if len(label) == 2:
             label_ids = tokenizer.encode(label)[0][1:-1]
             for i, j in zip(mask_idxs, label_ids):
                 source_ids[i] = tokenizer._token_mask_id
                 target_ids[i] = j
         batch_token_ids.append(source_ids)
         batch_segment_ids.append(segment_ids)
         batch_output_ids.append(target_ids)
         if len(batch_token_ids) == self.batch_size or is_end:
             batch_token_ids = sequence_padding(batch_token_ids)
             batch_segment_ids = sequence_padding(batch_segment_ids)
             batch_output_ids = sequence_padding(batch_output_ids)
             yield [batch_token_ids, batch_segment_ids,
                    batch_output_ids], None
             batch_token_ids, batch_segment_ids, batch_output_ids = [], [], []
Example no. 12
    def __iter__(self, random=False):
        batch_token_ids, batch_segment_ids = [], []
        batch_output_ids, batch_labels = [], []
        for is_end, (question, equation, answer) in self.sample(random):
            token_ids, segment_ids = tokenizer.encode(
                question, equation, maxlen=maxlen
            )
            idx = token_ids.index(tokenizer._token_end_id) + 1

            masked_token_ids = random_masking(token_ids)

            source_labels, target_labels = masked_token_ids[:idx], token_ids[idx:]
            labels = source_labels + target_labels[1:]
            batch_token_ids.append(masked_token_ids)
            batch_segment_ids.append(segment_ids)
            batch_output_ids.append(token_ids)
            batch_labels.append(labels)

            if len(batch_token_ids) == self.batch_size or is_end:
                batch_token_ids = sequence_padding(batch_token_ids)
                batch_segment_ids = sequence_padding(batch_segment_ids)
                batch_output_ids = sequence_padding(batch_output_ids)
                batch_labels = sequence_padding(batch_labels)
                yield [
                    batch_token_ids, batch_segment_ids,
                    batch_output_ids, batch_labels
                ], None
                batch_token_ids, batch_segment_ids = [], []
                batch_output_ids, batch_labels = [], []
Example no. 13
 def __iter__(self, random=False):  # TODO: 'random' here means whether the original text should be randomly masked
     batch_token_ids, batch_segment_ids, batch_output_ids = [], [], []
     for is_end, (text, label) in self.sample(random):
         token_ids, segment_ids = tokenizer.encode(text, maxlen=maxlen)
         if label != 2:
             token_ids = token_ids[:1] + desc_ids + token_ids[1:]
             segment_ids = [0] * len(desc_ids) + segment_ids
         if random:  # not used for now
             source_ids, target_ids = random_masking(token_ids)
         else:
             source_ids, target_ids = token_ids[:], token_ids[:]
         # 0: neutral, 1: entailment, 2:contradiction
         if label == 0:
             source_ids[mask_idx] = tokenizer._token_mask_id
             target_ids[mask_idx] = neutral_id
         elif label == 1:
             source_ids[mask_idx] = tokenizer._token_mask_id
             target_ids[mask_idx] = pos_id
         elif label == 2:
             source_ids[mask_idx] = tokenizer._token_mask_id
             target_ids[mask_idx] = neg_id
         batch_token_ids.append(source_ids)
         batch_segment_ids.append(segment_ids)
         batch_output_ids.append(target_ids)
         if len(batch_token_ids) == self.batch_size or is_end:  # padding step
             batch_token_ids = sequence_padding(batch_token_ids)
             batch_segment_ids = sequence_padding(batch_segment_ids)
             batch_output_ids = sequence_padding(batch_output_ids)
             yield [batch_token_ids, batch_segment_ids,
                    batch_output_ids], None
             batch_token_ids, batch_segment_ids, batch_output_ids = [], [], []
Example no. 14
    def data_score(self, text_path):
        time_start = time.time()
        # accuracy on the test set
        _, _, _, test_data = json_data_process(text_path)
        y_pred = []
        y_true = []
        # for label, text in test_data:
        for text, label in test_data:
            # print(text)
            # print(self.label2index)
            # y_true.append(self.label2index[label])
            y_true.append(self.index2label[label])
            token_ids, segment_ids = self.tokenizer.encode(text,
                                                           maxlen=self.max_len)
            token_ids = sequence_padding([token_ids], length=self.max_len)
            segment_ids = sequence_padding([segment_ids], length=self.max_len)
            pre = self.model.predict([token_ids, segment_ids])
            # res = self.index2label.get(str(np.argmax(pre[0])))
            # token_ids = np.array([token_ids])
            # # predict
            # pred = self.model.predict(token_ids)
            pred = str(np.argmax(pre[0]))
            y_pred.append(self.index2label[pred])

        print("data pred ok!")
        # 评估
        target_names = [str(label) for label in self.labels]
        report_predict = classification_report(y_true,
                                               y_pred,
                                               target_names=target_names,
                                               digits=9)
        print(report_predict)
        print("耗时:" + str(time.time() - time_start))
Example no. 15
 def __iter__(self, random=False):
     batch_token_ids, batch_segment_ids, batch_output_ids = [], [], []
     for is_end, (text, label) in self.sample(random):
         token_ids, segment_ids = tokenizer.encode(text, maxlen=maxlen)
         if label != 2:
             token_ids = token_ids[:1] + desc_ids + token_ids[1:]
             segment_ids = [0] * len(desc_ids) + segment_ids
         if random:  # not used for now
             source_ids, target_ids = random_masking(token_ids)
         else:
             source_ids, target_ids = token_ids[:], token_ids[:]
         # if label == 0:  # negative sample
         #    source_ids[mask_idx] = tokenizer._token_mask_id
         #    target_ids[mask_idx] = neg_id
         # elif label == 1:  # positive sample
         #     source_ids[mask_idx] = tokenizer._token_mask_id
         #    target_ids[mask_idx] = pos_id
         ############################################################
         source_ids[mask_idx] = tokenizer._token_mask_id
         # print("label2tokenid_dict:,label2tokenid_dict,label:",label). e.g. {'like':like_id,'happiness':happiness_id,'sadness':sadness_id,'anger':anger_id,'disgust':disgust_id}
         target_id = label2tokenid_dict[label]  # label2tokenid_dict:
         target_ids[mask_idx] = target_id
         ############################################################
         batch_token_ids.append(source_ids)
         batch_segment_ids.append(segment_ids)
         batch_output_ids.append(target_ids)
         if len(batch_token_ids) == self.batch_size or is_end:  # padding step
             batch_token_ids = sequence_padding(batch_token_ids)
             batch_segment_ids = sequence_padding(batch_segment_ids)
             batch_output_ids = sequence_padding(batch_output_ids)
             yield [batch_token_ids, batch_segment_ids,
                    batch_output_ids], None
             batch_token_ids, batch_segment_ids, batch_output_ids = [], [], []
Example no. 16
 def __iter__(self, random=False):
     batch_token_ids, batch_segment_ids, batch_output_ids = [], [], []
     for is_end, (text, label) in self.sample(random):
         token_ids, segment_ids = tokenizer.encode(text, maxlen=maxlen)
         if label != 2:
             token_ids = token_ids[:1] + desc_ids + token_ids[1:]
             segment_ids = [0] * len(desc_ids) + segment_ids
         if random:
             source_ids, target_ids = random_masking(token_ids)
         else:
             source_ids, target_ids = token_ids[:], token_ids[:]
         if label == 0:
             source_ids[mask_idx] = tokenizer._token_mask_id
             target_ids[mask_idx] = neg_id
         elif label == 1:
             source_ids[mask_idx] = tokenizer._token_mask_id
             target_ids[mask_idx] = pos_id
         batch_token_ids.append(source_ids)
         batch_segment_ids.append(segment_ids)
         batch_output_ids.append(target_ids)
         if len(batch_token_ids) == self.batch_size or is_end:
             batch_token_ids = sequence_padding(batch_token_ids)
             batch_segment_ids = sequence_padding(batch_segment_ids)
             batch_output_ids = sequence_padding(batch_output_ids)
             yield [
                 batch_token_ids, batch_segment_ids, batch_output_ids
             ], None
             batch_token_ids, batch_segment_ids, batch_output_ids = [], [], []
Example no. 17
 def __iter__(self, random=False):
     """Single-sample format: [CLS]passage[SEP]question[SEP]answer[SEP]
     """
     idxs = list(range(len(self.data)))
     if random:
         np.random.shuffle(idxs)
     batch_token_ids, batch_segment_ids = [], []
     for i in idxs:
         D = self.data[i]
         question = ''.join(D['question_tokens'])
         question = re.sub(u' |、|;|,', ',', question)[:max_q_len]
         start = np.argwhere(D['answer_feature'] == 1)[0][0]
         end = np.argwhere(D['answer_feature'] == 1)[0][-1]
         answer = ''.join(D['passage_tokens'][start:end + 1])
         answer = re.sub(u' |、|;|,', ',', answer)[:max_a_len]
         passage = ''.join(D['passage_tokens'])
         passage = re.sub(u' |、|;|,', ',', passage)
         qa_token_ids, qa_segment_ids = tokenizer.encode(
             answer, question, max_length=max_qa_len + 1)
         p_token_ids, p_segment_ids = tokenizer.encode(passage,
                                                       max_length=max_p_len)
         token_ids = p_token_ids + qa_token_ids[1:]
         segment_ids = p_segment_ids + qa_segment_ids[1:]
         batch_token_ids.append(token_ids)
         batch_segment_ids.append(segment_ids)
         if len(batch_token_ids) == self.batch_size or i == idxs[-1]:
             batch_token_ids = sequence_padding(batch_token_ids)
             batch_segment_ids = sequence_padding(batch_segment_ids)
             yield [batch_token_ids, batch_segment_ids], None
             batch_token_ids, batch_segment_ids = [], []
Example no. 18
 def __iter__(self, random=False):
     batch_token_ids, batch_segment_ids = [], []
     for is_end, d in self.sample(random):
         text, synonyms = d['text'], d['synonyms']
         synonyms = [text] + synonyms
         np.random.shuffle(synonyms)
         text, synonym = synonyms[:2]
         text, synonym = truncate(text), truncate(synonym)
         self.some_samples.append(text)
         if len(self.some_samples) > 1000:
             self.some_samples.pop(0)
         # sentence a and sentence b are appended to the sequence in order
         token_ids, segment_ids = tokenizer.encode(text,
                                                   synonym,
                                                   max_length=maxlen * 2)
         batch_token_ids.append(token_ids)
         batch_segment_ids.append(segment_ids)
         # add both [CLS] SENT_a [SEP] SENT_b [SEP] and [CLS] SENT_b [SEP] SENT_a [SEP] to the training data
         token_ids, segment_ids = tokenizer.encode(synonym,
                                                   text,
                                                   max_length=maxlen * 2)
         batch_token_ids.append(token_ids)
         batch_segment_ids.append(segment_ids)
         if len(batch_token_ids) == self.batch_size or is_end:
             batch_token_ids = sequence_padding(batch_token_ids)
             batch_segment_ids = sequence_padding(batch_segment_ids)
             yield [batch_token_ids, batch_segment_ids], None
             batch_token_ids, batch_segment_ids = [], []
Example no. 19
 def __iter__(self, random=False):
     batch_token_ids, batch_segment_ids, batch_output_ids = [], [], []
     for is_end, (text, label) in self.sample(random):
         token_ids, segment_ids = tokenizer.encode(text, maxlen=maxlen)
         if label != 2:
             token_ids = token_ids[:1] + desc_ids + token_ids[1:]
             segment_ids = [0] * len(desc_ids) + segment_ids
         if random:  # not used for now
             source_ids, target_ids = random_masking(token_ids)
         else:
             source_ids, target_ids = token_ids[:], token_ids[:]
         source_ids[mask_idxs[0]] = tokenizer._token_mask_id  # fill position 1 with [MASK]
         source_ids[mask_idxs[1]] = tokenizer._token_mask_id  # fill position 2 with [MASK]
         targt_id_1 = label2tokenid_dict[label][0]  # e.g. 839, the id of "伤"
         targt_id_2 = label2tokenid_dict[label][1]  # e.g. 2552, the id of "心"
         target_ids[mask_idxs[0]] = targt_id_1  # correct label char for the first [MASK], e.g. "伤"
         target_ids[mask_idxs[1]] = targt_id_2  # correct label char for the second [MASK], e.g. "心"
         batch_token_ids.append(source_ids)
         batch_segment_ids.append(segment_ids)
         batch_output_ids.append(target_ids)
         if len(batch_token_ids) == self.batch_size or is_end:  # padding step
             batch_token_ids = sequence_padding(batch_token_ids)
             batch_segment_ids = sequence_padding(batch_segment_ids)
             batch_output_ids = sequence_padding(batch_output_ids)
             yield [batch_token_ids, batch_segment_ids,
                    batch_output_ids], None
             batch_token_ids, batch_segment_ids, batch_output_ids = [], [], []
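
label2tokenid_dict in the generator above is assumed to map each two-character label word to the vocab ids of its characters (e.g. "伤心" -> [839, 2552], as the comments indicate). A hypothetical way to build it with the same tokenizer; the label list here is purely illustrative:

label_words = ['伤心', '开心']  # illustrative label words
label2tokenid_dict = {
    w: [tokenizer.token_to_id(ch) for ch in w] for w in label_words
}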
Example no. 20
 def __iter__(self, random=False):
     idxs = list(range(len(self.data)))
     if random:
         np.random.shuffle(idxs)
     batch_token_ids, batch_segment_ids, batch_labels = [], [], []
     for i in idxs:
         token_ids, labels = [tokenizer._token_cls_id], [0]
         for w, l in self.data[i]:
             w_token_ids = tokenizer.encode(w)[0][1:-1]
             if len(token_ids) + len(w_token_ids) < maxlen:
                 token_ids += w_token_ids
                 if l == 'O':
                     labels += [0] * len(w_token_ids)
                 else:
                     B = class2id[l] * 2 + 1
                     I = class2id[l] * 2 + 2
                     labels += ([B] + [I] * (len(w_token_ids) - 1))
             else:
                 break
         token_ids += [tokenizer._token_sep_id]
         labels += [0]
         segment_ids = [0] * len(token_ids)
         batch_token_ids.append(token_ids)
         batch_segment_ids.append(segment_ids)
         batch_labels.append(labels)
         if len(batch_token_ids) == self.batch_size or i == idxs[-1]:
             batch_token_ids = sequence_padding(batch_token_ids)
             batch_segment_ids = sequence_padding(batch_segment_ids)
             batch_labels = sequence_padding(batch_labels)
             yield [batch_token_ids, batch_segment_ids], batch_labels
             batch_token_ids, batch_segment_ids, batch_labels = [], [], []
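
In the NER generator above each entity class l occupies a pair of label ids, B = class2id[l] * 2 + 1 and I = class2id[l] * 2 + 2 (label2id plays the same role in the later generators), with 0 reserved for 'O'. A small stand-alone sketch of the inverse mapping, with purely illustrative class names:

id2class = {0: 'PER', 1: 'LOC'}  # illustrative classes

def decode_label(idx):
    """Invert the B/I encoding used above (0 means 'O')."""
    if idx == 0:
        return 'O'
    prefix = 'B-' if idx % 2 == 1 else 'I-'
    return prefix + id2class[(idx - 1) // 2]

# decode_label(1) == 'B-PER', decode_label(2) == 'I-PER', decode_label(3) == 'B-LOC'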
Example no. 21
 def __iter__(self, random=False):
     batch_token_ids, batch_segment_ids, batch_labels = [], [], []
     # is_end indicates whether this is the last sample (1/True for the last one)
     for is_end, (text, arguments) in self.sample(random):
         '''Note: the tokenizer is not overridden, so 4000 is still present'''
         #token_ids
         token_ids, segment_ids = tokenizer.encode(
             text, max_length=maxlen)  # the longest text is actually 113; 115 counting [CLS]/[SEP]
         labels = [0] * len(token_ids)  # note: labels covers the leading [CLS] and the final token
         for argument in arguments.items():
             a_token_ids = tokenizer.encode(argument[0])[0][1:-1]  # the argument text (e.g. "雀巢") as token ids
             start_index = search(a_token_ids, token_ids)  # locate the argument inside the sentence
             if start_index != -1:  # if found
                 labels[start_index] = label2id[argument[1]] * 2 + 1
                 for i in range(1, len(a_token_ids)):
                     labels[start_index + i] = label2id[argument[1]] * 2 + 2
         # labels e.g. [0, 363, 364, 0, 0, 365, 366, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
         batch_token_ids.append(token_ids)
         batch_segment_ids.append(segment_ids)
         batch_labels.append(labels)
         if len(batch_token_ids) == self.batch_size or is_end:
             batch_token_ids = sequence_padding(batch_token_ids)
             batch_segment_ids = sequence_padding(batch_segment_ids)
             batch_labels = sequence_padding(batch_labels)
             yield [batch_token_ids, batch_segment_ids], batch_labels
             batch_token_ids, batch_segment_ids, batch_labels = [], [], []
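
search, used above and in the relation-extraction generators further down, is not defined in these excerpts. A plausible sketch of the subsequence lookup it is assumed to perform:

def search(pattern, sequence):
    """Return the first index at which `pattern` occurs as a sub-list of `sequence`, or -1."""
    n = len(pattern)
    for i in range(len(sequence)):
        if sequence[i:i + n] == pattern:
            return i
    return -1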
Example no. 22
    def __iter__(self, random=False):
        batch_token_ids, batch_segment_ids, batch_output_ids = [], [], []
        for is_end, (text, label) in self.sample(random):
            if len(label) == 2:  # the label is a two-character word
                text = prefix + text  # prepend the prefix to the text
            token_ids, segment_ids = tokenizer.encode(text, maxlen=maxlen)
            if random:
                source_ids, target_ids = random_masking(token_ids)
            else:
                source_ids, target_ids = token_ids[:], token_ids[:]
            if len(label) == 2:  # the label is a two-character word
                # label_ids e.g. [1093, 689]; tokenizer.encode(label) == ([101, 1093, 689, 102], [0, 0, 0, 0]), i.e. [CLS, 农, 业, SEP]
                label_ids = tokenizer.encode(label)[0][1:-1]
                for i, label_id_ in zip(mask_idxs, label_ids):
                    # i: index of the first/second [MASK]; label_id_: id of the matching label char, e.g. 1093 (农), 689 (业)
                    source_ids[i] = tokenizer._token_mask_id
                    target_ids[i] = label_id_

            batch_token_ids.append(source_ids)
            batch_segment_ids.append(segment_ids)
            batch_output_ids.append(target_ids)

            if len(batch_token_ids) == self.batch_size or is_end:  # pad and yield the batch
                batch_token_ids = sequence_padding(batch_token_ids)
                batch_segment_ids = sequence_padding(batch_segment_ids)
                batch_output_ids = sequence_padding(batch_output_ids)
                yield [batch_token_ids, batch_segment_ids,
                       batch_output_ids], None
                batch_token_ids, batch_segment_ids, batch_output_ids = [], [], []
Example no. 23
    def __iter__(self, random=False):
        """单条样本格式:[CLS]篇章[SEP]答案[SEP]问题[SEP]
        """
        idxs = list(range(len(self.data)))
        batch_token_ids, batch_segment_ids = [], []
        for i in idxs:
            text, question, answer = self.data[i]
            text_begin, text_end, _ = split_str(text, answer)
            text_cut_len = max(0, len(text) - 507 - 132)
            text_combine = delete_text(text, len(text_begin), len(text_end),
                                       text_cut_len)

            # Unused alternative: encode passage-begin / answer / passage-end separately
            # and concatenate the (truncated) pieces:
            # text_b_token_ids, _ = tokenizer.encode(text_begin, max_length=375)
            # text_e_token_ids, _ = tokenizer.encode(text_end, max_length=375)
            # answer_token_ids, _ = tokenizer.encode(answer, max_length=256)
            # token_ids = text_b_token_ids[:min(len(text_begin), 375)] + \
            #     answer_token_ids[:min(len(answer), 256)] + \
            #     text_e_token_ids[:min(len(text_end), 375)]

            text_token_ids, _ = tokenizer.encode(text_combine, max_length=375)
            question_token_ids, _ = tokenizer.encode(question, max_length=132)
            token_ids = text_token_ids + question_token_ids
            segment_ids = [0] * (len(token_ids) - len(question_token_ids[1:]))
            segment_ids += [1] * (len(question_token_ids[1:]))
            batch_token_ids.append(token_ids)
            batch_segment_ids.append(segment_ids)
            if len(batch_token_ids) == self.batch_size or i == idxs[-1]:
                batch_token_ids = sequence_padding(batch_token_ids)
                batch_segment_ids = sequence_padding(batch_segment_ids)
                yield [batch_token_ids, batch_segment_ids], None
                batch_token_ids, batch_segment_ids = [], []
Example no. 24
 def __iter__(self, random=False):
     batch_token_ids, batch_segment_ids, batch_labels = [], [], []
     for is_end, d in self.sample(random):
         tokens = self.tokenizer.tokenize(d[0], maxlen=self.maxlen)
         mapping = self.tokenizer.rematch(d[0], tokens)
         start_mapping = {j[0]: i for i, j in enumerate(mapping) if j}
         end_mapping = {j[-1]: i for i, j in enumerate(mapping) if j}
         token_ids = self.tokenizer.tokens_to_ids(tokens)
         segment_ids = [0] * len(token_ids)
         labels = np.zeros(len(token_ids))
         for start, end, label in d[1:]:
             if start in start_mapping and end in end_mapping:
                 start = start_mapping[start]
                 end = end_mapping[end]
                 labels[start] = self.categories.index(label) * 2 + 1
                 labels[start + 1:end +
                        1] = self.categories.index(label) * 2 + 2
         batch_token_ids.append(token_ids)
         batch_segment_ids.append(segment_ids)
         batch_labels.append(labels)
         if len(batch_token_ids) == self.batch_size or is_end:
             batch_token_ids = sequence_padding(batch_token_ids)
             batch_segment_ids = sequence_padding(batch_segment_ids)
             batch_labels = sequence_padding(batch_labels)
             yield [batch_token_ids, batch_segment_ids], batch_labels
             batch_token_ids, batch_segment_ids, batch_labels = [], [], []
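
tokenizer.rematch above returns, for each token, the list of character offsets it covers; start_mapping and end_mapping invert that so the character-level spans in d[1:] can be converted to token indices. A tiny illustration with a made-up mapping:

mapping = [[], [0], [1, 2], [3], []]  # [CLS], a 1-char token, a 2-char token, a 1-char token, [SEP]
start_mapping = {j[0]: i for i, j in enumerate(mapping) if j}
end_mapping = {j[-1]: i for i, j in enumerate(mapping) if j}
assert start_mapping[1] == 2 and end_mapping[2] == 2  # char span (1, 2) -> token index 2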
Example no. 25
 def predict(self, text):
     token_ids, segment_ids = self.tokenizer.encode(text)
     token_ids = sequence_padding([token_ids], length=self.max_len)
     segment_ids = sequence_padding([segment_ids], length=self.max_len)
     pre = self.model.predict([token_ids, segment_ids])
     res = self.index2label.get(str(np.argmax(pre[0])))
     return res
Example no. 26
 def __iter__(self, random=False):
     batch_token_ids, batch_segment_ids, batch_labels = [], [], []
     for is_end, item in self.sample(random):
         token_ids, labels = [tokenizer._token_start_id], [0]
         for w, l in item:
             w_token_ids = tokenizer.encode(w)[0][1:-1]
             if len(token_ids) + len(w_token_ids) < maxlen:
                 token_ids += w_token_ids
                 if l == 'O':
                     labels += [0] * len(w_token_ids)
                 else:
                     B = label2id[l] * 2 + 1
                     I = label2id[l] * 2 + 2
                     labels += ([B] + [I] * (len(w_token_ids) - 1))
             else:
                 break
         token_ids += [tokenizer._token_end_id]
         labels += [0]
         segment_ids = [0] * len(token_ids)
         batch_token_ids.append(token_ids)
         batch_segment_ids.append(segment_ids)
         batch_labels.append(labels)
         if len(batch_token_ids) == self.batch_size or is_end:
             batch_token_ids = sequence_padding(batch_token_ids)
             batch_segment_ids = sequence_padding(batch_segment_ids)
             batch_labels = sequence_padding(batch_labels)
             yield [batch_token_ids, batch_segment_ids], batch_labels
             batch_token_ids, batch_segment_ids, batch_labels = [], [], []
Example no. 27
 def __iter__(self, random=False):
     """单条样本格式:[CLS]篇章[SEP]答案[SEP]问题[SEP]。"""
     batch_token_ids, batch_segment_ids, batch_o_token_ids = [], [], []
     for is_end, (p, q, a) in self.sample(random):
         p_token_ids, _ = tokenizer.encode(p, maxlen=max_p_len)
         a_token_ids, _ = tokenizer.encode(a, maxlen=max_a_len)
         q_token_ids, _ = tokenizer.encode(q, maxlen=max_q_len)
         token_ids = p_token_ids + a_token_ids[1:] + q_token_ids[1:]
         segment_ids = [0] * (len(p_token_ids) + len(a_token_ids[1:]))
         segment_ids += [1] * (len(token_ids) - len(p_token_ids) -
                               len(a_token_ids[1:]))
         o_token_ids = token_ids
         if np.random.random() > 0.5:
             token_ids = [
                 t if s == 0 or (s == 1 and np.random.random() > 0.3) else
                 np.random.choice(token_ids)
                 for t, s in zip(token_ids, segment_ids)
             ]
         batch_token_ids.append(token_ids)
         batch_segment_ids.append(segment_ids)
         batch_o_token_ids.append(o_token_ids)
         if len(batch_token_ids) == self.batch_size or is_end:
             batch_token_ids = sequence_padding(batch_token_ids)
             batch_segment_ids = sequence_padding(batch_segment_ids)
             batch_o_token_ids = sequence_padding(batch_o_token_ids)
             yield [batch_token_ids, batch_segment_ids,
                    batch_o_token_ids], None
             batch_token_ids, batch_segment_ids, batch_o_token_ids = [], [], []
Example no. 28
    def __iter__(self, random=True):
        idxs = list(range(len(self.data)))
        if random:
            np.random.shuffle(idxs)
        batch_token_ids, batch_segment_ids, batch_labels = [], [], []
        for i in idxs:
            #print(self.data[i])
            _, _, text1, text2, label = self.data[i]
            #            text1 = reproduce_text(text1)
            #            text2 = reproduce_text(text2)

            token_ids, segment_ids = tokenizer.encode(text1,
                                                      text2,
                                                      max_length=maxlen)
            batch_token_ids.append(token_ids)
            batch_segment_ids.append(segment_ids)
            batch_labels.append([label])
            if len(batch_token_ids) == self.batch_size or i == idxs[-1]:
                batch_token_ids = sequence_padding(batch_token_ids)
                batch_segment_ids = sequence_padding(batch_segment_ids)

                #                transfer_flag = np.random.rand()
                #                if transfer_flag>0.8:
                #                    batch_token_ids, batch_segment_ids = batch_segment_ids, batch_token_ids

                batch_labels = sequence_padding(batch_labels)
                yield [batch_token_ids, batch_segment_ids], batch_labels
                batch_token_ids, batch_segment_ids, batch_labels = [], [], []
Example no. 29
    def __iter__(self, random=False):
        idxs = list(range(len(self.data)))
        if random:
            np.random.shuffle(idxs)
        batch_token_ids, batch_segment_ids = [], []
        batch_subject_labels, batch_subject_ids, batch_object_labels = [], [], []
        for i in idxs:
            d = self.data[i]
            token_ids, segment_ids = tokenizer.encode(d['text'],
                                                      max_length=maxlen)
            # organise the triples as {s: [(o, p)]}
            spoes = {}
            for s, p, o in d['spo_list']:
                s = tokenizer.encode(s)[0][1:-1]
                p = predicate2id[p]
                o = tokenizer.encode(o)[0][1:-1]
                s_idx = search(s, token_ids)
                o_idx = search(o, token_ids)
                if s_idx != -1 and o_idx != -1:
                    s = (s_idx, s_idx + len(s) - 1)
                    o = (o_idx, o_idx + len(o) - 1, p)  # [o_start, o_end, predicate]
                    if s not in spoes:
                        spoes[s] = []
                    spoes[s].append(o)
            if spoes:
                # subject labels
                #subject_labels = np.zeros((len(token_ids),2))
                subject_labels = np.zeros(
                    (len(token_ids), len(predicate2id), 2))  # shape: [seq_len, num_predicates, 2]
                for s in spoes:
                    for o_s, o_e, p in spoes[s]:
                        subject_labels[s[0], p, 0] = 1
                        subject_labels[s[1], p, 1] = 1
                #

                # build the batch
                batch_token_ids.append(token_ids)
                batch_segment_ids.append(segment_ids)
                batch_subject_labels.append(subject_labels)
                #batch_subject_ids.append(subject_ids)
                #batch_object_labels.append(object_labels)
                if len(batch_token_ids) == self.batch_size or i == idxs[-1]:
                    batch_token_ids = sequence_padding(batch_token_ids)
                    batch_segment_ids = sequence_padding(batch_segment_ids)
                    batch_subject_labels = sequence_padding(
                        batch_subject_labels,
                        #padding=np.zeros(2),
                        padding=np.zeros((len(predicate2id), 2)))
                    #batch_subject_ids = np.array(batch_subject_ids)
                    #batch_object_labels = sequence_padding(batch_object_labels, padding=np.zeros((len(predicate2id), 2)))
                    yield [
                        batch_token_ids,
                        batch_segment_ids,
                        batch_subject_labels,
                        #batch_subject_ids, batch_object_labels
                    ], None
                    batch_token_ids, batch_segment_ids = [], []
                    batch_subject_labels, batch_subject_ids, batch_object_labels = [], [], []
Example no. 30
 def __iter__(self, random=False):
     batch_token_ids, batch_segment_ids = [], []
     batch_subject_labels, batch_subject_ids, batch_object_labels = [], [], []
     for is_end, d in self.sample(random):
         token_ids, segment_ids = tokenizer.encode(d['text'],
                                                   max_length=maxlen)
         # organise the triples as {s: [(o, p)]}
         spoes = {}
         for s, p, o in d['spo_list']:
             s = tokenizer.encode(s)[0][1:-1]
             p = predicate2id[p]
             o = tokenizer.encode(o)[0][1:-1]
             s_idx = search(s, token_ids)
             o_idx = search(o, token_ids)
             if s_idx != -1 and o_idx != -1:
                 s = (s_idx, s_idx + len(s) - 1)
                 o = (o_idx, o_idx + len(o) - 1, p)
                 if s not in spoes:
                     spoes[s] = []
                 spoes[s].append(o)
         if spoes:
             # subject labels
             subject_labels = np.zeros((len(token_ids), 2))
             for s in spoes:
                 subject_labels[s[0], 0] = 1
                 subject_labels[s[1], 1] = 1
             # randomly pick one subject
             start, end = np.array(list(spoes.keys())).T
             start = np.random.choice(start)
             end = np.random.choice(end[end >= start])
             subject_ids = (start, end)
             # object labels for the sampled subject
             object_labels = np.zeros(
                 (len(token_ids), len(predicate2id), 2))
             for o in spoes.get(subject_ids, []):
                 object_labels[o[0], o[2], 0] = 1
                 object_labels[o[1], o[2], 1] = 1
             # build the batch
             batch_token_ids.append(token_ids)
             batch_segment_ids.append(segment_ids)
             batch_subject_labels.append(subject_labels)
             batch_subject_ids.append(subject_ids)
             batch_object_labels.append(object_labels)
             if len(batch_token_ids) == self.batch_size or is_end:
                 batch_token_ids = sequence_padding(batch_token_ids)
                 batch_segment_ids = sequence_padding(batch_segment_ids)
                 batch_subject_labels = sequence_padding(
                     batch_subject_labels, padding=np.zeros(2))
                 batch_subject_ids = np.array(batch_subject_ids)
                 batch_object_labels = sequence_padding(
                     batch_object_labels,
                     padding=np.zeros((len(predicate2id), 2)))
                 yield [
                     batch_token_ids, batch_segment_ids,
                     batch_subject_labels, batch_subject_ids,
                     batch_object_labels
                 ], None
                 batch_token_ids, batch_segment_ids = [], []
                 batch_subject_labels, batch_subject_ids, batch_object_labels = [], [], []
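
The subject sampling step above draws a random start and a random compatible end from all annotated subjects; a tiny stand-alone illustration with made-up spans (note that, exactly like the original code, it can pair the start of one subject with the end of another):

import numpy as np

spoes = {(3, 5): [(8, 9, 2)], (10, 12): [(1, 2, 7)]}  # made-up {subject span: [(o_start, o_end, predicate_id)]}
start, end = np.array(list(spoes.keys())).T           # start = [3, 10], end = [5, 12]
start = np.random.choice(start)                       # random subject start
end = np.random.choice(end[end >= start])             # random end that is not before that start
subject_ids = (start, end)
print(subject_ids)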