def parse(self):
    def parse_file(inf):
        src_texts, src_turn, tgt_indexs, tgt_texts, eos_indexs, src_contexts = [], [], [], [], [], []
        with open(inf, encoding="utf8") as f:
            for line in f:
                fields = line.strip().split("\t")
                querys, target = fields[:-1], fields[-1]
                # drop the oldest queries until the context fits the maximum length
                while sum(len(q) for q in querys) > self.max_len - 1:
                    querys.pop(0)
                turns_count = len(querys) - 1
                turns, q_words, replaced_qs = [], [], []
                for turn, query in enumerate(querys):
                    # mark the boundary before the last (current) query with EOS
                    if turn == turns_count:
                        eos_index = len(q_words)
                        q_words += [WORD[EOS]]
                        replaced_qs += [WORD[EOS]]
                        turns += [turn]
                    q_word, replaced_q = common.split_char(query)
                    assert len(q_word) == len(replaced_q)
                    replaced_qs += replaced_q
                    q_words += q_word
                    turns += [turn + 1] * len(q_word)
                tgt_words, replaced_tgt = common.split_char(target)
                assert len(tgt_words) == len(replaced_tgt)
                # keep only target words that can be copied from the source
                new_tgt_words, replaced_new_tgt_words = [], []
                for word, replaced_word in zip(tgt_words, replaced_tgt):
                    if word in q_words:
                        new_tgt_words.append(word)
                        replaced_new_tgt_words.append(replaced_word)
                new_tgt_words = new_tgt_words + [WORD[EOS]]
                t_index = common.find_text_index(q_words, new_tgt_words)
                # save the example
                # step 1 - the queries after replacement, padded to max_len
                replaced_qs += [WORD[PAD]] * (self.max_len - len(replaced_qs))
                turns += [PAD] * (self.max_len - len(turns))
                assert len(replaced_qs) == len(turns)
                src_texts.append(replaced_qs)
                src_turn.append(turns)
                eos_indexs.append(eos_index)
                tgt_texts.append([WORD[BOS]] + replaced_new_tgt_words)
                tgt_indexs.append(t_index)
                src_contexts.append("".join(replaced_qs))
        return src_texts, src_turn, tgt_indexs, tgt_texts, eos_indexs, src_contexts

    src_texts, src_turn, tgt_indexs, tgt_texts, eos_indexs, src_context = parse_file(
        f"{DATAPATH}/data")
    print(
        f"Ignored word counts - {self.dict(src_texts, self.min_word_count)}")
    src_context = np.asarray(src_context)
    src_texts = np.asarray(common.texts2idx(src_texts, self.dict.word2idx))
    src_turn = np.asarray(src_turn)
    tgt_indexs = np.asarray(tgt_indexs)
    eos_indexs = np.asarray(eos_indexs)
    tgt_texts = np.asarray(common.texts2idx(tgt_texts, self.dict.word2idx))
    assert src_texts.shape == src_turn.shape
    assert tgt_indexs.shape == tgt_texts.shape

    # shuffle, then hold out the first 2000 examples as the test split
    index = np.arange(tgt_texts.shape[0])
    np.random.shuffle(index)
    src_context = src_context[index]
    src_texts = src_texts[index]
    src_turn = src_turn[index]
    tgt_indexs = tgt_indexs[index]
    tgt_texts = tgt_texts[index]
    eos_indexs = eos_indexs[index]

    self.src_context_train = src_context[2000:]
    self.src_texts_train = src_texts[2000:]
    self.src_turn_train = src_turn[2000:]
    self.tgt_indexs_train = tgt_indexs[2000:]
    self.tgt_texts_train = tgt_texts[2000:]
    self.eos_indexs_train = eos_indexs[2000:]

    self.src_context_test = src_context[:2000]
    self.src_texts_test = src_texts[:2000]
    self.src_turn_test = src_turn[:2000]
    self.tgt_indexs_test = tgt_indexs[:2000]
    self.tgt_texts_test = tgt_texts[:2000]
    self.eos_indexs_test = eos_indexs[:2000]
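
# The `common` helpers are not defined in this file. Below is a minimal,
# illustrative sketch of what parse() above appears to assume about them;
# both bodies are hypothetical guesses, not the project's actual code.
def _split_char_sketch(text):
    # assumption: character-level tokenization returning the raw characters
    # plus a normalized ("replaced") copy of the same length
    chars = list(text)
    replaced = list(text)  # the real helper may normalize digits, letters, etc.
    return chars, replaced


def _find_text_index_sketch(src_words, tgt_words):
    # assumption: pointer-style supervision - for each target word, the index
    # of its first occurrence in the source sequence (every target word has
    # already been filtered to occur in the source)
    return [src_words.index(word) for word in tgt_words]
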
def parse(self):
    def parse_file(inf):
        src_texts, src_turn, tgt_indexs, tgt_texts = [], [], [], []
        with open(inf, encoding="utf8") as f:
            for line in f:
                query1, query2, query3, target = line.strip().split("\t\t")
                q1_words = common.split_char(query1)
                turn1 = [1] * len(q1_words)
                q2_words = common.split_char(query2)
                turn2 = [2] * (len(q2_words) + 1)  # +1 accounts for the EOS token
                q3_words = common.split_char(query3)
                turn3 = [3] * len(q3_words)
                # EOS separates the two-turn context from the current query
                q_words = q1_words + q2_words + [WORD[EOS]] + q3_words
                turns = turn1 + turn2 + turn3
                if len(q_words) > self.max_len:
                    continue
                assert len(q_words) == len(turns)
                src_texts.append(q_words)
                src_turn.append(turns)
                tgt_words = common.split_char(target)
                # keep only target words that can be copied from the source
                new_tgt_words = []
                for word in tgt_words:
                    if word in q_words:
                        new_tgt_words.append(word)
                tgt_texts.append([WORD[BOS]] + new_tgt_words)
                new_tgt_words = new_tgt_words + [WORD[EOS]]
                t_index = common.find_text_index(q_words, new_tgt_words)
                tgt_indexs.append(t_index)
        return src_texts, src_turn, tgt_indexs, tgt_texts

    src_texts, src_turn, tgt_indexs, tgt_texts = parse_file(
        f"{DATAPATH}/corpus")
    print(
        f"Ignored word counts - {self.dict(src_texts, self.min_word_count)}")
    src_texts = np.asarray(common.texts2idx(src_texts, self.dict.word2idx))
    src_turn = np.asarray(src_turn)
    tgt_indexs = np.asarray(tgt_indexs)
    tgt_texts = np.asarray(common.texts2idx(tgt_texts, self.dict.word2idx))
    assert src_texts.shape == src_turn.shape
    assert tgt_indexs.shape == tgt_texts.shape

    # shuffle, then hold out the first 2000 examples as the test split
    index = np.arange(tgt_texts.shape[0])
    np.random.shuffle(index)
    src_texts = src_texts[index]
    src_turn = src_turn[index]
    tgt_indexs = tgt_indexs[index]
    tgt_texts = tgt_texts[index]

    self.src_texts_train = src_texts[2000:]
    self.src_turn_train = src_turn[2000:]
    self.tgt_indexs_train = tgt_indexs[2000:]
    self.tgt_texts_train = tgt_texts[2000:]
    self.src_texts_test = src_texts[:2000]
    self.src_turn_test = src_turn[:2000]
    self.tgt_indexs_test = tgt_indexs[:2000]
    self.tgt_texts_test = tgt_texts[:2000]
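
# A self-contained toy walkthrough of the turn/index construction used by the
# three-query parse() above. The tokens and the "<eos>" literal are
# illustrative only - the real code uses WORD[EOS] and common.find_text_index.
if __name__ == "__main__":
    # toy context: queries "abc", "de", "af" with EOS inserted after the second
    q_words = list("abc") + list("de") + ["<eos>"] + list("af")
    turns = [1] * 3 + [2] * (2 + 1) + [3] * 2
    assert len(q_words) == len(turns)
    # target "adf": every character occurs in the context, EOS closes it
    tgt = list("adf") + ["<eos>"]
    # first-occurrence source index per target token, pointer-network style
    print([q_words.index(w) for w in tgt])  # -> [0, 3, 7, 5]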