import torch


def sent2tenosr(self, sentences):
    # Convert a three-turn dialogue context into model input tensors.
    # Assumes the module-level constants WORD, EOS, PAD, UNK and the helper
    # module `common` are provided by the surrounding package.
    assert isinstance(sentences, list) and len(sentences) == 3
    max_len = self.args.max_context_len
    query1, query2, query3 = sentences

    q1_words = common.split_char(query1)
    turn1 = [1] * len(q1_words)
    q2_words = common.split_char(query2)
    turn2 = [2] * (len(q2_words) + 1)  # +1 covers the EOS token inserted below
    q3_words = common.split_char(query3)
    turn3 = [3] * len(q3_words)

    words = q1_words + q2_words + [WORD[EOS]] + q3_words
    turns = turn1 + turn2 + turn3

    if len(words) > max_len:
        words = words[:max_len]
        turns = turns[:max_len]  # keep turn tags aligned with the truncated words

    idx = [self.dict[w] if w in self.dict else UNK for w in words]
    inp = torch.LongTensor(idx).unsqueeze(0)
    position = torch.LongTensor([
        pos_i + 1 if w_i != PAD else 0 for pos_i, w_i in enumerate(idx)
    ]).unsqueeze(0)
    turns = torch.LongTensor(turns).unsqueeze(0)

    self.word = words
    return inp, position, turns
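# A minimal, self-contained sketch (not from the original repo) of the
# word/turn alignment that sent2tenosr builds: hypothetical English inputs,
# a plain whitespace split standing in for common.split_char, and the
# literal "<eos>" standing in for WORD[EOS].
def _demo_turn_alignment():
    q1, q2, q3 = "how are you", "fine thanks", "and you"
    q1_w, q2_w, q3_w = q1.split(), q2.split(), q3.split()
    words = q1_w + q2_w + ["<eos>"] + q3_w
    # The "+ 1" on turn 2 tags the EOS separator as part of the second turn.
    turns = [1] * len(q1_w) + [2] * (len(q2_w) + 1) + [3] * len(q3_w)
    assert len(words) == len(turns)
    return list(zip(words, turns))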
def parse_file(self, inf):
    # Parse a tab-separated training file: each line holds several context
    # queries followed by the rewrite target in the last field.
    src_texts, src_turn, tgt_indexs, tgt_texts, eos_indexs, src_contexts = [], [], [], [], [], []
    with open(inf, encoding="utf8") as f:
        for line in f:
            fields = line.strip().split("\t")
            querys, target = fields[:-1], fields[-1]

            # Must not exceed the maximum length: drop the oldest queries first.
            while sum(len(q) for q in querys) > self.max_len - 1:
                querys.pop(0)

            turns_count = len(querys) - 1
            turns, q_words, replaced_qs = [], [], []
            for turn, query in enumerate(querys):
                # Insert EOS before the last query and record its index.
                if turn == turns_count:
                    eos_index = len(q_words)
                    q_words += [WORD[EOS]]
                    replaced_qs += [WORD[EOS]]
                    turns += [turn]
                q_word, replaced_q = common.split_char(query)
                assert len(q_word) == len(replaced_q)
                replaced_qs += replaced_q
                q_words += q_word
                turns += [turn + 1] * len(q_word)

            tgt_words, replaced_tgt = common.split_char(target)
            assert len(tgt_words) == len(replaced_tgt)

            # Keep only target words that also appear in the source context.
            new_tgt_words, replaced_new_tgt_words = [], []
            for word, replaced_word in zip(tgt_words, replaced_tgt):
                if word in q_words:
                    new_tgt_words.append(word)
                    replaced_new_tgt_words.append(replaced_word)
            new_tgt_words = new_tgt_words + [WORD[EOS]]
            t_index = common.find_text_index(q_words, new_tgt_words)

            # Save:
            # step 1 - the replaced queries, padded out to max_len
            replaced_qs += [WORD[PAD]] * (self.max_len - len(replaced_qs))
            turns += [PAD] * (self.max_len - len(turns))
            assert len(replaced_qs) == len(turns)

            src_texts.append(replaced_qs)
            src_turn.append(turns)
            eos_indexs.append(eos_index)
            tgt_texts.append([WORD[BOS]] + replaced_new_tgt_words)
            tgt_indexs.append(t_index)
            src_contexts.append("".join(replaced_qs))

    return src_texts, src_turn, tgt_indexs, tgt_texts, eos_indexs, src_contexts
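# Sketch of the input format the parse_file above expects (hypothetical file
# name and toy content): one dialogue per line, tab-separated, context
# queries first and the rewrite target as the last field. The real corpus is
# Chinese text, which common.split_char splits character by character.
def _write_demo_corpus(path="demo_corpus.tsv"):
    line = "\t".join(["query one", "query two", "query three", "target rewrite"])
    with open(path, "w", encoding="utf8") as f:
        f.write(line + "\n")
    return path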
import numpy as np


def preprocess(self, sentences):
    # Convert a variable-length dialogue context into padded index, position,
    # and turn arrays for inference.
    assert isinstance(sentences, list)

    # Must not exceed the maximum context length: drop the oldest queries first.
    while sum(len(q) for q in sentences) > self.max_context_len - 1:
        sentences.pop(0)

    q_words, replaced_qs, turns = [], [], []
    turns_count = len(sentences) - 1
    for turn, query in enumerate(sentences):
        # Insert EOS before the last query.
        if turn == turns_count:
            q_words += [WORD[EOS]]
            replaced_qs += [WORD[EOS]]
            turns += [turn]
        q_word, replaced_q = common.split_char(query)
        assert len(q_word) == len(replaced_q)
        replaced_qs += replaced_q
        q_words += q_word
        turns += [turn + 1] * len(q_word)

    assert len(q_words) == len(replaced_qs) == len(turns)

    idx = np.asarray(
        [[self.dict[w] if w in self.dict else UNK for w in replaced_qs]])
    position = np.asarray([[
        pos_i + 1 if w_i != PAD else 0 for pos_i, w_i in enumerate(idx[0])
    ]])
    turns = np.asarray([turns])

    self.word = q_words
    return idx, position, turns
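# Standalone sketch of the position rule used above: non-PAD ids get 1-based
# positions, PAD slots get 0 (assumes PAD is the padding id, e.g. 0, as in
# the module constants; preprocess itself never pads, so every slot is live).
import numpy as np

def _demo_positions(idx_row, pad_id=0):
    return np.asarray(
        [[i + 1 if w != pad_id else 0 for i, w in enumerate(idx_row)]])

# _demo_positions([5, 9, 0, 0]) -> array([[1, 2, 0, 0]])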
def parse_file(self, inf):
    # Parse a fixed three-turn training file: each line holds three queries
    # and the rewrite target, separated by double tabs.
    src_texts, src_turn, tgt_indexs, tgt_texts = [], [], [], []
    with open(inf, encoding="utf8") as f:
        for line in f:
            query1, query2, query3, target = line.strip().split("\t\t")

            q1_words = common.split_char(query1)
            turn1 = [1] * len(q1_words)
            q2_words = common.split_char(query2)
            turn2 = [2] * (len(q2_words) + 1)  # +1 covers the EOS token inserted below
            q3_words = common.split_char(query3)
            turn3 = [3] * len(q3_words)

            q_words = q1_words + q2_words + [WORD[EOS]] + q3_words
            turns = turn1 + turn2 + turn3

            # Skip examples longer than the maximum length.
            if len(q_words) > self.max_len:
                continue
            assert len(q_words) == len(turns)

            src_texts.append(q_words)
            src_turn.append(turns)

            # Keep only target words that also appear in the source context.
            tgt_words = common.split_char(target)
            new_tgt_words = [word for word in tgt_words if word in q_words]
            tgt_texts.append([WORD[BOS]] + new_tgt_words)

            new_tgt_words = new_tgt_words + [WORD[EOS]]
            t_index = common.find_text_index(q_words, new_tgt_words)
            tgt_indexs.append(t_index)

    return src_texts, src_turn, tgt_indexs, tgt_texts
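# Toy stand-in (not the repo's implementation) for common.find_text_index as
# used above: map each kept target word to its first position in the source
# sequence, yielding the pointer-style extraction labels; the repo's exact
# matching rules may differ.
def _demo_find_text_index(q_words, tgt_words):
    return [q_words.index(w) for w in tgt_words if w in q_words]

# _demo_find_text_index(["a", "b", "<eos>", "c"], ["c", "b", "<eos>"])
# -> [3, 1, 2]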