Example #1
0
    def sent2tenosr(self, sentences):
        """Convert a three-utterance context into model input tensors.

        Each utterance is tokenized with ``common.split_char``. The tokens are
        concatenated in order, with ``WORD[EOS]`` inserted between the second
        and the third utterance. The sequence is cut to at most
        ``self.args.max_context_len`` tokens.

        Args:
            sentences: A list of exactly three utterances.

        Returns:
            A tuple ``(inp, position, turns)`` of ``torch.LongTensor`` objects,
            each of shape ``(1, seq_len)``:

            * ``inp``: vocabulary ids from ``self.dict`` (``UNK`` for tokens
              that are not in it).
            * ``position``: 1-based positions, ``0`` where the id equals
              ``PAD``.
            * ``turns``: the turn number (1, 2 or 3) of every token; the EOS
              separator is counted as part of turn 2.

        Side effects:
            Stores the (possibly truncated) token list on ``self.word``.
        """
        assert isinstance(sentences, list) and len(sentences) == 3

        max_len = self.args.max_context_len
        query1, query2, query3 = sentences
        q1_words = common.split_char(query1)
        q2_words = common.split_char(query2)
        q3_words = common.split_char(query3)

        words = q1_words + q2_words + [WORD[EOS]] + q3_words
        # The "+ 1" accounts for the EOS separator, which belongs to turn 2.
        turns = (
            [1] * len(q1_words)
            + [2] * (len(q2_words) + 1)
            + [3] * len(q3_words)
        )

        # Truncate tokens AND turn ids together: cutting only the tokens would
        # return tensors whose sequence lengths disagree.
        words = words[:max_len]
        turns = turns[:max_len]

        idx = [self.dict[w] if w in self.dict else UNK for w in words]

        inp = torch.LongTensor(idx).unsqueeze(0)
        position = torch.LongTensor([
            pos_i + 1 if w_i != PAD else 0 for pos_i, w_i in enumerate(idx)
        ]).unsqueeze(0)
        turns = torch.LongTensor(turns).unsqueeze(0)

        self.word = words

        return inp, position, turns
        def parse_file(inf):
            """Parse a tab-separated dialogue file into parallel sample lists.

            Every line holds one or more query utterances followed by a target
            utterance, separated by single tabs. The oldest queries are dropped
            until the summed character length of the remaining ones is at most
            ``self.max_len - 1``. Lines that end up with no query at all (no
            tab on the line, or every query dropped) are skipped, because no
            EOS position can be defined for them.

            Args:
                inf: Path of the UTF-8 encoded input file.

            Returns:
                A tuple of six lists with one entry per kept line:

                * ``src_texts``: replaced source tokens (EOS inserted before
                  the last query), padded with ``WORD[PAD]`` up to
                  ``self.max_len``.
                * ``src_turn``: turn id of every source token, padded with
                  ``PAD`` up to ``self.max_len``.
                * ``tgt_indexs``: result of ``common.find_text_index`` for the
                  filtered target tokens plus EOS within the source tokens.
                * ``tgt_texts``: ``WORD[BOS]`` followed by the replaced target
                  tokens whose original form occurs in the source.
                * ``eos_indexs``: index of the EOS token in the source tokens.
                * ``src_contexts``: the padded replaced source tokens joined
                  into one string.
            """
            src_texts, src_turn, tgt_indexs = [], [], []
            tgt_texts, eos_indexs, src_contexts = [], [], []
            with open(inf, encoding="utf8") as lines:
                for line in lines:
                    fields = line.strip().split("\t")
                    querys, target = fields[:-1], fields[-1]

                    # Must not be longer than the maximum length: drop the
                    # oldest queries first (one slot is reserved for EOS).
                    while querys and sum(len(q) for q in querys) > self.max_len - 1:
                        querys.pop(0)

                    if not querys:
                        # Without any query the EOS branch below never runs, so
                        # eos_index would be undefined (or stale from the
                        # previous line). Skip the line instead.
                        continue

                    turns_count = len(querys) - 1
                    turns, q_words, replaced_qs = [], [], []
                    for turn, query in enumerate(querys):
                        # Last query: insert EOS right before it. The EOS token
                        # receives the turn id of the preceding query.
                        if turn == turns_count:
                            eos_index = len(q_words)
                            q_words += [WORD[EOS]]
                            replaced_qs += [WORD[EOS]]
                            turns += [turn]

                        q_word, replaced_q = common.split_char(query)
                        assert len(q_word) == len(replaced_q)

                        replaced_qs += replaced_q
                        q_words += q_word
                        turns += [turn + 1] * len(q_word)

                    tgt_words, replaced_tgt = common.split_char(target)
                    assert len(tgt_words) == len(replaced_tgt)

                    # Keep only the target tokens that also occur in the source.
                    new_tgt_words, replaced_new_tgt_words = [], []
                    for word, replaced_word in zip(tgt_words, replaced_tgt):
                        if word in q_words:
                            new_tgt_words.append(word)
                            replaced_new_tgt_words.append(replaced_word)

                    new_tgt_words = new_tgt_words + [WORD[EOS]]
                    t_index = common.find_text_index(q_words, new_tgt_words)

                    # Save.
                    # Step 1 - the replaced queries, padded to max_len.
                    replaced_qs += [WORD[PAD]] * (self.max_len - len(replaced_qs))
                    turns += [PAD] * (self.max_len - len(turns))
                    assert len(replaced_qs) == len(turns)
                    src_texts.append(replaced_qs)
                    src_turn.append(turns)

                    eos_indexs.append(eos_index)

                    tgt_texts.append([WORD[BOS]] + replaced_new_tgt_words)

                    tgt_indexs.append(t_index)

                    src_contexts.append("".join(replaced_qs))

            return src_texts, src_turn, tgt_indexs, tgt_texts, eos_indexs, src_contexts
Example #3
0
    def preprocess(self, sentences):
        """Turn a list of utterances into id, position and turn arrays.

        The oldest utterances are removed from ``sentences`` IN PLACE until the
        summed character length of the remaining ones is at most
        ``self.max_context_len - 1``. ``WORD[EOS]`` is inserted right before
        the tokens of the last utterance and receives the turn id of the
        preceding utterance.

        Args:
            sentences: A list of utterances; it is trimmed in place.

        Returns:
            A tuple ``(idx, position, turns)`` of NumPy arrays, each with a
            single row:

            * ``idx``: ids of the replaced tokens looked up in ``self.dict``
              (``UNK`` for tokens that are not in it).
            * ``position``: 1-based positions, ``0`` where the id equals
              ``PAD``.
            * ``turns``: turn id of every token.

        Side effects:
            Stores the original (non-replaced) token list on ``self.word``.
        """
        assert isinstance(sentences, list)

        # Drop history from the front until the character budget is respected.
        budget = self.max_context_len - 1
        while sum(map(len, sentences)) > budget:
            sentences.pop(0)

        raw_tokens, masked_tokens, turn_ids = [], [], []

        last_turn = len(sentences) - 1
        for turn_no, utterance in enumerate(sentences):
            if turn_no == last_turn:
                # Separator in front of the final utterance.
                raw_tokens.append(WORD[EOS])
                masked_tokens.append(WORD[EOS])
                turn_ids.append(turn_no)

            pieces, masked_pieces = common.split_char(utterance)
            assert len(pieces) == len(masked_pieces)

            masked_tokens.extend(masked_pieces)
            raw_tokens.extend(pieces)
            turn_ids.extend([turn_no + 1] * len(pieces))

        assert len(raw_tokens) == len(masked_tokens) == len(turn_ids)

        # Vocabulary lookup with UNK as the fallback id.
        token_ids = []
        for token in masked_tokens:
            if token in self.dict:
                token_ids.append(self.dict[token])
            else:
                token_ids.append(UNK)
        idx = np.asarray([token_ids])

        position_row = []
        for rank, token_id in enumerate(idx[0]):
            if token_id != PAD:
                position_row.append(rank + 1)
            else:
                position_row.append(0)
        position = np.asarray([position_row])

        turns = np.asarray([turn_ids])

        self.word = raw_tokens

        return idx, position, turns
Example #4
0
        def parse_file(inf):
            """Parse a three-turn dialogue file into parallel sample lists.

            Every line must hold exactly four fields separated by double tabs:
            three query utterances and a target utterance. The queries are
            tokenized with ``common.split_char`` and concatenated, with
            ``WORD[EOS]`` inserted between the second and the third query.
            Lines whose token sequence is longer than ``self.max_len`` are
            skipped.

            Args:
                inf: Path of the UTF-8 encoded input file.

            Returns:
                A tuple of four lists with one entry per kept line:

                * source tokens,
                * turn id (1, 2 or 3) of every source token; the EOS separator
                  is counted as part of turn 2,
                * result of ``common.find_text_index`` for the filtered target
                  tokens plus EOS within the source tokens,
                * ``WORD[BOS]`` followed by the target tokens that also occur
                  in the source.
            """
            sources, turn_rows, target_indexes, targets = [], [], [], []
            with open(inf, encoding="utf8") as handle:
                for raw_line in handle:
                    first, second, third, answer = raw_line.strip().split("\t\t")

                    first_tokens = common.split_char(first)
                    second_tokens = common.split_char(second)
                    third_tokens = common.split_char(third)

                    context_tokens = (
                        first_tokens + second_tokens + [WORD[EOS]] + third_tokens
                    )
                    if len(context_tokens) > self.max_len:
                        # Too long for the model: drop the whole sample.
                        continue

                    # The "+ 1" covers the EOS separator appended to turn 2.
                    turn_ids = (
                        [1] * len(first_tokens)
                        + [2] * (len(second_tokens) + 1)
                        + [3] * len(third_tokens)
                    )
                    assert len(context_tokens) == len(turn_ids)
                    sources.append(context_tokens)
                    turn_rows.append(turn_ids)

                    # Keep only the target tokens that occur in the context.
                    kept_tokens = [
                        token
                        for token in common.split_char(answer)
                        if token in context_tokens
                    ]
                    targets.append([WORD[BOS]] + kept_tokens)

                    target_indexes.append(
                        common.find_text_index(
                            context_tokens, kept_tokens + [WORD[EOS]]
                        )
                    )

            return sources, turn_rows, target_indexes, targets