    def parse(self):
        def parse_file(inf):
            src_texts, src_turn, tgt_indexs, tgt_texts, eos_indexs, src_contexts = [], [], [], [], [], []
            with open(inf, encoding="utf8") as contexts:
                for line in contexts:
                    # Each line is tab-separated: one or more queries, then the target.
                    fields = line.strip().split("\t")
                    queries, target = fields[:-1], fields[-1]

                    # The context must not exceed the maximum length; drop the
                    # oldest queries until it fits (one slot is reserved for EOS).
                    while sum(len(q) for q in queries) > self.max_len-1:
                        queries.pop(0)

                    turns_count = len(queries)-1
                    turns, q_words, replaced_qs = [], [], []
                    for turn, query in enumerate(queries):
                        # Insert the EOS marker just before the last query.
                        if turn == turns_count:
                            eos_index = len(q_words)
                            q_words += [WORD[EOS]]
                            replaced_qs += [WORD[EOS]]
                            turns += [turn]

                        q_word, replaced_q = common.split_char(query)
                        assert len(q_word) == len(replaced_q)

                        replaced_qs += replaced_q
                        q_words += q_word
                        turns += [turn+1]*(len(q_word))

                    tgt_words, replaced_tgt = common.split_char(target)
                    assert len(tgt_words) == len(replaced_tgt)

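                    # Keep only target tokens that also appear in the source;
                    # the decoder presumably copies tokens from the source
                    # (pointer-style), so out-of-source tokens are dropped.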
                    new_tgt_words, replaced_new_tgt_words = [], []
                    for word, replaced_word in zip(tgt_words, replaced_tgt):
                        if word in q_words:
                            new_tgt_words.append(word)
                            replaced_new_tgt_words.append(replaced_word)

                    new_tgt_words = new_tgt_words + [WORD[EOS]]
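                    # Pointer labels: find_text_index presumably returns, for
                    # each target token (including the trailing EOS), the index
                    # of a matching token in q_words.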
                    t_index = common.find_text_index(q_words, new_tgt_words)

                    # Save this example's features.
                    # Step 1 - the replaced query, padded to max_len.
                    replaced_qs += [WORD[PAD]] * \
                        (self.max_len-len(replaced_qs))
                    turns += [PAD] * (self.max_len-len(turns))
                    assert len(replaced_qs) == len(turns)
                    src_texts.append(replaced_qs)
                    src_turn.append(turns)

                    eos_indexs.append(eos_index)

                    tgt_texts.append([WORD[BOS]] + replaced_new_tgt_words)

                    tgt_indexs.append(t_index)

                    src_contexts.append("".join(replaced_qs))

            return src_texts, src_turn, tgt_indexs, tgt_texts, eos_indexs, src_contexts

        src_texts, src_turn, tgt_indexs, tgt_texts, eos_indexs, src_context = parse_file(
            f"{DATAPATH}/data")
        print(
            f"Ignored word counts - {self.dict(src_texts, self.min_word_count)}")

        src_context = np.asarray(src_context)
        src_texts = np.asarray(common.texts2idx(src_texts, self.dict.word2idx))
        src_turn = np.asarray(src_turn)
        tgt_indexs = np.asarray(tgt_indexs)
        eos_indexs = np.asarray(eos_indexs)
        tgt_texts = np.asarray(common.texts2idx(tgt_texts, self.dict.word2idx))

        assert src_texts.shape == src_turn.shape
        assert tgt_indexs.shape == tgt_texts.shape

        index = np.arange(tgt_texts.shape[0])
        np.random.shuffle(index)

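        # Apply the same permutation to every array so rows stay aligned.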
        src_context = src_context[index]
        src_texts = src_texts[index]
        src_turn = src_turn[index]
        tgt_indexs = tgt_indexs[index]
        tgt_texts = tgt_texts[index]
        eos_indexs = eos_indexs[index]

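        # The first 2000 shuffled examples form the test split; the rest train.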
        self.src_context_train = src_context[2000:]
        self.src_texts_train = src_texts[2000:]
        self.src_turn_train = src_turn[2000:]
        self.tgt_indexs_train = tgt_indexs[2000:]
        self.tgt_texts_train = tgt_texts[2000:]
        self.eos_indexs_train = eos_indexs[2000:]

        self.src_context_test = src_context[:2000]
        self.src_texts_test = src_texts[:2000]
        self.src_turn_test = src_turn[:2000]
        self.tgt_indexs_test = tgt_indexs[:2000]
        self.tgt_texts_test = tgt_texts[:2000]
        self.eos_indexs_test = eos_indexs[:2000]
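
The `common.find_text_index` helper is not shown on this page. Below is a
minimal sketch of what it plausibly does, assuming it produces pointer labels
by mapping each target token to the index of a matching token in the source
sequence; the behavior is inferred from the call sites above, not confirmed:

def find_text_index(src_words, tgt_words):
    # Hypothetical: for each target token, return the position of a matching
    # token in src_words. The callers filter tgt_words beforehand, so every
    # token (and the appended EOS) is guaranteed to occur in src_words.
    return [src_words.index(word) for word in tgt_words]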
Example #2
    def parse(self):
        def parse_file(inf):
            src_texts, src_turn, tgt_indexs, tgt_texts = [], [], [], []
            with open(inf, encoding="utf8") as contexts:
                for line in contexts:
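                    # Each line holds three queries plus the target rewrite,
                    # separated by double tabs.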
                    query1, query2, query3, target = line.strip().split("\t\t")

                    q1_words = common.split_char(query1)
                    turn1 = [1] * (len(q1_words))
                    q2_words = common.split_char(query2)
                    turn2 = [2] * (len(q2_words) + 1)
                    q3_words = common.split_char(query3)
                    turn3 = [3] * (len(q3_words))

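                    # turn2 is one element longer than q2_words to cover the
                    # EOS token inserted between query2 and query3 below.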
                    q_words = q1_words + q2_words + [WORD[EOS]] + q3_words
                    turns = turn1 + turn2 + turn3
                    if len(q_words) > self.max_len:
                        continue

                    assert len(q_words) == len(turns)
                    src_texts.append(q_words)
                    src_turn.append(turns)

                    tgt_words = common.split_char(target)
                    new_tgt_words = []
                    for word in tgt_words:
                        if word in q_words:
                            new_tgt_words.append(word)

                    tgt_texts.append([WORD[BOS]] + new_tgt_words)

                    new_tgt_words = new_tgt_words + [WORD[EOS]]
                    t_index = common.find_text_index(q_words, new_tgt_words)

                    tgt_indexs.append(t_index)

                return src_texts, src_turn, tgt_indexs, tgt_texts

        src_texts, src_turn, tgt_indexs, tgt_texts = parse_file(
            f"{DATAPATH}/corpus")
        print(
            f"Ignored word counts - {self.dict(src_texts, self.min_word_count)}"
        )

        src_texts = np.asarray(common.texts2idx(src_texts, self.dict.word2idx))
        src_turn = np.asarray(src_turn)
        tgt_indexs = np.asarray(tgt_indexs)
        tgt_texts = np.asarray(common.texts2idx(tgt_texts, self.dict.word2idx))

        assert src_texts.shape == src_turn.shape
        assert tgt_indexs.shape == tgt_texts.shape

        index = np.arange(tgt_texts.shape[0])
        np.random.shuffle(index)
        src_texts = src_texts[index]
        src_turn = src_turn[index]
        tgt_indexs = tgt_indexs[index]
        tgt_texts = tgt_texts[index]

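        # The first 2000 shuffled examples form the test split; the rest train.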
        self.src_texts_train = src_texts[2000:]
        self.src_turn_train = src_turn[2000:]
        self.tgt_indexs_train = tgt_indexs[2000:]
        self.tgt_texts_train = tgt_texts[2000:]

        self.src_texts_test = src_texts[:2000]
        self.src_turn_test = src_turn[:2000]
        self.tgt_indexs_test = tgt_indexs[:2000]
        self.tgt_texts_test = tgt_texts[:2000]
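
A hypothetical usage sketch; the enclosing class, its name, and its
constructor arguments are not shown on this page, so the names below are
assumptions for illustration only:

# loader = Dataset(max_len=64, min_word_count=2)  # hypothetical class/args
# loader.parse()
# print(loader.src_texts_train.shape, loader.tgt_texts_train.shape)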