    def parse(self):
        def parse_file(inf):
            src_texts, src_turn, tgt_indexs, tgt_texts, eos_indexs, src_contexts = [], [], [], [], [], []
            with open(inf, encoding="utf8") as contexts:
                for line in contexts:
                    # Each line is tab-separated: one or more queries, then the target.
                    fields = line.strip().split("\t")
                    queries, target = fields[:-1], fields[-1]

                    # The context must not exceed the maximum length; drop the
                    # oldest queries until it fits (one slot is reserved for EOS).
                    while sum(len(q) for q in queries) > self.max_len-1:
                        queries.pop(0)

                    turns_count = len(queries)-1
                    turns, q_words, replaced_qs = [], [], []
                    for turn, query in enumerate(queries):
                        # Insert the EOS marker just before the last query.
                        if turn == turns_count:
                            eos_index = len(q_words)
                            q_words += [WORD[EOS]]
                            replaced_qs += [WORD[EOS]]
                            turns += [turn]

                        q_word, replaced_q = common.split_char(query)
                        assert len(q_word) == len(replaced_q)

                        replaced_qs += replaced_q
                        q_words += q_word
                        turns += [turn+1]*(len(q_word))

                    tgt_words, replaced_tgt = common.split_char(target)
                    assert len(tgt_words) == len(replaced_tgt)

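                    # Keep only target tokens that also appear in the source;
                    # the decoder presumably copies tokens from the source
                    # (pointer-style), so out-of-source tokens are dropped.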
                    new_tgt_words, replaced_new_tgt_words = [], []
                    for word, replaced_word in zip(tgt_words, replaced_tgt):
                        if word in q_words:
                            new_tgt_words.append(word)
                            replaced_new_tgt_words.append(replaced_word)

                    new_tgt_words = new_tgt_words + [WORD[EOS]]
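                    # Pointer labels: find_text_index presumably returns, for
                    # each target token (including the trailing EOS), the index
                    # of a matching token in q_words.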
                    t_index = common.find_text_index(q_words, new_tgt_words)

                    # Save this example's features.
                    # Step 1 - the replaced query, padded to max_len.
                    replaced_qs += [WORD[PAD]] * \
                        (self.max_len-len(replaced_qs))
                    turns += [PAD] * (self.max_len-len(turns))
                    assert len(replaced_qs) == len(turns)
                    src_texts.append(replaced_qs)
                    src_turn.append(turns)

                    eos_indexs.append(eos_index)

                    tgt_texts.append([WORD[BOS]] + replaced_new_tgt_words)

                    tgt_indexs.append(t_index)

                    src_contexts.append("".join(replaced_qs))

            return src_texts, src_turn, tgt_indexs, tgt_texts, eos_indexs, src_contexts

        src_texts, src_turn, tgt_indexs, tgt_texts, eos_indexs, src_context = parse_file(
            f"{DATAPATH}/data")
        print(
            f"Ignored word counts - {self.dict(src_texts, self.min_word_count)}")

        src_context = np.asarray(src_context)
        src_texts = np.asarray(common.texts2idx(src_texts, self.dict.word2idx))
        src_turn = np.asarray(src_turn)
        tgt_indexs = np.asarray(tgt_indexs)
        eos_indexs = np.asarray(eos_indexs)
        tgt_texts = np.asarray(common.texts2idx(tgt_texts, self.dict.word2idx))

        assert src_texts.shape == src_turn.shape
        assert tgt_indexs.shape == tgt_texts.shape

        index = np.arange(tgt_texts.shape[0])
        np.random.shuffle(index)

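        # Apply the same permutation to every array so rows stay aligned.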
        src_context = src_context[index]
        src_texts = src_texts[index]
        src_turn = src_turn[index]
        tgt_indexs = tgt_indexs[index]
        tgt_texts = tgt_texts[index]
        eos_indexs = eos_indexs[index]

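        # The first 2000 shuffled examples form the test split; the rest train.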
        self.src_context_train = src_context[2000:]
        self.src_texts_train = src_texts[2000:]
        self.src_turn_train = src_turn[2000:]
        self.tgt_indexs_train = tgt_indexs[2000:]
        self.tgt_texts_train = tgt_texts[2000:]
        self.eos_indexs_train = eos_indexs[2000:]

        self.src_context_test = src_context[:2000]
        self.src_texts_test = src_texts[:2000]
        self.src_turn_test = src_turn[:2000]
        self.tgt_indexs_test = tgt_indexs[:2000]
        self.tgt_texts_test = tgt_texts[:2000]
        self.eos_indexs_test = eos_indexs[:2000]
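
The `common.find_text_index` helper is not shown on this page. Below is a
minimal sketch of what it plausibly does, assuming it produces pointer labels
by mapping each target token to the index of a matching token in the source
sequence; the behavior is inferred from the call sites above, not confirmed:

def find_text_index(src_words, tgt_words):
    # Hypothetical: for each target token, return the position of a matching
    # token in src_words. The callers filter tgt_words beforehand, so every
    # token (and the appended EOS) is guaranteed to occur in src_words.
    return [src_words.index(word) for word in tgt_words]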
Example #2
    def parse(self):
        def parse_file(inf):
            src_texts, src_turn, tgt_indexs, tgt_texts = [], [], [], []
            with open(inf, encoding="utf8") as contexts:
                for line in contexts:
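                    # Each line holds three queries plus the target rewrite,
                    # separated by double tabs.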
                    query1, query2, query3, target = line.strip().split("\t\t")

                    q1_words = common.split_char(query1)
                    turn1 = [1] * (len(q1_words))
                    q2_words = common.split_char(query2)
                    turn2 = [2] * (len(q2_words) + 1)
                    q3_words = common.split_char(query3)
                    turn3 = [3] * (len(q3_words))

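                    # turn2 is one element longer than q2_words to cover the
                    # EOS token inserted between query2 and query3 below.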
                    q_words = q1_words + q2_words + [WORD[EOS]] + q3_words
                    turns = turn1 + turn2 + turn3
                    if len(q_words) > self.max_len:
                        continue

                    assert len(q_words) == len(turns)
                    src_texts.append(q_words)
                    src_turn.append(turns)

                    tgt_words = common.split_char(target)
                    new_tgt_words = []
                    for word in tgt_words:
                        if word in q_words:
                            new_tgt_words.append(word)

                    tgt_texts.append([WORD[BOS]] + new_tgt_words)

                    new_tgt_words = new_tgt_words + [WORD[EOS]]
                    t_index = common.find_text_index(q_words, new_tgt_words)

                    tgt_indexs.append(t_index)

                return src_texts, src_turn, tgt_indexs, tgt_texts

        src_texts, src_turn, tgt_indexs, tgt_texts = parse_file(
            f"{DATAPATH}/corpus")
        print(
            f"Ignored word counts - {self.dict(src_texts, self.min_word_count)}"
        )

        src_texts = np.asarray(common.texts2idx(src_texts, self.dict.word2idx))
        src_turn = np.asarray(src_turn)
        tgt_indexs = np.asarray(tgt_indexs)
        tgt_texts = np.asarray(common.texts2idx(tgt_texts, self.dict.word2idx))

        assert src_texts.shape == src_turn.shape
        assert tgt_indexs.shape == tgt_texts.shape

        index = np.arange(tgt_texts.shape[0])
        np.random.shuffle(index)
        src_texts = src_texts[index]
        src_turn = src_turn[index]
        tgt_indexs = tgt_indexs[index]
        tgt_texts = tgt_texts[index]

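        # The first 2000 shuffled examples form the test split; the rest train.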
        self.src_texts_train = src_texts[2000:]
        self.src_turn_train = src_turn[2000:]
        self.tgt_indexs_train = tgt_indexs[2000:]
        self.tgt_texts_train = tgt_texts[2000:]

        self.src_texts_test = src_texts[:2000]
        self.src_turn_test = src_turn[:2000]
        self.tgt_indexs_test = tgt_indexs[:2000]
        self.tgt_texts_test = tgt_texts[:2000]
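
A hypothetical usage sketch; the enclosing class, its name, and its
constructor arguments are not shown on this page, so the names below are
assumptions for illustration only:

# loader = Dataset(max_len=64, min_word_count=2)  # hypothetical class/args
# loader.parse()
# print(loader.src_texts_train.shape, loader.tgt_texts_train.shape)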