コード例 #1
0
class Conversation:
    """Build, split, encode and persist two-turn dialogue pairs.

    The pipeline is: collect raw dialogues (``Dialogue_Info``), cut each
    dialogue into consecutive ``[utterance, reply]`` pairs, split the pairs
    into train/valid/test sets, encode them with ``Vocabulary`` and dump
    everything as JSON files into an experiment directory.

    NOTE(review): ``Dialogue_Info``, ``Vocabulary`` and the module-level
    ``max_len`` are defined outside this class; their exact contracts are
    assumed from how they are called here.
    """

    def __init__(self):
        self.dial_info = Dialogue_Info()
        self.vocab = Vocabulary()

        # Raw pairs: the full collection and its three disjoint splits.
        self.dial_pair = []
        self.train_dial_pair = []
        self.valid_dial_pair = []
        self.test_dial_pair = []

        # Encoded counterparts of the three splits.
        self.encoded_train_dial_pair = []
        self.encoded_valid_dial_pair = []
        self.encoded_test_dial_pair = []

    def split_data_set(self, train_ratio=0.8, valid_ratio=0.1, test_ratio=0.1):
        """Randomly split ``self.dial_pair`` into train/valid/test sets.

        Validation and test pairs are drawn at random without replacement;
        the training set is taken, in original order, from the pairs that
        were not held out, capped at ``int(len(dial_pair) * train_ratio)``.
        The three resulting sets never share a pair position.

        Args:
            train_ratio: Fraction of all pairs used for training.
            valid_ratio: Fraction of all pairs used for validation.
            test_ratio: Fraction of all pairs used for testing.

        Any previous split stored on the instance is replaced.
        """
        total = len(self.dial_pair)

        # Clamp so that ratios summing to more than 1 (or negative ratios)
        # cannot request more held-out items than exist.
        train_size = max(0, int(total * train_ratio))
        valid_size = min(max(0, int(total * valid_ratio)), total)
        test_size = min(max(0, int(total * test_ratio)), total - valid_size)

        # Draw all held-out *indices* in one go: sampling without replacement
        # guarantees the validation and test sets are duplicate-free and
        # disjoint from each other.
        held_out = random.sample(range(total), valid_size + test_size)
        held_out_lookup = set(held_out)

        self.valid_dial_pair = [
            self.dial_pair[i] for i in held_out[:valid_size]
        ]
        self.test_dial_pair = [
            self.dial_pair[i] for i in held_out[valid_size:]
        ]

        # Everything that was not held out is eligible for training.
        remaining = [
            pair for i, pair in enumerate(self.dial_pair)
            if i not in held_out_lookup
        ]
        self.train_dial_pair = remaining[:train_size]

    def build_dialogue_pair(self):
        """Append consecutive ``[utterance, reply]`` pairs to ``dial_pair``.

        Each entry of ``self.dial_info.dial_info`` is treated as an ordered
        sequence of turns; turns are paired as (0, 1), (2, 3), ... and a
        trailing unpaired turn is dropped.
        """
        for turns in self.dial_info.dial_info:
            # Stopping at len - 1 skips the last turn of odd-length dialogues.
            for start in range(0, len(turns) - 1, 2):
                self.dial_pair.append([turns[start], turns[start + 1]])

    def _encode_pairs(self, pairs):
        """Return the encoded form of *pairs*, dropping out-of-range ones.

        A pair is kept only if both encoded sentences are longer than one
        token and shorter than the module-level ``max_len``.
        """
        encoded_pairs = []
        for pair in pairs:
            encoded = [
                self.vocab.encode_sent(pair[0]),
                self.vocab.encode_sent(pair[1]),
            ]
            if all(1 < len(sent) < max_len for sent in encoded):
                encoded_pairs.append(encoded)
        return encoded_pairs

    def encode_dialogue_pair(self):
        """Encode the train/valid/test splits with ``self.vocab``.

        The encoded lists are rebuilt from scratch on every call, so calling
        this method repeatedly does not accumulate duplicates.
        """
        self.encoded_train_dial_pair = self._encode_pairs(self.train_dial_pair)
        self.encoded_valid_dial_pair = self._encode_pairs(self.valid_dial_pair)
        self.encoded_test_dial_pair = self._encode_pairs(self.test_dial_pair)

    @staticmethod
    def _dump_json(exp_data_dir, file_name, data):
        """Write *data* as JSON to ``exp_data_dir/file_name``."""
        with open(os.path.join(exp_data_dir, file_name), 'w',
                  encoding='utf-8') as f:
            json.dump(data, f)

    @staticmethod
    def _load_json(exp_data_dir, file_name):
        """Read and return the JSON content of ``exp_data_dir/file_name``."""
        with open(os.path.join(exp_data_dir, file_name), 'r',
                  encoding='utf-8') as f:
            return json.load(f)

    def save_dialogue_pair(self, exp_data_dir):
        """Dump all raw and encoded pair lists as JSON into *exp_data_dir*.

        Args:
            exp_data_dir: Existing directory that receives the seven files.
        """
        outputs = (
            ('dialogue_pair.json', self.dial_pair),
            ('train_dialogue_pair.json', self.train_dial_pair),
            ('valid_dialogue_pair.json', self.valid_dial_pair),
            ('test_dialogue_pair.json', self.test_dial_pair),
            ('encoded_train_dialogue_pair.json',
             self.encoded_train_dial_pair),
            ('encoded_valid_dialogue_pair.json',
             self.encoded_valid_dial_pair),
            ('encoded_test_dialogue_pair.json',
             self.encoded_test_dial_pair),
        )
        for file_name, data in outputs:
            self._dump_json(exp_data_dir, file_name, data)

    def load_dialogue_pair(self, exp_data_dir):
        """Load the raw train/valid/test splits saved in *exp_data_dir*.

        ``dialogue_pair.json`` is loaded as well when it exists, so that a
        later ``save_dialogue_pair`` does not overwrite it with an empty
        list; its absence is not an error.

        Raises:
            FileNotFoundError: If one of the three split files is missing.
        """
        self.train_dial_pair = self._load_json(
            exp_data_dir, 'train_dialogue_pair.json')
        self.valid_dial_pair = self._load_json(
            exp_data_dir, 'valid_dialogue_pair.json')
        self.test_dial_pair = self._load_json(
            exp_data_dir, 'test_dialogue_pair.json')

        try:
            self.dial_pair = self._load_json(
                exp_data_dir, 'dialogue_pair.json')
        except FileNotFoundError:
            # Older experiment directories may lack the full pair file.
            pass

    def create_conversation(self, raw_data_dir, exp_data_dir, threshold=5):
        """Run the full pipeline from raw data to saved, encoded pairs.

        Args:
            raw_data_dir: Directory handed to
                ``Dialogue_Info.build_dialogue_info``.
            exp_data_dir: Directory that receives every generated file.
            threshold: Value forwarded to ``Vocabulary.build_vocab``
                (presumably a minimum word frequency; confirm against
                ``Vocabulary``).
        """
        self.dial_info.build_dialogue_info(raw_data_dir)
        self.dial_info.save_dialogue_info(exp_data_dir)

        # The pairs must exist and be split *before* the vocabulary is
        # built, otherwise build_vocab receives an empty training set.
        self.build_dialogue_pair()
        self.split_data_set()

        self.vocab.build_vocab(threshold, self.train_dial_pair)
        self.vocab.save_vocab(exp_data_dir)

        self.encode_dialogue_pair()
        self.save_dialogue_pair(exp_data_dir)

    def re_encode_dialogue_pair(self, exp_data_dir):
        """Re-encode previously saved splits with the saved vocabulary.

        Loads ``vocabulary.json`` and the raw splits from *exp_data_dir*,
        encodes them again and writes all pair files back.
        """
        vocab_path = os.path.join(exp_data_dir, 'vocabulary.json')
        self.vocab.load_vocab(vocab_path)
        self.load_dialogue_pair(exp_data_dir)
        self.encode_dialogue_pair()
        self.save_dialogue_pair(exp_data_dir)