class Conversation:
    """Build, split, encode and persist (utterance, response) dialogue pairs.

    Attributes:
        dial_info: ``Dialogue_Info`` instance; its ``dial_info`` attribute is
            iterated as a collection of dialogues (sequences of utterances).
        vocab: ``Vocabulary`` instance used to encode sentences.
        dial_pair: every ``[utterance, response]`` pair that was built.
        train_dial_pair / valid_dial_pair / test_dial_pair: raw splits.
        encoded_*_dial_pair: the splits after encoding and length filtering.
    """

    def __init__(self):
        self.dial_info = Dialogue_Info()
        self.vocab = Vocabulary()
        self.dial_pair = []
        self.train_dial_pair = []
        self.valid_dial_pair = []
        self.test_dial_pair = []
        self.encoded_train_dial_pair = []
        self.encoded_valid_dial_pair = []
        self.encoded_test_dial_pair = []

    def split_data_set(self, train_ratio=0.8, valid_ratio=0.1, test_ratio=0.1):
        """Split ``self.dial_pair`` into disjoint train/valid/test sets.

        Validation and test pairs are drawn at random without replacement.
        Training pairs are the remaining ones, kept in their original order
        and capped at ``int(len(dial_pair) * train_ratio)`` items.

        The three split lists are rebuilt from scratch on every call.

        Args:
            train_ratio: fraction of all pairs used as the training cap.
            valid_ratio: fraction of all pairs placed in the validation set.
            test_ratio: fraction of all pairs placed in the test set.
        """
        total = len(self.dial_pair)
        # Clamp the sizes so out-of-range ratios can never over-draw.
        train_size = max(0, int(total * train_ratio))
        valid_size = min(max(0, int(total * valid_ratio)), total)
        test_size = min(max(0, int(total * test_ratio)), total - valid_size)

        # Work on *indices*: the pairs themselves may repeat, and comparing
        # an index against a list of pairs (as the old code did) never
        # matches, which let the splits overlap.
        shuffled = random.sample(range(total), total)
        valid_indices = shuffled[:valid_size]
        test_indices = shuffled[valid_size:valid_size + test_size]
        held_out = set(valid_indices) | set(test_indices)

        self.valid_dial_pair = [self.dial_pair[i] for i in valid_indices]
        self.test_dial_pair = [self.dial_pair[i] for i in test_indices]
        remaining = [
            pair for i, pair in enumerate(self.dial_pair)
            if i not in held_out
        ]
        self.train_dial_pair = remaining[:train_size]

    def build_dialogue_pair(self):
        """Append consecutive ``[utterance, response]`` pairs to ``dial_pair``.

        Each dialogue in ``self.dial_info.dial_info`` is cut into
        non-overlapping pairs (0, 1), (2, 3), ...; a trailing unpaired
        utterance in an odd-length dialogue is dropped.
        """
        for dialogue in self.dial_info.dial_info:
            # Stopping at len - 1 skips a dangling last utterance.
            for start in range(0, len(dialogue) - 1, 2):
                self.dial_pair.append([dialogue[start], dialogue[start + 1]])

    def _encode_pairs(self, pairs, limit):
        """Encode *pairs*, keeping those with both lengths in (1, limit)."""
        encoded_pairs = []
        for source_sent, target_sent in pairs:
            encoded = [
                self.vocab.encode_sent(source_sent),
                self.vocab.encode_sent(target_sent),
            ]
            if all(1 < len(sent) < limit for sent in encoded):
                encoded_pairs.append(encoded)
        return encoded_pairs

    def encode_dialogue_pair(self, max_length=None):
        """Encode the train/valid/test splits with ``self.vocab``.

        A pair is kept only when both encoded sentences are longer than one
        token and shorter than the length limit. The encoded lists are
        rebuilt on every call, so re-encoding does not duplicate entries.

        Args:
            max_length: exclusive upper bound on encoded sentence length.
                When ``None`` the module-level ``max_len`` is used.
        """
        limit = max_len if max_length is None else max_length
        self.encoded_train_dial_pair = self._encode_pairs(
            self.train_dial_pair, limit)
        self.encoded_valid_dial_pair = self._encode_pairs(
            self.valid_dial_pair, limit)
        self.encoded_test_dial_pair = self._encode_pairs(
            self.test_dial_pair, limit)

    def save_dialogue_pair(self, exp_data_dir):
        """Write all raw and encoded pair lists as JSON into *exp_data_dir*.

        Args:
            exp_data_dir: directory that receives the seven JSON files.
        """
        outputs = (
            ('dialogue_pair.json', self.dial_pair),
            ('train_dialogue_pair.json', self.train_dial_pair),
            ('valid_dialogue_pair.json', self.valid_dial_pair),
            ('test_dialogue_pair.json', self.test_dial_pair),
            ('encoded_train_dialogue_pair.json',
             self.encoded_train_dial_pair),
            ('encoded_valid_dialogue_pair.json',
             self.encoded_valid_dial_pair),
            ('encoded_test_dialogue_pair.json',
             self.encoded_test_dial_pair),
        )
        for file_name, pairs in outputs:
            with open(os.path.join(exp_data_dir, file_name), 'w') as f:
                json.dump(pairs, f)

    def load_dialogue_pair(self, exp_data_dir):
        """Load the raw train/valid/test splits from *exp_data_dir*.

        The full pair list (``dialogue_pair.json``) is restored as well when
        that file exists, so that a later ``save_dialogue_pair`` does not
        overwrite it with an empty list.

        Args:
            exp_data_dir: directory holding the JSON files written by
                ``save_dialogue_pair``.
        """
        train_dial_pair_path = os.path.join(exp_data_dir,
                                            'train_dialogue_pair.json')
        valid_dial_pair_path = os.path.join(exp_data_dir,
                                            'valid_dialogue_pair.json')
        test_dial_pair_path = os.path.join(exp_data_dir,
                                           'test_dialogue_pair.json')
        with open(train_dial_pair_path, 'r') as f:
            self.train_dial_pair = json.load(f)
        with open(valid_dial_pair_path, 'r') as f:
            self.valid_dial_pair = json.load(f)
        with open(test_dial_pair_path, 'r') as f:
            self.test_dial_pair = json.load(f)

        dial_pair_path = os.path.join(exp_data_dir, 'dialogue_pair.json')
        if os.path.isfile(dial_pair_path):
            with open(dial_pair_path, 'r') as f:
                self.dial_pair = json.load(f)

    def create_conversation(self, raw_data_dir, exp_data_dir):
        """Run the full pipeline from raw data to saved, encoded pairs.

        Args:
            raw_data_dir: directory passed to
                ``dial_info.build_dialogue_info``.
            exp_data_dir: directory that receives every generated file.
        """
        # Passed to build_vocab; presumably a minimum word-frequency cutoff
        # (confirm against Vocabulary.build_vocab).
        threshold = 5
        self.dial_info.build_dialogue_info(raw_data_dir)
        self.dial_info.save_dialogue_info(exp_data_dir)
        # The pairs must be built and split *before* the vocabulary is
        # built, otherwise build_vocab receives an empty training list.
        self.build_dialogue_pair()
        self.split_data_set()
        self.vocab.build_vocab(threshold, self.train_dial_pair)
        self.vocab.save_vocab(exp_data_dir)
        self.encode_dialogue_pair()
        self.save_dialogue_pair(exp_data_dir)

    def re_encode_dialogue_pair(self, exp_data_dir):
        """Reload the saved vocabulary and splits, then encode and save again.

        Args:
            exp_data_dir: directory holding ``vocabulary.json`` and the
                dialogue-pair JSON files.
        """
        vocab_path = os.path.join(exp_data_dir, 'vocabulary.json')
        self.vocab.load_vocab(vocab_path)
        self.load_dialogue_pair(exp_data_dir)
        self.encode_dialogue_pair()
        self.save_dialogue_pair(exp_data_dir)