def __init__(self, dataset_folder, dataset_file, dict_file, stopwords_file,
             word_embedding_file, train_rate, valid_rate, test_rate,
             algo_name, charset, mode):
    """Build the style encoder-decoder dialog model.

    Sets three attributes:
      1. conf_dict: configuration of the model.
      2. cr: CorpusReaderDialog used to operate on the data.
      3. model: the StyleEncoderDecoderNetwork instance.

    :param dataset_folder: root folder; models live under 'model/dialog'.
    :param dataset_file: training corpus (read only when mode == 'train').
    :param dict_file: word dictionary file.
    :param stopwords_file: stopwords file.
    :param word_embedding_file: unused; pre-trained embeddings are disabled
        here (pre_word_embedding is False) — kept for interface parity.
    :param train_rate: training split proportion.
    :param valid_rate: validation split proportion.
    :param test_rate: test split proportion.
    :param algo_name: algorithm identifier used in parameter file names.
    :param charset: corpus character encoding.
    :param mode: 'train' resumes from '<name>.model', any other mode loads
        the finalized '<name>.model.final'.
    """
    self.conf_dict = {'algo_name': algo_name,
                      'batch_size': 96,
                      'train_valid_test_rate': [train_rate, valid_rate, test_rate],
                      'split_level': 'zi',
                      'pre_word_embedding': False,
                      'word_embedding_dim': 128,
                      'max_sentence_word_num': 150,
                      'min_sentence_word_num': 1,
                      'is_BEG': False,
                      'is_END': True,
                      'hidden_dim': 1024,
                      'n_style': 4,
                      'style_embedding_dim': 128,
                      'charset': charset,
                      'shuffle': False,
                      'save_freq': 100}
    # All artifact paths share the same directory and base name; build once.
    model_dir = os.path.join(dataset_folder, 'model', 'dialog')
    base_name = get_params_file_name(self.conf_dict)
    if mode == 'train':
        self.param_path = os.path.join(model_dir, base_name + '.model')
    else:
        self.param_path = os.path.join(model_dir, base_name + '.model.final')
    param_dict = load_params_val(self.param_path)
    self.conf_path = os.path.join(model_dir, base_name + '.conf')
    save_confs_val(self.conf_dict, self.conf_path)
    # Set corpus reader. Outside training no corpus is loaded (dataset_file
    # is None): only the dictionary is needed for word/id mapping.
    self.cr = CorpusReaderDialog(
        dataset_file=dataset_file if mode == 'train' else None,
        stopwords_file=stopwords_file,
        dict_file=dict_file,
        word_embedding_file=None,
        train_valid_test_rate=self.conf_dict['train_valid_test_rate'],
        charset=self.conf_dict['charset'],
        max_sentence_word_num=self.conf_dict['max_sentence_word_num'],
        min_sentence_word_num=self.conf_dict['min_sentence_word_num'],
        is_BEG_available=self.conf_dict['is_BEG'],
        is_END_available=self.conf_dict['is_END'])
    # Set model.
    self.model = StyleEncoderDecoderNetwork(
        n_words=len(self.cr.get_word_dictionary()),
        hidden_status_dim=self.conf_dict['hidden_dim'],
        word_embedding_dim=self.conf_dict['word_embedding_dim'],
        n_style=self.conf_dict['n_style'],
        style_embedding_dim=self.conf_dict['style_embedding_dim'],
        input_params=param_dict)
def __init__(self, dataset_folder, dataset_file, dict_file, stopwords_file,
             word_embedding_file, train_rate, valid_rate, test_rate,
             algo_name, charset, mode):
    """Build the bi-encoder attention-decoder dialog model.

    Sets these attributes:
      1. conf_dict: configuration of the model.
      2. cr: CorpusReaderDialog used to operate on the data.
      3. model: the BiEncoderAttentionDecoderNetwork instance.
      4. model_sets / model_paths (mode == 'test_all' only): every numbered
         checkpoint found on disk, loaded as a separate model.

    :param dataset_folder: root folder; models live under 'model/dialog'.
    :param dataset_file: training corpus (read only when mode == 'train').
    :param dict_file: word dictionary file.
    :param stopwords_file: stopwords file.
    :param word_embedding_file: unused; pre-trained embeddings are disabled
        here (pre_word_embedding is False) — kept for interface parity.
    :param train_rate: training split proportion.
    :param valid_rate: validation split proportion.
    :param test_rate: test split proportion.
    :param algo_name: algorithm identifier used in parameter file names.
    :param charset: corpus character encoding.
    :param mode: 'train' resumes from '<name>.model', 'test_all' also scans
        for numbered checkpoints, any other mode loads '<name>.model.final'.
    """
    self.conf_dict = {'algo_name': algo_name,
                      'batch_size': 128,
                      'train_valid_test_rate': [train_rate, valid_rate, test_rate],
                      'split_level': 'zi',
                      'pre_word_embedding': False,
                      'word_embedding_dim': 128,
                      'max_sentence_word_num': 150,
                      'min_sentence_word_num': 1,
                      'is_BEG': False,
                      'is_END': True,
                      'hidden_dim': 1024,
                      'charset': charset,
                      'shuffle': False,
                      'save_freq': 100}
    # All artifact paths share the same directory and base name; build once.
    model_dir = os.path.join(dataset_folder, 'model', 'dialog')
    base_name = get_params_file_name(self.conf_dict)
    if mode == 'train':
        self.param_path = os.path.join(model_dir, base_name + '.model')
    else:
        self.param_path = os.path.join(model_dir, base_name + '.model.final')
    param_dict = load_params_val(self.param_path)
    self.conf_path = os.path.join(model_dir, base_name + '.conf')
    save_confs_val(self.conf_dict, self.conf_path)
    # Set corpus reader. Outside training no corpus is loaded (dataset_file
    # is None): only the dictionary is needed for word/id mapping.
    self.cr = CorpusReaderDialog(
        dataset_file=dataset_file if mode == 'train' else None,
        stopwords_file=stopwords_file,
        dict_file=dict_file,
        word_embedding_file=None,
        train_valid_test_rate=self.conf_dict['train_valid_test_rate'],
        charset=self.conf_dict['charset'],
        max_sentence_word_num=self.conf_dict['max_sentence_word_num'],
        min_sentence_word_num=self.conf_dict['min_sentence_word_num'],
        is_BEG_available=self.conf_dict['is_BEG'],
        is_END_available=self.conf_dict['is_END'])
    # Set model.
    n_words = len(self.cr.get_word_dictionary())
    self.model = BiEncoderAttentionDecoderNetwork(
        n_words=n_words,
        hidden_status_dim=self.conf_dict['hidden_dim'],
        word_embedding_dim=self.conf_dict['word_embedding_dim'],
        input_params=param_dict)
    # In 'test_all' mode additionally load every numbered checkpoint
    # ('<algo_name>_<hash>.model<NN>') so each snapshot can be evaluated.
    if mode == 'test_all':
        self.model_sets, self.model_paths = [], []
        # Compile once outside the loop. algo_name is escaped and the '.'
        # before 'model' is literal (the original pattern let '.' match any
        # character, over-matching file names).
        checkpoint_pattern = re.compile(
            re.escape(algo_name) + r'_(\w+)\.model[0-9]+')
        for parent, dirnames, filenames in os.walk(model_dir):
            for filename in filenames:
                if checkpoint_pattern.match(filename) is not None:
                    self.model_paths.append(filename)
                    checkpoint_params = load_params_val(
                        os.path.join(model_dir, filename))
                    model = BiEncoderAttentionDecoderNetwork(
                        n_words=n_words,
                        hidden_status_dim=self.conf_dict['hidden_dim'],
                        word_embedding_dim=self.conf_dict['word_embedding_dim'],
                        input_params=checkpoint_params)
                    self.model_sets.append(model)
def __init__(self, dataset_folder, dataset_file, dict_file, stopwords_file,
             word_embedding_file, train_rate, valid_rate, test_rate,
             algo_name, charset, mode):
    """Build the RNN encoder-decoder dialog model.

    Sets three attributes:
      1. conf_dict: configuration of the model.
      2. cr: CorpusReaderDialog used to operate on the data.
      3. model: the RnnEncoderDecoderNetwork instance.

    :param dataset_folder: root folder; models live under 'model/dialog'.
    :param dataset_file: training corpus (read only when mode == 'train').
    :param dict_file: word dictionary file.
    :param stopwords_file: stopwords file.
    :param word_embedding_file: unused; pre-trained embeddings are disabled
        here (pre_word_embedding is False) — kept for interface parity.
    :param train_rate: training split proportion.
    :param valid_rate: validation split proportion.
    :param test_rate: test split proportion.
    :param algo_name: algorithm identifier used in parameter file names.
    :param charset: corpus character encoding.
    :param mode: 'train' resumes from '<name>.model', any other mode loads
        the finalized '<name>.model.final'.
    """
    self.conf_dict = {'algo_name': algo_name,
                      'batch_size': 1,
                      'train_valid_test_rate': [train_rate, valid_rate, test_rate],
                      'split_level': 'zi',
                      'pre_word_embedding': False,
                      'word_embedding_dim': 128,
                      'max_sentence_word_num': 150,
                      'min_sentence_word_num': 1,
                      'is_BEG': False,
                      'is_END': True,
                      'hidden_dim': 256,
                      'charset': charset,
                      'shuffle': False,
                      'save_freq': 1}
    # All artifact paths share the same directory and base name; build once.
    model_dir = os.path.join(dataset_folder, 'model', 'dialog')
    base_name = get_params_file_name(self.conf_dict)
    if mode == 'train':
        self.param_path = os.path.join(model_dir, base_name + '.model')
    else:
        self.param_path = os.path.join(model_dir, base_name + '.model.final')
    param_dict = load_params_val(self.param_path)
    self.conf_path = os.path.join(model_dir, base_name + '.conf')
    save_confs_val(self.conf_dict, self.conf_path)
    # Set corpus reader. Outside training no corpus is loaded (dataset_file
    # is None): only the dictionary is needed for word/id mapping.
    self.cr = CorpusReaderDialog(
        dataset_file=dataset_file if mode == 'train' else None,
        stopwords_file=stopwords_file,
        dict_file=dict_file,
        word_embedding_file=None,
        train_valid_test_rate=self.conf_dict['train_valid_test_rate'],
        charset=self.conf_dict['charset'],
        max_sentence_word_num=self.conf_dict['max_sentence_word_num'],
        min_sentence_word_num=self.conf_dict['min_sentence_word_num'],
        is_BEG_available=self.conf_dict['is_BEG'],
        is_END_available=self.conf_dict['is_END'])
    # Set model.
    self.model = RnnEncoderDecoderNetwork(
        n_words=len(self.cr.get_word_dictionary()),
        hidden_status_dim=self.conf_dict['hidden_dim'],
        word_embedding_dim=self.conf_dict['word_embedding_dim'],
        input_params=param_dict)
def __init__(self, dataset_folder, dataset_file, dict_file, stopwords_file,
             word_embedding_file, train_rate, valid_rate, test_rate,
             algo_name, charset, mode):
    """Build the topic-aware RNN encoder-decoder dialog model.

    Sets three attributes:
      1. conf_dict: configuration of the model.
      2. cr: CorpusReaderDialogTopic used to operate on the data.
      3. model: the RnnEncoderDecoderNetwork instance (topic-conditioned).

    :param dataset_folder: root folder; models live under 'model/dialog'.
    :param dataset_file: training corpus (read only when mode == 'train').
    :param dict_file: word dictionary file.
    :param stopwords_file: stopwords file.
    :param word_embedding_file: unused; pre-trained embeddings are disabled
        here (pre_word_embedding is False) — kept for interface parity.
    :param train_rate: training split proportion.
    :param valid_rate: validation split proportion.
    :param test_rate: test split proportion.
    :param algo_name: algorithm identifier used in parameter file names.
    :param charset: corpus character encoding.
    :param mode: 'train' loads the training corpus; note that unlike the
        sibling classes, the same '<name>.model' checkpoint is loaded
        regardless of mode.
    """
    self.conf_dict = {'algo_name': algo_name,
                      'batch_size': 256,
                      'train_valid_test_rate': [train_rate, valid_rate, test_rate],
                      'split_level': 'zi',
                      'pre_word_embedding': False,
                      'word_embedding_dim': 128,
                      'n_topics': 5000,
                      'topic_embedding_dim': 256,
                      'max_sentence_word_num': 150,
                      'min_sentence_word_num': 1,
                      'is_BEG': False,
                      'is_END': True,
                      'hidden_dim': 512,
                      'charset': charset,
                      'shuffle': False,
                      'save_freq': 100}
    # All artifact paths share the same directory and base name; build once.
    model_dir = os.path.join(dataset_folder, 'model', 'dialog')
    base_name = get_params_file_name(self.conf_dict)
    self.param_path = os.path.join(model_dir, base_name + '.model')
    print(self.param_path)
    param_dict = load_params_val(self.param_path)
    self.conf_path = os.path.join(model_dir, base_name + '.conf')
    save_confs_val(self.conf_dict, self.conf_path)
    # Set corpus reader. Outside training no corpus is loaded (dataset_file
    # is None): only the dictionary is needed for word/id mapping.
    self.cr = CorpusReaderDialogTopic(
        dataset_file=dataset_file if mode == 'train' else None,
        stopwords_file=stopwords_file,
        dict_file=dict_file,
        word_embedding_file=None,
        train_valid_test_rate=self.conf_dict['train_valid_test_rate'],
        charset=self.conf_dict['charset'],
        max_sentence_word_num=self.conf_dict['max_sentence_word_num'],
        min_sentence_word_num=self.conf_dict['min_sentence_word_num'],
        is_BEG_available=self.conf_dict['is_BEG'],
        is_END_available=self.conf_dict['is_END'])
    # Set model.
    self.model = RnnEncoderDecoderNetwork(
        n_words=len(self.cr.get_word_dictionary()),
        hidden_status_dim=self.conf_dict['hidden_dim'],
        word_embedding_dim=self.conf_dict['word_embedding_dim'],
        n_topics=self.conf_dict['n_topics'],
        topic_embedding_dim=self.conf_dict['topic_embedding_dim'],
        input_params=param_dict)