def __init__(self, dataset_folder, dataset_file, dict_file, stopwords_file, word_embedding_file,
             train_rate, valid_rate, test_rate, algo_name, charset, mode):
     """
     Assemble the configuration, the corpus reader and the style network.

     Attributes assigned here:
         conf_dict  -- hyper-parameters and data settings of the model.
         param_path -- parameter file under <dataset_folder>/model/dialog:
                       '<name>.model' when mode is 'train',
                       '<name>.model.final' for any other mode, where
                       <name> is get_params_file_name(conf_dict).
         conf_path  -- '<name>.conf' in the same folder; conf_dict is
                       saved there.
         cr         -- CorpusReaderDialog; it is given dataset_file only
                       when mode is 'train', otherwise None.
         model      -- StyleEncoderDecoderNetwork created with whatever
                       load_params_val returned for param_path.

     word_embedding_file is accepted but not used: the reader is always
     created with word_embedding_file=None.
     """
     training = (mode == 'train')

     self.conf_dict = {
         'algo_name': algo_name,
         'batch_size': 96,
         'train_valid_test_rate': [train_rate, valid_rate, test_rate],
         'split_level': 'zi',
         'pre_word_embedding': False,
         'word_embedding_dim': 128,
         'max_sentence_word_num': 150,
         'min_sentence_word_num': 1,
         'is_BEG': False,
         'is_END': True,
         'hidden_dim': 1024,
         'n_style': 4,
         'style_embedding_dim': 128,
         'charset': charset,
         'shuffle': False,
         'save_freq': 100,
     }
     settings = self.conf_dict

     def artifact_path(suffix):
         # Every artifact sits in <dataset_folder>/model/dialog and is
         # named after the configuration.
         return os.path.join(dataset_folder, 'model', 'dialog',
                             get_params_file_name(settings) + suffix)

     # Parameters are loaded before the configuration file is written.
     self.param_path = artifact_path('.model' if training else '.model.final')
     param_dict = load_params_val(self.param_path)
     self.conf_path = artifact_path('.conf')
     save_confs_val(settings, self.conf_path)

     # Corpus reader: the raw dataset is only handed over for training.
     self.cr = CorpusReaderDialog(
         dataset_file=dataset_file if training else None,
         stopwords_file=stopwords_file,
         dict_file=dict_file,
         word_embedding_file=None,
         train_valid_test_rate=settings['train_valid_test_rate'],
         charset=settings['charset'],
         max_sentence_word_num=settings['max_sentence_word_num'],
         min_sentence_word_num=settings['min_sentence_word_num'],
         is_BEG_available=settings['is_BEG'],
         is_END_available=settings['is_END'])

     # Network: the vocabulary size is taken from the reader's dictionary.
     vocabulary_size = len(self.cr.get_word_dictionary())
     self.model = StyleEncoderDecoderNetwork(
         n_words=vocabulary_size,
         hidden_status_dim=settings['hidden_dim'],
         word_embedding_dim=settings['word_embedding_dim'],
         n_style=settings['n_style'],
         style_embedding_dim=settings['style_embedding_dim'],
         input_params=param_dict)
                 
                 
                 
                 
Beispiel #2
0
 def __init__(self, dataset_folder, dataset_file, dict_file, stopwords_file, word_embedding_file,
              train_rate, valid_rate, test_rate, algo_name, charset, mode):
     """
     Set up the configuration, corpus reader and RNN encoder-decoder.

     Attributes set:
         1. conf_dict: configuration of the model.
         2. param_path: parameter file under
            <dataset_folder>/model/dialog, named by
            get_params_file_name(conf_dict) with suffix '.model' when
            mode is 'train' and '.model.final' otherwise.
         3. conf_path: matching '.conf' path (recorded only, see below).
         4. cr: CorpusReaderDialog used to operate on the data; it gets
            dataset_file only when mode is 'train', otherwise None.
         5. model: RnnEncoderDecoderNetwork initialised with the result
            of load_params_val(param_path).

     word_embedding_file is accepted but not used: the reader is always
     created with word_embedding_file=None.
     """
     self.conf_dict = {'algo_name': algo_name, 'batch_size': 256,
                       'train_valid_test_rate': [train_rate, valid_rate, test_rate],
                       'split_level': 'zi', 'pre_word_embedding': False,
                       'word_embedding_dim': 128, 'max_sentence_word_num': 150,
                       'min_sentence_word_num': 1, 'is_BEG': False, 'is_END': True,
                       'hidden_dim': 512, 'charset': charset, 'shuffle': False,
                       'save_freq': 100}
     is_train = (mode == 'train')
     model_dir = os.path.join(dataset_folder, 'model', 'dialog')
     file_stem = get_params_file_name(self.conf_dict)

     # The parameter file always follows dataset_folder and mode; it is
     # never pinned to a fixed checkpoint in the working directory.
     param_suffix = '.model' if is_train else '.model.final'
     self.param_path = os.path.join(model_dir, file_stem + param_suffix)
     param_dict = load_params_val(self.param_path)

     # NOTE: conf_path is recorded, but the configuration is not written
     # to disk here (the save_confs_val call is disabled in this class).
     self.conf_path = os.path.join(model_dir, file_stem + '.conf')

     # set corpus reader: the raw dataset is only passed for training
     self.cr = CorpusReaderDialog(
         dataset_file=dataset_file if is_train else None,
         stopwords_file=stopwords_file,
         dict_file=dict_file,
         word_embedding_file=None,
         train_valid_test_rate=self.conf_dict['train_valid_test_rate'],
         charset=self.conf_dict['charset'],
         max_sentence_word_num=self.conf_dict['max_sentence_word_num'],
         min_sentence_word_num=self.conf_dict['min_sentence_word_num'],
         is_BEG_available=self.conf_dict['is_BEG'],
         is_END_available=self.conf_dict['is_END'])

     # set model
     self.model = RnnEncoderDecoderNetwork(
         n_words=len(self.cr.get_word_dictionary()),
         hidden_status_dim=self.conf_dict['hidden_dim'],
         word_embedding_dim=self.conf_dict['word_embedding_dim'],
         input_params=param_dict)
 def __init__(self, dataset_folder, dataset_file, dict_file, stopwords_file, word_embedding_file,
              train_rate, valid_rate, test_rate, algo_name, charset, mode):
     """
     Set up the configuration, corpus reader and attention network(s).

     Attributes set:
         1. conf_dict: configuration of the model.
         2. param_path: parameter file under
            <dataset_folder>/model/dialog, named by
            get_params_file_name(conf_dict) with suffix '.model' when
            mode is 'train' and '.model.final' otherwise.
         3. conf_path: matching '.conf' file; conf_dict is saved there.
         4. cr: CorpusReaderDialog used to operate on the data; it gets
            dataset_file only when mode is 'train', otherwise None.
         5. model: BiEncoderAttentionDecoderNetwork initialised with the
            result of load_params_val(param_path).
         6. model_sets / model_paths: only when mode is 'test_all'. One
            network per checkpoint file found below the model folder
            whose name starts with '<algo_name>_<word chars>.model<digits>',
            plus the bare file names in the same order.

     word_embedding_file is accepted but not used: the reader is always
     created with word_embedding_file=None.
     """
     self.conf_dict = {'algo_name': algo_name, 'batch_size': 128,
                       'train_valid_test_rate': [train_rate, valid_rate, test_rate],
                       'split_level': 'zi', 'pre_word_embedding': False,
                       'word_embedding_dim': 128, 'max_sentence_word_num': 150,
                       'min_sentence_word_num': 1, 'is_BEG': False, 'is_END': True,
                       'hidden_dim': 1024, 'charset': charset, 'shuffle': False,
                       'save_freq': 100}
     is_train = (mode == 'train')
     model_dir = os.path.join(dataset_folder, 'model', 'dialog')
     file_stem = get_params_file_name(self.conf_dict)

     param_suffix = '.model' if is_train else '.model.final'
     self.param_path = os.path.join(model_dir, file_stem + param_suffix)
     param_dict = load_params_val(self.param_path)
     self.conf_path = os.path.join(model_dir, file_stem + '.conf')
     save_confs_val(self.conf_dict, self.conf_path)

     # set corpus reader: the raw dataset is only passed for training
     self.cr = CorpusReaderDialog(
         dataset_file=dataset_file if is_train else None,
         stopwords_file=stopwords_file,
         dict_file=dict_file,
         word_embedding_file=None,
         train_valid_test_rate=self.conf_dict['train_valid_test_rate'],
         charset=self.conf_dict['charset'],
         max_sentence_word_num=self.conf_dict['max_sentence_word_num'],
         min_sentence_word_num=self.conf_dict['min_sentence_word_num'],
         is_BEG_available=self.conf_dict['is_BEG'],
         is_END_available=self.conf_dict['is_END'])

     # set model
     n_words = len(self.cr.get_word_dictionary())
     self.model = BiEncoderAttentionDecoderNetwork(
         n_words=n_words,
         hidden_status_dim=self.conf_dict['hidden_dim'],
         word_embedding_dim=self.conf_dict['word_embedding_dim'],
         input_params=param_dict)

     # create param sets
     if mode == 'test_all':
         self.model_sets, self.model_paths = [], []
         # algo_name is escaped so it is matched literally, and the dot
         # before 'model' is a literal dot rather than "any character".
         checkpoint_re = re.compile(re.escape(algo_name) + r'_(\w+)\.model[0-9]+')
         for parent, _dirnames, filenames in os.walk(model_dir):
             for filename in filenames:
                 if checkpoint_re.match(filename) is None:
                     continue
                 self.model_paths.append(filename)
                 # os.walk also visits sub-directories, so the file has to
                 # be opened relative to the directory it was found in.
                 checkpoint_params = load_params_val(os.path.join(parent, filename))
                 self.model_sets.append(BiEncoderAttentionDecoderNetwork(
                     n_words=n_words,
                     hidden_status_dim=self.conf_dict['hidden_dim'],
                     word_embedding_dim=self.conf_dict['word_embedding_dim'],
                     input_params=checkpoint_params))
 def __init__(self, dataset_folder, dataset_file, dict_file, stopwords_file,
              word_embedding_file, train_rate, valid_rate, test_rate,
              algo_name, charset, mode):
     """
     Set up the configuration, topic corpus reader and network.

     Attributes set:
         1. conf_dict: configuration of the model.
         2. param_path: '<name>.model' under <dataset_folder>/model/dialog,
            where <name> is get_params_file_name(conf_dict). The path is
            printed to standard output.
         3. conf_path: matching '.conf' file; conf_dict is saved there.
         4. cr: CorpusReaderDialogTopic used to operate on the data; it
            gets dataset_file only when mode is 'train', otherwise None.
         5. model: RnnEncoderDecoderNetwork initialised with the result
            of load_params_val(param_path).

     word_embedding_file is accepted but not used: the reader is always
     created with word_embedding_file=None.
     """
     self.conf_dict = {
         'algo_name': algo_name,
         'batch_size': 256,
         'train_valid_test_rate': [train_rate, valid_rate, test_rate],
         'split_level': 'zi',
         'pre_word_embedding': False,
         'word_embedding_dim': 128,
         'n_topics': 5000,
         'topic_embedding_dim': 256,
         'max_sentence_word_num': 150,
         'min_sentence_word_num': 1,
         'is_BEG': False,
         'is_END': True,
         'hidden_dim': 512,
         'charset': charset,
         'shuffle': False,
         'save_freq': 100
     }
     model_dir = os.path.join(dataset_folder, 'model', 'dialog')
     file_stem = get_params_file_name(self.conf_dict)

     # NOTE(review): the parameter path does not depend on mode here; the
     # '.model' file is loaded for every mode -- confirm this is intended.
     self.param_path = os.path.join(model_dir, file_stem + '.model')
     # Parenthesised form prints the same text on Python 2 and also
     # parses on Python 3.
     print(self.param_path)
     param_dict = load_params_val(self.param_path)
     self.conf_path = os.path.join(model_dir, file_stem + '.conf')
     save_confs_val(self.conf_dict, self.conf_path)

     # set corpus reader: the raw dataset is only passed for training
     self.cr = CorpusReaderDialogTopic(
         dataset_file=dataset_file if mode == 'train' else None,
         stopwords_file=stopwords_file,
         dict_file=dict_file,
         word_embedding_file=None,
         train_valid_test_rate=self.conf_dict['train_valid_test_rate'],
         charset=self.conf_dict['charset'],
         max_sentence_word_num=self.conf_dict['max_sentence_word_num'],
         min_sentence_word_num=self.conf_dict['min_sentence_word_num'],
         is_BEG_available=self.conf_dict['is_BEG'],
         is_END_available=self.conf_dict['is_END'])

     # set model
     self.model = RnnEncoderDecoderNetwork(
         n_words=len(self.cr.get_word_dictionary()),
         hidden_status_dim=self.conf_dict['hidden_dim'],
         word_embedding_dim=self.conf_dict['word_embedding_dim'],
         n_topics=self.conf_dict['n_topics'],
         topic_embedding_dim=self.conf_dict['topic_embedding_dim'],
         input_params=param_dict)