def __init__(self, dataset_folder, dataset_file, dict_file, stopwords_file,
             word_embedding_file, train_rate, valid_rate, test_rate,
             algo_name, charset, mode):
    """
    Need to set these attributes:
        1. conf_dict: configuration of the model.
        2. cr: CorpusReader for operating on the data.
        3. model: the network model.
    """
    self.conf_dict = {'algo_name': algo_name,
                      'batch_size': 96,
                      'train_valid_test_rate': [train_rate, valid_rate, test_rate],
                      'split_level': 'zi',
                      'pre_word_embedding': False,
                      'word_embedding_dim': 128,
                      'max_sentence_word_num': 150,
                      'min_sentence_word_num': 1,
                      'is_BEG': False,
                      'is_END': True,
                      'hidden_dim': 1024,
                      'n_style': 4,
                      'style_embedding_dim': 128,
                      'charset': charset,
                      'shuffle': False,
                      'save_freq': 100}
    if mode == 'train':
        self.param_path = os.path.join(dataset_folder, 'model', 'dialog',
                                       get_params_file_name(self.conf_dict) + '.model')
    else:
        self.param_path = os.path.join(dataset_folder, 'model', 'dialog',
                                       get_params_file_name(self.conf_dict) + '.model.final')
    param_dict = load_params_val(self.param_path)
    self.conf_path = os.path.join(dataset_folder, 'model', 'dialog',
                                  get_params_file_name(self.conf_dict) + '.conf')
    save_confs_val(self.conf_dict, self.conf_path)

    # set corpus reader
    if mode == 'train':
        self.cr = CorpusReaderDialog(dataset_file=dataset_file,
                                     stopwords_file=stopwords_file,
                                     dict_file=dict_file,
                                     word_embedding_file=None,
                                     train_valid_test_rate=self.conf_dict['train_valid_test_rate'],
                                     charset=self.conf_dict['charset'],
                                     max_sentence_word_num=self.conf_dict['max_sentence_word_num'],
                                     min_sentence_word_num=self.conf_dict['min_sentence_word_num'],
                                     is_BEG_available=self.conf_dict['is_BEG'],
                                     is_END_available=self.conf_dict['is_END'])
    else:
        self.cr = CorpusReaderDialog(dataset_file=None,
                                     stopwords_file=stopwords_file,
                                     dict_file=dict_file,
                                     word_embedding_file=None,
                                     train_valid_test_rate=self.conf_dict['train_valid_test_rate'],
                                     charset=self.conf_dict['charset'],
                                     max_sentence_word_num=self.conf_dict['max_sentence_word_num'],
                                     min_sentence_word_num=self.conf_dict['min_sentence_word_num'],
                                     is_BEG_available=self.conf_dict['is_BEG'],
                                     is_END_available=self.conf_dict['is_END'])

    # set model
    self.model = StyleEncoderDecoderNetwork(n_words=len(self.cr.get_word_dictionary()),
                                            hidden_status_dim=self.conf_dict['hidden_dim'],
                                            word_embedding_dim=self.conf_dict['word_embedding_dim'],
                                            n_style=self.conf_dict['n_style'],
                                            style_embedding_dim=self.conf_dict['style_embedding_dim'],
                                            input_params=param_dict)
def __init__(self, dataset_folder, dataset_file, dict_file, stopwords_file,
             word_embedding_file, train_rate, valid_rate, test_rate,
             algo_name, charset, mode):
    """
    Need to set these attributes:
        1. conf_dict: configuration of the model.
        2. cr: CorpusReader for operating on the data.
        3. model: the network model.
    """
    self.conf_dict = {'algo_name': algo_name,
                      'batch_size': 256,
                      'train_valid_test_rate': [train_rate, valid_rate, test_rate],
                      'split_level': 'zi',
                      'pre_word_embedding': False,
                      'word_embedding_dim': 128,
                      'max_sentence_word_num': 150,
                      'min_sentence_word_num': 1,
                      'is_BEG': False,
                      'is_END': True,
                      'hidden_dim': 512,
                      'charset': charset,
                      'shuffle': False,
                      'save_freq': 100}
    if mode == 'train':
        self.param_path = os.path.join(dataset_folder, 'model', 'dialog',
                                       get_params_file_name(self.conf_dict) + '.model')
    else:
        self.param_path = os.path.join(dataset_folder, 'model', 'dialog',
                                       get_params_file_name(self.conf_dict) + '.model.final')
    # NOTE: hard-coded override of the computed path; this checkpoint is loaded
    # regardless of the mode chosen above.
    self.param_path = 'ChoEncoderDecoder_bda37ef460ea58d4cfaf1122e4a7e2d8.model302'
    param_dict = load_params_val(self.param_path)
    self.conf_path = os.path.join(dataset_folder, 'model', 'dialog',
                                  get_params_file_name(self.conf_dict) + '.conf')
    # save_confs_val(self.conf_dict, self.conf_path)

    # set corpus reader
    if mode == 'train':
        self.cr = CorpusReaderDialog(dataset_file=dataset_file,
                                     stopwords_file=stopwords_file,
                                     dict_file=dict_file,
                                     word_embedding_file=None,
                                     train_valid_test_rate=self.conf_dict['train_valid_test_rate'],
                                     charset=self.conf_dict['charset'],
                                     max_sentence_word_num=self.conf_dict['max_sentence_word_num'],
                                     min_sentence_word_num=self.conf_dict['min_sentence_word_num'],
                                     is_BEG_available=self.conf_dict['is_BEG'],
                                     is_END_available=self.conf_dict['is_END'])
    else:
        self.cr = CorpusReaderDialog(dataset_file=None,
                                     stopwords_file=stopwords_file,
                                     dict_file=dict_file,
                                     word_embedding_file=None,
                                     train_valid_test_rate=self.conf_dict['train_valid_test_rate'],
                                     charset=self.conf_dict['charset'],
                                     max_sentence_word_num=self.conf_dict['max_sentence_word_num'],
                                     min_sentence_word_num=self.conf_dict['min_sentence_word_num'],
                                     is_BEG_available=self.conf_dict['is_BEG'],
                                     is_END_available=self.conf_dict['is_END'])

    # set model
    self.model = RnnEncoderDecoderNetwork(n_words=len(self.cr.get_word_dictionary()),
                                          hidden_status_dim=self.conf_dict['hidden_dim'],
                                          word_embedding_dim=self.conf_dict['word_embedding_dim'],
                                          input_params=param_dict)
def train(self):
    """
    Train a model.
    """
    n_trainset, n_validset, n_testset = self.cr.get_size()
    n_batches = max(1, (n_trainset - 1) / self.conf_dict['batch_size'] + 1)

    # print model information
    print '#' * 100
    self.print_model_info()
    print ('Compiling model...')
    train_model = self.model.get_training_function(self.cr,
                                                   batch_size=self.conf_dict['batch_size'])
    valid_model = self.model.get_validing_function(self.cr,
                                                   batch_size=self.conf_dict['batch_size'])
    test_model = self.model.get_testing_function(self.cr,
                                                 batch_size=self.conf_dict['batch_size'])

    # start training
    print '#' * 100
    print ('Start to train.')
    test_error = test_model()[0]
    print ('Now testing model. Cost Error: %.10f' % (test_error))

    epoch, it, n_epochs = 0, 0, 1000000
    while (epoch < n_epochs):
        epoch += 1
        for i in xrange(n_batches):
            # train on one mini-batch
            train_error = train_model(i)[0]
            # print 'Step error: %f\r' % train_error,
            if math.isnan(train_error):
                # dump the broken parameters for inspection, roll back to the
                # last checkpoint, and abort
                print ('Train error is NaN in epoch %d, batch %d' % (epoch, i))
                error_model_path = self.param_path + str(epoch) + '.error'
                save_params_val(error_model_path, self.model.get_parameters())
                print ('model saved in %s , reloading the last checkpoint.' % (error_model_path))
                param_dict = load_params_val(self.param_path)
                self.model.set_parameters(param_dict)
                exit()
            if it % self.conf_dict['save_freq'] == 0:
                # valid model
                # valid_error = valid_model()[0]
                print ('@iter: %d\tTraining Error: %.10f' % (it, train_error))
                print self.param_path
                save_params_val(self.param_path, self.model.get_parameters())
            it = it + 1

        # test the model after each epoch
        # print ('Finished an epoch.')
        test_error = test_model()[0]
        print ('Now testing model. Testing Error: %.10f' % (test_error))
        save_params_val(self.param_path + str(epoch), self.model.get_parameters())
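# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the project code): the loops above size an
# epoch with the usual "ceiling division" idiom, (n - 1) / batch_size + 1, so
# that a final partial batch is still counted.  The tiny, self-contained
# helper below only demonstrates that idiom; the names are placeholders.
def ceil_batches(n_examples, batch_size):
    """Number of mini-batches of size `batch_size` needed to cover `n_examples`."""
    return (n_examples - 1) // batch_size + 1

assert ceil_batches(256, 96) == 3   # 96 + 96 + 64
assert ceil_batches(96, 96) == 1
# ---------------------------------------------------------------------------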
def __init__(self, dataset_folder, dataset_file, dict_file, stopwords_file,
             word_embedding_file, train_rate, valid_rate, test_rate,
             algo_name, charset, mode):
    """
    Need to set these attributes:
        1. conf_dict: configuration of the model.
        2. cr: CorpusReader for operating on the data.
        3. model: the network model.
    """
    self.conf_dict = {'algo_name': algo_name,
                      'batch_size': 128,
                      'train_valid_test_rate': [train_rate, valid_rate, test_rate],
                      'split_level': 'zi',
                      'pre_word_embedding': False,
                      'word_embedding_dim': 128,
                      'max_sentence_word_num': 150,
                      'min_sentence_word_num': 1,
                      'is_BEG': False,
                      'is_END': True,
                      'hidden_dim': 1024,
                      'charset': charset,
                      'shuffle': False,
                      'save_freq': 100}
    if mode == 'train':
        self.param_path = os.path.join(dataset_folder, 'model', 'dialog',
                                       get_params_file_name(self.conf_dict) + '.model')
    else:
        self.param_path = os.path.join(dataset_folder, 'model', 'dialog',
                                       get_params_file_name(self.conf_dict) + '.model.final')
    param_dict = load_params_val(self.param_path)
    self.conf_path = os.path.join(dataset_folder, 'model', 'dialog',
                                  get_params_file_name(self.conf_dict) + '.conf')
    save_confs_val(self.conf_dict, self.conf_path)

    # set corpus reader
    if mode == 'train':
        self.cr = CorpusReaderDialog(dataset_file=dataset_file,
                                     stopwords_file=stopwords_file,
                                     dict_file=dict_file,
                                     word_embedding_file=None,
                                     train_valid_test_rate=self.conf_dict['train_valid_test_rate'],
                                     charset=self.conf_dict['charset'],
                                     max_sentence_word_num=self.conf_dict['max_sentence_word_num'],
                                     min_sentence_word_num=self.conf_dict['min_sentence_word_num'],
                                     is_BEG_available=self.conf_dict['is_BEG'],
                                     is_END_available=self.conf_dict['is_END'])
    else:
        self.cr = CorpusReaderDialog(dataset_file=None,
                                     stopwords_file=stopwords_file,
                                     dict_file=dict_file,
                                     word_embedding_file=None,
                                     train_valid_test_rate=self.conf_dict['train_valid_test_rate'],
                                     charset=self.conf_dict['charset'],
                                     max_sentence_word_num=self.conf_dict['max_sentence_word_num'],
                                     min_sentence_word_num=self.conf_dict['min_sentence_word_num'],
                                     is_BEG_available=self.conf_dict['is_BEG'],
                                     is_END_available=self.conf_dict['is_END'])

    # set model
    self.model = BiEncoderAttentionDecoderNetwork(n_words=len(self.cr.get_word_dictionary()),
                                                  hidden_status_dim=self.conf_dict['hidden_dim'],
                                                  word_embedding_dim=self.conf_dict['word_embedding_dim'],
                                                  input_params=param_dict)

    # create param sets: load every saved checkpoint for this algorithm
    if mode == 'test_all':
        self.model_sets, self.model_paths = list(), list()
        for parent, dirnames, filenames in os.walk(os.path.join(dataset_folder, 'model', 'dialog')):
            for filename in filenames:
                if re.match(algo_name + '_(\w+).model[0-9]+', filename) is not None:
                    self.model_paths.append(filename)
                    param_dict = load_params_val(os.path.join(dataset_folder, 'model', 'dialog', filename))
                    model = BiEncoderAttentionDecoderNetwork(n_words=len(self.cr.get_word_dictionary()),
                                                             hidden_status_dim=self.conf_dict['hidden_dim'],
                                                             word_embedding_dim=self.conf_dict['word_embedding_dim'],
                                                             input_params=param_dict)
                    self.model_sets.append(model)
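# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the project code): the 'test_all' branch
# above collects every checkpoint whose file name looks like
# "<algo_name>_<hash>.model<epoch>".  Below is a minimal, self-contained
# version of that scan; the directory layout and algo name are assumptions,
# and the pattern is escaped/anchored slightly more strictly than the original.
import os
import re


def find_checkpoints(model_dir, algo_name):
    """Return the sorted paths of all numbered checkpoints for `algo_name`."""
    pattern = re.compile(re.escape(algo_name) + r'_(\w+)\.model[0-9]+$')
    found = []
    for parent, _dirnames, filenames in os.walk(model_dir):
        for filename in filenames:
            if pattern.match(filename):
                found.append(os.path.join(parent, filename))
    return sorted(found)
# ---------------------------------------------------------------------------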
def train(self):
    """
    Train a model.
    """
    if self.conf_dict['shuffle']:
        # The data may already be shuffled by some CorpusReader implementations, but NOT ALL.
        self.cr.shuffle()
    n_train_set, n_valid_set, n_test_set = self.cr.get_size()
    n_batches = (n_train_set - 1) / self.conf_dict['batch_size'] + 1

    print 'Compiling model'
    train_model = self.model.get_training_function(self.cr,
                                                   batch_size=self.conf_dict['batch_size'],
                                                   batch_repeat=1)
    valid_model = self.model.get_validing_function(self.cr)
    test_model, pr_model = self.model.get_testing_function(self.cr)

    print ('Start to train.')
    zae, sae, bae = test_model()
    print ('Now testing model. Test Zi Average Error: %s' % (str(zae)))
    print ('Now testing model. Test Sentence Average Error: %s' % (str(sae)))
    print ('Now testing model. Test Batch Average Error: %s' % (str(bae)))

    epoch = 0
    n_epochs = 1000000
    it = 0
    while (epoch < n_epochs):
        epoch += 1
        for i in xrange(n_batches):
            # train on one mini-batch
            train_error = train_model(i)[0]
            # print 'Step error: %f\r' % train_error,
            if math.isnan(train_error):
                # dump the broken parameters and the offending batch, roll back
                # to the last checkpoint, and abort
                print 'Train error is NaN in batch %d' % i
                error_model_path = self.param_path + str(epoch) + '.error'
                save_params_val(error_model_path, self.model.get_parameters())
                print 'model saved in %s , reloading the last checkpoint' % error_model_path
                scope = (self.conf_dict['batch_size'] * i, self.conf_dict['batch_size'] * (i + 1))
                for j in range(scope[0], scope[1]):
                    print '%s\t%s' % (self.cr.train_set[0][j], self.cr.train_set[1][j])
                param_dict = load_params_val(self.param_path)
                self.model.set_parameters(param_dict)
                exit()
                # continue
            if it % self.conf_dict['save_freq'] == 0:
                # valid model
                valid_error = valid_model()[0]
                print ('@iter: %s\tTraining Error: %s\tValid Error: %s.' %
                       (it, str(train_error), str(valid_error)))
                # Save model parameters
                print ('Saving parameters to %s.' % (self.param_path))
                save_params_val(self.param_path, self.model.get_parameters())
            it = it + 1

        # test the model after each epoch
        print 'Finished an epoch.'
        save_params_val(self.param_path + str(epoch), self.model.get_parameters())
        zae, sae, bae = test_model()
        print ('Now testing model. Test Zi Average Error: %s' % (str(zae)))
        print ('Now testing model. Test Sentence Average Error: %s' % (str(sae)))
        print ('Now testing model. Test Batch Average Error: %s' % (str(bae)))
        '''
        pr_error = pr_model()[0]
        test_pr = 0.0
        n_samples = config.globalTestPRSamples()
        n_data = pr_error.shape[0] / n_samples
        trues = [0.0] * (n_samples / 2) + [1.0] * (n_samples / 2)
        for i in range(n_data):
            test_pr += pearsonr(trues, list(pr_error[i * n_samples:(i + 1) * n_samples]))[0]
        test_pr /= n_data
        print ('Now testing model. Test PR: %s' % (str(test_pr)))
        '''
        print ('\n')
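# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the project code): both train() methods
# above use the same recovery pattern when the loss turns NaN -- dump the
# broken parameters for inspection, reload the last saved checkpoint, and
# stop.  The standalone example below mimics that pattern with plain pickle
# instead of the project's save_params_val / load_params_val helpers; every
# name here is a placeholder, not the project's API.
import math
import pickle
import random


def save_params(path, params):
    with open(path, 'wb') as fh:
        pickle.dump(params, fh)


def load_params(path):
    with open(path, 'rb') as fh:
        return pickle.load(fh)


def train_step(params):
    # Stand-in for a real gradient step; returns a fake loss value.
    return random.random()


def run(params, checkpoint_path, n_steps=10, save_freq=2):
    save_params(checkpoint_path, params)  # make sure a checkpoint exists
    for step in range(n_steps):
        loss = train_step(params)
        if math.isnan(loss):
            save_params(checkpoint_path + '.error', params)  # keep the broken state
            params = load_params(checkpoint_path)            # roll back
            break
        if step % save_freq == 0:
            save_params(checkpoint_path, params)
    return params
# ---------------------------------------------------------------------------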
import cPickle
import math

from deep.util.parameter_operation import load_params_val

p = load_params_val('data/human/model/dialog/StyleEncoderDecoderMulti_b498fc540bd33b3d2b33077573e8efe6.model.final')

# def check(l):
#     if isinstance(l, list):
#         for i in l:
#             c = check(i)
#             if c:
#                 return c
#         return False
#     else:
#         return math.isnan(l)
#
# for k, v in p.items():
#     print k, check(v.tolist())

print p

# with open('error', 'wb') as fw:
#     cPickle.dump(p, fw)
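# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the project code): the commented-out
# check() above walks nested lists looking for NaNs.  A self-contained
# equivalent, assuming the parameter values convert to nested Python lists
# (e.g. via numpy's .tolist()):
import math


def contains_nan(value):
    """Return True if a nested list structure contains any NaN."""
    if isinstance(value, list):
        return any(contains_nan(item) for item in value)
    return isinstance(value, float) and math.isnan(value)


assert contains_nan([[1.0, 2.0], [float('nan')]])
assert not contains_nan([[1.0], [2.0, 3.0]])
# ---------------------------------------------------------------------------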
def __init__(self, dataset_folder, dataset_file, dict_file, stopwords_file,
             word_embedding_file, train_rate, valid_rate, test_rate,
             algo_name, charset, mode):
    """
    Need to set these attributes:
        1. conf_dict: configuration of the model.
        2. cr: CorpusReader for operating on the data.
        3. model: the network model.
    """
    self.conf_dict = {'algo_name': algo_name,
                      'batch_size': 256,
                      'train_valid_test_rate': [train_rate, valid_rate, test_rate],
                      'split_level': 'zi',
                      'pre_word_embedding': False,
                      'word_embedding_dim': 128,
                      'n_topics': 5000,
                      'topic_embedding_dim': 256,
                      'max_sentence_word_num': 150,
                      'min_sentence_word_num': 1,
                      'is_BEG': False,
                      'is_END': True,
                      'hidden_dim': 512,
                      'charset': charset,
                      'shuffle': False,
                      'save_freq': 100}
    self.param_path = os.path.join(dataset_folder, 'model', 'dialog',
                                   get_params_file_name(self.conf_dict) + '.model')
    print self.param_path
    # self.param_path = 'ChoEncoderDecoderTopic_5908276eb2ae513520ca72135e5b82d0.model83'
    # self.param_path = 'ChoEncoderDecoderDT_4575b6c5893c10a009e29b6eb2988387.model42'
    # self.param_path = 'ChoEncoderDecoderDT_cc7f5ed5d9e9fe5a90a012f4e017106a.model'
    param_dict = load_params_val(self.param_path)
    self.conf_path = os.path.join(dataset_folder, 'model', 'dialog',
                                  get_params_file_name(self.conf_dict) + '.conf')
    save_confs_val(self.conf_dict, self.conf_path)

    # set corpus reader
    if mode == 'train':
        self.cr = CorpusReaderDialogTopic(dataset_file=dataset_file,
                                          stopwords_file=stopwords_file,
                                          dict_file=dict_file,
                                          word_embedding_file=None,
                                          train_valid_test_rate=self.conf_dict['train_valid_test_rate'],
                                          charset=self.conf_dict['charset'],
                                          max_sentence_word_num=self.conf_dict['max_sentence_word_num'],
                                          min_sentence_word_num=self.conf_dict['min_sentence_word_num'],
                                          is_BEG_available=self.conf_dict['is_BEG'],
                                          is_END_available=self.conf_dict['is_END'])
    else:
        self.cr = CorpusReaderDialogTopic(dataset_file=None,
                                          stopwords_file=stopwords_file,
                                          dict_file=dict_file,
                                          word_embedding_file=None,
                                          train_valid_test_rate=self.conf_dict['train_valid_test_rate'],
                                          charset=self.conf_dict['charset'],
                                          max_sentence_word_num=self.conf_dict['max_sentence_word_num'],
                                          min_sentence_word_num=self.conf_dict['min_sentence_word_num'],
                                          is_BEG_available=self.conf_dict['is_BEG'],
                                          is_END_available=self.conf_dict['is_END'])

    # set model
    self.model = RnnEncoderDecoderNetwork(n_words=len(self.cr.get_word_dictionary()),
                                          hidden_status_dim=self.conf_dict['hidden_dim'],
                                          word_embedding_dim=self.conf_dict['word_embedding_dim'],
                                          n_topics=self.conf_dict['n_topics'],
                                          topic_embedding_dim=self.conf_dict['topic_embedding_dim'],
                                          input_params=param_dict)