def train(self, pred_path=None, loss_log_path=None, csv_log_path=None,
          boost_round_log_path=None, train_seed=None, cv_args=None,
          parameters=None, show_importance=False, save_cv_pred=True,
          save_cv_pred_train=False, save_final_pred=True,
          save_final_pred_train=False, save_csv_log=True, csv_idx=None,
          use_global_valid=False, return_pred_test=False, mode=None,
          param_name_list=None, param_value_list=None, use_custom_obj=False,
          file_name_params=None, append_info=None, loss_fuc=None):
    """Train the model with cross validation and aggregate per-fold results.

    Iterates the CV generator from ``cv_args`` over ``self.x_train`` /
    ``self.y_train``, fits one model per fold, predicts the test set (and
    optionally ``self.x_global_valid``), then averages predictions and
    losses across folds and writes prediction / loss / CSV logs to disk.

    Args:
        pred_path: Root directory for prediction CSV output.
        loss_log_path: Directory for loss log files.
        csv_log_path: Directory for the CSV summary log.
        boost_round_log_path: Directory for per-round boosting logs (only
            used when ``mode == 'auto_train_boost_round'``).
        train_seed: Seed recorded in the saved logs.
        cv_args: dict of CV settings; must contain ``'n_cv'`` and
            ``'cv_seed'``; may contain ``'cv_generator'``.
        parameters: Model parameter dict. Temporarily gains
            ``'num_boost_round'`` for xgb/lgb, removed again before return.
        show_importance: If True, print feature importance per fold.
        save_cv_pred: Save per-fold test predictions under 'cv_results/'.
        save_cv_pred_train: Save per-fold train predictions.
        save_final_pred: Save the averaged test predictions.
        save_final_pred_train: Save the averaged train predictions.
        save_csv_log: Write the CSV summary log.
        csv_idx: Identifier used in the CSV log; defaults to the model name.
        use_global_valid: Also predict/score a held-out global validation set.
        return_pred_test: If True, return the averaged test predictions.
        mode: ``'auto_train_boost_round'`` enables per-round loss logging.
        param_name_list: Names of grid-searched parameters (for logs).
        param_value_list: Values of grid-searched parameters (for logs).
        use_custom_obj: Stored on ``self`` for the fitting routines.
        file_name_params: Extra parameters embedded in output file names.
        append_info: Suffix appended to log/file names; defaults to
            ``'_c-<n_cv>'``.
        loss_fuc: Loss function handed to ``utils.print_loss`` (parameter
            name kept as-is — renaming would break existing callers).

    Returns:
        The averaged test predictions when ``return_pred_test`` is True,
        otherwise ``None``.
    """
    # Check if directories exist or not
    utils.check_dir_model(pred_path, loss_log_path)

    # Global Validation
    self.use_global_valid = use_global_valid

    # Use Custom Objective Function
    self.use_custom_obj = use_custom_obj

    # Deep-copy so popping keys below does not mutate the caller's dict.
    cv_args_copy = copy.deepcopy(cv_args)
    n_cv = cv_args_copy['n_cv']
    cv_seed = cv_args_copy['cv_seed']
    valid_rate = 1/n_cv

    # Append Information
    if append_info is None:
        append_info = '_c-' + str(n_cv)

    if csv_idx is None:
        csv_idx = self.model_name

    # Print Start Information and Get Model Name
    self.print_start_info()

    if use_global_valid:
        print('------------------------------------------------------')
        print('[W] Using Global Validation...')

    cv_count = 0
    pred_test_total = []
    pred_train_total = []
    loss_train_total = []
    loss_valid_total = []
    idx_round = []
    train_loss_round_total = []
    valid_loss_round_total = []
    global_valid_loss_round_total = []
    pred_global_valid_total = []
    loss_global_valid_total = []

    # Get Cross Validation Generator (default: random split)
    if 'cv_generator' in cv_args_copy:
        cv_generator = cv_args_copy['cv_generator']
        if cv_generator is None:
            cv_generator = CrossValidation.random_split
        # Remove the key so it is not forwarded as a generator kwarg.
        cv_args_copy.pop('cv_generator')
    else:
        cv_generator = CrossValidation.random_split
    print('------------------------------------------------------')
    print('[W] Using CV Generator: {}'.format(getattr(cv_generator, '__name__')))

    # Training on Cross Validation Sets
    for x_train, y_train, x_valid, y_valid in cv_generator(x=self.x_train, y=self.y_train, **cv_args_copy):

        # CV Start Time
        cv_start_time = time.time()

        cv_count += 1

        # Fitting and Training Model
        if mode == 'auto_train_boost_round':
            if use_global_valid:
                reg, idx_round_cv, train_loss_round_cv, valid_loss_round_cv, global_valid_loss_round_cv = \
                    self.fit_with_round_log(boost_round_log_path, cv_count, x_train, y_train,
                                            x_valid, y_valid, parameters, param_name_list,
                                            param_value_list, append_info=append_info)
                global_valid_loss_round_total.append(global_valid_loss_round_cv)
            else:
                reg, idx_round_cv, train_loss_round_cv, valid_loss_round_cv = \
                    self.fit_with_round_log(boost_round_log_path, cv_count, x_train, y_train,
                                            x_valid, y_valid, parameters, param_name_list,
                                            param_value_list, append_info=append_info)
            # idx_round is identical across folds, so keeping the last is fine.
            idx_round = idx_round_cv
            train_loss_round_total.append(train_loss_round_cv)
            valid_loss_round_total.append(valid_loss_round_cv)
        else:
            reg = self.fit(x_train, y_train, x_valid, y_valid, parameters)

        # Feature Importance
        if show_importance:
            self.get_importance(reg)

        # Prediction
        if save_cv_pred:
            cv_pred_path = pred_path + 'cv_results/' + self.model_name + '_cv_{}_'.format(cv_count)
        else:
            cv_pred_path = None
        pred_test = self.predict(reg, self.x_test, pred_path=cv_pred_path)

        # Save Train Prediction to CSV File
        if save_cv_pred_train:
            cv_pred_train_path = pred_path + 'cv_pred_train/' + self.model_name + '_cv_{}_'.format(cv_count)
        else:
            cv_pred_train_path = None
        pred_train = self.get_pred_train(reg, x_train, pred_path=cv_pred_train_path)
        pred_train_all = self.get_pred_train(reg, self.x_train, pred_path=cv_pred_train_path)

        # Predict Global Validation Set
        if use_global_valid:
            pred_global_valid = self.predict(reg, self.x_global_valid)
        else:
            pred_global_valid = np.array([])

        # Get Prediction of Validation Set
        pred_valid = self.predict(reg, x_valid)

        # Print LogLoss
        loss_train, loss_valid = utils.print_loss(pred_train, y_train, pred_valid, y_valid, loss_fuc)

        # Print Loss and Accuracy of Global Validation Set
        if use_global_valid:
            loss_global_valid = utils.print_global_valid_loss(pred_global_valid, self.y_global_valid, loss_fuc)
            pred_global_valid_total.append(pred_global_valid)
            loss_global_valid_total.append(loss_global_valid)

        # Save Losses to File
        utils.save_loss_log(loss_log_path + self.model_name + '_', cv_count, parameters,
                            valid_rate, n_cv, loss_train, loss_valid, train_seed, cv_seed)

        pred_test_total.append(pred_test)
        pred_train_total.append(pred_train_all)
        loss_train_total.append(loss_train)
        loss_valid_total.append(loss_valid)

        # CV End Time
        print('------------------------------------------------------')
        print('CV Done! Using Time: {}s'.format(time.time() - cv_start_time))

    print('======================================================')
    print('Calculating Final Result...')

    # Calculate Means of pred and losses across folds
    pred_test_mean, pred_train_mean, loss_train_mean, loss_valid_mean = \
        utils.calculate_means(pred_test_total, pred_train_total, loss_train_total, loss_valid_total)

    # Save Logs of num_boost_round
    if mode == 'auto_train_boost_round':
        if use_global_valid:
            train_loss_round_mean, valid_loss_round_mean, global_valid_loss_round_mean = \
                utils.calculate_boost_round_means(train_loss_round_total, valid_loss_round_total,
                                                  global_valid_loss_round_total=global_valid_loss_round_total)
            self.save_boost_round_log(boost_round_log_path, idx_round, train_loss_round_mean,
                                      valid_loss_round_mean, train_seed, cv_seed, csv_idx,
                                      parameters, param_name_list, param_value_list,
                                      append_info=append_info,
                                      global_valid_loss_round_mean=global_valid_loss_round_mean)
        else:
            train_loss_round_mean, valid_loss_round_mean = \
                utils.calculate_boost_round_means(train_loss_round_total, valid_loss_round_total)
            self.save_boost_round_log(boost_round_log_path, idx_round, train_loss_round_mean,
                                      valid_loss_round_mean, train_seed, cv_seed, csv_idx,
                                      parameters, param_name_list, param_value_list,
                                      append_info=append_info)

    # Save 'num_boost_round' so it appears in the saved parameter logs.
    if self.model_name in ['xgb', 'lgb']:
        parameters['num_boost_round'] = self.num_boost_round

    # Save Final Result
    if save_final_pred:
        # NOTE(review): save_final_pred is passed through as the second
        # positional argument even though we are already inside
        # `if save_final_pred:`; other train() variants in this file omit
        # it — confirm against the save_final_pred signature.
        self.save_final_pred(mode, save_final_pred, pred_test_mean,
                             pred_path, parameters, csv_idx, train_seed, cv_seed,
                             boost_round_log_path, param_name_list, param_value_list,
                             file_name_params=file_name_params, append_info=append_info)

    # Save Final pred_train
    if save_final_pred_train:
        utils.save_pred_train_to_csv(pred_path + 'final_pred_train/' + self.model_name + '_',
                                     pred_train_mean, self.y_train)

    # Print Total Losses
    utils.print_total_loss(loss_train_mean, loss_valid_mean)

    # Save Final Losses to File
    utils.save_final_loss_log(loss_log_path + self.model_name + '_', parameters, valid_rate, n_cv,
                              loss_train_mean, loss_valid_mean, train_seed, cv_seed)

    # Print Global Validation Information and Save
    if use_global_valid:
        # Calculate Means of Predictions and Losses
        loss_global_valid_mean = utils.calculate_global_valid_means(loss_global_valid_total)
        # Save csv log
        if save_csv_log:
            self.save_csv_log(mode, csv_log_path, param_name_list, param_value_list, csv_idx,
                              loss_train_mean, loss_global_valid_mean, train_seed, cv_seed,
                              valid_rate, n_cv, parameters,
                              boost_round_log_path=boost_round_log_path,
                              file_name_params=file_name_params, append_info=append_info,
                              loss_global_valid=loss_global_valid_mean)

    # Save Loss Log to csv File
    if save_csv_log:
        if not use_global_valid:
            self.save_csv_log(mode, csv_log_path, param_name_list, param_value_list, csv_idx,
                              loss_train_mean, loss_valid_mean, train_seed, cv_seed,
                              valid_rate, n_cv, parameters,
                              boost_round_log_path=boost_round_log_path,
                              file_name_params=file_name_params, append_info=append_info)

    # Remove 'num_boost_round' of parameters so the caller's dict is restored.
    if 'num_boost_round' in parameters:
        parameters.pop('num_boost_round')

    # Return Final Result
    if return_pred_test:
        return pred_test_mean
def prejudge_stack_train(self, x_train, x_g_train, y_train, w_train, e_train,
                         x_valid, x_g_valid, y_valid, w_valid, e_valid,
                         x_test, x_g_test, pred_path=None, loss_log_path=None,
                         csv_log_path=None, parameters=None, cv_args=None,
                         train_seed=None, show_importance=False, show_accuracy=False,
                         save_final_pred=True, save_csv_log=True, csv_idx=None,
                         mode=None, file_name_params=None, param_name_list=None,
                         param_value_list=None, append_info=None):
    """Train a TF1 DNN on a single pre-split train/valid set for stacking.

    Builds the network graph, trains for ``self.epochs`` epochs with
    periodic mini-batch validation, then converts the logits to sigmoid
    probabilities for the train / valid / test sets and writes the usual
    prediction and loss logs.

    Args:
        x_train, y_train, w_train, e_train: Train features/labels/weights/eras.
        x_g_train, x_g_valid, x_g_test: Group features — accepted for
            interface compatibility; not used in this body (TODO confirm).
        x_valid, y_valid, w_valid, e_valid: Validation split.
        x_test: Test features.
        pred_path, loss_log_path, csv_log_path: Output directories.
        parameters: Parameter dict recorded in the logs.
        cv_args: dict with 'n_cv', 'cv_seed', and 'n_valid' or 'valid_rate'.
        train_seed: Seed recorded in the logs.
        show_importance: Accepted for interface compatibility (unused here).
        show_accuracy: Forwarded to the accuracy printers.
        save_final_pred: Save the final test probabilities.
        save_csv_log: Write the CSV summary log.
        csv_idx: Identifier used in the CSV log; defaults to the model name.
        mode: Forwarded to save_final_pred/save_csv_log.
        file_name_params: Extra parameters embedded in output file names.
        param_name_list, param_value_list: Grid-search bookkeeping for logs.
        append_info: Accepted for interface compatibility (unused here).

    Returns:
        Tuple ``(prob_valid, prob_test, losses)`` where ``losses`` is
        ``[loss_train, loss_valid, loss_train_w, loss_valid_w]``.

    Raises:
        ValueError: If the training cost becomes NaN.
    """
    # Check if directories exist or not
    utils.check_dir_model(pred_path, loss_log_path)

    # Copy so reads below cannot be affected by caller mutation.
    cv_args_copy = copy.deepcopy(cv_args)
    if 'n_valid' in cv_args:
        n_valid = cv_args_copy['n_valid']
    elif 'valid_rate' in cv_args:
        n_valid = cv_args_copy['valid_rate']
    else:
        n_valid = ''
    n_cv = cv_args_copy['n_cv']
    cv_seed = cv_args_copy['cv_seed']

    if csv_idx is None:
        csv_idx = self.model_name

    # Print Start Information and Get Model Name
    self.print_start_info()
    print('======================================================')
    print('Number of Features: ', x_train.shape[1])
    print('------------------------------------------------------')

    # Build Network (TF1 graph-mode)
    tf.reset_default_graph()
    train_graph = tf.Graph()

    with train_graph.as_default():

        # Inputs
        inputs, labels, weights, lr, keep_prob, is_train = self.input_tensor()

        # Logits
        logits = self.model(inputs, self.unit_number, keep_prob, is_train)
        logits = tf.identity(logits, name='logits')

        # Loss
        with tf.name_scope('Loss'):
            # cost_ = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(logits=logits, labels=labels))
            cost_ = self.log_loss(logits, weights, labels)

        # Optimizer
        optimizer = tf.train.AdamOptimizer(lr).minimize(cost_)

    with tf.Session(graph=train_graph) as sess:

        start_time = time.time()
        sess.run(tf.global_variables_initializer())

        for epoch_i in range(self.epochs):

            batch_counter = 0

            for batch_i, (batch_x, batch_y, batch_w) in enumerate(
                    self.get_batches(x_train, y_train, w_train, self.batch_size)):

                batch_counter += 1

                _, cost_train = sess.run(
                    [optimizer, cost_],
                    {inputs: batch_x, labels: batch_y, weights: batch_w,
                     lr: self.learning_rate, keep_prob: self.keep_probability,
                     is_train: True})

                # NaN loss is unrecoverable — abort so the caller can reseed.
                if str(cost_train) == 'nan':
                    raise ValueError('NaN BUG!!! Try Another Seed!!!')

                # Periodically evaluate the whole validation set in batches.
                if batch_counter % self.display_step == 0 and batch_i > 0:

                    cost_valid_all = []

                    for iii, (valid_batch_x, valid_batch_y, valid_batch_w) in enumerate(
                            self.get_batches(x_valid, y_valid, w_valid, self.batch_size)):

                        cost_valid_i = sess.run(
                            cost_,
                            {inputs: valid_batch_x, labels: valid_batch_y,
                             weights: valid_batch_w, keep_prob: 1.0,
                             is_train: False})
                        cost_valid_all.append(cost_valid_i)

                    cost_valid = sum(cost_valid_all) / len(cost_valid_all)

                    total_time = time.time() - start_time

                    print('Epoch: {}/{} |'.format(epoch_i + 1, self.epochs),
                          'Batch: {} |'.format(batch_counter),
                          'Time: {:>3.2f}s |'.format(total_time),
                          'Train_Loss: {:>.8f} |'.format(cost_train),
                          'Valid_Loss: {:>.8f}'.format(cost_valid))

        # Prediction
        print('------------------------------------------------------')
        print('Predicting Train Probabilities...')

        # Full-batch inference (dropout disabled via keep_prob=1.0).
        logits_pred_train = sess.run(logits, {inputs: x_train, keep_prob: 1.0, is_train: False})
        logits_pred_valid = sess.run(logits, {inputs: x_valid, keep_prob: 1.0, is_train: False})
        logits_pred_test = sess.run(logits, {inputs: x_test, keep_prob: 1.0, is_train: False})

        logits_pred_train = logits_pred_train.flatten()
        logits_pred_valid = logits_pred_valid.flatten()
        logits_pred_test = logits_pred_test.flatten()

        # Sigmoid applied outside the graph to get probabilities.
        prob_train = 1.0 / (1.0 + np.exp(-logits_pred_train))
        prob_valid = 1.0 / (1.0 + np.exp(-logits_pred_valid))
        prob_test = 1.0 / (1.0 + np.exp(-logits_pred_test))

        loss_train, loss_valid, loss_train_w, loss_valid_w = \
            utils.print_loss(prob_train, y_train, w_train, prob_valid, y_valid, w_valid)

        # Save Final Result
        if save_final_pred:
            # NOTE(review): save_final_pred is forwarded as the second
            # positional argument even inside `if save_final_pred:` —
            # confirm against the save_final_pred signature.
            self.save_final_pred(mode, save_final_pred, prob_test, pred_path,
                                 parameters, csv_idx, train_seed, cv_seed,
                                 file_name_params=file_name_params)

        # Print Total Losses
        utils.print_total_loss(loss_train, loss_valid, loss_train_w, loss_valid_w)

        losses = [loss_train, loss_valid, loss_train_w, loss_valid_w]

        # Print and Get Accuracies of CV
        acc_train, acc_valid, acc_train_era, acc_valid_era = \
            utils.print_and_get_accuracy(prob_train, y_train, e_train,
                                         prob_valid, y_valid, e_valid, show_accuracy)

        # Save Final Losses to File
        utils.save_final_loss_log(loss_log_path + self.model_name + '_', parameters,
                                  n_valid, n_cv, loss_train, loss_valid,
                                  loss_train_w, loss_valid_w, train_seed, cv_seed,
                                  acc_train, acc_train_era)

        # Save Loss Log to csv File
        if save_csv_log:
            self.save_csv_log(mode, csv_log_path, param_name_list, param_value_list,
                              csv_idx, loss_train_w, loss_valid_w, acc_train,
                              train_seed, cv_seed, n_valid, n_cv, parameters,
                              file_name_params=file_name_params)

        return prob_valid, prob_test, losses
def train(self, pred_path=None, loss_log_path=None, csv_log_path=None,
          boost_round_log_path=None, train_seed=None, cv_args=None,
          parameters=None, show_importance=False, show_accuracy=False,
          save_cv_pred=True, save_cv_pred_train=False, save_final_pred=True,
          save_final_pred_train=False, save_csv_log=True, csv_idx=None,
          prescale=False, postscale=False, use_global_valid=False,
          return_pred_test=False, mode=None, param_name_list=None,
          param_value_list=None, use_custom_obj=False,
          use_scale_pos_weight=False, file_name_params=None, append_info=None):
    """Train the model with era-aware, sample-weighted cross validation.

    Iterates the CV generator over features/labels/weights/eras, fits one
    model per fold, optionally pre-scales the fold's train set and
    post-scales predictions by the fold's positive-class rate, then
    averages predictions/losses (optionally fold-weighted by
    ``cv_weights``) and writes prediction / loss / CSV logs.

    Args:
        pred_path, loss_log_path, csv_log_path, boost_round_log_path:
            Output directories.
        train_seed: Seed recorded in the saved logs.
        cv_args: CV settings parsed by ``utils.get_cv_args``; may contain
            'cv_generator', 'era_list', 'window_size' and 'cv_weights'.
        parameters: Model parameter dict. May be mutated: 'metric' /
            'eval_metric' are popped when ``postscale`` is True, and
            'scale_pos_weight' is injected for xgb when
            ``use_scale_pos_weight`` is True; 'num_boost_round' is added
            for xgb/lgb then removed before return.
        show_importance: Print feature importance per fold.
        show_accuracy: Forwarded to the accuracy printers.
        save_cv_pred, save_cv_pred_train: Save per-fold predictions.
        save_final_pred, save_final_pred_train: Save averaged predictions.
        save_csv_log: Write the CSV summary log.
        csv_idx: Identifier used in the CSV log; defaults to the model name.
        prescale: Apply ``self.prescale`` to each fold's train split.
        postscale: Multiply predictions by the fold's postscale rate.
        use_global_valid: Also predict/score a held-out global validation set.
        return_pred_test: If True, return the averaged test predictions.
        mode: 'auto_train_boost_round' enables per-round loss logging.
        param_name_list, param_value_list: Grid-search bookkeeping for logs.
        use_custom_obj: Stored on ``self`` for the fitting routines.
        use_scale_pos_weight: Inject xgb 'scale_pos_weight' per fold.
        file_name_params: Extra parameters embedded in output file names.
        append_info: Suffix for log/file names (handled in get_cv_args).

    Returns:
        The averaged test predictions when ``return_pred_test`` is True,
        otherwise ``None``.

    Raises:
        ValueError: If 'cv_weights' length does not equal n_cv.
    """
    # Check if directories exist or not
    utils.check_dir_model(pred_path, loss_log_path)
    utils.check_dir([pred_path, loss_log_path, csv_log_path, boost_round_log_path])

    # Global Validation
    self.use_global_valid = use_global_valid

    # Use Custom Objective Function
    self.use_custom_obj = use_custom_obj

    # Cross Validation Arguments
    cv_args_copy, n_valid, n_cv, n_era, cv_seed = utils.get_cv_args(cv_args, append_info)

    if csv_idx is None:
        csv_idx = self.model_name

    # Print Start Information and Get Model Name
    self.print_start_info()

    if use_global_valid:
        print('------------------------------------------------------')
        print('[W] Using Global Validation...')

    cv_count = 0
    pred_test_total = []
    pred_train_total = []
    loss_train_total = []
    loss_valid_total = []
    loss_train_w_total = []
    loss_valid_w_total = []
    idx_round = []
    train_loss_round_total = []
    valid_loss_round_total = []
    global_valid_loss_round_total = []
    pred_global_valid_total = []
    loss_global_valid_total = []
    loss_global_valid_w_total = []

    # Get Cross Validation Generator (default: era-based k-fold)
    if 'cv_generator' in cv_args_copy:
        cv_generator = cv_args_copy['cv_generator']
        if cv_generator is None:
            cv_generator = CrossValidation.era_k_fold
        cv_args_copy.pop('cv_generator')
    else:
        cv_generator = CrossValidation.era_k_fold
    print('------------------------------------------------------')
    print('[W] Using CV Generator: {}'.format(getattr(cv_generator, '__name__')))

    if 'era_list' in cv_args_copy:
        print('Era List: ', cv_args_copy['era_list'])
    if 'window_size' in cv_args_copy:
        print('Window Size: ', cv_args_copy['window_size'])

    # Optional per-fold weights used when averaging fold results.
    if 'cv_weights' in cv_args_copy:
        cv_weights = cv_args_copy['cv_weights']
        cv_args_copy.pop('cv_weights')
        if cv_weights is not None:
            if len(cv_weights) != n_cv:
                raise ValueError("The length of 'cv_weights'({}) should be equal to 'n_cv'({})!"
                                 .format(len(cv_weights), n_cv))
    else:
        cv_weights = None

    # Training on Cross Validation Sets
    for x_train, y_train, w_train, e_train, x_valid, y_valid, w_valid, e_valid, valid_era \
            in cv_generator(x=self.x_train, y=self.y_train, w=self.w_train, e=self.e_train, **cv_args_copy):

        # CV Start Time
        cv_start_time = time.time()

        cv_count += 1

        # Get Positive Rate of Train Set and postscale Rate
        positive_rate_train, postscale_rate = self.get_postscale_rate(y_train)
        positive_rate_valid, _ = self.get_postscale_rate(y_valid)

        # Remove Metric of Post Scale — scaled outputs would make the
        # built-in eval metric misleading, so it is dropped.
        if postscale:
            self.postscale = True
            self.postscale_rate = postscale_rate
            if 'metric' in parameters.keys():
                parameters.pop('metric')
            if 'eval_metric' in parameters.keys():
                parameters.pop('eval_metric')

        if use_scale_pos_weight:
            if self.model_name == 'xgb':
                parameters['scale_pos_weight'] = postscale_rate

        print('------------------------------------------------------')
        print('Validation Set Era: ', valid_era)
        print('Number of Features: ', x_train.shape[1])
        print('------------------------------------------------------')
        print('Positive Rate of Train Set: {:.6f}'.format(positive_rate_train))
        print('Positive Rate of Valid Set: {:.6f}'.format(positive_rate_valid))
        print('------------------------------------------------------')

        # prescale
        if prescale:
            x_train, y_train, w_train, e_train = self.prescale(x_train, y_train, w_train, e_train)

        # Fitting and Training Model
        if mode == 'auto_train_boost_round':
            if use_global_valid:
                reg, idx_round_cv, train_loss_round_cv, \
                    valid_loss_round_cv, global_valid_loss_round_cv = \
                    self.fit_with_round_log(
                        boost_round_log_path, cv_count, x_train, y_train,
                        w_train, x_valid, y_valid, w_valid, parameters,
                        param_name_list, param_value_list, append_info=append_info)
                global_valid_loss_round_total.append(global_valid_loss_round_cv)
            else:
                reg, idx_round_cv, train_loss_round_cv, valid_loss_round_cv = \
                    self.fit_with_round_log(
                        boost_round_log_path, cv_count, x_train, y_train,
                        w_train, x_valid, y_valid, w_valid, parameters,
                        param_name_list, param_value_list, append_info=append_info)
            idx_round = idx_round_cv
            train_loss_round_total.append(train_loss_round_cv)
            valid_loss_round_total.append(valid_loss_round_cv)
        else:
            reg = self.fit(x_train, y_train, w_train, x_valid, y_valid, w_valid, parameters)

        # Feature Importance
        if show_importance:
            self.get_importance(reg)

        # Prediction
        if save_cv_pred:
            cv_pred_path = \
                pred_path + 'cv_results/' + self.model_name + '_cv_{}_'.format(cv_count)
        else:
            cv_pred_path = None
        pred_test = self.predict(reg, self.x_test, pred_path=cv_pred_path)

        # Save Train Probabilities to CSV File
        if save_cv_pred_train:
            cv_pred_train_path = \
                pred_path + 'cv_pred_train/' + self.model_name + '_cv_{}_'.format(cv_count)
        else:
            cv_pred_train_path = None
        pred_train = self.get_pred_train(reg, x_train, pred_path=cv_pred_train_path)
        pred_train_all = self.get_pred_train(reg, self.x_train, pred_path=cv_pred_train_path)

        # Predict Global Validation Set
        if use_global_valid:
            pred_global_valid = self.predict(reg, self.x_global_valid)
        else:
            pred_global_valid = np.array([])

        # Get Probabilities of Validation Set
        pred_valid = self.predict(reg, x_valid)

        # postscale
        if postscale:
            print('------------------------------------------------------')
            print('[W] PostScaling Results...')
            print('PostScale Rate: {:.6f}'.format(postscale_rate))
            pred_test *= postscale_rate
            pred_train *= postscale_rate
            pred_valid *= postscale_rate
            if use_global_valid:
                pred_global_valid *= postscale_rate

        # Print LogLoss
        print('------------------------------------------------------')
        print('Validation Set Era: ', valid_era)
        loss_train, loss_valid, loss_train_w, loss_valid_w = \
            utils.print_loss(pred_train, y_train, w_train, pred_valid, y_valid, w_valid)

        # Print and Get Accuracies of CV
        acc_train_cv, acc_valid_cv, acc_train_cv_era, acc_valid_cv_era = \
            utils.print_and_get_accuracy(pred_train, y_train, e_train,
                                         pred_valid, y_valid, e_valid, show_accuracy)

        # Print Loss and Accuracy of Global Validation Set
        if use_global_valid:
            loss_global_valid, loss_global_valid_w, acc_global_valid = \
                utils.print_global_valid_loss_and_acc(
                    pred_global_valid, self.y_global_valid, self.w_global_valid)
            pred_global_valid_total.append(pred_global_valid)
            loss_global_valid_total.append(loss_global_valid)
            loss_global_valid_w_total.append(loss_global_valid_w)

        # Save Losses to File
        utils.save_loss_log(
            loss_log_path + self.model_name + '_', cv_count, parameters, n_valid, n_cv,
            valid_era, loss_train, loss_valid, loss_train_w, loss_valid_w, train_seed,
            cv_seed, acc_train_cv, acc_valid_cv, acc_train_cv_era, acc_valid_cv_era)

        pred_test_total.append(pred_test)
        pred_train_total.append(pred_train_all)
        loss_train_total.append(loss_train)
        loss_valid_total.append(loss_valid)
        loss_train_w_total.append(loss_train_w)
        loss_valid_w_total.append(loss_valid_w)

        # CV End Time
        print('------------------------------------------------------')
        print('CV Done! Using Time: {}s'.format(time.time() - cv_start_time))

    print('======================================================')
    print('Calculating Final Result...')

    # Calculate Means of pred and losses (optionally fold-weighted)
    pred_test_mean, pred_train_mean, loss_train_mean, \
        loss_valid_mean, loss_train_w_mean, loss_valid_w_mean = \
        utils.calculate_means(pred_test_total, pred_train_total, loss_train_total,
                              loss_valid_total, loss_train_w_total, loss_valid_w_total,
                              weights=cv_weights)

    # Save 'num_boost_round' so it appears in the saved parameter logs.
    if self.model_name in ['xgb', 'lgb']:
        parameters['num_boost_round'] = self.num_boost_round

    # Calculate Profit
    # NOTE(review): profit is a hard-coded placeholder here — confirm
    # whether a real profit computation was intended.
    profit = 0

    # Save Logs of num_boost_round
    if mode == 'auto_train_boost_round':
        if use_global_valid:
            train_loss_round_mean, valid_loss_round_mean, global_valid_loss_round_mean = \
                utils.calculate_boost_round_means(
                    train_loss_round_total, valid_loss_round_total, weights=cv_weights,
                    global_valid_loss_round_total=global_valid_loss_round_total)
            self.save_boost_round_log(
                boost_round_log_path, idx_round, train_loss_round_mean,
                valid_loss_round_mean, train_seed, cv_seed, csv_idx, parameters,
                param_name_list, param_value_list, append_info=append_info,
                global_valid_loss_round_mean=global_valid_loss_round_mean, profit=profit)
        else:
            train_loss_round_mean, valid_loss_round_mean = \
                utils.calculate_boost_round_means(
                    train_loss_round_total, valid_loss_round_total, weights=cv_weights)
            self.save_boost_round_log(
                boost_round_log_path, idx_round, train_loss_round_mean,
                valid_loss_round_mean, train_seed, cv_seed, csv_idx, parameters,
                param_name_list, param_value_list, append_info=append_info, profit=profit)

    # Save Final Result
    if save_final_pred:
        self.save_final_pred(
            mode, pred_test_mean, pred_path, parameters, csv_idx, train_seed,
            cv_seed, boost_round_log_path, param_name_list, param_value_list,
            file_name_params=file_name_params, append_info=append_info)

    # Save Final pred_train
    if save_final_pred_train:
        utils.save_pred_train_to_csv(pred_path + 'final_pred_train/' + self.model_name + '_',
                                     pred_train_mean, self.y_train)

    # Print Total Losses
    utils.print_total_loss(loss_train_mean, loss_valid_mean,
                           loss_train_w_mean, loss_valid_w_mean, profit=profit)

    # Print and Get Accuracies of CV of All Train Set
    acc_train, acc_train_era = \
        utils.print_and_get_train_accuracy(pred_train_mean, self.y_train,
                                           self.e_train, show_accuracy)

    # Save Final Losses to File
    utils.save_final_loss_log(
        loss_log_path + self.model_name + '_', parameters, n_valid, n_cv,
        loss_train_mean, loss_valid_mean, loss_train_w_mean, loss_valid_w_mean,
        train_seed, cv_seed, acc_train, acc_train_era)

    # Print Global Validation Information and Save
    if use_global_valid:
        # Calculate Means of Probabilities and Losses
        pred_global_valid_mean, loss_global_valid_mean, loss_global_valid_w_mean = \
            utils.calculate_global_valid_means(pred_global_valid_total,
                                               loss_global_valid_total,
                                               loss_global_valid_w_total,
                                               weights=cv_weights)
        # Print Loss and Accuracy
        acc_total_global_valid = \
            utils.print_total_global_valid_loss_and_acc(
                pred_global_valid_mean, self.y_global_valid,
                loss_global_valid_mean, loss_global_valid_w_mean)
        # Save csv log
        if save_csv_log:
            self.save_csv_log(
                mode, csv_log_path, param_name_list, param_value_list, csv_idx,
                loss_train_w_mean, loss_valid_w_mean, acc_train, train_seed,
                cv_seed, n_valid, n_cv, parameters,
                boost_round_log_path=boost_round_log_path,
                file_name_params=file_name_params, append_info=append_info,
                loss_global_valid=loss_global_valid_w_mean,
                acc_global_valid=acc_total_global_valid, profit=profit)

    # Save Loss Log to csv File
    if save_csv_log:
        if not use_global_valid:
            self.save_csv_log(
                mode, csv_log_path, param_name_list, param_value_list, csv_idx,
                loss_train_w_mean, loss_valid_w_mean, acc_train, train_seed,
                cv_seed, n_valid, n_cv, parameters,
                boost_round_log_path=boost_round_log_path,
                file_name_params=file_name_params, append_info=append_info,
                profit=profit)

    # Remove 'num_boost_round' of parameters so the caller's dict is restored.
    if 'num_boost_round' in parameters:
        parameters.pop('num_boost_round')

    # Return Final Result
    if return_pred_test:
        return pred_test_mean
def train(self, pred_path=None, loss_log_path=None, csv_log_path=None,
          boost_round_log_path=None, train_seed=None, cv_args=None,
          parameters=None, show_importance=False, show_accuracy=False,
          save_cv_pred=True, save_cv_prob_train=False, save_final_pred=True,
          save_final_prob_train=False, save_csv_log=True, csv_idx=None,
          prescale=False, postscale=False, use_global_valid=False,
          return_prob_test=False, mode=None, param_name_list=None,
          param_value_list=None, use_custom_obj=False,
          use_scale_pos_weight=False, file_name_params=None, append_info=None):
    """Train the TF1 DNN with era-aware, sample-weighted cross validation.

    Builds the network graph once, then for each CV fold trains inside a
    single TF session, predicts probabilities for train / valid / test
    (and optionally a global validation set), averages fold results
    (optionally fold-weighted by 'cv_weights') and writes prediction,
    loss and CSV logs.

    Fix vs. previous revision: ``file_name_params`` is now forwarded to
    ``self.save_final_pred`` instead of being hard-coded to ``None``
    (every other ``train`` variant and the ``save_csv_log`` calls in this
    same function forward it).

    Args:
        pred_path, loss_log_path, csv_log_path, boost_round_log_path:
            Output directories.
        train_seed: Seed recorded in the saved logs.
        cv_args: dict with 'n_cv', 'n_era', 'cv_seed', and 'n_valid' or
            'valid_rate'; may contain 'cv_generator', 'era_list',
            'window_size', 'cv_weights'.
        parameters: Parameter dict recorded in the logs.
        show_importance, use_custom_obj, use_scale_pos_weight,
        save_cv_prob_train: Accepted for interface compatibility with the
            tree-model trainers; not used by the DNN body.
        show_accuracy: Forwarded to the accuracy printers.
        save_cv_pred: Save per-fold test probabilities.
        save_final_pred: Save the averaged test probabilities.
        save_final_prob_train: Save the averaged train probabilities.
        save_csv_log: Write the CSV summary log.
        csv_idx: Identifier used in the CSV log; defaults to the model name.
        prescale: Apply ``self.prescale`` to each fold's train split.
        postscale: Multiply probabilities by the fold's postscale rate.
        use_global_valid: Also predict/score a held-out global validation set.
        return_prob_test: If True, return the averaged test probabilities.
        mode: 'auto_train_boost_round' enables per-round loss logging.
        param_name_list, param_value_list: Grid-search bookkeeping for logs.
        file_name_params: Extra parameters embedded in output file names.
        append_info: Suffix for log/file names; defaults to
            'v-<n_valid>_c-<n_cv>_e-<n_era>' (plus '_w-<window_size>').

    Returns:
        The averaged test probabilities when ``return_prob_test`` is True,
        otherwise ``None``.
    """
    # Check if directories exist or not
    utils.check_dir_model(pred_path, loss_log_path)

    # Global Validation
    self.use_global_valid = use_global_valid

    # Deep-copy so popping keys below does not mutate the caller's dict.
    cv_args_copy = copy.deepcopy(cv_args)
    if 'n_valid' in cv_args:
        n_valid = cv_args_copy['n_valid']
    elif 'valid_rate' in cv_args:
        n_valid = cv_args_copy['valid_rate']
    else:
        n_valid = ''
    n_cv = cv_args_copy['n_cv']
    n_era = cv_args_copy['n_era']
    cv_seed = cv_args_copy['cv_seed']

    # Append Information
    if append_info is None:
        append_info = 'v-' + str(n_valid) + '_c-' + str(n_cv) + '_e-' + str(n_era)
        if 'window_size' in cv_args_copy:
            append_info += '_w-' + str(cv_args_copy['window_size'])

    if csv_idx is None:
        csv_idx = self.model_name

    # Build Network (TF1 graph-mode)
    tf.reset_default_graph()
    train_graph = tf.Graph()

    with train_graph.as_default():

        # Inputs
        inputs, labels, weights, lr, keep_prob, is_training = self.input_tensor()

        # Logits
        logits = self.model(inputs, self.unit_number, keep_prob, is_training)
        logits = tf.identity(logits, name='logits')

        # Loss
        with tf.name_scope('Loss'):
            # cost_ = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(logits=logits, labels=labels))
            cost_ = self.log_loss(logits, weights, labels)

        # Optimizer
        optimizer = tf.train.AdamOptimizer(lr).minimize(cost_)

    # Training
    self.print_start_info()

    if use_global_valid:
        print('------------------------------------------------------')
        print('[W] Using Global Validation...')

    with tf.Session(graph=train_graph) as sess:

        # Merge all the summaries
        merged = tf.summary.merge_all()

        start_time = time.time()
        cv_counter = 0
        prob_test_total = []
        prob_train_total = []
        loss_train_total = []
        loss_valid_total = []
        loss_train_w_total = []
        loss_valid_w_total = []
        idx_round = []
        train_loss_round_total = []
        valid_loss_round_total = []
        prob_global_valid_total = []
        loss_global_valid_total = []
        loss_global_valid_w_total = []

        # Get Cross Validation Generator (default: era-based k-fold)
        if 'cv_generator' in cv_args_copy:
            cv_generator = cv_args_copy['cv_generator']
            if cv_generator is None:
                cv_generator = CrossValidation.era_k_fold
            cv_args_copy.pop('cv_generator')
        else:
            cv_generator = CrossValidation.era_k_fold
        print('------------------------------------------------------')
        print('[W] Using CV Generator: {}'.format(getattr(cv_generator, '__name__')))

        if 'era_list' in cv_args_copy:
            print('Era List: ', cv_args_copy['era_list'])
        if 'window_size' in cv_args_copy:
            print('Window Size: ', cv_args_copy['window_size'])

        # Optional per-fold weights used when averaging fold results.
        if 'cv_weights' in cv_args_copy:
            cv_weights = cv_args_copy['cv_weights']
            cv_args_copy.pop('cv_weights')
        else:
            cv_weights = None

        for x_train, y_train, w_train, e_train, x_valid, y_valid, w_valid, e_valid, valid_era \
                in cv_generator(self.x_train, self.y_train, self.w_train, self.e_train, **cv_args_copy):

            # CV Start Time
            cv_start_time = time.time()

            cv_counter += 1

            # Get Positive Rate of Train Set and postscale Rate
            positive_rate_train, postscale_rate = self.get_postscale_rate(y_train)
            positive_rate_valid, _ = self.get_postscale_rate(y_valid)

            print('------------------------------------------------------')
            print('Number of Features: ', x_train.shape[1])
            print('Validation Set Era: ', valid_era)
            print('------------------------------------------------------')
            print('Positive Rate of Train Set: ', positive_rate_train)
            print('Positive Rate of Valid Set: ', positive_rate_valid)
            print('------------------------------------------------------')

            # prescale
            if prescale:
                x_train, y_train, w_train, e_train = self.prescale(x_train, y_train, w_train, e_train)

            # Training
            if mode == 'auto_train_boost_round':
                idx_round_cv, train_loss_round_cv, valid_loss_round_cv = \
                    self.train_with_round_log(
                        boost_round_log_path, sess, cv_counter, x_train, y_train,
                        w_train, x_valid, y_valid, w_valid, optimizer, merged,
                        cost_, inputs, labels, weights, lr, keep_prob,
                        is_training, start_time, param_name_list,
                        param_value_list, append_info=append_info)
                idx_round = idx_round_cv
                train_loss_round_total.append(train_loss_round_cv)
                valid_loss_round_total.append(valid_loss_round_cv)
            else:
                self.trainer(sess, cv_counter, x_train, y_train, w_train,
                             x_valid, y_valid, w_valid, optimizer, merged,
                             cost_, inputs, labels, weights, lr, keep_prob,
                             is_training, start_time)

            # Save Model
            # print('Saving model...')
            # saver = tf.train.Saver()
            # saver.save(sess, self.save_path + 'model.' + self.version + '.ckpt')

            # Prediction
            print('------------------------------------------------------')
            print('Predicting Probabilities...')
            prob_train = self.get_prob(sess, logits, x_train, self.batch_size,
                                       inputs, keep_prob, is_training)
            prob_train_all = self.get_prob(sess, logits, self.x_train, self.batch_size,
                                           inputs, keep_prob, is_training)
            prob_valid = self.get_prob(sess, logits, x_valid, self.batch_size,
                                       inputs, keep_prob, is_training)
            prob_test = self.get_prob(sess, logits, self.x_test, self.batch_size,
                                      inputs, keep_prob, is_training)

            # Predict Global Validation Set
            if use_global_valid:
                prob_global_valid = self.get_prob(sess, logits, self.x_global_valid,
                                                  self.batch_size, inputs, keep_prob, is_training)
            else:
                prob_global_valid = np.array([])

            # postscale
            if postscale:
                print('------------------------------------------------------')
                print('[W] PostScaling Results...')
                print('PostScale Rate: {:.6f}'.format(postscale_rate))
                prob_test *= postscale_rate
                prob_train *= postscale_rate
                prob_valid *= postscale_rate
                if use_global_valid:
                    prob_global_valid *= postscale_rate

            # Print Losses of CV
            loss_train, loss_valid, loss_train_w, loss_valid_w = \
                utils.print_loss(prob_train, y_train, w_train, prob_valid, y_valid, w_valid)

            prob_test_total.append(prob_test)
            prob_train_total.append(prob_train_all)
            loss_train_total.append(loss_train)
            loss_valid_total.append(loss_valid)
            loss_train_w_total.append(loss_train_w)
            loss_valid_w_total.append(loss_valid_w)

            # Print and Get Accuracies of CV
            acc_train_cv, acc_valid_cv, acc_train_cv_era, acc_valid_cv_era = \
                utils.print_and_get_accuracy(prob_train, y_train, e_train,
                                             prob_valid, y_valid, e_valid, show_accuracy)

            # Print Loss and Accuracy of Global Validation Set
            if use_global_valid:
                loss_global_valid, loss_global_valid_w, acc_global_valid = \
                    utils.print_global_valid_loss_and_acc(prob_global_valid,
                                                          self.y_global_valid,
                                                          self.w_global_valid)
                prob_global_valid_total.append(prob_global_valid)
                loss_global_valid_total.append(loss_global_valid)
                loss_global_valid_w_total.append(loss_global_valid_w)

            # Save losses
            # NOTE(review): this uses self.parameters while the csv log below
            # uses the `parameters` argument — confirm whether intentional.
            utils.save_loss_log(loss_log_path + self.model_name + '_', cv_counter,
                                self.parameters, n_valid, n_cv, valid_era,
                                loss_train, loss_valid, loss_train_w, loss_valid_w,
                                train_seed, cv_seed, acc_train_cv, acc_valid_cv,
                                acc_train_cv_era, acc_valid_cv_era)

            if save_cv_pred:
                utils.save_pred_to_csv(
                    pred_path + 'cv_results/' + self.model_name + '_cv_{}_'.format(cv_counter),
                    self.id_test, prob_test)

            # CV End Time
            print('------------------------------------------------------')
            print('CV Done! Using Time: {}s'.format(time.time() - cv_start_time))

        # Final Result
        print('======================================================')
        print('Calculating Final Result...')

        # Calculate Means of prob and losses (optionally fold-weighted)
        prob_test_mean, prob_train_mean, loss_train_mean, loss_valid_mean, loss_train_w_mean, loss_valid_w_mean = \
            utils.calculate_means(prob_test_total, prob_train_total,
                                  loss_train_total, loss_valid_total,
                                  loss_train_w_total, loss_valid_w_total,
                                  weights=cv_weights)

        # Save Logs of num_boost_round
        if mode == 'auto_train_boost_round':
            # Folds may early-stop at different lengths — truncate all
            # per-round series to the shortest fold before averaging.
            min_round_len = min(len(train_loss_cv) for train_loss_cv in train_loss_round_total)
            idx_round = idx_round[:min_round_len]
            train_loss_round_total = [train_loss[:min_round_len]
                                      for train_loss in train_loss_round_total]
            valid_loss_round_total = [valid_loss[:min_round_len]
                                      for valid_loss in valid_loss_round_total]
            train_loss_round_mean, valid_loss_round_mean = \
                utils.calculate_boost_round_means(train_loss_round_total,
                                                  valid_loss_round_total,
                                                  weights=cv_weights)
            self.save_boost_round_log(boost_round_log_path, idx_round,
                                      train_loss_round_mean, valid_loss_round_mean,
                                      train_seed, cv_seed, csv_idx, parameters,
                                      param_name_list, param_value_list,
                                      append_info=append_info)

        # Save Final Result
        if save_final_pred:
            # Fixed: forward file_name_params instead of hard-coded None.
            self.save_final_pred(mode, save_final_pred, prob_test_mean,
                                 pred_path, parameters, csv_idx, train_seed,
                                 cv_seed, boost_round_log_path, param_name_list,
                                 param_value_list,
                                 file_name_params=file_name_params,
                                 append_info=append_info)

        # Save Final prob_train
        if save_final_prob_train:
            utils.save_prob_train_to_csv(
                pred_path + 'final_prob_train/' + self.model_name + '_',
                prob_train_mean, self.y_train)

        # Print Total Losses
        utils.print_total_loss(loss_train_mean, loss_valid_mean,
                               loss_train_w_mean, loss_valid_w_mean)

        # Print and Get Accuracies of CV of All Train Set
        acc_train, acc_train_era = \
            utils.print_and_get_train_accuracy(prob_train_mean, self.y_train,
                                               self.e_train, show_accuracy)

        # Save Final Losses to File
        utils.save_final_loss_log(loss_log_path + self.model_name + '_',
                                  self.parameters, n_valid, n_cv,
                                  loss_train_mean, loss_valid_mean,
                                  loss_train_w_mean, loss_valid_w_mean,
                                  train_seed, cv_seed, acc_train, acc_train_era)

        # Print Global Validation Information and Save
        if use_global_valid:
            # Calculate Means of Probabilities and Losses
            prob_global_valid_mean, loss_global_valid_mean, loss_global_valid_w_mean = \
                utils.calculate_global_valid_means(prob_global_valid_total,
                                                   loss_global_valid_total,
                                                   loss_global_valid_w_total,
                                                   weights=cv_weights)
            # Print Loss and Accuracy
            acc_total_global_valid = \
                utils.print_total_global_valid_loss_and_acc(prob_global_valid_mean,
                                                            self.y_global_valid,
                                                            loss_global_valid_mean,
                                                            loss_global_valid_w_mean)
            # Save csv log
            if save_csv_log:
                self.save_csv_log(
                    mode, csv_log_path, param_name_list, param_value_list,
                    csv_idx, loss_train_w_mean, loss_valid_w_mean, acc_train,
                    train_seed, cv_seed, n_valid, n_cv, parameters,
                    boost_round_log_path=boost_round_log_path,
                    file_name_params=file_name_params, append_info=append_info,
                    loss_global_valid=loss_global_valid_w_mean,
                    acc_global_valid=acc_total_global_valid)

        # Save Loss Log to csv File
        if save_csv_log:
            if not use_global_valid:
                self.save_csv_log(mode, csv_log_path, param_name_list,
                                  param_value_list, csv_idx, loss_train_w_mean,
                                  loss_valid_w_mean, acc_train, train_seed,
                                  cv_seed, n_valid, n_cv, parameters,
                                  file_name_params=file_name_params,
                                  append_info=append_info)

        # Return Final Result
        if return_prob_test:
            return prob_test_mean
def train(self, pred_path=None, loss_log_path=None, csv_log_path=None,
          boost_round_log_path=None, train_seed=None, cv_args=None,
          parameters=None, show_importance=False, save_cv_pred=True,
          save_cv_pred_train=False, save_final_pred=True,
          save_final_pred_train=False, save_csv_log=True, csv_idx=None,
          use_global_valid=False, return_pred_test=False, mode=None,
          param_name_list=None, param_value_list=None, use_custom_obj=False,
          file_name_params=None, append_info=None, loss_fuc=None):
    """Train the TensorFlow network with cross-validation and log results.

    Builds the graph (RMSE cost + Adam optimizer), trains one model per CV
    fold inside a single ``tf.Session``, predicts on the fold's train/valid
    sets, the full training set and the test set, averages fold results,
    and writes prediction files and loss/CSV logs via ``utils``.

    Args:
        pred_path: Directory prefix for prediction CSV files.
        loss_log_path: Directory prefix for loss log files.
        csv_log_path: Directory prefix for the CSV summary log.
        boost_round_log_path: Log path used in 'auto_train_boost_round' mode.
        train_seed: Seed recorded in logs (training randomness).
        cv_args: Dict with at least 'n_cv' and 'cv_seed'; may carry a
            'cv_generator' callable and extra kwargs for that generator.
        parameters: Model parameter dict, recorded in logs. May be mutated
            temporarily: a 'num_boost_round' key is added for 'xgb'/'lgb'
            model names and popped again before returning.
        show_importance: Unused in this implementation (kept for a uniform
            interface across model classes).
        save_cv_pred: Save per-fold test predictions to CSV.
        save_cv_pred_train: Unused in this implementation.
        save_final_pred / save_final_pred_train: Save averaged predictions.
        save_csv_log: Append a row to the CSV summary log.
        csv_idx: Row index for the CSV log; defaults to the model name.
        use_global_valid: Also evaluate on the held-out global validation
            set (``self.x_global_valid`` / ``self.y_global_valid``).
        return_pred_test: If True, return the mean test prediction.
        mode: 'auto_train_boost_round' enables per-round loss logging.
        param_name_list / param_value_list: Hyper-parameter names/values
            being searched, for log file naming.
        use_custom_obj: Unused in this implementation.
        file_name_params: Extra parameters to embed in log file names.
        loss_fuc: Loss function for per-fold train/valid losses.
            NOTE(review): global-validation losses hard-code
            ``self.rmse_loss`` instead — confirm this asymmetry is intended.

    Returns:
        Mean test prediction (np.ndarray) if ``return_pred_test``, else None.
    """
    # Check if directories exist or not
    utils.check_dir_model(pred_path, loss_log_path)

    # Global Validation
    self.use_global_valid = use_global_valid

    # Work on a deep copy so the caller's cv_args dict is never mutated
    # (we pop 'cv_generator' below).
    cv_args_copy = copy.deepcopy(cv_args)
    n_cv = cv_args_copy['n_cv']
    cv_seed = cv_args_copy['cv_seed']
    valid_rate = 1 / float(n_cv)

    # Append Information
    if append_info is None:
        append_info = 'v-' + str(valid_rate) + '_c-' + str(n_cv)
    if csv_idx is None:
        csv_idx = self.model_name

    # Build Network
    tf.reset_default_graph()
    train_graph = tf.Graph()
    with train_graph.as_default():
        # Inputs
        inputs, labels, lr, keep_prob, is_training = self.input_tensor()

        # Logits
        logits = self.model(inputs, self.unit_number, keep_prob, is_training)
        logits = tf.identity(logits, name='logits')

        # Loss: RMSE between logits and labels
        with tf.name_scope('Loss'):
            cost_ = tf.sqrt(tf.reduce_mean(tf.square(logits - labels)))
            # cost_ = tf.reduce_mean(tf.square(logits - labels))

        # Optimizer
        optimizer = tf.train.AdamOptimizer(lr).minimize(cost_)

    # Training
    self.print_start_info()

    if use_global_valid:
        print('------------------------------------------------------')
        print('[W] Using Global Validation...')

    with tf.Session(graph=train_graph) as sess:

        # Merge all the summaries
        merged = tf.summary.merge_all()

        start_time = time.time()
        cv_count = 0
        pred_test_total = []
        pred_train_total = []
        loss_train_total = []
        loss_valid_total = []
        idx_round = []
        train_loss_round_total = []
        valid_loss_round_total = []
        global_valid_loss_round_total = []
        pred_global_valid_total = []
        loss_global_valid_total = []

        # Get Cross Validation Generator
        if 'cv_generator' in cv_args_copy:
            cv_generator = cv_args_copy['cv_generator']
            if cv_generator is None:
                cv_generator = CrossValidation.random_split
            # Remove it so the remaining kwargs can be forwarded verbatim.
            cv_args_copy.pop('cv_generator')
        else:
            cv_generator = CrossValidation.random_split
        print('------------------------------------------------------')
        print('[W] Using CV Generator: {}'.format(
            getattr(cv_generator, '__name__')))

        # Training on Cross Validation Sets
        for x_train, y_train, x_valid, y_valid in cv_generator(
                x=self.x_train, y=self.y_train, **cv_args_copy):

            # CV Start Time
            cv_start_time = time.time()
            cv_count += 1

            # Training
            if mode == 'auto_train_boost_round':
                # Per-round loss logging variant; idx_round is overwritten
                # each fold and truncated/aligned downstream.
                idx_round_cv, train_loss_round_cv, valid_loss_round_cv = \
                    self.train_with_round_log(
                        boost_round_log_path, sess, cv_count, x_train,
                        y_train, x_valid, y_valid, optimizer, merged, cost_,
                        inputs, labels, lr, keep_prob, is_training,
                        start_time, param_name_list, param_value_list,
                        append_info=append_info)
                idx_round = idx_round_cv
                train_loss_round_total.append(train_loss_round_cv)
                valid_loss_round_total.append(valid_loss_round_cv)
            else:
                self.trainer(sess, cv_count, x_train, y_train, x_valid,
                             y_valid, optimizer, merged, cost_, inputs,
                             labels, lr, keep_prob, is_training, start_time)

            # Save Model
            # print('Saving model...')
            # saver = tf.train.Saver()
            # saver.save(sess, self.save_path + 'model.' + self.version + '.ckpt')

            # Prediction
            print('------------------------------------------------------')
            print('Predicting Probabilities...')
            pred_train = self.get_pred(sess, logits, x_train,
                                       self.batch_size, inputs, keep_prob,
                                       is_training)
            pred_train_all = self.get_pred(sess, logits, self.x_train,
                                           self.batch_size, inputs,
                                           keep_prob, is_training)
            pred_valid = self.get_pred(sess, logits, x_valid,
                                       self.batch_size, inputs, keep_prob,
                                       is_training)
            pred_test = self.get_pred(sess, logits, self.x_test,
                                      self.batch_size, inputs, keep_prob,
                                      is_training)

            # Predict Global Validation Set
            if use_global_valid:
                pred_global_valid = self.get_pred(
                    sess, logits, self.x_global_valid, self.batch_size,
                    inputs, keep_prob, is_training)
            else:
                pred_global_valid = np.array([])

            # Print Losses of CV
            loss_train, loss_valid = utils.print_loss(
                pred_train, y_train, pred_valid, y_valid, loss_fuc)

            # Print Loss and Accuracy of Global Validation Set
            if use_global_valid:
                loss_global_valid = utils.print_global_valid_loss(
                    pred_global_valid, self.y_global_valid, self.rmse_loss)
                pred_global_valid_total.append(pred_global_valid)
                loss_global_valid_total.append(loss_global_valid)

            # Save Losses to File
            utils.save_loss_log(loss_log_path + self.model_name + '_',
                                cv_count, parameters, valid_rate, n_cv,
                                loss_train, loss_valid, train_seed, cv_seed)

            pred_test_total.append(pred_test)
            pred_train_total.append(pred_train_all)
            loss_train_total.append(loss_train)
            loss_valid_total.append(loss_valid)

            if save_cv_pred:
                utils.save_pred_to_csv(
                    pred_path + 'cv_results/' + self.model_name +
                    '_cv_{}_'.format(cv_count), self.id_test, pred_test)

            # CV End Time
            print('------------------------------------------------------')
            print('CV Done! Using Time: {}s'.format(
                time.time() - cv_start_time))

        # Final Result
        print('======================================================')
        print('Calculating Final Result...')

        # Calculate Means of pred and losses
        pred_test_mean, pred_train_mean, loss_train_mean, loss_valid_mean = \
            utils.calculate_means(pred_test_total, pred_train_total,
                                  loss_train_total, loss_valid_total)

        # Save Logs of num_boost_round
        if mode == 'auto_train_boost_round':
            if use_global_valid:
                train_loss_round_mean, valid_loss_round_mean, \
                    global_valid_loss_round_mean = \
                    utils.calculate_boost_round_means(
                        train_loss_round_total, valid_loss_round_total,
                        global_valid_loss_round_total=global_valid_loss_round_total)
                self.save_boost_round_log(
                    boost_round_log_path, idx_round, train_loss_round_mean,
                    valid_loss_round_mean, train_seed, cv_seed, csv_idx,
                    parameters, param_name_list, param_value_list,
                    append_info=append_info,
                    global_valid_loss_round_mean=global_valid_loss_round_mean)
            else:
                train_loss_round_mean, valid_loss_round_mean = \
                    utils.calculate_boost_round_means(
                        train_loss_round_total, valid_loss_round_total)
                self.save_boost_round_log(
                    boost_round_log_path, idx_round, train_loss_round_mean,
                    valid_loss_round_mean, train_seed, cv_seed, csv_idx,
                    parameters, param_name_list, param_value_list,
                    append_info=append_info)

        # Save 'num_boost_round'
        # NOTE(review): xgb/lgb-specific branch kept for interface parity
        # with the boosted-tree models — confirm self.num_boost_round exists
        # when model_name is 'xgb'/'lgb'.
        if self.model_name in ['xgb', 'lgb']:
            parameters['num_boost_round'] = self.num_boost_round

        # Save Final Result
        if save_final_pred:
            self.save_final_pred(
                mode, save_final_pred, pred_test_mean, pred_path,
                parameters, csv_idx, train_seed, cv_seed,
                boost_round_log_path, param_name_list, param_value_list,
                file_name_params=file_name_params, append_info=append_info)

        # Save Final pred_train
        if save_final_pred_train:
            utils.save_pred_train_to_csv(
                pred_path + 'final_pred_train/' + self.model_name + '_',
                pred_train_mean, self.y_train)

        # Print Total Losses
        utils.print_total_loss(loss_train_mean, loss_valid_mean)

        # Save Final Losses to File
        utils.save_final_loss_log(
            loss_log_path + self.model_name + '_', parameters, valid_rate,
            n_cv, loss_train_mean, loss_valid_mean, train_seed, cv_seed)

        # Print Global Validation Information and Save
        if use_global_valid:
            # Calculate Means of Predictions and Losses
            loss_global_valid_mean = utils.calculate_global_valid_means(
                loss_global_valid_total)
            # Save csv log
            if save_csv_log:
                self.save_csv_log(
                    mode, csv_log_path, param_name_list, param_value_list,
                    csv_idx, loss_train_mean, loss_global_valid_mean,
                    train_seed, cv_seed, valid_rate, n_cv, parameters,
                    boost_round_log_path=boost_round_log_path,
                    file_name_params=file_name_params,
                    append_info=append_info,
                    loss_global_valid=loss_global_valid_mean)

        # Save Loss Log to csv File
        if save_csv_log:
            if not use_global_valid:
                self.save_csv_log(
                    mode, csv_log_path, param_name_list, param_value_list,
                    csv_idx, loss_train_mean, loss_valid_mean, train_seed,
                    cv_seed, valid_rate, n_cv, parameters,
                    boost_round_log_path=boost_round_log_path,
                    file_name_params=file_name_params,
                    append_info=append_info)

        # Remove 'num_boost_round' of parameters (restore caller's dict)
        if 'num_boost_round' in parameters:
            parameters.pop('num_boost_round')

        # Return Final Result
        if return_pred_test:
            return pred_test_mean