def run_experiment(self, exp_list):
    """Run every experiment in a process pool and score the summed ensemble.

    Each experiment dict gets a freshly named log file, stored under the
    key ``'nn'`` and appended to ``self.log_files``, before being handed to
    ``self.csdnn_helper`` in a worker process.  The class scores returned
    for the test set by every experiment are summed, and the arg-max of the
    sum is taken as the ensemble prediction.

    Args:
        exp_list: non-empty list of experiment parameter dicts.  Each must
            contain ``'exp_name'``; the first one must also contain
            ``'test_path'``, which is where the reference labels are
            loaded from.  The dicts are mutated (``'nn'`` is added).

    Returns:
        A tuple ``(eval_result, [TPR, TNR], confusion_matrix)`` where
        ``eval_result`` holds the first element of every worker result.

    Raises:
        ValueError: if ``exp_list`` is empty.
    """
    if not exp_list:
        # Without this guard the failure surfaces later as an axis error
        # on the empty probability sum, which hides the real cause.
        raise ValueError('exp_list must contain at least one experiment')

    for experiment in exp_list:
        log_name = (self.log_dir + '/' + experiment['exp_name'] + '_' +
                    str(random.random()) + '.txt')
        self.log_files.append(log_name)
        experiment['nn'] = log_name

    # Create the pool only once the inputs are prepared, and always reap
    # the workers, even when a worker raises.
    pool = multiprocessing.Pool()
    try:
        result = pool.map(self.csdnn_helper, exp_list)
    finally:
        pool.close()
        pool.join()

    eval_result = []
    predict_probs_test = []
    for r in result:
        eval_result.append(r[0])
        predict_probs_test.append(r[1])

    # Ensemble by summing the per-experiment class scores.
    sum_probabilities = np.sum(predict_probs_test, axis=0)
    y_pred = np.argmax(sum_probabilities, axis=1)

    test_path = exp_list[0]['test_path']
    _, y = load_data_from_path(test_path)
    y = np.argmax(y.eval(), axis=1)

    TPR, TNR = get_tpr_tnr(y, y_pred)
    best_confusion_matrix = confusion_matrix(y, y_pred, labels=[0, 1])

    combine_log(path=self.log_dir + '/combine_logs.txt',
                file_list=self.log_files)
    return eval_result, [TPR, TNR], best_confusion_matrix
def compute_result_from_pretrain_model(main_model_path, train_path, test_path):
    """Score a previously saved model on the training and the test data.

    The model is loaded from ``main_model_path`` and both data sets are
    loaded before any evaluation takes place.

    Returns:
        ``(train_result, test_result)``; each is a one-element list holding
        ``[TPR, TNR, AUC]`` for the corresponding data set.
    """
    model = load_model(main_model_path)
    train_x, train_y = load_data_from_path(train_path)
    test_x, test_y = load_data_from_path(test_path)

    def _score(features, targets):
        # Build the prediction function, run it, and derive the metrics.
        predict = model.test_model([features, targets])
        predicted, scores = predict()
        truth = np.argmax(targets.eval(), axis=1)
        tpr, tnr = get_tpr_tnr(truth, predicted)
        # Column 1 of the score matrix is used as the positive-class score.
        auc = metrics.roc_auc_score(truth, scores[:, 1])
        return [tpr, tnr, auc]

    train_result = [_score(train_x, train_y)]
    test_result = [_score(test_x, test_y)]
    return train_result, test_result
def test_SdA(finetune_lr=0.1, pretraining_epochs=1, pretrain_lr=0.001,
             training_epochs=1, batch_size=10, h=[10, 10], cl=[.1, .2],
             cost_vec=[1, 1.2], beta=30, logger=None):
    """Pre-train and fine-tune an ``SdA`` model and report lift metrics.

    Data is loaded with ``load_data_lift`` from the module-level
    ``data_dir``.  For every fold the model is pre-trained layer by layer,
    then fine-tuned with patience-based early stopping driven by the mean
    validation loss.  Whenever the validation loss improves, the model is
    evaluated on the test split (TPR, TNR, AUC, response/profit lift) and on
    the training split (TPR, TNR, AUC); the values from the last improvement
    are what gets reported.

    Args:
        finetune_lr: learning rate for fine-tuning.
        pretraining_epochs: epochs per layer during pre-training.
        pretrain_lr: learning rate for pre-training.
        training_epochs: maximum number of fine-tuning epochs.
        batch_size: minibatch size for pre-training and fine-tuning.
        h: hidden layer sizes, passed to ``SdA`` unchanged.
        cl: corruption level per layer, indexed by layer number.
        cost_vec: cost vector; converted to a float32 array.
        beta: forwarded to ``get_cost_vector`` and logged.
        logger: logger to report to; when ``None`` a module logger is used.

    Returns:
        ``(train_result, test_result, test_lift)``.  The first two are
        lists with one ``[TPR, TNR, AUC]`` entry per fold; ``test_lift`` is
        ``[response_lifts, profit_lifts]`` with one entry per fold each.

    Raises:
        RuntimeError: if fine-tuning never reached a validation step, so
            there is no model evaluation to report.
    """
    import logging

    if logger is None:
        # The declared default used to crash on the first log call.
        logger = logging.getLogger(__name__)

    # The list defaults in the signature are kept for compatibility; this
    # function only reads ``h`` and ``cl`` and copies ``cost_vec``.
    logger.info(
        'pre-epoch:%d\ntrain_epoch:%d\npre_lr:%lf\nfine_lr:%lf\nhidden_layer:%s\nCoruption level:%s\nCost_vec:%s\nBeta:%s'
        % (pretraining_epochs, training_epochs, pretrain_lr, finetune_lr, h,
           cl, cost_vec, beta))
    cost_vec = np.array(cost_vec, dtype="float32")
    train_result = []
    test_result = []
    test_lift = [[], []]
    hidden_l_size = h
    corruption_levels = cl
    num_of_fold = 1

    for mm in range(num_of_fold):
        logger.info("Trail K=%d" % (mm + 1))
        logger.info('Load data from %s', data_dir)
        datasets = load_data_lift(mm + 1, data_dir)
        train_set_x, _ = datasets[0]
        profit_train, profit_test = datasets[3]
        profit_train = get_cost_vector(profit_train, beta, cost_vec[1])
        datasets[3] = (profit_train, profit_test)

        # number of minibatches in the training split
        n_train_batches = train_set_x.get_value(borrow=True).shape[0]
        n_train_batches //= batch_size

        numpy_rng = np.random.RandomState(89677)
        logger.info('... building the model')
        sda = SdA(
            numpy_rng=numpy_rng,
            n_ins=train_set_x.eval().shape[1],
            hidden_layers_sizes=hidden_l_size,
            n_outs=2,
            costVec=cost_vec,
        )

        #########################
        # PRETRAINING THE MODEL #
        #########################
        logger.info('... getting the pretraining functions')
        pretraining_fns = sda.pretraining_functions(train_set_x=train_set_x,
                                                    batch_size=batch_size)
        logger.info('... pre-training the model')
        # Pre-train layer-wise
        for i in range(sda.n_layers):
            for epoch in range(pretraining_epochs):
                c = []
                for batch_index in range(n_train_batches):
                    c.append(pretraining_fns[i](
                        index=batch_index,
                        corruption=corruption_levels[i],
                        lr=pretrain_lr))
                logger.info('Pre-training layer %i, epoch %d, cost %f'
                            % (i, epoch, np.mean(c, dtype='float64')))

        ########################
        # FINETUNING THE MODEL #
        ########################
        logger.info('... getting the finetuning functions')
        train_fn, validate_model, test_model = sda.build_finetune_functions(
            datasets=datasets,
            batch_size=batch_size,
            learning_rate=finetune_lr,
        )
        logger.info('... finetunning the model')

        # early-stopping parameters
        patience = 10 * n_train_batches  # look at this many minibatches regardless
        patience_increase = 2.  # wait this much longer when a new best is found
        improvement_threshold = 0.995  # relative improvement considered significant
        # validate after this many minibatches; here: once per epoch
        validation_frequency = min(n_train_batches, patience // 2)

        best_validation_loss = np.inf
        test_score = 0.
        # Reset per fold so a fold without any validation step cannot
        # silently report values left over from an earlier fold.
        best_iter = None
        best_train = None
        best_test = None
        best_response_lift = None
        best_profit_lift = None

        done_looping = False
        epoch = 0
        while (epoch < training_epochs) and (not done_looping):
            epoch += 1
            for minibatch_index in range(n_train_batches):
                train_fn(minibatch_index)
                iteration = (epoch - 1) * n_train_batches + minibatch_index

                if (iteration + 1) % validation_frequency == 0:
                    validation_losses = validate_model()

                    # NOTE(review): the TPR/TNR of the validation split are
                    # never read afterwards; the calls are kept because it
                    # is not visible here whether they have side effects --
                    # confirm and remove.
                    ff_test = sda.test_model(datasets[1])
                    y_pred, y_pred_score = ff_test()
                    y = np.argmax(datasets[1][1].eval(), axis=1)
                    get_tpr_tnr(y, y_pred)

                    this_validation_loss = np.mean(validation_losses,
                                                   dtype='float64')
                    logger.info(
                        'epoch %i, minibatch %i/%i, validation partial error %f %%'
                        % (epoch, minibatch_index + 1, n_train_batches,
                           this_validation_loss * 100.))

                    # if we got the best validation score until now
                    if this_validation_loss < best_validation_loss:
                        # improve patience if loss improvement is good enough
                        if (this_validation_loss <
                                best_validation_loss * improvement_threshold):
                            patience = max(patience,
                                           iteration * patience_increase)

                        # save best validation score and iteration number
                        best_validation_loss = this_validation_loss
                        best_iter = iteration

                        # test it on the test set
                        test_losses = test_model()
                        test_score = np.mean(test_losses, dtype='float64')
                        logger.info(
                            (' epoch %i, minibatch %i/%i, test error of '
                             'best model %f %%')
                            % (epoch, minibatch_index + 1, n_train_batches,
                               test_score * 100.))

                        logger.info("========new best with testset=========")
                        ff_test = sda.test_model(datasets[2])
                        y_pred, y_pred_score = ff_test()
                        y = np.argmax(datasets[2][1].eval(), axis=1)
                        response_lift, profit_lift = evaluate_decile(
                            prob=y_pred_score, label=y,
                            actual_profit=profit_test, isPlot=True)
                        logger.info('response_lift:%s', response_lift)
                        logger.info('profit lift:%s', profit_lift)
                        best_confusion_matrix = confusion_matrix(
                            y, y_pred, labels=[0, 1])
                        logger.info(best_confusion_matrix)
                        TPR, TNR = get_tpr_tnr(y, y_pred)
                        best_test = [
                            TPR,
                            TNR,
                            metrics.roc_auc_score(y, y_pred_score[:, 1]),
                        ]
                        best_response_lift = response_lift
                        best_profit_lift = profit_lift
                        logger.info('AUC=%s', best_test[2])
                        logger.info('TPR=%s', TPR)
                        logger.info('TNR=%s', TNR)

                        # metrics of the same model on the training split
                        ff_test = sda.test_model(datasets[0])
                        y_pred, y_pred_score = ff_test()
                        y = np.argmax(datasets[0][1].eval(), axis=1)
                        TPR, TNR = get_tpr_tnr(y, y_pred)
                        best_train = [
                            TPR,
                            TNR,
                            metrics.roc_auc_score(y, y_pred_score[:, 1]),
                        ]

                if patience <= iteration:
                    done_looping = True
                    break

        if best_test is None or best_train is None:
            raise RuntimeError(
                'fine-tuning finished without a single validation step; '
                'no evaluation results are available '
                '(training_epochs=%s, n_train_batches=%s)'
                % (training_epochs, n_train_batches))

        logger.info("========Test with training set=========")
        logger.info('AUC=%s', best_train[2])
        logger.info('TPR=%s', best_train[0])
        logger.info('TNR=%s', best_train[1])
        train_result.append(best_train)

        logger.info("========Test with test set=========")
        logger.info('Response_lift=%s', best_response_lift)
        logger.info('Profit_lift=%s', best_profit_lift)
        logger.info('AUC=%s', best_test[2])
        logger.info('TPR=%s', best_test[0])
        logger.info('TNR=%s', best_test[1])
        test_result.append(best_test)
        test_lift[0].append(best_response_lift)
        test_lift[1].append(best_profit_lift)

        logger.info(
            ('Optimization complete with best validation score of %f %%, '
             'on iteration %i, '
             'with test performance %f %%')
            % (best_validation_loss * 100., best_iter + 1, test_score * 100.))

    logger.info(
        'pre-epoch:%d\ntrain_epoch:%d\npre_lr:%lf\nfine_lr:%lf\nhidden_layer:%s\nCoruption level:%s\nCost_vec:%s\nBeta:%s'
        % (pretraining_epochs, training_epochs, pretrain_lr, finetune_lr,
           hidden_l_size, cl, cost_vec, beta))

    logger.info("====Overall: trainset====")
    for p, n, a in train_result:
        logger.info("%f\t%f\t%f" % (p, n, a))
    mean = np.mean(train_result, axis=0)
    logger.info('Mean=%f\t%f\t%f' % (mean[0], mean[1], mean[2]))

    logger.info("====Overall: testset====")
    for p, n, a in test_result:
        logger.info("%f\t%f\t%f" % (p, n, a))
    mean = np.mean(test_result, axis=0)
    logger.info('Mean=%f\t%f\t%f' % (mean[0], mean[1], mean[2]))

    mean_response_lift = np.mean(test_lift[0], axis=0)
    mean_profit_lift = np.mean(test_lift[1], axis=0)
    logger.info('Mean Response lift=%s', mean_response_lift)
    logger.info('Mean Profit lift=%s', mean_profit_lift)

    return train_result, test_result, test_lift
def csdnn_classifier(log_dir, logger, p_dict):
    """Train one ``SdA_Ensemble`` model and evaluate its best checkpoint.

    The model is optionally pre-trained layer by layer, then fine-tuned with
    patience-based early stopping.  The validation criterion is
    ``1 - AUC`` on the validation set; every time it improves, the model is
    pickled to ``log_dir + '/tmp_model/' + exp_name + '_'`` and its test-set
    metrics are logged.  After training, the saved checkpoint is reloaded
    and scored on the training and test sets.

    Args:
        log_dir: directory under which ``tmp_model/`` receives the
            checkpoint (the directory is not created here).
        logger: logger used for progress and result messages.
        p_dict: experiment parameters.  Keys read: ``finetune_lr``,
            ``pretraining_epochs``, ``pretrain_lr``, ``training_epochs``,
            ``batch_size``, ``h``, ``cl``, ``cost_vec``, ``beta``,
            ``reg_coef``, ``pretrain_batchsize``, ``drop_ps``, ``exp_name``,
            ``train_path``, ``test_path``, ``valid_path``.

    Returns:
        ``(train_result, test_result, model_name, y_pred_score_test)``
        where the two result lists each hold one ``[TPR, TNR, AUC]`` entry,
        ``model_name`` is the checkpoint path and ``y_pred_score_test`` is
        the score output of the reloaded model on the test set.

    Raises:
        KeyError: if ``p_dict`` lacks one of the keys listed above.
        RuntimeError: if fine-tuning never wrote a checkpoint, so there is
            no best model to reload.
    """
    finetune_lr = p_dict['finetune_lr']
    pretraining_epochs = p_dict['pretraining_epochs']
    pretrain_lr = p_dict['pretrain_lr']
    training_epochs = p_dict['training_epochs']
    batch_size = p_dict['batch_size']
    h = p_dict['h']
    cl = p_dict['cl']
    cost_vec = p_dict['cost_vec']
    beta = p_dict['beta']
    reg_coef = p_dict['reg_coef']
    pretrain_batchsize = p_dict['pretrain_batchsize']
    drop_ps = p_dict['drop_ps']
    exp_name = p_dict['exp_name']
    train_path = p_dict['train_path']
    test_path = p_dict['test_path']
    valid_path = p_dict['valid_path']

    logger.info(
        'pre-epoch:%d\ntrain_epoch:%d\npre_lr:%lf\nfine_lr:%lf\nhidden_layer:%s\nCoruption level:%s\nCost_vec:%s\nBeta:%s'
        % (pretraining_epochs, training_epochs, pretrain_lr, finetune_lr, h,
           cl, cost_vec, beta))
    cost_vec = np.array(cost_vec, dtype="float32")
    train_result = []
    test_result = []
    hidden_l_size = h

    logger.info('Load data from %s', train_path)
    train_set_x, train_set_y = load_data_from_path(train_path)
    test_set_x, test_set_y = load_data_from_path(test_path)
    valid_set_x, valid_set_y = load_data_from_path(valid_path)

    # number of minibatches for pre-training and for fine-tuning
    n_train_batches = train_set_x.get_value(borrow=True).shape[0]
    n_pretrain_batches = int(n_train_batches / pretrain_batchsize)
    n_train_batches //= batch_size

    numpy_rng = np.random.RandomState(89677)
    logger.info('... building the model')
    sda = SdA_Ensemble(numpy_rng=numpy_rng,
                       n_ins=train_set_x.eval().shape[1],
                       hidden_layers_sizes=hidden_l_size,
                       n_outs=2,
                       costVec=cost_vec,
                       reg_coef=reg_coef,
                       drop_ps=drop_ps)
    corruption_levels = cl

    #########################
    # PRETRAINING THE MODEL #
    #########################
    if pretraining_epochs > 0:
        logger.info('... getting the pretraining functions')
        pretraining_fns = sda.pretraining_functions(
            train_set_x=train_set_x, batch_size=pretrain_batchsize)
        logger.info('... pre-training the model')
        # Pre-train layer-wise
        for i in range(sda.n_layers):
            for epoch in range(pretraining_epochs):
                c = []
                for batch_index in range(n_pretrain_batches):
                    c.append(pretraining_fns[i](
                        index=batch_index,
                        corruption=corruption_levels[i],
                        lr=pretrain_lr))
                logger.info('Pre-training layer %i, epoch %d, cost %f'
                            % (i, epoch, np.mean(c, dtype='float64')))

    ########################
    # FINETUNING THE MODEL #
    ########################
    logger.info('... getting the finetuning functions')
    train_fn, validate_model, test_model = sda.build_finetune_functions(
        datasets=[(train_set_x, train_set_y), (valid_set_x, valid_set_y),
                  (test_set_x, test_set_y)],
        batch_size=batch_size,
        learning_rate=finetune_lr,
    )
    logger.info('... finetunning the model')

    # early-stopping parameters
    patience = 10 * n_train_batches  # look at this many minibatches regardless
    patience_increase = 3  # wait this much longer when a new best is found
    improvement_threshold = 0.995  # relative improvement considered significant
    validation_frequency = min(n_train_batches, patience // 2)
    print('patience:%s, validation_freq:%s' % (patience, validation_frequency))

    best_validation_loss = np.inf
    # The checkpoint path does not change during training; defining it up
    # front keeps it available after the loop on every path.
    model_name = log_dir + '/tmp_model/' + exp_name + '_'
    model_saved = False

    done_looping = False
    epoch = 0
    while (epoch < training_epochs) and (not done_looping):
        epoch += 1
        for minibatch_index in range(n_train_batches):
            train_fn(minibatch_index)
            iteration = (epoch - 1) * n_train_batches + minibatch_index

            if (iteration + 1) % validation_frequency == 0:
                # NOTE(review): the returned validation losses are not
                # used (the criterion below is 1 - AUC); the call is kept
                # because it is not visible here whether it has side
                # effects -- confirm and remove.
                validate_model()

                ff_test = sda.test_model([valid_set_x, valid_set_y])
                y_pred, y_pred_score = ff_test()
                y = np.argmax(valid_set_y.eval(), axis=1)
                # NOTE(review): validation TPR/TNR are not used either;
                # same caveat as above.
                get_tpr_tnr(y, y_pred)
                this_validation_loss = 1 - metrics.roc_auc_score(
                    y, y_pred_score[:, 1])
                logger.info(
                    'epoch %i, minibatch %i/%i, validation partial error %f %%'
                    % (epoch, minibatch_index + 1, n_train_batches,
                       this_validation_loss * 100.))

                # if we got the best validation score until now
                if this_validation_loss < best_validation_loss:
                    # improve patience if loss improvement is good enough
                    if (this_validation_loss <
                            best_validation_loss * improvement_threshold):
                        print('increase patience from:%s' % patience)
                        patience = max(patience,
                                       iteration * patience_increase)
                        print('to:%s' % patience)

                    # Save model; ``with`` closes the file even if the
                    # dump fails part-way.
                    logger.info("Save best model by pickle")
                    with open(model_name, 'wb') as save_file:
                        cPickle.dump(sda, save_file)
                    model_saved = True

                    # save best validation score
                    best_validation_loss = this_validation_loss

                    logger.info("========new best with testset=========")
                    ff_test = sda.test_model([test_set_x, test_set_y])
                    y_pred, y_pred_score = ff_test()
                    y = np.argmax(test_set_y.eval(), axis=1)
                    best_confusion_matrix = confusion_matrix(
                        y, y_pred, labels=[0, 1])
                    logger.info(best_confusion_matrix)
                    TPR, TNR = get_tpr_tnr(y, y_pred)
                    AUC = metrics.roc_auc_score(y, y_pred_score[:, 1])
                    logger.info('AUC=%s', AUC)
                    logger.info('TPR=%s', TPR)
                    logger.info('TNR=%s', TNR)

            if patience <= iteration:
                done_looping = True
                break

    if not model_saved:
        # Reloading here would either fail or pick up a stale file left
        # behind by an earlier run with the same experiment name.
        raise RuntimeError(
            'fine-tuning finished without saving a model '
            '(training_epochs=%s, n_train_batches=%s)'
            % (training_epochs, n_train_batches))

    # load best model
    model = load_model(model_name)

    logger.info("========Test with training set=========")
    ff_test = model.test_model([train_set_x, train_set_y])
    y_pred, y_pred_score_train = ff_test()
    y = np.argmax(train_set_y.eval(), axis=1)
    TPR, TNR = get_tpr_tnr(y, y_pred)
    AUC = metrics.roc_auc_score(y, y_pred_score_train[:, 1])
    logger.info('AUC=%s', AUC)
    logger.info('TPR=%s', TPR)
    logger.info('TNR=%s', TNR)
    train_result.append([TPR, TNR, AUC])

    logger.info("========Test with test set=========")
    ff_test = model.test_model([test_set_x, test_set_y])
    y_pred, y_pred_score_test = ff_test()
    y = np.argmax(test_set_y.eval(), axis=1)
    TPR, TNR = get_tpr_tnr(y, y_pred)
    AUC = metrics.roc_auc_score(y, y_pred_score_test[:, 1])
    logger.info('AUC=%s', AUC)
    logger.info('TPR=%s', TPR)
    logger.info('TNR=%s', TNR)
    test_result.append([TPR, TNR, AUC])

    return train_result, test_result, model_name, y_pred_score_test