def main(argv):
    config_file = argv[0]
    cfg = config.YamlParser(config_file)
    log_dir, out_dir = logger.init(log_dir=cfg.log_dir(), out_dir=cfg.out_dir(),
                                   level=cfg.log_level())
    weight_path = '{}/weights.h5'.format(out_dir)

    (X, Y), (x_val, y_val), (_, _) = cango_pboc.get_train_val_test_data(
        path=cfg.train_data(), drop_columns=cfg.drop_columns(),
        train_val_ratio=cfg.train_val_ratio(),
        do_shuffle=cfg.do_shuffle(), do_smote=cfg.do_smote(),
        smote_ratio=cfg.smote_ratio())
    # (X, Y), (x_val, y_val) = cango_pboc.get_train_val_data(
    #     path=cfg.train_data(), drop_columns=cfg.drop_columns(),
    #     train_val_ratio=cfg.train_val_ratio(),
    #     do_shuffle=cfg.do_shuffle(), do_smote=cfg.do_smote(),
    #     smote_ratio=cfg.smote_ratio())

    kfold = StratifiedKFold(n_splits=10, shuffle=True,
                            random_state=constants.random_seed)
    # Checkpoint the weights after each epoch if the validation loss improved.
    checkpointer = ka.callbacks.ModelCheckpoint(filepath=weight_path, verbose=1,
                                                save_best_only=True)

    # Construct the two-branch model.
    input_dim = X.shape[1]
    mmnn = MultiModelsNeuralNetwork(input_dim)
    mmnn.set_reg_val(cfg.model_reg_val())
    mmnn.set_learning_rate(cfg.model_learning_rate())
    for i in range(2):
        branch = single_model.create_model(
            input_dim,
            regularization_val=cfg.model_reg_val() * (i * 0.1),
            dropout_val=cfg.model_dropout_val(),
            learning_rate=cfg.model_learning_rate())
        mmnn.add_model(branch)
    model_nn = mmnn.create_model()

    cvscores = []
    for train_index, test_index in kfold.split(X, Y):
        # Resume from the best checkpointed weights of the previous fold, if any.
        if os.path.exists(weight_path):
            model_nn.load_weights(weight_path)
        early_stopping = ka.callbacks.EarlyStopping(monitor='val_loss',
                                                    min_delta=0, patience=5,
                                                    verbose=1, mode='auto')
        # Both branches receive the same features, so duplicate the inputs.
        train_array = []
        val_array = []
        for i in range(2):
            train_array.append(X[train_index])
            val_array.append(x_val)
        model_nn.fit(train_array, Y[train_index],
                     batch_size=cfg.model_train_batch_size(),
                     epochs=cfg.model_train_epoches(), verbose=0,
                     class_weight=cfg.model_class_weight(),
                     validation_data=(val_array, y_val),
                     callbacks=[early_stopping, checkpointer])
        # Note: each fold is scored against the fixed hold-out validation set,
        # not the fold's own test split.
        scores = model_nn.evaluate(val_array, y_val, verbose=0)
        print("%s: %.2f%%" % (model_nn.metrics_names[1], scores[1] * 100))
        cvscores.append(scores[1] * 100)
    print("%.2f%% (+/- %.2f%%)" % (np.mean(cvscores), np.std(cvscores)))

    # Save the model architecture.
    json_string = model_nn.to_json()
    with open('{}/model_architecture.json'.format(out_dir), 'w') as f:
        f.write(json_string)
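# Hypothetical entry point, shown for illustration only: each of these main()
# functions treats argv[0] as the YAML config path, so a caller would
# presumably strip the program name before delegating, e.g.:
if __name__ == '__main__':
    import sys
    main(sys.argv[1:])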
def main(argv):
    config_file = argv[0]
    cfg = config.YamlParser(config_file)
    log_dir, out_dir = logger.init(log_dir=cfg.log_dir(), out_dir=cfg.out_dir(),
                                   level=cfg.log_level())

    (_, _), (x_val, y_val), (x_train, y_train) = cango_pboc.get_train_val_test_data(
        path=cfg.train_data(), drop_columns=cfg.drop_columns(),
        train_val_ratio=cfg.train_val_ratio(),
        do_shuffle=cfg.do_shuffle(), do_smote=cfg.do_smote(),
        smote_ratio=cfg.smote_ratio())
    # (x_train, y_train), (x_val, y_val) = cango_pboc.get_train_val_data(
    #     path=cfg.train_data(), drop_columns=cfg.drop_columns(),
    #     train_val_ratio=cfg.train_val_ratio(),
    #     do_shuffle=cfg.do_shuffle(), do_smote=cfg.do_smote(),
    #     smote_ratio=cfg.smote_ratio())

    # Stream epoch results to a CSV file.
    csv_logger = ka.callbacks.CSVLogger('{}/epoches.log'.format(log_dir))
    # Checkpoint the weights after each epoch if the validation loss decreased.
    checkpointer = ka.callbacks.ModelCheckpoint(
        filepath='{}/weights.h5'.format(out_dir), verbose=1, save_best_only=True)
    # Stop training when the monitored quantity has stopped improving.
    early_stopping = ka.callbacks.EarlyStopping(monitor='val_loss', min_delta=0,
                                                patience=10, verbose=1,
                                                mode='auto')

    # Construct the two-branch model; both branches see the same features.
    input_dim = x_train.shape[1]
    mmnn = MultiModelsNeuralNetwork(input_dim)
    mmnn.set_reg_val(cfg.model_reg_val())
    mmnn.set_learning_rate(cfg.model_learning_rate())
    train_array = []
    val_array = []
    for i in range(2):
        branch = single_model.create_model(
            input_dim,
            regularization_val=cfg.model_reg_val() * (i * 0.1),
            dropout_val=cfg.model_dropout_val(),
            learning_rate=cfg.model_learning_rate())
        mmnn.add_model(branch)
        train_array.append(x_train)
        val_array.append(x_val)
    model_nn = mmnn.create_model()

    # Train the model.
    history = model_nn.fit(
        train_array, y_train,
        batch_size=cfg.model_train_batch_size(),
        epochs=cfg.model_train_epoches(), verbose=0,
        validation_data=(val_array, y_val),
        class_weight=cfg.model_class_weight(),
        callbacks=[checkpointer, csv_logger, early_stopping])

    score = model_nn.evaluate(val_array, y_val, verbose=0)
    print('Validation score:', score[0])
    print('Validation accuracy:', score[1])

    # Summarize training history for accuracy.
    plots.train_val_acc(train_acc=history.history['acc'],
                        val_acc=history.history['val_acc'],
                        to_file='{}/plt_acc'.format(out_dir), show=True)
    # Summarize training history for loss.
    plots.train_val_loss(train_loss=history.history['loss'],
                         val_loss=history.history['val_loss'],
                         to_file='{}/plt_loss'.format(out_dir), show=True)

    # Save the model architecture and a diagram of its topology.
    json_string = model_nn.to_json()
    with open('{}/model_architecture.json'.format(out_dir), 'w') as f:
        f.write(json_string)
    plot_model(model_nn, to_file='{}/model.png'.format(out_dir))
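# A minimal sketch of what the two-branch architecture assembled above might
# look like if written directly with the Keras functional API. The layer
# sizes, the single sigmoid output, and the function name are assumptions made
# for illustration; the real topology lives in single_model.create_model and
# MultiModelsNeuralNetwork.create_model.
from keras.layers import Input, Dense, Dropout, concatenate
from keras.models import Model
from keras.optimizers import Adam
from keras.regularizers import l2

def merged_two_branch_model(input_dim, reg_val=0.01, dropout_val=0.5,
                            learning_rate=0.001):
    inputs, branch_outputs = [], []
    for i in range(2):
        inp = Input(shape=(input_dim,))
        # Mirrors the reg_val * (i * 0.1) scaling used in main(), so the
        # first branch is effectively unregularized.
        h = Dense(64, activation='relu',
                  kernel_regularizer=l2(reg_val * (i * 0.1)))(inp)
        h = Dropout(dropout_val)(h)
        inputs.append(inp)
        branch_outputs.append(h)
    # Concatenate the branch features and classify with one sigmoid unit.
    merged = concatenate(branch_outputs)
    out = Dense(1, activation='sigmoid')(merged)
    model = Model(inputs=inputs, outputs=out)
    model.compile(optimizer=Adam(lr=learning_rate),
                  loss='binary_crossentropy', metrics=['accuracy'])
    return model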
def main(argv):
    config_file = argv[0]
    cfg = config.YamlParser(config_file)
    log_dir, out_dir = logger.init(log_dir=cfg.log_dir(), out_dir=cfg.out_dir(),
                                   level=cfg.log_level())

    if cfg.one_filer():
        (x_train, y_train), (x_val, y_val), (x_test, y_test) = \
            cango.get_train_val_test_data(
                path=cfg.train_data(), drop_columns=cfg.drop_columns(),
                train_val_ratio=cfg.train_val_ratio(),
                do_shuffle=cfg.do_shuffle(), do_smote=False,
                smote_ratio=cfg.smote_ratio())
    else:
        (x_train, y_train), (x_val, y_val) = cango.get_train_val_data(
            path=cfg.train_data(), drop_columns=cfg.drop_columns(),
            train_val_ratio=cfg.train_val_ratio(),
            do_shuffle=cfg.do_shuffle(), do_smote=False,
            smote_ratio=cfg.smote_ratio())
        x_test, y_test = cango.get_test_data(path=cfg.test_data(),
                                             drop_columns=cfg.drop_columns())

    model_nn = get_model(cfg.out_dir(), cfg.out_dir())

    # Both branches receive the same features, so duplicate the inputs.
    x_train_array = []
    x_val_array = []
    x_test_array = []
    for i in range(2):
        x_train_array.append(x_train)
        x_val_array.append(x_val)
        x_test_array.append(x_test)

    y_pred_train_out, proba_g_train, proba_b_train = get_predict(
        model=model_nn, data=x_train_array, batch_size=100, cutoff=cfg.cutoff())
    y_pred_train_1 = np.count_nonzero(y_pred_train_out)
    y_pred_train_0 = len(y_pred_train_out) - y_pred_train_1
    log.debug('predict train dataset distribution: 0 - {}, 1 - {}'.format(
        y_pred_train_0, y_pred_train_1))

    y_pred_val_out, proba_g_val, proba_b_val = get_predict(
        model=model_nn, data=x_val_array, batch_size=100, cutoff=cfg.cutoff())
    y_pred_val_1 = np.count_nonzero(y_pred_val_out)
    y_pred_val_0 = len(y_pred_val_out) - y_pred_val_1
    log.debug('predict validation dataset distribution: 0 - {}, 1 - {}'.format(
        y_pred_val_0, y_pred_val_1))

    y_pred_test_out, proba_g_test, proba_b_test = get_predict(
        model=model_nn, data=x_test_array, batch_size=100, cutoff=cfg.cutoff())
    y_pred_test_1 = np.count_nonzero(y_pred_test_out)
    y_pred_test_0 = len(y_pred_test_out) - y_pred_test_1
    log.debug('predict test dataset distribution: 0 - {}, 1 - {}'.format(
        y_pred_test_0, y_pred_test_1))

    df_test = None
    # Output predictions.
    if y_test is not None:
        np.savetxt('{}/predict_test.csv'.format(cfg.out_dir()),
                   np.c_[y_test, y_pred_test_out, proba_g_test, proba_b_test],
                   delimiter=',', header='CG_Label, Label, p_g, p_b',
                   comments='', fmt='%d, %d, %.6f, %.6f')
        df_test = pd.DataFrame({
            'CG_Label': y_test,
            'Label': y_pred_test_out,
            'p_g': proba_g_test,
            'p_b': proba_b_test
        })
        bins_test, c0_test, c1_test = metrics.cals_KS_bins(df_test, 'p_b',
                                                           'Label')
        np.savetxt('{}/predict_bin_test.csv'.format(cfg.out_dir()),
                   np.c_[bins_test, c0_test, c1_test],
                   delimiter=',', header='p_b, n_g_label, n_b_label',
                   comments='', fmt='%.1f, %d, %d')
    else:
        np.savetxt('{}/predict_bin_test.csv'.format(cfg.out_dir()),
                   np.c_[y_pred_test_out, proba_g_test, proba_b_test],
                   delimiter=',', header='Label, p_g, p_b',
                   comments='', fmt='%d, %.6f, %.6f')

    np.savetxt('{}/predict_val.csv'.format(cfg.out_dir()),
               np.c_[y_val, y_pred_val_out, proba_g_val, proba_b_val],
               delimiter=',', header='CG_Label, Label, p_g, p_b',
               comments='', fmt='%d, %d, %.6f, %.6f')
    df_val = pd.DataFrame({
        'CG_Label': y_val,
        'Label': y_pred_val_out,
        'p_g': proba_g_val,
        'p_b': proba_b_val
    })
    bins_val, c0_val, c1_val = metrics.cals_KS_bins(df_val, 'p_b', 'CG_Label')
    np.savetxt('{}/predict_bin_val.csv'.format(cfg.out_dir()),
               np.c_[bins_val, c0_val, c1_val],
               delimiter=',', header='p_b, n_g_label, n_b_label',
               comments='', fmt='%.1f, %d, %d')

    # KS test score.
    ks_val = metrics.calc_KS_AR(df_val, 'p_g', 'CG_Label')
    ks_val_value = np.max(np.subtract(ks_val[1]['badCumPer'].values,
                                      ks_val[1]['goodCumPer'].values))
    log.info('ks val score: {}'.format(ks_val_value))
    # df_test is only populated when test labels exist; guard against None.
    if df_test is not None:
        ks_test = metrics.calc_KS_AR(df_test, 'p_g', 'CG_Label')
        ks_test_value = np.max(np.subtract(ks_test[1]['badCumPer'].values,
                                           ks_test[1]['goodCumPer'].values))
        log.info('ks test score: {}'.format(ks_test_value))

    plt.figure(figsize=(14, 10), dpi=80, facecolor='w')
    plt.plot(ks_val[1]['p_g'], ks_val[1]['goodCumPer'], lw=2, alpha=0.8,
             label='Good Percent - val')
    plt.plot(ks_val[1]['p_g'], ks_val[1]['badCumPer'], lw=2, alpha=0.8,
             label='Bad Percent - val')
    if df_test is not None:
        plt.plot(ks_test[1]['p_g'], ks_test[1]['goodCumPer'], lw=2, alpha=0.8,
                 label='Good Percent - test')
        plt.plot(ks_test[1]['p_g'], ks_test[1]['badCumPer'], lw=2, alpha=0.8,
                 label='Bad Percent - test')
    # plt.xticks(list(train_ks[1]['goodCumPer'].index),
    #            list(train_ks[1]['train_proba'].unique()), rotation=90)
    plt.title('K-S curve', fontsize=18)
    plt.xlabel('p_g', fontsize=14)
    plt.ylabel('good/bad percent', fontsize=14)
    plt.legend(loc='upper left', fontsize=12)
    plt.grid(b=True, ls=':')
    plt.savefig('{}/ks'.format(cfg.out_dir()))
    plt.show()

    # PSI
    psiCalc = psi3.PSI()
    psi_val = psiCalc.calcPSI(y_pred_test_out, proba_b_test,
                              y_pred_val_out, proba_b_val)
    log.info('PSI (p_b): {}'.format(psi_val))
    psi_val = psiCalc.calcPSI(y_pred_test_out, proba_g_test,
                              y_pred_val_out, proba_g_val)
    log.info('PSI (p_g): {}'.format(psi_val))

    # AUC ROC and confusion matrices (require test labels).
    if y_test is not None:
        y_true_arr = [y_test, y_val]
        y_score_arr = [proba_b_test, proba_b_val]
        y_label_arr = ['AUC-test', 'AUC-val']
        plots.roc_auc_multi(y_true_arr=y_true_arr, y_score_arr=y_score_arr,
                            label_arr=y_label_arr,
                            to_file='{}/roc_all'.format(out_dir), show=True)
        plots.confusion_matrix(y_true=y_test,
                               y_pred=np.asarray(y_pred_test_out),
                               to_file='{}/confusion_test'.format(out_dir),
                               show=True)
        plots.confusion_matrix(y_true=y_val,
                               y_pred=np.asarray(y_pred_val_out),
                               to_file='{}/confusion_val'.format(out_dir),
                               show=True)
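# A possible shape for the get_predict helper used above; the real one is
# defined elsewhere in this repo. This sketch assumes the network ends in a
# single sigmoid unit, so p_b is the predicted bad-class probability,
# p_g = 1 - p_b, and the hard label comes from thresholding p_b at the cutoff.
def get_predict(model, data, batch_size, cutoff):
    # data is a list with one copy of the features per branch input.
    proba_b = model.predict(data, batch_size=batch_size).ravel()
    proba_g = 1.0 - proba_b
    y_pred = (proba_b >= cutoff).astype(int)
    return y_pred, proba_g, proba_b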