Example 1
def main(argv):
    config_file = argv[0]
    cfg = config.YamlParser(config_file)
    log_dir, out_dir = logger.init(log_dir=cfg.log_dir(),
                                   out_dir=cfg.out_dir(),
                                   level=cfg.log_level())
    weight_path = '{}/weights.h5'.format(out_dir)

    (X, Y), (x_val, y_val), (_, _) = cango_pboc.get_train_val_test_data(
        path=cfg.train_data(),
        drop_columns=cfg.drop_columns(),
        train_val_ratio=cfg.train_val_ratio(),
        do_shuffle=cfg.do_shuffle(),
        do_smote=cfg.do_smote(),
        smote_ratio=cfg.smote_ratio())

    # (X, Y), (x_val, y_val) = cango_pboc.get_train_val_data(
    #     path=cfg.train_data(), drop_columns=cfg.drop_columns(),
    #     train_val_ratio=cfg.train_val_ratio(),
    #     do_shuffle=cfg.do_shuffle(), do_smote=cfg.do_smote(), smote_ratio=cfg.smote_ratio())

    kfold = StratifiedKFold(n_splits=10,
                            shuffle=True,
                            random_state=constants.random_seed)

    checkpointer = ka.callbacks.ModelCheckpoint(filepath=weight_path,
                                                verbose=1,
                                                save_best_only=True)

    # Construct the model
    input_dim = X.shape[1]
    mmnn = MultiModelsNeuralNetwork(input_dim)
    mmnn.set_reg_val(cfg.model_reg_val())
    mmnn.set_learning_rate(cfg.model_learning_rate())

    for i in range(0, 2):
        branch = single_model.create_model(
            input_dim,
            regularization_val=cfg.model_reg_val() * (i * 0.1),
            dropout_val=cfg.model_dropout_val(),
            learning_rate=cfg.model_learning_rate())
        mmnn.add_model(branch)

    model_nn = mmnn.create_model()

    cvscores = []
    for train_index, test_index in kfold.split(X, Y):

        # resume from the best checkpointed weights, if any; note that this
        # carries state across folds, and that validation below scores against
        # the fixed (x_val, y_val) hold-out rather than the fold's test split
        if os.path.exists(weight_path):
            model_nn.load_weights(weight_path)

        early_stopping = ka.callbacks.EarlyStopping(monitor='val_loss',
                                                    min_delta=0,
                                                    patience=5,
                                                    verbose=1,
                                                    mode='auto')
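        # one copy of the fold's features per input branch of the multi-input model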
        train_array = []
        val_array = []
        for i in range(0, 2):
            train_array.append(X[train_index])
            val_array.append(x_val)

        model_nn.fit(train_array,
                     Y[train_index],
                     batch_size=cfg.model_train_batch_size(),
                     epochs=cfg.model_train_epoches(),
                     verbose=0,
                     class_weight=cfg.model_class_weight(),
                     validation_data=(val_array, y_val),
                     callbacks=[early_stopping, checkpointer])
        scores = model_nn.evaluate(val_array, y_val, verbose=0)
        print("%s: %.2f%%" % (model_nn.metrics_names[1], scores[1] * 100))
        cvscores.append(scores[1] * 100)

    print("%.2f%% (+/- %.2f%%)" % (np.mean(cvscores), np.std(cvscores)))

    # save the model
    json_string = model_nn.to_json()
    with open('{}/model_architecture.json'.format(out_dir), 'w') as f:
        f.write(json_string)
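
config, logger, cango_pboc, constants, single_model, and MultiModelsNeuralNetwork are project-local modules, and ka is evidently an alias for keras, so the snippet is not runnable on its own. As a minimal self-contained sketch of the same pattern (a two-branch Keras model fed one copy of the features per input, scored with stratified k-fold), something like the following works; the layer sizes and synthetic data are assumptions, and, unlike the snippet above, each fold is evaluated on its own held-out split:

import numpy as np
from sklearn.model_selection import StratifiedKFold
from keras.layers import Dense, Input, concatenate
from keras.models import Model


def build_two_branch_model(input_dim):
    # two small dense branches over the same feature vector, merged into one head
    inputs, tails = [], []
    for _ in range(2):
        inp = Input(shape=(input_dim,))
        inputs.append(inp)
        tails.append(Dense(32, activation='relu')(inp))
    out = Dense(1, activation='sigmoid')(concatenate(tails))
    model = Model(inputs=inputs, outputs=out)
    model.compile(optimizer='adam',
                  loss='binary_crossentropy',
                  metrics=['accuracy'])
    return model


X = np.random.rand(200, 10)
Y = np.random.randint(0, 2, size=200)

cvscores = []
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=7)
for train_index, test_index in kfold.split(X, Y):
    model = build_two_branch_model(X.shape[1])
    # one copy of the features per input branch
    model.fit([X[train_index]] * 2,
              Y[train_index],
              batch_size=32,
              epochs=3,
              verbose=0)
    scores = model.evaluate([X[test_index]] * 2, Y[test_index], verbose=0)
    cvscores.append(scores[1] * 100)
print('%.2f%% (+/- %.2f%%)' % (np.mean(cvscores), np.std(cvscores)))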
Example 2
def main(argv):
    config_file = argv[0]
    cfg = config.YamlParser(config_file)
    log_dir, out_dir = logger.init(log_dir=cfg.log_dir(),
                                   out_dir=cfg.out_dir(),
                                   level=cfg.log_level())

    (_, _), (x_val, y_val), (x_train,
                             y_train) = cango_pboc.get_train_val_test_data(
                                 path=cfg.train_data(),
                                 drop_columns=cfg.drop_columns(),
                                 train_val_ratio=cfg.train_val_ratio(),
                                 do_shuffle=cfg.do_shuffle(),
                                 do_smote=cfg.do_smote(),
                                 smote_ratio=cfg.smote_ratio())

    # (x_train, y_train), (x_val, y_val) = cango_pboc.get_train_val_data(
    #     path=cfg.train_data(), drop_columns=cfg.drop_columns(),
    #     train_val_ratio=cfg.train_val_ratio(),
    #     do_shuffle=cfg.do_shuffle(), do_smote=cfg.do_smote(), smote_ratio=cfg.smote_ratio())

    # streams epoch results to a csv file
    csv_logger = ka.callbacks.CSVLogger('{}/epoches.log'.format(log_dir))

    # checkpoint weight after each epoch if the validation loss decreased
    checkpointer = ka.callbacks.ModelCheckpoint(
        filepath='{}/weights.h5'.format(out_dir),
        verbose=1,
        save_best_only=True)

    # stop training when a monitored quantity has stopped improving
    early_stopping = ka.callbacks.EarlyStopping(monitor='val_loss',
                                                min_delta=0,
                                                patience=10,
                                                verbose=1,
                                                mode='auto')

    # Construct the model
    input_dim = x_train.shape[1]
    mmnn = MultiModelsNeuralNetwork(input_dim)
    mmnn.set_reg_val(cfg.model_reg_val())
    mmnn.set_learning_rate(cfg.model_learning_rate())

    train_array = []
    val_array = []
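    # one branch per iteration, each scaling the configured regularization by
    # i * 0.1; the multi-input model consumes a list holding one copy of the
    # feature matrix per branch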
    for i in range(0, 2):
        branch = single_model.create_model(
            input_dim,
            regularization_val=cfg.model_reg_val() * (i * 0.1),
            dropout_val=cfg.model_dropout_val(),
            learning_rate=cfg.model_learning_rate())
        mmnn.add_model(branch)
        train_array.append(x_train)
        val_array.append(x_val)

    model_nn = mmnn.create_model()

    # Train the model
    history = model_nn.fit(
        train_array,
        y_train,
        batch_size=cfg.model_train_batch_size(),
        epochs=cfg.model_train_epoches(),
        verbose=0,
        validation_data=(val_array, y_val),
        class_weight=cfg.model_class_weight(),
        callbacks=[checkpointer, csv_logger, early_stopping])
    score = model_nn.evaluate(val_array, y_val, verbose=0)
    print('Validation score:', score[0])
    print('Validation accuracy:', score[1])

    # summarize history for accuracy
    plots.train_val_acc(train_acc=history.history['acc'],
                        val_acc=history.history['val_acc'],
                        to_file='{}/plt_acc'.format(out_dir),
                        show=True)

    # summarize history for loss
    plots.train_val_loss(train_loss=history.history['loss'],
                         val_loss=history.history['val_loss'],
                         to_file='{}/plt_loss'.format(out_dir),
                         show=True)

    # save the model
    json_string = model_nn.to_json()
    with open('{}/model_architecture.json'.format(out_dir), 'w') as f:
        f.write(json_string)
    plot_model(model_nn, to_file='{}/model.png'.format(out_dir))
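
Example 3 below calls a get_model helper that presumably reverses this save step. Assuming the layout written here (model_architecture.json plus the checkpointer's weights.h5, both under out_dir), a minimal sketch of such a loader could look as follows; the name and two-directory signature mirror Example 3's call but are otherwise assumptions:

from keras.models import model_from_json


def get_model(arch_dir, weights_dir):
    # rebuild the network from the saved JSON architecture, then restore the
    # checkpointed HDF5 weights; this is enough for prediction, but the model
    # must be compiled again before any further training
    with open('{}/model_architecture.json'.format(arch_dir)) as f:
        model = model_from_json(f.read())
    model.load_weights('{}/weights.h5'.format(weights_dir))
    return model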
Example 3
def main(argv):
    config_file = argv[0]
    cfg = config.YamlParser(config_file)
    log_dir, out_dir = logger.init(log_dir=cfg.log_dir(),
                                   out_dir=cfg.out_dir(),
                                   level=cfg.log_level())

    if cfg.one_filer():
        (x_train,
         y_train), (x_val, y_val), (x_test,
                                    y_test) = cango.get_train_val_test_data(
                                        path=cfg.train_data(),
                                        drop_columns=cfg.drop_columns(),
                                        train_val_ratio=cfg.train_val_ratio(),
                                        do_shuffle=cfg.do_shuffle(),
                                        do_smote=False,
                                        smote_ratio=cfg.smote_ratio())
    else:
        (x_train, y_train), (x_val, y_val) = cango.get_train_val_data(
            path=cfg.train_data(),
            drop_columns=cfg.drop_columns(),
            train_val_ratio=cfg.train_val_ratio(),
            do_shuffle=cfg.do_shuffle(),
            do_smote=False,
            smote_ratio=cfg.smote_ratio())

        x_test, y_test = cango.get_test_data(path=cfg.test_data(),
                                             drop_columns=cfg.drop_columns())

    # get_model (project helper) presumably reloads the architecture JSON and
    # checkpointed weights that training wrote to out_dir
    model_nn = get_model(cfg.out_dir(), cfg.out_dir())

    # one copy of the features per input branch of the multi-input model
    x_train_array = []
    x_val_array = []
    x_test_array = []
    for i in range(0, 2):
        x_train_array.append(x_train)
        x_val_array.append(x_val)
        x_test_array.append(x_test)

    y_pred_train_out, proba_g_train, proba_b_train = get_predict(
        model=model_nn,
        data=x_train_array,
        batch_size=100,
        cutoff=cfg.cutoff())
    y_pred_train_1 = np.count_nonzero(y_pred_train_out)
    y_pred_train_0 = len(y_pred_train_out) - y_pred_train_1
    log.debug('predict train dataset distribution: 0 - {}, 1 - {}'.format(
        y_pred_train_0, y_pred_train_1))

    y_pred_val_out, proba_g_val, proba_b_val = get_predict(model=model_nn,
                                                           data=x_val_array,
                                                           batch_size=100,
                                                           cutoff=cfg.cutoff())
    y_pred_val_1 = np.count_nonzero(y_pred_val_out)
    y_pred_val_0 = len(y_pred_val_out) - y_pred_val_1
    log.debug('predict validation dataset distribution: 0 - {}, 1 - {}'.format(
        y_pred_val_0, y_pred_val_1))

    y_pred_test_out, proba_g_test, proba_b_test = get_predict(
        model=model_nn, data=x_test_array, batch_size=100, cutoff=cfg.cutoff())
    y_pred_test_1 = np.count_nonzero(y_pred_test_out)
    y_pred_test_0 = len(y_pred_test_out) - y_pred_test_1
    log.debug('predict test dataset distribution: 0 - {}, 1 - {}'.format(
        y_pred_test_0, y_pred_test_1))

    df_test = None
    # write predictions (and, when test labels are available, KS bins) to CSV
    if y_test is not None:
        np.savetxt('{}/predict_test.csv'.format(cfg.out_dir()),
                   np.c_[y_test, y_pred_test_out, proba_g_test, proba_b_test],
                   delimiter=',',
                   header='CG_Label, Label, p_g, p_b',
                   comments='',
                   fmt='%d, %d, %.6f, %.6f')
        df_test = pd.DataFrame({
            'CG_Label': y_test,
            'Label': y_pred_test_out,
            'p_g': proba_g_test,
            'p_b': proba_b_test
        })
        bins_test, c0_test, c1_test = metrics.cals_KS_bins(
            df_test, 'p_b', 'Label')
        np.savetxt('{}/predict_bin_test.csv'.format(cfg.out_dir()),
                   np.c_[bins_test, c0_test, c1_test],
                   delimiter=',',
                   header='p_b, n_g_label, n_b_label',
                   comments='',
                   fmt='%.1f, %d, %d')
    else:
        # unlabeled test set: write predictions only
        np.savetxt('{}/predict_test.csv'.format(cfg.out_dir()),
                   np.c_[y_pred_test_out, proba_g_test, proba_b_test],
                   delimiter=',',
                   header='Label, p_g, p_b',
                   comments='',
                   fmt='%d, %.6f, %.6f')

    np.savetxt('{}/predict_val.csv'.format(cfg.out_dir()),
               np.c_[y_val, y_pred_val_out, proba_g_val, proba_b_val],
               delimiter=',',
               header='CG_Label, Label, p_g, p_b',
               comments='',
               fmt='%d, %d, %.6f, %.6f')

    df_val = pd.DataFrame({
        'CG_Label': y_val,
        'Label': y_pred_val_out,
        'p_g': proba_g_val,
        'p_b': proba_b_val
    })
    bins_val, c0_val, c1_val = metrics.cals_KS_bins(df_val, 'p_b', 'CG_Label')
    np.savetxt('{}/predict_bin_val.csv'.format(cfg.out_dir()),
               np.c_[bins_val, c0_val, c1_val],
               delimiter=',',
               header='p_b, n_g_label, n_b_label',
               comments='',
               fmt='%.1f, %d, %d')

    # KS score: max gap between the cumulative good and bad distributions
    ks_val = metrics.calc_KS_AR(df_val, 'p_g', 'CG_Label')
    ks_val_value = np.max(
        np.subtract(ks_val[1]['badCumPer'].values,
                    ks_val[1]['goodCumPer'].values))
    log.info('ks val score: {}'.format(ks_val_value))
    if df_test is not None:
        # only computable when the test set is labeled (df_test built above)
        ks_test = metrics.calc_KS_AR(df_test, 'p_g', 'CG_Label')
        ks_test_value = np.max(
            np.subtract(ks_test[1]['badCumPer'].values,
                        ks_test[1]['goodCumPer'].values))
        log.info('ks test score: {}'.format(ks_test_value))

    plt.figure(figsize=(14, 10), dpi=80, facecolor='w')
    plt.plot(ks_val[1]['p_g'],
             ks_val[1]['goodCumPer'],
             lw=2,
             alpha=0.8,
             label='Good Percent - val')
    plt.plot(ks_val[1]['p_g'],
             ks_val[1]['badCumPer'],
             lw=2,
             alpha=0.8,
             label='Bad Percent - val')
    if df_test is not None:
        plt.plot(ks_test[1]['p_g'],
                 ks_test[1]['goodCumPer'],
                 lw=2,
                 alpha=0.8,
                 label='Good Percent - test')
        plt.plot(ks_test[1]['p_g'],
                 ks_test[1]['badCumPer'],
                 lw=2,
                 alpha=0.8,
                 label='Bad Percent - test')
    plt.title('K-S curve', fontsize=18)
    plt.xlabel('p_g', fontsize=14)
    plt.ylabel('good/bad percent', fontsize=14)
    plt.legend(loc='upper left', fontsize=12)
    plt.grid(b=True, ls=':')
    plt.savefig('{}/ks'.format(cfg.out_dir()))
    plt.show()

    # PSI: population stability index between the test and validation score
    # distributions; values under 0.1 are conventionally read as stable
    psiCalc = psi3.PSI()
    psi_val = psiCalc.calcPSI(y_pred_test_out, proba_b_test, y_pred_val_out,
                              proba_b_val)
    log.info('PSI (p_b): {}'.format(psi_val))
    psi_val = psiCalc.calcPSI(y_pred_test_out, proba_g_test, y_pred_val_out,
                              proba_g_val)
    log.info('PSI (p_g): {}'.format(psi_val))

    # AUC ROC
    if y_test is not None:
        y_true_arr = [y_test, y_val]
        y_score_arr = [proba_b_test, proba_b_val]
        y_label_arr = ['AUC-test', 'AUC-val']
        plots.roc_auc_multi(y_true_arr=y_true_arr,
                            y_score_arr=y_score_arr,
                            label_arr=y_label_arr,
                            to_file='{}/roc_all'.format(out_dir),
                            show=True)
        # confusion matrix
        plots.confusion_matrix(y_true=y_test,
                               y_pred=np.asarray(y_pred_test_out),
                               to_file='{}/confusion_test'.format(out_dir),
                               show=True)

    plots.confusion_matrix(y_true=y_val,
                           y_pred=np.asarray(y_pred_val_out),
                           to_file='{}/confusion_val'.format(out_dir),
                           show=True)
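
metrics.cals_KS_bins and metrics.calc_KS_AR are project helpers. In its standard form, the KS score logged above is the maximum vertical gap between the cumulative score distributions of the good and bad populations. A minimal sketch of that definition, with an illustrative function name and synthetic data rather than the project's API:

import numpy as np
import pandas as pd


def ks_statistic(scores, labels):
    # sort by score, then take the largest gap between the cumulative
    # distributions of the bad (label 1) and good (label 0) populations
    df = pd.DataFrame({'score': scores, 'label': labels}).sort_values('score')
    good_cum = (df['label'] == 0).cumsum() / float((df['label'] == 0).sum())
    bad_cum = (df['label'] == 1).cumsum() / float((df['label'] == 1).sum())
    return float(np.max(np.abs(bad_cum.values - good_cum.values)))


# a scorer that separates the classes well yields a KS close to 1
rng = np.random.RandomState(0)
labels = rng.randint(0, 2, 1000)
scores = labels * 0.6 + rng.rand(1000) * 0.4
print('KS:', ks_statistic(scores, labels))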