Esempio n. 1
0
# coding:utf8

from common import logger
import sys, os
from entry import frameui

# Resolve the directory this program is running from.
# Related to cx_freeze, a library for packaging the program into a Windows
# .exe; it injects a `frozen` attribute into sys, in which case the real
# location is the executable's directory rather than this source file's.
# application_path = ''
if getattr(sys, 'frozen', False):
    application_path = os.path.dirname(sys.executable)
elif __file__:
    application_path = os.path.dirname(__file__)

print(application_path)
# Strip the last path component to get the application root.
# NOTE(review): rfind('/') assumes POSIX-style separators; on Windows
# (where cx_freeze applies) dirname() may yield backslash paths -- confirm.
application_path = application_path[:application_path.rfind('/')]
logger.init(application_path)
# logger.init(application_path)
frameui.run(application_path)
# Make sibling packages importable for the imports that follow.
sys.path.append("../")
sys.path.append("../strategy/")
sys.path.append("../common/")
import strategy.AroundClustering as strategy
import config as cfg

# Number of local sklearn worker processes.
WORKER = 64
# Work-item buffer per worker process.
BUFF_PER_WORKER = 50

# Logger name derived from this script's filename (without extension).
NAME = (os.path.splitext(os.path.basename(__file__))[0])

# NOTE(review): this rebinds the `logger` name imported at the top of the
# file (`from common import logger`) -- confirm both refer to the same module.
import common.logger as logger

log = logger.init(NAME)


def invokeProcess(shell):
    '''
    Wrapper for invoking a shell command from Python.

    Logs the command, runs it through the shell, and terminates the whole
    program with status -1 if the command exits non-zero.

    :param shell: shell command line to execute
    :return: None; calls sys.exit(-1) on command failure
    '''

    log.debug(shell)
    # SECURITY NOTE: shell=True executes the string through the shell --
    # never pass untrusted input here.
    ret = subprocess.call(shell, shell=True)
    if ret != 0:
        log.error("some error in bash command, program will exit with -1")
        # Fixed: use sys.exit instead of the `exit` builtin, which is a
        # site.py convenience intended only for interactive sessions.
        sys.exit(-1)
Esempio n. 3
0
def main(argv):
    """Train the two-branch neural network described by a YAML config.

    :param argv: command-line args; argv[0] is the YAML config file path.
    Side effects: writes epoches.log, weights.h5, accuracy/loss plots and
    the model architecture (JSON + PNG) under the configured directories.
    """
    config_file = argv[0]
    cfg = config.YamlParser(config_file)
    log_dir, out_dir = logger.init(log_dir=cfg.log_dir(),
                                   out_dir=cfg.out_dir(),
                                   level=cfg.log_level())

    # NOTE(review): the first tuple of the 3-way split is discarded --
    # presumably the test portion; confirm the helper's return order.
    (_, _), (x_val, y_val), (x_train,
                             y_train) = cango_pboc.get_train_val_test_data(
                                 path=cfg.train_data(),
                                 drop_columns=cfg.drop_columns(),
                                 train_val_ratio=cfg.train_val_ratio(),
                                 do_shuffle=cfg.do_shuffle(),
                                 do_smote=cfg.do_smote(),
                                 smote_ratio=cfg.smote_ratio())

    # (x_train, y_train), (x_val, y_val) = cango_pboc.get_train_val_data(
    #     path=cfg.train_data(), drop_columns=cfg.drop_columns(),
    #     train_val_ratio=cfg.train_val_ratio(),
    #     do_shuffle=cfg.do_shuffle(), do_smote=cfg.do_smote(), smote_ratio=cfg.smote_ratio())

    # streams epoch results to a csv file
    csv_logger = ka.callbacks.CSVLogger('{}/epoches.log'.format(log_dir))

    # checkpoint weight after each epoch if the validation loss decreased
    checkpointer = ka.callbacks.ModelCheckpoint(
        filepath='{}/weights.h5'.format(out_dir),
        verbose=1,
        save_best_only=True)

    # stop training when a monitored quality has stopped improving
    early_stopping = ka.callbacks.EarlyStopping(monitor='val_loss',
                                                min_delta=0,
                                                patience=10,
                                                verbose=1,
                                                mode='auto')

    # Construct the model
    input_dim = x_train.shape[1]
    mmnn = MultiModelsNeuralNetwork(input_dim)
    mmnn.set_reg_val(cfg.model_reg_val())
    mmnn.set_learning_rate(cfg.model_learning_rate())

    # Build two branches with staggered regularization (i * 0.1 scaling);
    # the multi-branch model consumes one copy of the input per branch,
    # hence x_train / x_val are appended once per branch.
    train_array = []
    val_array = []
    for i in range(0, 2):
        branch = single_model.create_model(
            input_dim,
            regularization_val=cfg.model_reg_val() * (i * 0.1),
            dropout_val=cfg.model_dropout_val(),
            learning_rate=cfg.model_learning_rate())
        mmnn.add_model(branch)
        train_array.append(x_train)
        val_array.append(x_val)

    model_nn = mmnn.create_model()

    # Train the model
    history = model_nn.fit(
        train_array,
        y_train,
        batch_size=cfg.model_train_batch_size(),
        epochs=cfg.model_train_epoches(),
        verbose=0,
        validation_data=(val_array, y_val),
        class_weight=cfg.model_class_weight(),
        callbacks=[checkpointer, csv_logger, early_stopping])
    score = model_nn.evaluate(val_array, y_val, verbose=0)
    print('Validation score:', score[0])
    print('Validation accuracy:', score[1])

    # summarize history for accuracy
    # NOTE(review): Keras >= 2.3 uses the history key 'accuracy' instead
    # of 'acc' -- confirm against the pinned Keras version.
    plots.train_val_acc(train_acc=history.history['acc'],
                        val_acc=history.history['val_acc'],
                        to_file='{}/plt_acc'.format(out_dir),
                        show=True)

    # summarize history for loss
    plots.train_val_loss(train_loss=history.history['loss'],
                         val_loss=history.history['val_loss'],
                         to_file='{}/plt_loss'.format(out_dir),
                         show=True)

    # save the model
    # NOTE(review): the file handle is never closed -- consider `with`.
    json_string = model_nn.to_json()
    open('{}/model_architecture.json'.format(out_dir), 'w').write(json_string)
    plot_model(model_nn, to_file='{}/model.png'.format(out_dir))
Esempio n. 4
0
                     contact.name)

        if contact.name == group:
            logger.debug('是这个群组[%s]的消息,我立即消息路由', group)

            # 得到我们的业务处理组件 | route(self, client, user, group, msg):
            biz_comp, context = bot.bizManager.route("qq", member.name, group,
                                                     content)
            if biz_comp is None:
                logger.error("无法找到对应的业务处理器![QQ],user[%s],group[%s]",
                             member.name, group)
                return "不理解您的回复,请再对我说点啥"

            logger.debug("成功加载业务处理器[%r]", biz_comp)

            # 调用业务组件的接口方法来处理消息
            returnMsg = biz_comp.bot2system(bot.qbot, "qq", context,
                                            member.name, group, content)


if __name__ == "__main__":

    logger.init()
    logger.debug(__remove_at("@刘创 你好呀!"))

    # Smoke-test __remove_at against various @-mention placements.
    # Fixed: the original used Python 2 `print` statements, which are a
    # syntax error under Python 3 (the rest of this file calls print()).
    print(__remove_at("@刘创你好呀!"))
    print(__remove_at("@刘创  你好呀!"))
    print(__remove_at("  @刘创 你好呀!  "))
    print(__remove_at("你好呀! @刘创 "))
    print(__remove_at("你好呀@刘创  "))
    print(__remove_at("你好呀刘创"))
Esempio n. 5
0
def _ensure_dir(path):
    """Create directory *path* if missing; print the error and exit(1) on failure."""
    try:
        # create if not present
        if not os.path.isdir(path):
            os.mkdir(path)
    except Exception as e:
        print('Exception occured while creating ' + path + ', EXITING...')
        print(str(e))
        sys.exit(1)


def main(argv):
    """Entry point for the SB study pipeline.

    With no extra args, runs the full pipeline: fetch data, quality check,
    clean, feature creation, EDA, analysis, visualization. With argv[1]
    set, runs a single mode and exits: '-a' (analysis), '-a1'/'-a2'
    (additional analyses), anything else (visualizations only).

    :param argv: command-line arguments; argv[1] is an optional mode flag.
    """
    # create output folder before anything else
    output_dir = glob.OUTPUT_DIR_NAME
    _ensure_dir(output_dir)

    # separate dir for association, clustering, classification etc.
    for d in [
            glob.ASSOCIATON_DIR, glob.CLUSTERING_DIR, glob.CLASSIFICATION_DIR,
            glob.REGRESSION_DIR, glob.DQS_DIR, glob.SCATTER_DIR, glob.EDA_DIR,
            glob.VIS_DIR, glob.TSA_DIR
    ]:
        _ensure_dir(os.path.join(glob.OUTPUT_DIR_NAME, d))

    # initialize logger so that we can see the traces
    try:
        glob.log = logger.init(glob.NAME_FOR_LOGGER)
    except Exception as e:
        print('failed to initialize logger, exception: ' + str(e))
        print('EXITING..')
        sys.exit(1)

    # logging initialized, now ready to start the data science pipeline
    print_banner()

    # If running in a single-shot mode, assume all csv files are already
    # present: run just that mode and exit (refactored from a chain of
    # independent `if`s to an equivalent elif ladder; every branch exited).
    log_path = os.path.join(glob.OUTPUT_DIR_NAME, 'SBS.log')
    if len(argv) >= 2:
        mode = argv[1]
        if mode == '-a':
            glob.log.info(
                'Begin SB analysis, logs available on console and in %s' %
                (log_path))
            analyze.run()
            a1.run()
            a2.run()
        elif mode == '-a1':
            glob.log.info(
                'Begin SB additional analysis, logs available on console and in %s'
                % (log_path))
            a1.run()
        elif mode == '-a2':
            glob.log.info(
                'Begin SB additional analysis, logs available on console and in %s'
                % (log_path))
            a2.run()
        else:
            glob.log.info(
                'Begin SB visualizations, logs available on console and in %s'
                % (log_path))
            visualize_data()
        sys.exit(0)  ## all done

    glob.log.info('Begin SB study, logs available on console and in %s' %
                  (log_path))

    # initialize the 'wb' module which is a submodule for everything we want
    # to do with the world bank data, then the same for the 'sb' module
    wb.init()
    sb.init()

    # STEP 1: get the data
    get_data()

    # Step 2: evaluate and clean the data
    check_quality_of_data()
    clean_data()

    # Step 2.5: feature creation and EDA
    create_features()
    do_eda()

    # Step 3: analyze
    glob.log.info('Begin SB analysis, logs available on console and in %s' %
                  (log_path))
    analyze.run()
    a1.run()
    a2.run()

    # Step 3.5: TBD
    visualize_data()

    # Further steps are currently TBD
    # Fixed message typo: was 'existing...'
    glob.log.info('all done, exiting...')
Esempio n. 6
0
def main(argv):
    """10-fold cross-validated training of the multi-branch network.

    :param argv: command-line args; argv[0] is the YAML config file path.
    Side effects: checkpoints best weights to <out_dir>/weights.h5 and
    writes the model architecture JSON to <out_dir>.
    """
    config_file = argv[0]
    cfg = config.YamlParser(config_file)
    log_dir, out_dir = logger.init(log_dir=cfg.log_dir(),
                                   out_dir=cfg.out_dir(),
                                   level=cfg.log_level())
    weight_path = '{}/weights.h5'.format(out_dir)

    (X, Y), (x_val, y_val), (_, _) = cango_pboc.get_train_val_test_data(
        path=cfg.train_data(),
        drop_columns=cfg.drop_columns(),
        train_val_ratio=cfg.train_val_ratio(),
        do_shuffle=cfg.do_shuffle(),
        do_smote=cfg.do_smote(),
        smote_ratio=cfg.smote_ratio())

    # (X, Y), (x_val, y_val) = cango_pboc.get_train_val_data(
    #     path=cfg.train_data(), drop_columns=cfg.drop_columns(),
    #     train_val_ratio=cfg.train_val_ratio(),
    #     do_shuffle=cfg.do_shuffle(), do_smote=cfg.do_smote(), smote_ratio=cfg.smote_ratio())

    kfold = StratifiedKFold(n_splits=10,
                            shuffle=True,
                            random_state=constants.random_seed)

    # checkpoint weights whenever the validation loss improves
    checkpointer = ka.callbacks.ModelCheckpoint(filepath=weight_path,
                                                verbose=1,
                                                save_best_only=True)

    # Construct the model
    input_dim = X.shape[1]
    mmnn = MultiModelsNeuralNetwork(input_dim)
    mmnn.set_reg_val(cfg.model_reg_val())
    mmnn.set_learning_rate(cfg.model_learning_rate())

    # Two branches with staggered regularization (i * 0.1 scaling).
    for i in range(0, 2):
        branch = single_model.create_model(
            input_dim,
            regularization_val=cfg.model_reg_val() * (i * 0.1),
            dropout_val=cfg.model_dropout_val(),
            learning_rate=cfg.model_learning_rate())
        mmnn.add_model(branch)

    model_nn = mmnn.create_model()

    cvscores = []
    # NOTE(review): test_index is never used -- every fold validates
    # against the fixed (x_val, y_val) split rather than X[test_index];
    # confirm this is intentional.
    for train_index, test_index in kfold.split(X, Y):

        # resume from the best checkpoint of previous folds, if any
        if os.path.exists(weight_path):
            model_nn.load_weights(weight_path)

        early_stopping = ka.callbacks.EarlyStopping(monitor='val_loss',
                                                    min_delta=0,
                                                    patience=5,
                                                    verbose=1,
                                                    mode='auto')
        # the multi-branch model takes one copy of each input per branch
        train_array = []
        val_array = []
        for i in range(0, 2):
            train_array.append(X[train_index])
            val_array.append(x_val)

        model_nn.fit(train_array,
                     Y[train_index],
                     batch_size=cfg.model_train_batch_size(),
                     epochs=cfg.model_train_epoches(),
                     verbose=0,
                     class_weight=cfg.model_class_weight(),
                     validation_data=(val_array, y_val),
                     callbacks=[early_stopping, checkpointer])
        scores = model_nn.evaluate(val_array, y_val, verbose=0)
        print("%s: %.2f%%" % (model_nn.metrics_names[1], scores[1] * 100))
        cvscores.append(scores[1] * 100)

    print("%.2f%% (+/- %.2f%%)" % (np.mean(cvscores), np.std(cvscores)))

    # save the model
    # NOTE(review): the file handle is never closed -- consider `with`.
    json_string = model_nn.to_json()
    open('{}/model_architecture.json'.format(out_dir), 'w').write(json_string)
Esempio n. 7
0
def main(argv):
    """Score a trained model on train/val/test data and write diagnostics.

    :param argv: command-line args; argv[0] is the YAML config file path.
    Side effects: writes prediction CSVs, a K-S curve plot, PSI log lines,
    and (when test labels exist) ROC and confusion-matrix plots under the
    configured output directory.
    """
    config_file = argv[0]
    cfg = config.YamlParser(config_file)
    log_dir, out_dir = logger.init(log_dir=cfg.log_dir(),
                                   out_dir=cfg.out_dir(),
                                   level=cfg.log_level())

    # Load data: either one file split three ways, or separate train and
    # test files.
    if cfg.one_filer():
        (x_train,
         y_train), (x_val, y_val), (x_test,
                                    y_test) = cango.get_train_val_test_data(
                                        path=cfg.train_data(),
                                        drop_columns=cfg.drop_columns(),
                                        train_val_ratio=cfg.train_val_ratio(),
                                        do_shuffle=cfg.do_shuffle(),
                                        do_smote=False,
                                        smote_ratio=cfg.smote_ratio())
    else:
        (x_train, y_train), (x_val, y_val) = cango.get_train_val_data(
            path=cfg.train_data(),
            drop_columns=cfg.drop_columns(),
            train_val_ratio=cfg.train_val_ratio(),
            do_shuffle=cfg.do_shuffle(),
            do_smote=False,
            smote_ratio=cfg.smote_ratio())

        x_test, y_test = cango.get_test_data(path=cfg.test_data(),
                                             drop_columns=cfg.drop_columns())

    # NOTE(review): both arguments are cfg.out_dir() -- confirm the first
    # one should not be a separate architecture/weights directory.
    model_nn = get_model(cfg.out_dir(), cfg.out_dir())

    # The multi-branch model takes one copy of each input per branch.
    x_train_array = []
    x_val_array = []
    x_test_array = []
    for i in range(0, 2):
        x_train_array.append(x_train)
        x_val_array.append(x_val)
        x_test_array.append(x_test)

    # Predictions and predicted class distributions for each dataset.
    y_pred_train_out, proba_g_train, proba_b_train = get_predict(
        model=model_nn,
        data=x_train_array,
        batch_size=100,
        cutoff=cfg.cutoff())
    y_pred_train_1 = np.count_nonzero(y_pred_train_out)
    y_pred_train_0 = len(y_pred_train_out) - y_pred_train_1
    log.debug('predict train dataset distribution: 0 - {}, 1 - {}'.format(
        y_pred_train_0, y_pred_train_1))

    y_pred_val_out, proba_g_val, proba_b_val = get_predict(model=model_nn,
                                                           data=x_val_array,
                                                           batch_size=100,
                                                           cutoff=cfg.cutoff())
    y_pred_val_1 = np.count_nonzero(y_pred_val_out)
    y_pred_val_0 = len(y_pred_val_out) - y_pred_val_1
    log.debug('predict validation dataset distribution: 0 - {}, 1 - {}'.format(
        y_pred_val_0, y_pred_val_1))

    y_pred_test_out, proba_g_test, proba_b_test = get_predict(
        model=model_nn, data=x_test_array, batch_size=100, cutoff=cfg.cutoff())
    y_pred_test_1 = np.count_nonzero(y_pred_test_out)
    y_pred_test_0 = len(y_pred_test_out) - y_pred_test_1
    log.debug('predict test dataset distribution: 0 - {}, 1 - {}'.format(
        y_pred_test_0, y_pred_test_1))

    df_test = None
    # Output test predictions (with true labels when available).
    if y_test is not None:
        np.savetxt('{}/predict_test.csv'.format(cfg.out_dir()),
                   np.c_[y_test, y_pred_test_out, proba_g_test, proba_b_test],
                   delimiter=',',
                   header='CG_Label, Label, p_g, p_b',
                   comments='',
                   fmt='%d, %d, %.6f, %.6f')
        df_test = pd.DataFrame({
            'CG_Label': y_test,
            'Label': y_pred_test_out,
            'p_g': proba_g_test,
            'p_b': proba_b_test
        })
        # NOTE(review): bins here are computed against predicted 'Label',
        # while the val bins below use true 'CG_Label' -- confirm intended.
        bins_test, c0_test, c1_test = metrics.cals_KS_bins(
            df_test, 'p_b', 'Label')
        np.savetxt('{}/predict_bin_test.csv'.format(cfg.out_dir()),
                   np.c_[bins_test, c0_test, c1_test],
                   delimiter=',',
                   header='p_b, n_g_label, n_b_label',
                   comments='',
                   fmt='%.1f, %d, %d')
    else:
        np.savetxt('{}/predict_bin_test.csv'.format(cfg.out_dir()),
                   np.c_[y_pred_test_out, proba_g_test, proba_b_test],
                   delimiter=',',
                   header='Label, p_g, p_b',
                   comments='',
                   fmt='%d, %.6f, %.6f')

    np.savetxt('{}/predict_val.csv'.format(cfg.out_dir()),
               np.c_[y_val, y_pred_val_out, proba_g_val, proba_b_val],
               delimiter=',',
               header='CG_Label, Label, p_g, p_b',
               comments='',
               fmt='%d, %d, %.6f, %.6f')

    df_val = pd.DataFrame({
        'CG_Label': y_val,
        'Label': y_pred_val_out,
        'p_g': proba_g_val,
        'p_b': proba_b_val
    })
    bins_val, c0_val, c1_val = metrics.cals_KS_bins(df_val, 'p_b', 'CG_Label')
    np.savetxt('{}/predict_bin_val.csv'.format(cfg.out_dir()),
               np.c_[bins_val, c0_val, c1_val],
               delimiter=',',
               header='p_b, n_g_label, n_b_Label',
               comments='',
               fmt='%.1f, %d, %d')

    # KS test score
    ks_val = metrics.calc_KS_AR(df_val, 'p_g', 'CG_Label')
    ks_val_value = np.max(
        np.subtract(ks_val[1]['badCumPer'].values,
                    ks_val[1]['goodCumPer'].values))
    log.info('ks val score: {}'.format(ks_val_value))

    # Fixed: the original unconditionally ran calc_KS_AR on df_test, which
    # is None whenever no test labels exist -- guard the test-side K-S
    # statistics and curves on label availability.
    ks_test = None
    if df_test is not None:
        ks_test = metrics.calc_KS_AR(df_test, 'p_g', 'CG_Label')
        ks_test_value = np.max(
            np.subtract(ks_test[1]['badCumPer'].values,
                        ks_test[1]['goodCumPer'].values))
        log.info('ks test score: {}'.format(ks_test_value))

    plt.figure(figsize=(14, 10), dpi=80, facecolor='w')
    plt.plot(ks_val[1]['p_g'],
             ks_val[1]['goodCumPer'],
             lw=2,
             alpha=0.8,
             label='Good Percent -val')
    if ks_test is not None:
        plt.plot(ks_test[1]['p_g'],
                 ks_test[1]['goodCumPer'],
                 lw=2,
                 alpha=0.8,
                 label='Good Percent -test')
    plt.plot(ks_val[1]['p_g'],
             ks_val[1]['badCumPer'],
             lw=2,
             alpha=0.8,
             label='Bad Percent- val')
    if ks_test is not None:
        plt.plot(ks_test[1]['p_g'],
                 ks_test[1]['badCumPer'],
                 lw=2,
                 alpha=0.8,
                 label='Bad Percent -test')
    #plt.xticks(list(train_ks[1]['goodCumPer'].index), list(train_ks[1]['train_proba'].unique()), rotation=90)
    plt.title('K-S curve', fontsize=18)
    plt.xlabel('p_b', fontsize=14)
    plt.ylabel('good/bad percent', fontsize=14)
    plt.legend(loc='upper left', fontsize=12)
    # NOTE(review): `b=` was renamed `visible=` in matplotlib 3.5 --
    # confirm against the pinned matplotlib version.
    plt.grid(b=True, ls=':')
    plt.savefig('{}/ks'.format(cfg.out_dir()))
    plt.show()

    # PSI (population stability) between test and val score distributions.
    psiCalc = psi3.PSI()
    psi_val = psiCalc.calcPSI(y_pred_test_out, proba_b_test, y_pred_val_out,
                              proba_b_val)
    log.info('PSI (p_b): {}'.format(psi_val))
    psi_val = psiCalc.calcPSI(y_pred_test_out, proba_g_test, y_pred_val_out,
                              proba_g_val)
    log.info('PSI (p_g): {}'.format(psi_val))

    # AUC ROC and confusion matrix -- test plots only when labels exist.
    if y_test is not None:
        y_true_arr = [y_test, y_val]
        y_score_arr = [proba_b_test, proba_b_val]
        y_label_arr = ['AUC-test', 'AUC-val']
        plots.roc_auc_multi(y_true_arr=y_true_arr,
                            y_score_arr=y_score_arr,
                            label_arr=y_label_arr,
                            to_file='{}/roc_all'.format(out_dir),
                            show=True)
        # confusion matrix
        plots.confusion_matrix(y_true=y_test,
                               y_pred=np.asarray(y_pred_test_out),
                               to_file='{}/confusion_test'.format(out_dir),
                               show=True)

    plots.confusion_matrix(y_true=y_val,
                           y_pred=np.asarray(y_pred_val_out),
                           to_file='{}/confusion_val'.format(out_dir),
                           show=True)