# coding:utf8
# Bootstrap script: resolve the application root directory (both when running
# from source and when packaged as a Windows .exe), initialize logging, and
# start the UI.
#
# NOTE: cx_freeze (used to package the program into a Windows .exe) injects a
# `frozen` attribute into the `sys` module; we use it to tell a frozen
# executable apart from plain source execution.
from common import logger
import sys, os
from entry import frameui

application_path = ''
if getattr(sys, 'frozen', False):
    # Running as a frozen executable: start from the directory of the .exe.
    application_path = os.path.dirname(sys.executable)
elif __file__:
    # Running from source: start from the directory of this script.
    application_path = os.path.dirname(__file__)
print(application_path)

# Strip the last path component to get the application root.
# BUG FIX: the original used application_path[:application_path.rfind('/')],
# which breaks on Windows paths that use backslashes (rfind('/') returns -1
# and silently chops the last character). os.path.dirname handles both
# separators correctly.
application_path = os.path.dirname(application_path)

logger.init(application_path)
frameui.run(application_path)
sys.path.append("../") sys.path.append("../strategy/") sys.path.append("../common/") import strategy.AroundClustering as strategy import config as cfg # 本地sklearn的process数量 WORKER = 64 BUFF_PER_WORKER = 50 NAME = (os.path.splitext(os.path.basename(__file__))[0]) import common.logger as logger log = logger.init(NAME) def invokeProcess(shell): ''' python调用shell的封装 :param shell: shell命令 :return: 无 ''' log.debug(shell) ret = subprocess.call(shell, shell=True) if ret != 0: log.error("some error in bash command, program will exit with -1") exit(-1)
def main(argv):
    """Train the two-branch neural network described by a YAML config file.

    :param argv: command-line args; argv[0] is the path to the YAML config
    Side effects: writes per-epoch CSV logs, best-weight checkpoints, training
    plots, the model-architecture JSON, and a model diagram to the configured
    log/output directories.
    """
    config_file = argv[0]
    cfg = config.YamlParser(config_file)
    log_dir, out_dir = logger.init(log_dir=cfg.log_dir(),
                                   out_dir=cfg.out_dir(),
                                   level=cfg.log_level())

    # NOTE(review): the first (test) split is discarded here; training and
    # validation data both come from the single train file.
    (_, _), (x_val, y_val), (x_train, y_train) = cango_pboc.get_train_val_test_data(
        path=cfg.train_data(),
        drop_columns=cfg.drop_columns(),
        train_val_ratio=cfg.train_val_ratio(),
        do_shuffle=cfg.do_shuffle(),
        do_smote=cfg.do_smote(),
        smote_ratio=cfg.smote_ratio())

    # Stream per-epoch results to a csv file.
    csv_logger = ka.callbacks.CSVLogger('{}/epoches.log'.format(log_dir))
    # Checkpoint weights after each epoch if the validation loss decreased.
    checkpointer = ka.callbacks.ModelCheckpoint(
        filepath='{}/weights.h5'.format(out_dir), verbose=1, save_best_only=True)
    # Stop training when the monitored quantity has stopped improving.
    early_stopping = ka.callbacks.EarlyStopping(monitor='val_loss',
                                                min_delta=0,
                                                patience=10,
                                                verbose=1,
                                                mode='auto')

    # Construct the two-branch model; each branch is fed the same features.
    input_dim = x_train.shape[1]
    mmnn = MultiModelsNeuralNetwork(input_dim)
    mmnn.set_reg_val(cfg.model_reg_val())
    mmnn.set_learning_rate(cfg.model_learning_rate())
    train_array = []
    val_array = []
    for i in range(0, 2):
        # Branch 0 gets zero regularization (i * 0.1 == 0); branch 1 gets
        # 10% of the configured value.
        branch = single_model.create_model(
            input_dim,
            regularization_val=cfg.model_reg_val() * (i * 0.1),
            dropout_val=cfg.model_dropout_val(),
            learning_rate=cfg.model_learning_rate())
        mmnn.add_model(branch)
        train_array.append(x_train)
        val_array.append(x_val)
    model_nn = mmnn.create_model()

    # Train the model.
    history = model_nn.fit(train_array,
                           y_train,
                           batch_size=cfg.model_train_batch_size(),
                           epochs=cfg.model_train_epoches(),
                           verbose=0,
                           validation_data=(val_array, y_val),
                           class_weight=cfg.model_class_weight(),
                           callbacks=[checkpointer, csv_logger, early_stopping])
    score = model_nn.evaluate(val_array, y_val, verbose=0)
    print('Validation score:', score[0])
    print('Validation accuracy:', score[1])

    # Summarize history for accuracy.
    plots.train_val_acc(train_acc=history.history['acc'],
                        val_acc=history.history['val_acc'],
                        to_file='{}/plt_acc'.format(out_dir),
                        show=True)
    # Summarize history for loss.
    plots.train_val_loss(train_loss=history.history['loss'],
                         val_loss=history.history['val_loss'],
                         to_file='{}/plt_loss'.format(out_dir),
                         show=True)

    # Save the model architecture. BUG FIX: use a context manager so the file
    # handle is closed deterministically (the original leaked it via
    # open(...).write()).
    json_string = model_nn.to_json()
    with open('{}/model_architecture.json'.format(out_dir), 'w') as f:
        f.write(json_string)
    plot_model(model_nn, to_file='{}/model.png'.format(out_dir))
# NOTE(review): this chunk begins mid-function — the call that the leading
# "contact.name)" closes is opened outside the visible source, so the code is
# kept verbatim. It appears to route a QQ group message to a business handler
# (bot.bizManager.route) and forward the reply via biz_comp.bot2system — TODO
# confirm against the full file. The trailing __main__ block is a Python 2
# smoke test for a private __remove_at helper (strips "@name" mentions —
# presumably; the helper is defined elsewhere). Runtime strings (Chinese log
# and reply messages) are untouched.
contact.name) if contact.name == group: logger.debug('是这个群组[%s]的消息,我立即消息路由', group) # 得到我们的业务处理组件 | route(self, client, user, group, msg): biz_comp, context = bot.bizManager.route("qq", member.name, group, content) if biz_comp is None: logger.error("无法找到对应的业务处理器![QQ],user[%s],group[%s]", member.name, group) return "不理解您的回复,请再对我说点啥" logger.debug("成功加载业务处理器[%r]", biz_comp) # 调用业务组件的接口方法来处理消息 returnMsg = biz_comp.bot2system(bot.qbot, "qq", context, member.name, group, content) if __name__ == "__main__": logger.init() logger.debug(__remove_at("@刘创 你好呀!")) print __remove_at("@刘创你好呀!") print __remove_at("@刘创 你好呀!") print __remove_at(" @刘创 你好呀! ") print __remove_at("你好呀! @刘创 ") print __remove_at("你好呀@刘创 ") print __remove_at("你好呀刘创")
def _make_dir_or_die(path):
    """Create *path* if it does not exist; print the error and exit(1) on failure."""
    try:
        # Create if not present.
        if not os.path.isdir(path):
            os.mkdir(path)
    except Exception as e:
        print('Exception occured while creating ' + path + ', EXITING...')
        print(str(e))
        sys.exit(1)


def main(argv):
    """Entry point for the SB study pipeline.

    :param argv: command-line args; argv[1] (optional) selects an
        analysis-only mode: '-a' (full analysis), '-a1', '-a2', or any other
        value for visualizations only. With no mode, the full pipeline runs:
        fetch data, quality-check/clean, feature creation + EDA, analysis,
        visualization.
    Side effects: creates the output directory tree, initializes the global
    logger, and exits the process in the analysis-only modes.
    """
    # Create the output folder before anything else.
    _make_dir_or_die(glob.OUTPUT_DIR_NAME)

    # Separate dirs for association, clustering, classification, etc.
    for d in [glob.ASSOCIATON_DIR, glob.CLUSTERING_DIR, glob.CLASSIFICATION_DIR,
              glob.REGRESSION_DIR, glob.DQS_DIR, glob.SCATTER_DIR,
              glob.EDA_DIR, glob.VIS_DIR, glob.TSA_DIR]:
        _make_dir_or_die(os.path.join(glob.OUTPUT_DIR_NAME, d))

    # Initialize logger so that we can see the traces.
    try:
        glob.log = logger.init(glob.NAME_FOR_LOGGER)
    except Exception as e:
        print('failed to initialize logger, exception: ' + str(e))
        print('EXITING..')
        sys.exit(1)

    # Logging initialized; now ready to start the data science pipeline.
    print_banner()

    # If only running in analysis mode then assume all csv files are there:
    # just run the requested analysis and exit. Each branch exits the
    # process, so the elif chain is equivalent to the original if chain.
    if len(argv) >= 2:
        mode = argv[1]
        if mode == '-a':
            glob.log.info(
                'Begin SB analysis, logs available on console and in %s' %
                (os.path.join(glob.OUTPUT_DIR_NAME, 'SBS.log')))
            analyze.run()
            a1.run()
            a2.run()
            sys.exit(0)  # all done
        elif mode == '-a1':
            glob.log.info(
                'Begin SB additional analysis, logs available on console and in %s' %
                (os.path.join(glob.OUTPUT_DIR_NAME, 'SBS.log')))
            a1.run()
            sys.exit(0)  # all done
        elif mode == '-a2':
            glob.log.info(
                'Begin SB additional analysis, logs available on console and in %s' %
                (os.path.join(glob.OUTPUT_DIR_NAME, 'SBS.log')))
            a2.run()
            sys.exit(0)  # all done
        else:
            glob.log.info(
                'Begin SB visualizations, logs available on console and in %s' %
                (os.path.join(glob.OUTPUT_DIR_NAME, 'SBS.log')))
            visualize_data()
            sys.exit(0)  # all done

    glob.log.info('Begin SB study, logs available on console and in %s' %
                  (os.path.join(glob.OUTPUT_DIR_NAME, 'SBS.log')))

    # Initialize the 'wb' module, a submodule for everything we want to do
    # with the world bank data, then do the same for the 'sb' module.
    wb.init()
    sb.init()

    # STEP 1: get the data.
    get_data()

    # STEP 2: evaluate and clean the data.
    check_quality_of_data()
    clean_data()

    # STEP 2.5: feature creation and EDA.
    create_features()
    do_eda()

    # STEP 3: analyze.
    glob.log.info('Begin SB analysis, logs available on console and in %s' %
                  (os.path.join(glob.OUTPUT_DIR_NAME, 'SBS.log')))
    analyze.run()
    a1.run()
    a2.run()

    # STEP 3.5: visualization. Further steps are currently TBD.
    visualize_data()
    # NOTE(review): "existing" is a typo for "exiting" in this runtime log
    # message; kept verbatim to preserve behavior.
    glob.log.info('all done, existing...')
def main(argv):
    """Train the two-branch model with 10-fold stratified cross-validation.

    :param argv: command-line args; argv[0] is the path to the YAML config
    Side effects: checkpoints the best weights to <out_dir>/weights.h5
    (reloaded at the start of each fold if present), prints per-fold and
    aggregate accuracy, and writes the model-architecture JSON.
    """
    config_file = argv[0]
    cfg = config.YamlParser(config_file)
    log_dir, out_dir = logger.init(log_dir=cfg.log_dir(),
                                   out_dir=cfg.out_dir(),
                                   level=cfg.log_level())
    weight_path = '{}/weights.h5'.format(out_dir)

    # Only the train and validation splits are used; the test split is dropped.
    (X, Y), (x_val, y_val), (_, _) = cango_pboc.get_train_val_test_data(
        path=cfg.train_data(),
        drop_columns=cfg.drop_columns(),
        train_val_ratio=cfg.train_val_ratio(),
        do_shuffle=cfg.do_shuffle(),
        do_smote=cfg.do_smote(),
        smote_ratio=cfg.smote_ratio())

    kfold = StratifiedKFold(n_splits=10,
                            shuffle=True,
                            random_state=constants.random_seed)
    # Checkpoint weights whenever the monitored validation loss improves.
    checkpointer = ka.callbacks.ModelCheckpoint(filepath=weight_path,
                                                verbose=1,
                                                save_best_only=True)

    # Construct the two-branch model; each branch is fed the same features.
    input_dim = X.shape[1]
    mmnn = MultiModelsNeuralNetwork(input_dim)
    mmnn.set_reg_val(cfg.model_reg_val())
    mmnn.set_learning_rate(cfg.model_learning_rate())
    for i in range(0, 2):
        # Branch 0 gets zero regularization (i * 0.1 == 0); branch 1 gets
        # 10% of the configured value.
        branch = single_model.create_model(
            input_dim,
            regularization_val=cfg.model_reg_val() * (i * 0.1),
            dropout_val=cfg.model_dropout_val(),
            learning_rate=cfg.model_learning_rate())
        mmnn.add_model(branch)
    model_nn = mmnn.create_model()

    cvscores = []
    # NOTE(review): the per-fold test indices are unused — every fold is
    # evaluated against the same fixed validation set (x_val, y_val); confirm
    # this is intentional.
    for train_index, _test_index in kfold.split(X, Y):
        # Resume from the best checkpoint so far, if one exists.
        if os.path.exists(weight_path):
            model_nn.load_weights(weight_path)
        early_stopping = ka.callbacks.EarlyStopping(monitor='val_loss',
                                                    min_delta=0,
                                                    patience=5,
                                                    verbose=1,
                                                    mode='auto')
        # Duplicate the inputs: one copy per model branch.
        train_array = []
        val_array = []
        for _ in range(0, 2):
            train_array.append(X[train_index])
            val_array.append(x_val)
        model_nn.fit(train_array,
                     Y[train_index],
                     batch_size=cfg.model_train_batch_size(),
                     epochs=cfg.model_train_epoches(),
                     verbose=0,
                     class_weight=cfg.model_class_weight(),
                     validation_data=(val_array, y_val),
                     callbacks=[early_stopping, checkpointer])
        scores = model_nn.evaluate(val_array, y_val, verbose=0)
        print("%s: %.2f%%" % (model_nn.metrics_names[1], scores[1] * 100))
        cvscores.append(scores[1] * 100)
    print("%.2f%% (+/- %.2f%%)" % (np.mean(cvscores), np.std(cvscores)))

    # Save the model architecture. BUG FIX: use a context manager so the file
    # handle is closed deterministically (the original leaked it via
    # open(...).write()).
    json_string = model_nn.to_json()
    with open('{}/model_architecture.json'.format(out_dir), 'w') as f:
        f.write(json_string)
def main(argv):
    """Score train/val/test data with the saved model and evaluate it.

    :param argv: command-line args; argv[0] is the path to the YAML config
    Side effects: writes prediction CSVs and KS-bin CSVs, logs class
    distributions and KS/PSI scores, and saves KS-curve / ROC / confusion
    plots to the configured output directory.
    """
    config_file = argv[0]
    cfg = config.YamlParser(config_file)
    log_dir, out_dir = logger.init(log_dir=cfg.log_dir(),
                                   out_dir=cfg.out_dir(),
                                   level=cfg.log_level())

    # Load the data: either all splits from one file, or train/val from one
    # file and the test set from a separate file (y_test may be None there).
    if cfg.one_filer():
        (x_train, y_train), (x_val, y_val), (x_test, y_test) = cango.get_train_val_test_data(
            path=cfg.train_data(),
            drop_columns=cfg.drop_columns(),
            train_val_ratio=cfg.train_val_ratio(),
            do_shuffle=cfg.do_shuffle(),
            do_smote=False,
            smote_ratio=cfg.smote_ratio())
    else:
        (x_train, y_train), (x_val, y_val) = cango.get_train_val_data(
            path=cfg.train_data(),
            drop_columns=cfg.drop_columns(),
            train_val_ratio=cfg.train_val_ratio(),
            do_shuffle=cfg.do_shuffle(),
            do_smote=False,
            smote_ratio=cfg.smote_ratio())
        x_test, y_test = cango.get_test_data(path=cfg.test_data(),
                                             drop_columns=cfg.drop_columns())

    model_nn = get_model(cfg.out_dir(), cfg.out_dir())

    # Duplicate each input: one copy per model branch.
    x_train_array = []
    x_val_array = []
    x_test_array = []
    for _ in range(0, 2):
        x_train_array.append(x_train)
        x_val_array.append(x_val)
        x_test_array.append(x_test)

    # Predict each split and log the predicted class distribution.
    y_pred_train_out, proba_g_train, proba_b_train = get_predict(
        model=model_nn, data=x_train_array, batch_size=100, cutoff=cfg.cutoff())
    y_pred_train_1 = np.count_nonzero(y_pred_train_out)
    y_pred_train_0 = len(y_pred_train_out) - y_pred_train_1
    log.debug('predict train dataset distribution: 0 - {}, 1 - {}'.format(
        y_pred_train_0, y_pred_train_1))

    y_pred_val_out, proba_g_val, proba_b_val = get_predict(model=model_nn,
                                                           data=x_val_array,
                                                           batch_size=100,
                                                           cutoff=cfg.cutoff())
    y_pred_val_1 = np.count_nonzero(y_pred_val_out)
    y_pred_val_0 = len(y_pred_val_out) - y_pred_val_1
    log.debug('predict validation dataset distribution: 0 - {}, 1 - {}'.format(
        y_pred_val_0, y_pred_val_1))

    y_pred_test_out, proba_g_test, proba_b_test = get_predict(
        model=model_nn, data=x_test_array, batch_size=100, cutoff=cfg.cutoff())
    y_pred_test_1 = np.count_nonzero(y_pred_test_out)
    y_pred_test_0 = len(y_pred_test_out) - y_pred_test_1
    log.debug('predict test dataset distribution: 0 - {}, 1 - {}'.format(
        y_pred_test_0, y_pred_test_1))

    df_test = None
    # Output: with ground-truth labels we can also compute KS bins for test.
    if y_test is not None:
        np.savetxt('{}/predict_test.csv'.format(cfg.out_dir()),
                   np.c_[y_test, y_pred_test_out, proba_g_test, proba_b_test],
                   delimiter=',',
                   header='CG_Label, Label, p_g, p_b',
                   comments='',
                   fmt='%d, %d, %.6f, %.6f')
        df_test = pd.DataFrame({
            'CG_Label': y_test,
            'Label': y_pred_test_out,
            'p_g': proba_g_test,
            'p_b': proba_b_test
        })
        # NOTE(review): the test bins use 'Label' (predictions) as the target
        # while the val bins below use 'CG_Label' (ground truth) — confirm
        # this asymmetry is intentional.
        bins_test, c0_test, c1_test = metrics.cals_KS_bins(
            df_test, 'p_b', 'Label')
        np.savetxt('{}/predict_bin_test.csv'.format(cfg.out_dir()),
                   np.c_[bins_test, c0_test, c1_test],
                   delimiter=',',
                   header='p_b, n_g_label, n_b_label',
                   comments='',
                   fmt='%.1f, %d, %d')
    else:
        np.savetxt('{}/predict_bin_test.csv'.format(cfg.out_dir()),
                   np.c_[y_pred_test_out, proba_g_test, proba_b_test],
                   delimiter=',',
                   header='Label, p_g, p_b',
                   comments='',
                   fmt='%d, %.6f, %.6f')

    np.savetxt('{}/predict_val.csv'.format(cfg.out_dir()),
               np.c_[y_val, y_pred_val_out, proba_g_val, proba_b_val],
               delimiter=',',
               header='CG_Label, Label, p_g, p_b',
               comments='',
               fmt='%d, %d, %.6f, %.6f')
    df_val = pd.DataFrame({
        'CG_Label': y_val,
        'Label': y_pred_val_out,
        'p_g': proba_g_val,
        'p_b': proba_b_val
    })
    bins_val, c0_val, c1_val = metrics.cals_KS_bins(df_val, 'p_b', 'CG_Label')
    np.savetxt('{}/predict_bin_val.csv'.format(cfg.out_dir()),
               np.c_[bins_val, c0_val, c1_val],
               delimiter=',',
               header='p_b, n_g_label, n_b_Label',
               comments='',
               fmt='%.1f, %d, %d')

    # KS test score (max gap between bad and good cumulative percentages).
    ks_val = metrics.calc_KS_AR(df_val, 'p_g', 'CG_Label')
    ks_val_value = np.max(
        np.subtract(ks_val[1]['badCumPer'].values,
                    ks_val[1]['goodCumPer'].values))
    log.info('ks val score: {}'.format(ks_val_value))

    # BUG FIX: the original called metrics.calc_KS_AR(df_test, ...)
    # unconditionally, crashing when y_test is None (df_test stays None).
    # The test-side KS score and curves are now computed only when labels
    # are available.
    ks_test = None
    if df_test is not None:
        ks_test = metrics.calc_KS_AR(df_test, 'p_g', 'CG_Label')
        ks_test_value = np.max(
            np.subtract(ks_test[1]['badCumPer'].values,
                        ks_test[1]['goodCumPer'].values))
        log.info('ks test score: {}'.format(ks_test_value))

    # K-S curve: good/bad cumulative percentages for val (and test if known).
    plt.figure(figsize=(14, 10), dpi=80, facecolor='w')
    plt.plot(ks_val[1]['p_g'], ks_val[1]['goodCumPer'],
             lw=2, alpha=0.8, label='Good Percent -val')
    if ks_test is not None:
        plt.plot(ks_test[1]['p_g'], ks_test[1]['goodCumPer'],
                 lw=2, alpha=0.8, label='Good Percent -test')
    plt.plot(ks_val[1]['p_g'], ks_val[1]['badCumPer'],
             lw=2, alpha=0.8, label='Bad Percent- val')
    if ks_test is not None:
        plt.plot(ks_test[1]['p_g'], ks_test[1]['badCumPer'],
                 lw=2, alpha=0.8, label='Bad Percent -test')
    plt.title('K-S curve', fontsize=18)
    plt.xlabel('p_b', fontsize=14)
    plt.ylabel('good/bad percent', fontsize=14)
    plt.legend(loc='upper left', fontsize=12)
    plt.grid(b=True, ls=':')
    plt.savefig('{}/ks'.format(cfg.out_dir()))
    plt.show()

    # PSI: population stability between the test and validation scores.
    psiCalc = psi3.PSI()
    psi_val = psiCalc.calcPSI(y_pred_test_out, proba_b_test, y_pred_val_out,
                              proba_b_val)
    log.info('PSI (p_b): {}'.format(psi_val))
    psi_val = psiCalc.calcPSI(y_pred_test_out, proba_g_test, y_pred_val_out,
                              proba_g_val)
    log.info('PSI (p_g): {}'.format(psi_val))

    # AUC ROC and confusion matrices (need ground-truth test labels).
    if y_test is not None:
        y_true_arr = [y_test, y_val]
        y_score_arr = [proba_b_test, proba_b_val]
        y_label_arr = ['AUC-test', 'AUC-val']
        plots.roc_auc_multi(y_true_arr=y_true_arr,
                            y_score_arr=y_score_arr,
                            label_arr=y_label_arr,
                            to_file='{}/roc_all'.format(out_dir),
                            show=True)
        plots.confusion_matrix(y_true=y_test,
                               y_pred=np.asarray(y_pred_test_out),
                               to_file='{}/confusion_test'.format(out_dir),
                               show=True)
    plots.confusion_matrix(y_true=y_val,
                           y_pred=np.asarray(y_pred_val_out),
                           to_file='{}/confusion_val'.format(out_dir),
                           show=True)