def record_offline_time(count):
    """
    Record the times at which all merchants go offline or come back online.
    :param count: number of merchants currently online
    :return:
    """
    global SUMMARY_INFO
    SUMMARY_INFO["last_valid_count"] = SUMMARY_INFO.get(
        "current_valid_count", None)
    SUMMARY_INFO["current_valid_count"] = count
    if SUMMARY_INFO["last_valid_count"] and count <= 0:
        # Previous count was non-zero and the current one is zero:
        # all merchants just went offline.
        SUMMARY_INFO['offline_time'] = get_now_str()
    if not SUMMARY_INFO["last_valid_count"] and count:
        # Previous count was zero (or unset) and the current one is
        # non-zero: merchants just came back online.
        SUMMARY_INFO['online_time'] = get_now_str()
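# Illustrative usage sketch (not part of the original module), assuming
# SUMMARY_INFO starts as an empty dict and get_now_str() returns a timestamp
# string. Three successive samples exercise both state transitions:
#
#   SUMMARY_INFO = {}
#   record_offline_time(5)   # None -> 5: sets SUMMARY_INFO['online_time']
#   record_offline_time(0)   # 5 -> 0:    sets SUMMARY_INFO['offline_time']
#   record_offline_time(3)   # 0 -> 3:    sets SUMMARY_INFO['online_time'] again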
def run_lgbm_predict_submit():
    # Earlier model checkpoints, kept for reference:
    # name = 'Booster_bagging_fraction:0.85_bagging_freq:3_categorical_column:[0, 1, 2]_feature_fraction:0.7_learning_rate:0.04_max_bin:255_max_depth:8_num_leaves:200_08-13-12-16-48.pkl'
    # name = 'Booster_bagging_fraction:0.85_bagging_freq:3_categorical_column:[0, 1, 2]_feature_fraction:0.7_learning_rate:0.04_max_bin:255_max_depth:8_num_leaves:170_08-14-08-39-57.pkl'
    name = 'Booster_bagging_fraction:0.85_bagging_freq:3_categorical_column:[0, 1, 2]_feature_fraction:0.7_learning_rate:0.03_max_bin:255_max_depth:8_num_leaves:170_08-15-00-36-41.pkl'
    # name = 'Booster_bagging_fraction:0.85_bagging_freq:3_categorical_column:[0, 1, 2]_feature_fraction:0.6_learning_rate:0.03_max_bin:255_max_depth:8_num_leaves:170_08-15-02-58-34.pkl'
    predict_file = PREDICT_DIR + 'test_core_lgbm_submit.pkl'
    lgbm_predict(
        name,
        INPUT_DIR + 'test_diff_order_streaks_fix2_add_new.csv_core.csv',
        predict_file,
        True)
    convert_to_submit_max_f1(
        predict_file, 'submit_max_f1_' + utils.get_now_str() + '.csv')
def run_convert():
    # Thresholds tried in earlier runs:
    # proba_threshold = 0.53480
    # proba_threshold = 0.56982
    # proba_threshold = 0.70710
    # proba_threshold = 0.53781
    # proba_threshold = 0.70056
    # proba_threshold = 0.68611
    # proba_threshold = 0.55909
    proba_threshold = 0.5
    convert_to_submit(
        PREDICT_DIR + 'test_diff_stack.csv',
        proba_threshold,
        'submit_' + utils.get_now_str() + '.csv')
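# Hedged sketch of what the thresholding step amounts to (convert_to_submit
# itself is defined elsewhere in this repo; the 'proba' column name is an
# assumption): products whose predicted probability clears proba_threshold
# are kept per user, everything else is dropped.
#
#   df = pd.read_csv(PREDICT_DIR + 'test_diff_stack.csv')
#   kept = df[df['proba'] > proba_threshold]
#   submission = kept.groupby(UID)[PID].apply(list)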
def validate_model_mean_f1(predict_file, save_analysis=False):
    import max_f1_predict
    begin_time = time.time()
    # df = pd.read_csv(PREDICT_DIR + predict_file)
    df = pd.read_pickle(PREDICT_DIR + predict_file)
    # Collect the ground-truth product list per user from the positive labels.
    df_positive = df[df.label == 1]
    df_positive = df_positive.groupby(UID)[PID].apply(
        lambda pids: list(pids)).reset_index().rename(
        columns={PID: 'true_products'})
    df_users = pd.DataFrame({UID: df[UID].unique()})
    df_users = df_users.merge(df_positive, how='left', on=[UID])
    # Users without any positive label get the explicit "no products" marker.
    # (.at replaces the deprecated .ix cell assignment.)
    for index in df_users[df_users['true_products'].isnull()].index:
        df_users.at[index, 'true_products'] = [None]
    print('step 1 elapsed: {}'.format(time.time() - begin_time))
    # Per-user prediction that maximizes the expected F1 score.
    # df_pred = df.groupby(UID).apply(max_f1_predict.get_best_prediction_group).reset_index()
    df_pred = df.groupby(UID).apply(
        max_f1_predict.get_best_prediction_group_submit).reset_index()
    df_users = df_users.merge(df_pred, how='left', on=[UID])
    df_users['scores'] = df_users.apply(apply_f_score, axis=1)
    if save_analysis:
        df_users.to_csv(PREDICT_DIR + '_'.join(
            ['analysis', utils.get_now_str(), predict_file]), index=False)
    print('mean_f_score: {}'.format(df_users['scores'].mean()))
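# Hypothetical sketch (the real apply_f_score is defined elsewhere in this
# repo) of the per-user score used above: F1 between the true and predicted
# product lists, assuming the prediction column produced by
# get_best_prediction_group_submit is named 'products'.
def apply_f_score_sketch(row):
    true_set = set(row['true_products'])   # [None] stands for "no reorder"
    pred_set = set(row['products'])
    hits = len(true_set & pred_set)
    if hits == 0:
        return 0.0
    precision = float(hits) / len(pred_set)
    recall = float(hits) / len(true_set)
    return 2.0 * precision * recall / (precision + recall)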
def _save_stacker_train_set(S_train):
    S_train_name = 'stacker_train' + '_' + utils.get_now_str() + '.csv'
    S_train.to_csv(STACKING_DIR + S_train_name, index=False)
    print('save stacker train set:', S_train_name)
def run_convert_to_submit_max_f1():
    # convert_to_submit_max_f1(PREDICT_DIR + 'test_diff_stack', 'submit_max_f1' + utils.get_now_str() + '.csv')
    convert_to_submit_max_f1(
        PREDICT_DIR + 'validate_core_lgbm_08-13-11-00-07.pkl',
        'submit_max_f1_' + utils.get_now_str() + '.csv')