import csv
import gc
import os
import pickle
import time

import lightgbm as lgb
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from hyperopt import STATUS_OK, Trials, fmin, hp, tpe
from sklearn.metrics import f1_score
from sklearn.model_selection import KFold, StratifiedKFold

# Project-local helpers referenced below (timer, log, eda, normalize_process,
# base_data_process, base_util, label2index, load_label2index, config_dict,
# decode_list, ID, LABEL, ITERATION) are assumed to come from the repo's own
# utility modules.
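# one_hot2label_index() and index2label() are used throughout this file but
# defined elsewhere in the repo; the definitions below are minimal sketches of
# their assumed behavior, inferred from the call sites in this module.


def one_hot2label_index(probs):
    # collapse an (n_samples, n_classes) probability matrix to class indexes
    return np.argmax(probs, axis=1)


def index2label(indexes):
    # decode_list maps an encoded label index back to its original label
    # string; it is populated by load_label2index() / label2index()
    return [decode_list[i] for i in indexes]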
def cross_validation(train, params, ID_COLUMN_NAME, LABEL_COLUMN_NAME,
                     N_FOLD=5):
    '''
    :return: mean macro-F1 over the validation folds
    '''
    NUM_BOOST_ROUND = 1000
    EARLY_STOPPING_ROUNDS = 50
    # Cross validation model
    folds = StratifiedKFold(n_splits=N_FOLD, shuffle=True, random_state=1001)
    feats = [
        f for f in train.columns
        if f not in [LABEL_COLUMN_NAME, ID_COLUMN_NAME]
    ]
    f1_scores = []
    for i_fold, (train_idx, valid_idx) in enumerate(
            folds.split(train[feats], train[LABEL_COLUMN_NAME])):
        dtrain = lgb.Dataset(data=train[feats].iloc[train_idx],
                             label=train[LABEL_COLUMN_NAME].iloc[train_idx],
                             free_raw_data=False,
                             silent=True)
        dvalid = lgb.Dataset(data=train[feats].iloc[valid_idx],
                             label=train[LABEL_COLUMN_NAME].iloc[valid_idx],
                             free_raw_data=False,
                             silent=True)
        with timer('cross validation-fold {} train model'.format(i_fold)):
            log.info('params is {}'.format(params))
            clf = lgb.train(num_boost_round=NUM_BOOST_ROUND,
                            params=params,
                            verbose_eval=10,
                            train_set=dtrain,
                            valid_sets=[dvalid],
                            early_stopping_rounds=EARLY_STOPPING_ROUNDS)
        with timer('cross validation-fold {} predict'.format(i_fold)):
            v_data = clf.predict(dvalid.data)
            # row-wise argmax replaces the original hand-rolled loop, which
            # hard-coded 15 classes and shadowed the built-in max()
            y_pre = np.argmax(v_data, axis=1)
        f1_scores.append(f1_score(dvalid.label, y_pre, average='macro'))
    # average across all folds instead of returning a single fold's score
    return np.mean(f1_scores)
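# Illustrative call, assuming a train dataframe produced by data_prepare();
# the parameter values here are placeholders, not tuned:
#
#   params = {'objective': 'multiclass', 'num_class': 15,
#             'learning_rate': 0.1, 'num_leaves': 80, 'verbose': -1}
#   macro_f1 = cross_validation(df_train, params,
#                               'user_id', 'current_service')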
def write2file(col_id, pre_label, name=None):
    with timer('write result {}'.format(name)):
        y_pre = one_hot2label_index(pre_label)
        df = pd.DataFrame()
        df[ID] = col_id
        df['predict'] = index2label(y_pre)
        df.to_csv('result{}.csv'.format(name), index=False)
def data_prepare(df_train, df_test):
    conti_list = [
        '1_total_fee', '2_total_fee', '3_total_fee', '4_total_fee',
        'contract_time', 'former_complaint_fee', 'former_complaint_num',
        'last_month_traffic', 'local_caller_time', 'local_trafffic_month',
        'month_traffic', 'online_time', 'pay_num', 'pay_times',
        'service1_caller_time', 'service2_caller_time', 'pay_num_per_time',
        'll'
    ]
    normalize_process(df_train, df_test, conti_list)
    # encode the string labels to indexes and persist the mapping so
    # predictions can be decoded back later
    base_data_process.label2index(df_train, LABEL)
    base_util.pickle_dump(
        (base_data_process.encode_map, base_data_process.decode_list),
        '../../origin_data/label2index.pkl')
    log.info('current path: {}'.format(os.getcwd()))
    with timer('save train data'):
        df_train.to_csv('../../origin_data/train_modified.csv', index=False)
    with timer('save test data'):
        df_test.to_csv('../../origin_data/test_modified.csv', index=False)
def optimization():
    space = {
        'learning_rate': 0.1,
        'boosting_type': hp.choice('boosting_type', ['gbdt']),
        'num_leaves': hp.choice('num_leaves',
                                [15, 20, 30, 50, 65, 80, 100, 150, 400]),
        'bin_construct_sample_cnt': hp.choice(
            'bin_construct_sample_cnt',
            [10000, 20000, 60000, 100000, 200000]),
        'min_data_in_leaf': hp.quniform('min_data_in_leaf', 20, 500, 10),
        'reg_alpha': hp.choice('reg_alpha', [0, 0.001, 0.01, 0.1, 0.2]),
        'reg_lambda': hp.choice('reg_lambda', [0, 0.001, 0.01, 0.1, 0.2]),
        'feature_fraction': hp.uniform('feature_fraction', 0.8, 1.0),
        'bagging_fraction': hp.uniform('bagging_fraction', 0.8, 1.0),
        'bagging_freq': hp.choice('bagging_freq', [0, 2, 6, 10, 16]),
        'is_unbalance': hp.choice('is_unbalance', [True, False]),
        'num_threads': 40,
        'objective': 'multiclass',
        'num_class': 15,
        'verbose': -1
    }
    trials = Trials()
    with timer('optimization'):
        # Run TPE optimization over the space above
        best = fmin(fn=objective,
                    space=space,
                    algo=tpe.suggest,
                    trials=trials,
                    max_evals=config_dict['max_evals'])
    print('-' * 100)
    log.warn(best)
    # persist the full search history for later inspection
    with open('model_trials.pkl', mode='wb') as mt:
        pickle.dump(trials, mt)
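# Reloading the persisted search afterwards; a short sketch using hyperopt's
# Trials.results, which holds the dicts returned by objective():
#
#   with open('model_trials.pkl', 'rb') as mt:
#       trials = pickle.load(mt)
#   best_trial = min(trials.results, key=lambda r: r['loss'])
#   log.info('best loss {} with {}'.format(
#       best_trial['loss'], best_trial['hyperparameters']))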
def write_result(file_name, ids, labels, label_type='label_index'):
    # TODO: finish the result-writing logic
    '''
    :param file_name: output csv path
    :param ids: id column values
    :param labels: predictions, as one-hot probabilities or label indexes
    :param label_type: 'one_hot', 'label_index' or raw labels
    :return: None
    '''
    load_label2index()
    df_test = pd.DataFrame()
    df_test[ID] = ids
    if label_type == 'one_hot':
        labels = one_hot2label_index(labels)
    if label_type in ['label_index', 'one_hot']:
        labels = [decode_list[label] for label in labels]
    df_test[LABEL] = labels
    df_test.columns = [ID, 'predict']
    print('====shape df_test====', df_test.shape)
    with timer('write result to {}'.format(file_name)):
        df_test.to_csv(file_name, index=False)
def objective(hyperparameters):
    # Keep track of evals
    global ITERATION
    ITERATION += 1

    # Make sure parameters that need to be integers are integers
    for parameter_name in [
            'num_leaves', 'bin_construct_sample_cnt', 'bagging_freq',
            'min_data_in_leaf'
    ]:
        hyperparameters[parameter_name] = int(hyperparameters[parameter_name])

    with timer('run lgb') as ti:
        # Perform n_folds cross validation; the score tracked elsewhere in
        # this file is f1 ** 2, so minimize 1 - f1 ** 2
        f1 = cross_validation(config_dict['train'], hyperparameters,
                              'user_id', 'current_service')
        loss = 1 - f1 ** 2
        run_time = ti.get_delay_t0()

    # Append this trial to the csv log ('a' means append)
    with open('hyperparameters.csv', 'a') as of_connection:
        writer = csv.writer(of_connection)
        writer.writerow([loss, hyperparameters, ITERATION, run_time,
                         1 - loss])

    log.info('iteration-{} f1:{} loss:{} train_time:{}'.format(
        ITERATION, f1, loss, run_time))

    # Dictionary with information for evaluation
    return {
        'loss': loss,
        'hyperparameters': hyperparameters,
        'iteration': ITERATION,
        'train_time': run_time,
        'status': STATUS_OK
    }
def model(train, test, num_folds=5, stratified=True, num_boost_round=1000,
          save_path='origin_data_save'):
    LABEL_SIZE = train[LABEL].value_counts().count()
    print('Starting LightGBM. Train shape: {}, test shape: {}'.format(
        train.shape, test.shape))
    gc.collect()
    # Cross validation model
    if stratified:
        folds = StratifiedKFold(n_splits=num_folds, shuffle=True,
                                random_state=1001)
    else:
        folds = KFold(n_splits=num_folds, shuffle=True, random_state=1001)
    # Create arrays and dataframes to store results
    sub_preds = np.zeros(shape=(test.shape[0], LABEL_SIZE))
    feature_importance_df = pd.DataFrame()
    feats = [f for f in train.columns if f not in [LABEL, ID]]
    for i_fold, (train_idx, valid_idx) in enumerate(
            folds.split(train[feats], train[LABEL])):
        dtrain = lgb.Dataset(data=train[feats].iloc[train_idx],
                             label=train[LABEL].iloc[train_idx],
                             free_raw_data=False,
                             silent=True)
        dvalid = lgb.Dataset(data=train[feats].iloc[valid_idx],
                             label=train[LABEL].iloc[valid_idx],
                             free_raw_data=False,
                             silent=True)
        params = {
            'bagging_fraction': 0.94795171020152,
            'bagging_freq': 6,
            'bin_construct_sample_cnt': 200000,
            'boosting_type': 'gbdt',
            'feature_fraction': 0.9953235660931046,
            'is_unbalance': False,
            'learning_rate': 0.005,
            'min_data_in_leaf': 30,
            # derive the class count from the data instead of the original
            # hard-coded 11
            'num_class': LABEL_SIZE,
            'num_leaves': 80,
            'num_threads': 40,
            'objective': 'multiclass',
            'reg_alpha': 0.001,
            'reg_lambda': 0.1,
            'verbose': -1
        }
        with timer('fold {} train model'.format(i_fold)):
            clf = lgb.train(num_boost_round=num_boost_round,
                            params=params,
                            train_set=dtrain,
                            valid_sets=[dvalid],
                            early_stopping_rounds=50)
            clf.save_model((save_path + '/model{}_{}.txt').format(
                i_fold, int(time.time())))
        with timer('fold {} predict'.format(i_fold)):
            v_data = clf.predict(dvalid.data)
            y_pre = one_hot2label_index(v_data)
            # accumulate test-set probabilities over folds; the argmax taken
            # in write2file is unaffected by the missing 1/num_folds scaling
            sub_preds += clf.predict(test[feats])
            write2file(test[ID], sub_preds, i_fold)
        fold_importance_df = pd.DataFrame()
        fold_importance_df['feature'] = feats
        fold_importance_df['importance'] = clf.feature_importance(
            importance_type='gain')
        fold_importance_df['fold'] = i_fold + 1
        feature_importance_df = pd.concat(
            [feature_importance_df, fold_importance_df], axis=0)
        f1 = f1_score(dvalid.label, y_pre, average='macro')
        log.warn('Fold {} f1 : {} score {}'.format(i_fold + 1, f1, f1 ** 2))
        del clf, dtrain, dvalid
        gc.collect()
    display_importances(feature_importance_df)
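# Scoring later with one of the per-fold boosters saved by model(); the file
# name below is illustrative:
#
#   booster = lgb.Booster(model_file='origin_data_save/model0_1.txt')
#   probs = booster.predict(df_test[feats])
#   write_result('result_restored.csv', df_test[ID], probs,
#                label_type='one_hot')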
# Display/plot feature importance
def display_importances(feature_importance_df_):
    cols = feature_importance_df_[[
        'feature', 'importance'
    ]].groupby('feature').mean().sort_values(by='importance',
                                             ascending=False)[:40].index
    best_features = feature_importance_df_.loc[
        feature_importance_df_.feature.isin(cols)]
    plt.figure(figsize=(8, 10))
    sns.barplot(x='importance',
                y='feature',
                data=best_features.sort_values(by='importance',
                                               ascending=False))
    plt.title('LightGBM Features (avg over folds)')
    plt.tight_layout()
    plt.savefig('lgbm_importances01.png')


if __name__ == '__main__':
    if not os.path.exists('origin_data_save'):
        os.mkdir('origin_data_save')
    with timer('data process'):
        df_train, df_test = eda()
        label2index(df_train, LABEL)
    with timer('model process'):
        model(df_train, df_test, num_folds=5, num_boost_round=10000)