def pred_func(sep_month_pred, do_year_pred): """2 step predict: raw + month""" # first raw prediction x_raw, y = outlier_handle(train_x, train_y) # raw prediction x = z.lgb_data_prep(x_raw, step1_new_features, step1_rm_features) gbms_step1 = [] for param in p.raw_lgb_2y: gbms_step1.append(z.train_lgb(x, y, z.get_params(param, 'reg'))) pred1_train = z.pred_lgb_blend(x, gbms_step1) error1 = y - pred1_train # year prediction if do_year_pred: # first collect all step2 errors for train month for mon_set in ({'01'}, {'02'}, {'03'}, {'04'}, {'05'}, {'06'}, {'07'}, {'08'}, {'09'}): if sep_month_pred: pass else: pred_2016_step2, pred_2017_step2 = pred_step2(x_raw, error1, {'10', '11', '12'}, p.lgb_month) # predict for 2016 prop_2016_step1 = z.lgb_data_prep(z.prop_2016, step1_new_features, step1_rm_features) pred_2016_step1 = z.pred_lgb_blend(prop_2016_step1, gbms_step1) pred_2016 = pred_2016_step1 + pred_2016_step2 # predict for 2017 prop_2017_step1 = z.lgb_data_prep(z.prop_2017, step1_new_features, step1_rm_features) pred_2017_step1 = z.pred_lgb_blend(prop_2017_step1, gbms_step1) pred_2017 = pred_2017_step1 + pred_2017_step2
def train_prop():
    """Generate and persist step-1 (raw model) predictions for the 2016 and
    2017 property tables.

    Loads the pre-trained step-1 LightGBM ensemble from
    final_pred/gbms_step1.pkl, blends its predictions over each year's
    property data, and pickles the two prediction arrays under final_pred/.
    """
    # fix: use context managers so file handles are always closed
    # (the original left three handles open via bare open() calls)
    with open('final_pred/gbms_step1.pkl', 'rb') as f:
        gbms_step1 = pkl.load(f)

    prop_2016_step1 = z.lgb_data_prep(z.prop_2016, p.class3_new_features,
                                      p.class3_rm_features)
    pred_2016_step1 = z.pred_lgb_blend(prop_2016_step1, gbms_step1)
    with open('final_pred/pred_step1_2016.pkl', 'wb') as f:
        pkl.dump(pred_2016_step1, f)

    prop_2017_step1 = z.lgb_data_prep(z.prop_2017, p.class3_new_features,
                                      p.class3_rm_features)
    pred_2017_step1 = z.pred_lgb_blend(prop_2017_step1, gbms_step1)
    with open('final_pred/pred_step1_2017.pkl', 'wb') as f:
        pkl.dump(pred_2017_step1, f)
def pred_step2(x_raw, error1, mon_set, params):
    """Train step-2 (residual) models on rows whose sale month falls in
    *mon_set*, then return blended step-2 predictions for the 2016 and 2017
    property tables as a (pred_2016, pred_2017) pair.
    """
    # select the training rows belonging to the requested month set
    in_months = x_raw['sale_month'].apply(lambda m: m in mon_set)
    y_step2 = error1[in_months.values]
    x_raw_step2 = x_raw.loc[x_raw.index[in_months], :]
    x_step2 = z.lgb_data_prep(x_raw_step2, step2_new_features,
                              step2_rm_features)

    # one model per parameter set; blended together at prediction time
    gbms_step2 = [z.train_lgb(x_step2, y_step2, z.get_params(param, 'reg'))
                  for param in params]

    # blended step-2 predictions for each year's property table
    year_preds = []
    for prop in (z.prop_2016, z.prop_2017):
        prop_step2 = z.lgb_data_prep(prop, step2_new_features,
                                     step2_rm_features)
        year_preds.append(z.pred_lgb_blend(prop_step2, gbms_step2))
    return year_preds[0], year_preds[1]
def param_search_raw_lgb_final():
    """Random hyper-parameter search for the final raw LightGBM model,
    logged under the tag 'lgb_raw_final'."""
    x_raw, y = z.rm_outlier(z.train_x, z.train_y)
    x = z.lgb_data_prep(x_raw, p.class3_new_features, p.class3_rm_features)

    # regression search space = base params overlaid with regression params
    search_params = z.params_base.copy()
    search_params.update(z.params_reg)

    # raw lgb
    n_iter = 100
    z.search_lgb_random(x, y, search_params, 'lgb_raw_final', n_iter)
def param_search_batch():
    """Random regression-parameter search on the combined 2016+2017 training
    data using the '2y_raw_lgb' feature list; results logged as 'raw_lgb'."""
    feature_list = z.load_feature_list('2y_raw_lgb')
    train_x, train_y = z.load_train_data(z.prop_2016, z.prop_2017,
                                         feature_list)
    train_x_lgb = z.lgb_data_prep(train_x, feature_list)

    # regression search space = base params overlaid with regression params
    reg_params = z.params_base.copy()
    reg_params.update(z.params_reg)

    # raw lgb
    n_iter = 100
    z.search_lgb_random(train_x_lgb, train_y, reg_params, 'raw_lgb', n_iter)
def param_search_3step():
    """Random search for the step-2 error model on the 2017 residual series.

    Uses only the three 2-year tax-value diff features against the residuals
    left after the month-level training pass; results are logged under
    'lgb_step2_error_2017'.
    """
    # fix: context manager closes the pickle file handle
    # (original used pkl.load(open(...)) and leaked the handle)
    with open('error_after_month_train_2017.pkl', 'rb') as f:
        error_series = pkl.load(f)
    x = z.lgb_data_prep(
        z.train_2017_x,
        keep_only_feature=[
            '2y_diff_dollar_taxvalue_total',
            '2y_diff_dollar_taxvalue_land',
            '2y_diff_dollar_taxvalue_structure',
        ])
    n_iter = 50
    params_reg = z.params_base.copy()
    params_reg.update(z.params_reg)
    z.search_lgb_random(x, error_series, params_reg, 'lgb_step2_error_2017',
                        n_iter)
def param_search_batch_with_outlier():
    """Random regression-parameter search like param_search_batch, but with
    outlier removal handled inside the search (with_rm_outlier=True);
    results logged as 'lgb_fe3'."""
    feats = z.load_feature_list('2y_raw_lgb')
    train_x, train_y = z.load_train_data(z.prop_2016, z.prop_2017, feats)
    x_lgb = z.lgb_data_prep(train_x, feats)

    # regression search space = base params overlaid with regression params
    search_params = z.params_base.copy()
    search_params.update(z.params_reg)

    # raw lgb
    n_iter = 150
    z.search_lgb_random(x_lgb, train_y, search_params, 'lgb_fe3', n_iter,
                        with_rm_outlier=True)
def class_3_var_rank():
    """Rank group-derived features of each class-3 categorical variable by
    LightGBM feature importance.

    For every class-3 variable, builds one group-count feature, a group-mean
    feature per numeric variable, and the label-encoded variable itself,
    trains a single model on capped/floored targets, and writes the
    importance ranks of just those new features to
    class3_research/<var>.csv.
    """
    # create_group_mean, group_count variables
    num_vars = ('dollar_tax', 'dollar_taxvalue_structure',
                'dollar_taxvalue_land', 'dollar_taxvalue_total',
                'dollar_taxvalue_structure_land_diff_norm',
                'dollar_taxvalue_structure_land_absdiff_norm',
                'dollar_taxvalue_structure_total_ratio',
                'dollar_taxvalue_total_dollar_tax_ratio')
    for var_class3 in ('block', 'census', 'code_county_landuse',
                       'str_zoning_desc', 'raw_block', 'raw_census',
                       'code_city', 'code_neighborhood', 'code_zip'):
        # one count feature (built once, not via a per-iteration equality
        # test as before), then a mean feature per numeric variable
        new_features = ['%s__groupby__%s__count' % (num_vars[0], var_class3)]
        new_features.extend('%s__groupby__%s__mean' % (num_var, var_class3)
                            for num_var in num_vars)
        new_features.append(var_class3 + '_lgb')

        train_x, train_y = z.load_train_data(z.prop_2016, z.prop_2017,
                                             new_features)
        x_raw, y = z.capfloor_outlier(train_x, train_y)
        x = z.lgb_data_prep(x_raw, new_features)
        gbm = z.train_lgb(x, y, z.get_params(p.raw_lgb_2y_1, 'reg'))

        feature_list, feature_rank = z.feature_importance(gbm, None, False)
        # fix: set gives O(1) membership; redundant `True if ... else False`
        # replaced by the boolean expression itself
        new_feature_set = set(new_features)
        idx = np.array([f in new_feature_set for f in feature_list])
        out_df = pd.DataFrame({'feature': feature_list[idx],
                               'rank': feature_rank[idx]})
        out_df.to_csv('class3_research/%s.csv' % var_class3, index=False)
def param_search_batch_one_mon(keep_size=('all', )):
    """Random hyper-parameter search for the step-2 per-month error model.

    For each random parameter draw, cross-validates a LightGBM regressor on
    the step-1 residuals of every (month, year) training slice — optionally
    down-sampled to each size in *keep_size* — and appends the averaged CV
    metrics to temp_cv_res_random_month_<size>.txt.

    :param keep_size: iterable of slice sizes to subsample to; the string
        'all' keeps each full month/year slice.
    """
    x_raw, y = z.rm_outlier(z.train_x, z.train_y)
    x_step1 = z.lgb_data_prep(x_raw, p.class3_new_features,
                              p.class3_rm_features)
    x_step2 = z.lgb_data_prep(x_raw,
                              keep_only_feature=p.step2_keep_only_feature)
    n_iter = 10
    params = z.params_base.copy()
    params.update(z.params_reg)
    # lgb.cv controls rounds itself via early stopping
    if 'num_boosting_rounds' in params:
        params.pop('num_boosting_rounds')
    metric = list(params['metric'])[0]

    min_data_in_leaf_range = (30, 100)
    num_leaf_range = (5, 20)

    def rand_min_data_in_leaf():
        return np.random.randint(min_data_in_leaf_range[0],
                                 min_data_in_leaf_range[1])

    def rand_learning_rate():
        # exponent for a log-uniform learning rate in [1e-3, 1e-2]
        return np.random.uniform(2, 3)

    def rand_num_leaf():
        return np.random.randint(num_leaf_range[0], num_leaf_range[1])

    def rand_lambda_l2():
        return np.random.uniform(1, 4)

    def write_to_file(line, label):
        # fix: context manager instead of manual open/close
        with open('temp_cv_res_random_month_%s.txt' % label, 'a') as f:
            f.write(line + '\n')

    headers = ','.join(['%s-mean' % metric, '%s-stdv' % metric,
                        'n_rounds-mean', 'n_rounds_stdv', 'num_leaves',
                        'min_data_in_leaf', 'learning_rate'])
    for s in keep_size:
        write_to_file(headers, str(s))

    # step-1 residuals come from a precomputed prediction (not retrained here)
    # fix: close the pickle handle deterministically
    with open('final_pred/pred_step1_train.pkl', 'rb') as f:
        pred_step1_train = pkl.load(f)
    error_step1 = y - pred_step1_train

    for i in range(1, n_iter + 1):
        rand_params = {
            'num_leaves': rand_num_leaf(),
            'min_data_in_leaf': rand_min_data_in_leaf(),
            'learning_rate': 0.1 ** rand_learning_rate(),
            # 'lambda_l2': 0.1 ** rand_lambda_l2()
        }
        params.update(rand_params)
        for s in keep_size:
            cv_hist = []
            n_rounds = []
            for mon_set in ({'01'}, {'02'}, {'03'}, {'04'}, {'05'}, {'06'},
                            {'07'}, {'08'}):
                for year in (2016, 2017):
                    # index labels of rows in this month set and data year
                    use_idx = np.array(x_raw.index[np.logical_and(
                        x_raw['sale_month'].apply(lambda x: x in mon_set),
                        x_raw['data_year'] == year)])
                    if s != 'all':
                        np.random.shuffle(use_idx)
                        use_idx = use_idx[:s]
                    # fix: np.sum(use_idx) summed the index *labels*, not the
                    # row count — report the slice size instead
                    print('train_size: %d' % len(use_idx))
                    pred_error = error_step1[use_idx]
                    train_x_step2_local = x_step2.loc[use_idx, :]
                    lgb_train = lgb.Dataset(train_x_step2_local, pred_error)
                    eval_hist = lgb.cv(params, lgb_train, stratified=False,
                                       num_boost_round=5000,
                                       early_stopping_rounds=100)
                    cv_hist.append([eval_hist['%s-mean' % metric][-1],
                                    eval_hist['%s-stdv' % metric][-1]])
                    n_rounds.append(len(eval_hist['%s-mean' % metric]))
            m_mean, m_stdv = np.array(cv_hist).mean(axis=0)
            n_rounds_mean = np.mean(np.array(n_rounds))
            n_rounds_stdv = np.std(np.array(n_rounds))
            line = '%.7f,%.7f,%.0f,%.0f,%.0f,%.0f,%.6f' % (
                m_mean, m_stdv, n_rounds_mean, n_rounds_stdv,
                rand_params['num_leaves'], rand_params['min_data_in_leaf'],
                rand_params['learning_rate'])
            write_to_file(line, str(s))
        print('finished %d / %d' % (i, n_iter))