コード例 #1
0
def pred_func(sep_month_pred, do_year_pred):
    """Two-step prediction: a raw LGB blend plus a month-based error model.

    Parameters
    ----------
    sep_month_pred : bool
        If True, run separate per-month step-2 predictions (not implemented
        in this source); otherwise train one step-2 model on Q4 sale months.
    do_year_pred : bool
        If True, iterate per-month error collection (body missing in the
        original source -- see TODO below).

    NOTE(review): the original block ended the ``for mon_set`` loop with no
    body (a SyntaxError) and computed ``pred_2016`` / ``pred_2017`` without
    returning or storing them -- this function looks truncated; confirm
    against the full project source.
    """
    # step 1: clean outliers from the raw training data
    x_raw, y = outlier_handle(train_x, train_y)

    # step 1: raw prediction with a blend of LGB models
    x = z.lgb_data_prep(x_raw, step1_new_features, step1_rm_features)
    gbms_step1 = []
    for param in p.raw_lgb_2y:
        gbms_step1.append(z.train_lgb(x, y, z.get_params(param, 'reg')))
    pred1_train = z.pred_lgb_blend(x, gbms_step1)
    # residual error of step 1; the step-2 model is trained on this
    error1 = y - pred1_train

    # year prediction
    if do_year_pred:
        # first collect all step2 errors for train month
        for mon_set in ({'01'}, {'02'}, {'03'}, {'04'}, {'05'}, {'06'}, {'07'}, {'08'}, {'09'}):
            # TODO(review): loop body was empty in the original source
            # (a SyntaxError); per-month step-2 error collection is missing.
            pass

    if sep_month_pred:
        pass  # separate per-month prediction not implemented in this source
    else:
        # one step-2 error model trained on Q4 sale months
        pred_2016_step2, pred_2017_step2 = pred_step2(x_raw, error1, {'10', '11', '12'}, p.lgb_month)

        # predict for 2016: step-1 blend + step-2 error correction
        prop_2016_step1 = z.lgb_data_prep(z.prop_2016, step1_new_features, step1_rm_features)
        pred_2016_step1 = z.pred_lgb_blend(prop_2016_step1, gbms_step1)
        pred_2016 = pred_2016_step1 + pred_2016_step2

        # predict for 2017
        prop_2017_step1 = z.lgb_data_prep(z.prop_2017, step1_new_features, step1_rm_features)
        pred_2017_step1 = z.pred_lgb_blend(prop_2017_step1, gbms_step1)
        pred_2017 = pred_2017_step1 + pred_2017_step2
コード例 #2
0
ファイル: cmd_runner.py プロジェクト: mqkh515/udacityMLND
def train_prop():
    """Load pre-trained step-1 LGB models and dump blended step-1
    predictions for the 2016 and 2017 property sets to pickle files.

    Fix: the original passed ``open(...)`` handles directly to
    ``pkl.load`` / ``pkl.dump`` and never closed them; the write-mode
    leaks risk truncated pickles. Context managers close them reliably.
    """
    with open('final_pred/gbms_step1.pkl', 'rb') as f:
        gbms_step1 = pkl.load(f)

    # 2016 predictions
    prop_2016_step1 = z.lgb_data_prep(z.prop_2016, p.class3_new_features,
                                      p.class3_rm_features)
    pred_2016_step1 = z.pred_lgb_blend(prop_2016_step1, gbms_step1)
    with open('final_pred/pred_step1_2016.pkl', 'wb') as f:
        pkl.dump(pred_2016_step1, f)

    # 2017 predictions
    prop_2017_step1 = z.lgb_data_prep(z.prop_2017, p.class3_new_features,
                                      p.class3_rm_features)
    pred_2017_step1 = z.pred_lgb_blend(prop_2017_step1, gbms_step1)
    with open('final_pred/pred_step1_2017.pkl', 'wb') as f:
        pkl.dump(pred_2017_step1, f)
コード例 #3
0
def pred_step2(x_raw, error1, mon_set, params):
    """Train step-2 error models on the rows whose sale month falls in
    *mon_set*, then return blended step-2 predictions for the 2016 and
    2017 property sets (in that order)."""
    # boolean mask over x_raw rows for the requested sale months
    in_months = x_raw['sale_month'].apply(lambda m: m in mon_set)
    y_sub = error1[in_months.values]
    x_sub_raw = x_raw.loc[x_raw.index[in_months], :]
    x_sub = z.lgb_data_prep(x_sub_raw, step2_new_features, step2_rm_features)

    # one model per parameter set; predictions are blended below
    gbms = [z.train_lgb(x_sub, y_sub, z.get_params(prm, 'reg'))
            for prm in params]

    yearly_preds = []
    for prop in (z.prop_2016, z.prop_2017):
        prop_prep = z.lgb_data_prep(prop, step2_new_features, step2_rm_features)
        yearly_preds.append(z.pred_lgb_blend(prop_prep, gbms))
    return yearly_preds[0], yearly_preds[1]
コード例 #4
0
ファイル: cmd_runner.py プロジェクト: mqkh515/udacityMLND
def param_search_raw_lgb_final():
    """Random hyper-parameter search for the final raw LGB model on
    outlier-cleaned training data (100 iterations)."""
    features_raw, target = z.rm_outlier(z.train_x, z.train_y)
    features = z.lgb_data_prep(features_raw, p.class3_new_features,
                               p.class3_rm_features)

    # base parameters overlaid with the regression-specific ones
    search_params = z.params_base.copy()
    search_params.update(z.params_reg)

    n_iter = 100
    z.search_lgb_random(features, target, search_params,
                        'lgb_raw_final', n_iter)
コード例 #5
0
ファイル: cmd_runner.py プロジェクト: mqkh515/udacityMLND
def param_search_batch():
    """Random hyper-parameter search for the raw LGB model on the 2-year
    training data (100 iterations)."""
    feature_names = z.load_feature_list('2y_raw_lgb')
    data_x, data_y = z.load_train_data(z.prop_2016, z.prop_2017,
                                       feature_names)
    data_x_lgb = z.lgb_data_prep(data_x, feature_names)

    # regression parameters layered over the shared base set
    search_params = z.params_base.copy()
    search_params.update(z.params_reg)

    n_iter = 100
    z.search_lgb_random(data_x_lgb, data_y, search_params, 'raw_lgb', n_iter)
コード例 #6
0
ファイル: cmd_runner.py プロジェクト: mqkh515/udacityMLND
def param_search_3step():
    """Random hyper-parameter search for the step-2 error model (2017).

    Searches parameters for a model predicting the residual error that
    remains after the month-level model, using only the three 2-year
    tax-value difference features.

    Fix: the original passed ``open(...)`` straight to ``pkl.load`` and
    never closed the handle; a context manager closes it deterministically.
    """
    # residual errors left after the month model on the 2017 training set
    with open('error_after_month_train_2017.pkl', 'rb') as f:
        error_series = pkl.load(f)
    x = z.lgb_data_prep(z.train_2017_x,
                        keep_only_feature=[
                            '2y_diff_dollar_taxvalue_total',
                            '2y_diff_dollar_taxvalue_land',
                            '2y_diff_dollar_taxvalue_structure'
                        ])

    n_iter = 50
    params_reg = z.params_base.copy()
    params_reg.update(z.params_reg)

    z.search_lgb_random(x, error_series, params_reg, 'lgb_step2_error_2017',
                        n_iter)
コード例 #7
0
ファイル: cmd_runner.py プロジェクト: mqkh515/udacityMLND
def param_search_batch_with_outlier():
    """Random hyper-parameter search for the raw LGB model (150 iterations),
    with outlier removal applied inside the search."""
    feature_names = z.load_feature_list('2y_raw_lgb')
    data_x, data_y = z.load_train_data(z.prop_2016, z.prop_2017,
                                       feature_names)
    data_x_lgb = z.lgb_data_prep(data_x, feature_names)

    # regression parameters layered over the shared base set
    search_params = z.params_base.copy()
    search_params.update(z.params_reg)

    n_iter = 150
    z.search_lgb_random(data_x_lgb, data_y, search_params, 'lgb_fe3',
                        n_iter, with_rm_outlier=True)
コード例 #8
0
def class_3_var_rank():
    """Rank class-3 group-based features by LGB importance.

    For each class-3 categorical variable, builds group-mean features (and
    one group-count feature) over a fixed set of dollar-value columns,
    trains one LGB model, and writes the importance ranks of just those
    new features to ``class3_research/<var>.csv``.
    """

    # numeric columns used for create_group_mean / group_count variables
    num_vars = ('dollar_tax', 'dollar_taxvalue_structure',
                'dollar_taxvalue_land', 'dollar_taxvalue_total',
                'dollar_taxvalue_structure_land_diff_norm',
                'dollar_taxvalue_structure_land_absdiff_norm',
                'dollar_taxvalue_structure_total_ratio',
                'dollar_taxvalue_total_dollar_tax_ratio')

    class3_vars = ('block', 'census', 'code_county_landuse',
                   'str_zoning_desc', 'raw_block', 'raw_census',
                   'code_city', 'code_neighborhood', 'code_zip')

    for var_class3 in class3_vars:
        # one __count feature (attached to the first numeric column only),
        # one __mean per numeric column, plus the lgb-encoded categorical
        new_features = [num_vars[0] + '__groupby__' + var_class3 + '__count']
        new_features.extend(nv + '__groupby__' + var_class3 + '__mean'
                            for nv in num_vars)
        new_features.append(var_class3 + '_lgb')

        train_x, train_y = z.load_train_data(z.prop_2016, z.prop_2017,
                                             new_features)
        x_raw, y = z.capfloor_outlier(train_x, train_y)
        x = z.lgb_data_prep(x_raw, new_features)
        gbm = z.train_lgb(x, y, z.get_params(p.raw_lgb_2y_1, 'reg'))
        feature_list, feature_rank = z.feature_importance(gbm, None, False)

        # keep only the rows for the features built above
        keep = np.array([f in new_features for f in feature_list])
        pd.DataFrame({
            'feature': feature_list[keep],
            'rank': feature_rank[keep]
        }).to_csv('class3_research/%s.csv' % var_class3, index=False)
コード例 #9
0
ファイル: cmd_runner.py プロジェクト: mqkh515/udacityMLND
def param_search_batch_one_mon(keep_size=('all', )):
    """Random hyper-parameter search for the step-2 per-month error model.

    Draws ``n_iter`` random (num_leaves, min_data_in_leaf, learning_rate)
    combinations; for each draw runs lightgbm CV on the step-1 residual
    errors of every (sale-month, year) slice of the training data, then
    appends the slice-averaged CV results as one CSV line to
    ``temp_cv_res_random_month_<size>.txt``.

    keep_size: subset sizes to evaluate per slice; the string 'all' uses
        every row of a slice, an int s uses a random sample of s rows.
    """
    # outlier-cleaned training data plus the two feature matrices:
    # step-1 (raw model) features and step-2 (error model) features
    x_raw, y = z.rm_outlier(z.train_x, z.train_y)
    x_step1 = z.lgb_data_prep(x_raw, p.class3_new_features,
                              p.class3_rm_features)
    x_step2 = z.lgb_data_prep(x_raw,
                              keep_only_feature=p.step2_keep_only_feature)

    n_iter = 10
    params = z.params_base.copy()
    params.update(z.params_reg)

    # lgb.cv picks the round count via early stopping below, so a fixed
    # num_boosting_rounds must not be passed through
    if 'num_boosting_rounds' in params:
        params.pop('num_boosting_rounds')

    # params['metric'] is a collection; report on its first entry
    metric = list(params['metric'])[0]
    min_data_in_leaf_range = (30, 100)
    num_leaf_range = (5, 20)

    def rand_min_data_in_leaf():
        # uniform integer draw over min_data_in_leaf_range
        return np.random.randint(min_data_in_leaf_range[0],
                                 min_data_in_leaf_range[1])

    def rand_learning_rate():
        # exponent for 0.1 ** u below, i.e. learning rate in [1e-3, 1e-2]
        return np.random.uniform(2, 3)

    def rand_num_leaf():
        return np.random.randint(num_leaf_range[0], num_leaf_range[1])

    def rand_lambda_l2():
        # currently unused -- its application below is commented out
        return np.random.uniform(1, 4)

    def write_to_file(line, label):
        # append one CSV line to the result file for subset size `label`
        f = open('temp_cv_res_random_month_%s.txt' % label, 'a')
        f.write(line + '\n')
        f.close()

    # one header line per output file
    headers = ','.join([
        '%s-mean' % metric,
        '%s-stdv' % metric, 'n_rounds-mean', 'n_rounds_stdv', 'num_leaves',
        'min_data_in_leaf', 'learning_rate'
    ])
    for s in keep_size:
        write_to_file(headers, str(s))

    # gbms = []
    # for params_i in p.raw_lgb_2y:
    #     gbms.append(z.train_lgb(x_step1, y, z.get_params(params_i, 'reg')))
    #
    # error_step1 = y - z.pred_lgb_blend(x_step1, gbms)
    # step-1 predictions are loaded from disk instead of re-training the
    # blend (see the commented-out block above); the CV target is the
    # residual y - step1 prediction
    pred_step1_train = pkl.load(open('final_pred/pred_step1_train.pkl', 'rb'))
    error_step1 = y - pred_step1_train

    for i in range(1, n_iter + 1):
        # one fresh random parameter draw per outer iteration
        rand_params = {
            'num_leaves': rand_num_leaf(),
            'min_data_in_leaf': rand_min_data_in_leaf(),
            'learning_rate': 0.1**rand_learning_rate(),
            # 'lambda_l2': 0.1 ** rand_lambda_l2()
        }
        params.update(rand_params)
        for s in keep_size:
            cv_hist = []
            n_rounds = []
            # CV each (month, year) slice separately; months 09-12 are not
            # iterated here -- presumably reserved / handled elsewhere,
            # TODO confirm against the rest of the project
            for mon_set in ({'01'}, {'02'}, {'03'}, {'04'}, {'05'}, {'06'},
                            {'07'}, {'08'}):
                for year in (2016, 2017):
                    # index labels of rows sold in mon_set during `year`
                    use_idx = np.array(x_raw.index[np.logical_and(
                        x_raw['sale_month'].apply(lambda x: x in mon_set),
                        x_raw['data_year'] == year)])
                    if s == 'all':
                        pass
                    else:
                        # random subsample of s rows from the slice
                        np.random.shuffle(use_idx)
                        use_idx = use_idx[:s]
                    # print('train_size: %d' % int(np.sum(use_idx)))
                    # step-2 CV target: step-1 residuals on the slice
                    pred_error = error_step1[use_idx]

                    train_x_step2_local = x_step2.loc[use_idx, :]
                    lgb_train = lgb.Dataset(train_x_step2_local, pred_error)
                    # NOTE(review): early_stopping_rounds as a cv() keyword
                    # is version-dependent in lightgbm (newer versions use
                    # callbacks) -- confirm the installed version supports it
                    eval_hist = lgb.cv(params,
                                       lgb_train,
                                       stratified=False,
                                       num_boost_round=5000,
                                       early_stopping_rounds=100)
                    # record the final mean/stdv of the metric and how many
                    # boosting rounds early stopping settled on
                    cv_hist.append([
                        eval_hist['%s-mean' % metric][-1],
                        eval_hist['%s-stdv' % metric][-1]
                    ])
                    n_rounds.append(len(eval_hist['%s-mean' % metric]))

            # aggregate over all month/year slices and append one result line
            m_mean, m_stdv = np.array(cv_hist).mean(axis=0)
            n_rounds_mean = np.mean(np.array(n_rounds))
            n_rounds_stdv = np.std(np.array(n_rounds))
            line = '%.7f,%.7f,%.0f,%.0f,%.0f,%.0f,%.6f' % (
                m_mean, m_stdv, n_rounds_mean, n_rounds_stdv,
                rand_params['num_leaves'], rand_params['min_data_in_leaf'],
                rand_params['learning_rate'])
            write_to_file(line, str(s))
        print('finished %d / %d' % (i, n_iter))