def get_feature_importance_df(importance_type='gain'):
    from xgboost_baseline import XGBoostModel

    # read train data.
    X, y = cu.get_train_data(encode_non_object=False)

    # get CV from train data.
    X_train, y_train, X_holdout, y_holdout = cu.get_cv(X, y)

    # train model.
    xgbm = XGBoostModel()
    xgbm.train(X_train, y_train, X_holdout, y_holdout)

    # feature importance: get_score() maps feature name -> importance.
    tmp = xgbm.base_model.get_score(importance_type=importance_type)
    columns, importances = [], []
    for c, i in tmp.items():
        columns.append(c)
        importances.append(i)

    importance_df = pd.DataFrame({
        'column_name': columns,
        'importance': importances
    })
    importance_df = importance_df.sort_values(
        by='importance', ascending=True).reset_index(drop=True)

    return importance_df
def run_grid():
    # read train data.
    X, y = cu.get_train_data(encode_non_object=True, standard_scaler_flag=True)

    X = drop_columns(X)
    feature_cnt = X.columns.shape[0]

    print('Grid Search.')
    parameters = {
        'hidden_layer_sizes':
        [(feature_cnt + 1, ) * n for n in [1, 2, 3, 4, 5, 6]],
        'activation': ['identity', 'logistic', 'tanh', 'relu'],
        'solver': ['lbfgs', 'sgd', 'adam'],
        'learning_rate': ['constant', 'invscaling', 'adaptive'],
        'max_iter': [200, 400, 600],
        'early_stopping': [False, True]
    }
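    # hidden_layer_sizes tries n = 1..6 hidden layers, each of width
    # (feature_cnt + 1).  The grid has 6 * 4 * 3 * 3 * 3 * 2 = 1296
    # parameter combinations; with cv=10 that means 12,960 model fits.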
    grid = GridSearchCV(MLPRegressor(),
                        parameters,
                        cv=10,
                        n_jobs=4,
                        scoring='neg_mean_squared_error')
    grid.fit(X, y)

    print('best_score_', grid.best_score_)
    print('best_params_', grid.best_params_)
def run():
    # read train data.
    X, y = cu.get_train_data(encode_non_object=True)
    # train model.
    lrm = LinearRegressionModel()

    tarlist = [
        c for c in X.columns if c not in
        'fips,hashottuborspa,poolcnt,pooltypeid10,assessmentyear'.split(',')
    ]

    X_trans, propdic = getTransData(X, y, tarlist)
    x_train, y_train, x_holdout, y_holdout = cu.get_cv(X_trans, y)
    lrm.train(x_train, y_train, None, None)
    y_pred = lrm.predict(x_holdout)

    score = abs(y_pred - y_holdout).mean()
    print(score)

    # clip the target to [-0.1, 0.1] to limit the influence of outliers.
    y_trans = [min(max(v, -0.1), 0.1) for v in y]
    lrm.train(X_trans, y_trans, None, None)

    # read test data.
    T = cu.get_test_data(encode_non_object=True)
    T_trans = getTransTest(T, propdic)
    # predict result.
    print('Predicting.')
    y_pred = lrm.predict(T_trans[X_trans.columns].values)

    # write result.
    cu.write_result(y_pred)
    print(max(list(lrm.base_model.coef_)))
    print(min(y_pred))
def run():
    # read train data.
    X, y = cu.get_train_data(encode_non_object=True)

    # read test data.
    T = cu.get_test_data(encode_non_object=True)

    # create base models.
    base_models = [
        LinearRegressionModel(),
        XGBoostModel(),
        LightGBMModel()
    ]

    # setup ensemble parameters.
    ensemble = Ensemble(
        n_folds=10,
        stacker=LinearRegressionModel(),
        base_models=base_models
    )
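    # Presumably standard stacking: each base model produces out-of-fold
    # predictions over the n_folds splits, and the stacker is trained on
    # those predictions.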

    # ensemble result.
    print('Ensembling result.')
    y_pred = ensemble.fit_predict(X, y, T[X.columns])

    # write result.
    cu.write_result(y_pred)
def run_feature_outlier():
    # read train data.
    X, y = cu.get_train_data(encode_non_object=False)

    # transform feature 'yearbuilt'
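    # into an age relative to 2016.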
    X['yearbuilt'] = 2016 - X['yearbuilt']

    result = []
    for feature in ['taxamount', 'yearbuilt']:
        for name, newSeries in generate_feature_replace_outlier(
                X[feature]).items():
            print('Handling outliers in feature [%s] with strategy [%s].' %
                  (feature, name))

            # get CV from train data.
            newX = X.copy()
            newX[feature] = newSeries
            X_train, y_train, X_holdout, y_holdout = cu.get_cv(newX, y)

            # train model.
            xgbm = XGBoostModel()
            xgbm.train(X_train, y_train, X_holdout, y_holdout)

            result.append([feature, name, xgbm.base_model.best_score])

    print('\n'.join(','.join(str(o) for o in one) for one in result))
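# One plausible shape for the generate_feature_replace_outlier helper used
# above -- an illustrative assumption, not the author's actual code: it
# returns named variants of the series with outliers handled differently.
def generate_feature_replace_outlier_sketch(series):
    q1, q3 = series.quantile(0.25), series.quantile(0.75)
    lower = q1 - 1.5 * (q3 - q1)
    upper = q3 + 1.5 * (q3 - q1)
    return {
        'clip_iqr': series.clip(lower, upper),
        'replace_median': series.where(series.between(lower, upper),
                                       series.median()),
    }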
def run():
    # read train data.
    X, y = cu.get_train_data(encode_non_object=False)
    y_mean, y_std = y.mean(), y.std()
    y -= y_mean
    y /= y_std
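    # The target is standardized here; predictions are mapped back to the
    # original scale below by inverting this transform.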

    # get CV from train data.
    X_train, y_train, X_holdout, y_holdout = cu.get_cv(X, y)

    # train model.
    xgbm = XGBoostModel()
    xgbm.train(X_train, y_train, X_holdout, y_holdout)

    # read test data.
    T = cu.get_test_data(encode_non_object=False)

    # predict result.
    print('Predicting.')
    y_pred = xgbm.predict(T[X_train.columns])
    y_pred *= y_std
    y_pred += y_mean

    # write result.
    cu.write_result(y_pred)
def run():
    # read train data.
    X, y = cu.get_train_data(encode_non_object=True)

    # read test data.
    T = cu.get_test_data(encode_non_object=True)

    # create base models.
    base_models = [
        XGBoostModel(),
        LightGBMModel(),
        LinearRegressionModel(),
        RidgeModel(),
        LassoModel(),
        ElasticNetModel(),
        LassoLarsModel(),
        BayesianRidgeModel(),
    ]

    # setup ensemble parameters.
    ensemble = Ensemble(stacker=LinearRegressionModel(),
                        base_models=base_models)

    # ensemble result.
    print('Ensembling result.')
    y_pred = ensemble.fit_predict(X, y, T[X.columns])

    # write result.
    cu.write_result(y_pred)
def run_laglng_cluster():
    # read train data.
    X, y = cu.get_train_data(encode_non_object=False)

    m_distances = [1500, 500, 50]
    min_sampleses = [1, 10, 50]
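    # Grid over the DBSCAN neighborhood radius (presumably meters, given
    # the name m_distance) and the minimum-samples parameter.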

    result = []
    for m_distance in m_distances:
        for min_samples in min_sampleses:
            print('Run DBSCAN m_distance = %d, min_samples = %d.' %
                  (m_distance, min_samples))
            newX = preprocess_raw_latlng(X)
            coordinates = get_coordinates(newX)
            dbscan = cluster_latlng(coordinates,
                                    m_distance=m_distance,
                                    min_samples=min_samples)
            centroid_dict = get_centroid_dict(dbscan, coordinates)
            newX = replace_predict_cluster_df(dbscan, centroid_dict, newX)

            # get CV from train data.
            X_train, y_train, X_holdout, y_holdout = cu.get_cv(newX, y)

            # train model.
            xgbm = XGBoostModel()
            xgbm.train(X_train, y_train, X_holdout, y_holdout)

            result.append(
                [m_distance, min_samples, xgbm.base_model.best_score])

    print('\n'.join(','.join(str(o) for o in one) for one in result))
def run():
    # read train data.
    X, y = cu.get_train_data(encode_non_object=False)

    # MeanEncoder
    print('Use MeanEncoder.')
    mean_encoder = MeanEncoder(
        categorical_features=[
            'regionidcity', 'regionidneighborhood', 'regionidzip'
        ],
        target_type='regression')
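    # Mean encoding replaces each category level with a statistic of the
    # target.  A naive, leak-prone sketch of the idea for one column:
    #     target = pd.Series(y, index=X.index)
    #     X['regionidzip'] = X['regionidzip'].map(
    #         target.groupby(X['regionidzip']).mean())
    # MeanEncoder presumably adds regularization / out-of-fold estimation.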

    X = mean_encoder.fit_transform(X, pd.Series(y))
    X = X.drop(mean_encoder.categorical_features, axis=1)

    # get CV from train data.
    X_train, y_train, X_holdout, y_holdout = cu.get_cv(X, y)

    # train model.
    xgbm = XGBoostModel()
    xgbm.train(X_train, y_train, X_holdout, y_holdout)

    # read test data.
    T = cu.get_test_data(encode_non_object=False)
    T = mean_encoder.transform(T)

    # predict result.
    print('Predicting.')
    y_pred = xgbm.predict(T[X_train.columns])

    # write result.
    cu.write_result(y_pred)
def gen_zero_variance_features():
    X, _ = cu.get_train_data(encode_non_object=False)

    X.fillna(X.median(), inplace=True)  # IMPORTANT: VarianceThreshold cannot handle NaNs.

    from sklearn.feature_selection import VarianceThreshold
    selector = VarianceThreshold()
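    # With the default threshold=0.0, variances_ exposes each column's
    # variance after fit; entries at exactly 0 mark constant columns.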

    selector.fit(X)
    zero_variance_columns = [
        col for i, col in enumerate(X.columns) if selector.variances_[i] == 0
    ]

    return zero_variance_columns
def run():
    # read train data.
    X, y = cu.get_train_data(encode_non_object=True)

    # train model.
    lrm = BayesianRidgeModel()
    lrm.train(X, y)

    # read test data.
    T = cu.get_test_data(encode_non_object=True)

    # predict result.
    print('Predicting.')
    y_pred = lrm.predict(T[X.columns])

    # write result.
    cu.write_result(y_pred)
def run():
    # read train data.
    X, y = cu.get_train_data(encode_non_object=True, standard_scaler_flag=True)
    X = drop_columns(X)

    # train model.
    lrm = MLPRegressorModel()
    lrm.train(X, y)

    # read test data.
    T = cu.get_test_data(encode_non_object=True, standard_scaler_flag=True)

    # predict result.
    print('Predicting.')
    y_pred = lrm.predict(T[X.columns])

    # write result.
    cu.write_result(y_pred)
def run():
    def gridSearch():
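        # Exhaustive search over bin counts (a, b, c), zipped with tarlist;
        # partial results are appended to param.data after each inner loop.
        # Note: gridSearch is defined but not invoked in run() below.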
        st, nt, step = 5, 51, 5
        for a in range(st, nt, step):
            for b in range(st, nt, step):
                rlist = []
                for c in range(st, nt, step):
                    bindic = dict(zip(tarlist, [a, b, c]))
                    X_trans = dt.getTransData(X, tarlist, bindic)
                    # get CV from train data.
                    X_train, y_train, X_holdout, y_holdout = cu.get_cv(X_trans, y)
                
                    # train model.
                    xgbm = XGBoostModel()
                    xgbm.train(X_train, y_train, X_holdout, y_holdout)
                    rlist.append([a, b, c, xgbm.base_model.best_score])
                
                with open('../../data/param.data', 'a') as outfile:
                    for vs in rlist:
                        outfile.write('\t'.join([str(v) for v in vs]) + '\n')
    
    # read train data.
    X, y = cu.get_train_data(encode_non_object=True)
    tarlist = X.columns
    X_trans, propdic = dt.getTransData(X, y, tarlist)
    
    for c in tarlist:
        X_trans[c] = X_trans[c].astype(float)
    # get CV from train data.
    X_train, y_train, X_holdout, y_holdout = cu.get_cv(X_trans, y)

    # train model.
    xgbm = XGBoostModel()
    xgbm.train(X_train, y_train, X_holdout, y_holdout)
    
    # read test data.
    T = cu.get_test_data(encode_non_object=True)
    T_trans = dt.getTransTest(T, propdic)
    # predict result.
    print('Predicting.')
    y_pred = xgbm.predict(T_trans[X_train.columns])

    # write result.
    cu.write_result(y_pred)
def run():
    # read train data.
    X, y = cu.get_train_data(encode_non_object=False)

    # get CV from train data.
    X_train, y_train, X_holdout, y_holdout = cu.get_cv(X, y)

    # train model.
    lgbmm = LightGBMModel()
    lgbmm.train(X_train, y_train, X_holdout, y_holdout)

    # read test data.
    T = cu.get_test_data(encode_non_object=False)

    # predict result.
    print('Predicting.')
    y_pred = lgbmm.predict(T[X_train.columns])

    # write result.
    cu.write_result(y_pred)
def run_fe():
    # read train data.
    X, y = cu.get_train_data(encode_non_object=True)

    # feature utils
    from feature_utils import get_category_features, get_bool_features
    category_bool_columns = []
    category_bool_columns.extend(get_category_features())
    category_bool_columns.extend(get_bool_features())
    print('Drop category & bool columns: %s' % ','.join(category_bool_columns))
    X = X.drop(category_bool_columns, axis=1)

    # from sklearn.preprocessing import StandardScaler
    print('Standard Scaler.')
    for col in X.columns:
        if col in category_bool_columns:
            continue  # already dropped above; kept as a safety guard.
        col_mean, col_std = X[col].mean(), X[col].std()
        X[col] = (X[col] - col_mean) / col_std
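    # NOTE: pandas .std() uses ddof=1 while sklearn's StandardScaler uses
    # the population std (ddof=0), so this manual scaling differs slightly
    # from the commented-out StandardScaler alternative.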

    # train model.
    lrm = LinearRegressionModel()
    lrm.train(X, y, None, None)
def run():
    # read train data.
    X, y = cu.get_train_data(encode_non_object=False)

    print('Transform, replace feature outliers.')
    X['yearbuilt'] = 2016 - X['yearbuilt']

    yearbuilt_llimit, yearbuilt_ulimit = get_series_percentile(X['yearbuilt'])
    yearbuilt_median = X['yearbuilt'].median()
    taxamount_q1, taxamount_q3 = get_series_q1q3(X['taxamount'])

    X['yearbuilt'] = replace_with_value(X['yearbuilt'], yearbuilt_llimit,
                                        yearbuilt_ulimit, yearbuilt_median)
    X['taxamount'] = replace_with_iqr_boundary(X['taxamount'], taxamount_q1,
                                               taxamount_q3)

    # get CV from train data.
    X_train, y_train, X_holdout, y_holdout = cu.get_cv(X, y)

    # train model.
    xgbm = XGBoostModel()
    xgbm.train(X_train, y_train, X_holdout, y_holdout)

    # read test data.
    T = cu.get_test_data(encode_non_object=False)
    T['yearbuilt'] = 2016 - T['yearbuilt']
    T['yearbuilt'] = replace_with_value(T['yearbuilt'], yearbuilt_llimit,
                                        yearbuilt_ulimit, yearbuilt_median)
    T['taxamount'] = replace_with_iqr_boundary(T['taxamount'], taxamount_q1,
                                               taxamount_q3)

    # predict result.
    print('Predicting.')
    y_pred = xgbm.predict(T[X_train.columns])

    # write result.
    cu.write_result(y_pred)
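# One plausible shape for the IQR clipping helper used above -- an
# illustrative assumption, not the author's actual code:
def replace_with_iqr_boundary_sketch(series, q1, q3):
    iqr = q3 - q1
    # clip values outside [q1 - 1.5*iqr, q3 + 1.5*iqr] to the boundary.
    return series.clip(q1 - 1.5 * iqr, q3 + 1.5 * iqr)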
# NOTE: this example was truncated in the source.  The surviving tail
# belongs to a runTrain(X, y, fold, i, tarcols) helper -- the signature is
# inferred from the call site below, and d_train / d_valid must be
# xgb.DMatrix objects built from a CV split; the contents of the params
# dict are not recoverable.
def runTrain(X, y, fold, i, tarcols):
    params = {
        # ... truncated in the source ...
    }

    watchlist = [(d_train, 'train'), (d_valid, 'valid')]
    model = xgb.train(params,
                      d_train,
                      500,
                      watchlist,
                      early_stopping_rounds=100,
                      verbose_eval=1000)

    best = model.best_score
    return best


# read train data.
X, y = cu.get_train_data(encode_non_object=True)
tarlist = X.columns
X_trans, propdic = dt.getTransData(X, y, tarlist)

for c in tarlist:
    X_trans[c] = X_trans[c].astype(float)
# run training across folds.
rlist = []
tarcols = [
    'calculatedfinishedsquarefeet',
]
fold = 10
for i in range(fold):
    best = runTrain(X, y, fold, i, tarcols)
    rlist.append([i, best])


if __name__ == '__main__':

    X, y = cu.get_train_data(encode_non_object=False)

    X = fillna_zero(X)
    print(X.shape)

    # feature importance
    print('Generate feature importance.')
    print(get_feature_importance_df())

    # missing rate
    print('Missing rate.')
    missing_df = get_feature_missing_df(X)
    print(missing_df)

    print('Missing rate >= 0.90')
    print(get_features_by_missing_rate(missing_df, 0.90))