def run():
    # read train data.
    X, y = cu.get_train_data(encode_non_object=False)
    y_mean, y_std = y.mean(), y.std()
    y -= y_mean
    y /= y_std

    # get CV from train data.
    X_train, y_train, X_holdout, y_holdout = cu.get_cv(X, y)

    # train model.
    xgbm = XGBoostModel()
    xgbm.train(X_train, y_train, X_holdout, y_holdout)

    # read test data.
    T = cu.get_test_data(encode_non_object=False)

    # predict result.
    print('Predicting.')
    y_pred = xgbm.predict(T[X_train.columns])
    y_pred *= y_std
    y_pred += y_mean

    # write result.
    cu.write_result(y_pred)
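Every example below takes its train/holdout split from cu.get_cv(X, y). The real common_utils implementation isn't shown here; a minimal sketch, assuming a simple tail holdout (the fraction and the helper name are placeholders):

def get_cv_sketch(X, y, holdout_frac=0.2):
    # Hypothetical stand-in for cu.get_cv: hold out the last rows.
    n_holdout = int(len(X) * holdout_frac)
    X_train, X_holdout = X.iloc[:-n_holdout], X.iloc[-n_holdout:]
    y_train, y_holdout = y[:-n_holdout], y[-n_holdout:]
    return X_train, y_train, X_holdout, y_holdout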
Example #2
def run():
    # read train data.
    X, y = cu.get_train_data(encode_non_object=True)
    # train model.
    lrm = LinearRegressionModel()

    tarlist = [
        c for c in X.columns if c not in
        'fips,hashottuborspa,poolcnt,pooltypeid10,assessmentyear'.split(',')
    ]

    X_trans, propdic = getTransData(X, y, tarlist)
    x_train, y_train, x_holdout, y_holdout = cu.get_cv(X_trans, y)
    lrm.train(x_train, y_train, None, None)
    y_pred = lrm.predict(x_holdout)

    # holdout mean absolute error.
    score = abs(y_pred - y_holdout).mean()
    print(score)

    # clip the target to [-0.1, 0.1] to limit the influence of outliers.
    y_trans = [min(max(v, -0.1), 0.1) for v in y]
    lrm.train(X_trans, y_trans, None, None)

    # read test data.
    T = cu.get_test_data(encode_non_object=True)
    T_trans = getTransTest(T, propdic)
    # predict result.
    print('Predicting.')
    y_pred = lrm.predict(T_trans[X_trans.columns].values)

    # write result.
    cu.write_result(y_pred)
    print(max(list(lrm.base_model.coef_)))
    print(min(y_pred))
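getTransData / getTransTest are defined elsewhere. Judging from the returned propdic and the per-column bin counts (bindic) passed in a later example, they appear to fit a binned target-mean transform on train and replay it on test. A rough sketch of that fit/transform pattern; the quantile binning and all names here are assumptions:

import pandas as pd

def getTransData_sketch(X, y, tarlist, n_bins=10):
    # Hypothetical fit step: quantile-bin each column, learn per-bin
    # target means, keep edges and means in propdic for test-time replay.
    X_trans, propdic = X.copy(), {}
    y_ser = pd.Series(y, index=X.index)
    for c in tarlist:
        codes, edges = pd.qcut(X[c], n_bins, labels=False,
                               retbins=True, duplicates='drop')
        codes = pd.Series(codes, index=X.index)
        means = y_ser.groupby(codes).mean()
        propdic[c] = (edges, means)
        X_trans[c] = codes.map(means)
    return X_trans, propdic

def getTransTest_sketch(T, propdic):
    # Hypothetical transform step: re-bin test values with the train
    # edges and substitute the learned bin means.
    T_trans = T.copy()
    for c, (edges, means) in propdic.items():
        codes = pd.Series(pd.cut(T[c], edges, labels=False,
                                 include_lowest=True), index=T.index)
        T_trans[c] = codes.map(means)
    return T_trans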
def run_feature_outlier():
    # read train data.
    X, y = cu.get_train_data(encode_non_object=False)

    # transform feature 'yearbuilt'
    X['yearbuilt'] = 2016 - X['yearbuilt']

    result = []
    for feature in ['taxamount', 'yearbuilt']:
        for name, newSeries in generate_feature_replace_outlier(
                X[feature]).items():
            print('Try to deal with feature [%s] outlier by [%s].' %
                  (feature, name))

            # get CV from train data.
            newX = X.copy()
            newX[feature] = newSeries
            X_train, y_train, X_holdout, y_holdout = cu.get_cv(newX, y)

            # train model.
            xgbm = XGBoostModel()
            xgbm.train(X_train, y_train, X_holdout, y_holdout)

            result.append([feature, name, xgbm.base_model.best_score])

    print('\n'.join(','.join(str(o) for o in one) for one in result))
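generate_feature_replace_outlier isn't shown above; the loop implies it returns {strategy_name: transformed_series} pairs. A plausible sketch (the strategies and the 1.5x IQR fences are assumptions):

def generate_feature_replace_outlier_sketch(series):
    # Hypothetical: several outlier treatments for one feature series.
    q1, q3 = series.quantile(0.25), series.quantile(0.75)
    lower, upper = q1 - 1.5 * (q3 - q1), q3 + 1.5 * (q3 - q1)
    return {
        'none': series,
        'clip_iqr': series.clip(lower, upper),
        'to_median': series.where((series >= lower) & (series <= upper),
                                  series.median()),
    }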
def run_laglng_cluster():
    # read train data.
    X, y = cu.get_train_data(encode_non_object=False)

    m_distances = [1500, 500, 50]
    min_samples_list = [1, 10, 50]

    result = []
    for m_distance in m_distances:
        for min_samples in min_samples_list:
            print('Run DBSCAN m_distance = %d, min_samples = %d.' %
                  (m_distance, min_samples))
            newX = preprocess_raw_latlng(X)
            coordinates = get_coordinates(newX)
            dbscan = cluster_latlng(coordinates,
                                    m_distance=m_distance,
                                    min_samples=min_samples)
            centroid_dict = get_centroid_dict(dbscan, coordinates)
            newX = replace_predict_cluster_df(dbscan, centroid_dict, newX)

            # get CV from train data.
            X_train, y_train, X_holdout, y_holdout = cu.get_cv(newX, y)

            # train model.
            xgbm = XGBoostModel()
            xgbm.train(X_train, y_train, X_holdout, y_holdout)

            result.append(
                [m_distance, min_samples, xgbm.base_model.best_score])

    print('\n'.join(','.join(str(o) for o in one) for one in result))
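cluster_latlng and its companion helpers are external. A common way to implement the clustering step with scikit-learn, treating m_distance as meters and converting to radians for the haversine metric (that mapping is an assumption):

import numpy as np
from sklearn.cluster import DBSCAN

def cluster_latlng_sketch(coordinates, m_distance, min_samples):
    # Hypothetical: DBSCAN over [lat, lng] pairs in degrees, using the
    # haversine metric; eps = meters / Earth radius (~6,371,000 m).
    earth_radius_m = 6371000.0
    return DBSCAN(eps=m_distance / earth_radius_m,
                  min_samples=min_samples,
                  metric='haversine').fit(np.radians(coordinates))

The fitted object's labels_ would presumably drive get_centroid_dict and the cluster replacement.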
Example #5
def get_feature_importance_df(importance_type='gain'):
    from xgboost_baseline import XGBoostModel

    # read train data.
    X, y = cu.get_train_data(encode_non_object=False)

    # get CV from train data.
    X_train, y_train, X_holdout, y_holdout = cu.get_cv(X, y)

    # train model.
    xgbm = XGBoostModel()
    xgbm.train(X_train, y_train, X_holdout, y_holdout)

    # feature importance
    tmp = xgbm.base_model.get_score(importance_type=importance_type)
    columns, importances = [], []
    for c, i in tmp.items():
        columns.append(c)
        importances.append(i)

    importance_df = pd.DataFrame({
        'column_name': columns,
        'importance': importances
    })
    # sort ascending so the most important features come last.
    importance_df = importance_df.sort_values(by='importance', ascending=True)
    importance_df = importance_df.reset_index(drop=True)

    return importance_df
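Example usage; importance_type can be any type xgboost's get_score accepts ('weight', 'gain', 'cover', 'total_gain', 'total_cover'):

importance_df = get_feature_importance_df(importance_type='gain')
print(importance_df.tail(10))  # ten most important features (sorted ascending)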
def run():
    # read train data.
    X, y = cu.get_train_data(encode_non_object=False)

    # MeanEncoder
    print('Use MeanEncoder.')
    mean_encoder = MeanEncoder(
        categorical_features=[
            'regionidcity', 'regionidneighborhood', 'regionidzip'
        ],
        target_type='regression')

    X = mean_encoder.fit_transform(X, pd.Series(y))
    X = X.drop(mean_encoder.categorical_features, axis=1)

    # get CV from train data.
    X_train, y_train, X_holdout, y_holdout = cu.get_cv(X, y)

    # train model.
    xgbm = XGBoostModel()
    xgbm.train(X_train, y_train, X_holdout, y_holdout)

    # read test data.
    T = cu.get_test_data(encode_non_object=False)
    T = mean_encoder.transform(T)

    # predict result.
    print('Predicting.')
    y_pred = xgbm.predict(T[X_train.columns])

    # write result.
    cu.write_result(y_pred)
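MeanEncoder is an external class; conceptually it replaces each category with (a regularized form of) the mean target for that category. A bare-bones illustration of the idea, without MeanEncoder's smoothing or out-of-fold machinery:

import pandas as pd

def mean_encode_sketch(train_col, y, test_col):
    # Hypothetical: map each category to its mean target; unseen test
    # categories fall back to the global mean.
    y_ser = pd.Series(y, index=train_col.index)
    means = y_ser.groupby(train_col).mean()
    return train_col.map(means), test_col.map(means).fillna(y_ser.mean())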
Example #7
def run():
    def gridSearch():
        # Grid over bin counts for the first three columns in tarlist
        # (zip truncates to three values); results are appended to
        # param.data. Note: defined for manual use; run() does not call it.
        st, nt, step = 5, 51, 5
        for a in range(st, nt, step):
            for b in range(st, nt, step):
                rlist = []
                for c in range(st, nt, step):
                    bindic = dict(zip(tarlist, [a, b, c]))
                    X_trans = dt.getTransData(X, tarlist, bindic)
                    # get CV from train data.
                    X_train, y_train, X_holdout, y_holdout = cu.get_cv(X_trans, y)

                    # train model.
                    xgbm = XGBoostModel()
                    xgbm.train(X_train, y_train, X_holdout, y_holdout)
                    rlist.append([a, b, c, xgbm.base_model.best_score])

                with open('../../data/param.data', 'a') as outfile:
                    for vs in rlist:
                        outfile.write('\t'.join(str(v) for v in vs) + '\n')
    
    # read train data.
    X, y = cu.get_train_data(encode_non_object=True)
    tarlist = X.columns
    X_trans, propdic = dt.getTransData(X, y, tarlist)
    
    for c in tarlist:
        X_trans[c] = X_trans[c].astype(float)
    # get CV from train data.
    X_train, y_train, X_holdout, y_holdout = cu.get_cv(X_trans, y)

    # train model.
    xgbm = XGBoostModel()
    xgbm.train(X_train, y_train, X_holdout, y_holdout)
    
    # read test data.
    T = cu.get_test_data(encode_non_object=True)
    T_trans = dt.getTransTest(T, propdic)
    # predict result.
    print('Predicting.')
    y_pred = xgbm.predict(T_trans[X_train.columns])

    # write result.
    cu.write_result(y_pred)
Example #8
def gridSearch():
    st, nt, step = 5, 51, 5
    for a in range(st, nt, step):
        for b in range(st, nt, step):
            rlist = []
            for c in range(st, nt, step):
                bindic = dict(zip(tarlist, [a, b, c]))
                X_trans = dt.getTransData(X, tarlist, bindic)
                # get CV from train data.
                X_train, y_train, X_holdout, y_holdout = cu.get_cv(X_trans, y)

                # train model.
                xgbm = XGBoostModel()
                xgbm.train(X_train, y_train, X_holdout, y_holdout)
                rlist.append([a, b, c, xgbm.base_model.best_score])

            with open('../../data/param.data', 'a') as outfile:
                for vs in rlist:
                    outfile.write('\t'.join(str(v) for v in vs) + '\n')
def run():
    # read train data.
    X, y = cu.get_train_data(encode_non_object=False)

    # get CV from train data.
    X_train, y_train, X_holdout, y_holdout = cu.get_cv(X, y)

    # train model.
    lgbmm = LightGBMModel()
    lgbmm.train(X_train, y_train, X_holdout, y_holdout)

    # read test data.
    T = cu.get_test_data(encode_non_object=False)

    # predict result.
    print('Predicting.')
    y_pred = lgbmm.predict(T[X_train.columns])

    # write result.
    cu.write_result(y_pred)
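LightGBMModel mirrors the XGBoostModel wrapper used elsewhere. A minimal sketch of such a wrapper using lightgbm's sklearn API (the class name and hyperparameters are placeholders, not the tuned values):

import lightgbm as lgb

class LightGBMModelSketch:
    # Hypothetical wrapper exposing the same train/predict interface.
    def train(self, X_train, y_train, X_holdout, y_holdout):
        self.base_model = lgb.LGBMRegressor(n_estimators=1000,
                                            learning_rate=0.01)
        self.base_model.fit(X_train, y_train,
                            eval_set=[(X_holdout, y_holdout)],
                            eval_metric='mae',
                            callbacks=[lgb.early_stopping(100)])

    def predict(self, T):
        return self.base_model.predict(T)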
def run():
    # read train data.
    X, y = cu.get_train_data(encode_non_object=False)

    print('Transform, replace feature outliers.')
    X['yearbuilt'] = 2016 - X['yearbuilt']

    yearbuilt_llimit, yearbuilt_ulimit = get_series_percentile(X['yearbuilt'])
    yearbuilt_median = X['yearbuilt'].median()
    taxamount_q1, taxamount_q3 = get_series_q1q3(X['taxamount'])

    X['yearbuilt'] = replace_with_value(X['yearbuilt'], yearbuilt_llimit,
                                        yearbuilt_ulimit, yearbuilt_median)
    X['taxamount'] = replace_with_iqr_boundary(X['taxamount'], taxamount_q1,
                                               taxamount_q3)

    # get CV from train data.
    X_train, y_train, X_holdout, y_holdout = cu.get_cv(X, y)

    # train model.
    xgbm = XGBoostModel()
    xgbm.train(X_train, y_train, X_holdout, y_holdout)

    # read test data.
    T = cu.get_test_data(encode_non_object=False)
    T['yearbuilt'] = 2016 - T['yearbuilt']
    T['yearbuilt'] = replace_with_value(T['yearbuilt'], yearbuilt_llimit,
                                        yearbuilt_ulimit, yearbuilt_median)
    T['taxamount'] = replace_with_iqr_boundary(T['taxamount'], taxamount_q1,
                                               taxamount_q3)

    # predict result.
    print('Predicting.')
    y_pred = xgbm.predict(T[X_train.columns])

    # write result.
    cu.write_result(y_pred)
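The outlier helpers are defined elsewhere; their names imply their behavior. Plausible sketches (the percentile cutoffs and the 1.5x IQR multiplier are assumptions):

def get_series_percentile_sketch(series, lower_pct=0.01, upper_pct=0.99):
    # Hypothetical: percentile cutoffs used as the valid-value range.
    return series.quantile(lower_pct), series.quantile(upper_pct)

def get_series_q1q3_sketch(series):
    return series.quantile(0.25), series.quantile(0.75)

def replace_with_iqr_boundary_sketch(series, q1, q3):
    # Clip values outside [q1 - 1.5*IQR, q3 + 1.5*IQR] to the fences.
    iqr = q3 - q1
    return series.clip(q1 - 1.5 * iqr, q3 + 1.5 * iqr)

def replace_with_value_sketch(series, llimit, ulimit, value):
    # Replace values outside [llimit, ulimit] with a fixed value
    # (the yearbuilt median in the calls above).
    return series.where((series >= llimit) & (series <= ulimit), value)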
Example #11
    # Train the model, iterating on the data in batches of 32 samples
    model.fit(data, labels, epochs=10, batch_size=32)

    y_pred = model.predict(data)

    df = pd.DataFrame({1: list(y_pred[:, 0]), 2: list(labels)})
    df.to_clipboard()


# read train data.
X, y = cu.get_train_data(encode_non_object=True)
tarlist = X.columns  #['longitude', 'yearbuilt', 'taxamount']
X_trans, propdic = getTransData(X, tarlist)
from keras import regularizers

x_train, y_train, x_holdout, y_holdout = cu.get_cv(X_trans, y)
num, acfunc = 10, 'softmax'
model = Sequential([
    Dense(10,
          input_shape=(53,),
          kernel_regularizer=regularizers.l2(0.01),
          activity_regularizer=regularizers.l1(0.01)),
    Dense(5),
    Dense(5),
    Dense(1),
    Activation('linear')
])
# observed score: 0.052303568738
# For a mean squared error regression problem
model.compile(optimizer='rmsprop', loss='mae')
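The script stops at compile; a minimal continuation consistent with the fit call in the fragment above (epochs, batch size, and the holdout evaluation are assumptions):

import numpy as np

model.fit(np.asarray(x_train), np.asarray(y_train),
          validation_data=(np.asarray(x_holdout), np.asarray(y_holdout)),
          epochs=10, batch_size=32)
print(np.abs(model.predict(np.asarray(x_holdout))[:, 0]
             - np.asarray(y_holdout)).mean())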