def run():
    """Fit XGBoost on a standardized target and write test predictions.

    The target is shifted by its mean and scaled by its standard deviation
    before training; predictions are mapped back to the original scale
    before being handed to ``cu.write_result``.
    """
    # Load the training set.
    features, target = cu.get_train_data(encode_non_object=False)

    # Standardize the target in place, remembering the statistics so the
    # predictions can be un-standardized later.
    target_mean, target_std = target.mean(), target.std()
    target -= target_mean
    target /= target_std

    # Split into a training part and a holdout part.
    fit_X, fit_y, holdout_X, holdout_y = cu.get_cv(features, target)

    # Fit the model.
    model = XGBoostModel()
    model.train(fit_X, fit_y, holdout_X, holdout_y)

    # Load the test set.
    test_frame = cu.get_test_data(encode_non_object=False)

    # Predict using the same columns the model was trained on.
    print('Predicting.')
    predictions = model.predict(test_frame[fit_X.columns])

    # Undo the target standardization.
    predictions *= target_std
    predictions += target_mean

    # Persist the result.
    cu.write_result(predictions)
def run():
    """Evaluate and fit a linear regression model, then write predictions.

    Steps:
      1. Transform the selected feature columns with ``getTransData``.
      2. Train on the CV training part and print the mean absolute error
         on the holdout part.
      3. Retrain on all transformed rows against the target clipped to
         [-0.1, 0.1].
      4. Transform the test set with ``getTransTest``, predict, write the
         result and print the largest coefficient and smallest prediction.
    """
    # Load the training set.
    X, y = cu.get_train_data(encode_non_object=True)

    model = LinearRegressionModel()

    # Columns left out of the transformation.
    skipped = 'fips,hashottuborspa,poolcnt,pooltypeid10,assessmentyear'.split(',')
    tarlist = [col for col in X.columns if col not in skipped]
    X_trans, propdic = getTransData(X, y, tarlist)

    # Holdout evaluation: mean absolute error of the holdout predictions.
    x_train, y_train, x_holdout, y_holdout = cu.get_cv(X_trans, y)
    model.train(x_train, y_train, None, None)
    holdout_pred = model.predict(x_holdout)
    holdout_mae = abs(holdout_pred - y_holdout).mean()
    print(holdout_mae)

    # Final fit on every row, with the target clipped to [-0.1, 0.1].
    y_clipped = [max(min(0.1, value), -0.1) for value in y]
    model.train(X_trans, y_clipped, None, None)

    # Load and transform the test set with the mapping learned above.
    test_frame = cu.get_test_data(encode_non_object=True)
    test_trans = getTransTest(test_frame, propdic)

    # Predict on the same columns, passed as a plain array.
    print('Predicting.')
    test_pred = model.predict(test_trans[X_trans.columns].values)

    # Persist the result and print two quick diagnostics.
    cu.write_result(test_pred)
    coefficients = list(model.base_model.coef_)
    print(max(coefficients))
    print(min(test_pred))
def run_feature_outlier():
    """Compare outlier-replacement strategies for individual features.

    'yearbuilt' is first replaced by ``2016 - yearbuilt``. Then, for each of
    'taxamount' and 'yearbuilt', every replacement series returned by
    ``generate_feature_replace_outlier`` is substituted into a copy of the
    training frame, an XGBoost model is trained on a CV split of that copy,
    and the model's ``best_score`` is recorded. The collected
    ``feature,name,score`` rows are printed at the end, one per line.

    Output uses the single-argument ``print(...)`` form, which prints the
    same text on Python 2 and Python 3 (the former print statements were a
    syntax error on Python 3).
    """
    # read train data.
    X, y = cu.get_train_data(encode_non_object=False)

    # transform feature 'yearbuilt'
    X['yearbuilt'] = 2016 - X['yearbuilt']

    result = []
    for feature in ['taxamount', 'yearbuilt']:
        replacements = generate_feature_replace_outlier(X[feature])
        for name, newSeries in replacements.items():
            print('Try to deal with feature[%s] outlier by [%s].'
                  % (feature, name))

            # Work on a copy so each strategy starts from the same frame.
            newX = X.copy()
            newX[feature] = newSeries

            # get CV from train data.
            X_train, y_train, X_holdout, y_holdout = cu.get_cv(newX, y)

            # train model.
            xgbm = XGBoostModel()
            xgbm.train(X_train, y_train, X_holdout, y_holdout)

            result.append([feature, name, xgbm.base_model.best_score])

    # One comma-separated line per (feature, strategy, score) row.
    print('\n'.join(','.join(str(o) for o in one) for one in result))
def run_laglng_cluster():
    """Grid-search DBSCAN settings for the lat/lng clustering feature.

    For every combination of ``m_distance`` in (1500, 500, 50) and
    ``min_samples`` in (1, 10, 50), the raw coordinates are preprocessed,
    clustered with ``cluster_latlng``, replaced through
    ``replace_predict_cluster_df``, and an XGBoost model is trained on a CV
    split of the result. The model's ``best_score`` is recorded per
    combination and all ``m_distance,min_samples,score`` rows are printed
    at the end, one per line.

    Output uses the single-argument ``print(...)`` form, which prints the
    same text on Python 2 and Python 3 (the former print statements were a
    syntax error on Python 3).
    """
    # read train data.
    X, y = cu.get_train_data(encode_non_object=False)

    m_distances = [1500, 500, 50]
    min_sampleses = [1, 10, 50]

    result = []
    for m_distance in m_distances:
        for min_samples in min_sampleses:
            print('Run DBSCAN m_distance = %d, min_samples = %d.'
                  % (m_distance, min_samples))

            # Preprocessing is repeated per combination on purpose: the
            # helpers are opaque here and may modify the frame they get.
            newX = preprocess_raw_latlng(X)
            coordinates = get_coordinates(newX)
            dbscan = cluster_latlng(coordinates,
                                    m_distance=m_distance,
                                    min_samples=min_samples)
            centroid_dict = get_centroid_dict(dbscan, coordinates)
            newX = replace_predict_cluster_df(dbscan, centroid_dict, newX)

            # get CV from train data.
            X_train, y_train, X_holdout, y_holdout = cu.get_cv(newX, y)

            # train model.
            xgbm = XGBoostModel()
            xgbm.train(X_train, y_train, X_holdout, y_holdout)

            result.append(
                [m_distance, min_samples, xgbm.base_model.best_score])

    # One comma-separated line per (m_distance, min_samples, score) row.
    print('\n'.join(','.join(str(o) for o in one) for one in result))
def get_feature_importance_df(importance_type='gain'):
    """Return the trained model's feature importances as a sorted DataFrame.

    An XGBoost model is trained on a CV split of the training data and its
    ``base_model.get_score`` result is turned into a two-column frame.

    Args:
        importance_type: forwarded unchanged to ``get_score``.

    Returns:
        A DataFrame with columns 'column_name' and 'importance', sorted by
        ascending importance with a fresh 0..n-1 index.
    """
    from xgboost_baseline import XGBoostModel

    # Load the training set and split it for training / holdout.
    features, target = cu.get_train_data(encode_non_object=False)
    fit_X, fit_y, holdout_X, holdout_y = cu.get_cv(features, target)

    # Fit the model.
    model = XGBoostModel()
    model.train(fit_X, fit_y, holdout_X, holdout_y)

    # Feature name -> importance mapping; keys and values are read in the
    # same iteration order, so the two lists stay aligned.
    scores = model.base_model.get_score(importance_type=importance_type)
    names = list(scores.keys())
    values = list(scores.values())

    frame = pd.DataFrame({
        'column_name': names,
        'importance': values
    })
    return frame.sort_values(by='importance',
                             ascending=True).reset_index(drop=True)
def run():
    """Train XGBoost on mean-encoded region features and write predictions.

    The three region id columns are passed through a ``MeanEncoder`` fitted
    on the training target, then the encoder's categorical columns are
    dropped from the training frame. The fitted encoder is applied to the
    test set before predicting.
    """
    # Load the training set.
    train_X, train_y = cu.get_train_data(encode_non_object=False)

    # Fit the mean encoder on the training data.
    print('Use MeanEncoder.')
    region_columns = [
        'regionidcity', 'regionidneighborhood', 'regionidzip'
    ]
    mean_encoder = MeanEncoder(categorical_features=region_columns,
                               target_type='regression')
    train_X = mean_encoder.fit_transform(train_X, pd.Series(train_y))
    # Remove the columns the encoder lists as categorical.
    train_X = train_X.drop(mean_encoder.categorical_features, axis=1)

    # Split into a training part and a holdout part.
    fit_X, fit_y, holdout_X, holdout_y = cu.get_cv(train_X, train_y)

    # Fit the model.
    model = XGBoostModel()
    model.train(fit_X, fit_y, holdout_X, holdout_y)

    # Load the test set and apply the fitted encoder.
    test_frame = mean_encoder.transform(
        cu.get_test_data(encode_non_object=False))

    # Predict using the same columns the model was trained on.
    print('Predicting.')
    predictions = model.predict(test_frame[fit_X.columns])

    # Persist the result.
    cu.write_result(predictions)
def run():
    """Train XGBoost on ``dt``-transformed features and write predictions.

    Every column of the training frame is transformed via
    ``dt.getTransData`` and cast to float; the test set is transformed with
    the returned ``propdic`` via ``dt.getTransTest`` before predicting.

    A nested ``gridSearch`` helper is defined but not invoked by this
    function.
    """

    def gridSearch():
        """Scan three bin sizes over 5..50 (step 5) and log the scores.

        For each (a, b) pair, one tab-separated ``a, b, c, best_score`` line
        per c is appended to '../../data/param.data'. Reads ``X``, ``y`` and
        ``tarlist`` from the enclosing ``run`` scope.
        """
        first, stop, stride = 5, 51, 5
        for a in range(first, stop, stride):
            for b in range(first, stop, stride):
                rows = []
                for c in range(first, stop, stride):
                    # zip stops at the shorter input, so only the first
                    # three entries of tarlist receive a bin size.
                    bin_sizes = dict(zip(tarlist, [a, b, c]))
                    binned = dt.getTransData(X, tarlist, bin_sizes)

                    # Split into a training part and a holdout part.
                    fit_X, fit_y, hold_X, hold_y = cu.get_cv(binned, y)

                    # Fit the model.
                    model = XGBoostModel()
                    model.train(fit_X, fit_y, hold_X, hold_y)
                    rows.append([a, b, c, model.base_model.best_score])

                # Flush the rows of this (a, b) pair to the log file.
                with open('../../data/param.data','a') as outfile:
                    for row in rows:
                        fields = [str(value) for value in row]
                        outfile.write('\t'.join(fields) + '\n')

    # Load the training set.
    X, y = cu.get_train_data(encode_non_object=True)

    # Transform every column, then force a float dtype on each of them.
    tarlist = X.columns
    X_trans, propdic = dt.getTransData(X, y, tarlist)
    for column in tarlist:
        X_trans[column] = X_trans[column].astype(float)

    # Split into a training part and a holdout part.
    X_train, y_train, X_holdout, y_holdout = cu.get_cv(X_trans, y)

    # Fit the model.
    booster = XGBoostModel()
    booster.train(X_train, y_train, X_holdout, y_holdout)

    # Load the test set and transform it with the mapping learned above.
    test_frame = cu.get_test_data(encode_non_object=True)
    test_trans = dt.getTransTest(test_frame, propdic)

    # Predict using the same columns the model was trained on.
    print('Predicting.')
    predictions = booster.predict(test_trans[X_train.columns])

    # Persist the result.
    cu.write_result(predictions)
def gridSearch():
    """Scan three bin sizes over 5..50 (step 5) and log the scores.

    For each (a, b) pair, an XGBoost model is trained once per c on data
    transformed by ``dt.getTransData``; the resulting
    ``a, b, c, best_score`` rows are appended as tab-separated lines to
    '../../data/param.data'.

    Relies on ``X``, ``y`` and ``tarlist`` being available from an outer
    scope; they are not defined inside this function.
    """
    first, stop, stride = 5, 51, 5
    for a in range(first, stop, stride):
        for b in range(first, stop, stride):
            rows = []
            for c in range(first, stop, stride):
                # zip stops at the shorter input, so only the first three
                # entries of tarlist receive a bin size.
                bin_sizes = dict(zip(tarlist, [a, b, c]))
                binned = dt.getTransData(X, tarlist, bin_sizes)

                # Split into a training part and a holdout part.
                fit_X, fit_y, hold_X, hold_y = cu.get_cv(binned, y)

                # Fit the model.
                model = XGBoostModel()
                model.train(fit_X, fit_y, hold_X, hold_y)
                rows.append([a, b, c, model.base_model.best_score])

            # Flush the rows of this (a, b) pair to the log file.
            with open('../../data/param.data','a') as outfile:
                for row in rows:
                    fields = [str(value) for value in row]
                    outfile.write('\t'.join(fields) + '\n')
def run():
    """Train a LightGBM model on a CV split and write test predictions."""
    # Load the training set and split it for training / holdout.
    features, target = cu.get_train_data(encode_non_object=False)
    fit_X, fit_y, holdout_X, holdout_y = cu.get_cv(features, target)

    # Fit the model.
    model = LightGBMModel()
    model.train(fit_X, fit_y, holdout_X, holdout_y)

    # Load the test set.
    test_frame = cu.get_test_data(encode_non_object=False)

    # Predict using the same columns the model was trained on.
    print('Predicting.')
    predictions = model.predict(test_frame[fit_X.columns])

    # Persist the result.
    cu.write_result(predictions)
def run():
    """Train XGBoost after replacing feature outliers, then write predictions.

    'yearbuilt' is replaced by ``2016 - yearbuilt`` and passed through
    ``replace_with_value`` using limits and a median computed on the
    training data; 'taxamount' is passed through
    ``replace_with_iqr_boundary`` using training quartiles. The same
    training-derived statistics are reused on the test set.
    """
    # Load the training set.
    train_X, train_y = cu.get_train_data(encode_non_object=False)

    print('Transform, replace feature outliers.')
    train_X['yearbuilt'] = 2016 - train_X['yearbuilt']

    # Statistics are taken from the training data before any replacement.
    age_lower, age_upper = get_series_percentile(train_X['yearbuilt'])
    age_median = train_X['yearbuilt'].median()
    tax_q1, tax_q3 = get_series_q1q3(train_X['taxamount'])

    train_X['yearbuilt'] = replace_with_value(
        train_X['yearbuilt'], age_lower, age_upper, age_median)
    train_X['taxamount'] = replace_with_iqr_boundary(
        train_X['taxamount'], tax_q1, tax_q3)

    # Split into a training part and a holdout part.
    fit_X, fit_y, holdout_X, holdout_y = cu.get_cv(train_X, train_y)

    # Fit the model.
    model = XGBoostModel()
    model.train(fit_X, fit_y, holdout_X, holdout_y)

    # Load the test set and apply the same transformations, reusing the
    # statistics computed on the training data.
    test_frame = cu.get_test_data(encode_non_object=False)
    test_frame['yearbuilt'] = 2016 - test_frame['yearbuilt']
    test_frame['yearbuilt'] = replace_with_value(
        test_frame['yearbuilt'], age_lower, age_upper, age_median)
    test_frame['taxamount'] = replace_with_iqr_boundary(
        test_frame['taxamount'], tax_q1, tax_q3)

    # Predict using the same columns the model was trained on.
    print('Predicting.')
    predictions = model.predict(test_frame[fit_X.columns])

    # Persist the result.
    cu.write_result(predictions)
# Train the model, iterating on the data in batches of 32 samples model.fit(data, labels, epochs=10, batch_size=32) y_pred = model.predict(data) df = pd.DataFrame({1: list(y_pred[:, 0]), 2: list(labels)}) df.to_clipboard() # read train data. X, y = cu.get_train_data(encode_non_object=True) tarlist = X.columns #['longitude', 'yearbuilt', 'taxamount'] X_trans, propdic = getTransData(X, tarlist) from keras import regularizers x_train, y_train, x_holdout, y_holdout = cu.get_cv(X_trans, y) num, acfunc = 10, 'softmax' model = Sequential([ Dense(10, input_shape=(53,),\ kernel_regularizer=regularizers.l2(0.01), \ activity_regularizer=regularizers.l1(0.01) ), Dense(5), Dense(5), Dense(1), Activation('linear') ]) #0.052303568738 # For a mean squared error regression problem model.compile(optimizer='rmsprop', loss='mae')