def main(in_dir, out_dir):
    """Train a single RandomForest model on total 'count' and write a submission.

    Reads <in_dir>/train.csv, grid-searches a RandomForestRegressor with an
    RMSLE scorer, evaluates on a held-out split, then predicts
    <in_dir>/test.csv and writes <out_dir>/submission.csv.

    Parameters:
        in_dir: directory containing train.csv and test.csv
        out_dir: output directory (created if missing)
    """
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)

    in_file = in_dir + '/train.csv'
    print("reading datafile: " + str(in_file))
    df = pd.read_table(in_file, sep=',', header=0, parse_dates=['datetime'])
    # day-of-week derived from the timestamp as an extra feature
    df['dow'] = df['datetime'].apply(lambda x: x.weekday())

    # encode/scale features; enc and scalar are reused for the test set below
    X, enc, scalar = bc.prep_data(df)
    y = df[['count']].values

    X_train, X_test, y_train, y_test = sl.cross_validation.train_test_split(X, y)

    # Other models tried here: LinearRegression, DecisionTreeRegressor,
    # GradientBoostingClassifier/Regressor — RandomForest performed best.
    clf = ensemble.RandomForestRegressor()
    param_grid = {'n_estimators': [5, 10, 20, 50],
                  'max_features': (None, 0.75, 0.50, 0.25)}

    # RMSLE: lower is better, hence greater_is_better=False
    rmsle_scorer = sl.metrics.make_scorer(bc.score_func, greater_is_better=False)

    # train the rental model
    print('training')
    srch = sl.grid_search.GridSearchCV(clf, param_grid, scoring=rmsle_scorer)
    srch.fit(X_train, y_train.ravel())
    clf = srch.best_estimator_
    print('clf stats: best_score=%f best_params=%s'
          % (srch.best_score_, srch.best_params_))

    # evaluate on the held-out split; counts cannot be negative, so clip at 0
    zc = clf.predict(X_test)
    zc[zc < 0] = 0
    print('clf X_test RMSLE: ' + str(bc.score_func(y_test, zc)))

    # sanity check: score against the full training set
    z = clf.predict(X)
    print('clf Xt RMSLE: ' + str(bc.score_func(y, z)))

    ##############
    # run model against test dataset
    test_file = in_dir + '/test.csv'
    print("reading test datafile: " + str(test_file))
    df_test = pd.read_table(test_file, sep=',', header=0, parse_dates=['datetime'])
    df_test['dow'] = df_test['datetime'].apply(lambda x: x.weekday())
    # reuse the encoder/scaler fitted on the training data
    Xtest, enc, scalar = bc.prep_data(df_test, enc, scalar)

    zc = clf.predict(Xtest)
    zc[zc < 0] = 0
    df_test['count'] = zc
    df_test[['datetime', 'count']].to_csv(out_dir + '/submission.csv',
                                          sep=',', header=True, index=False)
def main(in_dir, out_dir):
    """Train a RandomForest 'count' model with a wide grid and write a submission.

    Also dumps a feature/label Pearson-correlation matrix to
    <out_dir>/Ycorr.csv for manual feature review.

    Parameters:
        in_dir: directory containing train.csv and test.csv
        out_dir: output directory (created if missing)
    """
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)

    in_file = in_dir + '/train.csv'
    print("reading datafile: " + str(in_file))
    df = pd.read_table(in_file, sep=',', header=0, parse_dates=['datetime'])
    df['dow'] = df['datetime'].apply(lambda x: x.weekday())

    X, enc, scalar = bc.prep_data(df)
    y = df[['count']].values

    # correlation of every prepped feature against the label, for visual review
    # (a column-pruning experiment based on these coefficients was tried and
    # abandoned)
    Xcorr = pd.concat([X, df[['count']]], axis=1)
    Ycorr = Xcorr.corr()
    Ycorr.to_csv(out_dir + '/Ycorr.csv', sep=',', header=True, index=False)

    X_train, X_test, y_train, y_test = sl.cross_validation.train_test_split(X, y)

    # RandomForest chosen over LinearRegression / DecisionTree /
    # GradientBoosting variants tried earlier
    clf = ensemble.RandomForestRegressor()
    param_grid = {'n_estimators': [5, 10, 20, 50, 100, 500, 1000],
                  'max_features': (None, 0.75, 0.50, 0.25)}

    # RMSLE: lower is better, hence greater_is_better=False
    rmsle_scorer = sl.metrics.make_scorer(bc.score_func, greater_is_better=False)

    # train the rental model
    print('training')
    srch = sl.grid_search.GridSearchCV(clf, param_grid,
                                       scoring=rmsle_scorer, verbose=1)
    srch.fit(X_train, y_train.ravel())
    clf = srch.best_estimator_
    print('clf stats: best_score=%f best_params=%s'
          % (srch.best_score_, srch.best_params_))

    # NOTE(review): GridSearchCV's default refit=True already refits the best
    # estimator on X_train, so this explicit fit is redundant (kept for parity
    # with earlier experiments that constructed clf directly).
    clf.fit(X_train, y_train.ravel())

    # evaluate on the held-out split; clip negative predictions to zero
    zc = clf.predict(X_test)
    print("number rows < 0 = " + str(len(zc[zc < 0])))
    zc[zc < 0] = 0
    print('clf CV RMSLE: ' + str(bc.score_func(y_test, zc)))

    ##############
    # run model against test dataset
    test_file = in_dir + '/test.csv'
    print("reading test datafile: " + str(test_file))
    df_test = pd.read_table(test_file, sep=',', header=0, parse_dates=['datetime'])
    df_test['dow'] = df_test['datetime'].apply(lambda x: x.weekday())
    # reuse the encoder/scaler fitted on the training data
    Xtest, enc, scalar = bc.prep_data(df_test, enc, scalar)

    zc = clf.predict(Xtest)
    zc[zc < 0] = 0
    df_test['count'] = zc
    df_test[['datetime', 'count']].to_csv(out_dir + '/submission.csv',
                                          sep=',', header=True, index=False)
def main(in_dir, out_dir):
    """Train separate casual/registered RandomForest models; sum them for 'count'.

    Two independent models are grid-searched (one per rider class), scored on
    held-out splits, refit on the full training set, and their summed
    predictions are written to <out_dir>/submission.csv. Per-row training
    predictions are dumped to <out_dir>/train_xval_preds.csv for review.

    Parameters:
        in_dir: directory containing train.csv and test.csv
        out_dir: output directory (created if missing)
    """
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)

    in_file = in_dir + "/train.csv"
    print("reading datafile: " + str(in_file))
    df = pd.read_table(in_file, sep=",", header=0, parse_dates=["datetime"])
    df["dow"] = df["datetime"].apply(lambda x: x.weekday())

    # prepare the feature data (dumped for visual review)
    X, enc, scalar = bc.prep_data(df)
    X.to_csv(out_dir + "/scratch.csv", sep=",", header=True, index=False)

    # extract the labels for the casual and registered bike sets
    yc = df[["casual"]].values
    yr = df[["registered"]].values

    # independent train/test splits for the two targets
    Xc_train, Xc_test, yc_train, yc_test = sl.cross_validation.train_test_split(X, yc)
    Xr_train, Xr_test, yr_train, yr_test = sl.cross_validation.train_test_split(X, yr)

    # Model bake-off results (local RMSLE vs. leaderboard):
    #   LinearRegression ~0.993 (15.58), DecisionTree ~0.252 (0.598),
    #   ExtraTree ~0.226 (0.570), SVR ~1.315, GradientBoosting ~0.983,
    #   RandomForest ~0.317 (0.549) — RandomForest wins.
    clfc = ensemble.RandomForestRegressor()
    clfr = ensemble.RandomForestRegressor()
    param_grid = {"n_estimators": [5, 10, 20, 50],
                  "max_features": (None, 0.75, 0.50, 0.25)}

    # turn my RMSLE method into a scoring func for use in GridSearch
    rmsle_scorer = sl.metrics.make_scorer(bc.score_func, greater_is_better=False)

    # train the casual rental model
    print("training 1")
    srch = sl.grid_search.GridSearchCV(clfc, param_grid, scoring=rmsle_scorer)
    srch.fit(Xc_train, yc_train.ravel())
    clfc = srch.best_estimator_
    print("clfc stats: best_score=%f best_params=%s"
          % (srch.best_score_, srch.best_params_))

    # BUG FIX: score the held-out split BEFORE refitting on the full dataset;
    # the original refit first, leaking the test rows into training and
    # inflating the printed score. Counts cannot be negative, so clip at 0.
    zc = clfc.predict(Xc_test)
    zc[zc < 0] = 0
    print("clf Xc rmsle: " + str(bc.score_func(yc_test.ravel(), zc.ravel())))

    # re-fit with entire dataset for the final predictions
    clfc.fit(X, yc.ravel())

    # train the registered rental model
    print("training 2")
    srch = sl.grid_search.GridSearchCV(clfr, param_grid, scoring=rmsle_scorer)
    srch.fit(Xr_train, yr_train.ravel())
    clfr = srch.best_estimator_
    print("clfr stats: best_score=%f best_params=%s"
          % (srch.best_score_, srch.best_params_))

    # score on the held-out split before the full refit (same fix as above)
    zr = clfr.predict(Xr_test)
    zr[zr < 0] = 0
    print("clf Xr rmsle: " + str(bc.score_func(yr_test, zr)))

    # re-fit with entire dataset
    clfr.fit(X, yr.ravel())

    # combined score on the full training set (optimistic — models were fit on
    # these rows): clip each prediction at zero, then sum
    zc = clfc.predict(X)
    zc[zc < 0] = 0
    zr = clfr.predict(X)
    zr[zr < 0] = 0
    zt = zc + zr
    yt = df[["count"]].values
    score = bc.score_func(yt, zt)
    print("RMLSE score=" + str(score))

    # dump per-row predictions alongside the training data for review
    df["casual_pred"] = zc
    df["registered_pred"] = zr
    df["count_pred"] = zt
    df.to_csv(out_dir + "/train_xval_preds.csv", sep=",", header=True, index=False)

    ###########################
    # apply trained models to test set
    test_file = in_dir + "/test.csv"
    print("reading test datafile: " + str(test_file))
    df_test = pd.read_table(test_file, sep=",", header=0, parse_dates=["datetime"])
    df_test["dow"] = df_test["datetime"].apply(lambda x: x.weekday())
    # reuse the encoder/scaler fitted on the training data
    Xtest, enc, scalar = bc.prep_data(df_test, enc, scalar)

    zc = clfc.predict(Xtest)
    zc[zc < 0] = 0
    zr = clfr.predict(Xtest)
    zr[zr < 0] = 0
    zt = zc + zr

    df_test["count"] = zt
    df_test[["datetime", "count"]].to_csv(out_dir + "/submission.csv",
                                          sep=",", header=True, index=False)