# NOTE(review): this line is a newline-collapsed fragment (Python 2, note the
# `print` statements) and is not valid as a single physical line. It is the
# TAIL of a tree-ensemble comparison function — presumably `decision_tree`,
# given the `__main__` call at the end — fitting clf_5 on the "casual" target,
# predicting with five already-fitted models (clf_1..clf_5), and printing the
# RMSLE of (regular + casual) predictions per model plus clf_4's feature
# importances. `ytest` and the `*_regular` prediction arrays come from code
# before this fragment — TODO confirm against the full file. The enclosing
# `def` is not visible here, so the code is left byte-identical rather than
# reformatted.
clf_5.fit(Xtrain, ytrain_casual) print 'Finished fitting' dt_casual = clf_1.predict(Xtest) ada_casual = clf_2.predict(Xtest) grad_casual = clf_3.predict(Xtest) rf_casual = clf_4.predict(Xtest) et_casual = clf_5.predict(Xtest) feature_imps = clf_4.feature_importances_ print "regular decision tree" print rmsle(ytest, dt_regular + dt_casual) print "boosted decision tree" print rmsle(ytest, ada_regular + ada_casual) print "gradient tree boosting" print rmsle(ytest, grad_regular + grad_casual) print "random forest classifier" print rmsle(ytest, rf_regular + rf_casual) print "extra trees classifier" print rmsle(ytest, et_casual + et_regular) print "feature importances" print feature_imps if __name__ == '__main__': (X,y1,y2,y3) = import_training_file(sys.argv[1], True) decision_tree(X, y1, y2, y3)
# SVM experiment script (newline-collapsed in the original; reconstructed
# here with conventional formatting). Loads the bike-share training file
# given as argv[1] and splits the first half of the rows into a training
# partition and the rest into a test partition, for each of the three
# targets (total / registered / casual counts).
from sklearn import datasets
from sklearn.svm import SVC
from sklearn.svm import SVR
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score
from datetime import datetime
from import_train import rmsle
from import_train import import_training_file
from sklearn.grid_search import GridSearchCV
from sklearn.metrics import classification_report
from sklearn import preprocessing as pre
from scipy import sparse
import sys

if __name__ == '__main__':
    # import_training_file returns (features, total, registered, casual).
    (X, y_total, y_regis, y_casual) = import_training_file(sys.argv[1], True)
    n, d = X.shape
    # BUG FIX: the original computed `nTrain = 0.5*n`, a float, and used it
    # directly as a slice index. Modern Python/NumPy reject float indices;
    # int() truncation reproduces the old implicit behavior exactly.
    nTrain = int(0.5 * n)
    # First half of the rows = training split.
    Xtrain = X[:nTrain, :]
    y_casual_train = y_casual[:nTrain]
    y_regis_train = y_regis[:nTrain]
    y_total_train = y_total[:nTrain]
    # Second half = held-out test split.
    Xtest = X[nTrain:, :]
    y_casual_test = y_casual[nTrain:]
    y_regis_test = y_regis[nTrain:]
    y_total_test = y_total[nTrain:]
    # linear
    # param_grid = {'C': [1, 5, 10, 100],}
    # NOTE(review): the original chunk is truncated here — the model
    # fitting/evaluation presumably follows in the full file.
# Variant of the SVM experiment script (newline-collapsed in the original;
# reconstructed here with conventional formatting). Loads the bike-share
# training file given as argv[1] and takes the first half of the rows as
# the training partition for the three targets (total / registered /
# casual counts).
from sklearn import datasets
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn.svm import SVR
from sklearn.metrics import accuracy_score
from datetime import datetime
from import_train import rmsle
from import_train import import_training_file
from sklearn.grid_search import GridSearchCV
from sklearn.metrics import classification_report
from sklearn import preprocessing as pre
from scipy import sparse
import sys

if __name__ == '__main__':
    # import_training_file returns (features, total, registered, casual).
    (X, y_total, y_regis, y_casual) = import_training_file(sys.argv[1], True)
    n, d = X.shape
    # BUG FIX: the original computed `nTrain = 0.5 * n`, a float, and used
    # it directly as a slice index. Modern Python/NumPy reject float
    # indices; int() truncation reproduces the old implicit behavior.
    nTrain = int(0.5 * n)
    # shuffle the data
    #idx = np.arange(n)
    #np.random.seed(42)
    #np.random.shuffle(idx)
    #X = X[idx]
    #y = y[idx]
    Xtrain = X[:nTrain, :]
    y_casual_train = y_casual[:nTrain]
    y_regis_train = y_regis[:nTrain]
    y_total_train = y_total[:nTrain]
    # NOTE(review): the original chunk is truncated here — the test split
    # and model fitting presumably follow in the full file.
# NOTE(review): newline-collapsed fragment (Python 2 `print` statements),
# near-identical to the chunk on the first line of this file but with one
# extra `clf_4.fit` call. It is the TAIL of a tree-ensemble comparison
# function: fits clf_4/clf_5 on the "casual" target, predicts with five
# models, and prints the RMSLE of (regular + casual) predictions per model
# plus clf_4's feature importances. `ytest` and the `*_regular` arrays come
# from code before this fragment — TODO confirm against the full file. The
# enclosing `def` is not visible, so the code is left byte-identical rather
# than reformatted.
clf_4.fit(Xtrain, ytrain_casual) clf_5.fit(Xtrain, ytrain_casual) print 'Finished fitting' dt_casual = clf_1.predict(Xtest) ada_casual = clf_2.predict(Xtest) grad_casual = clf_3.predict(Xtest) rf_casual = clf_4.predict(Xtest) et_casual = clf_5.predict(Xtest) feature_imps = clf_4.feature_importances_ print "regular decision tree" print rmsle(ytest, dt_regular + dt_casual) print "boosted decision tree" print rmsle(ytest, ada_regular + ada_casual) print "gradient tree boosting" print rmsle(ytest, grad_regular + grad_casual) print "random forest classifier" print rmsle(ytest, rf_regular + rf_casual) print "extra trees classifier" print rmsle(ytest, et_casual + et_regular) print "feature importances" print feature_imps if __name__ == '__main__': (X, y1, y2, y3) = import_training_file(sys.argv[1], True) decision_tree(X, y1, y2, y3)
# NOTE(review): newline-collapsed fragment that BEGINS mid-call (the tail of
# a plt.scatter(...) for training residuals inside a `plot_residuals`-style
# function) and ENDS mid-call (a RandomForestRegressor(...) constructor cut
# off after `compute_importances=None,`). Between those edges: the residual
# plot is finished (zero line, limits, labels, show), then a `__main__`
# block loads train/test files from argv[1]/argv[2] and builds a 50% split
# over features and the total/registered/casual targets. `compute_importances`
# was removed from modern scikit-learn — presumably this targets an old
# version; verify before running. Both edges are mid-statement, so the code
# is left byte-identical rather than reformatted.
train_pred - train_given, c='b', s=40, alpha=0.25, label='Training Residuals') plt.plot([0, 600], [0, 0], c='r') plt.xlim(0, 600) plt.legend() plt.title('Residuals vs Bike Share Count') plt.xlabel("Predicted count values") plt.ylabel("Residuals") plt.show() if __name__ == '__main__': X_train, y_total, y_reg, y_casual = import_training_file(sys.argv[1]) X_test, datetime = import_testing_file(sys.argv[2]) n, _ = X_train.shape nTrain = int(0.5 * n) #training on 50% of the data sub_Xtrain = X_train[:nTrain, :] sub_ytrain = y_total[:nTrain] sub_ytrain_registered = y_reg[:nTrain] sub_ytest_registered = y_reg[nTrain:] sub_ytrain_casual = y_casual[:nTrain] sub_ytest_casual = y_casual[nTrain:] sub_Xtest = X_train[nTrain:, :] sub_ytest = y_total[nTrain:] rf_opt = RandomForestRegressor(bootstrap=True, compute_importances=None,
if row[5] >= 80: row[5] = 4 elif row[5] >= 60: row[5] = 3 elif row[5] >= 40: row[5] = 2 elif row[5] >= 20: row[5] = 1 else: row[5] = 0 # reclassify wind speed if row[6] >= 30.0: row[6] = 3 elif row[6] >= 20.0: row[6] = 2 elif row[6] >= 10.0: row[6] = 1 else: row[6] = 0 print X[0] print y[0] return simple_naive_bayes(X, y) if __name__ == '__main__': (X, y) = import_training_file(sys.argv[1], True) simple_naive_bayes(X, y) print '------------------' harder_naive_bayes(X, y)
# NOTE(review): newline-collapsed fragment (Python 2 `print`). It is the
# TAIL of a `logistic_regression`-style function (per the `__main__` call):
# sweeps C over 10^1..10^5, fitting L1- and L2-penalized LogisticRegression
# on (Xtrain, ytrain) and printing RMSLE on (Xtest, ytest) for each, then
# fits a plain LinearRegression and prints its predictions and RMSLE.
# `Xtrain/ytrain/Xtest/ytest` come from code before this fragment — TODO
# confirm the split against the full file. The enclosing `def` is not
# visible, so the code is left byte-identical rather than reformatted.
for i, C in enumerate(10. ** np.arange(1, 6)): clf_l1_LR = LogisticRegression(C=C, penalty='l1', tol=0.01) clf_l2_LR = LogisticRegression(C=C, penalty='l2', tol=0.01) clf_l1_LR.fit(Xtrain, ytrain) clf_l2_LR.fit(Xtrain, ytrain) y1 = clf_l1_LR.predict(Xtest) y2 = clf_l2_LR.predict(Xtest) #L1 penalty print "L1 Penalty with C=" + str(C) print rmsle(ytest, y1) print "L2 Penalty with C=" + str(C) #L2 penalty print rmsle(ytest, y2) logreg = LinearRegression() logreg.fit(Xtrain, ytrain) y3 = logreg.predict(Xtest) print "Linear Regression" print y3 print rmsle(ytest,y3) if __name__ == '__main__': (X, y) = import_training_file(sys.argv[1], True) logistic_regression(X, y)
# NOTE(review): newline-collapsed fragment. It BEGINS mid-loop (writing a
# per-row {'datetime', 'count'} dict to a csv writer — rounding predictions
# with np.rint — the loop header and writer setup are not visible), then
# contains a COMPLETE `plot_residuals(train_pred, train_given, test_pred,
# test_given)` definition (scatter of test and training residuals vs
# predictions, zero line, labels, show), then a `__main__` block that loads
# train/test files from argv[1]/argv[2], builds a 50% split over the
# total/registered/casual targets, and ENDS mid-call in a
# RandomForestRegressor(...) constructor. `compute_importances` was removed
# from modern scikit-learn — presumably this targets an old version; verify
# before running. Both chunk edges are mid-statement, so the code is left
# byte-identical rather than reformatted.
temp_dict = {'datetime' : datetime[idx], 'count' : np.rint(y_total_pred[idx])} writer.writerow(temp_dict) def plot_residuals(train_pred, train_given, test_pred, test_given): plt.scatter(test_pred, test_pred - test_given, c='g', s=40, alpha=0.8, label='Testing Residuals') plt.scatter(train_pred, train_pred - train_given, c='b', s=40, alpha=0.25, label='Training Residuals') plt.plot([0,600],[0,0], c='r') plt.xlim(0,600) plt.legend() plt.title('Residuals vs Bike Share Count') plt.xlabel("Predicted count values") plt.ylabel("Residuals") plt.show() if __name__ == '__main__': X_train, y_total, y_reg, y_casual = import_training_file(sys.argv[1]) X_test, datetime = import_testing_file(sys.argv[2]) n, _ = X_train.shape nTrain = int(0.5*n) #training on 50% of the data sub_Xtrain = X_train[:nTrain,:] sub_ytrain = y_total[:nTrain] sub_ytrain_registered = y_reg[:nTrain] sub_ytest_registered = y_reg[nTrain:] sub_ytrain_casual = y_casual[:nTrain] sub_ytest_casual = y_casual[nTrain:] sub_Xtest = X_train[nTrain:,:] sub_ytest = y_total[nTrain:] rf_opt = RandomForestRegressor(bootstrap=True, compute_importances=None, criterion='mse', max_depth=None, max_features='auto',