Example #1
  clf_5.fit(Xtrain, ytrain_casual)


  print('Finished fitting')


  dt_casual = clf_1.predict(Xtest)
  ada_casual = clf_2.predict(Xtest)
  grad_casual = clf_3.predict(Xtest)
  rf_casual = clf_4.predict(Xtest)
  et_casual = clf_5.predict(Xtest)
  feature_imps = clf_4.feature_importances_

  print "regular decision tree"
  print rmsle(ytest, dt_regular + dt_casual)
  print "boosted decision tree"
  print rmsle(ytest, ada_regular + ada_casual)
  print "gradient tree boosting"
  print rmsle(ytest, grad_regular + grad_casual)
  print "random forest classifier"
  print rmsle(ytest, rf_regular + rf_casual)
  print "extra trees classifier"
  print rmsle(ytest, et_casual + et_regular)

  print "feature importances"
  print feature_imps

if __name__ == '__main__':
  (X, y1, y2, y3) = import_training_file(sys.argv[1], True)
  decision_tree(X, y1, y2, y3)
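Every one of these examples scores predictions with `rmsle` imported from `import_train`, which is not shown on this page. A minimal sketch of what such a helper typically looks like for Kaggle-style count data (an assumption, not the page's actual implementation):

import numpy as np

def rmsle(y_true, y_pred):
    # root mean squared logarithmic error; clip predictions at 0 so
    # negative outputs from a regressor don't break the log
    y_pred = np.maximum(np.asarray(y_pred, dtype=float), 0)
    y_true = np.asarray(y_true, dtype=float)
    return np.sqrt(np.mean((np.log1p(y_pred) - np.log1p(y_true)) ** 2))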
Example #2
from sklearn import datasets
from sklearn.svm import SVC
from sklearn.svm import SVR
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score
from datetime import datetime
from import_train import rmsle
from import_train import import_training_file
from sklearn.model_selection import GridSearchCV  # sklearn.grid_search was removed from scikit-learn
from sklearn.metrics import classification_report
from sklearn import preprocessing as pre
from scipy import sparse
import sys

if __name__ == '__main__':
    (X, y_total, y_regis, y_casual) = import_training_file(sys.argv[1], True)

    n, d = X.shape
    nTrain = int(0.5 * n)  # slice indices must be integers

    Xtrain = X[:nTrain, :]
    y_casual_train = y_casual[:nTrain]
    y_regis_train = y_regis[:nTrain]
    y_total_train = y_total[:nTrain]
    Xtest = X[nTrain:, :]
    y_casual_test = y_casual[nTrain:]
    y_regis_test = y_regis[nTrain:]
    y_total_test = y_total[nTrain:]

    # linear
    # param_grid = {'C': [1, 5, 10, 100]}
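The commented-out `param_grid` above hints at a grid search over the SVR regularization constant C. A minimal sketch of how that search might be wired up on the split built above, assuming the modern `sklearn.model_selection` import path (hypothetical, not the author's confirmed code):

    from sklearn.model_selection import GridSearchCV

    param_grid = {'C': [1, 5, 10, 100]}
    # sklearn's scoring convention is "higher is better", hence negated MSE
    search = GridSearchCV(SVR(kernel='linear'), param_grid,
                          scoring='neg_mean_squared_error', cv=3)
    search.fit(Xtrain, y_total_train)
    print(search.best_params_)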
Example #3
from sklearn import datasets
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn.svm import SVR
from sklearn.metrics import accuracy_score
from datetime import datetime
from import_train import rmsle
from import_train import import_training_file
from sklearn.model_selection import GridSearchCV  # sklearn.grid_search was removed from scikit-learn
from sklearn.metrics import classification_report
from sklearn import preprocessing as pre
from scipy import sparse
import sys

if __name__ == '__main__':
    (X, y_total, y_regis, y_casual) = import_training_file(sys.argv[1], True)

    n, d = X.shape
    nTrain = int(0.5 * n)  # slice indices must be integers

    # shuffle the data
    #idx = np.arange(n)
    #np.random.seed(42)
    #np.random.shuffle(idx)
    #X = X[idx]
    #y = y[idx]
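    # note: the commented shuffle above references a single y, but this
    # script carries three targets; an assumed fix would permute all of
    # them with the same index so rows stay aligned:
    #idx = np.arange(n)
    #np.random.seed(42)
    #np.random.shuffle(idx)
    #X = X[idx]
    #y_total = y_total[idx]
    #y_regis = y_regis[idx]
    #y_casual = y_casual[idx]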

    Xtrain = X[:nTrain, :]
    y_casual_train = y_casual[:nTrain]
    y_regis_train = y_regis[:nTrain]
    y_total_train = y_total[:nTrain]
Example #4
    clf_4.fit(Xtrain, ytrain_casual)
    clf_5.fit(Xtrain, ytrain_casual)

    print('Finished fitting')

    dt_casual = clf_1.predict(Xtest)
    ada_casual = clf_2.predict(Xtest)
    grad_casual = clf_3.predict(Xtest)
    rf_casual = clf_4.predict(Xtest)
    et_casual = clf_5.predict(Xtest)
    feature_imps = clf_4.feature_importances_

    print "regular decision tree"
    print rmsle(ytest, dt_regular + dt_casual)
    print "boosted decision tree"
    print rmsle(ytest, ada_regular + ada_casual)
    print "gradient tree boosting"
    print rmsle(ytest, grad_regular + grad_casual)
    print "random forest classifier"
    print rmsle(ytest, rf_regular + rf_casual)
    print "extra trees classifier"
    print rmsle(ytest, et_casual + et_regular)

    print "feature importances"
    print feature_imps


if __name__ == '__main__':
    (X, y1, y2, y3) = import_training_file(sys.argv[1], True)
    decision_tree(X, y1, y2, y3)
Example #5
    plt.scatter(train_pred,
                train_pred - train_given,
                c='b',
                s=40,
                alpha=0.25,
                label='Training Residuals')
    plt.plot([0, 600], [0, 0], c='r')
    plt.xlim(0, 600)
    plt.legend()
    plt.title('Residuals vs Bike Share Count')
    plt.xlabel("Predicted count values")
    plt.ylabel("Residuals")
    plt.show()


if __name__ == '__main__':
    X_train, y_total, y_reg, y_casual = import_training_file(sys.argv[1])
    X_test, datetime = import_testing_file(sys.argv[2])

    n, _ = X_train.shape
    nTrain = int(0.5 * n)  # training on 50% of the data
    sub_Xtrain = X_train[:nTrain, :]
    sub_ytrain = y_total[:nTrain]
    sub_ytrain_registered = y_reg[:nTrain]
    sub_ytest_registered = y_reg[nTrain:]
    sub_ytrain_casual = y_casual[:nTrain]
    sub_ytest_casual = y_casual[nTrain:]
    sub_Xtest = X_train[nTrain:, :]
    sub_ytest = y_total[nTrain:]

    # compute_importances was removed from scikit-learn; importances are
    # always available via feature_importances_ after fitting
    rf_opt = RandomForestRegressor(bootstrap=True,
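The constructor call is cut off by this excerpt. Once `rf_opt` is built and fit, the importances that `compute_importances` used to toggle are always exposed; a minimal sketch, assuming the split constructed above:

    rf_opt.fit(sub_Xtrain, sub_ytrain)
    # per-feature importance scores that sum to 1.0
    print(rf_opt.feature_importances_)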
Example #6
        if row[5] >= 80:
            row[5] = 4
        elif row[5] >= 60:
            row[5] = 3
        elif row[5] >= 40:
            row[5] = 2
        elif row[5] >= 20:
            row[5] = 1
        else:
            row[5] = 0

        # reclassify wind speed
        if row[6] >= 30.0:
            row[6] = 3
        elif row[6] >= 20.0:
            row[6] = 2
        elif row[6] >= 10.0:
            row[6] = 1
        else:
            row[6] = 0

    print(X[0])
    print(y[0])
    return simple_naive_bayes(X, y)


if __name__ == '__main__':
    (X, y) = import_training_file(sys.argv[1], True)
    simple_naive_bayes(X, y)
    print('------------------')
    harder_naive_bayes(X, y)
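The humidity and wind-speed reclassification above is a row-by-row cascade of threshold tests; `np.digitize` expresses the same binning in one vectorized call per column. An equivalent sketch (assuming `X` is a dense NumPy array, which the row assignments above already require):

    import numpy as np

    # humidity: <20 -> 0, [20,40) -> 1, [40,60) -> 2, [60,80) -> 3, >=80 -> 4
    X[:, 5] = np.digitize(X[:, 5], [20, 40, 60, 80])
    # wind speed: <10 -> 0, [10,20) -> 1, [20,30) -> 2, >=30 -> 3
    X[:, 6] = np.digitize(X[:, 6], [10.0, 20.0, 30.0])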
Example #7
  for i, C in enumerate(10. ** np.arange(1, 6)):
    clf_l1_LR = LogisticRegression(C=C, penalty='l1', tol=0.01, solver='liblinear')  # L1 needs a solver that supports it
    clf_l2_LR = LogisticRegression(C=C, penalty='l2', tol=0.01)
    clf_l1_LR.fit(Xtrain, ytrain)
    clf_l2_LR.fit(Xtrain, ytrain)


    y1 = clf_l1_LR.predict(Xtest)
    y2 = clf_l2_LR.predict(Xtest)

    # L1 penalty
    print("L1 Penalty with C=" + str(C))
    print(rmsle(ytest, y1))
    # L2 penalty
    print("L2 Penalty with C=" + str(C))
    print(rmsle(ytest, y2))

  # despite the name, logreg here is a plain linear regression baseline
  logreg = LinearRegression()
  logreg.fit(Xtrain, ytrain)
  y3 = logreg.predict(Xtest)
  print("Linear Regression")
  print(y3)
  print(rmsle(ytest, y3))



if __name__ == '__main__':
  (X, y) = import_training_file(sys.argv[1], True)
  logistic_regression(X, y)
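The loop above sweeps C by hand; scikit-learn's `LogisticRegressionCV` folds the same sweep into cross-validation. A minimal sketch on the same split (an alternative to, not a reconstruction of, the author's code):

  from sklearn.linear_model import LogisticRegressionCV

  # search the same grid of C values, picking the best by 5-fold CV
  clf = LogisticRegressionCV(Cs=10. ** np.arange(1, 6), penalty='l2', cv=5)
  clf.fit(Xtrain, ytrain)
  print(rmsle(ytest, clf.predict(Xtest)))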
Example #8
      temp_dict = {'datetime': datetime[idx], 'count': np.rint(y_total_pred[idx])}
      writer.writerow(temp_dict)
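  # assumed context, not shown in this excerpt: the loop above presumably
  # runs inside a csv.DictWriter setup over a hypothetical submission file,
  # e.g.:
  #   with open('submission.csv', 'w') as f:
  #     writer = csv.DictWriter(f, fieldnames=['datetime', 'count'])
  #     writer.writeheader()
  #     ... loop over y_total_pred as above ...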

def plot_residuals(train_pred, train_given, test_pred, test_given):
  plt.scatter(test_pred, test_pred - test_given, c='g', s=40, alpha=0.8, label='Testing Residuals')
  plt.scatter(train_pred, train_pred - train_given, c='b', s=40, alpha=0.25, label='Training Residuals')
  plt.plot([0, 600], [0, 0], c='r')
  plt.xlim(0, 600)
  plt.legend()
  plt.title('Residuals vs Bike Share Count')
  plt.xlabel("Predicted count values")
  plt.ylabel("Residuals")
  plt.show()

if __name__ == '__main__':
  X_train, y_total, y_reg, y_casual = import_training_file(sys.argv[1])
  X_test, datetime = import_testing_file(sys.argv[2])

  n, _ = X_train.shape
  nTrain = int(0.5 * n)  # training on 50% of the data
  sub_Xtrain = X_train[:nTrain, :]
  sub_ytrain = y_total[:nTrain]
  sub_ytrain_registered = y_reg[:nTrain]
  sub_ytest_registered = y_reg[nTrain:]
  sub_ytrain_casual = y_casual[:nTrain]
  sub_ytest_casual = y_casual[nTrain:]
  sub_Xtest = X_train[nTrain:, :]
  sub_ytest = y_total[nTrain:]

  # compute_importances was removed from scikit-learn; criterion 'mse' is now
  # spelled 'squared_error', and max_features='auto' (all features) is 1.0
  rf_opt = RandomForestRegressor(bootstrap=True,
           criterion='squared_error', max_depth=None, max_features=1.0,