def build_extratrees_features():
    X, y, X_holdout, _ = prepare_data("./data", drop_categorical=False)
    print "Getting OOB predictions from ExtraTreesClassifier"
    clf = ExtraTreesClassifier(n_estimators=500, max_features=50,
                               criterion='entropy', min_samples_split=5,
                               max_depth=50, min_samples_leaf=5, n_jobs=4)
    X_1, X_2 = build_base_features(clf, X, X_holdout, y, 10)
    np.vstack((X_1, X_2)).tofile('./features/extra_trees_oob.npy')
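# build_base_features is defined elsewhere in this repo and is not shown in
# this section. Below is a minimal sketch of what such a helper typically
# does -- out-of-fold predictions for stacking -- assuming the old
# sklearn.cross_validation API used elsewhere in the repo. The name, fold
# logic, and signature are assumptions for illustration, not the repo's
# actual implementation.
def build_base_features_sketch(clf, X, X_holdout, y, n_folds):
    from sklearn.cross_validation import StratifiedKFold
    n_classes = len(np.unique(y))
    oof = np.zeros((X.shape[0], n_classes))
    for train_idx, test_idx in StratifiedKFold(y, n_folds=n_folds):
        # predict each fold with a model that never saw it during training
        clf.fit(X[train_idx], y[train_idx])
        oof[test_idx] = clf.predict_proba(X[test_idx])
    # refit on the full training set to score the holdout data
    clf.fit(X, y)
    return oof, clf.predict_proba(X_holdout)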
def get_sparse_onehot_features():
    X, y, X_holdout, ids = prepare_data("./data", drop_categorical=False)
    cat_idx = get_cat_columns()
    encoder = OneHotEncoder(categorical_features=cat_idx, sparse=True,
                            handle_unknown="ignore")
    # OneHotEncoder requires non-negative integer codes, so shift the
    # categorical columns by one before encoding
    X[:, cat_idx] = X[:, cat_idx] + 1
    X_holdout[:, cat_idx] = X_holdout[:, cat_idx] + 1
    X = encoder.fit_transform(X)
    X_holdout = encoder.transform(X_holdout)
    return X.tocsr(), y, X_holdout.tocsr(), ids
def build_multinomial_nb_features():
    X, y, X_holdout, _ = prepare_data("./data", drop_categorical=False)
    cat_idx = get_cat_columns()
    X, X_holdout = X[:, cat_idx], X_holdout[:, cat_idx]
    # MultinomialNB requires non-negative feature values, hence the shift
    X = X + 1
    X_holdout = X_holdout + 1
    print "Getting OOB predictions from MultinomialNB"
    clf = MultinomialNB(alpha=1)
    X_1, X_2 = build_base_features(clf, X, X_holdout, y, 10)
    np.vstack((X_1, X_2)).tofile('./features/NB_oob.npy')
def build_rf_features():
    X, y, X_holdout, ids = prepare_data("./data/", drop_categorical=False)
    X1, X2 = load_extra_features()
    X = np.hstack((X, X1))
    X_holdout = np.hstack((X_holdout, X2))
    X, X_test, y, y_test = train_test_split(X, y, test_size=0.2)
    for max_depth in range(1, 15):
        print "max depth {}".format(max_depth)
        rf_clf = RandomForestClassifier(n_estimators=200, max_depth=max_depth,
                                        criterion="entropy", n_jobs=-1)
        # reset the timer inside the loop so each fit is timed individually
        t0 = time()
        rf_clf.fit(X, y)
        print "Done in %0.3fs" % (time() - t0)
        print log_loss(y_test, rf_clf.predict_proba(X_test))
def build_knn_features():
    X, y, X_holdout, _ = prepare_data("./data", drop_categorical=True)
    n_rows = X.shape[0]
    # scale train and holdout together so both share the same feature scaling
    scaler = StandardScaler()
    Z = scaler.fit_transform(np.vstack((X, X_holdout)))
    X = Z[:n_rows]
    X_test = Z[n_rows:]
    for k in [1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024]:
        print "Getting OOB from KNN for k={}".format(k)
        clf = KNeighborsClassifier(k, n_jobs=-1)
        X_1, X_2 = build_base_features(clf, X, X_test, y, 10)
        np.vstack((X_1, X_2)).tofile('./features/knn_oob_{}.npy'.format(k))
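# load_extra_features (used by build_rf_features above and the xgboost
# script below) is also defined elsewhere. A plausible sketch, assuming it
# simply reloads the raw OOB dumps written by these builders: note that
# np.ndarray.tofile writes raw bytes with no shape or dtype header, so the
# loader must supply both. The file list, dtype, and two-column reshape
# here are assumptions for illustration.
def load_extra_features_sketch(n_train, n_classes=2):
    blocks = []
    for name in ['extra_trees_oob.npy', 'NB_oob.npy', 'knn_oob_8.npy']:
        M = np.fromfile('./features/{}'.format(name), dtype=np.float64)
        blocks.append(M.reshape(-1, n_classes))
    stacked = np.hstack(blocks)
    # rows 0..n_train-1 belong to the training set, the rest to the holdout
    return stacked[:n_train], stacked[n_train:]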
def build_rf_submission():
    X, y, X_holdout, ids = prepare_data("./data/", drop_categorical=False)
    # Hold out an extra y_train/y_test split to assess the quality of the
    # cross-validation estimates.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=42)
    print "Run Random Forest with {} data points and {} features.".format(
        X_train.shape[0], X_train.shape[1])
    t0 = time()
    grid_cv = find_cv_rf_model(X_train, y_train, grid=False)  # stochastic search now
    best_clf = grid_cv.best_estimator_
    y_pred = best_clf.predict_proba(X_test)
    print "Done in %0.3fs" % (time() - t0)
    print "Best params: {}".format(grid_cv.best_params_)
    print "Best CV score: {}".format(grid_cv.best_score_)
    print "Training log-loss: {}".format(log_loss(y_train, best_clf.predict_proba(X_train)))
    print "Training accuracy: {}".format(best_clf.score(X_train, y_train))
    print "Test log-loss: {}".format(log_loss(y_test, y_pred))
    print "Test accuracy: {}".format(best_clf.score(X_test, y_test))
    submission_name = "submission_{}.csv".format(time())
    util.note_submission_info("Model: {}".format(best_clf), submission_name)
    util.build_submission(best_clf, X_holdout, ids, submission_name)
from extraction import prepare_data, load_extra_features
import xgboost as xgb
import numpy as np
import pandas as pd
from time import time

if __name__ == "__main__":
    X, y, X_holdout, ids = prepare_data("./data/", drop_categorical=False)
    X_extra, X_holdout_extra = load_extra_features()
    params = {
        "objective": "binary:logistic",
        "eval_metric": "logloss",
        "eta": 0.005,  # 0.01
        "subsample": 0.8,
        "colsample_bytree": 0.8,
        "min_child_weight": 1,
        "max_depth": 10
    }
    # append the stacked OOB features to the raw feature matrices
    xg_train = xgb.DMatrix(np.hstack((X, X_extra)), label=y)
    xg_test = xgb.DMatrix(np.hstack((X_holdout, X_holdout_extra)))
    # xg_train = xgb.DMatrix(X, label=y)
    xgb_clf = xgb.train(params, xg_train, num_boost_round=2500,
                        verbose_eval=True, maximize=False)
    y_pred = xgb_clf.predict(xg_test)  # or: ntree_limit=xgb_clf.best_iteration
    # Alternative: estimate the number of boosting rounds with cross-validation.
    # cv_scores = xgb.cv(params, xg_train, num_boost_round=100, nfold=5,
    #                    metrics="logloss", seed=42, early_stopping_rounds=5)
    # print cv_scores
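    # pandas is imported above but never used; a minimal sketch of writing
    # the predictions out as a submission file. The column names "ID" and
    # "PredictedProb" are assumptions -- adjust them to the competition's
    # required format.
    submission = pd.DataFrame({"ID": ids, "PredictedProb": y_pred})
    submission.to_csv("xgb_submission_{}.csv".format(time()), index=False)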
from sklearn.decomposition import TruncatedSVD
import numpy as np
from extraction import prepare_data

if __name__ == "__main__":
    X, _, X_holdout, _ = prepare_data("./data/", drop_categorical=False)
    # decompose train and holdout together so both live in the same SVD basis
    A = np.vstack((X, X_holdout))
    print "Applying SVD"
    svd = TruncatedSVD(20)
    B = svd.fit_transform(A)
    print B.shape
    # dump each SVD component as its own raw feature file
    for col in xrange(B.shape[1]):
        B[:, col].tofile("./features/svd_{}.npy".format(col))
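    # Optional sanity check for the choice of 20 components (a suggested
    # addition, not part of the original script): TruncatedSVD exposes the
    # fraction of variance captured per component after fitting.
    print "Explained variance captured: {:.3f}".format(svd.explained_variance_ratio_.sum())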
from time import time
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder
from extraction import prepare_data, get_int_feature_columns
from visualization.learning_curve import plot_learning_curve

if __name__ == "__main__":
    X, y, _, _ = prepare_data("../data", drop_categorical=False)
    cat_idx = get_int_feature_columns()
    encoder = OneHotEncoder(categorical_features=cat_idx, sparse=True)
    X = encoder.fit_transform(X, y)
    plt = plot_learning_curve(estimator=LogisticRegression(C=0.1, penalty='l1'),
                              title="Learning Curves of LogReg with logloss",
                              X=X, y=y, cv=5, n_jobs=7, scoring="log_loss")
    plt.savefig("../images/learning_curve_logreg_{}.png".format(time()))
    plt.show()