def get_sparse_onehot_features():
    """Load the data from ``./data`` and one-hot encode its categorical columns.

    The data is loaded with ``prepare_data(..., drop_categorical=False)``; the
    columns listed by ``get_cat_columns()`` are shifted up by one and then
    expanded by a sparse ``OneHotEncoder``. The encoder is fitted on the
    training matrix only and re-used to transform the holdout matrix.

    Returns:
        A tuple ``(X, y, X_holdout, ids)``. ``X`` and ``X_holdout`` are the
        encoded matrices converted with ``.tocsr()``; ``y`` and ``ids`` are
        passed through unchanged from ``prepare_data``.
    """
    train, target, holdout, row_ids = prepare_data("./data", drop_categorical=False)
    categorical = get_cat_columns()

    # Shift the categorical codes up by one in both matrices before encoding.
    # NOTE(review): presumably this makes every code non-negative (e.g. a -1
    # "missing" marker becomes 0) -- confirm against prepare_data.
    shifted_train = train[:, categorical] + 1
    train[:, categorical] = shifted_train
    shifted_holdout = holdout[:, categorical] + 1
    holdout[:, categorical] = shifted_holdout

    # Categories seen only in the holdout matrix are ignored rather than
    # raising, because the encoder is fitted on the training matrix alone.
    onehot = OneHotEncoder(
        categorical_features=categorical,
        sparse=True,
        handle_unknown="ignore",
    )
    train_encoded = onehot.fit_transform(train)
    holdout_encoded = onehot.transform(holdout)

    return train_encoded.tocsr(), target, holdout_encoded.tocsr(), row_ids
def build_multinomial_nb_features():
    """Build multinomial naive Bayes features and write them to disk.

    Only the categorical columns (as listed by ``get_cat_columns()``) of the
    training and holdout matrices are used, each shifted up by one. A
    ``MultinomialNB(alpha=1)`` classifier is handed to ``build_base_features``
    and the two arrays it returns are stacked vertically and written to
    ``./features/NB_oob.npy``.

    Returns:
        None. The only outputs are the progress message and the written file.
    """
    train, target, holdout, _ = prepare_data("./data", drop_categorical=False)
    categorical = get_cat_columns()

    # Keep only the categorical columns, then shift every code up by one.
    # NOTE(review): presumably needed because MultinomialNB rejects negative
    # feature values -- confirm what codes prepare_data produces.
    train_cat = train[:, categorical]
    holdout_cat = holdout[:, categorical]
    train_cat = train_cat + 1
    holdout_cat = holdout_cat + 1

    print("Getting OOB predictions from mNB")

    # NOTE(review): the trailing 10 is presumably the number of CV folds and
    # the two results presumably the train / holdout predictions -- verify
    # against build_base_features.
    train_feats, holdout_feats = build_base_features(
        MultinomialNB(alpha=1), train_cat, holdout_cat, target, 10
    )

    # NOTE(review): ndarray.tofile dumps raw element bytes with no shape or
    # dtype header, so despite the .npy suffix this file cannot be read with
    # np.load; readers need np.fromfile plus the known dtype and shape.
    stacked = np.vstack((train_feats, holdout_feats))
    stacked.tofile('./features/NB_oob.npy')
grid_cv = GridSearchCV(SVC(probability=True, random_state=SEED), param_grid=params, scoring='log_loss', n_jobs=-1, cv=5, verbose=1) grid_cv.fit(X_train, y_train) return grid_cv if __name__ == "__main__": X, y, X_holdout, ids = prepare_data("./data", drop_categorical=False) # prepare the categorical columns cat_idx = get_cat_columns() encoder = OneHotEncoder(categorical_features=cat_idx, sparse=True, handle_unknown="ignore") n_rows = X.shape[0] X[:, cat_idx] = X[:, cat_idx] + 1 X_holdout[:, cat_idx] = X_holdout[:, cat_idx] + 1 X = encoder.fit_transform(X) X_holdout = encoder.transform(X_holdout) print "Run SVC with {} data points and {} features.".format(X.shape[0], X.shape[1]) t0 = time() grid_cv = find_svm_model(X, y) best_clf = grid_cv.best_estimator_ print "CV log-loss: {}".format(grid_cv.best_score_) best_clf.fit(X, y)