Example 1
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

# dp refers to a project-local data-preparation module (imported elsewhere in
# the file) that provides factorize_variables and get_kfold_obj.


def run_random_forest(X_train, y_train, X_test, y_test):
    print("Fitting Random Forest")
    # Integer-encode categorical columns in both splits.
    X_train = dp.factorize_variables(X_train)
    X_test = dp.factorize_variables(X_test)

    # Note: max_features='auto' is deprecated/removed in recent scikit-learn
    # releases; 'sqrt' is the equivalent setting there.
    max_features = [None, 'auto', 'log2']
    params = {'criterion': ['gini'],
              'random_state': [1234],
              'n_estimators': [100, 200],
              'max_features': max_features,
              'oob_score': [False],
              # EOSL -- just fit max-depth trees; not going to overfit
              'max_depth': [None, 10],
              'n_jobs': [1],
              }

    # Grid-search over the parameter combinations with 3-fold cross-validation.
    cv_func, y = dp.get_kfold_obj(y_train, k=3)
    grid = GridSearchCV(RandomForestClassifier(), params, cv=cv_func,
                        verbose=2)
    grid.fit(X_train, y_train.values)

    print(grid.best_score_)
    print(grid.best_estimator_)

    print("Training set score {}".format(grid.score(X_train, y_train)))
    print("Test set score {}".format(grid.score(X_test, y_test)))

    return grid
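
The run_random_forest function above relies on a project-local dp module that is not part of this listing. The sketch below shows what its two helpers might look like, assuming factorize_variables integer-encodes categorical columns and get_kfold_obj returns a splitter accepted by GridSearchCV's cv argument; the names match the calls above, but the behavior is an assumption, not the original implementation.

# Hypothetical stand-ins for the dp helpers used above; the real module is
# not shown, so this behavior is assumed.
import pandas as pd
from sklearn.model_selection import KFold


def factorize_variables(df):
    # Integer-encode object/categorical columns, leaving numeric ones untouched.
    df = df.copy()
    for col in df.select_dtypes(include=['object', 'category']).columns:
        df[col] = pd.factorize(df[col])[0]
    return df


def get_kfold_obj(y, k=3):
    # Return a k-fold splitter plus the labels, matching the two-value
    # return expected by run_random_forest.
    return KFold(n_splits=k, shuffle=True, random_state=1234), y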
Example 2
import numpy as np
import pandas as pd

# psf, dp, and rrf are project-local modules: problem-specific functions,
# data-preparation helpers, and the run_random_forest wrapper from Example 1.

y_train = pd.Series(psf.adjust_y(y_train))
y_test = pd.Series(psf.adjust_y(y_test))

X_ID = train.pop('Id')
Y_ID = test.pop('Id')

# are there any missing values?
dp.print_columns_with_missing(X_train)
dp.print_columns_with_missing(X_test)

# Run the model
rf_model = rrf.run_random_forest(X_train, y_train, X_test, y_test)
print(rf_model.get_params())
# Refit the grid on the full data set (X_data_full and y_data_full are built
# earlier in the full script, not shown in this excerpt).
rf_final_model = rf_model.fit(X_data_full, y_data_full.values)


# Score the actual test set
test = dp.read_data('{}data/test.csv'.format(file_path))
test = test.set_index('Id')
test = dp.factorize_variables(test)
test_predictions = rf_final_model.predict(test)
test = pd.DataFrame(np.transpose([test.index, test_predictions]))
test.columns = ["Id", "Hazard"]



# Store results and pickle model
# to_csv has no 'drop' keyword; index=False keeps the index column out of the file
test.to_csv('{}Output/{}.csv'.format(file_path, model_name), index=False)
#with open('/Users/Adrianna/Desktop/Kaggle/Liberty/Output/rf.pkl', 'wb') as f:
#	cPickle.dump(rf_model, f)
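
The commented-out block above uses cPickle, which only exists on Python 2; on Python 3 the plain pickle module (or joblib) covers the same use case. A minimal sketch, reusing the file_path and model_name variables from the script above:

# Persist the refitted model next to the submission file (Python 3 pickle in
# place of the Python 2 cPickle shown in the commented-out block).
import pickle

with open('{}Output/{}.pkl'.format(file_path, model_name), 'wb') as f:
    pickle.dump(rf_final_model, f)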