# Build the model input by placing the two transformed feature arrays side by
# side: X_num_trans columns first, then X_cat_trans columns.
X_trans = np.hstack((X_num_trans, X_cat_trans))

print('Training model... ')
# 1000 trees is the active setting; 10 and 100 were tried previously.
rf = RandomForestClassifier(n_estimators=1000)
# Per-sample weights w are passed to the fit.
rf.fit(X_trans, y, sample_weight=w)
# Persist the fitted forest so it can be reloaded without retraining.
joblib.dump(rf, 'models/rf.pkl')
print('')

print('Predicting in sample... ')
# The evaluator is handed the imputer, scaler, encoder and model, and is then
# called on the untransformed X_num / X_cat inputs.
evaluator = ModelEvaluator(
    imputer=imp,
    scaler=scaler,
    encoder=enc,
    model=rf,
)
y_pred = evaluator.predict(X_num, X_cat)
# Weighted error: total weight of mispredicted rows over total weight.
print('Training Error = {0}'.format(
    np.sum(w[y != y_pred]) / np.sum(w)
))
# Number of rows predicted as class 1 and as class 0, in that order.
print('Predicted +, - counts = {0}, {1}'.format(
    y_pred[y_pred == 1].shape[0],
    y_pred[y_pred == 0].shape[0],
))
print('')

print('Importing test sample... ')
adapter = LearningDataAdapter(for_learning=True)
adapter.adapt_file('data/validate.csv')
# From here on X_num, X_cat, w and y refer to the validation file's contents,
# replacing the arrays used for training above.
X_num = adapter.X_num
X_cat = adapter.X_cat
w = adapter.w
y = adapter.y
print('')

print('Predicting out of sample... ')
y_pred = evaluator.predict(X_num, X_cat)
# Same weighted error as above, now on the validation sample.
print('Testing Error = {0}'.format(
    np.sum(w[y != y_pred]) / np.sum(w)
))
print('Training model... ')
# 1000 trees is the active setting; 10, 100 and 200 were tried previously.
rf = RandomForestClassifier(n_estimators=1000)
# The fit is unweighted here; the sample_weight=w variant is disabled.
rf.fit(X_trans, y)
# Persist the fitted forest so it can be reloaded without retraining.
joblib.dump(rf, 'models/rf.pkl')
print('')

print('Predicting in sample... ')
# The evaluator is handed the imputer, scaler, encoder and model, and is then
# called on the untransformed X_num / X_cat inputs.
evaluator = ModelEvaluator(
    imputer=imp,
    scaler=scaler,
    encoder=enc,
    model=rf,
)
y_pred = evaluator.predict(X_num, X_cat)
# Unweighted error: count of mispredicted rows over the total row count
# (the weighted variant is disabled). float() forces true division so the
# ratio is not truncated by Python 2 integer division.
print('Training Error = {0}'.format(
    y_pred[y != y_pred].shape[0] / float(y_pred.shape[0])
))
# Number of rows predicted as class 1 and as class 0, in that order.
print('Predicted +, - counts = {0}, {1}'.format(
    y_pred[y_pred == 1].shape[0],
    y_pred[y_pred == 0].shape[0],
))
print('')

print('Importing test sample... ')
adapter = LearningDataAdapter(for_learning=True)
adapter.adapt_file('data/validate.csv')
# From here on X_num, X_cat, w and y refer to the validation file's contents,
# replacing the arrays used for training above.
X_num = adapter.X_num
X_cat = adapter.X_cat
w = adapter.w
y = adapter.y
print('')