def GBM(x_train, y_train, x_test, udf_trees=100, udf_lr=0.01, udf_max_depth=5,
        udf_minsam=50, do_CV=False, names=None):
    from sklearn.ensemble import GradientBoostingRegressor
    from sklearn.metrics import mean_squared_error
    from time import time
    import numpy as np
    import csv
    # `udf` is the project's helper module (provides cross_val_score_proba) and is
    # assumed to be imported at module level.

    if do_CV:
        ### Grid-search max_depth and min_samples_leaf with 5-fold CV ###
        param_grid = {'max_depth': [2, 3, 4, 5],
                      'min_samples_leaf': [50, 250, 1000, 2500]}
        est = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, verbose=1)
        cv_scores = list()
        params_list = list()
        start = time()
        for mdep in param_grid['max_depth']:
            for minSamples in param_grid['min_samples_leaf']:
                print 'Trying parameter combination: (Max_Depth=%i, minSamples=%i)' % (mdep, minSamples)
                est.min_samples_leaf = minSamples
                est.max_depth = mdep
                cv_score = udf.cross_val_score_proba(x_train, y_train, 5, est)
                cv_scores.append(np.mean(cv_score))
                ### Keep the parameter pair for display purposes ###
                params_list.append((mdep, minSamples))
        print 'Took %.2f seconds for parameter tuning.' % (time() - start)

        results = zip(params_list, cv_scores)  ## 16 parameter combinations
        print 'GBM Parameter tuning results........'
        print 'Parameters (max_depth, min_samples_in_leaf), CV_Scores'
        for i in range(len(results)):
            print results[i]
    else:
        ### Train the GBM with the optimal user-defined parameters found above ###
        print 'Fitting GBM with optimal user-defined parameters....'
        est = GradientBoostingRegressor(n_estimators=udf_trees, learning_rate=udf_lr,
                                        max_depth=udf_max_depth, min_samples_leaf=udf_minsam,
                                        verbose=1)
        est.fit(x_train, y_train)

        ### Rows whose flag column (column 1) equals 0 are forced to zero sales; the flag
        ### column itself is dropped so x_test matches the training features.
        idx = np.where(x_test[:, 1] == 0)
        x_test = np.delete(x_test, 1, axis=1)
        y_pred = est.predict(x_test)
        y_pred = np.exp(y_pred)  # the model is trained on log-sales
        y_pred[idx] = 0

        print 'Writing submission file....'
        with open('GBM_Submission.csv', 'wb') as testfile:
            w = csv.writer(testfile)
            w.writerow(('Id', 'Sales'))
            for i in range(len(y_pred)):
                w.writerow((i + 1, y_pred[i]))
        print 'File written to disk...'
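# Hypothetical usage sketch (not part of the original script); it assumes x_train,
# y_train and x_test have already been built by the project's preprocessing code and
# that y_train is on a log scale, since GBM() exponentiates its predictions:
#
#   GBM(x_train, y_train, x_test, do_CV=True)               # grid-search the tree parameters
#   GBM(x_train, y_train, x_test, udf_trees=3000, udf_lr=0.05,
#       udf_max_depth=4, udf_minsam=50, do_CV=False)         # final fit + submission file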
def RFC(x_train, y_train, x_test, udf_trees=100, udf_max_features='auto',
        udf_min_samples=50, do_CV=False, names=None):
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.metrics import roc_auc_score
    from time import time
    import numpy as np
    import csv
    # `udf` is the project's helper module (provides cross_val_score_proba) and is
    # assumed to be imported at module level.

    if do_CV:
        ### Grid-search max_features and min_samples_leaf with 5-fold CV ###
        param_grid = {'max_features': [2, 3, 4],
                      'min_samples_leaf': [50, 250, 1000, 2500]}
        est = RandomForestClassifier(n_estimators=100, verbose=1)
        cv_scores = list()
        params_list = list()
        start = time()
        for mfeatures in param_grid['max_features']:
            for minSamples in param_grid['min_samples_leaf']:
                print 'Trying parameter combination: (MaxFeatures=%i, minSamples=%i)' % (mfeatures, minSamples)
                est.min_samples_leaf = minSamples
                est.max_features = mfeatures
                cv_score = udf.cross_val_score_proba(x_train, y_train, 5, est)
                cv_scores.append(np.mean(cv_score))
                ### Keep the parameter pair for display purposes ###
                params_list.append((mfeatures, minSamples))
        print 'Took %.2f seconds for parameter tuning.' % (time() - start)

        results = zip(params_list, cv_scores)  ## 12 parameter combinations
        print 'Parameter tuning results........'
        print 'Parameters (max_features, min_samples_leaf), CV_Scores'
        for i in range(len(results)):
            print results[i]
    else:
        ### Train the Random Forest classifier with the optimal parameters found above ###
        print 'Fitting Random Forest with optimal user-defined parameters....'
        est = RandomForestClassifier(n_estimators=udf_trees, max_features=udf_max_features,
                                     min_samples_leaf=udf_min_samples, verbose=1)
        est.fit(x_train, y_train)
        y_pred = est.predict_proba(x_test)[:, 1]  ## Must predict probability!! ##

        ### Plot feature importances (helper assumed to be defined elsewhere in this module) ###
        plot_feature_importance(est, names)

        print 'Writing submission file....'
        with open('RFC_Submission.csv', 'wb') as testfile:
            w = csv.writer(testfile)
            w.writerow(('Id', 'Probability'))
            for i in range(len(y_pred)):
                w.writerow((i + 1, y_pred[i]))
        print 'File written to disk...'
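# Hypothetical usage sketch (not part of the original script); `feature_names` stands in
# for the list of column names handed to plot_feature_importance():
#
#   RFC(x_train, y_train, x_test, do_CV=True)                          # tune the forest
#   RFC(x_train, y_train, x_test, udf_trees=500, udf_max_features=3,
#       udf_min_samples=250, do_CV=False, names=feature_names)         # fit + submission file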
def logistic_regression(x_train, y_train, x_test, penalty='L2', regularization=1.0, do_CV=False):
    from sklearn.linear_model import LogisticRegression
    from sklearn.preprocessing import StandardScaler
    import numpy as np
    import csv
    # `udf` is the project's helper module (provides cross_val_score_proba) and is
    # assumed to be imported at module level.

    ### Mean-normalize the features before regression ###
    ss = StandardScaler()
    x_train = ss.fit_transform(x_train)
    x_test = ss.transform(x_test)  # reuse the scaler fitted on the training data

    if penalty == 'L1':
        lr = LogisticRegression(penalty='l1')
        filename = "Lasso_submission.csv"
    else:
        lr = LogisticRegression(penalty='l2')
        filename = "Ridge_submission.csv"

    if do_CV:
        ### Fit the model over a range of regularization strengths to select the optimal C ###
        Cs = np.logspace(-1.5, 1.5, 10)
        cv_list = list()
        for c in Cs:
            lr.C = c
            print 'Running K-fold CV with lambda = %.5f (C = %.5f)' % (1.0 / c, c)
            cv_scores = udf.cross_val_score_proba(x_train, y_train, 5, lr)
            cv_list.append(np.mean(cv_scores))

        print 'Best lambda based on Cross-Validation...'
        max_score = np.max(cv_list)
        best_C = Cs[cv_list.index(max_score)]
        print 1.0 / best_C, max_score
    else:
        ### Fit with the user-supplied lambda (C = 1/lambda) and predict test-set probabilities ###
        print 'Making prediction with optimal lambda....'
        lr.C = 1.0 / regularization
        lr.fit(x_train, y_train)
        y_pred = lr.predict_proba(x_test)[:, 1]
        print 'Coefficients of the regression:'
        print lr.coef_

        print 'Writing submission file....'
        with open(filename, 'wb') as testfile:
            w = csv.writer(testfile)
            w.writerow(('Id', 'Probability'))
            for i in range(len(y_pred)):
                w.writerow((i + 1, y_pred[i]))
        print 'File written to disk...'
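# Hypothetical usage sketch (not part of the original script); run the CV branch first to
# pick lambda, then refit with that value (regularization is lambda, i.e. 1/C):
#
#   logistic_regression(x_train, y_train, x_test, penalty='L1', do_CV=True)
#   logistic_regression(x_train, y_train, x_test, penalty='L1',
#                       regularization=0.5, do_CV=False)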