Example #1
import csv
import numpy as np
from time import time

import udf	## user-defined helper module from this project; provides cross_val_score_proba

def GBM(x_train, y_train, x_test, udf_trees=100, udf_lr=0.01, udf_max_depth=5, udf_minsam=50, do_CV=False, names=None):
	from sklearn.ensemble import GradientBoostingRegressor

	if do_CV:
		param_grid = {'max_depth': [2, 3, 4, 5],
						'min_samples_leaf': [50, 250, 1000, 2500]}

		est = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, verbose=1)
		cv_scores = list()
		params_list = list()

		start = time()
		for mdep in param_grid['max_depth']:
			for minSamples in param_grid['min_samples_leaf']:
				print('Trying parameter combination: (max_depth=%i, min_samples_leaf=%i)' % (mdep, minSamples))
				est.min_samples_leaf = minSamples
				est.max_depth = mdep

				cv_score = udf.cross_val_score_proba(x_train, y_train, 5, est)	## project helper, sketched below
				cv_scores.append(np.mean(cv_score))

				### Record the parameter pair for display purposes ###
				params_list.append((mdep, minSamples))

		print('Took %.2f seconds for parameter tuning.' % (time() - start))

		print('GBM parameter tuning results........')
		print('Parameters (max_depth, min_samples_leaf), CV_Scores')
		for params, score in zip(params_list, cv_scores):	## 4 x 4 = 16 combinations
			print(params, score)
	else:
		### Train the GBM regressor with the optimal parameters found above ###
		print('Fitting GBM with optimal user-defined parameters....')
		est = GradientBoostingRegressor(n_estimators=udf_trees, learning_rate=udf_lr, max_depth=udf_max_depth, min_samples_leaf=udf_minsam, verbose=1)
		est.fit(x_train, y_train)

		idx = np.where(x_test[:, 1] == 0)	## rows where the open/closed flag in column 1 is 0
		x_test = np.delete(x_test, 1, axis=1)	## drop the flag column; the model was not trained on it
		y_pred = est.predict(x_test)
		y_pred = np.exp(y_pred)	## undo the log transform applied to the training target
		y_pred[idx] = 0	## closed stores are forced to zero sales

		print('Writing submission file....')
		with open('GBM_Submission.csv', 'w', newline='') as testfile:
			w = csv.writer(testfile)
			w.writerow(('Id', 'Sales'))
			for i in range(len(y_pred)):
				w.writerow((i + 1, y_pred[i]))
		print('File written to disk...')
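All three examples score candidate models with udf.cross_val_score_proba, a helper from the author's own udf module that the source does not show. A minimal sketch of what such a helper might look like for the classification examples, assuming it runs k-fold CV and returns the per-fold ROC AUC of predicted probabilities (the regression example above would need a squared-error analogue built on est.predict):

## Sketch only: the real udf.cross_val_score_proba is not shown in the source.
import numpy as np
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score

def cross_val_score_proba(x, y, n_folds, est):
	scores = []
	for train_idx, val_idx in KFold(n_splits=n_folds, shuffle=True, random_state=0).split(x):
		est.fit(x[train_idx], y[train_idx])
		proba = est.predict_proba(x[val_idx])[:, 1]	## probability of the positive class
		scores.append(roc_auc_score(y[val_idx], proba))
	return np.array(scores)

In current scikit-learn, sklearn.model_selection.cross_val_score(est, x, y, cv=5, scoring='roc_auc') covers the same ground, and GridSearchCV would replace the hand-rolled parameter loops entirely.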
Example #2
import csv
import numpy as np
from time import time

import udf	## user-defined helper module from this project; provides cross_val_score_proba

def RFC(x_train, y_train, x_test, udf_trees=100, udf_max_features='sqrt', udf_min_samples=50, do_CV=False, names=None):
	## 'sqrt' replaces the old 'auto' default, which newer scikit-learn removed
	from sklearn.ensemble import RandomForestClassifier

	if do_CV:
		param_grid = {'max_features': [2, 3, 4],
						'min_samples_leaf': [50, 250, 1000, 2500]}

		est = RandomForestClassifier(n_estimators=100, verbose=1)
		cv_scores = list()
		params_list = list()

		start = time()
		for mfeatures in param_grid['max_features']:
			for minSamples in param_grid['min_samples_leaf']:
				print('Trying parameter combination: (max_features=%i, min_samples_leaf=%i)' % (mfeatures, minSamples))
				est.min_samples_leaf = minSamples
				est.max_features = mfeatures

				cv_score = udf.cross_val_score_proba(x_train, y_train, 5, est)	## project helper, sketched above
				cv_scores.append(np.mean(cv_score))

				### Record the parameter pair for display purposes ###
				params_list.append((mfeatures, minSamples))

		print('Took %.2f seconds for parameter tuning.' % (time() - start))

		print('Parameter tuning results........')
		print('Parameters (max_features, min_samples_leaf), CV_Scores')
		for params, score in zip(params_list, cv_scores):	## 3 x 4 = 12 combinations
			print(params, score)
	else:
		### Train the RFC classifier with the optimal parameters found above ###
		print('Fitting Random Forest with optimal user-defined parameters....')
		est = RandomForestClassifier(n_estimators=udf_trees, max_features=udf_max_features, min_samples_leaf=udf_min_samples, verbose=1)
		est.fit(x_train, y_train)
		y_pred = est.predict_proba(x_test)[:, 1]	## must predict the positive-class probability, not the label

		### Plot feature importances ###
		plot_feature_importance(est, names)	## project helper, sketched below

		print('Writing submission file....')
		with open('RFC_Submission.csv', 'w', newline='') as testfile:
			w = csv.writer(testfile)
			w.writerow(('Id', 'Probability'))
			for i in range(len(y_pred)):
				w.writerow((i + 1, y_pred[i]))
		print('File written to disk...')
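plot_feature_importance is another project helper the source does not include. A minimal sketch, assuming it bar-plots the fitted model's feature_importances_ against the supplied column names:

## Sketch only: the real plot_feature_importance is not shown in the source.
import numpy as np
import matplotlib.pyplot as plt

def plot_feature_importance(est, names):
	importances = est.feature_importances_
	order = np.argsort(importances)	## least to most important, so the top bar is the strongest feature
	plt.barh(range(len(order)), importances[order])
	plt.yticks(range(len(order)), [names[i] for i in order])
	plt.xlabel('Importance')
	plt.title('Random Forest feature importances')
	plt.tight_layout()
	plt.show()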
Example #3
import csv
import numpy as np

import udf	## user-defined helper module from this project; provides cross_val_score_proba

def logistic_regression(x_train, y_train, x_test, penalty='L2', regularization=1.0, do_CV=False):
	from sklearn.linear_model import LogisticRegression

	### Mean-normalize variables before regression ###
	from sklearn.preprocessing import StandardScaler
	ss = StandardScaler()
	x_train = ss.fit_transform(x_train)
	x_test = ss.transform(x_test)	## reuse the training-set fit; refitting on the test set would scale it inconsistently

	if penalty == 'L1':
		lr = LogisticRegression(penalty='l1', solver='liblinear')	## the default solver does not support l1
		filename = "Lasso_submission.csv"
	else:
		lr = LogisticRegression(penalty='l2')
		filename = "Ridge_submission.csv"
	
	if do_CV:
		Cs = np.logspace(-1.5, 1.5, 10)
		cv_list = list()

		### Fit the model for various choices of C to select the optimal regularization;
		### in scikit-learn, C is the inverse of the regularization strength lambda
		for c in Cs:
			lr.C = c
			print('Running K-fold CV with lambda = %.5f' % (1.0 / c))
			cv_scores = udf.cross_val_score_proba(x_train, y_train, 5, lr)
			cv_list.append(np.mean(cv_scores))

		print('Best lambda based on Cross-Validation...')
		max_score = np.max(cv_list)
		best_C = Cs[cv_list.index(max_score)]
		print(1.0 / best_C, max_score)
	else:
		print('Making prediction with optimal lambda....')
		lr.C = 1.0 / regularization	## convert the user-supplied lambda to scikit-learn's C
		lr.fit(x_train, y_train)
		y_pred = lr.predict_proba(x_test)[:, 1]

		print('Coefficients of the regression:')
		print(lr.coef_)

		print('Writing submission file....')
		with open(filename, 'w', newline='') as testfile:
			w = csv.writer(testfile)
			w.writerow(('Id', 'Probability'))
			for i in range(len(y_pred)):
				w.writerow((i + 1, y_pred[i]))
		print('File written to disk...')
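A quick usage sketch for the classification functions; the arrays and column names below are placeholders invented for illustration, and the GBM example would additionally need a log-scale target and an open/closed flag in column 1 of x_test.

## Hypothetical usage with synthetic data.
import numpy as np

x_train = np.random.rand(500, 4)
y_train = (np.random.rand(500) > 0.5).astype(int)	## binary target for the classifiers
x_test = np.random.rand(100, 4)
names = ['f0', 'f1', 'f2', 'f3']

RFC(x_train, y_train, x_test, do_CV=True)	## grid-search pass: prints CV scores per parameter pair
RFC(x_train, y_train, x_test, udf_trees=200, udf_max_features=2,
	udf_min_samples=50, names=names)	## final fit, importance plot, and submission file
logistic_regression(x_train, y_train, x_test, penalty='L1', regularization=0.5)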