# Load training features/labels and the held-out test features.
(X, Y, Xt) = boilerplate.loadData()

# Exhaustive grid search over gradient-boosting hyper-parameters.
clf = GradientBoostingClassifier()
params = {
    "loss": ['deviance'],  # alternative grid considered: ['deviance', 'exponential']
    "n_estimators": list(range(180, 300, 20)),   # 180, 200, ..., 280
    "max_depth": list(range(12, 36, 6)),         # 12, 18, 24, 30
    "max_features": list(range(15, 75, 15)),     # 15, 30, 45, 60
    "min_samples_split": [1],
    "min_samples_leaf": [3, 10],
    "verbose": [1],
}
search = GridSearchCV(clf, param_grid=params, n_jobs=-1)
search.fit(X, Y)

# Re-score the winning estimator with 5-fold CV, then write test predictions.
best = search.best_estimator_
scores = cross_validation.cross_val_score(best, X, Y, cv=5)
boilerplate.writeData('predictionsMGS_GB2.csv', best, Xt)

# Append a summary of this gradient-boost grid-search run to the shared log.
with open('Log.txt', 'a') as f:
	f.write('GridSearch_GradientBoost run at: ' + time.strftime("%H:%M:%S") + '\n')
	f.write('Status of GridSearch_GradientBoost:\n')
	f.write('Best GradientBoost in search has score:\n')
	f.write(str(sum(scores) / len(scores)) + '\n')  # mean 5-fold CV score
	f.write('The parameters of the best estimator:\n')
	# Bug fix: get_params is a method — without the call parentheses the
	# bound-method repr was logged instead of the parameter dict.
	f.write(str(best.get_params()) + '\n')
	f.write('Done. Time taken (seconds):\n')
	f.write(str(time.time() - start) + '\n')  # `start` is set earlier in the script

# Report the winner's cross-validated score on stdout.
mean_score = sum(scores) / len(scores)
print('Best Gradient Boost Classifier in search has score:')
print(mean_score)
print('The parameters of the best estimator:')
				use_idf=True, smooth_idf=False, sublinear_tf=True)
# NOTE(review): the statement opening the call closed above (presumably
# `trf = TfidfTransformer(` or similar) is missing from this chunk —
# confirm against the original file before running.
X = trf.fit_transform(X)  # fit the transformer on training features and re-weight them
Xt = trf.transform(Xt)  # apply the already-fitted transform to the test features

# Grid search over SVC hyper-parameters on the transformed features.
clf = SVC()
c_grid = [0.1 * np.e ** k for k in range(10)]  # log-spaced: 0.1 * e^k, k = 0..9
params = {
    "C": c_grid,
    "kernel": ['rbf', 'linear', 'poly', 'sigmoid'],
    "shrinking": [True, False],
    "tol": [1e-3, 1e-4, 1e-5, 1e-6],
    "verbose": [True],
}

search = GridSearchCV(clf, param_grid=params, n_jobs=-1)
search.fit(X, Y)

# 5-fold CV score of the best estimator, then write test-set predictions.
best = search.best_estimator_
scores = cross_validation.cross_val_score(best, X, Y, cv=5)
boilerplate.writeData('predictionsGS_SVM3.csv', best, Xt)

# Append a summary of the SVC grid-search run to the shared log.
with open('Log.txt', 'a') as f:
	f.write('--------------------------------------\n')
	f.write('GridSearch_TFIDF_SVC run at: ' + time.strftime("%H:%M:%S") + '\n')
	f.write('Status of GridSearch_TFIDF_SVC:\n')
	f.write('Best SVC in search has score:\n')
	f.write(str(sum(scores) / len(scores)) + '\n')  # mean 5-fold CV score
	f.write('The parameters of the best estimator:\n')
	# Bug fix: call get_params() — the bare attribute logged a bound-method repr.
	f.write(str(best.get_params()) + '\n')
	f.write('Done. Time taken (seconds):\n')
	f.write(str(time.time() - start) + '\n')  # `start` is set earlier in the script

# Consistency fix: use print() (as the earlier reporting blocks in this file
# do) instead of the Python-2-only print statement; output is unchanged.
print('Best SVC in search has score:')
print(sum(scores) / len(scores))
# Load training features/labels and the held-out test features.
(X, Y, Xt) = boilerplate.loadData()

# Bug fix: predictions were previously written to "LogSGD.txt" — the very
# file the log block below appends to — so the log got interleaved with
# prediction data. Write predictions to their own CSV like every other run.
filename = "predictionsGS_SGD1.csv"

# Grid search over an elastic-net SGD classifier (logistic loss).
clf = SGDClassifier()
params = {"loss": ["log"],
          "penalty": ["elasticnet"],
          "l1_ratio": [0.005 * i for i in range(5)],    # 0.0 .. 0.02
          "alpha": [0.002 * i for i in range(1, 11)],   # 0.002 .. 0.02
          "n_iter": [50 * i for i in range(1, 9)]}      # 50 .. 400

search = GridSearchCV(clf, param_grid=params, n_jobs=-1)
search.fit(X, Y)

# Re-score the winner with 5-fold CV, then write test-set predictions.
best = search.best_estimator_
scores = cross_validation.cross_val_score(best, X, Y, cv=5)
boilerplate.writeData(filename, best, Xt)

# Append a summary of the SGD grid-search run to its dedicated log file.
with open('LogSGD.txt', 'a') as f:
  f.write('--------------------------------------\n')
  f.write('GridSearch_SGD run at: ' + time.strftime("%H:%M:%S") + '\n')
  f.write('Status of GridSearch_SGD:\n')
  f.write('Saved to: ' + filename + '\n')
  f.write('Best SGD in search has score:\n')
  f.write(str(sum(scores) / len(scores)) + '\n')  # mean 5-fold CV score
  f.write('The parameters of the best estimator:\n')
  # Bug fix: call get_params() — the bare attribute logged a bound-method repr.
  f.write(str(best.get_params()) + '\n')
  f.write('Done. Time taken (seconds):\n')
  f.write(str(time.time() - start) + '\n')  # `start` is set earlier in the script

print 'Best SGD in search has score:'
			  "min_samples_leaf": sp_randint(1, 11),
			  "bootstrap": [True, False],
			  "criterion": ["gini"],
			  "verbose": [1],
			  "warm_start": [True, False]}
# NOTE(review): the opening of the dict closed above (presumably
# `param_dist = {` with leading random-forest entries) is missing from this
# chunk — confirm against the original file before running.

# Randomised hyper-parameter search over `param_dist`, followed by a 5-fold
# cross-validation of the winner and a prediction dump.
n_iters = 3
rnd_search = RandomizedSearchCV(clf, param_distributions=param_dist, cv=5,
                                n_iter=n_iters, n_jobs=-1, verbose=1)
rnd_search.fit(X, Y)
report(rnd_search.grid_scores_)

best = rnd_search.best_estimator_
scores = cross_validation.cross_val_score(best, X, Y, cv=5)
boilerplate.writeData('predictionsRGS_RF1.csv', best, Xt)

# Append a summary of the randomised random-forest search to the shared log.
with open('Log.txt', 'a') as f:
	f.write('--------------------------------------\n')
	f.write('RandomGridSearch_RandomForest run at: ' + time.strftime("%H:%M:%S") + '\n')
	f.write('Status of RandomGridSearch_RandomForest:\n')
	f.write('Best Random Forest in search has score:\n')
	f.write(str(sum(scores) / len(scores)) + '\n')  # mean 5-fold CV score
	f.write('The parameters of the best estimator:\n')
	# Bug fix: call get_params() — the bare attribute logged a bound-method repr.
	f.write(str(best.get_params()) + '\n')
	f.write('Done. Time taken (seconds):\n')
	f.write(str(time.time() - start) + '\n')  # `start` is set earlier in the script

# Consistency fix: print() form (matches the earlier reporting blocks);
# output is unchanged.
print('Best Random Forest in search has score:')
print(sum(scores) / len(scores))
			  "warm_start": [True, False]}
# NOTE(review): the opening of the dict closed above (presumably
# `params = {` with its leading entries) is missing from this chunk —
# confirm against the original file before running.
# The triple-quoted string below is a commented-out alternative parameter
# grid kept for reference; it is never assigned and has no runtime effect.
'''
params = {"n_estimators": [140, 160, 180, 200, 220],
			  "max_depth": [30, 35, 40, 55, 60],
			  "max_features": [60, 70, 80],
			  "min_samples_split": [1, 2, 4, 8, 16],
			  "min_samples_leaf": [1, 2, 4, 8, 16],
			  "bootstrap": [True],
			  "criterion": ["gini"],
			  "verbose": [1]}
'''
# Exhaustive grid search over `params` for the classifier built above.
grid = GridSearchCV(clf, param_grid=params, n_jobs=-1)
grid.fit(X, Y)

# Keep the best estimator, re-score it with 5-fold CV, and write predictions.
best = grid.best_estimator_
scores = cross_validation.cross_val_score(best, X, Y, cv=5)
boilerplate.writeData('predictionsMGS_TFIDF_RF3.csv', best, Xt)

# Append a summary of the random-forest grid-search run to the shared log.
with open('Log.txt', 'a') as f:
	f.write('--------------------------------------\n')
	f.write('GridSearch_RandomForest run at: ' + time.strftime("%H:%M:%S") + '\n')
	f.write('Status of GridSearch_RandomForest:\n')
	f.write('Best Random Forest in search has score:\n')
	f.write(str(sum(scores) / len(scores)) + '\n')  # mean 5-fold CV score
	f.write('The parameters of the best estimator:\n')
	# Bug fix: call get_params() — the bare attribute logged a bound-method repr.
	f.write(str(best.get_params()) + '\n')
	f.write('Done. Time taken (seconds):\n')
	f.write(str(time.time() - start) + '\n')  # `start` is set earlier in the script

# Consistency fix: print() form (matches the earlier reporting blocks);
# output is unchanged.
print('Best Random Forest in search has score:')
print(sum(scores) / len(scores))