def test_gridsearch_crossval(
    model=SVC(random_state=0),
    return_model=False,
    param_grid=None,
    opt_score=0.9298,
    assertions=True,
    scoring=None,
    verbose=False,
):
    data = load_breast_cancer()

    # Create test and train sets from one dataset
    X_train, X_test, y_train, y_test = train_test_split(
        data["data"],
        data["target"],
        test_size=0.3,
        random_state=0,
        stratify=data["target"],
    )

    # List the parameters to search across
    if param_grid is None:
        param_grid = {
            'C': [1, 10, 100, 120, 150],
            'gamma': [0.001, 0.0001],
            'kernel': ['rbf'],
        }

    # Grid-search all parameter combinations WITHOUT a validation set.
    gs = GridSearch(
        model=model,
        param_grid=param_grid,
    )
    gs.fit(X_train, y_train, scoring=scoring, verbose=False)

    # Compare with default model without hyperopt
    default = SVC(random_state=0)
    default.fit(X_train, y_train)
    default_score = round(default.score(X_test, y_test), 4)
    gs_score = round(gs.score(X_test, y_test), 4)

    if verbose:
        print('Default score:', default_score, '| GridSearch Score:', gs_score)

    if assertions:
        assert gs_score == opt_score

    if return_model:
        return gs
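# Usage sketch: the test can be collected by pytest, or called directly for a
# quick manual check. This assumes the module-level imports it relies on are
# present (load_breast_cancer, train_test_split, SVC, and hypopt's GridSearch).
if __name__ == '__main__':
    # Prints the default-vs-optimized comparison and asserts the expected
    # test-set accuracy (opt_score=0.9298) for the tuned SVC.
    test_gridsearch_crossval(verbose=True)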
class Prediction:
    def __init__(self, data, model, prefix, param_grid=[]):
        self.train_df, self.test_df = data
        self.model = model
        self.param_grid = param_grid
        self.prefix = prefix + datetime.now().strftime('%m-%d-%H:%M')
        self.X = self.train_df.loc[:, self.train_df.columns != 'precio']
        self.y = self.train_df['precio'].values
        self.X_train, self.X_val, self.y_train, self.y_val = train_test_split(
            self.X, self.y, test_size=0.1, random_state=1)

    def manualGridSearch(self):
        best_score = math.inf
        for g in self.param_grid:
            print(g)
            self.model.set_params(**g)
            self.model.fit(self.X_train, self.y_train)
            score = mean_absolute_error(self.model.predict(self.X_val), self.y_val)
            print(score)
            # Keep the grid with the lowest validation MAE seen so far.
            if score < best_score:
                best_score = score
                self.best_score = score
                self.best_grid = g

    def gridSearchTrain(self):
        print('Training...')
        self.gscv = GridSearchCV(self.model, self.param_grid,
                                 scoring='neg_mean_absolute_error', verbose=10)
        self.gscv.fit(self.X_train, self.y_train)
        self.best_params = self.gscv.best_params_
        self.score = self.gscv.best_score_
        self.predicted = self.gscv.predict(self.test_df)
        print(self.best_params)
        print(self.score)

    def HypOptTrain(self):
        print('Training...')
        self.opt = GridSearch(model=self.model, param_grid=self.param_grid)
        self.opt.fit(self.X_train, self.y_train, self.X_val, self.y_val,
                     scoring='neg_mean_squared_error')
        self.best_params = self.opt.best_params_
        self.score = self.opt.score(self.X_val, self.y_val)
        self.predicted = self.opt.predict(self.test_df)
        print(self.best_params)
        print(self.score)

    def train(self):
        print('Training...')
        self.model.fit(self.X_train, self.y_train)
        self.score = mean_absolute_error(self.model.predict(self.X_val), self.y_val)
        print(self.score)
        self.predicted = self.model.predict(self.test_df)

    def crossValidation(self, cv=5):
        cv_scores = cross_val_score(
            self.model,
            self.X,
            self.y,
            cv=cv,
            scoring='neg_mean_absolute_error'
        )
        # Average the cv scores (negative MAE) and print the result.
        self.score = np.mean(cv_scores)
        print(self.score)

    def save(self):
        if self.param_grid == []:
            with open('{}.model'.format(self.prefix), 'wb') as f:
                pickle.dump(self.model, f)
        else:
            with open('{}.model'.format(self.prefix), 'wb') as f:
                pickle.dump(self.gscv, f)

    def submit(self):
        self.test_ids = pd.read_csv('data/test.csv')['id']
        answer = pd.DataFrame(list(zip(self.test_ids, self.predicted)),
                              columns=['id', 'target'])
        answer.to_csv('{}-{}.csv'.format(self.prefix, int(round(self.score))),
                      sep=',', index=False)
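# Usage sketch for the Prediction class. The train.csv path, the
# RandomForestRegressor, and the grid below are illustrative assumptions; only
# 'data/test.csv' and the 'precio' target column come from the class itself.
import pandas as pd
from sklearn.ensemble import RandomForestRegressor

train_df = pd.read_csv('data/train.csv')                     # hypothetical path
test_df = pd.read_csv('data/test.csv').drop(columns=['id'])  # features only

param_grid = {'n_estimators': [100, 200], 'max_depth': [6, 8]}

pred = Prediction(data=(train_df, test_df),
                  model=RandomForestRegressor(random_state=1),
                  prefix='rf-',
                  param_grid=param_grid)
pred.gridSearchTrain()  # GridSearchCV over param_grid, scored by negative MAE
pred.save()             # pickles the fitted search object
pred.submit()           # writes '<prefix>-<score>.csv' with id/target columns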
# print('feature counts: {0}'.format(len(features)))

X_train = vect_val.transform(X_train)
X_val = vect_val.transform(X_val)

print('******** GridSearch ********')
param_grid = {
    'n_estimators': [40, 60, 80, 100, 120],
    'learning_rate': [0.1, 0.15, 0.2],
    'max_depth': [6, 7, 8, 9, 10]
}
scorer = make_scorer(f2)
gs = GridSearch(model=GradientBoostingClassifier())
gs.fit(X_train, y_train, param_grid, X_val, y_val, scoring=scorer)
print('params: ', gs.get_best_params())
print('Test Score for Optimized Parameters:', gs.score(X_val, y_val))

# print('******** GradientBoostingClassifier ********')
# gb = GradientBoostingClassifier(learning_rate=0.1, n_estimators=80, max_depth=7)
# gb, preds_train, preds = train_and_predict(gb, X_train, y_train, X_val)
# print_scores(y_val, preds, y_train, preds_train)
#
# print('******** AdaBoostClassifier ********')
# ada = AdaBoostClassifier()
# ada, preds_train, preds = train_and_predict(ada, X_train, y_train, X_val)
# print_scores(y_val, preds, y_train, preds_train)
#
# print('******** XGBClassifier ********')
# xgb = XGBClassifier()
# xgb, preds_train, preds = train_and_predict(xgb, X_train, y_train, X_val)
# print_scores(y_val, preds, y_train, preds_train)
# Create a validation set.
X_train, X_val, y_train, y_val = train_test_split(
    X_train,
    y_train,
    test_size=0.3,
    random_state=0,
    stratify=y_train,
)

# List the parameters to search across
param_grid = {
    'C': [1, 10, 100, 120, 150],
    'gamma': [0.001, 0.0001],
    'kernel': ['rbf'],
}

# Grid-search all parameter combinations using a validation set.
gs = GridSearch(model=SVC(random_state=0), param_grid=param_grid,
                parallelize=False)

# You can choose the metric to optimize (f1, auc_roc, accuracy, etc.)
# scoring = None will default to optimizing model.score()
_ = gs.fit(X_train, y_train, X_val, y_val, scoring='f1')

# Compare with the default model without hyperopt
default = SVC(random_state=0)
_ = default.fit(X_train, y_train)

print('\nTest score comparison (larger is better):')
print('Non-optimized Parameters:', round(default.score(X_test, y_test), 4))
print('Optimized Parameters:', round(gs.score(X_test, y_test), 4))
default = MLPClassifier(max_iter=50, random_state=0)
default.fit(X_train, y_train)
test_score = round(default.score(X_test, y_test), 4)
val_score = round(default.score(X_val, y_val), 4)
print('\nTEST SCORE (default parameters):', test_score)
print('VALIDATION SCORE (default parameters):', val_score)


# In[5]:

gs_val = GridSearch(model=MLPClassifier(max_iter=50, random_state=0),
                    param_grid=param_grid,
                    parallelize=False)
print("Grid-search using a validation set.\n", "-" * 79)
get_ipython().magic(
    u"time gs_val.fit(X_train, y_train, X_val, y_val, scoring = 'accuracy')")
test_score = round(gs_val.score(X_test, y_test), 4)
val_score = round(gs_val.score(X_val, y_val), 4)
print('\nTEST SCORE (hyper-parameter optimization with validation set):',
      test_score)
print('VALIDATION SCORE (hyper-parameter optimization with validation set):',
      val_score)


# In[6]:

gs_cv = GridSearch(model=MLPClassifier(max_iter=50, random_state=0),
                   param_grid=param_grid,
                   cv_folds=6)
print(
    "\n\nLet's see how long grid-search takes to run when we don't use a validation set."
)
print("Grid-search using cross-validation.\n", "-" * 79)
def test_regression(
    model=SVR(),
    return_model=False,
    param_grid=None,
    gs_score=.4532,
    assertions=True,
    scoring=None,
    verbose=False,
):
    from sklearn.datasets import load_boston
    data = load_boston()

    # Create test and train sets from one dataset
    X_train, X_test, y_train, y_test = train_test_split(
        data["data"],
        data["target"],
        test_size=0.1,
        random_state=0,
    )

    # Create a validation set.
    X_train, X_val, y_train, y_val = train_test_split(
        X_train,
        y_train,
        test_size=0.1,
        random_state=0,
    )

    # List the parameters to search across
    if param_grid is None:
        param_grid = {
            'C': [1, 10, 100, 120, 150],
            'gamma': [0.001, 0.0001],
            'kernel': ['rbf'],
        }

    # Grid-search all parameter combinations using a validation set.
    gs = GridSearch(
        model=model,
        param_grid=param_grid,
    )
    gs.fit(
        X_train,
        y_train,
        X_val,
        y_val,
        scoring=scoring,
        verbose=True,
    )

    # Compare with default model without hyperopt
    default = model
    default.fit(X_train, y_train)
    default_score = round(default.score(X_test, y_test), 4)
    gridsearch_score = round(gs.score(X_test, y_test), 4)

    if verbose:
        print('Default score:', default_score,
              '| GridSearch Score:', gridsearch_score)

    if assertions:
        assert default_score == .0175
        assert gridsearch_score is not None

    if return_model:
        return gs
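# Usage sketch: like the classification test above, this can be collected by
# pytest or invoked directly (assuming the same module-level imports plus SVR).
if __name__ == '__main__':
    # Prints the default-vs-optimized comparison for the SVR regression test.
    test_regression(verbose=True)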
def classifier(classifier, train, truth, validate, validate_truth, test,
               test_truth, datatype):
    np.random.seed(0)
    rng = np.random.permutation(1)[0]
    train = pd.DataFrame(train)
    validate = pd.DataFrame(validate)
    test = pd.DataFrame(test)

    logger = logging.getLogger('myapp')
    hdlr = logging.FileHandler('classifiers.log')
    formatter = logging.Formatter('%(asctime)s %(levelname)s %(message)s')
    hdlr.setFormatter(formatter)
    logger.addHandler(hdlr)
    logger.setLevel(logging.WARN)

    if classifier.lower() == 'svm':  # best: C=50, gamma=0.0001, kernel=rbf
        model = svm.SVC(random_state=rng)
        hyperparameter = {
            'kernel': ('linear', 'rbf'),
            'C': [1, 1.5, 10, 50, 100, 200],
            'gamma': [1e-7, 1e-4]
        }
    elif classifier.lower() == 'randomforest':  # 120
        model = RandomForestClassifier(random_state=rng)
        hyperparameter = {'n_estimators': np.arange(10, 300, 10)}
    elif classifier.lower() == 'adaboost':
        model = AdaBoostClassifier(random_state=rng)
        hyperparameter = {
            'n_estimators': np.arange(10, 300, 10),
            'algorithm': ('SAMME', 'SAMME.R')
        }
    elif classifier.lower() == 'knn':  # 120
        model = KNeighborsClassifier()
        hyperparameter = dict(n_neighbors=list(range(1, 100)))
    else:  # assume a neural network (multi-layer perceptron) was requested
        # best: activation=tanh, hidden_layer_sizes=(20, 20),
        #       learning_rate=adaptive, solver=lbfgs
        model = MLPClassifier(max_iter=100)
        hyperparameter = {
            'hidden_layer_sizes': [(20, 20), (80, 20), (80, 20, 20),
                                   (80, 40, 40, 20),
                                   (40, 40, 20, 20, 20, 10)],
            'learning_rate': ['adaptive'],
            'activation': ['tanh', 'relu', 'logistic'],
            'solver': ['lbfgs', 'sgd', 'adam']
        }

    # Grid-search without an explicit validation set, then log the test score.
    tuned_model = GridSearch(model=model, param_grid=hyperparameter)
    tuned_model.fit(train, truth)
    score = tuned_model.score(test, test_truth)
    logger.warn(classifier + ' ' + datatype + ' validate ' + str(score))

    # Grid-search again, this time selecting parameters on the validation set.
    tuned_model.fit(train, truth, validate, validate_truth)
    score = tuned_model.score(test, test_truth)
    target_names = [
        'c-CS-s', 'c-CS-m', 'c-SC-s', 'c-SC-m', 't-CS-s', 't-CS-m', 't-SC-s',
        't-SC-m'
    ]
    prediction = tuned_model.predict(test)
    print(
        classification_report(test_truth, prediction, target_names=target_names))
    logger.warn(classifier + ' ' + datatype + ' ' + str(score))
    return
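# Usage sketch: the array names below are placeholders for pre-split feature
# matrices and label vectors, and 'normalized' is just an illustrative tag
# that ends up in the log lines.
classifier('svm', X_train, y_train, X_val, y_val, X_test, y_test,
           datatype='normalized')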