def training(matrix, Y, SVM):
    """
    matrix: the training data
    Y: the labels, as an array
    SVM: a boolean; if True an SVC is trained, otherwise an AdaBoostClassifier
    return: cross-validation scores
    """
    if SVM:
        classifier = svm.SVC()
    else:
        classifier = AdaBoostClassifier(n_estimators=300)

    precision_micro_scorer = metrics.make_scorer(custom_precision_micro_score)
    precision_macro_scorer = metrics.make_scorer(custom_precision_macro_score)
    recall_micro_scorer = metrics.make_scorer(custom_recall_micro_score)
    recall_macro_scorer = metrics.make_scorer(custom_recall_macro_score)

    precision_micro = cross_val_score(classifier, matrix, Y, cv=10, scoring=precision_micro_scorer)
    precision_macro = cross_val_score(classifier, matrix, Y, cv=10, scoring=precision_macro_scorer)
    recall_micro = cross_val_score(classifier, matrix, Y, cv=10, scoring=recall_micro_scorer)
    recall_macro = cross_val_score(classifier, matrix, Y, cv=10, scoring=recall_macro_scorer)

    return {"micro": (precision_micro, recall_micro),
            "macro": (precision_macro, recall_macro)}
def __init__(self, n_features=None, tuning_ranges=None, models=None, cv=None, njobs=1,
             pre_dispatch='2*n_jobs', stack=True, verbose=False, metric='lad'):
    if metric.lower() not in ['lad', 'mse']:
        raise ValueError('Metric must be either lad or mse.')
    if tuning_ranges is None:
        if n_features is None:
            raise ValueError('Must supply one of n_features or tuning_ranges.')
        # use default values for grid search over tuning parameters for all models
        tuning_ranges = {'DecisionTreeRegressor': {'max_depth': [5, 10, 20, 50, None]},
                         'RandomForestRegressor':
                             {'max_features': list(np.unique(np.linspace(2, n_features, 5).astype(int)))},
                         'GbrAutoNtrees': {'max_depth': [1, 2, 3, 5, 10]}}
    if models is None:
        # initialize the list of sklearn objects corresponding to different statistical models
        models = []
        if 'DecisionTreeRegressor' in tuning_ranges:
            models.append(DecisionTreeRegressor())
        if 'RandomForestRegressor' in tuning_ranges:
            models.append(RandomForestRegressor(n_estimators=500, oob_score=True, n_jobs=njobs))
        if 'GbrAutoNtrees' in tuning_ranges:
            models.append(GbrAutoNtrees(subsample=0.75, n_estimators=500, learning_rate=0.01))

    super(RegressionSuite, self).__init__(tuning_ranges, models, cv, njobs, pre_dispatch,
                                          stack, verbose)
    self.nfeatures = n_features
    self.metric = metric.lower()
    if self.metric == 'lad':
        self.scorer = make_scorer(mean_absolute_error, greater_is_better=False)
    elif self.metric == 'mse':
        self.scorer = make_scorer(mean_squared_error, greater_is_better=False)
def _make_scoring_r0(scoring):
    if scoring == 'r2':
        return metrics.make_scorer(metrics.r2_score)
    elif scoring == 'mean_absolute_error':
        return metrics.make_scorer(metrics.mean_absolute_error, greater_is_better=False)
    elif scoring == 'mean_squared_error':
        return metrics.make_scorer(metrics.mean_squared_error, greater_is_better=False)
    elif scoring == 'median_absolute_error':
        return metrics.make_scorer(metrics.median_absolute_error, greater_is_better=False)
    else:
        raise ValueError("Unsupported scoring: %s" % scoring)
def _test_ridge_loo(filter_):
    # test that can work with both dense or sparse matrices
    n_samples = X_diabetes.shape[0]

    ret = []

    fit_intercept = filter_ == DENSE_FILTER
    ridge_gcv = _RidgeGCV(fit_intercept=fit_intercept)

    # check best alpha
    ridge_gcv.fit(filter_(X_diabetes), y_diabetes)
    alpha_ = ridge_gcv.alpha_
    ret.append(alpha_)

    # check that we get same best alpha with custom loss_func
    f = ignore_warnings
    scoring = make_scorer(mean_squared_error, greater_is_better=False)
    ridge_gcv2 = RidgeCV(fit_intercept=False, scoring=scoring)
    f(ridge_gcv2.fit)(filter_(X_diabetes), y_diabetes)
    assert ridge_gcv2.alpha_ == pytest.approx(alpha_)

    # check that we get same best alpha with custom score_func
    func = lambda x, y: -mean_squared_error(x, y)
    scoring = make_scorer(func)
    ridge_gcv3 = RidgeCV(fit_intercept=False, scoring=scoring)
    f(ridge_gcv3.fit)(filter_(X_diabetes), y_diabetes)
    assert ridge_gcv3.alpha_ == pytest.approx(alpha_)

    # check that we get same best alpha with a scorer
    scorer = get_scorer('neg_mean_squared_error')
    ridge_gcv4 = RidgeCV(fit_intercept=False, scoring=scorer)
    ridge_gcv4.fit(filter_(X_diabetes), y_diabetes)
    assert ridge_gcv4.alpha_ == pytest.approx(alpha_)

    # check that we get same best alpha with sample weights
    if filter_ == DENSE_FILTER:
        ridge_gcv.fit(filter_(X_diabetes), y_diabetes, sample_weight=np.ones(n_samples))
        assert ridge_gcv.alpha_ == pytest.approx(alpha_)

    # simulate several responses
    Y = np.vstack((y_diabetes, y_diabetes)).T

    ridge_gcv.fit(filter_(X_diabetes), Y)
    Y_pred = ridge_gcv.predict(filter_(X_diabetes))
    ridge_gcv.fit(filter_(X_diabetes), y_diabetes)
    y_pred = ridge_gcv.predict(filter_(X_diabetes))

    assert_allclose(np.vstack((y_pred, y_pred)).T, Y_pred, rtol=1e-5)

    return ret
def test_permutation_score():
    iris = load_iris()
    X = iris.data
    X_sparse = coo_matrix(X)
    y = iris.target
    svm = SVC(kernel='linear')
    cv = cval.StratifiedKFold(y, 2)

    score, scores, pvalue = cval.permutation_test_score(
        svm, X, y, cv=cv, scoring="accuracy")
    assert_greater(score, 0.9)
    assert_almost_equal(pvalue, 0.0, 1)

    score_label, _, pvalue_label = cval.permutation_test_score(
        svm, X, y, cv=cv, scoring="accuracy", labels=np.ones(y.size),
        random_state=0)
    assert_true(score_label == score)
    assert_true(pvalue_label == pvalue)

    # test with custom scoring object
    scorer = make_scorer(fbeta_score, beta=2)
    score_label, _, pvalue_label = cval.permutation_test_score(
        svm, X, y, scoring=scorer, cv=cv, labels=np.ones(y.size),
        random_state=0)
    assert_almost_equal(score_label, .97, 2)
    assert_almost_equal(pvalue_label, 0.01, 3)

    # check that we obtain the same results with a sparse representation
    svm_sparse = SVC(kernel='linear')
    cv_sparse = cval.StratifiedKFold(y, 2)
    score_label, _, pvalue_label = cval.permutation_test_score(
        svm_sparse, X_sparse, y, cv=cv_sparse,
        scoring="accuracy", labels=np.ones(y.size), random_state=0)
    assert_true(score_label == score)
    assert_true(pvalue_label == pvalue)

    # set random y
    y = np.mod(np.arange(len(y)), 3)

    score, scores, pvalue = cval.permutation_test_score(svm, X, y, cv=cv,
                                                        scoring="accuracy")
    assert_less(score, 0.5)
    assert_greater(pvalue, 0.2)

    # test with deprecated interface
    with warnings.catch_warnings(record=True):
        score, scores, pvalue = cval.permutation_test_score(
            svm, X, y, scoring=make_scorer(accuracy_score), cv=cv)
    assert_less(score, 0.5)
    assert_greater(pvalue, 0.2)
def test_cross_val_score_multilabel():
    X = np.array([[-3, 4], [2, 4], [3, 3], [0, 2], [-3, 1],
                  [-2, 1], [0, 0], [-2, -1], [-1, -2], [1, -2]])
    y = np.array([[1, 1], [0, 1], [0, 1], [0, 1], [1, 1],
                  [0, 1], [1, 0], [1, 1], [1, 0], [0, 0]])
    clf = KNeighborsClassifier(n_neighbors=1)
    scoring_micro = make_scorer(precision_score, average="micro")
    scoring_macro = make_scorer(precision_score, average="macro")
    scoring_samples = make_scorer(precision_score, average="samples")
    score_micro = cval.cross_val_score(clf, X, y, scoring=scoring_micro, cv=5)
    score_macro = cval.cross_val_score(clf, X, y, scoring=scoring_macro, cv=5)
    score_samples = cval.cross_val_score(clf, X, y, scoring=scoring_samples, cv=5)
    assert_almost_equal(score_micro, [1, 1 / 2, 3 / 4, 1 / 2, 1 / 3])
    assert_almost_equal(score_macro, [1, 1 / 2, 3 / 4, 1 / 2, 1 / 4])
    assert_almost_equal(score_samples, [1, 1 / 2, 3 / 4, 1 / 2, 1 / 4])
def make_scoring(scoring):
    """Return a scorer for the given metric name.

    Error metrics are wrapped with greater_is_better=False, so the scorer
    reports the negated value and higher scores are always better.
    """
    if scoring == 'r2':
        return metrics.make_scorer(metrics.r2_score)
    elif scoring == 'mean_absolute_error':
        return metrics.make_scorer(metrics.mean_absolute_error,
                                   greater_is_better=False)
    elif scoring == 'mean_squared_error':
        return metrics.make_scorer(metrics.mean_squared_error,
                                   greater_is_better=False)
    elif scoring == 'median_absolute_error':
        return metrics.make_scorer(metrics.median_absolute_error,
                                   greater_is_better=False)
    else:
        raise ValueError("Unsupported scoring: %s" % scoring)
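# --- Added illustration (not part of the snippet above) ---
# A minimal sketch of the sign convention that make_scoring relies on: with
# greater_is_better=False, the scorer returned by make_scorer reports the
# negated metric, so "larger is better" still holds during model selection.
# Assumes scikit-learn is installed; DummyRegressor is used only for brevity.
import numpy as np
from sklearn import metrics
from sklearn.dummy import DummyRegressor

X = np.arange(10).reshape(-1, 1)
y = np.arange(10, dtype=float)
model = DummyRegressor(strategy="mean").fit(X, y)

mae_scorer = metrics.make_scorer(metrics.mean_absolute_error, greater_is_better=False)
print(metrics.mean_absolute_error(y, model.predict(X)))  # raw error, positive
print(mae_scorer(model, X, y))                           # same magnitude, negative sign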
def test_grid_search_sparse_scoring():
    X_, y_ = make_classification(n_samples=200, n_features=100, random_state=0)

    clf = LinearSVC()
    cv = GridSearchCV(clf, {'C': [0.1, 1.0]}, scoring="f1")
    cv.fit(X_[:180], y_[:180])
    y_pred = cv.predict(X_[180:])
    C = cv.best_estimator_.C

    X_ = sp.csr_matrix(X_)
    clf = LinearSVC()
    cv = GridSearchCV(clf, {'C': [0.1, 1.0]}, scoring="f1")
    cv.fit(X_[:180], y_[:180])
    y_pred2 = cv.predict(X_[180:])
    C2 = cv.best_estimator_.C

    assert_array_equal(y_pred, y_pred2)
    assert_equal(C, C2)
    # Smoke test the score
    # np.testing.assert_allclose(f1_score(cv.predict(X_[:180]), y[:180]),
    #                            cv.score(X_[:180], y[:180]))

    # test loss where greater is worse
    def f1_loss(y_true_, y_pred_):
        return -f1_score(y_true_, y_pred_)

    F1Loss = make_scorer(f1_loss, greater_is_better=False)
    cv = GridSearchCV(clf, {'C': [0.1, 1.0]}, scoring=F1Loss)
    cv.fit(X_[:180], y_[:180])
    y_pred3 = cv.predict(X_[180:])
    C3 = cv.best_estimator_.C
    assert_equal(C, C3)
    assert_array_equal(y_pred, y_pred3)
def buildCoordinationTreeRegressor(predictorColumns, element, coordinationDir='coordination/', md=None):
    """
    Build a coordination predictor for a given element from compositional structure data
    of structures containing that element. Will return a model trained on all data,
    a mean_absolute_error score, and a table of true vs. predicted values.
    """
    try:
        df = pd.read_csv(coordinationDir + element + '.csv')
    except Exception:
        print 'No data for ' + element
        return None, None, None
    df = df.dropna()
    if 'fracNobleGas' in df.columns:
        df = df[df['fracNobleGas'] <= 0]
    if len(df) < 4:
        print 'Not enough data for ' + element
        return None, None, None

    s = StandardScaler()
    X = s.fit_transform(df[predictorColumns].astype('float64'))
    y = df['avgCoordination'].values

    rfr = RandomForestRegressor(max_depth=md)
    acc = mean(cross_val_score(rfr, X, y, scoring=make_scorer(mean_absolute_error)))

    X_train, X_test, y_train, y_test = train_test_split(X, y)
    rfr.fit(X_train, y_train)
    y_predict = rfr.predict(X_test)
    t = pd.DataFrame({'True': y_test, 'Predicted': y_predict})

    rfr.fit(X, y)

    return rfr, t, round(acc, 2)
def run_gridsearch(clf, parameters, X_train, y_train, X_test, y_test):
    """Cross-validated optimised parameter search."""
    start = time.time()

    # Scorer object
    scorer = make_scorer(accuracy_score, greater_is_better=True)

    # Grid search
    tuned_clf = GridSearchCV(clf, parameters, scoring=scorer)

    print "Final Model: "
    tuned_clf.fit(X_train, y_train)
    print "Best Parameters: {:}".format(tuned_clf.best_params_)

    # Calculate accuracy for the tuned classifier
    est = tuned_clf.best_estimator_
    tuned_pred = est.predict(X_test)
    print "accuracy score for tuned classifier: {:.3f}".format(accuracy_score(y_test, tuned_pred))
    print "Training set: {} samples".format(X_train.shape[0])
    print "Test set: {} samples".format(X_test.shape[0])

    end = time.time()
    print "Grid search time (secs): {:.3f}".format(end - start)
def main(argv):
    pd.set_option('display.width', 200)
    pd.set_option('display.height', 500)
    warnings.filterwarnings("ignore")

    global file_path, RMSLE_scorer

    # RMSLE scorer
    RMSLE_scorer = metrics.make_scorer(RMSLE, greater_is_better=False)

    if platform.system() == "Windows":
        file_path = 'C:/Python/Others/data/Kaggle/Caterpillar_Tube_Pricing/'
    else:
        file_path = '/home/roshan/Desktop/DS/Others/data/Kaggle/Caterpillar_Tube_Pricing/'

    ####################################################################################
    # Read the input file, munging and splitting the data to train and test
    ####################################################################################
    Train_DS    = pd.read_csv(file_path + 'competition_data/train_set.csv', sep=',')
    Actual_DS   = pd.read_csv(file_path + 'competition_data/test_set.csv', sep=',')
    Tube_DS     = pd.read_csv(file_path + 'competition_data/tube.csv', sep=',')
    Bill_DS     = pd.read_csv(file_path + 'competition_data/bill_of_materials.csv', sep=',')
    Spec_DS     = pd.read_csv(file_path + 'competition_data/specs.csv', sep=',')
    Tube_End_DS = pd.read_csv(file_path + 'competition_data/tube_end_form.csv', sep=',')
    Comp_DS     = pd.read_csv(file_path + 'competition_data/components_2.csv', sep=',')
    Sample_DS   = pd.read_csv(file_path + 'sample_submission.csv', sep=',')

    Train_DS, Actual_DS, y = Data_Munging(Train_DS, Actual_DS, Tube_DS, Bill_DS, Spec_DS,
                                          Tube_End_DS, Comp_DS)

    pred_Actual = RFR_Regressor(Train_DS, y, Actual_DS, Sample_DS, grid=False)
def RunExp(StrModel: str, Param: str, FeaUsed: list, DataPath: str, Label: str,
           StrMeasure: str, std: bool = False, N: int = 0):
    Data = np.genfromtxt(DataPath + Label, delimiter=',', dtype=int)
    Data = Data[:, np.newaxis]
    for f in FeaUsed:
        T = np.genfromtxt(DataPath + Features[f], delimiter=',', dtype=float)
        if len(T.shape) < 2:
            T = T[:, np.newaxis]
        Data = np.concatenate((Data, T), axis=1)

    if N > 0:
        Data = Data[:N, :]

    Lbl = Data[:, 0]
    Fea = Data[:, 1:]

    if std:
        scaler = preprocessing.StandardScaler()
        Fea = scaler.fit_transform(Fea)

    Model = base.clone(Models[StrModel])
    SetParam(Model, Param)

    Model.fit(Fea, Lbl)
    Pred = Model.predict(Fea)
    st = Measures[StrMeasure](Lbl, Pred)

    sv = cross_validation.cross_val_score(base.clone(Models[StrModel]), Fea, Lbl,
                                          scoring=metrics.make_scorer(Measures[StrMeasure]),
                                          cv=5, n_jobs=5)

    return st, np.mean(sv)
def buildTreeRegressor(predictorColumns, structurestable='structures.csv', targetcolumn='c_a', md=None):
    """
    Build a random forest regressor model to predict some structure feature from
    compositional data. Will return the model trained on all data, a
    mean_absolute_error score, and a table of true vs. predicted values.
    """
    df = pd.read_csv(structurestable)
    df = df.dropna()
    if 'fracNobleGas' in df.columns:
        df = df[df['fracNobleGas'] <= 0]

    s = StandardScaler()
    X = s.fit_transform(df[predictorColumns].astype('float64'))
    y = df[targetcolumn].values

    rfr = RandomForestRegressor(max_depth=md)
    acc = mean(cross_val_score(rfr, X, y, scoring=make_scorer(mean_absolute_error)))

    X_train, X_test, y_train, y_test = train_test_split(X, y)
    rfr.fit(X_train, y_train)
    y_predict = rfr.predict(X_test)
    t = pd.DataFrame({'True': y_test, 'Predicted': y_predict})

    rfr.fit(X, y)

    return rfr, t, round(acc, 2)
def fit_predict_model(city_data):
    """Find and tune the optimal model. Make a prediction on housing data."""

    # Get the features and labels from the Boston housing data
    X, y = city_data.data, city_data.target

    # Setup a Decision Tree Regressor
    regressor = DecisionTreeRegressor()

    parameters = {'max_depth': (1, 2, 3, 4, 5, 6, 7, 8, 9, 10)}
    scorer = metrics.make_scorer(performance_metric, greater_is_better=False)

    # 1. Find an appropriate performance metric. This should be the same as the
    #    one used in your performance_metric procedure above:
    #    http://scikit-learn.org/stable/modules/generated/sklearn.metrics.make_scorer.html

    # 2. We will use grid search to fine tune the Decision Tree Regressor and
    #    obtain the parameters that generate the best training performance. Set up
    #    the grid search object here.
    #    http://scikit-learn.org/stable/modules/generated/sklearn.grid_search.GridSearchCV.html#sklearn.grid_search.GridSearchCV
    reg = grid_search.GridSearchCV(regressor, parameters, scoring=scorer, verbose=True)

    # Fit the learner to the training data to obtain the best parameter set
    print "Final Model: "
    print reg.fit(X, y)

    # Use the model to predict the output of a particular sample
    x = [11.95, 0.00, 18.100, 0, 0.6590, 5.6090, 90.00, 1.385, 24, 680.0, 20.20, 332.09, 12.13]
    y = reg.predict(x)
    print "House: " + str(x)
    print "Prediction: " + str(y)
def main():
    # Load the data. A pandas DataFrame is returned; only the training data is selected.
    data = load_data()[0]
    selected_columns = ["baselineDAS", "Age", "Gender"]
    y = np.array(data["Response.deltaDAS"])
    x = np.array(data[selected_columns])

    # Build a dictionary of methods and another one with the grid of parameters
    methods = {
        'sumSVR': sumSVR
    }

    params_grid = {
        'sumSVR': {
            'dim': [3],
            'C': np.arange(0.1, 2, 0.4),
            'epsilon': np.arange(0.01, 0.1, 0.02),
            # 'degree': np.arange(1, 10),
            'kernel_functions': [[cosine_similarity, cosine_similarity, DiracKernel]],
            'w': list(product(range(1, 6), repeat=3))
        }
    }

    # Build and run the comparison. tr_scoring has to be constructed as shown here.
    comp = MethodComparison(methods, params_grid=params_grid)
    scores = comp.process(x, y, scorer, repeats=10, train_cv=3,
                          tr_scoring=make_scorer(scorer, greater_is_better=True),
                          n_jobs=8)

    return scores
def svm_grid_search():
    # get data
    training_input, training_target, validation_input, validation_target = prepare_input()

    # set up scorer for grid search. log-loss is an error, not a score, so set
    # greater_is_better to False; log-loss also requires predicted probabilities
    log_loss_scorer = make_scorer(log_loss, greater_is_better=False, needs_proba=True)

    training_input = training_input[:100000]
    training_target = training_target[:100000]

    print training_input.shape[0]
    print training_target.shape[0]

    start = time.time()
    svm = SVC(random_state=31, probability=True)
    svm_parameters = {'C': [.001, .01, .1, 1, 10, 100], 'kernel': ["rbf", "sigmoid"]}
    svm_grid_obj = GridSearchCV(svm, svm_parameters, log_loss_scorer, verbose=2, n_jobs=-1)
    svm_grid_obj = svm_grid_obj.fit(training_input, training_target)
    svm = svm_grid_obj.best_estimator_
    print "Best params: " + str(svm_grid_obj.best_params_)

    svm_train_error = log_loss(training_target, svm.predict_proba(training_input))
    svm_validation_error = log_loss(validation_target, svm.predict_proba(validation_input))

    print "Best SVM training error: {:02.4f}".format(svm_train_error)
    print "Best SVM validation error: {:02.4f}".format(svm_validation_error)

    end = time.time()
    print "SVM grid search took {:02.4f} seconds".format(end - start)

    return svm
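# --- Added illustration (not part of the snippet above) ---
# A minimal sketch of the needs_proba=True pattern used above: the scorer feeds
# predicted class probabilities, not hard labels, to log_loss, and negates the
# result because greater_is_better=False. Assumes a scikit-learn version where
# make_scorer still accepts needs_proba (newer releases use
# response_method="predict_proba" instead).
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss, make_scorer

X_demo, y_demo = make_classification(n_samples=200, random_state=0)
clf_demo = LogisticRegression(max_iter=1000).fit(X_demo, y_demo)

demo_scorer = make_scorer(log_loss, greater_is_better=False, needs_proba=True)
print(log_loss(y_demo, clf_demo.predict_proba(X_demo)))  # positive loss
print(demo_scorer(clf_demo, X_demo, y_demo))             # same value, negative sign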
def fit_predict_model(city_data):
    """Find and tune the optimal model. Make a prediction on housing data."""

    # Get the features and labels from the Boston housing data
    X, y = city_data.data, city_data.target

    # Setup a Decision Tree Regressor
    regressor = DecisionTreeRegressor()

    parameters = {'max_depth': (1, 2, 3, 4, 5, 6, 7, 8, 9, 10)}

    # 1. Find the best performance metric; it should be the same as the one used in
    #    the performance_metric procedure.
    #    http://scikit-learn.org/stable/modules/generated/sklearn.metrics.make_scorer.html
    # MSE is an error metric, so flip the sign with greater_is_better=False;
    # otherwise the grid search would pick the model with the largest MSE.
    MSE_scorer = make_scorer(mean_squared_error, greater_is_better=False)

    # 2. Use grid search to fine tune the Decision Tree Regressor and find the best model
    #    http://scikit-learn.org/stable/modules/generated/sklearn.grid_search.GridSearchCV.html#sklearn.grid_search.GridSearchCV
    reg = grid_search.GridSearchCV(regressor, parameters, MSE_scorer)

    # Fit the learner to the training data
    print "Final Model: "
    print reg.fit(X, y)

    # Use the model to predict the output of a particular sample
    x = [11.95, 0.00, 18.100, 0, 0.6590, 5.6090, 90.00, 1.385, 24, 680.0, 20.20, 332.09, 12.13]
    y = reg.predict(x)
    print "House: " + str(x)
    print "Prediction: " + str(y)
def fit_predict_model(city_data):
    """Find and tune the optimal model. Make a prediction on housing data."""

    # Get the features and labels from the Boston housing data
    # and shuffle them randomly
    X, y = shuffle(city_data.data, city_data.target)

    # Setup a Decision Tree Regressor
    regressor = DecisionTreeRegressor()

    parameters = {"max_depth": (1, 2, 3, 4, 5, 6, 7, 8, 9, 10)}

    # 1. Find the best performance metric
    #    should be the same as your performance_metric procedure
    #    http://scikit-learn.org/stable/modules/generated/sklearn.metrics.make_scorer.html
    scorer = make_scorer(METRIC, greater_is_better=False)

    # 2. Use grid search to fine tune the Decision Tree Regressor and find the best model
    #    http://scikit-learn.org/stable/modules/generated/sklearn.grid_search.GridSearchCV.html#sklearn.grid_search.GridSearchCV
    reg = grid_search.GridSearchCV(regressor, parameters, scoring=scorer, cv=10)

    # Fit the learner to the training data
    print "Final Model: "
    print reg.fit(X, y)

    print "Best model parameter: " + str(reg.best_params_)
    print "Best model estimator: " + str(reg.best_estimator_)

    # Use the model to predict the output of a particular sample
    x = [11.95, 0.00, 18.100, 0, 0.6590, 5.6090, 90.00, 1.385, 24, 680.0, 20.20, 332.09, 12.13]
    y = reg.predict(x)
    print "House: " + str(x)
    print "Prediction: " + str(y)

    return (reg.best_params_["max_depth"], y)
def rf_grid_search():
    train_inp, valid_inp, train_target, valid_target = prepare_input()

    # set up scorer for grid search. log-loss is an error, not a score, so set
    # greater_is_better to False; log-loss also requires predicted probabilities
    log_loss_scorer = make_scorer(log_loss, greater_is_better=False, needs_proba=True)

    train_inp = train_inp[:100000]
    train_target = train_target[:100000]

    start = time.time()
    random_forest = RandomForestClassifier(random_state=31)

    # r_forest_parameters = {'n_estimators': [120, 300, 500, 800, 1200],
    #                        'max_depth': [5, 8, 15, 25, 30, None],
    #                        'max_features': ['log2', 'sqrt', None],
    #                        'min_samples_split': [1, 2, 5, 10, 15, 100],
    #                        'min_samples_leaf': [1, 2, 5, 10]}
    # 75.1 minutes to run with these parameters - 72 fits
    r_forest_parameters = {'min_samples_split': [2, 5, 10, 20, 50, 100],
                           'min_samples_leaf': [1, 2, 5, 10, 50, 100]}

    # grid search is too slow not to use all cores, and far too slow to run with no output
    r_forest_grid_obj = GridSearchCV(random_forest, r_forest_parameters, log_loss_scorer,
                                     verbose=2, n_jobs=-1)
    r_forest_grid_obj = r_forest_grid_obj.fit(train_inp, train_target)
    random_forest = r_forest_grid_obj.best_estimator_
    print "Best params: " + str(r_forest_grid_obj.best_params_)

    random_forest_train_error = log_loss(train_target, random_forest.predict_proba(train_inp))
    random_forest_validation_error = log_loss(valid_target, random_forest.predict_proba(valid_inp))

    print "Best random forest training error: {:02.4f}".format(random_forest_train_error)
    print "Best random forest validation error: {:02.4f}".format(random_forest_validation_error)

    end = time.time()
    print "RF grid search took {:02.4f} seconds".format(end - start)

    return random_forest
def fit_predict_model(city_data):
    """Find and tune the optimal model. Make a prediction on housing data."""

    # Get the features and labels from the Boston housing data
    X, y = city_data.data, city_data.target

    # Setup a Decision Tree Regressor
    regressor = DecisionTreeRegressor()

    parameters = {"max_depth": (1, 2, 3, 4, 5, 6, 7, 8, 9, 10)}

    ###################################
    ### Step 4. YOUR CODE GOES HERE ###
    ###################################

    # 1. Find the best performance metric
    #    should be the same as your performance_metric procedure
    #    http://scikit-learn.org/stable/modules/generated/sklearn.metrics.make_scorer.html
    regScorer = make_scorer(performance_metric, greater_is_better=False)

    # 2. Use grid search to fine tune the Decision Tree Regressor and find the best model
    #    http://scikit-learn.org/stable/modules/generated/sklearn.grid_search.GridSearchCV.html#sklearn.grid_search.GridSearchCV
    reg = GridSearchCV(regressor, parameters, scoring=regScorer)

    # Fit the learner to the training data
    print "Final Model: "
    print reg.fit(X, y)
    print "Optimal parameter: " + str(reg.best_params_)
    print "Best Score: " + str(reg.best_score_)
    print reg.grid_scores_

    x = [11.95, 0.00, 18.100, 0, 0.6590, 5.6090, 90.00, 1.385, 24, 680.0, 20.20, 332.09, 12.13]
    z = reg.predict(x)
    print "House: " + str(x)
    print "Prediction: " + str(z)
def test():
    x_train, y_train = load_svmlight_file("D:/traindata/12trainset")
    x_train.todense()
    x_test, y_test = load_svmlight_file("D:/traindata/12testset")
    x_test.todense()
    print(x_train.shape)

    # classifier
    clf = SVC(kernel='rbf')
    ovrclf = OneVsRestClassifier(clf, -1)

    # parameter grids
    parameters = [{'estimator__C': [2**-5, 2**-4, 2**-3, 2**-2, 2**-1, 1, 2**1, 2**2, 2**3, 2**4, 2**5],
                   'estimator__kernel': ['rbf'],
                   'estimator__gamma': [2**-5, 2**-4, 2**-3, 2**-2, 2**-1, 1, 2**1, 2**2, 2**3, 2**4, 2**5]},
                  {'estimator__C': [2**-5, 2**-4, 2**-3, 2**-2, 2**-1, 1, 2**1, 2**2, 2**3, 2**4, 2**5],
                   'estimator__kernel': ['linear']}]
    para = {'estimator__C': [2**-5, 2**-4],
            'estimator__kernel': ['rbf'],
            'estimator__gamma': [2**-1, 1]}

    # scoring
    sougou_score = make_scorer(score_func, greater_is_better=False)

    # cross-validation iterator
    sfk = c_v.StratifiedKFold(y_train, shuffle=True, n_folds=5, random_state=0)

    # grid search
    gsclf = g_s.GridSearchCV(ovrclf, param_grid=para, cv=sfk, scoring=sougou_score)
    gsclf.fit(x_train, y_train)
    print("best score: ", gsclf.best_score_)
    print("best parameters: ", gsclf.best_params_)

    y_pred = gsclf.predict(x_test)

    # results
    target_names = ['0', '1', '2', '3']
    sum_y = np.sum((np.array(y_pred) - np.array(y_test))**2)
    print(classification_report(y_test, y_pred, target_names=target_names))
    print("sougouVal: ", float(sum_y) / y_pred.shape[0])
    print(time.time() - start_time)
def fit_model(X, y):
    """ Performs grid search over the 'max_depth' parameter for a
        decision tree regressor trained on the input data [X, y]. """

    # Create cross-validation sets from the training data
    cv_sets = ShuffleSplit(X.shape[0], n_iter=10, test_size=0.20, random_state=0)

    # TODO: Create a decision tree regressor object
    regressor = DecisionTreeRegressor()

    # TODO: Create a dictionary for the parameter 'max_depth' with a range from 1 to 10
    params = {'max_depth': (1, 2, 3, 4, 5, 6, 7, 8, 9, 10)}

    # TODO: Transform 'performance_metric' into a scoring function using 'make_scorer'
    scoring_fnc = make_scorer(performance_metric, greater_is_better=True)

    # TODO: Create the grid search object
    grid = GridSearchCV(regressor, params, scoring=scoring_fnc, verbose=True)

    # Fit the grid search object to the data to compute the optimal model
    grid = grid.fit(X, y)

    # Return the optimal model after fitting the data
    return grid.best_estimator_
def ada_boost():
    savefile = open('traindata.pkl', 'rb')
    (x_train, y_train, t1) = cPickle.load(savefile)
    savefile.close()
    savefile = open('testdata.pkl', 'rb')
    (x_test, t1, name1) = cPickle.load(savefile)
    savefile.close()

    # X_train, X_valid, y_train, y_valid = cross_validation.train_test_split(
    #     X, y, test_size=0.1, random_state=42)

    x_train = np.asarray(x_train, dtype=np.float32)
    y_train = np.asarray(y_train, dtype='int32') - 1

    nest = 190
    lr = .1
    md = 6

    # clf1 = DecisionTreeClassifier(max_depth=2)
    # clf = AdaBoostClassifier(clf1, n_estimators=200, learning_rate=.25)
    clf = GradientBoostingClassifier(n_estimators=nest, learning_rate=lr, max_depth=md, random_state=0)
    # clf = RandomForestClassifier(n_estimators=200)  # .81
    # clf = ExtraTreesClassifier(n_estimators=1000, max_depth=None, min_samples_split=10,
    #                            random_state=0, n_jobs=8)  # .81
    # clf = KNeighborsClassifier(15)

    if 1:
        clf.fit(x_train, y_train)
        ypred = clf.predict_proba(x_test)
        y_str = ['Class_1', 'Class_2', 'Class_3', 'Class_4', 'Class_5',
                 'Class_6', 'Class_7', 'Class_8', 'Class_9']
        kcsv.print_csv(ypred, name1, y_str, indexname='id')
        print (nest, lr, md)

    if 0:
        multiclass_log_loss = make_scorer(score_func=logloss_mc, greater_is_better=True,
                                          needs_proba=True)
        scores = cross_val_score(clf, x_train, y_train, n_jobs=8, cv=5,
                                 scoring=multiclass_log_loss)
        print scores
        print (nest, lr, md, scores.mean())
def main(argv): pd.set_option("display.width", 200) pd.set_option("display.height", 500) warnings.filterwarnings("ignore") global file_path, gini_scorer # Normalized Gini Scorer gini_scorer = metrics.make_scorer(normalized_gini, greater_is_better=True) if platform.system() == "Windows": file_path = "C:/Python/Others/data/Kaggle/Liberty_Mutual_Group/" else: file_path = "/home/roshan/Desktop/DS/Others/data/Kaggle/Liberty_Mutual_Group/" ######################################################################################################################## # Read the input file , munging and splitting the data to train and test ######################################################################################################################## Train_DS = pd.read_csv(file_path + "train.csv", sep=",", index_col=0) Actual_DS = pd.read_csv(file_path + "test.csv", sep=",", index_col=0) Sample_DS = pd.read_csv(file_path + "sample_submission.csv", sep=",") Parms_XGB_DS = pd.read_csv(file_path + "Parms_DS_XGB_1001.csv", sep=",") Parms_RF_DS = pd.read_csv(file_path + "Parms_DS_RF2.csv", sep=",") Train_DS, Actual_DS, y = Data_Munging(Train_DS, Actual_DS) pred_Actual = RFR_Regressor(Train_DS, y, Actual_DS, Sample_DS, Parms_RF_DS, Grid=False, Ensemble=False)
def plot_validation_curve(clf, X, y, param, name=None):
    try:
        name = clf.__class__.__name__ if name is None else name
        if param is None:
            return

        scorer = metrics.make_scorer(metrics.average_precision_score)
        train_scores, test_scores = validation_curve(clf, X, y, cv=5, scoring=scorer, n_jobs=-1,
                                                     param_name=param['name'],
                                                     param_range=param['range'])
        train_scores_mean = np.mean(train_scores, axis=1)
        train_scores_std = np.std(train_scores, axis=1)
        test_scores_mean = np.mean(test_scores, axis=1)
        test_scores_std = np.std(test_scores, axis=1)

        plt.title('Validation Curve of {} varying {}'.format(name, param['name']))
        plt.xlabel(param['name'])
        plt.ylabel("Score")
        plt.ylim(-0.05, 1.05)
        plt.xlim(min(param['range']), max(param['range']))
        plt.plot(param['range'], train_scores_mean, label='Training score', color='r')
        plt.fill_between(param['range'], train_scores_mean - train_scores_std,
                         train_scores_mean + train_scores_std, alpha=0.2, color='r')
        plt.plot(param['range'], test_scores_mean, label='Cross-validation score', color="g")
        plt.fill_between(param['range'], test_scores_mean - test_scores_std,
                         test_scores_mean + test_scores_std, alpha=0.2, color='g')
        plt.legend(loc='lower right')
        plt.savefig(name + '_' + param['name'] + '_validationcurve.png')
        plt.clf()
    except Exception as e:
        print('ERROR: {}, {}'.format(name, str(e)))
        pass
def fit_model(X, y):
    """ Tunes a decision tree regressor model using GridSearchCV on the input data X
        and target labels y and returns this optimal model. """

    # Create a decision tree regressor object
    regressor = DecisionTreeRegressor()

    # Set up the parameters we wish to tune
    parameters = {'max_depth': (1, 2, 3, 4, 5, 6, 7, 8, 9, 10)}

    # Make an appropriate scoring function
    scoring_function = make_scorer(mean_squared_error, greater_is_better=False)

    # Make the GridSearchCV object. It exposes a best_estimator_ attribute
    # (only if refit=True, the default) holding the model with the parameters
    # that best fit the data (i.e. the best tree depth). Nothing is computed
    # here; this only creates the grid search object and stores it in reg.
    reg = grid_search.GridSearchCV(regressor, parameters, scoring_function)

    # Fit the learner to the data to obtain the optimal model with tuned parameters.
    # The best model will be saved in reg.best_estimator_
    reg.fit(X, y)

    # Return the optimal model
    return reg.best_estimator_
def fit_model_dtr(X, y):
    """ Tunes a decision tree regressor model using GridSearchCV on the input data X
        and target labels y and returns this optimal model. """

    from sklearn.tree import DecisionTreeRegressor
    from sklearn.metrics import make_scorer
    from sklearn import grid_search

    # print X, y
    # Create a decision tree regressor object
    regressor = DecisionTreeRegressor()

    # Set up the parameters we wish to tune
    parameters = {'max_depth': (1, 2, 3, 4, 5, 6, 7, 8, 9, 10)}

    # Make an appropriate scoring function
    scoring_function = make_scorer(mean_squared_error, greater_is_better=False)

    # Make the GridSearchCV object
    reg = grid_search.GridSearchCV(regressor, parameters, scoring=scoring_function)

    # Fit the learner to the data to obtain the optimal model with tuned parameters
    reg.fit(X, y)

    # print reg.grid_scores_
    # print reg.best_estimator_

    # Return the optimal model
    return reg.best_estimator_
def fit_predict_model(city_data):
    """Find and tune the optimal model. Make a prediction on housing data."""

    # Get the features and labels from the Boston housing data
    X, y = city_data.data, city_data.target

    # Setup a Decision Tree Regressor
    regressor = DecisionTreeRegressor()

    parameters = {'max_depth': (1, 2, 3, 4, 5, 6, 7, 8, 9, 10)}
    my_scorer = make_scorer(performance_metric, greater_is_better=False)

    # Use grid search to fine tune the Decision Tree Regressor and find the best model
    print "Final Model: "

    x = [11.95, 0.00, 18.100, 0, 0.6590, 5.6090, 90.00, 1.385, 24, 680.0, 20.20, 332.09, 12.13]

    # Run grid search several times to get an average result for the best prediction
    predictions = []
    best_predictor = []
    for i in range(10):
        grid = grid_search.GridSearchCV(regressor, param_grid=parameters, scoring=my_scorer, cv=5)
        grid.fit(X, y)
        y1 = grid.best_estimator_.predict(x)
        best_predictor.append(grid.best_params_.itervalues().next())
        predictions.append(y1)

    average_predicted_price = np.mean(predictions)
    print "Best model parameter: " + str(int(np.mean(best_predictor)))
    print "Prediction: " + str(average_predicted_price)
def tune_parameters(features, labels):
    """
    Use GridSearchCV to identify and return the best parameters to use for
    the Decision Tree algorithm.
    features = features list as returned by the targetFeatureSplit script
    labels = target list as returned by the targetFeatureSplit script
    """
    from sklearn import tree
    from sklearn.model_selection import GridSearchCV
    from sklearn.metrics import make_scorer

    # Make scorer for the GridSearchCV function
    scorer = make_scorer(custom_scorer, greater_is_better=True)

    # Parameter names and settings to be used by GridSearchCV
    parameters = [{"criterion": ["gini", "entropy"],
                   "splitter": ["best", "random"],
                   "min_samples_split": [2, 3, 4, 5, 6, 7, 8, 9, 10],
                   "min_samples_leaf": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
                   "min_impurity_split": [1e-9, 1e-8, 1e-7, 1e-6, 1e-5],
                   "presort": [True, False],
                   "random_state": [42]}]

    # Use GridSearchCV to identify the best parameters.
    # K-fold cross-validation is used (100 folds).
    # The F1 score from the custom_scorer function is used as the evaluator.
    clf = GridSearchCV(tree.DecisionTreeClassifier(), parameters, cv=100, scoring=scorer)
    clf.fit(features, labels)

    best_parameters = clf.best_params_

    return best_parameters
def fit_predict_model(city_data):
    """Find and tune the optimal model. Make a prediction on housing data."""

    # Get the features and labels from the Boston housing data
    X, y = city_data.data, city_data.target

    # Setup a Decision Tree Regressor
    regressor = DecisionTreeRegressor()

    # Setup parameters and scores for model optimization through Grid Search
    parameters = {"max_depth": (1, 2, 3, 4, 5, 6, 7, 8, 9, 10)}
    scorer = make_scorer(mean_squared_error, greater_is_better=False)
    gs = GridSearchCV(regressor, parameters, scoring=scorer)
    gs.fit(X, y)

    # Select the best settings for regressor
    reg = gs.best_estimator_

    # Fit the learner to the training data
    print "Final Model: "
    print reg.fit(X, y)

    # Use the model to predict the output of a particular sample
    x = [11.95, 0.00, 18.100, 0, 0.6590, 5.6090, 90.00, 1.385, 24, 680.0, 20.20, 332.09, 12.13]
    y = reg.predict(x)
    print "House: " + str(x)
    print "Prediction: " + str(y)
# min num_features
a = 50
r = 2
# length of sequence
length = 10
sequence = [a * r**(n - 1) for n in range(1, length + 1)]
max_features_params = [x for x in sequence if x <= 3500]

pipe = Pipeline(steps=[('preprocess', NLP_transformer()),
                       ('vectorizer', CountVectorizer(lowercase=False)),
                       ('clf', ClassifierPipeline())])

scoring = {
    'AUC': 'roc_auc',
    'Accuracy': make_scorer(accuracy_score),
    'Brier': make_scorer(brier_score_loss),
    'f1-score': make_scorer(f1_score),
    'precision': make_scorer(precision_score),
    'recall': make_scorer(recall_score)
}

param_grid = [
    {
        'preprocess__metadata_remove': [False, True],
        'preprocess__emoji_remove': [False, True],
        'preprocess__punctuation_remove': [False, True],
        'preprocess__negation_expand': [False, True],
        'preprocess__digits_remove': [False, True],
        'preprocess__negation_mark': [False, True],
        'preprocess__normalize': [False, True],
y_pred = clf.predict(X_test)
f1_scorer = f1_score(y_test, y_pred, average=None)
print("\n{}: \n".format(clf.__class__.__name__))
print("f1 score for test set is {}".format(f1_scorer))

from sklearn.metrics import f1_score
from sklearn.svm import SVC

parameters = [
    {
        'kernel': ['rbf', 'sigmoid'],
        'C': [.1, 1, 10],
        'gamma': [0.001, 10, 1000]
    },
]

clf = SVC()
f1_scorer = make_scorer(f1_score, pos_label=0)
sss = StratifiedShuffleSplit(y_train, n_iter=10, test_size=0.25)
grid_obj = GridSearchCV(clf, parameters, cv=sss, scoring=f1_scorer)
grid_obj = grid_obj.fit(X_train, y_train)
clf = grid_obj.best_estimator_
# print clf

y_pred = clf.predict(X_test)
f1_score_value = f1_score(y_test, y_pred, pos_label=0, average=None)  # For testing
print("F1 Score for test set: {}".format(f1_score_value))
print("Confusion Matrix is : \n {} ".format(confusion_matrix(y_test, y_pred)))

target_names = ['class 0', 'class 1']
print(" ")
print("Classification report is : \n ")
print(classification_report(y_test, y_pred, target_names=target_names))
                   calculate_mean_sem_ranking, apply_friedman_test, apply_holms_test)

sys.path.append(join(dirname(__file__), '..', '..'))
from utils import generate_mean_std_tbl, generate_pvalues_tbl, sort_tbl

RESULTS_NAMES = [
    'no_oversampling',
    'random_oversampling',
    'smote',
    'kmeans_smote',
    'somo',
    'gsmote',
    'gsomo'
]
OVERSAMPLERS_NAMES = [
    'NO OVERSAMPLING',
    'RANDOM OVERSAMPLING',
    'SMOTE',
    'K-MEANS SMOTE',
    'SOMO',
    'G-SMOTE',
    'G-SOMO'
]
CLASSIFIERS_NAMES = ['LR', 'KNN', 'DT', 'GBC']
SCORERS['geometric_mean_score'] = make_scorer(geometric_mean_score)
RESULTS_PATH = join(dirname(__file__), '..', 'results')
ANALYSIS_PATH = join(dirname(__file__), '..', 'analysis')


def generate_results():
    """Generate results including all oversamplers."""

    # Load results
    results = []
    for name in RESULTS_NAMES:
        file_path = join(RESULTS_PATH, f'{name}.pkl')
        results.append(pd.read_pickle(file_path))

    # Combine results
    results = combine_results(*results)
Support.colored_print("Training...", "green") t0 = time.time() model.fit(X[:train_size], y[:train_size]) model_fit = time.time() - t0 print(model_name + " complexity and bandwidth selected and model fitted in %.3f s" % model_fit) t0 = time.time() y_model = model.predict(X_plot) model_predict = time.time() - t0 print(model_name + " prediction for %d inputs in %.3f s" % (X_plot.shape[0], model_predict)) # Look at the results Support.colored_print("Saving results...", "green") #train_sizes_mse, train_scores_model_mse, test_scores_model_mse = learning_curve(forest, X[:train_size], y[:train_size], train_sizes=numpy.linspace(0.1, 1, 10), scoring="neg_mean_squared_error", cv=10) train_sizes_r2, train_scores_model_r2, test_scores_model_r2 = learning_curve(model, X[:train_size], y[:train_size], train_sizes=numpy.linspace(0.1, 1, 10), scoring="r2", cv=10) train_sizes_re, train_scores_model_re, test_scores_model_re = learning_curve(model, X[:train_size], y[:train_size], train_sizes=numpy.linspace(0.1, 1, 10), scoring=make_scorer(scoring.relative_error), cv=10) plotter.figure() plotter.clf() #plotter.plot(train_sizes_mse, -train_scores_model_mse.mean(1), 'o-', color="b", label="mean squared error") plotter.plot(train_sizes_r2, train_scores_model_r2.mean(1), 'o-', color="g", label="r2") plotter.plot(train_sizes_re, train_scores_model_re.mean(1), 'o-', color="y", label="relative error") plotter.xlabel("Train size") plotter.ylabel("Error") plotter.title("Learning curve for output n. " + str(output_selected) + " Training Set") plotter.legend(loc="best") path_to_save = base_path_saving + "/out_" + model_name if not os.path.isdir(path_to_save): os.mkdir(path_to_save)
def plot_learning_curve(estimator, title, X, y, ylim=None, cv=None, n_jobs=None,
                        train_sizes=np.linspace(.1, 1.0, 5),
                        scorer=make_scorer(cohen_kappa_score, weights='quadratic')):
    """
    Generate a simple plot of the test and training learning curve.

    The function was copied from the scikit-learn tutorials:
    https://scikit-learn.org/stable/auto_examples/model_selection/plot_learning_curve.html#sphx-glr-auto-examples-model-selection-plot-learning-curve-py

    Parameters
    ----------
    estimator : object type that implements the "fit" and "predict" methods
        An object of that type which is cloned for each validation.

    title : string
        Title for the chart.

    X : array-like, shape (n_samples, n_features)
        Training vector, where n_samples is the number of samples and
        n_features is the number of features.

    y : array-like, shape (n_samples) or (n_samples, n_features), optional
        Target relative to X for classification or regression;
        None for unsupervised learning.

    ylim : tuple, shape (ymin, ymax), optional
        Defines minimum and maximum y-values plotted.

    cv : int, cross-validation generator or an iterable, optional
        Determines the cross-validation splitting strategy.
        Possible inputs for cv are:
          - None, to use the default 3-fold cross-validation,
          - integer, to specify the number of folds,
          - :term:`CV splitter`,
          - An iterable yielding (train, test) splits as arrays of indices.

        For integer/None inputs, if ``y`` is binary or multiclass,
        :class:`StratifiedKFold` is used. If the estimator is not a classifier
        or if ``y`` is neither binary nor multiclass, :class:`KFold` is used.

        Refer to the :ref:`User Guide <cross_validation>` for the various
        cross-validators that can be used here.

    n_jobs : int or None, optional (default=None)
        Number of jobs to run in parallel.
        ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
        ``-1`` means using all processors. See :term:`Glossary <n_jobs>`
        for more details.

    train_sizes : array-like, shape (n_ticks,), dtype float or int
        Relative or absolute numbers of training examples that will be used to
        generate the learning curve. If the dtype is float, it is regarded as
        a fraction of the maximum size of the training set (that is determined
        by the selected validation method), i.e. it has to be within (0, 1].
        Otherwise it is interpreted as absolute sizes of the training sets.
        Note that for classification the number of samples usually has to
        be big enough to contain at least one sample from each class.
        (default: np.linspace(0.1, 1.0, 5))

    scorer : a modification added to the original function; allows a custom
        scoring metric (in our case, the quadratic weighted kappa, QWK).
    """
    plt.figure()
    plt.title(title)
    if ylim is not None:
        plt.ylim(*ylim)
    plt.xlabel("Training examples")
    plt.ylabel("Score")
    train_sizes, train_scores, test_scores = learning_curve(
        estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes, scoring=scorer)
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)
    plt.grid()

    plt.fill_between(train_sizes, train_scores_mean - train_scores_std,
                     train_scores_mean + train_scores_std, alpha=0.1, color="r")
    plt.fill_between(train_sizes, test_scores_mean - test_scores_std,
                     test_scores_mean + test_scores_std, alpha=0.1, color="g")
    plt.plot(train_sizes, train_scores_mean, 'o-', color="r", label="Training score")
    plt.plot(train_sizes, test_scores_mean, 'o-', color="g", label="Cross-validation score")

    plt.legend(loc="best")
    return plt
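# --- Added illustration (not part of the snippet above) ---
# A tiny, hypothetical sketch of the quadratic-weighted kappa (QWK) scorer that
# the function above uses as its default `scorer` argument. Labels are made up
# for illustration; only cohen_kappa_score and make_scorer from scikit-learn
# are assumed.
from sklearn.metrics import cohen_kappa_score, make_scorer

y_true_demo = [0, 1, 2, 2, 3]
y_pred_demo = [0, 1, 1, 2, 3]
print(cohen_kappa_score(y_true_demo, y_pred_demo, weights='quadratic'))  # agreement in [-1, 1]

qwk_scorer = make_scorer(cohen_kappa_score, weights='quadratic')
# qwk_scorer(estimator, X, y) can then be passed as scoring= to learning_curve or GridSearchCV.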
axs[2].set_xlabel("Adjusted Rand Score") for (model_name, dr), df_ in df_selected.groupby(["model", "DR"]): df_ = df_.set_index("k") df_["runtime"].plot(marker="x", ax=axs[0], label=model_name+"-"+dr) df_["adjusted_mutual_info_score"].plot(marker="x", ax=axs[1], label=model_name+"-"+dr) df_["adjusted_rand_score"].plot(marker="x", ax=axs[2], label=model_name+"-"+dr) axs[2].legend() plt.close(fig) fig.savefig(f"..//output//ul_{dataset_prefix}_experiment3_comparison_vs_k.png") df.to_csv(f"..//output//ul_{dataset_prefix}_experiment3.csv") pd.DataFrame(results).to_csv(f"..//output//ul_{dataset_prefix}_experiment3.csv") print("EXP4") results = [] scorer = make_scorer(balanced_accuracy_score) for dataset_name, dataset in dataset_dict.items(): if "original" in dataset_name: continue X_train, X_test, y_train, y_test = dataset param_grid_nn = {"hidden_layer_sizes": [64, (4, 16), (8, 8), (16, 4), (4, 4, 4)]} param_grid_pipline = {"estimator__" + k: v for k, v in param_grid_nn.items()} pipline = build_pipeline(MLPClassifier(), resampling=False) gscv = GridSearchCV(pipline, param_grid_pipline, n_jobs=-3, verbose=1) res = gscv.fit(X_train, y_train) best_estimator = res.best_estimator_ best_estimator.fit(X_train, y_train) train_score = scorer(best_estimator, X=X_train, y_true=y_train) test_score = scorer(best_estimator, X=X_test, y_true=y_test) dr = dataset_name.split("(")[0].split("_")[1] k = int(dataset_name.split("(")[1].split(")")[0])
def test_permutation_score():
    iris = load_iris()
    X = iris.data
    X_sparse = coo_matrix(X)
    y = iris.target
    svm = SVC(kernel='linear')
    cv = cval.StratifiedKFold(y, 2)

    score, scores, pvalue = cval.permutation_test_score(svm, X, y, cv=cv,
                                                        scoring="accuracy")
    assert_greater(score, 0.9)
    assert_almost_equal(pvalue, 0.0, 1)

    score_label, _, pvalue_label = cval.permutation_test_score(
        svm, X, y, cv=cv, scoring="accuracy", labels=np.ones(y.size),
        random_state=0)
    assert_true(score_label == score)
    assert_true(pvalue_label == pvalue)

    # test with custom scoring object
    scorer = make_scorer(fbeta_score, beta=2)
    score_label, _, pvalue_label = cval.permutation_test_score(
        svm, X, y, scoring=scorer, cv=cv, labels=np.ones(y.size),
        random_state=0)
    assert_almost_equal(score_label, .97, 2)
    assert_almost_equal(pvalue_label, 0.01, 3)

    # check that we obtain the same results with a sparse representation
    svm_sparse = SVC(kernel='linear')
    cv_sparse = cval.StratifiedKFold(y, 2)
    score_label, _, pvalue_label = cval.permutation_test_score(
        svm_sparse, X_sparse, y, cv=cv_sparse,
        scoring="accuracy", labels=np.ones(y.size), random_state=0)
    assert_true(score_label == score)
    assert_true(pvalue_label == pvalue)

    # set random y
    y = np.mod(np.arange(len(y)), 3)

    score, scores, pvalue = cval.permutation_test_score(svm, X, y, cv=cv,
                                                        scoring="accuracy")
    assert_less(score, 0.5)
    assert_greater(pvalue, 0.2)

    # test with deprecated interface
    with warnings.catch_warnings(record=True):
        score, scores, pvalue = cval.permutation_test_score(
            svm, X, y, score_func=accuracy_score, cv=cv)
    assert_less(score, 0.5)
    assert_greater(pvalue, 0.2)
def main(): """Main.""" # Define paths parent_dir = os.getcwd() wdir = parent_dir + '/output/features/' outdir = parent_dir + "/output/classifiers/" # Load the full dataframe df = pd.read_parquet(wdir + "features_all.parquet") # IMPORTANT: Keep only a random sample of 25% of data df = df.groupby(["dataset"]).sample(frac=0.25, random_state=42) print(r"GridSearch will be performed on a random sample of 25% of data") print("Shape after downsampling:", df.shape) # Predictors cols_all = df.columns cols_time = cols_all[cols_all.str.startswith('time_')].tolist() # EEG also includes the time columns cols_eeg = cols_all[cols_all.str.startswith('eeg_')].tolist() + cols_time cols_eog = cols_all[cols_all.str.startswith('eog_')].tolist() cols_emg = cols_all[cols_all.str.startswith('emg_')].tolist() cols_demo = ['age', 'male'] # Define predictors X = df[cols_eeg + cols_eog + cols_emg + cols_demo].sort_index(axis=1) # Define target and groups y = df['stage'] subjects = df.index.get_level_values(0).to_numpy() # Show the values of balanced class weights # print("Balanced class weights are:", # np.round(compute_class_weight('balanced', np.unique(y), y), 2)) # Define cross-validation strategy # For speed, we only use a 2-fold validation cv = GroupKFold(n_splits=2) groups = subjects # Define hyper-parameters params = dict( boosting_type='gbdt', n_estimators=50, max_depth=7, num_leaves=30, colsample_bytree=0.8, importance_type='gain', n_jobs=2 ) # Define scoring metrics scorer = { "accuracy": "accuracy", "f1_N1": make_scorer(f1_score, labels=["N1"], average=None), "f1_N2": make_scorer(f1_score, labels=["N2"], average=None), "f1_N3": make_scorer(f1_score, labels=["N3"], average=None), "f1_R": make_scorer(f1_score, labels=["R"], average=None), "f1_W": make_scorer(f1_score, labels=["W"], average=None), "mcc": make_scorer(matthews_corrcoef), } # get param_grid param_grid = get_param_grid() # Fit GridSearchCV clf = LGBMClassifier(**params) grid = GridSearchCV(clf, param_grid, cv=cv, scoring=scorer, refit=False, n_jobs=6, verbose=1) grid.fit(X, y, groups=groups) # Sort by best performance cols_scoring = ["mean_test_" + c for c in scorer.keys()] cols = ['param_class_weight'] + cols_scoring grid_res = pd.DataFrame(grid.cv_results_)[cols] grid_res.rename( columns={'param_class_weight': 'class_weight'}, inplace=True) grid_res['mean_test_scores'] = grid_res[cols_scoring].mean(1) grid_res = grid_res.sort_values( by="mean_test_scores", ascending=False).reset_index(drop=True).round(5) # Export to CSV grid_res.to_csv(outdir + "gridsearch_class_weights.csv", index=False)
test_model(pca_model, "PCA", K)
test_model(ridge_model, "Ridge", K)
test_model(lasso_model, "Lasso", K)
test_model(enet_model, "Enet", K)

''' --> ENET and LASSO perform better out-of-sample but R2 negative '''

#%%
#--------------------------------------------------
# '''Potential Alternative Approach'''
print("Potential Alternative Approach")

from sklearn.model_selection import cross_validate

### Define Scorer
from sklearn.metrics import make_scorer, mean_squared_error, r2_score

mean_squared_error_scorer = make_scorer(mean_squared_error)
scoring = {'MSE': mean_squared_error_scorer, 'r2': make_scorer(r2_score)}

# cv=TimeSeriesSplit(n_splits=5).split(X)

### Cross-Validation
models = [c_model, ols_model, pca_model, ridge_model, lasso_model, enet_model]
models_names = ['c_model', 'ols_model', 'pca_model', 'ridge_model', 'lasso_model', 'enet_model']

for k in range(len(models)):
    if models_names[k] == "c_model":
        cv_results = cross_validate(models[k], Ones, y, cv=K,
                                    return_train_score=True,
def predictRandomForestRegression(data_path, periods):
    print("\nTraining Random Forest Regression model with full dataset ...")

    df = pd.read_csv(data_path)
    df['TIMESTAMP'] = df['TIMESTAMP'].astype('datetime64')
    df.set_index('TIMESTAMP', inplace=True)

    dfmean = df.resample('1M').mean()
    dfmin = df.resample('1M').min()
    dfmax = df.resample('1M').max()

    x_train, y_train = transformDataset(dfmean)
    xmin_train, ymin_train = transformDataset(dfmin)
    xmax_train, ymax_train = transformDataset(dfmax)

    model = ensemble.RandomForestRegressor()
    model_min = ensemble.RandomForestRegressor()
    model_max = ensemble.RandomForestRegressor()

    param_search = {
        'n_estimators': [100],
        'max_features': ['auto'],
        'max_depth': [10]
    }

    tscv = model_selection.TimeSeriesSplit(n_splits=2)
    rmse_score = metrics.make_scorer(rmse_calc, greater_is_better=False)

    gsearch = model_selection.GridSearchCV(estimator=model, cv=tscv,
                                           param_grid=param_search, scoring=rmse_score)
    gsearch_min = model_selection.GridSearchCV(estimator=model_min, cv=tscv,
                                               param_grid=param_search, scoring=rmse_score)
    gsearch_max = model_selection.GridSearchCV(estimator=model_max, cv=tscv,
                                               param_grid=param_search, scoring=rmse_score)

    gsearch.fit(x_train, y_train)
    gsearch_min.fit(xmin_train, ymin_train)
    gsearch_max.fit(xmax_train, ymax_train)

    best_score = gsearch.best_score_
    best_model = gsearch.best_estimator_
    best_model_min = gsearch_min.best_estimator_
    best_model_max = gsearch_max.best_estimator_

    print("\nPredicting with Random Forest regressor ...")
    prediction = pd.DataFrame(columns=['TIMESTAMP', 'RENEWABLES_PCT'])

    l = len(x_train)
    x_pred = x_train.iloc[[l - 1]]
    y_pred = best_model.predict(x_pred)
    xmin_pred = xmin_train.iloc[[l - 1]]
    ymin_pred = best_model_min.predict(xmin_pred)
    xmax_pred = xmax_train.iloc[[l - 1]]
    ymax_pred = best_model_max.predict(xmax_pred)

    prediction = prediction.append(
        {
            'TIMESTAMP': x_pred.index[0],
            'RENEWABLES_PCT_MEAN': y_pred[0],
            'RENEWABLES_PCT_LOWER': ymin_pred[0],
            'RENEWABLES_PCT_UPPER': ymax_pred[0]
        },
        ignore_index=True)

    for i in range(1, periods):
        ti = prediction.iloc[i - 1]['TIMESTAMP'] + pd.offsets.DateOffset(months=1)

        xi_pred = pd.DataFrame({
            'YESTERDAY': y_pred,
            'YESTERDAY_DIFF': y_pred - x_pred['YESTERDAY'],
            'YESTERDAY-1': x_pred['YESTERDAY'],
            'YESTERDAY-1_DIFF': x_pred['YESTERDAY_DIFF']
        })
        yi_pred = best_model.predict(xi_pred)

        xmini_pred = pd.DataFrame({
            'YESTERDAY': ymin_pred,
            'YESTERDAY_DIFF': ymin_pred - xmin_pred['YESTERDAY'],
            'YESTERDAY-1': xmin_pred['YESTERDAY'],
            'YESTERDAY-1_DIFF': xmin_pred['YESTERDAY_DIFF']
        })
        ymini_pred = best_model.predict(xmini_pred)

        xmaxi_pred = pd.DataFrame({
            'YESTERDAY': ymax_pred,
            'YESTERDAY_DIFF': ymax_pred - xmax_pred['YESTERDAY'],
            'YESTERDAY-1': xmax_pred['YESTERDAY'],
            'YESTERDAY-1_DIFF': xmax_pred['YESTERDAY_DIFF']
        })
        ymaxi_pred = best_model.predict(xmaxi_pred)

        prediction = prediction.append(
            {
                'TIMESTAMP': ti,
                'RENEWABLES_PCT_MEAN': yi_pred[0],
                'RENEWABLES_PCT_LOWER': ymini_pred[0],
                'RENEWABLES_PCT_UPPER': ymaxi_pred[0]
            },
            ignore_index=True)

        x_pred = xi_pred
        y_pred = yi_pred
        xmin_pred = xmini_pred
        ymin_pred = ymini_pred
        xmax_pred = xmaxi_pred
        ymax_pred = ymaxi_pred

    prediction.set_index('TIMESTAMP', inplace=True)
    prediction = prediction.resample('1Y').mean()

    p = prediction.plot()
    p.set_title('CA Predicted Renewables % by Random Forest Regression')
    p.set_ylabel('Renewables %')

    wd = os.path.dirname(data_path) + '/../images'
    os.makedirs(wd, exist_ok=True)
    plt.savefig(wd + '/prediction-randomforest.png')

    return prediction
def scorer(show):
    return make_scorer(score, show=show, greater_is_better=False)
def balanced_accuracy(y_true, y_pred):
    """Compute the balanced accuracy of a set of predictions.

    Parameters
    ----------
    y_true: numpy.ndarray {n_samples}
        True class labels
    y_pred: numpy.ndarray {n_samples}
        Predicted class labels by the estimator

    Returns
    -------
    fitness: float
        Returns a float value indicating the `individual`'s balanced accuracy
        0.5 is as good as chance, and 1.0 is perfect predictive accuracy
    """
    all_classes = list(set(np.append(y_true, y_pred)))
    all_class_accuracies = []
    for this_class in all_classes:
        this_class_sensitivity = \
            float(sum((y_pred == this_class) & (y_true == this_class))) / \
            float(sum((y_true == this_class)))

        this_class_specificity = \
            float(sum((y_pred != this_class) & (y_true != this_class))) / \
            float(sum((y_true != this_class)))

        this_class_accuracy = (this_class_sensitivity + this_class_specificity) / 2.
        all_class_accuracies.append(this_class_accuracy)

    return np.mean(all_class_accuracies)


SCORERS['balanced_accuracy'] = make_scorer(balanced_accuracy)
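# --- Added illustration (not part of the snippet above) ---
# A hypothetical usage sketch for the custom balanced_accuracy function above,
# passed directly through make_scorer rather than looked up by name. Assumes
# scikit-learn plus the balanced_accuracy definition and imports from the
# snippet above (numpy as np, make_scorer).
from sklearn.datasets import load_iris
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier

X_demo, y_demo = load_iris(return_X_y=True)
scores_demo = cross_val_score(DecisionTreeClassifier(random_state=0), X_demo, y_demo,
                              scoring=make_scorer(balanced_accuracy), cv=5)
print(scores_demo.mean())  # 0.5 is chance level, 1.0 is perfect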
# print scores2
# exit(0)

# define fixed parameters and parameters to search
crf = sklearn_crfsuite.CRF(algorithm='lbfgs',
                           max_iterations=300,
                           all_possible_transitions=True)
params_space = {
    'c1': scipy.stats.expon(scale=0.5),
    'c2': scipy.stats.expon(scale=0.05),
}

# use the same metric for evaluation
f1_scorer = make_scorer(metrics.flat_f1_score, average='weighted', labels=labels)

# search
rs = RandomizedSearchCV(crf, params_space,
                        cv=3,
                        verbose=1,
                        n_jobs=-1,
                        n_iter=50,
                        scoring=f1_scorer)
rs.fit(X_train_CRF_shape, y_train_CRF_shape)

# crf = rs.best_estimator_
print('best params:', rs.best_params_)
print('best CV score:', rs.best_score_)
from xgboost import XGBClassifier
from xgboost import XGBRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (mean_squared_error, make_scorer, f1_score, roc_auc_score,
                             recall_score, precision_score, accuracy_score)
import os
import pickle


def mean_squared_error_(ground_truth, predictions):
    return mean_squared_error(ground_truth, predictions)


MSE = make_scorer(mean_squared_error_, greater_is_better=False)
F1 = make_scorer(f1_score)
AUC = make_scorer(roc_auc_score)


class Ensemble(object):
    def __init__(self, n_folds, stacker, base_models):
        self.n_folds = n_folds
        self.stacker = stacker
        self.base_models = base_models

    def fit(self, X, y):
        X = np.array(X)
        y = np.array(y)

        folds = list(
            KFold(len(y),
# clf2 = RandomForestClassifier(n_estimators=1000, random_state=1)

# In[28]:

# clf3 = GaussianNB()

# In[41]:

# eclf = VotingClassifier(estimators=[('lr', clf1), ('rf', clf2), ('gnb', clf3)], voting='hard')

# In[32]:

from sklearn.metrics import accuracy_score, make_scorer
from sklearn.metrics import matthews_corrcoef

mcc = make_scorer(matthews_corrcoef)

# In[ ]:

# clf1 = DecisionTreeClassifier(max_depth=4)
# clf2 = KNeighborsClassifier(n_neighbors=7)
# clf3 = SVC(gamma='scale', kernel='rbf', probability=True)
# >>> eclf = VotingClassifier(estimators=[('dt', clf1), ('knn', clf2), ('svc', clf3)],
# ...                         voting='soft', weights=[2, 1, 2])

# In[53]:

# clf1 = clf1.fit(X_train, y_train)
# clf2 = clf2.fit(X_train, y_train)
# clf3 = clf3.fit(X_train, y_train)
# eclf = eclf.fit(X_train, y_train)
    'alpha': [1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1e0, 1e+1, 1e+2, 1e+3, 1e+4]
}

print(f'evaluating {ds}-{fold}')

# Using 5 folds and cross-validation. Our RMSE function is made compatible with the
# grid search through make_scorer. Since RMSE is a minimization objective,
# greater_is_better=False makes the scorer report a negative value (turning it into a
# maximization problem); the sign is changed back later in the results file.
grid = GridSearchCV(linear_model.Lasso(), confs, n_jobs=4, cv=5, verbose=1,
                    scoring=make_scorer(RMSE, greater_is_better=False),
                    return_train_score=True).fit(X_train, y_train)

# Using the best gridsearch configuration to train and obtain the final RMSEs
regressor = linear_model.Lasso(**grid.best_params_).fit(X_train, y_train)

# Handling the greater_is_better change of sign on the best train score
RMSE_train = -1 * grid.best_score_
RMSE_test = RMSE(regressor.predict(X_test).ravel(), y_test.ravel())

results['dataset'].append(ds)
results['conf'].append(grid.best_params_)
results['RMSE_train'].append(RMSE_train)
results['RMSE_test'].append(RMSE_test)
results['Fold'].append(fold)
    data, meta = arff.loadarff(path)
    data_nolabels = np.asarray([np.asarray(list(item)[:-1]) for item in data])
    labels = np.asarray([int(item[-1]) for item in data])
    cols = list(meta)[:-1]
    ds = pandas.DataFrame(data_nolabels, columns=cols)
    return ds, labels


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='USAGE: python readscorer.py --path <>')
    parser.add_argument('-p', '--path', help='path to arff file', required=True)
    args = vars(parser.parse_args())
    train, truth = read(args['path'])

    clf = DT(criterion='entropy', splitter='best')
    scores = cross_val_score(clf, train, truth,
                             scoring=make_scorer(f1_score, average='binary', pos_label=1),
                             cv=10)
    print('f1 scores on 10 fold cross validation')
    for x in range(len(scores)):
        print('{0}) {1}'.format(x + 1, scores[x]))
    print('########################')

    clf = clf.fit(train, truth)

    # clustering
    k = [x for x in range(1, 33) if x % 2 == 0]
    k = [1] + k
    # 6 attributes originally
    k = [z * 6 for z in k]
    k.reverse()
    clusters = {}
    feature_imps = [(name, imp) for name, imp in zip(train.columns, clf.feature_importances_)]
from CV import X_train, y_train, X_test, y_test, df_reduced_train

clf = SGDClassifier()
parameters = [{
    'n_iter': [3000, 4000, 10000],
    'penalty': ['l2', 'elasticnet'],
    'loss': ['hinge', 'log', 'perceptron', 'modified_huber'],
    'alpha': [0.03, 0.04, 0.07],
    'shuffle': [True],
    'class_weight': [{1: 0.9}, {0: 0.1}, 'balanced']
}]

start = time()
f1_scorer = make_scorer(f1_score)
# Note: grid_search.GridSearchCV and grid_scores_ belong to the pre-0.20 scikit-learn API;
# newer versions use sklearn.model_selection.GridSearchCV and cv_results_.
gs = grid_search.GridSearchCV(clf, parameters, scoring=f1_scorer, n_jobs=-1)
gs.fit(df_reduced_train.values, y_train)

print("Grid scores: --------")
print(gs.grid_scores_)
print("Best estimator----")
print(gs.best_estimator_)
print("Best params ----")
print(gs.best_params_)
print("Best score: ", gs.best_score_)
print("Finished in: ", (time() - start))
    ('columns', ColumnFilter()),
    ('lm', LinearRegression())
])


def rmsle(y_hat, y):
    target = y
    predictions = y_hat
    log_diff = np.log(predictions + 1) - np.log(target + 1)
    return np.sqrt(np.mean(log_diff ** 2))


# GridSearch
params = {'nearest_average__window': [3, 5, 7]}

# Turns our rmsle func into a scorer of the type required by GridSearchCV.
# With greater_is_better=False the reported best_score_ is the negated RMSLE.
rmsle_scorer = make_scorer(rmsle, greater_is_better=False)

gscv = GridSearchCV(p, params, scoring=rmsle_scorer, cv=cross_val)
clf = gscv.fit(df.reset_index(), y)

print('Best parameters: %s' % clf.best_params_)
print('Best RMSLE: %s' % clf.best_score_)

test = pd.read_csv('data/test.csv')
test = test.sort_values(by='SalesID')
test_predictions = clf.predict(test)
test['SalePrice'] = test_predictions
outfile = 'data/solution_benchmark.csv'
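
# Illustrative aside (demo names only, not part of the original script): the same scorer
# can be built on scikit-learn's mean_squared_log_error, which matches rmsle() above once
# the square root is taken.
import numpy as np
from sklearn.metrics import make_scorer, mean_squared_log_error


def rmsle_builtin(y_true, y_pred):
    return np.sqrt(mean_squared_log_error(y_true, y_pred))


rmsle_scorer_builtin = make_scorer(rmsle_builtin, greater_is_better=False)

y_true_demo = np.array([3.0, 5.0, 2.5, 7.0])
y_pred_demo = np.array([2.5, 5.0, 4.0, 8.0])
assert np.isclose(rmsle_builtin(y_true_demo, y_pred_demo), rmsle(y_pred_demo, y_true_demo))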
def algo_CVmetrics(classifier_object, X_train, Y_train):
    """
    Analytics function that reports cross-validated performance metrics for
    imbalanced binary classification.

    classifier_object = classification method, e.g. DecisionTreeClassifier()
    X_train = input (training data)
    Y_train = output (training data)
    """
    cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=seed_custom)

    metricslist = {
        'f2': make_scorer(metrics.fbeta_score, beta=2),
        'balacc': make_scorer(metrics.balanced_accuracy_score),
        'precision': make_scorer(metrics.precision_score),
        'recall': make_scorer(metrics.recall_score)
    }

    cv_results = cross_validate(classifier_object, X_train, Y_train,
                                cv=cv, scoring=metricslist, return_estimator=True)

    f2_mean = np.mean(cv_results['test_f2'])
    f2_std = np.std(cv_results['test_f2'])
    balacc_mean = np.mean(cv_results['test_balacc'])
    balacc_std = np.std(cv_results['test_balacc'])
    precision_mean = np.mean(cv_results['test_precision'])
    precision_std = np.std(cv_results['test_precision'])
    recall_mean = np.mean(cv_results['test_recall'])
    recall_std = np.std(cv_results['test_recall'])

    scorebox = pd.DataFrame(np.zeros((1, 8)),
                            columns=['F2-Score Mean', 'F2-Score STD',
                                     'Balanced Accuracy Mean', 'Balanced Accuracy STD',
                                     'Precision Mean', 'Precision STD',
                                     'Recall Mean', 'Recall STD'])

    scorebox.iloc[0, 0] = f2_mean
    scorebox.iloc[0, 1] = f2_std
    scorebox.iloc[0, 2] = balacc_mean
    scorebox.iloc[0, 3] = balacc_std
    scorebox.iloc[0, 4] = precision_mean
    scorebox.iloc[0, 5] = precision_std
    scorebox.iloc[0, 6] = recall_mean
    scorebox.iloc[0, 7] = recall_std

    scorebox = np.round(scorebox, 3)

    print("Model has a mean CV balanced accuracy of {0}, (Std: {1})".format(round(balacc_mean, 3), round(balacc_std, 3)))
    print("Model has a mean CV F2-Score of {0}, (Std: {1})".format(round(f2_mean, 3), round(f2_std, 3)))
    print("Model has a mean CV Precision of {0}, (Std: {1})".format(round(precision_mean, 3), round(precision_std, 3)))
    print("Model has a mean CV Recall of {0}, (Std: {1})".format(round(recall_mean, 3), round(recall_std, 3)))

    return scorebox
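
# Hypothetical usage sketch (the imports plus seed_custom are assumed to exist in the
# surrounding module): run the CV report on a small imbalanced toy problem.
from sklearn.datasets import make_classification
from sklearn.tree import DecisionTreeClassifier

X_demo, y_demo = make_classification(n_samples=500, weights=[0.9, 0.1], random_state=0)
scorebox_demo = algo_CVmetrics(DecisionTreeClassifier(random_state=0), X_demo, y_demo)
print(scorebox_demo)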
def optimize():
    log(action_logging_enum=INFO,
        logging_text="[DECISION TREE]: Starting to search for best depth by cross validating different values.")

    # read data
    train = pd.read_csv(DATA_PATH + LEXICAL_FEATURE_DATABASE)

    # drop useless columns before training
    drop_elements = ['ID', 'URL']  # drop because of heatmap for last 2
    train = train.drop(drop_elements, axis=1)

    x_train = train.drop(['Label'], axis=1)
    y_train = train['Label'].copy()

    # Candidate hyperparameter values
    class_weight = ['balanced']
    min_samples_leaf = [1, 2, 3]
    min_samples_split = [2, 3, 4]
    max_features = ['auto', 'sqrt', 5, 6]
    random_state = [42]
    max_depth = [14, 15, 16, 19, 20, 21, 22, 23, 25]

    # Create hyperparameter options
    hyperparameters = dict(min_samples_split=min_samples_split,
                           min_samples_leaf=min_samples_leaf,
                           random_state=random_state,
                           class_weight=class_weight,
                           max_features=max_features,
                           max_depth=max_depth)

    model = tree.DecisionTreeClassifier()

    # Create randomized search using 3-fold cross validation
    clf = RandomizedSearchCV(model, hyperparameters, n_iter=100, cv=3,
                             verbose=10, n_jobs=1, scoring='f1_weighted')
    best_model = clf.fit(x_train, y_train)

    # View best hyperparameters
    # Best estimators: 60
    # Best samples leaf: 1
    # Best samples split: 3
    # Best features: 5
    log(INFO, 'Best params: {}'.format(best_model.best_params_))
    log(INFO, 'Best samples leaf: {}'.format(best_model.best_estimator_.get_params()['min_samples_leaf']))
    log(INFO, 'Best samples split: {}'.format(best_model.best_estimator_.get_params()['min_samples_split']))
    log(INFO, 'Best features: {}'.format(best_model.best_estimator_.get_params()['max_features']))

    # maximum depth bounded by the number of columns
    max_depth = len(train.columns)
    print(max_depth)

    cv = KFold(n_splits=10)
    accuracies = list()
    errors = list()
    max_attributes = max_depth
    depth_range = range(10, max_attributes + 10)
    scorer = {'main': 'accuracy', 'custom': make_scorer(score_func)}

    for depth in depth_range:
        fold_error = []
        fold_accuracy = []
        tree_model = tree.DecisionTreeClassifier(max_depth=depth,
                                                 min_samples_split=15,
                                                 min_samples_leaf=10,
                                                 random_state=42,
                                                 class_weight='balanced')

        cv_results = cross_validate(tree_model, X=x_train, y=y_train, cv=10,
                                    return_train_score=True)

        for res in cv_results['train_score']:
            error = 1 - res
            fold_error.append(error)
            fold_accuracy.append(res)

        avg_error = sum(fold_error) / len(fold_error)
        avg_accuracy = sum(fold_accuracy) / len(fold_accuracy)

        log(action_logging_enum=INFO, logging_text="AVG ERROR: {f}".format(f=avg_error))
        log(action_logging_enum=INFO, logging_text="AVG ACCURACY: {f}".format(f=avg_accuracy))

        errors.append(avg_error)
        accuracies.append(avg_accuracy)

    log(action_logging_enum=INFO, logging_text="[DECISION TREE]: Optimization completed.")
def load_exp(filename, dataset, model_str, target, feature_subset, include_past_ys,
             n_prev_days, predict_d_plus, cv_folds=11):
    predict_pa = False
    y_subset = "sleep_metrics"
    keep_pids = True

    # Removes .pkl from file
    experiment_name = os.path.splitext(filename)[0]
    print("EXPERIMENT_NAME", experiment_name)

    df_per_day, df_per_hour, df_per_pid, df_keys, df_embeddings = get_dataframes(dataset, cv_folds)
    age_col = "sleepage5c" if dataset == "mesa" else "AGE_SUENO"

    print("LOG: dataset (%s), model (%s), target (%s), features (%s), days (%d), include_past (%s), predict_pa (%s)"
          % (dataset, model_str, target, '-'.join(feature_subset), n_prev_days, include_past_ys, predict_pa))

    data = get_data(n_prev_days, predict_pa, include_past_ys, df_per_day, df_per_hour,
                    df_per_pid, df_keys, df_embeddings, y_subset=y_subset,
                    x_subsets=feature_subset, y_label=target, keep_pids=keep_pids)

    # df_per_pid["sleep_hours"] = df_per_pid[age_col].apply(cdc)
    # data = pd.merge(data, df_per_pid[["sleep_hours", "pid"]])
    df_per_pid["participant_age"] = df_per_pid[age_col]
    data = pd.merge(data, df_per_pid[["participant_age", "pid"]])
    data = data.fillna(-1)
    data = modify_data_target(data, "participant_age", target)

    # Predicting day + 1, instead of day
    if predict_d_plus > 0:
        y = data[[target, "ml_sequence", "pid"]]
        x = data.drop(columns=[target])
        y["ml_sequence"] = y.groupby(["pid"])["ml_sequence"].apply(lambda x: x - predict_d_plus)
        data = pd.merge(x, y)

    cols_to_remove = ["ml_sequence", "pid", "participant_age"]  # , "sleep_hours"]
    for col in cols_to_remove:
        data = data.drop(columns=col)

    test_data = data[data["fold"] == cv_folds - 1]
    data = data[data["fold"] != cv_folds - 1]

    force_cat, force_num = force_categories(dataset, feature_subset)

    experiment = setup(data=data, test_data=test_data, target=target, session_id=123,
                       normalize=True, transformation=True,
                       fold_strategy="groupkfold", fold_groups="fold",
                       categorical_features=force_cat, numeric_features=force_num,
                       ignore_features=["fold"], silent=True, use_gpu=False)

    # scorer objects built here are not used further below
    make_scorer(f1_score, average="macro")
    make_scorer(f1_score, average="micro")
    add_metric(id='micro_f1', name="Micro F1",
               score_func=lambda x, y: f1_score(x, y, average="micro"),
               greater_is_better=True)
    add_metric(id='macro_f1', name="Macro F1",
               score_func=lambda x, y: f1_score(x, y, average="macro"),
               greater_is_better=True)

    # Metrics removed as they cause problems in the multiclass setting
    remove_metric('precision')
    remove_metric('recall')
    remove_metric('f1')

    unzip_pkl(experiment_name)
    loaded_model = load_model(experiment_name)
    delete_pkl(experiment_name)

    return loaded_model
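
# Illustrative aside (demo values only): on an imbalanced multiclass toy example the
# micro- and macro-averaged F1 differ, which is why both are registered above as
# separate metrics.
import numpy as np
from sklearn.metrics import f1_score

y_true_demo = np.array([0, 0, 0, 0, 0, 0, 1, 1, 2])
y_pred_demo = np.array([0, 0, 0, 0, 0, 1, 1, 2, 2])
print("micro F1:", f1_score(y_true_demo, y_pred_demo, average="micro"))
print("macro F1:", f1_score(y_true_demo, y_pred_demo, average="macro"))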
def fit(self, x, y):
    x_train = np.array(x)
    y_train = np.array(y).reshape(y.shape[0],)
    self.factor_name = list(x.columns)
    if self.parameters is None:
        self.set_parameters()

    # greater_is_better=False: the grid search must minimise MSE, so the scorer reports
    # it negated.
    scoring = {"mse": make_scorer(mean_squared_error, greater_is_better=False)}

    if self.method == 'linear':
        # Plain least squares has no hyperparameters to tune.
        self.reg_model = linear_model.LinearRegression()
    else:
        # Every other method shares the same GridSearchCV wrapper; only the base
        # estimator differs, so a dispatch table keeps the branches short.
        estimators = {
            'ridge': linear_model.Ridge(),
            'lasso': linear_model.Lasso(),
            'ElasticNet': linear_model.ElasticNet(),
            'pls': PLSRegression(),
            'svr': svm.SVR(),
            'knn': KNeighborsRegressor(),
            'dt': tree.DecisionTreeRegressor(),
            'rf': esb.RandomForestRegressor(),
            'adaBoost': esb.AdaBoostRegressor(),
            'gbm': esb.GradientBoostingRegressor(),
            'xgb': XGBRegressor(),
            'bp': neural_network.MLPRegressor(),
        }
        if self.method not in estimators:
            raise ValueError("Unsupported method: {}".format(self.method))
        self.reg_model = GridSearchCV(estimators[self.method],
                                      param_grid=self.parameters,
                                      cv=5, scoring=scoring, refit='mse')
    self.reg_model.fit(x_train, y_train)
    'max_depth': [None, 5, 10, 15, 20],
    'criterion': ['entropy', 'gini']
}

X_data, y_data = load_breast_cancer(return_X_y=True)
estimator = RandomForestClassifier(random_state=42)

print('Accuracy best params and score')
result = GridSearchCV(estimator, param_grid, cv=3, scoring='accuracy').fit(X_data, y_data)
print('\tParams:', result.best_params_)
print('\tScore:', result.best_score_)


def my_scorer(y_true, y_pred):
    precision, recall, _ = precision_recall_curve(y_true, y_pred[:, 1])
    return max([p for p, r in zip(precision, recall) if p < 1.5 * r and r > 0.5])


scorer = make_scorer(my_scorer, greater_is_better=True, needs_proba=True)

print('Custom loss best params and score')
result = GridSearchCV(estimator, param_grid, cv=3, scoring=scorer).fit(X_data, y_data)
print('\tParams:', result.best_params_)
print('\tScore:', result.best_score_)

# In[2]:
print(round(result.best_score_, 4))
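
# Illustrative aside (demo names only): the custom objective above scans the
# precision/recall curve built from positive-class probabilities and keeps the best
# precision among operating points with decent recall (r > 0.5 and p < 1.5 * r).
# How the probabilities reach my_scorer depends on the scikit-learn version (older
# releases pass the full predict_proba matrix, hence the y_pred[:, 1] indexing), so the
# check below calls the metric directly on predict_proba output.
from sklearn.datasets import load_breast_cancer
from sklearn.ensemble import RandomForestClassifier

X_chk, y_chk = load_breast_cancer(return_X_y=True)
rf_chk = RandomForestClassifier(random_state=42).fit(X_chk, y_chk)
proba_chk = rf_chk.predict_proba(X_chk)  # shape (n_samples, 2)
print('custom objective on training data:', my_scorer(y_chk, proba_chk))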
def _get_or_set_hyperparam(self, hyperparam, y=None):
    # If it's already set, move on.
    TUNEABLE_HYPERPARAMS = [
        'priors', 'n_estimators', 'learning_rate', 'max_depth',
        'min_samples_split', 'min_samples_leaf', 'max_features', 'C'
    ]
    if self._hyperparams.get(hyperparam):
        if hyperparam == 'algorithm':
            if self._hyperparams[hyperparam] not in SupervisedClassifier.SUPPORTED_ALGORITHMS:
                raise ValueError('Algorithm %s not supported.' % self._hyperparams[hyperparam])
        elif hyperparam == 'hyperparam_strategy':
            if self._hyperparams[hyperparam] not in SupervisedClassifier.HYPERPARAM_STRATEGIES:
                raise ValueError('Hyperparameter strategy %s not supported.' % self._hyperparams[hyperparam])
        # If the hyperparam has a relevant search space, set the search
        # space to the user defined value.
        if hyperparam in TUNEABLE_HYPERPARAMS:
            self._hyperparam_search_space[hyperparam] = [self._hyperparams[hyperparam]]
        return

    # Otherwise, define a decent initial value, based on algorithm.
    # If the hyperparam has a relevant search space, define it automatically.
    # Code sanitation note: please keep these conditions alphabetized =)
    if hyperparam == 'activation':  # NN
        self._hyperparams[hyperparam] = 'relu'
        self._hyperparam_search_space[hyperparam] = ['logistic', 'tanh', 'relu']
    elif hyperparam == 'adaboost_algorithm':  # ADABOOST, DECISION_TREE
        self._hyperparams[hyperparam] = 'SAMME.R'
    elif hyperparam == 'algorithm':  # SUPPORTED_ALGORITHMS
        self._hyperparams[hyperparam] = SupervisedClassifier.LOGISTIC_REGRESSION
    elif hyperparam == 'base_estimator':  # ADABOOST
        self._hyperparams[hyperparam] = 'DecisionTreeClassifier'
    elif hyperparam == 'bootstrap':  # RANDOM_FOREST
        self._hyperparams[hyperparam] = True
    elif hyperparam == 'C':  # LOGISTIC_REGRESSION
        self._hyperparams[hyperparam] = 10.0
        self._hyperparam_search_space[hyperparam] = [
            0.0001, 0.001, 0.01, 0.1, 1.0, 10.0, 100.0, 1000.0, 10000.0
        ]
    elif hyperparam == 'class_weight':  # ADABOOST, DECISION_TREE, LOGISTIC_REGRESSION, RANDOM_FOREST
        self._hyperparams[hyperparam] = 'balanced'
    elif hyperparam == 'colsample_bytree':  # XGB
        self._hyperparams[hyperparam] = 0.5
        self._hyperparam_search_space[hyperparam] = [0.6, 0.8, 1.0]
    elif hyperparam == 'criterion':  # DECISION_TREE, RANDOM_FOREST
        self._hyperparams[hyperparam] = 'gini'
    elif hyperparam == 'cv':  # SUPPORTED_ALGORITHMS
        self._hyperparams['cv'] = self._build_cv_generator(y)
    elif hyperparam == 'degree':  # SVM, when kernel='poly'.
        # TODO: in the future, do sub-case grid search as degree is only needed for poly
        self._hyperparams[hyperparam] = 3
        self._hyperparam_search_space[hyperparam] = [0, 1, 2, 3, 4, 5, 6]
    elif hyperparam == 'dual':  # LOGISTIC_REGRESSION
        self._hyperparams[hyperparam] = False
    elif hyperparam == 'fit_intercept':  # LOGISTIC_REGRESSION
        self._hyperparams[hyperparam] = True
    elif hyperparam == 'gamma':  # SVM, when non-linear kernel 'rbf', 'poly', 'sigmoid'
        self._hyperparams[hyperparam] = 'auto'
        self._hyperparam_search_space[hyperparam] = [0.1, 1, 10, 100]
    elif hyperparam == 'hidden_layer_sizes':  # NN
        self._hyperparams[hyperparam] = (10, 10, 10)
        self._hyperparam_search_space[hyperparam] = [(10,), (10, 10), (10, 10, 10, 10)]
    elif hyperparam == 'hyperparam_strategy':  # SUPPORTED_ALGORITHMS
        self._hyperparams[hyperparam] = SupervisedClassifier.STOCHASTIC_SEARCH
    elif hyperparam == 'kernel':  # SVM
        self._hyperparams[hyperparam] = 'rbf'
        self._hyperparam_search_space[hyperparam] = ['linear', 'poly', 'rbf', 'sigmoid']
    elif hyperparam == 'learning_rate':  # ADABOOST, XGB
        self._hyperparams[hyperparam] = 0.1
        self._hyperparam_search_space[hyperparam] = [0.001, 0.01, 0.1, 1.0, 10.0]
    elif hyperparam == 'max_depth':  # DECISION_TREE, RANDOM_FOREST, XGB (XGB does not allow 'None')
        self._hyperparams[hyperparam] = 3
        # Include 1, 2, 3 to bias towards simpler tree.
        self._hyperparam_search_space[hyperparam] = [1, 2, 3, 4, 5]
    elif hyperparam == 'max_features':  # DECISION_TREE, RANDOM_FOREST
        self._hyperparams[hyperparam] = 'sqrt'
        # Empirical good default values are max_features=n_features for
        # regression problems, and max_features=sqrt(n_features) for
        # classification tasks.
        # http://scikit-learn.org/stable/modules/ensemble.html#forest
        self._hyperparam_search_space[hyperparam] = ['sqrt', 'log2', None]
    elif hyperparam == 'max_iter':  # LOGISTIC_REGRESSION
        self._hyperparams[hyperparam] = 100
    elif hyperparam == 'min_child_weight':  # XGB
        self._hyperparams[hyperparam] = 3
        self._hyperparam_search_space[hyperparam] = [1, 5, 10]
    elif hyperparam == 'min_impurity_decrease':  # DECISION_TREE, RANDOM_FOREST
        self._hyperparams[hyperparam] = 0.0
    elif hyperparam == 'max_leaf_nodes':  # DECISION_TREE, RANDOM_FOREST
        self._hyperparams[hyperparam] = None
    elif hyperparam == 'min_samples_leaf':  # DECISION_TREE, RANDOM_FOREST
        self._hyperparams[hyperparam] = 1
        self._hyperparam_search_space[hyperparam] = [0.01, 0.1, 1, 10]
    elif hyperparam == 'min_samples_split':  # DECISION_TREE, RANDOM_FOREST
        self._hyperparams[hyperparam] = 2
        # Include 20 and .02 to bias towards simpler trees.
        self._hyperparam_search_space[hyperparam] = [0.02, 0.2, 2, 20]
    elif hyperparam == 'min_weight_fraction_leaf':  # DECISION_TREE, RANDOM_FOREST
        self._hyperparams[hyperparam] = 0.0
    elif hyperparam == 'multi_class':  # LOGISTIC_REGRESSION
        self._hyperparams[hyperparam] = 'ovr'
    elif hyperparam == 'n_estimators':  # ADABOOST, RANDOM_FOREST
        if self._hyperparams['algorithm'] == SupervisedClassifier.ADABOOST:
            self._hyperparams[hyperparam] = 30
            self._hyperparam_search_space[hyperparam] = [10, 20, 30, 40, 50]
        elif self._hyperparams['algorithm'] == SupervisedClassifier.RANDOM_FOREST:
            self._hyperparams[hyperparam] = 10
            # The larger the better, but the longer it will take to compute.
            self._hyperparam_search_space[hyperparam] = [2, 5, 10, 15, 20, 25]
    elif hyperparam == 'n_iter':
        # RandomizedSearchCV throws ValueError if n_iter is less than the
        # number of hyperparam options.
        num_hyperparam_settings = np.prod([
            len(value) for key, value in self._hyperparam_search_space.items()
        ])
        log.debug('num_hyperparam_settings: %s' % num_hyperparam_settings)
        self._hyperparams[hyperparam] = np.min([48, num_hyperparam_settings])
    elif hyperparam == 'n_jobs':  # SUPPORTED_ALGORITHMS
        # LOGISTIC_REGRESSION parallelization causes multiarray.so to crash.
        # Automatically switch to 1 core so others can ignore this =/
        if self._hyperparams['algorithm'] == SupervisedClassifier.LOGISTIC_REGRESSION:
            self._hyperparams[hyperparam] = 1
        elif self._hyperparams['algorithm'] == SupervisedClassifier.REGRESS_AND_ROUND:
            self._hyperparams[hyperparam] = 1
        else:
            self._hyperparams[hyperparam] = -1
    elif hyperparam == 'oob_score':  # RANDOM_FOREST
        self._hyperparams[hyperparam] = False
    elif hyperparam == 'penalty':  # LOGISTIC_REGRESSION
        self._hyperparams[hyperparam] = 'l1'
    elif hyperparam == 'presort':  # DECISION_TREE
        self._hyperparams[hyperparam] = False
    elif hyperparam == 'priors':  # GAUSSIAN_NAIVE_BAYES
        self._hyperparams[hyperparam] = None
        self._hyperparam_search_space[hyperparam] = [
            [0.0001, 0.9999], [0.001, 0.999], [0.01, 0.99], [0.05, 0.95],
            [0.1, 0.9], [0.25, 0.75], [0.5, 0.5], [0.75, 0.25], [0.9, 0.1],
            [0.95, 0.05], [0.99, 0.01], [0.999, 0.001], [0.9999, 0.0001]
        ]
    elif hyperparam == 'random_state':  # SUPPORTED_ALGORITHMS
        self._hyperparams[hyperparam] = None
    elif hyperparam == 'scoring':  # SUPPORTED_ALGORITHMS
        # Assume unbalanced classification problems, so use roc auc.
        # http://scikit-learn.org/stable/modules/grid_search.html#specifying-an-objective-metric
        scorer = make_scorer(roc_auc_score, needs_threshold=True)
        self._hyperparams['scoring'] = scorer
    elif hyperparam == 'solver':  # LOGISTIC_REGRESSION, NN
        if self._hyperparams['algorithm'] == SupervisedClassifier.LOGISTIC_REGRESSION or \
                self._hyperparams['algorithm'] == SupervisedClassifier.REGRESS_AND_ROUND:
            self._hyperparams[hyperparam] = 'saga'
        elif self._hyperparams['algorithm'] == SupervisedClassifier.NN:
            self._hyperparams[hyperparam] = 'adam'
            self._hyperparam_search_space[hyperparam] = ['lbfgs', 'sgd', 'adam']
    elif hyperparam == 'splitter':  # DECISION_TREE
        self._hyperparams[hyperparam] = 'best'
    elif hyperparam == 'subsample':  # XGB
        self._hyperparams[hyperparam] = 0.5
        self._hyperparam_search_space[hyperparam] = [0.6, 0.8, 1.0]
    elif hyperparam == 'tol':  # LOGISTIC_REGRESSION
        self._hyperparams[hyperparam] = 0.0001
    elif hyperparam == 'warm_start':  # RANDOM_FOREST
        self._hyperparams[hyperparam] = False
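
# Illustrative aside (demo names, pre-1.4 make_scorer signature as used above): wrapping
# roc_auc_score with needs_threshold=True yields the same kind of scorer as the predefined
# 'roc_auc' string, since both score continuous decision values rather than hard labels.
import numpy as np
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import get_scorer, make_scorer, roc_auc_score

X_demo, y_demo = make_classification(n_samples=200, random_state=0)
lr_demo = LogisticRegression(max_iter=1000).fit(X_demo, y_demo)
auc_scorer = make_scorer(roc_auc_score, needs_threshold=True)
assert np.isclose(auc_scorer(lr_demo, X_demo, y_demo),
                  get_scorer('roc_auc')(lr_demo, X_demo, y_demo))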
nombres = vectorizer.get_feature_names()
tablita = mutual_info_classif(X_train, y_train)
orden = np.argsort(tablita)
reduc = SelectKBest(mutual_info_classif, k=4001)
X_train = reduc.fit_transform(X_train, y_train)
buenos_2 = [(nombres[i], tablita[i]) for i in orden]
print(" Done!")
print(' New shape of training matrix: ', X_train.shape)

jobs = -1
paramGrid = []
nIter = 20
crossV = 10

# New performance scorer
myScorer = make_scorer(f1_score, average='weighted')

print("Defining randomized grid search...")
if args.classifier == 'SVM':
    # SVM
    classifier = SVC()
    if args.kernel == 'rbf':
        paramGrid = {
            'C': scipy.stats.expon(scale=100),
            'gamma': scipy.stats.expon(scale=.1),
            'kernel': ['rbf'],
            'class_weight': ['balanced', None]
        }
    elif args.kernel == 'linear':
        paramGrid = {
            'C': scipy.stats.expon(scale=100),
            'kernel': ['linear'],
    'vect__tokenizer': [None, stemming_tokenizer],
    'vect__ngram_range': [(1, 1), (1, 2)],
    'svm__kernel': ["linear", "rbf", "poly", "sigmoid"],
    'svm__degree': [2, 3],
    'svm__coef0': [0.0, 1.0],
    'svm__gamma': [1e-2, 1e-3, "auto"],
    'svm__C': [1, 5, 10]
}

## Create a Grid-Search-Cross-Validation object
## to find in an automated fashion the best combination of parameters.
grid_search = GridSearchCV(pipeline,
                           parameters,
                           # scoring=metrics.make_scorer(metrics.average_precision_score, average='weighted'),
                           scoring=metrics.make_scorer(metrics.matthews_corrcoef),
                           cv=10,
                           n_jobs=-1,
                           verbose=10)

## Start an exhaustive search to find the best combination of parameters
## according to the selected scoring-function.
print()
grid_search.fit(X_train, Y_train)
print()

## Print results for each combination of parameters.
number_of_candidates = len(grid_search.cv_results_['params'])
print("Results:")
for i in range(number_of_candidates):
    print(i, 'params - %s; mean - %0.3f; std - %0.3f' %
def semantic_loss(truth, pred):
    """Loss based on semantic (WordNet) dissimilarity between true and predicted labels."""
    error = 0
    for i in range(len(pred)):
        truth_i = wn.synsets(truth[i])[0]
        pred_i = wn.synsets(pred[i])[0]
        similarity = truth_i.path_similarity(pred_i)
        # path_similarity returns None when no path exists; treat that as maximal dissimilarity
        error += 1 - (similarity or 0)
    return error


# Grab our data
X = df['Color'].tolist()
y = df['Emotion'].tolist()

# Make scoring function (a loss, so lower is better)
scorer = make_scorer(semantic_loss, greater_is_better=False)

# Create new knn model
knn = KNeighborsClassifier()

# Specify values to test
param_grid = {'n_neighbors': np.arange(4, 25), 'weights': ['uniform', 'distance']}

# Use grid search to test all values for n_neighbors
knn_gscv = GridSearchCV(knn, param_grid, cv=5, scoring=scorer)

# Fit model to data
knn_gscv.fit(X, y)

# Check out best params
print(knn_gscv.best_params_)
        rel_id = label_encoder.transform([rel])[0]
        # print(rel_id, rel)
        stats_rel = [stat[rel_id] for stat in stats]
        results[rel].append(stats_rel)

    for rel in label_encoder.classes_:
        results[rel] = average_results(results[rel])
        if verbose:
            print_statistics_row(rel, results[rel])

    avg_result = macro_average_results(results)
    if verbose:
        print_statistics_footer(avg_result)
    return avg_result[2]  # return f_0.5 score as summary statistic


# A cross-validation check using the macro-averaged F0.5 score
f_scorer = make_scorer(fbeta_score, beta=0.5, average='macro')


def evaluateCV_check(classifier, X, y, verbose=True):
    if verbose:
        kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
        scores = cross_val_score(classifier, X, y, cv=kfold, scoring=f_scorer)
        print("\nCross-validation scores (StratifiedKFold): ", scores)
        print("Mean cv score (StratifiedKFold): ", scores.mean())


#########################################################################################
# 4. TEST PREDICTIONS and ANALYSIS
#########################################################################################
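
# Illustrative aside on the f_scorer above (demo values only): with beta=0.5 the F-beta
# score weights precision more heavily than recall, so a precision-favouring prediction
# scores higher under F0.5 than under F2.
import numpy as np
from sklearn.metrics import fbeta_score

y_true_demo = np.array([1, 1, 1, 1, 0, 0, 0, 0])
y_pred_demo = np.array([1, 1, 0, 0, 0, 0, 0, 1])  # precision 2/3, recall 1/2
assert fbeta_score(y_true_demo, y_pred_demo, beta=0.5) > fbeta_score(y_true_demo, y_pred_demo, beta=2)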
print(total_features)

### Extract features and labels from dataset for local testing
data = featureFormat(my_dataset, features_list, sort_keys=True)
labels, features = targetFeatureSplit(data)

### Build a pipeline
scaler = MinMaxScaler()
skb = SelectKBest(f_classif)
gnb = GaussianNB()
pipeline = Pipeline(steps=[("scaling", scaler), ("SKB", skb), ("NaiveBayes", gnb)])
SKB_params = {"SKB__k": range(1, 10)}
# pre-0.20 scikit-learn API: labels and n_iter are passed to StratifiedShuffleSplit directly
cv = StratifiedShuffleSplit(labels, n_iter=100, random_state=42)

# Use kappa_scorer as a metric to evaluate
kappa_scorer = make_scorer(cohen_kappa_score)
gs = GridSearchCV(pipeline, SKB_params, scoring=kappa_scorer, cv=cv)
gs.fit(features, labels)
print("best # of parameters to choose:", gs.best_params_)

clf = gs.best_estimator_

# Get the features selected by KBest
clf.named_steps['SKB'].get_support(indices=True)
features_selected = [
    features_list[1:][i]
    for i in clf.named_steps['SKB'].get_support(indices=True)
]
print(features_selected)

feature_score = clf.named_steps['SKB'].scores_
score_summary = {}
for i in range(len(feature_score)):