Example #1
def training(matrix, Y, SVM):
    """ def  training(matrix , Y , svm ):
			matrix: is the train data
			Y: is the labels in array
			svm: is a boolean. If svm == True we perform svm otherwise we perform AdaBoostClassifier

			return: cross_validation scores
	"""

    if SVM:
        classifier = svm.SVC()
    else:
        classifier = AdaBoostClassifier(n_estimators=300)

    precision_micro_scorer = metrics.make_scorer(custom_precision_micro_score)
    precision_macro_scorer = metrics.make_scorer(custom_precision_macro_score)
    recall_micro_scorer = metrics.make_scorer(custom_recall_micro_score)
    recall_macro_scorer = metrics.make_scorer(custom_recall_macro_score)

    precision_micro = cross_val_score(classifier, matrix, Y, cv=10, scoring=precision_micro_scorer)
    precision_macro = cross_val_score(classifier, matrix, Y, cv=10, scoring=precision_macro_scorer)
    recall_micro = cross_val_score(classifier, matrix, Y, cv=10, scoring=recall_micro_scorer)
    recall_macro = cross_val_score(classifier, matrix, Y, cv=10, scoring=recall_macro_scorer)

    return {"micro": (precision_micro, recall_micro), "macro": (precision_macro, recall_macro)}
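The custom_precision_*/custom_recall_* callables passed to make_scorer above are not shown in this snippet. A minimal sketch of what they might look like, assuming they simply wrap sklearn's precision/recall with a fixed averaging mode (hypothetical stand-ins, not the original author's code):

from sklearn import metrics

def custom_precision_micro_score(y_true, y_pred):
    # micro-averaged precision over all classes
    return metrics.precision_score(y_true, y_pred, average='micro')

def custom_precision_macro_score(y_true, y_pred):
    return metrics.precision_score(y_true, y_pred, average='macro')

def custom_recall_micro_score(y_true, y_pred):
    return metrics.recall_score(y_true, y_pred, average='micro')

def custom_recall_macro_score(y_true, y_pred):
    return metrics.recall_score(y_true, y_pred, average='macro')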
    def __init__(self, n_features=None, tuning_ranges=None, models=None, cv=None, njobs=1, pre_dispatch='2*n_jobs',
                 stack=True, verbose=False, metric='lad'):
        if metric.lower() not in ['lad', 'mse']:
            raise ValueError('Metric must be either lad or mse.')

        if tuning_ranges is None:
            if n_features is None:
                raise ValueError('Must supply one of n_features or tuning_ranges.')
            # use default values for grid search over tuning parameters for all models
            tuning_ranges = {'DecisionTreeRegressor': {'max_depth': [5, 10, 20, 50, None]},
                             'RandomForestRegressor': {'max_features':
                                                       list(np.unique(np.linspace(2, n_features, 5).astype(int)))},
                             'GbrAutoNtrees': {'max_depth': [1, 2, 3, 5, 10]}}
        if models is None:
            # initialize the list of sklearn objects corresponding to different statistical models
            models = []
            if 'DecisionTreeRegressor' in tuning_ranges:
                models.append(DecisionTreeRegressor())
            if 'RandomForestRegressor' in tuning_ranges:
                models.append(RandomForestRegressor(n_estimators=500, oob_score=True, n_jobs=njobs))
            if 'GbrAutoNtrees' in tuning_ranges:
                models.append(GbrAutoNtrees(subsample=0.75, n_estimators=500, learning_rate=0.01))

        super(RegressionSuite, self).__init__(tuning_ranges, models, cv, njobs, pre_dispatch, stack, verbose)

        self.scorer = make_scorer(accuracy_score)
        self.nfeatures = n_features
        self.metric = metric.lower()
        if self.metric == 'lad':
            self.scorer = make_scorer(mean_absolute_error, greater_is_better=False)
        elif self.metric == 'mse':
            self.scorer = make_scorer(mean_squared_error, greater_is_better=False)
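A small self-contained check (hypothetical data and estimator, not part of the snippet above) illustrating why the 'lad' and 'mse' scorers are built with greater_is_better=False: make_scorer then negates the error, so larger scorer values always mean better models.

import numpy as np
from sklearn.dummy import DummyRegressor
from sklearn.metrics import make_scorer, mean_absolute_error

X = np.arange(10, dtype=float).reshape(-1, 1)
y = np.arange(10, dtype=float)
est = DummyRegressor(strategy='mean').fit(X, y)

neg_mae = make_scorer(mean_absolute_error, greater_is_better=False)
print(neg_mae(est, X, y))                       # negated error: -2.5
print(mean_absolute_error(y, est.predict(X)))   # raw error: 2.5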
Example #3
def _make_scoring_r0(scoring):
    if scoring == 'r2':
        return metrics.make_scorer(metrics.r2_score)
    elif scoring == 'mean_absolute_error':
        return metrics.make_scorer(metrics.mean_absolute_error, greater_is_better=False)
    elif scoring == 'mean_squared_error':
        return metrics.make_scorer(metrics.mean_squared_error, greater_is_better=False)
    elif scoring == 'median_absolute_error':
        return metrics.make_scorer(metrics.median_absolute_error, greater_is_better=False)
    else:
        raise ValueError("Unsupported scoring: {}".format(scoring))
def _test_ridge_loo(filter_):
    # test that can work with both dense or sparse matrices
    n_samples = X_diabetes.shape[0]

    ret = []

    fit_intercept = filter_ == DENSE_FILTER
    ridge_gcv = _RidgeGCV(fit_intercept=fit_intercept)

    # check best alpha
    ridge_gcv.fit(filter_(X_diabetes), y_diabetes)
    alpha_ = ridge_gcv.alpha_
    ret.append(alpha_)

    # check that we get same best alpha with custom loss_func
    f = ignore_warnings
    scoring = make_scorer(mean_squared_error, greater_is_better=False)
    ridge_gcv2 = RidgeCV(fit_intercept=False, scoring=scoring)
    f(ridge_gcv2.fit)(filter_(X_diabetes), y_diabetes)
    assert ridge_gcv2.alpha_ == pytest.approx(alpha_)

    # check that we get same best alpha with custom score_func
    func = lambda x, y: -mean_squared_error(x, y)
    scoring = make_scorer(func)
    ridge_gcv3 = RidgeCV(fit_intercept=False, scoring=scoring)
    f(ridge_gcv3.fit)(filter_(X_diabetes), y_diabetes)
    assert ridge_gcv3.alpha_ == pytest.approx(alpha_)

    # check that we get same best alpha with a scorer
    scorer = get_scorer('neg_mean_squared_error')
    ridge_gcv4 = RidgeCV(fit_intercept=False, scoring=scorer)
    ridge_gcv4.fit(filter_(X_diabetes), y_diabetes)
    assert ridge_gcv4.alpha_ == pytest.approx(alpha_)

    # check that we get same best alpha with sample weights
    if filter_ == DENSE_FILTER:
        ridge_gcv.fit(filter_(X_diabetes), y_diabetes,
                      sample_weight=np.ones(n_samples))
        assert ridge_gcv.alpha_ == pytest.approx(alpha_)

    # simulate several responses
    Y = np.vstack((y_diabetes, y_diabetes)).T

    ridge_gcv.fit(filter_(X_diabetes), Y)
    Y_pred = ridge_gcv.predict(filter_(X_diabetes))
    ridge_gcv.fit(filter_(X_diabetes), y_diabetes)
    y_pred = ridge_gcv.predict(filter_(X_diabetes))

    assert_allclose(np.vstack((y_pred, y_pred)).T,
                    Y_pred, rtol=1e-5)

    return ret
Example #5
def test_permutation_score():
    iris = load_iris()
    X = iris.data
    X_sparse = coo_matrix(X)
    y = iris.target
    svm = SVC(kernel='linear')
    cv = cval.StratifiedKFold(y, 2)

    score, scores, pvalue = cval.permutation_test_score(
        svm, X, y, cv=cv, scoring="accuracy")
    assert_greater(score, 0.9)
    assert_almost_equal(pvalue, 0.0, 1)

    score_label, _, pvalue_label = cval.permutation_test_score(
        svm, X, y, cv=cv, scoring="accuracy", labels=np.ones(y.size),
        random_state=0)
    assert_true(score_label == score)
    assert_true(pvalue_label == pvalue)

    # test with custom scoring object
    scorer = make_scorer(fbeta_score, beta=2)
    score_label, _, pvalue_label = cval.permutation_test_score(
        svm, X, y, scoring=scorer, cv=cv, labels=np.ones(y.size),
        random_state=0)
    assert_almost_equal(score_label, .97, 2)
    assert_almost_equal(pvalue_label, 0.01, 3)

    # check that we obtain the same results with a sparse representation
    svm_sparse = SVC(kernel='linear')
    cv_sparse = cval.StratifiedKFold(y, 2)
    score_label, _, pvalue_label = cval.permutation_test_score(
        svm_sparse, X_sparse, y, cv=cv_sparse,
        scoring="accuracy", labels=np.ones(y.size), random_state=0)

    assert_true(score_label == score)
    assert_true(pvalue_label == pvalue)

    # set random y
    y = np.mod(np.arange(len(y)), 3)

    score, scores, pvalue = cval.permutation_test_score(svm, X, y, cv=cv,
                                                        scoring="accuracy")

    assert_less(score, 0.5)
    assert_greater(pvalue, 0.2)

    # test with deprecated interface
    with warnings.catch_warnings(record=True):
        score, scores, pvalue = cval.permutation_test_score(
            svm, X, y, scoring=make_scorer(accuracy_score), cv=cv)
    assert_less(score, 0.5)
    assert_greater(pvalue, 0.2)
def test_cross_val_score_multilabel():
    X = np.array([[-3, 4], [2, 4], [3, 3], [0, 2], [-3, 1], [-2, 1], [0, 0], [-2, -1], [-1, -2], [1, -2]])
    y = np.array([[1, 1], [0, 1], [0, 1], [0, 1], [1, 1], [0, 1], [1, 0], [1, 1], [1, 0], [0, 0]])
    clf = KNeighborsClassifier(n_neighbors=1)
    scoring_micro = make_scorer(precision_score, average="micro")
    scoring_macro = make_scorer(precision_score, average="macro")
    scoring_samples = make_scorer(precision_score, average="samples")
    score_micro = cval.cross_val_score(clf, X, y, scoring=scoring_micro, cv=5)
    score_macro = cval.cross_val_score(clf, X, y, scoring=scoring_macro, cv=5)
    score_samples = cval.cross_val_score(clf, X, y, scoring=scoring_samples, cv=5)
    assert_almost_equal(score_micro, [1, 1 / 2, 3 / 4, 1 / 2, 1 / 3])
    assert_almost_equal(score_macro, [1, 1 / 2, 3 / 4, 1 / 2, 1 / 4])
    assert_almost_equal(score_samples, [1, 1 / 2, 3 / 4, 1 / 2, 1 / 4])
Example #7
def make_scoring(scoring):
    """
    Score is reversed if greater_is_better is False.
    """
    if scoring == 'r2':
        return metrics.make_scorer(metrics.r2_score)
    elif scoring == 'mean_absolute_error':
        return metrics.make_scorer(metrics.mean_absolute_error, greater_is_better=False)
    elif scoring == 'mean_squared_error':
        return metrics.make_scorer(metrics.mean_squared_error, greater_is_better=False)
    elif scoring == 'median_absolute_error':
        return metrics.make_scorer(metrics.median_absolute_error, greater_is_better=False)
    else:
        raise ValueError("Unsupported scoring: {}".format(scoring))
def test_grid_search_sparse_scoring():
    X_, y_ = make_classification(n_samples=200, n_features=100, random_state=0)

    clf = LinearSVC()
    cv = GridSearchCV(clf, {'C': [0.1, 1.0]}, scoring="f1")
    cv.fit(X_[:180], y_[:180])
    y_pred = cv.predict(X_[180:])
    C = cv.best_estimator_.C

    X_ = sp.csr_matrix(X_)
    clf = LinearSVC()
    cv = GridSearchCV(clf, {'C': [0.1, 1.0]}, scoring="f1")
    cv.fit(X_[:180], y_[:180])
    y_pred2 = cv.predict(X_[180:])
    C2 = cv.best_estimator_.C

    assert_array_equal(y_pred, y_pred2)
    assert_equal(C, C2)
    # Smoke test the score
    #np.testing.assert_allclose(f1_score(cv.predict(X_[:180]), y[:180]),
    #                        cv.score(X_[:180], y[:180]))

    # test loss where greater is worse
    def f1_loss(y_true_, y_pred_):
        return -f1_score(y_true_, y_pred_)
    F1Loss = make_scorer(f1_loss, greater_is_better=False)
    cv = GridSearchCV(clf, {'C': [0.1, 1.0]}, scoring=F1Loss)
    cv.fit(X_[:180], y_[:180])
    y_pred3 = cv.predict(X_[180:])
    C3 = cv.best_estimator_.C

    assert_equal(C, C3)
    assert_array_equal(y_pred, y_pred3)
def buildCoordinationTreeRegressor(predictorColumns, element, coordinationDir = 'coordination/', md = None):
    """
    Build a coordination predictor for a given element from compositional structure data of structures containing that element. Will return a model trained on all data, a mean_absolute_error score, and a table of true vs. predicted values
    """
    try:
        df = pd.read_csv(coordinationDir + element + '.csv')
    except Exception:
        print 'No data for ' + element
        return None, None, None
    df = df.dropna()
    if('fracNobleGas' in df.columns):
        df = df[df['fracNobleGas'] <= 0]
    
    if(len(df) < 4):
        print 'Not enough data for ' + element
        return None, None, None
    s = StandardScaler()
    
    X = s.fit_transform(df[predictorColumns].astype('float64'))
    y = df['avgCoordination'].values

    rfr = RandomForestRegressor(max_depth = md)
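    # Note: make_scorer(mean_absolute_error) keeps the default greater_is_better=True,
    # so the cross_val_score values below are raw (positive) MAE per fold; other
    # snippets on this page pass greater_is_better=False to get negated errors instead.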
    acc = mean(cross_val_score(rfr, X, y, scoring=make_scorer(mean_absolute_error)))

    X_train, X_test, y_train, y_test = train_test_split(X,y)
    rfr.fit(X_train,y_train)
    y_predict = rfr.predict(X_test)
    
    t = pd.DataFrame({'True':y_test, 'Predicted':y_predict})
    
    rfr.fit(X, y)

    return rfr, t, round(acc,2)
def run_gridsearch(clf, parameters, X_train, y_train, X_test, y_test):
    """cross-validated optimised parameter search"""
    start = time.time()
    
    #Scorer object
    scorer = make_scorer(accuracy_score, greater_is_better=True)
    
    #Gridsearch
    tuned_clf = GridSearchCV(clf, parameters,scoring=scorer)
    
    print "Final Model: "
    
    tuned_clf.fit(X_train, y_train)
    print "Best Parameters: {:}".format(tuned_clf.best_params_)
    
    #Calculate Accuracy for tuned clasifier
    est = tuned_clf.best_estimator_ 
    tuned_pred = est.predict(X_test)
    
    print "accuracy score for tuned classifier: {:.3f}".format(accuracy_score(y_test, tuned_pred))
    print "Training set: {} samples".format(X_train.shape[0])
    print "Test set: {} samples".format(X_test.shape[0])
    
    end = time.time()
    print "Grid search time (secs): {:.3f}".format(end - start)
Example #11
def main(argv):

    pd.set_option('display.width', 200)
    pd.set_option('display.height', 500)

    warnings.filterwarnings("ignore")

    global file_path, RMSLE_scorer

    # RMSLE_scorer
    RMSLE_scorer = metrics.make_scorer(RMSLE, greater_is_better = False)

    if(platform.system() == "Windows"):
        file_path = 'C:/Python/Others/data/Kaggle/Caterpillar_Tube_Pricing/'
    else:
        file_path = '/home/roshan/Desktop/DS/Others/data/Kaggle/Caterpillar_Tube_Pricing/'

########################################################################################################################
#Read the input file , munging and splitting the data to train and test
########################################################################################################################
    Train_DS      = pd.read_csv(file_path+'competition_data/train_set.csv',sep=',')
    Actual_DS     = pd.read_csv(file_path+'competition_data/test_set.csv',sep=',')
    Tube_DS       = pd.read_csv(file_path+'competition_data/tube.csv',sep=',')
    Bill_DS       = pd.read_csv(file_path+'competition_data/bill_of_materials.csv',sep=',')
    Spec_DS       = pd.read_csv(file_path+'competition_data/specs.csv',sep=',')
    Tube_End_DS   = pd.read_csv(file_path+'competition_data/tube_end_form.csv',sep=',')
    Comp_DS       = pd.read_csv(file_path+'competition_data/components_2.csv',sep=',')
    Sample_DS     = pd.read_csv(file_path+'sample_submission.csv',sep=',')


    Train_DS, Actual_DS, y =  Data_Munging(Train_DS,Actual_DS,Tube_DS,Bill_DS,Spec_DS,Tube_End_DS, Comp_DS)

    pred_Actual = RFR_Regressor(Train_DS, y, Actual_DS, Sample_DS, grid=False)
def RunExp(StrModel:str, Param:str, FeaUsed:list, DataPath:str, Label:str, StrMeasure:str, std:bool = False, N:int = 0):
	Data = np.genfromtxt(DataPath + Label, delimiter = ',', dtype = int)
	Data = Data[:, np.newaxis]

	for f in FeaUsed:
		T = (np.genfromtxt(DataPath + Features[f], delimiter = ',' , dtype = float))
		if len(T.shape) < 2:
			T = T[:, np.newaxis]
		Data = np.concatenate((Data, T), axis = 1)

	if N > 0:
		Data = Data[:N, :]

	Lbl = Data[:, 0]
	Fea = Data[:,1:]
	if std:
		scaler = preprocessing.StandardScaler()
		Fea = scaler.fit_transform(Fea)

	Model = base.clone(Models[StrModel])
	SetParam(Model, Param)

	Model.fit(Fea, Lbl)
	Pred = Model.predict(Fea)
	st = Measures[StrMeasure](Lbl, Pred)
		
	sv = cross_validation.cross_val_score(base.clone(Models[StrModel]), Fea, Lbl, metrics.make_scorer(Measures[StrMeasure]), cv = 5, n_jobs = 5)

	return st, np.mean(sv)
Example #13
def buildTreeRegressor(predictorColumns, structurestable = 'structures.csv',  targetcolumn = 'c_a', md = None):
    """
    Build a random forest-regressor model to predict some structure feature from compositional data.  Will return the model trained on all data, a mean_absolute_error score, and a table of true vs. predicted values
    """
    df = pd.read_csv(structurestable)
    df = df.dropna()
    if('fracNobleGas' in df.columns):
        df = df[df['fracNobleGas'] <= 0]
    
    s = StandardScaler()
    
    X = s.fit_transform(df[predictorColumns].astype('float64'))
    y = df[targetcolumn].values

    rfr = RandomForestRegressor(max_depth = md)
    acc = mean(cross_val_score(rfr, X, y, scoring=make_scorer(mean_absolute_error)))

    X_train, X_test, y_train, y_test = train_test_split(X,y)
    rfr.fit(X_train,y_train)
    y_predict = rfr.predict(X_test)
    
    t = pd.DataFrame({'True':y_test, 'Predicted':y_predict})
    
    rfr.fit(X, y)

    return rfr, t, round(acc,2)
def fit_predict_model(city_data):
    """Find and tune the optimal model. Make a prediction on housing data."""

    # Get the features and labels from the Boston housing data
    X, y = city_data.data, city_data.target

    # Setup a Decision Tree Regressor
    regressor = DecisionTreeRegressor()

    parameters = {'max_depth':(1,2,3,4,5,6,7,8,9,10)}

    scorer = metrics.make_scorer(performance_metric, greater_is_better=False)

    # 1. Find an appropriate performance metric. This should be the same as the
    # one used in your performance_metric procedure above:
    # http://scikit-learn.org/stable/modules/generated/sklearn.metrics.make_scorer.html

    # 2. We will use grid search to fine tune the Decision Tree Regressor and
    # obtain the parameters that generate the best training performance. Set up
    # the grid search object here.
    # http://scikit-learn.org/stable/modules/generated/sklearn.grid_search.GridSearchCV.html#sklearn.grid_search.GridSearchCV

    reg = grid_search.GridSearchCV(regressor, parameters, scoring=scorer, verbose=True)

    # Fit the learner to the training data to obtain the best parameter set
    print "Final Model: "
    print reg.fit(X, y)

    # Use the model to predict the output of a particular sample
    x = [11.95, 0.00, 18.100, 0, 0.6590, 5.6090, 90.00, 1.385, 24, 680.0, 20.20, 332.09, 12.13]
    y = reg.predict(x)
    print "House: " + str(x)
    print "Prediction: " + str(y)
def main():
    #Load the data. A pandas DataFrame is returned.Only the training data is selected
    data = load_data()[0]
    selected_columns = ["baselineDAS", "Age", "Gender"]
    y = np.array(data["Response.deltaDAS"])
    x = np.array(data[selected_columns])
    #Build a method with the dictionary and another one with the grid of parameters
    methods = {
        'sumSVR': sumSVR
    }

    params_grid = {
        'sumSVR': {
            'dim': [3],
            'C': np.arange(0.1, 2, 0.4),
            'epsilon': np.arange(0.01, 0.1, 0.02),
            #'degree': np.arange(1, 10),
            'kernel_functions':[[cosine_similarity, cosine_similarity, DiracKernel]],
            'w': list(product(range(1,6), repeat=3))
        }
    }
    #Build and run the comparison. tr_scoring has to be constructed like it is shown here.
    comp = MethodComparison(methods, params_grid=params_grid)
    scores = comp.process(x, y, scorer, repeats=10, train_cv=3,
                          tr_scoring=make_scorer(scorer, greater_is_better=True), n_jobs=8)

    return scores
Example #16
def svm_grid_search():

	#get data
	training_input,training_target,validation_input,validation_target = prepare_input()

	#set up scorer for grid search. log-loss is error, not score, so set greater_is_better to false,
	#and log-loss requires a probability
	log_loss_scorer = make_scorer(log_loss,greater_is_better=False,needs_proba=True)

	training_input = training_input[:100000]
	training_target = training_target[:100000]

	print training_input.shape[0]
	print training_target.shape[0]

	start = time.time()
	svm = SVC(random_state=31,probability=True)
	
	
	svm_parameters = {'C':[.001,.01,.1,1,10,100],'kernel':["rbf","sigmoid"]}
	svm_grid_obj = GridSearchCV(svm,svm_parameters,log_loss_scorer,verbose=2,n_jobs=-1)
	svm_grid_obj = svm_grid_obj.fit(training_input,training_target)
	svm = svm_grid_obj.best_estimator_
	print "Best params: " + str(svm_grid_obj.best_params_)	
	svm_train_error = log_loss(training_target,svm.predict_proba(training_input))
	svm_validation_error = log_loss(validation_target,svm.predict_proba(validation_input))
	print "Best SVM training error: {:02.4f}".format(svm_train_error)
	print "Best SVM validation error: {:02.4f}".format(svm_validation_error)
	end = time.time()
	print "RF grid search took {:02.4f} seconds".format(end-start)

	return svm
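The log-loss scorer pattern used above, shown on a small standalone example (classifier and data are placeholders). needs_proba=True makes the scorer call predict_proba; newer scikit-learn releases replace this keyword with response_method, so treat this as a sketch for the older API used throughout these snippets.

from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import make_scorer, log_loss
from sklearn.model_selection import cross_val_score

X, y = make_classification(n_samples=300, n_features=10, random_state=0)
log_loss_scorer = make_scorer(log_loss, greater_is_better=False, needs_proba=True)
scores = cross_val_score(LogisticRegression(max_iter=1000), X, y,
                         scoring=log_loss_scorer, cv=5)
print(-scores.mean())   # mean log-loss across folds (positive, lower is better)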
def fit_predict_model(city_data):
    """Find and tune the optimal model. Make a prediction on housing data."""

    # Get the features and labels from the Boston housing data
    X, y = city_data.data, city_data.target

    # Setup a Decision Tree Regressor
    regressor = DecisionTreeRegressor()

    parameters = {'max_depth':(1,2,3,4,5,6,7,8,9,10)}

    # 1. Find the best performance metric
    # should be the same as your performance_metric procedure
    # http://scikit-learn.org/stable/modules/generated/sklearn.metrics.make_scorer.html
    MSE_scorer = make_scorer(mean_squared_error) 
    # 2. Use gridearch to fine tune the Decision Tree Regressor and find the best model
    # http://scikit-learn.org/stable/modules/generated/sklearn.grid_search.GridSearchCV.html#sklearn.grid_search.GridSearchCV
    reg = grid_search.GridSearchCV(regressor, parameters, MSE_scorer)
    # Fit the learner to the training data
    print "Final Model: "
    print reg.fit(X, y)
    
    # Use the model to predict the output of a particular sample
    x = [11.95, 0.00, 18.100, 0, 0.6590, 5.6090, 90.00, 1.385, 24, 680.0, 20.20, 332.09, 12.13]
    y = reg.predict(x)
    print "House: " + str(x)
    print "Prediction: " + str(y)
def fit_predict_model(city_data):
    """Find and tune the optimal model. Make a prediction on housing data."""

    # Get the features and labels from the Boston housing data
    # and shuffle them randomly
    X, y = shuffle(city_data.data, city_data.target)

    # Setup a Decision Tree Regressor
    regressor = DecisionTreeRegressor()

    parameters = {"max_depth": (1, 2, 3, 4, 5, 6, 7, 8, 9, 10)}

    # 1. Find the best performance metric
    # should be the same as your performance_metric procedure
    # http://scikit-learn.org/stable/modules/generated/sklearn.metrics.make_scorer.html
    scorer = make_scorer(METRIC, greater_is_better=False)

    # 2. Use gridearch to fine tune the Decision Tree Regressor and find the best model
    # http://scikit-learn.org/stable/modules/generated/sklearn.grid_search.GridSearchCV.html#sklearn.grid_search.GridSearchCV
    reg = grid_search.GridSearchCV(regressor, parameters, scoring=scorer, cv=10)

    # Fit the learner to the training data
    print "Final Model: "
    print reg.fit(X, y)
    print "Best model parameter:  " + str(reg.best_params_)
    print "Best model estimator:  " + str(reg.best_estimator_)

    # Use the model to predict the output of a particular sample
    x = [11.95, 0.00, 18.100, 0, 0.6590, 5.6090, 90.00, 1.385, 24, 680.0, 20.20, 332.09, 12.13]
    y = reg.predict(x)
    print "House: " + str(x)
    print "Prediction: " + str(y)

    return (reg.best_params_["max_depth"], y)
Example #19
def rf_grid_search():

	train_inp,valid_inp,train_target,valid_target = prepare_input()
	#set up scorer for grid search. log-loss is error, not score, so set greater_is_better to false,
	#and log-loss requires a probability
	log_loss_scorer = make_scorer(log_loss,greater_is_better=False,needs_proba=True)

	train_inp = train_inp[:100000]
	train_target = train_target[:100000]

	start = time.time()
	random_forest = RandomForestClassifier(random_state=31)
	# r_forest_parameters = {'n_estimators' : [120,300,500,800,1200],'max_depth':[5,8,15,25,30,None],'max_features':['log2','sqrt',None],
	# 'min_samples_split':[1,2,5,10,15,100],'min_samples_leaf':[1,2,5,10]}
	
	#75.1 minutes to run with these parameters - 72 fits

	r_forest_parameters = {'min_samples_split':[2,5,10,20,50,100],'min_samples_leaf':[1,2,5,10,50,100]}
	#grid search too slow to not use all cores, and wayyyy too slow to have no output.
	r_forest_grid_obj = GridSearchCV(random_forest,r_forest_parameters,log_loss_scorer,verbose=2,n_jobs=-1)
	r_forest_grid_obj = r_forest_grid_obj.fit(train_inp,train_target)
	random_forest = r_forest_grid_obj.best_estimator_
	print "Best params: " + str(r_forest_grid_obj.best_params_)	
	random_forest_train_error = log_loss(train_target,random_forest.predict_proba(train_inp))
	random_forest_validation_error = log_loss(valid_target,random_forest.predict_proba(valid_inp))
	print "Best random forest training error: {:02.4f}".format(random_forest_train_error)
	print "Best random forest validation error: {:02.4f}".format(random_forest_validation_error)
	end = time.time()
	print "RF grid search took {:02.4f} seconds".format(end-start)

	return random_forest
def fit_predict_model(city_data):
    """Find and tune the optimal model. Make a prediction on housing data."""

    # Get the features and labels from the Boston housing data
    X, y = city_data.data, city_data.target

    # Setup a Decision Tree Regressor
    regressor = DecisionTreeRegressor()

    parameters = {"max_depth": (1, 2, 3, 4, 5, 6, 7, 8, 9, 10)}

    ###################################
    ### Step 4. YOUR CODE GOES HERE ###
    ###################################

    # 1. Find the best performance metric
    # should be the same as your performance_metric procedure
    # http://scikit-learn.org/stable/modules/generated/sklearn.metrics.make_scorer.html
    regScorer = make_scorer(performance_metric, greater_is_better=False)

    # 2. Use gridearch to fine tune the Decision Tree Regressor and find the best model
    # http://scikit-learn.org/stable/modules/generated/sklearn.grid_search.GridSearchCV.html#sklearn.grid_search.GridSearchCV
    reg = GridSearchCV(regressor, parameters, scoring=regScorer)

    # Fit the learner to the training data
    print "Final Model: "
    print reg.fit(X, y)
    print "Optimal parameter: " + ` reg.best_params_ `
    print "Best Score: " + ` reg.best_score_ `
    print reg.grid_scores_

    x = [11.95, 0.00, 18.100, 0, 0.6590, 5.6090, 90.00, 1.385, 24, 680.0, 20.20, 332.09, 12.13]
    z = reg.predict(x)
    print "House: " + str(x)
    print "Prediction: " + str(z)
Example #21
def test():
    x_train,y_train=load_svmlight_file("D:/traindata/12trainset")
    x_train.todense()
    x_test,y_test=load_svmlight_file("D:/traindata/12testset")
    x_test.todense()
    print(x_train.shape)
    #classifier
    clf=SVC(kernel='rbf')
    ovrclf=OneVsRestClassifier(clf,-1)
    #parameter
    parameters=[{'estimator__C':[2**-5,2**-4,2**-3,2**-2,2**-1,1,2**1,2**2,2**3,2**4,2**5],
                 'estimator__kernel':['rbf'],
                 'estimator__gamma':[2**-5,2**-4,2**-3,2**-2,2**-1,1,2**1,2**2,2**3,2**4,2**5]},
                {'estimator__C':[2**-5,2**-4,2**-3,2**-2,2**-1,1,2**1,2**2,2**3,2**4,2**5],
                 'estimator__kernel':['linear']}]
    para={'estimator__C':[2**-5,2**-4],
                 'estimator__kernel':['rbf'],
                 'estimator__gamma':[2**-1,1]}
    #scoring
    sougou_score=make_scorer(score_func,greater_is_better=False)
    #cross_validation iterator
    sfk=c_v.StratifiedKFold(y_train,shuffle=True,n_folds=5,random_state=0)
    #grid search
    gsclf=g_s.GridSearchCV(ovrclf,param_grid=para,cv=sfk,scoring=sougou_score)
    gsclf.fit(x_train,y_train)
    print("best score: ",gsclf.best_score_)
    print("best parameters: ",gsclf.best_params_)
    y_pred=gsclf.predict(x_test)

    #result
    target_names=['0','1','2','3']
    sum_y = np.sum((np.array(y_pred)-np.array(y_test))**2)
    print(classification_report(y_test,y_pred,target_names=target_names))
    print("sougouVal: ",float(sum_y)/y_pred.shape[0])
    print(time.time()-start_time)
def fit_model(X, y):
    """ Performs grid search over the 'max_depth' parameter for a 
        decision tree regressor trained on the input data [X, y]. """
    
    # Create cross-validation sets from the training data
    cv_sets = ShuffleSplit(X.shape[0], n_iter = 10, test_size = 0.20, random_state = 0)

    # TODO: Create a decision tree regressor object
    regressor = DecisionTreeRegressor()
    

    # TODO: Create a dictionary for the parameter 'max_depth' with a range from 1 to 10
    params = {'max_depth':(1,2,3,4,5,6,7,8,9,10)}

    # TODO: Transform 'performance_metric' into a scoring function using 'make_scorer' 
    scoring_fnc = make_scorer(performance_metric, greater_is_better=True)

    # TODO: Create the grid search object
    grid = GridSearchCV(regressor, params, scoring=scoring_fnc, verbose=True)

    # Fit the grid search object to the data to compute the optimal model
    grid = grid.fit(X, y)

    # Return the optimal model after fitting the data
    return grid.best_estimator_
Example #23
def ada_boost():
    savefile = open('traindata.pkl', 'rb')
    (x_train, y_train, t1) = cPickle.load(savefile)
    savefile.close()
    savefile = open('testdata.pkl', 'rb')
    (x_test, t1, name1) = cPickle.load(savefile)
    savefile.close()
    
#    X_train, X_valid, y_train, y_valid = cross_validation.train_test_split(
#    X, y, test_size=0.1, random_state=42)
    
    x_train = np.asarray(x_train,dtype=np.float32)
    y_train = np.asarray(y_train, dtype='int32')-1   
    
    nest = 190
    lr = .1
    md = 6
#    clf1 = DecisionTreeClassifier(max_depth=2)
#    clf = AdaBoostClassifier(clf1, n_estimators=200, learning_rate=.25)
    clf = GradientBoostingClassifier(n_estimators=nest, learning_rate=lr, max_depth=md, random_state=0)
#    clf = RandomForestClassifier(n_estimators=200) #.81
#    clf = ExtraTreesClassifier(n_estimators=1000, max_depth=None, min_samples_split=10, random_state=0,n_jobs=8) #.81
#    clf = KNeighborsClassifier(15)
    if 1:
        clf.fit(x_train, y_train)
        ypred = clf.predict_proba(x_test)
        y_str = ['Class_1','Class_2','Class_3','Class_4','Class_5','Class_6','Class_7','Class_8','Class_9']
        kcsv.print_csv(ypred, name1, y_str,indexname='id')
        print (nest, lr, md) 
    
    if 0:
        multiclass_log_loss = make_scorer(score_func=logloss_mc, greater_is_better=True, needs_proba=True)
        scores = cross_val_score(clf, x_train, y_train, n_jobs=8, cv=5,scoring=multiclass_log_loss)
        print scores
        print (nest, lr, md, scores.mean())  
Example #24
def main(argv):

    pd.set_option("display.width", 200)
    pd.set_option("display.height", 500)

    warnings.filterwarnings("ignore")

    global file_path, gini_scorer

    # Normalized Gini Scorer
    gini_scorer = metrics.make_scorer(normalized_gini, greater_is_better=True)

    if platform.system() == "Windows":
        file_path = "C:/Python/Others/data/Kaggle/Liberty_Mutual_Group/"
    else:
        file_path = "/home/roshan/Desktop/DS/Others/data/Kaggle/Liberty_Mutual_Group/"

    ########################################################################################################################
    # Read the input file , munging and splitting the data to train and test
    ########################################################################################################################
    Train_DS = pd.read_csv(file_path + "train.csv", sep=",", index_col=0)
    Actual_DS = pd.read_csv(file_path + "test.csv", sep=",", index_col=0)
    Sample_DS = pd.read_csv(file_path + "sample_submission.csv", sep=",")
    Parms_XGB_DS = pd.read_csv(file_path + "Parms_DS_XGB_1001.csv", sep=",")
    Parms_RF_DS = pd.read_csv(file_path + "Parms_DS_RF2.csv", sep=",")

    Train_DS, Actual_DS, y = Data_Munging(Train_DS, Actual_DS)

    pred_Actual = RFR_Regressor(Train_DS, y, Actual_DS, Sample_DS, Parms_RF_DS, Grid=False, Ensemble=False)
Example #25
def plot_validation_curve(clf, X, y, param, name=None):
    try:
        name = clf.__class__.__name__ if name is None else name
        if param is None:
                return
        scorer = metrics.make_scorer(metrics.average_precision_score)
        train_scores, test_scores = validation_curve(clf, X, y, cv=5,
                scoring=scorer, n_jobs=-1, param_name=param['name'],
                param_range=param['range'])
        train_scores_mean = np.mean(train_scores, axis=1)
        train_scores_std = np.std(train_scores, axis=1)
        test_scores_mean = np.mean(test_scores, axis=1)
        test_scores_std = np.std(test_scores, axis=1)

        plt.title('Validation Curve of {} varying {}'.format(name, param['name']))
        plt.xlabel(param['name'])
        plt.ylabel("Score")
        plt.ylim(-0.05, 1.05)
        plt.xlim(min(param['range']), max(param['range']))
        plt.plot(param['range'], train_scores_mean, label='Training score', color='r')
        plt.fill_between(param['range'], train_scores_mean - train_scores_std,
                         train_scores_mean + train_scores_std, alpha=0.2, color='r')
        plt.plot(param['range'], test_scores_mean, label='Cross-validation score',
                     color="g")
        plt.fill_between(param['range'], test_scores_mean - test_scores_std,
                         test_scores_mean + test_scores_std, alpha=0.2, color='g')
        plt.legend(loc='lower right')
        plt.savefig(name+'_'+param['name']+'_validationcurve.png')
        plt.clf()
    except Exception as e:
        print('ERROR: {}, {}'.format(name, str((e))))
        pass
def fit_model(X, y):
    """ Tunes a decision tree regressor model using GridSearchCV on the input data X 
        and target labels y and returns this optimal model. """

    # Create a decision tree regressor object
    regressor = DecisionTreeRegressor()

    # Set up the parameters we wish to tune
    parameters = {'max_depth':(1,2,3,4,5,6,7,8,9,10)}

    # Make an appropriate scoring function
    scoring_function = make_scorer(mean_squared_error, greater_is_better = False)

    # Make the GridSearchCV object. After fitting (and with refit=True, the default) it
    # exposes a best_estimator_ attribute holding the model whose parameters best fit
    # the data (here, the best tree depth). This step doesn't compute anything yet; it
    # just creates the grid search object and assigns it to reg.
    reg = grid_search.GridSearchCV(regressor, parameters, scoring_function)

    # Fit the learner to the data to obtain the optimal model with tuned parameters.
    # The best model will be available as reg.best_estimator_
    reg.fit(X, y)

    # Return the optimal model
    return reg.best_estimator_
Example #27
def fit_model_dtr(X, y):
    """ Tunes a decision tree regressor model using GridSearchCV on the input data X 
        and target labels y and returns this optimal model. """
    
    from sklearn.tree import DecisionTreeRegressor
    from sklearn.metrics import make_scorer
    from sklearn import grid_search
    #print X,y
    # Create a decision tree regressor object
    regressor = DecisionTreeRegressor()
    
    # Set up the parameters we wish to tune
    parameters = {'max_depth':(1,2,3,4,5,6,7,8,9,10)}
    
    # Make an appropriate scoring function
    # We can use either of the following two statements
    scoring_function = make_scorer(mean_squared_error, greater_is_better=False)
    
        
    # Make the GridSearchCV object
   
    reg = grid_search.GridSearchCV(regressor, parameters, scoring=scoring_function)
    
    
    # Fit the learner to the data to obtain the optimal model with tuned parameters
    reg.fit(X, y)
    
    # print reg.grid_scores_
    # print reg.best_estimator_

    # Return the optimal model
    return reg.best_estimator_
def fit_predict_model(city_data):
    """Find and tune the optimal model. Make a prediction on housing data."""

    # Get the features and labels from the Boston housing data
    X, y = city_data.data, city_data.target

    # Setup a Decision Tree Regressor
    regressor = DecisionTreeRegressor()

    parameters = {'max_depth':(1,2,3,4,5,6,7,8,9,10)}

    my_scorer = make_scorer(performance_metric, greater_is_better = False)
    
    #Use gridearch to fine tune the Decision Tree Regressor and find the best model
    print "Final Model: "
    x = [11.95, 0.00, 18.100, 0, 0.6590, 5.6090, 90.00, 1.385, 24, 680.0, 20.20, 332.09, 12.13]
    
    #Run gridsearch several times to get average result for best prediction
    predictions = []
    best_predictor = []
    for i in range(10):
        grid = grid_search.GridSearchCV(regressor, param_grid=parameters, scoring=my_scorer, cv = 5)
        grid.fit(X,y)
        y1 = grid.best_estimator_.predict(x) 
        best_predictor.append(grid.best_params_.itervalues().next())
        predictions.append(y1)
    average_predicted_price = np.mean(predictions)

    print "Best model parameter:  " + str(int(np.mean(best_predictor)))
    print "Prediction: " + str(average_predicted_price)
def tune_parameters(features, labels):
    """ 
        Use GridSearchCV to identify and return the best parameters to use 
            for the Decision Tree algorithm.
        
        features = features list as returned by the targetFeatureSplit script
        labels = target list as returned by the targetFeatureSplit script
    """
    from sklearn import tree
    from sklearn.model_selection import GridSearchCV
    from sklearn.metrics import make_scorer
    
    # Make scorer for the GridSearchCV function
    scorer = make_scorer(custom_scorer, greater_is_better = True)
    
    # Parameters names and settings to be used by GridSearchCV
    parameters = [{"criterion": ["gini", "entropy"], 
                   "splitter": ["best", "random"], 
                   "min_samples_split": [2, 3, 4, 5, 6, 7, 8, 9, 10], 
                   "min_samples_leaf": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], 
                   "min_impurity_split": [1e-9, 1e-8, 1e-7, 1e-6, 1e-5], 
                   "presort": [True, False], 
                   "random_state": [42]}]
    
    # Use GridSearchCV to identify the best parameters
    # K-fold cross-validation is used (100 folds)
    # F1 score from custom_scorer function is used as the evaluator
    clf = GridSearchCV(tree.DecisionTreeClassifier(), parameters, cv = 100, scoring = scorer)
    
    clf.fit(features, labels)
    
    best_parameters = clf.best_params_
    
    return best_parameters
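custom_scorer is not defined in this snippet; given the comment that an F1 score is used as the evaluator, a plausible stand-in (hypothetical, not the original) would be:

from sklearn.metrics import f1_score

def custom_scorer(y_true, y_pred):
    # binary F1, matching the "F1 score from custom_scorer" comment above
    return f1_score(y_true, y_pred)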
def fit_predict_model(city_data):
    """Find and tune the optimal model. Make a prediction on housing data."""

    # Get the features and labels from the Boston housing data
    X, y = city_data.data, city_data.target

    # Setup a Decision Tree Regressor
    regressor = DecisionTreeRegressor()

    # Setup parameters and scores for model optimization through Grid Search
    parameters = {"max_depth": (1, 2, 3, 4, 5, 6, 7, 8, 9, 10)}
    scorer = make_scorer(mean_squared_error, greater_is_better=False)
    gs = GridSearchCV(regressor, parameters, scoring=scorer)
    gs.fit(X, y)

    # Select the best settings for regressor
    reg = gs.best_estimator_

    # Fit the learner to the training data
    print "Final Model: "
    print reg.fit(X, y)

    # Use the model to predict the output of a particular sample
    x = [11.95, 0.00, 18.100, 0, 0.6590, 5.6090, 90.00, 1.385, 24, 680.0, 20.20, 332.09, 12.13]
    y = reg.predict(x)
    print "House: " + str(x)
    print "Prediction: " + str(y)
    # min num_features
    a = 50
    r = 2
    # length of sequence
    length = 10
    sequence = [a * r**(n - 1) for n in range(1, length + 1)]
    max_features_params = [x for x in sequence if x <= 3500]

    pipe = Pipeline(steps=[(
        'preprocess',
        NLP_transformer()), ('vectorizer', CountVectorizer(
            lowercase=False)), ('clf', ClassifierPipeline())])

    scoring = {
        'AUC': 'roc_auc',
        'Accuracy': make_scorer(accuracy_score),
        'Brier': make_scorer(brier_score_loss),
        'f1-score': make_scorer(f1_score),
        'precision': make_scorer(precision_score),
        'recall': make_scorer(recall_score)
    }

    param_grid = [
        {
            'preprocess__metadata_remove': [False, True],
            'preprocess__emoji_remove': [False, True],
            'preprocess__punctuation_remove': [False, True],
            'preprocess__negation_expand': [False, True],
            'preprocess__digits_remove': [False, True],
            'preprocess__negation_mark': [False, True],
            'preprocess__normalize': [False, True],
Example #32
    y_pred = clf.predict(X_test)
    f1_scorer = f1_score(y_test, y_pred, average=None)
    print("\n{}: \n".format(clf.__class__.__name__))
    print("f1 score for test test is {}".format(f1_scorer))

from sklearn.metrics import f1_score
from sklearn.svm import SVC
parameters = [
    {
        'kernel': ['rbf', 'sigmoid'],
        'C': [.1, 1, 10],
        'gamma': [0.001, 10, 1000]
    },
]
clf = SVC()
f1_scorer = make_scorer(f1_score, pos_label=0)
sss = StratifiedShuffleSplit(y_train, n_iter=10, test_size=0.25)
grid_obj = GridSearchCV(clf, parameters, cv=sss, scoring=f1_scorer)
grid_obj = grid_obj.fit(X_train, y_train)
clf = grid_obj.best_estimator_
#print clf
y_pred = clf.predict(X_test)
f1_score_value = f1_score(y_test, y_pred, pos_label=0,
                          average=None)  # For testing
print("F1 Score for test set: {}".format(f1_score_value))
print("Confusion Matrix is : \n  {} ".format(confusion_matrix(y_test, y_pred)))
target_names = ['class 0', 'class 1']
print(" ")
print("Classification report is : \n  ")
print(classification_report(y_test, y_pred, target_names=target_names))
Example #33
                          calculate_mean_sem_ranking, apply_friedman_test,
                          apply_holms_test)

sys.path.append(join(dirname(__file__), '..', '..'))
from utils import generate_mean_std_tbl, generate_pvalues_tbl, sort_tbl

RESULTS_NAMES = [
    'no_oversampling', 'random_oversampling', 'smote', 'kmeans_smote', 'somo',
    'gsmote', 'gsomo'
]
OVERSAMPLERS_NAMES = [
    'NO OVERSAMPLING', 'RANDOM OVERSAMPLING', 'SMOTE', 'K-MEANS SMOTE', 'SOMO',
    'G-SMOTE', 'G-SOMO'
]
CLASSIFIERS_NAMES = ['LR', 'KNN', 'DT', 'GBC']
SCORERS['geometric_mean_score'] = make_scorer(geometric_mean_score)
RESULTS_PATH = join(dirname(__file__), '..', 'results')
ANALYSIS_PATH = join(dirname(__file__), '..', 'analysis')


def generate_results():
    """Generate results including all oversamplers."""

    # Load results
    results = []
    for name in RESULTS_NAMES:
        file_path = join(RESULTS_PATH, f'{name}.pkl')
        results.append(pd.read_pickle(file_path))

    # Combine results
    results = combine_results(*results)
    Support.colored_print("Training...", "green")
    t0 = time.time()
    model.fit(X[:train_size], y[:train_size])
    model_fit = time.time() - t0
    print(model_name + " complexity and bandwidth selected and model fitted in %.3f s" % model_fit)
    t0 = time.time()
    y_model = model.predict(X_plot)
    model_predict = time.time() - t0
    print(model_name + " prediction for %d inputs in %.3f s" % (X_plot.shape[0], model_predict))

    # Look at the results
    Support.colored_print("Saving results...", "green")
    #train_sizes_mse, train_scores_model_mse, test_scores_model_mse = learning_curve(forest, X[:train_size], y[:train_size], train_sizes=numpy.linspace(0.1, 1, 10), scoring="neg_mean_squared_error", cv=10)
    train_sizes_r2, train_scores_model_r2, test_scores_model_r2 = learning_curve(model, X[:train_size], y[:train_size], train_sizes=numpy.linspace(0.1, 1, 10), scoring="r2", cv=10)
    train_sizes_re, train_scores_model_re, test_scores_model_re = learning_curve(model, X[:train_size], y[:train_size], train_sizes=numpy.linspace(0.1, 1, 10), scoring=make_scorer(scoring.relative_error), cv=10)

    plotter.figure()
    plotter.clf()
    #plotter.plot(train_sizes_mse, -train_scores_model_mse.mean(1), 'o-', color="b", label="mean squared error")
    plotter.plot(train_sizes_r2, train_scores_model_r2.mean(1), 'o-', color="g", label="r2")
    plotter.plot(train_sizes_re, train_scores_model_re.mean(1), 'o-', color="y", label="relative error")
    plotter.xlabel("Train size")
    plotter.ylabel("Error")
    plotter.title("Learning curve for output n. " + str(output_selected) + " Training Set")
    plotter.legend(loc="best")

    path_to_save = base_path_saving + "/out_" + model_name
    if not os.path.isdir(path_to_save):
        os.mkdir(path_to_save)
def plot_learning_curve(estimator,
                        title,
                        X,
                        y,
                        ylim=None,
                        cv=None,
                        n_jobs=None,
                        train_sizes=np.linspace(.1, 1.0, 5),
                        scorer=make_scorer(cohen_kappa_score,
                                           weights='quadratic')):
    """
    Generate a simple plot of the test and training learning curve.
    The function was copied from the scikit-learn tutorials:
    https://scikit-learn.org/stable/auto_examples/model_selection/plot_learning_curve.html#sphx-glr-auto-examples-model-selection-plot-learning-curve-py

    Parameters
    ----------
    estimator : object type that implements the "fit" and "predict" methods
        An object of that type which is cloned for each validation.

    title : string
        Title for the chart.

    X : array-like, shape (n_samples, n_features)
        Training vector, where n_samples is the number of samples and
        n_features is the number of features.

    y : array-like, shape (n_samples) or (n_samples, n_features), optional
        Target relative to X for classification or regression;
        None for unsupervised learning.

    ylim : tuple, shape (ymin, ymax), optional
        Defines minimum and maximum yvalues plotted.

    cv : int, cross-validation generator or an iterable, optional
        Determines the cross-validation splitting strategy.
        Possible inputs for cv are:
          - None, to use the default 3-fold cross-validation,
          - integer, to specify the number of folds.
          - :term:`CV splitter`,
          - An iterable yielding (train, test) splits as arrays of indices.

        For integer/None inputs, if ``y`` is binary or multiclass,
        :class:`StratifiedKFold` used. If the estimator is not a classifier
        or if ``y`` is neither binary nor multiclass, :class:`KFold` is used.

        Refer :ref:`User Guide <cross_validation>` for the various
        cross-validators that can be used here.

    n_jobs : int or None, optional (default=None)
        Number of jobs to run in parallel.
        ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
        ``-1`` means using all processors. See :term:`Glossary <n_jobs>`
        for more details.

    train_sizes : array-like, shape (n_ticks,), dtype float or int
        Relative or absolute numbers of training examples that will be used to
        generate the learning curve. If the dtype is float, it is regarded as a
        fraction of the maximum size of the training set (that is determined
        by the selected validation method), i.e. it has to be within (0, 1].
        Otherwise it is interpreted as absolute sizes of the training sets.
        Note that for classification the number of samples usually have to
        be big enough to contain at least one sample from each class.
        (default: np.linspace(0.1, 1.0, 5))
        
    scorer: A modification I added. Allows a custom scoring metric. (In our case, it will be the QWK)
    """
    plt.figure()
    plt.title(title)
    if ylim is not None:
        plt.ylim(*ylim)
    plt.xlabel("Training examples")
    plt.ylabel("Score")
    train_sizes, train_scores, test_scores = learning_curve(
        estimator,
        X,
        y,
        cv=cv,
        n_jobs=n_jobs,
        train_sizes=train_sizes,
        scoring=scorer)
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)
    plt.grid()

    plt.fill_between(train_sizes,
                     train_scores_mean - train_scores_std,
                     train_scores_mean + train_scores_std,
                     alpha=0.1,
                     color="r")
    plt.fill_between(train_sizes,
                     test_scores_mean - test_scores_std,
                     test_scores_mean + test_scores_std,
                     alpha=0.1,
                     color="g")
    plt.plot(train_sizes,
             train_scores_mean,
             'o-',
             color="r",
             label="Training score")
    plt.plot(train_sizes,
             test_scores_mean,
             'o-',
             color="g",
             label="Cross-validation score")

    plt.legend(loc="best")
    return plt
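A hypothetical call of the helper above (dataset and classifier are placeholders; plot_learning_curve itself assumes matplotlib.pyplot as plt, numpy as np and sklearn's learning_curve are already imported in its module):

from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import make_scorer, cohen_kappa_score

X, y = load_iris(return_X_y=True)
qwk = make_scorer(cohen_kappa_score, weights='quadratic')
plot_learning_curve(LogisticRegression(max_iter=1000),
                    "Learning curve (quadratic weighted kappa)",
                    X, y, cv=5, scorer=qwk)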
Example #36
axs[2].set_xlabel("Adjusted Rand Score")
for (model_name, dr), df_ in df_selected.groupby(["model", "DR"]):
    df_ = df_.set_index("k")
    df_["runtime"].plot(marker="x", ax=axs[0], label=model_name+"-"+dr)
    df_["adjusted_mutual_info_score"].plot(marker="x", ax=axs[1], label=model_name+"-"+dr)
    df_["adjusted_rand_score"].plot(marker="x", ax=axs[2], label=model_name+"-"+dr)
axs[2].legend()
plt.close(fig)
fig.savefig(f"..//output//ul_{dataset_prefix}_experiment3_comparison_vs_k.png")
df.to_csv(f"..//output//ul_{dataset_prefix}_experiment3.csv")

pd.DataFrame(results).to_csv(f"..//output//ul_{dataset_prefix}_experiment3.csv")

print("EXP4")
results = []
scorer = make_scorer(balanced_accuracy_score)
for dataset_name, dataset in dataset_dict.items():
    if "original" in dataset_name:
        continue
    X_train, X_test, y_train, y_test = dataset
    param_grid_nn = {"hidden_layer_sizes": [64, (4, 16), (8, 8), (16, 4), (4, 4, 4)]}
    param_grid_pipline = {"estimator__" + k: v for k, v in param_grid_nn.items()}
    pipline = build_pipeline(MLPClassifier(), resampling=False)
    gscv = GridSearchCV(pipline, param_grid_pipline, n_jobs=-3, verbose=1)
    res = gscv.fit(X_train, y_train)
    best_estimator = res.best_estimator_
    best_estimator.fit(X_train, y_train)
    train_score = scorer(best_estimator, X=X_train, y_true=y_train)
    test_score = scorer(best_estimator, X=X_test, y_true=y_test)
    dr = dataset_name.split("(")[0].split("_")[1]
    k = int(dataset_name.split("(")[1].split(")")[0])
Example #37
def test_permutation_score():
    iris = load_iris()
    X = iris.data
    X_sparse = coo_matrix(X)
    y = iris.target
    svm = SVC(kernel='linear')
    cv = cval.StratifiedKFold(y, 2)

    score, scores, pvalue = cval.permutation_test_score(svm,
                                                        X,
                                                        y,
                                                        cv=cv,
                                                        scoring="accuracy")
    assert_greater(score, 0.9)
    assert_almost_equal(pvalue, 0.0, 1)

    score_label, _, pvalue_label = cval.permutation_test_score(
        svm,
        X,
        y,
        cv=cv,
        scoring="accuracy",
        labels=np.ones(y.size),
        random_state=0)
    assert_true(score_label == score)
    assert_true(pvalue_label == pvalue)

    # test with custom scoring object
    scorer = make_scorer(fbeta_score, beta=2)
    score_label, _, pvalue_label = cval.permutation_test_score(svm,
                                                               X,
                                                               y,
                                                               scoring=scorer,
                                                               cv=cv,
                                                               labels=np.ones(
                                                                   y.size),
                                                               random_state=0)
    assert_almost_equal(score_label, .97, 2)
    assert_almost_equal(pvalue_label, 0.01, 3)

    # check that we obtain the same results with a sparse representation
    svm_sparse = SVC(kernel='linear')
    cv_sparse = cval.StratifiedKFold(y, 2)
    score_label, _, pvalue_label = cval.permutation_test_score(
        svm_sparse,
        X_sparse,
        y,
        cv=cv_sparse,
        scoring="accuracy",
        labels=np.ones(y.size),
        random_state=0)

    assert_true(score_label == score)
    assert_true(pvalue_label == pvalue)

    # set random y
    y = np.mod(np.arange(len(y)), 3)

    score, scores, pvalue = cval.permutation_test_score(svm,
                                                        X,
                                                        y,
                                                        cv=cv,
                                                        scoring="accuracy")

    assert_less(score, 0.5)
    assert_greater(pvalue, 0.2)

    # test with deprecated interface
    with warnings.catch_warnings(record=True):
        score, scores, pvalue = cval.permutation_test_score(
            svm, X, y, score_func=accuracy_score, cv=cv)
    assert_less(score, 0.5)
    assert_greater(pvalue, 0.2)
def main():
    """Main."""
    # Define paths
    parent_dir = os.getcwd()
    wdir = parent_dir + '/output/features/'
    outdir = parent_dir + "/output/classifiers/"
    # Load the full dataframe
    df = pd.read_parquet(wdir + "features_all.parquet")

    # IMPORTANT: Keep only a random sample of 25% of data
    df = df.groupby(["dataset"]).sample(frac=0.25, random_state=42)
    print(r"GridSearch will be performed on a random sample of 25% of data")
    print("Shape after downsampling:", df.shape)

    # Predictors
    cols_all = df.columns
    cols_time = cols_all[cols_all.str.startswith('time_')].tolist()
    # EEG also includes the time columns
    cols_eeg = cols_all[cols_all.str.startswith('eeg_')].tolist() + cols_time
    cols_eog = cols_all[cols_all.str.startswith('eog_')].tolist()
    cols_emg = cols_all[cols_all.str.startswith('emg_')].tolist()
    cols_demo = ['age', 'male']

    # Define predictors
    X = df[cols_eeg + cols_eog + cols_emg + cols_demo].sort_index(axis=1)

    # Define target and groups
    y = df['stage']
    subjects = df.index.get_level_values(0).to_numpy()

    # Show the values of balanced class weights
    # print("Balanced class weights are:",
    #       np.round(compute_class_weight('balanced', np.unique(y), y), 2))

    # Define cross-validation strategy
    # For speed, we only use a 2-fold validation
    cv = GroupKFold(n_splits=2)
    groups = subjects

    # Define hyper-parameters
    params = dict(
        boosting_type='gbdt',
        n_estimators=50,
        max_depth=7,
        num_leaves=30,
        colsample_bytree=0.8,
        importance_type='gain',
        n_jobs=2
    )

    # Define scoring metrics
    scorer = {
        "accuracy": "accuracy",
        "f1_N1": make_scorer(f1_score, labels=["N1"], average=None),
        "f1_N2": make_scorer(f1_score, labels=["N2"], average=None),
        "f1_N3": make_scorer(f1_score, labels=["N3"], average=None),
        "f1_R": make_scorer(f1_score, labels=["R"], average=None),
        "f1_W": make_scorer(f1_score, labels=["W"], average=None),
        "mcc": make_scorer(matthews_corrcoef),
    }

    # get param_grid
    param_grid = get_param_grid()

    # Fit GridSearchCV
    clf = LGBMClassifier(**params)
    grid = GridSearchCV(clf, param_grid, cv=cv, scoring=scorer,
                        refit=False, n_jobs=6, verbose=1)
    grid.fit(X, y, groups=groups)

    # Sort by best performance
    cols_scoring = ["mean_test_" + c for c in scorer.keys()]
    cols = ['param_class_weight'] + cols_scoring
    grid_res = pd.DataFrame(grid.cv_results_)[cols]

    grid_res.rename(
        columns={'param_class_weight': 'class_weight'}, inplace=True)

    grid_res['mean_test_scores'] = grid_res[cols_scoring].mean(1)
    grid_res = grid_res.sort_values(
        by="mean_test_scores", ascending=False).reset_index(drop=True).round(5)

    # Export to CSV
    grid_res.to_csv(outdir + "gridsearch_class_weights.csv", index=False)
Example #39
test_model(pca_model, "PCA", K)
test_model(ridge_model, "Ridge", K)
test_model(lasso_model, "Lasso", K)
test_model(enet_model, "Enet", K)
'''
--> ENET and LASSO  perform better our-of-sample but R2 negative
'''
#%% #--------------------------------------------------
#'''Potential Alternative Approach '''

print("Potential Alternative Approach")
from sklearn.model_selection import cross_validate
### Define Scorer
from sklearn.metrics import make_scorer, mean_squared_error, r2_score

mean_squared_error_scorer = make_scorer(mean_squared_error)
scoring = {'MSE': mean_squared_error_scorer, 'r2': make_scorer(r2_score)}
# cv=TimeSeriesSplit(n_splits=5).split(X)
### Cross-Validation
models = [c_model, ols_model, pca_model, ridge_model, lasso_model, enet_model]
models_names = [
    'c_model', 'ols_model', 'pca_model', 'ridge_model', 'lasso_model',
    'enet_model'
]
for k in range(len(models)):
    if models_names[k] == "c_model":
        cv_results = cross_validate(models[k],
                                    Ones,
                                    y,
                                    cv=K,
                                    return_train_score=True,
Example #40
def predictRandomForestRegression(data_path, periods):
    print("\nTraining Random Forest Regression model with full dataset ...")
    df = pd.read_csv(data_path)
    df['TIMESTAMP'] = df['TIMESTAMP'].astype('datetime64')
    df.set_index('TIMESTAMP', inplace=True)
    dfmean = df.resample('1M').mean()
    dfmin = df.resample('1M').min()
    dfmax = df.resample('1M').max()
    x_train, y_train = transformDataset(dfmean)
    xmin_train, ymin_train = transformDataset(dfmin)
    xmax_train, ymax_train = transformDataset(dfmax)

    model = ensemble.RandomForestRegressor()
    model_min = ensemble.RandomForestRegressor()
    model_max = ensemble.RandomForestRegressor()
    param_search = {
        'n_estimators': [100],
        'max_features': ['auto'],
        'max_depth': [10]
    }
    tscv = model_selection.TimeSeriesSplit(n_splits=2)
    rmse_score = metrics.make_scorer(rmse_calc, greater_is_better=False)
    gsearch = model_selection.GridSearchCV(estimator=model,
                                           cv=tscv,
                                           param_grid=param_search,
                                           scoring=rmse_score)
    gsearch_min = model_selection.GridSearchCV(estimator=model_min,
                                               cv=tscv,
                                               param_grid=param_search,
                                               scoring=rmse_score)
    gsearch_max = model_selection.GridSearchCV(estimator=model_max,
                                               cv=tscv,
                                               param_grid=param_search,
                                               scoring=rmse_score)
    gsearch.fit(x_train, y_train)
    gsearch_min.fit(xmin_train, ymin_train)
    gsearch_max.fit(xmax_train, ymax_train)
    best_score = gsearch.best_score_
    best_model = gsearch.best_estimator_
    best_model_min = gsearch_min.best_estimator_
    best_model_max = gsearch_max.best_estimator_

    print("\nPredicting with Random Forest regressor ...")
    prediction = pd.DataFrame(columns=[
        'TIMESTAMP', 'RENEWABLES_PCT_MEAN', 'RENEWABLES_PCT_LOWER', 'RENEWABLES_PCT_UPPER'
    ])
    l = len(x_train)
    x_pred = x_train.iloc[[l - 1]]
    y_pred = best_model.predict(x_pred)
    xmin_pred = xmin_train.iloc[[l - 1]]
    ymin_pred = best_model_min.predict(xmin_pred)
    xmax_pred = xmax_train.iloc[[l - 1]]
    ymax_pred = best_model_max.predict(xmax_pred)
    prediction = prediction.append(
        {
            'TIMESTAMP': x_pred.index[0],
            'RENEWABLES_PCT_MEAN': y_pred[0],
            'RENEWABLES_PCT_LOWER': ymin_pred[0],
            'RENEWABLES_PCT_UPPER': ymax_pred[0]
        },
        ignore_index=True)
    for i in range(1, periods):
        ti = prediction.iloc[i - 1]['TIMESTAMP'] + pd.offsets.DateOffset(months=1)
        xi_pred = pd.DataFrame({
            'YESTERDAY': y_pred,
            'YESTERDAY_DIFF': y_pred - x_pred['YESTERDAY'],
            'YESTERDAY-1': x_pred['YESTERDAY'],
            'YESTERDAY-1_DIFF': x_pred['YESTERDAY_DIFF']
        })
        yi_pred = best_model.predict(xi_pred)
        xmini_pred = pd.DataFrame({
            'YESTERDAY': ymin_pred,
            'YESTERDAY_DIFF': ymin_pred - xmin_pred['YESTERDAY'],
            'YESTERDAY-1': xmin_pred['YESTERDAY'],
            'YESTERDAY-1_DIFF': xmin_pred['YESTERDAY_DIFF']
        })
        ymini_pred = best_model_min.predict(xmini_pred)
        xmaxi_pred = pd.DataFrame({
            'YESTERDAY': ymax_pred,
            'YESTERDAY_DIFF': ymax_pred - xmax_pred['YESTERDAY'],
            'YESTERDAY-1': xmax_pred['YESTERDAY'],
            'YESTERDAY-1_DIFF': xmax_pred['YESTERDAY_DIFF']
        })
        ymaxi_pred = best_model_max.predict(xmaxi_pred)
        prediction = prediction.append(
            {
                'TIMESTAMP': ti,
                'RENEWABLES_PCT_MEAN': yi_pred[0],
                'RENEWABLES_PCT_LOWER': ymini_pred[0],
                'RENEWABLES_PCT_UPPER': ymaxi_pred[0]
            },
            ignore_index=True)
        x_pred = xi_pred
        y_pred = yi_pred
        xmin_pred = xmini_pred
        ymin_pred = ymini_pred
        xmax_pred = xmaxi_pred
        ymax_pred = ymaxi_pred

    prediction.set_index('TIMESTAMP', inplace=True)
    prediction = prediction.resample('1Y').mean()
    p = prediction.plot()
    p.set_title('CA Predicted Renewables % by Random Forest Regression')
    p.set_ylabel('Renewables %')
    wd = os.path.dirname(data_path) + '/../images'
    os.makedirs(wd, exist_ok=True)
    plt.savefig(wd + '/prediction-randomforest.png')

    return prediction
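rmse_calc is assumed to be defined elsewhere in this project. A minimal sketch of a compatible implementation, following the usual (y_true, y_pred) signature expected by make_scorer:

import numpy as np
from sklearn import metrics

def rmse_calc(y_true, y_pred):
    # Root-mean-squared error; with greater_is_better=False, make_scorer
    # negates this value so that GridSearchCV can maximize the score.
    return np.sqrt(metrics.mean_squared_error(y_true, y_pred))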
Exemple #41
0
def scorer(show):
    return make_scorer(score, show=show, greater_is_better=False)
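Extra keyword arguments given to make_scorer are forwarded to the wrapped metric on every call. The score function here is defined elsewhere in the original project; a hypothetical stand-in showing how the show flag would be forwarded:

from sklearn.metrics import make_scorer, mean_absolute_error

def score(y_true, y_pred, show=False):
    # Hypothetical stand-in metric: plain MAE, optionally printed per evaluation.
    value = mean_absolute_error(y_true, y_pred)
    if show:
        print("MAE:", value)
    return value

quiet_scorer = make_scorer(score, show=False, greater_is_better=False)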
Exemple #42
0
    y_true: numpy.ndarray {n_samples}
        True class labels
    y_pred: numpy.ndarray {n_samples}
        Predicted class labels by the estimator

    Returns
    -------
    fitness: float
        Returns a float value indicating the `individual`'s balanced accuracy
        0.5 is as good as chance, and 1.0 is perfect predictive accuracy
    """
    all_classes = list(set(np.append(y_true, y_pred)))
    all_class_accuracies = []
    for this_class in all_classes:
        this_class_sensitivity = \
            float(sum((y_pred == this_class) & (y_true == this_class))) /\
            float(sum((y_true == this_class)))

        this_class_specificity = \
            float(sum((y_pred != this_class) & (y_true != this_class))) /\
            float(sum((y_true != this_class)))

        this_class_accuracy = (this_class_sensitivity +
                               this_class_specificity) / 2.
        all_class_accuracies.append(this_class_accuracy)

    return np.mean(all_class_accuracies)


SCORERS['balanced_accuracy'] = make_scorer(balanced_accuracy)
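Registering the custom metric under SCORERS makes it resolvable by name in the scikit-learn versions this snippet targets (recent releases ship a built-in 'balanced_accuracy' scorer and expose get_scorer()/get_scorer_names() instead of the SCORERS dict). A short usage sketch on synthetic data:

from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

X_demo, y_demo = make_classification(n_samples=300, weights=[0.8, 0.2], random_state=0)
# 'balanced_accuracy' now resolves through the scorer lookup by name.
scores = cross_val_score(RandomForestClassifier(random_state=0), X_demo, y_demo,
                         scoring='balanced_accuracy', cv=5)
print(scores.mean())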
Exemple #43
0
#print scores2

#exit(0)

# define fixed parameters and parameters to search
crf = sklearn_crfsuite.CRF(algorithm='lbfgs',
                           max_iterations=300,
                           all_possible_transitions=True)
params_space = {
    'c1': scipy.stats.expon(scale=0.5),
    'c2': scipy.stats.expon(scale=0.05),
}

# use the same metric for evaluation
f1_scorer = make_scorer(metrics.flat_f1_score,
                        average='weighted',
                        labels=labels)

# search
rs = RandomizedSearchCV(crf,
                        params_space,
                        cv=3,
                        verbose=1,
                        n_jobs=-1,
                        n_iter=50,
                        scoring=f1_scorer)
rs.fit(X_train_CRF_shape, y_train_CRF_shape)

# crf = rs.best_estimator_
print('best params:', rs.best_params_)
print('best CV score:', rs.best_score_)
Exemple #44
0
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, make_scorer
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, make_scorer, f1_score, roc_auc_score, recall_score, precision_score, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
import os
import pickle


def mean_squared_error_(ground_truth, predictions):
    return mean_squared_error(ground_truth, predictions)


MSE = make_scorer(mean_squared_error_, greater_is_better=False)
F1 = make_scorer(f1_score)
AUC = make_scorer(roc_auc_score)


class Ensemble(object):
    def __init__(self, n_folds, stacker, base_models):
        self.n_folds = n_folds
        self.stacker = stacker
        self.base_models = base_models

    def fit(self, X, y):
        X = np.array(X)
        y = np.array(y)
        folds = list(
            KFold(len(y),
#clf2 = RandomForestClassifier(n_estimators=1000, random_state=1)

# In[28]:

#clf3 = GaussianNB()

# In[41]:

#eclf = VotingClassifier(estimators=[('lr', clf1), ('rf', clf2), ('gnb', clf3)], voting='hard')

# In[32]:

from sklearn.metrics import accuracy_score, make_scorer
from sklearn.metrics import matthews_corrcoef
mcc = make_scorer(matthews_corrcoef)

# In[ ]:

#clf1 = DecisionTreeClassifier(max_depth=4)
#clf2 = KNeighborsClassifier(n_neighbors=7)
#clf3 = SVC(gamma='scale', kernel='rbf', probability=True)
#>>> eclf = VotingClassifier(estimators=[('dt', clf1), ('knn', clf2), ('svc', clf3)],
#...                         voting='soft', weights=[2, 1, 2])

# In[53]:

#clf1 = clf1.fit(X_train, y_train)
#clf2 = clf2.fit(X_train, y_train)
#clf3 = clf3.fit(X_train, y_train)
#eclf = eclf.fit(X_train, y_train)
            'alpha':
            [1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1e0, 1e+1, 1e+2, 1e+3, 1e+4]
        }

        print(f'evaluating {ds}-{fold}')

        # Using 5 folds and cross validation. We need to convert our RMSE function to be compatible
        # with the gridsearch, through the make_scorer. To make it a minimization problem, the
        # greater_is_better will make the RMSE a negative value (so it is analogous to a maximization
        # problem). We need to change this later on the results file.
        grid = GridSearchCV(linear_model.Lasso(),
                            confs,
                            n_jobs=4,
                            cv=5,
                            verbose=1,
                            scoring=make_scorer(RMSE, greater_is_better=False),
                            return_train_score=True).fit(X_train, y_train)

        # Using the best gridsearch configuration to train and obtain the final RMSEs
        regressor = linear_model.Lasso(**grid.best_params_).fit(
            X_train, y_train)

        # Handling the greater_is_better change of sign on the best train score
        RMSE_train = -1 * grid.best_score_
        RMSE_test = RMSE(regressor.predict(X_test).ravel(), y_test.ravel())

        results['dataset'].append(ds)
        results['conf'].append(grid.best_params_)
        results['RMSE_train'].append(RMSE_train)
        results['RMSE_test'].append(RMSE_test)
        results['Fold'].append(fold)
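For reference, scikit-learn 0.22+ also ships this objective as the built-in string 'neg_root_mean_squared_error', which removes the need for the custom RMSE/make_scorer wrapper; the sign convention on best_score_ is unchanged. A sketch assuming the same confs, X_train and y_train as above:

from sklearn import linear_model
from sklearn.model_selection import GridSearchCV

grid = GridSearchCV(linear_model.Lasso(), confs, n_jobs=4, cv=5,
                    scoring='neg_root_mean_squared_error',
                    return_train_score=True).fit(X_train, y_train)
RMSE_train = -1 * grid.best_score_  # same sign flip as above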
Exemple #47
0
	data,meta = arff.loadarff(path)
	data_nolabels =np.asarray( [np.asarray(list(item)[:-1]) for item in data])
	labels = np.asarray([ int(item[-1]) for item in data])
	cols = list(meta)[:-1]
	ds = pandas.DataFrame(data_nolabels, columns=cols)
	return ds,labels

if __name__ == '__main__':

	parser = argparse.ArgumentParser(description='USAGE :   python readscorer.py --path <> ')
	parser.add_argument('-p','--path',help='path to arff file',required=True)
	args= vars(parser.parse_args())
	train,truth = read(args['path'])
	clf = DT(criterion='entropy',splitter='best')

	scores = cross_val_score(clf,train,truth,scoring = make_scorer(f1_score,average='binary',pos_label=1),cv=10)

	print 'f1 scores on 10 fold cross validation'
	for x in xrange(len(scores)):
		print '{0}) {1}'.format(x+1,scores[x])
	print '########################'

	clf = clf.fit(train,truth)
	# clustering 
	k  = [x for x in xrange(1,33) if x%2 == 0]
	k = [1] + k
	# 6 attributes originally 
	k= map(lambda z: z*6,k) 
	k.reverse()
	clusters = {}
	feature_imps = [(name,imp) for name,imp in zip(train.columns,clf.feature_importances_)]
Exemple #48
0
from CV import X_train, y_train, X_test, y_test, df_reduced_train

clf = SGDClassifier()

parameters = [{
    'n_iter': [3000, 4000, 10000],
    'penalty': ['l2', 'elasticnet'],
    'loss': ['hinge', 'log', 'perceptron', 'modified_huber'],
    'alpha': [0.03, 0.04, 0.07],
    'shuffle': [True],
    'class_weight': [{
        1: 0.9
    }, {
        0: 0.1
    }, 'balanced']
}]

start = time()
f1_scorer = make_scorer(f1_score)
gs = grid_search.GridSearchCV(clf, parameters, scoring=f1_scorer, n_jobs=-1)
gs.fit(df_reduced_train.values, y_train)

print("Grid scores: --------")
print(gs.grid_scores_)
print("Best estimator----")
print(gs.best_estimator_)
print("Best params ----")
print(gs.best_params_)
print("Best score: ", gs.best_score_)
print("Finished in: ", (time() - start))
        ('columns', ColumnFilter()),
        ('lm', LinearRegression())
    ])

    def rmsle(y_hat, y):
        target = y
        predictions = y_hat
        log_diff = np.log(predictions+1) - np.log(target+1)
        return np.sqrt(np.mean(log_diff**2))

    # GridSearch
    params = {'nearest_average__window': [3, 5, 7]}

    # Turns our rmsle func into a scorer of the type required
    # by gridsearchcv.
    rmsle_scorer = make_scorer(rmsle, greater_is_better=False)

    gscv = GridSearchCV(p, params,
                        scoring=rmsle_scorer,
                        cv=cross_val)
    clf = gscv.fit(df.reset_index(), y)

    print 'Best parameters: %s' % clf.best_params_
    print 'Best RMSLE: %s' % clf.best_score_

    test = pd.read_csv('data/test.csv')
    test = test.sort_values(by='SalesID')

    test_predictions = clf.predict(test)
    test['SalePrice'] = test_predictions
    outfile = 'data/solution_benchmark.csv'
def algo_CVmetrics(classifier_object, X_train, Y_train):
    """
    Analytics function that reports cross-validated performance metrics for imbalanced binary classification.
    classifier object = classification method e.g. DecisionTreeClassifier()
    X_train = input (training data)
    Y_train = output (training data)
    """

    cv = RepeatedStratifiedKFold(n_splits=5,
                                 n_repeats=3,
                                 random_state=seed_custom)

    metricslist = {
        'f2': make_scorer(metrics.fbeta_score, beta=2),
        'balacc': make_scorer(metrics.balanced_accuracy_score),
        'precision': make_scorer(metrics.precision_score),
        'recall': make_scorer(metrics.recall_score)
    }

    cv_results = cross_validate(classifier_object,
                                X_train,
                                Y_train,
                                cv=cv,
                                scoring=metricslist,
                                return_estimator=True)

    f2_mean = np.mean(cv_results['test_f2'])
    f2_std = np.std(cv_results['test_f2'])

    balacc_mean = np.mean(cv_results['test_balacc'])
    balacc_std = np.std(cv_results['test_balacc'])

    precision_mean = np.mean(cv_results['test_precision'])
    precision_std = np.std(cv_results['test_precision'])

    recall_mean = np.mean(cv_results['test_recall'])
    recall_std = np.std(cv_results['test_recall'])

    scorebox = pd.DataFrame(np.zeros((1, 8)),
                            columns=list([
                                'F2-Score Mean', 'F2-Score STD',
                                'Balanced Accuracy Mean',
                                'Balanced Accuracy STD', 'Precision Mean',
                                'Precision STD', 'Recall Mean', 'Recall STD'
                            ]))

    scorebox.iloc[0, 0] = f2_mean
    scorebox.iloc[0, 1] = f2_std
    scorebox.iloc[0, 2] = balacc_mean
    scorebox.iloc[0, 3] = balacc_std
    scorebox.iloc[0, 4] = precision_mean
    scorebox.iloc[0, 5] = precision_std
    scorebox.iloc[0, 6] = recall_mean
    scorebox.iloc[0, 7] = recall_std

    scorebox = np.round(scorebox, 3)

    print("Model has a mean CV balanced accuracy of {0}, (Std: {1})".format(
        round(balacc_mean, 3), round(balacc_std, 3)))
    print("Model has a mean CV F2_Score of {0}, (Std: {1})".format(
        round(f2_mean, 3), round(f2_std, 3)))
    print("Model has a mean CV Precision of {0}, (Std: {1})".format(
        round(precision_mean, 3), round(precision_std, 3)))
    print("Model has a mean CV Recall of {0}, (Std: {1})".format(
        round(recall_mean, 3), round(recall_std, 3)))

    return scorebox
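A usage sketch for algo_CVmetrics on a synthetic imbalanced problem, assuming the imports used inside the function are in scope and that seed_custom is a module-level constant (defined here for the sketch):

from sklearn.datasets import make_classification
from sklearn.tree import DecisionTreeClassifier

seed_custom = 42
X_demo, y_demo = make_classification(n_samples=1000, weights=[0.9, 0.1],
                                     random_state=seed_custom)
scores = algo_CVmetrics(DecisionTreeClassifier(class_weight='balanced'),
                        X_demo, y_demo)
print(scores)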
Exemple #51
0
def optimize():
    log(action_logging_enum=INFO,
        logging_text=
        "[DECISION TREE]: Starting to search for best depth by cross validating different values."
        )

    # read data
    train = pd.read_csv(DATA_PATH + LEXICAL_FEATURE_DATABASE)
    # drop useless columns before training
    drop_elements = ['ID', 'URL']  # drop because of heatmap for last 2
    train = train.drop(drop_elements, axis=1)
    x_train = train.drop(['Label'], axis=1)
    y_train = train['Label'].copy()

    # Create regularization penalty space
    class_weight = ['balanced']
    min_samples_leaf = [1, 2, 3]
    min_samples_split = [2, 3, 4]
    max_features = ['auto', 'sqrt', 5, 6]
    random_state = [42]
    max_depth = [14, 15, 16, 19, 20, 21, 22, 23, 25]

    # Create hyperparameter options
    hyperparameters = dict(min_samples_split=min_samples_split,
                           min_samples_leaf=min_samples_leaf,
                           random_state=random_state,
                           class_weight=class_weight,
                           max_features=max_features,
                           max_depth=max_depth)

    model = tree.DecisionTreeClassifier()
    # Create grid search using 5-fold cross validation
    clf = RandomizedSearchCV(model,
                             hyperparameters,
                             n_iter=100,
                             cv=3,
                             verbose=10,
                             n_jobs=1,
                             scoring='f1_weighted')
    best_model = clf.fit(x_train, y_train)

    # View best hyperparameters
    # Best estimators: 60
    # Best samples leaf: 1
    # Best samples split: 3
    # Best features: 5
    log(INFO, 'Best max depth: {}'.format(
        best_model.best_estimator_.get_params()['max_depth']))
    log(INFO, 'Best samples leaf: {}'.format(
        best_model.best_estimator_.get_params()['min_samples_leaf']))
    log(INFO, 'Best samples split: {}'.format(
        best_model.best_estimator_.get_params()['min_samples_split']))
    log(INFO, 'Best features: {}'.format(
        best_model.best_estimator_.get_params()['max_features']))

    # maximum depth for number of columns
    max_depth = len(train.columns)
    print(max_depth)
    cv = KFold(n_splits=10)
    accuracies = list()
    errors = list()
    max_attributes = max_depth
    depth_range = range(10, max_attributes + 10)

    scorer = {'main': 'accuracy', 'custom': make_scorer(score_func)}

    for depth in depth_range:
        fold_error = []
        fold_accuracy = []
        tree_model = tree.DecisionTreeClassifier(max_depth=depth,
                                                 min_samples_split=15,
                                                 min_samples_leaf=10,
                                                 random_state=42,
                                                 class_weight='balanced')

        cv_results = cross_validate(tree_model,
                                    X=x_train,
                                    y=y_train,
                                    cv=10,
                                    return_train_score=True)

        for res in cv_results['train_score']:
            error = 1 - res
            fold_error.append(error)
            fold_accuracy.append(res)

        avg_error = sum(fold_error) / len(fold_error)
        avg_accuracy = sum(fold_accuracy) / len(fold_accuracy)
        log(action_logging_enum=INFO,
            logging_text="AVG ERROR: {f}".format(f=avg_error))
        log(action_logging_enum=INFO,
            logging_text="AVG PREC: {f}".format(f=avg_accuracy))
        errors.append(avg_error)
        accuracies.append(avg_accuracy)

    log(action_logging_enum=INFO,
        logging_text="[DECISION TREE]: Optimization completed.")
Exemple #52
0
def load_exp(filename,
             dataset,
             model_str,
             target,
             feature_subset,
             include_past_ys,
             n_prev_days,
             predict_d_plus,
             cv_folds=11):
    predict_pa = False
    y_subset = "sleep_metrics"
    keep_pids = True

    # Removes .pkl from file
    experiment_name = os.path.splitext(filename)[0]
    print("EXPERIMENT_NAME", experiment_name)

    df_per_day, df_per_hour, df_per_pid, df_keys, df_embeddings = get_dataframes(
        dataset, cv_folds)
    age_col = "sleepage5c" if dataset == "mesa" else "AGE_SUENO"

    print(
        "LOG: dataset (%s), model (%s), target (%s), features (%s), days (%d), include_past (%s), predict_pa (%s)"
        % (dataset, model_str, target, '-'.join(feature_subset), n_prev_days,
           include_past_ys, predict_pa))

    data = get_data(n_prev_days,
                    predict_pa,
                    include_past_ys,
                    df_per_day,
                    df_per_hour,
                    df_per_pid,
                    df_keys,
                    df_embeddings,
                    y_subset=y_subset,
                    x_subsets=feature_subset,
                    y_label=target,
                    keep_pids=keep_pids)

    # df_per_pid["sleep_hours"] = df_per_pid[age_col].apply(cdc)
    # data = pd.merge(data, df_per_pid[["sleep_hours", "pid"]])
    df_per_pid["participant_age"] = df_per_pid[age_col]
    data = pd.merge(data, df_per_pid[["participant_age", "pid"]])

    data = data.fillna(-1)
    data = modify_data_target(data, "participant_age", target)

    # Predicting day + 1, instead of day
    if predict_d_plus > 0:
        y = data[[target, "ml_sequence", "pid"]]
        x = data.drop(columns=[target])
        y["ml_sequence"] = y.groupby(
            ["pid"])["ml_sequence"].apply(lambda x: x - predict_d_plus)
        data = pd.merge(x, y)

    cols_to_remove = ["ml_sequence", "pid",
                      "participant_age"]  # , "sleep_hours"]
    for col in cols_to_remove:
        data = data.drop(columns=col)

    test_data = data[data["fold"] == cv_folds - 1]
    data = data[data["fold"] != cv_folds - 1]

    force_cat, force_num = force_categories(dataset, feature_subset)

    experiment = setup(data=data,
                       test_data=test_data,
                       target=target,
                       session_id=123,
                       normalize=True,
                       transformation=True,
                       fold_strategy="groupkfold",
                       fold_groups="fold",
                       categorical_features=force_cat,
                       numeric_features=force_num,
                       ignore_features=["fold"],
                       silent=True,
                       use_gpu=False)

    make_scorer(f1_score, average="macro")
    make_scorer(f1_score, average="micro")
    add_metric(id='micro_f1',
               name="Micro F1",
               score_func=lambda x, y: f1_score(x, y, average="micro"),
               greater_is_better=True)
    add_metric(id='macro_f1',
               name="Macro F1",
               score_func=lambda x, y: f1_score(x, y, average="macro"),
               greater_is_better=True)

    # Metrics removed as it results in problem when using multiclass
    remove_metric('precision')
    remove_metric('recall')
    remove_metric('f1')

    unzip_pkl(experiment_name)
    loaded_model = load_model(experiment_name)
    delete_pkl(experiment_name)

    return loaded_model
 def fit(self,x,y):
     x_train = np.array(x)
     y_train = np.array(y).reshape(y.shape[0],)
     self.factor_name = list(x.columns)
     
     if self.parameters is None:
         self.set_parameters()
     
     scoring = {"mse": make_scorer(mean_squared_error),}
     
     if self.method == 'linear':
         self.reg_model = linear_model.LinearRegression()     
         self.reg_model.fit(x_train,y_train)
         
     elif self.method == 'ridge':   
         self.reg_model = GridSearchCV(linear_model.Ridge(),param_grid=self.parameters,cv=5,scoring=scoring,refit ='mse')
         self.reg_model.fit(x_train,y_train)
         
     elif self.method == 'lasso':
         self.reg_model = GridSearchCV(linear_model.Lasso(),param_grid=self.parameters,cv=5,scoring=scoring,refit ='mse')
         self.reg_model.fit(x_train,y_train)
         
     elif self.method == 'ElasticNet':
         self.reg_model = GridSearchCV(linear_model.ElasticNet(),param_grid=self.parameters,cv=5,scoring=scoring,refit ='mse')
         self.reg_model.fit(x_train,y_train)
     
     elif self.method == 'pls':
         self.reg_model = GridSearchCV(PLSRegression(),param_grid=self.parameters,cv=5,scoring=scoring,refit ='mse')
         self.reg_model.fit(x_train,y_train)
     
     elif self.method == 'svr':
         self.reg_model = GridSearchCV(svm.SVR(),param_grid=self.parameters,cv=5,scoring=scoring,refit ='mse')
         self.reg_model.fit(x_train,y_train)
         
     elif self.method == 'knn':
         self.reg_model = GridSearchCV(KNeighborsRegressor(),param_grid=self.parameters,cv=5,scoring=scoring,refit ='mse')
         self.reg_model.fit(x_train,y_train)
     
     elif self.method == 'dt':
         self.reg_model = GridSearchCV(tree.DecisionTreeRegressor(),param_grid=self.parameters,cv=5,scoring=scoring,refit ='mse')
         self.reg_model.fit(x_train,y_train)
         
     elif self.method == 'rf':
         self.reg_model = GridSearchCV(esb.RandomForestRegressor(),param_grid=self.parameters,cv=5,scoring=scoring,refit ='mse')
         self.reg_model.fit(x_train,y_train)
         
     elif self.method == 'adaBoost':
         self.reg_model = GridSearchCV(esb.AdaBoostRegressor(),param_grid=self.parameters,cv=5,scoring=scoring,refit ='mse')
         self.reg_model.fit(x_train,y_train)
         
     elif self.method == 'gbm':
         self.reg_model = GridSearchCV(esb.GradientBoostingRegressor(),param_grid=self.parameters,cv=5,scoring=scoring,refit ='mse')
         self.reg_model.fit(x_train,y_train)
     
     elif self.method == 'xgb': 
         self.reg_model = GridSearchCV(XGBRegressor(),param_grid=self.parameters,cv=5,scoring=scoring,refit ='mse')
         self.reg_model.fit(x_train,y_train)
     
     elif self.method =='bp':
         self.reg_model = GridSearchCV(neural_network.MLPRegressor(),param_grid=self.parameters,cv=5,scoring=scoring,refit ='mse')
         self.reg_model.fit(x_train,y_train)
    'max_depth': [None, 5, 10, 15, 20],
    'criterion': ['entropy', 'gini']
}

X_data, y_data = load_breast_cancer(return_X_y=True)

estimator = RandomForestClassifier(random_state=42)

print('Accuracy best params and score')
result = GridSearchCV(estimator, param_grid, cv=3, scoring='accuracy').fit(X_data, y_data)
print('\tParams:', result.best_params_)
print('\tScore:', result.best_score_)

def my_scorer(y_true, y_pred):
    precision, recall, _ = precision_recall_curve(y_true, y_pred[:, 1])
    return max([p for p, r in zip(precision, recall) if p < 1.5 * r and r > 0.5])

scorer = make_scorer(my_scorer, greater_is_better=True, needs_proba=True)

print('Custom loss best params and score')
result = GridSearchCV(estimator, param_grid, cv=3, scoring=scorer).fit(X_data, y_data)
print('\tParams:', result.best_params_)
print('\tScore:', result.best_score_)


# In[2]:


print(round(result.best_score_, 4))

    def _get_or_set_hyperparam(self, hyperparam, y=None):
        # If it's already set, move on.
        TUNEABLE_HYPERPARAMS = [
            'priors', 'n_estimators', 'learning_rate', 'max_depth',
            'min_samples_split', 'min_samples_leaf', 'max_features', 'C'
        ]
        if self._hyperparams.get(hyperparam):
            if hyperparam == 'algorithm':
                if self._hyperparams[
                        hyperparam] not in SupervisedClassifier.SUPPORTED_ALGORITHMS:
                    raise ValueError('Algorithm %s not supported.' %
                                     self._hyperparams[hyperparam])
            elif hyperparam == 'hyperparam_strategy':
                if self._hyperparams[
                        hyperparam] not in SupervisedClassifier.HYPERPARAM_STRATEGIES:
                    raise ValueError(
                        'Hyperparameter strategy %s not supported.' %
                        self._hyperparams[hyperparam])

            # If the hyperparam has a relevant search space, set the search
            # space to the user defined value.
            if hyperparam in TUNEABLE_HYPERPARAMS:
                self._hyperparam_search_space[hyperparam] = [
                    self._hyperparams[hyperparam]
                ]

            return

        # Otherwise, define a decent initial value, based on algorithm.
        # If the hyperparam has a relevant search space, define it automatically.
        # Code sanitation note: please keep these conditions alphabetized =)
        if hyperparam == 'activation':
            # NN
            self._hyperparams[hyperparam] = 'relu'
            self._hyperparam_search_space[hyperparam] = [
                'logistic', 'tanh', 'relu'
            ]
        elif hyperparam == 'adaboost_algorithm':
            # ADABOOST, DECISION_TREE
            self._hyperparams[hyperparam] = 'SAMME.R'
        elif hyperparam == 'algorithm':
            # SUPPORTED_ALGORITHMS
            self._hyperparams[
                hyperparam] = SupervisedClassifier.LOGISTIC_REGRESSION
        elif hyperparam == 'base_estimator':
            # ADABOOST
            self._hyperparams[hyperparam] = 'DecisionTreeClassifier'
        elif hyperparam == 'bootstrap':
            # RANDOM_FOREST
            self._hyperparams[hyperparam] = True
        elif hyperparam == 'C':
            # LOGISTIC_REGRESSION
            self._hyperparams[hyperparam] = 10.0
            self._hyperparam_search_space[hyperparam] = [
                0.0001, 0.001, 0.01, 0.1, 1.0, 10.0, 100.0, 1000.0, 10000.0
            ]
        elif hyperparam == 'class_weight':
            # ADABOOST, DECISION_TREE, LOGISTIC_REGRESSION, RANDOM_FOREST
            self._hyperparams[hyperparam] = 'balanced'
        elif hyperparam == 'colsample_bytree':
            # XGB
            self._hyperparams[hyperparam] = 0.5
            self._hyperparam_search_space[hyperparam] = [0.6, 0.8, 1.0]
        elif hyperparam == 'criterion':
            # DECISION_TREE, RANDOM_FOREST
            self._hyperparams[hyperparam] = 'gini'
        elif hyperparam == 'cv':
            # SUPPORTED_ALGORITHMS
            self._hyperparams['cv'] = self._build_cv_generator(y)
        elif hyperparam == 'degree':
            # SVM, when kernel='poly'. TODO: in the future, do sub-case grid search as degree is only needed for poly
            self._hyperparams[hyperparam] = 3
            self._hyperparam_search_space[hyperparam] = [0, 1, 2, 3, 4, 5, 6]
        elif hyperparam == 'dual':
            # LOGISTIC_REGRESSION
            self._hyperparams[hyperparam] = False
        elif hyperparam == 'fit_intercept':
            # LOGISTIC_REGRESSION
            self._hyperparams[hyperparam] = True
        elif hyperparam == 'gamma':
            # SVM, when non-linear kernel 'rbf', 'poly', 'sigmoid'
            self._hyperparams[hyperparam] = 'auto'
            self._hyperparam_search_space[hyperparam] = [0.1, 1, 10, 100]
        elif hyperparam == 'hidden_layer_sizes':
            # NN
            self._hyperparams[hyperparam] = (10, 10, 10)
            self._hyperparam_search_space[hyperparam] = [(10, ), (10, 10),
                                                         (10, 10, 10, 10)]
        elif hyperparam == 'hyperparam_strategy':
            # SUPPORTED_ALGORITHMS
            self._hyperparams[
                hyperparam] = SupervisedClassifier.STOCHASTIC_SEARCH
        elif hyperparam == 'kernel':
            # SVM
            self._hyperparams[hyperparam] = 'rbf'
            self._hyperparam_search_space[hyperparam] = [
                'linear', 'poly', 'rbf', 'sigmoid'
            ]
        elif hyperparam == 'learning_rate':
            # ADABOOST, XGB
            self._hyperparams[hyperparam] = 0.1
            self._hyperparam_search_space[hyperparam] = [
                0.001, 0.01, 0.1, 1.0, 10.0
            ]
        elif hyperparam == 'max_depth':
            # DECISION_TREE, RANDOM_FOREST, XGB (XGB does not allow 'None')
            self._hyperparams[hyperparam] = 3
            # Include 1, 2, 3 to bias towards simpler tree.
            self._hyperparam_search_space[hyperparam] = [1, 2, 3, 4, 5]
        elif hyperparam == 'max_features':
            # DECISION_TREE, RANDOM_FOREST
            self._hyperparams[hyperparam] = 'sqrt'
            # Empirical good default values are max_features=n_features for
            # regression problems, and max_features=sqrt(n_features) for
            # classification tasks.
            # http://scikit-learn.org/stable/modules/ensemble.html#forest
            self._hyperparam_search_space[hyperparam] = ['sqrt', 'log2', None]
        elif hyperparam == 'max_iter':
            # LOGISTIC_REGRESSION
            self._hyperparams[hyperparam] = 100
        elif hyperparam == 'min_child_weight':
            # XGB
            self._hyperparams[hyperparam] = 3
            self._hyperparam_search_space[hyperparam] = [1, 5, 10]
        elif hyperparam == 'min_impurity_decrease':
            # DECISION_TREE, RANDOM_FOREST
            self._hyperparams[hyperparam] = 0.0
        elif hyperparam == 'max_leaf_nodes':
            # DECISION_TREE, RANDOM_FOREST
            self._hyperparams[hyperparam] = None
        elif hyperparam == 'min_samples_leaf':
            # DECISION_TREE, RANDOM_FOREST
            self._hyperparams[hyperparam] = 1
            self._hyperparam_search_space[hyperparam] = [0.01, 0.1, 1, 10]
        elif hyperparam == 'min_samples_split':
            # DECISION_TREE, RANDOM_FOREST
            self._hyperparams[hyperparam] = 2
            # Include 20 and .02 to bias towards simpler trees.
            self._hyperparam_search_space[hyperparam] = [0.02, 0.2, 2, 20]
        elif hyperparam == 'min_weight_fraction_leaf':
            # DECISION_TREE, RANDOM_FOREST
            self._hyperparams[hyperparam] = 0.0
        elif hyperparam == 'multi_class':
            # LOGISTIC_REGRESSION
            self._hyperparams[hyperparam] = 'ovr'
        elif hyperparam == 'n_estimators':
            # ADABOOST, RANDOM_FOREST
            if self._hyperparams['algorithm'] == SupervisedClassifier.ADABOOST:
                self._hyperparams[hyperparam] = 30
                self._hyperparam_search_space[hyperparam] = [
                    10, 20, 30, 40, 50
                ]
            elif self._hyperparams[
                    'algorithm'] == SupervisedClassifier.RANDOM_FOREST:
                self._hyperparams[hyperparam] = 10
                # The larger the better, but the longer it will take to compute.
                self._hyperparam_search_space[hyperparam] = [
                    2, 5, 10, 15, 20, 25
                ]
        elif hyperparam == 'n_iter':
            # RandomizedSearchCV throws ValueError if n_iter is less than the
            # number of hyperparam options.
            num_hyperparam_settings = np.prod([
                len(value)
                for key, value in self._hyperparam_search_space.iteritems()
            ])
            log.debug('num_hyperparam_settings: %s' % num_hyperparam_settings)
            self._hyperparams[hyperparam] = np.min(
                [48, num_hyperparam_settings])
        elif hyperparam == 'n_jobs':
            # SUPPORTED_ALGORITHMS
            # LOGISTIC_REGRESSION parallelization causes multiarray.so to crash.
            # Automatically switch to 1 core so others can ignore this =/
            if self._hyperparams[
                    'algorithm'] == SupervisedClassifier.LOGISTIC_REGRESSION:
                self._hyperparams[hyperparam] = 1
            elif self._hyperparams[
                    'algorithm'] == SupervisedClassifier.REGRESS_AND_ROUND:
                self._hyperparams[hyperparam] = 1
            else:
                self._hyperparams[hyperparam] = -1
        elif hyperparam == 'oob_score':
            # RANDOM_FOREST
            self._hyperparams[hyperparam] = False
        elif hyperparam == 'penalty':
            # LOGISTIC_REGRESSION
            self._hyperparams[hyperparam] = 'l1'
        elif hyperparam == 'presort':
            # DECISION_TREE
            self._hyperparams[hyperparam] = False
        elif hyperparam == 'priors':
            # GAUSSIAN_NAIVE_BAYES
            self._hyperparams[hyperparam] = None
            self._hyperparam_search_space[hyperparam] = [[0.0001, 0.9999],
                                                         [0.001, 0.999],
                                                         [0.01, 0.99],
                                                         [0.05, 0.95],
                                                         [0.1, 0.9],
                                                         [0.25, 0.75],
                                                         [0.5, 0.5],
                                                         [0.75, 0.25],
                                                         [0.9, 0.1],
                                                         [0.95, 0.05],
                                                         [0.99, 0.01],
                                                         [0.999, 0.001],
                                                         [0.9999, 0.0001]]
        elif hyperparam == 'random_state':
            # SUPPORTED_ALGORITHMS
            self._hyperparams[hyperparam] = None
        elif hyperparam == 'scoring':
            # SUPPORTED_ALGORITHMS
            # Assume unbalanced classification problems, so use roc auc.
            # http://scikit-learn.org/stable/modules/grid_search.html#specifying-an-objective-metric
            scorer = make_scorer(roc_auc_score, needs_threshold=True)
            self._hyperparams['scoring'] = scorer
        elif hyperparam == 'solver':
            # LOGISTIC_REGRESSION, NN
            if self._hyperparams[
                    'algorithm'] == SupervisedClassifier.LOGISTIC_REGRESSION or self._hyperparams[
                        'algorithm'] == SupervisedClassifier.REGRESS_AND_ROUND:
                self._hyperparams[hyperparam] = 'saga'
            elif self._hyperparams['algorithm'] == SupervisedClassifier.NN:
                self._hyperparams[hyperparam] = 'adam'
                self._hyperparam_search_space[hyperparam] = [
                    'lbfgs', 'sgd', 'adam'
                ]
        elif hyperparam == 'splitter':
            # DECISION_TREE
            self._hyperparams[hyperparam] = 'best'
        elif hyperparam == 'subsample':
            # XGB
            self._hyperparams[hyperparam] = 0.5
            self._hyperparam_search_space[hyperparam] = [0.6, 0.8, 1.0]
        elif hyperparam == 'tol':
            # LOGISTIC_REGRESSION
            self._hyperparams[hyperparam] = 0.0001
        elif hyperparam == 'warm_start':
            # RANDOM_FOREST
            self._hyperparams[hyperparam] = False
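needs_threshold=True makes the scorer feed decision_function (or predict_proba) output to roc_auc_score instead of hard class predictions; newer scikit-learn releases spell this response_method instead. A minimal standalone sketch of the same scorer in a grid search:

from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import make_scorer, roc_auc_score
from sklearn.model_selection import GridSearchCV

X_demo, y_demo = make_classification(n_samples=500, weights=[0.9, 0.1], random_state=0)
auc_scorer = make_scorer(roc_auc_score, needs_threshold=True)
search = GridSearchCV(LogisticRegression(max_iter=1000),
                      {'C': [0.01, 0.1, 1.0, 10.0, 100.0]},
                      scoring=auc_scorer, cv=3)
search.fit(X_demo, y_demo)
print(search.best_params_, round(search.best_score_, 3))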
Exemple #56
0
            nombres = vectorizer.get_feature_names()
            tablita = mutual_info_classif(X_train, y_train)
            orden = np.argsort(tablita)
            reduc = SelectKBest(mutual_info_classif, k=4001)
            X_train = reduc.fit_transform(X_train, y_train)
            buenos_2 = [(nombres[i], tablita[i]) for i in orden]

        print("   Done!")
        print('     New shape of training matrix: ', X_train.shape)

    jobs = -1
    paramGrid = []
    nIter = 20
    crossV = 10
    # New performance scorer
    myScorer = make_scorer(f1_score, average='weighted')
    print("Defining randomized grid search...")
    if args.classifier == 'SVM':
        # SVM
        classifier = SVC()
        if args.kernel == 'rbf':
            paramGrid = {
                'C': scipy.stats.expon(scale=100),
                'gamma': scipy.stats.expon(scale=.1),
                'kernel': ['rbf'],
                'class_weight': ['balanced', None]
            }
        elif args.kernel == 'linear':
            paramGrid = {
                'C': scipy.stats.expon(scale=100),
                'kernel': ['linear'],
    'vect__tokenizer': [None, stemming_tokenizer],
    'vect__ngram_range': [(1, 1), (1, 2)],
    'svm__kernel': ["linear", "rbf", "poly", "sigmoid"],
    'svm__degree': [2, 3],
    'svm__coef0': [0.0, 1.0],
    'svm__gamma': [1e-2, 1e-3, "auto"],
    'svm__C': [1, 5, 10]
}


## Create a Grid-Search-Cross-Validation object
## to find in an automated fashion the best combination of parameters.
grid_search = GridSearchCV(pipeline,
						   parameters,
						   #scoring=metrics.make_scorer(metrics.average_precision_score, average='weighted'),
						   scoring=metrics.make_scorer(metrics.matthews_corrcoef),
						   cv=10,
						   n_jobs=-1,
						   verbose=10)

## Start an exhaustive search to find the best combination of parameters
## according to the selected scoring-function.
print()
grid_search.fit(X_train, Y_train)
print()

## Print results for each combination of parameters.
number_of_candidates = len(grid_search.cv_results_['params'])
print("Results:")
for i in range(number_of_candidates):
	print(i, 'params - %s; mean - %0.3f; std - %0.3f' %
def semantic_loss(truth, pred):
    """Computes a loss based on semantic (WordNet path) similarity."""
    error = 0
    for i in range(0, len(pred)):
        truth_i = wn.synsets(truth[i])[0]
        pred_i = wn.synsets(pred[i])[0]
        error_i = truth_i.path_similarity(pred_i)
        error += error_i
    return error

# Grab our data
X = df['Color'].tolist()
y = df['Emotion'].tolist()

# Make scoring function
scorer = make_scorer(semantic_loss, greater_is_better=False)

# Create new knn model
knn = KNeighborsClassifier()

# Specify values to test 
param_grid = {'n_neighbors': np.arange(4, 25), 'weights':['uniform', 'distance']}

# Use gridsearch to test all values for n_neighbors
knn_gscv = GridSearchCV(knn, param_grid, cv=5, scoring=scorer)

# Fit model to data
knn_gscv.fit(X, y)

# Check out best params
print(knn_gscv.best_params_)
Exemple #59
0
            rel_id = label_encoder.transform([rel])[0]
            #print(rel_id,rel)
            stats_rel = [stat[rel_id] for stat in stats]
            results[rel].append(stats_rel)
    for rel in label_encoder.classes_:
        results[rel] = average_results(results[rel])
        if verbose:
            print_statistics_row(rel, results[rel])
    avg_result = macro_average_results(results)
    if verbose:
        print_statistics_footer(avg_result)
    return avg_result[2]  # return f_0.5 score as summary statistic


# A check using the macro-averaged F0.5 score
f_scorer = make_scorer(fbeta_score, beta=0.5, average='macro')


def evaluateCV_check(classifier, X, y, verbose=True):
    if verbose:
        kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
        scores = cross_val_score(classifier, X, y, cv=kfold, scoring=f_scorer)
        print("\nCross-validation scores (StratifiedKFold): ", scores)
        print("Mean cv score (StratifiedKFold): ", scores.mean())


#########################################################################################
# 4. TEST PREDICTIONS and ANALYSIS
#########################################################################################

Exemple #60
0
print total_features

### Extract features and labels from dataset for local testing

data = featureFormat(my_dataset, features_list, sort_keys=True)
labels, features = targetFeatureSplit(data)
### Build a pipeline
scaler = MinMaxScaler()
skb = SelectKBest(f_classif)
gnb = GaussianNB()
pipeline = Pipeline(steps=[("scaling", scaler), ("SKB",
                                                 skb), ("NaiveBayes", gnb)])
SKB_params = {"SKB__k": range(1, 10)}
cv = StratifiedShuffleSplit(labels, n_iter=100, random_state=42)
# Use Kappa_scorer as a metric to evaluate
kappa_scorer = make_scorer(cohen_kappa_score)
gs = GridSearchCV(pipeline, SKB_params, scoring=kappa_scorer, cv=cv)
gs.fit(features, labels)
print "best # of parameters to choose:", gs.best_params_
clf = gs.best_estimator_
# Get the features selected by KBest
clf.named_steps['SKB'].get_support(indices=True)
features_selected = [
    features_list[1:][i]
    for i in clf.named_steps['SKB'].get_support(indices=True)
]
print features_selected

feature_score = clf.named_steps['SKB'].scores_
score_summary = {}
for i in range(len(feature_score)):