# Example #1
# 0
def boost_elasticnet(X_train, X_test, y_train, y_test):
    """Grid-search LogitBoost and record one confusion matrix per setting.

    Despite the function name, no elastic-net or bagging model is involved:
    every combination of ``n_estimators`` in (50, 100, 150) and
    ``learning_rate`` in (0.5, 0.75, 1) is fitted with
    ``LogitBoost(random_state=0)`` on the training split and evaluated on the
    test split.

    Args:
        X_train: Training features, passed to ``LogitBoost.fit``.
        X_test: Test features, passed to ``LogitBoost.predict``.
        y_train: Training labels.
        y_test: Test labels. The confusion matrix is computed with
            ``labels=[0, 1]``, so the labels are expected to be 0/1.

    Returns:
        DataFrame with one row per hyperparameter combination and the
        columns 'Estimators', 'Learning Rate' and 'Confusion Matrix'; the
        last column holds the list ``[tn, fp, fn, tp]``.
    """
    columns = ['Estimators', 'Learning Rate', 'Confusion Matrix']
    estimators = [50, 100, 150]
    rates = [0.5, 0.75, 1]

    rows = []
    for n_est in estimators:
        for rate in rates:
            ada = LogitBoost(n_estimators=n_est,
                             learning_rate=rate,
                             random_state=0)
            ada.fit(X_train, y_train)
            predicted_labels = ada.predict(X_test)
            # labels=[0, 1] pins the matrix to 2x2 so it always unpacks
            # into exactly four counts.
            tn, fp, fn, tp = confusion_matrix(y_test,
                                              predicted_labels,
                                              labels=[0, 1]).ravel()
            rows.append({
                'Estimators': n_est,
                'Learning Rate': rate,
                'Confusion Matrix': [tn, fp, fn, tp],
            })

    # DataFrame.append was removed in pandas 2.0 (and copied the frame on
    # every call), so the frame is built once from the collected rows.
    # dtype=object stores each value as given (the int rate 1 is not coerced
    # to 1.0).
    return pd.DataFrame(rows, columns=columns, dtype=object)
def _toy_dataset_test(load_func,
                      test_size=(1. / 3),
                      random_state=0,
                      min_score_train=0.9,
                      min_score_test=0.9):
    """Exercise LogitBoost end to end on one scikit-learn toy dataset.

    ``load_func`` is called without arguments and must return an object
    exposing ``data``, ``target`` and ``target_names``. The samples are split
    with stratification; then, for ``bootstrap`` both True and False, a model
    is fitted and checked for minimum train/test accuracy, output shapes,
    agreement between the last staged outputs and the plain methods, and the
    shape and sign of ``contributions()``.
    """
    dataset = load_func()
    class_names = dataset.target_names
    features = dataset.data
    # Use the class names themselves (not the integer codes) as labels
    labels = class_names[dataset.target]
    n_classes = len(class_names)

    # Stratified, shuffled train/test split
    split = train_test_split(features, labels, test_size=test_size,
                             shuffle=True, stratify=labels,
                             random_state=random_state)
    X_train, X_test, y_train, y_test = split
    n_test = X_test.shape[0]

    # Binary tasks yield a 1-D decision function, multiclass tasks one
    # column per class
    if n_classes == 2:
        decision_shape = (n_test, )
    else:
        decision_shape = (n_test, n_classes)

    def final_stage(stages):
        """Materialize a staged generator and return its last element."""
        return np.asarray(list(stages))[-1]

    for bootstrap in (True, False):
        model = LogitBoost(bootstrap=bootstrap, random_state=random_state)
        model.fit(X_train, y_train)

        # Accuracy must reach the requested minimum on both splits
        score_train = model.score(X_train, y_train)
        score_test = model.score(X_test, y_test)
        assert score_train >= min_score_train, \
            ("Failed with bootstrap=%s: training score %.3f less than %.3f"
             % (bootstrap, score_train, min_score_train))
        assert score_test >= min_score_test, \
            ("Failed with bootstrap=%s: testing score %.3f less than %.3f"
             % (bootstrap, score_test, min_score_test))

        # Output shapes: probabilities are always (n_samples, n_classes)
        probabilities = model.predict_proba(X_test)
        decisions = model.decision_function(X_test)
        assert probabilities.shape == (n_test, n_classes)
        assert decisions.shape == decision_shape

        # The final stage of each staged method must match the plain method
        last_predict = final_stage(model.staged_predict(X_test))
        last_proba = final_stage(model.staged_predict_proba(X_test))
        last_decision = final_stage(model.staged_decision_function(X_test))
        last_score = final_stage(model.staged_score(X_test, y_test))

        np.testing.assert_equal(last_predict, model.predict(X_test))
        np.testing.assert_almost_equal(last_proba,
                                       model.predict_proba(X_test))
        np.testing.assert_almost_equal(last_decision,
                                       model.decision_function(X_test))
        np.testing.assert_almost_equal(last_score,
                                       model.score(X_test, y_test))

        # One non-negative contribution per estimator in the ensemble
        contrib = model.contributions(X_train)
        assert contrib.shape == (model.n_estimators, )
        assert np.all(contrib >= 0)
# Example #3
# 0
# Label and display the current figure. The calls that draw the data are
# above this excerpt; judging by the title it is a t-SNE embedding of the
# training data -- confirm against the preceding code.
plt.title("t-SNE plot of the training data")
plt.xlabel("1st embedding axis")
plt.ylabel("2nd embedding axis")
plt.legend(loc="best", frameon=True, shadow=True)

plt.tight_layout()
plt.show()
plt.close()

# Fit a LogitBoost ensemble of 200 estimators that uses logistic regression
# as its base estimator. X_train / y_train are defined above this excerpt.
lboost = LogitBoost(base_estimator=LogisticRegression(),
                    n_estimators=200,
                    random_state=0)
lboost.fit(X_train, y_train)

# Predict on both splits with the fitted ensemble
y_pred_train = lboost.predict(X_train)
y_pred_test = lboost.predict(X_test)

# Overall accuracy on each split
accuracy_train = accuracy_score(y_train, y_pred_train)
accuracy_test = accuracy_score(y_test, y_pred_test)

print("Training accuracy: %.4f" % accuracy_train)
print("Test accuracy:     %.4f" % accuracy_test)

# Classification reports for both splits
report_train = classification_report(y_train, y_pred_train)
report_test = classification_report(y_test, y_pred_test)
print("Training\n%s" % report_train)
print("Testing\n%s" % report_test)

# 1-based boosting iteration numbers, one per estimator. Not used within this
# excerpt -- presumably the x-axis for the staged accuracies plotted further
# down; verify against the rest of the script.
iterations = np.arange(1, lboost.n_estimators + 1)
# Training score yielded by staged_score, collected into a list
staged_accuracy_train = list(lboost.staged_score(X_train, y_train))
# Example #4
# 0
def main(plot=True, M=8, n_fold=10):
	"""Compare AdaBoost and LogitBoost on the abalone files via cross-validation.

	For every T in 100, 200, ..., 100*M both ensembles are scored with
	``n_fold``-fold cross-validation on the training file. For each model the
	T with the lowest mean CV error is used to refit on the whole training
	set, and the resulting train and test errors are printed.

	:param plot: if true, plot the mean CV error of both models (with a band
		of one standard deviation) and save it to ``B.i_cverror.png``
	:param M: number of candidate ensemble sizes; the largest T searched is 100*M
	:param n_fold: number of folds in cross validation
	:return: None
	:raises ValueError: if ``M`` is smaller than 1 (there would be no candidate T)
	"""
	if M < 1:
		raise ValueError("M must be at least 1, got %s" % M)

	train_file, test_file = "datasets/abalone_train_scaled.txt", "datasets/abalone_test_scaled.txt"

	# np.loadtxt already returns a dense array, so no sparse round trip is
	# needed; ndmin=2 keeps the array 2-D even for a one-row file.
	train = np.loadtxt(train_file, delimiter=",", ndmin=2)
	test = np.loadtxt(test_file, delimiter=",", ndmin=2)

	# The last column is the label, every column before it is a feature.
	x_dim = train.shape[1] - 1
	x_train, y_train = train[:, :x_dim], train[:, x_dim]
	x_test, y_test = test[:, :x_dim], test[:, x_dim]

	# One array of per-fold errors for each candidate T
	aboost_fold_errors = []
	lboost_fold_errors = []
	for multiple in range(1, M + 1):
		T = multiple * 100
		print("\nT = %s\t" % T)
		# AdaBoost is left on its default base estimator
		aboost = AdaBoostClassifier(n_estimators=T, random_state=0)
		lboost = LogitBoost(n_estimators=T, random_state=0)
		# n_fold-fold cross validation on the training set
		aboost_cv_results = cross_validate(aboost, x_train, y_train, cv=n_fold)
		lboost_cv_results = cross_validate(lboost, x_train, y_train, cv=n_fold)
		# error = 1 - accuracy, one value per fold
		aboost_fold_errors.append(1 - aboost_cv_results['test_score'])
		lboost_fold_errors.append(1 - lboost_cv_results['test_score'])
	# Stack into one row per candidate T, one column per fold
	aboost_train_cverror = np.stack(aboost_fold_errors)
	lboost_train_cverror = np.stack(lboost_fold_errors)
	print(aboost_train_cverror)
	print(lboost_train_cverror)

	# Pick the T with the lowest mean CV error; row i corresponds to
	# T = (i + 1) * 100.
	a_train_cverror_mean, a_train_cverror_std = aboost_train_cverror.mean(axis=1), aboost_train_cverror.std(axis=1)
	best_T_aboost = int(a_train_cverror_mean.argmin() + 1) * 100
	print("----------------------\n",
		"AdaBoost iteration number T = %s\n" % (best_T_aboost),
		"----------------------\n")
	l_train_cverror_mean, l_train_cverror_std = lboost_train_cverror.mean(axis=1), lboost_train_cverror.std(axis=1)
	best_T_lboost = int(l_train_cverror_mean.argmin() + 1) * 100
	print("----------------------\n",
		"LogitBoost iteration number T = %s\n" % (best_T_lboost),
		"----------------------\n")

	print('Now train with the best T=T* and eval on the test set\n')

	# Refit both models on the whole training set with their best T
	aboost = AdaBoostClassifier(n_estimators=best_T_aboost, random_state=0)
	aboost.fit(x_train, y_train)
	lboost = LogitBoost(n_estimators=best_T_lboost, random_state=0)
	lboost.fit(x_train, y_train)

	# Train and test error of the refitted AdaBoost model
	y_pred_train = aboost.predict(x_train)
	y_pred_test = aboost.predict(x_test)
	a_error_train = 1 - accuracy_score(y_train, y_pred_train)
	a_error_test = 1 - accuracy_score(y_test, y_pred_test)
	print("AdaBoost train error: %s test error: %s" % (a_error_train, a_error_test))

	# Train and test error of the refitted LogitBoost model
	y_pred_train = lboost.predict(x_train)
	y_pred_test = lboost.predict(x_test)
	l_error_train = 1 - accuracy_score(y_train, y_pred_train)
	l_error_test = 1 - accuracy_score(y_test, y_pred_test)
	print("LogitBoost train error: %s test error: %s" % (l_error_train, l_error_test))

	if plot:
		plt.figure()
		x_values = range(100, M * 100 + 1, 100)
		# Mean CV error per T with a band of one standard deviation
		plt.plot(x_values, a_train_cverror_mean, label="AdaBoost")
		plt.fill_between(x_values,
			a_train_cverror_mean + a_train_cverror_std,
			a_train_cverror_mean - a_train_cverror_std,
			alpha=0.5, edgecolor='blue', facecolor='blue')
		plt.plot(x_values, l_train_cverror_mean, label="LogitBoost")
		plt.fill_between(x_values,
			l_train_cverror_mean + l_train_cverror_std,
			l_train_cverror_mean - l_train_cverror_std,
			alpha=0.5, edgecolor='#FF9848', facecolor='#FF9848')
		plt.xlabel('T (number of iterations/classifiers)')
		# Follow the actual fold count instead of a hard-coded "10fold";
		# identical text for the default n_fold=10.
		plt.ylabel('%sfold cross validation train error' % n_fold)
		plt.legend()
		plt.ylim(0, 0.5)
		plt.savefig('B.i_cverror.png')