Esempio n. 1
0
                    break
                # convergence not reached yet: continue from the updated estimate
                mu = mu_new
            return mu

        # vectorized function: apply the per-column (1-D) estimator along
        # `axis` of `arr`, forwarding the iteration controls.
        # NOTE(review): the def line of the enclosing estimator is outside
        # this view — parameter semantics inferred from usage; verify.
        return np.apply_along_axis(unit_loc_estimator,
                                   axis,
                                   arr,
                                   maxiter=maxiter,
                                   tor=tor)

    def fit(self, X, axis=0, maxiter=50, tor=0.001):
        """Estimate robust location and scale of X along the given axis.

        :param X: array-like data to fit
        :param axis: axis along which the estimators are applied
        :param maxiter: maximum number of iterations per estimator
        :param tor: tolerance used as the convergence criterion
        :return: tuple (location estimate, scale estimate)
        """
        kwargs = dict(axis=axis, maxiter=maxiter, tor=tor)
        return (self.loc_estimator(X, **kwargs),
                self.scale_estimator(X, **kwargs))


if __name__ == '__main__':
    # Smoke test: compute the robust M-estimates alongside the classical
    # sample statistics on simulated data and report the elapsed time.
    estimator = MEstimator()
    X_train, y_train, X_test, y_test = simulation_setup(
        n_i=1000, n_o=200, n_t=1000, p=10, sigma_e=0.25)

    start = time.time()
    m_estimated_scale = estimator.scale_estimator(X_train, maxiter=50)
    m_estimated_loc = estimator.loc_estimator(X_train, maxiter=50)
    standard_deviation = np.std(X_train, ddof=1, axis=0)
    median = np.median(X_train, axis=0)
    print('consumed time: %.5f s' % (time.time() - start))
Esempio n. 2
0
        """
        computes accuracy of the model

        :param X_test: ndarray, shape(n_samples, n_features)
                    Test data

        :param y_test: ndarray, shape(n_samples,)
                    Labels of test data

        :param prob_threshold: double, default: 0.5
                    probability threshold for determining the predicted labels

        :return: double
                accuracy of the logistic regression model.
        """
        y_test = np.array(y_test, dtype=int)
        y_predict = self.predict(X_test, prob_threshold=prob_threshold)
        accuracy = np.mean(y_predict == y_test)
        return accuracy


if __name__ == '__main__':
    # Fit a classical bootstrap model on simulated data and report its score.
    data_train, data_test, beta_actual = simulation_setup(
        n_i=1000, n_o=200, n_t=1000, p=10, sigma_e=0.25)
    X_train = data_train[:, :-1]
    y_train = data_train[:, -1]
    X_test = data_test[:, :-1]
    y_test = data_test[:, -1]
    model = ClassicalBootstrap()
    model.fit(X_train, y_train)
    print("classical bootstrap score: ", model.score(X_test, y_test))

Esempio n. 3
0
# Sweep the number of strata and collect stratified-bootstrap accuracies.
# NOTE(review): `np`, `p`, `StratifiedBootstrap`, `ClassicalBootstrap` and
# `simulation_setup` are used here but defined/imported above this fragment
# — verify against the full file.
n_strata_arr = np.arange(2, 16, 2, dtype=int)

# lambda_ = np.arange(0.05, 0.55, 0.05)
n_i = 1000
# n_o_arr = (n_i * lambda_).astype(int)

score_strat_arr = []
# score_boot_arr = []

# for n_o in n_o_arr:
for n_strata in n_strata_arr:
    score_strat = []
    # score_boot = []
    # 10 independent repetitions per strata count to average out simulation noise
    for i in range(10):
        X_train, y_train, X_test, y_test = simulation_setup(n_i=n_i,
                                                            n_o=200,
                                                            n_t=n_i,
                                                            p=p)

        # stratified
        strat = StratifiedBootstrap()
        strat.fit(X_train,
                  y_train,
                  n_bootstrap=5,
                  n_strata=n_strata,
                  fast=True)
        score_strat.append(strat.score(X_test, y_test))

        # Bootstrap
        boot = ClassicalBootstrap()
        boot.fit(X_train, y_train, n_bootstrap=5)
        # score_boot.append(boot.score(X_test, y_test))
Esempio n. 4
0
    def predict(self, X_test):
        """Return model probabilities for X_test using the fitted coefficients.

        :param X_test: test data forwarded to self.probability
        :return: result of self.probability(self.beta, X_test)
        :raises ValueError: if the model has not been fitted yet
        """
        beta = self.beta
        if beta is None:
            raise ValueError("MLE Model is not fitted yet")
        # NOTE: intercept augmentation is intentionally disabled here:
        # X_test = np.concatenate((np.ones(X_test.shape[0], 1), X_test), axis=1)
        return self.probability(beta, X_test)

    def accuracy(self, X_test, y_test, prob_threshold=0.5):
        """Fraction of test samples whose thresholded prediction matches y_test.

        :param X_test: test data passed to self.predict
        :param y_test: true labels; cast to int before comparing
        :param prob_threshold: probability cut-off for the positive class
        :return: mean agreement between predicted and true labels
        """
        true_labels = np.asarray(y_test).astype(int)
        predicted_labels = (self.predict(X_test) >= prob_threshold).astype(int)
        return np.mean(predicted_labels == true_labels)


if __name__ == '__main__':
    # Compare the custom MLE fit against sklearn's LogisticRegression
    # on simulated, outlier-free data (n_o=0).
    data_train, data_test, beta_actual = simulation_setup(
        n_i=1000, n_o=0, n_t=1000, p=20)
    X_train = data_train[:, :-1]
    y_train = data_train[:, -1].astype(int)
    X_test = data_test[:, :-1]
    y_test = data_test[:, -1].astype(int)

    mle = MLE()
    mle.fit(X_train, y_train)
    acc = mle.accuracy(X_test, y_test)
    print('accuracy:', acc)
    print('fitted coefficients: \n', mle.beta)
    print('actual coefficients: \n', beta_actual)

    lr = LogisticRegression(fit_intercept=False, solver='lbfgs')
    lr.fit(X_train, y_train)
    score = lr.score(X_test, y_test)
    print('LR accuracy: ', score)
    print('LR coefficients: \n', lr.coef_[0])
    print('RMSE: ', np.sqrt(np.mean((mle.beta - lr.coef_[0])**2)))
Esempio n. 5
0
# Benchmark: IFB accuracy as a function of the quantile factor, compared
# against plain logistic regression and the classical bootstrap.
import numpy as np
import matplotlib.pyplot as plt
import matplotlib
matplotlib.use('MacOSX')
from rblr.influence_function_bootstrap import IFB
from rblr.classical_bootstrap import ClassicalBootstrap
from sklearn.linear_model import LogisticRegression
from rblr.simulation_setup import simulation_setup

matplotlib.rcParams['font.size'] = 10

# 20 quantile factors evenly spaced over [0.1, 1]
q = np.linspace(0.1, 1, 20)
# number of bootstrap replications per fit
b = 10
X_train, y_train, X_test, y_test = simulation_setup(n_i=1000, n_o=200, p=8)

# baseline 1: plain logistic regression
lr = LogisticRegression(solver='lbfgs')
lr.fit(X_train, y_train)
score_lr = lr.score(X_test, y_test)

# baseline 2: classical bootstrap
class_boot = ClassicalBootstrap()
class_boot.fit(X_train, y_train, n_bootstrap=b)
score_class_boot = class_boot.score(X_test, y_test)

# IFB score for each quantile factor
score_ifb = []
for q_ in q:
    ifb = IFB(c=None, gamma=5)
    ifb.fit(X_train, y_train, n_bootstrap=b, quantile_factor=q_)
    score_ifb.append(ifb.score(X_test, y_test))

# plot the sweep (fragment is truncated here; figure handling continues below)
f1 = plt.figure(1, figsize=(7, 4.8))
plt.plot(q, score_ifb, label='IFB')
Esempio n. 6
0
# Sweep of the IFB quantile factor on simulated data.
# NOTE(review): `np` (and LogisticRegression in the commented-out code) are
# used below but not imported in this fragment — presumably imported above
# this view; verify against the full file.
from rblr.simulation_setup import simulation_setup
from rblr.influence_function_bootstrap import IFB
from sklearn.metrics import precision_recall_fscore_support
from rblr.preprocessing import Preprocessor

import matplotlib.pyplot as plt

# simulation parameters: inliers, outliers, test size, dimensionality, noise
n_i = 1000
n_o = 200
n_t = 1000
p = 20
sigma_e = 0.25
quantile_factor = np.arange(0.5, 1.0, 0.05)
X_train, y_train, X_test, y_test = simulation_setup(n_i=n_i,
                                                    n_o=n_o,
                                                    n_t=n_t,
                                                    p=p,
                                                    sigma_e=sigma_e)
# print("Simulation setup: inliers: {}; outliers: {}; dimensions: {}".format(n_i, n_o, n_t, p, sigma_e))

# def get_metrics_matrix(quantile_factor):
#     metrics_matrix = np.empty((len(quantile_factor), 3, 4))
#     for i, q in enumerate(quantile_factor):
#         unit_matrix = np.zeros((3, 4))
#         clf_ifb = IFB()
#         clf_ifb.fit(X_train, y_train, quantile_factor=q)
#         beta_ifb = clf_ifb.beta
#
#         clf_lr = LogisticRegression(fit_intercept=False, solver='lbfgs')
#         clf_lr.fit(X_train, y_train)
#
Esempio n. 7
0
# Robust location/scale estimation demo on simulated data with heavy outliers.
# NOTE(review): `simulation_setup` and `MEstimator` are used below but not
# imported in this fragment — presumably imported above this view; verify.
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import matplotlib
matplotlib.use('Qt5Agg')
plt.style.use('ggplot')

# simulation sizes: 10k inliers, 1k outliers, 20 features
n_i = 10000
n_o = 1000
p = 20

data_train, data_test, beta = simulation_setup(n_i=n_i,
                                               n_o=n_o,
                                               p=p,
                                               sigma_o=100,
                                               sigma_e=0.1)
df_train = pd.DataFrame(data=data_train)
df_test = pd.DataFrame(data=data_test)
# df_train = pd.read_csv('data_train.csv')

# column `p` (the last one) holds the labels; cast them to int
df_train[p] = df_train[p].astype(int)

# store the numpy array representation of the dataframe
X = df_train.drop([p], axis=1).values

me = MEstimator()
# estimated location of inliers
loc_estimated = me.loc_estimator(X)
scale_estimated = me.scale_estimator(X)
Esempio n. 8
0
        # rows flagged as outliers (inlier_flag is computed above this view)
        X_out = X[~inlier_flag]

        if y is None:
            # unlabeled data: return inliers (and optionally the outliers)
            if return_outliers:
                return X_in, X_out
            else:
                return X_in
        else:
            # labeled data: split the labels with the same inlier mask
            y_in, y_out = y[inlier_flag], y[~inlier_flag]
            if return_outliers:
                return X_in, y_in, X_out, y_out
            else:
                return X_in, y_in

    def fit_transform(self, X, y=None, return_outliers=False, n_inliers=None):
        """Fit the preprocessor on (X, y) and immediately transform the data.

        :param X: input samples
        :param y: optional labels, forwarded to fit and transform
        :param return_outliers: if True, transform also returns the outliers
        :param n_inliers: forwarded unchanged to transform
        :return: whatever self.transform returns for these arguments
        """
        self.fit(X, y)
        result = self.transform(
            X, y, return_outliers=return_outliers, n_inliers=n_inliers)
        return result


if __name__ == '__main__':
    # Quick check: filter a heavily contaminated sample (800 outliers per
    # 1000 inliers) and report how many inliers remain.
    data = simulation_setup(n_i=1000, n_o=800, p=8)[0]
    X = data[:, :-1]
    y = data[:, -1]
    preprocessor = Preprocessor()
    X_in = preprocessor.fit_transform(X)
    print(X_in.shape)