Esempio n. 1
0
def check_norm_dist(
    features,
    target,
    params,
):
    """Show probability plots for the score differences of two XGBoost models.

    Draws 150 random subsamples holding 70% of the rows. On each subsample
    two ``XGBClassifier`` models (``max_depth=4``) that differ only in
    ``n_estimators`` are scored with 10-fold cross-validated ROC AUC. For
    every subsample the difference between the two mean scores and the
    variance of the fold-wise score differences are collected. The variances
    are drawn in a chi-square probability plot and the mean differences in a
    normal probability plot.

    Args:
        features: Feature table; rows are selected with ``.iloc``.
        target: Labels aligned row by row with ``features``.
        params: Indexable whose items 0 and 1 are the ``n_estimators``
            values of the first and second model.

    Returns:
        Tuple ``(means, S)`` of two lists with one entry per subsample:
        the mean-score differences (first model minus second) and the
        variances of the fold-wise score differences.
    """
    rs = ss(n_splits=150, train_size=.7, random_state=0)

    # Select labels by position, exactly like ``features.iloc``. Label-based
    # ``target[train_index]`` returns the wrong rows (or raises KeyError)
    # when target is a pandas Series whose index is not 0..n-1.
    labels = np.asarray(target)
    estimator_counts = (params[0], params[1])
    fold_scores = ([], [])

    for train_index, _ in rs.split(features):
        x_sub = features.iloc[train_index]
        y_sub = labels[train_index]
        # Both models are evaluated on the same subsample, first model first.
        for collected, n_estimators in zip(fold_scores, estimator_counts):
            classifier = XGBClassifier(n_estimators=n_estimators,
                                       max_depth=4,
                                       silent=1)
            collected.append(cross_val_score(classifier,
                                             x_sub,
                                             y_sub,
                                             scoring='roc_auc',
                                             cv=10))

    # Rows are subsamples, columns are cross-validation folds.
    scores_0 = np.asarray(fold_scores[0])
    scores_1 = np.asarray(fold_scores[1])

    means = list(scores_0.mean(axis=1) - scores_1.mean(axis=1))
    S = list((scores_0 - scores_1).var(axis=1))

    pylab.subplot(1, 2, 1)
    # NOTE(review): the chi2 shape parameter 150 equals the number of
    # subsamples, while each variance in S is computed from 10 fold
    # differences -- confirm the intended degrees of freedom.
    stats.probplot(S, dist="chi2", sparams=(150,), plot=pylab)

    pylab.subplot(1, 2, 2)
    stats.probplot(means, dist="norm", plot=pylab)
    pylab.show()

    return means, S
Esempio n. 2
0
def load_data():
    """Load the training images and split them into train/validation parts.

    Each entry of ``tr_dir`` is treated as one class folder: its ``*.jpg``
    files are labelled with the folder's position in the directory listing.
    The file list and labels are shuffled, every image is read with
    ``get_im_cv2``, the image array is reordered with axes ``(0, 3, 1, 2)``
    and scaled by 1/255, and the labels are one-hot encoded with ``c``
    classes. A single stratified shuffle split holds out 10% of the samples.

    Returns:
        Tuple ``(x_t, y_t, x_v, y_v, y_f)``: training images, one-hot
        training labels, held-out images, one-hot held-out labels and the
        integer training labels.
    """
    image_paths = []
    labels = []

    for class_index, entry in enumerate(os.listdir(tr_dir)):
        folder = os.path.join(tr_dir, entry)
        print('load folder {} (Index:{})'.format(folder, class_index))
        pattern = os.path.join(folder, '*.jpg')
        matches = sorted(glob.glob(pattern))
        image_paths.extend(matches)
        labels.extend([class_index] * len(matches))

    # The identically seeded shuffle is deliberately applied twice in a row;
    # dropping one pass would change the resulting sample order.
    for _ in range(2):
        image_paths, labels = shuffle(image_paths, labels, random_state=9999)
    print("SHUFFLED TRAINING FILES")

    images = [get_im_cv2(path) for path in image_paths]

    print("Collected data TRAINING FILES")

    X_train = np.array(images, dtype=np.uint8)
    y_flat = np.array(labels, dtype=np.uint8)
    # presumably converts channels-last images to channels-first -- TODO
    # confirm against what get_im_cv2 returns
    X_train = X_train.transpose((0, 3, 1, 2)).astype('float32') / 255
    y_train = np_utils.to_categorical(y_flat, c)
    print("PREPARED TRAINING DATA")
    from sklearn.model_selection import StratifiedShuffleSplit as ss
    splitter = ss(n_splits=1, test_size=0.1, random_state=0)
    # n_splits=1, so the split generator yields exactly one index pair.
    train_index, test_index = next(splitter.split(X_train, y_train))

    x_t = X_train[train_index]
    x_v = X_train[test_index]
    y_t = y_train[train_index]
    y_v = y_train[test_index]
    y_f = y_flat[train_index]

    return x_t, y_t, x_v, y_v, y_f
Esempio n. 3
0
    def __init__(self,
                 Cs=500,
                 cv=10,
                 sampler='skf',
                 solver='liblinear',
                 **kwargs):
        """Store the settings and build the cross-validation splitter.

        Args:
            Cs: Stored unchanged as ``self.Cs``.
            cv: Number of splits; stored as ``self.cv_folds`` and passed to
                the splitter as ``n_splits``.
            sampler: Splitter code: ``'skf'`` (stratified K-fold), ``'sss'``
                (stratified shuffle split), ``'kf'`` or ``'ss'`` (the
                respective non-stratified methods).
            solver: Stored unchanged as ``self.solver``.
            **kwargs: Every item is set as an attribute on the instance
                after the splitter is built, so it may override any
                attribute assigned before it.

        Raises:
            ValueError: If ``sampler`` is not one of the supported codes.
        """
        # NOTE(review): super(self.__class__, self) recurses endlessly as
        # soon as this class is subclassed; replace it with an explicit
        # class name (or zero-argument super() on Python 3) once confirmed.
        super(self.__class__, self).__init__()

        self.penalty = 'l1'
        self.solver = solver
        self.Cs = Cs
        self.sampler = sampler
        self.cv_folds = cv

        # Kept as an if/elif chain so that only the selected splitter alias
        # is looked up.
        if self.sampler == 'skf':
            self.cv = skf(n_splits=self.cv_folds)

        elif self.sampler == 'sss':
            self.cv = sss(n_splits=self.cv_folds)

        elif self.sampler == 'kf':
            self.cv = kf(n_splits=self.cv_folds)

        elif self.sampler == 'ss':
            self.cv = ss(n_splits=self.cv_folds)

        else:
            # The message now names "kf", the code that is actually accepted
            # (it used to advertise the non-existent "sk").
            raise ValueError(
                'Selected sampler is not valid. Please choose '
                '"skf" for stratified K-fold or "sss" for '
                'stratified shuffle split. Also "kf" and "ss" for '
                'the respective non-stratified methods.')

        for k, v in kwargs.items():
            setattr(self, k, v)

        self.x = None
        self.y = None
Esempio n. 4
0
def cv_reg(x, test_size=0.2, n_splits=5, random_state=None):
    """Return the train/test index splits produced by ``ss`` for ``x``.

    Args:
        x: Data handed to the splitter's ``split`` method.
        test_size: Passed to the splitter as ``test_size``.
        n_splits: Passed to the splitter as ``n_splits``.
        random_state: Passed to the splitter as ``random_state``.

    Returns:
        Whatever ``ss(...).split(x)`` returns.
    """
    # presumably ``ss`` is scikit-learn's ShuffleSplit -- verify against the
    # file's imports. Every argument is passed by keyword because current
    # scikit-learn splitters accept only ``n_splits`` positionally.
    splitter = ss(n_splits=n_splits,
                  test_size=test_size,
                  random_state=random_state)
    return splitter.split(x)
Esempio n. 5
0
def cv_reg(x, test_size=0.2, n_splits=5, random_state=None):
    """Return the train/test index splits produced by ``ss`` for ``x``.

    Args:
        x: Data handed to the splitter's ``split`` method.
        test_size: Passed to the splitter as ``test_size``.
        n_splits: Passed to the splitter as ``n_splits``.
        random_state: Passed to the splitter as ``random_state``.

    Returns:
        Whatever ``ss(...).split(x)`` returns.
    """
    # presumably ``ss`` is scikit-learn's ShuffleSplit -- verify against the
    # file's imports. Every argument is passed by keyword because current
    # scikit-learn splitters accept only ``n_splits`` positionally.
    splitter = ss(n_splits=n_splits,
                  test_size=test_size,
                  random_state=random_state)
    return splitter.split(x)


def timeit(klass, params, x, y):