Code Example #1
from imblearn import under_sampling


def under_sample_train(x_train, y_train, random=False, seed=666):
    """Under-sample the majority class, randomly or with NearMiss-2."""
    if random:
        # Random under-sampling: drop majority-class samples at random.
        model_under_sample = under_sampling.RandomUnderSampler(
            random_state=seed)
    else:
        # NearMiss-2 keeps the majority samples whose average distance to
        # the farthest minority samples is smallest. It is deterministic,
        # so it takes no random_state (the argument was removed from
        # NearMiss in later imbalanced-learn releases).
        model_under_sample = under_sampling.NearMiss(version=2, n_jobs=-1)
    x_train, y_train = model_under_sample.fit_resample(x_train, y_train)
    return x_train, y_train
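
A minimal usage sketch for the function above; the synthetic dataset and the 9:1 class ratio are illustrative assumptions, not part of the original example:

from collections import Counter

from sklearn.datasets import make_classification

# Hypothetical imbalanced dataset: roughly 90% class 0, 10% class 1.
X, y = make_classification(n_samples=1000, weights=[0.9, 0.1],
                           random_state=0)
print(Counter(y))  # e.g. Counter({0: 897, 1: 103})

X_res, y_res = under_sample_train(X, y, random=True, seed=42)
print(Counter(y_res))  # balanced class counts after under-sampling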
Code Example #2
from imblearn import under_sampling as us


def f_NearMiss(X_train, y_train, seed):
    """Under-sample the majority class with NearMiss-3.

    NearMiss is deterministic, so ``seed`` is unused; it is kept only for
    signature compatibility (``random_state`` and ``return_indices`` were
    removed from NearMiss in later imbalanced-learn releases).

    Returns the resampled X_train, y_train.
    """
    nm = us.NearMiss(version=3, n_neighbors=10)
    X_train, y_train = nm.fit_resample(X_train, y_train)
    return X_train, y_train
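
For context, a hedged sketch comparing the three NearMiss variants on the same data; the synthetic dataset is an assumption added for illustration:

from collections import Counter

from imblearn import under_sampling as us
from sklearn.datasets import make_classification

X, y = make_classification(n_samples=2000, weights=[0.95, 0.05],
                           random_state=0)
for version in (1, 2, 3):
    # NearMiss-1 keeps majority samples closest on average to the nearest
    # minority samples, NearMiss-2 those closest on average to the farthest
    # minority samples, and NearMiss-3 first short-lists the majority
    # neighbors of each minority sample before selecting.
    nm = us.NearMiss(version=version, n_neighbors=3)
    X_res, y_res = nm.fit_resample(X, y)
    print(version, Counter(y_res))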
Code Example #3
File: Kaggle.py  Project: lilujunai/DLToolbox
import random

import numpy as np
from imblearn import under_sampling
from mxnet import nd
from mxnet.gluon import data as mxdata


def init(bsize):
    # load() is defined elsewhere in the project.
    data, label = load("Kaggle.npz")
    # Convert to spherical coordinates:
    # norm = np.sqrt(np.sum(data**2, axis=1, keepdims=True))
    # ag = data / norm
    # data = np.concatenate([data, norm, ag], axis=1)
    # Use the sin and cos information:
    # data = np.concatenate([np.sin(data), np.cos(data)], axis=1)

    # Under-sample to balance the classes.
    cr = under_sampling.NearMiss()
    data, label = cr.fit_resample(data, label)
    # Over-sample to balance the classes:
    # ocr = over_sampling.ADASYN()
    # data, label = ocr.fit_resample(data, label)
    # Shuffle.
    idx = list(range(len(data)))
    random.shuffle(idx)
    data, label = data[idx], label[idx]
    # One-hot encode the labels (assumed to take the values 1 and 2).
    olabel = np.zeros(shape=(len(label), 2))
    for i, l in enumerate(label):
        olabel[i][int(l - 1)] = 1

    # Type conversion.
    data = data.astype("float32")
    olabel = olabel.astype("float32")

    # Roughly 77/23 train/test split.
    train_sum = int(len(data) / 1.3)
    tdata, tlabel = data[:train_sum], olabel[:train_sum]
    test_data, test_label = data[train_sum:], olabel[train_sum:]
    train_set = mxdata.ArrayDataset(nd.array(tdata), nd.array(tlabel))
    test_set = mxdata.ArrayDataset(nd.array(test_data), nd.array(test_label))

    # Data loaders.
    train_loader = mxdata.DataLoader(train_set, batch_size=bsize)
    test_loader = mxdata.DataLoader(test_set, batch_size=bsize)

    return train_loader, test_loader
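
A hedged usage sketch; it assumes Kaggle.npz is available to load() and that mxnet is installed:

train_loader, test_loader = init(bsize=64)
for batch_data, batch_label in train_loader:
    # Each batch is a pair of mxnet NDArrays, e.g. (64, D) and (64, 2).
    print(batch_data.shape, batch_label.shape)
    break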
Code Example #4
from imblearn import over_sampling, under_sampling
from sklearn.ensemble import RandomForestClassifier

dce_features = ['ese-dce']
# Define the extension of each feature
ext_features = ['_ese__dce.npy']
# Define the paths of the prostate ground truth
path_gt = ['GT_inv/prostate', 'GT_inv/pz', 'GT_inv/cg', 'GT_inv/cap']
# Define the labels of the ground truth which will be provided
label_gt = ['prostate', 'pz', 'cg', 'cap']
# Define the path where the data will be stored
path_store = '/data/prostate/balanced/mp-mri-prostate/exp-3'

N_JOBS = -1
# Create the list of under-samplers and over-samplers to use
samplers = [
    under_sampling.InstanceHardnessThreshold(
        n_jobs=N_JOBS, estimator=RandomForestClassifier(n_jobs=N_JOBS)),
    under_sampling.NearMiss(version=1, n_jobs=N_JOBS),
    under_sampling.NearMiss(version=2, n_jobs=N_JOBS),
    under_sampling.NearMiss(version=3, n_jobs=N_JOBS),
    under_sampling.RandomUnderSampler(),
    over_sampling.SMOTE(n_jobs=N_JOBS),
    # BorderlineSMOTE replaces SMOTE(kind='borderline*') in
    # imbalanced-learn >= 0.6.
    over_sampling.BorderlineSMOTE(kind='borderline-1', n_jobs=N_JOBS),
    over_sampling.BorderlineSMOTE(kind='borderline-2', n_jobs=N_JOBS),
    over_sampling.RandomOverSampler()
]
# Define the sub-folder used for each sampler
sub_folder = [
    'iht', 'nm1', 'nm2', 'nm3', 'rus', 'smote', 'smote-b1', 'smote-b2', 'ros'
]

# Generate the different paths to be treated later
path_patients_list_gt = []
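
The snippet ends before these lists are consumed; a plausible pairing loop is sketched below, where the loop body is a hypothetical placeholder rather than code recovered from the original file:

import os

for sampler, folder in zip(samplers, sub_folder):
    # Hypothetical: each sampler's output would be stored under its own
    # sub-folder of path_store.
    out_dir = os.path.join(path_store, folder)
    print(type(sampler).__name__, '->', out_dir)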
Code Example #5
import pandas as pd
from imblearn import combine, over_sampling, under_sampling
from sklearn.ensemble import (BaggingClassifier, GradientBoostingClassifier,
                              RandomForestClassifier)
from sklearn.linear_model import LogisticRegressionCV
from sklearn.svm import SVC

# train/test frames, train_nan/test_nan, and impute() are defined earlier
# in the script.
# test: fill missing values from the matched training rows
for i in test_nan.index:
    fill_col, id_ = impute(i, train, test_nan)
    test_nan.loc[i, fill_col] = train.loc[id_, fill_col]

# train
train = pd.concat([train, train_nan], axis=0)
del train_nan
# test
test = pd.concat([test, test_nan], axis=0)
del test_nan

y = train['renewal']
x = train.drop('renewal', axis=1)

ros = over_sampling.ADASYN()
rus = under_sampling.NearMiss()
rcs = combine.SMOTEENN()
rcs2 = combine.SMOTETomek()

log = BaggingClassifier(LogisticRegressionCV(Cs=6))
rf = BaggingClassifier(RandomForestClassifier())
gbc = BaggingClassifier(
    GradientBoostingClassifier(n_estimators=250, learning_rate=0.01))
sv = SVC(C=0.8, probability=True)
for sample, sample_name in zip([ros, rus, rcs, rcs2],
                               ['ros', 'rus', 'rcs', 'rcs2']):
    print(sample_name)
    x_rs, y_rs = sample.fit_resample(x, y)
    for model, model_name in zip([log, rf, gbc], ['log', 'rf', 'gbc']):
        model.fit(x_rs, y_rs)
        filename = 'C:/Users/cheekati/Desktop/ml/AV Mck/' + str(
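
The snippet breaks off while building a save path, so what follows is unknown. Purely as a hedged sketch of a plausible pattern (persisting each fitted model per sampler/model pair), with a hypothetical helper, directory, and naming scheme:

import joblib

def save_model(model, sample_name, model_name, out_dir='models'):
    # Hypothetical helper: persist a fitted model under a
    # sampler/model-specific filename.
    filename = f'{out_dir}/{sample_name}_{model_name}.pkl'
    joblib.dump(model, filename)
    return filename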
Code Example #6
import pandas as pd
from imblearn import combine as imbcom
from imblearn import over_sampling as imbov
from imblearn import under_sampling as imbus
from sklearn.model_selection import train_test_split


def resample_classes(X,
                     Y,
                     how='und1',
                     random_state=None,
                     test_size=0.3,
                     n_jobs=2,
                     split=True,
                     verbose=True):
    """Resample an imbalanced data set using the strategy named by ``how``.

    ``how`` selects random under-sampling ('und1'), cluster-centroid
    under-sampling ('und2'), NearMiss ('und3'), random over-sampling
    ('over1'), SMOTE ('over2'), ADASYN ('over3'), or SMOTE + Tomek links
    ('comb1'). Returns (X_train, X_test, y_train, y_test) when ``split``
    is True, otherwise (X_res, y_res).
    """
    if how == 'und1':
        if verbose:
            msg = 'Under-sampling the majority class(es) by randomly picking '
            msg += 'samples without replacement'
            print(msg)
        samp = imbus.RandomUnderSampler(random_state=random_state,
                                        replacement=False)
        X_res, y_res = samp.fit_resample(X, Y)
    elif how == 'und2':
        if verbose:
            msg = 'Under-sampling by generating centroids based on clustering '
            msg += 'methods'
            print(msg)
        # Centroids are generated by a default KMeans estimator.
        samp = imbus.ClusterCentroids(random_state=random_state)
        X_res, y_res = samp.fit_resample(X, Y)
    elif how == 'und3':
        if verbose:
            print('Under-sampling based on NearMiss methods')
        # NearMiss is deterministic; random_state, return_indices, and the
        # size_ngh/ver3_samp_ngh aliases were removed in later
        # imbalanced-learn releases.
        samp = imbus.NearMiss(version=1,
                              n_neighbors=3,
                              n_neighbors_ver3=3,
                              n_jobs=n_jobs)
        X_res, y_res = samp.fit_resample(X, Y)
    elif how == 'over1':
        if verbose:
            msg = 'Over-sampling the minority class(es) by picking samples at '
            msg += 'random with replacement'
            print(msg)
        samp = imbov.RandomOverSampler(random_state=random_state)
        X_res, y_res = samp.fit_resample(X, Y)
    elif how == 'over2':
        if verbose:
            msg = 'Over-sampling using SMOTE - Synthetic Minority Over-sampling '
            msg += 'Technique'
            print(msg)
        X_res, y_res = X, Y
        for i in range(3):
            # A float sampling_strategy assumes a binary problem.
            samp = imbov.SMOTE(random_state=random_state,
                               sampling_strategy=0.99,
                               k_neighbors=5,
                               n_jobs=n_jobs)
            X_res, y_res = samp.fit_resample(X_res, y_res)
    elif how == 'over3':
        if verbose:
            msg = 'Over-sampling using ADASYN - Adaptive Synthetic Sampling '
            msg += 'Approach for Imbalanced Learning'
            print(msg)
        X_res, y_res = X, Y
        for i in range(3):
            samp = imbov.ADASYN(sampling_strategy=0.93,
                                random_state=random_state,
                                n_neighbors=5,
                                n_jobs=n_jobs)
            X_res, y_res = samp.fit_resample(X_res, y_res)
    elif how == 'comb1':
        if verbose:
            print('Combine over- and under-sampling using SMOTE and Tomek links.')
        X_res, y_res = X, Y
        for i in range(3):
            samp = imbcom.SMOTETomek(sampling_strategy=0.99,
                                     random_state=random_state,
                                     smote=None,
                                     tomek=None,
                                     n_jobs=n_jobs)
            X_res, y_res = samp.fit_resample(X_res, y_res)
    else:
        print('Sampling approach not recognized')
        return

    if verbose:
        # The reporting below assumes a four-class problem.
        print('\t\t\t1\t2\t3\t4')
        val_y = pd.Series(Y).value_counts(sort=False).values
        msg = 'Counts in y_init:\t{}\t{}\t{}\t{} '
        print(msg.format(val_y[0], val_y[1], val_y[2], val_y[3]))
        val_yres = pd.Series(y_res).value_counts(sort=False).values
        msg = 'Counts in y_resamp:\t{}\t{}\t{}\t{} '
        print(msg.format(val_yres[0], val_yres[1], val_yres[2], val_yres[3]))

    if split:
        X_train, X_test, y_train, y_test = train_test_split(
            X_res, y_res, test_size=test_size, random_state=random_state)
        if verbose:
            val_ytr = pd.Series(y_train).value_counts(sort=False).values
            msg = 'Counts in y_train:\t{}\t{}\t{}\t{} '
            print(msg.format(val_ytr[0], val_ytr[1], val_ytr[2], val_ytr[3]))

            val_yte = pd.Series(y_test).value_counts(sort=False).values
            msg = 'Counts in y_test:\t{}\t{}\t{}\t{} '
            print(msg.format(val_yte[0], val_yte[1], val_yte[2], val_yte[3]))

            print('X_train:', X_train.shape, ', X_test:', X_test.shape)

        return X_train, X_test, y_train, y_test
    else:
        return X_res, y_res
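
A minimal usage sketch for resample_classes; the synthetic four-class dataset (matching the four-column count report in verbose mode) is an illustrative assumption:

from sklearn.datasets import make_classification

X, Y = make_classification(n_samples=4000,
                           n_classes=4,
                           n_informative=6,
                           weights=[0.55, 0.25, 0.15, 0.05],
                           random_state=0)
X_train, X_test, y_train, y_test = resample_classes(
    X, Y, how='und1', random_state=0, test_size=0.3)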