Example #1
def _ADASYN(self):
    """ADAptive SYNthetic (ADASYN) is based on the idea of
    adaptively generating minority data samples according to their distributions using K nearest neighbor.
    The algorithm adaptively updates the distribution and
    there are no assumptions made for the underlying distribution of the data."""
    print("before: ", len(self.x_train))
    resampler = uns.InstanceHardnessThreshold(
        sampling_strategy=0.2, random_state=self.seed
    )
    self.X_train_smote2, self.y_train_smote2 = resampler.fit_resample(
        self.x_train, self.y_train
    )
    self.x_train = pd.DataFrame(self.X_train_smote2, columns=self.x_train.columns)
    self.y_train = pd.DataFrame(
        self.y_train_smote2, columns=["Local Relapse Y(1) /N(0)"]
    )
    print("after: ", len(self.x_train))

    # Over-sample the minority class with ADASYN
    adasyn = ADASYN(random_state=self.seed)
    self.X_train_smote, self.y_train_smote = adasyn.fit_resample(
        self.x_train, self.y_train
    )
    print("X_train_SMOTE:\n", self.X_train_smote[1])

    self.x_train = pd.DataFrame(self.X_train_smote, columns=self.x_train.columns)
    self.y_train = pd.DataFrame(
        self.y_train_smote, columns=["Local Relapse Y(1) /N(0)"]
    )

    print("len smote: \n", len(self.X_train_smote))
    print("len new x_train: \n", len(self.x_train))

    number_pos_x = self.y_train.loc[self.y_train["Local Relapse Y(1) /N(0)"] == 1]
    print("number positive responses y_train:\n", len(number_pos_x))
Example #2
from sklearn.ensemble import RandomForestClassifier

from imblearn import over_sampling, under_sampling

path_features = '/data/prostate/extraction/mp-mri-prostate'
# Define a list of the paths where the features are kept
dce_features = ['ese-dce']
# Define the file extension of each feature
ext_features = ['_ese__dce.npy']
# Define the paths of the ground-truth for the prostate
path_gt = ['GT_inv/prostate', 'GT_inv/pz', 'GT_inv/cg', 'GT_inv/cap']
# Define the label associated with each ground-truth
label_gt = ['prostate', 'pz', 'cg', 'cap']
# Define the path where to store the data
path_store = '/data/prostate/balanced/mp-mri-prostate/exp-3'

N_JOBS = -1
# Create the list of under-samplers and over-samplers to use
samplers = [
    under_sampling.InstanceHardnessThreshold(
        estimator=RandomForestClassifier(n_jobs=N_JOBS)),
    under_sampling.NearMiss(version=1, n_jobs=N_JOBS),
    under_sampling.NearMiss(version=2, n_jobs=N_JOBS),
    under_sampling.NearMiss(version=3, n_jobs=N_JOBS),
    under_sampling.RandomUnderSampler(),
    over_sampling.SMOTE(n_jobs=N_JOBS),
    over_sampling.BorderlineSMOTE(kind='borderline-1', n_jobs=N_JOBS),
    over_sampling.BorderlineSMOTE(kind='borderline-2', n_jobs=N_JOBS),
    over_sampling.RandomOverSampler()
]
# Define the sub-folder to use for each sampler
# (the sketch after this list shows how the two lists are paired)
sub_folder = [
    'iht', 'nm1', 'nm2', 'nm3', 'rus', 'smote', 'smote-b1', 'smote-b2', 'ros'
]
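
# A minimal sketch (not part of the original script) of how the two lists above
# could be paired: each sampler is applied to an already-loaded feature matrix
# `X` and label vector `y`, and the balanced arrays are saved under the
# matching sub-folder of `path_store`. The helper name and the output file
# names are illustrative assumptions.
import os
import numpy as np

def balance_and_store(X, y, samplers, sub_folder, path_store):
    for folder, sampler in zip(sub_folder, samplers):
        # Re-balance the data with the current strategy
        X_res, y_res = sampler.fit_resample(X, y)
        # Store the result in the sampler's dedicated sub-folder
        out_dir = os.path.join(path_store, folder)
        os.makedirs(out_dir, exist_ok=True)
        np.save(os.path.join(out_dir, 'data_balanced.npy'), X_res)
        np.save(os.path.join(out_dir, 'label_balanced.npy'), y_res)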

# Generate the different paths to be processed later
def main():
    # test the estimators
    n_jobs = 10
    imblearn_est = under_sampling.InstanceHardnessThreshold(n_jobs=n_jobs)
    wrapped_est = ImblearnWrapper(imblearn_est)
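
ImblearnWrapper is project-specific and not reproduced here; as a rough sanity check of the estimator being wrapped, the sketch below runs InstanceHardnessThreshold directly on a synthetic imbalanced dataset. The make_classification call and the roughly 5% minority fraction are illustrative assumptions.

from sklearn.datasets import make_classification
from imblearn import under_sampling

# Illustrative imbalanced dataset (assumption: roughly 5% minority class).
X, y = make_classification(n_samples=2000, weights=[0.95, 0.05], random_state=42)

# Instance hardness threshold: drop majority samples that a cross-validated
# classifier finds hard to classify correctly.
iht = under_sampling.InstanceHardnessThreshold(random_state=42)
X_res, y_res = iht.fit_resample(X, y)
print("before:", X.shape[0], " after:", X_res.shape[0])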