コード例 #1
0
percentage_list = [4, 10, 25, 35, 50]

# The split is fully deterministic (fixed random_state=47, stratified on y),
# so it is computed once here instead of being recomputed identically on
# every loop iteration as before.
x_train, x_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=47, stratify=y)

# Run the ADASYN oversampling strategy for every ratio that was tested;
# each percentage only labels the corresponding run's results.
# NOTE(review): ratio_list is assumed to be defined earlier in the file and
# to pair element-wise with percentage_list — confirm.
for ratio, percentage in zip(ratio_list, percentage_list):
    # Initialize an ADASYN sampler with the ratio under test.
    over = ADASYN(sampling_strategy=ratio)
    # Single-step pipeline (extra steps can be added here if required).
    pipeline = Pipeline([('o', over)])
    # Resample only the training data; the test set stays untouched.
    x_res, y_res = pipeline.fit_resample(x_train, y_train)
    print('resample finished')
    # Train an xg_boost model with the resampled data (the previous unused
    # `xgb =` binding was dropped; the return value was never read).
    xg_boost(x_res, y_res, x_test, y_test, f"ADASYN_{percentage}")


# The code below was used to calculate the running times.
# Since some running times were very long, we let the code time-out after 10 hours.
# It is less relevant for WWF, hence it is commented out.

#List of sub-sample sizes that were evaluated to calculate running times.
# subset_list = [30000, 50000, 75000, 100000, 200000, 300000, 400000, 500000, 600000, 700000, 800000, 900000, 1000000, 1500000, 2000000]
# times_subsetsize_list = []

# def calculate_running_times():
#     for i in subset_list:
#         start = time.time()
#         x_rest, x_sub, y_rest, y_sub = train_test_split(X, y, test_size=i/len(X), stratify=y, random_state=47)
#         print("ADASYN", i)
コード例 #2
0
    # Create a stratified train-test split so the target-class ratio is
    # maintained in both partitions.
    # NOTE(review): `ratio` and `percentage` presumably come from an enclosing
    # loop above this fragment — confirm against the full file.
    x_train, x_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.25,
                                                        random_state=47,
                                                        stratify=y)
    # Initialize a SMOTETomek sampler with the ratio that will be tested.
    over = imblearn.combine.SMOTETomek(sampling_strategy=ratio)
    # Single-step pipeline (one can add extra steps here if required).
    steps = [('o', over)]
    pipeline = Pipeline(steps)
    # Resample only the training data; the test set stays untouched.
    x_train_res, y_train_res = pipeline.fit_resample(x_train, y_train)
    print('resample finished')
    # Train an xg_boost model with the resampled data, labelled by percentage.
    xg_boost(x_train_res, y_train_res, x_test, y_test,
             f"smote_tomek{percentage}")

# The code below was used to calculate the running times.
# Since some running times were very long, we let the code time-out after 10 hours.
# It is less relevant for WWF, hence it is commented out.

#List of sub-sample sizes that were evaluated to calculate running times.
# subset_list = [30000, 50000, 75000, 100000, 200000, 300000, 400000, 500000, 600000, 700000, 800000, 900000, 1000000, 1500000, 2000000]
# times_subsetsize_list = []
#
# def calculate_running_times():
#     for i in subset_list:
#         start = time.time()
#         print("SMOTETomek", i)
#         x_rest, x_sub, y_rest, y_sub = train_test_split(X, y, test_size=i/len(X), stratify=y, random_state=47)
#         over = imblearn.combine.SMOTETomek(sampling_strategy=0.042)
コード例 #3
0
# Sampler selection: 1 = random undersampling, 2 = random oversampling,
# 3 = random under- and oversampling combined.
sampler_choice = 2
# sampling_strategy sets the target resampling ratio for the chosen sampler.
# NOTE(review): a value of 1 should yield fully balanced classes — confirm.
sampling_strategy = 1
if sampler_choice == 1:
    # Initialize a random UNDERsampler with the ratio that will be tested
    # (the previous comment said "over sampler"; the code builds an undersampler).
    under = imb.under_sampling.RandomUnderSampler(
        sampling_strategy=sampling_strategy)
    # Initialize a pipeline (one can add extra steps here if required).
    steps = [('u', under)]
    pipeline = imb.pipeline.Pipeline(steps)
    # Resample data in place.
    # NOTE(review): x_train/y_train/x_test/y_test are assumed to be defined
    # earlier in the file — confirm.
    x_train, y_train = pipeline.fit_resample(x_train, y_train)
    # Train an xg_boost model with the resampled data.
    xg_boost(x_train, y_train, x_test, y_test, 'Random Undersampling')
elif sampler_choice == 2:
    # Initialize a random OVERsampler with the ratio that will be tested
    # (the previous comment said "under sampler"; the code builds an oversampler).
    over = imb.over_sampling.RandomOverSampler(
        sampling_strategy=sampling_strategy)
    # Initialize a pipeline (one can add extra steps here if required).
    steps = [('o', over)]
    pipeline = imb.pipeline.Pipeline(steps)
    # Resample data in place.
    x_train, y_train = pipeline.fit_resample(x_train, y_train)
    # Train an xg_boost model with the resampled data.
    xg_boost(x_train, y_train, x_test, y_test, 'Random Oversampling')
elif sampler_choice == 3:
    # Combined strategy: first oversample the minority class to 10%, then
    # undersample the majority class (the statement continues past this fragment).
    over = imb.over_sampling.RandomOverSampler(sampling_strategy=0.10)
    under = imb.under_sampling.RandomUnderSampler(
コード例 #4
0
ファイル: SMOTE.py プロジェクト: Psychotechnopath/WWF_Project
    # Create a stratified train-test split so the target-class ratio is
    # maintained in both partitions.
    # NOTE(review): `ratio` and `percentage` presumably come from an enclosing
    # loop above this fragment — confirm against the full file.
    x_train, x_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.25,
                                                        random_state=47,
                                                        stratify=y)
    # Initialize a SMOTE sampler with the ratio that will be tested.
    over = SMOTE(sampling_strategy=ratio)
    # Single-step pipeline (one can add extra steps here if required).
    steps = [('o', over)]
    pipeline = Pipeline(steps)
    # Resample only the training data; the test set stays untouched.
    x_res, y_res = pipeline.fit_resample(x_train, y_train)
    print('resample finished')
    # Train an xg_boost model with the resampled data, labelled by percentage.
    xgb = xg_boost(x_res, y_res, x_test, y_test, f"SMOTE_{percentage}")

# The code below was used to calculate the running times.
# Since some running times were very long, we let the code time-out after 10 hours.
# It is less relevant for WWF, hence it is commented out.

#List of sub-sample sizes that were evaluated to calculate running times.
# subset_list = [30000, 50000, 75000, 100000, 200000, 300000, 400000, 500000, 600000, 700000, 800000, 900000, 1000000, 1500000, 2000000]
# times_subsetsize_list = []

# def calculate_running_times():
#     for i in subset_list:
#         start = time.time()
#         x_rest, x_sub, y_rest, y_sub = train_test_split(X, y, test_size=i/len(X), stratify=y, random_state=47)
#
#         print("SMOTE", i)
コード例 #5
0
# Split off a stratified 25% test set so the target-class ratio is preserved
# in both partitions.
x_train, x_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=47, stratify=y)

# Tomek-links undersampling restricted to the majority class.
under = imblearn.under_sampling.TomekLinks(sampling_strategy='majority')
# Single-step pipeline; further preprocessing stages could be inserted here.
pipeline = Pipeline([('o', under)])
# Clean only the training portion; the held-out test set stays untouched.
x_train_res, y_train_res = pipeline.fit_resample(x_train, y_train)
# Fit and evaluate an xg_boost model on the resampled training data.
xg_boost(x_train_res, y_train_res, x_test, y_test, f"tomek_links{len(X)}")

# The code below was used to calculate the running times.
# Since some running times were very long, we let the code time-out after 10 hours.
# It is less relevant for WWF, hence it is commented out.

#List of sub-sample sizes that were evaluated to calculate running times.
# subset_list = [30000, 50000, 75000, 100000, 200000, 300000, 400000, 500000, 600000, 700000, 800000, 900000, 1000000, 1500000, 2000000]
# times_subsetsize_list = []
#
# def calculate_running_times():
#     for i in subset_list:
#         start = time.time()
#         x_rest, x_sub, y_rest, y_sub = train_test_split(X, y, test_size=i/len(X), stratify=y, random_state=47)
#         # Third pipeline Tomek links
#         print("Tomek", i)