Example #1
0
    def model_selection(self):
        """Run ``sv.model_selection`` on a small dataset and check its outputs.

        Stacks ``data_min`` (labelled 1) and ``data_maj`` (labelled 0), both
        defined outside this method, into a single dataset, runs
        ``sv.model_selection`` with the samplers returned by
        ``sv.get_n_quickest_oversamplers(5)`` and two classifiers, and then
        checks that a sampler and a classifier were returned and that
        ``sv.read_oversampling_results`` yields a non-empty result for the
        same dataset and cache directory.
        """
        X = np.vstack([data_min, data_maj])
        y = np.hstack(
            [np.repeat(1, len(data_min)),
             np.repeat(0, len(data_maj))])

        # Cache directory under the user's home. ``exist_ok=True`` creates it
        # only when missing, without the race of a separate existence check.
        cache_path = os.path.join(os.path.expanduser('~'), 'smote_test')
        os.makedirs(cache_path, exist_ok=True)

        # prepare dataset
        dataset = {'data': X, 'target': y, 'name': 'ballpark_data'}

        # instantiating classifiers
        knn_classifier = KNeighborsClassifier()
        dt_classifier = DecisionTreeClassifier()

        # run the model selection; it returns a sampler and a classifier object
        oversamplers = sv.get_n_quickest_oversamplers(5)
        classifiers = [knn_classifier, dt_classifier]
        samp_obj, cl_obj = sv.model_selection(dataset=dataset,
                                              samplers=oversamplers,
                                              classifiers=classifiers,
                                              cache_path=cache_path,
                                              n_jobs=1)

        # Separate assertions so a failure reports which object is missing.
        self.assertIsNotNone(samp_obj)
        self.assertIsNotNone(cl_obj)

        results = sv.read_oversampling_results(datasets=[dataset],
                                               cache_path=cache_path)

        self.assertGreater(len(results), 0)
# import datasets
from sklearn import datasets

# cache directory: <home>/workspaces/smote_test
cache_path = os.path.join(os.path.expanduser('~'), 'workspaces', 'smote_test')

# prepare dataset: load the scikit-learn breast cancer data and repack it
# into a plain dict with 'data', 'target' and 'name' keys
dataset = datasets.load_breast_cancer()
dataset = {
    'data': dataset['data'],
    'target': dataset['target'],
    'name': 'breast_cancer'
}

# instantiating classifiers
knn_classifier = KNeighborsClassifier()
dt_classifier = DecisionTreeClassifier()

# run the model selection over the given samplers and classifiers; the call
# returns a sampler object and a classifier object
samp_obj, cl_obj = sv.model_selection(
    datasets=[dataset],
    samplers=sv.get_n_quickest_oversamplers(5),
    classifiers=[knn_classifier, dt_classifier],
    cache_path=cache_path,
    n_jobs=5,
    max_n_sampler_parameters=35)

# oversample the dataset with the returned sampler, then fit the returned
# classifier on the oversampled data
X_samp, y_samp = samp_obj.sample(dataset['data'], dataset['target'])
cl_obj.fit(X_samp, y_samp)
Example #3
0
def test_model_selection():
    """Run ``sv.model_selection`` on a small 2-D dataset and check its outputs.

    Builds a dataset from 20 samples labelled 1 (``data_min``) and 60 samples
    labelled 0 (``data_maj``), runs ``sv.model_selection`` with the samplers
    returned by ``sv.get_n_quickest_oversamplers(5)`` and two classifiers, and
    then asserts that a sampler and a classifier were returned and that
    ``sv.read_oversampling_results`` yields a non-empty result for the same
    dataset and cache directory.
    """
    data_min = np.array([[5.7996138, -0.25574582], [3.0637093, 2.11750874],
                         [4.91444087, -0.72380123], [1.06414164, 0.08694243],
                         [2.59071708, 0.75283568], [3.44834937, 1.46118085],
                         [2.8036378, 0.69553702], [3.57901791, 0.71870743],
                         [3.81529064, 0.62580927], [3.05005506, 0.33290343],
                         [1.83674689, 1.06998465], [2.08574889, -0.32686821],
                         [3.49417022, -0.92155623], [2.33920982, -1.59057568],
                         [1.95332431, -0.84533309], [3.35453368, -1.10178101],
                         [4.20791149, -1.41874985], [2.25371221, -1.45181929],
                         [2.87401694, -0.74746037], [1.84435381, 0.15715329]])

    data_maj = np.array([[-1.40972752, 0.07111486], [-1.1873495, -0.20838002],
                         [0.51978825, 2.1631319], [-0.61995016, -0.45111475],
                         [2.6093289, -0.40993063], [-0.06624482, -0.45882838],
                         [-0.28836659, -0.59493865], [0.345051, 0.05188811],
                         [1.75694985, 0.16685025], [0.52901288, -0.62341735],
                         [0.09694047, -0.15811278], [-0.37490451, -0.46290818],
                         [-0.32855088,
                          -0.20893795], [-0.98508364, -0.32003935],
                         [0.07579831, 1.36455355], [-1.44496689, -0.44792395],
                         [1.17083343, -0.15804265], [1.73361443, -0.06018163],
                         [-0.05139342, 0.44876765], [0.33731075, -0.06547923],
                         [-0.02803696, 0.5802353], [0.20885408, 0.39232885],
                         [0.22819482, 2.47835768], [1.48216063, 0.81341279],
                         [-0.6240829, -0.90154291], [0.54349668, 1.4313319],
                         [-0.65925018, 0.78058634], [-1.65006105, -0.88327625],
                         [-1.49996313, -0.99378106], [0.31628974, -0.41951526],
                         [0.64402186, 1.10456105], [-0.17725369, -0.67939216],
                         [0.12000555, -1.18672234], [2.09793313, 1.82636262],
                         [-0.11711376, 0.49655609], [1.40513236, 0.74970305],
                         [2.40025472, -0.5971392], [-1.04860983, 2.05691699],
                         [0.74057019, -1.48622202], [1.32230881, -2.36226588],
                         [-1.00093975,
                          -0.44426212], [-2.25927766, -0.55860504],
                         [-1.12592836, -0.13399132], [0.14500925, -0.89070934],
                         [0.90572513, 1.23923502], [-1.25416346, -1.49100593],
                         [0.51229813, 1.54563048], [-1.36854287, 0.0151081],
                         [0.08169257, -0.69722099], [-0.73737846, 0.42595479],
                         [0.02465411, -0.36742946], [-1.14532211, -1.23217124],
                         [0.98038343, 0.59259824], [-0.20721222, 0.68062552],
                         [-2.21596433, -1.96045872], [-1.20519292, -1.8900018],
                         [0.47189299, -0.4737293], [1.18196143, 0.85320018],
                         [0.03255894, -0.77687178], [0.32485141, -0.34609381]])

    # Feature matrix and label vector: label 1 for data_min, 0 for data_maj.
    X = np.vstack([data_min, data_maj])
    y = np.hstack([np.repeat(1, len(data_min)), np.repeat(0, len(data_maj))])

    # Cache directory under the user's home. ``exist_ok=True`` creates it only
    # when missing, without the race of a separate existence check.
    cache_path = os.path.join(os.path.expanduser('~'), 'smote_test')
    os.makedirs(cache_path, exist_ok=True)

    # prepare dataset
    dataset = {'data': X, 'target': y, 'name': 'ballpark_data'}

    # instantiating classifiers
    knn_classifier = KNeighborsClassifier()
    dt_classifier = DecisionTreeClassifier()

    # run the model selection; it returns a sampler and a classifier object
    samp_obj, cl_obj = sv.model_selection(
        dataset=dataset,
        samplers=sv.get_n_quickest_oversamplers(5),
        classifiers=[knn_classifier, dt_classifier],
        cache_path=cache_path,
        n_jobs=1)

    # Separate assertions so a failure reports which object is missing.
    assert samp_obj is not None
    assert cl_obj is not None

    results = sv.read_oversampling_results(datasets=[dataset],
                                           cache_path=cache_path)

    assert len(results) > 0
# Executing the model selection using 5 parallel jobs (n_jobs=5) and at most
# 25 sampler parameter combinations (max_samp_par_comb=25) with the
# oversamplers listed below, using a fixed random_state.

samplers = [
    sv.polynom_fit_SMOTE, sv.ProWSyn, sv.SMOTE_IPF, sv.Lee, sv.SMOBD,
    sv.G_SMOTE, sv.CCR, sv.LVQ_SMOTE, sv.Assembled_SMOTE, sv.SMOTE_TomekLinks,
    sv.SMOTE, sv.Random_SMOTE, sv.CE_SMOTE, sv.SMOTE_Cosine, sv.Selected_SMOTE,
    sv.Supervised_SMOTE, sv.CBSO, sv.cluster_SMOTE, sv.NEATER, sv.ADASYN,
    sv.NoSMOTE
]

samp_obj, cl_obj = sv.model_selection(dataset=dataset,
                                      samplers=samplers,
                                      classifiers=all_classifiers,
                                      cache_path=cache_path,
                                      n_jobs=5,
                                      max_samp_par_comb=25,
                                      random_state=5)

# In[6]:

# Reading the results back from the cache directory with all_results=False
# and saving them to 'aggregated_results.csv'.

results = sv.read_oversampling_results([dataset],
                                       cache_path,
                                       all_results=False)

results.to_csv('aggregated_results.csv')

# Reading the results again with all_results=True; this rebinds `results`.
results = sv.read_oversampling_results([dataset], cache_path, all_results=True)
Example #5
0
    'target': dataset['target'],
    'name': 'breast_cancer'
}

# In[4]:

# Specifying the classifiers passed to the model selection.

knn_classifier = KNeighborsClassifier()
dt_classifier = DecisionTreeClassifier()

# In[5]:

# Executing the model selection using 5 parallel jobs (n_jobs=5) and at most
# 35 sampler parameter combinations (max_samp_par_comb=35) with the
# oversamplers returned by sv.get_n_quickest_oversamplers(5).

samp_obj, cl_obj = sv.model_selection(
    dataset=dataset,
    samplers=sv.get_n_quickest_oversamplers(5),
    classifiers=[knn_classifier, dt_classifier],
    cache_path=cache_path,
    n_jobs=5,
    max_samp_par_comb=35)

# In[6]:

# Oversampling the dataset with the sampler returned by the model selection
# and fitting the returned classifier on the oversampled data.

X_samp, y_samp = samp_obj.sample(dataset['data'], dataset['target'])
cl_obj.fit(X_samp, y_samp)
Example #6
0
                                             sv.OUPS,
                                             sv.NoSMOTE], 
                                  classifiers= [KNeighborsClassifier()],
                                  validator= RepeatedStratifiedKFold(n_repeats= 3,
                                                                     n_splits= 5),
                                  cache_path= cache_path,
                                  max_samp_par_comb= 3,
                                  all_results= True,
                                  n_jobs= 6)

# Print only the 'sampler', 'sampler_parameters' and 'auc' entries of the
# results (presumably a pandas DataFrame - confirm against the call above).
print(results[['sampler', 'sampler_parameters', 'auc']])

#%% oversampler selection

from sklearn.tree import DecisionTreeClassifier

# Seed NumPy's global random generator before running the selection.
np.random.seed(random_seed)

# Run the model selection on the ecoli dataset over the samplers returned by
# sv.get_all_oversamplers() and two classifiers, scored by 'auc'; the call
# returns a sampler object and a classifier object.
samp, clas = sv.model_selection(
    dataset=ecoli,
    samplers=sv.get_all_oversamplers(),
    classifiers=[KNeighborsClassifier(), DecisionTreeClassifier()],
    validator=RepeatedStratifiedKFold(n_repeats=3, n_splits=5),
    score='auc',
    cache_path=cache_path,
    max_samp_par_comb=3,
    n_jobs=6,
)

print(samp)
print(clas)