def fit(self, X, y=None):
    """Under-sample the training data with CNN, then fit with sample weights.

    ``X`` is densified through ``X.toarray()`` before resampling, so it must
    provide that method (presumably a SciPy sparse matrix -- confirm against
    the callers).  After resampling, samples labelled ``1`` are weighted by
    ``1 / mean(y)`` and all other samples get weight 1.

    Returns whatever the parent ``fit`` (resolved through
    ``super(RandomForestClassifier, self)``) returns.
    """
    # Experiment log kept from the original author.  Each configuration is
    # followed by the scores that were written beneath it; the pairing is
    # inferred from the comment order only -- confirm before relying on it.
    #
    #   'Random under-sampling' / CondensedNearestNeighbour(size_ngh=51, n_seeds_S=51)
    #       Accuracy: 0.939693267481  Precision: 0.238095238095  Recall: 0.897435897436
    #   (configuration not recorded)
    #       Accuracy: 0.962568234988  Precision: 0.324468085106  Recall: 0.782051282051
    #   SMOTE(ratio=ratio, kind='borderline1')
    #       Accuracy: 0.971146347803  Precision: 0.372093023256  Recall: 0.615384615385
    #   SMOTE(ratio=ratio, kind='borderline2')
    #       Accuracy: 0.965427605927  Precision: 0.333333333333  Recall: 0.705128205128
    #   SMOTE(ratio=ratio, kind='svm', class_weight='auto')
    #       Accuracy: 0.972186119054  Precision: 0.395683453237  Recall: 0.705128205128
    #
    # Alternative that was also tried: SMOTE(ratio='auto', kind='regular')
    sampler = CondensedNearestNeighbour(size_ngh=51, n_seeds_S=51)
    X, y = sampler.fit_sample(X.toarray(), y)

    # Weight class 1 by the inverse of the label mean; everything else by 1.
    weights = np.array([1 / y.mean() if label == 1 else 1 for label in y])
    return super(RandomForestClassifier, self).fit(X, y, sample_weight=weights)
Exemple #2
0
def test_cnn_fit_sample_with_object():
    """Check ``fit_sample`` when ``n_neighbors`` is a k-NN object or an int.

    Both ways of asking for a single neighbour are expected to produce the
    same resampled data.
    """
    expected_X = np.array([
        [-0.10903849, -0.12085181], [0.01936241, 0.17799828],
        [0.05230552, 0.09043907], [-1.25020462, -0.40402054],
        [0.70524765, 0.39816382], [0.35831463, 1.33483198],
        [-0.284881, -0.62730973], [0.03394306, 0.03986753],
        [-0.01252787, 0.34102657], [0.15198585, 0.12512646],
    ])
    expected_y = np.array([0, 0, 1, 1, 1, 2, 2, 2, 2, 2])

    # First an estimator instance, then the equivalent plain integer.
    for neighbours in (KNeighborsClassifier(n_neighbors=1), 1):
        sampler = CondensedNearestNeighbour(random_state=RND_SEED,
                                            n_neighbors=neighbours)
        X_res, y_res = sampler.fit_sample(X, Y)
        assert_array_equal(X_res, expected_X)
        assert_array_equal(y_res, expected_y)
Exemple #3
0
def random_instance_selection(dfZ, x, blackbox, dataset):
    """Build a random neighbourhood of ``x`` and condense it with CNN.

    The neighbourhood comes from ``random_neighborhood``; it is labelled with
    ``blackbox.predict`` and under-sampled with ``CondensedNearestNeighbour``.

    Returns a pair ``(dataframe, Z)`` where ``Z`` is the condensed instance
    set and ``dataframe`` is ``build_df2explain(blackbox, Z, dataset)``.
    """
    # Only the second item returned by random_neighborhood is needed here.
    _, neighbourhood = random_neighborhood(dfZ, x, blackbox, dataset)
    labels = blackbox.predict(neighbourhood)

    condenser = CondensedNearestNeighbour(return_indices=True)
    condensed, _, _ = condenser.fit_sample(neighbourhood, labels)

    return build_df2explain(blackbox, condensed, dataset), condensed
Exemple #4
0
def test_cnn_sample_wrong_X():
    """Sampling data that differs from the fitted data must raise.

    The sampler is fitted on ``X``/``Y`` and then asked to sample a random
    100x40 matrix; a ``RuntimeError`` is expected.
    """
    sampler = CondensedNearestNeighbour(random_state=RND_SEED)
    sampler.fit(X, Y)

    other_X = np.random.random((100, 40))
    other_y = np.array([0] * 50 + [1] * 50)
    assert_raises(RuntimeError, sampler.sample, other_X, other_y)
def test_cnn_sample_wrong_X():
    """Expect ``RuntimeError`` when ``sample`` gets data other than the fitted X."""
    fitted = CondensedNearestNeighbour(random_state=RND_SEED)
    fitted.fit(X, Y)

    # A 100x40 random matrix with a 50/50 label vector stands in for "wrong" data.
    wrong_X = np.random.random((100, 40))
    wrong_y = np.array([0] * 50 + [1] * 50)
    assert_raises(RuntimeError, fitted.sample, wrong_X, wrong_y)
def cnn_test(data_set: pd.DataFrame, metric: str, k: int, weights='uniform'):
    """Condense a training split with CNN, fit k-NN on it and report accuracy.

    The first two columns of ``data_set`` are used as features and the
    remaining columns as the target.  30% of the rows are held out
    (``random_state=10``).  The accuracy on the held-out rows is printed and
    the decision boundaries are plotted over the un-condensed training split.

    Parameters
    ----------
    data_set : pd.DataFrame
        Source data; columns 0-1 are features, columns 2+ the target.
    metric : str
        Distance metric forwarded to ``KNeighborsClassifier``.
    k : int
        Neighbour count used by both the CNN sampler and the classifier.
    weights : optional
        Forwarded to ``KNeighborsClassifier`` (default ``'uniform'``).
    """
    features = np.array(data_set.iloc[:, 0:2])
    target = np.array(data_set.iloc[:, 2:])
    X_train, X_test, y_train, y_test = train_test_split(
        features, target, test_size=0.3, random_state=10)

    condenser = CondensedNearestNeighbour(n_neighbors=k, sampling_strategy="all")
    X_condensed, y_condensed = condenser.fit_resample(X_train, y_train)

    classifier = neighbors.KNeighborsClassifier(k, metric=metric, weights=weights)
    classifier.fit(X_condensed, y_condensed.ravel())

    print(accuracy_score(classifier.predict(X_test), y_test))
    plot_decisions_boundaries(X_train, y_train, clf=classifier)
def test_cnn_fit_sample():
    """Compare ``fit_sample`` output on ``X``/``Y`` with inline ground truth."""
    sampler = CondensedNearestNeighbour(random_state=RND_SEED)
    X_res, y_res = sampler.fit_sample(X, Y)

    expected_X = np.array([
        [-0.10903849, -0.12085181],
        [0.01936241, 0.17799828],
        [0.05230552, 0.09043907],
        [-1.25020462, -0.40402054],
        [0.70524765, 0.39816382],
        [0.35831463, 1.33483198],
        [-0.284881, -0.62730973],
        [0.03394306, 0.03986753],
        [-0.01252787, 0.34102657],
        [0.15198585, 0.12512646],
    ])
    expected_y = np.array([0, 0, 1, 1, 1, 2, 2, 2, 2, 2])

    assert_array_equal(X_res, expected_X)
    assert_array_equal(y_res, expected_y)
def test_cnn_fit_sample():
    """Compare ``fit_sample`` output with the arrays stored next to this file.

    The expected arrays are read from ``data/cnn_x.npy`` and
    ``data/cnn_y.npy`` relative to this module's directory.
    """
    sampler = CondensedNearestNeighbour(random_state=RND_SEED)
    X_res, y_res = sampler.fit_sample(X, Y)

    data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'data')
    # Load both references before asserting, as the original test did.
    expected_X = np.load(os.path.join(data_dir, 'cnn_x.npy'))
    expected_y = np.load(os.path.join(data_dir, 'cnn_y.npy'))

    assert_array_equal(X_res, expected_X)
    assert_array_equal(y_res, expected_y)
Exemple #9
0
def test_cnn_fit_sample():
    """Resample ``X``/``Y`` with a seeded CNN and check the exact result."""
    reference_rows = [
        [-0.10903849, -0.12085181],
        [0.01936241, 0.17799828],
        [0.05230552, 0.09043907],
        [-1.25020462, -0.40402054],
        [0.70524765, 0.39816382],
        [0.35831463, 1.33483198],
        [-0.284881, -0.62730973],
        [0.03394306, 0.03986753],
        [-0.01252787, 0.34102657],
        [0.15198585, 0.12512646],
    ]
    reference_labels = [0, 0, 1, 1, 1, 2, 2, 2, 2, 2]

    condenser = CondensedNearestNeighbour(random_state=RND_SEED)
    X_out, y_out = condenser.fit_sample(X, Y)

    assert_array_equal(X_out, np.array(reference_rows))
    assert_array_equal(y_out, np.array(reference_labels))
def test_cnn_fit_sample():
    """Check ``fit_sample`` against the ``cnn_x.npy`` / ``cnn_y.npy`` fixtures."""
    condenser = CondensedNearestNeighbour(random_state=RND_SEED)
    X_out, y_out = condenser.fit_sample(X, Y)

    # Fixtures live in the ``data`` folder beside this test module.
    here = os.path.dirname(os.path.abspath(__file__))
    fixture_X = np.load(os.path.join(here, 'data', 'cnn_x.npy'))
    fixture_y = np.load(os.path.join(here, 'data', 'cnn_y.npy'))

    assert_array_equal(X_out, fixture_X)
    assert_array_equal(y_out, fixture_y)
Exemple #11
0
def condensed_nearest_neighbour(X,
                                y,
                                visualize=False,
                                pca2d=True,
                                pca3d=True,
                                tsne=True,
                                pie_evr=True):
    """Under-sample ``X``/``y`` with ``CondensedNearestNeighbour(random_state=42)``.

    Parameters
    ----------
    X, y
        Data and labels passed to ``fit_resample``.
    visualize : bool
        When truthy, plot the resampled class histogram and the PCA views.
    pca2d, pca3d, pie_evr : bool
        Forwarded to ``pca_general`` as ``d2``, ``d3`` and ``pie_evr``.
    tsne : bool
        Accepted for signature compatibility; not used by this function.

    Returns
    -------
    tuple
        ``(X_res, y_res)`` as returned by ``fit_resample``.
    """
    cnn = CondensedNearestNeighbour(random_state=42)
    X_res, y_res = cnn.fit_resample(X, y)

    # Plain truthiness test instead of comparing against the True singleton.
    if visualize:
        hist_over_and_undersampling(y_res)
        pca_general(X_res, y_res, d2=pca2d, d3=pca3d, pie_evr=pie_evr)
    return X_res, y_res
def test_cnn_fit():
    """Fit the sampler on ``X``/``Y`` and check the class statistics it stores."""
    sampler = CondensedNearestNeighbour(random_state=RND_SEED)
    sampler.fit(X, Y)

    # Minority / majority labels recorded by ``fit``.
    assert_equal(sampler.min_c_, 0)
    assert_equal(sampler.maj_c_, 1)

    # Per-class sample counts recorded by ``fit``.
    for label, count in ((0, 500), (1, 4500)):
        assert_equal(sampler.stats_c_[label], count)
def test_cnn_fit():
    """``fit`` must populate ``min_c_``, ``maj_c_`` and ``stats_c_``."""
    fitted = CondensedNearestNeighbour(random_state=RND_SEED)
    fitted.fit(X, Y)

    expected_min_label, expected_maj_label = 0, 1
    assert_equal(fitted.min_c_, expected_min_label)
    assert_equal(fitted.maj_c_, expected_maj_label)
    assert_equal(fitted.stats_c_[expected_min_label], 500)
    assert_equal(fitted.stats_c_[expected_maj_label], 4500)
def test_cnn_fit_sample_with_indices():
    """Check ``fit_sample`` with ``return_indices=True`` against stored fixtures.

    Expected data, labels and selected indices are read from ``cnn_x.npy``,
    ``cnn_y.npy`` and ``cnn_idx.npy`` in the ``data`` folder beside this file.
    """
    sampler = CondensedNearestNeighbour(return_indices=True, random_state=RND_SEED)
    X_res, y_res, selected = sampler.fit_sample(X, Y)

    data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'data')
    # All fixtures are loaded before the first assertion, as in the original.
    expected_X = np.load(os.path.join(data_dir, 'cnn_x.npy'))
    expected_y = np.load(os.path.join(data_dir, 'cnn_y.npy'))
    expected_idx = np.load(os.path.join(data_dir, 'cnn_idx.npy'))

    assert_array_equal(X_res, expected_X)
    assert_array_equal(y_res, expected_y)
    assert_array_equal(selected, expected_idx)
def test_cnn_fit_sample_with_indices():
    """Resample with index reporting and compare with the ``.npy`` fixtures."""
    condenser = CondensedNearestNeighbour(return_indices=True, random_state=RND_SEED)
    X_out, y_out, kept_idx = condenser.fit_sample(X, Y)

    here = os.path.dirname(os.path.abspath(__file__))
    fixture_X = np.load(os.path.join(here, 'data', 'cnn_x.npy'))
    fixture_y = np.load(os.path.join(here, 'data', 'cnn_y.npy'))
    fixture_idx = np.load(os.path.join(here, 'data', 'cnn_idx.npy'))

    # Compare in the same order as the fixtures were loaded.
    assert_array_equal(X_out, fixture_X)
    assert_array_equal(y_out, fixture_y)
    assert_array_equal(kept_idx, fixture_idx)
Exemple #16
0
def test_cnn_fit_sample_with_indices():
    """Check data, labels and selected indices returned by ``fit_sample``."""
    sampler = CondensedNearestNeighbour(return_indices=True, random_state=RND_SEED)
    X_res, y_res, selected = sampler.fit_sample(X, Y)

    expected_X = np.array([
        [-0.10903849, -0.12085181],
        [0.01936241, 0.17799828],
        [0.05230552, 0.09043907],
        [-1.25020462, -0.40402054],
        [0.70524765, 0.39816382],
        [0.35831463, 1.33483198],
        [-0.284881, -0.62730973],
        [0.03394306, 0.03986753],
        [-0.01252787, 0.34102657],
        [0.15198585, 0.12512646],
    ])
    expected_y = np.array([0, 0, 1, 1, 1, 2, 2, 2, 2, 2])
    expected_idx = np.array([4, 11, 17, 12, 19, 9, 5, 7, 14, 18])

    assert_array_equal(X_res, expected_X)
    assert_array_equal(y_res, expected_y)
    assert_array_equal(selected, expected_idx)
Exemple #17
0
 def getsampler(self, type):
     """Create the sampler registered under the name ``type``.

     Parameters
     ----------
     type : str
         One of 'none', 'randomunder', 'nearmiss', 'allknn', 'condensednn',
         'editednn', 'repeatededitednn', 'tomeklinks', 'randomover', 'smote',
         'adasyn', 'smotenc' or 'quality'.

     Returns
     -------
     The freshly constructed sampler.  For every type other than 'none' and
     'quality', ``random_state`` is set to ``self.random_state`` when the
     sampler exposes such a parameter.

     Raises
     ------
     SystemExit
         With exit code 1, after printing a message, when ``type`` is not
         supported.
     """
     if type == 'none':
         sampler = NoSampler()
     elif type == 'randomunder':
         sampler = RandomUnderSampler()
     elif type == 'nearmiss':
         sampler = NearMiss()
     elif type == 'allknn':
         sampler = AllKNN()
     elif type == 'condensednn':
         sampler = CondensedNearestNeighbour()
     elif type == 'editednn':
         sampler = EditedNearestNeighbours()
     elif type == 'repeatededitednn':
         sampler = RepeatedEditedNearestNeighbours()
     elif type == 'tomeklinks':
         sampler = TomekLinks()
     elif type == 'randomover':
         sampler = RandomOverSampler()
     elif type == 'smote':
         sampler = SMOTE()
     elif type == 'adasyn':
         sampler = ADASYN()
     elif type == 'smotenc':
         sampler = SMOTENC()
     elif type == 'quality':  # and self.quality_model_selection_type == 'extended':
         sampler = QualitySampler(self.n_init)
     else:
         print("Unsupported sampler %s" % type)
         # ``exit`` is an interactive helper injected by the ``site`` module and
         # may be missing (e.g. ``python -S``); raise the exception it wraps.
         raise SystemExit(1)

     # Seed the sampler when it supports seeding ('none' and 'quality' are
     # deliberately left untouched).
     if type not in ('none', 'quality') and 'random_state' in sampler.get_params():
         sampler.set_params(random_state=self.random_state)
     return sampler
def test_cnn_sample_wt_fit():
    """Calling ``sample`` on an unfitted sampler must raise ``RuntimeError``."""
    unfitted = CondensedNearestNeighbour(random_state=RND_SEED)
    assert_raises(RuntimeError, unfitted.sample, X, Y)
class ResamplingAlgorithms(Enum):
    """Catalogue of resampling algorithms.

    Every member's value is a ``(display name, sampler instance)`` pair.  The
    sampler instances are created once, when the class body is executed, and
    are therefore shared by all users of the enum.
    """

    RO = ("Random Over-sampling", RandomOverSampler(random_state=1))
    SMOTE = ("Smote", SMOTE(random_state=1))
    ADASYN = ("ADASYN", ADASYN(random_state=1))
    SMOTE_TL = ('SMOTE+TL', SMOTETomek(random_state=1))
    SMOTE_ENN = ('SMOTE+ENN', SMOTEENN(random_state=1))
    SMOTE_BOOST = ("SMOTEBoost", smote_boost.SMOTEBoost())
    RU = ("Random Under-sampling", RandomUnderSampler(random_state=1))
    CLUSTERCENTROIDS = ("ClusterCentroids", ClusterCentroids(random_state=1))
    TOMEK_LINKS = ("TomekLinks", TomekLinks())
    NM1 = ("NM1", NearMiss(version=1))
    NM2 = ("NM2", NearMiss(version=2))
    NM3 = ("NM3", NearMiss(version=3))
    CNN = ("CNN", CondensedNearestNeighbour(random_state=1))
    OSS = ("OneSidedSelection", OneSidedSelection(random_state=1))
    ENN = ('ENN', EditedNearestNeighbours())
    NCL = ('NCL', NeighbourhoodCleaningRule())
    IHT = ('IHT', (InstanceHardnessThreshold(random_state=1)))
    RENN = ('RENN', RepeatedEditedNearestNeighbours())
    AllKNN = ('AllKNN', AllKNN())

    @classmethod
    def get_algorithm_by_name(cls, name):
        """Return the member whose display name equals ``name``.

        Members are scanned in definition order; ``RO`` is returned when no
        display name matches.
        """
        for algorithm in ResamplingAlgorithms:
            if algorithm.value[0] == name:
                return algorithm
        return ResamplingAlgorithms.RO
def test_cnn_fit_sample_with_wrong_object():
    """A string passed as ``n_neighbors`` must make ``fit_sample`` raise ``ValueError``."""
    sampler = CondensedNearestNeighbour(random_state=RND_SEED, n_neighbors='rnd')
    assert_raises(ValueError, sampler.fit_sample, X, Y)
Exemple #21
0
def Balance_classes(X_train, y_train, Sampling_Function):
    """Resample a training set with the sampler named by ``Sampling_Function``.

    Parameters
    ----------
    X_train, y_train
        Training data and labels passed to the sampler's ``fit_sample``.
    Sampling_Function : str
        Name of the sampler to use; see the branches below for the accepted
        names and the fixed parameters each sampler is built with.

    Returns
    -------
    tuple
        ``(X_train_res, y_train_res)`` as returned by ``fit_sample``.

    Raises
    ------
    ValueError
        If ``Sampling_Function`` is not one of the supported names.
    """
    if Sampling_Function == 'RandomUnderSampler':
        us = RandomUnderSampler(ratio=0.5, random_state=1)
    elif Sampling_Function == 'NearMiss1':
        us = NearMiss(ratio=0.5, random_state=1, version=1, size_ngh=3)
    elif Sampling_Function == 'NearMiss2':
        us = NearMiss(ratio=0.5, random_state=1, version=2, size_ngh=3)
    elif Sampling_Function == 'NearMiss3':
        us = NearMiss(ratio=0.5, random_state=1, version=3, ver3_samp_ngh=3)
    elif Sampling_Function == 'CondensedNearestNeighbour':
        us = CondensedNearestNeighbour(random_state=1)
    elif Sampling_Function == 'EditedNearestNeighbours':
        us = EditedNearestNeighbours(random_state=1, size_ngh=5)
    elif Sampling_Function == 'RepeatedEditedNearestNeighbours':
        # NOTE(review): this branch builds the plain EditedNearestNeighbours
        # sampler, same as the branch above -- looks like a copy/paste slip;
        # confirm whether RepeatedEditedNearestNeighbours was intended.
        us = EditedNearestNeighbours(random_state=1, size_ngh=5)
    elif Sampling_Function == 'TomekLinks':
        us = TomekLinks(random_state=1)
    elif Sampling_Function == 'RandomOverSampler':
        us = RandomOverSampler(ratio=0.5, random_state=1)
    elif Sampling_Function == 'SMOTE':
        us = SMOTE(ratio=0.5, k=5, random_state=1)
    elif Sampling_Function == 'SMOTETomek':
        us = SMOTETomek(ratio=0.5, k=5, random_state=1)
    elif Sampling_Function == 'SMOTEENN':
        us = SMOTEENN(ratio=0.5, k=5, random_state=1, size_ngh=5)
    elif Sampling_Function == 'EasyEnsemble':
        us = EasyEnsemble()
    elif Sampling_Function == 'BalanceCascade_rf':
        us = BalanceCascade(classifier='random-forest', random_state=1)
    elif Sampling_Function == 'BalanceCascade_svm':
        us = BalanceCascade(classifier='linear-svm', random_state=1)
    else:
        # Fail with a clear message instead of an UnboundLocalError on ``us``.
        raise ValueError(
            'Unknown Sampling_Function: {!r}'.format(Sampling_Function))

    X_train_res, y_train_res = us.fit_sample(X_train, y_train)

    return X_train_res, y_train_res
Exemple #22
0
def under_sample(X, y, sampler="RandomUnderSampler"):
    """Under-sample ``X``/``y`` with the sampler selected by name.

    The list of available sampler names and the class counts before and
    after resampling are printed.  Every sampler is created with its default
    parameters.

    Parameters
    ----------
    X, y
        Data and labels passed to ``fit_resample``.
    sampler : str
        Key of the sampler to use (default ``"RandomUnderSampler"``); an
        unknown key raises ``KeyError``.

    Returns
    -------
    tuple
        ``(X_resampled, y_resampled)``.
    """
    # Names of every available sampler, handy for iterating over all of them.
    sampler_names = [
        'RandomUnderSampler', 'ClusterCentroids', 'NearMiss',
        'InstanceHardnessThreshold', 'CondensedNearestNeighbour',
        'EditedNearestNeighbours', 'RepeatedEditedNearestNeighbours',
        'AllKNN', 'NeighbourhoodCleaningRule', 'OneSidedSelection',
    ]
    print(sampler_names)

    # Default-configured instances, listed in the same order as the names.
    sampler_instances = (
        RandomUnderSampler(),
        ClusterCentroids(),
        NearMiss(),
        InstanceHardnessThreshold(),
        CondensedNearestNeighbour(),
        EditedNearestNeighbours(),
        RepeatedEditedNearestNeighbours(),
        AllKNN(),
        NeighbourhoodCleaningRule(),
        OneSidedSelection(),
    )
    registry = dict(zip(sampler_names, sampler_instances))
    chosen = registry[sampler]

    # Class counts before resampling.
    print("before", sorted(Counter(y).items()))

    X_resampled, y_resampled = chosen.fit_resample(X, y)

    # Class counts after resampling.
    print("after", sorted(Counter(y_resampled).items()))
    print('===' * 4, 'under_sample finished')

    return X_resampled, y_resampled
def test_continuous_error():
    """Fitting on a continuous target must emit a ``UserWarning``."""
    continuous_target = np.linspace(0, 1, 20)
    sampler = CondensedNearestNeighbour(random_state=RND_SEED)
    assert_warns(UserWarning, sampler.fit, X, continuous_target)
def test_cnn_init():
    """Check the attribute values of a freshly constructed sampler."""
    sampler = CondensedNearestNeighbour(random_state=RND_SEED)

    for attribute, expected in (('n_seeds_S', 1), ('n_jobs', 1)):
        assert_equal(getattr(sampler, attribute), expected)
def Resampling(train_x, train_y, resampling_method):
    """Resample a training set in place with the named algorithm.

    ``train_y.data`` is first label-encoded with ``LabelEncoder``; then both
    ``train_x.data`` and ``train_y.data`` are replaced by the output of the
    selected sampler's ``fit_resample``.  Nothing is returned.

    Parameters
    ----------
    train_x, train_y
        Objects exposing a writable ``data`` attribute holding the features
        and the labels respectively.
    resampling_method : str
        Under-sampling: "ClusterCentroids", "CondensedNearestNeighbour",
        "EditedNearestNeighbours", "RepeatedEditedNearestNeighbours",
        "AllKNN", "NearMiss", "NeighbourhoodCleaningRule",
        "RandomUnderSampler", "TomekLinks".
        Over-sampling: "BorderlineSMOTE", "KMeansSMOTE",
        "RandomOverSampler", "SMOTE".

    Raises
    ------
    ValueError
        If ``resampling_method`` is not one of the names above.  The check
        happens before ``train_x`` / ``train_y`` are touched.
    """
    # ---- UNDER-SAMPLING ------ #
    if resampling_method == "ClusterCentroids":
        resample = ClusterCentroids(voting='hard', random_state=42)

    elif resampling_method == "CondensedNearestNeighbour":
        resample = CondensedNearestNeighbour(n_neighbors=7, random_state=42)

    elif resampling_method == "EditedNearestNeighbours":
        resample = EditedNearestNeighbours(n_neighbors=7,
                                           kind_sel='mode',
                                           n_jobs=-1)

    elif resampling_method == "RepeatedEditedNearestNeighbours":
        resample = RepeatedEditedNearestNeighbours(n_neighbors=7,
                                                   kind_sel='mode',
                                                   n_jobs=-1)

    elif resampling_method == "AllKNN":
        resample = AllKNN(n_neighbors=7,
                          kind_sel='mode',
                          allow_minority=True,
                          n_jobs=-1)

    elif resampling_method == "NearMiss":
        resample = NearMiss(n_neighbors=7, n_jobs=-1)

    elif resampling_method == "NeighbourhoodCleaningRule":
        resample = NeighbourhoodCleaningRule(n_neighbors=7, kind_sel='all')

    elif resampling_method == "RandomUnderSampler":
        resample = RandomUnderSampler(random_state=42)

    elif resampling_method == "TomekLinks":
        resample = TomekLinks(n_jobs=-1)

    # ---- OVER-SAMPLING ------ #
    elif resampling_method == "BorderlineSMOTE":
        resample = BorderlineSMOTE(random_state=42, n_jobs=-1)

    elif resampling_method == "KMeansSMOTE":
        resample = KMeansSMOTE(random_state=42)

    # This branch used to test "RandomUnderSampler" a second time, which
    # silently replaced the under-sampler with an over-sampler and left
    # "RandomOverSampler" unreachable.
    elif resampling_method == "RandomOverSampler":
        resample = RandomOverSampler(random_state=42)

    elif resampling_method == "SMOTE":
        resample = SMOTE(random_state=42, n_jobs=-1)

    else:
        raise ValueError(
            "Unknown resampling_method: {!r}".format(resampling_method))

    train_y.data = LabelEncoder().fit_transform(train_y.data)

    # Uncomment the line below to show a pie chart of the class distribution
    # before resampling.
    #plotGraphics.piePlot(train_y, "Before Resampling")

    # transform the dataset
    train_x.data, train_y.data = resample.fit_resample(train_x.data,
                                                       train_y.data)
def test_cnn_fit_single_class():
    """Fitting on a target holding a single class must emit a ``UserWarning``."""
    sampler = CondensedNearestNeighbour(random_state=RND_SEED)

    # An all-zero target, one entry per row of X, contains only one class.
    single_class_target = np.zeros((X.shape[0], ))
    assert_warns(UserWarning, sampler.fit, X, single_class_target)
def votingClassifier():
    """Train, time and evaluate a soft-voting ensemble (random forest + k-NN).

    Reads the module-level ``train_x``, ``train_Y``, ``test_x`` and
    ``test_Y``.  The ensemble weights the random forest 2:1 against k-NN.
    Training time, the confusion matrix, the zero-one error, the accuracy
    score, the classification report and the per-class ratio of the
    confusion-matrix diagonal to its row sums are printed, and the confusion
    matrix is plotted.  Nothing is returned.
    """
    print(colored("------Voting Classification-------", 'red'))

    # models
    random_forest = RandomForestClassifier(criterion='entropy',
                                           max_depth=30,
                                           n_estimators=48,
                                           random_state=0)
    clf_knn = KNeighborsClassifier(n_neighbors=7)
    # build classifier
    model = VotingClassifier(estimators=[('rf', random_forest),
                                         ('knn', clf_knn)],
                             voting='soft',
                             n_jobs=-1,
                             weights=[2, 1])

    print("Training the Voting classification.......")

    # start timer
    starttime = timeit.default_timer()

    # train
    model.fit(train_x, train_Y)

    print("The time difference is :", timeit.default_timer() - starttime)

    print("Predicting test data.......")

    # predict
    y_pred = model.predict(test_x)

    # results
    c_matrix = confusion_matrix(test_Y, y_pred)
    error = zero_one_loss(test_Y, y_pred)
    score = accuracy_score(test_Y, y_pred)

    # display results
    print('Confusion Matrix\n---------------------------\n', c_matrix)
    print('---------------------------')
    print("Error: {:.4f}%".format(error * 100))
    print("Accuracy Score: {:.4f}%".format(score * 100))
    print(classification_report(test_Y, y_pred))
    print('accuracy: ', c_matrix.diagonal() / c_matrix.sum(axis=1))

    # Plot non-normalized confusion matrix
    plot_confusion_matrix(model,
                          test_x,
                          test_Y,
                          cmap=plt.cm.Greens,
                          values_format='.0f',
                          xticks_rotation='horizontal')
    plt.title("Confusion Matrix for Voting Classifier")

    plt.show()
Exemple #28
0
def get_data(force_reload=False, strategy='oversampling', test_size=0.15):
    """Return train/validation arrays, resampling and caching the training split.

    Four ``.npy`` cache files named after ``strategy`` are looked up in
    ``DATA_DIR``.  When all of them exist and ``force_reload`` is false they
    are loaded and returned.  Otherwise ``train.csv`` is read from
    ``DATA_DIR``, converted with ``to_data_format``, split, the training part
    is resampled according to ``strategy`` and all four arrays are saved.

    Parameters
    ----------
    force_reload : bool
        Rebuild the data even when the cache files exist.
    strategy : str
        'oversampling' (SMOTE), 'combine' (SMOTEENN), 'undersampling'
        (EditedNearestNeighbours) or 'condensed-undersampling'
        (CondensedNearestNeighbour).  Any other value leaves the training
        split unchanged.
    test_size : float
        Fraction passed to ``train_test_split``.

    Returns
    -------
    tuple
        ``(X_train, X_val, y_train, y_val)``.
    """
    name_templates = ('train_data.{}.npy', 'train_labels.{}.npy',
                      'val_data.{}.npy', 'val_labels.{}.npy')
    cache_files = [os.path.join(DATA_DIR, template.format(strategy))
                   for template in name_templates]
    train_data_file, train_labels_file, val_data_file, val_labels_file = cache_files

    cache_complete = all(os.path.exists(path) for path in cache_files)
    if cache_complete and not force_reload:
        X_train = np.load(train_data_file)
        y_train = np.load(train_labels_file)
        X_val = np.load(val_data_file)
        y_val = np.load(val_labels_file)
        return X_train, X_val, y_train, y_val

    # Cache miss (or forced reload): rebuild everything from the CSV.
    train_df = pd.read_csv(os.path.join(DATA_DIR, 'train.csv'))
    X, y = to_data_format(train_df)
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=test_size)

    print('Shapes before: {}, {}'.format(X_train.shape, y_train.shape))

    # ``n_jobs`` is a module-level setting, not a parameter of this function.
    if strategy == 'oversampling':
        resampler = SMOTE(n_jobs=n_jobs)
        X_train, y_train = resampler.fit_resample(X_train, y_train)
    elif strategy == 'combine':
        smote = SMOTE(n_jobs=n_jobs)
        enn = EditedNearestNeighbours(n_jobs=n_jobs)
        resampler = SMOTEENN(smote=smote, enn=enn)
        X_train, y_train = resampler.fit_resample(X_train, y_train)
    elif strategy == 'undersampling':
        resampler = EditedNearestNeighbours(n_jobs=n_jobs)
        X_train, y_train = resampler.fit_resample(X_train, y_train)
    elif strategy == 'condensed-undersampling':
        resampler = CondensedNearestNeighbour(n_jobs=n_jobs, n_neighbors=3)
        X_train, y_train = resampler.fit_resample(X_train, y_train)

    print('Shapes after: {}, {}'.format(X_train.shape, y_train.shape))

    for path, array in ((train_data_file, X_train),
                        (train_labels_file, y_train),
                        (val_data_file, X_val),
                        (val_labels_file, y_val)):
        np.save(path, array)

    return X_train, X_val, y_train, y_val
Exemple #29
0
    def resample(self, X, y, by, random_state=None, visualize=False):
        """Resample ``X``/``y`` with the algorithm selected by ``by``.

        Parameters
        ----------
        X, y
            Data and labels passed to the sampler's ``fit_resample``.
        by : str
            Resampling method.  Supported: 'RUS', 'CNN', 'ENN', 'NCR',
            'Tomek', 'ALLKNN', 'OSS', 'NM', 'CC', 'SMOTE', 'ADASYN',
            'BorderSMOTE', 'SMOTEENN', 'SMOTETomek', and 'ORG' which returns
            the data untouched.
        random_state
            Forwarded to the sampler constructor.
        visualize : bool
            When true, draw a scatter plot of columns 0 and 1 of the result,
            coloured by label.

        Returns
        -------
        tuple
            ``(X_train, y_train)``.

        Raises
        ------
        Error
            For an unsupported ``by`` (``Error`` is not defined in this
            block -- presumably provided elsewhere in the module; confirm).
        """
        if by == 'ORG':
            # 'ORG' means "original": no sampler is involved.
            X_train, y_train = X, y
        else:
            sampler_classes = {
                'RUS': RandomUnderSampler,
                'CNN': CondensedNearestNeighbour,
                'ENN': EditedNearestNeighbours,
                'NCR': NeighbourhoodCleaningRule,
                'Tomek': TomekLinks,
                'ALLKNN': AllKNN,
                'OSS': OneSidedSelection,
                'NM': NearMiss,
                'CC': ClusterCentroids,
                'SMOTE': SMOTE,
                'ADASYN': ADASYN,
                'BorderSMOTE': BorderlineSMOTE,
                'SMOTEENN': SMOTEENN,
                'SMOTETomek': SMOTETomek,
            }
            if by not in sampler_classes:
                raise Error('Unexpected \'by\' type {}'.format(by))
            sampler = sampler_classes[by](random_state=random_state)
            X_train, y_train = sampler.fit_resample(X, y)

        if visualize:
            frame = pd.DataFrame(X_train)
            frame['label'] = y_train
            frame.plot.scatter(x=0,
                               y=1,
                               c='label',
                               s=3,
                               colormap='coolwarm',
                               title='{} training set'.format(by))
        return X_train, y_train
def train_stage(df_path, cb_path):
    """Run 15-fold cross-validated CatBoost training with CNN under-sampling.

    The CSV at ``df_path`` must contain the columns ``ID_code`` and
    ``target``.  For every fold the training part is under-sampled with
    ``CondensedNearestNeighbour`` and handed to ``fit_cb`` together with the
    untouched validation part; the values ``fit_cb`` returns are accumulated
    at the validation positions (presumably validation predictions --
    confirm against ``fit_cb``).  The ROC AUC of the accumulated values
    against ``target`` is printed.

    Parameters
    ----------
    df_path
        Path of the training CSV.
    cb_path
        Forwarded unchanged to ``fit_cb``.

    Returns
    -------
    int
        Always ``0``.
    """
    print('Load Train Data.')
    df = pd.read_csv(df_path)
    print('\nShape of Train Data: {}'.format(df.shape))

    y_df = np.array(df['target'])
    df_ids = np.array(df.index)
    df.drop(['ID_code', 'target'], axis=1, inplace=True)

    cb_cv_result = np.zeros(df.shape[0])

    # ``random_state`` has no effect without shuffling and current
    # scikit-learn rejects the combination, so it is not passed; the
    # (unshuffled) folds are unchanged.
    skf = StratifiedKFold(n_splits=15, shuffle=False)

    #sm = TomekLinks(random_state=42)
    sm = CondensedNearestNeighbour(random_state=42, n_jobs=3)

    print('\nModel Fitting...')
    for counter, (fit_idx, val_idx) in enumerate(skf.split(df_ids, y_df)):
        print('\nFold {}'.format(counter + 1))
        X_fit, y_fit = df.values[fit_idx], y_df[fit_idx]
        X_val, y_val = df.values[val_idx], y_df[val_idx]

        # Only the training part of the fold is under-sampled.
        X_fit, y_fit = sm.fit_sample(X_fit, y_fit)

        print('CatBoost')
        cb_cv_result[val_idx] += fit_cb(X_fit,
                                        y_fit,
                                        X_val,
                                        y_val,
                                        counter,
                                        cb_path,
                                        name='cb')

        # Release the fold's arrays before the next iteration.
        del X_fit, X_val, y_fit, y_val
        gc.collect()

    auc_cb = round(roc_auc_score(y_df, cb_cv_result), 4)
    print('Catboost VAL AUC: {}'.format(auc_cb))

    return 0
Exemple #31
0
def readFile(path, y_label, method, encode_features=[], skew_exempted=[],
             training_ratio=0.7, shuffle=True, needSkew=False, fea_eng=True):
    """Load a CSV, preprocess it, optionally rebalance it and split it.

    Steps, in order: optional row shuffle, optional ``log1p`` on skewed
    non-object columns, one-hot encoding of ``encode_features``, filling
    missing values with the column means, optional resampling, and the final
    split through ``split``.

    Parameters
    ----------
    path
        CSV file to read.
    y_label
        Name of the target column.
    method
        'OverSample' applies ``ADASYN(random_state=42)``; 'UnderSample'
        applies ``CondensedNearestNeighbour(random_state=42)``; any other
        value skips resampling.
    encode_features
        Columns passed to ``pd.get_dummies``.
    skew_exempted
        Non-object columns excluded from the skewness correction.
    training_ratio
        Forwarded to ``split``.
    shuffle
        Shuffle the rows before processing.
    needSkew
        Apply ``log1p`` to columns whose skewness exceeds 0.75.
    fea_eng
        Accepted but not used by this function.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``.
    """
    raw = pd.read_csv(path)

    if shuffle:
        raw = raw.sample(frac=1).reset_index(drop=True)

    if needSkew:
        candidate_cols = raw.dtypes[raw.dtypes != "object"].index.drop(skew_exempted)
        skewness = raw[candidate_cols].apply(lambda col: skew(col.dropna()))
        skewed_cols = skewness[skewness > 0.75].index
        # log1p reduces the skewness of the selected columns.
        raw[skewed_cols] = np.log1p(raw[skewed_cols])

    # Encode the categorical features, then fill the gaps with column means.
    raw = pd.get_dummies(raw, columns=encode_features)
    raw = raw.fillna(raw.mean())

    X = raw.drop(y_label, axis=1)
    y = raw[y_label]

    if method == 'OverSample':
        oversampler = ADASYN(random_state=42)
        X, y = oversampler.fit_resample(X, y)
    if method == 'UnderSample':
        undersampler = CondensedNearestNeighbour(random_state=42)
        X, y = undersampler.fit_resample(X, y)

    X_train, X_test, y_train, y_test = split(X, y, training_ratio)
    return X_train, X_test, y_train, y_test
Exemple #32
0
def test_cnn_fit_sample_with_indices():
    """Verify data, labels and kept indices when ``return_indices=True``."""
    condenser = CondensedNearestNeighbour(return_indices=True, random_state=RND_SEED)
    X_out, y_out, kept_idx = condenser.fit_sample(X, Y)

    reference_X = np.array([
        [-0.10903849, -0.12085181], [0.01936241, 0.17799828],
        [0.05230552, 0.09043907], [-1.25020462, -0.40402054],
        [0.70524765, 0.39816382], [0.35831463, 1.33483198],
        [-0.284881, -0.62730973], [0.03394306, 0.03986753],
        [-0.01252787, 0.34102657], [0.15198585, 0.12512646],
    ])
    reference_y = np.array([0, 0, 1, 1, 1, 2, 2, 2, 2, 2])
    reference_idx = np.array([4, 11, 17, 12, 19, 9, 5, 7, 14, 18])

    assert_array_equal(X_out, reference_X)
    assert_array_equal(y_out, reference_y)
    assert_array_equal(kept_idx, reference_idx)
Exemple #33
0
    def resample(self, X, y, by, random_state=None):
        """Resample ``X``/``y`` with the algorithm selected by ``by``.

        Parameters
        ----------
        X, y
            Data and labels passed to the sampler's ``fit_resample``.
        by : str
            Resampling method.  Supported: 'RUS', 'CNN', 'ENN', 'NCR',
            'Tomek', 'ALLKNN', 'OSS', 'NM', 'CC', 'ROS', 'SMOTE', 'ADASYN',
            'BorderSMOTE', 'SMOTEENN', 'SMOTETomek', and 'ORG' which returns
            the data untouched.
        random_state
            Forwarded to the sampler constructor.

        Returns
        -------
        tuple
            ``(X_train, y_train)``.

        Raises
        ------
        Error
            For an unsupported ``by`` (``Error`` is not defined in this
            block -- presumably provided elsewhere in the module; confirm).
        """
        # 'ORG' means "original": hand the inputs back unchanged.
        if by == 'ORG':
            return X, y

        sampler_classes = {
            'RUS': RandomUnderSampler,
            'CNN': CondensedNearestNeighbour,
            'ENN': EditedNearestNeighbours,
            'NCR': NeighbourhoodCleaningRule,
            'Tomek': TomekLinks,
            'ALLKNN': AllKNN,
            'OSS': OneSidedSelection,
            'NM': NearMiss,
            'CC': ClusterCentroids,
            'ROS': RandomOverSampler,
            'SMOTE': SMOTE,
            'ADASYN': ADASYN,
            'BorderSMOTE': BorderlineSMOTE,
            'SMOTEENN': SMOTEENN,
            'SMOTETomek': SMOTETomek,
        }
        if by not in sampler_classes:
            raise Error('Unexpected \'by\' type {}'.format(by))

        sampler = sampler_classes[by](random_state=random_state)
        X_train, y_train = sampler.fit_resample(X, y)
        return X_train, y_train
def test_cnn_init():
    """Check every attribute of a sampler built with ``verbose=True``."""
    sampler = CondensedNearestNeighbour(random_state=RND_SEED, verbose=True)

    expected_attributes = (
        ('size_ngh', 1),
        ('n_seeds_S', 1),
        ('n_jobs', -1),
        ('random_state', RND_SEED),
        ('verbose', True),
        # Statistics are expected to stay empty until ``fit`` is called.
        ('min_c_', None),
        ('maj_c_', None),
        ('stats_c_', {}),
    )
    for attribute, expected in expected_attributes:
        assert_equal(getattr(sampler, attribute), expected)
def UnderSample(X, Y, method='Random', random_state=42):
    """Under-sample ``(X, Y)`` with the sampler selected by ``method``.

    Parameters
    ----------
    X : array-like exposing ``size``, ``reshape`` and ``len``
        Feature data. When ``X.size == len(X)`` (one value per sample) it is
        reshaped to a single column before resampling.
    Y : array-like
        Target labels, passed unchanged to the sampler.
    method : str, default 'Random'
        One of 'Cluster', 'Random', 'NearMiss_1', 'NearMiss_2', 'NearMiss_3',
        'TomekLinks', 'ENN', 'RENN', 'All_KNN', 'CNN', 'One_SS', 'NCR', 'IHT'.
    random_state : int, default 42
        Seed forwarded to the selected sampler.

    Returns
    -------
    tuple
        ``(X_resampled, Y_resampled)`` as returned by the sampler's
        ``fit_sample``.

    Raises
    ------
    ValueError
        If ``method`` is not one of the supported names.
    """
    if X.size == len(X):
        X = X.reshape(-1, 1)

    # Strings are compared with ``==``: identity (``is``) only matches when
    # the interpreter happens to intern both strings, so a method name built
    # at runtime could silently fail to match any branch.
    if method == 'Cluster':  # estimator=None selects the default KMeans estimator
        sampler = ClusterCentroids(ratio='auto',
                                   random_state=random_state,
                                   estimator=None)
    elif method == 'Random':
        sampler = RandomUnderSampler(ratio='auto',
                                     random_state=random_state,
                                     replacement=False)
    elif method == 'NearMiss_1':
        sampler = NearMiss(ratio='auto', random_state=random_state, version=1)
    elif method == 'NearMiss_2':
        sampler = NearMiss(ratio='auto', random_state=random_state, version=2)
    elif method == 'NearMiss_3':
        sampler = NearMiss(ratio='auto', random_state=random_state, version=3)
    elif method == 'TomekLinks':
        sampler = TomekLinks(ratio='auto', random_state=random_state)
    elif method == 'ENN':  # kind_sel may be 'all' or 'mode'
        sampler = EditedNearestNeighbours(ratio='auto',
                                          random_state=random_state,
                                          kind_sel='all')
    elif method == 'RENN':  # kind_sel may be 'all' or 'mode'
        sampler = RepeatedEditedNearestNeighbours(ratio='auto',
                                                  random_state=random_state,
                                                  kind_sel='all')
    elif method == 'All_KNN':
        sampler = AllKNN(ratio='auto',
                         random_state=random_state,
                         kind_sel='all')
    elif method == 'CNN':
        sampler = CondensedNearestNeighbour(ratio='auto',
                                            random_state=random_state)
    elif method == 'One_SS':
        sampler = OneSidedSelection(ratio='auto', random_state=random_state)
    elif method == 'NCR':
        sampler = NeighbourhoodCleaningRule(ratio='auto',
                                            random_state=random_state,
                                            kind_sel='all',
                                            threshold_cleaning=0.5)
    elif method == 'IHT':
        sampler = InstanceHardnessThreshold(estimator=None,
                                            ratio='auto',
                                            random_state=random_state)
    else:
        # Previously an unknown name left ``sampler`` unbound and surfaced as
        # an UnboundLocalError at the ``fit_sample`` call below.
        raise ValueError(
            'Unknown under-sampling method {!r}'.format(method))

    X_resampled, Y_resampled = sampler.fit_sample(X, Y)
    return X_resampled, Y_resampled
def equalize_training_dataset_with_CondensedNN(x_train, y_train):
    """Resample a training set with CondensedNearestNeighbour.

    ``x_train`` is flattened to two dimensions (samples x features) for the
    sampler, and the resampled features are reshaped back so that only the
    leading (sample) dimension differs from the input. The sorted per-class
    counts of the resampled labels are printed.

    Returns the tuple ``(x_resampled, y_resampled)``.
    """
    from imblearn.under_sampling import CondensedNearestNeighbour

    # Remember the input layout; everything after axis 0 is restored later.
    input_shape = tuple(x_train.shape)

    # The sampler works on 2-D data: one row per sample.
    flattened = np.reshape(x_train, (x_train.shape[0], -1))

    # Request 180 samples for each of the class labels 0..42.
    # NOTE(review): imbalanced-learn may not accept a dict-valued
    # sampling_strategy for this sampler - confirm against the installed
    # version.
    per_class_target = {label: 180 for label in range(0, 43)}
    sampler = CondensedNearestNeighbour(sampling_strategy=per_class_target,
                                        n_neighbors=5,
                                        n_jobs=8)
    x_resampled, y_resampled = sampler.fit_resample(flattened, y_train)
    print(sorted(Counter(y_resampled).items()))

    # Restore the trailing dimensions with the new sample count in front.
    restored_shape = (x_resampled.shape[0],) + input_shape[1:]
    x_resampled = np.reshape(x_resampled, restored_shape)

    return x_resampled, y_resampled
Exemple #37
0
def under_sampling_algs():
    """Return the list of ``(sampler, short_label)`` pairs to evaluate.

    The first entry is a placeholder for the no-resampling baseline: it holds
    a descriptive string instead of a sampler object.
    """
    return [
        ("No Rs Undersampling case", "No Re-sampling"),
        (RandomUnderSampler(random_state=1), 'RU'),
        (ClusterCentroids(random_state=1), 'CC'),
        (TomekLinks(), 'TL'),
        (NearMiss(version=1), 'NM1'),
        (NearMiss(version=2), 'NM2'),
        (NearMiss(version=3), 'NM3'),
        (CondensedNearestNeighbour(random_state=1), 'CNN'),
        (OneSidedSelection(random_state=1), 'OSS'),
        (EditedNearestNeighbours(), 'ENN'),
        (NeighbourhoodCleaningRule(), 'NCL'),
        (InstanceHardnessThreshold(random_state=1), 'IHT'),
        (RepeatedEditedNearestNeighbours(), 'RENN'),
        (AllKNN(), 'AllKNN'),
    ]
Exemple #38
0
def load_data(mode: str, normalize: bool = True):
    """Load the data set, optionally resample it, and optionally standardise it.

    Parameters
    ----------
    mode : str
        Resampling to apply: 'smote', 'adasyn', 'bordersmote', 'randomover',
        'randomunder', 'tomek' or 'knn' (CondensedNearestNeighbour with
        ``n_neighbors=3``). 'vanilla' - and any other value - leaves the data
        untouched.
    normalize : bool, default True
        When true, subtract the per-column mean from ``x`` and divide by the
        per-column standard deviation.

    Returns
    -------
    tuple
        ``(x, y, hidden)``. ``hidden`` is re-indexed only for the modes whose
        sampler returns the selected indices; the 'smote', 'adasyn' and
        'bordersmote' modes leave it as loaded.
    """
    df, hidden_df = __load_data_first_time()

    # Split the target column off from the feature columns.
    y = np.array(df['earnings'].to_numpy(), dtype=int)
    del df['earnings']
    x = np.array(df.to_numpy(), dtype=float)

    hidden = hidden_df.to_numpy()

    # Indices of the rows kept/duplicated by index-returning samplers.
    selected = None

    if mode == 'smote':
        x, y = SMOTE().fit_sample(x, y)
    elif mode == 'adasyn':
        x, y = ADASYN().fit_sample(x, y)
    elif mode == 'bordersmote':
        x, y = BorderlineSMOTE().fit_sample(x, y)
    elif mode == 'randomover':
        sampler = RandomOverSampler(return_indices=True)
        x, y, selected = sampler.fit_sample(x, y)
    elif mode == 'randomunder':
        sampler = RandomUnderSampler(return_indices=True)
        x, y, selected = sampler.fit_sample(x, y)
    elif mode == 'tomek':
        sampler = TomekLinks(return_indices=True)
        x, y, selected = sampler.fit_sample(x, y)
    elif mode == 'knn':
        sampler = CondensedNearestNeighbour(return_indices=True, n_neighbors=3)
        x, y, selected = sampler.fit_sample(x, y)

    # Keep ``hidden`` aligned with the rows the sampler selected.
    if selected is not None:
        hidden = hidden[selected]

    if not normalize:
        return x, y, hidden

    # Column-wise standardisation, performed in place on ``x``.
    x -= np.mean(x, axis=0)
    x /= np.std(x, axis=0)
    return x, y, hidden
def test_cnn_fit_sample_with_wrong_object():
    """fit_sample must raise ValueError when n_neighbors is an arbitrary string."""
    sampler = CondensedNearestNeighbour(random_state=RND_SEED,
                                        n_neighbors='rnd')
    with raises(ValueError, match="has to be a int or an "):
        sampler.fit_sample(X, Y)
Exemple #40
0
# Colour palette used for the class markers in both plots.
palette = sns.color_palette()


# Generate a two-class, 20-feature data set of 5000 samples
# (class weights 0.1 / 0.9, fixed seed for reproducibility).
X, y = make_classification(n_classes=2, class_sep=2, weights=[0.1, 0.9],
                           n_informative=3, n_redundant=1, flip_y=0,
                           n_features=20, n_clusters_per_class=1,
                           n_samples=5000, random_state=10)

# Instantiate a PCA object for the sake of easy visualisation
pca = PCA(n_components=2)
# Fit the PCA on X and project X into a 2D feature space for plotting
X_vis = pca.fit_transform(X)

# Apply Condensed Nearest Neighbours, then project the resampled data with
# the PCA already fitted on the original data so both plots share the same axes
cnn = CondensedNearestNeighbour()
X_resampled, y_resampled = cnn.fit_sample(X, y)
X_res_vis = pca.transform(X_resampled)

# Two side-by-side subplots; unpack the axes array immediately
f, (ax1, ax2) = plt.subplots(1, 2)

# Left plot: the original data, one scatter call per class
ax1.scatter(X_vis[y == 0, 0], X_vis[y == 0, 1], label="Class #0", alpha=0.5,
            edgecolor=almost_black, facecolor=palette[0], linewidth=0.15)
ax1.scatter(X_vis[y == 1, 0], X_vis[y == 1, 1], label="Class #1", alpha=0.5,
            edgecolor=almost_black, facecolor=palette[2], linewidth=0.15)
ax1.set_title('Original set')

# Right plot: class 0 of the resampled data
ax2.scatter(X_res_vis[y_resampled == 0, 0], X_res_vis[y_resampled == 0, 1],
            label="Class #0", alpha=.5, edgecolor=almost_black,
            facecolor=palette[0], linewidth=0.15)