def test_cnn_fit_resample_with_object():
    """Check ``n_neighbors`` given as an estimator object and as an integer.

    A ``KNeighborsClassifier(n_neighbors=1)`` instance and the plain integer
    ``1`` are each passed as ``n_neighbors``; both runs are compared against
    the same expected resampled data.
    """
    expected_X = np.array([
        [-0.10903849, -0.12085181],
        [0.01936241, 0.17799828],
        [0.05230552, 0.09043907],
        [-1.25020462, -0.40402054],
        [0.70524765, 0.39816382],
        [0.35831463, 1.33483198],
        [-0.284881, -0.62730973],
        [0.03394306, 0.03986753],
        [-0.01252787, 0.34102657],
        [0.15198585, 0.12512646],
    ])
    expected_y = np.array([0, 0, 1, 1, 1, 2, 2, 2, 2, 2])

    # First the estimator object, then the equivalent integer setting.
    for neighbours in (KNeighborsClassifier(n_neighbors=1), 1):
        sampler = CondensedNearestNeighbour(random_state=RND_SEED,
                                            n_neighbors=neighbours)
        X_resampled, y_resampled = sampler.fit_resample(X, Y)
        assert_array_equal(X_resampled, expected_X)
        assert_array_equal(y_resampled, expected_y)
def cnn_test(data_set: pd.DataFrame, metric: str, k: int, weights='uniform'):
    """Train a k-NN classifier on CNN-undersampled data and report accuracy.

    The first two columns of ``data_set`` are used as features and the
    remaining columns as the target. The data is split 70/30
    (``random_state=10``), the training part is resampled with
    ``CondensedNearestNeighbour(sampling_strategy="all")``, a
    ``KNeighborsClassifier`` is fitted on the resampled data, the test
    accuracy is printed and the decision boundaries are plotted.

    Parameters
    ----------
    data_set : pd.DataFrame
        Input table; columns ``0:2`` are features, columns ``2:`` the target.
    metric : str
        Distance metric forwarded to ``KNeighborsClassifier``.
    k : int
        Neighbour count used by both the sampler and the classifier.
    weights : str, default 'uniform'
        Weighting scheme forwarded to ``KNeighborsClassifier``.
    """
    features = np.array(data_set.iloc[:, :2])
    target = np.array(data_set.iloc[:, 2:])
    X_train, X_test, y_train, y_test = train_test_split(
        features, target, test_size=0.3, random_state=10)

    # Undersample only the training portion.
    sampler = CondensedNearestNeighbour(n_neighbors=k, sampling_strategy="all")
    X_train_re, y_train_re = sampler.fit_resample(X_train, y_train)

    clf = neighbors.KNeighborsClassifier(k, metric=metric, weights=weights)
    clf.fit(X_train_re, y_train_re.ravel())

    print(accuracy_score(clf.predict(X_test), y_test))
    # The plot receives the original (not resampled) training data.
    plot_decisions_boundaries(X_train, y_train, clf=clf)
Exemple #3
0
def condensed_nearest_neighbour(X,
                                y,
                                visualize=False,
                                pca2d=True,
                                pca3d=True,
                                tsne=True,
                                pie_evr=True):
    """Undersample ``X``/``y`` with ``CondensedNearestNeighbour(random_state=42)``.

    Parameters
    ----------
    X, y
        Data and labels forwarded to ``fit_resample``.
    visualize : bool, default False
        When truthy, call ``hist_over_and_undersampling`` on the resampled
        labels and ``pca_general`` on the resampled data.
    pca2d, pca3d, pie_evr : bool
        Forwarded to ``pca_general`` as ``d2``, ``d3`` and ``pie_evr``.
    tsne : bool
        Accepted for interface compatibility; not used by this function.

    Returns
    -------
    tuple
        ``(X_res, y_res)`` as returned by ``fit_resample``.
    """
    sampler = CondensedNearestNeighbour(random_state=42)
    X_res, y_res = sampler.fit_resample(X, y)

    # Plain truthiness test instead of comparing against the True literal.
    if visualize:
        hist_over_and_undersampling(y_res)
        pca_general(X_res, y_res, d2=pca2d, d3=pca3d, pie_evr=pie_evr)

    return X_res, y_res
def test_cnn_fit_resample_with_indices():
    """Check the data, labels and indices returned with ``return_indices=True``.

    NOTE(review): ``return_indices`` is believed to be deprecated in later
    imbalanced-learn releases in favour of a fitted attribute - confirm
    against the version this test targets.
    """
    sampler = CondensedNearestNeighbour(return_indices=True,
                                        random_state=RND_SEED)
    X_resampled, y_resampled, idx_under = sampler.fit_resample(X, Y)

    expected_X = np.array([
        [-0.10903849, -0.12085181],
        [0.01936241, 0.17799828],
        [0.05230552, 0.09043907],
        [-1.25020462, -0.40402054],
        [0.70524765, 0.39816382],
        [0.35831463, 1.33483198],
        [-0.284881, -0.62730973],
        [0.03394306, 0.03986753],
        [-0.01252787, 0.34102657],
        [0.15198585, 0.12512646],
    ])
    expected_y = np.array([0, 0, 1, 1, 1, 2, 2, 2, 2, 2])
    expected_idx = np.array([4, 11, 17, 12, 19, 9, 5, 7, 14, 18])

    # Compare in the same order as before: data, labels, then indices.
    comparisons = (
        (X_resampled, expected_X),
        (y_resampled, expected_y),
        (idx_under, expected_idx),
    )
    for actual, expected in comparisons:
        assert_array_equal(actual, expected)
Exemple #5
0
def get_data(force_reload=False, strategy='oversampling', test_size=0.15):
    """Return train/validation arrays, resampling and caching them as needed.

    Four ``.npy`` cache files (train/val data and labels) are looked up in
    ``DATA_DIR``; their names include ``strategy``. If all four exist and
    ``force_reload`` is false they are loaded and returned. Otherwise
    ``train.csv`` is read from ``DATA_DIR``, converted with
    ``to_data_format``, split, the training part is resampled according to
    ``strategy``, and the four arrays are written to the cache files.

    Parameters
    ----------
    force_reload : bool, default False
        Rebuild the arrays even when all cache files exist.
    strategy : str, default 'oversampling'
        One of ``'oversampling'`` (SMOTE), ``'combine'`` (SMOTEENN),
        ``'undersampling'`` (EditedNearestNeighbours) or
        ``'condensed-undersampling'`` (CondensedNearestNeighbour with
        ``n_neighbors=3``). Any other value leaves the training split
        untouched.
    test_size : float, default 0.15
        Forwarded to ``train_test_split``.

    Returns
    -------
    tuple
        ``(X_train, X_val, y_train, y_val)``.

    Notes
    -----
    ``n_jobs`` is not a parameter; it is read from the enclosing scope.
    """

    def cache_path(stem):
        # e.g. <DATA_DIR>/train_data.<strategy>.npy
        return os.path.join(DATA_DIR, '{}.{}.npy'.format(stem, strategy))

    train_data_file = cache_path('train_data')
    train_labels_file = cache_path('train_labels')
    val_data_file = cache_path('val_data')
    val_labels_file = cache_path('val_labels')

    cache_complete = all(
        os.path.exists(path)
        for path in (train_data_file, train_labels_file,
                     val_data_file, val_labels_file))

    if cache_complete and not force_reload:
        X_train = np.load(train_data_file)
        y_train = np.load(train_labels_file)
        X_val = np.load(val_data_file)
        y_val = np.load(val_labels_file)
        return X_train, X_val, y_train, y_val

    # Cache miss (or forced rebuild): read, split, resample, then persist.
    train_df = pd.read_csv(os.path.join(DATA_DIR, 'train.csv'))
    X, y = to_data_format(train_df)
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=test_size)

    print('Shapes before: {}, {}'.format(X_train.shape, y_train.shape))

    sampler = None
    if strategy == 'oversampling':
        sampler = SMOTE(n_jobs=n_jobs)
    elif strategy == 'combine':
        sampler = SMOTEENN(smote=SMOTE(n_jobs=n_jobs),
                           enn=EditedNearestNeighbours(n_jobs=n_jobs))
    elif strategy == 'undersampling':
        sampler = EditedNearestNeighbours(n_jobs=n_jobs)
    elif strategy == 'condensed-undersampling':
        sampler = CondensedNearestNeighbour(n_jobs=n_jobs, n_neighbors=3)

    # Only the training split is ever resampled.
    if sampler is not None:
        X_train, y_train = sampler.fit_resample(X_train, y_train)

    print('Shapes after: {}, {}'.format(X_train.shape, y_train.shape))

    np.save(train_data_file, X_train)
    np.save(train_labels_file, y_train)
    np.save(val_data_file, X_val)
    np.save(val_labels_file, y_val)

    return X_train, X_val, y_train, y_val
Exemple #6
0
def readFile(path, y_label, method, encode_features=None, skew_exempted=None,
             training_ratio=0.7, shuffle=True, needSkew=False, fea_eng=True):
    """Load a CSV, preprocess it, optionally resample it, and split it.

    Parameters
    ----------
    path
        Location of the CSV file, passed to ``pd.read_csv``.
    y_label
        Name of the target column.
    method
        ``'OverSample'`` resamples with ``ADASYN(random_state=42)``;
        ``'UnderSample'`` resamples with
        ``CondensedNearestNeighbour(random_state=42)``; any other value
        leaves the data as is.
    encode_features : list, optional
        Columns to one-hot encode with ``pd.get_dummies``. ``None`` (the
        default) means no columns, exactly like an empty list.
    skew_exempted : list, optional
        Non-object columns excluded from the skewness correction. ``None``
        (the default) means no exemptions.
    training_ratio : float, default 0.7
        Forwarded to ``split``.
    shuffle : bool, default True
        Shuffle the rows before any other processing.
    needSkew : bool, default False
        Apply ``log1p`` to non-object columns whose skewness exceeds 0.75.
    fea_eng : bool, default True
        Accepted for interface compatibility; not used by this function.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)`` as produced by ``split``.
    """
    # None replaces the former mutable list defaults; the effective default
    # is still "no columns".
    encode_features = [] if encode_features is None else encode_features
    skew_exempted = [] if skew_exempted is None else skew_exempted

    raw = pd.read_csv(path)

    if shuffle:
        raw = raw.sample(frac=1).reset_index(drop=True)

    if needSkew:
        candidate_cols = raw.dtypes[raw.dtypes != "object"].index.drop(skew_exempted)
        skewness = raw[candidate_cols].apply(lambda col: skew(col.dropna()))
        skewed_cols = skewness[skewness > 0.75].index
        raw[skewed_cols] = np.log1p(raw[skewed_cols])  # reduce skewness

    raw = pd.get_dummies(raw, columns=encode_features)  # encode categorical features
    # Fill remaining gaps with the per-column mean.
    raw = raw.fillna(raw.mean())

    X = raw.drop(y_label, axis=1)
    y = raw[y_label]

    if method == 'OverSample':
        X, y = ADASYN(random_state=42).fit_resample(X, y)
    elif method == 'UnderSample':
        X, y = CondensedNearestNeighbour(random_state=42).fit_resample(X, y)

    X_train, X_test, y_train, y_test = split(X, y, training_ratio)
    return X_train, X_test, y_train, y_test
def resampling_assigner(imb_technique, AP_ova_X_train, AP_ova_y_train,
                        PM_ova_X_train, PM_ova_y_train, SC_ova_X_train,
                        SC_ova_y_train):
    """Resample the three one-vs-all training sets with the chosen technique.

    A fresh, default-constructed sampler is used for each of the AP, PM and
    SC training sets.

    Parameters
    ----------
    imb_technique : str
        One of ``"ADASYN"``, ``"ALLKNN"``, ``"CNN"``, ``"ENN"``, ``"IHT"``,
        ``"NCR"``, ``"NM"``, ``"OSS"``, ``"RENN"``, ``"SMOTE"``,
        ``"BSMOTE"``, ``"SMOTEENN"``, ``"SMOTETOMEK"``, ``"TOMEK"``,
        ``"ROS"`` or ``"RUS"``.
    AP_ova_X_train, AP_ova_y_train, PM_ova_X_train, PM_ova_y_train,
    SC_ova_X_train, SC_ova_y_train
        Feature/label pairs for the three one-vs-all problems.

    Returns
    -------
    tuple
        ``(AP_X_res, AP_y_res, PM_X_res, PM_y_res, SC_X_res, SC_y_res)``.

    Raises
    ------
    ValueError
        If ``imb_technique`` is not one of the supported names.
    """
    print(imb_technique)

    # Lambdas keep the class lookup lazy: a sampler name is only resolved
    # when its technique is actually requested.
    sampler_factories = {
        "ADASYN": lambda: ADASYN(),
        "ALLKNN": lambda: AllKNN(),
        "CNN": lambda: CondensedNearestNeighbour(),
        "ENN": lambda: EditedNearestNeighbours(),
        "IHT": lambda: InstanceHardnessThreshold(),
        "NCR": lambda: NeighbourhoodCleaningRule(),
        "NM": lambda: NearMiss(),
        "OSS": lambda: OneSidedSelection(),
        "RENN": lambda: RepeatedEditedNearestNeighbours(),
        "SMOTE": lambda: SMOTE(),
        "BSMOTE": lambda: BorderlineSMOTE(),
        "SMOTEENN": lambda: SMOTEENN(),
        "SMOTETOMEK": lambda: SMOTETomek(),
        "TOMEK": lambda: TomekLinks(),
        "ROS": lambda: RandomOverSampler(),
        "RUS": lambda: RandomUnderSampler(),
    }
    if imb_technique not in sampler_factories:
        # Fail with a clear message instead of an unbound-variable error at
        # the return statement.
        raise ValueError(
            "Unknown resampling technique {!r}; expected one of {}".format(
                imb_technique, sorted(sampler_factories)))
    make_sampler = sampler_factories[imb_technique]

    if imb_technique == "NCR":
        # For NCR the string labels are mapped to 0 (positive class name)
        # and 1 (everything else) before resampling.
        AP_ova_y_train = [
            0 if i == "Add penalty" else 1 for i in AP_ova_y_train
        ]
        PM_ova_y_train = [0 if i == "Payment" else 1 for i in PM_ova_y_train]
        SC_ova_y_train = [
            0 if i == "Send for Credit Collection" else 1
            for i in SC_ova_y_train
        ]

    AP_X_res, AP_y_res = make_sampler().fit_resample(AP_ova_X_train,
                                                     AP_ova_y_train)
    PM_X_res, PM_y_res = make_sampler().fit_resample(PM_ova_X_train,
                                                     PM_ova_y_train)
    SC_X_res, SC_y_res = make_sampler().fit_resample(SC_ova_X_train,
                                                     SC_ova_y_train)
    return AP_X_res, AP_y_res, PM_X_res, PM_y_res, SC_X_res, SC_y_res
            X_best_train = f_classif_select.fit_transform(X_train, y_train)
            X_best_test = f_classif_select.fit_transform(X_test, y_test)

            knn.fit(X_best_train, y_train)
            y_pred = knn.predict(X_best_test)
            scores[i].append(metric(y_test, y_pred))

    for dataset_score in scores:
        print(np.mean(dataset_score))

if not SKIP_CNN:
    # Evaluate k-NN on every dataset after resampling it with `cnn`.
    # One list of per-fold scores is kept per dataset.
    scores = [[] for _ in range(len(datasets))]

    for i, dataset in enumerate(datasets):
        X, y = dataset
        # NOTE(review): resampling happens before the cross-validation split,
        # so the test folds are drawn from resampled data - confirm this is
        # the intended protocol.
        X, y = cnn.fit_resample(X, y)
        for train_index, test_index in rskf.split(X, y):
            X_train, X_test = X[train_index], X[test_index]
            y_train, y_test = y[train_index], y[test_index]

            f_classif_select = SelectKBest(k=K_BEST)
            X_best_train = f_classif_select.fit_transform(X_train, y_train)
            # Apply the selector fitted on the training fold. Refitting on
            # the test fold would use the test labels and could select
            # different columns than the ones the classifier is trained on.
            X_best_test = f_classif_select.transform(X_test)

            knn.fit(X_best_train, y_train)
            y_pred = knn.predict(X_best_test)
            scores[i].append(metric(y_test, y_pred))

    # Report the mean score per dataset.
    for dataset_score in scores:
        print(np.mean(dataset_score))
Exemple #9
0
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
    X_res, y_res = ros.fit_resample(X_train, y_train)
    train_and_measure(clf, 'ros', X_res, y_res, X_test, y_test)

# SMOTE + ENN: N independent runs, each with a fresh model from get_model(),
# a fresh random split with 20% held out for testing, and SMOTEENN applied
# to the training part.
print('SMOTEENN')
for i in range(N):
    clf = get_model()
    smnn = SMOTEENN()
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
    # Only the training split is resampled; the test split is passed on as is.
    X_res, y_res = smnn.fit_resample(X_train, y_train)
    train_and_measure(clf, 'smoteenn', X_res, y_res, X_test, y_test)

# CNN: same protocol, undersampling the training split with
# CondensedNearestNeighbour.
print('CNN')
for i in range(N):
    clf = get_model()
    cnn = CondensedNearestNeighbour()
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
    # Only the training split is resampled; the test split is passed on as is.
    X_res, y_res = cnn.fit_resample(X_train, y_train)
    train_and_measure(clf, 'cnn', X_res, y_res, X_test, y_test)

# RUS: same protocol, undersampling the training split with
# RandomUnderSampler.
print('RUS')
for i in range(N):
    clf = get_model()
    rus = RandomUnderSampler()
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
    # Only the training split is resampled; the test split is passed on as is.
    X_res, y_res = rus.fit_resample(X_train, y_train)
    train_and_measure(clf, 'rus', X_res, y_res, X_test, y_test)
Exemple #10
0
plotCounts()

# Unfortunately, the data is clearly imbalanced: about 84% of the asteroids
# are not hazardous and about 16% are hazardous.

# HANDLING IMBALANCED DATA
# To handle the imbalance we will try and compare multiple techniques and
# algorithms.

# From the graph above we can see that all hazardous data points are
# concentrated in a small region. Oversampling does not look like a good idea
# here; instead we can use undersampling or try cost-sensitive down-weighting.

# CondensedNearestNeighbour technique for undersampling
from imblearn.under_sampling import CondensedNearestNeighbour
cnn = CondensedNearestNeighbour(n_neighbors=5, n_seeds_S=55)
cnn_X, cnn_y = cnn.fit_resample(X, y)
plotData2D(cnn_X, cnn_y)

# CondensedNearestNeighbour removes too many instances, so we will not use it.

# NeighbourhoodCleaningRule technique for undersampling
from imblearn.under_sampling import NeighbourhoodCleaningRule
ncr = NeighbourhoodCleaningRule(sampling_strategy='majority',
                                n_neighbors=5,
                                kind_sel='mode')
ncr_X, ncr_y = ncr.fit_resample(X, y)
plotData2D(ncr_X, ncr_y)

# NeighbourhoodCleaningRule also doesn't work well for this dataset: it
# removes many data points in just one region, which would distort the
# decision boundary and make our predictions worse.
Exemple #11
0
def all_imblearn(xx, yy):
    """Resample ``(xx, yy)`` with a fixed series of imbalanced-learn samplers.

    Each sampler is constructed, fitted and applied in turn.

    Parameters
    ----------
    xx, yy
        Data and labels passed unchanged to every sampler's ``fit_resample``.

    Returns
    -------
    list
        One ``[X_resampled, y_resampled, label]`` entry per sampler, in the
        order the samplers were run.
    """
    collected = []

    def resample_with(sampler, label):
        # Apply one sampler and record its output together with its label.
        X_resampled, y_resampled = sampler.fit_resample(xx, yy)
        collected.append([X_resampled, y_resampled, label])

    # ---- OVER SAMPLING ----
    resample_with(RandomOverSampler(random_state=0), 'random over sampler')
    resample_with(SMOTE(), 'smote')
    # NOTE(review): the `kind` argument of SMOTE may not be accepted by every
    # imbalanced-learn release - confirm against the installed version.
    resample_with(SMOTE(kind='borderline1'), 'smote borderline1')
    resample_with(SMOTE(kind='borderline2'), 'smote borderline2')
    resample_with(SMOTE(kind='svm'), 'smote svm')
    resample_with(SMOTENC(categorical_features=[0, 2], random_state=0),
                  'smotenc')
    # ADASYN was disabled in the original code and is still not run here.

    # ---- UNDER SAMPLING ----
    resample_with(ClusterCentroids(random_state=0), 'cluster centroids')
    resample_with(RandomUnderSampler(random_state=0), 'random under sampler')
    resample_with(NearMiss(version=1), 'near miss 1')
    resample_with(NearMiss(version=2), 'near miss 2')
    resample_with(NearMiss(version=3), 'near miss 3')
    resample_with(EditedNearestNeighbours(), 'edited nearest neighbours')
    resample_with(RepeatedEditedNearestNeighbours(),
                  'repeated edited nearest neighbours')
    resample_with(AllKNN(), 'allKNN')
    resample_with(CondensedNearestNeighbour(random_state=0),
                  'Condensed Nearest Neighbour')
    resample_with(OneSidedSelection(random_state=0), 'One Sided Selection')
    resample_with(NeighbourhoodCleaningRule(), 'Neighbourhood Cleaning Rule')

    # ---- OVER AND UNDER SAMPLING ----
    resample_with(SMOTEENN(random_state=0), 'SMOTEENN')
    resample_with(SMOTETomek(random_state=0), 'SMOTETomek')

    return collected
    
    
    
    
    
    
    
def test_cnn_fit_resample_with_wrong_object():
    """Check that a string passed as ``n_neighbors`` raises ``ValueError``.

    The error is expected from ``fit_resample`` and its message must match
    ``"has to be a int or an "``.
    """
    sampler = CondensedNearestNeighbour(random_state=RND_SEED,
                                        n_neighbors='rnd')
    with raises(ValueError, match="has to be a int or an "):
        sampler.fit_resample(X, Y)
Exemple #13
0
# undersample and plot imbalanced dataset with the Condensed Nearest Neighbor Rule
from collections import Counter
from sklearn.datasets import make_classification
from imblearn.under_sampling import CondensedNearestNeighbour
from matplotlib import pyplot
from numpy import where
# Build a synthetic two-feature classification dataset (weights=[0.99]).
X, y = make_classification(
    n_samples=10000,
    n_features=2,
    n_redundant=0,
    n_clusters_per_class=1,
    weights=[0.99],
    flip_y=0,
    random_state=1,
)
# Class distribution before undersampling.
counter = Counter(y)
print(counter)
# Undersample with the Condensed Nearest Neighbour rule (1 neighbour).
undersample = CondensedNearestNeighbour(n_neighbors=1)
X, y = undersample.fit_resample(X, y)
# Class distribution after undersampling.
counter = Counter(y)
print(counter)
# One scatter series per class label present after resampling.
for label in counter:
    row_ix = where(y == label)[0]
    class_points = X[row_ix]
    pyplot.scatter(class_points[:, 0], class_points[:, 1], label=str(label))
pyplot.legend()
pyplot.show()
    def fit(self, X, y):
        """Fit every ensemble member on its own, optionally resampled, bag.

        ``ensemble_size`` clones of ``base_estimator`` are appended to
        ``self.estimators_``. For each estimator a bag is drawn with
        replacement separately from the samples labelled 1 and the samples
        labelled 0, keeping each group's size. Depending on
        ``self.oversampler`` (``"ROS"``, ``"B2"``, ``"RUS"`` or ``"CNN"``)
        the bag is then resampled; any other value leaves it unchanged.

        Parameters
        ----------
        X, y
            Training data and labels, validated with ``check_X_y``. The
            bagging logic selects rows with ``y == 1`` and ``y == 0``.

        Returns
        -------
        self
        """
        X, y = check_X_y(X, y)
        self.classes_ = unique_labels(y)

        self.X_ = X
        self.y_ = y

        minority_X = X[y == 1]
        majority_X = X[y == 0]
        n_minority = minority_X.shape[0]
        n_majority = majority_X.shape[0]

        # NOTE(review): self.estimators_ is extended, not reset, so it must
        # already exist - presumably created in __init__; confirm. Calling
        # fit twice would keep the earlier clones.
        for _ in range(self.ensemble_size):
            self.estimators_.append(base.clone(self.base_estimator))

        for n, estimator in enumerate(self.estimators_):
            # Each estimator gets its own seed, shared by the bag draw and
            # the resampler.
            seed = self.random_state + (n * 2)
            np.random.seed(seed)
            bagXminority = minority_X[np.random.choice(
                n_minority, n_minority, replace=True), :]
            bagXmajority = majority_X[np.random.choice(
                n_majority, n_majority, replace=True), :]

            train_X = np.concatenate((bagXmajority, bagXminority))
            train_y = np.concatenate((
                np.zeros(n_majority).astype('int'),
                np.ones(n_minority).astype('int'),
            ))

            if self.oversampler == "ROS":
                sampler = RandomOverSampler(random_state=seed)
            elif self.oversampler == "B2":
                sampler = BorderlineSMOTE(random_state=seed,
                                          kind='borderline-2')
            elif self.oversampler == "RUS":
                sampler = RandomUnderSampler(random_state=seed)
            elif self.oversampler == "CNN":
                sampler = CondensedNearestNeighbour(random_state=seed)
            else:
                sampler = None

            if sampler is not None:
                try:
                    train_X, train_y = sampler.fit_resample(train_X, train_y)
                except Exception:
                    # Best effort: if resampling fails, train on the plain
                    # bag. Exception (not a bare except) lets
                    # KeyboardInterrupt/SystemExit propagate.
                    pass

            estimator.fit(train_X, train_y)

        # Return the classifier
        return self
Exemple #15
0
    def partial_fit(self, X, y, classes=None):
        """Train a new candidate on this chunk and add it to the ensemble.

        The chunk is stored as ``X_``/``y_``. Depending on
        ``self.oversampled`` (``"None"``, ``"ROS"``, ``"B2"``, ``"RUS"`` or
        ``"CNN"``) an optionally resampled copy is stored as
        ``dsel_X_``/``dsel_y_``; if resampling fails the raw chunk is used.
        A clone of ``base_estimator`` is fitted on the raw chunk and
        appended to ``ensemble_``, and its ``estimators_`` are appended to
        ``ensemble_base_``. When the ensemble exceeds ``n_estimators``, the
        member with the lowest ``self.metric`` on the current chunk is
        removed together with its base estimators.

        Parameters
        ----------
        X, y
            Data chunk and labels, validated with ``check_X_y``.
        classes : array-like, optional
            Known class labels; when ``None`` they are taken from ``y``.

        Returns
        -------
        self

        Raises
        ------
        ValueError
            If the number of features differs from the previous chunk.
        """
        X, y = check_X_y(X, y)
        if not hasattr(self, "ensemble_"):
            self.ensemble_ = []
            self.ensemble_base_ = []

        # Check feature consistency
        if hasattr(self, "X_") and self.X_.shape[1] != X.shape[1]:
            raise ValueError("number of features does not match")

        self.X_, self.y_ = X, y

        if self.oversampled == "ROS":
            sampler = RandomOverSampler(random_state=42)
        elif self.oversampled == "B2":
            sampler = BorderlineSMOTE(random_state=42, kind='borderline-2')
        elif self.oversampled == "RUS":
            sampler = RandomUnderSampler(random_state=42)
        elif self.oversampled == "CNN":
            sampler = CondensedNearestNeighbour(random_state=42)
        else:
            sampler = None

        if sampler is not None:
            try:
                self.dsel_X_, self.dsel_y_ = sampler.fit_resample(
                    self.X_, self.y_)
            except Exception:
                # Best effort: fall back to the raw chunk. Exception (not a
                # bare except) lets KeyboardInterrupt/SystemExit propagate.
                self.dsel_X_, self.dsel_y_ = self.X_, self.y_
        elif self.oversampled == "None":
            self.dsel_X_, self.dsel_y_ = self.X_, self.y_
        # Any other value leaves dsel_X_/dsel_y_ untouched, as before.

        # Check classes
        self.classes_ = classes
        if self.classes_ is None:
            self.classes_ = np.unique(y)

        # Append new estimator
        self.candidate_ = clone(self.base_estimator).fit(self.X_, self.y_)
        self.ensemble_.append(self.candidate_)
        self.ensemble_base_.extend(self.candidate_.estimators_)

        # Remove the worst when ensemble becomes too large
        if len(self.ensemble_) > self.n_estimators:
            self.prune_index_ = np.argmin(
                [self.metric(y, clf.predict(X)) for clf in self.ensemble_])
            # ensemble_base_ is the concatenation of every member's
            # estimators_, so the pruned member's slice starts after the
            # base estimators of all members before it. Using the actual
            # lengths (instead of a fixed 10 per member) keeps both lists
            # aligned for any base-ensemble size.
            start = sum(
                len(member.estimators_)
                for member in self.ensemble_[:self.prune_index_])
            stop = start + len(self.ensemble_[self.prune_index_].estimators_)
            del self.ensemble_[self.prune_index_]
            del self.ensemble_base_[start:stop]

        return self
Exemple #16
0
def sample_dataset(X_train, y_train):
    """Undersample the training data with ``CondensedNearestNeighbour``.

    The sampler is seeded with the ``seed`` name from the enclosing scope.
    The shapes of the original and the resampled feature arrays are printed.

    Returns
    -------
    tuple
        ``(X_resampled, y_resampled)``.
    """
    resampled = CondensedNearestNeighbour(random_state=seed).fit_resample(
        X_train, y_train)
    X_resampled, y_resampled = resampled
    report = "Shape del dataset original: {}. Shape del dataset procesado{} "
    print(report.format(X_train.shape, X_resampled.shape))
    return X_resampled, y_resampled
# Load the train/test data for the configured patch size.
# NOTE(review): `input` is presumably a project module; the name shadows the
# builtin input() - confirm.
X_train, y_train, X_test, y_test = input.read_train_test_data(
    config['patch_size'], conv3d=True)
#X_test, y_test, X_train, y_train = input.read_data(config['patch_size'])

# Optionally hold out half of the training data as a stratified validation set.
if validation_set:
    X_train, X_val, y_train, y_val = \
        train_test_split(X_train, y_train, test_size=0.5, random_state=42, stratify=y_train)

if undersampling:
    # Re-read the data with patch_size=1 and flatten it to one row per sample
    # with `input.bands` columns, so the sampler can work on it.
    X, y, _, _ = input.read_data(patch_size=1)
    X_reshaped = X.reshape(X.shape[0], input.bands)
    print("Elements before undersampling: %i" % len(X))
    print(sorted(Counter(y).items()))
    # Despite its name, `enn` holds a CondensedNearestNeighbour sampler.
    enn = CondensedNearestNeighbour(n_jobs=8)
    # The resampled arrays are discarded; only the selected indices are used.
    # NOTE(review): X_reshaped comes from read_data() above but is paired with
    # y_train from the earlier load/split rather than with `y` - confirm that
    # both have the same length and ordering.
    enn.fit_resample(X_reshaped, y_train)
    print("Elements after undersampling: %i" % len(enn.sample_indices_))
    # Samples NOT selected by the sampler become the new test set; this
    # overwrites the X_test/y_test loaded at the top.
    X_test, y_test = np.delete(X_train, enn.sample_indices_,
                               axis=0), np.delete(y_train,
                                                  enn.sample_indices_,
                                                  axis=0)
    # Samples selected by the sampler form the new training set.
    X_train, y_train = np.take(X_train, enn.sample_indices_,
                               axis=0), np.take(y_train,
                                                enn.sample_indices_,
                                                axis=0)
    print(sorted(Counter(y_train).items()))

# Optionally oversample the (possibly undersampled) training set.
if oversampling:
    X_train, y_train = input.oversample_data(X_train, y_train,
                                             config['patch_size'])
Exemple #18
0
def resampling_assigner(imb_technique, AA_ova_X_train, AA_ova_y_train,
                        AI_ova_X_train, AI_ova_y_train, AW_ova_X_train,
                        AW_ova_y_train, CC_ova_X_train, CC_ova_y_train,
                        QA_ova_X_train, QA_ova_y_train):
    """Resample five one-vs-all training sets with the named technique.

    A fresh, default-configured sampler is created for each of the five
    (X, y) pairs (AA, AI, AW, CC, QA) and ``fit_resample`` is called on it.

    Args:
        imb_technique: One of "ADASYN", "ALLKNN", "CNN", "ENN", "IHT", "NCR",
            "NM", "OSS", "RENN", "SMOTE", "BSMOTE", "SMOTEENN", "SMOTETOMEK",
            "TOMEK", "ROS" or "RUS". The value is printed before resampling.
        AA_ova_X_train, AA_ova_y_train: Features / labels of the
            "Accepted/Assigned" set.
        AI_ova_X_train, AI_ova_y_train: Features / labels of the
            "Accepted/In Progress" set.
        AW_ova_X_train, AW_ova_y_train: Features / labels of the
            "Accepted/Wait" set.
        CC_ova_X_train, CC_ova_y_train: Features / labels of the
            "Completed/Closed" set.
        QA_ova_X_train, QA_ova_y_train: Features / labels of the
            "Queued/Awaiting Assignment" set.

    Returns:
        A 10-tuple ``(AA_X_res, AA_y_res, AI_X_res, AI_y_res, AW_X_res,
        AW_y_res, CC_X_res, CC_y_res, QA_X_res, QA_y_res)``.

    Raises:
        ValueError: If ``imb_technique`` is not one of the supported names.
    """
    print(imb_technique)

    # Factories are lambdas on purpose: a sampler class name is only looked up
    # when its technique is actually selected, exactly as in an if/elif chain.
    sampler_factories = {
        "ADASYN": lambda: ADASYN(),
        "ALLKNN": lambda: AllKNN(),
        "CNN": lambda: CondensedNearestNeighbour(),
        "ENN": lambda: EditedNearestNeighbours(),
        "IHT": lambda: InstanceHardnessThreshold(),
        "NCR": lambda: NeighbourhoodCleaningRule(),
        "NM": lambda: NearMiss(),
        "OSS": lambda: OneSidedSelection(),
        "RENN": lambda: RepeatedEditedNearestNeighbours(),
        "SMOTE": lambda: SMOTE(),
        "BSMOTE": lambda: BorderlineSMOTE(),
        "SMOTEENN": lambda: SMOTEENN(),
        "SMOTETOMEK": lambda: SMOTETomek(),
        "TOMEK": lambda: TomekLinks(),
        "ROS": lambda: RandomOverSampler(),
        "RUS": lambda: RandomUnderSampler(),
    }
    make_sampler = sampler_factories.get(imb_technique)
    if make_sampler is None:
        # Fail with a clear message instead of an UnboundLocalError at return.
        raise ValueError(
            "Unknown imb_technique {!r}; expected one of: {}".format(
                imb_technique, ", ".join(sorted(sampler_factories))))

    # (positive class name, features, labels) for each one-vs-all problem,
    # in the order the results are returned.
    ova_problems = (
        ("Accepted/Assigned", AA_ova_X_train, AA_ova_y_train),
        ("Accepted/In Progress", AI_ova_X_train, AI_ova_y_train),
        ("Accepted/Wait", AW_ova_X_train, AW_ova_y_train),
        ("Completed/Closed", CC_ova_X_train, CC_ova_y_train),
        ("Queued/Awaiting Assignment", QA_ova_X_train, QA_ova_y_train),
    )

    resampled = []
    for positive_label, X_train, y_train in ova_problems:
        if imb_technique == "NCR":
            # For NCR only, the labels are recoded to integers before
            # resampling: 0 for the positive class, 1 for everything else.
            y_train = [0 if label == positive_label else 1
                       for label in y_train]
        X_res, y_res = make_sampler().fit_resample(X_train, y_train)
        resampled.append(X_res)
        resampled.append(y_res)
    return tuple(resampled)
Exemple #19
0
def condensed_nearest_neighbour(X, y):
    """Under-sample ``(X, y)`` with Condensed Nearest Neighbour.

    The sampler is created with ``random_state=42``.

    Returns:
        The resampled ``(X, y)`` pair.
    """
    resampled_X, resampled_y = CondensedNearestNeighbour(
        random_state=42).fit_resample(X, y)
    return resampled_X, resampled_y
Exemple #20
0
# Alternative datasets, currently disabled. X_data / y_data used below must
# therefore already be defined earlier in the script.

# # # 8 ---------- Glass Identification
# dataset = pd.read_csv('data/glass.txt')
# X_data = dataset.iloc[:, 1:9].values
# y_data = dataset.iloc[:, 10].values

# ---------- ABALONE -----
# NOTE(review): `0:` selects every column, including column 8 that is also
# used as y_data - confirm this is intended before re-enabling.
# dataset = pd.read_csv('data/abalone.txt')
# X_data = dataset.iloc[:, 0:].values
# y_data = dataset.iloc[:, 8].values

# Shape of the feature matrix before any resampling, for comparison with the
# shapes printed after each under-sampler below.
print(X_data.shape)
print('-------')

# Each section fits one default-configured under-sampler on the same
# (X_data, y_data) and prints the shape of the resampled feature matrix.

# ------- CNN (Condensed Nearest Neighbour) --------
cnn = CondensedNearestNeighbour()
X_cnn, y_cnn = cnn.fit_resample(X_data, y_data)
print(X_cnn.shape)

# ------- ENN (Edited Nearest Neighbours) --------
enn = EditedNearestNeighbours()
X_enn, y_enn = enn.fit_resample(X_data, y_data)
print(X_enn.shape)

# ------- RENN (Repeated Edited Nearest Neighbours) --------
renn = RepeatedEditedNearestNeighbours()
X_renn, y_renn = renn.fit_resample(X_data, y_data)
print(X_renn.shape)

# ------- Tomek links --------
tl = TomekLinks()
X_t, y_t = tl.fit_resample(X_data, y_data)
                           n_informative=3,
                           n_redundant=1,
                           flip_y=0,
                           n_features=20,
                           n_clusters_per_class=1,
                           n_samples=200,
                           random_state=10)

# Instantiate a PCA object for the sake of easy visualisation
pca = PCA(n_components=2)
# Fit and transform X to visualise inside a 2D feature space
X_vis = pca.fit_transform(X)

# Apply Condensed Nearest Neighbours.
# The `return_indices` constructor argument was deprecated in imbalanced-learn
# 0.4 and later removed; the indices of the selected samples are read from the
# fitted `sample_indices_` attribute instead.
cnn = CondensedNearestNeighbour()
X_resampled, y_resampled = cnn.fit_resample(X, y)
idx_resampled = cnn.sample_indices_
X_res_vis = pca.transform(X_resampled)

fig = plt.figure()
ax = fig.add_subplot(1, 1, 1)

# Indices of the original samples that the under-sampler did not keep
idx_samples_removed = np.setdiff1d(np.arange(X_vis.shape[0]), idx_resampled)

# Plot the kept samples of class 0 in the 2D PCA space
idx_class_0 = y_resampled == 0
plt.scatter(X_res_vis[idx_class_0, 0],
            X_res_vis[idx_class_0, 1],
            alpha=.8,
            label='Class #0')
plt.scatter(X_res_vis[~idx_class_0, 0],
            X_res_vis[~idx_class_0, 1],
            alpha=.8,
Exemple #22
0
def sample_data_by_cnn(X, y):
    """Under-sample ``(X, y)`` with Condensed Nearest Neighbour.

    The sampler is created with ``random_state=42``; whatever
    ``fit_resample`` returns is handed back unchanged.
    """
    undersampler = CondensedNearestNeighbour(random_state=42)
    resampled = undersampler.fit_resample(X, y)
    return resampled
def get_uds_CNN(data_list, label):
    cnn = CondensedNearestNeighbour(random_state=42)
    X_res, y_res = cnn.fit_resample(data_list, label)