Esempio n. 1
0
def test_validate_estimator_init():
    """Check SMOTEENN output when explicit SMOTE and ENN objects are given."""
    over_sampler = SMOTE(random_state=RND_SEED)
    cleaner = EditedNearestNeighbours(
        random_state=RND_SEED, sampling_strategy='all')
    combined = SMOTEENN(
        smote=over_sampler, enn=cleaner, random_state=RND_SEED)
    X_resampled, y_resampled = combined.fit_sample(X, Y)

    # Reference samples and labels the resampled data is compared against.
    expected_X = np.array([
        [1.52091956, -0.49283504],
        [0.84976473, -0.15570176],
        [0.61319159, -0.11571667],
        [0.66052536, -0.28246518],
        [-0.28162401, -2.10400981],
        [0.83680821, 1.72827342],
        [0.08711622, 0.93259929],
    ])
    expected_y = np.array([0, 0, 0, 0, 1, 1, 1])
    assert_allclose(X_resampled, expected_X, rtol=R_TOL)
    assert_array_equal(y_resampled, expected_y)
Esempio n. 2
0
 def ENN(df, debug=True):
     """Resample a data frame with EditedNearestNeighbours.

     Every column except the last is used as a feature; the last column is
     cast to ``int`` and used as the target. The returned frame has the same
     column names, with the target again in last position. When ``debug`` is
     true the class counts before and after resampling are printed.
     """
     values = df.values
     features = values[:, :-1]
     target = values[:, -1].astype(int)
     if debug:
         print('ENN: Original dataset shape %s' % Counter(target))

     sampler = EditedNearestNeighbours(sampling_strategy="auto")
     features_res, target_res = sampler.fit_resample(features, target)

     # Rebuild the frame: features first, then the target as last column.
     result = pd.DataFrame(features_res, columns=df.columns[:-1])
     result.insert(len(result.columns), df.columns[-1], target_res)
     if debug:
         print('ENN: Resampled dataset shape %s' % Counter(target_res))
     return result
    def resample(x, y, sampling_type=None):
        """Resample ``(x, y)`` with the sampler named by ``sampling_type``.

        ``"smoteenn"`` selects SMOTEENN and ``"enn"`` selects
        EditedNearestNeighbours (both with ``random_state=1``); any other
        value returns the inputs unchanged. The sorted class counts before
        and after are printed in every case.
        """
        if sampling_type == "smoteenn":
            sampler = SMOTEENN(random_state=1)
        elif sampling_type == "enn":
            sampler = EditedNearestNeighbours(random_state=1)
        else:
            sampler = None

        if sampler is None:
            x_out, y_out = x, y
        else:
            x_out, y_out = sampler.fit_sample(x, y)

        print("Before resampling:", sorted(Counter(y).items()))
        print("After resampling:", sorted(Counter(y_out).items()))
        return x_out, y_out
Esempio n. 4
0
def test_enn_fit_sample_with_indices():
    """Check ENN resampling output together with the returned indices."""
    sampler = EditedNearestNeighbours(
        return_indices=True, random_state=RND_SEED)
    X_resampled, y_resampled, idx_under = sampler.fit_sample(X, Y)

    expected_X = np.array([
        [-0.10903849, -0.12085181],
        [0.01936241, 0.17799828],
        [2.59928271, 0.93323465],
        [1.92365863, 0.82718767],
        [0.25738379, 0.95564169],
        [0.78318102, 2.59153329],
        [0.52726792, -0.38735648],
    ])
    expected_y = np.array([0, 0, 1, 1, 2, 2, 2])
    expected_idx = np.array([4, 11, 0, 3, 1, 8, 15])

    assert_array_equal(X_resampled, expected_X)
    assert_array_equal(y_resampled, expected_y)
    assert_array_equal(idx_under, expected_idx)
Esempio n. 5
0
def edited_nearest_neighbour(X,
                             y,
                             visualize=False,
                             pca2d=True,
                             pca3d=True,
                             tsne=True,
                             pie_evr=True):
    """Under-sample ``(X, y)`` with EditedNearestNeighbours.

    Args:
        X: Feature matrix passed to ``fit_resample``.
        y: Target labels passed to ``fit_resample``.
        visualize: When truthy, the resampled data is also handed to
            ``hist_over_and_undersampling`` and ``pca_general``.
        pca2d: Forwarded to ``pca_general`` as ``d2``.
        pca3d: Forwarded to ``pca_general`` as ``d3``.
        tsne: Accepted for interface compatibility; not used by this function.
        pie_evr: Forwarded to ``pca_general`` as ``pie_evr``.

    Returns:
        The ``(X_res, y_res)`` pair produced by the sampler.
    """
    X_res, y_res = EditedNearestNeighbours().fit_resample(X, y)
    # Truthiness test instead of ``== True`` so any truthy flag enables plots.
    if visualize:
        hist_over_and_undersampling(y_res)
        pca_general(X_res, y_res, d2=pca2d, d3=pca3d, pie_evr=pie_evr)
    return X_res, y_res
Esempio n. 6
0
def get_under_sample_models():
    """Return parallel lists of under-sampler instances and their labels."""
    # NOTE(review): the 'TomesLinks' label looks like a typo for 'TomekLinks';
    # it is kept as is because callers may rely on the exact string.
    catalogue = [
        (TomekLinks(), 'TomesLinks'),
        (EditedNearestNeighbours(), 'EditedNearestNeighbors'),
        (RepeatedEditedNearestNeighbours(), 'RENN'),
        (OneSidedSelection(), 'OneSidedSelection'),
        (NeighbourhoodCleaningRule(), 'NCR'),
    ]
    models = [sampler for sampler, _ in catalogue]
    names = [label for _, label in catalogue]
    return models, names
def smote_en_resampling(data_X, data_y, k_neighbors=5, n_jobs=8):
    """Over- then under-sample the data with SMOTE followed by ENN.

    Args:
        data_X: Feature matrix.
        data_y: Target labels.
        k_neighbors: Neighbourhood size used by both SMOTE (``k_neighbors``)
            and EditedNearestNeighbours (``n_neighbors``).
        n_jobs: Number of parallel jobs given to SMOTE, ENN and SMOTEENN.
            Defaults to 8, the value that used to be hard-coded.

    Returns:
        Tuple ``(resamp_X, resamp_y)`` with the resampled data.
    """
    smote = SMOTE(sampling_strategy='minority',
                  k_neighbors=k_neighbors,
                  n_jobs=n_jobs)
    enn = EditedNearestNeighbours(n_neighbors=k_neighbors, n_jobs=n_jobs)
    smoteen = SMOTEENN(sampling_strategy="minority",
                       smote=smote,
                       enn=enn,
                       n_jobs=n_jobs)
    # ``fit_resample`` is available in every release that accepts
    # ``sampling_strategy``; the older ``fit_sample`` alias was removed later.
    resamp_X, resamp_y = smoteen.fit_resample(data_X, data_y)

    return resamp_X, resamp_y
def test_enn_fit():
    """Check the attributes that ``fit`` sets on the sampler."""
    sampler = EditedNearestNeighbours(random_state=RND_SEED)
    sampler.fit(X, Y)

    # Class information computed while fitting.
    assert_equal(sampler.min_c_, 0)
    assert_equal(sampler.maj_c_, 1)
    for label, expected_count in ((0, 500), (1, 4500)):
        assert_equal(sampler.stats_c_[label], expected_count)
def test_enn_fit_sample_with_indices():
    """Compare ENN output and indices with the arrays stored under data/."""
    sampler = EditedNearestNeighbours(
        return_indices=True, random_state=RND_SEED)
    X_resampled, y_resampled, idx_under = sampler.fit_sample(X, Y)

    # Reference arrays live in the ``data`` directory next to this file.
    here = os.path.dirname(os.path.abspath(__file__))
    expected_X, expected_y, expected_idx = [
        np.load(os.path.join(here, 'data', file_name))
        for file_name in ('enn_x.npy', 'enn_y.npy', 'enn_idx.npy')
    ]

    assert_array_equal(X_resampled, expected_X)
    assert_array_equal(y_resampled, expected_y)
    assert_array_equal(idx_under, expected_idx)
def readFile(path,
             y_label,
             method,
             encode_features=None,
             skew_exempted=None,
             training_ratio=0.7,
             shuffle=True,
             needSkew=False,
             fea_eng=True):
    """Load a CSV file, preprocess it, split it and rebalance the training set.

    Args:
        path: Location of the CSV file handed to ``pd.read_csv``.
        y_label: Name of the target column.
        method: ``'OverSample'`` rebalances the training split with ADASYN,
            ``'UnderSample'`` with EditedNearestNeighbours; any other value
            leaves the training split untouched.
        encode_features: Columns to one-hot encode with ``pd.get_dummies``.
            ``None`` (the default) means no columns.
        skew_exempted: Columns excluded from the skewness correction.
            ``None`` (the default) means no columns.
        training_ratio: Forwarded to ``split`` as its third argument.
        shuffle: When true, rows are shuffled before any other processing.
        needSkew: When true, non-object columns whose skewness exceeds 0.75
            are transformed with ``np.log1p``.
        fea_eng: Accepted for interface compatibility; currently unused.

    Returns:
        Tuple ``(X_train, X_test, y_train, y_test)``.
    """
    # Fresh lists per call instead of shared mutable default arguments.
    encode_features = [] if encode_features is None else encode_features
    skew_exempted = [] if skew_exempted is None else skew_exempted

    raw = pd.read_csv(path)

    if shuffle:
        raw = raw.sample(frac=1).reset_index(drop=True)

    if needSkew:
        candidates = raw.dtypes[raw.dtypes != "object"].index.drop(
            skew_exempted)
        skewness = raw[candidates].apply(lambda col: skew(col.dropna()))
        skewed = skewness[skewness > 0.75].index
        raw[skewed] = np.log1p(raw[skewed])  # reduce skewness

    # Encode categorical features, then fill gaps with the column means.
    raw = pd.get_dummies(raw, columns=encode_features)
    raw = raw.fillna(raw.mean())

    X = raw.drop(y_label, axis=1)
    y = raw[y_label]
    X_train, X_test, y_train, y_test = split(X, y, training_ratio)

    # Only the training split is rebalanced; the test split stays as is.
    if method == 'OverSample':
        X_train, y_train = ADASYN(random_state=42).fit_resample(
            X_train, y_train)
    if method == 'UnderSample':
        X_train, y_train = EditedNearestNeighbours(
            random_state=42).fit_resample(X_train, y_train)

    return X_train, X_test, y_train, y_test
Esempio n. 11
0
def test_enn_fit_sample():
    """Check the samples and labels returned by ENN ``fit_sample``."""
    sampler = EditedNearestNeighbours(random_state=RND_SEED)
    X_resampled, y_resampled = sampler.fit_sample(X, Y)

    expected_X = np.array([
        [-0.10903849, -0.12085181],
        [0.01936241, 0.17799828],
        [2.59928271, 0.93323465],
        [1.92365863, 0.82718767],
        [0.25738379, 0.95564169],
        [0.78318102, 2.59153329],
        [0.52726792, -0.38735648],
    ])
    expected_y = np.array([0, 0, 1, 1, 2, 2, 2])
    assert_array_equal(X_resampled, expected_X)
    assert_array_equal(y_resampled, expected_y)
Esempio n. 12
0
def test_sample_regular_pass_smote_enn():
    """Check SMOTEENN output when SMOTE and ENN objects are passed in."""
    over_sampler = SMOTE(ratio='auto', random_state=RND_SEED)
    cleaner = EditedNearestNeighbours(ratio='all', random_state=RND_SEED)
    combined = SMOTEENN(
        smote=over_sampler, enn=cleaner, random_state=RND_SEED)
    X_resampled, y_resampled = combined.fit_sample(X, Y)

    expected_X = np.array([
        [1.52091956, -0.49283504],
        [0.84976473, -0.15570176],
        [0.61319159, -0.11571667],
        [0.66052536, -0.28246518],
        [-0.28162401, -2.10400981],
        [0.83680821, 1.72827342],
        [0.08711622, 0.93259929],
    ])
    expected_y = np.array([0, 0, 0, 0, 1, 1, 1])
    assert_allclose(X_resampled, expected_X, rtol=R_TOL)
    assert_array_equal(y_resampled, expected_y)
	def enn(self, data):
		'''
		Apply edited nearest neighbours to remove samples whose neighbours
		mostly belong to other classes.

		data: DataFrame holding the columns listed in self.features plus a
		'label' column.

		Returns a new DataFrame with the resampled feature columns and a
		'label' column.
		'''
		# DataFrame.as_matrix() no longer exists in current pandas; selecting
		# the columns and reading .values gives the same arrays on old and
		# new versions alike.
		X = data[list(self.features)].values
		y = data[['label']].values.ravel()

		enn = EditedNearestNeighbours(ratio='all',kind_sel='mode',n_neighbors=5,random_state=42,n_jobs=4)
		X_res, y_res = enn.fit_sample(X, y)

		df_enn = pd.DataFrame(X_res, columns=self.features)
		df_enn['label'] = y_res
		return df_enn
Esempio n. 14
0
def samplingMethod(X_train, y_train, sampling="None"):
    """Resample sparse training data with the sampler named by ``sampling``.

    ``"SMOTE"``, ``"ENN"`` and ``"SMOTEENN"`` select the matching sampler
    (``random_state=42``, ``n_jobs=-1``). ``X_train`` is densified with
    ``toarray()`` for resampling and wrapped in ``csr_matrix`` again
    afterwards. Any other value returns the inputs untouched.
    """
    if sampling == "SMOTE":
        resampler = SMOTE(random_state=42, n_jobs=-1)
    elif sampling == "ENN":
        resampler = EditedNearestNeighbours(random_state=42, n_jobs=-1)
    elif sampling == "SMOTEENN":
        resampler = SMOTEENN(random_state=42, n_jobs=-1)
    else:
        return X_train, y_train

    dense_resampled, y_train = resampler.fit_sample(
        X_train.toarray(), y_train)
    return csr_matrix(dense_resampled), y_train
def test_enn_fit_sample_mode():
    """Check ENN resampling output with ``kind_sel='mode'``."""
    sampler = EditedNearestNeighbours(random_state=RND_SEED, kind_sel='mode')
    X_resampled, y_resampled = sampler.fit_sample(X, Y)

    expected_X = np.array([
        [-0.10903849, -0.12085181],
        [0.01936241, 0.17799828],
        [2.59928271, 0.93323465],
        [1.42772181, 0.526027],
        [1.92365863, 0.82718767],
        [0.25738379, 0.95564169],
        [-0.284881, -0.62730973],
        [0.57062627, 1.19528323],
        [0.78318102, 2.59153329],
        [0.35831463, 1.33483198],
        [-0.14313184, -1.0412815],
        [-0.09816301, -0.74662486],
        [0.52726792, -0.38735648],
        [0.2821046, -0.07862747],
    ])
    expected_y = np.array([0, 0, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2])
    assert_array_equal(X_resampled, expected_X)
    assert_array_equal(y_resampled, expected_y)
Esempio n. 16
0
    def resample(self, X, y, by, random_state=None):
        '''
        Resample (X, y) with the technique selected by `by`.

        by: String
            Name of the re-sampling method. Recognised values:
            ['RUS', 'CNN', 'ENN', 'NCR', 'Tomek', 'ALLKNN', 'OSS', 'NM',
             'CC', 'ROS', 'SMOTE', 'ADASYN', 'BorderSMOTE', 'SMOTEENN',
             'SMOTETomek', 'ORG']
            'ORG' returns the data unchanged.
        random_state:
            Passed as `random_state` to the selected sampler.
        '''
        # 'ORG' means "original data": nothing to build, nothing to fit.
        if by == 'ORG':
            return X, y

        available = (
            ('RUS', RandomUnderSampler),
            ('CNN', CondensedNearestNeighbour),
            ('ENN', EditedNearestNeighbours),
            ('NCR', NeighbourhoodCleaningRule),
            ('Tomek', TomekLinks),
            ('ALLKNN', AllKNN),
            ('OSS', OneSidedSelection),
            ('NM', NearMiss),
            ('CC', ClusterCentroids),
            ('ROS', RandomOverSampler),
            ('SMOTE', SMOTE),
            ('ADASYN', ADASYN),
            ('BorderSMOTE', BorderlineSMOTE),
            ('SMOTEENN', SMOTEENN),
            ('SMOTETomek', SMOTETomek),
        )
        sampler_cls = next(
            (cls for key, cls in available if by == key), None)
        if sampler_cls is None:
            # NOTE(review): `Error` is not defined in the visible code;
            # confirm it is importable where this method lives.
            raise Error('Unexpected \'by\' type {}'.format(by))

        sampler = sampler_cls(random_state=random_state)
        X_train, y_train = sampler.fit_resample(X, y)
        return X_train, y_train
def plot_data(X, Y):
    """Scatter-plot the data next to five resampled versions of it.

    The panels, left to right, show: the original data, SMOTE,
    Borderline-SMOTE (``kind='borderline-1'``), ENN followed by SMOTE,
    ADASYN, and ``dbscan_based.MultiDbscanBasedOverSample``. Only the first
    two feature columns are plotted, coloured by label. Axes are hidden and
    the figure is displayed with ``plt.show()``.

    Args:
        X: Feature array indexed as ``X[:, 0]`` and ``X[:, 1]``.
        Y: Labels used as scatter colours and as resampling target.
    """

    def enn_then_smote():
        # Clean with ENN first, then over-sample the cleaned data.
        X_clean, Y_clean = EditedNearestNeighbours().fit_sample(X, Y)
        return SMOTE(k_neighbors=5).fit_sample(X_clean, Y_clean)

    # One producer per panel, evaluated lazily and in display order.
    panels = [
        lambda: (X, Y),
        lambda: SMOTE().fit_sample(X, Y),
        lambda: BorderlineSMOTE(kind='borderline-1').fit_sample(X, Y),
        enn_then_smote,
        lambda: ADASYN(n_neighbors=3).fit_sample(X, Y),
        lambda: dbscan_based.MultiDbscanBasedOverSample(
            eps=0.3, min_pts=5).fit_sample(X, Y),
    ]

    plt.rcParams['figure.figsize'] = (27.0, 5.0)
    fig = plt.figure()
    # The grid has one column per panel so no two panels share a slot
    # (previously six panels were squeezed into five slots and slot 4 was
    # used twice).
    n_panels = len(panels)
    for position, produce in enumerate(panels, start=1):
        X_panel, Y_panel = produce()
        ax = fig.add_subplot(1, n_panels, position)
        ax.scatter(X_panel[:, 0], X_panel[:, 1], c=Y_panel)
        plt.axis('off')
        plt.xticks([])
        plt.yticks([])
    plt.show()
def test_enn_init():
    """Check the attribute values of a freshly constructed ENN object."""
    verbose = True
    sampler = EditedNearestNeighbours(random_state=RND_SEED, verbose=verbose)

    # Attribute name -> value expected right after construction.
    expected_attributes = (
        ('size_ngh', 3),
        ('kind_sel', 'all'),
        ('n_jobs', -1),
        ('random_state', RND_SEED),
        ('verbose', verbose),
        ('min_c_', None),
        ('maj_c_', None),
        ('stats_c_', {}),
    )
    for attribute, expected in expected_attributes:
        assert_equal(getattr(sampler, attribute), expected)
Esempio n. 19
0
def under_sample_data(matrix, y_train):
    """Under-sample with TomekLinks and then EditedNearestNeighbours.

    Both samplers use ``sampling_strategy='majority'`` and ``n_jobs=-1``.
    The class distribution is logged through ``add_to_log`` before the first
    step and after each step.

    Returns:
        Tuple ``(X_res, y_res)`` produced by the second sampler.
    """
    add_to_log('Under Sampling')
    add_to_log('Sample distribution %s' % Counter(y_train))

    # (log message, sampler class, random_state) applied one after another.
    steps = (
        ('TomekLinks distribution %s', TomekLinks, 11),
        ('EditedNearestNeighbours distribution %s',
         EditedNearestNeighbours, 7),
    )
    X_res, y_res = matrix, y_train
    for message, sampler_cls, seed in steps:
        sampler = sampler_cls(random_state=seed,
                              sampling_strategy='majority',
                              n_jobs=-1)
        X_res, y_res = sampler.fit_resample(X_res, y_res)
        add_to_log(message % Counter(y_res))
    return X_res, y_res
def equalize_training_dataset_with_EditedNN(x_train, y_train):
    """Under-sample multi-dimensional samples with EditedNearestNeighbours.

    Each sample is flattened to one row for resampling and reshaped back to
    its original trailing dimensions afterwards. The sorted class counts of
    the resampled labels are printed.

    Returns:
        Tuple ``(x_resampled, y_resampled)``.
    """
    from imblearn.under_sampling import EditedNearestNeighbours

    original_shape = x_train.shape
    # The sampler works on 2-D input: one flattened row per sample.
    flattened = np.reshape(x_train, (original_shape[0], -1))
    sampler = EditedNearestNeighbours(
        sampling_strategy='not minority', n_neighbors=5, n_jobs=8)
    x_resampled, y_resampled = sampler.fit_resample(flattened, y_train)
    print(sorted(Counter(y_resampled).items()))

    # Restore the per-sample shape; only the sample count has changed.
    restored_shape = (x_resampled.shape[0],) + tuple(original_shape[1:])
    x_resampled = np.reshape(x_resampled, restored_shape)

    return x_resampled, y_resampled
def final_model(X, y):
    """Fit a SMOTEENN + logistic-regression pipeline and print predictions.

    After fitting on ``(X, y)`` the pipeline predicts three hard-coded rows
    labelled as non-spill (class 0) and three labelled as spill (class 1),
    printing each predicted label next to the expected one.
    """
    resampler = SMOTEENN(
        enn=EditedNearestNeighbours(sampling_strategy='majority'))
    classifier = LogisticRegression(solver='liblinear')
    pipeline = imb_pipe(steps=[('e', resampler), ('m', classifier)])
    pipeline.fit(X, y)

    # Known class 0 rows.
    non_spill_rows = [
        [329, 1627.54, 1409.43, 51, 822500, 35, 6.1, 4610, 0.17, 178.4, 0.2, 0.24, 0.39, 0.12, 0.27, 138.32, 34.81,
         2.02, 0.14, 0.19, 75.26, 0.47, 351.67, 0.18, 9.24, 0.38, 2.57, -2.96, -0.28, 1.93, 0, 1.93, 34, 1710, 0,
         25.84, 78, 55, 1460.31, 710.63, 451.78, 150.85, 3.23, 0, 4530.75, 66.25, 7.85],
        [3234, 1091.56, 1357.96, 32, 8085000, 40.08, 8.98, 25450, 0.22, 317.7, 0.18, 0.2, 0.49, 0.09, 0.41, 114.69,
         41.87, 2.31, 0.15, 0.18, 75.26, 0.53, 351.67, 0.18, 9.24, 0.24, 3.56, -3.09, -0.31, 2.17, 0, 2.17, 281,
         14490, 0, 80.11, 78, 55, 4287.77, 3095.56, 1937.42, 773.69, 2.21, 0, 4927.51, 66.15, 7.24],
        [2339, 1537.68, 1633.02, 45, 5847500, 38.13, 9.29, 22110, 0.24, 264.5, 0.21, 0.26, 0.79, 0.08, 0.71, 89.49,
         32.23, 2.2, 0.17, 0.22, 75.26, 0.51, 351.67, 0.18, 9.24, 0.27, 4.21, -2.84, -0.29, 2.16, 0, 2.16, 228,
         12150, 0, 83.6, 78, 55, 3959.8, 2404.16, 1530.38, 659.67, 2.59, 0, 4732.04, 66.34, 7.67],
    ]

    # Known class 1 rows.
    spill_rows = [
        [2971, 1020.91, 630.8, 59, 7427500, 32.76, 10.48, 17380, 0.32, 427.4, 0.22, 0.29, 0.5, 0.08, 0.42, 149.87,
         50.99, 1.89, 0.14, 0.18, 75.26, 0.44, 351.67, 0.18, 9.24, 2.5, 10.63, -3.07, -0.28, 2.18, 0, 2.18, 164,
         8730, 0, 40.67, 78, 55, 5650.88, 1749.29, 1245.07, 348.7, 4.54, 0, 25579.34, 65.78, 7.41],
        [3155, 1118.08, 469.39, 11, 7887500, 30.41, 7.99, 15880, 0.26, 496.7, 0.2, 0.26, 0.69, 0.11, 0.58, 118.11,
         43.96, 1.76, 0.15, 0.18, 75.26, 0.4, 351.67, 0.18, 9.24, 0.78, 8.68, -3.19, -0.33, 2.19, 0, 2.19, 150,
         8100, 0, 31.97, 78, 55, 3471.31, 3059.41, 2043.9, 477.23, 1.7, 0, 28172.07, 65.72, 7.58],
        [115, 1449.85, 608.43, 88, 287500, 40.42, 7.34, 3340, 0.18, 86.1, 0.21, 0.32, 0.5, 0.17, 0.34, 71.2, 16.73,
         1.82, 0.19, 0.29, 87.65, 0.46, 132.78, -0.01, 3.78, 0.7, 4.79, -3.36, -0.23, 1.95, 0, 1.95, 29, 1530,
         0.01, 38.8, 89, 69, 1400, 250, 150, 45.13, 9.33, 1, 31692.84, 65.81, 7.84],
    ]

    # (heading, per-row message, rows) for each group of known cases.
    reports = (
        ('Non-Spill Cases:', '>Predicted=%d (expected 0)', non_spill_rows),
        ('Spill Cases:', '>Predicted=%d (expected 1)', spill_rows),
    )
    for heading, message, rows in reports:
        print(heading)
        for row in rows:
            # predict() takes a batch, so wrap the row and unwrap the label.
            label = pipeline.predict([row])[0]
            print(message % label)
def test_multiclass_fit_sample():
    """Check per-class sample counts after ENN on a three-class target."""
    # Relabel the first 1000 samples to obtain a third class.
    labels = Y.copy()
    labels[:1000] = 2

    sampler = EditedNearestNeighbours(random_state=RND_SEED)
    X_resampled, y_resampled = sampler.fit_sample(X, labels)

    class_counts = Counter(y_resampled)
    for label, expected_count in ((0, 400), (1, 1836), (2, 5)):
        assert_equal(class_counts[label], expected_count)
def test_enn_fit_resample():
    """Check the samples and labels returned by ENN ``fit_resample``."""
    X_resampled, y_resampled = EditedNearestNeighbours().fit_resample(X, Y)

    expected_X = np.array([
        [-0.10903849, -0.12085181], [0.01936241, 0.17799828],
        [2.59928271, 0.93323465], [1.92365863, 0.82718767],
        [0.25738379, 0.95564169], [0.78318102, 2.59153329],
        [0.52726792, -0.38735648],
    ])
    expected_y = np.array([0, 0, 1, 1, 2, 2, 2])
    assert_array_equal(X_resampled, expected_X)
    assert_array_equal(y_resampled, expected_y)
Esempio n. 24
0
def sampling(X, Y, sample_type="over"):
    """
    Resample the data with every technique named in ``sample_type``.

    :param X: input data
    :param Y: classification data
    :param sample_type: str or list naming the techniques to run; the names
                        looked for are "over" (SMOTE on the minority class),
                        "under" (EditedNearestNeighbours) and "combine"
                        (SMOTEENN). Default is "over".
    :return: dict with keys "under", "over" and "combine". A requested
             technique maps to the resampled X with the labels appended as
             last column; a technique that was not requested maps to an
             empty list.
    """
    resampled = {}
    if "over" in sample_type:
        resampled["over"] = SMOTE(sampling_strategy="minority",
                                  random_state=42).fit_resample(X, Y)
    if "under" in sample_type:
        # ENN is used for under-sampling (the original author notes that
        # cluster centroids ran into memory issues).
        resampled["under"] = EditedNearestNeighbours(
            random_state=42).fit_resample(X, Y)
    if "combine" in sample_type:
        resampled["combine"] = SMOTEENN(random_state=42).fit_resample(X, Y)

    X_Y = dict()
    for technique in ("under", "over", "combine"):
        if technique in resampled:
            features, labels = resampled[technique]
            # Glue the labels on as an extra trailing column.
            X_Y[technique] = np.append(
                features, labels.reshape(len(labels), 1), axis=1)
        else:
            X_Y[technique] = list()
    return X_Y
Esempio n. 25
0
def UnderSample(X, Y, method='Random', random_state=42):
    """Under-sample ``(X, Y)`` with the sampler selected by ``method``.

    Args:
        X: Feature array. When it holds one value per sample
            (``X.size == len(X)``) it is reshaped into a single column.
        Y: Target labels.
        method: One of ``'Cluster'``, ``'Random'``, ``'NearMiss_1'``,
            ``'NearMiss_2'``, ``'NearMiss_3'``, ``'TomekLinks'``, ``'ENN'``,
            ``'RENN'``, ``'All_KNN'``, ``'CNN'``, ``'One_SS'``, ``'NCR'`` or
            ``'IHT'``.
        random_state: Seed forwarded to the selected sampler.

    Returns:
        Tuple ``(X_resampled, Y_resampled)``.

    Raises:
        ValueError: If ``method`` is not one of the names listed above.
    """
    if X.size == len(X):
        X = X.reshape(-1, 1)
    # Method names are compared with ``==``: ``is`` tests object identity and
    # only matched by accident of string interning.
    if method == 'Cluster':  # default KMeans estimator
        sampler = ClusterCentroids(ratio='auto',
                                   random_state=random_state,
                                   estimator=None)
    elif method == 'Random':
        sampler = RandomUnderSampler(ratio='auto',
                                     random_state=random_state,
                                     replacement=False)
    elif method == 'NearMiss_1':
        sampler = NearMiss(ratio='auto', random_state=random_state, version=1)
    elif method == 'NearMiss_2':
        sampler = NearMiss(ratio='auto', random_state=random_state, version=2)
    elif method == 'NearMiss_3':
        sampler = NearMiss(ratio='auto', random_state=random_state, version=3)
    elif method == 'TomekLinks':
        sampler = TomekLinks(ratio='auto', random_state=random_state)
    elif method == 'ENN':  # kind_sel may be 'all' or 'mode'
        sampler = EditedNearestNeighbours(ratio='auto',
                                          random_state=random_state,
                                          kind_sel='all')
    elif method == 'RENN':  # kind_sel may be 'all' or 'mode'
        sampler = RepeatedEditedNearestNeighbours(ratio='auto',
                                                  random_state=random_state,
                                                  kind_sel='all')
    elif method == 'All_KNN':
        sampler = AllKNN(ratio='auto',
                         random_state=random_state,
                         kind_sel='all')
    elif method == 'CNN':
        sampler = CondensedNearestNeighbour(ratio='auto',
                                            random_state=random_state)
    elif method == 'One_SS':
        sampler = OneSidedSelection(ratio='auto', random_state=random_state)
    elif method == 'NCR':
        sampler = NeighbourhoodCleaningRule(ratio='auto',
                                            random_state=random_state,
                                            kind_sel='all',
                                            threshold_cleaning=0.5)
    elif method == 'IHT':
        sampler = InstanceHardnessThreshold(estimator=None,
                                            ratio='auto',
                                            random_state=random_state)
    else:
        # Fail with a clear message instead of an unbound ``sampler`` below.
        raise ValueError(
            'Unknown under-sampling method: {!r}'.format(method))
    X_resampled, Y_resampled = sampler.fit_sample(X, Y)
    return X_resampled, Y_resampled
Esempio n. 26
0
    def __init__(self):
        """Set up the time stamp, classifier, vectorizer and samplers."""
        stamp = datetime.datetime.now().strftime("%Y_%b_%d_%H_%M")
        self.time_stamp = stamp

        print('Model Stamp:' + stamp)

        self.clf = RandomForestClassifier(
            n_estimators=30,
            criterion='gini',
            class_weight='balanced',
            warm_start=True,
            n_jobs=-1,
        )

        self.vector = HashingVectorizer(
            n_features=2 ** 22,
            analyzer='word',
            token_pattern=r'\b\w{1,}[^\d\W]+\b',
            ngram_range=(2, 2),
            alternate_sign=False,
            decode_error='ignore',
        )

        # Samplers are not needed during testing
        majority_only = {'sampling_strategy': 'majority', 'n_jobs': -1}
        self.samplers = [
            TomekLinks(random_state=11, **majority_only),
            EditedNearestNeighbours(random_state=7, **majority_only),
        ]
def balancing_data(X, y, method):
    """Balance ``(X, y)`` with the sampler named by ``method``.

    Args:
        X: Feature matrix.
        y: Target labels.
        method: One of ``"RandomOverSampler"``, ``"TomekLinks"``,
            ``"SMOTEENN"``, ``"SMOTETomek"`` or ``"EditedNearestNeighbours"``.

    Returns:
        Tuple ``(X_resampled, y_resampled)``.

    Raises:
        ValueError: If ``method`` is not one of the names listed above.
    """
    if method == "RandomOverSampler":
        b_method = RandomOverSampler(random_state=0)
    elif method == "TomekLinks":
        b_method = TomekLinks(random_state=0)
    elif method == "SMOTEENN":
        b_method = SMOTEENN(random_state=0)
    elif method == "SMOTETomek":
        b_method = SMOTETomek(random_state=0)
    elif method == "EditedNearestNeighbours":
        b_method = EditedNearestNeighbours(random_state=0)
    else:
        # Fail with a clear message instead of an unbound ``b_method`` below.
        raise ValueError("Unknown balancing method: {!r}".format(method))

    # Balance the data and return the result.
    X_resampled, y_resampled = b_method.fit_sample(X, y)

    return X_resampled, y_resampled
Esempio n. 28
0
def under_sampling_algs():
    """Return ``(sampler, short_name)`` pairs for the under-sampling study.

    The first entry is a placeholder pair of strings standing for the
    "no re-sampling" case instead of a sampler object.
    """
    return [
        ("No Rs Undersampling case", "No Re-sampling"),
        (RandomUnderSampler(random_state=1), 'RU'),
        (ClusterCentroids(random_state=1), 'CC'),
        (TomekLinks(), 'TL'),
        (NearMiss(version=1), 'NM1'),
        (NearMiss(version=2), 'NM2'),
        (NearMiss(version=3), 'NM3'),
        (CondensedNearestNeighbour(random_state=1), 'CNN'),
        (OneSidedSelection(random_state=1), 'OSS'),
        (EditedNearestNeighbours(), 'ENN'),
        (NeighbourhoodCleaningRule(), 'NCL'),
        (InstanceHardnessThreshold(random_state=1), 'IHT'),
        (RepeatedEditedNearestNeighbours(), 'RENN'),
        (AllKNN(), 'AllKNN'),
    ]
def get_models():
    """Return parallel lists of under-sampler instances and short names."""
    labelled_samplers = [
        ('TL', TomekLinks()),
        ('ENN', EditedNearestNeighbours()),
        ('RENN', RepeatedEditedNearestNeighbours()),
        ('OSS', OneSidedSelection()),
        ('NCR', NeighbourhoodCleaningRule()),
    ]
    models = [sampler for _, sampler in labelled_samplers]
    names = [label for label, _ in labelled_samplers]
    return models, names
def balance_data(X_content, ratings):
    """
    Balance the training data: first over-sample (SMOTE), then clean /
    under-sample (ENN). The data is processed in 20 row chunks to avoid
    memory errors when densifying the sparse matrix.

    input arguments:
        X_content: The full sparse feature matrix, not yet transformed to
                   TFIDF format
        ratings: The corresponding ratings
    output arguments:
        return_csr: The balanced X_content (sparse)
        return_ratings: The balanced, corresponding ratings (list)
    """

    # SMOTE object for over-sampling and ENN object for cleaning the
    # over-sampled data.
    sm = SMOTE()
    enn = EditedNearestNeighbours()
    nr_revs = X_content.shape[0]

    return_csr = csr_matrix((0, X_content.shape[1]))
    return_ratings = []
    nr_chuncks = 20
    # Floor division keeps the slice bounds integral on Python 3 as well.
    chunck = nr_revs // nr_chuncks
    for x in range(nr_chuncks):
        # The last chunk also takes the rows left over by the division.
        start = x * chunck
        stop = (x + 1) * chunck if x < nr_chuncks - 1 else nr_revs
        if start == stop:
            # Fewer rows than chunks: nothing to resample in this slice.
            continue
        X_now = X_content[start:stop, :].toarray()
        ratings_now = ratings[start:stop]

        # Apply SMOTE four times (once per minority class, per the
        # original author's comment).
        for _ in range(4):
            X_now, ratings_now = sm.fit_sample(X_now, ratings_now)

        # Apply ENN for cleaning
        X_now, ratings_now = enn.fit_sample(X_now, ratings_now)

        # vstack returns a new matrix, so the result has to be kept;
        # discarding it left return_csr empty.
        # NOTE(review): assumes `vstack` is scipy.sparse.vstack - confirm.
        return_csr = vstack([return_csr, csr_matrix(X_now)])
        return_ratings.extend(ratings_now)

    print("balanced")
    return return_csr, return_ratings