Esempio n. 1
0
def test_enn_not_good_object():
    """Check that an invalid ``n_neighbors`` value is rejected at fit time.

    A plain string is passed as ``n_neighbors``; ``fit_sample`` is expected
    to raise ``ValueError`` with a message matching "has to be one of".
    """
    sampler = EditedNearestNeighbours(
        n_neighbors='rnd', random_state=RND_SEED, kind_sel='mode')
    with raises(ValueError, match="has to be one of"):
        sampler.fit_sample(X, Y)
Esempio n. 2
0
def test_enn_fit_sample_with_nn_object():
    """Resample with a ``NearestNeighbors`` instance given as ``n_neighbors``.

    The sampler runs with ``kind_sel='mode'`` on the module fixtures ``X`` and
    ``Y``; the output must match the hard-coded reference arrays exactly.
    """
    neighbours_estimator = NearestNeighbors(n_neighbors=4)
    sampler = EditedNearestNeighbours(
        n_neighbors=neighbours_estimator,
        random_state=RND_SEED,
        kind_sel='mode')
    X_resampled, y_resampled = sampler.fit_sample(X, Y)

    # Reference output: 2 samples of class 0, 3 of class 1, 9 of class 2.
    expected_X = np.array([
        [-0.10903849, -0.12085181], [0.01936241, 0.17799828],
        [2.59928271, 0.93323465], [1.42772181, 0.526027],
        [1.92365863, 0.82718767], [0.25738379, 0.95564169],
        [-0.284881, -0.62730973], [0.57062627, 1.19528323],
        [0.78318102, 2.59153329], [0.35831463, 1.33483198],
        [-0.14313184, -1.0412815], [-0.09816301, -0.74662486],
        [0.52726792, -0.38735648], [0.2821046, -0.07862747],
    ])
    expected_y = np.array([0] * 2 + [1] * 3 + [2] * 9)

    assert_array_equal(X_resampled, expected_X)
    assert_array_equal(y_resampled, expected_y)
Esempio n. 3
0
def model_preprocess(data):
    """Split ``data`` 80/20 and clean the training part with ENN.

    The split is stratified on the ``Label`` column.  Only the training part
    is resampled with ``EditedNearestNeighbours``; the test part is returned
    as produced by the split.

    Arguments:
    data: dataframe holding the features plus a ``Label`` column

    Returns:
    train: resampled training features as a dataframe (original column names)
    trainlab: labels matching the resampled training features
    test: the test part of the split, still including ``Label``
    test_nlab: the test part without the ``Label`` column
    testlab: the ``Label`` column of the test part
    """
    started = time.time()

    train_part, test = train_test_split(
        data, test_size=0.2, stratify=data.Label)

    # Separate features from labels on both sides of the split.
    train_features = train_part.drop('Label', axis=1)
    feature_names = list(train_features)
    testlab = test.Label
    test_nlab = test.drop('Label', axis=1)

    # The sampler output is wrapped back into a dataframe with the original
    # feature column names.
    resampled, trainlab = EditedNearestNeighbours().fit_sample(
        train_features, train_part.Label)
    train = pd.DataFrame(resampled, columns=feature_names)

    elapsed = time.time() - started
    print('Preprocessing Completed in %.3f seconds.' % elapsed)

    return train, trainlab, test, test_nlab, testlab
Esempio n. 4
0
    def predict_defects(self,
                        train: pd.DataFrame,
                        test: pd.DataFrame,
                        oversample: bool = True,
                        binarize: bool = True) -> Tuple[list, list]:
        """
        Predict for Defects

        The last column of ``train`` / ``test`` is treated as the target and
        all preceding columns as features.

        Parameters
        ----------
        train: pandas.core.frame.DataFrame
            Training dataset as a pandas dataframe
        test: pandas.core.frame.DataFrame
            Test dataset as a pandas dataframe
        oversample: Bool
            When true, the training data is resampled before fitting. Despite
            the parameter name, the resampler is EditedNearestNeighbours
            (an undersampler), not SMOTE.
        binarize: Bool
            When true, both dataframes are first passed through
            ``self._binarize``.

        Return
        ------
        actual: numpy.ndarray
            Actual target values of the test set, cast to int
        predicted: numpy.ndarray
            Predicted target values for the test set, cast to int
        """

        if binarize:
            train = self._binarize(train)
            test = self._binarize(test)

        x_train = train[train.columns[:-1]].values
        y_train = train[train.columns[-1]].values

        if oversample:
            sampler = EditedNearestNeighbours()
            x_train, y_train = sampler.fit_sample(x_train, y_train)

        # Feature selection: fit a throw-away clone of the classifier and keep
        # only the features SelectFromModel retains for it.
        selector_clf = clone(self.clf, safe=True)
        selector_clf.fit(x_train, y_train)
        model = SelectFromModel(selector_clf, prefit=True)
        x_train = model.transform(x_train)

        # Train the real classifier on the reduced feature set.
        self.clf.fit(x_train, y_train)

        actual = test[test.columns[-1]].values.astype(int)
        # Use a plain array here as well, matching how the selector was fitted.
        x_test = model.transform(test[test.columns[:-1]].values)
        predicted = self.clf.predict(x_test).astype(int)
        return actual, predicted
def test_enn_fit_sample():
    """Resample the fixtures with a default-constructed ENN sampler.

    The resampled ``X`` / ``Y`` must equal the hard-coded reference arrays.
    """
    sampler = EditedNearestNeighbours()
    X_resampled, y_resampled = sampler.fit_sample(X, Y)

    expected_X = np.array([
        [-0.10903849, -0.12085181],
        [0.01936241, 0.17799828],
        [2.59928271, 0.93323465],
        [1.92365863, 0.82718767],
        [0.25738379, 0.95564169],
        [0.78318102, 2.59153329],
        [0.52726792, -0.38735648],
    ])
    expected_y = np.array([0] * 2 + [1] * 2 + [2] * 3)

    assert_array_equal(X_resampled, expected_X)
    assert_array_equal(y_resampled, expected_y)
def test_enn_fit_sample():
    """Resample the fixtures with ENN seeded by ``RND_SEED``.

    The resampled ``X`` / ``Y`` must equal the hard-coded reference arrays.
    """
    X_resampled, y_resampled = EditedNearestNeighbours(
        random_state=RND_SEED).fit_sample(X, Y)

    reference_rows = [
        [-0.10903849, -0.12085181],
        [0.01936241, 0.17799828],
        [2.59928271, 0.93323465],
        [1.92365863, 0.82718767],
        [0.25738379, 0.95564169],
        [0.78318102, 2.59153329],
        [0.52726792, -0.38735648],
    ]
    reference_labels = [0, 0, 1, 1, 2, 2, 2]

    assert_array_equal(X_resampled, np.array(reference_rows))
    assert_array_equal(y_resampled, np.array(reference_labels))
def test_enn_fit_sample():
    """Compare ENN output against the reference arrays stored on disk.

    The references are ``enn_x.npy`` and ``enn_y.npy`` in the ``data``
    directory next to this test file.
    """
    sampler = EditedNearestNeighbours(random_state=RND_SEED)
    X_resampled, y_resampled = sampler.fit_sample(X, Y)

    # Locate the reference files relative to this module.
    here = os.path.dirname(os.path.abspath(__file__))
    data_dir = os.path.join(here, 'data')
    expected_X = np.load(os.path.join(data_dir, 'enn_x.npy'))
    expected_y = np.load(os.path.join(data_dir, 'enn_y.npy'))

    assert_array_equal(X_resampled, expected_X)
    assert_array_equal(y_resampled, expected_y)
def test_enn_fit_sample():
    """Check seeded ENN resampling against the stored ``.npy`` references."""
    X_resampled, y_resampled = EditedNearestNeighbours(
        random_state=RND_SEED).fit_sample(X, Y)

    # Reference arrays live in the ``data`` directory beside this file.
    module_dir = os.path.dirname(os.path.abspath(__file__))

    def _reference(filename):
        return np.load(os.path.join(module_dir, 'data', filename))

    X_gt = _reference('enn_x.npy')
    y_gt = _reference('enn_y.npy')
    assert_array_equal(X_resampled, X_gt)
    assert_array_equal(y_resampled, y_gt)
Esempio n. 9
0
def renn_sampling(X,Y):
    """Undersample a 3-D sample array with the ``ENN`` sampler.

    ``X`` is flattened to 2-D (one row per sample) for the sampler and the
    surviving rows are reshaped back to their original per-sample shape.

    NOTE(review): the function name says "renn" but the sampler used is
    ``ENN`` -- confirm which one is intended.

    Arguments:
    X: array of shape (nsamples, nx, ny)
    Y: labels, one per sample

    Returns:
    X: resampled array of shape (kept_samples, nx, ny)
    Y: resampled labels as a column of shape (kept_samples, 1)
    """
    enn = ENN(return_indices=True)
    nsamples, nx, ny = X.shape
    print(X.shape)
    X = X.reshape((nsamples, nx * ny))

    # The indices of the kept samples are returned too but are not needed.
    X, Y, _ = enn.fit_sample(X, Y)

    # Only the number of rows changes; restore the original trailing axes.
    # (The previous code rebuilt the last axis as ``ny/nx``, which is a float
    # on Python 3 and makes ``reshape`` fail.)
    nsamples = X.shape[0]
    print(X.shape)
    X = X.reshape((nsamples, nx, ny))
    Y = Y.reshape((nsamples, 1))
    return X, Y
    def resample(x, y, sampling_type=None):
        """Optionally resample ``x`` / ``y`` and print the class counts.

        ``sampling_type`` selects the sampler: ``"smoteenn"`` uses SMOTEENN,
        ``"enn"`` uses EditedNearestNeighbours (both with ``random_state=1``).
        Any other value returns the inputs unchanged.  The label distribution
        before and after is printed in every case.
        """
        if sampling_type == "smoteenn":
            resampled = SMOTEENN(random_state=1).fit_sample(x, y)
        elif sampling_type == "enn":
            resampled = EditedNearestNeighbours(random_state=1).fit_sample(x, y)
        else:
            resampled = (x, y)
        x_out, y_out = resampled

        for caption, labels in (("Before resampling:", y),
                                ("After resampling:", y_out)):
            print(caption, sorted(Counter(labels).items()))
        return x_out, y_out
def test_enn_fit_sample_with_indices():
    """Check ENN output when the kept-sample indices are requested too.

    With ``return_indices=True`` the sampler returns a third value; all three
    results must equal the hard-coded references.
    """
    sampler = EditedNearestNeighbours(return_indices=True)
    X_resampled, y_resampled, idx_under = sampler.fit_sample(X, Y)

    expected_X = np.array([
        [-0.10903849, -0.12085181],
        [0.01936241, 0.17799828],
        [2.59928271, 0.93323465],
        [1.92365863, 0.82718767],
        [0.25738379, 0.95564169],
        [0.78318102, 2.59153329],
        [0.52726792, -0.38735648],
    ])
    expected_y = np.array([0] * 2 + [1] * 2 + [2] * 3)
    expected_idx = np.array([4, 11, 0, 3, 1, 8, 15])

    assert_array_equal(X_resampled, expected_X)
    assert_array_equal(y_resampled, expected_y)
    assert_array_equal(idx_under, expected_idx)
def test_enn_fit_sample_mode():
    """Check ENN resampling of the fixtures with ``kind_sel='mode'``."""
    sampler = EditedNearestNeighbours(kind_sel='mode')
    X_resampled, y_resampled = sampler.fit_sample(X, Y)

    # Reference output: 2 samples of class 0, 3 of class 1, 9 of class 2.
    expected_X = np.array([
        [-0.10903849, -0.12085181],
        [0.01936241, 0.17799828],
        [2.59928271, 0.93323465],
        [1.42772181, 0.526027],
        [1.92365863, 0.82718767],
        [0.25738379, 0.95564169],
        [-0.284881, -0.62730973],
        [0.57062627, 1.19528323],
        [0.78318102, 2.59153329],
        [0.35831463, 1.33483198],
        [-0.14313184, -1.0412815],
        [-0.09816301, -0.74662486],
        [0.52726792, -0.38735648],
        [0.2821046, -0.07862747],
    ])
    expected_y = np.array([0] * 2 + [1] * 3 + [2] * 9)

    assert_array_equal(X_resampled, expected_X)
    assert_array_equal(y_resampled, expected_y)
def test_enn_fit_sample_with_indices():
    """Check ENN output plus kept-sample indices against stored references.

    The references are ``enn_x.npy``, ``enn_y.npy`` and ``enn_idx.npy`` in the
    ``data`` directory next to this test file.
    """
    sampler = EditedNearestNeighbours(
        return_indices=True, random_state=RND_SEED)
    results = sampler.fit_sample(X, Y)
    X_resampled, y_resampled, idx_under = results

    data_dir = os.path.join(
        os.path.dirname(os.path.abspath(__file__)), 'data')
    expected_X = np.load(os.path.join(data_dir, 'enn_x.npy'))
    expected_y = np.load(os.path.join(data_dir, 'enn_y.npy'))
    expected_idx = np.load(os.path.join(data_dir, 'enn_idx.npy'))

    assert_array_equal(X_resampled, expected_X)
    assert_array_equal(y_resampled, expected_y)
    assert_array_equal(idx_under, expected_idx)
Esempio n. 14
0
def samplingMethod(X_train, y_train, sampling="None"):
    """Resample a sparse training set with the sampler named by ``sampling``.

    Supported names are ``"SMOTE"``, ``"ENN"`` and ``"SMOTEENN"``.  For those,
    ``X_train`` is densified with ``toarray()``, resampled, and converted back
    with ``csr_matrix``.  Any other name returns the inputs untouched.

    Returns the (possibly resampled) ``X_train`` and ``y_train``.
    """
    if sampling == "SMOTE":
        sampler = SMOTE(random_state=42, n_jobs=-1)
    elif sampling == "ENN":
        sampler = EditedNearestNeighbours(random_state=42, n_jobs=-1)
    elif sampling == "SMOTEENN":
        sampler = SMOTEENN(random_state=42, n_jobs=-1)
    else:
        # Unknown name: no resampling.
        return X_train, y_train

    dense_X, y_train = sampler.fit_sample(X_train.toarray(), y_train)
    return csr_matrix(dense_X), y_train
def test_enn_fit_sample_with_indices():
    """Seeded ENN with ``return_indices=True`` must match the stored arrays."""
    X_resampled, y_resampled, idx_under = EditedNearestNeighbours(
        return_indices=True, random_state=RND_SEED).fit_sample(X, Y)

    # Reference arrays are stored in ``data/`` beside this module.
    module_dir = os.path.dirname(os.path.abspath(__file__))

    def _reference(filename):
        return np.load(os.path.join(module_dir, 'data', filename))

    X_gt = _reference('enn_x.npy')
    y_gt = _reference('enn_y.npy')
    idx_gt = _reference('enn_idx.npy')
    assert_array_equal(X_resampled, X_gt)
    assert_array_equal(y_resampled, y_gt)
    assert_array_equal(idx_under, idx_gt)
	def enn(self, data):
		'''
		Apply Edited Nearest Neighbours to ``data`` and return the cleaned
		samples as a new dataframe.

		The feature columns are those named in ``self.features`` (assumed to
		be an iterable of column names); the target is the ``label`` column.

		Returns a dataframe with the ``self.features`` columns plus ``label``.
		'''
		# DataFrame.as_matrix() was removed in pandas 1.0; selecting the
		# columns and reading ``.values`` gives the same arrays on old and
		# new pandas alike.
		X = data[list(self.features)].values
		y = np.ravel(data[['label']].values)

		sampler = EditedNearestNeighbours(ratio='all',kind_sel='mode',n_neighbors=5,random_state=42,n_jobs=4)
		X_res, y_res = sampler.fit_sample(X, y)

		df_enn = pd.DataFrame(X_res, columns=self.features)
		df_enn['label'] = y_res
		return df_enn
def plot_data(X, Y):
    """Scatter-plot a dataset next to five resampled versions of it.

    Panels, left to right:
      1. the original data
      2. SMOTE
      3. Borderline-SMOTE (``kind='borderline-1'``)
      4. Edited Nearest Neighbours followed by SMOTE (``k_neighbors=5``)
      5. ADASYN (``n_neighbors=3``)
      6. ``dbscan_based.MultiDbscanBasedOverSample(eps=0.3, min_pts=5)``

    Only the first two feature columns are drawn, coloured by label.  The
    panels have no axes or ticks, and the figure is shown with ``plt.show()``.

    The previous layout was a 1x5 grid in which panels 4 and 5 were both
    placed at position 4; six panels need a 1x6 grid.
    """
    n_panels = 6
    plt.rcParams['figure.figsize'] = (27.0, 5.0)
    fig = plt.figure()

    def _draw_panel(position, points, labels):
        # One bare scatter panel: frame and ticks are switched off.
        ax = fig.add_subplot(1, n_panels, position)
        ax.scatter(points[:, 0], points[:, 1], c=labels)
        plt.axis('off')
        plt.xticks([])
        plt.yticks([])

    # Original dataset
    _draw_panel(1, X, Y)

    # SMOTE
    X1, Y1 = SMOTE().fit_sample(X, Y)
    _draw_panel(2, X1, Y1)

    # Borderline-SMOTE
    X2, Y2 = BorderlineSMOTE(kind='borderline-1').fit_sample(X, Y)
    _draw_panel(3, X2, Y2)

    # ENN cleaning first, then SMOTE on the cleaned data
    X3, Y3 = EditedNearestNeighbours().fit_sample(X, Y)
    X3, Y3 = SMOTE(k_neighbors=5).fit_sample(X3, Y3)
    _draw_panel(4, X3, Y3)

    # ADASYN
    X4, Y4 = ADASYN(n_neighbors=3).fit_sample(X, Y)
    _draw_panel(5, X4, Y4)

    # DBSCAN-based oversampling
    X5, Y5 = dbscan_based.MultiDbscanBasedOverSample(eps=0.3, min_pts=5).fit_sample(X, Y)
    _draw_panel(6, X5, Y5)

    plt.show()
def test_multiclass_fit_sample():
    """Check the per-class sample counts after ENN on a three-class target.

    The first 1000 labels of a copy of ``Y`` are overwritten with class 2 so
    that the target has three classes before resampling.
    """
    y = Y.copy()
    y[:1000] = 2

    sampler = EditedNearestNeighbours(random_state=RND_SEED)
    _, y_resampled = sampler.fit_sample(X, y)

    class_counts = Counter(y_resampled)
    for label, expected in ((0, 400), (1, 1836), (2, 5)):
        assert_equal(class_counts[label], expected)
def test_multiclass_fit_sample():
    """ENN on a three-class target must leave the expected class sizes."""
    # Turn the fixture target into a three-class one.
    multiclass_y = Y.copy()
    multiclass_y[0:1000] = 2

    resampled = EditedNearestNeighbours(
        random_state=RND_SEED).fit_sample(X, multiclass_y)
    X_resampled, y_resampled = resampled

    # Expected number of remaining samples per class.
    sizes = Counter(y_resampled)
    assert_equal(sizes[0], 400)
    assert_equal(sizes[1], 1836)
    assert_equal(sizes[2], 5)
def test_enn_fit_sample_with_indices():
    """Seeded ENN with ``return_indices=True`` against inline references."""
    sampler = EditedNearestNeighbours(
        return_indices=True, random_state=RND_SEED)
    X_resampled, y_resampled, idx_under = sampler.fit_sample(X, Y)

    reference_rows = [
        [-0.10903849, -0.12085181],
        [0.01936241, 0.17799828],
        [2.59928271, 0.93323465],
        [1.92365863, 0.82718767],
        [0.25738379, 0.95564169],
        [0.78318102, 2.59153329],
        [0.52726792, -0.38735648],
    ]
    reference_labels = [0, 0, 1, 1, 2, 2, 2]
    reference_indices = [4, 11, 0, 3, 1, 8, 15]

    assert_array_equal(X_resampled, np.array(reference_rows))
    assert_array_equal(y_resampled, np.array(reference_labels))
    assert_array_equal(idx_under, np.array(reference_indices))
def balance_data(X_content, ratings):
    """
    Balance the training data: oversample with SMOTE, then clean with ENN.

    The data is processed in 20 consecutive row chunks to avoid memory
    errors; each chunk is densified, resampled and appended to the result.

    input arguments:
        X_content: The full sparse feature matrix, not yet transformed to TFIDF format
        ratings: The corresponding ratings
    output arguments:
        return_csr: The balanced X_content (sparse CSR)
        return_ratings: The balanced, corresponding ratings (list)
    """

    # SMOTE oversamples, ENN then cleans the oversampled data.
    sm = SMOTE()
    enn = EditedNearestNeighbours()
    nr_revs = X_content.shape[0]

    return_csr = csr_matrix((0, X_content.shape[1]))
    return_ratings = []
    nr_chunks = 20
    # Floor division: the chunk size is used as a slice bound and must be an
    # int (plain "/" yields a float on Python 3).
    chunk = nr_revs // nr_chunks
    for x in range(nr_chunks):
        # The last chunk also takes the remainder rows.
        start = x * chunk
        stop = (x + 1) * chunk if x < nr_chunks - 1 else nr_revs
        X_now = X_content[start:stop, :].toarray()
        ratings_now = ratings[start:stop]

        # Apply SMOTE repeatedly, once per minority class
        for _ in range(4):
            X_now, ratings_now = sm.fit_sample(X_now, ratings_now)

        # Apply ENN for cleaning
        X_now, ratings_now = enn.fit_sample(X_now, ratings_now)

        # vstack returns a new matrix, so the result has to be kept;
        # discarding it left return_csr empty.
        return_csr = vstack([return_csr, csr_matrix(X_now)])
        return_ratings.extend(ratings_now)

    print("balanced")
    return return_csr, return_ratings
def test_enn_fit_sample_with_nn_object():
    """ENN must accept a ``NearestNeighbors`` estimator as ``n_neighbors``.

    The sampler is run with ``kind_sel='mode'`` and its output compared to the
    hard-coded reference arrays.
    """
    knn = NearestNeighbors(n_neighbors=4)
    sampler = EditedNearestNeighbours(
        n_neighbors=knn, random_state=RND_SEED, kind_sel='mode')
    X_resampled, y_resampled = sampler.fit_sample(X, Y)

    expected_X = np.array([
        [-0.10903849, -0.12085181],
        [0.01936241, 0.17799828],
        [2.59928271, 0.93323465],
        [1.42772181, 0.526027],
        [1.92365863, 0.82718767],
        [0.25738379, 0.95564169],
        [-0.284881, -0.62730973],
        [0.57062627, 1.19528323],
        [0.78318102, 2.59153329],
        [0.35831463, 1.33483198],
        [-0.14313184, -1.0412815],
        [-0.09816301, -0.74662486],
        [0.52726792, -0.38735648],
        [0.2821046, -0.07862747],
    ])
    expected_y = np.array([0] * 2 + [1] * 3 + [2] * 9)

    assert_array_equal(X_resampled, expected_X)
    assert_array_equal(y_resampled, expected_y)
Esempio n. 23
0
def run(X=None, Y=None, random_state=42, smote_ratio="minority", smote_kind="regular", enn_ratio="all", enn_kind_sel="all", enn_n_neighbors=3, save_dist=False, file=None):
    """Oversample with SMOTE, then clean the result with ENN.

    Parameters
    ----------
    X, Y : features and labels to resample.
    random_state : seed passed to both samplers.
    smote_ratio, smote_kind : forwarded to ``SMOTE`` as ``ratio`` / ``kind``.
        With ``smote_kind == "svm"`` an ``SVC()`` is also passed as
        ``svm_estimator``.
    enn_ratio, enn_kind_sel, enn_n_neighbors : forwarded to
        ``EditedNearestNeighbours`` as ``ratio`` / ``kind_sel`` /
        ``n_neighbors``.
    save_dist : when true, the class distribution after each step is appended
        to ``file``.
    file : path of the log file; only used when ``save_dist`` is true.

    Returns
    -------
    X_st, Y_st : the data after SMOTE followed by ENN.
    """
    if smote_kind == "svm":
        sm = SMOTE(random_state=random_state, ratio=smote_ratio, kind=smote_kind, svm_estimator=SVC())
    else:
        sm = SMOTE(random_state=random_state, ratio=smote_ratio, kind=smote_kind,)

    enn = EditedNearestNeighbours(random_state=random_state, ratio=enn_ratio, kind_sel=enn_kind_sel, n_neighbors=enn_n_neighbors)

    X_resampled, Y_resampled = sm.fit_sample(X, Y)

    if save_dist:
        with open(file, "a") as arch:
            arch.write("SMOTE: " + str(Counter(Y_resampled)) + " ")

    X_st, Y_st = enn.fit_sample(X_resampled, Y_resampled)

    if save_dist:
        with open(file, "a") as arch:
            # Was ``y_st`` (undefined name), which raised NameError here.
            arch.write("ENN:" + str(Counter(Y_st))+"\n")

    return X_st, Y_st
Esempio n. 24
0
def compare_different_oversample_method(model, sample_method, X, Y):
    """Cross-validate ``model`` with a given resampling method on the folds.

    A shuffled 5-fold stratified split is used.  In each fold only the
    training part is resampled, the model is fitted on it, and the scores
    returned by ``cal_multi_class_matrics`` for the test part are accumulated.

    Parameters
    ----------
    model : estimator with ``fit``, ``predict`` and ``predict_proba``.
    sample_method : one of the strings ``'SMOTE_ENN'`` (ENN cleaning followed
        by SMOTE), ``'smote'``, ``'borderline_smote'``, ``'adasyn'``; or any
        other truthy object exposing ``fit_sample``; or a falsy value for no
        resampling.
    X, Y : indexable arrays of features and labels.

    Returns
    -------
    The per-fold results averaged over the folds (an array of 4 values --
    presumably the 4 metrics of ``cal_multi_class_matrics``; confirm there).
    """
    n_split = 5
    skf = StratifiedKFold(n_splits=n_split, shuffle=True)
    res_list = np.zeros(4)

    def _max_neighbors(labels):
        # Largest usable neighbour count: size of the rarest class minus the
        # sample itself.  The class *counts* are needed here; taking min()
        # of the Counter directly would return the smallest class label.
        return min(Counter(labels).values()) - 1

    for cnt, (train_indices, test_indices) in enumerate(skf.split(X, Y), start=1):
        print('正在进行第{}次交叉验证'.format(cnt))
        train_X, train_Y = X[train_indices], Y[train_indices]
        test_X, test_Y = X[test_indices], Y[test_indices]
        min_k_kearest = _max_neighbors(train_Y)

        # The neighbour-based oversamplers are skipped when the rarest class
        # has a single sample (no neighbour available).
        if sample_method == 'SMOTE_ENN':
            enn = EditedNearestNeighbours()
            train_X, train_Y = enn.fit_sample(train_X, train_Y)
            # ENN removes samples, so the class sizes must be re-read.
            min_k_kearest = _max_neighbors(train_Y)
            if min_k_kearest > 0:
                smo = SMOTE(k_neighbors=min(3, min_k_kearest))
                train_X, train_Y = smo.fit_sample(train_X, train_Y)
        elif sample_method == 'smote':
            if min_k_kearest > 0:
                smo = SMOTE(k_neighbors=min(3, min_k_kearest))
                train_X, train_Y = smo.fit_sample(train_X, train_Y)
        elif sample_method == 'borderline_smote':
            if min_k_kearest > 0:
                smo = BorderlineSMOTE(kind='borderline-1', k_neighbors=min(3, min_k_kearest))
                train_X, train_Y = smo.fit_sample(train_X, train_Y)
        elif sample_method == 'adasyn':
            if min_k_kearest > 0:
                ada = ADASYN(n_neighbors=min(2, min_k_kearest))
                train_X, train_Y = ada.fit_sample(train_X, train_Y)
        elif sample_method:
            # Caller-supplied sampler object.
            train_X, train_Y = sample_method.fit_sample(train_X, train_Y)

        model.fit(train_X, train_Y)
        y_score = model.predict(test_X)
        y_score_prob = model.predict_proba(test_X)[:, 1]
        res_list += cal_multi_class_matrics(test_Y, y_score, y_score_prob)
    return res_list / n_split
Esempio n. 25
0
        # Log the current grid values and store them on the first of the two
        # result rows used for this combination.
        # NOTE(review): i and j are presumably the loop variables of an
        # enclosing ratio / n_neighbors grid -- the loops are not visible here.
        print("ratio", i)
        results['ratio'][a] = i
        print("neighbors", j)
        results['neighbors'][a] = j
        # Two consecutive rows per combination: row b for class 0 and
        # row a (= b + 1) for class 1.
        b = a
        a = a + 1
        results['Class'][b] = 0
        results['Class'][a] = 1
        results['Datasize'][b] = datasize[0]
        results['Datasize'][a] = datasize[1]
        results['Training Datasize'][b] = trainingdatasize[0]
        results['Training Datasize'][a] = trainingdatasize[1]
        results['Testing Datasize'][b] = testingdatasize[0]
        results['Testing Datasize'][a] = testingdatasize[1]
        # Clean the already-sampled training data with ENN using the current
        # number of neighbours, then record the class sizes that remain.
        enn = EditedNearestNeighbours(random_state=5, n_neighbors=j)
        X_train_sampled, y_train_sampled = enn.fit_sample(
            X_train_sampled1, y_train_sampled1)
        samplingdatasize = collections.Counter(y_train_sampled)
        print("sampled training data size", samplingdatasize)
        results['After sampling'][b] = samplingdatasize[0]
        results['After sampling'][a] = samplingdatasize[1]

        # Random forest trained on the resampled data; the out-of-bag score
        # is enabled and printed below.
        clf = RandomForestClassifier(n_estimators=100,
                                     max_depth=5,
                                     random_state=0,
                                     oob_score=True)
        clf.fit(X_train_sampled, y_train_sampled)
        y_pred = clf.predict(X_test)
        y_test_arr = np.array(y_test['Outcome'])
        oobscore = clf.oob_score_
        print("oob score", oobscore)
Esempio n. 26
0
def hyperParamSearch(X_train,
                     y_train,
                     X_test,
                     y_test,
                     clf="logistic",
                     scoring='accuracy',
                     preprocess='MaxMin',
                     sampling="None"):
    """Grid-search a preprocessing + classifier pipeline and report results.

    For every metric in ``scoring`` a 3-fold ``GridSearchCV`` is fitted on the
    (optionally resampled) training data, dumped with joblib to
    ``final_<clf>(<score> based_<preprocess> preprocessed_<sampling> sampling).pkl``,
    and its scores plus the confusion matrix on the test set are printed.

    Parameters
    ----------
    X_train, y_train : training data; ``X_train`` must support ``toarray()``
        when a sampling method is selected.
    X_test, y_test : evaluation data for the confusion matrix.
    clf : ``"logistic"``, ``"randomForest"`` or ``"KNN"``.
    scoring : a single metric name or an iterable of metric names.
    preprocess : ``'MaxMin'`` (MaxAbsScaler) or ``'Binarization'`` (Binarizer).
    sampling : ``"SMOTE"``, ``"ENN"`` or ``"SMOTEENN"``; any other value means
        no resampling.

    Raises
    ------
    ValueError
        If ``clf`` or ``preprocess`` is not one of the supported names.
    """
    # Fail early and clearly instead of hitting an unbound name further down.
    if preprocess not in ('MaxMin', 'Binarization'):
        raise ValueError("unknown preprocess: %r" % (preprocess,))
    if clf not in ("logistic", "randomForest", "KNN"):
        raise ValueError("unknown clf: %r" % (clf,))
    # A bare string (including the default) would otherwise be iterated
    # character by character in the loop below.
    if isinstance(scoring, str):
        scoring = [scoring]

    # sampling: the samplers need dense input, the pipeline gets CSR back
    if sampling == "SMOTE":
        sm = SMOTE(random_state=42, n_jobs=-1)
        X, y_train = sm.fit_sample(X_train.toarray(), y_train)
        X_train = csr_matrix(X)
    elif sampling == "ENN":
        enn = EditedNearestNeighbours(random_state=42, n_jobs=-1)
        X, y_train = enn.fit_sample(X_train.toarray(), y_train)
        X_train = csr_matrix(X)
    elif sampling == "SMOTEENN":
        sme = SMOTEENN(random_state=42, n_jobs=-1)
        X, y_train = sme.fit_sample(X_train.toarray(), y_train)
        X_train = csr_matrix(X)

    # preprocessing
    if preprocess == 'MaxMin':
        preprocessing = ('MaxMin', MaxAbsScaler())
    else:
        preprocessing = ('Bin', Binarizer())

    # Parameters of pipelines are set using '__' separated parameter names.
    if clf == "logistic":
        tuned_parameters = [{
            'logistic__penalty': ['l2'],
            'logistic__C': [0.001, 0.1, 1, 10, 100],
            'logistic__class_weight': [None]
        }]
        pipe = Pipeline(
            steps=[preprocessing, ('logistic', LogisticRegression(n_jobs=-1))])
    elif clf == "randomForest":
        tuned_parameters = [{
            'randomForest__n_estimators': [100, 500],
            'randomForest__min_samples_leaf': [1, 10, 25],
            'randomForest__class_weight': [None, 'balanced']
        }]
        pipe = Pipeline(steps=[
            preprocessing, ('randomForest', RandomForestClassifier(n_jobs=-1))
        ])
    else:
        tuned_parameters = [{
            'KNN__n_neighbors': [5, 10, 20, 40],
            'KNN__weights': ['distance', 'uniform'],
            'KNN__metric': ['euclidean', 'manhattan']
        }]
        pipe = Pipeline(
            steps=[preprocessing, ('KNN', KNeighborsClassifier(n_jobs=-1))])

    for score in scoring:
        estimator = GridSearchCV(pipe,
                                 tuned_parameters,
                                 cv=3,
                                 scoring=score,
                                 error_score=-1,
                                 n_jobs=-1)
        estimator.fit(X_train, y_train)

        save_name = "final_%s(%s based_%s preprocessed_%s sampling).pkl" % (
            clf, score, preprocess, sampling)
        joblib.dump(estimator, save_name, compress=True)
        # print information
        print("************************* GENERAL INFO ***********************")
        print(" - classifier : %s" % (clf))
        print(" - sampling : %s" % (sampling))
        print(" - preprocessing : %s" % (preprocess))
        print(" - hyperParam based on : %s" % (score))
        print("**************************************************************")
        print("Best parameters set found on development set:")
        print(estimator.best_params_)

        print("%s scores on development set:" % (score))
        means = estimator.cv_results_['mean_test_score']
        stds = estimator.cv_results_['std_test_score']
        for mean, std, params in zip(means, stds,
                                     estimator.cv_results_['params']):
            print("%0.3f (+/-%0.03f) for %r" % (mean, std * 2, params))
        print("Detailed classification report:")
        print("The model is trained on the full development set.")
        print("The scores are computed on the full evaluation set.")
        y_true, y_pred = y_test, estimator.predict(X_test)

        confus = confusion_matrix(y_true, y_pred)
        print('*****CV python****')
        print(confus)
def test_enn_not_good_object():
    """A string passed as ``n_neighbors`` must make ``fit_sample`` fail.

    The expected failure is a ``ValueError`` whose message matches
    "has to be one of".
    """
    invalid_neighbors = 'rnd'
    sampler = EditedNearestNeighbours(
        n_neighbors=invalid_neighbors,
        kind_sel='mode')
    with raises(ValueError, match="has to be one of"):
        sampler.fit_sample(X, Y)
# Optional diagnostic: show the feature correlation heatmap and stop.
if corr:
    df_corr = df.corr()
    plt.figure(figsize=(15,10))
    seaborn.heatmap(df_corr, cmap="YlGnBu") # Displaying the Heatmap
    seaborn.set(font_scale=2,style='white')
    plt.title('Heatmap correlation')
    plt.show()
    exit()

# Features with high correlation and importance values; the same columns are
# used for the training and the test set.
feature_columns = ['gaze0_x','gaze0_y','gaze0_z','gaze1_x','gaze1_y','gaze1_z','poser_x','poser_y','poser_z','au23','au05','au12']

# DataFrame.as_matrix() was removed in pandas 1.0; column selection followed
# by ``.values`` yields the same arrays on old and new pandas.
X_train = df[feature_columns].values

train_label1 = df[['label']].values
y_train = np.ravel(train_label1)

# Undersample the training data with Edited Nearest Neighbours.
rus = EditedNearestNeighbours(random_state=42)
X_resampled, y_resampled = rus.fit_sample(X_train, y_train)

df1 = pd.read_csv('test.csv')
test_data = df1[feature_columns].values

test_label1 = df1[['label']].values
test_label = np.ravel(test_label1)

# Optional grid search over C and gamma for an SVC on the resampled data.
# NOTE(review): StratifiedKFold(y=..., n_folds=...) is the legacy
# sklearn.cross_validation signature -- confirm the installed version.
if gridsearch:
	C_range = 10. ** np.arange(-2, 3)
	gamma_range = 10. ** np.arange(-3, 2)
	param_grid = dict(gamma=gamma_range, C=C_range)
	grid = GridSearchCV(SVC(), param_grid=param_grid, cv=StratifiedKFold(y=y_resampled, n_folds=5))
	grid.fit(X_resampled, y_resampled)
	print("The best classifier is: ", grid.best_estimator_)
	exit()
Esempio n. 29
0
# Fit the SMOTE-based model and predict on the test set.
clf_smote.fit(X_smote,y_smote)
preditions_smote=clf_smote.predict(X_test)
# Learning curve
train_sizes,train_scores,test_scores=learning_curve(estimator=clf_smote,
                                X=X_smote,y=y_smote,
                                train_sizes=np.linspace(0.05,1,10),
                                 cv=10, n_jobs=1,random_state=0)
train_mean_smote=np.mean(train_scores,axis=1)
test_mean_smote=np.mean(test_scores,axis=1)
train_std_smote=np.std(train_scores,axis=1)
# The test spread must come from the test scores (was train_scores).
test_std_smote=np.std(test_scores,axis=1)
###################################################################
## ENN
from imblearn.under_sampling import EditedNearestNeighbours
ENN=EditedNearestNeighbours(random_state=42)
X_enn,y_enn=ENN.fit_sample(X_train,y_train)
## Build the model
clf_enn = RandomForestClassifier(oob_score=True)
clf_enn.fit(X_enn,y_enn)
preditions_enn=clf_enn.predict(X_test)
# Learning curve
train_sizes,train_scores,test_scores=learning_curve(estimator=clf_enn,
                                X=X_enn,y=y_enn,
                                train_sizes=np.linspace(0.05,1,10),
                                 cv=10, n_jobs=1,random_state=0)
train_mean_enn=np.mean(train_scores,axis=1)
test_mean_enn=np.mean(test_scores,axis=1)
train_std_enn=np.std(train_scores,axis=1)
# The test spread must come from the test scores (was train_scores).
test_std_enn=np.std(test_scores,axis=1)
###################################################################
##SMOTE+ENN
Esempio n. 30
0
    def pre_process(train_index, test_index):
        """Prepare one train/test split: class balancing, then feature selection.

        The rows are taken from the enclosing scope's ``X_train_all`` and
        ``y_train``.  Balancing (chosen by ``class_balance_method``) is applied
        to the training part only; feature selection (chosen by
        ``feature_selection_method``) is fitted on the training part and
        applied to both parts.

        Returns ``(train_x, train_y, test_x, test_y, selected_features,
        feature_scores)``; ``feature_scores`` stays ``'N/A'`` unless a
        relief-based method is used.
        """
        train_x, test_x = X_train_all[train_index], X_train_all[test_index]
        train_y, test_y = y_train[train_index], y_train[test_index]

        # --- Class balance on the training split -------------------------
        balance = class_balance_method

        if balance == 'rand_under':
            sampler = RandomUnderSampler(sampling_strategy='majority',
                                         random_state=0)
            train_x, train_y = sampler.fit_sample(train_x, train_y)

        # Tomek links run alone or as the first stage of a two-stage clean.
        if balance in ('tomek', 'tomek_enn', 'tomek_renn'):
            sampler = TomekLinks(random_state=0)
            train_x, train_y = sampler.fit_sample(train_x, train_y)

        # (Repeated) ENN runs alone or as the second stage.
        if balance in ('enn', 'tomek_enn'):
            sampler = EditedNearestNeighbours(n_neighbors=5,
                                              random_state=0,
                                              n_jobs=1)
            train_x, train_y = sampler.fit_sample(train_x, train_y)
        elif balance in ('renn', 'tomek_renn'):
            sampler = RepeatedEditedNearestNeighbours(n_neighbors=5,
                                                      random_state=0,
                                                      n_jobs=1)
            train_x, train_y = sampler.fit_sample(train_x, train_y)

        # --- Feature selection on the training split ---------------------
        # Scores are only produced by the relief-based selectors.
        feature_scores = 'N/A'
        method = feature_selection_method

        if method == 'no':
            selected_features = X_df.columns

        elif method in ('chi2', 'anovaF'):
            univariate = chi2_fs if method == 'chi2' else anova_fs
            selected_features, _, train_x, test_x = univariate(
                X_df, train_x, test_x, train_y, p_val_thresh)

        elif method in ('reliefF', 'multisurf'):
            relief_based = relieff_fs if method == 'reliefF' else multisurf_fs
            selected_features, feature_scores, train_x, test_x = relief_based(
                X_df, train_x, test_x, train_y)

        elif method in ('chi2_reliefF', 'chi2_multisurf',
                        'anova_reliefF', 'anova_multisurf'):
            # Two stages: a univariate pre-filter, then a relief-based
            # selector applied to the pre-filtered data.
            if method in ('chi2_reliefF', 'chi2_multisurf'):
                prefilter = chi2_fs
            else:
                prefilter = anova_fs
            if method in ('chi2_reliefF', 'anova_reliefF'):
                relief_based = relieff_fs
            else:
                relief_based = multisurf_fs

            _, filtered_df, filtered_train, filtered_test = prefilter(
                X_df, train_x, test_x, train_y, p_val_thresh)
            selected_features, feature_scores, train_x, test_x = relief_based(
                filtered_df, filtered_train, filtered_test, train_y)

        return train_x, train_y, test_x, test_y, selected_features, feature_scores
Esempio n. 31
0
# Collapse the individual attack names in y_3 into their attack categories.
# NOTE(review): the 'U2R' category is listed in `classes` below but is not
# mapped here -- presumably done earlier in the script; confirm.
for i in r2l:
    y_3[y_3 == i] = 'R2L'  # r2l
for i in dos:
    y_3[y_3 == i] = 'DOS'  # dos
for i in probe:
    y_3[y_3 == i] = 'Probing'  # probe
y_3[y_3 == "normal."] = 'Normal'  # normal
y_3 = np.array(y_3)  # convert to a numpy array
classes=['Normal','Probing','DOS','U2R','R2L']
colors=['blue','red','y','m','g']
# Undersampling with ENN
from sklearn.manifold import TSNE
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from imblearn.under_sampling import EditedNearestNeighbours
# Neighbourhood sizes to try; one figure is produced per value.
params=[3,6,9,12,15,18]
for i in params:
    # NOTE: despite the variable name, ENN is an undersampler.
    oversampler = EditedNearestNeighbours(random_state=42, n_neighbors=i)
    X_enn, y_e = oversampler.fit_sample(X_3, y_3)
    # Standardize the resampled features
    scaler=StandardScaler().fit(X_enn)
    X_e=scaler.transform(X_enn)
    # Visualization: only the 20% test split is embedded with t-SNE
    X_train_2,X_test_2,y_train_2,y_test_2=train_test_split(X_e,y_e,test_size=0.2,random_state=0)  # split the samples
    X_embedded = TSNE(n_components=2).fit_transform(X_test_2)
    plt.figure()
    plt.title("ENN")
    # One scatter series per class, each in its own colour.
    for index,label,color in zip(range(len(classes)),classes,colors):
        plt.scatter(X_embedded[y_test_2==label,0],X_embedded[y_test_2==label,1],label=classes[index],c=color)
    plt.legend(loc='best')
# All figures are shown together once the loop has finished.
plt.show()
# # print('Resampled dataset shape {}'.format(Counter(sm_target)))
# # End: oversampling using SMOTE

# Evaluate on the SMOTE-resampled data (sm_data / sm_target are produced
# before this fragment).
classfication(sm_data, sm_target, "Data after oversampling using SMOTE")

# # Start: undersampling using TomekLinks
# The cleaning is applied to ada_data / ada_target, which are defined
# before this fragment.
tlink = TomekLinks(random_state=42, ratio='auto')
tl_data, tl_target = tlink.fit_sample(ada_data, ada_target)
print('Resampled dataset shape {}'.format(Counter(tl_target)))
# # # End: undersampling using TomekLinks

classfication(tl_data, tl_target, "ADASYN Data after cleaning using TomekLink")

# # Start: undersampling using EditedNearestNeighbours
# NOTE(review): the original marker named CondensedNearestNeighbours, but
# the sampler constructed here is EditedNearestNeighbours.
enn = EditedNearestNeighbours(random_state=42, n_neighbors=1, ratio='auto')
enn_data, enn_target = enn.fit_sample(X_data, target)
# # print('Resampled dataset shape {}'.format(Counter(enn_target)))
# # End: undersampling using EditedNearestNeighbours

classfication(
    enn_data, enn_target,
    "使用随机采样器进行欠采样后的数据 - Data after under sampling using Edited Nearest Neighbors"
)

# Start: undersampling using RandomUnderSampler
rus = RandomUnderSampler(random_state=42)
rus_data, rus_target = rus.fit_sample(X_data, target)
print('Resampled dataset shape {}'.format(Counter(rus_target)))
# End: undersampling using RandomUnderSampler

classfication(
Esempio n. 33
0
                        cv=10,
                        scoring=('roc_auc', 'average_precision'))
scores['test_roc_auc'].mean(), scores['test_average_precision'].mean()
# (0.9518183780276207, 0.6767076447148238)

######### Edited Nearest Neighbours (ENN) #########

# ENN removes training samples based on their nearest neighbours:
#   - `mode`: drop a sample when it is misclassified by KNN;
#   - `all`:  drop a sample when any of its neighbours belongs to another class.
# In effect this cleans up outliers and class boundaries.

from imblearn.under_sampling import EditedNearestNeighbours

# ENN with kind_sel left at the library default.
enn = EditedNearestNeighbours(n_neighbors=5)

X_train_enn, y_train_enn = enn.fit_sample(X_train, y_train)
# Same neighbourhood size, but with the `mode` selection rule made explicit.
enn_mode = EditedNearestNeighbours(kind_sel="mode", n_neighbors=5)
X_train_enn_mode, y_train_enn_mode = enn_mode.fit_sample(X_train, y_train)
# Report how many samples remain and the per-class counts after cleaning.
# NOTE(review): np.bincount requires non-negative integer labels — confirm
# y_train is encoded that way.
print(X_train_enn_mode.shape)
print(np.bincount(y_train_enn_mode))

### Pipeline method

# Chain the sampler and the classifier into one estimator.
# NOTE(review): make_imb_pipeline is presumably an alias of
# imblearn.pipeline.make_pipeline — verify against the file's imports.
enn_pipe = make_imb_pipeline(EditedNearestNeighbours(n_neighbors=5),
                             LogisticRegression())

# 10-fold cross-validation scored with ROC AUC and average precision.
scores = cross_validate(enn_pipe,
                        X_train,
                        y_train,
                        cv=10,
                        scoring=('roc_auc', 'average_precision'))
Esempio n. 34
0
# Fit `pca` on X and project it for plotting; columns 0 and 1 of the
# projection are used as the plot axes below.
X_vis = pca.fit_transform(X)

# Four side-by-side subplots, unpacking the axes array immediately
f, (ax1, ax2, ax3, ax4) = plt.subplots(1, 4)

# Panel 1: the original data, one scatter series per class label (0 and 1).
ax1.scatter(X_vis[y == 0, 0], X_vis[y == 0, 1], label="Class #0", alpha=.5,
            edgecolor=almost_black, facecolor=palette[0], linewidth=0.15)
ax1.scatter(X_vis[y == 1, 0], X_vis[y == 1, 1], label="Class #1", alpha=.5,
            edgecolor=almost_black, facecolor=palette[2], linewidth=0.15)
ax1.set_title('Original set')

# Apply the ENN
print('ENN')
enn = EditedNearestNeighbours()
X_resampled, y_resampled = enn.fit_sample(X, y)
# Reuse the projection fitted on the full data so both panels share axes.
X_res_vis = pca.transform(X_resampled)
# Percentage of samples removed by the resampling. The original literal used
# '\%', an invalid escape sequence that printed a stray backslash.
print('Reduced {:.2f}%'.format(100 * (1 - float(len(X_resampled)) / len(X))))

# Panel 2: the data that survived ENN.
ax2.scatter(X_res_vis[y_resampled == 0, 0], X_res_vis[y_resampled == 0, 1],
            label="Class #0", alpha=.5, edgecolor=almost_black,
            facecolor=palette[0], linewidth=0.15)
ax2.scatter(X_res_vis[y_resampled == 1, 0], X_res_vis[y_resampled == 1, 1],
            label="Class #1", alpha=.5, edgecolor=almost_black,
            facecolor=palette[2], linewidth=0.15)
ax2.set_title('Edited nearest neighbours')

# Apply the RENN (this rebinds X_resampled / y_resampled)
print('RENN')
renn = RepeatedEditedNearestNeighbours()
X_resampled, y_resampled = renn.fit_sample(X, y)
Esempio n. 35
0
# # In[135]:
#
#
# smotenc+enn
# Feature matrix (the 21 columns listed below) and 'click' labels taken from
# the df_smotenc frame.
X_smote = np.array(df_smotenc[[
    'C1', 'banner_pos', 'site_id', 'site_domain', 'site_category', 'app_id',
    'app_domain', 'app_category', 'device_id', 'device_ip', 'device_model',
    'device_type', 'device_conn_type', 'C14', 'C15', 'C16', 'C17', 'C18',
    'C19', 'C20', 'C21'
]])
Y_smote = list(df_smotenc['click'])
#
from imblearn.under_sampling import EditedNearestNeighbours

# Clean the data with ENN using the library's default settings.
# NOTE(review): this resamples X_smotenc / y_smotenc, not the X_smote /
# Y_smote built just above, which are unused within this fragment — confirm
# which pair is intended.
enn = EditedNearestNeighbours()
X_resampled, y_resampled = enn.fit_sample(X_smotenc, y_smotenc)
#
# # In[52]:
#
#
# df_smotenc = pd.DataFrame(X_smotenc,
#                           columns=column1)
# df_smotenc = pd.concat([df_smotenc, pd.DataFrame(y_smotenc, columns=['click'])], axis=1)
# for i in column1:
#     df_smotenc[i] = df_smotenc[i].astype(int)
#
# # In[53]:
#
#
# df_smotenc.head()
#
def test_deprecation_random_state():
    """Check that fitting an ENN built with ``random_state`` warns.

    The sampler must emit a ``DeprecationWarning`` whose message mentions
    that 'random_state' is deprecated from 0.4.
    """
    expected_message = "'random_state' is deprecated from 0.4"
    sampler = EditedNearestNeighbours(random_state=0)
    with warns(DeprecationWarning, match=expected_message):
        sampler.fit_sample(X, Y)
Esempio n. 37
0
                    random_state=np.random.randint(100),
                    kind='regular',
                    n_jobs=-1)
# Resample the training set (NaNs replaced by 0) with `oversampler`, which is
# constructed by the statement preceding this fragment.
os_X_train, os_y_train = oversampler.fit_sample(X_train.fillna(0), y_train)

## ADASYN is very slow to run ###
# Features are the `feature` columns of `train` with NaNs replaced by 0;
# labels are the 'y' column cast to int.
X_resampled_adasyn, y_resampled_adasyn = ADASYN(
    sampling_strategy=0.2,
    n_jobs=-1).fit_sample(train.loc[:, feature].fillna(0).values,
                          train["y"].values.astype('int'))

### Remove some noisy points near the class boundary ###
from imblearn.under_sampling import EditedNearestNeighbours

# NOTE(review): X and y are not defined within this fragment, and the
# resampled outputs are not used by the DMatrix construction below (which
# reads `train` / `val` directly) — confirm this is intended.
enn = EditedNearestNeighbours(random_state=0)
X_resampled, y_resampled = enn.fit_sample(X, y)

# xgboost inputs: `feature` columns cast to float, 'y' labels cast to int.
dtrain = xgb.DMatrix(data=train.loc[:, feature].astype('float'),
                     label=train['y'].astype('int'))
dval = xgb.DMatrix(data=val.loc[:, feature].astype('float'),
                   label=val['y'].astype('int'))
# Print a summary of the selected training columns.
train.loc[:, feature].info(null_counts=True)

params = {
    'booster': 'gbtree',
    'objective': 'binary:logistic',
    'eval_metric': 'auc',
    'max_depth': 6,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'min_child_weight': 2,
Esempio n. 38
0
class SMOTEENN(SMOTEENN):
    """
    SMOTEENN variant that cleans with ENN first and over-samples with SMOTE afterwards.

    The class subclasses the previously bound ``SMOTEENN`` (described below as
    imblearn.combine.SMOTEENN) and rebinds the same name, so code that runs
    after this definition gets this variant. Differences visible here:

    - ``_sample`` runs the ENN step before the SMOTE step;
    - when no ``smote`` is supplied, a SMOTE with ``k_neighbors=3`` is built;
    - when no ``enn`` is supplied, an EditedNearestNeighbours with
      ``ratio="all"`` and ``kind_sel="mode"`` is built.
    """

    def __init__(self, ratio='auto', random_state=None, smote=None, enn=None):
        """
        Creates an object of the imblearn.combine.SMOTEENN class.

        :param ratio: str, dict, or callable, optional (default='auto')
               Ratio to use for resampling the data set.
               - If "str", has to be one of: (i) 'minority': resample the minority class;
                 (ii) 'majority': resample the majority class,
                 (iii) 'not minority': resample all classes apart from the minority class,
                 (iv) 'all': resample all classes, and (v) 'auto': corresponds to 'all' for over-sampling
                 methods and 'not_minority' for under-sampling methods. The classes targeted will be over-sampled or
                 under-sampled to achieve an equal number of samples with the majority or minority class.
               - If "dict", the keys correspond to the targeted classes. The values correspond to the desired number
                 of samples.
               - If callable, function taking "y" and returning a "dict". The keys correspond to the targeted classes.
                 The values correspond to the desired number of samples.
        :param random_state: int, RandomState instance or None, optional (default=None)
               If int, random_state is the seed used by the random number generator; If RandomState instance,
               random_state is the random number generator; If None, the random number generator is the RandomState
               instance used by 'np.random'.
        :param smote: object, optional (default=None)
               The :class: imblearn.over_sampling.SMOTE object to use. If None, a SMOTE object is created with
               this sampler's ``ratio`` and ``random_state`` and ``k_neighbors=3``.
        :param enn: object, optional (default=None)
               The :class: imblearn.under_sampling.EditedNearestNeighbours object to use. If None, an
               EditedNearestNeighbours object is created with ``ratio="all"``, ``kind_sel="mode"`` and this
               sampler's ``random_state``.
        """
        super(SMOTEENN, self).__init__(ratio=ratio,
                                       random_state=random_state,
                                       smote=smote,
                                       enn=enn)

    def _validate_estimator(self):
        """
        Private function to validate SMOTE and ENN objects.

        Sets ``self.smote_`` and ``self.enn_`` to the user-supplied objects, or to
        freshly built fallbacks when the corresponding argument is None.

        :raises ValueError: if ``smote`` is not a SMOTE instance or ``enn`` is not an
                EditedNearestNeighbours instance.
        :return: None
        """

        if self.smote is None:
            # Fallback SMOTE: inherits ratio/random_state, fixed k_neighbors=3.
            self.smote_ = SMOTE(ratio=self.ratio,
                                k_neighbors=3,
                                random_state=self.random_state)
        elif isinstance(self.smote, SMOTE):
            self.smote_ = self.smote
        else:
            # The original message lacked the space between the two sentences.
            raise ValueError('smote needs to be a SMOTE object.'
                             ' Got {} instead.'.format(type(self.smote)))

        if self.enn is None:
            # Fallback ENN: cleans every class using the 'mode' selection rule.
            self.enn_ = EditedNearestNeighbours(ratio="all",
                                                kind_sel="mode",
                                                random_state=self.random_state)
        elif isinstance(self.enn, EditedNearestNeighbours):
            self.enn_ = self.enn
        else:
            raise ValueError('enn needs to be an EditedNearestNeighbours.'
                             ' Got {} instead.'.format(type(self.enn)))

    def fit(self, X, y):
        """
        Find the classes statistics before performing sampling.

        Delegates unchanged to the parent class implementation.

        :param X: {array-like, sparse matrix}, shape (n_samples, n_features)
               Matrix containing the data which have to be sampled.
        :param y: array-like, shape (n_samples,)
               Corresponding label for each sample in X.
        :return: object; Return self

        """
        return super(SMOTEENN, self).fit(X, y)

    def _sample(self, X, y):
        """
        Edited to apply ENN first to remove problematic samples and then apply SMOTE.

        :param X: {array-like, sparse matrix}, shape (n_samples, n_features)
               Matrix containing the data which have to be sampled.
        :param y: array-like, shape (n_samples,)
               Corresponding label for each sample in X.
        :return: X_resampled, y_resampled
        """
        self._validate_estimator()

        # Clean first, then over-sample the cleaned data.
        X_res, y_res = self.enn_.fit_sample(X, y)
        return self.smote_.fit_sample(X_res, y_res)