def test_rest(x, y, c=0, ratio='auto'):
    """Resample (x, y) with the sampler selected by ``c`` and return the result.

    Parameters
    ----------
    x, y : array-like
        Feature matrix and target vector to resample.
    c : int, optional (default=0)
        Sampler selector: 0 random under-sampling, 1 Tomek links,
        2 cluster centroids, 3/4/5 NearMiss versions 1-3,
        6 neighbourhood cleaning rule, 7 random over-sampling,
        8 SMOTE+Tomek, 9 SMOTE+ENN; any other value falls through
        to EasyEnsemble.
    ratio : str or float, optional (default='auto')
        Resampling ratio forwarded to the chosen sampler.

    Returns
    -------
    x, y : the resampled feature matrix and target vector.

    Notes
    -----
    Relies on module-level ``indices_support`` and ``verbose`` globals
    (not visible in this chunk) — TODO confirm they are defined.
    """
    # NOTE(review): the original body started with a no-op `c = c`; removed.
    if c == 0:
        print('Random under-sampling')
        US = UnderSampler(indices_support=indices_support,
                          verbose=verbose, ratio=ratio)
        x, y, idx_tmp = US.fit_transform(x, y)
        print('Indices selected')
        print(idx_tmp)
    elif c == 1:
        print('Tomek links')
        TL = TomekLinks(verbose=verbose, ratio=ratio)
        x, y = TL.fit_transform(x, y)
    elif c == 2:
        print('Clustering centroids')
        CC = ClusterCentroids(verbose=verbose, ratio=ratio)
        x, y = CC.fit_transform(x, y)
    elif c == 3:
        print('NearMiss-1')
        NM1 = NearMiss(version=1, indices_support=indices_support,
                       verbose=verbose, ratio=ratio)
        x, y, idx_tmp = NM1.fit_transform(x, y)
        print('Indices selected')
        print(idx_tmp)
    elif c == 4:
        print('NearMiss-2')
        NM2 = NearMiss(version=2, indices_support=indices_support,
                       verbose=verbose, ratio=ratio)
        x, y, idx_tmp = NM2.fit_transform(x, y)
        print('Indices selected')
        print(idx_tmp)
    elif c == 5:
        print('NearMiss-3')
        NM3 = NearMiss(version=3, indices_support=indices_support,
                       verbose=verbose, ratio=ratio)
        x, y, idx_tmp = NM3.fit_transform(x, y)
        print('Indices selected')
        print(idx_tmp)
    elif c == 6:
        print('Neighboorhood Cleaning Rule')
        NCR = NeighbourhoodCleaningRule(indices_support=indices_support,
                                        verbose=verbose, ratio=ratio)
        x, y, idx_tmp = NCR.fit_transform(x, y)
        print('Indices selected')
        print(idx_tmp)
    elif c == 7:
        print('Random over-sampling')
        OS = OverSampler(verbose=verbose, ratio=ratio)
        x, y = OS.fit_transform(x, y)
    elif c == 8:
        print('SMOTE Tomek links')
        STK = SMOTETomek(verbose=verbose, ratio=ratio)
        x, y = STK.fit_transform(x, y)
    elif c == 9:
        print('SMOTE ENN')
        SENN = SMOTEENN(verbose=verbose, ratio=ratio)
        x, y = SENN.fit_transform(x, y)
    else:
        print('EasyEnsemble')
        EE = EasyEnsemble(verbose=verbose, ratio=ratio)
        x, y = EE.fit_transform(x, y)
    return x, y
def test_rest(x, y):
    """Run every available sampler once over (x, y), printing progress.

    Samplers that support index tracking additionally print the indices
    they kept.  Relies on module-level ``indices_support`` and ``verbose``.
    """
    # (label, lazy constructor, whether fit_transform returns kept indices)
    recipes = [
        ('Random under-sampling',
         lambda: UnderSampler(indices_support=indices_support, verbose=verbose),
         True),
        ('Tomek links', lambda: TomekLinks(verbose=verbose), False),
        ('Clustering centroids', lambda: ClusterCentroids(verbose=verbose), False),
        ('NearMiss-1',
         lambda: NearMiss(version=1, indices_support=indices_support, verbose=verbose),
         True),
        ('NearMiss-2',
         lambda: NearMiss(version=2, indices_support=indices_support, verbose=verbose),
         True),
        ('NearMiss-3',
         lambda: NearMiss(version=3, indices_support=indices_support, verbose=verbose),
         True),
        ('Neighboorhood Cleaning Rule',
         lambda: NeighbourhoodCleaningRule(indices_support=indices_support,
                                           verbose=verbose),
         True),
        ('Random over-sampling', lambda: OverSampler(verbose=verbose), False),
        ('SMOTE Tomek links', lambda: SMOTETomek(verbose=verbose), False),
        ('SMOTE ENN', lambda: SMOTEENN(verbose=verbose), False),
        ('EasyEnsemble', lambda: EasyEnsemble(verbose=verbose), False),
    ]
    for label, build, tracks_indices in recipes:
        print(label)
        # Construct lazily so any constructor side effects follow the label.
        sampler = build()
        if tracks_indices:
            _, _, kept = sampler.fit_transform(x, y)
            print('Indices selected')
            print(kept)
        else:
            sampler.fit_transform(x, y)
def test_tl_fit_transform():
    """Check that fit_transform reproduces the stored ground-truth resampling."""
    tl = TomekLinks(random_state=RND_SEED)
    x_res, y_res = tl.fit_transform(X, Y)
    # Ground-truth arrays live next to this test module.
    data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'data')
    expected_x = np.load(os.path.join(data_dir, 'tl_x.npy'))
    expected_y = np.load(os.path.join(data_dir, 'tl_y.npy'))
    assert_array_equal(x_res, expected_x)
    assert_array_equal(y_res, expected_y)
def test_tl_fit():
    """Verify that fitting records the class statistics of (X, Y)."""
    tl = TomekLinks(random_state=RND_SEED)
    tl.fit(X, Y)
    # Class 0 is the minority (500 samples), class 1 the majority (4500).
    assert_equal(tl.min_c_, 0)
    assert_equal(tl.maj_c_, 1)
    for label, count in ((0, 500), (1, 4500)):
        assert_equal(tl.stats_c_[label], count)
def test_tl_fit_transform_with_indices():
    """Check fit_transform with index support against the stored ground truth."""
    tl = TomekLinks(return_indices=True, random_state=RND_SEED)
    x_res, y_res, kept_idx = tl.fit_transform(X, Y)
    # Ground-truth arrays live next to this test module.
    data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'data')
    checks = ((x_res, 'tl_x.npy'),
              (y_res, 'tl_y.npy'),
              (kept_idx, 'tl_idx.npy'))
    for actual, fname in checks:
        assert_array_equal(actual, np.load(os.path.join(data_dir, fname)))
def balance_data_undersampling_tomek_links(self):
    '''Balance the data by removing Tomek links (under-sampling).

    Flattens ``self.y`` to 1-D for the sampler, then stores the
    resampled arrays back on ``self`` with ``y`` restored to a
    column-vector (n, 1) layout.
    '''
    # NOTE(review): the original docstring said "clustering centroids",
    # but the sampler used here is TomekLinks.
    x = self.X
    y = self.y
    # Flatten in place to the (n,) shape the sampler expects.
    # (Original wrote `(len(self.y))`, which is just an int, not a tuple;
    # numpy accepts both — the explicit tuple makes the intent clear.)
    y.shape = (len(self.y),)
    verbose = True
    TL = TomekLinks(verbose=verbose)
    tlx, tly = TL.fit_transform(x, y)
    self.X = tlx
    self.y = tly
    self.y.shape = (len(self.y), 1)  # restore the (n, 1) column layout
def test_tl_transform_wt_fit():
    """Calling transform on an unfitted object must raise a RuntimeError."""
    unfitted = TomekLinks(random_state=RND_SEED)
    assert_raises(RuntimeError, unfitted.transform, X, Y)
def test_tl_fit_single_class():
    """Fitting on a single-class target must raise a RuntimeError."""
    tl = TomekLinks(random_state=RND_SEED)
    # An all-zero target contains only one class, which is invalid input.
    degenerate_y = np.zeros(X.shape[0])
    assert_raises(RuntimeError, tl.fit, X, degenerate_y)
def test_tl_init():
    """Check the attribute defaults set at construction time."""
    verbose = True
    tl = TomekLinks(random_state=RND_SEED, verbose=verbose)
    # Fit-time statistics start empty; configuration is stored verbatim.
    expectations = (('n_jobs', -1),
                    ('rs_', RND_SEED),
                    ('verbose', verbose),
                    ('min_c_', None),
                    ('maj_c_', None),
                    ('stats_c_', {}))
    for attr, expected in expectations:
        assert_equal(getattr(tl, attr), expected)
def balance_data_oversampling(self, ratio = 2, balance_type = "OverSampler"):
    '''Balance the training data with the sampler named by ``balance_type``.

    Recognised values: "OverSampler", the three SMOTE_* variants; any
    other value falls back to Tomek-link under-sampling.
    '''
    verbose = True
    # Map the SMOTE variants onto the `kind` argument they share.
    smote_kinds = {
        'SMOTE_borderline1': 'borderline1',
        'SMOTE_regular': 'regular',
        'SMOTE_borderline2': 'borderline2',
    }
    if balance_type == "OverSampler":
        sampler = OverSampler(verbose=verbose, ratio=ratio)
    elif balance_type in smote_kinds:
        sampler = SMOTE(kind=smote_kinds[balance_type],
                        verbose=verbose, ratio=ratio)
    else:
        sampler = TomekLinks(verbose=verbose)
    self.train_x, self.train_y = sampler.fit_transform(self.train_x,
                                                       self.train_y)
def __get_sample_transformed_examples(sample_type, train_x, train_y, ratio):
    """Resample (train_x, train_y) with the technique named by ``sample_type``.

    Parameters
    ----------
    sample_type : one of the module-level sampler constants
        (SMOTE_REG, SMOTE_SVM, ..., NEARMISS).
    train_x, train_y : array-like
        Training features and labels to resample.
    ratio : str or float
        Resampling ratio forwarded to samplers that accept one.

    Returns
    -------
    The resampled (x, y) pair, or the original data unchanged when
    ``sample_type`` is not recognised.
    """
    verbose = True
    # NOTE(review): removed the dead `sampler = None` — every branch either
    # assigns `sampler` or returns early.
    if sample_type == SMOTE_REG:
        sampler = SMOTE(kind='regular', verbose=verbose, ratio=ratio, k=15)
    elif sample_type == SMOTE_SVM:
        # TODO: Make this configurable?
        svm_args = {'class_weight': 'balanced'}
        sampler = SMOTE(kind='svm', ratio=ratio, verbose=verbose, k=15,
                        **svm_args)
    elif sample_type == SMOTE_BORDERLINE_1:
        sampler = SMOTE(kind='borderline1', ratio=ratio, verbose=verbose)
    elif sample_type == SMOTE_BORDERLINE_2:
        sampler = SMOTE(kind='borderline2', ratio=ratio, verbose=verbose)
    elif sample_type == SMOTE_ENN:
        sampler = SMOTEENN(ratio=ratio, verbose=verbose, k=15)
    elif sample_type == SMOTE_TOMEK:
        sampler = SMOTETomek(ratio=ratio, verbose=verbose, k=15)
    elif sample_type == UNDERSAMPLER:
        sampler = UnderSampler(ratio=ratio, verbose=verbose,
                               replacement=False, random_state=17)
    elif sample_type == ADASYN_SAMPLER:
        sampler = ADASYN(k=15, imb_threshold=0.6, ratio=ratio)
    elif sample_type == TOMEK_LINKS:
        sampler = TomekLinks()
    elif sample_type == CLUSTER_CENTROIDS:
        sampler = ClusterCentroids(ratio=ratio)
    elif sample_type == NEARMISS:
        sampler = NearMiss(ratio=ratio)
    else:
        # Fixed the typo ("Unrecoqnized") and parenthesised the prints:
        # identical output under Python 2, and now also parses under
        # Python 3, matching the rest of the file.
        print("Unrecognized sample technique: " + sample_type)
        print("Returning original data")
        return train_x, train_y
    return sampler.fit_transform(train_x, train_y)
# NOTE(review): truncated fragment — it opens mid-way through a
# make_classification(...) argument list and ends mid ax1.scatter(...)
# call, so it cannot be reconstructed safely here; left byte-identical.
# A complete version of the same example appears immediately below.
weights=[0.1, 0.9], n_informative=3, n_redundant=1, flip_y=0, n_features=20, n_clusters_per_class=1, n_samples=5000, random_state=10) # Instanciate a PCA object for the sake of easy visualisation pca = PCA(n_components=2) # Fit and transform x to visualise inside a 2D feature space X_vis = pca.fit_transform(X) # Apply the random under-sampling tl = TomekLinks() X_resampled, y_resampled = tl.fit_transform(X, y) X_res_vis = pca.transform(X_resampled) # Two subplots, unpack the axes array immediately f, (ax1, ax2) = plt.subplots(1, 2) ax1.scatter(X_vis[y == 0, 0], X_vis[y == 0, 1], label="Class #0", alpha=0.5, edgecolor=almost_black, facecolor=palette[0], linewidth=0.15) ax1.scatter(X_vis[y == 1, 0], X_vis[y == 1, 1],
from unbalanced_dataset.under_sampling import TomekLinks

# Build a synthetic two-class problem with a 1:9 class imbalance.
X, y = make_classification(n_classes=2, class_sep=2, weights=[0.1, 0.9],
                           n_informative=3, n_redundant=1, flip_y=0,
                           n_features=20, n_clusters_per_class=1,
                           n_samples=5000, random_state=10)

# Project to 2-D with PCA purely for visualisation purposes.
pca = PCA(n_components=2)
X_vis = pca.fit_transform(X)

# Remove Tomek links, then project the resampled set with the same PCA.
tl = TomekLinks()
X_resampled, y_resampled = tl.fit_transform(X, y)
X_res_vis = pca.transform(X_resampled)

# Side-by-side scatter plots: original set on the left.
f, (ax1, ax2) = plt.subplots(1, 2)
scatter_style = dict(alpha=0.5, edgecolor=almost_black, linewidth=0.15)
ax1.scatter(X_vis[y == 0, 0], X_vis[y == 0, 1], label="Class #0",
            facecolor=palette[0], **scatter_style)
ax1.scatter(X_vis[y == 1, 0], X_vis[y == 1, 1], label="Class #1",
            facecolor=palette[2], **scatter_style)
ax1.set_title('Original set')
ax2.scatter(X_res_vis[y_resampled == 0, 0], X_res_vis[y_resampled == 0, 1],
            label="Class #0", facecolor=palette[0], **scatter_style)
#parameters for decision tree param_grid_DT = dict(max_features=["auto","sqrt","log2",None],class_weight=["balanced", None] , presort=[True, False]) #stratified shuffle split X_train, X_test, y_train, y_test = stratify_and_shuffle(scaled_data, target, 0.99,0.01) verbose= True #undersample the scaled data #US = UnderSampler(verbose=True) US= TomekLinks(verbose=True) #US = ClusterCentroids(verbose=True) #US = NearMiss(version=1, verbose=verbose) #US = OneSidedSelection(verbose=verbose) #US = EasyEnsemble(verbose=verbose) #US = NeighbourhoodCleaningRule(verbose=verbose) #US = BalanceCascade(verbose=verbose) #svm_args={'class_weight': 'auto'} #US = SMOTE(kind='svm', verbose=verbose, **svm_args) ##US = SMOTE(kind='regular', verbose=verbose) #US = SMOTETomek(verbose =verbose) usx, usy = US.fit_transform((X_train), y_train) #usx = X_train #usy = y_train