Esempio n. 1
0
def under_sampling(X, y, method):
    """Under-sample ``(X, y)`` with the sampler named by ``method``.

    Parameters
    ----------
    X
        Feature data, passed unchanged to the sampler's ``fit_resample``.
    y
        Target labels, passed unchanged to the sampler's ``fit_resample``.
    method : str
        One of 'ClusterCentroids', 'RandomUnderSampler', 'NearMiss',
        'EditedNearestNeighbours', 'RepeatedEditedNearestNeighbours',
        'AllKNN', 'NeighbourhoodCleaningRule' or 'OneSidedSelection'.

    Returns
    -------
    tuple
        ``(X_resampled, y_resampled)`` as returned by ``fit_resample``.

    Raises
    ------
    ValueError
        If ``method`` is not one of the supported names (previously this
        surfaced as an ``UnboundLocalError`` at the ``return`` statement).
    """
    # Every sampler is built with its default configuration.
    if method == 'ClusterCentroids':
        model = ClusterCentroids()
    elif method == 'RandomUnderSampler':
        model = RandomUnderSampler()
    elif method == 'NearMiss':
        model = NearMiss()
    elif method == 'EditedNearestNeighbours':
        model = EditedNearestNeighbours()
    elif method == 'RepeatedEditedNearestNeighbours':
        model = RepeatedEditedNearestNeighbours()
    elif method == 'AllKNN':
        model = AllKNN()
    elif method == 'NeighbourhoodCleaningRule':
        model = NeighbourhoodCleaningRule()
    elif method == 'OneSidedSelection':
        model = OneSidedSelection()
    else:
        raise ValueError(
            'Unknown under-sampling method: {!r}'.format(method))

    # The resampling call is identical for every sampler, so do it once.
    X_resampled, y_resampled = model.fit_resample(X, y)
    return X_resampled, y_resampled
Esempio n. 2
0
def test_allknn_fit_resample_with_indices():
    """Compare resampled data, labels and kept indices with reference values."""
    # Reference output the sampler is expected to reproduce.
    expected_X = np.array([
        [-0.53171468, -0.53735182], [-0.88864036, -0.33782387],
        [-0.46226554, -0.50481004], [-0.34474418, 0.21969797],
        [1.02956816, 0.36061601], [1.12202806, 0.33811558],
        [-1.10146139, 0.91782682], [0.73489726, 0.43915195],
        [0.50307437, 0.498805], [0.84929742, 0.41042894],
        [0.62649535, 0.46600596], [0.98382284, 0.37184502],
        [0.69804044, 0.44810796], [0.04296502, -0.37981873],
        [0.28294738, -1.00125525], [0.34218094, -0.58781961],
        [0.2096964, -0.61814058], [1.59068979, -0.96622933],
        [0.73418199, -0.02222847], [0.79270821, -0.41386668],
        [1.16606871, -0.25641059], [1.0304995, -0.16955962],
        [0.48921682, -1.38504507], [-0.03918551, -0.68540745],
        [0.24991051, -1.00864997], [0.80541964, -0.34465185],
        [0.1732627, -1.61323172],
    ])
    expected_y = np.array([
        0, 0, 0, 0,
        1, 1, 1, 1, 1, 1, 1, 1, 1,
        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    ])
    expected_idx = np.array([
        6, 13, 32, 39,
        4, 5, 14, 16, 22, 23, 24, 30, 37,
        2, 11, 12, 17, 20, 21, 25, 26, 28, 31, 33, 34, 35, 36,
    ])

    # With return_indices=True the sampler returns a third value.
    sampler = AllKNN(return_indices=True)
    X_res, y_res, kept_idx = sampler.fit_resample(X, Y)

    assert_allclose(X_res, expected_X, rtol=R_TOL)
    assert_allclose(y_res, expected_y, rtol=R_TOL)
    assert_allclose(kept_idx, expected_idx, rtol=R_TOL)
Esempio n. 3
0
def test_allknn_fit_sample():
    """Check ``fit_sample`` output against hard-coded reference values."""
    # Reference output the sampler is expected to reproduce.
    expected_X = np.array([
        [-0.53171468, -0.53735182], [-0.88864036, -0.33782387],
        [-0.46226554, -0.50481004], [-0.34474418, 0.21969797],
        [1.02956816, 0.36061601], [1.12202806, 0.33811558],
        [-1.10146139, 0.91782682], [0.73489726, 0.43915195],
        [0.50307437, 0.498805], [0.84929742, 0.41042894],
        [0.62649535, 0.46600596], [0.98382284, 0.37184502],
        [0.69804044, 0.44810796], [0.04296502, -0.37981873],
        [0.28294738, -1.00125525], [0.34218094, -0.58781961],
        [0.2096964, -0.61814058], [1.59068979, -0.96622933],
        [0.73418199, -0.02222847], [0.79270821, -0.41386668],
        [1.16606871, -0.25641059], [1.0304995, -0.16955962],
        [0.48921682, -1.38504507], [-0.03918551, -0.68540745],
        [0.24991051, -1.00864997], [0.80541964, -0.34465185],
        [0.1732627, -1.61323172],
    ])
    expected_y = np.array([
        0, 0, 0, 0,
        1, 1, 1, 1, 1, 1, 1, 1, 1,
        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    ])

    # Resample the module-level fixture data.
    sampler = AllKNN(random_state=RND_SEED)
    X_res, y_res = sampler.fit_sample(X, Y)

    assert_allclose(X_res, expected_X, rtol=R_TOL)
    assert_allclose(y_res, expected_y, rtol=R_TOL)
    def fit(self, c_data, x_data, y_data):
        """Train ``self.clf``, optionally resampling the training data first.

        The number of samples after each stage is appended to
        ``self.samplesize`` so the evolution of the training-set size can be
        inspected afterwards.

        Parameters
        ----------
        c_data
            Data handed to ``self.mask_cal`` when ``self.reject_by_calendar``
            is set.
        x_data
            Training features.
        y_data
            Training labels.

        Notes
        -----
        If ``self.use_weights`` is set but the classifier's ``fit`` raises
        ``TypeError`` when given weights, ``self.use_weights`` is switched to
        ``False`` and the classifier is trained without weights.
        """
        # Track the evolution of the size of the training samples.
        self.samplesize = [len(x_data)]

        if self.reject_by_calendar:
            # The calendar mask is still computed, but it is deliberately not
            # applied to x_data / y_data: the original author noted that not
            # filtering these rows might improve second classifier training.
            self.mask_cal(c_data, y_data)
            self.samplesize.append(len(x_data))

        if self.use_resampling:
            # Under-sample.
            resampler = AllKNN()
            x_data, y_data = resampler.fit_sample(x_data, y_data)
            self.samplesize.append(len(x_data))

            # Over-sample.
            resampler = SMOTEENN()
            x_data, y_data = resampler.fit_sample(x_data, y_data)
            self.samplesize.append(len(x_data))

        # Train the classifier only with the (possibly) resampled data.
        if self.use_weights:
            try:
                self.clf.fit(x_data, y_data, self.get_weights(y_data))
            except TypeError:
                # Single-argument print calls behave the same on Python 2
                # and Python 3.
                print("The classifier selected does not admit weights for training samples")
                print("Switching to no weights")
                self.use_weights = False
                self.clf.fit(x_data, y_data)
        else:
            self.clf.fit(x_data, y_data)
Esempio n. 5
0
def test_allknn_sample_wt_fit():
    """Check that calling ``sample`` before fitting raises ``RuntimeError``."""
    # The sampler is never fitted before sample() is invoked.
    unfitted = AllKNN(random_state=RND_SEED)
    assert_raises(RuntimeError, unfitted.sample, X, Y)
Esempio n. 6
0
def test_allknn_fit_resample_with_nn_object():
    """Check resampling when ``n_neighbors`` is a ``NearestNeighbors`` object."""
    # Reference output the sampler is expected to reproduce exactly.
    expected_X = np.array([
        [-0.53171468, -0.53735182], [-0.88864036, -0.33782387],
        [-0.46226554, -0.50481004], [-0.34474418, 0.21969797],
        [-0.12840393, 0.66446571], [1.02956816, 0.36061601],
        [1.12202806, 0.33811558], [-0.35946678, 0.72510189],
        [-1.10146139, 0.91782682], [0.73489726, 0.43915195],
        [-0.28479268, 0.70459548], [0.50307437, 0.498805],
        [0.84929742, 0.41042894], [0.62649535, 0.46600596],
        [0.98382284, 0.37184502], [0.69804044, 0.44810796],
        [1.32319756, -0.13181616], [0.04296502, -0.37981873],
        [0.28294738, -1.00125525], [0.34218094, -0.58781961],
        [0.2096964, -0.61814058], [1.59068979, -0.96622933],
        [0.73418199, -0.02222847], [0.79270821, -0.41386668],
        [1.16606871, -0.25641059], [1.0304995, -0.16955962],
        [0.48921682, -1.38504507], [-0.03918551, -0.68540745],
        [0.24991051, -1.00864997], [0.80541964, -0.34465185],
        [0.1732627, -1.61323172],
    ])
    expected_y = np.array([
        0, 0, 0, 0,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    ])

    # A pre-built neighbours estimator is passed instead of an integer.
    neighbours = NearestNeighbors(n_neighbors=4)
    sampler = AllKNN(n_neighbors=neighbours, kind_sel='mode')
    X_res, y_res = sampler.fit_resample(X, Y)

    assert_array_equal(X_res, expected_X)
    assert_array_equal(y_res, expected_y)
Esempio n. 7
0
def sampler(name, ratio, random_state=0, return_indices=True, **kwargs):
    """Build an under-sampler from its short name.

    Parameters
    ----------
    name : str
        ``"rus"`` (RandomUnderSampler), ``"nm"`` (NearMiss), ``"enn"``
        (EditedNearestNeighbours), ``"renn"``
        (RepeatedEditedNearestNeighbours), ``"allknn"`` (AllKNN) or ``"tl"``
        (TomekLinks).
    ratio
        Forwarded as ``ratio=`` to the ``"rus"`` and ``"nm"`` samplers only;
        the other samplers do not receive it.
    random_state
        Forwarded as ``random_state=`` to every sampler.
    return_indices
        Forwarded as ``return_indices=`` to every sampler.
    **kwargs
        Extra keyword arguments forwarded to the sampler constructor.

    Returns
    -------
    The constructed (unfitted) sampler object.

    Raises
    ------
    ValueError
        If ``name`` is not one of the supported short names.
    """
    # Options shared by every sampler. kwargs cannot collide with these keys
    # because both are named parameters of this function.
    options = dict(kwargs,
                   return_indices=return_indices,
                   random_state=random_state)

    if name == "rus":
        return RandomUnderSampler(ratio=ratio, **options)
    if name == "nm":
        return NearMiss(ratio=ratio, **options)
    if name == "enn":
        return EditedNearestNeighbours(**options)
    if name == "renn":
        return RepeatedEditedNearestNeighbours(**options)
    if name == "allknn":
        return AllKNN(**options)
    if name == "tl":
        return TomekLinks(**options)

    raise ValueError(
        "Unknown sampler name {!r}; expected one of 'rus', 'nm', 'enn', "
        "'renn', 'allknn', 'tl'".format(name))
Esempio n. 8
0
class ResamplingAlgorithms(Enum):
    """Resampling strategies; each value is a ``(display name, sampler)`` pair."""

    # Over-sampling and combined strategies.
    RO = ("Random Over-sampling", RandomOverSampler(random_state=1))
    SMOTE = ("Smote", SMOTE(random_state=1))
    ADASYN = ("ADASYN", ADASYN(random_state=1))
    SMOTE_TL = ('SMOTE+TL', SMOTETomek(random_state=1))
    SMOTE_ENN = ('SMOTE+ENN', SMOTEENN(random_state=1))
    SMOTE_BOOST = ("SMOTEBoost", smote_boost.SMOTEBoost())
    # Under-sampling strategies.
    RU = ("Random Under-sampling", RandomUnderSampler(random_state=1))
    CLUSTERCENTROIDS = ("ClusterCentroids", ClusterCentroids(random_state=1))
    TOMEK_LINKS = ("TomekLinks", TomekLinks())
    NM1 = ("NM1", NearMiss(version=1))
    NM2 = ("NM2", NearMiss(version=2))
    NM3 = ("NM3", NearMiss(version=3))
    CNN = ("CNN", CondensedNearestNeighbour(random_state=1))
    OSS = ("OneSidedSelection", OneSidedSelection(random_state=1))
    ENN = ('ENN', EditedNearestNeighbours())
    NCL = ('NCL', NeighbourhoodCleaningRule())
    IHT = ('IHT', (InstanceHardnessThreshold(random_state=1)))
    RENN = ('RENN', RepeatedEditedNearestNeighbours())
    # The right-hand side still resolves to the sampler class here, because
    # the member name is only bound after the value has been evaluated.
    AllKNN = ('AllKNN', AllKNN())

    @classmethod
    def get_algorithm_by_name(cls, name):
        """Return the member whose display name is ``name``, else ``RO``."""
        for algorithm in ResamplingAlgorithms:
            display_name = algorithm.value[0]
            if display_name == name:
                return algorithm
        # No display name matched: fall back to random over-sampling.
        return ResamplingAlgorithms.RO
Esempio n. 9
0
def under_sample(X, y, sampler="RandomUnderSampler"):
    """Under-sample ``(X, y)`` with the default-configured sampler named ``sampler``.

    Prints the list of supported sampler names, the per-class counts before
    and after resampling, and a completion banner.

    Returns ``(X_resampled, y_resampled)`` from the sampler's
    ``fit_resample``. An unsupported name raises ``KeyError`` from the
    lookup table.
    """
    # Names of all supported samplers, handy for iterating over every one.
    supported_names = [
        'RandomUnderSampler',
        'ClusterCentroids',
        'NearMiss',
        'InstanceHardnessThreshold',
        'CondensedNearestNeighbour',
        'EditedNearestNeighbours',
        'RepeatedEditedNearestNeighbours',
        'AllKNN',
        'NeighbourhoodCleaningRule',
        'OneSidedSelection',
    ]
    print(supported_names)

    # Lookup table used to pick the resampler requested by the caller; every
    # sampler is built with its default parameters.
    available = {
        "RandomUnderSampler": RandomUnderSampler(),
        "ClusterCentroids": ClusterCentroids(),
        "NearMiss": NearMiss(),
        "InstanceHardnessThreshold": InstanceHardnessThreshold(),
        "CondensedNearestNeighbour": CondensedNearestNeighbour(),
        "EditedNearestNeighbours": EditedNearestNeighbours(),
        "RepeatedEditedNearestNeighbours": RepeatedEditedNearestNeighbours(),
        "AllKNN": AllKNN(),
        "NeighbourhoodCleaningRule": NeighbourhoodCleaningRule(),
        "OneSidedSelection": OneSidedSelection(),
    }
    chosen = available[sampler]

    # Class counts before resampling.
    counts_before = sorted(Counter(y).items())
    print("before", counts_before)

    resampled = chosen.fit_resample(X, y)
    X_resampled, y_resampled = resampled

    # Class counts after resampling.
    counts_after = sorted(Counter(y_resampled).items())
    print("after", counts_after)

    print('===' * 4, 'under_sample finished')

    return X_resampled, y_resampled
Esempio n. 10
0
 def getsampler(self, type):
     """Return the sampler registered under the short name ``type``.

     An unsupported name prints a message and terminates via ``exit(1)``.
     For every sampler other than 'none' and 'quality' whose parameters
     include ``random_state``, that parameter is set to ``self.random_state``.
     """
     if type == 'none':
         sampler = NoSampler()
     # Under-sampling strategies.
     elif type == 'randomunder':
         sampler = RandomUnderSampler()
     elif type == 'nearmiss':
         sampler = NearMiss()
     elif type == 'allknn':
         sampler = AllKNN()
     elif type == 'condensednn':
         sampler = CondensedNearestNeighbour()
     elif type == 'editednn':
         sampler = EditedNearestNeighbours()
     elif type == 'repeatededitednn':
         sampler = RepeatedEditedNearestNeighbours()
     elif type == 'tomeklinks':
         sampler = TomekLinks()
     # Over-sampling strategies.
     elif type == 'randomover':
         sampler = RandomOverSampler()
     elif type == 'smote':
         sampler = SMOTE()
     elif type == 'adasyn':
         sampler = ADASYN()
     elif type == 'smotenc':
         sampler = SMOTENC()
     # Project sampler, built from self.n_init.
     elif type == 'quality':
         sampler = QualitySampler(self.n_init)
     else:
         print("Unsupported sampler %s" % type)
         exit(1)

     # 'none' and 'quality' are never asked for their parameters.
     if type not in ('none', 'quality'):
         if 'random_state' in sampler.get_params().keys():
             sampler.set_params(random_state=self.random_state)
     return sampler
Esempio n. 11
0
def test_continuous_error():
    """Check that fitting on a continuous target emits a ``UserWarning``."""
    # 40 evenly spaced values in [0, 1] serve as a continuous target.
    continuous_target = np.linspace(0, 1, 40)
    sampler = AllKNN(random_state=RND_SEED)
    assert_warns(UserWarning, sampler.fit, X, continuous_target)
Esempio n. 12
0
def test_allknn_init():
    """Check the attribute values of a freshly constructed ``AllKNN``."""
    sampler = AllKNN(random_state=RND_SEED)

    # (attribute, expected value) pairs, checked in declaration order.
    expectations = (
        ('n_neighbors', 3),
        ('kind_sel', 'all'),
        ('n_jobs', -1),
        ('random_state', RND_SEED),
    )
    for attribute, expected in expectations:
        assert_equal(getattr(sampler, attribute), expected)
Esempio n. 13
0
def test_all_knn_allow_minority():
    """Check that ``allow_minority=True`` keeps fewer samples than the default."""
    # Three-class synthetic problem with 20% / 30% / 50% class weights.
    dataset_options = dict(
        n_samples=10000,
        n_features=2,
        n_informative=2,
        n_redundant=0,
        n_repeated=0,
        n_classes=3,
        n_clusters_per_class=1,
        weights=[0.2, 0.3, 0.5],
        class_sep=0.4,
        random_state=0,
    )
    X, y = make_classification(**dataset_options)

    X_res_1, y_res_1 = AllKNN(allow_minority=True).fit_resample(X, y)
    X_res_2, y_res_2 = AllKNN().fit_resample(X, y)

    assert len(y_res_1) < len(y_res_2)
Esempio n. 14
0
def Resampling(train_x, train_y, resampling_method):
    """Resample a training set in place with the named sampler.

    The labels in ``train_y.data`` are integer-encoded with ``LabelEncoder``,
    then the chosen sampler's ``fit_resample`` output replaces both
    ``train_x.data`` and ``train_y.data``.

    Parameters
    ----------
    train_x
        Object whose ``data`` attribute holds the training features.
    train_y
        Object whose ``data`` attribute holds the training labels.
    resampling_method : str
        Under-sampling: "ClusterCentroids", "CondensedNearestNeighbour",
        "EditedNearestNeighbours", "RepeatedEditedNearestNeighbours",
        "AllKNN", "NearMiss", "NeighbourhoodCleaningRule",
        "RandomUnderSampler", "TomekLinks".
        Over-sampling: "BorderlineSMOTE", "KMeansSMOTE",
        "RandomOverSampler", "SMOTE".

    Returns
    -------
    None
        The inputs are modified in place.

    Raises
    ------
    ValueError
        If ``resampling_method`` is not a supported name. The check happens
        before any input is modified.
    """
    # The sampler is chosen before touching the data, so an unsupported name
    # fails without leaving train_y.data half-processed.

    # ---- UNDER-SAMPLING ------ #
    if resampling_method == "ClusterCentroids":
        resample = ClusterCentroids(voting='hard', random_state=42)
    elif resampling_method == "CondensedNearestNeighbour":
        resample = CondensedNearestNeighbour(n_neighbors=7, random_state=42)
    elif resampling_method == "EditedNearestNeighbours":
        resample = EditedNearestNeighbours(n_neighbors=7,
                                           kind_sel='mode',
                                           n_jobs=-1)
    elif resampling_method == "RepeatedEditedNearestNeighbours":
        resample = RepeatedEditedNearestNeighbours(n_neighbors=7,
                                                   kind_sel='mode',
                                                   n_jobs=-1)
    elif resampling_method == "AllKNN":
        resample = AllKNN(n_neighbors=7,
                          kind_sel='mode',
                          allow_minority=True,
                          n_jobs=-1)
    elif resampling_method == "NearMiss":
        resample = NearMiss(n_neighbors=7, n_jobs=-1)
    elif resampling_method == "NeighbourhoodCleaningRule":
        resample = NeighbourhoodCleaningRule(n_neighbors=7, kind_sel='all')
    elif resampling_method == "RandomUnderSampler":
        resample = RandomUnderSampler(random_state=42)
    elif resampling_method == "TomekLinks":
        resample = TomekLinks(n_jobs=-1)

    # ---- OVER-SAMPLING ------ #
    elif resampling_method == "BorderlineSMOTE":
        resample = BorderlineSMOTE(random_state=42, n_jobs=-1)
    elif resampling_method == "KMeansSMOTE":
        resample = KMeansSMOTE(random_state=42)
    elif resampling_method == "RandomOverSampler":
        # This branch used to test for "RandomUnderSampler", which silently
        # replaced the under-sampler chosen above with an over-sampler and
        # made "RandomOverSampler" unreachable.
        resample = RandomOverSampler(random_state=42)
    elif resampling_method == "SMOTE":
        resample = SMOTE(random_state=42, n_jobs=-1)
    else:
        raise ValueError(
            "Unsupported resampling method: {!r}".format(resampling_method))

    train_y.data = LabelEncoder().fit_transform(train_y.data)

    # Uncomment the line below to display a pie chart of the class
    # distribution before resampling.
    #plotGraphics.piePlot(train_y, "Before Resampling")

    # Transform the dataset.
    train_x.data, train_y.data = resample.fit_resample(train_x.data,
                                                       train_y.data)
Esempio n. 15
0
def test_allknn_fit_single_class():
    """Check that fitting on a single-class target emits a ``UserWarning``."""
    # An all-zero target holds exactly one class.
    n_samples = X.shape[0]
    single_class_target = np.zeros((n_samples, ))

    sampler = AllKNN(random_state=RND_SEED)
    assert_warns(UserWarning, sampler.fit, X, single_class_target)
Esempio n. 16
0
def test_allknn_sample_wrong_X():
    """Check that sampling data other than the fitted data raises ``RuntimeError``."""
    # Fit on the module-level fixture data.
    sampler = AllKNN(random_state=RND_SEED)
    sampler.fit(X, Y)

    # Then try to sample unrelated random data.
    other_X = np.random.random((100, 40))
    other_y = np.array([0] * 50 + [1] * 50)
    assert_raises(RuntimeError, sampler.sample, other_X, other_y)
Esempio n. 17
0
def test_allknn_init():
    """Check the attribute values of a freshly constructed ``AllKNN``."""
    sampler = AllKNN(random_state=RND_SEED)

    # (attribute, expected value) pairs, checked in declaration order.
    expectations = (
        ('n_neighbors', 3),
        ('kind_sel', 'all'),
        ('n_jobs', -1),
        ('random_state', RND_SEED),
    )
    for attribute, expected in expectations:
        assert_equal(getattr(sampler, attribute), expected)
Esempio n. 18
0
    def resample(self, X, y, by, random_state=None, visualize=False):
        '''
        Re-sample ``(X, y)`` with the strategy named ``by``.

        by: String
            The method used to perform re-sampling
            currently support: ['RUS', 'CNN', 'ENN', 'NCR', 'Tomek', 'ALLKNN', 'OSS',
                'NM', 'CC', 'SMOTE', 'ADASYN', 'BorderSMOTE', 'SMOTEENN', 'SMOTETomek',
                'ORG']
            'ORG' returns the data unchanged.
        random_state:
            Passed as ``random_state`` to whichever sampler is constructed.
        visualize: Boolean
            When true, draw a scatter plot of columns 0 and 1 of the result,
            coloured by label.

        Returns the tuple ``(X_train, y_train)``.
        '''
        # Under-sampling strategies.
        if by == 'RUS':
            sampler = RandomUnderSampler(random_state=random_state)
        elif by == 'CNN':
            sampler = CondensedNearestNeighbour(random_state=random_state)
        elif by == 'ENN':
            sampler = EditedNearestNeighbours(random_state=random_state)
        elif by == 'NCR':
            sampler = NeighbourhoodCleaningRule(random_state=random_state)
        elif by == 'Tomek':
            sampler = TomekLinks(random_state=random_state)
        elif by == 'ALLKNN':
            sampler = AllKNN(random_state=random_state)
        elif by == 'OSS':
            sampler = OneSidedSelection(random_state=random_state)
        elif by == 'NM':
            sampler = NearMiss(random_state=random_state)
        elif by == 'CC':
            sampler = ClusterCentroids(random_state=random_state)
        # Over-sampling and combined strategies.
        elif by == 'SMOTE':
            sampler = SMOTE(random_state=random_state)
        elif by == 'ADASYN':
            sampler = ADASYN(random_state=random_state)
        elif by == 'BorderSMOTE':
            sampler = BorderlineSMOTE(random_state=random_state)
        elif by == 'SMOTEENN':
            sampler = SMOTEENN(random_state=random_state)
        elif by == 'SMOTETomek':
            sampler = SMOTETomek(random_state=random_state)
        # Original data: no sampler at all.
        elif by == 'ORG':
            sampler = None
        else:
            # NOTE(review): `Error` is not defined in this block; presumably a
            # project-level exception class -- confirm it is in scope.
            raise Error('Unexpected \'by\' type {}'.format(by))

        # Only the 'ORG' branch leaves the sampler unset.
        if sampler is None:
            X_train, y_train = X, y
        else:
            X_train, y_train = sampler.fit_resample(X, y)

        if visualize:
            frame = pd.DataFrame(X_train)
            frame['label'] = y_train
            plot_title = '{} training set'.format(by)
            frame.plot.scatter(x=0, y=1, c='label', s=3,
                               colormap='coolwarm', title=plot_title)
        return X_train, y_train
Esempio n. 19
0
 def resample(self):
     """
     Resample ``self.X`` / ``self.y`` using AllKNN, then SMOTE.

     The data are first under-sampled with ``AllKNN`` and the result is then
     over-sampled with ``SMOTE``; ``self.X`` and ``self.y`` are overwritten
     after each step. Nothing is returned.
     """
     print("Sampling data...")
     # Under-sampling.
     # NOTE(review): imbalanced-learn documents that cleaning samplers such as
     # AllKNN do not accept a dict ``sampling_strategy`` -- confirm this call
     # works with the installed version.
     allknn = AllKNN(sampling_strategy={28: 565})
     self.X, self.y = allknn.fit_resample(self.X, self.y)
     # Over-sampling. ``sampling_strategy`` replaces the deprecated ``ratio``
     # keyword, so both samplers in this method use the same API.
     smote = SMOTE(sampling_strategy="all")
     self.X, self.y = smote.fit_resample(self.X, self.y)
Esempio n. 20
0
    def __init__(self, name):
        """Store ``name`` and build the matching resampling strategy.

        ``self.strategie`` stays ``None`` when ``name`` is not one of
        "enn", "allknn", "renn", "tomek", "smote", "bdsmote", "adasyn",
        "smoteenn" or "smotetomek".
        """
        self.strategie = None
        self.name = name

        chosen = None
        # Under-sampling (cleaning) strategies.
        if name == "enn":
            chosen = EditedNearestNeighbours(
                sampling_strategy='auto',
                n_neighbors=3,
                kind_sel='all',
                n_jobs=-1,
            )
        elif name == "allknn":
            chosen = AllKNN(
                sampling_strategy='auto',
                n_neighbors=3,
                kind_sel='all',
                allow_minority=False,
                n_jobs=-1,
            )
        elif name == "renn":
            chosen = RepeatedEditedNearestNeighbours(
                sampling_strategy='auto',
                n_neighbors=3,
                max_iter=100,
                kind_sel='all',
                n_jobs=-1,
            )
        elif name == "tomek":
            chosen = TomekLinks(sampling_strategy='auto', n_jobs=-1)
        # Over-sampling strategies.
        elif name == "smote":
            chosen = SMOTE(
                sampling_strategy='auto',
                k_neighbors=5,
                n_jobs=-1,
                random_state=42,
            )
        elif name == "bdsmote":
            chosen = BorderlineSMOTE(random_state=42, n_jobs=-1)
        elif name == "adasyn":
            chosen = ADASYN(
                sampling_strategy='auto',
                n_neighbors=5,
                n_jobs=-1,
                random_state=42,
            )
        # Combined over- and under-sampling strategies.
        elif name == "smoteenn":
            chosen = SMOTEENN(
                sampling_strategy='auto',
                smote=None,
                enn=None,
                n_jobs=-1,
                random_state=42,
            )
        elif name == "smotetomek":
            chosen = SMOTETomek(
                sampling_strategy='auto',
                smote=None,
                tomek=None,
                n_jobs=-1,
                random_state=42,
            )

        self.strategie = chosen
Esempio n. 21
0
def all_KNN(X, Y):
    """Flag which samples a default ``AllKNN`` under-sampler keeps.

    Parameters
    ----------
    X
        Feature data passed to ``AllKNN.fit_resample``.
    Y
        Target labels passed to ``AllKNN.fit_resample``; its length sets the
        length of the mask.

    Returns
    -------
    tuple
        ``(True, mask)`` where ``mask`` is an integer NumPy array of length
        ``len(Y)`` holding 1 at every position listed in the sampler's
        ``sample_indices_`` and 0 elsewhere.
    """
    from imblearn.under_sampling import AllKNN
    allknn = AllKNN()
    allknn.fit_resample(X, Y)
    indexes = allknn.sample_indices_
    mask = np.zeros(len(Y), dtype=int)
    # One index assignment replaces the per-sample ``i in indexes`` scan,
    # which searched the whole index array for every sample.
    mask[indexes] = 1
    return True, mask
Esempio n. 22
0
def test_allknn_fit_sample_mode():
    """Check ``fit_sample`` with ``kind_sel='mode'`` against stored reference arrays."""
    # Reference arrays live in the 'data' directory next to this file.
    data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'data')
    expected_X = np.load(os.path.join(data_dir, 'allknn_x_mode.npy'))
    expected_y = np.load(os.path.join(data_dir, 'allknn_y_mode.npy'))

    # Resample the module-level fixture data.
    sampler = AllKNN(random_state=RND_SEED, kind_sel='mode')
    X_res, y_res = sampler.fit_sample(X, Y)

    assert_array_equal(X_res, expected_X)
    assert_array_almost_equal(y_res, expected_y)
Esempio n. 23
0
def undersampling(type):
    """Under-sample the module-level ``train`` / ``label`` data.

    Parameters
    ----------
    type : str
        ``'random'`` (RandomUnderSampler), ``'knn'`` (AllKNN) or
        ``'centroids'`` (ClusterCentroids). Each sampler is built with
        ``ratio='majority'``.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(x, y)``: the resampled features, with the column names of
        ``train``, and the resampled labels in one column named
        ``'is_attributed'``.

    Raises
    ------
    ValueError
        If ``type`` is not one of the supported names (previously this
        surfaced as an ``UnboundLocalError``).
    """
    if type == 'random':
        und = RandomUnderSampler(ratio='majority', random_state=42)
    elif type == 'knn':
        und = AllKNN(ratio='majority', random_state=42, n_jobs=4)
    elif type == 'centroids':
        und = ClusterCentroids(ratio='majority', n_jobs=-1)
    else:
        raise ValueError(
            "Unknown undersampling type {!r}; expected 'random', 'knn' or "
            "'centroids'".format(type))

    # `train` and `label` are read from module scope.
    x, y = und.fit_sample(train, label)
    x = pd.DataFrame(x, columns=train.columns.values)
    y = pd.DataFrame(y, columns=['is_attributed'])

    return x, y
Esempio n. 24
0
def all_KNN(X, Y):
    """Flag which samples a default ``AllKNN`` under-sampler keeps.

    Parameters
    ----------
    X
        Feature data passed to ``AllKNN.fit_resample``; its length sets the
        length of the mask.
    Y
        Target labels passed to ``AllKNN.fit_resample``.

    Returns
    -------
    tuple
        ``(True, mask)`` where ``mask`` is an integer NumPy array of length
        ``len(X)`` holding 1 at every position listed in the sampler's
        ``sample_indices_`` and 0 elsewhere.
    """
    from imblearn.under_sampling import AllKNN
    allknn = AllKNN()
    allknn.fit_resample(X, Y)
    indexes = allknn.sample_indices_
    mask = np.zeros(len(X), dtype=int)
    # One index assignment replaces the per-sample ``i in indexes`` scan,
    # which searched the whole index array for every sample.
    mask[indexes] = 1
    return True, mask
Esempio n. 25
0
def test_allknn_fit_sample():
    """Check ``fit_sample`` output against stored reference arrays."""
    # Reference arrays live in the 'data' directory next to this file.
    data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'data')
    expected_X = np.load(os.path.join(data_dir, 'allknn_x.npy'))
    expected_y = np.load(os.path.join(data_dir, 'allknn_y.npy'))

    # Resample the module-level fixture data.
    sampler = AllKNN(random_state=RND_SEED)
    X_res, y_res = sampler.fit_sample(X, Y)

    assert_array_almost_equal(X_res, expected_X)
    assert_array_almost_equal(y_res, expected_y)
Esempio n. 26
0
def aiiknn(X,
           y,
           visualize=False,
           pca2d=True,
           pca3d=True,
           tsne=True,
           pie_evr=True):
    """Under-sample ``(X, y)`` with a default ``AllKNN`` and optionally plot.

    Parameters
    ----------
    X, y
        Features and labels passed to ``AllKNN.fit_resample``.
    visualize : bool
        When true, call ``hist_over_and_undersampling`` on the resampled
        labels and ``pca_general`` on the resampled data.
    pca2d, pca3d, pie_evr
        Forwarded to ``pca_general`` as ``d2``, ``d3`` and ``pie_evr``.
    tsne
        Accepted for interface compatibility; not used by this function.

    Returns
    -------
    tuple
        ``(X_res, y_res)`` as returned by ``fit_resample``.
    """
    allknn = AllKNN()
    X_res, y_res = allknn.fit_resample(X, y)
    # Plain truthiness check instead of comparing against True.
    if visualize:
        hist_over_and_undersampling(y_res)
        pca_general(X_res, y_res, d2=pca2d, d3=pca3d, pie_evr=pie_evr)
    return X_res, y_res
Esempio n. 27
0
def test_allknn_fit():
    """Check the class statistics recorded by ``fit``."""
    # Fit on the module-level fixture data.
    sampler = AllKNN(random_state=RND_SEED)
    sampler.fit(X, Y)

    # Minority / majority class labels recorded during fitting.
    assert_equal(sampler.min_c_, 0)
    assert_equal(sampler.maj_c_, 1)

    # Per-class sample counts recorded during fitting.
    class_counts = sampler.stats_c_
    assert_equal(class_counts[0], 500)
    assert_equal(class_counts[1], 4500)
Esempio n. 28
0
def undersample(X_train_org, y_train_org, sampler='AllKNN', size=1000):
    """Undersample the training set data using one of various techniques.

    Parameters
    ----------
    X_train_org, y_train_org
        Original training features and labels, passed to ``fit_resample``.
    sampler : str
        ``"RandomUnderSampler"`` or ``'AllKNN'``.
    size : int
        Target number of samples for each of the ``True`` and ``False``
        classes; only used by ``"RandomUnderSampler"``.

    Returns
    -------
    tuple
        ``(X_train, y_train)`` as returned by ``fit_resample``. The sorted
        per-class counts of ``y_train`` are printed first.

    Raises
    ------
    ValueError
        If ``sampler`` is not a supported name (previously this surfaced as
        an ``UnboundLocalError``).
    """
    # Select a sampler type.
    if sampler == "RandomUnderSampler":
        samp = RandomUnderSampler(sampling_strategy={True: size, False: size})
    elif sampler == 'AllKNN':
        samp = AllKNN()
    else:
        raise ValueError(
            "Unknown sampler {!r}; expected 'RandomUnderSampler' or "
            "'AllKNN'".format(sampler))

    # Resample the data using the selected sampler.
    X_train, y_train = samp.fit_resample(X_train_org, y_train_org)
    print(sorted(Counter(y_train).items()))

    return X_train, y_train
    def fit(self, X, y):
        """Resample ``(X, y)`` with AllKNN, split the result, and return a new XGBClassifier.

        NOTE(review): the returned classifier is never fitted and the
        train/test split is unused -- confirm whether training was meant to
        happen here.
        """
        # Prepare the arguments for the ``scikit-learn`` library methods.
        resampler = AllKNN()
        X_resampled, y_resampled = resampler.fit_resample(X, y)

        # 80/20 split with a fixed seed.
        split_parts = train_test_split(
            X_resampled,
            y_resampled,
            test_size=0.2,
            random_state=1,
        )
        X_train, X_test, y_train, y_test = split_parts

        return XGBClassifier()
Esempio n. 30
0
def test_allknn_fit_sample_with_indices():
    """Check ``fit_sample`` output and returned indices against stored arrays."""
    # Reference arrays live in the 'data' directory next to this file.
    data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'data')
    expected_X = np.load(os.path.join(data_dir, 'allknn_x.npy'))
    expected_y = np.load(os.path.join(data_dir, 'allknn_y.npy'))
    expected_idx = np.load(os.path.join(data_dir, 'allknn_idx.npy'))

    # With return_indices=True the sampler returns a third value.
    sampler = AllKNN(return_indices=True, random_state=RND_SEED)
    X_res, y_res, kept_idx = sampler.fit_sample(X, Y)

    assert_array_almost_equal(X_res, expected_X)
    assert_array_almost_equal(y_res, expected_y)
    assert_array_almost_equal(kept_idx, expected_idx)