Example #1
def test_sample_kmeans_density_estimation(data, density_exponent,
                                          cluster_balance_threshold):
    X, y = data
    smote = KMeansSMOTE(random_state=42,
                        density_exponent=density_exponent,
                        cluster_balance_threshold=cluster_balance_threshold)
    smote.fit_resample(X, y)
Example #2
def test_sample_kmeans_not_enough_clusters():
    rng = np.random.RandomState(42)
    X = rng.randn(30, 2)
    y = np.array([1] * 20 + [0] * 10)

    smote = KMeansSMOTE(random_state=42, kmeans_estimator=30, k_neighbors=2)
    with pytest.raises(RuntimeError):
        smote.fit_resample(X, y)
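Example #2 relies on KMeansSMOTE raising a RuntimeError when no cluster contains enough minority samples. Below is a minimal sketch of one way around that error on the same data; the settings are illustrative, not authoritative: use fewer clusters and a permissive cluster_balance_threshold.

import numpy as np
from sklearn.cluster import MiniBatchKMeans
from imblearn.over_sampling import KMeansSMOTE

rng = np.random.RandomState(42)
X = rng.randn(30, 2)
y = np.array([1] * 20 + [0] * 10)

# Fewer clusters and a low balance threshold make it far more likely that at
# least one cluster passes the minority-share filter (for this random draw).
smote = KMeansSMOTE(
    random_state=42,
    kmeans_estimator=MiniBatchKMeans(n_clusters=2, random_state=42),
    cluster_balance_threshold=0.1,
    k_neighbors=2,
)
X_res, y_res = smote.fit_resample(X, y)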
def test_kmeans_smote_param_error(data, density_exponent, cluster_balance_threshold):
    X, y = data
    kmeans_smote = KMeansSMOTE(
        density_exponent=density_exponent,
        cluster_balance_threshold=cluster_balance_threshold,
    )
    with pytest.raises(ValueError, match="should be 'auto' when a string"):
        kmeans_smote.fit_resample(X, y)
def sample(xtrain, ytrain):
    sm = KMeansSMOTE(random_state=42)
    x_res, y_res = sm.fit_resample(xtrain, ytrain)
    # resampling returns a bare array; restore the original column names
    x_res = pd.DataFrame(x_res, columns=xtrain.columns)
    return x_res, y_res
def test_sample_kmeans_density_estimation(density_exponent, cluster_balance_threshold):
    X, y = make_classification(
        n_samples=10_000, n_classes=2, weights=[0.3, 0.7], random_state=42
    )
    smote = KMeansSMOTE(
        random_state=0,
        density_exponent=density_exponent,
        cluster_balance_threshold=cluster_balance_threshold,
    )
    smote.fit_resample(X, y)
Example #6
def test_sample_kmeans_custom(data, k_neighbors, kmeans_estimator):
    X, y = data
    kmeans_smote = KMeansSMOTE(random_state=42,
                               kmeans_estimator=kmeans_estimator,
                               k_neighbors=k_neighbors)
    X_resampled, y_resampled = kmeans_smote.fit_resample(X, y)
    assert X_resampled.shape == (24, 2)
    assert y_resampled.shape == (24, )

    assert kmeans_smote.nn_k_.n_neighbors == 3
    assert kmeans_smote.kmeans_estimator_.n_clusters == 3
Example #7
def over_sample_data(matrix, y_train):
    add_to_log('Over Sampling')
    add_to_log('Sample distribution %s' % Counter(y_train))
    b_line = KMeansSMOTE(k_neighbors=5,
                         sampling_strategy='not majority',
                         n_jobs=-1,
                         random_state=3,
                         kmeans_estimator=100)
    matrix_resampled, y_resampled = b_line.fit_resample(matrix, y_train)
    add_to_log('Resample distribution %s' % Counter(y_resampled))
    return matrix_resampled, y_resampled
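In the snippet above, kmeans_estimator=100 is shorthand: the imbalanced-learn docs describe an integer as the number of clusters used by an internal MiniBatchKMeans. The explicit spelling below should be roughly equivalent, shown here as a hedged sketch:

from sklearn.cluster import MiniBatchKMeans
from imblearn.over_sampling import KMeansSMOTE

b_line = KMeansSMOTE(k_neighbors=5,
                     sampling_strategy='not majority',
                     n_jobs=-1,
                     random_state=3,
                     kmeans_estimator=MiniBatchKMeans(n_clusters=100))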
Example #8
def keans_smote(X,
                y,
                visualize=False,
                pca2d=True,
                pca3d=True,
                tsne=True,
                pie_evr=True):
    sm = KMeansSMOTE(random_state=42)
    X_res, y_res = sm.fit_resample(X, y)
    if visualize:
        hist_over_and_undersampling(y_res)
        pca_general(X_res, y_res, d2=pca2d, d3=pca3d, pie_evr=pie_evr)
    return X_res, y_res
Example #9
def test_kmeans_smote(data):
    X, y = data
    kmeans_smote = KMeansSMOTE(kmeans_estimator=1,
                               random_state=42,
                               cluster_balance_threshold=0.0,
                               k_neighbors=5)
    smote = SMOTE(random_state=42)

    X_res_1, y_res_1 = kmeans_smote.fit_resample(X, y)
    X_res_2, y_res_2 = smote.fit_resample(X, y)

    assert_allclose(X_res_1, X_res_2)
    assert_array_equal(y_res_1, y_res_2)

    assert kmeans_smote.nn_k_.n_neighbors == 6
    assert kmeans_smote.kmeans_estimator_.n_clusters == 1
    assert 'batch_size' in kmeans_smote.kmeans_estimator_.get_params()
def Resampling(train_x, train_y, resampling_method):
    train_y.data = LabelEncoder().fit_transform(train_y.data)
    # summarize distribution

    # uncomment the following line to display a pie chart of the class distribution before resampling
    #plotGraphics.piePlot(train_y, "Before Resampling")

    # ---- UNDER-SAMPLING ------ #
    if resampling_method == "ClusterCentroids":
        resample = ClusterCentroids(voting='hard', random_state=42)

    if resampling_method == "CondensedNearestNeighbour":
        resample = CondensedNearestNeighbour(n_neighbors=7, random_state=42)

    if resampling_method == "EditedNearestNeighbours":
        resample = EditedNearestNeighbours(n_neighbors=7,
                                           kind_sel='mode',
                                           n_jobs=-1)

    if resampling_method == "RepeatedEditedNearestNeighbours":
        resample = RepeatedEditedNearestNeighbours(n_neighbors=7,
                                                   kind_sel='mode',
                                                   n_jobs=-1)

    if resampling_method == "AllKNN":
        resample = AllKNN(n_neighbors=7,
                          kind_sel='mode',
                          allow_minority=True,
                          n_jobs=-1)

    if resampling_method == "NearMiss":
        resample = NearMiss(n_neighbors=7, n_jobs=-1)

    if resampling_method == "NeighbourhoodCleaningRule":
        resample = NeighbourhoodCleaningRule(n_neighbors=7, kind_sel='all')

    if resampling_method == "RandomUnderSampler":
        resample = RandomUnderSampler(random_state=42)

    if resampling_method == "TomekLinks":
        resample = TomekLinks(n_jobs=-1)

    # ---- OVER-SAMPLING ------ #
    if resampling_method == "BorderlineSMOTE":
        resample = BorderlineSMOTE(random_state=42, n_jobs=-1)

    if resampling_method == "KMeansSMOTE":
        resample = KMeansSMOTE(random_state=42)

    if resampling_method == "RandomOverSampler":
        resample = RandomOverSampler(random_state=42)

    if resampling_method == "SMOTE":
        resample = SMOTE(random_state=42, n_jobs=-1)

    # transform the dataset
    train_x.data, train_y.data = resample.fit_resample(train_x.data,
                                                       train_y.data)
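A hedged usage sketch for Resampling: the snippet implies train_x and train_y are small wrappers exposing a .data attribute, so the stand-in class and toy dataset below are assumptions, and the function's module is assumed to import LabelEncoder and the samplers. "SMOTE" is used because "KMeansSMOTE" with default settings can raise a RuntimeError when no cluster is minority-rich enough (see Example #2).

from collections import Counter
from sklearn.datasets import make_classification

class _Holder:  # hypothetical stand-in for the snippet's data wrapper
    def __init__(self, data):
        self.data = data

X, y = make_classification(n_samples=500, weights=[0.9, 0.1], random_state=42)
train_x, train_y = _Holder(X), _Holder(y)
Resampling(train_x, train_y, "SMOTE")
print(Counter(train_y.data))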
Example #11
    def over_sample(self,
                    method="BorderLine",
                    sampling_strategy="minority",
                    random_state=42,
                    k_neighbors=5,
                    n_neighbors=10,
                    kind="borderline-1"):
        """
        过采样方法
        :param method: str, option: ADASYN, BorderLine,KMeans,Random,SVM
        :param sampling_strategy:str or dict, option: 'minority','not majority','all','auto', {1:n,0:m}
        :param random_state:int
        :param k_neighbors:int
        :param n_neighbors:int
        :param kind:str, borderline-1,borderline-2
        :return:df
        """
        feature_name = self._df.columns.difference(["id",
                                                    self._target]).tolist()
        X = self._df[feature_name].values
        y = self._df[self._target].values

        print("Original label shape {}".format(Counter(y)))

        if method == "ADASYN":
            overSm = ADASYN(sampling_strategy=sampling_strategy,
                            random_state=random_state,
                            n_neighbors=k_neighbors)
        elif method == "BorderLine":
            overSm = BorderlineSMOTE(sampling_strategy=sampling_strategy,
                                     random_state=random_state,
                                     k_neighbors=k_neighbors,
                                     m_neighbors=n_neighbors,
                                     kind=kind)
        elif method == "KMeans":
            overSm = KMeansSMOTE(sampling_strategy=sampling_strategy,
                                 random_state=random_state,
                                 k_neighbors=k_neighbors)
        elif method == "Random":
            overSm = RandomOverSampler(sampling_strategy=sampling_strategy,
                                       random_state=random_state)
        elif method == "SVM":
            overSm = SVMSMOTE(sampling_strategy=sampling_strategy,
                              random_state=random_state,
                              k_neighbors=k_neighbors,
                              m_neighbors=n_neighbors,
                              out_step=0.5)
        else:
            print("Unsupported sampling method: {}".format(method))
            return self._df

        X_res, y_res = overSm.fit_resample(X, y)
        print("overSample label shape {}".format(Counter(y_res)))
        _data = np.concatenate([X_res, y_res.reshape(len(X_res), 1)], axis=1)
        df_new = pd.DataFrame(data=_data,
                              columns=feature_name + [self._target])
        return df_new
Example #12
    def __get_smote(self):
        if self.algorithm == 'Borderline':
            return BorderlineSMOTE(random_state=RANDOM_STATE)
        elif self.algorithm == 'KMeans':
            return KMeansSMOTE(random_state=RANDOM_STATE,
                               kmeans_estimator=KMeans(n_clusters=20))
        elif self.algorithm == 'SVM':
            return SVMSMOTE(random_state=RANDOM_STATE)
        elif self.algorithm == 'Tomek':
            return SMOTETomek(random_state=RANDOM_STATE)

        return SMOTE(random_state=RANDOM_STATE)
def equalize_training_dataset_with_KMeansSMOTE(x_train, y_train):
    from imblearn.over_sampling import KMeansSMOTE

    old_shape = list(x_train.shape)
    # reshape to 2-D before applying the over-/under-sampling method
    x_tmp = np.reshape(x_train, (x_train.shape[0], -1))
    x_resampled, y_resampled = KMeansSMOTE(
        sampling_strategy='not majority',
        n_jobs=8,
        cluster_balance_threshold=0.009).fit_resample(x_tmp, y_train)
    print(sorted(Counter(y_resampled).items()))
    # reshape after using over/undersampling method
    old_shape[0] = x_resampled.shape[0]
    x_resampled = np.reshape(x_resampled, tuple(old_shape))

    return x_resampled, y_resampled
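A hedged usage sketch for the helper above, with made-up shapes: image batches are flattened to 2-D for resampling and then restored. It assumes numpy and collections.Counter are imported in the helper's module, as the snippet implies.

import numpy as np

x_train = np.random.rand(300, 8, 8)       # hypothetical 8x8 image batch
y_train = np.array([0] * 250 + [1] * 50)  # imbalanced labels

# may raise RuntimeError for an unlucky clustering; see Example #2
x_bal, y_bal = equalize_training_dataset_with_KMeansSMOTE(x_train, y_train)
print(x_bal.shape)  # first axis grows; the 8x8 image shape is preserved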
Example #14
def runSmote(X, y, algorithm='default', split_synthetic=False, verbose=True):
    if verbose:
        log.info("Data before oversampling")
        log.info("Dataset: {0}, {1}".format(X.shape, len(y)))

    n_casos = np.count_nonzero(y == 1)
    n_controles = np.count_nonzero(y == 0)

    N = abs(n_casos - n_controles)

    if algorithm == 'Borderline':
        if verbose:
            log.info("Running Borderline Smote")
        X_novo, y_novo = BorderlineSMOTE(
            random_state=random_state).fit_resample(X, y)
    elif algorithm == 'KMeans':
        if verbose:
            log.info("Running KMeans Smote")
        X_novo, y_novo = KMeansSMOTE(
            random_state=random_state,
            kmeans_estimator=KMeans(n_clusters=20)).fit_resample(X, y)
    elif algorithm == 'SVM':
        if verbose:
            log.info("Running SVM Smote")
        X_novo, y_novo = SVMSMOTE(random_state=random_state).fit_resample(X, y)
    elif algorithm == 'Tomek':
        if verbose:
            log.info("Running Smote Tomek")
        X_novo, y_novo = SMOTETomek(random_state=random_state).fit_resample(
            X, y)
    else:
        if verbose:
            log.info("Running default Smote")
        X_novo, y_novo = SMOTE(random_state=random_state).fit_resample(X, y)

    if verbose:
        log.info("Data after oversampling")
        log.info("Dataset: {0}, {1}".format(X_novo.shape, len(y_novo)))

    if split_synthetic:
        synthetic_X = X_novo[-N:]
        synthetic_y = y_novo[-N:]

        return X, y, synthetic_X, synthetic_y
    else:
        return X_novo, y_novo, None, None
Example #15
def over_sample(X, y, sampler="RandomOverSampler"):
    samplers = {
        "RandomOverSampler": RandomOverSampler(),
        "KMeansSMOTE": KMeansSMOTE(),
        "ADASYN": ADASYN(),
        "SMOTE": SMOTE(),
        "BorderlineSMOTE": BorderlineSMOTE(),
        "SVMSMOTE": SVMSMOTE(),
        "SMOTENC": SMOTENC(categorical_features=[]),
    }
    sampler = samplers[sampler]

    # plot y class count before and after resample
    print("before", sorted(Counter(y).items()))

    # to resample simply call fit_resample method of sampler
    X_resampled, y_resampled = sampler.fit_resample(X, y)

    print("after", sorted(Counter(y_resampled).items()))

    print('===' * 4, 'over_sample finished')

    return X_resampled, y_resampled
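A hedged usage sketch for over_sample on a toy imbalanced dataset; it assumes Counter and the imblearn samplers are already imported in the function's module, as the snippet implies.

from sklearn.datasets import make_classification

X, y = make_classification(n_samples=1000, weights=[0.9, 0.1], random_state=0)
X_bal, y_bal = over_sample(X, y, sampler="SMOTE")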
Example #16
def target_training_data(targetclass):
    ##### target_training_data(targetclass) generates the second half of the training data set
    ##### generation of the evaluation data set is outsourced to all_target_training_data(Nnofs, Nnofs_evaluate, fractrain)
    
    import dictionary
    dictionary=dictionary.dict
    print(' ')
#    print('working on training set:')
#    print('targetclass=',targetclass)
    print('Resampling training set for class', targetclass)
    X1=3;X2=40  ##### components in the high-dimensional data point to be displayed for visualisation
    #dict_classes=gen_dictionary.gen_dictionary()
    #dict_classes=dictionary
    classes=[ keys for keys in dictionary ]
    classdir=classes
    traincontainer = []
    traincontainer_y = []
    origcontainer = []
    origcontainer_y = []
    origcontainer_ynn = []
    traincontainer_ynn = []
    appendsecondhalf=[]
    lensh=0
    for i in range(len(classdir)):
        cl=classdir[i]
        dirinclass=os.listdir(cl)
        lendirinclass=len(dirinclass)   
        dirinclass=[ os.path.join(cl,dirinclass[i],'ta.npy') for i in range(lendirinclass) ]  
        #print('i=',i,'cl=',cl, 'lendirinclass=',lendirinclass)
        
    #################### targetclass ############
        fnorig=str(targetclass)+'.orig.npy'
        fnorig_y=str(targetclass)+'_y.orig.npy'
        fnorig_ynn=str(targetclass)+'_ynn.orig.npy'
        fntrain=str(targetclass)+'.train.npy'
        fntrain_y=str(targetclass)+'_y.train.npy'
        fntrain_ynn=str(targetclass)+'_ynn.train.npy'
        
        if cl==targetclass:
#            print('{i, cl }=',{i,cl})
#            print('in target class',targetclass)
            #print('dirinclass=',dirinclass)        
            shuffle(dirinclass)
            firsthalf=dirinclass[0:int(fractrain*len(dirinclass))]
            secondhalf=dirinclass[int(fractrain*len(dirinclass)):]
            appendsecondhalf.append(secondhalf)
#            print('firsthalf=',firsthalf)
#            print('secondhalf=',secondhalf)       
            #####################################
            for k in range(len(firsthalf)):
             #       print('to append', firsthalf[k],'into',fnorig)
                    datain=np.load(firsthalf[k])
                    origcontainer.append(datain)
             #       print('to append', 0,'into',fnorig_y)
                    origcontainer_y.append(0)
                    origcontainer_ynn.append([0,1])
 #                   print('print from firsthalf:')
 #                   print('k:',k,'targetclass:',targetclass,'dictionary[targetclass]:',dictionary[targetclass])
    
            for k in range(len(secondhalf)):
              #      print('to append', secondhalf[k],'into',fntrain)
                    datain=np.load(secondhalf[k])
                    traincontainer.append(datain)
               #     print('to append',0,'into',fntrain_y)
                    traincontainer_y.append(0)
                    traincontainer_ynn.append([0,1])
            #####################################            
                    
        else:
#            print('{i, cl }=',{i,cl})
#            print('classes other than targetclass',targetclass)
    
            for k in range(len(dirinclass)):
                nnpy=dirinclass[k]
                if os.path.isfile(nnpy):
                    datain=np.load(nnpy)
                #    print('to append', nnpy,'into',fntrain)
                    traincontainer.append(datain)
                 #   print('to append', '1','into',fntrain_y)
                    traincontainer_y.append(1)
                    traincontainer_ynn.append([0,1])    
    
    origcontainer=np.array(origcontainer)
    origcontainer_y=np.array(origcontainer_y)
    
    traincontainer=np.array(traincontainer)
    traincontainer_y=np.array(traincontainer_y)
    
    origcontainer_ynn=np.array(origcontainer_ynn)
    traincontainer_ynn=np.array(traincontainer_ynn)
    
#    np.save(fnorig, origcontainer)                            
#    np.save(fnorig_y,origcontainer_y)
#    np.save(fnorig_ynn,origcontainer_ynn)
    np.save(fntrain, traincontainer)                            
    np.save(fntrain_y,traincontainer_y)
    np.save(fntrain_ynn,traincontainer_ynn)
    #################### end of targetclass ############
        
    ####################################################
    #print('Begin of oversampling for trainning set ')
    #k=len(classdir);
    k=2;
    seed=10
    X = traincontainer
    y = traincontainer_y
    #ynn = traincontainer_ynn
    
    #print('1',X.shape)
    X = np.reshape(X, (X.shape[0], X.shape[1] * X.shape[2]))  # flatten each square image
    #print('2',X.shape)
    
    ####### scatter plot of X and y
    #plt.xlabel('x')
    #plt.ylabel('y')
    #plt.scatter(X[:, X1], X[:, X2], marker='o', 
    #               c=y, s=25, edgecolor='k', cmap=plt.cm.coolwarm)
    #plt.show()
    
    #### creating sampling_strategy #####
    #lensh=max(lensh,len(secondhalf))
    #print('maxlensh  ======== ',lensh)
    sampling_strategy={}
    #sampling_strategy[0]=Nnofs*list(y).count(0)  
    #sampling_strategy[0]=Nnofs*list(y).count(1)  
    #sampling_strategy[1]=Nnofs*list(y).count(1)  
    print('npycountt:',npycountt)
    sampling_strategy[0]=Nnofs*npycountt
    sampling_strategy[1]=Nnofs*npycountt

    print('sampling_strategy (training set) = ',sampling_strategy)
    
  #  print("counter before oversampling = ", sorted(Counter(y).items()))
    
    
    ##### implementing oversampling ####
    seed = 10
    n_jobs = -1

    if sampler_train == 'SMOTE':
        k = 2
        X_res, y_res = SMOTE(sampling_strategy=sampling_strategy, k_neighbors=k - 1,
                             random_state=seed, n_jobs=n_jobs).fit_resample(X, y)

    if sampler_train == 'BorderlineSMOTE':
        k = 2
        X_res, y_res = imblearn.over_sampling.BorderlineSMOTE(
            sampling_strategy=sampling_strategy, random_state=seed,
            k_neighbors=k, n_jobs=n_jobs).fit_resample(X, y)

    if sampler_train == 'ADASYN':
        k = 3
        X_res, y_res = ADASYN(sampling_strategy=sampling_strategy, random_state=seed,
                              n_neighbors=k + 1, n_jobs=n_jobs).fit_resample(X, y)

    if sampler_train == 'KMeansSMOTE':
        k = 2
        X_res, y_res = KMeansSMOTE(sampling_strategy=sampling_strategy, random_state=seed,
                                   k_neighbors=k + 2, n_jobs=n_jobs).fit_resample(X, y)

    if sampler_train == 'RandomOverSampler':
        k = 2
        X_res, y_res = RandomOverSampler(sampling_strategy=sampling_strategy,
                                         random_state=seed).fit_resample(X, y)

    if sampler_train == 'SVMSMOTE':
        k = 4
        m_neighbors = 2 * k
        X_res, y_res = SVMSMOTE(sampling_strategy=sampling_strategy, random_state=seed,
                                k_neighbors=k, n_jobs=n_jobs).fit_resample(X, y)
    
    
    #### build two-column (one-hot) labels ####
    y_resnn = [[y_res[i], np.abs(y_res[i] - 1)] for i in range(len(y_res))]
    
    
    #plt.xlabel('x')
    #plt.ylabel('y')
    #plt.scatter(X_res[:, X1], X_res[:, X2], marker='o', 
    #               c=y_res, s=25, edgecolor='k', cmap=plt.cm.coolwarm)
    #plt.show()
 #   print("counter before oversampling (trainning set) = ", sorted(Counter(y).items()))
 #   print("counter after oversampling (trainning set) = ", sorted(Counter(y_res).items()))
    dim=int(X_res.shape[1]**0.5)
    X_res=X_res.reshape(X_res.shape[0],dim,dim)
       
    ### report sizes of data before and after oversampling
#    norig=sum([ Counter(y)[keys] for keys in Counter(y) ])
#    print('Total number of data before oversampling:',norig)
#    novsp=sum([ Counter(y_res)[keys] for keys in Counter(y_res) ])
#    print('Total number of data after oversampling:', novsp)
#    print('Ratio of number of data after and before oversampling of trainning data:',novsp/norig)
### here     

    print("counter before oversampling (training set) = ", sorted(Counter(y).items())[0])
    norigsample = sorted(Counter(y).items())[0][1]
    print("counter after oversampling (training set) = ", sorted(Counter(y_res).items()))
    norig = sum([Counter(y)[keys] for keys in Counter(y)])
    print('Total number of data before oversampling:', norigsample)
    #novsp = sum([Counter(y_res)[keys] for keys in Counter(y_res)])
    novsp = Counter(y_res)[0]
    print('Total number of data after oversampling:', novsp)
    print('Ratio of data after to before oversampling of the training data:', novsp / norigsample)

### end here
    ### save oversampled data
    fnres=targetclass+'_ovsp.train.npy'
    fnres_y=targetclass + '_y_ovsp.train.npy'
    fnres_ynn=targetclass + '_ynn_ovsp.train.npy'
    np.save(fnres_y, y_res)
    np.save(fnres_ynn, y_resnn)
    np.save(fnres, X_res)
    ####################################################
    return firsthalf,appendsecondhalf
def kmeans_smote(x, y):
    print("----KMeans SMOTE----")
    sampler = KMeansSMOTE(random_state=42)
    X, y = sampler.fit_resample(x, y)
    return X, y
# The KMeans variant first clusters the data, then generates samples in each
# cluster independently, depending on each cluster's density.

fig, ((ax1, ax2), (ax3, ax4), (ax5, ax6), (ax7, ax8),
      (ax9, ax10)) = plt.subplots(5, 2, figsize=(15, 30))
X, y = create_dataset(n_samples=5000,
                      weights=(0.01, 0.05, 0.94),
                      class_sep=0.8)

ax_arr = ((ax1, ax2), (ax3, ax4), (ax5, ax6), (ax7, ax8), (ax9, ax10))
for ax, sampler in zip(
        ax_arr,
    (SMOTE(random_state=0), BorderlineSMOTE(random_state=0,
                                            kind='borderline-1'),
     BorderlineSMOTE(random_state=0, kind='borderline-2'),
     KMeansSMOTE(random_state=0), SVMSMOTE(random_state=0))):
    clf = make_pipeline(sampler, LinearSVC())
    clf.fit(X, y)
    plot_decision_function(X, y, clf, ax[0])
    ax[0].set_title('Decision function for {}'.format(
        sampler.__class__.__name__))
    plot_resampling(X, y, sampler, ax[1])
    ax[1].set_title('Resampling using {}'.format(sampler.__class__.__name__))
fig.tight_layout()

###############################################################################
# When dealing with a mix of continuous and categorical features, SMOTE-NC
# is the only method that can handle this case.

# create a synthetic data set with continuous and categorical features
rng = np.random.RandomState(42)
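The snippet breaks off here. A hedged sketch of how a mixed-feature example typically continues; the dataset below is illustrative, not the original author's: mark the categorical column for SMOTE-NC and resample.

from imblearn.over_sampling import SMOTENC

n_samples = 50
X = np.empty((n_samples, 3), dtype=object)
X[:, 0] = rng.choice(['A', 'B', 'C'], size=n_samples)  # categorical feature
X[:, 1] = rng.randn(n_samples)                         # continuous feature
X[:, 2] = rng.randint(3, size=n_samples)               # integer-valued feature
y = np.array([0] * 20 + [1] * 30)

smote_nc = SMOTENC(categorical_features=[0], random_state=0)
X_resampled, y_resampled = smote_nc.fit_resample(X, y)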
Example #19
def keans_smote(X, y):
    sm = KMeansSMOTE(random_state=42)
    X_res, y_res = sm.fit_resample(X, y)
    return X_res, y_res
def test_sample_kmeans_not_enough_clusters(data):
    X, y = data
    smote = KMeansSMOTE(cluster_balance_threshold=10, random_state=42)
    with pytest.raises(RuntimeError):
        smote.fit_resample(X, y)
@pytest.mark.parametrize("smote",
                         [BorderlineSMOTE(), SVMSMOTE()],
                         ids=["borderline", "svm"])
def test_smote_m_neighbors(numerical_data, smote):
    # check that m_neighbors is properly set. Regression test for:
    # https://github.com/scikit-learn-contrib/imbalanced-learn/issues/568
    X, y = numerical_data
    _ = smote.fit_resample(X, y)
    assert smote.nn_k_.n_neighbors == 6
    assert smote.nn_m_.n_neighbors == 11


@pytest.mark.parametrize(
    "smote, neighbor_estimator_name",
    [
        (ADASYN(random_state=0), "n_neighbors"),
        (BorderlineSMOTE(random_state=0), "k_neighbors"),
        (KMeansSMOTE(random_state=1), "k_neighbors"),
        (SMOTE(random_state=0), "k_neighbors"),
        (SVMSMOTE(random_state=0), "k_neighbors"),
    ],
    ids=["adasyn", "borderline", "kmeans", "smote", "svm"],
)
def test_numerical_smote_custom_nn(numerical_data, smote,
                                   neighbor_estimator_name):
    X, y = numerical_data
    params = {
        neighbor_estimator_name: _CustomNearestNeighbors(n_neighbors=5),
    }
    smote.set_params(**params)
    X_res, _ = smote.fit_resample(X, y)

    assert X_res.shape[0] >= 120
print(y.value_counts())
y = np.ravel(y)
print(y.shape)
X_train, X_test, y_train, y_test = train_test_split(x,
                                                    y,
                                                    test_size=0.2,
                                                    stratify=y,
                                                    random_state=42)
# forest.fit(X_train, y_train)
# print("Original set\n{}".format(classification_report(y_test, forest.predict(X_test))))
pca = PCA(n_components=2)
# Fit and transform x to visualise inside a 2D feature space
X_vis = pca.fit_transform(X_train)

# Apply KMeansSMOTE over-sampling
ada = KMeansSMOTE(random_state=42)
X_resampled, y_resampled = ada.fit_resample(X_train, y_train)
y_resampled = np.ravel(y_resampled)
forest.fit(X_resampled, y_resampled)
print(Counter(y_resampled))
print(y_resampled.shape)
X_res_vis = pca.transform(X_resampled)

print("KMeansSMOTE\n{}".format(
    classification_report(y_test, forest.predict(X_test))))

f, (ax1, ax2) = plt.subplots(1, 2)

c0 = ax1.scatter(X_vis[y_train == 0, 0],
                 X_vis[y_train == 0, 1],
                 label="Class #0",
Example #23
x_train, x_test, y_train, y_test = train_test_split(X_resampled,
                                                    y_resampled,
                                                    test_size=0.2)
model_resample = lr.fit(x_train, y_train)
y_predict = model_resample.predict(x_test)
print(classification_report(y_test, y_predict))
from imblearn.over_sampling import BorderlineSMOTE
sm = BorderlineSMOTE(random_state=2020)
X_res, y_res = sm.fit_resample(x, y)
x_train, x_test, y_train, y_test = train_test_split(X_res,
                                                    y_res,
                                                    test_size=0.2)
model_resample = lr.fit(x_train, y_train)
y_predict = model_resample.predict(x_test)
print(classification_report(y_test, y_predict))
from imblearn.over_sampling import KMeansSMOTE
sm = KMeansSMOTE(random_state=2020, cluster_balance_threshold=0.1)
X_res, y_res = sm.fit_resample(x, y)
x_train, x_test, y_train, y_test = train_test_split(X_res,
                                                    y_res,
                                                    test_size=0.2)
model_resample = lr.fit(x_train, y_train)
y_predict = model_resample.predict(x_test)
print(classification_report(y_test, y_predict))
from imblearn.over_sampling import SVMSMOTE
sm = SVMSMOTE(random_state=2020)
X_res, y_res = sm.fit_resample(x, y)
x_train, x_test, y_train, y_test = train_test_split(X_res,
                                                    y_res,
                                                    test_size=0.2)
model_resample = lr.fit(x_train, y_train)
y_predict = model_resample.predict(x_test)
    '''SMOTE optimization'''

    best_params_smote = svc_param_selection(Xtrain, ytrain, 5)
    SVM_smote = svm.SVC(kernel='rbf', C=best_params_smote['C'], gamma=best_params_smote['gamma'],
                        class_weight='balanced')

    print('ideal C value for SVM SMOTE', best_params_smote['C'], 'ideal gamma value for SVM SMOTE', best_params_smote['gamma'])

    border_sm = BorderlineSMOTE(k_neighbors=27, random_state=91, sampling_strategy=1)

    sm = SVMSMOTE(random_state=91, k_neighbors=2, sampling_strategy=1, svm_estimator=SVM_smote)

    ada = ADASYN(random_state=91, n_neighbors=27, sampling_strategy=1, n_jobs=6)

    Kmeans = KMeansSMOTE(random_state=91, k_neighbors=2, sampling_strategy=1, n_jobs=6,
                         kmeans_estimator=MiniBatchKMeans(n_clusters=20))

    '''Synthetic sampling'''

    #Xtrain, ytrain = SMOTE().fit_resample(Xtrain, ytrain)
    Xtrain, ytrain = border_sm.fit_resample(Xtrain, ytrain)

    '''Feature selection'''

    # rel_MI = SelectKBest(score_func=score_func, k=num_features)
    # Xtrain = rel_MI.fit_transform(Xtrain, ytrain)
    # Xtest = rel_MI.transform(Xtest)
    # rel_MI_support = rel_MI.get_support()
    # rel_MI_feature = X_frame.loc[:, rel_MI_support].columns.tolist()
    # rel_MI_scores = rel_MI.scores_[rel_MI_support].tolist()
    # feature_selection_df = pd.DataFrame({'Feature': rel_MI_feature, 'Score':rel_MI_scores})
    adaBoost.fit(X_train, y_train)
    res = adaBoost.predict(features[test_index])
    bl_smote_scores['AB'] += metrics.f1_score(res, target[test_index])
    bl_smote_con_mat['AB'] += confusion_matrix(y_true=target[test_index],
                                               y_pred=res)

    # Gradient Boost Classifier
    gradBoost = GradientBoostingClassifier(random_state=0)
    gradBoost.fit(X_train, y_train)
    res = gradBoost.predict(features[test_index])
    bl_smote_scores['GB'] += metrics.f1_score(res, target[test_index])
    bl_smote_con_mat['GB'] += confusion_matrix(y_true=target[test_index],
                                               y_pred=res)

    # K-Means Smote
    km_smote = KMeansSMOTE(random_state=0)
    X_train, y_train = km_smote.fit_resample(features[train_index],
                                             target[train_index])
    # unique, counts = np.unique(y_train, return_counts=True)
    # print("Kmeans uni, count:",np.asarray((unique, counts)).T)

    # Logistic Regression
    logistic = LogisticRegression(random_state=0)
    logistic.fit(X_train, y_train)
    res = logistic.predict(features[test_index])
    km_scores['LR'] += metrics.f1_score(res, target[test_index])
    km_con_mat['LR'] += confusion_matrix(y_true=target[test_index], y_pred=res)
    #
    # Ada Boost Classifier
    adaBoost = AdaBoostClassifier(random_state=0)
    adaBoost.fit(X_train, y_train)
Example #26
def fscore(params_org):
    #print(params_org)
    parambk = copy.deepcopy(params_org)
    ifError =0
    global best, HPOalg,params_best, errorcount
    params= params_org['classifier']
    classifier = params.pop('name')
    p_random_state = params.pop('random_state')
    
    if (classifier == 'SVM'):  
        param_value= params.pop('gamma_value')
        if(params['gamma'] == "value"):
            params['gamma'] = param_value
        else:
            pass   
        clf = SVC(max_iter = 10000, cache_size= 700, random_state = p_random_state,**params)
        #max_iter=10000 and cache_size= 700 https://github.com/EpistasisLab/pennai/issues/223
        #maxvalue https://github.com/hyperopt/hyperopt-sklearn/blob/fd718c44fc440bd6e2718ec1442b1af58cafcb18/hpsklearn/components.py#L262
    elif(classifier == 'RF'):        
        clf = RandomForestClassifier(random_state = p_random_state, **params)
    elif(classifier == 'KNN'):
        p_value = params.pop('p')
        if(p_value==0):
            params['metric'] = "chebyshev"
        elif(p_value==1):
            params['metric'] = "manhattan"
        elif(p_value==2):
            params['metric'] = "euclidean"
        else:
            params['metric'] = "minkowski"
            params['p'] = p_value
        #https://github.com/hyperopt/hyperopt-sklearn/blob/fd718c44fc440bd6e2718ec1442b1af58cafcb18/hpsklearn/components.py#L302
        clf = KNeighborsClassifier(**params)
    elif(classifier == 'DTC'):        
        clf = DecisionTreeClassifier(random_state = p_random_state, **params)
    elif(classifier == 'LR'):        
        penalty_solver = params.pop('penalty_solver')
        params['penalty'] = penalty_solver.split("+")[0]
        params['solver'] = penalty_solver.split("+")[1]
        clf = LogisticRegression(random_state = p_random_state, **params)
    #resampling parameter
    p_sub_params= params_org.pop('sub')
    p_sub_type = p_sub_params.pop('type')
    sampler = p_sub_params.pop('smo_grp')
    gmean = []
    if (p_sub_type == 'SMOTE'):
        smo = SMOTE(**p_sub_params)
    elif (p_sub_type == 'ADASYN'):
        smo = ADASYN(**p_sub_params)
    elif (p_sub_type == 'BorderlineSMOTE'):
        smo = BorderlineSMOTE(**p_sub_params)
    elif (p_sub_type == 'SVMSMOTE'):
        smo = SVMSMOTE(**p_sub_params)
    elif (p_sub_type == 'SMOTENC'):
        smo = SMOTENC(**p_sub_params)
    elif (p_sub_type == 'KMeansSMOTE'):
        smo = KMeansSMOTE(**p_sub_params)
    elif (p_sub_type == 'RandomOverSampler'):
        smo = RandomOverSampler(**p_sub_params)
#Undersampling
    elif (p_sub_type == 'TomekLinks'):
        smo = TomekLinks(**p_sub_params)
    elif (p_sub_type == 'ClusterCentroids'):
        if(p_sub_params['estimator']=='KMeans'):
            p_sub_params['estimator']= KMeans(random_state = p_random_state)
        elif(p_sub_params['estimator']=='MiniBatchKMeans'):
            p_sub_params['estimator']= MiniBatchKMeans(random_state = p_random_state)
        smo = ClusterCentroids(**p_sub_params) 
    elif (p_sub_type == 'RandomUnderSampler'):
        smo = RandomUnderSampler(**p_sub_params)
    elif (p_sub_type == 'NearMiss'):
        smo = NearMiss(**p_sub_params)
    elif (p_sub_type == 'InstanceHardnessThreshold'):
        if(p_sub_params['estimator']=='knn'):
            p_sub_params['estimator']= KNeighborsClassifier()
        elif(p_sub_params['estimator']=='decision-tree'):
            p_sub_params['estimator']=DecisionTreeClassifier()
        elif(p_sub_params['estimator']=='adaboost'):
            p_sub_params['estimator']=AdaBoostClassifier()
        elif(p_sub_params['estimator']=='gradient-boosting'):
            p_sub_params['estimator']=GradientBoostingClassifier()
        elif(p_sub_params['estimator']=='linear-svm'):
            p_sub_params['estimator']=CalibratedClassifierCV(LinearSVC())
        elif(p_sub_params['estimator']=='random-forest'):
            p_sub_params['estimator']=RandomForestClassifier(n_estimators=100)
        smo = InstanceHardnessThreshold(**p_sub_params) 
    elif (p_sub_type == 'CondensedNearestNeighbour'):
        smo = CondensedNearestNeighbour(**p_sub_params)
    elif (p_sub_type == 'EditedNearestNeighbours'):
        smo = EditedNearestNeighbours(**p_sub_params)
    elif (p_sub_type == 'RepeatedEditedNearestNeighbours'):
        smo = RepeatedEditedNearestNeighbours(**p_sub_params) 
    elif (p_sub_type == 'AllKNN'):
        smo = AllKNN(**p_sub_params)
    elif (p_sub_type == 'NeighbourhoodCleaningRule'):
        smo = NeighbourhoodCleaningRule(**p_sub_params) 
    elif (p_sub_type == 'OneSidedSelection'):
        smo = OneSidedSelection(**p_sub_params)
#Combine
    elif (p_sub_type == 'SMOTEENN'):
        smo = SMOTEENN(**p_sub_params)
    elif (p_sub_type == 'SMOTETomek'):
        smo = SMOTETomek(**p_sub_params)
    e=''
    try:        
        for train, test in cv.split(X, y):
            if(p_sub_type=='NO'):
                X_smo_train, y_smo_train = X[train], y[train]
            else:
                X_smo_train, y_smo_train = smo.fit_resample(X[train], y[train])
            y_test_pred = clf.fit(X_smo_train, y_smo_train).predict(X[test])
            gm = geometric_mean_score(y[test], y_test_pred, average='binary')
            gmean.append(gm)
        mean_g=np.mean(gmean)
    except Exception as eec:
        e=eec
        mean_g = 0
        ifError =1 
        errorcount = errorcount+1
    gm_loss = 1 - mean_g
    abc=time.time()-starttime
    if mean_g > best:
        best = mean_g
        params_best = copy.deepcopy(parambk)
    return {'loss': gm_loss,
            'mean': mean_g,
            'status': STATUS_OK,         
            # -- store other results like this
            'run_time': abc,
            'iter': iid,
            'current_best': best,
            'eval_time': time.time(),            
            'SamplingGrp': sampler,
            'SamplingType': p_sub_type,
            'ifError': ifError,
            'Error': e,
            'params' : parambk,
            'attachments':
                {'time_module': pickle.dumps(time.time)}
           }   
Example #27
# The KMeans variant clusters the data first and generates new samples in each
# cluster, depending on the cluster density.

# %%
from imblearn.over_sampling import BorderlineSMOTE, KMeansSMOTE, SVMSMOTE

X, y = create_dataset(n_samples=5000,
                      weights=(0.01, 0.05, 0.94),
                      class_sep=0.8)

fig, axs = plt.subplots(5, 2, figsize=(15, 30))

samplers = [
    SMOTE(random_state=0),
    BorderlineSMOTE(random_state=0, kind="borderline-1"),
    BorderlineSMOTE(random_state=0, kind="borderline-2"),
    KMeansSMOTE(random_state=0),
    SVMSMOTE(random_state=0),
]

for ax, sampler in zip(axs, samplers):
    model = make_pipeline(sampler, clf).fit(X, y)
    plot_decision_function(
        X,
        y,
        clf,
        ax[0],
        title=f"Decision function for {sampler.__class__.__name__}")
    plot_resampling(X, y, sampler, ax[1])

fig.suptitle("Decision function and resampling using SMOTE variants")
fig.tight_layout()
Example #28
"""

plt.figure(figsize=(20,5))
plt.title('Imbalanced classes of the Churn variable', size=15)
sns.countplot(x='Churn', data=churn2)
plt.xlabel('Classes', size=15)
plt.ylabel('');

"""Rebalanceamento das classes com vários algoritmos de reamostragem de dados. Aqui irei aplicar model0s de *Oversampling*, de *Undersampling* e dessas duas técnicas de forma combinada."""

# Oversampling algorithms
X1,y1=SMOTE().fit_resample(X,y)
X2,y2=ADASYN().fit_resample(X,y)
X3,y3=BorderlineSMOTE().fit_resample(X,y)
X4,y4=SVMSMOTE().fit_resample(X,y)
X5,y5=KMeansSMOTE().fit_resample(X,y)
X6,y6=SMOTEN().fit_resample(X,y)
#X7,y7=SMOTENC().fit_resample(X,y)
X8,y8=RandomOverSampler().fit_resample(X,y)

# Undersampling algorithms
X9,y9=RandomUnderSampler().fit_resample(X,y)
X10,y10=NearMiss().fit_resample(X,y)
X11,y11=EditedNearestNeighbours().fit_resample(X,y)
X12,y12=RepeatedEditedNearestNeighbours().fit_resample(X,y)
X13,y13=AllKNN().fit_resample(X,y)
#X14,y14=CondensedNearestNeighbour().fit_resample(X,y)
X15,y15=OneSidedSelection().fit_resample(X,y)
X16,y16=NeighbourhoodCleaningRule().fit_resample(X,y)
X17,y17=InstanceHardnessThreshold().fit_resample(X,y)
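A hedged way to inspect what the resamplers above did to the class balance, reusing the numbered pairs from this cell:

from collections import Counter

print("original          :", Counter(y))
print("KMeansSMOTE       :", Counter(y5))
print("RandomUnderSampler:", Counter(y9))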
Example #29
    y = data[col[-1]]
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.3,
                                                        random_state=10)
    imp = SimpleImputer(strategy='mean')  # mean univariate imputation
    X_train = imp.fit_transform(X_train)  # impute the training set
    X_test = imp.transform(X_test)  # impute the test set

    prep = StandardScaler()
    X_train = prep.fit_transform(X_train)
    X_test = prep.transform(X_test)

    ops_ada = ADASYN(random_state=10)
    ops_bsmote = BorderlineSMOTE(random_state=10)
    ops_ksmote = KMeansSMOTE(random_state=10)
    ops_rs = RandomOverSampler(random_state=10)
    ops_s = SMOTE(random_state=10)

    X_train_ada, y_train_ada = ops_ada.fit_resample(X_train, y_train)
    X_train_bsmote, y_train_bsmote = ops_bsmote.fit_resample(X_train, y_train)
    X_train_ksmote, y_train_ksmote = ops_ksmote.fit_resample(X_train, y_train)
    X_train_rs, y_train_rs = ops_rs.fit_resample(X_train, y_train)
    X_train_s, y_train_s = ops_s.fit_resample(X_train, y_train)

    dic_ = {
        'ADASYN': [X_train_ada, y_train_ada],
        'BorderlineSMOTE': [X_train_bsmote, y_train_bsmote],
        'KMeansSMOTE': [X_train_ksmote, y_train_ksmote],
        'RandomOverSampler': [X_train_rs, y_train_rs],
        'SMOTE': [X_train_s, y_train_s]
    }
Example #30
y_vals = dataframe.iloc[:, 18:19]
print(y_vals.value_counts())

pca = PCA(n_components=3)
#X_train =pca.fit_transform(X_train)

y_train = y_train.ravel()
seed = 6

# GET CATEGORICAL FEATURES SEPARATED FROM CONTINUOUS, scale continuous, SMOTENC with all
smote_value = 0.55
print("smote value is " + str(smote_value))

sm = KMeansSMOTE(random_state=seed,
                 sampling_strategy=smote_value,
                 cluster_balance_threshold=0.3)
rfe = RFE(estimator=DecisionTreeClassifier(), n_features_to_select=17)
inpt = 17


def create_model(x):
    def bm():
        clf = Sequential()
        clf.add(Dense(9, activation='relu', input_dim=x))
        clf.add(Dense(9, activation='relu'))
        clf.add(Dense(2, activation='sigmoid'))
        clf.compile(loss='categorical_crossentropy', optimizer=SGD())
        return clf

    return bm
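A hedged usage sketch for the closure above: create_model returns a build function, the pattern expected by wrappers such as Keras' scikit-learn adapters (an assumed consumer, not shown in the snippet).

build_fn = create_model(inpt)  # inpt = 17 input features, as set above
net = build_fn()
net.summary()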