Code example #1
def get_models():
    models, names = list(), list()
    # SMOTEENN
    sampling = SMOTEENN(enn=EditedNearestNeighbours(
        sampling_strategy='majority'))
    model = LogisticRegression(solver='liblinear')
    steps = [('e', sampling), ('m', model)]
    models.append(Pipeline(steps=steps))
    names.append('LR')
    # SMOTEENN + Norm
    sampling = SMOTEENN(enn=EditedNearestNeighbours(
        sampling_strategy='majority'))
    model = LogisticRegression(solver='liblinear')
    steps = [('t', MinMaxScaler()), ('e', sampling), ('m', model)]
    models.append(Pipeline(steps=steps))
    names.append('Norm')
    # SMOTEENN + Std
    sampling = SMOTEENN(enn=EditedNearestNeighbours(
        sampling_strategy='majority'))
    model = LogisticRegression(solver='liblinear')
    steps = [('t', StandardScaler()), ('e', sampling), ('m', model)]
    models.append(Pipeline(steps=steps))
    names.append('Std')
    # SMOTEENN + Power
    sampling = SMOTEENN(enn=EditedNearestNeighbours(
        sampling_strategy='majority'))
    model = LogisticRegression(solver='liblinear')
    steps = [('t1', MinMaxScaler()), ('t2', PowerTransformer()),
             ('e', sampling), ('m', model)]
    models.append(Pipeline(steps=steps))
    names.append('Power')
    return models, names
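A minimal usage sketch for get_models() above, evaluating each pipeline with repeated stratified cross-validation. The dataset X, y, the evaluate_models name, and the F1 scoring choice are assumptions for illustration, not part of the original example.

from numpy import mean, std
from sklearn.model_selection import RepeatedStratifiedKFold, cross_val_score

def evaluate_models(X, y):
    # Each pipeline resamples with SMOTEENN inside the CV folds only
    models, names = get_models()
    for model, name in zip(models, names):
        cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
        scores = cross_val_score(model, X, y, scoring='f1', cv=cv, n_jobs=-1)
        print('%s: %.3f (%.3f)' % (name, mean(scores), std(scores)))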
Code example #2
def test_validate_estimator_deprecation():
    """Test right processing while passing old parameters"""

    X_gt = np.array([[0.11622591, -0.0317206],
                     [1.25192108, -0.22367336],
                     [0.53366841, -0.30312976],
                     [1.52091956, -0.49283504],
                     [0.88407872, 0.35454207],
                     [1.31301027, -0.92648734],
                     [-0.41635887, -0.38299653],
                     [1.70580611, -0.11219234],
                     [0.29307743, -0.14670439],
                     [0.84976473, -0.15570176],
                     [0.61319159, -0.11571668],
                     [0.66052536, -0.28246517],
                     [-0.28162401, -2.10400981],
                     [0.83680821, 1.72827342],
                     [0.08711622, 0.93259929]])
    y_gt = np.array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1])

    smt = SMOTEENN(random_state=RND_SEED, n_jobs=-1)
    X_resampled, y_resampled = smt.fit_sample(X, Y)
    assert_array_almost_equal(X_resampled, X_gt)
    assert_array_equal(y_resampled, y_gt)

    smt = SMOTEENN(random_state=RND_SEED, k=5)
    X_resampled, y_resampled = smt.fit_sample(X, Y)
    assert_array_almost_equal(X_resampled, X_gt)
    assert_array_equal(y_resampled, y_gt)
Code example #3
def test_error_wrong_object():
    smote = 'rnd'
    enn = 'rnd'
    smt = SMOTEENN(smote=smote, random_state=RND_SEED)
    with raises(ValueError, match="smote needs to be a SMOTE"):
        smt.fit_sample(X, Y)
    smt = SMOTEENN(enn=enn, random_state=RND_SEED)
    with raises(ValueError, match="enn needs to be an "):
        smt.fit_sample(X, Y)
Code example #4
def test_error_wrong_object():
    smote = 'rnd'
    enn = 'rnd'
    smt = SMOTEENN(smote=smote, random_state=RND_SEED)
    assert_raises_regex(ValueError, "smote needs to be a SMOTE",
                        smt.fit_sample, X, Y)
    smt = SMOTEENN(enn=enn, random_state=RND_SEED)
    assert_raises_regex(ValueError, "enn needs to be an ", smt.fit_sample, X,
                        Y)
Code example #5
def test_error_wrong_object():
    """Test either if an error is raised while wrong objects are provided
    at the initialization"""

    # Pass plain strings instead of SMOTE/ENN objects
    smote = 'rnd'
    enn = 'rnd'

    smt = SMOTEENN(smote=smote, random_state=RND_SEED)
    assert_raises(ValueError, smt.fit, X, Y)
    smt = SMOTEENN(enn=enn, random_state=RND_SEED)
    assert_raises(ValueError, smt.fit, X, Y)
Code example #6
def statistic_set(model, df, test_set, k_fold):
    start_time = time.time()

    input_y = list(df['label'])
    input_x = make_set(df)

    if not test_set.empty:
        src_df_res = test_set
        test_x = make_set(src_df_res)
        test_y = list(src_df_res['label'])
        df = df[~df[['SOURCE_ID_1', 'SOURCE_ID_2']].apply(tuple, 1).isin(
            src_df_res[['SOURCE_ID_1', 'SOURCE_ID_2']].apply(tuple, 1))]
        input_y = list(df['label'])
        input_x = make_set(df)
    else:
        input_x, test_x, input_y, test_y = train_test_split(
            input_x, input_y, test_size=0.25, stratify=input_y,
            random_state=43)

    if k_fold == 1:
        cv = KFold(5, shuffle=True, random_state=43)
        for idx_train, idx_test in cv.split(input_x, input_y):
            x_train_list = [input_x[idx] for idx in idx_train]
            y_train_list = [input_y[idx] for idx in idx_train]
            x_test_list = [input_x[idx] for idx in idx_test]
            y_test_list = [input_y[idx] for idx in idx_test]

            x_train_list, y_train_list = SMOTEENN(random_state=0).fit_sample(
                x_train_list, y_train_list)

            clf = model.fit(x_train_list, y_train_list)
            print("score = %.8f" % clf.score(x_test_list, y_test_list))

    input_x, input_y = SMOTEENN(random_state=0).fit_sample(input_x, input_y)

    fin_clf = model.fit(input_x, input_y)
    fin_score = fin_clf.score(test_x, test_y)

    print('final_score')
    print(fin_score)
    res_time = time.time() - start_time
    return fin_clf, fin_score, test_y, fin_clf.predict(test_x), test_x, res_time
Code example #7
def test_senn_multiclass_error():
    """ Test either if an error is raised when the target are not binary
    type. """

    # continuous case
    y = np.linspace(0, 1, 20)
    sm = SMOTEENN(random_state=RND_SEED)
    assert_warns(UserWarning, sm.fit, X, y)

    # multiclass case
    y = np.array([0] * 3 + [1] * 2 + [2] * 15)
    sm = SMOTEENN(random_state=RND_SEED)
    assert_warns(UserWarning, sm.fit, X, y)
Code example #8
def test_validate_estimator_deprecation():
    smt = SMOTEENN(random_state=RND_SEED, n_jobs=-1)
    X_resampled, y_resampled = smt.fit_sample(X, Y)
    X_gt = np.array([[1.52091956, -0.49283504], [0.84976473, -0.15570176],
                     [0.61319159, -0.11571667], [0.66052536, -0.28246518],
                     [-0.28162401, -2.10400981], [0.83680821, 1.72827342],
                     [0.08711622, 0.93259929]])
    y_gt = np.array([0, 0, 0, 0, 1, 1, 1])
    assert_allclose(X_resampled, X_gt, rtol=R_TOL)
    assert_array_equal(y_resampled, y_gt)
    smt = SMOTEENN(random_state=RND_SEED, k=5)
    X_resampled, y_resampled = smt.fit_sample(X, Y)
    assert_allclose(X_resampled, X_gt, rtol=R_TOL)
    assert_array_equal(y_resampled, y_gt)
Code example #9
def test_parallelisation():
    # Check if default job count is 1
    smt = SMOTEENN(random_state=RND_SEED)
    smt._validate_estimator()
    assert smt.n_jobs == 1
    assert smt.smote_.n_jobs == 1
    assert smt.enn_.n_jobs == 1

    # Check if job count is set
    smt = SMOTEENN(random_state=RND_SEED, n_jobs=8)
    smt._validate_estimator()
    assert smt.n_jobs == 8
    assert smt.smote_.n_jobs == 8
    assert smt.enn_.n_jobs == 8
Code example #10
    def use_debug_parameters(self, reduced_selected_features):
        # Define parameters as an array of dicts in case different parameters are used for different optimizations
        params_debug = [
            {
                'scaler': [StandardScaler()],
                'sampling':
                [modelutil.Nosampler(),
                 SMOTE(), SMOTEENN(),
                 ADASYN()],
                'feat__cols': reduced_selected_features[0:2],
                'model__kernel': ['linear'],
                'model__C': [0.1, 1, 10],
                'model__gamma': [0.1, 1, 10],
            },
            {
                'scaler': [StandardScaler(), Normalizer()],
                'sampling': [modelutil.Nosampler()],
                'feat__cols': reduced_selected_features[0:1],
                'model__C': [1],  # default C=1
                'model__kernel': ['rbf'],
                'model__gamma': [1]
                # Only relevant in rbf, default='auto'=1/n_features
            }
        ]

        return params_debug
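A hedged sketch of how a grid like params_debug might be consumed, assuming an imblearn Pipeline whose step names match the grid keys. ColumnSelector is a hypothetical stand-in for the project's 'feat' step, the F1 scoring choice is an assumption, and modelutil.Nosampler comes from the original project.

from imblearn.combine import SMOTEENN
from imblearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

class ColumnSelector(BaseEstimator, TransformerMixin):
    """Hypothetical stand-in for the 'feat' step targeted by 'feat__cols'."""

    def __init__(self, cols=None):
        self.cols = cols

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return X if self.cols is None else X[:, self.cols]

pipe = Pipeline([('scaler', StandardScaler()),
                 ('sampling', SMOTEENN()),
                 ('feat', ColumnSelector()),
                 ('model', SVC())])
# params_debug is the grid returned by use_debug_parameters() above;
# search.fit(X, y) would then resample inside each CV fold only
search = GridSearchCV(pipe, params_debug, scoring='f1', cv=5, n_jobs=-1)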
Code example #11
def SMOTE_ENN(X_train,
              Y_train,
              seed,
              sampling_strategy,
              k_neighbors_smote=5,
              n_neighbors_enn=3,
              kind_sel='all'):
    enn = EditedNearestNeighbours(random_state=seed,
                                  n_jobs=-1,
                                  n_neighbors=n_neighbors_enn,
                                  kind_sel=kind_sel,
                                  sampling_strategy=sampling_strategy)
    smote = SMOTE(random_state=seed,
                  n_jobs=-1,
                  k_neighbors=k_neighbors_smote,
                  sampling_strategy=sampling_strategy)
    smote_enn = SMOTEENN(random_state=seed,
                         smote=smote,
                         enn=enn,
                         sampling_strategy=sampling_strategy)
    print('Before SMOTE + ENN : ', sorted(Counter(Y_train).items()))
    X_train_resampled, Y_train_resampled = smote_enn.fit_resample(
        X_train, Y_train)
    print('After SMOTE + ENN : ', sorted(Counter(Y_train_resampled).items()))

    X_train_resampled, Y_train_resampled = shuffle_dataset(
        X_train_resampled, Y_train_resampled, seed)

    return X_train_resampled, Y_train_resampled
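A minimal usage sketch for the SMOTE_ENN wrapper above, assuming an imbalanced-learn version (roughly 0.4 to 0.5) in which EditedNearestNeighbours still accepts random_state, and assuming the wrapper's own imports are in scope. shuffle_dataset is the project's helper, stubbed here; the toy data are illustrative.

from sklearn.datasets import make_classification
from sklearn.utils import shuffle

def shuffle_dataset(X, y, seed):
    # Stand-in for the project's helper of the same name
    return shuffle(X, y, random_state=seed)

X, y = make_classification(n_samples=400, weights=[0.9, 0.1], random_state=0)
X_res, y_res = SMOTE_ENN(X, y, seed=0, sampling_strategy='auto')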
Code example #12
def smoter(df):
    IDs = df.Quote_ID
    target = df.QuoteConversion_Flag
    data = df.drop(['QuoteConversion_Flag'], axis=1).values
    print("Before SMOTE: ", sorted(Counter(target).items()))

    ####
    # ENN
    ####
    enn = ENN(sampling_strategy="not majority",
              kind_sel="mode",
              n_neighbors=5,
              n_jobs=-1,
              random_state=RANDOM_STATE)
    smote_enn = SMOTEENN(enn=enn, random_state=RANDOM_STATE)
    X_resampled, y_resampled = smote_enn.fit_resample(data, target)
    print("SMOTE ENN: ", sorted(Counter(y_resampled).items()))

    ####
    # Tomeks
    ####
    # smote_tomek = SMOTETomek(random_state=0)
    # X_resampled, y_resampled = smote_tomek.fit_resample(data, target)
    # print("Using SMOTE: ", sorted(Counter(y_resampled).items()))

    data = pd.DataFrame(data=X_resampled, columns=FIELDS)
    target = pd.DataFrame(data=y_resampled, columns=['QuoteConversion_Flag'])

    return data, target
Code example #13
def OverSampling_SMOTE(df):
    df.replace([np.inf, -np.inf], np.nan, inplace=True)

    train_df = df[df['TARGET'].notnull()]
    test_df = df[df['TARGET'].isnull()]

    train_df_X = train_df.drop('TARGET', axis=1)
    train_df_y = train_df.TARGET

    # SMOTE
    print('Creating Smote Data...')
    smote = SMOTE(k_neighbors=5, n_jobs=-1)
    smote_enn = make_pipeline(SimpleImputer(), SMOTEENN(smote=smote))
    X_res, y_res = smote_enn.fit_resample(train_df_X, train_df_y)

    X_res_df = pd.DataFrame(X_res, columns=train_df_X.columns)

    train_df_new = X_res_df.join(y_res.to_frame())

    df = train_df_new.append(test_df)

    # Save data to csv file
    df.to_csv('data/df_prepared_to_model.csv')

    # Save data to pickle file
    df.to_pickle("data/df_prepared_to_model.pkl")

    return df
Code example #14
def smot2(train_x, train_y, feature_columns):

    from imblearn.combine import SMOTEENN
    from imblearn.over_sampling import SMOTE
    from imblearn.under_sampling import TomekLinks
    from imblearn.under_sampling import RandomUnderSampler
    from imblearn.over_sampling import ADASYN
    from sklearn.svm import SVC
    from imblearn.under_sampling import CondensedNearestNeighbour

    print('\nOriginal dataset shape {}'.format(Counter(train_y)))

    # n_neighbors is not a SMOTEENN parameter; the neighbour count belongs on
    # a sub-estimator, here assumed to be SMOTE's k_neighbors
    sm = SMOTEENN(ratio='minority',
                  n_jobs=3,
                  random_state=42,
                  smote=SMOTE(k_neighbors=50))
    #sm = ADASYN(ratio='minority', n_jobs=3,random_state=42,n_neighbors=100)

    #sm = SMOTE(ratio='minority', n_jobs=3, random_state=42,m_neighbors=200)

    #sm = CondensedNearestNeighbour(ratio='majority', random_state=42)

    log.traceLogInfo("\nFIT OF SMOT2 ... rebalancing")
    X_res, y_res = sm.fit_sample(train_x, train_y)

    print('\nResampled dataset shape {}'.format(Counter(y_res)))
    # rebuild the DataFrame
    train_x = pd.DataFrame(X_res, columns=feature_columns)
    train_y = pd.Series(y_res)

    return train_x, train_y
Code example #15
    def __init__(self,  window_size=6, training_ratio=.7, seq="sequence", pos="label"):
        self.training_ratio = training_ratio  # Float value representing % of data used for training
        self.features = []
        self.labels = []
        self.words = []
        self.window_size = window_size
        self.supervised_classifiers = {"forest": RandomForestClassifier(n_jobs=4),
                                       "mlp_adam": MLPClassifier(),
                                       "svc": svm.SVC(verbose=1),
                                       "xgb": XGBClassifier(max_delta_step=5),
                                       "bagging": BaggingClassifier(), "one_class_svm": OneClassSVM(kernel="rbf")
                                       }

        self.imbalance_functions = {"easy_ensemble": EasyEnsemble(), "SMOTEENN": SMOTEENN(),
                                    "SMOTETomek": SMOTETomek(), "ADASYN": ADASYN(),
                                    "random_under_sample": RandomUnderSampler(), "ncl": NeighbourhoodCleaningRule(),
                                    "near_miss": NearMiss(), "pass": -1}
        self.seq = seq
        self.pos = pos
        self.random_data = 0
        self.test_results = 0
        self.vecs = {"sequence": sequence_vector, "chemical": chemical_vector, "binary": binary_vector, "w2v": "w2v"}
        self.vector = 0
        self.features_labels = {}
        self.test_cv = 0
        self.benchmark_mcc = 0
        self.mcc_scorer = make_scorer(matthews_corrcoef)
Code example #16
def SMOTE_methods(df_train, target, method):
    '''The output data has been normalized by MinMaxScaler'''
    scaler = MinMaxScaler()
    X = df_train.drop([target], axis=1)
    y = df_train[target]
    X_normalized = scaler.fit_transform(X)
    if method == 'regular':
        X_res, y_res = SMOTE(kind='regular').fit_sample(X_normalized, y)
    elif method == 'borderline1':
        X_res, y_res = SMOTE(kind='borderline1').fit_sample(X_normalized, y)
    elif method == 'borderline2':
        X_res, y_res = SMOTE(kind='borderline2').fit_sample(X_normalized, y)
    elif method == 'svm':
        X_res, y_res = SMOTE(kind='svm').fit_sample(X_normalized, y)
    elif method == 'Tomek':
        sm = SMOTETomek()
        X_res, y_res = sm.fit_sample(X_normalized, y)
    elif method == 'ENN':
        sm = SMOTEENN()
        X_res, y_res = sm.fit_sample(X_normalized, y)
    else:
        raise ValueError('Invalid method input')
    df_final = pd.DataFrame(X_res, columns=X.columns)
    df_final['target'] = y_res
    return df_final
Code example #17
def test_smote_sample_wt_fit():
    """Test either if an error is raised when sample is called before
    fitting"""

    # Create the object
    smote = SMOTEENN(random_state=RND_SEED)
    assert_raises(RuntimeError, smote.sample, X, Y)
Code example #18
def split_data_resampling(X, y, test_percentage=0.2):
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_percentage, random_state=42)
    smote_enn = SMOTEENN(random_state=0)
    X_train_resampled, y_train_resampled = smote_enn.fit_resample(
        X_train, y_train)
    return X_train_resampled, y_train_resampled, X_test, y_test
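A usage sketch for split_data_resampling above: SMOTEENN is applied to the training split only, so the held-out test set keeps its original class balance. The synthetic dataset is an assumption for illustration.

from collections import Counter
from sklearn.datasets import make_classification

X, y = make_classification(n_samples=1000, weights=[0.95, 0.05],
                           random_state=42)
X_tr, y_tr, X_te, y_te = split_data_resampling(X, y, test_percentage=0.2)
print('train after SMOTEENN:', sorted(Counter(y_tr).items()))
print('test left untouched :', sorted(Counter(y_te).items()))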
Code example #19
File: preprocessor.py  Project: Navpreet2289/code
    def __init__(self,
                 companiesPath,
                 riskPath,
                 target='is_fraud',
                 oversampler=SMOTEENN(),
                 testKwargs={
                     'test_size': .3,
                     'random_state': 22
                 },
                 auto=True):

        rawData = Reader(companiesPath, riskPath)
        self.fitRiskData = RiskData().fit(rawData.dfRisk)
        dfRisk = self.fitRiskData.transform(rawData.dfRisk)
        df = rawData.dfCompanies.merge(dfRisk, how='left', on='company_id')

        train, test = tts(df, stratify=df[target], **testKwargs)

        self.partition_data(train, test, target, oversampler)

        self.results = {}
        self.estimators = {}
        self.featureEval = None

        if auto:
            self.build(build_param_grid())
            self.evaluate()
Code example #20
def Balance_classes(X_train, y_train, Sampling_Function):
    if Sampling_Function == 'RandomUnderSampler':
        us = RandomUnderSampler(ratio=0.5, random_state=1)
    elif Sampling_Function == 'NearMiss1':
        us = NearMiss(ratio=0.5, random_state=1, version=1, size_ngh=3)
    elif Sampling_Function == 'NearMiss2':
        us = NearMiss(ratio=0.5, random_state=1, version=2, size_ngh=3)
    elif Sampling_Function == 'NearMiss3':
        us = NearMiss(ratio=0.5, random_state=1, version=3, ver3_samp_ngh=3)
    elif Sampling_Function == 'CondensedNearestNeighbour':
        us = CondensedNearestNeighbour(random_state=1)
    elif Sampling_Function == 'EditedNearestNeighbours':
        us = EditedNearestNeighbours(random_state=1, size_ngh=5)
    elif Sampling_Function == 'RepeatedEditedNearestNeighbours':
        us = RepeatedEditedNearestNeighbours(random_state=1, size_ngh=5)
    elif Sampling_Function == 'TomekLinks':
        us = TomekLinks(random_state=1)
    elif Sampling_Function == 'RandomOverSampler':
        us = RandomOverSampler(ratio=0.5, random_state=1)
    elif Sampling_Function == 'SMOTE':
        us = SMOTE(ratio=0.5, k=5, random_state=1)
    elif Sampling_Function == 'SMOTETomek':
        us = SMOTETomek(ratio=0.5, k=5, random_state=1)
    elif Sampling_Function == 'SMOTEENN':
        us = SMOTEENN(ratio=0.5, k=5, random_state=1, size_ngh=5)
    elif Sampling_Function == 'EasyEnsemble':
        us = EasyEnsemble()
    elif Sampling_Function == 'BalanceCascade_rf':
        us = BalanceCascade(classifier='random-forest', random_state=1)
    elif Sampling_Function == 'BalanceCascade_svm':
        us = BalanceCascade(classifier='linear-svm', random_state=1)
    else:
        raise ValueError('Unknown Sampling_Function: %s' % Sampling_Function)

    X_train_res, y_train_res = us.fit_sample(X_train, y_train)

    return X_train_res, y_train_res
Code example #21
File: main_nba.py  Project: tonysy/MachineLearningHW
def main():
    data_x, data_y = read_mat(path='./data/nbadata.mat')
    # import pdb; pdb.set_trace()
    # print('------- Newton Method------')
    sm = SMOTEENN()
    # syn_data_x, syn_data_y = sm.fit_sample(data_x, data_y)
    syn_data_x, syn_data_y = data_aug(data_x, data_y)
    mean = np.mean(data_x, axis=1)
    mean = np.expand_dims(mean, axis=1)

    std = np.std(data_x, axis=1)
    std = np.expand_dims(std, axis=1)
    data_x = (data_x - mean) / std
    # syn_data_x, syn_data_y = data_aug(data_x,data_y)
    # import pdb; pdb.set_trace()
    mean_syn = np.mean(syn_data_x, axis=1)
    mean_syn = np.expand_dims(mean_syn, axis=1)

    std_syn = np.std(syn_data_x, axis=1)
    std_syn = np.expand_dims(std_syn, axis=1)
    syn_data_x = (syn_data_x - mean_syn) / std_syn

    # import pdb; pdb.set_trace()
    model = LogisticRegression(x_data=np.vstack([data_x, syn_data_x]),
                               y_data=np.hstack([data_y, syn_data_y]),
                               original_x=data_x,
                               original_y=data_y)

    # model = LogisticRegression(x_data=syn_data_x, y_data=syn_data_y, original_x=data_x, original_y=data_y)
    # model = LogisticRegression(x_data=data_x, y_data=data_y, original_x=data_x, original_y=data_y)
    # model.model_fit(method='NewtonMethod',lr=5e-2,error_bound=1e-1)
    # model.model_fit(method='GradientDescent',lr=1e-3, error_bound=1e-2)
    model.model_fit(method='BFGS', lr=1e-1, error_bound=1e-1)
Code example #22
def smote_test():
    # Read the data from the test dataset
    truth_df = pd.read_hdf('D:\\kpi\\1.hdf')
    # print(truth_df["KPI ID"])
    kpi_names = truth_df['KPI ID'].values
    truth = truth_df[truth_df["KPI ID"] == kpi_names[0]]
    y = truth['label']

    X = truth.drop(columns=['label', 'KPI ID'])
    sm = SMOTEENN()
    X_resampled, y_resampled = sm.fit_sample(X, y)

    dfX = pd.DataFrame(X_resampled, columns=['timestamp', 'value'])
    DFy = pd.DataFrame(y_resampled, columns=['label'])

    plt.plot(np.array(X['timestamp']),
             np.array(X['value']),
             color='green',
             label='training accuracy')
    plt.legend()  # show the legend
    plt.show()

    dfX = dfX.join(DFy).sort_values(by="timestamp", ascending=True)

    plt.plot(np.array(dfX['timestamp']),
             np.array(dfX['value']),
             color='red',
             label='training accuracy')
    plt.legend()  # show the legend
    plt.show()
Code example #23
def test_validate_estimator_init():
    """Test right processing while passing objects as initialization"""

    # Create a SMOTE and an ENN object
    smote = SMOTE(random_state=RND_SEED)
    enn = EditedNearestNeighbours(random_state=RND_SEED)

    smt = SMOTEENN(smote=smote, enn=enn, random_state=RND_SEED)

    X_resampled, y_resampled = smt.fit_sample(X, Y)

    X_gt = np.array([[0.11622591, -0.0317206],
                     [1.25192108, -0.22367336],
                     [0.53366841, -0.30312976],
                     [1.52091956, -0.49283504],
                     [0.88407872, 0.35454207],
                     [1.31301027, -0.92648734],
                     [-0.41635887, -0.38299653],
                     [1.70580611, -0.11219234],
                     [0.29307743, -0.14670439],
                     [0.84976473, -0.15570176],
                     [0.61319159, -0.11571668],
                     [0.66052536, -0.28246517],
                     [-0.28162401, -2.10400981],
                     [0.83680821, 1.72827342],
                     [0.08711622, 0.93259929]])
    y_gt = np.array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1])
    assert_array_almost_equal(X_resampled, X_gt)
    assert_array_equal(y_resampled, y_gt)
Code example #24
    def fit(self, c_data, x_data, y_data):
        # this is to track evolution of the size of the training samples
        self.samplesize = []
        self.samplesize.append(len(x_data))

        if self.reject_by_calendar:
            mask = self.mask_cal(c_data, y_data)
            # filter rows rejected by this calendar criteria
            # not filtering them might improve second classifier training
            #x_data = normalize(x_data[mask])
            #y_data = y_data[mask]
            self.samplesize.append(len(x_data))

        if self.use_resampling:
            # undersample
            resampler = AllKNN()
            x_data, y_data = resampler.fit_sample(x_data, y_data)
            self.samplesize.append(len(x_data))

            # oversample
            resampler = SMOTEENN()
            x_data, y_data = resampler.fit_sample(x_data, y_data)
            self.samplesize.append(len(x_data))

        # train clf only with filtered and resampled data
        if self.use_weights:
            try:
                self.clf.fit(x_data, y_data, self.get_weights(y_data))
            except TypeError:
                print "The classifier selected does not admit weights for training samples"
                print "Switching to no weights"
                self.use_weights = False
                self.clf.fit(x_data, y_data)
        else:
            self.clf.fit(x_data, y_data)
Code example #25
    def use_debug_parameters(self, reduced_selected_features):
        ### XGBOOST CODE start
        params_debug = [{
            'scaler': [StandardScaler()],
            'sampling': [modelutil.Nosampler(),
                         SMOTE(),
                         SMOTEENN(),
                         ADASYN()],
            'feat__cols':
            reduced_selected_features[0:2],
            'model__nthread':
            [4],  # when use hyperthread, xgboost may become slower
            'model__objective': ['binary:logistic'],
            'model__learning_rate': [0.05, 0.5],  # so called `eta` value
            'model__max_depth': [6, 7, 8],
            'model__min_child_weight': [11],
            'model__silent': [1],
            'model__subsample': [0.8],
            'model__colsample_bytree': [0.7],
            'model__n_estimators':
            [5, 10],  # number of trees, change it to 1000 for better results
            'model__missing': [-999],
            'model__seed': [1337]
        }]

        return params_debug
Code example #26
File: evaluation.py  Project: wsgan001/SoBA
def sampling(X, y):
    debug('Started sampling')
    lists = []
    names = []
    lists.append((X, y))
    names.append('original')

    ### oversampling
    query_time = time.time()
    pp = SMOTE(kind='regular')
    X_pp, y_pp = pp.fit_sample(X, y)
    lists.append((X_pp, y_pp))
    names.append('over-SMOTE')
    process_time = int(time.time() - query_time)
    debug('Finished sampling SMOTE in {} seconds'.format(process_time))

    ### undersampling
    # query_time = time.time()
    # pp = EditedNearestNeighbours()
    # X_pp, y_pp = pp.fit_sample(X, y)
    # lists.append((X_pp, y_pp))
    # names.append('under-ENN')
    # process_time = int(time.time() - query_time)
    # debug('Finished sampling ENN in {} seconds'.format(process_time))

    ### oversampling + undersampling
    query_time = time.time()
    pp = SMOTEENN()
    X_pp, y_pp = pp.fit_sample(X, y)
    lists.append((X_pp, y_pp))
    names.append('over+under-SMOTE-ENN')
    process_time = int(time.time() - query_time)
    debug('Finished sampling SMOTE-ENN in {} seconds'.format(process_time))

    return lists, names
Code example #27
def resampling(X_train, y_train):
    from imblearn.combine import SMOTEENN
    sm = SMOTEENN()
    print('dataset shape {}'.format(Counter(y_train)))
    X_train, y_train = sm.fit_sample(X_train, y_train)
    print('Resampled dataset shape {}'.format(Counter(y_train)))
    return X_train, y_train
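A minimal usage sketch for resampling() above. fit_sample is the pre-0.4 imbalanced-learn API (fit_resample on current versions), Counter is assumed to be imported in the surrounding module, and the toy dataset is an assumption for illustration.

from collections import Counter
from sklearn.datasets import make_classification

X, y = make_classification(n_samples=300, weights=[0.9, 0.1], random_state=0)
X_bal, y_bal = resampling(X, y)  # prints the class counts before and after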
Code example #28
class ResamplingAlgorithms(Enum):
    RO = ("Random Over-sampling", RandomOverSampler(random_state=1))
    SMOTE = ("Smote", SMOTE(random_state=1))
    ADASYN = ("ADASYN", ADASYN(random_state=1))
    SMOTE_TL = ('SMOTE+TL', SMOTETomek(random_state=1))
    SMOTE_ENN = ('SMOTE+ENN', SMOTEENN(random_state=1))
    SMOTE_BOOST = ("SMOTEBoost", smote_boost.SMOTEBoost())
    RU = ("Random Under-sampling", RandomUnderSampler(random_state=1))
    CLUSTERCENTROIDS = ("ClusterCentroids", ClusterCentroids(random_state=1))
    TOMEK_LINKS = ("TomekLinks", TomekLinks())
    NM1 = ("NM1", NearMiss(version=1))
    NM2 = ("NM2", NearMiss(version=2))
    NM3 = ("NM3", NearMiss(version=3))
    CNN = ("CNN", CondensedNearestNeighbour(random_state=1))
    OSS = ("OneSidedSelection", OneSidedSelection(random_state=1))
    ENN = ('ENN', EditedNearestNeighbours())
    NCL = ('NCL', NeighbourhoodCleaningRule())
    IHT = ('IHT', (InstanceHardnessThreshold(random_state=1)))
    RENN = ('RENN', RepeatedEditedNearestNeighbours())
    AllKNN = ('AllKNN', AllKNN())

    @classmethod
    def get_algorithm_by_name(cls, name):
        filtered_algos = filter(lambda ra: ra.value[0] == name,
                                ResamplingAlgorithms)
        return next(filtered_algos, ResamplingAlgorithms.RO)
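A usage sketch for the enum's lookup helper above, assuming imbalanced-learn >= 0.4 so the samplers expose fit_resample; the toy data are illustrative.

from collections import Counter
from sklearn.datasets import make_classification

X, y = make_classification(n_samples=500, weights=[0.9, 0.1], random_state=1)
name, sampler = ResamplingAlgorithms.get_algorithm_by_name('SMOTE+ENN').value
X_res, y_res = sampler.fit_resample(X, y)
print(name, sorted(Counter(y_res).items()))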
Code example #29
def over_sampling(x_train, y_train):
    print()
    print("Doing over sampling...")
    print("Before over sampling:")
    class0_num = np.sum(y_train == 0)
    class1_num = np.sum(y_train == 1)
    class2_num = np.sum(y_train == 2)
    print("#Sample in Class 0: {}".format(class0_num))
    print("#Sample in Class 1: {}".format(class1_num))
    print("#Sample in Class 2: {}".format(class2_num))
    # Using SMOTE: https://imbalanced-learn.readthedocs.io/en/stable/generated/imblearn.over_sampling.SMOTE.html
    # an Over-sampling approach
    # Over sampling on training and validation data
    # sm = SMOTE(sampling_strategy='auto', random_state=10)
    # sm = SVMSMOTE(random_state=0)
    sm = SMOTEENN(random_state=0)
    # sm = SMOTETomek(ratio='auto')
    x_train, y_train = sm.fit_resample(x_train, y_train)

    # x_train, y_train = sm.fit_resample(x_train, y_train)
    # X_train, X_val, y_train, y_val = train_test_split(X_train,y,test_size=0.2,random_state=7)
    x_out = x_train
    y_out = y_train

    print("After over sampling:")
    class0_num = np.sum(y_out == 0)
    class1_num = np.sum(y_out == 1)
    class2_num = np.sum(y_out == 2)
    print("#Sample in Class 0: {}".format(class0_num))
    print("#Sample in Class 1: {}".format(class1_num))
    print("#Sample in Class 2: {}".format(class2_num))

    return x_out, y_out
Code example #30
def resample_dataset(df, feature_list, repo_type):
    num_rows = len(df.index)  # number of rows in <df>
    num_features = len(feature_list)  # number of feature columns to resample
    cur_row = []  # list to hold the current row of <df>
    feat_val_mat = []  # the matrix (list of lists) to hold all feature values
    counter = 0  # counter for progress

    print "\nResampling data for the " + repo_type + " dataset..."
    for idx, row in tqdm(df.iterrows(),
                         desc="\tProgress"):  # loop <num_rows> times
        counter += 1
        # print_progress(counter, num_rows)
        for j in range(num_features):  # loop <num_features> times
            cur_row.append(
                row[feature_list[j]])  # form list of current row values
        feat_val_mat.append(cur_row)  # append <cur_row> to <feat_val_mat>
        cur_row = []

    # <smote_obj> should over/under-sample both the "NEUTRAL" and "INSECURE" classes
    smote_obj = SMOTEENN(sampling_strategy="all", random_state=99)
    resampled_data, resampled_targets = smote_obj.fit_resample(
        feat_val_mat, list(df["SECU_FLAG"]))

    # recreate the reduced dataframe and re-initialize the label columns
    resampled_df = pd.DataFrame(resampled_data, columns=feature_list)
    resampled_df["SECU_FLAG"] = resampled_targets
    resampled_df["REPO_TYPE"] = [repo_type] * len(resampled_df.index)
    return resampled_df
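A toy invocation of resample_dataset above, assuming the module's own imports (pandas, tqdm, SMOTEENN) are available; the tiny DataFrame and its column values are assumptions for illustration (SMOTE's default k_neighbors=5 needs at least six minority samples).

import pandas as pd

features = ['f1', 'f2']
df = pd.DataFrame({'f1': list(range(30)),
                   'f2': [v * 0.5 for v in range(30)],
                   'SECU_FLAG': ['INSECURE'] * 8 + ['NEUTRAL'] * 22,
                   'REPO_TYPE': ['demo'] * 30})
balanced_df = resample_dataset(df, features, 'demo')
print(balanced_df['SECU_FLAG'].value_counts())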