Example 1
def test_validate_estimator_init():
    """Test right processing while passing objects as initialization"""

    # Create SMOTE and ENN objects
    smote = SMOTE(random_state=RND_SEED)
    enn = EditedNearestNeighbours(random_state=RND_SEED)

    smt = SMOTEENN(smote=smote, enn=enn, random_state=RND_SEED)

    X_resampled, y_resampled = smt.fit_sample(X, Y)

    X_gt = np.array([[0.11622591, -0.0317206],
                     [1.25192108, -0.22367336],
                     [0.53366841, -0.30312976],
                     [1.52091956, -0.49283504],
                     [0.88407872, 0.35454207],
                     [1.31301027, -0.92648734],
                     [-0.41635887, -0.38299653],
                     [1.70580611, -0.11219234],
                     [0.29307743, -0.14670439],
                     [0.84976473, -0.15570176],
                     [0.61319159, -0.11571668],
                     [0.66052536, -0.28246517],
                     [-0.28162401, -2.10400981],
                     [0.83680821, 1.72827342],
                     [0.08711622, 0.93259929]])
    y_gt = np.array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1])
    assert_array_almost_equal(X_resampled, X_gt)
    assert_array_equal(y_resampled, y_gt)
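A note on the API used throughout this collection: fit_sample was the sampler method in early imbalanced-learn releases; it was renamed fit_resample in version 0.4 and the old alias was later removed. A minimal sketch of the same flow against the current API, using a synthetic dataset in place of the X/Y fixtures these tests rely on:

from collections import Counter

from sklearn.datasets import make_classification
from imblearn.combine import SMOTEENN

# Synthetic imbalanced data standing in for the test fixtures X and Y
X, y = make_classification(n_classes=2, weights=[0.1, 0.9],
                           n_samples=200, random_state=0)
print(Counter(y))

smt = SMOTEENN(random_state=0)
# fit_resample is the current name for the fit_sample calls shown above
X_res, y_res = smt.fit_resample(X, y)
print(Counter(y_res))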
Example 2
def test_validate_estimator_default():
    """Test right processing while passing no object as initialization"""

    smt = SMOTEENN(random_state=RND_SEED)

    X_resampled, y_resampled = smt.fit_sample(X, Y)

    X_gt = np.array([[0.11622591, -0.0317206],
                     [1.25192108, -0.22367336],
                     [0.53366841, -0.30312976],
                     [1.52091956, -0.49283504],
                     [0.88407872, 0.35454207],
                     [1.31301027, -0.92648734],
                     [-0.41635887, -0.38299653],
                     [1.70580611, -0.11219234],
                     [0.29307743, -0.14670439],
                     [0.84976473, -0.15570176],
                     [0.61319159, -0.11571668],
                     [0.66052536, -0.28246517],
                     [-0.28162401, -2.10400981],
                     [0.83680821, 1.72827342],
                     [0.08711622, 0.93259929]])
    y_gt = np.array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1])

    assert_array_almost_equal(X_resampled, X_gt)
    assert_array_equal(y_resampled, y_gt)
Example 3
def test_sample_regular():
    """Test sample function with regular SMOTE."""

    # Create the object
    smote = SMOTEENN(random_state=RND_SEED)
    # Fit the data
    smote.fit(X, Y)

    X_resampled, y_resampled = smote.fit_sample(X, Y)

    X_gt = np.array([[0.11622591, -0.0317206],
                     [1.25192108, -0.22367336],
                     [0.53366841, -0.30312976],
                     [1.52091956, -0.49283504],
                     [0.88407872, 0.35454207],
                     [1.31301027, -0.92648734],
                     [-0.41635887, -0.38299653],
                     [1.70580611, -0.11219234],
                     [0.29307743, -0.14670439],
                     [0.84976473, -0.15570176],
                     [0.61319159, -0.11571668],
                     [0.66052536, -0.28246517],
                     [-0.28162401, -2.10400981],
                     [0.83680821, 1.72827342],
                     [0.08711622, 0.93259929]])
    y_gt = np.array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1])
    assert_array_almost_equal(X_resampled, X_gt)
    assert_array_equal(y_resampled, y_gt)
Example 4
def test_sample_regular_half():
    """Test sample function with regular SMOTE and a ratio of 0.5."""

    # Create the object
    ratio = 0.8
    smote = SMOTEENN(ratio=ratio, random_state=RND_SEED)
    # Fit the data
    smote.fit(X, Y)

    X_resampled, y_resampled = smote.fit_sample(X, Y)

    X_gt = np.array([[0.11622591, -0.0317206],
                     [1.25192108, -0.22367336],
                     [0.53366841, -0.30312976],
                     [1.52091956, -0.49283504],
                     [0.88407872, 0.35454207],
                     [1.31301027, -0.92648734],
                     [-0.41635887, -0.38299653],
                     [1.70580611, -0.11219234],
                     [0.36784496, -0.1953161],
                     [-0.28162401, -2.10400981],
                     [0.83680821, 1.72827342],
                     [0.08711622, 0.93259929]])
    y_gt = np.array([0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1])
    assert_array_almost_equal(X_resampled, X_gt)
    assert_array_equal(y_resampled, y_gt)
Example 5
def resampling(X_train, y_train):
    from imblearn.combine import SMOTEENN
    sm = SMOTEENN()
    print('Original dataset shape {}'.format(Counter(y_train)))
    X_train, y_train = sm.fit_sample(X_train, y_train)
    print('Resampled dataset shape {}'.format(Counter(y_train)))
    return X_train, y_train
Example 6
def split_data_resampling(X, y, test_percentage=0.2):
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_percentage, random_state=42)
    smote_enn = SMOTEENN(random_state=0)
    X_train_resampled, y_train_resampled = smote_enn.fit_resample(
        X_train, y_train)
    return X_train_resampled, y_train_resampled, X_test, y_test
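The split-then-resample ordering in split_data_resampling is the important detail: SMOTEENN should only see the training portion, otherwise synthetic samples leak information about the held-out set. A small usage sketch (load_breast_cancer is just an assumed stand-in dataset, not part of the original):

from collections import Counter
from sklearn.datasets import load_breast_cancer

X, y = load_breast_cancer(return_X_y=True)
X_tr, y_tr, X_te, y_te = split_data_resampling(X, y, test_percentage=0.2)
print(Counter(y_tr))  # rebalanced by SMOTEENN
print(Counter(y_te))  # untouched original distribution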
Example 7
def get_smotenn(X_trn, y_trn, seed=int(623 * 449)):
    """
    Resamples using SMOTEENN
    """
    SME = SMOTEENN(random_state=seed)
    X_trn, y_trn = SME.fit_resample(X_trn, y_trn)
    return X_trn, y_trn
Example 8
def over_sampling(x_train, y_train):
    print()
    print("Doing over sampling...")
    print("Before over sampling:")
    class0_num = np.sum(y_train == 0)
    class1_num = np.sum(y_train == 1)
    class2_num = np.sum(y_train == 2)
    print("#Sample in Class 0: {}".format(class0_num))
    print("#Sample in Class 1: {}".format(class1_num))
    print("#Sample in Class 2: {}".format(class2_num))
    # Using SMOTE: https://imbalanced-learn.readthedocs.io/en/stable/generated/imblearn.over_sampling.SMOTE.html
    # an Over-sampling approach
    # Over sampling on training and validation data
    # sm = SMOTE(sampling_strategy='auto', random_state=10)
    # sm = SVMSMOTE(random_state=0)
    sm = SMOTEENN(random_state=0)
    # sm = SMOTETomek(ratio='auto')
    x_train, y_train = sm.fit_resample(x_train, y_train)

    # x_train, y_train = sm.fit_resample(x_train, y_train)
    # X_train, X_val, y_train, y_val = train_test_split(X_train,y,test_size=0.2,random_state=7)
    x_out = x_train
    y_out = y_train

    print("After over sampling:")
    class0_num = np.sum(y_out == 0)
    class1_num = np.sum(y_out == 1)
    class2_num = np.sum(y_out == 2)
    print("#Sample in Class 0: {}".format(class0_num))
    print("#Sample in Class 1: {}".format(class1_num))
    print("#Sample in Class 2: {}".format(class2_num))

    return x_out, y_out
Example 9
def runtree(data, target):
    lb = preprocessing.LabelEncoder()
    lb.fit(target)
    target1 = lb.transform(target)
    sm = SMOTEENN()
    clf = tree.DecisionTreeClassifier()
    folds = [3]
    depths = [10]
    print("------------ TREE ------------")

    for fold in folds:
        # random_state requires shuffle=True in recent scikit-learn
        skf = StratifiedKFold(n_splits=fold, shuffle=True, random_state=5)
        test_target = []
        test_predict = []
        test_proba = []
        test_proba_target = []
        for train_index, test_index in skf.split(data, target1):
            clf_ = clone(clf)
            X_resampled, y_resampled = sm.fit_sample(data[train_index], target1[train_index])
            clf_.fit(X_resampled, y_resampled)
            test_predict.append(clf_.predict(data[test_index]))
            test_target.append(target1[test_index])
            test_proba_target.extend(target1[test_index])
            test_proba.extend(clf_.predict_proba(data[test_index])[:, 1])

        print_scores(test_predict, test_target)
        print(roc_auc_score(y_true=test_proba_target, y_score=test_proba))
Example 10
    def get_simple_train_test_split(self):
        X_train, X_test, y_train, y_test = train_test_split(
            self.X, self.y, test_size=self.test_size, random_state=self.random_state
        )

        if self.missingvals:
            # Impute missing vals with column mean
            imp = SimpleImputer()
            imp.fit(X_train)
            X_train = imp.transform(X_train)
            X_test = imp.transform(X_test)

        if self.balance:
            # Balance out classes
            # Not needed when we use frequency binning!
            balancer = SMOTEENN(random_state=self.random_state)
            X_train, y_train = balancer.fit_resample(X_train, y_train)

        if self.standardize:
            scaler = StandardScaler()
            scaler.fit(X_train)
            X_train = scaler.transform(X_train)
            X_test = scaler.transform(X_test)

        return X_train, y_train, X_test, y_test
Example 11
def get_models():
    models, names = list(), list()
    # SMOTEENN
    sampling = SMOTEENN(enn=EditedNearestNeighbours(
        sampling_strategy='majority'))
    model = LogisticRegression(solver='liblinear')
    steps = [('e', sampling), ('m', model)]
    models.append(Pipeline(steps=steps))
    names.append('LR')
    # SMOTEENN + Norm
    sampling = SMOTEENN(enn=EditedNearestNeighbours(
        sampling_strategy='majority'))
    model = LogisticRegression(solver='liblinear')
    steps = [('t', MinMaxScaler()), ('e', sampling), ('m', model)]
    models.append(Pipeline(steps=steps))
    names.append('Norm')
    # SMOTEENN + Std
    sampling = SMOTEENN(enn=EditedNearestNeighbours(
        sampling_strategy='majority'))
    model = LogisticRegression(solver='liblinear')
    steps = [('t', StandardScaler()), ('e', sampling), ('m', model)]
    models.append(Pipeline(steps=steps))
    names.append('Std')
    # SMOTEENN + Power
    sampling = SMOTEENN(enn=EditedNearestNeighbours(
        sampling_strategy='majority'))
    model = LogisticRegression(solver='liblinear')
    steps = [('t1', MinMaxScaler()), ('t2', PowerTransformer()),
             ('e', sampling), ('m', model)]
    models.append(Pipeline(steps=steps))
    names.append('Power')
    return models, names
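The pipelines returned by get_models are normally scored with stratified cross-validation; because the sampler sits inside an imblearn Pipeline, resampling is re-applied to each training fold only. A plausible evaluation loop, sketched under the assumption of a synthetic imbalanced dataset and ROC AUC scoring (neither is specified by the original):

from numpy import mean
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score, RepeatedStratifiedKFold

# Assumed stand-in data; in practice X, y come from the problem at hand
X, y = make_classification(n_samples=1000, weights=[0.95, 0.05], random_state=1)
models, names = get_models()
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
for model, name in zip(models, names):
    scores = cross_val_score(model, X, y, scoring='roc_auc', cv=cv, n_jobs=-1)
    print('%s %.3f' % (name, mean(scores)))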
Example 12
def SMOTE_ENN(X_train,
              Y_train,
              seed,
              sampling_strategy,
              k_neighbors_smote=5,
              n_neighbors_enn=3,
              kind_sel='all'):
    enn = EditedNearestNeighbours(random_state=seed,
                                  n_jobs=-1,
                                  n_neighbors=n_neighbors_enn,
                                  kind_sel=kind_sel,
                                  sampling_strategy=sampling_strategy)
    smote = SMOTE(random_state=seed,
                  n_jobs=-1,
                  k_neighbors=k_neighbors_smote,
                  sampling_strategy=sampling_strategy)
    smote_enn = SMOTEENN(random_state=seed,
                         smote=smote,
                         enn=enn,
                         sampling_strategy=sampling_strategy)
    print('Before SMOTE + ENN : ', sorted(Counter(Y_train).items()))
    X_train_resampled, Y_train_resampled = smote_enn.fit_resample(
        X_train, Y_train)
    print('After SMOTE + ENN : ', sorted(Counter(Y_train_resampled).items()))

    X_train_resampled, Y_train_resampled = shuffle_dataset(
        X_train_resampled, Y_train_resampled, seed)

    return X_train_resampled, Y_train_resampled
Example 13
def smpote_test():
    # Read the data from the test dataset
    truth_df = pd.read_hdf('D:\\kpi\\1.hdf')
    # print(truth_df["KPI ID"])
    kpi_names = truth_df['KPI ID'].values
    truth = truth_df[truth_df["KPI ID"] == kpi_names[0]]
    y = truth['label']

    X = truth.drop(columns=['label', 'KPI ID'])
    sm = SMOTEENN()
    X_resampled, y_resampled = sm.fit_sample(X, y)

    dfX = pd.DataFrame(X_resampled, columns=['timestamp', 'value'])
    DFy = pd.DataFrame(y_resampled, columns=['label'])

    plt.plot(np.array(X['timestamp']),
             np.array(X['value']),
             color='green',
             label='original series')
    plt.legend()  # show the legend
    plt.show()

    dfX = dfX.join(DFy).sort_values(by="timestamp", ascending=True)

    plt.plot(np.array(dfX['timestamp']),
             np.array(dfX['value']),
             color='red',
             label='resampled series')
    plt.legend()  # show the legend
    plt.show()
Example 14
def test_validate_estimator_deprecation():
    """Test right processing while passing old parameters"""

    X_gt = np.array([[0.11622591, -0.0317206],
                     [1.25192108, -0.22367336],
                     [0.53366841, -0.30312976],
                     [1.52091956, -0.49283504],
                     [0.88407872, 0.35454207],
                     [1.31301027, -0.92648734],
                     [-0.41635887, -0.38299653],
                     [1.70580611, -0.11219234],
                     [0.29307743, -0.14670439],
                     [0.84976473, -0.15570176],
                     [0.61319159, -0.11571668],
                     [0.66052536, -0.28246517],
                     [-0.28162401, -2.10400981],
                     [0.83680821, 1.72827342],
                     [0.08711622, 0.93259929]])
    y_gt = np.array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1])

    smt = SMOTEENN(random_state=RND_SEED, n_jobs=-1)
    X_resampled, y_resampled = smt.fit_sample(X, Y)
    assert_array_almost_equal(X_resampled, X_gt)
    assert_array_equal(y_resampled, y_gt)

    smt = SMOTEENN(random_state=RND_SEED, k=5)
    X_resampled, y_resampled = smt.fit_sample(X, Y)
    assert_array_almost_equal(X_resampled, X_gt)
    assert_array_equal(y_resampled, y_gt)
Example 15
def resample_dataset(df, feature_list, repo_type):
    num_rows = len(df.index)  # number of rows in <df>
    num_features = len(feature_list)  # number of feature columns to resample
    cur_row = []  # list to hold the current row of <df>
    feat_val_mat = []  # the matrix (list of lists) to hold all feature values
    counter = 0  # counter for progress

    print "\nResampling data for the " + repo_type + " dataset..."
    for idx, row in tqdm(df.iterrows(),
                         desc="\tProgress"):  # loop <num_rows> times
        counter += 1
        # print_progress(counter, num_rows)
        for j in range(num_features):  # loop <num_features> times
            cur_row.append(
                row[feature_list[j]])  # form list of current row values
        feat_val_mat.append(cur_row)  # append <cur_row> to <feat_val_mat>
        cur_row = []

    smote_obj = SMOTEENN(
        sampling_strategy="all", random_state=99
    )  # <smote_obj> should over/under-sample both the "NEUTRAL" and "INSECURE" classes
    resampled_data, resampled_targets = smote_obj.fit_resample(
        feat_val_mat, list(df["SECU_FLAG"]))

    resampled_df = pd.DataFrame(
        resampled_data, columns=feature_list)  # recreate the reduced dataframe
    resampled_df[
        "SECU_FLAG"] = resampled_targets  # re-initialize the "SECU_FLAG" column
    resampled_df["REPO_TYPE"] = [repo_type] * len(
        resampled_df.index)  # re-initialize the "REPO_TYPE" column
    return resampled_df
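Incidentally, the iterrows loop in resample_dataset only rebuilds the feature matrix row by row; pandas can produce the same matrix in a single vectorized call, assuming every name in feature_list is a column of df:

# Equivalent to the loop above: one row per sample, one column per feature
feat_val_mat = df[feature_list].values.tolist()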
Example 16
def balanced_train(data, features):
    X = data[features]
    y = data['label']
    from imblearn.combine import SMOTEENN
    smote_enn = SMOTEENN(random_state=42)
    X_resampled, y_resampled = smote_enn.fit_sample(X, y)
    return X_resampled, y_resampled
Example 17
def resample(X, Y, nb_class):
    print("original shape: ", X.shape)
    labels = Y.astype(int)
    counts = np.bincount(labels)

    if len(counts) != nb_class:
        print("there is no samples to interpolate! skip this fold.")
        return X, Y

    class_dist = counts / float(sum(counts))
    print("original dist: ", class_dist)

    org_shape = X.shape
    sampler = SMOTEENN(random_state=0)
    flattend_X = X.reshape(
        (X.shape[0], X.shape[1] * X.shape[2] * X.shape[3] * X.shape[4]))
    X_resampled, Y_resampled = sampler.fit_sample(flattend_X, labels)
    X_resampled = X_resampled.reshape(
        (X_resampled.shape[0], X.shape[1], X.shape[2], X.shape[3], X.shape[4]))
    print("sampled shape: ", X_resampled.shape)

    Y_resampled = Y_resampled.astype(int)
    counts = np.bincount(Y_resampled)
    class_dist = counts / float(sum(counts))
    print("after SMOTEENN dist: ", class_dist)
    return X_resampled, Y_resampled
Example 18
def smoter(df):
    IDs = df.Quote_ID
    target = df.QuoteConversion_Flag
    data = df.drop(['QuoteConversion_Flag'], axis=1).values
    print("Before SMOTE: ", sorted(Counter(target).items()))

    ####
    # ENN
    ####
    enn = ENN(sampling_strategy="not majority",
              kind_sel="mode",
              n_neighbors=5,
              n_jobs=-1,
              random_state=RANDOM_STATE)
    smote_enn = SMOTEENN(enn=enn, random_state=RANDOM_STATE)
    X_resampled, y_resampled = smote_enn.fit_resample(data, target)
    print("SMOTE ENN: ", sorted(Counter(y_resampled).items()))

    ####
    # Tomeks
    ####
    # smote_tomek = SMOTETomek(random_state=0)
    # X_resampled, y_resampled = smote_tomek.fit_resample(data, target)
    # print("Using SMOTE: ", sorted(Counter(y_resampled).items()))

    data = pd.DataFrame(data=X_resampled, columns=FIELDS)
    target = pd.DataFrame(data=y_resampled, columns=['QuoteConversion_Flag'])

    return data, target
Example 19
def smot2(train_x, train_y, feature_columns):

    from imblearn.combine import SMOTEENN
    from imblearn.over_sampling import SMOTE
    from imblearn.under_sampling import TomekLinks
    from imblearn.under_sampling import RandomUnderSampler
    from imblearn.over_sampling import ADASYN
    from sklearn.svm import SVC
    from imblearn.under_sampling import CondensedNearestNeighbour

    print('\nOriginal dataset shape {}'.format(Counter(train_y)))

    # the neighbour count belongs on the SMOTE sub-estimator; SMOTEENN itself
    # has no n_neighbors parameter
    sm = SMOTEENN(ratio='minority',
                  n_jobs=3,
                  random_state=42,
                  smote=SMOTE(k_neighbors=50))
    #sm = ADASYN(ratio='minority', n_jobs=3,random_state=42,n_neighbors=100)

    #sm = SMOTE(ratio='minority', n_jobs=3, random_state=42,m_neighbors=200)

    #sm = CondensedNearestNeighbour(ratio='majority', random_state=42)

    log.traceLogInfo("\nSMOT2 FIT ...rebalancing")
    X_res, y_res = sm.fit_sample(train_x, train_y)

    print('\nResampled dataset shape {}'.format(Counter(y_res)))
    # rebuild the DataFrame
    train_x = pd.DataFrame(X_res, columns=feature_columns)
    train_y = pd.Series(y_res)

    return train_x, train_y
Example 20
def unba_smoteenn(x, y):
    x1 = x.reshape(x.shape[0], -1)  # e.g. 7259 x 480
    smoteenn = SMOTEENN(random_state=0)  # build the SMOTEENN model object
    x1, y1 = smoteenn.fit_resample(x1, y)  # augmented, still 480 columns
    x2 = np.zeros((x1.shape[0], x.shape[1], x.shape[2], 1))
    for i in tqdm(range(x1.shape[0])):
        x2[i, :, :, 0] = np.reshape(x1[i], (60, 8))
    return x2, y1
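The per-sample loop above can also be collapsed into a single reshape, since fit_resample returns a 2-D array whose rows flatten back to the original (60, 8) window. A sketch assuming those fixed dimensions:

import numpy as np
from imblearn.combine import SMOTEENN

def unba_smoteenn_vectorized(x, y):
    # flatten each (60, 8) window into a 480-feature row
    x_flat = x.reshape(x.shape[0], -1)
    x_res, y_res = SMOTEENN(random_state=0).fit_resample(x_flat, y)
    # restore the window shape plus a trailing channel axis
    return x_res.reshape(-1, x.shape[1], x.shape[2], 1), y_res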
Example 21
def train_decisiontree_with(configurationname,
                            train_data,
                            k,
                            score_function,
                            undersam=False,
                            oversam=False,
                            export=False,
                            **kwargs):
    assert k > 0
    print("Training with configuration " + configurationname)
    X_train, y_train, id_to_a_train = train_data

    max_depth = None if "max_depth" not in kwargs else kwargs["max_depth"]

    dtc = DecisionTreeClassifier(criterion="entropy",
                                 random_state=0,
                                 max_depth=max_depth)

    print("Feature Selection")
    # selector = SelectFpr(score_function)
    selector = SelectKBest(score_function, k=k)
    selector = selector.fit(X_train, y_train)

    X_train = selector.transform(X_train)

    fitted_ids = [i for i in selector.get_support(indices=True)]

    print("Apply Resampling")
    print(Counter(y_train))
    if undersam and not oversam:
        renn = RepeatedEditedNearestNeighbours()
        X_train, y_train = renn.fit_resample(X_train, y_train)
    if oversam and not undersam:
        # feature_indices_array = list(range(len(f_to_id)))
        # smote_nc = SMOTENC(categorical_features=feature_indices_array, random_state=0)
        # X_train, y_train = smote_nc.fit_resample(X_train, y_train)
        sm = SMOTE(random_state=42)
        X_train, y_train = sm.fit_resample(X_train, y_train)
    if oversam and undersam:
        smote_enn = SMOTEENN(random_state=0)
        X_train, y_train = smote_enn.fit_resample(X_train, y_train)
    print(Counter(y_train))

    print("Train Classifier")
    dtc = dtc.fit(X_train, y_train, check_input=True)

    if export:
        print("Exporting tree to graph...")
        export_graphviz(dtc,
                        out_file=DATAP + "/temp/trees/sltree_" +
                        configurationname + ".dot",
                        filled=True)
        transform(fitted_ids, configurationname)

    print("Self Accuracy: " + str(dtc.score(X_train, y_train)))

    return selector, dtc
Example 22
def imbalance_undersampling(datafile):
    df = filling_missing(datafile)
    # combine oversampling and undersampling together with SMOTEENN
    smote_enn = SMOTEENN(random_state=0)
    X_resampled, y_resampled = smote_enn.fit_resample(df[features], df.country_destination)
    print(sorted(Counter(y_resampled).items()))
    back = pd.DataFrame(np.hstack((X_resampled, y_resampled[:, None]))) #[516489 rows x 14 columns]
    # print(back)
    return back
Example 23
def SMOTEENN_oversampling(x, y):
    print('Original dataset shape {}'.format(Counter(y)))

    smote_enn = SMOTEENN(random_state=42)
    x_sampled, y_sampled = smote_enn.fit_sample(x, y)

    print('With SMOTEENN sampled dataset shape {}'.format(Counter(y_sampled)))

    return x_sampled, y_sampled
Example 24
def test_error_wrong_object():
    smote = 'rnd'
    enn = 'rnd'
    smt = SMOTEENN(smote=smote, random_state=RND_SEED)
    assert_raises_regex(ValueError, "smote needs to be a SMOTE",
                        smt.fit_sample, X, Y)
    smt = SMOTEENN(enn=enn, random_state=RND_SEED)
    assert_raises_regex(ValueError, "enn needs to be an ", smt.fit_sample, X,
                        Y)
Example 25
def test_sample_wrong_X():
    """Test either if an error is raised when X is different at fitting
    and sampling"""

    # Create the object
    sm = SMOTEENN(random_state=RND_SEED)
    sm.fit(X, Y)
    assert_raises(RuntimeError, sm.sample, np.random.random((100, 40)),
                  np.array([0] * 50 + [1] * 50))
Example 26
def over_sampling(data):
    data = data.drop('aid', axis=1)
    data = data.drop('uid', axis=1)
    y = data['label']
    X = data.drop('label', axis=1)
    sme = SMOTEENN()
    X_res, y_res = sme.fit_sample(X, y)
    # fit_sample returns arrays, so rebuild a labelled DataFrame before saving
    data_res = pd.concat([pd.DataFrame(X_res, columns=X.columns),
                          pd.Series(y_res, name='label')], axis=1)
    data_res.to_csv('./data/train_all_after_overSamlping.csv', index=False)
Example 27
def test_sample_wrong_X():
    """Test either if an error is raised when X is different at fitting
    and sampling"""

    # Create the object
    sm = SMOTEENN(random_state=RND_SEED)
    sm.fit(X, Y)
    assert_raises(RuntimeError, sm.sample, np.random.random((100, 40)),
                  np.array([0] * 50 + [1] * 50))
Example 28
def balance(x, y, randomstate=None, **kwargs):
    # the neighbour count belongs on the ENN sub-estimator, not on SMOTEENN
    # (assumes EditedNearestNeighbours is imported from imblearn.under_sampling)
    sm = SMOTEENN(random_state=randomstate,
                  n_jobs=3,
                  enn=EditedNearestNeighbours(n_neighbors=kwargs['neighbors']))
    print('dataset shape {}'.format(Counter(y)))
    print('Resampling...')
    rx, ry = sm.fit_sample(x, y)
    print('Resampled dataset shape {}'.format(Counter(ry)))
    return rx, ry
Example 29
def test_validate_estimator_default():
    smt = SMOTEENN(random_state=RND_SEED)
    X_resampled, y_resampled = smt.fit_resample(X, Y)
    X_gt = np.array([[1.52091956, -0.49283504],
                     [0.84976473, -0.15570176],
                     [0.61319159, -0.11571667],
                     [0.66052536, -0.28246518],
                     [-0.28162401, -2.10400981],
                     [0.83680821, 1.72827342],
                     [0.08711622, 0.93259929]])
    y_gt = np.array([0, 0, 0, 0, 1, 1, 1])
    assert_allclose(X_resampled, X_gt, rtol=R_TOL)
    assert_array_equal(y_resampled, y_gt)
Example 30
def balancingClassesSmoteenn(x_train, y_train):

    # Using SMOTEENN to balance our training data points
    smn = SMOTEENN(random_state=7)
    features_balanced, target_balanced = smn.fit_resample(x_train, y_train)

    print("Count for each class value after SMOTEEN:",
          collections.Counter(target_balanced))

    return features_balanced, target_balanced
Example 31
def balance_train_data(data):
    print("Start balancing...")
    features, labels = data

    start_time = time.time()
    smote_enn = SMOTEENN(random_state=42)
    features, labels = smote_enn.fit_sample(features, labels)
    print("Balanced dataset:", sorted(Counter(labels).items()))
    print("Balancing time:", time.time() - start_time)
    return (features, labels)
Example 32
def test_validate_estimator_default():
    smt = SMOTEENN(random_state=RND_SEED)
    X_resampled, y_resampled = smt.fit_resample(X, Y)
    X_gt = np.array([[1.52091956, -0.49283504], [0.84976473, -0.15570176],
                     [0.61319159, -0.11571667], [0.66052536, -0.28246518],
                     [-0.28162401, -2.10400981], [0.83680821, 1.72827342],
                     [0.08711622, 0.93259929]])
    y_gt = np.array([0, 0, 0, 0, 1, 1, 1])
    assert_allclose(X_resampled, X_gt, rtol=R_TOL)
    assert_array_equal(y_resampled, y_gt)
Example 33
def test_sample_regular_half():
    ratio = 0.8
    smote = SMOTEENN(ratio=ratio, random_state=RND_SEED)
    X_resampled, y_resampled = smote.fit_sample(X, Y)

    X_gt = np.array([[1.52091956, -0.49283504], [-0.28162401, -2.10400981],
                     [0.83680821, 1.72827342], [0.08711622, 0.93259929]])
    y_gt = np.array([0, 1, 1, 1])
    assert_allclose(X_resampled, X_gt)
    assert_array_equal(y_resampled, y_gt)
Example 34
def smote_enn_sampling(X, Y):
    nsamples, nx, ny = X.shape
    X = X.reshape((nsamples, nx * ny))

    # fit_sample returns exactly two values: the resampled X and Y
    X, Y = SMOTEENN().fit_sample(X, Y)

    nsamples, nfeat = X.shape
    X = X.reshape((nsamples, nx, nfeat // nx))  # integer division keeps the shape valid
    Y = Y.reshape((nsamples, 1))
    return X, Y
Example 35
def test_sample_regular_half():
    sampling_strategy = {0: 10, 1: 12}
    smote = SMOTEENN(
        sampling_strategy=sampling_strategy, random_state=RND_SEED)
    X_resampled, y_resampled = smote.fit_resample(X, Y)

    X_gt = np.array([[1.52091956, -0.49283504], [-0.28162401, -2.10400981],
                     [0.83680821, 1.72827342], [0.08711622, 0.93259929]])
    y_gt = np.array([0, 1, 1, 1])
    assert_allclose(X_resampled, X_gt)
    assert_array_equal(y_resampled, y_gt)
Example 36
def test_validate_estimator_init():
    smote = SMOTE(random_state=RND_SEED)
    enn = EditedNearestNeighbours(sampling_strategy='all')
    smt = SMOTEENN(smote=smote, enn=enn, random_state=RND_SEED)
    X_resampled, y_resampled = smt.fit_resample(X, Y)
    X_gt = np.array([[1.52091956, -0.49283504],
                     [0.84976473, -0.15570176],
                     [0.61319159, -0.11571667],
                     [0.66052536, -0.28246518],
                     [-0.28162401, -2.10400981],
                     [0.83680821, 1.72827342],
                     [0.08711622, 0.93259929]])
    y_gt = np.array([0, 0, 0, 0, 1, 1, 1])
    assert_allclose(X_resampled, X_gt, rtol=R_TOL)
    assert_array_equal(y_resampled, y_gt)
Example 37
def test_smote_fit():
    """Test the fitting method"""

    # Create the object
    smote = SMOTEENN(random_state=RND_SEED)
    # Fit the data
    smote.fit(X, Y)

    # Check if the data information have been computed
    assert_equal(smote.min_c_, 0)
    assert_equal(smote.maj_c_, 1)
    assert_equal(smote.stats_c_[0], 500)
    assert_equal(smote.stats_c_[1], 4500)
Example 38
def test_sample_regular():
    """Test sample function with regular SMOTE."""

    # Create the object
    smote = SMOTEENN(random_state=RND_SEED)
    # Fit the data
    smote.fit(X, Y)

    X_resampled, y_resampled = smote.fit_sample(X, Y)

    currdir = os.path.dirname(os.path.abspath(__file__))
    X_gt = np.load(os.path.join(currdir, 'data', 'smote_enn_reg_x.npy'))
    y_gt = np.load(os.path.join(currdir, 'data', 'smote_enn_reg_y.npy'))
    assert_array_equal(X_resampled, X_gt)
    assert_array_equal(y_resampled, y_gt)
Example 39
def test_sample_regular_pass_smote_enn():
    smote = SMOTEENN(smote=SMOTE(ratio='auto', random_state=RND_SEED),
                     enn=EditedNearestNeighbours(ratio='all',
                                                 random_state=RND_SEED),
                     random_state=RND_SEED)
    X_resampled, y_resampled = smote.fit_sample(X, Y)

    X_gt = np.array([[1.52091956, -0.49283504],
                     [0.84976473, -0.15570176],
                     [0.61319159, -0.11571667],
                     [0.66052536, -0.28246518],
                     [-0.28162401, -2.10400981],
                     [0.83680821, 1.72827342],
                     [0.08711622, 0.93259929]])
    y_gt = np.array([0, 0, 0, 0, 1, 1, 1])
    assert_allclose(X_resampled, X_gt, rtol=R_TOL)
    assert_array_equal(y_resampled, y_gt)
Example 40
    def SMOTE(self, bug_rate, X, Y):
        """
        Combine over- and under-sampling using SMOTE and
        Edited Nearest Neighbours.
        Processes the original dataset with this modified SMOTE.
        :param bug_rate: target ratio of defective samples
        :param X: the dataset without the label column
        :param Y: the label information
        :return: stores the processed X, Y in self.new_list_SMOTE
        """
        from collections import Counter
        from imblearn.combine import SMOTEENN
        sme = SMOTEENN(ratio=bug_rate)
        x_res, y_res = sme.fit_sample(X, Y)
        import numpy as np
        nx = np.column_stack((x_res, y_res))
        self.new_list_SMOTE = nx
Example 41
	def __init__(self,kind,data,target,verbose = False, ratio = 'auto'):

		assert len(data) == len(target)
		self.data = data
		self.target = target

		if kind in [Undersampling.ClusterCentroids]:
			if verbose: print('> CLUSTER CENTROIDS')

			# Undersampling via Cluster Centroids
			self.undersampler = ClusterCentroids(verbose = verbose, ratio=ratio)
		elif kind in [Undersampling.SMOTEENN]:
			if verbose: print('> SMOTEENN')

			# Undersampling via SMOTEENN
			self.undersampler = SMOTEENN(verbose = verbose, ratio=ratio)
		else:
			raise("Nonexistent undersampling type: "+kind.name)
Example 42
def train_decisiontree_with(configurationname, train_data, k, score_function, undersam=False, oversam=False,
                            export=False):
    assert k > 0
    print("Training with configuration " + configurationname)
    X_train, y_train, id_to_a_train = train_data
    dtc = DecisionTreeClassifier(random_state=0)

    print("Feature Selection")
    # selector = SelectFpr(score_function)
    selector = SelectKBest(score_function, k=k)
    result = selector.fit(X_train, y_train)
    X_train = selector.transform(X_train)

    fitted_ids = [i for i in result.get_support(indices=True)]

    print("Apply Resampling")
    print(Counter(y_train))
    if undersam and not oversam:
        renn = RepeatedEditedNearestNeighbours()
        X_train, y_train = renn.fit_resample(X_train, y_train)
    if oversam and not undersam:
        # feature_indices_array = list(range(len(f_to_id)))
        # smote_nc = SMOTENC(categorical_features=feature_indices_array, random_state=0)
        # X_train, y_train = smote_nc.fit_resample(X_train, y_train)
        sm = SMOTE(random_state=42)
        X_train, y_train = sm.fit_resample(X_train, y_train)
    if oversam and undersam:
        smote_enn = SMOTEENN(random_state=0)
        X_train, y_train = smote_enn.fit_resample(X_train, y_train)
    print(Counter(y_train))

    print("Train Classifier")
    dtc = dtc.fit(X_train, y_train, check_input=True)

    if export:
        export_graphviz(dtc, out_file=DATAP + "/temp/trees/sltree_" + configurationname + ".dot", filled=True)
        transform(fitted_ids, configurationname)

    print("Self Accuracy: " + str(dtc.score(X_train, y_train)))

    return selector, dtc
Example 43
def test_error_wrong_object():
    smote = 'rnd'
    enn = 'rnd'
    smt = SMOTEENN(smote=smote, random_state=RND_SEED)
    with raises(ValueError, match="smote needs to be a SMOTE"):
        smt.fit_resample(X, Y)
    smt = SMOTEENN(enn=enn, random_state=RND_SEED)
    with raises(ValueError, match="enn needs to be an "):
        smt.fit_resample(X, Y)
Example 44
def test_parallelisation():
    # Check if default job count is 1
    smt = SMOTEENN(random_state=RND_SEED)
    smt._validate_estimator()
    assert smt.n_jobs == 1
    assert smt.smote_.n_jobs == 1
    assert smt.enn_.n_jobs == 1

    # Check if job count is set
    smt = SMOTEENN(random_state=RND_SEED, n_jobs=8)
    smt._validate_estimator()
    assert smt.n_jobs == 8
    assert smt.smote_.n_jobs == 8
    assert smt.enn_.n_jobs == 8
Example 45
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, cm[i, j],
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')


# define X and y
X, y = data.loc[:,data.columns != 'state'].values, data.loc[:,data.columns == 'state'].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

# SMOTEENN
sme = SMOTEENN(random_state=42)
os_X,os_y = sme.fit_sample(X_train,y_train)

#QDA
clf_QDA = QuadraticDiscriminantAnalysis(store_covariances=True)
clf_QDA.fit(os_X, os_y)
y_true, y_pred = y_test, clf_QDA.predict(X_test)

# F1_score, precision, recall, specificity, G score
print("F1_score : %.4g" % metrics.f1_score(y_true, y_pred))
print("Recall : %.4g" % metrics.recall_score(y_true, y_pred))
recall = metrics.recall_score(y_true, y_pred)
print("Precision : %.4g" % metrics.precision_score(y_true, y_pred))
 
#Compute confusion matrix
cnf_matrix = confusion_matrix(y_test,y_pred)
Example 46
print(__doc__)

# Generate the dataset
X, y = make_classification(n_classes=2, class_sep=2, weights=[0.1, 0.9],
                           n_informative=3, n_redundant=1, flip_y=0,
                           n_features=20, n_clusters_per_class=1,
                           n_samples=100, random_state=10)

# Instantiate a PCA object for the sake of easy visualisation
pca = PCA(n_components=2)
# Fit and transform x to visualise inside a 2D feature space
X_vis = pca.fit_transform(X)

# Apply SMOTE + ENN
sm = SMOTEENN()
X_resampled, y_resampled = sm.fit_resample(X, y)
X_res_vis = pca.transform(X_resampled)

# Two subplots, unpack the axes array immediately
f, (ax1, ax2) = plt.subplots(1, 2)

c0 = ax1.scatter(X_vis[y == 0, 0], X_vis[y == 0, 1], label="Class #0",
                 alpha=0.5)
c1 = ax1.scatter(X_vis[y == 1, 0], X_vis[y == 1, 1], label="Class #1",
                 alpha=0.5)
ax1.set_title('Original set')

ax2.scatter(X_res_vis[y_resampled == 0, 0], X_res_vis[y_resampled == 0, 1],
            label="Class #0", alpha=0.5)
ax2.scatter(X_res_vis[y_resampled == 1, 0], X_res_vis[y_resampled == 1, 1],
            label="Class #1", alpha=0.5)
ax2.set_title('SMOTE + ENN')
Example 47
        i = n//2
        return (data[i - 1] + data[i])/2

start = time()
n_iter = 100          ## Number of evaluations (SMAC)
n_validations = 7     ## Number of Monte-Carlo Cross-Validations for each model's accuracy evaluated

## Dataset 11

url11 = "https://archive.ics.uci.edu/ml/machine-learning-databases/tic-mld/ticdata2000.txt"
dataset11 = np.genfromtxt(urllib.request.urlopen(url11))  # Python 3: needs `import urllib.request`

X = dataset11[:,0:85]
Y = dataset11[:,85]

sm = SMOTEENN()
X, Y = sm.fit_sample(X, Y)

# We fit the MLP with the hyperparameters given and return the model's median accuracy from 7 trials
def mlp(number_layers, number_neurons_1, number_neurons_2, number_neurons_3, number_neurons_4, dropout_rate):

	layers = []
	number_neurons = []

	number_neurons.append(number_neurons_1)
	number_neurons.append(number_neurons_2)
	number_neurons.append(number_neurons_3)
	number_neurons.append(number_neurons_4)

	for i in np.arange(number_layers):
		layers.append(Layer("Sigmoid", units=number_neurons[i], dropout = dropout_rate))
Example 48
def test_error_wrong_object(smote_params, err_msg):
    smt = SMOTEENN(**smote_params)
    with pytest.raises(ValueError, match=err_msg):
        smt.fit_resample(X, Y)