Example no. 1
def test_error_wrong_object():
    smote = 'rnd'
    enn = 'rnd'
    smt = SMOTEENN(smote=smote, random_state=RND_SEED)
    with raises(ValueError, match="smote needs to be a SMOTE"):
        smt.fit_resample(X, Y)
    smt = SMOTEENN(enn=enn, random_state=RND_SEED)
    with raises(ValueError, match="enn needs to be an "):
        smt.fit_resample(X, Y)
Example no. 2
def get_smotenn(X_trn, y_trn, seed=int(623 * 449)):
    """
    Resamples using SMOTEENN
    """
    SME = SMOTEENN(random_state=seed)
    X_trn, y_trn = SME.fit_resample(X_trn, y_trn)
    return X_trn, y_trn
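Both snippets above assume pre-existing training arrays. For reference, a minimal, self-contained sketch of the same default SMOTEENN flow; the synthetic dataset and seeds below are illustrative, not taken from either example:

from collections import Counter

from imblearn.combine import SMOTEENN
from sklearn.datasets import make_classification

# Illustrative imbalanced dataset (roughly 9:1)
X, y = make_classification(n_samples=1000, weights=[0.9, 0.1],
                           n_informative=3, n_redundant=1, flip_y=0,
                           random_state=0)
print("Before:", Counter(y))
sme = SMOTEENN(random_state=0)         # SMOTE over-samples, then ENN cleans
X_res, y_res = sme.fit_resample(X, y)
print("After: ", Counter(y_res))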
Example no. 3
def smoter(df):
    IDs = df.Quote_ID
    target = df.QuoteConversion_Flag
    data = df.drop(['QuoteConversion_Flag'], axis=1).values
    print("Before SMOTE: ", sorted(Counter(target).items()))

    ####
    # ENN
    ####
    enn = ENN(sampling_strategy="not majority",
              kind_sel="mode",
              n_neighbors=5,
              n_jobs=-1,
              random_state=RANDOM_STATE)
    smote_enn = SMOTEENN(enn=enn, random_state=RANDOM_STATE)
    X_resampled, y_resampled = smote_enn.fit_resample(data, target)
    print("SMOTE ENN: ", sorted(Counter(y_resampled).items()))

    ####
    # Tomeks
    ####
    # smote_tomek = SMOTETomek(random_state=0)
    # X_resampled, y_resampled = smote_tomek.fit_resample(data, target)
    # print("Using SMOTE: ", sorted(Counter(y_resampled).items()))

    data = pd.DataFrame(data=X_resampled, columns=FIELDS)
    target = pd.DataFrame(data=y_resampled, columns=['QuoteConversion_Flag'])

    return data, target
Example no. 4
def split_data_resampling(X, y, test_percentage=0.2):
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_percentage, random_state=42)
    smote_enn = SMOTEENN(random_state=0)
    X_train_resampled, y_train_resampled = smote_enn.fit_resample(
        X_train, y_train)
    return X_train_resampled, y_train_resampled, X_test, y_test
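split_data_resampling resamples only the training split, which is the safe pattern: resampling before the split would leak synthetic copies of test-set neighbourhoods into training. A sketch of the same idea expressed with imblearn's own pipeline, which applies samplers during fit only; the classifier choice is an illustrative assumption:

from imblearn.combine import SMOTEENN
from imblearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression

pipe = Pipeline([
    ("resample", SMOTEENN(random_state=0)),     # runs only when fitting
    ("clf", LogisticRegression(max_iter=1000)),
])
# pipe.fit(X_train, y_train) resamples X_train internally;
# pipe.predict(X_test) never touches the sampler.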
Example no. 5
def resample_dataset(df, feature_list, repo_type):
    num_rows = len(df.index)  # number of rows in <df>
    num_features = len(feature_list)  # number of feature columns to resample
    cur_row = []  # list to hold the current row of <df>
    feat_val_mat = []  # the matrix (list of lists) to hold all feature values
    counter = 0  # counter for progress

    print("\nResampling data for the " + repo_type + " dataset...")
    for idx, row in tqdm(df.iterrows(),
                         desc="\tProgress"):  # loop <num_rows> times
        counter += 1
        # print_progress(counter, num_rows)
        for j in range(num_features):  # loop <num_features> times
            cur_row.append(
                row[feature_list[j]])  # form list of current row values
        feat_val_mat.append(cur_row)  # append <cur_row> to <feat_val_mat>
        cur_row = []

    smote_obj = SMOTEENN(
        sampling_strategy="all", random_state=99
    )  # <smote_obj> should over/under-sample both the "NEUTRAL" and "INSECURE" classes
    resampled_data, resampled_targets = smote_obj.fit_resample(
        feat_val_mat, list(df["SECU_FLAG"]))

    resampled_df = pd.DataFrame(
        resampled_data, columns=feature_list)  # recreate the reduced dataframe
    resampled_df[
        "SECU_FLAG"] = resampled_targets  # re-initialize the "SECU_FLAG" column
    resampled_df["REPO_TYPE"] = [repo_type] * len(
        resampled_df.index)  # re-initialize the "REPO_TYPE" column
    return resampled_df
Example no. 6
def SMOTE_ENN(X_train,
              Y_train,
              seed,
              sampling_strategy,
              k_neighbors_smote=5,
              n_neighbors_enn=3,
              kind_sel='all'):
    enn = EditedNearestNeighbours(random_state=seed,
                                  n_jobs=-1,
                                  n_neighbors=n_neighbors_enn,
                                  kind_sel=kind_sel,
                                  sampling_strategy=sampling_strategy)
    smote = SMOTE(random_state=seed,
                  n_jobs=-1,
                  k_neighbors=k_neighbors_smote,
                  sampling_strategy=sampling_strategy)
    smote_enn = SMOTEENN(random_state=seed,
                         smote=smote,
                         enn=enn,
                         sampling_strategy=sampling_strategy)
    print('Before SMOTE + ENN : ', sorted(Counter(Y_train).items()))
    X_train_resampled, Y_train_resampled = smote_enn.fit_resample(
        X_train, Y_train)
    print('After SMOTE + ENN : ', sorted(Counter(Y_train_resampled).items()))

    X_train_resampled, Y_train_resampled = shuffle_dataset(
        X_train_resampled, Y_train_resampled, seed)

    return X_train_resampled, Y_train_resampled
Example no. 7
    def get_simple_train_test_split(self):
        X_train, X_test, y_train, y_test = train_test_split(
            self.X, self.y, test_size=self.test_size, random_state=self.random_state
        )

        if self.missingvals:
            # Impute missing vals with column mean
            imp = SimpleImputer()
            imp.fit(X_train)
            X_train = imp.transform(X_train)
            X_test = imp.transform(X_test)

        if self.balance:
            # Balance out classes
            # Not needed when we use frequency binning!
            balancer = SMOTEENN(random_state=self.random_state)
            X_train, y_train = balancer.fit_resample(X_train, y_train)

        if self.standardize:
            scaler = StandardScaler()
            scaler.fit(X_train)
            X_train = scaler.transform(X_train)
            X_test = scaler.transform(X_test)

        return X_train, y_train, X_test, y_test
Example no. 8
def over_sampling(x_train, y_train):
    print()
    print("Doing over sampling...")
    print("Before over sampling:")
    class0_num = np.sum(y_train == 0)
    class1_num = np.sum(y_train == 1)
    class2_num = np.sum(y_train == 2)
    print("#Sample in Class 0: {}".format(class0_num))
    print("#Sample in Class 1: {}".format(class1_num))
    print("#Sample in Class 2: {}".format(class2_num))
    # Using SMOTE: https://imbalanced-learn.readthedocs.io/en/stable/generated/imblearn.over_sampling.SMOTE.html
    # an Over-sampling approach
    # Over sampling on training and validation data
    # sm = SMOTE(sampling_strategy='auto', random_state=10)
    # sm = SVMSMOTE(random_state=0)
    sm = SMOTEENN(random_state=0)
    # sm = SMOTETomek(ratio='auto')
    x_train, y_train = sm.fit_resample(x_train, y_train)

    # x_train, y_train = sm.fit_resample(x_train, y_train)
    # X_train, X_val, y_train, y_val = train_test_split(X_train,y,test_size=0.2,random_state=7)
    x_out = x_train
    y_out = y_train

    print("After over sampling:")
    class0_num = np.sum(y_out == 0)
    class1_num = np.sum(y_out == 1)
    class2_num = np.sum(y_out == 2)
    print("#Sample in Class 0: {}".format(class0_num))
    print("#Sample in Class 1: {}".format(class1_num))
    print("#Sample in Class 2: {}".format(class2_num))

    return x_out, y_out
Example no. 9
def unba_smoteenn(x, y):
    x1 = x.reshape(x.shape[0], -1)  # 7259 x 480
    smoteenn = SMOTEENN(random_state=0)  # build the SMOTEENN model object
    x1, y1 = smoteenn.fit_resample(x1, y)  # after augmentation: (n_resampled, 480)
    x2 = np.zeros((x1.shape[0],x.shape[1],x.shape[2],1))
    for i in tqdm(range(x1.shape[0])):
        x2[i,:,:,0] = np.reshape(x1[i],(60,8))
    return x2,y1
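unba_smoteenn flattens each 60x8 sample so SMOTEENN sees a 2-D matrix, then rebuilds the original shape one row at a time. The per-sample loop can be replaced by a single vectorized reshape; a sketch under the shapes given in the comments above:

import numpy as np
from imblearn.combine import SMOTEENN

def unba_smoteenn_vec(x, y):
    n, h, w = x.shape[0], x.shape[1], x.shape[2]    # e.g. 7259, 60, 8
    x_flat = x.reshape(n, -1)                       # (n, 480)
    x_res, y_res = SMOTEENN(random_state=0).fit_resample(x_flat, y)
    return x_res.reshape(-1, h, w, 1), y_res        # (n_resampled, 60, 8, 1)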
Example no. 10
def train_decisiontree_with(configurationname,
                            train_data,
                            k,
                            score_function,
                            undersam=False,
                            oversam=False,
                            export=False,
                            **kwargs):
    assert k > 0
    print("Training with configuration " + configurationname)
    X_train, y_train, id_to_a_train = train_data

    max_depth = None if "max_depth" not in kwargs else kwargs["max_depth"]

    dtc = DecisionTreeClassifier(criterion="entropy",
                                 random_state=0,
                                 max_depth=max_depth)

    print("Feature Selection")
    # selector = SelectFpr(score_function)
    selector = SelectKBest(score_function, k=k)
    selector = selector.fit(X_train, y_train)

    X_train = selector.transform(X_train)

    fitted_ids = [i for i in selector.get_support(indices=True)]

    print("Apply Resampling")
    print(Counter(y_train))
    if undersam and not oversam:
        renn = RepeatedEditedNearestNeighbours()
        X_train, y_train = renn.fit_resample(X_train, y_train)
    if oversam and not undersam:
        # feature_indices_array = list(range(len(f_to_id)))
        # smote_nc = SMOTENC(categorical_features=feature_indices_array, random_state=0)
        # X_train, y_train = smote_nc.fit_resample(X_train, y_train)
        sm = SMOTE(random_state=42)
        X_train, y_train = sm.fit_resample(X_train, y_train)
    if oversam and undersam:
        smote_enn = SMOTEENN(random_state=0)
        X_train, y_train = smote_enn.fit_resample(X_train, y_train)
    print(Counter(y_train))

    print("Train Classifier")
    dtc = dtc.fit(X_train, y_train, check_input=True)

    if export:
        print("Exporting tree to graph...")
        export_graphviz(dtc,
                        out_file=DATAP + "/temp/trees/sltree_" +
                        configurationname + ".dot",
                        filled=True)
        transform(fitted_ids, configurationname)

    print("Self Accuracy: " + str(dtc.score(X_train, y_train)))

    return selector, dtc
Example no. 11
def imbalance_undersampling(datafile):
    df = filling_missing(datafile)
    # combine oversampling and undersampling together with SMOTEENN
    smote_enn = SMOTEENN(random_state=0)
    X_resampled, y_resampled = smote_enn.fit_resample(df[features], df.country_destination)
    print(sorted(Counter(y_resampled).items()))
    back = pd.DataFrame(np.hstack((X_resampled, y_resampled[:, None]))) #[516489 rows x 14 columns]
    # print(back)
    return back
Example no. 12
def balancingClassesSmoteenn(x_train, y_train):

    # Using SMOTEENN to balance our training data points
    smn = SMOTEENN(random_state=7)
    features_balanced, target_balanced = smn.fit_resample(x_train, y_train)

    print("Count for each class value after SMOTEEN:",
          collections.Counter(target_balanced))

    return features_balanced, target_balanced
Example no. 13
def test_validate_estimator_default():
    smt = SMOTEENN(random_state=RND_SEED)
    X_resampled, y_resampled = smt.fit_resample(X, Y)
    X_gt = np.array([[1.52091956, -0.49283504], [0.84976473, -0.15570176],
                     [0.61319159, -0.11571667], [0.66052536, -0.28246518],
                     [-0.28162401, -2.10400981], [0.83680821, 1.72827342],
                     [0.08711622, 0.93259929]])
    y_gt = np.array([0, 0, 0, 0, 1, 1, 1])
    assert_allclose(X_resampled, X_gt, rtol=R_TOL)
    assert_array_equal(y_resampled, y_gt)
Example no. 14
def test_validate_estimator_default():
    smt = SMOTEENN(random_state=RND_SEED)
    X_resampled, y_resampled = smt.fit_resample(X, Y)
    X_gt = np.array([[1.52091956, -0.49283504], [0.84976473, -0.15570176],
                     [0.61319159, -0.11571667], [0.66052536, -0.28246518],
                     [-0.28162401, -2.10400981], [0.83680821, 1.72827342],
                     [0.08711622, 0.93259929]])
    y_gt = np.array([0, 0, 0, 0, 1, 1, 1])
    assert_allclose(X_resampled, X_gt, rtol=R_TOL)
    assert_array_equal(y_resampled, y_gt)
Example no. 15
def test_sample_regular_half():
    sampling_strategy = {0: 10, 1: 12}
    smote = SMOTEENN(
        sampling_strategy=sampling_strategy, random_state=RND_SEED)
    X_resampled, y_resampled = smote.fit_resample(X, Y)

    X_gt = np.array([[1.52091956, -0.49283504], [-0.28162401, -2.10400981],
                     [0.83680821, 1.72827342], [0.08711622, 0.93259929]])
    y_gt = np.array([0, 1, 1, 1])
    assert_allclose(X_resampled, X_gt)
    assert_array_equal(y_resampled, y_gt)
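The dict form of sampling_strategy gives SMOTE explicit per-class target counts, after which ENN still removes points it considers noisy, which is why the final counts above (1 vs 3) are far below the requested 10 and 12. A sketch of the accepted forms; the concrete values are illustrative:

from imblearn.combine import SMOTEENN

sme_auto = SMOTEENN(sampling_strategy="auto")            # resample every class except the majority
sme_ratio = SMOTEENN(sampling_strategy=0.8)              # binary only: minority/majority ratio targeted by SMOTE
sme_counts = SMOTEENN(sampling_strategy={0: 10, 1: 12})  # explicit per-class target counts for SMOTE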
Example no. 16
def smoteenn_sffs_reduction_classify_full():

    (X, Y), feature_names = read_dataset(
        screening='')  # no screening results, only risk factors

    # dataset resampling for imbalanced data compensation
    smoteenn = SMOTEENN()

    Xres, Yres = smoteenn.fit_resample(X, Y)  # resampled dataset
    print('Resampling')
    print('Original dataset size:', Counter(Y))
    print('Resampled dataset size:', Counter(Yres))

    # feature selection using sequential forward floating selection and tuned SVM
    scoring = [
        'accuracy', 'precision', 'recall', 'balanced_accuracy',
        'average_precision', 'brier_score_loss', 'neg_log_loss'
    ]

    param_grid = {'C': np.logspace(-3, 3, 7), 'kernel': ['rbf']}

    grid = GridSearchCV(estimator=SVC(probability=True, gamma='scale'),
                        param_grid=param_grid,
                        n_jobs=-1,
                        verbose=10,
                        cv=5,
                        scoring=scoring,
                        refit='balanced_accuracy',
                        iid=False,
                        error_score=0)

    grid.fit(Xres, Yres)
    print(grid.best_params_)

    selector = SequentialFeatureSelector(
        forward=False,
        floating=True,
        k_features='best',
        verbose=2,
        n_jobs=-1,
        scoring='balanced_accuracy',
        cv=5,
        estimator=SVC(probability=True,
                      gamma='scale',
                      kernel=grid.best_params_['kernel'],
                      C=grid.best_params_['C']))

    selector.fit(Xres, Yres, custom_feature_names=feature_names)

    with open('smoteenn_sbfs.pkl', 'wb') as f:
        pickle.dump(selector, f, -1)

    df = pd.DataFrame(selector.subsets_)
    df.to_csv('smoteenn_sbfs.csv')
Example no. 17
def main():
    PARSER = argparse.ArgumentParser(description="Prediction of subscription")

    PARSER.add_argument(
        "--filename_to_predict",
        "-fp",
        required=True,
        help="File to predict ( only csv supported for now )",
    )

    ARGS = PARSER.parse_args()

    SAVED_FILENAME = f"{ARGS.filename_to_predict}"
    update_progress(0)
    print("progress : Predict subscription")

    dataset_merged = DatasetBuilder(
        filename_bank=stg.FILENAME_BANK,
        filename_socio=stg.FILENAME_SOCIO_ECO).create_dataset()
    X_train = dataset_merged.drop(columns=stg.COL_RAW_SUBSCRIPTION)
    y_train = dataset_merged[stg.COL_RAW_SUBSCRIPTION].values

    update_progress(0.2)
    print("progress : Build Dataset")
    preprocessor_pipeline = PipelineCreator().preprocessor
    X_train_processed = preprocessor_pipeline.fit_transform(X_train)

    update_progress(0.3)
    print("progress : Deal with imbalenced classes")
    smote_enn = SMOTEENN(sampling_strategy=0.8,
                         random_state=stg.RANDOM_STATE,
                         n_jobs=-1)
    X_train, y_train = smote_enn.fit_resample(X_train_processed, y_train)

    update_progress(0.6)
    print("progress : Fit model")
    random_forest_classifier = RandomForestClassifier(**stg.RFC_PARAMS)
    random_forest_classifier.fit(X_train, y_train)

    update_progress(0.9)
    print("progress : Predict")
    X_test = DatasetBuilder(
        filename_bank=SAVED_FILENAME,
        filename_socio=stg.FILENAME_SOCIO_ECO_TEST,
        is_test=True,
    ).create_dataset()
    X_test_transformed = preprocessor_pipeline.transform(X_test)
    predictions = random_forest_classifier.predict(X_test_transformed)
    X_test["PREDICTED_SUBSCRIPTION"] = predictions
    X_test.to_csv(join(stg.PROCESSED_DATA_DIR, "predictions.csv"))
    print("Completed!")
    print(
        "You can find the csv file with the predictions in data/processed/predictions.csv!"
    )
Example no. 18
def test_sample_regular_half():
    sampling_strategy = {0: 10, 1: 12}
    smote = SMOTEENN(sampling_strategy=sampling_strategy,
                     random_state=RND_SEED)
    X_resampled, y_resampled = smote.fit_resample(X, Y)

    X_gt = np.array([[1.52091956, -0.49283504], [-0.28162401, -2.10400981],
                     [0.83680821, 1.72827342], [0.08711622, 0.93259929]])
    y_gt = np.array([0, 1, 1, 1])
    assert_allclose(X_resampled, X_gt)
    assert_array_equal(y_resampled, y_gt)
Example no. 19
def test_validate_estimator_init():
    smote = SMOTE(random_state=RND_SEED)
    enn = EditedNearestNeighbours(sampling_strategy='all')
    smt = SMOTEENN(smote=smote, enn=enn, random_state=RND_SEED)
    X_resampled, y_resampled = smt.fit_resample(X, Y)
    X_gt = np.array([[1.52091956, -0.49283504], [0.84976473, -0.15570176],
                     [0.61319159, -0.11571667], [0.66052536, -0.28246518],
                     [-0.28162401, -2.10400981], [0.83680821, 1.72827342],
                     [0.08711622, 0.93259929]])
    y_gt = np.array([0, 0, 0, 0, 1, 1, 1])
    assert_allclose(X_resampled, X_gt, rtol=R_TOL)
    assert_array_equal(y_resampled, y_gt)
Example no. 20
def test_validate_estimator_init():
    smote = SMOTE(random_state=RND_SEED)
    enn = EditedNearestNeighbours(sampling_strategy='all')
    smt = SMOTEENN(smote=smote, enn=enn, random_state=RND_SEED)
    X_resampled, y_resampled = smt.fit_resample(X, Y)
    X_gt = np.array([[1.52091956, -0.49283504], [0.84976473, -0.15570176],
                     [0.61319159, -0.11571667], [0.66052536, -0.28246518],
                     [-0.28162401, -2.10400981], [0.83680821, 1.72827342],
                     [0.08711622, 0.93259929]])
    y_gt = np.array([0, 0, 0, 0, 1, 1, 1])
    assert_allclose(X_resampled, X_gt, rtol=R_TOL)
    assert_array_equal(y_resampled, y_gt)
Example no. 21
    def prep_data(self, test_ratio, smoteenn, smotomek):
        # split data into train and test
        X_train, X_test, y_train, y_test = train_test_split(
            self.X, self.y, test_size=test_ratio, random_state=4)
        # if smoteenn is true, use SMOTEENN sampling
        if smoteenn:
            sme = SMOTEENN(random_state=1)
            X_train, y_train = sme.fit_resample(X_train, y_train)
        # if smotomek is true, use SMOTETomek sampling
        if smotomek:
            smt = SMOTETomek(random_state=1)
            X_train, y_train = smt.fit_resample(X_train, y_train)
        return X_train, X_test, y_train, y_test
Example no. 22
def smote_enn(X,
              y,
              visualize=False,
              pca2d=True,
              pca3d=True,
              tsne=True,
              pie_evr=True):
    sme = SMOTEENN(random_state=42)
    X_res, y_res = sme.fit_resample(X, y)
    if visualize == True:
        hist_over_and_undersampling(y_res)
        pca_general(X_res, y_res, d2=pca2d, d3=pca3d, pie_evr=pie_evr)
    return X_res, y_res
Example no. 23
class PreProcessor:
    """
    Perform pre-processing
    Import dataframe and apply sklearn transformations
    Vectorize (non-full-text) string fields with TF-IDF
    Normalise numeric values: remove the mean and scale to unit variance
    Instantiate an embedding transformer for message text feature
    Returns a sparse matrix of features
    """
    def __init__(self):
        stopwords = set(corpus.stopwords.words('english'))
        self.mapper = DataFrameMapper([
            (['created_at'], StandardScaler()),
            (['user_created_at'], StandardScaler()),
            (['favorite_count'], StandardScaler()),
            (['retweet_count'], StandardScaler()),
            (['user_followers_count'], StandardScaler()),
            (['user_following_count'], StandardScaler()),
            ('hashtags', TfidfVectorizer(stop_words=stopwords, max_features=1_000)),
            ('urls', TfidfVectorizer(stop_words=stopwords, max_features=1_000)),
            ('user_description', TfidfVectorizer(stop_words=stopwords, max_features=10_000)),
            ('user_location', TfidfVectorizer(stop_words=stopwords, max_features=1_000)),
            ('user_name', TfidfVectorizer(stop_words=stopwords, max_features=1_000)),
            ('user_screen_name', TfidfVectorizer(stop_words=stopwords, max_features=1_000)),
            ('user_profile_urls', TfidfVectorizer(stop_words=stopwords, max_features=1_000)),
            ('full_text', EmbedTransformer())
        ], sparse=True)
        self.svd = TruncatedSVD(algorithm='randomized')
        self.balancer = SMOTEENN(n_jobs=12)

    def transform(self, df):
        labels = label_binarize(df.pop('label'), classes=['none', 'astroturf'])
        return self.mapper.fit_transform(df), labels

    def truncate(self, data_array, components=1000):
        """
        Run feature dimensionality reduction process
        In this case LSA using a randomised sampling methodology (https://arxiv.org/abs/0909.4061)
        Returns a dense array
        """
        self.svd.n_components = components
        return self.svd.fit_transform(data_array)

    def balance(self, data_array, labels):
        """
        Balance classes for training
        Re-sample with SMOTE oversampling (https://arxiv.org/abs/1106.1813)
        & edited nearest-neighbours cleaning of the synthetic data points
        (http://www.inf.ufrgs.br/maslab/pergamus/pubs/balancing-training-data-for.pdf)
        """
        return self.balancer.fit_resample(data_array, labels.ravel())
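transform returns a sparse matrix (the mapper is built with sparse=True), and imblearn samplers accept scipy sparse input, so balance can run without densifying; only truncate produces a dense array. The intended call order is not shown in the source, but a plausible usage sketch would be:

pre = PreProcessor()
features, labels = pre.transform(df)             # sparse features + binarized labels
X_bal, y_bal = pre.balance(features, labels)     # SMOTEENN on sparse input
X_dense = pre.truncate(X_bal, components=1000)   # randomized LSA, dense output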
Example no. 24
def preprocess_data(PARAMS, train_data, train_label, test_data):
    if PARAMS['data_balancing']:
        from imblearn.combine import SMOTEENN
        print('Unbalanced data: ', np.shape(train_data))
        # Over and under sampling
        smote_enn = SMOTEENN(sampling_strategy=1.0)
        train_data, train_label = smote_enn.fit_resample(
            train_data, train_label)
        print('Balanced data: ', np.shape(train_data))

    if PARAMS['scale_data']:
        train_data, test_data = scale_data(train_data, test_data)

    return train_data, train_label, test_data
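sampling_strategy=1.0 here asks SMOTE for fully balanced classes (a 1:1 minority/majority ratio) before ENN cleaning; the float form is only defined for binary targets. A self-contained sketch of the effect, with illustrative data:

from collections import Counter

import numpy as np
from imblearn.combine import SMOTEENN

rng = np.random.RandomState(0)
X = np.vstack([rng.randn(270, 4), rng.randn(30, 4) + 3])  # two separable classes, 9:1
y = np.array([0] * 270 + [1] * 30)
X_res, y_res = SMOTEENN(sampling_strategy=1.0, random_state=0).fit_resample(X, y)
print(Counter(y_res))  # near 1:1; ENN may still trim a few samples from each class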
Example no. 25
def rebalance():
    sm = SMOTEENN()
    train_data.replace(to_replace=np.nan, value=0, inplace=True)
    train_data.replace(to_replace=-np.inf, value=0, inplace=True)
    train_data.replace(to_replace=np.inf, value=0, inplace=True)
    print("rebalance data:", now())
    X_resampled, y_resampled = sm.fit_resample(train_data[features],
                                               train_data[target])
    X_resampled = pd.DataFrame(X_resampled, columns=features)
    y_resampled = pd.DataFrame(y_resampled, columns=target)
    X_resampled['is_trade'] = y_resampled['is_trade']
    del y_resampled
    gc.collect()
    return X_resampled
Example no. 26
    def Smote_ENN(self):
        '''
        First oversamples the minority classes using SMOTE and then cleans
        all the data using ENN.

        Returns
        -------
        None.
        '''
        X_train = self.X_train.copy()
        y_train = self.y_train.copy()
        sme = SMOTEENN(random_state=2020)
        (self.X_train_balanced,
         self.y_train_balanced) = sme.fit_resample(X_train, y_train)
Example no. 27
def train_decisiontree_FPR(configurationname,
                           train_data,
                           score_function,
                           undersam=False,
                           oversam=False,
                           export=False):
    print("Training with configuration " + configurationname)
    X_train, y_train, id_to_a_train = train_data
    dtc = DecisionTreeClassifier(random_state=0)

    print("Feature Selection")
    selector = SelectFpr(score_function)
    result = selector.fit(X_train, y_train)
    X_train = selector.transform(X_train)

    fitted_ids = [i for i in result.get_support(indices=True)]

    print("Apply Resampling")
    print(Counter(y_train))
    if undersam and not oversam:
        renn = RepeatedEditedNearestNeighbours()
        X_train, y_train = renn.fit_resample(X_train, y_train)
    if oversam and not undersam:
        # feature_indices_array = list(range(len(f_to_id)))
        # smote_nc = SMOTENC(categorical_features=feature_indices_array, random_state=0)
        # X_train, y_train = smote_nc.fit_resample(X_train, y_train)
        sm = SMOTE(random_state=42)
        X_train, y_train = sm.fit_resample(X_train, y_train)
    if oversam and undersam:
        smote_enn = SMOTEENN(random_state=0)
        X_train, y_train = smote_enn.fit_resample(X_train, y_train)
    print(Counter(y_train))

    print("Train Classifier")
    dtc = dtc.fit(X_train, y_train, check_input=True)

    # if export:
    print("Exporting decision tree image...")
    export_graphviz(dtc,
                    out_file=DATAP + "/temp/trees/sltree_" +
                    configurationname + ".dot",
                    filled=True)
    transform(fitted_ids)

    print("Self Accuracy: " + str(dtc.score(X_train, y_train)))

    return selector, dtc
Example no. 28
def applying_classifier_to_all_accounts():
    df = pickle.load(open(m4r_data + "us_and_georgia_accounts.p", "rb"))

    trainset = get_full_dataset()
    users = get_full_dataset(df)

    X_users = users[features]

    X_trn = trainset[features]
    y_trn = trainset["class"].replace({"bot": 1, "human": 0})

    SME = SMOTEENN(random_state=2727841)
    X_trn, y_trn = SME.fit_resample(X_trn, y_trn)

    scaling = StandardScaler()
    X_trn = scaling.fit_transform(X_trn)
    X_users = scaling.transform(X_users)

    clf = AdaBoostClassifier(n_estimators=50, random_state=9926737)
    clf.fit(X_trn, y_trn)

    p_users = np.round(clf.predict(X_users))

    users["predicted_class"] = p_users

    users["predicted_class"] = users["predicted_class"].replace({
        0: "human",
        1: "bot"
    })

    print("% Bots (ALL): ", sum(p_users) / len(p_users))

    # adding predicted class column to df
    df = df.merge(users[["user.id", "predicted_class"]],
                  how="left",
                  on="user.id")

    # adding donald trump to users
    d_row = {
        "user.id": 25073877,
        "user.name": "realDonaldTrump",
        "user.screen_name": "realDonaldTrump",
        "user.verified": True,
        "predicted_class": "human"
    }
    df = df.append(d_row, ignore_index=True)
Example no. 29
    def SMOTEENN(self,configFile,data):
        from imblearn.combine import SMOTEENN
        cf = configparser.ConfigParser()
        cf.read(configFile)
        sampling_strategy = str(cf.get("resample","sampling_strategy"))
        random = int(cf.get("data","random"))
        matchObj = re.match( r'.*[^0-9\.].*', sampling_strategy, re.I)
        if not matchObj:
            sampling_strategy = float(sampling_strategy)

        model = SMOTEENN(sampling_strategy=sampling_strategy,random_state=random)
        data['y'].index = [int(x) for x in range(0,len(data['y']))]
        data['x'].index = [int(x) for x in range(0,len(data['x']))]
        X_resampled, y_resampled = model.fit_resample(data['x'], data['y'])
        self.logging_config(u"resample class \n {}".format(y_resampled.value_counts()),"info")
        data_dict = {'x':X_resampled,'y':y_resampled}
        return data_dict
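This method reads its parameters from an INI-style file via configparser; the file itself is not shown, but a config consistent with the section and key names read above (the values here are assumptions) would look like:

[resample]
sampling_strategy = 0.8

[data]
random = 42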
Example no. 30
def run_smoteenn(X, y, sampling_strategy='auto'):
    '''
    INPUT:
        X: a numpy array of predictors
        y: a binary target vector
        sampling_strategy: passed through to SMOTEENN; see the imblearn docs
    OUTPUT:
        smx: predictor array with synthetically oversampled minority examples
        smy: target vector with synthetically oversampled minority examples
    NOTES:
        Takes a predictor numpy array, X, and a binary target vector, y, and
        returns arrays smx and smy, where the minority class has been
        synthetically oversampled using the SMOTE method, then cleaned with ENN.
    '''
    sm = SMOTEENN(sampling_strategy=sampling_strategy, n_jobs=-1, random_state=1)
    smx, smy = sm.fit_resample(X, y)
    return smx, smy
Example no. 31
def train(data, batch_size=10000, test_every=10, max_steps=int(1e6), n_epochs=1, log_file=None, model_path=None):
    """
    Peform training
    :param data: take an opened zipfile
    :param batch_size: size of the bach
    :param test_every: number of test over time. This also define the train test split
    :param max_steps: number of total line to read from the zip file
    :param n_epochs:  number of epochs over a single minibatch
    :param log_file: log file for training
    :return: column name
    """
    columns = next(data)

    feed = feeder(data, batch_size)
    sampler = SMOTEENN()
    if model_path is not None:
        with open(model_path,'rb') as f:
            clf = pkl.load(f)
    else:
        clf = SGDClassifier()

    for global_step in tqdm(range(max_steps)):
        try:
            x_tr, y_tr = next(feed)
            x_tr = minmax_scale(x_tr)
        except StopIteration:
            feed = feeder(data, batch_size)
            continue
        for _ in range(n_epochs):
            try:
                x_tr, y_tr = sampler.fit_resample(x_tr, y_tr)
            except ValueError as e:
                tqdm.write(str(e))
                continue
            clf.fit(x_tr, y_tr)

        if global_step % test_every == 0:
            y_hat = clf.predict(x_tr)
            tqdm.write(classification_report_imbalanced(y_tr, y_hat), file=log_file)
            fname = f"model/clf_{global_step}.pkl"
            with open(fname, 'wb') as f:
                pkl.dump(clf, f)
            tqdm.write(f"File saved as {fname}")
    return columns
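Note that clf.fit inside the minibatch loop refits the SGDClassifier from scratch on every resampled batch, discarding what was learned from earlier batches. For genuinely incremental training, SGDClassifier.partial_fit keeps the weights across batches; a sketch of that variant, assuming binary labels:

import numpy as np
from sklearn.linear_model import SGDClassifier

def incremental_step(clf, x_tr, y_tr, classes=np.array([0, 1])):
    # classes is required on the first partial_fit call and optional afterwards
    clf.partial_fit(x_tr, y_tr, classes=classes)
    return clf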
Example no. 32
def test_sample_regular_pass_smote_enn():
    smote = SMOTEENN(
        smote=SMOTE(sampling_strategy="auto", random_state=RND_SEED),
        enn=EditedNearestNeighbours(sampling_strategy="all"),
        random_state=RND_SEED,
    )
    X_resampled, y_resampled = smote.fit_resample(X, Y)

    X_gt = np.array([
        [1.52091956, -0.49283504],
        [0.84976473, -0.15570176],
        [0.61319159, -0.11571667],
        [0.66052536, -0.28246518],
        [-0.28162401, -2.10400981],
        [0.83680821, 1.72827342],
        [0.08711622, 0.93259929],
    ])
    y_gt = np.array([0, 0, 0, 0, 1, 1, 1])
    assert_allclose(X_resampled, X_gt, rtol=R_TOL)
    assert_array_equal(y_resampled, y_gt)
Example no. 33
def train_decisiontree_with(configurationname, train_data, k, score_function, undersam=False, oversam=False,
                            export=False):
    assert k > 0
    print("Training with configuration " + configurationname)
    X_train, y_train, id_to_a_train = train_data
    dtc = DecisionTreeClassifier(random_state=0)

    print("Feature Selection")
    # selector = SelectFpr(score_function)
    selector = SelectKBest(score_function, k=k)
    result = selector.fit(X_train, y_train)
    X_train = selector.transform(X_train)

    fitted_ids = [i for i in result.get_support(indices=True)]

    print("Apply Resampling")
    print(Counter(y_train))
    if undersam and not oversam:
        renn = RepeatedEditedNearestNeighbours()
        X_train, y_train = renn.fit_resample(X_train, y_train)
    if oversam and not undersam:
        # feature_indices_array = list(range(len(f_to_id)))
        # smote_nc = SMOTENC(categorical_features=feature_indices_array, random_state=0)
        # X_train, y_train = smote_nc.fit_resample(X_train, y_train)
        sm = SMOTE(random_state=42)
        X_train, y_train = sm.fit_resample(X_train, y_train)
    if oversam and undersam:
        smote_enn = SMOTEENN(random_state=0)
        X_train, y_train = smote_enn.fit_resample(X_train, y_train)
    print(Counter(y_train))

    print("Train Classifier")
    dtc = dtc.fit(X_train, y_train, check_input=True)

    if export:
        export_graphviz(dtc, out_file=DATAP + "/temp/trees/sltree_" + configurationname + ".dot", filled=True)
        transform(fitted_ids, configurationname)

    print("Self Accuracy: " + str(dtc.score(X_train, y_train)))

    return selector, dtc
Example no. 34
    def fit_resample(self, X, y):
        """
        Resample the dataset.

        First standardize X, then perform SMOTEENN, then de-standardize
        to return results in the same "units" as input.

        Parameters
        ----------
        X : ndarray
            Dense, feature matrix where rows are observations.
        y : ndarray
            1-D array of responses.

        Returns
        -------
        X_resampled, y_resampled : ndarray, ndarray
            Resampled X and y in the original "units" of X.
        """
        ss = StandardScaler()
        X_std = ss.fit_transform(X)

        sm = SMOTEENN(
            sampling_strategy=self.sampling_strategy_smoteenn,
            random_state=self.random_state,
            smote=SMOTE(
                random_state=self.random_state,
                k_neighbors=self.k_smote,
                sampling_strategy=self.sampling_strategy_smote,
            ),
            enn=ENN(
                sampling_strategy=self.sampling_strategy_enn,
                n_neighbors=self.k_enn,
                kind_sel=self.kind_sel_enn,
            ),
        )

        X_res, y_res = sm.fit_resample(X_std, y)

        return ss.inverse_transform(X_res), y_res
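The method above standardizes before resampling so that SMOTE's nearest-neighbour interpolation is not dominated by large-scale features, then maps the synthetic points back to the original units. The same pattern as a standalone sketch with default components:

from imblearn.combine import SMOTEENN
from sklearn.preprocessing import StandardScaler

def resample_in_original_units(X, y, random_state=0):
    ss = StandardScaler()
    X_std = ss.fit_transform(X)          # k-NN distances on comparable scales
    X_res, y_res = SMOTEENN(random_state=random_state).fit_resample(X_std, y)
    return ss.inverse_transform(X_res), y_res  # synthetic points in original units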
Example no. 35
print(__doc__)

# Generate the dataset
X, y = make_classification(n_classes=2, class_sep=2, weights=[0.1, 0.9],
                           n_informative=3, n_redundant=1, flip_y=0,
                           n_features=20, n_clusters_per_class=1,
                           n_samples=100, random_state=10)

# Instantiate a PCA object for the sake of easy visualisation
pca = PCA(n_components=2)
# Fit and transform X to visualise inside a 2D feature space
X_vis = pca.fit_transform(X)

# Apply SMOTE + ENN
sm = SMOTEENN()
X_resampled, y_resampled = sm.fit_resample(X, y)
X_res_vis = pca.transform(X_resampled)

# Two subplots, unpack the axes array immediately
f, (ax1, ax2) = plt.subplots(1, 2)

c0 = ax1.scatter(X_vis[y == 0, 0], X_vis[y == 0, 1], label="Class #0",
                 alpha=0.5)
c1 = ax1.scatter(X_vis[y == 1, 0], X_vis[y == 1, 1], label="Class #1",
                 alpha=0.5)
ax1.set_title('Original set')

ax2.scatter(X_res_vis[y_resampled == 0, 0], X_res_vis[y_resampled == 0, 1],
            label="Class #0", alpha=0.5)
ax2.scatter(X_res_vis[y_resampled == 1, 0], X_res_vis[y_resampled == 1, 1],
            label="Class #1", alpha=0.5)
Example no. 36
def test_error_wrong_object(smote_params, err_msg):
    smt = SMOTEENN(**smote_params)
    with pytest.raises(ValueError, match=err_msg):
        smt.fit_resample(X, Y)
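This is the parametrized counterpart of Example no. 1; the @pytest.mark.parametrize decorator is not included in the snippet, but a parametrization consistent with the error messages tested in Example no. 1 would plausibly be:

import pytest

@pytest.mark.parametrize(
    "smote_params, err_msg",
    [
        ({"smote": "rnd"}, "smote needs to be a SMOTE"),
        ({"enn": "rnd"}, "enn needs to be an "),
    ],
)
def test_error_wrong_object(smote_params, err_msg):
    ...  # body as above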