def fit(self, X, y):
        """Fit the estimator: HSIC-Lasso feature selection over stratified
        splits, ADASYN oversampling on the selected features, then LightGBM.

        Parameters
        ----------
        X : ndarray of shape (n_samples, n_features)
        y : ndarray of shape (n_samples,)

        Returns
        -------
        self
        """
        sss = StratifiedShuffleSplit(n_splits=self.hsic_splits,
                                     random_state=42)
        idxs = []
        hsics = []
        for train_index, test_index in sss.split(X, y):
            hsic_lasso2 = HSICLasso()
            hsic_lasso2.input(X[train_index], y[train_index])
            hsic_lasso2.classification(self.n_features, B=self.B, M=self.M)
            hsics.append(hsic_lasso2)

            # Not just the best features - pull in their neighbors (similar
            # features) whose similarity score passes the threshold.
            all_ft_idx = np.array(hsic_lasso2.get_index(), dtype=int).ravel()
            for i in range(len(all_ft_idx)):
                idx = np.array(hsic_lasso2.get_index_neighbors(
                    feat_index=i, num_neighbors=10),
                               dtype=int)
                # BUG FIX: scores were cast with dtype=int, truncating the
                # similarity values and defeating the threshold comparison.
                score = np.array(hsic_lasso2.get_index_neighbors_score(
                    feat_index=i, num_neighbors=10),
                                 dtype=float)
                idx = idx[np.where(score > self.neighbor_threshold)[0]]
                all_ft_idx = np.concatenate((all_ft_idx, idx))
            all_ft_idx = np.unique(all_ft_idx)

            idxs.append(all_ft_idx)
            # Keep only the features selected in every split so far.
            if len(idxs) == 1:
                self.hsic_idx_ = idxs[0]
            else:
                self.hsic_idx_ = np.intersect1d(idxs[-1], self.hsic_idx_)
        print("HSIC done.", len(self.hsic_idx_))

        print("Upsampling with ADASYN... (features: " +
              str(len(self.hsic_idx_)) + ")")
        sm = ADASYN(sampling_strategy="minority",
                    n_neighbors=self.adasyn_neighbors,
                    n_jobs=-1)
        sX, sy = X[:, self.hsic_idx_], y
        if self.adasyn_neighbors > 0:
            try:
                sX, sy = sm.fit_resample(X[:, self.hsic_idx_], y)
                # BUG FIX: was len(np.unique(y) - 1), which subtracts 1 from
                # each label and leaves the length unchanged; the intent is
                # one extra resampling pass per remaining minority class.
                for _ in range(len(np.unique(y)) - 1):
                    sX, sy = sm.fit_resample(sX, sy)
            except Exception:
                # Best-effort: ADASYN raises when no samples need generating
                # or a class is too small; fall back to the unresampled data.
                pass
            print("ADASYN done. Starting clf")

        self.clf_ = LGBMClassifier(n_estimators=1000).fit(sX, sy)
        print("done")
        return self
Esempio n. 2
0
def test_ada_fit_resample_nn_obj():
    """ADASYN must accept a pre-built NearestNeighbors estimator and
    reproduce this reference resampled dataset exactly.

    NOTE(review): X, Y, RND_SEED and R_TOL are module-level fixtures defined
    elsewhere in this file.
    """
    nn = NearestNeighbors(n_neighbors=6)
    ada = ADASYN(random_state=RND_SEED, n_neighbors=nn)
    X_resampled, y_resampled = ada.fit_resample(X, Y)
    # Expected output: original samples followed by the synthetic ones.
    X_gt = np.array([
        [0.11622591, -0.0317206],
        [0.77481731, 0.60935141],
        [1.25192108, -0.22367336],
        [0.53366841, -0.30312976],
        [1.52091956, -0.49283504],
        [-0.28162401, -2.10400981],
        [0.83680821, 1.72827342],
        [0.3084254, 0.33299982],
        [0.70472253, -0.73309052],
        [0.28893132, -0.38761769],
        [1.15514042, 0.0129463],
        [0.88407872, 0.35454207],
        [1.31301027, -0.92648734],
        [-1.11515198, -0.93689695],
        [-0.18410027, -0.45194484],
        [0.9281014, 0.53085498],
        [-0.14374509, 0.27370049],
        [-0.41635887, -0.38299653],
        [0.08711622, 0.93259929],
        [1.70580611, -0.11219234],
        [0.88161986, -0.2829741],
        [0.35681689, -0.18814597],
        [1.4148276, 0.05308106],
        [0.3136591, -0.31327875],
    ])
    y_gt = np.array([
        0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0
    ])
    assert_allclose(X_resampled, X_gt, rtol=R_TOL)
    assert_array_equal(y_resampled, y_gt)
def oversampling():
    """Load the document dataset, print original label counts, then
    ADASYN-resample a precomputed feature matrix and print the new counts.

    NOTE(review): `matrix` is loaded from disk and must align row-for-row
    with y_train — confirm the joblib matrix was built from the same split.
    """
    categories = ['Tools', 'Hardware', 'Other', 'Script', 'Software']

    docs_to_train = sklearn.datasets.load_files(
        '/Users/rishabm/Desktop/MergeFileJTOrg/data1',
        description=None,
        categories=categories,
        load_content=True,
        encoding='utf-8')

    X_train, X_test, y_train, y_test = train_test_split(docs_to_train.data,
                                                        docs_to_train.target,
                                                        test_size=0.2)
    ogY = Counter(y_train)
    print('OLD SAMPLES: ')
    for key, value in ogY.items():
        print(key, value)
    matrix = loadmatrix('matrix/2019_Jun_21_14_44_mtrx.joblib')
    adasyn = ADASYN()
    matrix_resampled, y_resampled = adasyn.fit_resample(matrix, y_train)
    # BUG FIX: astype() returns a new array; the result was being discarded.
    y_resampled = y_resampled.astype(int)
    newY = Counter(y_resampled)
    print('NEW SAMPLES: ')
    for key, value in newY.items():
        print(key, value)
	def create_adasyn_samples(self, num_ADASYN, train_p, train_n):
		"""Synthesize minority-class samples with ADASYN and return only them.

		NOTE(review): `num_ADASYN` is never used in this body — confirm
		whether it should bound the number of generated samples.
		Assumes train_p / train_n are DataFrames with samples in COLUMNS
		(features in rows); they are transposed before resampling.
		Assumes ada.fit_resample returns a DataFrame (the `.iloc` below
		requires it) — TODO confirm with the installed imblearn version.
		"""

		# train_x_expanded, train_y_binary = self.pre_process(test_data=False)

		# inos_p_old = train_x_expanded[train_y_binary == 1]
		# inos_n = train_x_expanded[train_y_binary == 0]
		# generate 30% ADASYN samples
		# prepare data to run ADASYN: ADASYN trains on entire original training data
		X = pd.concat((train_p.transpose(), train_n.transpose()), axis=0)
		# create y: 1 for minority (positive) samples, 0 for majority
		y_p = np.ones(train_p.shape[1])
		y_n = np.zeros(train_n.shape[1])
		y = np.concatenate((y_p, y_n))
		# We will generate equal number of minority samples as majority samples
		majority_sample_cnt = train_n.shape[1]
		# ada = ADASYN(sampling_strategy={1: majority_sample_cnt, 0: majority_sample_cnt})
		ada = ADASYN(sampling_strategy=1.0, n_neighbors=10)
		# X contains all data, should be in format of n_samples*n_features
		X_res, y_res = ada.fit_resample(X, y)
		# In X_res, the first segment is original minority class samples, 2nd segment is original majority class samples
		# last segment is synthesized minority samples, we only want the last segment
		num_adasyn_samples_generated = X_res.shape[0] - train_p.shape[1] - train_n.shape[1]
		starting_index = X_res.shape[0] - num_adasyn_samples_generated
		X_adasyn = X_res.iloc[starting_index:X_res.shape[0], :]
		print("debug, X_adasyn shape")
		print(X_adasyn.shape)

		return X_adasyn
Esempio n. 5
0
def adasyn(x, y):
    """Oversample the minority class to parity with ADASYN.

    The neighbor count is scaled to 1% of the positive-class count
    (rounded up), so small datasets get a correspondingly small k.
    """
    # Adaptive Synthetic
    k = math.ceil(sum(y) * 0.01)
    sampler = ADASYN(sampling_strategy=1, n_neighbors=k)
    return sampler.fit_resample(x, y)
def create_train_and_test_sets(training_data_json: JSONType) -> List[np.ndarray]:
    """Build train/test sets from a JSON payload: split, PCA to 500 dims,
    ADASYN-balance the training split, then standardize and L2-normalize.

    Returns [pca, sc, x_test_std, x_train_std, y_test, y_train] so callers
    can reuse the fitted PCA and scaler on new data.
    NOTE(review): the scaler is fitted on the *resampled* training data,
    so synthetic samples influence the standardization — confirm intended.
    """
    # print("create_train_and_test_sets")
    data_as_list_of_dicts = json.loads(training_data_json)

    x, y = split_data_to_x_and_y(data_as_list_of_dicts)

    # x = df_one_hot_vectors.iloc[:, 3:].values
    # y = df_one_hot_vectors.iloc[:, :1].values.ravel()  # there should probably be 3 columns here

    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=1, stratify=y)

    # Label counts in y / y_train / y_test (printed messages are in Polish).
    print('Liczba etykiet w zbiorze y:', np.bincount(y))
    print('Liczba etykiet w zbiorze y_train:', np.bincount(y_train))
    print('Liczba etykiet w zbiorze y_test:', np.bincount(y_test))

    # Dimensionality reduction fitted on train only, applied to test.
    pca = PCA(n_components=500, random_state=1)
    x_train = pca.fit_transform(x_train)
    x_test = pca.transform(x_test)

    # Oversample the training split only (test stays untouched).
    adasyn = ADASYN(random_state=1)
    x_train, y_train = adasyn.fit_resample(x_train, y_train)

    sc = StandardScaler()
    sc.fit(x_train)
    x_train_std = sc.transform(x_train)
    x_train_std = normalize(x_train_std, norm='l2')
    x_test_std = sc.transform(x_test)
    x_test_std = normalize(x_test_std, norm='l2')

    return [pca, sc, x_test_std, x_train_std, y_test, y_train]
def boost_select(data, target, n_feat):
    """Select up to n_feat features via XGBoost importances after ADASYN
    oversampling of the training split.

    Parameters
    ----------
    data : feature matrix
    target : labels
    n_feat : maximum number of features to keep

    Returns
    -------
    ndarray : `data` reduced to the selected feature columns
    """
    import xgboost as xgb
    import sklearn as sk

    X_train, X_test, y_train, y_test = sk.model_selection.train_test_split(
        data, target, shuffle=True, test_size=0.33)
    # Oversampling: balance the minority class in the training split only.
    from imblearn.over_sampling import ADASYN

    ada = ADASYN(sampling_strategy='minority')
    X_train_, y_train_ = ada.fit_resample(X_train, y_train)
    print(X_train_.shape)

    from sklearn.feature_selection import SelectFromModel

    param = {
        'max_depth': 9,
        'eta': 0.7,
        'silent': 1,
        'objective': 'binary:logistic',
        'booster': 'dart',
        'nthread': 4,
        'eval_metric': 'auc',
    }

    # BUG FIX: XGBClassifier has no `params` keyword, so the settings dict
    # was previously not applied; expand it so the settings take effect.
    model = xgb.XGBClassifier(**param)
    model.fit(X_train_, y_train_)

    selection = SelectFromModel(model, max_features=n_feat, prefit=True)
    new_data = selection.transform(data)
    return new_data
def boost(X_train, X_test, y_train, y_test):
    """Train an XGBoost classifier on an ADASYN-balanced training set and
    return predictions for X_test.

    NOTE(review): y_test is unused here — kept for signature compatibility.
    """
    ada = ADASYN(sampling_strategy='minority')
    X_train_, y_train_ = ada.fit_resample(X_train, y_train)
    print(X_train_.shape)

    param = {
        'max_depth': 9,
        'eta': 0.7,
        'silent': 1,
        'objective': 'binary:logistic',
        'booster': 'dart',
        'nthread': 4,
        'eval_metric': 'auc',
    }

    # BUG FIX: XGBClassifier has no `params` keyword, so the settings dict
    # was previously not applied; expand it so the settings take effect.
    model = xgb.XGBClassifier(**param)
    model.fit(X_train_, y_train_)

    y_pred = model.predict(X_test)

    return y_pred
Esempio n. 9
0
def load_dataset(method="randomover"):
    """Load Data.csv, split, scale, and rebalance the training split.

    Parameters
    ----------
    method : one of "randomover", "smote", "adasyn", "randomunder"

    Returns
    -------
    X_resampled, Y_resampled, X_test, Y_test

    Raises
    ------
    ValueError : if `method` is not a recognized resampling strategy
    """
    dataset = pd.read_csv("Pretreatment Data/Data.csv", encoding="GBK")

    Y_orig = np.array(dataset["Y"])
    X_orig = dataset.drop(["Y"], axis=1)
    X_orig = np.array(X_orig)
    X_train, X_test, Y_train, Y_test = train_test_split(X_orig,
                                                        Y_orig,
                                                        test_size=0.25,
                                                        random_state=0)

    # NOTE(review): the test set is scaled with its own statistics rather
    # than the training scaler — kept as-is, but verify this is intended.
    X_train = StandardScaler().fit_transform(X_train)
    X_test = StandardScaler().fit_transform(X_test)

    # Pick the resampler (over- or under-sampling) for the training split.
    if method == "randomover":
        sampler = RandomOverSampler(random_state=0)
    elif method == "smote":
        sampler = SMOTE(random_state=0)
    elif method == "adasyn":
        sampler = ADASYN(random_state=0)
    elif method == "randomunder":
        sampler = RandomUnderSampler(random_state=0)
    else:
        # BUG FIX: an unknown method previously fell through to a NameError
        # on the undefined X_resampled; fail with a clear message instead.
        raise ValueError("unknown resampling method: %r" % method)

    # fit_resample replaces the deprecated fit_sample API used previously.
    X_resampled, Y_resampled = sampler.fit_resample(X_train, Y_train)

    return X_resampled, Y_resampled, X_test, Y_test
Esempio n. 10
0
def unba_adasyn(x, y):
    """Oversample the minority class of image-like data.

    Input x has shape (N, H, W); output has shape (N', H, W, 1) with the
    channel axis appended.

    BUG FIX / generalization: the per-sample reshape was hard-coded to
    (60, 8); it now uses x's own (H, W), so any input shape works and
    (N, 60, 8) inputs behave exactly as before.  The per-sample Python
    loop is replaced by a single vectorized reshape.
    """
    h, w = x.shape[1], x.shape[2]
    x_flat = x.reshape(x.shape[0], -1)          # (N, H*W)
    adasyn = ADASYN(sampling_strategy='minority')
    x_res, y_res = adasyn.fit_resample(x_flat, y)
    x_out = np.asarray(x_res).reshape(-1, h, w, 1)
    return x_out, y_res
Esempio n. 11
0
def balance_df(x_train, y_train, neighborgs):
    """Oversample every non-majority class with ADASYN (fixed seed 42).

    `neighborgs` is the ADASYN n_neighbors parameter.
    """
    sampler = ADASYN(
        sampling_strategy='not majority',
        n_neighbors=neighborgs,
        random_state=42,
    )
    return sampler.fit_resample(x_train, y_train)
Esempio n. 12
0
def select_ratio_ADASYN(df, ratio, features=["stanford_polite", "perspective_score"]):
    """Resample df with ADASYN so class 0 has len(df) rows and class 1
    has len(df) // ratio rows, judged on the 'toxic' column.

    Returns a new DataFrame with the given feature columns plus 'toxic'.
    """
    targets = {0: int(len(df)), 1: int(len(df) // ratio)}
    sampler = ADASYN(sampling_strategy=targets)
    labels = list(df["toxic"])
    resampled, new_labels = sampler.fit_resample(df[features], labels)
    out = pd.DataFrame(resampled, columns=features)
    out['toxic'] = new_labels
    return out
Esempio n. 13
0
def boost_select(data, target):
    """Select up to 60 features via XGBoost importances after ADASYN
    oversampling of the training split.

    Parameters
    ----------
    data : feature matrix
    target : labels

    Returns
    -------
    ndarray : `data` reduced to the selected feature columns
    """
    import xgboost as xgb
    import sklearn as sk

    X_train, X_test, y_train, y_test = sk.model_selection.train_test_split(
        data, target, shuffle=True, test_size=0.33)
    # Oversampling: balance the minority class in the training split only.
    from imblearn.over_sampling import ADASYN

    ada = ADASYN(sampling_strategy='minority')
    X_train_, y_train_ = ada.fit_resample(X_train, y_train)
    print(X_train_.shape)

    from sklearn.feature_selection import SelectFromModel

    param = {
        'max_depth': 9,
        'eta': 0.7,
        'silent': 1,
        'objective': 'binary:logistic',
        'booster': 'dart',
        'nthread': 4,
        'eval_metric': 'auc',
    }

    # BUG FIX: XGBClassifier has no `params` keyword, so the settings dict
    # was previously not applied; expand it so the settings take effect.
    model = xgb.XGBClassifier(**param)
    model.fit(X_train_, y_train_)

    # max_features=60 came from a threshold-sweep experiment over
    # model.feature_importances_ (the sweep code lived here as commented-out
    # code in the original; see version history if it needs re-running).
    selection = SelectFromModel(model, max_features=60, prefit=True)
    new_data = selection.transform(data)
    return new_data
Esempio n. 14
0
def test_ada_fit_resample():
    """ADASYN with the fixed seed must reproduce this reference resampled
    dataset exactly (original samples followed by synthetic ones).

    NOTE(review): X, Y, RND_SEED and R_TOL are module-level fixtures defined
    elsewhere in this file.
    """
    ada = ADASYN(random_state=RND_SEED)
    X_resampled, y_resampled = ada.fit_resample(X, Y)
    X_gt = np.array([
        [0.11622591, -0.0317206],
        [0.77481731, 0.60935141],
        [1.25192108, -0.22367336],
        [0.53366841, -0.30312976],
        [1.52091956, -0.49283504],
        [-0.28162401, -2.10400981],
        [0.83680821, 1.72827342],
        [0.3084254, 0.33299982],
        [0.70472253, -0.73309052],
        [0.28893132, -0.38761769],
        [1.15514042, 0.0129463],
        [0.88407872, 0.35454207],
        [1.31301027, -0.92648734],
        [-1.11515198, -0.93689695],
        [-0.18410027, -0.45194484],
        [0.9281014, 0.53085498],
        [-0.14374509, 0.27370049],
        [-0.41635887, -0.38299653],
        [0.08711622, 0.93259929],
        [1.70580611, -0.11219234],
        [0.94899098, -0.30508981],
        [0.28204936, -0.13953426],
        [1.58028868, -0.04089947],
        [0.66117333, -0.28009063],
    ])
    y_gt = np.array([
        0,
        1,
        0,
        0,
        0,
        1,
        1,
        1,
        1,
        1,
        1,
        0,
        0,
        1,
        1,
        1,
        1,
        0,
        1,
        0,
        0,
        0,
        0,
        0,
    ])
    assert_allclose(X_resampled, X_gt, rtol=R_TOL)
    assert_array_equal(y_resampled, y_gt)
Esempio n. 15
0
def run(X_train, y_train):
    """ADASYN-oversample every class that is not the majority and return
    the resampled (X, y) pair, printing a banner on the way."""
    sampler = ADASYN(sampling_strategy='not majority')
    X_res, y_res = sampler.fit_resample(X_train, y_train)
    banner = "######################"
    print(banner)
    print("ADASYN")
    print(banner)
    print("\n")
    return X_res, y_res
Esempio n. 16
0
    def over_sample(self,
                    method="BorderLine",
                    sampling_strategy="minority",
                    random_state=42,
                    k_neighbors=5,
                    n_neighbors=10,
                    kind="borderline-1"):
        """
        Oversampling method: balance self._df with the chosen strategy.

        :param method: str, option: ADASYN, BorderLine, KMeans, Random, SVM
        :param sampling_strategy: str or dict, option: 'minority','not majority','all','auto', {1:n,0:m}
        :param random_state: int
        :param k_neighbors: int, neighbor count for SMOTE variants
        :param n_neighbors: int, m_neighbors for BorderLine/SVM variants
        :param kind: str, borderline-1 or borderline-2
        :return: a new DataFrame with features plus the target column; the
                 original self._df is returned unchanged for unknown methods
        """
        # All columns except 'id' and the target are treated as features.
        feature_name = self._df.columns.difference(["id",
                                                    self._target]).tolist()
        X = self._df[feature_name].values
        y = self._df[self._target].values

        print("Original label shape {}".format(Counter(y)))

        if method == "ADASYN":
            # NOTE(review): ADASYN's n_neighbors is fed from k_neighbors
            # (not the n_neighbors parameter) — confirm this mapping is
            # intentional.
            overSm = ADASYN(sampling_strategy=sampling_strategy,
                            random_state=random_state,
                            n_neighbors=k_neighbors)
        elif method == "BorderLine":
            overSm = BorderlineSMOTE(sampling_strategy=sampling_strategy,
                                     random_state=random_state,
                                     k_neighbors=k_neighbors,
                                     m_neighbors=n_neighbors,
                                     kind=kind)
        elif method == "KMeans":
            overSm = KMeansSMOTE(sampling_strategy=sampling_strategy,
                                 random_state=random_state,
                                 k_neighbors=k_neighbors)
        elif method == "Random":
            overSm = RandomOverSampler(sampling_strategy=sampling_strategy,
                                       random_state=random_state)
        elif method == "SVM":
            overSm = SVMSMOTE(sampling_strategy=sampling_strategy,
                              random_state=random_state,
                              k_neighbors=k_neighbors,
                              m_neighbors=n_neighbors,
                              out_step=0.5)
        else:
            # Unsupported sampling method: warn (message is user-facing
            # Chinese text) and return the data unchanged.
            print("不支持{}该抽样方法".format(method))
            return self._df

        X_res, y_res = overSm.fit_resample(X, y)
        print("overSample label shape {}".format(Counter(y_res)))
        # Reattach the resampled target as the last column.
        _data = np.concatenate([X_res, y_res.reshape(len(X_res), 1)], axis=1)
        df_new = pd.DataFrame(data=_data,
                              columns=feature_name + [self._target])
        return df_new
    def classify(self, X, type: str, classifier: str, test_prop: float, res: bool = False, res_method: str = None):
        """Train and evaluate a classifier on self.df['class'].

        Parameters
        ----------
        X : feature matrix
        type : 'binary' (zeros in 'class' replaced by 1) or 'multi'
        classifier : 'lr', 'svc', 'rf', 'xgb' or 'ada'
        test_prop : test-set fraction for the stratified split
        res : resample the training split when True (now defaults to False;
              previously this argument was required with only a `None`
              annotation, which was almost certainly meant to be a default)
        res_method : 'up' (ADASYN) or 'down' (NearMiss); required when res

        Returns
        -------
        the fitted model (metrics and plots are printed as side effects)

        Raises
        ------
        TypeError : on an invalid type, classifier, or resampling method
        """
        if type == 'binary':
            y = self.df['class'].replace(0, 1)
        elif type == 'multi':
            y = self.df['class']
        else:
            raise TypeError("Choose a proper type of classification")

        X_train, X_test, Y_train, Y_test = train_test_split(X, y, test_size=test_prop, stratify=y)

        if res == True:
            if res_method == 'down':
                nm = NearMiss()
                X_res, Y_res = nm.fit_resample(X_train, Y_train)
            elif res_method == 'up':
                sm = ADASYN()
                X_res, Y_res = sm.fit_resample(X_train, Y_train)
            else:
                raise TypeError("Resampling method not provided. Please use 'up' for oversampling or 'down' for undersampling.")

        if classifier == 'lr':
            model = LogisticRegression(solver='liblinear', class_weight='balanced', C=0.04, penalty='l2')
        elif classifier == 'svc':
            model = LinearSVC(C=0.004, penalty='l2')
        elif classifier == 'rf':
            n_est = int(input("Type in number of trees to estimate from: ").strip())
            model = RandomForestClassifier(n_estimators=n_est, bootstrap=True, max_depth=5)
        elif classifier == 'xgb':
            n_est = int(input("Type in number of trees to estimate from: ").strip())
            # BUG FIX: 'reg_lamba' was a typo that XGBoost silently ignored;
            # the L2 regularization term now actually applies.
            model = XGBClassifier(n_estimators=n_est, bootstrap=True, max_depth=5, reg_lambda=0.4)
        elif classifier == 'ada':
            n_est = int(input("Type in number of trees to estimate from: ").strip())
            model = AdaBoostClassifier(n_estimators=n_est, learning_rate=0.005)
        else:
            raise TypeError("Choose a proper classifier. Possible inputs: 'lr', 'svc', 'rf', 'xgb', 'ada' .")

        if res == True:
            model.fit(X_res, Y_res)
        else:
            model.fit(X_train, Y_train)

        Y_pred = model.predict(X_test)

        # Accuracy Percentage
        print(f"Accuracy is {round(accuracy_score(Y_test, Y_pred), 2)*100}%")

        # Classification Report
        print(classification_report(Y_pred, Y_test))

        # Matthew's Correlation Coefficient
        print(f"Matthew's Correlation Coefficient is {matthews_corrcoef(Y_test, Y_pred)}")

        # Plots of Confusion Matrix and ROC Curve
        plot_confusion_matrix(Y_test, Y_pred, figsize=(10,10)) 

        return model
def export_csv(name_dataset, data, labels, name_method):
    """ADASYN-resample (data, labels) and write both the original and the
    resampled versions to CSV, split into an A part (first nb_att_1
    columns) and a B part (the rest).

    NOTE(review): nb_att_1 and nb_att_2 are module-level globals defined
    elsewhere in this file.  Assumes `data` and `labels` are pandas objects
    and that fit_resample returns pandas objects too (the .to_csv/.iloc
    calls below require it) — TODO confirm imblearn version.
    """
    # over-sampling
    resampler = ADASYN(random_state=42)
    data_resampled, labels_resampled = resampler.fit_resample(data, labels)

    carpet = "../MO2P/data/all/"

    print("generate")
    # Output paths for the ORIGINAL data: full matrix, A/B column splits,
    # and the label files for each split.
    full_data = carpet + "./" + name_dataset + "_data_train_" + \
        name_method + "_" + str(nb_att_1) + "_" + str(nb_att_2) + ".csv"
    data_A = carpet + "./" + name_dataset + "_data_train_" + \
        name_method + "_A_" + str(nb_att_1) + "_" + str(nb_att_2) + ".csv"
    data_B = carpet + "./" + name_dataset + "_data_train_" + \
        name_method + "_B_" + str(nb_att_1) + "_" + str(nb_att_2) + ".csv"
    labels_A = carpet + "./" + name_dataset + "_truth_train_" + \
        name_method + "_A_" + str(nb_att_1) + "_" + str(nb_att_2) + ".csv"
    labels_B = carpet + "./" + name_dataset + "_truth_train_" + \
        name_method + "_B_" + str(nb_att_1) + "_" + str(nb_att_2) + ".csv"

    # Output paths for the RESAMPLED data (same layout, "_ADASYN_" marker).
    full_data_resampled = carpet + "./" + name_dataset + "_data_train_" + \
        name_method + "_ADASYN_" + str(nb_att_1) + "_" + str(nb_att_2) + ".csv"
    data_A_resampled = carpet + "./" + name_dataset + "_data_train_" + \
        name_method + "_A_ADASYN_" + \
        str(nb_att_1) + "_" + str(nb_att_2) + ".csv"
    data_B_resampled = carpet + "./" + name_dataset + "_data_train_" + \
        name_method + "_B_ADASYN_" + \
        str(nb_att_1) + "_" + str(nb_att_2) + ".csv"
    labels_A_resampled = carpet + "./" + name_dataset + "_truth_train_" + \
        name_method + "_A_ADASYN_" + \
        str(nb_att_1) + "_" + str(nb_att_2) + ".csv"
    labels_B_resampled = carpet + "./" + name_dataset + "_truth_train_" + \
        name_method + "_B_ADASYN_" + \
        str(nb_att_1) + "_" + str(nb_att_2) + ".csv"

    print(full_data)
    print(data_A)
    print(data_B)
    print(labels_A)
    print(labels_B)

    print(full_data_resampled)
    print(data_A_resampled)
    print(data_B_resampled)
    print(labels_A_resampled)
    print(labels_B_resampled)

    # Write resampled data: full, A (first nb_att_1 cols), B (the rest);
    # the same label file is written for both A and B.
    data_resampled.to_csv(full_data_resampled, index=False)
    data_resampled.iloc[:, :nb_att_1].to_csv(data_A_resampled, index=False)
    data_resampled.iloc[:, nb_att_1:].to_csv(data_B_resampled, index=False)
    labels_resampled.to_csv(labels_A_resampled, index=False)
    labels_resampled.to_csv(labels_B_resampled, index=False)

    # Write the original (unresampled) data with the same layout.
    data.to_csv(full_data, index=False)
    data.iloc[:, :nb_att_1].to_csv(data_A, index=False)
    data.iloc[:, nb_att_1:].to_csv(data_B, index=False)
    labels.to_csv(labels_A, index=False)
    labels.to_csv(labels_B, index=False)
Esempio n. 19
0
def cross_val_score_weighted(model,
                             X,
                             y,
                             weights,
                             cv=2,
                             metrics=[
                                 sklearn.metrics.accuracy_score,
                                 sklearn.metrics.precision_score,
                                 sklearn.metrics.recall_score
                             ]):
    """Stratified CV with per-fold standardization and ADASYN oversampling.

    Parameters
    ----------
    model : sklearn-style estimator supporting fit(..., class_weight=...)
    X, y : arrays indexable by fold indices
    weights : class weights forwarded to model.fit
    cv : number of stratified folds
    metrics : y_true/y_pred metric callables (note: the shared default list
              is never mutated, so the mutable-default pitfall does not bite)

    Returns
    -------
    list of per-fold score lists, one per metric, plus ROC AUC as the last.
    """
    from sklearn.model_selection import StratifiedKFold
    from imblearn.over_sampling import ADASYN
    from sklearn.preprocessing import StandardScaler

    # Split data
    kf = StratifiedKFold(n_splits=cv, shuffle=True)
    kf.get_n_splits(X, y)
    scores = [[] for metric in metrics]
    scores.append([])  # extra slot for ROC AUC

    for train_index, test_index in kf.split(X, y):

        scaler = StandardScaler().fit(X[train_index])

        model_clone = sklearn.base.clone(model)

        X_test, y_test = scaler.transform(X[test_index]), y[test_index]
        X_train, y_train = scaler.transform(X[train_index]), y[train_index]

        # Oversample the *scaled* training fold.
        # BUG FIX: previously the raw X[train_index] was resampled, which
        # silently discarded the standardization computed just above.
        print("Oversampling\n")
        ada = ADASYN(sampling_strategy='minority')
        X_train, y_train = ada.fit_resample(X_train, y_train)
        print(X_train.shape)

        model_clone.fit(X_train, y_train, class_weight=weights)
        y_pred = model_clone.predict(X_test)
        for i, metric in enumerate(metrics):
            score = metric(y_test, y_pred)
            scores[i].append(score)
            print(i)

        # BUG FIX: the clone was needlessly refit on identical data before
        # predict_proba; reuse the already-fitted model.
        y_pred_prob = model_clone.predict_proba(X_test)[:, 1]
        score = sklearn.metrics.roc_auc_score(y_test, y_pred_prob)
        scores[-1].append(score)  # was hard-coded scores[3]

    return scores
Esempio n. 20
0
def expand(x: np.ndarray, y: np.ndarray):
    """
    Enlarge the minority class using ADASYN until the classes are balanced.

    Generalized: the hard-coded (600, 3) per-sample shape is now taken from
    x itself, so any (N, L, C) input works; (N, 600, 3) inputs behave
    exactly as before.

    :param x: array of shape (N, L, C)
    :param y: labels; flattened to 1-D for resampling
    :return: (resampled x of shape (N', L, C),
              resampled y as an (N', 1) column vector,
              number of synthesized samples)
    """
    n_samples, length, channels = x.shape
    ada = ADASYN(sampling_strategy=1)
    x_res, y_res = ada.fit_resample(x.reshape(-1, length * channels),
                                    y.reshape(-1))
    x_out = x_res.reshape(-1, length, channels)
    return x_out, y_res.reshape(-1, 1), len(x_out) - n_samples
Esempio n. 21
0
    def upsampleData(self, df):
        """ADASYN-oversample df's minority class and return a DataFrame with
        the 'Response' column re-attached.

        BUG FIX: a RandomOverSampler pass whose result was immediately
        overwritten by the ADASYN pass has been removed (dead code).
        """
        ada = ADASYN(random_state=42)
        x_train_sampled, y_train_sampled = ada.fit_resample(df.drop('Response', axis=1), df['Response'])

        x_train_sampled['Response'] = y_train_sampled
        print(len(x_train_sampled))
        return x_train_sampled
Esempio n. 22
0
 def Adasyn_sampling(self):
     """ADASYN-oversample self.training (target column 'Response') in place,
     recording the step name in self.report.

     NOTE(review): the result is rounded to 2 decimals, which also rounds
     the original (non-synthetic) rows — confirm this is intended.
     """
     ds = self.training.copy()
     self.report.append('Adasyn_sampling')
     Y = ds["Response"]
     X = ds.drop(columns=["Response"])
     ada = ADASYN(random_state=self.seed)
     X_res, Y_res = ada.fit_resample(X, Y)
     sampled_ds = pd.DataFrame(X_res)
     sampled_ds['Response'] = Y_res
     # sampled_ds.index=ds.index
     # restore the original column names (order must match ds)
     sampled_ds.columns = ds.columns
     self.training = round(sampled_ds, 2)
class ADASYNStep():
    def __init__(self, kwargs=None):
        """ Adaptive Synthetic Over-Sampling Technique (ADASYN) to create balanced samples. Uses imblearn's ADASYN class.

        Parameters
        ----------
        kwargs (dict, default=None) : arguments to pass to imblearn ADASYN class.
            BUG FIX: the default was a shared mutable dict ({}); None is now
            normalized to a fresh dict per instance.

        """
        self.description = "ADASYN Data Augmentation"
        self.kwargs = {} if kwargs is None else kwargs
        self.fitted = None                 # set by fit(); guards transform()
        self.changes_num_samples = True    # resampling adds rows

    def fit(self, X, y=None):
        """ Fits the ADASYN to given data

        Parameters
        ----------
        X (DataFrame) : the data to fit

        y (DataFrame) : the labels for X

        Returns
        -------
        (DataFrame, DataFrame) : a tuple of the transformed DataFrames, the first being the X data and the second being the y data
        """
        if y is None:
            # ADASYN is supervised; refuse to fit without labels.
            print(f"{self.description} step is supervised and needs target values")
            raise ValueError
        self.fitted = ADASYN(**self.kwargs)
        return self.transform(X, y=y)

    def transform(self, X, y=None):
        """ Transforms the given data using previously fitted ADASYN

        Parameters
        ----------
        X (DataFrame) : the data to fit

        y (DataFrame) : the labels for X

        Returns
        -------
        (DataFrame, DataFrame) : a tuple of the transformed DataFrames, the first being the X data and the second being the y data
        """
        if self.fitted is None:
            # fit() must run first; TransformError is a project exception.
            raise TransformError

        X_rs, y_rs = self.fitted.fit_resample(X, y)
        # Re-wrap as pandas objects, preserving column/series names.
        X_rs = pd.DataFrame(X_rs, columns=X.columns)
        y_rs = pd.Series(y_rs, name=y.name)
        return X_rs, y_rs
Esempio n. 24
0
def oversample_ADASYN(df, debug=True):
    """Balance a DataFrame whose last column is an integer label using
    ADASYN; returns a new resampled DataFrame with the same columns."""
    features = df.values[:, :-1]
    labels = df.values[:, -1].astype(int)
    if debug:
        print('Original dataset shape %s' % Counter(labels))
    resampler = ADASYN(random_state=0)
    feat_res, lab_res = resampler.fit_resample(features, labels)
    out = pd.DataFrame(feat_res, columns=df.columns[:-1])
    out.insert(len(out.columns), df.columns[-1], lab_res)
    if debug:
        print('Resampled dataset shape %s' % Counter(lab_res))
    return out
Esempio n. 25
0
def adasyn(X,
           y,
           visualize=False,
           pca2d=True,
           pca3d=True,
           tsne=True,
           pie_evr=True):
    """ADASYN-resample (X, y) with a fixed seed; optionally plot the class
    histogram and PCA projections of the resampled data."""
    sampler = ADASYN(random_state=42)
    X_res, y_res = sampler.fit_resample(X, y)
    if visualize == True:
        hist_over_and_undersampling(y_res)
        pca_general(X_res, y_res, d2=pca2d, d3=pca3d, pie_evr=pie_evr)
    return X_res, y_res
Esempio n. 26
0
 def imbalanceProcess(self, method='None'):
     """Rebalance self.x_train / self.y_train in place with the chosen
     strategy; the default 'None' matches no branch and leaves the data
     untouched.

     :param method: 'RandomOverSample', 'ADASYN', 'SMOTE', or 'None'
     """
     if method == 'RandomOverSample':
         ros = RandomOverSampler(random_state=999)
         self.x_train, self.y_train = ros.fit_resample(
             self.x_train, self.y_train)
     if method == 'ADASYN':
         ada = ADASYN(random_state=999)
         self.x_train, self.y_train = ada.fit_resample(
             self.x_train, self.y_train)
     if method == 'SMOTE':
         sm = SMOTE(random_state=999)
         self.x_train, self.y_train = sm.fit_resample(
             self.x_train, self.y_train)
Esempio n. 27
0
def adaptive_synthetic_sampling_func(train_x, train_y, target):
    """ADASYN-oversample (train_x, train_y), logging the class distribution
    before and after.

    :param train_x: training features
    :param train_y: labels; `target` names the column whose counts are logged
    :param target: label column name in train_y
    :return: (resampled train_x, resampled train_y), or implicitly None if
             resampling fails

    NOTE(review): on failure the error is logged and the function falls
    through to an implicit None return — callers must handle a None result;
    confirm this best-effort behavior is intended.
    """
    try:
        logger.info(
            f"counter before ADASYN is: {train_y[target].value_counts()}")
        # transform the dataset
        oversample = ADASYN()
        train_x, train_y = oversample.fit_resample(train_x, train_y)
        # summarize the new class distribution
        logger.info(
            f"counter after ADASYN is: {train_y[target].value_counts()}")
        return train_x, train_y
    except Exception as ex:
        logger.error(
            f"failed to run adaptive_synthetic_sampling_func due to: {ex}")
def DealwithSample(data, label, method="ADASYN"):
    """Resample (data, label) with the chosen oversampling strategy.

    BUG FIX: the default was "ADA", which matched no branch and made the
    function silently return None; the default is now "ADASYN" and unknown
    methods raise instead of falling through.

    :param method: 'ADASYN', 'RandomOverSampler', or 'SMOTE'
    :return: (X_res, y_res)
    """
    if method == "ADASYN":
        ada = ADASYN(random_state=42)
        X_res, y_res = ada.fit_resample(data, label)
        return X_res, y_res
    elif method == "RandomOverSampler":
        ros = RandomOverSampler(random_state=42)
        X_res, y_res = ros.fit_resample(data, label)
        print("has oversampled the data {}".format(len(X_res)))
        return X_res, y_res
    elif method == "SMOTE":
        smote = SMOTE(random_state=42)
        X_res, y_res = smote.fit_resample(data, label)
        return X_res, y_res
    else:
        raise ValueError("unsupported sampling method: %r" % method)
def readFile(path,
             y_label,
             method,
             encode_features=[],
             skew_exempted=[],
             training_ratio=0.7,
             shuffle=True,
             needSkew=False,
             fea_eng=True):
    """Load a CSV, optionally de-skew and one-hot encode it, split it, and
    rebalance the training split.

    :param path: CSV path
    :param y_label: target column name
    :param method: 'OverSample' (ADASYN) or 'UnderSample' (ENN); any other
                   value leaves the split unbalanced
    :param encode_features: categorical columns for pd.get_dummies
    :param skew_exempted: numeric columns excluded from the skew correction
    :param training_ratio: forwarded to the project-level split() helper
    :param needSkew: log1p-transform numeric columns with skew > 0.75
    :param fea_eng: currently unused (feature-engineering hook is commented out)
    :return: X_train, X_test, y_train, y_test

    NOTE(review): mutable default args ([]) are never mutated here, so they
    are harmless, but consider None defaults if this is ever refactored.
    """
    raw = pd.read_csv(path)
    n, d = raw.shape

    if (shuffle):
        raw = raw.sample(frac=1).reset_index(drop=True)  # shuffle

    if (needSkew):
        # log1p-transform strongly right-skewed numeric columns
        skewed = raw[raw.dtypes[raw.dtypes != "object"].index.drop(
            skew_exempted)].apply(lambda x: skew(x.dropna()))
        skewed = skewed[skewed > 0.75].index
        raw[skewed] = np.log1p(raw[skewed])  # reduce skewness

    raw = pd.get_dummies(
        raw, columns=encode_features)  # encode categorical features
    # impute remaining NaNs with column means
    raw = raw.fillna(raw.mean())
    # if(method=='OverSample'):
    #     ind_more=np.argmax(np.bincount(raw[y_label]))
    #     more=raw[ind]
    #     less=raw[-ind]
    #     x = [randint(0, len(less)) for a in range(0, len(more)-len(less))]
    #     raw.
    X = raw.drop(y_label, axis=1)
    y = raw[y_label]
    # split() is a project-level helper, not sklearn's train_test_split
    X_train, X_test, y_train, y_test = split(X, y, training_ratio)
    if (method == 'OverSample'):
        ada = ADASYN(random_state=42)
        X_res, y_res = ada.fit_resample(X_train, y_train)
        X_train = X_res
        y_train = y_res
    if (method == 'UnderSample'):
        # for i in []
        #model = CondensedNearestNeighbour(random_state=42) # doctest: +SKIP
        # NOTE(review): EditedNearestNeighbours dropped random_state in
        # newer imblearn releases — confirm the pinned version accepts it.
        model = EditedNearestNeighbours(random_state=42)
        X_res, y_res = model.fit_resample(X_train, y_train)
        X_train = X_res
        y_train = y_res
    # if(method=='Weights'):
    # if(fea_eng==True):
    #     # X,y=feature_eng(X,y)

    return X_train, X_test, y_train, y_test
Esempio n. 30
0
def adasyn(X, y):
    """Balancing data using ADASYN

    Args:
        X: Training set without Class Target
        y: Training set Class Target

    Returns:
        the balanced (X, y) pair
    """
    resampler = ADASYN(random_state=42, sampling_strategy='minority')
    X_bal, y_bal = resampler.fit_resample(X, y)
    print('after balancing:', X_bal.shape)
    return X_bal, y_bal
Esempio n. 31
0
def test_ada_fit_resample():
    """ADASYN with the fixed seed must reproduce this reference dataset.

    NOTE(review): this duplicates an earlier test of the same name in this
    file (only the array formatting differs); in one module the later
    definition would shadow the earlier one.  X, Y, RND_SEED and R_TOL are
    module-level fixtures defined elsewhere.
    """
    ada = ADASYN(random_state=RND_SEED)
    X_resampled, y_resampled = ada.fit_resample(X, Y)
    X_gt = np.array([[0.11622591, -0.0317206], [0.77481731, 0.60935141], [
        1.25192108, -0.22367336
    ], [0.53366841, -0.30312976], [1.52091956, -0.49283504], [
        -0.28162401, -2.10400981
    ], [0.83680821, 1.72827342], [0.3084254, 0.33299982], [
        0.70472253, -0.73309052
    ], [0.28893132, -0.38761769], [1.15514042, 0.0129463], [
        0.88407872, 0.35454207
    ], [1.31301027, -0.92648734], [-1.11515198, -0.93689695], [
        -0.18410027, -0.45194484
    ], [0.9281014, 0.53085498], [-0.14374509, 0.27370049], [
        -0.41635887, -0.38299653
    ], [0.08711622, 0.93259929], [1.70580611, -0.11219234],
                     [0.94899098, -0.30508981], [0.28204936, -0.13953426],
                     [1.58028868, -0.04089947], [0.66117333, -0.28009063]])
    y_gt = np.array([
        0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0
    ])
    assert_allclose(X_resampled, X_gt, rtol=R_TOL)
    assert_array_equal(y_resampled, y_gt)
Esempio n. 32
0
def test_adasyn_error(adasyn_params, err_msg):
    """Parametrized: each (adasyn_params, err_msg) pair must make ADASYN
    raise a ValueError whose message matches err_msg on fit_resample."""
    adasyn = ADASYN(**adasyn_params)
    with pytest.raises(ValueError, match=err_msg):
        adasyn.fit_resample(X, Y)
Esempio n. 33
0
def test_ada_fit_sampling_strategy_error():
    """A sampling_strategy requesting fewer samples than already exist must
    make ADASYN raise 'No samples will be generated.'"""
    sampling_strategy = {0: 9, 1: 12}
    ada = ADASYN(sampling_strategy=sampling_strategy, random_state=RND_SEED)
    with raises(ValueError, match="No samples will be generated."):
        ada.fit_resample(X, Y)
Esempio n. 34
0
def test_ada_wrong_nn_obj():
    """Passing a non-estimator (a string) as n_neighbors must raise a
    ValueError mentioning the accepted types ('has to be one of')."""
    nn = 'rnd'
    ada = ADASYN(random_state=RND_SEED, n_neighbors=nn)
    with raises(ValueError, match="has to be one of"):
        ada.fit_resample(X, Y)