def test_fit_sample_auto():
    """Test the fit and sample routine with auto ratio."""
    # Define the ratio parameter
    ratio = 'auto'

    # Create the sampling object
    ee = EasyEnsemble(
        ratio=ratio, random_state=RND_SEED, return_indices=True, n_subsets=3)

    # Get the different subset
    X_resampled, y_resampled, idx_under = ee.fit_sample(X, Y)

    X_gt = np.array([[[0.85117925, 1.0185556], [-0.58539673, 0.62515052],
                      [1.35269503, 0.44812421], [0.5220963, 0.11349303],
                      [1.10915364, 0.05718352], [0.22039505, 0.26469445]],
                     [[0.85117925, 1.0185556], [-0.58539673, 0.62515052],
                      [-1.23195149, 0.15427291], [-2.10724436, 0.70263997],
                      [0.22039505, 0.26469445], [1.10915364, 0.05718352]],
                     [[0.85117925, 1.0185556], [-0.58539673, 0.62515052],
                      [-1.23195149, 0.15427291], [0.5220963, 0.11349303],
                      [1.10915364, 0.05718352], [0.59091459, 0.40692742]]])
    y_gt = np.array([[0, 0, 1, 1, 2, 2], [0, 0, 1, 1, 2, 2],
                     [0, 0, 1, 1, 2, 2]])
    idx_gt = np.array([[5, 9, 4, 0, 2, 3], [5, 9, 8, 6, 3, 2],
                       [5, 9, 8, 0, 2, 1]])
    assert_array_equal(X_resampled, X_gt)
    assert_array_equal(y_resampled, y_gt)
    assert_array_equal(idx_under, idx_gt)
def test_fit_resample_half():
    # Define the sampling_strategy parameter
    sampling_strategy = {0: 2, 1: 3, 2: 3}
    # Create the sampling object
    ee = EasyEnsemble(sampling_strategy=sampling_strategy,
                      random_state=RND_SEED, n_subsets=3)
    # Get the different subset
    X_resampled, y_resampled = ee.fit_resample(X, Y)
    X_gt = np.array([[[-0.58539673, 0.62515052], [0.85117925, 1.0185556],
                      [1.35269503, 0.44812421], [-1.23195149, 0.15427291],
                      [0.5220963, 0.11349303], [1.10915364, 0.05718352],
                      [0.59091459, 0.40692742], [0.22039505, 0.26469445]],
                     [[0.85117925, 1.0185556], [-0.58539673, 0.62515052],
                      [1.35269503, 0.44812421], [-2.10724436, 0.70263997],
                      [-1.23195149, 0.15427291], [0.59091459, 0.40692742],
                      [0.22039505, 0.26469445], [1.10915364, 0.05718352]],
                     [[0.85117925, 1.0185556], [-0.58539673, 0.62515052],
                      [-1.23195149, 0.15427291], [0.5220963, 0.11349303],
                      [1.35269503, 0.44812421], [1.10915364, 0.05718352],
                      [0.59091459, 0.40692742], [0.22039505, 0.26469445]]])
    y_gt = np.array([[0, 0, 1, 1, 1, 2, 2, 2], [0, 0, 1, 1, 1, 2, 2, 2],
                     [0, 0, 1, 1, 1, 2, 2, 2]])
    assert_array_equal(X_resampled, X_gt)
    assert_array_equal(y_resampled, y_gt)
def buildModel(clf, X, y, cv_nums=10, is_random=False):
    # Whether to randomise the sampler seeds
    if is_random:
        random_lst = list(np.random.randint(0, 1000, 4))
    else:
        random_lst = [0] * 4

    print('---------- Results of the class-imbalance handling methods: '
          'mean f1 over ' + str(cv_nums) + '-fold cross-validation ----------')
    # No treatment: predict on the original data set
    print('Original data set: ',
          np.mean(cross_val_score(clf, X, y, scoring='f1', cv=cv_nums)))

    ros = RandomOverSampler(random_state=random_lst[0])
    X_oversampled, y_oversampled = ros.fit_sample(X, y)
    # print(sorted(Counter(y_oversampled).items()))
    print('Over-sampling: ',
          np.mean(cross_val_score(clf, X_oversampled, y_oversampled,
                                  scoring='f1', cv=cv_nums)))

    cc = ClusterCentroids(random_state=random_lst[1])
    X_undersampled, y_undersampled = cc.fit_sample(X, y)
    # print(sorted(Counter(y_undersampled).items()))
    print('Under-sampling: ',
          np.mean(cross_val_score(clf, X_undersampled, y_undersampled,
                                  scoring='f1', cv=cv_nums)))

    sm = SMOTE(random_state=random_lst[2])
    X_smote, y_smote = sm.fit_sample(X, y)
    # print(sorted(Counter(y_smote).items()))
    print('SMOTE: ',
          np.mean(cross_val_score(clf, X_smote, y_smote,
                                  scoring='f1', cv=cv_nums)))

    # Split the majority class into several subsets, one per learner: each
    # learner sees an under-sampled view, but globally no information is
    # thrown away. E.g. split the negative class into 10 parts while the
    # positive class forms 1 part, then train 10 learners, each on 1 negative
    # part plus the shared positive samples.
    ee = EasyEnsemble(random_state=random_lst[3], n_subsets=10)
    X_ee, y_ee = ee.fit_sample(X, y)
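The snippet above stops right after drawing the EasyEnsemble subsets. A minimal sketch of how the comparison could continue, reusing the `clf`, `cv_nums`, `X_ee`, and `y_ee` names from `buildModel`; the averaging loop is an illustration, not the original author's code:

# Hypothetical continuation of buildModel: score each balanced subset with
# the same classifier and report the mean f1 across the 10 subsets.
ee_scores = [np.mean(cross_val_score(clf, X_sub, y_sub,
                                     scoring='f1', cv=cv_nums))
             for X_sub, y_sub in zip(X_ee, y_ee)]
print('EasyEnsemble: ', np.mean(ee_scores))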
def ensemble_train(X, y, working_dir, n, name, svm=True):
    ees = EasyEnsemble(random_state=557, n_subsets=n)
    X_res, y_res = ees.fit_sample(X, y)
    try:
        # Forces the except branch, so the pickled model below is never
        # loaded and the ensemble is always retrained.
        raise Exception('Retrain')
        with open(working_dir + "/" + name + '.pkl', 'rb') as f1:
            clf = pickle.load(f1)
    except Exception:
        # scores = cross_val_score(clf, X, y, cv=4, scoring="roc_auc")
        # print("Name %s ROC_AUC: %0.2f (+/- %0.2f)" % (name, scores.mean(), scores.std() * 2))
        clf = []
        for i in range(len(X_res)):
            print(Counter(y_res[i]))
            if svm:
                clfi = SVC(kernel="linear", probability=True)
            else:
                clfi = AdaBoostClassifier(n_estimators=20)
                # clfi = AdaBoostClassifier()
            clfi.fit(X_res[i], y_res[i])
            clf.append(clfi)
            scores = cross_val_score(clfi, X_res[i], y_res[i], cv=4,
                                     scoring="roc_auc")
            print("Name %s ROC_AUC: %0.2f (+/- %0.2f)"
                  % (name, scores.mean(), scores.std() * 2))
        with open(working_dir + "/" + name + '.pkl', 'wb') as f1:
            pickle.dump(clf, f1)
    return clf
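A sketch of how the classifier list returned by `ensemble_train` might be combined at prediction time; `predict_ensemble` is a hypothetical helper, and averaging `predict_proba` is one of several reasonable aggregation rules (both `SVC(probability=True)` and `AdaBoostClassifier` expose it):

import numpy as np

def predict_ensemble(clf_list, X_test, threshold=0.5):
    # Hypothetical helper: average the positive-class probability over all
    # sub-classifiers, then threshold to obtain hard labels.
    proba = np.mean([c.predict_proba(X_test)[:, 1] for c in clf_list], axis=0)
    return (proba >= threshold).astype(int)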
def test_fit_sample_half():
    # Define the sampling_strategy parameter
    sampling_strategy = {0: 2, 1: 3, 2: 3}
    # Create the sampling object
    ee = EasyEnsemble(sampling_strategy=sampling_strategy,
                      random_state=RND_SEED, n_subsets=3)
    # Get the different subset
    X_resampled, y_resampled = ee.fit_sample(X, Y)
    X_gt = np.array([[[-0.58539673, 0.62515052], [0.85117925, 1.0185556],
                      [1.35269503, 0.44812421], [-1.23195149, 0.15427291],
                      [0.5220963, 0.11349303], [1.10915364, 0.05718352],
                      [0.59091459, 0.40692742], [0.22039505, 0.26469445]],
                     [[0.85117925, 1.0185556], [-0.58539673, 0.62515052],
                      [1.35269503, 0.44812421], [-2.10724436, 0.70263997],
                      [-1.23195149, 0.15427291], [0.59091459, 0.40692742],
                      [0.22039505, 0.26469445], [1.10915364, 0.05718352]],
                     [[0.85117925, 1.0185556], [-0.58539673, 0.62515052],
                      [-1.23195149, 0.15427291], [0.5220963, 0.11349303],
                      [1.35269503, 0.44812421], [1.10915364, 0.05718352],
                      [0.59091459, 0.40692742], [0.22039505, 0.26469445]]])
    y_gt = np.array([[0, 0, 1, 1, 1, 2, 2, 2], [0, 0, 1, 1, 1, 2, 2, 2],
                     [0, 0, 1, 1, 1, 2, 2, 2]])
    assert_array_equal(X_resampled, X_gt)
    assert_array_equal(y_resampled, y_gt)
def __init__(self, base_model, n_subsets):
    self.base_model = base_model
    self.n_subsets = n_subsets
    # 'auto' balances every subset; draw as many subsets as requested.
    self.easy_ensemble = EasyEnsemble('auto', random_state=RAND_SEED,
                                      n_subsets=self.n_subsets)
    self.trained_based_models = []
def ezensemble(X_train, y_train):
    columns = list(X_train)  # column names of the training DataFrame
    ee = EasyEnsemble(random_state=0, n_subsets=10)
    X_resampled, y_resampled = ee.fit_sample(X_train, y_train)
    # Keep only the second of the ten balanced subsets as a DataFrame.
    X_resampled = pd.DataFrame(X_resampled[1], columns=columns)
    y_resampled = pd.DataFrame(y_resampled[1], columns=['Target'])
    return X_resampled, y_resampled
def easy_ensemble(train_set, train_label):
    ee = EasyEnsemble(ratio='auto', return_indices=True, random_state=None,
                      replacement=False, n_subsets=easy_ensemble_num)
    X_resampled, y_resampled, idx_resampled = ee.fit_sample(
        train_set, train_label)
    return X_resampled, y_resampled
def test_random_state_none():
    # Define the ratio parameter
    ratio = 'auto'
    # Create the sampling object
    ee = EasyEnsemble(ratio=ratio, random_state=None)
    # Get the different subset
    X_resampled, y_resampled = ee.fit_sample(X, Y)
def test_random_state_none():
    # Define the sampling_strategy parameter
    sampling_strategy = 'auto'
    # Create the sampling object
    ee = EasyEnsemble(sampling_strategy=sampling_strategy, random_state=None)
    # Get the different subset
    X_resampled, y_resampled = ee.fit_sample(X, Y)
def test_random_state_none():
    # Define the sampling_strategy parameter
    sampling_strategy = 'auto'
    # Create the sampling object
    ee = EasyEnsemble(sampling_strategy=sampling_strategy, random_state=None)
    # Get the different subset
    X_resampled, y_resampled = ee.fit_resample(X, Y)
def test_sample_wrong_X():
    """Test that an error is raised when X at sampling time differs from X
    at fitting time."""
    # Create the object
    ee = EasyEnsemble(random_state=RND_SEED)
    ee.fit(X, Y)
    assert_raises(RuntimeError, ee.sample, np.random.random((100, 40)),
                  np.array([0] * 50 + [1] * 50))
def EasySample(data):
    x = data.iloc[:, 0:2]
    y = data.iloc[:, -2]
    # Handle the imbalanced samples with the EasyEnsemble ensemble method
    model_EasyEnsemble = EasyEnsemble()  # build the EasyEnsemble model object
    x_EasyEnsemble_resampled, y_EasyEnsemble_resampled = \
        model_EasyEnsemble.fit_sample(x, y)  # feed in the data and resample
    print(x_EasyEnsemble_resampled.shape)  # shape of the resampled x subsets
    print(y_EasyEnsemble_resampled.shape)  # shape of the resampled y subsets
def test_random_state_none():
    """Test that the processing goes through with random state being None."""
    # Define the ratio parameter
    ratio = 0.5
    # Create the sampling object
    ee = EasyEnsemble(ratio=ratio, random_state=None)
    # Get the different subset
    X_resampled, y_resampled = ee.fit_sample(X, Y)
def test_random_state_none():
    """Test that the processing goes through with random state being None."""
    # Define the ratio parameter
    ratio = 'auto'
    # Create the sampling object
    ee = EasyEnsemble(ratio=ratio, random_state=None)
    # Get the different subset
    X_resampled, y_resampled = ee.fit_sample(X, Y)
def fit(self, train_x, train_y):
    self._estimators = []
    ee = EasyEnsemble(replacement=True, n_subsets=self._no_of_estimators)
    X_res, y_res = ee.fit_sample(train_x, train_y)
    # Train one clone of the base classifier per balanced subset.
    for i in range(self._no_of_estimators):
        X, y = X_res[i, :, :], y_res[i, :]
        estimator = clone(self._base_classifier)
        estimator.fit(X, y)
        self._estimators.append(estimator)
    return self
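The `fit` above trains one clone per balanced subset; a matching `predict` by majority vote might look like the following sketch (it assumes the `_estimators` list populated above and is not part of the original class):

import numpy as np
from scipy import stats

def predict(self, X):
    # Majority vote across the per-subset estimators.
    votes = np.asarray([est.predict(X) for est in self._estimators])
    return stats.mode(votes, axis=0).mode.ravel()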
from sklearn.base import clone

def easy_ensemble_classifier(clf, x_train, y_train, x_test, nsubs, repl):
    ee = EasyEnsemble(n_subsets=nsubs, replacement=repl)  # create EasyEnsemble object
    X_train_res, y_train_res = ee.fit_sample(x_train, y_train)  # re-sample the data
    clfs = []
    preds_ = np.zeros([1, np.shape(x_test)[0]])
    # Iterate through sub-samples, fitting a fresh clone of the classifier
    # on each one (appending the same estimator object would leave the list
    # holding N references to the last fit):
    for i, xtrain in enumerate(X_train_res):
        clfs.append(clone(clf))
        clfs[i].fit(xtrain, y_train_res[i])
        preds_ = np.add(preds_, clfs[i].predict(x_test))
    # Average of the hard 0/1 predictions across the nsubs classifiers.
    return np.divide(preds_, nsubs)
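Usage sketch: the value returned by `easy_ensemble_classifier` is the mean of the sub-classifiers' hard 0/1 predictions, so it still needs thresholding; all names below (`X_train`, `y_train`, `X_test`) are placeholders:

from sklearn.tree import DecisionTreeClassifier

# Hypothetical call with 10 subsets, sampling without replacement.
avg_preds = easy_ensemble_classifier(DecisionTreeClassifier(), X_train,
                                     y_train, X_test, nsubs=10, repl=False)
y_pred = (avg_preds >= 0.5).astype(int).ravel()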
def get_downsampling_data(train_pth="data/train_data.npy",
                          val_pth="data/val_data.npy",
                          test_pth="data/test_data.npy"):
    train_data = np.load(train_pth)[:, :-1]
    train_flag = np.load(train_pth)[:, -1]
    # EasyEnsemble returns 10 balanced subsets, so train_data/train_flag
    # come back with an extra leading subset dimension.
    ee = EasyEnsemble(random_state=0, n_subsets=10)
    train_data, train_flag = ee.fit_sample(train_data, train_flag)
    train_flag = np.array(train_flag, dtype=int)

    val_data = np.load(val_pth)[:, :-1]
    val_flag = np.load(val_pth)[:, -1]
    val_flag = np.array(val_flag, dtype=int)

    test_data = np.load(test_pth)[:, :-1]
    test_flag = np.load(test_pth)[:, -1]
    test_flag = np.array(test_flag, dtype=int)
    return train_data, train_flag, val_data, val_flag, test_data, test_flag
def test_multiclass_fit_sample():
    """Test fit sample method with multiclass target"""
    # Make y to be multiclass
    y = Y.copy()
    y[0:1000] = 2

    # Resample the data
    ee = EasyEnsemble(random_state=RND_SEED)
    X_resampled, y_resampled = ee.fit_sample(X, y)

    # Check the size of y
    count_y_res = Counter(y_resampled[0])
    assert_equal(count_y_res[0], 400)
    assert_equal(count_y_res[1], 400)
    assert_equal(count_y_res[2], 400)
def test_ee_fit():
    """Test the fitting method"""
    # Define the parameter for the under-sampling
    ratio = 'auto'
    # Create the object
    ee = EasyEnsemble(ratio=ratio, random_state=RND_SEED)
    # Fit the data
    ee.fit(X, Y)

    # Check that the data information has been computed
    assert_equal(ee.min_c_, 0)
    assert_equal(ee.maj_c_, 1)
    assert_equal(ee.stats_c_[0], 500)
    assert_equal(ee.stats_c_[1], 4500)
def Balance_classes(X_train, y_train, Sampling_Function):
    if Sampling_Function == 'RandomUnderSampler':
        us = RandomUnderSampler(ratio=0.5, random_state=1)
    elif Sampling_Function == 'NearMiss1':
        us = NearMiss(ratio=0.5, random_state=1, version=1, size_ngh=3)
    elif Sampling_Function == 'NearMiss2':
        us = NearMiss(ratio=0.5, random_state=1, version=2, size_ngh=3)
    elif Sampling_Function == 'NearMiss3':
        us = NearMiss(ratio=0.5, random_state=1, version=3, ver3_samp_ngh=3)
    elif Sampling_Function == 'CondensedNearestNeighbour':
        us = CondensedNearestNeighbour(random_state=1)
    elif Sampling_Function == 'EditedNearestNeighbours':
        us = EditedNearestNeighbours(random_state=1, size_ngh=5)
    elif Sampling_Function == 'RepeatedEditedNearestNeighbours':
        us = RepeatedEditedNearestNeighbours(random_state=1, size_ngh=5)
    elif Sampling_Function == 'TomekLinks':
        us = TomekLinks(random_state=1)
    elif Sampling_Function == 'RandomOverSampler':
        us = RandomOverSampler(ratio=0.5, random_state=1)
    elif Sampling_Function == 'SMOTE':
        us = SMOTE(ratio=0.5, k=5, random_state=1)
    elif Sampling_Function == 'SMOTETomek':
        us = SMOTETomek(ratio=0.5, k=5, random_state=1)
    elif Sampling_Function == 'SMOTEENN':
        us = SMOTEENN(ratio=0.5, k=5, random_state=1, size_ngh=5)
    elif Sampling_Function == 'EasyEnsemble':
        us = EasyEnsemble()
    elif Sampling_Function == 'BalanceCascade_rf':
        us = BalanceCascade(classifier='random-forest', random_state=1)
    elif Sampling_Function == 'BalanceCascade_svm':
        us = BalanceCascade(classifier='linear-svm', random_state=1)

    X_train_res, y_train_res = us.fit_sample(X_train, y_train)

    return X_train_res, y_train_res
def __init__(self, window_size=6, training_ratio=.7, seq="sequence",
             pos="label"):
    self.training_ratio = training_ratio  # fraction of data used for training
    self.features = []
    self.labels = []
    self.words = []
    self.window_size = window_size
    self.supervised_classifiers = {
        "forest": RandomForestClassifier(n_jobs=4),
        "mlp_adam": MLPClassifier(),
        "svc": svm.SVC(verbose=1),
        "xgb": XGBClassifier(max_delta_step=5),
        "bagging": BaggingClassifier(),
        "one_class_svm": OneClassSVM(kernel="rbf")
    }
    self.imbalance_functions = {
        "easy_ensemble": EasyEnsemble(),
        "SMOTEENN": SMOTEENN(),
        "SMOTETomek": SMOTETomek(),
        "ADASYN": ADASYN(),
        "random_under_sample": RandomUnderSampler(),
        "ncl": NeighbourhoodCleaningRule(),
        "near_miss": NearMiss(),
        "pass": -1
    }
    self.seq = seq
    self.pos = pos
    self.random_data = 0
    self.test_results = 0
    self.vecs = {
        "sequence": sequence_vector,
        "chemical": chemical_vector,
        "binary": binary_vector,
        "w2v": "w2v"
    }
    self.vector = 0
    self.features_labels = {}
    self.test_cv = 0
    self.benchmark_mcc = 0
    self.mcc_scorer = make_scorer(matthews_corrcoef)
def test_fit_sample_half():
    """Test the fit and sample routine with a 0.5 ratio."""
    # Define the ratio parameter
    ratio = 0.5
    # Create the sampling object
    ee = EasyEnsemble(ratio=ratio, random_state=RND_SEED)
    # Get the different subset
    X_resampled, y_resampled = ee.fit_sample(X, Y)

    currdir = os.path.dirname(os.path.abspath(__file__))
    X_gt = np.load(os.path.join(currdir, 'data', 'ee_x_05.npy'))
    y_gt = np.load(os.path.join(currdir, 'data', 'ee_y_05.npy'))
    assert_array_equal(X_resampled, X_gt)
    assert_array_equal(y_resampled, y_gt)
def test_continuous_error():
    """Test that a warning is raised when the target is of continuous type"""
    # continuous case
    y = np.linspace(0, 1, 10)
    ee = EasyEnsemble(random_state=RND_SEED)
    assert_warns(UserWarning, ee.fit, X, y)
def test_ee_init():
    # Define a ratio
    ratio = 1.
    ee = EasyEnsemble(ratio=ratio, random_state=RND_SEED)

    assert_equal(ee.ratio, ratio)
    assert_equal(ee.replacement, False)
    assert_equal(ee.n_subsets, 10)
    assert_equal(ee.random_state, RND_SEED)
def get_batch(self, tokenized_samples, labels):
    e = EasyEnsemble(random_state=0, n_subsets=1)
    e.fit(tokenized_samples, labels)
    X_resampled, y_resampled = e.sample(tokenized_samples, labels)
    # Only one subset was requested; unwrap it.
    X = X_resampled[0]
    y = y_resampled[0]
    targets = np.zeros(shape=(len(X), self._num_labels))
    samples = np.zeros(shape=(len(X), self._max_document_length))
    for sample_ix, sample in enumerate(X):
        label = y[sample_ix]
        targets[sample_ix, label] = 1
        samples[sample_ix, :sample.shape[0]] = \
            sample[:self._max_document_length]
    return samples, targets
def EnsembleSample(X, Y, method='EasyEnsemble', random_state=42):
    if X.size == len(X):
        X = X.reshape(-1, 1)
    if method == 'EasyEnsemble':
        sampler = EasyEnsemble(ratio='auto', random_state=random_state,
                               replacement=False, n_subsets=10)
    elif method == 'BalanceCascade':
        sampler = BalanceCascade(ratio='auto', random_state=random_state,
                                 n_max_subset=None, classifier=None,
                                 estimator=None)
    X_resampled, Y_resampled = sampler.fit_sample(X, Y)
    # Combined sampling + classifier: the object below is itself a classifier.
    # BalancedBaggingClassifier(base_estimator=None, n_estimators=10,
    #     max_samples=1.0, max_features=1.0, bootstrap=True,
    #     bootstrap_features=False, oob_score=False, warm_start=False,
    #     ratio='auto', replacement=False, n_jobs=1, random_state=None,
    #     verbose=0)
    return X_resampled, Y_resampled
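The commented signature above points at the combined sampler-plus-classifier API. Since imbalanced-learn 0.4 the `EasyEnsemble` sampler is deprecated in favour of `EasyEnsembleClassifier`, which resamples and trains AdaBoost learners in one object; a minimal sketch with placeholder `X`, `Y`:

from imblearn.ensemble import EasyEnsembleClassifier

# One AdaBoost learner per balanced subset, aggregated internally.
eec = EasyEnsembleClassifier(n_estimators=10, random_state=42)
eec.fit(X, Y)
Y_pred = eec.predict(X)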
def test_ee_init():
    # Define a ratio
    ratio = 1.
    ee = EasyEnsemble(ratio=ratio, random_state=RND_SEED)
    assert ee.ratio == ratio
    assert ee.replacement is False
    assert ee.n_subsets == 10
    assert ee.random_state == RND_SEED
def test_ee_fit_invalid_ratio():
    """Test that an error is raised when the balancing ratio to fit is
    smaller than the one of the data"""
    # Create the object
    ratio = 1. / 10000.
    ee = EasyEnsemble(ratio=ratio, random_state=RND_SEED)
    # Fit the data
    assert_raises(RuntimeError, ee.fit, X, Y)
def test_ee_init():
    # Define a sampling_strategy
    sampling_strategy = 1.
    ee = EasyEnsemble(sampling_strategy=sampling_strategy,
                      random_state=RND_SEED)
    assert ee.sampling_strategy == sampling_strategy
    assert ee.replacement is False
    assert ee.n_subsets == 10
    assert ee.random_state == RND_SEED
def test_fit_sample_auto():
    """Test the fit and sample routine with auto ratio."""
    # Define the ratio parameter
    ratio = 'auto'
    # Create the sampling object
    ee = EasyEnsemble(ratio=ratio, random_state=RND_SEED, return_indices=True)
    # Get the different subset
    X_resampled, y_resampled, idx_under = ee.fit_sample(X, Y)

    currdir = os.path.dirname(os.path.abspath(__file__))
    X_gt = np.load(os.path.join(currdir, 'data', 'ee_x.npy'))
    y_gt = np.load(os.path.join(currdir, 'data', 'ee_y.npy'))
    idx_gt = np.load(os.path.join(currdir, 'data', 'ee_idx.npy'))
    assert_array_equal(X_resampled, X_gt)
    assert_array_equal(y_resampled, y_gt)
    assert_array_equal(idx_under, idx_gt)
def resample_data(train_feature, train_class, count_sampleset):
    multiplier = {0: 1.0, 1: 0.1, 2: 0.1, 3: 1.0, 4: 1.0,
                  5: 0.1, 6: 1.0, 7: 0.5, 8: 0.1}
    # Scale the per-class sample counts by the multipliers above to obtain
    # the target class distribution passed to EasyEnsemble as `ratio`.
    target_stats = collections.Counter(train_class)
    for key, value in target_stats.items():
        target_stats[key] = int(value * multiplier[key])
    ee = EasyEnsemble(ratio=target_stats, n_subsets=count_sampleset)
    return ee.fit_sample(train_feature, train_class)
def test_sample_wt_fit():
    """Test that an error is raised when sample is called before fitting"""
    # Define the parameter for the under-sampling
    ratio = 'auto'
    # Create the object
    ee = EasyEnsemble(ratio=ratio, random_state=RND_SEED)
    assert_raises(RuntimeError, ee.sample, X, Y)
def test_ee_init():
    """Test the initialisation of the object"""
    # Define a ratio
    ratio = 1.
    ee = EasyEnsemble(ratio=ratio, random_state=RND_SEED)

    assert_equal(ee.ratio, ratio)
    assert_equal(ee.replacement, False)
    assert_equal(ee.n_subsets, 10)
    assert_equal(ee.random_state, RND_SEED)
print(__doc__)

# Generate the dataset
X, y = make_classification(n_classes=2, class_sep=2, weights=[0.3, 0.7],
                           n_informative=3, n_redundant=1, flip_y=0,
                           n_features=20, n_clusters_per_class=1,
                           n_samples=100, random_state=10)

# Instantiate a PCA object for the sake of easy visualisation
pca = PCA(n_components=2)
# Fit and transform X to visualise it inside a 2D feature space
X_vis = pca.fit_transform(X)

# Apply Easy Ensemble
ee = EasyEnsemble(n_subsets=3)
X_resampled, y_resampled = ee.fit_sample(X, y)
X_res_vis = []
for X_res in X_resampled:
    X_res_vis.append(pca.transform(X_res))

# Two subplots, unpack the axes array immediately
f, (ax1, ax2) = plt.subplots(1, 2)

ax1.scatter(X_vis[y == 0, 0], X_vis[y == 0, 1], label="Class #0", alpha=0.5)
ax1.scatter(X_vis[y == 1, 0], X_vis[y == 1, 1], label="Class #1", alpha=0.5)
ax1.set_title('Original set')

ax2.scatter(X_vis[y == 0, 0], X_vis[y == 0, 1], label="Class #0", alpha=0.5)
for iy, e in enumerate(X_res_vis):
    ax2.scatter(e[y_resampled[iy] == 1, 0], e[y_resampled[iy] == 1, 1],
                label="Class #1 - set #{}".format(iy), alpha=0.5)
ax2.set_title('Easy ensemble')

plt.legend()
plt.show()