def test_fit_sample_auto():
    """Check EasyEnsemble with ratio='auto' against hard-coded expectations.

    Three subsets are requested together with the selected indices; the
    resampled features, labels and indices must match the reference arrays.
    """
    sampler = EasyEnsemble(
        ratio='auto',
        random_state=RND_SEED,
        return_indices=True,
        n_subsets=3,
    )
    X_resampled, y_resampled, idx_under = sampler.fit_sample(X, Y)

    # Reference output: three subsets holding two samples of each class.
    expected_X = np.array([
        [[0.85117925, 1.0185556],
         [-0.58539673, 0.62515052],
         [1.35269503, 0.44812421],
         [0.5220963, 0.11349303],
         [1.10915364, 0.05718352],
         [0.22039505, 0.26469445]],
        [[0.85117925, 1.0185556],
         [-0.58539673, 0.62515052],
         [-1.23195149, 0.15427291],
         [-2.10724436, 0.70263997],
         [0.22039505, 0.26469445],
         [1.10915364, 0.05718352]],
        [[0.85117925, 1.0185556],
         [-0.58539673, 0.62515052],
         [-1.23195149, 0.15427291],
         [0.5220963, 0.11349303],
         [1.10915364, 0.05718352],
         [0.59091459, 0.40692742]],
    ])
    expected_y = np.array([
        [0, 0, 1, 1, 2, 2],
        [0, 0, 1, 1, 2, 2],
        [0, 0, 1, 1, 2, 2],
    ])
    expected_idx = np.array([
        [5, 9, 4, 0, 2, 3],
        [5, 9, 8, 6, 3, 2],
        [5, 9, 8, 0, 2, 1],
    ])

    checks = (
        (X_resampled, expected_X),
        (y_resampled, expected_y),
        (idx_under, expected_idx),
    )
    for actual, expected in checks:
        assert_array_equal(actual, expected)
def buildModel(clf, X, y, cv_nums=10, is_random=False):
    """Print the mean cross-validated F1 of ``clf`` under several resampling schemes.

    The classifier is scored on the raw data, then on data resampled with
    RandomOverSampler, ClusterCentroids and SMOTE. An EasyEnsemble resampling
    is computed last.

    Parameters
    ----------
    clf : estimator accepted by ``cross_val_score``.
    X, y : features and labels.
    cv_nums : number of cross-validation folds (also shown in the header line).
    is_random : when truthy, draw four random sampler seeds in [0, 1000);
        otherwise every sampler is seeded with 0.

    Returns
    -------
    None. Results are printed.
    """
    # One seed per sampler. Any falsy value now selects the fixed seeds; the
    # previous `== True` / `== False` chain left `random_lst` unbound for
    # values such as None.
    if is_random:
        random_lst = list(np.random.randint(0, 1000, 4))
    else:
        random_lst = [0] * 4

    def _mean_f1(features, labels):
        """Mean F1 of ``clf`` over ``cv_nums`` folds."""
        return np.mean(
            cross_val_score(clf, features, labels, scoring='f1', cv=cv_nums))

    print('----------各种类别不平衡处理方法结果, 为' + str(cv_nums) + '折交叉验证的f1均值----------')

    # Baseline: no resampling, original data set.
    print('原始数据集: ', _mean_f1(X, y))

    # Random over-sampling.
    ros = RandomOverSampler(random_state=random_lst[0])
    X_oversampled, y_oversampled = ros.fit_sample(X, y)
    print('过采样: ', _mean_f1(X_oversampled, y_oversampled))

    # Under-sampling with cluster centroids.
    cc = ClusterCentroids(random_state=random_lst[1])
    X_undersampled, y_undersampled = cc.fit_sample(X, y)
    print('欠采样: ', _mean_f1(X_undersampled, y_undersampled))

    # SMOTE over-sampling.
    sm = SMOTE(random_state=random_lst[2])
    X_smote, y_smote = sm.fit_sample(X, y)
    print('SMOTE: ', _mean_f1(X_smote, y_smote))

    # EasyEnsemble: split the majority class into several sets for different
    # learners, so each learner sees under-sampled data while no information
    # is lost overall. E.g. with the negatives split into 10 parts and a
    # single positive part, 10 learners are trained, each on one negative part
    # plus the shared positives.
    ee = EasyEnsemble(random_state=random_lst[3], n_subsets=10)
    # NOTE(review): the resampled subsets are not used within the visible
    # code; the function may continue beyond this point in the original file.
    X_ee, y_ee = ee.fit_sample(X, y)
def test_fit_sample_half():
    """Check EasyEnsemble with a float ratio of 0.6 against hard-coded subsets.

    Each of the three expected subsets holds two samples of class 0 and three
    samples of each of classes 1 and 2.
    """
    sampler = EasyEnsemble(ratio=0.6, random_state=RND_SEED, n_subsets=3)
    X_resampled, y_resampled = sampler.fit_sample(X, Y)

    expected_X = np.array([
        [[0.85117925, 1.0185556],
         [-0.58539673, 0.62515052],
         [1.35269503, 0.44812421],
         [0.5220963, 0.11349303],
         [-2.10724436, 0.70263997],
         [1.10915364, 0.05718352],
         [0.22039505, 0.26469445],
         [0.59091459, 0.40692742]],
        [[0.85117925, 1.0185556],
         [-0.58539673, 0.62515052],
         [-1.23195149, 0.15427291],
         [-2.10724436, 0.70263997],
         [0.5220963, 0.11349303],
         [0.22039505, 0.26469445],
         [1.10915364, 0.05718352],
         [0.59091459, 0.40692742]],
        [[0.85117925, 1.0185556],
         [-0.58539673, 0.62515052],
         [-1.23195149, 0.15427291],
         [0.5220963, 0.11349303],
         [1.35269503, 0.44812421],
         [1.10915364, 0.05718352],
         [0.59091459, 0.40692742],
         [0.22039505, 0.26469445]],
    ])
    expected_y = np.array([
        [0, 0, 1, 1, 1, 2, 2, 2],
        [0, 0, 1, 1, 1, 2, 2, 2],
        [0, 0, 1, 1, 1, 2, 2, 2],
    ])

    for actual, expected in ((X_resampled, expected_X),
                             (y_resampled, expected_y)):
        assert_array_equal(actual, expected)
def ensemble_train(X, y, working_dir, n, name, svm=True):
    """Train one classifier per EasyEnsemble subset and pickle the list.

    The data is split into ``n`` balanced subsets with EasyEnsemble. One
    classifier is fitted per subset and its 4-fold ROC AUC is printed. The
    list of fitted classifiers is written to ``<working_dir>/<name>.pkl``.

    Parameters
    ----------
    X, y : training features and labels.
    working_dir : directory that receives the pickle file.
    n : number of EasyEnsemble subsets (and therefore classifiers).
    name : base name of the pickle file, also used in the printed report.
    svm : when truthy use a linear ``SVC`` with probabilities enabled,
        otherwise ``AdaBoostClassifier(n_estimators=20)``.

    Returns
    -------
    list of fitted classifiers, one per subset.
    """
    ees = EasyEnsemble(random_state=557, n_subsets=n)
    X_res, y_res = ees.fit_sample(X, y)

    # The original forced a retrain by raising inside a try block, which made
    # the cached-pickle load unreachable and depended on a bare `except:`.
    # The model is therefore always retrained here, which is the same
    # behaviour without the dead code.
    clf = []
    for X_sub, y_sub in zip(X_res, y_res):
        print(Counter(y_sub))
        if svm:
            clfi = SVC(kernel="linear", probability=True)
        else:
            clfi = AdaBoostClassifier(n_estimators=20)
        clfi.fit(X_sub, y_sub)
        clf.append(clfi)
        scores = cross_val_score(clfi, X_sub, y_sub, cv=4, scoring="roc_auc")
        print("Name %s ROC_AUC: %0.2f (+/- %0.2f)" % (name, scores.mean(), scores.std() * 2))

    with open(working_dir + "/" + name + '.pkl', 'wb') as f1:
        pickle.dump(clf, f1)
    return clf
def test_fit_sample_half():
    """Check EasyEnsemble with a per-class ``sampling_strategy`` dictionary.

    The strategy requests 2 samples of class 0 and 3 samples of classes 1
    and 2; the three resampled subsets must match the reference arrays.
    """
    sampler = EasyEnsemble(
        sampling_strategy={0: 2, 1: 3, 2: 3},
        random_state=RND_SEED,
        n_subsets=3,
    )
    X_resampled, y_resampled = sampler.fit_sample(X, Y)

    expected_X = np.array([
        [[-0.58539673, 0.62515052],
         [0.85117925, 1.0185556],
         [1.35269503, 0.44812421],
         [-1.23195149, 0.15427291],
         [0.5220963, 0.11349303],
         [1.10915364, 0.05718352],
         [0.59091459, 0.40692742],
         [0.22039505, 0.26469445]],
        [[0.85117925, 1.0185556],
         [-0.58539673, 0.62515052],
         [1.35269503, 0.44812421],
         [-2.10724436, 0.70263997],
         [-1.23195149, 0.15427291],
         [0.59091459, 0.40692742],
         [0.22039505, 0.26469445],
         [1.10915364, 0.05718352]],
        [[0.85117925, 1.0185556],
         [-0.58539673, 0.62515052],
         [-1.23195149, 0.15427291],
         [0.5220963, 0.11349303],
         [1.35269503, 0.44812421],
         [1.10915364, 0.05718352],
         [0.59091459, 0.40692742],
         [0.22039505, 0.26469445]],
    ])
    expected_y = np.array([
        [0, 0, 1, 1, 1, 2, 2, 2],
        [0, 0, 1, 1, 1, 2, 2, 2],
        [0, 0, 1, 1, 1, 2, 2, 2],
    ])

    for actual, expected in ((X_resampled, expected_X),
                             (y_resampled, expected_y)):
        assert_array_equal(actual, expected)
def test_fit_sample_auto():
    """Verify 'auto' ratio resampling: features, labels and selected indices."""
    sampler_kwargs = dict(
        ratio='auto',
        random_state=RND_SEED,
        return_indices=True,
        n_subsets=3,
    )
    X_resampled, y_resampled, idx_under = EasyEnsemble(
        **sampler_kwargs).fit_sample(X, Y)

    # Reference arrays for the three expected subsets.
    expected_X = np.array([
        [[0.85117925, 1.0185556],
         [-0.58539673, 0.62515052],
         [1.35269503, 0.44812421],
         [0.5220963, 0.11349303],
         [1.10915364, 0.05718352],
         [0.22039505, 0.26469445]],
        [[0.85117925, 1.0185556],
         [-0.58539673, 0.62515052],
         [-1.23195149, 0.15427291],
         [-2.10724436, 0.70263997],
         [0.22039505, 0.26469445],
         [1.10915364, 0.05718352]],
        [[0.85117925, 1.0185556],
         [-0.58539673, 0.62515052],
         [-1.23195149, 0.15427291],
         [0.5220963, 0.11349303],
         [1.10915364, 0.05718352],
         [0.59091459, 0.40692742]],
    ])
    expected_y = np.array([
        [0, 0, 1, 1, 2, 2],
        [0, 0, 1, 1, 2, 2],
        [0, 0, 1, 1, 2, 2],
    ])
    expected_idx = np.array([
        [5, 9, 4, 0, 2, 3],
        [5, 9, 8, 6, 3, 2],
        [5, 9, 8, 0, 2, 1],
    ])

    assert_array_equal(X_resampled, expected_X)
    assert_array_equal(y_resampled, expected_y)
    assert_array_equal(idx_under, expected_idx)
def ezensemble(X_train, y_train):
    """Resample with EasyEnsemble and return the subset at index 1 as DataFrames.

    Parameters
    ----------
    X_train : training features. ``list(X_train)`` supplies the column names
        of the returned frame (presumably a pandas DataFrame; verify against
        the caller).
    y_train : training labels.

    Returns
    -------
    (X_resampled, y_resampled) : two DataFrames built from subset 1 of the ten
        generated subsets; the label frame has a single ``'Target'`` column.
    """
    feature_names = list(X_train)
    ee = EasyEnsemble(random_state=0, n_subsets=10)
    # fit_sample() fits the sampler itself, so the separate fit() call that
    # preceded it only duplicated work and has been dropped.
    X_subsets, y_subsets = ee.fit_sample(X_train, y_train)
    X_resampled = pd.DataFrame(X_subsets[1], columns=feature_names)
    y_resampled = pd.DataFrame(y_subsets[1], columns=['Target'])
    return X_resampled, y_resampled
def test_random_state_none():
    """Smoke test: 'auto' ratio resampling runs with ``random_state=None``."""
    sampler = EasyEnsemble(ratio='auto', random_state=None)
    # Only successful completion (and a two-value return) is checked.
    X_out, y_out = sampler.fit_sample(X, Y)
def easy_ensemble(train_set, train_label):
    """Resample the training data with EasyEnsemble, without replacement.

    The number of subsets comes from the module-level ``easy_ensemble_num``.
    The selected indices are requested from the sampler but not returned.

    Returns
    -------
    (X_resampled, y_resampled) as produced by ``EasyEnsemble.fit_sample``.
    """
    sampler_options = dict(
        ratio='auto',
        return_indices=True,
        random_state=None,
        replacement=False,
        n_subsets=easy_ensemble_num,
    )
    sampler = EasyEnsemble(**sampler_options)
    features, labels, _selected = sampler.fit_sample(train_set, train_label)
    return features, labels
def test_random_state_none():
    """Smoke test: ``sampling_strategy='auto'`` works with ``random_state=None``."""
    sampler = EasyEnsemble(sampling_strategy='auto', random_state=None)
    # Only successful completion (and a two-value return) is checked.
    X_out, y_out = sampler.fit_sample(X, Y)
def EasySample(data):
    """Resample ``data`` with a default EasyEnsemble and print the result shapes.

    The first two columns of ``data`` are used as features and the
    second-to-last column as the label. Nothing is returned.
    """
    features = data.iloc[:, 0:2]
    target = data.iloc[:, -2]
    # Handle the imbalanced samples with the EasyEnsemble ensemble method.
    resampled_x, resampled_y = EasyEnsemble().fit_sample(features, target)
    # Print an overview of the resampled feature set, then of the label set.
    for resampled in (resampled_x, resampled_y):
        print(resampled.shape)
def test_random_state_none():
    """Test that resampling goes through when the random state is None."""
    sampler = EasyEnsemble(ratio='auto', random_state=None)
    # Only successful completion (and a two-value return) is checked.
    X_out, y_out = sampler.fit_sample(X, Y)
def test_random_state_none():
    """Test that resampling with ratio 0.5 goes through when the random state is None."""
    sampler = EasyEnsemble(ratio=0.5, random_state=None)
    # Only successful completion (and a two-value return) is checked.
    X_out, y_out = sampler.fit_sample(X, Y)
def fit(self, train_x, train_y):
    """Fit one clone of the base classifier on each EasyEnsemble subset.

    ``self._no_of_estimators`` subsets are drawn with replacement. Any
    previously stored estimators are discarded before sampling.

    Returns
    -------
    self
    """
    self._estimators = []
    sampler = EasyEnsemble(replacement=True,
                           n_subsets=self._no_of_estimators)
    subsets_X, subsets_y = sampler.fit_sample(train_x, train_y)

    for subset_no in range(self._no_of_estimators):
        subset_X = subsets_X[subset_no, :, :]
        subset_y = subsets_y[subset_no, :]
        model = clone(self._base_classifier)
        model.fit(subset_X, subset_y)
        self._estimators.append(model)
    return self
def easy_ensemble_classifier(clf, x_train, y_train, x_test, nsubs, repl):
    """Average the predictions of ``clf`` refitted on each EasyEnsemble subset.

    Parameters
    ----------
    clf : classifier exposing ``fit`` and ``predict``. The same instance is
        refitted on every subset, so the caller's object ends up fitted on
        the last subset.
    x_train, y_train : training data to resample.
    x_test : samples to predict.
    nsubs : number of subsets, also the divisor of the summed predictions.
    repl : passed to EasyEnsemble as ``replacement``.

    Returns
    -------
    Array of shape (1, n_test_samples) holding the summed predictions
    divided by ``nsubs``.
    """
    sampler = EasyEnsemble(n_subsets=nsubs, replacement=repl)
    subsets_X, subsets_y = sampler.fit_sample(x_train, y_train)

    prediction_sum = np.zeros([1, np.shape(x_test)[0]])
    # Refit on each sub-sample and accumulate its predictions.
    for subset_no, subset_X in enumerate(subsets_X):
        clf.fit(subset_X, subsets_y[subset_no])
        prediction_sum = np.add(prediction_sum, clf.predict(x_test))
    return np.divide(prediction_sum, nsubs)
def get_downsampling_data(train_pth="data/train_data.npy",
                          val_pth="data/val_data.npy",
                          test_pth="data/test_data.npy"):
    """Load the train/val/test arrays and down-sample the training split.

    Each ``.npy`` file holds a 2-D array whose last column is the label and
    whose other columns are the features. The training split is resampled
    with ``EasyEnsemble(random_state=0, n_subsets=10)``. Validation and test
    splits are returned as loaded.

    Returns
    -------
    (train_data, train_flag, val_data, val_flag, test_data, test_flag), with
    all three label arrays cast to ``int``.
    """

    def _load_split(path):
        """Load one file once and split it into (features, labels)."""
        table = np.load(path)
        return table[:, :-1], table[:, -1]

    train_data, train_flag = _load_split(train_pth)
    ee = EasyEnsemble(random_state=0, n_subsets=10)
    train_data, train_flag = ee.fit_sample(train_data, train_flag)
    # `np.int` was an alias of the builtin `int` and was removed in
    # NumPy 1.24; the builtin yields the same dtype on every NumPy version.
    train_flag = np.array(train_flag, dtype=int)

    val_data, val_flag = _load_split(val_pth)
    val_flag = np.array(val_flag, dtype=int)

    test_data, test_flag = _load_split(test_pth)
    test_flag = np.array(test_flag, dtype=int)

    return train_data, train_flag, val_data, val_flag, test_data, test_flag
def test_multiclass_fit_sample():
    """Check that resampling a three-class target yields 400 samples per class."""
    # Turn the target into a multiclass one by relabelling the first 1000 entries.
    multiclass_y = Y.copy()
    multiclass_y[0:1000] = 2

    sampler = EasyEnsemble(random_state=RND_SEED)
    X_resampled, y_resampled = sampler.fit_sample(X, multiclass_y)

    # Class counts are inspected in the first subset only.
    first_subset_counts = Counter(y_resampled[0])
    for label in (0, 1, 2):
        assert_equal(first_subset_counts[label], 400)
def test_fit_sample_half():
    """Compare resampling at ratio 0.5 with the stored reference arrays."""
    sampler = EasyEnsemble(ratio=0.5, random_state=RND_SEED)
    X_resampled, y_resampled = sampler.fit_sample(X, Y)

    # Reference arrays live in the 'data' directory next to this file.
    data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'data')
    expected_X = np.load(os.path.join(data_dir, 'ee_x_05.npy'))
    expected_y = np.load(os.path.join(data_dir, 'ee_y_05.npy'))

    assert_array_equal(X_resampled, expected_X)
    assert_array_equal(y_resampled, expected_y)
def EnsembleSample(X, Y, method='EasyEnsemble', random_state=42):
    """Resample ``(X, Y)`` with an ensemble sampler from imbalanced-learn.

    Parameters
    ----------
    X : array of features. A 1-D array (``X.size == len(X)``) is reshaped to
        a single column first.
    Y : labels.
    method : ``'EasyEnsemble'`` or ``'BalanceCascade'``.
    random_state : seed forwarded to the sampler.

    Returns
    -------
    (X_resampled, Y_resampled) as returned by the sampler's ``fit_sample``.

    Raises
    ------
    ValueError
        If ``method`` is not one of the supported names. Previously this fell
        through to an unbound ``sampler`` variable.
    """
    if method not in ('EasyEnsemble', 'BalanceCascade'):
        raise ValueError(
            "method must be 'EasyEnsemble' or 'BalanceCascade', got %r"
            % (method,))

    if X.size == len(X):
        X = X.reshape(-1, 1)

    # Strings are compared with `==`: `is` tests object identity, which only
    # succeeds when the interpreter happens to reuse the same string object.
    if method == 'EasyEnsemble':
        sampler = EasyEnsemble(ratio='auto', random_state=random_state,
                               replacement=False, n_subsets=10)
    else:
        sampler = BalanceCascade(ratio='auto', random_state=random_state,
                                 n_max_subset=None, classifier=None,
                                 estimator=None)
    X_resampled, Y_resampled = sampler.fit_sample(X, Y)
    # Alternative not used here: BalancedBaggingClassifier combines sampling
    # with a classifier and returns the classifier rather than resampled data.
    return X_resampled, Y_resampled
def resample_data(train_feature, train_class, count_sampleset):
    """Resample with EasyEnsemble using per-class target counts.

    Each class's target count is its observed count multiplied by a fixed
    per-class factor and truncated to an int. A class label missing from the
    factor table raises ``KeyError``.

    Parameters
    ----------
    train_feature, train_class : features and labels to resample.
    count_sampleset : number of subsets to generate.

    Returns
    -------
    The result of ``EasyEnsemble.fit_sample``.
    """
    class_factor = {
        0: 1.0, 1: 0.1, 2: 0.1,
        3: 1.0, 4: 1.0, 5: 0.1,
        6: 1.0, 7: 0.5, 8: 0.1,
    }
    observed = collections.Counter(train_class)
    target_counts = collections.Counter({
        label: int(count * class_factor[label])
        for label, count in observed.items()
    })
    sampler = EasyEnsemble(ratio=target_counts, n_subsets=count_sampleset)
    return sampler.fit_sample(train_feature, train_class)
def test_fit_sample_auto():
    """Compare 'auto' ratio resampling, with indices, to the stored reference arrays."""
    sampler = EasyEnsemble(ratio='auto', random_state=RND_SEED,
                           return_indices=True)
    X_resampled, y_resampled, idx_under = sampler.fit_sample(X, Y)

    # Reference arrays live in the 'data' directory next to this file.
    data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'data')
    expected_X = np.load(os.path.join(data_dir, 'ee_x.npy'))
    expected_y = np.load(os.path.join(data_dir, 'ee_y.npy'))
    expected_idx = np.load(os.path.join(data_dir, 'ee_idx.npy'))

    assert_array_equal(X_resampled, expected_X)
    assert_array_equal(y_resampled, expected_y)
    assert_array_equal(idx_under, expected_idx)
class BagClassifier(object):
    """Bagging-style ensemble trained on EasyEnsemble subsets.

    One clone of ``base_model`` is fitted per balanced subset, and
    ``predict_proba`` averages the positive-class probabilities of all
    fitted clones.
    """

    def __init__(self, base_model, n_subsets):
        """Store the base model and build the subset sampler.

        Parameters
        ----------
        base_model : estimator to clone for each subset.
        n_subsets : number of balanced subsets, and therefore of models.
        """
        self.base_model = base_model
        self.n_subsets = n_subsets
        # The sampler previously always produced 4 subsets regardless of
        # `n_subsets`, so fit() raised IndexError for n_subsets > 4 and
        # wasted subsets for n_subsets < 4.
        self.easy_ensemble = EasyEnsemble('auto', random_state=RAND_SEED,
                                          n_subsets=n_subsets)
        self.trained_based_models = []

    def fit(self, X, y):
        """Fit one clone of the base model on each resampled subset."""
        X_s, y_s = self.easy_ensemble.fit_sample(X, y)
        # Start from an empty list so a second fit() replaces the earlier
        # models instead of accumulating them.
        self.trained_based_models = []
        # `range` works on both Python 2 and 3; `xrange` exists only on 2.
        for idx in range(self.n_subsets):
            clone_model = clone(self.base_model)
            clone_model.fit(X_s[idx], y_s[idx])
            self.trained_based_models.append(clone_model)

    def predict_proba(self, X):
        """Return the mean over models of column 1 of each ``predict_proba``.

        The result is a 1-D array with one value per row of ``X``.
        """
        S_test = np.zeros((X.shape[0], len(self.trained_based_models)))
        for idx, clf in enumerate(self.trained_based_models):
            S_test[:, idx] = clf.predict_proba(X)[:, 1]
        return S_test.mean(1)
def update_initial_train(iter_sampling, under_sampling, smote,
                         unmodified_train_X, unmodified_train_y, num_subsets):
    """Build the initial training set with the selected resampling strategy.

    The flags are checked in priority order: ``iter_sampling``
    (RandomOverSampler), ``under_sampling`` (EasyEnsemble with replacement,
    ``num_subsets`` subsets), then ``smote`` (SMOTE with ``k_neighbors=3``).
    When none is set, deep copies of the unmodified data are returned.

    Parameters
    ----------
    iter_sampling, under_sampling, smote : strategy flags, tested for
        truthiness.
    unmodified_train_X, unmodified_train_y : original training data.
    num_subsets : number of subsets for the under-sampling branch.

    Returns
    -------
    (initial_X_train, initial_y_train)
    """
    if iter_sampling:
        # Single-argument print call: valid on both Python 2 and Python 3.
        print("Oversampling in the active iteration list")
        ros = RandomOverSampler()
        initial_X_train, initial_y_train = ros.fit_sample(
            unmodified_train_X, unmodified_train_y)
    elif under_sampling:
        ee = EasyEnsemble(return_indices=True, replacement=True,
                          n_subsets=num_subsets)
        # The selected indices are returned by the sampler but not needed.
        initial_X_train, initial_y_train, _indices = ee.fit_sample(
            unmodified_train_X, unmodified_train_y)
    elif smote:
        sm = SMOTE(k_neighbors=3)
        initial_X_train, initial_y_train = sm.fit_sample(
            unmodified_train_X, unmodified_train_y)
    else:
        # No resampling: return independent copies so the caller's originals
        # cannot be mutated through the result.
        initial_X_train = copy.deepcopy(unmodified_train_X)
        initial_y_train = copy.deepcopy(unmodified_train_y)
    return initial_X_train, initial_y_train
# Resample with the previously created `smote_enn` sampler and print the
# sorted per-class counts of the result.
X_resampled, y_resampled = smote_enn.fit_sample(X, y)
print(sorted(Counter(y_resampled).items()))

# Same procedure with SMOTETomek.
from imblearn.combine import SMOTETomek
smote_tomek = SMOTETomek(random_state=0)
X_resampled, y_resampled = smote_tomek.fit_sample(X, y)
print(sorted(Counter(y_resampled).items()))

# The bare string below is a note kept as a no-op expression. In English:
# "Ensemble example. An imbalanced data set can be balanced through several
# balanced subsets; the imblearn.ensemble module provides this. EasyEnsemble
# builds the ensemble by randomly under-sampling the original data set."
''' Ensemble的例子 一个不均衡的数据集能够通过多个均衡的子集来实现均衡, imblearn.ensemble模块能实现上述功能. EasyEnsemble通过对原始的数据集进行随机下采样实现对数据集进行集成. '''
from imblearn.ensemble import EasyEnsemble
ee = EasyEnsemble(random_state=0, n_subsets=10)
X_resampled, y_resampled = ee.fit_sample(X, y)
# The result is indexed per subset: print its overall shape, then the class
# counts of the first subset.
print(X_resampled.shape)
print(sorted(Counter(y_resampled[0]).items()))
# EasyEnsemble has two important parameters: (i) n_subsets controls the number
# of subsets; (ii) replacement selects random sampling with or without
# replacement.

# Another no-op note string. In English: "BalanceCascade uses a classifier
# (the estimator parameter) to make sure misclassified samples can be sampled
# again when the next subset is selected. Likewise, n_max_subset controls the
# number of subsets, and bootstrap=True enables bootstrapping."
''' BalanceCascade(级联平衡)的方法通过使用分类器(estimator参数)来确保那些被错分类的样本在下一次进行子集选取的时候也能被采样到. 同样,n_max_subset参数控制子集的个数,以及可以通过设置bootstrap=True来使用bootstraping(自助法). '''
from imblearn.ensemble import BalanceCascade
from sklearn.linear_model import LogisticRegression
bc = BalanceCascade(random_state=0, estimator=LogisticRegression(random_state=0), n_max_subset=4)
X_resampled, y_resampled = bc.fit_sample(X, y)
def data_imbalance_handle():
    """Demonstrate several ways of handling class imbalance on ``data2.txt``.

    The file is read as a space-separated table with five feature columns
    and a ``label`` column. The per-label row counts are printed for the
    original data, after SMOTE over-sampling, after random under-sampling,
    and for one EasyEnsemble subset. An SVC with balanced class weights is
    fitted on the original data as a further illustration.

    Returns
    -------
    None. Results are printed.
    """
    feature_cols = ['col1', 'col2', 'col3', 'col4', 'col5']
    # Single-argument print calls are valid on both Python 2 and Python 3;
    # the original used Python 2 print statements for the separator.
    separator = "=" * 20

    def _label_counts(features, labels):
        """Return per-label row counts for a (features, labels) pair."""
        frame = pd.concat(
            [pd.DataFrame(features, columns=feature_cols),
             pd.DataFrame(labels, columns=['label'])],
            axis=1)  # merge the two frames column-wise
        return frame.groupby('label').count()

    # Load the data file and split it into inputs x and labels y.
    df = pd.read_table('data2.txt', sep=' ',
                       names=['col1', 'col2', 'col3', 'col4', 'col5', 'label'])
    x = df.iloc[:, :-1]
    y = df.iloc[:, -1]
    groupby_data_orgianl = df.groupby('label').count()
    print(groupby_data_orgianl)  # class distribution of the original data
    print(separator)

    # Over-sampling with SMOTE.
    model_smote = SMOTE()
    x_smote_resampled, y_smote_resampled = model_smote.fit_sample(x, y)
    print(_label_counts(x_smote_resampled, y_smote_resampled))
    print(separator)

    # Under-sampling with RandomUnderSampler.
    model_RandomUnderSampler = RandomUnderSampler()
    x_under_resampled, y_under_resampled = model_RandomUnderSampler.fit_sample(
        x, y)
    print(_label_counts(x_under_resampled, y_under_resampled))
    print(separator)

    # Handle the imbalance through SVM class weights instead of resampling.
    model_svm = SVC(class_weight='balanced')
    model_svm.fit(x, y)

    # Handle the imbalance with the EasyEnsemble ensemble method.
    model_EasyEnsemble = EasyEnsemble()
    x_EasyEnsemble_resampled, y_EasyEnsemble_resampled = \
        model_EasyEnsemble.fit_sample(x, y)
    print(x_EasyEnsemble_resampled.shape)  # overview of resampled features
    print(y_EasyEnsemble_resampled.shape)  # overview of resampled labels
    print(separator)

    # Inspect a single subset of the ensemble output.
    index_num = 1  # index of the subset to inspect
    print(_label_counts(x_EasyEnsemble_resampled[index_num],
                        y_EasyEnsemble_resampled[index_num]))
#=============================STEP 2============================ # PCA optional # split original data into train test sets X_data = data_train.iloc[:, :-1].values y_data = data_train.iloc[:, -1].values # split original data into train test sets X_train_ori, X_test, y_train_ori, y_test = train_test_split( X_data, y_data, test_size=test_size, random_state=42) #===========================FOR TEST================================== #=========================EASY ENSEMBLE=============================== n_subsets = 50 ee = EasyEnsemble(random_state=42, n_subsets=n_subsets) X_train, y_train = ee.fit_sample(X_train_ori, y_train_ori) print("Num of each sets: %d" % y_train.shape[1]) clf_xgbs = [] y_preds = np.zeros((n_subsets, y_test.size)) for i in range(n_subsets): print("Round %3d" % (i)) X_train_i = X_train[i] y_train_i = y_train[i] clf_xgb_i = CLF_XGB(X_train_i, y_train_i, X_test, y_test) clf_xgbs.append(clf_xgb_i) clf_xgb_i.train_model(params, iter_num) #clf_xgb_i.get_feature_scores(features,i) y_pred_i = clf_xgb_i.predict(threshold=CLF_THRESHOLD) y_preds[i] = y_pred_i
def easyemsemble(data, label, n_subsets=0):
    """Resample with EasyEnsemble and flatten all subsets into 2-D arrays.

    Parameters
    ----------
    data, label : features and labels to resample.
    n_subsets : number of subsets passed to EasyEnsemble.

    Returns
    -------
    (data_resampled, label_resampled) : the resampled features reshaped to
        ``(-1, 12)`` and the resampled labels reshaped to ``(-1, 1)``.
        NOTE(review): the hard-coded 12 presumably equals the feature count
        of ``data``; confirm against callers.
    """
    sampler = EasyEnsemble(n_subsets=n_subsets)
    subsets_X, subsets_y = sampler.fit_sample(data, label)
    flat_X = subsets_X.reshape(-1, 12)
    flat_y = subsets_y.reshape(-1, 1)
    return flat_X, flat_y
print(__doc__) # Generate the dataset X, y = make_classification(n_classes=2, class_sep=2, weights=[0.3, 0.7], n_informative=3, n_redundant=1, flip_y=0, n_features=20, n_clusters_per_class=1, n_samples=100, random_state=10) # Instanciate a PCA object for the sake of easy visualisation pca = PCA(n_components=2) # Fit and transform x to visualise inside a 2D feature space X_vis = pca.fit_transform(X) # Apply Easy Ensemble ee = EasyEnsemble(n_subsets=3) X_resampled, y_resampled = ee.fit_sample(X, y) X_res_vis = [] for X_res in X_resampled: X_res_vis.append(pca.transform(X_res)) # Two subplots, unpack the axes array immediately f, (ax1, ax2) = plt.subplots(1, 2) ax1.scatter(X_vis[y == 0, 0], X_vis[y == 0, 1], label="Class #0", alpha=0.5) ax1.scatter(X_vis[y == 1, 0], X_vis[y == 1, 1], label="Class #1", alpha=0.5) ax1.set_title('Original set') ax2.scatter(X_vis[y == 0, 0], X_vis[y == 0, 1], label="Class #0", alpha=0.5) for iy, e in enumerate(X_res_vis): ax2.scatter(e[y_resampled[iy] == 1, 0], e[y_resampled[iy] == 1, 1], label="Class #1 - set #{}".format(iy), alpha=0.5)
grid_predictions = grid.predict(X_test) print(confusion_matrix(y_test,grid_predictions)) print(classification_report(y_test,grid_predictions)) from sklearn.metrics import accuracy_score print( accuracy_score(y_test, grid_predictions) ) print( grid.best_params_) # # EasyEnsemble # In[ ]: from imblearn.ensemble import EasyEnsemble ee = EasyEnsemble(random_state=42) X_res, y_res = ee.fit_sample(X_train, y_train) # In[ ]: ''' #p = np.c_[X_res,y_res] #d = pd.DataFrame(p, columns = ['WeekOfMonth', 'WeekOfMonthClaimed', 'Age', 'PolicyNumber', 'RepNumber', 'Deductible', 'DriverRating', 'Year', 'Month', 'DayOfWeek', 'Make', 'AccidentArea', 'DayOfWeekClaimed', 'MonthClaimed', 'Sex', 'MaritalStatus', 'Fault', 'PolicyType', 'VehicleCategory', 'VehiclePrice', 'Days_Policy_Accident', 'Days_Policy_Claim', 'PastNumberOfClaims', 'AgeOfVehicle', 'AgeOfPolicyHolder', 'PoliceReportFiled', 'WitnessPresent', 'AgentType', 'NumberOfSuppliments', 'AddressChange_Claim', 'NumberOfCars', 'BasePolicy', 'FraudFound_P'])
# 使用RandomUnderSampler方法进行欠抽样处理 model_RandomUnderSampler = RandomUnderSampler() # 建立RandomUnderSampler模型对象 x_RandomUnderSampler_resampled, y_RandomUnderSampler_resampled = model_RandomUnderSampler.fit_sample(x, y) # 输入数据并作欠抽样处理 x_RandomUnderSampler_resampled = pd.DataFrame(x_RandomUnderSampler_resampled, columns=['col1', 'col2', 'col3', 'col4', 'col5']) # 将数据转换为数据框并命名列名 y_RandomUnderSampler_resampled = pd.DataFrame(y_RandomUnderSampler_resampled, columns=['label']) # 将数据转换为数据框并命名列名 RandomUnderSampler_resampled = pd.concat([x_RandomUnderSampler_resampled, y_RandomUnderSampler_resampled], axis=1) # 按列合并数据框 groupby_data_RandomUnderSampler = RandomUnderSampler_resampled.groupby('label').count() # 对label做分类汇总 print (groupby_data_RandomUnderSampler) # 打印输出经过RandomUnderSampler处理后的数据集样本分类分布 # 使用SVM的权重调节处理不均衡样本 model_svm = SVC(class_weight='balanced') # 创建SVC模型对象并指定类别权重 model_svm.fit(x, y) # 输入x和y并训练模型 # 使用集成方法EasyEnsemble处理不均衡样本 model_EasyEnsemble = EasyEnsemble() # 建立EasyEnsemble模型对象 x_EasyEnsemble_resampled, y_EasyEnsemble_resampled = model_EasyEnsemble.fit_sample(x, y) # 输入数据并应用集成方法处理 print (x_EasyEnsemble_resampled.shape) # 打印输出集成方法处理后的x样本集概况 print (y_EasyEnsemble_resampled.shape) # 打印输出集成方法处理后的y标签集概况 # 抽取其中一份数据做审查 index_num = 1 # 设置抽样样本集索引 x_EasyEnsemble_resampled_t = pd.DataFrame(x_EasyEnsemble_resampled[index_num], columns=['col1', 'col2', 'col3', 'col4', 'col5']) # 将数据转换为数据框并命名列名 y_EasyEnsemble_resampled_t = pd.DataFrame(y_EasyEnsemble_resampled[index_num], columns=['label']) # 将数据转换为数据框并命名列名 EasyEnsemble_resampled = pd.concat([x_EasyEnsemble_resampled_t, y_EasyEnsemble_resampled_t], axis=1) # 按列合并数据框 groupby_data_EasyEnsemble = EasyEnsemble_resampled.groupby('label').count() # 对label做分类汇总 print (groupby_data_EasyEnsemble) # 打印输出经过EasyEnsemble处理后的数据集样本分类分布
print(smote_resample.groupby('label').count())

# Under-sampling with RandomUnderSampler.
model_under = RandomUnderSampler()
x_under, y_under = model_under.fit_sample(x, y)
x_under_frame = pd.DataFrame(x_under,
                             columns=['col1', 'col2', 'col3', 'col4', 'col5'])
y_under_frame = pd.DataFrame(y_under, columns=['label'])
under_redample = pd.concat((x_under_frame, y_under_frame), axis=1)
print(under_redample.groupby('label').count())

# SVC with the class weights set to 'balanced'.
model_svc = SVC(class_weight='balanced')
k = model_svc.fit(x, y)
print(model_svc.score(x, y))
print('svc', k)

# Ensemble method: EasyEnsemble.
model_ensemble = EasyEnsemble()
x_ensemble, y_ensemble = model_ensemble.fit_sample(x, y)
print(x_ensemble.shape, y_ensemble.shape)
# Per the original author's note, the output is split into 10 subsets of
# 58 * 2 samples each (majority : minority = 1 : 1), i.e. shapes
# (10, 116, 5) and (10, 116).
index_num = 1  # index along the first (subset) dimension
# Extract a single subset.
x_ensemble_frame = pd.DataFrame(
    x_ensemble[index_num], columns=['col1', 'col2', 'col3', 'col4', 'col5'])
y_ensemble_frame = pd.DataFrame(y_ensemble[index_num], columns=['label'])
# Pair the EasyEnsemble features with the EasyEnsemble labels. The original
# concatenated `x_under_frame` (the RandomUnderSampler features) here, mixing
# features and labels from two different resamplings.
ensemble_frame = pd.concat((x_ensemble_frame, y_ensemble_frame), axis=1)
print(ensemble_frame.groupby('label').count())
print('Original train dataset shape {}'.format(Counter(y_train))) print('Original test dataset shape {}'.format(Counter(y_test))) sample_methods = ['no_sample','easy_ensemble','cluster_centroids','edit_nearest_neribours',\ 'near_miss','adasyn','smote','smoteenn'] for sample_method in sample_methods: print sample_method, '****************************************************************************' if sample_method == 'easy_ensemble': ee = EasyEnsemble(ratio='auto', return_indices=True, random_state=None, replacement=False, n_subsets=6) X_resampled, y_resampled, idx_resampled = ee.fit_sample( X_train, y_train) p_list = [] for idx, x in enumerate(X_resampled): print('Resampled dataset shape {}'.format(Counter( y_resampled[idx]))) svc = LinearSVC(random_state=RANDOM_STATE).fit( X_resampled[idx], y_resampled[idx]) p = svc.predict(X_test) print p.shape p_list.append(p) p_list = np.array(p_list) from scipy.stats import mode p_final = mode(p_list)[0][0] accu = np.sum(p_final == y_test) / float(X_test.shape[0]) else: if sample_method == 'cluster_centroids':
# Apply the project's outlier replacement to two more columns of df_train.
replaceOutliers(df_train['content_cnt'], df_train)
replaceOutliers(df_train['punish_rate'], df_train)
########################################## separate x and y and store them in different data frames
x_train = df_train.loc[:, [
    'people_count', 'user_level', 'fans_count', 'money', 'live_count',
    'diamond', 'content_cnt', 'punish_rate'
]]
y_train = df_train.loc[:, ['status']]
print(x_train.head())
print(y_train.head())

# Draw 10 subsets with replacement using EasyEnsemble.
from imblearn.ensemble import EasyEnsemble
ee = EasyEnsemble(random_state=0, n_subsets=10, replacement=True)
x_ee, y_ee = ee.fit_sample(x_train, y_train)
print(x_ee.shape)

# Rebuild labelled DataFrames for the first two subsets.
dfee0 = pd.DataFrame(x_ee[0], columns=[
    'people_count', 'user_level', 'fans_count', 'money', 'live_count',
    'diamond', 'content_cnt', 'punish_rate'
])
dfee0['status'] = y_ee[0]
dfee1 = pd.DataFrame(x_ee[1], columns=[
    'people_count', 'user_level', 'fans_count', 'money', 'live_count',
    'diamond', 'content_cnt', 'punish_rate'
])
dfee1['status'] = y_ee[1]
def easyEnsemble(self, random_state=42):
    """Resample ``self.x`` / ``self.y`` with EasyEnsemble.

    Parameters
    ----------
    random_state : seed forwarded to the sampler.

    Returns
    -------
    (x_res, y_res) as produced by ``EasyEnsemble.fit_sample``.
    """
    sampler = EasyEnsemble(random_state=random_state)
    resampled_x, resampled_y = sampler.fit_sample(self.x, self.y)
    return resampled_x, resampled_y
print "Oversampling in the active iteration list" ros = RandomOverSampler() initial_X_train = None initial_y_train = None initial_X_train, initial_y_train = ros.fit_sample( unmodified_train_X, unmodified_train_y) else: if under_sampling == True: print "Doing undersampling" # rus = RandomUnderSampler(return_indices=True, replacement=True) rus = EasyEnsemble(return_indices=True, replacement=True, n_subsets=num_subsets) initial_X_train = None initial_y_train = None initial_X_train, initial_y_train, indices = rus.fit_sample( unmodified_train_X, unmodified_train_y) # print(indices) # print(initial_y_train) else: initial_X_train[:] = [] initial_y_train[:] = [] initial_X_train = copy.deepcopy( unmodified_train_X) initial_y_train = copy.deepcopy( unmodified_train_y) loopCounter = loopCounter + 1 print "Fininshed loop", len(initial_X_train) y_pred_all = {}