def test_sample_wrong_X():
    """Test that an error is raised when X differs between fit and sample."""
    # Create the object
    cc = ClusterCentroids(random_state=RND_SEED)
    cc.fit(X, Y)
    assert_raises(RuntimeError, cc.sample,
                  np.random.random((100, 40)),
                  np.array([0] * 50 + [1] * 50))
def test_multiclass_fit_sample():
    y = Y.copy()
    y[5] = 2
    y[6] = 2
    cc = ClusterCentroids(random_state=RND_SEED)
    X_resampled, y_resampled = cc.fit_sample(X, y)
    count_y_res = Counter(y_resampled)
    assert count_y_res[0] == 2
    assert count_y_res[1] == 2
    assert count_y_res[2] == 2
def test_fit_resample_auto():
    sampling_strategy = 'auto'
    cc = ClusterCentroids(
        sampling_strategy=sampling_strategy, random_state=RND_SEED)
    X_resampled, y_resampled = cc.fit_resample(X, Y)
    X_gt = np.array([[0.92923648, 0.76103773],
                     [0.47104475, 0.44386323],
                     [0.13347175, 0.12167502],
                     [0.06738818, -0.529627],
                     [0.17901516, 0.69860992],
                     [0.094035, -2.55298982]])
    y_gt = np.array([0, 0, 0, 1, 1, 1])
    assert_allclose(X_resampled, X_gt, rtol=R_TOL)
    assert_array_equal(y_resampled, y_gt)
def test_fit_resample_half():
    sampling_strategy = {0: 3, 1: 6}
    cc = ClusterCentroids(
        sampling_strategy=sampling_strategy, random_state=RND_SEED)
    X_resampled, y_resampled = cc.fit_resample(X, Y)
    X_gt = np.array([[0.92923648, 0.76103773],
                     [0.13347175, 0.12167502],
                     [0.47104475, 0.44386323],
                     [0.09125309, -0.85409574],
                     [0.19220316, 0.32337101],
                     [0.094035, -2.55298982],
                     [0.20792588, 1.49407907],
                     [0.04352327, -0.20515826],
                     [0.12372842, 0.6536186]])
    y_gt = np.array([0, 0, 0, 1, 1, 1, 1, 1, 1])
    assert_allclose(X_resampled, X_gt, rtol=R_TOL)
    assert_array_equal(y_resampled, y_gt)
def test_fit_sample_object():
    ratio = 'auto'
    cluster = KMeans(random_state=RND_SEED)
    cc = ClusterCentroids(
        ratio=ratio, random_state=RND_SEED, estimator=cluster)
    X_resampled, y_resampled = cc.fit_sample(X, Y)
    X_gt = np.array([[0.92923648, 0.76103773],
                     [0.47104475, 0.44386323],
                     [0.13347175, 0.12167502],
                     [0.06738818, -0.529627],
                     [0.17901516, 0.69860992],
                     [0.094035, -2.55298982]])
    y_gt = np.array([0, 0, 0, 1, 1, 1])
    assert_allclose(X_resampled, X_gt, rtol=R_TOL)
    assert_array_equal(y_resampled, y_gt)
def test_multiclass_fit_sample():
    """Test the fit_sample method with a multiclass target."""
    # Make y multiclass
    y = Y.copy()
    y[0:1000] = 2
    # Resample the data
    cc = ClusterCentroids(random_state=RND_SEED)
    X_resampled, y_resampled = cc.fit_sample(X, y)
    # Check the per-class counts in the resampled y
    count_y_res = Counter(y_resampled)
    assert_equal(count_y_res[0], 400)
    assert_equal(count_y_res[1], 400)
    assert_equal(count_y_res[2], 400)
def test_cc_fit():
    """Test the fitting method."""
    # Define the parameter for the under-sampling
    ratio = 'auto'
    # Create the object
    cc = ClusterCentroids(ratio=ratio, random_state=RND_SEED)
    # Fit the data
    cc.fit(X, Y)
    # Check that the class statistics have been computed
    assert_equal(cc.min_c_, 0)
    assert_equal(cc.maj_c_, 1)
    assert_equal(cc.stats_c_[0], 500)
    assert_equal(cc.stats_c_[1], 4500)
def test_fit_sample_half():
    """Test fit and sample routines with a ratio of 0.5."""
    # Define the parameter for the under-sampling
    ratio = .5
    # Create the object
    cc = ClusterCentroids(ratio=ratio, random_state=RND_SEED)
    # Fit and sample
    X_resampled, y_resampled = cc.fit_sample(X, Y)
    currdir = os.path.dirname(os.path.abspath(__file__))
    X_gt = np.load(os.path.join(currdir, 'data', 'cc_x_05.npy'))
    y_gt = np.load(os.path.join(currdir, 'data', 'cc_y_05.npy'))
    assert_array_equal(X_resampled, X_gt)
    assert_array_equal(y_resampled, y_gt)
def test_fit_sample_auto():
    """Test fit and sample routines with the auto ratio."""
    # Define the parameter for the under-sampling
    ratio = 'auto'
    # Create the object
    cc = ClusterCentroids(ratio=ratio, random_state=RND_SEED)
    # Fit and sample
    X_resampled, y_resampled = cc.fit_sample(X, Y)
    X_gt = np.array([[0.92923648, 0.76103773],
                     [0.47104475, 0.44386323],
                     [0.13347175, 0.12167502],
                     [0.06738818, -0.529627],
                     [0.17901516, 0.69860992],
                     [0.094035, -2.55298982]])
    y_gt = np.array([0, 0, 0, 1, 1, 1])
    assert_array_almost_equal(X_resampled, X_gt)
    assert_array_equal(y_resampled, y_gt)
def test_fit_hard_voting():
    sampling_strategy = 'auto'
    voting = 'hard'
    cluster = KMeans(random_state=RND_SEED)
    cc = ClusterCentroids(
        sampling_strategy=sampling_strategy, random_state=RND_SEED,
        estimator=cluster, voting=voting)
    X_resampled, y_resampled = cc.fit_resample(X, Y)
    X_gt = np.array([[0.92923648, 0.76103773],
                     [0.47104475, 0.44386323],
                     [0.13347175, 0.12167502],
                     [0.09125309, -0.85409574],
                     [0.12372842, 0.6536186],
                     [0.094035, -2.55298982]])
    y_gt = np.array([0, 0, 0, 1, 1, 1])
    assert_allclose(X_resampled, X_gt, rtol=R_TOL)
    assert_array_equal(y_resampled, y_gt)
    # with hard voting, every resampled point must be an original sample
    for x in X_resampled:
        assert np.any(np.all(x == X, axis=1))
def test_fit_sample_half():
    """Test fit and sample routines with a ratio of 0.5."""
    # Define the parameter for the under-sampling
    ratio = .5
    # Create the object
    cc = ClusterCentroids(ratio=ratio, random_state=RND_SEED)
    # Fit and sample
    X_resampled, y_resampled = cc.fit_sample(X, Y)
    X_gt = np.array([[0.92923648, 0.76103773],
                     [0.47104475, 0.44386323],
                     [0.13347175, 0.12167502],
                     [0.09125309, -0.85409574],
                     [0.19220316, 0.32337101],
                     [0.094035, -2.55298982],
                     [0.20792588, 1.49407907],
                     [0.04352327, -0.20515826],
                     [0.12372842, 0.6536186]])
    y_gt = np.array([0, 0, 0, 1, 1, 1, 1, 1, 1])
    assert_array_almost_equal(X_resampled, X_gt)
    assert_array_equal(y_resampled, y_gt)
def test_fit_sample_check_voting():
    cc = ClusterCentroids(random_state=RND_SEED)
    cc.fit_sample(X, Y)
    assert cc.voting_ == 'soft'
    cc = ClusterCentroids(random_state=RND_SEED)
    cc.fit_sample(sparse.csr_matrix(X), Y)
    assert cc.voting_ == 'hard'
def test_fit_sample_error():
    ratio = 'auto'
    cluster = 'rnd'
    cc = ClusterCentroids(
        ratio=ratio, random_state=RND_SEED, estimator=cluster)
    with raises(ValueError, match="has to be a KMeans clustering"):
        cc.fit_sample(X, Y)

    voting = 'unknown'
    cc = ClusterCentroids(ratio=ratio, voting=voting, random_state=RND_SEED)
    with raises(ValueError, match="needs to be one of"):
        cc.fit_sample(X, Y)
def resample_data(predictors, target, df_data, method):
    """Resample a training dataset prior to training models."""
    if method == 'adasyn':
        util = ADASYN()
    elif method == 'random-over-sampler':
        util = RandomOverSampler()
    elif method == 'smote':
        util = SMOTE(kind='borderline2')
    elif method == 'smote-tomek':
        util = SMOTETomek()
    elif method == 'smote-enn':
        util = SMOTEENN()
    elif method == 'edited-nn':
        util = EditedNearestNeighbours()
    elif method == 'repeated-edited-nn':
        util = RepeatedEditedNearestNeighbours()
    elif method == 'all-knn':
        util = AllKNN()
    elif method == 'one-sided-selection':
        util = OneSidedSelection()
    elif method == 'cluster-centroids':
        util = ClusterCentroids()
    elif method == 'random-under-sampler':
        util = RandomUnderSampler()
    elif method == 'neighbourhood-cleaning-rule':
        util = NeighbourhoodCleaningRule()
    elif method == 'condensed-nearest-neighbour':
        util = CondensedNearestNeighbour()
    elif method == 'near-miss':
        util = NearMiss(version=1)
    elif method == 'instance-hardness-threshold':
        util = InstanceHardnessThreshold()
    x_resampled, y_resampled = util.fit_sample(df_data[predictors],
                                               df_data[target])
    x_resampled = pd.DataFrame(x_resampled, columns=predictors)
    y_resampled = pd.DataFrame(y_resampled, columns=[target])
    return x_resampled, y_resampled
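# For context, a minimal invocation of the dispatcher above might look like
# the following. This is a hedged sketch: the DataFrame and the
# 'f1'/'f2'/'label' column names are illustrative assumptions, not part of
# the original code.
import pandas as pd

df_train = pd.DataFrame({
    'f1': [0.1, 0.2, 0.3, 0.4, 1.1, 1.2],
    'f2': [1.0, 0.9, 1.1, 1.2, 0.1, 0.2],
    'label': [0, 0, 0, 0, 1, 1],
})
x_bal, y_bal = resample_data(predictors=['f1', 'f2'], target='label',
                             df_data=df_train, method='cluster-centroids')
print(x_bal.shape, y_bal['label'].value_counts().to_dict())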
def createUnderOrOverSample(method, given_data, outputdata_filename,
                            max_len, codebook):
    dataX = []
    dataY = []
    for xx in given_data:
        dataX.append(xx[0:-1])
        dataY.append(xx[-1])
    X = pad_sequences(dataX, maxlen=max_len, dtype='float32')
    X_norm = X / (float(len(codebook)))
    y_norm = numpy.array(dataY)
    # perform over- or under-sampling
    if method == "over":
        sm = SMOTE(kind='borderline2')
    else:
        sm = ClusterCentroids()
    X_res, y_res = sm.fit_sample(X_norm, y_norm)
    X_d = X_res * (float(len(codebook)))
    writeSampledSequences(X_d, y_res, codebook, outputdata_filename)
def __init__(self):
    from imblearn.over_sampling import (SMOTE, ADASYN, SVMSMOTE,
                                        BorderlineSMOTE, RandomOverSampler)
    from imblearn.under_sampling import (ClusterCentroids, RandomUnderSampler,
                                         InstanceHardnessThreshold, NearMiss,
                                         TomekLinks, EditedNearestNeighbours,
                                         RepeatedEditedNearestNeighbours,
                                         AllKNN, OneSidedSelection,
                                         CondensedNearestNeighbour,
                                         NeighbourhoodCleaningRule)
    from imblearn.ensemble import (EasyEnsemble, EasyEnsembleClassifier,
                                   BalancedBaggingClassifier,
                                   BalancedRandomForestClassifier,
                                   BalanceCascade, RUSBoostClassifier)

    self.oversamplers = {
        'ADASYN': ADASYN(),
        'RandomOverSampler': RandomOverSampler(),
        'SMOTE': SMOTE(),
        'BorderlineSMOTE': BorderlineSMOTE(),
        'SVMSMOTE': SVMSMOTE()
    }
    self.undersamplers = {
        'ClusterCentroids': ClusterCentroids(),
        'RandomUnderSampler': RandomUnderSampler(),
        'InstanceHardnessThreshold': InstanceHardnessThreshold(),
        'NearMiss': NearMiss(),
        'TomekLinks': TomekLinks(),
        'EditedNearestNeighbours': EditedNearestNeighbours(),
        'RepeatedEditedNearestNeighbours': RepeatedEditedNearestNeighbours(),
        'AllKNN': AllKNN(),
        'OneSidedSelection': OneSidedSelection(),
        'CondensedNearestNeighbour': CondensedNearestNeighbour(),
        'NeighbourhoodCleaningRule': NeighbourhoodCleaningRule()
    }
    self.ensemblesamplers = {
        'EasyEnsemble': EasyEnsemble(),
        'EasyEnsembleClassifier': EasyEnsembleClassifier(),
        'BalancedBaggingClassifier': BalancedBaggingClassifier(),
        'BalanceCascade': BalanceCascade(),
        # instantiated for consistency with the other entries
        'BalancedRandomForestClassifier': BalancedRandomForestClassifier(),
        'RUSBoostClassifier': RUSBoostClassifier()
    }
class ClassRebalancer(DataStorer):
    '''
    Rebalances the classes according to the sampling_strategy and
    balance_method chosen at initialization. This helps reduce the volume of
    data to compute without losing information about the decision boundary.
    '''

    def __init__(self, balance_method_name='undersample_centroid',
                 sampling_strategy=0.5, *args, **kwargs):
        '''
        Store balance_method_name (so it is easy to check which method was
        used), rebalance the classes, and initialize DataStorer with the data
        contained in *args and **kwargs.

        :param balance_method_name: str
            name of the rebalance method (imblearn)
        :param sampling_strategy: float or str ('auto')
            desired class-balance ratio after rebalancing
        :param args: tuple
            should contain X, y for initializing the DataStorer class
        :param kwargs: dict
            should contain X, y for initializing the DataStorer class
        '''
        self.balance_method_name = balance_method_name
        # init DataStorer here for potential further use in rebalance_classes
        super().__init__(*args, **kwargs)
        if self.balance_method_name == 'undersample_centroid':
            self.balance_method = ClusterCentroids(
                sampling_strategy=sampling_strategy)
            self.rebalance_classes()
        else:
            print(f'balance_method_name: {self.balance_method_name} does not '
                  f'match any known method. Classes were not rebalanced')

    def rebalance_classes(self):
        '''
        Rebalance the data and report the change in class balance.
        '''
        print(f'Changing balances from {Counter(self.y).items()}')
        self.X, self.y = self.balance_method.fit_sample(self.X, self.y)
        print(f'to {Counter(self.y).items()}')
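# A brief usage sketch for the class above. Hedged: it assumes DataStorer
# accepts X and y keyword arguments, as the docstring describes; the
# X_train/y_train names are illustrative.
rebalancer = ClassRebalancer(balance_method_name='undersample_centroid',
                             sampling_strategy=0.5,
                             X=X_train, y=y_train)
# rebalancer.X and rebalancer.y now hold the under-sampled data.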
# Split the data into features and labels
dataset = dataframe.values
X = dataset[0:8330, 0:138].astype(float)
Yn = dataset[0:8330, 138]
Y = np_utils.to_categorical(Yn)

scaler = Normalizer('l2').fit(X)
X_normalized = scaler.transform(X)

# Split into training and validation data, 60-40
X_train, X_test, y_train, y_test = train_test_split(
    X_normalized, Yn, test_size=0.4, random_state=42)

# Balance the data with SMOTE
sm = SMOTE(random_state=12, ratio=1.0)
X_train1, Y_train1 = sm.fit_sample(X_train, y_train)
# Convert y_train and y_test into binary (one-hot) vectors
y_train1 = np_utils.to_categorical(Y_train1)
y_test1 = np_utils.to_categorical(y_test)

# Balance the data with UNDERSAMPLING
# print(sorted(Counter(y_train).items()))
cc = ClusterCentroids(random_state=0)
X_train2, Y_train2 = cc.fit_sample(X_train, y_train)
# print(sorted(Counter(y_resampled).items()))
y_train2 = np_utils.to_categorical(Y_train2)
y_test2 = np_utils.to_categorical(y_test)
def kmeans(X, y):
    cc = ClusterCentroids(random_state=42)
    X_res, y_res = cc.fit_sample(X, y)
    # shuffle the resampled data before returning it
    rand_index = np.random.choice(np.arange(len(X_res)), size=len(X_res),
                                  replace=False)
    return X_res[rand_index], y_res[rand_index]
def test_balanced_batch_generator_class_no_return_indices(data):
    with pytest.raises(ValueError, match="needs to have an attribute"):
        BalancedBatchGenerator(*data, sampler=ClusterCentroids(),
                               batch_size=10)
def cluster_centroids_under_sampling(self, keel_dataset):
    sampler = ClusterCentroids()
    result = self.__base_preprocessing(keel_dataset, sampler)
    return result
from imblearn.under_sampling import ClusterCentroids
import pandas as pd
import numpy as np

benchmark = pd.read_csv("./data/feature_new.csv", sep='\t')
benchmark['label'] = (benchmark['label'] == "acr").astype(int)
X = benchmark[["len", "function", "codon", "dev", "hth"]]
y = benchmark['label']

cc = ClusterCentroids(sampling_strategy={0: 25158}, n_jobs=1, random_state=0)
X_smt, y_smt = cc.fit_sample(X, y)

new_benchmark = pd.concat([y_smt, X_smt], axis=1)
new_benchmark.to_csv("./data/feature_CC.csv", sep="\t", index=False)
adasyn_score: [0.36834946 0.36481125 0.3630489  0.35949664 0.36565641]
adasyn_smote_accuracy_score: 0.3661544972905157
adasyn_f1_score: 0.3661544972905157
adasyn_cohen_kappa_score: 0.04467966548542168
adasyn_hamming_loss 0.6338455027094844
'''

'''
Under-sampling: prototype generation.
Given a dataset S, a prototype-generation algorithm produces a subset S' with
|S'| < |S|, but the samples in S' do not come from the original dataset.
That is, prototype generation reduces the number of samples, and the
remaining samples are generated from the original dataset rather than drawn
directly from it.
ClusterCentroids implements exactly this: each class is synthesized from the
centroids of a K-Means run instead of being randomly sampled from the
original data.
'''
from imblearn.under_sampling import ClusterCentroids
cc = ClusterCentroids(random_state=0)
X_resampled_cc, y_resampled_cc = cc.fit_sample(train_set_1_1, label)
print('ClusterCentroids:', sorted(Counter(y_resampled_cc).items()))
x_train_cc, x_test_cc, y_train_cc, y_test_cc = train_test_split(
    X_resampled_cc, y_resampled_cc, random_state=1)
# ClusterCentroids is an efficient way to reduce the number of samples, but
# note that it works best when the original data clusters well. The number
# of centroids should also be chosen so that the down-sampled clusters
# represent the original data faithfully.
svm_clf.fit(x_train_cc, y_train_cc)
joblib.dump(svm_clf, '../model/cc_sample_model.pkl')

# cross-validation evaluation
from sklearn.model_selection import cross_val_score
scores = cross_val_score(svm_clf, x_test_cc, y_test_cc, cv=5)
print('cc_score:', scores)
pred3 = svm_clf.predict(x_test_cc)
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss
from sklearn.neural_network import MLPClassifier

# import some data to play with
X = []
Y = []
reader = DictReader(open("picture.csv", 'r'))
for row in reader:
    Y.append(row['win'])
    del row['win']
    X.append(row)
v = DictVectorizer(sparse=False)
X = v.fit_transform(X)

print('Original dataset shape {}'.format(Counter(Y)))
# sm = SMOTE(kind='svm')
sm = ClusterCentroids(random_state=42)
X, Y = sm.fit_sample(X, Y)
print('Resampled dataset shape {}'.format(Counter(Y)))

train_x, test_x, train_y, test_y = train_test_split(X, Y, test_size=0.25,
                                                    random_state=0)

### building the classifiers
clfs = []

svc = SVC(kernel="linear", C=0.025, probability=True)
svc.fit(train_x, train_y)
print('SVC LogLoss {score}'.format(
    score=log_loss(test_y, svc.predict_proba(test_x))))
clfs.append(svc)
'''
Both of these SMOTE variants use the "in danger" samples to generate new
data. For Borderline-1 SMOTE, the random nearest-neighbour sample b belongs
to a different class than the minority sample a; for Borderline-2 SMOTE, by
contrast, b may belong to any class.
SVM SMOTE (kind='svm') uses a support-vector-machine classifier to find
support vectors, from which new minority samples are generated.
'''

'''
Under-sampling: prototype generation.
Given a dataset S, a prototype-generation algorithm produces a subset S' with
|S'| < |S|, but the samples in S' do not come from the original dataset.
That is, prototype generation reduces the number of samples, and the
remaining samples are generated from the original dataset rather than drawn
directly from it.
ClusterCentroids implements exactly this: each class is synthesized from the
centroids of a K-Means run instead of being randomly sampled from the
original data.
'''
from imblearn.under_sampling import ClusterCentroids
cc = ClusterCentroids(random_state=0)
X_resampled, y_resampled = cc.fit_sample(X, y)
print(sorted(Counter(y_resampled).items()))
# ClusterCentroids is an efficient way to reduce the number of samples, but
# note that it works best when the original data clusters well. The number
# of centroids should also be chosen so that the down-sampled clusters
# represent the original data faithfully.

'''
Prototype selection.
Unlike prototype generation, prototype-selection algorithms pick samples
directly from the original dataset. They fall roughly into two groups:
(i) the controlled under-sampling techniques and (ii) the cleaning
under-sampling techniques. With the first group, the user can specify how
many samples the resampled subset should contain; the second group does not
accept that kind of user control.
'''
# RandomUnderSampler is a fast and very simple way to balance the classes:
# randomly select a subset of the data.
from imblearn.under_sampling import RandomUnderSampler  # under-sampling
rus = RandomUnderSampler(random_state=0)
X_resampled, y_resampled = rus.fit_sample(X, y)
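# To make the prototype-generation point concrete, here is a minimal,
# self-contained sketch (it uses the newer fit_resample API and synthetic
# data, both assumptions on my part): with soft voting, the majority-class
# points returned by ClusterCentroids are K-Means centroids, so in general
# they are not rows of the original X.
import numpy as np
from collections import Counter
from sklearn.datasets import make_classification
from imblearn.under_sampling import ClusterCentroids

X_demo, y_demo = make_classification(n_samples=200, weights=[0.9, 0.1],
                                     random_state=0)
cc_demo = ClusterCentroids(random_state=0)
X_res, y_res = cc_demo.fit_resample(X_demo, y_demo)
print(sorted(Counter(y_res).items()))  # the classes are now balanced

majority = X_res[y_res == 0]
n_original = sum(any(np.allclose(row, x) for x in X_demo) for row in majority)
print(n_original, 'of', len(majority), 'majority samples are original points')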
import sys, os, csv
from imblearn.under_sampling import ClusterCentroids

input_csv_file = sys.argv[1]
input_csv = input_csv_file.split(".csv")[0]

with open(input_csv_file, newline="") as input_file:
    reader = csv.reader(input_file, delimiter=',')
    with open(input_csv + "-cc-.csv", 'w', newline='') as output_file:
        writer = csv.writer(output_file, delimiter=',')
        skip_header = True
        X = []
        y = []
        cc = ClusterCentroids()
        for x in reader:
            if skip_header:
                skip_header = False
                continue
            y.append(x[-1])
            X.append(list(map(int, x[:len(x) - 1])))
        X_res, y_res = cc.fit_sample(X, y)
        print(len(X_res))
        print(len(y_res))
        for idx, s in enumerate(X_res):
            # append the label as a single column (list(y_res[idx]) would
            # split a multi-character label into characters)
            writer.writerow(list(s) + [y_res[idx]])
    print('MIG Score ' + str(score))
    print('Feature Count ' + str(X_mkb.shape[1]))
    # feats = [elem for index, elem in enumerate(selected_features)
    #          if mig[index] >= score]
    # print([features[i] for i in feats])
    return X_mkb

# UNDER-SAMPLING
from imblearn.under_sampling import RandomUnderSampler, ClusterCentroids

# cluster sampling: cluster each class and replace every cluster with its
# centroid
ccsampling = ClusterCentroids(random_state=45, sampling_strategy='all')
# random sampling: can lead to loss of potentially useful information
russampling = RandomUnderSampler(random_state=0)

# print('feature selection - none')
# print("initial dimension of matrix")
# print(str(X_train.shape) + ' Training data dimension after feature selection')
# print(str(Y_train.shape) + ' Training labels')
# print('After random under-sampling')
# X_train_rus, Y_train_rus = russampling.fit_sample(X_train, Y_train)
# print(str(X_train_rus.shape) + ' Training data dimension')
# print(str(Y_train_rus.shape) + ' Training labels')
def under_sample_cluster_centroids(train_inputs, train_targets):
    sampler = ClusterCentroids(random_state=32)
    train_inputs, train_targets = _sampler_helper(sampler, train_inputs,
                                                  train_targets)
    return train_inputs, train_targets
def test_fit_resample_error(cluster_centroids_params, err_msg):
    cc = ClusterCentroids(**cluster_centroids_params)
    with pytest.raises(ValueError, match=err_msg):
        cc.fit_resample(X, Y)
def test_fit_resample_check_voting(X, expected_voting):
    cc = ClusterCentroids(random_state=RND_SEED)
    cc.fit_resample(X, Y)
    assert cc.voting_ == expected_voting
def unbalance_helper(self, imbalance_method='under_sampling',
                     search_method='grid'):
    print("get all feature")
    # generate all features
    self.X_train, self.X_test, self.y_train, self.y_test = \
        self.feature_engineer()
    model_name = None
    # choose how to handle imbalanced data: over-sampling, under-sampling,
    # or an ensemble method
    if imbalance_method == 'over_sampling':
        print("Use SMOTE deal with unbalance data ")
        # https://www.zhihu.com/question/269698662
        # https://www.cnblogs.com/kamekin/p/9824294.html
        self.X_train, self.y_train = SMOTE().fit_resample(
            self.X_train, self.y_train)
        self.X_test, self.y_test = SMOTE().fit_resample(
            self.X_test, self.y_test)
        model_name = 'lgb_over_sampling'
    elif imbalance_method == 'under_sampling':
        print("Use ClusterCentroids deal with unbalance data")
        self.X_train, self.y_train = ClusterCentroids(
            random_state=0).fit_resample(self.X_train, self.y_train)
        self.X_test, self.y_test = ClusterCentroids(
            random_state=0).fit_resample(self.X_test, self.y_test)
        model_name = 'lgb_under_sampling'
    elif imbalance_method == 'ensemble':
        self.model = BalancedBaggingClassifier(
            base_estimator=DecisionTreeClassifier(),
            sampling_strategy='auto',
            replacement=False,
            random_state=0)
        model_name = 'ensemble'
    print('search best param')
    # apply the best parameters found by the search via set_params
    if imbalance_method != 'ensemble':
        param = self.param_search(search_method=search_method)
        param['params']['num_leaves'] = int(param['params']['num_leaves'])
        param['params']['max_depth'] = int(param['params']['max_depth'])
        self.model = self.model.set_params(**param['params'])
    print('fit model ')
    # train the model and report its results
    self.model.fit(self.X_train, self.y_train)
    Test_predict_label = self.model.predict(self.X_test)
    Train_predict_label = self.model.predict(self.X_train)
    per, acc, recall, f1 = get_score(self.y_train, self.y_test,
                                     Train_predict_label, Test_predict_label)
    # training-set precision
    print('Train accuracy %s' % per)
    # test-set accuracy
    print('test accuracy %s' % acc)
    # test recall
    print('test recall %s' % recall)
    # test F1-score
    print('test F1_score %s' % f1)
    self.save(model_name)
tl = TomekLinks(return_indices=True, ratio='majority')
X_tl, y_tl, id_tl = tl.fit_sample(X, y)
print('Removed indexes:', id_tl)
plot_2d_space(X_tl, y_tl, 'Tomek links under-sampling')

from imblearn.under_sampling import ClusterCentroids
cc = ClusterCentroids(ratio={0: 10})
X_cc, y_cc = cc.fit_sample(X, y)
plot_2d_space(X_cc, y_cc, 'Cluster Centroids under-sampling')

from imblearn.over_sampling import SMOTE
smote = SMOTE(ratio='minority')
X_sm, y_sm = smote.fit_sample(X, y)
df_raw.MonthClaimed.cat.set_categories(
    ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
     'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'],
    ordered=True, inplace=True)

# Convert data in the test set
apply_cats(df=df_test, trn=df_raw)

# Convert categories to numerical values and replace missing data
df, y, nas = proc_df(df_raw, 'FraudFound_P')
X_test, _, nas = proc_df(df_test, na_dict=nas)
df, y, nas = proc_df(df_raw, 'FraudFound_P', na_dict=nas)

# Under-sample the majority class
cc = ClusterCentroids(ratio={0: 6650}, n_jobs=-1)
X_cc_full, y_cc_full = cc.fit_sample(df, y)
plot_2d_space(X_cc_full, y_cc_full, 'Cluster Centroids under-sampling')

# Model (LightGBM)
lgb_train = lgb.Dataset(X_cc_full, y_cc_full, free_raw_data=False)

# Parameters for LightGBM
parameters = {
    'num_leaves': 2**5,
    'learning_rate': 0.05,
    'is_unbalance': True,
    'min_split_gain': 0.03,
    'min_child_weight': 1,
    'reg_lambda': 1,
    'subsample': 1,
    'objective': 'binary',
X = QuoraInput

# In[ ]:

Y = quora_dataset['is_duplicate']

# In[ ]:

print('Original dataset shape {}'.format(Counter(Y)))

# In[ ]:

cc = ClusterCentroids(random_state=42)

# In[ ]:

cc

# In[123]:

type(X)

# In[ ]:

type(Y)
    print('auc', roc_auc_score(y_test, clf.predict_proba(X_test)[:, 1]))
    print("")
    print(classification_report(y_test, clf.predict(X_test)))
    print("")
    print('-----------------')
    best_dict[imbalance] = [clf, roc_auc_score(y_test, clf.predict(X_test))]

# analysis with just ClusterCentroids (the best imbalancer)
classifiers = [LogisticRegression(), SVC(probability=True), GaussianNB(),
               DecisionTreeClassifier(), RandomForestClassifier(),
               KNeighborsClassifier(n_neighbors=6)]
cc = ClusterCentroids()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3,
                                                    random_state=4444)
X_train, y_train = cc.fit_sample(X_train, y_train)
fprs, tprs, roc_aucs = [], [], []
for clf in classifiers:
    clf.fit(X_train, y_train)
    y_pred = clf.predict_proba(X_test)[:, 1]
    y_true = y_test
    fpr, tpr, _ = roc_curve(y_true, y_pred)
    roc_auc = auc(fpr, tpr)
for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
    plt.text(j, i, cm[i, j],
             horizontalalignment="center",
             color="white" if cm[i, j] > thresh else "black")
plt.tight_layout()
plt.ylabel('True label')
plt.xlabel('Predicted label')

# define X, y
X, y = (data.loc[:, data.columns != 'state'].values,
        data.loc[:, data.columns == 'state'].values)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3,
                                                    random_state=0)

# ClusterCentroids
cc = ClusterCentroids(random_state=0)
os_X, os_y = cc.fit_sample(X_train, y_train)

# XGBoost
clf_XG = XGBClassifier(learning_rate=0.3, min_child_weight=1, max_depth=6,
                       gamma=0, subsample=1, max_delta_step=0,
                       colsample_bytree=1, reg_lambda=1, n_estimators=100,
                       seed=1000, scale_pos_weight=1000)
clf_XG.fit(os_X, os_y, eval_set=[(os_X, os_y), (X_test, y_test)],
           eval_metric='auc', verbose=False)
evals_result = clf_XG.evals_result()
y_true, y_pred = y_test, clf_XG.predict(X_test)

# F1 score, precision, recall, specificity, G score
print("F1_score : %.4g" % metrics.f1_score(y_true, y_pred))
print("Recall : %.4g" % metrics.recall_score(y_true, y_pred))
recall = metrics.recall_score(y_true, y_pred)
print("Precision : %.4g" % metrics.precision_score(y_true, y_pred))
    weights=[0.1, 0.9], n_informative=3,
    n_redundant=1, flip_y=0, n_features=20,
    n_clusters_per_class=1, n_samples=5000,
    random_state=10)

# Instantiate a PCA object for the sake of easy visualisation
pca = PCA(n_components=2)
# Fit and transform X to visualise inside a 2D feature space
X_vis = pca.fit_transform(X)

# Apply Cluster Centroids
cc = ClusterCentroids()
X_resampled, y_resampled = cc.fit_sample(X, y)
X_res_vis = pca.transform(X_resampled)

# Two subplots, unpack the axes array immediately
f, (ax1, ax2) = plt.subplots(1, 2)
ax1.scatter(X_vis[y == 0, 0], X_vis[y == 0, 1], label="Class #0", alpha=0.5,
            edgecolor=almost_black, facecolor=palette[0], linewidth=0.15)
ax1.scatter(X_vis[y == 1, 0], X_vis[y == 1, 1],
plt.tight_layout()
plt.ylabel('True label')
plt.xlabel('Predicted label')

# define X, y
X, y = (data.loc[:, data.columns != 'state'].values,
        data.loc[:, data.columns == 'state'].values)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3,
                                                    random_state=0)

# ClusterCentroids
cc = ClusterCentroids(random_state=0)
os_X, os_y = cc.fit_sample(X_train, y_train)

# Random Forest
clf_RF = RandomForestClassifier(n_estimators=10, max_depth=None,
                                min_samples_split=2, random_state=0)
clf_RF.fit(os_X, os_y)
y_true, y_pred = y_test, clf_RF.predict(X_test)

# F1 score, precision, recall, specificity, G score
print("F1_score : %.4g" % metrics.f1_score(y_true, y_pred))
print("Recall : %.4g" % metrics.recall_score(y_true, y_pred))
recall = metrics.recall_score(y_true, y_pred)
print("Precision : %.4g" % metrics.precision_score(y_true, y_pred))
def clustercentroidundersample(self, x_train, y_train):
    cc = ClusterCentroids()
    X_cc, y_cc = cc.fit_sample(x_train, y_train)
    return X_cc, y_cc
def test_balanced_batch_generator_function_no_return_indices(data):
    with pytest.raises(ValueError, match="needs to have an attribute"):
        balanced_batch_generator(*data, sampler=ClusterCentroids(),
                                 batch_size=10, random_state=42)
    weights=[0.1, 0.9], n_informative=3,
    n_redundant=1, flip_y=0, n_features=20,
    n_clusters_per_class=1, n_samples=50,
    random_state=10)

# Instantiate a PCA object for the sake of easy visualisation
pca = PCA(n_components=2)
# Fit and transform X to visualise inside a 2D feature space
X_vis = pca.fit_transform(X)

# Apply Cluster Centroids
cc = ClusterCentroids()
X_resampled, y_resampled = cc.fit_resample(X, y)
X_res_vis_soft = pca.transform(X_resampled)

# Use hard voting instead of soft voting
cc = ClusterCentroids(voting='hard')
X_resampled, y_resampled = cc.fit_resample(X, y)
X_res_vis_hard = pca.transform(X_resampled)

# Three subplots, unpack the axes array immediately
f, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(15, 5))

c0 = ax1.scatter(X_vis[y == 0, 0], X_vis[y == 0, 1], label="Class #0",
                 alpha=0.5)
from imblearn.under_sampling import ClusterCentroids

# Generate the dataset
X, y = make_classification(n_classes=2, class_sep=2, weights=[0.1, 0.9],
                           n_informative=3, n_redundant=1, flip_y=0,
                           n_features=20, n_clusters_per_class=1,
                           n_samples=5000, random_state=10)

# Instantiate a PCA object for the sake of easy visualisation
pca = PCA(n_components=2)
# Fit and transform X to visualise inside a 2D feature space
X_vis = pca.fit_transform(X)

# Apply Cluster Centroids
cc = ClusterCentroids()
X_resampled, y_resampled = cc.fit_sample(X, y)
X_res_vis = pca.transform(X_resampled)

# Two subplots, unpack the axes array immediately
f, (ax1, ax2) = plt.subplots(1, 2)
ax1.scatter(X_vis[y == 0, 0], X_vis[y == 0, 1], label="Class #0", alpha=0.5,
            edgecolor=almost_black, facecolor=palette[0], linewidth=0.15)
ax1.scatter(X_vis[y == 1, 0], X_vis[y == 1, 1], label="Class #1", alpha=0.5,
            edgecolor=almost_black, facecolor=palette[2], linewidth=0.15)
ax1.set_title('Original set')

ax2.scatter(X_res_vis[y_resampled == 0, 0], X_res_vis[y_resampled == 0, 1],
            label="Class #0", alpha=.5, edgecolor=almost_black,
            facecolor=palette[0], linewidth=0.15)
def undersample(self, X, y):
    cc = ClusterCentroids(random_state=12)
    return cc.fit_resample(X, y)
###############################################################################
# Prototype generation: under-sampling by generating new samples
###############################################################################

###############################################################################
# ``ClusterCentroids`` under-samples by replacing the original samples with
# the centroids of the clusters it finds.

fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(20, 6))
X, y = create_dataset(n_samples=5000, weights=(0.01, 0.05, 0.94),
                      class_sep=0.8)

clf = LinearSVC().fit(X, y)
plot_decision_function(X, y, clf, ax1)
ax1.set_title(f"Linear SVC with y={Counter(y)}")
sampler = ClusterCentroids(random_state=0)
clf = make_pipeline(sampler, LinearSVC())
clf.fit(X, y)
plot_decision_function(X, y, clf, ax2)
ax2.set_title(f"Decision function for {sampler.__class__.__name__}")
plot_resampling(X, y, sampler, ax3)
ax3.set_title(f"Resampling using {sampler.__class__.__name__}")
fig.tight_layout()

###############################################################################
# Prototype selection: under-sampling by selecting existing samples
###############################################################################

###############################################################################
# The algorithms performing prototype selection can be subdivided into two
# groups: (i) the controlled under-sampling methods and (ii) the cleaning
# under-sampling methods.
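###############################################################################
# To illustrate that distinction with a short, hedged sketch (not part of the
# original example; RandomUnderSampler and TomekLinks stand in for the two
# groups): a controlled method lets the user pin the resampled class counts
# through ``sampling_strategy``, while a cleaning method decides by itself
# which samples to drop.

from collections import Counter
from sklearn.datasets import make_classification
from imblearn.under_sampling import RandomUnderSampler, TomekLinks

X_ps, y_ps = make_classification(n_samples=1000, weights=[0.9, 0.1],
                                 random_state=0)

# Controlled: keep exactly 300 majority samples.
rus = RandomUnderSampler(sampling_strategy={0: 300}, random_state=0)
_, y_rus = rus.fit_resample(X_ps, y_ps)
print(sorted(Counter(y_rus).items()))

# Cleaning: TomekLinks removes whichever majority samples form Tomek links;
# the resulting counts cannot be specified in advance.
tl = TomekLinks()
_, y_tl = tl.fit_resample(X_ps, y_ps)
print(sorted(Counter(y_tl).items()))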
def generate_undersample_km_rus(X: pd.DataFrame, y: pd.Series):
    rus = ClusterCentroids(random_state=0)
    X_resampled, y_resampled = rus.fit_sample(X, y)
    print(sorted(Counter(y_resampled).items()))
    return X_resampled, y_resampled
from sklearn.cross_validation import train_test_split
from sklearn.metrics import (f1_score, recall_score, precision_score,
                             classification_report)
from sklearn.metrics import (confusion_matrix, roc_curve, accuracy_score,
                             roc_auc_score)
from feature_creation import X_train, y_train
from feature_creation import selector, idx, df_reduced_train
from imblearn.over_sampling import RandomOverSampler, SMOTE, ADASYN
from imblearn.under_sampling import (TomekLinks, ClusterCentroids, NearMiss,
                                     CondensedNearestNeighbour,
                                     RandomUnderSampler)
from imblearn.under_sampling import (OneSidedSelection,
                                     InstanceHardnessThreshold)
from imblearn.combine import SMOTEENN, SMOTETomek
from sklearn.linear_model import SGDClassifier

imbalances = [
    RandomUnderSampler(),
    TomekLinks(),
    ClusterCentroids(),
    NearMiss(version=1, size_ngh=5),
    NearMiss(version=2, size_ngh=7),
    NearMiss(version=3, size_ngh=3),
    CondensedNearestNeighbour(size_ngh=3, n_seeds_S=51),
    OneSidedSelection(size_ngh=5, n_seeds_S=51),
    OneSidedSelection(size_ngh=5, n_seeds_S=35),
    InstanceHardnessThreshold(),
    RandomOverSampler(ratio='auto'),
    ADASYN(ratio='auto', k=3),
    ADASYN(ratio=0.1, k=5),
    ADASYN(ratio=0.2, k=7),
    ADASYN(ratio=0.4, k=7),
    SMOTE(ratio='auto', kind='regular', k=5),
    SMOTE(ratio=0.1, kind='regular', k=5),
    SMOTE(ratio='auto', kind='regular', k=7),
    SMOTE(ratio='auto', kind='regular', k=9, out_step=0.6),
def UnderSample(X, Y, method='Random', random_state=42):
    if X.size == len(X):
        X = X.reshape(-1, 1)
    if method == 'Cluster':  # uses a KMeans estimator by default
        sampler = ClusterCentroids(ratio='auto', random_state=random_state,
                                   estimator=None)
    elif method == 'Random':
        sampler = RandomUnderSampler(ratio='auto', random_state=random_state,
                                     replacement=False)
    elif method == 'NearMiss_1':
        sampler = NearMiss(ratio='auto', random_state=random_state, version=1)
    elif method == 'NearMiss_2':
        sampler = NearMiss(ratio='auto', random_state=random_state, version=2)
    elif method == 'NearMiss_3':
        sampler = NearMiss(ratio='auto', random_state=random_state, version=3)
    elif method == 'TomekLinks':
        sampler = TomekLinks(ratio='auto', random_state=random_state)
    elif method == 'ENN':  # kind_sel can be 'all' or 'mode'
        sampler = EditedNearestNeighbours(ratio='auto',
                                          random_state=random_state,
                                          kind_sel='all')
    elif method == 'RENN':  # kind_sel can be 'all' or 'mode'
        sampler = RepeatedEditedNearestNeighbours(ratio='auto',
                                                  random_state=random_state,
                                                  kind_sel='all')
    elif method == 'All_KNN':
        sampler = AllKNN(ratio='auto', random_state=random_state,
                         kind_sel='all')
    elif method == 'CNN':
        sampler = CondensedNearestNeighbour(ratio='auto',
                                            random_state=random_state)
    elif method == 'One_SS':
        sampler = OneSidedSelection(ratio='auto', random_state=random_state)
    elif method == 'NCR':
        sampler = NeighbourhoodCleaningRule(ratio='auto',
                                            random_state=random_state,
                                            kind_sel='all',
                                            threshold_cleaning=0.5)
    elif method == 'IHT':
        sampler = InstanceHardnessThreshold(estimator=None, ratio='auto',
                                            random_state=random_state)
    X_resampled, Y_resampled = sampler.fit_sample(X, Y)
    return X_resampled, Y_resampled
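# A quick usage sketch for the dispatcher above. Hedged: the toy arrays are
# illustrative assumptions, and the older ratio-based imblearn API that the
# function targets is assumed to be installed.
import numpy as np

X_toy = np.arange(32, dtype=float).reshape(16, 2)
Y_toy = np.array([0] * 12 + [1] * 4)  # 12 majority vs 4 minority samples

X_bal, Y_bal = UnderSample(X_toy, Y_toy, method='Cluster', random_state=42)
print(X_bal.shape, np.bincount(Y_bal))  # expect 4 samples per class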