from sklearn import datasets

import smote_variants as sv


def test_multiclass():
    dataset = datasets.load_wine()

    oversampler = sv.MulticlassOversampling(sv.distance_SMOTE())
    X_samp, y_samp = oversampler.sample(dataset['data'], dataset['target'])

    assert len(X_samp) > 0

    oversampler = sv.MulticlassOversampling(sv.distance_SMOTE(),
                                            strategy='equalize_1_vs_many')
    X_samp, y_samp = oversampler.sample(dataset['data'], dataset['target'])
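# Companion check (an addition, not part of the original test): printing the
# class counts before and after confirms the wine classes actually get balanced,
# which is a stronger check than len(X_samp) > 0.
import numpy as np

dataset = datasets.load_wine()
X_wine, y_wine = dataset['data'], dataset['target']
print(*np.unique(y_wine, return_counts=True))  # [0 1 2] [59 71 48]
X_bal, y_bal = sv.MulticlassOversampling(sv.distance_SMOTE()).sample(X_wine, y_wine)
print(*np.unique(y_bal, return_counts=True))   # every class raised to the majority count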
def over_sample(X, y, len_data, random_seed=None):  # -> not used anywhere; len_data is never read
    """Oversample the under-represented classes to balance the class distribution."""
    oversampler = sv.MulticlassOversampling(
        sv.distance_SMOTE(random_state=random_seed))
    X_samp, y_samp = oversampler.sample(X, y)
    return X_samp, y_samp
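# Hypothetical usage of over_sample (illustrative, not from the original
# module): balancing a small synthetic three-class problem; make_classification
# and its weights are assumptions for the demo.
import numpy as np
from sklearn.datasets import make_classification

X_demo, y_demo = make_classification(n_samples=300, n_classes=3,
                                     n_informative=4,
                                     weights=[0.6, 0.3, 0.1], random_state=42)
X_res, y_res = over_sample(X_demo, y_demo, len(X_demo), random_seed=42)
print(np.unique(y_res, return_counts=True))  # roughly equal counts per class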
# In[4]:

# printing the number of samples
for i in np.unique(y):
    print("class %d - samples: %d" % (i, np.sum(y == i)))

# ## Oversampling
#
# In this section multiclass oversampling is driven by the binary oversampler ```distance_SMOTE```.

# In[5]:

# choosing an oversampler supporting multiclass oversampling
oversampler = sv.MulticlassOversampling(sv.distance_SMOTE())

# In[6]:

X_samp, y_samp = oversampler.sample(X, y)

# ## Illustrating the outcome

# In[7]:

# printing the number of samples
for i in np.unique(y_samp):
    print("class %d - samples: %d" % (i, np.sum(y_samp == i)))

# In[8]:
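# An added aside (not an original notebook cell): the two multiclass strategies
# of MulticlassOversampling can be compared on the same data; the strategy
# names follow the usage seen elsewhere in this section.
for strategy in ['equalize_1_vs_many', 'equalize_1_vs_many_successive']:
    mco = sv.MulticlassOversampling(sv.distance_SMOTE(), strategy=strategy)
    _, y_strat = mco.sample(X, y)
    print(strategy, np.unique(y_strat, return_counts=True)[1])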
data_x_tst = pd.read_csv('nepal_earthquake_tst.csv')

# Check the class balance
# GraficoComprobarVar(data_y, "damage_grade")

# Check for missing values
#ComprobarValPer(data_x)

# Compute the correlation matrix
#MatrizCorrelacion(data_x)

# Drop label columns
eliminaLabels(data_x, data_y, data_x_tst, ['building_id'])

# Preprocessing: category to number
X = catToNum(data_x).values
y = np.ravel(data_y.values)
X_tst = catToNum(data_x_tst).values

oversampler = sv.MulticlassOversampling(sv.distance_SMOTE(proportion=0.75))
X_sample, y_sample = oversampler.sample(X, y)

'''
print("------ RandomForest...")
rfm = RandomForestClassifier(max_features='sqrt', criterion='gini',
                             n_estimators=500, max_depth=25,
                             random_state=76592621, n_jobs=-1)

# Run cross-validation for the algorithm
#rfm, y_train_clf, y_test_clf = validacion_cruzada(rfm, X_sample, y_sample)

print("------ Generating submission...")
submission(X_sample, y_sample, X_tst, rfm)
'''

print("------ Catboost...")
cbc = CatBoostClassifier(n_estimators=450,
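# Illustrative aside (not from the original script): in smote_variants the
# proportion parameter sets what fraction of the majority/minority gap is
# filled, so proportion=0.75 above stops short of a fully balanced set.
# A quick check on a toy binary problem; all names here are hypothetical.
import numpy as np
import smote_variants as sv
from sklearn.datasets import make_classification

X_toy, y_toy = make_classification(n_samples=200, weights=[0.9, 0.1],
                                   random_state=0)
for prop in [0.5, 0.75, 1.0]:
    _, y_s = sv.distance_SMOTE(proportion=prop).sample(X_toy, y_toy)
    print(prop, np.unique(y_s, return_counts=True)[1])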
np.random.seed(random_seed)

# In[3]:

libras = imb_datasets.fetch_datasets()['libras_move']
X, y = libras['data'], libras['target']

# In[4]:

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33)

# ## Fitting a pipeline

# In[5]:

oversampler = sv.MulticlassOversampling(sv.distance_SMOTE())
classifier = KNeighborsClassifier(n_neighbors=5)

# In[6]:

model = Pipeline([('scale', StandardScaler()),
                  ('clf', sv.OversamplingClassifier(oversampler, classifier))])

# In[7]:

# fit on the training split only, so X_test stays unseen
model.fit(X_train, y_train)

# ## Grid search

# In[8]:
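# The grid-search cell is cut off above; a sketch of what it could contain,
# following the smote_variants documentation pattern in which the pipeline
# step named 'clf' exposes the oversampler as the clf__oversampler parameter.
from sklearn.model_selection import GridSearchCV

param_grid = {'clf__oversampler': [sv.distance_SMOTE(proportion=0.5),
                                   sv.distance_SMOTE(proportion=1.0),
                                   sv.distance_SMOTE(proportion=1.5)]}

grid = GridSearchCV(model, param_grid=param_grid, cv=3, n_jobs=1,
                    verbose=2, scoring='accuracy')
grid.fit(X_train, y_train)
print(grid.best_params_)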
from matplotlib import pyplot as plt
import pandas as pd
import numpy as np
from sklearn.metrics import roc_auc_score, f1_score
from sklearn import metrics
import smote_variants as sv
from sklearn.linear_model import LogisticRegression
from sklearn import svm
import heapq

# prepare the SMOTE data --> X_samp, y_samp
df = pd.read_csv("original_data.csv")
data = np.array(df)
X = data[:, :-1]
y = data[:, -1]

oversampler = sv.distance_SMOTE()
X_samp, y_samp = oversampler.sample(X, y)
X_samp = np.round(X_samp)
y_samp = np.round(y_samp)

# the sampler returns the original samples followed by the synthetic ones,
# so slicing past len(X) keeps only the synthetic points
X_samp, y_samp = X_samp[len(X):], y_samp[len(y):]

# set the count of SMOTE samples to keep
SmoteNum = 11
X_samp = X_samp[:SmoteNum, :]
y_samp = y_samp[:SmoteNum]

# init arrays for saving test scores and train scores
meanAUC = np.array([])
meanPrecision = np.array([])
meanRecall = np.array([])
meanAccuracy = np.array([])
meanF1score = np.array([])
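# A hypothetical continuation (not from the original script): to avoid leaking
# test rows into the synthetic points, oversample the training split only,
# then score with the imported LogisticRegression; assumes binary 0/1 labels.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3,
                                                    random_state=42)
X_tr_samp, y_tr_samp = sv.distance_SMOTE().sample(X_train, y_train)

clf = LogisticRegression(max_iter=1000).fit(X_tr_samp, y_tr_samp)
print("AUC: %.3f" % roc_auc_score(y_test, clf.predict_proba(X_test)[:, 1]))
print("F1 : %.3f" % f1_score(y_test, clf.predict(X_test)))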
                            validator=RepeatedStratifiedKFold(n_repeats=8, n_splits=5))
print(results.T[['sampler', 'auc', 'gacc']])

np.random.seed(random_seed)
results = sv.cross_validate(dataset=libras,
                            sampler=sv.NoSMOTE(),
                            classifier=KNeighborsClassifier(),
                            validator=RepeatedStratifiedKFold(n_repeats=8, n_splits=5))
print(results.T[['sampler', 'auc', 'gacc']])

#%% running multiclass oversampling
np.random.seed(random_seed)
mc_oversampler = sv.MulticlassOversampling(sv.distance_SMOTE(),
                                           strategy='equalize_1_vs_many_successive')
X_os, y_os = mc_oversampler.sample(wine['data'], wine['target'])

plot_mc(wine['data'], wine['target'], 'wine', 0, 1, 2, 'wine.eps')
plot_mc(X_os, y_os, 'wine oversampled by distance-SMOTE', 0, 1, 2,
        'wine_distance_smote.eps')

#%% oversampler evaluation
import os.path

ecoli['name'] = 'ecoli'
cache_path = os.path.join(os.path.expanduser('~'), 'smote_cache')

np.random.seed(random_seed)
results = sv.evaluate_oversamplers(datasets=[ecoli],
                                   samplers=[sv.SPY,
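# The evaluate_oversamplers call above is cut off; a complete invocation might
# look like the sketch below. The sampler list beyond sv.SPY, the classifier
# choice, and the printed columns are assumptions, not the original code.
results = sv.evaluate_oversamplers(datasets=[ecoli],
                                   samplers=[sv.SPY, sv.NoSMOTE],
                                   classifiers=[KNeighborsClassifier()],
                                   cache_path=cache_path)
print(results[['sampler', 'classifier', 'auc']])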