Code example #1
def test_multiclass():
    """Smoke-test MulticlassOversampling driven by distance_SMOTE.

    Runs multiclass oversampling on the sklearn wine dataset twice:
    once with the default strategy and once with the explicit
    'equalize_1_vs_many' strategy. Each run must yield a non-empty
    oversampled sample set.
    """
    dataset = datasets.load_wine()

    oversampler = sv.MulticlassOversampling(sv.distance_SMOTE())

    X_samp, y_samp = oversampler.sample(dataset['data'], dataset['target'])

    assert len(X_samp) > 0

    oversampler = sv.MulticlassOversampling(sv.distance_SMOTE(),
                                            strategy='equalize_1_vs_many')

    X_samp, y_samp = oversampler.sample(dataset['data'], dataset['target'])

    # Fix: the second strategy's output was previously never verified,
    # so a failure there would go unnoticed.
    assert len(X_samp) > 0
Code example #2
def over_sample(X, y, len_data, random_seed=None):  # -> currently unused
    """Balance the dataset by oversampling the under-represented classes.

    Wraps smote_variants' MulticlassOversampling around distance_SMOTE
    (seeded with *random_seed*) and returns the resampled features and
    labels as a (X_samp, y_samp) pair.
    """
    base_smote = sv.distance_SMOTE(random_state=random_seed)
    sampler = sv.MulticlassOversampling(base_smote)
    return sampler.sample(X, y)
Code example #3
# In[4]:

# printing the number of samples per class before oversampling
# (y is defined in an earlier cell of this notebook export)

for i in np.unique(y):
    print("class %d - samples: %d" % (i, np.sum(y == i)))

# ## Oversampling
#
# In this section multiclass oversampling is driven by the binary oversampler ```distance_SMOTE```.

# In[5]:

# choosing an oversampler supporting multiclass oversampling

oversampler = sv.MulticlassOversampling(sv.distance_SMOTE())

# In[6]:

# generate the balanced sample set from the original X, y
X_samp, y_samp = oversampler.sample(X, y)

# ## Illustrating the outcome

# In[7]:

# printing the number of samples per class after oversampling,
# to verify that the class counts have been equalized

for i in np.unique(y_samp):
    print("class %d - samples: %d" % (i, np.sum(y_samp == i)))

# In[8]:
Code example #4
File: script-9.py — Project: danibolanos/Practicas_IN
    # Load the test feature set (train data is loaded above this chunk).
    data_x_tst = pd.read_csv('nepal_earthquake_tst.csv')
    # Check the class balance
    # GraficoComprobarVar(data_y, "damage_grade")
    # Check for missing values
    #ComprobarValPer(data_x)
    # Compute the correlation matrix
    #MatrizCorrelacion(data_x)
    # Drop identifier columns that carry no predictive signal
    eliminaLabels(data_x, data_y, data_x_tst, ['building_id'])
    # Preprocessing: convert categorical columns to numeric codes

    X = catToNum(data_x).values
    y = np.ravel(data_y.values)
    X_tst = catToNum(data_x_tst).values

    # Multiclass oversampling with distance_SMOTE; proportion=0.75 only
    # partially equalizes the minority classes — presumably intentional
    # to limit synthetic-sample noise (TODO confirm).
    oversampler = sv.MulticlassOversampling(sv.distance_SMOTE(proportion=0.75))

    X_sample, y_sample = oversampler.sample(X, y)
    '''
  print("------ RandomForest...")
  rfm = RandomForestClassifier(max_features = 'sqrt', criterion='gini', n_estimators=500, \
                               max_depth=25, random_state=76592621, \
                               n_jobs=-1)
  # Hago la validación cruzada para el algoritmo
  #rfm, y_train_clf, y_test_clf = validacion_cruzada(rfm, X_sample, y_sample)
  print("------ Generando submission...")
  submission(X_sample, y_sample, X_tst, rfm)
  '''

    print("------ Catboost...")
    cbc = CatBoostClassifier(n_estimators=450,
Code example #5
# Seed numpy's global RNG for reproducibility (random_seed defined earlier).
np.random.seed(random_seed)

# In[3]:

# Fetch the imbalanced 'libras_move' benchmark dataset.
libras = imb_datasets.fetch_datasets()['libras_move']
X, y = libras['data'], libras['target']

# In[4]:

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33)

# ## Fitting a pipeline

# In[5]:

oversampler = sv.MulticlassOversampling(sv.distance_SMOTE())
classifier = KNeighborsClassifier(n_neighbors=5)

# In[6]:

# OversamplingClassifier bundles oversampling + classification so the
# whole chain can live inside a sklearn Pipeline.
model = Pipeline([('scale', StandardScaler()),
                  ('clf', sv.OversamplingClassifier(oversampler, classifier))])

# In[7]:

# NOTE(review): fits on the full X, y rather than X_train, y_train even
# though a train/test split was made above — confirm this is intended.
model.fit(X, y)

# ## Grid search

# In[8]:
Code example #6
File: ensemble_learning.py — Project: dddtqshmpmz/PDX
from matplotlib import pyplot as plt
import pandas as pd
import numpy as np
from sklearn.metrics import roc_auc_score, f1_score
from sklearn import metrics
import smote_variants as sv
from sklearn.linear_model import LogisticRegression
from sklearn import svm
import heapq

# prepare the smote data --> X_samp, y_samp
df = pd.read_csv("original_data.csv")
data = np.array(df)
# Features are all columns but the last; the last column is the label.
X = data[:,:-1]
y = data[:, -1]
oversampler = sv.distance_SMOTE()
X_samp, y_samp = oversampler.sample(X, y)
# Round to integers — presumably the features/labels are discrete-valued
# and SMOTE's interpolation produced fractional values (TODO confirm).
X_samp = np.round(X_samp)
y_samp = np.round(y_samp)
# Keep only the rows beyond the original data's length, i.e. presumably
# just the synthetic samples appended after the originals (verify against
# smote_variants' sample() return layout).
X_samp, y_samp = X_samp[len(X):], y_samp[len(y):]
# set the count of smote data
SmoteNum = 11
X_samp = X_samp[:SmoteNum,:]
y_samp = y_samp[:SmoteNum]

# init arrays for saving test scores and train scores
meanAUC = np.array([])
meanPrecision = np.array([])
meanRecall = np.array([])
meanAccuracy = np.array([])
meanF1score = np.array([])
Code example #7
                           validator= RepeatedStratifiedKFold(n_repeats= 8,
                                                              n_splits= 5))
print(results.T[['sampler', 'auc', 'gacc']])

# Reseed before each experiment so results are reproducible and
# independent of the preceding runs.
np.random.seed(random_seed)
# Baseline: NoSMOTE performs no oversampling, giving a reference score.
results= sv.cross_validate(dataset= libras, 
                           sampler= sv.NoSMOTE(), 
                           classifier= KNeighborsClassifier(),
                           validator= RepeatedStratifiedKFold(n_repeats= 8,
                                                              n_splits= 5))
print(results.T[['sampler', 'auc', 'gacc']])

#%% running multiclass oversampling

np.random.seed(random_seed)
# 'equalize_1_vs_many_successive' oversamples the classes one after
# another against the rest until the class counts are equalized.
mc_oversampler= sv.MulticlassOversampling(sv.distance_SMOTE(), strategy= 'equalize_1_vs_many_successive')
X_os, y_os= mc_oversampler.sample(wine['data'], wine['target'])

# Visualize the original and the oversampled wine data (plot_mc is
# defined elsewhere in this file).
plot_mc(wine['data'], wine['target'], 'wine', 0, 1, 2, 'wine.eps')
plot_mc(X_os, y_os, 'wine oversampled by distance-SMOTE', 0, 1, 2, 'wine_distance_smote.eps')

#%% oversampler evaluation

import os.path

# Name the dataset and set up a cache directory for evaluation results.
ecoli['name']= 'ecoli'
cache_path= os.path.join(os.path.expanduser('~'), 'smote_cache')

np.random.seed(random_seed)
results= sv.evaluate_oversamplers(datasets= [ecoli], 
                                  samplers= [sv.SPY,