def test_pipeline_sample():
    """Check that a pipeline ending in a sampler resamples like the sampler.

    Covers ``fit(...).sample(...)`` and ``fit_sample(...)`` on a pipeline
    holding only the sampler, then a PCA step followed by the sampler.
    """
    dataset_options = dict(n_classes=2, class_sep=2, weights=[0.1, 0.9],
                           n_informative=3, n_redundant=1, flip_y=0,
                           n_features=20, n_clusters_per_class=1,
                           n_samples=5000, random_state=0)
    X, y = make_classification(**dataset_options)

    rus = RandomUnderSampler(random_state=0)
    sampler_only = Pipeline([('rus', rus)])

    # The three ways of resampling must agree with each other.
    X_fitted, y_fitted = sampler_only.fit(X, y).sample(X, y)
    X_piped, y_piped = sampler_only.fit_sample(X, y)
    X_direct, y_direct = rus.fit_sample(X, y)
    comparisons = (
        (X_fitted, (X_piped, X_direct)),
        (y_fitted, (y_piped, y_direct)),
    )
    for reference, candidates in comparisons:
        for candidate in candidates:
            assert_array_almost_equal(reference, candidate)

    # Same check with a transformer in front of the sampler.
    pca = PCA()
    pca_then_sampler = Pipeline([('pca', pca), ('rus', rus)])

    X_fitted, y_fitted = pca_then_sampler.fit(X, y).sample(X, y)
    X_projected = pca.fit_transform(X)
    X_manual, y_manual = rus.fit_sample(X_projected, y)
    assert_array_almost_equal(X_fitted, X_manual)
    assert_array_almost_equal(y_fitted, y_manual)
Beispiel #2
0
    def downsample(self):
        """Balance class data based on outcome.

        Randomly under-samples ``self.X`` / ``self.y`` with a fixed seed,
        appends the selected row indices to the file ``downsampled_idx`` and
        the resampled labels to the file ``downsampled_y`` (one line per
        call, both in the current working directory), and rebuilds
        ``self.Xview`` as a view on the first ``self.n_features`` columns.
        """
        print('Current outcome sampling {}'.format(Counter(self.y)))

        # The seed is fixed so repeated runs pick the same rows; drop
        # ``random_state`` to draw a different sample on every call.
        # ``return_indices=True`` makes fit_sample return a third value,
        # the indices of the rows that were kept.
        rus = RandomUnderSampler(random_state=0, return_indices=True)
        self.X, self.y, ds_idx = rus.fit_sample(self.X, self.y)

        # Context managers close the files even if a write raises.
        with open('downsampled_idx', 'a') as idx_file:
            idx_file.write(str(ds_idx) + '\n')

        with open('downsampled_y', 'a') as y_file:
            y_file.write(str(self.y) + '\n')

        self.Xview = self.X.view()[:, :self.n_features]
        print('Resampled dataset shape {}'.format(Counter(self.y)))
Beispiel #3
0
def undersample(X, y, bal_strategy):
    """Under-sample ``X`` / ``y`` according to ``bal_strategy``.

    Parameters
    ----------
    X, y : objects exposing ``.shape``
        Features and labels to resample.
    bal_strategy : str
        ``"RANDOM"`` or ``"ALL"`` applies ``RandomUnderSampler``,
        ``"TOMEK"`` applies ``TomekLinks`` and ``"NONE"`` returns the inputs
        unchanged.

    Returns
    -------
    tuple
        ``(X_sampled, y_sampled)``.

    Any other ``bal_strategy`` prints a message and terminates the process
    through ``sys.exit(1)``.
    """
    # Single-argument print calls with an explicit format produce the same
    # text under Python 2 and Python 3.
    print('Shape of X:  {}'.format(X.shape))
    print('Shape of y_Train:  {}'.format(y.shape))

    if bal_strategy in ("RANDOM", "ALL"):
        # NOTE(review): "ALL" was also listed on the TOMEK branch, but it
        # could never get there because this branch matches first. Confirm
        # whether "ALL" was meant to chain both samplers.
        X_sampled, y_sampled = RandomUnderSampler().fit_sample(X, y)
    elif bal_strategy == "TOMEK":
        # Tomek links cleaning.
        X_sampled, y_sampled = TomekLinks().fit_sample(X, y)
    elif bal_strategy == 'NONE':
        X_sampled, y_sampled = X, y
    else:
        print('bal_strategy not in ALL, RANDOM, TOMEK, NONE')
        sys.exit(1)

    print('Shape of X_sampled:  {}'.format(X_sampled.shape))
    print('Shape of y_sampled:  {}'.format(y_sampled.shape))

    return (X_sampled, y_sampled)
    def transform(self, X, y=None):
        """Return ``X`` randomly under-sampled on ``self.predicted_column``.

        The ``y`` argument is ignored; the labels are always taken from the
        predicted column of ``X``. The result is a new dataframe with the
        feature columns of ``X`` followed by the predicted column.
        """
        # TODO how do we validate this happens before train/test split? Or do we need to? Can we implement it in the
        # TODO      simple trainer in the correct order and leave this to advanced users?
        target_name = self.predicted_column

        # Separate the labels from the feature columns.
        labels = np.squeeze(X[[target_name]])
        features = X.drop([target_name], axis=1)

        sampler = RandomUnderSampler(random_state=self.random_seed)
        sampled_features, sampled_labels = sampler.fit_sample(features, labels)

        # Rebuild a dataframe carrying the original feature names, then
        # attach the resampled labels under the original column name.
        result = pd.DataFrame(sampled_features)
        result.columns = features.columns
        result[target_name] = pd.Series(sampled_labels)

        return result
 def downsample(self):
     """Randomly under-sample ``self.X`` / ``self.y`` and refresh ``self.Xview``.

     Prints the class counts before and after resampling.
     """
     counts_before = Counter(self.y)
     print('Current outcome sampling {}'.format(counts_before))

     sampler = RandomUnderSampler()
     self.X, self.y = sampler.fit_sample(self.X, self.y)

     # Expose only the first ``n_features`` columns of the resampled data.
     whole_view = self.X.view()
     self.Xview = whole_view[:, :self.n_features]

     counts_after = Counter(self.y)
     print('Resampled dataset shape {}'.format(counts_after))
def test_rus_sample_wrong_X():
    """Check that sampling data other than the fitted data raises.

    The sampler is fitted on the module fixture and then asked to sample a
    freshly generated array; a ``RuntimeError`` is expected.
    """
    sampler = RandomUnderSampler(random_state=RND_SEED)
    sampler.fit(X, Y)

    unseen_X = np.random.random((100, 40))
    unseen_y = np.array([0] * 50 + [1] * 50)
    assert_raises(RuntimeError, sampler.sample, unseen_X, unseen_y)
def test_random_under_sampling_heterogeneous_data():
    """Check that object-dtype input with mixed column types is resampled.

    With two samples of class 0 and one of class 1, two rows are expected
    after under-sampling, and the object dtype must be preserved.
    """
    # The builtin ``object`` is used because the ``np.object`` alias was
    # deprecated in NumPy 1.20 and removed in 1.24.
    X_hetero = np.array([['xxx', 1, 1.0], ['yyy', 2, 2.0], ['zzz', 3, 3.0]],
                        dtype=object)
    y = np.array([0, 0, 1])
    rus = RandomUnderSampler(random_state=RND_SEED)
    X_res, y_res = rus.fit_resample(X_hetero, y)

    assert X_res.shape[0] == 2
    assert y_res.shape[0] == 2
    assert X_res.dtype == object
def test_multiclass_fit_sample():
    """Check under-sampling when the target has three classes.

    Two samples of the fixture are relabelled as class 2; every class is
    expected to end up with two samples.
    """
    y = Y.copy()
    for position in (5, 6):
        y[position] = 2

    sampler = RandomUnderSampler(random_state=RND_SEED)
    _, y_resampled = sampler.fit_sample(X, y)

    per_class = Counter(y_resampled)
    assert per_class[0] == 2
    assert per_class[1] == 2
    assert per_class[2] == 2
def test_rus_fit_resample():
    """Check ``fit_resample`` with replacement against known output."""
    sampler = RandomUnderSampler(random_state=RND_SEED, replacement=True)
    X_resampled, y_resampled = sampler.fit_resample(X, Y)

    # Expected rows for the module fixture and seed.
    expected_X = np.array([
        [0.92923648, 0.76103773],
        [0.47104475, 0.44386323],
        [0.13347175, 0.12167502],
        [0.09125309, -0.85409574],
        [0.12372842, 0.6536186],
        [0.04352327, -0.20515826],
    ])
    expected_y = np.array([0, 0, 0, 1, 1, 1])

    assert_array_equal(X_resampled, expected_X)
    assert_array_equal(y_resampled, expected_y)
def test_rus_fit_sample():
    """Compare ``fit_sample`` output with the stored reference arrays."""
    sampler = RandomUnderSampler(random_state=RND_SEED)
    X_resampled, y_resampled = sampler.fit_sample(X, Y)

    # Reference arrays live in the ``data`` directory next to this file.
    here = os.path.dirname(os.path.abspath(__file__))
    data_dir = os.path.join(here, 'data')
    expected_X = np.load(os.path.join(data_dir, 'rus_x.npy'))
    expected_y = np.load(os.path.join(data_dir, 'rus_y.npy'))

    assert_array_equal(X_resampled, expected_X)
    assert_array_equal(y_resampled, expected_y)
def test_rus_fit():
    """Check the attributes recorded on the sampler by ``fit``."""
    sampler = RandomUnderSampler(random_state=RND_SEED)
    sampler.fit(X, Y)

    # Labels stored in min_c_ / maj_c_ for the module fixture.
    assert_equal(sampler.min_c_, 0)
    assert_equal(sampler.maj_c_, 1)

    # Per-label values stored in stats_c_.
    class_stats = sampler.stats_c_
    assert_equal(class_stats[0], 3)
    assert_equal(class_stats[1], 7)
def test_rus_fit_sample_with_indices():
    """Compare ``fit_sample`` output, indices included, with stored arrays."""
    sampler = RandomUnderSampler(return_indices=True, random_state=RND_SEED)
    X_resampled, y_resampled, idx_under = sampler.fit_sample(X, Y)

    # Reference arrays live in the ``data`` directory next to this file.
    here = os.path.dirname(os.path.abspath(__file__))
    data_dir = os.path.join(here, 'data')
    expected_X = np.load(os.path.join(data_dir, 'rus_x.npy'))
    expected_y = np.load(os.path.join(data_dir, 'rus_y.npy'))
    expected_idx = np.load(os.path.join(data_dir, 'rus_idx.npy'))

    assert_array_equal(X_resampled, expected_X)
    assert_array_equal(y_resampled, expected_y)
    assert_array_equal(idx_under, expected_idx)
def test_rus_fit_sample_with_indices():
    """Check ``fit_sample`` with ``return_indices=True`` against known output."""
    sampler = RandomUnderSampler(return_indices=True, random_state=RND_SEED)
    X_resampled, y_resampled, idx_under = sampler.fit_sample(X, Y)

    # Expected rows, labels and selected indices for the fixture and seed.
    expected_X = np.array([
        [0.92923648, 0.76103773],
        [0.47104475, 0.44386323],
        [0.13347175, 0.12167502],
        [0.09125309, -0.85409574],
        [0.12372842, 0.6536186],
        [0.04352327, -0.20515826],
    ])
    expected_y = np.array([0, 0, 0, 1, 1, 1])
    expected_idx = np.array([1, 3, 8, 6, 7, 0])

    assert_array_equal(X_resampled, expected_X)
    assert_array_equal(y_resampled, expected_y)
    assert_array_equal(idx_under, expected_idx)
Beispiel #14
0
def CrossVal(estimator, X, y, procsessor=None, cv=3, times=10, random_state=0, imb=False):
    """
    Repeated stratified cross-validation.

    estimator:
        The model; it is fitted on every training fold.

    X:
        Feature part of the data set.

    y:
        Labels of the data set.

    procsessor:
        Optional preprocessor (in practice a feature selector). It is fitted
        on the training fold and its ``transform(x, y)`` is applied to both
        the training and the test fold.

    cv:
        Number of folds per repetition.

    times:
        Number of times the cross-validation is repeated.

    random_state:
        Base random seed; repetition ``t`` uses ``random_state + t``.

    imb:
        Whether to balance the classes of each training fold. The code uses
        ``RandomUnderSampler`` for this (the original note mentioned SMOTE).

    Returns an array holding one ``Metrics.Score`` result per fold and
    repetition.
    """
    scores = []
    for repetition in range(times):
        seed = random_state + repetition
        splitter = StratifiedKFold(n_splits=cv, shuffle=True, random_state=seed)
        for train_idx, test_idx in splitter.split(X=X, y=y):
            x_train, y_train = X[train_idx], y[train_idx]
            x_test, y_test = X[test_idx], y[test_idx]

            # Kept as an equality test so that only True (or 1) enables it.
            if imb == True:
                # The label counts are computed but not used afterwards.
                _count_n, _count_p = __lableCount(y_train)
                sampler = RandomUnderSampler(random_state=seed)
                x_train, y_train = sampler.fit_sample(x_train, y_train)

            if procsessor is not None:
                procsessor.fit(x_train, y_train)
                x_train, y_train = procsessor.transform(x_train, y_train)
                x_test, y_test = procsessor.transform(x_test, y_test)

            estimator.fit(x_train, y_train)
            scores.append(Metrics.Score(estimator, x_test, y_test))

    return np.array(scores)
def test_rus_fit_resample_half():
    """Check ``fit_resample`` with explicit per-class target counts."""
    # Requested number of samples per class label.
    requested_counts = {0: 3, 1: 6}
    sampler = RandomUnderSampler(sampling_strategy=requested_counts,
                                 random_state=RND_SEED,
                                 replacement=True)
    X_resampled, y_resampled = sampler.fit_resample(X, Y)

    # Expected rows for the module fixture and seed.
    expected_X = np.array([
        [0.92923648, 0.76103773],
        [0.47104475, 0.44386323],
        [0.92923648, 0.76103773],
        [0.15490546, 0.3130677],
        [0.15490546, 0.3130677],
        [0.15490546, 0.3130677],
        [0.20792588, 1.49407907],
        [0.15490546, 0.3130677],
        [0.12372842, 0.6536186],
    ])
    expected_y = np.array([0, 0, 0, 1, 1, 1, 1, 1, 1])

    assert_array_equal(X_resampled, expected_X)
    assert_array_equal(y_resampled, expected_y)
def test_rus_fit_sample_half():
    """Check ``fit_sample`` when the sampler is built with ``ratio=0.5``."""
    sampler = RandomUnderSampler(ratio=0.5, random_state=RND_SEED)
    X_resampled, y_resampled = sampler.fit_sample(X, Y)

    # Expected rows for the module fixture and seed.
    expected_X = np.array([
        [0.92923648, 0.76103773],
        [0.47104475, 0.44386323],
        [0.13347175, 0.12167502],
        [0.09125309, -0.85409574],
        [0.12372842, 0.6536186],
        [0.04352327, -0.20515826],
        [0.15490546, 0.3130677],
        [0.15490546, 0.3130677],
        [0.15490546, 0.3130677],
    ])
    expected_y = np.array([0, 0, 0, 1, 1, 1, 1, 1, 1])

    assert_array_equal(X_resampled, expected_X)
    assert_array_equal(y_resampled, expected_y)
def test_multiclass_fit_sample():
    """Check ``fit_sample`` when the target has three classes."""
    # Relabel two samples of the fixture to create a third class.
    y = Y.copy()
    for position in (5, 6):
        y[position] = 2

    sampler = RandomUnderSampler(random_state=RND_SEED)
    _, y_resampled = sampler.fit_sample(X, y)

    # Every class is expected to end up with two samples.
    per_class = Counter(y_resampled)
    assert_equal(per_class[0], 2)
    assert_equal(per_class[1], 2)
    assert_equal(per_class[2], 2)
def test_pipeline_sample():
    """Check that a pipeline ending in a sampler resamples like the sampler.

    Covers ``fit(...).sample(...)`` and ``fit_sample(...)`` on a pipeline
    holding only the sampler, then a PCA step followed by the sampler.
    """
    X, y = make_classification(n_classes=2, class_sep=2, weights=[0.1, 0.9],
                               n_informative=3, n_redundant=1, flip_y=0,
                               n_features=20, n_clusters_per_class=1,
                               n_samples=5000, random_state=0)

    rus = RandomUnderSampler(random_state=0)
    sampler_only = Pipeline([('rus', rus)])

    # The three ways of resampling must agree with each other.
    X_fitted, y_fitted = sampler_only.fit(X, y).sample(X, y)
    X_piped, y_piped = sampler_only.fit_sample(X, y)
    X_direct, y_direct = rus.fit_sample(X, y)
    comparisons = (
        (X_fitted, (X_piped, X_direct)),
        (y_fitted, (y_piped, y_direct)),
    )
    for reference, candidates in comparisons:
        for candidate in candidates:
            assert_allclose(reference, candidate, rtol=R_TOL)

    # The pipeline gets its own PCA instance; ``pca`` is fitted separately
    # to build the reference projection.
    pca = PCA()
    pca_then_sampler = Pipeline([('pca', PCA()), ('rus', rus)])

    X_fitted, y_fitted = pca_then_sampler.fit(X, y).sample(X, y)
    X_projected = pca.fit_transform(X)
    X_manual, y_manual = rus.fit_sample(X_projected, y)

    # Values within R_TOL of zero are set to exactly zero before comparing,
    # because PCA output appears to be unstable near zero.
    for values in (X_fitted, X_manual):
        near_zero = (values < R_TOL) & (values > -R_TOL)
        values[near_zero] = 0

    assert_allclose(X_fitted, X_manual, rtol=R_TOL)
    assert_allclose(y_fitted, y_manual, rtol=R_TOL)
Beispiel #19
0
    'objective': 'binary:logistic',
    'eval_metric': 'auc',
    'learning_rate': 0.1,
    'min_child_weight': 1,
    'max_depth': 9,
    'gamma': 0.05,
    'lambda': 10,
    'silent': 1
}

# Rebalance the training set in two steps: SMOTE is asked for 19000 samples
# of class 1, then the random under-sampler is asked for 19000 samples of
# class 0, so '19000' is the size of the positive (and negative) samples.
print(" SMOTE begin...")
print(" ...")
SMOTE_params = SMOTE(ratio={1: 19000}, random_state=0)
train_X_SMOTED, train_y_SMOTED = SMOTE_params.fit_sample(train_X, train_y)
rus = RandomUnderSampler(ratio={0: 19000}, random_state=0)
train_X_SMOTE, train_y_SMOTE = rus.fit_sample(train_X_SMOTED, train_y_SMOTED)
# Show the per-class sample counts after both resampling steps.
print(sorted(Counter(train_y_SMOTE).items()))
print(" SMOTE end.")

# build the prediction model by XGBoost
print(" Training Begin...")
dtrain = xgb.DMatrix(train_X_SMOTE, label=train_y_SMOTE)
dtest = xgb.DMatrix(test_X)
# Only the training matrix is passed as an evaluation set.
watchlist = [(dtrain, 'train')]
bst = xgb.train(params, dtrain, num_boost_round=200, evals=watchlist)
print(" Training End.")

# output the probability value label of the prediction results (range between 0 and 1) and the AUC of the prediction results
print(" Testing Begin...")
ypred = bst.predict(dtest)
    test_size=0.2,
    random_state=23,
    shuffle=True,
    stratify=data_dict['y_train'])

# ### Obtain undersampled dataset
#
# Undersample the data that will be used for training. We do not undersample the mock testing set as we want to keep the distribution of the classes close to the distribution of the original dataset.

# In[97]:

from imblearn.under_sampling import RandomUnderSampler

# In[98]:

# Fixed seed so the same rows are selected on every run.
rus = RandomUnderSampler(random_state=0)

# In[99]:

# Only the training split is resampled.
X_train_under, y_train_under = rus.fit_resample(X_train, y_train)

# In[100]:

# Wrap the resampled labels in a dict under the 'y_train' key and hand it to
# plot_target_frequency (presumably a class-frequency plot; defined elsewhere).
data_dict_under = {'y_train': y_train_under}
plot_target_frequency(data_dict_under)

# ### Prepare Inputs
#
# We convert the dataframes into numpy ndarrays.

# In[101]:
Beispiel #21
0
# Bar chart of the density similarity per feature, largest first.
sortSimilarity = pd.Series(densitySimilarity, namesToPlot).sort_values(ascending=False)

plt.figure(3, figsize=(6,10))
sortSimilarity.plot(kind='bar')
plt.ylabel('density similarity')

# Features are the plotted columns; the target is the 'Class' column.
X = df[namesToPlot]
y = df['Class']

# NOTE(review): the scaler is fitted on all rows before the train/test split,
# so the test rows contribute to the scaling statistics - confirm that this
# is intended, or fit the scaler on the training split only.
sd = StandardScaler()
X = sd.fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify =y)


# Only the training split is under-sampled; the test split keeps its
# stratified class distribution.
rus = RandomUnderSampler(random_state=1)
X_train, y_train = rus.fit_resample(X_train, y_train)

# Baseline logistic regression with default regularisation settings.
clf_lr_base = LogisticRegression(class_weight='balanced', solver='saga', max_iter=5000)
clf_lr_base.fit(X_train, y_train)
y_pred_lr_base = clf_lr_base.predict(X_test)


print(classification_report(y_test, y_pred_lr_base))
print(confusion_matrix(y_test, y_pred_lr_base))
print(balanced_accuracy_score(y_test, y_pred_lr_base))
# Grid of 10 log-spaced values of C between 1e-6 and 1e2.
parameter = {'C': np.logspace(-6, 2, 10)}

# Grid search over C for an L1-penalised model, scored by balanced accuracy.
gs = GridSearchCV(LogisticRegression(solver='saga', max_iter=5000, penalty='l1', class_weight='balanced'), parameter, scoring='balanced_accuracy')

gs.fit(X_train, y_train)
Beispiel #22
0
# Treat the discrete state / quality columns as categorical.
pointDF["invalid_state"] = pointDF["invalid_state"].astype('category')
pointDF["pdh0"] = pointDF["pdh0"].astype('category')
pointDF["vx_rms"] = pointDF["vx_rms"].astype('category')
pointDF["vy_rms"] = pointDF["vy_rms"].astype('category')

X = pointDF[[
    'x', 'y', 'dyn_prop', 'rcs', 'vx_comp', 'vy_comp', 'ambig_state', 'x_rms',
    'y_rms', 'invalid_state', 'pdh0', 'vx_rms', 'vy_rms'
]]
y = pointDF['BasicCategoryNum']

# time.clock() was removed in Python 3.8; time.perf_counter() is the
# documented replacement for measuring elapsed time.
start = time.perf_counter()

# Remove passenger car samples randomly (class 4 is reduced to 75000 samples)
desiredSampleCounts = {4: 75000}
rus = RandomUnderSampler(sampling_strategy=desiredSampleCounts)
X_undersampled, y_undersampled = rus.fit_resample(X, y)

# Remove tractor samples randomly (class 6 is reduced to 75000 samples)
desiredSampleCounts = {6: 75000}
rus = RandomUnderSampler(sampling_strategy=desiredSampleCounts)
X_undersampled, y_undersampled = rus.fit_resample(X_undersampled,
                                                  y_undersampled)
# Per-class sample counts after the two random under-sampling passes.
print(np.bincount(y_undersampled))

# Remove Tomek pairs from every class
underSampleObj = TomekLinks(sampling_strategy='all', n_jobs=5)
X_undersampledTomek, y_undersampledTomek = underSampleObj.fit_resample(
    X_undersampled, y_undersampled)
print(np.bincount(y_undersampledTomek))
Beispiel #23
0
model = RandomForestClassifier(n_estimators=1000, n_jobs=-1)
model.fit(X_train_prepared, y_train)

# The selector is fitted on the training split. Fitting it on the test split,
# as the code did before, lets test labels influence which features are kept
# and makes any later evaluation on that split optimistic.
sel = SelectFromModel(model)
sel.fit(X_train_prepared, y_train)

# Names of the columns the selector keeps.
selected_feat = X_train.columns[(sel.get_support())]

# +
# Dealing with imbalanced data
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline

# Combined over-/under-sampling pipeline. It is built but currently unused:
# the call that applies it is commented out below.
over = SMOTE(sampling_strategy=0.2)
under = RandomUnderSampler(sampling_strategy=0.6)

steps = [('o', over), ('u', under)]
pipeline = Pipeline(steps=steps)

#X_train_prepared, y_train = pipeline.fit_resample(X_train_prepared, y_train)
# Plain SMOTE with default settings is what actually rebalances the data.
over_sample = SMOTE()
X_train_prepared, y_train = over_sample.fit_resample(X_train_prepared, y_train)

# +
#display(X_train_prepared.shape)
#display(y_train.shape)
# -

# Class counts of the target after over-sampling.
plt.figure(figsize=(5, 5))
splot = sns.countplot(data=y_train, x='Bankrupt?', palette='Blues')
from imblearn.under_sampling import RandomUnderSampler

# Generate the dataset: 5000 samples, 20 features, two classes weighted
# 0.1 / 0.9
X, y = make_classification(n_classes=2, class_sep=2, weights=[0.1, 0.9],
                           n_informative=3, n_redundant=1, flip_y=0,
                           n_features=20, n_clusters_per_class=1,
                           n_samples=5000, random_state=10)

# Instantiate a PCA object for the sake of easy visualisation
pca = PCA(n_components=2)
# Fit and transform X to visualise inside a 2D feature space
X_vis = pca.fit_transform(X)

# Apply the random under-sampling
rus = RandomUnderSampler()
X_resampled, y_resampled = rus.fit_sample(X, y)
# Project the resampled points with the PCA fitted on the original data so
# that both plots share the same axes.
X_res_vis = pca.transform(X_resampled)

# Two subplots, unpack the axes array immediately
f, (ax1, ax2) = plt.subplots(1, 2)

# Left panel: the original data, one scatter call per class.
ax1.scatter(X_vis[y == 0, 0], X_vis[y == 0, 1], label="Class #0", alpha=0.5,
            edgecolor=almost_black, facecolor=palette[0], linewidth=0.15)
ax1.scatter(X_vis[y == 1, 0], X_vis[y == 1, 1], label="Class #1", alpha=0.5,
            edgecolor=almost_black, facecolor=palette[2], linewidth=0.15)
ax1.set_title('Original set')

# Right panel: class 0 of the under-sampled data.
ax2.scatter(X_res_vis[y_resampled == 0, 0], X_res_vis[y_resampled == 0, 1],
            label="Class #0", alpha=.5, edgecolor=almost_black,
            facecolor=palette[0], linewidth=0.15)
# ....................................
#
# ``sampling_strategy`` can be given a ``float``. For **under-sampling
# methods**, it corresponds to the ratio :math:`\\alpha_{us}` defined by
# :math:`N_{rM} = \\alpha_{us} \\times N_{m}` where :math:`N_{rM}` and
# :math:`N_{m}` are the number of samples in the majority class after
# resampling and the number of samples in the minority class, respectively.

# select only 2 classes (labels 0 and 2) since the ratio only makes sense in
# this case
binary_mask = np.bitwise_or(y == 0, y == 2)
binary_y = y[binary_mask]
binary_X = X[binary_mask]

sampling_strategy = 0.8

# Under-sample the two-class subset with the float strategy and report the
# resulting class counts.
rus = RandomUnderSampler(sampling_strategy=sampling_strategy)
X_res, y_res = rus.fit_resample(binary_X, binary_y)
print('Information of the iris data set after making it '
      'balanced using a float and an under-sampling method: \n '
      'sampling_strategy={} \n y: {}'
      .format(sampling_strategy, Counter(y_res)))
plot_pie(y_res)

###############################################################################
# For **over-sampling methods**, it corresponds to the ratio
# :math:`\\alpha_{os}` defined by :math:`N_{rm} = \\alpha_{os} \\times N_{M}`
# where :math:`N_{rm}` and :math:`N_{M}` are the number of samples in the
# minority class after resampling and the number of samples in the majority
# class, respectively.

ros = RandomOverSampler(sampling_strategy=sampling_strategy)
Beispiel #26
0
def run_training(fold_):
    """Train and evaluate an XGBoost random-forest classifier on one fold.

    Rows whose ``kfold`` differs from ``fold_`` form the training set and the
    remaining rows the test set. The target is reduced to 'open' versus
    'closed' and label-encoded, and the training data is rebalanced with
    SMOTE followed by random under-sampling before the model is fitted.

    Parameters
    ----------
    fold_ :
        Value compared against the ``kfold`` column. It is averaged together
        with the score through ``np.mean``, so it has to be numeric.

    Returns
    -------
    tuple
        ``(predictions, roc_auc)``: the test frame restricted to the columns
        ``id``, ``target``, ``kfold`` and ``xgb_pred_n`` (column 1 of
        ``predict_proba``), and the ROC AUC obtained on the test fold.
    """
    roc_rows = []

    # Load the full data set and report how long the read took.
    read_started = time.time()
    df = pd.read_hdf(path_or_buf="../input/tiny_data/full_data_folds.h5",
                     key='dataset')
    print("time to read file", time.time() - read_started)

    print(f"fold: {fold_}")

    fit_started = time.time()

    train_df = df[df.kfold != fold_].reset_index(drop=True)
    test_df = df[df.kfold == fold_].reset_index(drop=True)

    # Features: everything except the fold marker and the target.
    non_feature_columns = ["kfold", "target"]
    xtrain = train_df.drop(non_feature_columns, axis=1)
    xtest = test_df.drop(non_feature_columns, axis=1)

    # Make the target binary: anything that is not 'open' becomes 'closed'.
    def to_binary_status(status):
        return 'open' if status == 'open' else 'closed'

    train_df.target = train_df.target.apply(to_binary_status)
    test_df.target = test_df.target.apply(to_binary_status)

    # Encode the labels with an encoder fitted on the training target.
    encoder = preprocessing.LabelEncoder()
    encoder.fit(train_df.target)
    ytrain = encoder.transform(train_df.target)
    ytest = encoder.transform(test_df.target)

    print("now do SMOTE")
    # Over-sample with SMOTE, then randomly under-sample.
    resampler = Pipeline(steps=[
        ('o', SMOTE(sampling_strategy=0.8, n_jobs=-1)),
        ('u', RandomUnderSampler(sampling_strategy=0.9)),
    ])
    X_res, y_res = resampler.fit_resample(xtrain, ytrain)
    print("Before sampling %s" % Counter(ytrain))
    print('Resampled dataset shape %s' % Counter(y_res))

    model_options = {
        'use_label_encoder': False,
        'scale_pos_weight': 0.9,
        'n_estimators': 70,
        'max_depth': 6,
        'n_jobs': -1,
        'subsample': 0.4,
        'num_parallel_tree': 20,
        'eval_metric': 'logloss',
        'tree_method': 'auto',
        'objective': 'reg:logistic',
        'gamma': .1,
        'min_child_weight': 6,
        'booster': 'dart',
        'eta': 0.8,
    }
    model = xgb.XGBRFClassifier(**model_options)

    # Fit on the resampled training data, predict on the untouched test fold.
    model.fit(X_res, y_res)
    preds = model.predict(xtest)
    preds_proba = model.predict_proba(xtest)[:, 1]

    print('time to fit model:', time.time() - fit_started)

    # Computed as before, although the value is not reported.
    accuracy = np.sum(preds == ytest) / len(ytest)

    conf_m = confusion_matrix(ytest, preds)
    roc_score = roc_auc_score(ytest, preds_proba)
    print('ROC AUC score\n', roc_score)
    roc_rows.append([fold_, roc_score])
    test_df.loc[:, "xgb_pred_n"] = preds_proba
    print('Confusion matrix\n', conf_m)

    fold_predictions = test_df[["id", "target", "kfold", "xgb_pred_n"]]
    # With a single row the column mean is that row; entry 1 is the ROC AUC.
    mean_roc = np.mean(roc_rows, axis=0)[1]
    return fold_predictions, mean_roc
# Candidate classifiers compared through calculate_scores_in_cv.
lr_model = LogisticRegression(random_state=r_state)
xgb_model = XGBClassifier()
dt_model = DecisionTreeClassifier(random_state=r_state)
svc_model = SVC(kernel='linear', C=1.0, probability=True)
knn_model = KNeighborsClassifier(n_neighbors=5)

# Scores on the original data.
# NOTE(review): Adboost_model is not defined in this part of the file -
# confirm it is created earlier.
calculate_scores_in_cv(xgb_model, 'xgboost_model', X, y)
calculate_scores_in_cv(Adboost_model, 'AdaBoostClassifier', X, y)
calculate_scores_in_cv(dt_model, 'DecisionTreeClassifier', X, y)
calculate_scores_in_cv(svc_model, 'SVC', X, y)
calculate_scores_in_cv(lr_model, 'LogisticRegression', X, y)
calculate_scores_in_cv(knn_model, 'KNeighborsClassifier', X, y)
"""With SMOTE data balancing"""

# Resample with SMOTE followed by random under-sampling, both created with
# sampling_strategy=1 and the shared seed.
over = SMOTE(sampling_strategy=1, random_state=r_state)
under = RandomUnderSampler(sampling_strategy=1, random_state=r_state)

steps = [('o', over), ('u', under)]
pipeline = Pipeline(steps=steps)

sampled_X, sampled_y = pipeline.fit_resample(X, y)

# Shuffle the resampled rows before scoring.
sampled_X, sampled_y = shuffle(sampled_X, sampled_y, random_state=r_state)

# Scores on the resampled data.
# NOTE(review): the AdaBoost model is not scored in this second pass -
# confirm whether that omission is intended.
calculate_scores_in_cv(xgb_model, 'xgboost_model', sampled_X, sampled_y)
calculate_scores_in_cv(dt_model, 'DecisionTreeClassifier', sampled_X,
                       sampled_y)
calculate_scores_in_cv(svc_model, 'SVC', sampled_X, sampled_y)
calculate_scores_in_cv(lr_model, 'LogisticRegression', sampled_X, sampled_y)
calculate_scores_in_cv(knn_model, 'KNeighborsClassifier', sampled_X, sampled_y)
def comet_Fold(save_path, embedding_type, model_type, bin_labels):
    """Run the 80% / 10% stratified-split baseline and log it to Comet.

    The pickled activity table at ``save_path`` is split five times into
    80/20 train/test partitions. Inside every 80% partition eight further
    training subsets are drawn, each holding 12.5% of that partition (10% of
    all rows). For every training set three variants are fitted -- base
    class ratio, random over-sampling and random under-sampling -- and each
    is scored on the outer 20% test rows. Per-run metrics are collected in a
    DataFrame, plotted as box plots to the Comet experiment and pickled next
    to the input file.

    Parameters
    ----------
    save_path : str
        Path of the pickled activity table. The base name is split at its
        last underscore into an AID part and a trailing info part. The
        table must provide the columns ``'MFP'`` and
        ``'PUBCHEM_ACTIVITY_OUTCOME'`` and, for the MolChars embeddings, the
        contiguous column range ``'Chi0'`` .. ``'VSA_EState9'``.
    embedding_type : {'MFPMolChars', 'MFP', 'MolChars'}
        Which feature matrix to build.
    model_type : {'SVM', 'RF', 'LGBM', 'DNN'}
        Which classifier to train.
    bin_labels : bool
        When truthy, encoded label 0 is mapped to 1 and every other label
        to 0 before training.

    Returns
    -------
    pandas.DataFrame
        One row per trained model with the columns listed in
        ``metric_names``.

    Raises
    ------
    KeyError
        If ``model_type`` is not one of the supported classifiers.
    ValueError
        If ``embedding_type`` is not one of the supported embeddings.
    """
    # comet_ml stays the first third-party import, as in the original order.
    import os
    from comet_ml import Experiment
    # NOTE(review): a credential is hard-coded as the fallback below. The
    # COMET_API_KEY environment variable takes precedence; the literal is
    # kept only so existing invocations continue to work and should be
    # rotated and removed.
    exp = Experiment(api_key=os.environ.get('COMET_API_KEY',
                                            "sqMrI9jc8kzJYobRXRuptF5Tj"),
                     project_name="80_10_baseline",
                     workspace="gdreiman1",
                     disabled=False)
    exp.log_code = True
    import warnings
    warnings.filterwarnings('ignore')
    import collections.abc
    import ntpath
    import pickle
    from pathlib import Path
    import pandas as pd
    import numpy as np
    import sklearn as sklearn
    from sklearn.metrics import precision_recall_fscore_support as prf
    from sklearn.linear_model import SGDClassifier
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.model_selection import StratifiedShuffleSplit
    from sklearn.preprocessing import StandardScaler, LabelEncoder
    import matplotlib.pyplot as plt
    import seaborn as sns
    from imblearn.over_sampling import RandomOverSampler
    from imblearn.under_sampling import RandomUnderSampler
    # sampling_strategy=0.33 targets a minority/majority ratio of 0.33,
    # i.e. roughly one minority sample for every three majority samples.
    ros = RandomOverSampler(sampling_strategy=0.33, random_state=42)
    rus = RandomUnderSampler(sampling_strategy=0.33, random_state=42)

    # ---- Comet saving zone ----
    def comet_addtional_info(exp, save_path, metrics_dict, X_test, y_test,
                             embedding_type, model_type):
        """Tag the experiment, pickle the metrics beside the data and end it.

        ``X_test`` and ``y_test`` are accepted for signature compatibility
        but are not used.
        """
        folder, base = ntpath.split(save_path)
        # split the file name at its LAST underscore (AID_xxx_endinfo.pkl)
        AID, _, end_info = base.rpartition('_')
        exp.add_tag(AID)
        # save data location, AID info, and version info
        exp.log_dataset_info(name=AID, version=end_info, path=save_path)
        exp.add_tags([AID, end_info, model_type])
        exp.add_tag(embedding_type)
        # store the metrics in the data folder, keyed by the experiment key
        exp_num = exp.get_key()
        model_save = Path(folder + '/' + model_type + '_' + embedding_type +
                          '_' + exp_num + 'metrics_dict.pkl')
        # `with` guarantees the handle is closed even if pickling fails
        with open(model_save, 'wb') as pickle_on:
            pickle.dump(metrics_dict, pickle_on)
        exp.log_other('Metrics Dict Path', model_save)
        # tell comet that the experiment is over
        exp.end()

    def get_Scaled_Data(train_ind, test_ind, X_mfp, activity_table, labels,
                        bin_labels):
        """Build train/test features and labels for the given row indices.

        MolChars columns are standardised with a scaler fitted on the
        training rows only. The embedding is taken from the enclosing
        ``embedding_type``.
        """
        if embedding_type not in ('MFPMolChars', 'MFP', 'MolChars'):
            # previously this fell through to an unbound-local error
            raise ValueError('Unknown embedding_type: %r' % (embedding_type, ))
        if embedding_type == 'MFP':
            X_train = X_mfp[train_ind, :]
            X_test = X_mfp[test_ind, :]
        else:
            MC_start = activity_table.columns.get_loc('Chi0')
            # +1 because the slice end is exclusive
            MC_end = activity_table.columns.get_loc('VSA_EState9') + 1
            scaler = StandardScaler(copy=False)
            train_std = scaler.fit_transform(
                np.array(activity_table.iloc[train_ind,
                                             MC_start:MC_end]).astype(float))
            test_std = scaler.transform(
                np.array(activity_table.iloc[test_ind,
                                             MC_start:MC_end]).astype(float))
            if embedding_type == 'MFPMolChars':
                X_train = np.concatenate((X_mfp[train_ind, :], train_std),
                                         axis=1)
                X_test = np.concatenate((X_mfp[test_ind, :], test_std),
                                        axis=1)
            else:
                X_train = train_std
                X_test = test_std
        y_train = labels[train_ind]
        y_test = labels[test_ind]
        if bin_labels:
            # encoded label 0 becomes 1, everything else becomes 0
            y_train = np.array([1 if x == 0 else 0 for x in y_train])
            y_test = np.array([1 if x == 0 else 0 for x in y_test])
        return X_train, X_test, y_train, y_test

    def score_predictions(y_test, preds, split_ID):
        """Compute class-wise metrics + MCC and log the report to Comet."""
        prec, rec, f_1, supp = prf(y_test, preds, average=None)
        class_rep = sklearn.metrics.classification_report(y_test, preds)
        exp.log_other('Classification Report' + split_ID, class_rep)
        mcc = sklearn.metrics.matthews_corrcoef(y_test, preds)
        return prec, rec, f_1, supp, mcc

    # NOTE(review): the trainers log hyper-parameters only when
    # split_ID == '0', but callers pass the long split description
    # (e.g. 'Split0 80% trainBaseRatio'), so that branch never fires.
    # Left unchanged pending a decision on which run should log parameters.

    def train_SVM(X_train, X_test, y_train, y_test, split_ID):
        """Fit a linear SVM via SGD and return its test metrics."""
        sgd_linear_SVM = SGDClassifier(loss='hinge',
                                       penalty='l2',
                                       alpha=0.0001,
                                       l1_ratio=0.15,
                                       fit_intercept=True,
                                       max_iter=500000,
                                       tol=0.001,
                                       shuffle=True,
                                       verbose=0,
                                       epsilon=0.1,
                                       n_jobs=-1,
                                       random_state=None,
                                       learning_rate='optimal',
                                       eta0=0.0,
                                       power_t=0.5,
                                       early_stopping=False,
                                       validation_fraction=0.1,
                                       n_iter_no_change=5,
                                       class_weight='balanced',
                                       warm_start=False,
                                       average=False)
        sgd_linear_SVM_model = sgd_linear_SVM.fit(X_train, y_train)
        results = score_predictions(y_test,
                                    sgd_linear_SVM_model.predict(X_test),
                                    split_ID)
        if split_ID == '0':
            exp.log_parameters(sgd_linear_SVM_model.get_params())
        return results

    def train_RF(X_train, X_test, y_train, y_test, split_ID):
        """Fit a class-balanced random forest and return its test metrics."""
        rf = RandomForestClassifier(n_estimators=100,
                                    random_state=2562,
                                    class_weight="balanced_subsample",
                                    n_jobs=-1)
        rand_for = rf.fit(X_train, y_train)
        results = score_predictions(y_test, rand_for.predict(X_test),
                                    split_ID)
        if split_ID == '0':
            exp.log_parameters(rand_for.get_params())
        return results

    def train_LGBM(X_train, X_test, y_train, y_test, split_ID):
        """Fit a LightGBM binary classifier and return its test metrics."""
        import lightgbm as lgb
        lgbm_model = lgb.LGBMClassifier(boosting_type='gbdt',
                                        num_leaves=31,
                                        max_depth=-1,
                                        learning_rate=0.1,
                                        n_estimators=500,
                                        subsample_for_bin=200000,
                                        objective='binary',
                                        is_unbalance=True,
                                        min_split_gain=0.0,
                                        min_child_weight=0.001,
                                        min_child_samples=20,
                                        subsample=1.0,
                                        subsample_freq=0,
                                        colsample_bytree=1.0,
                                        reg_alpha=0.0,
                                        reg_lambda=0.0,
                                        random_state=None,
                                        n_jobs=-1,
                                        silent=True,
                                        importance_type='split')
        lgbm = lgbm_model.fit(X_train, y_train)
        results = score_predictions(y_test, lgbm.predict(X_test), split_ID)
        if split_ID == '0':
            exp.log_parameters(lgbm.get_params())
        return results

    def train_DNN(X_train, X_test, y_train, y_test, split_ID):
        """Fit a small dense softmax network and return its test metrics."""
        import tensorflow as tf
        from tensorflow.keras.models import Sequential
        from tensorflow.keras.layers import Dense
        from tensorflow.keras.utils import to_categorical
        from sklearn.utils.class_weight import compute_class_weight

        # bias for predictions: initialise the output layer with a prior
        fl_pi = 0.01
        final_bias = -np.log((1 - fl_pi) / fl_pi)
        num_labels = len(set(y_test))
        # Keras expects class_weight as a {class_index: weight} dict, and
        # recent scikit-learn requires `classes`/`y` as keyword arguments.
        classes = np.unique(y_train)
        balanced = compute_class_weight('balanced', classes=classes,
                                        y=y_train)
        class_weights = dict(zip(classes.tolist(), balanced.tolist()))
        tf.keras.backend.clear_session()
        fast_NN = Sequential(name='quick')
        fast_NN.add(Dense(512, activation='sigmoid', name='input'))
        for units, layer_name in ((128, 'first'), (64, 'second'),
                                  (16, 'third')):
            fast_NN.add(
                Dense(units,
                      activation='relu',
                      name=layer_name,
                      bias_initializer=tf.keras.initializers.Constant(
                          value=0.1)))
        fast_NN.add(
            Dense(num_labels,
                  activation='softmax',
                  name='predict',
                  bias_initializer=tf.keras.initializers.Constant(
                      value=final_bias)))
        fast_NN.compile(loss='categorical_crossentropy',
                        optimizer='adam',
                        metrics=[
                            'categorical_accuracy',
                            tf.keras.metrics.Recall(),
                            tf.keras.metrics.Precision()
                        ])
        fast_NN.fit(X_train,
                    to_categorical(y_train),
                    validation_data=(X_test, to_categorical(y_test)),
                    epochs=10,
                    batch_size=500,
                    class_weight=class_weights,
                    shuffle=True,
                    verbose=0)
        predicted = np.argmax(fast_NN.predict(X_test), axis=1)
        return score_predictions(y_test, predicted, split_ID)

    #from https://stackoverflow.com/questions/6027558/flatten-nested-dictionaries-compressing-keys
    def flatten(d, parent_key='', sep='_'):
        """Flatten a nested mapping, joining nested keys with ``sep``."""
        items = []
        for k, v in d.items():
            new_key = parent_key + sep + k if parent_key else k
            # collections.MutableMapping was removed in Python 3.10
            if isinstance(v, collections.abc.MutableMapping):
                items.extend(flatten(v, new_key, sep=sep).items())
            else:
                items.append((new_key, v))
        return dict(items)

    def calc_and_save_metrics(X_train, X_test, y_train, y_test, split_index,
                              model_type, embedding_type, AID, metric_names,
                              metric_dict_list, split_info, split_num,
                              little_split_num):
        '''Takes in test and train data + labels, computes metrics and saves them
        as a dict inside of the provided list. Returns this list.'''
        prec, rec, f_1, supp, mcc = classifier_train(X_train, X_test, y_train,
                                                     y_test, split_info)
        results_array = np.concatenate((prec, rec, f_1, supp)).tolist() + [mcc]
        split_size = '80%' if little_split_num == 'NaN' else '10%'
        metric_dict_list.append(
            dict(
                zip(metric_names, [
                    model_type, embedding_type, AID, split_num,
                    little_split_num, split_size, split_index, split_info
                ] + results_array)))
        return metric_dict_list

    # ---- Begin the actual experiment ----
    # NOTE(review): pickle.load executes arbitrary code from the file; only
    # use this on trusted, locally produced tables.
    with open(save_path, 'rb') as pickle_off:
        activity_table = pickle.load(pickle_off)
    folder, base = ntpath.split(save_path)
    # split the file name at its LAST underscore (AID_xxx_endinfo.pkl)
    AID, _, end_info = base.rpartition('_')
    # fingerprint length, read from the first row so short tables also work
    fp_length = len(activity_table.iloc[0]['MFP'])
    # stack the per-row fingerprints into an (n_rows, fp_length) matrix
    X_mfp = np.concatenate(np.array(activity_table['MFP'])).ravel()
    X_mfp = X_mfp.reshape((-1, fp_length))
    le = LabelEncoder()
    labels = le.fit_transform(activity_table['PUBCHEM_ACTIVITY_OUTCOME'])
    # outer 5x repeated stratified 80/20 split
    big_splitter = StratifiedShuffleSplit(n_splits=5,
                                          test_size=0.2,
                                          random_state=2562)
    # inner splits: 12.5% of the 80% partition, i.e. 10% of all rows
    little_splitter = StratifiedShuffleSplit(n_splits=8,
                                             test_size=0.2,
                                             train_size=0.125,
                                             random_state=2562)
    # this holds all the metrics values that will be stored in comet
    metric_names = [
        'Classifier', 'Embedding', 'AID', '80% Split Number',
        '10% Split Number', 'Train Split Size', 'ID', 'Split Info',
        'prec_Inactive', 'prec_Active', 'rec_Inactive', 'rec_Active',
        'f_1_Inactive', 'f_1_Active', 'supp_Inactive', 'supp_Active', 'mcc'
    ]

    classifier_dict = {
        'SVM': train_SVM,
        'RF': train_RF,
        'LGBM': train_LGBM,
        'DNN': train_DNN
    }
    # function that trains the requested model
    classifier_train = classifier_dict[model_type]
    metric_dict_list = []
    # (label suffix, sampler) per variant; None means "train on data as is"
    variants = (('BaseRatio', None), ('OverSample', ros),
                ('UnderSample', rus))

    def run_variant(fit_ind, test_ind, sampler, split_info, split_index,
                    split_num, little_split_num, require_two_classes):
        """Train and score one variant; returns the test data that was used."""
        X_train, X_test, y_train, y_test = get_Scaled_Data(
            fit_ind, test_ind, X_mfp, activity_table, labels, bin_labels)
        # The class check runs BEFORE resampling so that a single-class
        # subset is skipped instead of crashing inside the sampler.
        if require_two_classes and len(set(y_train)) != 2:
            print('Skipped ' + split_info)
            return X_test, y_test
        if sampler is not None:
            X_train, y_train = sampler.fit_resample(X_train, y_train)
        calc_and_save_metrics(X_train, X_test, y_train, y_test, split_index,
                              model_type, embedding_type, AID, metric_names,
                              metric_dict_list, split_info, split_num,
                              little_split_num)
        return X_test, y_test

    # labels double as a dummy X: only the stratification target matters
    for split_num, (train_ind,
                    test_ind) in enumerate(big_splitter.split(labels, labels)):
        # ID of the split the data comes from, X.X i.e. big.little
        split_index = str(split_num)
        for suffix, sampler in variants:
            split_info = 'Split' + split_index + ' 80% train' + suffix
            # BaseRatio now really trains on the unresampled data
            X_test, y_test = run_variant(train_ind, test_ind, sampler,
                                         split_info, split_index, split_num,
                                         'NaN', False)

        for little_split_num, (little_pos, _) in enumerate(
                little_splitter.split(labels[train_ind], labels[train_ind])):
            # The inner splitter yields positions WITHIN labels[train_ind];
            # map them back to row indices of the full table. Using the raw
            # positions selected unrelated rows (possibly test rows).
            little_train_ind = train_ind[little_pos]
            split_index = str(split_num) + '.' + str(little_split_num)
            for suffix, sampler in variants:
                # NOTE(review): as before, BaseRatio is labelled with the
                # big.little ID while the resampled variants use only the
                # outer split number -- confirm which grouping is wanted.
                label_id = (split_index
                            if sampler is None else str(split_num))
                split_info = 'Split' + label_id + ' 10% train' + suffix
                X_test, y_test = run_variant(little_train_ind, test_ind,
                                             sampler, split_info, split_index,
                                             split_num, little_split_num,
                                             True)
    # now convert metric_dict_list to df:
    metrics_df = pd.DataFrame(metric_dict_list)
    cols_to_plot = [
        'prec_Inactive', 'prec_Active', 'rec_Inactive', 'rec_Active',
        'f_1_Inactive', 'f_1_Active', 'supp_Inactive', 'supp_Active', 'mcc'
    ]
    # turn off interactive plotting
    plt.ioff()
    for metric in cols_to_plot:
        # one box plot per metric, grouped by the split description
        ax = sns.boxplot(x='Split Info', y=metric, data=metrics_df)
        ax.set_xticklabels(ax.get_xticklabels(), rotation=30)
        plt.tight_layout()
        exp.log_figure()
        plt.clf()

    # pickle the metrics next to the data, tag the experiment and end it
    comet_addtional_info(exp, save_path, metrics_df, X_test, y_test,
                         embedding_type, model_type)
    return metrics_df
Beispiel #29
0
##################################
### [2] Over Sampling
##################################
# `X`, `y`, `result_dic` and `LogisticReg` are defined earlier in the script.
# `fit_resample` replaces `fit_sample`, which newer imbalanced-learn
# releases no longer provide.

from imblearn.over_sampling import RandomOverSampler
ros = RandomOverSampler()
X_resampled, y_resampled = ros.fit_resample(X, y)
result_dic["Over Sampling"] = LogisticReg(X_resampled, y_resampled)


##################################
### [3] Under Sampling
##################################

from imblearn.under_sampling import RandomUnderSampler
# NOTE: the module-level name `ros` is deliberately kept (it is rebound to the
# under-sampler here) so that any later code relying on it is unaffected.
ros = RandomUnderSampler()
X_resampled, y_resampled = ros.fit_resample(X, y)
result_dic["Under Sampling"] = LogisticReg(X_resampled, y_resampled)


##################################
### [4] SMOTE
##################################

from imblearn.over_sampling import SMOTE
# The `kind` argument was removed from SMOTE; plain SMOTE() is the former
# kind='regular' behaviour.
sm = SMOTE()
X_resampled, y_resampled = sm.fit_resample(X, y)
result_dic["SMOTE"] = LogisticReg(X_resampled, y_resampled)


##################################
def forest_tangent_space_hierarchical(data):
    """Two-level (hierarchical) random-forest classifier.

    Level 1 separates the merged classes {1, 2} from class 3 on a randomly
    under-sampled (balanced) training set. Level 2 separates class 1 from
    class 2 using only the training samples of those two classes. At test
    time every sample that level 1 assigns to the merged class is
    re-labelled by level 2. Cross-validated scores of both levels are
    printed as a side effect.

    Parameters
    ----------
    data : dict
        Dictionary with the keys ``'train_x'``, ``'train_y'``, ``'test_x'``
        and ``'test_y'``. The ``*_y`` arrays are reduced with
        ``argmax(axis=1)``, so they are expected to be 2-D score/one-hot
        arrays with one column per class.

    Returns
    -------
    score : float
        Accuracy of the combined two-level prediction on the test set.

    """

    # Class labels are 1-based: argmax index + 1
    x_level_1 = data['train_x']
    y_level_1 = np.argmax(data['train_y'], axis=1) + 1
    y_level_1 = np.expand_dims(y_level_1, axis=1)

    # Verify if they are balanced (prints the fraction of each class)
    print(
        sum(y_level_1 == 1) / len(y_level_1),
        sum(y_level_1 == 2) / len(y_level_1),
        sum(y_level_1 == 3) / len(y_level_1))

    # Combine C1 and C2 classes and balance the dataset for training.
    # The labels are passed as X as well: only the selected row indices
    # (rus.sample_indices_) are needed, not the resampled values.
    y_level_1[y_level_1 == 2] = 1
    rus = RandomUnderSampler()
    rus.fit_resample(y_level_1, y_level_1)

    # Keep only the rows chosen by the under-sampler
    x_level_1 = x_level_1[rus.sample_indices_, :]
    y_level_1 = y_level_1[rus.sample_indices_].ravel()

    # Level 1: merged {1, 2} versus 3
    clf_level_1 = RandomForestClassifier(n_estimators=100, random_state=43)
    scores_1 = cross_val_score(clf_level_1,
                               x_level_1,
                               y_level_1,
                               cv=KFold(5, shuffle=True))
    print(scores_1)

    # Level 2: class 1 versus class 2, class 3 samples removed
    y_level_2 = np.argmax(data['train_y'], axis=1) + 1
    keep = y_level_2 != 3
    x_level_2 = data['train_x'][keep, :]
    y_level_2 = y_level_2[keep].ravel()

    clf_level_2 = RandomForestClassifier(n_estimators=100, random_state=43)
    scores_2 = cross_val_score(clf_level_2,
                               x_level_2,
                               y_level_2,
                               cv=KFold(5, shuffle=True))
    print(scores_2)

    # Fit both levels on their full training sets for final testing
    clf_level_1 = clf_level_1.fit(x_level_1, y_level_1)
    clf_level_2 = clf_level_2.fit(x_level_2, y_level_2)

    # Predict with level 1, then refine the merged class with level 2
    y_true = np.argmax(data['test_y'], axis=1) + 1
    y_pred = clf_level_1.predict(data['test_x'])
    merged = y_pred == 1
    # Guard: predict() raises on an empty array, which happens when level 1
    # assigns no test sample to the merged class.
    if merged.any():
        y_pred[merged] = clf_level_2.predict(data['test_x'][merged, :])

    # Compare the combined prediction with the true labels
    score = accuracy_score(y_true, y_pred)

    return score
Beispiel #31
0
            ys = np.concatenate((ys, np.array(y_res[i])))

    print(Xs.shape, ys.shape)
    shuffle(Xs, ys)

    # Generate more synthetic samples
    if smote is not None:
        Xs, ys = smote.fit_sample(Xs, ys)

    shuffle(Xs, ys)
    ys = to_categorical(ys, 2)

    return Xs, ys


# Samplers used by the resampling steps below. The class counts are multiples
# of 1531.
# NOTE(review): `ratio=` (and SMOTE's `m_neighbors=`) belong to an older
# imbalanced-learn API; newer releases use `sampling_strategy=` -- confirm the
# installed version before running.
# First under-sampler: class 0 -> 1531 * 30 samples, class 1 -> 1531 samples.
rus = RandomUnderSampler(ratio={0: 1531 * 30, 1: 1531})
smote = SMOTE(n_jobs=-1, random_state=42, k_neighbors=3, m_neighbors=5)
# Second under-sampler: class 0 -> 1531 * 100, class 1 -> 1531 * 50 samples.
rus2 = RandomUnderSampler(ratio={0: 1531 * 100, 1: 1531 * 50})

#ros = RandomOverSampler(ratio={0: 1531*10, 1: 1531*5})
# smoteenn = SMOTEENN(smote=SMOTE(n_jobs=-1))

print("Resampling")
'''
0.589
resampled_features, resampled_labels = rus.fit_sample(features, labels[:, 1])
resampled_features, resampled_labels = smote.fit_sample(
        resampled_features, resampled_labels)

#resampled_features, resampled_labels = rus2.fit_sample(
#        resampled_features, resampled_labels)
Beispiel #32
0
# Build the feature matrices by concatenating the per-sample feature groups
# along the last axis. The *_train / *_test arrays are defined earlier.
# train_samples = np.concatenate([ sentences_train,features_train,refers_train,abstract_train], axis=-1)
#train_samples = np.concatenate([[i * 10 for i in features_train]], axis=-1)
train_samples = np.concatenate(
    [sentences_train, features_train, refers_train, abstract_train], axis=-1)
# The test matrix additionally carries its labels as the last column.
# test_samples = np.concatenate([ sentences_test,features_test,refers_test,abstract_test,labels_test], axis=-1)
#test_samples = np.concatenate([[i * 10 for i in features_test]], axis=-1)
test_samples = np.concatenate(
    [sentences_test, features_test, refers_test, abstract_test, labels_test],
    axis=-1)
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTE
#smo = SMOTE(sampling_strategy=0.7)
#x_train,y_train = smo.fit_resample(train_samples,labels_train)

# Under-sample the training data to a minority/majority ratio of 0.6.
# `fit_resample` is used instead of the removed `fit_sample`; it exists in
# every imbalanced-learn release that accepts `sampling_strategy`.
model_RandomUnderSample = RandomUnderSampler(sampling_strategy=0.6)
x_train, y_train = model_RandomUnderSample.fit_resample(train_samples,
                                                        labels_train)
# Re-attach the resampled labels as the last column of the training matrix.
y_train = np.expand_dims(y_train, axis=1)
train_samples = np.concatenate([x_train, y_train], axis=-1)

trainData = 'train7_14.csv'
testData = 'test7_14.csv'
# Every feature column is named 'train' and the final (label) column 'test';
# these header strings are kept as-is because downstream readers may rely on
# them.
data1 = pd.DataFrame(train_samples)
data1.columns = data1.columns.map(lambda x: 'test'
                                  if x == (data1.shape[1] - 1) else 'train')
data1.to_csv(trainData, index=False)

data1 = pd.DataFrame(test_samples)
data1.columns = data1.columns.map(lambda x: 'test'
                                  if x == (data1.shape[1] - 1) else 'train')
Beispiel #33
0
 def func(X, y, sampling_strategy, random_state):
     """Randomly under-sample ``(X, y)`` and return the resampled pair.

     A fresh RandomUnderSampler is built from ``sampling_strategy`` and
     ``random_state`` on every call; its ``fit_resample`` result is returned
     unchanged.
     """
     sampler = RandomUnderSampler(random_state=random_state,
                                  sampling_strategy=sampling_strategy)
     resampled = sampler.fit_resample(X, y)
     return resampled
Beispiel #34
0
def eval_with_sampling_and_kfold_logical_regression(features, df, k=5):
    """
    Evaluate logistic regression on over+under-sampled data with stratified
    K-fold cross-validation, print the averaged scores and save a ROC plot
    to "LogR-ROC.png".

    :param features: selected features
    :param df: dataframe from the dataset
    :param k: number of folds for the stratified K-fold CV
    :return: tuple (precision, recall, accuracy, f1_score), each averaged
        over the k folds
    """
    roc_auc, precision, recall, acc, f1 = [[] for _ in range(5)]

    X, y = init_model(df, features)
    # log1p transform to reduce skewness of the features
    X = np.log1p(X)

    print("X->", X)
    print("y->", y)

    model = LogisticRegression()
    # SMOTE up to a 0.1 minority ratio, then under-sample to a 0.5 ratio
    over = SMOTE(sampling_strategy=0.1)
    under = RandomUnderSampler(sampling_strategy=0.5)
    steps = [('over', over), ('under', under)]
    pipeline = Pipeline(steps=steps)
    kfold = StratifiedKFold(n_splits=k, shuffle=True, random_state=1)
    # NOTE(review): the whole data set is resampled BEFORE it is split into
    # folds, so synthetic samples derived from a test-fold row can end up in
    # the training folds. The reported scores are therefore optimistic;
    # resample inside each fold if unbiased estimates are required.
    X, y = pipeline.fit_resample(X, y)
    # enumerate the splits and summarize the distributions
    for train_ix, test_ix in kfold.split(X, y):
        X_train = X.iloc[train_ix]
        X_test = X.iloc[test_ix]
        y_train = y[train_ix]
        y_test = y[test_ix]
        # print the split rates
        print_split_rates(y_train, y_test)

        print("running the pipeline fit..\n")
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        # Scores
        precision.append(precision_score(y_test, y_pred, average='binary'))
        recall.append(recall_score(y_test, y_pred, average='binary'))
        acc.append(accuracy_score(y_test, y_pred))
        f1.append(f1_score(y_test, y_pred, average='binary'))

        # NOTE(review): the ROC curve is built from hard class predictions,
        # not from predicted probabilities, so it has a single operating
        # point per fold.
        fpr, tpr, thresholds = roc_curve(y_test, y_pred)
        roc_auc.append(auc(fpr, tpr))

    roc_auc = sum(roc_auc) / k
    mean_precision = sum(precision) / k
    mean_recall = sum(recall) / k
    mean_acc = sum(acc) / k
    mean_f1 = sum(f1) / k
    print("\nprecision:{0}\nrecall:{1}\naccuracy:{2}\nf1_score:{3}".format(
        mean_precision,
        mean_recall,
        mean_acc,
        mean_f1))
    # The plotted curve is the LAST fold's; the legend shows the mean AUC.
    plt.title('Receiver Operating Characteristic')
    plt.plot(fpr, tpr, 'b', label='AUC = %0.3f' % roc_auc)
    plt.legend(loc='lower right')
    plt.plot([0, 1], [0, 1], 'r--')
    plt.xlim([0, 1])
    plt.ylim([0, 1.05])
    plt.ylabel('True Positive Rate')
    plt.xlabel('False Positive Rate')
    # plt.show()
    plt.savefig("LogR-ROC.png", dpi=300)
    # Return the averaged scores, as the docstring has always promised.
    return mean_precision, mean_recall, mean_acc, mean_f1
Beispiel #35
0
 def __init__(self, n_estimators, depth):
     """Store the ensemble settings and create the under-sampler.

     ``n_estimators`` is kept as ``self.M`` and ``depth`` as ``self.depth``;
     the under-sampler draws without replacement.
     """
     self.undersampler = RandomUnderSampler(replacement=False)
     self.depth = depth
     self.M = n_estimators
Beispiel #36
0
 def func(X, y, ratio, random_state):
     """Randomly under-sample ``(X, y)`` with the given ``ratio`` and seed.

     Returns whatever the sampler's ``fit_sample`` returns.
     """
     sampler = RandomUnderSampler(random_state=random_state, ratio=ratio)
     resampled = sampler.fit_sample(X, y)
     return resampled
        cols = self.X_unders[name].columns
        sel_cols = [
            cols[i] for i in range(len(cols)) if ranking[i] == 1
            and cols[i] not in ['TransactionID', 'TransactionDT']
        ]
        self.X_unders[name] = self.X_unders[name][sel_cols]


if __name__ == '__main__':
    # load data table
    red_data = reduced_transaction_table('../data/train_transaction.csv')

    und_samp_name = 'random'
    red_data.add_undersampling_transform(
        und_samp_name,
        RandomUnderSampler(sampling_strategy='majority', random_state=0))

    # load selected features
    rankings = load(
        open('../trained_models/select_features/rankings.pkl', 'rb'))
    # it is known that 80 features gives the best accuracy
    red_data.select_features(und_samp_name, rankings[80])

    # reference for fitting model and getting feature importance:
    # https://machinelearningmastery.com/calculate-feature-importance-with-python/

    # define dataset
    X, y = red_data.X_unders[und_samp_name], red_data.y_unders[und_samp_name]

    # split train and test sets
    X_train, X_test, y_train, y_test = train_test_split(X,
# Stratified 80/20 train/validation split of the second training set
# (`X2`, `y2` are defined earlier), with a fixed seed.
X2_train, X2_valid, y2_train, y2_valid = train_test_split(X2,
                                                          y2,
                                                          test_size=0.20,
                                                          stratify=y2,
                                                          random_state=42)

# Held-out test features/labels; 'went_on_backorder' is the target column.
test_X = test_1.drop('went_on_backorder', axis=1).values
test_Y = test_1['went_on_backorder'].values
# Count of label 0 per sample of label 1 in y2; %i truncates the quotient.
print('Imbalanced ratio in training set_2: 1:%i' %
      (Counter(y2)[0] / Counter(y2)[1]))

# Model set "_0": a plain decision tree, the same tree behind a random
# under-sampler, a random forest and an XGBoost classifier.
cart_0 = tree.DecisionTreeClassifier(criterion='entropy',
                                     max_depth=8,
                                     min_samples_leaf=5)
# presumably `make_pipeline` is imblearn's, since a sampler is one of its
# steps -- TODO confirm against the file's imports.
rus_0 = make_pipeline(
    RandomUnderSampler(),
    tree.DecisionTreeClassifier(criterion='entropy',
                                max_depth=8,
                                min_samples_leaf=5))
forest_0 = ensemble.RandomForestClassifier(criterion='entropy',
                                           max_depth=15,
                                           min_samples_leaf=5)
xgb_0 = XGBClassifier(max_depth=15, learning_rate=0.1)

# Model set "_1" starts with a tree configured identically to cart_0.
cart_1 = tree.DecisionTreeClassifier(criterion='entropy',
                                     max_depth=8,
                                     min_samples_leaf=5)
rus_1 = make_pipeline(
    RandomUnderSampler(),
    tree.DecisionTreeClassifier(criterion='entropy',
                                max_depth=8,
Beispiel #39
0
def RandomUnderSample(X_train, y_train):
    """Randomly under-sample a training set with ``RandomUnderSampler``.

    Args:
        X_train: Feature matrix to resample.
        y_train: Target labels aligned with ``X_train``.

    Returns:
        Tuple ``(X_resampled, y_resampled)`` as returned by ``fit_resample``.
    """
    # ``return_indices`` and ``ratio`` were deprecated in imbalanced-learn 0.4
    # and removed in 0.6; passing them (even with their default values) raises
    # TypeError on current releases, so only the supported options are kept.
    rus = RandomUnderSampler(sampling_strategy='auto',
                             random_state=None,
                             replacement=False)
    X_train, y_train = rus.fit_resample(X_train, y_train)
    return X_train, y_train
Beispiel #40
0
# Inverse-frequency class weights: each class gets total / (2 * class_count).
# The two-name unpacking requires ``y`` to contain exactly the labels 0 and 1.
# NOTE(review): the weights are computed on the distribution *before* the
# resampling below - confirm they are still wanted once the data is rebalanced.
neg, pos = np.bincount(y)
total = neg + pos
w0 = (1 / neg) * (total) / 2
w1 = (1 / pos) * (total) / 2
weights = {0: w0, 1: w1}

from imblearn.over_sampling import RandomOverSampler, SMOTE
from imblearn.under_sampling import RandomUnderSampler

# Step 1: random over-sampling. Per the imblearn docs a float
# ``sampling_strategy`` is the target minority/majority ratio after resampling.
over = RandomOverSampler(sampling_strategy=0.4)
X, y = over.fit_resample(X, y)

#smote = SMOTE(sampling_strategy = 0.4, random_state = 1)
#X, y = smote.fit_resample(X, y)

# Step 2: random under-sampling restricted to the majority class.
under = RandomUnderSampler(sampling_strategy='majority')
X, y = under.fit_resample(X, y)

# Shuffle rows and labels with the same permutation so they stay aligned.
# Positional indexing here assumes X and y are NumPy arrays - TODO confirm.
shuffler = np.random.permutation(len(X))
X = X[shuffler]
y = y[shuffler]

import keras
import tensorflow as tf
from keras.models import Sequential
from keras.layers import Dense, Dropout
#from sklearn.utils.class_weight import compute_class_weight

METRICS = [
    keras.metrics.TruePositives(name='tp'),
    keras.metrics.FalsePositives(name='fp'),
def run():
    """Load the delay dataset, resample it and evaluate the classifiers.

    Reads ``data/dscm_w.csv``, builds a ``ColumnTransformer`` (imputation and
    one-hot encoding for categorical columns, ``DatetimeEncoder`` for datetime
    columns, standard scaling for the numeric columns), randomly
    under-samples the data, splits it into train/test sets and, for every
    entry of ``classifiers``, calls ``train`` and prints the accuracy, recall,
    average precision and a classification report.

    Side effects: creates the ``models/select`` directory when missing and
    prints progress and results to stdout.

    NOTE(review): resampling happens before the train/test split, so the test
    set is balanced as well - confirm this is the intended evaluation setup.
    """
    flag_columns = [
        'characteristic_B', 'characteristic_C', 'characteristic_D',
        'characteristic_E', 'characteristic_G', 'characteristic_M',
        'characteristic_P', 'characteristic_Q', 'characteristic_R',
        'characteristic_S', 'characteristic_Y', 'characteristic_Z',
        'catering_C', 'catering_F', 'catering_H', 'catering_M', 'catering_R',
        'catering_T', 'mon', 'tue', 'wed', 'thu', 'fri', 'sat', 'sun',
        'freight', 'bank_holiday_running', 'length', 'speed', 'delayed'
    ]

    # Small integer columns are read as uint8, the rest as pandas categoricals.
    dtype = {key: "uint8" for key in flag_columns}

    dtype.update({
        "status": "category",
        "category": "category",
        "power_type": "category",
        "timing_load": "category",
        "seating": "category",
        "sleepers": "category",
        "reservations": "category",
        "ATOC_code": "category",
        "destination_stanox_area": "category",
        "origin_stanox_area": "category"
    })

    start = time.time()

    print("Loading data...", end="")

    df = pd.read_csv("data/dscm_w.csv",
                     index_col=["uid"],
                     parse_dates=["std", "sta", "atd", "ata"],
                     dtype=dtype)

    print(" DONE ({:.2f}s)".format(time.time() - start), end="\n\n")

    print(df.info())

    path = os.path.join("models", "select")

    # makedirs also creates the missing "models" parent; os.mkdir would raise
    # FileNotFoundError on a fresh checkout.
    os.makedirs(path, exist_ok=True)

    Y = df["delayed"]
    X = df.drop(["delay", "delayed", "atd", "ata", "origin", "destination"],
                axis=1)

    categorical_features = X.select_dtypes(include="category").columns.values
    categorical_transformer = Pipeline([
        ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
        ('onehot', OneHotEncoder(handle_unknown='ignore'))
    ])

    datetime_features = X.select_dtypes(include="datetime").columns.values
    datetime_transformer = Pipeline([("cyclical",
                                      DatetimeEncoder(cyclical=True))])

    numeric_features = ["speed", "length", "duration"]
    numerical_transformer = Pipeline([("scaler", StandardScaler())])

    preprocessor = ColumnTransformer([
        ("categorical", categorical_transformer, categorical_features),
        ("datetime", datetime_transformer, datetime_features),
        ("numeric", numerical_transformer, numeric_features)
    ])

    resampler = IPipeline([
        # ('over', SMOTE(sampling_strategy=0.2, random_state=1)),  # Increase minority to 20% of majority
        # sampling_strategy=1.0 is the target minority/majority ratio, i.e.
        # the majority class is reduced to the size of the minority class.
        ('under', RandomUnderSampler(sampling_strategy=1.0, random_state=1)),
    ])

    start = time.time()

    print("\nPreprocessing data...", end="")

    X = preprocessor.fit_transform(X, Y)

    print(" DONE ({:.2f}s)".format(time.time() - start), end="\n\n")

    print(X.shape)

    # Restart the timer so the reported duration covers resampling only,
    # not preprocessing + resampling.
    start = time.time()

    print("\nResampling data...", end="")

    X, Y = resampler.fit_resample(X, Y)

    print(" DONE ({:.2f}s)".format(time.time() - start), end="\n\n")

    print("{}, delayed: {}, not delayed: {}\n".format(X.shape, Y.sum(),
                                                      len(Y) - Y.sum()))

    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, random_state=1)

    classifiers = [
        # LogisticRegression(),
        # RidgeClassifier(),
        # SGDClassifier(),
        # LinearSVC(),
        # DecisionTreeClassifier(),
        # MLPClassifier(),
        # AdaBoostClassifier(),
        # GradientBoostingClassifier(),
        RandomForestClassifier(n_jobs=-1),
    ]

    # Extra metrics reported for every classifier (loop-invariant).
    metrics = [
        recall_score,
        average_precision_score,
    ]

    for clf in classifiers:

        train(clf, path, X_train, Y_train)

        Y_pred = clf.predict(X_test)

        results = {
            "name": clf.__class__.__name__,
            "score": clf.score(X_test, Y_test)
        }

        for m in metrics:

            results[m.__name__] = m(Y_test.values, Y_pred)

        print(clf.__class__.__name__ + "\n")
        print(results)
        print(
            classification_report(Y_test.values,
                                  Y_pred,
                                  target_names=["not delayed", "delayed"]))
Beispiel #42
0
def naieve_undersample(x, y, seed=None):
    """Randomly under-sample ``(x, y)`` with ``RandomUnderSampler``.

    When ``seed`` is ``None`` a seed is drawn with ``random.randint(0, 1000)``;
    otherwise the given seed is used as the sampler's ``random_state``.

    Returns:
        Tuple ``(x_resampled, y_resampled)``.
    """
    chosen_seed = random.randint(0, 1000) if seed is None else seed
    sampler = RandomUnderSampler(random_state=chosen_seed)
    resampled_x, resampled_y = sampler.fit_resample(x, y)
    return resampled_x, resampled_y
Beispiel #43
0
    def trainy(self, testy=0.2, imbl=True):
        """Split, vectorise, scale and optionally under-sample the data.

        Steps:
        1. Stratified train/test split of ``self.X`` / ``self.y``.
        2. Wrap both splits in DataFrames (columns are dropped later on).
        3. For n-gram sizes 1 to 3, fit TF-IDF on the training ``words_only``
           column, reduce it with TruncatedSVD and apply both to the test
           split. The fitted objects are appended to ``self.tfidf_list`` and
           ``self.tsvd_list``.
        4. If ``self.stan`` is set, standardise the non-text features.
        5. Min-max scale all features so that they are non-negative
           (preparation for a later chi2 reduction).
        6. If ``imbl`` is true, randomly under-sample the training split.

        Args:
            testy: Test-set fraction passed to ``train_test_split``.
            imbl: Whether to under-sample the training split.

        Returns:
            ``(X_train, X_test, y_train, y_test)``. The training pair, the
            test features and the fitted scalers are also stored on ``self``.
        """
        # The split lives inside the class because everything downstream
        # (e.g. the TF-IDF vocabulary) depends on the stratified partition.
        X_train, X_test, y_train, y_test = train_test_split(
            self.X,
            self.y,
            random_state=self.random_state,
            test_size=testy,
            stratify=self.y)

        self.y_train = y_train
        self.y_test = y_test

        # Reset to a positional index: the SVD frames built below carry a
        # RangeIndex, and the index-based ``join`` further down would otherwise
        # misalign rows with the shuffled split index when ``self.stan`` is
        # false.
        X_train = pd.DataFrame(
            X_train, columns=self.columns).reset_index(drop=True)
        X_test = pd.DataFrame(
            X_test, columns=self.columns).reset_index(drop=True)
        df_train = pd.DataFrame()
        df_test = pd.DataFrame()
        for i in range(1, 4):
            tfidf = TfidfVectorizer(stop_words='english',
                                    ngram_range=(i, i),
                                    decode_error='replace',
                                    max_features=10000)
            Xword_train = tfidf.fit_transform(X_train['words_only'])
            Xword_test = tfidf.transform(X_test['words_only'])

            # Reduce the TF-IDF matrix; after TruncatedSVD the components can
            # no longer be mapped back to individual words.
            tsvd = TruncatedSVD(n_components=500,
                                algorithm='arpack',
                                random_state=self.random_state)
            Xwordie_train = tsvd.fit_transform(Xword_train)
            Xwordie_test = tsvd.transform(Xword_test)
            # Component columns are named "<ngram size>_<component number>".
            Xwordie_train_df = pd.DataFrame(
                Xwordie_train,
                columns=[
                    str(i) + '_' + str(b)
                    for b in range(1, Xwordie_train.shape[1] + 1)
                ])
            Xwordie_test_df = pd.DataFrame(
                Xwordie_test,
                columns=[
                    str(i) + '_' + str(b)
                    for b in range(1, Xwordie_test.shape[1] + 1)
                ])
            df_train = pd.concat([df_train, Xwordie_train_df], axis=1)
            df_test = pd.concat([df_test, Xwordie_test_df], axis=1)
            self.tfidf_list.append(tfidf)
            self.tsvd_list.append(tsvd)

        X_train.drop(['words_only'], axis=1, inplace=True)
        X_test.drop(['words_only'], axis=1, inplace=True)
        # ``drop`` without ``inplace`` returns a copy, so ``self.X`` is left
        # untouched by the in-place drop below.
        X = self.X.drop(['words_only'], axis=1)
        if self.web:
            web_columns = [
                'n_video', 'n_links', 'n_image', 'n_otherlink',
                'mention_count', 'hashtag_count', 'mbti_ref_count',
                'ennea_count', 'bracket_count'
            ]
            for frame in (X_train, X_test, X):
                frame.drop(web_columns, axis=1, inplace=True)
        self.columns = X_train.columns

        # Standardization step
        # NOTE(review): the scaler is fitted on the full data set (train and
        # test rows) - confirm this is intended.
        if self.stan:
            ss = StandardScaler().fit(X)
            X_train = ss.transform(X_train)
            X_test = ss.transform(X_test)
            X_train = pd.DataFrame(X_train, columns=self.columns)
            X_test = pd.DataFrame(X_test, columns=self.columns)
            self.ss = ss

        # Join step: either keep only the text components or append them to
        # the remaining features.
        if self.include_feature == 'words':
            X_train = df_train
            X_test = df_test
        else:
            X_train = X_train.join(df_train)
            X_test = X_test.join(df_test)
        columnie = X_train.columns

        # Scale again, this time with a min-max scaler fitted on train + test.
        combined_X = pd.concat([X_train, X_test], axis=0)
        mms = MinMaxScaler().fit(combined_X)
        X_train = pd.DataFrame(mms.transform(X_train), columns=columnie)
        X_test = pd.DataFrame(mms.transform(X_test), columns=columnie)

        if imbl:
            imbler = RandomUnderSampler(random_state=42)
            # ``fit_sample`` was removed in imbalanced-learn 0.8; prefer
            # ``fit_resample`` and fall back only on releases that lack it.
            resample = getattr(imbler, 'fit_resample', None)
            if resample is None:
                resample = imbler.fit_sample
            X_train, y_train = resample(X_train, y_train)

        self.X_train = X_train
        self.X_test = X_test
        self.y_train = y_train
        self.mms = mms

        return X_train, X_test, y_train, y_test
Beispiel #44
0
# under-sampling methods.
#
# With the controlled under-sampling methods, the number of samples to be
# selected can be specified.
# :class:`~imblearn.under_sampling.RandomUnderSampler` is the most naive way of
# performing such a selection: it randomly selects a given number of samples
# from each targeted class.

# %%
from imblearn.under_sampling import RandomUnderSampler

X, y = create_dataset(n_samples=400, weights=(0.05, 0.15, 0.8), class_sep=0.8)

# A list rather than a set: iteration order of a set of estimator objects is
# not deterministic, which would shuffle the plot rows from run to run.
samplers = [
    FunctionSampler(),  # identity resampler
    RandomUnderSampler(random_state=0),
]

# One row per sampler: decision function on the left, resampled data on the
# right.
fig, axs = plt.subplots(nrows=2, ncols=2, figsize=(15, 15))
for ax, sampler in zip(axs, samplers):
    model = make_pipeline(sampler, clf).fit(X, y)
    plot_decision_function(
        X, y, model, ax[0], title=f"Decision function with {sampler.__class__.__name__}"
    )
    plot_resampling(X, y, sampler, ax[1])

fig.tight_layout()

# %% [markdown]
# :class:`~imblearn.under_sampling.NearMiss` algorithms implement some
# heuristic rules in order to select samples. NearMiss-1 selects samples from
Beispiel #45
0
def main(args):
    """Run the CFXGB pipeline end to end for the parsed arguments.

    Validates the arguments, loads the JSON model configuration and the
    dataset, optionally sub-samples rows, optionally selects features with
    RFECV, splits into train/test sets, optionally under-samples the training
    split, encodes the data with CFXGB, classifies and logs the confusion
    matrix, classification report, accuracy and AUC.

    Args:
        args: Parsed command-line arguments. The attributes read here are
            ``Dataset``, ``ParentCols``, ``parameters``, ``ignore``,
            ``sample``, ``featureSelect`` and ``RandomSamp``; the whole object
            is also handed to ``CFXGB``.

    Raises:
        SystemExit: With a non-zero status when the dataset, the number of
            parent levels or the parameter file is missing or invalid.
    """
    # Logging
    logger = get_logger("cfxgb")

    # ------------------------------------------------------------------
    # ARGUMENT CHECK
    # ------------------------------------------------------------------
    # Invalid arguments are failures, so exit with a non-zero status
    # (the previous ``exit(0)`` reported success to the calling shell).
    if args.Dataset is None:
        logger.error("Dataset required")
        sys.exit(1)

    if args.ParentCols < 0:
        logger.error("Enter valid levels")
        sys.exit(1)

    if args.parameters is None:
        logger.error("Model Parameters required")
        sys.exit(1)
    else:
        config = load_json(args.parameters)
    logger.info("Loaded JSON")

    logger.info(
        "JSON ----------------------------------------------------------------------------------"
    )
    json1 = json.dumps(config, indent=4, separators=(". ", " = "))
    logger.info(json1)
    logger.info(
        "END OF JSON----------------------------------------------------------------------------"
    )

    # ------------------------------------------------------------------
    # DATASET
    # ------------------------------------------------------------------
    # ``args.Dataset`` is either a path to an existing file or the bare name
    # of a CSV file inside the ``Datasets`` directory.
    if not osp.exists(args.Dataset):
        full_path = osp.join('Datasets', args.Dataset + '.csv')
        if not osp.exists(full_path):
            logger.error("Enter valid Dataset")
            sys.exit(1)
    else:
        full_path = args.Dataset

    logger.info(args.Dataset + " used")
    data = pd.read_csv(full_path)
    if args.ignore:
        logger.info("First column ignored")
        data = data.iloc[:, 1:]

    logger.info("Data Read Complete")

    # ------------------------------------------------------------------
    # Extra Columns
    # ------------------------------------------------------------------
    if args.ParentCols:
        logger.info("{} level(s) of parent nodes will be added. ".format(
            args.ParentCols))
    else:
        logger.info("Parent nodes not considered")

    # ------------------------------------------------------------------
    # Sample
    # ------------------------------------------------------------------
    if args.sample:
        # Per-row weight = size of the row's class (the last column).
        label_col = data.columns[-1]
        weights = data.groupby(label_col)[label_col].transform('count')
        if len(np.unique(weights)) == 1:
            # Use the configured logger (this message used to go to the root
            # ``logging`` module instead).
            logger.info("Equal weights already.")
            data = data.sample(n=args.sample, random_state=0)
        else:
            # Invert the weights so that rows of rarer classes are more
            # likely to be drawn.
            count_total = np.sum(np.unique(weights))
            weights = count_total - weights
            data = data.sample(n=args.sample, weights=weights, random_state=0)
        logger.info("Distribution after sampling : \n{}".format(
            data.iloc[:, -1].value_counts()))

    # ------------------------------------------------------------------
    # X, y
    # ------------------------------------------------------------------
    X = data.iloc[:, :-1]
    y = data.iloc[:, -1]

    # ------------------------------------------------------------------
    # Feature Selection (Initial)
    # ------------------------------------------------------------------
    # NOTE(review): the selector is fitted on the full data set, before the
    # train/test split - confirm this is intended.
    if args.featureSelect:
        logger.info("Feature Selection - Initial")
        clf = XGBClassifier(n_estimators=100,
                            learning_rate=0.3,
                            max_depth=4,
                            verbosity=0,
                            random_state=0,
                            n_jobs=-1)
        rfe = RFECV(clf, step=1, cv=5, verbose=0)
        X = rfe.fit_transform(X, y)

    # ------------------------------------------------------------------
    # TRAIN TEST SPLIT
    # ------------------------------------------------------------------
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=0)  # stratify = y
    logger.info("Train Test Split complete")

    # ------------------------------------------------------------------
    # TRAINING: SAMPLING
    # ------------------------------------------------------------------
    if args.RandomSamp:
        rus = RandomUnderSampler(random_state=0)
        X_train, y_train = rus.fit_resample(X_train, y_train)
        logger.info("Applied Random Under-Sampling")
    else:
        logger.info("No Random Under-Sampling")

    X_train = np.array(X_train)
    y_train = np.array(y_train)
    y_test = np.array(y_test)
    X_test = np.array(X_test)

    # ------------------------------------------------------------------
    # MODEL
    # ------------------------------------------------------------------
    cfxgb = CFXGB(config, args)

    # ------------------------------------------------------------------
    # CASCADED FOREST AS TRANSFORMER
    # ------------------------------------------------------------------
    X_train_enc = cfxgb.get_encoded(X_train, y_train)
    X_test_enc = cfxgb.transform(X_test)

    # Final Transformation
    X_train_enc, X_test_enc = cfxgb.finalTransform(X_train, X_train_enc,
                                                   X_test, X_test_enc)
    logger.info("X_train_enc.shape={}, X_test_enc.shape={}".format(
        X_train_enc.shape, X_test_enc.shape))

    # ------------------------------------------------------------------
    # XGBOOST
    # ------------------------------------------------------------------
    y_pred = cfxgb.classify(X_train_enc, y_train, X_test_enc, y_test)

    logger.info("Confusion Matrix - \n{}".format(
        confusion_matrix(y_test, y_pred)))
    logger.info("\nClassification Report - \n{}".format(
        classification_report(y_test, y_pred)))
    logger.info("Accuracy - {}\n".format(accuracy_score(y_test, y_pred)))
    fpr, tpr, thresholds = metrics.roc_curve(y_test, y_pred)
    logger.info("AUC ")
    auc = metrics.auc(fpr, tpr)
    logger.info(auc)
    # ``t`` is not defined in this function; presumably a module-level start
    # timestamp - TODO confirm.
    logger.info("Time - {}".format(time.time() - t))
    logger.info("Arguments used in this run : {}".format(str(sys.argv)))

    logging.shutdown()
Beispiel #46
0
def under_sampling(X_train, y_train):
    """Randomly under-sample the majority class of a training set.

    Args:
        X_train: Feature matrix to resample.
        y_train: Target labels aligned with ``X_train``.

    Returns:
        Tuple ``(X_train_under, y_train_under)`` with the resampled data.
    """
    sampler = RandomUnderSampler(sampling_strategy='majority', random_state=0)
    # ``fit_sample`` was removed in imbalanced-learn 0.8. ``fit_resample`` is
    # available in every release that accepts ``sampling_strategy`` (>= 0.4).
    X_train_under, y_train_under = sampler.fit_resample(X_train, y_train)
    return X_train_under, y_train_under
Beispiel #47
0
# Convert every series to an array, right-pad each one with zeros up to the
# longest length and stack them into one (n_series, max_len) float matrix.
for i, times in enumerate(time_series_Train):
    time_series_Train[i] = np.array(times)
max_len = max(len(x) for x in time_series_Train)
for i, times in enumerate(time_series_Train):
    time_series_Train[i] = np.pad(times, (0, max_len - len(times)), 'constant')
time_series_Mat = np.zeros((len(time_series_Train), max_len))
for i, times in enumerate(time_series_Train):
    # Whole-row assignment replaces the per-element inner loop, whose loop
    # variable ``time`` shadowed the ``time`` module at module level.
    time_series_Mat[i] = times
features_Train = np.concatenate([features_Train, time_series_Mat], axis=1)

# Append the normalised numeric features as extra columns.
features_Train = np.concatenate([features_Train, num_Norm_Train], axis=1)

from imblearn.over_sampling import SMOTE

# Randomly under-sample the training set; the prints are debug output of the
# target before resampling.
undersample = RandomUnderSampler()
print(target_Train)
print(target_Train.shape)
print(type(target_Train))
features_Train_Resampled, target_Train_Resampled = undersample.fit_resample(
    features_Train, target_Train)
from keras.utils import to_categorical

# One-hot encode the resampled targets.
target_Train_Resampled = to_categorical(target_Train_Resampled)
# NOTE(review): these prints show the original ``target_Train`` again, not
# ``target_Train_Resampled`` - confirm which one was meant to be inspected.
print(target_Train.shape)
print(type(target_Train))
print(target_Train)
print('FEATURES TRAIN')

def create_model():
print(__doc__)

# Generate the dataset
X, y = make_classification(n_classes=2, class_sep=2, weights=[0.1, 0.9],
                           n_informative=3, n_redundant=1, flip_y=0,
                           n_features=20, n_clusters_per_class=1,
                           n_samples=200, random_state=10)

# Instantiate a PCA object for the sake of easy visualisation
pca = PCA(n_components=2)
# Fit and transform X to visualise inside a 2D feature space
X_vis = pca.fit_transform(X)

# Apply the random under-sampling.
# ``return_indices`` was deprecated in imbalanced-learn 0.4 and removed in
# 0.6; the indices of the kept samples are exposed as ``sample_indices_``.
rus = RandomUnderSampler()
X_resampled, y_resampled = rus.fit_resample(X, y)
idx_resampled = rus.sample_indices_
X_res_vis = pca.transform(X_resampled)

fig = plt.figure()
ax = fig.add_subplot(1, 1, 1)

# Samples of the original data that were dropped by the sampler.
idx_samples_removed = np.setdiff1d(np.arange(X_vis.shape[0]),
                                   idx_resampled)

idx_class_0 = y_resampled == 0
plt.scatter(X_res_vis[idx_class_0, 0], X_res_vis[idx_class_0, 1],
            alpha=.8, label='Class #0')
plt.scatter(X_res_vis[~idx_class_0, 0], X_res_vis[~idx_class_0, 1],
            alpha=.8, label='Class #1')
plt.scatter(X_vis[idx_samples_removed, 0], X_vis[idx_samples_removed, 1],
Beispiel #49
0
        2: weights[2],
        3: weights[3],
        4: weights[4]
    }
    over = SMOTE(sampling_strategy=ratio_over, random_state=314)
    X_train, y_train = over.fit_resample(X_train, y_train)

    # undersample samples > average
    ratio_under = {
        0: average_samples,
        1: average_samples,
        2: average_samples,
        3: average_samples,
        4: average_samples
    }
    under = RandomUnderSampler(sampling_strategy=ratio_under, random_state=314)
    X_train, y_train = under.fit_resample(X_train, y_train)
    cv_inner = KFold(n_splits=5, shuffle=True)
    model = KerasClassifier(build_fn=create_model,
                            batch_size=32,
                            epochs=100,
                            verbose=0)
    learning_rate = [0.001, 0.01, 0.1]
    batch_size = [8, 16, 32]
    neurons = [50, 100, 150]
    hidden_layers = [1, 2, 3]
    epochs = [10, 30, 50]
    activation = ['relu', 'tanh', 'sigmoid']
    param_grid = dict(learning_rate=learning_rate,
                      epochs=epochs,
                      batch_size=batch_size,
    def use_parameters(self, X_train, selected_features):
        """Build the default hyper-parameter grid for the search.

        Args:
            X_train: Training features; only used to check for missing
                values via ``X_train.isna()``.
            selected_features: Candidate values for the ``feat__cols`` step.

        Returns:
            A list containing one parameter-grid dict. When ``X_train``
            contains missing values the dict additionally holds the
            ``imputer__strategy`` candidates.
        """

        test_scaler = [
            StandardScaler(),
            RobustScaler(),
            QuantileTransformer(),
            Normalizer()
        ]
        test_sampling = [
            modelutil.Nosampler(),
            ClusterCentroids(),
            RandomUnderSampler(),
            # NearMiss(version=1),
            # EditedNearestNeighbours(),
            # AllKNN(),
            # CondensedNearestNeighbour(random_state=0),
            # InstanceHardnessThreshold(random_state=0,
            #                          estimator=LogisticRegression(solver='lbfgs', multi_class='auto')),
            RandomOverSampler(random_state=0),
            SMOTE(),
            BorderlineSMOTE(),
            SMOTEENN(),
            SMOTETomek(),
            ADASYN()
        ]

        parameters = [{
            'scaler': test_scaler,
            'sampling': test_sampling,
            'feat__cols': selected_features,
            'model__n_neighbors': [13, 15, 21, 25],
            'model__weights': ['uniform', 'distance']
        }]

        # If no missing values, only one imputer strategy shall be used
        if X_train.isna().sum().sum() > 0:
            # ``parameters`` is a list holding a single grid dict, so the key
            # must be set on that dict; indexing the list with a string
            # raised TypeError.
            parameters[0]['imputer__strategy'] = [
                'mean', 'median', 'most_frequent'
            ]
            print("Missing values used. Test different imputer strategies")
        else:
            print("No missing values. No imputer necessary")

            print("Selected Parameters: ", parameters)
        print("Parameters defined in the input: ", parameters)

        return parameters
# Panel 1: the data as generated.
plt1.set_title('Original data')
plt1.scatter(X[:, 0], X[:, 1], marker='o', s=25, edgecolor='k')

# Keep only the first n1 / n2 / n3 points of classes 0 / 1 / 2 so that the
# classes end up with different sizes.
X = np.vstack((X[y == 0][:n1], X[y == 1][:n2], X[y == 2][:n3]))


# Matching labels, built as column vectors: newy has shape (n1 + n2 + n3, 1).
newy = np.concatenate((np.full((n1,1),0), np.full((n2,1),1), np.full((n3,1),2)))


# Panel 2: the reduced data, one colour per class.
colors = ['#ef8a62' if v == 0 else '#f7f7f7' if v == 1 else '#67a9cf' for v in newy]		
plt2.set_title('Different density data')
plt2.scatter(X[:, 0], X[:, 1], marker='o',c=colors,  s=25, edgecolor='k')


# Panel 3: the data after random under-sampling; the shapes before and after
# are printed for comparison.
sampler = RandomUnderSampler(random_state=0)
X_res, y_res = sampler.fit_resample(X, newy)
print(X.shape)
print(X_res.shape)
colors = ['#ef8a62' if v == 0 else '#f7f7f7' if v == 1 else '#67a9cf' for v in y_res]
plt3.set_title('Undersampled data')
plt3.scatter(X_res[:, 0], X_res[:, 1], c=colors, linewidth=0.5, edgecolor='black')


# n_neighbors=len(X): query the distances from every point to all points.
NN = NearestNeighbors(n_neighbors=len(X)).fit(X)
distances, indices = NN.kneighbors(X)

print(distances)


plt4.set_title('minPts elbow')
Beispiel #52
0
    def sample_data(self,
                    sampling_method: str,
                    X_train,
                    Y_train,
                    base_file_name,
                    target_column="star_rating"):
        """
        Creates sampler based in sampling method and return the resulting X and y

        This method will also save the final distribution to a CSV file based on base_file_name

        :param sampling_method: one of "smote", "adasyn", "random_over_sampling",
            "random_under_sampling" or "nearmiss2"
        :param X_train: Original features (a DataFrame; its columns are reused
            for the resampled frame)
        :param Y_train: Original labels (pandas object holding target_column)
        :param base_file_name: base file name to save the final distribution csv
        :param target_column: name of the label column
        :return: tuple (X_train, Y_train) of resampled DataFrames
        :raises ValueError: if sampling_method is not supported
        """
        ## if we want to over sample or under sample
        log.debug(f'Y_train {Y_train.shape}')
        log.debug(f'Y_train {Y_train.head()}')

        grouped_df = Y_train.reset_index().groupby(target_column).count()

        log.info(
            f'Distribution before sampling with {sampling_method}\n{grouped_df}'
        )
        log.debug(f'grouped type: {type(grouped_df)}')
        log.debug(f'grouped: {grouped_df.head()}')
        log.debug(f'grouped: {grouped_df.shape}')

        if sampling_method == "smote":
            sampler = SMOTE(random_state=RSTATE,
                            sampling_strategy='not majority',
                            n_jobs=self.n_jobs)
        elif sampling_method == "adasyn":
            sampler = ADASYN(random_state=RSTATE,
                             sampling_strategy='not majority',
                             n_jobs=self.n_jobs)
        elif sampling_method == "random_over_sampling":
            sampler = RandomOverSampler(random_state=RSTATE,
                                        sampling_strategy='not majority')
        elif sampling_method == "random_under_sampling":
            sampler = RandomUnderSampler(random_state=RSTATE, replacement=True)
        elif sampling_method == "nearmiss2":
            # NearMiss is deterministic: its ``random_state`` argument was
            # deprecated in imbalanced-learn 0.4 and removed in 0.6, so it is
            # no longer passed.
            sampler = NearMiss(sampling_strategy='not minority',
                               version=2,
                               n_jobs=self.n_jobs)
        else:
            # ValueError is a subclass of Exception, so existing handlers
            # that catch Exception keep working.
            raise ValueError(
                f"Sampling method not supported: {sampling_method}")

        # ``.values.ravel()`` yields the same flat ndarray as the deprecated
        # ``Series.ravel()`` and also works for a single-column DataFrame.
        X_train_res, Y_train_res = sampler.fit_resample(
            X_train, Y_train.values.ravel())

        X_train = pd.DataFrame(X_train_res, columns=X_train.columns)
        Y_train = pd.DataFrame(Y_train_res, columns=[target_column])

        # get distribution of samples after sampling
        dist = Y_train.reset_index().groupby(target_column).count()

        log.info(f'Distribution after sampling with {sampling_method}\n{dist}')

        log.debug(dist.head())
        dist.to_csv(
            f'{REPORT_DIR}/{base_file_name}-histogram-{sampling_method}.csv')
        return X_train, Y_train
Beispiel #53
0
 def fit(self, X, y):
     """Fit an under-sampler that targets ``self.pos_ratio`` positives.

     Every sample labelled 1 is kept; the number of class-0 samples to keep
     is derived from the positive count and ``self.pos_ratio``. The fitted
     sampler is stored in ``self.ratio_sampler``.

     Returns:
         ``self``.
     """
     n_positive = len(y[y == 1])
     negatives_per_positive = (1 - self.pos_ratio) / self.pos_ratio
     n_negative = int(n_positive * negatives_per_positive)
     target_counts = {0: n_negative, 1: n_positive}
     self.ratio_sampler = RandomUnderSampler(random_state=self.random_state,
                                             ratio=target_counts)
     self.ratio_sampler.fit(X, y)
     return self