def test_pipeline_sample():
    """Check that a pipeline whose last step is a sampler can ``sample``.

    The pipeline result is compared with ``Pipeline.fit_sample`` and with the
    bare sampler, first for a sampler-only pipeline and then for a pipeline
    with a PCA step in front of the sampler.
    """
    X, y = make_classification(
        n_classes=2,
        class_sep=2,
        weights=[0.1, 0.9],
        n_informative=3,
        n_redundant=1,
        flip_y=0,
        n_features=20,
        n_clusters_per_class=1,
        n_samples=5000,
        random_state=0)
    rus = RandomUnderSampler(random_state=0)

    # Sampler-only pipeline: fit().sample(), fit_sample() and the bare
    # sampler must all produce the same data.
    pipeline = Pipeline([('rus', rus)])
    X_ref, y_ref = pipeline.fit(X, y).sample(X, y)
    X_pipe, y_pipe = pipeline.fit_sample(X, y)
    X_bare, y_bare = rus.fit_sample(X, y)
    for X_other in (X_pipe, X_bare):
        assert_array_almost_equal(X_ref, X_other)
    for y_other in (y_pipe, y_bare):
        assert_array_almost_equal(y_ref, y_other)

    # PCA followed by the sampler: the pipeline must match applying both
    # steps by hand.
    pca = PCA()
    pipeline = Pipeline([('pca', pca), ('rus', rus)])
    X_ref, y_ref = pipeline.fit(X, y).sample(X, y)
    X_projected = pca.fit_transform(X)
    X_manual, y_manual = rus.fit_sample(X_projected, y)
    assert_array_almost_equal(X_ref, X_manual)
    assert_array_almost_equal(y_ref, y_manual)
def downsample(self):
    """Balance class data based on outcome.

    Under-samples ``self.X`` / ``self.y`` in place with a fixed seed,
    appends the retained sample indices to the file ``downsampled_idx`` and
    the resampled labels to ``downsampled_y``, and refreshes ``self.Xview``
    with the first ``self.n_features`` columns of the resampled data.
    """
    print('Current outcome sampling {}'.format(Counter(self.y)))

    # The seed is fixed so the draw is reproducible; return_indices=True makes
    # fit_sample also return the indices of the retained samples.
    # (Use RandomUnderSampler() without arguments for a non-fixed seed.)
    rus = RandomUnderSampler(random_state=0, return_indices=True)
    self.X, self.y, ds_idx = rus.fit_sample(self.X, self.y)

    # Append to the log files; the context managers close the handles even
    # if a write fails.
    with open('downsampled_idx', 'a') as idx_log:
        idx_log.write(str(ds_idx) + '\n')
    with open('downsampled_y', 'a') as y_log:
        y_log.write(str(self.y) + '\n')

    self.Xview = self.X.view()[:, :self.n_features]
    print('Resampled dataset shape {}'.format(Counter(self.y)))
def undersample(X, y, bal_strategy):
    """Under-sample ``(X, y)`` according to ``bal_strategy``.

    Parameters
    ----------
    X, y : array-like objects exposing ``.shape``
        Features and labels to balance.
    bal_strategy : str
        ``"RANDOM"`` applies ``RandomUnderSampler``, ``"TOMEK"`` applies
        ``TomekLinks``, ``"ALL"`` applies random under-sampling followed by
        Tomek links cleaning, and ``"NONE"`` returns the inputs untouched.

    Returns
    -------
    tuple
        ``(X_sampled, y_sampled)``.

    Notes
    -----
    Any other strategy prints a message and terminates the process via
    ``sys.exit(1)``.
    """
    # Single-argument print(...) calls behave the same under Python 2 and 3.
    print('Shape of X:  {}'.format(X.shape))
    print('Shape of y_Train:  {}'.format(y.shape))

    if bal_strategy not in ('ALL', 'RANDOM', 'TOMEK', 'NONE'):
        print('bal_stragegy not in ALL, RANDOM, TOMEK, NONE')
        sys.exit(1)

    X_sampled, y_sampled = X, y

    # Independent `if`s (not if/elif) so that "ALL" really runs both steps;
    # with an elif chain the Tomek step was unreachable for "ALL".
    if bal_strategy in ('RANDOM', 'ALL'):
        # apply random under-sampling
        rus = RandomUnderSampler()
        X_sampled, y_sampled = rus.fit_sample(X_sampled, y_sampled)
    if bal_strategy in ('TOMEK', 'ALL'):
        # Apply Tomek Links cleaning
        tl = TomekLinks()
        X_sampled, y_sampled = tl.fit_sample(X_sampled, y_sampled)

    print('Shape of X_sampled:  {}'.format(X_sampled.shape))
    print('Shape of y_sampled:  {}'.format(y_sampled.shape))
    return (X_sampled, y_sampled)
def transform(self, X, y=None):
    """Return an under-sampled copy of the dataframe ``X``.

    The column named by ``self.predicted_column`` is used as the label; the
    remaining columns are the features. The incoming ``y`` argument is
    ignored and replaced by that column. The result is a new dataframe with
    the original feature column names plus the predicted column.
    """
    # TODO how do we validate this happens before train/test split? Or do we need to? Can we implement it in the
    # TODO simple trainer in the correct order and leave this to advanced users?
    target_name = self.predicted_column

    # Split the frame into labels and features.
    y = np.squeeze(X[[target_name]])
    features = X.drop([target_name], axis=1)

    # Under-sample with the configured seed.
    sampler = RandomUnderSampler(random_state=self.random_seed)
    sampled_features, sampled_labels = sampler.fit_sample(features, y)

    # Rebuild a dataframe carrying the original feature names and labels.
    result = pd.DataFrame(sampled_features)
    result.columns = features.columns
    result[target_name] = pd.Series(sampled_labels)
    return result
def downsample(self):
    """Balance class data based on outcome.

    Randomly under-samples ``self.X`` / ``self.y`` in place (no fixed seed)
    and refreshes ``self.Xview`` with the first ``self.n_features`` columns.
    """
    print('Current outcome sampling {}'.format(Counter(self.y)))

    balanced_X, balanced_y = RandomUnderSampler().fit_sample(self.X, self.y)
    self.X = balanced_X
    self.y = balanced_y

    feature_view = self.X.view()
    self.Xview = feature_view[:, :self.n_features]
    print('Resampled dataset shape {}'.format(Counter(self.y)))
def test_rus_sample_wrong_X():
    """Sampling data that differs from the fitted data must raise RuntimeError."""
    sampler = RandomUnderSampler(random_state=RND_SEED)
    sampler.fit(X, Y)

    # Data of shape (100, 40) that was not used for fitting.
    other_X = np.random.random((100, 40))
    other_y = np.array([0] * 50 + [1] * 50)
    assert_raises(RuntimeError, sampler.sample, other_X, other_y)
def test_random_under_sampling_heterogeneous_data():
    """Under-sampling must accept an object-dtype array mixing str/int/float.

    Three samples (two of class 0, one of class 1) are reduced to two, and
    the resampled ``X`` must keep the ``object`` dtype.
    """
    # The builtin `object` replaces the `np.object` alias, which newer NumPy
    # releases no longer provide.
    X_hetero = np.array([['xxx', 1, 1.0],
                         ['yyy', 2, 2.0],
                         ['zzz', 3, 3.0]], dtype=object)
    y = np.array([0, 0, 1])
    rus = RandomUnderSampler(random_state=RND_SEED)
    X_res, y_res = rus.fit_resample(X_hetero, y)

    assert X_res.shape[0] == 2
    assert y_res.shape[0] == 2
    assert X_res.dtype == object
def test_multiclass_fit_sample():
    """After under-sampling a three-class target every class has two samples."""
    # Turn two entries of the shared target into a third class.
    y = Y.copy()
    for position in (5, 6):
        y[position] = 2

    sampler = RandomUnderSampler(random_state=RND_SEED)
    _, y_resampled = sampler.fit_sample(X, y)

    class_counts = Counter(y_resampled)
    assert class_counts[0] == 2
    assert class_counts[1] == 2
    assert class_counts[2] == 2
def test_rus_fit_resample():
    """Under-sample with replacement and compare with the reference output."""
    expected_X = np.array([[0.92923648, 0.76103773],
                           [0.47104475, 0.44386323],
                           [0.13347175, 0.12167502],
                           [0.09125309, -0.85409574],
                           [0.12372842, 0.6536186],
                           [0.04352327, -0.20515826]])
    expected_y = np.array([0, 0, 0, 1, 1, 1])

    sampler = RandomUnderSampler(random_state=RND_SEED, replacement=True)
    sampled_X, sampled_y = sampler.fit_resample(X, Y)

    assert_array_equal(sampled_X, expected_X)
    assert_array_equal(sampled_y, expected_y)
def test_rus_fit_sample():
    """Test the fit sample routine against the stored reference arrays.

    The references are ``rus_x.npy`` and ``rus_y.npy`` in the ``data``
    directory next to this file.
    """
    sampler = RandomUnderSampler(random_state=RND_SEED)
    sampled_X, sampled_y = sampler.fit_sample(X, Y)

    data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'data')
    expected_X = np.load(os.path.join(data_dir, 'rus_x.npy'))
    expected_y = np.load(os.path.join(data_dir, 'rus_y.npy'))

    assert_array_equal(sampled_X, expected_X)
    assert_array_equal(sampled_y, expected_y)
def test_rus_fit():
    """Test the fitting method.

    After ``fit`` the sampler must expose ``min_c_``, ``maj_c_`` and
    ``stats_c_`` with the values expected for the shared ``X`` / ``Y``.
    """
    sampler = RandomUnderSampler(random_state=RND_SEED)
    sampler.fit(X, Y)

    # Values recorded by fit for the shared fixture.
    assert_equal(sampler.min_c_, 0)
    assert_equal(sampler.maj_c_, 1)
    for label, expected_count in ((0, 3), (1, 7)):
        assert_equal(sampler.stats_c_[label], expected_count)
def test_rus_fit_sample_with_indices():
    """Test the fit sample routine with indices support.

    Data, labels and indices returned by the sampler are compared with the
    ``rus_x.npy``, ``rus_y.npy`` and ``rus_idx.npy`` reference files stored
    in the ``data`` directory next to this file.
    """
    sampler = RandomUnderSampler(return_indices=True, random_state=RND_SEED)
    sampled_X, sampled_y, kept_idx = sampler.fit_sample(X, Y)

    data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'data')
    expected_X = np.load(os.path.join(data_dir, 'rus_x.npy'))
    expected_y = np.load(os.path.join(data_dir, 'rus_y.npy'))
    expected_idx = np.load(os.path.join(data_dir, 'rus_idx.npy'))

    assert_array_equal(sampled_X, expected_X)
    assert_array_equal(sampled_y, expected_y)
    assert_array_equal(kept_idx, expected_idx)
def test_rus_fit_sample_with_indices():
    """Test the fit sample routine with indices support.

    The resampled data, labels and retained indices are compared with
    inline reference values.
    """
    expected_X = np.array([[0.92923648, 0.76103773],
                           [0.47104475, 0.44386323],
                           [0.13347175, 0.12167502],
                           [0.09125309, -0.85409574],
                           [0.12372842, 0.6536186],
                           [0.04352327, -0.20515826]])
    expected_y = np.array([0, 0, 0, 1, 1, 1])
    expected_idx = np.array([1, 3, 8, 6, 7, 0])

    sampler = RandomUnderSampler(return_indices=True, random_state=RND_SEED)
    sampled_X, sampled_y, kept_idx = sampler.fit_sample(X, Y)

    assert_array_equal(sampled_X, expected_X)
    assert_array_equal(sampled_y, expected_y)
    assert_array_equal(kept_idx, expected_idx)
def CrossVal(estimator, X, y, procsessor=None, cv=3, times=10, random_state=0, imb=False):
    """Repeated stratified k-fold cross-validation.

    estimator: the model
    X: feature part of the data set
    y: labels of the data set
    procsessor: optional preprocessor (in practice a feature selector); it is
        fitted on each training fold and its ``transform(X, y)`` must return
        an ``(X, y)`` pair
    cv: number of folds per cross-validation run
    times: number of times the cross-validation is repeated
    random_state: base random seed; repetition ``t`` uses ``random_state + t``
    imb: whether to balance the classes of each training fold. The code
        does this with ``RandomUnderSampler``, not SMOTE.

    Returns a numpy array holding one ``Metrics.Score`` result per fold and
    repetition.
    """
    scores = []
    for repeat in range(times):
        seed = random_state + repeat
        splitter = StratifiedKFold(n_splits=cv, shuffle=True, random_state=seed)
        folds = list(splitter.split(X=X, y=y))
        for train_idx, test_idx in folds:
            x_train, y_train = X[train_idx], y[train_idx]
            x_test, y_test = X[test_idx], y[test_idx]

            # Equality test kept on purpose: only values equal to True
            # enable balancing, exactly as before.
            if imb == True:  # noqa: E712
                # The counts are not used below; the call is kept so the
                # behaviour is unchanged.
                _count_a, _count_b = __lableCount(y_train)
                sampler = RandomUnderSampler(random_state=seed)
                x_train, y_train = sampler.fit_sample(x_train, y_train)

            if procsessor is not None:
                procsessor.fit(x_train, y_train)
                x_train, y_train = procsessor.transform(x_train, y_train)
                x_test, y_test = procsessor.transform(x_test, y_test)

            estimator.fit(x_train, y_train)
            scores.append(Metrics.Score(estimator, x_test, y_test))
    return np.array(scores)
def test_rus_fit_resample_half():
    """Under-sample with replacement to per-class targets ``{0: 3, 1: 6}``."""
    expected_X = np.array([[0.92923648, 0.76103773],
                           [0.47104475, 0.44386323],
                           [0.92923648, 0.76103773],
                           [0.15490546, 0.3130677],
                           [0.15490546, 0.3130677],
                           [0.15490546, 0.3130677],
                           [0.20792588, 1.49407907],
                           [0.15490546, 0.3130677],
                           [0.12372842, 0.6536186]])
    expected_y = np.array([0, 0, 0, 1, 1, 1, 1, 1, 1])

    # Requested number of samples per class after resampling.
    sampling_strategy = {0: 3, 1: 6}
    sampler = RandomUnderSampler(
        sampling_strategy=sampling_strategy,
        random_state=RND_SEED,
        replacement=True)
    sampled_X, sampled_y = sampler.fit_resample(X, Y)

    assert_array_equal(sampled_X, expected_X)
    assert_array_equal(sampled_y, expected_y)
def test_rus_fit_sample_half():
    """Test the fit sample routine with a 0.5 ratio."""
    expected_X = np.array([[0.92923648, 0.76103773],
                           [0.47104475, 0.44386323],
                           [0.13347175, 0.12167502],
                           [0.09125309, -0.85409574],
                           [0.12372842, 0.6536186],
                           [0.04352327, -0.20515826],
                           [0.15490546, 0.3130677],
                           [0.15490546, 0.3130677],
                           [0.15490546, 0.3130677]])
    expected_y = np.array([0, 0, 0, 1, 1, 1, 1, 1, 1])

    sampler = RandomUnderSampler(ratio=0.5, random_state=RND_SEED)
    sampled_X, sampled_y = sampler.fit_sample(X, Y)

    assert_array_equal(sampled_X, expected_X)
    assert_array_equal(sampled_y, expected_y)
def test_multiclass_fit_sample():
    """Test fit sample method with multiclass target."""
    # Make the target multiclass by relabelling two entries as class 2.
    multiclass_y = Y.copy()
    multiclass_y[5] = 2
    multiclass_y[6] = 2

    sampler = RandomUnderSampler(random_state=RND_SEED)
    _, sampled_y = sampler.fit_sample(X, multiclass_y)

    # Every class must have two samples after resampling.
    class_counts = Counter(sampled_y)
    assert_equal(class_counts[0], 2)
    assert_equal(class_counts[1], 2)
    assert_equal(class_counts[2], 2)
def test_pipeline_sample():
    """Check that a pipeline whose last step is a sampler can ``sample``.

    The pipeline result is compared (within ``R_TOL``) with
    ``Pipeline.fit_sample`` and with the bare sampler, first for a
    sampler-only pipeline and then for a pipeline with a PCA step in front.
    """
    X, y = make_classification(
        n_classes=2,
        class_sep=2,
        weights=[0.1, 0.9],
        n_informative=3,
        n_redundant=1,
        flip_y=0,
        n_features=20,
        n_clusters_per_class=1,
        n_samples=5000,
        random_state=0)
    rus = RandomUnderSampler(random_state=0)

    # Sampler-only pipeline: all three ways of resampling must agree.
    pipeline = Pipeline([('rus', rus)])
    X_ref, y_ref = pipeline.fit(X, y).sample(X, y)
    X_pipe, y_pipe = pipeline.fit_sample(X, y)
    X_bare, y_bare = rus.fit_sample(X, y)
    for X_other in (X_pipe, X_bare):
        assert_allclose(X_ref, X_other, rtol=R_TOL)
    for y_other in (y_pipe, y_bare):
        assert_allclose(y_ref, y_other, rtol=R_TOL)

    # PCA + sampler. The pipeline gets its own PCA instance; `pca` is used
    # for the manual reference computation.
    pca = PCA()
    pipeline = Pipeline([('pca', PCA()), ('rus', rus)])
    X_ref, y_ref = pipeline.fit(X, y).sample(X, y)
    X_projected = pca.fit_transform(X)
    X_manual, y_manual = rus.fit_sample(X_projected, y)

    # We round the values near zero. It seems that PCA has some issue
    # with that.
    for values in (X_ref, X_manual):
        values[np.abs(values) < R_TOL] = 0
    assert_allclose(X_ref, X_manual, rtol=R_TOL)
    assert_allclose(y_ref, y_manual, rtol=R_TOL)
'objective': 'binary:logistic', 'eval_metric': 'auc', 'learning_rate': 0.1, 'min_child_weight': 1, 'max_depth': 9, 'gamma': 0.05, 'lambda': 10, 'silent': 1 } # SMOTE over-sampling process, where '19000' is the size of positive (or negative) samples print(" SMOTE begin...") print(" ...") SMOTE_params = SMOTE(ratio={1: 19000}, random_state=0) train_X_SMOTED, train_y_SMOTED = SMOTE_params.fit_sample(train_X, train_y) rus = RandomUnderSampler(ratio={0: 19000}, random_state=0) train_X_SMOTE, train_y_SMOTE = rus.fit_sample(train_X_SMOTED, train_y_SMOTED) print(sorted(Counter(train_y_SMOTE).items())) print(" SMOTE end.") # build the prediction model by XGBoost print(" Training Begin...") dtrain = xgb.DMatrix(train_X_SMOTE, label=train_y_SMOTE) dtest = xgb.DMatrix(test_X) watchlist = [(dtrain, 'train')] bst = xgb.train(params, dtrain, num_boost_round=200, evals=watchlist) print(" Training End.") # output the probability value label of the prediction results (range between 0 and 1) and the AUC of the prediction results print(" Testing Begin...") ypred = bst.predict(dtest)
test_size=0.2, random_state=23, shuffle=True, stratify=data_dict['y_train']) # ### Obtain undersampled dataset # # Undersample the data that will be used for training. We do not undersample the mock testing set as we want to keep the distribution of the classes close to the distribution of the original dataset. # In[97]: from imblearn.under_sampling import RandomUnderSampler # In[98]: rus = RandomUnderSampler(random_state=0) # In[99]: X_train_under, y_train_under = rus.fit_resample(X_train, y_train) # In[100]: data_dict_under = {'y_train': y_train_under} plot_target_frequency(data_dict_under) # ### Prepare Inputs # # We convert the dataframes into numpy ndarrays. # In[101]:
# Bar chart of the density similarity per feature, highest value first.
sortSimilarity = pd.Series(densitySimilarity, namesToPlot).sort_values(ascending=False)
plt.figure(3, figsize=(6,10))
sortSimilarity.plot(kind='bar')
plt.ylabel('density similarity')

# Features are the plotted columns; the label is the 'Class' column.
X = df[namesToPlot]
y = df['Class']

# NOTE(review): the scaler is fitted on the full data set before the
# train/test split, so test-set statistics influence the scaling - confirm
# that this is intended.
sd = StandardScaler()
X = sd.fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify =y)

# Only the training split is under-sampled; the test split is left as is.
rus = RandomUnderSampler(random_state=1)
X_train, y_train = rus.fit_resample(X_train, y_train)

# Baseline logistic regression, evaluated on the untouched test split.
clf_lr_base = LogisticRegression(class_weight='balanced', solver='saga', max_iter=5000)
clf_lr_base.fit(X_train, y_train)
y_pred_lr_base = clf_lr_base.predict(X_test)
print(classification_report(y_test, y_pred_lr_base))
print(confusion_matrix(y_test, y_pred_lr_base))
print(balanced_accuracy_score(y_test, y_pred_lr_base))

# Grid search over C for an L1-penalised model, scored by balanced accuracy.
parameter = {'C': np.logspace(-6, 2, 10)}
gs = GridSearchCV(LogisticRegression(solver='saga', max_iter=5000, penalty='l1', class_weight='balanced'), parameter, scoring='balanced_accuracy')
gs.fit(X_train, y_train)
# Convert these columns to the pandas 'category' dtype.
pointDF["invalid_state"] = pointDF["invalid_state"].astype('category')
pointDF["pdh0"] = pointDF["pdh0"].astype('category')
pointDF["vx_rms"] = pointDF["vx_rms"].astype('category')
pointDF["vy_rms"] = pointDF["vy_rms"].astype('category')

# Feature columns and label column.
X = pointDF[[ 'x', 'y', 'dyn_prop', 'rcs', 'vx_comp', 'vy_comp', 'ambig_state', 'x_rms', 'y_rms', 'invalid_state', 'pdh0', 'vx_rms', 'vy_rms' ]]
y = pointDF['BasicCategoryNum']

# NOTE(review): time.clock() was removed in Python 3.8; time.perf_counter()
# is the usual replacement. It is left unchanged here because the matching
# end-of-timing call is outside this excerpt.
start = time.clock()

# Remove passenger car samples randomly (class 4 reduced to 75000 samples).
desiredSampleCounts = {4: 75000}
rus = RandomUnderSampler(sampling_strategy=desiredSampleCounts)
X_undersampled, y_undersampled = rus.fit_resample(X, y)

# Remove tractor samples randomly (class 6 reduced to 75000 samples),
# starting from the output of the previous step.
desiredSampleCounts = {6: 75000}
rus = RandomUnderSampler(sampling_strategy=desiredSampleCounts)
X_undersampled, y_undersampled = rus.fit_resample(X_undersampled, y_undersampled)
print(np.bincount(y_undersampled))

# Remove Tomek pairs from the under-sampled data.
underSampleObj = TomekLinks(sampling_strategy='all', n_jobs=5)
X_undersampledTomek, y_undersampledTomek = underSampleObj.fit_resample( X_undersampled, y_undersampled)
print(np.bincount(y_undersampledTomek))
# Random-forest based feature selection.
model = RandomForestClassifier(n_estimators=1000, n_jobs=-1)
model.fit(X_train_prepared, y_train)

sel = SelectFromModel(model)
# Fit the selector on the training split only: fitting it on the test split
# would leak test information into the choice of features.
sel.fit(X_train_prepared, y_train)
selected_feat = X_train.columns[(sel.get_support())]

# +
# Dealing with imbalanced data
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline

# Combined over-/under-sampling pipeline. It is built but currently not
# applied (see the commented-out call below); plain SMOTE is used instead.
over = SMOTE(sampling_strategy=0.2)
under = RandomUnderSampler(sampling_strategy=0.6)
steps = [('o', over), ('u', under)]
pipeline = Pipeline(steps=steps)
#X_train_prepared, y_train = pipeline.fit_resample(X_train_prepared, y_train)

over_sample = SMOTE()
X_train_prepared, y_train = over_sample.fit_resample(X_train_prepared, y_train)

# +
#display(X_train_prepared.shape)
#display(y_train.shape)
# -

# Class frequencies of the resampled training labels.
plt.figure(figsize=(5, 5))
splot = sns.countplot(data=y_train, x='Bankrupt?', palette='Blues')
from imblearn.under_sampling import RandomUnderSampler # Generate the dataset X, y = make_classification(n_classes=2, class_sep=2, weights=[0.1, 0.9], n_informative=3, n_redundant=1, flip_y=0, n_features=20, n_clusters_per_class=1, n_samples=5000, random_state=10) # Instanciate a PCA object for the sake of easy visualisation pca = PCA(n_components=2) # Fit and transform x to visualise inside a 2D feature space X_vis = pca.fit_transform(X) # Apply the random under-sampling rus = RandomUnderSampler() X_resampled, y_resampled = rus.fit_sample(X, y) X_res_vis = pca.transform(X_resampled) # Two subplots, unpack the axes array immediately f, (ax1, ax2) = plt.subplots(1, 2) ax1.scatter(X_vis[y == 0, 0], X_vis[y == 0, 1], label="Class #0", alpha=0.5, edgecolor=almost_black, facecolor=palette[0], linewidth=0.15) ax1.scatter(X_vis[y == 1, 0], X_vis[y == 1, 1], label="Class #1", alpha=0.5, edgecolor=almost_black, facecolor=palette[2], linewidth=0.15) ax1.set_title('Original set') ax2.scatter(X_res_vis[y_resampled == 0, 0], X_res_vis[y_resampled == 0, 1], label="Class #0", alpha=.5, edgecolor=almost_black, facecolor=palette[0], linewidth=0.15)
# .................................... # # ``sampling_strategy`` can be given a ``float``. For **under-sampling # methods**, it corresponds to the ratio :math:`\\alpha_{us}` defined by # :math:`N_{rM} = \\alpha_{us} \\times N_{m}` where :math:`N_{rM}` and # :math:`N_{m}` are the number of samples in the majority class after # resampling and the number of samples in the minority class, respectively. # select only 2 classes since the ratio make sense in this case binary_mask = np.bitwise_or(y == 0, y == 2) binary_y = y[binary_mask] binary_X = X[binary_mask] sampling_strategy = 0.8 rus = RandomUnderSampler(sampling_strategy=sampling_strategy) X_res, y_res = rus.fit_resample(binary_X, binary_y) print('Information of the iris data set after making it ' 'balanced using a float and an under-sampling method: \n ' 'sampling_strategy={} \n y: {}' .format(sampling_strategy, Counter(y_res))) plot_pie(y_res) ############################################################################### # For **over-sampling methods**, it correspond to the ratio # :math:`\\alpha_{os}` defined by :math:`N_{rm} = \\alpha_{os} \\times N_{M}` # where :math:`N_{rm}` and :math:`N_{M}` are the number of samples in the # minority class after resampling and the number of samples in the majority # class, respectively. ros = RandomOverSampler(sampling_strategy=sampling_strategy)
def run_training(fold_):
    """Train and evaluate the resampled XGBoost random-forest model on one fold.

    The data set is read from ``../input/tiny_data/full_data_folds.h5``.
    Rows whose ``kfold`` differs from ``fold_`` form the training set, the
    remaining rows the evaluation set. The target is reduced to
    ``'open'`` / ``'closed'`` and label-encoded, and the training set is
    rebalanced with SMOTE followed by random under-sampling before fitting.

    Parameters
    ----------
    fold_
        Value compared with the ``kfold`` column to select the held-out rows.

    Returns
    -------
    tuple
        ``(predictions, roc_auc)``. ``predictions`` is the held-out frame
        restricted to the columns ``id``, ``target``, ``kfold`` and
        ``xgb_pred_n`` (column 1 of ``predict_proba``); ``roc_auc`` is the
        ROC AUC of those probabilities on the held-out rows.
    """
    t0 = time.time()
    df = pd.read_hdf(path_or_buf="../input/tiny_data/full_data_folds.h5",
                     key='dataset')
    print("time to read file", time.time() - t0)
    print(f"fold: {fold_}")

    # The second timer covers preprocessing, resampling, fitting and
    # prediction.
    t0 = time.time()
    train_df = df[df.kfold != fold_].reset_index(drop=True)
    test_df = df[df.kfold == fold_].reset_index(drop=True)

    # Features: every column except the fold id and the target.
    xtrain = train_df.drop(["kfold", "target"], axis=1)
    xtest = test_df.drop(["kfold", "target"], axis=1)

    # Make the target binary: 'open' stays, everything else is 'closed'.
    def to_binary(value):
        return 'open' if value == 'open' else 'closed'

    train_df["target"] = train_df["target"].apply(to_binary)
    test_df["target"] = test_df["target"].apply(to_binary)

    # Encode labels; the encoder is fitted on the training labels only.
    le = preprocessing.LabelEncoder()
    le.fit(train_df["target"])
    ytrain = le.transform(train_df["target"])
    ytest = le.transform(test_df["target"])

    print("now do SMOTE")
    # Over-sample with SMOTE, then randomly under-sample.
    over = SMOTE(sampling_strategy=0.8, n_jobs=-1)
    under = RandomUnderSampler(sampling_strategy=0.9)
    pipeline = Pipeline(steps=[('o', over), ('u', under)])
    X_res, y_res = pipeline.fit_resample(xtrain, ytrain)
    print("Before sampling %s" % Counter(ytrain))
    print('Resampled dataset shape %s' % Counter(y_res))

    model = xgb.XGBRFClassifier(use_label_encoder=False,
                                scale_pos_weight=0.9,
                                n_estimators=70,
                                max_depth=6,
                                n_jobs=-1,
                                subsample=0.4,
                                num_parallel_tree=20,
                                eval_metric='logloss',
                                tree_method='auto',
                                objective='reg:logistic',
                                gamma=.1,
                                min_child_weight=6,
                                booster='dart',
                                eta=0.8)
    model.fit(X_res, y_res)

    # Hard predictions feed the confusion matrix, probabilities the ROC AUC.
    preds = model.predict(xtest)
    preds_proba = model.predict_proba(xtest)[:, 1]
    print('time to fit model:', time.time() - t0)

    roc_score = roc_auc_score(ytest, preds_proba)
    print('ROC AUC score\n', roc_score)

    test_df.loc[:, "xgb_pred_n"] = preds_proba
    print('Confusion matrix\n', confusion_matrix(ytest, preds))

    # Only one fold is evaluated per call, so the score is returned directly
    # rather than averaged over a one-element list.
    return test_df[["id", "target", "kfold", "xgb_pred_n"]], roc_score
# Candidate classifiers. `Adboost_model`, used below, is not defined in this
# excerpt - presumably it is created earlier in the file.
lr_model = LogisticRegression(random_state=r_state)
xgb_model = XGBClassifier()
dt_model = DecisionTreeClassifier(random_state=r_state)
svc_model = SVC(kernel='linear', C=1.0, probability=True)
knn_model = KNeighborsClassifier(n_neighbors=5)

# Score every model on the original (non-resampled) data.
calculate_scores_in_cv(xgb_model, 'xgboost_model', X, y)
calculate_scores_in_cv(Adboost_model, 'AdaBoostClassifier', X, y)
calculate_scores_in_cv(dt_model, 'DecisionTreeClassifier', X, y)
calculate_scores_in_cv(svc_model, 'SVC', X, y)
calculate_scores_in_cv(lr_model, 'LogisticRegression', X, y)
calculate_scores_in_cv(knn_model, 'KNeighborsClassifier', X, y)

"""With SMOTE data balancing"""
# Over-sample with SMOTE, then randomly under-sample, both with
# sampling_strategy=1.
over = SMOTE(sampling_strategy=1, random_state=r_state)
under = RandomUnderSampler(sampling_strategy=1, random_state=r_state)
steps = [('o', over), ('u', under)]
pipeline = Pipeline(steps=steps)
# NOTE(review): the resampling is applied to the whole data set before
# calculate_scores_in_cv runs, so synthetic samples may end up in the
# validation folds - confirm how calculate_scores_in_cv splits the data.
sampled_X, sampled_y = pipeline.fit_resample(X, y)
sampled_X, sampled_y = shuffle(sampled_X, sampled_y, random_state=r_state)

# Score the models again on the resampled data (the AdaBoost model is not
# re-scored here).
calculate_scores_in_cv(xgb_model, 'xgboost_model', sampled_X, sampled_y)
calculate_scores_in_cv(dt_model, 'DecisionTreeClassifier', sampled_X, sampled_y)
calculate_scores_in_cv(svc_model, 'SVC', sampled_X, sampled_y)
calculate_scores_in_cv(lr_model, 'LogisticRegression', sampled_X, sampled_y)
calculate_scores_in_cv(knn_model, 'KNeighborsClassifier', sampled_X, sampled_y)
def comet_Fold(save_path, embedding_type, model_type, bin_labels): from comet_ml import Experiment exp = Experiment(api_key="sqMrI9jc8kzJYobRXRuptF5Tj", project_name="80_10_baseline", workspace="gdreiman1", disabled=False) exp.log_code = True #turn off comet logging comments import os #os.environ['COMET_LOGGING_FILE_LEVEL'] = 'WARNING' import warnings warnings.filterwarnings('ignore') import pickle import pandas as pd import numpy as np import sklearn as sklearn from sklearn.metrics import precision_recall_fscore_support as prf from sklearn.linear_model import SGDClassifier from sklearn.ensemble import RandomForestClassifier from sklearn.preprocessing import StandardScaler, LabelEncoder import matplotlib.pyplot as plt import seaborn as sns import ntpath from imblearn.over_sampling import RandomOverSampler #choosing a 4:1 Inactive to Active ratio ros = RandomOverSampler(sampling_strategy=0.33, random_state=42) from imblearn.under_sampling import RandomUnderSampler rus = RandomUnderSampler(sampling_strategy=0.33, random_state=42) '''Comet Saving Zone''' def comet_addtional_info(exp, save_path, metrics_dict, X_test, y_test, embedding_type, model_type): #get base file name folder, base = ntpath.split(save_path) #split file name at second _ assumes file save in AID_xxx_endinfo.pkl AID, _, end_info = base.rpartition('_') exp.add_tag(AID) #save data location, AID info, and version info exp.log_dataset_info(name=AID, version=end_info, path=save_path) #save some informatvie tags: tags = [AID, end_info, model_type] exp.add_tags(tags) exp.add_tag(embedding_type) #save metrics_dict in data_folder with comet experiement number associated exp_num = exp.get_key() model_save = Path(folder + '/' + model_type + '_' + embedding_type + '_' + exp_num + 'metrics_dict.pkl') pickle_on = open(model_save, 'wb') pickle.dump(metrics_dict, pickle_on) pickle_on.close() #log trained model location exp.log_other('Metrics Dict Path', model_save) #tell comet that the experiement is over exp.end() 
def get_Scaled_Data(train_ind, test_ind, X_mfp, activity_table, labels, bin_labels): #get start and end index for molchars MC_start = activity_table.columns.get_loc('Chi0') #need to add 1 bc exclusive indexing MC_end = activity_table.columns.get_loc('VSA_EState9') + 1 # standardize data scaler = StandardScaler(copy=False) #return requested datatype if embedding_type == 'MFPMolChars': X_train_molchars_std = scaler.fit_transform( np.array(activity_table.iloc[train_ind, MC_start:MC_end]).astype(float)) X_test_molchars_std = scaler.transform( np.array(activity_table.iloc[test_ind, MC_start:MC_end]).astype(float)) X_train = np.concatenate( (X_mfp[train_ind, :], X_train_molchars_std), axis=1) X_test = np.concatenate((X_mfp[test_ind, :], X_test_molchars_std), axis=1) elif embedding_type == 'MFP': X_train = X_mfp[train_ind, :] X_test = X_mfp[test_ind, :] elif embedding_type == 'MolChars': X_train_molchars_std = scaler.fit_transform( np.array(activity_table.iloc[train_ind, MC_start:MC_end]).astype(float)) X_test_molchars_std = scaler.transform( np.array(activity_table.iloc[test_ind, MC_start:MC_end]).astype(float)) X_train = X_train_molchars_std X_test = X_test_molchars_std y_train = labels[train_ind] y_test = labels[test_ind] #remapping active to 1 and everything else to zero bin_y_train, bin_y_test = np.array([ 1 if x == 0 else 0 for x in y_train ]), np.array([1 if x == 0 else 0 for x in y_test]) if bin_labels == True: y_test = bin_y_test y_train = bin_y_train return X_train, X_test, y_train, y_test def train_SVM(X_train, X_test, y_train, y_test, split_ID): sgd_linear_SVM = SGDClassifier(loss='hinge', penalty='l2', alpha=0.0001, l1_ratio=0.15, fit_intercept=True, max_iter=500000, tol=0.001, shuffle=True, verbose=0, epsilon=0.1, n_jobs=-1, random_state=None, learning_rate='optimal', eta0=0.0, power_t=0.5, early_stopping=False, validation_fraction=0.1, n_iter_no_change=5, class_weight='balanced', warm_start=False, average=False) sgd_linear_SVM_model = 
sgd_linear_SVM.fit(X_train, y_train) sgd_lSVM_preds = sgd_linear_SVM_model.predict(X_test) prec, rec, f_1, supp = prf(y_test, sgd_lSVM_preds, average=None) class_rep = sklearn.metrics.classification_report( y_test, sgd_lSVM_preds) exp.log_other('Classification Report' + split_ID, class_rep) mcc = sklearn.metrics.matthews_corrcoef(y_test, sgd_lSVM_preds) #if first iteration, report model parameters to comet if split_ID == '0': exp.log_parameters(sgd_linear_SVM_model.get_params()) return prec, rec, f_1, supp, mcc def train_RF(X_train, X_test, y_train, y_test, split_ID): rf = RandomForestClassifier(n_estimators=100, random_state=2562, class_weight="balanced_subsample", n_jobs=-1) rand_for = rf.fit(X_train, y_train) rf_preds = rand_for.predict(X_test) prec, rec, f_1, supp = prf(y_test, rf_preds, average=None) class_rep = sklearn.metrics.classification_report(y_test, rf_preds) exp.log_other('Classification Report' + split_ID, class_rep) mcc = sklearn.metrics.matthews_corrcoef(y_test, rf_preds) #if first iteration, report model parameters to comet if split_ID == '0': exp.log_parameters(rand_for.get_params()) return prec, rec, f_1, supp, mcc def train_LGBM(X_train, X_test, y_train, y_test, split_ID): import lightgbm as lgb #make model class lgbm_model = lgb.LGBMClassifier(boosting_type='gbdt', num_leaves=31, max_depth=-1, learning_rate=0.1, n_estimators=500, subsample_for_bin=200000, objective='binary', is_unbalance=True, min_split_gain=0.0, min_child_weight=0.001, min_child_samples=20, subsample=1.0, subsample_freq=0, colsample_bytree=1.0, reg_alpha=0.0, reg_lambda=0.0, random_state=None, n_jobs=-1, silent=True, importance_type='split') #train model lgbm = lgbm_model.fit(X_train, y_train) lgbm_preds = lgbm.predict(X_test) prec, rec, f_1, supp = prf(y_test, lgbm_preds, average=None) class_rep = sklearn.metrics.classification_report(y_test, lgbm_preds) exp.log_other('Classification Report' + split_ID, class_rep) mcc = sklearn.metrics.matthews_corrcoef(y_test, lgbm_preds) 
#if first iteration, report model parameters to comet if split_ID == '0': exp.log_parameters(lgbm.get_params()) return prec, rec, f_1, supp, mcc def train_DNN(X_train, X_test, y_train, y_test, split_ID): import tensorflow as tf #tf.enable_eager_execution() # from keras import backend as K from tensorflow.keras.models import Sequential from tensorflow.keras.layers import Dense, Dropout, GaussianNoise from tensorflow.keras.layers import Lambda from tensorflow.keras.utils import to_categorical # def focal_loss(y_true, y_pred): # gamma = 2.0 # alpha = 0.25 # pt_1 = tf.where(tf.equal(y_true, 1), y_pred, tf.ones_like(y_pred)) # pt_0 = tf.where(tf.equal(y_true, 0), y_pred, tf.zeros_like(y_pred)) # # pt_1 = K.clip(pt_1, 1e-3, .999) # # pt_0 = K.clip(pt_0, 1e-3, .999) # # return -K.sum(alpha * K.pow(1. - pt_1, gamma) * K.log( pt_1))-K.sum((1-alpha) * K.pow( pt_0, gamma) * K.log(1. - pt_0 )) #bias for predictions fl_pi = 0.01 final_bias = -np.log((1 - fl_pi) / fl_pi) num_labels = len(set(y_test)) from sklearn.utils import class_weight class_weights = class_weight.compute_class_weight( 'balanced', np.unique(y_train), y_train) tf.keras.backend.clear_session() fast_NN = Sequential(name='quick') #fast_NN.add(GaussianNoise(.5)) fast_NN.add(Dense(512, activation='sigmoid', name='input')) #fast_NN.add(Dropout(0.5)) fast_NN.add( Dense(128, activation='relu', name='first', bias_initializer=tf.keras.initializers.Constant(value=0.1))) #fast_NN.add(Dropout(0.5)) fast_NN.add( Dense(64, activation='relu', name='second', bias_initializer=tf.keras.initializers.Constant(value=0.1))) #fast_NN.add(Dropout(0.5)) fast_NN.add( Dense(16, activation='relu', name='third', bias_initializer=tf.keras.initializers.Constant(value=0.1))) #fast_NN.add(Dropout(0.25)) fast_NN.add( Dense(num_labels, activation='softmax', name='predict', bias_initializer=tf.keras.initializers.Constant( value=final_bias))) fast_NN.compile(loss='categorical_crossentropy', optimizer='adam', metrics=[ 'categorical_accuracy', 
tf.keras.metrics.Recall(), tf.keras.metrics.Precision() ]) fast_NN_model = fast_NN.fit(X_train, to_categorical(y_train), validation_data=(X_test, to_categorical(y_test)), epochs=10, batch_size=500, class_weight=class_weights, shuffle=True, verbose=0) NN_test_preds = fast_NN.predict(X_test) prec, rec, f_1, supp = prf(y_test, np.argmax(NN_test_preds, axis=1), average=None) class_rep = sklearn.metrics.classification_report( y_test, np.argmax(NN_test_preds, axis=1)) exp.log_other('Classification Report' + split_ID, class_rep) mcc = sklearn.metrics.matthews_corrcoef( y_test, np.argmax(NN_test_preds, axis=1)) #if first iteration, report model parameters to comet # if split_ID == '0': # exp.log_parameters(lgbm.get_params()) return prec, rec, f_1, supp, mcc #from https://stackoverflow.com/questions/6027558/flatten-nested-dictionaries-compressing-keys def flatten(d, parent_key='', sep='_'): import collections items = [] for k, v in d.items(): new_key = parent_key + sep + k if parent_key else k if isinstance(v, collections.MutableMapping): items.extend(flatten(v, new_key, sep=sep).items()) else: items.append((new_key, v)) return dict(items) def calc_and_save_metrics(X_train, X_test, y_train, y_test, split_index, model_type, embedding_type, AID, metric_names, metric_dict_list, split_info, split_num, little_split_num): '''Takes in test and train data + labels, computes metrics and saves them as a dict inside of the provided list. 
Returns this list.''' prec, rec, f_1, supp, mcc = classifier_train(X_train, X_test, y_train, y_test, split_info) results_array = np.concatenate((prec, rec, f_1, supp)).tolist() + [mcc] if little_split_num == 'NaN': split_size = '80%' else: split_size = '10%' metric_dict_list.append( dict( zip(metric_names, [ model_type, embedding_type, AID, split_num, little_split_num, split_size, split_index, split_info ] + results_array))) return metric_dict_list '''Begin the actual experiment''' #get data cleaned pickle_off = open(save_path, 'rb') activity_table = pickle.load(pickle_off) pickle_off.close() #get AID folder, base = ntpath.split(save_path) #split file name at second _ assumes file save in AID_xxx_endinfo.pkl AID, _, end_info = base.rpartition('_') #get length of MFP fp_length = len(activity_table.iloc[5]['MFP']) #reshape mfp X_mfp = np.concatenate(np.array(activity_table['MFP'])).ravel() X_mfp = X_mfp.reshape((-1, fp_length)) le = LabelEncoder() labels = le.fit_transform(activity_table['PUBCHEM_ACTIVITY_OUTCOME']) #split data: from sklearn.model_selection import StratifiedShuffleSplit #this is outer 5fold cross validation i.e. 
80/20 split big_splitter = StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=2562) #inner replicateing the start with 10% of data (or 12.5% of 80% intial split) little_splitter = StratifiedShuffleSplit(n_splits=8, test_size=0.2, train_size=0.125, random_state=2562) #this holds all the metrics values that will be stored in comet metric_names = [ 'Classifier', 'Embedding', 'AID', '80% Split Number', '10% Split Number', 'Train Split Size', 'ID', 'Split Info', 'prec_Inactive', 'prec_Active', 'rec_Inactive', 'rec_Active', 'f_1_Inactive', 'f_1_Active', 'supp_Inactive', 'supp_Active', 'mcc' ] #determine model type classifier_dict = { 'SVM': train_SVM, 'RF': train_RF, 'LGBM': train_LGBM, 'DNN': train_DNN } #set dummy variable to func that trains specified model classifier_train = classifier_dict[model_type] metric_dict_list = [] #using labels as a dummy for X for split_num, [train_ind, test_ind] in enumerate(big_splitter.split(labels, labels)): #indexs which split the data comes from X.X ie big.little split_index = str(split_num) little_split_num = 'NaN' '''Regular Sample''' split_info = 'Split' + split_index + ' 80% train' + 'BaseRatio' #get test/train index X_train, X_test, y_train, y_test = get_Scaled_Data( train_ind, test_ind, X_mfp, activity_table, labels, bin_labels) #train model and get back classwise metrics over_X_train, over_y_train = ros.fit_resample(X_train, y_train) metric_dict_list = calc_and_save_metrics( over_X_train, X_test, over_y_train, y_test, split_index, model_type, embedding_type, AID, metric_names, metric_dict_list, split_info, split_num, little_split_num) '''Over Sample''' split_info = 'Split' + split_index + ' 80% train' + 'OverSample' #get test/train index X_train, X_test, y_train, y_test = get_Scaled_Data( train_ind, test_ind, X_mfp, activity_table, labels, bin_labels) #train model and get back classwise metrics over_X_train, over_y_train = ros.fit_resample(X_train, y_train) metric_dict_list = calc_and_save_metrics( over_X_train, 
X_test, over_y_train, y_test, split_index, model_type, embedding_type, AID, metric_names, metric_dict_list, split_info, split_num, little_split_num) '''Under Sample''' split_info = 'Split' + split_index + ' 80% train' + 'UnderSample' #get test/train index X_train, X_test, y_train, y_test = get_Scaled_Data( train_ind, test_ind, X_mfp, activity_table, labels, bin_labels) #train model and get back classwise metrics under_X_train, under_y_train = rus.fit_resample(X_train, y_train) #print('active ratio is:',sum(under_y_train)/len(under_y_train)) metric_dict_list = calc_and_save_metrics( under_X_train, X_test, under_y_train, y_test, split_index, model_type, embedding_type, AID, metric_names, metric_dict_list, split_info, split_num, little_split_num) for little_split_num, [little_train_ind, little_test_ind] in enumerate( little_splitter.split(labels[train_ind], labels[train_ind])): split_index = str(split_num) + '.' + str(little_split_num) '''Regular Sample''' split_info = 'Split' + split_index + ' 10% train' + 'BaseRatio' #get test/train index X_train, X_test, y_train, y_test = get_Scaled_Data( train_ind, test_ind, X_mfp, activity_table, labels, bin_labels) #train model and get back classwise metrics over_X_train, over_y_train = ros.fit_resample(X_train, y_train) if len(set(y_train)) == 2: metric_dict_list = calc_and_save_metrics( over_X_train, X_test, over_y_train, y_test, split_index, model_type, embedding_type, AID, metric_names, metric_dict_list, split_info, split_num, little_split_num) else: print('Skipped ' + split_info) '''Over Sample''' #get test/train index X_train, X_test, y_train, y_test = get_Scaled_Data( little_train_ind, test_ind, X_mfp, activity_table, labels, bin_labels) over_X_train, over_y_train = ros.fit_resample(X_train, y_train) split_info = 'Split' + str(split_num) + ' 10% train' + 'OverSample' #train model and get back classwise metrics #check if train_split contains both postive and negative labels if len(set(y_train)) == 2: metric_dict_list = 
calc_and_save_metrics( over_X_train, X_test, over_y_train, y_test, split_index, model_type, embedding_type, AID, metric_names, metric_dict_list, split_info, split_num, little_split_num) else: print('Skipped ' + split_info) '''UnderSample''' #get test/train index X_train, X_test, y_train, y_test = get_Scaled_Data( little_train_ind, test_ind, X_mfp, activity_table, labels, bin_labels) under_X_train, under_y_train = rus.fit_resample(X_train, y_train) split_info = 'Split' + str( split_num) + ' 10% train' + 'UnderSample' #train model and get back classwise metrics #check if train_split contains both postive and negative labels if len(set(y_train)) == 2: metric_dict_list = calc_and_save_metrics( under_X_train, X_test, under_y_train, y_test, split_index, model_type, embedding_type, AID, metric_names, metric_dict_list, split_info, split_num, little_split_num) else: print('Skipped ' + split_info) # now convert metric_dict_list to df: metrics_df = pd.DataFrame(metric_dict_list) #set Split_ID to inded #now plot all the columns #first make a new df column to ID things as either split cols_to_plot = [ 'prec_Inactive', 'prec_Active', 'rec_Inactive', 'rec_Active', 'f_1_Inactive', 'f_1_Active', 'supp_Inactive', 'supp_Active', 'mcc' ] #turn off plotting plt.ioff() for metric in cols_to_plot: #make sns boxplot ax = sns.boxplot(x='Split Info', y=metric, data=metrics_df) ax.set_xticklabels(ax.get_xticklabels(), rotation=30) plt.tight_layout() #log the plot exp.log_figure() plt.clf() ''' now we're going to go through and calculate means and stds for 3 diff groups 1) the 5 80% train runs 2) the 5 sets of 8 10% runs 3) the 40 total 10% runs we save each in a list as a pd Series with a name explaining the contents''' #now add list of dicts of averages to metrics df #convert metrics_df to metric dict and log it #save metric_df to current folder comet_addtional_info(exp, save_path, metrics_df, X_test, y_test, embedding_type, model_type) return metrics_df
##################################
### [2] Over Sampling
##################################
from imblearn.over_sampling import RandomOverSampler

# `fit_resample` replaces the removed `fit_sample` alias.
ros = RandomOverSampler()
X_resampled, y_resampled = ros.fit_resample(X, y)
result_dic["Over Sampling"] = LogisticReg(X_resampled, y_resampled)

##################################
### [3] Under Sampling
##################################
from imblearn.under_sampling import RandomUnderSampler

# NOTE: the name `ros` is kept for compatibility with the rest of the
# script even though it now holds an *under*-sampler.
ros = RandomUnderSampler()
X_resampled, y_resampled = ros.fit_resample(X, y)
result_dic["Under Sampling"] = LogisticReg(X_resampled, y_resampled)

##################################
### [4] SMOTE
##################################
from imblearn.over_sampling import SMOTE

# The `kind` argument was removed from SMOTE; the default constructor is
# the regular SMOTE variant that `kind='regular'` used to select.
sm = SMOTE()
X_resampled, y_resampled = sm.fit_resample(X, y)
result_dic["SMOTE"] = LogisticReg(X_resampled, y_resampled)

##################################
def forest_tangent_space_hierarchical(data):
    """Train and evaluate a two-level (hierarchical) random-forest classifier.

    Level 1 separates the merged classes 1 and 2 (relabelled as 1) from
    class 3, using a randomly under-sampled, balanced subset of the
    training data.  Level 2 separates class 1 from class 2 and is applied
    only to the test samples that level 1 assigned to the merged class.

    Parameters
    ----------
    data : dict
        A dictionary containing training and testing data under the keys
        ``'train_x'``, ``'train_y'``, ``'test_x'`` and ``'test_y'``.  The
        label arrays are reduced with ``np.argmax(..., axis=1)``
        (presumably one-hot encoded with three classes -- confirm against
        the caller).

    Returns
    -------
    float
        Accuracy of the combined two-level prediction on the test set.
        The 5-fold cross-validation scores of both levels are printed,
        not returned.
    """
    # Level-1 labels: class ids 1..3 stored as a column vector
    x_level_1 = data['train_x']
    y_level_1 = np.argmax(data['train_y'], axis=1) + 1
    y_level_1 = np.expand_dims(y_level_1, axis=1)

    # Print the class proportions to verify whether they are balanced
    print(
        sum(y_level_1 == 1) / len(y_level_1),
        sum(y_level_1 == 2) / len(y_level_1),
        sum(y_level_1 == 3) / len(y_level_1))

    # Combine C1 and C2 classes and balance the dataset for training
    y_level_1[y_level_1 == 2] = 1
    rus = RandomUnderSampler()
    # Only the selected row indices are needed, so the label column doubles
    # as a dummy feature matrix; the target itself is passed 1-D.
    rus.fit_resample(y_level_1, y_level_1.ravel())

    # Keep only the rows picked by the under-sampler
    x_level_1 = x_level_1[rus.sample_indices_, :]
    y_level_1 = y_level_1[rus.sample_indices_].ravel()

    # Cross-validate a classifier on the balanced level-1 data
    clf_level_1 = RandomForestClassifier(n_estimators=100, random_state=43)
    scores_1 = cross_val_score(clf_level_1,
                               x_level_1,
                               y_level_1,
                               cv=KFold(5, shuffle=True))
    print(scores_1)

    # Second level of training: only samples of classes 1 and 2
    y_level_2 = np.argmax(data['train_y'], axis=1) + 1
    idx = y_level_2 != 3
    x_level_2 = data['train_x'][idx, :]
    y_level_2 = y_level_2[idx].ravel()

    # Cross-validate a classifier on the level-2 data
    clf_level_2 = RandomForestClassifier(n_estimators=100, random_state=43)
    scores_2 = cross_val_score(clf_level_2,
                               x_level_2,
                               y_level_2,
                               cv=KFold(5, shuffle=True))
    print(scores_2)

    # Fit both classifiers on their full training sets for final testing
    clf_level_1 = clf_level_1.fit(x_level_1, y_level_1)
    clf_level_2 = clf_level_2.fit(x_level_2, y_level_2)

    # Predict with the first level, then refine the merged class with the
    # second level
    y_true = np.argmax(data['test_y'], axis=1) + 1
    y_pred = clf_level_1.predict(data['test_x'])
    idx = y_pred == 1
    # Calling predict() on zero rows raises, so skip level 2 when level 1
    # assigned no test sample to the merged class.
    if idx.any():
        y_pred[idx] = clf_level_2.predict(data['test_x'][idx, :])

    # Compare the combined prediction with the true labels
    score = accuracy_score(y_true, y_pred)
    return score
ys = np.concatenate((ys, np.array(y_res[i]))) print(Xs.shape, ys.shape) shuffle(Xs, ys) # Generate more synthetic samples if smote is not None: Xs, ys = smote.fit_sample(Xs, ys) shuffle(Xs, ys) ys = to_categorical(ys, 2) return Xs, ys rus = RandomUnderSampler(ratio={0: 1531 * 30, 1: 1531}) smote = SMOTE(n_jobs=-1, random_state=42, k_neighbors=3, m_neighbors=5) rus2 = RandomUnderSampler(ratio={0: 1531 * 100, 1: 1531 * 50}) #ros = RandomOverSampler(ratio={0: 1531*10, 1: 1531*5}) # smoteenn = SMOTEENN(smote=SMOTE(n_jobs=-1)) print("Resampling") ''' 0.589 resampled_features, resampled_labels = rus.fit_sample(features, labels[:, 1]) resampled_features, resampled_labels = smote.fit_sample( resampled_features, resampled_labels) #resampled_features, resampled_labels = rus2.fit_sample( # resampled_features, resampled_labels)
# train_samples = np.concatenate([ sentences_train,features_train,refers_train,abstract_train], axis=-1)
#train_samples = np.concatenate([[i * 10 for i in features_train]], axis=-1)
train_samples = np.concatenate(
    [sentences_train, features_train, refers_train, abstract_train], axis=-1)
# test_samples = np.concatenate([ sentences_test,features_test,refers_test,abstract_test,labels_test], axis=-1)
#test_samples = np.concatenate([[i * 10 for i in features_test]], axis=-1)
# The test matrix carries its labels as the last column.
test_samples = np.concatenate(
    [sentences_test, features_test, refers_test, abstract_test, labels_test],
    axis=-1)

from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTE

#smo = SMOTE(sampling_strategy=0.7)
#x_train,y_train = smo.fit_resample(train_samples,labels_train)
# `fit_resample` is the supported spelling; the `fit_sample` alias was
# removed from imbalanced-learn, while `sampling_strategy` (used here)
# only exists in releases that already provide `fit_resample`.
model_RandomUnderSample = RandomUnderSampler(sampling_strategy=0.6)
x_train, y_train = model_RandomUnderSample.fit_resample(train_samples,
                                                        labels_train)
# Append the resampled labels as the last column of the training matrix.
y_train = np.expand_dims(y_train, axis=1)
train_samples = np.concatenate([x_train, y_train], axis=-1)

trainData = 'train7_14.csv'
testData = 'test7_14.csv'

# Name the last (label) column 'test' and every feature column 'train'.
data1 = pd.DataFrame(train_samples)
data1.columns = data1.columns.map(lambda x: 'test'
                                  if x == (data1.shape[1] - 1) else 'train')
data1.to_csv(trainData, index=False)

data1 = pd.DataFrame(test_samples)
data1.columns = data1.columns.map(lambda x: 'test'
                                  if x == (data1.shape[1] - 1) else 'train')
def func(X, y, sampling_strategy, random_state):
    """Randomly under-sample ``(X, y)``.

    A new ``RandomUnderSampler`` is built from ``sampling_strategy`` and
    ``random_state`` and the result of its ``fit_resample(X, y)`` call is
    returned unchanged.
    """
    sampler = RandomUnderSampler(
        random_state=random_state,
        sampling_strategy=sampling_strategy,
    )
    resampled = sampler.fit_resample(X, y)
    return resampled
def eval_with_sampling_and_kfold_logical_regression(features, df, k=5):
    """
    Evaluate a logistic regression with over+under sampling and
    stratified K-fold cross validation.

    The data is log-transformed (``np.log1p``), resampled with SMOTE
    followed by random under-sampling, and then split into ``k``
    stratified folds.  The averaged scores are printed and a ROC plot is
    saved to ``LogR-ROC.png``.

    :param features: selected features
    :param df: dataframe from the dataset
    :param k: kfold value for Kfold CV
    :return: tuple ``(precision, recall, accuracy, f1_score)``, each
        averaged over the ``k`` folds
    """
    roc_auc, precision, recall, acc, f1 = [[] for _ in range(5)]
    X, y = init_model(df, features)

    # trying to fix skewness
    X = np.log1p(X)
    print("X->", X)
    print("y->", y)

    # model and resampling pipeline (over-sample first, then under-sample)
    model = LogisticRegression()
    over = SMOTE(sampling_strategy=0.1)
    under = RandomUnderSampler(sampling_strategy=0.5)
    steps = [('over', over), ('under', under)]
    pipeline = Pipeline(steps=steps)

    # apply cross validation i.e K-Fold
    kfold = StratifiedKFold(n_splits=k, shuffle=True, random_state=1)
    # NOTE(review): the data is resampled *before* it is split, so the
    # test folds contain synthetic/resampled rows; the scores below are
    # therefore likely optimistic.  Confirm whether this is intended.
    X, y = pipeline.fit_resample(X, y)

    # enumerate the splits and summarize the distributions
    for train_ix, test_ix in kfold.split(X, y):
        X_train = X.iloc[train_ix]
        X_test = X.iloc[test_ix]
        y_train = y[train_ix]
        y_test = y[test_ix]
        # print the split rates
        print_split_rates(y_train, y_test)
        print("running the pipeline fit..\n")
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        # Scores
        precision += [precision_score(y_test, y_pred, average='binary')]
        recall += [recall_score(y_test, y_pred, average='binary')]
        acc += [accuracy_score(y_test, y_pred)]
        f1 += [f1_score(y_test, y_pred, average='binary')]

        # ROC curve of this fold
        fpr, tpr, thresholds = roc_curve(y_test, y_pred)
        roc_auc += [auc(fpr, tpr)]

    # Average every metric over the folds
    roc_auc = sum(roc_auc) / k
    mean_precision = sum(precision) / k
    mean_recall = sum(recall) / k
    mean_acc = sum(acc) / k
    mean_f1 = sum(f1) / k
    print("\nprecision:{0}\nrecall:{1}\naccuracy:{2}\nf1_score:{3}".format(
        mean_precision, mean_recall, mean_acc, mean_f1))

    # The plotted curve is the one of the last fold; the legend shows the
    # AUC averaged over all folds.
    plt.title('Receiver Operating Characteristic')
    plt.plot(fpr, tpr, 'b', label='AUC = %0.3f' % roc_auc)
    plt.legend(loc='lower right')
    plt.plot([0, 1], [0, 1], 'r--')
    plt.xlim([0, 1])
    plt.ylim([0, 1.05])
    plt.ylabel('True Positive Rate')
    plt.xlabel('False Positive Rate')
    # plt.show()
    plt.savefig("LogR-ROC.png", dpi=300)

    return mean_precision, mean_recall, mean_acc, mean_f1
def __init__(self, n_estimators, depth):
    """Store the two settings and create the under-sampler.

    ``n_estimators`` is kept as ``self.M`` and ``depth`` as ``self.depth``;
    ``self.undersampler`` is a ``RandomUnderSampler`` that samples without
    replacement.
    """
    self.undersampler = RandomUnderSampler(replacement=False)
    self.depth = depth
    self.M = n_estimators
def func(X, y, ratio, random_state):
    """Randomly under-sample ``(X, y)`` according to ``ratio``.

    ``ratio`` is forwarded as the sampler's ``sampling_strategy`` (the
    keyword that replaced the removed ``ratio`` argument) and the data is
    resampled with ``fit_resample`` (which replaced the removed
    ``fit_sample`` alias).  The parameter name ``ratio`` is kept so that
    existing callers keep working.

    Returns whatever ``RandomUnderSampler.fit_resample(X, y)`` returns.
    """
    rus = RandomUnderSampler(sampling_strategy=ratio,
                             random_state=random_state)
    return rus.fit_resample(X, y)
cols = self.X_unders[name].columns sel_cols = [ cols[i] for i in range(len(cols)) if ranking[i] == 1 and cols[i] not in ['TransactionID', 'TransactionDT'] ] self.X_unders[name] = self.X_unders[name][sel_cols] if __name__ == '__main__': # load data table red_data = reduced_transaction_table('../data/train_transaction.csv') und_samp_name = 'random' red_data.add_undersampling_transform( und_samp_name, RandomUnderSampler(sampling_strategy='majority', random_state=0)) # load selected features rankings = load( open('../trained_models/select_features/rankings.pkl', 'rb')) # it is known that 80 features gives the best accuracy red_data.select_features(und_samp_name, rankings[80]) # reference for fitting model and getting feature importance: # https://machinelearningmastery.com/calculate-feature-importance-with-python/ # define dataset X, y = red_data.X_unders[und_samp_name], red_data.y_unders[und_samp_name] # split train and test sets X_train, X_test, y_train, y_test = train_test_split(X,
X2_train, X2_valid, y2_train, y2_valid = train_test_split(X2, y2, test_size=0.20, stratify=y2, random_state=42) test_X = test_1.drop('went_on_backorder', axis=1).values test_Y = test_1['went_on_backorder'].values print('Imbalanced ratio in training set_2: 1:%i' % (Counter(y2)[0] / Counter(y2)[1])) cart_0 = tree.DecisionTreeClassifier(criterion='entropy', max_depth=8, min_samples_leaf=5) rus_0 = make_pipeline( RandomUnderSampler(), tree.DecisionTreeClassifier(criterion='entropy', max_depth=8, min_samples_leaf=5)) forest_0 = ensemble.RandomForestClassifier(criterion='entropy', max_depth=15, min_samples_leaf=5) xgb_0 = XGBClassifier(max_depth=15, learning_rate=0.1) cart_1 = tree.DecisionTreeClassifier(criterion='entropy', max_depth=8, min_samples_leaf=5) rus_1 = make_pipeline( RandomUnderSampler(), tree.DecisionTreeClassifier(criterion='entropy', max_depth=8,
def RandomUnderSample(X_train, y_train):
    """Randomly under-sample the training data.

    Uses a ``RandomUnderSampler`` with ``sampling_strategy='auto'``, no
    fixed random state and sampling without replacement.

    The former ``return_indices=False`` and ``ratio=None`` arguments are
    no longer passed: both were deprecated and later removed from
    imbalanced-learn, so passing them raises ``TypeError`` on current
    releases, and the values used here were their defaults anyway.

    Returns the resampled ``(X_train, y_train)`` pair.
    """
    rus = RandomUnderSampler(sampling_strategy='auto',
                             random_state=None,
                             replacement=False)
    X_train, y_train = rus.fit_resample(X_train, y_train)
    return X_train, y_train
neg, pos = np.bincount(y) total = neg + pos w0 = (1 / neg) * (total) / 2 w1 = (1 / pos) * (total) / 2 weights = {0: w0, 1: w1} from imblearn.over_sampling import RandomOverSampler, SMOTE from imblearn.under_sampling import RandomUnderSampler over = RandomOverSampler(sampling_strategy=0.4) X, y = over.fit_resample(X, y) #smote = SMOTE(sampling_strategy = 0.4, random_state = 1) #X, y = smote.fit_resample(X, y) under = RandomUnderSampler(sampling_strategy='majority') X, y = under.fit_resample(X, y) shuffler = np.random.permutation(len(X)) X = X[shuffler] y = y[shuffler] import keras import tensorflow as tf from keras.models import Sequential from keras.layers import Dense, Dropout #from sklearn.utils.class_weight import compute_class_weight METRICS = [ keras.metrics.TruePositives(name='tp'), keras.metrics.FalsePositives(name='fp'),
def run():
    """Load the dataset, preprocess and resample it, then train and report.

    Steps:

    1. read ``data/dscm_w.csv`` with explicit column dtypes;
    2. one-hot encode categorical columns, encode datetime columns with
       ``DatetimeEncoder`` and standardise the numeric columns;
    3. randomly under-sample with ``sampling_strategy=1.0``;
    4. split into train/test, train every classifier in ``classifiers``
       via ``train(...)`` and print its scores and classification report.

    Models are written below ``models/select`` (the directory is created
    when missing).  Returns ``None``.
    """
    uint8_columns = [
        'characteristic_B', 'characteristic_C', 'characteristic_D',
        'characteristic_E', 'characteristic_G', 'characteristic_M',
        'characteristic_P', 'characteristic_Q', 'characteristic_R',
        'characteristic_S', 'characteristic_Y', 'characteristic_Z',
        'catering_C', 'catering_F', 'catering_H', 'catering_M', 'catering_R',
        'catering_T', 'mon', 'tue', 'wed', 'thu', 'fri', 'sat', 'sun',
        'freight', 'bank_holiday_running', 'length', 'speed', 'delayed'
    ]
    dtype = {key: "uint8" for key in uint8_columns}
    dtype.update({
        "status": "category",
        "category": "category",
        "power_type": "category",
        "timing_load": "category",
        "seating": "category",
        "sleepers": "category",
        "reservations": "category",
        "ATOC_code": "category",
        "destination_stanox_area": "category",
        "origin_stanox_area": "category"
    })

    start = time.time()
    print("Loading data...", end="")
    df = pd.read_csv("data/dscm_w.csv",
                     index_col=["uid"],
                     parse_dates=["std", "sta", "atd", "ata"],
                     dtype=dtype)
    print(" DONE ({:.2f}s)".format(time.time() - start), end="\n\n")
    print(df.info())

    path = os.path.join("models", "select")
    # makedirs (unlike mkdir) also creates the missing parent "models"
    # directory and does not fail when the path already exists.
    os.makedirs(path, exist_ok=True)

    Y = df["delayed"]
    X = df.drop(["delay", "delayed", "atd", "ata", "origin", "destination"],
                axis=1)

    categorical_features = X.select_dtypes(include="category").columns.values
    categorical_transformer = Pipeline([
        ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
        ('onehot', OneHotEncoder(handle_unknown='ignore'))
    ])

    datetime_features = X.select_dtypes(include="datetime").columns.values
    datetime_transformer = Pipeline([("cyclical",
                                      DatetimeEncoder(cyclical=True))])

    numeric_features = ["speed", "length", "duration"]
    numerical_transformer = Pipeline([("scaler", StandardScaler())])

    preprocessor = ColumnTransformer([
        ("categorical", categorical_transformer, categorical_features),
        ("datetime", datetime_transformer, datetime_features),
        ("numeric", numerical_transformer, numeric_features)
    ])

    resampler = IPipeline([
        # ('over', SMOTE(sampling_strategy=0.2, random_state=1)),  # Increase minority to 20% of majority
        # sampling_strategy=1.0: under-sample the majority class until the
        # minority/majority ratio is 1, i.e. both classes have equal size.
        ('under', RandomUnderSampler(sampling_strategy=1.0, random_state=1)),
    ])

    # NOTE(review): preprocessing is fitted and resampling is applied on
    # the full data set before the train/test split, so the test set is
    # not independent of these steps -- confirm this is intended.
    start = time.time()
    print("\nPreprocessing data...", end="")
    X = preprocessor.fit_transform(X, Y)
    print(" DONE ({:.2f}s)".format(time.time() - start), end="\n\n")
    print(X.shape)

    # Restart the timer so the reported duration covers resampling only.
    start = time.time()
    print("\nResampling data...", end="")
    X, Y = resampler.fit_resample(X, Y)
    print(" DONE ({:.2f}s)".format(time.time() - start), end="\n\n")
    print("{}, delayed: {}, not delayed: {}\n".format(X.shape, Y.sum(),
                                                      len(Y) - Y.sum()))

    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, random_state=1)

    classifiers = [
        # LogisticRegression(),
        # RidgeClassifier(),
        # SGDClassifier(),
        # LinearSVC(),
        # DecisionTreeClassifier(),
        # MLPClassifier(),
        # AdaBoostClassifier(),
        # GradientBoostingClassifier(),
        RandomForestClassifier(n_jobs=-1),
    ]

    # Extra metrics reported for every classifier (loop-invariant).
    metrics = [
        recall_score,
        average_precision_score,
    ]

    for clf in classifiers:
        train(clf, path, X_train, Y_train)

        Y_pred = clf.predict(X_test)
        results = {
            "name": clf.__class__.__name__,
            "score": clf.score(X_test, Y_test)
        }
        for m in metrics:
            results[m.__name__] = m(Y_test.values, Y_pred)

        print(clf.__class__.__name__ + "\n")
        print(results)
        print(
            classification_report(Y_test.values,
                                  Y_pred,
                                  target_names=["not delayed", "delayed"]))
def naieve_undersample(x, y, seed=None):
    """Randomly under-sample ``(x, y)`` and return the resampled pair.

    When ``seed`` is ``None`` a seed is drawn with
    ``random.randint(0, 1000)``; otherwise the given seed is used as the
    sampler's ``random_state``.
    """
    chosen_seed = random.randint(0, 1000) if seed is None else seed
    sampler = RandomUnderSampler(random_state=chosen_seed)
    x_out, y_out = sampler.fit_resample(x, y)
    return x_out, y_out
def trainy(self, testy=0.2, imbl=True):
    """Split, vectorise, scale and (optionally) under-sample the data.

    Steps:

    1. Stratified train/test split of ``self.X`` / ``self.y``.
    2. Convert X_train and X_test to DataFrames (columns are dropped later).
    3. For 1-, 2- and 3-grams: fit a tf-idf model on the training
       ``words_only`` column, reduce it with TruncatedSVD and apply both
       to train and test.  The fitted models are appended to
       ``self.tfidf_list`` / ``self.tsvd_list``.
    4. When ``self.stan`` is truthy, standardise the remaining columns.
    5. Scale everything to be non-negative with MinMaxScaler (required
       for a later chi2 reduction).
    6. When ``imbl`` is truthy, randomly under-sample the training set.

    Parameters
    ----------
    testy : float
        Passed to ``train_test_split`` as ``test_size``.
    imbl : bool
        Whether to under-sample the training data at the end.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``; the training parts are
        also stored on ``self`` together with the fitted scalers.
    """
    # This is perhaps the main reason why this step is embedded in a class:
    # a different stratification changes everything downstream, e.g. the
    # tf-idf vocabulary.
    X_train, X_test, y_train, y_test = train_test_split(
        self.X,
        self.y,
        random_state=self.random_state,
        test_size=testy,
        stratify=self.y)
    self.y_train = y_train
    self.y_test = y_test

    # Reset to a 0..n-1 index: the tf-idf frames built below use a default
    # RangeIndex, and the later `join` aligns on the index.  Without the
    # reset the shuffled split indices would misalign the rows whenever
    # the standardisation step (which rebuilds the frames) is skipped.
    X_train = pd.DataFrame(X_train,
                           columns=self.columns).reset_index(drop=True)
    X_test = pd.DataFrame(X_test,
                          columns=self.columns).reset_index(drop=True)

    df_train = pd.DataFrame()
    df_test = pd.DataFrame()
    for i in np.arange(1, 4):
        tfidf = TfidfVectorizer(stop_words='english',
                                ngram_range=(i, i),
                                decode_error='replace',
                                max_features=10000)
        Xword_train = tfidf.fit_transform(X_train['words_only'])
        Xword_test = tfidf.transform(X_test['words_only'])

        # Reduce the size of the tf-idf matrix; after TruncatedSVD the
        # individual words are no longer identifiable.
        tsvd = TruncatedSVD(n_components=500,
                            algorithm='arpack',
                            random_state=self.random_state)
        Xwordie_train = tsvd.fit_transform(Xword_train)
        Xwordie_test = tsvd.transform(Xword_test)

        # Column names are "<ngram size>_<component number>"
        Xwordie_train_df = pd.DataFrame(
            Xwordie_train,
            columns=[
                str(i) + '_' + str(b)
                for b in np.arange(1, Xwordie_train.shape[1] + 1)
            ])
        Xwordie_test_df = pd.DataFrame(
            Xwordie_test,
            columns=[
                str(i) + '_' + str(b)
                for b in np.arange(1, Xwordie_test.shape[1] + 1)
            ])

        df_train = pd.concat([df_train, Xwordie_train_df], axis=1)
        df_test = pd.concat([df_test, Xwordie_test_df], axis=1)
        self.tfidf_list.append(tfidf)
        self.tsvd_list.append(tsvd)

    X_train = X_train.drop(['words_only'], axis=1)
    X_test = X_test.drop(['words_only'], axis=1)
    X = self.X.drop(['words_only'], axis=1)

    if self.web:
        web_columns = [
            'n_video', 'n_links', 'n_image', 'n_otherlink', 'mention_count',
            'hashtag_count', 'mbti_ref_count', 'ennea_count', 'bracket_count'
        ]
        X_train = X_train.drop(web_columns, axis=1)
        X_test = X_test.drop(web_columns, axis=1)
        X = X.drop(web_columns, axis=1)

    self.columns = X_train.columns

    # Standardization step
    if self.stan:
        # NOTE(review): the scaler is fitted on the full data (train and
        # test) -- confirm this is intended.
        ss = StandardScaler().fit(X)
        X_train = ss.transform(X_train)
        X_test = ss.transform(X_test)
        X_train = pd.DataFrame(X_train, columns=self.columns)
        X_test = pd.DataFrame(X_test, columns=self.columns)
        self.ss = ss

    # Join step
    if self.include_feature == 'words':
        X_train = df_train
        X_test = df_test
    else:
        X_train = X_train.join(df_train)
        X_test = X_test.join(df_test)
    columnie = X_train.columns

    # Scale again to between 0 and 1; fitted on train and test together so
    # that both end up non-negative.
    combined_X = pd.concat([X_train, X_test], axis=0)
    mms = MinMaxScaler().fit(combined_X)
    X_train = pd.DataFrame(mms.transform(X_train), columns=columnie)
    X_test = pd.DataFrame(mms.transform(X_test), columns=columnie)

    if imbl:
        imbler = RandomUnderSampler(random_state=42)
        # `fit_resample` replaces the removed `fit_sample` alias.
        X_train, y_train = imbler.fit_resample(X_train, y_train)

    self.X_train = X_train
    self.X_test = X_test
    self.y_train = y_train
    self.mms = mms
    return X_train, X_test, y_train, y_test
# under-sampling methods.
#
# With the controlled under-sampling methods, the number of samples to be
# selected can be specified.
# :class:`~imblearn.under_sampling.RandomUnderSampler` is the most naive way of
# performing such selection by randomly selecting a given number of samples by
# the targeted class.

# %%
from imblearn.under_sampling import RandomUnderSampler

X, y = create_dataset(n_samples=400, weights=(0.05, 0.15, 0.8), class_sep=0.8)

# A list (not a set) so that the samplers are always plotted in the order
# they are written here; set iteration order is arbitrary.
samplers = [
    FunctionSampler(),  # identity resampler
    RandomUnderSampler(random_state=0),
]

fig, axs = plt.subplots(nrows=2, ncols=2, figsize=(15, 15))
# One row of axes per sampler: decision function on the left, resampled
# data on the right.
for ax, sampler in zip(axs, samplers):
    model = make_pipeline(sampler, clf).fit(X, y)
    plot_decision_function(
        X, y, model, ax[0],
        title=f"Decision function with {sampler.__class__.__name__}"
    )
    plot_resampling(X, y, sampler, ax[1])

fig.tight_layout()

# %% [markdown]
# :class:`~imblearn.under_sampling.NearMiss` algorithms implement some
# heuristic rules in order to select samples. NearMiss-1 selects samples from
def main(args):
    """Run the CFXGB pipeline end to end for one parsed command line.

    Steps, in order: validate the arguments and load the JSON model
    configuration, read the dataset CSV, optionally sub-sample it, optionally
    run recursive feature elimination, split into train/test, optionally
    under-sample the training split, encode both splits with the cascaded
    forest, classify with XGBoost and log the evaluation metrics.

    Args:
        args: Parsed command-line namespace. The attributes read here are
            ``Dataset``, ``ParentCols``, ``parameters``, ``ignore``,
            ``sample``, ``featureSelect`` and ``RandomSamp``; ``args`` is
            also passed unchanged to ``CFXGB``.

    Raises:
        SystemExit: With status 1 when the dataset argument is missing or
            cannot be found, when ``ParentCols`` is negative, or when no
            parameter file is given.
    """
    logger = get_logger("cfxgb")

    # ------------------------------------------------------------------
    # Argument check
    # ------------------------------------------------------------------
    # sys.exit(1) rather than exit(0): ``exit`` is only injected by the
    # ``site`` module, and a zero status would report these failures as
    # success to the calling shell.
    if args.Dataset is None:
        logger.error("Dataset required")
        sys.exit(1)

    if args.ParentCols < 0:
        logger.error("Enter valid levels")
        sys.exit(1)

    if args.parameters is None:
        logger.error("Model Parameters required")
        sys.exit(1)

    config = load_json(args.parameters)
    logger.info("Loaded JSON")
    logger.info(
        "JSON ----------------------------------------------------------------------------------"
    )
    json1 = json.dumps(config, indent=4, separators=(". ", " = "))
    logger.info(json1)
    logger.info(
        "END OF JSON----------------------------------------------------------------------------"
    )

    # ------------------------------------------------------------------
    # Dataset
    # ------------------------------------------------------------------
    # Accept either a path that exists as given, or a bare dataset name that
    # resolves to Datasets/<name>.csv.
    if not osp.exists(args.Dataset):
        full_path = osp.join('Datasets', args.Dataset + '.csv')
        if not osp.exists(full_path):
            logger.error("Enter valid Dataset")
            sys.exit(1)
    else:
        full_path = args.Dataset

    logger.info(args.Dataset + " used")
    data = pd.read_csv(full_path)
    if args.ignore:
        logger.info("First column ignored")
        data = data.iloc[:, 1:]
    logger.info("Data Read Complete")

    # ------------------------------------------------------------------
    # Extra columns
    # ------------------------------------------------------------------
    if args.ParentCols:
        logger.info("{} level(s) of parent nodes will be added. ".format(
            args.ParentCols))
    else:
        logger.info("Parent nodes not considered")

    # ------------------------------------------------------------------
    # Sample
    # ------------------------------------------------------------------
    if args.sample:
        # The last column is treated as the label. Each row is weighted by
        # the size of its class so that rarer classes are drawn more often.
        label_col = data.columns[-1]
        weights = data.groupby(label_col)[label_col].transform('count')
        if len(np.unique(weights)) == 1:
            # Routed through the configured logger (was the root ``logging``
            # module, which bypasses this run's handlers).
            logger.info("Equal weights already.")
            data = data.sample(n=args.sample, random_state=0)
        else:
            # Invert the class sizes: weight = (sum of distinct class sizes)
            # minus the size of the row's own class.
            total = np.sum(np.unique(weights))
            weights = total - weights
            data = data.sample(n=args.sample, weights=weights, random_state=0)
        logger.info("Distribution after sampling : \n{}".format(
            data.iloc[:, -1].value_counts()))

    # ------------------------------------------------------------------
    # X, y
    # ------------------------------------------------------------------
    X = data.iloc[:, :-1]
    y = data.iloc[:, -1]

    # ------------------------------------------------------------------
    # Feature selection (initial)
    # ------------------------------------------------------------------
    if args.featureSelect:
        logger.info("Feature Selection - Initial")
        clf = XGBClassifier(n_estimators=100,
                            learning_rate=0.3,
                            max_depth=4,
                            verbosity=0,
                            random_state=0,
                            n_jobs=-1)
        rfe = RFECV(clf, step=1, cv=5, verbose=0)
        X = rfe.fit_transform(X, y)

    # ------------------------------------------------------------------
    # Train / test split (not stratified)
    # ------------------------------------------------------------------
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=0)
    logger.info("Train Test Split complete")

    # ------------------------------------------------------------------
    # Training: sampling
    # ------------------------------------------------------------------
    # Only the training split is under-sampled; the test split is untouched.
    if args.RandomSamp:
        rus = RandomUnderSampler(random_state=0)
        X_train, y_train = rus.fit_resample(X_train, y_train)
        logger.info("Applied Random Under-Sampling")
    else:
        logger.info("No Random Under-Sampling")

    X_train = np.array(X_train)
    y_train = np.array(y_train)
    y_test = np.array(y_test)
    X_test = np.array(X_test)

    # ------------------------------------------------------------------
    # Model: CFXGB
    # ------------------------------------------------------------------
    cfxgb = CFXGB(config, args)

    # ------------------------------------------------------------------
    # Cascaded forest as transformer
    # ------------------------------------------------------------------
    X_train_enc = cfxgb.get_encoded(X_train, y_train)
    X_test_enc = cfxgb.transform(X_test)

    # Final transformation
    X_train_enc, X_test_enc = cfxgb.finalTransform(X_train, X_train_enc,
                                                   X_test, X_test_enc)
    logger.info("X_train_enc.shape={}, X_test_enc.shape={}".format(
        X_train_enc.shape, X_test_enc.shape))

    # ------------------------------------------------------------------
    # XGBoost
    # ------------------------------------------------------------------
    y_pred = cfxgb.classify(X_train_enc, y_train, X_test_enc, y_test)

    logger.info("Confusion Matrix - \n{}".format(
        confusion_matrix(y_test, y_pred)))
    logger.info("\nClassification Report - \n{}".format(
        classification_report(y_test, y_pred)))
    logger.info("Accuracy - {}\n".format(accuracy_score(y_test, y_pred)))
    fpr, tpr, _ = metrics.roc_curve(y_test, y_pred)
    logger.info("AUC ")
    auc = metrics.auc(fpr, tpr)
    logger.info(auc)
    # NOTE(review): ``t`` is not defined in this function; presumably a
    # module-level start timestamp -- confirm.
    logger.info("Time - {}".format(time.time() - t))
    logger.info("Arguments used in this run : {}".format(str(sys.argv)))
    logging.shutdown()
def under_sampling(X_train, y_train):
    """Randomly under-sample the majority class of a training set.

    Args:
        X_train: Training features, in any form accepted by
            ``RandomUnderSampler.fit_resample``.
        y_train: Training labels aligned with ``X_train``.

    Returns:
        tuple: ``(X_train_under, y_train_under)``, the resampled features and
        labels. The sampler is seeded with ``random_state=0``, so repeated
        calls on the same data select the same rows.
    """
    sampler = RandomUnderSampler(sampling_strategy='majority', random_state=0)
    # ``fit_resample`` replaces the ``fit_sample`` alias, which newer
    # imbalanced-learn releases no longer provide. It has existed since the
    # release that introduced ``sampling_strategy``, so no supported version
    # loses compatibility.
    X_train_under, y_train_under = sampler.fit_resample(X_train, y_train)
    return X_train_under, y_train_under
for i, times in enumerate(time_series_Train): time_series_Train[i] = np.array(time_series_Train[i]) max_len = max([len(x) for x in time_series_Train]) for i, times in enumerate(time_series_Train): time_series_Train[i] = np.pad(times, (0, max_len - len(times)), 'constant') time_series_Mat = np.zeros((len(time_series_Train), max_len)) for i, times in enumerate(time_series_Train): for j, time in enumerate(time_series_Train[i]): time_series_Mat[i, j] = time features_Train = np.concatenate([features_Train, time_series_Mat], axis=1) features_Train = np.concatenate([features_Train, num_Norm_Train], axis=1) from imblearn.over_sampling import SMOTE undersample = RandomUnderSampler() print(target_Train) print(target_Train.shape) print(type(target_Train)) features_Train_Resampled, target_Train_Resampled = undersample.fit_resample( features_Train, target_Train) from keras.utils import to_categorical target_Train_Resampled = to_categorical(target_Train_Resampled) print(target_Train.shape) print(type(target_Train)) print(target_Train) print('FEATURES TRAIN') def create_model():
print(__doc__) # Generate the dataset X, y = make_classification(n_classes=2, class_sep=2, weights=[0.1, 0.9], n_informative=3, n_redundant=1, flip_y=0, n_features=20, n_clusters_per_class=1, n_samples=200, random_state=10) # Instanciate a PCA object for the sake of easy visualisation pca = PCA(n_components=2) # Fit and transform x to visualise inside a 2D feature space X_vis = pca.fit_transform(X) # Apply the random under-sampling rus = RandomUnderSampler(return_indices=True) X_resampled, y_resampled, idx_resampled = rus.fit_resample(X, y) X_res_vis = pca.transform(X_resampled) fig = plt.figure() ax = fig.add_subplot(1, 1, 1) idx_samples_removed = np.setdiff1d(np.arange(X_vis.shape[0]), idx_resampled) idx_class_0 = y_resampled == 0 plt.scatter(X_res_vis[idx_class_0, 0], X_res_vis[idx_class_0, 1], alpha=.8, label='Class #0') plt.scatter(X_res_vis[~idx_class_0, 0], X_res_vis[~idx_class_0, 1], alpha=.8, label='Class #1') plt.scatter(X_vis[idx_samples_removed, 0], X_vis[idx_samples_removed, 1],
2: weights[2], 3: weights[3], 4: weights[4] } over = SMOTE(sampling_strategy=ratio_over, random_state=314) X_train, y_train = over.fit_resample(X_train, y_train) # undersample samples > average ratio_under = { 0: average_samples, 1: average_samples, 2: average_samples, 3: average_samples, 4: average_samples } under = RandomUnderSampler(sampling_strategy=ratio_under, random_state=314) X_train, y_train = under.fit_resample(X_train, y_train) cv_inner = KFold(n_splits=5, shuffle=True) model = KerasClassifier(build_fn=create_model, batch_size=32, epochs=100, verbose=0) learning_rate = [0.001, 0.01, 0.1] batch_size = [8, 16, 32] neurons = [50, 100, 150] hidden_layers = [1, 2, 3] epochs = [10, 30, 50] activation = ['relu', 'tanh', 'sigmoid'] param_grid = dict(learning_rate=learning_rate, epochs=epochs, batch_size=batch_size,
def use_parameters(self, X_train, selected_features):
    """Build the default parameter grid for the pipeline search.

    The grid is a list holding one dict with the keys ``scaler``,
    ``sampling``, ``feat__cols``, ``model__n_neighbors`` and
    ``model__weights``. When ``X_train`` contains missing values an
    ``imputer__strategy`` entry is added to that dict as well.

    Args:
        X_train: Training features; only used to check for missing values
            through ``X_train.isna()``.
        selected_features: Candidate values for the ``feat__cols`` entry.

    Returns:
        list[dict]: The parameter grid (a single-element list).
    """
    test_scaler = [
        StandardScaler(),
        RobustScaler(),
        QuantileTransformer(),
        Normalizer(),
    ]

    # Samplers currently left out of the search: NearMiss(version=1),
    # EditedNearestNeighbours, AllKNN, CondensedNearestNeighbour and
    # InstanceHardnessThreshold.
    test_sampling = [
        modelutil.Nosampler(),
        ClusterCentroids(),
        RandomUnderSampler(),
        RandomOverSampler(random_state=0),
        SMOTE(),
        BorderlineSMOTE(),
        SMOTEENN(),
        SMOTETomek(),
        ADASYN(),
    ]

    param_grid = {
        'scaler': test_scaler,
        'sampling': test_sampling,
        'feat__cols': selected_features,
        'model__n_neighbors': [13, 15, 21, 25],
        'model__weights': ['uniform', 'distance'],
    }
    parameters = [param_grid]

    # If no missing values, only one imputer strategy shall be used.
    if X_train.isna().sum().sum() > 0:
        # The entry belongs in the grid dict. Assigning a string key on the
        # enclosing list (as before) raised TypeError as soon as the data
        # contained a missing value.
        param_grid['imputer__strategy'] = [
            'mean', 'median', 'most_frequent'
        ]
        print("Missing values used. Test different imputer strategies")
    else:
        print("No missing values. No imputer necessary")

    print("Selected Parameters: ", parameters)
    print("Parameters defined in the input: ", parameters)
    return parameters
# Panel 1: scatter of the data as currently held in X.
plt1.set_title('Original data')
plt1.scatter(X[:, 0], X[:, 1], marker='o', s=25, edgecolor='k')

# Rebuild X from the first n1 / n2 / n3 rows of classes 0 / 1 / 2 so that the
# classes have different sizes.
X = np.vstack((X[y == 0][:n1], X[y == 1][:n2], X[y == 2][:n3]))
# Matching labels, built as a column of shape (n1 + n2 + n3, 1).
# NOTE(review): this lines up with X only if each class really has at least
# n1 / n2 / n3 rows -- confirm against the code that defines them.
newy = np.concatenate((np.full((n1,1),0), np.full((n2,1),1), np.full((n3,1),2)))

# One colour per class label: 0, 1, anything else.
colors = ['#ef8a62' if v == 0 else '#f7f7f7' if v == 1 else '#67a9cf' for v in newy]

# Panel 2: the unbalanced subset, coloured by class.
plt2.set_title('Different density data')
plt2.scatter(X[:, 0], X[:, 1], marker='o',c=colors, s=25, edgecolor='k')

# Randomly under-sample with a fixed seed, then print the shapes before and
# after resampling.
sampler = RandomUnderSampler(random_state=0)
X_res, y_res = sampler.fit_resample(X, newy)
print(X.shape)
print(X_res.shape)

# Panel 3: the under-sampled data, using the same colour mapping.
colors = ['#ef8a62' if v == 0 else '#f7f7f7' if v == 1 else '#67a9cf' for v in y_res]
plt3.set_title('Undersampled data')
plt3.scatter(X_res[:, 0], X_res[:, 1], c=colors, linewidth=0.5, edgecolor='black')

# Query as many neighbours as there are points in X, for every point in X.
NN = NearestNeighbors(n_neighbors=len(X)).fit(X)
distances, indices = NN.kneighbors(X)
print(distances)

plt4.set_title('minPts elbow')
def sample_data(self,
                sampling_method: str,
                X_train,
                Y_train,
                base_file_name,
                target_column="star_rating"):
    """
    Creates sampler based in sampling method and return the resulting X and y

    This method will also save the final distribution to a CSV file based on
    base_file_name, at
    ``{REPORT_DIR}/{base_file_name}-histogram-{sampling_method}.csv``.

    :param sampling_method: one of ``"smote"``, ``"adasyn"``,
        ``"random_over_sampling"``, ``"random_under_sampling"`` or
        ``"nearmiss2"``
    :param X_train: Original features; its ``columns`` are reused to label
        the resampled features
    :param Y_train: Original labels, as a pandas object that exposes
        ``target_column`` after ``reset_index()``
    :param base_file_name: base file name to save the final distribution csv
    :param target_column: name of the label column
    :return: tuple ``(X_train, Y_train)`` of DataFrames holding the
        resampled features and labels
    :raises ValueError: if ``sampling_method`` is not one of the supported
        names
    """
    log.debug(f'Y_train {Y_train.shape}')
    log.debug(f'Y_train {Y_train.head()}')

    grouped_df = Y_train.reset_index().groupby(target_column).count()

    log.info(
        f'Distribution before sampling with {sampling_method}\n{grouped_df}'
    )
    log.debug(f'grouped type: {type(grouped_df)}')
    log.debug(f'grouped: {grouped_df.head()}')
    log.debug(f'grouped: {grouped_df.shape}')

    if sampling_method == "smote":
        sampler = SMOTE(random_state=RSTATE,
                        sampling_strategy='not majority',
                        n_jobs=self.n_jobs)
    elif sampling_method == "adasyn":
        sampler = ADASYN(random_state=RSTATE,
                         sampling_strategy='not majority',
                         n_jobs=self.n_jobs)
    elif sampling_method == "random_over_sampling":
        sampler = RandomOverSampler(random_state=RSTATE,
                                    sampling_strategy='not majority')
    elif sampling_method == "random_under_sampling":
        sampler = RandomUnderSampler(random_state=RSTATE, replacement=True)
    elif sampling_method == "nearmiss2":
        # NearMiss takes no seed: imbalanced-learn deprecated its
        # ``random_state`` argument and later removed it, so passing one
        # raises TypeError on current releases.
        sampler = NearMiss(sampling_strategy='not minority',
                           version=2,
                           n_jobs=self.n_jobs)
    else:
        # ValueError is still an Exception, so existing handlers keep working.
        raise ValueError(
            f"Sampling method not supported: {sampling_method}")

    # Flatten the labels through ``.values`` so that both a Series and a
    # single-column DataFrame work; DataFrame has no ``ravel`` and
    # ``Series.ravel`` is deprecated in recent pandas.
    labels = Y_train.values.ravel()
    X_train_res, Y_train_res = sampler.fit_resample(X_train, labels)

    X_train = pd.DataFrame(X_train_res, columns=X_train.columns)
    Y_train = pd.DataFrame(Y_train_res, columns=[target_column])

    # get distribution of samples after sampling
    dist = Y_train.reset_index().groupby(target_column).count()

    log.info(f'Distribution after sampling with {sampling_method}\n{dist}')
    log.debug(dist.head())
    dist.to_csv(
        f'{REPORT_DIR}/{base_file_name}-histogram-{sampling_method}.csv')

    return X_train, Y_train
def fit(self, X, y):
    """Fit a random under-sampler aimed at ``self.pos_ratio`` positives.

    Every sample labelled 1 is kept. The number of samples labelled 0 to
    keep is derived from ``self.pos_ratio``, and the fitted sampler is stored
    on ``self.ratio_sampler``.

    Args:
        X: Feature matrix.
        y: Labels supporting boolean-mask indexing (``y[y == 1]``); label 1
            is treated as the positive class and label 0 as the negative.

    Returns:
        self, to allow call chaining.
    """
    pos = len(y[y == 1])
    # Solve pos / (pos + neg) == pos_ratio for neg; int() truncates, so the
    # achieved ratio can sit slightly above the requested one.
    neg = int(pos * ((1 - self.pos_ratio) / self.pos_ratio))
    # ``sampling_strategy`` replaces the ``ratio`` keyword, which current
    # imbalanced-learn releases no longer accept.
    self.ratio_sampler = RandomUnderSampler(
        random_state=self.random_state,
        sampling_strategy={0: neg, 1: pos},
    )
    self.ratio_sampler.fit(X, y)
    return self