def test_multiclass_fit_resample():
    """Check that a three-class target is reduced to two samples per class."""
    target = Y.copy()
    # Relabel two samples so that the target contains a third class.
    target[5] = 2
    target[6] = 2

    sampler = RandomUnderSampler(random_state=RND_SEED)
    _, target_res = sampler.fit_resample(X, target)

    class_counts = Counter(target_res)
    for label in (0, 1, 2):
        assert class_counts[label] == 2
def test_random_under_sampling_heterogeneous_data():
    """Check that an object-dtype array with mixed column types can be resampled.

    The input has two samples of class 0 and one of class 1, so the resampled
    set is expected to hold two rows and to keep the ``object`` dtype.
    """
    # ``np.object`` was only an alias of the builtin and has been removed from
    # NumPy; the builtin ``object`` is the supported spelling.
    X_hetero = np.array(
        [['xxx', 1, 1.0], ['yyy', 2, 2.0], ['zzz', 3, 3.0]],
        dtype=object,
    )
    y = np.array([0, 0, 1])
    rus = RandomUnderSampler(random_state=RND_SEED)
    X_res, y_res = rus.fit_resample(X_hetero, y)
    assert X_res.shape[0] == 2
    assert y_res.shape[0] == 2
    assert X_res.dtype == object
def test_rus_fit_resample():
    """Compare resampling with replacement against stored reference values."""
    expected_X = np.array([
        [0.92923648, 0.76103773],
        [0.47104475, 0.44386323],
        [0.13347175, 0.12167502],
        [0.09125309, -0.85409574],
        [0.12372842, 0.6536186],
        [0.04352327, -0.20515826],
    ])
    expected_y = np.array([0, 0, 0, 1, 1, 1])

    sampler = RandomUnderSampler(random_state=RND_SEED, replacement=True)
    X_res, y_res = sampler.fit_resample(X, Y)

    assert_array_equal(X_res, expected_X)
    assert_array_equal(y_res, expected_y)
def test_pipeline_sample():
    """Check that a pipeline ending in a sampler matches direct sampler calls."""
    # Test whether pipeline works with a sampler at the end.
    # Also test pipeline.sampler
    X, y = make_classification(
        n_samples=5000,
        n_features=20,
        n_informative=3,
        n_redundant=1,
        n_classes=2,
        n_clusters_per_class=1,
        weights=[0.1, 0.9],
        class_sep=2,
        flip_y=0,
        random_state=0)
    sampler = RandomUnderSampler(random_state=0)

    # test transform and fit_transform:
    pipe = Pipeline([('rus', sampler)])
    X_pipe, y_pipe = pipe.fit_resample(X, y)
    X_direct, y_direct = sampler.fit_resample(X, y)
    assert_allclose(X_pipe, X_direct, rtol=R_TOL)
    assert_allclose(y_pipe, y_direct, rtol=R_TOL)

    # Same comparison with a PCA step placed in front of the sampler.
    pca = PCA()
    pipe = Pipeline([('pca', PCA()), ('rus', sampler)])
    X_pipe, y_pipe = pipe.fit_resample(X, y)
    X_direct, y_direct = sampler.fit_resample(pca.fit_transform(X), y)

    # We round the value near to zero. It seems that PCA has some issue
    # with that
    for values in (X_pipe, X_direct):
        values[np.abs(values) < R_TOL] = 0

    assert_allclose(X_pipe, X_direct, rtol=R_TOL)
    assert_allclose(y_pipe, y_direct, rtol=R_TOL)
def test_rus_fit_resample_half():
    """Compare resampling with a per-class dict strategy against reference values."""
    expected_X = np.array([
        [0.92923648, 0.76103773],
        [0.47104475, 0.44386323],
        [0.92923648, 0.76103773],
        [0.15490546, 0.3130677],
        [0.15490546, 0.3130677],
        [0.15490546, 0.3130677],
        [0.20792588, 1.49407907],
        [0.15490546, 0.3130677],
        [0.12372842, 0.6536186],
    ])
    expected_y = np.array([0, 0, 0, 1, 1, 1, 1, 1, 1])

    # Request 3 samples of class 0 and 6 of class 1, drawn with replacement.
    sampler = RandomUnderSampler(
        sampling_strategy={0: 3, 1: 6},
        random_state=RND_SEED,
        replacement=True)
    X_res, y_res = sampler.fit_resample(X, Y)

    assert_array_equal(X_res, expected_X)
    assert_array_equal(y_res, expected_y)
def prepare_nn(self, n_splits=10, normalize=True, shuffle_data=True, oversample=True, undersample=False):
    """Load and prepare the data, then return a stratified K-fold splitter.

    :param n_splits: number of folds of the returned splitter
    :param normalize: when true, call ``self.normalize_data(0, 1)``
    :param shuffle_data: when true, call ``self.shuffle_data()`` and let the
        splitter shuffle as well
    :param oversample: when true, rebalance with ``RandomOverSampler``
    :param undersample: when true (and ``oversample`` is false), rebalance
        with ``RandomUnderSampler``
    :return: a ``StratifiedKFold`` instance (the data itself stays on ``self``)
    """
    self.read_data()

    # Oversampling takes precedence when both flags are set.
    if oversample:
        ros = RandomOverSampler(random_state=55)
        self.data, self.labels = ros.fit_resample(self.data, self.labels)
    elif undersample:
        rus = RandomUnderSampler(random_state=55)
        self.data, self.labels = rus.fit_resample(self.data, self.labels)

    if shuffle_data:
        self.shuffle_data()
    if normalize:
        self.normalize_data(0, 1)

    # scikit-learn rejects a random_state when shuffle is False (ValueError),
    # so the seed is only passed when the splitter actually shuffles.
    skf = StratifiedKFold(
        n_splits=n_splits,
        shuffle=shuffle_data,
        random_state=43 if shuffle_data else None,
    )
    return skf
def sample(X, y, ratio):
    """Undersample the majority class, then oversample the minority with SMOTENC.

    Params
    --------
    X (df): dataframe representing independent (non-target) variables
    y (df): dataframe representing target; must contain a ``target`` column
    ratio: value forwarded as ``sampling_strategy`` to ``RandomUnderSampler``

    Returns
    --------
    X_over (df): dataframe representing independent (non-target) variables,
        with undersampled majority/SMOTE minority
    y_over (df): dataframe representing target, with undersampled
        majority/SMOTE minority
    """
    # for sample runs, need to ensure k_neighbors is less than minority samples
    n_minority_samples = y.groupby('target').target.count()[1]
    if n_minority_samples < 5:
        # Clamp to 1: with fewer than three minority samples the original
        # expression produced 0 or a negative value, which SMOTE rejects.
        k_neighbors = max(1, int(n_minority_samples) - 2)
    else:
        k_neighbors = 5

    # under sample majority based on ratio
    undersample = RandomUnderSampler(sampling_strategy=ratio, random_state=123)
    X_under, y_under = undersample.fit_resample(X, y)

    # synthetic oversample via SMOTE; columns 0, 1, 2 and 4 are treated as
    # categorical features
    oversample = SMOTENC(
        categorical_features=[0, 1, 2, 4],
        random_state=123,
        k_neighbors=k_neighbors)
    X_over, y_over = oversample.fit_resample(X_under, y_under)
    return X_over, y_over
def test_rus_fit_resample_half():
    """Check dict-based under-sampling with replacement against stored values."""
    class_targets = {0: 3, 1: 6}
    sampler = RandomUnderSampler(
        sampling_strategy=class_targets,
        random_state=RND_SEED,
        replacement=True,
    )
    X_res, y_res = sampler.fit_resample(X, Y)

    # Reference rows: three of class 0 followed by six of class 1.
    reference_rows = [
        [0.92923648, 0.76103773],
        [0.47104475, 0.44386323],
        [0.92923648, 0.76103773],
        [0.15490546, 0.3130677],
        [0.15490546, 0.3130677],
        [0.15490546, 0.3130677],
        [0.20792588, 1.49407907],
        [0.15490546, 0.3130677],
        [0.12372842, 0.6536186],
    ]
    reference_labels = [0, 0, 0, 1, 1, 1, 1, 1, 1]

    assert_array_equal(X_res, np.array(reference_rows))
    assert_array_equal(y_res, np.array(reference_labels))
def undersample_major_class(X: np.ndarray, Y: np.ndarray, k=3):
    """
    Undersamples the majority class k times.

    Each round resamples ``(X, Y)``, shuffles both arrays in unison and
    records a mapping from ``str(row[0])`` to the row's label.

    :param X: feature rows; only the first element of each row is used as key
    :param Y: labels aligned with ``X``
    :param k: number of independent undersampling rounds
    :return: list of ``k`` dictionaries, one per round
    """
    logger.info(f'Undersampling the majority class [{k}] times.')
    sampler = RandomUnderSampler()

    rounds = []
    for _ in range(k):
        X_bal, Y_bal = sampler.fit_resample(X, Y)
        X_bal, Y_bal = unison_shuffled_copies(X_bal, Y_bal)
        # Later rows overwrite earlier ones that share the same key.
        rounds.append({str(row[0]): label for row, label in zip(X_bal, Y_bal)})
    return rounds
def tuneReducedDecisionTree():
    """Print per-model and per-fold average F1 of SVD-reduced decision trees.

    For each of 5 shuffled folds, nine models are trained; each one
    under-samples the training split with its own seed, reduces it with
    TruncatedSVD and fits a cost-complexity-pruned decision tree.
    """
    X, y = common.loadTrainingDataSet()
    folds = KFold(n_splits=5, random_state=42, shuffle=True)
    numModels = 9

    for train_index, test_index in folds.split(X):
        X_train, y_train = X[train_index], y[train_index]
        X_test, y_test = X[test_index], y[test_index]

        foldScores = []
        for modelNum in range(numModels):
            sampler = RandomUnderSampler(random_state=42 + modelNum)
            X_balanced, y_balanced = sampler.fit_resample(X_train, y_train)

            svd = TruncatedSVD(n_components=331, n_iter=7, random_state=42)
            X_reduced = svd.fit_transform(X_balanced, y_balanced)

            tree = DecisionTreeClassifier(ccp_alpha=0.015)
            tree.fit(X_reduced, y_balanced)

            # The test split is projected with the SVD fitted on this model's
            # balanced training data.
            y_pred = tree.predict(svd.transform(X_test))
            currentF1 = f1_score(y_test, y_pred)
            print("Printing F1 for model #" + str(modelNum) + " = " + str(currentF1))
            foldScores.append(currentF1)

        avgF1 = sum(foldScores, 0.0) / numModels
        print("f1 = " + str(avgF1))
def randomUnderSample(x, y, label='class', numSamplesPerClassType=None):
    """Balance ``x``/``y`` by random under-sampling and print class frequencies.

    :param x: feature DataFrame (its column names are reused for the result)
    :param y: target Series; ``value_counts()`` is used to count the classes
    :param label: column name given to the target in the balanced frame
    :param numSamplesPerClassType: optional ``{class: max_samples}`` mapping.
        When omitted, classes 1..7 are capped at 100000 samples each, which is
        what the function did before for every call.
    :return: tuple ``(x_bal, y_bal)`` of balanced features and target
    """
    if numSamplesPerClassType is None:
        # Previously this dict unconditionally replaced the argument, so a
        # caller-supplied mapping was ignored; it is now only the default.
        numSamplesPerClassType = {cls: 100000 for cls in range(1, 8)}

    print('- Balancing with random under sampling')
    print('Current x state: ', x.shape)
    x_columns = x.columns.values

    counts = y.value_counts().to_dict()
    printStr = '> Initial class freq:\n'
    for k, v in counts.items():
        printStr += '"{}" instances: [{}]\n'.format(k, v)
    print(printStr)

    # Never request more samples than a class has, and skip classes that do
    # not occur in y (indexing counts[k] for them raised KeyError before).
    classNumSamp = {
        k: min(v, counts[k])
        for k, v in numSamplesPerClassType.items()
        if k in counts
    }
    rus = RandomUnderSampler(random_state=int(time.time()), sampling_strategy=classNumSamp)
    x, y = rus.fit_resample(x, y)

    x_bal = pd.DataFrame(x, columns=x_columns)
    y_bal = pd.DataFrame(y, columns=[label])
    # fixme - working but done in a stupid way
    df = x_bal.join(y_bal)
    y_bal = df.loc[:, label]
    x_bal = df.drop(columns=[label])

    counts = y_bal.value_counts().to_dict()
    printStr = 'Balanced class freq:\n'
    for k, v in counts.items():
        printStr += '"{}" instances: [{}]\n'.format(k, v)
    print(printStr)
    print('Balanced x state: ', x_bal.shape)
    return x_bal, y_bal
def evaluate(self, train_docs, y_train, test_docs, y_test, clf_metadata, features_metadata, task='classification', return_predictions=False):
    """Train the configured classifier and return its metrics.

    ``features_metadata['sampling']`` selects optional random over- ('over')
    or under-sampling ('under') of the training set, and
    ``features_metadata['LDA']`` enables an LDA projection with
    ``features_metadata['n_components']`` components. ``return_predictions``
    is accepted but not used by this method.
    """
    clf = self.get_classifier(clf_metadata)
    X_train, X_test = self.prepare_features(features_metadata, train_docs, test_docs)

    # Optional rebalancing of the training data only.
    sampling_mode = features_metadata['sampling']
    resampler = None
    if sampling_mode == 'over':
        resampler = RandomOverSampler(random_state=0)
    elif sampling_mode == 'under':
        resampler = RandomUnderSampler(random_state=0)
    if resampler is not None:
        X_train, y_train = resampler.fit_resample(X_train, y_train)

    if features_metadata['LDA']:
        lda = LinearDiscriminantAnalysis(
            n_components=features_metadata['n_components'])
        # Both feature matrices are densified before being passed to LDA.
        dense_train = X_train.todense()
        dense_test = X_test.todense()
        lda.fit(dense_train, y_train)
        X_train = lda.transform(dense_train)
        X_test = lda.transform(dense_test)

    clf.fit(X_train, y_train)
    test_predicted = clf.predict(X_test)
    train_predicted = clf.predict(X_train)
    return self.get_metrics(clf, y_train, y_test, train_predicted,
                            test_predicted, task)
def sampling(X_train, y_train):
    """Resample the training set with five strategies and print class counts.

    :param X_train: training features
    :param y_train: training labels
    :return: tuple ``(X_over, y_over, X_under, y_under, X_tl, y_tl, X_sm,
        y_sm, X_enn, y_enn)`` for random over-sampling, random under-sampling,
        Tomek links, SMOTE and edited nearest neighbours respectively
    """
    ran_over = RandomOverSampler(random_state=42)
    X_train_oversample, y_train_oversample = ran_over.fit_resample(X_train, y_train)

    ran_under = RandomUnderSampler(random_state=42)
    X_train_undersample, y_train_undersample = ran_under.fit_resample(X_train, y_train)

    # ``fit_sample`` was a deprecated alias that imbalanced-learn removed;
    # ``fit_resample`` is used for every sampler, as the other calls here do.
    tl = TomekLinks(n_jobs=6)
    X_train_tl, y_train_tl = tl.fit_resample(X_train, y_train)

    sm = SMOTE(random_state=42, n_jobs=5)
    X_train_sm, y_train_sm = sm.fit_resample(X_train, y_train)

    enn = EditedNearestNeighbours()
    X_train_enn, y_train_enn = enn.fit_resample(X_train, y_train)

    print(np.unique(y_train, return_counts=True))
    print("after sampling")
    print("randomg over sampling")
    print(np.unique(y_train_oversample, return_counts=True))
    print("SMOTE sampling")
    print(np.unique(y_train_sm, return_counts=True))
    print("random under sampling")
    print(np.unique(y_train_undersample, return_counts=True))
    print("TomekLinks under sampling")
    print(np.unique(y_train_tl, return_counts=True))

    return (X_train_oversample, y_train_oversample,
            X_train_undersample, y_train_undersample,
            X_train_tl, y_train_tl,
            X_train_sm, y_train_sm,
            X_train_enn, y_train_enn)
def tackle_data_imbalance(self, X, Y):
    """Oversample every class with ADASYN, then apply random under-sampling.

    The per-class target is three times the total number of points divided
    by the number of classes (truncated to an integer).

    :return: the resampled ``(X, Y)`` pair
    """
    growth_factor = 3
    class_counts = Counter(Y)
    n_classes = len(class_counts)
    n_points = sum(class_counts.values())
    per_class_target = int(n_points * growth_factor / n_classes)

    # Every class gets the same target size.
    class_targets = {label: per_class_target for label in class_counts}

    X, Y = ADASYN(n_neighbors=1, sampling_strategy=class_targets).fit_resample(X, Y)
    X, Y = RandomUnderSampler(sampling_strategy="auto").fit_resample(X, Y)
    return X, Y
def main_logic(data):
    """Split ``data`` into train/test and under-sample the training majority class.

    The last column is taken as the target and ``Quantization_time`` is
    dropped. Class distributions are printed when the module-level ``debug``
    flag equals 1.
    """
    # drop quantization column
    frame = data.drop('Quantization_time', axis=1)
    features, target = frame.iloc[:, :-1], frame.iloc[:, -1]

    # split training, testing set
    X_train, X_test, y_train, y_test = train_test_split(
        features, target, test_size=0.30)

    verbose = debug == 1
    if verbose:
        # summarize class distribution
        print("Before undersampling: ", Counter(y_train))

    # define undersampling strategy, then fit and apply the transform
    sampler = RandomUnderSampler(sampling_strategy='majority')
    X_train_under, y_train_under = sampler.fit_resample(X_train, y_train)

    if verbose:
        # summarize class distribution
        print("After undersampling: ", Counter(y_train_under))
def _fit_clf(self, clf_type, dataloader):
    """Fit a KNN or SVM classifier on features extracted by ``self.net``.

    Features are extracted without gradients, the set is balanced by random
    under-sampling, and a 4-fold grid search selects the estimator stored in
    ``self.clf``; the winning parameters are appended to ``self.params_clf``.
    Any ``clf_type`` other than 'other_classifiers' or 'svm' is ignored.
    """
    if clf_type not in ('other_classifiers', 'svm'):
        return

    if clf_type == 'svm':
        clf = Pipeline(steps=[("scaler", StandardScaler()), ("clf", SVC())])
        param_grid = {"clf__C": [0.01, 0.1, 1, 10]}
    else:
        clf = KNeighborsClassifier(weights="distance")
        param_grid = {"n_neighbors": [9, 11, 13, 15]}

    print(f"Training {clf_type}")

    feature_batches, label_batches = [], []
    self.net.eval()
    with torch.no_grad():
        for images, labels in dataloader:
            images = images.to(self.device)
            label_batches.append(labels)
            feature_batches.append(self._extract_features(images, normalize=False))

    X = torch.cat(feature_batches).cpu().numpy()
    y = torch.cat(label_batches).cpu().numpy()

    # Balance the classes before model selection.
    X, y = RandomUnderSampler().fit_resample(X, y)

    search = GridSearchCV(clf, param_grid=param_grid, n_jobs=-1,
                          scoring='accuracy', cv=4)
    search.fit(X, y)

    self.clf = search.best_estimator_
    self.params_clf.append(search.best_params_)
def random_under(X,y,strategy=0.5):
    """Random undersampling of an imbalanced dataset.

    Parameters
    ----------
    X : dataframe or array
        Features of dataset
    y : dataframe or array
        Target value of dataset
    strategy : float
        Value passed to ``RandomUnderSampler`` as ``sampling_strategy``

    Returns
    -------
    X : dataframe or array
        Features of new dataset
    y : dataframe or array
        Target value of new dataset
    """
    sampler = RandomUnderSampler(sampling_strategy=strategy)
    X_res, y_res = sampler.fit_resample(X, y)
    return X_res, y_res
def run(k, j, filename, seednum=20, threshold = 0.5, resultdir=None, graphdir = f'{treedir}/'):
    """Train one-vs-others XGBoost models and return area-weighted recall/precision.

    For layer ``layers[j]`` of ``filename`` the objects whose ``geocode_2`` is
    one of the classes of interest are kept and relabelled as ``classes[k]``
    versus ``'Others'``. For every seed in ``range(seednum)`` a training set is
    drawn stratified by area quartile, balanced by random under-sampling, and an
    XGBoost model is fitted; recall and precision are weighted by object area
    and averaged over the seeds.

    :param k: index into the class list of the class modelled against the rest
    :param j: index into the module-level ``layers`` list
    :param filename: path of the file read with ``gp.read_file``
    :param seednum: number of resampling rounds (seeds 0..seednum-1)
    :param threshold: predictions above this value count as label 1
    :param resultdir: when not None, a feature-importance plot is saved there
    :param graphdir: unused by this function
    :return: ``(recall, precision)``, or ``(-1, -1)`` when the smaller of the
        two classes has fewer than 20 objects
    """
    # classes = ["P1a1" , "P1a2" , "P2b" , "P2c" ]
    classes = ["P1a1", "P1a2", "P2b", "P2c", "H1"]
    # H1 H2 O (1) P1a1 (4) P1a2 (6) P2b P2c S1a (0) S1c S2 S3

    joind = gp.read_file(filename, layer=layers[j])
    print(f'\n------\n------{layers[j]}----\n-----\n')
    joind['area'] = joind['geometry'].area  # calculate the area of each object
    df1 = pd.DataFrame(joind.drop(columns='geometry'))
    df1 = df1.replace([np.inf, -np.inf], np.nan).dropna()

    # filter only classes of interest; copy so the column assignments below
    # write to an independent frame instead of a slice of df1
    Pcl = df1.loc[df1['geocode_2'].isin(classes)].copy()
    print(Pcl['geocode_2'].value_counts())

    # regroup, geocode_2 from here on becomes binary!
    Pcl['geocode_2'] = np.where(Pcl['geocode_2'].str.contains(classes[k]), classes[k], 'Others')
    print(Pcl['geocode_2'].value_counts())
    minc = min(Pcl['geocode_2'].value_counts())

    # skip if less than 20 objects
    if minc < 20:
        print("minimum class less than 20")
        return (-1, -1)  # -1 -1 if not calculated
    print(f'total {len(df1)}, P_H1_classes: {len(Pcl)}, minimun class: {minc}')

    dropped_columns = ["geocode_2", "layer", "OBJECTID", "path", "area_c"]

    # Label encoding: classes[k] -> 0, "Others" -> 1.
    label_all = [classes[k], "Others"]
    class2idx = {label: idx for idx, label in enumerate(label_all)}

    # bootstrap and get averaged accuracy; both arrays start with a single 0
    # that does not affect the sums used for the means below
    avepre = np.zeros(1)  # store all the xgb+tree precisions in each CV
    averec = np.zeros(1)
    clf = None
    for seeds in range(seednum):
        np.random.seed(seeds)

        # 1. categorise the variable "area"; the variable "area" is kept in the
        #    data frame, strictly it can be removed.
        # 2. use groupby to sample the same amount for each area category;
        #    use 70% of the smallest category size for training, get the index
        area_quantiles = Pcl['area'].quantile([0, .25, .5, .75, 1])
        print(area_quantiles)
        # NOTE(review): pd.cut excludes the left-most edge by default, so the
        # row(s) with the minimum area get no category — confirm this is intended.
        Pcl['area_c'] = pd.cut(
            Pcl['area'],
            bins=area_quantiles.tolist(),
            labels=["q25", "q5", "q75", "Max"],
        )
        print(Pcl["area_c"].value_counts())

        train_ind = Pcl.groupby('area_c').sample(
            n=int(min(Pcl["area_c"].value_counts()) * 0.7)).index
        test_ind = Pcl[~Pcl.index.isin(train_ind)].index

        X_train0 = Pcl.loc[train_ind].drop(columns=dropped_columns)
        X_test = Pcl.loc[test_ind].drop(columns=dropped_columns)
        Y_train0 = Pcl.filter(regex='geocode_2').loc[train_ind]
        Y_test = Pcl.filter(regex='geocode_2').loc[test_ind]
        print("after sampling by area: for 2 classes,", X_train0.shape[0], X_test.shape[0])
        print(Pcl.loc[train_ind]["geocode_2"].value_counts())

        # after getting x, y train, we will use undersample to sample from each
        # class, classes[k] and others
        rus = RandomUnderSampler(random_state=1)
        X_train, Y_train = rus.fit_resample(X_train0, Y_train0)
        print("number of samples used for training:", X_train.shape[0] / 2)

        # relabel to integer classes
        Y_trainnum = cl2idx(Y_train.values, class2idx).astype(int)
        Y_testnum = cl2idx(Y_test.values, class2idx).astype(int)

        params = {'max_depth': 6, 'eta': 0.002, 'objective': 'binary:logistic', 'num_class': 1}
        clf = xgb.XGBModel(**params)
        clf.fit(X_train.values, Y_trainnum,
                eval_set=[(X_train.values, Y_trainnum), (X_test.values, Y_testnum)],
                eval_metric='logloss', verbose=True)

        yhat = clf.predict(X_test)
        # probability higher than threshold -> positive.
        yhat_labels = (yhat > threshold).astype(int)

        # Confusion-matrix cells weighted by object area.
        area = X_test["area"]
        TP = ((Y_testnum == 1) & (yhat_labels == 1)).astype(float) * area
        FP = ((Y_testnum == 0) & (yhat_labels == 1)).astype(float) * area
        FN = ((Y_testnum == 1) & (yhat_labels == 0)).astype(float) * area

        precision = np.sum(TP) / np.sum(TP + FP)
        # Recall is TP / (TP + FN); the previous code divided by TP + TN.
        recall = np.sum(TP) / np.sum(TP + FN)
        averec = np.append(averec, recall)  # store all of them
        avepre = np.append(avepre, precision)

    # mean over the seeds (the leading 0 contributes nothing to the sum)
    recall = averec.sum() / seednum
    precision = avepre.sum() / seednum
    print(averec, recall)

    if resultdir is not None and clf is not None:
        plt.rcParams.update({'font.size': 8})
        # Plot the importance of the model fitted in the last round.
        ax = xgb.plot_importance(clf, grid=False, importance_type='gain',
                                 title='Feature importance')
        ax.set_title(f'xgboost importance {layers[j]} {classes[k]}')
        fname = f"{resultdir}/P_{layers[j]}_{classes[k]}_imp"
        plt.savefig(fname, dpi=1200)
    return (recall, precision)
0: size0, 1: size1, 2: size2, 3: size3, 4: size4, 5: size5, 6: size6, 7: size7, 8: size8, 9: size9, 10: size10, 11: size11, 12: size12, 13: size13, 14: size14 } ros = RandomUnderSampler(sampling_strategy=strategy, random_state=7) X_under, y_under = ros.fit_resample(df, df['dfiscx.label']) # transformando os vetores em dataframes X_under = pd.DataFrame(X_under) y_under = pd.DataFrame(y_under) # Concatenando features e classes dataset = pd.concat([X_under, y_under]) # salvando em CSV export_csv = dataset.to_csv( r'/home/latin/export_dataframe5PercentCleanedPCA.csv', index=None, header=True) #Don't forget to add '.csv' at the end of the path
from imblearn.over_sampling import RandomOverSampler, SMOTE
from imblearn.under_sampling import RandomUnderSampler
from sklearn.model_selection import train_test_split
from catboost import CatBoostClassifier
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix

# Features: everything except the identifier and the target column.
X = df.drop(['id', 'Response'], axis=1)
# Positions of the non-float columns, passed to CatBoost as categorical.
# ``np.float`` was only an alias of the builtin ``float`` and has been removed
# from NumPy, so the builtin is used directly.
cat_var = np.where(X.dtypes != float)[0]
neg, pos = np.bincount(y)

# Rebalance: random over-sampling followed by random under-sampling.
over = RandomOverSampler(sampling_strategy=0.4)
under = RandomUnderSampler(sampling_strategy=0.8)
#smote = SMOTE(sampling_strategy = 0.4, random_state = 1)
#X, y = smote.fit_resample(X, y)
X, y = over.fit_resample(X, y)
X, y = under.fit_resample(X, y)

# NOTE(review): the split happens after resampling, so duplicated rows can
# land in both the training and the validation set — confirm this is intended.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.15, shuffle=True, stratify=y)

classifier = CatBoostClassifier()#, scale_pos_weight = neg/pos)
classifier.fit(X_train, y_train, eval_set=(X_val, y_val), cat_features=cat_var)

# Predict on the submission frame and write the id/prediction pairs.
yhat = classifier.predict(df_sub)
s = np.column_stack((case_id, yhat))
s = pd.DataFrame(s)
s.columns = ['id', 'Response']
s.to_csv("Submission.csv", index=False, index_label=None)
def func(X, y, sampling_strategy, random_state):
    """Under-sample ``(X, y)`` with the given strategy and seed.

    Returns whatever ``RandomUnderSampler.fit_resample`` returns.
    """
    sampler = RandomUnderSampler(
        random_state=random_state, sampling_strategy=sampling_strategy)
    resampled = sampler.fit_resample(X, y)
    return resampled
def func(X, y, sampling_strategy, random_state):
    """Build a ``RandomUnderSampler`` from the arguments and resample ``(X, y)``."""
    options = {
        "sampling_strategy": sampling_strategy,
        "random_state": random_state,
    }
    return RandomUnderSampler(**options).fit_resample(X, y)
3: weights[3], 4: weights[4] } over = RandomOverSampler(sampling_strategy=ratio_over, random_state=314) X_train, y_train = over.fit_resample(X_train, y_train) # undersample samples > average ratio_under = { 0: average_samples, 1: average_samples, 2: average_samples, 3: average_samples, 4: average_samples } under = RandomUnderSampler(sampling_strategy=ratio_under, random_state=314) X_train, y_train = under.fit_resample(X_train, y_train) # OUD: Maak class weights voor class imbalance #label_integers =np.argmax(labels_as_array, axis=1) #class_weights = compute_class_weight('balanced', np.unique(label_integers), label_integers) #d_class_weights = dict(enumerate(class_weights)) #print(d_class_weights) # Hieronder is voor parameters testen # Maak model print(type(X_train)) print(type(y_train)) estimator = KerasClassifier(build_fn=baseline_model, epochs=40, batch_size=20, verbose=1)
from imblearn.under_sampling import RandomUnderSampler

dataset2.rename(columns={"500_Buggy?": "Buggy"}, inplace=True)

# separate the data from the target attributes
# remove unnecessary features
#test_data = test_data.drop('File', axis=1)
test_data = dataset2.drop(
    ['change_id', '411_commit_time', '412_full_path'], axis=1)

# the labels of test data
test_target = dataset2.Buggy
#print(test_target)

# Balance the training and the test sets with the same seeded sampler.
rus = RandomUnderSampler(random_state=0)
X_resampled, y_resampled = rus.fit_resample(train_data, train_target)
test_data_resampled, test_target_resampled = rus.fit_resample(
    test_data, test_target)

clf = LogisticRegression(warm_start=True, max_iter=1000000000)
clf.fit(X_resampled, y_resampled)
test_pred = clf.predict(test_data_resampled)

# Write the classification report and close the report file.
report_text = classification_report(
    test_target_resampled, test_pred, labels=[0, 1])
file.write(report_text)
file.write("\n")
file.close()
# Pad every split to the same length `maxlen`.
train_X = pad_sequences(train_X, maxlen=maxlen)
test_X = pad_sequences(test_X, maxlen=maxlen)
val_X = pad_sequences(val_X, maxlen=maxlen)
# Bare expressions: they have no effect when run as a script
# (presumably notebook cells used for inspection — confirm).
len(train_X)
train_X[110]
"""## **Undersampling**"""
from collections import Counter
from sklearn.datasets import make_classification
from imblearn.under_sampling import RandomUnderSampler
# Randomly under-sample the training set and print the resulting class counts.
# NOTE(review): the result is stored in X_resampled / y_resampled; train_X
# itself is not replaced in this excerpt.
rus = RandomUnderSampler(random_state=0)
X_resampled, y_resampled = rus.fit_resample(train_X, train_y)
print(sorted(Counter(y_resampled).items()))
train_X.shape
train_X[110]
embed_size = 100 # how big is each word vector
S_DROPOUT = 0.4
DROPOUT = 0.1
def plotting(history):
    """Plot the 'acc' and 'val_acc' series stored in ``history.history``."""
    plt.plot(history.history['acc'])
    plt.plot(history.history['val_acc'])
class ModelDataset(Dataset):
    """
    The dataset class responsible for loading the data and providing the samples
    for training. It also implements the loss and metric computations used by
    the training, validation, test and benchmark steps.

    Several attributes read by the methods (``dates``, ``input``, ``output``,
    ``smos_input``, ``mask``, ``bin_centers``, ``loss_factors``, ``model``) are
    not assigned in this class; presumably subclasses or callers set them.

    :param Dataset: Base Dataset class to use with PyTorch models
    :type Dataset: torch.utils.data.Dataset
    """

    def __init__(
        self,
        out_var=None,
        out_mean=None,
        forecast_dir=None,
        forcings_dir=None,
        reanalysis_dir=None,
        transform=None,
        hparams=None,
        **kwargs,
    ):
        """
        Constructor for the ModelDataset class

        :param out_var: Variance of the output variable, defaults to None
        :type out_var: float, optional
        :param out_mean: Mean of the output variable, defaults to None
        :type out_mean: float, optional
        :param forecast_dir: The directory containing the FWI-Forecast data,
            defaults to None (not used in this constructor)
        :type forecast_dir: str, optional
        :param forcings_dir: The directory containing the FWI-Forcings data,
            defaults to None (not used in this constructor)
        :type forcings_dir: str, optional
        :param reanalysis_dir: The directory containing the FWI-Reanalysis data,
            defaults to None (not used in this constructor)
        :type reanalysis_dir: str, optional
        :param transform: Custom transform for the input variable, defaults to
            None (not used here; ``self.transform`` is built from ``hparams``)
        :type transform: torch.transforms, optional
        :param hparams: Holds configuration values, defaults to None. It is
            dereferenced unconditionally, so None is not actually usable.
        :type hparams: Namespace, optional
        """
        self.hparams = hparams
        self.out_mean = out_mean
        self.out_var = out_var
        # Accuracy threshold: half of the configured `out_mad` value.
        self.hparams.thresh = self.hparams.out_mad / 2
        if self.hparams.binned:
            self.bin_intervals = self.hparams.binned

        # Mean of output variable used for bias-initialization.
        # A falsy argument (None or 0) falls back to the hparams value.
        self.out_mean = out_mean if out_mean else self.hparams.out_mean
        # Variance of output variable used to scale the training loss.
        self.out_var = out_var if out_var else self.hparams.out_var

        # Convert string dates to numpy format
        if self.hparams.date_range:
            self.hparams.date_range = [
                np.datetime64(d) for d in self.hparams.date_range
            ]
        # Convert case-study dates to numpy format
        if (
            hasattr(self.hparams, "case_study_dates")
            and self.hparams.case_study_dates
            and not self.hparams.date_range
        ):
            self.hparams.case_study_dates = [
                [np.datetime64(d) for d in r] for r in self.hparams.case_study_dates
            ]
        # If custom date range specified, override
        else:
            self.hparams.case_study_dates = None

        # Create imbalanced-learn random subsampler
        if self.hparams.undersample:
            self.undersampler = RandomUnderSampler()

        if not self.hparams.benchmark:
            # Input transforms including mean and std normalization
            self.transform = transforms.Compose(
                [
                    transforms.ToTensor(),
                    # Mean and standard deviation stats used to normalize the input data
                    # to the mean of zero and standard deviation of one.
                    # One (rh, t2, tp, wspeed) group per input day, optionally
                    # followed by one SMOS value per input day.
                    transforms.Normalize(
                        [
                            x
                            for i in range(self.hparams.in_days)
                            for x in (
                                self.hparams.inp_mean["rh"],
                                self.hparams.inp_mean["t2"],
                                self.hparams.inp_mean["tp"],
                                self.hparams.inp_mean["wspeed"],
                            )
                        ]
                        + (
                            [
                                self.hparams.smos_mean
                                for i in range(self.hparams.in_days)
                            ]
                            if self.hparams.smos_input
                            else []
                        ),
                        [
                            x
                            for i in range(self.hparams.in_days)
                            for x in (
                                self.hparams.inp_std["rh"],
                                self.hparams.inp_std["t2"],
                                self.hparams.inp_std["tp"],
                                self.hparams.inp_std["wspeed"],
                            )
                        ]
                        + (
                            [self.hparams.smos_std for i in range(self.hparams.in_days)]
                            if self.hparams.smos_input
                            else []
                        ),
                    ),
                ]
            )

    def __len__(self):
        """
        The internal method used to obtain the number of iteration samples.

        :return: The number of entries in ``self.dates``.
        :rtype: int
        """
        return len(self.dates)

    def __getitem__(self, idx):
        """
        Internal method used by pytorch to fetch input and corresponding output
        tensors.

        In benchmark mode the input is the first data variable of ``self.input``
        for each lead day, resized to the shape of the output field. Otherwise
        the input stacks the rh/t2/tp/wspeed fields of the current and previous
        ``in_days - 1`` days (plus optional SMOS fields) and normalizes them.

        :param idx: The index number of data sample.
        :type idx: int
        :return: Pair of input and output tensors for the sample
        :rtype: tuple
        """
        if torch.is_tensor(idx):
            idx = idx.tolist()

        if self.hparams.benchmark:
            X = torch.from_numpy(
                np.stack(
                    [
                        resize(
                            self.input[list(self.input.data_vars)[0]]
                            .sel(time=[self.dates[idx]], lead=[i])
                            .values.squeeze(),
                            self.output[list(self.output.data_vars)[0]][0].shape,
                        )
                        for i in range(self.hparams.out_days)
                    ],
                    axis=0,
                )
            )
        else:
            X = self.transform(
                np.stack(
                    [
                        self.input[v]
                        .sel(time=[self.dates[idx] - np.timedelta64(i, "D")])
                        .values.squeeze()
                        for i in range(self.hparams.in_days)
                        for v in ["rh", "t2", "tp", "wspeed"]
                    ]
                    + (
                        [
                            resize(
                                np.nan_to_num(
                                    self.smos_input[list(self.smos_input.data_vars)[0]]
                                    .sel(
                                        time=[self.dates[idx] - np.timedelta64(i, "D")],
                                        method="nearest",
                                    )
                                    .values.squeeze()[::-1],
                                    copy=False,
                                    # Use 50 as the placeholder for water bodies
                                    nan=50,
                                ),
                                self.input.rh[0].shape,
                            )
                            for i in range(self.hparams.in_days)
                        ]
                        if self.hparams.smos_input
                        else []
                    ),
                    axis=-1,
                )
            )

        # Target: first output data variable for the current and following days.
        y = torch.from_numpy(
            np.stack(
                [
                    self.output[list(self.output.data_vars)[0]]
                    .sel(time=[self.dates[idx] + np.timedelta64(i, "D")])
                    .values.squeeze()
                    for i in range(self.hparams.out_days)
                ],
                axis=0,
            )
        )
        return X, y

    def get_cb_loss_factor(self, y):
        """
        Compute the Class-Balanced loss factor mask using output value frequency
        distribution and the supplied beta factor.

        Each value of ``y`` is assigned the entry of ``self.loss_factors`` that
        belongs to its nearest ``self.bin_centers`` value.

        :param y: The 1D ground truth value tensor
        :type y: torch.tensor
        :return: Tensor shaped like ``y`` holding the per-element loss factor
        :rtype: torch.tensor
        """
        # Index of the closest bin center for every element of y.
        idx = (
            (
                y.unsqueeze(0).expand(self.bin_centers.shape[0], -1)
                - self.bin_centers.unsqueeze(-1).expand(-1, y.shape[0])
            )
            .abs()
            .argmin(dim=0)
        )
        loss_factor = torch.empty_like(y)
        for i in range(self.bin_centers.shape[0]):
            loss_factor[idx == i] = self.loss_factors[i]
        return loss_factor

    def apply_mask(self, *y_list):
        """
        Returns batch_size x channels x N sized matrices after applying the mask.

        :param *y_list: The iterable of tensors to be masked
        :type y_list: torch.Tensor
        :return: The list of masked tensors
        :rtype: list(torch.Tensor)
        """
        # assumes each tensor is (batch, channels, H, W) — TODO confirm
        return [
            y.permute(-2, -1, 0, 1)[self.mask.expand_as(y[0][0])].permute(-2, -1, 0)
            for y in y_list
        ]

    def get_loss(self, y, y_hat):
        """
        Do the applicable processing and return the loss for the supplied
        prediction and the label tensors.

        Depending on ``hparams`` the values are under-sampled, filtered by
        ``round_to_zero`` and ``clip_output``, and Box-Cox transformed before the
        mean squared error is computed.

        :param y: Label tensor
        :type y: torch.Tensor
        :param y_hat: Predicted tensor
        :type y_hat: torch.Tensor
        :return: Prediction loss
        :rtype: torch.Tensor
        """
        if self.hparams.undersample:
            # Only the values below the `undersample` threshold are resampled.
            sub_mask = y < self.hparams.undersample
            subval = y[sub_mask]
            low = max(subval.min(), 0.5)
            high = subval.max()
            boundaries = torch.arange(low, high, (high - low) / 10).to(
                self.model.device
            )
            freq_idx = torch.bucketize(subval, boundaries[:-1], right=False)
            # The bucket boundary of each value (scaled to int) acts as its
            # class label; only the selected indices are used afterwards.
            self.undersampler.fit_resample(
                subval.cpu().unsqueeze(-1),
                (boundaries.take(index=freq_idx).cpu() * 100).int(),
            )
            idx = self.undersampler.sample_indices_
            y = torch.cat((y[~sub_mask], subval[idx]))
            y_hat = torch.cat((y_hat[~sub_mask], y_hat[sub_mask][idx]))

        if self.hparams.round_to_zero:
            # Keep only the samples whose label exceeds the threshold.
            y_hat = y_hat[y > self.hparams.round_to_zero]
            y = y[y > self.hparams.round_to_zero]

        if self.hparams.clip_output:
            # Keep only labels strictly inside (clip_output[0], clip_output[-1]).
            y_hat = y_hat[
                (y < self.hparams.clip_output[-1]) & (self.hparams.clip_output[0] < y)
            ]
            y = y[
                (y < self.hparams.clip_output[-1]) & (self.hparams.clip_output[0] < y)
            ]

        if self.hparams.cb_loss:
            # NOTE(review): computed but not applied — the lines that would use
            # it are commented out below.
            loss_factor = self.get_cb_loss_factor(y)

        if self.hparams.boxcox:
            y = torch.from_numpy(boxcox(y.cpu(), lmbda=self.hparams.boxcox,)).to(
                y.device
            )

        pre_loss = (y_hat - y) ** 2
        # if "loss_factor" in locals():
        #     pre_loss *= loss_factor
        loss = pre_loss.mean()
        # NaN is the only value that is not equal to itself.
        assert loss == loss
        return loss

    def training_step(self, model, batch):
        """
        Called inside the training loop with the data from the training
        dataloader passed in as `batch`.

        :param model: The chosen model
        :type model: Model
        :param batch: Pair of input and ground truth tensors
        :type batch: tuple
        :return: Loss and logs
        :rtype: dict
        """
        # forward pass
        x, y_pre = batch
        y_hat_pre = model(x)
        y_pre, y_hat_pre = self.apply_mask(y_pre, y_hat_pre)
        assert y_pre.shape == y_hat_pre.shape
        tensorboard_logs = defaultdict(dict)
        for b in range(y_pre.shape[0]):
            for c in range(y_pre.shape[1]):
                loss = self.get_loss(y_pre[b][c], y_hat_pre[b][c])
                # Keyed by channel only, so later batch entries overwrite
                # earlier ones for the same channel.
                tensorboard_logs["train_loss_unscaled"][str(c)] = loss
        loss = torch.stack(
            list(tensorboard_logs["train_loss_unscaled"].values())
        ).mean()
        tensorboard_logs["_train_loss_unscaled"] = loss
        # model.logger.log_metrics(tensorboard_logs)
        return {
            "loss": loss.true_divide(model.data.out_var * self.hparams.out_days),
            "_log": tensorboard_logs,
        }

    def validation_step(self, model, batch):
        """
        Called inside the validation loop with the data from the validation
        dataloader passed in as `batch`.

        :param model: The chosen model
        :type model: Model
        :param batch: Pair of input and ground truth tensors
        :type batch: tuple
        :return: Loss and logs
        :rtype: dict
        """
        # forward pass
        x, y_pre = batch
        y_hat_pre = model(x)
        y_pre, y_hat_pre = self.apply_mask(y_pre, y_hat_pre)
        assert y_pre.shape == y_hat_pre.shape
        tensorboard_logs = defaultdict(dict)
        for b in range(y_pre.shape[0]):
            for c in range(y_pre.shape[1]):
                y, y_hat = y_pre[b][c], y_hat_pre[b][c]
                loss = self.get_loss(y, y_hat)
                # Accuracy for a threshold
                abs_diff = (y - y_hat).abs()
                acc = (abs_diff < self.hparams.thresh).float().mean()
                mae = abs_diff.mean()
                tensorboard_logs["val_loss"][str(c)] = loss
                tensorboard_logs["acc"][str(c)] = acc
                tensorboard_logs["mae"][str(c)] = mae
        val_loss = torch.stack(list(tensorboard_logs["val_loss"].values())).mean()
        tensorboard_logs["_val_loss"] = val_loss
        # model.logger.log_metrics(tensorboard_logs)
        return {
            "val_loss": val_loss,
            "log": tensorboard_logs,
        }

    def inference_step(self, y_pre, y_hat_pre):
        """
        Run inference for the target and predicted values and return the loss and
        the metrics values as logs.

        :param y_pre: Label values
        :type y_pre: torch.Tensor
        :param y_hat_pre: Predicted value
        :type y_hat_pre: torch.Tensor
        :return: Loss and the log dictionary, or None when a masked label tensor
            is empty
        :rtype: tuple
        """
        y_pre, y_hat_pre = self.apply_mask(y_pre, y_hat_pre)
        tensorboard_logs = defaultdict(dict)
        for b in range(y_pre.shape[0]):
            for c in range(y_pre.shape[1]):
                y = y_pre[b][c]
                y_hat = y_hat_pre[b][c]
                if self.hparams.boxcox and not self.hparams.benchmark:
                    # Negative predictions give NaN after inverse-boxcox
                    y_hat[y_hat < 0] = 0
                    y_hat = torch.from_numpy(
                        inv_boxcox(y_hat.cpu().numpy(), self.hparams.boxcox)
                    ).to(y_hat.device)
                # NOTE(review): callers in this class unpack the result into
                # two names, which fails on None — confirm this path is handled.
                if not y.numel():
                    return None
                pre_loss = (y_hat - y) ** 2
                # Mean squared error restricted to labels in (low, high].
                loss = lambda low, high: pre_loss[(y > low) & (y <= high)].mean()
                # NaN is the only value that is not equal to itself.
                assert loss(y.min(), y.max()) == loss(y.min(), y.max())
                # Accuracy for a threshold
                acc = (
                    lambda low, high: (
                        (y - y_hat)[(y > low) & (y <= high)].abs() < self.hparams.thresh
                    )
                    .float()
                    .mean()
                )
                # Mean absolute error
                mae = (
                    lambda low, high: (y - y_hat)[(y > low) & (y <= high)]
                    .abs()
                    .float()
                    .mean()
                )
                tensorboard_logs["mse"][str(c)] = loss(y.min(), y.max())
                tensorboard_logs["acc"][str(c)] = acc(y.min(), y.max())
                tensorboard_logs["mae"][str(c)] = mae(y.min(), y.max())

                # Inference on binned values
                if self.hparams.binned:
                    for i in range(len(self.bin_intervals) - 1):
                        low, high = (
                            self.bin_intervals[i],
                            self.bin_intervals[i + 1],
                        )
                        tensorboard_logs[f"mse_{low}_{high}"][str(c)] = loss(low, high)
                        tensorboard_logs[f"acc_{low}_{high}"][str(c)] = acc(low, high)
                        tensorboard_logs[f"mae_{low}_{high}"][str(c)] = mae(low, high)
                    # Open-ended last bin: from the last interval edge up to
                    # the largest label.
                    tensorboard_logs[f"mse_{self.bin_intervals[-1]}inf"][str(c)] = loss(
                        self.bin_intervals[-1], y.max()
                    )
                    tensorboard_logs[f"acc_{self.bin_intervals[-1]}inf"][str(c)] = acc(
                        self.bin_intervals[-1], y.max()
                    )
                    tensorboard_logs[f"mae_{self.bin_intervals[-1]}inf"][str(c)] = mae(
                        self.bin_intervals[-1], y.max()
                    )
        inference_loss = torch.stack(list(tensorboard_logs["mse"].values())).mean()
        tensorboard_logs["_inference_loss"] = inference_loss
        return inference_loss, tensorboard_logs

    def test_step(self, model, batch):
        """
        Called inside the testing loop with the data from the testing dataloader
        passed in as `batch`.

        :param model: The chosen model
        :type model: Model
        :param batch: Pair of input and ground truth tensors
        :type batch: tuple
        :return: Loss and logs
        :rtype: dict
        """
        x, y_pre = batch
        y_hat_pre = model(x)
        test_loss, tensorboard_logs = self.inference_step(y_pre, y_hat_pre)
        return {
            "mse": test_loss,
            "log": tensorboard_logs,
        }

    def benchmark_step(self, batch):
        """
        Called inside the testing loop with the data from the testing dataloader
        passed in as `batch`. No model is involved: the first element of the
        batch is used directly as the prediction.

        :param batch: Pair of prediction and ground truth tensors
        :type batch: tuple
        :return: Loss and logs
        :rtype: dict
        """
        y_hat_pre, y_pre = batch
        benchmark_loss, tensorboard_logs = self.inference_step(y_pre, y_hat_pre)
        return {
            "mse": benchmark_loss,
            "log": tensorboard_logs,
        }
if __name__=='__main__': ty,auc_thres,k = str(args.type),float(args.threshold),int(args.num) # load the data with open(f'data_{ty}.pkl', 'rb') as f: data = pickle.load(f) X_train,y_train,X_test,y_test = data['X_train'],data['y_train'],data['X_test'],data['y_test'] print(X_train.shape,y_train.shape,X_test.shape,y_test.shape) print(np.sum(y_train)) pool = ['SVM'] # train base models models,i = [],1 while True: # use RandomUnderSampler to sample rus = RandomUnderSampler(random_state=random.randint(1000,9999)) X_resampled, y_resampled = rus.fit_resample(X_train, y_train) # train locModel = random.choice(pool) clf = get_base_model(locModel) clf.fit(X_resampled, y_resampled) # predict ypro_pre = clf.predict_proba(X_test) y_pre = ypro_pre.argmax(axis=1).reshape(-1,1) # evaluate acc = (sum(y_pre==y_test)/len(y_pre))[0] fpr, tpr, thresholds = metrics.roc_curve(y_test, ypro_pre[:, 1]) auc = metrics.auc(fpr, tpr) pre = metrics.precision_score(y_test, ypro_pre[:, 1]>0.5) rec = metrics.recall_score(y_test, ypro_pre[:, 1]>0.5) f1 = metrics.f1_score(y_test, ypro_pre[:, 1]>0.5) print(f'Base Model {i}: {locModel}')
def preprocessing(betas,
                  labels,
                  cpg_sites,
                  index,
                  threshold_to_drop=0.1,
                  test_size=0.3,
                  sampling_strategy=0.5,
                  fill_na_strategy='knn',
                  smote=False,
                  undersample=False,
                  train_test=True):
    """Clean, impute and optionally split / scale / rebalance a dataset.

    Steps, in order: drop rows whose label is NaN, drop sparse columns and
    rows, fill the remaining NaN values, then either build a DataFrame or
    split into train and test, standardize, and rebalance the training part.

    Parameters
    ----------
    betas : 2-D array with one row per entry of ``index`` and one column per
        entry of ``cpg_sites``.
    labels : array of labels aligned with the rows of ``betas``; NaN marks a
        missing label.
    cpg_sites : column names of ``betas``.
    index : row identifiers of ``betas``.
    threshold_to_drop : missing-value fraction passed to ``drop_columns``.
    test_size : fraction of rows held out for the test set.
    sampling_strategy : with ``smote``, every class is raised to at least
        ``int(sampling_strategy * majority_count)`` samples; with
        ``undersample``, the value is passed to ``RandomUnderSampler``.
    fill_na_strategy : ``'knn'`` uses ``fill_remaining_na``, ``'simple'`` uses
        a mean ``SimpleImputer``, anything else fills with zeros.
    smote : over-sample the training set with SMOTE (wins over
        ``undersample`` when both are set).
    undersample : randomly under-sample the training set.
    train_test : when False, skip splitting and return a DataFrame.

    Returns
    -------
    When ``train_test`` is True, the tuple ``(X_train_res, X_test_scaled,
    y_train_res, y_test, labels, cpg_sites)``; otherwise a DataFrame of the
    cleaned ``betas`` with an integer ``'label'`` column.
    """
    print("=== Drop Columns and Rows ===")

    # Dropping rows for which label is NA
    idx_to_delete = np.where(np.isnan(labels))[0]
    print(f"Dropping {idx_to_delete.shape[0]} because of missing labels")
    labels = np.delete(labels, idx_to_delete)
    betas = np.delete(betas, idx_to_delete, axis=0)
    index = np.delete(index, idx_to_delete)
    print(f"New Shape = {betas.shape}")

    # Dropping columns
    percent_threshold = threshold_to_drop * 100
    print(
        f"Dropping columns which have more than {percent_threshold:.0f}% of values missing"
    )
    betas, cpg_sites = drop_columns(betas,
                                    cpg_sites,
                                    threshold=threshold_to_drop)

    # Dropping rows
    # NOTE(review): drop_rows is called without threshold_to_drop, so the
    # percentage printed here only holds if its default matches - confirm.
    print(
        f"\nDropping rows which have more than {percent_threshold:.0f}% of values missing"
    )
    betas, labels, index = drop_rows(betas, labels, index)

    # Filling remaining NA Values
    print("\n=== Fill remaining NAs ===")
    # Counted before imputation so the report reflects what was filled.
    nb_nan = np.sum(np.sum(np.isnan(betas), axis=1), axis=0)
    if fill_na_strategy == 'knn':
        print("Filling remaining NA values using a KNNImputer")
        betas = fill_remaining_na(betas)
    elif fill_na_strategy == 'simple':
        # The message now names the strategy actually used ('mean').
        print("Filling remaining NA values using a Simple Mean Imputer")
        imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
        betas = imputer.fit_transform(betas)
    else:
        print("Filling remaining NAs with zeros")
        nan_idx = np.where(np.isnan(betas))
        betas[nan_idx] = 0
    print(
        f"{nb_nan} NA were filled, i.e. approximately {nb_nan / betas.shape[0]:.2f} per rows"
    )

    if train_test:
        print("\n=== Train / Test Split ===")
        print("Splitting dataset into train and test")
        print(f"Train = {100 - test_size * 100:.0f} %")
        print(f"Test = {test_size * 100:.0f} %")
        X_train, X_test, y_train, y_test = train_test_split(
            betas, labels, test_size=test_size, random_state=123)

        print("\n=== Standardize dataset ===")
        # The scaler is fitted on the training part only and then applied to
        # both parts.
        scaler = StandardScaler().fit(X_train)
        X_train_scaled = scaler.transform(X_train)
        X_test_scaled = scaler.transform(X_test)
        print(
            f"The average of column mean on train is {np.mean(np.mean(X_train_scaled, axis=1), axis=0):.2f}"
        )
        print(
            f"The average of column mean on test is {np.mean(np.mean(X_test_scaled, axis=1), axis=0):.2f}"
        )

        if smote:
            print("\n=== Balance dataset with oversample ===")
            # Computing multi-class ratio
            unique, count = np.unique(y_train, return_counts=True)
            print(list(zip(unique, count)))
            m = max(count)
            majority_class = unique[np.argmax(count)]
            # Every class will be oversampled to (ratio) * #observations in majority class
            # Except the majority class which is left as is
            resampling_strategy = {
                k: max(c, int(sampling_strategy * m))
                for (k, c) in zip(unique, count)
            }
            resampling_strategy[majority_class] = m
            print(
                f"The resampling_strategy gives the following repartition {resampling_strategy}"
            )
            sm = SMOTE(random_state=123,
                       sampling_strategy=resampling_strategy)
            # fit_resample replaces the removed fit_sample alias and matches
            # the under-sampling branch below.
            X_train_res, y_train_res = sm.fit_resample(X_train_scaled, y_train)
            print(
                f"{X_train_res.shape[0] - X_train_scaled.shape[0]} rows were added in the training data"
            )
        elif undersample:
            print("=== Balance dataset with undersample ===")
            print(
                f"The resampling_strategy gives the following repartition {sampling_strategy}"
            )
            under_sampling = RandomUnderSampler(
                sampling_strategy=sampling_strategy)
            X_train_res, y_train_res = under_sampling.fit_resample(
                X_train_scaled, y_train)
        else:
            X_train_res = X_train_scaled
            y_train_res = y_train

        return X_train_res, X_test_scaled, y_train_res, y_test, labels, cpg_sites
    else:
        df = DataFrame(betas, columns=cpg_sites, index=index)
        df['label'] = labels
        df['label'] = df['label'].astype(int)
        return df
# NOTE(review): the next three statements are the tail of a function that
# starts before this snippet (presumably model_accuracies, given the keyword
# arguments used below); they are reproduced unchanged.
# NOTE(review): the scores compare y_pred with the first df_test_len rows of
# y_label - confirm those rows really correspond to the predicted samples.
print("recall_score : ", recall_score(y_label.iloc[:df_test_len], y_pred, average="macro"))
print("\nAccuracy Score :",accuracy_score(y_pred,y_label.iloc[:df_test_len]))
roc_curve_plots(y_label.iloc[:df_test_len],y_pred,X_test,model)

"""## Use techniques like undersampling or oversampling before running Naïve Bayes, Logistic Regression or SVM. * ### Oversampling or undersampling can be used to tackle the class imbalance problem * ### Oversampling increases the prior probability of imbalanced class and in case of other classifiers, error gets multiplied as the low-proportionate class is mimicked multiple times. """

# Build one over-sampled and one under-sampled copy of (x, y).
ros = RandomOverSampler(random_state=42)
X_ros, y_ros = ros.fit_resample(x, y)
print(X_ros.shape, y_ros.shape)

rus = RandomUnderSampler(random_state=42)
X_rus, y_rus = rus.fit_resample(x, y)
print(X_rus.shape, y_rus.shape)

# Over-sampled data: Gaussian Naive Bayes, then Logistic Regression.
gnb = GaussianNB()
df_test_len = df_test.shape[0]
# print(df_test.sample(n=df_test_len))
model_accuracies(model = gnb, x_feature=X_ros,y_label= y_ros, X_test=df_test, df_test_len=df_test_len)

lr = LogisticRegression(class_weight ='balanced')
model_accuracies(model = lr, x_feature=X_ros,y_label= y_ros,X_test=df_test,df_test_len=df_test.shape[0])

# Under-sampled data.
# NOTE(review): 788 is hard-coded here while the calls above use
# df_test.shape[0] - confirm where this number comes from.
gnb = GaussianNB()
model_accuracies(model = gnb, x_feature=X_rus,y_label= y_rus, X_test=df_test.iloc[:788],df_test_len = 788)

lr = LogisticRegression(class_weight ='balanced')
# # ``sampling_strategy`` can be given a ``float``. For **under-sampling # methods**, it corresponds to the ratio :math:`\\alpha_{us}` defined by # :math:`N_{rM} = \\alpha_{us} \\times N_{m}` where :math:`N_{rM}` and # :math:`N_{m}` are the number of samples in the majority class after # resampling and the number of samples in the minority class, respectively. # select only 2 classes since the ratio make sense in this case binary_mask = np.bitwise_or(y == 0, y == 2) binary_y = y[binary_mask] binary_X = X[binary_mask] sampling_strategy = 0.8 rus = RandomUnderSampler(sampling_strategy=sampling_strategy) X_res, y_res = rus.fit_resample(binary_X, binary_y) print('Information of the iris data set after making it ' 'balanced using a float and an under-sampling method: \n ' 'sampling_strategy={} \n y: {}' .format(sampling_strategy, Counter(y_res))) plot_pie(y_res) ############################################################################### # For **over-sampling methods**, it correspond to the ratio # :math:`\\alpha_{os}` defined by :math:`N_{rm} = \\alpha_{os} \\times N_{M}` # where :math:`N_{rm}` and :math:`N_{M}` are the number of samples in the # minority class after resampling and the number of samples in the majority # class, respectively. ros = RandomOverSampler(sampling_strategy=sampling_strategy) X_res, y_res = ros.fit_resample(binary_X, binary_y)
from imblearn.combine import SMOTEENN, SMOTETomek
from imblearn.pipeline import make_pipeline
from collections import Counter
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler

# define oversampling strategy
# NOTE(review): SMOTE is not among the imports visible in this snippet -
# confirm it is imported earlier in the file.
# Dict form: class label -> number of samples requested after resampling.
sm = SMOTE(sampling_strategy={3: 20000, 2: 20000}, random_state=1)
X_ov, Y_ov = sm.fit_resample(X_train, Y_train)
print(Counter(Y_ov))

# Under-sample classes 0 and 1 of the SMOTE output to 20000 samples each.
under = RandomUnderSampler(sampling_strategy={ 0: 20000, 1: 20000 }, random_state=1)
X_new, Y_new = under.fit_resample(X_ov, Y_ov)
print(Counter(Y_new))

# oversample = RandomOverSampler(sampling_strategy=0.1, random_state=1)
# X_new, Y_new = oversample.fit_resample(X_ov, Y_ov)

# # ------------------------------------------------------------- #
# # ---------------------- Encoding Data ------------------------- #
# # ------------------------------------------------------------- #
# Prepare Y values for one-hot encoding
from sklearn.preprocessing import LabelEncoder
# NOTE(review): np_utils may not exist in recent Keras releases - confirm
# against the pinned Keras version.
from keras.utils import np_utils

# encode class values as integers
encoder = LabelEncoder()
# Get data df_train, df_test = GetData().get() # Feature Engineering of df_train myFE = FeatEng() df_train = myFE.fit_transform(df_train) X_train = np.array(df_train.drop(['Y'], axis=1)) y_train = np.array(df_train[['Y']]).reshape(-1, ) # Oversampling and Undersampling oversample = RandomOverSampler(sampling_strategy=1) undersample = RandomUnderSampler(sampling_strategy=1) X_over, y_over = oversample.fit_resample(X_train, y_train) X_under, y_under = undersample.fit_resample(X_train, y_train) # Hyperparameter tuning params_dir = os.path.join(os.path.dirname(os.getcwd()), "params") # Direct myModel = OurModel() models_tune = TuningHyperparameters(myModel.models, myModel.params_to_tune) models_tune.fit(X_train, y_train) print(models_tune.get_best_params()) choosen_params = models_tune.get_best_params() with open(os.path.join(params_dir, 'hyperparameters_direct.json'), 'w') as f: json.dump(choosen_params, f) # Oversampling myModel = OurModel()
# Stratify: ensures same proportion of samples to be present in y_train and y_test y_train.value_counts(normalize=True) y_test.value_counts(normalize=True) # ============================================================================= # Sampling # ============================================================================= # sampling only Training data # sampling strategies # Random Under Sampling : reduces majority class to match minority class # Random Over Sampling : increases minority class to match majority class # SMOTE : increases minority class to match majority class by creating synthetic samples # and many more # Random Under Sampling rus = RandomUnderSampler(sampling_strategy=0.3) X_rus, y_rus = rus.fit_resample(X_train, y_train) print( f'sampled trained data percentage:\n{y_rus.value_counts(normalize =True)}') print(f'sampled trained data count:\n{y_rus.value_counts()}') # ============================================================================= # Feature Selection # ============================================================================= # Quick Shortlisting variables Strategy # 1. Removing constant variables : standar deviation = 0 # 2. Removing Quasi constant variables # 3. Removing columns with High precentage of missing value # 4. Removing Highly Correlated Variables # 5. Removing Low Univariate ROC-AUC curve ( cut-off - 50% or 55%) # 1. Constant Features
def prep(df):
    """Sample, clean, encode, rebalance, scale and split ``df``.

    Relies on the module-level names ``report_file``, ``class_name`` and
    ``test_rate``. Returns ``(x_train, x_test, y_train, y_test)``.
    """
    # Keep at most roughly 2400 rows.
    sample_fraction = 2400 / df.shape[0]
    sample_fraction = 1 if sample_fraction > 1 else sample_fraction
    report_file.write("data portion used: " + str(sample_fraction) + "\n")
    df = df.sample(frac=sample_fraction, replace=False, random_state=0)

    # Remove duplicated rows, then rows containing missing values.
    df = df.drop_duplicates().dropna(axis=0)

    # One-hot encode every object-typed column except the class column,
    # dropping the first dummy of each.
    categorical_cols = [
        col for col in list(df.head(0))
        if df[col].dtype == 'O' and col != class_name
    ]
    dummy_frames = [
        pd.get_dummies(df[col], prefix=col).iloc[:, 1:]
        for col in categorical_cols
    ]
    if dummy_frames:
        encoded = pd.concat(dummy_frames, axis=1)
        df = pd.concat([df.drop(categorical_cols, axis=1), encoded], axis=1)

    print(df.shape)
    report_file.write("data size: " + str(df.shape) + "\n")

    # Share of the most frequent class.
    class_counts = df[class_name].value_counts()
    majority_share = max(class_counts) / sum(class_counts)
    print(majority_share)
    report_file.write("portion of class: " + str(majority_share) + "\n")

    # Separate the feature set from the class column.
    y_all = df[class_name]
    X_all = df.drop([class_name], axis=1)

    # Under-sample only when one class holds at least 60% of the rows.
    if majority_share < 0.6:
        y_all = y_all.values
    else:
        print("Rebalancing data")
        sampler = RandomUnderSampler(random_state=42)
        X_all, y_all = sampler.fit_resample(X_all, y_all)
        print("Y:", y_all)

    # Scale every feature into [0, 1].
    scaler = preprocessing.MinMaxScaler((0, 1))
    X_all = scaler.fit_transform(X_all)

    # Stratified train / test split.
    x_train, x_test, y_train, y_test = train_test_split(
        X_all,
        y_all,
        test_size=test_rate,
        stratify=y_all,
        random_state=0,
    )
    return x_train, x_test, y_train, y_test
directory = get_working_directories('pipeline/vegetation-binary', ['data', 'params', 'results', 'model'])

""" Read Data """
cols, rows, bands, data = read_binary('data/update-2020-09/stack_v2.bin', to_string=False)
# The first 11 columns of `data` are used as features.
X = data[:, :11]
# 80% of the samples are held out; only 20% are used for training.
test_size = .8

# Binary target: OR together every column in 11..23 flagged in is_vegetation.
y = np.zeros((cols * rows), dtype=int)
for x in range(11, 24):
    if is_vegetation[x]:
        y = data[:, x].astype(int) | y

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size)

# Balance the training classes, then fit the scaler on the balanced subset.
rus = RandomUnderSampler(sampling_strategy=1)
X_train_sub, y_train_sub = rus.fit_resample(X_train, y_train)
scaler = StandardScaler().fit(X_train_sub)
X_train_sub = scaler.transform(X_train_sub)

# KMeans is an intermediate pipeline step, so the random forest is fitted on
# its transformed output rather than on the scaled features directly.
pipeline = Pipeline([('kmeans', KMeans()), ('rf', RandomForestClassifier(n_jobs=-1))])
param_grid = dict(kmeans__n_clusters=range(1, 101, 25),
                  rf__max_depth=[3, 8],
                  rf__max_features=[0.1, 0.5],
                  rf__n_estimators=[50, 500])
grid_clf = GridSearchCV(pipeline, param_grid, cv=3, verbose=0, n_jobs=-1)

""" Fit, Predict, and Display Results """
grid_clf.fit(X_train_sub, y_train_sub)
# Scale the full data set with the scaler fitted on the training subset.
X_scaled = scaler.transform(X)
print(__doc__)

# Generate the dataset
X, y = make_classification(n_classes=2, class_sep=2, weights=[0.1, 0.9],
                           n_informative=3, n_redundant=1, flip_y=0,
                           n_features=20, n_clusters_per_class=1,
                           n_samples=200, random_state=10)

# Instantiate a PCA object for the sake of easy visualisation
pca = PCA(n_components=2)
# Fit and transform X to visualise inside a 2D feature space
X_vis = pca.fit_transform(X)

# Apply the random under-sampling.
# The `return_indices` constructor argument was removed from imbalanced-learn;
# the indices of the kept samples are read from `sample_indices_` after
# fitting instead.
rus = RandomUnderSampler()
X_resampled, y_resampled = rus.fit_resample(X, y)
idx_resampled = rus.sample_indices_
X_res_vis = pca.transform(X_resampled)

fig = plt.figure()
ax = fig.add_subplot(1, 1, 1)

# Samples of the original set that the sampler did not keep.
idx_samples_removed = np.setdiff1d(np.arange(X_vis.shape[0]),
                                   idx_resampled)

idx_class_0 = y_resampled == 0
plt.scatter(X_res_vis[idx_class_0, 0], X_res_vis[idx_class_0, 1],
            alpha=.8, label='Class #0')
plt.scatter(X_res_vis[~idx_class_0, 0], X_res_vis[~idx_class_0, 1],
            alpha=.8, label='Class #1')
plt.scatter(X_vis[idx_samples_removed, 0], X_vis[idx_samples_removed, 1],
            alpha=.8, label='Removed samples')
DATASET_PATH = Path('data/preprocessed_data/dataset_10k')

print('Loading data')
# Column 0 of each JSON file holds the values; both are converted to lists.
X = pd.read_json(DATASET_PATH.joinpath(SENTENCE_LIST))[0].tolist()
y = pd.read_json(DATASET_PATH.joinpath(LABEL_LIST))[0].tolist()

print('Splitting data')
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=22)

# Only the training part is under-sampled. The inputs are reshaped to a single
# column because the sampler is given a 2-D array, then squeezed back to 1-D.
# NOTE(review): the split is seeded but the sampler is not, so the resampled
# training set changes between runs - confirm this is intended.
rus = RandomUnderSampler()
X_train, y_train = rus.fit_resample(
    np.asarray(X_train).reshape(-1, 1), y_train)
X_train = X_train.squeeze()
y_train = y_train.squeeze()

print('Transforming input data')
# Transform train data
from sklearn.feature_extraction.text import CountVectorizer
# Unigrams and bigrams that occur in at least 10 training documents.
count_vect = CountVectorizer(min_df=10, ngram_range=(1, 2))
# The vocabulary is learned on the training data and reused for the test data.
X_train_counts = count_vect.fit_transform(X_train)
X_test_counts = count_vect.transform(X_test)

# from sklearn.feature_extraction.text import TfidfTransformer
# tfidf_transformer = TfidfTransformer(use_idf=True)
#
# result = hdd x = result.iloc[:, :-1].values y = result.iloc[:, -1].values from imblearn.under_sampling import RandomUnderSampler rus = RandomUnderSampler() del hdd del hdd_extra del hdd_merged del result gc.collect() X_resampled, y_resampled = rus.fit_resample(x, y) print(Counter(y_resampled)) # X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.1, random_state=42) clf = RandomForestClassifier() clf.fit(X_resampled, y_resampled) features = [1, 4, 5, 7, 9, 12, 188, 193, 194, 197, 198, 199] columns_specified = [] for feature in features: columns_specified += ["smart_{0}_raw".format(feature)] stripe_size = 50 thresholds = np.arange(start=0, stop=.505, step=.005) output_file = open(