def OverSample(X, y):
    """Oversample ``(X, y)`` with Geometric SMOTE and report class counts.

    The class distribution of the labels is printed before and after
    resampling.

    :param X: feature matrix handed to ``GeometricSMOTE.fit_resample``.
    :param y: target labels aligned with ``X``.
    :return: tuple ``(X_res, y_res)`` exactly as returned by ``fit_resample``.
    """
    counts_before = Counter(y)
    print('Original dataset shape %s' % counts_before)

    # random_state is pinned to 1, as in every other sampler in this file.
    sampler = GeometricSMOTE(random_state=1)
    X_res, y_res = sampler.fit_resample(X, y)

    counts_after = Counter(y_res)
    print('Resampled dataset shape %s' % counts_after)
    return X_res, y_res
class MeanClassifier(BaseEstimator, ClassifierMixin):
    """GSOM classifier trained on Geometric-SMOTE oversampled data.

    The wrapped ``GSOM`` is rebuilt from the current hyper-parameter
    attributes at the start of every ``fit`` call.  Hyper-parameter search
    utilities clone the estimator and then apply each candidate through
    ``set_params``, which only rewrites the attributes; a model built once in
    ``__init__`` would therefore silently keep the constructor values.
    """

    def __init__(self, smooth_iteration=25, training_iteration=50,
                 spreading_factor=0.83, FD=0.1, learning_rate=0.3,
                 smooth_learning_factor=0.8):
        """Store the hyper-parameters and create the wrapped models.

        :param smooth_iteration: forwarded to ``GSOM.fit`` as its third argument.
        :param training_iteration: forwarded to ``GSOM.fit`` as its second argument.
        :param spreading_factor: first positional argument of ``GSOM``.
        :param FD: ``FD`` keyword of ``GSOM``.
        :param learning_rate: ``learning_rate`` keyword of ``GSOM``.
        :param smooth_learning_factor: ``smooth_learning_factor`` keyword of ``GSOM``.
        """
        self.smooth_iteration = smooth_iteration
        self.spreading_factor = spreading_factor
        self.training_iteration = training_iteration
        self.FD = FD
        self.learning_rate = learning_rate
        self.smooth_learning_factor = smooth_learning_factor
        # Built here as well so ``self.gsom`` exists before the first fit,
        # exactly as it did previously.
        self.gsom = self._build_gsom()
        # The sampler does not depend on any tunable attribute of this class.
        self.gsmote = GeometricSMOTE(random_state=1, truncation_factor=1.0,
                                     deformation_factor=0, k_neighbors=5,
                                     sampling_rate=0.3)

    def _build_gsom(self):
        """Return a fresh ``GSOM`` configured from the current attributes."""
        return GSOM(self.spreading_factor, 55, max_radius=4, FD=self.FD,
                    learning_rate=self.learning_rate,
                    smooth_learning_factor=self.smooth_learning_factor)

    def fit(self, X, y):
        """Oversample the training data, then train and label the GSOM.

        :param X: training feature matrix.
        :param y: training labels aligned with ``X``.
        :return: ``self``.
        """
        # Pick up any hyper-parameters changed through set_params since
        # construction, and start every fit from an untrained map.
        self.gsom = self._build_gsom()

        X_train, y_train = self.gsmote.fit_resample(X, y)

        # Build a two-column frame ("Name", "label") that carries the label in
        # both columns.  NOTE: stacking the string header row onto the values
        # coerces the whole array to a string dtype, so the frame holds the
        # labels as strings; this is kept as-is because the GSOM labelling
        # step has always received them in that form.
        paired = np.column_stack([np.copy(y_train), y_train])
        header = ["Name", "label"]
        table = np.vstack((header, paired))
        frame = pd.DataFrame(table[1:, :], columns=table[0, :])

        self.gsom.fit(X_train, self.training_iteration, self.smooth_iteration)
        self.gsom.labelling_gsom(X_train, frame, "Name", "label")
        self.gsom.finalize_gsom_label()
        return self

    def predict(self, X):
        """Return ``GSOM.predict_values(X)`` for the fitted map.

        :param X: feature matrix to classify.
        """
        return self.gsom.predict_values(X)
class MeanClassifier(BaseEstimator, ClassifierMixin):
    """Gradient boosting classifier trained on Geometric-SMOTE oversampled data.

    Both wrapped objects are rebuilt from the current hyper-parameter
    attributes at the start of every ``fit`` call.  Hyper-parameter search
    utilities clone the estimator and then apply each candidate through
    ``set_params``, which only rewrites the attributes; models built once in
    ``__init__`` would therefore keep the constructor defaults and every
    candidate would train the same model.
    """

    def __init__(self, truncation_factor=1.0, deformation_factor=0.0,
                 k_neighbors=1, sampling_rate=0.3, n_estimators=100,
                 learning_rate=0.01, max_depth=3):
        """Store the hyper-parameters and create the wrapped models.

        :param truncation_factor: forwarded to ``GeometricSMOTE``.
        :param deformation_factor: forwarded to ``GeometricSMOTE``.
        :param k_neighbors: forwarded to ``GeometricSMOTE``.
        :param sampling_rate: forwarded to ``GeometricSMOTE``.
        :param n_estimators: forwarded to ``GradientBoostingClassifier``.
        :param learning_rate: forwarded to ``GradientBoostingClassifier``.
        :param max_depth: forwarded to ``GradientBoostingClassifier``.
        """
        self.truncation_factor = truncation_factor
        self.deformation_factor = deformation_factor
        self.k_neighbors = k_neighbors
        self.sampling_rate = sampling_rate
        self.n_estimators = n_estimators
        self.learning_rate = learning_rate
        self.max_depth = max_depth
        # Built here as well so ``regressor`` / ``gsmote`` exist before the
        # first fit, exactly as they did previously.
        self._build_models()

    def _build_models(self):
        """(Re)create the classifier and the sampler from the current attributes."""
        self.regressor = GradientBoostingClassifier(
            n_estimators=self.n_estimators,
            learning_rate=self.learning_rate,
            max_depth=self.max_depth)
        self.gsmote = GeometricSMOTE(
            random_state=1,
            truncation_factor=self.truncation_factor,
            deformation_factor=self.deformation_factor,
            k_neighbors=self.k_neighbors,
            sampling_rate=self.sampling_rate)

    def fit(self, X, y):
        """Oversample the training data and fit the gradient boosting model.

        The active hyper-parameters are printed first.

        :param X: training feature matrix.
        :param y: training labels aligned with ``X``.
        :return: ``self``.
        """
        # Pick up any hyper-parameters changed through set_params since
        # construction, so the models match the values printed below.
        self._build_models()
        print(self.max_depth, self.learning_rate, self.n_estimators,
              self.sampling_rate, self.k_neighbors, self.deformation_factor,
              self.truncation_factor)
        X_train, y_train = self.gsmote.fit_resample(X, y)
        self.regressor.fit(X_train, y_train)
        return self

    def predict(self, y):
        """Return the wrapped classifier's predictions.

        :param y: the samples to classify; the value is passed straight to
            ``regressor.predict``.  The parameter name is kept for backward
            compatibility with keyword callers.
        """
        return self.regressor.predict(y)
'learning_rate': [0.01], 'max_depth': [3] }] gs = GridSearchCV(MeanClassifier(), parameters) gs.fit(X, y) params = gs.best_params_ print(params) #find performance X_t, X_test, y_t, y_test = train_test_split(X, y, test_size=0.2, random_state=0) gsmote = GeometricSMOTE(random_state=1, truncation_factor=params["truncation_factor"], deformation_factor=params["deformation_factor"], k_neighbors=params["k_neighbors"], sampling_rate=params["sampling_rate"]) X_train, y_train = gsmote.fit_resample(X_t, y_t) # Fitting Gradient boosting gbc = GradientBoostingClassifier(n_estimators=params["n_estimators"], learning_rate=params["learning_rate"], max_depth=params["max_depth"]) gbc.fit(X_train, y_train) # Predicting the Test set results y_predict = gbc.predict(X_test) y_pred = np.where(y_predict.astype(int) > 0.5, 1, 0) evaluate("Gradient Boosting", y_test, y_pred)
def runSMOTEvariationsGen(self, folder):
    """Write an oversampled copy of every training fold for each SMOTE variant.

    For each dataset and each of the 5 folds, ``<dataset>_train.csv`` is read
    from ``<folder>/<dataset>/<fold>/``, the last column is taken as the
    target, and one header-less CSV per sampler is written next to it:
    ``_SMOTE``, ``_Borderline1``, ``_Borderline2``, ``_smoteSVM`` and
    ``_Geometric_SMOTE``.

    NOTE(review): ``datasets`` is not a parameter; it is looked up from the
    enclosing scope and is expected to be an iterable of dataset names.

    :param folder: root directory containing the cross-validation folds.
    :return: None
    """
    # (console label, output file suffix, sampler), in the order written.
    samplers = [
        ("SMOTE", "_SMOTE.csv", SMOTE()),
        ("Borderline1", "_Borderline1.csv", BorderlineSMOTE(kind='borderline-1')),
        ("Borderline2", "_Borderline2.csv", BorderlineSMOTE(kind='borderline-2')),
        ("SMOTE SVM", "_smoteSVM.csv", SVMSMOTE()),
        ("GEOMETRIC SMOTE", "_Geometric_SMOTE.csv", GeometricSMOTE(n_jobs=-1)),
    ]

    for dataset in datasets:  # biclass and multiclass
        for fold in range(5):
            fold_dir = os.path.join(folder, dataset, str(fold))
            train = np.genfromtxt(
                os.path.join(fold_dir, ''.join([dataset, "_train.csv"])),
                delimiter=',')
            # Every column but the last is a feature; the last is the target.
            X = train[:, 0:train.shape[1] - 1]
            Y = train[:, train.shape[1] - 1]

            for label, suffix, sampler in samplers:
                print(label + "..." + dataset)
                # fit_resample is the supported sampler entry point; the old
                # fit_sample alias no longer exists in current imbalanced-learn.
                X_res, y_res = sampler.fit_resample(X, Y)
                y_res = y_res.reshape(len(y_res), 1)
                newtrain = pd.DataFrame(np.hstack([X_res, y_res]))
                newtrain.to_csv(
                    os.path.join(fold_dir, ''.join([dataset, suffix])),
                    header=False, index=False)
def parse_input_zoo_data(filename, header='infer'):
    """Load a dataset, hold out a test split and oversample the training part.

    The file is read through ``pp.preProcess``, 20% of the rows are held out
    with ``train_test_split`` (``random_state=0``), and only the remaining
    training rows are resampled with Geometric SMOTE.

    :param filename: path handed to ``pp.preProcess``.
    :param header: accepted for interface compatibility; not used by the body.
    :return: tuple ``(input_database, labels, classes, X_test, y_test)`` where
        ``input_database`` is ``{0: <resampled training features>}`` and
        ``labels`` / ``classes`` are two separate lists of the resampled
        training targets.
    """
    sampler = GeometricSMOTE(random_state=1)

    features, targets = pp.preProcess(filename)
    X_t, X_test, y_t, y_test = train_test_split(
        features, targets, test_size=0.2, random_state=0)

    # Only the training split is oversampled; the held-out rows stay untouched.
    X_train, y_train = sampler.fit_resample(X_t, y_t)

    # Two independent lists, so callers may mutate one without the other.
    classes = y_train.tolist()
    labels = y_train.tolist()
    input_database = {0: X_train}

    return input_database, labels, classes, X_test, y_test