Esempio n. 1
0
def OverSample(X, y):
    """Oversample ``(X, y)`` with Geometric SMOTE and report class counts.

    The class distribution (a ``Counter`` of the labels) is printed before
    and after resampling.

    :param X: feature matrix handed to ``GeometricSMOTE.fit_resample``.
    :param y: target labels handed over together with ``X``.
    :return: the resampled ``(X_res, y_res)`` pair.
    """
    print('Original dataset shape {}'.format(Counter(y)))

    # Fixed seed so repeated calls generate the same synthetic samples.
    sampler = GeometricSMOTE(random_state=1)
    X_res, y_res = sampler.fit_resample(X, y)

    print('Resampled dataset shape {}'.format(Counter(y_res)))
    return X_res, y_res
Esempio n. 2
0
class MeanClassifier(BaseEstimator, ClassifierMixin):
    """Classifier chaining Geometric SMOTE oversampling with a GSOM.

    ``fit`` oversamples the training data with ``GeometricSMOTE`` and then
    trains and labels a ``GSOM`` on the resampled data; ``predict`` delegates
    to ``GSOM.predict_values``.

    :param smooth_iteration: third argument of ``GSOM.fit``.
    :param training_iteration: second argument of ``GSOM.fit``.
    :param spreading_factor: first positional argument of ``GSOM``.
    :param FD: forwarded to ``GSOM`` as ``FD``.
    :param learning_rate: forwarded to ``GSOM`` as ``learning_rate``.
    :param smooth_learning_factor: forwarded to ``GSOM`` as
        ``smooth_learning_factor``.
    """
    def __init__(self,
                 smooth_iteration=25,
                 training_iteration=50,
                 spreading_factor=0.83,
                 FD=0.1,
                 learning_rate=0.3,
                 smooth_learning_factor=0.8):
        """
        Called when initializing the classifier
        """
        self.smooth_iteration = smooth_iteration
        self.spreading_factor = spreading_factor
        self.training_iteration = training_iteration
        self.FD = FD
        self.learning_rate = learning_rate
        self.smooth_learning_factor = smooth_learning_factor
        # Kept so ``gsom`` / ``gsmote`` exist right after construction, as
        # before; ``fit`` rebuilds them from the then-current parameters.
        self._build_components()

    def _build_components(self):
        """(Re)create the GSOM and the oversampler from the current params."""
        # NOTE(review): 55 and max_radius=4 are hard-coded; 55 presumably is
        # the input dimensionality expected by GSOM -- confirm.
        self.gsom = GSOM(self.spreading_factor,
                         55,
                         max_radius=4,
                         FD=self.FD,
                         learning_rate=self.learning_rate,
                         smooth_learning_factor=self.smooth_learning_factor)
        self.gsmote = GeometricSMOTE(random_state=1,
                                     truncation_factor=1.0,
                                     deformation_factor=0,
                                     k_neighbors=5,
                                     sampling_rate=0.3)

    def fit(self, X, y):
        """Oversample ``(X, y)``, then train and label the GSOM.

        :param X: training features.
        :param y: training labels.
        :return: ``self``.
        """
        # Rebuild here: parameters changed through ``set_params()`` after
        # construction (as a grid search does) would otherwise never reach
        # the GSOM, and a second ``fit`` would reuse the same GSOM object.
        self._build_components()

        X_train, y_train = self.gsmote.fit_resample(X, y)

        # Two-column table ("Name", "label"), both holding the resampled
        # labels. Stacking the string header on top coerces every value to a
        # string before the header row is split off again; that round trip is
        # kept because the GSOM labelling may depend on string labels.
        label_pairs = np.column_stack([np.copy(y_train), y_train])
        table = np.vstack((["Name", "label"], label_pairs))
        frame = pd.DataFrame(table[1:, :], columns=table[0, :])

        self.gsom.fit(X_train, self.training_iteration, self.smooth_iteration)
        self.gsom.labelling_gsom(X_train, frame, "Name", "label")
        self.gsom.finalize_gsom_label()
        return self

    def predict(self, X):
        """Return ``GSOM.predict_values(X)`` from the fitted map."""
        return self.gsom.predict_values(X)
Esempio n. 3
0
class MeanClassifier(BaseEstimator, ClassifierMixin):
    """Classifier chaining Geometric SMOTE with gradient boosting.

    ``fit`` oversamples the training data with ``GeometricSMOTE`` and trains
    a ``GradientBoostingClassifier`` on the result; ``predict`` delegates to
    that classifier.

    :param truncation_factor: forwarded to ``GeometricSMOTE``.
    :param deformation_factor: forwarded to ``GeometricSMOTE``.
    :param k_neighbors: forwarded to ``GeometricSMOTE``.
    :param sampling_rate: forwarded to ``GeometricSMOTE``.
    :param n_estimators: forwarded to ``GradientBoostingClassifier``.
    :param learning_rate: forwarded to ``GradientBoostingClassifier``.
    :param max_depth: forwarded to ``GradientBoostingClassifier``.
    """
    def __init__(self,
                 truncation_factor=1.0,
                 deformation_factor=0.0,
                 k_neighbors=1,
                 sampling_rate=0.3,
                 n_estimators=100,
                 learning_rate=0.01,
                 max_depth=3):
        """
        Called when initializing the classifier
        """
        self.truncation_factor = truncation_factor
        self.deformation_factor = deformation_factor
        self.k_neighbors = k_neighbors
        self.sampling_rate = sampling_rate
        self.n_estimators = n_estimators
        self.learning_rate = learning_rate
        self.max_depth = max_depth
        # Kept so ``regressor`` / ``gsmote`` exist right after construction,
        # as before; ``fit`` rebuilds them from the then-current parameters.
        self._build_components()

    def _build_components(self):
        """(Re)create the booster and the oversampler from current params."""
        self.regressor = GradientBoostingClassifier(
            n_estimators=self.n_estimators,
            learning_rate=self.learning_rate,
            max_depth=self.max_depth)
        self.gsmote = GeometricSMOTE(
            random_state=1,
            truncation_factor=self.truncation_factor,
            deformation_factor=self.deformation_factor,
            k_neighbors=self.k_neighbors,
            sampling_rate=self.sampling_rate)

    def fit(self, X, y):
        """Oversample ``(X, y)`` and train the gradient boosting model.

        Prints the hyper-parameters in use before training.

        :param X: training features.
        :param y: training labels.
        :return: ``self``.
        """
        # Rebuild here: parameters changed through ``set_params()`` after
        # construction (as a grid search does) only update the attributes
        # above, so without this the printed values and the values actually
        # used by the sub-estimators would disagree.
        self._build_components()

        print(self.max_depth, self.learning_rate, self.n_estimators,
              self.sampling_rate, self.k_neighbors, self.deformation_factor,
              self.truncation_factor)
        X_train, y_train = self.gsmote.fit_resample(X, y)
        self.regressor.fit(X_train, y_train)
        return self

    def predict(self, y):
        """Predict class labels with the fitted gradient boosting model.

        :param y: the samples to classify (a feature matrix despite the
            name, which is kept for backward compatibility).
        :return: the result of ``GradientBoostingClassifier.predict``.
        """
        return self.regressor.predict(y)
Esempio n. 4
0
    'learning_rate': [0.01],
    'max_depth': [3]
}]
# Search the hyper-parameter grid for the combined oversampler + classifier.
# NOTE(review): the search is fitted on the full (X, y), which includes the
# rows later held out as the test split below -- confirm this is intended.
gs = GridSearchCV(MeanClassifier(), parameters)
gs.fit(X, y)

# Best parameter combination found by the search.
params = gs.best_params_
print(params)

# Measure performance on a held-out 20% split using the best parameters.
X_t, X_test, y_t, y_test = train_test_split(X,
                                            y,
                                            test_size=0.2,
                                            random_state=0)
# Oversample the training portion only; the test split is left untouched.
gsmote = GeometricSMOTE(random_state=1,
                        truncation_factor=params["truncation_factor"],
                        deformation_factor=params["deformation_factor"],
                        k_neighbors=params["k_neighbors"],
                        sampling_rate=params["sampling_rate"])
X_train, y_train = gsmote.fit_resample(X_t, y_t)
# Fit gradient boosting on the oversampled training data.
gbc = GradientBoostingClassifier(n_estimators=params["n_estimators"],
                                 learning_rate=params["learning_rate"],
                                 max_depth=params["max_depth"])
gbc.fit(X_train, y_train)

# Predict the test set results.
y_predict = gbc.predict(X_test)
# Map predictions to 0/1: entries whose integer cast exceeds 0.5 become 1.
y_pred = np.where(y_predict.astype(int) > 0.5, 1, 0)

evaluate("Gradient Boosting", y_test, y_pred)
Esempio n. 5
0
    def runSMOTEvariationsGen(self, folder):
        """
        Write an oversampled copy of every training fold per SMOTE variant.

        For each dataset and each of the 5 folds, reads
        ``<folder>/<dataset>/<fold>/<dataset>_train.csv`` (comma separated,
        last column is the target), resamples it with SMOTE, Borderline-1,
        Borderline-2, SVM-SMOTE and Geometric SMOTE, and writes each result
        next to the input as ``<dataset><suffix>`` without header or index.

        NOTE(review): ``datasets`` is not a parameter of this method; it is
        resolved from an enclosing scope.

        :param folder:   cross-validation folders.
        :return: None
        """
        # (progress label, output file suffix, sampler) in processing order.
        samplers = [
            ("SMOTE", "_SMOTE.csv", SMOTE()),
            ("Borderline1", "_Borderline1.csv",
             BorderlineSMOTE(kind='borderline-1')),
            ("Borderline2", "_Borderline2.csv",
             BorderlineSMOTE(kind='borderline-2')),
            ("SMOTE SVM", "_smoteSVM.csv", SVMSMOTE()),
            ("GEOMETRIC SMOTE", "_Geometric_SMOTE.csv",
             GeometricSMOTE(n_jobs=-1)),
        ]

        for dataset in datasets:  # binary and multiclass
            for fold in range(5):
                fold_dir = os.path.join(folder, dataset, str(fold))
                train = np.genfromtxt(
                    os.path.join(fold_dir, dataset + "_train.csv"),
                    delimiter=',')
                X = train[:, :-1]
                Y = train[:, -1]

                for label, suffix, sampler in samplers:
                    print(label + "..." + dataset)
                    # ``fit_resample`` for every sampler: the original mixed
                    # it with the older ``fit_sample`` spelling, which newer
                    # imbalanced-learn releases no longer provide.
                    X_res, y_res = sampler.fit_resample(X, Y)
                    # Re-attach the target as the last column.
                    newdata = np.hstack(
                        [X_res, y_res.reshape(len(y_res), 1)])
                    pd.DataFrame(newdata).to_csv(
                        os.path.join(fold_dir, dataset + suffix),
                        header=False,
                        index=False)
Esempio n. 6
0
    def parse_input_zoo_data(filename, header='infer'):
        """Load a dataset, hold out a test split and oversample the rest.

        The file is loaded with ``pp.preProcess``, split 80/20 with
        ``train_test_split`` (``random_state=0``), and only the training part
        is oversampled with ``GeometricSMOTE(random_state=1)``.

        :param filename: passed to ``pp.preProcess``.
        :param header: accepted but not used by this function.
        :return: ``(input_database, labels, classes, X_test, y_test)`` where
            ``input_database`` is ``{0: <oversampled training features>}``
            and ``labels`` / ``classes`` are two separate list copies of the
            oversampled training targets.
        """
        oversampler = GeometricSMOTE(random_state=1)

        features, targets = pp.preProcess(filename)
        X_t, X_test, y_t, y_test = train_test_split(
            features, targets, test_size=0.2, random_state=0)

        # Only the training portion is resampled; the test split stays as-is.
        X_train, y_train = oversampler.fit_resample(X_t, y_t)

        classes = y_train.tolist()
        labels = y_train.tolist()
        input_database = {0: X_train}

        return input_database, labels, classes, X_test, y_test