Example #1
def test_sample_borderline1():
    """Test sample function with borderline 1 SMOTE."""

    # Create the object
    kind = 'borderline1'
    smote = SMOTE(random_state=RND_SEED, kind=kind)
    # Fit the data
    smote.fit(X, Y)

    X_resampled, y_resampled = smote.fit_sample(X, Y)

    X_gt = np.array([[0.11622591, -0.0317206], [0.77481731, 0.60935141],
                     [1.25192108, -0.22367336], [0.53366841, -0.30312976],
                     [1.52091956, -0.49283504], [-0.28162401, -2.10400981],
                     [0.83680821, 1.72827342], [0.3084254, 0.33299982],
                     [0.70472253, -0.73309052], [0.28893132, -0.38761769],
                     [1.15514042, 0.0129463], [0.88407872, 0.35454207],
                     [1.31301027, -0.92648734], [-1.11515198, -0.93689695],
                     [-0.18410027, -0.45194484], [0.9281014, 0.53085498],
                     [-0.14374509, 0.27370049], [-0.41635887, -0.38299653],
                     [0.08711622, 0.93259929], [1.70580611, -0.11219234],
                     [0.3765279, -0.2009615], [0.55276636, -0.10550373],
                     [0.45413452, -0.08883319], [1.21118683, -0.22817957]])
    y_gt = np.array([
        0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0
    ])
    assert_allclose(X_resampled, X_gt, rtol=R_TOL)
    assert_array_equal(y_resampled, y_gt)
Example #2
def test_sample_svm():
    """Test sample function with SVM SMOTE."""

    # Create the object
    kind = 'svm'
    smote = SMOTE(random_state=RND_SEED, kind=kind)
    # Fit the data
    smote.fit(X, Y)

    X_resampled, y_resampled = smote.fit_sample(X, Y)

    X_gt = np.array([[0.11622591, -0.0317206], [0.77481731, 0.60935141],
                     [1.25192108, -0.22367336], [0.53366841, -0.30312976],
                     [1.52091956, -0.49283504], [-0.28162401, -2.10400981],
                     [0.83680821, 1.72827342], [0.3084254, 0.33299982],
                     [0.70472253, -0.73309052], [0.28893132, -0.38761769],
                     [1.15514042, 0.0129463], [0.88407872, 0.35454207],
                     [1.31301027, -0.92648734], [-1.11515198, -0.93689695],
                     [-0.18410027, -0.45194484], [0.9281014, 0.53085498],
                     [-0.14374509, 0.27370049], [-0.41635887, -0.38299653],
                     [0.08711622, 0.93259929], [1.70580611, -0.11219234],
                     [0.47436888, -0.2645749], [1.07844561, -0.19435291],
                     [1.44015515, -1.30621303]])
    y_gt = np.array(
        [0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0])
    assert_array_almost_equal(X_resampled, X_gt)
    assert_array_equal(y_resampled, y_gt)
Example #4
def test_sample_regular_half():
    """Test sample function with regular SMOTE and a ratio of 0.5."""

    # Create the object
    ratio = 0.8
    kind = 'regular'
    smote = SMOTE(ratio=ratio, random_state=RND_SEED, kind=kind)
    # Fit the data
    smote.fit(X, Y)

    X_resampled, y_resampled = smote.fit_sample(X, Y)

    X_gt = np.array([[0.11622591, -0.0317206], [0.77481731, 0.60935141],
                     [1.25192108, -0.22367336], [0.53366841, -0.30312976],
                     [1.52091956, -0.49283504], [-0.28162401, -2.10400981],
                     [0.83680821, 1.72827342], [0.3084254, 0.33299982],
                     [0.70472253, -0.73309052], [0.28893132, -0.38761769],
                     [1.15514042, 0.0129463], [0.88407872, 0.35454207],
                     [1.31301027, -0.92648734], [-1.11515198, -0.93689695],
                     [-0.18410027, -0.45194484], [0.9281014, 0.53085498],
                     [-0.14374509, 0.27370049], [-0.41635887, -0.38299653],
                     [0.08711622, 0.93259929], [1.70580611, -0.11219234],
                     [0.36784496, -0.1953161]])
    y_gt = np.array(
        [0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0])
    assert_allclose(X_resampled, X_gt, rtol=R_TOL)
    assert_array_equal(y_resampled, y_gt)
Example #5
def test_sample_wrong_X():
    """Test either if an error is raised when X is different at fitting
    and sampling"""

    # Create the object
    sm = SMOTE(random_state=RND_SEED)
    sm.fit(X, Y)
    assert_raises(RuntimeError, sm.sample,
                  np.random.random((100, 40)), np.array([0] * 50 + [1] * 50))
Example #6
def test_sample_wrong_X():
    """Test either if an error is raised when X is different at fitting
    and sampling"""

    # Create the object
    sm = SMOTE(random_state=RND_SEED)
    sm.fit(X, Y)
    assert_raises(RuntimeError, sm.sample, np.random.random((100, 40)),
                  np.array([0] * 50 + [1] * 50))
Example #7
def test_smote_fit():
    """Test the fitting method"""

    # Create the object
    smote = SMOTE(random_state=RND_SEED)
    # Fit the data
    smote.fit(X, Y)

    # Check that the class statistics have been computed
    assert_equal(smote.min_c_, 0)
    assert_equal(smote.maj_c_, 1)
    assert_equal(smote.stats_c_[0], 8)
    assert_equal(smote.stats_c_[1], 12)
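The tests above depend on a module-level fixture that this page does not show. It can be reconstructed from the ground-truth arrays: the first 20 rows of X_gt are the original X and the first 20 entries of y_gt are Y, giving 8 samples of class 0 and 12 of class 1, which matches the stats_c_ assertions. A minimal sketch:

import numpy as np
from imblearn.over_sampling import SMOTE

RND_SEED = 0
R_TOL = 1e-4
X = np.array([[0.11622591, -0.0317206], [0.77481731, 0.60935141],
              [1.25192108, -0.22367336], [0.53366841, -0.30312976],
              [1.52091956, -0.49283504], [-0.28162401, -2.10400981],
              [0.83680821, 1.72827342], [0.3084254, 0.33299982],
              [0.70472253, -0.73309052], [0.28893132, -0.38761769],
              [1.15514042, 0.0129463], [0.88407872, 0.35454207],
              [1.31301027, -0.92648734], [-1.11515198, -0.93689695],
              [-0.18410027, -0.45194484], [0.9281014, 0.53085498],
              [-0.14374509, 0.27370049], [-0.41635887, -0.38299653],
              [0.08711622, 0.93259929], [1.70580611, -0.11219234]])
Y = np.array([0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0])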
Example #9
def age_prediction(df, classifier='lr'):
    df.dropna(subset=['age'], inplace=True)

    # Data to use
    X = df['text_p']  # age features extraction
    y = df['age']

    # Results without oversampling and only cv - F1 macro
    # NB: 0.64, LR: 0.66, RF: 0.54

    # using synthetic oversampling technique
    smote = SMOTE('minority')

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.3, random_state=33)

    X_sm, y_sm = smote.fit_sample(X_train, y_train)  # fit() only returns the estimator; fit_sample() returns the resampled data

    if classifier == 'nb':
        nb(X_sm, X_test, y_sm, y_test)
    elif classifier == 'lr':
        lr(X_sm, X_test, y_sm, y_test)
    elif classifier == 'rf':
        rf(X_sm, X_test, y_sm, y_test)
    else:
        sgd(X_sm, X_test, y_sm, y_test)
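Note that SMOTE interpolates in a numeric feature space, so the raw text in df['text_p'] must be vectorized before resampling. A minimal sketch under that assumption (TfidfVectorizer and the *_vec names are illustrative, not part of the original):

from sklearn.feature_extraction.text import TfidfVectorizer

vec = TfidfVectorizer()
X_train_vec = vec.fit_transform(X_train)  # learn the vocabulary on the training text only
X_test_vec = vec.transform(X_test)        # reuse it for the test split
X_sm, y_sm = smote.fit_sample(X_train_vec, y_train)  # oversample the numeric features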
Example #10
def split_data(self, data, seed, re=False):
    X, y = data.iloc[:, 1:-1], data.iloc[:, -1]
    # Train-Test split
    test_size = 0.2
    X_train_o, X_test, y_train_o, y_test = model_selection.train_test_split(
        X, y, test_size=test_size, random_state=seed)
    # Resampling
    if re:
        resam = SMOTE(random_state=seed)
        X_train, y_train = resam.fit_resample(X_train_o, y_train_o)
        X_train = pd.DataFrame(X_train, columns=X_train_o.columns)
        y_train = pd.Series(y_train)
    else:
        X_train, y_train = X_train_o, y_train_o
    return X, y, X_train, y_train, X_test, y_test
Example #11
def test_sample_regular():
    """Test sample function with regular SMOTE."""

    # Create the object
    kind = 'regular'
    smote = SMOTE(random_state=RND_SEED, kind=kind)
    # Fit the data
    smote.fit(X, Y)

    X_resampled, y_resampled = smote.fit_sample(X, Y)

    currdir = os.path.dirname(os.path.abspath(__file__))
    X_gt = np.load(os.path.join(currdir, 'data', 'smote_reg_x.npy'))
    y_gt = np.load(os.path.join(currdir, 'data', 'smote_reg_y.npy'))
    assert_array_equal(X_resampled, X_gt)
    assert_array_equal(y_resampled, y_gt)
Example #12
def test_sample_svm():
    """Test sample function with SVM SMOTE."""

    # Create the object
    kind = 'svm'
    smote = SMOTE(random_state=RND_SEED, kind=kind)
    # Fit the data
    smote.fit(X, Y)

    X_resampled, y_resampled = smote.fit_sample(X, Y)

    currdir = os.path.dirname(os.path.abspath(__file__))
    X_gt = np.load(os.path.join(currdir, 'data', 'smote_svm_x.npy'))
    y_gt = np.load(os.path.join(currdir, 'data', 'smote_svm_y.npy'))
    assert_array_equal(X_resampled, X_gt)
    assert_array_equal(y_resampled, y_gt)
Example #13
class SMOTER:
    def __init__(self, *args, **kwargs):
        self.smote = SMOTE(*args, **kwargs)
        self.params = dict()
        for key, value in kwargs.items():
            self.params[key] = value

    def fit(self, X, y):
        self.smote.fit(X, y)
        return self

    def transform(self, X, y=None):
        return self.smote.sample(X, y)

    def get_params(self, deep=True):
        return self.params
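A short usage sketch of this wrapper on illustrative data (it assumes an older imbalanced-learn in which SMOTE.sample(X, y) still exists):

import numpy as np

X = np.random.RandomState(0).normal(size=(30, 2))
y = np.array([0] * 10 + [1] * 20)

smoter = SMOTER(random_state=0)
smoter.fit(X, y)                        # fits the wrapped SMOTE
X_res, y_res = smoter.transform(X, y)   # delegates to SMOTE.sample
print(X_res.shape, np.bincount(y_res))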
Example #14
def test_sample_regular_half():
    """Test sample function with regular SMOTE and a ratio of 0.5."""

    # Create the object
    ratio = 0.5
    kind = 'regular'
    smote = SMOTE(ratio=ratio, random_state=RND_SEED, kind=kind)
    # Fit the data
    smote.fit(X, Y)

    X_resampled, y_resampled = smote.fit_sample(X, Y)

    currdir = os.path.dirname(os.path.abspath(__file__))
    X_gt = np.load(os.path.join(currdir, 'data', 'smote_reg_x_05.npy'))
    y_gt = np.load(os.path.join(currdir, 'data', 'smote_reg_y_05.npy'))
    assert_array_equal(X_resampled, X_gt)
    assert_array_equal(y_resampled, y_gt)
Example #15
def split_data(self, data, seed, re=False):
    lbl = preprocessing.LabelEncoder()
    data["class"] = lbl.fit_transform(list(data["class"].values))
    X, y = data.iloc[:, 0:-1], data.iloc[:, -1]
    X = self.OnehotEncode(X, X.select_dtypes("category").columns)
    X.columns = [col.replace("<", "_") for col in X.columns]
    # Train-Test split
    test_size = 0.3
    X_train_o, X_test, y_train_o, y_test = model_selection.train_test_split(
        X, y, test_size=test_size, random_state=seed)
    # Resampling
    if re:
        resam = SMOTE(random_state=seed)
        X_train, y_train = resam.fit_resample(X_train_o, y_train_o)
        X_train = pd.DataFrame(X_train, columns=X_train_o.columns)
        y_train = pd.Series(y_train)
    else:
        X_train, y_train = X_train_o, y_train_o
    return X_train, y_train, X_test, y_test
    plt.legend(loc="lower right")
    plt.show()

## 2(3)
from sklearn.metrics import (confusion_matrix, precision_recall_curve, auc,
                             roc_curve, recall_score, classification_report, f1_score,
                             precision_recall_fscore_support) 
sample_leaf_options = [1, 5, 10, 50, 100, 200, 500]

RF = RandomForestClassifier(min_samples_split=20, random_state=99, max_depth=(len(X_train) - 1))
RF.fit(X_train, y_train)
predict_RF = RF.predict(X_test)
print('accuracy using RF:', accuracy_score(predict_RF, y_test))

sm = svm.SVC(C=5, kernel='rbf', gamma=0.02)
sm.fit(X_train, y_train)
predict_sm = sm.predict(X_test)
print('accuracy using SVM:', accuracy_score(predict_sm, y_test))
### MLP
mlp_clf = MLPClassifier(solver='sgd', alpha=1e-4, hidden_layer_sizes=(10, 3),
                        learning_rate='adaptive', random_state=1, activation='tanh')
mlp_clf.fit(X_train, y_train)
predict_mlp = mlp_clf.predict(X_test)
print('accuracy using NN:', accuracy_score(predict_mlp, y_test))

report = classification_report(y_test, predict_RF)
fpr, tpr, thresholds = roc_curve(y_test, predict_RF)
roc_auc = auc(fpr, tpr)
## MLP model 2
model = Sequential()
model.add(Dense(512, activation='relu', input_shape=(30,)))
model.add(Dropout(0.2))
Example #17
    def _sample(self, X, y):

        # Create the clusters and set the labels
        self._set_cluster()
        self._fit_cluster(X, y)

        self.labels = self._cluster_class.labels_

        X_resampled = X.copy()
        y_resampled = y.copy()

        with catch_warnings():
            filterwarnings("ignore", category=UserWarning, module="imblearn")

            for target_class in self.ratio_:

                n_to_generate = self.ratio_[target_class]

                clusters_to_use = self._filter_clusters(
                    y, self._cluster_class.labels_, target_class)

                # If the target class is not dominant in any cluster, we can only warn and skip it
                if not clusters_to_use and n_to_generate > 0:
                    warn("Class does not have a cluster in which it is dominant.")

                else:
                    sampling_weights = self._calculate_sampling_weights(
                        X, y, clusters_to_use, self.labels, target_class)

                    for cluster in sampling_weights:
                        mask = self.labels == cluster
                        X_cluster = X[mask]
                        y_cluster = y[mask]

                        n_obs = mask.sum()

                        artificial_index = -1

                        # There needs to be at least two unique values of the target variable
                        if np.unique(y_cluster).size < 2:
                            art_x = np.zeros((1, X.shape[1]))
                            artificial_index = n_obs

                            artificial_y = np.unique(y)[
                                np.unique(y) != target_class][0]

                            X_cluster = np.concatenate((X_cluster, art_x),
                                                       axis=0)
                            y_cluster = np.concatenate(
                                (y_cluster, np.asarray(artificial_y).reshape(
                                    (1, ))),
                                axis=0)

                        minority_obs = y_cluster[y_cluster == target_class]

                        n_new = n_to_generate * sampling_weights[cluster]

                        temp_dic = {
                            target_class:
                            int(round(n_new) + minority_obs.size)
                        }

                        # Make sure that k_neighbors is smaller than the number of minority observations in the cluster
                        if self.k_neighbors > minority_obs.size - 1:
                            k_neighbors = minority_obs.size - 1
                        else:
                            k_neighbors = self.k_neighbors

                        over_sampler = SMOTE(ratio=temp_dic,
                                             k_neighbors=k_neighbors)
                        over_sampler.fit(X_cluster, y_cluster)

                        X_cluster_resampled, y_cluster_resampled = over_sampler.sample(
                            X_cluster, y_cluster)

                        # If an artificial observation was added, it must be removed now
                        if artificial_index > 0:
                            X_cluster_resampled = np.delete(
                                X_cluster_resampled, artificial_index, axis=0)
                            y_cluster_resampled = np.delete(
                                y_cluster_resampled, artificial_index)

                        # Save the newly generated samples only
                        X_cluster_resampled = X_cluster_resampled[n_obs:, :]
                        y_cluster_resampled = y_cluster_resampled[n_obs:, ]

                        # Add the newly generated samples to the data to be returned
                        X_resampled = np.concatenate(
                            (X_resampled, X_cluster_resampled))
                        y_resampled = np.concatenate(
                            (y_resampled, y_cluster_resampled))

        return X_resampled, y_resampled
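This cluster-then-oversample scheme is essentially what the KMeansSMOTE sampler in later imbalanced-learn releases implements; a minimal usage sketch on illustrative data (cluster_balance_threshold may need tuning for real data):

import numpy as np
from imblearn.over_sampling import KMeansSMOTE

rng = np.random.RandomState(0)
X = np.vstack([rng.normal(0, 1, (90, 2)),   # majority class around the origin
               rng.normal(4, 1, (10, 2))])  # minority class in its own region
y = np.array([0] * 90 + [1] * 10)

kms = KMeansSMOTE(random_state=0, cluster_balance_threshold=0.1)
X_res, y_res = kms.fit_resample(X, y)  # clusters first, then SMOTE inside minority-dominant clusters
print(np.bincount(y_res))              # both classes end up with 90 samples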
Example #18
class SMOTEBoost(AdaBoostClassifier):
    """Implementation of SMOTEBoost.

    SMOTEBoost introduces data sampling into the AdaBoost algorithm by
    oversampling the minority class using SMOTE on each boosting iteration [1].

    This implementation inherits methods from the scikit-learn 
    AdaBoostClassifier class, only modifying the `fit` method.

    Parameters
    ----------
    n_samples : int, optional (default=100)
        Number of new synthetic samples per boosting step.
    k_neighbors : int, optional (default=5)
        Number of nearest neighbors.
    base_estimator : object, optional (default=DecisionTreeClassifier)
        The base estimator from which the boosted ensemble is built.
        Support for sample weighting is required, as well as proper `classes_`
        and `n_classes_` attributes.
    n_estimators : int, optional (default=50)
        The maximum number of estimators at which boosting is terminated.
        In case of perfect fit, the learning procedure is stopped early.
    learning_rate : float, optional (default=1.)
        Learning rate shrinks the contribution of each classifier by
        ``learning_rate``. There is a trade-off between ``learning_rate`` and
        ``n_estimators``.
    algorithm : {'SAMME', 'SAMME.R'}, optional (default='SAMME.R')
        If 'SAMME.R' then use the SAMME.R real boosting algorithm.
        ``base_estimator`` must support calculation of class probabilities.
        If 'SAMME' then use the SAMME discrete boosting algorithm.
        The SAMME.R algorithm typically converges faster than SAMME,
        achieving a lower test error with fewer boosting iterations.
    random_state : int or None, optional (default=None)
        If int, random_state is the seed used by the random number generator.
        If None, the random number generator is the RandomState instance used
        by np.random.

    References
    ----------
    .. [1] N. V. Chawla, A. Lazarevic, L. O. Hall, and K. W. Bowyer.
           "SMOTEBoost: Improving Prediction of the Minority Class in
           Boosting." European Conference on Principles of Data Mining and
           Knowledge Discovery (PKDD), 2003.
    """

    def __init__(self,
                 n_samples=100,
                 k_neighbors=5,
                 #base_estimator=None,
                 base_estimator = SVC(probability=True, kernel='linear'),
                 n_estimators=50,
                 learning_rate=1.,
                 #algorithm='SAMME.R',
                 algorithm='SAMME',
                 random_state=None):

        self.n_samples = n_samples
        self.algorithm = algorithm
        self.smote = SMOTE(k_neighbors=k_neighbors,
                           random_state=random_state)

        super(SMOTEBoost, self).__init__(
            base_estimator=base_estimator,
            n_estimators=n_estimators,
            learning_rate=learning_rate,
            random_state=random_state)

    def fit(self, X, y, sample_weight=None, minority_target=None):
        """Build a boosted classifier/regressor from the training set (X, y),
        performing SMOTE during each boosting step.

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape = [n_samples, n_features]
            The training input samples. Sparse matrix can be CSC, CSR, COO,
            DOK, or LIL. COO, DOK, and LIL are converted to CSR. The dtype is
            forced to DTYPE from tree._tree if the base classifier of this
            ensemble weighted boosting classifier is a tree or forest.
        y : array-like of shape = [n_samples]
            The target values (class labels in classification, real numbers in
            regression).
        sample_weight : array-like of shape = [n_samples], optional
            Sample weights. If None, the sample weights are initialized to
            1 / n_samples.
        minority_target : int
            Minority class label.

        Returns
        -------
        self : object
            Returns self.

        Notes
        -----
        Based on the scikit-learn v0.18 AdaBoostClassifier and
        BaseWeightBoosting `fit` methods.
        """
        # Check that algorithm is supported.
        if self.algorithm not in ('SAMME', 'SAMME.R'):
            raise ValueError("algorithm %s is not supported" % self.algorithm)

        # Check parameters.
        if self.learning_rate <= 0:
            raise ValueError("learning_rate must be greater than zero")

        if (self.base_estimator is None or
                isinstance(self.base_estimator, (BaseDecisionTree,
                                                 BaseForest))):
            DTYPE = np.float64  # matches the DTYPE used by sklearn's trees
            dtype = DTYPE
            accept_sparse = 'csc'
        else:
            dtype = None
            accept_sparse = ['csr', 'csc']

        X, y = check_X_y(X, y, accept_sparse=accept_sparse, dtype=dtype,
                         y_numeric=is_regressor(self))

        if sample_weight is None:
            # Initialize weights to 1 / n_samples.
            sample_weight = np.empty(X.shape[0], dtype=np.float64)
            sample_weight[:] = 1. / X.shape[0]
        else:
            sample_weight = check_array(sample_weight, ensure_2d=False)
            # Normalize existing weights.
            sample_weight = sample_weight / sample_weight.sum(dtype=np.float64)

            # Check that the sample weights sum is positive.
            if sample_weight.sum() <= 0:
                raise ValueError(
                    "Attempting to fit with a non-positive "
                    "weighted number of samples.")

        if minority_target is None:
            # Determine the minority class label.
            stats_c_ = Counter(y)
            maj_c_ = max(stats_c_, key=stats_c_.get)
            min_c_ = min(stats_c_, key=stats_c_.get)
            self.minority_target = min_c_
        else:
            self.minority_target = minority_target

        # Check parameters.
        self._validate_estimator()

        # Clear any previous fit results.
        self.estimators_ = []
        self.estimator_weights_ = np.zeros(self.n_estimators, dtype=np.float64)
        self.estimator_errors_ = np.ones(self.n_estimators, dtype=np.float64)

        random_state = check_random_state(self.random_state)

        for iboost in range(self.n_estimators):
            # SMOTE step.
 #           X_min = X[np.where(y == self.minority_target)]
 #           self.smote.fit(X_min)
 #           X_syn = self.smote.sample(self.n_samples)
 #           y_syn = np.full(X_syn.shape[0], fill_value=self.minority_target, dtype=np.int64)

            # fit_sample returns the original samples followed by the synthetic ones
            X_temp, y_temp = self.smote.fit_sample(X, y)
            X_syn = X_temp[len(X):]
            y_syn = y_temp[len(y):]

            # Normalize synthetic sample weights based on current training set.
            sample_weight_syn = np.empty(X_syn.shape[0], dtype=np.float64)
            sample_weight_syn[:] = 1. / X.shape[0]

            # Combine the original and synthetic samples.
            X = np.vstack((X, X_syn))
            y = np.append(y, y_syn)

            # Combine the weights.
            sample_weight = \
                np.append(sample_weight, sample_weight_syn).reshape(-1, 1)
            sample_weight = \
                np.squeeze(normalize(sample_weight, axis=0, norm='l1'))

            # X, y, sample_weight = shuffle(X, y, sample_weight,
            #                              random_state=random_state)

            # Boosting step.
            sample_weight, estimator_weight, estimator_error = self._boost(
                iboost,
                X, y,
                sample_weight,
                random_state)

            # Early termination.
            if sample_weight is None:
                break

            self.estimator_weights_[iboost] = estimator_weight
            self.estimator_errors_[iboost] = estimator_error

            # Stop if error is zero.
            if estimator_error == 0:
                break

            sample_weight_sum = np.sum(sample_weight)

            # Stop if the sum of sample weights has become non-positive.
            if sample_weight_sum <= 0:
                break

            if iboost < self.n_estimators - 1:
                # Normalize.
                sample_weight /= sample_weight_sum

        return self
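A short usage sketch for this class (the data is illustrative, and a shallow decision tree is swapped in for the SVC default above to keep it fast; it assumes the older scikit-learn/imbalanced-learn APIs this class is written against):

import numpy as np
from sklearn.tree import DecisionTreeClassifier

rng = np.random.RandomState(0)
X = rng.normal(size=(200, 5))
y = (rng.rand(200) < 0.1).astype(int)  # roughly 10% minority class

clf = SMOTEBoost(n_estimators=10,
                 base_estimator=DecisionTreeClassifier(max_depth=1),
                 random_state=0)
clf.fit(X, y)
print(clf.predict(X[:5]))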
Example #19
              metrics=["accuracy"])
# if __name__ == '__main__':

# img_path = '/data/edong/PycharmProjects/projpy/CancerOriginal/00-3734A_Thionin_Cancer_FEU_00000_1_40x.tif'
# img = image.load_img(img_path, target_size=(96, 96))
# img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
# x = image.img_to_array(img)
# x = np.expand_dims(x, axis=0)
# x = preprocess_input(x)
# print('Input image shape:', x.shape)
#
# Test a image:
# preds = model.predict(x)
# print('Predicted:', decode_predictions(preds))

# Resample the training data with SMOTE, then fit the model on the balanced set
# (SMOTE.fit does not accept Keras arguments such as batch_size or epochs)
from imblearn.over_sampling import SMOTE
sm = SMOTE()
X_train_sm, y_train_sm = sm.fit_sample(X_train, y_train)
histsmote = model.fit(X_train_sm,
                      y_train_sm,
                      batch_size=64,
                      epochs=80,
                      verbose=1,
                      validation_data=(X_val, y_val))
hist = model.fit(X_train,
                 y_train,
                 batch_size=64,
                 epochs=80,
                 verbose=1,
                 validation_data=(X_val, y_val))
Example #20
y_rfm = df_modeling_rfm['response']

#CLV
X_clv = df_modeling_clv.drop(columns=['response','customer_id'])
y_clv = df_modeling_clv['response']

## creating train and test dataset
#RFM
X_train_rfm, X_test_rfm, y_train_rfm, y_test_rfm = train_test_split(X_rfm, y_rfm, test_size=0.3, random_state=0)
#CLV
X_train_clv, X_test_clv, y_train_clv, y_test_clv = train_test_split(X_clv, y_clv, test_size=0.3, random_state=0)

# oversample the minority class with SMOTE because the data is imbalanced
sm = SMOTE(random_state=0)
#RFM
X_SMOTE_rfm, y_SMOTE_rfm = sm.fit_sample(X_train_rfm, y_train_rfm)

#CLV
X_SMOTE_clv, y_SMOTE_clv = sm.fit_sample(X_train_clv, y_train_clv)

print('logistic regression model - SMOTE RFM')
logreg = LogisticRegression(solver='liblinear', class_weight='balanced')
predicted_y = []
expected_y = []

logreg_model_SMOTE_rfm = logreg.fit(X_SMOTE_rfm, y_SMOTE_rfm)
predictions = logreg_model_SMOTE_rfm.predict(X_SMOTE_rfm)
predicted_y.extend(predictions)
expected_y.extend(y_SMOTE_rfm)
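Note that the predictions above are made on the resampled training data itself, so the scores will look optimistic. A minimal sketch of evaluating on the untouched test split instead (classification_report is assumed to be imported from sklearn.metrics):

from sklearn.metrics import classification_report

test_predictions = logreg_model_SMOTE_rfm.predict(X_test_rfm)
print(classification_report(y_test_rfm, test_predictions))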
Example #21
def multiclass_classification(X, Y, sub_to_main_type, feature_names, isSubType,
                              samplingMethod):
    """
    This function is for multi-class classification with some sampling methods.

    :param X: numpy array for features.
    :param Y: numpy array for class labels.
    :param sub_to_main_type: dict mapping network sub-type to network type.
    :param feature_names: a list of feature names.
    :param isSubType: flag for if labels in Y are network subtypes or not.
    :param samplingMethod: name of the sampling method. Valid names are: RandomOver, RandomUnder, SMOTE and None
    :return:
     cm: confusion matrix
     NetworkTypeLabels: a list of string, either network type or network subtype.
     accuracy: accuracy value taking a value in the range [0-1].
     feature_importances: a list of tuple of a feature's name and its importance in the classification.
    """

    if isSubType:
        NetworkTypeLabels = sorted(
            list(set(Y)), key=lambda sub_type: sub_to_main_type[sub_type])
    else:
        NetworkTypeLabels = sorted(list(set(Y)))

    sss = StratifiedShuffleSplit(n_splits=3, test_size=0.4, random_state=0)

    for train_index, test_index in sss.split(X, Y):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = Y[train_index], Y[test_index]

    if samplingMethod == "RandomOver":
        random_over = RandomOverSampler()
        sampled_x, sampled_y = random_over.fit_sample(X_train, y_train)

    elif samplingMethod == "RandomUnder":
        random_under = RandomUnderSampler()
        sampled_x, sampled_y = random_under.fit_sample(X_train, y_train)

    # The SMOTE used here does not support multi-class problems directly, so we oversample each minority class
    # against the majority in a binary setting; each class should end up with as many instances as the largest one.
    elif samplingMethod == "SMOTE":
        sm = SMOTE(kind='regular', k=3)
        sm.fit(X_train, y_train)

        # get the label of the largest class in terms of the number of instances.
        majority = sm.maj_c_

        all_X = []
        all_Y = []

        for network_type in NetworkTypeLabels:
            if network_type != majority:
                # extract elements of a pair of network types, i.e. the majority and one to be inflated
                X_extracted = np.concatenate(
                    (X_train[y_train == majority],
                     X_train[y_train == network_type]),
                    axis=0)
                Y_extracted = np.concatenate(
                    (y_train[y_train == majority],
                     y_train[y_train == network_type]),
                    axis=0)
                x_tmp, y_tmp = sm.fit_sample(X_extracted, Y_extracted)
                x = x_tmp[y_tmp == network_type]
                y = y_tmp[y_tmp == network_type]
                all_X.append(x)
                all_Y.append(y)

        all_X.append(X_train[y_train == majority])
        all_Y.append(y_train[y_train == majority])

        Xs = np.concatenate(tuple(all_X))
        Ys = np.concatenate(tuple(all_Y))

        sampled_x, sampled_y = sm.fit_sample(Xs, Ys)

    elif samplingMethod == "None":
        sampled_x, sampled_y = X_train, y_train

    random_forest = RandomForestClassifier()
    random_forest.fit(sampled_x, sampled_y)
    accuracy = random_forest.score(X_test, y_test)

    feature_importances = sorted(zip(
        map(lambda x: round(x, 4), random_forest.feature_importances_),
        feature_names),
                                 reverse=True)

    y_pred = random_forest.predict(X_test)
    cm = confusion_matrix(y_test, y_pred, labels=NetworkTypeLabels)
    return cm, NetworkTypeLabels, accuracy, feature_importances
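For reference, recent imbalanced-learn releases resample multi-class targets directly, which would collapse the pairwise loop above into a single call; a minimal sketch (assuming imbalanced-learn >= 0.4):

from imblearn.over_sampling import SMOTE

sm = SMOTE(k_neighbors=3, random_state=0)
sampled_x, sampled_y = sm.fit_resample(X_train, y_train)  # every minority class is oversampled to the majority size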
Example #22
acc = do_cross_val_LR(np.array(X_resampled), y_resampled, 10)
print("Accuracy", acc)
##Random Forest
acc = do_cross_val_RForest(np.array(X_resampled), y_resampled, 10)
print("Accuracy", acc)

##Balancing by SMOTE
from imblearn.over_sampling import SMOTE
print('Original dataset shape {}'.format(Counter(y)))
sm = SMOTE(random_state=0)
X_res, y_res = sm.fit_sample(X, y)
print('Resampled dataset shape {}'.format(Counter(y_res)))
from imblearn.over_sampling import SMOTE
print('Original dataset shape {}'.format(Counter(y)))
sm = SMOTE(random_state=42)
sm.fit(X, y)
X_res, y_res = sm.sample(X, y)
print('Resampled dataset shape {}'.format(Counter(y_res)))
X_res1, y_res1 = sm.fit_sample(X_res, y_res)
print('Resampled dataset shape {}'.format(Counter(y_res1)))
X_res2, y_res2 = sm.fit_sample(X_res1, y_res1)
print('Resampled dataset shape {}'.format(Counter(y_res2)))

## Decision Tree
acc = do_cross_val_Decision(X_res2, y_res2, 10)
print("Accuracy", acc)
## Logistic Regression
acc = do_cross_val_LR(X_res2, y_res2, 10)
print("Accuracy", acc)
## Random Forest
acc = do_cross_val_RForest(X_res2, y_res2, 10)
Example #23
lcol_num = [x for x in dtype.index.values if not(x in list_col_cat)]
for i in lcol_num:
  colname.append("{}".format(i))

X_train = pipeline_preprocess.transform(X_train)
X_test = pipeline_preprocess.transform(X_test)

joblib.dump([dtype,categorical_feat_classes,list_col_cat,list_idx_cat,categorical_onehot_idx,categorical_onehot_nval,colname], './model/las_kupedes_ultramikro_v3_var.sav')
#joblib.dump([le,pipeline_preprocess], './model/las_kupedes_ultramikro_v3_preprocess.sav')
joblib.dump([le,pipeline_preprocess], './model/las_kupedes_ultramikro_v3_preprocess_wo_scaler.sav')

### Resampling unbalanced dataset
# (1) Over-sampling with SMOTE
def_ratio = 0.15
sm = SMOTE(random_state=42,
           ratio={0: Y_train.value_counts()[0],
                  1: int(Y_train.value_counts()[0] * (def_ratio / (1 - def_ratio)))})
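# With def_ratio = 0.15, class 0 keeps its original count N0 and class 1 is grown to
# int(N0 * 0.15/0.85), so class 1 makes up 15% of the resampled training set.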
sm.fit(X_train,Y_train)
X_train_upsampled, Y_train_upsampled = sm.sample(X_train, Y_train)
# (2) Class weight
from sklearn.utils.class_weight import compute_class_weight, compute_sample_weight
sample_weight = compute_sample_weight(
  class_weight = {0:1,1:10},
  y = Y_train_upsampled
)

### CV - XGBoost

from sklearn.model_selection import KFold
K = 5
kf = KFold(n_splits = K, random_state = 3228, shuffle = True)

xgb_preds = []
Example #24
from imblearn.over_sampling import SMOTE

# Generate a global dataset to use
RND_SEED = 0
book = xlrd.open_workbook("F:/Dot/Downloads/truncated.xls")
sheet = book.sheet_by_index(0)

X = []
for row in range(sheet.nrows):
    _row = []
    for col in range(sheet.ncols):
        _row.append(sheet.cell_value(row, col))
    X.append(_row)
X = np.asmatrix(X)
Y = []
X = X.transpose()
for i in range(96):
    if i < 87:
        Y.append(2)
    else:
        Y.append(1)
R_TOL = 1e-4

kind = 'regular'
smote = SMOTE(random_state=RND_SEED, kind=kind)
# Fit the data
smote.fit(X, Y)

X_resampled, y_resampled = smote.fit_sample(X, Y)
print(X_resampled)
Example #25
model = LogisticRegression(random_state=6)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
score = accuracy_score(y_test, y_pred)
print("Accuracy score:", score)

# Code ends here

# --------------
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE

# code starts here
smote = SMOTE(random_state=9)
X_train, y_train = smote.fit_sample(X_train, y_train)  # fit_sample both fits and resamples
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Code ends here

# --------------
# Code Starts here

model = LogisticRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
score = accuracy_score(y_test, y_pred)
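Because SMOTE interpolates between nearest neighbours, scaling before oversampling usually makes those distances more meaningful. imbalanced-learn's own Pipeline keeps the steps in order and applies the sampler only during fit; a minimal sketch (assuming imbalanced-learn is installed):

from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

pipe = Pipeline([('scale', StandardScaler()),
                 ('smote', SMOTE(random_state=9)),  # resampling happens only on fit, never on predict
                 ('lr', LogisticRegression())])
pipe.fit(X_train, y_train)
print(pipe.score(X_test, y_test))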
Example #26
# Find intersecting features
avail_columns = compound_x.columns.intersection(full_columns)
# Select features on subset
x_data = compound_x.loc[:, avail_columns]
y_data = compound_y.copy()
# Create binary variable
y_class = np.squeeze([int(y_val <= 10) for y_val in y_data])

# Smote
from custom_pipe_helper import SMOTER

import auto

smote = SMOTE()

smote.fit(x_data, y_class)
check = smote.sample(x_data, y_class)

check[0].shape
check[1]

# Create folds
# For each fold
# SMOTE the train data
# Train model
# Evaluate model

from sklearn.ensemble import AdaBoostClassifier
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import StratifiedKFold