def resample(X, y, sample_fraction=0.1, test_size=0.3):
    X_columns = X.columns
    y_columns = y.columns
    n = len(X_columns)

    print('~' * 80)
    print('@@-\n', y.converted.value_counts())
    print('@@0 - Original')
    show_balance(y.values)

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=42)
    print('@@1 - y_train')
    show_balance(y_train)
    print('@@1 - y_test')
    show_balance(y_test)
    assert X_train.shape[1] == n and X_test.shape[1] == n

    ros = RandomOverSampler(random_state=42)
    X_train, y_train = ros.fit_sample(X_train, y_train)
    X_test, y_test = ros.fit_sample(X_test, y_test)
    print('@@3 - Oversampled y_train')
    show_balance(y_train)
    print('@@3 - Oversampled y_test')
    show_balance(y_test)
    assert X_train.shape[1] == n and X_test.shape[1] == n

    if sample_fraction < 1.0:
        _, X_train, _, y_train = train_test_split(X_train, y_train, test_size=sample_fraction, random_state=43)
        _, X_test, _, y_test = train_test_split(X_test, y_test, test_size=sample_fraction, random_state=44)
        print('@@4 - Downsampled y_train')
        show_balance(y_train)
        print('@@4 - Downsampled y_test')
        show_balance(y_test)
        assert len(X_train.shape) == 2 and len(X_test.shape) == 2, (X_train.shape, X_test.shape)
        assert X_train.shape[1] == n and X_test.shape[1] == n, (X_train.shape, X_test.shape)

    print('X_columns=%d %s' % (len(X_columns), X_columns))
    print('y_columns=%d %s' % (len(y_columns), y_columns))
    print('X_train=%-10s y_train=%s' % (list(X_train.shape), list(y_train.shape)))
    print('X_test =%-10s y_test =%s' % (list(X_test.shape), list(y_test.shape)))
    assert X_train.shape[1] == n and X_test.shape[1] == n

    X_train = pd.DataFrame(X_train, columns=X_columns)
    y_train = pd.DataFrame(y_train, columns=y_columns, index=X_train.index)
    X_test = pd.DataFrame(X_test, columns=X_columns)
    y_test = pd.DataFrame(y_test, columns=y_columns, index=X_test.index)
    print('@@+ y_train\n', y_train.converted.value_counts(), flush=True)
    print('@@+ y_test\n', y_test.converted.value_counts(), flush=True)

    return (X_train, y_train), (X_test, y_test)
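
A minimal compatibility sketch: the examples on this page call fit_sample, which imbalanced-learn 0.4 renamed to fit_resample (later releases drop the old name entirely). On current versions the equivalent call is:

from imblearn.over_sampling import RandomOverSampler

ros = RandomOverSampler(random_state=42)
X_res, y_res = ros.fit_resample(X_train, y_train)  # X_train, y_train assumed defined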
Example #2
    def transform(self, X, y=None):
        # TODO how do we validate this happens before train/test split? Or do we need to? Can we implement it in the
        # TODO      simple trainer in the correct order and leave this to advanced users?

        # Extract predicted column
        y = np.squeeze(X[[self.predicted_column]])

        # Copy the dataframe without the predicted column
        temp_dataframe = X.drop([self.predicted_column], axis=1)

        # Initialize and fit the over-sampler
        over_sampler = RandomOverSampler(random_state=self.random_seed)
        x_over_sampled, y_over_sampled = over_sampler.fit_sample(temp_dataframe, y)

        # Build the resulting over-sampled dataframe
        result = pd.DataFrame(x_over_sampled)

        # Restore the column names
        result.columns = temp_dataframe.columns

        # Restore the y values
        y_over_sampled = pd.Series(y_over_sampled)
        result[self.predicted_column] = y_over_sampled

        return result
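
A hedged answer to the TODO above: imbalanced-learn's own Pipeline keeps resampling out of the test folds automatically, because samplers are applied during fit only and skipped at predict time. A minimal sketch, assuming any scikit-learn classifier:

from imblearn.pipeline import Pipeline
from imblearn.over_sampling import RandomOverSampler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

pipe = Pipeline([
    ('ros', RandomOverSampler(random_state=0)),  # resamples the training folds only
    ('clf', LogisticRegression(max_iter=1000)),
])
# scores = cross_val_score(pipe, X, y, cv=5)     # X, y assumed defined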
Example #3
 def oversample(self):
     self._X_original = self._X
     self._y_original = self._y
     ros = RandomOverSampler(random_state=0)
     X, y = ros.fit_sample(self._X, self._y)
     self._X = X
     self._y = y
def test_ros_fit_sample():
    """Test the fit sample routine"""

    # Resample the data
    ros = RandomOverSampler(random_state=RND_SEED)
    X_resampled, y_resampled = ros.fit_sample(X, Y)

    currdir = os.path.dirname(os.path.abspath(__file__))
    X_gt = np.load(os.path.join(currdir, 'data', 'ros_x.npy'))
    y_gt = np.load(os.path.join(currdir, 'data', 'ros_y.npy'))
    assert_array_equal(X_resampled, X_gt)
    assert_array_equal(y_resampled, y_gt)
Example #5
 def Random_OverSample(self, imData):
     imDataPlace = self.sPath + imData
     tradition_list, _ = LoadTraditionCSV(imDataPlace)
     trainingDF_Bug = LoadCSV(imDataPlace)['bug'].tolist()
     training_listLabel = MakeLabels(trainingDF_Bug)
     from imblearn.over_sampling import RandomOverSampler
     ros = RandomOverSampler(random_state=0)
     X_res, y_res = ros.fit_sample(tradition_list, training_listLabel)
     result = [X_res, y_res]
     savepickle(result,
                self.oPath + "AfterOverSample" + imData[:-4] + ".pickle")
     return "AfterOverSample" + imData[:-4] + ".pickle"
def oversample(alg, X_train, y_train):
    # print('in oversample got ', X_train, y_train)
    if alg == 'smote':
        smt = SMOTE()
        X_train, y_train = smt.fit_sample(X_train, y_train)
        # print('Resampled dataset shape %s' % Counter(y_train))
        return X_train, y_train
    if alg == 'random':
        ros = RandomOverSampler(random_state=42)
        X_train, y_train = ros.fit_sample(X_train, y_train)
        # print('Resampled dataset shape %s' % Counter(y_train))
        return X_train, y_train
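
A quick, hedged sanity check for either branch, assuming a toy imbalanced dataset:

from collections import Counter
from sklearn.datasets import make_classification

X_demo, y_demo = make_classification(n_samples=200, weights=[0.9, 0.1], random_state=0)
print(Counter(y_demo))   # roughly {0: 180, 1: 20}
X_bal, y_bal = oversample('random', X_demo, y_demo)
print(Counter(y_bal))    # both classes at the majority count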
def test_ros_fit_sample_half():
    ratio = 0.5
    ros = RandomOverSampler(ratio=ratio, random_state=RND_SEED)
    X_resampled, y_resampled = ros.fit_sample(X, Y)
    X_gt = np.array([[0.04352327, -0.20515826], [0.92923648, 0.76103773],
                     [0.20792588, 1.49407907], [0.47104475, 0.44386323],
                     [0.22950086, 0.33367433], [0.15490546, 0.3130677],
                     [0.09125309, -0.85409574], [0.12372842, 0.6536186],
                     [0.13347175, 0.12167502], [0.094035, -2.55298982]])
    y_gt = np.array([1, 0, 1, 0, 1, 1, 1, 1, 0, 1])
    assert_array_equal(X_resampled, X_gt)
    assert_array_equal(y_resampled, y_gt)
Example #8
def KMeans_unbalanced(X_datavec,
                      Y_datavec,
                      X_columns,
                      Y_names,
                      num_used=20000):

    XY_datavec = pd.merge(pd.DataFrame(X_datavec, columns=X_columns),
                          pd.DataFrame(Y_datavec, columns=[Y_names]),
                          how="left",
                          right_index=True,
                          left_index=True)
    XY_datavec_normal = XY_datavec[XY_datavec[Y_names] == 0]
    X_datavec_normal = XY_datavec_normal.drop(Y_names, axis=1).values.tolist()
    XY_datavec_outlier = XY_datavec[XY_datavec[Y_names] == 1]

    # Handle the class-imbalance problem
    best_num_cluster = GS_KMeans_parameter(X_datavec_normal)
    y_clst_labels = Model_KMeans(X_datavec_normal, best_num_cluster)
    # Offset the cluster ids so they cannot collide with the existing labels
    y_clst_labels = [i + 100 for i in y_clst_labels]
    print('y_clst_labels Information:', set(y_clst_labels))
    print('----------------------------------------------')
    XY_clst_normal = pd.merge(pd.DataFrame(X_datavec_normal,
                                           columns=X_columns),
                              pd.DataFrame(y_clst_labels, columns=[Y_names]),
                              how="left",
                              right_index=True,
                              left_index=True)
    XY_datavec = pd.concat([XY_clst_normal, XY_datavec_outlier])
    X_data = XY_datavec.drop(Y_names, axis=1).values.tolist()
    Y_data = XY_datavec[Y_names].values.tolist()
    # Print the count of each label
    print('Counter:y', Counter(Y_data))
    print('----------------------------------------------')
    # Randomly over-sample the minority classes to fix the imbalance
    ros = RandomOverSampler(random_state=0)
    #ros = SMOTE(random_state=0)
    X_resampled, Y_resampled = ros.fit_sample(X_data, Y_data)
    print('Counter:y after using RandomOverSampler', Counter(Y_resampled))
    print('----------------------------------------------')
    if len(X_resampled) > num_used * len(Counter(Y_data)):
        # Stratified sampling of num_used instances per class
        x_NoUse_train, X_resampled, y_NoUse_train, Y_resampled = train_test_split(
            X_resampled,
            Y_resampled,
            train_size=None,
            test_size=num_used * len(Counter(Y_data)),
            stratify=Y_resampled,
            random_state=0)
    print('Counter:used y', Counter(Y_resampled))
    print('----------------------------------------------')

    return X_resampled, Y_resampled
Example #9
def AdversarialTrainVal(train, test):
    from imblearn.over_sampling import RandomOverSampler
    ros = RandomOverSampler(ratio='minority', random_state=44)
    X_res, y_res = ros.fit_sample(train, train.target)
    train = pd.DataFrame(X_res, columns=train.columns)
    train = train.sort_values(by=['prob_test'], ascending=False)
    Xtrain = pd.DataFrame(train.nlargest(int(train.shape[0] * 0.82),
                                         'prob_test'),
                          columns=train.columns)
    ros = RandomOverSampler(ratio='all', random_state=44)
    X_xres, y_xres = ros.fit_sample(Xtrain, Xtrain.target)
    X_xres = pd.DataFrame(X_xres, columns=train.columns)
    X_data, X_test, Y_data, y_test = train_test_split(X_xres,
                                                      X_xres.target,
                                                      stratify=X_xres.target,
                                                      test_size=0.6,
                                                      random_state=44)

    Xtrain = Xtrain.append(X_data)
    print("Xtrain = {}".format(Xtrain.shape))
    Xtrain = Xtrain.append(
        train.nlargest(int(train.shape[0] * 0.31), 'prob_test'))
    for i in range(100):
        Xtrain = Xtrain.append(train[train["prob_test"] > 0.80])
        Xtrain = Xtrain.append(train[train["prob_test"] > 0.70])
    for i in range(110):
        Xtrain = Xtrain.append(train[train["prob_test"] > 0.60])

    #Xtrain = Xtrain.drop(["is_test"], 1)
    #Xtrain = Xtrain.append(train)

    val = train.nsmallest(int(train.shape[0] * 0.7), 'prob_test')
    val = val.append(X_test)

    #val = val.drop(["is_test"], 1)

    x_train, y_train = Xtrain.drop(['prob_test', "target"], 1), Xtrain.target
    x_val, y_val = val.drop(['prob_test', "target"], 1), val.target
    x_test = test.drop(['prob_test', "target"], 1)
    return x_train, y_train, x_val, y_val, x_test
Example #10
def fit_RandomForestClassifier(X_train, X_test, y_train, y_test, target_label):
    # Pipeline
    a = Imputer(missing_values='NaN', strategy='median', axis=0)
    b = StandardScaler()
    c = SelectKBest()
    d = RandomOverSampler()
    X_res, y_res = d.fit_sample(X_train, y_train)
    clf = RandomForestClassifier()
    model = Pipeline([('impute', a), ('scaling', b), ('anova', c),
                      ('rf', clf)])

    # Grid Search CV
    parameters = {
        'anova__k': [5, 10, 20, 40],
        'rf__n_estimators': [10, 50],
        'rf__criterion': ['gini', 'entropy'],
        'rf__max_features': ['auto', 'sqrt']
    }
    grid = GridSearchCV(model, parameters, cv=10, scoring='f1_weighted')
    grid.fit(X_res, y_res)

    # Features Used
    final_pipeline = grid.best_estimator_
    select_indices = final_pipeline.named_steps['anova'].transform(
        np.arange(X_train.shape[1]).reshape(1, -1))
    feature_names = X_train.columns[select_indices]
    # Predicting and scoring on test set
    class_index = list(grid.classes_).index(target_label)
    y_pred = grid.predict(X_test)
    y_score = grid.predict_proba(X_test)[:, class_index]
    model_name = get_modelName(clf)

    # Plot
    fig = plt.figure(figsize=(15, 5))
    plt.subplot(121)
    # ROC curve
    metrics.plot_roc(y_test, y_score, target_label, model_name)
    # Precision Recall Curve
    plt.subplot(122)
    metrics.plot_prc(y_test, y_score, target_label, model_name)
    plt.show()

    # Get Metrics
    metric = metrics.get_ClassificationMetrics(model_name, y_test, y_pred,
                                               y_score, target_label)
    print('Training data best accuracy: %.5f' % grid.best_score_)
    print('Testing data accuracy: %.5f' % grid.score(X_test, y_test))
    print()
    print('Classification Report:')
    print(classification_report(y_test, y_pred))

    return metric, feature_names, y_score
    def resampling_data(self, X, y):

        # Takes a dataset with X and multi-label y
        lp = LabelPowerset()
        ros = RandomOverSampler(random_state=42)

        # Applies the above stated multi-label (ML) to multi-class (MC) transformation.
        yt = lp.transform(y)
        X_resampled, y_resampled = ros.fit_sample(X, yt)
        # Inverts the ML-MC transformation to recreate the ML set
        y_resampled = lp.inverse_transform(y_resampled)

        return X_resampled, y_resampled
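
A hedged usage sketch of the same pattern, assuming LabelPowerset comes from scikit-multilearn and an older imblearn that still exposes fit_sample:

import numpy as np
from skmultilearn.problem_transform import LabelPowerset
from imblearn.over_sampling import RandomOverSampler

X = np.random.rand(8, 3)
y = np.array([[1, 0]] * 6 + [[0, 1]] * 2)  # imbalanced two-label indicator matrix
lp = LabelPowerset()
ros = RandomOverSampler(random_state=42)
yt = lp.transform(y)                       # multi-label -> multi-class codes
X_res, yt_res = ros.fit_sample(X, yt)
y_res = lp.inverse_transform(yt_res)       # back to a (sparse) multi-label matrix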
def oversample(X_train, y_train):
    # Flatten each (levels x variables) sample, oversample, then restore the
    # 3-D shape. Both reshapes must use the same memory order, otherwise the
    # features of every sample get scrambled.
    ros = RandomOverSampler(random_state=42)

    n_samples, n_levels, n_variables = X_train.shape

    X_train = X_train.reshape((n_samples, -1), order='F')
    X_train, y_train = ros.fit_sample(X_train, y_train)
    X_train = np.reshape(X_train, (-1, n_levels, n_variables), order='F')

    return X_train, y_train
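
A short, hedged check that the reshape round trip above preserves each sample when both calls share the same memory order:

import numpy as np

a = np.arange(24).reshape(2, 3, 4)
flat = a.reshape((2, -1), order='F')
back = flat.reshape((-1, 3, 4), order='F')
assert np.array_equal(a, back)  # fails if the second reshape uses the default C order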
    def oversample(self):
        """Balance class data based on outcome"""
        print('Current outcome sampling {}'.format(Counter(self.y)))
        
        # to use a random sampling seed at random:
        ros = RandomOverSampler()
        #ros = SMOTE()
        #ros = ADASYN()

        self.X, self.y = ros.fit_sample(self.X, self.y)

        self.Xview = self.X.view()[:, :self.n_features]
        print('Resampled dataset shape {}'.format(Counter(self.y)))
Example #14
def sample_data(X, y, sampleMethod):
    if sampleMethod == "Undersample":
        rus = RandomUnderSampler(return_indices=True)
        X_rus, y_rus, id_rus = rus.fit_sample(X, y)
        return X_rus, y_rus, id_rus
    elif sampleMethod == "Oversample":
        ros = RandomOverSampler()
        X_ros, y_ros = ros.fit_sample(X, y)
        return X_ros, y_ros
    elif sampleMethod == "SMOTE":
        smote = SMOTE(ratio='minority')
        X_sm, y_sm = smote.fit_sample(X, y)
        return X_sm, y_sm
Example #15
def preprocess_data(data, upsample=False):
    data_x = data.iloc[:, 0:-1]
    data_y = data.iloc[:, -1:]  # last column as a one-column frame
    if upsample:
        sampler = RandomOverSampler()
        data_x_sampled, data_y_sampled = sampler.fit_sample(data_x, data_y)
        print(data_x_sampled.shape[0] - data_x.shape[0],
              'new random picked points')
        data_x = data_x_sampled
        data_y = data_y_sampled.reshape(-1, 1)
    # standard_x=StandardScaler().fit_transform(data_x)
    standard_x = (data_x - data_x.mean()) / data_x.std()
    return np.hstack((standard_x, np.array(data_y)))
Example #16
def OverSampling_RandomOver(X, y):
    from collections import Counter
    print(sorted(Counter(y).items()))
    from imblearn.over_sampling import RandomOverSampler
    ros = RandomOverSampler(random_state=0)
    newX, newY = ros.fit_sample(X, y)
    print(newX.shape, newY.shape, type(newX))
    print(sorted(Counter(newY).items()))
    print('-' * 20)
    #from imblearn.datasets import make_imbalance
    #newX,newY=make_imbalance(newX,newY,ratio=ratio_multiplier)
    #print(sorted(Counter(newY).items()))
    return newX, newY
Example #17
def test_ros_fit_sample_half():
    """Test the fit sample routine with a 0.5 ratio"""

    # Resample the data
    ratio = 0.5
    ros = RandomOverSampler(ratio=ratio, random_state=RND_SEED)
    X_resampled, y_resampled = ros.fit_sample(X, Y)

    currdir = os.path.dirname(os.path.abspath(__file__))
    X_gt = np.load(os.path.join(currdir, 'data', 'ros_x_05.npy'))
    y_gt = np.load(os.path.join(currdir, 'data', 'ros_y_05.npy'))
    assert_array_equal(X_resampled, X_gt)
    assert_array_equal(y_resampled, y_gt)
Example #18
 def data_prepare(self):
     """
     数据准备,由于数据集存在不均衡的现象[点击数:未点击数=1:8],所以对训练集数据进行处理,处理方式为对训练集中对点击数据进行随机过采样
     """
     print('正在进行数据集准备...')
     if self.oversampling == 's':
         ros = RandomOverSampler(random_state=0)
         new_train_vec, new_train_label = ros.fit_sample(
             self.train_vec, self.train_label)
     else:
         new_train_vec, new_train_label = self.train_vec, self.train_label
     print('Dataset preparation complete...')
     return new_train_vec, new_train_label
Example #19
 def sample(self, X, Y):
     ##################over_sample########################
     if self.with_sample:
         shape = X.shape
         X = X.reshape((shape[0], -1))
         # Build the sampler; random_state fixes the random seed
         if self.sample_method == "ROS":
             sampler = RandomOverSampler(random_state=0)
         elif self.sample_method == "SMOTE":
             sampler = SMOTE(random_state=0)
         X, Y = sampler.fit_sample(X, Y)
         X = X.reshape((-1, shape[1], shape[2], shape[3]))
     return X, Y
 def oversample(self):
     """Balance class data based on outcome"""
     print('Current outcome sampling {}'.format(Counter(self.y)))
     
     # to use a random sampling seed at random:
     ros = RandomOverSampler()
     
     # to fix the random sampling seed at a certain value & return indices: 
     #ros = RandomOverSampler(random_state=2)
     
     self.X, self.y = ros.fit_sample(self.X, self.y)
     
     self.Xview = self.X.view()[:, :self.n_features]
     print('Resampled dataset shape {}'.format(Counter(self.y)))
Example #21
 def Rand_Over_Samp(self, X_t, Y_t):
     X_train = pd.DataFrame(self.X_t)
     Y_train = pd.DataFrame(self.Y_t)
     comb = pd.concat([X_train, Y_train], axis=1)
     l = list(comb)
     sampler = ROS(random_state=42)
     sampled_X, sampled_Y = sampler.fit_sample(X_train,
                                               Y_train.values.ravel())
     sampled_X = pd.DataFrame(sampled_X)
     sampled_Y = pd.DataFrame(sampled_Y)
     data_for_modelling = np.concatenate([sampled_X, sampled_Y], axis=1)
     data_for_modelling = pd.DataFrame(data_for_modelling)
     data_for_modelling.columns = l
     return data_for_modelling
Example #22
def load_data(binary=False):
    src_path = os.path.dirname(os.path.realpath(__file__))

    s_var = BinaryVariable(name=u'sex', pos=u'Male', neg=u'Female')
    y_var = BinaryVariable(name=u'income', pos=u'>50K', neg=u'<=50K')

    if binary:
        df = pd.read_csv(os.path.join(src_path, '../data/adult/adult-b.csv'))
        x_vars = [
            CategoricalVariable('age'),
            CategoricalVariable('workclass'),
            CategoricalVariable('education-num'),
            CategoricalVariable('marital-status'),
            CategoricalVariable('occupation'),
            CategoricalVariable('relationship'),
            CategoricalVariable('race'),
            CategoricalVariable('hours-per-week'),
            CategoricalVariable('native-country')
        ]
    else:
        df = pd.read_csv(os.path.join(src_path, '../data/adult/adult.csv'))
        x_vars = [
            QuantitativeVariable('age'),
            CategoricalVariable('workclass'),
            QuantitativeVariable('education-num'),
            CategoricalVariable('marital-status'),
            CategoricalVariable('occupation'),
            CategoricalVariable('relationship'),
            CategoricalVariable('race'),
            QuantitativeVariable('hours-per-week'),
            CategoricalVariable('native-country')
        ]
    s = s_var.normalize(df[s_var.name])
    y = y_var.normalize(df[y_var.name])

    x = pd.DataFrame(data=None)
    for x_var in x_vars:
        x = pd.concat([x, x_var.normalize(x=df[x_var.name])], axis=1)

    xs = pd.concat([x, s], axis=1)
    ros = RandomOverSampler(random_state=0)
    X_resampled, y_resampled = ros.fit_sample(xs, y)
    x = pd.DataFrame(X_resampled[:, :-1])
    s = pd.Series(X_resampled[:, -1], name=s_var.name)
    y = pd.Series(y_resampled, name=y_var.name)

    offset = {
        'hinge-hinge-tau': [0.13, 3.27],
    }
    return s_var, y_var, x, s, y, offset
Example #23
    def minority_oversample(x, labels, ratio='auto', post_resamp_n=None):
        """
        Over-sample the minority class(es) in "labels" by picking samples at random.
        with replacement.
        :param x: data to resample
        :param labels: values of dependent variable (m - samples,)
        :param binwidth: width of bin by which to group values. Bins will be the discrete "label" by which to randomly over-sample
        :param post_resamp_n: after resampling, unformly subsample - without replacement - this many samples.
        :return:
        resampled x
        """

        binwidth = 2 * (iqr(labels)) / (labels.shape[0]**(1. / 3))

        miny, maxy = np.min(labels), np.max(labels)
        # Center the min/maxes inside their own bins
        yrange = maxy - miny
        nbins = int(np.ceil(yrange / (binwidth + 1.0e-10)))  # np.linspace needs an integer count

        if nbins == 0:
            return x

        # Obtain bin centers, where min/max are within the beginning/ending bins
        binsctrs = np.linspace((miny - binwidth / 2000.),
                               (maxy + binwidth / 2000.),
                               num=nbins)

        # Assign y-values to each bin
        bins = np.digitize(x=labels, bins=binsctrs)

        # Use the index of each sample to over-sample:
        # 1) Pair each sample with an index.
        # 2) Resample the (index, bin) pairs.
        # Represent the resample array as: [y-value, sample-index]
        samp_idc_feat = np.array(list(range(labels.shape[0])))[..., None]

        # Let n_maj be the number of the majority class. ROS generates data
        # such that the count for each class is equal to n_maj.
        ros = RandomOverSampler(ratio)
        resamp_idc, res_bins = ros.fit_sample(X=samp_idc_feat, y=bins)

        if post_resamp_n is not None:
            assert post_resamp_n <= resamp_idc.shape[0]
            resamp_idc, _ = resample(resamp_idc,
                                     res_bins,
                                     n_samples=post_resamp_n,
                                     replace=False)

        # Recover the y-values from each sample
        return x[resamp_idc, ...]
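
A hedged usage sketch for the binning-based over-sampler above (assumes the method is reachable as a plain function and an older imblearn where ratio is the first positional argument of RandomOverSampler):

import numpy as np

rng = np.random.RandomState(0)
x = rng.rand(500, 4)
labels = rng.exponential(scale=1.0, size=500)  # right-skewed continuous target
x_balanced = minority_oversample(x, labels)
print(x.shape, '->', x_balanced.shape)         # more rows, drawn mostly from the sparse bins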
def test_random_over_sampling_return_indices():
    ros = RandomOverSampler(return_indices=True, random_state=RND_SEED)
    X_resampled, y_resampled, sample_indices = ros.fit_sample(X, Y)
    X_gt = np.array([[0.04352327, -0.20515826], [0.92923648, 0.76103773],
                     [0.20792588, 1.49407907], [0.47104475, 0.44386323],
                     [0.22950086, 0.33367433], [0.15490546, 0.3130677],
                     [0.09125309, -0.85409574], [0.12372842, 0.6536186],
                     [0.13347175, 0.12167502], [0.094035, -2.55298982],
                     [0.92923648, 0.76103773], [0.47104475, 0.44386323],
                     [0.92923648, 0.76103773], [0.47104475, 0.44386323]])
    y_gt = np.array([1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0])
    assert_allclose(X_resampled, X_gt)
    assert_array_equal(y_resampled, y_gt)
    assert_array_equal(np.sort(np.unique(sample_indices)), np.arange(len(X)))
Example #25
    def get_data(self, data_files, setting, names):
        """
        Get the Data object
        :param data_files: the pathname of the data files
        :param setting: the Setting object
        :param names: the Names object
        :return: the Data object
        """

        # If one data file
        if len(data_files) == 1:
            data_file = data_files[0]

            # Get X and y
            X, y = self.get_X_y(data_file, names)
        elif len(data_files) == 2:
            training_data_file = data_files[0] if 'train' in data_files[
                0] else data_files[1]
            testing_data_file = data_files[0] if 'test' in data_files[
                0] else data_files[1]

            # Get X_train and y_train
            X_train, y_train = self.get_X_y(training_data_file, names)

            # Get X_test and y_test
            X_test, y_test = self.get_X_y(testing_data_file, names)

            # Combine training and testing data
            X = pd.concat([X_train, X_test])
            y = pd.concat([y_train, y_test])
        else:
            print("Wrong number of data files!")
            exit(1)

        # Encode X and y
        X, y = self.encode_X_y(X, y, setting, names)

        # Update the name of features
        names.features = np.array(X.columns)
        # Transform X from dataframe into numpy array
        X = X.values

        # Oversampling when y is imbalanced
        if len(np.unique(np.unique(y, return_counts=True)[1])) != 1:
            ros = RandomOverSampler(random_state=setting.random_state)
            X, y = ros.fit_sample(X, y)

        data = Data.Data(X, y)

        return data
    def handleImbalanceDataset(self, X, Y):
        """
        Method Name: handleImbalanceDataset
        Description: This method handles the imbalance in the dataset by oversampling.
        Output: The over-sampled features and labels.
        On Failure: Raise Exception
        """

        rdsmple = RandomOverSampler()
        x_sampled, y_sampled = rdsmple.fit_sample(X, Y)

        return x_sampled, y_sampled
Example #27
def random_oversample(X, y, ratio='auto', random_state=None):
    """
    Function to oversample minority class by sampling at random with replacement (by default)
    :param X: Feature data
    :param y: Class labels
    :param ratio: (string/float) The number of samples in the minority class over the number of samples
        in the majority class.
    :param random_state: (int) Seed used by the random number generator.
    :return: Re-sampled features and corresponding class labels, with higher sampling of minority class
    """
    ros = RandomOverSampler(ratio=ratio, random_state=random_state)
    X_res, y_res = ros.fit_sample(X, y)

    return X_res, y_res
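
A hedged sketch of the ratio semantics (older imblearn API, where a float ratio targets n_minority = ratio * n_majority; newer releases call this sampling_strategy):

from collections import Counter
import numpy as np

X_t = np.arange(20).reshape(10, 2)
y_t = np.array([0] * 8 + [1] * 2)
X_r, y_r = random_oversample(X_t, y_t, ratio=0.5, random_state=0)
print(Counter(y_r))  # expected: Counter({0: 8, 1: 4})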
Example #28
    def test_train_split(self,
                         X,
                         y,
                         share_train=0.8,
                         stratify=None,
                         balance=None,
                         X_label=None):
        '''
        Create testing and training splits from the provided data. If balance is not None,
        balances the training data by upsampling or downsampling ('upsample', 'downsample')
        with imbalanced-learn's random samplers.
        Requires the imbalanced-learn library.
        :param X: feature data
        :param y: class labels
        :param share_train: fraction of the data used for training
        :param stratify: passed through to train_test_split
        :param balance: 'upsample', 'downsample', or None
        :param X_label: optional list of feature names
        :return:
        '''

        if X.shape[0] != y.shape[0]:
            logging.warning('X and y are not the same length.')

        # set aside X_test and y_test so that the test data contains no upsampled or downsampled rows
        X_train, self.X_test, y_train, self.y_test = train_test_split(
            X, y, test_size=(1. - share_train), stratify=stratify)

        self.dependent = y.name
        if X_label:
            self.independent = X_label
        else:
            self.independent = list(X.columns.values)
        self.balance = balance

        if balance == 'upsample':
            ros = RandomOverSampler()
            X_resample, y_resample = ros.fit_sample(X_train, y_train)
        elif balance == 'downsample':
            rus = RandomUnderSampler()
            X_resample, y_resample = rus.fit_sample(X_train, y_train)
        else:
            # keep the unbalanced training split so test rows never leak back in
            X_resample = X_train
            y_resample = y_train

        self.X_train, X_test, self.y_train, y_test = train_test_split(
            X_resample,
            y_resample,
            test_size=(1. - share_train),
            stratify=stratify)
Example #30
def Decision():
    print('------------ Decision Tree ------------')
    # Read the data file
    data1 = pd.read_csv('data.csv', encoding='GBK')
    data = data1.loc[data1["年份(年末)"] != 7]
    # Handle the class-imbalance problem
    ros = RandomOverSampler(random_state=0, sampling_strategy=1)
    X_resampled, y_resampled = ros.fit_sample(data.iloc[:, 3:15],
                                              data['是否高转送'])
    # Split into training and test sets
    data_tr, data_te, label_tr, label_te = train_test_split(
        X_resampled, y_resampled)
    # Build the model
    Model = DecisionTreeClassifier(
        max_depth=25,
        random_state=8,
        splitter='random',
        min_samples_split=3,
        min_samples_leaf=1,
    )
    # Train the model
    Model.fit(data_tr, label_tr)

    # Predict
    dt_pre = Model.predict(data_te)
    print('Predictions:', dt_pre)
    print('--------- Predicted vs. actual values ------------')
    print(dt_pre == label_te)  # compare predictions with the true labels
    # Classification report
    dt_reports = classification_report(label_te, dt_pre)
    print('--------- Classification report ------------')
    # print the classification report
    print(dt_reports)

    # Visualize the decision tree
    dot_data = export_graphviz(Model,
                               feature_names=[
                                   '年份(年末)', '交易日平均价', '预增或预减', '超涨或超跌', '次新股',
                                   '每股资本公积(元/股)+每股未分配利润(元/股)', '每股现金流量净额(元/股)',
                                   '实收资本(或股本)', '每股收益(期末摊薄,元/股)', '每股净资产(元/股)',
                                   '营业总收入同必增长(%)', '近两年送转比例', '上市时间'
                               ],
                               class_names='是否高转送')
    # Save the visualization to "dt.dot"
    # When opening the dot file, add "fontname = FangSong" to the node attributes, otherwise the text renders garbled
    f = open('dt.dot', 'w')
    f.write(dot_data)
    f.close()
    graph = graphviz.Source(dot_data)
Example #31
def test_multiclass_fit_sample():
    # Make y to be multiclass
    y = Y.copy()
    y[5] = 2
    y[6] = 2

    # Resample the data
    ros = RandomOverSampler(random_state=RND_SEED)
    X_resampled, y_resampled = ros.fit_sample(X, y)

    # Check the size of y
    count_y_res = Counter(y_resampled)
    assert_equal(count_y_res[0], 5)
    assert_equal(count_y_res[1], 5)
    assert_equal(count_y_res[2], 5)
Example #32
def ros_rs(df):
    train_df, test_df, X_train, y_train, X_test, y_test = preprocess(df)
    ros = RandomOverSampler(random_state=SEED)
    X_train_ros, y_train_ros = ros.fit_sample(X_train, y_train)

    # print(X_train_ros.shape[0] - train_features.shape[0], 'new random picked points')

    X_train_ros = pd.DataFrame(X_train_ros)
    X_train_ros.columns = train_df.keys().tolist()
    y_train_ros = pd.DataFrame(y_train_ros, columns=['depressed'])


    print(y_train_ros.depressed.value_counts())

    return X_train_ros, y_train_ros, X_test, y_test
Example #33
def update_initial_train(iter_sampling, under_sampling, smote, unmodified_train_X, unmodified_train_y, num_subsets):
    if iter_sampling == True:
        print "Oversampling in the active iteration list"
        ros = RandomOverSampler()
        initial_X_train = None
        initial_y_train = None
        initial_X_train, initial_y_train = ros.fit_sample(unmodified_train_X, unmodified_train_y)
    elif under_sampling == True:
        ee = EasyEnsemble(return_indices=True, replacement=True, n_subsets=num_subsets)
        initial_X_train = None
        initial_y_train = None
        initial_X_train, initial_y_train, indices = ee.fit_sample(unmodified_train_X, unmodified_train_y)
    elif smote == True:
        ros = SMOTE(k_neighbors=3)
        initial_X_train = None
        initial_y_train = None
        initial_X_train, initial_y_train = ros.fit_sample(unmodified_train_X, unmodified_train_y)
    else:
        # initial_X_train[:] = []
        # initial_y_train[:] = []
        initial_X_train = copy.deepcopy(unmodified_train_X)
        initial_y_train = copy.deepcopy(unmodified_train_y)

    return initial_X_train, initial_y_train
Example #34
def predict(ver, predict_ver, alike_metrics):
    predictor_rep = PredictorRepository(predict_ver, ver)

    training_m = Metrics_Origin(ver, METRICS_DIR)
    evaluate_m = Metrics_Origin(predict_ver, METRICS_DIR)

    ens_analyzer = AUCAnalyzer(predict_ver, 'ENS', TARGET)

    for i in tqdm(range(ITER)):
        # NML MODEL
        predictor = predictor_rep.get_predictor('ENS', PRED_TYPE)
        if predictor is None:
            print(' predictor has not found, type: ' + PRED_TYPE)
            return
        # sm = RandomOverSampler(ratio='auto', random_state=random.randint(1,100))
        # X_resampled, y_resampled = sm.fit_sample(training_m.product_df, training_m.fault)
        X_resampled = training_m.product_df.as_matrix()
        y_resampled = training_m.fault.as_matrix()
        nml_model = predictor.train_model(X_resampled, y_resampled)
        ev_data, dv_data = evaluate_m.get_not_modified_df()
        nml_value, _ = predictor.predict_ensemble_test_data(
            nml_model, ev_data, dv_data, None)

        # RFN MODEL
        sm = RandomOverSampler(ratio='auto',
                               random_state=random.randint(1, 100))
        X_resampled, y_resampled = sm.fit_sample(training_m.mrg_df,
                                                 training_m.fault)
        rfn_model = predictor.train_model(X_resampled, y_resampled)
        ev_data, dv_data = evaluate_m.get_modified_df()
        mrg_value, _ = predictor.predict_ensemble_test_data(
            rfn_model, ev_data, dv_data, None)
        predictor.set_is_new_df(evaluate_m.isNew)
        predictor.set_is_modified_df(evaluate_m.isModified)
        report_df = predictor.export_report(predict_ver)
        report_df[REPORT_COLUMNS].to_csv('df.csv')
        if report_df is not None:
            ens_analyzer.set_report_df(report_df[REPORT_COLUMNS])
            ens_analyzer.calculate()
            ens_analyzer.analyze_predict_result()

    # export report
    ens_df = ens_analyzer.calculate_average(ITER)
    ens_analyzer.export(target_sw=TARGET, df=ens_df, predictor_type=PRED_TYPE)
    ens_df = ens_analyzer.calculate_num_report_averge(ITER)
    ens_analyzer.export_count_report(target_sw=TARGET,
                                     df=ens_df,
                                     predictor_type=PRED_TYPE)
Example #35
 def balance_dataset(self, X, Y):
     X_new = X
     Y_new = Y
     overSampler = RandomOverSampler()
     underSampler = RandomUnderSampler()
     #sm = EasyEnsemble()
     #X_refit, Y_refit = sm.fit_sample(X, Y)
     #print('Resampled dataset shape {}'.format(Counter(Y_refit[0])))
     #X, Y = X_refit[0], Y_refit[0]
     classCounts = Counter(Y)
     print('Original training dataset shape {}'.format(classCounts))
     avg = 0
     minCount = classCounts[self.classes[0]]
     maxCount = classCounts[self.classes[0]]
     for i in self.classes:
         avg = avg + classCounts[i]
         if classCounts[i] < minCount:
             minCount = classCounts[i]
         if classCounts[i] > maxCount:
             maxCount = classCounts[i]
     avg = avg // len(classCounts)
     print("Rounded-down average class count in training dataset: " +
           str(avg))
     print("minCount: " + str(minCount))
     print("maxCount: " + str(maxCount))
     rate = avg / float(maxCount)
     print("rate: " + str(rate))
     underSampler = RandomUnderSampler(ratio=rate)
     X_new, Y_new = underSampler.fit_sample(X_new, Y_new)
     classCounts = Counter(Y_new)
     print('Class counts after undersampling {}'.format(classCounts))
     #avg = 0
     #minCount = classCounts[0]
     #maxCount = classCounts[0]
     #for i in range(len(classCounts)):
     #    avg = avg + classCounts[i]
     #    if classCounts[i] < minCount:
     #        minCount = classCounts[i]
     #    if classCounts[i] > maxCount:
     #        maxCount = classCounts[i]
     #avg = avg // len(classCounts)
     #rate = minCount / float(avg)
     #print("rate: " + str(rate))
     #overSampler = RandomOverSampler(ratio = rate)
     #print("I am here1")
     X_new, Y_new = overSampler.fit_sample(X_new, Y_new)
     #print("I am here2")
     return X_new, Y_new
def test_ros_fit_sample_half():
    """Test the fit sample routine with a 0.5 ratio"""

    # Resample the data
    ratio = 0.5
    ros = RandomOverSampler(ratio=ratio, random_state=RND_SEED)
    X_resampled, y_resampled = ros.fit_sample(X, Y)

    X_gt = np.array([[0.04352327, -0.20515826], [0.20792588, 1.49407907],
                     [0.22950086, 0.33367433], [0.15490546, 0.3130677],
                     [0.09125309, -0.85409574], [0.12372842, 0.6536186],
                     [0.094035, -2.55298982], [0.92923648, 0.76103773],
                     [0.47104475, 0.44386323], [0.13347175, 0.12167502]])
    y_gt = np.array([1, 1, 1, 1, 1, 1, 1, 0, 0, 0])
    assert_array_equal(X_resampled, X_gt)
    assert_array_equal(y_resampled, y_gt)
def test_multiclass_fit_sample():
    """Test fit sample method with multiclass target"""

    # Make y to be multiclass
    y = Y.copy()
    y[0:1000] = 2

    # Resample the data
    ros = RandomOverSampler(random_state=RND_SEED)
    X_resampled, y_resampled = ros.fit_sample(X, y)

    # Check the size of y
    count_y_res = Counter(y_resampled)
    assert_equal(count_y_res[0], 3600)
    assert_equal(count_y_res[1], 3600)
    assert_equal(count_y_res[2], 3600)
Example #38
def balance_data(X, y):
    # Apply the random over-sampling
    ros = RandomOverSampler(random_state=0)
    X_resampled, y_resampled = ros.fit_sample(X, y)
    return X_resampled, y_resampled
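
A quick, hedged check of balance_data on a toy dataset:

from collections import Counter
from sklearn.datasets import make_classification

X_toy, y_toy = make_classification(n_samples=300, weights=[0.8, 0.2], random_state=0)
print('before:', Counter(y_toy))
X_bal, y_bal = balance_data(X_toy, y_toy)
print('after: ', Counter(y_bal))  # both classes at the majority count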
Example #39
# summarize the number of rows and columns in the dataset after listwise drop
(sample, vnum) = dataset.shape
print(sample, vnum)

# Get the number of variables
vnum = vnum - 1

# split into IVs (X) and the DV (y)
values = dataset.values
X = values[:, 0:vnum]
y = values[:, vnum]

# Oversampling
ros = RandomOverSampler(random_state=0)
X_R, y_R = ros.fit_sample(X, y)

# create model
model = Sequential()
model.add(Dense(12, input_dim=vnum, kernel_initializer='uniform', activation='relu'))
model.add(Dense(8, kernel_initializer='uniform', activation='relu'))
model.add(Dense(1, kernel_initializer='uniform', activation='sigmoid'))
# Compile model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# Fit the model
model.fit(X_R, y_R, epochs=150, batch_size=10, verbose=2)

# calculate predictions
predictions = model.predict(X)
# round predictions
rounded = [round(x[0]) for x in predictions]
Example #40
# Split train_val data into training set and validation set
X_train, X_val, y_train, y_val \
    = train_test_split(X_train_val, y_train_val, test_size=0.25, random_state=42)

# ==========================================================================================

# Over-sampled data

# Generate the new datasets using over-sampling methods
verbose = False
ratio = 'auto'

# 'Random over-sampling'
OS = RandomOverSampler(ratio=ratio, verbose=verbose)
X_train_os, y_train_os = OS.fit_sample(X_train, y_train)

# 'SMOTE'
smote = SMOTE(ratio=ratio, verbose=verbose, kind='regular')
X_train_smo, y_train_smo = smote.fit_sample(X_train, y_train)

# 'SMOTE bordeline 1'
bsmote1 = SMOTE(ratio=ratio, verbose=verbose, kind='borderline1')
X_train_bs1, y_train_bs1 = bsmote1.fit_sample(X_train, y_train)

# 'SMOTE bordeline 2'
bsmote2 = SMOTE(ratio=ratio, verbose=verbose, kind='borderline2')
X_train_bs2, y_train_bs2 = bsmote2.fit_sample(X_train, y_train)

# 'SMOTE SVM'
svm_args={'class_weight': 'auto'}
for feature in list(data.columns):

	# onehot encode the feature
	feature_data = data[[feature]]
	encoded_feature_data = pd.get_dummies(feature_data)

	print('\n')
	print(feature)
	print(feature_data.shape)
	print(encoded_feature_data.shape)
	print(y.shape)

	# upsample minority class
	from imblearn.over_sampling import RandomOverSampler
	ros = RandomOverSampler(ratio=0.5)
	X_resampled, y_resampled = ros.fit_sample(encoded_feature_data, y)

	print('\n')
	print(X_resampled.shape)
	print(y_resampled.shape)

	# create train and test split
	X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, random_state=0, test_size=0.2)

	print('\n')
	print('Training data')
	print(X_train.shape)
	print(y_train.shape)

	print('Testing data')
	print(X_test.shape)
import sys, os, csv
from imblearn.over_sampling import RandomOverSampler
input_csv_file = sys.argv[1]
input_csv = input_csv_file.split(".csv")[0]
with open(input_csv_file, newline="") as input_file:
    reader = csv.reader(input_file, delimiter=',')
    with open(input_csv + "-ro-.csv", 'w', newline='') as output_file:
        writer = csv.writer(output_file, delimiter=',')
        skip_header = True
        X = []
        y = []
        ros = RandomOverSampler()
        for x in reader:
            if skip_header:
                skip_header = False
                continue
            y.append(x[-1])
            X.append(list(map(int, x[:len(x) - 1])))
            #print (X)
        X_res, y_res = ros.fit_sample(X, y)
        print(len(X_res))
        print(len(y_res))
        for idx, s in enumerate(X_res):
            # append the label as a single column; list(y_res[idx]) would split a multi-character label into characters
            writer.writerow(list(s) + [y_res[idx]])
Example #43

# Generate the dataset
X, y = make_classification(n_classes=2, class_sep=2, weights=[0.1, 0.9],
                           n_informative=3, n_redundant=1, flip_y=0,
                           n_features=20, n_clusters_per_class=1,
                           n_samples=5000, random_state=10)

# Instantiate a PCA object for the sake of easy visualisation
pca = PCA(n_components=2)
# Fit and transform x to visualise inside a 2D feature space
X_vis = pca.fit_transform(X)

# Apply the random over-sampling
ros = RandomOverSampler()
X_resampled, y_resampled = ros.fit_sample(X, y)
X_res_vis = pca.transform(X_resampled)

# Two subplots, unpack the axes array immediately
f, (ax1, ax2) = plt.subplots(1, 2)

ax1.scatter(X_vis[y == 0, 0], X_vis[y == 0, 1], label="Class #0", alpha=0.5,
            edgecolor=almost_black, facecolor=palette[0], linewidth=0.15)
ax1.scatter(X_vis[y == 1, 0], X_vis[y == 1, 1], label="Class #1", alpha=0.5,
            edgecolor=almost_black, facecolor=palette[2], linewidth=0.15)
ax1.set_title('Original set')

ax2.scatter(X_res_vis[y_resampled == 0, 0], X_res_vis[y_resampled == 0, 1],
            label="Class #0", alpha=.5, edgecolor=almost_black,
            facecolor=palette[0], linewidth=0.15)
ax2.scatter(X_res_vis[y_resampled == 1, 0], X_res_vis[y_resampled == 1, 1],
            label="Class #1", alpha=0.5, edgecolor=almost_black,
            facecolor=palette[2], linewidth=0.15)
ax2.set_title('Random over-sampling')
Example #44
def runns(resp_var, size_of_test_data, dataset, positive_class, n_estimators, important_features, dealing_with_nulls):
	dataset = pd.read_csv('raw_data.csv', low_memory=False) # For testing purposes
	#----DATA PREPROCESSING
	#-------dealing with NULL values in the data
	#----------remove the rows in which the response is null
	dataset=dataset.dropna(subset=[resp_var])
	#----------dealing with nulls
	dataset=deal_with_nulls(dealing_with_nulls,dataset)
	#----FEATURE SELECTION
	#-------get predictors important in predicting the response
	#-----------transform categorical predictors to dummy variables
	predictors=dataset.drop(resp_var,axis=1,inplace=False)
	predictors=pd.get_dummies(predictors)
	#-----------balance the classes in the response var
	ros = RandomOverSampler(random_state=0)
	resp=dataset[resp_var]
	prds, resp = ros.fit_sample(predictors, resp)
	#-----------fit the random forest classifier to give us the important predictors
	rf_clf = RandomForestClassifier(n_estimators=n_estimators)
	rf_clf.fit(prds,resp)
	#-------get the important predictors
	feature_imp = pd.Series(rf_clf.feature_importances_,
                    index=list(predictors.iloc[:,0:])).sort_values(ascending=False)
	#-------names of the important predictors
	important_predictor_names = feature_imp.index[0:important_features]
	#-------subset the data to get only the important predictors and the response
	resp=pd.DataFrame(data=resp,columns=[resp_var])
	predictors=pd.DataFrame(prds,columns=list(predictors))
	dataset=pd.concat([resp,predictors],axis=1)
	#---------------------------------------------------------
	#----MODEL TRAINING
	#--------Remove the response variables from the features variables - axis 1 refers to the columns
	m_data= dataset.drop(resp_var, axis = 1,inplace=False) 
	# Response variables are the values we want to predict
	resp_var = np.array(dataset[resp_var])

	dataset = pd.get_dummies(m_data)
    
	# Saving feature names for later use
	feature_list = list(m_data.columns)
	# Convert to numpy array
	dataset = np.array(dataset)

	# Split the data into training and testing sets
	train_features, test_features, train_labels, test_labels = train_test_split(dataset, resp_var, test_size = size_of_test_data, random_state = 402)

	# Instantiate the model (an RBF-kernel SVM)
	clf = SVC(kernel='rbf', probability=True)

	# Train the model on training data
	clf.fit(train_features, train_labels)
    # evaluation
	predicted = clf.predict(test_features)
	pred_prob = clf.predict_proba(test_features)
    
	accuracy = accuracy_score(test_labels, predicted)
	#confusion matrix
	cnf = (confusion_matrix(test_labels,predicted))
	#precision score
	precision = precision_score(test_labels,predicted,pos_label=positive_class)
	#avg pres
	avg_precision = average_precision_score(test_labels,pred_prob[:,[1]])
	#recall score
	rec = recall_score(test_labels,predicted,pos_label=positive_class)
	#f1 score
	fscore = f1_score(test_labels,predicted,pos_label=positive_class)
	#fbeta score
	fbeta = fbeta_score(test_labels,predicted,beta=0.5)
	#hamming_loss
	hamming = hamming_loss(test_labels,predicted)
	#jaccard similarity score
	jaccard = jaccard_similarity_score(test_labels,predicted)
	#logloss
	logloss = log_loss(test_labels,predicted)
	#zero-oneloss
	zero_one = zero_one_loss(test_labels,predicted)
	#auc roc 
	area_under_roc = roc_auc_score(test_labels,pred_prob[:,[1]])
	#cohen_score
	cohen = cohen_kappa_score(test_labels,predicted)
	#mathews corr
	mathews = matthews_corrcoef(test_labels,predicted)
	# Variable importances from the important features selection stage
	variable_importance_list = list(zip(prds, feature_imp))
	output={"accuracy":accuracy,"precision":precision,"average precision":avg_precision,"recall":rec,"fscore":fscore,"fbeta":fbeta,"hamming":hamming,"jaccard":jaccard,"logloss":logloss,"zero_one":zero_one,"area_under_roc":area_under_roc,"cohen":cohen,"mathews":mathews}
	output=json.dumps(output)
	return jsonify({"Predictions": output})