def ReSampling(self, data, labels, over_s=True):
        # Requires: from collections import Counter; import numpy as np;
        # from imblearn import over_sampling, under_sampling
        label_status = Counter(labels)
        print(self.tasktype, "data " + self.tasktype, label_status)

        featurelen = len(data[0])
        if 1 not in label_status.keys():
            x, y = np.zeros(shape=featurelen, dtype=int), 1
        elif 0 not in label_status.keys():
            x, y = np.zeros(shape=featurelen, dtype=int), 0
        else:
            x, y = None, None
        if x is not None:
            data = np.insert(data, 0, x, 0)
            labels = np.insert(labels, 0, y, 0)

        if len(label_status) < 2:
            print(self.tasktype, "no need to resample")
            return data, labels
        if 0.2 < label_status[1] / label_status[0] < 5.:
            print("data are not biased too much")
            return data, labels

        maxSamples = label_status[0]
        if label_status[1] > label_status[0]:
            maxSamples = label_status[1]
            resampling = over_sampling.ADASYN(ratio={
                1: maxSamples,
                0: int(0.4 * maxSamples)
            })
        else:
            resampling = over_sampling.ADASYN(ratio={
                0: maxSamples,
                1: int(0.4 * maxSamples)
            })

        try:
            data, labels = resampling.fit_sample(data, labels)
        except Exception:
            print(self.tasktype, "resampling using random method")
            if over_s:
                resampling = over_sampling.RandomOverSampler()
            else:
                resampling = under_sampling.RandomUnderSampler()

            data, labels = resampling.fit_sample(data, labels)

        label_status = Counter(labels)
        print(self.tasktype, "sampling status=", label_status)

        return data, labels
Example 2 (File: core.py, Project: thodk/BATMAN)
def execute_adasyn(df, label, minority_class):
    def worker(array, x):
        return [0 if i < 10 else round(i, 2) for i in array[x]]

    X_df = df.loc[:, df.columns != label]
    features = X_df.columns
    y_df = df[[label]]

    X_mat = X_df.to_numpy()
    y_mat = y_df.to_numpy().ravel()

    adasyn_obj = over_sampling.ADASYN(n_neighbors=30)
    X_mat_new, y_mat_new = adasyn_obj.fit_resample(X_mat, y_mat)

    new_examples_count = X_mat_new.shape[0] - X_mat.shape[0]
    if minority_class == "positive_class":
        new_rownames = ["pos_" + str(i) for i in range(new_examples_count)]
    else:
        new_rownames = ["neg_" + str(i) for i in range(new_examples_count)]
    X_mat_new_examples = X_mat_new[X_mat.shape[0]:]
    X_mat_new_examples = numpy.array(
        [worker(X_mat_new_examples, x) for x in range(new_examples_count)])

    X_df_new = pandas.DataFrame(X_mat_new_examples,
                                index=new_rownames,
                                columns=features)
    y_df_new = pandas.DataFrame(y_mat_new[X_mat.shape[0]:],
                                index=new_rownames,
                                columns=['class'])
    new_examples_df = pandas.concat([y_df_new, X_df_new], axis=1)

    df = pandas.concat([df, new_examples_df], axis=0)
    return df
Example 3
def _oversample(X, y, method='SMOTE', strat='not majority'):
    # compute the minimum number of samples in any class
    min_samples = min(y.tolist().count(l) for l in set(y))
    if min_samples <= 5:
        method = 'RNDM'

    if method == 'ADASYN':
        ios = imbover.ADASYN(sampling_strategy=strat, random_state=42)
    elif method == 'SMOTE':
        ios = imbover.SMOTE(sampling_strategy=strat, random_state=42)
    elif method == 'SMOTENC':
        # NOTE: SMOTENC also requires a `categorical_features` argument;
        # as written this branch raises a TypeError when selected.
        ios = imbover.SMOTENC(sampling_strategy=strat, random_state=42)
    elif method == 'BORDERSMOTE':
        ios = imbover.BorderlineSMOTE(sampling_strategy=strat, random_state=42)
    elif method == 'SVMSMOTE':
        ios = imbover.SVMSMOTE(sampling_strategy=strat, random_state=42)
    elif method == 'KMEANSSMOTE':
        ios = imbover.KMeansSMOTE(sampling_strategy=strat, random_state=42)
    elif method == 'RNDM':
        ios = imbover.RandomOverSampler(sampling_strategy=strat,
                                        random_state=42)
    else:
        raise ValueError("unknown oversampling method: {}".format(method))

    X_resampled, y_resampled = ios.fit_resample(X, y)
    return X_resampled, y_resampled
Example 4
def imbalance_set(X, y, operation):
    methods = {'smoteen': imb.SMOTEENN(),
               'smotetom': imb.SMOTETomek(),
               'adasyn': imbov.ADASYN(),
               'randomunder': imbun.RandomUnderSampler(),
               'condensed': imbun.CondensedNearestNeighbour(n_jobs=-1)}

    sm = methods[str(operation)]

    X_resampl, y_resampl = sm.fit_sample(X, y)

    return X_resampl, y_resampl
Example 5
    def __init__(self,
                 inputs,
                 targets,
                 batch_size=100,
                 max_num_batches=-1,
                 shuffle_order=True,
                 rng=None,
                 oversample=None):
        """Create a new recognition data provider object.

        Args:
            inputs (ndarray): Array of data input features of shape
                (num_data, input_dim).
            targets (ndarray): Array of data output targets of shape
                (num_data, output_dim) or (num_data,) if output_dim == 1.
            batch_size (int): Number of data points to include in each batch.
            max_num_batches (int): Maximum number of batches to iterate over
                in an epoch. If `max_num_batches * batch_size > num_data` then
                only as many batches as the data can be split into will be
                used. If set to -1 all of the data will be used.
            shuffle_order (bool): Whether to randomly permute the order of
                the data before each epoch.
            rng (RandomState): A seeded random number generator.
            oversample (str): Optional oversampling method to apply:
                'smote', 'smote-cat', 'smote-svm', 'smote-borderline-1',
                'smote-borderline-2' or 'adasyn'.
        """
        if oversample is not None:
            oversample = oversample.lower()
            self.initialize_seed(rng)

            if oversample == "smote":
                oversampler = imbl.SMOTE(random_state=self.rng)
            elif oversample == "smote-cat":
                # Needs a way to specify the categorical attributes, e.g.
                # imbl.SMOTENC(random_state=self.rng, categorical_features=range(4200, 4348))
                raise NotImplementedError
            elif oversample == "smote-svm":
                oversampler = imbl.SVMSMOTE(random_state=self.rng)
            elif oversample == "smote-borderline-1":
                oversampler = imbl.BorderlineSMOTE(random_state=self.rng,
                                                   kind='borderline-1')
            elif oversample == "smote-borderline-2":
                oversampler = imbl.BorderlineSMOTE(random_state=self.rng,
                                                   kind='borderline-2')
            elif oversample == "adasyn":
                oversampler = imbl.ADASYN(random_state=self.rng)
            else:
                raise ValueError(
                    "Unrecognized oversampling method: {0}".format(oversample))

            inputs, targets = oversampler.fit_resample(inputs, targets)

        self.num_classes = 3
        inputs = inputs.astype(np.float32)

        # pass the loaded data to the parent class __init__
        super(RecognitionDataProvider,
              self).__init__(inputs, targets, batch_size, max_num_batches,
                             shuffle_order, rng)
Example 6
def split_data(Xdata, Ydata, oversample, K_neighbors=4):
    if not oversample:
        X_train, X_test, y_train, y_test = train_test_split(
            Xdata, Ydata, train_size=0.70, random_state=RANDOM_STATE)
    else:
        print('Data was oversampled using the ADASYN method')
        adasyn = over_sampling.ADASYN(random_state=RANDOM_STATE,
                                      n_neighbors=K_neighbors)
        # split first, then oversample only the training set
        X_train, X_test, y_train, y_test = train_test_split(
            Xdata, Ydata, train_size=0.70, random_state=RANDOM_STATE)
        X_train, y_train = adasyn.fit_sample(X_train, y_train)

        # alternative: oversample everything, then split
        #X_over, y_over = adasyn.fit_sample(Xdata, Ydata)
        #X_train, X_test, y_train, y_test = train_test_split(X_over, y_over, train_size=0.70, random_state=RANDOM_STATE, stratify=y_over)

    return X_train, X_test, y_train, y_test
Example 7
def init(bsize):
    data, label = load("GermanCredit.npz")
    # Convert to spherical coordinates
    # norm=np.sqrt(np.sum(data**2,axis=1,keepdims=True))
    # ag=data/norm
    # data=np.concatenate([data,norm,ag],axis=1)
    # Use sin and cos features
    # data=np.concatenate([np.sin(data),np.cos(data)],axis=1)

    # Under-sample to balance the classes
    # cr = under_sampling.NearMiss(version=3)
    # data,label=cr.fit_sample(data,label)
    # Over-sample to balance the classes
    ocr = over_sampling.ADASYN()
    data, label = ocr.fit_sample(data, label)
    # Shuffle the samples
    idx = list(range(len(data)))
    random.shuffle(idx)
    data, label = data[idx], label[idx]
    # One-hot encode the labels (class values are 1/2, hence the -1 offset)
    olabel = np.zeros(shape=(len(label), 2))
    for i, l in enumerate(label):
        olabel[i][int(l - 1)] = 1

    # Type conversion
    data = data.astype("float32")
    olabel = olabel.astype("float32")
    #
    train_sum = int(len(data) / 1.3)
    tdata, tlabel = data[:train_sum], olabel[:train_sum]
    test_data, test_label = data[train_sum:], olabel[train_sum:]
    train_set = mxdata.ArrayDataset(nd.array(tdata), nd.array(tlabel))
    test_set = mxdata.ArrayDataset(nd.array(test_data), nd.array(test_label))

    # data loaders
    train_loader = mxdata.DataLoader(train_set, batch_size=bsize)
    test_loader = mxdata.DataLoader(test_set, batch_size=bsize)

    return train_loader, test_loader
gender = df0.gender
nd1 = preprocessing.scale(df1.values)

logger.info(f"Data loaded")

jn = pushbulletNotifier.JobNotification(devices="phone")

processes = 25
try:
    X_train, X_test, y_train, y_test = model_selection.train_test_split(nd1, gender.values,
                                                                        test_size=0.2, stratify=gender.values)

    logger.info(f"Split data in to training set and validation set.")
    classifier = ['logisticregression', linear_model.LogisticRegression(max_iter=250)]
    sampler_lst = [['smote', over_sampling.SMOTE()],
                   ['adasyn', over_sampling.ADASYN()],
                   ['random-oversampler', over_sampling.RandomOverSampler()]]
    pipeline_lst = [ [f'{sampler[0]}-{classifier[0]}', make_pipeline(sampler[1], classifier[1])]
                      for sampler in sampler_lst ]  # noqa
    param_grid = {
        'logisticregression__C': 2.0**np.linspace(-8, 5, 15)
        }  # noqa
    for name, pipe in pipeline_lst:
        jn.send(message=f"Starding cross validation with resampling method {name}")
        logger.info(f"Starting cross validation")
        est = model_selection.GridSearchCV(pipe, param_grid, scoring='roc_auc', cv=5, verbose=49, refit=True,
                                           n_jobs=processes, pre_dispatch=processes, return_train_score=True)
        est.fit(X_train, y_train)
        _, yhat = est.predict_proba(X_test).T
        try:
            logger.info(f"Cross validation done, best score was {est.best_score_}")
Example 9
    for i in tqdm(range(100), desc="Preprocessing", leave=False):
        # Apply over-sampling
        sm_reg = over_sampling.SMOTE(kind='regular',
                                     random_state=RANDOM_STATE,
                                     k_neighbors=5)
        sm_b1 = over_sampling.SMOTE(kind='borderline1',
                                    random_state=RANDOM_STATE,
                                    k_neighbors=5)
        sm_b2 = over_sampling.SMOTE(kind='borderline2',
                                    random_state=RANDOM_STATE,
                                    k_neighbors=5)
        sm_enn = combine.SMOTEENN(random_state=RANDOM_STATE,
                                  smote=over_sampling.SMOTE(k_neighbors=5))
        sm_tomek = combine.SMOTETomek(random_state=RANDOM_STATE,
                                      smote=over_sampling.SMOTE(k_neighbors=5))
        ada = over_sampling.ADASYN(random_state=RANDOM_STATE, n_neighbors=5)

        X_reg, y_reg = sm_reg.fit_sample(X_train, y_train)
        X_b1, y_b1 = sm_b1.fit_sample(X_train, y_train)
        X_b2, y_b2 = sm_b2.fit_sample(X_train, y_train)
        X_enn, y_enn = sm_enn.fit_sample(X_train, y_train)
        X_tomek, y_tomek = sm_tomek.fit_sample(X_train, y_train)
        X_ada, y_ada = ada.fit_sample(X_train, y_train)
        os_list = [[X_train, y_train], [X_reg, y_reg], [X_b1, y_b1],
                   [X_b2, y_b2], [X_enn, y_enn], [X_tomek, y_tomek],
                   [X_ada, y_ada], [X_mndo, y_mndo]]

        # scaling
        os_list, X_test_scaled = preprocessing.normalization(os_list, X_test)
        #os_list, X_test_scaled = preprocessing.standardization(os_list, X_test)
Example 10
from student import egitimGirdi, egitimCikti, valGirdi, valCikti
# (Turkish identifiers: egitimGirdi/egitimCikti = training inputs/targets,
#  valGirdi/valCikti = validation inputs/targets)

print(egitimGirdi.shape)

#### SYNTHETIC DATA GENERATION
ros = over_sampling.RandomOverSampler()
rosEgitimGirdi, rosEgitimCikti = ros.fit_sample(egitimGirdi, egitimCikti)

print(rosEgitimGirdi.shape)

smote = over_sampling.SMOTE()
smoteEgitimGirdi, smoteEgitimCikti = smote.fit_sample(egitimGirdi, egitimCikti)

print(smoteEgitimGirdi.shape)

ada = over_sampling.ADASYN(ratio='minority')
adasynEgitimGirdi, adasynEgitimCikti = ada.fit_sample(egitimGirdi, egitimCikti)

print(adasynEgitimGirdi.shape)
#print(adasynEgitimGirdi.shape)

# analyze the resampled data with the models below
models = []
models.append(("LR", LogisticRegression()))
models.append(("LDA", LinearDiscriminantAnalysis()))
models.append(("KNN", KNeighborsClassifier()))
models.append(("DCT", DecisionTreeClassifier()))
models.append(("GNB", GaussianNB()))
models.append(("SVC", SVC()))
models.append(("GPC", GaussianProcessClassifier(1.0 * RBF(1.0))))
models.append(("MLP", MLPClassifier()))
def oversampling_adasyn(features, labels):
    adasyn = over_sampling.ADASYN(random_state=0)
    return adasyn.fit_resample(X=features, y=labels)
Example 12
    # Preprocessing
    #-----------------
    # Multivariate over-sampling
    mndo_df = mndo(pos, num_minority, file_name)

    X_mndo, y_mndo = append_mndo(X_train, y_train, mndo_df)
    #print('y_mndo: {}'.format(Counter(y_mndo)))

    for i in tqdm(range(100), desc="Preprocessing", leave=False):
        # Apply over-sampling
        sm_reg = over_sampling.SMOTE(kind='regular', random_state=RANDOM_STATE)
        sm_b1 = over_sampling.SMOTE(kind='borderline1', random_state=RANDOM_STATE)
        sm_b2 = over_sampling.SMOTE(kind='borderline2', random_state=RANDOM_STATE)
        sm_enn = combine.SMOTEENN(random_state=RANDOM_STATE)
        sm_tomek = combine.SMOTETomek(random_state=RANDOM_STATE)
        ada = over_sampling.ADASYN(random_state=RANDOM_STATE)
        
        X_reg, y_reg = sm_reg.fit_sample(X_train, y_train)
        X_b1, y_b1 = sm_b1.fit_sample(X_train, y_train)
        X_b2, y_b2 = sm_b2.fit_sample(X_train, y_train)
        X_enn, y_enn = sm_enn.fit_sample(X_train, y_train)
        X_tomek, y_tomek = sm_tomek.fit_sample(X_train, y_train)
        X_ada, y_ada = ada.fit_sample(X_train, y_train)
        os_list = [[X_reg, y_reg], [X_b1, y_b1], [X_b2, y_b2], [X_enn, y_enn],
                   [X_tomek, y_tomek], [X_ada, y_ada], [X_mndo, y_mndo]]
       
        # scaling 
        os_list, X_test_scaled = preprocessing.normalization(os_list, X_test)
        #os_list, X_test_scaled = preprocessing.standardization(os_list, X_test)

    #-------------
Example 13
for i in train_nan.index:
    fill_col, id_ = impute(i, train, train_nan)
    train_nan.loc[i, fill_col] = train.loc[id_, fill_col]
#test
for i in test_nan.index:
    fill_col, id_ = impute(i, train, test_nan)
    test_nan.loc[i, fill_col] = train.loc[id_, fill_col]

train = pd.concat([train, train_nan], axis=0)
del train_nan
#test
test = pd.concat([test, test_nan], axis=0)
del test_nan

y = train['renewal']
x = train.drop('renewal', axis=1)

ros = over_sampling.ADASYN()
rus = under_sampling.NearMiss()
rcs = combine.SMOTEENN()
rcs2 = combine.SMOTETomek()

log = BaggingClassifier(LogisticRegressionCV(Cs=6))
rf = BaggingClassifier(RandomForestClassifier())
gbc = BaggingClassifier(
    GradientBoostingClassifier(n_estimators=250, learning_rate=0.01))
sv = SVC(C=0.8, probability=True)
for sample, sample_name in zip([rcs2, ros, rus, rcs],
                               ['rcs2', 'ros', 'rus', 'rcs']):
    print(sample_name)
    x_rs, y_rs = sample.fit_sample(x, y)
    for model, model_name in zip([log, rf, gbc], ['log', 'rf', 'gbc']):
        model.fit(x_rs, y_rs)
Example 14
        metod, name, train_result, test_result)


print(train_x.shape)

ROS = over_sampling.RandomOverSampler()
ROS_x, ROS_y = ROS.fit_sample(train_x, train_y)

print(ROS_x.shape)

smote = over_sampling.SMOTE()
smote_x, smote_y = smote.fit_sample(train_x, train_y)

print(smote_x.shape)

adasyn = over_sampling.ADASYN()
adasyn_x, adasyn_y = adasyn.fit_sample(train_x, train_y)

print(adasyn_x.shape)

models = []
models.append(("LR", LogisticRegression()))
models.append(("LDA", LinearDiscriminantAnalysis()))
models.append(("KNN", KNeighborsClassifier()))
models.append(("DCT", DecisionTreeClassifier()))
models.append(("GNB", GaussianNB()))
models.append(("SVC", SVC()))
models.append(("GPC", GaussianProcessClassifier(1.0 * RBF(1.0))))
models.append(("MLP", MLPClassifier()))
models.append(("ADB", AdaBoostClassifier()))
Example 15
def resample_classes(X,
                     Y,
                     how='und1',
                     random_state=None,
                     test_size=0.3,
                     n_jobs=2,
                     split=True,
                     verbose=True):
    """

    """
    if how == 'und1':
        if verbose:
            msg = 'Under-sampling the majority class(es) by randomly picking '
            msg += 'samples without replacement'
            print(msg)
        samp = imbus.RandomUnderSampler(random_state=random_state,
                                        replacement=False)
        X_res, y_res = samp.fit_sample(X, Y)
    elif how == 'und2':
        if verbose:
            msg = 'Under-sampling by generating centroids based on clustering '
            msg += 'methods'
            print(msg)
        samp = imbus.ClusterCentroids(ratio='auto',
                                      random_state=random_state,
                                      estimator=None,
                                      n_jobs=n_jobs)
        X_res, y_res = samp.fit_sample(X, Y)
    elif how == 'und3':
        if verbose:
            print('Under-sampling based on NearMiss methods')
        samp = imbus.NearMiss(ratio='auto',
                              return_indices=False,
                              random_state=random_state,
                              version=1,
                              size_ngh=None,
                              n_neighbors=3,
                              ver3_samp_ngh=None,
                              n_neighbors_ver3=3,
                              n_jobs=n_jobs)
        X_res, y_res = samp.fit_sample(X, Y)
    elif how == 'over1':
        if verbose:
            msg = 'Over-sampling the minority class(es) by picking samples at '
            msg += 'random with replacement'
            print(msg)
        samp = imbov.RandomOverSampler(random_state=random_state)
        X_res, y_res = samp.fit_sample(X, Y)
    elif how == 'over2':
        if verbose:
            msg = 'Over-sampling using SMOTE - Synthetic Minority Over-sampling '
            msg += 'Technique'
            print(msg)
        X_res, y_res = X, Y
        for i in range(3):
            samp = imbov.SMOTE(random_state=random_state,
                               ratio=.99,
                               k=None,
                               k_neighbors=5,
                               m=None,
                               m_neighbors=10,
                               out_step=0.5,
                               kind='regular',
                               svm_estimator=None,
                               n_jobs=n_jobs)
            X_res, y_res = samp.fit_sample(X_res, y_res)
    elif how == 'over3':
        if verbose:
            msg = 'Over-sampling using ADASYN - Adaptive Synthetic Sampling '
            msg += 'Approach for Imbalanced Learning'
            print(msg)
        X_res, y_res = X, Y
        for i in range(3):
            samp = imbov.ADASYN(ratio=.93,
                                random_state=random_state,
                                k=None,
                                n_neighbors=5,
                                n_jobs=n_jobs)
            X_res, y_res = samp.fit_sample(X_res, y_res)
    elif how == 'comb1':
        if verbose:
            print('Combine over- and under-sampling using SMOTE and Tomek links.')
        X_res, y_res = X, Y
        for i in range(3):
            samp = imbcom.SMOTETomek(ratio=.99,
                                     random_state=random_state,
                                     smote=None,
                                     tomek=None,
                                     k=None,
                                     m=None,
                                     out_step=None,
                                     kind_smote=None,
                                     n_jobs=n_jobs)
            X_res, y_res = samp.fit_sample(X_res, y_res)
    else:
        print('Sampling approach not recognized')
        return

    if verbose:
        print('\t\t\t1\t2\t3\t4')
        val_y = pd.Series(Y).value_counts(sort=False).values
        msg = 'Counts in y_init:\t{}\t{}\t{}\t{} '
        print(msg.format(val_y[0], val_y[1], val_y[2], val_y[3]))
        val_yres = pd.Series(y_res).value_counts(sort=False).values
        msg = 'Counts in y_resamp:\t{}\t{}\t{}\t{} '
        print(msg.format(val_yres[0], val_yres[1], val_yres[2], val_yres[3]))

    if split:
        X_train, X_test, y_train, y_test = train_test_split(
            X_res, y_res, test_size=test_size, random_state=random_state)
        if verbose:
            val_ytr = pd.Series(y_train).value_counts(sort=False).values
            msg = 'Counts in y_train:\t{}\t{}\t{}\t{} '
            print(msg.format(val_ytr[0], val_ytr[1], val_ytr[2], val_ytr[3]))

            val_yte = pd.Series(y_test).value_counts(sort=False).values
            msg = 'Counts in y_test:\t{}\t{}\t{}\t{} '
            print(msg.format(val_yte[0], val_yte[1], val_yte[2], val_yte[3]))

            print('X_train:', X_train.shape, ', X_test:', X_test.shape)

        return X_train, X_test, y_train, y_test
    else:
        return X_res, y_res