def split(self, x_data, y_data):
    Xt, Yt, Xv, Yv = super(SMOTESplitter, self).split(x_data, y_data)
    # Oversample the training split with SMOTE, then rebalance it with
    # random under-sampling; the validation split is left untouched.
    Xt_smote, Yt_smote = SMOTE(**self._smote_params).fit_transform(
        Xt.values, Yt.values)  # .values replaces the deprecated .as_matrix()
    Xt_smote, Yt_smote = UnderSampler(
        ratio=self._under_sample).fit_transform(Xt_smote, Yt_smote)
    return Xt_smote, Yt_smote, Xv, Yv
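This targets the old UnbalancedDataset API. On current imbalanced-learn (0.4+), fit_transform became fit_resample and random under-sampling moved to RandomUnderSampler; below is a minimal sketch of the same method on the new API, assuming the surrounding SMOTESplitter class and that _smote_params holds new-style keyword names.

from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler

def split(self, x_data, y_data):
    # Sketch only: assumes SMOTESplitter's base class from the snippet above.
    Xt, Yt, Xv, Yv = super(SMOTESplitter, self).split(x_data, y_data)
    Xt_res, Yt_res = SMOTE(**self._smote_params).fit_resample(Xt.values, Yt.values)
    Xt_res, Yt_res = RandomUnderSampler(
        sampling_strategy=self._under_sample).fit_resample(Xt_res, Yt_res)
    return Xt_res, Yt_res, Xv, Yv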
Example #2
def sampling():
    verbose = False
    y = np.bincount(target_train1)
    print(y)
    # Class-count ratio drives how many synthetic samples to draw.
    ratio = float(y[2]) / float(y[1])
    # 'Random over-sampling'
    OS = OverSampler(ratio=ratio, verbose=verbose)
    osx, osy = OS.fit_transform(data_train1, target_train1)
    random_methods(osx, osy)
    # 'SMOTE'
    smote = SMOTE(ratio=ratio, verbose=verbose, kind='regular')
    smox, smoy = smote.fit_transform(data_train1, target_train1)
    random_methods(smox, smoy)
    # 'SMOTE borderline 1'
    bsmote1 = SMOTE(ratio=ratio, verbose=verbose, kind='borderline1')
    bs1x, bs1y = bsmote1.fit_transform(data_train1, target_train1)
    random_methods(bs1x, bs1y)
    # 'SMOTE borderline 2'
    bsmote2 = SMOTE(ratio=ratio, verbose=verbose, kind='borderline2')
    bs2x, bs2y = bsmote2.fit_transform(data_train1, target_train1)
    random_methods(bs2x, bs2y)
    # 'SMOTE SVM'
    svm_args = {'class_weight': 'auto'}
    svmsmote = SMOTE(ratio=ratio, verbose=verbose, kind='svm', **svm_args)
    svsx, svsy = svmsmote.fit_transform(data_train1, target_train1)
    random_methods(svsx, svsy)
    # 'SMOTE Tomek links'
    STK = SMOTETomek(ratio=ratio, verbose=verbose)
    stkx, stky = STK.fit_transform(data_train1, target_train1)
    random_methods(stkx, stky)
    # 'SMOTE ENN'
    SENN = SMOTEENN(ratio=ratio, verbose=verbose)
    ennx, enny = SENN.fit_transform(data_train1, target_train1)
    random_methods(ennx, enny)
    # 'EasyEnsemble'
    EE = EasyEnsemble(verbose=verbose)
    eex, eey = EE.fit_transform(data_train1, target_train1)
    random_methods(eex, eey)
    # 'BalanceCascade'
    BC = BalanceCascade(verbose=verbose)
    bcx, bcy = BC.fit_transform(data_train1, target_train1)
    random_methods(bcx, bcy)
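In later imbalanced-learn releases the kind='borderline1'/'borderline2'/'svm' switches became dedicated classes and fit_transform became fit_resample. A hedged sketch of the same sweep on the current API, reusing data_train1, target_train1, and the random_methods helper from the function above:

from imblearn.over_sampling import (RandomOverSampler, SMOTE,
                                    BorderlineSMOTE, SVMSMOTE)
from imblearn.combine import SMOTETomek, SMOTEENN

# Sketch only: current class names. EasyEnsemble/BalanceCascade are now
# classifiers (imblearn.ensemble) rather than resamplers.
samplers = [
    RandomOverSampler(),
    SMOTE(),
    BorderlineSMOTE(kind='borderline-1'),
    BorderlineSMOTE(kind='borderline-2'),
    SVMSMOTE(),
    SMOTETomek(),
    SMOTEENN(),
]
for sampler in samplers:
    rx, ry = sampler.fit_resample(data_train1, target_train1)
    random_methods(rx, ry)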
Example #3
    def resample(self, X, y, t, fold):
        if not self.resample_method:
            return X, y
        else:
            start = time.time()
            if self.verbose:
                ptf('> Resampling for timestep %d, fold %d' % (t, fold),
                    self.logfile)

            # create the resampler
            if self.resample_method == 'under':
                print('Under-sampling is not implemented yet')
                return X, y
            elif self.resample_method == 'over':
                if self.oversample_method.lower() == 'smote':
                    resampler = SMOTE(**self.oversample_arguments)
                else:
                    print('Your resampling method is not implemented yet')
                    return X, y
            else:
                # Unknown method: fall back to the unresampled data rather
                # than hitting a NameError on `resampler` below.
                print('Unknown resample_method: %r' % self.resample_method)
                return X, y

            # debug output
            print(type(X), type(y))
            print(X.shape, y[0].shape)
            Xsmote, ysmote = resampler.fit_transform(X, y[0])
            # resample
            ysmote_tuple = self.build_smoted_label_tuple(ysmote, y, fold)
            # ysmote_df = self.build_smoted_label_df(ysmote, y, fold)
            # # find new folds
            # folds, ynewdf = self.find_new_folds(Xsmote, ysmote, y)

            if self.debug:
                print(np.sum(y[0] == 0), np.sum(ysmote == 0))
                print(np.sum(y[0] == 1), np.sum(ysmote == 1))

            if self.on_disk:
                self.pickle_time_step(ysmote_tuple,
                                      'trigger_resample_labels',
                                      fold=fold,
                                      t=t)
                self.pickle_time_step(Xsmote,
                                      'trigger_resample_features',
                                      fold=fold,
                                      t=t)
            else:
                self.trigger_resample_labels[fold][t] = ysmote_tuple
                self.trigger_resample_features[fold][t] = Xsmote

            end = time.time()
            if self.verbose:
                ptf('... %d s' % (end - start), self.logfile)
            return Xsmote, ysmote_tuple
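The 'under' branch above only prints a warning. A minimal standalone sketch of what it could do, using the current imbalanced-learn RandomUnderSampler (the undersample helper name is hypothetical, not part of the class above):

from imblearn.under_sampling import RandomUnderSampler

def undersample(X, y, ratio=1.0):
    # Randomly drop majority-class samples until the minority/majority
    # ratio reaches the requested value.
    sampler = RandomUnderSampler(sampling_strategy=ratio, random_state=0)
    return sampler.fit_resample(X, y)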
Example #4
def apply_sampling(X_data, Y_data, sampling, n_states, maxlen):
    ratio = float(np.count_nonzero(Y_data == 1)) / \
        float(np.count_nonzero(Y_data == 0))
    X_data = np.reshape(X_data, (len(X_data), n_states * maxlen))
    # 'Random over-sampling'
    if sampling == 'OverSampler':
        sampler = OverSampler(ratio=ratio, verbose=True)
    # 'Random under-sampling'
    elif sampling == 'UnderSampler':
        sampler = UnderSampler(verbose=True)
    # 'Tomek links under-sampling'
    elif sampling == 'TomekLinks':
        sampler = TomekLinks(verbose=True)
    # Over-sampling
    elif sampling == 'SMOTE':
        sampler = SMOTE(ratio=1, verbose=True, kind='regular')
    # Over-sampling followed by under-sampling
    elif sampling == 'SMOTETomek':
        sampler = SMOTETomek(ratio=ratio, verbose=True)
    # Under-sampling
    elif sampling == 'OneSidedSelection':
        sampler = OneSidedSelection(verbose=True)
    # Under-sampling
    elif sampling == 'CondensedNearestNeighbour':
        sampler = CondensedNearestNeighbour(verbose=True)
    # Under-sampling
    elif sampling == 'NearMiss':
        sampler = NearMiss(version=1, verbose=True)
    # Under-sampling
    elif sampling == 'NeighbourhoodCleaningRule':
        sampler = NeighbourhoodCleaningRule(verbose=True)
    # Unknown sampler: abort
    else:
        print('Unknown sampling method %r; exiting...' % sampling)
        sys.exit()
    # print('shape ' + str(X.shape))
    X_data, Y_data = sampler.fit_transform(X_data, Y_data)
    return X_data, Y_data
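The elif chain is a name-to-sampler dispatch, which can also be table-driven. A sketch assuming the same old UnbalancedDataset imports used by apply_sampling (make_sampler is a hypothetical helper):

SAMPLERS = {
    'OverSampler': lambda r: OverSampler(ratio=r, verbose=True),
    'UnderSampler': lambda r: UnderSampler(verbose=True),
    'TomekLinks': lambda r: TomekLinks(verbose=True),
    'SMOTE': lambda r: SMOTE(ratio=1, verbose=True, kind='regular'),
    'SMOTETomek': lambda r: SMOTETomek(ratio=r, verbose=True),
    'OneSidedSelection': lambda r: OneSidedSelection(verbose=True),
    'CondensedNearestNeighbour': lambda r: CondensedNearestNeighbour(verbose=True),
    'NearMiss': lambda r: NearMiss(version=1, verbose=True),
    'NeighbourhoodCleaningRule': lambda r: NeighbourhoodCleaningRule(verbose=True),
}

def make_sampler(name, ratio):
    # Raise instead of sys.exit() so callers can recover from a bad name.
    if name not in SAMPLERS:
        raise ValueError('Unknown sampling method: %r' % name)
    return SAMPLERS[name](ratio)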
Example #5
def _sample_values(X, y, method=None, ratio=1, verbose=False):
    """Perform any kind of sampling(over and under).

    Parameters
    ----------
    X : array, shape = [n_samples, n_features]
        Data.
    y : array, shape = [n_samples]
        Target.
    method : str, optional (default: None)
        Over- or under-sampling method.
    ratio : float
        Unbalanced class ratio.

    Returns
    -------
    X, y : tuple
        Sampled X and y.
    """
    if method == 'SMOTE':
        sampler = SMOTE(ratio=ratio, verbose=verbose)

    elif method == 'SMOTEENN':
        ratio = ratio * 0.3
        sampler = SMOTEENN(ratio=ratio, verbose=verbose)

    elif method == 'random_over_sample':
        sampler = OverSampler(ratio=ratio, verbose=verbose)

    elif method == 'random_under_sample':
        sampler = UnderSampler(verbose=verbose)

    elif method == 'TomekLinks':
        sampler = TomekLinks(verbose=verbose)

    else:
        # Guard against an undefined `sampler` for unknown method names.
        raise ValueError('Unknown sampling method: %r' % method)

    return sampler.fit_transform(X, y)
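For current imbalanced-learn, a hedged sketch of the same dispatcher with today's class names (_sample_values_modern is hypothetical; ratio maps onto sampling_strategy and fit_transform onto fit_resample):

from imblearn.over_sampling import SMOTE, RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler, TomekLinks
from imblearn.combine import SMOTEENN

def _sample_values_modern(X, y, method=None, ratio='auto'):
    # Same mapping as _sample_values, on the fit_resample API.
    samplers = {
        'SMOTE': SMOTE(sampling_strategy=ratio),
        'SMOTEENN': SMOTEENN(sampling_strategy=ratio),
        'random_over_sample': RandomOverSampler(sampling_strategy=ratio),
        'random_under_sample': RandomUnderSampler(),
        'TomekLinks': TomekLinks(),
    }
    if method not in samplers:
        raise ValueError('Unknown sampling method: %r' % method)
    return samplers[method].fit_resample(X, y)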
Example #6

y = pd.read_csv(train_path,
                header=None,
                index_col=False,
                names=colnames,
                skiprows=[0],
                usecols=[8])
y = y['violation'].values
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.333, random_state=0)
main_x = X.values
main_y = y

verbose = False
ratio = float(np.count_nonzero(y == 1)) / float(np.count_nonzero(y == 0))

# 'SMOTE'
smote = SMOTE(ratio=ratio, verbose=verbose, kind='regular')
x, y = smote.fit_transform(main_x, main_y)

ratio = float(np.count_nonzero(y == 1)) / float(np.count_nonzero(y == 0))
X_train, X_test, y_train, y_test = train_test_split(x,
                                                    y,
                                                    test_size=.333,
                                                    random_state=0)

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score  # sklearn.cross_validation was removed in 0.20

clf = RandomForestClassifier(n_estimators=10)
scores = cross_val_score(clf, X_test, y_test)

y_pred = clf.fit(X_train, y_train).predict(X_test)
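Note that this example runs SMOTE before train_test_split, so synthetic points interpolated from test-set neighbours leak into the held-out data and inflate the scores. A hedged sketch that confines SMOTE to the training folds with an imbalanced-learn pipeline, reusing main_x and main_y from above:

from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

# SMOTE is refit inside every CV fold, on the training portion only.
pipe = Pipeline([('smote', SMOTE(random_state=0)),
                 ('rf', RandomForestClassifier(n_estimators=10))])
scores = cross_val_score(pipe, main_x, main_y, cv=5, scoring='roc_auc')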
Example #7

    n_source, n_target, n_samples = None, None, None
    targets, m = None, None  # accumulators for the per-file loop below

    for f in args.read_train:
        npzfile = np.load(f)
        print("Read %d instances from %s"
              % (npzfile['feature_matrix'].shape[0], f.name))
        assert npzfile['targets'].size == npzfile['feature_matrix'].shape[0]

        tgt, fm = npzfile['targets'], npzfile['feature_matrix']
        print "target size: ", tgt.shape
        print "positive examples: ", sum(sum(tgt))
        tgt = tgt.reshape(tgt.size)
        if args.smote:
            ratio = float(np.count_nonzero(tgt == 0)) / \
                float(np.count_nonzero(tgt == 1))
            OS = SMOTE(ratio=ratio, kind='regular')
            fm, tgt = OS.fit_transform(fm, tgt)

        if targets is None:
            targets = tgt
            m = fm
        else:
            print "Before concat: ", targets.shape, tgt.shape
            targets = np.concatenate((targets, tgt), axis=0)
            m = np.concatenate((m, fm), axis=0)
            print "After concat: ", targets.shape, tgt.shape

    assert targets.size == m.shape[0]
    assert m.shape[0] == targets.shape[0]

    print "Sum of targets: ", sum(targets)
Example #8

label_index = 11
folds = os.listdir(location)
delimiter = '\t'
folds[:] = [location + i for i in folds]
verbose = True

## LOAD RAW DATA ##
raw_data = []
feat_content = []
labels = []
for fold_nr in range(len(folds)):
    with open(folds[fold_nr], 'r') as fold_file:
        raw_data.append([line.split(delimiter) for line in fold_file])
    # Build the per-fold feature and label columns once, after reading,
    # instead of rebuilding them for every input line.
    feat_content.append([row[content_index] for row in raw_data[fold_nr]])
    labels.append([0 if row[label_index] == 'f' else 1
                   for row in raw_data[fold_nr]])

## EVALUATE ##
resampler = SMOTE(verbose=verbose)
pip = Pipeline(feat_content, labels, resampler, verbose)
f1_complete = pip.validation()
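The fold loader above can be written more compactly with pandas; a sketch assuming the same tab-separated layout and the content_index column index defined outside this excerpt:

import pandas as pd

feat_content, labels = [], []
for path in folds:
    df = pd.read_csv(path, sep=delimiter, header=None)
    feat_content.append(df[content_index].tolist())
    # 'f' marks the negative class; everything else is positive.
    labels.append([0 if lab == 'f' else 1 for lab in df[label_index]])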
Example #9
y = pd.read_csv(train_path,
                header=None,
                index_col=False,
                names=colnames,
                skiprows=[0],
                usecols=[8])
y = y['violation'].values
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.333, random_state=0)
main_x = X.values
main_y = y

verbose = False
ratio = float(np.count_nonzero(y == 1)) / float(np.count_nonzero(y == 0))

# 'SMOTE borderline 2'
bsmote2 = SMOTE(ratio=ratio, verbose=verbose, kind='borderline2')
x, y = bsmote2.fit_transform(main_x, main_y)

ratio = float(np.count_nonzero(y == 1)) / float(np.count_nonzero(y == 0))
X_train, X_test, y_train, y_test = train_test_split(x,
                                                    y,
                                                    test_size=.333,
                                                    random_state=0)

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score  # sklearn.cross_validation was removed in 0.20

clf = RandomForestClassifier(n_estimators=10)
scores = cross_val_score(clf, X_test, y_test)

y_pred = clf.fit(X_train, y_train).predict(X_test)
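On current imbalanced-learn the borderline variants have their own class; a minimal equivalent of the call above (note the hyphen in the kind string):

from imblearn.over_sampling import BorderlineSMOTE

x, y = BorderlineSMOTE(kind='borderline-2',
                       random_state=0).fit_resample(main_x, main_y)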

Example #10

# Remap label 7 to 2 and shift labels above 7 down by one.
def f6(x):
    if x == 7:
        return 2
    elif x > 7:
        return x - 1
    else:
        return x


#y = map(f6, y)

y = numpy.array(y)

from sklearn.model_selection import cross_val_predict  # replaces sklearn.cross_validation

sm = SMOTE(kind='regular')
# Note: each pass re-runs SMOTE on the already-oversampled output.
for _ in range(20):
    x_metrics, y = sm.fit_transform(x_metrics, y)

clf = RandomForestClassifier(n_estimators=100, class_weight='balanced')  # 'auto' was renamed 'balanced'
pr = cross_val_predict(clf, x_metrics, y, cv=10)
#print metrics.accuracy_score(y, pr)
#print metrics.confusion_matrix(y, pr)

delete_rows_indexes = [i for i, y_i in enumerate(pr) if y_i == 2 and y[i] == 7]
x_metrics = numpy.delete(x_metrics, delete_rows_indexes, axis=0)
y = numpy.delete(y, delete_rows_indexes, axis=0)
#clf.fit(x_metrics,y)
#joblib.dump(clf, 'rand_forest_model_3.pkl')

pr = cross_val_predict(clf, x_metrics, y, cv=10)
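Since each of the 20 passes above feeds SMOTE its own output, the synthetic share compounds. On the current API a single call with an explicit target usually expresses the intent; a sketch reusing x_metrics and y:

from imblearn.over_sampling import SMOTE

# One pass: oversample every minority class up to the majority count.
sm = SMOTE(sampling_strategy='auto', random_state=0)
x_metrics_res, y_res = sm.fit_resample(x_metrics, y)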
Example #11
X = pd.read_csv(train_path,
                header=None,
                index_col=False,
                names=colnames,
                skiprows=[0],
                usecols=[3, 4, 5, 6, 7])
y = pd.read_csv(train_path,
                header=None,
                index_col=False,
                names=colnames,
                skiprows=[0],
                usecols=[8])
y = y['violation'].values
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.333, random_state=0)
main_x = X.values
main_y = y

verbose = False
ratio = float(np.count_nonzero(y == 1)) / float(np.count_nonzero(y == 0))
# 'SMOTE borderline 1'
bsmote1 = SMOTE(ratio=ratio, verbose=verbose, kind='borderline1')
x, y = bsmote1.fit_transform(main_x, main_y)

ratio = float(np.count_nonzero(y == 1)) / float(np.count_nonzero(y == 0))
X_train, X_test, y_train, y_test = train_test_split(x,
                                                    y,
                                                    test_size=.333,
                                                    random_state=0)

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score  # sklearn.cross_validation was removed in 0.20

clf = RandomForestClassifier(n_estimators=10)
scores = cross_val_score(clf, X_test, y_test)

y_pred = clf.fit(X_train, y_train).predict(X_test)
Example #12
orig_X, orig_y = read_data()
skf = StratifiedKFold(orig_y, n_folds=4, shuffle=True)

while True:
  scores = []
  for train_index, test_index in skf:
    X, X_cv = orig_X[train_index], orig_X[test_index]
    y, y_cv = orig_y[train_index], orig_y[test_index]
    
    sampled_X, sampled_y = X, y
    if P['is_smote']:
      # Oversample data from the minority class.
      sampled_X, sampled_y = SMOTE(k=P['k'], m=P['m'], ratio=P['ratio'],
                                   verbose=False,
                                   kind='regular').fit_transform(sampled_X, sampled_y)
      # Undersample the majority class; the ratio is the fraction of majority
      # samples to draw with respect to the minority class.
      sampled_X, sampled_y = UnderSampler(ratio=1.0).fit_transform(sampled_X,
                                                                   sampled_y)
    
    # Fit a scaler only for the sampled data.
    scaler = Scaler(sampled_X, sampled_y)
    sampled_X = scaler.getOriginalTransformedData()
    #model = RandomForestClassifier(n_estimators=100).fit(sampled_X, sampled_y)
    #model = RandomForestClassifier(n_estimators=P['n_estimators'], criterion=P['criterion'], max_depth=P['max_depth'], min_samples_split=P['min_samples_split'], min_samples_leaf=P['min_samples_leaf'], min_weight_fraction_leaf=P['min_weight_fraction_leaf'], max_features=P['max_features'], max_leaf_nodes=P['max_leaf_nodes'], bootstrap=P['bootstrap'], oob_score=P['oob_score'], n_jobs=8, random_state=None, verbose=0, warm_start=False, class_weight=None).fit(sampled_X, sampled_y)
    model = MLPClassifier(activation=P['activation'], algorithm=P['algorithm'], alpha=P['alpha'], hidden_layer_sizes=P['layer'], learning_rate=P['learning_rate'], tol=P['tol'], random_state=1).fit(sampled_X, sampled_y)
    #model = xgb.XGBClassifier(max_depth=P['max_depth'], n_estimators=P['n_estimators'], learning_rate=P['learning_rate'], nthread=8, subsample=P['subsample'], colsample_bylevel=P['colsample_bylevel']).fit(sampled_X, sampled_y, eval_metric=P['eval_metric'])
    prediction_cv = model.predict_proba(scaler.transform(X_cv))
    auc_score = roc_auc_score(y_cv, prediction_cv[:, 1])
    scores.append(auc_score)
    log("***roc_auc_score:%f" % auc_score)