Python SMOTE.fit_transform Examples, unbalanced_dataset.SMOTE.fit_transform Python Examples

Example #1

0

Show file

File: trees.py Project: mentekid/PatternRecognition

def averageTrainTest():
    """Average the first two results of ``trainandtest`` over 500 runs.

    The training CSVs are loaded, the classes are balanced with SMOTE
    (``ratio=3``, ``kind='regular'``), the project's ``interquantileRange``
    outlier step is run on the features, and ``trainandtest`` is then called
    500 times on the resulting frames.

    Returns
    -------
    tuple
        ``(mean of result[0], mean of result[1])`` over the 500 runs.
    """
    from unbalanced_dataset import SMOTE
    from myOutlierDetection import interquantileRange

    datasetFile = 'data/source-code-metrics_train.csv'
    labelsFile = 'data/bugs_train.csv'
    # The files are semicolon-separated. ``sep`` is passed by keyword because
    # newer pandas releases no longer accept it positionally.
    data = pd.read_csv(datasetFile, sep=';')
    labels = pd.read_csv(labelsFile, sep=';')
    data.set_index('classid', inplace=True)
    labels.set_index('classid', inplace=True)

    # Section: SMOTE for class balance
    columns = list(data)
    smote = SMOTE(ratio=3, verbose=False, kind='regular')
    # ``.values`` replaces ``DataFrame.as_matrix()``, which pandas removed.
    smox, smoy = smote.fit_transform(data.values, labels.values.ravel())
    data = pd.DataFrame(smox, columns=columns)
    labels = pd.DataFrame(smoy, columns=['bugs'])

    # Section: outlier detection
    # NOTE(review): the return value is ignored, so this presumably filters
    # ``data`` in place -- confirm against myOutlierDetection.
    interquantileRange(data, perFeature=False)

    runs = [trainandtest(data, labels) for _ in range(500)]
    first_mean = sum(run[0] for run in runs) / len(runs)
    second_mean = sum(run[1] for run in runs) / len(runs)
    return (first_mean, second_mean)

Example #2

0

Show file

File: trees.py Project: mentekid/PatternRecognition

def runCrossValidation(runSMOTE=True, runIQR=True):
    """Load the training CSVs, optionally balance/clean them, and cross-validate.

    Parameters
    ----------
    runSMOTE : bool
        When true, over-sample with SMOTE (``ratio=3``, ``kind='regular'``)
        before cross-validation.
    runIQR : bool
        When true, run the project's ``interquantileRange`` outlier step on
        the feature frame.

    Returns
    -------
    Whatever ``crossvalidate`` returns for the feature matrix and the
    flattened label vector.
    """
    datasetFile = 'data/source-code-metrics_train.csv'
    labelsFile = 'data/bugs_train.csv'
    # The files are semicolon-separated. ``sep`` is passed by keyword because
    # newer pandas releases no longer accept it positionally.
    data = pd.read_csv(datasetFile, sep=';')
    labels = pd.read_csv(labelsFile, sep=';')
    data.set_index('classid', inplace=True)
    labels.set_index('classid', inplace=True)

    if runSMOTE:
        # Section: SMOTE for class balance
        from unbalanced_dataset import SMOTE

        columns = list(data)
        smote = SMOTE(ratio=3, verbose=False, kind='regular')
        # ``.values`` replaces ``DataFrame.as_matrix()``, which pandas removed.
        smox, smoy = smote.fit_transform(data.values, labels.values.ravel())
        data = pd.DataFrame(smox, columns=columns)
        labels = pd.DataFrame(smoy, columns=['bugs'])

    if runIQR:
        # Section: outlier detection
        # NOTE(review): the return value is ignored, so this presumably
        # filters ``data`` in place -- confirm against myOutlierDetection.
        from myOutlierDetection import interquantileRange
        interquantileRange(data, perFeature=False)

    return crossvalidate(data.values, labels.values.ravel())

Example #3

0

Show file

File: test.py Project: emcmahon013/UnbalancedDataset

def test_smote(x, y):
    """Smoke-test every SMOTE variant on ``(x, y)``.

    The name of each variant is printed before it is fitted; the resampled
    output is discarded. ``verbose`` is read from module scope.
    """
    svm_args = {'class_weight': 'auto'}
    variants = (
        ('SMOTE', 'regular', {}),
        ('SMOTE bordeline 1', 'borderline1', {}),
        ('SMOTE bordeline 2', 'borderline2', {}),
        ('SMOTE SVM', 'svm', svm_args),
    )
    for title, kind, extra_args in variants:
        print(title)
        sampler = SMOTE(kind=kind, verbose=verbose, **extra_args)
        resampled_x, resampled_y = sampler.fit_transform(x, y)

Example #4

0

Show file

File: smote.py Project: campbelljc/598p4

def sampling():
    """Run ``random_methods`` on the training set after each resampling method.

    The resampling ratio is the count of label ``2`` divided by the count of
    label ``1`` in ``target_train1``. Each sampler is fitted on
    ``data_train1`` / ``target_train1`` and its output is passed to
    ``random_methods``. The class counts are printed first.
    """
    verbose = False
    y = np.bincount(target_train1)
    print(y)
    ratio = float(y[2]) / float(y[1])

    def resample_and_run(sampler):
        # Resample the training set with ``sampler`` and evaluate the result.
        new_x, new_y = sampler.fit_transform(data_train1, target_train1)
        random_methods(new_x, new_y)

    # Random over-sampling
    resample_and_run(OverSampler(ratio=ratio, verbose=verbose))
    # SMOTE
    resample_and_run(SMOTE(ratio=ratio, verbose=verbose, kind='regular'))
    # SMOTE borderline 1. This step used to resample ``data_train`` /
    # ``target_train``, unlike every other step and unlike the data that
    # ``ratio`` is computed from; it now uses the same training set.
    resample_and_run(SMOTE(ratio=ratio, verbose=verbose, kind='borderline1'))
    # SMOTE borderline 2
    resample_and_run(SMOTE(ratio=ratio, verbose=verbose, kind='borderline2'))
    # SMOTE SVM
    svm_args = {'class_weight': 'auto'}
    resample_and_run(SMOTE(ratio=ratio, verbose=verbose, kind='svm', **svm_args))
    # SMOTE Tomek links
    resample_and_run(SMOTETomek(ratio=ratio, verbose=verbose))
    # SMOTE ENN
    resample_and_run(SMOTEENN(ratio=ratio, verbose=verbose))
    # EasyEnsemble
    resample_and_run(EasyEnsemble(verbose=verbose))
    # BalanceCascade
    resample_and_run(BalanceCascade(verbose=verbose))

Example #5

0

Show file

    def resample(self, X, y, t, fold):
        """Over-sample the training data for one timestep/fold with SMOTE.

        Parameters
        ----------
        X : features handed to the resampler; its ``shape`` is printed.
        y : indexable whose first element ``y[0]`` is the label vector given
            to the resampler.
        t : timestep number, used for logging and as the storage key.
        fold : fold number, used for logging and as the storage key.

        Returns
        -------
        tuple
            ``(X, y)`` unchanged when resampling is disabled or the requested
            method is not implemented; otherwise ``(Xsmote, ysmote_tuple)``,
            where ``ysmote_tuple`` is built by ``build_smoted_label_tuple``.
        """
        if not self.resample_method:
            return X, y

        start = time.time()
        if self.verbose:
            ptf('> Resampling for timestep %d, fold %d' % (t, fold),
                self.logfile)

        # create resampler
        if self.resample_method == 'under':
            print('UNDER SAMPLING is not implemented yet')
            return X, y
        if (self.resample_method != 'over'
                or self.oversample_method.lower() != 'smote'):
            # An unrecognised ``resample_method`` used to fall through and
            # crash on an unbound ``resampler``; it is now handled like the
            # other unimplemented cases and the data is returned untouched.
            print('Your resampling method is not implemented yet')
            return X, y
        resampler = SMOTE(**self.oversample_arguments)

        print('%s %s' % (type(X), type(y)))
        print('%s %s' % (X.shape, y[0].shape))
        Xsmote, ysmote = resampler.fit_transform(X, y[0])
        # resample
        ysmote_tuple = self.build_smoted_label_tuple(ysmote, y, fold)

        if self.debug:
            # label counts before / after resampling
            print('%s %s' % (np.sum(y[0] == 0), np.sum(ysmote == 0)))
            print('%s %s' % (np.sum(y[0] == 1), np.sum(ysmote == 1)))

        if self.on_disk:
            self.pickle_time_step(ysmote_tuple,
                                  'trigger_resample_labels',
                                  fold=fold,
                                  t=t)
            self.pickle_time_step(Xsmote,
                                  'trigger_resample_features',
                                  fold=fold,
                                  t=t)
        else:
            self.trigger_resample_labels[fold][t] = ysmote_tuple
            self.trigger_resample_features[fold][t] = Xsmote

        end = time.time()
        if self.verbose:
            ptf('... %d s' % (end - start), self.logfile)
        return Xsmote, ysmote_tuple

Example #6

0

Show file

File: GeneralCrossValidation.py Project: mentekid/Metaclassifier

def CrossValidateSMOTE(data, labels, clf, folds=10, runSMOTE=True):
    """Stratified k-fold cross-validation with optional per-fold SMOTE.

    Only the training part of each fold is over-sampled; the test part is
    left untouched.

    Parameters
    ----------
    data : numpy array, or an object exposing ``.values`` (e.g. a DataFrame)
        Feature matrix.
    labels : numpy array, or an object exposing ``.values``
        Labels. They are summed to count the positive class, so they are
        presumably 0/1 with 1 as the minority class -- confirm with callers.
    clf : estimator with ``fit`` and ``predict``
    folds : int
        Number of stratified folds.
    runSMOTE : bool
        Over-sample each training fold with borderline-1 SMOTE.

    Returns
    -------
    tuple
        ``(acc, fmeasure, recall, precision, cm)``: four per-fold score lists
        and the 2x2 confusion matrix summed over all folds.
    """
    from unbalanced_dataset import SMOTE
    from sklearn.metrics import confusion_matrix as confmat
    from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score

    if not isinstance(data, np.ndarray):
        data = data.values

    if not isinstance(labels, np.ndarray):
        labels = labels.values.ravel()

    skf = StratifiedKFold(labels, n_folds=folds, shuffle=False)
    acc = []
    fmeasure = []
    recall = []
    precision = []
    cm = np.zeros((2, 2), dtype=int)

    for train_idx, test_idx in skf:
        data_train = data[train_idx]
        labels_train = labels[train_idx]
        data_test = data[test_idx]
        labels_test = labels[test_idx]

        if runSMOTE:
            # The ratio is only needed (and only well defined) when SMOTE
            # actually runs.
            bugs = sum(labels_train)
            ratio = float(len(labels_train) - bugs) / bugs
            smote = SMOTE(ratio=ratio, verbose=False, kind='borderline1')
            data_train, labels_train = smote.fit_transform(data_train, labels_train)

        clf.fit(data_train, labels_train)
        hypot = clf.predict(data_test)

        # scikit-learn metrics take (y_true, y_pred). Passing them reversed
        # swaps precision and recall.
        acc.append(accuracy_score(labels_test, hypot))
        fmeasure.append(f1_score(labels_test, hypot))
        recall.append(recall_score(labels_test, hypot))
        precision.append(precision_score(labels_test, hypot))

        cm += confmat(labels_test, hypot)

    return acc, fmeasure, recall, precision, cm

Example #7

0

Show file

File: triggeredseriesmodel.py Project: fcesco/smelling_sepsis

    def resample(self, X, y, t, fold):
        """Over-sample the training data for one timestep/fold with SMOTE.

        Parameters
        ----------
        X : features handed to the resampler; its ``shape`` is printed.
        y : indexable whose first element ``y[0]`` is the label vector given
            to the resampler.
        t : timestep number, used for logging and as the storage key.
        fold : fold number, used for logging and as the storage key.

        Returns
        -------
        tuple
            ``(X, y)`` unchanged when resampling is disabled or the requested
            method is not implemented; otherwise ``(Xsmote, ysmote_tuple)``,
            where ``ysmote_tuple`` is built by ``build_smoted_label_tuple``.
        """
        if not self.resample_method:
            return X, y

        start = time.time()
        if self.verbose:
            ptf('> Resampling for timestep %d, fold %d' % (t, fold), self.logfile)

        # create resampler
        if self.resample_method == 'under':
            print('UNDER SAMPLING is not implemented yet')
            return X, y
        if (self.resample_method != 'over'
                or self.oversample_method.lower() != 'smote'):
            # An unrecognised ``resample_method`` used to fall through and
            # crash on an unbound ``resampler``; it is now handled like the
            # other unimplemented cases and the data is returned untouched.
            print('Your resampling method is not implemented yet')
            return X, y
        resampler = SMOTE(**self.oversample_arguments)

        print('%s %s' % (type(X), type(y)))
        print('%s %s' % (X.shape, y[0].shape))
        Xsmote, ysmote = resampler.fit_transform(X, y[0])
        # resample
        ysmote_tuple = self.build_smoted_label_tuple(ysmote, y, fold)

        if self.debug:
            # label counts before / after resampling
            print('%s %s' % (np.sum(y[0] == 0), np.sum(ysmote == 0)))
            print('%s %s' % (np.sum(y[0] == 1), np.sum(ysmote == 1)))

        if self.on_disk:
            self.pickle_time_step(ysmote_tuple, 'trigger_resample_labels', fold=fold, t=t)
            self.pickle_time_step(Xsmote, 'trigger_resample_features', fold=fold, t=t)
        else:
            self.trigger_resample_labels[fold][t] = ysmote_tuple
            self.trigger_resample_features[fold][t] = Xsmote

        end = time.time()
        if self.verbose:
            ptf('... %d s' % (end - start), self.logfile)
        return Xsmote, ysmote_tuple

Example #8

0

Show file

File: ml.py Project: chqiwang/box-office

def train_with_kmeans(X,Y,W):
    """Split the samples into two target-value groups and fit one model per group.

    The targets ``Y`` are clustered into two groups with KMeans. A classifier
    is trained to predict the group from ``X`` (on SMOTE-balanced data), and a
    separate linear regressor is fitted on the original samples of each group.

    X, Y and W are indexed with boolean masks, so they are presumably numpy
    arrays of equal length (features, targets, per-sample weights) -- confirm
    with callers.

    Returns ``(s, regressors)``: the fitted AdaBoost group classifier and a
    list with one fitted LinearRegression per group, in group-label order.
    """
    k = 2
    c = KMeans(k)
    # First pass: cluster the target values (as a single-column matrix).
    y = c.fit_predict(np.reshape(Y,[len(Y),1]))
    
    # ``label`` ends up as the cluster with the smaller mean target.
    label,slabel = 0,1
    m0,m1 = np.mean(Y[y == 0]),np.mean(Y[y == 1])
    if m0 > m1:
        label = 1
    idx = np.where(y == label)[0]
    # Second pass: re-cluster only the low-mean cluster (this refits ``c``).
    Ys = Y[y == label]
    ys = c.fit_predict(np.reshape(Ys,[len(Ys),1]))
    # ``slabel`` ends up as the sub-cluster with the larger mean target.
    m0,m1 = np.mean(Ys[ys == 0]),np.mean(Ys[ys == 1])
    if m0 > m1:
        slabel = 0
    # Move that higher-valued sub-cluster over to the other (high-mean) group.
    y[idx[ys==slabel]] = abs(1-label)
    
    # r = majority count / minority count.
    # NOTE(review): divides by zero if one group ends up empty.
    z,o = float(sum(y == 0)),float(sum(y == 1))
    if z > o:
        r = z/o
    else:
        r = o/z
    # Over-sample with half of that imbalance ratio before training the
    # group classifier.
    smote = SMOTE(ratio=r/2, kind='regular')
    XS_train, yy_train = smote.fit_transform(X,y)
    
    s = AdaBoostClassifier(n_estimators=300)
    s.fit(XS_train,yy_train)
    #y_test_pred = s.predict(X_test)
    
    # The per-group regressors use the original samples, not the SMOTE output.
    X_trains = [X[y == i] for i in range(k)]
    Y_trains = [Y[y == i] for i in range(k)]
    #X_tests = [X_test[y_test_pred == i] for i in range(k)]
    #Y_tests = [Y_test[y_test_pred == i] for i in range(k)]
    W_trains = [W[y == i] for i in range(k)]
    
    regressors = []
    for i in range(k):
        regressor = LinearRegression()
        # W is the third positional argument of fit -- presumably
        # scikit-learn's ``sample_weight``; confirm.
        regressor.fit(X_trains[i],Y_trains[i],W_trains[i])
        regressors.append(regressor)
    return s,regressors

Example #9

0

Show file

def _sample_values(X, y, method=None, ratio=1, verbose=False):
    """Perform any kind of sampling(over and under).

    Parameters
    ----------
    X : array, shape = [n_samples, n_features]
        Data.
    y : array, shape = [n_samples]
        Target.
    method : str, optional default: None
        Over or under smapling method.
    ratio: float
        Unbalanced class ratio.

    Returns
    -------
    X, y : tuple
        Sampled X and y.
    """
    if method == 'SMOTE':
        sampler = SMOTE(ratio=ratio, verbose=verbose)

    elif method == 'SMOTEENN':
        ratio = ratio * 0.3
        sampler = SMOTEENN(ratio=ratio, verbose=verbose)

    elif method == 'random_over_sample':
        sampler = OverSampler(ratio=ratio, verbose=verbose)

    elif method == 'random_under_sample':
        sampler = UnderSampler(verbose=verbose)

    elif method == 'TomekLinks':
        sampler = TomekLinks(verbose=verbose)

    return sampler.fit_transform(X, y)

Example #10

0

Show file

File: multiclassification.py Project: HamedMP/kaggle-airbnb

def _sample_values(X, y, method=None, ratio=1, verbose=False):
    """Perform any kind of sampling(over and under).

    Parameters
    ----------
    X : array, shape = [n_samples, n_features]
        Data.
    y : array, shape = [n_samples]
        Target.
    method : str, optional default: None
        Over or under smapling method.
    ratio: float
        Unbalanced class ratio.

    Returns
    -------
    X, y : tuple
        Sampled X and y.
    """
    if method == 'SMOTE':
        sampler = SMOTE(ratio=ratio, verbose=verbose)

    elif method == 'SMOTEENN':
        ratio = ratio * 0.3
        sampler = SMOTEENN(ratio=ratio, verbose=verbose)

    elif method == 'random_over_sample':
        sampler = OverSampler(ratio=ratio, verbose=verbose)

    elif method == 'random_under_sample':
        sampler = UnderSampler(verbose=verbose)

    elif method == 'TomekLinks':
        sampler = TomekLinks(verbose=verbose)

    return sampler.fit_transform(X, y)

Example #11

0

Show file

File: train_classifier.py Project: paracrawl/DataCollection

    for f in args.read_train:
        npzfile = np.load(f)
        print "Read %d instances from %s" \
            % (npzfile['feature_matrix'].shape[0], f.name)
        assert npzfile['targets'].size == npzfile['feature_matrix'].shape[0]

        tgt, fm = npzfile['targets'], npzfile['feature_matrix']
        print "target size: ", tgt.shape
        print "positive examples: ", sum(sum(tgt))
        tgt = tgt.reshape(tgt.size)
        if args.smote:
            ratio = float(np.count_nonzero(tgt == 0)) / \
                float(np.count_nonzero(tgt == 1))
            OS = SMOTE(ratio=ratio, kind='regular')
            fm, tgt = OS.fit_transform(fm, tgt)

        if targets is None:
            targets = tgt
            m = fm
        else:
            print "Before concat: ", targets.shape, tgt.shape
            targets = np.concatenate((targets, tgt), axis=0)
            m = np.concatenate((m, fm), axis=0)
            print "After concat: ", targets.shape, tgt.shape

    assert targets.size == m.shape[0]
    assert m.shape[0] == targets.shape[0]

    print "Sum of targets: ", sum(targets)
    print "Instances x features: ", m.shape

Example #12

0

Show file

File: tree.py Project: flyxu/scikit-learn

from sklearn import tree
from sklearn.datasets import load_svmlight_file
from sklearn import cross_validation
from sklearn.metrics import confusion_matrix
from sklearn import metrics
import numpy as np
from unbalanced_dataset import SMOTE
# Load the raw data in libsvm format
data,label=load_svmlight_file("/home/hadoop/input/libsvm.data")
# Hold out 30% of the original data set as the test set
x_train,x_test,y_train,y_test=cross_validation.train_test_split(data,label,test_size=0.3,random_state=0)
# Apply the SMOTE algorithm to the training data only; the ratio is the
# number of label-0 samples divided by the number of label-1 samples
verbose = False
ratio = float(np.count_nonzero(y_train==0)) / float(np.count_nonzero(y_train==1))
smote = SMOTE(ratio=ratio, verbose=verbose, kind='regular')
# SMOTE is given a dense copy of the training matrix
smox, smoy = smote.fit_transform(x_train.toarray(), y_train)
# Class counts after resampling
print np.count_nonzero(smoy==1)
print np.count_nonzero(smoy==0)
# Train a decision-tree model on the resampled data
clf=tree.DecisionTreeClassifier()
clf=clf.fit(smox,smoy)
#score=clf.score(x_test,y_test)
# Predict on the test data
y_pred=clf.predict(x_test)
print y_pred
# Model evaluation
confusion=confusion_matrix(y_test,y_pred)
print confusion
accruacy = metrics.accuracy_score(y_test, y_pred)
precision = metrics.precision_score(y_test, y_pred)
recall = metrics.recall_score(y_test, y_pred)

Example #13

0

Show file

File: train_model.py Project: niggor/technoKot

x = numpy.delete(x, [3 + i * 11 for i in xrange(6)], axis=1)
#print x[0]
#x = normalize(x, axis=0)
#x = scale(x, axis=0)
y = df[:, -1]
y = map(lambda x: int(x), y)
def f6(x):
    """Remap a class label: 6 becomes 2, 7 becomes 6, anything else is kept."""
    if x == 6:
        return 2
    if x == 7:
        return 6
    return x


y = map(f6, y)
y = numpy.array(y)

sm = SMOTE(kind='regular')
for i in xrange(10):
    x, y = sm.fit_transform(x, y)


clf = RandomForestClassifier(n_estimators=100, class_weight='auto')
pr = cross_validation.cross_val_predict(clf, x, y, cv=10)
#clf.fit(x,y)
print metrics.accuracy_score(y, pr)
print metrics.confusion_matrix(y, pr) 
#joblib.dump(clf, 'rand_forest_model_1.pkl')

Example #14

0

Show file

File: unbalanced.py Project: MGolubeva/Ubalanced_classes

            edgecolor=almost_black, facecolor=palette[0], linewidth=0.15)
axes[2, 2].scatter(ncrx_vis[ncry==1, 0], ncrx_vis[ncry==1, 1], label="Class #1", alpha=0.5, 
            edgecolor=almost_black, facecolor=palette[2], linewidth=0.15)
axes[2, 2].set_title('Neighboorhood cleaning rule', fontsize=fs)

plt.show()

# Generate the new datasets using over-sampling and combined methods
verbose = False
ratio = float(np.count_nonzero(y==1)) / float(np.count_nonzero(y==0))
# 'Random over-sampling'
OS = OverSampler(ratio=ratio, verbose=verbose)
osx, osy = OS.fit_transform(x, y)
# 'SMOTE'
smote = SMOTE(ratio=ratio, verbose=verbose, kind='regular')
smox, smoy = smote.fit_transform(x, y)
# 'SMOTE borderline 1'
bsmote1 = SMOTE(ratio=ratio, verbose=verbose, kind='borderline1')
bs1x, bs1y = bsmote1.fit_transform(x, y)
# 'SMOTE borderline 2'
bsmote2 = SMOTE(ratio=ratio, verbose=verbose, kind='borderline2')
bs2x, bs2y = bsmote2.fit_transform(x, y)
# 'SMOTE SVM'
svm_args={'class_weight' : 'auto'}
svmsmote = SMOTE(ratio=ratio, verbose=verbose, kind='svm', **svm_args)
svsx, svsy = svmsmote.fit_transform(x, y)
# 'SMOTE Tomek links'
STK = SMOTETomek(ratio=ratio, verbose=verbose)
stkx, stky = STK.fit_transform(x, y)
# 'SMOTE ENN'
SENN = SMOTEENN(ratio=ratio, verbose=verbose)

Example #15

0

Show file

File: overSampling.py Project: debasishdebs/parameterTesting

 def smote_boderline2(self):
     """Resample ``self.x`` / ``self.y`` with borderline-2 SMOTE and return the pair."""
     sampler = SMOTE(
         ratio=self._ratio,
         verbose=self.verbose,
         kind='borderline2',
     )
     resampled_x, resampled_y = sampler.fit_transform(self.x, self.y)
     return resampled_x, resampled_y

Example #16

0

Show file

File: overSampling.py Project: debasishdebs/parameterTesting

 def smote_svm(self):
     """Resample ``self.x`` / ``self.y`` with SVM-based SMOTE and return the pair."""
     sampler = SMOTE(
         ratio=self._ratio,
         verbose=self.verbose,
         kind='svm',
         class_weight='auto',
     )
     resampled_x, resampled_y = sampler.fit_transform(self.x, self.y)
     return resampled_x, resampled_y

Example #17

0

Show file

File: overSampling.py Project: debasishdebs/parameterTesting

 def smote(self):
     """Resample ``self.x`` / ``self.y`` with regular SMOTE and return the pair."""
     sampler = SMOTE(
         ratio=self._ratio,
         verbose=self.verbose,
         kind='regular',
     )
     resampled_x, resampled_y = sampler.fit_transform(self.x, self.y)
     return resampled_x, resampled_y

Example #18

0

Show file

File: lle.py Project: flyxu/scikit-learn

import numpy as np

train=pd.read_csv('./cmv.csv')
train['Defective']=train['Defective'].map({'Y':1,'N':0})
print type(train.values)
train=train.values
print train[0:1]
X_r,err=manifold.locally_linear_embedding(train[:,0:-1],n_neighbors=12,n_components=4)
print("Done. Reconstruction error: %g" % err)
data=X_r
label=train[:,-1]
#print label
x_train,x_test,y_train,y_test=cross_validation.train_test_split(data,label,test_size=0.3,random_state=0)
verbose = False
ratio = float(np.count_nonzero(y_train==0)) / float(np.count_nonzero(y_train==1))
smote = SMOTE(ratio=ratio, verbose=verbose, kind='regular')
smox, smoy = smote.fit_transform(x_train, y_train)
print np.count_nonzero(smoy==1)
print np.count_nonzero(smoy==0)
clf=svm.SVC(C=10000,gamma=0.0078125)
#print y_train.astype(int)
clf.fit(smox,smoy)
y_pred=clf.predict(x_test)
print y_test
print y_pred
confusion=confusion_matrix(y_test,y_pred)
print confusion
score = cross_val_score(clf, x_train, y_train)
print score.mean()
print score.std()

Example #19

0

Show file

File: train_python_model.py Project: andyudina/technocat-movements-checker

def f6(x):
    """Remap a class label: 7 becomes 2, labels above 7 shift down by one."""
    if x == 7:
        return 2
    if x > 7:
        return x - 1
    return x


#y = map(f6, y)

y = numpy.array(y)

sm = SMOTE(kind='regular')
for i in xrange(20):
    x_metrics, y = sm.fit_transform(x_metrics, y)

clf = RandomForestClassifier(n_estimators=100, class_weight='auto')
pr = cross_validation.cross_val_predict(clf, x_metrics, y, cv=10)
#print metrics.accuracy_score(y, pr)
#print metrics.confusion_matrix(y, pr)

delete_rows_indexes = [i for i, y_i in enumerate(pr) if y_i == 2 and y[i] == 7]
x_metrics = numpy.delete(x_metrics, delete_rows_indexes, axis=0)
y = numpy.delete(y, delete_rows_indexes, axis=0)
#clf.fit(x_metrics,y)
#joblib.dump(clf, 'rand_forest_model_3.pkl')

pr = cross_validation.cross_val_predict(clf, x_metrics, y, cv=10)
print metrics.accuracy_score(y, pr)
#print metrics.confusion_matrix(y, pr)

Example #20

0

Show file

File: BC.py Project: kabnigam/Data-Projects

from sklearn.preprocessing import StandardScaler

X = df.ix[:, 1:10].values
y = df["Class"]


# In[467]:


xn = StandardScaler().fit_transform(X)
Xn = pd.DataFrame(xn, columns=df.ix[:, 1:10].columns)


OS = SMOTE(ratio=0.85, verbose=True)

osx, osy = OS.fit_transform(Xn.values, y)


# In[468]:

X_train, X_test, y_train, y_test = train_test_split(osx, osy, test_size=0.2, random_state=1)


# In[469]:

from sklearn.svm import SVC


# In[470]:

modelOS = SVC(kernel="linear", C=1).fit(osx, osy)

Example #21

0

Show file

File: MetaClassifier_Class.py Project: mentekid/Metaclassifier

    def fit(self, data, labels):
        """
        Training (fitting) the meta-classifier requires training each individual
        classifier in the ensemble and using testing data that has not been used
        in the classifiers' training set to train the meta-classifier.
        To do this, we use 10-fold Stratified Cross-Validation to produce a
        training set for the meta-classifier equal to the one provided.

        If ``self.useSMOTE`` is set, SMOTE (borderline-1) balances the training
        part of every fold, the full training set used for the final re-fit of
        each classifier, and the fusion data given to the aggregator.

        Arguments
        ---------
        data: (N,d) data to be trained on
        labels: labels for the training data; they are summed to count the
            positive class, so presumably 0/1 -- confirm with callers

        Side effects
        ------------
        Sets ``self.data_train``, ``self.labels_train`` and ``self.fusion_data``.
        With ``self.useSMOTE`` the stored ``self.labels_train`` ends up holding
        the over-sampled labels that match ``self.fusion_data``.
        """
        from unbalanced_dataset import SMOTE
        from sklearn.cross_validation import StratifiedKFold

        def balance(samples, targets):
            # Over-sample with borderline-1 SMOTE; the ratio is the number of
            # non-positive labels divided by the number of positive labels.
            bugs = sum(targets)
            ratio = float(len(targets) - bugs) / bugs
            smote = SMOTE(ratio=ratio, verbose=False, kind='borderline1')
            return smote.fit_transform(samples, targets)

        self.data_train = np.copy(data)
        self.labels_train = np.copy(labels)
        n_samples = len(self.labels_train)

        #training data for metaclassifier (results of each classifier in ensemble)
        self.fusion_data = pd.DataFrame() #(fusion_labels = labels_train!)

        skf = StratifiedKFold(self.labels_train, n_folds=10)
        sets = [{'train': train, 'test': test} for train, test in skf]

        for count, clf in enumerate(self.ensemble):
            # One out-of-fold prediction per training sample, stored at the
            # sample's own position. Stratified test folds are not contiguous,
            # so concatenating them in fold order would misalign the
            # predictions with ``labels_train``.
            hypothesis = [None] * n_samples
            for fold in sets:
                #separate training/testing set for fold, use SMOTE if asked to
                data_train_fold = self.data_train[fold['train']]
                labels_train_fold = self.labels_train[fold['train']]

                if self.useSMOTE:
                    data_train_fold, labels_train_fold = balance(
                        data_train_fold, labels_train_fold)

                #fit the classifier with the training data of current fold
                clf.fit(data_train_fold, labels_train_fold)

                #make a prediction with the testing data of current fold
                data_test_fold = self.data_train[fold['test']]
                y = clf.predict(data_test_fold)

                #store data for the meta-classifier
                for sample_index, prediction in zip(fold['test'], y):
                    hypothesis[sample_index] = prediction

            #re-train the model using the entire available data (better performance)
            if self.useSMOTE:
                data_train_clf, labels_train_clf = balance(
                    self.data_train, self.labels_train)
            else:
                # Without SMOTE these names used to be undefined here.
                data_train_clf, labels_train_clf = self.data_train, self.labels_train

            clf.fit(data_train_clf, labels_train_clf)

            #new column of metaclassifier training data (this classifier's hypothesis)
            self.fusion_data['classifier_'+str(count)] = np.array(hypothesis)

        #perform smote on the fusion data to even out the classes
        if self.useSMOTE:
            columns = list(self.fusion_data)
            # ``.values`` replaces ``DataFrame.as_matrix()``, which pandas removed.
            fused, self.labels_train = balance(
                self.fusion_data.values, self.labels_train)
            self.fusion_data = pd.DataFrame(fused, columns=columns)

        #train the aggregator using the fusion set created earlier
        self.aggregator.fit(self.fusion_data.values, self.labels_train)
        return

Example #22

0

Show file

x = numpy.delete(x, [3 + i * 11 for i in xrange(6)], axis=1)
#print x[0]
#x = normalize(x, axis=0)
#x = scale(x, axis=0)
y = df[:, -1]
y = map(lambda x: int(x), y)


def f6(x):
    """Remap a class label: 6 becomes 2, 7 becomes 6, anything else is kept."""
    if x == 6:
        return 2
    if x == 7:
        return 6
    return x


y = map(f6, y)
y = numpy.array(y)

sm = SMOTE(kind='regular')
for i in xrange(10):
    x, y = sm.fit_transform(x, y)

clf = RandomForestClassifier(n_estimators=100, class_weight='auto')
pr = cross_validation.cross_val_predict(clf, x, y, cv=10)
#clf.fit(x,y)
print metrics.accuracy_score(y, pr)
print metrics.confusion_matrix(y, pr)
#joblib.dump(clf, 'rand_forest_model_1.pkl')

Example #23

0

Show file

                header=None,
                index_col=False,
                names=colnames,
                skiprows=[0],
                usecols=[8])
y = y['violation'].values
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.333, random_state=0)
main_x = X.values
main_y = y

verbose = False
ratio = float(np.count_nonzero(y == 1)) / float(np.count_nonzero(y == 0))

# 'SMOTE borderline 2'
bsmote2 = SMOTE(ratio=ratio, verbose=verbose, kind='borderline2')
x, y = bsmote2.fit_transform(main_x, main_y)

ratio = float(np.count_nonzero(y == 1)) / float(np.count_nonzero(y == 0))
X_train, X_test, y_train, y_test = train_test_split(x,
                                                    y,
                                                    test_size=.333,
                                                    random_state=0)

from sklearn.ensemble import RandomForestClassifier
from sklearn.cross_validation import cross_val_score

clf = RandomForestClassifier(n_estimators=10)
# NOTE(review): this cross-validates on the held-out split, not the training
# split -- confirm that is intended.
scores = cross_val_score(clf, X_test, y_test)

# Fit once and reuse the fitted forest. Fitting separately for the labels and
# for the probabilities trained two different random forests, so ``y_pred``
# and ``y_score`` did not describe the same model.
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
y_score = clf.predict_proba(X_test)[:, 1]

Example #24

0

Show file

File: capstone.py Project: ScottD61/Thinkful

#Ratio output
#print(ratio)
#2.33
#Set verbose as false to show less information
verbose = False
#Create SMOTE object
#smote = SMOTE(ratio = ratio, verbose = False, kind = 'regular') #Don't use

#Another way - leave this way
smote = SMOTE(ratio = 1.335, verbose = False, kind = 'regular')

#Fit data and transform
X_mod = X.as_matrix()
Y_mod = np.array(Y)
#Create new dataset
smox, smoy = smote.fit_transform(X_mod, Y_mod) 

#Check ratio of good and bad creditors
#Convert matrix to dataframe
y_data = pd.DataFrame(smoy, columns = ['classification'])
#check work
y_data['classification'].value_counts()


#New visualizations
#Convert matrix to dataframe to plot numeric columns
#Create list of column names
col_names = ['Status checking_A11', 'Status checking_A12', 'Status checking_A13',
       'Status checking_A14', 'Credit history_A30', 'Credit history_A31',
       'Credit history_A32', 'Credit history_A33', 'Credit history_A34',
       'Purpose_A40', 'Purpose_A41', 'Purpose_A410', 'Purpose_A42',

Example #25

0

Show file

File: overSampling.py Project: debasishdebs/parameterTesting

 def smote_boderline1(self):
     """Resample ``self.x`` / ``self.y`` with borderline-1 SMOTE and return the pair."""
     sampler = SMOTE(
         ratio=self._ratio,
         verbose=self.verbose,
         kind='borderline1',
     )
     resampled_x, resampled_y = sampler.fit_transform(self.x, self.y)
     return resampled_x, resampled_y

Example #26

0

Show file

File: random_forest_SMOTE.py Project: xuezhizeng/SLA_violation_classification

                header=None,
                index_col=False,
                names=colnames,
                skiprows=[0],
                usecols=[8])
y = y['violation'].values
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.333, random_state=0)
main_x = X.values
main_y = y

verbose = False
ratio = float(np.count_nonzero(y == 1)) / float(np.count_nonzero(y == 0))

# 'SMOTE'
smote = SMOTE(ratio=ratio, verbose=verbose, kind='regular')
x, y = smote.fit_transform(main_x, main_y)

ratio = float(np.count_nonzero(y == 1)) / float(np.count_nonzero(y == 0))
X_train, X_test, y_train, y_test = train_test_split(x,
                                                    y,
                                                    test_size=.333,
                                                    random_state=0)

from sklearn.ensemble import RandomForestClassifier
from sklearn.cross_validation import cross_val_score

clf = RandomForestClassifier(n_estimators=10)
# NOTE(review): this cross-validates on the held-out split, not the training
# split -- confirm that is intended.
scores = cross_val_score(clf, X_test, y_test)

# Fit once and reuse the fitted forest. Fitting separately for the labels and
# for the probabilities trained two different random forests, so ``y_pred``
# and ``y_score`` did not describe the same model.
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
y_score = clf.predict_proba(X_test)[:, 1]

Example #27

0

Show file

File: train_python_model.py Project: niggor/technoKot

def f6(x):
    """Remap a class label: 7 becomes 2, labels above 7 shift down by one."""
    if x == 7:
        return 2
    if x > 7:
        return x - 1
    return x


#y = map(f6, y)

y = numpy.array(y)

sm = SMOTE(kind='regular')
for i in xrange(20):
    x_metrics, y = sm.fit_transform(x_metrics, y)


clf = RandomForestClassifier(n_estimators=100, class_weight='auto')
pr = cross_validation.cross_val_predict(clf, x_metrics, y, cv=10)
#print metrics.accuracy_score(y, pr)
#print metrics.confusion_matrix(y, pr)

delete_rows_indexes = [i for i, y_i in enumerate(pr) if y_i == 2 and y[i] == 7]
x_metrics = numpy.delete(x_metrics, delete_rows_indexes, axis=0)
y = numpy.delete(y, delete_rows_indexes, axis=0)
#clf.fit(x_metrics,y)
#joblib.dump(clf, 'rand_forest_model_3.pkl')

 
pr = cross_validation.cross_val_predict(clf, x_metrics, y, cv=10)