コード例 #1
0
def test_multiclass_fit_resample():
    """Each of three classes must hold exactly two samples after resampling."""
    # Relabel two samples so that the target contains a third class.
    target = Y.copy()
    for position in (5, 6):
        target[position] = 2

    sampler = RandomUnderSampler(random_state=RND_SEED)
    _, target_resampled = sampler.fit_resample(X, target)

    class_counts = Counter(target_resampled)
    for class_label in (0, 1, 2):
        assert class_counts[class_label] == 2
コード例 #2
0
def test_random_under_sampling_heterogeneous_data():
    """Check that under-sampling keeps an object-dtype array intact.

    The input mixes strings, integers and floats in one object array with
    two samples of class 0 and one of class 1; one sample per class must
    remain and the dtype must still be ``object``.
    """
    # The ``np.object`` alias was removed from NumPy; the builtin ``object``
    # is the same dtype and works on every NumPy version.
    X_hetero = np.array([['xxx', 1, 1.0], ['yyy', 2, 2.0], ['zzz', 3, 3.0]],
                        dtype=object)
    y = np.array([0, 0, 1])
    rus = RandomUnderSampler(random_state=RND_SEED)
    X_res, y_res = rus.fit_resample(X_hetero, y)

    assert X_res.shape[0] == 2
    assert y_res.shape[0] == 2
    assert X_res.dtype == object
コード例 #3
0
def test_rus_fit_resample():
    """Under-sampling with replacement must reproduce the stored reference."""
    expected_X = np.array([
        [0.92923648, 0.76103773],
        [0.47104475, 0.44386323],
        [0.13347175, 0.12167502],
        [0.09125309, -0.85409574],
        [0.12372842, 0.6536186],
        [0.04352327, -0.20515826],
    ])
    expected_y = np.array([0, 0, 0, 1, 1, 1])

    sampler = RandomUnderSampler(random_state=RND_SEED, replacement=True)
    X_resampled, y_resampled = sampler.fit_resample(X, Y)

    assert_array_equal(X_resampled, expected_X)
    assert_array_equal(y_resampled, expected_y)
コード例 #4
0
def test_pipeline_sample():
    """A pipeline ending in a sampler must match applying its steps by hand."""
    X, y = make_classification(
        n_classes=2,
        class_sep=2,
        weights=[0.1, 0.9],
        n_informative=3,
        n_redundant=1,
        flip_y=0,
        n_features=20,
        n_clusters_per_class=1,
        n_samples=5000,
        random_state=0)

    sampler = RandomUnderSampler(random_state=0)

    # Case 1: the sampler is the only pipeline step.
    sampler_only = Pipeline([('rus', sampler)])
    X_pipe, y_pipe = sampler_only.fit_resample(X, y)
    X_manual, y_manual = sampler.fit_resample(X, y)
    assert_allclose(X_pipe, X_manual, rtol=R_TOL)
    assert_allclose(y_pipe, y_manual, rtol=R_TOL)

    # Case 2: PCA precedes the sampler; the manual path uses its own PCA.
    standalone_pca = PCA()
    pca_then_sampler = Pipeline([('pca', PCA()), ('rus', sampler)])
    X_pipe, y_pipe = pca_then_sampler.fit_resample(X, y)
    X_manual, y_manual = sampler.fit_resample(
        standalone_pca.fit_transform(X), y)

    # Values within +/- R_TOL of zero are snapped to zero before comparing,
    # because the PCA output is unstable around zero.
    for projected in (X_pipe, X_manual):
        projected[(projected < R_TOL) & (projected > -R_TOL)] = 0
    assert_allclose(X_pipe, X_manual, rtol=R_TOL)
    assert_allclose(y_pipe, y_manual, rtol=R_TOL)
コード例 #5
0
def test_rus_fit_resample_half():
    """A per-class dict strategy (3 of class 0, 6 of class 1) hits the reference."""
    sampler = RandomUnderSampler(
        sampling_strategy={0: 3, 1: 6},
        random_state=RND_SEED,
        replacement=True)
    X_resampled, y_resampled = sampler.fit_resample(X, Y)

    # Rows repeat because sampling is done with replacement.
    class0_rows = [
        [0.92923648, 0.76103773],
        [0.47104475, 0.44386323],
        [0.92923648, 0.76103773],
    ]
    class1_rows = [
        [0.15490546, 0.3130677],
        [0.15490546, 0.3130677],
        [0.15490546, 0.3130677],
        [0.20792588, 1.49407907],
        [0.15490546, 0.3130677],
        [0.12372842, 0.6536186],
    ]
    expected_X = np.array(class0_rows + class1_rows)
    expected_y = np.array([0] * 3 + [1] * 6)

    assert_array_equal(X_resampled, expected_X)
    assert_array_equal(y_resampled, expected_y)
コード例 #6
0
    def prepare_nn(self,
                   n_splits=10,
                   normalize=True,
                   shuffle_data=True,
                   oversample=True,
                   undersample=False):
        """Load the data, rebalance/shuffle/normalize it and build a splitter.

        ``self.read_data()`` is called first. When ``oversample`` is true the
        data is randomly over-sampled; otherwise, when ``undersample`` is
        true, it is randomly under-sampled (``oversample`` takes precedence
        if both are set). Both samplers use ``random_state=55`` and replace
        ``self.data`` / ``self.labels`` with the resampled arrays.

        :param n_splits: number of folds for the returned splitter
        :param normalize: call ``self.normalize_data(0, 1)`` when true
        :param shuffle_data: call ``self.shuffle_data()`` and shuffle inside
            the splitter when true
        :param oversample: apply ``RandomOverSampler`` when true
        :param undersample: apply ``RandomUnderSampler`` when true and
            ``oversample`` is false
        :return: a configured ``StratifiedKFold`` instance
        """
        self.read_data()
        if oversample:
            ros = RandomOverSampler(random_state=55)
            self.data, self.labels = ros.fit_resample(self.data, self.labels)
        elif undersample:
            rus = RandomUnderSampler(random_state=55)
            self.data, self.labels = rus.fit_resample(self.data, self.labels)
        if shuffle_data:
            self.shuffle_data()
        if normalize:
            self.normalize_data(0, 1)

        # scikit-learn raises ValueError when random_state is set while
        # shuffle is False, so the seed is only passed when shuffling.
        skf = StratifiedKFold(n_splits=n_splits,
                              shuffle=shuffle_data,
                              random_state=43 if shuffle_data else None)
        return skf
コード例 #7
0
def sample(X, y, ratio):
    """Undersamples the majority class, then oversamples the minority with SMOTENC

    Params
    --------
        X (df): dataframe representing independent (non-target) variables
        y (df): dataframe representing target; must have a 'target' column
        ratio: value forwarded as ``sampling_strategy`` to RandomUnderSampler


    Returns
    --------
        X_over (df): dataframe representing independent (non-target) variables, with undersampled majority/SMOTE minority
        y_over (df): dataframe representing target, with undersampled majority/SMOTE minority
    """

    # for sample runs, need to ensure k_neighbors is less than minority samples
    n_minority_samples = y.groupby('target').target.count()[1]

    # SMOTE queries k_neighbors + 1 neighbours (each point is its own nearest
    # neighbour), so k_neighbors must stay below the minority count. The
    # previous ``< 5`` threshold let exactly 5 samples through with
    # k_neighbors=5, and fewer than 3 samples produced a non-positive value.
    if n_minority_samples <= 5:
        k_neighbors = max(1, n_minority_samples - 2)
    else:
        k_neighbors = 5

    # under sample majority based on ratio
    undersample = RandomUnderSampler(sampling_strategy=ratio, random_state=123)
    X_under, y_under = undersample.fit_resample(X, y)

    # synthetic oversample via SMOTENC; columns 0, 1, 2 and 4 are declared
    # categorical
    oversample = SMOTENC(
        categorical_features=[0, 1, 2, 4],
        random_state=123,
        k_neighbors=k_neighbors)
    X_over, y_over = oversample.fit_resample(X_under, y_under)

    return X_over, y_over
コード例 #8
0
def test_rus_fit_resample_half():
    """Request 3 samples of class 0 and 6 of class 1 and compare to the reference."""
    requested_counts = {0: 3, 1: 6}
    sampler = RandomUnderSampler(
        sampling_strategy=requested_counts,
        random_state=RND_SEED,
        replacement=True,
    )
    resampled = sampler.fit_resample(X, Y)

    # Duplicated rows are expected: the sampler draws with replacement.
    reference_X = np.array([
        [0.92923648, 0.76103773], [0.47104475, 0.44386323],
        [0.92923648, 0.76103773], [0.15490546, 0.3130677],
        [0.15490546, 0.3130677], [0.15490546, 0.3130677],
        [0.20792588, 1.49407907], [0.15490546, 0.3130677],
        [0.12372842, 0.6536186],
    ])
    reference_y = np.array([0, 0, 0, 1, 1, 1, 1, 1, 1])

    X_resampled, y_resampled = resampled
    assert_array_equal(X_resampled, reference_X)
    assert_array_equal(y_resampled, reference_y)
コード例 #9
0
def undersample_major_class(X: np.ndarray, Y: np.ndarray, k=3):
    """ Draw ``k`` independent random under-samplings of ``(X, Y)``.

    Each draw is shuffled with ``unison_shuffled_copies`` and stored as a
    dict keyed by ``str(row[0])`` with the row's label as value; rows that
    share the same first element collapse into one entry (the last one wins).

    :param X: samples passed to the under-sampler
    :param Y: labels matching ``X``
    :param k: number of draws
    :return: list with one dict per draw
    """
    logger.info(f'Undersampling the majority class [{k}] times.')
    sampler = RandomUnderSampler()
    draws = []
    for _ in range(k):
        X_draw, Y_draw = sampler.fit_resample(X, Y)
        X_draw, Y_draw = unison_shuffled_copies(X_draw, Y_draw)
        draws.append(
            {str(row[0]): label for row, label in zip(X_draw, Y_draw)})

    return draws
コード例 #10
0
def tuneReducedDecisionTree():
    """Print per-model and per-fold F1 scores for pruned decision trees.

    The training set from ``common.loadTrainingDataSet()`` is split with a
    shuffled 5-fold ``KFold``. For every fold, nine models are trained; each
    one under-samples the training part with its own seed (42..50), reduces
    it to 331 components with ``TruncatedSVD`` and fits a
    ``DecisionTreeClassifier(ccp_alpha=0.015)``. The F1 score of every model
    on the fold's test part and the fold average are printed. Nothing is
    returned.
    """
    X, y = common.loadTrainingDataSet()

    kf = KFold(n_splits=5, random_state=42, shuffle=True)
    numModels = 9

    for train_index, test_index in kf.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        totalF1 = 0.0
        for modelNum in range(numModels):
            # A different seed per model gives each one its own majority subset.
            rus = RandomUnderSampler(random_state=42 + modelNum)
            X_model_full, y_model = rus.fit_resample(X_train, y_train)

            truncatedSvd = TruncatedSVD(n_components=331,
                                        n_iter=7,
                                        random_state=42)
            X_model = truncatedSvd.fit_transform(X_model_full, y_model)

            dtClassifier = DecisionTreeClassifier(ccp_alpha=0.015)
            dtClassifier.fit(X_model, y_model)

            # The test part is projected with the SVD fitted on this model's
            # training subset.
            y_pred = dtClassifier.predict(truncatedSvd.transform(X_test))
            currentF1 = f1_score(y_test, y_pred)
            print("Printing F1 for model #" + str(modelNum) + " = " +
                  str(currentF1))
            totalF1 += currentF1

        avgF1 = totalF1 / numModels
        print("f1 = " + str(avgF1))
コード例 #11
0
def randomUnderSample(x, y, label='class', numSamplesPerClassType=None):
    """Randomly under-sample ``x``/``y`` down to a per-class sample cap.

    Parameters
    ----------
    x : pandas.DataFrame
        Feature frame; its column names are re-applied to the result.
    y : pandas.Series
        Class labels (``value_counts`` is used to count them).
    label : str
        Column name given to the label column of the result.
    numSamplesPerClassType : dict or None
        Maps class label to the maximum number of samples to keep. ``None``
        uses the historical default of 100000 samples for classes 1..7.
        Caps larger than the available count are lowered to that count and
        classes absent from ``y`` are ignored.

    Returns
    -------
    (x_bal, y_bal)
        Balanced feature frame and label series.
    """
    if numSamplesPerClassType is None:
        # Previously this dict unconditionally replaced the argument; it is
        # kept as the default so callers relying on it see no change.
        numSamplesPerClassType = {classId: 100000 for classId in range(1, 8)}
    print('- Balancing with random under sampling')
    print('Current x state: ', x.shape)

    x_columns = x.columns.values
    counts = y.value_counts().to_dict()
    printStr = '> Initial class freq:\n'
    for k, v in counts.items():
        printStr += '"{}" instances: [{}]\n'.format(k, v)
    print(printStr)

    # Never request more samples than a class has, and skip classes that do
    # not occur in y (looking them up used to raise KeyError).
    classNumSamp = {
        k: min(v, counts[k])
        for k, v in numSamplesPerClassType.items() if k in counts
    }
    rus = RandomUnderSampler(random_state=int(time.time()), sampling_strategy=classNumSamp)

    x, y = rus.fit_resample(x, y)

    x_bal = pd.DataFrame(x, columns=x_columns)
    y_bal = pd.DataFrame(y, columns=[label])

    # Join and split again so features and labels share one index.
    df = x_bal.join(y_bal)
    y_bal = df.loc[:, label]
    x_bal = df.drop(columns=[label])

    counts = y_bal.value_counts().to_dict()
    printStr = 'Balanced class freq:\n'
    for k, v in counts.items():
        printStr += '"{}" instances: [{}]\n'.format(k, v)
    print(printStr)

    print('Balanced x state: ', x_bal.shape)

    return x_bal, y_bal
コード例 #12
0
    def evaluate(self,
                 train_docs,
                 y_train,
                 test_docs,
                 y_test,
                 clf_metadata,
                 features_metadata,
                 task='classification',
                 return_predictions=False):
        """Train a classifier on the prepared features and compute metrics.

        Features come from ``self.prepare_features``. The training set is
        optionally rebalanced (``features_metadata['sampling']`` equal to
        ``'over'`` or ``'under'``) and optionally projected with LDA
        (``features_metadata['LDA']`` truthy, using
        ``features_metadata['n_components']``).

        :param train_docs: training documents
        :param y_train: training labels
        :param test_docs: test documents
        :param y_test: test labels
        :param clf_metadata: passed to ``self.get_classifier``
        :param features_metadata: dict with the keys ``'sampling'`` and
            ``'LDA'`` (and ``'n_components'`` when LDA is enabled)
        :param task: forwarded to ``self.get_metrics``
        :param return_predictions: accepted but not used in this method
        :return: whatever ``self.get_metrics`` returns
        """
        clf = self.get_classifier(clf_metadata)
        X_train, X_test = self.prepare_features(features_metadata, train_docs,
                                                test_docs)
        sampling = features_metadata['sampling']
        if sampling == 'over':
            ros = RandomOverSampler(random_state=0)
            X_train, y_train = ros.fit_resample(X_train, y_train)
        elif sampling == 'under':
            rus = RandomUnderSampler(random_state=0)
            X_train, y_train = rus.fit_resample(X_train, y_train)
        if features_metadata['LDA']:
            lda = LinearDiscriminantAnalysis(
                n_components=features_metadata['n_components'])
            # toarray() gives a plain ndarray; todense() returns np.matrix,
            # which current scikit-learn estimators reject.
            dense_train = X_train.toarray()
            dense_test = X_test.toarray()
            lda.fit(dense_train, y_train)
            X_train = lda.transform(dense_train)
            X_test = lda.transform(dense_test)
        clf.fit(X_train, y_train)
        test_predicted = clf.predict(X_test)
        train_predicted = clf.predict(X_train)

        metrics = self.get_metrics(clf, y_train, y_test, train_predicted,
                                   test_predicted, task)

        return metrics
コード例 #13
0
ファイル: run.py プロジェクト: thtang/ieee-fraud-detection
def sampling(X_train, y_train):
    """Resample the training set with five strategies and print class counts.

    Random over-sampling, random under-sampling, Tomek links, SMOTE and
    edited nearest neighbours are each applied to the original
    ``(X_train, y_train)``. Class counts are printed for the original data
    and for every strategy except edited nearest neighbours.

    :return: tuple ``(X_over, y_over, X_under, y_under, X_tl, y_tl, X_sm,
        y_sm, X_enn, y_enn)``
    """
    ran_over = RandomOverSampler(random_state=42)
    X_train_oversample, y_train_oversample = ran_over.fit_resample(X_train, y_train)
    ran_under = RandomUnderSampler(random_state=42)
    X_train_undersample, y_train_undersample = ran_under.fit_resample(X_train, y_train)
    # fit_sample was removed from imbalanced-learn; fit_resample is the
    # supported name and is what the other samplers here already call.
    tl = TomekLinks(n_jobs=6)
    X_train_tl, y_train_tl = tl.fit_resample(X_train, y_train)
    sm = SMOTE(random_state=42, n_jobs=5)
    X_train_sm, y_train_sm = sm.fit_resample(X_train, y_train)
    enn = EditedNearestNeighbours()
    X_train_enn, y_train_enn = enn.fit_resample(X_train, y_train)

    print(np.unique(y_train, return_counts=True))
    print("after sampling")
    print("randomg over sampling")
    print(np.unique(y_train_oversample, return_counts=True))
    print("SMOTE sampling")
    print(np.unique(y_train_sm, return_counts=True))
    print("random under sampling")
    print(np.unique(y_train_undersample, return_counts=True))
    print("TomekLinks under sampling")
    print(np.unique(y_train_tl, return_counts=True))
    return (X_train_oversample, y_train_oversample, X_train_undersample, y_train_undersample,
     X_train_tl, y_train_tl, X_train_sm, y_train_sm, X_train_enn, y_train_enn)
コード例 #14
0
    def tackle_data_imbalance(self, X, Y):
        """Grow the data set with ADASYN, then under-sample it.

        Every class gets the same ADASYN target: three times the current
        number of samples, divided by the number of classes and truncated to
        an integer. The over-sampled data is then passed through a
        ``RandomUnderSampler`` with the ``"auto"`` strategy.

        NOTE(review): ADASYN is given a target for every class, including
        ones that may already exceed it - confirm this is accepted for the
        data this is used on.
        """
        growth_factor = 3
        class_counts = Counter(Y)

        n_classes = len(class_counts)
        target_total = sum(class_counts.values()) * growth_factor
        per_class_target = int(target_total / n_classes)

        # Same target count for every class present in Y.
        oversampling_targets = {
            class_label: per_class_target for class_label in class_counts
        }

        adasyn = ADASYN(n_neighbors=1, sampling_strategy=oversampling_targets)
        undersampler = RandomUnderSampler(sampling_strategy="auto")

        X, Y = adasyn.fit_resample(X, Y)
        X, Y = undersampler.fit_resample(X, Y)
        return X, Y
コード例 #15
0
def main_logic(data):
    """Split ``data`` into train/test and under-sample the training part.

    The 'Quantization_time' column is dropped; the last remaining column is
    the target and all others are features. 30% of the rows are held out.
    Class distributions are printed when the module-level ``debug`` equals 1.
    """
    verbose = debug == 1

    # Drop the quantization column, then separate features from the target.
    frame = data.drop('Quantization_time', axis=1)
    X, y = frame.iloc[:, :-1], frame.iloc[:, -1]

    # Hold out 30% of the rows for testing.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30)

    if verbose:
        print("Before undersampling: ", Counter(y_train))

    # Only the majority class is reduced.
    sampler = RandomUnderSampler(sampling_strategy='majority')
    X_train_under, y_train_under = sampler.fit_resample(X_train, y_train)

    if verbose:
        print("After undersampling: ", Counter(y_train_under))
    def _fit_clf(self, clf_type, dataloader):
        """Fit a grid-searched classifier on features extracted by the network.

        ``'svm'`` selects a StandardScaler + SVC pipeline searched over ``C``;
        ``'other_classifiers'`` selects a distance-weighted k-NN searched over
        ``n_neighbors``; any other value returns immediately. Features of all
        batches in ``dataloader`` are extracted without gradients, randomly
        under-sampled, and used for a 4-fold accuracy grid search. The best
        estimator is stored in ``self.clf`` and its parameters are appended
        to ``self.params_clf``.
        """
        if clf_type == 'svm':
            clf = Pipeline(steps=[("scaler", StandardScaler()),
                                  ("clf", SVC())])
            param_grid = {"clf__C": [0.01, 0.1, 1, 10]}
        elif clf_type == 'other_classifiers':
            clf = KNeighborsClassifier(weights="distance")
            param_grid = {"n_neighbors": [9, 11, 13, 15]}
        else:
            return

        print(f"Training {clf_type}")

        feature_batches, label_batches = [], []

        self.net.eval()
        with torch.no_grad():
            for images, labels in dataloader:
                on_device = images.to(self.device)
                label_batches.append(labels)
                feature_batches.append(
                    self._extract_features(on_device, normalize=False))

            X = torch.cat(feature_batches).cpu().numpy()
            y = torch.cat(label_batches).cpu().numpy()

        # Balance the classes before the search.
        X, y = RandomUnderSampler().fit_resample(X, y)
        search = GridSearchCV(clf,
                              param_grid=param_grid,
                              n_jobs=-1,
                              scoring='accuracy',
                              cv=4)
        search.fit(X, y)
        self.clf = search.best_estimator_
        self.params_clf.append(search.best_params_)
コード例 #17
0
def random_under(X,y,strategy=0.5):
    """Random undersampling.

    Rebalances a data set by passing it through ``RandomUnderSampler``.

    Parameters
    ----------
    X : dataframe or array
        Features of dataset
    y : dataframe or array
        Target value of dataset
    strategy : float
        Forwarded unchanged as the sampler's ``sampling_strategy``

    Returns
    -------
    X : dataframe or array
        Features of the resampled dataset
    y : dataframe or array
        Target value of the resampled dataset
    """
    sampler = RandomUnderSampler(sampling_strategy=strategy)
    X_resampled, y_resampled = sampler.fit_resample(X, y)
    return X_resampled, y_resampled
コード例 #18
0
def run(k, j, filename, seednum=20, threshold = 0.5, resultdir=None, graphdir = f'{treedir}/'):
    """Bootstrap a one-vs-rest XGBoost model and return area-weighted scores.

    Layer ``layers[j]`` of ``filename`` is read, restricted to the classes of
    interest, and ``geocode_2`` is collapsed to ``classes[k]`` versus
    ``'Others'``. For every seed in ``range(seednum)`` the rows are split by
    area quartile (70% of the smallest quartile count is drawn from each
    quartile for training, the remaining rows form the test set), the
    training part is randomly under-sampled, an XGBoost model is fitted, and
    area-weighted precision and recall are computed on the test part.

    Parameters
    ----------
    k : int
        Index into the class list selecting the class modelled against the rest.
    j : int
        Index into the module-level ``layers``.
    filename : str
        Geodata file read with ``gp.read_file``.
    seednum : int
        Number of bootstrap repetitions (seeds ``0..seednum-1``).
    threshold : float
        Predictions above this value count as label 1.
    resultdir : str or None
        When given, a feature-importance plot of the last model is saved there.
    graphdir : str
        Not used inside this function.

    Returns
    -------
    tuple
        ``(recall, precision)`` averaged over the seeds, or ``(-1, -1)`` when
        the smaller of the two classes has fewer than 20 objects.
    """
    classes = ["P1a1", "P1a2", "P2b", "P2c", "H1"]
    joind = gp.read_file(filename, layer=layers[j])
    print(f'\n------\n------{layers[j]}----\n-----\n')
    joind['area'] = joind['geometry'].area  # area of each object
    df1 = pd.DataFrame(joind.drop(columns='geometry'))
    df1 = df1.replace([np.inf, -np.inf], np.nan).dropna()

    # Keep only the classes of interest; .copy() so that the column
    # assignments below work on an independent frame.
    Pcl = df1.loc[df1['geocode_2'].isin(classes)].copy()
    print(Pcl['geocode_2'].value_counts())
    # Regroup: geocode_2 is binary from here on.
    Pcl['geocode_2'] = np.where(Pcl['geocode_2'].str.contains(classes[k]), classes[k], 'Others')
    print(Pcl['geocode_2'].value_counts())
    minc = min(Pcl['geocode_2'].value_counts())
    if minc < 20:  # too few objects to model
        print("minimum class less than 20")
        return (-1, -1)  # -1 -1 if not calculated

    print(f'total {len(df1)}, P_H1_classes: {len(Pcl)}, minimun class: {minc}')

    non_feature_cols = ["geocode_2", "layer", "OBJECTID", "path", "area_c"]
    label_all = [classes[k], "Others"]
    # NOTE(review): classes[k] maps to 0 and "Others" to 1, so the label-1
    # metrics below describe "Others" - confirm this is the intended positive.
    class2idx = {name: idx for idx, name in enumerate(label_all)}

    # Element 0 is a placeholder; it adds nothing to the sums taken below.
    avepre = np.zeros(1)
    averec = np.zeros(1)
    for seeds in range(seednum):
        np.random.seed(seeds)
        # 1. Bin the rows by area quartile ("area" itself stays a feature).
        # 2. Draw the same number of training rows from every bin.
        area_quantiles = Pcl['area'].quantile([0, .25, .5, .75, 1])
        print(area_quantiles)
        Pcl['area_c'] = pd.cut(Pcl['area'],
                               bins=area_quantiles.tolist(),
                               labels=["q25", "q5", "q75", "Max"])
        print(Pcl["area_c"].value_counts())

        n_per_bin = int(min(Pcl["area_c"].value_counts()) * 0.7)
        train_ind = Pcl.groupby('area_c').sample(n=n_per_bin).index
        test_ind = Pcl[~Pcl.index.isin(train_ind)].index

        X_train0 = Pcl.loc[train_ind].drop(columns=non_feature_cols)
        X_test = Pcl.loc[test_ind].drop(columns=non_feature_cols)
        Y_train0 = Pcl.filter(regex='geocode_2').loc[train_ind]
        Y_test = Pcl.filter(regex='geocode_2').loc[test_ind]
        print("after sampling by area: for 2 classes,", X_train0.shape[0], X_test.shape[0])
        print(Pcl.loc[train_ind]["geocode_2"].value_counts())

        # Balance classes[k] against "Others" within the training rows.
        rus = RandomUnderSampler(random_state=1)
        X_train, Y_train = rus.fit_resample(X_train0, Y_train0)
        print("number of samples used for training:", X_train.shape[0]/2)

        Y_trainnum = cl2idx(Y_train.values, class2idx).astype(int)
        Y_testnum = cl2idx(Y_test.values, class2idx).astype(int)

        params = {'max_depth': 6, 'eta': 0.002,
                  'objective': 'binary:logistic', 'num_class': 1}
        clf = xgb.XGBModel(**params)
        clf.fit(X_train.values, Y_trainnum,
                eval_set=[(X_train.values, Y_trainnum), (X_test.values, Y_testnum)],
                eval_metric='logloss',
                verbose=True)

        # Predict on the same array representation used for fitting.
        yhat = clf.predict(X_test.values)
        # Predictions above the threshold count as label 1.
        yhat_labels = (yhat > threshold).astype(int)

        # Flatten the labels so they compare element-wise with the
        # predictions even if cl2idx keeps a column shape.
        y_true = np.asarray(Y_testnum).ravel()
        area = X_test["area"].values

        # Confusion-matrix cells weighted by object area.
        TP = ((y_true == 1) & (yhat_labels == 1)).astype(float) * area
        FP = ((y_true == 0) & (yhat_labels == 1)).astype(float) * area
        FN = ((y_true == 1) & (yhat_labels == 0)).astype(float) * area
        precision = np.sum(TP) / np.sum(TP + FP)
        # Recall is TP / (TP + FN); the denominator used to be TP + TN.
        recall = np.sum(TP) / np.sum(TP + FN)

        averec = np.append(averec, recall)  # store all of them
        avepre = np.append(avepre, precision)

    # Mean over the seeds; the leading placeholder zero does not change the sum.
    recall = averec.sum()/seednum
    precision = avepre.sum()/seednum
    print(averec, recall)
    if resultdir is not None:
        plt.rcParams.update({'font.size': 8})
        # Plot the model fitted in the last repetition (was the undefined ``model``).
        ax = xgb.plot_importance(clf, grid=False, importance_type='gain', title='Feature importance')
        ax.set_title(f'xgboost importance {layers[j]} {classes[k]}')
        fname = f"{resultdir}/P_{layers[j]}_{classes[k]}_imp"
        plt.savefig(fname, dpi=1200)
    return (recall, precision)
コード例 #19
0
    0: size0,
    1: size1,
    2: size2,
    3: size3,
    4: size4,
    5: size5,
    6: size6,
    7: size7,
    8: size8,
    9: size9,
    10: size10,
    11: size11,
    12: size12,
    13: size13,
    14: size14
}
ros = RandomUnderSampler(sampling_strategy=strategy, random_state=7)
# The whole frame (label column included) is resampled row-wise, so X_under
# already carries the label of every kept row.
X_under, y_under = ros.fit_resample(df, df['dfiscx.label'])

# Turn the resampled outputs into dataframes
X_under = pd.DataFrame(X_under, columns=df.columns)
y_under = pd.DataFrame(y_under)

# Features and labels are both in X_under. Concatenating y_under below it
# (the default axis=0) appended the labels as extra, mostly-empty rows.
dataset = X_under
# Save as CSV
export_csv = dataset.to_csv(
    r'/home/latin/export_dataframe5PercentCleanedPCA.csv',
    index=None,
    header=True)  #Don't forget to add '.csv' at the end of the path
コード例 #20
0
# Features: every column except the identifier and the target.
X = df.drop(['id','Response'], axis = 1)

# Positions of the non-float columns, passed to CatBoost as categorical.
# The ``np.float`` alias was removed from NumPy; it was the builtin ``float``.
cat_var = np.where(X.dtypes != float)[0]

neg, pos = np.bincount(y)

from imblearn.over_sampling import RandomOverSampler, SMOTE
from imblearn.under_sampling import RandomUnderSampler

# Over-sample first, then under-sample.
over = RandomOverSampler(sampling_strategy = 0.4)
under = RandomUnderSampler(sampling_strategy = 0.8)
#smote = SMOTE(sampling_strategy = 0.4, random_state = 1)

#X, y = smote.fit_resample(X, y)
X, y = over.fit_resample(X,y)
X, y = under.fit_resample(X, y)

# NOTE(review): the split happens after resampling, so duplicated rows can
# land in both the training and the validation part.
from sklearn.model_selection import train_test_split
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size = 0.15, shuffle = True, stratify = y)

from catboost import CatBoostClassifier 
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix

classifier = CatBoostClassifier()#, scale_pos_weight = neg/pos)
classifier.fit(X_train, y_train, eval_set = (X_val, y_val), cat_features = cat_var)
yhat = classifier.predict(df_sub)

# Pair every case id with its prediction and write the submission file.
s = np.column_stack((case_id,yhat))
s = pd.DataFrame(s)
s.columns = ['id', 'Response']
s.to_csv("Submission.csv", index = False, index_label = None)  
コード例 #21
0
ファイル: test_base.py プロジェクト: bodycat/imbalanced-learn
 def func(X, y, sampling_strategy, random_state):
     """Under-sample ``(X, y)`` with a freshly built ``RandomUnderSampler``."""
     sampler_options = {
         'sampling_strategy': sampling_strategy,
         'random_state': random_state,
     }
     sampler = RandomUnderSampler(**sampler_options)
     return sampler.fit_resample(X, y)
コード例 #22
0
 def func(X, y, sampling_strategy, random_state):
     """Return the result of random under-sampling with the given settings."""
     return RandomUnderSampler(
         sampling_strategy=sampling_strategy,
         random_state=random_state,
     ).fit_resample(X, y)
    3: weights[3],
    4: weights[4]
}
# Over-sample the training set with the per-class targets in ratio_over.
over = RandomOverSampler(sampling_strategy=ratio_over, random_state=314)
X_train, y_train = over.fit_resample(X_train, y_train)

# Under-sample every class (0..4) down to the average sample count.
ratio_under = dict.fromkeys(range(5), average_samples)
under = RandomUnderSampler(sampling_strategy=ratio_under, random_state=314)
X_train, y_train = under.fit_resample(X_train, y_train)

# OLD: build class weights to handle class imbalance
#label_integers =np.argmax(labels_as_array, axis=1)
#class_weights = compute_class_weight('balanced', np.unique(label_integers), label_integers)
#d_class_weights = dict(enumerate(class_weights))
#print(d_class_weights)

# Everything below is for testing parameters
# Build the model
print(type(X_train))
print(type(y_train))
estimator = KerasClassifier(
    build_fn=baseline_model,
    epochs=40,
    batch_size=20,
    verbose=1,
)
        # Give the target column a name that is usable as an attribute.
        dataset2.rename(columns={"500_Buggy?": "Buggy"}, inplace=True)

        # Build the test features: drop the identifier, commit-time and
        # file-path columns.
        # NOTE(review): 'Buggy' is not dropped in the visible lines - confirm
        # whether the target column is meant to remain among the features.
        test_data = dataset2.drop('change_id', axis=1)
        test_data = test_data.drop('411_commit_time', axis=1)
        test_data = test_data.drop('412_full_path', axis=1)

        # remove unnecessary features
        #test_data = test_data.drop('File', axis=1)

        # the labels of the test data
        test_target = dataset2.Buggy

        #print(test_target)
        from imblearn.under_sampling import RandomUnderSampler
        # The same sampler balances the training set and then the test set.
        # NOTE(review): because the test set is under-sampled too, the report
        # below describes a balanced test set, not the original distribution.
        rus = RandomUnderSampler(random_state=0)
        X_resampled, y_resampled = rus.fit_resample(train_data, train_target)
        test_data_resampled, test_target_resampled = rus.fit_resample(
            test_data, test_target)

        # Fit on the balanced training data and predict the balanced test data.
        clf = LogisticRegression(warm_start=True, max_iter=1000000000)
        test_pred = clf.fit(X_resampled,
                            y_resampled).predict(test_data_resampled)

        # Append the classification report for labels 0 and 1 to the open file.
        file.write(
            classification_report(test_target_resampled,
                                  test_pred,
                                  labels=[0, 1]))
        file.write("\n")
file.close()
# Pass every split through pad_sequences with the same maxlen
# (presumably so that all sequences share one length - confirm against the
# pad_sequences implementation in use).
train_X = pad_sequences(train_X, maxlen=maxlen)
test_X = pad_sequences(test_X, maxlen=maxlen)
val_X = pad_sequences(val_X, maxlen=maxlen)

# Bare expressions: they only display a value in an interactive session.
len(train_X)

train_X[110]
"""## **Undersampling**"""

from collections import Counter
from sklearn.datasets import make_classification

from imblearn.under_sampling import RandomUnderSampler
# Randomly under-sample the training split and print the per-class counts.
# NOTE(review): X_resampled / y_resampled are not used again in the visible
# lines; train_X itself is left unchanged.
rus = RandomUnderSampler(random_state=0)
X_resampled, y_resampled = rus.fit_resample(train_X, train_y)
print(sorted(Counter(y_resampled).items()))

train_X.shape

train_X[110]

embed_size = 100  # how big is each word vector

# Dropout rates; where they are applied is not visible here.
S_DROPOUT = 0.4
DROPOUT = 0.1


def plotting(history):
    """Plot the 'acc' and 'val_acc' series stored in ``history.history``."""
    plt.plot(history.history['acc'])
    plt.plot(history.history['val_acc'])
コード例 #26
0
class ModelDataset(Dataset):
    """
    The dataset class responsible for loading the data and providing the samples for \
training.

    :param Dataset: Base Dataset class to use with PyTorch models
    :type Dataset: torch.utils.data.Dataset
    """

    def __init__(
        self,
        out_var=None,
        out_mean=None,
        forecast_dir=None,
        forcings_dir=None,
        reanalysis_dir=None,
        transform=None,
        hparams=None,
        **kwargs,
    ):
        """
        Constructor for the ModelDataset class

        :param out_var: Variance of the output variable, defaults to None
        :type out_var: float, optional
        :param out_mean: Mean of the output variable, defaults to None
        :type out_mean: float, optional
        :param forecast_dir: The directory containing the FWI-Forecast data, defaults \
to None
        :type forecast_dir: str, optional
        :param forcings_dir: The directory containing the FWI-Forcings data, defaults \
to None
        :type forcings_dir: str, optional
        :param reanalysis_dir: The directory containing the FWI-Reanalysis data, \
defaults to None
        :type reanalysis_dir: str, optional
        :param transform: Custom transform for the input variable, defaults to None
        :type transform: torch.transforms, optional
        :param hparams: Holds configuration values, defaults to None
        :type hparams: Namespace, optional
        """

        self.hparams = hparams
        self.out_mean = out_mean
        self.out_var = out_var
        self.hparams.thresh = self.hparams.out_mad / 2
        if self.hparams.binned:
            self.bin_intervals = self.hparams.binned

        # Mean of output variable used for bias-initialization.
        self.out_mean = out_mean if out_mean else self.hparams.out_mean
        # Variance of output variable used to scale the training loss.
        self.out_var = out_var if out_var else self.hparams.out_var

        # Convert string dates to numpy format
        if self.hparams.date_range:
            self.hparams.date_range = [
                np.datetime64(d) for d in self.hparams.date_range
            ]
        # Convert case-study dates to numpy format
        if (
            hasattr(self.hparams, "case_study_dates")
            and self.hparams.case_study_dates
            and not self.hparams.date_range
        ):
            self.hparams.case_study_dates = [
                [np.datetime64(d) for d in r] for r in self.hparams.case_study_dates
            ]
        # If custom date range specified, override
        else:
            self.hparams.case_study_dates = None

        # Create imbalanced-learn random subsampler
        if self.hparams.undersample:
            self.undersampler = RandomUnderSampler()

        if not self.hparams.benchmark:
            # Input transforms including mean and std normalization
            self.transform = transforms.Compose(
                [
                    transforms.ToTensor(),
                    # Mean and standard deviation stats used to normalize the input data
                    # to the mean of zero and standard deviation of one.
                    transforms.Normalize(
                        [
                            x
                            for i in range(self.hparams.in_days)
                            for x in (
                                self.hparams.inp_mean["rh"],
                                self.hparams.inp_mean["t2"],
                                self.hparams.inp_mean["tp"],
                                self.hparams.inp_mean["wspeed"],
                            )
                        ]
                        + (
                            [
                                self.hparams.smos_mean
                                for i in range(self.hparams.in_days)
                            ]
                            if self.hparams.smos_input
                            else []
                        ),
                        [
                            x
                            for i in range(self.hparams.in_days)
                            for x in (
                                self.hparams.inp_std["rh"],
                                self.hparams.inp_std["t2"],
                                self.hparams.inp_std["tp"],
                                self.hparams.inp_std["wspeed"],
                            )
                        ]
                        + (
                            [self.hparams.smos_std for i in range(self.hparams.in_days)]
                            if self.hparams.smos_input
                            else []
                        ),
                    ),
                ]
            )

    def __len__(self):
        """
        The internal method used to obtain the number of iteration samples.

        :return: The maximum possible iterations with the provided data.
        :rtype: int
        """
        return len(self.dates)

    def __getitem__(self, idx):
        """
        Build the input and target tensors for the sample at position ``idx``.

        The sample is anchored at ``self.dates[idx]``. ``self.input``,
        ``self.output``, ``self.smos_input`` and ``self.dates`` are not set in
        ``__init__``; they are presumably provided by a subclass or by the
        caller before indexing (confirm against the concrete datasets).

        :param idx: The index number of data sample.
        :type idx: int or torch.Tensor
        :return: ``(X, y)`` — the input tensor and the target tensor
        :rtype: tuple
        """

        # DataLoader samplers may hand over a tensor index; use plain Python values.
        if torch.is_tensor(idx):
            idx = idx.tolist()

        if self.hparams.benchmark:
            # Benchmark mode: the "input" is the first data variable of
            # `self.input`, selected for the anchor date and one `lead` value
            # per output day, resized to the shape of one output slice and
            # stacked along a new leading axis. No normalization is applied.
            X = torch.from_numpy(
                np.stack(
                    [
                        resize(
                            self.input[list(self.input.data_vars)[0]]
                            .sel(time=[self.dates[idx]], lead=[i])
                            .values.squeeze(),
                            self.output[list(self.output.data_vars)[0]][0].shape,
                        )
                        for i in range(self.hparams.out_days)
                    ],
                    axis=0,
                )
            )
        else:
            # Model mode: stack, along the LAST axis, the four input variables
            # for the anchor date and each of the previous `in_days - 1` days.
            # The order (day-major, then rh/t2/tp/wspeed) has to match the
            # mean/std lists given to `transforms.Normalize` in `__init__`.
            # When `smos_input` is enabled, one extra channel per day is
            # appended: nearest-in-time values of the first `smos_input`
            # variable, reversed along the first axis, NaNs replaced, and
            # resized to the shape of one `rh` slice.
            X = self.transform(
                np.stack(
                    [
                        self.input[v]
                        .sel(time=[self.dates[idx] - np.timedelta64(i, "D")])
                        .values.squeeze()
                        for i in range(self.hparams.in_days)
                        for v in ["rh", "t2", "tp", "wspeed"]
                    ]
                    + (
                        [
                            resize(
                                np.nan_to_num(
                                    self.smos_input[list(self.smos_input.data_vars)[0]]
                                    .sel(
                                        time=[self.dates[idx] - np.timedelta64(i, "D")],
                                        method="nearest",
                                    )
                                    .values.squeeze()[::-1],
                                    copy=False,
                                    # Use 50 as the placeholder for water bodies
                                    nan=50,
                                ),
                                self.input.rh[0].shape,
                            )
                            for i in range(self.hparams.in_days)
                        ]
                        if self.hparams.smos_input
                        else []
                    ),
                    axis=-1,
                )
            )

        # Target: the first data variable of `self.output` for the anchor date
        # and the following `out_days - 1` days, stacked along a new leading axis.
        y = torch.from_numpy(
            np.stack(
                [
                    self.output[list(self.output.data_vars)[0]]
                    .sel(time=[self.dates[idx] + np.timedelta64(i, "D")])
                    .values.squeeze()
                    for i in range(self.hparams.out_days)
                ],
                axis=0,
            )
        )

        return X, y

    def get_cb_loss_factor(self, y):
        """
        Compute the Class-Balanced loss factor mask using output value frequency \
distribution and the supplied beta factor.

        :param y: The 1D ground truth value tensor
        :type y: torch.tensor
        """
        idx = (
            (
                y.unsqueeze(0).expand(self.bin_centers.shape[0], -1)
                - self.bin_centers.unsqueeze(-1).expand(-1, y.shape[0])
            )
            .abs()
            .argmin(dim=0)
        )
        loss_factor = torch.empty_like(y)
        for i in range(self.bin_centers.shape[0]):
            loss_factor[idx == i] = self.loss_factors[i]
        return loss_factor

    def apply_mask(self, *y_list):
        """
        Returns batch_size x channels x N sized matrices after applying the mask.

        :param *y_list: The interable of tensors to be masked
        :type y_list: torch.Tensor
        :return: The list of masked tensors
        :rtype: list(torch.Tensor)
        """
        return [
            y.permute(-2, -1, 0, 1)[self.mask.expand_as(y[0][0])].permute(-2, -1, 0)
            for y in y_list
        ]

    def get_loss(self, y, y_hat):
        """
        Do the applicable processing and return the loss for the supplied prediction \
and the label tensors.

        :param y: Label tensor
        :type y: torch.Tensor
        :param y_hat: Predicted tensor
        :type y_hat: torch.Tensor
        :return: Prediction loss
        :rtype: torch.Tensor
        """
        if self.hparams.undersample:
            sub_mask = y < self.hparams.undersample
            subval = y[sub_mask]
            low = max(subval.min(), 0.5)
            high = subval.max()
            boundaries = torch.arange(low, high, (high - low) / 10).to(
                self.model.device
            )
            freq_idx = torch.bucketize(subval, boundaries[:-1], right=False)
            self.undersampler.fit_resample(
                subval.cpu().unsqueeze(-1),
                (boundaries.take(index=freq_idx).cpu() * 100).int(),
            )
            idx = self.undersampler.sample_indices_
            y = torch.cat((y[~sub_mask], subval[idx]))
            y_hat = torch.cat((y_hat[~sub_mask], y_hat[sub_mask][idx]))

        if self.hparams.round_to_zero:
            y_hat = y_hat[y > self.hparams.round_to_zero]
            y = y[y > self.hparams.round_to_zero]

        if self.hparams.clip_output:
            y_hat = y_hat[
                (y < self.hparams.clip_output[-1]) & (self.hparams.clip_output[0] < y)
            ]
            y = y[
                (y < self.hparams.clip_output[-1]) & (self.hparams.clip_output[0] < y)
            ]

        if self.hparams.cb_loss:
            loss_factor = self.get_cb_loss_factor(y)

        if self.hparams.boxcox:
            y = torch.from_numpy(boxcox(y.cpu(), lmbda=self.hparams.boxcox,)).to(
                y.device
            )

        pre_loss = (y_hat - y) ** 2
        # if "loss_factor" in locals():
        #     pre_loss *= loss_factor
        loss = pre_loss.mean()
        assert loss == loss

        return loss

    def training_step(self, model, batch):
        """
        Called inside the training loop with the data from the training dataloader \
passed in as `batch`.

        :param model: The chosen model
        :type model: Model
        :param batch: Batch of input and ground truth variables
        :type batch: int
        :return: Loss and logs
        :rtype: dict
        """

        # forward pass
        x, y_pre = batch
        y_hat_pre = model(x)
        y_pre, y_hat_pre = self.apply_mask(y_pre, y_hat_pre)

        assert y_pre.shape == y_hat_pre.shape
        tensorboard_logs = defaultdict(dict)
        for b in range(y_pre.shape[0]):
            for c in range(y_pre.shape[1]):
                loss = self.get_loss(y_pre[b][c], y_hat_pre[b][c])

                tensorboard_logs["train_loss_unscaled"][str(c)] = loss
        loss = torch.stack(
            list(tensorboard_logs["train_loss_unscaled"].values())
        ).mean()
        tensorboard_logs["_train_loss_unscaled"] = loss
        # model.logger.log_metrics(tensorboard_logs)
        return {
            "loss": loss.true_divide(model.data.out_var * self.hparams.out_days),
            "_log": tensorboard_logs,
        }

    def validation_step(self, model, batch):
        """
        Called inside the validation loop with the data from the validation dataloader \
passed in as `batch`.

        :param model: The chosen model
        :type model: Model
        :param batch: Batch of input and ground truth variables
        :type batch: int
        :return: Loss and logs
        :rtype: dict
        """

        # forward pass
        x, y_pre = batch
        y_hat_pre = model(x)
        y_pre, y_hat_pre = self.apply_mask(y_pre, y_hat_pre)

        assert y_pre.shape == y_hat_pre.shape
        tensorboard_logs = defaultdict(dict)
        for b in range(y_pre.shape[0]):
            for c in range(y_pre.shape[1]):
                y, y_hat = y_pre[b][c], y_hat_pre[b][c]
                loss = self.get_loss(y, y_hat)

                # Accuracy for a threshold
                abs_diff = (y - y_hat).abs()
                acc = (abs_diff < self.hparams.thresh).float().mean()
                mae = abs_diff.mean()

                tensorboard_logs["val_loss"][str(c)] = loss
                tensorboard_logs["acc"][str(c)] = acc
                tensorboard_logs["mae"][str(c)] = mae

        val_loss = torch.stack(list(tensorboard_logs["val_loss"].values())).mean()
        tensorboard_logs["_val_loss"] = val_loss
        # model.logger.log_metrics(tensorboard_logs)
        return {
            "val_loss": val_loss,
            "log": tensorboard_logs,
        }

    def inference_step(self, y_pre, y_hat_pre):
        """
        Run inference for the target and predicted values and return the loss and the \
metrics values as logs.

        For every channel the MSE, thresholded accuracy and MAE are logged, both
        over the whole value range and (when ``hparams.binned`` is set) per bin.

        NOTE(review): log entries are keyed by channel only, so with more than
        one batch element each element overwrites the previous one and only the
        last is reported.

        :param y_pre: Label values
        :type y_pre: torch.Tensor
        :param y_hat_pre: Predicted value
        :type y_hat_pre: torch.Tensor
        :return: Loss and the log dictionary, or ``None`` when a masked label \
tensor is empty
        :rtype: tuple or None
        """
        y_pre, y_hat_pre = self.apply_mask(y_pre, y_hat_pre)

        tensorboard_logs = defaultdict(dict)

        for b in range(y_pre.shape[0]):
            for c in range(y_pre.shape[1]):
                y = y_pre[b][c]
                y_hat = y_hat_pre[b][c]

                if self.hparams.boxcox and not self.hparams.benchmark:
                    # Negative predictions give NaN after inverse-boxcox
                    # (this clamps in place, on a slice of the masked predictions)
                    y_hat[y_hat < 0] = 0
                    y_hat = torch.from_numpy(
                        inv_boxcox(y_hat.cpu().numpy(), self.hparams.boxcox)
                    ).to(y_hat.device)

                # NOTE(review): `test_step` and `benchmark_step` unpack the
                # result into two names, so this `None` fails there with a
                # TypeError — confirm how callers are meant to handle it.
                if not y.numel():
                    return None

                pre_loss = (y_hat - y) ** 2

                # The metric helpers below work on the half-open label interval
                # (low, high]. NOTE(review): when called with low = y.min() the
                # strict lower bound leaves out the elements equal to the minimum.
                loss = lambda low, high: pre_loss[(y > low) & (y <= high)].mean()
                # NaN is the only value that differs from itself: fail on a NaN loss.
                assert loss(y.min(), y.max()) == loss(y.min(), y.max())

                # Accuracy for a threshold
                acc = (
                    lambda low, high: (
                        (y - y_hat)[(y > low) & (y <= high)].abs() < self.hparams.thresh
                    )
                    .float()
                    .mean()
                )

                # Mean absolute error
                mae = (
                    lambda low, high: (y - y_hat)[(y > low) & (y <= high)]
                    .abs()
                    .float()
                    .mean()
                )

                # Metrics over the whole value range
                tensorboard_logs["mse"][str(c)] = loss(y.min(), y.max())
                tensorboard_logs["acc"][str(c)] = acc(y.min(), y.max())
                tensorboard_logs["mae"][str(c)] = mae(y.min(), y.max())

                # Inference on binned values
                if self.hparams.binned:
                    # One entry per pair of consecutive bin edges ...
                    for i in range(len(self.bin_intervals) - 1):
                        low, high = (
                            self.bin_intervals[i],
                            self.bin_intervals[i + 1],
                        )
                        tensorboard_logs[f"mse_{low}_{high}"][str(c)] = loss(low, high)
                        tensorboard_logs[f"acc_{low}_{high}"][str(c)] = acc(low, high)
                        tensorboard_logs[f"mae_{low}_{high}"][str(c)] = mae(low, high)
                    # ... plus an open-ended bin from the last edge up to y.max()
                    tensorboard_logs[f"mse_{self.bin_intervals[-1]}inf"][str(c)] = loss(
                        self.bin_intervals[-1], y.max()
                    )
                    tensorboard_logs[f"acc_{self.bin_intervals[-1]}inf"][str(c)] = acc(
                        self.bin_intervals[-1], y.max()
                    )
                    tensorboard_logs[f"mae_{self.bin_intervals[-1]}inf"][str(c)] = mae(
                        self.bin_intervals[-1], y.max()
                    )

        # Overall loss: mean of the whole-range MSE over the channels
        inference_loss = torch.stack(list(tensorboard_logs["mse"].values())).mean()
        tensorboard_logs["_inference_loss"] = inference_loss

        return inference_loss, tensorboard_logs

    def test_step(self, model, batch):
        """
        Called inside the testing loop with the data from the testing dataloader \
passed in as `batch`.

        :param model: The chosen model
        :type model: Model
        :param batch: Batch of input and ground truth variables
        :type batch: int
        :return: Loss and logs
        :rtype: dict
        """
        x, y_pre = batch
        y_hat_pre = model(x)

        test_loss, tensorboard_logs = self.inference_step(y_pre, y_hat_pre)

        return {
            "mse": test_loss,
            "log": tensorboard_logs,
        }

    def benchmark_step(self, batch):
        """
        Called inside the testing loop with the data from the testing dataloader \
passed in as `batch`.

        :param model: The chosen model
        :type model: Model
        :param batch: Batch of input and ground truth variables
        :type batch: int
        :return: Loss and logs
        :rtype: dict
        """
        y_hat_pre, y_pre = batch

        benchmark_loss, tensorboard_logs = self.inference_step(y_pre, y_hat_pre)

        return {
            "mse": benchmark_loss,
            "log": tensorboard_logs,
        }
コード例 #27
0
# Script entry point: repeatedly train base models on randomly under-sampled
# training data and evaluate each one on the test split.
# NOTE(review): this excerpt ends inside the `while True` loop; no `break`,
# no update of `i`, and no use of `auc_thres`, `k` or `models` is visible here.
if __name__=='__main__':
    # Command-line settings (parsed into `args` elsewhere in the file).
    ty,auc_thres,k = str(args.type),float(args.threshold),int(args.num)
    # load the data
    # NOTE(security): pickle.load executes arbitrary code from the file; only
    # load `data_<type>.pkl` files from a trusted source.
    with open(f'data_{ty}.pkl', 'rb') as f:
        data = pickle.load(f)
    X_train,y_train,X_test,y_test = data['X_train'],data['y_train'],data['X_test'],data['y_test']
    print(X_train.shape,y_train.shape,X_test.shape,y_test.shape)
    print(np.sum(y_train))
    # Names of the candidate base-model types; only 'SVM' is listed.
    pool = ['SVM']
    # train base models
    models,i = [],1
    while True:
        # use RandomUnderSampler to sample
        # (a fresh random seed per iteration gives each base model a different subset)
        rus = RandomUnderSampler(random_state=random.randint(1000,9999))
        X_resampled, y_resampled = rus.fit_resample(X_train, y_train)
        # train
        locModel = random.choice(pool)
        clf = get_base_model(locModel)
        clf.fit(X_resampled, y_resampled)
        # predict
        ypro_pre = clf.predict_proba(X_test)
        # Predicted class = column with the highest probability, as a column vector.
        y_pre = ypro_pre.argmax(axis=1).reshape(-1,1)
        # evaluate
        # assumes y_test is also an (n, 1) column so the comparison is
        # element-wise — TODO confirm
        acc = (sum(y_pre==y_test)/len(y_pre))[0]
        fpr, tpr, thresholds = metrics.roc_curve(y_test, ypro_pre[:, 1])
        auc = metrics.auc(fpr, tpr)
        # Precision/recall/F1 use the class-1 probability thresholded at 0.5.
        pre = metrics.precision_score(y_test, ypro_pre[:, 1]>0.5)
        rec = metrics.recall_score(y_test, ypro_pre[:, 1]>0.5)
        f1 = metrics.f1_score(y_test, ypro_pre[:, 1]>0.5)
        print(f'Base Model {i}: {locModel}')
コード例 #28
0
def preprocessing(betas,
                  labels,
                  cpg_sites,
                  index,
                  threshold_to_drop=0.1,
                  test_size=0.3,
                  sampling_strategy=0.5,
                  fill_na_strategy='knn',
                  smote=False,
                  undersample=False,
                  train_test=True):
    """Clean a beta matrix, impute missing values and optionally split/rebalance.

    Steps, in order: drop samples whose label is NaN, drop sparse columns and
    rows through ``drop_columns`` / ``drop_rows``, impute the remaining NaNs,
    then either build a standardised train/test split (optionally rebalanced)
    or return everything as a single ``DataFrame``.

    :param betas: 2-D array of values, one row per sample.
    :param labels: 1-D numeric array of labels aligned with the rows of
        ``betas``; NaN marks a missing label.
    :param cpg_sites: Column identifiers, filtered together with the columns
        and used as the ``DataFrame`` column names.
    :param index: Row identifiers aligned with the rows of ``betas``.
    :param threshold_to_drop: Missing-value fraction passed to
        ``drop_columns`` and reported in the log messages.
    :param test_size: Fraction of samples put in the test split.
    :param sampling_strategy: With ``smote``, every class is oversampled to at
        least this fraction of the majority-class count; with ``undersample``
        it is passed unchanged to ``RandomUnderSampler``.
    :param fill_na_strategy: ``'knn'`` (``fill_remaining_na``), ``'simple'``
        (mean imputation); any other value fills with zeros.
    :param smote: Rebalance the training split with SMOTE (takes precedence
        over ``undersample``).
    :param undersample: Rebalance the training split by random under-sampling.
    :param train_test: When true return the split arrays, otherwise a frame.
    :return: ``(X_train, X_test, y_train, y_test, labels, cpg_sites)`` when
        ``train_test`` is true, where the ``X`` arrays are standardised and
        ``labels`` is the full cleaned label array; otherwise a ``DataFrame``
        of the cleaned values with an integer ``'label'`` column.
    """
    print("=== Drop Columns and Rows ===")
    # Dropping rows for which label is NA
    idx_to_delete = np.where(np.isnan(labels))[0]
    print(f"Dropping {idx_to_delete.shape[0]} because of missing labels")
    labels = np.delete(labels, idx_to_delete)
    betas = np.delete(betas, idx_to_delete, axis=0)
    index = np.delete(index, idx_to_delete)
    print(f"New Shape = {betas.shape}")

    # Dropping columns
    percent_threshold = threshold_to_drop * 100
    print(
        f"Dropping columns which have more than {percent_threshold:.0f}% of values missing"
    )
    betas, cpg_sites = drop_columns(betas,
                                    cpg_sites,
                                    threshold=threshold_to_drop)

    # Dropping rows
    # NOTE(review): `drop_rows` is called without `threshold_to_drop`, so it
    # uses its own default although the message prints the caller's threshold.
    print(
        f"\nDropping rows which have more than {percent_threshold:.0f}% of values missing"
    )
    betas, labels, index = drop_rows(betas, labels, index)

    # Filling remaining NA Values
    print("\n=== Fill remaining NAs ===")
    # Count the NaNs before imputing so the number can be reported afterwards.
    nb_nan = np.sum(np.sum(np.isnan(betas), axis=1), axis=0)
    if fill_na_strategy == 'knn':
        print("Filling remaining NA values using a KNNImputer")
        betas = fill_remaining_na(betas)
    elif fill_na_strategy == 'simple':
        # The message used to say "Median" although the imputer uses the mean.
        print("Filling remaining NA values using a Simple Mean Imputer")
        imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
        betas = imputer.fit_transform(betas)
    else:
        print("Filling remaining NAs with zeros")
        nan_idx = np.where(np.isnan(betas))
        betas[nan_idx] = 0
    print(
        f"{nb_nan} NA were filled, i.e. approximately {nb_nan / betas.shape[0]:.2f} per rows"
    )

    if not train_test:
        df = DataFrame(betas, columns=cpg_sites, index=index)
        df['label'] = labels
        df['label'] = df['label'].astype(int)
        return df

    print("\n=== Train / Test Split ===")
    print("Splitting dataset into train and test")
    print(f"Train = {100 - test_size * 100:.0f} %")
    print(f"Test = {test_size * 100:.0f} %")
    X_train, X_test, y_train, y_test = train_test_split(
        betas, labels, test_size=test_size, random_state=123)

    print("\n=== Standardize dataset ===")
    # Fit the scaler on the training split only, then apply it to both splits.
    scaler = StandardScaler().fit(X_train)
    X_train_scaled = scaler.transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    print(
        f"The average of column mean on train is {np.mean(np.mean(X_train_scaled, axis=1), axis=0):.2f}"
    )
    print(
        f"The average of column mean on test is {np.mean(np.mean(X_test_scaled, axis=1), axis=0):.2f}"
    )

    if smote:
        print("\n=== Balance dataset with oversample ===")
        # Computing multi-class ratio
        unique, count = np.unique(y_train, return_counts=True)
        print(list(zip(unique, count)))
        m = max(count)
        majority_class = unique[np.argmax(count)]

        # Every class will be oversampled to (ratio) * #observations in majority class
        # Except the majority class which is left as is
        resampling_strategy = {
            k: max(c, int(sampling_strategy * m))
            for (k, c) in zip(unique, count)
        }
        resampling_strategy[majority_class] = m

        print(
            f"The resampling_strategy gives the following repartition {resampling_strategy}"
        )
        sm = SMOTE(random_state=123, sampling_strategy=resampling_strategy)
        # `fit_resample` is the supported name (`fit_sample` was removed from
        # imbalanced-learn) and matches the under-sampling branch below.
        X_train_res, y_train_res = sm.fit_resample(X_train_scaled, y_train)
        print(
            f"{X_train_res.shape[0] - X_train_scaled.shape[0]} rows were added in the training data"
        )
    elif undersample:
        print("=== Balance dataset with undersample ===")

        print(
            f"The resampling_strategy gives the following repartition {sampling_strategy}"
        )
        under_sampling = RandomUnderSampler(
            sampling_strategy=sampling_strategy)
        X_train_res, y_train_res = under_sampling.fit_resample(
            X_train_scaled, y_train)
    else:
        X_train_res = X_train_scaled
        y_train_res = y_train

    return X_train_res, X_test_scaled, y_train_res, y_test, labels, cpg_sites
コード例 #29
0
    print("recall_score   : ", recall_score(y_label.iloc[:df_test_len], y_pred, average="macro"))
    print("\nAccuracy Score :",accuracy_score(y_pred,y_label.iloc[:df_test_len]))
    roc_curve_plots(y_label.iloc[:df_test_len],y_pred,X_test,model)

"""## Use techniques like undersampling or oversampling before running Naïve Bayes, Logistic Regression or SVM.
   * ### Oversampling or undersampling can be used to tackle the class imbalance problem
   * ### Oversampling increases the prior probability of imbalanced class and in case of other classifiers, error gets multiplied as the low-proportionate class is mimicked multiple times.
"""

# Build two rebalanced versions of (x, y) with fixed seeds: one by random
# over-sampling and one by random under-sampling; print the resulting shapes.
ros = RandomOverSampler(random_state=42)
X_ros, y_ros = ros.fit_resample(x, y)

print(X_ros.shape, y_ros.shape)

rus = RandomUnderSampler(random_state=42)
X_rus, y_rus = rus.fit_resample(x, y)

print(X_rus.shape, y_rus.shape)

# Gaussian Naive Bayes on the over-sampled data, evaluated on the whole test frame.
gnb = GaussianNB()
df_test_len = df_test.shape[0]
# print(df_test.sample(n=df_test_len))
model_accuracies(model = gnb, x_feature=X_ros,y_label= y_ros, X_test=df_test, df_test_len=df_test_len)

# Logistic regression (class-weighted) on the over-sampled data.
lr = LogisticRegression(class_weight ='balanced')
model_accuracies(model = lr, x_feature=X_ros,y_label= y_ros,X_test=df_test,df_test_len=df_test.shape[0])

# Gaussian Naive Bayes on the under-sampled data.
# NOTE(review): 788 is hard-coded for both the test slice and its length;
# presumably it is the size of the under-sampled set — confirm and derive it
# from the data instead.
gnb = GaussianNB()
model_accuracies(model = gnb, x_feature=X_rus,y_label= y_rus, X_test=df_test.iloc[:788],df_test_len = 788)

# NOTE(review): this model is created but not used within this excerpt.
lr = LogisticRegression(class_weight ='balanced')
#
# ``sampling_strategy`` can be given a ``float``. For **under-sampling
# methods**, it corresponds to the ratio :math:`\\alpha_{us}` defined by
# :math:`N_{m} = \\alpha_{us} \\times N_{rM}` where :math:`N_{rM}` and
# :math:`N_{m}` are the number of samples in the majority class after
# resampling and the number of samples in the minority class, respectively
# (i.e. the minority-to-majority ratio after resampling).

# select only 2 classes since the ratio only makes sense in the binary case
binary_mask = np.bitwise_or(y == 0, y == 2)
binary_y = y[binary_mask]
binary_X = X[binary_mask]

# The same float is used for the under- and the over-sampling example below.
sampling_strategy = 0.8

# Under-sample the binary subset and report the resulting class counts.
rus = RandomUnderSampler(sampling_strategy=sampling_strategy)
X_res, y_res = rus.fit_resample(binary_X, binary_y)
print('Information of the iris data set after making it '
      'balanced using a float and an under-sampling method: \n '
      'sampling_strategy={} \n y: {}'
      .format(sampling_strategy, Counter(y_res)))
plot_pie(y_res)

###############################################################################
# For **over-sampling methods**, it corresponds to the ratio
# :math:`\\alpha_{os}` defined by :math:`N_{rm} = \\alpha_{os} \\times N_{M}`
# where :math:`N_{rm}` and :math:`N_{M}` are the number of samples in the
# minority class after resampling and the number of samples in the majority
# class, respectively.

ros = RandomOverSampler(sampling_strategy=sampling_strategy)
X_res, y_res = ros.fit_resample(binary_X, binary_y)
コード例 #31
0
from imblearn.combine import SMOTEENN, SMOTETomek
from imblearn.pipeline import make_pipeline
from collections import Counter
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
# define oversampling strategy: bring classes 3 and 2 to 20000 samples each
# NOTE(review): `SMOTE` is not among the imports visible directly above
# (only SMOTEENN/SMOTETomek come from imblearn.combine) — confirm it is
# imported elsewhere in the file.
sm = SMOTE(sampling_strategy={3: 20000, 2: 20000}, random_state=1)
X_ov, Y_ov = sm.fit_resample(X_train, Y_train)
print(Counter(Y_ov))

# Then randomly under-sample classes 0 and 1 to 20000 samples each, so that
# the four classes named in the two strategies end up with the same count.
under = RandomUnderSampler(sampling_strategy={
    0: 20000,
    1: 20000
},
                           random_state=1)
X_new, Y_new = under.fit_resample(X_ov, Y_ov)
print(Counter(Y_new))

# Disabled alternative: random over-sampling of the SMOTE output.
# oversample = RandomOverSampler(sampling_strategy=0.1, random_state=1)
# X_new, Y_new = oversample.fit_resample(X_ov, Y_ov)

# # ------------------------------------------------------------- #
# # ---------------------- Encoding Data ------------------------- #
# # ------------------------------------------------------------- #
# Prepare Y values for one-hot encoding

from sklearn.preprocessing import LabelEncoder
from keras.utils import np_utils

# encode class values as integers
encoder = LabelEncoder()
コード例 #32
0
    # Get data
    df_train, df_test = GetData().get()

    # Feature Engineering of df_train
    myFE = FeatEng()
    df_train = myFE.fit_transform(df_train)

    X_train = np.array(df_train.drop(['Y'], axis=1))
    y_train = np.array(df_train[['Y']]).reshape(-1, )

    # Oversampling and Undersampling
    oversample = RandomOverSampler(sampling_strategy=1)
    undersample = RandomUnderSampler(sampling_strategy=1)

    X_over, y_over = oversample.fit_resample(X_train, y_train)
    X_under, y_under = undersample.fit_resample(X_train, y_train)

    # Hyperparameter tuning
    params_dir = os.path.join(os.path.dirname(os.getcwd()), "params")
    # Direct
    myModel = OurModel()
    models_tune = TuningHyperparameters(myModel.models, myModel.params_to_tune)
    models_tune.fit(X_train, y_train)
    print(models_tune.get_best_params())
    choosen_params = models_tune.get_best_params()
    with open(os.path.join(params_dir, 'hyperparameters_direct.json'),
              'w') as f:
        json.dump(choosen_params, f)

    # Oversampling
    myModel = OurModel()
コード例 #33
0
ファイル: creditcard.py プロジェクト: Mushtaq-D5037/Projects
# Stratify: ensures the same proportion of each class is present in y_train and y_test
# (bare expressions: the results are only shown when run interactively)
y_train.value_counts(normalize=True)
y_test.value_counts(normalize=True)
# =============================================================================
# Sampling
# =============================================================================
# Only the training data is resampled.
# Sampling strategies:
# Random Under Sampling : reduces majority class to match minority class
# Random Over Sampling  : increases minority class to match majority class
# SMOTE                 : increases minority class to match majority class by creating synthetic samples
# and many more

# Random Under Sampling
# A float strategy sets the minority-to-majority ratio after resampling, so
# with 0.3 the classes are NOT fully balanced.
# NOTE(review): no random_state is given, so the sample differs between runs.
rus = RandomUnderSampler(sampling_strategy=0.3)
X_rus, y_rus = rus.fit_resample(X_train, y_train)
print(
    f'sampled trained data percentage:\n{y_rus.value_counts(normalize =True)}')
print(f'sampled trained data count:\n{y_rus.value_counts()}')

# =============================================================================
# Feature Selection
# =============================================================================
# Quick strategy for shortlisting variables
# 1. Removing constant variables : standard deviation = 0
# 2. Removing quasi-constant variables
# 3. Removing columns with a high percentage of missing values
# 4. Removing highly correlated variables
# 5. Removing variables with a low univariate ROC-AUC (cut-off 50% or 55%)

# 1. Constant Features
コード例 #34
0
def prep(df):
    """Sample, clean, encode, rebalance, scale and split a raw dataset.

    The function reads three names from the enclosing module:
    ``class_name`` (label of the target column), ``test_rate`` (passed as
    ``test_size`` to ``train_test_split``) and ``report_file`` (an object
    whose ``write`` method receives the textual report lines).

    Processing steps, in order:

    1. Randomly keep a fraction of ``2400 / len(df)`` of the rows (capped at
       the whole frame), with a fixed seed.
    2. Drop duplicated rows, then rows containing missing values.
    3. Replace every object-dtype column except the target by its dummy
       columns, leaving out the first dummy of each column.
    4. If the most frequent class covers at least 60% of the rows,
       under-sample with ``RandomUnderSampler(random_state=42)``.
    5. Scale the features to ``[0, 1]`` with ``MinMaxScaler``.
    6. Split into stratified train and test sets (seed 0).

    Args:
        df: pandas DataFrame that contains the column ``class_name``.

    Returns:
        The tuple ``(x_train, x_test, y_train, y_test)`` produced by
        ``train_test_split``.

    Raises:
        ZeroDivisionError: if ``df`` has no rows.
    """
    # --- 1. sample the data -------------------------------------------------
    # min() keeps the original semantics: the fraction never exceeds 1.
    data_rate = min(2400 / df.shape[0], 1)
    report_file.write(f"data portion used: {data_rate!s}\n")
    df = df.sample(frac=data_rate, replace=False, random_state=0)

    # --- 2. remove repeated and incomplete rows -----------------------------
    df = df.drop_duplicates()
    df = df.dropna(axis=0)

    # --- 3. binarize categorical (object dtype) features --------------------
    categorical = [
        col for col in df.columns
        if df[col].dtype == 'O' and col != class_name
    ]
    if categorical:
        # The first dummy column of every feature is dropped (it is implied
        # by the remaining ones).
        dummies = [
            pd.get_dummies(df[col], prefix=col).iloc[:, 1:]
            for col in categorical
        ]
        encoded = pd.concat(dummies, axis=1)
        df = pd.concat([df.drop(categorical, axis=1), encoded], axis=1)

    print(df.shape)
    report_file.write(f"data size: {df.shape!s}\n")

    # --- class distribution -------------------------------------------------
    target_counts = df[class_name].value_counts()
    majority_rate = max(target_counts) / sum(target_counts)
    print(majority_rate)
    report_file.write(f"portion of class: {majority_rate!s}\n")

    # --- reduce to feature set and class ------------------------------------
    X_all = df.drop([class_name], axis=1)
    y_all = df[class_name]

    # --- 4. rebalance when one class dominates ------------------------------
    # NOTE(review): the else-branch converts y to a NumPy array while the
    # rebalanced branch keeps whatever fit_resample returns, so the type of
    # y_train / y_test may differ between the two paths -- confirm callers
    # do not depend on it.
    if majority_rate >= 0.6:
        print("Rebalancing data")
        sm = RandomUnderSampler(random_state=42)
        X_all, y_all = sm.fit_resample(X_all, y_all)
        print("Y:", y_all)
    else:
        y_all = y_all.values

    # --- 5. normalize -------------------------------------------------------
    # NOTE(review): the scaler is fitted on all rows before the train/test
    # split, so test-set minima/maxima influence the training features.
    X_all = preprocessing.MinMaxScaler((0, 1)).fit_transform(X_all)

    # --- 6. generate train and test set -------------------------------------
    x_train, x_test, y_train, y_test = train_test_split(
        X_all,
        y_all,
        test_size=test_rate,
        stratify=y_all,
        random_state=0,
    )

    return x_train, x_test, y_train, y_test
コード例 #35
0
directory = get_working_directories(
    'pipeline/vegetation-binary',
    ['data', 'params', 'results', 'model'],
)

# ---------------------------------------------------------------------------
# Read data
# ---------------------------------------------------------------------------
cols, rows, bands, data = read_binary(
    'data/update-2020-09/stack_v2.bin', to_string=False)

# The first 11 columns of the stack are used as features.
X = data[:, :11]
# Fraction of the samples held out by train_test_split below.
test_size = .8

# Binary target: bitwise OR of every column 11..23 that is flagged in
# is_vegetation (defined outside this snippet).
y = np.zeros((cols * rows), dtype=int)
for x in range(11, 24):
    if not is_vegetation[x]:
        continue
    y = data[:, x].astype(int) | y

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size)

# Under-sample the training split, then standardise it; the fitted scaler is
# reused further down for the full feature matrix.
rus = RandomUnderSampler(sampling_strategy=1)
X_train_sub, y_train_sub = rus.fit_resample(X_train, y_train)
scaler = StandardScaler()
X_train_sub = scaler.fit_transform(X_train_sub)

pipeline = Pipeline([
    ('kmeans', KMeans()),
    ('rf', RandomForestClassifier(n_jobs=-1)),
])

# Hyper-parameter grid, keyed as "<step name>__<parameter>".
param_grid = {
    'kmeans__n_clusters': range(1, 101, 25),
    'rf__max_depth': [3, 8],
    'rf__max_features': [0.1, 0.5],
    'rf__n_estimators': [50, 500],
}
grid_clf = GridSearchCV(pipeline, param_grid, cv=3, verbose=0, n_jobs=-1)

# ---------------------------------------------------------------------------
# Fit, predict, and display results
# ---------------------------------------------------------------------------
grid_clf.fit(X_train_sub, y_train_sub)

X_scaled = scaler.transform(X)
コード例 #36
0
print(__doc__)

# Generate the dataset
X, y = make_classification(n_classes=2, class_sep=2, weights=[0.1, 0.9],
                           n_informative=3, n_redundant=1, flip_y=0,
                           n_features=20, n_clusters_per_class=1,
                           n_samples=200, random_state=10)

# Instantiate a PCA object for the sake of easy visualisation
pca = PCA(n_components=2)
# Fit and transform X to visualise inside a 2D feature space
X_vis = pca.fit_transform(X)

# Apply the random under-sampling.
# The `return_indices` constructor argument was deprecated in
# imbalanced-learn 0.4 and removed in 0.6; the indices of the kept samples
# are read from the fitted `sample_indices_` attribute instead.
rus = RandomUnderSampler()
X_resampled, y_resampled = rus.fit_resample(X, y)
idx_resampled = rus.sample_indices_
X_res_vis = pca.transform(X_resampled)

fig = plt.figure()
ax = fig.add_subplot(1, 1, 1)

# Every original sample index that the sampler did not keep.
idx_samples_removed = np.setdiff1d(np.arange(X_vis.shape[0]),
                                   idx_resampled)

# Plot the kept samples per class, then the samples that were discarded.
idx_class_0 = y_resampled == 0
plt.scatter(X_res_vis[idx_class_0, 0], X_res_vis[idx_class_0, 1],
            alpha=.8, label='Class #0')
plt.scatter(X_res_vis[~idx_class_0, 0], X_res_vis[~idx_class_0, 1],
            alpha=.8, label='Class #1')
plt.scatter(X_vis[idx_samples_removed, 0], X_vis[idx_samples_removed, 1],
            alpha=.8, label='Removed samples')
コード例 #37
0
    # Directory that holds the preprocessed dataset files.
    DATASET_PATH = Path('data/preprocessed_data/dataset_10k')

    print('Loading data')

    # Column 0 of each JSON file is loaded as a plain Python list.
    # SENTENCE_LIST and LABEL_LIST are defined outside this fragment.
    X = pd.read_json(DATASET_PATH.joinpath(SENTENCE_LIST))[0].tolist()
    y = pd.read_json(DATASET_PATH.joinpath(LABEL_LIST))[0].tolist()

    print('Splitting data')

    # Hold out 30% of the samples for testing; fixed seed for repeatability.
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.3,
                                                        random_state=22)

    # Under-sample the training split only; the test split is left as is.
    # The inputs are wrapped into a single-column 2-D array for the sampler
    # (presumably because fit_resample expects 2-D X -- TODO confirm) and
    # squeezed back to 1-D afterwards.
    rus = RandomUnderSampler()
    X_train, y_train = rus.fit_resample(
        np.asarray(X_train).reshape(-1, 1), y_train)
    X_train = X_train.squeeze()
    y_train = y_train.squeeze()

    print('Transforming input data')
    # Transform train data
    from sklearn.feature_extraction.text import CountVectorizer

    # Count unigrams and bigrams; min_df=10 drops terms that occur in fewer
    # than 10 documents (scikit-learn semantics).
    count_vect = CountVectorizer(min_df=10, ngram_range=(1, 2))

    # The vocabulary is learned on the training data only; the test data is
    # transformed with that same vocabulary.
    X_train_counts = count_vect.fit_transform(X_train)
    X_test_counts = count_vect.transform(X_test)

    # from sklearn.feature_extraction.text import TfidfTransformer
    # tfidf_transformer = TfidfTransformer(use_idf=True)
    #
コード例 #38
0
    #    result = hdd

    x = result.iloc[:, :-1].values
    y = result.iloc[:, -1].values

    from imblearn.under_sampling import RandomUnderSampler
    rus = RandomUnderSampler()

    del hdd
    del hdd_extra
    del hdd_merged
    del result
    gc.collect()

    X_resampled, y_resampled = rus.fit_resample(x, y)
    print(Counter(y_resampled))
    #    X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.1, random_state=42)
    clf = RandomForestClassifier()
    clf.fit(X_resampled, y_resampled)

    features = [1, 4, 5, 7, 9, 12, 188, 193, 194, 197, 198, 199]
    columns_specified = []
    for feature in features:
        columns_specified += ["smart_{0}_raw".format(feature)]

    stripe_size = 50

    thresholds = np.arange(start=0, stop=.505, step=.005)

    output_file = open(