コード例 #1
0
def test_pipeline_sample():
    """Check that a Pipeline whose last step is a sampler can resample.

    Two layouts are exercised: a pipeline holding only the sampler, and a
    pipeline with a PCA step placed in front of the sampler.
    """
    X, y = make_classification(
        n_classes=2,
        class_sep=2,
        weights=[0.1, 0.9],
        n_informative=3,
        n_redundant=1,
        flip_y=0,
        n_features=20,
        n_clusters_per_class=1,
        n_samples=5000,
        random_state=0,
    )

    rus = RandomUnderSampler(random_state=0)

    # Sampler-only pipeline: fit().sample(), fit_sample() and the bare
    # sampler are all expected to produce the same output.
    sampler_only = Pipeline([('rus', rus)])
    X_via_sample, y_via_sample = sampler_only.fit(X, y).sample(X, y)
    X_via_fit_sample, y_via_fit_sample = sampler_only.fit_sample(X, y)
    X_direct, y_direct = rus.fit_sample(X, y)
    assert_array_almost_equal(X_via_sample, X_via_fit_sample)
    assert_array_almost_equal(X_via_sample, X_direct)
    assert_array_almost_equal(y_via_sample, y_via_fit_sample)
    assert_array_almost_equal(y_via_sample, y_direct)

    # PCA followed by the sampler: compare against running both steps by hand.
    pca = PCA()
    pca_then_rus = Pipeline([('pca', pca), ('rus', rus)])

    X_via_sample, y_via_sample = pca_then_rus.fit(X, y).sample(X, y)
    X_projected = pca.fit_transform(X)
    X_manual, y_manual = rus.fit_sample(X_projected, y)
    assert_array_almost_equal(X_via_sample, X_manual)
    assert_array_almost_equal(y_via_sample, y_manual)
コード例 #2
0
def down_sample(X, y, random_state):
    """Randomly under-sample ``X`` and ``y`` and return the kept rows.

    The sampler is fitted only to obtain ``sample_indices_``; the rows are
    then taken positionally from the inputs with ``.iloc`` and re-indexed
    from zero.

    :param X: feature table supporting ``.iloc[rows, :]``
    :param y: labels supporting ``.iloc[rows]``
    :param random_state: seed forwarded to ``RandomUnderSampler``
    :return: tuple ``(X_train, y_train)`` holding the selected rows
    """
    sampler = RandomUnderSampler(random_state=random_state)
    # The resampled arrays themselves are not needed, only the fitted state.
    _, _ = sampler.fit_sample(X, y)
    kept_rows = sampler.sample_indices_
    return (
        X.iloc[kept_rows, :].reset_index(drop=True),
        y.iloc[kept_rows].reset_index(drop=True),
    )
コード例 #3
0
    def balanceInputData(self):
        """Balance the training classes by random under-sampling.

        ``self.input_train`` (expected to be 4-D: samples x height x width x
        channels) is flattened to one row per sample because the sampler
        works on 2-D input. It is under-sampled together with
        ``self.output_train`` and reshaped back to its per-sample shape.
        The resampled labels are one-hot encoded with ``self._NUM_CLASSES``
        columns. Both attributes are overwritten with the balanced data.
        """
        # Keep the per-sample dimensions so the balanced data can be restored
        # to them rather than to hard-coded image dimensions.
        n_samples, height, width, channels = self.input_train.shape
        X_trainFlat = self.input_train.reshape(n_samples,
                                               height * width * channels)

        sampler = RandomUnderSampler(ratio='auto')
        X_trainRos, Y_trainRos = sampler.fit_sample(X_trainFlat,
                                                    self.output_train)

        # Encode labels as one-hot vectors (e.g. 2 -> [0,0,1,0,0,0,0,0,0,0]).
        Y_trainRosHot = to_categorical(Y_trainRos,
                                       num_classes=self._NUM_CLASSES)

        # One reshape call restores every sample at once; no per-row loop is
        # needed.
        self.input_train = X_trainRos.reshape(len(X_trainRos), height, width,
                                              channels)
        self.output_train = Y_trainRosHot
コード例 #4
0
ファイル: SVM_1.py プロジェクト: TokeF/ML
def SVM_classify(X: np.array, lbl: np.array):
    """Train a polynomial SVC on an under-sampled split and print diagnostics.

    The data is split 80/20, the training part is randomly under-sampled,
    and a degree-10 polynomial SVC is fitted on it. The confusion matrix,
    classification report, true and predicted test labels are printed,
    followed by the mean 5-fold cross-validation score of the classifier on
    the full (not under-sampled) ``X``/``lbl``.

    :param X: feature matrix
    :param lbl: class labels aligned with ``X``
    """
    from sklearn.model_selection import train_test_split, cross_val_score
    from imblearn.under_sampling import RandomUnderSampler
    from imblearn.over_sampling import RandomOverSampler
    from sklearn.svm import SVC
    from sklearn.metrics import classification_report, confusion_matrix

    # Hold out 20% of the samples for evaluation.
    X_train, X_test, lbl_train, lbl_test = train_test_split(
        X, lbl, test_size=0.20)

    # Balance only the training part.
    sampler = RandomUnderSampler(return_indices=True)
    X_balanced, lbl_balanced, kept_idx = sampler.fit_sample(X_train, lbl_train)

    classifier = SVC(kernel='poly', degree=10, gamma='auto')
    classifier.fit(X_balanced, lbl_balanced)
    predicted = classifier.predict(X_test)

    print(confusion_matrix(lbl_test, predicted))
    print(classification_report(lbl_test, predicted))
    print(lbl_test)
    print(predicted)

    cv_scores = cross_val_score(classifier, X, lbl, cv=5)
    print(cv_scores.mean())
コード例 #5
0
def resample(X_train=None, y_train=None, df=None, balance=None, nclass=None):
    """
    Perform resampling based on chosen method
    :param X_train:  tensor of training data (size, dimension)
    :param y_train:  tensor of training target (size, 1)
    :param df: dataframe to resample from (used by 'Bootstrap' / 'Handsample')
    :param balance: type of resample to perform: 'SMOTE',
        'RandomUnderSampler', 'Bootstrap' or 'Handsample'; any other value
        leaves the inputs untouched
    :param nclass: forwarded to ``bootstrap`` for 'Bootstrap' / 'Handsample'
    :return: ``(X_train, y_train)`` for 'SMOTE', 'RandomUnderSampler' and
        unrecognised values; the resampled dataframe for 'Bootstrap' and
        'Handsample'
    """

    if balance == 'SMOTE':
        X_train, y_train = SMOTE().fit_sample(X_train, y_train)
        logger.info(f'Using {balance} oversampling')
    elif balance == 'RandomUnderSampler':
        rus = RandomUnderSampler(random_state=0)
        X_train, y_train = rus.fit_sample(X_train, y_train)
        # This branch removes samples, so it is reported as undersampling.
        logger.info(f'Using {balance} undersampling')
    elif balance == 'Bootstrap':
        logger.info(f'Using {balance} oversampling')
        return bootstrap(df, nclass)
    elif balance == 'Handsample':
        logger.info(f'Using {balance} oversampling')
        return bootstrap(df, nclass, if_new=True)

    return X_train, y_train
コード例 #6
0
 def downsample(self):
     """Randomly under-sample ``self.X``/``self.y`` in place.

     Class counts are printed before and after, and ``self.Xview`` is
     rebuilt as a view over the first ``self.n_features`` columns.
     """
     print('Current outcome sampling {}'.format(Counter(self.y)))
     sampler = RandomUnderSampler()
     X_balanced, y_balanced = sampler.fit_sample(self.X, self.y)
     self.X, self.y = X_balanced, y_balanced
     # Refresh the feature view so it tracks the resampled array.
     self.Xview = self.X.view()[:, :self.n_features]
     print('Resampled dataset shape {}'.format(Counter(self.y)))
コード例 #7
0
    def downsample(self):
        """Balance class data based on outcome.

        ``self.X``/``self.y`` are replaced by a randomly under-sampled
        version (fixed seed 0). The indices returned by the sampler and the
        resampled labels are appended to the files ``downsampled_idx`` and
        ``downsampled_y`` in the current working directory. ``self.Xview``
        is rebuilt over the first ``self.n_features`` columns.
        """
        print('Current outcome sampling {}'.format(Counter(self.y)))

        # Fixed seed so the selection is reproducible; indices are returned
        # so they can be logged below.
        rus = RandomUnderSampler(random_state=0, return_indices=True)
        self.X, self.y, ds_idx = rus.fit_sample(self.X, self.y)

        # Context managers guarantee the files are closed even if a write
        # fails.
        with open('downsampled_idx', 'a') as idx_file:
            idx_file.write(str(ds_idx) + '\n')

        with open('downsampled_y', 'a') as y_file:
            y_file.write(str(self.y) + '\n')

        self.Xview = self.X.view()[:, :self.n_features]
        print('Resampled dataset shape {}'.format(Counter(self.y)))
コード例 #8
0
    def fit_divided_hab(self, df_X, df_y):
        """Train ``self.n_estimators`` classifiers, each on its own balanced draw.

        For every round the data is randomly under-sampled, projected with
        ``self.decomposor``, and split 70/30. A clone of ``self.estimator``
        is scored with 10-fold cross-validated ROC AUC on the training part
        and then fitted on it. The score, fitted clone, sampler, held-out
        labels and decision scores are appended to ``self.score_weights``,
        ``self.clfs``, ``self.imbs``, ``self.y_tests`` and ``self.y_scores``.
        """
        for round_no in range(1, self.n_estimators + 1):
            imb = RandomUnderSampler()
            X_balanced, y_balanced = imb.fit_sample(df_X, df_y)
            X_projected = self.decomposor.transform(X_balanced)
            X_train, X_test, y_train, y_test = train_test_split(
                X_projected, y_balanced, test_size=0.3, random_state=0)

            clf = clone(self.estimator)
            auc_scorer = make_scorer(roc_auc_score)
            fold_scores = cross_val_score(
                clf, X_train, y_train, cv=10, scoring=auc_scorer)
            score = fold_scores.mean()
            if self.verbose:
                print("Estimator %d:" % round_no)
                print("AUC: %.2f" % score)
            self.score_weights.append(score)

            # Fit on the full training part and score the held-out part.
            y_score = clf.fit(X_train, y_train).decision_function(X_test)
            self.clfs.append(clf)
            self.imbs.append(imb)
            self.y_tests.append(y_test)
            self.y_scores.append(y_score)
コード例 #9
0
    def customSampler(self, X, Y):
        """Run a random under-sampler on ``X``/``Y`` and return the inputs as-is.

        :param X: samples
        :param Y: labels
        :return: ``(X, Y, index)`` where ``index`` is the list
            ``[0, 1, ..., len(X) - 1]``
        """
        sampler = RandomUnderSampler(return_indices=True,
                                     replacement=False,
                                     ratio=.9)

        index = list(range(len(X)))

        # NOTE(review): the resampled output is computed but discarded and
        # the untouched inputs are returned — confirm this is intentional.
        _, _, _ = sampler.fit_sample(X, Y)

        return X, Y, index
コード例 #10
0
 def downsample(self):
     """Replace ``self.X``/``self.y`` with a randomly under-sampled version.

     Prints the class counts before and after resampling and rebuilds
     ``self.Xview`` over the first ``self.n_features`` columns.
     """
     print('Current outcome sampling {}'.format(Counter(self.y)))
     balanced = RandomUnderSampler().fit_sample(self.X, self.y)
     self.X, self.y = balanced
     # Keep the feature view in sync with the resampled array.
     self.Xview = self.X.view()[:, :self.n_features]
     print('Resampled dataset shape {}'.format(Counter(self.y)))
コード例 #11
0
def data_under_sampling(X, y, under_sample_type, ratio, random_state=None):
    '''
    Under-sample a dataset with the selected imbalanced-learn sampler.

    :param X: data
    :param y: labels
    :param under_sample_type: under-sampling type (str): 'Random',
        'NearMiss' or 'EditedNearestNeighbours'
    :param ratio: ratio = (minority samples) / (majority samples); it is
        not forwarded to 'EditedNearestNeighbours'
    :param random_state: seed for the random sampling
    :return: the resampled data and labels returned by ``fit_sample``
    :raises ValueError: if ``under_sample_type`` is not a supported name
    '''
    if under_sample_type == 'Random':
        from imblearn.under_sampling import RandomUnderSampler
        under_sampler = RandomUnderSampler(ratio=ratio,
                                           random_state=random_state)
    elif under_sample_type == 'NearMiss':
        from imblearn.under_sampling import NearMiss
        under_sampler = NearMiss(ratio=ratio,
                                 random_state=random_state,
                                 version=1)
    elif under_sample_type == 'EditedNearestNeighbours':
        from imblearn.under_sampling import EditedNearestNeighbours
        under_sampler = EditedNearestNeighbours(random_state=random_state)
    else:
        # Fail with a clear message instead of an AttributeError on None.
        raise ValueError(
            'Unsupported under_sample_type: {!r}'.format(under_sample_type))
    X_resampled, y_resampled = under_sampler.fit_sample(X, y)
    return X_resampled, y_resampled
コード例 #12
0
def pipeline(X_train, X_test, y_train, y_test, model):
    """Pick the estimator / resampling combination with the best ROC AUC.

    Every estimator in ``model`` is fitted on four versions of the training
    data (original, random under-sampling, random over-sampling, borderline
    SMOTE) and scored with ``roc_auc_score`` on the untouched test data.

    :param X_train: training features
    :param X_test: test features
    :param y_train: training labels
    :param y_test: test labels
    :param model: iterable of estimators exposing ``fit`` and ``predict``
    :return: ``(best_estimator, best_score)``; on ties the later
        combination wins
    :raises ValueError: if ``model`` yields no estimator
    """
    import copy

    datasets = [(X_train, X_test, y_train, y_test)]

    rus = RandomUnderSampler(random_state=9)
    X_under, y_under = rus.fit_sample(X_train, y_train)
    datasets.append((X_under, X_test, y_under, y_test))

    ros = RandomOverSampler(random_state=9)
    X_over, y_over = ros.fit_sample(X_train, y_train)
    datasets.append((X_over, X_test, y_over, y_test))

    smote = SMOTE(random_state=9, kind='borderline2')
    X_smote, y_smote = smote.fit_sample(X_train, y_train)
    datasets.append((X_smote, X_test, y_smote, y_test))

    best_model = None
    best_score = 0
    for estimator in model:
        for train_X, test_X, train_y, test_y in datasets:
            estimator.fit(train_X, train_y)
            score = roc_auc_score(test_y, estimator.predict(test_X))
            if score >= best_score:
                # Snapshot the fitted state: the same estimator object is
                # refitted on the following datasets, which would otherwise
                # leave the returned model out of sync with its score.
                best_model = copy.deepcopy(estimator)
                best_score = score

    if best_model is None:
        raise ValueError('model must contain at least one estimator')
    return best_model, best_score
コード例 #13
0
ファイル: undersample.py プロジェクト: brettin/pilot1-docs
def undersample(X, y, bal_strategy):
    """Under-sample ``X``/``y`` with the requested balancing strategy.

    :param X: samples; must expose ``.shape``
    :param y: labels; must expose ``.shape``
    :param bal_strategy: ``"RANDOM"`` or ``"ALL"`` for random
        under-sampling, ``"TOMEK"`` for Tomek-links cleaning, ``'NONE'`` to
        return the inputs unchanged
    :return: tuple ``(X_sampled, y_sampled)``

    Any other strategy prints a message and terminates the process with
    exit status 1. The shapes of the inputs and outputs are printed.
    """
    print('Shape of X: ', X.shape)
    print('Shape of y_Train: ', y.shape)

    # "ALL" is handled by the random branch; it can never reach the Tomek
    # branch because the first matching condition wins.
    if bal_strategy in ("RANDOM", "ALL"):
        # apply random under-sampling
        rus = RandomUnderSampler()
        X_sampled, y_sampled = rus.fit_sample(X, y)
    elif bal_strategy == "TOMEK":
        # Apply Tomek Links cleaning
        tl = TomekLinks()
        X_sampled, y_sampled = tl.fit_sample(X, y)
    elif bal_strategy == 'NONE':
        X_sampled = X
        y_sampled = y
    else:
        print('bal_stragegy not in ALL, RANDOM, TOMEK, NONE')
        sys.exit(1)

    print('Shape of X_sampled: ', X_sampled.shape)
    print('Shape of y_sampled: ', y_sampled.shape)

    return (X_sampled, y_sampled)
    def __init__(self,
                 X,
                 y,
                 Number_trials,
                 maxdepth_settings=None,
                 scaler=None):
        """Evaluate decision trees of several depths over repeated splits.

        For each seed in ``1..Number_trials`` the data is split 75/25
        (stratified), the training part is randomly under-sampled and
        optionally scaled, and one ``DecisionTreeClassifier`` per depth is
        fitted and scored. Mean and standard deviation of the train/test
        accuracies per depth are stored on ``self.sc_train``,
        ``self.score``, ``self.std_train`` and ``self.std_score``. A tree of
        the best mean test depth is then fitted to pick
        ``self.top_predictor`` and to draw a bar chart of feature
        importances.

        :param X: feature table; ``X.columns`` is used for labelling
        :param y: class labels
        :param Number_trials: number of random splits to evaluate
        :param maxdepth_settings: depths to try; when omitted the value
            already present on ``self.maxdepth_settings`` is used
        :param scaler: optional object with ``fit``/``transform``; fitted on
            the under-sampled training part of every split
        """
        score_train = []
        score_test = []
        if maxdepth_settings is not None:
            self.maxdepth_settings = maxdepth_settings
            self.var = maxdepth_settings
        # Read the depths back from the instance so the progress total, the
        # training loop and the final selection all use the same values,
        # including when the argument is omitted.
        depths = self.maxdepth_settings

        with tqdm(total=Number_trials * len(depths)) as pb:
            for seed in range(1, Number_trials + 1, 1):
                X_train, X_test, y_train, y_test = train_test_split(
                    X, y, stratify=y, test_size=0.25, random_state=seed)
                under_samp = RandomUnderSampler()
                X_train, y_train = under_samp.fit_sample(X_train, y_train)
                if scaler is not None:
                    scaler_inst = scaler.fit(X_train)
                    X_train = scaler_inst.transform(X_train)
                    X_test = scaler_inst.transform(X_test)
                pb.set_description(f'Trial: {seed}')
                training_accuracy = []
                test_accuracy = []

                for depth in depths:
                    tree = DecisionTreeClassifier(
                        max_depth=depth, random_state=42)  # build the model
                    tree.fit(X_train, y_train)

                    training_accuracy.append(tree.score(
                        X_train, y_train))  # record training set accuracy
                    test_accuracy.append(tree.score(
                        X_test, y_test))  # record generalization accuracy
                    pb.update(1)

                score_train.append(training_accuracy)
                score_test.append(test_accuracy)

        self.score = np.mean(score_test, axis=0)
        self.sc_train = np.mean(score_train, axis=0)
        self.std_score = np.std(score_test, axis=0)
        self.std_train = np.std(score_train, axis=0)

        # get top predictor
        # NOTE(review): this tree is fitted on the training data of the last
        # trial only — confirm that is the intended basis for importances.
        best_depth = depths[np.argmax(self.score)]
        tree = DecisionTreeClassifier(max_depth=best_depth,
                                      random_state=42)  # build the model
        tree.fit(X_train, y_train)
        self.top_predictor = X.columns[np.argmax(tree.feature_importances_)]

        # Horizontal bar chart of importances, smallest first.
        abs_mean_coefs = np.abs(tree.feature_importances_)
        coefs_count = len(abs_mean_coefs)
        fig, ax = plt.subplots(figsize=(3, 5))
        ax.barh(np.arange(coefs_count), sorted(abs_mean_coefs))
        ax.set_yticks(np.arange(coefs_count))
        ax.set_yticklabels(X.columns[np.argsort(abs_mean_coefs)])
コード例 #15
0
def rando_under(df, to_drop, flag, verbose=0):
    """Randomly under-sample the majority class of ``df``.

    :param df: source dataframe; after ``reset_index`` it must contain an
        ``'id_subs_id'`` column (presumably the index name — verify against
        caller)
    :param to_drop: column labels to remove from the features before
        sampling; labels that are absent are skipped
    :param flag: name of the target column
    :param verbose: when > 0, print the labels that could not be dropped
    :return: under-sampled dataframe indexed by ``'id_subs_id'`` with the
        resampled target stored in column ``flag``
    """
    # As the dataset is biased towards non-churners we must perform sampling
    y = df[flag].copy()
    X = df[:].copy()
    X = X.reset_index(drop=False)

    not_dropped = []
    for column in to_drop:
        try:
            X.drop(column, axis=1, inplace=True)
        except KeyError:
            # Only a missing label is tolerated; other errors propagate.
            not_dropped.append(column)

    if verbose > 0 and not_dropped:
        print("\nUnable to drop:\n", list(set(not_dropped)))

    rus = RandomUnderSampler(ratio='majority',
                             random_state=42,
                             replacement=False)
    X_rus, y_rus = rus.fit_sample(X, y)

    df_rus = pd.DataFrame(X_rus, columns=X.columns)
    df_rus[flag] = y_rus
    df_rus = df_rus.set_index('id_subs_id', drop=True)

    return df_rus
コード例 #16
0
def under_sampling(x, y):
    """Return row positions that keep a 3:1 class-0 to class-1 ratio.

    The sampler is run on a column of row positions rather than on ``x``
    itself, with target counts ``{0: 3 * sum(y), 1: sum(y)}``.

    :param x: samples; indexed with an integer array
    :param y: labels; ``sum(y)`` is taken as the number of positives
    :return: 1-D array of the selected row positions
    """
    positives = sum(y)
    positions = np.array(range(len(x))).reshape((-1, 1))
    sampler = RandomUnderSampler(ratio={0: positives * 3, 1: positives})
    kept_positions, _ = sampler.fit_sample(positions, y)
    flat_positions = kept_positions.reshape((-1, ))
    # NOTE(review): the selected rows are looked up but not returned —
    # callers only receive the positions; confirm this is intended.
    _selected_rows = x[flat_positions]
    return kept_positions.reshape((-1, ))
コード例 #17
0
    def transform(self, X, y=None):
        """Return a randomly under-sampled copy of dataframe ``X``.

        The target is read from column ``self.predicted_column`` of ``X``;
        the ``y`` argument is ignored. The result has the original feature
        column names plus the resampled target column.
        """
        # TODO how do we validate this happens before train/test split? Or do we need to? Can we implement it in the
        # TODO      simple trainer in the correct order and leave this to advanced users?

        target = self.predicted_column

        # Separate the target from the features.
        y = np.squeeze(X[[target]])
        features = X.drop([target], axis=1)

        sampler = RandomUnderSampler(random_state=self.random_seed)
        x_sampled, y_sampled = sampler.fit_sample(features, y)

        # Rebuild a dataframe with the original feature names.
        result = pd.DataFrame(x_sampled)
        result.columns = features.columns

        # Attach the resampled target.
        result[target] = pd.Series(y_sampled)

        return result
コード例 #18
0
def train_linear_SVC():
    """Train a LinearSVC on under-sampled data, pickle it and print accuracy.

    Features come from ``encode_features()`` and are split with
    ``create_train_test_split``. The training part is randomly
    under-sampled before fitting. The fitted classifier is written to
    ``linear_SVC_relations.pkl`` in the current working directory, and the
    test predictions and accuracy are printed.
    """
    df = encode_features()
    print(df)

    classifier = LinearSVC()
    X_train, X_test, y_train, y_test = create_train_test_split(df)

    rus = RandomUnderSampler(return_indices=True)
    X_rus, y_rus, id_rus = rus.fit_sample(
        X_train, y_train
    )  # TODO other sampling techniques, e.g. ros = RandomOverSampler() X_ros, y_ros = ros.fit_sample(X, y)

    print('Removed indexes:', len(X_train), len(id_rus), id_rus)

    classifier.fit(X_rus, y_rus)  # X_train, y_train.values.ravel()

    # Context manager so the file is flushed and closed deterministically.
    with open("linear_SVC_relations.pkl", 'wb') as model_file:
        pickle.dump(classifier, model_file)

    # Predict Class
    y_pred = classifier.predict(X_test)
    print(y_pred)

    # Accuracy
    accuracy = accuracy_score(y_test, y_pred)
    print("acc: ", accuracy)
コード例 #19
0
ファイル: b_Adaboost.py プロジェクト: FoxerLee/SSE-DAM2018
def main():
    """Grid-search an AdaBoost classifier on under-sampled training data.

    The last column of the table returned by ``b.train_generator()`` is
    used as the target and all other columns as features. The best
    parameters, score and estimator are printed.
    """
    train_datas = b.train_generator()
    # train_datas.to_csv('X.csv')
    train_datas.set_index(['vipno', 'pluno'], inplace=True, drop=True)
    # ``.values`` replaces ``as_matrix()``, which was removed from pandas.
    train = train_datas.values
    X = np.delete(train, train.shape[1] - 1, axis=1)
    y = train[:, train.shape[1] - 1]
    # Under-sample so that positive and negative sample counts are similar.
    rus = RandomUnderSampler(return_indices=True)
    X, y, idx_resampled = rus.fit_sample(X, y)

    # NOTE(review): np.arange(0.01, 0.1, 10) contains only 0.01 because the
    # step (10) exceeds the range — confirm whether a finer grid was meant.
    param_test1 = {
        'n_estimators': range(30, 101, 10),
        'learning_rate': np.arange(0.01, 0.1, 10)
    }
    gsearch1 = GridSearchCV(estimator=AdaBoostClassifier(
        base_estimator=DecisionTreeClassifier(max_depth=20)),
                            param_grid=param_test1,
                            scoring='roc_auc',
                            cv=5)
    gsearch1.fit(X, y)

    print("Best param: {}".format(gsearch1.best_params_))
    print("Best score: {}".format(gsearch1.best_score_))
    print("Best estimator: {}".format(gsearch1.best_estimator_))

    print("****************************")
コード例 #20
0
def main():
    """Grid-search a gradient-boosting classifier on under-sampled data.

    The table returned by ``ci.train_generator()`` is dumped to ``X.csv``;
    its last column is used as the target and all other columns as
    features. The best parameters, score and estimator are printed.
    """
    train_datas = ci.train_generator()
    train_datas.to_csv('X.csv')
    train_datas.set_index(['vipno'], inplace=True, drop=True)
    # ``.values`` replaces ``as_matrix()``, which was removed from pandas.
    train = train_datas.values
    X = np.delete(train, train.shape[1] - 1, axis=1)
    y = train[:, train.shape[1] - 1]

    # Under-sample so that positive and negative sample counts are similar.
    rus = RandomUnderSampler(return_indices=True)
    X, y, idx_resampled = rus.fit_sample(X, y)

    # NOTE(review): np.arange(0.01, 0.1, 10) contains only 0.01 because the
    # step (10) exceeds the range — confirm whether a finer grid was meant.
    param_test1 = {'n_estimators': range(10, 61, 10),
                   'learning_rate': np.arange(0.01, 0.1, 10),
                   'max_depth': range(1, 14, 2),
                   'max_features': range(7, 20, 2),
                   'subsample': [0.6, 0.7, 0.75, 0.8, 0.85, 0.9]
                   }
    gsearch1 = GridSearchCV(estimator=GradientBoostingClassifier(learning_rate=0.01, n_estimators=50, max_depth=7),
                            param_grid=param_test1, scoring='precision', cv=5)

    gsearch1.fit(X, y)

    print("Best param: {}".format(gsearch1.best_params_))
    print("Best score: {}".format(gsearch1.best_score_))
    print("Best estimator: {}".format(gsearch1.best_estimator_))

    print("****************************")
コード例 #21
0
    def transform(self, X, y=None):
        """Under-sample dataframe ``X`` using its ``self.predicted_column`` as target.

        The ``y`` argument is ignored and replaced by the target column
        extracted from ``X``. Returns a new dataframe with the original
        feature column names followed by the resampled target column.
        """
        # TODO how do we validate this happens before train/test split? Or do we need to? Can we implement it in the
        # TODO      simple trainer in the correct order and leave this to advanced users?

        label_column = self.predicted_column

        # Pull the target out and keep the remaining columns as features.
        y = np.squeeze(X[[label_column]])
        feature_frame = X.drop([label_column], axis=1)

        # Balance the classes with a seeded random under-sampler.
        under_sampler = RandomUnderSampler(random_state=self.random_seed)
        resampled = under_sampler.fit_sample(feature_frame, y)
        x_under_sampled, y_under_sampled = resampled

        # Reassemble a dataframe carrying the original column names.
        result = pd.DataFrame(x_under_sampled)
        result.columns = feature_frame.columns
        result[label_column] = pd.Series(y_under_sampled)

        return result
コード例 #22
0
def RUS(X_train, y_train):
    """Randomly under-sample the training data and return it as DataFrames.

    :param X_train: training features; iterating it must yield the column
        names
    :param y_train: training labels
    :return: ``(X_resampled, y_resampled)`` where the features keep the
        original column names and the labels sit in a single ``'Target'``
        column
    """
    feature_names = list(X_train)
    sampler = RandomUnderSampler()
    X_balanced, y_balanced = sampler.fit_sample(X_train, y_train)
    return (
        pd.DataFrame(X_balanced, columns=feature_names),
        pd.DataFrame(y_balanced, columns=['Target']),
    )
コード例 #23
0
def train_and_evaluate_classifier(clf, model, X_train, y_train, X_test, y_test,
                                  operation='hadamard', undersample=True, ratio=0.8,
                                  print_progress=True):
    """Fit ``clf`` on edge features and report metrics on the test pairs.

    ``X_train``/``X_test`` are iterables of ``(u, v)`` pairs that are turned
    into feature rows with ``model.get_edge_features(u, v, operation)``.
    When ``undersample`` is true the training pairs are randomly
    under-sampled first.

    :return: dict with keys ``micro_f1``, ``macro_f1``,
        ``average_percision_score``, ``auc``, ``confusion_matrix``,
        ``classification_report``, ``kappa`` and ``mathew``
    """
    def edge_feature_matrix(pairs):
        # One feature row per (u, v) pair.
        return np.array(
            [model.get_edge_features(u, v, operation) for u, v in pairs])

    # NOTE(review): the sampling targets are derived from the TEST label
    # counts but applied to the training set — confirm this is intended.
    negatives = len(y_test[y_test == 0])
    positives = len(y_test[y_test == 1])
    targets = {0: negatives, 1: min(int(negatives * ratio), positives)}
    rus = RandomUnderSampler(return_indices=False, ratio=targets)

    if undersample:
        train_pairs, train_labels = rus.fit_sample(X_train, y_train)
    else:
        train_pairs, train_labels = X_train, y_train

    if print_progress:
        print('Assembling training set features....')
    train_features = edge_feature_matrix(train_pairs)

    if print_progress:
        print('Fitting classifier model')
    clf.fit(train_features, train_labels)

    if print_progress:
        print('Assembling testing set features')
    # The test set is evaluated as-is, without resampling.
    test_features = edge_feature_matrix(X_test)
    y = y_test
    predictions = clf.predict(test_features)
    probs = clf.predict_proba(test_features)[:, 1]

    macro_f1 = metrics.f1_score(y, predictions, average='macro')
    micro_f1 = metrics.f1_score(y, predictions, average='micro')
    average_precision = metrics.average_precision_score(y, probs)
    auc = metrics.roc_auc_score(y, probs)
    kappa = metrics.cohen_kappa_score(y, predictions)
    mathew = metrics.matthews_corrcoef(y, predictions)
    confusion = metrics.confusion_matrix(y, predictions)
    report = metrics.classification_report(y, predictions)

    return {
        'micro_f1': micro_f1,
        'macro_f1': macro_f1,
        'average_percision_score': average_precision,
        'auc': auc,
        'confusion_matrix': confusion,
        'classification_report': report,
        'kappa': kappa,
        'mathew': mathew
    }
コード例 #24
0
def subsample_data(X, y, ratio=0.5, seed=None, size=1100):
    """Randomly under-sample ``X``/``y`` to a fixed total with a class mix.

    :param X: pandas object holding the features
    :param y: pandas object holding the labels (flattened before sampling)
    :param ratio: fraction of ``size`` requested for class 0; class 1 gets
        the remainder
    :param seed: random state forwarded to ``RandomUnderSampler``
    :param size: total number of samples the class targets are derived
        from (default 1100)
    :return: whatever ``RandomUnderSampler.fit_sample`` returns for the
        underlying arrays
    """
    class_targets = {
        0: int(size * ratio),
        1: int(size * (1 - ratio)),
    }
    rus = RandomUnderSampler(ratio=class_targets, random_state=seed)
    # ``.values`` replaces ``as_matrix()``, which was removed from pandas.
    return rus.fit_sample(X.values, y.values.ravel())
コード例 #25
0
def get_training_data(X, y):
    '''
    Balance the classes for training via random under-sampling.

    Returns the resampled features and labels as a tuple.
    '''
    sampler = RandomUnderSampler()
    balanced_X, balanced_y = sampler.fit_sample(X, y)
    return balanced_X, balanced_y
コード例 #26
0
    def _get_trained_model(X, y):
        """Fit a calibrated LinearSVC on randomly under-sampled data.

        The data is under-sampled with a fixed seed (0), then a
        ``CalibratedClassifierCV`` wrapping ``LinearSVC`` (5 folds) is
        fitted on it and returned.
        """
        sampler = RandomUnderSampler(random_state=0)
        balanced_X, balanced_y = sampler.fit_sample(X, y)

        calibrated = CalibratedClassifierCV(LinearSVC(), cv=5)
        calibrated.fit(balanced_X, balanced_y)
        return calibrated
コード例 #27
0
 def _random_sampler(self, X, y, strategy, sample_ratio):
     """Randomly over- or under-sample ``X``/``y``.

     ``strategy == 'oversample'`` selects ``RandomOverSampler``; any other
     value selects ``RandomUnderSampler``. ``sample_ratio`` is passed as
     the sampler's ``ratio``. Returns ``(X_resampled, y_resampled)``.
     """
     if strategy == 'oversample':
         sampler = RandomOverSampler(ratio=sample_ratio)
     else:
         sampler = RandomUnderSampler(ratio=sample_ratio)
     X_resampled, y_resampled = sampler.fit_sample(X, y)
     return X_resampled, y_resampled
コード例 #28
0
def create_train_test_model(df, cnt_col, feature_col):
    """Scale, balance and split ``df`` into training and test sets.

    The frame is scaled with ``scaled_fit_transform``, separated into
    features and target with ``create_x_y``, randomly under-sampled
    (seed 42) and split 70/30 (seed 42).

    :return: ``(X_train, y_train, X_test, y_test)`` — note the order
        differs from ``train_test_split``'s own return order
    """
    scaled = scaled_fit_transform(df, cnt_col)
    features, target = create_x_y(scaled, feature_col)

    sampler = RandomUnderSampler(random_state=42)
    features, target = sampler.fit_sample(features, target)

    X_train, X_test, y_train, y_test = train_test_split(
        features, target, test_size=0.3, random_state=42)
    return X_train, y_train, X_test, y_test
コード例 #29
0
    def under_sampling(self, _X, _y):
        """
        Randomly under-sample an unbalanced training set.

        :param _X: training features
        :param _y: training labels
        :return: X_resampled, y_resampled
        """
        sampler = RandomUnderSampler()
        resampled = sampler.fit_sample(_X, _y)
        return resampled
コード例 #30
0
ファイル: sampler.py プロジェクト: Apiao-1/BankMarketing
def under_sampler(df):
    """Randomly under-sample ``df`` using its ``'y'`` column as the target.

    A copy of ``df`` is used, so the caller's frame is left intact. The
    shapes before and after resampling are printed.

    :return: ``(X_under, y_under)`` as produced by the sampler (seed 42)
    """
    features = df.copy()
    target = features.pop('y')
    print(features.shape, target.shape)

    sampler = RandomUnderSampler(random_state=42)
    X_under, y_under = sampler.fit_sample(features, target)
    print(X_under.shape, y_under.shape)
    return X_under, y_under
コード例 #31
0
class kp_classifier_dataset(object):
    """
    This class is dedicated to building and rearranging classifier datasource
    """
    def load_data(self):
        """Load the yearly HDF5 files for 1995-2018 into ``self.O``.

        File names are ``BASE_LOCATION + "<file_ext>_<year>.h5"``. The
        ``"Kp"`` and ``"STORM"`` columns are then taken from
        ``utils.to_linear_Kp(delay_unit=1)``.
        """
        file_store = BASE_LOCATION + "{ext}_%d.h5".replace(
            "{ext}", str(self.file_ext))
        # Read every year first and concatenate once; concatenating inside
        # the loop re-copies the accumulated frame on every iteration.
        yearly_frames = [
            pd.read_hdf(file_store % year, mode="r", key="df",
                        parse_dates=True)
            for year in range(1995, 2019)
        ]
        self.O = pd.concat(yearly_frames)
        _x = utils.to_linear_Kp(delay_unit=1)
        self.O["Kp"] = _x["Kp"]
        self.O["STORM"] = _x["STORM"]
        return

    def __init__(self, look_back=3, file_ext=180):
        """Load the data and set up feature/target names and scalers.

        :param look_back: number of preceding rows in each look-back window
        :param file_ext: prefix used to build the HDF5 file names
        """
        self.look_back = look_back
        self.file_ext = file_ext
        self.load_data()
        self.xparams = [
            "Bx", "B_T", "THETA", "SIN_THETA", "V", "n", "T", "P_DYN", "BETA",
            "MACH_A", "STORM"
        ]
        self.yparam = ["STORM"]
        self.O = utils.transform_variables(self.O)
        self.sclX = MinMaxScaler(feature_range=(0, 1))
        self.sclY = MinMaxScaler(feature_range=(0, 1))
        return

    def form_look_back_array(self, X, y):
        """Build look-back windows over ``X`` with matching targets from ``y``.

        For each ``i`` from ``look_back + 1`` to ``len(X) - 1`` the window
        is ``X[i - look_back:i, :]`` transposed (features x look_back) and
        the target is ``y[i]``.

        :return: ``(windows, targets)`` as numpy arrays
        """
        # NOTE(review): iteration starts at look_back + 1, so the window
        # ending at row look_back is not produced — confirm this is intended.
        dataX, dataY = [], []
        for i in range(self.look_back + 1, len(X)):
            dataX.append(X[i - self.look_back:i, :].T)
            dataY.append(y[i].tolist())
        return np.array(dataX), np.array(dataY)

    def create_master_model_data(self):
        """Produce train/test arrays for the classifier.

        Features are min-max scaled, randomly under-sampled, the labels are
        one-hot encoded, look-back windows are formed on the resampled
        rows, and the result is split with one third held out (seed 42).

        :return: ``(X_train, X_test, y_train, y_test)``
        """
        X, y = self.O[self.xparams].values, self.O[self.yparam].values.ravel()
        X = self.sclX.fit_transform(X)
        self.rus = RandomUnderSampler(return_indices=True)
        X_resampled, y_resampled, idx_resampled = self.rus.fit_sample(X, y)
        y_bin = to_categorical(y_resampled)
        X_resampled, y_resampled = self.form_look_back_array(
            X_resampled, y_bin)
        X_train, X_test, y_train, y_test = train_test_split(X_resampled,
                                                            y_resampled,
                                                            test_size=1.0 /
                                                            3.0,
                                                            random_state=42)
        return X_train, X_test, y_train, y_test
コード例 #32
0
ファイル: sampling.py プロジェクト: brunnurs/PA1
def random_undersampling(x, y):
    """Randomly under-sample ``x``/``y`` (seed 42), printing class counts.

    :return: ``(x_sampled, y_sampled)``
    """
    counts_before = Counter(y)
    print('Original dataset shape {}'.format(counts_before))

    sampler = RandomUnderSampler(random_state=42)
    x_sampled, y_sampled = sampler.fit_sample(x, y)

    counts_after = Counter(y_sampled)
    print('With RandomUnderSampler sampled dataset shape {}'.format(counts_after))

    return x_sampled, y_sampled
コード例 #33
0
def model_(X, y, rs):
    """Fit a CatBoost classifier on randomly under-sampled data.

    :param X: features
    :param y: labels
    :param rs: random state for the under-sampler
    :return: the fitted ``CatBoostClassifier``
    """
    sampler = RandomUnderSampler(random_state=rs)
    X_balanced, y_balanced = sampler.fit_sample(X, y)
    # Base model
    classifier = CatBoostClassifier(
        loss_function='Logloss',
        logging_level='Silent',
        cat_features=categorical_features_indices,
    )
    classifier.fit(X_balanced, y_balanced)
    return classifier
コード例 #34
0
    def DownsampleMajority(self):
        """Replace ``self.x``/``self.y`` with a randomly under-sampled version.

        Class counts are printed before and after resampling.
        """
        print('Original dataset shape {}'.format(Counter(self.y)))
        sampler = RandomUnderSampler(return_indices=True)
        # The returned indices are not used.
        balanced_x, balanced_y, _ = sampler.fit_sample(self.x, self.y)

        print('Resampled dataset shape {}'.format(Counter(balanced_y)))
        self.x, self.y = balanced_x, balanced_y
        return
コード例 #35
0
def cross_validation(classifier,
                     data,
                     labels,
                     n_of_fold,
                     threshold=0.5,
                     method="SMOTE",
                     ratio=0.5):
    """Run shuffled K-fold cross-validation with resampling of each training fold.

    :param classifier: estimator exposing ``fit``, ``predict`` and
        ``predict_proba``
    :param data: features; indexed with ``.iloc``
    :param labels: labels; indexed with ``.iloc``
    :param n_of_fold: number of folds
    :param threshold: accepted but not used by this function
    :param method: ``'under'`` for random under-sampling, ``'smotetomek'``
        for SMOTETomek, anything else for SMOTE
    :param ratio: resampling ratio, converted to ``float``
    :return: per-fold arrays of true positives, false positives, true
        negatives and false negatives, followed by the concatenated test
        labels and the predicted class-1 probabilities
    """
    per_fold = {'tp': [], 'fp': [], 'tn': [], 'fn': []}
    real_labels = []
    probability_labels = []

    # KFold builds the folds for the requested number of splits.
    kfold = KFold(n_splits=n_of_fold, shuffle=True, random_state=42)
    for train_ind, test_ind in kfold.split(data):
        data_train, data_test = data.iloc[train_ind], data.iloc[test_ind]
        labels_train = labels.iloc[train_ind]
        labels_test = labels.iloc[test_ind]

        # Resample only the training fold.
        if method == 'under':
            sampler = RandomUnderSampler(ratio=float(ratio))
        elif method == 'smotetomek':
            sampler = SMOTETomek(ratio=float(ratio))
        else:
            sampler = SMOTE(ratio=float(ratio))
        data_resampling, labels_resampling = sampler.fit_sample(
            data_train, labels_train)

        classifier.fit(data_resampling, labels_resampling)

        # Confusion-matrix counts for this fold.
        labels_prediction = classifier.predict(data_test)
        tn, fp, fn, tp = confusion_matrix(labels_test,
                                          labels_prediction).ravel()
        # Class-1 probabilities, kept for building a ROC curve.
        positive_probability = classifier.predict_proba(data_test)[:, 1]

        per_fold['tp'].append(tp)
        per_fold['fp'].append(fp)
        per_fold['tn'].append(tn)
        per_fold['fn'].append(fn)
        real_labels.extend(labels_test)
        probability_labels.extend(positive_probability)

    true_pos = np.array(per_fold['tp'])
    false_pos = np.array(per_fold['fp'])
    true_neg = np.array(per_fold['tn'])
    false_neg = np.array(per_fold['fn'])
    return true_pos, false_pos, true_neg, false_neg, real_labels, probability_labels
コード例 #36
0
def test_multiclass_fit_sample():
    """Under-sample a three-class target and expect two samples per class."""
    # Relabel two entries so the target contains classes 0, 1 and 2.
    target = Y.copy()
    for position in (5, 6):
        target[position] = 2

    sampler = RandomUnderSampler(random_state=RND_SEED)
    _, resampled_target = sampler.fit_sample(X, target)

    # Every class must be left with exactly two samples.
    per_class = Counter(resampled_target)
    for label in (0, 1, 2):
        assert per_class[label] == 2
コード例 #37
0
def test_rus_fit_sample():
    """Compare default under-sampling with the reference arrays on disk."""
    sampler = RandomUnderSampler(random_state=RND_SEED)
    resampled_X, resampled_y = sampler.fit_sample(X, Y)

    # The expected arrays live in the ``data`` directory next to this file.
    data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                            'data')
    expected_X = np.load(os.path.join(data_dir, 'rus_x.npy'))
    expected_y = np.load(os.path.join(data_dir, 'rus_y.npy'))

    assert_array_equal(resampled_X, expected_X)
    assert_array_equal(resampled_y, expected_y)
コード例 #38
0
def test_rus_fit_sample():
    """Check under-sampling with replacement against fixed expectations."""
    sampler = RandomUnderSampler(replacement=True, random_state=RND_SEED)
    resampled_X, resampled_y = sampler.fit_sample(X, Y)

    expected_X = np.array([
        [0.92923648, 0.76103773],
        [0.47104475, 0.44386323],
        [0.13347175, 0.12167502],
        [0.09125309, -0.85409574],
        [0.12372842, 0.6536186],
        [0.04352327, -0.20515826],
    ])
    expected_y = np.array([0, 0, 0, 1, 1, 1])

    assert_array_equal(resampled_X, expected_X)
    assert_array_equal(resampled_y, expected_y)
コード例 #39
0
def test_pipeline_sample():
    """Check that a pipeline ending in a sampler matches direct sampler use.

    Exercises ``pipeline.fit(...).sample(...)`` and ``pipeline.fit_sample``,
    first with the sampler alone and then with a PCA step in front of it.
    """
    dataset_params = dict(
        n_samples=5000,
        n_features=20,
        n_informative=3,
        n_redundant=1,
        n_classes=2,
        n_clusters_per_class=1,
        weights=[0.1, 0.9],
        class_sep=2,
        flip_y=0,
        random_state=0)
    X, y = make_classification(**dataset_params)

    rus = RandomUnderSampler(random_state=0)
    pipeline = Pipeline([('rus', rus)])

    # Three routes to the resampled data that are expected to agree.
    X_fit, y_fit = pipeline.fit(X, y).sample(X, y)
    X_pipe, y_pipe = pipeline.fit_sample(X, y)
    X_direct, y_direct = rus.fit_sample(X, y)
    for X_other in (X_pipe, X_direct):
        assert_allclose(X_fit, X_other, rtol=R_TOL)
    for y_other in (y_pipe, y_direct):
        assert_allclose(y_fit, y_other, rtol=R_TOL)

    # The pipeline receives its own PCA instance; ``pca`` is fitted
    # separately below to build the reference result.
    pca = PCA()
    pipeline = Pipeline([('pca', PCA()),
                         ('rus', rus)])

    X_fit, y_fit = pipeline.fit(X, y).sample(X, y)
    X_projected = pca.fit_transform(X)
    X_direct, y_direct = rus.fit_sample(X_projected, y)

    # Snap values lying strictly between -R_TOL and R_TOL to zero before
    # comparing; the original author noted PCA seems to have some issue
    # with values that close to zero.
    for values in (X_fit, X_direct):
        values[(values < R_TOL) & (values > -R_TOL)] = 0
    assert_allclose(X_fit, X_direct, rtol=R_TOL)
    assert_allclose(y_fit, y_direct, rtol=R_TOL)
コード例 #40
0
def test_rus_fit_sample_with_indices():
    """Compare under-sampling output, including indices, with stored data."""
    sampler = RandomUnderSampler(random_state=RND_SEED, return_indices=True)
    resampled_X, resampled_y, selected_idx = sampler.fit_sample(X, Y)

    # The expected arrays live in the ``data`` directory next to this file.
    data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                            'data')
    expected_X = np.load(os.path.join(data_dir, 'rus_x.npy'))
    expected_y = np.load(os.path.join(data_dir, 'rus_y.npy'))
    expected_idx = np.load(os.path.join(data_dir, 'rus_idx.npy'))

    assert_array_equal(resampled_X, expected_X)
    assert_array_equal(resampled_y, expected_y)
    assert_array_equal(selected_idx, expected_idx)
コード例 #41
0
def test_rus_fit_sample_with_indices():
    """Check resampled data and returned indices against fixed values."""
    sampler = RandomUnderSampler(random_state=RND_SEED, return_indices=True)
    resampled_X, resampled_y, selected_idx = sampler.fit_sample(X, Y)

    expected_X = np.array([
        [0.92923648, 0.76103773],
        [0.47104475, 0.44386323],
        [0.13347175, 0.12167502],
        [0.09125309, -0.85409574],
        [0.12372842, 0.6536186],
        [0.04352327, -0.20515826],
    ])
    expected_y = np.array([0, 0, 0, 1, 1, 1])
    expected_idx = np.array([1, 3, 8, 6, 7, 0])

    assert_array_equal(resampled_X, expected_X)
    assert_array_equal(resampled_y, expected_y)
    assert_array_equal(selected_idx, expected_idx)
コード例 #42
0
ファイル: ModelSelection.py プロジェクト: DunZhang/SVPFS
def CrossVal(estimator, X, y, procsessor=None, cv=3, times=10, random_state=0, imb=False):
    """
    Run repeated stratified k-fold cross-validation and collect fold scores.

    estimator:
        Model to evaluate; it is refit on the training part of every fold.

    X:
        Feature part of the dataset. It is indexed with the index arrays
        produced by the splitter (``X[train_idx]``).

    y:
        Labels of the dataset, indexed the same way as ``X``.

    procsessor:
        Optional preprocessor (in practice a feature selector). It is fitted
        on the training fold only, and its ``transform(X, y)`` is expected to
        return the transformed ``(X, y)`` pair.

    cv:
        Number of folds used for each cross-validation run.

    times:
        Number of times the cross-validation is repeated; repetition ``t``
        uses the seed ``random_state + t``.

    random_state:
        Base random seed.

    imb:
        If true, balance the training fold with ``RandomUnderSampler``
        before preprocessing and fitting. The test fold is never resampled.

    Returns a numpy array holding one ``Metrics.Score`` result per fold of
    every repetition, in evaluation order.
    """
    res = []
    for t in range(times):
        seed = random_state + t
        skf = StratifiedKFold(n_splits=cv, shuffle=True, random_state=seed)
        for train_idx, test_idx in skf.split(X=X, y=y):
            x_train, y_train = X[train_idx], y[train_idx]
            x_test, y_test = X[test_idx], y[test_idx]
            if imb:
                rus = RandomUnderSampler(random_state=seed)
                x_train, y_train = rus.fit_sample(x_train, y_train)
            if procsessor is not None:
                # Fit on the (possibly resampled) training fold only, then
                # apply the same transformation to both folds.
                procsessor.fit(x_train, y_train)
                x_train, y_train = procsessor.transform(x_train, y_train)
                x_test, y_test = procsessor.transform(x_test, y_test)
            estimator.fit(x_train, y_train)
            res.append(Metrics.Score(estimator, x_test, y_test))
    return np.array(res)
コード例 #43
0
def test_rus_fit_sample_half():
    """Check under-sampling with ``ratio=0.5`` against fixed expectations."""
    sampler = RandomUnderSampler(ratio=0.5, random_state=RND_SEED)
    resampled_X, resampled_y = sampler.fit_sample(X, Y)

    expected_X = np.array([
        [0.92923648, 0.76103773],
        [0.47104475, 0.44386323],
        [0.13347175, 0.12167502],
        [0.09125309, -0.85409574],
        [0.12372842, 0.6536186],
        [0.04352327, -0.20515826],
        [0.15490546, 0.3130677],
        [0.15490546, 0.3130677],
        [0.15490546, 0.3130677],
    ])
    expected_y = np.array([0, 0, 0, 1, 1, 1, 1, 1, 1])

    assert_array_equal(resampled_X, expected_X)
    assert_array_equal(resampled_y, expected_y)
コード例 #44
0
def test_multiclass_fit_sample():
    """Under-sample a three-class target and expect two samples per class."""
    # Relabel two entries so the target contains classes 0, 1 and 2.
    target = Y.copy()
    for position in (5, 6):
        target[position] = 2

    sampler = RandomUnderSampler(random_state=RND_SEED)
    _, resampled_target = sampler.fit_sample(X, target)

    # Every class must be left with exactly two samples.
    per_class = Counter(resampled_target)
    for label in (0, 1, 2):
        assert_equal(per_class[label], 2)
コード例 #45
0
def test_rus_fit_sample_half():
    """Check under-sampling with replacement and per-class target counts.

    The ``ratio`` dict requests 3 samples of class 0 and 6 of class 1; the
    resampled data is compared with hard-coded expected arrays.
    """
    ratio = {0: 3, 1: 6}
    rus = RandomUnderSampler(ratio=ratio, random_state=RND_SEED,
                             replacement=True)
    X_resampled, y_resampled = rus.fit_sample(X, Y)

    X_gt = np.array([[0.92923648, 0.76103773],
                     [0.47104475, 0.44386323],
                     [0.92923648, 0.76103773],
                     [0.15490546, 0.3130677],
                     [0.15490546, 0.3130677],
                     [0.15490546, 0.3130677],
                     [0.20792588, 1.49407907],
                     [0.15490546, 0.3130677],
                     [0.12372842, 0.6536186]])
    y_gt = np.array([0, 0, 0, 1, 1, 1, 1, 1, 1])
    # No debug print here: assert_array_equal already reports the
    # mismatching values when the comparison fails.
    assert_array_equal(X_resampled, X_gt)
    assert_array_equal(y_resampled, y_gt)
コード例 #46
0
print(__doc__)

# Build an imbalanced two-class toy dataset (class weights 0.1 / 0.9)
X, y = make_classification(
    n_samples=200,
    n_features=20,
    n_informative=3,
    n_redundant=1,
    n_classes=2,
    n_clusters_per_class=1,
    weights=[0.1, 0.9],
    class_sep=2,
    flip_y=0,
    random_state=10,
)

# Project onto two principal components so the data can be drawn in a plane
pca = PCA(n_components=2)
X_vis = pca.fit_transform(X)

# Randomly under-sample, also asking the sampler for the selected indices
rus = RandomUnderSampler(return_indices=True)
X_resampled, y_resampled, idx_resampled = rus.fit_sample(X, y)
X_res_vis = pca.transform(X_resampled)

fig = plt.figure()
ax = fig.add_subplot(1, 1, 1)

# Original row indices that do not appear among the returned indices
idx_samples_removed = np.setdiff1d(np.arange(X_vis.shape[0]), idx_resampled)

# Draw the two resampled classes first, then the samples that were dropped
idx_class_0 = y_resampled == 0
for mask, label in ((idx_class_0, 'Class #0'), (~idx_class_0, 'Class #1')):
    plt.scatter(X_res_vis[mask, 0], X_res_vis[mask, 1],
                alpha=.8, label=label)
plt.scatter(X_vis[idx_samples_removed, 0], X_vis[idx_samples_removed, 1],
            alpha=.8, label='Removed samples')
コード例 #47
0
 def func(X, y, ratio, random_state):
     """Return ``RandomUnderSampler(...).fit_sample(X, y)`` for the given
     ``ratio`` and ``random_state``."""
     sampler = RandomUnderSampler(random_state=random_state, ratio=ratio)
     resampled = sampler.fit_sample(X, y)
     return resampled