Example #1
    def __init__(self, window_size=6, training_ratio=.7, seq="sequence", pos="label"):
        self.training_ratio = training_ratio  # Float value representing % of data used for training
        self.features = []
        self.labels = []
        self.words = []
        self.window_size = window_size
        self.supervised_classifiers = {"forest": RandomForestClassifier(n_jobs=4),
                                       "mlp_adam": MLPClassifier(),
                                       "svc": svm.SVC(verbose=1),
                                       "xgb": XGBClassifier(max_delta_step=5),
                                       "bagging": BaggingClassifier(), "one_class_svm": OneClassSVM(kernel="rbf")
                                       }

        self.imbalance_functions = {"easy_ensemble": EasyEnsemble(), "SMOTEENN": SMOTEENN(),
                                    "SMOTETomek": SMOTETomek(), "ADASYN": ADASYN(),
                                    "random_under_sample": RandomUnderSampler(), "ncl": NeighbourhoodCleaningRule(),
                                    "near_miss": NearMiss(), "pass": -1}
        self.seq = seq
        self.pos = pos
        self.random_data = 0
        self.test_results = 0
        self.vecs = {"sequence": sequence_vector, "chemical": chemical_vector, "binary": binary_vector, "w2v": "w2v"}
        self.vector = 0
        self.features_labels = {}
        self.test_cv = 0
        self.benchmark_mcc = 0
        self.mcc_scorer = make_scorer(matthews_corrcoef)
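
A minimal sketch of how the dictionary-driven setup above might be used: pick a classifier and a resampler by key, rebalance, then fit and score. The class name DataBank, the instance, and the toy data are hypothetical stand-ins; only the dictionary keys, the MCC scorer, and the old-style fit_sample call (used throughout these examples) come from the snippet.

from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score

# Hypothetical driver; DataBank stands in for the class whose __init__ is shown above.
X_train, y_train = make_classification(n_samples=500, weights=[0.9, 0.1], random_state=0)
bank = DataBank(window_size=6, training_ratio=0.7)

clf = bank.supervised_classifiers["forest"]                 # RandomForestClassifier(n_jobs=4)
sampler = bank.imbalance_functions["random_under_sample"]   # RandomUnderSampler()

X_res, y_res = sampler.fit_sample(X_train, y_train)         # old imbalanced-learn API (pre fit_resample)
print(cross_val_score(clf, X_res, y_res, scoring=bank.mcc_scorer, cv=5).mean())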
Example #2
def Balance_classes(X_train, y_train, Sampling_Function):
    if Sampling_Function == 'RandomUnderSampler':
        us = RandomUnderSampler(ratio=0.5, random_state=1)
    elif Sampling_Function == 'NearMiss1':
        us = NearMiss(ratio=0.5, random_state=1, version=1, size_ngh=3)
    elif Sampling_Function == 'NearMiss2':
        us = NearMiss(ratio=0.5, random_state=1, version=2, size_ngh=3)
    elif Sampling_Function == 'NearMiss3':
        us = NearMiss(ratio=0.5, random_state=1, version=3, ver3_samp_ngh=3)
    elif Sampling_Function == 'CondensedNearestNeighbour':
        us = CondensedNearestNeighbour(random_state=1)
    elif Sampling_Function == 'EditedNearestNeighbours':
        us = EditedNearestNeighbours(random_state=1, size_ngh=5)
    elif Sampling_Function == 'RepeatedEditedNearestNeighbours':
        us = RepeatedEditedNearestNeighbours(random_state=1, size_ngh=5)
    elif Sampling_Function == 'TomekLinks':
        us = TomekLinks(random_state=1)
    elif Sampling_Function == 'RandomOverSampler':
        us = RandomOverSampler(ratio=0.5, random_state=1)
    elif Sampling_Function == 'SMOTE':
        us = SMOTE(ratio=0.5, k=5, random_state=1)
    elif Sampling_Function == 'SMOTETomek':
        us = SMOTETomek(ratio=0.5, k=5, random_state=1)
    elif Sampling_Function == 'SMOTEENN':
        us = SMOTEENN(ratio=0.5, k=5, random_state=1, size_ngh=5)
    elif Sampling_Function == 'EasyEnsemble':
        us = EasyEnsemble()
    elif Sampling_Function == 'BalanceCascade_rf':
        us = BalanceCascade(classifier='random-forest', random_state=1)
    elif Sampling_Function == 'BalanceCascade_svm':
        us = BalanceCascade(classifier='linear-svm', random_state=1)
    else:
        raise ValueError('Unknown Sampling_Function: ' + Sampling_Function)

    X_train_res, y_train_res = us.fit_sample(X_train, y_train)

    return X_train_res, y_train_res
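
A hedged usage sketch for Balance_classes, assuming an older imbalanced-learn release that still accepts the ratio and size_ngh keywords used above; the synthetic dataset and the chosen strategy name are illustrative only.

from collections import Counter
from sklearn.datasets import make_classification

# Build a deliberately imbalanced toy dataset (roughly 90% / 10%).
X_train, y_train = make_classification(n_samples=1000, n_features=10,
                                        weights=[0.9, 0.1], random_state=1)
print('before:', Counter(y_train))

# Rebalance with one of the strategies handled by Balance_classes.
X_res, y_res = Balance_classes(X_train, y_train, 'RandomUnderSampler')
print('after: ', Counter(y_res))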
Example #3
def buildModel(clf, X, y, cv_nums=10, is_random=False):
    # Whether to shuffle: draw random seeds, or fall back to fixed seeds
    if is_random:
        random_lst = list(np.random.randint(0, 1000, 4))
    else:
        random_lst = [0] * 4

    print('---------- Results of the class-imbalance handling methods: '
          'mean f1 over ' + str(cv_nums) + '-fold cross-validation ----------')
    # No resampling: evaluate on the original dataset
    print('Original dataset: ', np.mean(cross_val_score(clf, X, y, scoring='f1', cv=cv_nums)))

    ros = RandomOverSampler(random_state=random_lst[0])
    X_oversampled, y_oversampled = ros.fit_sample(X, y)
    # print(sorted(Counter(y_oversampled).items()))
    print('Oversampling: ', np.mean(cross_val_score(clf, X_oversampled, y_oversampled, scoring='f1', cv=cv_nums)))

    cc = ClusterCentroids(random_state=random_lst[1])
    X_undersampled, y_undersampled = cc.fit_sample(X, y)
    #print(sorted(Counter(y_undersampled).items()))
    print('Undersampling: ', np.mean(cross_val_score(clf, X_undersampled, y_undersampled, scoring='f1', cv=cv_nums)))

    sm = SMOTE(random_state=random_lst[2])
    X_smote, y_smote = sm.fit_sample(X, y)
    #print(sorted(Counter(y_smote).items()))
    print('SMOTE: ', np.mean(cross_val_score(clf, X_smote, y_smote, scoring='f1', cv=cv_nums)))

    # EasyEnsemble splits the majority class into several subsets, one per learner,
    # so each individual learner trains on under-sampled data while, globally, little
    # information is lost. For example, split the negative (majority) class into 10
    # parts while the positive class stays as a single part; train 10 learners,
    # each on one negative part plus the shared positive part.
    ee = EasyEnsemble(random_state=random_lst[3], n_subsets=10)
    X_ee, y_ee = ee.fit_sample(X, y)
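    # Hedged sketch (not part of the original snippet): one possible way to finish
    # the comparison, fitting a clone of clf on each EasyEnsemble subset and
    # reporting the mean cross-validated f1 over the subsets.
    from sklearn.base import clone
    subset_scores = [np.mean(cross_val_score(clone(clf), X_sub, y_sub,
                                             scoring='f1', cv=cv_nums))
                     for X_sub, y_sub in zip(X_ee, y_ee)]
    print('EasyEnsemble (mean over subsets): ', np.mean(subset_scores))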
Example #4
def ensemble_train(X, y, working_dir, n, name, svm=True):
    ees = EasyEnsemble(random_state=557, n_subsets=n)
    X_res, y_res = ees.fit_sample(X, y)

    try:
        # Force retraining: the raise below skips loading any cached model.
        raise Exception('Retrain')
        with open(working_dir + "/" + name + '.pkl', 'rb') as f1:
            clf = pickle.load(f1)
    except Exception:
        # scores = cross_val_score(clf, X, y, cv=4, scoring="roc_auc")
        # print("Name %s ROC_AUC: %0.2f (+/- %0.2f)" % (name, scores.mean(), scores.std() * 2))
        clf = []
        for i in range(len(X_res)):
            print(Counter(y_res[i]))
            if(svm):
                clfi = SVC(kernel="linear", probability=True)
            else:
                clfi = AdaBoostClassifier(n_estimators=20)
            #clfi=AdaBoostClassifier()
            clfi.fit(X_res[i], y_res[i])
            clf.append(clfi)
            scores = cross_val_score(clfi, X_res[i], y_res[i], cv=4, scoring="roc_auc")
            print("Name %s ROC_AUC: %0.2f (+/- %0.2f)" % (name, scores.mean(), scores.std() * 2))
        with open(working_dir + "/" + name + '.pkl', 'wb') as f1:
            pickle.dump(clf, f1)  
    return clf
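
A hedged sketch of how the list of per-subset classifiers returned by ensemble_train might be combined at prediction time; averaging predict_proba is one common choice (both SVC(probability=True) and AdaBoostClassifier expose it), and the function name and threshold below are illustrative assumptions.

import numpy as np

def ensemble_predict(clfs, X_test, threshold=0.5):
    # Average the positive-class probability over all per-subset models,
    # then threshold the mean to get hard binary labels.
    probs = np.mean([m.predict_proba(X_test)[:, 1] for m in clfs], axis=0)
    return (probs >= threshold).astype(int)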
Example #5
def test_fit_sample_half():
    # Define the sampling_strategy parameter
    sampling_strategy = {0: 2, 1: 3, 2: 3}

    # Create the sampling object
    ee = EasyEnsemble(sampling_strategy=sampling_strategy,
                      random_state=RND_SEED,
                      n_subsets=3)

    # Get the different subset
    X_resampled, y_resampled = ee.fit_sample(X, Y)

    X_gt = np.array([[[-0.58539673, 0.62515052], [0.85117925, 1.0185556],
                      [1.35269503, 0.44812421], [-1.23195149, 0.15427291],
                      [0.5220963, 0.11349303], [1.10915364, 0.05718352],
                      [0.59091459, 0.40692742], [0.22039505, 0.26469445]],
                     [[0.85117925, 1.0185556], [-0.58539673, 0.62515052],
                      [1.35269503, 0.44812421], [-2.10724436, 0.70263997],
                      [-1.23195149, 0.15427291], [0.59091459, 0.40692742],
                      [0.22039505, 0.26469445], [1.10915364, 0.05718352]],
                     [[0.85117925, 1.0185556], [-0.58539673, 0.62515052],
                      [-1.23195149, 0.15427291], [0.5220963, 0.11349303],
                      [1.35269503, 0.44812421], [1.10915364, 0.05718352],
                      [0.59091459, 0.40692742], [0.22039505, 0.26469445]]])
    y_gt = np.array([[0, 0, 1, 1, 1, 2, 2, 2], [0, 0, 1, 1, 1, 2, 2, 2],
                     [0, 0, 1, 1, 1, 2, 2, 2]])
    assert_array_equal(X_resampled, X_gt)
    assert_array_equal(y_resampled, y_gt)
Example #6
    def __init__(self, base_model, n_subsets):
        self.base_model = base_model
        self.n_subsets = n_subsets
        self.easy_ensemble = EasyEnsemble('auto',
                                          random_state=RAND_SEED,
                                          n_subsets=4)
        self.trained_based_models = []
Example #7
def test_fit_sample_auto():
    """Test the fit and sample routine with auto ratio."""

    # Define the ratio parameter
    ratio = 'auto'

    # Create the sampling object
    ee = EasyEnsemble(ratio=ratio,
                      random_state=RND_SEED,
                      return_indices=True,
                      n_subsets=3)

    # Get the different subset
    X_resampled, y_resampled, idx_under = ee.fit_sample(X, Y)

    X_gt = np.array([[[0.85117925, 1.0185556], [-0.58539673, 0.62515052],
                      [1.35269503, 0.44812421], [0.5220963, 0.11349303],
                      [1.10915364, 0.05718352], [0.22039505, 0.26469445]],
                     [[0.85117925, 1.0185556], [-0.58539673, 0.62515052],
                      [-1.23195149, 0.15427291], [-2.10724436, 0.70263997],
                      [0.22039505, 0.26469445], [1.10915364, 0.05718352]],
                     [[0.85117925, 1.0185556], [-0.58539673, 0.62515052],
                      [-1.23195149, 0.15427291], [0.5220963, 0.11349303],
                      [1.10915364, 0.05718352], [0.59091459, 0.40692742]]])
    y_gt = np.array([[0, 0, 1, 1, 2, 2], [0, 0, 1, 1, 2, 2],
                     [0, 0, 1, 1, 2, 2]])
    idx_gt = np.array([[5, 9, 4, 0, 2, 3], [5, 9, 8, 6, 3, 2],
                       [5, 9, 8, 0, 2, 1]])
    assert_array_equal(X_resampled, X_gt)
    assert_array_equal(y_resampled, y_gt)
    assert_array_equal(idx_under, idx_gt)
Example #8
def ezensemble(X_train, y_train):
    a = list(X_train)  # column names of the original DataFrame
    ee = EasyEnsemble(random_state=0, n_subsets=10)
    X_resampled, y_resampled = ee.fit_sample(X_train, y_train)  # fit_sample refits, so no separate fit() call is needed
    # Keep a single balanced subset (index 1) and restore the DataFrame structure
    X_resampled = pd.DataFrame(X_resampled[1], columns=a)
    y_resampled = pd.DataFrame(y_resampled[1], columns=['Target'])
    return X_resampled, y_resampled
Example #9
def test_continuous_error():
    """Test either if an error is raised when the target are continuous
    type"""

    # continuous case
    y = np.linspace(0, 1, 10)
    ee = EasyEnsemble(random_state=RND_SEED)
    assert_warns(UserWarning, ee.fit, X, y)
Example #10
def test_ee_fit_invalid_ratio():
    """Test either if an error is raised when the balancing ratio to fit is
    smaller than the one of the data"""

    # Create the object
    ratio = 1. / 10000.
    ee = EasyEnsemble(ratio=ratio, random_state=RND_SEED)
    # Fit the data
    assert_raises(RuntimeError, ee.fit, X, Y)
Example #11
def easy_ensemble(train_set, train_label):
    ee = EasyEnsemble(ratio='auto',
                      return_indices=True,
                      random_state=None,
                      replacement=False,
                      n_subsets=easy_ensemble_num)
    X_resampled, y_resampled, idx_resampled = ee.fit_sample(
        train_set, train_label)
    return X_resampled, y_resampled
Example #12
def test_ee_init():
    # Define a ratio
    ratio = 1.
    ee = EasyEnsemble(ratio=ratio, random_state=RND_SEED)

    assert_equal(ee.ratio, ratio)
    assert_equal(ee.replacement, False)
    assert_equal(ee.n_subsets, 10)
    assert_equal(ee.random_state, RND_SEED)
Example #13
def test_ee_init():
    # Define a ratio
    ratio = 1.
    ee = EasyEnsemble(ratio=ratio, random_state=RND_SEED)

    assert ee.ratio == ratio
    assert ee.replacement is False
    assert ee.n_subsets == 10
    assert ee.random_state == RND_SEED
Example #14
def EasySample(data):
    x = data.iloc[:, 0:2]
    y = data.iloc[:, -2]
    # Use the EasyEnsemble ensemble method to handle the imbalanced samples
    model_EasyEnsemble = EasyEnsemble()  # create the EasyEnsemble model object
    x_EasyEnsemble_resampled, y_EasyEnsemble_resampled = model_EasyEnsemble.fit_sample(
        x, y)  # feed in the data and apply the ensemble resampling
    print(x_EasyEnsemble_resampled.shape)  # print the shape of the resampled x sample set
    print(y_EasyEnsemble_resampled.shape)  # print the shape of the resampled y label set
Example #15
def test_random_state_none():
    # Define the sampling_strategy parameter
    sampling_strategy = 'auto'

    # Create the sampling object
    ee = EasyEnsemble(sampling_strategy=sampling_strategy, random_state=None)

    # Get the different subset
    X_resampled, y_resampled = ee.fit_sample(X, Y)
Example #16
def test_sample_wrong_X():
    """Test either if an error is raised when X is different at fitting
    and sampling"""

    # Create the object
    ee = EasyEnsemble(random_state=RND_SEED)
    ee.fit(X, Y)
    assert_raises(RuntimeError, ee.sample, np.random.random((100, 40)),
                  np.array([0] * 50 + [1] * 50))
Example #17
def test_sample_wt_fit():
    """Test either if an error is raised when sample is called before
    fitting"""

    # Define the parameter for the under-sampling
    ratio = 'auto'

    # Create the object
    ee = EasyEnsemble(ratio=ratio, random_state=RND_SEED)
    assert_raises(RuntimeError, ee.sample, X, Y)
Example #18
def test_ee_init():
    # Define a sampling_strategy
    sampling_strategy = 1.
    ee = EasyEnsemble(sampling_strategy=sampling_strategy,
                      random_state=RND_SEED)

    assert ee.sampling_strategy == sampling_strategy
    assert ee.replacement is False
    assert ee.n_subsets == 10
    assert ee.random_state == RND_SEED
Example #19
def test_random_state_none():
    """Test that the processing is going throw with random state being None."""

    # Define the ratio parameter
    ratio = 'auto'

    # Create the sampling object
    ee = EasyEnsemble(ratio=ratio, random_state=None)

    # Get the different subset
    X_resampled, y_resampled = ee.fit_sample(X, Y)
Example #20
def test_ee_init():
    """Test the initialisation of the object"""

    # Define a ratio
    ratio = 1.
    ee = EasyEnsemble(ratio=ratio, random_state=RND_SEED)

    assert_equal(ee.ratio, ratio)
    assert_equal(ee.replacement, False)
    assert_equal(ee.n_subsets, 10)
    assert_equal(ee.random_state, RND_SEED)
Example #21
    def generate_data(self, random_=1, random_ratio=2, random_test=0):
        imb_fun = {
            "smote": SMOTEENN(),
            "under": RandomUnderSampler(),
            "adasyn": ADASYN(),
            "ee": EasyEnsemble(),
            "smotetomek": SMOTETomek()
        }
        rand_features = []
        neg_labels = [0 for i in range(len(self.neg_features))]
        pos_labels = [1 for i in range(len(self.pos_features))]
        features = self.pos_features + self.neg_features
        labels = pos_labels + neg_labels
        if self.imba:
            for i in self.imba:
                features, labels = imb_fun[i].fit_sample(features, labels)
        if random_ == 1 and random_ratio > 0:
            for i in range(
                    int((len(self.pos_features) + len(self.neg_features)) *
                        random_ratio)):
                rand_features.append(
                    featurify(
                        ProteinAnalysis(
                            random_seq(locked=self.pos_seq,
                                       wing_size=self.window,
                                       center=self.amino_acid)),
                        (2 * self.window + 1)))
        if random_test == 0:

            temp = list(zip(features, labels))
            random.shuffle(temp)
            features, labels = zip(*temp)
            training_slice = int(self.training_ratio * len(labels))
            self.training_features = list(
                features[:training_slice]) + rand_features
            self.training_labels = list(labels[:training_slice]) + [
                0 for i in range(len(rand_features))
            ]

            self.test_features = features[training_slice:]
            self.test_labels = labels[training_slice:]

        else:
            features = features + rand_features

            labels = labels + [0 for i in range(len(rand_features))]
            temp = list(zip(features, labels))
            random.shuffle(temp)
            features, labels = zip(*temp)
            training_slice = int(self.training_ratio * len(labels))
            self.training_features = list(features[:training_slice])
            self.training_labels = list(labels[:training_slice])
            self.test_features = features[training_slice:]
            self.test_labels = labels[training_slice:]
Example #22
def test_ee_fit_single_class():
    """Test either if an error when there is a single class"""

    # Define the parameter for the under-sampling
    ratio = 'auto'

    # Create the object
    ee = EasyEnsemble(ratio=ratio, random_state=RND_SEED)
    # Resample the data
    # Create a wrong y
    y_single_class = np.zeros((X.shape[0], ))
    assert_warns(UserWarning, ee.fit, X, y_single_class)
Example #23
def test_ee_bad_ratio():
    """Test either if an error is raised with a wrong decimal value for
    the ratio"""

    # Define a negative ratio
    ratio = -1.0
    ee = EasyEnsemble(ratio=ratio)
    assert_raises(ValueError, ee.fit, X, Y)

    # Define a ratio greater than 1
    ratio = 100.0
    ee = EasyEnsemble(ratio=ratio)
    assert_raises(ValueError, ee.fit, X, Y)

    # Define ratio as an unknown string
    ratio = 'rnd'
    ee = EasyEnsemble(ratio=ratio)
    assert_raises(ValueError, ee.fit, X, Y)

    # Define ratio as a list which is not supported
    ratio = [.5, .5]
    ee = EasyEnsemble(ratio=ratio)
    assert_raises(ValueError, ee.fit, X, Y)
Example #24
def ensemble_model(X_train, y_train):

    # define the methods
    over = BorderlineSMOTE(k_neighbors=7, kind="borderline-1")
    under = EasyEnsemble(random_state=1)

    steps = [('o', over), ('u', under)]

    pipeline = Pipeline(steps=steps)

    # transform the dataset; EasyEnsemble returns one resampled set per subset,
    # so keep only the first one
    new_X_train, new_y_train = pipeline.fit_resample(X_train, y_train)

    return new_X_train[0], new_y_train[0]
Example #25
    def fit(self, train_x, train_y):
        self._estimators = []
        ee = EasyEnsemble(replacement=True, n_subsets=self._no_of_estimators)
        X_res, y_res = ee.fit_sample(train_x, train_y)

        for i in range(self._no_of_estimators):
            X, y = X_res[i, :, :], y_res[i, :]

            estimator = clone(self._base_classifier)
            estimator.fit(X, y)

            self._estimators.append(estimator)

        return self
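    # Hedged sketch (illustrative addition, not part of the original class): a
    # matching majority-vote predict over the per-subset estimators trained in
    # fit(); assumes a binary 0/1 target.
    def predict(self, test_x):
        import numpy as np
        votes = np.asarray([est.predict(test_x) for est in self._estimators])
        return (votes.mean(axis=0) >= 0.5).astype(int)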
Example #26
def get_downsampling_data(train_pth="data/train_data.npy",
                          val_pth="data/val_data.npy",
                          test_pth="data/test_data.npy"):
    train_arr = np.load(train_pth)
    train_data, train_flag = train_arr[:, :-1], train_arr[:, -1]
    ee = EasyEnsemble(random_state=0, n_subsets=10)
    train_data, train_flag = ee.fit_sample(train_data, train_flag)
    train_flag = np.array(train_flag, dtype=np.int)
    val_data = np.load(val_pth)[:, :-1]
    val_flag = np.load(val_pth)[:, -1]
    val_flag = np.array(val_flag, dtype=np.int)
    test_data = np.load(test_pth)[:, :-1]
    test_flag = np.load(test_pth)[:, -1]
    test_flag = np.array(test_flag, dtype=np.int)
    return train_data, train_flag, val_data, val_flag, test_data, test_flag
Example #27
def test_ee_fit():
    """Test the fitting method"""

    # Define the parameter for the under-sampling
    ratio = 'auto'

    # Create the object
    ee = EasyEnsemble(ratio=ratio, random_state=RND_SEED)
    # Fit the data
    ee.fit(X, Y)

    # Check that the class statistics have been computed
    assert_equal(ee.min_c_, 0)
    assert_equal(ee.maj_c_, 1)
    assert_equal(ee.stats_c_[0], 500)
    assert_equal(ee.stats_c_[1], 4500)
Example #28
def test_multiclass_fit_sample():
    """Test fit sample method with multiclass target"""

    # Make y to be multiclass
    y = Y.copy()
    y[0:1000] = 2

    # Resample the data
    ee = EasyEnsemble(random_state=RND_SEED)
    X_resampled, y_resampled = ee.fit_sample(X, y)

    # Check the size of y
    count_y_res = Counter(y_resampled[0])
    assert_equal(count_y_res[0], 400)
    assert_equal(count_y_res[1], 400)
    assert_equal(count_y_res[2], 400)
Example #29
def test_ee_init():
    """Test the initialisation of the object"""

    # Define a ratio
    ratio = 1.
    verbose = True
    ee = EasyEnsemble(ratio=ratio, random_state=RND_SEED, verbose=verbose)

    assert_equal(ee.ratio, ratio)
    assert_equal(ee.replacement, False)
    assert_equal(ee.n_subsets, 10)
    assert_equal(ee.random_state, RND_SEED)
    assert_equal(ee.verbose, verbose)
    assert_equal(ee.min_c_, None)
    assert_equal(ee.maj_c_, None)
    assert_equal(ee.stats_c_, {})
Example #30
def easy_ensemble_classifier(clf, x_train, y_train, x_test, nsubs, repl):
    ee = EasyEnsemble(n_subsets=nsubs,
                      replacement=repl)  # Create EasyEnsemble object
    X_train_res, y_train_res = ee.fit_sample(x_train,
                                             y_train)  # re-sample the data
    clfs = []
    i = 0
    preds_ = np.zeros([1, np.shape(x_test)[0]])

    # Iterate through the balanced sub-samples: fit an independent copy of the
    # base classifier on each one and accumulate its predictions on x_test.
    for xtrain in X_train_res:
        clfs.append(clone(clf))  # clone (sklearn.base.clone) so each subset gets its own estimator
        clfs[i].fit(xtrain, y_train_res[i])
        preds_ = np.add(preds_, clfs[i].predict(x_test))
        i += 1

    # Average the accumulated votes over the number of subsets
    return np.divide(preds_, nsubs)
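
Because easy_ensemble_classifier returns the per-sample average of the sub-classifiers' hard predictions, the caller still has to turn that score into labels. A hedged usage sketch follows; the base classifier, the toy data, and the 0.5 threshold are assumptions, and a binary 0/1 target is assumed.

import numpy as np
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

# Toy imbalanced data (illustrative only).
X, y = make_classification(n_samples=1000, weights=[0.9, 0.1], random_state=0)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

# Illustrative call: 5 balanced subsets, sampling without replacement.
avg_votes = easy_ensemble_classifier(DecisionTreeClassifier(), X_train, y_train,
                                     X_test, nsubs=5, repl=False)

# The averaged votes lie in [0, 1]; threshold them to recover hard 0/1 labels.
y_pred = (avg_votes >= 0.5).astype(int).ravel()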