def test_fit_sample_check_voting():
    cc = ClusterCentroids(random_state=RND_SEED)
    cc.fit_sample(X, Y)
    assert cc.voting_ == 'soft'
    cc = ClusterCentroids(random_state=RND_SEED)
    cc.fit_sample(sparse.csr_matrix(X), Y)
    assert cc.voting_ == 'hard'
def test_fit_sample_error():
    ratio = 'auto'
    cluster = 'rnd'
    cc = ClusterCentroids(
        ratio=ratio, random_state=RND_SEED, estimator=cluster)
    with raises(ValueError, match="has to be a KMeans clustering"):
        cc.fit_sample(X, Y)

    voting = 'unknown'
    cc = ClusterCentroids(ratio=ratio, voting=voting, random_state=RND_SEED)
    with raises(ValueError, match="needs to be one of"):
        cc.fit_sample(X, Y)
Example #5
def buildModel(clf, X, y, cv_nums=10, is_random=False):
    # whether to randomise the resampling seeds
    if is_random:
        random_lst = list(np.random.randint(0, 1000, 4))
    else:
        random_lst = [0] * 4

    print('---------- Results of the class-imbalance handling methods: mean F1 of ' + str(cv_nums) + '-fold cross-validation ----------')
    # no resampling: predict on the original dataset
    print('Original dataset: ', np.mean(cross_val_score(clf, X, y, scoring='f1', cv=cv_nums)))

    ros = RandomOverSampler(random_state=random_lst[0])
    X_oversampled, y_oversampled = ros.fit_sample(X, y)
    # print(sorted(Counter(y_oversampled).items()))
    print('Over-sampling: ', np.mean(cross_val_score(clf, X_oversampled, y_oversampled, scoring='f1', cv=cv_nums)))

    cc = ClusterCentroids(random_state=random_lst[1])
    X_undersampled, y_undersampled = cc.fit_sample(X, y)
    # print(sorted(Counter(y_undersampled).items()))
    print('Under-sampling: ', np.mean(cross_val_score(clf, X_undersampled, y_undersampled, scoring='f1', cv=cv_nums)))

    sm = SMOTE(random_state=random_lst[2])
    X_smote, y_smote = sm.fit_sample(X, y)
    # print(sorted(Counter(y_smote).items()))
    print('SMOTE: ', np.mean(cross_val_score(clf, X_smote, y_smote, scoring='f1', cv=cv_nums)))

    # Split the majority class into several subsets, each used by a different
    # learner: from each learner's point of view the data are under-sampled,
    # but globally no important information is lost. E.g. split the negative
    # class into 10 parts while the positive class forms 1 part, then train 10
    # learners, each using 1 part of the negatives plus the shared positives.
    ee = EasyEnsemble(random_state=random_lst[3], n_subsets=10)
    X_ee, y_ee = ee.fit_sample(X, y)
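    # (The original snippet is truncated here; what follows is a hypothetical
    # continuation, not part of the source. EasyEnsemble.fit_sample returns
    # one resampled set per subset, i.e. X_ee has shape
    # (n_subsets, n_samples, n_features), so each subset is scored separately
    # and the scores averaged.)
    ee_scores = [np.mean(cross_val_score(clf, X_sub, y_sub, scoring='f1', cv=cv_nums))
                 for X_sub, y_sub in zip(X_ee, y_ee)]
    print('EasyEnsemble (mean over subsets): ', np.mean(ee_scores))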
def test_fit_sample_auto():
    ratio = 'auto'
    cc = ClusterCentroids(ratio=ratio, random_state=RND_SEED)
    X_resampled, y_resampled = cc.fit_sample(X, Y)
    X_gt = np.array([[0.92923648, 0.76103773], [0.47104475, 0.44386323],
                     [0.13347175, 0.12167502], [0.06738818, -0.529627],
                     [0.17901516, 0.69860992], [0.094035, -2.55298982]])
    y_gt = np.array([0, 0, 0, 1, 1, 1])
    assert_allclose(X_resampled, X_gt, rtol=R_TOL)
    assert_array_equal(y_resampled, y_gt)
def test_multiclass_fit_sample():
    y = Y.copy()
    y[5] = 2
    y[6] = 2
    cc = ClusterCentroids(random_state=RND_SEED)
    X_resampled, y_resampled = cc.fit_sample(X, y)
    count_y_res = Counter(y_resampled)
    assert count_y_res[0] == 2
    assert count_y_res[1] == 2
    assert count_y_res[2] == 2
def test_fit_sample_half():
    ratio = .5
    cc = ClusterCentroids(ratio=ratio, random_state=RND_SEED)
    X_resampled, y_resampled = cc.fit_sample(X, Y)
    X_gt = np.array([[0.92923648, 0.76103773], [0.47104475, 0.44386323],
                     [0.13347175, 0.12167502], [0.09125309, -0.85409574],
                     [0.19220316, 0.32337101], [0.094035, -2.55298982],
                     [0.20792588, 1.49407907], [0.04352327, -0.20515826],
                     [0.12372842, 0.6536186]])
    y_gt = np.array([0, 0, 0, 1, 1, 1, 1, 1, 1])
    assert_allclose(X_resampled, X_gt, rtol=R_TOL)
    assert_array_equal(y_resampled, y_gt)
Example #11
def test_multiclass_fit_sample():
    # Make y to be multiclass
    y = Y.copy()
    y[5] = 2
    y[6] = 2

    # Resample the data
    cc = ClusterCentroids(random_state=RND_SEED)
    X_resampled, y_resampled = cc.fit_sample(X, y)

    # Check the size of y
    count_y_res = Counter(y_resampled)
    assert_equal(count_y_res[0], 2)
    assert_equal(count_y_res[1], 2)
    assert_equal(count_y_res[2], 2)
def test_fit_sample_object():
    sampling_strategy = 'auto'
    cluster = KMeans(random_state=RND_SEED)
    cc = ClusterCentroids(
        sampling_strategy=sampling_strategy,
        random_state=RND_SEED,
        estimator=cluster)

    X_resampled, y_resampled = cc.fit_sample(X, Y)
    X_gt = np.array([[0.92923648, 0.76103773], [0.47104475, 0.44386323],
                     [0.13347175, 0.12167502], [0.06738818, -0.529627],
                     [0.17901516, 0.69860992], [0.094035, -2.55298982]])
    y_gt = np.array([0, 0, 0, 1, 1, 1])
    assert_allclose(X_resampled, X_gt, rtol=R_TOL)
    assert_array_equal(y_resampled, y_gt)
Example #13
def test_multiclass_fit_sample():
    """Test fit sample method with multiclass target"""

    # Make y to be multiclass
    y = Y.copy()
    y[0:1000] = 2

    # Resample the data
    cc = ClusterCentroids(random_state=RND_SEED)
    X_resampled, y_resampled = cc.fit_sample(X, y)

    # Check the size of y
    count_y_res = Counter(y_resampled)
    assert_equal(count_y_res[0], 400)
    assert_equal(count_y_res[1], 400)
    assert_equal(count_y_res[2], 400)
Example #15
def UnderSample(X, Y, method='Random', random_state=42):
    if X.size == len(X):
        X = X.reshape(-1, 1)
    # string comparison must use '==', not 'is' (which only works by
    # CPython interning accident)
    if method == 'Cluster':  # defaults to a KMeans estimator
        sampler = ClusterCentroids(ratio='auto',
                                   random_state=random_state,
                                   estimator=None)
    elif method == 'Random':
        sampler = RandomUnderSampler(ratio='auto',
                                     random_state=random_state,
                                     replacement=False)
    elif method == 'NearMiss_1':
        sampler = NearMiss(ratio='auto', random_state=random_state, version=1)
    elif method == 'NearMiss_2':
        sampler = NearMiss(ratio='auto', random_state=random_state, version=2)
    elif method == 'NearMiss_3':
        sampler = NearMiss(ratio='auto', random_state=random_state, version=3)
    elif method == 'TomekLinks':
        sampler = TomekLinks(ratio='auto', random_state=random_state)
    elif method == 'ENN':  # kind_sel can be 'all' or 'mode'
        sampler = EditedNearestNeighbours(ratio='auto',
                                          random_state=random_state,
                                          kind_sel='all')
    elif method == 'RENN':  # kind_sel can be 'all' or 'mode'
        sampler = RepeatedEditedNearestNeighbours(ratio='auto',
                                                  random_state=random_state,
                                                  kind_sel='all')
    elif method == 'All_KNN':
        sampler = AllKNN(ratio='auto',
                         random_state=random_state,
                         kind_sel='all')
    elif method == 'CNN':
        sampler = CondensedNearestNeighbour(ratio='auto',
                                            random_state=random_state)
    elif method == 'One_SS':
        sampler = OneSidedSelection(ratio='auto', random_state=random_state)
    elif method == 'NCR':
        sampler = NeighbourhoodCleaningRule(ratio='auto',
                                            random_state=random_state,
                                            kind_sel='all',
                                            threshold_cleaning=0.5)
    elif method == 'IHT':
        sampler = InstanceHardnessThreshold(estimator=None,
                                            ratio='auto',
                                            random_state=random_state)
    else:  # avoid a NameError on sampler for unknown methods
        raise ValueError('Unknown under-sampling method: %s' % method)
    X_resampled, Y_resampled = sampler.fit_sample(X, Y)
    return X_resampled, Y_resampled
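A quick usage sketch for the helper above; the dataset here is a synthetic placeholder, not taken from the original source:

import numpy as np
from collections import Counter

X_demo = np.random.rand(200, 3)
y_demo = np.array([0] * 180 + [1] * 20)
X_res, Y_res = UnderSample(X_demo, y_demo, method='Random', random_state=42)
print(sorted(Counter(Y_res).items()))  # with ratio='auto' both classes end up at 20 samples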
def test_fit_sample_half():
    """Test fit and sample routines with ratio of .5"""

    # Define the parameter for the under-sampling
    ratio = .5

    # Create the object
    cc = ClusterCentroids(ratio=ratio, random_state=RND_SEED)

    # Fit and sample
    X_resampled, y_resampled = cc.fit_sample(X, Y)

    currdir = os.path.dirname(os.path.abspath(__file__))
    X_gt = np.load(os.path.join(currdir, 'data', 'cc_x_05.npy'))
    y_gt = np.load(os.path.join(currdir, 'data', 'cc_y_05.npy'))
    assert_array_equal(X_resampled, X_gt)
    assert_array_equal(y_resampled, y_gt)
def test_fit_sample_half():
    ratio = {0: 3, 1: 6}
    cc = ClusterCentroids(ratio=ratio, random_state=RND_SEED)
    X_resampled, y_resampled = cc.fit_sample(X, Y)
    X_gt = np.array([[0.92923648, 0.76103773],
                     [0.13347175, 0.12167502],
                     [0.47104475, 0.44386323],
                     [0.09125309, -0.85409574],
                     [0.19220316, 0.32337101],
                     [0.094035, -2.55298982],
                     [0.20792588, 1.49407907],
                     [0.04352327, -0.20515826],
                     [0.12372842, 0.6536186]])
    y_gt = np.array([0, 0, 0, 1, 1, 1, 1, 1, 1])
    assert_allclose(X_resampled, X_gt, rtol=R_TOL)
    assert_array_equal(y_resampled, y_gt)
    def kmeans_undersample(self):
        '''
        Undersample majority class with its centroids
        '''
        df = self.data

        cc = ClusterCentroids(voting='soft', n_jobs=-1)
        data = df[self.features].values  # .as_matrix() was removed in pandas 1.0
        labels = df['label']
        data_resampled, label_resampled = cc.fit_sample(data, labels)

        df2 = pd.DataFrame(data_resampled.tolist(), columns=self.features)

        df2['label'] = label_resampled
        df2['cluster'] = 0
        df2['original'] = 0

        return df2
Example #20
def test_fit_hard_voting():
    ratio = 'auto'
    voting = 'hard'
    cluster = KMeans(random_state=RND_SEED)
    cc = ClusterCentroids(ratio=ratio,
                          random_state=RND_SEED,
                          estimator=cluster,
                          voting=voting)

    X_resampled, y_resampled = cc.fit_sample(X, Y)
    X_gt = np.array([[0.92923648, 0.76103773], [0.47104475, 0.44386323],
                     [0.13347175, 0.12167502], [0.09125309, -0.85409574],
                     [0.12372842, 0.6536186], [0.094035, -2.55298982]])
    y_gt = np.array([0, 0, 0, 1, 1, 1])
    assert_allclose(X_resampled, X_gt, rtol=R_TOL)
    assert_array_equal(y_resampled, y_gt)
    for x in X_resampled:
        assert np.any(np.all(x == X, axis=1))
Example #21
def test_fit_sample_auto():
    """Test fit and sample routines with auto ratio"""

    # Define the parameter for the under-sampling
    ratio = 'auto'

    # Create the object
    cc = ClusterCentroids(ratio=ratio, random_state=RND_SEED)

    # Fit and sample
    X_resampled, y_resampled = cc.fit_sample(X, Y)

    X_gt = np.array([[0.92923648, 0.76103773], [0.47104475, 0.44386323],
                     [0.13347175, 0.12167502], [0.06738818, -0.529627],
                     [0.17901516, 0.69860992], [0.094035, -2.55298982]])
    y_gt = np.array([0, 0, 0, 1, 1, 1])
    assert_array_almost_equal(X_resampled, X_gt)
    assert_array_equal(y_resampled, y_gt)
Example #23
def precalculate_nearest_neighbors(
        reference_taxonomy: Series, reference_sequences: DNAIterator,
        max_centroids_per_class: int=10,
        feature_extractor_specification: str=_default_feature_extractor,
        knn_classifier_specification: str=_default_knn_classifier,
        n_jobs: int=1, random_state: int=42) -> dict:
    spec = json.loads(feature_extractor_specification)
    feat_ext = pipeline_from_spec(spec)
    if not isinstance(feat_ext.steps[-1][-1], TransformerMixin):
        raise ValueError('feature_extractor_specification must specify a '
                         'transformer')
    spec = json.loads(knn_classifier_specification)
    nn = pipeline_from_spec(spec)
    if not isinstance(nn.steps[-1][-1], KNeighborsMixin):
        raise ValueError('knn_classifier_specification must specify a '
                         'KNeighbors classifier')

    seq_ids, X = _extract_reads(reference_sequences)
    data = [(reference_taxonomy[s], x)
            for s, x in zip(seq_ids, X) if s in reference_taxonomy]
    y, X = list(zip(*data))
    X = feat_ext.transform(X)

    if max_centroids_per_class > 0:
        class_counts = Counter(y)
        undersample_classes = {t: max_centroids_per_class
                               for t, c in class_counts.items()
                               if c > max_centroids_per_class}
        cc = ClusterCentroids(random_state=random_state, n_jobs=n_jobs,
                              ratio=undersample_classes, voting='hard')
        X_resampled, y_resampled = cc.fit_sample(X, y)
    else:
        X_resampled, y_resampled = X, y

    if 'n_jobs' in nn.steps[-1][-1].get_params():
        nn.steps[-1][-1].set_params(n_jobs=n_jobs)
    nn.fit(X_resampled)
    nn = nn.steps[-1][-1]
    if n_jobs != 1 and hasattr(X_resampled, 'todense'):
        indices = nn.kneighbors(X_resampled.todense(), return_distance=False)
    else:
        indices = nn.kneighbors(X_resampled, return_distance=False)
    return {'neighbors': indices.tolist(), 'taxonomies': y_resampled.tolist()}
Example #24
def test_fit_sample_object():
    # Define the parameter for the under-sampling
    ratio = 'auto'

    # Create the object
    cluster = KMeans(random_state=RND_SEED)
    cc = ClusterCentroids(ratio=ratio,
                          random_state=RND_SEED,
                          estimator=cluster)

    # Fit and sample
    X_resampled, y_resampled = cc.fit_sample(X, Y)

    X_gt = np.array([[0.92923648, 0.76103773], [0.47104475, 0.44386323],
                     [0.13347175, 0.12167502], [0.06738818, -0.529627],
                     [0.17901516, 0.69860992], [0.094035, -2.55298982]])
    y_gt = np.array([0, 0, 0, 1, 1, 1])
    assert_allclose(X_resampled, X_gt, rtol=R_TOL)
    assert_array_equal(y_resampled, y_gt)
Example #25
def undersample_cluster_centroid(X,
                                 y,
                                 label='Cluster Centroids under-sampling',
                                 plot=False,
                                 sampling_strategy='auto',
                                 random_state=None,
                                 estimator=None,
                                 voting='auto',
                                 n_jobs=-1,
                                 ratio=None):
    '''
    voting : str, optional (default='auto')
        Voting strategy to generate the new samples:

        If 'hard', the nearest neighbors of the centroids found by the
        clustering algorithm are used.
        If 'soft', the centroids found by the clustering algorithm are used.
    '''
    cc = ClusterCentroids(sampling_strategy=sampling_strategy,
                          random_state=random_state,
                          estimator=estimator,
                          voting=voting,
                          n_jobs=n_jobs,
                          ratio=ratio)
    X_cc, y_cc = cc.fit_sample(X, y)
    X_cc = pd.DataFrame(X_cc, columns=X.columns)
    y_cc = pd.Series(y_cc, name=y.name)
    if plot:
        # plotting using pca
        pca = PCA(n_components=2)
        X_pca = pd.DataFrame(pca.fit_transform(X_cc))
        colors = ['#1F77B4', '#FF7F0E']
        markers = ['o', 's']
        for l, c, m in zip(np.unique(y_cc), colors, markers):
            plt.scatter(
                X_pca.loc[y_cc.sort_index() == l, 0],  # pc 1
                X_pca.loc[y_cc.sort_index() == l, 1],  # pc 2
                c=c,
                label=l,
                marker=m)
        plt.title(label)
        plt.legend(loc='upper right')
        plt.show()
    return X_cc, y_cc, cc
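A minimal usage sketch for the function above, assuming the pandas DataFrame/Series inputs it expects; the synthetic dataset is illustrative only:

import pandas as pd
from sklearn.datasets import make_classification

X_arr, y_arr = make_classification(n_samples=500, weights=[0.9, 0.1], random_state=0)
X_df = pd.DataFrame(X_arr, columns=['f%d' % i for i in range(X_arr.shape[1])])
y_sr = pd.Series(y_arr, name='target')
# voting='soft' keeps the synthetic centroids; 'hard' snaps them to real samples
X_bal, y_bal, sampler = undersample_cluster_centroid(X_df, y_sr, voting='soft')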
Example #27
def test_fit_sample_half():
    """Test fit and sample routines with ratio of .5"""

    # Define the parameter for the under-sampling
    ratio = .5

    # Create the object
    cc = ClusterCentroids(ratio=ratio, random_state=RND_SEED)

    # Fit and sample
    X_resampled, y_resampled = cc.fit_sample(X, Y)

    X_gt = np.array([[0.92923648, 0.76103773], [0.47104475, 0.44386323],
                     [0.13347175, 0.12167502], [0.09125309, -0.85409574],
                     [0.19220316, 0.32337101], [0.094035, -2.55298982],
                     [0.20792588, 1.49407907], [0.04352327, -0.20515826],
                     [0.12372842, 0.6536186]])
    y_gt = np.array([0, 0, 0, 1, 1, 1, 1, 1, 1])
    assert_array_almost_equal(X_resampled, X_gt)
    assert_array_equal(y_resampled, y_gt)
def createUnderAndOverSample(given_data, outputdata_filename, max_len, codebook):
    dataX = []
    dataY = []
    for xx in given_data:
        dataX.append(xx[0:-1])
        dataY.append(xx[-1])

    X = pad_sequences(dataX, maxlen=max_len, dtype='float32')
    X_norm = X / (float(len(codebook)))
    y_norm = numpy.array(dataY)

    # perform over- or under-sampling
    sm_over = SMOTE(kind='borderline2')
    X_res_over, y_res_over = sm_over.fit_sample(X_norm, y_norm)
    sm_under = ClusterCentroids()
    X_res_under, y_res_under = sm_under.fit_sample(X_norm, y_norm)
        

    X_d_under = X_res_under * (float(len(codebook)))
    X_d_over = X_res_over * (float(len(codebook)))
    
    writeSampledSequences(X_d_under, y_res_under, codebook, "under/"+outputdata_filename)
    writeSampledSequences(X_d_over, y_res_over, codebook, "over/"+outputdata_filename)
class ClassRebalancer(DataStorer):
    '''
    Rebalances the classes according to sampling_strategy and balance_method during initialization.
    This helps reduce the volume of data to compute without losing information about the decision boundary.
    '''

    def __init__(self, balance_method_name='undersample_centroid', sampling_strategy=0.5, *args, **kwargs):
        '''
        Store balance_method_name on the instance so it is easy to check which
        method was used, rebalance the classes, and init DataStorer with the
        data contained in *args and **kwargs

        :param balance_method_name:  str
            name of the rebalancing method (imblearn)
        :param sampling_strategy:  float or str ('auto')
            desired class-balance ratio after rebalancing
        :param args: tuple
            here should be X, y for init of the DataStorer class
        :param kwargs:  dict
            here should be X, y for init of the DataStorer class
        '''
        self.balance_method_name = balance_method_name

        super().__init__(*args, **kwargs)  # here init DataStorer for further potentially using it in rebalance_classes

        if self.balance_method_name == 'undersample_centroid':
            self.balance_method = ClusterCentroids(sampling_strategy=sampling_strategy)
            self.rebalance_classes()
        else:
            print(f'balance_method_name: {self.balance_method_name} does not match any known method; classes were not rebalanced')

    def rebalance_classes(self):
        '''
        Just rebalances the data and displays information about changes in class balance
        '''
        print(f'Changing balances from {Counter(self.y).items()}')
        self.X, self.y = self.balance_method.fit_sample(self.X, self.y)
        print(f'to {Counter(self.y).items()}')
Example #31
    def learning(self):
        self.models = []
        self.alphas = []

        N, _ = self.X.shape
        W = np.ones(N) / N
        for i in range(self.k):
            print(i)
            cus = ClusterCentroids(ratio='majority')
            x_undersampled, y_undersampled = cus.fit_sample(self.X, self.Y)
            cl = tree.DecisionTreeClassifier(splitter='best')
            cl.fit(x_undersampled, y_undersampled)

            P = cl.predict(self.X)

            err = np.sum(W[P != self.Y])

            # NOTE: decrementing the loop variable has no effect inside a
            # Python for loop, so weak learners with err > 0.5 are not
            # actually retried here.
            if err > 0.5:
                i = i - 1
            if err <= 0:
                err = 0.0000001
            else:
                try:
                    if (np.log(1 - err) - np.log(err)) == 0:
                        alpha = 0
                    else:
                        alpha = 0.5 * (np.log(1 - err) - np.log(err))
                    # vectorized AdaBoost weight update (assumes labels in {-1, +1})
                    W = W * np.exp(-alpha * self.Y * P)
                    W = W / W.sum()  # normalize so it sums to 1
                except Exception:
                    alpha = 0
                    # W = W * np.exp(-alpha * self.Y * P)  # vectorized form
                    W = W / W.sum()  # normalize so it sums to 1

                self.models.append(cl)
                self.alphas.append(alpha)
Example #32
The difference is that, for Borderline-2 SMOTE, the random sample b may belong to any class;
SVM SMOTE: kind='svm', uses a support-vector-machine classifier to produce support vectors and then generates new minority-class samples.
'''



'''
Under-sampling
Prototype generation
Given a dataset S, a prototype-generation algorithm produces a subset S' with |S'| < |S|, but the subset's samples are not taken directly from the original dataset.
In other words, prototype generation reduces the number of samples, and the remaining samples are generated from, rather than drawn from, the original data.
ClusterCentroids implements exactly this: each class is synthesised from the centroids of a K-Means run instead of being randomly drawn from the original samples.
'''
from imblearn.under_sampling import ClusterCentroids
cc = ClusterCentroids(random_state=0)
X_resampled, y_resampled = cc.fit_sample(X, y)
print(sorted(Counter(y_resampled).items()))
# ClusterCentroids provides a very efficient way to reduce the number of samples, but note that it works best when the original data actually cluster well.
# Also, the number of centroids should be chosen so that the down-sampled clusters represent the original data well.
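# A small sketch of pinning the number of centroids per class explicitly with
# a dict (the labels/counts are illustrative; each listed class must already
# contain at least the requested number of samples):
cc_fixed = ClusterCentroids(ratio={0: 10, 1: 10}, random_state=0)
X_cc_fixed, y_cc_fixed = cc_fixed.fit_sample(X, y)
print(sorted(Counter(y_cc_fixed).items()))  # expect [(0, 10), (1, 10)]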
'''
Prototype selection
Unlike prototype generation, prototype-selection algorithms draw samples directly from the original dataset.
The methods fall roughly into two classes: (i) the controlled under-sampling techniques and
(ii) the cleaning under-sampling techniques.
The first class lets the user specify the number of samples in the under-sampled subset; the second does not accept that kind of user control.
'''
# RandomUnderSampler is a fast and very simple way to balance the classes: it randomly selects a subset of the data.
from imblearn.under_sampling import RandomUnderSampler  # under-sampling class
rus = RandomUnderSampler(random_state=0)
X_resampled, y_resampled = rus.fit_sample(X, y)
print(sorted(Counter(y_resampled).items()))
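# The "controlled" techniques also accept an explicit per-class target as a
# dict (a sketch; the counts are illustrative and must not exceed the
# original class sizes):
rus_fixed = RandomUnderSampler(ratio={0: 50, 1: 50}, random_state=0)
X_ctrl, y_ctrl = rus_fixed.fit_sample(X, y)
print(sorted(Counter(y_ctrl).items()))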
from imblearn.under_sampling import ClusterCentroids

# Generate the dataset
X, y = make_classification(n_classes=2, class_sep=2, weights=[0.1, 0.9],
                           n_informative=3, n_redundant=1, flip_y=0,
                           n_features=20, n_clusters_per_class=1,
                           n_samples=5000, random_state=10)

# Instantiate a PCA object for the sake of easy visualisation
pca = PCA(n_components=2)
# Fit and transform x to visualise inside a 2D feature space
X_vis = pca.fit_transform(X)

# Apply Cluster Centroids
cc = ClusterCentroids()
X_resampled, y_resampled = cc.fit_sample(X, y)
X_res_vis = pca.transform(X_resampled)

# Two subplots, unpack the axes array immediately
f, (ax1, ax2) = plt.subplots(1, 2)

ax1.scatter(X_vis[y == 0, 0], X_vis[y == 0, 1], label="Class #0", alpha=0.5,
            edgecolor=almost_black, facecolor=palette[0], linewidth=0.15)
ax1.scatter(X_vis[y == 1, 0], X_vis[y == 1, 1], label="Class #1", alpha=0.5,
            edgecolor=almost_black, facecolor=palette[2], linewidth=0.15)
ax1.set_title('Original set')

ax2.scatter(X_res_vis[y_resampled == 0, 0], X_res_vis[y_resampled == 0, 1],
            label="Class #0", alpha=.5, edgecolor=almost_black,
            facecolor=palette[0], linewidth=0.15)
ax2.scatter(X_res_vis[y_resampled == 1, 0], X_res_vis[y_resampled == 1, 1],
            label="Class #1", alpha=.5, edgecolor=almost_black,
            facecolor=palette[2], linewidth=0.15)
Example #34
import sys, os, csv
from imblearn.under_sampling import ClusterCentroids
input_csv_file = sys.argv[1]
input_csv = input_csv_file.split(".csv")[0]
with open(input_csv_file, newline="") as input_file:
    reader = csv.reader(input_file, delimiter=',')
    with open(input_csv + "-cc-.csv", 'w', newline='') as output_file:
        writer = csv.writer(output_file, delimiter=',')
        skip_header = True
        X = []
        y = []
        cc = ClusterCentroids()
        for x in reader:
            if skip_header:
                skip_header = False
                continue
            y.append(x[-1])
            X.append(list(map(int, x[:len(x) - 1])))
            #print (X)
        X_res, y_res = cc.fit_sample(X, y)
        print(len(X_res))
        print(len(y_res))
        for idx, s in enumerate(X_res):
            # write the label as a single trailing column; list(y_res[idx])
            # would split a string label into individual characters
            writer.writerow(list(s) + [y_res[idx]])
Example #35
            print("")

            
            print('-----------------')
            best_dict[imbalance] = [clf, roc_auc_score(y_test, clf.predict(X_test))]

# analysis with just ClusterCentroids (the best-performing imbalance handler)
classifiers = [LogisticRegression(), SVC(probability=True),
                      GaussianNB(), DecisionTreeClassifier(), RandomForestClassifier(),
                      KNeighborsClassifier(n_neighbors=6)]

cc = ClusterCentroids()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, 
                                                         random_state=4444)
        
X_train, y_train = cc.fit_sample(X_train, y_train)


fprs,tprs,roc_aucs = [],[],[]
for clf in classifiers:
    clf.fit(X_train,y_train)
    y_pred = clf.predict_proba(X_test)[:,1]

    y_true = y_test
    
    fpr, tpr, _ = roc_curve(y_true, y_pred)
    roc_auc = auc(fpr, tpr)
    
    fprs.append(fpr)
    tprs.append(tpr)
    roc_aucs.append(roc_auc)
Example #36
    'Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct',
    'Nov', 'Dec'
],
                                       ordered=True,
                                       inplace=True)
# Convert data in test set
apply_cats(df=df_test, trn=df_raw)

# Convert category to numerical and replace missing data
df, y, nas = proc_df(df_raw, 'FraudFound_P')
X_test, _, nas = proc_df(df_test, na_dict=nas)
df, y, nas = proc_df(df_raw, 'FraudFound_P', na_dict=nas)

# Undersample majority class
cc = ClusterCentroids(ratio={0: 6650}, n_jobs=-1)
X_cc_full, y_cc_full = cc.fit_sample(df, y)
plot_2d_space(X_cc_full, y_cc_full, 'Cluster Centroids under-sampling')

# Model(LGBoost)
lgb_train = lgb.Dataset(X_cc_full, y_cc_full, free_raw_data=False)
# Parametes for lgboost
parameters = {
    'num_leaves': 2**5,
    'learning_rate': 0.05,
    'is_unbalance': True,
    'min_split_gain': 0.03,
    'min_child_weight': 1,
    'reg_lambda': 1,
    'subsample': 1,
    'objective': 'binary',
    'task': 'train'
        plt.text(j, i, cm[i, j],
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')


#define X y
X, y = data.loc[:,data.columns != 'state'].values, data.loc[:,data.columns == 'state'].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

#ClusterCentroids
cc = ClusterCentroids(random_state=0)
os_X,os_y = cc.fit_sample(X_train,y_train)

#XGboost
clf_XG = XGBClassifier(learning_rate= 0.3, min_child_weight=1,
                       max_depth=6,gamma=0,subsample=1, max_delta_step=0, colsample_bytree=1,
                       reg_lambda=1, n_estimators=100, seed=1000, scale_pos_weight=1000)  
clf_XG.fit(os_X, os_y,eval_set=[(os_X, os_y), (X_test, y_test)],eval_metric='auc',verbose=False)  
evals_result = clf_XG.evals_result()  
y_true, y_pred = y_test, clf_XG.predict(X_test)  

#F1_score, precision, recall, specificity, G score
print("F1_score : %.4g" % metrics.f1_score(y_true, y_pred))
print("Recall : %.4g" % metrics.recall_score(y_true, y_pred))
recall = metrics.recall_score(y_true, y_pred)
print("Precision : %.4g" % metrics.precision_score(y_true, y_pred))
Example #38
    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')


#define X y
X, y = data.loc[:, data.columns != 'state'].values, data.loc[:, data.columns ==
                                                             'state'].values
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.3,
                                                    random_state=0)

#ClusterCentroids
cc = ClusterCentroids(random_state=0)
os_X, os_y = cc.fit_sample(X_train, y_train)

#Random Forest
clf_RF = RandomForestClassifier(n_estimators=10,
                                max_depth=None,
                                min_samples_split=2,
                                random_state=0)
clf_RF.fit(os_X, os_y)
y_true, y_pred = y_test, clf_RF.predict(X_test)

#F1_score, precision, recall, specificity, G score
print("F1_score : %.4g" % metrics.f1_score(y_true, y_pred))
print("Recall : %.4g" % metrics.recall_score(y_true, y_pred))
recall = metrics.recall_score(y_true, y_pred)
print("Precision : %.4g" % metrics.precision_score(y_true, y_pred))
from imblearn.under_sampling import ClusterCentroids
import pandas as pd
import numpy as np

benchmark = pd.read_csv("./data/feature_new.csv", sep='\t')
benchmark['label'] = (benchmark['label'] == "acr").astype(int)
X = benchmark[["len", "function", "codon", "dev", "hth"]]
y = benchmark['label']

cc = ClusterCentroids(sampling_strategy={0: 25158}, n_jobs=1, random_state=0)

X_smt, y_smt = cc.fit_sample(X, y)

# fit_sample returns numpy arrays, so rebuild a DataFrame/Series before concatenating
X_smt = pd.DataFrame(X_smt, columns=X.columns)
y_smt = pd.Series(y_smt, name='label')

new_benchmark = pd.concat([y_smt, X_smt], axis=1)
new_benchmark.to_csv("./data/feature_CC.csv", sep="\t", index=False)
Example #40
    def clustercentroidundersample(self, x_train, y_train):
        cc = ClusterCentroids()
        X_cc, y_cc = cc.fit_sample(x_train, y_train)
        return X_cc, y_cc
Example #41
# Split the data into features and labels
dataset = dataframe.values
X = dataset[0:8330, 0:138].astype(float)
Yn = dataset[0:8330:, 138]
Y = np_utils.to_categorical(Yn)
scaler = Normalizer('l2').fit(X)
X_normalized = scaler.transform(X)

# Split the data into training and validation sets, 60-40
X_train, X_test, y_train, y_test = train_test_split(X_normalized,
                                                    Yn,
                                                    test_size=0.4,
                                                    random_state=42)

## Balance the data with SMOTE
sm = SMOTE(random_state=12, ratio=1.0)
X_train1, Y_train1 = sm.fit_sample(X_train, y_train)
## Convert y_train and y_test to one-hot vectors
y_train1 = np_utils.to_categorical(Y_train1)
y_test1 = np_utils.to_categorical(y_test)

# Balance the data with under-sampling

#print(sorted(Counter(y_train).items()))
cc = ClusterCentroids(random_state=0)
X_train2, Y_train2 = cc.fit_sample(X_train, y_train)
#print(sorted(Counter(y_resampled).items())
y_train2 = np_utils.to_categorical(Y_train2)
y_test2 = np_utils.to_categorical(y_test)
'''
Under-sampling
Prototype generation
Given a dataset S, a prototype-generation algorithm produces a subset S' with |S'| < |S|, but the subset's samples are not taken directly from the original dataset.
In other words, prototype generation reduces the number of samples, and the remaining samples are generated from, rather than drawn from, the original data.
ClusterCentroids implements exactly this: each class is synthesised from the centroids of a K-Means run instead of being randomly drawn from the original samples.
'''

from imblearn.under_sampling import ClusterCentroids
cc = ClusterCentroids(random_state=0)
X_resampled_cc, y_resampled_cc = cc.fit_sample(train_set_1_1, label)
print('ClusterCentroids:', sorted(Counter(y_resampled_cc).items()))
x_train_cc, x_test_cc, y_train_cc, y_test_cc = train_test_split(X_resampled_cc,
                                                                y_resampled_cc,
                                                                random_state=1)
# ClusterCentroids provides a very efficient way to reduce the number of samples, but note that it works best when the original data actually cluster well.
# Also, the number of centroids should be chosen so that the down-sampled clusters represent the original data well.
svm_clf.fit(x_train_cc, y_train_cc)
joblib.dump(svm_clf, '../model/cc_sample_model.pkl')

# Evaluate the model trained on the ClusterCentroids-resampled data
from sklearn.model_selection import cross_val_score
scores = cross_val_score(svm_clf, x_test_cc, y_test_cc, cv=5)
print('cc_score:', scores)
pred3 = svm_clf.predict(x_test_cc)
print('cc_accuracy_score:', metrics.accuracy_score(y_test_cc, pred3))
Example #43
X_tl, y_tl, id_tl = tl.fit_sample(X, y)

print('Removed indexes:', id_tl)

plot_2d_space(X_tl, y_tl, 'Tomek links under-sampling')

from imblearn.under_sampling import ClusterCentroids

cc = ClusterCentroids(ratio={0: 10})
X_cc, y_cc = cc.fit_sample(X, y)

plot_2d_space(X_cc, y_cc, 'Cluster Centroids under-sampling')

from imblearn.over_sampling import SMOTE

smote = SMOTE(ratio='minority')
X_sm, y_sm = smote.fit_sample(X, y)

plot_2d_space(X_sm, y_sm, 'SMOTE over-sampling')
Example #44
from sklearn.metrics import log_loss
from sklearn.neural_network import MLPClassifier
# import some data to play with
X = []
Y = []
reader = DictReader(open("picture.csv", 'r'))
for row in reader:
    Y.append(row['win'])
    del row['win']
    X.append(row)
v = DictVectorizer(sparse=False)
X = v.fit_transform(X)
print('Original dataset shape {}'.format(Counter(Y)))
#sm = SMOTE(kind='svm')
sm = ClusterCentroids(random_state=42)
X, Y = sm.fit_sample(X, Y)
print('Resampled dataset shape {}'.format(Counter(Y)))
train_x, test_x, train_y, test_y = train_test_split(X, Y, test_size=0.25, random_state=0)
### building the classifiers
clfs = []

svc = SVC(kernel="linear", C=0.025,probability=True)
svc.fit(train_x, train_y)
print('SVC LogLoss {score}'.format(score=log_loss(test_y, svc.predict_proba(test_x))))
clfs.append(svc)
