def main():

    # Create 2 artificial clusters that partially overlap
    X, y = createCluster()

    # Plot the clusters
    colors = np.array([x for x in 'bgrcmykbgrcmykbgrcmykbgrcmyk'])
    colors = np.hstack([colors] * 20)
    pl.scatter(np.array(X)[:, 0], np.array(X)[:, 1], color=colors[y].tolist(), s=10)
    pl.show()

    # Get the minority and majority class counts
    ms, ml = ADASYN.getClassCount(X, y)
    d = ADASYN.getd(X, y, ms, ml)
    G = ADASYN.getG(X, y, ms, ml, 1)

    # Get the list of r values, which indicate how many samples will be made per data point in the minority dataset
    rlist = ADASYN.getRis(X, y, 0, 5)

    # Generate the synthetic data
    newX, newy = ADASYN.generateSamples(rlist, X, y, G, 0, 5)

    # Plot the dataset again, with the synthetic minority samples in red
    pl.scatter(np.array(X)[:, 0], np.array(X)[:, 1], color=colors[y].tolist(), s=10)
    pl.scatter(np.array(newX)[:, 0], np.array(newX)[:, 1], color='red', s=10)
    pl.show()

    X, y = ADASYN.joinwithmajorityClass(X, y, newX, newy, 1)

    print('test')
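The helper calls above (getClassCount, getd, getG, getRis, generateSamples) follow the steps of the ADASYN algorithm (He et al., 2008): measure the class imbalance, decide how many synthetic points to create in total (G), weight each minority point by the share of majority points among its K nearest neighbours (the r values), and interpolate new points accordingly. The sketch below reimplements those steps with scikit-learn's NearestNeighbors purely for illustration; the function name, its (X, y, minority_label, K, beta) parameters, and the self-contained layout are assumptions, not the API of the ADASYN module used in this example.

import numpy as np
from sklearn.neighbors import NearestNeighbors


def adasyn_sketch(X, y, minority_label=0, K=5, beta=1.0, random_state=0):
    """Illustrative ADASYN: returns the synthetic minority samples only."""
    rng = np.random.RandomState(random_state)
    X = np.asarray(X, dtype=float)
    y = np.asarray(y)
    X_min = X[y == minority_label]
    ms, ml = len(X_min), len(X) - len(X_min)

    # G: total number of synthetic samples to generate
    G = int((ml - ms) * beta)

    # r_i: share of majority points among the K nearest neighbours of each minority point
    _, idx = NearestNeighbors(n_neighbors=K + 1).fit(X).kneighbors(X_min)
    r = np.array([(y[nbrs[1:]] != minority_label).sum() / K for nbrs in idx])
    r = r / r.sum()                       # normalise so the weights sum to 1
    g = np.rint(r * G).astype(int)        # synthetic samples per minority point

    # interpolate between each minority point and a random minority neighbour
    _, idx_min = NearestNeighbors(n_neighbors=K + 1).fit(X_min).kneighbors(X_min)
    synthetic = []
    for i, n_new in enumerate(g):
        for _ in range(n_new):
            j = idx_min[i][rng.randint(1, K + 1)]
            lam = rng.rand()
            synthetic.append(X_min[i] + lam * (X_min[j] - X_min[i]))
    return np.array(synthetic)

On the two overlapping clusters created above, adasyn_sketch(X, y, minority_label=0) would produce roughly the kind of points drawn in red in the second scatter plot.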
Example #3
def templet(sampler_name, sample_ratio):
    """
    模板方法
    :param sampler_name: 采样算法名
    :param sample_ratio: 采样比例
    :return:
    """
    dataset = fetch_datasets()['satimage']
    X = dataset.data
    y = dataset.target
    # start time
    start_time = time.time()
    cv = StratifiedKFold(n_splits=10, random_state=42, shuffle=True)
    # cv = RepeatedStratifiedKFold(n_repeats=5, n_splits=10, random_state=42)
    for train, test in cv.split(X, y):
        # preprocessing
        scaler = preprocessing.MinMaxScaler().fit(X[train])
        X_train_minmax = scaler.transform(X[train])
        X_test_minmax = scaler.transform(X[test])
        sb = None
        if sampler_name == 'CART':
            sb = DummySampler()
        elif sampler_name == 'SMOTE':
            sb = SMOTE(N=sample_ratio, k_neighbors=5, random_state=42)
        elif sampler_name == 'Border1':
            sb = BorderSMOTE(N=sample_ratio,
                             m_neighbors=9,
                             k_neighbors=5,
                             random_state=42,
                             kind='borderline1')
        elif sampler_name == 'Border2':
            sb = BorderSMOTE(N=sample_ratio,
                             m_neighbors=9,
                             k_neighbors=5,
                             random_state=42,
                             kind='borderline2')
        elif sampler_name == 'ADASYN':
            sb = ADASYN(bata=sample_ratio, k_neighbors=5, random_state=42)
        elif sampler_name == 'Safe-level':
            sb = SafeLevelSMOTE(N=sample_ratio, k_neighbors=5, random_state=42)
        else:
            raise ValueError('unknown sampler: %s' % sampler_name)
        X_res, y_res = sb.fit_sample(X_train_minmax, y[train])  # resample
        model = tree.DecisionTreeClassifier(max_depth=8,
                                            min_samples_split=10,
                                            random_state=42)
        model.fit(X_res, y_res)
        predict = model.predict(X_test_minmax)
        probability = model.predict_proba(X_test_minmax)[:, 1]
        precision = metrics.precision_score(y[test], predict)
        recall = metrics.recall_score(y[test], predict)
        if precision == 0:
            f1 = 0
        else:
            f1 = 2 * (precision * recall) / (precision + recall)
        auc = metrics.roc_auc_score(y[test], probability)
        gmean = geometric_mean_score(y[test], predict)
        # write2dic
        fill_dic('precision', sampler_name, sample_ratio, precision)
        fill_dic('recall', sampler_name, sample_ratio, recall)
        fill_dic('f1', sampler_name, sample_ratio, f1)
        fill_dic('auc', sampler_name, sample_ratio, auc)
        fill_dic('gmean', sampler_name, sample_ratio, gmean)
    print('%s %.1f building id transforming took %fs!' %
          (sampler_name, sample_ratio, time.time() - start_time))
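templet() evaluates a single sampler / ratio pair with 10-fold cross-validation and records the per-fold metrics through fill_dic(). The caller is not part of this example; a hypothetical driver would just loop over the sampler names handled in the if/elif chain and the ratios of interest. Note that the SMOTE-style samplers take N as a percentage, while ADASYN interprets its ratio as a beta between 0 and 1:

# hypothetical driver for templet(); sampler names match the branches above
if __name__ == '__main__':
    for name in ['CART', 'SMOTE', 'Border1', 'Border2', 'Safe-level']:
        for ratio in [100, 200, 300]:      # N as a percentage for the SMOTE variants
            templet(name, ratio)
    for beta in [0.1, 0.5, 1.0]:           # ADASYN's ratio is a beta in (0, 1]
        templet('ADASYN', beta)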
Example #4
n = 10  # repeat the CV procedure 10 times to get more precise results
#===============================================================================
for i in range(n):
    print "======================================= CROSS VALIDATION LOOP: ", (i+1)

    # for each iteration, randomly hold out 20% of the data as CV set
    X_train, X_cv, y_train, y_cv = cross_validation.train_test_split(
        X, y, test_size=.20, random_state=i*25)

    print "training size: ", X_train.shape
    print "label size: ", y_train.shape 

    print "the ADASYN goodie..."

    y_train = list(np.array(y_train).reshape(-1,))
    ms,ml = ADASYN.getClassCount(X_train,y_train)

    d = ADASYN.getd(X_train,y_train,ms,ml)
    G = ADASYN.getG(X_train,y_train,ms,ml,1)

    # Get the list of r values, which indicate how many samples will be made per data point in the minority dataset
    rlist = ADASYN.getRis(X_train,y_train,0,2)

    # Generate the synthetic data
    newX,newy = ADASYN.generateSamples(rlist,X_train,y_train,G,0,2)
    
    X_train,y_train = ADASYN.joinwithmajorityClass(X_train,y_train,newX,newy,1)


    print "new training size: ", X_train.shape
    print "new label size: ", y_train.shape 
Example #5
def test():
    dic = {'recall': {'CART': [], 'SMOTE': [], 'Border1': [], 'Border2': [], 'ADASYN': [], 'Safe-level': []},
           'precision': {'CART': [], 'SMOTE': [], 'Border1': [], 'Border2': [], 'ADASYN': [], 'Safe-level': []},
           'f1': {'CART': [], 'SMOTE': [], 'Border1': [], 'Border2': [], 'ADASYN': [], 'Safe-level': []},
           'auc': {'CART': [], 'SMOTE': [], 'Border1': [], 'Border2': [], 'ADASYN': [], 'Safe-level': []},
           'gmean': {'CART': [], 'SMOTE': [], 'Border1': [], 'Border2': [], 'ADASYN': [], 'Safe-level': []}}

    results = prettytable.PrettyTable(["Classifier", "Precision", 'Recall', 'AUC', 'F-measure', 'G-mean'])

    # load the dataset
    dataset = fetch_datasets()['satimage']
    X = dataset.data
    y = dataset.target
    print(Counter(y))
    # fix the random seed so that every run gives the same results
    np.random.seed(42)
    # -------------------------------------------CART----------------------------------------------------
    # start time
    start_time = time.time()
    # 10-fold cross-validation for CART
    cv = StratifiedKFold(n_splits=10, random_state=42, shuffle=True)
    # cv = RepeatedStratifiedKFold(n_repeats=5, n_splits=10, random_state=42)
    for train, test in cv.split(X, y):
        # initialize CART
        cart = tree.DecisionTreeClassifier(max_depth=8, min_samples_split=10, random_state=42)
        # min-max normalization
        scaler = preprocessing.MinMaxScaler().fit(X[train])
        X_train_minmax = scaler.transform(X[train])
        X_test_minmax = scaler.transform(X[test])
        # train
        cart.fit(X_train_minmax, y[train])
        # test
        predict = cart.predict(X_test_minmax)
        probability = cart.predict_proba(X_test_minmax)

        cart_auc = metrics.roc_auc_score(y[test], probability[:, 1])
        cart_precision = metrics.precision_score(y[test], predict)
        cart_recall = metrics.recall_score(y[test], predict)
        if cart_precision == 0:
            cart_f1 = 0
        else:
            cart_f1 = 2 * (cart_precision * cart_recall) / (cart_precision + cart_recall)
        cart_gmean = geometric_mean_score(y[test], predict)
        dic['precision']['CART'].append(cart_precision)
        dic['recall']['CART'].append(cart_recall)
        dic['f1']['CART'].append(cart_f1)
        dic['auc']['CART'].append(cart_auc)
        dic['gmean']['CART'].append(cart_gmean)
    print('CART building id transforming took %fs!' % (time.time() - start_time))

    # ---------------------------------------------------SMOTE----------------------------------------------------------
    # start time
    start_time = time.time()
    # cross-validation
    cv = StratifiedKFold(n_splits=10, random_state=42, shuffle=True)
    # cv = RepeatedStratifiedKFold(n_repeats=10, n_splits=10, random_state=42)
    for train, test in cv.split(X, y):
        # preprocess
        scaler = preprocessing.MinMaxScaler().fit(X[train])
        X_train_minmax = scaler.transform(X[train])
        X_test_minmax = scaler.transform(X[test])
        # initialize sampler
        sb = SMOTE(N=100, k_neighbors=5, random_state=42)
        # sampling
        X_res, y_res = sb.fit_sample(X_train_minmax, y[train])
        # initialize classifier
        model = tree.DecisionTreeClassifier(max_depth=8, min_samples_split=10, random_state=42)
        # model = svm.SVC(class_weight={1: 20})
        model.fit(X_res, y_res)
        predict = model.predict(X_test_minmax)
        probability = model.predict_proba(X_test_minmax)[:, 1]

        precision = metrics.precision_score(y[test], predict)
        recall = metrics.recall_score(y[test], predict)
        if precision == 0:
            f1 = 0
        else:
            f1 = 2 * (precision * recall) / (precision + recall)
        auc = metrics.roc_auc_score(y[test], probability)
        gmean = geometric_mean_score(y[test], predict)
        dic['precision']['SMOTE'].append(precision)
        dic['recall']['SMOTE'].append(recall)
        dic['f1']['SMOTE'].append(f1)
        dic['auc']['SMOTE'].append(auc)
        dic['gmean']['SMOTE'].append(gmean)

    print('SMOTE building id transforming took %fs!' % (time.time() - start_time))

    # ---------------------------------------------Borderline-SMOTE1----------------------------------------------------
    # start time
    start_time = time.time()
    cv = StratifiedKFold(n_splits=10, random_state=42, shuffle=True)
    # cv = RepeatedStratifiedKFold(n_repeats=5, n_splits=10, random_state=42)
    for train, test in cv.split(X, y):
        # preprocessing
        scaler = preprocessing.MinMaxScaler().fit(X[train])
        X_train_minmax = scaler.transform(X[train])
        X_test_minmax = scaler.transform(X[test])
        # initialize the sampler
        sb = BorderSMOTE(N=100, m_neighbors=30, k_neighbors=5, random_state=42, kind='borderline1')
        # resample
        X_res, y_res = sb.fit_sample(X_train_minmax, y[train])

        model = tree.DecisionTreeClassifier(max_depth=8, min_samples_split=10, random_state=42)
        model.fit(X_res, y_res)
        predict = model.predict(X_test_minmax)
        probability = model.predict_proba(X_test_minmax)[:, 1]

        precision = metrics.precision_score(y[test], predict)
        recall = metrics.recall_score(y[test], predict)
        if precision == 0:
            f1 = 0
        else:
            f1 = 2 * (precision * recall) / (precision + recall)
        auc = metrics.roc_auc_score(y[test], probability)
        gmean = geometric_mean_score(y[test], predict)
        dic['precision']['Border1'].append(precision)
        dic['recall']['Border1'].append(recall)
        dic['f1']['Border1'].append(f1)
        dic['auc']['Border1'].append(auc)
        dic['gmean']['Border1'].append(gmean)

    print('BorderSmote1 building id transforming took %fs!' % (time.time() - start_time))

    # ---------------------------------------------Borderline-SMOTE2----------------------------------------------------
    # start time
    start_time = time.time()
    cv = StratifiedKFold(n_splits=10, random_state=42, shuffle=True)
    # cv = RepeatedStratifiedKFold(n_repeats=5, n_splits=10, random_state=42)
    for train, test in cv.split(X, y):
        # preprocessing
        scaler = preprocessing.MinMaxScaler().fit(X[train])
        X_train_minmax = scaler.transform(X[train])
        X_test_minmax = scaler.transform(X[test])
        # initialize the sampler
        sb = BorderSMOTE(N=100, m_neighbors=30, k_neighbors=5, random_state=42, kind='borderline2')
        # resample
        X_res, y_res = sb.fit_sample(X_train_minmax, y[train])

        model = tree.DecisionTreeClassifier(max_depth=8, min_samples_split=10, random_state=42)
        model.fit(X_res, y_res)
        predict = model.predict(X_test_minmax)
        probability = model.predict_proba(X_test_minmax)[:, 1]

        precision = metrics.precision_score(y[test], predict)
        recall = metrics.recall_score(y[test], predict)
        if precision == 0:
            f1 = 0
        else:
            f1 = 2 * (precision * recall) / (precision + recall)
        auc = metrics.roc_auc_score(y[test], probability)
        gmean = geometric_mean_score(y[test], predict)
        dic['precision']['Border2'].append(precision)
        dic['recall']['Border2'].append(recall)
        dic['f1']['Border2'].append(f1)
        dic['auc']['Border2'].append(auc)
        dic['gmean']['Border2'].append(gmean)

    print('BorderSmote2 building id transforming took %fs!' % (time.time() - start_time))

    # ---------------------------------------------ADASYN---------------------------------------------------------------
    # start time
    start_time = time.time()
    cv = StratifiedKFold(n_splits=10, random_state=42, shuffle=True)
    # cv = RepeatedStratifiedKFold(n_repeats=5, n_splits=10, random_state=42)
    for train, test in cv.split(X, y):
        # preprocessing
        scaler = preprocessing.MinMaxScaler().fit(X[train])
        X_train_minmax = scaler.transform(X[train])
        X_test_minmax = scaler.transform(X[test])
        # initialize the sampler
        sb = ADASYN(bata=0.1, k_neighbors=5, random_state=42)
        # resample
        X_res, y_res = sb.fit_sample(X_train_minmax, y[train])

        model = tree.DecisionTreeClassifier(max_depth=8, min_samples_split=10, random_state=42)
        model.fit(X_res, y_res)
        predict = model.predict(X_test_minmax)
        probability = model.predict_proba(X_test_minmax)[:, 1]

        precision = metrics.precision_score(y[test], predict)
        recall = metrics.recall_score(y[test], predict)
        if precision == 0:
            f1 = 0
        else:
            f1 = 2 * (precision * recall) / (precision + recall)
        auc = metrics.roc_auc_score(y[test], probability)
        gmean = geometric_mean_score(y[test], predict)
        dic['precision']['ADASYN'].append(precision)
        dic['recall']['ADASYN'].append(recall)
        dic['f1']['ADASYN'].append(f1)
        dic['auc']['ADASYN'].append(auc)
        dic['gmean']['ADASYN'].append(gmean)

    print('ADASYN building id transforming took %fs!' % (time.time() - start_time))

    # ------------------------------------------------Safe-Level-SMOTE----------------------------------------------
    # start time
    start_time = time.time()
    cv = StratifiedKFold(n_splits=10, random_state=42, shuffle=True)
    # cv = RepeatedStratifiedKFold(n_repeats=5, n_splits=10, random_state=42)
    for train, test in cv.split(X, y):
        # preprocessing
        scaler = preprocessing.MinMaxScaler().fit(X[train])
        X_train_minmax = scaler.transform(X[train])
        X_test_minmax = scaler.transform(X[test])
        # initialize the sampler
        sb = SafeLevelSMOTE(N=100, k_neighbors=5, random_state=42)
        # resample
        X_res, y_res = sb.fit_sample(X_train_minmax, y[train])

        model = tree.DecisionTreeClassifier(max_depth=8, min_samples_split=10, random_state=42)
        model.fit(X_res, y_res)
        predict = model.predict(X_test_minmax)
        probability = model.predict_proba(X_test_minmax)[:, 1]

        precision = metrics.precision_score(y[test], predict)
        recall = metrics.recall_score(y[test], predict)
        if precision == 0:
            f1 = 0
        else:
            f1 = 2 * (precision * recall) / (precision + recall)
        auc = metrics.roc_auc_score(y[test], probability)
        gmean = geometric_mean_score(y[test], predict)
        dic['precision']['Safe-level'].append(precision)
        dic['recall']['Safe-level'].append(recall)
        dic['f1']['Safe-level'].append(f1)
        dic['auc']['Safe-level'].append(auc)
        dic['gmean']['Safe-level'].append(gmean)

    print('Safe-level building id transforming took %fs!' % (time.time() - start_time))

    # display
    results.add_row(['CART',
                     np.mean(np.array(dic['precision']['CART'])),
                     np.mean(np.array(dic['recall']['CART'])),
                     np.mean(np.array(dic['auc']['CART'])),
                     np.mean(np.array(dic['f1']['CART'])),
                     np.mean(np.array(dic['gmean']['CART']))])
    results.add_row(['SMOTE',
                     np.mean(np.array(dic['precision']['SMOTE'])),
                     np.mean(np.array(dic['recall']['SMOTE'])),
                     np.mean(np.array(dic['auc']['SMOTE'])),
                     np.mean(np.array(dic['f1']['SMOTE'])),
                     np.mean(np.array(dic['gmean']['SMOTE']))])
    results.add_row(['Border1',
                     np.mean(np.array(dic['precision']['Border1'])),
                     np.mean(np.array(dic['recall']['Border1'])),
                     np.mean(np.array(dic['auc']['Border1'])),
                     np.mean(np.array(dic['f1']['Border1'])),
                     np.mean(np.array(dic['gmean']['Border1']))])
    results.add_row(['Border2',
                     np.mean(np.array(dic['precision']['Border2'])),
                     np.mean(np.array(dic['recall']['Border2'])),
                     np.mean(np.array(dic['auc']['Border2'])),
                     np.mean(np.array(dic['f1']['Border2'])),
                     np.mean(np.array(dic['gmean']['Border2']))])
    results.add_row(['ADASYN',
                     np.mean(np.array(dic['precision']['ADASYN'])),
                     np.mean(np.array(dic['recall']['ADASYN'])),
                     np.mean(np.array(dic['auc']['ADASYN'])),
                     np.mean(np.array(dic['f1']['ADASYN'])),
                     np.mean(np.array(dic['gmean']['ADASYN']))])
    results.add_row(['Safe-level',
                     np.mean(np.array(dic['precision']['Safe-level'])),
                     np.mean(np.array(dic['recall']['Safe-level'])),
                     np.mean(np.array(dic['auc']['Safe-level'])),
                     np.mean(np.array(dic['f1']['Safe-level'])),
                     np.mean(np.array(dic['gmean']['Safe-level']))])
    print(results)
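Each of the six per-sampler blocks in test() repeats the same per-fold metric computation. A small helper (hypothetical, not part of the original code; geometric_mean_score is assumed to come from imblearn.metrics, as in the blocks above) keeps that logic in one place:

from sklearn import metrics
from imblearn.metrics import geometric_mean_score


def fold_metrics(y_true, y_pred, y_score):
    """Precision, recall, F1, ROC-AUC and G-mean for one CV fold."""
    precision = metrics.precision_score(y_true, y_pred)
    recall = metrics.recall_score(y_true, y_pred)
    # guard against a zero division when no positive predictions are made
    f1 = 0 if precision == 0 else 2 * precision * recall / (precision + recall)
    auc = metrics.roc_auc_score(y_true, y_score)
    gmean = geometric_mean_score(y_true, y_pred)
    return precision, recall, f1, auc, gmean

Each block would then reduce to precision, recall, f1, auc, gmean = fold_metrics(y[test], predict, probability) before appending to dic.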