Example #1
def format_X_y(pos_file, neg_file, flag):
    # flag is 'train' or 'test'
    # print(pos_file, neg_file)
    X, y = [], []
    for line in open(pos_file):
        try:
            x = [float(item) for item in line.strip().split()]
            X.append(x)
            y.append(1)
        except ValueError:
            print(line)

    if flag == 'train':
        # use SMOTE to oversample the positive feature vectors
        X = SMOTE.smote(X, 15, 3)
        length1 = len(X) - len(y)
        y += [1] * length1

    # down-sampling part
    tempX, tempy = [], []
    for line in open(neg_file):
        try:
            x = [float(item) for item in line.strip().split()]
            tempX.append(x)
            tempy.append(0)
        except ValueError:
            print(line)
    '''if flag == 'train':
        tempX = SMOTE.downsample(tempX,0.5)
        tempy = tempy[:len(tempX)]'''
    # print(float(len(tempy)) / float(len(y)))
    X += tempX
    y += tempy

    return X, y
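
A hedged usage sketch for format_X_y (the file names are hypothetical; the files are assumed to hold whitespace-separated numeric feature vectors, one per line, as the parsing above implies):

# hypothetical usage
X_train, y_train = format_X_y('pos_train.txt', 'neg_train.txt', 'train')
X_test, y_test = format_X_y('pos_test.txt', 'neg_test.txt', 'test')
print(len(X_train), sum(y_train))  # sample count and positives after SMOTE
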
def apply_smote(df):
    # oversample df with SMOTE, preserving the original column names
    df.reset_index(drop=True, inplace=True)
    cols = df.columns
    smt = SMOTE.smote(df)
    df = smt.run()
    df.columns = cols
    return df
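
A usage sketch, assuming (as the calls above imply) that SMOTE.smote wraps a dataframe and run() returns the oversampled frame:

# hypothetical usage: the last column of df is assumed to be the class label
import pandas as pd
df = pd.DataFrame({'f1': [0.1, 0.2, 0.9], 'f2': [1.0, 1.1, 0.2], 'label': [0, 0, 1]})
balanced = apply_smote(df)
print(balanced['label'].value_counts())
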
def cross_val(data, labels, k, smote, classifier):
    """Performs k-fold cross validation using the specified classifier; returns
    the number of true/false positives/negatives accumulated over all folds."""
    kf = KFold(n_splits=k)
    tp, fp, fn, tn = 0, 0, 0, 0
    for train_index, test_index in kf.split(data):
        test_set, train_set, test_label, train_label = [], [], [], []
        # make train and test sets/labels
        for i in train_index:
            train_set.append(data[i])
            train_label.append(labels[i])
        for i in test_index:
            test_set.append(data[i])
            test_label.append(labels[i])

        # apply SMOTEing when the smote parameter is True
        if smote:
            train_set, train_label = SMOTE.SMOTEd(train_set, train_label)

        if classifier == 'linear':
            predicted = classifiers.lin_reg(train_set, test_set, train_label)
        elif classifier == 'logistic':
            predicted = classifiers.log_reg(train_set, test_set, train_label)
        elif classifier == 'decision tree':
            predicted = classifiers.decision_tree(train_set, test_set, train_label)
        elif classifier == 'neuralnetwork':
            predicted = classifiers.neuralnetwork(train_set, test_set, train_label)
        elif classifier == 'naive bayes':
            predicted = classifiers.naive_bayes(train_set, test_set, train_label)
        elif classifier == 'randomforest':
            predicted = classifiers.randomforest(train_set, test_set, train_label)
        elif classifier == 'knn':
            predicted = classifiers.knn(train_set, test_set, train_label)
        else:
            raise ValueError('Wrong classifier name supplied: %s' % classifier)

        # accumulate confusion counts over the folds
        for actual, pred in zip(test_label, predicted):
            if actual == 1 and pred == 1:
                tp += 1
            elif actual == 0 and pred == 1:
                fp += 1
            elif actual == 1 and pred == 0:
                fn += 1
            else:
                tn += 1

    return tp, fp, fn, tn
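
A hedged usage sketch (data, labels, and the classifiers module are assumed to come from the surrounding project):

# hypothetical usage: 10-fold CV with SMOTE enabled
tp, fp, fn, tn = cross_val(data, labels, k=10, smote=True, classifier='randomforest')
precision = tp / (tp + fp) if (tp + fp) else 0.0
recall = tp / (tp + fn) if (tp + fn) else 0.0
print('precision=%.3f recall=%.3f' % (precision, recall))
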
Example #5
    def apply_smote(self, df):
        # oversample df with SMOTE, preserving the original column names
        cols = df.columns
        smt = SMOTE.smote(df)
        df = smt.run()
        df.columns = cols
        return df

    tuned_parameters = [{  # variable name assumed; the snippet begins mid-definition
        'gamma': gammaRange,
        'C': cRange,
        'class_weight': classWeightRange,
        'decision_function_shape': ['ovr']  # ['ovo', 'ovr', None]
    }]
    scores = ['f1', 'f1_macro']  # ['f1_macro', 'precision_macro', 'f1_micro']

    # train_counts = count_vectorizer.fit_transform(train_corpus)
    # vect = DictVectorizer()
    # train_counts = vect.fit_transform(features(tokenize, d) for d in train_corpus)

    train_dict = [features(tokenize, d) for d in train_corpus]

    newMinoritySamples = SMOTE.smoteAlgo(getMinoritySamples(
        train_dict, train_labels),
                                         rate=4,
                                         k=100,
                                         random_seed=RANDOMSEED)
    train_dict = train_dict + newMinoritySamples
    train_labels = train_labels + [1] * len(newMinoritySamples)

    vect = DictVectorizer()
    train_counts = vect.fit_transform(train_dict)

    (train_counts, train_labels) = shuffle(train_counts,
                                           train_labels,
                                           random_state=RANDOMSEED)

    for score in scores:
        print("# Tuning hyper-parameters for %s" % score)
        print()
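
A grid like the one above is the classic input shape for scikit-learn's GridSearchCV; a self-contained sketch of how such a search runs (toy data and grid values, not the ones from this snippet):

from sklearn.datasets import make_classification
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

# toy imbalanced data standing in for train_counts/train_labels
X_demo, y_demo = make_classification(n_samples=200, weights=[0.9], random_state=0)
grid = [{'gamma': [1e-3, 1e-4], 'C': [1, 10],
         'class_weight': [None, 'balanced'],
         'decision_function_shape': ['ovr']}]
for score in ['f1', 'f1_macro']:
    clf = GridSearchCV(SVC(), grid, scoring=score, cv=5)
    clf.fit(X_demo, y_demo)
    print(score, clf.best_params_)
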
def createSMOTEsamples(X, Y, nearestneigh, numNeighbors, majoritylabel, minoritylabel):
    # store the synthetic minority samples in module-level globals
    global synX, synY
    (synX, synY) = SMOTE.createSyntheticSamples(X, Y, nearestneigh, numNeighbors,
                                                majoritylabel, minoritylabel)
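
For reference, the interpolation at the heart of all these SMOTE wrappers can be written in a few lines of numpy; a toy sketch of the standard SMOTE rule (not the SMOTE module imported here):

import numpy as np
from sklearn.neighbors import NearestNeighbors

def smote_sketch(minority, n_new, k=5, seed=0):
    """Create n_new synthetic points by interpolating each sampled minority
    point toward one of its k nearest minority neighbours."""
    rng = np.random.default_rng(seed)
    minority = np.asarray(minority, dtype=float)
    nn = NearestNeighbors(n_neighbors=k + 1).fit(minority)  # +1: each point is its own nearest neighbour
    _, idx = nn.kneighbors(minority)
    synthetic = []
    for _ in range(n_new):
        i = rng.integers(len(minority))
        j = idx[i][rng.integers(1, k + 1)]  # pick a random true neighbour
        gap = rng.random()                  # interpolation factor in [0, 1)
        synthetic.append(minority[i] + gap * (minority[j] - minority[i]))
    return np.array(synthetic)
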
Example #8
import pandas as pd
import SMOTE as sm

df = pd.read_csv('sample.csv', header=None)


# Simple pre-processing: convert the dataframe into a list of row lists
def pre_processing(dataset):
    d = dataset.T
    return [list(d[i]) for i in d]  # each column of the transpose is one original row


df = pre_processing(df)
minority = df[50:75]  # use all 25 class 'B' rows as the input dataset

# in this module the SMOTE routine is exposed as augment()
syn = sm.augment(minority, 50, 5)
print(syn)
print(len(syn))

# syn = sm.augment(minority, 100, 7)
# print(syn[0:25])
# print(syn[25:50])
Example #9
def templet(sampler_name, sample_ratio):
    """
    模板方法
    :param sampler_name: 采样算法名
    :param sample_ratio: 采样比例
    :return:
    """
    dataset = fetch_datasets()['satimage']
    X = dataset.data
    y = dataset.target
    # start time
    start_time = time.time()
    cv = StratifiedKFold(n_splits=10, random_state=42, shuffle=True)
    # cv = RepeatedStratifiedKFold(n_repeats=5, n_splits=10, random_state=42)
    for train, test in cv.split(X, y):
        # preprocessing
        scaler = preprocessing.MinMaxScaler().fit(X[train])
        X_train_minmax = scaler.transform(X[train])
        X_test_minmax = scaler.transform(X[test])
        sb = None
        if sampler_name == 'CART':
            sb = DummySampler()
        elif sampler_name == 'SMOTE':
            sb = SMOTE(N=sample_ratio, k_neighbors=5, random_state=42)
        elif sampler_name == 'Border1':
            sb = BorderSMOTE(N=sample_ratio,
                             m_neighbors=9,
                             k_neighbors=5,
                             random_state=42,
                             kind='borderline1')
        elif sampler_name == 'Border2':
            sb = BorderSMOTE(N=sample_ratio,
                             m_neighbors=9,
                             k_neighbors=5,
                             random_state=42,
                             kind='borderline2')
        elif sampler_name == 'ADASYN':
            sb = ADASYN(bata=sample_ratio, k_neighbors=5, random_state=42)
        elif sampler_name == 'Safe-level':
            sb = SafeLevelSMOTE(N=sample_ratio, k_neighbors=5, random_state=42)
        else:
            raise ValueError('Unknown sampler name: %s' % sampler_name)
        X_res, y_res = sb.fit_sample(X_train_minmax, y[train])  # sampling
        model = tree.DecisionTreeClassifier(max_depth=8,
                                            min_samples_split=10,
                                            random_state=42)
        model.fit(X_res, y_res)
        predict = model.predict(X_test_minmax)
        probability = model.predict_proba(X_test_minmax)[:, 1]
        precision = metrics.precision_score(y[test], predict)
        recall = metrics.recall_score(y[test], predict)
        if precision == 0:
            f1 = 0
        else:
            f1 = 2 * (precision * recall) / (precision + recall)
        auc = metrics.roc_auc_score(y[test], probability)
        gmean = geometric_mean_score(y[test], predict)
        # write2dic
        fill_dic('precision', sampler_name, sample_ratio, precision)
        fill_dic('recall', sampler_name, sample_ratio, recall)
        fill_dic('f1', sampler_name, sample_ratio, f1)
        fill_dic('auc', sampler_name, sample_ratio, auc)
        fill_dic('gmean', sampler_name, sample_ratio, gmean)
    print('%s %.1f took %fs!' %
          (sampler_name, sample_ratio, time.time() - start_time))
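
DummySampler is referenced above but not shown; a minimal sketch of what it presumably does (pass the training data through unchanged, so the plain-CART baseline fits the same fit_sample API):

class DummySampler:
    """No-op sampler: keeps the baseline code path identical to the real samplers."""
    def fit_sample(self, X, y):
        return X, y
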
Example #11
def test():
    dic = {'recall': {'CART': [], 'SMOTE': [], 'Border1': [], 'Border2': [], 'ADASYN': [], 'Safe-level': []},
           'precision': {'CART': [], 'SMOTE': [], 'Border1': [], 'Border2': [], 'ADASYN': [], 'Safe-level': []},
           'f1': {'CART': [], 'SMOTE': [], 'Border1': [], 'Border2': [], 'ADASYN': [], 'Safe-level': []},
           'auc': {'CART': [], 'SMOTE': [], 'Border1': [], 'Border2': [], 'ADASYN': [], 'Safe-level': []},
           'gmean': {'CART': [], 'SMOTE': [], 'Border1': [], 'Border2': [], 'ADASYN': [], 'Safe-level': []}}

    results = prettytable.PrettyTable(["Classifier", "Precision", 'Recall', 'AUC', 'F-measure', 'G-mean'])

    # load the data
    dataset = fetch_datasets()['satimage']
    X = dataset.data
    y = dataset.target
    print(Counter(y))
    # random seed so every run gives the same results
    np.random.seed(42)
    # -------------------------------------------CART----------------------------------------------------
    # start time
    start_time = time.time()
    # cross-validate CART
    cv = StratifiedKFold(n_splits=10, random_state=42, shuffle=True)
    # cv = RepeatedStratifiedKFold(n_repeats=5, n_splits=10, random_state=42)
    for train, test in cv.split(X, y):
        # initialize CART
        cart = tree.DecisionTreeClassifier(max_depth=8, min_samples_split=10, random_state=42)
        # min-max normalization
        scaler = preprocessing.MinMaxScaler().fit(X[train])
        X_train_minmax = scaler.transform(X[train])
        X_test_minmax = scaler.transform(X[test])
        # train
        cart.fit(X_train_minmax, y[train])
        # test
        predict = cart.predict(X_test_minmax)
        probability = cart.predict_proba(X_test_minmax)

        cart_auc = metrics.roc_auc_score(y[test], probability[:, 1])
        cart_precision = metrics.precision_score(y[test], predict)
        cart_recall = metrics.recall_score(y[test], predict)
        if cart_precision == 0:
            cart_f1 = 0
        else:
            cart_f1 = 2 * (cart_precision * cart_recall) / (cart_precision + cart_recall)
        cart_gmean = geometric_mean_score(y[test], predict)
        dic['precision']['CART'].append(cart_precision)
        dic['recall']['CART'].append(cart_recall)
        dic['f1']['CART'].append(cart_f1)
        dic['auc']['CART'].append(cart_auc)
        dic['gmean']['CART'].append(cart_gmean)
    print('CART took %fs!' % (time.time() - start_time))

    # ---------------------------------------------------SMOTE----------------------------------------------------------
    # start time
    start_time = time.time()
    # cross-validation
    cv = StratifiedKFold(n_splits=10, random_state=42, shuffle=True)
    # cv = RepeatedStratifiedKFold(n_repeats=10, n_splits=10, random_state=42)
    for train, test in cv.split(X, y):
        # preprocess
        scaler = preprocessing.MinMaxScaler().fit(X[train])
        X_train_minmax = scaler.transform(X[train])
        X_test_minmax = scaler.transform(X[test])
        # initialize sampler
        sb = SMOTE(N=100, k_neighbors=5, random_state=42)
        # sampling
        X_res, y_res = sb.fit_sample(X_train_minmax, y[train])
        # initialize classifier
        model = tree.DecisionTreeClassifier(max_depth=8, min_samples_split=10, random_state=42)
        # model = svm.SVC(class_weight={1: 20})
        model.fit(X_res, y_res)
        predict = model.predict(X_test_minmax)
        probability = model.predict_proba(X_test_minmax)[:, 1]

        precision = metrics.precision_score(y[test], predict)
        recall = metrics.recall_score(y[test], predict)
        if precision == 0:
            f1 = 0
        else:
            f1 = 2 * (precision * recall) / (precision + recall)
        auc = metrics.roc_auc_score(y[test], probability)
        gmean = geometric_mean_score(y[test], predict)
        dic['precision']['SMOTE'].append(precision)
        dic['recall']['SMOTE'].append(recall)
        dic['f1']['SMOTE'].append(f1)
        dic['auc']['SMOTE'].append(auc)
        dic['gmean']['SMOTE'].append(gmean)

    print('SMOTE took %fs!' % (time.time() - start_time))

    # ---------------------------------------------Borderline-SMOTE1----------------------------------------------------
    # start time
    start_time = time.time()
    cv = StratifiedKFold(n_splits=10, random_state=42, shuffle=True)
    # cv = RepeatedStratifiedKFold(n_repeats=5, n_splits=10, random_state=42)
    for train, test in cv.split(X, y):
        # preprocessing
        scaler = preprocessing.MinMaxScaler().fit(X[train])
        X_train_minmax = scaler.transform(X[train])
        X_test_minmax = scaler.transform(X[test])
        # initialize sampler
        sb = BorderSMOTE(N=100, m_neighbors=30, k_neighbors=5, random_state=42, kind='borderline1')
        # sampling
        X_res, y_res = sb.fit_sample(X_train_minmax, y[train])

        model = tree.DecisionTreeClassifier(max_depth=8, min_samples_split=10, random_state=42)
        model.fit(X_res, y_res)
        predict = model.predict(X_test_minmax)
        probability = model.predict_proba(X_test_minmax)[:, 1]

        precision = metrics.precision_score(y[test], predict)
        recall = metrics.recall_score(y[test], predict)
        if precision == 0:
            f1 = 0
        else:
            f1 = 2 * (precision * recall) / (precision + recall)
        auc = metrics.roc_auc_score(y[test], probability)
        gmean = geometric_mean_score(y[test], predict)
        dic['precision']['Border1'].append(precision)
        dic['recall']['Border1'].append(recall)
        dic['f1']['Border1'].append(f1)
        dic['auc']['Border1'].append(auc)
        dic['gmean']['Border1'].append(gmean)

    print('BorderSmote1 took %fs!' % (time.time() - start_time))

    # ---------------------------------------------Borderline-SMOTE2----------------------------------------------------
    # start time
    start_time = time.time()
    cv = StratifiedKFold(n_splits=10, random_state=42, shuffle=True)
    # cv = RepeatedStratifiedKFold(n_repeats=5, n_splits=10, random_state=42)
    for train, test in cv.split(X, y):
        # preprocessing
        scaler = preprocessing.MinMaxScaler().fit(X[train])
        X_train_minmax = scaler.transform(X[train])
        X_test_minmax = scaler.transform(X[test])
        # initialize sampler
        sb = BorderSMOTE(N=100, m_neighbors=30, k_neighbors=5, random_state=42, kind='borderline2')
        # sampling
        X_res, y_res = sb.fit_sample(X_train_minmax, y[train])

        model = tree.DecisionTreeClassifier(max_depth=8, min_samples_split=10, random_state=42)
        model.fit(X_res, y_res)
        predict = model.predict(X_test_minmax)
        probability = model.predict_proba(X_test_minmax)[:, 1]

        precision = metrics.precision_score(y[test], predict)
        recall = metrics.recall_score(y[test], predict)
        if precision == 0:
            f1 = 0
        else:
            f1 = 2 * (precision * recall) / (precision + recall)
        auc = metrics.roc_auc_score(y[test], probability)
        gmean = geometric_mean_score(y[test], predict)
        dic['precision']['Border2'].append(precision)
        dic['recall']['Border2'].append(recall)
        dic['f1']['Border2'].append(f1)
        dic['auc']['Border2'].append(auc)
        dic['gmean']['Border2'].append(gmean)

    print('BorderSmote2 took %fs!' % (time.time() - start_time))

    # ---------------------------------------------ADASYN---------------------------------------------------------------
    # start time
    start_time = time.time()
    cv = StratifiedKFold(n_splits=10, random_state=42, shuffle=True)
    # cv = RepeatedStratifiedKFold(n_repeats=5, n_splits=10, random_state=42)
    for train, test in cv.split(X, y):
        # preprocessing
        scaler = preprocessing.MinMaxScaler().fit(X[train])
        X_train_minmax = scaler.transform(X[train])
        X_test_minmax = scaler.transform(X[test])
        # initialize sampler
        sb = ADASYN(bata=0.1, k_neighbors=5, random_state=42)
        # sampling
        X_res, y_res = sb.fit_sample(X_train_minmax, y[train])

        model = tree.DecisionTreeClassifier(max_depth=8, min_samples_split=10, random_state=42)
        model.fit(X_res, y_res)
        predict = model.predict(X_test_minmax)
        probability = model.predict_proba(X_test_minmax)[:, 1]

        precision = metrics.precision_score(y[test], predict)
        recall = metrics.recall_score(y[test], predict)
        if precision == 0:
            f1 = 0
        else:
            f1 = 2 * (precision * recall) / (precision + recall)
        auc = metrics.roc_auc_score(y[test], probability)
        gmean = geometric_mean_score(y[test], predict)
        dic['precision']['ADASYN'].append(precision)
        dic['recall']['ADASYN'].append(recall)
        dic['f1']['ADASYN'].append(f1)
        dic['auc']['ADASYN'].append(auc)
        dic['gmean']['ADASYN'].append(gmean)

    print('ADASYN took %fs!' % (time.time() - start_time))

    # ------------------------------------------------Safe-Level-SMOTE----------------------------------------------
    # start time
    start_time = time.time()
    cv = StratifiedKFold(n_splits=10, random_state=42, shuffle=True)
    # cv = RepeatedStratifiedKFold(n_repeats=5, n_splits=10, random_state=42)
    for train, test in cv.split(X, y):
        # preprocessing
        scaler = preprocessing.MinMaxScaler().fit(X[train])
        X_train_minmax = scaler.transform(X[train])
        X_test_minmax = scaler.transform(X[test])
        # initialize sampler
        sb = SafeLevelSMOTE(N=100, k_neighbors=5, random_state=42)
        # sampling
        X_res, y_res = sb.fit_sample(X_train_minmax, y[train])

        model = tree.DecisionTreeClassifier(max_depth=8, min_samples_split=10, random_state=42)
        model.fit(X_res, y_res)
        predict = model.predict(X_test_minmax)
        probability = model.predict_proba(X_test_minmax)[:, 1]

        precision = metrics.precision_score(y[test], predict)
        recall = metrics.recall_score(y[test], predict)
        if precision == 0:
            f1 = 0
        else:
            f1 = 2 * (precision * recall) / (precision + recall)
        auc = metrics.roc_auc_score(y[test], probability)
        gmean = geometric_mean_score(y[test], predict)
        dic['precision']['Safe-level'].append(precision)
        dic['recall']['Safe-level'].append(recall)
        dic['f1']['Safe-level'].append(f1)
        dic['auc']['Safe-level'].append(auc)
        dic['gmean']['Safe-level'].append(gmean)

    print('Safe-level took %fs!' % (time.time() - start_time))

    # display
    results.add_row(['CART',
                     np.mean(np.array(dic['precision']['CART'])),
                     np.mean(np.array(dic['recall']['CART'])),
                     np.mean(np.array(dic['auc']['CART'])),
                     np.mean(np.array(dic['f1']['CART'])),
                     np.mean(np.array(dic['gmean']['CART']))])
    results.add_row(['SMOTE',
                     np.mean(np.array(dic['precision']['SMOTE'])),
                     np.mean(np.array(dic['recall']['SMOTE'])),
                     np.mean(np.array(dic['auc']['SMOTE'])),
                     np.mean(np.array(dic['f1']['SMOTE'])),
                     np.mean(np.array(dic['gmean']['SMOTE']))])
    results.add_row(['Border1',
                     np.mean(np.array(dic['precision']['Border1'])),
                     np.mean(np.array(dic['recall']['Border1'])),
                     np.mean(np.array(dic['auc']['Border1'])),
                     np.mean(np.array(dic['f1']['Border1'])),
                     np.mean(np.array(dic['gmean']['Border1']))])
    results.add_row(['Border2',
                     np.mean(np.array(dic['precision']['Border2'])),
                     np.mean(np.array(dic['recall']['Border2'])),
                     np.mean(np.array(dic['auc']['Border2'])),
                     np.mean(np.array(dic['f1']['Border2'])),
                     np.mean(np.array(dic['gmean']['Border2']))])
    results.add_row(['ADASYN',
                     np.mean(np.array(dic['precision']['ADASYN'])),
                     np.mean(np.array(dic['recall']['ADASYN'])),
                     np.mean(np.array(dic['auc']['ADASYN'])),
                     np.mean(np.array(dic['f1']['ADASYN'])),
                     np.mean(np.array(dic['gmean']['ADASYN']))])
    results.add_row(['Safe-level',
                     np.mean(np.array(dic['precision']['Safe-level'])),
                     np.mean(np.array(dic['recall']['Safe-level'])),
                     np.mean(np.array(dic['auc']['Safe-level'])),
                     np.mean(np.array(dic['f1']['Safe-level'])),
                     np.mean(np.array(dic['gmean']['Safe-level']))])
    print(results)
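
The six cross-validation blocks above differ only in the sampler and the dictionary key; a hedged refactor sketch folding one fold's evaluation into a helper (names are illustrative, fit_sample follows the custom samplers' API used above):

from sklearn import metrics
from imblearn.metrics import geometric_mean_score

def evaluate_fold(model, sampler, X_tr, y_tr, X_te, y_te):
    """Sample, fit, and score one CV fold; sampler=None reproduces the plain-CART path."""
    if sampler is not None:
        X_tr, y_tr = sampler.fit_sample(X_tr, y_tr)
    model.fit(X_tr, y_tr)
    predict = model.predict(X_te)
    probability = model.predict_proba(X_te)[:, 1]
    precision = metrics.precision_score(y_te, predict)
    recall = metrics.recall_score(y_te, predict)
    f1 = 0 if precision == 0 else 2 * precision * recall / (precision + recall)
    return {'precision': precision, 'recall': recall, 'f1': f1,
            'auc': metrics.roc_auc_score(y_te, probability),
            'gmean': geometric_mean_score(y_te, predict)}
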
Example #12
def create_datasets(input_df,
                    target_var,
                    num_val_pos,
                    num_val_neg,
                    num_train_pos,
                    num_train_neg,
                    num_smote=None,
                    num_train_sets=5,
                    replace=True,
                    SMOTE_random_sample=None):
    """
    Automatically creates validation and train/test sets for you.
    
    INPUTS
      --------------------------------------------------------------------------
      | TYPE  |  VARIABLE NAME   |  DESCRIPTION                                |
      --------------------------------------------------------------------------
      pd.df   |  input_df        |  Starting dataframe                         |
      str     |  target_var      |  Name of the binary target column           |
      int     |  num_val_pos     |  # pos. values in validation dataframe      |
      int     |  num_val_neg     |  # neg. values in validation dataframe      |
      int     |  num_train_pos   |  # pos. values in each train dataframe;     |
                                    if == "all", all non-validation positives
                                    are used in each training set, giving an
                                    oversampling factor of num_train_sets.
      int     |  num_train_neg   |  # neg. values in each train dataframe      |
      int     |  num_train_sets  |  # of training sets desired                 |
      --------------------------------------------------------------------------

    RETURNS:
      --------------------------------------------------------------------------
      | TYPE  |  VARIABLE NAME |  DESCRIPTION                                  |
      --------------------------------------------------------------------------
      dict    |  train_pos     |  dict of dataframes of positive training obs  |
      dict    |  train_neg     |  dict of dataframes of negative training obs  |
      pd.df   |  val_pos       |  dataframe of positive testing obs            |
      pd.df   |  val_neg       |  dataframe of negative testing obs            |
      --------------------------------------------------------------------------
    """
    total_positives = (input_df[target_var]).sum()

    # if num_val_pos < 1:
    #     num_val_pos = int(num_val_pos * total_positives)

    # if num_train_pos < 1:
    #     num_train_pos = int(num_train_pos * total_positives)

    # if num_val_neg < 1:
    #     num_val_neg = int(num_val_neg * total_positives)

    # if num_train_neg < 1:
    #     num_train_neg = int(num_train_neg * total_positives)

    # # After defining num_val_positives, use the rest for training
    # if not num_train_pos:
    #     num_train_pos = total_positives - num_val_pos

    # Get Positive, Negative Indices
    positive_indices = input_df[input_df[target_var] == 1].index.tolist()
    num_positives = len(positive_indices)

    negative_indices = input_df[input_df[target_var] == 0].index.tolist()
    num_negatives = len(negative_indices)

    # Create validation set
    print("Creating Validation sets...")
    start = time.time()

    val_positive_indices = np.random.choice(positive_indices,
                                            num_val_pos,
                                            replace=False)
    val_pos = input_df.copy().iloc[val_positive_indices, :]

    val_negative_indices = np.random.choice(negative_indices,
                                            num_val_neg,
                                            replace=False)
    val_neg = input_df.copy().iloc[val_negative_indices, :]

    # Remove Validation Set Elements from Train/Test Set Elements
    positive_indices = list(set(positive_indices)\
                       .difference(set(val_positive_indices)))
    negative_indices = list(set(negative_indices)\
                       .difference(set(val_negative_indices)))
    end = time.time()
    print("Completed in {}s\n".format(round(end - start, 1)))

    ## SMOTE new samples
    remaining_indices = positive_indices + negative_indices
    remaining_df = input_df.iloc[remaining_indices, :].reset_index(drop=True)
    if num_smote is not None and num_smote > 0:
        start = time.time()
        print("SMOTEing synthetic examples...")
        X_pos = remaining_df[remaining_df[target_var] == 1].drop(
            [target_var], axis=1)
        if SMOTE_random_sample is not None and SMOTE_random_sample > 0:
            X_pos = X_pos.sample(SMOTE_random_sample)
        smoter = SMOTE()
        X_synth = smoter.generate(
            X_pos,
            None,
            num_smote,
            False,
            custom_SMOTE.match_columns,
            custom_SMOTE.smote_columns,
        )
        y_synth = np.ones(X_synth.shape[0]).reshape(-1, 1)
        synths = pd.DataFrame(np.hstack((X_synth, y_synth)),
                              columns=remaining_df.columns)
        new_df = pd.concat((remaining_df, synths))
        new_df = new_df.infer_objects()  # infer proper dtypes for the stacked frame
        end = time.time()
        print("Completed in {}s\n".format(round(end - start, 1)))
    else:
        new_df = remaining_df.copy()

    # Get indices of remaining samples
    positive_indices = new_df[new_df[target_var] == 1].index.tolist()
    negative_indices = new_df[new_df[target_var] == 0].index.tolist()

    # Create Train/Test Set Values
    print("Creating Train/Test sets...")
    start = time.time()

    if num_train_pos == 'all':
        train_positives = np.array(positive_indices)[:, np.newaxis].T
        train_positives = np.repeat(train_positives, num_train_sets, axis=0)

    else:
        train_positives = np.random.choice(positive_indices,
                                           size=(num_train_sets,
                                                 num_train_pos))

    train_negatives = np.random.choice(negative_indices,
                                       size=(num_train_sets, num_train_neg))

    # Return Dataframes
    print("Returning Dataframes...")
    train_pos, train_neg = {}, {}
    for i in range(num_train_sets):
        set_name = "set_{}".format(i + 1)
        train_pos[set_name] = new_df.iloc[train_positives[i], :]
        train_neg[set_name] = new_df.iloc[train_negatives[i], :]

    end = time.time()
    print("Completed in {}s\n".format(round(end - start, 1)))
    print("Done")
    return train_pos, train_neg, val_pos, val_neg
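
A hedged usage sketch on a toy frame (column names and counts are illustrative; num_smote=None skips the SMOTE branch, so the custom SMOTE classes are not needed):

import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
toy = pd.DataFrame({'f1': rng.normal(size=2000), 'f2': rng.normal(size=2000)})
toy['label'] = (rng.random(2000) < 0.05).astype(int)  # ~5% positives

train_pos, train_neg, val_pos, val_neg = create_datasets(
    toy, target_var='label', num_val_pos=20, num_val_neg=200,
    num_train_pos=50, num_train_neg=300, num_smote=None, num_train_sets=3)
print(len(train_pos), val_pos.shape, val_neg.shape)
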
Example #13
    def fit(self, X, y):
        # Determine the minority class label.
        stats_c_ = Counter(y)
        maj_c_ = max(stats_c_, key=stats_c_.get)
        self.majority_target = maj_c_
        min_c_ = min(stats_c_, key=stats_c_.get)
        self.minority_target = min_c_

        total_number = len(X)  # Total number of instances in the training set
        pos_data = X[y == self.minority_target]
        neg_data = X[y == self.majority_target]
        pos_size = len(pos_data)  # number of positive data
        neg_size = len(neg_data)  # number of negative data
        # Reorganize TRAIN by putting all the positive and negative examples together, respectively
        X_train = np.vstack([pos_data, neg_data])
        y_train = np.array([self.minority_target] * pos_size +
                           [self.majority_target] * neg_size)
        # weights stores the weights of the instances in each row for every iteration of boosting
        weights = np.zeros(shape=[self.n_estimator, X.shape[0]])
        # Weights for all the instances are initialized by 1/m for the first iteration
        weights[0] = 1 / X.shape[0]

        t = 0  # Loop counter
        count = 0  # Number of times the same boosting iteration has been repeated
        while t < self.n_estimator:
            # log message
            # logger.debug('Boosting iteration # %d' % t)
            # print('Boosting iteration # %d' % t)
            if self.class_dist is True:
                # Resampling positive_data with weights of positive example
                sum_pos_weights = np.sum(weights[t][:pos_size])
                pos_weights = weights[t][:pos_size] / sum_pos_weights

                resample_pos = pos_data[np.random.choice(a=pos_size,
                                                         size=pos_size,
                                                         replace=True,
                                                         p=pos_weights)]

                # Resampling negative with weights of negative example
                sum_neg_weights = np.sum(weights[t][pos_size:total_number])
                neg_weights = weights[t][
                    pos_size:total_number] / sum_neg_weights
                resample_neg = neg_data[np.random.choice(a=neg_size,
                                                         size=neg_size,
                                                         replace=True,
                                                         p=neg_weights)]
                # Resampled TRAIN is stored in RESAMPLED
                X_resampled = np.vstack([resample_pos, resample_neg])
                y_resampled = np.array([self.minority_target] * pos_size +
                                       [self.majority_target] * neg_size)

                # Calculating the number of synthetic positives to generate
                syn_size = pos_size * self.rate
            else:
                # indices of resampled train
                random_index = np.random.choice(a=total_number,
                                                size=total_number,
                                                replace=True,
                                                p=weights[t])
                # Resampled TRAIN is stored in RESAMPLED
                X_resampled = X_train[random_index]
                y_resampled = y_train[random_index]

                # Calculating the number of synthetic positives to generate
                pos_size = np.sum(y_resampled == self.minority_target)
                neg_size = np.sum(y_resampled == self.majority_target)
                syn_size = pos_size * self.rate

            # SMOTE step
            # self.smote.fit(X_resampled[y_resampled == self.minority_target])
            # X_syn = self.smote.sample(syn_size)
            # y_syn = np.array([self.minority_target] * syn_size)

            smote = SMOTE(N=self.rate,
                          k_neighbors=5,
                          random_state=self.random_state)
            X_res, y_res = smote.fit_sample(X_resampled, y_resampled)

            # train classifier
            model = clone(self.weak_estimator)
            # if self.weak_estimator == 'decision tree':
            #     model = tree.DecisionTreeClassifier(max_depth=8, min_samples_split=10, random_state=42)
            # elif self.weak_estimator == 'svm':
            #     model = svm.SVC(class_weight={1: 8})
            # else:
            #     pass
            model.fit(X_res, y_res)
            predict = model.predict(X_train)

            # Computing the pseudo loss of hypothesis 'model'
            incorrect = predict != y_train
            loss = np.mean(np.average(incorrect, weights=weights[t], axis=0))
            # print(loss)

            # If count exceeds a pre-defined threshold (5 in the current implementation),
            # the loop breaks and the ensemble is rolled back to the last state where loss <= 0.5
            if count > 5:
                self.pseudo_loss = self.pseudo_loss[:t]
                self.estimator_weights_ = self.estimator_weights_[:t]
                self.estimators_ = self.estimators_[:t]
                print('Too many iterations have loss > 0.5')
                print('Aborting boosting')
                break

            if loss > 0.5:
                count = count + 1
                continue
            else:
                count = 1

            self.pseudo_loss.append(loss)  # Pseudo-loss at each iteration
            self.estimators_.append(model)  # Hypothesis function
            beta = loss / (1 - loss)  # Setting weight update parameter 'beta'.
            self.estimator_weights_.append(np.log(
                1 / beta))  # Weight of the hypothesis

            # At the final iteration there is no need to update the weights any further
            if t == self.n_estimator - 1:
                break

            # Updating weight
            weights[t + 1][y_train ==
                           predict] = weights[t][y_train == predict] * beta
            weights[t + 1][y_train != predict] = weights[t][y_train != predict]

            # Normalizing the weight for the next iteration
            sum_weights = np.sum(weights[t + 1])
            weights[t + 1] /= sum_weights

            # Incrementing loop counter
            t = t + 1
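
fit only stores estimators_, estimator_weights_, and pseudo_loss; a hedged sketch of the matching AdaBoost.M1-style predict (not shown in the original snippet; assumes numpy as np, as elsewhere here):

    def predict(self, X):
        """Weighted vote over the stored hypotheses (illustrative sketch)."""
        labels = np.array([self.majority_target, self.minority_target])
        votes = np.zeros((len(X), 2))
        for w, est in zip(self.estimator_weights_, self.estimators_):
            pred = est.predict(X)
            for j, lab in enumerate(labels):
                votes[:, j] += w * (pred == lab)  # w = log(1 / beta) for this hypothesis
        return labels[np.argmax(votes, axis=1)]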