def test_sample_regular_with_nn_svm():
    """Test SVM SMOTE given a NearestNeighbors object and an explicit SVC."""

    # The neighbours search and the SVM are passed as estimator instances.
    sampler = SMOTE(
        random_state=RND_SEED,
        kind='svm',
        k_neighbors=NearestNeighbors(n_neighbors=6),
        svm_estimator=SVC(random_state=RND_SEED))

    X_resampled, y_resampled = sampler.fit_sample(X, Y)

    # Reference values the resampled set has to match.
    X_gt = np.array([[0.11622591, -0.0317206], [0.77481731, 0.60935141],
                     [1.25192108, -0.22367336], [0.53366841, -0.30312976],
                     [1.52091956, -0.49283504], [-0.28162401, -2.10400981],
                     [0.83680821, 1.72827342], [0.3084254, 0.33299982],
                     [0.70472253, -0.73309052], [0.28893132, -0.38761769],
                     [1.15514042, 0.0129463], [0.88407872, 0.35454207],
                     [1.31301027, -0.92648734], [-1.11515198, -0.93689695],
                     [-0.18410027, -0.45194484], [0.9281014, 0.53085498],
                     [-0.14374509, 0.27370049], [-0.41635887, -0.38299653],
                     [0.08711622, 0.93259929], [1.70580611, -0.11219234],
                     [0.47436888, -0.2645749], [1.07844561, -0.19435291],
                     [1.44015515, -1.30621303]])
    y_gt = np.array(
        [0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0])

    assert_array_almost_equal(X_resampled, X_gt)
    assert_array_equal(y_resampled, y_gt)
def test_sample_borderline2():
    """Check the output of borderline-2 SMOTE against reference values."""

    sampler = SMOTE(random_state=RND_SEED, kind='borderline2')
    # The sampler is fitted once on its own before fit_sample is called.
    sampler.fit(X, Y)

    X_resampled, y_resampled = sampler.fit_sample(X, Y)

    # Reference values the resampled set has to match.
    X_gt = np.array([[0.11622591, -0.0317206], [0.77481731, 0.60935141],
                     [1.25192108, -0.22367336], [0.53366841, -0.30312976],
                     [1.52091956, -0.49283504], [-0.28162401, -2.10400981],
                     [0.83680821, 1.72827342], [0.3084254, 0.33299982],
                     [0.70472253, -0.73309052], [0.28893132, -0.38761769],
                     [1.15514042, 0.0129463], [0.88407872, 0.35454207],
                     [1.31301027, -0.92648734], [-1.11515198, -0.93689695],
                     [-0.18410027, -0.45194484], [0.9281014, 0.53085498],
                     [-0.14374509, 0.27370049], [-0.41635887, -0.38299653],
                     [0.08711622, 0.93259929], [1.70580611, -0.11219234],
                     [0.47436888, -0.2645749], [1.07844561, -0.19435291],
                     [0.33339622, 0.49870937]])
    y_gt = np.array(
        [0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0])

    assert_array_almost_equal(X_resampled, X_gt)
    assert_array_equal(y_resampled, y_gt)
def test_fit_resample_nn_obj():
    """Borderline-1 SMOTE with both neighbour searches given as objects."""
    sampler = SMOTE(
        random_state=RND_SEED,
        kind='borderline1',
        k_neighbors=NearestNeighbors(n_neighbors=6),
        m_neighbors=NearestNeighbors(n_neighbors=11))
    X_resampled, y_resampled = sampler.fit_resample(X, Y)

    # Reference values the resampled set has to match.
    X_gt = np.array([[0.11622591, -0.0317206], [0.77481731, 0.60935141],
                     [1.25192108, -0.22367336], [0.53366841, -0.30312976],
                     [1.52091956, -0.49283504], [-0.28162401, -2.10400981],
                     [0.83680821, 1.72827342], [0.3084254, 0.33299982],
                     [0.70472253, -0.73309052], [0.28893132, -0.38761769],
                     [1.15514042, 0.0129463], [0.88407872, 0.35454207],
                     [1.31301027, -0.92648734], [-1.11515198, -0.93689695],
                     [-0.18410027, -0.45194484], [0.9281014, 0.53085498],
                     [-0.14374509, 0.27370049], [-0.41635887, -0.38299653],
                     [0.08711622, 0.93259929], [1.70580611, -0.11219234],
                     [0.3765279, -0.2009615], [0.55276636, -0.10550373],
                     [0.45413452, -0.08883319], [1.21118683, -0.22817957]])
    y_gt = np.array([
        0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0
    ])

    assert_allclose(X_resampled, X_gt, rtol=R_TOL)
    assert_array_equal(y_resampled, y_gt)
def test_sample_with_nn_svm():
    """SVM SMOTE with a NearestNeighbors object and an SVC(gamma='scale')."""
    sampler = SMOTE(
        random_state=RND_SEED,
        kind='svm',
        k_neighbors=NearestNeighbors(n_neighbors=6),
        svm_estimator=SVC(gamma='scale', random_state=RND_SEED))
    X_resampled, y_resampled = sampler.fit_resample(X, Y)

    # Reference values the resampled set has to match.
    X_gt = np.array([[0.11622591, -0.0317206], [0.77481731, 0.60935141],
                     [1.25192108, -0.22367336], [0.53366841, -0.30312976],
                     [1.52091956, -0.49283504], [-0.28162401, -2.10400981],
                     [0.83680821, 1.72827342], [0.3084254, 0.33299982],
                     [0.70472253, -0.73309052], [0.28893132, -0.38761769],
                     [1.15514042, 0.0129463], [0.88407872, 0.35454207],
                     [1.31301027, -0.92648734], [-1.11515198, -0.93689695],
                     [-0.18410027, -0.45194484], [0.9281014, 0.53085498],
                     [-0.14374509, 0.27370049], [-0.41635887, -0.38299653],
                     [0.08711622, 0.93259929], [1.70580611, -0.11219234],
                     [0.47436887, -0.2645749], [1.07844562, -0.19435291],
                     [1.44228238, -1.31256615], [1.25636713, -1.04463226]])
    y_gt = np.array([
        0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0
    ])

    assert_allclose(X_resampled, X_gt, rtol=R_TOL)
    assert_array_equal(y_resampled, y_gt)
    def fit(self, X, y=None):
        """Over-sample (X, y) with regular SMOTE, then fit the forest on it.

        Parameters
        ----------
        X : training samples, passed to ``SMOTE.fit_sample``.
        y : training labels, passed to ``SMOTE.fit_sample``.

        Returns
        -------
        Whatever the parent ``fit`` returns for the resampled data.
        """
        # Benchmark notes kept from earlier experiments with other samplers:
        #   'Random under-sampling'
        #   CondensedNearestNeighbour(size_ngh=51, n_seeds_S=51)
        #     Accuracy: 0.939693267481
        #     Precision: 0.238095238095
        #     Recall: 0.897435897436
        #
        #     Accuracy: 0.962568234988
        #     Precision: 0.324468085106
        #     Recall: 0.782051282051
        #   SMOTE(ratio=ratio, kind='borderline1')
        #     Accuracy: 0.971146347803
        #     Precision: 0.372093023256
        #     Recall: 0.615384615385
        #   SMOTE(ratio=ratio, kind='borderline2')
        #     Accuracy: 0.965427605927
        #     Precision: 0.333333333333
        #     Recall: 0.705128205128
        #   svm_args = {'class_weight': 'auto'}
        #   svmsmote = SMOTE(ratio=ratio, kind='svm', **svm_args)
        #     Accuracy: 0.972186119054
        #     Precision: 0.395683453237
        #     Recall: 0.705128205128
        balanced_X, balanced_y = SMOTE(ratio='auto', kind='regular').fit_sample(X, y)

        # Per-sample weighting (weights = 1/y.mean() for the positive class,
        # passed as sample_weight) was tried here and is currently disabled.
        return super(RandomForestClassifier, self).fit(balanced_X, balanced_y)
def train(addr_train, clf, sampling, add_estimators):
    """Fit `clf` on the day sample stored under `addr_train`.

    The last column of the loaded matrix is used as the label and all other
    columns as features.

    Parameters
    ----------
    addr_train : directory containing ``day_samp_bin.npy``; the model is
        saved in its ``Random_Forest_Models`` sub-directory when the
        module-level ``__SAVE_MODEL`` flag is set.
    clf : estimator exposing ``n_estimators`` and ``fit``.
    sampling : ``"Over"`` applies SMOTE(ratio=0.95), ``"Under"`` calls
        ``US.undersample(X, 0.01)``; any other value trains on the data as is.
    add_estimators : number added to ``clf.n_estimators`` before fitting.

    Returns
    -------
    The fitted `clf`.
    """
    # NOTE(review): the file is handed to smio.load_sparse_csr in text mode;
    # .npy data is binary, so "rb" is probably needed on Python 3 -- confirm
    # against the smio helper.
    with open(os.path.join(addr_train, "day_samp_bin.npy"), "r") as file_in:
        X = smio.load_sparse_csr(file_in)
    width = np.size(X, 1)
    X_train = X[:, :width-1]
    y_train = X[:, width-1]
    if sampling == "Over":
        sm = SMOTE(ratio=0.95)
        X_train, y_train = sm.fit_sample(X_train, y_train)
    elif sampling == "Under":
        X_train, y_train = US.undersample(X, 0.01)

    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print("Fitting Model......")
    clf.n_estimators += add_estimators
    clf.fit(X_train, y_train)
    print("Done")

    if __SAVE_MODEL:
        model_name = "RF_" + onoff_line + "_" + sampling + "_Model.p"
        dir_out = os.path.join(addr_train, "Random_Forest_Models")
        if not os.path.isdir(dir_out):
            os.mkdir(dir_out)
        path_out = os.path.join(dir_out, model_name)
        # Pickle streams are bytes: the file must be opened in binary mode.
        with open(path_out, "wb") as file_out:
            pickle.dump(clf, file_out)

    return clf
Exemple #7
0
def get_data(month, day, hour=-1, mode="normal"):
    """Load one sample file of the 2016 data set and split it into X and y.

    Parameters
    ----------
    month, day : calendar position of the file under ``/mnt/rips2/2016``.
    hour : hour of the day; ``-1`` (default) loads the per-day sample
        ``day_samp_newer_bin.npy`` instead of an hourly ``output_bin.npy``.
        ``24`` means midnight of the following day.
    mode : ``"over"`` over-samples the data with SMOTE(ratio=0.99); any
        other value returns the data unchanged.

    Returns
    -------
    (X, y) : all columns but the last, and the last column, of the dense
        matrix rebuilt from the stored CSR components.
    """
    import datetime  # function-scope import: only the rollover needs it

    if hour != -1:
        if hour == 24:
            hour = 0
            # Roll over with real calendar arithmetic so that e.g. Jan 31
            # becomes Feb 1 rather than the non-existent "day 32".
            try:
                rolled = datetime.date(2016, month, day) + datetime.timedelta(days=1)
            except (TypeError, ValueError):
                rolled = None
            if rolled is not None and rolled.year == 2016:
                month, day = rolled.month, rolled.day
            else:
                # Invalid date or past the end of 2016: keep the plain
                # increment so the path stays non-existent and open() fails
                # exactly as it did before.
                day += 1
        addr_in = os.path.join("/mnt/rips2/2016",
                               str(month).rjust(2, "0"),
                               str(day).rjust(2, "0"),
                               str(hour).rjust(2, "0"),
                               "output_bin.npy")
    else:
        addr_in = os.path.join("/mnt/rips2/2016",
                               str(month).rjust(2, "0"),
                               str(day).rjust(2, "0"),
                               "day_samp_newer_bin.npy")
    # np.load needs a binary-mode file object.
    with open(addr_in, "rb") as file_in:
        loader = np.load(file_in)
        # The archive is read lazily, so build the matrix while it is open.
        data = csr_matrix((loader['data'], loader['indices'], loader['indptr']), shape=loader['shape']).toarray()
    X = data[:, :-1]
    y = data[:, -1]

    if mode == "over":
        sm = SMOTE(ratio=0.99, verbose=0)
        X, y = sm.fit_sample(X, y)

    return X, y
def resample_data(X, y, categorical_lst):
    """Over-sample (X, y) with regular SMOTE and round categorical columns.

    Parameters
    ----------
    X, y : samples and labels handed to ``SMOTE.fit_sample``.
    categorical_lst : column indexer selecting the categorical features of
        the resampled matrix.

    Returns
    -------
    (X_resampled, y_resampled), with the selected columns rounded by
    ``np.round``.
    """
    X_balanced, y_balanced = SMOTE(kind='regular').fit_sample(X, y)

    # Round the categorical columns back to whole numbers.
    categorical_values = X_balanced[:, categorical_lst]
    X_balanced[:, categorical_lst] = np.round(categorical_values)

    return X_balanced, y_balanced
def test_sample_wrong_X():
    """Check that sampling data other than the fitted data raises an error."""

    sampler = SMOTE(random_state=RND_SEED)
    sampler.fit(X, Y)

    # Data that differs from what the sampler was fitted on.
    other_X = np.random.random((100, 40))
    other_y = np.array([0] * 50 + [1] * 50)

    assert_raises(RuntimeError, sampler.sample, other_X, other_y)
def test_sample_regular_wrong_svm():
    """SVM SMOTE must reject an ``svm_estimator`` that is not an estimator."""
    sampler = SMOTE(
        random_state=RND_SEED,
        kind='svm',
        k_neighbors=NearestNeighbors(n_neighbors=6),
        svm_estimator='rnd')

    with raises(ValueError, match="has to be one of"):
        sampler.fit_sample(X, Y)
def Input_Preparing(Scaled_Input_Data, Surgery_Outcome, N_Feat):
    """Select features with JMI mutual information, then balance with SMOTE.

    Parameters
    ----------
    Scaled_Input_Data : feature table supporting ``.loc`` column selection.
    Surgery_Outcome : target labels aligned with `Scaled_Input_Data`.
    N_Feat : value passed as ``n_features`` to the feature selector.

    Returns
    -------
    (Prep_Train_Data, Prep_Surgery_Outcome, support) : the over-sampled
    selected features, the over-sampled labels, and the selector's
    ``support_`` attribute.
    """
    # Feature selection
    MIFS = mifs.MutualInformationFeatureSelector(method='JMI', verbose=2, n_features=N_Feat)
    MIFS.fit(Scaled_Input_Data, Surgery_Outcome)
    Selected_Input_Data = Scaled_Input_Data.loc[:, MIFS.support_]

    # Balancing using SMOTE. The selected features and the outcome argument
    # are resampled; the previous code referenced undefined names X and y.
    sm = SMOTE(kind='regular')
    Prep_Train_Data, Prep_Surgery_Outcome = sm.fit_sample(Selected_Input_Data, Surgery_Outcome)

    return (Prep_Train_Data, Prep_Surgery_Outcome, MIFS.support_)
Exemple #12
0
    def transform(self, fp):
        """Yield SMOTE over-sampled training features, then the test features.

        The training arrays are extracted from `fp`, over-sampled (using the
        first label column), converted back into features flagged with
        ``is_over_sampled=True``, and yielded. Every feature of `fp` whose
        ``split_type`` is ``SplitType.TEST`` is yielded afterwards as is.
        """
        fm, train_x, train_y = FeaturePool.to_train_arrays(fp)

        sampler = SMOTE(random_state = self.random_state)
        resampled_x, resampled_y = sampler.fit_sample(train_x, train_y[:, 0])
        # Restore the labels to a single-column 2-D array.
        resampled_y = resampled_y.reshape((resampled_y.shape[0], 1))

        for feature in FeaturePool.from_train_arrays(fm, resampled_x, resampled_y):
            yield Feature.apply_config(feature, is_over_sampled=True)

        for feature in fp:
            if feature.split_type == SplitType.TEST:
                yield feature
def SMT(df, target):
    """Over-sample `df` with regular SMOTE on the 'anti_churn' label.

    Parameters
    ----------
    df : DataFrame containing an 'anti_churn' column; it is not modified.
    target : value passed as ``ratio`` to SMOTE.

    Returns
    -------
    (X_resampled, y_resampled) : two DataFrames, the features carrying the
    original feature column names and the labels in a single 'anti_churn'
    column.
    """
    features = df.copy()
    labels = features.pop('anti_churn')
    feature_names = features.columns

    sampler = SMOTE(kind='regular', ratio = target)
    X_resampled, y_resampled = sampler.fit_sample(features, labels)

    # Wrap the resampled arrays and restore the column names.
    X_resampled = pd.DataFrame(X_resampled)
    X_resampled.columns = feature_names
    y_resampled = pd.DataFrame(y_resampled)
    y_resampled.columns = ['anti_churn']

    return X_resampled, y_resampled
    def oversample(self):
        """Balance ``self.X`` / ``self.y`` by outcome using SMOTE.

        Prints the class counts before and after resampling and refreshes
        ``self.Xview`` with the first ``self.n_features`` columns of the
        resampled data.
        """
        print('Current outcome sampling {}'.format(Counter(self.y)))

        # RandomOverSampler() and ADASYN() were tried here as alternatives.
        sampler = SMOTE()
        resampled_X, resampled_y = sampler.fit_sample(self.X, self.y)
        self.X, self.y = resampled_X, resampled_y

        self.Xview = self.X.view()[:, :self.n_features]
        print('Resampled dataset shape {}'.format(Counter(self.y)))
def test_smote_fit():
    """Check the class statistics computed by ``fit``."""

    sampler = SMOTE(random_state=RND_SEED)
    sampler.fit(X, Y)

    # Minority / majority class labels.
    assert_equal(sampler.min_c_, 0)
    assert_equal(sampler.maj_c_, 1)

    # Per-class sample counts.
    class_counts = sampler.stats_c_
    assert_equal(class_counts[0], 8)
    assert_equal(sampler.stats_c_[1], 12)
def test_sample_regular():
    """Compare regular SMOTE output with the arrays stored next to this file."""

    sampler = SMOTE(random_state=RND_SEED, kind='regular')
    # The sampler is fitted once on its own before fit_sample is called.
    sampler.fit(X, Y)

    X_resampled, y_resampled = sampler.fit_sample(X, Y)

    # Reference arrays live in the ``data`` directory beside this module.
    data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'data')
    X_gt = np.load(os.path.join(data_dir, 'smote_reg_x.npy'))
    y_gt = np.load(os.path.join(data_dir, 'smote_reg_y.npy'))

    assert_array_equal(X_resampled, X_gt)
    assert_array_equal(y_resampled, y_gt)
Exemple #17
0
def train(cutoffs):
    """Train a Bernoulli naive Bayes model incrementally over the training files.

    For each input file the last column is taken as the label, the training
    part is over-sampled with SMOTE(ratio=0.95) and fed to ``partial_fit``.

    Parameters
    ----------
    cutoffs : sized collection; when non-empty it is passed to
        ``discard_vars`` to drop variables from every loaded matrix.

    Returns
    -------
    ``[clf, layer]`` when the module-level ``__IF_TRAIN_WITHOUT_SAVE`` flag
    is set (``layer`` being the CorEx layer); otherwise the model is pickled
    to ``__ROOT_MODEL`` and ``[]`` is returned.
    """
    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print("\n========== Start Training ==========")
    if __DATA_FROM == 2:
        list_io_addr = get_io_addr(__TRAIN_DATA[0], __TRAIN_DATA[1])
    else:
        list_io_addr = get_io_addr_random_sample(__TRAIN_DATA[0], __TRAIN_DATA[1])
    clf = BernoulliNB(class_prior=[0.05, 0.95])

    # NOTE(review): input files are handed to Sparse_Matrix_IO in text mode;
    # .npy data is binary, so "rb" is probably needed on Python 3 -- confirm
    # against the helper.
    if __IF_TRAIN_WITHOUT_SAVE:
        print("Performing correlation explanation......")
        with open("/home/wlu/Desktop/day_samp_bin_1-2.npy", "r") as file_in:
            X = Sparse_Matrix_IO.load_sparse_csr(file_in)
            if len(cutoffs) > 0:
                X = discard_vars(X, cutoffs)
            layer = correlation_ex(X)

    for path_in in list_io_addr:
        print("\nGenerating training set from {}".format(path_in))
        with open(path_in, "r") as file_in:
            X = Sparse_Matrix_IO.load_sparse_csr(file_in)

        if len(cutoffs) > 0:
            X = discard_vars(X, cutoffs)

        # The last column holds the label, everything before it the features.
        vector_len = len(X[0])
        X_train = X[:, 0:vector_len-1]
        y_train = X[:, vector_len-1]

        if __IF_TRAIN_WITHOUT_SAVE:
            print("Transforming training set according to CorEx......")
            X_train = corex_transform(layer, X_train)

        sm = SMOTE(ratio=0.95)
        X_train, y_train = sm.fit_sample(X_train, y_train)

        print("Fitting Model......")
        clf.partial_fit(X_train, y_train, classes=[0, 1])
        print("Done")

    if __IF_TRAIN_WITHOUT_SAVE:
        return [clf, layer]

    # Pickle streams are bytes: the file must be opened in binary mode.
    with open(__ROOT_MODEL, "wb") as file_out:
        pickle.dump(clf, file_out)
    return []
def get_data(ratio, sampling):
    """Load every input file and build a train/test split.

    The last column of the stacked data is the label.

    Parameters
    ----------
    ratio : passed to SMOTE (``sampling == "Over"``) or to
        ``US.undersample`` (under-sampling branch).
    sampling : ``"Over"`` and ``"None"`` use the first 30000 rows with an
        80/20 split, ``"Over"`` additionally over-sampling the training
        part; any other value under-samples the first 20% of the rows for
        training and tests on the remaining rows.

    Returns
    -------
    X_train, y_train, X_test, y_test
    """
    data = []
    for addr_in in get_io_addr():
        with open(addr_in, "r") as file_in:
            data.extend(smio.load_sparse_csr(file_in))
    data = np.array(data)
    m = int(np.size(data, 1))

    if sampling in ("Over", "None"):
        n = 30000
        k = int(0.8*n)
        X = data[:n, :m-1]
        y = data[:n, m-1:]
        if sampling == "None":
            y = y.ravel()
        X_train, X_test = X[:k, :], X[k:, :]
        y_train, y_test = y[:k], y[k:]
        if sampling == "Over":
            # NOTE(review): y_test stays a 2-D column in this branch while
            # the other branches return it flattened; kept as is because
            # callers may rely on the shape.
            sm = SMOTE(ratio=ratio)
            X_train, y_train = sm.fit_sample(X_train, column_or_1d(y_train, warn=False))
    else:
        k = int(0.2*np.size(data, 0))
        data_test = data[k:, :]
        data = US.undersample(data[:k, :], ratio)
        k = int(0.8*np.size(data, 0))
        if np.size(data_test, 0) > k:
            # Cap the size of the test set. The cap is applied to the test
            # rows themselves; the previous code sliced the under-sampled
            # training data here, so the model was tested on training rows.
            data_test = data_test[:k, :]
        X_train = data[:, :m-1]
        y_train = data[:, m-1:].ravel()
        X_test = data_test[:, :m-1]
        y_test = data_test[:, m-1:].ravel()
    return X_train, y_train, X_test, y_test
def clf_extratree_predictor(item):
    """Fit an ExtraTreesClassifier on one fold and predict its test part.

    Parameters
    ----------
    item : tuple ``(clf_args, idx, X, y, use_SMOTE)`` where `clf_args` are
        the classifier keyword arguments, `idx` is a
        ``(train_index, test_index)`` pair, and `use_SMOTE` switches regular
        SMOTE over-sampling of the training part on.

    Returns
    -------
    (idx, pred, pred_proba) : the fold indices as given, the predicted
    labels and the predicted class probabilities for the test part.
    """
    clf_args, idx, X, y, use_SMOTE = item
    train_index, test_index = idx

    model = sklearn.ensemble.ExtraTreesClassifier(**clf_args)

    fold_X_train = X[train_index]
    fold_X_test = X[test_index]
    fold_y_train = y[train_index]
    _fold_y_test = y[test_index]  # indexed as before, not used further

    if use_SMOTE:
        fold_X_train, fold_y_train = SMOTE(
            ratio='auto', kind='regular').fit_sample(fold_X_train, fold_y_train)

    model.fit(fold_X_train, fold_y_train)

    predictions = model.predict(fold_X_test)
    probabilities = model.predict_proba(fold_X_test)
    return idx, predictions, probabilities
def train_and_test_dnn(args):
    """Cross-validate and save an MLP classifier for one query primitive.

    Parameters
    ----------
    args : sequence laid out like a command line:
        ``args[1]`` name of the primitive (a label column in the CSV),
        ``args[2]`` path of the pickled input for ``get_doc_term_matrix``,
        ``args[3]`` path of the CSV holding the truth labels,
        ``args[4]`` path of the results file, which is appended to.

    The data are over-sampled with SMOTE when possible, scored with 3-fold
    cross-validation, and one line ``primitive, fit time, score time, test
    score`` (averages) is appended to the results file. The classifier is
    saved under ``./models/``.
    """
    for a in args:
        print(a)

    primitive = args[1]
    # Read the path from `args` like every other argument (the previous code
    # used sys.argv[2]) and close the file once it has been loaded.
    # NOTE: pickle.load must only be used on trusted files.
    with open(args[2], "rb") as res_file:
        res = pickle.load(res_file)
    notes_with_truth_labels_for_query_primitives = pd.read_csv(args[3])

    X = get_doc_term_matrix(res)
    y = notes_with_truth_labels_for_query_primitives.loc[:, primitive]

    clf = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(128, 5, 2), random_state=1)

    try:
        sm = SMOTE(random_state=357)
        X_sm, y_sm = sm.fit_sample(X, y)
    except ValueError:
        # Best effort: fall back to the original data if SMOTE rejects it.
        print("value error, smote")
        X_sm, y_sm = X, y

    cv_results = cross_validate(clf, X_sm, y_sm, cv=3, return_train_score=False)
    print(cv_results)

    # cross_validate fits clones and leaves `clf` untouched, so fit it here;
    # otherwise the file named "trained" would hold an unfitted model.
    clf.fit(X_sm, y_sm)
    dump(clf, './models/{}_trained_dnn.joblib'.format(primitive))

    avg_fit_time = np.mean(cv_results['fit_time'])
    avg_score_time = np.mean(cv_results['score_time'])
    avg_test_score = np.mean(cv_results['test_score'])

    with open(args[4], 'a') as f:
        f.write("{}, {}, {}, {}\n".format(primitive, avg_fit_time, avg_score_time, avg_test_score))

    print("DONE w/ {}".format(primitive))
def run_save_model(save_folder, spec, model_no, X_train, y_train, model_fn):
    """Evaluate a model spec with 10-fold stratified CV and save a summary.

    In every fold a fresh model is built with ``model_fn(spec, X_train)``,
    the training part is over-sampled with borderline-1 SMOTE (the samples
    are flattened for SMOTE and reshaped afterwards), and accuracy and F1
    are measured on the validation part.

    Parameters
    ----------
    save_folder : prefix of the output path; the summary is written to
        ``save_folder + 'model' + str(model_no) + '.txt'``.
    spec : model specification given to `model_fn` and written to the
        summary with ``writelines``.
    model_no : number used in the output file name.
    X_train : 3-D array of samples (unpacked as examples, dx, dy).
    y_train : labels aligned with `X_train`.
    model_fn : callable returning a model, or None for an invalid spec, in
        which case the function returns immediately without writing.
    """
    kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=2)
    cvscores = []
    f1scores = []
    for train, val in kfold.split(X_train, y_train):
        # create model using the model_fn parameter
        model = model_fn(spec, X_train)
        if model is None:
            return  # returns if there was a mistake in specifications
        # SMOTE works on 2-D data: flatten each sample, resample, reshape
        num_examples, dx, dy = X_train[train].shape
        X_resampled, y_resampled = SMOTE(kind='borderline1', random_state=1).fit_sample(
            X_train[train].reshape((num_examples, dx * dy)), y_train[train])
        num_total_examples, _ = X_resampled.shape
        X_resampled_reshaped = X_resampled.reshape(num_total_examples, dx, dy)
        model.fit(x=X_resampled_reshaped, y=y_resampled, epochs=10, batch_size=16, verbose=0)
        # evaluate model on the untouched validation part
        scores = model.evaluate(X_train[val], y_train[val], verbose=0)
        print('Accuracy: {}%'.format(scores[1] * 100))
        cvscores.append(scores[1])
        # get f1
        f1 = f1_score(y_train[val], model.predict(X_train[val]) > 0.5)
        print('F1 score: {}'.format(f1))
        f1scores.append(f1)

    mean_acc = 'Mean Accuracy: {}% +/- {}%'.format(np.mean(cvscores) * 100, np.std(cvscores) * 100)
    mean_f1 = 'Mean F1 score: {} +/- {}'.format(np.mean(f1scores), np.std(f1scores))
    print(mean_acc)
    print(mean_f1)

    # modelfile = save_folder + 'model' + str(model_no) + '.h5'
    # save_model(model, modelfile)
    # print('model saved')

    txtfile = save_folder + 'model' + str(model_no) + '.txt'
    with open(txtfile, 'w') as f:
        # One summary per line; they used to be written back to back
        # without any separator.
        f.write(mean_acc)
        f.write('\n')
        f.write(mean_f1)
        f.write('\n')
        f.write('\n')
        f.writelines(spec)
        print('specs saved')
def test_sample_regular_half():
    """Regular SMOTE with explicit per-class target counts as ``ratio``."""
    target_counts = {0: 9, 1: 12}
    sampler = SMOTE(ratio=target_counts, random_state=RND_SEED, kind='regular')
    X_resampled, y_resampled = sampler.fit_sample(X, Y)

    # Reference values the resampled set has to match.
    X_gt = np.array([[0.11622591, -0.0317206], [0.77481731, 0.60935141],
                     [1.25192108, -0.22367336], [0.53366841, -0.30312976],
                     [1.52091956, -0.49283504], [-0.28162401, -2.10400981],
                     [0.83680821, 1.72827342], [0.3084254, 0.33299982],
                     [0.70472253, -0.73309052], [0.28893132, -0.38761769],
                     [1.15514042, 0.0129463], [0.88407872, 0.35454207],
                     [1.31301027, -0.92648734], [-1.11515198, -0.93689695],
                     [-0.18410027, -0.45194484], [0.9281014, 0.53085498],
                     [-0.14374509, 0.27370049], [-0.41635887, -0.38299653],
                     [0.08711622, 0.93259929], [1.70580611, -0.11219234],
                     [0.36784496, -0.1953161]])
    y_gt = np.array(
        [0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0])

    assert_allclose(X_resampled, X_gt, rtol=R_TOL)
    assert_array_equal(y_resampled, y_gt)
Exemple #23
0
def train_decisiontree_with(configurationname, train_data, k, score_function, undersam=False, oversam=False,
                            export=False):
    """Select the k best features, optionally resample, and fit a decision tree.

    Parameters
    ----------
    configurationname : label used in log output and in the exported file name.
    train_data : triple ``(X_train, y_train, id_to_a_train)``.
    k : number of features kept by ``SelectKBest``; must be positive.
    score_function : scoring function handed to ``SelectKBest``.
    undersam, oversam : resampling switches. Only `undersam` uses
        RepeatedEditedNearestNeighbours, only `oversam` uses SMOTE, both
        together use SMOTEENN, neither leaves the data unchanged.
    export : when true, write the tree as a Graphviz ``.dot`` file and call
        ``transform`` with the selected feature indices.

    Returns
    -------
    (selector, dtc) : the fitted feature selector and the fitted tree.
    """
    assert k > 0
    print("Training with configuration " + configurationname)
    X_train, y_train, id_to_a_train = train_data
    classifier = DecisionTreeClassifier(random_state=0)

    print("Feature Selection")
    # selector = SelectFpr(score_function)
    selector = SelectKBest(score_function, k=k)
    fitted_selector = selector.fit(X_train, y_train)
    X_train = selector.transform(X_train)

    fitted_ids = list(fitted_selector.get_support(indices=True))

    print("Apply Resampling")
    print(Counter(y_train))
    # Pick the resampler matching the flag combination (None: no resampling).
    if undersam and oversam:
        resampler = SMOTEENN(random_state=0)
    elif undersam:
        resampler = RepeatedEditedNearestNeighbours()
    elif oversam:
        # feature_indices_array = list(range(len(f_to_id)))
        # smote_nc = SMOTENC(categorical_features=feature_indices_array, random_state=0)
        # X_train, y_train = smote_nc.fit_resample(X_train, y_train)
        resampler = SMOTE(random_state=42)
    else:
        resampler = None
    if resampler is not None:
        X_train, y_train = resampler.fit_resample(X_train, y_train)
    print(Counter(y_train))

    print("Train Classifier")
    classifier = classifier.fit(X_train, y_train, check_input=True)

    if export:
        dot_path = DATAP + "/temp/trees/sltree_" + configurationname + ".dot"
        export_graphviz(classifier, out_file=dot_path, filled=True)
        transform(fitted_ids, configurationname)

    print("Self Accuracy: " + str(classifier.score(X_train, y_train)))

    return selector, classifier
Exemple #24
0
def DataFormat(data):
    """Load a matrix, over-sample its first 50000 rows, and shuffle them.

    The last column is treated as the label.

    Returns
    -------
    (X_train, y_train, n, K) : the shuffled over-sampled features and
    labels, the number of rows of the loaded matrix, and the number of
    non-zero labels after over-sampling.
    """
    Data = smio.load_sparse_csr(data)
    n_cols = int(np.size(Data, 1))
    n = int(np.size(Data, 0))

    features = Data[:50000, :n_cols-1]
    labels = Data[:50000, n_cols-1]
    features, labels = SMOTE(ratio=0.95).fit_sample(features, labels)

    # Glue each label onto its row so rows and labels are shuffled together.
    rows = [
        list(features[i].tolist()) + [labels[i]]
        for i in range(np.size(features, 0))
    ]
    shuffle(rows)
    shuffled = np.array(rows)

    last = int(np.size(shuffled, 1)) - 1
    X_train = shuffled[:, :last]
    y_train = shuffled[:, last]

    K = np.count_nonzero(y_train)   # Number of good data points
    return X_train, y_train, n, K   # Training set plus some numbers useful for weighting
def test_sample_regular_with_nn():
    """Regular SMOTE with ``k_neighbors`` given as a NearestNeighbors object."""
    sampler = SMOTE(
        random_state=RND_SEED,
        kind='regular',
        k_neighbors=NearestNeighbors(n_neighbors=6))
    X_resampled, y_resampled = sampler.fit_sample(X, Y)

    # Reference values the resampled set has to match.
    X_gt = np.array([[0.11622591, -0.0317206], [0.77481731, 0.60935141],
                     [1.25192108, -0.22367336], [0.53366841, -0.30312976],
                     [1.52091956, -0.49283504], [-0.28162401, -2.10400981],
                     [0.83680821, 1.72827342], [0.3084254, 0.33299982],
                     [0.70472253, -0.73309052], [0.28893132, -0.38761769],
                     [1.15514042, 0.0129463], [0.88407872, 0.35454207],
                     [1.31301027, -0.92648734], [-1.11515198, -0.93689695],
                     [-0.18410027, -0.45194484], [0.9281014, 0.53085498],
                     [-0.14374509, 0.27370049], [-0.41635887, -0.38299653],
                     [0.08711622, 0.93259929], [1.70580611, -0.11219234],
                     [0.29307743, -0.14670439], [0.84976473, -0.15570176],
                     [0.61319159, -0.11571668], [0.66052536, -0.28246517]])
    y_gt = np.array([
        0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0
    ])

    assert_allclose(X_resampled, X_gt, rtol=R_TOL)
    assert_array_equal(y_resampled, y_gt)
Exemple #26
0
def oversample(X, y, bal_strategy):
    """Balance (X, y) with the over-sampler selected by `bal_strategy`.

    Parameters
    ----------
    X, y : samples and labels.
    bal_strategy : one of
        ``"SMOTESVN"`` or ``"ALL"`` -- SMOTE(kind='svm'),
        ``"SMOTE"`` -- SMOTE(kind='regular'),
        ``"ADASYN"`` -- ADASYN(),
        ``"NONE"`` -- return the data unchanged.
        Any other value prints a message and exits the process with
        status 1.

    Returns
    -------
    (X_sampled, y_sampled); their shapes are printed before returning.
    """
    # "ALL" has always been handled by the first branch only; the
    # unreachable `== "ALL"` tests of the later branches are dropped.
    if bal_strategy in ("SMOTESVN", "ALL"):
        # Apply SMOTE SVM
        X_sampled, y_sampled = SMOTE(kind='svm').fit_sample(X, y)
    elif bal_strategy == "SMOTE":
        # Apply regular SMOTE
        X_sampled, y_sampled = SMOTE(kind='regular').fit_sample(X, y)
    elif bal_strategy == "ADASYN":
        # Apply ADASYN
        X_sampled, y_sampled = ADASYN().fit_sample(X, y)
    elif bal_strategy == 'NONE':
        X_sampled, y_sampled = X, y
    else:
        print('bal_stragegy not in SMOTESVN, SMOTE, ADASYN, ALL, NONE')
        sys.exit(1)

    # %-formatting of a single string prints the same text on Python 2 and
    # Python 3 (including the double space of the original print statement).
    print('Shape of X_sampled:  %s' % (X_sampled.shape,))
    print('Shape of y_sampled:  %s' % (y_sampled.shape,))

    return (X_sampled, y_sampled)
Exemple #27
0
def get(addr_day, mode="normal", ratio=-1, sampling_method="None", bin=False):
    """Load one day's sample matrix and split it into features and labels.

    Parameters
    ----------
    addr_day : directory of the day's data.
    mode : when it contains ``"res"`` the reservoir files are used and the
        text after the first ``-`` selects the file suffix.
    ratio : ``-1`` (default) loads the combined sample file; any other value
        builds a 100000-row set from the separate negative and positive
        files, with ``int(100000 / (1 + ratio))`` negative rows, and
        shuffles it.
    sampling_method : SMOTE(ratio=0.95) is applied when it contains ``"Over"``.
    bin : unused.

    Returns
    -------
    (X, y) : all columns but the last, and the last column.
    """
    def _load(*path_parts):
        # Read one sparse matrix file located under `addr_day`.
        with open(os.path.join(addr_day, *path_parts), "r") as handle:
            return smio.load_sparse_csr(handle)

    if "res" in mode:
        res_ratio = mode.split("-")[1]
        prefix, suffix, res = "day_samp_res", "_{}.npy".format(res_ratio), "Reservoir_Data"
    else:
        prefix, suffix, res = "day_samp_new", ".npy", ""

    if ratio == -1:
        matrix = _load(res, prefix + suffix)
    else:
        n = 100000
        neg = int(n / (1+ratio))
        pos = n - neg

        matrix_neg = _load("PosNeg", res, prefix + "_neg" + suffix)[:neg, :]
        matrix_pos = _load("PosNeg", res, prefix + "_pos" + suffix)[:pos, :]

        matrix = vstack((matrix_neg, matrix_pos))
        np.random.shuffle(matrix)

    last = np.size(matrix, 1) - 1
    X = matrix[:, :last]
    y = matrix[:, last]

    if "Over" in sampling_method:
        X, y = SMOTE(ratio=0.95).fit_sample(X, y)

    return X, y
def test_wrong_nn():
    """SMOTE must reject neighbour arguments that are not valid estimators."""
    invalid_settings = [
        # borderline-1 with an invalid m_neighbors
        dict(kind='borderline1',
             k_neighbors=NearestNeighbors(n_neighbors=6),
             m_neighbors='rnd'),
        # borderline-1 with an invalid k_neighbors
        dict(kind='borderline1',
             k_neighbors='rnd',
             m_neighbors=NearestNeighbors(n_neighbors=10)),
        # regular SMOTE with an invalid k_neighbors
        dict(kind='regular', k_neighbors='rnd'),
    ]
    for settings in invalid_settings:
        sampler = SMOTE(random_state=RND_SEED, **settings)
        with raises(ValueError, match="has to be one of"):
            sampler.fit_sample(X, Y)
Exemple #29
0
# predict on the test data with the model fitted on the original training set
y_pred = rf.predict(X_test)

# calculate the accuracy score
score = accuracy_score(y_test, y_pred)

# calculate the precision
precision = precision_score(y_test, y_pred)

# display 'score' and 'precision'

# --------------
# import packages
from imblearn.over_sampling import SMOTE

# Instantiate smote
smote = SMOTE()

# over-sample the training data (fit_sample returns the resampled X and y)
X_train, y_train = smote.fit_sample(X_train, y_train)

# fit the model on the resampled training data
rf.fit(X_train, y_train)

# predict on test data
y_pred = rf.predict(X_test)

# calculate the accuracy score (overwrites the value computed above)
score = accuracy_score(y_test, y_pred)

# calculate the precision (overwrites the value computed above)
precision = precision_score(y_test, y_pred)
Exemple #30
0
class Predict_football():
    """Feature engineering, resampling and evaluation helper for match results.

    ``self.data`` holds the raw match table (plus derived columns added in
    ``__init__``) and ``self.df`` is a working copy whose feature columns are
    overwritten per match by ``H2H``, ``Last_5`` and ``Last_5_GF_GA``.

    ``RESULT`` is used as a three-class label; the plotting code labels the
    classes 0 / 1 / 2 as loss / draw / win.

    Several methods read the module-level name ``train_index`` to separate
    training rows from test rows.
    """

    def __init__(self, raw_data, model):
        """Store the data and model and derive the difference features.

        Args:
            raw_data: DataFrame with at least the columns FTHG, HTHG, FTAG,
                HTAG, HS, AS, HST and AST.  It is modified in place (new
                columns are added to it).
            model: estimator exposing ``fit``, ``predict`` and ``score``.
        """
        # Font able to render the Korean labels used in the plots.
        plt.rc("font", family="Malgun Gothic")
        plt.rcParams['axes.unicode_minus'] = False
        plt.figure(figsize=(10, 10))
        self.data = raw_data
        self.scaler = StandardScaler()
        self.sampler = SMOTE(random_state=42)
        self.pca = PCA(random_state=42)
        # Additive smoothing so the Pezzali ratio never divides by zero.
        self.add = 0.1
        # Indices of matches for which no head-to-head history exists.
        self.skip = []
        self.columns = None
        self.clf = model
        # Second-half goals and home-minus-away differences.
        self.data["2HTHG"] = self.data["FTHG"] - self.data["HTHG"]
        self.data["2HTAG"] = self.data["FTAG"] - self.data["HTAG"]
        self.data["FGD"] = self.data["FTHG"] - self.data["FTAG"]
        self.data["2HGD"] = self.data["2HTHG"] - self.data["2HTAG"]
        self.data["HGD"] = self.data["HTHG"] - self.data["HTAG"]
        self.data["SD"] = self.data["HS"] - self.data["AS"]
        self.data["STD"] = self.data["HST"] - self.data["AST"]
        # (home goals / home shots) * (away shots / away goals), smoothed.
        self.data["Pezzali"] = (self.data["FTHG"] + self.add) / (
            self.data["HS"] + self.add) * (self.data["AS"] + self.add) / (
                self.data["FTAG"] + self.add)
        # First five entries are home-side columns, last five away-side.
        self.COL = [
            "FTHG", "2HTHG", "HTHG", "HS", "HST", "FTAG", "2HTAG", "HTAG",
            "AS", "AST"
        ]
        # Relative (home minus away) features; Pezzali is a ratio.
        self.r_COL = ["FGD", "2HGD", "HGD", "SD", "STD", "Pezzali"]
        self.df = self.data.copy()

    # ------------------ ANOVA ------------------

    def ANOVA(self):
        """Run a one-way ANOVA per feature across the three RESULT classes.

        The first ``train_index`` rows are scaled, oversampled with the
        configured sampler and then compared with ``f_oneway``.  Prints the
        p-values and the columns whose p-value exceeds 0.05.
        """
        features = self.data.drop(
            columns=["Div", "HomeTeam", "AwayTeam", "RESULT"])
        features = features.iloc[:train_index, :]
        scaled_train = self.scaler.fit_transform(features)
        scaled_train = pd.DataFrame(scaled_train, columns=features.columns)
        # Slice the labels positionally, exactly like the features.  A label
        # slice (`.loc[:train_index]`) is end-inclusive and would yield one
        # row more than the feature matrix.
        labels = self.data["RESULT"].iloc[:train_index]
        resampled_X, resampled_y = self.sampler.fit_resample(
            scaled_train, labels)
        df = pd.concat([resampled_X, resampled_y], axis=1)
        fstat, p_val = f_oneway(df.loc[df["RESULT"] == 0, df.columns[:-1]],
                                df.loc[df["RESULT"] == 1, df.columns[:-1]],
                                df.loc[df["RESULT"] == 2, df.columns[:-1]])
        print(p_val)
        print(df.columns[:-1][p_val > 0.05])

    # ------------------ Post-hoc ------------------

    def PH(self):
        """Plot a Tukey HSD post-hoc comparison of RESULT groups per feature."""
        df = self.data.drop(columns=["Div", "HomeTeam", "AwayTeam", "RESULT"])
        # Iterating a DataFrame's columns yields labels, so select by label
        # (positional `iloc` / `columns[...]` with a label raises TypeError).
        for col in df.columns:
            posthoc = pairwise_tukeyhsd(self.data[col],
                                        self.data["RESULT"],
                                        alpha=0.05)
            plt.figure(figsize=(10, 10))
            posthoc.plot_simultaneous()
            plt.title("{}".format(col))
            plt.show()

    # ------------------ plot data pdf (probability density function) ------------------

    def plot(self):
        """Draw per-class relative-frequency bar charts for each feature.

        One figure with three panels (one per RESULT class) is shown for every
        column in ``self.data.columns[3:-1]``.
        """
        res = ["패", "무", "승"]
        color = ["r", "g", "b"]
        # Axis limits are initialised once, outside the column loop, so they
        # accumulate over all columns processed so far.
        y_Max = 0
        x_Max = 0
        x_Min = 100
        for col in self.data.columns[3:-1]:
            for i in range(3):
                subset = self.data[self.data["RESULT"] == i][col]
                values = subset.value_counts().values / subset.shape[0]
                y_Max = max(y_Max, values.max())
                x_Max = max(x_Max, subset.max())
                x_Min = min(x_Min, subset.min())
            fig, axes = plt.subplots(1, 3, figsize=(15, 8))
            for i, (r, ax, c) in enumerate(zip(res, axes, color)):
                subset = self.data[self.data["RESULT"] == i][col]
                counts = subset.value_counts()
                print(counts)
                index = counts.index.tolist()
                values = counts.values / subset.shape[0]
                ax.bar(index, values, color=c, label="{}".format(r))
                ax.set_xlabel("{}".format(col))
                ax.set_ylabel("bins")
                ax.set_xlim(x_Min, x_Max)
                ax.set_ylim(0, y_Max)
                ax.legend()
            plt.show()

    # ------------------ head-to-head: previous 5 meetings of the two teams ------------------

    def H2H(self, home, away, index, ratio=1):
        """Fill row ``index`` of ``self.df`` from the last 5 head-to-head games.

        Past meetings (rows with a smaller index) are re-expressed from the
        point of view of the current home team, weighted by how often each
        RESULT occurred, summed and written to ``self.df``.  If the teams
        never met before, ``index`` is appended to ``self.skip`` and nothing
        is written.

        Args:
            home: name of the current home team.
            away: name of the current away team.
            index: index label of the current match.
            ratio: scale factor, lowered when mixing with the other methods.
        """
        selected_df = self.data[self.data.index < index]
        record = selected_df[((selected_df['HomeTeam'] == home) &
                              (selected_df['AwayTeam'] == away)) |
                             ((selected_df['HomeTeam'] == away) &
                              (selected_df['AwayTeam'] == home))].copy()
        # Past meetings in which the current home team played away.
        played_away = record['AwayTeam'] == home

        # For wins and losses, flip HTR and RESULT of those games to the
        # current home team's perspective (draws, value 1, stay as they are).
        # NOTE(review): HTR is flipped with `3 - x` but RESULT with `2 - x`;
        # confirm that HTR really uses a different encoding from RESULT.
        htr_mask = played_away & (record['HTR'] != 1)
        record.loc[htr_mask, ['HTR']] = 3 - record.loc[htr_mask, ['HTR']]
        result_mask = played_away & (record['RESULT'] != 1)
        record.loc[result_mask, ['RESULT']] = \
            2 - record.loc[result_mask, ['RESULT']]

        # Difference features change sign; Pezzali is a ratio, so it is
        # inverted instead of negated.
        record.loc[played_away,
                   self.r_COL[:-1]] = -record.loc[played_away,
                                                  self.r_COL[:-1]]
        record.loc[played_away,
                   ["Pezzali"]] = 1 / record.loc[played_away, ["Pezzali"]]
        # Swap the home-side and away-side raw columns.
        temp = record.loc[played_away, self.COL[:5]].values
        record.loc[played_away,
                   self.COL[:5]] = record.loc[played_away,
                                              self.COL[5:]].values
        record.loc[played_away, self.COL[5:]] = temp

        if record.shape[0] == 0:
            self.skip.append(index)
            return
        if record.shape[0] >= 5:
            record = record[-5:]
        # Weight every row by the frequency of its RESULT among these games.
        # NOTE(review): the original also accumulated sum(count**2) in a local
        # that was never read, so the sums below are not normalised.
        for idx, val in record["RESULT"].value_counts().items():
            same_result = record["RESULT"] == idx
            record.loc[same_result,
                       self.r_COL] = record.loc[same_result,
                                                self.r_COL] * val
            record.loc[same_result,
                       self.COL] = record.loc[same_result, self.COL] * val
        # `ratio` scales the contribution when combined with other methods.
        self.df.loc[[index],
                    self.r_COL] = record[self.r_COL].sum(axis=0).values * ratio
        self.df.loc[[index],
                    self.COL] = record[self.COL].sum(axis=0).values * ratio
        self.df.loc[[index],
                    ["HTR"]] = np.ravel(record["HTR"].mean(axis=0)) * ratio

    # ------------------ home team's previous 5 games minus away team's previous 5 games ------------------

    def Last_5(self, home, away, index, ratio=0.2):
        """Add recent-form features of both teams to row ``index``.

        Writes the points of the last (up to) 10 games of each team to the
        ``HP`` / ``AP`` columns and adds the frequency-weighted sums of the
        last (up to) 5 games - home team minus away team - to the relative
        feature columns and ``HTR`` of ``self.df``.

        Args:
            home: name of the current home team.
            away: name of the current away team.
            index: index label of the current match.
            ratio: scale factor; forced to 0.5 when ``index`` is in
                ``self.skip`` (no head-to-head data was available).
        """
        selected_df = self.data[self.data.index < index]
        home_record = selected_df[((selected_df['HomeTeam'] == home) |
                                   (selected_df['AwayTeam'] == home))].copy()
        away_record = selected_df[((selected_df['HomeTeam'] == away) |
                                   (selected_df['AwayTeam'] == away))].copy()
        # Recode RESULT 2 as 3 so that summing RESULT yields points.
        # Assign the result back: an in-place replace on a column selection is
        # not guaranteed to reach the frame under pandas copy-on-write.
        home_record["RESULT"] = home_record["RESULT"].replace(2, 3)
        away_record["RESULT"] = away_record["RESULT"].replace(2, 3)
        # Flip HTR and RESULT of games the current home team played away.
        home_record.loc[(home_record['AwayTeam'] == home) & (home_record['HTR'] != 1), ['HTR']] = \
            3 - home_record.loc[(home_record['AwayTeam'] == home) & (home_record['HTR'] != 1), ['HTR']]
        home_record.loc[(home_record['AwayTeam'] == home) & (home_record['RESULT'] != 1), ['RESULT']] = \
            3 - home_record.loc[(home_record['AwayTeam'] == home) & (home_record['RESULT'] != 1), ['RESULT']]
        # NOTE(review): Pezzali of these rows is inverted here and once more
        # after truncation to the last 5 games below, so for the rows that
        # survive the two inversions cancel - confirm this is intended.
        home_record.loc[home_record["AwayTeam"] == home,
                        ["Pezzali"]] = 1 / home_record.loc[
                            home_record["AwayTeam"] == home, ["Pezzali"]]
        # Flip HTR and RESULT of games the current away team played away.
        away_record.loc[(away_record['AwayTeam'] == away) & (away_record['HTR'] != 1), ['HTR']] = \
            3 - away_record.loc[(away_record['AwayTeam'] == away) & (away_record['HTR'] != 1), ['HTR']]
        away_record.loc[(away_record['AwayTeam'] == away) & (away_record['RESULT'] != 1), ['RESULT']] = \
            3 - away_record.loc[(away_record['AwayTeam'] == away) & (away_record['RESULT'] != 1), ['RESULT']]

        if index in self.skip:
            self.df.loc[[index], ["HTR"] + self.r_COL] = 0
            ratio = 0.5
        # Points earned over the previous 10 games.
        if home_record.shape[0] >= 10:
            home_record = home_record[-10:]
        # Games with Div == "E1" (second division) count with weight 0.8.
        home_record.loc[home_record["Div"] == "E1", "RESULT"] *= 0.8
        self.df.loc[[index], "HP"] = home_record["RESULT"].sum(axis=0)

        if away_record.shape[0] >= 10:
            away_record = away_record[-10:]
        away_record.loc[away_record["Div"] == "E1", "RESULT"] *= 0.8
        self.df.loc[[index], "AP"] = away_record["RESULT"].sum(axis=0)

        if home_record.shape[0] >= 5:
            home_record = home_record[-5:]
        if away_record.shape[0] >= 5:
            away_record = away_record[-5:]
        # Express the home team's past away games from its own perspective.
        home_record.loc[home_record["AwayTeam"] == home,
                        self.r_COL[:-1]] = -home_record.loc[
                            home_record["AwayTeam"] == home, self.r_COL[:-1]]
        home_record.loc[home_record["AwayTeam"] == home,
                        ["Pezzali"]] = 1 / home_record.loc[
                            home_record["AwayTeam"] == home, ["Pezzali"]]
        # Weight every row by the frequency of its RESULT value.
        for idx, val in home_record["RESULT"].value_counts().items():
            home_record.loc[home_record["RESULT"] == idx, self.r_COL] *= val
        self.df.loc[[index], self.r_COL] += home_record[self.r_COL].sum(
            axis=0).values * ratio
        H_HTR = np.ravel(home_record["HTR"].mean(axis=0))

        # Express the away team's past away games from its own perspective.
        away_record.loc[away_record["AwayTeam"] == away,
                        self.r_COL[:-1]] = -away_record.loc[
                            away_record["AwayTeam"] == away, self.r_COL[:-1]]
        away_record.loc[away_record["AwayTeam"] == away,
                        ["Pezzali"]] = 1 / away_record.loc[
                            away_record["AwayTeam"] == away, ["Pezzali"]]
        for idx, val in away_record["RESULT"].value_counts().items():
            away_record.loc[away_record["RESULT"] == idx, self.r_COL] *= val
        self.df.loc[[index], self.r_COL] -= away_record[self.r_COL].sum(
            axis=0).values * ratio
        A_HTR = np.ravel(away_record["HTR"].mean(axis=0))
        self.df.loc[[index], ["HTR"]] += (H_HTR - A_HTR + 3) / 2 * ratio

    # ------------------ home team's previous 5 home games minus away team's previous 5 away games ------------------

    def Last_5_GF_GA(self, home, away, index, ratio=0.2):
        """Add venue-specific form (home at home, away when away) to ``index``.

        Uses the home team's last (up to) 5 home games and the away team's
        last (up to) 5 away games.  Nothing is written when either team has
        no such history.

        Args:
            home: name of the current home team.
            away: name of the current away team.
            index: index label of the current match.
            ratio: scale factor; forced to 1 when ``index`` is in
                ``self.skip`` (no head-to-head data was available).
        """
        selected_df = self.data[self.data.index < index].copy()
        home_record = selected_df[selected_df['HomeTeam'] == home].copy()
        away_record = selected_df[selected_df['AwayTeam'] == away].copy()
        # The original guard `a == 0 & b == 0` parsed as the chained
        # comparison `a == (0 & b) == 0`, i.e. it only tested the home side.
        # With an empty away history the HTR mean below is NaN and would be
        # written into self.df, so bail out when either side is empty.
        if home_record.shape[0] == 0 or away_record.shape[0] == 0:
            return
        # Points etc. are stored from the home side's perspective, so convert
        # the away team's rows to its own perspective.
        # NOTE(review): HTR is flipped with `3 - x` but RESULT with `2 - x`;
        # confirm that HTR really uses a different encoding from RESULT.
        away_record.loc[:, ["Pezzali"]] = 1 / away_record["Pezzali"]
        away_record.loc[away_record['HTR'] != 1, ['HTR']] = \
            3 - away_record.loc[away_record['HTR'] != 1, ['HTR']]
        away_record.loc[away_record['RESULT'] != 1, ['RESULT']] = \
            2 - away_record.loc[away_record['RESULT'] != 1, ['RESULT']]
        if home_record.shape[0] >= 5:
            home_record = home_record[-5:]
        if away_record.shape[0] >= 5:
            away_record = away_record[-5:]
        # The current match is between teams without head-to-head data: start
        # from zero and let this method contribute with full weight.
        if index in self.skip:
            self.df.loc[[index], ["HTR"] + self.r_COL] = 0
            ratio = 1
        # Weight every row by the frequency of its RESULT value.
        for idx, val in home_record["RESULT"].value_counts().items():
            home_record.loc[home_record["RESULT"] == idx,
                            self.COL[:5] + ["Pezzali"]] *= val
        self.df.loc[[index],
                    self.r_COL] += home_record[self.COL[:5] + ["Pezzali"]].sum(
                        axis=0).values * ratio
        self.df.loc[[index], self.COL[:5]] += home_record[self.COL[:5]].sum(
            axis=0).values
        for idx, val in away_record["RESULT"].value_counts().items():
            away_record.loc[away_record["RESULT"] == idx,
                            self.COL[5:] + ["Pezzali"]] *= val
        self.df.loc[[index],
                    self.r_COL] -= away_record[self.COL[5:] + ["Pezzali"]].sum(
                        axis=0).values * ratio
        self.df.loc[[index], self.COL[:5]] += away_record[self.COL[5:]].sum(
            axis=0).values
        H_HTR = home_record[["HTR"]].mean(axis=0)
        A_HTR = away_record["HTR"].mean(axis=0)
        # Rescale the difference into the 0..3 range.
        val = (H_HTR - A_HTR + 3) / 2 * ratio
        self.df.loc[[index], "HTR"] += np.ravel(val)

    def remove_draw(self):
        """Drop the rows of ``self.df`` whose RESULT is 1 (draws)."""
        self.df = self.df[self.df.RESULT != 1]

    def train_test_split(self):
        """Split ``self.df`` at ``train_index`` and standardise the features.

        Returns:
            ``(train, test, train_label, test_label)`` where the feature
            arrays are scaled with a scaler fitted on the training rows only
            and the labels are single-column DataFrames.
        """
        train = self.df.loc[self.df.index < train_index, ["HTR"] + self.r_COL]
        train_label = self.df.loc[self.df.index < train_index, ["RESULT"]]
        test = self.df.loc[self.df.index >= train_index, ["HTR"] + self.r_COL]
        test_label = self.df.loc[self.df.index >= train_index, ["RESULT"]]
        self.columns = train.columns
        print("------------------ trainset example ------------------\n",
              train.head(20))
        print("------------------ testset example ------------------\n",
              test.head(20))
        print("------------------ train_label counts ------------------\n",
              train_label.value_counts())
        print("------------------ test_label counts ------------------\n",
              test_label.value_counts())
        train = self.scaler.fit_transform(train)
        test = self.scaler.transform(test)

        return train, test, train_label, test_label

    def corr(self):
        """Save a correlation heatmap and a pair plot of ``self.df`` as JPGs."""
        sns.heatmap(data=self.df.corr(), annot=True, fmt=".2f")
        plt.savefig("corr.jpg")
        sns.pairplot(self.df, height=3, hue="RESULT")
        plt.savefig("pairplot.jpg")

    # ------------------ oversampling ------------------

    def oversampling(self, data, label, n=5):
        """Resample ``data`` / ``label`` with the sampler using ``n`` neighbours.

        Returns:
            ``(resampled_data, resampled_label)`` as produced by the sampler.
        """
        self.sampler.k_neighbors = n
        resampled_data, resampled_label = self.sampler.fit_resample(
            data, label)

        return resampled_data, resampled_label

    def Train(self, data, label):
        """Fit the wrapped classifier on ``data`` with flattened ``label``."""
        self.clf.fit(data, np.ravel(label))

    def prediction(self, data, label):
        """Print classification report, confusion matrix and accuracy in %."""
        truth = np.ravel(label)
        # Predict once and reuse the result for both reports.
        predicted = self.clf.predict(data)
        print(classification_report(truth, predicted))
        print(confusion_matrix(truth, predicted))
        print("{}%".format(np.round(self.clf.score(data, truth) * 100, 3)))

    # ------------------ PCA ------------------

    def D_red(self, data1, data2, n=2):
        """Project both data sets onto ``n`` principal components.

        The PCA is fitted on ``data1`` only and then applied to ``data2``.
        Prints the explained variance and its ratio.

        Returns:
            ``(pca_train, pca_test)``.
        """
        self.pca.n_components = n
        pca_train = self.pca.fit_transform(data1)
        pca_test = self.pca.transform(data2)
        print(np.round(self.pca.explained_variance_, 3))
        print(np.round(self.pca.explained_variance_ratio_, 3))

        return pca_train, pca_test

    def plot_dist(self, data, label, db=False):
        """Scatter the first two columns of ``data`` coloured by ``label``.

        Args:
            data: 2-D array; only columns 0 and 1 are plotted.
            label: object with a ``.values`` array holding the class labels.
            db: when true, also draw the classifier's 2-D decision regions.
        """
        plt.xlim(data[:, 0].min(), data[:, 0].max() + 1)
        plt.ylim(data[:, 1].min(), data[:, 1].max() + 1)
        mglearn.discrete_scatter(data[:, 0],
                                 data[:, 1],
                                 np.ravel(label.values.reshape(1, -1)).astype(
                                     np.int32),
                                 alpha=0.7)
        plt.legend(["패", "무", "승"])
        if db:
            mglearn.plots.plot_2d_classification(self.clf,
                                                 data,
                                                 fill=True,
                                                 alpha=.7)
        plt.show()
Exemple #31
0
# fit: TF-IDF document-term matrix over the raw texts
vec_final = TfidfVectorizer(max_df=.5,
                            min_df=5,
                            lowercase=False,
                            ngram_range=(1, 1))
dtm = vec_final.fit_transform(list(df['raw']))
logreg = LogisticRegression(max_iter=1000)

start_time = time.time()  # time execution for comparison with wordfish

# Every party is resampled to the size of the CDU/CSU class, except AfD,
# which is resampled to five times that size.
n_cdu_csu = len(df[df['party'] == 'CDU/CSU'])
sm = SMOTE(random_state=42,
           sampling_strategy={
               'AfD': n_cdu_csu * 5,
               'SPD': n_cdu_csu,
               'CDU/CSU': n_cdu_csu,
               'FDP': n_cdu_csu,
               'GRUENE': n_cdu_csu,
               'PDS/LINKE': n_cdu_csu
           })
X_final, y_res_pt = sm.fit_resample(dtm, df['party'])
# Binary target: AfD versus every other party
y_res = [party == 'AfD' for party in y_res_pt]
logreg.fit(X_final, y_res)

# predict: keep the probability of the positive (AfD) class for each document
pred = logreg.predict_proba(dtm)
l_pred = [proba[1] for proba in pred]

end_time = time.time()
Exemple #32
0
    def smote_oversample(self, X, y):
        """Oversample ``X`` / ``y`` with SMOTE and flatten the signal to 1-D.

        Returns:
            ``(signal, labels)`` where ``signal`` is the resampled data
            reshaped to ``(n_rows,)`` and ``labels`` the resampled targets.
        """
        sampler = SMOTE(n_jobs=2)
        resampled_signal, resampled_labels = sampler.fit_sample(X, y)

        n_rows = resampled_signal.shape[0]
        flat_signal = np.reshape(resampled_signal, (n_rows,))
        return flat_signal, resampled_labels
# +
from sklearn.model_selection import cross_val_predict

# Out-of-fold predictions of `sgd_clf` on the training data (10-fold CV)
y_train_pred = cross_val_predict(sgd_clf, X_train, y_train, cv=10)
# -

confusion_matrices(y_train, y_train_pred)

# # this is terrible! - will SMOTE help?

# +
from sklearn.model_selection import KFold
from imblearn.over_sampling import SMOTE

# Module-level SMOTE instance with default settings
smt = SMOTE()


def KFold_SMOTE_model_scores(X_df, y, model):
    # Scores `model` over 5 consecutive folds of X_df / y.
    # (Only the beginning of this function is visible here.)

    scores = []
    # NOTE(review): `random_state` has no effect when shuffle=False, and recent
    # scikit-learn releases reject this combination with a ValueError - confirm
    # against the installed version.
    cv = KFold(n_splits=5, random_state=42, shuffle=False)

    # Reset the indices so that the positional indices returned by cv.split()
    # match the labels used by .loc below.
    X_df = X_df.reset_index(drop=True)
    y = y.reset_index(drop=True)

    # Iterate over the 5 training / validation splits
    for train_index, val_index in cv.split(X_df):

        X_train = X_df.loc[train_index]
Exemple #34
0
# #### Cost Sensitive Learning, 
# ### Synthetic Data Generation looks more suitable as it will be less prone to overfitting and also there will be no loss of data

# In[73]:


# Percentage of rows in the training target whose 'churn' value is set
# (the sum of the column divided by its length); bound to both names.
imbalance_train = churn = (sum(y_train['churn'])/len(y_train['churn'].index))*100
print("Telecom train dataset Imbalance before smote: {}".format(imbalance_train))


# In[74]:


# sampling_strategy: auto which is equivalent to "not majority" ie, oversampling all the classes except the majority
# kind: regular
# NOTE(review): `kind=` and `fit_sample` belong to the legacy imbalanced-learn
# API - confirm the installed version still accepts them.
smote = SMOTE(kind = "regular")
X_train_balanced,y_train_balanced = smote.fit_sample(X_train,y_train)


# Same percentage as above, computed on the resampled target
churn_percentage = (sum(y_train_balanced)/len(y_train_balanced))*100

print("X train dataset {}".format(X_train_balanced.shape))
print("y train dataset {}".format(y_train_balanced.shape))

print("Telecom train dataset Imbalance after smote: {}".format(churn_percentage))


# In[75]:


# Show which container type the sampler returned
print(type(X_train_balanced))
    average_samples = int(mean(no_samples))
    weights = []
    for i in range(len(no_samples)):
        if no_samples[i] < average_samples:
            weights.append(average_samples)
        else:
            weights.append(no_samples[i])

    ratio_over = {
        0: weights[0],
        1: weights[1],
        2: weights[2],
        3: weights[3],
        4: weights[4]
    }
    over = SMOTE(sampling_strategy=ratio_over, random_state=314)
    X_train, y_train = over.fit_resample(X_train, y_train)

    # undersample samples > average
    ratio_under = {
        0: average_samples,
        1: average_samples,
        2: average_samples,
        3: average_samples,
        4: average_samples
    }
    under = RandomUnderSampler(sampling_strategy=ratio_under, random_state=314)
    X_train, y_train = under.fit_resample(X_train, y_train)
    cv_inner = KFold(n_splits=5, shuffle=True)
    model = KerasClassifier(build_fn=create_model, verbose=1)
Exemple #36
0
# Apply the pchurn preprocessing helpers to the test data and drop the
# columns listed as unimportant.
test_data = pchurn.chngtocat(test_data, pchurn.collist)
test_data = pchurn.removeColumns(pchurn.unimportantColumns, test_data)
test_data = pchurn.removeColumns(pchurn.unimportantColumnsfornum, test_data)
numerical_cols = pchurn.numerical_columns
# Reference frame supplying the scaling bounds
# (presumably the training data - verify against pchurn).
tdf = pchurn.copytdf
# Min-max scale each numerical test column with the bounds of the reference
# frame, not with the test data's own bounds.
for var in numerical_cols:
    minimum = min(tdf[var])
    maximum = max(tdf[var])
    test_data[var] = (test_data[var] - minimum) / (maximum - minimum)
# Keep only the columns listed in pchurn.sccollist
test_data = test_data[pchurn.sccollist]

scaled_data = pchurn.scaled_data
X_original = scaled_data.drop(['Churn'], axis=1)
# Recode the target 1/0 as 'Yes'/'No' (this writes into pchurn.scaled_data)
scaled_data['Churn'] = scaled_data['Churn'].replace([1, 0], ['Yes', 'No'])
Y_original = scaled_data['Churn']
# Oversample the scaled data with regular SMOTE
sm = SMOTE(kind='regular')
X_oversampled, y_oversampled = sm.fit_sample(X_original, Y_original)
# Test features / target as NumPy arrays, with the same 'Yes'/'No' recoding
testing_features = np.array(test_data.drop(['Churn'], axis=1))
test_data['Churn'] = test_data['Churn'].replace([1, 0], ['Yes', 'No'])
testing_target = np.array(test_data['Churn'])


def false_nagative_rate(y_actual, y_hat):
    TP = 0
    FN = 0

    for i in range(len(y_hat)):
        if y_actual[i] == y_hat[i] == 'Yes':
            TP += 1
        if y_hat[i] == 'No' and y_actual[i] != y_hat[i]:
            FN += 1
Exemple #37
0
def _build_oversampler(oversamp_method):
    """Return the sampler instance configured for *oversamp_method*.

    Supported names: 'SMOTE' (SVM variant), 'SMOTE_regular_min',
    'SMOTE_regular', 'SMOTE_border', 'SMOTEENN', 'SMOTETomek' and 'ADASYN'.

    Raises:
        ValueError: if *oversamp_method* is not one of the supported names.
    """
    # Keyword arguments shared by every plain SMOTE variant below.
    smote_common = dict(random_state=None,
                        k_neighbors=5,
                        m_neighbors=10,
                        out_step=0.5,
                        n_jobs=1)

    # 1 SMOTE
    if oversamp_method == 'SMOTE':
        # kind={'borderline1', 'borderline2', 'svm'}
        svm_model = svm.SVC(C=0.001,
                            kernel='rbf',
                            degree=3,
                            gamma='auto',
                            decision_function_shape='ovo')
        # TODO: try SMOTE with another `kind`
        return SMOTE(ratio='auto',
                     kind='svm',
                     svm_estimator=svm_model,
                     **smote_common)

    if oversamp_method == 'SMOTE_regular_min':
        return SMOTE(ratio='minority',
                     kind='regular',
                     svm_estimator=None,
                     **smote_common)

    if oversamp_method == 'SMOTE_regular':
        return SMOTE(ratio='auto',
                     kind='regular',
                     svm_estimator=None,
                     **smote_common)

    if oversamp_method == 'SMOTE_border':
        return SMOTE(ratio='auto',
                     kind='borderline1',
                     svm_estimator=None,
                     **smote_common)

    # 2 SMOTEENN
    if oversamp_method == 'SMOTEENN':
        return SMOTEENN()

    # 3 SMOTE TOMEK
    # NOTE: http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.65.3904&rep=rep1&type=pdf
    if oversamp_method == 'SMOTETomek':
        return SMOTETomek()

    # 4 ADASYN
    if oversamp_method == 'ADASYN':
        return ADASYN(ratio='auto',
                      random_state=None,
                      k=None,
                      n_neighbors=5,
                      n_jobs=cpu_threads)

    # Previously an unknown name fell through every branch and surfaced later
    # as an UnboundLocalError on `oversamp`.
    raise ValueError(
        "Unknown oversampling method: {!r}".format(oversamp_method))


def perform_oversampling(oversamp_method, tr_features, tr_labels, model_class):
    """Balance a training set with the requested oversampling method.

    Prints the chosen method, the resulting class counts and the elapsed time.

    Args:
        oversamp_method: name of the sampler (see ``_build_oversampler``).
        tr_features: training feature matrix.
        tr_labels: training labels.
        model_class: model name, used only in the progress message.

    Returns:
        ``(tr_features_balanced, tr_labels_balanced)``.

    Raises:
        ValueError: if *oversamp_method* is not a supported name.
    """
    start = time.time()
    print(model_class + " oversampling method:\t" + oversamp_method + " ...")

    oversamp = _build_oversampler(oversamp_method)
    tr_features_balanced, tr_labels_balanced = oversamp.fit_sample(
        tr_features, tr_labels)

    end = time.time()

    count = collections.Counter(tr_labels_balanced)
    print("Oversampling balance")
    print(count)
    print("Time required: " + format(end - start, '.2f') + " sec")

    return tr_features_balanced, tr_labels_balanced
Exemple #38
0
        ("num", num_pipeline, num_col),
        ("one", OneHotEncoder(), cat_col_one),
        #("ord", OneHotEncoder(), cat_col_ord),
    ])

# Fit the preprocessing pipeline on `x` and encode it in one step
features = full_pipeline.fit_transform(x)



# OK, so now we have an encoded training set...
# It is very imbalanced, so we need to correct this; otherwise we will have
# trouble finding a good classifier.
from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
# Three candidate samplers are created; only `ros` is applied below.
rus = RandomUnderSampler(random_state=0)
sm = SMOTE(ratio='auto', kind='regular')
ros = RandomOverSampler(random_state=0)

# Alternatives that were tried are kept commented out; random oversampling is
# the one currently in use.
#X_train, y_train = sm.fit_sample(features,y)
X_train, y_train = ros.fit_sample(features,y)
#X_train, y_train = rus.fit_sample(features,y)
#X_train = features
#y_train = y



# Finally, we have the resampled features and target, so we can begin modeling

# Prepare the test data: split it into features and target
xt = test.drop(target, axis=1)
y_test = test[target].copy()
Exemple #39
0
        labels.values.ravel(),
        train_size=train_size,
        shuffle=True,
        stratify=labels.values.ravel())

    # ### Impute Data
    if data_impute:
        imp = IterativeImputer(max_iter=25, random_state=1337)

        X_train = imp.fit_transform(X_train)
        X_test = imp.transform(X_test)

    # ### Augment Data
    if smote_ratio > 0:
        smote = SMOTE(sampling_strategy='all',
                      random_state=1337,
                      k_neighbors=5,
                      n_jobs=1)

        X_train, y_train = smote.fit_resample(X_train, y_train)

    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    # ## Define Model

    knn = KNeighborsClassifier(
        n_neighbors=5,
        weights='uniform',  # or distance
        p=2,
        n_jobs=8)
# Keep an untouched copy, then drop rows with missing values
df_org = df.copy()
df = df.dropna()

# Encode the location as integer category codes
df['Customer_Location'] = df['Customer_Location'].astype("category").cat.codes
X = df.drop(['Email_Status'], axis=1)
Y = df['Email_Status']

## Oversampling Minority classes
from imblearn.over_sampling import SMOTE 
# Rows with Email_Status 1 or 2 are resampled on their own ...
df_m = df.copy()
df_m = df_m[(df_m.Email_Status == 1) | (df_m.Email_Status == 2)]
X_m = df_m.drop(['Email_Status'], axis=1)
Y_m = df_m['Email_Status']
# ... while X / Y are narrowed to the Email_Status == 0 rows, which are left
# as they are.
X = X.loc[Y[Y==0].index]
Y = Y[Y==0]
# The seed is drawn at random, so the resampling differs between runs.
sm = SMOTE(random_state=np.random.randint(0, 100))
X_os_m , Y_os_m = sm .fit_resample(X_m, Y_m)
# Stack the untouched class-0 rows on top of the resampled class-1/2 rows
X_os = pd.concat([X, pd.DataFrame(X_os_m, columns= X.columns)], axis=0)
Y_os = pd.concat([Y, pd.Series(Y_os_m)], axis=0)

# NOTE(review): the split happens after resampling, so synthetic rows can end
# up in the test set - confirm this is intended.
X_train, X_test, y_train, y_test = train_test_split(X_os, Y_os, 
                                                    test_size=0.3,
                                                    random_state=71, stratify=Y_os)

def hyperopt_train_test(params):
    t = params['type']
    del params['type']
    if t == 'naive_bayes':
        clf = BernoulliNB(**params)
    elif t == 'svm':
        clf = SVC(**params)
Exemple #41
0
model_evaluation(y_test, pred)


# Let's oversample with SMOTE!

# Check the shapes of the existing X_train, y_train, X_test and y_test
print("Number transactions X_train dataset: ", X_train.shape)
print("Number transactions y_train dataset: ", y_train.shape)
print("Number transactions X_test dataset: ", X_test.shape)
print("Number transactions y_test dataset: ", y_test.shape)

from imblearn.over_sampling import SMOTE
print("Before OverSampling, counts of label '1': {}".format(sum(y_train == 1))) # number of rows in y_train whose label is 1
print("Before OverSampling, counts of label '0': {} \n".format(sum(y_train == 0))) # number of rows in y_train whose label is 0

sm = SMOTE(random_state = 42, ratio = 0.3) # SMOTE algorithm with a raised class ratio
X_train_res, y_train_res = sm.fit_sample(X_train, y_train.ravel()) # run the oversampling

print("After OverSampling, counts of label '1': {}".format(sum(y_train_res==1)))
print("After OverSampling, counts of label '0': {}".format(sum(y_train_res==0)))



print("Before OverSampling, the shape of X_train: {}".format(X_train.shape)) # data shape before applying SMOTE
print("Before OverSampling, the shape of y_train: {}".format(y_train.shape)) # data shape before applying SMOTE
print('After OverSampling, the shape of X_train: {}'.format(X_train_res.shape)) # check the result of applying SMOTE
print('After OverSampling, the shape of y_train: {}'.format(y_train_res.shape)) # check the result of applying SMOTE

lgb_dtrain2 = lgb.Dataset(data = pd.DataFrame(X_train_res), label = pd.DataFrame(y_train_res)) # wrap the training data in the form LightGBM expects
lgb_param2 = {'max_depth': 10, # 트리 깊이
            'learning_rate': 0.01, # Step Size
Exemple #42
0
def ensembleSmote(xydev):
    """Oversample a ``(features, labels)`` pair with SVM-SMOTE.

    The random seed is read from the shared constant ``'smoteSeed'``.

    Returns:
        ``(resampled_features, resampled_labels)``.
    """
    features, labels = xydev
    seed = sh.getConst('smoteSeed')
    sampler = SMOTE(kind='svm', random_state=seed)
    resampled_features, resampled_labels = sampler.fit_sample(features, labels)
    return (resampled_features, resampled_labels)
Exemple #43
0
# Preprocess
compound_x = preprocess_variables(compound_x)
# Find intersecting features
avail_columns = compound_x.columns.intersection(full_columns)
# Select features on subset
x_data = compound_x.loc[:, avail_columns]
y_data = compound_y.copy()
# Create binary variable: 1 where the target is <= 10, else 0
y_class = np.squeeze([int(y_val <= 10) for y_val in y_data])

# Smote
from custom_pipe_helper import SMOTER

import auto

smote = SMOTE()

# Fit and resample in a single call.  The previous sequence called
# `smote.fit_sample()` without arguments, which raises a TypeError before the
# resampled data is ever produced.
check = smote.fit_sample(x_data, y_class)

# Inspect the resampled features' shape and the resampled labels
check[0].shape
check[1]

# Create folds
# For each fold
# SMOTE the train data
# Train model
# Evaluate model

from sklearn.ensemble import AdaBoostClassifier
Exemple #44
0
    'num', numeric_transformer,
    numeric_features), (
        'cat', categorical_transformer,
        categorical_features), ('scaler', scaling_transformer,
                                numeric_features)])

# Boosting classifier
xgb_clf = xgb.XGBClassifier(objective="binary:logistic",
                            learning_rate=0.01,
                            n_estimators=500,
                            max_depth=1,
                            subsample=0.4,
                            random_state=42)

# Combine preprocessing with classifier
latePaymentsModel = make_pipeline(preprocess, SMOTE(random_state=42), xgb_clf)

# Fit the pipeline to the training data (fit is for both the preprocessing and the classifier)
print("\nTraining model ...")
latePaymentsModel.fit(X_train, y_train)

# Save the trained model as a pickle file.  The context manager guarantees
# the handle is flushed and closed even if pickling raises.
print("\nSaving model ...")
with open('public/latePaymentsModel.pkl', 'wb') as model_file:
    pickle.dump(latePaymentsModel, model_file)

# Load the pickled model back (this file was written just above; never
# unpickle files from untrusted sources).
print("\nLoading saved model to make example predictions...")
with open('public/latePaymentsModel.pkl', 'rb') as model_file:
    pickledModel = pickle.load(model_file)
                                             y=None,
                                             kind='pie',
                                             ax=axs[1],
                                             autopct='%1.2f%%')  # 饼图
axs[1].set_title("Percentage of each TARGET")
plt.show()

# In[]:
# 1.7 Oversampling:
import imblearn
from imblearn.over_sampling import SMOTE

# Features are every column from position 1 onward; the target is the
# "SeriousDlqin2yrs" column.
X_temp = data.iloc[:, 1:]
y_temp = data["SeriousDlqin2yrs"]  # original author's note: y = data.iloc[:,0]

sm = SMOTE(random_state=42)  # instantiate the sampler with a fixed seed
# NOTE(review): resampling is applied to the whole dataset here; if the
# train/test split happens afterwards, synthetic samples derived from test
# rows may end up in the training set -- confirm this is intended.
X, y = sm.fit_sample(X_temp, y_temp)

n_sample_ = X.shape[0]  # 278584
# Bare expression: only displays something when run interactively.
pd.Series(y).value_counts()
# Per-class sample counts after resampling.
n_1_sample = pd.Series(y).value_counts()[1]
n_0_sample = pd.Series(y).value_counts()[0]
print('样本个数:{}; 1占{:.2%}; 0占{:.2%}'.format(n_sample_, n_1_sample / n_sample_,
                                           n_0_sample / n_sample_))
# Example output: 样本个数:278584; 1占50.00%; 0占50.00%  (sample count; share of class 1; share of class 0)

# In[]:
# After oversampling, split into training and test sets; save the preprocessing results
from sklearn.model_selection import train_test_split
X = pd.DataFrame(X)
y = pd.DataFrame(y)
Exemple #46
0
plt.xlim([0.0,1.0])
plt.ylim([0.0,1.05])

plt.xlabel('False Positive Rate')
plt.ylabel('True Positive rate')

plt.title('ROC Curve GradientBoosting Un-Balanced Data')
plt.legend(loc="lower right")

plt.show()



# The target is unbalanced, so build a balanced dataset with SMOTE.
# Only the 'svm' variant is used; the list/loop form leaves X_res, y_res
# holding the output of the last (here: only) sampler.
kind = ['svm'];
sm = [SMOTE(kind=k) for k in kind]
for method in sm:
    X_res, y_res = method.fit_sample(emp_mod, y)
    
# NOTE(review): the split is taken from the already-resampled data, so the
# test fold contains synthetic samples and training samples may have been
# synthesized from rows that end up in the test fold -- confirm intended.
X_train, X_test, y_train, y_test = train_test_split(X_res, y_res, test_size=.2,
                                                    random_state=0)

# Logistic regression fitted on the balanced (SMOTE-resampled) training data.
regr=skl_lm.LogisticRegression()
regr.fit(X_train,y_train)
pred=regr.predict(X_test)
# Validation of the logistic model.
# NOTE(review): roc_curve is given hard class predictions from predict(),
# not scores such as predict_proba(X_test)[:, 1], so the resulting curve has
# a single operating point -- confirm this is what is wanted.
fpr,tpr,_=roc_curve(y_test,pred)

from sklearn.metrics import auc
                 self.precisionMaj, self.precisionMin, self.recallMaj,
                 self.recallMin, self.f1Maj, self.f1Min, self.gMaj, self.gMin,
                 self.mcc, self.aucRes, int(round(self.tp)), int(round(
                     self.tn)), int(round(self.fp)), int(round(self.fn)),
                 str(self.clf), dictDesbalanceamento['imbLevel']))
            conn.close()


if __name__ == '__main__':
    # exemplo
    # defino a lista de datasets
    dataSets = ["pima.csv"]
    # a lista com as tecnicas de amostragem
    tecnicasAmostragem = [
        None,
        SMOTE(kind="regular", ratio=1.0),
        SMOTETomek(ratio=1.0),
        SMOTE(kind="borderline1", ratio=1.0),
        SMOTE(kind="borderline2", ratio=1.0)
    ]
    clfs = [
        svm.LinearSVC(),
        BernoulliNB(),
        tree.DecisionTreeClassifier(criterion="entropy", max_depth=7)
    ]

    for dataSet in dataSets:
        cv = CrossValidationStratified(dataset=dataSet, verbose=True)
        cv.splitClasses()
        # pego os dados de desbalanceamento para o sql
        dictDesbalanceamento = cv.getImbalanceLevel()
Exemple #48
0
                                                    random_state=8)

# Defining pipelines along with relative parameters to be used in GridSearchCV

# #### Pipe 1 ---> StandardScaler + PCA

# In[11]:


# Pipe 1: Pearson-based selection -> scaling -> binary encoding -> column
# drop -> SMOTE -> PCA -> XGBoost.  Line continuation relies on the open
# parenthesis, so no trailing backslashes are needed.
pipe_1 = make_pipeline(
    PearsonSelector(),
    StandardScalerCust(),
    BinaryEncoder(
        selected_columns=['international_plan', 'voice_mail_plan']),
    DropColumns(['state', 'area_code']),
    SMOTE(),
    PCA(),
    xgb.XGBClassifier(n_jobs=-1),
)

# Parameter grid for pipe_1; keys follow the "<step name>__<parameter>"
# pattern with lower-cased class names as step names.
params_1 = [
    {
        'pearsonselector__limit': [0.2, 0.4],
        'smote__k_neighbors': [3, 5],
        'pca__n_components': [2, 3],
        'xgbclassifier__n_estimators': [1000],
    },
]

# #### Pipe 2 ---> MinMaxScaler + RFE

# In[12]:

Exemple #49
0
def _roc_for_classifier(clf, x_fit, y_fit, x_eval, y_eval):
    """Fit ``clf`` and return ``(fpr, tpr, auc)`` on the evaluation data.

    Scores are taken from column 1 of ``clf.predict_proba(x_eval)``.
    """
    clf.fit(x_fit, y_fit)
    scores = clf.predict_proba(x_eval)[:, 1]
    auc_value = roc_auc_score(y_eval, scores)
    fpr, tpr, _ = roc_curve(y_eval, scores)
    return fpr, tpr, auc_value


def fraud_detection(data):
    """Compare classifiers trained with and without SMOTE and plot their ROC.

    The frame is split 65/35 into train/test, the training part is
    oversampled with SMOTE, and a random forest, a 3-nearest-neighbours
    classifier and a decision tree are each fitted twice (on the SMOTE data
    and on the raw training data).  All six ROC curves are drawn on one
    matplotlib figure, which is shown with ``plt.show()``.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns 'cardverificationcodesupplied',
        'cvcresponsecode' (both dropped) and 'simple_journal' (the target).

    Returns
    -------
    None

    Notes
    -----
    ``train_test_split``, ``SMOTE`` and the decision trees are not seeded,
    so the numbers differ between runs.
    """
    # ``axis`` is passed by keyword: the positional form ``drop(label, 1)``
    # is rejected by pandas >= 2.0.
    data = data.drop(['cardverificationcodesupplied', 'cvcresponsecode'],
                     axis=1)
    y = data['simple_journal']
    X = data.drop('simple_journal', axis=1)

    x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.35)
    # Only the training split is oversampled; the test split stays untouched.
    sm = SMOTE(ratio=1)
    x_train_sm, y_train_sm = sm.fit_sample(x_train, y_train)

    print("\nBefore SMOTE, counts of label '1': {}".format(sum(y_train == 1)))
    print("Before SMOTE, counts of label '0': {} \n".format(sum(y_train == 0)))
    print("\nAfter SMOTE, counts of label '1': {}".format(sum(y_train_sm == 1)))
    print("After SMOTE, counts of label '0': {} \n".format(sum(y_train_sm == 0)))
    print("\nTest data, counts of label '1': {}".format(sum(y_test == 1)))
    print("Test data, counts of label '0': {} \n".format(sum(y_test == 0)))

    # (plot label, factory) for each model family.  Factories are used so
    # every fit gets a fresh estimator, exactly as in the hand-unrolled
    # version: (1) random forest and (2) kNN as black boxes, (3) decision
    # tree as white box.
    model_factories = [
        ('Random Forest',
         lambda: RandomForestClassifier(n_estimators=25, random_state=12)),
        ('kN', lambda: KNeighborsClassifier(n_neighbors=3)),
        ('Decision Tree', lambda: tree.DecisionTreeClassifier()),
    ]
    # For each family: first the SMOTE-trained model, then the plain one.
    training_sets = [
        (' SMOTE', x_train_sm, y_train_sm),
        ('', x_train, y_train),
    ]

    curves = []
    for model_name, make_clf in model_factories:
        for suffix, x_fit, y_fit in training_sets:
            fpr, tpr, auc_value = _roc_for_classifier(
                make_clf(), x_fit, y_fit, x_test, y_test)
            label = '%s%s (area= %0.2f)' % (model_name, suffix, auc_value)
            curves.append((fpr, tpr, label))

    # Plot ROC curves (chance diagonal first, then the six models).
    plt.plot([0, 1], [0, 1], linestyle='--')
    for fpr, tpr, label in curves:
        plt.plot(fpr, tpr, marker='.', label=label)
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver Operating Characteristic (ROC)')
    plt.legend(loc='lower right')
    plt.show()
    'job', 'marital', 'education', 'default', 'housing', 'loan', 'contact',
    'month', 'day_of_week', 'poutcome'
]
data_vars = df.columns.values.tolist()
to_keep = [i for i in data_vars if i not in cat_vars]

print(to_keep)
data_final = df[to_keep]
print(data_final.columns.values)

x = data_final.loc[:, data_final.columns != 'y']
y = data_final.loc[:, data_final.columns == 'y']

from imblearn.over_sampling import SMOTE

os = SMOTE(random_state=0)
x_train, x_test, y_train, y_test = train_test_split(x,
                                                    y,
                                                    test_size=0.3,
                                                    random_state=0)
columns = x_train.columns

os_data_x, os_data_y = os.fit_sample(x_train, y_train)
os_data_x = pd.DataFrame(data=os_data_x, columns=columns)
os_data_y = pd.DataFrame(data=os_data_y, columns=['y'])
# we can Check the numbers of our data
print("length of oversampled data is ", len(os_data_x))
print("Number of no subscription in oversampled data",
      len(os_data_y[os_data_y['y'] == 0]))
print("Number of subscription", len(os_data_y[os_data_y['y'] == 1]))
print("Proportion of no subscription data in oversampled data is ",
Exemple #51
0
# calculate the accuracy score; arguments follow the (y_true, y_pred) order
# used by the other metric calls in this script
score = accuracy_score(y_test, y_pred)

# calculate the precision
precision = precision_score(y_test, y_pred)

# display 'score' and 'precision'
print('Accuracy:', score)
print('Precision:', precision)

# --------------
# import packages
from imblearn.over_sampling import SMOTE

# Instantiate smote
smote = SMOTE(random_state=9)

# fit_sample on training data
X_train, y_train = smote.fit_sample(X_train, y_train)

# fit model on training data
rf.fit(X_train, y_train)

# predict on test data
y_pred = rf.predict(X_test)

# calculate the accuracy score
score = accuracy_score(y_test, y_pred)

# calculate the precision
precision = precision_score(y_test, y_pred)
Exemple #52
0
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from imblearn.over_sampling import SMOTE
from rd import ReadData
import numpy as np
from goGoogle import goGoogle
import time

# For each of the 30 data splits: compare a random forest trained on the raw
# training data with one trained on SMOTE-resampled data, then report both
# accuracies through goGoogle.
for i in range(30):
    train_data, test_data, train_label, test_label = ReadData(i)
    Train_data = np.asarray(train_data).astype(np.float64)
    Test_data = np.asarray(test_data).astype(np.float64)
    Train_label = np.asarray(train_label).astype(np.float64)
    Test_label = np.asarray(test_label).astype(np.float64)

    # Oversample the training data only; the test data is left untouched.
    sm = SMOTE(random_state=42)
    New_Data, New_Label = sm.fit_resample(Train_data, Train_label)

    rfc = RandomForestClassifier(n_estimators=100)

    # Baseline: score once and reuse the value for printing and reporting
    # (previously the test set was scored twice per model).
    rfc.fit(Train_data, Train_label)
    ac = rfc.score(Test_data, Test_label)
    print('RFC_Acc:', ac)

    # Same estimator refitted on the resampled data.
    rfc.fit(New_Data, New_Label)
    nac = rfc.score(Test_data, Test_label)
    print('New_Rfc_Acc:', nac)

    goGoogle(i, 1, ac, nac, 'rfc')
    time.sleep(2)
Exemple #53
0
def train(datasetFilename, classFilename):
    """Train an LSTM text classifier on SMOTE-oversampled data.

    Reads the texts and labels, keeps a random permutation of the first 400
    samples, tokenizes and pads them, splits off the last
    ``VALIDATION_SPLIT`` fraction for validation, oversamples with
    borderline-1 SMOTE, trains a frozen-embedding LSTM and evaluates it on
    the validation data.

    Relies on the module-level names ``MAX_NB_WORDS``,
    ``MAX_SEQUENCE_LENGTH``, ``VALIDATION_SPLIT``, ``EMBEDDING_DIM`` and
    ``embeddings_index``.

    Parameters
    ----------
    datasetFilename : str
        Path of a text file whose samples are separated by ``#SEPARATOR#``.
    classFilename : str
        Path of a text file with one label per line.

    Returns
    -------
    str
        One line: the two confusion-matrix rows followed by precision,
        recall, F1, accuracy, ROC AUC, MAE, MSE and R2, comma separated and
        terminated by a newline.

    Side effects
    ------------
    Writes ``model.json`` and ``model.h5`` to the current directory and
    prints progress information.
    """
    # second, prepare text samples and their labels
    print('Processing text dataset')
    # Context managers so the input files are closed deterministically.
    with open(datasetFilename) as dataset_file:
        # Split the status dataset into one entry per user.
        texts = dataset_file.read().split('#SEPARATOR#')
    with open(classFilename) as class_file:
        labels = class_file.read().split('\n')
    print('Found %s texts.' % len(texts))

    # Keep a random permutation of the first 400 samples (texts and labels
    # are shuffled with the same permutation).
    indices = np.arange(400)
    np.random.shuffle(indices)
    texts = [texts[idx] for idx in indices]
    labels = [labels[idx] for idx in indices]

    # finally, vectorize the text samples into a 2D integer tensor
    tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
    tokenizer.fit_on_texts(texts)
    sequences = tokenizer.texts_to_sequences(texts)

    word_index = tokenizer.word_index
    print('Found %s unique tokens.' % len(word_index))

    data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)

    # Split features and labels at the same boundary.  The labels used to be
    # cut with hard-coded 0.8 / 0.2 fractions, which only lined up with the
    # feature split when VALIDATION_SPLIT happened to be 0.2.
    num_validation_samples = int(VALIDATION_SPLIT * data.shape[0])
    x_train = data[:-num_validation_samples]
    x_test = data[-num_validation_samples:]
    y_train = np.asarray(labels[:-num_validation_samples])
    y_test = np.asarray(labels[-num_validation_samples:])

    # Start of Oversampling ###################################################
    # Other available kinds: 'regular', 'borderline2', 'svm'.
    sampler = SMOTE(kind='borderline1')
    x_train_resampled, y_train_resampled = sampler.fit_sample(
        x_train, y_train)
    # NOTE(review): the validation split is oversampled as well, so the
    # reported metrics are computed partly on synthetic samples -- confirm
    # this is intended.
    x_test_resampled, y_test_resampled = sampler.fit_sample(x_test, y_test)

    print("x_train= " + str(x_train.shape) + " x_test= " + str(x_test.shape))
    print("y_train= " + str(y_train.shape) + " y_test= " + str(y_test.shape))
    print("x_train_resampled= " + str(x_train_resampled.shape) +
          " x_test_resampled= " + str(x_test_resampled.shape))
    # Report the label shape here (this line used to print the feature
    # shape under the y_test_resampled caption).
    print("y_train_resampled= " + str(y_train_resampled.shape) +
          " y_test_resampled= " + str(y_test_resampled.shape))
    x_train = x_train_resampled
    x_test = x_test_resampled
    y_train = to_categorical(np.asarray(y_train_resampled.tolist()))
    y_test = to_categorical(np.asarray(y_test_resampled.tolist()))
    # End of Oversampling #####################################################

    print('Shape of data tensor:', x_train.shape + x_test.shape)
    print('Shape of label tensor:', y_train.shape + y_test.shape)

    print('Preparing embedding matrix.')
    # prepare embedding matrix
    # Tokenizer indices start at 1, so the matrix needs len(word_index) + 1
    # rows; without the + 1 the highest word index fell outside the matrix
    # whenever the vocabulary was smaller than MAX_NB_WORDS.
    num_words = min(MAX_NB_WORDS, len(word_index) + 1)
    embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))
    for word, i in word_index.items():
        if i >= num_words:
            continue
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            # words not found in embedding index will be all-zeros.
            embedding_matrix[i] = embedding_vector

    # load pre-trained word embeddings into an Embedding layer
    # note that we set trainable = False so as to keep the embeddings fixed
    embedding_layer = Embedding(num_words,
                                EMBEDDING_DIM,
                                weights=[embedding_matrix],
                                input_length=MAX_SEQUENCE_LENGTH,
                                trainable=False)

    print('Training model.')
    # LSTM
    model = Sequential()
    model.add(embedding_layer)
    model.add(LSTM(32, dropout=0.2, recurrent_dropout=0.2))
    model.add(Dense(2, activation='softmax'))
    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy', 'mse', 'mae'])

    model.fit(x_train,
              y_train,
              batch_size=32,
              epochs=10,
              validation_data=(x_test, y_test),
              verbose=2)

    model_json = model.to_json()
    with open("model.json", "w") as json_file:
        json_file.write(model_json)
    # serialize weights to HDF5
    model.save_weights("model.h5")
    print("Saved model to disk")

    # Evaluate Model
    confusion = np.array([[0, 0], [0, 0]])

    predictions = model.predict(x_test)
    # Column 1 of the softmax output / one-hot label, rounded to 0 or 1.
    predictedRound = [round(x[1]) for x in predictions]
    tested = [round(x[1]) for x in y_test]

    confusion += confusion_matrix(tested, predictedRound)
    precisionScore = precision_score(tested, predictedRound,
                                     pos_label=1.)  # pos_label= (1.=yes, 0.=no)
    recallScore = recall_score(tested, predictedRound, pos_label=1.)
    accuracyScore = accuracy_score(tested, predictedRound)
    f1Score = f1_score(tested, predictedRound, pos_label=1.)
    rocAucScore = roc_auc_score(tested, predictedRound)
    maeScore = mean_absolute_error(tested, predictedRound)
    mseScore = mean_squared_error(tested, predictedRound)
    r2Score = r2_score(tested, predictedRound)

    # Result line: "<row0><row1>,precision,recall,f1,accuracy,auc,mae,mse,r2\n"
    metrics = (precisionScore, recallScore, f1Score, accuracyScore,
               rocAucScore, maeScore, mseScore, r2Score)
    trainResult = str(confusion[0]) + str(confusion[1]) + ','
    trainResult += ','.join(str(metric) for metric in metrics) + '\n'
    print("Trained successfully\n")

    return trainResult
            vectorizer = CountVectorizer(binary=True, ngram_range=(1, 2))
        elif 'unibitri_gram' in experiment:
            vectorizer = CountVectorizer(binary=True, ngram_range=(1, 3))
        else:
            vectorizer = CountVectorizer(binary=True)

    vectorizer.fit(texts_train)
    X_train = vectorizer.transform(texts_train)
    X_test = vectorizer.transform(texts_test)
    X = vectorizer.transform(pro_texts)

    # In[35]:

    if 'smote' in experiment:
        #oversampling with SMOTE
        sm = SMOTE(random_state=42)
        X_train, y_train = sm.fit_resample(X_train, y_train)
    elif 'undersampling' in experiment:
        rus = RandomUnderSampler(random_state=42)
        X_train, y_train = rus.fit_resample(X_train, y_train)
    elif 'random_oversampling' in experiment:
        ros = RandomOverSampler(random_state=42)
        X_train, y_train = ros.fit_resample(X_train, y_train)

    X_train.shape

    # In[36]:

    vocab_size = X_train.shape[1]
    vocab_size
Exemple #55
0
# extract the dependent (target) variable

Y = X['isFraud']

# remove the target column so X holds only the independent variables
X = X.drop(['isFraud'], axis=1)

# getting the shapes of x and y
print("Shape of x: ", X.shape)
print("Shape of y: ", Y.shape)

print(X.head())

from imblearn.over_sampling import SMOTE
# Oversample with a default-configured SMOTE (no random_state is passed).
# NOTE(review): resampling happens before the train/test split below, so the
# test set contains synthetic samples and training samples may be derived
# from rows that land in the test set -- confirm this is intended.
x_resample, y_resample = SMOTE().fit_sample(X, Y.values.ravel())

# getting the shapes of x and y after resampling
print("Shape of x: ", x_resample.shape)
print("Shape of y:", y_resample.shape)

# splitting the resampled dataset into train and test sets (80/20, fixed seed)

from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(x_resample,
                                                    y_resample,
                                                    test_size=0.2,
                                                    random_state=0)

# checking the new shapes
X, Y = import_training_data()
#X, Y = data_cleaning(X, Y, 5)
X = check_collinearity(X)
X, y = reorganize_data(X, Y)
y = y.ix[:, 0]
names = list(X)
print X.shape, y.shape

# split the training data into training and test set
X_training, X_test, y_training, y_val = train_test_split(X,
                                                         y,
                                                         train_size=0.75,
                                                         random_state=0)

# correct the skewness
oversampler = SMOTE(random_state=0)
X_training, y_training = oversampler.fit_sample(X_training, y_training)

# random forest parameter
params_rf = {
    'n_jobs': 1,
    'n_estimators': 1600,
    'warm_start': True,
    'max_features': 0.3,
    'max_depth': 9,
    'min_samples_leaf': 2,
    'random_state': 0,
    'verbose': 0
}

# random forest
from imblearn.over_sampling import SMOTE

# Generate the dataset
X, y = make_classification(n_classes=2, class_sep=2, weights=[0.1, 0.9],
                           n_informative=3, n_redundant=1, flip_y=0,
                           n_features=20, n_clusters_per_class=1,
                           n_samples=5000, random_state=10)

# Instantiate a PCA object for the sake of easy visualisation
pca = PCA(n_components=2)
# Fit and transform x to visualise inside a 2D feature space
X_vis = pca.fit_transform(X)

# Apply Borderline SMOTE 1
sm = SMOTE(kind='borderline1')
X_resampled, y_resampled = sm.fit_sample(X, y)
X_res_vis = pca.transform(X_resampled)

# Two subplots, unpack the axes array immediately
f, (ax1, ax2) = plt.subplots(1, 2)

ax1.scatter(X_vis[y == 0, 0], X_vis[y == 0, 1], label="Class #0", alpha=0.5,
            edgecolor=almost_black, facecolor=palette[0], linewidth=0.15)
ax1.scatter(X_vis[y == 1, 0], X_vis[y == 1, 1], label="Class #1", alpha=0.5,
            edgecolor=almost_black, facecolor=palette[2], linewidth=0.15)
ax1.set_title('Original set')

ax2.scatter(X_res_vis[y_resampled == 0, 0], X_res_vis[y_resampled == 0, 1],
            label="Class #0", alpha=.5, edgecolor=almost_black,
            facecolor=palette[0], linewidth=0.15)
Exemple #58
0
    k_best = SelectKBest(score_func=score_func, k=10).fit(X, y)

    idxs = k_best.get_support(indices=True)
    X = X.iloc[:,idxs]
    return X


"""
testar KBest
"""
X, y = split_dataset(super_table, CLASS)
X = getKBest(X, y)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

sm = SMOTE(random_state=2)

X_train_res, y_train_res = sm.fit_sample(X_train, y_train.ravel())

results = {}

for clf in base_clfs:
    clf_name = type(clf).__name__
    stats = classifier_statistics(clf, X_train_res, X_test, y_train_res, y_test)
    results[clf_name] = stats

measures = {}
i = 0
for clf in results:
    clf_res = results[clf]
    measures[i] = {'Classifier': clf, 'Measure': 'Accuracy', 'Value': clf_res['accuracy']}
feature_scaler = StandardScaler()
X_scaled = feature_scaler.fit_transform(X)

# Dividing dataset into training and test sets
X_train, X_test, Y_train, Y_test = train_test_split(X_scaled,
                                                    Y,
                                                    test_size=0.3,
                                                    random_state=100)
print(X_train.shape)
print(X_test.shape)

# Implementing Oversampling to balance the dataset; SMOTE stands for Synthetic Minority Oversampling TEchnique
print(
    "Number of observations in each class before oversampling (training data): \n",
    pd.Series(Y_train).value_counts())
smote = SMOTE(random_state=101)
X_train, Y_train = smote.fit_sample(X_train, Y_train)

print(
    "Number of observations in each class after oversampling (training data): \n",
    pd.Series(Y_train).value_counts())

rfc = RandomForestClassifier(criterion='entropy',
                             max_features='auto',
                             random_state=1)
grid_param = {'n_estimators': [50, 100, 150, 200, 250, 300]}

gd_sr = GridSearchCV(estimator=rfc,
                     param_grid=grid_param,
                     scoring='precision',
                     cv=5)
Exemple #60
0
# Use the imbalanced-learn library for under-sampling and over-sampling.
# Note that many models can instead adjust the weight of the samples of each
# class through their class_weight parameter.
from imblearn.over_sampling import SMOTE,RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler

# Random under-sampling: when one class has clearly more samples than
# another, under-sampling can reduce the number of majority-class samples.
rus=RandomUnderSampler({0:4000,1:3000})# classes 0 and 1 are the majority classes; 4000 / 3000 are the target sample counts and can be tuned as needed. Classes not listed in the dict are left unchanged
x,y=rus.fit_resample(x,y)
# x, y now hold the under-sampled data.
# If too few samples are kept, the model may under-fit.


# Over-sampling
# 1. Random over-sampling: when a class has clearly too few samples, use this
#    to over-sample it. In essence it randomly re-uses minority-class samples.
ros=RandomOverSampler({2:3000,3:4000})# same dict convention as above
x,y=ros.fit_resample(x,y)

# 2. Synthetic Minority Over-sampling Technique (SMOTE). The original author
#    notes having read a paper that used it and found it decent.
smote=SMOTE(kind='regular',k_neighbors=5,ratio={2:3000,3:4000})
# The bare string that follows holds the original parameter notes. In English:
#   kind:          how samples are synthesized ('regular', 'borderline1',
#                  'borderline2' or 'svm'; default 'regular')
#   k_neighbors:   number of neighbouring samples used for synthesis
#                  (int, default 5)
#   ratio:         number or proportion of samples to synthesize
#   svm_estimator: only needed when kind='svm'; pass a scikit-learn model
'''
parameters:
kind: ('regular', 'borderline1', 'borderline2' or  'svm',default='regular') 样本合成的方式,不懂就百度SMOTE,有真相
k_neighbors: (int,default=5) 样本合成时使用的邻近样本数
ratio: 样本合成的数量或比例
svm_estimator: kind=svm时才需要设置,传入一个sklearn的模型就行
'''
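# If the over-sampling ratio is too large it may cause over-fitting, i.e. treating chance occurrences in the minority class as general rules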