コード例 #1
0
def gradient_boosting():
    """Cross-validate a gradient-boosting classifier on the module-level X, y.

    The data is split with a 10-fold stratified, shuffled split. In every
    fold the training part is passed through ``EGSmote().fit_resample``
    before the classifier is fitted; the held-out part is left untouched and
    scored with ``evaluate2``.

    Returns:
        list: ``["GBC", m0, m1, m2]`` where ``m0``, ``m1`` and ``m2`` are the
        means over all folds of columns 0, 1 and 2 of the ``evaluate2``
        results (presumably F-score, G-mean and AUC, judging by how the
        callers name them -- confirm against ``evaluate2``).
    """
    # random_state only has an effect when shuffling is enabled; current
    # scikit-learn raises ValueError if it is set while shuffle is False.
    # The original passed True as the seed, which is the integer 1.
    kfold = model_selection.StratifiedKFold(n_splits=10,
                                            shuffle=True,
                                            random_state=1)

    fold_scores = []
    for train_index, test_index in kfold.split(X, y):
        X_t, X_test = X[train_index], X[test_index]
        y_t, y_test = y[train_index], y[test_index]

        # Resample the training fold only, so the validation fold keeps the
        # original samples.
        X_train, y_train = EGSmote().fit_resample(X_t, y_t)

        gbc = GradientBoostingClassifier(n_estimators=100,
                                         learning_rate=0.01,
                                         max_depth=3)
        gbc.fit(X_train, y_train)

        # The thresholding is kept from the original so that evaluate2
        # always receives 0/1 labels.
        y_predict = gbc.predict(X_test)
        y_pred = np.where(y_predict > 0.5, 1, 0)

        fold_scores.append(evaluate2(y_test, y_pred))

    fold_scores = np.asarray(fold_scores)
    return [
        "GBC",
        fold_scores[:, 0].mean(),
        fold_scores[:, 1].mean(),
        fold_scores[:, 2].mean(),
    ]
コード例 #2
0
def logistic_training():
    """Cross-validate a logistic-regression classifier on the module-level X, y.

    The data is split with a 10-fold stratified, shuffled split. In every
    fold the training part is passed through ``EGSmote().fit_resample``
    before the model is fitted; the held-out part is left untouched and
    scored with ``evaluate2``.

    Returns:
        list: ``["LR", m0, m1, m2]`` where ``m0``, ``m1`` and ``m2`` are the
        means over all folds of columns 0, 1 and 2 of the ``evaluate2``
        results (presumably F-score, G-mean and AUC, judging by how the
        callers name them -- confirm against ``evaluate2``).
    """
    # random_state only has an effect when shuffling is enabled; current
    # scikit-learn raises ValueError if it is set while shuffle is False.
    # The original passed True as the seed, which is the integer 1.
    kfold = model_selection.StratifiedKFold(n_splits=10,
                                            shuffle=True,
                                            random_state=1)

    fold_scores = []
    for train_index, test_index in kfold.split(X, y):
        X_t, X_test = X[train_index], X[test_index]
        y_t, y_test = y[train_index], y[test_index]

        # Resample the training fold only, so the validation fold keeps the
        # original samples.
        X_train, y_train = EGSmote().fit_resample(X_t, y_t)

        classifier = LogisticRegression(max_iter=120)
        classifier.fit(X_train, y_train)

        # The thresholding is kept from the original so that evaluate2
        # always receives 0/1 labels.
        y_predict = classifier.predict(X_test)
        y_pred = np.where(y_predict > 0.5, 1, 0)

        fold_scores.append(evaluate2(y_test, y_pred))

    fold_scores = np.asarray(fold_scores)
    return [
        "LR",
        fold_scores[:, 0].mean(),
        fold_scores[:, 1].mean(),
        fold_scores[:, 2].mean(),
    ]
コード例 #3
0
def KNN():
    """Cross-validate a k-nearest-neighbours classifier on the module-level X, y.

    The data is split with a 10-fold stratified, shuffled split. In every
    fold the training part is passed through ``EGSmote().fit_resample``
    before a 5-neighbour classifier (Minkowski metric with ``p=2``) is
    fitted; the held-out part is left untouched and scored with
    ``evaluate2``.

    Returns:
        list: ``["KNN", m0, m1, m2]`` where ``m0``, ``m1`` and ``m2`` are the
        means over all folds of columns 0, 1 and 2 of the ``evaluate2``
        results (presumably F-score, G-mean and AUC, judging by how the
        callers name them -- confirm against ``evaluate2``).
    """
    # random_state only has an effect when shuffling is enabled; current
    # scikit-learn raises ValueError if it is set while shuffle is False.
    # The original passed True as the seed, which is the integer 1.
    kfold = model_selection.StratifiedKFold(n_splits=10,
                                            shuffle=True,
                                            random_state=1)

    fold_scores = []
    for train_index, test_index in kfold.split(X, y):
        X_t, X_test = X[train_index], X[test_index]
        y_t, y_test = y[train_index], y[test_index]

        # Resample the training fold only, so the validation fold keeps the
        # original samples.
        X_train, y_train = EGSmote().fit_resample(X_t, y_t)

        classifier = KNeighborsClassifier(n_neighbors=5,
                                          metric='minkowski',
                                          p=2)
        classifier.fit(X_train, y_train)

        # Predicted labels are scored as returned by the classifier.
        y_pred = classifier.predict(X_test)

        fold_scores.append(evaluate2(y_test, y_pred))

    fold_scores = np.asarray(fold_scores)
    return [
        "KNN",
        fold_scores[:, 0].mean(),
        fold_scores[:, 1].mean(),
        fold_scores[:, 2].mean(),
    ]
コード例 #4
0
    def parse_input_zoo_data(filename, header='infer'):
        """Load a CSV, hold out a test split and oversample the rest with SMOTE.

        The last column of the file is used as the target and all other
        columns as features. 20 % of the rows are held out (seeded split);
        only the remaining rows are oversampled.

        Args:
            filename: Path of the CSV file to read.
            header: Forwarded to ``pandas.read_csv``. The default ``'infer'``
                is also pandas' own default.

        Returns:
            tuple: ``(input_database, labels, classes, X_test, y_test)``.
            ``input_database`` is ``{0: X_train}`` with the oversampled
            training features; ``labels`` and ``classes`` are two separate
            lists holding the oversampled training targets; ``X_test`` and
            ``y_test`` are the untouched hold-out split.
        """
        # The header argument used to be accepted but silently ignored.
        df = pd.read_csv(filename, header=header)
        X = df.iloc[:, :-1].values
        y = df.iloc[:, -1].values

        X_t, X_test, y_t, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.2,
                                                    random_state=0)

        # fit_sample is no longer available in current imbalanced-learn
        # releases; fit_resample is the supported spelling of the same call.
        X_train, y_train = SMOTE().fit_resample(X_t, y_t)

        # Two independent lists, so a caller mutating one does not affect
        # the other.
        classes = y_train.tolist()
        labels = y_train.tolist()
        input_database = {0: X_train}

        return input_database, labels, classes, X_test, y_test
    # Visual separator in the console output before the evaluation starts.
    print("---------------------------------------------------------")

    # 5-fold stratified split with shuffling and a fixed seed.
    kfold = model_selection.StratifiedKFold(n_splits=5,
                                            random_state=True,
                                            shuffle=True)
    # One entry per fold; each entry is that fold's list of classifier results.
    scorings = []
    # 1-based fold counter, used only for the progress message below.
    # NOTE(review): this name shadows the builtin iter() inside this scope.
    iter = 0
    for train_index, test_index in kfold.split(X, y):
        iter = iter + 1
        print("Itertion: " + str(iter) + " =>   Processing")
        # Slice out this fold's training and validation parts.
        X_t, X_test = X[train_index], X[test_index]
        y_t, y_test = y[train_index], y[test_index]
        #
        # Alternative samplers that can be swapped in for comparison:
        # GSMOTE = SMOTE()
        # GSMOTE = OldGeometricSMOTE()
        GSMOTE = EGSmote()
        # Only the training part is resampled; the validation part is left
        # as it is.
        X_train, y_train = GSMOTE.fit_resample(X_t, y_t)
        # Baseline without resampling:
        # X_train,y_train = X_t,y_t
        fold_score = []
        # Only logistic regression is currently enabled; the other
        # classifiers are commented out.
        performance1 = logistic_training(X_train, y_train, X_test, y_test)
        # performance2 = gradient_boosting(X_train,y_train,X_test,y_test)
        # performance3 = XGBoost(X_train,y_train,X_test,y_test)
        # performance4 = KNN(X_train,y_train,X_test,y_test)
        # performance5 = decision_tree(X_train,y_train,X_test,y_test)

        # Collect the results of the enabled classifiers for this fold.
        fold_score.append(performance1)
        # fold_score.append(performance2)
        # fold_score.append(performance3)
        # fold_score.append(performance4)
        # fold_score.append(performance5)
        scorings.append(fold_score)
コード例 #6
0
# Partition the dataset
from sklearn.model_selection import train_test_split

date_file = "../../data/CICID-11372.csv"
X, y = pp.pre_process(date_file)

# 80 % of the samples stay in X / y; the other 20 % go to X_t / y_test.
X, X_t, y, y_test = train_test_split(X, y, random_state=0, test_size=0.2)

# Project onto two principal components so the data can be drawn in a plane.
pca = PCA(n_components=2)
X_vis = pca.fit_transform(X)

# Resample the retained split and project the result with the PCA that was
# fitted above. Note that `gs` is rebound from the class to an instance here.
gs = gs()
X_resampled, y_resampled = gs.fit_resample(X, y)
X_res_vis = pca.transform(X_resampled)

# One row with two axes, unpacked right away.
f, (ax1, ax2) = plt.subplots(1, 2)

# Shared marker appearance and per-class row selectors for the left plot.
_marker_style = dict(alpha=0.5, marker='.')
_is_class0 = y == '0'
_is_class1 = y == '1'

c0 = ax1.scatter(X_vis[_is_class0, 0],
                 X_vis[_is_class0, 1],
                 label="Class #0",
                 **_marker_style)
c1 = ax1.scatter(X_vis[_is_class1, 0],
                 X_vis[_is_class1, 1],
                 label="Class #1",
                 **_marker_style)
コード例 #7
0
from gsmote.comparison_testing import preprocessing as pp
import pandas as pd
from gsmote import EGSmote
date_file = "../../data/KDDCUP0.csv"
X, y = pp.pre_process(date_file)
# The sampler is created but the resampling step below is currently disabled.
sm = EGSmote()
# X,y = sm.fit_resample(X,y)

# Absolute numbers of training samples at which the curve is evaluated.
# NOTE(review): 7000 sits between 600 and 1000 and may be a typo for 700 --
# confirm before relying on that point.
train_sizes = [
    100, 500, 600, 7000, 1000, 1500, 2000, 3000, 10000, 15000, 20000, 30000,
    40000, 50000, 60000, 70000
]

from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import learning_curve

# F1 is a classification metric and needs predicted class labels, so the
# estimator has to be a classifier; a linear regressor's continuous output
# cannot be scored with it.
train_sizes, train_scores, validation_scores = learning_curve(
    estimator=LogisticRegression(max_iter=120),
    X=X,
    y=y,
    train_sizes=train_sizes,
    cv=16,
    shuffle=True,
    scoring="f1")
print('Training scores:\n\n', train_scores)
print('\n', '-' * 70)  # separator to make the output easy to read
print('\nValidation scores:\n\n', validation_scores)

# Average over the CV folds (axis 1). F1 is already "higher is better", so
# the sign is left as is; flipping it is only appropriate for neg_* scorers.
train_scores_mean = train_scores.mean(axis=1)
validation_scores_mean = validation_scores.mean(axis=1)
print('Mean training scores\n\n',
      pd.Series(train_scores_mean, index=train_sizes))