Example #1
0
def detect_outliers_SVM(df):
    ''' Returns the outlier scores using SVM (beware: prone to overfitting)

    Parameters:
    -----------
    df: pd.DataFrame,
        Samples used both to fit the One-Class SVM and to be scored.

    Returns:
    --------
    The output of ``score_samples`` for ``df`` (one score per row).
    '''
    clf = OneClassSVM()
    # Only the fitted model is needed; the labels that ``fit_predict`` would
    # additionally compute were never used.
    clf.fit(df)
    return clf.score_samples(df)
Example #2
0
def log_anomalyPRF_isof(cp, ground_truth, dataset, log_flag, SEED=1234):
    """Fit a linear one-class model on ``dataset`` and print ``CR(ground_truth, pred)``.

    ``dataset`` is either a path-like string (loaded with ``np.load``) or the
    samples themselves. Predicted ``-1`` labels are rewritten to ``0`` before
    the report is printed. ``log_flag`` and ``SEED`` are accepted but unused.
    """
    # The two config lookups are kept even though their values are not used
    # below, so the function still touches the same config options.
    n_clusters = cp.getint('Hyperparameters', 'ClusterNum')
    cluster_init = cp.getint('Hyperparameters', 'ClusterInit')
    model = OCS(kernel='linear')
    samples = np.load(dataset) if isinstance(dataset, basestring) else dataset
    pred = model.fit_predict(samples)
    pred[pred == -1] = 0
    #  pred = assign_labels(pred, ground_truth)
    print(CR(ground_truth, pred))
Example #3
0
def log_accuracy_isof(cp, ground_truth, dataset, log_flag, SEED=1234):
    """Fit a linear one-class model on ``dataset`` and log its accuracy.

    ``dataset`` is either a path-like string (loaded with ``np.load``) or the
    samples themselves. Predicted ``-1`` labels are rewritten to ``0`` and the
    first element of ``cluster_acc(pred, ground_truth)`` is logged together
    with ``log_flag``. ``SEED`` is accepted but unused.
    """
    # The two config lookups are kept even though their values are not used
    # below, so the function still touches the same config options.
    n_clusters = cp.getint('Hyperparameters', 'ClusterNum')
    cluster_init = cp.getint('Hyperparameters', 'ClusterInit')
    # One-class model with a linear kernel
    model = OCS(kernel='linear')
    samples = np.load(dataset) if isinstance(dataset, basestring) else dataset
    pred = model.fit_predict(samples)
    pred[pred == -1] = 0
    accuracy = cluster_acc(pred, ground_truth)[0]
    log('--------------- {} {} ------------------------'.format(
        log_flag, accuracy))
def outliers_OneClassSVM(df, iters):
    """Return the index labels of the rows of ``df`` not labelled as inliers.

    An RBF One-Class SVM (``gamma='auto'``, ``max_iter=iters``) is fitted on a
    copy of ``df``; rows whose predicted label is not ``1`` are reported.
    """
    frame = df.copy()
    detector = OneClassSVM(kernel='rbf', gamma='auto', max_iter=iters)
    labels = pd.DataFrame(detector.fit_predict(frame),
                          index=frame.index, columns=['svm'])
    labelled = frame.join(labels, how='left')
    not_inlier = labelled['svm'] != 1
    return labelled.loc[not_inlier].index
def model_ocSVM(data):
    """Fit an RBF One-Class SVM on ``data["X_test"]`` and return its labels.

    The model is fitted and evaluated on the same samples.
    """
    from sklearn.svm import OneClassSVM

    detector = OneClassSVM(kernel="rbf")
    return detector.fit_predict(data["X_test"])
Example #6
0
def svm_anomalies(train_data, train_oids, test_data, test_oids):
    '''
    Label training and testing samples with a One-Class SVM fitted on the
    training data.

    Keyword Args:
    train_data - training data
    train_oids - overflight ids for training data
    test_data - testing data
    test_oids - overflight ids for testing data

    Returns:
    two dictionaries (train, test) of oid -> anomaly label as returned by
    the model (-1 or 1)

    '''

    # NOTE(review): per the scikit-learn docs ``degree`` only affects the
    # 'poly' kernel, so it presumably has no effect here - confirm.
    detector = OneClassSVM(kernel="rbf", degree=20)
    train_labels = detector.fit_predict(train_data)
    test_labels = detector.predict(test_data)

    # Ids and labels are paired by position.
    train_dict = {
        train_oids[pos]: train_labels[pos] for pos in range(len(train_oids))
    }
    test_dict = {
        test_oids[pos]: test_labels[pos] for pos in range(len(test_oids))
    }

    return train_dict, test_dict
Example #7
0
class OCSVM(object):
    """Thin wrapper that builds a ``OneClassSVM`` from a config object."""

    # Hyperparameters copied from the config and forwarded to OneClassSVM.
    _SVM_OPTIONS = ('kernel', 'degree', 'gamma', 'coef0', 'tol', 'nu')

    def __init__(self, file_name, config):
        """Copy the settings from ``config`` and build the underlying model."""
        self.dataset = config.dataset
        self.file_name = file_name

        self.x_dim = config.x_dim

        for option in self._SVM_OPTIONS:
            setattr(self, option, getattr(config, option))

        self.pid = config.pid

        svm_kwargs = {option: getattr(self, option)
                      for option in self._SVM_OPTIONS}
        self.model = OneClassSVM(**svm_kwargs)

    def fit(self, train_input, train_label, test_input, test_label):
        """Fit on ``train_input`` and return labels and decision values for it.

        ``train_label``, ``test_input`` and ``test_label`` are accepted but
        not used. The predicted labels are -1 for outliers and 1 for inliers.
        """
        labels = self.model.fit_predict(train_input)
        decision_values = self.model.decision_function(train_input)
        return OCSVMOutput(y_hat=labels, decision_function=decision_values)
Example #8
0
class OutlineCheck():
    '''
    Filter out outlier rows using a scikit-learn outlier detection model.
    '''
    def __init__(self, clf=None, **kwargs):
        '''
        Build the detector: ``OneClassSVM`` by default, otherwise the given
        estimator class (e.g. IsolationForest/LocalOutlierFactor). ``kwargs``
        are passed to the estimator constructor.
        '''
        estimator_cls = OneClassSVM if clf is None else clf
        self.clf = estimator_cls(**kwargs)

    def get_detail(self, X: pd.DataFrame):
        '''
        Fit the detector on ``X`` and return ``(fitted model, inlier rows)``.
        '''
        labels = self.clf.fit_predict(X)
        return self.clf, X[labels == 1]

    @staticmethod
    def get_predict_detail(clf, X):
        '''
        Params:
        clf: the model returned by get_detail
        X: the input data to check

        Returns the rows predicted as inliers when ``X`` has more than one
        row, otherwise the single predicted label.
        '''
        labels = clf.predict(X)
        if X.shape[0] <= 1:
            return labels[0]
        return X[labels == 1]
Example #9
0
def dixon():
    """Request handler: flag outliers in the posted data with a One-Class SVM.

    Reads ``Data`` and ``Params`` from the JSON request body. ``Params`` may
    override ``kernel``, ``degree``, ``gamma`` and ``coef``. The response
    carries one flag per sample: 0 where the model predicted 1, else 1.
    Any exception is reported as a 400 response with its message.
    """
    fallbacks = (("kernel", "rbf"), ("degree", 3), ("gamma", 'scale'),
                 ("coef", 0))
    try:
        data = np.array(request.json["Data"])
        params = request.json['Params']

        options = {
            key: (params[key] if key in params else fallback)
            for key, fallback in fallbacks
        }

        clf = OneClassSVM(kernel=options["kernel"],
                          degree=options["degree"],
                          gamma=options["gamma"],
                          coef0=options["coef"])

        labels = clf.fit_predict(data)
        indices = [0 if label == 1 else 1 for label in labels.tolist()]

    except Exception as e:
        return jsonify({"message": str(e)}), 400
    return jsonify({"message": "OK", "data": indices})
Example #10
0
class OCSVMDetector(IAnomaly):
    """Sliding-window anomaly detector backed by a One-Class SVM.

    Samples are dicts whose ``"Resistance"`` entry is stored as a float.
    Once the warm-up phase is over, every call to ``detect`` refits the model
    on the current window and reports the result for the newest sample.
    """

    def __init__(self, slidingWindowSize = None):
        # NOTE(review): ``detect`` computes ``slidingWindowSize - 1``, so the
        # default of None looks unusable there - confirm callers always pass
        # an integer.
        self.slidingWindowSize = slidingWindowSize
        self.receivedSamplesNumber = 0
        self.currentSamples = []
        self.clf = OneClassSVM(nu=0.1, kernel="rbf", gamma='auto')
        self.dictHeaders = ['detectionCode', 'anomalyLikelihood', 'anomalyScore']

    def appendNewData(self, sample):
        """Store the sample's ``"Resistance"`` value and count it."""
        self.currentSamples.append(float(sample["Resistance"]))
        self.receivedSamplesNumber = self.receivedSamplesNumber +1

    def detect(self, new_data):
        """Return a dict keyed by ``dictHeaders`` for ``new_data``.

        During warm-up all three values are -1. Afterwards the oldest sample
        is dropped, the new one appended, and the model is refitted on the
        window; the dict then holds the predicted label and the
        ``score_samples`` value of the newest sample, and -1.
        """
        if self.receivedSamplesNumber < self.slidingWindowSize - 1:
            #Append all of the stabilization samples
            self.appendNewData(new_data)
            return dict(zip(self.dictHeaders, [-1, -1, -1]))

        #Remove one from current samples and add new data
        self.currentSamples.pop(0)
        self.appendNewData(new_data)
        # Build the column vector once and reuse it for both model calls.
        window = np.array(self.currentSamples).reshape(-1, 1)
        result = self.clf.fit_predict(window)[-1]
        likelihood = self.clf.score_samples(window)[-1]
        return dict(zip(self.dictHeaders, [result, likelihood, -1]))

    def detectFromList(self, data):
        """Run ``detect`` on every item of ``data``.

        Returns a list of shallow copies of the items, each updated with the
        detection dict.
        """
        results = []
        # Parenthesised single-argument print behaves the same on Python 2
        # and Python 3 (the bare statement form is a SyntaxError on 3).
        print("Detecting anomalies for {} samples of data".format(len(data)))
        for data_point in tqdm(data):
            detection = self.detect(data_point)
            result = copy.copy(data_point)
            result.update(detection)
            results.append(result)
        return results
Example #11
0
def find_anomaly_svm(matrix, per_out):
    """Return the positions of the samples labelled -1 by a One-Class SVM.

    ``matrix`` is passed to a model built with ``kernel='precomputed'`` and
    ``nu=per_out``.
    """
    model = OneClassSVM(kernel='precomputed', nu=per_out)
    labels = model.fit_predict(matrix)
    return [position for position, label in enumerate(labels) if label == -1]
Example #12
0
def OneClassSVMFunction(X,Y):
    """Drop the samples that a default One-Class SVM labels as negative.

    Returns ``(X_new, Y_new, clf)``: ``X`` with the flagged rows removed
    (axis 0), ``Y`` with the same positions removed, and the fitted model.
    """
    clf = OneClassSVM()
    labels = clf.fit_predict(X)
    flagged = [pos for pos in range(len(labels)) if labels[pos] < 0]
    return np.delete(X, flagged, 0), np.delete(Y, flagged), clf
Example #13
0
def SVM(data):
    """Fit a default One-Class SVM on ``data`` and return 0/1 labels.

    Negative predictions become 0 and positive predictions become 1.
    The previous loop only rebound its loop variable, so the array was
    returned unmodified; the relabelling is now applied to the array itself.
    """
    svm = OneClassSVM()
    labels = svm.fit_predict(data)

    # Boolean-mask assignment rewrites the array in place.
    labels[labels < 0] = 0
    labels[labels > 0] = 1
    return labels
class OutlierDetectionTransform:
    """Configure one outlier detector and use it to filter data/target pairs.

    Pick an algorithm with one of the ``set_*`` methods, call ``filter_data``
    to fit it and drop the flagged rows, then optionally call ``mask_data``
    to apply the same mask to other arrays.
    """

    def __init__(self):
        self.__algorithm = None
        self.__is_algorithm_set = False
        self.__outliers_mask = []
        self.__random_state = 1
        self.__jobs = -1

    def set_random_state(self, state):
        """Set the random state used by detectors configured afterwards."""
        self.__random_state = state

    def set_jobs(self, jobs):
        """Set the ``n_jobs`` value used by an isolation forest configured afterwards."""
        self.__jobs = jobs

    def set_isolation_forest(self, contamination_val):
        """Use an ``IsolationForest`` with the given contamination."""
        # NOTE(review): the ``behaviour`` argument was deprecated and later
        # removed in newer scikit-learn releases - confirm the pinned version.
        self.__algorithm = IsolationForest(behaviour='new', n_jobs=self.__jobs, random_state=self.__random_state, contamination=contamination_val)
        self.__is_algorithm_set = True

    def set_elliptic_envelope(self, contamination_val, support_fraction_val = None):
        """Use an ``EllipticEnvelope`` with the given contamination and support fraction."""
        self.__algorithm = EllipticEnvelope(support_fraction=support_fraction_val, contamination=contamination_val, random_state=self.__random_state)
        self.__is_algorithm_set = True

    def set_OCSVM(self, nu_val=0.5, gamma_val='scale', kernel_val='rbf', coef0_val=0.0):
        """Use a ``OneClassSVM`` with the given hyperparameters."""
        self.__algorithm = OneClassSVM(nu=nu_val, kernel=kernel_val, coef0=coef0_val,
            gamma=gamma_val, shrinking=True)
        self.__is_algorithm_set = True

    def __keep_inliers(self, original_target, original_data):
        # Select the entries whose stored mask value is positive; data is
        # indexed before the target, and both results are copied.
        keep = self.__outliers_mask > 0
        data_filtered = original_data[keep]
        target_filtered = original_target[keep]
        return target_filtered.copy(), data_filtered.copy()

    def filter_data(self, original_target, original_data):#, compare_data):
        """Fit the detector on ``original_data`` and drop the flagged entries.

        Returns ``(target, data)`` copies restricted to positive mask values,
        or ``(None, None)`` when no algorithm has been set. The mask is kept
        for later ``mask_data`` calls.
        """
        if not self.__is_algorithm_set:
            return None, None

        self.__outliers_mask = self.__algorithm.fit_predict(original_data)
        return self.__keep_inliers(original_target, original_data)

    def mask_data(self, original_target, original_data):#, compare_data):
        """Apply the mask stored by ``filter_data`` to another target/data pair.

        Returns ``(None, None)`` when the stored mask is empty.
        """
        if len(self.__outliers_mask) == 0:
            return None, None

        return self.__keep_inliers(original_target, original_data)
Example #15
0
def cluster(folderName, vectorsize, clusterType):
    """Cluster the traces of an event log using a stored Doc2Vec model.

    Loads ``<folderName>.xes``, infers one vector per trace with the model
    at ``output/<folderName>T2VVS<vectorsize>.model``, assigns a label to
    every trace with the chosen algorithm and writes ``trace,label`` lines to
    ``output/<folderName>T2VVS<vectorsize><clusterType>.csv``.

    Parameters:
        folderName: base name of the event log and of the output files.
        vectorsize: vector size used in the model/output file names.
        clusterType: "KMeans", "HierWard" or "OCSVM"; anything else prints a
            message and returns without writing a file.
    """
    corpus = loadXES.get_doc_XES_tagged(folderName + '.xes')
    print('Data Loading finished, ', str(len(corpus)), ' traces found.')

    model = gensim.models.Doc2Vec.load('output/' + folderName + 'T2VVS' +
                                       str(vectorsize) + '.model')

    NUM_CLUSTERS = 5
    print("inferring vectors")
    vectors = [
        model.infer_vector(corpus[doc_id].words)
        for doc_id in range(len(corpus))
    ]
    print("done")

    if (clusterType == "KMeans"):
        kclusterer = KMeansClusterer(
            NUM_CLUSTERS,
            distance=nltk.cluster.util.cosine_distance,
            repeats=25)
        assigned_clusters = kclusterer.cluster(vectors, assign_clusters=True)
    elif (clusterType == "HierWard"):
        ward = AgglomerativeClustering(n_clusters=NUM_CLUSTERS,
                                       linkage='ward').fit(vectors)
        assigned_clusters = ward.labels_
    elif clusterType == "OCSVM":
        ocsvm = OneClassSVM()
        assigned_clusters = ocsvm.fit_predict(vectors)

    else:
        print(
            clusterType,
            " is not a predefined cluster type. Please use 'KMeans' or 'HierWard', or create a definition for ",
            clusterType)
        return
    trace_list = loadXES.get_trace_names(folderName + ".xes")

    result_path = ('output/' + folderName + 'T2VVS' + str(vectorsize) +
                   clusterType + '.csv')
    # ``with`` guarantees the file is closed even if a write raises.
    with open(result_path, 'w') as resultFile:
        for doc_id in range(len(corpus)):
            resultFile.write(trace_list[doc_id] + ',' +
                             str(assigned_clusters[doc_id]) + "\n")

    print("done with ", clusterType, " on event log ", folderName)
Example #16
0
def process_outliers(X_train, y_train):
    """Remove the rows that a One-Class SVM (nu=0.02) labels as outliers.

    Only the numeric columns of ``X_train`` are kept and used for fitting.

    Returns:
        ``(X_train, y_train)`` restricted to the rows whose predicted label
        is not -1; ``X_train`` contains the numeric columns only.
    """
    # only select numeric columns
    numerics = [
        'uint8', 'int16', 'int32', 'int64', 'float16', 'float32', 'float64'
    ]
    X_train = X_train.select_dtypes(include=numerics)

    # identify outliers in the training dataset
    outlier_predictor = OneClassSVM(nu=0.02)
    y_hat = outlier_predictor.fit_predict(X_train)

    # select all rows that are not outliers
    outlier_mask = y_hat != -1
    return X_train[outlier_mask], y_train[outlier_mask]
Example #17
0
def one_class_svm(training_vectors, test_vectors_clean, test_vectors_anomalous):
    """Fit a One-Class SVM on the training vectors and label all three sets.

    Returns ``(result_clean, result_anomalous, result_training)``; each
    holds 1 for predicted inliers and -1 for predicted outliers.
    """
    print("Starting One Class SVM...")

    # Model used for novelty prediction on the two test sets
    detector = OneClassSVM(gamma = 'auto', kernel = 'rbf', nu = 0.05)
    print("Fitting with Parameters: ", detector.get_params())
    training_labels = detector.fit_predict(training_vectors)

    print("Fitting successful!")
    print("Starting Prediction...")

    clean_labels = detector.predict(test_vectors_clean)
    anomalous_labels = detector.predict(test_vectors_anomalous)

    print("Predicting successful!")
    print("**************************")

    return clean_labels, anomalous_labels, training_labels
Example #18
0
# Hyperparameter grid for the One-Class SVM sweep: every (nu, gamma)
# combination listed here is tried below.
param_grid = {
    'nu': [0.001, 0.0015, 0.002, 0.003, 0.005, 0.01],
    'gamma': ['scale', 0.0005, 0.001, 0.0025, 0.005, 0.01]
}
# param_grid = {'nu':  np.arange(0.0001, 0.01, 0.0005),
#               'gamma': np.arange(0.0005, 0.01, 0.001)}
# param_grid = {'nu': [0.0015, 0.03], 'gamma': [0.2, 0.3]}
grid = ParameterGrid(param_grid)

# Fit one model per combination on X and keep its predicted labels together
# with the parameters that produced them.
pred_list = []
for params in tqdm(grid):
    # NOTE(review): contamination and gamma are assigned but not used in the
    # visible code.
    contamination = params['nu']
    gamma = params['gamma']
    ocsvm_model = OneClassSVM(**params)
    ocsvm_pred = ocsvm_model.fit_predict(X)
    pred_list.append((ocsvm_pred, params))

# Plot the original series, the subset used for fitting and the points each
# model labelled -1.
# NOTE(review): this snippet looks like two unrelated examples spliced
# together inside the second plt.plot(labels, ...) call below; `ax`, `Xs`,
# `nb_samples` and `labels` are not defined in the visible code - confirm
# against the original sources before running.
for pred, params in tqdm(pred_list):
    plt.plot(df.index,
             df.value,
             label='Original data',
             linestyle='--',
             alpha=0.5)
    # Rows of df that were actually used as model input (aligned on X.index)
    tdf = df.loc[X.index, :]
    plt.plot(tdf.index, tdf.value, label='Used data', color='C0')
    anomalies = tdf.value[pred == -1]

    plt.plot(anomalies, 'x', label="Predicted anomalies", markersize=10)
    plt.plot(labels,
             tdf.loc[labels],
               Xs[nb_samples:, 1],
               marker='^',
               s=80,
               label='Test samples')
    ax.scatter(Xs[:nb_samples, 0], Xs[:nb_samples, 1], label='Inliers')

    ax.set_xlabel('Age', fontsize=14)
    ax.set_ylabel('Height', fontsize=14)

    ax.legend(fontsize=14)

    plt.show()

    # Train the One-Class SVM
    ocsvm = OneClassSVM(kernel='rbf', gamma='scale', nu=0.2)
    Ys = ocsvm.fit_predict(Xs)

    # Show the results
    fig, ax = plt.subplots(1, 2, figsize=(22, 10), sharey=True)

    # Left panel: samples labelled -1 as crosses, samples labelled 1 as dots
    ax[0].scatter(Xs[Ys == -1, 0],
                  Xs[Ys == -1, 1],
                  marker='x',
                  s=100,
                  label='Ouliers')
    ax[0].scatter(Xs[Ys == 1, 0], Xs[Ys == 1, 1], marker='o', label='Inliers')

    # Right panel: only the samples labelled -1
    ax[1].scatter(Xs[Ys == -1, 0], Xs[Ys == -1, 1], marker='x', s=100)

    ax[0].set_xlabel('Age', fontsize=16)
    ax[0].set_ylabel('Height', fontsize=16)
Example #20
0
# add the data to the main
# NOTE(review): `model` and `data` used in the next line come from code
# that is not visible in this snippet.
df['anomaly26'] = pd.Series(model.predict(data))
# Recode the labels: 1 -> 0 and -1 -> 1
df['anomaly26'] = df['anomaly26'].map({1: 0, -1: 1})
print(df['anomaly26'].value_counts())

# test
journeys[(journeys.Journey_ID == 21)].plot(x='timestamp', y='acceleration')
# probably a crash since it is a long period of immobility followed by slow movement

outliers_fraction = 0.0001
# NOTE: despite its name this is a StandardScaler, not a min-max scaler
min_max_scaler = preprocessing.StandardScaler()

# df is another name for the journeys frame, so the column added below is
# added to journeys as well
df = journeys
data = journeys.loc[:, ('x', 'y', 'z')]
np_scaled = min_max_scaler.fit_transform(data)
# train a Local Outlier Factor model on the scaled x/y/z columns
model = LocalOutlierFactor(
    n_neighbors=5,
    contamination=outliers_fraction)  #nu=0.95 * outliers_fraction  + 0.05
data = pd.DataFrame(np_scaled)
# NOTE(review): fit_predict below fits the model again, so this separate
# fit looks redundant - confirm.
model.fit(data)
# add the data to the main
# NOTE(review): the new Series has a default integer index; presumably df
# uses the same index, otherwise the assignment aligns on labels - verify.
df['anomaly26'] = pd.Series(model.fit_predict(data))
df['anomaly26'] = df['anomaly26'].map({1: 0, -1: 1})
print(df['anomaly26'].value_counts())

# Rows flagged as anomalies, with the columns of interest
a = df.loc[df['anomaly26'] == 1,
           ['timestamp', 'acceleration', 'speed', 'Journey_ID']]  #anomaly

a
Example #21
0
def indices_inliers_by_svm(data, nu):
    """Return a boolean mask that is True for samples not labelled -1.

    A One-Class SVM with the given ``nu`` is fitted on ``data``.
    """
    labels = OneClassSVM(nu=nu).fit_predict(data)
    # keep every sample the model did not flag as an outlier
    return labels != -1
# Column-wise concatenation of the two inputs (defined outside this snippet)
x2_l2 = np.concatenate((y1h_l2, y1_l2), axis=1, out=None)

from sklearn.svm import OneClassSVM
from sklearn.ensemble import IsolationForest

# Per-run metrics; each starts as an empty list and is extended with
# np.append inside the loop.
acc_1svm = []
acc_h_1svm = []
acc_l_1svm = []
TPR_1svm = []
# NOTE(review): FPR_1svm is never filled in the visible code; the snippet
# may be truncated.
FPR_1svm = []
for run in range(100):
    print(run)
    clf = OneClassSVM(nu=0.001, kernel='rbf', gamma=5.3)
    # clf = IsolationForest(n_estimators=300, max_samples=100000, contamination=0.001, max_features=1.0, n_jobs=10, behaviour='new')

    # Fit on x2_h1 only
    y_pred_train = clf.fit_predict(x2_h1)

    # Evaluate on x2_h2 (expected label 1) followed by x2_l1 and x2_l2
    # (expected label -1)
    y_pred = clf.predict(np.concatenate((x2_h2, x2_l1, x2_l2), axis=0))
    y2_tst = np.concatenate(
        (np.ones(x2_h2.shape[0]), -1 * np.ones(x2_l1.shape[0]),
         -1 * np.ones(x2_l2.shape[0])))
    # Both label vectors are negated before building the confusion matrix,
    # so the samples originally labelled -1 carry label 1 in the matrix.
    confm = confusion_matrix(-1 * y2_tst, -1 * y_pred)
    print('Confusion Matrix : \n', confm)
    tn, fp, fn, tp = confm.ravel()
    total1 = np.sum(np.sum(confm))
    # Overall accuracy, then the per-row accuracy of each matrix row
    acc_1svm = np.append(acc_1svm, (confm[0, 0] + confm[1, 1]) / total1)
    acc_h_1svm = np.append(acc_h_1svm,
                           confm[0, 0] / (confm[0, 0] + confm[0, 1]))
    acc_l_1svm = np.append(acc_l_1svm,
                           confm[1, 1] / (confm[1, 1] + confm[1, 0]))
    TPR_1svm = np.append(TPR_1svm, tp / (tp + fn))
def svm_anomaly_score(df_data):
    """Score every column of ``df_data`` with a One-Class SVM and plot it.

    For each column: missing values are replaced by 0 (written back into
    ``df_data``), an RBF One-Class SVM is fitted on that single column, and
    the samples whose ``score_samples`` value is at or below the 0.008
    quantile of the scores are treated as outliers. A plotly figure with
    the inliers, the outliers and the threshold line is shown and written to
    ``SVM_images/<label>.png``.

    Returns:
        list: positional row indices flagged as outliers, accumulated over
        all columns (a row flagged in several columns appears several times).
    """
    # must calibrate it for all measurements
    outliers = []
    for label, _ in df_data.items():
        column = f'{label}'
        df_data[column] = df_data[column].fillna(0)
        # Single-feature matrix, built once and shared by fit and scoring.
        values = df_data[column].values.reshape(-1, 1)

        svm = OneClassSVM(kernel='rbf', gamma=0.00001, nu=0.03)
        # The predicted labels were never used, so a plain fit is enough.
        svm.fit(values)
        scores = svm.score_samples(values)

        thresh = quantile(scores, 0.008)

        outlier_rows = []
        inlier_rows = []
        for row, score in enumerate(scores):
            if score <= thresh:
                outlier_rows.append(row)
            else:
                inlier_rows.append(row)
        outliers.extend(outlier_rows)

        # Feature values are read by position so they always belong to the
        # same row as the score, whatever index the frame carries.
        inliers_pd = pd.DataFrame({
            'inliers': [scores[row] for row in inlier_rows],
            'inliers_feature_score': [values[row, 0] for row in inlier_rows],
            'inliers_index': inlier_rows
        })
        pd_anom = pd.DataFrame({
            'AnomScore': [scores[row] for row in outlier_rows],
            'FeatureScore': [values[row, 0] for row in outlier_rows],
            'outlier_index': outlier_rows
        })

        fig = go.Figure()
        fig.update_layout(
            title={
                'text': f"SVM Detection of {label}",
                'y': 0.97,
                'x': 0.5
            },
            paper_bgcolor='white',
            plot_bgcolor="rgb(211, 216, 230)",
            # xaxis_title=" ",
            yaxis_title="Anomaly Score",
            font=dict(family="Courier New, monospace",
                      size=50,
                      color="rgb(10, 16, 87)"),
            title_font_color='rgb(145, 0, 0)',
            # Horizontal dotted line marking the score threshold
            shapes=[
                dict(type="line",
                     xref="x",
                     yref="y",
                     x0=df_data[column].min(),
                     y0=thresh,
                     x1=df_data[column].max(),
                     y1=thresh,
                     opacity=1,
                     line=dict(color='blue', dash='dot'))
            ])

        fig.add_trace(
            go.Scatter(x=inliers_pd['inliers_feature_score'],
                       y=inliers_pd['inliers'],
                       mode='markers',
                       marker=dict(size=6, color='rgb(0, 0, 0)'),
                       name='Normal',
                       marker_symbol='circle'))
        fig.add_trace(
            go.Scatter(x=pd_anom['FeatureScore'],
                       y=pd_anom['AnomScore'],
                       mode='markers',
                       marker=dict(size=14, color='rgb(255, 0, 0)'),
                       name='Abnormal',
                       marker_symbol=206))

        fig.show()
        plotly.io.write_image(fig,
                              f'SVM_images/{label}.png',
                              width=2560,
                              height=1440)
    return outliers
# Script: drop the training rows a One-Class SVM flags as outliers, then
# fit a linear regression and report its mean absolute error on the test set.
# load the dataset
url = 'https://raw.githubusercontent.com/jbrownlee/Datasets/master/housing.csv'
df = read_csv(url, header=None)
# retrieve the array
data = df.values
# split into input and output elements (last column is the output)
X, y = data[:, :-1], data[:, -1]
# split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.33,
                                                    random_state=1)
# summarize the shape of the training dataset
print(X_train.shape, y_train.shape)
# identify outliers in the training dataset
# (the variable is called `ee` but holds a OneClassSVM)
ee = OneClassSVM(nu=0.01)
yhat = ee.fit_predict(X_train)
# select all rows that are not outliers
mask = yhat != -1
X_train, y_train = X_train[mask, :], y_train[mask]
# summarize the shape of the updated training dataset
print(X_train.shape, y_train.shape)
# fit the model
model = LinearRegression()
model.fit(X_train, y_train)
# evaluate the model (yhat is reused for the regression predictions)
yhat = model.predict(X_test)
# evaluate predictions
mae = mean_absolute_error(y_test, yhat)
print('MAE: %.3f' % mae)
# NOTE(review): `ii`, `dataset` and `features` are defined before the visible
# part of this snippet; the detector fitted here is not shown.
ii.fit(dataset[features])  #Error occurs here.

# Store the predicted label of every row, then release the detector
dataset['outlier'] = ii.predict(dataset[features])
del ii

# Show the rows labelled -1
print(dataset[dataset['outlier'] == -1])

#IsolationForest

from sklearn.ensemble import IsolationForest
ii = IsolationForest(max_samples=62,
                     contamination=0.25,
                     random_state=np.random.RandomState(42))

print("Fit data")
ii.fit(dataset[features])  #Error occurs here.

# Overwrites the 'outlier' column written by the previous detector
dataset['outlier'] = ii.predict(dataset[features])
del ii

print(dataset[dataset['outlier'] == -1])

#LocalOutlierFactor

from sklearn.neighbors import LocalOutlierFactor
ii = LocalOutlierFactor(n_neighbors=35, contamination=0.25)

# Fit and label in a single call; again overwrites the 'outlier' column
dataset['outlier'] = ii.fit_predict(dataset[features])
del ii
print(dataset[dataset['outlier'] == -1])
def get_svm(db: pd.DataFrame) -> list:
    """Flag the rows of ``db`` that a One-Class SVM (nu=0.01) labels -1.

    The value returned is the element-wise comparison of the predicted
    labels with -1 (True marks a flagged row).
    """
    predicted = OneClassSVM(nu=0.01).fit_predict(db)
    return predicted == -1
Example #27
0
# train the model
# NOTE(review): `data`, `df_test`, `Series` and `DataFrame` come from code
# that is not visible in this snippet.
outliers_fraction = 0.01
model = OneClassSVM(nu=0.95 * outliers_fraction)
model.fit(data)

# add the data to the main
#df['anomaly'] = Series(model.predict(data))
#df['anomaly'] = df['anomaly'].map({1: 0, -1: 1})
#print(df['anomaly'].value_counts())
data_test = df_test[['cpu']]
# standardize test data
# NOTE: despite its name this is a StandardScaler, and it is fitted on the
# test data itself
min_max_scaler = preprocessing.StandardScaler()
np_scaled = min_max_scaler.fit_transform(data_test)
data_test = DataFrame(np_scaled)
# test model
# NOTE(review): fit_predict fits the model again on the test data, so the
# fit on `data` above does not influence these labels; the commented-out
# predict() variant below would reuse the training fit - confirm intent.
df_test['anomaly'] = Series(model.fit_predict(data_test))
#df_test['anomaly'] = Series(model.predict(data_test))
# Recode the labels: 1 -> 0 and -1 -> 1
df_test['anomaly'] = df_test['anomaly'].map({1: 0, -1: 1})
print(df_test['anomaly'].value_counts())

# visualisation of anomaly throughout time
fig, ax = plt.subplots()
#a = df.loc[df['anomaly'] == 1, ['time', 'cpu']]
#ax.plot(df['time'], df['cpu'], color='blue')
#ax.scatter(a['time'], a['cpu'], color='red')
#plt.show()
# Flagged rows are drawn in red on top of the blue cpu series
b = df_test.loc[df_test['anomaly'] == 1, ['time', 'cpu']]
ax.plot(df_test['time'], df_test['cpu'], color='blue')
ax.scatter(b['time'], b['cpu'], color='red')
plt.show()
Example #28
0
 def objective(space):
     """Loss for one (nu, gamma) candidate taken from ``space``.

     Fits an RBF One-Class SVM on ``self.features`` and returns the negated
     Calinski-Harabasz score of the resulting labels as the loss.
     """
     estimator = OneClassSVM(cache_size=2048,
                             kernel='rbf',
                             nu=space['nu'],
                             gamma=space['gamma'])
     labels = estimator.fit_predict(self.features)
     loss = -metrics.calinski_harabasz_score(self.features, labels)
     return {'loss': loss, 'status': STATUS_OK}
Example #29
0
def main():
    '''Train a One-Class SVM and then a logistic regression on flow features.

    Steps, in order:
    1. Load the config from CONFIG_PATH and read the processed feature CSV.
    2. Recode Label 0 as -1, split the Label == 1 rows (forward and backward
       flows separately) 80/20 into train/test, and add every Label == -1
       row to the test set.
    3. Fit a StandardScaler and a One-Class SVM on the training rows, save
       both, and save confusion matrices and performance for train and test.
    4. Compute decision-function scores for all rows and write the combined
       frame to ``<trained_path>oneclass.csv``.
    5. Fit a scaler and a logistic regression on that combined frame, save
       them and their metrics, add the predicted label and the per-class
       confidence columns, and write ``<trained_path>lr.csv``.
    '''
    total_start_time = time.time()
    config = load_yaml(CONFIG_PATH)
    metric_path = config['metric_path']
    model_path = config['model_path']
    processed_path = config['processed_path']
    trained_path = config['trained_path']
    feature_df = pd.read_csv(processed_path)
    feature_df['StartTime'] = pd.to_datetime(feature_df['StartTime'])
    # Labels become -1 / 1, matching the values OneClassSVM predicts
    feature_df.loc[feature_df.Label == 0, 'Label'] = -1
    feature_df.Proto_Int = feature_df.Proto_Int.astype('category')
    feature_df.Sport_Int = feature_df.Sport_Int.astype('category')
    feature_df.Dir_Int = feature_df.Dir_Int.astype('category')
    feature_df.Dport_Int = feature_df.Dport_Int.astype('category')
    feature_df.State_Int = feature_df.State_Int.astype('category')
    malicious_df = feature_df.loc[feature_df.Label == 1]
    mal_forward_df = malicious_df.loc[malicious_df.is_fwd == 1]
    mal_back_df = malicious_df.loc[malicious_df.is_fwd == 0]
    benign_df = feature_df.loc[feature_df.Label == -1]
    del feature_df, malicious_df
    # Forward and backward flows are split separately with the same seed
    X_fwd_train, X_fwd_test, y_fwd_train, y_fwd_test = train_test_split(
        mal_forward_df, mal_forward_df['Label'], test_size=0.2, random_state=0)
    X_bwd_train, X_bwd_test, y_bwd_train, y_bwd_test = train_test_split(
        mal_back_df, mal_back_df['Label'], test_size=0.2, random_state=0)
    del mal_forward_df, mal_back_df
    # Training data holds Label == 1 rows only
    X_train = pd.concat([X_fwd_train, X_bwd_train])

    # Test data: held-out Label == 1 rows plus every Label == -1 row
    X_test = pd.concat([X_fwd_test, X_bwd_test])
    X_test = pd.concat([X_test, benign_df])

    y_train = X_train.Label
    y_test = X_test.Label

    del X_fwd_train, X_fwd_test, y_fwd_train, y_fwd_test
    del X_bwd_train, X_bwd_test, y_bwd_train, y_bwd_test
    del benign_df
    # Hyper Tuning One Class
    # sample_size = 100000
    # if len(X_train) < sample_size:
    #     sample_size = len(X_train)
    # X_train_sample = X_train.sample(sample_size, random_state=0)
    # y_train_sample = X_train_sample.Label
    # start_time = time.time()
    # print(f'Hyper Tune with Size {sample_size}')
    # oc_params = tune_oneclass(df_train_subset(X_train_sample), y_train_sample, 'f1')
    # print(f'Time (param search) {sample_size} size. 3 Folds. 18 tot Fits: {time.time()-start_time}')
    oc_kernel = 'rbf'
    oc_nu = 1e-2
    oc_gamma = 1e-6
    oc_clf = OneClassSVM(kernel=oc_kernel,
                         nu=oc_nu,
                         gamma=oc_gamma,
                         cache_size=7000,
                         verbose=True)
    oc_model_name = 'oneclass'
    # The scaler is fitted on the training rows only and reused for test rows
    oc_scaler = preprocessing.StandardScaler()
    oc_scaler.fit(df_train_subset(X_train))
    save_model(oc_scaler, model_path, 'oc_scaler')

    #Fit One Class
    start_time = time.time()
    oc_predict_train = oc_clf.fit_predict(oc_scaler.transform(
        df_train_subset(X_train)),
                                          y=y_train)
    print(
        f'Time One Class Train Size {len(X_train)} :{time.time() - start_time}'
    )
    save_model(oc_clf, model_path, oc_model_name)

    #Confusion Matrix
    save_confuse_matrix(y_train, oc_predict_train, metric_path, oc_model_name,
                        'train')
    oc_predict_test = oc_clf.predict(
        oc_scaler.transform(df_train_subset(X_test)))
    save_confuse_matrix(y_test, oc_predict_test, metric_path, oc_model_name,
                        'test')

    #Performance
    save_performance(y_train, oc_predict_train, metric_path, oc_model_name,
                     'train')
    save_performance(y_test, oc_predict_test, metric_path, oc_model_name,
                     'test')

    #Get confidence scores
    # The scores are computed on train+test sorted by StartTime; final_df
    # below is built and sorted the same way before the scores are attached.
    start_time = time.time()
    data_f = pd.concat([X_train, X_test])
    data_f.sort_values('StartTime', inplace=True)
    oc_conf_score = oc_clf.decision_function(
        oc_scaler.transform(df_train_subset(data_f)))
    print(f'Time Confidence Scores: {time.time() - start_time}')
    del data_f, oc_kernel, oc_nu, oc_gamma, oc_clf, oc_scaler

    #Saving to CSV
    start_time = time.time()
    # Drop and re-add Label so it ends up after the other existing columns,
    # followed by the predicted label
    x_test_label = X_test['Label']
    X_test.drop(columns=['Label'], inplace=True, axis=1)
    X_test['Label'] = x_test_label
    X_test['Predicted_Label'] = oc_predict_test

    mal_train_label = X_train['Label']
    X_train.drop(columns=['Label'], inplace=True, axis=1)
    X_train['Label'] = mal_train_label
    X_train['Predicted_Label'] = oc_predict_train

    final_df = pd.concat([X_train, X_test])
    del X_train, X_test, y_train, y_test
    final_df.sort_values('StartTime', inplace=True)

    final_df['Confidence_Score'] = oc_conf_score
    makedirs(dirname(f'{trained_path}'), exist_ok=True)
    final_df.to_csv(f'{trained_path}{oc_model_name}.csv', index=False)
    print(f'Saving one_class_features csv: {time.time() - start_time}')

    # Train Logistic Regression
    #Hypter tune with 10 perent of data.
    # start_time = time.time()
    # lr_train_size = 0.1
    # if len(final_df) < 100000:
    #     lr_train_size = 0.95
    # final_df, X_test_sample, y_train_s, y_test_s = train_test_split(final_df,
    #                                                                 final_df.Label,
    #                                                                 train_size=lr_train_size,
    #                                                                 stratify=final_df.Label)
    # del X_test_sample, y_train_s, y_test_s
    # lr_params = tune_log_reg(df_train_subset(final_df), final_df.Label, 'average_precision')
    # print(f'Time Hyper Tuning LR: {time.time() - start_time}')
    # Fixed hyperparameters (presumably taken from an earlier tuning run)
    lr_params = {'C': 69.54618247583652, 'tol': 0.0009555227427965779}
    lr_clf = LogisticRegression(solver='saga',
                                penalty='l2',
                                dual=False,
                                tol=lr_params['tol'],
                                C=lr_params['C'],
                                max_iter=80000)
    lr_model_name = 'lr'
    lr_scaler = preprocessing.StandardScaler()
    lr_scaler.fit(df_train_subset(final_df))
    #Save LR Scaler
    save_model(lr_scaler, model_path, 'lr_scaler')

    #Fit Logistic Regression
    start_time = time.time()
    lr_train_transformed = lr_scaler.transform(df_train_subset(final_df))
    lr_clf.fit(lr_train_transformed, y=final_df.Label)
    save_model(lr_clf, model_path, lr_model_name)
    print(f'Time Train LR Size {len(final_df)}: {time.time() - start_time}')

    #Performance (Write afterwards)
    # Predictions are made on the same rows the model was fitted on
    lr_predicted = lr_clf.predict(lr_train_transformed)
    save_performance(final_df.Label, lr_predicted, metric_path, lr_model_name,
                     'train')

    #Confusion Matrix
    save_confuse_matrix(final_df.Label, lr_predicted, metric_path,
                        lr_model_name, 'train')

    #Normalize Confidence Score
    start_time = time.time()
    ncs = normalize_confidence_score(lr_clf, lr_scaler,
                                     df_train_subset(final_df))
    final_df['LR_Predicted'] = lr_predicted
    # One confidence column per class, named after the class label
    lr_classes = lr_clf.classes_
    final_df[f'CS_LR_{lr_classes[0]}'] = [prob[0] for prob in ncs]
    final_df[f'CS_LR_{lr_classes[1]}'] = [prob[1] for prob in ncs]
    print(f'Time Normalize Conf Score: {time.time() - start_time}')

    #Save to CSV
    start_time = time.time()
    final_df.to_csv(f'{trained_path}{lr_model_name}.csv', index=False)
    print(f'Time Saving Normalized DF to CSV: {time.time() - start_time}')
    print(
        f'Training Complete - Time Elapsed: {time.time() - total_start_time}')
Example #30
0
# NOTE(review): X_train, y_train and X_test are defined before the visible
# part of this snippet.
# identify outliers in training dataset with Isolation Forest Algorithm
#iso = IsolationForest(contamination=0.1)
#yhat = iso.fit_predict(X_train) # Find outliers

# identify outliers with Minimum Covariance Determinant
#ee = EllipticEnvelope(contamination=0.01)
#yhat = ee.fit_predict(X_train)

# identify outliers with Local Outlier Factor
#lof = LocalOutlierFactor()
#yhat = lof.fit_predict(X_train)

# identify outliers with One Class SVM (Support Vector Machine)
ocs = OneClassSVM(nu=0.03)
yhat = ocs.fit_predict(X_train)

# select all rows that are not outliers
mask = yhat != -1
X_train, y_train = X_train[mask, :], y_train[mask]

# summarize the shape of the updated training dataset
print(X_train.shape, y_train.shape)

# fit the model
model = LinearRegression()
model.fit(X_train, y_train)

# evaluate the model (yhat is reused for the regression predictions)
yhat = model.predict(X_test)