from sklearn.svm import OneClassSVM


def detect_outliers_SVM(df):
    '''
    Returns outlier scores using a One-Class SVM
    (beware: prone to overfitting).

    Parameters
    ----------
    df : pd.DataFrame
    '''
    clf = OneClassSVM()
    clf.fit_predict(df)  # fit the model; the -1/1 labels are not used here
    scores = clf.score_samples(df)
    # dec_func = clf.decision_function(df_imputed)
    return scores
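
# Hypothetical usage sketch (not from the original source): score a small
# synthetic DataFrame with the function above; df_demo and the planted
# outlier are assumptions for illustration. Lower scores = more anomalous.
import numpy as np
import pandas as pd

rng = np.random.RandomState(0)
df_demo = pd.DataFrame(rng.normal(size=(100, 3)), columns=['a', 'b', 'c'])
df_demo.iloc[0] = [8.0, 8.0, 8.0]  # plant an obvious outlier

scores = detect_outliers_SVM(df_demo)
print(df_demo.index[np.argsort(scores)[:5]])  # rows with the lowest scores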
def log_anomalyPRF_isof(cp, ground_truth, dataset, log_flag, SEED=1234):
    # Init clustering hyperparameters (unused by the SVM below)
    n_clusters = cp.getint('Hyperparameters', 'ClusterNum')
    cluster_init = cp.getint('Hyperparameters', 'ClusterInit')
    km = OCS(kernel='linear')
    if isinstance(dataset, str):
        pred = km.fit_predict(np.load(dataset))
    else:
        pred = km.fit_predict(dataset)
    pred[np.where(pred == -1)[0]] = 0
    # pred = assign_labels(pred, ground_truth)
    print(CR(ground_truth, pred))
def log_accuracy_isof(cp, ground_truth, dataset, log_flag, SEED=1234):
    # Init clustering hyperparameters (unused by the SVM below)
    n_clusters = cp.getint('Hyperparameters', 'ClusterNum')
    cluster_init = cp.getint('Hyperparameters', 'ClusterInit')
    # One-Class SVM model (the `km` name is historical, not KMeans)
    km = OCS(kernel='linear')
    if isinstance(dataset, str):
        pred = km.fit_predict(np.load(dataset))
    else:
        pred = km.fit_predict(dataset)
    pred[np.where(pred == -1)[0]] = 0
    log('--------------- {} {} ------------------------'.format(
        log_flag, cluster_acc(pred, ground_truth)[0]))
def outliers_OneClassSVM(df, iters):
    dataset = df.copy()
    OCSVM = OneClassSVM(kernel='rbf', gamma='auto', max_iter=iters)
    df_with_svm = dataset.join(
        pd.DataFrame(OCSVM.fit_predict(dataset),
                     index=dataset.index,
                     columns=['svm']),
        how='left')
    return df_with_svm.loc[df_with_svm['svm'] != 1].index
def model_ocSVM(data):
    from sklearn.svm import OneClassSVM
    ocSVM = OneClassSVM(kernel="rbf")
    y_pred = ocSVM.fit_predict(data["X_test"])
    return y_pred
def svm_anomalies(train_data, train_oids, test_data, test_oids):
    '''
    Function to detect anomalies given training data.

    Keyword Args:
        train_data - training data
        train_oids - overflight ids for training data
        test_data - testing data
        test_oids - overflight ids for testing data

    Returns:
        two dictionaries of oid -> anomaly (-1 or 1)
    '''
    # Note: `degree` only applies to the 'poly' kernel; it is ignored for 'rbf'.
    OC_SVM = OneClassSVM(kernel="rbf", degree=20)
    train_anomalies = OC_SVM.fit_predict(train_data)
    test_anomalies = OC_SVM.predict(test_data)
    train_dict = {}
    test_dict = {}
    for i in range(len(train_oids)):
        train_dict[train_oids[i]] = train_anomalies[i]
    for i in range(len(test_oids)):
        test_dict[test_oids[i]] = test_anomalies[i]
    return train_dict, test_dict
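
# Hypothetical usage sketch (not from the original source): the overflight
# ids and synthetic data are made up for illustration; assumes OneClassSVM
# is imported in scope.
import numpy as np

rng = np.random.RandomState(0)
train = rng.normal(size=(100, 4))
test = rng.normal(size=(20, 4))
train_ids = ['of{}'.format(i) for i in range(100)]
test_ids = ['of{}'.format(i) for i in range(100, 120)]

train_dict, test_dict = svm_anomalies(train, train_ids, test, test_ids)
print(sum(1 for v in test_dict.values() if v == -1), 'test anomalies')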
class OCSVM(object):
    def __init__(self, file_name, config):
        self.dataset = config.dataset
        self.file_name = file_name
        self.x_dim = config.x_dim
        self.kernel = config.kernel
        self.degree = config.degree
        self.gamma = config.gamma
        self.coef0 = config.coef0
        self.tol = config.tol
        self.nu = config.nu
        self.pid = config.pid
        self.model = OneClassSVM(kernel=self.kernel, degree=self.degree,
                                 gamma=self.gamma, coef0=self.coef0,
                                 tol=self.tol, nu=self.nu)

    def fit(self, train_input, train_label, test_input, test_label):
        # Perform fit on X and return labels for X:
        # -1 for outliers and 1 for inliers.
        y_pred = self.model.fit_predict(train_input)
        decision_function = self.model.decision_function(train_input)
        ocsvm_output = OCSVMOutput(y_hat=y_pred,
                                   decision_function=decision_function)
        return ocsvm_output
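
# Hypothetical usage sketch (not from the original source): a stand-in
# config object for the OCSVM wrapper above. Assumes the project's
# OCSVMOutput container and sklearn imports are in scope; SimpleNamespace
# merely mimics whatever config class the project actually uses.
from types import SimpleNamespace
import numpy as np

config = SimpleNamespace(dataset='demo', x_dim=4, kernel='rbf', degree=3,
                         gamma='scale', coef0=0.0, tol=1e-3, nu=0.1, pid=0)
wrapper = OCSVM(file_name='demo.csv', config=config)

rng = np.random.RandomState(0)
X = rng.normal(size=(200, 4))
out = wrapper.fit(X, None, None, None)  # label/test args are unused by fit
print((out.y_hat == -1).sum(), 'outliers flagged')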
class OutlineCheck():
    '''
    Check outlier rows with a scikit-learn outlier model.
    '''

    def __init__(self, clf=None, **kwargs):
        '''
        Defaults to OneClassSVM; alternatively pass another scikit-learn
        outlier model class (IsolationForest / LocalOutlierFactor).
        '''
        if clf is None:
            self.clf = OneClassSVM(**kwargs)
        else:
            self.clf = clf(**kwargs)

    def get_detail(self, X: pd.DataFrame):
        pre_result = self.clf.fit_predict(X)
        inliers = X[pre_result == 1]
        return self.clf, inliers

    @staticmethod
    def get_predict_detail(clf, X):
        '''
        Params:
            clf: the fitted model returned by get_detail
            X: the input data to check
        '''
        pre = clf.predict(X)
        if X.shape[0] > 1:
            return X[pre == 1]
        else:
            return pre[0]
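
# Hypothetical usage sketch (not from the original source) for OutlineCheck
# with the default OneClassSVM; the demo data is an assumption.
import numpy as np
import pandas as pd

rng = np.random.RandomState(0)
X_demo = pd.DataFrame(rng.normal(size=(150, 2)), columns=['f1', 'f2'])

checker = OutlineCheck(nu=0.05)
fitted_clf, inliers = checker.get_detail(X_demo)
print(len(inliers), 'inlier rows kept out of', len(X_demo))

# Reuse the fitted model on new rows
new_rows = pd.DataFrame([[0.1, -0.2], [9.0, 9.0]], columns=['f1', 'f2'])
print(OutlineCheck.get_predict_detail(fitted_clf, new_rows))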
def dixon():
    try:
        data = np.array(request.json["Data"])
        params = request.json['Params']
        kernel = "rbf"
        degree = 3
        gamma = 'scale'
        coef = 0
        if "kernel" in params:
            kernel = params["kernel"]
        if "degree" in params:
            degree = params["degree"]
        if "gamma" in params:
            gamma = params["gamma"]
        if "coef" in params:
            coef = params["coef"]
        clf = OneClassSVM(kernel=kernel, degree=degree, gamma=gamma,
                          coef0=coef)
        indices = clf.fit_predict(data)
        indices = [0 if x == 1 else 1 for x in indices.tolist()]
    except Exception as e:
        return jsonify({"message": str(e)}), 400
    return jsonify({"message": "OK", "data": indices})
import copy

import numpy as np
from sklearn.svm import OneClassSVM
from tqdm import tqdm


class OCSVMDetector(IAnomaly):
    def __init__(self, slidingWindowSize=None):
        self.slidingWindowSize = slidingWindowSize
        self.receivedSamplesNumber = 0
        self.currentSamples = []
        self.clf = OneClassSVM(nu=0.1, kernel="rbf", gamma='auto')
        self.dictHeaders = ['detectionCode', 'anomalyLikelihood', 'anomalyScore']

    def appendNewData(self, sample):
        self.currentSamples.append(float(sample["Resistance"]))
        self.receivedSamplesNumber = self.receivedSamplesNumber + 1

    def detect(self, new_data):
        if self.receivedSamplesNumber < self.slidingWindowSize - 1:
            # Append all of the stabilization samples
            self.appendNewData(new_data)
            return dict(zip(self.dictHeaders, [-1, -1, -1]))
        else:
            # Remove one from current samples and add new data
            self.currentSamples.pop(0)
            self.appendNewData(new_data)
            window = np.array(self.currentSamples).reshape(-1, 1)
            result = self.clf.fit_predict(window)[-1]
            likelihood = self.clf.score_samples(window)[-1]
            return dict(zip(self.dictHeaders, [result, likelihood, -1]))

    def detectFromList(self, data):
        results = []
        print("Detecting anomalies for {} samples of data".format(len(data)))
        for data_point in tqdm(data):
            detection = self.detect(data_point)
            result = copy.copy(data_point)
            result.update(detection)
            results.append(result)
        return results
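
# Hypothetical usage sketch (not from the original source): feed synthetic
# resistance readings through the sliding-window detector above; the window
# size and reading values are assumptions.
detector = OCSVMDetector(slidingWindowSize=50)
readings = [{"Resistance": 1000 + (i % 7)} for i in range(60)]
results = detector.detectFromList(readings)
print(results[-1]['detectionCode'], results[-1]['anomalyLikelihood'])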
def find_anomaly_svm(matrix, per_out):
    detector = OneClassSVM(kernel='precomputed', nu=per_out)
    inlines = detector.fit_predict(matrix)
    result = []
    for i, res in enumerate(inlines):
        if res == -1:
            result.append(i)
    return result
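
# Hypothetical usage sketch (not from the original source): with
# kernel='precomputed', the input must be a square Gram matrix, e.g. an RBF
# kernel over the raw feature rows; the data below is an assumption.
import numpy as np
from sklearn.metrics.pairwise import rbf_kernel

rng = np.random.RandomState(0)
X = rng.normal(size=(50, 3))
X[0] += 6.0  # plant an outlier

gram = rbf_kernel(X, X)  # shape (n_samples, n_samples)
print(find_anomaly_svm(gram, per_out=0.05))  # indices flagged as outliers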
def OneClassSVMFunction(X, Y):
    clf = OneClassSVM()
    pred = clf.fit_predict(X)
    deleted = []
    for i in range(len(pred)):
        if pred[i] < 0:
            deleted.append(i)
    X_new = np.delete(X, deleted, 0)
    Y_new = np.delete(Y, deleted)
    return X_new, Y_new, clf
def SVM(data):
    svm = OneClassSVM()
    labels = svm.fit_predict(data)
    # Map -1 (outlier) to 0 in place; inliers stay 1. The original loop only
    # rebound the loop variable and never modified the array.
    labels[labels < 0] = 0
    return labels
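
# Hypothetical usage sketch (not from the original source): the synthetic
# data is an assumption; the returned labels are 0 (outlier) / 1 (inlier).
import numpy as np

rng = np.random.RandomState(0)
data = np.vstack([rng.normal(size=(95, 2)), rng.normal(6, 0.5, size=(5, 2))])
labels = SVM(data)
print('outliers:', int((labels == 0).sum()), 'inliers:', int((labels == 1).sum()))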
class OutlierDetectionTransform:
    def __init__(self):
        self.__is_algorithm_set = False
        self.__algorithm = None
        self.__random_state = 1
        self.__jobs = -1
        self.__outliers_mask = []

    def set_random_state(self, state):
        self.__random_state = state

    def set_jobs(self, jobs):
        self.__jobs = jobs

    def set_isolation_forest(self, contamination_val):
        self.__is_algorithm_set = True
        # Note: `behaviour` was deprecated in scikit-learn 0.22 and removed
        # in 0.24; drop it on newer versions.
        self.__algorithm = IsolationForest(behaviour='new',
                                           n_jobs=self.__jobs,
                                           random_state=self.__random_state,
                                           contamination=contamination_val)

    def set_elliptic_envelope(self, contamination_val,
                              support_fraction_val=None):
        self.__is_algorithm_set = True
        self.__algorithm = EllipticEnvelope(
            support_fraction=support_fraction_val,
            contamination=contamination_val,
            random_state=self.__random_state)

    def set_OCSVM(self, nu_val=0.5, gamma_val='scale', kernel_val='rbf',
                  coef0_val=0.0):
        self.__is_algorithm_set = True
        self.__algorithm = OneClassSVM(nu=nu_val, kernel=kernel_val,
                                       coef0=coef0_val, gamma=gamma_val,
                                       shrinking=True)

    def filter_data(self, original_target, original_data):  # , compare_data):
        if not self.__is_algorithm_set:
            return None, None
        self.__outliers_mask = self.__algorithm.fit_predict(original_data)
        data_filtered = original_data[self.__outliers_mask > 0]
        target_filtered = original_target[self.__outliers_mask > 0]
        # compare_filtered = compare_data[outliers_mask > 0]
        return target_filtered.copy(), data_filtered.copy()  # , compare_filtered

    def mask_data(self, original_target, original_data):  # , compare_data):
        if len(self.__outliers_mask) == 0:
            return None, None
        data_filtered = original_data[self.__outliers_mask > 0]
        target_filtered = original_target[self.__outliers_mask > 0]
        # compare_filtered = compare_data[outliers_mask > 0]
        return target_filtered.copy(), data_filtered.copy()  # , compare_filtered
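
# Hypothetical usage sketch (not from the original source) for the transform
# wrapper above; assumes the sklearn imports used by the class are in scope,
# and the arrays are made up for illustration.
import numpy as np

rng = np.random.RandomState(0)
X = rng.normal(size=(300, 4))
y = rng.normal(size=300)

odt = OutlierDetectionTransform()
odt.set_OCSVM(nu_val=0.05)
y_f, X_f = odt.filter_data(y, X)  # argument order is (target, data)
print(X.shape, '->', X_f.shape)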
def cluster(folderName, vectorsize, clusterType):
    corpus = loadXES.get_doc_XES_tagged(folderName + '.xes')
    print('Data Loading finished, ', str(len(corpus)), ' traces found.')
    model = gensim.models.Doc2Vec.load(
        'output/' + folderName + 'T2VVS' + str(vectorsize) + '.model')
    vectors = []
    NUM_CLUSTERS = 5
    print("inferring vectors")
    for doc_id in range(len(corpus)):
        inferred_vector = model.infer_vector(corpus[doc_id].words)
        vectors.append(inferred_vector)
    print("done")
    if clusterType == "KMeans":
        kclusterer = KMeansClusterer(
            NUM_CLUSTERS,
            distance=nltk.cluster.util.cosine_distance,
            repeats=25)
        assigned_clusters = kclusterer.cluster(vectors, assign_clusters=True)
    elif clusterType == "HierWard":
        ward = AgglomerativeClustering(n_clusters=NUM_CLUSTERS,
                                       linkage='ward').fit(vectors)
        assigned_clusters = ward.labels_
    elif clusterType == "OCSVM":
        # Note: OneClassSVM is an outlier detector, not a clusterer; the
        # "clusters" here are just the -1 (outlier) / 1 (inlier) labels.
        ocsvm = OneClassSVM()
        assigned_clusters = ocsvm.fit_predict(vectors)
    else:
        print(clusterType,
              " is not a predefined cluster type. Please use 'KMeans' or "
              "'HierWard', or create a definition for ", clusterType)
        return
    trace_list = loadXES.get_trace_names(folderName + ".xes")
    clusterResult = {}
    for doc_id in range(len(corpus)):
        clusterResult[trace_list[doc_id]] = assigned_clusters[doc_id]
    resultFile = open(
        'output/' + folderName + 'T2VVS' + str(vectorsize) + clusterType +
        '.csv', 'w')
    for doc_id in range(len(corpus)):
        resultFile.write(trace_list[doc_id] + ',' +
                         str(assigned_clusters[doc_id]) + "\n")
    resultFile.close()
    print("done with ", clusterType, " on event log ", folderName)
def process_outliers(X_train, y_train):
    # only select numeric columns
    numerics = [
        'uint8', 'int16', 'int32', 'int64', 'float16', 'float32', 'float64'
    ]
    X_train = X_train.select_dtypes(include=numerics)
    # identify outliers in the training dataset
    outlier_predictor = OneClassSVM(nu=0.02)
    y_hat = outlier_predictor.fit_predict(X_train)
    unique, counts = np.unique(y_hat, return_counts=True)
    # select all rows that are not outliers
    outlier_mask = y_hat != -1
    return X_train[outlier_mask], y_train[outlier_mask]
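
# Hypothetical usage sketch (not from the original source): the frame below
# is an assumption; the non-numeric column is dropped by select_dtypes
# before the SVM sees the data.
import numpy as np
import pandas as pd

rng = np.random.RandomState(0)
X = pd.DataFrame({'a': rng.normal(size=100),
                  'b': rng.normal(size=100),
                  'c': ['x'] * 100})  # non-numeric, will be dropped
y = pd.Series(rng.normal(size=100))

X_clean, y_clean = process_outliers(X, y)
print(X_clean.shape, y_clean.shape)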
def one_class_svm(training_vectors, test_vectors_clean, test_vectors_anomalous):
    """Predict outliers using a One-Class SVM."""
    print("Starting One Class SVM...")
    # Fitting model for novelty predictions
    svm = OneClassSVM(gamma='auto', kernel='rbf', nu=0.05)
    print("Fitting with Parameters: ", svm.get_params())
    result_training = svm.fit_predict(training_vectors)
    print("Fitting successful!")
    print("Starting Prediction...")
    # Predict returns 1 for inliers and -1 for outliers
    result_clean = svm.predict(test_vectors_clean)
    result_anomalous = svm.predict(test_vectors_anomalous)
    print("Predicting successful!")
    print("**************************")
    return result_clean, result_anomalous, result_training
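
# Hypothetical usage sketch (not from the original source) with synthetic
# clean/anomalous vectors; assumes OneClassSVM is imported in scope.
import numpy as np

rng = np.random.RandomState(0)
train = rng.normal(size=(300, 5))
clean = rng.normal(size=(50, 5))
anomalous = rng.normal(5, 1, size=(50, 5))

res_clean, res_anom, res_train = one_class_svm(train, clean, anomalous)
print('flagged clean:', int((res_clean == -1).sum()),
      '| flagged anomalous:', int((res_anom == -1).sum()))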
param_grid = {
    'nu': [0.001, 0.0015, 0.002, 0.003, 0.005, 0.01],
    'gamma': ['scale', 0.0005, 0.001, 0.0025, 0.005, 0.01]
}
# param_grid = {'nu': np.arange(0.0001, 0.01, 0.0005),
#               'gamma': np.arange(0.0005, 0.01, 0.001)}
# param_grid = {'nu': [0.0015, 0.03], 'gamma': [0.2, 0.3]}
grid = ParameterGrid(param_grid)
pred_list = []
for params in tqdm(grid):
    contamination = params['nu']
    gamma = params['gamma']
    ocsvm_model = OneClassSVM(**params)
    ocsvm_pred = ocsvm_model.fit_predict(X)
    pred_list.append((ocsvm_pred, params))

for pred, params in tqdm(pred_list):
    plt.plot(df.index, df.value, label='Original data', linestyle='--',
             alpha=0.5)
    tdf = df.loc[X.index, :]
    plt.plot(tdf.index, tdf.value, label='Used data', color='C0')
    anomalies = tdf.value[pred == -1]
    plt.plot(anomalies, 'x', label="Predicted anomalies", markersize=10)
    plt.plot(labels, tdf.loc[labels],
ax.scatter(Xs[nb_samples:, 0],
           Xs[nb_samples:, 1], marker='^', s=80, label='Test samples')
ax.scatter(Xs[:nb_samples, 0], Xs[:nb_samples, 1], label='Inliers')
ax.set_xlabel('Age', fontsize=14)
ax.set_ylabel('Height', fontsize=14)
ax.legend(fontsize=14)
plt.show()

# Train the One-Class SVM
ocsvm = OneClassSVM(kernel='rbf', gamma='scale', nu=0.2)
Ys = ocsvm.fit_predict(Xs)

# Show the results
fig, ax = plt.subplots(1, 2, figsize=(22, 10), sharey=True)
ax[0].scatter(Xs[Ys == -1, 0], Xs[Ys == -1, 1], marker='x', s=100,
              label='Outliers')
ax[0].scatter(Xs[Ys == 1, 0], Xs[Ys == 1, 1], marker='o', label='Inliers')
ax[1].scatter(Xs[Ys == -1, 0], Xs[Ys == -1, 1], marker='x', s=100)
ax[0].set_xlabel('Age', fontsize=16)
ax[0].set_ylabel('Height', fontsize=16)
# add the data to the main
df['anomaly26'] = pd.Series(model.predict(data))
df['anomaly26'] = df['anomaly26'].map({1: 0, -1: 1})
print(df['anomaly26'].value_counts())

# test
journeys[(journeys.Journey_ID == 21)].plot(x='timestamp', y='acceleration')
# probably a crash since it is a long period of immobility followed by slow
# movement

outliers_fraction = 0.0001
min_max_scaler = preprocessing.StandardScaler()
df = journeys
data = journeys.loc[:, ('x', 'y', 'z')]
np_scaled = min_max_scaler.fit_transform(data)

# fit a Local Outlier Factor model (not a one-class SVM, despite the
# original comment; nu=0.95 * outliers_fraction + 0.05 was the SVM setting)
model = LocalOutlierFactor(n_neighbors=5, contamination=outliers_fraction)
data = pd.DataFrame(np_scaled)
model.fit(data)

# add the data to the main
df['anomaly26'] = pd.Series(model.fit_predict(data))
df['anomaly26'] = df['anomaly26'].map({1: 0, -1: 1})
print(df['anomaly26'].value_counts())

a = df.loc[df['anomaly26'] == 1,
           ['timestamp', 'acceleration', 'speed', 'Journey_ID']]  # anomaly
a
def indices_inliers_by_svm(data, nu):
    ocs = OneClassSVM(nu=nu)
    indices_outliers = ocs.fit_predict(data)
    # select all rows that are not outliers
    mask = indices_outliers != -1
    return mask
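
# Hypothetical usage sketch (not from the original source): apply the
# boolean mask to drop outlier rows; the data is an assumption.
import numpy as np

rng = np.random.RandomState(0)
data = rng.normal(size=(200, 4))
mask = indices_inliers_by_svm(data, nu=0.05)
data_clean = data[mask]
print(data.shape, '->', data_clean.shape)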
x2_l2 = np.concatenate((y1h_l2, y1_l2), axis=1, out=None)

from sklearn.svm import OneClassSVM
from sklearn.ensemble import IsolationForest

acc_1svm = []
acc_h_1svm = []
acc_l_1svm = []
TPR_1svm = []
FPR_1svm = []
for run in range(100):
    print(run)
    clf = OneClassSVM(nu=0.001, kernel='rbf', gamma=5.3)
    # clf = IsolationForest(n_estimators=300, max_samples=100000,
    #                       contamination=0.001, max_features=1.0,
    #                       n_jobs=10, behaviour='new')
    y_pred_train = clf.fit_predict(x2_h1)
    y_pred = clf.predict(np.concatenate((x2_h2, x2_l1, x2_l2), axis=0))
    y2_tst = np.concatenate(
        (np.ones(x2_h2.shape[0]),
         -1 * np.ones(x2_l1.shape[0]),
         -1 * np.ones(x2_l2.shape[0])))
    confm = confusion_matrix(-1 * y2_tst, -1 * y_pred)
    print('Confusion Matrix : \n', confm)
    tn, fp, fn, tp = confm.ravel()
    total1 = np.sum(np.sum(confm))
    acc_1svm = np.append(acc_1svm, (confm[0, 0] + confm[1, 1]) / total1)
    acc_h_1svm = np.append(acc_h_1svm,
                           confm[0, 0] / (confm[0, 0] + confm[0, 1]))
    acc_l_1svm = np.append(acc_l_1svm,
                           confm[1, 1] / (confm[1, 1] + confm[1, 0]))
    TPR_1svm = np.append(TPR_1svm, tp / (tp + fn))
def svm_anomaly_score(df_data):
    """Compute per-feature anomaly scores with a One-Class SVM."""
    # must calibrate it for all measurements
    outliers = []
    for label, content in df_data.items():
        df_data[f'{label}'] = df_data[f'{label}'].fillna(0)
        svm = OneClassSVM(kernel='rbf', gamma=0.00001, nu=0.03)
        pred = svm.fit_predict(df_data[f'{label}'].values.reshape(-1, 1))
        scores = svm.score_samples(df_data[f'{label}'].values.reshape(-1, 1))
        thresh = quantile(scores, 0.008)
        feature_score = []
        anom = []
        inliers_feature_score = []
        inliers = []
        inlier_indices = []
        outlier_indices = []
        for i, j in enumerate(scores):
            if j <= thresh:
                outliers.append(i)
                anom.append(j)
                feature_score.append(df_data[f'{label}'][i])
                outlier_indices.append(i)
            else:
                inliers.append(j)
                inliers_feature_score.append(df_data[f'{label}'][i])
                inlier_indices.append(i)
        inliers_pd = pd.DataFrame({
            'inliers': inliers,
            'inliers_feature_score': inliers_feature_score,
            'inliers_index': inlier_indices
        })
        pd_anom = pd.DataFrame({
            'AnomScore': anom,
            'FeatureScore': feature_score,
            'outlier_index': outlier_indices
        })
        fig = go.Figure()
        fig.update_layout(
            title={'text': f"SVM Detection of {label}", 'y': 0.97, 'x': 0.5},
            paper_bgcolor='white',
            plot_bgcolor="rgb(211, 216, 230)",
            # xaxis_title=" ",
            yaxis_title="Anomaly Score",
            font=dict(family="Courier New, monospace", size=50,
                      color="rgb(10, 16, 87)"),
            title_font_color='rgb(145, 0, 0)',
            shapes=[
                dict(type="line", xref="x", yref="y",
                     x0=df_data[f'{label}'].min(), y0=thresh,
                     x1=df_data[f'{label}'].max(), y1=thresh,
                     opacity=1, line=dict(color='blue', dash='dot'))
            ])
        fig.add_trace(
            go.Scatter(x=inliers_pd['inliers_feature_score'],
                       y=inliers_pd['inliers'],
                       mode='markers',
                       marker=dict(size=6, color='rgb(0, 0, 0)'),
                       name='Normal',
                       marker_symbol='circle'))
        fig.add_trace(
            go.Scatter(x=pd_anom['FeatureScore'],
                       y=pd_anom['AnomScore'],
                       mode='markers',
                       marker=dict(size=14, color='rgb(255, 0, 0)'),
                       name='Abnormal',
                       marker_symbol=206))
        fig.show()
        plotly.io.write_image(fig, f'SVM_images/{label}.png',
                              width=2560, height=1440)
    return outliers
# load the dataset
url = 'https://raw.githubusercontent.com/jbrownlee/Datasets/master/housing.csv'
df = read_csv(url, header=None)
# retrieve the array
data = df.values
# split into input and output elements
X, y = data[:, :-1], data[:, -1]
# split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33,
                                                    random_state=1)
# summarize the shape of the training dataset
print(X_train.shape, y_train.shape)
# identify outliers in the training dataset
ee = OneClassSVM(nu=0.01)
yhat = ee.fit_predict(X_train)
# select all rows that are not outliers
mask = yhat != -1
X_train, y_train = X_train[mask, :], y_train[mask]
# summarize the shape of the updated training dataset
print(X_train.shape, y_train.shape)
# fit the model
model = LinearRegression()
model.fit(X_train, y_train)
# evaluate the model
yhat = model.predict(X_test)
# evaluate predictions
mae = mean_absolute_error(y_test, yhat)
print('MAE: %.3f' % mae)
ii.fit(dataset[features])  # Error occurs here.
dataset['outlier'] = ii.predict(dataset[features])
del ii
print(dataset[dataset['outlier'] == -1])

# IsolationForest
from sklearn.ensemble import IsolationForest
ii = IsolationForest(max_samples=62,
                     contamination=0.25,
                     random_state=np.random.RandomState(42))
print("Fit data")
ii.fit(dataset[features])  # Error occurs here.
dataset['outlier'] = ii.predict(dataset[features])
del ii
print(dataset[dataset['outlier'] == -1])

# LocalOutlierFactor
from sklearn.neighbors import LocalOutlierFactor
ii = LocalOutlierFactor(n_neighbors=35, contamination=0.25)
dataset['outlier'] = ii.fit_predict(dataset[features])
del ii
print(dataset[dataset['outlier'] == -1])
def get_svm(db: pd.DataFrame) -> np.ndarray:
    # Returns a boolean mask, True where a row is an outlier
    # (the original annotation said `list`, but a numpy array is returned).
    ee = OneClassSVM(nu=0.01)
    yhat_svm = ee.fit_predict(db)
    return yhat_svm == -1
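
# Hypothetical usage sketch (not from the original source): get_svm returns
# a boolean outlier mask; the frame is an assumption.
import numpy as np
import pandas as pd

rng = np.random.RandomState(0)
db = pd.DataFrame(rng.normal(size=(100, 3)))
outlier_mask = get_svm(db)
print(db[outlier_mask])  # rows flagged as outliers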
# train the model
outliers_fraction = 0.01
model = OneClassSVM(nu=0.95 * outliers_fraction)
model.fit(data)

# add the data to the main
# df['anomaly'] = Series(model.predict(data))
# df['anomaly'] = df['anomaly'].map({1: 0, -1: 1})
# print(df['anomaly'].value_counts())

data_test = df_test[['cpu']]
# standardize test data
min_max_scaler = preprocessing.StandardScaler()
np_scaled = min_max_scaler.fit_transform(data_test)
data_test = DataFrame(np_scaled)

# test model
# Note: fit_predict refits the SVM on the test window; the commented-out
# predict() line below would instead reuse the model trained above.
df_test['anomaly'] = Series(model.fit_predict(data_test))
# df_test['anomaly'] = Series(model.predict(data_test))
df_test['anomaly'] = df_test['anomaly'].map({1: 0, -1: 1})
print(df_test['anomaly'].value_counts())

# visualisation of anomaly throughout time
fig, ax = plt.subplots()
# a = df.loc[df['anomaly'] == 1, ['time', 'cpu']]
# ax.plot(df['time'], df['cpu'], color='blue')
# ax.scatter(a['time'], a['cpu'], color='red')
# plt.show()
b = df_test.loc[df_test['anomaly'] == 1, ['time', 'cpu']]
ax.plot(df_test['time'], df_test['cpu'], color='blue')
ax.scatter(b['time'], b['cpu'], color='red')
plt.show()
def objective(space):
    params = {'nu': space['nu'], 'gamma': space['gamma']}
    estimator = OneClassSVM(cache_size=2048, kernel='rbf', **params)
    prediction = estimator.fit_predict(self.features)
    # Higher Calinski-Harabasz is better, so negate it for minimization.
    score = -metrics.calinski_harabasz_score(self.features, prediction)
    return {'loss': score, 'status': STATUS_OK}
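
# Hypothetical usage sketch (not from the original source): how this
# objective would typically be driven by hyperopt. The search space bounds
# are assumptions, and `objective` is assumed to run in a context where
# self.features is available (e.g. as a closure inside the class).
from hyperopt import STATUS_OK, fmin, hp, tpe

space = {
    'nu': hp.uniform('nu', 0.001, 0.5),
    'gamma': hp.loguniform('gamma', -10, 0),
}
best = fmin(fn=objective, space=space, algo=tpe.suggest, max_evals=50)
print(best)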
def main():
    '''main'''
    total_start_time = time.time()
    config = load_yaml(CONFIG_PATH)
    metric_path = config['metric_path']
    model_path = config['model_path']
    processed_path = config['processed_path']
    trained_path = config['trained_path']

    feature_df = pd.read_csv(processed_path)
    feature_df['StartTime'] = pd.to_datetime(feature_df['StartTime'])
    feature_df.loc[feature_df.Label == 0, 'Label'] = -1
    feature_df.Proto_Int = feature_df.Proto_Int.astype('category')
    feature_df.Sport_Int = feature_df.Sport_Int.astype('category')
    feature_df.Dir_Int = feature_df.Dir_Int.astype('category')
    feature_df.Dport_Int = feature_df.Dport_Int.astype('category')
    feature_df.State_Int = feature_df.State_Int.astype('category')

    malicious_df = feature_df.loc[feature_df.Label == 1]
    mal_forward_df = malicious_df.loc[malicious_df.is_fwd == 1]
    mal_back_df = malicious_df.loc[malicious_df.is_fwd == 0]
    benign_df = feature_df.loc[feature_df.Label == -1]
    del feature_df, malicious_df

    X_fwd_train, X_fwd_test, y_fwd_train, y_fwd_test = train_test_split(
        mal_forward_df, mal_forward_df['Label'], test_size=0.2, random_state=0)
    X_bwd_train, X_bwd_test, y_bwd_train, y_bwd_test = train_test_split(
        mal_back_df, mal_back_df['Label'], test_size=0.2, random_state=0)
    del mal_forward_df, mal_back_df

    X_train = pd.concat([X_fwd_train, X_bwd_train])
    X_test = pd.concat([X_fwd_test, X_bwd_test])
    X_test = pd.concat([X_test, benign_df])
    y_train = X_train.Label
    y_test = X_test.Label
    del X_fwd_train, X_fwd_test, y_fwd_train, y_fwd_test
    del X_bwd_train, X_bwd_test, y_bwd_train, y_bwd_test
    del benign_df

    # Hyper-tune One Class
    # sample_size = 100000
    # if len(X_train) < sample_size:
    #     sample_size = len(X_train)
    # X_train_sample = X_train.sample(sample_size, random_state=0)
    # y_train_sample = X_train_sample.Label
    # start_time = time.time()
    # print(f'Hyper Tune with Size {sample_size}')
    # oc_params = tune_oneclass(df_train_subset(X_train_sample),
    #                           y_train_sample, 'f1')
    # print(f'Time (param search) {sample_size} size. 3 Folds. '
    #       f'18 tot Fits: {time.time()-start_time}')
    oc_kernel = 'rbf'
    oc_nu = 1e-2
    oc_gamma = 1e-6
    oc_clf = OneClassSVM(kernel=oc_kernel, nu=oc_nu, gamma=oc_gamma,
                         cache_size=7000, verbose=True)
    oc_model_name = 'oneclass'
    oc_scaler = preprocessing.StandardScaler()
    oc_scaler.fit(df_train_subset(X_train))
    save_model(oc_scaler, model_path, 'oc_scaler')

    # Fit One Class
    start_time = time.time()
    oc_predict_train = oc_clf.fit_predict(
        oc_scaler.transform(df_train_subset(X_train)), y=y_train)
    print(f'Time One Class Train Size {len(X_train)}: '
          f'{time.time() - start_time}')
    save_model(oc_clf, model_path, oc_model_name)

    # Confusion matrix
    save_confuse_matrix(y_train, oc_predict_train, metric_path,
                        oc_model_name, 'train')
    oc_predict_test = oc_clf.predict(
        oc_scaler.transform(df_train_subset(X_test)))
    save_confuse_matrix(y_test, oc_predict_test, metric_path,
                        oc_model_name, 'test')

    # Performance
    save_performance(y_train, oc_predict_train, metric_path, oc_model_name,
                     'train')
    save_performance(y_test, oc_predict_test, metric_path, oc_model_name,
                     'test')

    # Get confidence scores
    start_time = time.time()
    data_f = pd.concat([X_train, X_test])
    data_f.sort_values('StartTime', inplace=True)
    oc_conf_score = oc_clf.decision_function(
        oc_scaler.transform(df_train_subset(data_f)))
    print(f'Time Confidence Scores: {time.time() - start_time}')
    del data_f, oc_kernel, oc_nu, oc_gamma, oc_clf, oc_scaler

    # Saving to CSV
    start_time = time.time()
    x_test_label = X_test['Label']
    X_test.drop(columns=['Label'], inplace=True, axis=1)
    X_test['Label'] = x_test_label
    X_test['Predicted_Label'] = oc_predict_test
    mal_train_label = X_train['Label']
    X_train.drop(columns=['Label'], inplace=True, axis=1)
    X_train['Label'] = mal_train_label
    X_train['Predicted_Label'] = oc_predict_train
    final_df = pd.concat([X_train, X_test])
    del X_train, X_test, y_train, y_test
    final_df.sort_values('StartTime', inplace=True)
    final_df['Confidence_Score'] = oc_conf_score
    makedirs(dirname(f'{trained_path}'), exist_ok=True)
    final_df.to_csv(f'{trained_path}{oc_model_name}.csv', index=False)
    print(f'Saving one_class_features csv: {time.time() - start_time}')

    # Train Logistic Regression
    # Hyper-tune with 10 percent of the data.
    # start_time = time.time()
    # lr_train_size = 0.1
    # if len(final_df) < 100000:
    #     lr_train_size = 0.95
    # final_df, X_test_sample, y_train_s, y_test_s = train_test_split(
    #     final_df, final_df.Label, train_size=lr_train_size,
    #     stratify=final_df.Label)
    # del X_test_sample, y_train_s, y_test_s
    # lr_params = tune_log_reg(df_train_subset(final_df), final_df.Label,
    #                          'average_precision')
    # print(f'Time Hyper Tuning LR: {time.time() - start_time}')
    lr_params = {'C': 69.54618247583652, 'tol': 0.0009555227427965779}
    lr_clf = LogisticRegression(solver='saga', penalty='l2', dual=False,
                                tol=lr_params['tol'], C=lr_params['C'],
                                max_iter=80000)
    lr_model_name = 'lr'
    lr_scaler = preprocessing.StandardScaler()
    lr_scaler.fit(df_train_subset(final_df))
    # Save LR scaler
    save_model(lr_scaler, model_path, 'lr_scaler')

    # Fit Logistic Regression
    start_time = time.time()
    lr_train_transformed = lr_scaler.transform(df_train_subset(final_df))
    lr_clf.fit(lr_train_transformed, y=final_df.Label)
    save_model(lr_clf, model_path, lr_model_name)
    print(f'Time Train LR Size {len(final_df)}: {time.time() - start_time}')

    # Performance (written afterwards)
    lr_predicted = lr_clf.predict(lr_train_transformed)
    save_performance(final_df.Label, lr_predicted, metric_path,
                     lr_model_name, 'train')
    # Confusion matrix
    save_confuse_matrix(final_df.Label, lr_predicted, metric_path,
                        lr_model_name, 'train')

    # Normalize confidence score
    start_time = time.time()
    ncs = normalize_confidence_score(lr_clf, lr_scaler,
                                     df_train_subset(final_df))
    final_df['LR_Predicted'] = lr_predicted
    lr_classes = lr_clf.classes_
    final_df[f'CS_LR_{lr_classes[0]}'] = [prob[0] for prob in ncs]
    final_df[f'CS_LR_{lr_classes[1]}'] = [prob[1] for prob in ncs]
    print(f'Time Normalize Conf Score: {time.time() - start_time}')

    # Save to CSV
    start_time = time.time()
    final_df.to_csv(f'{trained_path}{lr_model_name}.csv', index=False)
    print(f'Time Saving Normalized DF to CSV: {time.time() - start_time}')
    print(f'Training Complete - Time Elapsed: '
          f'{time.time() - total_start_time}')
# identify outliers in training dataset with the Isolation Forest algorithm
# iso = IsolationForest(contamination=0.1)
# yhat = iso.fit_predict(X_train)  # Find outliers

# identify outliers with Minimum Covariance Determinant
# ee = EllipticEnvelope(contamination=0.01)
# yhat = ee.fit_predict(X_train)

# identify outliers with Local Outlier Factor
# lof = LocalOutlierFactor()
# yhat = lof.fit_predict(X_train)

# identify outliers with One-Class SVM (Support Vector Machine)
ocs = OneClassSVM(nu=0.03)
yhat = ocs.fit_predict(X_train)

# select all rows that are not outliers
mask = yhat != -1
X_train, y_train = X_train[mask, :], y_train[mask]

# summarize the shape of the updated training dataset
print(X_train.shape, y_train.shape)

# fit the model
model = LinearRegression()
model.fit(X_train, y_train)

# evaluate the model
yhat = model.predict(X_test)