def calc():
    if not os.path.exists(RESULTS_FOLDER_PATH):
        os.makedirs(RESULTS_FOLDER_PATH)
    logfile = os.path.join(RESULTS_FOLDER_PATH, "log.log")
    if os.path.exists(logfile):
        os.remove(logfile)

    # logging setup: plain messages to file, light-grey messages to console
    level = logging.INFO
    formats = {
        "console": '\u001b[37m %(message)s\033[0m',
        "file": '%(message)s'
    }
    file_handler = logging.FileHandler(logfile, "x")
    console_handler = logging.StreamHandler()
    console_handler.setFormatter(logging.Formatter(formats["console"]))
    file_handler.setFormatter(logging.Formatter(formats["file"]))
    logger = logging.getLogger(__name__)
    logger.addHandler(console_handler)
    logger.addHandler(file_handler)
    logger.setLevel(level)

    # begin calc
    loaded_dataset = load_data(logger)
    logger.info("{} {}".format("loaded_dataset shape", loaded_dataset.shape))
    # loaded_dataset["Label"] = DATASET_NAME.upper()
    logger.info(loaded_dataset.head())
    loaded_dataset.info()

    dataset = None
    logger.info("{} {}".format("Dataset shape BEFORE preparation", loaded_dataset.shape))
    dataset = datasets.prepare_dataset(loaded_dataset,
                                       drop_columns=[
                                           "Flow Bytes/s", "Flow Packets/s", "Flow ID",
                                           "Source IP", "Destination IP", "Timestamp",
                                           "Fwd Header Length.1"
                                       ],
                                       shuffle=True,
                                       dropna_axis=[0, 1])
    loaded_dataset = None
    logger.info("{} {}".format("Dataset shape AFTER preparation", dataset.shape))

    xTest, yTest = datasets.separate_labels(dataset, encode=True)
    dataset = None

    logger.info('Dropping low-variance features')
    xTest = datasets.drop_variance(xTest)
    # standardScaler = StandardScaler()
    # xTestScaled = standardScaler.fit_transform(xTest)

    results = []
    clf = RandomForestClassifier(random_state=42, n_jobs=1)

    # validation curve over the number of trees
    param_name = "n_estimators"
    param_range = [i**2 for i in range(4, 24, 4)]
    logger.info(param_name)
    logger.info(param_range)
    training_score, test_score = validation_curve(clf, xTest, yTest,
                                                  param_name=param_name,
                                                  param_range=param_range,
                                                  scoring="roc_auc", cv=4,
                                                  verbose=1, n_jobs=-1)
    results.append([param_name, param_range, training_score, test_score])
    datasets.np_double_save(results, RESULTS_FOLDER_PATH, "results",
                            as_csv=True, as_npy=True)

    # validation curve over the maximum tree depth
    param_name = "max_depth"
    param_range = [2**i for i in range(1, 8)]
    logger.info(param_name)
    logger.info(param_range)
    training_score, test_score = validation_curve(clf, xTest, yTest,
                                                  param_name=param_name,
                                                  param_range=param_range,
                                                  scoring="roc_auc", cv=4,
                                                  verbose=1, n_jobs=-1)
    results.append([param_name, param_range, training_score, test_score])
    datasets.np_double_save(results, RESULTS_FOLDER_PATH, "results",
                            as_csv=True, as_npy=True)

    # validation curve over the minimum samples per leaf
    param_name = "min_samples_leaf"
    param_range = [2 * i for i in range(1, 21)]
    logger.info(param_name)
    logger.info(param_range)
    training_score, test_score = validation_curve(clf, xTest, yTest,
                                                  param_name=param_name,
                                                  param_range=param_range,
                                                  scoring="roc_auc", cv=4,
                                                  verbose=1, n_jobs=-1)
    results.append([param_name, param_range, training_score, test_score])
    datasets.np_double_save(results, RESULTS_FOLDER_PATH, "results",
                            as_csv=True, as_npy=True)

    # plot.plt_validation_curve(training_score, test_score, param_range)
    console_handler.close()
    file_handler.close()
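# --- Hedged sketch (not part of the original source): the commented-out
# plot.plt_validation_curve(...) call above suggests the saved score arrays are plotted
# later.  The project's plot module is not shown here; a minimal matplotlib version of
# such a plot could look like the helper below.  The function name, argument order, and
# styling are assumptions, not the repo's actual implementation.
import numpy as np
import matplotlib.pyplot as plt


def plt_validation_curve_sketch(train_scores, test_scores, param_range, param_name="param"):
    # validation_curve returns one row per parameter value and one column per CV fold,
    # so average (and take the spread) across the fold axis before plotting.
    train_mean, train_std = np.mean(train_scores, axis=1), np.std(train_scores, axis=1)
    test_mean, test_std = np.mean(test_scores, axis=1), np.std(test_scores, axis=1)
    plt.plot(param_range, train_mean, label="training score")
    plt.fill_between(param_range, train_mean - train_std, train_mean + train_std, alpha=0.2)
    plt.plot(param_range, test_mean, label="cross-validation score")
    plt.fill_between(param_range, test_mean - test_std, test_mean + test_std, alpha=0.2)
    plt.xlabel(param_name)
    plt.ylabel("ROC AUC")
    plt.legend()
    plt.show()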
def calc():
    if not os.path.exists(RESULTS_FOLDER_PATH):
        os.makedirs(RESULTS_FOLDER_PATH)
    logfile = os.path.join(RESULTS_FOLDER_PATH, "log.log")
    if os.path.exists(logfile):
        os.remove(logfile)

    # logging setup: plain messages to file, light-grey messages to console
    level = logging.INFO
    formats = {
        "console": '\u001b[37m %(message)s\033[0m',
        "file": '%(message)s'
    }
    file_handler = logging.FileHandler(logfile, "x")
    console_handler = logging.StreamHandler()
    console_handler.setFormatter(logging.Formatter(formats["console"]))
    file_handler.setFormatter(logging.Formatter(formats["file"]))
    logger = logging.getLogger(__name__)
    logger.addHandler(console_handler)
    logger.addHandler(file_handler)
    logger.setLevel(level)

    # begin calc
    loaded_dataset = load_data()
    logger.info("{} {}".format("loaded_dataset shape", loaded_dataset.shape))
    # loaded_dataset["Label"] = DATASET_NAME.upper()
    logger.info(loaded_dataset.head())
    loaded_dataset.info()
    logger.info(loaded_dataset['class'].value_counts())

    dataset = None
    logger.info("{} {}".format("Dataset shape BEFORE preparation", loaded_dataset.shape))
    dataset = datasets.prepare_dataset(
        loaded_dataset,
        # drop_columns=["Flow Bytes/s", "Flow Packets/s", "Fwd Header Length.1"],
        shuffle=True,
        dropna_axis=[0, 1])
    loaded_dataset = None
    logger.info("{} {}".format("Dataset shape AFTER preparation", dataset.shape))

    xTest, yTest = datasets.separate_labels(dataset, encode=True, column_name="class")
    xTest = xTest.select_dtypes(exclude=['object'])
    dataset = None

    logger.info('Scaling dataset')
    xTest = datasets.drop_variance(xTest)
    standardScaler = StandardScaler()
    xTestScaled = standardScaler.fit_transform(xTest)

    logger.info("Performing PCA")
    pca = PCA(random_state=42, n_components=0.95)  # keep 95% of the variance
    xTestPCA = pca.fit_transform(xTest)
    logger.info("Dataset shape with PCA {}".format(xTestPCA.shape))

    results_array = []
    scoring_list = ['roc_auc', 'f1', 'roc', 'precision', 'recall', 'confusion_matrix']

    def evaluate(name, estimator, features, save_name=None):
        # Cross-validate one estimator on one feature representation and record its scores.
        logger.info(name)
        results = scoring.cross_validate_scoring(estimator, features, yTest, cv=10,
                                                 scoring=scoring_list,
                                                 return_train_score=True)
        save_result2(results_array, results, save_name or name)
        logger.info(results)
        logger.info(results['confusion_matrix'])

    evaluate("Logistic Regression",
             LogisticRegression(verbose=0, n_jobs=-1, random_state=42, max_iter=1000),
             xTest)
    evaluate("Logistic Regression Scaled",
             LogisticRegression(verbose=0, n_jobs=-1, random_state=42, max_iter=1000),
             xTestScaled)
    evaluate("Logistic Regression PCA",
             LogisticRegression(verbose=0, n_jobs=-1, random_state=42, max_iter=1000),
             xTestPCA)

    evaluate("Naive Bayes", GaussianNB(), xTest)
    evaluate("Naive Bayes Scaled", GaussianNB(), xTestScaled)
    evaluate("Naive Bayes PCA", GaussianNB(), xTestPCA)

    evaluate("SVC Classifier",
             LinearSVC(random_state=42, verbose=0, dual=False),
             xTest, save_name="SVC Normal")
    evaluate("SVC Classifier PCA",
             LinearSVC(random_state=42, verbose=0, dual=False),
             xTestPCA, save_name="SVC Normal PCA")
    evaluate("SVC Classifier Scaled",
             LinearSVC(random_state=42, verbose=0, dual=False),
             xTestScaled, save_name="SVC Scaled")

    evaluate("Decision Tree", DecisionTreeClassifier(random_state=42), xTest)
    evaluate("Decision Tree Scaled", DecisionTreeClassifier(random_state=42), xTestScaled)
    evaluate("Decision Tree PCA", DecisionTreeClassifier(random_state=42), xTestPCA)

    evaluate("Random Forest",
             RandomForestClassifier(n_estimators=100, random_state=42, verbose=0, n_jobs=-1),
             xTest)
    evaluate("Random Forest Scaled",
             RandomForestClassifier(n_estimators=100, random_state=42, verbose=0, n_jobs=-1),
             xTestScaled)
    evaluate("Random Forest PCA",
             RandomForestClassifier(n_estimators=100, random_state=42, verbose=0, n_jobs=-1),
             xTestPCA)

    console_handler.close()
    file_handler.close()
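# --- Hedged sketch (illustrative only): scoring.cross_validate_scoring is a project
# helper whose implementation is not shown here.  With stock scikit-learn, the same kind
# of 10-fold evaluation for the standard metrics could be run as below; the 'roc' and
# 'confusion_matrix' entries used above are not built-in scorer names, so the project
# helper presumably computes those separately.  Function and variable names here are
# assumptions, not project code.
from sklearn.model_selection import cross_validate
from sklearn.linear_model import LogisticRegression


def sklearn_cross_validate_example(X, y):
    # Plain cross_validate call covering the built-in scorers used in this project.
    clf = LogisticRegression(max_iter=1000, random_state=42)
    return cross_validate(clf, X, y, cv=10,
                          scoring=["roc_auc", "f1", "precision", "recall"],
                          return_train_score=True)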
def calc():
    if not os.path.exists(RESULTS_FOLDER_PATH):
        os.makedirs(RESULTS_FOLDER_PATH)
    logfile = os.path.join(RESULTS_FOLDER_PATH, "log.log")
    if os.path.exists(logfile):
        os.remove(logfile)

    # logging setup: plain messages to file, light-grey messages to console
    level = logging.INFO
    formats = {
        "console": '\u001b[37m %(message)s\033[0m',
        "file": '%(message)s'
    }
    file_handler = logging.FileHandler(logfile, "x")
    console_handler = logging.StreamHandler()
    console_handler.setFormatter(logging.Formatter(formats["console"]))
    file_handler.setFormatter(logging.Formatter(formats["file"]))
    logger = logging.getLogger(__name__)
    logger.addHandler(console_handler)
    logger.addHandler(file_handler)
    logger.setLevel(level)

    # begin calc
    benign = datasets.load_all(os.path.join("datasets", DATASET_NAME_2))    # load dataset from csv
    scareware = datasets.load_all(os.path.join("datasets", DATASET_NAME))   # load dataset from csv
    logger.info("{} {}".format("benign shape", benign.shape))
    logger.info("{} {}".format("scareware shape", scareware.shape))

    benign = datasets.prepare_dataset(benign, shuffle=True)
    scareware = datasets.prepare_dataset(scareware, shuffle=True)

    n_elements = min(benign.shape[0], scareware.shape[0], 150000)
    benign = benign.head(n_elements)
    scareware = scareware.head(n_elements)
    logger.info("{} {}".format("benign shape after balancing", benign.shape))
    logger.info("{} {}".format("scareware shape after balancing", scareware.shape))

    scareware["Label"] = DATASET_NAME.upper()
    loaded_dataset = pd.concat([benign, scareware], ignore_index=True)  # union dataset
    logger.info(loaded_dataset.head())
    loaded_dataset.info()
    benign = None
    scareware = None

    logger.info("{} {}".format("Dataset shape BEFORE preparation", loaded_dataset.shape))
    dataset = datasets.prepare_dataset(loaded_dataset,
                                       drop_columns=[
                                           "Flow Bytes/s", "Flow Packets/s", "Flow ID",
                                           "Source IP", "Destination IP", "Timestamp",
                                           "Fwd Header Length.1"
                                       ],
                                       shuffle=True,
                                       dropna_axis=True)
    loaded_dataset = None
    logger.info("{} {}".format("Dataset shape AFTER preparation", dataset.shape))

    xTest, yTest = datasets.separate_labels(dataset, encode=True)
    dataset = None
    xTest = datasets.drop_variance(xTest)

    roc_auc_scores = []
    roc_fpr_tpr_thres = []

    # Estimators number test
    logger.info("Estimators number test")
    for i in range(4, 30, 4):
        n_estimators = i**2
        logger.info("Training random forest with {} estimators ({})".format(n_estimators, i))
        clf = RandomForestClassifier(n_estimators=n_estimators, n_jobs=-1,
                                     random_state=42)  # Random Forest Classifier
        roc, auc_score = random_forest.fit_and_roc(clf, xTest, yTest)
        save_result(roc, auc_score, "estimators", n_estimators,
                    roc_fpr_tpr_thres, roc_auc_scores)

    # Max depth number test
    roc_auc_scores = []
    roc_fpr_tpr_thres = []
    logger.info("max depth number test")
    for i in range(1, 11):
        max_depth = 2**i
        logger.info("Training random forest with {} max depth ({})".format(max_depth, i))
        rnd_forest = RandomForestClassifier(n_estimators=144, max_depth=max_depth,
                                            n_jobs=-1, random_state=42)  # Random Forest Classifier
        roc, auc_score = random_forest.fit_and_roc(rnd_forest, xTest, yTest)
        save_result(roc, auc_score, "max_depth", max_depth,
                    roc_fpr_tpr_thres, roc_auc_scores)

    # Min Sample Leaf number test
    roc_auc_scores = []
    roc_fpr_tpr_thres = []
    logger.info("Min Sample Leaf number test")
    for i in range(1, 11):
        min_sample_leaf = i
        logger.info("Training random forest with {} min sample leaf ({})".format(min_sample_leaf, i))
        rnd_forest = RandomForestClassifier(n_estimators=144, max_depth=32,
                                            min_samples_leaf=min_sample_leaf,
                                            n_jobs=-1, random_state=42)  # Random Forest Classifier
        roc, auc_score = random_forest.fit_and_roc(rnd_forest, xTest, yTest)
        save_result(roc, auc_score, "min_sample_leaf", min_sample_leaf,
                    roc_fpr_tpr_thres, roc_auc_scores)

    roc_auc_scores, roc_fpr_tpr_thres = [], []
    xTest = None
    yTest = None

    file_handler.close()
    console_handler.close()
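# --- Hedged sketch (illustrative only): random_forest.fit_and_roc is a project helper
# whose implementation is not shown here.  A typical "fit and compute ROC" routine with
# scikit-learn looks like the function below; the train/test split ratio, stratification,
# and return values are assumptions, not the repo's actual code.
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_curve, roc_auc_score


def fit_and_roc_sketch(clf, X, y, test_size=0.25, random_state=42):
    # Hold out a test split, fit the classifier, and score the held-out samples.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state, stratify=y)
    clf.fit(X_train, y_train)
    scores = clf.predict_proba(X_test)[:, 1]          # probability of the positive class
    fpr, tpr, thresholds = roc_curve(y_test, scores)  # ROC curve points
    return (fpr, tpr, thresholds), roc_auc_score(y_test, scores)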
def calc():
    if not os.path.exists(RESULTS_FOLDER_PATH):
        os.makedirs(RESULTS_FOLDER_PATH)
    logfile = os.path.join(RESULTS_FOLDER_PATH, "log.log")
    if os.path.exists(logfile):
        os.remove(logfile)

    # logging setup: plain messages to file, light-grey messages to console
    level = logging.INFO
    formats = {
        "console": '\u001b[37m %(message)s\033[0m',
        "file": '%(message)s'
    }
    file_handler = logging.FileHandler(logfile, "x")
    console_handler = logging.StreamHandler()
    console_handler.setFormatter(logging.Formatter(formats["console"]))
    file_handler.setFormatter(logging.Formatter(formats["file"]))
    logger = logging.getLogger(__name__)
    logger.addHandler(console_handler)
    logger.addHandler(file_handler)
    logger.setLevel(level)

    # begin calc
    loaded_dataset = load_data()
    logger.info("{} {}".format("loaded_dataset shape", loaded_dataset.shape))
    logger.info(loaded_dataset['Label'].value_counts())
    # loaded_dataset["Label"] = DATASET_NAME.upper()
    logger.info(loaded_dataset.head())
    loaded_dataset.info()

    dataset = None
    logger.info("{} {}".format("Dataset shape BEFORE preparation", loaded_dataset.shape))
    dataset = datasets.prepare_dataset(
        loaded_dataset,
        drop_columns=["Flow Bytes/s", "Flow Packets/s", "Fwd Header Length.1"],
        shuffle=True,
        dropna_axis=[0, 1])
    loaded_dataset = None
    logger.info("{} {}".format("Dataset shape AFTER preparation", dataset.shape))

    xTest, yTest = datasets.separate_labels(dataset, encode=True)
    dataset = None
    xTest = datasets.drop_variance(xTest)
    standardScaler = StandardScaler()
    xTestScaled = standardScaler.fit_transform(xTest)

    results = []
    logger.info("Logistic Regression")
    param_name = "C"
    param_range = np.logspace(-1, 1, 10)
    log_reg = LogisticRegression(verbose=1, n_jobs=-1, max_iter=1000,
                                 solver="liblinear", penalty="l2", random_state=42)
    train_scores, val_scores = validation_curve(log_reg, xTest, yTest,
                                                param_name=param_name,
                                                param_range=param_range,
                                                cv=3, scoring="roc_auc",
                                                verbose=1, n_jobs=-1)
    results.append([param_name, param_range, train_scores, val_scores])
    datasets.np_double_save(results, RESULTS_FOLDER_PATH, "results",
                            as_csv=True, as_npy=True)

    console_handler.close()
    file_handler.close()
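# --- Hedged sketch (illustrative only): xTestScaled is computed above but the C sweep
# runs on the raw features.  If a scaled variant were wanted, scaling inside each CV fold
# via a Pipeline avoids fitting the scaler on the validation folds; the parameter name
# "clf__C" follows scikit-learn's pipeline convention.  This helper is an assumption, not
# part of the project.
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import validation_curve


def scaled_c_sweep(X, y):
    # Scale then classify within each fold; sweep C over the same log-spaced range.
    pipe = Pipeline([("scale", StandardScaler()),
                     ("clf", LogisticRegression(max_iter=1000, solver="liblinear",
                                                penalty="l2", random_state=42))])
    return validation_curve(pipe, X, y, param_name="clf__C",
                            param_range=np.logspace(-1, 1, 10),
                            cv=3, scoring="roc_auc", n_jobs=-1)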
def calc():
    if not os.path.exists(RESULTS_FOLDER_PATH):
        os.makedirs(RESULTS_FOLDER_PATH)
    logfile = os.path.join(RESULTS_FOLDER_PATH, "log.log")
    if os.path.exists(logfile):
        os.remove(logfile)

    # logging setup: plain messages to file, light-grey messages to console
    level = logging.INFO
    formats = {
        "console": '\u001b[37m %(message)s\033[0m',
        "file": '%(message)s'
    }
    file_handler = logging.FileHandler(logfile, "x")
    console_handler = logging.StreamHandler()
    console_handler.setFormatter(logging.Formatter(formats["console"]))
    file_handler.setFormatter(logging.Formatter(formats["file"]))
    logger = logging.getLogger(__name__)
    logger.addHandler(console_handler)
    logger.addHandler(file_handler)
    logger.setLevel(level)

    # begin calc
    loaded_dataset = load_data()
    logger.info("{} {}".format("loaded_dataset shape", loaded_dataset.shape))
    logger.info(loaded_dataset['class1'].value_counts())
    # loaded_dataset["Label"] = DATASET_NAME.upper()
    logger.info(loaded_dataset.head())
    loaded_dataset.info()

    dataset = None
    logger.info("{} {}".format("Dataset shape BEFORE preparation", loaded_dataset.shape))
    dataset = datasets.prepare_dataset(
        loaded_dataset,
        # drop_columns=["Flow Bytes/s", "Flow Packets/s", "Fwd Header Length.1"],
        shuffle=True,
        dropna_axis=[1])
    loaded_dataset = None
    logger.info("{} {}".format("Dataset shape AFTER preparation", dataset.shape))

    xTest, yTest = datasets.separate_labels(dataset, encode=True, column_name="class1")
    dataset = None
    xTest = datasets.drop_variance(xTest)
    standardScaler = StandardScaler()
    xTestScaled = standardScaler.fit_transform(xTest)

    results = []
    clf = DecisionTreeClassifier(random_state=42)

    # validation curve over the maximum tree depth
    param_name = "max_depth"
    param_range = [2**i for i in range(1, 11)]
    logger.info(param_name)
    logger.info(param_range)
    training_score, test_score = validation_curve(clf, xTest, yTest,
                                                  param_name=param_name,
                                                  param_range=param_range,
                                                  scoring="roc_auc", cv=6,
                                                  verbose=1, n_jobs=-1)
    results.append([param_name, param_range, training_score, test_score])
    datasets.np_double_save(results, RESULTS_FOLDER_PATH, "results",
                            as_csv=True, as_npy=True)

    # validation curve over the minimum samples per leaf
    param_name = "min_samples_leaf"
    param_range = [i for i in range(1, 15)]
    logger.info(param_name)
    logger.info(param_range)
    training_score, test_score = validation_curve(clf, xTest, yTest,
                                                  param_name=param_name,
                                                  param_range=param_range,
                                                  scoring="roc_auc", cv=6,
                                                  verbose=1, n_jobs=-1)
    results.append([param_name, param_range, training_score, test_score])
    datasets.np_double_save(results, RESULTS_FOLDER_PATH, "results",
                            as_csv=True, as_npy=True)

    # validation curve over the fraction of features considered per split
    param_name = "max_features"
    param_range = [1 / i for i in range(1, 11)]
    logger.info(param_name)
    logger.info(param_range)
    training_score, test_score = validation_curve(clf, xTest, yTest,
                                                  param_name=param_name,
                                                  param_range=param_range,
                                                  scoring="roc_auc", cv=6,
                                                  verbose=1, n_jobs=-1)
    results.append([param_name, param_range, training_score, test_score])
    datasets.np_double_save(results, RESULTS_FOLDER_PATH, "results",
                            as_csv=True, as_npy=True)

    # learning curve for the tuned tree
    clf = DecisionTreeClassifier(min_samples_leaf=10)
    train_sizes, train_scores, test_scores = learning_curve(clf, xTest, yTest, cv=6, n_jobs=-1,
                                                            train_sizes=np.linspace(0.1, 1, 10))
    results = [train_sizes, train_scores, test_scores]
    datasets.pk_save(results, RESULTS_FOLDER_PATH, "learning_curves")

    console_handler.close()
    file_handler.close()
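# --- Hedged sketch (illustrative only): the learning-curve arrays saved via
# datasets.pk_save above can be visualised directly from the values returned by
# learning_curve; this mirrors the validation-curve sketch earlier, but with the
# training-set size on the x axis.  Names here are assumptions, not project code.
import numpy as np
import matplotlib.pyplot as plt


def plt_learning_curve_sketch(train_sizes, train_scores, test_scores):
    # Average across the CV-fold axis and plot score versus training-set size.
    plt.plot(train_sizes, np.mean(train_scores, axis=1), marker="o", label="training score")
    plt.plot(train_sizes, np.mean(test_scores, axis=1), marker="o", label="cross-validation score")
    plt.xlabel("training set size")
    plt.ylabel("score")
    plt.legend()
    plt.show()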
def calc():
    if not os.path.exists(RESULTS_FOLDER_PATH):
        os.makedirs(RESULTS_FOLDER_PATH)
    logfile = os.path.join(RESULTS_FOLDER_PATH, "log.log")
    if os.path.exists(logfile):
        os.remove(logfile)

    # logging setup: plain messages to file, light-grey messages to console
    level = logging.INFO
    formats = {
        "console": '\u001b[37m %(message)s\033[0m',
        "file": '%(message)s'
    }
    file_handler = logging.FileHandler(logfile, "x")
    console_handler = logging.StreamHandler()
    console_handler.setFormatter(logging.Formatter(formats["console"]))
    file_handler.setFormatter(logging.Formatter(formats["file"]))
    logger = logging.getLogger(__name__)
    logger.addHandler(console_handler)
    logger.addHandler(file_handler)
    logger.setLevel(level)

    # begin calc
    loaded_dataset = load_data()
    logger.info("{} {}".format("loaded_dataset shape", loaded_dataset.shape))
    # loaded_dataset["Label"] = DATASET_NAME.upper()
    logger.info(loaded_dataset.head())
    loaded_dataset.info()
    logger.info(loaded_dataset['URL_Type_obf_Type'].value_counts())

    dataset = None
    logger.info("{} {}".format("Dataset shape BEFORE preparation", loaded_dataset.shape))
    dataset = datasets.prepare_dataset(
        loaded_dataset,
        # drop_columns=["Flow Bytes/s", "Flow Packets/s", "Fwd Header Length.1"],
        shuffle=True,
        dropna_axis=[0, 1])
    loaded_dataset = None
    logger.info("{} {}".format("Dataset shape AFTER preparation", dataset.shape))

    xTest, yTest = datasets.separate_labels(dataset, encode=True,
                                            column_name="URL_Type_obf_Type")
    dataset = None
    xTest = datasets.drop_variance(xTest)
    standardScaler = StandardScaler()
    xTestScaled = standardScaler.fit_transform(xTest)

    results_array = []
    logger.info("Logistic Regression")
    log_reg = LogisticRegression(verbose=1, n_jobs=-1, max_iter=1000)

    console_handler.close()
    file_handler.close()