import os
import logging

import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import validation_curve, learning_curve

# Project-local helpers assumed to be in scope: the datasets module
# (prepare_dataset, separate_labels, drop_variance, np_double_save, pk_save),
# load_data, and the RESULTS_FOLDER_PATH constant.


def save_result(roc_curve, auc_score, parameter_name, parameter_value,
                rocs_array, auc_scores_array):
    # Prepend the hyperparameter value so every stored row is self-describing,
    # then persist the accumulated arrays after each call.
    roc_curve.insert(0, parameter_value)
    auc_score.insert(0, parameter_value)
    rocs_array.append(roc_curve)
    auc_scores_array.append(auc_score)
    # plt_add_roc_curve(fpr, tpr, label=str(n_estimators))
    np_roc_array = np.array(rocs_array)
    np_roc_auc_scores = np.array(auc_scores_array)
    save_dir = os.path.join(RESULTS_FOLDER_PATH, parameter_name)
    os.makedirs(save_dir, exist_ok=True)
    datasets.np_double_save(np_roc_array, save_dir,
                            "rnd_forest_roc_fpr_tpr_thres",
                            as_csv=True, as_npy=True)
    datasets.np_double_save(np_roc_auc_scores, save_dir,
                            "rnd_forest_roc_auc_scores",
                            as_csv=True, as_npy=True)

def save_result2(results_array, results: dict, name: str):
    # Label the result dict, accumulate it, and persist the whole list after
    # every call so partial runs are not lost.
    results['label'] = name
    results_array.append(results)
    datasets.pk_save(results_array, RESULTS_FOLDER_PATH, 'results')
    datasets.np_double_save(results_array, RESULTS_FOLDER_PATH, 'results',
                            as_csv=True)

def save_result(roc_curve, auc_score, classifier_name, rocs_array,
                auc_scores_array):
    # Variant keyed by classifier name rather than by a hyperparameter value.
    # NOTE: this shares a name with the save_result above and would shadow it
    # if both definitions live in the same module.
    roc_curve.insert(0, classifier_name)
    auc_score.insert(0, classifier_name)
    rocs_array.append(roc_curve)
    auc_scores_array.append(auc_score)
    # plt_add_roc_curve(fpr, tpr, label=str(n_estimators))
    np_roc_array = np.array(rocs_array)
    np_roc_auc_scores = np.array(auc_scores_array)
    datasets.np_double_save(np_roc_array, RESULTS_FOLDER_PATH,
                            "roc_fpr_tpr_thres", as_csv=True, as_npy=True)
    datasets.np_double_save(np_roc_auc_scores, RESULTS_FOLDER_PATH,
                            "roc_auc_scores", as_csv=True, as_npy=True)

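# A minimal usage sketch for the name-keyed save_result, assuming the curve
# comes from sklearn.metrics.roc_curve and the score from roc_auc_score;
# demo_save_roc and its arguments are hypothetical stand-ins, and the exact
# row layout ultimately depends on what datasets.np_double_save accepts.
def demo_save_roc(y_true, y_score):
    from sklearn.metrics import roc_curve as sk_roc_curve, roc_auc_score
    fpr, tpr, thresholds = sk_roc_curve(y_true, y_score)
    rocs_array, auc_scores_array = [], []
    save_result([list(fpr), list(tpr), list(thresholds)],
                [roc_auc_score(y_true, y_score)],
                "demo_classifier", rocs_array, auc_scores_array)
    return rocs_array, auc_scores_array
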
def calc():
    if not os.path.exists(RESULTS_FOLDER_PATH):
        os.makedirs(RESULTS_FOLDER_PATH)
    logfile = os.path.join(RESULTS_FOLDER_PATH, "log.log")
    if os.path.exists(logfile):
        os.remove(logfile)

    # Logging: bare messages to the file, ANSI-white messages on the console.
    level = logging.INFO
    formats = {
        "console": '\u001b[37m %(message)s\033[0m',
        "file": '%(message)s'
    }
    file_handler = logging.FileHandler(logfile, "x")
    console_handler = logging.StreamHandler()
    console_handler.setFormatter(logging.Formatter(formats["console"]))
    file_handler.setFormatter(logging.Formatter(formats["file"]))
    logger = logging.getLogger(__name__)
    logger.addHandler(console_handler)
    logger.addHandler(file_handler)
    logger.setLevel(level)

    # Begin calculation.
    loaded_dataset = load_data(logger)
    logger.info("loaded_dataset shape %s", loaded_dataset.shape)
    # loaded_dataset["Label"] = DATASET_NAME.upper()
    logger.info(loaded_dataset.head())
    loaded_dataset.info()

    logger.info("Dataset shape BEFORE preparation %s", loaded_dataset.shape)
    dataset = datasets.prepare_dataset(
        loaded_dataset,
        drop_columns=[
            "Flow Bytes/s", "Flow Packets/s", "Flow ID", "Source IP",
            "Destination IP", "Timestamp", "Fwd Header Length.1"
        ],
        shuffle=True,
        dropna_axis=[0, 1])
    loaded_dataset = None  # release the unprepared frame
    logger.info("Dataset shape AFTER preparation %s", dataset.shape)

    xTest, yTest = datasets.separate_labels(dataset, encode=True)
    dataset = None  # release the prepared frame
    logger.info('Scaling dataset')
    xTest = datasets.drop_variance(xTest)
    # standardScaler = StandardScaler()
    # xTestScaled = standardScaler.fit_transform(xTest)

    results = []
    clf = RandomForestClassifier(random_state=42, n_jobs=1)

    # n_estimators sweep: 16, 64, 144, 256, 400.
    param_name = "n_estimators"
    param_range = [i**2 for i in range(4, 24, 4)]
    logger.info(param_name)
    logger.info(param_range)
    training_score, test_score = validation_curve(
        clf, xTest, yTest, param_name=param_name, param_range=param_range,
        scoring="roc_auc", cv=4, verbose=1, n_jobs=-1)
    results.append([param_name, param_range, training_score, test_score])
    datasets.np_double_save(results, RESULTS_FOLDER_PATH, "results",
                            as_csv=True, as_npy=True)

    # max_depth sweep: powers of two from 2 to 128.
    param_name = "max_depth"
    param_range = [2**i for i in range(1, 8)]
    logger.info(param_name)
    logger.info(param_range)
    training_score, test_score = validation_curve(
        clf, xTest, yTest, param_name=param_name, param_range=param_range,
        scoring="roc_auc", cv=4, verbose=1, n_jobs=-1)
    results.append([param_name, param_range, training_score, test_score])
    datasets.np_double_save(results, RESULTS_FOLDER_PATH, "results",
                            as_csv=True, as_npy=True)

    # min_samples_leaf sweep: even values from 2 to 40.
    param_name = "min_samples_leaf"
    param_range = [2 * i for i in range(1, 21)]
    logger.info(param_name)
    logger.info(param_range)
    training_score, test_score = validation_curve(
        clf, xTest, yTest, param_name=param_name, param_range=param_range,
        scoring="roc_auc", cv=4, verbose=1, n_jobs=-1)
    results.append([param_name, param_range, training_score, test_score])
    datasets.np_double_save(results, RESULTS_FOLDER_PATH, "results",
                            as_csv=True, as_npy=True)

    # plot.plt_validation_curve(training_score, test_score, param_range)
    console_handler.close()
    file_handler.close()

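# A minimal plotting sketch for the rows collected above, standing in for the
# commented-out plot.plt_validation_curve helper; matplotlib here is an
# assumption, not a dependency the original declares. Each row is
# [param_name, param_range, train_scores, test_scores], with score arrays
# shaped (len(param_range), cv) as returned by validation_curve.
def plot_validation_rows(results):
    import matplotlib.pyplot as plt
    for param_name, param_range, train_scores, test_scores in results:
        plt.figure()
        plt.plot(param_range, np.mean(train_scores, axis=1), label="train")
        plt.plot(param_range, np.mean(test_scores, axis=1), label="cross-val")
        plt.xlabel(param_name)
        plt.ylabel("roc_auc")
        plt.legend()
        plt.savefig(os.path.join(RESULTS_FOLDER_PATH, param_name + ".png"))
        plt.close()
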
def calc():
    if not os.path.exists(RESULTS_FOLDER_PATH):
        os.makedirs(RESULTS_FOLDER_PATH)
    logfile = os.path.join(RESULTS_FOLDER_PATH, "log.log")
    if os.path.exists(logfile):
        os.remove(logfile)

    # Logging: bare messages to the file, ANSI-white messages on the console.
    level = logging.INFO
    formats = {
        "console": '\u001b[37m %(message)s\033[0m',
        "file": '%(message)s'
    }
    file_handler = logging.FileHandler(logfile, "x")
    console_handler = logging.StreamHandler()
    console_handler.setFormatter(logging.Formatter(formats["console"]))
    file_handler.setFormatter(logging.Formatter(formats["file"]))
    logger = logging.getLogger(__name__)
    logger.addHandler(console_handler)
    logger.addHandler(file_handler)
    logger.setLevel(level)

    # Begin calculation.
    loaded_dataset = load_data()
    logger.info("loaded_dataset shape %s", loaded_dataset.shape)
    logger.info(loaded_dataset['Label'].value_counts())
    # loaded_dataset["Label"] = DATASET_NAME.upper()
    logger.info(loaded_dataset.head())
    loaded_dataset.info()

    logger.info("Dataset shape BEFORE preparation %s", loaded_dataset.shape)
    dataset = datasets.prepare_dataset(
        loaded_dataset,
        drop_columns=["Flow Bytes/s", "Flow Packets/s", "Fwd Header Length.1"],
        shuffle=True,
        dropna_axis=[0, 1])
    loaded_dataset = None  # release the unprepared frame
    logger.info("Dataset shape AFTER preparation %s", dataset.shape)

    xTest, yTest = datasets.separate_labels(dataset, encode=True)
    dataset = None  # release the prepared frame
    xTest = datasets.drop_variance(xTest)
    standardScaler = StandardScaler()
    # NOTE: xTestScaled is computed but unused below; the validation curve is
    # fit on the unscaled xTest.
    xTestScaled = standardScaler.fit_transform(xTest)

    results = []
    logger.info("Logistic Regression")

    # C sweep: 10 values log-spaced from 0.1 to 10.
    param_name = "C"
    param_range = np.logspace(-1, 1, 10)
    log_reg = LogisticRegression(verbose=1, n_jobs=-1, max_iter=1000,
                                 solver="liblinear", penalty="l2",
                                 random_state=42)
    train_scores, val_scores = validation_curve(
        log_reg, xTest, yTest, param_name=param_name, param_range=param_range,
        cv=3, scoring="roc_auc", verbose=1, n_jobs=-1)
    results.append([param_name, param_range, train_scores, val_scores])
    datasets.np_double_save(results, RESULTS_FOLDER_PATH, "results",
                            as_csv=True, as_npy=True)

    console_handler.close()
    file_handler.close()

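# The block above pre-scales the data but then fits on the raw xTest. A
# hedged alternative sketch (a suggestion, not the original experiment):
# wrapping scaler and model in a Pipeline lets validation_curve refit the
# scaler inside each CV split, avoiding leakage. Pipelines expose inner
# parameters via step-prefixed names such as "logisticregression__C".
from sklearn.pipeline import make_pipeline


def scaled_logreg_curve(X, y, param_range):
    pipe = make_pipeline(
        StandardScaler(),
        LogisticRegression(max_iter=1000, solver="liblinear",
                           penalty="l2", random_state=42))
    # The scaler is re-fit on each training fold before scoring.
    return validation_curve(pipe, X, y,
                            param_name="logisticregression__C",
                            param_range=param_range,
                            cv=3, scoring="roc_auc", n_jobs=-1)
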
def calc():
    if not os.path.exists(RESULTS_FOLDER_PATH):
        os.makedirs(RESULTS_FOLDER_PATH)
    logfile = os.path.join(RESULTS_FOLDER_PATH, "log.log")
    if os.path.exists(logfile):
        os.remove(logfile)

    # Logging: bare messages to the file, ANSI-white messages on the console.
    level = logging.INFO
    formats = {
        "console": '\u001b[37m %(message)s\033[0m',
        "file": '%(message)s'
    }
    file_handler = logging.FileHandler(logfile, "x")
    console_handler = logging.StreamHandler()
    console_handler.setFormatter(logging.Formatter(formats["console"]))
    file_handler.setFormatter(logging.Formatter(formats["file"]))
    logger = logging.getLogger(__name__)
    logger.addHandler(console_handler)
    logger.addHandler(file_handler)
    logger.setLevel(level)

    # Begin calculation.
    loaded_dataset = load_data()
    logger.info("loaded_dataset shape %s", loaded_dataset.shape)
    logger.info(loaded_dataset['class1'].value_counts())
    # loaded_dataset["Label"] = DATASET_NAME.upper()
    logger.info(loaded_dataset.head())
    loaded_dataset.info()

    logger.info("Dataset shape BEFORE preparation %s", loaded_dataset.shape)
    dataset = datasets.prepare_dataset(
        loaded_dataset,
        # drop_columns=["Flow Bytes/s", "Flow Packets/s", "Fwd Header Length.1"],
        shuffle=True,
        dropna_axis=[1])
    loaded_dataset = None  # release the unprepared frame
    logger.info("Dataset shape AFTER preparation %s", dataset.shape)

    xTest, yTest = datasets.separate_labels(dataset, encode=True,
                                            column_name="class1")
    dataset = None  # release the prepared frame
    xTest = datasets.drop_variance(xTest)
    standardScaler = StandardScaler()
    # NOTE: xTestScaled is computed but unused below; the curves are fit on
    # the unscaled xTest.
    xTestScaled = standardScaler.fit_transform(xTest)

    results = []
    clf = DecisionTreeClassifier(random_state=42)

    # max_depth sweep: powers of two from 2 to 1024.
    param_name = "max_depth"
    param_range = [2**i for i in range(1, 11)]
    training_score, test_score = validation_curve(
        clf, xTest, yTest, param_name=param_name, param_range=param_range,
        scoring="roc_auc", cv=6, verbose=1, n_jobs=-1)
    results.append([param_name, param_range, training_score, test_score])
    datasets.np_double_save(results, RESULTS_FOLDER_PATH, "results",
                            as_csv=True, as_npy=True)

    # min_samples_leaf sweep: 1 through 14.
    param_name = "min_samples_leaf"
    param_range = [i for i in range(1, 15)]
    logger.info(param_range)
    training_score, test_score = validation_curve(
        clf, xTest, yTest, param_name=param_name, param_range=param_range,
        scoring="roc_auc", cv=6, verbose=1, n_jobs=-1)
    results.append([param_name, param_range, training_score, test_score])
    datasets.np_double_save(results, RESULTS_FOLDER_PATH, "results",
                            as_csv=True, as_npy=True)

    # max_features sweep: fractions 1, 1/2, ..., 1/10 of the feature count.
    param_name = "max_features"
    param_range = [1 / i for i in range(1, 11)]
    logger.info(param_range)
    training_score, test_score = validation_curve(
        clf, xTest, yTest, param_name=param_name, param_range=param_range,
        scoring="roc_auc", cv=6, verbose=1, n_jobs=-1)
    results.append([param_name, param_range, training_score, test_score])
    datasets.np_double_save(results, RESULTS_FOLDER_PATH, "results",
                            as_csv=True, as_npy=True)

    # Learning curve for a lightly regularised tree over growing train sizes.
    clf = DecisionTreeClassifier(min_samples_leaf=10)
    train_sizes, train_scores, test_scores = learning_curve(
        clf, xTest, yTest, cv=6, n_jobs=-1,
        train_sizes=np.linspace(0.1, 1, 10))
    results = [train_sizes, train_scores, test_scores]
    datasets.pk_save(results, RESULTS_FOLDER_PATH, "learning_curves")

    console_handler.close()
    file_handler.close()

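# A matching sketch (matplotlib again assumed) for the pickled learning-curve
# triple saved above; pass in whatever datasets.pk_save wrote, i.e.
# [train_sizes, train_scores, test_scores] as returned by learning_curve.
def plot_learning_curve(train_sizes, train_scores, test_scores):
    import matplotlib.pyplot as plt
    plt.figure()
    plt.plot(train_sizes, np.mean(train_scores, axis=1), label="train")
    plt.plot(train_sizes, np.mean(test_scores, axis=1), label="cross-val")
    plt.xlabel("training set size")
    plt.ylabel("score")
    plt.legend()
    plt.savefig(os.path.join(RESULTS_FOLDER_PATH, "learning_curve.png"))
    plt.close()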