Example 1
def load_data(logger):
    """Load the benign and ransomware CSVs, balance them to the same size and return one labelled DataFrame."""
    benign = datasets.load_all(os.path.join(
        "datasets", DATASET_NAME_2))  # load dataset from csv
    ransomware = datasets.load_all(os.path.join(
        "datasets", DATASET_NAME))  # load dataset from csv
    logger.info("{} {}".format("benign shape", benign.shape))
    logger.info("{} {}".format("ransomware shape", ransomware.shape))

    benign = datasets.prepare_dataset(benign, shuffle=True)
    ransomware = datasets.prepare_dataset(ransomware, shuffle=True)

    n_elements = min(benign.shape[0], ransomware.shape[0], 150000)

    benign = benign.head(n_elements)
    ransomware = ransomware.head(n_elements)

    logger.info("{} {}".format("benign shape after balancing", benign.shape))
    logger.info("{} {}".format("ransomware shape after balancing",
                               ransomware.shape))

    ransomware["Label"] = DATASET_NAME.upper()

    return pd.concat([benign, ransomware], ignore_index=True)  # union dataset
Example 2
def calc():
    if not os.path.exists(RESULTS_FOLDER_PATH):
        os.makedirs(RESULTS_FOLDER_PATH)

    logfile = os.path.join(RESULTS_FOLDER_PATH, "log.log")
    if os.path.exists(logfile):
        os.remove(logfile)

    # logging stuff
    level = logging.INFO
    formats = {
        "console": '\u001b[37m %(message)s\033[0m',
        "file": '%(message)s'
    }

    file_handler, console_handler = logging.FileHandler(
        logfile, "x"), logging.StreamHandler()
    console_handler.setFormatter(logging.Formatter(formats["console"]))
    file_handler.setFormatter(logging.Formatter(formats["file"]))

    logger = logging.getLogger(__name__)
    logger.addHandler(console_handler)
    logger.addHandler(file_handler)
    logger.setLevel(level)

    # begin calc
    loaded_dataset = load_data(logger)
    logger.info("{} {}".format("loaded_dataset shape", loaded_dataset.shape))

    # loaded_dataset["Label"] = DATASET_NAME.upper()

    logger.info(loaded_dataset.head())
    loaded_dataset.info()

    dataset = None

    logger.info("{} {}".format("Dataset shape BEFORE preparation",
                               loaded_dataset.shape))
    dataset = datasets.prepare_dataset(loaded_dataset,
                                       drop_columns=[
                                           "Flow Bytes/s", "Flow Packets/s",
                                           "Flow ID", "Source IP",
                                           "Destination IP", "Timestamp",
                                           "Fwd Header Length.1"
                                       ],
                                       shuffle=True,
                                       dropna_axis=[0, 1])

    loaded_dataset = None

    logger.info("{} {}".format("Dataset shape AFTER preparation",
                               dataset.shape))

    xTest, yTest = datasets.separate_labels(dataset, encode=True)

    dataset = None

    logger.info('Dropping low-variance features')
    xTest = datasets.drop_variance(xTest)
    # standardScaler = StandardScaler()
    # xTestScaled = standardScaler.fit_transform(xTest)

    results = []
    clf = RandomForestClassifier(random_state=42, n_jobs=1)

    param_name = "n_estimators"
    param_range = [i**2 for i in range(4, 24, 4)]
    logger.info(param_name)
    logger.info(param_range)
    training_score, test_score = validation_curve(clf,
                                                  xTest,
                                                  yTest,
                                                  param_name=param_name,
                                                  param_range=param_range,
                                                  scoring="roc_auc",
                                                  cv=4,
                                                  verbose=1,
                                                  n_jobs=-1)

    results.append([param_name, param_range, training_score, test_score])
    datasets.np_double_save(results,
                            RESULTS_FOLDER_PATH,
                            "results",
                            as_csv=True,
                            as_npy=True)

    param_name = "max_depth"
    param_range = [2**i for i in range(1, 8)]
    logger.info(param_name)
    logger.info(param_range)
    training_score, test_score = validation_curve(clf,
                                                  xTest,
                                                  yTest,
                                                  param_name=param_name,
                                                  param_range=param_range,
                                                  scoring="roc_auc",
                                                  cv=4,
                                                  verbose=1,
                                                  n_jobs=-1)

    results.append([param_name, param_range, training_score, test_score])
    datasets.np_double_save(results,
                            RESULTS_FOLDER_PATH,
                            "results",
                            as_csv=True,
                            as_npy=True)

    param_name = "min_samples_leaf"
    param_range = [2 * i for i in range(1, 21)]
    logger.info(param_name)
    logger.info(param_range)
    training_score, test_score = validation_curve(clf,
                                                  xTest,
                                                  yTest,
                                                  param_name=param_name,
                                                  param_range=param_range,
                                                  scoring="roc_auc",
                                                  cv=4,
                                                  verbose=1,
                                                  n_jobs=-1)

    results.append([param_name, param_range, training_score, test_score])
    datasets.np_double_save(results,
                            RESULTS_FOLDER_PATH,
                            "results",
                            as_csv=True,
                            as_npy=True)

    # plot.plt_validation_curve(training_score, test_score, param_range)
    console_handler.close()
    file_handler.close()
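The call to plot.plt_validation_curve above is left commented out and that helper is not shown in these examples. As a rough matplotlib-based sketch only (not the project's own plotting module), the validation-curve scores could be visualised like this:

import numpy as np
import matplotlib.pyplot as plt

def plot_validation_curve_sketch(param_name, param_range, training_score, test_score):
    # validation_curve returns one row per parameter value and one column per CV fold
    plt.plot(param_range, np.mean(training_score, axis=1), marker="o", label="training ROC AUC")
    plt.plot(param_range, np.mean(test_score, axis=1), marker="o", label="cross-validation ROC AUC")
    plt.xlabel(param_name)
    plt.ylabel("ROC AUC")
    plt.legend()
    plt.show()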
Example 3
def calc():
    if not os.path.exists(RESULTS_FOLDER_PATH):
        os.makedirs(RESULTS_FOLDER_PATH)

    logfile = os.path.join(RESULTS_FOLDER_PATH, "log.log")
    if os.path.exists(logfile):
        os.remove(logfile)

    # logging stuff
    level = logging.INFO
    formats = {
        "console": '\u001b[37m %(message)s\033[0m',
        "file": '%(message)s'
    }

    file_handler, console_handler = logging.FileHandler(
        logfile, "x"), logging.StreamHandler()
    console_handler.setFormatter(logging.Formatter(formats["console"]))
    file_handler.setFormatter(logging.Formatter(formats["file"]))

    logger = logging.getLogger(__name__)
    logger.addHandler(console_handler)
    logger.addHandler(file_handler)
    logger.setLevel(level)

    # begin calc
    loaded_dataset = load_data()
    logger.info("{} {}".format("loaded_dataset shape", loaded_dataset.shape))
    # loaded_dataset["Label"] = DATASET_NAME.upper()

    logger.info(loaded_dataset.head())
    loaded_dataset.info()

    logger.info(loaded_dataset['class'].value_counts())

    dataset = None

    logger.info("{} {}".format("Dataset shape BEFORE preparation",
                               loaded_dataset.shape))
    dataset = datasets.prepare_dataset(
        loaded_dataset,
        # drop_columns=["Flow Bytes/s", "Flow Packets/s", "Fwd Header Length.1"],
        shuffle=True,
        dropna_axis=[0, 1])

    loaded_dataset = None

    logger.info("{} {}".format("Dataset shape AFTER preparation",
                               dataset.shape))

    xTest, yTest = datasets.separate_labels(dataset,
                                            encode=True,
                                            column_name="class")
    xTest = xTest.select_dtypes(exclude=['object'])

    dataset = None

    logger.info('Scaling dataset')
    xTest = datasets.drop_variance(xTest)
    standardScaler = StandardScaler()
    xTestScaled = standardScaler.fit_transform(xTest)

    logger.info("Performing PCA")
    pca = PCA(random_state=42, n_components=0.95)
    xTestPCA = pca.fit_transform(xTest)
    logger.info("Dataset shape with PCA {}".format(xTestPCA.shape))

    results_array = []

    logger.info("Logistic Regression")
    log_reg = LogisticRegression(verbose=0,
                                 n_jobs=-1,
                                 random_state=42,
                                 max_iter=1000)
    results = scoring.cross_validate_scoring(log_reg,
                                             xTest,
                                             yTest,
                                             cv=10,
                                             scoring=[
                                                 'roc_auc', 'f1', 'roc',
                                                 'precision', 'recall',
                                                 'confusion_matrix'
                                             ],
                                             return_train_score=True)

    save_result2(results_array, results, "Logistic Regression")
    logger.info(results)
    logger.info(results['confusion_matrix'])

    logger.info("Logistic Regression Scaled")
    log_reg = LogisticRegression(verbose=0,
                                 n_jobs=-1,
                                 random_state=42,
                                 max_iter=1000)
    results = scoring.cross_validate_scoring(log_reg,
                                             xTestScaled,
                                             yTest,
                                             cv=10,
                                             scoring=[
                                                 'roc_auc', 'f1', 'roc',
                                                 'precision', 'recall',
                                                 'confusion_matrix'
                                             ],
                                             return_train_score=True)

    save_result2(results_array, results, "Logistic Regression Scaled")
    logger.info(results)
    logger.info(results['confusion_matrix'])

    logger.info("Logistic Regression PCA")
    log_reg = LogisticRegression(verbose=0,
                                 n_jobs=-1,
                                 random_state=42,
                                 max_iter=1000)
    results = scoring.cross_validate_scoring(log_reg,
                                             xTestPCA,
                                             yTest,
                                             cv=10,
                                             scoring=[
                                                 'roc_auc', 'f1', 'roc',
                                                 'precision', 'recall',
                                                 'confusion_matrix'
                                             ],
                                             return_train_score=True)

    save_result2(results_array, results, "Logistic Regression PCA")
    logger.info(results)
    logger.info(results['confusion_matrix'])

    logger.info("Naive Bayes")
    gaussian_nb = GaussianNB()
    results = scoring.cross_validate_scoring(gaussian_nb,
                                             xTest,
                                             yTest,
                                             cv=10,
                                             scoring=[
                                                 'roc_auc', 'f1', 'roc',
                                                 'precision', 'recall',
                                                 'confusion_matrix'
                                             ],
                                             return_train_score=True)

    save_result2(results_array, results, "Naive Bayes")
    logger.info(results)
    logger.info(results['confusion_matrix'])

    logger.info("Naive Bayes Scales")
    gaussian_nb = GaussianNB()
    results = scoring.cross_validate_scoring(gaussian_nb,
                                             xTestScaled,
                                             yTest,
                                             cv=10,
                                             scoring=[
                                                 'roc_auc', 'f1', 'roc',
                                                 'precision', 'recall',
                                                 'confusion_matrix'
                                             ],
                                             return_train_score=True)

    save_result2(results_array, results, "Naive Bayes Scaled")
    logger.info(results)
    logger.info(results['confusion_matrix'])

    logger.info("Naive Bayes DR")
    gaussian_nb = GaussianNB()
    results = scoring.cross_validate_scoring(gaussian_nb,
                                             xTestPCA,
                                             yTest,
                                             cv=10,
                                             scoring=[
                                                 'roc_auc', 'f1', 'roc',
                                                 'precision', 'recall',
                                                 'confusion_matrix'
                                             ],
                                             return_train_score=True)

    save_result2(results_array, results, "Naive Bayes PCA")
    logger.info(results)
    logger.info(results['confusion_matrix'])

    logger.info("SVC Classifier")
    linearSvc = LinearSVC(random_state=42, verbose=0,
                          dual=False)  # svc classifier
    results = scoring.cross_validate_scoring(linearSvc,
                                             xTest,
                                             yTest,
                                             cv=10,
                                             scoring=[
                                                 'roc_auc', 'f1', 'roc',
                                                 'precision', 'recall',
                                                 'confusion_matrix'
                                             ],
                                             return_train_score=True)
    save_result2(results_array, results, "SVC Normal")
    logger.info(results)
    logger.info(results['confusion_matrix'])

    logger.info("SVC Classifier PCA")
    linearSvc = LinearSVC(random_state=42, verbose=0,
                          dual=False)  # svc classifier
    results = scoring.cross_validate_scoring(linearSvc,
                                             xTestPCA,
                                             yTest,
                                             cv=10,
                                             scoring=[
                                                 'roc_auc', 'f1', 'roc',
                                                 'precision', 'recall',
                                                 'confusion_matrix'
                                             ],
                                             return_train_score=True)
    save_result2(results_array, results, "SVC Normal PCA")
    logger.info(results)
    logger.info(results['confusion_matrix'])

    logger.info("SVC Classifier Scaled")
    linearSvc = LinearSVC(random_state=42, verbose=0,
                          dual=False)  # svc classifier
    results = scoring.cross_validate_scoring(linearSvc,
                                             xTestScaled,
                                             yTest,
                                             cv=10,
                                             scoring=[
                                                 'roc_auc', 'f1', 'roc',
                                                 'precision', 'recall',
                                                 'confusion_matrix'
                                             ],
                                             return_train_score=True)
    save_result2(results_array, results, "SVC Scaled")
    logger.info(results)
    logger.info(results['confusion_matrix'])

    logger.info("Decision Tree")
    dec_tree = DecisionTreeClassifier(random_state=42)
    results = scoring.cross_validate_scoring(dec_tree,
                                             xTest,
                                             yTest,
                                             cv=10,
                                             scoring=[
                                                 'roc_auc', 'f1', 'roc',
                                                 'precision', 'recall',
                                                 'confusion_matrix'
                                             ],
                                             return_train_score=True)

    save_result2(results_array, results, "Decision Tree")
    logger.info(results)
    logger.info(results['confusion_matrix'])

    logger.info("Decision Tree Scaled")
    dec_tree = DecisionTreeClassifier(random_state=42)
    results = scoring.cross_validate_scoring(dec_tree,
                                             xTestScaled,
                                             yTest,
                                             cv=10,
                                             scoring=[
                                                 'roc_auc', 'f1', 'roc',
                                                 'precision', 'recall',
                                                 'confusion_matrix'
                                             ],
                                             return_train_score=True)

    save_result2(results_array, results, "Decision Tree Scaled")
    logger.info(results)
    logger.info(results['confusion_matrix'])

    logger.info("Decision Tree PCA")
    dec_tree = DecisionTreeClassifier(random_state=42)
    results = scoring.cross_validate_scoring(dec_tree,
                                             xTestPCA,
                                             yTest,
                                             cv=10,
                                             scoring=[
                                                 'roc_auc', 'f1', 'roc',
                                                 'precision', 'recall',
                                                 'confusion_matrix'
                                             ],
                                             return_train_score=True)

    save_result2(results_array, results, "Decision Tree PCA")
    logger.info(results)
    logger.info(results['confusion_matrix'])

    logger.info("Random Forest")
    rnd_forest = RandomForestClassifier(n_estimators=100,
                                        random_state=42,
                                        verbose=0,
                                        n_jobs=-1)
    results = scoring.cross_validate_scoring(rnd_forest,
                                             xTest,
                                             yTest,
                                             cv=10,
                                             scoring=[
                                                 'roc_auc', 'f1', 'roc',
                                                 'precision', 'recall',
                                                 'confusion_matrix'
                                             ],
                                             return_train_score=True)

    save_result2(results_array, results, "Random Forest")
    logger.info(results)
    logger.info(results['confusion_matrix'])

    logger.info("Random Forest Scaled")
    rnd_forest = RandomForestClassifier(n_estimators=100,
                                        random_state=42,
                                        verbose=0,
                                        n_jobs=-1)
    results = scoring.cross_validate_scoring(rnd_forest,
                                             xTestScaled,
                                             yTest,
                                             cv=10,
                                             scoring=[
                                                 'roc_auc', 'f1', 'roc',
                                                 'precision', 'recall',
                                                 'confusion_matrix'
                                             ],
                                             return_train_score=True)

    save_result2(results_array, results, "Random Forest Scaled")
    logger.info(results)
    logger.info(results['confusion_matrix'])

    logger.info("Random Forest PCA")
    rnd_forest = RandomForestClassifier(n_estimators=100,
                                        random_state=42,
                                        verbose=0,
                                        n_jobs=-1)
    results = scoring.cross_validate_scoring(rnd_forest,
                                             xTestPCA,
                                             yTest,
                                             cv=10,
                                             scoring=[
                                                 'roc_auc', 'f1', 'roc',
                                                 'precision', 'recall',
                                                 'confusion_matrix'
                                             ],
                                             return_train_score=True)

    save_result2(results_array, results, "Random Forest PCA")
    logger.info(results)
    logger.info(results['confusion_matrix'])

    console_handler.close()
    file_handler.close()
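save_result2 is used throughout the example above but is not defined in any of these snippets. A plausible minimal version, assuming it only tags the metrics dict with the model name, appends it and re-saves the accumulated list (an assumption, not the original helper):

def save_result2(results_array, results, name):
    # Assumed behaviour: attach the model name to the scores, accumulate,
    # and persist the whole list after every model run.
    results_array.append({"model": name, **results})
    datasets.np_double_save(results_array, RESULTS_FOLDER_PATH,
                            "results", as_csv=True, as_npy=True)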
Example 4
def calc():
    if not os.path.exists(RESULTS_FOLDER_PATH):
        os.makedirs(RESULTS_FOLDER_PATH)

    logfile = os.path.join(RESULTS_FOLDER_PATH, "log.log")
    if os.path.exists(logfile):
        os.remove(logfile)

    # logging stuff
    level = logging.INFO
    formats = {
        "console": '\u001b[37m %(message)s\033[0m',
        "file": '%(message)s'
    }

    file_handler, console_handler = logging.FileHandler(
        logfile, "x"), logging.StreamHandler()
    console_handler.setFormatter(logging.Formatter(formats["console"]))
    file_handler.setFormatter(logging.Formatter(formats["file"]))

    logger = logging.getLogger(__name__)
    logger.addHandler(console_handler)
    logger.addHandler(file_handler)
    logger.setLevel(level)

    # begin calc
    benign = datasets.load_all(os.path.join(
        "datasets", DATASET_NAME_2))  # load dataset from csv
    scareware = datasets.load_all(os.path.join(
        "datasets", DATASET_NAME))  # load dataset from csv
    logger.info("{} {}".format("benign shape", benign.shape))
    logger.info("{} {}".format("scareware shape", scareware.shape))

    benign = datasets.prepare_dataset(benign, shuffle=True)
    scareware = datasets.prepare_dataset(scareware, shuffle=True)

    n_elements = min(benign.shape[0], scareware.shape[0], 150000)

    benign = benign.head(n_elements)
    scareware = scareware.head(n_elements)

    logger.info("{} {}".format("benign shape after balancing", benign.shape))
    logger.info("{} {}".format("scareware shape after balancing",
                               scareware.shape))

    scareware["Label"] = DATASET_NAME.upper()

    loaded_dataset = pd.concat([benign, scareware],
                               ignore_index=True)  # union dataset
    logger.info(loaded_dataset.head())
    loaded_dataset.info()

    benign = None
    scareware = None

    logger.info("{} {}".format("Dataset shape BEFORE preparation",
                               loaded_dataset.shape))
    dataset = datasets.prepare_dataset(loaded_dataset,
                                       drop_columns=[
                                           "Flow Bytes/s", "Flow Packets/s",
                                           "Flow ID", "Source IP",
                                           "Destination IP", "Timestamp",
                                           "Fwd Header Length.1"
                                       ],
                                       shuffle=True,
                                       dropna_axis=True)

    loaded_dataset = None

    logger.info("{} {}".format("Dataset shape AFTER preparation",
                               dataset.shape))

    xTest, yTest = datasets.separate_labels(dataset, encode=True)

    dataset = None

    xTest = datasets.drop_variance(xTest)

    roc_auc_scores = []
    roc_fpr_tpr_thres = []

    # Estimators number test
    logger.info("Estimators number test")

    for i in range(4, 30, 4):
        n_estimators = i**2
        logger.info("Training random forest with {} estimators ({})".format(
            n_estimators, i))
        clf = RandomForestClassifier(
            n_estimators=n_estimators, n_jobs=-1,
            random_state=42)  # Random Forest Classifier
        roc, auc_score = random_forest.fit_and_roc(clf, xTest, yTest)
        save_result(roc, auc_score, "estimators", n_estimators,
                    roc_fpr_tpr_thres, roc_auc_scores)

    # Max depth number test
    roc_auc_scores = []
    roc_fpr_tpr_thres = []
    logger.info("max depth number test")
    for i in range(1, 11):
        max_depth = 2**i
        logger.info("Training random forest with {} max depth ({})".format(
            max_depth, i))
        rnd_forest = RandomForestClassifier(
            n_estimators=144, max_depth=max_depth, n_jobs=-1,
            random_state=42)  # Random Forest Classifier
        roc, auc_score = random_forest.fit_and_roc(rnd_forest, xTest, yTest)
        save_result(roc, auc_score, "max_depth", max_depth, roc_fpr_tpr_thres,
                    roc_auc_scores)

    # Min Sample Leaf number test
    roc_auc_scores = []
    roc_fpr_tpr_thres = []
    logger.info("Min Sample Leaf number test")
    for i in range(1, 11):
        min_sample_leaf = i
        logger.info(
            "Training random forest with {} min sample leaf ({})".format(
                min_sample_leaf, i))
        rnd_forest = RandomForestClassifier(
            n_estimators=144,
            max_depth=32,
            min_samples_leaf=min_sample_leaf,
            n_jobs=-1,
            random_state=42)  # Random Forest Classifier
        roc, auc_score = random_forest.fit_and_roc(rnd_forest, xTest, yTest)
        save_result(roc, auc_score, "min_sample_leaf", min_sample_leaf,
                    roc_fpr_tpr_thres, roc_auc_scores)

    roc_auc_scores, roc_fpr_tpr_thres = [], []
    xTest = None
    yTest = None
    file_handler.close()
    console_handler.close()
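random_forest.fit_and_roc is another project helper that is not included here. A minimal sketch of what it plausibly does (hold-out split, fit, ROC curve and AUC score), under that assumption:

from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.model_selection import train_test_split

def fit_and_roc_sketch(clf, X, y, test_size=0.2, random_state=42):
    # Hold out a validation split, fit the classifier and score it with a ROC curve.
    X_train, X_val, y_train, y_val = train_test_split(
        X, y, test_size=test_size, random_state=random_state, stratify=y)
    clf.fit(X_train, y_train)
    scores = clf.predict_proba(X_val)[:, 1]  # probability of the positive class
    fpr, tpr, thresholds = roc_curve(y_val, scores)
    return (fpr, tpr, thresholds), roc_auc_score(y_val, scores)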
Example 5
def calc():
    if not os.path.exists(RESULTS_FOLDER_PATH):
        os.makedirs(RESULTS_FOLDER_PATH)

    logfile = os.path.join(RESULTS_FOLDER_PATH, "log.log")
    if os.path.exists(logfile):
        os.remove(logfile)

    # logging stuff
    level = logging.INFO
    formats = {
        "console": '\u001b[37m %(message)s\033[0m',
        "file": '%(message)s'
    }

    file_handler, console_handler = logging.FileHandler(
        logfile, "x"), logging.StreamHandler()
    console_handler.setFormatter(logging.Formatter(formats["console"]))
    file_handler.setFormatter(logging.Formatter(formats["file"]))

    logger = logging.getLogger(__name__)
    logger.addHandler(console_handler)
    logger.addHandler(file_handler)
    logger.setLevel(level)

    # begin calc
    loaded_dataset = load_data()
    logger.info("{} {}".format("loaded_dataset shape", loaded_dataset.shape))
    logger.info(loaded_dataset['Label'].value_counts())

    # loaded_dataset["Label"] = DATASET_NAME.upper()

    logger.info(loaded_dataset.head())
    loaded_dataset.info()

    dataset = None

    logger.info("{} {}".format("Dataset shape BEFORE preparation",
                               loaded_dataset.shape))
    dataset = datasets.prepare_dataset(
        loaded_dataset,
        drop_columns=["Flow Bytes/s", "Flow Packets/s", "Fwd Header Length.1"],
        shuffle=True,
        dropna_axis=[0, 1])

    loaded_dataset = None

    logger.info("{} {}".format("Dataset shape AFTER preparation",
                               dataset.shape))

    xTest, yTest = datasets.separate_labels(dataset, encode=True)

    dataset = None

    xTest = datasets.drop_variance(xTest)
    standardScaler = StandardScaler()
    xTestScaled = standardScaler.fit_transform(xTest)

    results = []

    logger.info("Logistic Regression")

    param_name = "C"
    param_range = np.logspace(-1, 1, 10)
    log_reg = LogisticRegression(verbose=1,
                                 n_jobs=-1,
                                 max_iter=1000,
                                 solver="liblinear",
                                 penalty="l2",
                                 random_state=42)

    train_scores, val_scores = validation_curve(log_reg,
                                                xTest,
                                                yTest,
                                                param_name=param_name,
                                                param_range=param_range,
                                                cv=3,
                                                scoring="roc_auc",
                                                verbose=1,
                                                n_jobs=-1)

    results.append([param_name, param_range, train_scores, val_scores])
    datasets.np_double_save(results,
                            RESULTS_FOLDER_PATH,
                            "results",
                            as_csv=True,
                            as_npy=True)

    console_handler.close()
    file_handler.close()
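np_double_save writes the results both as .npy and .csv; assuming the .npy file is named after the "results" label passed to it, the saved curve can be reloaded and summarised later, for example:

import os
import numpy as np

saved = np.load(os.path.join(RESULTS_FOLDER_PATH, "results.npy"), allow_pickle=True)
for param_name, param_range, train_scores, val_scores in saved:
    # mean cross-validated ROC AUC per value of C
    print(param_name, np.mean(val_scores, axis=1))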
Example 6
def calc():
    if not os.path.exists(RESULTS_FOLDER_PATH):
        os.makedirs(RESULTS_FOLDER_PATH)

    logfile = os.path.join(RESULTS_FOLDER_PATH, "log.log")
    if os.path.exists(logfile):
        os.remove(logfile)

    # logging stuff
    level = logging.INFO
    formats = {"console": '\u001b[37m %(message)s\033[0m', "file": '%(message)s'}

    file_handler, console_handler = logging.FileHandler(logfile, "x"), logging.StreamHandler()
    console_handler.setFormatter(logging.Formatter(formats["console"]))
    file_handler.setFormatter(logging.Formatter(formats["file"]))

    logger = logging.getLogger(__name__)
    logger.addHandler(console_handler)
    logger.addHandler(file_handler)
    logger.setLevel(level)

    # begin calc
    loaded_dataset = load_data()
    logger.info("{} {}".format("loaded_dataset shape", loaded_dataset.shape))
    logger.info(loaded_dataset['class1'].value_counts())

    # loaded_dataset["Label"] = DATASET_NAME.upper()

    logger.info(loaded_dataset.head())
    loaded_dataset.info()

    dataset = None

    logger.info("{} {}".format("Dataset shape BEFORE preparation", loaded_dataset.shape))
    dataset = datasets.prepare_dataset(loaded_dataset,
                                       # drop_columns=["Flow Bytes/s", "Flow Packets/s", "Fwd Header Length.1"],
                                       shuffle=True, dropna_axis=[1])

    loaded_dataset = None

    logger.info("{} {}".format("Dataset shape AFTER preparation", dataset.shape))

    xTest, yTest = datasets.separate_labels(dataset, encode=True, column_name="class1")

    dataset = None

    xTest = datasets.drop_variance(xTest)
    standardScaler = StandardScaler()
    xTestScaled = standardScaler.fit_transform(xTest)
    results = []
    clf = DecisionTreeClassifier(random_state=42)

    param_name = "max_depth"
    param_range = [2**i for i in range(1, 11)]
    training_score, test_score = validation_curve(clf, xTest, yTest, param_name=param_name,
                                                  param_range=param_range,
                                                  scoring="roc_auc", cv=6, verbose=1, n_jobs=-1)

    results.append([param_name, param_range, training_score, test_score])
    datasets.np_double_save(results, RESULTS_FOLDER_PATH, "results", as_csv=True, as_npy=True)


    param_name = "min_samples_leaf"
    param_range = [i for i in range(1, 15)]
    logger.info(param_range)
    training_score, test_score = validation_curve(clf, xTest, yTest, param_name=param_name,
                                                  param_range=param_range,
                                                  scoring="roc_auc", cv=6, verbose=1, n_jobs=-1)

    results.append([param_name, param_range, training_score, test_score])
    datasets.np_double_save(results, RESULTS_FOLDER_PATH, "results", as_csv=True, as_npy=True)


    param_name = "max_features"
    param_range = [1/i for i in range(1, 11)]
    logger.info(param_range)
    training_score, test_score = validation_curve(clf, xTest, yTest, param_name=param_name,
                                                  param_range=param_range,
                                                  scoring="roc_auc", cv=6, verbose=1, n_jobs=-1)

    results.append([param_name, param_range, training_score, test_score])
    datasets.np_double_save(results, RESULTS_FOLDER_PATH, "results", as_csv=True, as_npy=True)

    clf = DecisionTreeClassifier(min_samples_leaf=10)
    train_sizes, train_scores, test_scores = learning_curve(clf, xTest, yTest, cv=6, n_jobs=-1,
                                                            train_sizes=np.linspace(0.1, 1, 10))

    results = [train_sizes, train_scores, test_scores]
    datasets.pk_save(results, RESULTS_FOLDER_PATH,
                     "learning_curves")

    console_handler.close()
    file_handler.close()
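The learning-curve arrays saved just above can be plotted directly; a minimal matplotlib sketch (illustrative, not the project's plotting code):

import numpy as np
import matplotlib.pyplot as plt

def plot_learning_curve_sketch(train_sizes, train_scores, test_scores):
    # learning_curve returns one row per training-set size and one column per CV fold
    plt.plot(train_sizes, np.mean(train_scores, axis=1), marker="o", label="training score")
    plt.plot(train_sizes, np.mean(test_scores, axis=1), marker="o", label="cross-validation score")
    plt.xlabel("training set size")
    plt.ylabel("score")
    plt.legend()
    plt.show()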
Example 7
def calc():
    if not os.path.exists(RESULTS_FOLDER_PATH):
        os.makedirs(RESULTS_FOLDER_PATH)

    logfile = os.path.join(RESULTS_FOLDER_PATH, "log.log")
    if os.path.exists(logfile):
        os.remove(logfile)

    # logging stuff
    level = logging.INFO
    formats = {
        "console": '\u001b[37m %(message)s\033[0m',
        "file": '%(message)s'
    }

    file_handler, console_handler = logging.FileHandler(
        logfile, "x"), logging.StreamHandler()
    console_handler.setFormatter(logging.Formatter(formats["console"]))
    file_handler.setFormatter(logging.Formatter(formats["file"]))

    logger = logging.getLogger(__name__)
    logger.addHandler(console_handler)
    logger.addHandler(file_handler)
    logger.setLevel(level)

    # begin calc
    loaded_dataset = load_data()
    logger.info("{} {}".format("loaded_dataset shape", loaded_dataset.shape))

    # loaded_dataset["Label"] = DATASET_NAME.upper()

    logger.info(loaded_dataset.head())
    loaded_dataset.info()
    logger.info(loaded_dataset['URL_Type_obf_Type'].value_counts())

    dataset = None

    logger.info("{} {}".format("Dataset shape BEFORE preparation",
                               loaded_dataset.shape))
    dataset = datasets.prepare_dataset(
        loaded_dataset,
        # drop_columns=["Flow Bytes/s", "Flow Packets/s", "Fwd Header Length.1"],
        shuffle=True,
        dropna_axis=[0, 1])

    loaded_dataset = None

    logger.info("{} {}".format("Dataset shape AFTER preparation",
                               dataset.shape))

    xTest, yTest = datasets.separate_labels(dataset,
                                            encode=True,
                                            column_name="URL_Type_obf_Type")

    dataset = None

    xTest = datasets.drop_variance(xTest)
    standardScaler = StandardScaler()
    xTestScaled = standardScaler.fit_transform(xTest)

    results_array = []

    logger.info("Logistic Regression")
    log_reg = LogisticRegression(verbose=1, n_jobs=-1, max_iter=1000)
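    # The original snippet stops here without using log_reg; the lines below are
    # an assumed continuation that mirrors the evaluation pattern of Example 3.
    results = scoring.cross_validate_scoring(log_reg,
                                             xTestScaled,
                                             yTest,
                                             cv=10,
                                             scoring=['roc_auc', 'f1', 'precision', 'recall'],
                                             return_train_score=True)
    logger.info(results)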

    console_handler.close()
    file_handler.close()