Example #1
def load_data():
    loaded_dataset = datasets.load_all(os.path.join("datasets"))  # load dataset from csv
    tor = loaded_dataset[loaded_dataset.class1 == "TOR"]
    nontor = loaded_dataset[loaded_dataset.class1 == "NONTOR"]
    print(tor, nontor)  # quick sanity check of the two class splits

    tor_upsample = resample(tor,
                            replace=True,     # sample with replacement
                            n_samples=nontor.shape[0],    # upsample TOR to match the NONTOR majority
                            random_state=42)
    return pd.concat([tor_upsample, nontor], ignore_index=True)
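For reference, load_all is a project-local helper rather than a library call. A minimal sketch of what it presumably does, assuming it simply reads and concatenates every CSV file under the given folder (the glob pattern and the ignore_index choice are assumptions, not the project's actual code):

import glob
import os

import pandas as pd


def load_all(folder):
    # Hypothetical reimplementation: read every *.csv under `folder`
    # and stack the rows into one DataFrame.
    paths = glob.glob(os.path.join(folder, "*.csv"))
    frames = [pd.read_csv(path) for path in paths]
    return pd.concat(frames, ignore_index=True)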
Example #2
def load_data():
    # data = datasets.load_all(os.path.join("datasets"))  # load dataset from csv
    # ddos = data[data.Label != "BENIGN"]
    # benign = data[data.Label == "BENIGN"]
    # ddos['Label'] = "DDoS"
    #
    # subsample = resample(benign,
    #                      replace=True,
    #                      n_samples=ddos.shape[0],
    #                      random_state=42)
    #
    # return pd.concat([ddos, subsample], ignore_index=True)
    return datasets.load_all(os.path.join("datasets"))  # load dataset from csv
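The commented-out block above shrinks the BENIGN class down to the DDoS row count but passes replace=True; if BENIGN really is the larger class, as the naming suggests, replace=False would sample without duplicates, which is the usual choice when downsampling. A self-contained toy comparison of the two modes (the DataFrame is invented for illustration):

import pandas as pd
from sklearn.utils import resample

toy = pd.DataFrame({"Label": ["BENIGN"] * 6 + ["DDoS"] * 2})
benign = toy[toy.Label == "BENIGN"]
ddos = toy[toy.Label == "DDoS"]

# Downsampling the majority: replace=False keeps each row at most once.
benign_down = resample(benign, replace=False, n_samples=len(ddos), random_state=42)

# Upsampling the minority: replace=True duplicates rows as needed.
ddos_up = resample(ddos, replace=True, n_samples=len(benign), random_state=42)

print(len(benign_down), len(ddos_up))  # 2 6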
Example #3
def load_data(logger):
    benign = datasets.load_all(os.path.join(
        "datasets", DATASET_NAME_2))  # load dataset from csv
    ransomware = datasets.load_all(os.path.join(
        "datasets", DATASET_NAME))  # load dataset from csv
    logger.info("{} {}".format("benign shape", benign.shape))
    logger.info("{} {}".format("ransomware shape", ransomware.shape))

    benign = datasets.prepare_dataset(benign, shuffle=True)
    ransomware = datasets.prepare_dataset(ransomware, shuffle=True)

    n_elements = min(benign.shape[0], ransomware.shape[0], 150000)

    benign = benign.head(n_elements)
    ransomware = ransomware.head(n_elements)

    logger.info("{} {}".format("benign shape after balancing", benign.shape))
    logger.info("{} {}".format("ransomware shape after balancing",
                               ransomware.shape))

    ransomware["Label"] = DATASET_NAME.upper()

    return pd.concat([benign, ransomware], ignore_index=True)  # union dataset
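prepare_dataset is another project-local helper whose implementation is not shown in these examples. The sketch below is one plausible reading inferred from the call sites (the drop_columns, shuffle, and dropna_axis parameter names come from the calls above; everything else is an assumption):

import numpy as np
import pandas as pd


def prepare_dataset(df, drop_columns=None, shuffle=False, dropna_axis=False):
    # Hypothetical reimplementation inferred from how it is called above.
    if drop_columns:
        df = df.drop(columns=drop_columns, errors="ignore")
    if dropna_axis:
        # Replace infinities with NaN, then drop incomplete rows.
        df = df.replace([np.inf, -np.inf], np.nan).dropna()
    if shuffle:
        df = df.sample(frac=1, random_state=42).reset_index(drop=True)
    return df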
Example #4
def load_data():
    data = datasets.load_all(os.path.join("datasets"))  # load dataset from csv
    adware = data[data.calss == "asware"]  # "calss"/"asware" presumably mirror misspellings in the source CSV
    benign = data[data.calss == "benign"]

    n_samples = min(adware.shape[0], benign.shape[0], 100000)

    benign = resample(benign,
                      replace=False,
                      n_samples=n_samples,
                      random_state=42)

    adware = resample(adware,
                      replace=False,
                      n_samples=n_samples,
                      random_state=42)

    return pd.concat([adware, benign], ignore_index=True)
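The balancing pattern here, downsampling both classes without replacement to min(class sizes, cap), yields a perfectly balanced dataset of at most 2 * cap rows. A self-contained toy run of the same idea (data and label names are invented):

import pandas as pd
from sklearn.utils import resample

toy = pd.DataFrame({"label": ["attack"] * 30 + ["benign"] * 500})
attack = toy[toy.label == "attack"]
benign = toy[toy.label == "benign"]

n_samples = min(len(attack), len(benign), 100)  # 30 here

balanced = pd.concat(
    [
        resample(attack, replace=False, n_samples=n_samples, random_state=42),
        resample(benign, replace=False, n_samples=n_samples, random_state=42),
    ],
    ignore_index=True,
)
print(balanced.label.value_counts())  # 30 rows of each class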
Example #5
def load_data():
    return datasets.load_all(os.path.join("datasets"))  # load dataset from csv
Example #6
def calc():
    if not os.path.exists(RESULTS_FOLDER_PATH):
        os.makedirs(RESULTS_FOLDER_PATH)

    logfile = os.path.join(RESULTS_FOLDER_PATH, "log.log")
    if os.path.exists(logfile):
        os.remove(logfile)

    # logging setup: mirror messages to the console (dim white) and to the log file
    level = logging.INFO
    formats = {
        "console": '\u001b[37m %(message)s\033[0m',
        "file": '%(message)s'
    }

    file_handler = logging.FileHandler(logfile, "x")
    console_handler = logging.StreamHandler()
    console_handler.setFormatter(logging.Formatter(formats["console"]))
    file_handler.setFormatter(logging.Formatter(formats["file"]))

    logger = logging.getLogger(__name__)
    logger.addHandler(console_handler)
    logger.addHandler(file_handler)
    logger.setLevel(level)

    # begin calc
    benign = datasets.load_all(os.path.join(
        "datasets", DATASET_NAME_2))  # load dataset from csv
    scareware = datasets.load_all(os.path.join(
        "datasets", DATASET_NAME))  # load dataset from csv
    logger.info("{} {}".format("benign shape", benign.shape))
    logger.info("{} {}".format("scareware shape", scareware.shape))

    benign = datasets.prepare_dataset(benign, shuffle=True)
    scareware = datasets.prepare_dataset(scareware, shuffle=True)

    n_elements = min(benign.shape[0], scareware.shape[0], 150000)

    benign = benign.head(n_elements)
    scareware = scareware.head(n_elements)

    logger.info("{} {}".format("benign shape after balancing", benign.shape))
    logger.info("{} {}".format("scareware shape after balancing",
                               scareware.shape))

    scareware["Label"] = DATASET_NAME.upper()

    loaded_dataset = pd.concat([benign, scareware],
                               ignore_index=True)  # union dataset
    logger.info(loaded_dataset.head())
    loaded_dataset.info()

    # drop the per-class frames so their memory can be reclaimed
    benign = None
    scareware = None

    logger.info("{} {}".format("Dataset shape BEFORE preparation",
                               loaded_dataset.shape))
    dataset = datasets.prepare_dataset(loaded_dataset,
                                       drop_columns=[
                                           "Flow Bytes/s", "Flow Packets/s",
                                           "Flow ID", "Source IP",
                                           "Destination IP", "Timestamp",
                                           "Fwd Header Length.1"
                                       ],
                                       shuffle=True,
                                       dropna_axis=True)

    loaded_dataset = None

    logger.info("{} {}".format("Dataset shape AFTER preparation",
                               dataset.shape))

    xTest, yTest = datasets.separate_labels(dataset, encode=True)

    dataset = None

    xTest = datasets.drop_variance(xTest)

    roc_auc_scores = []
    roc_fpr_tpr_thres = []

    # Estimators number test
    logger.info("Estimators number test")

    for i in range(4, 30, 4):
        n_estimators = i**2
        logger.info("Training random forest with {} estimators ({})".format(
            n_estimators, i))
        clf = RandomForestClassifier(
            n_estimators=n_estimators, n_jobs=-1,
            random_state=42)  # Random Forest Classifier
        roc, auc_score = random_forest.fit_and_roc(clf, xTest, yTest)
        save_result(roc, auc_score, "estimators", n_estimators,
                    roc_fpr_tpr_thres, roc_auc_scores)

    # Max depth number test
    roc_auc_scores = []
    roc_fpr_tpr_thres = []
    logger.info("max depth number test")
    for i in range(1, 11):
        max_depth = 2**i
        logger.info("Training random forest with {} max depth ({})".format(
            max_depth, i))
        rnd_forest = RandomForestClassifier(
            n_estimators=144, max_depth=max_depth, n_jobs=-1,
            random_state=42)  # Random Forest Classifier
        roc, auc_score = random_forest.fit_and_roc(rnd_forest, xTest, yTest)
        save_result(roc, auc_score, "max_depth", max_depth, roc_fpr_tpr_thres,
                    roc_auc_scores)

    # Min Sample Leaf number test
    roc_auc_scores = []
    roc_fpr_tpr_thres = []
    logger.info("Min Sample Leaf number test")
    for i in range(1, 11):
        min_sample_leaf = i
        logger.info(
            "Training random forest with {} min sample leaf ({})".format(
                min_sample_leaf, i))
        rnd_forest = RandomForestClassifier(
            n_estimators=144,
            max_depth=32,
            min_samples_leaf=min_sample_leaf,
            n_jobs=-1,
            random_state=42)  # Random Forest Classifier
        roc, auc_score = random_forest.fit_and_roc(rnd_forest, xTest, yTest)
        save_result(roc, auc_score, "min_sample_leaf", min_sample_leaf,
                    roc_fpr_tpr_thres, roc_auc_scores)

    roc_auc_scores, roc_fpr_tpr_thres = [], []
    xTest = None
    yTest = None
    file_handler.close()
    console_handler.close()
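calc() relies on three more project-local helpers whose code is not shown: datasets.separate_labels, datasets.drop_variance, and random_forest.fit_and_roc (save_result is likewise absent). The sketch below gives one plausible reading of each, built only on standard scikit-learn calls; the split ratio, variance threshold, and binary-label assumption are mine, not the project's:

import pandas as pd
from sklearn.feature_selection import VarianceThreshold
from sklearn.metrics import auc, roc_curve
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder


def separate_labels(df, encode=False):
    # Split features from the "Label" column; optionally encode labels as ints.
    y = df["Label"]
    if encode:
        y = LabelEncoder().fit_transform(y)
    return df.drop(columns=["Label"]), y


def drop_variance(X, threshold=0.0):
    # Drop constant (zero-variance) feature columns.
    selector = VarianceThreshold(threshold)
    return pd.DataFrame(selector.fit_transform(X),
                        columns=X.columns[selector.get_support()])


def fit_and_roc(clf, X, y):
    # Hold out a test split, fit the classifier, and compute the ROC curve
    # from positive-class probabilities (assumes a binary problem).
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.25, random_state=42, stratify=y)
    clf.fit(X_train, y_train)
    scores = clf.predict_proba(X_test)[:, 1]
    fpr, tpr, thresholds = roc_curve(y_test, scores)
    return (fpr, tpr, thresholds), auc(fpr, tpr)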