Example #1
def train_model(do_optimize=False, data=pd.DataFrame()):
    log_module_start(MODULE_NAME=MODEL_NAME)
    pd.set_option('display.max_columns', None)

    if len(data) == 0:
        data = pd.read_csv(DATA_PATH + LEXICAL_FEATURE_DATABASE)
        data = transform_data(data)

    data = data.replace("True", 1)
    data = data.replace("False", 0)
    y = data['Label'].copy()
    x = data.drop(["Label"], axis=1).copy()
    train, test = train_test_split(data, test_size=0.35)
    log(action_logging_enum=INFO,
        logging_text="[K-NEAREST NEIGHBOR] Data ready for use.")

    y_train = train['Label'].copy()
    x_train = train.drop(["Label"], axis=1).copy()

    params = {'n_neighbors': 9}

    knn = KNeighborsClassifier(**params)
    f1 = print_scores(knn, x, y)
    log(action_logging_enum=INFO,
        logging_text="[K-NEAREST NEIGHBOR] Starting training.")

    knn.fit(x_train, y_train)
    save_model(knn=knn)
    log_module_complete(MODULE_NAME=MODEL_NAME)

    return f1
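The params dict above is forwarded with ** unpacking so the tuned n_neighbors actually reaches the classifier. A minimal self-contained sketch of that pattern, with synthetic data standing in for the lexical feature database and cross_val_score standing in for the module's print_scores helper (both stand-ins are assumptions, not part of the original code):

# Sketch only: make_classification replaces the real feature CSV.
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier

x, y = make_classification(n_samples=1000, n_features=20, random_state=0)
params = {'n_neighbors': 9}           # sklearn expects 'n_neighbors', not 'n_neighbor'
knn = KNeighborsClassifier(**params)  # dict unpacking forwards the tuned value
print(cross_val_score(knn, x, y, cv=5, scoring='f1').mean())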
Example #2
def train_model(do_optimize=False, data=pd.DataFrame()):
    log_module_start(MODULE_NAME=MODEL_NAME)

    if len(data) == 0:
        data = pd.read_csv(DATA_PATH + LEXICAL_FEATURE_DATABASE)
        data = transform_data(data)

    train, test = train_test_split(data, test_size=0.2)
    pd.set_option('display.max_columns', None)
    y = data['Label']
    x = data.drop(['Label'], axis=1).values
    log(action_logging_enum=INFO,
        logging_text="[ADAPTIVE BOOSTING] Data ready for use.")

    if do_optimize:
        optimize()

    params = {'n_estimators': 120, 'random_state': 0}

    log(action_logging_enum=INFO,
        logging_text="[ADAPTIVE BOOSTING] Starting training.")
    adaptive_boosting = AdaBoostClassifier(**params)
    f1 = print_scores(adaptive_boosting, x, y)

    y_train = train['Label']
    x_train = train.drop(['Label'], axis=1)

    adaptive_boosting.fit(x_train, y_train)
    save_model(adaptive_boosting=adaptive_boosting)
    log_module_complete(MODULE_NAME=MODEL_NAME)

    return f1
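Since train_test_split already produces a held-out test set, the F1 can also be computed on data the model never saw during fitting. A short sketch of that evaluation, assuming a generic feature frame with a 'Label' column (print_scores is not shown in the source, so f1_score is used directly):

# Sketch only: a synthetic frame stands in for the lexical feature database.
import pandas as pd
from sklearn.datasets import make_classification
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split

xs, ys = make_classification(n_samples=1000, random_state=0)
data = pd.DataFrame(xs)
data['Label'] = ys

train, test = train_test_split(data, test_size=0.2, random_state=0)
model = AdaBoostClassifier(n_estimators=120, random_state=0)
model.fit(train.drop(['Label'], axis=1), train['Label'])
print(f1_score(test['Label'], model.predict(test.drop(['Label'], axis=1))))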
Example #3
def train_model(do_optimize=False, data=pd.DataFrame()):
    log_module_start(MODULE_NAME=MODEL_NAME)

    if len(data) == 0:
        return None

    data = transform_data(data)

    train, test = train_test_split(data, test_size=0.2)
    pd.set_option('display.max_columns', None)
    y = data['Label']
    x = data.drop(['Label'], axis=1).values
    log(action_logging_enum=INFO, logging_text="[RANDOM FOREST] Data ready for use.")

    if do_optimize:
        optimize()

    params = {
        'n_estimators': 800,
        'max_features': 6,
        'max_depth': 21,
        'min_samples_leaf': 1,
        'min_samples_split': 4
    }

    log(action_logging_enum=INFO, logging_text="[RANDOM FOREST] Starting training.")
    random_forest = RandomForestClassifier(n_estimators=700, max_features='sqrt', min_samples_leaf=2, min_samples_split=3)
    f1 = print_scores(random_forest, x, y)

    y_train = train['Label']
    x_train = train.drop(['Label'], axis=1).values
    random_forest.fit(x_train, y_train)
    save_model(random_forest=random_forest)
    log_module_complete(MODULE_NAME=MODEL_NAME)

    return f1
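The optimize() helper is referenced but its body is not shown in the source. A hypothetical sketch of what such a search could look like, using RandomizedSearchCV over ranges implied by the params dict above (every range here is an assumption; the real implementation may differ):

# Hypothetical optimize(); not the module's actual code.
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV

def optimize(x, y):
    grid = {
        'n_estimators': [400, 800, 1400],
        'max_features': ['sqrt', 6],
        'max_depth': [17, 21, None],
        'min_samples_leaf': [1, 2, 5],
        'min_samples_split': [2, 4, 10],
    }
    search = RandomizedSearchCV(RandomForestClassifier(), grid, n_iter=20,
                                cv=3, scoring='f1', random_state=0)
    search.fit(x, y)
    return search.best_params_

x, y = make_classification(n_samples=500, random_state=0)
print(optimize(x, y))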
Example #4
def train_model(do_optimize=False, data=pd.DataFrame()):

    # log training starting
    log_module_start(MODULE_NAME=MODEL_NAME)
    # read data and split into test and train

    if len(data) == 0:
        data = pd.read_csv(DATA_PATH + CONTENT_FEATURE_DATABASE)
        # transform data
        data = transform_data(data)

    train, test = train_test_split(data, test_size=0.2)
    pd.set_option('display.expand_frame_repr', False)
    # display all columns with head()
    pd.set_option('display.max_columns', None)
    y = data['Label']
    x = data.drop(['Label'], axis=1).values
    log(action_logging_enum=INFO,
        logging_text="[DECISION TREE]: Data ready for use.")

    # divide data to inputs (x) and labels (y)
    y_train = train['Label']
    x_train = train.drop(['Label'], axis=1).values

    if do_optimize:
        optimize()

    params = {
        'min_samples_split': 3,
        'min_samples_leaf': 1,
        'random_state': 42,
        'class_weight': 'balanced'
    }

    log(action_logging_enum=INFO,
        logging_text="[DECISION TREE]: Starting training.")
    # create classifier with specifications
    decision_tree = tree.DecisionTreeClassifier(**params)
    f1 = print_scores(decision_tree, x, y)

    decision_tree.fit(x_train, y_train)
    save_model(decision_tree=decision_tree)
    # log train complete
    log_module_complete(MODULE_NAME=MODEL_NAME)

    return f1
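The params above set class_weight='balanced', which reweights samples inversely to class frequency. A small sketch of its effect on a deliberately imbalanced synthetic set (the data and metric are stand-ins, not the module's own):

# Sketch only: 90/10 class imbalance makes the weighting visible.
from sklearn import tree
from sklearn.datasets import make_classification
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split

x, y = make_classification(n_samples=2000, weights=[0.9, 0.1], random_state=42)
x_tr, x_te, y_tr, y_te = train_test_split(x, y, random_state=42)
for weight in (None, 'balanced'):
    clf = tree.DecisionTreeClassifier(min_samples_split=3, min_samples_leaf=1,
                                      random_state=42, class_weight=weight)
    clf.fit(x_tr, y_tr)
    print(weight, round(f1_score(y_te, clf.predict(x_te)), 3))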
Example #5
def train_model(do_optimize=False, data=pd.DataFrame()):
    log_module_start(MODULE_NAME=MODEL_NAME)

    if len(data) == 0:
        data = pd.read_csv(DATA_PATH + CONTENT_FEATURE_DATABASE)
        data = transform_data(data)

    y = data['Label']
    x = data.drop(['Label'], axis=1).values
    train, test = train_test_split(data, test_size=0.2)

    pd.set_option('display.max_columns', None)

    if do_optimize:
        optimize()

    log(action_logging_enum=INFO, logging_text="[SUPPORT VECTOR MACHINE] Data ready for use.")

    # support vector machine
    g = 0.1
    c = 0.1

    params = {
        'kernel': 'linear',
        'random_state': 0,
        'gamma': g,
        'C': c
    }

    support_vector_machine = SVC(**params)  # note: gamma is ignored by the linear kernel
    log(action_logging_enum=INFO, logging_text="[SUPPORT VECTOR MACHINE] Starting training.")
    f1 = print_scores(support_vector_machine=support_vector_machine, x=x, y=y)

    y_train = train['Label']
    x_train = train.drop(['Label'], axis=1).values
    support_vector_machine.fit(x_train, y_train)
    save_model(support_vector_machine=support_vector_machine)
    log_module_complete(MODULE_NAME=MODEL_NAME)

    return f1
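Two details worth noting: gamma only affects the 'rbf', 'poly' and 'sigmoid' kernels, so a linear-kernel SVC ignores it, and SVMs are sensitive to feature scale. A sketch (synthetic data, not the module's pipeline) that puts a scaler in front of the SVC so the same scaling is applied at both fit and predict time:

# Sketch only: make_pipeline bundles scaler + SVM into one estimator.
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import SVC

x, y = make_classification(n_samples=500, random_state=0)
model = make_pipeline(MinMaxScaler(), SVC(kernel='linear', C=0.1, random_state=0))
print(cross_val_score(model, x, y, cv=5, scoring='f1').mean())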
Example #6
def train_model(do_optimize=False, data=pd.DataFrame()):
    log_module_start(MODULE_NAME=MODEL_NAME)
    pd.set_option('display.max_columns', None)

    if len(data) == 0:
        data = pd.read_csv(DATA_PATH + LEXICAL_FEATURE_DATABASE)
        data = transform_data(data)

    y = data['Label']
    x = data.drop(["Label"], axis=1)
    train, test = train_test_split(data, test_size=0.2)
    log(action_logging_enum=INFO, logging_text="[LOGISTIC REGRESSION]: Data ready for use.")

    params = {
        'random_state': 1,
        'C': 0.1
    }

    logistic_regression = LogisticRegression(**params)
    f1 = print_scores(logistic_regression, x, y)

    if do_optimize:
        optimize()

    log(action_logging_enum=INFO, logging_text="[LOGISTIC REGRESSION]: Starting training.")

    y_train = train['Label']
    x_train = train.drop(["Label"], axis=1)
    scaler = MinMaxScaler(feature_range=(0, 1))
    scaler.fit(x_train)
    x_train = pd.DataFrame(scaler.transform(x_train), columns=x_train.columns)
    logistic_regression.fit(x_train, y_train)
    save_model(logistic_regression=logistic_regression)

    log_module_complete(MODULE_NAME=MODEL_NAME)

    return f1
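This snippet fits a MinMaxScaler on the training data, but save_model persists only the classifier, so the same scaling has to be reproduced at prediction time. A sketch of one way to keep the two together, assuming save_model does not already handle this (joblib and the filename are illustrative, not from the source):

# Sketch only: a Pipeline makes the scaler travel with the model.
import joblib
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MinMaxScaler

x, y = make_classification(n_samples=500, random_state=1)
model = make_pipeline(MinMaxScaler(feature_range=(0, 1)),
                      LogisticRegression(random_state=1, C=0.1))
model.fit(x, y)
joblib.dump(model, 'logistic_regression.joblib')  # hypothetical path
print(joblib.load('logistic_regression.joblib').predict(x[:5]))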
Example #7
def run(content=False, lexical=False, signature=False, val_sets=False):

    # log module start
    log_module_start(MODULE_NAME=MODULE_NAME)

    content_feature_list = []
    lexical_feature_list = []

    # generate validation sets of all three databases
    if val_sets:
        generate_validation_sets()
        return

    # open data file and write to list (created in component database)
    data_list = open_dataset_XML_file(filename=DATABASE,
                                      iterateable="entry",
                                      label_label="label",
                                      url_label="url")

    if data_list is None:
        log(action_logging_enum=WARNING,
            logging_text=
            "[MODULE FEATURE EXTRACTION]: Dataset file was not found. Returning ..."
            )
        return

    # binarize labels
    data_list = binarize_labels(data_list)
    log(action_logging_enum=INFO,
        logging_text="[MODULE FEATURE EXTRACTION]: Labels binarized")

    # create feature_list with FeatureEntries for all urls in list
    if lexical:
        # create lexical feature list
        lexical_feature_list = f.extract_features_from_URL_list(data=data_list)

    if content:
        # extract content based features using ray
        # the list is saved after every 1000 entries since the extraction takes about 2 hours

        process = True
        index = 6000      # resume position from an earlier run
        append = False
        last_index = 5967

        if index == 0:
            delete_data(filename=CONTENT_FEATURE_DATABASE)

        ray.init(num_cpus=6)
        while process:

            end_index = index + 1000

            if end_index >= len(data_list):
                end_index = len(data_list)  # slice end is exclusive; keep the last entry
                process = False

            if index > 0:
                append = True

            copy_data = data_list[index:end_index]

            content_feature_list = f.extract_features_from_website_list_ray(
                data=copy_data)

            if not content_feature_list:
                log(
                    ERROR,
                    "[MODULE FEATURE EXTRACTION]: Error while creating feature list for content filter. The list is empty"
                )
                process = False
                break
            last_index += 1
            last_index = write_content_features_CSV(
                feature_list=content_feature_list,
                append=append,
                new_index=last_index)
            log(
                INFO,
                "[MODULE FEATURE EXTRACTION]: Feature list for content filter was written."
            )

            index += 1000
            log(
                INFO,
                "[MODULE FEATURE EXTRACTION]: Continuing with next batch (start index: {}).".format(index))

        ray.shutdown()

    if signature:
        # extract features for signature based filter

        ray.init(num_cpus=6)
        signature_feature_list = f.extract_features_from_signature_list(
            data=data_list)
        ray.shutdown()
        write_signature_features_CSV(feature_list=signature_feature_list)

    # feature extraction completed
    log(action_logging_enum=INFO,
        logging_text=
        "[MODULE FEATURE EXTRACTION]: Feature extraction completed.")

    # check whether the list has entries
    if len(lexical_feature_list) > 0 and lexical:
        log(
            INFO,
            "[MODULE FEATURE EXTRACTION]: Feature list for lexical filter successfully created."
        )

        delete_data(filename=LEXICAL_FEATURE_DATABASE)

        # write lexical_feature_list to csv file
        write_lexical_features_CSV(feature_list=lexical_feature_list)

    elif lexical:
        log(
            ERROR,
            "[MODULE FEATURE EXTRACTION]: Error while creating feature list for lexical filter. The list is empty"
        )

    # log module completion
    log_module_complete(MODULE_NAME=MODULE_NAME)
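The content branch above processes the URL list in batches of 1000 and checkpoints each batch to CSV, so a crash costs at most one batch of the roughly two-hour extraction. A stripped-down sketch of that pattern with ray (extract_one is a hypothetical stand-in for the module's per-URL feature extraction):

# Sketch only: placeholder work instead of real feature extraction.
import ray

@ray.remote
def extract_one(url):
    return {'url': url, 'feature': len(url)}  # placeholder feature

ray.init(num_cpus=6)
urls = ["https://example{}.com".format(i) for i in range(2500)]
for start in range(0, len(urls), 1000):
    batch = urls[start:start + 1000]
    features = ray.get([extract_one.remote(u) for u in batch])
    # ...append `features` to the CSV here so progress survives a crash
    print("batch starting at {}: {} entries".format(start, len(features)))
ray.shutdown()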
Example #8
def train_model(do_optimize=False, data=pd.DataFrame()):
    log_module_start(MODULE_NAME=MODEL_NAME)

    if len(data) == 0:
        data = pd.read_csv(DATA_PATH + CONTENT_FEATURE_DATABASE)
        data = transform_data(data)

    # cast boolean feature columns to 0/1 (np.bool is removed in recent numpy)
    for col in data.columns:
        if data[col].dtype == bool and col != "Label":
            data[col] = data[col].astype(int)

    y = data['Label']
    x = data.drop(['Label'], axis=1).values
    train, test = train_test_split(data, test_size=0.2)

    train_y = train['Label']
    train_x = train.drop(['Label'], axis=1).values
    test_y = test['Label']
    test_x = test.drop(['Label'], axis=1).values

    pd.set_option('display.max_columns', None)
    log(action_logging_enum=INFO,
        logging_text="[EXTREME GRADIENT BOOSTING] Data ready for use.")

    if do_optimize:
        optimize(train_x, train_y, test_x, test_y)

    params = {
        'silent': False,             # deprecated in recent xgboost; verbosity replaces it
        'scale_pos_weight': 1,
        'use_label_encoder': False,  # deprecated in recent xgboost
        'learning_rate': 0.04,
        'colsample_bytree': 0.7,
        'subsample': 0.7,
        'n_estimators': 700,
        'reg_alpha': 0.3,
        'max_depth': 5,
        'gamma': 10
    }

    extreme_gradient = XGBClassifier(**params, enable_categorical=True)

    f1 = print_scores(extreme_gradient, x, y)

    y_train = train['Label']
    x_train = train.drop(['Label'], axis=1)

    log(action_logging_enum=INFO,
        logging_text="[EXTREME GRADIENT BOOSTING] Starting training.")
    extreme_gradient.fit(x_train, y_train)
    save_model(extreme_gradient=extreme_gradient)

    log_module_complete(MODULE_NAME=MODEL_NAME)

    return f1
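The column-by-column bool cast at the top of this example can also be done in one vectorized step. A small sketch, assuming a generic frame with a 'Label' column:

# Sketch only: a tiny frame instead of the content feature database.
import pandas as pd

data = pd.DataFrame({'Has iFrame': [True, False], 'Label': [1, 0]})
bool_cols = data.drop(columns=['Label']).select_dtypes(include='bool').columns
data[bool_cols] = data[bool_cols].astype(int)
print(data.dtypes)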
Example #9
def train_model(do_optimize=False, data=pd.DataFrame()):
    log_module_start(MODULE_NAME=MODEL_NAME)

    if len(data) == 0:
        data = pd.read_csv(DATA_PATH + LEXICAL_FEATURE_DATABASE)
        data = transform_data(data)

    data = data[[
        'Label', 'Entropy', 'Ratio Netloc/URL', 'Length URL',
        'Ratio Digit/Letter', 'Ratio Path/URL', 'Has HTTPS', 'Length Netloc',
        'KL Divergence', 'Ratio Vowel/Consonant', 'Number Symbols',
        'Number Dots', 'Number Tokens Netloc', 'Number Digits Path',
        'Ratio Cap/NonCap', 'Number Dash', 'Number Dash Netloc',
        'Has Token Netloc', 'Number Slash Path', 'Ratio Query/URL',
        'Number Digits Netloc', 'Number Redirects', 'Number PhishyTokens Path',
        'Has Digits Netloc', 'Number Query Parameters', 'Number Dots Netloc',
        'Has Query', 'Number Equals', 'Number Semicolon', 'Number Ampersand',
        'Cert Created Shortly', 'Number Stars'
    ]]
    train, test = train_test_split(data, test_size=0.2)
    pd.set_option('display.max_columns', None)
    y = data['Label']
    x = data.drop(['Label'], axis=1)
    log(action_logging_enum=INFO,
        logging_text="[RANDOM FOREST] Data ready for use.")

    if do_optimize:
        optimize()

    params = {
        'n_estimators': 800,
        'max_features': 6,
        'max_depth': 21,
        'min_samples_leaf': 1,
        'min_samples_split': 4
    }

    log(action_logging_enum=INFO,
        logging_text="[RANDOM FOREST] Starting training.")
    random_forest = RandomForestClassifier(**params)
    f1 = print_scores(random_forest, x, y)

    y_train = train['Label']
    x_train = train.drop(['Label'], axis=1).values
    random_forest.fit(x_train, y_train)
    save_model(random_forest=random_forest)
    log_module_complete(MODULE_NAME=MODEL_NAME)

    return f1
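With named feature columns like the list above, the fitted forest's impurity-based importances can be ranked per feature. A sketch on synthetic data (the column names are placeholders, not the module's):

# Sketch only: pairs feature_importances_ with column names.
import pandas as pd
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier

xs, ys = make_classification(n_samples=500, n_features=8, random_state=0)
x = pd.DataFrame(xs, columns=["feat_{}".format(i) for i in range(8)])
forest = RandomForestClassifier(n_estimators=100, random_state=0).fit(x, ys)
print(pd.Series(forest.feature_importances_, index=x.columns)
        .sort_values(ascending=False).head())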
Example #10
def train_model(do_optimize=False, data=pd.DataFrame()):
    log_module_start(MODULE_NAME=MODEL_NAME)

    if len(data) == 0:
        data = pd.read_csv(DATA_PATH + CONTENT_FEATURE_DATABASE)
        data = transform_data(data)

    data = data[[
        'Label', 'Ratio Similarity', 'Ratio Description Sim', 'Number HREF',
        'Number DIV', 'Number LI', 'Ratio Title Sim', 'Number Span',
        'Number UL', 'Has Bond Status', 'Number Image', 'Ratio Copyright Sim',
        'Number PhishyTokens', 'Number Extern Links', 'Number Button',
        'Number Inputs', 'Number Paragr', 'Ratio Unique Links',
        'Has Freq Domain Extern', 'Has Copyright', 'Has Button',
        'Has Redirect', 'Has iFrame', 'Has Extern Content', 'Has Meta',
        'Has Input', 'Number Option', 'Has Action', 'Number OL', 'Number TR',
        'Has Hidden Element', 'Number Checkbox'
    ]]
    train, test = train_test_split(data, test_size=0.2)
    pd.set_option('display.max_columns', None)
    y = data['Label']
    x = data.drop(['Label'], axis=1)
    log(action_logging_enum=INFO,
        logging_text="[RANDOM FOREST] Data ready for use.")

    if do_optimize:
        optimize()

    # results of a previous hyperparameter search:
    #   best n_estimators: 1400, min_samples_leaf: 1, min_samples_split: 2,
    #   max_features: sqrt, max_depth: 21

    params = {
        'n_estimators': 1400,
        'max_features': 'sqrt',
        'max_depth': 20,
        'min_samples_leaf': 2,
        'min_samples_split': 4
    }

    log(action_logging_enum=INFO,
        logging_text="[RANDOM FOREST] Starting training.")
    random_forest = RandomForestClassifier(**params)
    f1 = print_scores(random_forest, x, y)

    y_train = train['Label']
    x_train = train.drop(['Label'], axis=1)
    random_forest.fit(x_train, y_train)
    save_model(random_forest=random_forest)
    log_module_complete(MODULE_NAME=MODEL_NAME)

    return f1
Example #11
def run(do_download_alexa=False, do_download_phish=False, do_query_alexa=False, check_status_phishing=False, check_status_benign=False):

    log_module_start(MODULE_NAME=MODULE_NAME)


    ################ ALEXA LIST ##################

    # download the alexa top sites list
    if do_download_alexa:
        db.download_file("http://s3.amazonaws.com/alexa-static/top-1m.csv.zip", "alexa.csv.zip")
        db.extract_from_Zip(compressed_name="alexa.csv.zip", target_dir=DATA_PATH, new_name="alexa.csv")

    # read lists from file
    if do_download_alexa:
        alexa_list = db.open_dataset_CSV_file(filename="alexa.csv", pos_url=1, label="Benign", max_line_count=16000)
    else:
        alexa_list = db.open_dataset_XML_file(ALEXA_FILE, iterateable="entry", label="Benign", url_label="url")

    if do_query_alexa:
        alexa_list = db.crawl_list_login_page(data=alexa_list, selenium_analysis=False, number_threads=10)

    # delete downloaded file
    if do_download_alexa:
        db.delete_data("alexa.csv.zip")
        db.move_file("alexa.csv")

    ################ PHISHTANK LIST ##################

    # download from phishtank -> DEVELOPER KEY NEEDED
    if do_download_phish:
        db.download_file(
            "http://data.phishtank.com/data/[developer key needed]/online-valid.xml",
            PHISHTANK_FILE)

    # write extracted list to XML
    if alexa_list is not None:
        db.write_list_to_XML(filename=ALEXA_FILE, root="data", list1=alexa_list)

    # open downloaded phishtank file
    phishtank_list = db.open_dataset_XML_file(filename=PHISHTANK_FILE, iterateable="entry", label="Phish")

    # check if websites in list are reachable
    if check_status_phishing:
        phishtank_list = db.check_status_of_website(phishtank_list)

    if check_status_benign:
        alexa_list = db.check_status_of_website(alexa_list)

    # balance the lists so there are equal numbers of phishing and benign entries
    if len(phishtank_list) != len(alexa_list):
        if len(phishtank_list) > len(alexa_list):
            diff = len(phishtank_list) - len(alexa_list)

            for i in range(diff):
                phishtank_list.pop(0)
        else:
            diff = len(alexa_list) - len(phishtank_list)

            for i in range(diff):
                alexa_list.pop(0)

    # write phishtank file to XML file
    if phishtank_list is not None:
        db.write_list_to_XML(filename=PHISHTANK_FILE, root="data", list1=phishtank_list)

    # kaggle database available at: https://www.kaggle.com/kunal4892/phishingandlegitimateurls
    # kaggle_list = db.openCSVFile(filename="kaggle.csv", pos_url=0, pos_label=11)
    # db.deleteData("kaggle.csv")
    # if not kaggle_list == None: db.writeListtoXML(filename=KAGGLE_FILE, root="data", list=kaggle_list)


    ################ FINAL LIST ##################

    # create mix of phishtank and alexa list
    final_list = db.mix_lists_randomly(alexa_list, phishtank_list)

    # save final list to XML for feature extraction
    if final_list is not None:
        db.write_list_to_XML(filename=DATABASE, root="data", list1=final_list)

    log_module_complete(MODULE_NAME=MODULE_NAME)
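The balancing step above trims the longer list by repeatedly popping its first element, which is quadratic for Python lists. A sketch of the same trim done with a single slice (toy lists stand in for the real datasets):

# Sketch only: keep the last `size` entries of each list.
benign = ['b1', 'b2', 'b3', 'b4', 'b5']
phish = ['p1', 'p2', 'p3']
size = min(len(benign), len(phish))
benign = benign[len(benign) - size:]
phish = phish[len(phish) - size:]
print(benign, phish)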