コード例 #1
0
from time import time
import keras.backend as K
import numpy as np
import argparse
from keras.preprocessing.sequence import pad_sequences
from utils.clean_data import process_legit_phish_data
from utils.preprocess_data import split_data, extract_labels, create_vocab, read_dataset, create_char_vocab
from utils.general_utils import get_logger, padding_email_sequences, load_word_embedding_dict, build_embedd_table
from models.baseline_models import build_lstm, build_cnn
from evaluators.evaluator_word_only import Evaluator

logger = get_logger("Train ...")


def main():
    """Entry point: parse the command-line options for a training run.

    Options cover the training schedule (epochs, batch size), the word
    embedding to load (type, dimension, file path), the data-split seed,
    the folders holding legit and phish emails, and which baseline model
    to build.
    """
    parser = argparse.ArgumentParser(description="Word and char themis model")
    # Training schedule.
    parser.add_argument('--num_epochs', type=int, default=20, help='Number of epochs for training')
    parser.add_argument('--batch_size', type=int, default=16, help='Number of emails in each batch')
    # Word-embedding selection; the defaults point at 50-dimensional GloVe vectors.
    parser.add_argument('--embedding', type=str, default='glove', help='Word embedding type, word2vec, senna or glove')
    parser.add_argument('--embedding_dim', type=int, default=50, help='Dimension of embedding')
    parser.add_argument('--embedding_path', type=str, default='embeddings/glove.6B.50d.txt', help='Path to embedding vec file')
    # --baby is a boolean flag: True when given on the command line, False otherwise.
    parser.add_argument('--baby', action='store_true', help='Set to True for small data quantity for debug')
    parser.add_argument('--seed', type=int, default=42, help='Set seed for data split')
    # Input folders for the two email classes.
    parser.add_argument('--legit_path', type=str, default='ISWPA2.0 Train Data/IWSPA2.0_Training_No_Header/legit/', help='Path to legit emails folder')
    parser.add_argument('--phish_path', type=str, default='ISWPA2.0 Train Data/IWSPA2.0_Training_No_Header/phish/', help='Path to phish emails folder')
    # The only mandatory option; argparse rejects anything other than 'lstm' or 'cnn'.
    parser.add_argument('--model_name', type=str, choices=['lstm', 'cnn'], required=True)

    args = parser.parse_args()
    # Bind the path options to short local names.
    legit_path = args.legit_path
    phish_path = args.phish_path
    embedding_path = args.embedding_path
コード例 #2
0
from sklearn.model_selection import train_test_split
from utils.general_utils import get_logger

logger = get_logger("Pre-process data ...")


def split_data(data, random_state=42):
    """Partition ``data`` into train, dev and test subsets.

    The split is done in two stages with ``train_test_split``: 40% of
    ``data`` is held out first, then that held-out portion is cut in half,
    giving roughly 60% train, 20% dev and 20% test.

    Args:
        data: Collection of samples accepted by ``train_test_split``.
        random_state: Seed passed to both splitting stages so the
            partition is reproducible.

    Returns:
        A ``(train, dev, test)`` tuple.
    """
    # Stage 1: keep 60% for training, hold out the remaining 40%.
    train_part, held_out = train_test_split(
        data, test_size=0.4, random_state=random_state)
    # Stage 2: divide the held-out samples evenly between dev and test.
    dev_part, test_part = train_test_split(
        held_out, test_size=0.5, random_state=random_state)
    return train_part, dev_part, test_part


def extract_labels(data):
    """Separate paired samples into parallel input and label lists.

    Args:
        data: Iterable of indexable items where position 0 holds the
            input and position 1 holds its label. Any further positions
            are ignored.

    Returns:
        A ``(x_data, y_data)`` tuple of two lists, in the order the
        items were encountered.
    """
    # Walk the input exactly once so one-shot iterators are supported.
    pairs = [(record[0], record[1]) for record in data]
    inputs = [sample for sample, _ in pairs]
    labels = [label for _, label in pairs]
    return inputs, labels


def create_vocab(data, vocab_size, to_lower):
    logger.info('Creating vocabulary')
    total_words, unique_words = 0, 0
    word_freqs = {}
    for content in data:
        if to_lower:
コード例 #3
0
import pandas as pd
import re
import numpy as np
import argparse
from time import time
from sklearn.model_selection import train_test_split
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from models.themis_models import build_simple_themis
from evaluators.evaluator_sentiment_word_only import Evaluator
from utils.clean_data import process_legit_phish_data
from utils.preprocess_data import split_data, extract_labels, create_vocab, read_dataset
from utils.general_utils import get_logger, padding_email_sequences, load_word_embedding_dict, build_embedd_table

# Matches a markup tag: '<', one or more characters that are not '>', then '>'.
TAG_RE = re.compile(r'<[^>]+>')
logger = get_logger("Train sent class...")


def remove_tags(text):
    """Return ``text`` with every match of ``TAG_RE`` deleted."""
    without_tags = TAG_RE.sub('', text)
    return without_tags


def preprocess_text(sen):
    """Normalise a raw sentence down to space-separated ASCII letters.

    Steps, applied in order:
      1. Remove markup tags via ``remove_tags``.
      2. Replace every character that is not an ASCII letter with a space.
      3. Replace a single letter surrounded by whitespace with one space.
      4. Collapse each run of whitespace into one space.

    Leading and trailing spaces are not trimmed.
    """
    cleaned = remove_tags(sen)
    # (pattern, replacement) pairs; the order matters, as later rules
    # tidy up the whitespace introduced by earlier ones.
    substitutions = (
        ('[^a-zA-Z]', ' '),
        (r"\s+[a-zA-Z]\s+", ' '),
        (r'\s+', ' '),
    )
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned


def main():
コード例 #4
0
import numpy as np
from sklearn.metrics import accuracy_score
from utils.metrics import precision, recall, f1_score, false_positive_rate
from utils.general_utils import get_logger

logger = get_logger("Evaluate ...")


class Evaluator:
    def __init__(self, model, X_train, X_dev, X_test, Y_train, Y_dev, Y_test,
                 batch_size):
        self.model = model

        self.batch_size = batch_size

        self.X_train, self.X_dev, self.X_test = X_train, X_dev, X_test
        self.Y_train, self.Y_dev, self.Y_test = Y_train, Y_dev, Y_test

        self.best_dev_acc = -1
        self.best_dev_precision = -1
        self.best_dev_recall = -1
        self.best_dev_f1 = -1
        self.best_dev_false_pos_rate = -1

        self.best_test_acc = -1
        self.best_test_precision = -1
        self.best_test_recall = -1
        self.best_test_f1 = -1
        self.best_test_false_pos_rate = -1

        self.dev_acc = None