Code Example #1
import argparse
import calendar
import datetime as dt
import os
import string
import subprocess
import time
from multiprocessing import Pool

import yaml

# Project-local dependencies assumed by this example (not shown here):
# the fc module of detector feature classes, FeatureGenerator, Classify,
# Detector, Lookup, MemTracker, logs, parse_sender, run_generator, and the
# progress_logger / debug_logger / memory_logger loggers.


class PhishDetector(object):
    def __init__(self):
        #Flag Configurations
        self.generate_data_matrix = False
        self.generate_test_matrix = False
        self.generate_model = False
        self.classify = False
        self.config_path = 'config.yaml'
        self.filter_senders = False
        self.filter_recipients = False

        #Config File Configurations
        self.root_dir = None
        self.filename = None
        self.weights = None
        self.sender_profile_percentage = 0
        self.data_matrix_percentage = 0
        self.test_matrix_percentage = 0
        self.emails_threshold = 1000
        self.results_size = 10
        self.model_path_out = './model'
        self.result_path_out = './summary'
        self.detectors = None
        self.parallel = None

        #Generator and Classifier
        self.classifier = None

        self.parse_config()
        self.parse_args()

    def parse_args(self):
        """
        Parses command line arguments.
        """
        parser = argparse.ArgumentParser(
            description='Manage spear phishing detector.')
        parser.add_argument(
            '--all',
            action='store_true',
            help=('Generate and serialize data matrix, test matrix, and ML '
                  'model, then run ML model on test matrix'))
        parser.add_argument(
            '--gen_all',
            action='store_true',
            help=(
                'Generate and serialize data matrix, test matrix, and ML model'
            ))
        parser.add_argument('--gen_data',
                            action='store_true',
                            help=('Generate and serialize data matrix'))
        parser.add_argument('--gen_test',
                            action='store_true',
                            help=('Generate and serialize test matrix'))
        parser.add_argument('--gen_model',
                            action='store_true',
                            help=('Generate and serialize ML model'))
        parser.add_argument('--classify',
                            action='store_true',
                            help=('Run ML model on test matrix'))
        parser.add_argument('--debug_training',
                            action='store_true',
                            help=('Debug the training step of the pipeline.'))
        parser.add_argument('--mbox',
                            action='store_true',
                            help=('Use emails from mbox rather than pcaps'))
        parser.add_argument(
            '--filter_senders',
            action='store_true',
            help=('Only train on names and emails in target sender file'))
        parser.add_argument(
            '--filter_recipients',
            action='store_true',
            help=('Only test and report on names and emails in the target '
                  'recipient file'))

        args = parser.parse_args()

        run = False
        self.debug_training = False
        if args.all:
            self.generate_data_matrix = True
            self.generate_test_matrix = True
            self.generate_model = True
            self.classify = True
            run = True
        if args.gen_all:
            self.generate_data_matrix = True
            self.generate_test_matrix = True
            self.generate_model = True
            run = True
        if args.gen_data:
            self.generate_data_matrix = True
            run = True
        if args.gen_test:
            self.generate_test_matrix = True
            run = True
        if args.gen_model:
            self.generate_model = True
            run = True
        if args.classify:
            self.classify = True
            run = True
        if args.debug_training:
            self.generate_data_matrix = True
            self.generate_test_matrix = True
            self.generate_model = True
            self.classify = True
            self.debug_training = True
            run = True
        if args.filter_senders:
            self.filter_senders = True
        else:
            self.filter_senders = False
        if args.filter_recipients:
            self.filter_recipients = True
        else:
            self.filter_recipients = False

        if not run:
            parser.error('You must run with at least one flag')

    def parse_config(self):
        """
        Parses configuration file. Assumes configuration is in same directory as this script.
        """
        try:
            stream = open(self.config_path, 'r')
        except IOError:
            progress_logger.exception(
                "Could not find yaml configuration file.")
            raise

        config = yaml.safe_load(stream)

        expected_config_keys = [
            'root_dir', 'regular_filename', 'phish_filename', 'use_percentage',
            'sender_profile_start_time', 'sender_profile_end_time',
            'train_start_time', 'train_end_time', 'test_start_time',
            'test_end_time', 'sender_profile_percentage',
            'data_matrix_percentage', 'test_matrix_percentage',
            'use_name_in_from', 'model_path_out', 'result_path_out', 'weights',
            'detectors', 'emails_threshold', 'batch_threading_size', 'offline',
            'results_size', 'parallel', 'num_threads', 'logging_interval',
            'memlog_gen_features_frequency', 'memlog_classify_frequency',
            'senders', 'recipients'
        ]

        try:
            for key in expected_config_keys:
                setattr(self, key, config[key])
        except KeyError:
            progress_logger.exception("Configuration file missing entry")
            raise

        detectors = []
        for detector, val in self.detectors.items():
            if val == 1:
                detectors.append(getattr(globals()['fc'], detector))

        self.detectors = detectors
        self.root_dir = os.path.abspath(self.root_dir)
        Lookup.initialize(offline=self.offline)

        if not self.use_percentage:
            self.sender_profile_start_time = calendar.timegm(
                time.strptime(self.sender_profile_start_time, "%B %d %Y"))
            self.sender_profile_end_time = calendar.timegm(
                time.strptime(self.sender_profile_end_time, "%B %d %Y"))
            self.train_start_time = calendar.timegm(
                time.strptime(self.train_start_time, "%B %d %Y"))
            self.train_end_time = calendar.timegm(
                time.strptime(self.train_end_time, "%B %d %Y"))
            self.test_start_time = calendar.timegm(
                time.strptime(self.test_start_time, "%B %d %Y"))
            self.test_end_time = calendar.timegm(
                time.strptime(self.test_end_time, "%B %d %Y"))

    def prep_features(self, directory):
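        """
        Builds a FeatureGenerator for one sender directory, wiring in the
        configured split percentages, time intervals, and detector list.
        """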
        regular_path = os.path.join(directory, self.regular_filename)
        phish_path = os.path.join(directory, self.phish_filename)

        sender_profile_time_interval = (self.sender_profile_start_time,
                                        self.sender_profile_end_time)
        train_time_interval = (self.train_start_time, self.train_end_time)
        test_time_interval = (self.test_start_time, self.test_end_time)

        feature_generator = FeatureGenerator(
            directory, regular_path, phish_path,
            self.sender_profile_percentage, self.data_matrix_percentage,
            self.test_matrix_percentage, sender_profile_time_interval,
            train_time_interval, test_time_interval, self.use_percentage,
            self.detectors)

        feature_generator.do_generate_data_matrix = self.generate_data_matrix
        feature_generator.do_generate_test_matrix = self.generate_test_matrix
        return feature_generator

    def createTargetSendersSet(self):
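        """
        Reads the target sender file (self.senders) and returns two parallel
        lists: lowercased sender names with punctuation stripped, and
        lowercased email addresses.
        """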
        senderNames = []
        senderEmails = []
        with open(self.senders) as f:
            for line in f.readlines():
                name, email = parse_sender.parse_sender(line)
                name = name.translate(
                    str.maketrans('', '', string.punctuation)).strip()
                senderNames.append(name.lower())
                senderEmails.append(email.lower())
        return senderNames, senderEmails

    def isTargetSender(self, targetNames, targetEmails, currSender):
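        """
        Returns True if currSender matches a target by first and last name,
        or contains one of the target email addresses.
        """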
        currSender = currSender.lower()
        currSenderStripped = currSender.translate(
            str.maketrans('', '', string.punctuation)).strip()
        for i in range(len(targetNames)):
            firstName, lastName = targetNames[i].split(
                " ")[0], targetNames[i].split(" ")[-1]
            if firstName in currSenderStripped and lastName in currSenderStripped:
                return True
            if targetEmails[i] in currSender:
                return True
        return False

    def generate_features(self):
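        """
        Walks root_dir for sender directories containing the expected email
        files, optionally filters them to target senders, and runs feature
        generation for each directory either in parallel batches or serially.
        """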
        if self.use_name_in_from != 0:
            Detector.USE_NAME = True

        dir_to_generate = []

        if (self.filter_senders):
            targetSenderNames, targetSenderEmails = self.createTargetSendersSet(
            )

        progress_logger.info(
            'Starting directory aggregation in feature generation.')
        start_time = time.time()
        for dirpath, dirnames, filenames in os.walk(self.root_dir):
            if ((self.generate_data_matrix
                 and self.regular_filename in filenames
                 and self.phish_filename in filenames)
                    or (self.generate_test_matrix
                        and self.regular_filename in filenames)):
                command = [
                    "wc", "-l", "{}/{}".format(dirpath, self.regular_filename)
                ]
                filtered = False
                if (self.filter_senders):
                    lastPartofPath = os.path.basename(
                        os.path.normpath(dirpath))
                    targetMatch = self.isTargetSender(targetSenderNames,
                                                      targetSenderEmails,
                                                      lastPartofPath)
                    filtered = not targetMatch
                if filtered:
                    continue
                try:
                    wc_output = subprocess.check_output(command)
                    wc_output_split = wc_output.split()
                    line_count = int(wc_output_split[0])

                    if line_count < 50000:  # skip inboxes with 50,000 or more emails
                        dir_to_generate.append(dirpath)
                    logs.Watchdog.reset()
                except subprocess.CalledProcessError:
                    debug_logger.warning(
                        'Could not calculate line count for directory {}'.
                        format(dirpath))
                    continue
        end_time = time.time()
        min_elapsed, sec_elapsed = int((end_time - start_time) / 60), int(
            (end_time - start_time) % 60)
        progress_logger.info(
            'Finished directory aggregation in feature generation in {} minutes, {} seconds'
            .format(min_elapsed, sec_elapsed))

        BATCH_SIZE = self.batch_threading_size
        if self.parallel:
            progress_logger.info(
                'Starting feature generation with {} threads in parallel with batch size {}...'
                .format(self.num_threads, BATCH_SIZE))
            start_time = time.time()
            feature_generators = []
            for directory in dir_to_generate:
                feature_generator = self.prep_features(directory)
                feature_generators.append(feature_generator)
                if len(feature_generators) == BATCH_SIZE:
                    p = Pool(self.num_threads)
                    p.map(run_generator, feature_generators)
                    p.close()
                    p.join()
                    feature_generators = []
            if len(feature_generators) > 0:
                p = Pool(self.num_threads)
                p.map(run_generator, feature_generators)
                p.close()
                p.join()
            end_time = time.time()
            min_elapsed, sec_elapsed = int((end_time - start_time) / 60), int(
                (end_time - start_time) % 60)
            progress_logger.info(
                'Finished feature generation in {} minutes, {} seconds.'.
                format(min_elapsed, sec_elapsed))
        else:
            progress_logger.info(
                'Starting feature generation serially for {} directories'.
                format(len(dir_to_generate)))
            start_time = time.time()
            last_logged_time = start_time
            dir_count = 0
            end_of_last_memory_track = dt.datetime.now()
            for directory in dir_to_generate:
                dir_count += 1
                logs.context = {'feature gen': dir_count}
                curr_time = time.time()
                if (curr_time - last_logged_time) > self.logging_interval * 60:
                    progress_logger.info(
                        'Processing directory #{} of {}'.format(
                            dir_count, len(dir_to_generate)))
                    progress_logger.info(
                        'Feature generation has run for {} minutes'.format(
                            int((curr_time - start_time) / 60)))
                    last_logged_time = curr_time
                feature_generator = self.prep_features(directory)
                feature_generator.run()
                logs.Watchdog.reset()
                now = dt.datetime.now()
                time_elapsed = now - end_of_last_memory_track
                minutes_elapsed = time_elapsed.total_seconds() / 60.0
                if minutes_elapsed > self.memlog_gen_features_frequency:
                    MemTracker.logMemory(
                        'After generating features for {}th sender'.format(
                            dir_count))
                    end_of_last_memory_track = dt.datetime.now()
                logs.context = {}
            end_time = time.time()
            min_elapsed, sec_elapsed = int((end_time - start_time) / 60), int(
                (end_time - start_time) % 60)
            progress_logger.info(
                'Finished feature generation in {} minutes, {} seconds.'.
                format(min_elapsed, sec_elapsed))

    def generate_model_output(self):
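        """
        Builds the classifier, generates its training data, trains,
        cross-validates, and reports results on the test set.
        """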
        self.classifier = Classify(self.weights,
                                   self.root_dir,
                                   self.emails_threshold,
                                   self.results_size,
                                   results_dir=self.result_path_out,
                                   serial_path=self.model_path_out,
                                   memlog_freq=self.memlog_classify_frequency,
                                   debug_training=self.debug_training,
                                   filterRecipients=self.filter_recipients,
                                   recipientTargetFile=self.recipients)
        logs.Watchdog.reset()
        self.classifier.generate_training()
        logs.Watchdog.reset()
        self.classifier.train_clf()
        logs.Watchdog.reset()
        self.classifier.cross_validate()
        logs.Watchdog.reset()
        self.classifier.test_and_report()
        logs.Watchdog.reset()

    def execute(self):
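        """
        Runs the enabled pipeline phases: feature generation, then model
        training and evaluation, logging timing and memory usage throughout.
        """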
        detector_names = ', '.join([d.__name__ for d in self.detectors])
        progress_logger.info(
            "Config settings: use_name_in_from={}, parallel={}, detectors={}".
            format(self.use_name_in_from, self.parallel, detector_names))

        start_time = time.time()
        MemTracker.initialize(memory_logger)
        logs.Watchdog.initialize()
        logs.context = {'phase': 'generate_features'}
        if self.generate_data_matrix or self.generate_test_matrix:
            self.generate_features()
        logs.context = {}
        MemTracker.logMemory(
            "After generating features/Before generating model")
        logs.context = {'phase': 'generate_model_output'}
        if self.generate_model:
            self.generate_model_output()
        logs.context = {}
        MemTracker.logMemory("After generating model")
        end_time = time.time()
        min_elapsed, sec_elapsed = int((end_time - start_time) / 60), int(
            (end_time - start_time) % 60)
        progress_logger.info(
            "Phish Detector took {} minutes, {} seconds to run.".format(
                min_elapsed, sec_elapsed))
        logs.RateLimitedLog.flushall()
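
For reference, parse_config() in Example #1 expects config.yaml to contain the keys listed in expected_config_keys. The snippet below is a minimal sketch that writes such a file; the key names come from the code above, but every value (paths, dates, file names, the detector name) is an illustrative assumption, and the date strings only matter when use_percentage is false, since they are parsed with the "%B %d %Y" format.

import yaml

# Illustrative configuration only; adjust every value to your own data layout.
example_config = {
    'root_dir': './data',                      # root of the per-sender directories
    'regular_filename': 'legit_emails.log',    # assumed per-directory file name
    'phish_filename': 'phish_emails.log',      # assumed per-directory file name
    'use_percentage': True,                    # True: percentage splits; False: date splits below
    'sender_profile_start_time': 'January 01 2015',   # "%B %d %Y" format
    'sender_profile_end_time': 'June 30 2015',
    'train_start_time': 'July 01 2015',
    'train_end_time': 'December 31 2015',
    'test_start_time': 'January 01 2016',
    'test_end_time': 'June 30 2016',
    'sender_profile_percentage': 0.4,
    'data_matrix_percentage': 0.4,
    'test_matrix_percentage': 0.2,
    'use_name_in_from': 1,
    'model_path_out': './model',
    'result_path_out': './summary',
    'weights': None,                           # format depends on Classify; left unspecified here
    'detectors': {'DateFormatDetector': 1},    # keys must name attributes of the fc module; this one is a placeholder
    'emails_threshold': 1000,
    'batch_threading_size': 50,
    'offline': True,
    'results_size': 10,
    'parallel': False,
    'num_threads': 4,
    'logging_interval': 5,                     # minutes between progress log lines
    'memlog_gen_features_frequency': 10,       # minutes between memory snapshots
    'memlog_classify_frequency': 10,
    'senders': './target_senders.txt',         # assumed: one "Name <email>" entry per line
    'recipients': './target_recipients.txt',
}

with open('config.yaml', 'w') as out:
    yaml.safe_dump(example_config, out, default_flow_style=False)
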
Code Example #2
import argparse
import datetime as dt
import os
import time
from multiprocessing import Pool

import yaml

# Project-local dependencies assumed, as in Example #1: fc, FeatureGenerator,
# Classify, Detector, Lookup, MemTracker, logs, run_generator, and the
# progress_logger / memory_logger loggers.


class PhishDetector(object):

    def __init__(self):
        #Flag Configurations
        self.generate_data_matrix = False
        self.generate_test_matrix = False
        self.generate_model = False
        self.classify = False
        self.config_path = 'config.yaml'

        #Config File Configurations
        self.root_dir = None
        self.filename = None
        self.weights = None
        self.sender_profile_percentage = 0
        self.data_matrix_percentage = 0
        self.test_matrix_percentage = 0
        self.emails_threshold = 1000
        self.results_size = 10
        self.model_path_out = './model'
        self.result_path_out = './summary'
        self.detectors = None
        self.parallel = None

        #Generator and Classifier
        self.classifier = None

        self.parse_config()
        self.parse_args()


    def parse_args(self):
        """
        Parses command line arguments.
        """
        parser = argparse.ArgumentParser(description='Manage spear phishing detector.')
        parser.add_argument('--all',
                            action='store_true',
                            help=('Generate and serialize data matrix, test matrix, and ML model, then run ML model on test matrix'))
        parser.add_argument('--gen_all',
                            action='store_true',
                            help=('Generate and serialize data matrix, test matrix, and ML model'))
        parser.add_argument('--gen_data',
                            action='store_true',
                            help=('Generate and serialize data matrix'))
        parser.add_argument('--gen_test',
                            action='store_true',
                            help=('Generate and serialize test matrix'))
        parser.add_argument('--gen_model',
                            action='store_true',
                            help=('Generate and serialize ML model'))
        parser.add_argument('--classify',
                            action='store_true',
                            help=('Run ML model on test matrix'))
        parser.add_argument('--debug_training',
                            action='store_true',
                            help=('Debug the training step of the pipeline.'))
        parser.add_argument('--mbox',
                            action='store_true',
                            help=('Use emails from mbox rather than pcaps'))
        
        args = parser.parse_args()

        run = False
        self.debug_training = False
        if args.all:
            self.generate_data_matrix = True
            self.generate_test_matrix = True
            self.generate_model = True
            self.classify = True
            run = True
        if args.gen_all:
            self.generate_data_matrix = True
            self.generate_test_matrix = True
            self.generate_model = True
            run = True
        if args.gen_data:
            self.generate_data_matrix = True
            run = True
        if args.gen_test:
            self.generate_test_matrix = True
            run = True
        if args.gen_model:
            self.generate_model = True
            run = True
        if args.classify:
            self.classify = True
            run = True
        if args.debug_training:
            self.generate_data_matrix = True
            self.generate_test_matrix = True
            self.generate_model = True
            self.classify = True
            self.debug_training = True
            run = True


        if not run:
            parser.error('You must run with at least one flag')

    def parse_config(self):
        """
        Parses configuration file. Assumes configuration is in same directory as this script.
        """
        try:
            stream = open(self.config_path, 'r')
        except IOError:
            progress_logger.exception("Could not find yaml configuration file.")
            raise

        config = yaml.safe_load(stream)
        
        expected_config_keys = [
            'root_dir',
            'regular_filename',
            'phish_filename',
            'sender_profile_percentage',
            'data_matrix_percentage',
            'test_matrix_percentage',
            'use_name_in_from',
            'model_path_out',
            'result_path_out',
            'weights',
            'detectors',
            'emails_threshold',
            'batch_threading_size',
            'offline',
            'results_size',
            'parallel',
            'num_threads',
            'logging_interval',
            'memlog_gen_features_frequency',
            'memlog_classify_frequency'
        ]

        try:
            for key in expected_config_keys:
                setattr(self, key, config[key])
        except KeyError:
            progress_logger.exception("Configuration file missing entry")
            raise

        detectors = []
        for detector, val in self.detectors.items():
            if val == 1:
                detectors.append(getattr(globals()['fc'], detector))

        self.detectors = detectors
        self.root_dir = os.path.abspath(self.root_dir)
        Lookup.initialize(offline=self.offline)

    def prep_features(self, directory):   
        regular_path = os.path.join(directory, self.regular_filename)
        phish_path = os.path.join(directory, self.phish_filename)

        feature_generator = FeatureGenerator(directory,
                                             regular_path,
                                             phish_path,
                                             self.sender_profile_percentage,
                                             self.data_matrix_percentage,
                                             self.test_matrix_percentage,
                                             self.detectors
                                            )

        feature_generator.do_generate_data_matrix = self.generate_data_matrix
        feature_generator.do_generate_test_matrix = self.generate_test_matrix
        return feature_generator


    def generate_features(self):
        if self.use_name_in_from != 0:
            Detector.USE_NAME = True

        dir_to_generate = []

        progress_logger.info('Starting directory aggregation in feature generation.')
        start_time = time.time()
        for dirpath, dirnames, filenames in os.walk(self.root_dir):
            if ((self.generate_data_matrix and self.regular_filename in filenames and self.phish_filename in filenames)
                or (self.generate_test_matrix and self.regular_filename in filenames)):
                dir_to_generate.append(dirpath)
                logs.Watchdog.reset()
        end_time = time.time()
        min_elapsed, sec_elapsed = int((end_time - start_time) / 60), int((end_time - start_time) % 60)
        progress_logger.info('Finished directory aggregation in feature generation in {} minutes, {} seconds'.format(min_elapsed, sec_elapsed))
        
        BATCH_SIZE = self.batch_threading_size
        if self.parallel:
            progress_logger.info('Starting feature generation with {} threads in parallel with batch size {}...'.format(self.num_threads, BATCH_SIZE))
            start_time = time.time()
            feature_generators = []
            for directory in dir_to_generate:
                feature_generator = self.prep_features(directory)
                feature_generators.append(feature_generator)
                if len(feature_generators) == BATCH_SIZE:
                    p = Pool(self.num_threads)
                    p.map(run_generator, feature_generators)
                    p.close()
                    p.join()
                    feature_generators = []
            if len(feature_generators) > 0:
                p = Pool(self.num_threads)
                p.map(run_generator, feature_generators)
                p.close()
                p.join()
            end_time = time.time()
            min_elapsed, sec_elapsed = int((end_time - start_time) / 60), int((end_time - start_time) % 60)
            progress_logger.info('Finished feature generation in {} minutes, {} seconds.'.format(min_elapsed, sec_elapsed))
        else:
            progress_logger.info('Starting feature generation serially for {} directories'.format(len(dir_to_generate)))
            start_time = time.time()
            last_logged_time = start_time
            dir_count = 0
            end_of_last_memory_track = dt.datetime.now()
            for directory in dir_to_generate:
                dir_count += 1
                logs.context = {'feature gen': dir_count}
                curr_time = time.time()
                if (curr_time - last_logged_time) > self.logging_interval * 60:
                    progress_logger.info('Processing directory #{} of {}'.format(dir_count, len(dir_to_generate)))
                    progress_logger.info('Feature generation has run for {} minutes'.format(int((curr_time - start_time) / 60)))
                    last_logged_time = curr_time
                feature_generator = self.prep_features(directory)
                feature_generator.run()
                logs.Watchdog.reset()
                now = dt.datetime.now()
                time_elapsed = now - end_of_last_memory_track
                minutes_elapsed = time_elapsed.total_seconds() / 60.0
                if minutes_elapsed > self.memlog_gen_features_frequency:
                    MemTracker.logMemory('After generating features for {}th sender'.format(dir_count))
                    end_of_last_memory_track = dt.datetime.now()
                logs.context = {}
            end_time = time.time()
            min_elapsed, sec_elapsed = int((end_time - start_time) / 60), int((end_time - start_time) % 60)
            progress_logger.info('Finished feature generation in {} minutes, {} seconds.'.format(min_elapsed, sec_elapsed))

    def generate_model_output(self):
        self.classifier = Classify(self.weights,
                                   self.root_dir,
                                   self.emails_threshold,
                                   self.results_size,
                                   results_dir=self.result_path_out,
                                   serial_path=self.model_path_out,
                                   memlog_freq=self.memlog_classify_frequency,
                                   debug_training=self.debug_training)
        logs.Watchdog.reset()
        self.classifier.generate_training()
        logs.Watchdog.reset()
        self.classifier.train_clf()
        logs.Watchdog.reset()
        self.classifier.cross_validate()
        logs.Watchdog.reset()
        self.classifier.test_and_report()
        logs.Watchdog.reset()


    def execute(self):
        detector_names = ', '.join([d.__name__ for d in self.detectors])
        progress_logger.info("Config settings: use_name_in_from={}, parallel={}, detectors={}".format(self.use_name_in_from, self.parallel, detector_names))

        start_time = time.time()
        MemTracker.initialize(memory_logger)
        logs.Watchdog.initialize()
        logs.context = {'phase': 'generate_features'}
        if self.generate_data_matrix or self.generate_test_matrix:
            self.generate_features()
        logs.context = {}
        MemTracker.logMemory("After generating features/Before generating model")
        logs.context = {'phase': 'generate_model_output'}
        if self.generate_model:
            self.generate_model_output()
        logs.context = {}
        MemTracker.logMemory("After generating model")
        end_time = time.time()
        min_elapsed, sec_elapsed = int((end_time - start_time) / 60), int((end_time - start_time) % 60)
        progress_logger.info("Phish Detector took {} minutes, {} seconds to run.".format(min_elapsed, sec_elapsed))
        logs.RateLimitedLog.flushall()
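
Neither listing includes a __main__ entry point. A minimal sketch of how the class would typically be driven is shown below; the script name is an assumption, and the flag names are taken from parse_args above.

# Hypothetical entry point; not part of the original listings.
if __name__ == '__main__':
    detector = PhishDetector()   # reads config.yaml, then parses CLI flags
    detector.execute()           # runs whichever phases the flags enabled

# Example invocations (flag names from parse_args):
#   python phish_detector.py --all
#   python phish_detector.py --gen_data --gen_test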