def execute(self):
    detector_names = ', '.join([d.__name__ for d in self.detectors])
    progress_logger.info(
        "Config settings: use_name_in_from={}, parallel={}, detectors={}".format(
            self.use_name_in_from, self.parallel, detector_names))
    start_time = time.time()
    MemTracker.initialize(memory_logger)
    logs.Watchdog.initialize()

    logs.context = {'phase': 'generate_features'}
    if self.generate_data_matrix or self.generate_test_matrix:
        self.generate_features()
    logs.context = {}
    MemTracker.logMemory("After generating features/Before generating model")

    logs.context = {'phase': 'generate_model_output'}
    if self.generate_model:
        self.generate_model_output()
    logs.context = {}
    MemTracker.logMemory("After generating model")

    end_time = time.time()
    min_elapsed, sec_elapsed = int((end_time - start_time) / 60), int((end_time - start_time) % 60)
    progress_logger.info("Phish Detector took {} minutes, {} seconds to run.".format(
        min_elapsed, sec_elapsed))
    logs.RateLimitedLog.flushall()
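# The minutes/seconds arithmetic above recurs in several methods below; divmod
# expresses it in one step. A minimal sketch of a shared helper (the name
# `format_elapsed` is hypothetical, not part of the original code):
def format_elapsed(seconds):
    """Split a duration in seconds into whole minutes and remaining seconds."""
    minutes, secs = divmod(int(seconds), 60)
    return minutes, secs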
def timer_expired(sig, frame):
    progress_logger.info('Watchdog expired (more than {} seconds passed); {}'.format(
        Watchdog.duration, str(context)))
    memory_logger.info('Dumping all stack frames after watchdog expired:')
    Watchdog.log_all_stack_frames()
    MemTracker.logMemory('watchdog expired')
    # Back off: double the timeout so a slow phase does not fire the watchdog
    # again immediately.
    Watchdog.duration *= 2
    progress_logger.info('Setting new watchdog timer, for {} seconds'.format(
        Watchdog.duration))
    Watchdog.reset()
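# A minimal sketch of how a SIGALRM-based watchdog could register
# timer_expired as its handler. This is an assumption about the internals of
# logs.Watchdog (which is not shown here), offered only to illustrate the
# signal-handler pattern; the project's actual implementation may differ.
import signal
import sys
import traceback

class Watchdog(object):
    duration = 600  # seconds before the alarm fires (assumed default)

    @classmethod
    def initialize(cls):
        # Register the handler, then arm the first alarm.
        signal.signal(signal.SIGALRM, timer_expired)
        cls.reset()

    @classmethod
    def reset(cls):
        # Re-arm the alarm; any previously scheduled alarm is replaced.
        signal.alarm(cls.duration)

    @classmethod
    def log_all_stack_frames(cls):
        # Dump the current stack of every live thread to the memory log.
        for frame in sys._current_frames().values():
            memory_logger.info(''.join(traceback.format_stack(frame)))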
def build_detectors(self, inbox):
    logs.context['step'] = 'build_detectors'
    detectors = [Detector(inbox) for Detector in self.features]
    verbose = self.should_enable_extra_debugging(inbox, detectors)
    for detector in detectors:
        logs.context['detector'] = type(detector).__name__
        detector.create_sender_profile(self.sender_profile_num_emails)
        logs.Watchdog.reset()
        if verbose:
            progress_logger.info('Finished creating {} sender profile, RSS = {}'.format(
                type(detector).__name__, MemTracker.cur_mem_usage()))
            MemTracker.logMemory('finished creating {} sender profile'.format(
                type(detector).__name__))
        else:
            logs.RateLimitedMemTracker.checkmem('finished creating {} sender profile'.format(
                type(detector).__name__))
    del logs.context['step']
    del logs.context['detector']
    return detectors
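# build_detectors() relies on just two pieces of the detector interface:
# construction from an inbox and create_sender_profile(n). A minimal sketch
# of that implied interface (this base class is an assumption for
# illustration, not the project's actual Detector definition):
class Detector(object):
    USE_NAME = False  # toggled by generate_features() via use_name_in_from

    def __init__(self, inbox):
        self.inbox = inbox

    def create_sender_profile(self, num_emails):
        """Build a per-sender baseline from the first num_emails messages."""
        raise NotImplementedError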
def checkmem_rate_limited(self):
    cur = MemTracker.cur_mem_usage()
    if cur >= 2 * self.max_mem_logged:
        self.max_mem_logged = cur
        MemTracker.logMemory(self.task + "; " + str(context))
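# Because checkmem_rate_limited() logs only when usage has at least doubled
# since the last logged high-water mark, a run whose RSS grows from M to a
# peak P emits at most about log2(P / M) memory lines, no matter how often
# it is called. That keeps memory logging cheap on long runs.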
def test_and_report(self):
    """
    Assumptions:
    - test.mat exists in the directory structure and clf is a classifier
      trained on all data matrices.
    - test.mat has data['email_index']
    Results are [path, index, probability].
    """
    self.clean_all()
    if not os.path.exists(self.results_dir):
        os.makedirs(self.results_dir)
    # Creates this file in common/output.
    email_probabilities = open(os.path.join("output", "email_probabilities.txt"), "w")
    top_emails = PriorityQueue(self.bucket_size)
    numPhish, testSize = 0, 0
    logging_interval = 60  # TODO(matthew): Move to config.yaml
    progress_logger.info("Starting to test on data.")
    start_time = time.time()
    last_logged_time = start_time
    end_of_last_memory_track = dt.datetime.now()
    num_senders_completed = 0
    num_message_id_failed = 0
    total_completed = 0
    self.d_name_per_feat = self.parse_feature_names()
    if self.filterRecipients:
        targetRecipientName, targetRecipientEmail = self.createTargetRecipientList()
    for root, dirs, files in os.walk(self.email_path):
        curr_time = time.time()
        if (curr_time - last_logged_time) > logging_interval * 60:
            progress_logger.info('Exploring directory #{}'.format(num_senders_completed))
            progress_logger.info('Testing has run for {} minutes'.format(
                int((curr_time - start_time) / 60)))
            last_logged_time = curr_time
        if self.memlog_freq >= 0:
            minutes_elapsed = (dt.datetime.now() - end_of_last_memory_track).seconds / 60.0
            if minutes_elapsed > self.memlog_freq:
                MemTracker.logMemory("After completing " + str(num_senders_completed) +
                                     " iterations in test_and_report")
                end_of_last_memory_track = dt.datetime.now()
        if 'test.mat' in files:
            path = os.path.join(root, "test.mat")
            data = sio.loadmat(path)
            test_X = data['test_data']
            sample_size = test_X.shape[0]
            if sample_size == 0:
                continue
            test_indx = np.arange(sample_size).reshape(sample_size, 1)
            indx = data['email_index'].reshape(sample_size, 1)
            try:
                test_mess_id = data['message_id'].reshape(sample_size, 1).astype("S200")
            except ValueError:
                progress_logger.info("Size mismatch of data['message_id'], data['test_data']: {}, {}".format(
                    data['message_id'].shape, data['test_data'].shape))
                debug_logger.info("Size mismatch in {}".format(path))
                num_message_id_failed += 1
                test_mess_id = np.zeros(shape=(sample_size, 1), dtype="S200")
            test_res = self.get_email_records(test_X, indx, root, test_indx, test_mess_id)
            if test_res is not None:
                for email in test_res:
                    filtered = False
                    if self.filterRecipients:
                        filtered = not self.isTargetRecipient(
                            targetRecipientName, targetRecipientEmail, email.email["TO"])
                    if not filtered:
                        testSize += 1
                        probability = float(email.probability_phish)
                        message_ID = email.email_message_id.strip(" ")
                        if probability > 0.5:
                            numPhish += 1
                        top_emails.push(email, probability)
                        # Write the email's message ID and phish probability to a file.
                        email_probabilities.write(message_ID + "," + str(probability) + "\n")
                        total_completed += 1
        num_senders_completed += 1
    progress_logger.info("total # of times size of data['message_id'] != sample_size: " +
                         str(num_message_id_failed))
    progress_logger.info("total # of successes: " + str(total_completed))
    email_probabilities.close()
    self.num_phish, self.test_size = numPhish, testSize
    top_emails_output = top_emails.createOutput()
    output = [top_emails_output]
    # DEBUG information - don't print to main log
    # debug_logger.info(pp.pformat(output))
    self.pretty_print(top_emails_output, "all_emails")
    self.write_summary_output(output)
    end_time = time.time()
    min_elapsed, sec_elapsed = int((end_time - start_time) / 60), int((end_time - start_time) % 60)
    progress_logger.info("Finished testing on data in {} minutes, {} seconds. {} directories tested.".format(
        min_elapsed, sec_elapsed, num_senders_completed))
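# test_and_report() depends on a PriorityQueue exposing push(item, priority)
# and createOutput(). A minimal heapq-backed sketch of that implied interface,
# keeping only the bucket_size highest-priority items (this is an assumption
# about the helper's behavior, not its actual implementation):
import heapq

class PriorityQueue(object):
    def __init__(self, max_size):
        self.max_size = max_size
        self._heap = []    # min-heap of (priority, insertion_order, item)
        self._counter = 0  # tie-breaker so items themselves never compare

    def push(self, item, priority):
        entry = (priority, self._counter, item)
        self._counter += 1
        if len(self._heap) < self.max_size:
            heapq.heappush(self._heap, entry)
        else:
            # Replace the current minimum if the new entry outranks it.
            heapq.heappushpop(self._heap, entry)

    def createOutput(self):
        # Return the retained items, highest priority first.
        return [item for _, _, item in sorted(self._heap, reverse=True)]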
def generate_features(self):
    if self.use_name_in_from != 0:
        Detector.USE_NAME = True
    dir_to_generate = []
    if self.filter_senders:
        targetSenderNames, targetSenderEmails = self.createTargetSendersSet()
    progress_logger.info('Starting directory aggregation in feature generation.')
    start_time = time.time()
    for dirpath, dirnames, filenames in os.walk(self.root_dir):
        if ((self.generate_data_matrix and self.regular_filename in filenames
                and self.phish_filename in filenames)
                or (self.generate_test_matrix and self.regular_filename in filenames)):
            if self.filter_senders:
                lastPartofPath = os.path.basename(os.path.normpath(dirpath))
                if not self.isTargetSender(targetSenderNames, targetSenderEmails, lastPartofPath):
                    continue
            command = ["wc", "-l", "{}/{}".format(dirpath, self.regular_filename)]
            try:
                wc_output = subprocess.check_output(command)
                line_count = int(wc_output.split()[0])
                if line_count < 50000:
                    # Ignore inboxes with more than 50,000 emails.
                    dir_to_generate.append(dirpath)
                    logs.Watchdog.reset()
            except subprocess.CalledProcessError:
                debug_logger.warn('Could not calculate line count for directory {}'.format(dirpath))
                continue
    end_time = time.time()
    min_elapsed, sec_elapsed = int((end_time - start_time) / 60), int((end_time - start_time) % 60)
    progress_logger.info('Finished directory aggregation in feature generation in {} minutes, {} seconds'.format(
        min_elapsed, sec_elapsed))

    BATCH_SIZE = self.batch_threading_size
    if self.parallel:
        progress_logger.info('Starting feature generation with {} threads in parallel with batch size {}...'.format(
            self.num_threads, BATCH_SIZE))
        start_time = time.time()
        feature_generators = []
        for directory in dir_to_generate:
            feature_generators.append(self.prep_features(directory))
            if len(feature_generators) == BATCH_SIZE:
                p = Pool(self.num_threads)
                p.map(run_generator, feature_generators)
                p.close()
                p.join()
                feature_generators = []
        if feature_generators:
            p = Pool(self.num_threads)
            p.map(run_generator, feature_generators)
            p.close()
            p.join()
        end_time = time.time()
        min_elapsed, sec_elapsed = int((end_time - start_time) / 60), int((end_time - start_time) % 60)
        progress_logger.info('Finished feature generation in {} minutes, {} seconds.'.format(
            min_elapsed, sec_elapsed))
    else:
        progress_logger.info('Starting feature generation serially for {} directories'.format(
            len(dir_to_generate)))
        start_time = time.time()
        last_logged_time = start_time
        dir_count = 0
        end_of_last_memory_track = dt.datetime.now()
        for directory in dir_to_generate:
            dir_count += 1
            logs.context = {'feature gen': dir_count}
            curr_time = time.time()
            if (curr_time - last_logged_time) > self.logging_interval * 60:
                progress_logger.info('Processing directory #{} of {}'.format(
                    dir_count, len(dir_to_generate)))
                progress_logger.info('Feature generation has run for {} minutes'.format(
                    int((curr_time - start_time) / 60)))
                last_logged_time = curr_time
            feature_generator = self.prep_features(directory)
            feature_generator.run()
            logs.Watchdog.reset()
            minutes_elapsed = (dt.datetime.now() - end_of_last_memory_track).seconds / 60.0
            if minutes_elapsed > self.memlog_gen_features_frequency:
                MemTracker.logMemory('After generating features for {}th sender'.format(dir_count))
                end_of_last_memory_track = dt.datetime.now()
        logs.context = {}
        end_time = time.time()
        min_elapsed, sec_elapsed = int((end_time - start_time) / 60), int((end_time - start_time) % 60)
        progress_logger.info('Finished feature generation in {} minutes, {} seconds.'.format(
            min_elapsed, sec_elapsed))
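# The parallel branch above maps run_generator over batches of prepared
# generators, creating and tearing down a fresh Pool per batch so worker
# memory is released between batches. For Pool.map to pickle the task,
# run_generator must be a module-level function; a minimal sketch consistent
# with how it is called (the one-line body is an assumption):
def run_generator(feature_generator):
    feature_generator.run()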