def __init__(self, mode=0, offline=0,
             dump_testing='testing_fingerprints.csv',
             dump_training='training_fingerprints.csv'):
    """
    Validate the mode flags and initialize the aggregation state.

    Parameter
    -------------
    mode : int
        0 for Training mode - 1 for Testing mode.
    offline : int
        0 or 1; stored as ``self.offline``.
    dump_testing : str
        File used for the offline dump of testing fingerprints.
    dump_training : str
        File used for the offline dump of training fingerprints.

    Raises
    -------------
    ValueError
        If ``mode`` or ``offline`` is neither 0 nor 1.
    """
    if mode != 0 and mode != 1:
        raise ValueError(
            'The mode value is not valid. Choose between 1 or 0.')
    # Name the offending flag: an invalid ``offline`` used to be reported
    # as an invalid ``mode``, which pointed callers at the wrong argument.
    if offline != 0 and offline != 1:
        raise ValueError(
            'The offline value is not valid. Choose between 1 or 0.')

    self.hosts_clusters = {}
    self.label_generator = LabelGenerator()
    self.fin_generator = FingerprintGenerator()
    self.fin_manager = FingerprintManager()
    self.detector = DetectionModule()
    self.mode = mode
    self.alerts = []
    self.time_start = None
    self.time_current = None
    self.offline = offline

    # Files for offline dumps
    self.dump_testing = dump_testing
    self.dump_training = dump_training

    # Known browsers
    self.browser_user_agents = set()

    # Referrer graphs per user_agent
    self.referrerGraphs = dict()
def __init__(self, folder_path,
             dump_fingerprint_to_timestamps_training='training_fingerprint_to_timestamps.txt',
             dump_fingerprint_to_timestamps_testing='testing_fingerprint_to_timestamps.txt',
             sort_key_offset=84):
    """
    Collect the .csv files to analyze and initialize the detection state.

    Parameter
    -------------
    folder_path : str
        Prefix that is concatenated directly with "*.csv" to build the
        glob pattern (no path separator is inserted).
    dump_fingerprint_to_timestamps_training : str
        Path of the pickled fingerprint-to-timestamps mapping (training).
    dump_fingerprint_to_timestamps_testing : str
        Path of the pickled fingerprint-to-timestamps mapping (testing).
    sort_key_offset : int
        Number of leading characters of each path that are ignored when
        ordering the files. Defaults to 84, the value that used to be
        hard-coded; pass ``len(folder_path)`` to order by file name.
    """
    # The files are ordered by the part of the path after the offset.
    # Paths shorter than the offset all get an empty key and therefore
    # keep the order returned by glob (sorted() is stable).
    self.files = sorted(glob.glob(folder_path + "*.csv"),
                        key=lambda path: path[sort_key_offset:])

    self.training_manager = FingerprintManager()
    self.testing_manager = FingerprintManager()
    self.detector = DetectionModule()

    self.dump_fingerprint_to_timestamps_training = dump_fingerprint_to_timestamps_training
    self.dump_fingerprint_to_timestamps_testing = dump_fingerprint_to_timestamps_testing
    self.fingerprint_to_timestamps_testing = {}
    self.fingerprint_to_timestamps_training = {}
class OfflineDetector:
    """
    Offline detection over fingerprints previously dumped into .csv files.

    Files whose path contains "training" are loaded as the reference set;
    files whose path contains "testing" are then checked one by one
    against that reference set.
    """

    def __init__(self, folder_path,
                 dump_fingerprint_to_timestamps_training='training_fingerprint_to_timestamps.txt',
                 dump_fingerprint_to_timestamps_testing='testing_fingerprint_to_timestamps.txt',
                 sort_key_offset=84):
        """
        Collect the .csv files to analyze and initialize the detection state.

        Parameter
        -------------
        folder_path : str
            Prefix that is concatenated directly with "*.csv" to build the
            glob pattern (no path separator is inserted).
        dump_fingerprint_to_timestamps_training : str
            Path of the pickled fingerprint-to-timestamps mapping (training).
        dump_fingerprint_to_timestamps_testing : str
            Path of the pickled fingerprint-to-timestamps mapping (testing).
        sort_key_offset : int
            Number of leading characters of each path that are ignored
            when ordering the files. Defaults to 84, the value that used
            to be hard-coded; pass ``len(folder_path)`` to order by file
            name.
        """
        # Paths shorter than the offset all get an empty key and therefore
        # keep the order returned by glob (sorted() is stable).
        self.files = sorted(glob.glob(folder_path + "*.csv"),
                            key=lambda path: path[sort_key_offset:])

        self.training_manager = FingerprintManager()
        self.testing_manager = FingerprintManager()
        self.detector = DetectionModule()

        self.dump_fingerprint_to_timestamps_training = dump_fingerprint_to_timestamps_training
        self.dump_fingerprint_to_timestamps_testing = dump_fingerprint_to_timestamps_testing
        self.fingerprint_to_timestamps_testing = {}
        self.fingerprint_to_timestamps_training = {}

    def _load_from_csv_2(self):
        """
        Load fingerprints from the .csv files, but only from those whose
        path contains "training".
        """
        for path in self.files:
            if "training" in path:
                self.training_manager.read_from_file(path)
                print("" + path + " has been loaded for training.")

    def _load_pickled_mapping(self, path):
        """Return the object pickled in the file at ``path``."""
        # NOTE(security): pickle.load can execute arbitrary code. Only
        # point this at dump files produced by a trusted run.
        # Binary mode is required: text mode fails on Python 3 and can
        # corrupt the stream through newline translation on Windows.
        with open(path, 'rb') as handle:
            return pickle.load(handle)

    def run_detection_2(self):
        """
        Run the offline detection.

        Training fingerprints and both fingerprint-to-timestamps mappings
        are loaded first. Each testing file is then loaded and every one
        of its fingerprints is passed to the detector together with the
        full list of training fingerprints. Hosts for which no fingerprint
        triggered an alert are printed.

        Returns
        -------------
        (alerts, benign, training_mapping, testing_mapping) : tuple
            ``alerts`` and ``benign`` are lists of testing fingerprints,
            split by the detector's verdict; the two mappings are the
            objects unpickled from the dump files.
        """
        alerts = []
        benign = []

        self._load_from_csv_2()

        # now load the mapping between fingerprints and timestamps...
        print("loading__fingerprint_to_timestamps_training")
        self.fingerprint_to_timestamps_training = self._load_pickled_mapping(
            self.dump_fingerprint_to_timestamps_training)
        print("loading__fingerprint_to_timestamps_testing")
        self.fingerprint_to_timestamps_testing = self._load_pickled_mapping(
            self.dump_fingerprint_to_timestamps_testing)
        print("all_fingerprint_to_timestamp_mappings_loaded")

        # Flatten the per-host training fingerprints into a single list.
        all_training_fingerprints = [
            fingerprint
            for fingerprints in self.training_manager.hosts_fingerprints.values()
            for fingerprint in fingerprints
        ]

        # Counters kept for the optional summary at the end of the method.
        total_files = 0
        total_detected = 0

        for path in self.files:
            if "testing" not in path:
                continue

            self.testing_manager.read_from_file(path)
            print("" + path + " has been loaded for testing.")

            for host, test_fingerprints in self.testing_manager.hosts_fingerprints.items():
                total_files += 1
                detected = False
                for fingerprint in test_fingerprints:
                    if self.detector.detection(all_training_fingerprints, fingerprint):
                        if not detected:
                            total_detected += 1
                            detected = True
                        alerts.append(fingerprint)
                    else:
                        benign.append(fingerprint)
                if not detected:
                    print(host)

            # Start from an empty manager for the next testing file.
            self.testing_manager.hosts_fingerprints = dict()

        # Uncomment if you want to print on how many files at least an alert has been triggered.
        # print("{}/{} files detected.".format(total_detected, total_files))

        return (alerts, benign, self.fingerprint_to_timestamps_training,
                self.fingerprint_to_timestamps_testing)
class Aggregator:
    """
    This class is the engine of Decanter. It is responsible for training
    and testing fingerprints from input data.
    """

    # Timeout used only in testing mode.
    timeout = datetime.timedelta(minutes=10)

    def __init__(self, mode=0, offline=0,
                 dump_testing='testing_fingerprints.csv',
                 dump_training='training_fingerprints.csv',
                 dump_fingerprint_to_timestamps_training='training_fingerprint_to_timestamps.txt',
                 dump_fingerprint_to_timestamps_testing='testing_fingerprint_to_timestamps.txt'):
        """
        Validate the mode flags and initialize the aggregation state.

        Parameter
        -------------
        mode : int
            0 for Training mode - 1 for Testing mode.
        offline : int
            0 or 1. When 1, fingerprints are dumped to files instead of
            being checked by the detector in testing mode.
        dump_testing, dump_training : str
            Files for the offline dumps of fingerprints.
        dump_fingerprint_to_timestamps_training,
        dump_fingerprint_to_timestamps_testing : str
            Files that receive the pickled mapping between fingerprints
            and timestamps.

        Raises
        -------------
        ValueError
            If ``mode`` or ``offline`` is neither 0 nor 1.
        """
        if mode != 0 and mode != 1:
            raise ValueError(
                'The mode value is not valid. Choose between 1 or 0.')
        # Name the offending flag: an invalid ``offline`` used to be
        # reported as an invalid ``mode``.
        if offline != 0 and offline != 1:
            raise ValueError(
                'The offline value is not valid. Choose between 1 or 0.')

        self.hosts_clusters = {}
        self.label_generator = LabelGenerator()
        self.fin_generator = FingerprintGenerator()
        self.fin_manager = FingerprintManager()
        self.detector = DetectionModule()
        self.mode = mode
        self.alerts = []
        self.time_start = None
        self.time_current = None
        self.offline = offline

        self.fingerprint_to_timestamps_testing = {}
        self.fingerprint_to_timestamps_training = {}

        # Files for offline dumps
        self.dump_testing = dump_testing
        self.dump_training = dump_training

        # will be used to store the mapping between fingerprints and timestamps...
        self.dump_fingerprint_to_timestamps_training = dump_fingerprint_to_timestamps_training
        self.dump_fingerprint_to_timestamps_testing = dump_fingerprint_to_timestamps_testing

        # Known browsers
        self.browser_user_agents = set()

        # Referrer graphs per user_agent
        self.referrerGraphs = dict()

    def change_mode(self, mode):
        """
        Switch between Training (0) and Testing (1) mode.

        Raises
        -------------
        ValueError
            If ``mode`` is neither 0 nor 1.
        """
        if mode != 0 and mode != 1:
            raise ValueError('The mode value is not valid')

        self.mode = mode
        if mode == 1:
            print("Aggregator switched to Testing mode.")
        else:
            print("Aggregator switched to Training mode.")

    def analyze_log(self, data):
        """
        Load and aggregate the HTTP requests from a Dataframe

        Parameter
        -------------
        data : pandas Dataframe

        Returns
        -------------
        The fingerprint-to-timestamps mapping of the current mode, or
        None if ``self.mode`` is neither 0 nor 1.
        """
        if self.mode == 0:
            self._training(data)
            return self.fingerprint_to_timestamps_training
        if self.mode == 1:
            self._testing(data)
            return self.fingerprint_to_timestamps_testing
        return None

    def _fingerprint_pending_clusters(self):
        """Create fingerprints for every aggregated (host, cluster) pair."""
        for host, clusters in self.hosts_clusters.items():
            for http_cluster in clusters.values():
                self._create_fingerprints(host, http_cluster)

    def _flush_testing_window(self):
        """Fingerprint the aggregated requests and start a new time window."""
        self._fingerprint_pending_clusters()
        # Flush the aggregated HTTP requests and reset the starting time
        self.hosts_clusters.clear()
        self.time_start = None

    def _testing(self, data):
        """
        Aggregate the requests of ``data`` and fingerprint them each time
        the gap between the first and the current request of the window
        exceeds ``self.timeout``.
        """
        for _, row in data.iterrows():
            # Generate HTTP request
            request = HTTPRequest(row.to_dict())

            # Initialize Time
            if self.time_start is None:
                self.time_start = request.ts

            # Aggregate request
            self._insert_http_request(request)

            # Set current time to the current HTTP request timestamp
            self.time_current = request.ts

            # Check if the timeout is expired
            if (self.time_current - self.time_start) > self.timeout:
                self._flush_testing_window()

        # Writing of fingerprints in case the file "ended" and the timeout did not exceed.
        if self.hosts_clusters:
            self._flush_testing_window()

    def _training(self, data):
        """
        Aggregate every request of ``data``, fingerprint the resulting
        clusters and, in OFFLINE mode, dump the fingerprints to
        ``self.dump_training``.
        """
        for _, row in data.iterrows():
            # Generate and aggregate the HTTP request
            self._insert_http_request(HTTPRequest(row.to_dict()))

        # Create and store the fingerprints
        self._fingerprint_pending_clusters()

        # In OFFLINE mode , dump the generated fingerprints in a .csv file.
        if self.offline == 1:
            self.fin_manager.write_to_file(self.dump_training)

        self.hosts_clusters.clear()

    def _dump_mapping(self, path, mapping):
        """Pickle ``mapping`` into the file at ``path``, replacing its content."""
        # Binary mode is required: pickle.dump writes bytes on Python 3,
        # and text mode can corrupt the stream on Windows.
        with open(path, 'wb') as handle:
            pickle.dump(mapping, handle)

    def _training_fingerprints(self):
        """Return a fresh flat list of all fingerprints held by the manager."""
        return [
            fingerprint
            for fingerprints in self.fin_manager.hosts_fingerprints.values()
            for fingerprint in fingerprints
        ]

    def _create_fingerprints(self, host, http_cluster):
        """
        Label a cluster of HTTP requests and turn each labelled sub-cluster
        into a fingerprint.

        In Training mode the fingerprints are stored in the fingerprint
        manager. In Testing mode they are either appended to the offline
        dump (OFFLINE mode) or passed to the detector, which may add them
        to ``self.alerts``.

        Parameter
        ----------------
        host : key under which the fingerprints are stored / dumped
        http_cluster : list of HTTPRequest

        Returns
        ----------------
        None
        """
        # Removed GET-POST split and replaced with Label_generator
        labels, referrerGraph = self.label_generator.generate_label(
            http_cluster, self.mode, self.browser_user_agents,
            self.referrerGraphs)

        # Training mode
        if self.mode == 0:
            for key, cluster in labels.items():
                method = key[0]
                label = key[1]
                self.fin_manager.store(
                    host,
                    self.fin_generator.generate_fingerprint(
                        cluster, method, label,
                        self.fingerprint_to_timestamps_training))

                # If browser, store to known browser user-agents
                if label == "Browser":
                    user_agent = http_cluster[0].header_values.get(
                        'user-agent', None)
                    self.browser_user_agents.add(user_agent)

            # Dump once per call instead of once per label: every dump
            # rewrites the whole mapping, so the final file is the same.
            if labels:
                self._dump_mapping(
                    self.dump_fingerprint_to_timestamps_training,
                    self.fingerprint_to_timestamps_training)

        # Testing mode
        elif self.mode == 1:
            user_agent = http_cluster[0].header_values.get('user-agent', None)
            self.referrerGraphs[user_agent] = referrerGraph

            for key, cluster in labels.items():
                method = key[0]
                label = key[1]
                new_fingerprint = self.fin_generator.generate_fingerprint(
                    cluster, method, label,
                    self.fingerprint_to_timestamps_testing)

                if self.offline == 1:
                    # In OFFLINE mode the fingerprint is appended to the
                    # testing dump file.
                    self.fin_manager.write_fingerprint_to_file(
                        self.dump_testing, new_fingerprint, host)
                elif self.detector.detection(self._training_fingerprints(),
                                             new_fingerprint):
                    self.alerts.append(new_fingerprint)

            # Same reasoning as in training mode: one dump per call.
            if self.offline == 1 and labels:
                self._dump_mapping(
                    self.dump_fingerprint_to_timestamps_testing,
                    self.fingerprint_to_timestamps_testing)

    def _insert_http_request(self, req):
        """
        Aggregate the HTTP requests per host and user-agent

        Parameter
        -------------------
        req : HTTPRequest object
        """
        # Fetch (or create) the clusters of the request's host.
        host_clusters = self.hosts_clusters.setdefault(req.orig_ip, {})

        # Requests that DO NOT HAVE a User-Agent share the 'None' cluster.
        if 'user-agent' in req.header_values:
            agent = req.header_values['user-agent']
        else:
            agent = 'None'

        host_clusters.setdefault(agent, []).append(req)
def __init__(self, folder_path, sort_key_offset=84):
    """
    Collect the .csv files to analyze and initialize the detection state.

    Parameter
    -------------
    folder_path : str
        Prefix that is concatenated directly with "*.csv" to build the
        glob pattern (no path separator is inserted).
    sort_key_offset : int
        Number of leading characters of each path that are ignored when
        ordering the files. Defaults to 84, the value that used to be
        hard-coded; pass ``len(folder_path)`` to order by file name.
    """
    # Paths shorter than the offset all get an empty key and therefore
    # keep the order returned by glob (sorted() is stable).
    self.files = sorted(glob.glob(folder_path + "*.csv"),
                        key=lambda path: path[sort_key_offset:])

    self.training_manager = FingerprintManager()
    self.testing_manager = FingerprintManager()
    self.detector = DetectionModule()