def test_augment_data():
    """Smoke-test ``Model._augment_data`` with a 2x3 array and three string labels."""
    model = Model(10, labels=['Unknown'])
    features = np.array([[1, 2, 3], [4, 5, 6]])
    targets = np.array(['label1', 'label2', 'label3'])
    # Only checks that the call completes; nothing is asserted on the result.
    model._augment_data(features, targets)
def train(self, data_dir, save_path, m, algorithm):
    """
    Wrap an estimator in a ``Model``, train it and save the result.

    Args:
        data_dir: forwarded to ``Model.train``
        save_path: forwarded to ``Model.save``
        m: estimator object handed to ``Model`` as ``model``
        algorithm: handed to ``Model`` as ``model_type``
    """
    # Everything the Model needs comes from this instance's configuration
    # plus the estimator and algorithm name supplied by the caller.
    model_settings = {
        'duration': self.duration,
        'hidden_size': self.state_size,
        'labels': self.conf_labels,
        'model': m,
        'model_type': algorithm,
        'threshold_time': self.threshold,
    }
    trainable = Model(**model_settings)

    # Train first, then write the trained model to the requested path.
    trainable.train(data_dir)
    trainable.save(save_path)
def __init__(self):
    """
    Configure the instance and immediately run the requested operation.

    Sets up logging, parses the command line, loads the configuration,
    gathers the input files and creates the ``Model``; then dispatches on
    ``self.args.operation`` ('eval', 'train' or 'test') and
    ``self.args.algorithm`` ('onelayer', 'randomforest' or 'sos').
    """
    ## Logging for this instance
    self.logger = logging.getLogger(__name__)
    logging.basicConfig(level=logging.INFO)

    ## Command line arguments
    self.args = None
    self.read_args()

    ## Values from the configuration file
    self.get_config()
    self.common = Common(config=self.config)

    ## Let Common set up the logger used for debugging messages
    self.logger = Common().setup_logger(self.logger)

    ## Network traffic files to parse
    self.get_files()
    self.model_hash = None
    self.model = Model(duration=self.duration, hidden_size=None,
                       model_type=self.args.algorithm)

    def new_algorithm():
        ## Built lazily so that it sees the model hash set by load_model()
        return BaseAlgorithm(
            files=self.files,
            config=self.config,
            model=self.model,
            model_hash=self.model_hash,
            model_path=self.args.trained_model,
            sos_model=self.args.sos_model)

    operation = self.args.operation
    algorithm = self.args.algorithm
    ## onelayer (one-layer neural network) and randomforest (decision
    ## tree-based model) both run through BaseAlgorithm
    uses_base_algorithm = algorithm in ('onelayer', 'randomforest')

    ## Evaluation returns predictions that are useful for the deployment
    ## of networkml in an operational environment.
    if operation == 'eval':
        self.load_model()
        if uses_base_algorithm:
            new_algorithm().eval(algorithm)
        ## SOS refers to statistical outlier selection model
        elif algorithm == 'sos':
            from networkml.algorithms.sos.eval_SoSModel import eval_pcap
            eval_pcap(self.args.path, self.args.sos_model,
                      self.conf_labels, self.time_const)

    ## Train entails training a new model on specific packet captures
    elif operation == 'train':
        if uses_base_algorithm:
            if algorithm == 'onelayer':
                classifier = MLPClassifier(self.state_size,
                                           alpha=0.1,
                                           activation='relu',
                                           max_iter=1000)
            else:
                classifier = RandomForestClassifier(n_estimators=100,
                                                    min_samples_split=5,
                                                    class_weight='balanced')
            new_algorithm().train(self.args.path, self.args.save,
                                  classifier, algorithm)
        ## SOS refers to statistical outlier selection model
        elif algorithm == 'sos':
            from networkml.algorithms.sos.train_SoSModel import train
            train(self.args.path, self.args.sos_model, self.time_const,
                  self.rnn_size, self.conf_labels, self.args.save)

    ## Test is for checking overall performance of networkML models for
    ## the device classification task. It is a benchmarking operation.
    elif operation == 'test':
        self.load_model()
        if uses_base_algorithm:
            new_algorithm().test(self.args.path, self.args.save)
        ## SOS refers to statistical outlier selection model
        elif algorithm == 'sos':
            self.logger.info(
                'There is no testing operation for the SoSModel.')
class NetworkML():
    """
    Main class that instantiates prediction models of the types of devices
    found in computer network traffic and whether that device is acting
    normal given its type (also based on network traffic). The three model
    types built in to this class are random forests, neural networks, and
    stochastic outlier selection (SOS).
    """

    ## File extensions that get_files treats as network captures
    _CAPTURE_EXTENSIONS = ('pcap', 'dump', 'cap')

    def __init__(self):
        """
        Configure the instance and immediately run the requested operation.

        Sets up logging, parses the command line, loads the configuration,
        gathers the input files and creates the ``Model``; then dispatches
        on ``self.args.operation`` and ``self.args.algorithm``.
        """
        ## Set logging information for instance
        self.logger = logging.getLogger(__name__)
        logging.basicConfig(level=logging.INFO)

        ## Take arguments from command line
        self.args = None
        self.read_args()

        ## Take input from configuration file
        self.get_config()
        self.common = Common(config=self.config)

        ## Instantiate a logger to log messages to aid debugging
        self.logger = Common().setup_logger(self.logger)

        ## Add network traffic files for parsing
        self.get_files()
        self.model_hash = None
        self.model = Model(duration=self.duration, hidden_size=None,
                           model_type=self.args.algorithm)

        def create_base_alg():
            ## Created on demand so it picks up the hash set by load_model
            return BaseAlgorithm(
                files=self.files, config=self.config,
                model=self.model, model_hash=self.model_hash,
                model_path=self.args.trained_model,
                sos_model=self.args.sos_model)

        ## onelayer refers to a one-layer neural network, randomforest to a
        ## decision tree-based model; both are driven through BaseAlgorithm
        uses_base_alg = self.args.algorithm in ('onelayer', 'randomforest')

        ## Check whether operation is evaluation, train, or test
        ## Evaluation returns predictions that are useful for the deployment
        ## of networkml in an operational environment.
        if self.args.operation == 'eval':
            self.load_model()
            if uses_base_alg:
                base_alg = create_base_alg()
                base_alg.eval(self.args.algorithm)
            ## SOS refers to stochastic outlier selection model
            elif self.args.algorithm == 'sos':
                from networkml.algorithms.sos.eval_SoSModel import eval_pcap
                eval_pcap(self.args.path, self.args.sos_model,
                          self.conf_labels, self.time_const)

        ## Train entails training a new model on specific packet captures
        elif self.args.operation == 'train':
            if self.args.algorithm == 'onelayer':
                m = MLPClassifier(self.state_size,
                                  alpha=0.1,
                                  activation='relu',
                                  max_iter=1000)
                base_alg = create_base_alg()
                base_alg.train(self.args.path, self.args.save,
                               m, self.args.algorithm)
            elif self.args.algorithm == 'randomforest':
                m = RandomForestClassifier(n_estimators=100,
                                           min_samples_split=5,
                                           class_weight='balanced')
                base_alg = create_base_alg()
                base_alg.train(self.args.path, self.args.save,
                               m, self.args.algorithm)
            ## SOS refers to stochastic outlier selection model
            elif self.args.algorithm == 'sos':
                from networkml.algorithms.sos.train_SoSModel import train
                train(self.args.path, self.args.sos_model, self.time_const,
                      self.rnn_size, self.conf_labels, self.args.save)

        ## Test is for checking overall performance of networkML models for
        ## the device classification task. It is a benchmarking operation.
        elif self.args.operation == 'test':
            self.load_model()
            if uses_base_alg:
                base_alg = create_base_alg()
                base_alg.test(self.args.path, self.args.save)
            ## SOS refers to stochastic outlier selection model
            elif self.args.algorithm == 'sos':
                self.logger.info(
                    'There is no testing operation for the SoSModel.')

    def read_args(self):
        """
        Read arguments from command line to determine what operations to
        implement. The parsed namespace is stored in ``self.args``.
        """
        parser = argparse.ArgumentParser()
        parser.add_argument('--algorithm', '-a', default='onelayer',
                            choices=['onelayer', 'randomforest', 'sos'],
                            help='which algorithm to run')
        parser.add_argument('--format', '-f', default='pcap',
                            choices=['netflow', 'pcap'],
                            help='which format are the files to process in')
        parser.add_argument('--operation', '-o', default='eval',
                            choices=['eval', 'train', 'test'],
                            help='which operation to run')
        parser.add_argument('--sos_model', '-s',
                            default='networkml/trained_models/sos/SoSmodel',
                            help='path to SoSmodel')
        parser.add_argument(
            '--trained_model', '-m',
            default='networkml/trained_models/onelayer/OneLayerModel.pkl',
            help='path to the trained model file')
        parser.add_argument(
            '--path', '-p', default='/pcaps',
            help='path to file or directory of files to process')
        parser.add_argument(
            '--save', '-w',
            default='networkml/trained_models/onelayer/OneLayerModel.pkl',
            help='path to save the trained model, if training')

        self.args = parser.parse_args()

    @classmethod
    def _is_capture_file(cls, filename):
        """Return True when ``filename`` matches ``*.<ext>`` for a capture extension."""
        return any(fnmatch.fnmatch(filename, '*.' + extension)
                   for extension in cls._CAPTURE_EXTENSIONS)

    def get_files(self):
        """
        Add directory of files or file for parsing.

        ``self.files`` is reset and then filled with every file under
        ``self.args.path`` (searched recursively when it is a directory)
        whose name ends in ``.pcap``, ``.dump`` or ``.cap``. Problems are
        logged as errors; nothing is raised.
        """
        # TODO checking extensions here should be moved to parsers, and it
        # should probably use 'magic' rather than extensions. See Python
        # magic library
        self.files = []
        target = Path(self.args.path)
        if target.is_dir():
            for root, _, filenames in os.walk(self.args.path):
                for extension in self._CAPTURE_EXTENSIONS:
                    for filename in fnmatch.filter(filenames,
                                                   '*.' + extension):
                        self.files.append(os.path.join(root, filename))
        elif target.is_file():
            # Use the same '*.ext' rule as the directory branch so that an
            # extensionless file that is merely *named* 'pcap' is rejected.
            if self._is_capture_file(os.path.basename(str(self.args.path))):
                self.files.append(str(self.args.path))
            else:
                self.logger.error(
                    'Input \'%s\' does not have a supported extension (%s).',
                    str(self.args.path),
                    ', '.join(self._CAPTURE_EXTENSIONS))
        else:
            self.logger.error(
                'Input \'%s\' was neither a file nor a directory.',
                str(self.args.path))

        if not self.files:
            self.logger.error('Did not find file(s) from \'%s\'.',
                              str(self.args.path))

    def get_config(self,
                   cfg_file='networkml/configs/config.json',
                   labels_file='networkml/configs/label_assignments.json'):
        """
        Load values from configuration file.

        Any failure while reading either file is logged as an error and
        swallowed, so attributes may be left unset afterwards.

        Args:
            cfg_file: path to configuration file
            labels_file: path to labels (or the types of devices predicted)
        """
        try:
            with open(cfg_file, 'r') as config_file:
                self.config = json.load(config_file)
            ## Time constant is used for creating a moving average
            self.time_const = self.config['time constant']
            ## State size sets the number of nodes in the neural network
            self.state_size = self.config['state size']
            ## An amount of time set between investigations of a potentially
            ## suspicious device
            self.look_time = self.config['look time']  ## time in seconds
            ## Threshold sets the confidence needed to identify abnormal
            ## behavior
            self.threshold = self.config['threshold']
            ## Set parameter for SOS model
            self.rnn_size = self.config['rnn size']
            ## Duration for time window of network traffic for which to
            ## compute information on features
            self.duration = self.config['duration']
            #self.batch_size = self.config['batch size']

            ## Import device label typology
            with open(labels_file, 'r') as label_file:
                labels = json.load(label_file)
            ## Every configured label plus a catch-all 'Unknown' entry
            self.conf_labels = [labels[label] for label in labels]
            self.conf_labels.append('Unknown')
            self.config['conf labels'] = self.conf_labels
        except Exception as e:  # pragma: no cover
            self.logger.error("Unable to read '%s' properly because: %s",
                              cfg_file, str(e))

    def load_model(self):
        """
        Load trained machine learning model.

        Stores the SHA-224 hex digest of the model file in
        ``self.model_hash`` and loads the file into ``self.model``.
        """
        with open(self.args.trained_model, 'rb') as handle:
            self.model_hash = hashlib.sha224(handle.read()).hexdigest()

        self.model.load(self.args.trained_model)
        self.logger.debug('Loaded model from %s', self.args.trained_model)
def __init__(self):
    """
    Configure the instance and immediately run the requested operation.

    Sets up logging, parses the command line, loads the configuration,
    gathers the input files and creates the ``Model``; then dispatches on
    ``self.args.operation`` ('eval', 'train' or 'test') and
    ``self.args.algorithm`` ('onelayer', 'randomforest' or 'sos').
    """
    self.logger = logging.getLogger(__name__)
    logging.basicConfig(level=logging.INFO)

    self.args = None
    self.read_args()
    self.get_config()
    self.common = Common(config=self.config)
    self.logger = Common().setup_logger(self.logger)
    self.get_files()

    self.model_hash = None
    self.model = Model(duration=self.duration, hidden_size=None,
                       model_type=self.args.algorithm)

    def make_algorithm():
        # Built at call time so it sees the hash computed by load_model().
        return BaseAlgorithm(files=self.files,
                             config=self.config,
                             model=self.model,
                             model_hash=self.model_hash,
                             model_path=self.args.trained_model)

    operation = self.args.operation
    algorithm = self.args.algorithm
    # Both of these algorithms are driven through BaseAlgorithm.
    via_base_algorithm = algorithm in ('onelayer', 'randomforest')

    if operation == 'eval':
        self.load_model()
        if via_base_algorithm:
            make_algorithm().eval(algorithm)
        elif algorithm == 'sos':
            from networkml.algorithms.sos.eval_SoSModel import eval_pcap
            eval_pcap(self.args.path, self.conf_labels, self.time_const)

    elif operation == 'train':
        if via_base_algorithm:
            # The estimator is created before the BaseAlgorithm wrapper.
            if algorithm == 'onelayer':
                estimator = MLPClassifier(self.state_size,
                                          alpha=0.1,
                                          activation='relu',
                                          max_iter=1000)
            else:
                estimator = RandomForestClassifier(n_estimators=100,
                                                   min_samples_split=5,
                                                   class_weight='balanced')
            make_algorithm().train(self.args.path, self.args.save,
                                   estimator, algorithm)
        elif algorithm == 'sos':
            from networkml.algorithms.sos.train_SoSModel import train
            train(self.args.path, self.time_const, self.rnn_size,
                  self.conf_labels, self.args.save)

    elif operation == 'test':
        self.load_model()
        if via_base_algorithm:
            make_algorithm().test(self.args.path, self.args.save)
        elif algorithm == 'sos':
            self.logger.info(
                'There is no testing operation for the SoSModel.')
class NetworkML():
    """
    Main class to run different algorithms against different network
    traffic data sources
    """

    # File extensions that get_files treats as network captures
    _CAPTURE_EXTENSIONS = ('pcap', 'dump', 'cap')

    def __init__(self):
        """
        Configure the instance and immediately run the requested operation.

        Sets up logging, parses the command line, loads the configuration,
        gathers the input files and creates the ``Model``; then dispatches
        on ``self.args.operation`` and ``self.args.algorithm``.
        """
        self.logger = logging.getLogger(__name__)
        logging.basicConfig(level=logging.INFO)

        self.args = None
        self.read_args()
        self.get_config()
        self.common = Common(config=self.config)
        self.logger = Common().setup_logger(self.logger)
        self.get_files()
        self.model_hash = None
        self.model = Model(duration=self.duration, hidden_size=None,
                           model_type=self.args.algorithm)

        def base_algorithm():
            # One place for the construction that every non-SOS branch
            # needs; built at call time so it sees load_model()'s hash.
            return BaseAlgorithm(files=self.files, config=self.config,
                                 model=self.model,
                                 model_hash=self.model_hash,
                                 model_path=self.args.trained_model)

        uses_base_alg = self.args.algorithm in ('onelayer', 'randomforest')

        if self.args.operation == 'eval':
            self.load_model()
            if uses_base_alg:
                base_algorithm().eval(self.args.algorithm)
            elif self.args.algorithm == 'sos':
                from networkml.algorithms.sos.eval_SoSModel import eval_pcap
                eval_pcap(self.args.path, self.conf_labels, self.time_const)

        elif self.args.operation == 'train':
            if self.args.algorithm == 'onelayer':
                m = MLPClassifier(self.state_size,
                                  alpha=0.1,
                                  activation='relu',
                                  max_iter=1000)
                base_algorithm().train(self.args.path, self.args.save,
                                       m, self.args.algorithm)
            elif self.args.algorithm == 'randomforest':
                m = RandomForestClassifier(n_estimators=100,
                                           min_samples_split=5,
                                           class_weight='balanced')
                base_algorithm().train(self.args.path, self.args.save,
                                       m, self.args.algorithm)
            elif self.args.algorithm == 'sos':
                from networkml.algorithms.sos.train_SoSModel import train
                train(self.args.path, self.time_const, self.rnn_size,
                      self.conf_labels, self.args.save)

        elif self.args.operation == 'test':
            self.load_model()
            if uses_base_alg:
                base_algorithm().test(self.args.path, self.args.save)
            elif self.args.algorithm == 'sos':
                self.logger.info(
                    'There is no testing operation for the SoSModel.')

    def read_args(self):
        """Parse the command line and store the namespace in ``self.args``."""
        parser = argparse.ArgumentParser()
        parser.add_argument('--algorithm', '-a', default='onelayer',
                            choices=['onelayer', 'randomforest', 'sos'],
                            help='which algorithm to run')
        parser.add_argument('--format', '-f', default='pcap',
                            choices=['netflow', 'pcap'],
                            help='which format are the files to process in')
        parser.add_argument('--operation', '-o', default='eval',
                            choices=['eval', 'train', 'test'],
                            help='which operation to run')
        parser.add_argument(
            '--trained_model', '-m',
            default='networkml/trained_models/onelayer/OneLayerModel.pkl',
            help='path to the trained model file')
        parser.add_argument(
            '--path', '-p', default='/pcaps',
            help='path to file or directory of files to process')
        parser.add_argument(
            '--save', '-w',
            default='networkml/trained_models/onelayer/OneLayerModel.pkl',
            help='path to save the trained model, if training')

        self.args = parser.parse_args()

    @classmethod
    def _is_capture_file(cls, filename):
        """Return True when ``filename`` matches ``*.<ext>`` for a capture extension."""
        return any(fnmatch.fnmatch(filename, '*.' + extension)
                   for extension in cls._CAPTURE_EXTENSIONS)

    def get_files(self):
        """
        Fill ``self.files`` with the capture files found at ``self.args.path``.

        A directory is searched recursively; a single file is accepted only
        when its name ends in ``.pcap``, ``.dump`` or ``.cap``. Problems
        are logged as errors; nothing is raised.
        """
        # TODO checking extensions here should be moved to parsers, and it
        # should probably use 'magic' rather than extensions
        self.files = []
        target = Path(self.args.path)
        if target.is_dir():
            for root, _, filenames in os.walk(self.args.path):
                for extension in self._CAPTURE_EXTENSIONS:
                    for filename in fnmatch.filter(filenames,
                                                   '*.' + extension):
                        self.files.append(os.path.join(root, filename))
        elif target.is_file():
            # Same '*.ext' rule as the directory branch: an extensionless
            # file that is merely *named* 'pcap' must not be accepted.
            if self._is_capture_file(os.path.basename(str(self.args.path))):
                self.files.append(str(self.args.path))
            else:
                self.logger.error(
                    'Input \'%s\' does not have a supported extension (%s).',
                    str(self.args.path),
                    ', '.join(self._CAPTURE_EXTENSIONS))
        else:
            self.logger.error(
                'Input \'%s\' was neither a file nor a directory.',
                str(self.args.path))

        if not self.files:
            self.logger.error('Did not find file(s) from \'%s\'.',
                              str(self.args.path))

    def get_config(self,
                   cfg_file='networkml/configs/config.json',
                   labels_file='networkml/configs/label_assignments.json'):
        """
        Load settings and device labels from two JSON files.

        Any failure while reading either file is logged as an error and
        swallowed, so attributes may be left unset afterwards.

        Args:
            cfg_file: path to the JSON configuration file
            labels_file: path to the JSON label assignments file
        """
        try:
            with open(cfg_file, 'r') as config_file:
                self.config = json.load(config_file)
            self.time_const = self.config['time constant']
            self.state_size = self.config['state size']
            self.look_time = self.config['look time']
            self.threshold = self.config['threshold']
            self.rnn_size = self.config['rnn size']
            self.duration = self.config['duration']
            #self.batch_size = self.config['batch size']

            with open(labels_file, 'r') as label_file:
                labels = json.load(label_file)
            # Every configured label plus a catch-all 'Unknown' entry
            self.conf_labels = [labels[label] for label in labels]
            self.conf_labels.append('Unknown')
            self.config['conf labels'] = self.conf_labels
        except Exception as e:  # pragma: no cover
            self.logger.error("Unable to read '%s' properly because: %s",
                              cfg_file, str(e))

    def load_model(self):
        """
        Hash the trained model file and load it into ``self.model``.

        The SHA-224 hex digest of the file is stored in ``self.model_hash``.
        """
        # Compute model hash
        with open(self.args.trained_model, 'rb') as handle:
            self.model_hash = hashlib.sha224(handle.read()).hexdigest()

        self.model.load(self.args.trained_model)
        self.logger.debug('Loaded model from %s', self.args.trained_model)
def test_get_features():
    """Smoke-test ``Model.get_features`` against ``tests/test.pcap``."""
    pcap_path = 'tests/test.pcap'
    model = Model(10, labels=['Unknown'])

    # Equivalent of `touch`: create the file if missing and refresh its
    # access/modification times while it is open in append mode.
    with open(pcap_path, 'a'):
        os.utime(pcap_path, None)

    # Only checks that the call completes; nothing is asserted on the result.
    model.get_features(pcap_path)
def create_dataset(
    data_dir,
    time_const,
    model_path='networkml/trained_models/onelayer/OneLayerModel.pkl',
    label=None,
    model_type='randomforest'
):
    """
    Build per-pcap training data pairing session features with model outputs.

    Args:
        data_dir: a single ``.pcap`` file, or a directory that is walked
            recursively for ``.pcap`` files
        time_const: time constant handed to ``average_representation``
        model_path: path of the trained model to load
        label: optional known label; when given, every classification is
            rewritten so ``label`` gets confidence 1 and all others 0
        model_type: model type passed to ``Model``

    Returns:
        dict mapping each pcap path to a list of dicts with the keys
        'model outputs', 'session info' and 'key'. A pcap for which the
        model produced no timestamps maps to an empty list.
    """
    logger = logging.getLogger(__name__)
    try:
        if os.environ.get('LOG_LEVEL', '') != '':
            logger.setLevel(os.environ['LOG_LEVEL'])
    except Exception as e:  # pragma: no cover
        logger.error(
            'Unable to set logging level because: {0} defaulting to INFO.'.format(str(e)))

    # Load the model
    logger.debug('Loading model')
    model = Model(duration=None, hidden_size=None, model_type=model_type)
    model.load(model_path)

    # Get all the pcaps in the training directory
    logger.debug('Getting pcaps')
    pcaps = []
    try:
        # data_dir may itself be a single capture file
        if os.path.splitext(data_dir)[-1] == '.pcap':
            pcaps.append(data_dir)
    except Exception as e:  # pragma: no cover
        logger.debug('Skipping {0} because: {1}'.format(data_dir, str(e)))
    for dirpath, _, filenames in os.walk(data_dir):
        for filename in filenames:
            if os.path.splitext(filename)[-1] == '.pcap':
                pcaps.append(os.path.join(dirpath, filename))

    # Get and store the representations using the supplied model
    # Representations will be computed separately for each pcap
    representations = {}
    for count, pcap in enumerate(pcaps, start=1):
        logger.info('Working on {0} ({1} bytes) ({2}/{3})'.format(
            pcap, os.path.getsize(pcap), str(count), len(pcaps)))
        reps, _, timestamps, _, _, _ = model.get_representation(
            pcap,
            mean=False
        )
        sessions = model.sessions

        # Without timestamps there are no model outputs to attach sessions
        # to; previously this fell through and raised TypeError/IndexError
        # in the session loop below.
        if timestamps is None or len(timestamps) == 0:
            logger.debug('No representations for %s; skipping sessions', pcap)
            representations[pcap] = []
            continue

        # Compute the mean representations
        prev_rep = None
        prev_time = None
        model_outputs = {}
        for i, timestamp in enumerate(timestamps):
            rep = reps[i]
            new_rep, rep_time = average_representation(
                rep,
                timestamp,
                prev_rep,
                prev_time,
                time_const
            )
            preds = model.classify_representation(new_rep)
            if label is not None:
                # Force the known label to confidence 1, everything else 0
                preds = [(p[0], 0) for p in preds if p[0] != label]
                preds.append((label, 1))

            model_outputs[timestamp] = {
                'classification': list(preds),
                'representation': list(rep),
                'mean representation': list(new_rep)
            }
            prev_rep, prev_time = new_rep, rep_time

        # Clean the sessions and merge them into a single session dict
        session_rep_pairs = []
        source = get_source(sessions, address_type='IP')
        for session_dict in sessions:
            for key, value in session_dict.items():
                session_info = featurize_session(key, value, source=source)

                # Pick the last timestamp in iteration order that precedes
                # the session's first packet; fall back to the first one.
                first_time = value[0][0].timestamp()
                prior_time = None
                for timestamp in timestamps:
                    if first_time > timestamp.timestamp():
                        prior_time = timestamp
                if prior_time is None:
                    prior_time = timestamps[0]

                pair = {
                    'model outputs': model_outputs[prior_time],
                    'session info': session_info,
                    'key': key
                }
                if session_info is not None:
                    session_rep_pairs.append(pair)

        representations[pcap] = session_rep_pairs

    byte_size = sys.getsizeof(pickle.dumps(representations))
    logger.debug(
        'created training data of size %f mb',
        round(byte_size/1000000, 3)
    )

    return representations