def test_augment_data():
    """Smoke test: ``Model._augment_data`` accepts a small feature/label pair."""
    classifier = Model(10, labels=['Unknown'])
    # Two feature rows and three string labels, passed as NumPy arrays.
    features = np.array([[1, 2, 3], [4, 5, 6]])
    targets = np.array(['label1', 'label2', 'label3'])
    classifier._augment_data(features, targets)
Beispiel #2
0
 def train(self, data_dir, save_path, m, algorithm):
     """
     Build a ``Model`` from this instance's settings, train it, and save it.

     Args:
         data_dir: forwarded unchanged to ``Model.train``
         save_path: forwarded unchanged to ``Model.save``
         m: object handed to ``Model`` as its ``model`` argument
         algorithm: value handed to ``Model`` as its ``model_type`` argument
     """
     # Constructor settings, gathered in one place before the model is built
     settings = {
         'duration': self.duration,
         'hidden_size': self.state_size,
         'labels': self.conf_labels,
         'model': m,
         'model_type': algorithm,
         'threshold_time': self.threshold,
     }
     trainer = Model(**settings)
     # Fit first, then persist the fitted model
     trainer.train(data_dir)
     trainer.save(save_path)
Beispiel #3
0
    def __init__(self):
        """
        Read the command line and configuration, collect the input files,
        build the ``Model`` wrapper, and then immediately run the requested
        operation ('eval', 'train' or 'test') for the requested algorithm.
        """
        ## Module logger; request INFO-level logging for this process
        self.logger = logging.getLogger(__name__)
        logging.basicConfig(level=logging.INFO)

        ## Command-line arguments (read_args fills in self.args)
        self.args = None
        self.read_args()

        ## Values from the configuration file
        self.get_config()
        self.common = Common(config=self.config)

        ## Let Common set up the logger used for debugging messages
        self.logger = Common().setup_logger(self.logger)

        ## Network traffic files to parse
        self.get_files()
        self.model_hash = None
        self.model = Model(duration=self.duration,
                           hidden_size=None,
                           model_type=self.args.algorithm)

        operation = self.args.operation
        algorithm = self.args.algorithm
        ## 'onelayer' and 'randomforest' both run through BaseAlgorithm
        uses_base_algorithm = algorithm in ('onelayer', 'randomforest')

        def new_base_algorithm():
            ## Built on demand so that model_hash reflects any earlier
            ## load_model() call.
            return BaseAlgorithm(files=self.files,
                                 config=self.config,
                                 model=self.model,
                                 model_hash=self.model_hash,
                                 model_path=self.args.trained_model,
                                 sos_model=self.args.sos_model)

        ## Evaluation returns predictions that are useful for the deployment
        ## of networkml in an operational environment.
        if operation == 'eval':
            self.load_model()

            if uses_base_algorithm:
                new_base_algorithm().eval(algorithm)

            ## SOS model has its own evaluation entry point
            elif algorithm == 'sos':
                from networkml.algorithms.sos.eval_SoSModel import eval_pcap
                eval_pcap(self.args.path, self.args.sos_model,
                          self.conf_labels, self.time_const)

        ## Train entails training a new model on specific packet captures
        elif operation == 'train':

            if uses_base_algorithm:
                if algorithm == 'onelayer':
                    ## onelayer refers to a one-layer neural network
                    estimator = MLPClassifier(self.state_size,
                                              alpha=0.1,
                                              activation='relu',
                                              max_iter=1000)
                else:
                    ## Random forests refers to a decision tree-based model
                    estimator = RandomForestClassifier(
                        n_estimators=100,
                        min_samples_split=5,
                        class_weight='balanced')
                new_base_algorithm().train(self.args.path, self.args.save,
                                           estimator, algorithm)

            ## SOS model has its own training entry point
            elif algorithm == 'sos':
                from networkml.algorithms.sos.train_SoSModel import train
                train(self.args.path, self.args.sos_model, self.time_const,
                      self.rnn_size, self.conf_labels, self.args.save)

        ## Test is for checking overall performance of networkML models for
        ## the device classification task. It is a benchmarking operation.
        elif operation == 'test':
            self.load_model()

            if uses_base_algorithm:
                new_base_algorithm().test(self.args.path, self.args.save)

            elif algorithm == 'sos':
                self.logger.info(
                    'There is no testing operation for the SoSModel.')
Beispiel #4
0
class NetworkML():
    """
    Main class that instantiates prediction models of the types of devices
    found in computer network traffic and whether a device is acting normally
    given its type (also based on network traffic). The three model types
    built in to this class are random forests, neural networks, and
    stochastic outlier selection (SOS).
    """
    def __init__(self):
        """
        Read the command line and configuration, collect the input files,
        build the ``Model`` wrapper, and then immediately run the requested
        operation ('eval', 'train' or 'test') for the requested algorithm.
        """
        ## Module logger; request INFO-level logging for this process
        self.logger = logging.getLogger(__name__)
        logging.basicConfig(level=logging.INFO)

        ## Command-line arguments (read_args fills in self.args)
        self.args = None
        self.read_args()

        ## Values from the configuration file
        self.get_config()
        self.common = Common(config=self.config)

        ## Let Common set up the logger used for debugging messages
        self.logger = Common().setup_logger(self.logger)

        ## Network traffic files to parse
        self.get_files()
        self.model_hash = None
        self.model = Model(duration=self.duration,
                           hidden_size=None,
                           model_type=self.args.algorithm)

        operation = self.args.operation
        algorithm = self.args.algorithm
        ## 'onelayer' and 'randomforest' both run through BaseAlgorithm
        uses_base_algorithm = algorithm in ('onelayer', 'randomforest')

        def new_base_algorithm():
            ## Built on demand so that model_hash reflects any earlier
            ## load_model() call.
            return BaseAlgorithm(files=self.files,
                                 config=self.config,
                                 model=self.model,
                                 model_hash=self.model_hash,
                                 model_path=self.args.trained_model,
                                 sos_model=self.args.sos_model)

        ## Evaluation returns predictions that are useful for the deployment
        ## of networkml in an operational environment.
        if operation == 'eval':
            self.load_model()

            if uses_base_algorithm:
                new_base_algorithm().eval(algorithm)

            ## SOS (stochastic outlier selection) has its own evaluation
            ## entry point
            elif algorithm == 'sos':
                from networkml.algorithms.sos.eval_SoSModel import eval_pcap
                eval_pcap(self.args.path, self.args.sos_model,
                          self.conf_labels, self.time_const)

        ## Train entails training a new model on specific packet captures
        elif operation == 'train':

            if uses_base_algorithm:
                if algorithm == 'onelayer':
                    ## onelayer refers to a one-layer neural network
                    estimator = MLPClassifier(self.state_size,
                                              alpha=0.1,
                                              activation='relu',
                                              max_iter=1000)
                else:
                    ## Random forests refers to a decision tree-based model
                    estimator = RandomForestClassifier(
                        n_estimators=100,
                        min_samples_split=5,
                        class_weight='balanced')
                new_base_algorithm().train(self.args.path, self.args.save,
                                           estimator, algorithm)

            ## SOS (stochastic outlier selection) has its own training
            ## entry point
            elif algorithm == 'sos':
                from networkml.algorithms.sos.train_SoSModel import train
                train(self.args.path, self.args.sos_model, self.time_const,
                      self.rnn_size, self.conf_labels, self.args.save)

        ## Test is for checking overall performance of networkML models for
        ## the device classification task. It is a benchmarking operation.
        elif operation == 'test':
            self.load_model()

            if uses_base_algorithm:
                new_base_algorithm().test(self.args.path, self.args.save)

            elif algorithm == 'sos':
                self.logger.info(
                    'There is no testing operation for the SoSModel.')

    def read_args(self):
        """
        Parse the command line and store the resulting namespace on
        ``self.args``; the options decide which operation and algorithm run.
        """
        default_model = 'networkml/trained_models/onelayer/OneLayerModel.pkl'

        ## (flags, keyword settings) for every option, in registration order
        option_specs = (
            (('--algorithm', '-a'),
             {'default': 'onelayer',
              'choices': ['onelayer', 'randomforest', 'sos'],
              'help': 'which algorithm to run'}),
            (('--format', '-f'),
             {'default': 'pcap',
              'choices': ['netflow', 'pcap'],
              'help': 'which format are the files to process in'}),
            (('--operation', '-o'),
             {'default': 'eval',
              'choices': ['eval', 'train', 'test'],
              'help': 'which operation to run'}),
            (('--sos_model', '-s'),
             {'default': 'networkml/trained_models/sos/SoSmodel',
              'help': 'path to SoSmodel'}),
            (('--trained_model', '-m'),
             {'default': default_model,
              'help': 'path to the trained model file'}),
            (('--path', '-p'),
             {'default': '/pcaps',
              'help': 'path to file or directory of files to process'}),
            (('--save', '-w'),
             {'default': default_model,
              'help': 'path to save the trained model, if training'}),
        )

        parser = argparse.ArgumentParser()
        for flags, settings in option_specs:
            parser.add_argument(*flags, **settings)

        self.args = parser.parse_args()

    def get_files(self):
        """
        Fill ``self.files`` with the capture files found at ``self.args.path``.

        A directory is walked recursively for ``*.pcap``, ``*.dump`` and
        ``*.cap`` files; a single file is accepted only when its name ends in
        one of those extensions. Problems are logged, not raised.
        """
        # TODO checking extensions here should be moved to parsers, and it should
        # probably use 'magic' rather than extensions. See Python magic library

        self.files = []
        target = Path(self.args.path)
        if target.is_dir():
            for root, _dirnames, filenames in os.walk(self.args.path):
                for extension in ['pcap', 'dump', 'cap']:
                    matches = fnmatch.filter(filenames, '*.' + extension)
                    self.files.extend(
                        os.path.join(root, name) for name in matches)
        elif target.is_file() and \
                os.path.basename(str(self.args.path)).rsplit('.', 1)[-1] in {'pcap', 'dump', 'cap'}:
            self.files.append(str(self.args.path))
        else:
            self.logger.error(
                'Input \'%s\' was neither a file nor a directory.',
                str(self.args.path))

        if not self.files:
            self.logger.error('Did not find file(s) from \'%s\'.',
                              str(self.args.path))

    def get_config(self,
                   cfg_file='networkml/configs/config.json',
                   labels_file='networkml/configs/label_assignments.json'):
        """
        Load settings and device labels from JSON files onto the instance.

        Any exception is logged as an error and swallowed, so attributes
        assigned before the failure stay set and later ones are left unset.

        Args:
            cfg_file: path to configuration file
            labels_file: path to labels (or the types of devices predicted)
        """
        ## (attribute name, configuration key) pairs, copied in this order
        settings = (
            ## Time constant is used for creating a moving average
            ('time_const', 'time constant'),
            ## State size sets the number of nodes in the neural network
            ('state_size', 'state size'),
            ## Time in seconds between investigations of a potentially
            ## suspicious device
            ('look_time', 'look time'),
            ## Confidence needed to identify abnormal behavior
            ('threshold', 'threshold'),
            ## Parameter for the SOS model
            ('rnn_size', 'rnn size'),
            ## Time window of network traffic for which to compute
            ## information on features
            ('duration', 'duration'),
        )
        try:
            with open(cfg_file, 'r') as config_file:
                self.config = json.load(config_file)
            for attribute, key in settings:
                setattr(self, attribute, self.config[key])

            ## Import device label typology; 'Unknown' is always appended last
            with open(labels_file, 'r') as label_file:
                labels = json.load(label_file)
            self.conf_labels = []
            self.conf_labels.extend(labels[name] for name in labels)
            self.conf_labels.append('Unknown')
            self.config['conf labels'] = self.conf_labels

        except Exception as e:  # pragma: no cover
            self.logger.error("Unable to read '%s' properly because: %s",
                              cfg_file, str(e))

    def load_model(self):
        """
        Load the trained model from ``self.args.trained_model`` and record the
        SHA-224 hex digest of the model file in ``self.model_hash``.
        """
        model_path = self.args.trained_model
        with open(model_path, 'rb') as handle:
            digest = hashlib.sha224(handle.read())
        self.model_hash = digest.hexdigest()

        self.model.load(model_path)
        self.logger.debug('Loaded model from %s', model_path)
Beispiel #5
0
    def __init__(self):
        """
        Read the command line and configuration, collect the input files,
        build the ``Model`` wrapper, and then immediately run the requested
        operation ('eval', 'train' or 'test') for the requested algorithm.
        """
        self.logger = logging.getLogger(__name__)
        logging.basicConfig(level=logging.INFO)

        self.args = None
        self.read_args()
        self.get_config()
        self.common = Common(config=self.config)
        self.logger = Common().setup_logger(self.logger)
        self.get_files()
        self.model_hash = None
        self.model = Model(duration=self.duration,
                           hidden_size=None,
                           model_type=self.args.algorithm)

        def new_base_algorithm():
            # Built on demand so that model_hash reflects any earlier
            # load_model() call.
            return BaseAlgorithm(files=self.files,
                                 config=self.config,
                                 model=self.model,
                                 model_hash=self.model_hash,
                                 model_path=self.args.trained_model)

        operation = self.args.operation
        algorithm = self.args.algorithm
        # 'onelayer' and 'randomforest' both run through BaseAlgorithm
        uses_base_algorithm = algorithm in ('onelayer', 'randomforest')

        if operation == 'eval':
            self.load_model()
            if uses_base_algorithm:
                new_base_algorithm().eval(algorithm)
            elif algorithm == 'sos':
                from networkml.algorithms.sos.eval_SoSModel import eval_pcap
                eval_pcap(self.args.path, self.conf_labels, self.time_const)
        elif operation == 'train':
            if uses_base_algorithm:
                # The estimator is created before the BaseAlgorithm wrapper
                if algorithm == 'onelayer':
                    estimator = MLPClassifier(self.state_size,
                                              alpha=0.1,
                                              activation='relu',
                                              max_iter=1000)
                else:
                    estimator = RandomForestClassifier(
                        n_estimators=100,
                        min_samples_split=5,
                        class_weight='balanced')
                new_base_algorithm().train(self.args.path, self.args.save,
                                           estimator, algorithm)
            elif algorithm == 'sos':
                from networkml.algorithms.sos.train_SoSModel import train
                train(self.args.path, self.time_const, self.rnn_size,
                      self.conf_labels, self.args.save)
        elif operation == 'test':
            self.load_model()
            if uses_base_algorithm:
                new_base_algorithm().test(self.args.path, self.args.save)
            elif algorithm == 'sos':
                self.logger.info(
                    'There is no testing operation for the SoSModel.')
Beispiel #6
0
class NetworkML():
    """
    Main class to run different algorithms against different network
    traffic data sources
    """
    def __init__(self):
        """
        Read the command line and configuration, collect the input files,
        build the ``Model`` wrapper, and then immediately run the requested
        operation ('eval', 'train' or 'test') for the requested algorithm.
        """
        self.logger = logging.getLogger(__name__)
        logging.basicConfig(level=logging.INFO)

        self.args = None
        self.read_args()
        self.get_config()
        self.common = Common(config=self.config)
        self.logger = Common().setup_logger(self.logger)
        self.get_files()
        self.model_hash = None
        self.model = Model(duration=self.duration,
                           hidden_size=None,
                           model_type=self.args.algorithm)

        def new_base_algorithm():
            # Built on demand so that model_hash reflects any earlier
            # load_model() call.
            return BaseAlgorithm(files=self.files,
                                 config=self.config,
                                 model=self.model,
                                 model_hash=self.model_hash,
                                 model_path=self.args.trained_model)

        operation = self.args.operation
        algorithm = self.args.algorithm
        # 'onelayer' and 'randomforest' both run through BaseAlgorithm
        uses_base_algorithm = algorithm in ('onelayer', 'randomforest')

        if operation == 'eval':
            self.load_model()
            if uses_base_algorithm:
                new_base_algorithm().eval(algorithm)
            elif algorithm == 'sos':
                from networkml.algorithms.sos.eval_SoSModel import eval_pcap
                eval_pcap(self.args.path, self.conf_labels, self.time_const)
        elif operation == 'train':
            if uses_base_algorithm:
                # The estimator is created before the BaseAlgorithm wrapper
                if algorithm == 'onelayer':
                    estimator = MLPClassifier(self.state_size,
                                              alpha=0.1,
                                              activation='relu',
                                              max_iter=1000)
                else:
                    estimator = RandomForestClassifier(
                        n_estimators=100,
                        min_samples_split=5,
                        class_weight='balanced')
                new_base_algorithm().train(self.args.path, self.args.save,
                                           estimator, algorithm)
            elif algorithm == 'sos':
                from networkml.algorithms.sos.train_SoSModel import train
                train(self.args.path, self.time_const, self.rnn_size,
                      self.conf_labels, self.args.save)
        elif operation == 'test':
            self.load_model()
            if uses_base_algorithm:
                new_base_algorithm().test(self.args.path, self.args.save)
            elif algorithm == 'sos':
                self.logger.info(
                    'There is no testing operation for the SoSModel.')

    def read_args(self):
        """
        Parse the command line and store the resulting namespace on
        ``self.args``.
        """
        default_model = 'networkml/trained_models/onelayer/OneLayerModel.pkl'

        # (flags, keyword settings) for every option, in registration order
        option_specs = (
            (('--algorithm', '-a'),
             {'default': 'onelayer',
              'choices': ['onelayer', 'randomforest', 'sos'],
              'help': 'which algorithm to run'}),
            (('--format', '-f'),
             {'default': 'pcap',
              'choices': ['netflow', 'pcap'],
              'help': 'which format are the files to process in'}),
            (('--operation', '-o'),
             {'default': 'eval',
              'choices': ['eval', 'train', 'test'],
              'help': 'which operation to run'}),
            (('--trained_model', '-m'),
             {'default': default_model,
              'help': 'path to the trained model file'}),
            (('--path', '-p'),
             {'default': '/pcaps',
              'help': 'path to file or directory of files to process'}),
            (('--save', '-w'),
             {'default': default_model,
              'help': 'path to save the trained model, if training'}),
        )

        parser = argparse.ArgumentParser()
        for flags, settings in option_specs:
            parser.add_argument(*flags, **settings)

        self.args = parser.parse_args()

    def get_files(self):
        """
        Fill ``self.files`` with the capture files found at ``self.args.path``.

        A directory is walked recursively for ``*.pcap``, ``*.dump`` and
        ``*.cap`` files; a single file is accepted only when its name ends in
        one of those extensions. Problems are logged, not raised.
        """
        # TODO checking extensions here should be moved to parsers, and it should probably use 'magic' rather than extensions
        self.files = []
        target = Path(self.args.path)
        if target.is_dir():
            for root, _dirnames, filenames in os.walk(self.args.path):
                for extension in ['pcap', 'dump', 'cap']:
                    matches = fnmatch.filter(filenames, '*.' + extension)
                    self.files.extend(
                        os.path.join(root, name) for name in matches)
        elif target.is_file() and \
                os.path.basename(str(self.args.path)).rsplit('.', 1)[-1] in {'pcap', 'dump', 'cap'}:
            self.files.append(str(self.args.path))
        else:
            self.logger.error(
                'Input \'%s\' was neither a file nor a directory.',
                str(self.args.path))

        if not self.files:
            self.logger.error('Did not find file(s) from \'%s\'.',
                              str(self.args.path))

    def get_config(self,
                   cfg_file='networkml/configs/config.json',
                   labels_file='networkml/configs/label_assignments.json'):
        """
        Load settings and device labels from JSON files onto the instance.

        Any exception is logged as an error and swallowed, so attributes
        assigned before the failure stay set and later ones are left unset.

        Args:
            cfg_file: path to the JSON configuration file
            labels_file: path to the JSON file whose values are the labels
        """
        # (attribute name, configuration key) pairs, copied in this order
        settings = (
            ('time_const', 'time constant'),
            ('state_size', 'state size'),
            ('look_time', 'look time'),
            ('threshold', 'threshold'),
            ('rnn_size', 'rnn size'),
            ('duration', 'duration'),
        )
        try:
            with open(cfg_file, 'r') as config_file:
                self.config = json.load(config_file)
            for attribute, key in settings:
                setattr(self, attribute, self.config[key])
            with open(labels_file, 'r') as label_file:
                labels = json.load(label_file)
            # Label values in file order, with 'Unknown' always appended last
            self.conf_labels = []
            self.conf_labels.extend(labels[name] for name in labels)
            self.conf_labels.append('Unknown')
            self.config['conf labels'] = self.conf_labels
        except Exception as e:  # pragma: no cover
            self.logger.error("Unable to read '%s' properly because: %s",
                              cfg_file, str(e))

    def load_model(self):
        """
        Load the trained model from ``self.args.trained_model`` and record the
        SHA-224 hex digest of the model file in ``self.model_hash``.
        """
        model_path = self.args.trained_model
        with open(model_path, 'rb') as handle:
            digest = hashlib.sha224(handle.read())
        self.model_hash = digest.hexdigest()

        self.model.load(model_path)
        self.logger.debug('Loaded model from %s', model_path)
Beispiel #7
0
def test_get_features():
    """Smoke test: ``Model.get_features`` runs on ``tests/test.pcap``."""
    model = Model(10, labels=['Unknown'])
    pcap_path = 'tests/test.pcap'
    # Equivalent of `touch`: append mode creates the file if it is missing,
    # and utime(None) sets its access/modification times to now.
    with open(pcap_path, 'a'):
        os.utime(pcap_path, None)
    model.get_features(pcap_path)
Beispiel #8
0
def create_dataset(
    data_dir,
    time_const,
    model_path='networkml/trained_models/onelayer/OneLayerModel.pkl',
    label=None,
    model_type='randomforest'
):
    """
    Pair every session found in a set of pcaps with the model's outputs.

    For each pcap the loaded model produces per-timestamp representations;
    each one is averaged with its predecessor via ``average_representation``
    and classified. Every session is then attached to the outputs of the
    last timestamp (in iteration order) that precedes the session's first
    packet, falling back to the first timestamp.

    Args:
        data_dir: a path ending in ``.pcap`` and/or a directory that is
            walked recursively for ``*.pcap`` files
        time_const: forwarded to ``average_representation``
        model_path: file handed to ``Model.load``
        label: when not None, the classification is overwritten so that
            ``label`` scores 1 and every other predicted label scores 0
        model_type: forwarded to ``Model`` as ``model_type``

    Returns:
        dict mapping each pcap path to a list of dicts with the keys
        ``'model outputs'``, ``'session info'`` and ``'key'``. A pcap for
        which the model produced no timestamps maps to an empty list.
    """
    logger = logging.getLogger(__name__)
    try:
        if 'LOG_LEVEL' in os.environ and os.environ['LOG_LEVEL'] != '':
            logger.setLevel(os.environ['LOG_LEVEL'])
    except Exception as e:  # pragma: no cover
        logger.error(
            'Unable to set logging level because: {0} defaulting to INFO.'.format(str(e)))

    # Load the model
    logger.debug('Loading model')
    model = Model(duration=None, hidden_size=None, model_type=model_type)
    model.load(model_path)

    # Get all the pcaps in the training directory
    logger.debug('Getting pcaps')
    pcaps = []
    try:
        # data_dir may itself be a single pcap file
        ext = os.path.splitext(data_dir)[-1]
        if ext == '.pcap':
            pcaps.append(data_dir)
    except Exception as e:  # pragma: no cover
        logger.debug('Skipping {0} because: {1}'.format(data_dir, str(e)))

    for dirpath, _, filenames in os.walk(data_dir):
        for filename in filenames:
            ext = os.path.splitext(filename)[-1]
            if ext == '.pcap':
                pcaps.append(os.path.join(dirpath, filename))

    # Get and store the representations using the supplied model
    # Representations will be computed separately for each pcap
    representations = {}
    for count, pcap in enumerate(pcaps, start=1):
        logger.info('Working on %s (%s bytes) (%s/%s)',
                    pcap, os.path.getsize(pcap), count, len(pcaps))
        reps, _, timestamps, _, _, _ = model.get_representation(
            pcap,
            mean=False
        )
        sessions = model.sessions
        has_timestamps = timestamps is not None and len(timestamps) > 0

        # Compute the mean representations
        prev_rep = None
        prev_time = None
        model_outputs = {}

        if has_timestamps:
            for i, timestamp in enumerate(timestamps):
                rep = reps[i]
                new_rep, rep_time = average_representation(
                    rep,
                    timestamp,
                    prev_rep,
                    prev_time,
                    time_const
                )
                preds = model.classify_representation(new_rep)
                if label is not None:
                    # Force the supplied label to be the only positive class
                    preds = [(p[0], 0) for p in preds if p[0] != label]
                    preds.append((label, 1))

                model_outputs[timestamp] = {
                    'classification': list(preds),
                    'representation': list(rep),
                    'mean representation': list(new_rep)
                }
                prev_rep, prev_time = new_rep, rep_time

        # Clean the sessions and merge them into a single session dict
        session_rep_pairs = []
        source = get_source(sessions, address_type='IP')

        if not has_timestamps:
            # Without timestamps there are no model outputs to attach a
            # session to; previously this path raised TypeError (None) or
            # IndexError (empty) as soon as a session was present.
            logger.warning(
                'No timestamps were produced for %s; no session pairs created.',
                pcap)
            representations[pcap] = session_rep_pairs
            continue

        for session_dict in sessions:
            for key, value in session_dict.items():
                session_info = featurize_session(key, value, source=source)

                # Find the last timestamp (in iteration order) that is
                # earlier than the session's first packet
                first_time = value[0][0].timestamp()
                prior_time = None
                for timestamp in timestamps:
                    if first_time > timestamp.timestamp():
                        prior_time = timestamp
                if prior_time is None:
                    prior_time = timestamps[0]

                pair = {
                    'model outputs': model_outputs[prior_time],
                    'session info': session_info,
                    'key': key
                }
                if session_info is not None:
                    session_rep_pairs.append(pair)

        representations[pcap] = session_rep_pairs

    # Serialising the whole dataset is only worth it when the size will
    # actually be logged.
    if logger.isEnabledFor(logging.DEBUG):
        byte_size = sys.getsizeof(pickle.dumps(representations))
        logger.debug(
            'created training data of size %f mb',
            round(byte_size/1000000, 3)
        )

    return representations