Example #1
import json
from collections import defaultdict

import numpy as np

# Project-local helpers assumed to be in scope: get_source, get_ip_port,
# extract_macs, is_private, is_external, is_protocol


def extract_features(session_dict, capture_source=None, max_port=None):
    '''
    Extracts netflow-level features from the sessions of a packet capture.

    Args:
        session_dict: Dictionary mapping session keys (pairs of
                      address:port strings) to lists of packets
        capture_source: Source address of the capture (inferred from the
                        sessions if None)
        max_port: Maximum port to compute per-port features for
                  (defaults to reading the config)

    Returns:
        feature_vector: Vector containing the featurized representation
                        of the input sessions
        capture_source: The capture source that was used
        other_ips: List of private IPs the capture source talked to
    '''

    # Get featurization info from config
    try:
        with open('opts/config.json', 'r') as config_file:
            config = json.load(config_file)
            address_type = config['source identifier']
            if max_port is None:
                max_port = config['max port']
    except (OSError, KeyError, ValueError):
        address_type = 'MAC'
        if max_port is None:
            # Fallback so the per-port counters below can still be built
            # when the config is unavailable; 1024 (the well-known port
            # range) is an assumed default, not a value from the config
            max_port = 1024

    # If the capture source isn't specified, default to the most used address
    if capture_source is None:
        capture_source = get_source(session_dict, address_type=address_type)

    # Initialize some counter variables
    num_sport_init = [0] * max_port
    num_dport_init = [0] * max_port
    num_sport_rec = [0] * max_port
    num_dport_rec = [0] * max_port

    num_sessions_init = 0
    num_external_init = 0
    num_tcp_sess_init = 0
    num_udp_sess_init = 0
    num_icmp_sess_init = 0

    num_sessions_rec = 0
    num_external_rec = 0
    num_tcp_sess_rec = 0
    num_udp_sess_rec = 0
    num_icmp_sess_rec = 0

    # Iterate over all sessions and aggregate the info
    other_ips = defaultdict(int)
    for key, session in session_dict.items():
        address_1, port_1 = get_ip_port(key[0])
        address_2, port_2 = get_ip_port(key[1])

        # Get the first packet and grab the macs from it
        first_packet = session[0][1]
        source_mac, destination_mac = extract_macs(first_packet)

        # If the source is the capture source
        if (source_mac == capture_source or address_1 == capture_source):

            if is_private(address_2):
                other_ips[address_2] += 1

            num_sessions_init += 1
            num_external_init += is_external(address_1, address_2)
            num_tcp_sess_init += is_protocol(session, '06')
            num_udp_sess_init += is_protocol(session, '11')
            num_icmp_sess_init += is_protocol(session, '01')

            if int(port_1) < max_port:
                num_sport_init[int(port_1)] += 1

            if int(port_2) < max_port:
                num_dport_init[int(port_2)] += 1

        # If the destination is the capture source
        if (destination_mac == capture_source or address_2 == capture_source):
            if is_private(address_1):
                other_ips[address_1] += 1

            num_sessions_rec += 1
            num_external_rec += is_external(address_2, address_1)
            num_tcp_sess_rec += is_protocol(session, '06')
            num_udp_sess_rec += is_protocol(session, '11')
            num_icmp_sess_rec += is_protocol(session, '01')

            if int(port_1) < max_port:
                num_sport_rec[int(port_1)] += 1
            if int(port_2) < max_port:
                num_dport_rec[int(port_2)] += 1

    num_port_sess = np.concatenate(
        (num_sport_init, num_dport_init, num_sport_rec, num_dport_rec), axis=0)

    # Avoid division by zero when normalizing the counts below
    if num_sessions_init == 0:
        num_sessions_init += 1
    if num_sessions_rec == 0:
        num_sessions_rec += 1

    # np.concatenate already returned an ndarray, so divide directly
    num_port_sess = num_port_sess / (num_sessions_init + num_sessions_rec)

    extra_features = [0] * 8
    extra_features[0] = num_external_init / num_sessions_init
    extra_features[1] = num_tcp_sess_init / num_sessions_init
    extra_features[2] = num_udp_sess_init / num_sessions_init
    extra_features[3] = num_icmp_sess_init / num_sessions_init

    extra_features[4] = num_external_rec / num_sessions_rec
    extra_features[5] = num_tcp_sess_rec / num_sessions_rec
    extra_features[6] = num_udp_sess_rec / num_sessions_rec
    extra_features[7] = num_icmp_sess_rec / num_sessions_rec

    feature_vector = np.concatenate((num_port_sess, extra_features), axis=0)
    return feature_vector, capture_source, list(other_ips.keys())
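
A minimal usage sketch: the returned vector concatenates four per-port
counters of length max_port with eight ratio features, so its length is
4 * max_port + 8. The variable sessions below is a hypothetical name for a
session dict produced by the project's sessionizer, and 1024 is an assumed
config value.

max_port = 1024                    # hypothetical 'max port' config value
vector_length = 4 * max_port + 8   # 4 per-port counters + 8 ratio features

feature_vector, source, other_ips = extract_features(
    sessions, max_port=max_port)
assert len(feature_vector) == vector_length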
Example #2
import json
import logging
import os

import numpy as np

# Project-local helpers assumed to be in scope: sessionizer, get_source,
# extract_features


def read_data(data_dir, duration=None, labels=None):
    '''
    Reads all the data in the specified directory and parses it into
    a feature array and a label array.

    Args:
        data_dir: path to the directory that contains the training data
        duration: Time window to compute feature information
        labels: List containing labels to use

    Returns:
        X: numpy 2D array that contains the (high dimensional) features
        y: numpy 1D array that contains the labels for the features in X
        new_labels: Reordered labels used in training
    '''
    logger = logging.getLogger(__name__)
    try:
        if 'LOG_LEVEL' in os.environ and os.environ['LOG_LEVEL'] != '':
            logger.setLevel(os.environ['LOG_LEVEL'])
    except Exception as e:
        logger.error(
            'Unable to set logging level because: {0}; defaulting to INFO.'.
            format(str(e)))
    X = []
    y = []
    assigned_labels = []

    # Load the mapping from filename prefixes to labels
    with open('opts/label_assignments.json') as handle:
        label_assignments = json.load(handle)

    # Get all the pcap files in the directory
    files = []

    for dirpath, _, filenames in os.walk(data_dir):
        for file in filenames:
            _, ext = os.path.splitext(file)
            if ext == '.pcap':
                files.append(os.path.join(dirpath, file))
    # Go through all the files in the directory
    logger.info('Found {0} pcap files to read.'.format(len(files)))
    for count, filename in enumerate(files, start=1):
        # Extract the label from the filename
        name = os.path.split(filename)[1]
        name = name.split('-')[0]
        if name in label_assignments:
            label = label_assignments[name]
            # Guard against labels=None (the default)
            if labels is not None and label not in labels:
                label = 'Unknown'
        else:
            label = 'Unknown'
        if label not in assigned_labels:
            assigned_labels.append(label)

        logger.info('Reading {0} ({1} bytes) as {2} ({3}/{4})'.format(
            filename, os.path.getsize(filename), label, count, len(files)))
        # Bin the sessions with the specified time window
        binned_sessions = sessionizer(filename, duration=duration)
        # Get the capture source from the binned sessions
        capture_source = get_source(binned_sessions)

        # For each of the session bins, compute the full feature vectors
        for session_dict in binned_sessions:
            features, _, _ = extract_features(session_dict,
                                              capture_source=capture_source)

            # Store the feature vector and the labels
            X.append(features)
            y.append(assigned_labels.index(label))


    # Update the labels to reflect the new assignments; requested labels
    # that never appeared are appended after the assigned ones. This is
    # computed once, after the loop, so it is defined even for empty input.
    new_labels = assigned_labels
    if labels is not None:
        new_labels = assigned_labels + \
            [l for l in labels if l not in assigned_labels]

    return np.stack(X), np.stack(y), new_labels
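
A minimal usage sketch, assuming pcaps named like '<prefix>-...pcap' live
under data/train (hypothetical path) and that 'opts/label_assignments.json'
maps those prefixes to the labels below (hypothetical label set):

labels = ['Unknown', 'Printer', 'Smartphone']
X, y, new_labels = read_data('data/train', duration=None, labels=labels)
print(X.shape, y.shape)   # one feature row and one label per session bin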
Example #3
import logging
import os
import pickle
import sys

# Project-local helpers assumed to be in scope: Model, get_source,
# average_representation, featurize_session


def create_dataset(data_dir,
                   time_const,
                   model_path='/models/OneLayerModel.pkl',
                   label=None,
                   model_type='RandomForest'):
    '''
    Builds a dataset pairing each session with the model outputs nearest
    in time, for every pcap found under data_dir.

    Args:
        data_dir: Directory containing pcaps (or a single pcap file)
        time_const: Time constant used when averaging representations
        model_path: Path to the trained model to load
        label: If given, overrides the model's classifications
        model_type: Type of the model being loaded

    Returns:
        representations: Dict mapping each pcap path to its list of
                         session/representation pairs
    '''
    logger = logging.getLogger(__name__)
    try:
        if 'LOG_LEVEL' in os.environ and os.environ['LOG_LEVEL'] != '':
            logger.setLevel(os.environ['LOG_LEVEL'])
    except Exception as e:
        logger.error(
            'Unable to set logging level because: {0}; defaulting to INFO.'.
            format(str(e)))

    # Load the model
    logger.debug('Loading model')
    model = Model(duration=None, hidden_size=None, model_type=model_type)
    model.load(model_path)

    # Get all the pcaps in the training directory
    logger.debug('Getting pcaps')
    pcaps = []
    # data_dir may itself be a single pcap file; os.walk below yields
    # nothing in that case, so handle it here
    try:
        ext = os.path.splitext(data_dir)[-1]
        if ext == '.pcap':
            pcaps.append(data_dir)
    except Exception as e:
        logger.debug('Skipping {0} because: {1}'.format(data_dir, str(e)))

    for dirpath, _, filenames in os.walk(data_dir):
        for filename in filenames:
            ext = os.path.splitext(filename)[-1]
            if ext == '.pcap':
                pcaps.append(os.path.join(dirpath, filename))

    # Get and store the representations using the supplied model
    # Representations will be computed separately for each pcap
    representations = {}
    for pcap in pcaps:
        logger.debug('Working on %s', pcap)
        reps, _, timestamps, _, _ = model.get_representation(pcap, mean=False)
        sessions = model.sessions

        # Compute the mean representations
        prev_rep = None
        prev_time = None
        model_outputs = {}

        if timestamps is not None:
            for i, timestamp in enumerate(timestamps):
                rep = reps[i]
                new_rep, time = average_representation(rep, timestamp,
                                                       prev_rep, prev_time,
                                                       time_const)
                preds = model.classify_representation(new_rep)
                if label is not None:
                    preds = [(p[0], 0) for p in preds if p[0] != label]
                    preds.append((label, 1))

                model_outputs[timestamp] = {
                    'classification': list(preds),
                    'representation': list(rep),
                    'mean representation': list(new_rep)
                }
                prev_rep, prev_time = new_rep, time

        # Pair each session with the model output nearest before it in time
        session_rep_pairs = []
        source = get_source(sessions, address_type='IP')
        for session_dict in sessions:
            for key, value in session_dict.items():
                session_info = featurize_session(key, value, source=source)

                # Find the most recent model output before the session start
                first_time = value[0][0].timestamp()
                prior_time = None
                for timestamp in timestamps:
                    if first_time > timestamp.timestamp():
                        prior_time = timestamp
                if prior_time is None:
                    prior_time = timestamps[0]

                if session_info is not None:
                    session_rep_pairs.append({
                        'model outputs': model_outputs[prior_time],
                        'session info': session_info,
                        'key': key
                    })

        representations[pcap] = session_rep_pairs
    byte_size = sys.getsizeof(pickle.dumps(representations))
    logger.debug('created training data of size %.3f MB',
                 byte_size / 1000000)

    return representations
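
A minimal usage sketch, assuming a trained model exists at the default
model_path and that data/train (hypothetical path) contains pcaps; the
one-hour time constant is an assumed value, not from the source:

dataset = create_dataset('data/train', time_const=60 * 60)
for pcap, pairs in dataset.items():
    print(pcap, len(pairs))   # session/representation pairs per pcap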