def read_data(data_dir, duration=None, labels=None):
    '''
    Reads all the data in the specified directory and parses it into
    a feature array and a label array.

    Args:
        data_dir: path to the directory that contains the training data.
                  It must also contain label_assignments.json, which maps
                  the part of a file name before the first '-' to a label
        duration: Time window to compute feature information
        labels: List containing labels to use. An assigned label that is
                not in this list is replaced by 'Unknown'. When None, no
                filtering is applied and every assigned label is kept

    Returns:
        X: numpy 2D array that contains the (high dimensional) features
        y: numpy 1D array that contains the labels for the features in X
           (indices into new_labels)
        new_labels: Reordered labels used in training: labels in the order
                    they were first encountered, followed by any entries
                    of `labels` that were never encountered

    Raises:
        ValueError: if no feature vectors could be computed from data_dir
    '''
    X = []
    y = []
    assigned_labels = []

    assignments_path = os.path.join(data_dir, 'label_assignments.json')
    with open(assignments_path) as handle:
        label_assignments = json.load(handle)

    # Get all the files in the directory. The label assignment file lives
    # in the same tree but is not a capture, so it must not be sessionized.
    files = []
    for dirpath, _, filenames in os.walk(data_dir):
        for basename in filenames:
            path = os.path.join(dirpath, basename)
            if path != assignments_path:
                files.append(path)

    # Go through all the files in the directory
    for filename in files:
        # Extract the label from the filename
        name = os.path.split(filename)[1]
        name = name.split('-')[0]
        label = label_assignments.get(name, 'Unknown')
        # labels=None means "no restriction"; testing membership against
        # None would raise a TypeError
        if labels is not None and label not in labels:
            label = 'Unknown'

        if label not in assigned_labels:
            assigned_labels.append(label)
        label_index = assigned_labels.index(label)

        print("Reading", filename, "as", label)
        # Bin the sessions with the specified time window
        binned_sessions = sessionizer(filename, duration=duration)

        # For each of the session bins, compute the full feature vectors
        for session_dict in binned_sessions:
            features, _, _ = extract_features(session_dict)

            # Store the feature vector and the labels
            X.append(features)
            y.append(label_index)

    if not X:
        # np.stack would fail with an opaque message on an empty list
        raise ValueError(
            'No feature vectors could be computed from {0}'.format(data_dir))

    # Update the labels to reflect the new assignments
    unused_labels = [] if labels is None else \
        [l for l in labels if l not in assigned_labels]
    new_labels = assigned_labels + unused_labels

    return np.stack(X), np.stack(y), new_labels
def get_features(self, filepath, source_ip=None):
    '''
    Reads a pcap specified by the file path and returns an array of the
    computed model inputs

    The binned sessions are also stored on the instance as self.sessions.

    Args:
        filepath: Path to pcap to compute features for
        source_ip: Capture source to pass to extract_features. When None,
                   the source returned by extract_features for the first
                   non-empty bin is used for the remaining bins

    Returns:
        features: Numpy 2D array containing features for each time bin,
                  normalized with self.means / self.stds and restricted
                  to the columns in self.feature_list
        source_ip: The capture source that was given or determined
        timestamps: List with one entry per non-empty bin, read from the
                    last item of that bin (per the original documentation,
                    the datetime of the last observed packet)
        other_ips: Third value returned by extract_features for the last
                   non-empty bin

        All four values are None when there are no sessions or every
        session bin is empty.
    '''
    binned_sessions = sessionizer(filepath, duration=self.duration)
    self.sessions = binned_sessions

    # Compare by value: `is 0` tests object identity and only worked by
    # accident of small-integer caching
    if len(binned_sessions) == 0:
        return None, None, None, None

    # Read the capture into a feature array
    X = []
    timestamps = []
    for session_dict in binned_sessions:
        if len(session_dict) == 0:
            continue

        if source_ip is None:
            feature_list, source_ip, other_ips = extract_features(
                session_dict
            )
        else:
            feature_list, _, other_ips = extract_features(
                session_dict,
                capture_source=source_ip
            )
        X.append(feature_list)

        last_packet = list(session_dict.items())[-1]
        timestamps.append(last_packet[1][0][0])

    if len(X) == 0:
        return None, None, None, None

    full_features = np.stack(X)

    # Mean normalize the features
    full_features -= np.expand_dims(self.means, 0)
    full_features /= np.expand_dims(self.stds, 0)
    features = full_features[:, self.feature_list]

    return features, source_ip, timestamps, other_ips
def read_data(data_dir, duration=None, labels=None):
    '''
    Reads all the data in the specified directory and parses it into
    a feature array and a label array.

    Only files with the '.pcap' extension are read. The mapping from the
    part of a file name before the first '-' to a label is loaded from
    'opts/label_assignments.json', relative to the working directory.

    Args:
        data_dir: path to the directory that contains the training data
        duration: Time window to compute feature information
        labels: List containing labels to use. An assigned label that is
                not in this list is replaced by 'Unknown'. When None, no
                filtering is applied and every assigned label is kept

    Returns:
        X: numpy 2D array that contains the (high dimensional) features
        y: numpy 1D array that contains the labels for the features in X
           (indices into new_labels)
        new_labels: Reordered labels used in training: labels in the order
                    they were first encountered, followed by any entries
                    of `labels` that were never encountered

    Raises:
        ValueError: if no feature vectors could be computed from data_dir
    '''
    logger = logging.getLogger(__name__)
    log_level = os.environ.get('LOG_LEVEL', '')
    if log_level != '':
        try:
            logger.setLevel(log_level)
        except (TypeError, ValueError) as e:
            # An unrecognised level name must not abort the data load
            logger.error(
                'Unable to set logging level because: {0} defaulting to INFO.'.
                format(str(e)))

    X = []
    y = []
    assigned_labels = []

    with open('opts/label_assignments.json') as handle:
        label_assignments = json.load(handle)

    # Get all the pcap files in the directory
    files = []
    for dirpath, _, filenames in os.walk(data_dir):
        for basename in filenames:
            _, ext = os.path.splitext(basename)
            if ext == '.pcap':
                files.append(os.path.join(dirpath, basename))

    # Go through all the files in the directory
    logger.info('Found {0} pcap files to read.'.format(len(files)))
    for count, filename in enumerate(files, start=1):
        # Extract the label from the filename
        name = os.path.split(filename)[1]
        name = name.split('-')[0]
        label = label_assignments.get(name, 'Unknown')
        # labels=None means "no restriction"; testing membership against
        # None would raise a TypeError
        if labels is not None and label not in labels:
            label = 'Unknown'

        if label not in assigned_labels:
            assigned_labels.append(label)
        label_index = assigned_labels.index(label)

        logger.info('Reading {0} ({1} bytes) as {2} ({3}/{4})'.format(
            filename, os.path.getsize(filename), label, count, len(files)))
        # Bin the sessions with the specified time window
        binned_sessions = sessionizer(filename, duration=duration)
        # Get the capture source from the binned sessions
        capture_source = get_source(binned_sessions)

        # For each of the session bins, compute the full feature vectors
        for session_dict in binned_sessions:
            features, _, _ = extract_features(
                session_dict, capture_source=capture_source)

            # Store the feature vector and the labels
            X.append(features)
            y.append(label_index)

    if not X:
        # np.stack would fail with an opaque message on an empty list
        raise ValueError(
            'No feature vectors could be computed from {0}'.format(data_dir))

    # Update the labels to reflect the new assignments
    unused_labels = [] if labels is None else \
        [l for l in labels if l not in assigned_labels]
    new_labels = assigned_labels + unused_labels

    return np.stack(X), np.stack(y), new_labels