def parser():
    parser = argparse.ArgumentParser()
    parser.add_argument('-j',
                        '--json_format',
                        help='Import zeek log in json string format',
                        action='store_true')
    parser.add_argument(
        '-l',
        '--length',
        help='Filter queries longer than this many characters',
        type=int,
        default=0)
    parser.add_argument('zeek_log_path',
                        type=str,
                        help='Path to the zeek log')
    args = parser.parse_args()

    if args.json_format:
        print('**Importing zeek log in json format**')
        df = import_json(args.zeek_log_path)
    else:
        print('**Importing zeek log in ascii format**')
        print(
            '**If this hangs for longer than 17 seconds, there is a high chance you are trying to import a log in json format instead; use -j**'
        )
        log_to_df = LogToDataFrame()
        df = log_to_df.create_dataframe(args.zeek_log_path)
    return df, args.length
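# parser() above (and several snippets below) calls import_json(), which is
# not defined anywhere in this excerpt. A minimal sketch of what such a
# helper could look like, assuming Zeek's JSON output format of one JSON
# object per line with an epoch-seconds 'ts' field (the original
# implementation may differ):
import json
import pandas as pd


def import_json(path):
    records = []
    with open(path) as f:
        for line in f:
            line = line.strip()
            if line:
                records.append(json.loads(line))
    df = pd.DataFrame(records)
    if 'ts' in df.columns:
        # Zeek JSON logs carry epoch-seconds timestamps in 'ts'
        df['ts'] = pd.to_datetime(df['ts'], unit='s')
        df = df.set_index('ts')
    return df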
def detect(file, amountanom):
    """
    Apply a very simple anomaly detector.

    amountanom: the top number of anomalies we want to print
    """
    # Read the Zeek log into a dataframe
    log_to_df = LogToDataFrame()
    zeek_df = log_to_df.create_dataframe(file)
    # print('Read in {:d} Rows...'.format(len(zeek_df)))

    # In case you need a label, because some models can work in a
    # semi-supervised mode, put it here. For now everything is
    # 'normal', but we are not using this for detection.
    zeek_df['label'] = 'normal'

    # Replace the rows without data (with '-') with 0.
    # Even though this may add a bias to the algorithms,
    # it is better than not using the lines.
    # Also fill the missing values with 0.
    # Finally, give a type to each column.
    zeek_df['orig_bytes'].replace('-', '0', inplace=True)
    zeek_df['orig_bytes'] = zeek_df['orig_bytes'].fillna(0).astype('int32')
    zeek_df['resp_bytes'].replace('-', '0', inplace=True)
    zeek_df['resp_bytes'] = zeek_df['resp_bytes'].fillna(0).astype('int32')
    zeek_df['resp_pkts'].replace('-', '0', inplace=True)
    zeek_df['resp_pkts'] = zeek_df['resp_pkts'].fillna(0).astype('int32')
    zeek_df['orig_ip_bytes'].replace('-', '0', inplace=True)
    zeek_df['orig_ip_bytes'] = zeek_df['orig_ip_bytes'].fillna(0).astype(
        'int32')
    zeek_df['resp_ip_bytes'].replace('-', '0', inplace=True)
    zeek_df['resp_ip_bytes'] = zeek_df['resp_ip_bytes'].fillna(0).astype(
        'int32')
    # zeek_df['duration'].replace('-', '0', inplace=True)
    # zeek_df['duration'] = zeek_df['duration'].fillna(0).astype('float64')

    # The columns from the log file that we know are numbers.
    # This is only for conn.log files.
    features = [
        'duration', 'orig_bytes', 'id.resp_p', 'resp_bytes', 'orig_ip_bytes',
        'resp_pkts', 'resp_ip_bytes'
    ]
    to_matrix = DataFrameToMatrix()
    zeek_matrix = to_matrix.fit_transform(zeek_df[features], normalize=True)
    X_train = zeek_matrix

    # Our y is the label. But we are not using it now.
    y = zeek_df.label

    # The X_test is where we are going to search for anomalies.
    # In our case, it is the same data as X_train.
    # X_test = X_train

    # Note: the behaviour argument only exists in older scikit-learn
    # (it was removed in 0.24); drop it on current versions.
    odd_clf = IsolationForest(behaviour='new',
                              contamination=0.25)  # Marking 25% odd
    odd_clf.fit(zeek_matrix)
    odd_df = zeek_df[features][odd_clf.predict(zeek_matrix) == -1]
    print(odd_df.shape)
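# A minimal usage sketch for detect() above. The log path is a placeholder,
# and the imports are the ones the function appears to assume (zat is the
# Zeek Analysis Tools package):
# from zat.log_to_dataframe import LogToDataFrame
# from zat.dataframe_to_matrix import DataFrameToMatrix
# from sklearn.ensemble import IsolationForest
#
# detect('conn.log', amountanom=10)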
def parser():
    parser = argparse.ArgumentParser()
    parser.add_argument('-j',
                        '--json_format',
                        help='Import zeek log in json string format',
                        action='store_true')
    parser.add_argument('zeek_log_path',
                        type=str,
                        help='Path to the zeek log')
    parser.add_argument('-t',
                        action='store_true',
                        default=False,
                        help='Sets the program to tail a live Zeek log')
    args, commands = parser.parse_known_args()

    # Check for unknown args
    if commands:
        print('Unrecognized args: %s' % commands)
        sys.exit(1)

    # Sanity check that this is an ssl log
    if 'ssl' not in args.zeek_log_path:
        print('This example only works with Zeek ssl.log files..')
        sys.exit(1)

    # File may have a tilde in it
    if args.zeek_log_path:
        args.zeek_log_path = os.path.expanduser(args.zeek_log_path)

    # Determine json or ascii format
    if args.json_format:
        print('**Importing zeek log in json format**')
        df = import_json(args.zeek_log_path)
    else:
        print('**Importing zeek log in ascii format**')
        print(
            '**If this hangs for longer than 17 seconds, there is a high chance you are trying to import a log in json format instead; use -j**'
        )
        log_to_df = LogToDataFrame()
        df = log_to_df.create_dataframe(args.zeek_log_path)
    return df
def detect(file, amountanom, realtime, dumptocsv):
    """
    Apply a very simple anomaly detector.

    amountanom: the top number of anomalies we want to print
    realtime: whether to read the conn.log file in real time (not working)
    """
    # Create a Pandas dataframe from the conn.log
    log_to_df = LogToDataFrame()
    bro_df = log_to_df.create_dataframe(file, ts_index=False)

    # In case you need a label, because some models can work in a
    # semi-supervised mode, put it here. For now everything is 'normal',
    # but we are not using this for detection.
    bro_df['label'] = 'normal'

    # Change the datetime delta value to seconds. Scikit does not know
    # how to work with timedeltas.
    bro_df['durationsec'] = bro_df.duration.apply(lambda x: x.total_seconds())

    # Replace the rows without data with 0. Even though this may add a bias
    # to the algorithms, it is better than not using the lines.
    bro_df['orig_bytes'] = bro_df['orig_bytes'].fillna(0)
    bro_df['resp_bytes'] = bro_df['resp_bytes'].fillna(0)
    bro_df['resp_pkts'] = bro_df['resp_pkts'].fillna(0)
    bro_df['orig_ip_bytes'] = bro_df['orig_ip_bytes'].fillna(0)
    bro_df['resp_ip_bytes'] = bro_df['resp_ip_bytes'].fillna(0)
    bro_df['durationsec'] = bro_df['durationsec'].fillna(0)

    # Save dataframe to disk as CSV
    if dumptocsv != "None":
        bro_df.to_csv(dumptocsv)

    # The columns from the log file that we know are numbers.
    # This is only for conn.log files.
    X_train = bro_df[[
        'durationsec', 'orig_bytes', 'id.resp_p', 'resp_bytes',
        'orig_ip_bytes', 'resp_pkts', 'resp_ip_bytes'
    ]]

    # Our y is the label. But we are not using it now.
    y = bro_df.label

    # The X_test is where we are going to search for anomalies.
    # In our case, it is the same data as X_train.
    X_test = X_train

    #################
    # Select a model from below

    # ABOD class for Angle-based Outlier Detection. For an observation, the
    # variance of its weighted cosine scores to all neighbors could be
    # viewed as the outlying score.
    # clf = ABOD()

    # LOF
    # clf = LOF()

    # CBLOF
    # clf = CBLOF()

    # LOCI
    # clf = LOCI()

    # LSCP
    # clf = LSCP()

    # MCD
    # clf = MCD()

    # OCSVM
    # clf = OCSVM()

    # PCA. Good and fast!
    clf = PCA()

    # SOD
    # clf = SOD()

    # SO_GAAL
    # clf = SO_GAAL()

    # SOS
    # clf = SOS()

    # XGBOD
    # clf = XGBOD()

    # KNN. Good results but slow.
    # clf = KNN()
    # clf = KNN(n_neighbors=10)
    #################

    # Fit the model to the train data
    clf.fit(X_train)

    # Get the prediction on the test data
    y_test_pred = clf.predict(X_test)  # outlier labels (0 or 1)
    y_test_scores = clf.decision_function(X_test)  # outlier scores

    # Convert the ndarrays of scores and predictions to pandas series
    scores_series = pd.Series(y_test_scores)
    pred_series = pd.Series(y_test_pred)

    # Now use the series to add new columns to the X_test
    X_test['score'] = scores_series.values
    X_test['pred'] = pred_series.values

    # Add the score to the bro_df too, so we can show it at the end
    bro_df['score'] = X_test['score']

    # Keep the positive predictions only. That is, keep only what we
    # predict is an anomaly.
    X_test_predicted = X_test[X_test.pred == 1]

    # Keep the top amountanom anomalies
    top10 = X_test_predicted.sort_values(by='score',
                                         ascending=False).iloc[:amountanom]

    # Print the results: find the predicted anomalies in the original
    # bro dataframe, where the rest of the data is
    df_to_print = bro_df.iloc[top10.index]
    print('\nFlows of the top anomalies')

    # Only print some columns, not all, so it is easier to read
    df_to_print = df_to_print.drop([
        'conn_state', 'history', 'local_orig', 'local_resp', 'missed_bytes',
        'ts', 'tunnel_parents', 'uid', 'label'
    ], axis=1)
    print(df_to_print)
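# The model-selection block in detect() above assumes the PyOD detectors
# are already imported. A sketch of the imports it implies (standard PyOD
# module paths; import only the models you use, since some pull in heavy
# optional dependencies such as keras or xgboost):
# from pyod.models.pca import PCA
# from pyod.models.abod import ABOD
# from pyod.models.lof import LOF
# from pyod.models.cblof import CBLOF
# from pyod.models.loci import LOCI
# from pyod.models.lscp import LSCP
# from pyod.models.mcd import MCD
# from pyod.models.ocsvm import OCSVM
# from pyod.models.sod import SOD
# from pyod.models.so_gaal import SO_GAAL
# from pyod.models.sos import SOS
# from pyod.models.xgbod import XGBOD
# from pyod.models.knn import KNN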
parser.add_argument('zeek_log',
                    type=str,
                    help='Specify a zeek log to run ZeekLogReader test on')
args, commands = parser.parse_known_args()

# Check for unknown args
if commands:
    print('Unrecognized args: %s' % commands)
    sys.exit(1)

# File may have a tilde in it
if args.zeek_log:
    args.zeek_log = os.path.expanduser(args.zeek_log)

# Create a Pandas dataframe from a Zeek log
log_to_df = LogToDataFrame()
zeek_df = log_to_df.create_dataframe(args.zeek_log)

# Print out the head of the dataframe
print(zeek_df.head())

# Print out the types of the columns
print(zeek_df.dtypes)

# Print out size and memory usage
print('DF Shape: {:s}'.format(str(zeek_df.shape)))
print('DF Memory:')
memory_usage = zeek_df.memory_usage(deep=True)
total = memory_usage.sum()
for item in memory_usage.items():
    print('\t {:s}: \t{:.2f} MB'.format(item[0], item[1] / 1e6))
print('DF Total: {:.2f} MB'.format(total / 1e6))
def parser():
    parser = argparse.ArgumentParser()
    parser.add_argument('-j',
                        '--json_format',
                        help='Import zeek log in json string format',
                        action='store_true')
    parser.add_argument('-d',
                        '--directory',
                        help='Import zeek logs from directory',
                        action='store_true')
    parser.add_argument('zeek_log_path',
                        type=str,
                        help='Path to the zeek log')
    parser.add_argument('-t',
                        action='store_true',
                        default=False,
                        help='Sets the program to tail a live Zeek log')
    args, commands = parser.parse_known_args()

    # Check for unknown args
    if commands:
        print('Unrecognized args: %s' % commands)
        sys.exit(1)

    # Sanity check that this is an ssl log
    if 'ssl' not in args.zeek_log_path:
        print('This example only works with Zeek ssl.log files..')
        sys.exit(1)

    # File may have a tilde in it
    if args.zeek_log_path:
        args.zeek_log_path = os.path.expanduser(args.zeek_log_path)

    # Determine json or ascii format
    if args.json_format:
        if args.directory:
            print('**Importing zeek logs from directory in json format**')
            zeek_logs = [
                os.path.join(args.zeek_log_path, file)
                for file in os.listdir(args.zeek_log_path)
            ]
            logs = []
            for log in zeek_logs:
                logs.append(import_json(log))
            df = pd.concat(logs)
        else:
            print('**Importing zeek log in json format**')
            df = import_json(args.zeek_log_path)
    else:
        if args.directory:
            print('**Importing zeek logs from directory in ascii format**')
            zeek_logs = [
                os.path.join(args.zeek_log_path, file)
                for file in os.listdir(args.zeek_log_path)
            ]
            logs = []
            log_to_df = LogToDataFrame()
            for log in zeek_logs:
                logs.append(log_to_df.create_dataframe(log))
            df = pd.concat(logs)
        else:
            print('**Importing zeek log in ascii format**')
            print(
                '**Hanging? There is a high chance you are trying to import a log in json format; use -j**'
            )
            log_to_df = LogToDataFrame()
            df = log_to_df.create_dataframe(args.zeek_log_path)
    return df
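# Hypothetical invocations of a script built around the parser() above
# (script name and paths are placeholders; note the path must contain
# 'ssl' to pass the sanity check):
#   python ssl_example.py ~/zeek/logs/ssl.log       # single ascii log
#   python ssl_example.py -j -d ~/zeek/ssl_logs/    # directory of json logs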
# Fragment of a JobQueue method: find a job that has finished, retire it,
# then enqueue the new one.
for j in self.job_queue:
    if not j.running():
        break
else:
    j = None
if j is not None:
    self.finished_jobs.append(j)
    self.job_queue.remove(j)
time.sleep(self.sleeping_time)
self.job_queue.append(job)


if __name__ == "__main__":
    client = bigquery.Client.from_service_account_json(
        GCP_SERVICE_ACCOUNT_JSON)
    log_to_df = LogToDataFrame()
    jq = JobQueue()
    for root, directories, files in os.walk(DATA_IN):
        for f in files:
            if f.endswith(".log"):
                table_id = "cyber-258808.cicids_2017." + f.replace(".", "_")
                df = log_to_df.create_dataframe(os.path.join(root, f))
                df.reset_index(inplace=True)
                for c, t in df.dtypes.items():
                    # print((c, str(t)))
                    if str(t) == "timedelta64[ns]":
                        df[c] = df[c].dt.total_seconds()
                    elif str(t).endswith("[ns]"):
                        # Accept losing sub-millisecond precision; pyarrow
                        # would otherwise not be happy. TODO? Better?
                        df[c] = df[c].astype(str(t).replace("[ns]", "[ms]"))
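# The walk above converts each log, but the snippet ends before the upload
# itself. A minimal sketch of the missing step: load_table_from_dataframe
# is the standard google-cloud-bigquery call (it needs pyarrow, which is
# why the [ns] columns are cast to [ms] above); the queue method name is
# hypothetical, wrapping the fragment at the top of this snippet:
#
#                 job = client.load_table_from_dataframe(df, table_id)
#                 jq.add_job(job)  # hypothetical JobQueue method name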
parser.add_argument('bro_log',
                    type=str,
                    help='Specify a bro log to run BroLogReader test on')
args, commands = parser.parse_known_args()

# Check for unknown args
if commands:
    print('Unrecognized args: %s' % commands)
    sys.exit(1)

# File may have a tilde in it
if args.bro_log:
    args.bro_log = os.path.expanduser(args.bro_log)

# Create a Pandas dataframe from a Zeek log
log_to_df = LogToDataFrame()
bro_df = log_to_df.create_dataframe(args.bro_log)

# Print out the head of the dataframe
print(bro_df.head())

# Print out the types of the columns
print(bro_df.dtypes)

# Print out size and memory usage
print('DF Shape: {:s}'.format(str(bro_df.shape)))
print('DF Memory:')
memory_usage = bro_df.memory_usage(deep=True)
total = memory_usage.sum()
for item in memory_usage.items():
    print('\t {:s}: \t{:.2f} MB'.format(item[0], item[1] / 1e6))
print('DF Total: {:.2f} MB'.format(total / 1e6))
def parser():
    parser = argparse.ArgumentParser()
    parser.add_argument('-j',
                        '--json_format',
                        help='Import zeek log in json string format',
                        action='store_true')
    parser.add_argument('-a',
                        '--anomaly',
                        help='Perform clustering on anomalies',
                        action='store_true')
    parser.add_argument('-d',
                        '--directory',
                        help='Import zeek logs from directory',
                        action='store_true')
    parser.add_argument('-c',
                        '--clusters',
                        help='Number of clusters to divide data, default=4',
                        type=int,
                        default=4)
    parser.add_argument('zeek_log_path',
                        type=str,
                        help='Path to the zeek log')
    args = parser.parse_args()

    if args.json_format:
        if args.directory:
            print('**Importing zeek logs from directory in json format**')
            zeek_logs = [
                os.path.join(args.zeek_log_path, file)
                for file in os.listdir(args.zeek_log_path)
            ]
            # Older approach, kept for reference; pd.concat below avoids
            # the deprecated DataFrame.append:
            # df = import_json(zeek_logs[0])
            # for log in zeek_logs[1:]:
            #     print('**Appending log**')
            #     df = df.append(import_json(log))
            logs = []
            for log in zeek_logs:
                logs.append(import_json(log))
            df = pd.concat(logs)
        else:
            print('**Importing zeek log in json format**')
            df = import_json(args.zeek_log_path)
    else:
        if args.directory:
            print('**Importing zeek logs from directory in ascii format**')
            zeek_logs = [
                os.path.join(args.zeek_log_path, file)
                for file in os.listdir(args.zeek_log_path)
            ]
            logs = []
            log_to_df = LogToDataFrame()
            for log in zeek_logs:
                logs.append(log_to_df.create_dataframe(log))
            df = pd.concat(logs)
        else:
            print('**Importing zeek log in ascii format**')
            print(
                '**If this hangs for longer than 17 seconds, there is a high chance you are trying to import a log in json format instead; use -j**'
            )
            log_to_df = LogToDataFrame()
            df = log_to_df.create_dataframe(args.zeek_log_path)
    return df, args.anomaly, args.clusters
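# parser() above returns the requested cluster count, but the clustering
# step itself is not part of this excerpt. A minimal sketch of what the
# downstream step could look like with scikit-learn's KMeans (the numeric
# feature handling is an assumption, not the original code):
from sklearn.cluster import KMeans


def cluster_sketch(df, n_clusters):
    # Keep only numeric columns and fill gaps so KMeans can run
    numeric = df.select_dtypes(include='number').fillna(0)
    kmeans = KMeans(n_clusters=n_clusters, n_init=10)
    out = df.copy()
    out['cluster'] = kmeans.fit_predict(numeric)
    return out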