import getopt
import json
import sys
import warnings
from datetime import datetime

import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.ensemble import IsolationForest

from bat import dataframe_to_matrix


def main(argv):
    startTime = datetime.now()
    print('Starting...\n')
    ifile = None
    cont = None
    try:
        opts, args = getopt.getopt(argv, "hi:c:", ["ifile=", "cont="])
    except getopt.GetoptError:
        print('Check your input parameters')
        print('bro-dns-ml-hunt.py -i <input Bro DNS file> -c <contamination>')
        sys.exit(2)
    for opt, arg in opts:
        if opt == '-h':
            print('bro-dns-ml-hunt.py -i <input Bro DNS file> -c <contamination>')
            sys.exit()
        elif opt in ("-i", "--ifile"):
            ifile = arg
        elif opt in ("-c", "--cont"):
            cont = float(arg)
    if not ifile:
        print('A Bro log file must be provided as input')
        print('bro-dns-ml-hunt.py -i <input Bro DNS file> -c <contamination>')
        sys.exit(2)
    if cont is None:
        print('Using default contamination value: 0.1')
        cont = 0.1

    rng = np.random.RandomState(42)

    # Load the target data set (one JSON record per line)
    with open(ifile, 'r') as f:
        lines = f.readlines()
    data = []
    for line in lines:
        data.append(json.loads(line.strip()))
    original_data = data
    # select_fields() returns the fields we model on plus each record's
    # index in the original data (defined elsewhere in the script)
    target_data, srows = select_fields(original_data)

    with warnings.catch_warnings():
        warnings.simplefilter("ignore")

        # Create pandas dataframe
        bro_target_df = pd.DataFrame.from_dict(target_data, orient='columns')
        to_matrix = dataframe_to_matrix.DataFrameToMatrix()
        bro_target_matrix = to_matrix.fit_transform(bro_target_df)

        # Train using the Isolation Forest model
        iForest = IsolationForest(max_samples=100, contamination=cont,
                                  random_state=rng, verbose=False)
        iForest.fit(bro_target_matrix)

        # Get predictions (-1 marks an outlier, 1 an inlier)
        outliers = iForest.predict(bro_target_matrix)

        # Save all outliers
        with open('outliers.json', 'w') as f:
            for i in range(len(outliers)):
                if outliers[i] == -1:
                    f.write(json.dumps(original_data[srows[i]]) + '\n')

        # Isolate outliers
        odd_df = bro_target_df[outliers == -1].copy()

        # Explore outliers with the help of KMeans
        odd_matrix = to_matrix.fit_transform(odd_df)
        num_clusters = min(len(odd_df), 4)  # 4 clusters unless we have fewer than 4 observations
        odd_df['cluster'] = KMeans(n_clusters=num_clusters).fit_predict(odd_matrix)

        # Group the dataframe by cluster
        cluster_groups = odd_df[['cluster']].groupby('cluster')

        # Save all outliers per cluster
        with open('kmeans-clusters.json', 'w') as f:
            for key, group in cluster_groups:
                f.write('#Cluster {:d}: {:d} observations'.format(key, len(group)) + '\n')
                np_matrix = group.to_records()
                for item in np_matrix:
                    f.write(json.dumps(original_data[srows[item[0]]]) + '\n')

    print('\nDone!')
    print('Your results have been saved to the files outliers.json and kmeans-clusters.json')
    print('Time elapsed: ' + str(datetime.now() - startTime))
    print('Have a nice day!')
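The script depends on a select_fields() helper that is not shown in this section. Here is a minimal sketch of what it could look like, inferred purely from the call site: it must return the subset of fields to model on, plus each surviving record's index in the original data so outliers can be traced back to the raw log lines. The field list below is a hypothetical example, not the article's actual selection.

# Illustrative sketch only -- an assumption based on how select_fields()
# is called above, not the article's real implementation.
def select_fields(records):
    """Pick the DNS fields to model on; return (rows, source_indices)."""
    # Hypothetical field list -- adjust to the fields you want to hunt on
    fields = ['proto', 'qclass_name', 'qtype_name', 'rcode_name', 'query']
    selected, srows = [], []
    for idx, record in enumerate(records):
        # Keep only records that carry every field we need
        if all(field in record for field in fields):
            selected.append({field: record[field] for field in fields})
            srows.append(idx)
    return selected, srows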
# Imports assumed for this snippet: bat for the Bro log handling,
# scikit-learn for the clustering and projection
from bat import log_to_dataframe, dataframe_to_matrix
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

# Convert the Bro log to a pandas dataframe (args.bro_log comes from argparse)
bro_df = log_to_dataframe.LogToDataFrame(args.bro_log)

# Add query length
bro_df['query_length'] = bro_df['query'].str.len()

# Normalize this field
ql = bro_df['query_length']
bro_df['query_length_norm'] = (ql - ql.min()) / (ql.max() - ql.min())

# These are the features we want (note some of these are categorical!)
features = ['AA', 'RA', 'RD', 'TC', 'Z', 'rejected', 'proto', 'query',
            'qclass_name', 'qtype_name', 'rcode_name', 'query_length_norm']
feature_df = bro_df[features]

# Use the super awesome DataframeToMatrix class (handles categorical data!)
to_matrix = dataframe_to_matrix.DataFrameToMatrix()
bro_matrix = to_matrix.fit_transform(feature_df)

# Now we're ready for scikit-learn!
# Just some simple stuff for this example: KMeans and PCA
kmeans = KMeans(n_clusters=5).fit_predict(bro_matrix)
pca = PCA(n_components=2).fit_transform(bro_matrix)

# Now we can put our ML results back onto our dataframe!
bro_df['x'] = jitter(pca[:, 0])  # PCA X column
bro_df['y'] = jitter(pca[:, 1])  # PCA Y column
bro_df['cluster'] = kmeans

# Now use dataframe group by cluster
show_fields = ['query', 'Z', 'proto', 'qtype_name', 'x', 'y', 'cluster']
cluster_groups = bro_df[show_fields].groupby('cluster')
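Two things in this snippet live outside what is shown: the jitter() helper and any display of cluster_groups. Below is a minimal sketch, assuming jitter() simply adds a little Gaussian noise so PCA points that land on the same coordinates stay visible in a scatter plot, followed by one way to walk the groups for inspection. Both are illustrative sketches, not the article's exact code.

# Assumed helper: add small Gaussian noise so overlapping PCA points
# don't stack on top of each other in a scatter plot
def jitter(arr, scale=0.02):
    stdev = scale * (arr.max() - arr.min())
    return arr + np.random.randn(len(arr)) * stdev

# Walk the groups and print each cluster for a quick look
for key, group in cluster_groups:
    print('Cluster {:d}: {:d} observations'.format(key, len(group)))
    print(group.head())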