# NOTE(review): this appears to be a multi-line script prologue collapsed onto
# one physical line. As written, `import csv from ...` is not valid Python, and
# everything after the first '#' is lexically a comment. The statements
# that can be read from the text, in order, are:
#   1. Imports: csv, virus_total_apis.PublicApi (as VT), bat's
#      log_to_dataframe / dataframe_to_matrix, pandas, numpy, sklearn
#      (IsolationForest, EllipticEnvelope, OneClassSVM, PCA, KMeans),
#      tkinter and geoip.geolite2.
#   2. Load 'bro/clean_traffic/http.log' into `clean_df` and
#      'bro/mixed_traffic/http.log' into `mixed_df` via
#      log_to_dataframe.LogToDataFrame.
#   3. Two disabled debugging prints of each frame's head(), followed by a
#      '#trans_depth' remark.
#   4. `features`: a list of eleven http.log column names.
#   5. reset_index() applied to both frames.
#   6. The header `def convert(ip):` -- its body is not on this line, so what
#      the function does cannot be determined from here.
import csv from virus_total_apis import PublicApi as VT from bat import log_to_dataframe, dataframe_to_matrix import pandas as pd import numpy as np import sklearn from sklearn.ensemble import IsolationForest from sklearn.covariance import EllipticEnvelope from sklearn.svm import OneClassSVM from sklearn.decomposition import PCA from sklearn.cluster import KMeans import tkinter from geoip import geolite2 clean_df = log_to_dataframe.LogToDataFrame('bro/clean_traffic/http.log') mixed_df = log_to_dataframe.LogToDataFrame('bro/mixed_traffic/http.log') #print(clean_df.head()) #print(mixed_df.head()) #trans_depth features = [ 'ts', 'day', 'id.resp_h', 'id.resp_p', 'method', 'host', 'user_agent', 'request_body_len', 'response_body_len', 'status_code', 'info_code' ] clean_df = clean_df.reset_index() mixed_df = mixed_df.reset_index() def convert(ip):
# If no args just call help
# NOTE(review): `parser` and `args` are created before this fragment --
# presumably an argparse parser and its parsed namespace; confirm upstream.
if len(sys.argv) == 1:
    parser.print_help()
    sys.exit(1)

# Sanity check that this is a dns log
if not args.bro_log.endswith('dns.log'):
    print('This example only works with Bro dns.log files..')
    sys.exit(1)

# File may have a tilde in it
if args.bro_log:
    args.bro_log = os.path.expanduser(args.bro_log)

# Create a Pandas dataframe from the Bro log
bro_df = log_to_dataframe.LogToDataFrame(args.bro_log)

# Add query length
bro_df['query_length'] = bro_df['query'].str.len()

# Normalize this field (min-max scaling)
ql = bro_df['query_length']
ql_min = ql.min()
ql_range = ql.max() - ql_min
if ql_range == 0:
    # Every query has the same length: plain min-max scaling would compute
    # 0/0 and fill the whole column with NaN. Map those rows to 0.0 instead;
    # rows whose length is missing stay NaN, exactly as in the general case.
    bro_df['query_length_norm'] = (ql - ql_min).astype(float)
else:
    # A NaN range (no usable lengths at all) also lands here and yields NaN,
    # the same result the unguarded expression produced.
    bro_df['query_length_norm'] = (ql - ql_min) / ql_range

# These are the features we want (note some of these are categorical!)
features = [
    'AA', 'RA', 'RD', 'TC', 'Z', 'rejected', 'proto', 'query',
    'qclass_name', 'qtype_name', 'rcode_name', 'query_length_norm',
]
feature_df = bro_df[features]

# Use the super awesome DataframeToMatrix class (handles categorical data!)
to_matrix = dataframe_to_matrix.DataFrameToMatrix()
#!/usr/bin/python3
# Prepare two Bro conn.log captures (one clean, one malicious): load both
# logs, preview them, declare the columns of interest, and discard every row
# whose `service` is 'dns'.
import bat
from bat import log_to_dataframe, dataframe_to_matrix
import pandas as pd
import numpy as np
import sklearn
from sklearn.ensemble import IsolationForest
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
import tkinter
from geoip import geolite2

# Parse each capture into a DataFrame.
clean_df = log_to_dataframe.LogToDataFrame('bro/clean_traffic/conn.log')
mixed_df = log_to_dataframe.LogToDataFrame('bro/malicious_traffic/conn.log')

# Quick visual check of the first rows of both captures.
print(clean_df.head())
print(mixed_df.head())

# Columns of interest in conn.log.
features = [
    'ts', 'day',
    'id.resp_h', 'id.resp_p',
    'proto', 'service', 'duration',
    'orig_bytes', 'resp_bytes',
    'local_orig', 'local_resp',
    'orig_pkts', 'resp_pkts',
]
# Alternative, currently disabled, feature set:
#features = ['id.orig_h', 'id.resp_h']

# reset_index() moves each frame's index into an ordinary column (presumably
# the 'ts' timestamp listed above -- confirm against LogToDataFrame), after
# which rows whose service equals 'dns' are dropped.
clean_df = clean_df.reset_index().loc[lambda frame: frame['service'] != 'dns']
mixed_df = mixed_df.reset_index().loc[lambda frame: frame['service'] != 'dns']