def readrows(self):
    """The readrows method simply 'combines' the rows of multiple files OR
       gunzips a file and then reads its rows.
    """
    # For each file (may be just one) create a BroLogReader and use it
    for self._filepath in self._files:

        # Check if the file is zipped; if so, decompress it to a temp file
        tmp = None
        if self._filepath.endswith('.gz'):
            tmp = tempfile.NamedTemporaryFile(delete=False)
            with gzip.open(self._filepath, 'rb') as f_in, open(tmp.name, 'wb') as f_out:
                shutil.copyfileobj(f_in, f_out)

            # Set the file path to the new temp file
            self._filepath = tmp.name

        # Create a BroLogReader and yield its rows
        reader = bro_log_reader.BroLogReader(self._filepath)
        for row in reader.readrows():
            yield row

        # Clean up any temp file created for this iteration
        if tmp is not None:
            try:
                os.remove(tmp.name)
                print('Removed temporary file {:s}...'.format(tmp.name))
            except OSError:
                pass
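For reference, the gunzip-to-temp-file step above in isolation; this is a minimal standard-library sketch, with the helper name and example path made up for illustration:

import gzip
import os
import shutil
import tempfile

def gunzip_to_temp(gz_path):
    """Decompress a .gz file into a named temp file and return the temp path."""
    tmp = tempfile.NamedTemporaryFile(delete=False)
    with gzip.open(gz_path, 'rb') as f_in, open(tmp.name, 'wb') as f_out:
        shutil.copyfileobj(f_in, f_out)
    return tmp.name

# path = gunzip_to_temp('conn.log.gz')   # hypothetical input file
# ... read rows from path ...
# os.remove(path)                        # clean up when done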
def parseLOG(filename):
    """ Generate a list of Dumont Requests from a bro log file

        Parameters
        ----------
        filename : string
            path to the bro .log file to parse.

        Returns
        -------
        result : list of DumontLog()
            ordered list of dumont logs.
    """
    DumontRequests = []

    bro_log = bro_log_reader.BroLogReader(filename)
    data = pd.DataFrame(bro_log.readrows())
    data['header_values'] = data['header_values'].apply(__parseHeaderValues__)

    for d in data.iterrows():
        if d[1]['method'] == 'GET' or d[1]['method'] == 'POST':
            DumontRequests.append(DumontLog(d[1]))

    return aggregateTemporalFeatures(DumontRequests)
def bro_http_parser(inFile):
    reader = bro_log_reader.BroLogReader(inFile)
    dictionaryIndex = 1
    masterDictionary = {}
    for row in reader.readrows():
        # Cast the row dictionary into a new dictionary to provide legacy variable names
        row["idOrigHost"] = row.pop('id.orig_h')
        row["idOrigPort"] = row.pop('id.orig_p')
        row["idRespHost"] = row.pop('id.resp_h')
        row["idRespPort"] = row.pop('id.resp_p')
        row["transDepth"] = row.pop('trans_depth')
        row["userAgent"] = row.pop('user_agent')
        row["requestBodyLen"] = row.pop('request_body_len')
        row["responseBodyLen"] = row.pop('response_body_len')
        row["statusCode"] = row.pop('status_msg')
        row["epochTime"] = row.pop('ts')

        # Pick the URL scheme from the responder port (not the host)
        if row["idRespPort"] == 443:
            fUrl = {"fullUrl": "https://" + row["host"] + row["uri"]}
        else:
            fUrl = {"fullUrl": "http://" + row["host"] + row["uri"]}
        row.update(fUrl)

        masterDictionary[dictionaryIndex] = row
        dictionaryIndex += 1

    # TODO return just the relabeled fields + fullUrl
    return masterDictionary
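A quick usage sketch for the parser above; the log path is a placeholder assumption:

# Hypothetical usage of bro_http_parser
master = bro_http_parser('http.log')
for idx, entry in master.items():
    print(idx, entry['fullUrl'], entry['statusCode'])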
def parseFile(self, filename, json=False):
    """ Creates a pandas dataframe from a given bro file

        Parameters
        ----------
        filename : string
            Path to file to be parsed

        Returns
        -------
        result : pd.DataFrame
            Pandas dataframe containing the bro log file
    """
    df = None
    if not json:
        bro_log = bro_log_reader.BroLogReader(filename)
        df = pd.DataFrame(bro_log.readrows())
    else:
        df = pd.read_json(filename, lines=True)
        # df.rename(
        #     index=str,
        #     columns={'client_header_names': 'header_values'},
        #     inplace=True)

    df['header_values'] = df['header_values'].apply(self.__parseHeaderValues__)
    return df
def searcher():
    es = Elasticsearch("127.0.0.1:9200")
    reader = bro_log_reader.BroLogReader("http.log")
    l = []
    for row_dict in reader.readrows():  # The result is a list of json objects
        print "\n"
        try:
            # Get the destination IP field; it is inside a try statement
            # because not every entry will have this field
            ip = row_dict["id.resp_h"]
        except KeyError:
            continue
        if ip not in l:  # Query only if the IP has not been queried yet
            l.append(ip)
            nyasro = obj.lookup_ip(ip)  # Talos lookup
            if "not fetch" not in str(nyasro):  # If the talos lookup did not fail
                if nyasro["web_reputation"] == "Poor" or nyasro["email_reputation"] == "Poor":
                    nyasro["@timestamp"] = datetime.now().isoformat()
                    nyasro["destination_ip"] = row_dict["id.resp_h"]
                    nyasro["source_ip"] = row_dict["id.orig_h"]
                    nyasro["source_port"] = row_dict["id.orig_p"]
                    nyasro["destination_port"] = row_dict["id.resp_p"]
                    print nyasro
                    try:
                        es.create(index="threat-intel", doc_type="threat", body=nyasro)
                        print "success"
                    except:
                        es.index(index="threat-intel", doc_type="threat", body=nyasro)
def scan_files(c):
    s = SlackClient("api key here")
    c.h1("\n\nMalicious Files Downloaded")
    message = ""
    malicious_files_downloaded = []
    filename = ''
    reader = bro_log_reader.BroLogReader("download.log")  # Reading from this log file
    for row in reader.readrows():  # Reading each row from the bro logs
        try:
            if row['method'] == 'GET':
                filename = row['host'] + row['uri']
                print filename
                if filename not in malicious_files_downloaded:
                    malicious_files_downloaded.append(filename)
                    try:
                        r = scan(filename)
                    except:
                        continue
                    if r and r[1] > 0:  # if malicious
                        message = message + "A malicious file was downloaded {}\n".format(filename)
                        break
        except KeyError:
            pass
    print "\n\nWriting to pdf"
    c.p(message)
    s.api_call("chat.postMessage", channel='project', text=message)
def bro_http_to_df(inFile):
    """Parses a Bro http.log file, returns a pandas data frame"""
    if not inFile.endswith('log'):
        print('This method only works with Bro http.log files, the file ' + inFile + ' is not valid.')
        sys.exit(1)
    reader = bro_log_reader.BroLogReader(inFile)
    bro_df = pd.DataFrame(reader.readrows())
    bro_df = broParse.add_full_URL(bro_df)
    bro_df = broParse.normalize_bro(bro_df)
    return bro_df
def parseFile(self, filename):
    """ Creates a pandas dataframe from a given bro file

        Parameters
        ----------
        filename : string
            Path to file to be parsed

        Returns
        -------
        result : pd.DataFrame
            Pandas dataframe containing the bro log file
    """
    bro_log = bro_log_reader.BroLogReader(filename)
    data = pd.DataFrame(bro_log.readrows())
    data['header_values'] = data['header_values'].apply(self.__parseHeaderValues__)
    return data
def __init__(self, filepath, eps=10, max_rows=None):
    """Initialization for the LiveSimulator Class

       Args:
           filepath (str): Path to the Bro log file to replay
           eps (int): Events Per Second that the simulator will emit events (default = 10)
           max_rows (int): The maximum number of rows to generate (default = None (go forever))
    """

    # Compute EPS timer
    # Logic:
    #     - Normal distribution centered around 1.0/eps
    #     - Make sure never less than 0
    #     - Precompute 1000 deltas and then just cycle around
    self.eps_timer = itertools.cycle([max(0, delta) for delta in
                                      np.random.normal(1.0/float(eps), .5/float(eps), size=1000)])

    # Initialize the Bro log reader
    self.log_reader = bro_log_reader.BroLogReader(filepath, tail=False)

    # Store max_rows
    self.max_rows = max_rows
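A sketch of how a precomputed delta cycle like self.eps_timer can pace event emission; make_eps_timer and the loop below are illustrative assumptions, not the class's actual API:

import itertools
import time

import numpy as np

def make_eps_timer(eps=10):
    """Precompute 1000 inter-event delays ~ Normal(1/eps, 0.5/eps), floored at 0."""
    deltas = np.random.normal(1.0 / float(eps), .5 / float(eps), size=1000)
    return itertools.cycle([max(0, d) for d in deltas])

# Illustrative pacing loop (names assumed):
# eps_timer = make_eps_timer(eps=10)
# for row in log_reader.readrows():
#     time.sleep(next(eps_timer))   # averages roughly eps events per second
#     handle(row)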
def test_write_bro(self):
    # Extract Data
    self.exportObj.extractFromDB()

    # Reduce Threats
    self.prune_threats()

    # Write Test File
    self.exportObj.writeBro()

    # Set the file string name
    file_string = self.exportObj.fileString + '.bro'

    # Load the test file into a new expected format
    reader = bro_log_reader.BroLogReader(file_string)

    # Add each imported dictionary into a list
    full_list = []
    for row in reader.readrows():
        tmp_dict = dict()
        tmp_dict.update(row)
        full_list.append(tmp_dict)

    # Test Dictionary
    self.assertNotEqual(full_list, [])
def http_analyzer(c):
    msg = ''
    c.h1("\n\nHTTP_HUNT")
    es = Elasticsearch("localhost:9200")
    reader = bro_log_reader.BroLogReader("http.log")  # This object reads from the mentioned log file
    n = datetime.now()
    url_list = []
    i = 0
    my_dict = {}  # will hold the json value
    for row_dict in reader.readrows():
        print "\n\n"
        try:
            # Sometimes the 'host' key may not be present
            if row_dict['host'] == '-':
                continue
        except KeyError:
            continue  # If the key is not present, go to the top of the loop
        if row_dict['host'] not in url_list:  # Only scan each host once
            url_list.append(row_dict['host'])
            if i != 0 and i % 4 == 0:  # The Virustotal Public API is limited to 4 scans per minute
                print "[*]NEED A TIMEOUT \n"
                time.sleep(60)
            try:
                r = scan(row_dict['host'])
            except:
                continue
            if r and r[1] > 0:  # if malicious
                print r
                print "a malicious site"
                my_dict["src_ip"] = row_dict["id.orig_h"]
                my_dict["src_port"] = row_dict["id.orig_p"]
                my_dict["dest_ip"] = row_dict["id.resp_h"]
                my_dict["dest_port"] = row_dict["id.resp_p"]
                my_dict["referrer"] = row_dict["referrer"]
                my_dict["method"] = row_dict["method"]
                my_dict["link"] = row_dict['host']
                my_dict["type"] = "HTTP"
                my_dict["@timestamp"] = datetime.now().isoformat()
                my_dict["ip_void"] = "http://www.ipvoid.com/scan/{}".format(row_dict["id.resp_h"])
                my_dict["sender_base"] = "http://www.senderbase.org/lookup/?search_string={}".format(row_dict["id.resp_h"])
                my_dict["virustotal"] = "https://www.virustotal.com/en/ip-address/{}/information/".format(row_dict["id.resp_h"])
                my_dict["threat-intel-source"] = "Virustotal"
                msg = msg + "Source_ip: {}\nDestination_ip: {}\nSource_port: {}\nDestination_port: {}\nDomain:{}\n".format(
                    my_dict["src_ip"], my_dict["dest_ip"], my_dict["src_port"],
                    my_dict["dest_port"], my_dict["link"])
                body = json.dumps(my_dict)
                try:
                    es.create(index="malicious_website", doc_type="practise", body=body)
                    print "success"
                except:
                    es.index(index="malicious_website", doc_type="practise", body=body)
                my_dict = {}
            i = i + 1
    c.p(msg)
def detect(file, amountanom, realtime):
    """
    Function to apply a very simple anomaly detector
    amountanom: The top number of anomalies we want to print
    realtime: If we want to read the conn.log file in real time (not working)
    """

    # Create a zeek reader on a given log file. Thanks brothon
    reader = bro_log_reader.BroLogReader(file, tail=realtime)

    # Create a Pandas dataframe from reader
    bro_df = pd.DataFrame(reader.readrows())

    # In case you need a label (some models can work in a semi-supervised mode), put it here.
    # For now everything is 'normal', but we are not using this for detection
    bro_df['label'] = 'normal'

    # Change the datetime delta value to seconds. Scikit does not know how to work with timedeltas
    bro_df['durationsec'] = bro_df.duration.apply(lambda x: x.total_seconds())

    # Replace the rows without data (with '-') with -1. Even though this may add a bias
    # to the algorithms, it is better than not using the lines.
    bro_df['orig_bytes'] = bro_df['orig_bytes'].replace(to_replace='-', value=-1)
    bro_df['resp_bytes'] = bro_df['resp_bytes'].replace(to_replace='-', value=-1)
    bro_df['resp_pkts'] = bro_df['resp_pkts'].replace(to_replace='-', value=-1)
    bro_df['orig_ip_bytes'] = bro_df['orig_ip_bytes'].replace(to_replace='-', value=-1)
    bro_df['resp_ip_bytes'] = bro_df['resp_ip_bytes'].replace(to_replace='-', value=-1)

    # Add the columns from the log file that we know are numbers. This is only for conn.log files.
    X_train = bro_df[['durationsec', 'orig_bytes', 'id.resp_p', 'resp_bytes',
                      'orig_ip_bytes', 'resp_pkts', 'resp_ip_bytes']]

    # Our y is the label. But we are not using it now.
    y = bro_df.label

    # The X_test is where we are going to search for anomalies. In our case, it is the same data as X_train.
    X_test = X_train

    #################
    # Select a model from below

    # ABOD class for Angle-based Outlier Detection. For an observation, the variance of its
    # weighted cosine scores to all neighbors could be viewed as the outlying score.
    #clf = ABOD()
    # LOF
    #clf = LOF()
    # CBLOF
    #clf = CBLOF()
    # LOCI
    #clf = LOCI()
    # LSCP
    #clf = LSCP()
    # MCD
    #clf = MCD()
    # OCSVM
    #clf = OCSVM()
    # PCA. Good and fast!
    clf = PCA()
    # SOD
    #clf = SOD()
    # SO_GAAL
    #clf = SO_GAAL()
    # SOS
    #clf = SOS()
    # XGBOD
    #clf = XGBOD()
    # KNN. Good results but slow
    #clf = KNN()
    #clf = KNN(n_neighbors=10)
    #################

    # Fit the model to the train data
    clf.fit(X_train)

    # Get the prediction on the test data
    y_test_pred = clf.predict(X_test)  # outlier labels (0 or 1)
    y_test_scores = clf.decision_function(X_test)  # outlier scores

    # Convert the ndarrays of scores and predictions to pandas series
    scores_series = pd.Series(y_test_scores)
    pred_series = pd.Series(y_test_pred)

    # Now use the series to add a new column to the X test
    X_test['score'] = scores_series.values
    X_test['pred'] = pred_series.values

    # Add the score to the bro_df also, so we can show it at the end
    bro_df['score'] = X_test['score']

    # Keep the positive predictions only. That is, keep only what we predict is an anomaly.
    X_test_predicted = X_test[X_test.pred == 1]

    # Keep the top X amount of anomalies
    top10 = X_test_predicted.sort_values(by='score', ascending=False).iloc[:amountanom]

    ## Print the results
    # Find the predicted anomalies in the original bro dataframe, where the rest of the data is
    df_to_print = bro_df.iloc[top10.index]
    print('\nFlows of the top anomalies')

    # Only print some columns, not all, so it is easier to read.
    df_to_print = df_to_print.drop(['conn_state', 'history', 'local_orig',
                                    'local_resp', 'missed_bytes', 'ts',
                                    'tunnel_parents', 'uid', 'label'], axis=1)
    print(df_to_print)
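For reference, a minimal self-contained sketch of the PyOD fit/predict/decision_function pattern used above, on made-up numeric data:

import pandas as pd
from pyod.models.pca import PCA

# Toy stand-in for the conn.log feature matrix (values invented for illustration)
X = pd.DataFrame({'durationsec': [0.10, 0.20, 0.15, 0.12, 0.18, 30.0],
                  'orig_bytes': [100, 120, 110, 130, 115, 500000]})

clf = PCA()
clf.fit(X)
print(clf.predict(X))            # 0 = inlier, 1 = outlier
print(clf.decision_function(X))  # higher score = more anomalous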
    ]
elif 'dns' in args.bro_log:
    log_type = 'dns'
    features = ['Z', 'rejected', 'proto', 'query', 'qclass_name',
                'qtype_name', 'rcode_name', 'query_length',
                'answer_length', 'entropy']
else:
    print('This example only works with Bro http.log or dns.log files..')
    sys.exit(1)

# Create a Bro IDS log reader
print('Opening Data File: {:s}'.format(args.bro_log))
reader = bro_log_reader.BroLogReader(args.bro_log)

# Create a Pandas dataframe from reader
bro_df = pd.DataFrame(reader.readrows())
print('Read in {:d} Rows...'.format(len(bro_df)))

# Using Pandas we can easily and efficiently compute additional data metrics
# Here we use the vectorized operations of Pandas/Numpy to compute query length
# We'll also compute entropy of the query
if log_type == 'dns':
    bro_df['query_length'] = bro_df['query'].str.len()
    bro_df['answer_length'] = bro_df['answers'].str.len()
    bro_df['entropy'] = bro_df['query'].map(lambda x: entropy(x))

# Use the BroThon DataframeToMatrix class
to_matrix = dataframe_to_matrix.DataFrameToMatrix()
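The entropy() used above is not defined in this fragment; a standard Shannon character-entropy helper along these lines (an assumption that matches the usage) would fit:

import math
from collections import Counter

def entropy(string):
    """Shannon entropy (bits per character) of a string."""
    counts, length = Counter(string), float(len(string))
    return -sum(count / length * math.log(count / length, 2)
                for count in counts.values())

# entropy('aaaa') -> 0.0, entropy('abcd') -> 2.0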
# Local imports
from brothon import bro_log_reader

if __name__ == '__main__':
    # Example to run the bro log reader on a given file

    # Collect args from the command line
    parser = argparse.ArgumentParser()
    parser.add_argument('-f', '--bro-log', type=str,
                        help='Specify a bro log to run BroLogReader test on')
    parser.add_argument('-t', '--tail', action='store_true',
                        help='Turn on log tailing')
    args, commands = parser.parse_known_args()

    # Check for unknown args
    if commands:
        print('Unrecognized args: %s' % commands)
        sys.exit(1)

    # If no args just call help
    if len(sys.argv) == 1:
        parser.print_help()
        sys.exit(1)

    # File may have a tilde in it
    if args.bro_log:
        args.bro_log = os.path.expanduser(args.bro_log)

        # Run the bro reader on a given log file
        reader = bro_log_reader.BroLogReader(args.bro_log, tail=args.tail)
        for row in reader.readrows():
            pprint(row)
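Assuming the script above is saved as bro_log_example.py (the filename is an assumption), typical invocations would look like:

# python bro_log_example.py -f ~/bro/http.log
# python bro_log_example.py -f /var/log/bro/conn.log -t    # tail the live log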
#!/usr/bin/python
from brothon import bro_log_reader
from file import scan
from elasticsearch import Elasticsearch
import time
import json

reader = bro_log_reader.BroLogReader("dns.log")
dns_replier_list = []
dns_requested_url = []
my_dict = {}


def put_in_string(message, my_dict):
    message = message + "\n\nSource_IP: {}\nSource_port: {}\nDestination_ip: {}\nDestination_port: {}\nQuery: {}\n".format(
        my_dict["src_ip"], my_dict["src_port"], my_dict["dest_ip"],
        my_dict["dest_port"], my_dict["query"])
    return message


def search_dns(c):
    i = 0
    dns_replier_list = []
    dns_requested_url = []
    my_dict = {}
    message = ""
    es = Elasticsearch("localhost:9200")
    for row in reader.readrows():
            print('more')
            print(json_response['permalink'])
        else:
            print('This file cannot be verified')
    except BaseException as e:
        print(e)


if __name__ == '__main__':
    """Run a VirusTotal Query on Extracted File Hashes"""
    index = 1
    try:
        # Run the bro reader on a given log file
        reader = bro_log_reader.BroLogReader('files.log')
        print('Examination result (positives / total number of scan engines)')
        # Use the SHA1 hash algorithm
        for row in reader.readrows():
            print(index)
            index += 1
            if row['sha1'] != '-':
                checkVirus(row['sha1'])
    except BaseException as e:
        print(e)
    parser.print_help()
    sys.exit(1)

# Sanity check that this is a files log
if not args.bro_log.endswith('files.log'):
    print('This example only works with Bro files.log files..')
    sys.exit(1)

# File may have a tilde in it
if args.bro_log:
    args.bro_log = os.path.expanduser(args.bro_log)

    # Create a VirusTotal Query Class
    vtq = vt_query.VTQuery()

    # Run the bro reader on a given log file
    reader = bro_log_reader.BroLogReader(args.bro_log, tail=True)
    for row in reader.readrows():
        file_sha = row.get('sha256', '-')  # Bro uses - for an empty field
        if file_sha == '-':
            file_sha = row.get('sha1', '-')
            if file_sha == '-':
                print('Could not find a sha256 or a sha1 key! Skipping...')
                continue

        # Make the query with either sha
        results = vtq.query_file(file_sha)
        if results.get('positives', 0) > 1:  # At least two hits
            pprint(results)