class ParseProcessLog(list):
    """Parses a single process log file (.bson or .raw) and exposes the
    API calls it contains as an iterable of call dicts.
    """

    def __init__(self, log_path):
        """@param log_path: log file path."""
        self._log_path = log_path
        self.fd = None          # open handle on the log file (None if unparsable)
        self.parser = None      # BsonParser or NetlogParser driving the reads
        self.process_id = None
        self.process_name = None
        self.parent_id = None
        self.first_seen = None  # timestamp of the first process message
        self.calls = self       # the object itself iterates over its calls
        self.lastcall = None    # one-call lookahead buffer used by next()

        if os.path.exists(log_path) and os.stat(log_path).st_size > 0:
            self.parse_first_and_reset()

    def parse_first_and_reset(self):
        """Open the log, select a parser from the file extension, read
        messages until the process information is known, then rewind.
        """
        self.fd = open(self._log_path, "rb")

        if self._log_path.endswith(".bson"):
            self.parser = BsonParser(self)
        elif self._log_path.endswith(".raw"):
            self.parser = NetlogParser(self)
        else:
            # Unknown log format: leave the object empty and falsy.
            self.fd.close()
            self.fd = None
            return

        # get the process information from file to determine
        # process id (file names)
        while not self.process_id:
            self.parser.read_next_message()

        self.fd.seek(0)

    def read(self, length):
        """Read exactly `length` bytes from the log file.
        @param length: number of bytes to read.
        @raise EOFError: if fewer than `length` bytes are available.
        """
        if not length:
            return ''
        buf = self.fd.read(length)
        if not buf or len(buf) != length:
            raise EOFError()
        return buf

    def __iter__(self):
        return self

    def __repr__(self):
        return "<ParseProcessLog log-path: %r>" % self._log_path

    def __nonzero__(self):
        # Truthy only if at least one more call can be read (Python 2).
        return self.wait_for_lastcall()

    def reset(self):
        """Rewind the log so iteration can start over."""
        self.fd.seek(0)
        self.lastcall = None

    def compare_calls(self, a, b):
        """Compare two calls for equality. Same implementation as before netlog.
        @param a: call a
        @param b: call b
        @return: True if a == b else False
        """
        if a["api"] == b["api"] and \
                a["status"] == b["status"] and \
                a["arguments"] == b["arguments"] and \
                a["return"] == b["return"]:
            return True
        return False

    def wait_for_lastcall(self):
        """Read messages until a call lands in self.lastcall.
        @return: True if a call is buffered, False on EOF.
        """
        while not self.lastcall:
            r = None
            try:
                r = self.parser.read_next_message()
            except EOFError:
                return False
            if not r:
                return False
        return True

    def next(self):
        """Return the next call dict, folding immediately-repeated identical
        calls into one entry with an incremented "repeated" counter.
        """
        if not self.fd:
            raise StopIteration()

        if not self.wait_for_lastcall():
            self.reset()
            raise StopIteration()

        nextcall, self.lastcall = self.lastcall, None

        self.wait_for_lastcall()
        while self.lastcall and self.compare_calls(nextcall, self.lastcall):
            nextcall["repeated"] += 1
            self.lastcall = None
            self.wait_for_lastcall()

        return nextcall

    def log_process(self, context, timestring, pid, ppid, modulepath, procname):
        """Parser callback: record process identity and first-seen time."""
        self.process_id, self.parent_id, self.process_name = pid, ppid, procname
        self.first_seen = timestring

    def log_thread(self, context, pid):
        pass

    def log_call(self, context, apiname, category, arguments):
        """Parser callback: buffer one parsed API call in self.lastcall."""
        apiindex, status, returnval, tid, timediff = context

        # timediff is milliseconds relative to first_seen.
        current_time = self.first_seen + datetime.timedelta(0, 0, timediff*1000)
        timestring = logtime(current_time)

        self.lastcall = self._parse([timestring,
                                     tid,
                                     category,
                                     apiname,
                                     status,
                                     returnval] + arguments)

    def log_error(self, emsg):
        """Parser callback: log a parser-level error for this file."""
        log.warning("ParseProcessLog error condition on log %s: %s",
                    str(self._log_path), emsg)

    def _parse(self, row):
        """Parse log row.
        @param row: row data.
        @return: parsed information dict.
        """
        call = {}
        arguments = []

        try:
            timestamp = row[0]     # Timestamp of current API call invocation.
            thread_id = row[1]     # Thread ID.
            category = row[2]      # Win32 function category.
            api_name = row[3]      # Name of the Windows API.
            status_value = row[4]  # Success or Failure?
            return_value = row[5]  # Value returned by the function.
        except IndexError as e:
            log.debug("Unable to parse process log row: %s", e)
            return None

        # Now walk through the remaining columns, which will contain API
        # arguments.
        for index in range(6, len(row)):
            argument = {}

            # Split the argument name with its value based on the separator.
            try:
                arg_name, arg_value = row[index]
            except ValueError as e:
                log.debug("Unable to parse analysis row argument (row=%s): %s",
                          row[index], e)
                continue

            argument["name"] = arg_name
            argument["value"] = convert_to_printable(cleanup_value(arg_value))
            arguments.append(argument)

        call["timestamp"] = timestamp
        call["thread_id"] = str(thread_id)
        call["category"] = category
        call["api"] = api_name
        call["status"] = bool(int(status_value))

        # BUGFIX: also format Python 2 `long` return values as hex, matching
        # the newer ParseProcessLog variants in this module; previously large
        # (long) returns silently took the printable-string path.
        if isinstance(return_value, int) or isinstance(return_value, long):
            call["return"] = "0x%.08x" % return_value
        else:
            call["return"] = convert_to_printable(cleanup_value(return_value))

        call["arguments"] = arguments
        call["repeated"] = 0

        return call
class ParseProcessLog(list):
    """Parses process log file."""

    def __init__(self, log_path):
        """@param log_path: log file path."""
        self._log_path = log_path
        self.fd = None
        self.parser = None
        self.reporting_mode = False  # True once begin_reporting() runs
        self.process_id = None
        self.process_name = None
        self.parent_id = None
        self.module_path = None
        self.threads = []            # thread ids seen across all parsed calls
        self.first_seen = None
        self.calls = self
        self.lastcall = None         # one-call lookahead buffer
        self.environdict = None
        self.api_count = 0
        self.call_id = 0
        self.conversion_cache = {}
        self.cfg = Config()
        self.api_limit = self.cfg.processing.analysis_call_limit  # Limit of API calls per process

        if os.path.exists(log_path) and os.stat(log_path).st_size > 0:
            self.parse_first_and_reset()

        if self.cfg.processing.ram_boost:
            # Pre-parse the whole log into memory; next() then serves calls
            # from this cache instead of re-reading the file each pass.
            self.api_call_cache = []
            self.api_pointer = 0

            try:
                while True:
                    i = self.cacheless_next()
                    self.api_call_cache.append(i)
            except StopIteration:
                pass
            self.api_call_cache.append(None)  # sentinel: end of calls

    def parse_first_and_reset(self):
        """ Open file and either init Netlog or Bson Parser. Read till first process
        """
        self.fd = open(self._log_path, "rb")

        if self._log_path.endswith(".bson"):
            self.parser = BsonParser(self)
        elif self._log_path.endswith(".raw"):
            self.parser = NetlogParser(self)
        else:
            # Unknown log format: leave the object empty and falsy.
            self.fd.close()
            self.fd = None
            return

        # Get the process information from file to determine
        # process id (file names.)
        while not self.process_id:
            self.parser.read_next_message()

        self.fd.seek(0)

    def read(self, length):
        """ Read data from log file
        @param length: Length in byte to read
        @raise EOFError: if fewer than `length` bytes are available.
        """
        if not length:
            return ''
        buf = self.fd.read(length)
        if not buf or len(buf) != length:
            raise EOFError()
        return buf

    def __iter__(self):
        return self

    def __repr__(self):
        return "<ParseProcessLog log-path: %r>" % self._log_path

    def __nonzero__(self):
        # Truthy only if at least one more call can be read (Python 2).
        return self.wait_for_lastcall()

    def reset(self):
        """ Reset fd
        """
        self.fd.seek(0)
        self.api_count = 0
        self.lastcall = None
        self.call_id = 0
        self.api_pointer = 0

    def compare_calls(self, a, b):
        """Compare two calls for equality. Same implementation as before netlog.
        @param a: call a
        @param b: call b
        @return: True if a == b else False
        """
        # BUGFIX: anomaly records (see log_anomaly) carry no status/arguments/
        # return/repeated keys; previously comparing two of them raised
        # KeyError in cacheless_next. Treat them as never equal so they are
        # never merged into a "repeated" run.
        if "status" not in a or "status" not in b:
            return False
        if a["api"] == b["api"] and \
                a["status"] == b["status"] and \
                a["arguments"] == b["arguments"] and \
                a["return"] == b["return"]:
            return True
        return False

    def wait_for_lastcall(self):
        """ If there is no lastcall, iterate through messages till a call is found or EOF.
        To get the next call, set self.lastcall to None before calling this function

        @return: True if there is a call, False on EOF
        """
        while not self.lastcall:
            try:
                if not self.parser.read_next_message():
                    return False
            except EOFError:
                return False
        return True

    def cacheless_next(self):
        """Read the next call directly from the log file (no RAM cache),
        folding immediately-repeated identical calls into one entry and
        assigning a sequential call id.
        """
        if not self.fd:
            raise StopIteration()

        if not self.wait_for_lastcall():
            self.reset()
            raise StopIteration()

        self.api_count += 1
        if self.api_limit and self.api_count > self.api_limit:
            self.reset()
            raise StopIteration()

        nextcall, self.lastcall = self.lastcall, None

        self.wait_for_lastcall()
        while self.lastcall and self.compare_calls(nextcall, self.lastcall):
            nextcall["repeated"] += self.lastcall["repeated"] + 1
            self.lastcall = None
            self.wait_for_lastcall()

        nextcall["id"] = self.call_id
        self.call_id += 1
        return nextcall

    def next(self):
        """ Just accessing the cache
        """
        if self.cfg.processing.ram_boost:
            res = self.api_call_cache[self.api_pointer]
            if res is None:
                self.reset()
                raise StopIteration()
            self.api_pointer += 1
            return res
        else:
            return self.cacheless_next()

    def log_process(self, context, timestring, pid, ppid, modulepath, procname):
        """ log process information parsed from data file

        @param context: ignored
        @param timestring: Process first seen time
        @param pid: PID
        @param ppid: Parent PID
        @param modulepath: ignored
        @param procname: Process name
        """
        self.process_id, self.parent_id, self.process_name = pid, ppid, procname
        self.module_path = modulepath
        self.first_seen = timestring

    def log_thread(self, context, pid):
        pass

    def log_environ(self, context, environdict):
        """ log user/process environment information for later use in behavioral signatures

        @param context: ignored
        @param environdict: dict of the various collected information, which will expand over time
        """
        self.environdict = environdict

    def log_anomaly(self, subcategory, tid, funcname, msg):
        """ log an anomaly parsed from data file

        @param subcategory:
        @param tid: Thread ID
        @param funcname:
        @param msg:
        """
        self.lastcall = dict(thread_id=tid, category="anomaly", api="",
                             subcategory=subcategory, funcname=funcname,
                             msg=msg)

    def log_call(self, context, apiname, category, arguments):
        """ log an api call from data file
        @param context: containing additional api info
        @param apiname: name of the api
        @param category: win32 function category
        @param arguments: arguments to the api call
        """
        apiindex, repeated, status, returnval, tid, timediff, caller, parentcaller = context

        # timediff is milliseconds relative to first_seen.
        current_time = self.first_seen + datetime.timedelta(0, 0, timediff*1000)
        timestring = logtime(current_time)

        self.lastcall = self._parse([timestring,
                                     tid,
                                     caller,
                                     parentcaller,
                                     category,
                                     apiname,
                                     repeated,
                                     status,
                                     returnval] + arguments)

    def log_error(self, emsg):
        """ Log an error
        """
        log.warning("ParseProcessLog error condition on log %s: %s",
                    str(self._log_path), emsg)

    def begin_reporting(self):
        """Switch to reporting mode and strip raw argument values that must
        not be encoded into reports.
        """
        self.reporting_mode = True
        if self.cfg.processing.ram_boost:
            idx = 0
            while True:
                ent = self.api_call_cache[idx]
                if not ent:
                    break
                # remove the values we don't want to encode in reports
                # BUGFIX: anomaly records have no "arguments" key; default to
                # an empty list instead of raising KeyError.
                for arg in ent.get("arguments", []):
                    del arg["raw_value"]
                idx += 1

    def _parse(self, row):
        """Parse log row.
        @param row: row data.
        @return: parsed information dict.
        """
        call = {}
        arguments = []

        try:
            timestamp = row[0]     # Timestamp of current API call invocation.
            thread_id = row[1]     # Thread ID.
            caller = row[2]        # non-system DLL return address
            parentcaller = row[3]  # non-system DLL parent of non-system-DLL return address
            category = row[4]      # Win32 function category.
            api_name = row[5]      # Name of the Windows API.
            repeated = row[6]      # Times log repeated
            status_value = row[7]  # Success or Failure?
            return_value = row[8]  # Value returned by the function.
        except IndexError as e:
            log.debug("Unable to parse process log row: %s", e)
            return None

        # Now walk through the remaining columns, which will contain API
        # arguments.
        for index in range(9, len(row)):
            argument = {}

            # Split the argument name with its value based on the separator.
            try:
                arg_name, arg_value = row[index]
            except ValueError as e:
                log.debug("Unable to parse analysis row argument (row=%s): %s",
                          row[index], e)
                continue

            argument["name"] = arg_name
            argument["value"] = convert_to_printable(str(arg_value), self.conversion_cache)
            if not self.reporting_mode:
                argument["raw_value"] = arg_value
            pretty = pretty_print_arg(category, api_name, arg_name, argument["value"])
            if pretty:
                argument["pretty_value"] = pretty
            arguments.append(argument)

        call["timestamp"] = timestamp
        call["thread_id"] = str(thread_id)
        call["caller"] = "0x%.08x" % caller
        call["parentcaller"] = "0x%.08x" % parentcaller
        call["category"] = category
        call["api"] = api_name
        call["status"] = bool(int(status_value))

        if isinstance(return_value, int) or isinstance(return_value, long):
            call["return"] = "0x%.08x" % return_value
        else:
            call["return"] = convert_to_printable(str(return_value), self.conversion_cache)

        prettyret = pretty_print_retval(category, api_name, call["status"], call["return"])
        if prettyret:
            call["pretty_return"] = prettyret

        call["arguments"] = arguments
        call["repeated"] = repeated

        # add the thread id to our thread set
        if call["thread_id"] not in self.threads:
            self.threads.append(call["thread_id"])

        return call
class ParseProcessLog(list):
    """Parses a single process log file (.bson or .raw) and exposes the
    API calls it contains as an iterable of call dicts.
    """

    def __init__(self, log_path):
        """@param log_path: log file path."""
        self._log_path = log_path
        self.fd = None          # open handle on the log file (None if unparsable)
        self.parser = None      # BsonParser or NetlogParser driving the reads
        self.process_id = None
        self.process_name = None
        self.parent_id = None
        self.first_seen = None  # timestamp of the first process message
        self.calls = self       # the object itself iterates over its calls
        self.lastcall = None    # one-call lookahead buffer used by next()

        if os.path.exists(log_path) and os.stat(log_path).st_size > 0:
            self.parse_first_and_reset()

    def parse_first_and_reset(self):
        """Open the log, select a parser from the file extension, read
        messages until the process information is known, then rewind.
        """
        self.fd = open(self._log_path, "rb")

        if self._log_path.endswith(".bson"):
            self.parser = BsonParser(self)
        elif self._log_path.endswith(".raw"):
            self.parser = NetlogParser(self)
        else:
            # Unknown log format: leave the object empty and falsy.
            self.fd.close()
            self.fd = None
            return

        # Get the process information from file to determine
        # process id (file names.)
        while not self.process_id:
            self.parser.read_next_message()

        self.fd.seek(0)

    def read(self, length):
        """Read exactly `length` bytes from the log file.
        @param length: number of bytes to read.
        @raise EOFError: if fewer than `length` bytes are available.
        """
        if not length:
            return ''
        buf = self.fd.read(length)
        if not buf or len(buf) != length:
            raise EOFError()
        return buf

    def __iter__(self):
        return self

    def __repr__(self):
        return "<ParseProcessLog log-path: %r>" % self._log_path

    def __nonzero__(self):
        # Truthy only if at least one more call can be read (Python 2).
        return self.wait_for_lastcall()

    def reset(self):
        """Rewind the log so iteration can start over."""
        self.fd.seek(0)
        self.lastcall = None

    def compare_calls(self, a, b):
        """Compare two calls for equality. Same implementation as before netlog.
        @param a: call a
        @param b: call b
        @return: True if a == b else False
        """
        if a["api"] == b["api"] and \
                a["status"] == b["status"] and \
                a["arguments"] == b["arguments"] and \
                a["return"] == b["return"]:
            return True
        return False

    def wait_for_lastcall(self):
        """Read messages until a call lands in self.lastcall.
        @return: True if a call is buffered, False on EOF.
        """
        while not self.lastcall:
            try:
                if not self.parser.read_next_message():
                    return False
            except EOFError:
                return False
        return True

    def next(self):
        """Return the next call dict, folding immediately-repeated identical
        calls into one entry with an incremented "repeated" counter.
        """
        if not self.fd:
            raise StopIteration()

        if not self.wait_for_lastcall():
            self.reset()
            raise StopIteration()

        nextcall, self.lastcall = self.lastcall, None

        self.wait_for_lastcall()
        while self.lastcall and self.compare_calls(nextcall, self.lastcall):
            nextcall["repeated"] += 1
            self.lastcall = None
            self.wait_for_lastcall()

        return nextcall

    def log_process(self, context, timestring, pid, ppid, modulepath, procname):
        """Parser callback: record process identity and first-seen time."""
        self.process_id, self.parent_id, self.process_name = pid, ppid, procname
        self.first_seen = timestring

    def log_thread(self, context, pid):
        pass

    def log_call(self, context, apiname, category, arguments):
        """Parser callback: buffer one parsed API call in self.lastcall."""
        apiindex, status, returnval, tid, timediff = context

        # timediff is milliseconds relative to first_seen.
        current_time = self.first_seen + datetime.timedelta(
            0, 0, timediff * 1000)
        timestring = logtime(current_time)

        self.lastcall = self._parse(
            [timestring, tid, category, apiname, status, returnval] +
            arguments)

    def log_error(self, emsg):
        """Parser callback: log a parser-level error for this file."""
        log.warning("ParseProcessLog error condition on log %s: %s",
                    str(self._log_path), emsg)

    def _parse(self, row):
        """Parse log row.
        @param row: row data.
        @return: parsed information dict.
        """
        call = {}
        arguments = []

        try:
            timestamp = row[0]     # Timestamp of current API call invocation.
            thread_id = row[1]     # Thread ID.
            category = row[2]      # Win32 function category.
            api_name = row[3]      # Name of the Windows API.
            status_value = row[4]  # Success or Failure?
            return_value = row[5]  # Value returned by the function.
        except IndexError as e:
            log.debug("Unable to parse process log row: %s", e)
            return None

        # Now walk through the remaining columns, which will contain API
        # arguments.
        for index in range(6, len(row)):
            argument = {}

            # Split the argument name with its value based on the separator.
            try:
                arg_name, arg_value = row[index]
            except ValueError as e:
                log.debug("Unable to parse analysis row argument (row=%s): %s",
                          row[index], e)
                continue

            argument["name"] = arg_name
            argument["value"] = convert_to_printable(cleanup_value(arg_value))
            arguments.append(argument)

        call["timestamp"] = timestamp
        call["thread_id"] = str(thread_id)
        call["category"] = category
        call["api"] = api_name
        call["status"] = bool(int(status_value))

        # BUGFIX: also format Python 2 `long` return values as hex, matching
        # the newer ParseProcessLog variants in this module; previously large
        # (long) returns silently took the printable-string path.
        if isinstance(return_value, int) or isinstance(return_value, long):
            call["return"] = "0x%.08x" % return_value
        else:
            call["return"] = convert_to_printable(cleanup_value(return_value))

        call["arguments"] = arguments
        call["repeated"] = 0

        return call
class ParseProcessLog(list):
    """Parses process log file."""

    def __init__(self, log_path):
        """@param log_path: log file path."""
        self._log_path = log_path
        self.fd = None
        self.parser = None
        self.reporting_mode = False  # True once begin_reporting() runs
        self.process_id = None
        self.process_name = None
        self.parent_id = None
        self.module_path = None
        self.threads = []            # thread ids seen across all parsed calls
        self.first_seen = None
        self.calls = self
        self.lastcall = None         # one-call lookahead buffer
        self.api_count = 0
        self.call_id = 0
        self.conversion_cache = {}
        self.cfg = Config()
        self.api_limit = self.cfg.processing.analysis_call_limit  # Limit of API calls per process

        if os.path.exists(log_path) and os.stat(log_path).st_size > 0:
            self.parse_first_and_reset()

        if self.cfg.processing.ram_boost:
            # Pre-parse the whole log into memory; next() then serves calls
            # from this cache instead of re-reading the file each pass.
            self.api_call_cache = []
            self.api_pointer = 0

            try:
                while True:
                    i = self.cacheless_next()
                    self.api_call_cache.append(i)
            except StopIteration:
                pass
            self.api_call_cache.append(None)  # sentinel: end of calls

    def parse_first_and_reset(self):
        """ Open file and either init Netlog or Bson Parser. Read till first process
        """
        self.fd = open(self._log_path, "rb")

        if self._log_path.endswith(".bson"):
            self.parser = BsonParser(self)
        elif self._log_path.endswith(".raw"):
            self.parser = NetlogParser(self)
        else:
            # Unknown log format: leave the object empty and falsy.
            self.fd.close()
            self.fd = None
            return

        # Get the process information from file to determine
        # process id (file names.)
        while not self.process_id:
            self.parser.read_next_message()

        self.fd.seek(0)

    def read(self, length):
        """ Read data from log file
        @param length: Length in byte to read
        @raise EOFError: if fewer than `length` bytes are available.
        """
        if not length:
            return ''
        buf = self.fd.read(length)
        if not buf or len(buf) != length:
            raise EOFError()
        return buf

    def __iter__(self):
        return self

    def __repr__(self):
        return "<ParseProcessLog log-path: %r>" % self._log_path

    def __nonzero__(self):
        # Truthy only if at least one more call can be read (Python 2).
        return self.wait_for_lastcall()

    def reset(self):
        """ Reset fd
        """
        self.fd.seek(0)
        self.api_count = 0
        self.lastcall = None
        self.call_id = 0
        self.api_pointer = 0

    def compare_calls(self, a, b):
        """Compare two calls for equality. Same implementation as before netlog.
        @param a: call a
        @param b: call b
        @return: True if a == b else False
        """
        # BUGFIX: anomaly records (see log_anomaly) carry no status/arguments/
        # return/repeated keys; previously comparing two of them raised
        # KeyError in cacheless_next. Treat them as never equal so they are
        # never merged into a "repeated" run.
        if "status" not in a or "status" not in b:
            return False
        if a["api"] == b["api"] and \
                a["status"] == b["status"] and \
                a["arguments"] == b["arguments"] and \
                a["return"] == b["return"]:
            return True
        return False

    def wait_for_lastcall(self):
        """ If there is no lastcall, iterate through messages till a call is found or EOF.
        To get the next call, set self.lastcall to None before calling this function

        @return: True if there is a call, False on EOF
        """
        while not self.lastcall:
            try:
                if not self.parser.read_next_message():
                    return False
            except EOFError:
                return False
        return True

    def cacheless_next(self):
        """Read the next call directly from the log file (no RAM cache),
        folding immediately-repeated identical calls into one entry and
        assigning a sequential call id.
        """
        if not self.fd:
            raise StopIteration()

        if not self.wait_for_lastcall():
            self.reset()
            raise StopIteration()

        self.api_count += 1
        if self.api_limit and self.api_count > self.api_limit:
            self.reset()
            raise StopIteration()

        nextcall, self.lastcall = self.lastcall, None

        self.wait_for_lastcall()
        while self.lastcall and self.compare_calls(nextcall, self.lastcall):
            nextcall["repeated"] += self.lastcall["repeated"] + 1
            self.lastcall = None
            self.wait_for_lastcall()

        nextcall["id"] = self.call_id
        self.call_id += 1
        return nextcall

    def next(self):
        """ Just accessing the cache
        """
        if self.cfg.processing.ram_boost:
            res = self.api_call_cache[self.api_pointer]
            if res is None:
                self.reset()
                raise StopIteration()
            self.api_pointer += 1
            return res
        else:
            return self.cacheless_next()

    def log_process(self, context, timestring, pid, ppid, modulepath, procname):
        """ log process information parsed from data file

        @param context: ignored
        @param timestring: Process first seen time
        @param pid: PID
        @param ppid: Parent PID
        @param modulepath: ignored
        @param procname: Process name
        """
        self.process_id, self.parent_id, self.process_name = pid, ppid, procname
        self.module_path = modulepath
        self.first_seen = timestring

    def log_thread(self, context, pid):
        pass

    def log_anomaly(self, subcategory, tid, funcname, msg):
        """ log an anomaly parsed from data file

        @param subcategory:
        @param tid: Thread ID
        @param funcname:
        @param msg:
        """
        self.lastcall = dict(thread_id=tid, category="anomaly", api="",
                             subcategory=subcategory, funcname=funcname,
                             msg=msg)

    def log_call(self, context, apiname, category, arguments):
        """ log an api call from data file
        @param context: containing additional api info
        @param apiname: name of the api
        @param category: win32 function category
        @param arguments: arguments to the api call
        """
        apiindex, repeated, status, returnval, tid, timediff, caller, parentcaller = context

        # timediff is milliseconds relative to first_seen.
        current_time = self.first_seen + datetime.timedelta(
            0, 0, timediff * 1000)
        timestring = logtime(current_time)

        self.lastcall = self._parse([
            timestring, tid, caller, parentcaller, category, apiname, repeated,
            status, returnval
        ] + arguments)

    def log_error(self, emsg):
        """ Log an error
        """
        log.warning("ParseProcessLog error condition on log %s: %s",
                    str(self._log_path), emsg)

    def begin_reporting(self):
        """Switch to reporting mode and strip raw argument values that must
        not be encoded into reports.
        """
        self.reporting_mode = True
        if self.cfg.processing.ram_boost:
            idx = 0
            while True:
                ent = self.api_call_cache[idx]
                if not ent:
                    break
                # remove the values we don't want to encode in reports
                # BUGFIX: anomaly records have no "arguments" key; default to
                # an empty list instead of raising KeyError.
                for arg in ent.get("arguments", []):
                    del arg["raw_value"]
                idx += 1

    def _parse(self, row):
        """Parse log row.
        @param row: row data.
        @return: parsed information dict.
        """
        call = {}
        arguments = []

        try:
            timestamp = row[0]     # Timestamp of current API call invocation.
            thread_id = row[1]     # Thread ID.
            caller = row[2]        # non-system DLL return address
            parentcaller = row[3]  # non-system DLL parent of non-system-DLL return address
            category = row[4]      # Win32 function category.
            api_name = row[5]      # Name of the Windows API.
            repeated = row[6]      # Times log repeated
            status_value = row[7]  # Success or Failure?
            return_value = row[8]  # Value returned by the function.
        except IndexError as e:
            log.debug("Unable to parse process log row: %s", e)
            return None

        # Now walk through the remaining columns, which will contain API
        # arguments.
        for index in range(9, len(row)):
            argument = {}

            # Split the argument name with its value based on the separator.
            try:
                arg_name, arg_value = row[index]
            except ValueError as e:
                log.debug("Unable to parse analysis row argument (row=%s): %s",
                          row[index], e)
                continue

            argument["name"] = arg_name
            argument["value"] = convert_to_printable(str(arg_value), self.conversion_cache)
            if not self.reporting_mode:
                argument["raw_value"] = arg_value
            pretty = pretty_print_arg(category, api_name, arg_name, argument["value"])
            if pretty:
                argument["pretty_value"] = pretty
            arguments.append(argument)

        call["timestamp"] = timestamp
        call["thread_id"] = str(thread_id)
        call["caller"] = "0x%.08x" % caller
        call["parentcaller"] = "0x%.08x" % parentcaller
        call["category"] = category
        call["api"] = api_name
        call["status"] = bool(int(status_value))

        if isinstance(return_value, int) or isinstance(return_value, long):
            call["return"] = "0x%.08x" % return_value
        else:
            call["return"] = convert_to_printable(str(return_value), self.conversion_cache)

        prettyret = pretty_print_retval(category, api_name, call["status"], call["return"])
        if prettyret:
            call["pretty_return"] = prettyret

        call["arguments"] = arguments
        call["repeated"] = repeated

        # add the thread id to our thread set
        if call["thread_id"] not in self.threads:
            self.threads.append(call["thread_id"])

        return call