def find_new_samples(self):
    """Fetch all samples pending clustering and hand them to cluster_samples().

    Selects every row in `samples` that is not yet clustered and has at
    least 100 functions. Returns False when nothing is pending, otherwise
    the return value of cluster_samples() for those rows.
    """
    pending = list(self.db.select("samples",
        what="id, hash, description, callgraph, total_functions, primes",
        where="clustered != 1 and total_functions >= 100",
        order="id asc"))
    if not pending:
        return False
    log("Found a total %d new sample(s) to cluster" % len(pending))
    return self.cluster_samples(pending)
def get_description(self, buf):
    """Return the ClamAV detection name for *buf*, or None.

    None is returned both when clamd support is unavailable and when the
    scan reports nothing for the buffer.
    """
    if self.clamd is None:
        return None
    result = self.clamd.scan_stream(buf)
    if result is None:
        return None
    # Answer format is in the following form:
    # >>> cd.scan_stream(buf)
    # >>> {u'stream': ('FOUND', 'Win.Trojan.Miniduke-3')}
    name = result["stream"][1]
    log("Found malware name %s" % repr(name))
    return name
def analyse(self, path):
    """Analyse one sample inside IDA and store its callgraph signature.

    @path: path of the file being analysed.

    Returns ANALYSIS_ALREADY when the sample's SHA1 is already recorded,
    ANALYSIS_FAILED when function reading fails, and ANALYSIS_SUCCESS
    after inserting the new row into the `samples` table.
    """
    filename = path
    t = time.time()
    # Close the file handle instead of leaking it (original used a bare
    # open().read()).
    with open(filename, "rb") as infile:
        buf = infile.read()
    sha1_hash = sha1(buf).hexdigest()
    if self.file_exists(sha1_hash):
        log("Already existing file %s..." % sha1_hash)
        return ANALYSIS_ALREADY

    data = self.read_functions()
    if data is None:
        return ANALYSIS_FAILED
    # NOTE(review): the original unpacked only 4 values here but then used
    # `callgraph` and `primes` below without ever assigning them, which is
    # a guaranteed NameError. They must come from read_functions() — confirm
    # that its return signature includes them.
    total_functions, avg_nodes, avg_edges, avg_ccs, callgraph, primes = data

    msg = "%d-%d-%d-%d" % (total_functions, avg_nodes, avg_edges, avg_ccs)
    log("File analysed %s, callgraph signature %s" % (msg, callgraph))
    log("Time to analyze %f" % (time.time() - t))

    callgraph = str(callgraph)
    primes = ",".join(map(str, primes))
    desc = None  # We don't have pyclamd in IDA...
    self.db.insert("samples", filename=filename, callgraph=callgraph, \
        hash=sha1_hash, total_functions=total_functions, \
        format=None, primes=primes, description=desc,\
        analysis_date=time.asctime())
    return ANALYSIS_SUCCESS
def find_clusters(self):
    """Poll for unclustered samples and cluster them until none remain.

    NOTE(review): the original body executed `break` immediately after the
    first find_new_samples() call, which made the wait/sleep branch dead
    code and the loop pointless. This rewrite keeps polling while work is
    being found, sleeping self.wait_time seconds between rounds — confirm
    this matches the intended daemon behaviour.
    """
    log("Finding new samples to cluster...")
    while 1:
        ret = self.find_new_samples()
        if not ret:
            # Nothing pending: stop polling.
            break
        log("Waiting for %d second(s)..." % self.wait_time)
        time.sleep(self.wait_time)
    log("Done")
def cluster_samples(self, raw_samples):
    """Build phylogenetic trees for *raw_samples* and persist the clusters.

    Returns whatever create_or_update_clusters() returns for the trees
    produced by the fuzzy graph matcher.
    """
    graphs = {}
    for row in raw_samples:
        signature = row["callgraph"]
        graphs[str(row["id"])] = signature
        # Pre-seed the global factors cache so the matcher does not have
        # to factorize each callgraph signature again.
        FACTORS_CACHE[signature] = self.to_primes_dict(row["primes"].split(","))

    matcher = CFuzzyGraphMatcher(graphs, max_diff=20, diff_relative=True,
                                 debug=True)
    log("Creating phylogenetic trees...")
    tree = matcher.make_tree(to_dot=False)
    log("Creating or updating clusters...")
    result = self.create_or_update_clusters(tree, raw_samples)
    log("Done processing phylogenetic trees!")
    return result
def create_or_update_clusters(self, g, raw_samples):
    """Persist the clusters found in the phylogenetic tree(s) *g*.

    @g: iterable of cluster/tree objects (presumably produced by the fuzzy
        graph matcher); each exposes a dict-like `.d` adjacency mapping and
        a `.toDot()` renderer — confirm against the matcher's API.
    @raw_samples: the sample rows (id, hash, description, total_functions,
        ...) that were just clustered.

    Returns True (always).
    """
    # Build a dict with id -> description
    d_samples = {}
    # id -> [total_functions, hash], used for per-cluster function ranges
    # and hash lists.
    d_functions = {}
    min_funcs = -1
    max_funcs = -1
    for sample in raw_samples:
        d_functions[sample["id"]] = [ int(sample["total_functions"]), sample["hash"] ]
        if sample["description"] is not None:
            d_samples[sample["id"]] = sample["description"]

    for cluster in g:
        # Only process clusters with more than 1 item
        if len(cluster) <= 1:
            continue

        generation_level = 0

        # Get the cluster's samples and the graph (a dictionary)
        new_d = {}
        cluster_samples = set()
        d = cluster.d
        for key in d:
            # Nodes whose name is all digits appear to be samples (the
            # name being the sample id); other nodes are counted as
            # generations — confirm against the tree node naming scheme.
            if key.name.isdigit():
                cluster_samples.add(int(key))
            else:
                generation_level += 1
            new_d[str(key)] = list(map(str, d[key]))
            for x in d[key]:
                if x.name.isdigit():
                    cluster_samples.add(int(x.name))

        # Get the final field's values that will be inserted in the
        # clusters table
        cluster_name = self.build_cluster_name(cluster_samples, d_samples)
        cluster_tags = self.build_cluster_tags(cluster_name)
        cluster_graph = json.dumps(new_d)
        # NOTE(review): cluster_hashes is computed but never stored in the
        # insert below — verify whether a `hashes` column was intended.
        cluster_hashes = self.get_cluster_hashes(cluster_samples, d_functions)
        cluster_hashes = json.dumps(list(cluster_hashes))
        cluster_samples_j = json.dumps(list(cluster_samples))
        l = self.get_cluster_functions_range(cluster_samples, d_functions)
        min_funcs, max_funcs = l
        dot = cluster.toDot()

        if cluster_name:
            log("Creating cluster with name %s..." % repr(cluster_name))
        else:
            log("Creating unnamed cluster...")

        # json.dumps never returns None, so these guards look redundant;
        # kept as-is.
        if cluster_graph is None:
            cluster_graph = ""
        if cluster_tags is None:
            cluster_tags = ""

        # Insert the cluster row and flag its member samples as clustered
        # in one transaction. graph/tags are truncated to fit a 64KB-ish
        # text column.
        with self.db.transaction():
            self.db.insert("clusters", description=cluster_name, \
                graph=cluster_graph[:65535], generation_level=generation_level,\
                samples=cluster_samples_j, last_update=time.asctime(), \
                max_funcs=max_funcs, min_funcs=min_funcs, dot=dot, \
                tags=cluster_tags[:65535])
            c_vars = {"samples": list(cluster_samples)}
            where = "id in $samples"
            self.db.update("samples", vars=c_vars, where=where, clustered=1)
    return True
def analyse(self, path):
    """Analyse one sample with Pyew and store its callgraph signature.

    @path: path of the file to analyse.

    Returns ANALYSIS_ALREADY if the sample's SHA1 is already recorded,
    ANALYSIS_FAILED when the file cannot be loaded or has an unsupported
    format, and ANALYSIS_SUCCESS after inserting the new row.
    """
    filename = path
    t = time.time()
    # Close the file handle instead of leaking it (original used a bare
    # open().read()).
    with open(filename, "rb") as infile:
        buf = infile.read()
    sha1_hash = sha1(buf).hexdigest()
    if self.file_exists(sha1_hash):
        log("Already existing file %s..." % sha1_hash)
        return ANALYSIS_ALREADY

    pyew = CPyew(batch=True)
    pyew.analysis_timeout = 300
    pyew.codeanalysis = True
    pyew.deepcodeanalysis = True

    load_error = False
    try:
        pyew.loadFile(path)
    except KeyboardInterrupt:
        log("Abort")
        return ANALYSIS_FAILED
    except:
        # Best-effort: Pyew can raise almost anything on corrupt samples.
        log("ERROR loading file %s" % path)
        load_error = True

    if not load_error:
        # Only executables and the document formats Pyew understands.
        if pyew.format not in ["PE", "ELF", "bootsector"]:
            if pyew.format not in ["PDF", "OLE2"]:
                log("Not a known executable/document format")
                load_error = True

    if load_error:
        return ANALYSIS_FAILED

    primes = []
    total_functions = len(pyew.function_stats)
    # Defaults cover total_functions == 0: the original left `callgraph`
    # and the averages unassigned in that case and crashed with a
    # NameError at the log line below. (Its `elif load_error:` branch was
    # unreachable — load_error already returned above — so it is removed.)
    avg_nodes = avg_edges = avg_ccs = 0
    callgraph = 0
    if total_functions > 0:
        nodes = []
        edges = []
        ccs = []
        callgraph = 1
        for x in pyew.function_stats:
            nodes.append(pyew.function_stats[x][0])
            edges.append(pyew.function_stats[x][1])
            cc = pyew.function_stats[x][2]
            ccs.append(cc)
            # One prime per function, chosen by cyclomatic complexity; the
            # callgraph signature is the product of those primes.
            prime = self.primes_table[cc]
            callgraph *= prime
            primes.append(prime)
        avg_nodes = abs(sum(nodes) / total_functions)
        avg_edges = abs(sum(edges) / total_functions)
        avg_ccs = abs(sum(ccs) / total_functions)

    msg = "%d-%d-%d-%d" % (total_functions, avg_nodes, avg_edges, avg_ccs)
    log("File analysed %s, callgraph signature %s" % (msg, callgraph))
    log("Time to analyze %f" % (time.time() - t))

    callgraph = str(callgraph)
    primes = ",".join(map(str, primes))
    desc = self.get_description(buf)
    self.db.insert("samples", filename=filename, callgraph=callgraph, \
        hash=sha1_hash, total_functions=total_functions, \
        format=pyew.format, primes=primes, description=desc,\
        analysis_date=time.asctime())
    return ANALYSIS_SUCCESS
import sys
import time
import json
import sqlite3

from hashlib import sha1

# `sys` was used below without being imported; `json` is needed by
# create_or_update_clusters().
sys.path.append("pyew")
from pyew_core import CPyew

from cn_log import log
from cn_db import init_web_db
from cosa_nostra import open_db

try:
    import pyclamd
except ImportError:
    log("No pyclamd support, files will not have a description.")
    pyclamd = None

#-----------------------------------------------------------------------
ANALYSIS_FAILED = 0
ANALYSIS_SUCCESS = 1
ANALYSIS_ALREADY = 2

#-----------------------------------------------------------------------
def primes(n):
    if n == 2:
        return [2]
    elif n < 2:
        return []
    s = list(range(3, n + 1, 2))
    mroot = n**0.5
    half = (n + 1) / 2 - 1
def self_kill():
    """Watchdog callback: abort the analysis running in the main thread.

    Raises KeyboardInterrupt in the main thread via interrupt_main(),
    which the caller's analyse() loop treats as a failed analysis.
    """
    log("*** TIMEOUT *** KILLING MY SELF!")
    # The visible module imports never bring `thread` into scope, and that
    # name is Python-2 only; import the right module for the running
    # interpreter here.
    try:
        import _thread as interrupter  # Python 3
    except ImportError:
        import thread as interrupter   # Python 2
    interrupter.interrupt_main()
def analyse(self, path):
    """Analyse one sample with radare2 and store its callgraph signature.

    @path: path of the file to analyse.

    Returns ANALYSIS_ALREADY if the sample's SHA1 is already recorded,
    ANALYSIS_FAILED when the file cannot be loaded/analysed, and
    ANALYSIS_SUCCESS after inserting the new row.
    """
    filename = path
    t = time.time()
    # Close the file handle instead of leaking it (original used a bare
    # open().read()).
    with open(filename, "rb") as infile:
        buf = infile.read()
    sha1_hash = sha1(buf).hexdigest()
    if self.file_exists(sha1_hash):
        log("Already existing file %s..." % sha1_hash)
        return ANALYSIS_ALREADY

    load_error = False
    try:
        self.r2 = r2pipe.open(path)
    except KeyboardInterrupt:
        log("Abort")
        return ANALYSIS_FAILED
    except:
        log("ERROR loading file %s" % path)
        load_error = True

    if not load_error:
        # Before performing code analysis, install a thread to immolate us
        # if it takes too long... (The original armed this timer even when
        # r2pipe.open() had already failed; now it is only armed when we
        # actually run the analysis.)
        kill_thread = threading.Timer(60, self_kill)
        kill_thread.start()
        try:
            self.r2.cmd("aaa")
        except:
            load_error = True
        finally:
            # Always disarm the watchdog. The original cancelled it only
            # on success, so a failed "aaa" left a live timer that would
            # interrupt the main thread 60 seconds later, likely while a
            # different sample was being processed.
            kill_thread.cancel()

    if load_error:
        return ANALYSIS_FAILED

    fmt = None
    r2_format = self.r2.cmdj("ij")
    if r2_format is not None and "bin" in r2_format:
        fmt = r2_format["bin"]["class"]

    functions = self.r2.cmdj("aflj")
    if functions is None:
        return ANALYSIS_FAILED

    primes = []
    total_functions = len(functions)
    # Defaults cover total_functions == 0: the original left `callgraph`
    # and the averages unassigned in that case and crashed with a
    # NameError at the log line below. (Its `elif load_error:` branch was
    # unreachable — load_error already returned above — so it is removed.)
    avg_nodes = avg_edges = avg_ccs = 0
    callgraph = 0
    if total_functions > 0:
        nodes = []
        edges = []
        ccs = []
        callgraph = 1
        for f in functions:
            ret = self.read_function(f)
            if ret is None:
                # Sometimes, it might fail in radare2 for some functions
                continue
            f_nodes, f_edges, f_cc = ret
            nodes.append(f_nodes)
            edges.append(f_edges)
            ccs.append(f_cc)
            # One prime per function, chosen by cyclomatic complexity; the
            # callgraph signature is the product of those primes.
            prime = self.primes_table[f_cc]
            callgraph *= prime
            primes.append(prime)
        avg_nodes = abs(sum(nodes) / total_functions)
        avg_edges = abs(sum(edges) / total_functions)
        avg_ccs = abs(sum(ccs) / total_functions)

    msg = "%d-%d-%d-%d" % (total_functions, avg_nodes, avg_edges, avg_ccs)
    log("File analysed %s, callgraph signature %s" % (msg, callgraph))
    log("Time to analyze %f" % (time.time() - t))

    callgraph = str(callgraph)
    primes = ",".join(map(str, primes))
    desc = self.get_description(buf)
    self.db.insert("samples", filename=filename, callgraph=callgraph, \
        hash=sha1_hash, total_functions=total_functions, \
        format=fmt, primes=primes, description=desc,\
        analysis_date=time.asctime())
    return ANALYSIS_SUCCESS
def ida_log(msg):
    """Log *msg* with the plugin's "COSA-NOSTRA: " prefix."""
    line = "COSA-NOSTRA: %s" % msg
    log(line)