def find_new_samples(self):
    """Fetch every sample that still awaits clustering and process it.

    Selects rows that are not yet clustered and have at least 100
    functions. Returns False when nothing is pending; otherwise returns
    whatever cluster_samples() yields for the fetched rows.
    """
    pending = list(self.db.select(
        "samples",
        what="id, hash, description, callgraph, total_functions, primes",
        where="clustered != 1 and total_functions >= 100",
        order="id asc"))
    if not pending:
        return False
    log("Found a total %d new sample(s) to cluster" % len(pending))
    return self.cluster_samples(pending)
Beispiel #2
0
    def get_description(self, buf):
        """Scan *buf* with clamd and return the detected malware name.

        Returns None when no clamd client is configured or the scan
        produced no answer.
        """
        if self.clamd is None:
            return None

        answer = self.clamd.scan_stream(buf)
        if answer is None:
            return None

        # The clamd answer has the following form:
        # >>> cd.scan_stream(buf)
        # >>> {u'stream': ('FOUND', 'Win.Trojan.Miniduke-3')}
        name = answer["stream"][1]
        log("Found malware name %s" % repr(name))
        return name
Beispiel #3
0
  def analyse(self, path):
    """Analyse *path* and store one row in the `samples` table.

    Returns ANALYSIS_ALREADY when the file's SHA1 is already recorded,
    ANALYSIS_FAILED when function extraction fails, ANALYSIS_SUCCESS
    otherwise.
    """
    filename = path

    t = time.time()
    # Hash the whole file first so already-seen samples are skipped
    # cheaply; `with` fixes the unclosed file handle of the original.
    with open(filename, "rb") as f:
      buf = f.read()
    sha1_hash = sha1(buf).hexdigest()
    if self.file_exists(sha1_hash):
      log("Already existing file %s..." % sha1_hash)
      return ANALYSIS_ALREADY

    data = self.read_functions()
    if data is None:
      return ANALYSIS_FAILED

    # BUG FIX: the original unpacked only 4 fields from read_functions()
    # but then referenced `callgraph` and `primes`, which were never
    # assigned anywhere in this method -> guaranteed NameError. Take them
    # from the tuple when read_functions() provides them, otherwise fall
    # back to the same failure markers the other analysers use.
    # NOTE(review): confirm the exact tuple shape read_functions() returns.
    total_functions, avg_nodes, avg_edges, avg_ccs = data[:4]
    callgraph = data[4] if len(data) > 4 else -1
    primes = data[5] if len(data) > 5 else []

    msg = "%d-%d-%d-%d" % (total_functions, avg_nodes, avg_edges, avg_ccs)
    log("File analysed %s, callgraph signature %s" % (msg, callgraph))
    log("Time to analyze %f" % (time.time() - t))

    callgraph = str(callgraph)
    primes = ",".join(map(str, primes))
    desc = None # We don't have pyclamd in IDA...
    self.db.insert("samples", filename=filename, callgraph=callgraph,
                   hash=sha1_hash, total_functions=total_functions,
                   format=None, primes=primes, description=desc,
                   analysis_date=time.asctime())
    return ANALYSIS_SUCCESS
 def find_clusters(self):
     """Run one clustering pass over the newly analysed samples.

     NOTE(review): the original loop body executed an unconditional
     `break` immediately after the first find_new_samples() call, which
     made the wait/sleep branch below it unreachable dead code.  The
     observable behaviour was a single pass; that behaviour is preserved
     here and the dead code removed.  If a polling daemon was intended,
     the `time.sleep(self.wait_time)` logic must be restored without the
     stray break.
     """
     log("Finding new samples to cluster...")
     self.find_new_samples()
     log("Done")
    def cluster_samples(self, raw_samples):
        """Build phylogenetic trees for *raw_samples* and persist clusters.

        Populates the global FACTORS_CACHE with each sample's prime
        factors, runs the fuzzy graph matcher and hands the resulting
        tree to create_or_update_clusters(), whose result is returned.
        """
        samples = {}
        for row in raw_samples:
            graph_sig = row["callgraph"]
            samples[str(row["id"])] = graph_sig
            # Cache the factorisation so the matcher does not recompute it.
            FACTORS_CACHE[graph_sig] = self.to_primes_dict(
                row["primes"].split(","))

        matcher = CFuzzyGraphMatcher(samples,
                                     max_diff=20,
                                     diff_relative=True,
                                     debug=True)
        log("Creating phylogenetic trees...")
        tree = matcher.make_tree(to_dot=False)

        log("Creating or updating clusters...")
        result = self.create_or_update_clusters(tree, raw_samples)

        log("Done processing phylogenetic trees!")
        return result
    def create_or_update_clusters(self, g, raw_samples):
        """Persist every multi-sample cluster found in the tree *g*.

        g: iterable of cluster objects; each exposes a dict `d` mapping
        tree nodes to their children and a `toDot()` renderer.
        raw_samples: the database rows the tree was built from.

        Inserts one row per cluster into the `clusters` table and marks
        the member samples as clustered.  Always returns True.
        """
        # Build a dict with id -> description
        d_samples = {}
        # id -> [total_functions, hash]; used for ranges and hash lists.
        d_functions = {}
        min_funcs = -1
        max_funcs = -1
        for sample in raw_samples:
            d_functions[sample["id"]] = [
                int(sample["total_functions"]), sample["hash"]
            ]
            if sample["description"] is not None:
                d_samples[sample["id"]] = sample["description"]

        for cluster in g:
            # Only process clusters with more than 1 item
            if len(cluster) <= 1:
                continue

            generation_level = 0

            # Get the cluster's samples and the graph (a dictionary)
            new_d = {}
            cluster_samples = set()
            d = cluster.d
            for key in d:
                # Digit-named nodes are sample ids; other nodes are
                # intermediate (generation) nodes of the tree.
                if key.name.isdigit():
                    # NOTE(review): int(key) here vs int(x.name) below —
                    # relies on the node type converting consistently via
                    # __int__/__str__; confirm against the node class.
                    cluster_samples.add(int(key))
                else:
                    generation_level += 1

                new_d[str(key)] = list(map(str, d[key]))

                for x in d[key]:
                    if x.name.isdigit():
                        cluster_samples.add(int(x.name))

            # Get the final field's values that will be inserted in the
            # clusters table
            cluster_name = self.build_cluster_name(cluster_samples, d_samples)
            cluster_tags = self.build_cluster_tags(cluster_name)
            cluster_graph = json.dumps(new_d)
            # NOTE(review): cluster_hashes is computed and JSON-encoded but
            # never inserted below — possibly dropped from the schema.
            cluster_hashes = self.get_cluster_hashes(cluster_samples,
                                                     d_functions)
            cluster_hashes = json.dumps(list(cluster_hashes))
            cluster_samples_j = json.dumps(list(cluster_samples))
            l = self.get_cluster_functions_range(cluster_samples, d_functions)
            min_funcs, max_funcs = l
            dot = cluster.toDot()

            if cluster_name:
                log("Creating cluster with name %s..." % repr(cluster_name))
            else:
                log("Creating unnamed cluster...")

            if cluster_graph is None:
                cluster_graph = ""
            if cluster_tags is None:
                cluster_tags = ""

            # Insert the cluster and flag its members inside one
            # transaction; both text fields are truncated to 64 KB.
            with self.db.transaction():
                self.db.insert("clusters", description=cluster_name,           \
                        graph=cluster_graph[:65535], generation_level=generation_level,\
                        samples=cluster_samples_j, last_update=time.asctime(), \
                        max_funcs=max_funcs, min_funcs=min_funcs, dot=dot,     \
                        tags=cluster_tags[:65535])

                c_vars = {"samples": list(cluster_samples)}
                where = "id in $samples"
                self.db.update("samples",
                               vars=c_vars,
                               where=where,
                               clustered=1)

        return True
Beispiel #7
0
    def analyse(self, path):
        """Analyse *path* with Pyew and record the sample in the database.

        Returns ANALYSIS_ALREADY for an already-known SHA1 hash,
        ANALYSIS_FAILED when the file cannot be loaded or is not a
        supported format, ANALYSIS_SUCCESS otherwise.
        """
        filename = path

        t = time.time()
        # Hash first so already-seen files are skipped cheaply; `with`
        # fixes the unclosed file handle of the original.
        with open(filename, "rb") as f:
            buf = f.read()
        sha1_hash = sha1(buf).hexdigest()
        if self.file_exists(sha1_hash):
            log("Already existing file %s..." % sha1_hash)
            return ANALYSIS_ALREADY

        pyew = CPyew(batch=True)
        pyew.analysis_timeout = 300
        pyew.codeanalysis = True
        pyew.deepcodeanalysis = True

        try:
            pyew.loadFile(path)
            load_error = False
        except KeyboardInterrupt:
            log("Abort")
            return ANALYSIS_FAILED
        except Exception:
            # Narrowed from a bare `except:` so SystemExit still propagates;
            # KeyboardInterrupt is handled explicitly above.
            log("ERROR loading file %s" % path)
            load_error = True

        if not load_error:
            if pyew.format not in ["PE", "ELF", "bootsector"]:
                if pyew.format not in ["PDF", "OLE2"]:
                    log("Not a known executable/document format")
                load_error = True

        if load_error:
            return ANALYSIS_FAILED

        primes = []
        total_functions = len(pyew.function_stats)
        if total_functions > 0:
            nodes = []
            edges = []
            ccs = []
            # The callgraph signature is the product of one prime per
            # function, chosen by the function's cyclomatic complexity.
            callgraph = 1
            for x in pyew.function_stats:
                stats = pyew.function_stats[x]
                nodes.append(stats[0])
                edges.append(stats[1])
                cc = stats[2]
                ccs.append(cc)

                prime = self.primes_table[cc]
                callgraph *= prime
                primes.append(prime)

            avg_nodes = abs(sum(nodes) / total_functions)
            avg_edges = abs(sum(edges) / total_functions)
            avg_ccs = abs(sum(ccs) / total_functions)
        else:
            # BUG FIX: the original used `elif load_error:`, which left the
            # averages and `callgraph` undefined when a file loaded fine but
            # had 0 functions, raising NameError below. Use the same -1
            # failure markers as the load-error case.
            total_functions = avg_nodes = avg_edges = avg_ccs = -1
            callgraph = -1

        msg = "%d-%d-%d-%d" % (total_functions, avg_nodes, avg_edges, avg_ccs)
        log("File analysed %s, callgraph signature %s" % (msg, callgraph))
        log("Time to analyze %f" % (time.time() - t))

        callgraph = str(callgraph)
        primes = ",".join(map(str, primes))
        desc = self.get_description(buf)
        self.db.insert("samples", filename=filename, callgraph=callgraph,
                       hash=sha1_hash, total_functions=total_functions,
                       format=pyew.format, primes=primes, description=desc,
                       analysis_date=time.asctime())
        return ANALYSIS_SUCCESS
Beispiel #8
0
# BUG FIX: `sys` is used below (sys.path.append) but was never imported
# in this file; imports regrouped stdlib / third-party / local.
import sys
import time
import sqlite3

from hashlib import sha1

sys.path.append("pyew")
from pyew_core import CPyew

from cn_log import log
from cn_db import init_web_db
from cosa_nostra import open_db

# pyclamd is optional: without it samples simply get no description.
try:
    import pyclamd
except ImportError:
    log("No pyclamd support, files will not have a description.")
    pyclamd = None

#-----------------------------------------------------------------------
# Result codes returned by the analyse() methods.
ANALYSIS_FAILED = 0
ANALYSIS_SUCCESS = 1
ANALYSIS_ALREADY = 2


#-----------------------------------------------------------------------
def primes(n):
    if n == 2: return [2]
    elif n < 2: return []
    s = list(range(3, n + 1, 2))
    mroot = n**0.5
    half = (n + 1) / 2 - 1
Beispiel #9
0
def self_kill():
    """Watchdog callback: abort a stuck analysis by interrupting the
    main thread (raises KeyboardInterrupt there)."""
    log("*** TIMEOUT *** KILLING MY SELF!")
    thread.interrupt_main()
Beispiel #10
0
    def analyse(self, path):
        """Analyse *path* with radare2 (via r2pipe) and store the sample.

        Returns ANALYSIS_ALREADY for an already-known SHA1 hash,
        ANALYSIS_FAILED when loading or code analysis fails,
        ANALYSIS_SUCCESS otherwise.
        """
        filename = path

        t = time.time()
        # Hash first so already-seen files are skipped cheaply; `with`
        # fixes the unclosed file handle of the original.
        with open(filename, "rb") as f:
            buf = f.read()
        sha1_hash = sha1(buf).hexdigest()
        if self.file_exists(sha1_hash):
            log("Already existing file %s..." % sha1_hash)
            return ANALYSIS_ALREADY

        try:
            self.r2 = r2pipe.open(path)
        except KeyboardInterrupt:
            log("Abort")
            return ANALYSIS_FAILED
        except Exception:
            log("ERROR loading file %s" % path)
            # BUG FIX: the original continued with self.r2 unset and armed
            # the kill timer anyway; bail out immediately instead.
            return ANALYSIS_FAILED

        # Before performing code analysis, install a thread to immolate us
        # if it takes too long...
        kill_thread = threading.Timer(60, self_kill)
        kill_thread.start()
        load_error = False
        try:
            self.r2.cmd("aaa")
        except (KeyboardInterrupt, Exception):
            # KeyboardInterrupt is how self_kill() signals the timeout.
            load_error = True
        finally:
            # BUG FIX: cancel in `finally`; the original skipped the cancel
            # when cmd() raised, so the 60 s timer later interrupted the
            # main thread even though the error was already handled.
            kill_thread.cancel()

        if load_error:
            return ANALYSIS_FAILED

        fmt = None
        r2_format = self.r2.cmdj("ij")
        if r2_format is not None and "bin" in r2_format:
            fmt = r2_format["bin"]["class"]

        primes = []
        functions = self.r2.cmdj("aflj")
        if functions is None:
            return ANALYSIS_FAILED

        total_functions = len(functions)
        if total_functions > 0:
            nodes = []
            edges = []
            ccs = []
            # The callgraph signature is the product of one prime per
            # function, chosen by the function's cyclomatic complexity.
            callgraph = 1
            for f in functions:
                ret = self.read_function(f)
                if ret is None:
                    # Sometimes, it might fail in radare2 for some functions
                    continue

                f_nodes, f_edges, f_cc = ret
                nodes.append(f_nodes)
                edges.append(f_edges)
                ccs.append(f_cc)

                prime = self.primes_table[f_cc]
                callgraph *= prime
                primes.append(prime)

            avg_nodes = abs(sum(nodes) / total_functions)
            avg_edges = abs(sum(edges) / total_functions)
            avg_ccs = abs(sum(ccs) / total_functions)
        else:
            # BUG FIX: the original used `elif load_error:`, leaving the
            # averages and `callgraph` undefined for a binary with 0
            # functions, raising NameError below. Use -1 failure markers.
            total_functions = avg_nodes = avg_edges = avg_ccs = -1
            callgraph = -1

        msg = "%d-%d-%d-%d" % (total_functions, avg_nodes, avg_edges, avg_ccs)
        log("File analysed %s, callgraph signature %s" % (msg, callgraph))
        log("Time to analyze %f" % (time.time() - t))

        callgraph = str(callgraph)
        primes = ",".join(map(str, primes))
        desc = self.get_description(buf)
        self.db.insert("samples", filename=filename, callgraph=callgraph,
                       hash=sha1_hash, total_functions=total_functions,
                       format=fmt, primes=primes, description=desc,
                       analysis_date=time.asctime())
        return ANALYSIS_SUCCESS
Beispiel #11
0
def ida_log(msg):
    """Forward *msg* to the shared logger, tagged with the plugin name."""
    log("COSA-NOSTRA: %s" % msg)