def find_original_file(db, id):
    """Locate the template file a crash sample was most likely derived from.

    The crash sample is looked up by ``id`` in the ``samples`` table and
    resolved to ``<SAMPLES_PATH>/crashes/<sample_hash>``.  Every regular file
    in the project's template folder (``<TEMPLATES_PATH>/<subfolder>``) is
    then fuzzy-hashed and compared against the fuzzy hash of the sample.

    Args:
        db: Database handle exposing ``select(table, what=, where=, vars=)``
            and ``query(sql, vars=)``; rows expose columns as attributes.
        id: Sample identifier, bound to ``$id`` in the queries.

    Returns:
        A tuple ``(original_file, path)``.  ``original_file`` is the full path
        of the first template whose signature shares at least one of its three
        parts with the sample, or ``None`` when nothing matches.  ``path`` is
        the full path of the crash sample.

    Raises:
        Exception: if the sample, a configuration value, the project, or one
            of the resolved paths cannot be found, or if the sample is a ZIP
            archive whose comment is ``NIGHTMARE``.
    """
    query_vars = {"id": id}

    rows = list(db.select("samples", what="sample_hash",
                          where="sample_id = $id", vars=query_vars))
    if not rows:
        raise Exception("Invalid crash identifier")
    sample_hash = rows[0].sample_hash

    rows = list(db.select("config", what="value",
                          where="name='SAMPLES_PATH'"))
    if not rows:
        raise Exception("Invalid configuration value for 'SAMPLES_PATH'")

    path = os.path.join(rows[0].value, "crashes", sample_hash)
    if not os.path.exists(path):
        raise Exception("Crash sample does not exists! %s" % path)

    # Read the magic bytes through a context manager so the handle is closed.
    with open(path, "rb") as handle:
        magic = handle.read(3)

    if magic == b"PK\x03":
        archive = ZipFile(path, "r")
        try:
            comment = archive.comment
        finally:
            archive.close()
        if comment == b"NIGHTMARE":
            raise Exception("Cannot find the original sample for ZIP archives created by Nightmare, sorry.")

    rows = list(db.select("config", what="value",
                          where="name = 'TEMPLATES_PATH'"))
    if not rows:
        raise Exception("Invalid configuration value for 'TEMPLATES_PATH'")
    templates_path = rows[0].value

    sql = """select p.subfolder subfolder from projects p, crashes c where c.sample_id = $id and p.project_id = c.project_id"""
    rows = list(db.query(sql, vars=query_vars))
    if not rows:
        raise Exception("Cannot find the project associated to the crash identifier")

    project_path = os.path.join(templates_path, rows[0].subfolder)
    if not os.path.exists(project_path):
        raise Exception("Cannot find path '%s'" % project_path)

    kfh = CKoretFuzzyHashing()
    kfh.bsize = 16
    h1, h2, h3 = kfh.hash_file(path).split(";")

    original_file = None
    for entry in os.listdir(project_path):
        filename = os.path.join(project_path, entry)
        if not os.path.isfile(filename):
            continue

        tmp1, tmp2, tmp3 = kfh.hash_file(filename).split(";")
        # A full match (all three parts equal) is a special case of a partial
        # match and was handled identically, so one test covers both: the
        # first template sharing any signature part wins.
        if h1 == tmp1 or h2 == tmp2 or h3 == tmp3:
            original_file = filename
            break

    return original_file, path
class CDeepToad:
    """Group files by the similarity of their fuzzy-hash signatures.

    Files are hashed with a ``CKoretFuzzyHashing`` instance, which yields a
    three-part signature per file (``"s1;s2;s3"``).  Depending on the mode
    flags the signatures are printed, compared pairwise, or clustered.

    ``self.groups`` has two shapes:
      * clustering mode: ``{signature_part: [filename, ...]}``
      * ``just_print`` / ``just_compare`` mode: ``{filename: (s1, s2, s3)}``
    """

    def __init__(self):
        # Hasher tuning values; their meaning is defined by
        # CKoretFuzzyHashing, not by this class.
        self.kfd = CKoretFuzzyHashing()
        self.kfd.bsize = 512
        self.kfd.output_size = 32
        self.kfd.ignore_range = 2
        self.kfd.big_file_size = 1024*1024*32

        self.groups = {}
        self.ingroups = {}  # not used by any method of this class as shown
        self.extensions = []
        self.ignore_extensions = []
        self.edit_distance = MAX_EDIT_DISTANCE
        self.maximum = 0  # 0 disables the file-count limit
        self.aggresive = False
        self.just_print = False
        self.just_compare = False
        self.print_similars = False
        self.output_dir = None
        # Read by compareSimilars(); previously never initialised here.
        self.similars_file = None

    def cluster(self, hashes, filename):
        """Record ``filename`` under every group its signature parts fall in.

        In ``just_print`` / ``just_compare`` mode the signature tuple is only
        stored under the filename.  Otherwise each part joins the first
        existing group whose key is within ``self.edit_distance`` of it, or
        starts a new group keyed by that part.
        """
        if self.just_print or self.just_compare:
            self.groups[filename] = hashes
            return

        for part in hashes:
            matched = False
            for key in self.groups:
                # Check for maximum edit distance
                if self.kfd.edit_distance(key, part) <= self.edit_distance:
                    self.groups[key].append(filename)
                    matched = True
                    break

            if matched:
                continue

            if part not in self.groups:
                self.groups[part] = []
            self.groups[part].append(filename)

    def compareSimilars(self, hashes, filename):
        """Print every ``;``-separated record of ``self.similars_file``.

        ``hashes`` and ``filename`` are accepted but not used yet.

        Raises:
            ValueError: if no similars file has been configured.
        """
        similars_file = getattr(self, "similars_file", None)
        if not similars_file:
            raise ValueError("No similars file configured")

        with open(similars_file, "r") as handle:
            for line in handle:
                # Strip both characters at once: chaining strip("\r") and
                # strip("\n") left the "\r" of a "\r\n" ending in place.
                similar_hashes = line.strip("\r\n").split(";")
                print(similar_hashes)

    def hashFile(self, filename):
        """Hash ``filename`` and dispatch the signature per the mode flags.

        Errors are reported on stderr and do not abort the run; interrupts
        and interpreter exit requests propagate to the caller.
        """
        try:
            s1, s2, s3 = self.kfd.hash_file(filename, self.aggresive).split(";")
            if self.just_print:
                print("%s;%s;%s;%s" % (s1, s2, s3, filename))
            elif self.print_similars:
                self.compareSimilars((s1, s2, s3), filename)
            else:
                self.cluster((s1, s2, s3), filename)
        except Exception:
            # ``except Exception`` lets KeyboardInterrupt and SystemExit
            # through, which the former bare ``except`` only did for the
            # keyboard interrupt.
            sys.stderr.write(" -> %s\n" % str(sys.exc_info()[1]))
            sys.stderr.flush()

    def printReportHeader(self):
        """Print the column header of the ``just_print`` report."""
        print("Signature;Simple Signature;Reverse Signature;Filename")

    def clusterDirectory(self, path, output_dir):
        """Walk ``path`` recursively and hash every accepted file.

        A file is skipped when its extension is in ``self.ignore_extensions``
        or, if ``self.extensions`` is non-empty, is not listed there.
        ``output_dir`` is accepted for interface compatibility and not used.

        NOTE: files skipped by the extension filters still count towards
        ``self.maximum``.
        """
        last_size = 0
        total = 0

        if self.just_print:
            self.printReportHeader()

        for root, _dirs, files in os.walk(path):
            for name in files:
                if self.maximum != 0 and total >= self.maximum:
                    break

                total += 1
                extension = os.path.splitext(name)[1]
                if extension in self.ignore_extensions:
                    continue
                if self.extensions and extension not in self.extensions:
                    continue

                full_path = os.path.join(root, name)
                if not self.just_print:
                    # Erase the previous progress message, then write the
                    # new one over it.
                    sys.stderr.write("\b"*last_size + " "*last_size + "\b"*last_size)
                    sys.stderr.flush()
                    message = "Processing file %s ..." % full_path
                    sys.stderr.write(message)
                    last_size = len(message)
                    sys.stderr.flush()

                self.hashFile(full_path)

            if self.maximum != 0 and total >= self.maximum:
                break

        if total > 0:
            sys.stderr.write("\n")
            sys.stderr.flush()

    def sortByCount(self):
        """Return the groups ordered by size, each file kept only once.

        Groups are visited from largest to smallest (ties broken by key,
        descending).  A file is kept in the first group that contains it;
        groups left empty are dropped, and the empty-string key is ignored.

        Returns:
            An ``OrderedDict`` ``{key: [filename, ...]}`` in visiting order.
        """
        from collections import OrderedDict

        # First, sort by count of elements
        counts = dict((key, len(members))
                      for key, members in self.groups.items() if key != "")
        ranked = sorted(counts.items(),
                        key=lambda item: (item[1], item[0]), reverse=True)

        # A plain dict would lose the ranking just computed on Python 2.
        outgrp = OrderedDict()
        seen = set()
        # Create the new dict with only non empty groups
        for key, _count in ranked:
            members = []
            for element in self.groups[key]:
                if element not in seen:
                    members.append(element)
                    seen.add(element)
            if members:
                outgrp[key] = members
        return outgrp

    def printHashes(self):
        """Print header plus one ``s1;s2;s3;filename`` row per stored file.

        Expects ``self.groups`` in its ``{filename: (s1, s2, s3)}`` shape.
        """
        self.printReportHeader()
        for x in self.groups:
            hashes = self.groups[x]
            print("%s;%s;%s;%s" % (hashes[0], hashes[1], hashes[2], x))

    def compareAndReportHashes(self, x, y, hashesx, hashesy, dones):
        """Report whether file ``x`` matches file ``y``.

        Every non-empty part of ``hashesx`` is compared with every non-empty
        part of ``hashesy``.  Similarity is
        ``(len(hx) - edit_distance) * 100 / len(hx)``; the first pair above
        33% is printed, recorded as ``dones[y] = x``, and ends the search.

        Returns:
            The same ``dones`` dict that was passed in.
        """
        for hx in hashesx:
            if hx == "":
                continue

            for hy in hashesy:
                if hy == "":
                    continue

                dis = len(hx) - self.kfd.edit_distance(hx, hy)
                percent = dis*100.00/len(hx)
                if percent > 33:
                    print("File '%s' matches '%s' (%0.2f%%)" % (x, y, percent))
                    dones[y] = x
                    return dones
        return dones

    def compareHashes(self):
        """Compare every stored file against every other one and report.

        A pair ``(x, y)`` is skipped when the reverse match was already
        reported, i.e. ``dones[x] == y``.
        """
        dones = {}
        for x in self.groups:
            for y in self.groups:
                if x == y:
                    continue
                if dones.get(x) == y:
                    continue

                dones = self.compareAndReportHashes(
                    x, y, self.groups[x], self.groups[y], dones)

    def printReport(self):
        """Print the final report for the active mode.

        ``just_print`` emits nothing here (rows are printed while hashing),
        ``just_compare`` runs the pairwise comparison, and otherwise one
        ``key;filename`` line is printed per clustered file.
        """
        if self.just_print:
            return

        if self.just_compare:
            self.compareHashes()
            return

        already = set()
        for x in self.sortByCount():
            for element in self.groups[x]:
                if element not in already:
                    print("%s;%s" % (x, element))
                    already.add(element)

    def copySamples(self, out_dir):
        """Copy every clustered file into ``out_dir/setN``, one per group.

        ``out_dir`` is created if missing.  Set directories are numbered from
        1 in the order groups are first used, and existing ones are reused.

        NOTE: files are stored under their base name, so two files with the
        same name in one group overwrite each other.
        """
        if not os.path.exists(out_dir):
            os.mkdir(out_dir)

        set_dirs = {}
        next_index = 1
        already = set()
        for x in self.sortByCount():
            for element in self.groups[x]:
                if element in already:
                    continue

                if x not in set_dirs:
                    # Build under the directory prepared above; this used to
                    # read self.output_dir, which may differ or be None.
                    target = os.path.join(out_dir, "set" + str(next_index))
                    next_index += 1
                    if not os.path.isdir(target):
                        os.mkdir(target)
                    set_dirs[x] = target

                new_path = os.path.join(set_dirs[x], os.path.basename(element))
                shutil.copy(element, new_path)
                already.add(element)