Python CKoretFuzzyHashing.hash_file Examples

Programming Language: Python

Namespace/Package Name: kfuzzy

Method/Function: hash_file

Examples at hotexamples.com: 2

Python CKoretFuzzyHashing.hash_file - 2 examples found. These are the top-rated real-world Python examples of kfuzzy.CKoretFuzzyHashing.hash_file extracted from open source projects. You can rate examples to help us improve the quality of examples.

Frequently Used Methods

Show Hide

CKoretFuzzyHashing(5)

bsize(3)

hash_bytes(3)

edit_distance(2)

hash_file(2)

output_size(2)

Example #1

Show file

File: nightmare_frontend.py Project: ssatanss/nightmare

def find_original_file(db, id):
  """Find the template file a crash sample appears to derive from.

  The crash sample is looked up in the database, located on disk under
  ``<SAMPLES_PATH>/crashes/<sample_hash>`` and fuzzy-hashed.  Every regular
  file in the template folder of the crash's project is hashed the same way,
  and the first one sharing at least one of the three ";"-separated hash
  parts with the sample is reported.

  Args:
    db: database handle exposing ``select(table, what=, where=, vars=)`` and
      ``query(sql, vars=)``; both must return iterables of row objects with
      attribute access to the selected columns.
    id: sample identifier (``samples.sample_id`` / ``crashes.sample_id``).

  Returns:
    A ``(original_file, path)`` tuple.  ``original_file`` is the path of the
    matching template, or ``None`` if no file matched; ``path`` is the path
    of the crash sample itself.

  Raises:
    Exception: if the sample, a required configuration value, the crash
      file, the project or the project folder cannot be found, or if the
      sample is a ZIP archive whose comment is ``NIGHTMARE``.
  """
  rows = list(db.select("samples", what="sample_hash",
                        where="sample_id = $id", vars={"id": id}))
  if not rows:
    raise Exception("Invalid crash identifier")
  sample_hash = rows[0].sample_hash

  rows = list(db.select("config", what="value", where="name='SAMPLES_PATH'"))
  if not rows:
    raise Exception("Invalid configuration value for 'SAMPLES_PATH'")

  path = os.path.join(rows[0].value, "crashes", sample_hash)
  if not os.path.exists(path):
    raise Exception("Crash sample does not exists! %s" % path)

  # Read the magic through a context manager so the handle is always closed.
  with open(path, "rb") as sample:
    magic = sample.read(3)

  # Bytes literals: the file is read in binary mode and ZipFile.comment is
  # bytes on Python 3, so text literals would never match there.  On
  # Python 2 a b"..." literal is a plain str, so behaviour is unchanged.
  if magic == b"PK\x03":
    z = ZipFile(path, "r")
    try:
      cmt = z.comment
    finally:
      z.close()
    if cmt == b"NIGHTMARE":
      raise Exception("Cannot find the original sample for ZIP archives created by Nightmare, sorry.")

  rows = list(db.select("config", what="value", where="name = 'TEMPLATES_PATH'"))
  if not rows:
    raise Exception("Invalid configuration value for 'TEMPLATES_PATH'")
  templates_path = rows[0].value

  sql = """select p.subfolder subfolder
             from projects p,
                  crashes c
            where c.sample_id = $id
              and p.project_id = c.project_id"""
  rows = list(db.query(sql, vars={"id": id}))
  if not rows:
    raise Exception("Cannot find the project associated to the crash identifier")

  project_path = os.path.join(templates_path, rows[0].subfolder)
  if not os.path.exists(project_path):
    raise Exception("Cannot find path '%s'" % project_path)

  kfh = CKoretFuzzyHashing()
  kfh.bsize = 16
  h1, h2, h3 = kfh.hash_file(path).split(";")

  original_file = None
  for f in os.listdir(project_path):
    filename = os.path.join(project_path, f)
    if not os.path.isfile(filename):
      continue

    tmp1, tmp2, tmp3 = kfh.hash_file(filename).split(";")
    # One matching part is enough; a full match of all three parts is just
    # a special case of this test, so no separate branch is needed.
    if h1 == tmp1 or h2 == tmp2 or h3 == tmp3:
      original_file = filename
      break

  return original_file, path

Example #2

Show file

class CDeepToad:
    """Cluster, print or compare files by their fuzzy hashes.

    Files are hashed with a ``CKoretFuzzyHashing`` instance; ``hash_file`` is
    expected to return three ";"-separated signatures.  Depending on the mode
    flags, ``self.groups`` maps either a hash to the list of files close to
    it (clustering mode) or a filename to its three signatures
    (``just_print`` / ``just_compare`` modes).

    The code only uses constructs that behave identically on Python 2 and
    Python 3 (single-argument ``print(...)``, ``in`` instead of ``has_key``).
    """

    def __init__(self):
        # Hasher configuration; the meaning of each knob is defined by
        # CKoretFuzzyHashing, not by this class.
        self.kfd = CKoretFuzzyHashing()
        self.kfd.bsize = 512
        self.kfd.output_size = 32
        self.kfd.ignore_range = 2
        self.kfd.big_file_size = 1024*1024*32

        self.groups = {}
        self.ingroups = {}
        # Extension filters used by clusterDirectory (values as returned by
        # os.path.splitext, i.e. including the leading dot).
        self.extensions = []
        self.ignore_extensions = []
        # Largest edit distance at which a hash still joins an existing group.
        self.edit_distance = MAX_EDIT_DISTANCE
        # Maximum number of files to visit; 0 means no limit.
        self.maximum = 0
        self.aggresive = False
        self.just_print = False
        self.just_compare = False
        self.print_similars = False
        self.output_dir = None
        # Path of the file read by compareSimilars; must be set by the caller
        # before enabling print_similars.
        self.similars_file = None

    def cluster(self, hashes, filename):
        """Record *filename* under the groups matching each of its *hashes*.

        In ``just_print`` / ``just_compare`` mode the hashes are simply
        stored under the filename.  Otherwise each hash joins the first
        existing group whose key is within ``self.edit_distance`` of it, or
        starts a new group keyed by the hash itself.
        """
        if self.just_print or self.just_compare:
            self.groups[filename] = hashes
            return

        for current in hashes:
            for key in self.groups:
                # Check for maximum edit distance
                if self.kfd.edit_distance(key, current) <= self.edit_distance:
                    self.groups[key].append(filename)
                    break
            else:
                # No existing group was close enough: open a new one.
                self.groups.setdefault(current, []).append(filename)

    def compareSimilars(self, hashes, filename):
        """Print the ";"-separated fields of every line of the similars file.

        *hashes* and *filename* are accepted for interface compatibility but
        are not used.

        Raises:
            ValueError: if ``self.similars_file`` has not been set.
        """
        similars_file = getattr(self, "similars_file", None)
        if not similars_file:
            raise ValueError("No similars file configured")

        # Text mode so lines are str on both Python 2 and 3; the context
        # manager guarantees the handle is closed.
        with open(similars_file, "r") as similars:
            for line in similars:
                # rstrip with both characters handles "\n" and "\r\n" endings.
                similar_hashes = line.rstrip("\r\n").split(";")
                print(similar_hashes)

    def hashFile(self, filename):
        """Hash *filename* and print, compare or cluster it per the mode flags.

        Errors while hashing or processing are reported on stderr and do not
        stop the run.
        """
        try:
            s1, s2, s3 = self.kfd.hash_file(filename, self.aggresive).split(";")
            if self.just_print:
                print("%s;%s;%s;%s" % (s1, s2, s3, filename))
            elif self.print_similars:
                self.compareSimilars((s1, s2, s3), filename)
            else:
                self.cluster((s1, s2, s3), filename)
        except Exception:
            # Best effort: KeyboardInterrupt and SystemExit are not
            # Exception subclasses, so they still propagate.
            sys.stderr.write(" -> %s\n" % str(sys.exc_info()[1]))
            sys.stderr.flush()

    def printReportHeader(self):
        """Print the column header of the ";"-separated hash report."""
        print("Signature;Simple Signature;Reverse Signature;Filename")

    def clusterDirectory(self, path, output_dir):
        """Walk *path* recursively and hash every file passing the filters.

        ``self.maximum`` (when non-zero) caps the number of files visited;
        files skipped by the extension filters still count towards it.
        *output_dir* is accepted but not used by this method.
        """
        last_size = 0
        total = 0
        if self.just_print:
            self.printReportHeader()

        for root, dirs, files in os.walk(path):
            for name in files:
                if self.maximum != 0 and total >= self.maximum:
                    break
                total += 1

                extension = os.path.splitext(name)[1]
                if extension in self.ignore_extensions:
                    continue
                if self.extensions and extension not in self.extensions:
                    continue

                full_path = os.path.join(root, name)
                if not self.just_print:
                    # Erase the previous progress message in place before
                    # writing the new one.
                    message = "Processing file %s ..." % full_path
                    sys.stderr.write("\b"*last_size + " "*last_size + "\b"*last_size)
                    sys.stderr.flush()
                    sys.stderr.write(message)
                    last_size = len(message)
                    sys.stderr.flush()

                self.hashFile(full_path)

            if self.maximum != 0 and total >= self.maximum:
                break

        if total > 0:
            sys.stderr.write("\n")
            sys.stderr.flush()

    def sortByCount(self):
        """Return the groups ordered by size, each file kept only once.

        Groups are ordered by ``(member count, key)`` descending; the group
        keyed by the empty string is skipped.  A file appears only in the
        first group (in that order) that contains it, and groups left empty
        are dropped.

        Returns:
            An ``OrderedDict`` mapping group key to its list of files, so
            the sort order survives on every Python version.
        """
        from collections import OrderedDict

        ranked = sorted(
            ((len(members), key)
             for key, members in self.groups.items() if key != ""),
            reverse=True)

        outgrp = OrderedDict()
        seen = set()
        for _, key in ranked:
            unique = []
            for element in self.groups[key]:
                if element not in seen:
                    seen.add(element)
                    unique.append(element)
            # Keep only non-empty groups.
            if unique:
                outgrp[key] = unique

        return outgrp

    def printHashes(self):
        """Print the three stored signatures of every file in ``self.groups``.

        Expects ``self.groups`` to map filename -> three signatures, as
        filled in ``just_print`` / ``just_compare`` mode.
        """
        self.printReportHeader()
        for x in self.groups:
            hashes = self.groups[x]
            print("%s;%s;%s;%s" % (hashes[0], hashes[1], hashes[2], x))

    def compareAndReportHashes(self, x, y, hashesx, hashesy, dones):
        """Report file *y* as matching *x* if any pair of hashes is similar.

        Similarity is ``(len(hx) - edit_distance(hx, hy)) * 100 / len(hx)``;
        a value above 33 counts as a match.  Only the first match is
        reported, and it is recorded as ``dones[y] = x``.  Empty hashes are
        ignored.

        Returns:
            The (mutated) *dones* dictionary.
        """
        for hx in hashesx:
            if hx == "":
                continue

            for hy in hashesy:
                if hy == "":
                    continue

                dis = len(hx) - self.kfd.edit_distance(hx, hy)
                percent = dis*100.00/len(hx)

                if percent > 33:
                    print("File '%s' matches '%s' (%0.2f%%)" % (x, y, percent))
                    dones[y] = x
                    return dones

        return dones

    def compareHashes(self):
        """Compare every pair of files in ``self.groups`` and print matches.

        A pair already reported in the opposite direction is skipped.
        """
        dones = {}

        for x in self.groups:
            for y in self.groups:
                if x == y:
                    continue
                if x in dones and dones[x] == y:
                    continue

                dones = self.compareAndReportHashes(
                    x, y, self.groups[x], self.groups[y], dones)

    def printReport(self):
        """Print the final report for the active mode.

        Nothing is printed in ``just_print`` mode (lines were emitted while
        hashing).  In ``just_compare`` mode matches are printed; otherwise
        one ``hash;filename`` line is printed per clustered file.
        """
        if self.just_print:
            return

        if self.just_compare:
            self.compareHashes()
            return

        grp = self.sortByCount()
        already = set()
        for x in grp:
            for element in self.groups[x]:
                if element not in already:
                    print("%s;%s" % (x, element))
                    already.add(element)

    def copySamples(self, out_dir):
        """Copy every clustered file into ``<out_dir>/set<N>/``.

        One ``set<N>`` directory (numbered from 1) is created per group, and
        each file is copied once, into the first group that contains it.
        *out_dir* is created if it does not exist.
        """
        if not os.path.exists(out_dir):
            os.mkdir(out_dir)

        base_dir_name = "set"
        sets = {}
        i = 1

        grp = self.sortByCount()
        already = set()
        for x in grp:
            for element in self.groups[x]:
                if element in already:
                    continue

                if x not in sets:
                    # Create the set directory under the directory that was
                    # requested (and created above), not self.output_dir.
                    set_dir = os.path.join(out_dir, base_dir_name + str(i))
                    i += 1
                    os.mkdir(set_dir)
                    sets[x] = set_dir

                new_path = os.path.join(sets[x], os.path.basename(element))
                shutil.copy(element, new_path)
                already.add(element)