Esempio n. 1
0
class TestKFuzzy(unittest.TestCase):
    """Regression tests for CKoretFuzzyHashing.

    Every test hashes the buffer built in setUp() and compares the result
    with a recorded key or with a structural property of the output.
    """

    kfd = None
    buf = ""

    def setUp(self):
        """Create a fresh hasher and the reference input buffer."""
        self.kfd = CKoretFuzzyHashing()
        # 512 copies of each byte value 0..254 (xrange(0, 255) stops before
        # 255, so 0xFF is not part of the buffer). The recorded keys below
        # were produced from exactly this buffer; do not change it.
        self.buf = "".join(chr(c) * 512 for c in xrange(0, 255))

    def testDHA(self):
        """ Default hashing algorithm (DHA) """
        key = "AgIEBAQEBgYGBggICAgKCgoKDAwMDA4O;BAQGBggICgoMDA4OEBASEhQUFhYYGBoa;+/v5+ff39fXz8/Hx7+/t7evr6enn5+Xl"
        digest = self.kfd.hash_bytes(self.buf)
        self.assertEqual(key, digest)

    def testFHA(self):
        """ Hashing with the _fast_hash algorithm selected """
        key = "IAIEBggKDA4QEhQWGBocHg;AgQGCAoMDhASFBYYGhweICIkJigqLC4w;vb/Bw8XHycvNz9HT1dfZ293f4ePl5+nr"
        self.kfd.algorithm = self.kfd._fast_hash
        digest = self.kfd.hash_bytes(self.buf)
        self.assertEqual(key, digest)

    def testSimplified(self):
        """ Hashing with the simplified algorithm against a recorded key """
        # NOTE(review): this test used to be shadowed by a second method
        # with the same name, so it never ran; confirm the recorded key is
        # still the expected output.
        key = "/v4DA/39Bgb8/AkJ+/sMDPr6Dw/5+RIS;AYB//n/+fv1+/X38ffx8+3z7e/p7+nr5;+3z8ff19/X7+fv5/f4ABgAGBAoECggOC"
        buf = self.buf * 16
        self.kfd.algorithm = self.kfd.simplified
        digest = self.kfd.hash_bytes(buf)
        self.assertEqual(key, digest)

    def testOutputSize(self):
        """ The hash length is three output_size parts plus two separators """
        self.kfd.algorithm = None
        length = len(self.kfd.hash_bytes(self.buf))
        size = self.kfd.output_size * 3
        size += 2
        self.assertEqual(length, size)

    def testNullBlocks(self):
        """ With reduce_errors set, the hash must not contain "AA" """
        buf = "\x00" * 8192
        buf += self.buf
        self.kfd.algorithm = None
        self.kfd.reduce_errors = True
        h = self.kfd.hash_bytes(buf)
        self.assertTrue(h.find("AA") == -1)

    def testSimplifiedVsFastHash(self):
        """ Simplified and fast hashes of the same input stay close """
        # Renamed from a duplicate "testSimplified" so that both tests are
        # collected by the runner instead of the later one replacing the
        # earlier one in the class namespace.
        self.kfd.algorithm = self.kfd.simplified
        h = self.kfd.hash_bytes(self.buf + self.buf)
        self.kfd.algorithm = self.kfd._fast_hash
        h2 = self.kfd.hash_bytes(self.buf + self.buf)
        full_size = (self.kfd.output_size * 3) + 2
        self.assertTrue(full_size - self.kfd.edit_distance(h, h2) < 16)
Esempio n. 2
0
class TestKFuzzy(unittest.TestCase):
    """Regression tests for CKoretFuzzyHashing.

    Every test hashes the buffer built in setUp() and compares the result
    with a recorded key or with a structural property of the output.
    """

    kfd = None
    buf = ""

    def setUp(self):
        """Create a fresh hasher and the reference input buffer."""
        self.kfd = CKoretFuzzyHashing()
        # 512 copies of each byte value 0..254 (xrange(0, 255) stops before
        # 255, so 0xFF is not part of the buffer). The recorded keys below
        # were produced from exactly this buffer; do not change it.
        self.buf = "".join(chr(c) * 512 for c in xrange(0, 255))

    def testDHA(self):
        """ Default hashing algorithm (DHA) """
        key = "AgIEBAQEBgYGBggICAgKCgoKDAwMDA4O;BAQGBggICgoMDA4OEBASEhQUFhYYGBoa;+/v5+ff39fXz8/Hx7+/t7evr6enn5+Xl"
        digest = self.kfd.hash_bytes(self.buf)
        self.assertEqual(key, digest)

    def testFHA(self):
        """ Hashing with the _fast_hash algorithm selected """
        key = "IAIEBggKDA4QEhQWGBocHg;AgQGCAoMDhASFBYYGhweICIkJigqLC4w;vb/Bw8XHycvNz9HT1dfZ293f4ePl5+nr"
        self.kfd.algorithm = self.kfd._fast_hash
        digest = self.kfd.hash_bytes(self.buf)
        self.assertEqual(key, digest)

    def testSimplified(self):
        """ Hashing with the simplified algorithm against a recorded key """
        # NOTE(review): this test used to be shadowed by a second method
        # with the same name, so it never ran; confirm the recorded key is
        # still the expected output.
        key = "/v4DA/39Bgb8/AkJ+/sMDPr6Dw/5+RIS;AYB//n/+fv1+/X38ffx8+3z7e/p7+nr5;+3z8ff19/X7+fv5/f4ABgAGBAoECggOC"
        buf = self.buf * 16
        self.kfd.algorithm = self.kfd.simplified
        digest = self.kfd.hash_bytes(buf)
        self.assertEqual(key, digest)

    def testOutputSize(self):
        """ The hash length is three output_size parts plus two separators """
        self.kfd.algorithm = None
        length = len(self.kfd.hash_bytes(self.buf))
        size = self.kfd.output_size * 3
        size += 2
        self.assertEqual(length, size)

    def testNullBlocks(self):
        """ With reduce_errors set, the hash must not contain "AA" """
        buf = "\x00" * 8192
        buf += self.buf
        self.kfd.algorithm = None
        self.kfd.reduce_errors = True
        h = self.kfd.hash_bytes(buf)
        self.assertTrue(h.find("AA") == -1)

    def testSimplifiedVsFastHash(self):
        """ Simplified and fast hashes of the same input stay close """
        # Renamed from a duplicate "testSimplified" so that both tests are
        # collected by the runner instead of the later one replacing the
        # earlier one in the class namespace.
        self.kfd.algorithm = self.kfd.simplified
        h = self.kfd.hash_bytes(self.buf + self.buf)
        self.kfd.algorithm = self.kfd._fast_hash
        h2 = self.kfd.hash_bytes(self.buf + self.buf)
        full_size = (self.kfd.output_size * 3) + 2
        self.assertTrue(full_size - self.kfd.edit_distance(h, h2) < 16)
Esempio n. 3
0
class CDeepToad:
    """Cluster, print or compare files by their fuzzy hashes.

    Hashes come from a CKoretFuzzyHashing instance (self.kfd). Depending on
    the just_print / just_compare / print_similars flags, a hashed file is
    either printed right away, stored per file for a later pairwise
    comparison, or clustered into self.groups keyed by hash.
    """

    def __init__(self):
        """Create the hasher and reset every option to its default."""
        self.kfd = CKoretFuzzyHashing()
        self.kfd.bsize = 512
        self.kfd.output_size = 32
        self.kfd.ignore_range = 2
        self.kfd.big_file_size = 1024*1024*32

        # Clustering mode: hash -> list of filenames.
        # just_print / just_compare mode: filename -> tuple of hashes.
        self.groups = {}
        self.ingroups = {}
        self.extensions = []
        self.ignore_extensions = []
        self.edit_distance = MAX_EDIT_DISTANCE
        self.maximum = 0
        self.aggresive = False
        self.just_print = False
        self.just_compare = False
        self.print_similars = False
        self.output_dir = None
        # Read by compareSimilars(); callers must set it before enabling
        # print_similars.
        self.similars_file = None

    def cluster(self, hashes, filename):
        """Record `filename` under its hashes.

        In just_print / just_compare mode the hashes are simply stored per
        file. Otherwise each hash is attached to the first existing group
        whose key is within self.edit_distance of it; when no group is
        close enough, the hash becomes the key of a new group.
        """
        if self.just_print or self.just_compare:
            self.groups[filename] = hashes
            return

        for fuzzy_hash in hashes:
            for key in self.groups:
                # Check for maximum edit distance
                if self.kfd.edit_distance(key, fuzzy_hash) <= self.edit_distance:
                    self.groups[key].append(filename)
                    break
            else:
                # No existing group matched.
                self.groups.setdefault(fuzzy_hash, []).append(filename)

    def compareSimilars(self, hashes, filename):
        """Print the ';'-separated fields of every line of similars_file.

        `hashes` and `filename` are accepted but not used yet.

        Raises ValueError when self.similars_file has not been set.
        """
        if not self.similars_file:
            raise ValueError("no similars file configured (similars_file)")

        with open(self.similars_file, "rb") as handle:
            for line in handle:
                # Strip both line-ending characters in one pass, so a
                # "\r\n" ending leaves no stray "\r" behind.
                similar_hashes = line.strip(b"\r\n").split(b";")
                print(similar_hashes)

    def hashFile(self, filename):
        """Hash one file and print, compare or cluster it per the flags.

        Errors while processing the file are reported on stderr and do not
        stop the run.
        """
        try:
            s1, s2, s3 = self.kfd.hash_file(filename, self.aggresive).split(";")
            if self.just_print:
                print("%s;%s;%s;%s" % (s1, s2, s3, filename))
            elif self.print_similars:
                self.compareSimilars((s1, s2, s3), filename)
            else:
                self.cluster((s1, s2, s3), filename)
        except Exception:
            # Best effort per file. KeyboardInterrupt and SystemExit are
            # not subclasses of Exception, so they still propagate.
            sys.stderr.write(" -> %s\n" % str(sys.exc_info()[1]))
            sys.stderr.flush()

    def printReportHeader(self):
        """Print the column header used by the per-file hash listing."""
        print("Signature;Simple Signature;Reverse Signature;Filename")

    def clusterDirectory(self, path, output_dir):
        """Walk `path` recursively and hash every selected file.

        Files whose extension is in self.ignore_extensions are skipped, as
        are files outside self.extensions when that list is not empty.
        self.maximum (0 = unlimited) caps the number of files visited;
        skipped files count towards that cap. `output_dir` is accepted but
        not used here.
        """
        last_size = 0
        total = 0
        if self.just_print:
            self.printReportHeader()

        for root, dirs, files in os.walk(path):
            for name in files:
                if self.maximum != 0 and total >= self.maximum:
                    break
                total += 1

                extension = os.path.splitext(name)[1]
                if extension in self.ignore_extensions:
                    continue
                if self.extensions and extension not in self.extensions:
                    continue

                full_path = os.path.join(root, name)
                if not self.just_print:
                    # Erase the previous progress message before writing
                    # the new one on the same terminal line.
                    sys.stderr.write("\b"*last_size + " "*last_size + "\b"*last_size)
                    sys.stderr.flush()
                    message = "Processing file %s ..." % full_path
                    sys.stderr.write(message)
                    last_size = len(message)
                    sys.stderr.flush()

                self.hashFile(full_path)

            if self.maximum != 0 and total >= self.maximum:
                break

        if total > 0:
            sys.stderr.write("\n")
            sys.stderr.flush()

    def sortByCount(self):
        """Return the groups ordered by size, largest first.

        Every element is kept only in the first (largest) group it appears
        in, and groups left empty by that de-duplication are dropped. The
        group with the empty-string key is ignored. The result is an
        OrderedDict so that the computed order survives iteration.
        """
        from collections import OrderedDict

        # First, sort by count of elements (ties broken by key)
        counts = dict((key, len(members))
                      for key, members in self.groups.items() if key != "")
        ranked = sorted(counts.items(),
                        key=lambda item: (item[1], item[0]), reverse=True)

        # Create the new dict with only non empty groups
        outgrp = OrderedDict()
        seen = set()
        for key, _count in ranked:
            unique = []
            for element in self.groups[key]:
                if element not in seen:
                    unique.append(element)
                    seen.add(element)

            if unique:
                outgrp[key] = unique

        return outgrp

    def printHashes(self):
        """Print header plus one 'hash;hash;hash;filename' line per file."""
        self.printReportHeader()
        for filename in self.groups:
            hashes = self.groups[filename]
            print("%s;%s;%s;%s" % (hashes[0], hashes[1], hashes[2], filename))

    def compareAndReportHashes(self, x, y, hashesx, hashesy, dones):
        """Report whether file `x` matches file `y`.

        Every non-empty hash of `x` is compared with every non-empty hash
        of `y`. At the first pair whose similarity exceeds 33% the match is
        printed, dones[y] is set to x and the search stops.

        Returns `dones` (updated in place).
        """
        for hx in hashesx:
            if hx == "":
                continue

            for hy in hashesy:
                if hy == "":
                    continue

                # Similarity = share of hx's length not covered by the
                # edit distance.
                dis = len(hx) - self.kfd.edit_distance(hx, hy)
                percent = dis*100.00/len(hx)

                if percent > 33:
                    print("File '%s' matches '%s' (%0.2f%%)" % (x, y, percent))
                    dones[y] = x
                    return dones

        return dones

    def compareHashes(self):
        """Compare every pair of hashed files and print the matches.

        A pair already reported in the opposite direction is skipped.
        """
        dones = {}

        for x in self.groups:
            for y in self.groups:
                if x == y:
                    continue
                if x in dones and dones[x] == y:
                    continue

                dones = self.compareAndReportHashes(
                    x, y, self.groups[x], self.groups[y], dones)

    def printReport(self):
        """Print the final report for the selected mode.

        Nothing is printed in just_print mode, because hashFile() already
        printed each file. In just_compare mode the pairwise comparison is
        run. Otherwise one 'hash;filename' line is printed per clustered
        file, largest group first.
        """
        if self.just_print:
            return

        if self.just_compare:
            self.compareHashes()
            return

        already = set()
        for x in self.sortByCount():
            for element in self.groups[x]:
                if element not in already:
                    print("%s;%s" % (x, element))
                    already.add(element)

    def copySamples(self, out_dir):
        """Copy every clustered file into `out_dir`/setN, one dir per group.

        `out_dir` is created when missing. Groups are numbered from 1 in
        sortByCount() order and each file is copied only once.
        """
        if not os.path.exists(out_dir):
            os.mkdir(out_dir)

        base_dir_name = "set"
        sets = {}
        i = 1

        already = set()
        for x in self.sortByCount():
            for element in self.groups[x]:
                if element in already:
                    continue

                if x not in sets:
                    # Create the set directory under the directory passed
                    # in, not under self.output_dir, which may be unset.
                    set_dir = os.path.join(out_dir, base_dir_name + str(i))
                    i += 1
                    os.mkdir(set_dir)
                    sets[x] = set_dir

                new_path = os.path.join(sets[x], os.path.basename(element))
                shutil.copy(element, new_path)
                already.add(element)