Beispiel #1
0
def tlshh(path):
    """Compute pairwise TLSH similarity scores for every file in *path*.

    Reads a filename -> file-number mapping from 'file_numbers.csv',
    TLSH-hashes every pair of files found directly under *path*, and
    writes the scored pairs to 'matchestlsh.csv'.  The score column is
    1000 - tlsh.diff(), so a larger score means more similar.
    """
    print("[+] Creating tlsh matches")

    # filename -> file number, taken from the first two CSV columns
    with open('file_numbers.csv', mode='r') as infile:
        mydict = {row[0]: row[1] for row in csv.reader(infile)}

    result = []
    for file1, file2 in itertools.combinations(
            glob.glob(os.path.join(path, '*')), r=2):
        # os.path.basename is robust when *path* itself contains '/',
        # unlike the original split("/")[1]
        filename1 = os.path.basename(file1)
        filename2 = os.path.basename(file2)
        # context managers: the original leaked both file handles
        with open(file1, 'rb') as f1:
            h1 = tlsh.hash(f1.read())
        with open(file2, 'rb') as f2:
            h2 = tlsh.hash(f2.read())
        score = tlsh.diff(h1, h2)
        result.append([mydict[filename1], h1, mydict[filename2], h2,
                       1000 - score])

    with open("matchestlsh.csv", "w", newline="") as f:
        writer = csv.writer(f)
        writer.writerow([
            "File 1 Number", "File 1 Hash", "File 2 Number", "File 2 Hash",
            "Score"
        ])
        writer.writerows(result)
Beispiel #2
0
def check_text_similarity(path1, path2):
	"""Return the TLSH difference score between two files' contents.

	A lower score means more similar content (0 == identical digests).
	"""
	# context managers: the original left both file handles open
	with open(path1, 'rb') as f1:
		file1_str = f1.read()
	with open(path2, 'rb') as f2:
		file2_str = f2.read()
	h1 = tlsh.hash(file1_str)
	h2 = tlsh.hash(file2_str)
	return tlsh.diff(h1, h2)
Beispiel #3
0
def ToHash(h1, h2, classes):
  """Compute several fuzzy hashes of *h1* and of each class body.

  Returns a 5-tuple (tlsh, sdhash, nilsimsa, ssdeep, class_hashes) where
  class_hashes is a list of (name, tlsh, sdhash, nilsimsa, ssdeep) tuples,
  one per entry of *classes*.  Any hash that cannot be computed is the
  string 'None' (the original convention, kept for compatibility).
  *h2* is accepted for interface compatibility but is unused here.
  """

  def _safe(func):
    # Run one hashing callable, mapping any failure to the 'None' sentinel
    # (narrowed from the original bare except to except Exception).
    try:
      return func()
    except Exception:
      return 'None'

  th = _safe(lambda: tlsh.hash(h1))
  sh = _safe(lambda: fuzzyhashlib.sdhash(h1).hexdigest().rstrip())
  nil = _safe(lambda: Nilsimsa(h1).hexdigest())
  ss = _safe(lambda: fuzzyhashlib.ssdeep(h1).hexdigest())

  ch = []
  if classes is not None:
    for name, content in classes:
      cnil = _safe(lambda: Nilsimsa(content).hexdigest())
      css = _safe(lambda: fuzzyhashlib.ssdeep(content).hexdigest())
      # sdhash needs >= 512 bytes and tlsh >= 256 bytes of input,
      # mirroring the size guards in the original code
      csh = 'None'
      if len(content) >= 512:
        csh = _safe(lambda: fuzzyhashlib.sdhash(content).hexdigest().rstrip())
      cth = 'None'
      if len(content) >= 256:
        cth = _safe(lambda: tlsh.hash(content))
      ch.append((name, cth, csh, cnil, css))
  return th, sh, nil, ss, ch
Beispiel #4
0
def compute_hashes_impl(sample, pe):
    """Compute cryptographic, fuzzy and PE-structure hashes for a sample.

    *sample* is the raw file content; *pe* is a parsed PE object.
    Returns a dict keyed by hash name.
    """
    return {
        "md5": hashlib.md5(sample).hexdigest(),
        "sha1": hashlib.sha1(sample).hexdigest(),
        "sha256": hashlib.sha256(sample).hexdigest(),
        "ssdeep": ssdeep.hash(sample),
        "imphash": pe.get_imphash(),
        "impfuzzy": pyimpfuzzy.get_impfuzzy_data(sample),
        "tlsh": tlsh.hash(sample),
        "totalhash": pehash.totalhash_hex(pe=pe),
        "anymaster": pehash.anymaster_hex(pe=pe),
        "anymaster_v1_0_1": pehash.anymaster_v1_0_1_hex(pe=pe),
        "endgame": pehash.endgame_hex(pe=pe),
        "crits": pehash.crits_hex(pe=pe),
        "pehashng": pehash.pehashng_hex(pe=pe),
    }
Beispiel #5
0
def info_file(path):
    """Print MD5/SHA1/SHA256/SHA512, ssdeep and TLSH digests of a file.

    The file is streamed in 64 KiB chunks for the hashlib digests; the
    full content is also accumulated once because ssdeep and TLSH hash
    the whole buffer, not individual chunks.
    """
    BUF_SIZE = 65536
    md5 = hashlib.md5()
    sha1 = hashlib.sha1()
    sha256 = hashlib.sha256()
    sha512 = hashlib.sha512()
    contents = bytearray()

    with open(path, 'rb') as f:
        while True:
            data = f.read(BUF_SIZE)
            if not data:
                break
            md5.update(data)
            sha1.update(data)
            sha256.update(data)
            sha512.update(data)
            contents += data

    # BUG FIX: the fuzzy hashes were previously recomputed per chunk, so
    # only the final chunk was hashed (and an empty file raised NameError).
    SSDEEP = ssdeep.hash(bytes(contents))
    TLSH = tlsh.hash(bytes(contents))

    print("MD5: {0}".format(md5.hexdigest()))
    print("SHA1: {0}".format(sha1.hexdigest()))
    print("SHA256: {0}".format(sha256.hexdigest()))
    print("SHA512: {0}".format(sha512.hexdigest()))
    print("SSDEEP: {0}".format(SSDEEP))
    print("TLSH: {0}".format(TLSH))
def scanhashes(resolved_path, extrahashes):
    """Compute sha256 plus the hashes named in *extrahashes* for one file.

    Supported extras: 'crc32' (None for files over 2 GiB, zlib's 32-bit
    limit), 'tlsh' (None for files under 256 bytes, TLSH's minimum input
    size), and any algorithm name accepted by hashlib.new().

    Returns a dict mapping hash name -> digest (or None when skipped).
    """
    filehashes = {}
    # binary mode: hashlib/zlib/tlsh operate on bytes (the original opened
    # in text mode, which breaks hashing under Python 3); the context
    # manager also replaces the stray close() that sat inside the loop
    with open(resolved_path, 'rb') as scanfile:
        data = scanfile.read()
    h = hashlib.new('sha256')
    h.update(data)
    filehashes['sha256'] = h.hexdigest()

    # stat once instead of once per requested hash
    filesize = os.stat(resolved_path).st_size
    for i in extrahashes:
        if i == 'crc32':
            if filesize > 2147483647:
                filehashes[i] = None
            else:
                filehashes[i] = zlib.crc32(data) & 0xffffffff
        elif i == 'tlsh':
            if filesize >= 256:
                filehashes[i] = tlsh.hash(data)
            else:
                filehashes[i] = None
        else:
            h = hashlib.new(i)
            h.update(data)
            filehashes[i] = h.hexdigest()
    return filehashes
Beispiel #7
0
    def calculate(self, data):
        """Return the TLSH digest of *data*, or '-' when unavailable.

        TLSH needs at least 50 bytes of input; shorter buffers and empty
        digests both yield the '-' placeholder.
        """
        if len(data) < 50:
            return '-'
        return tlsh.hash(data) or '-'
Beispiel #8
0
    def hash_from_file(cls, file: pathlib.Path) -> str:
        """Return the TLSH hash of a PDF's extracted text, or "" on failure.

        Non-PDF paths and missing optional dependencies (tlsh, pdfminer)
        each produce a UserWarning and an empty string instead of raising.
        """
        if not str(file).endswith(".pdf"):
            warnings.warn("File does not appear to be a pdf. ",
                          category=UserWarning)
            return ""

        try:
            import tlsh
            from pdfminer.converter import TextConverter
            from pdfminer.layout import LAParams
            from pdfminer.pdfdocument import PDFDocument
            from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
            from pdfminer.pdfpage import PDFPage
            from pdfminer.pdfparser import PDFParser
        except:
            warnings.warn(
                "Getting the tlsh hash of a pdf requires additional libraries already be installed; install threatexchange with the [pdf] extra",
                category=UserWarning,
            )
            return ""

        # extract the text of every page, then hash the UTF-8 encoding
        extracted = StringIO()
        with open(file, "rb") as in_file:
            document = PDFDocument(PDFParser(in_file))
            manager = PDFResourceManager()
            converter = TextConverter(manager, extracted, laparams=LAParams())
            interpreter = PDFPageInterpreter(manager, converter)
            for page in PDFPage.create_pages(document):
                interpreter.process_page(page)
        return str(tlsh.hash(extracted.getvalue().encode()))
Beispiel #9
0
    def calculate(self, data):
        """Compute the TLSH digest of *data*.

        Returns an explanatory error string when the buffer is shorter
        than the 50-byte TLSH minimum or the digest comes back empty.
        """
        if len(data) >= 50:
            fingerprint = tlsh.hash(data)
            if fingerprint:
                return fingerprint
            return 'Error: empty hash'
        return 'Error: TLSH requires buffer >= 50 in size ({0:d})'.format(
            len(data))
Beispiel #10
0
 def _create_data(self):
     """Assemble the analysis record for the concatenated data.

     Returns a dict holding the raw bytes plus derived features: sha256,
     entropy, byte histogram, TLSH digest and the set of extracted strings.
     """
     payload = self._concat_data()
     record = {"data": payload}
     record["sha256"] = hashlib.sha256(payload).hexdigest()
     record["entropy"] = entropy(payload)
     record["histogram"] = histogram(payload)
     record["tlsh"] = tlsh.hash(payload)
     record["strings"] = set(word_reg.findall(payload))
     return record
Beispiel #11
0
def file_upload_view(request):
    """Handle a POST file upload: store it, hash it, record File/Hash rows.

    The upload is saved via FileSystemStorage and hashed (MD5/SHA1/SHA256/
    SHA512/ssdeep/TLSH).  A File row is created with first=True only when
    no file with the same SHA256 exists yet.  Always redirects to 'latest'.
    """
    print("File uploaded")
    if request.method == 'POST':
        my_file = request.FILES.get('file')
        fs = FileSystemStorage()
        path = fs.save(my_file.name, my_file)
        BUF_SIZE = 65536
        now = timezone.localtime(timezone.now())
        md5 = hashlib.md5()
        sha256 = hashlib.sha256()
        sha512 = hashlib.sha512()
        sha1 = hashlib.sha1()
        contents = bytearray()

        with open("/root/Desktop/OmegaVirus/OmegaVirusWeb/core/uploads/" + path, 'rb') as f:
            while True:
                data = f.read(BUF_SIZE)
                if not data:
                    break
                sha256.update(data)
                sha512.update(data)
                md5.update(data)
                sha1.update(data)
                # accumulate the whole file: ssdeep and TLSH hash the full
                # buffer (previously only the last chunk was fuzzy-hashed,
                # and an empty upload raised NameError)
                contents += data

        SSDEEP = ssdeep.hash(bytes(contents))
        TLSH = tlsh.hash(bytes(contents))

        # first=True marks the first upload of this exact content; the two
        # original branches differed only in this flag
        is_first = not File.objects.filter(SHA256=sha256.hexdigest()).exists()
        b = File(SHA256="{0}".format(sha256.hexdigest()), name=my_file.name,
                 filepath=path, size=convert_size(my_file.size), date=now,
                 first=is_first)
        b.save()

        h = Hash(SHA256="{0}".format(sha256.hexdigest()), SHA512="{0}".format(sha512.hexdigest()),
                 SHA1="{0}".format(sha1.hexdigest()), MD5="{0}".format(md5.hexdigest()),
                 SSDEEP="{0}".format(SSDEEP), TLSH="{0}".format(TLSH))
        h.save()
    try:
        obj = File.objects.latest('id')
        print("im in")
        # context is currently unused (the render call below is disabled)
        context = {
            "name": obj.name,
            "id": obj.id,
            "SHA256": obj.SHA256,
            "filepath": obj.filepath,
            "size": obj.size,
            "date": obj.date
        }
        return redirect('latest')
        # return render(request, "index.html", context)
    except ValueError:
        return redirect('latest')
Beispiel #12
0
def compute_1(path, force):
    """Return the TLSH hash of the file at *path*.

    When *force* is 1 the hash is computed with tlsh.forcehash(), which
    also works on inputs below TLSH's normal minimum size.  Returns ''
    when the file cannot be read.
    """
    try:
        # 'with' fixes the leaked file handle; the original also mixed a
        # tab-indented 'else:' with space-indented code, which is a
        # syntax error under Python 3
        with open(path, 'rb') as f:
            data = f.read()
        if force == 1:
            return tlsh.forcehash(data)
        return tlsh.hash(data)
    except IOError:
        print('cannot find file: ', path)
    return ''
Beispiel #13
0
def compute_1(path, force):
    """Return the TLSH hash of the file at *path*.

    *force* == 1 selects tlsh.forcehash(), which also hashes inputs below
    the normal TLSH minimum size.  Returns "" when the file is unreadable.
    """
    try:
        # 'with' closes the handle even on error (the original leaked it)
        with open(path, "rb") as f:
            data = f.read()
        if force == 1:
            return tlsh.forcehash(data)
        return tlsh.hash(data)
    except IOError:
        print("cannot find file: ", path)
    return ""
Beispiel #14
0
        def fuzzy_hash(self):
            """Lazily compute and cache the TLSH hash of the assembly text.

            Returns None when TLSH cannot produce a digest (input too
            short or lacking enough randomness); only successful results
            are cached on the instance.
            """
            if hasattr(self, "_fuzzy_hash"):
                return self._fuzzy_hash

            try:
                digest = tlsh.hash(self.asm.encode())
            except ValueError:
                # File must contain a certain amount of randomness
                return None

            # Short files yield an empty digest string; normalize to None.
            self._fuzzy_hash = digest or None
            return self._fuzzy_hash
Beispiel #15
0
    def hash_picture(self, curr_picture):
        """
        Hash a picture with every algorithm enabled in the feature-extractor
        configuration (self.fe_conf) and return the results.

        :param curr_picture: the picture to hash, as raw bytes
        :return: dict mapping algorithm name (e.g. "A_HASH", "TLSH") to its
            hash value; disabled algorithms are absent, and on error the
            dict may be only partially filled
        """
        answer = {}
        self.logger.info("Hashing picture ... ")

        # Convert bytes in PIL image
        pil_picture = Image.open(io.BytesIO(curr_picture))
        self.logger.debug(
            f"Picture converted to PIL Image {type(pil_picture)}")

        # DEBUG # pil_picture.save('/home/user/Desktop/debug_pil.bmp')

        try:
            # Note : @image must be a PIL instance.
            # check_null_hash presumably normalizes empty/None hashes --
            # confirm against its definition elsewhere in the project.
            if self.fe_conf.A_HASH.get("is_enabled", False):
                self.logger.debug("A-HASH ... ")
                answer["A_HASH"] = self.check_null_hash(
                    imagehash.average_hash(pil_picture))
            if self.fe_conf.P_HASH.get("is_enabled", False):
                self.logger.debug("P_HASH ... ")
                answer["P_HASH"] = self.check_null_hash(
                    imagehash.phash(pil_picture))
            if self.fe_conf.P_HASH_SIMPLE.get("is_enabled", False):
                self.logger.debug("P_HASH_SIMPLE ... ")
                answer["P_HASH_SIMPLE"] = self.check_null_hash(
                    imagehash.phash_simple(pil_picture))
            if self.fe_conf.D_HASH.get("is_enabled", False):
                self.logger.debug("D_HASH ... ")
                answer["D_HASH"] = self.check_null_hash(
                    imagehash.dhash(pil_picture))
            if self.fe_conf.D_HASH_VERTICAL.get("is_enabled", False):
                self.logger.debug("D_HASH_VERTICAL ... ")
                answer["D_HASH_VERTICAL"] = self.check_null_hash(
                    imagehash.dhash_vertical(pil_picture))
            if self.fe_conf.W_HASH.get("is_enabled", False):
                self.logger.debug("W_HASH ... ")
                answer["W_HASH"] = self.check_null_hash(
                    imagehash.whash(pil_picture))
            if self.fe_conf.TLSH.get("is_enabled", False):
                self.logger.debug("TLSH ... ")
                # TLSH hashes the raw picture bytes, not the PIL image
                answer["TLSH"] = self.check_null_hash(tlsh.hash(curr_picture))

        except Exception as e:
            self.logger.error("Error during hashing : " + str(e))

        return answer
Beispiel #16
0
 def hash_from_file(cls, file: pathlib.Path) -> str:
     """Return the TLSH hash of the text extracted from a PDF file.

     Warns and returns "" when the path does not end in ".pdf".
     """
     if not str(file).endswith(".pdf"):
         warnings.warn("File does not appear to be a pdf. ",
                       category=UserWarning)
         return ""
     # extract the text of every page, then hash the UTF-8 encoding
     buf = StringIO()
     with open(file, "rb") as in_file:
         document = PDFDocument(PDFParser(in_file))
         manager = PDFResourceManager()
         converter = TextConverter(manager, buf, laparams=LAParams())
         interpreter = PDFPageInterpreter(manager, converter)
         for page in PDFPage.create_pages(document):
             interpreter.process_page(page)
     return str(tlsh.hash(buf.getvalue().encode()))
Beispiel #17
0
def tlsh_score(response, site, alert):
    """
    Calculate the TLSH difference score between the fetched page and the
    site's stored fuzzy hash.

    When the page has drifted far from the stored hash (score > 160) the
    alert level is raised by 4 and the stored hash is refreshed to the
    new page's hash.

    :param response: Http response.
    :param site: Site Object.
    :param alert: Alert Integer.
    :return: alert, score
    :rtype: int, int
    """
    fuzzy_hash = tlsh.hash(bytes(response.text, 'utf-8'))
    # diffxlen compares the digests while ignoring their length component
    score = tlsh.diffxlen(site.content_fuzzy_hash, fuzzy_hash)
    if score > 160:
        alert += 4
        Site.objects.filter(pk=site.pk).update(content_fuzzy_hash=fuzzy_hash)
    return alert, score
Beispiel #18
0
    def hash_picture(self, curr_picture: picture_class.Picture):
        """TLSH-hash the picture file and store the digest on the object.

        Raises when TLSH cannot produce a digest (e.g. input too small or
        too uniform); in that case the hash is first set to an all-zero
        placeholder so downstream code still has a value.

        :param curr_picture: picture object whose .path will be read
        :return: the same picture object with .hash populated
        """
        # read via a context manager so the file handle is always closed
        # (TLSH usage per https://github.com/trendmicro/tlsh)
        with open(curr_picture.path, 'rb') as pic_file:
            target_hash = tlsh.hash(pic_file.read())

        curr_picture.hash = target_hash

        if target_hash is None or target_hash == "":
            # TODO : Better handling of null hashes ?
            curr_picture.hash = '0000000000000000000000000000000000000000000000000000000000000000000000'
            raise Exception(
                f"Target hash is None or null for {curr_picture.path.name}. Hash set to 0s value"
            )

        return curr_picture
def scantlsh(scanqueue, reportqueue, cursor, conn, tlshcutoff):
	## Worker loop: take (directory, filename, sha256) tasks from scanqueue,
	## TLSH-hash the file and look up database rows with the same file name.
	## The candidates with the smallest TLSH distance are reported on
	## reportqueue when that distance is below tlshcutoff. Runs forever;
	## intended as a queue-consumer worker.
	while True:
		(directory, filename, sha256) = scanqueue.get()

		## then compute the TLSH hash and search in the database
		## for the closest file.
		tlshfile = open(os.path.join(directory, filename), 'rb')
		tlshdata = tlshfile.read()
		tlshfile.close()
		tlshhash = tlsh.hash(tlshdata)

		if tlshhash == '':
			## file is either too small or a hash cannot be
			## computed (example: all characters are the same)
			scanqueue.task_done()
			continue

		## now get some candidates: checksums of files with this exact name
		cursor.execute("select distinct checksum from fileinfo where filename=%s", (filename,))
		candidates = cursor.fetchall()
		conn.commit()
		if len(candidates) == 0:
			scanqueue.task_done()
			continue

		## keep every candidate that ties for the smallest TLSH distance
		mostpromising = []
		minhash = sys.maxsize
		for candidate in candidates:
			cursor.execute("select tlsh from hashes where sha256=%s", candidate)
			tlshresult = cursor.fetchone()
			if tlshresult == None:
				continue
			tlshdiff = tlsh.diff(tlshhash, tlshresult[0])
			if tlshdiff < minhash:
				minhash = tlshdiff
				mostpromising = [candidate[0]]
			elif tlshdiff == minhash:
				mostpromising.append(candidate[0])
		if mostpromising != []:
			if minhash < tlshcutoff:
				candidates = []
				for m in mostpromising:
					cursor.execute("select packagename, version, fullfilename from fileinfo where checksum=%s", (m,))
					candidates += cursor.fetchall()
					conn.commit()
				reportqueue.put((directory, filename, candidates, minhash))
		scanqueue.task_done()
Beispiel #20
0
def main():
    """Download each image URL listed in the file given as argv[1],
    fingerprint it (keypoints + TLSH of the packed keypoints) and store
    the result in Postgres."""
    if len(sys.argv) >= 2:
        input_file = sys.argv[1]
    else:
        # BUG FIX: execution previously fell through and crashed with a
        # NameError on input_file; bail out explicitly instead
        print('Forgot file name as arg')
        return

    with open(input_file, 'r') as f:
        urls = f.readlines()

    for u in urls:
        ureq.urlretrieve(u, 'tmp_download.jpg')
        img = im.load_img('tmp_download.jpg')
        os.remove('tmp_download.jpg')
        kp, desc = im.get_keypoints(img)
        pack = dm.pack_keypoints(kp, desc)
        img_hash = tlsh.hash(str(pack).encode('utf-8'))

        dm.write_postgres(img_hash, pack, u)
        print(f'Added: {u}')
def computehash(task):
    """Compute sha256 plus requested extra hashes for one file.

    *task* is a (filedir, filename, extrahashes) tuple -- the original used
    Python 2 tuple-parameter unpacking in the signature, which is a syntax
    error in Python 3, so the tuple is now unpacked inside the function
    (callers still pass a single tuple).

    Returns (filename, hashdict), or None when crc32 computation fails.
    """
    filedir, filename, extrahashes = task
    resolved_path = os.path.join(filedir, filename)
    # binary mode: hash functions and zlib.crc32 operate on bytes
    with open(resolved_path, 'rb') as scanfile:
        filedata = scanfile.read()
    filehashes = {}
    h = hashlib.new('sha256')
    h.update(filedata)
    filehashes['sha256'] = h.hexdigest()

    if 'crc32' in extrahashes:
        try:
            filehashes['crc32'] = zlib.crc32(filedata) & 0xffffffff
        except Exception:
            return None

    if 'tlsh' in extrahashes:
        # TLSH is only defined for inputs of at least 256 bytes
        if os.stat(resolved_path).st_size >= 256:
            filehashes['tlsh'] = tlsh.hash(filedata)
        else:
            filehashes['tlsh'] = None

    # everything else is a plain hashlib algorithm name
    for name in set(extrahashes) - {'crc32', 'tlsh'}:
        h = hashlib.new(name)
        h.update(filedata)
        filehashes[name] = h.hexdigest()
    return (filename, filehashes)
Beispiel #22
0
def _processFile1(filename):
        # Python 2 snippet (print statements, `except Exception, ex`).
        # Collects filename, TLSH and sha256 of a file into TlshStruct and,
        # unless TlshStruct.restrict is set, adds file/certificate details
        # and submits a query.
        # NOTE(review): this example is truncated as captured -- the
        # trailing `finally:` block has no body, so it is not valid code
        # in this form.
        print "Processing %s..." % filename
        _resetFileDetails()
        if getsize(filename) <= 512: TlshStruct.logger.error("File %s too small to compute tlsh value")
        else:
                result = None
                try:
                        TlshStruct.file_basic_details["filename"] = filename
                        TlshStruct.file_basic_details["tlsh"] = tlsh.hash(open(filename, "rb").read())
                        TlshStruct.file_basic_details["sha256"] = _getSha256(filename)
                        if not TlshStruct.restrict:
                                prop_details = efp.getBasicFileProperties(filename)
                                cert_details = efp.getCertificateDetails(filename)
                                TlshStruct.file_prop_details = prop_details if prop_details is not None else {}
                                TlshStruct.file_cert_details = cert_details if cert_details is not None else {}
                                result = _sendQuery()
                except Exception, ex:
                        print "ERROR: Problem in getting tlsh value of %s : %s" % (filename, ex)
                        tlsh_val = "error"
                finally:
Beispiel #23
0
def upload_filer():
    """Flask POST handler: match an uploaded image against stored images.

    Saves the upload, extracts keypoints, TLSH-hashes the packed keypoint
    data, queries Postgres for candidates (or loads a pickled fixture when
    app.testing is set), scores keypoint matches per candidate URL and
    renders the best match (or a no-match page).
    """
    if request.method == 'POST':
        f = request.files['file']
        f.save(f'{FILE_PREFIX}{secure_filename(f.filename)}')

        # Process Image
        img = im.load_img(f'{FILE_PREFIX}{secure_filename(f.filename)}')
        kp, desc = im.get_keypoints(img)
        img_output = dm.pack_keypoints(kp, desc)
        img_hash = tlsh.hash(str(img_output).encode('utf-8'))

        if not app.testing:
            result = dm.query_postgres(img_hash)
        else:
            with open('tests/data_uploader.txt', 'rb') as f:
                result = pickle.load(f)

        # NOTE(review): 'f' is re-bound here, shadowing the uploaded file
        f = open('panic.log', 'w')
        matches = {}
        for item in result:
            tmp_kp, tmp_desc = dm.unpack_keypoints(item[2])
            match_score = im.get_match(desc, tmp_desc)
            matches[item[1]] = len(match_score)
        f.write(f'{matches}')
        f.write(f'{len(matches)}')
        f.close()

        # Return Results
        if len(matches) > 0:
            url = max(matches, key=matches.get)
            # Generate random string for filename
            # This can help prevent browser caching of an image that has changed
            filename = ''.join(
                random.choice(string.ascii_letters) for i in range(10))
            # NOTE(review): the literal '(unknown)' path segments below look
            # like a placeholder where an f-string using 'filename' once
            # stood -- confirm against the original project source
            ureq.urlretrieve(url, f'static/img/(unknown)')
            img_2 = im.load_img(f'static/img/(unknown)')
            os.remove(f'static/img/(unknown)')

            return render_template('result.html', value=url)
        return render_template('nomatch.html')
Beispiel #24
0
    def Calculate(self, string):
        """Compute the digest of *string* using the algorithm in self.name.

        Supported names: "md5", "sha1", "crc", "murmur", "ssdeep", "tlsh".

        Raises:
            ValueError: for an unrecognized algorithm name (the original
                raised an accidental UnboundLocalError instead).
        """
        # use a dedicated local instead of shadowing the builtin hash()
        if self.name == "md5":
            digest = hashlib.md5(string).hexdigest()

        elif self.name == "sha1":
            digest = hashlib.sha1(string).hexdigest()

        elif self.name == "crc":
            crc32 = crcmod.Crc(0x104c11db7, initCrc=0, xorOut=0xFFFFFFFF)
            crc32.update(string)
            digest = crc32.hexdigest()

        elif self.name == "murmur":
            digest = mmh3.hash(string)

        elif self.name == "ssdeep":
            digest = ssdeep.hash(string)

        elif self.name == "tlsh":
            digest = tlsh.hash(string)

        else:
            raise ValueError("unknown hash algorithm: %s" % self.name)

        return digest
def computehash(task):
    """Compute extra hashes for one file whose sha256 is already known.

    *task* is a (filedir, filename, extrahashes, sha256sum) tuple -- the
    original Python 2 tuple-parameter signature is a syntax error in
    Python 3, so the tuple is unpacked inside the function instead
    (callers still pass a single tuple).

    Supported extras: 'crc32', 'tlsh' (None for files under 256 bytes,
    TLSH's minimum input size) and any hashlib algorithm name.

    Returns (filedir, filename, hashdict).
    """
    filedir, filename, extrahashes, sha256sum = task
    resolved_path = os.path.join(filedir, filename)
    filehashes = {'sha256': sha256sum}
    # binary mode: hashlib/zlib/tlsh all operate on bytes
    with open(resolved_path, 'rb') as scanfile:
        data = scanfile.read()
    for i in extrahashes:
        if i == 'crc32':
            filehashes[i] = zlib.crc32(data) & 0xffffffff
        elif i == 'tlsh':
            if os.stat(resolved_path).st_size >= 256:
                filehashes[i] = tlsh.hash(data)
            else:
                filehashes[i] = None
        else:
            h = hashlib.new(i)
            h.update(data)
            filehashes[i] = h.hexdigest()
    return (filedir, filename, filehashes)
Beispiel #26
0
def _processFile1(filename):
    # Python 2 snippet (print statements, `except Exception, ex`).
    # NOTE(review): this example is garbled as captured -- the `finally:`
    # block below has no body, and it is followed by an unrelated
    # directory-walk fragment whose enclosing function (and the names
    # files/root/prelen it uses) is not visible here.
    print "Processing %s..." % filename
    _resetFileDetails()
    if getsize(filename) <= 512:
        TlshStruct.logger.error("File %s too small to compute tlsh value")
    else:
        result = None
        try:
            TlshStruct.file_basic_details["filename"] = filename
            TlshStruct.file_basic_details["tlsh"] = tlsh.hash(
                open(filename, "rb").read())
            TlshStruct.file_basic_details["sha256"] = _getSha256(filename)
            if not TlshStruct.restrict:
                prop_details = efp.getBasicFileProperties(filename)
                cert_details = efp.getCertificateDetails(filename)
                TlshStruct.file_prop_details = prop_details if prop_details is not None else {}
                TlshStruct.file_cert_details = cert_details if cert_details is not None else {}
                result = _sendQuery()
        except Exception, ex:
            print "ERROR: Problem in getting tlsh value of %s : %s" % (
                filename, ex)
            tlsh_val = "error"
        finally:
        # NOTE(review): orphan fragment from a different function starts
        # here -- it records size/permissions and sha1/md5/tlsh per entry.
        for f in files:
            f2 = path.join(root,f)
            try:
                fname ='./'+f2[prelen:] 
                if path.isfile(f2):
                    fsize=path.getsize(f2)
                    stat=os.stat(f2)
                    perm=stat.st_mode
                    uid=stat.st_uid
                    gid=stat.st_gid
                    
                    with open(f2, 'rb') as fin:
                        cont = fin.read()
                        sha1_hash = hashlib.sha1(cont).hexdigest()
                        md5_hash = hashlib.md5(cont).hexdigest()
                        tlsh_hash = tlsh.hash(cont)
                    symtgt=None
                elif path.islink(f2):
                    symtgt=path.realpath(f2)
                    gid=uid=perm=None
                    fsize=None
                    sha1_hash=md5_hash=tlsh_hash=None
                else:
                    gid=uid=perm=None
                    try:
                        fsize=path.getsize(f2)
                    except:
                        fsize=None
                    sha1_hash=md5_hash=tlsh_hash=None

                print(fname, f2)
Beispiel #28
0
def processarchive(scanqueue, resultqueue, sourcesdirectory, unpackprefix):
    ## Worker loop: consume archive tasks from scanqueue, unpack each with
    ## 'tar ixf' into a fresh temporary directory, sha256 every contained
    ## file (plus TLSH for files >= 256 bytes when the module-level
    ## 'usetlsh' flag is set -- defined elsewhere in this file), and push
    ## ('file', ...), ('hashes', ...) and ('archive', ...) batches onto
    ## resultqueue. Runs forever as a queue-consumer worker.
    while True:
        task = scanqueue.get()
        unpackdirectory = tempfile.mkdtemp(dir=unpackprefix)
        ## +1 accounts for the '/' separator when slicing relative paths
        unpackdirectorylen = len(unpackdirectory) + 1
        ## then for each file:
        ## 1. unpack the archive
        ## 2. compute hashes
        ## 3. report results
        p = subprocess.Popen(
            ['tar', 'ixf',
             os.path.join(sourcesdirectory, task['filename'])],
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            cwd=unpackdirectory)
        p.communicate()
        if p.returncode != 0:
            shutil.rmtree(unpackdirectory)
            scanqueue.task_done()
            continue

        results = []
        hashresults = []
        resultcounter = 0
        dirwalk = os.walk(unpackdirectory)
        for direntries in dirwalk:
            ## make sure all subdirectories and files can be accessed
            for subdir in direntries[1]:
                subdirname = os.path.join(direntries[0], subdir)
                if not os.path.islink(subdirname):
                    os.chmod(subdirname,
                             stat.S_IRUSR | stat.S_IWUSR | stat.S_IXUSR)
            for filename in direntries[2]:
                fullfilename = os.path.join(direntries[0], filename)
                if not os.path.islink(fullfilename):
                    os.chmod(fullfilename,
                             stat.S_IRUSR | stat.S_IWUSR | stat.S_IXUSR)

                ## now read the contents of the file
                try:
                    sourcefile = open(fullfilename, 'rb')
                    sourcedata = sourcefile.read()
                    sourcefile.close()
                except:
                    continue

                ## compute hashes
                h = hashlib.new('sha256')
                h.update(sourcedata)
                filehash = h.hexdigest()
                tlshhash = None
                if usetlsh:
                    ## only compute TLSH for files that are 256 bytes are more
                    if len(sourcedata) >= 256:
                        tlshhash = tlsh.hash(sourcedata)
                        hashresults.append({
                            'sha256': filehash,
                            'tlshhash': tlshhash
                        })

                results.append((task['package'], task['version'],
                                fullfilename[unpackdirectorylen:],
                                os.path.basename(fullfilename), filehash))
                ## flush intermediate results to the database per 1000 files
                resultcounter += 1
                if resultcounter % 1000 == 0:
                    resultqueue.put(('file', results))
                    results = []
        resultqueue.put(('file', results))

        if hashresults != []:
            resultqueue.put(('hashes', hashresults))

        shutil.rmtree(unpackdirectory)
        resultqueue.put(('archive', copy.deepcopy(task)))
        print("Queued\n", task['version'])
        sys.stdout.flush()
        scanqueue.task_done()
Beispiel #29
0
import tlsh

# Python 2 script (backquote repr, xrange, print statements): measures how
# the TLSH diff score responds to changing only the final byte of a
# ~1000-entry string, trying all 256 possible replacement bytes.
diff = []

# 256 is the maximum number of single byte changes
for i in xrange(256):
  s1 = ''.join([`chr(j % 256)` for j in xrange(1000)])
  s2 = ''.join([`chr(j % 256)` for j in xrange(999)])

  # replace the last byte of s2 with byte value k, offset by i
  k = (999 - i) % 256
  s2 = s2 + `chr(k)`
  h1 = tlsh.hash(s1)
  h2 = tlsh.hash(s2)
  diff.append(tlsh.diff(h1, h2))

for i in xrange(1,256):  
  print 'diff', i, 'score', diff[i]

# index 0 corresponds to no change at all
print 'diff', 0, 'score', diff[0]
Beispiel #30
0
def compute_1(path):
    """Read the file at *path* and return its TLSH hash."""
    with open(path, 'rb') as f:
        contents = f.read()
    return tlsh.hash(contents)
Beispiel #31
0
def processarchive(scanqueue, resultqueue, sourcesdirectory, unpackprefix, cacheresult, cachedir, processsleep):
	## Worker loop: consume archive tasks from scanqueue, unpack each tar
	## archive, sha256 every contained file (plus TLSH for files >= 256
	## bytes when the module-level 'usetlsh' flag is set -- defined
	## elsewhere in this file), and push ('file', ...), ('hashes', ...) and
	## ('archive', ...) batches onto resultqueue. When cacheresult is set,
	## per-archive results are cached as JSON keyed by the archive's sha256
	## in cachedir and replayed on later runs instead of re-unpacking.
	seensha256 = set()
	while True:
		## grab a new task
		task = scanqueue.get()

		jsonsuccess = False
		## first check if there are already results available and use those instead
		if cacheresult:
			jsonresultfilename = os.path.join(cachedir, "%s.json" % task['sha256'])
			if os.path.exists(jsonresultfilename):
				jsonresultfile = open(jsonresultfilename, 'r')
				try:
					jsonresults = json.load(jsonresultfile)
					jsonsuccess = True
				except:
					## corrupt file
					jsonresultfile.close()
					try:
						os.unlink(jsonresultfilename)
					except:
						pass
				jsonresultfile.close()

		results = []
		hashresults = []
		resultcounter = 0

		if cacheresult and jsonsuccess:
				## split the results
				for f in jsonresults:
					fullfilename = f['fullfilename']
					relativefilename = f['relativefilename']
					filehash = f['sha256']
					results.append((task['package'], task['version'], fullfilename, relativefilename, os.path.basename(fullfilename), filehash))
					if 'tlshhash' in f and not filehash in seensha256:
						hashresults.append({'sha256': filehash, 'tlshhash': f['tlshhash']})
					seensha256.add(filehash)

				## send results to the database
				resultqueue.put(('file', results))
				resultqueue.put(('hashes', hashresults))

				print("Queued\n", task['version'])
				sys.stdout.flush()

				## send the results for the archive to the database
				resultqueue.put(('archive', copy.deepcopy(task)))

				## throttle when memory pressure is high
				memstats = psutil.virtual_memory()
				#swapstats = psutil.swap_memory()
				if memstats.percent > 30:
					time.sleep(processsleep)

				## tell the queue the task is done
				scanqueue.task_done()
				continue

		## create a temporary directory
		unpackdirectory = tempfile.mkdtemp(dir=unpackprefix)

		## store the length of the temporary directory so it can be properly
		## cut from the data that is being stored. Add 1 for the '/' of the path
		unpackdirectorylen = len(unpackdirectory) + 1

		## For each archive:
		## 1. unpack the archive
		## 2. compute hashes for each file found in the archive
		## 3. report results
		if 'tar' in task['filename'].lower():
			try:
				sourcetar = tarfile.open(os.path.join(sourcesdirectory, task['filename']), 'r')
				sourcetar.extractall(path=unpackdirectory)
				sourcetar.close()
			except:
				## tar file could not be unpacked, so stop
				shutil.rmtree(unpackdirectory)
				scanqueue.task_done()
				continue
		else:
			## TODO: add support for ZIP files
			shutil.rmtree(unpackdirectory)
			scanqueue.task_done()
			continue

		cacheresults = []

		## check to see whether or not part the path should be removed first
		## before storing as "relativefilename"
		removetopdirlen = unpackdirectorylen
		removetopdir = False
		if len(os.listdir(unpackdirectory)) == 1:
			topdir = os.listdir(unpackdirectory)[0]
			if topdir == task['package']:
				removetopdir = True
			else:
				for i in ['-', ',', '_', ' ']:
					if topdir == "%s%s%s" % (task['package'], i, task['version']):
						removetopdir = True
						break
			if removetopdir:
				removetopdirlen += len(topdir) + 1

		## walk the directory
		dirwalk = os.walk(unpackdirectory)

		for direntries in dirwalk:
			## make sure all subdirectories and files can be accessed
			for subdir in direntries[1]:
				subdirname = os.path.join(direntries[0], subdir)
				if not os.path.islink(subdirname):
					os.chmod(subdirname, stat.S_IRUSR|stat.S_IWUSR|stat.S_IXUSR)
			for filename in direntries[2]:
				fullfilename = os.path.join(direntries[0], filename)
				if not os.path.islink(fullfilename):
					os.chmod(fullfilename, stat.S_IRUSR|stat.S_IWUSR|stat.S_IXUSR)

				## now read the contents of the file
				try:
					sourcefile = open(fullfilename, 'rb')
					sourcedata = sourcefile.read()
					sourcefile.close()
				except:
					continue

				## compute hashes
				h = hashlib.new('sha256')
				h.update(sourcedata)
				filehash = h.hexdigest()
				tlshhash = None

				results.append((task['package'], task['version'], fullfilename[unpackdirectorylen:], fullfilename[removetopdirlen:], os.path.basename(fullfilename), filehash))
				if cacheresult:
					tmpresult = {'sha256': filehash, 'fullfilename': fullfilename[unpackdirectorylen:], 'relativefilename': fullfilename[removetopdirlen:]}

				if usetlsh:
					## only compute TLSH for files that are 256 bytes are more
					if len(sourcedata) >= 256:
						tlshhash = tlsh.hash(sourcedata)
						if cacheresult:
							tmpresult['tlshhash'] = tlshhash
						if not filehash in seensha256:
							hashresults.append({'sha256': filehash, 'tlshhash': tlshhash})
				seensha256.add(filehash)


				if cacheresult:
					cacheresults.append(tmpresult)

				## send intermediate results to the database, per 1000 files
				resultcounter += 1
				if resultcounter % 1000 == 0:
					resultqueue.put(('file', results))
					resultqueue.put(('hashes', hashresults))
					results = []
					hashresults = []

		## send remaining results to the database
		resultqueue.put(('file', results))

		## send remaining results to the database
		if hashresults != []:
			resultqueue.put(('hashes', hashresults))

		## remove the unpacking directory
		shutil.rmtree(unpackdirectory)

		print("Queued\n", task['version'])
		sys.stdout.flush()

		## send the results for the archive to the database
		resultqueue.put(('archive', copy.deepcopy(task)))

		## persist this archive's results for reuse on the next run
		if cacheresult:
			jsondumpfile = open(os.path.join(cachedir, "%s.json" % task['sha256']), 'w')
			json.dump(cacheresults, jsondumpfile)
			jsondumpfile.close()

		## tell the queue the task is done
		scanqueue.task_done()
Beispiel #32
0
# Birthday-collision experiment on the leading hex digits of TLSH hashes.
# NOTE(review): this is Python 2 code (backticks, xrange, has_key, print
# statements) embedded in an otherwise mostly-Python 3 file.
import random
import tlsh

# NOTE(review): `sys` is used below but never imported in this snippet, and
# `tlsh_3b` is not defined here either -- both presumably come from an
# enclosing context; confirm before running this standalone.
inserts = int(sys.argv[1])
checksums = {}

# Bucket count follows how many leading hex digits are sampled below:
# 2 digits -> 256 buckets, 6 digits -> 16**6 = 16777216 buckets.
if tlsh_3b == '':
  buckets = 256.0
else:
  buckets = 16777216.0

# generate random strings, extract the checksum, compare all the checksums
for i in xrange(inserts):
  # NOTE(review): the backticks are Python 2 repr(), so this joins the repr
  # strings of the characters (e.g. "'\x07'"), not the raw characters --
  # looks unintentional, but it is the original behaviour.
  s = ''.join([`chr(j % 256)` for j in random.sample(xrange(10**9),512)])

  # "checksum" = integer value of the first 2 (or 6) hex digits of the hash.
  if tlsh_3b == '':
    checksum = int(tlsh.hash(s)[:2], 16)
  else:
    checksum = int(tlsh.hash(s)[:6], 16)

  # Tally how many random inputs landed in each bucket.
  if checksums.has_key(checksum):
    checksums[checksum] += 1
  else:
    checksums[checksum] = 1
  
# Expected number of collisions for `inserts` throws into `buckets` bins
# (standard birthday-problem estimate), for comparison with the tally below.
print inserts - buckets * (1.0 - ((buckets - 1)/buckets)**inserts)

for k in checksums.keys():
  if checksums[k] > 1:
    print "collision at ", k, "for", checksums[k], "times"
def scantlsh(scanqueue, reportqueue, cursor, conn, tlshcutoff):
    '''Worker loop: take (directory, filename, sha256) tasks from scanqueue,
    compute the TLSH hash of the file, and search the database for the
    closest entries that share the exact same file name.  If the best TLSH
    distance found is below tlshcutoff, put (directory, filename,
    candidates, distance) on reportqueue.  Runs forever; every task is
    acknowledged with scanqueue.task_done().'''
    while True:
        ## first get the data for a file for which a close match
        ## needs to be computed.
        (directory, filename, sha256) = scanqueue.get()

        ## then compute the TLSH hash; the context manager guarantees the
        ## file is closed even if reading raises.
        with open(os.path.join(directory, filename), 'rb') as tlshfile:
            tlshdata = tlshfile.read()
        tlshhash = tlsh.hash(tlshdata)

        if tlshhash == '':
            ## file is either too small or a hash cannot be
            ## computed (example: all characters are the same)
            scanqueue.task_done()
            continue

        ## now get checksums for files with the exact same name
        cursor.execute(
            "select distinct checksum from fileinfo where filename=%s",
            (filename, ))
        candidates = cursor.fetchall()
        conn.commit()
        if not candidates:
            scanqueue.task_done()
            continue

        ## keep the most promising files in a list
        mostpromising = []

        ## start with the largest possible distance so any real match
        ## replaces it
        minhash = sys.maxsize

        for candidate in candidates:
            ## first grab the TLSH value from the database
            cursor.execute("select tlsh from hashes where sha256=%s",
                           candidate)
            tlshresult = cursor.fetchone()
            if tlshresult is None:
                ## no TLSH hash stored for this checksum
                continue

            ## compute the difference with the TLSH value computed above.
            ## A smaller distance than the current best makes this the new
            ## (sole) best hit; an equal distance joins the list of best
            ## matches.
            tlshdiff = tlsh.diff(tlshhash, tlshresult[0])
            if tlshdiff < minhash:
                minhash = tlshdiff
                mostpromising = [candidate[0]]
            elif tlshdiff == minhash:
                mostpromising.append(candidate[0])

        ## if there are promising files below the TLSH threshold, report
        ## the package information associated with these files.
        ## (use a fresh name instead of reusing/shadowing `candidates`)
        if mostpromising and minhash < tlshcutoff:
            packagecandidates = []
            for m in mostpromising:
                cursor.execute(
                    "select packagename, version, fullfilename from fileinfo where checksum=%s",
                    (m, ))
                packagecandidates += cursor.fetchall()
                conn.commit()
            reportqueue.put((directory, filename, packagecandidates, minhash))
        scanqueue.task_done()
Beispiel #34
0
def compute_1(path):
    """Return the TLSH hash of the file at *path* (read in binary mode)."""
    with open(path, 'rb') as handle:
        contents = handle.read()
    return tlsh.hash(contents)
Beispiel #35
0
 def hash(self, data, alghConfig):
     """Return the TLSH digest of *data*, or '-' (with a warning) when
     TLSH produces an empty result, e.g. for too-small/uniform input."""
     digest = tlsh.hash(data)
     if digest:
         return digest
     debug.warning("TLSH generated empty hash")
     return '-'
Beispiel #36
0
 # Hash one input file (with special handling for PDFs and APKs) and log the
 # result row via RESULTS_WRITER.
 # NOTE(review): this excerpt references names defined elsewhere (filename,
 # filetype, todo, RESULTS_WRITER, apk_handler, pdftotext, mmap) and the
 # trailing try-block is cut off here; its except/finally lives outside
 # this snippet.
 with open(filename, mode='rb') as f:
     if 'PDF document' in filetype:
         try:
             # Hash the extracted text of the PDF rather than its raw bytes;
             # fall back to the raw bytes if text extraction fails.
             pdf = pdftotext.PDF(f)
             readFile = bytes("\n\n".join(pdf), 'UTF-8')
         except:
             readFile = f.read()
     # ===== Start APK handler =====
     elif filename.endswith('.apk'):
         # Queue the APK's unpacked members for later processing, then hash
         # the archive itself.
         filelist = apk_handler(filename)
         todo = todo + filelist
         print("printing todo" + str(todo))
         readFile = f.read()
     else:
         readFile = f.read()
     # Fuzzy hash plus the usual cryptographic digests of the chosen bytes.
     tlshash = tlsh.hash(readFile)
     md5 = hashlib.md5(readFile).hexdigest()
     sha1 = hashlib.sha1(readFile).hexdigest()
     sha256 = hashlib.sha256(readFile).hexdigest()
     CoinCollected = False
     RESULTS_WRITER.writerow([
         tlshash, md5, sha1, sha256, filename, filetype, "NONE",
         "Analysed"
     ])
     try:
         # For PDFs reuse the (possibly extracted-text) bytes; otherwise
         # memory-map the file read-only for the scanning that follows.
         if 'PDF document' in filetype:
             data = readFile
         else:
             data = mmap.mmap(f.fileno(),
                              0,
                              prot=mmap.PROT_READ)
Beispiel #37
0
def main():
    '''Unpack a firmware tarball and register every member in the
    'unpacked_fw' PostgreSQL table: regular files with their SHA1, MD5 and
    TLSH hashes plus size/permission/ownership metadata, symlinks with
    their link target.  The parent firmware ID is parsed from the
    "<id>.tar.gz" file name or taken from the second command line argument.
    Duplicate rows (unique-constraint violations) are silently skipped.'''
    if len(sys.argv)<2:
        print('eg:\n  %s images/123.tar.gz'%(sys.argv[0]))
        return
    fname = sys.argv[1]
    m = re.search(r'(\d+)\.tar\.gz', fname)
    if m:
        parent_id = int(m.group(1))
    elif len(sys.argv)>2:
        parent_id = sys.argv[2]
    else:
        print('please give me firmware image ID')
        sys.exit(1)
    tfile = tarfile.open(fname)

    ## initialise before the try block: the previous version called
    ## db.close() in the finally clause, which raised NameError whenever
    ## psycopg2.connect() itself failed
    db = None
    try:
        db = psycopg2.connect(database='firmware',host='127.0.0.1',
                user='******',password='******')
        cur = db.cursor()

        for mem in tfile.getmembers():
            fname = mem.name
            if mem.isfile():
                f = tfile.extractfile(mem)
                cont = f.read()
                ## hashes/metadata are picked up by the INSERT via locals()
                sha1_hash = hashlib.sha1(cont).hexdigest()
                md5_hash = hashlib.md5(cont).hexdigest()
                filesize = mem.size
                tlsh_hash = tlsh.hash(cont)
                perm=mem.mode
                uid=mem.uid
                gid=mem.gid
                try:
                    cur.execute("INSERT INTO unpacked_fw \
                            (  parent_id,   filename,    sha1_hash,    md5_hash,    tlsh_hash,       filesize, permission, uid, gid) VALUES \
                            (%(parent_id)s, %(fname)s, %(sha1_hash)s, %(md5_hash)s, %(tlsh_hash)s, %(filesize)s, %(perm)s, %(uid)s, %(gid)s )", 
                            locals())
                    db.commit()
                    print(fname)
                except psycopg2.Error as ex:
                    ## duplicates are expected; report anything else
                    if ex.pgcode not in [psqlerr.UNIQUE_VIOLATION]:
                        print(ex)
                    db.rollback()
            elif mem.issym():
                linkpath = mem.linkpath
                try:
                    cur.execute("INSERT INTO unpacked_fw \
                            (  parent_id,   filename,   linkpath) VALUES \
                            (%(parent_id)s, %(fname)s,  %(linkpath)s)", 
                            locals())
                    db.commit()
                    print(fname)
                except psycopg2.Error as ex:
                    if ex.pgcode not in [psqlerr.UNIQUE_VIOLATION]:
                        print(ex)
                    db.rollback()
    except Exception as ex:
        print(ex)
        traceback.print_exc()
    finally:
        ## only close a connection that was actually established
        if db is not None:
            db.close()
Beispiel #38
0
# Print the TLSH hash of the file given as the first command line argument.
# Fix: the original used a Python 2 print statement ("print h1"), which is a
# SyntaxError on Python 3; the rest of this file is predominantly Python 3.
import sys

import tlsh

with open(sys.argv[1], 'rb') as f:
    d = f.read()

# NOTE: tlsh.hash() returns '' for inputs that are too small or too uniform.
h1 = tlsh.hash(d)
print(h1)