def tlshh(path):
    print("[+] Creating tlsh matches")
    result = []
    with open('file_numbers.csv', mode='r') as infile:
        reader = csv.reader(infile)
        mydict = {rows[0]: rows[1] for rows in reader}
    for pair in itertools.combinations(glob.glob(os.path.join(path, '*')), r=2):
        file1 = pair[0]
        file2 = pair[1]
        filename1 = file1.split("/")[1]
        filename2 = file2.split("/")[1]
        h1 = tlsh.hash(open(file1, 'rb').read())
        h1_number = mydict[filename1]
        h2 = tlsh.hash(open(file2, 'rb').read())
        h2_number = mydict[filename2]
        score = tlsh.diff(h1, h2)
        result.append([h1_number, h1, h2_number, h2, 1000 - score])
    with open("matchestlsh.csv", "w", newline="") as f:
        writer = csv.writer(f)
        result.insert(0, ["File 1 Number", "File 1 Hash", "File 2 Number", "File 2 Hash", "Score"])
        writer.writerows(result)
def check_text_similarity(path1, path2):
    file1_str = open(path1, 'rb').read()
    file2_str = open(path2, 'rb').read()
    h1 = tlsh.hash(file1_str)
    h2 = tlsh.hash(file2_str)
    diff = tlsh.diff(h1, h2)
    return diff
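# A minimal usage sketch for check_text_similarity above (assuming
# `import tlsh` is in scope; the file names here are hypothetical).
# Lower tlsh.diff scores mean more similar content: 0 for identical
# inputs, with no fixed upper bound.
if __name__ == '__main__':
    score = check_text_similarity('sample_a.txt', 'sample_b.txt')
    print('TLSH distance:', score)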
def ToHash(h1, h2, classes):
    try:
        th = tlsh.hash(h1)
    except:
        th = 'None'
    try:
        sh = fuzzyhashlib.sdhash(h1).hexdigest().rstrip()
    except:
        sh = 'None'
    try:
        nil = Nilsimsa(h1).hexdigest()
    except:
        nil = 'None'
    try:
        ss = fuzzyhashlib.ssdeep(h1).hexdigest()
    except:
        ss = 'None'
    ch = []
    if classes is not None:
        for c in classes:
            name = c[0]
            content = c[1]
            try:
                cnil = Nilsimsa(content).hexdigest()
            except:
                cnil = 'None'
            try:
                css = fuzzyhashlib.ssdeep(content).hexdigest()
            except:
                css = 'None'
            try:
                csh = 'None'
                if len(content) >= 512:
                    csh = fuzzyhashlib.sdhash(content).hexdigest().rstrip()
            except:
                csh = 'None'
            try:
                cth = 'None'
                if len(content) >= 256:
                    cth = tlsh.hash(content)
            except:
                cth = 'None'
            ch.append((name, cth, csh, cnil, css))
    return th, sh, nil, ss, ch
def compute_hashes_impl(sample, pe):
    md5_value = hashlib.md5(sample).hexdigest()
    sha1_value = hashlib.sha1(sample).hexdigest()
    sha256_value = hashlib.sha256(sample).hexdigest()
    ssdeep_value = ssdeep.hash(sample)
    impfuzzy_value = pyimpfuzzy.get_impfuzzy_data(sample)
    tlsh_value = tlsh.hash(sample)
    totalhash = pehash.totalhash_hex(pe=pe)
    anymaster = pehash.anymaster_hex(pe=pe)
    anymaster_v1_0_1 = pehash.anymaster_v1_0_1_hex(pe=pe)
    endgame = pehash.endgame_hex(pe=pe)
    crits = pehash.crits_hex(pe=pe)
    pehashng = pehash.pehashng_hex(pe=pe)
    imphash = pe.get_imphash()
    return {
        "md5": md5_value,
        "sha1": sha1_value,
        "sha256": sha256_value,
        "ssdeep": ssdeep_value,
        "imphash": imphash,
        "impfuzzy": impfuzzy_value,
        "tlsh": tlsh_value,
        "totalhash": totalhash,
        "anymaster": anymaster,
        "anymaster_v1_0_1": anymaster_v1_0_1,
        "endgame": endgame,
        "crits": crits,
        "pehashng": pehashng,
    }
def info_file(path):
    BUF_SIZE = 65536
    md5 = hashlib.md5()
    sha1 = hashlib.sha1()
    sha256 = hashlib.sha256()
    sha512 = hashlib.sha512()
    # ssdeep and tlsh need the whole file, not just the last chunk read,
    # so collect the chunks while streaming the cryptographic hashes
    contents = b''
    with open(path, 'rb') as f:
        while True:
            data = f.read(BUF_SIZE)
            if not data:
                break
            md5.update(data)
            sha1.update(data)
            sha256.update(data)
            sha512.update(data)
            contents += data
    SSDEEP = ssdeep.hash(contents)
    TLSH = tlsh.hash(contents)
    print("MD5: {0}".format(md5.hexdigest()))
    print("SHA1: {0}".format(sha1.hexdigest()))
    print("SHA256: {0}".format(sha256.hexdigest()))
    print("SHA512: {0}".format(sha512.hexdigest()))
    print("SSDEEP: {0}".format(SSDEEP))
    print("TLSH: {0}".format(TLSH))
def scanhashes(resolved_path, extrahashes):
    filehashes = {}
    # open in binary mode: hashlib, zlib and tlsh all expect bytes
    scanfile = open(resolved_path, 'rb')
    data = scanfile.read()
    scanfile.close()
    h = hashlib.new('sha256')
    h.update(data)
    filehashes['sha256'] = h.hexdigest()
    for i in extrahashes:
        if i == 'crc32':
            if os.stat(resolved_path).st_size > 2147483647:
                filehashes[i] = None
            else:
                filehashes[i] = zlib.crc32(data) & 0xffffffff
        elif i == 'tlsh':
            if os.stat(resolved_path).st_size >= 256:
                filehashes[i] = tlsh.hash(data)
            else:
                filehashes[i] = None
        else:
            h = hashlib.new(i)
            h.update(data)
            filehashes[i] = h.hexdigest()
    return filehashes
def calculate(self, data):
    if len(data) < 50:
        return '-'
    fingerprint = tlsh.hash(data)
    return fingerprint if fingerprint else '-'
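# A hedged note on the length guard above: depending on the py-tlsh version,
# tlsh.hash() on inputs below the minimum (50 bytes in recent releases, 256
# in older ones) either returns an empty string or raises ValueError. A
# sketch tolerating both behaviors:
import tlsh

def safe_tlsh(data: bytes) -> str:
    if len(data) < 50:
        return '-'
    try:
        fingerprint = tlsh.hash(data)
    except ValueError:
        # some versions raise for short or low-entropy input
        return '-'
    return fingerprint or '-'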
def hash_from_file(cls, file: pathlib.Path) -> str:
    if not str(file).endswith(".pdf"):
        warnings.warn("File does not appear to be a pdf.", category=UserWarning)
        return ""
    try:
        import tlsh
        from pdfminer.converter import TextConverter
        from pdfminer.layout import LAParams
        from pdfminer.pdfdocument import PDFDocument
        from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
        from pdfminer.pdfpage import PDFPage
        from pdfminer.pdfparser import PDFParser
    except ImportError:
        warnings.warn(
            "Getting the tlsh hash of a pdf requires additional libraries already be installed; install threatexchange with the [pdf] extra",
            category=UserWarning,
        )
        return ""
    text = StringIO()
    with open(file, "rb") as in_file:
        parser = PDFParser(in_file)
        doc = PDFDocument(parser)
        rsrcmgr = PDFResourceManager()
        device = TextConverter(rsrcmgr, text, laparams=LAParams())
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        for page in PDFPage.create_pages(doc):
            interpreter.process_page(page)
    return str(tlsh.hash(text.getvalue().encode()))
def calculate(self, data):
    if len(data) < 50:
        return 'Error: TLSH requires buffer >= 50 in size ({0:d})'.format(len(data))
    fingerprint = tlsh.hash(data)
    return fingerprint if fingerprint else 'Error: empty hash'
def _create_data(self):
    data = self._concat_data()
    return {
        "data": data,
        "sha256": hashlib.sha256(data).hexdigest(),
        "entropy": entropy(data),
        "histogram": histogram(data),
        "tlsh": tlsh.hash(data),
        "strings": set(word_reg.findall(data)),
    }
def file_upload_view(request):
    print("File uploaded")
    # print(request.FILES)
    if request.method == 'POST':
        my_file = request.FILES.get('file')
        fs = FileSystemStorage()
        path = fs.save(my_file.name, my_file)
        BUF_SIZE = 65536
        now = timezone.localtime(timezone.now())
        md5 = hashlib.md5()
        sha256 = hashlib.sha256()
        sha512 = hashlib.sha512()
        sha1 = hashlib.sha1()
        # ssdeep and tlsh need the whole file, not just the last chunk read,
        # so collect the chunks while streaming the cryptographic hashes
        contents = b''
        with open("/root/Desktop/OmegaVirus/OmegaVirusWeb/core/uploads/" + path, 'rb') as f:
            while True:
                data = f.read(BUF_SIZE)
                if not data:
                    break
                sha256.update(data)
                sha512.update(data)
                md5.update(data)
                sha1.update(data)
                contents += data
        SSDEEP = ssdeep.hash(contents)
        TLSH = tlsh.hash(contents)
        # mark the file as "first" only if this SHA256 has not been seen before
        first = not File.objects.filter(SHA256=sha256.hexdigest()).exists()
        b = File(SHA256=sha256.hexdigest(),
                 name=my_file.name,
                 filepath=path,
                 size=convert_size(my_file.size),
                 date=now,
                 first=first)
        b.save()
        h = Hash(SHA256=sha256.hexdigest(),
                 SHA512=sha512.hexdigest(),
                 SHA1=sha1.hexdigest(),
                 MD5=md5.hexdigest(),
                 SSDEEP=SSDEEP,
                 TLSH=TLSH)
        h.save()
        try:
            obj = File.objects.latest('id')
            print("im in")
            context = {
                "name": obj.name,
                "id": obj.id,
                "SHA256": obj.SHA256,
                "filepath": obj.filepath,
                "size": obj.size,
                "date": obj.date,
            }
            return redirect('latest')
            # return render(request, "index.html", context)
        except ValueError:
            return redirect('latest')
def compute_1(path, force):
    try:
        with open(path, 'rb') as f:
            data = f.read()
        if force == 1:
            hs = tlsh.forcehash(data)
        else:
            hs = tlsh.hash(data)
        return hs
    except IOError:
        print('cannot find file: ', path)
        return ''
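# A usage sketch for compute_1 (hypothetical file names; assumes py-tlsh is
# installed). force=1 falls back to tlsh.forcehash, which can digest inputs
# shorter than the normal minimum.
h_a = compute_1('sample_a.bin', 0)
h_b = compute_1('sample_b.bin', 0)
if h_a and h_b:
    print('distance:', tlsh.diff(h_a, h_b))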
def fuzzy_hash(self):
    if not hasattr(self, "_fuzzy_hash"):
        try:
            hex_digest = tlsh.hash(self.asm.encode())
        except ValueError:
            # File must contain a certain amount of randomness
            return None
        # For short files, the hex_digest is an empty string, so turn
        # it into None
        self._fuzzy_hash = hex_digest or None
    return self._fuzzy_hash
def hash_picture(self, curr_picture):
    """
    Hash a picture and returns the hash value
    :param curr_picture: the picture to hash
    :return: the hashed version of the picture
    """
    answer = {}
    self.logger.info("Hashing picture ... ")

    # Convert bytes in PIL image
    pil_picture = Image.open(io.BytesIO(curr_picture))
    self.logger.debug(f"Picture converted to PIL Image {type(pil_picture)}")

    # DEBUG # pil_picture.save('/home/user/Desktop/debug_pil.bmp')

    try:
        # Note : @image must be a PIL instance.
        if self.fe_conf.A_HASH.get("is_enabled", False):
            self.logger.debug("A-HASH ... ")
            answer["A_HASH"] = self.check_null_hash(imagehash.average_hash(pil_picture))
        if self.fe_conf.P_HASH.get("is_enabled", False):
            self.logger.debug("P_HASH ... ")
            answer["P_HASH"] = self.check_null_hash(imagehash.phash(pil_picture))
        if self.fe_conf.P_HASH_SIMPLE.get("is_enabled", False):
            self.logger.debug("P_HASH_SIMPLE ... ")
            answer["P_HASH_SIMPLE"] = self.check_null_hash(imagehash.phash_simple(pil_picture))
        if self.fe_conf.D_HASH.get("is_enabled", False):
            self.logger.debug("D_HASH ... ")
            answer["D_HASH"] = self.check_null_hash(imagehash.dhash(pil_picture))
        if self.fe_conf.D_HASH_VERTICAL.get("is_enabled", False):
            self.logger.debug("D_HASH_VERTICAL ... ")
            answer["D_HASH_VERTICAL"] = self.check_null_hash(imagehash.dhash_vertical(pil_picture))
        if self.fe_conf.W_HASH.get("is_enabled", False):
            self.logger.debug("W_HASH ... ")
            answer["W_HASH"] = self.check_null_hash(imagehash.whash(pil_picture))
        if self.fe_conf.TLSH.get("is_enabled", False):
            self.logger.debug("TLSH ... ")
            answer["TLSH"] = self.check_null_hash(tlsh.hash(curr_picture))
    except Exception as e:
        self.logger.error("Error during hashing : " + str(e))

    return answer
def hash_from_file(cls, file: pathlib.Path) -> str:
    if not str(file).endswith(".pdf"):
        warnings.warn("File does not appear to be a pdf.", category=UserWarning)
        return ""
    text = StringIO()
    with open(file, "rb") as in_file:
        parser = PDFParser(in_file)
        doc = PDFDocument(parser)
        rsrcmgr = PDFResourceManager()
        device = TextConverter(rsrcmgr, text, laparams=LAParams())
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        for page in PDFPage.create_pages(doc):
            interpreter.process_page(page)
    return str(tlsh.hash(text.getvalue().encode()))
def tlsh_score(response, site, alert):
    """
    Calculate TLSH Score.

    :param response: Http response.
    :param site: Site Object.
    :param alert: Alert Integer.
    :return: alert, score
    :rtype: int, int
    """
    fuzzy_hash = tlsh.hash(bytes(response.text, 'utf-8'))
    score = tlsh.diffxlen(site.content_fuzzy_hash, fuzzy_hash)
    if score > 160:
        alert += 4
    Site.objects.filter(pk=site.pk).update(content_fuzzy_hash=fuzzy_hash)
    return alert, score
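# The snippet above uses tlsh.diffxlen rather than tlsh.diff: diffxlen
# ignores the encoded input-length component of the digests, which suits
# pages whose size legitimately fluctuates between fetches. A small sketch
# of the difference (random inputs, assuming py-tlsh):
import os
import tlsh

h1 = tlsh.hash(os.urandom(4096))
h2 = tlsh.hash(os.urandom(8192))
# with different input sizes, diffxlen is typically <= diff
print(tlsh.diff(h1, h2), tlsh.diffxlen(h1, h2))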
def hash_picture(self, curr_picture: picture_class.Picture):
    # target_hash = tlsh.hash(Image.open(curr_picture.path))
    # From https://github.com/trendmicro/tlsh
    target_hash = tlsh.hash(open(curr_picture.path, 'rb').read())
    curr_picture.hash = target_hash
    if target_hash is None or target_hash == "":
        # TODO : Better handling of null hashes ?
        curr_picture.hash = '0000000000000000000000000000000000000000000000000000000000000000000000'
        raise Exception(
            f"Target hash is None or null for {curr_picture.path.name}. Hash set to 0s value"
        )
    return curr_picture
def scantlsh(scanqueue, reportqueue, cursor, conn, tlshcutoff):
    while True:
        (directory, filename, sha256) = scanqueue.get()

        ## then compute the TLSH hash and search in the database
        ## for the closest file.
        tlshfile = open(os.path.join(directory, filename), 'rb')
        tlshdata = tlshfile.read()
        tlshfile.close()
        tlshhash = tlsh.hash(tlshdata)

        if tlshhash == '':
            ## file is either too small or a hash cannot be
            ## computed (example: all characters are the same)
            scanqueue.task_done()
            continue

        ## now get some candidates
        cursor.execute("select distinct checksum from fileinfo where filename=%s", (filename,))
        candidates = cursor.fetchall()
        conn.commit()
        if len(candidates) == 0:
            scanqueue.task_done()
            continue

        mostpromising = []
        minhash = sys.maxsize
        for candidate in candidates:
            cursor.execute("select tlsh from hashes where sha256=%s", candidate)
            tlshresult = cursor.fetchone()
            if tlshresult is None:
                continue
            tlshdiff = tlsh.diff(tlshhash, tlshresult[0])
            if tlshdiff < minhash:
                minhash = tlshdiff
                mostpromising = [candidate[0]]
            elif tlshdiff == minhash:
                mostpromising.append(candidate[0])
        if mostpromising != []:
            if minhash < tlshcutoff:
                candidates = []
                for m in mostpromising:
                    cursor.execute("select packagename, version, fullfilename from fileinfo where checksum=%s", (m,))
                    candidates += cursor.fetchall()
                conn.commit()
                reportqueue.put((directory, filename, candidates, minhash))
        scanqueue.task_done()
def main():
    if len(sys.argv) >= 2:
        input_file = sys.argv[1]
    else:
        # bail out, otherwise input_file would be undefined below
        print('Forgot file name as arg')
        return
    with open(input_file, 'r') as f:
        urls = f.readlines()
    for u in urls:
        ureq.urlretrieve(u, 'tmp_download.jpg')
        img = im.load_img('tmp_download.jpg')
        os.remove('tmp_download.jpg')
        kp, desc = im.get_keypoints(img)
        pack = dm.pack_keypoints(kp, desc)
        img_hash = tlsh.hash(str(pack).encode('utf-8'))
        dm.write_postgres(img_hash, pack, u)
        print(f'Added: {u}')
def computehash(args):
    # Python 3 no longer allows tuple parameters, so unpack manually
    (filedir, filename, extrahashes) = args
    filehashes = {}
    resolved_path = os.path.join(filedir, filename)
    # binary mode: hashlib, zlib and tlsh all expect bytes
    scanfile = open(resolved_path, 'rb')
    filedata = scanfile.read()
    scanfile.close()
    h = hashlib.new('sha256')
    h.update(filedata)
    filehashes['sha256'] = h.hexdigest()
    if 'crc32' in extrahashes:
        try:
            filehashes['crc32'] = zlib.crc32(filedata) & 0xffffffff
        except:
            return None
    if 'tlsh' in extrahashes:
        if os.stat(resolved_path).st_size >= 256:
            filehashes['tlsh'] = tlsh.hash(filedata)
        else:
            filehashes['tlsh'] = None
    ## first remove 'crc32' from extrahashes
    extrahashesset = set(extrahashes)
    try:
        extrahashesset.remove('crc32')
    except KeyError:
        pass
    ## then remove 'tlsh' from extrahashes
    try:
        extrahashesset.remove('tlsh')
    except KeyError:
        pass
    temphashes = {}
    for i in extrahashesset:
        temphashes[i] = hashlib.new(i)
    for i in extrahashesset:
        temphashes[i].update(filedata)
    for i in extrahashesset:
        filehashes[i] = temphashes[i].hexdigest()
    return (filename, filehashes)
def _processFile1(filename):
    print("Processing %s..." % filename)
    _resetFileDetails()
    if getsize(filename) <= 512:
        TlshStruct.logger.error("File %s too small to compute tlsh value", filename)
    else:
        result = None
        try:
            TlshStruct.file_basic_details["filename"] = filename
            TlshStruct.file_basic_details["tlsh"] = tlsh.hash(open(filename, "rb").read())
            TlshStruct.file_basic_details["sha256"] = _getSha256(filename)
            if not TlshStruct.restrict:
                prop_details = efp.getBasicFileProperties(filename)
                cert_details = efp.getCertificateDetails(filename)
                TlshStruct.file_prop_details = prop_details if prop_details is not None else {}
                TlshStruct.file_cert_details = cert_details if cert_details is not None else {}
            result = _sendQuery()
        except Exception as ex:
            print("ERROR: Problem in getting tlsh value of %s : %s" % (filename, ex))
            tlsh_val = "error"
        finally:
            pass
def upload_filer():
    if request.method == 'POST':
        f = request.files['file']
        f.save(f'{FILE_PREFIX}{secure_filename(f.filename)}')
        # Process Image
        img = im.load_img(f'{FILE_PREFIX}{secure_filename(f.filename)}')
        kp, desc = im.get_keypoints(img)
        img_output = dm.pack_keypoints(kp, desc)
        img_hash = tlsh.hash(str(img_output).encode('utf-8'))
        if not app.testing:
            result = dm.query_postgres(img_hash)
        else:
            with open('tests/data_uploader.txt', 'rb') as infile:
                result = pickle.load(infile)
        matches = {}
        for item in result:
            tmp_kp, tmp_desc = dm.unpack_keypoints(item[2])
            match_score = im.get_match(desc, tmp_desc)
            matches[item[1]] = len(match_score)
        # use a separate handle for the log so the upload handle `f` is not clobbered
        with open('panic.log', 'w') as log:
            log.write(f'{matches}')
            log.write(f'{len(matches)}')
        # Return Results
        if len(matches) > 0:
            url = max(matches, key=matches.get)
            # Generate random string for filename
            # This can help prevent browser caching of an image that has changed
            filename = ''.join(random.choice(string.ascii_letters) for i in range(10))
            ureq.urlretrieve(url, f'static/img/{filename}')
            img_2 = im.load_img(f'static/img/{filename}')
            os.remove(f'static/img/{filename}')
            return render_template('result.html', value=url)
        return render_template('nomatch.html')
def Calculate(self, string):
    if self.name == "md5":
        hash = hashlib.md5(string).hexdigest()
    elif self.name == "sha1":
        hash = hashlib.sha1(string).hexdigest()
    elif self.name == "crc":
        crc32 = crcmod.Crc(0x104c11db7, initCrc=0, xorOut=0xFFFFFFFF)
        crc32.update(string)
        hash = crc32.hexdigest()
    elif self.name == "murmur":
        hash = mmh3.hash(string)
    elif self.name == "ssdeep":
        hash = ssdeep.hash(string)
    elif self.name == "tlsh":
        hash = tlsh.hash(string)
    return hash
def computehash(args):
    # Python 3 no longer allows tuple parameters, so unpack manually
    (filedir, filename, extrahashes, sha256sum) = args
    resolved_path = os.path.join(filedir, filename)
    filehashes = {}
    filehashes['sha256'] = sha256sum
    # binary mode: hashlib, zlib and tlsh all expect bytes
    scanfile = open(resolved_path, 'rb')
    data = scanfile.read()
    scanfile.close()
    for i in extrahashes:
        if i == 'crc32':
            filehashes[i] = zlib.crc32(data) & 0xffffffff
        elif i == 'tlsh':
            if os.stat(resolved_path).st_size >= 256:
                filehashes[i] = tlsh.hash(data)
            else:
                filehashes[i] = None
        else:
            h = hashlib.new(i)
            h.update(data)
            filehashes[i] = h.hexdigest()
    return (filedir, filename, filehashes)
for f in files:
    f2 = path.join(root, f)
    try:
        fname = './' + f2[prelen:]
        if path.isfile(f2):
            fsize = path.getsize(f2)
            stat = os.stat(f2)
            perm = stat.st_mode
            uid = stat.st_uid
            gid = stat.st_gid
            with open(f2, 'rb') as fin:
                cont = fin.read()
            sha1_hash = hashlib.sha1(cont).hexdigest()
            md5_hash = hashlib.md5(cont).hexdigest()
            tlsh_hash = tlsh.hash(cont)
            symtgt = None
        elif path.islink(f2):
            symtgt = path.realpath(f2)
            gid = uid = perm = None
            fsize = None
            sha1_hash = md5_hash = tlsh_hash = None
        else:
            gid = uid = perm = None
            try:
                fsize = path.getsize(f2)
            except OSError:
                fsize = None
            sha1_hash = md5_hash = tlsh_hash = None
        print(fname, f2)
    except Exception as ex:
        # assumed handler: the outer try's except clause is missing from the
        # snippet, but one is required for this to be valid Python
        print(ex)
def processarchive(scanqueue, resultqueue, sourcesdirectory, unpackprefix):
    while True:
        task = scanqueue.get()
        unpackdirectory = tempfile.mkdtemp(dir=unpackprefix)
        unpackdirectorylen = len(unpackdirectory) + 1

        ## then for each file:
        ## 1. unpack the archive
        ## 2. compute hashes
        ## 3. report results
        p = subprocess.Popen(
            ['tar', 'ixf', os.path.join(sourcesdirectory, task['filename'])],
            stdout=subprocess.PIPE, stderr=subprocess.PIPE, cwd=unpackdirectory)
        p.communicate()
        if p.returncode != 0:
            shutil.rmtree(unpackdirectory)
            scanqueue.task_done()
            continue

        results = []
        hashresults = []
        resultcounter = 0
        dirwalk = os.walk(unpackdirectory)
        for direntries in dirwalk:
            ## make sure all subdirectories and files can be accessed
            for subdir in direntries[1]:
                subdirname = os.path.join(direntries[0], subdir)
                if not os.path.islink(subdirname):
                    os.chmod(subdirname, stat.S_IRUSR | stat.S_IWUSR | stat.S_IXUSR)
            for filename in direntries[2]:
                fullfilename = os.path.join(direntries[0], filename)
                if not os.path.islink(fullfilename):
                    os.chmod(fullfilename, stat.S_IRUSR | stat.S_IWUSR | stat.S_IXUSR)
                ## now read the contents of the file
                try:
                    sourcefile = open(fullfilename, 'rb')
                    sourcedata = sourcefile.read()
                    sourcefile.close()
                except:
                    continue
                ## compute hashes
                h = hashlib.new('sha256')
                h.update(sourcedata)
                filehash = h.hexdigest()
                tlshhash = None
                if usetlsh:
                    ## only compute TLSH for files that are 256 bytes or more
                    if len(sourcedata) >= 256:
                        tlshhash = tlsh.hash(sourcedata)
                        hashresults.append({'sha256': filehash, 'tlshhash': tlshhash})
                results.append((task['package'], task['version'],
                                fullfilename[unpackdirectorylen:],
                                os.path.basename(fullfilename), filehash))
                resultcounter += 1
                if resultcounter % 1000 == 0:
                    resultqueue.put(('file', results))
                    results = []
        resultqueue.put(('file', results))
        if hashresults != []:
            resultqueue.put(('hashes', hashresults))
        shutil.rmtree(unpackdirectory)
        resultqueue.put(('archive', copy.deepcopy(task)))
        print("Queued\n", task['version'])
        sys.stdout.flush()
        scanqueue.task_done()
import tlsh

diff = []

# 256 is the maximum number of single byte changes
for i in range(256):
    s1 = bytes(j % 256 for j in range(1000))
    s2 = bytes(j % 256 for j in range(999))
    k = (999 - i) % 256
    s2 = s2 + bytes([k])
    h1 = tlsh.hash(s1)
    h2 = tlsh.hash(s2)
    diff.append(tlsh.diff(h1, h2))

for i in range(1, 256):
    print('diff', i, 'score', diff[i])

# 0 has no change
print('diff', 0, 'score', diff[0])
def compute_1(path):
    with open(path, 'rb') as f:
        data = f.read()
    hs = tlsh.hash(data)
    return hs
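# For large files, py-tlsh also exposes an incremental API, so a streaming
# variant of compute_1 could look like the sketch below (Tlsh.final() must
# be called before hexdigest()).
import tlsh

def compute_1_streaming(path, bufsize=65536):
    t = tlsh.Tlsh()
    with open(path, 'rb') as f:
        while True:
            chunk = f.read(bufsize)
            if not chunk:
                break
            t.update(chunk)
    t.final()
    return t.hexdigest()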
def processarchive(scanqueue, resultqueue, sourcesdirectory, unpackprefix, cacheresult, cachedir, processsleep):
    seensha256 = set()
    while True:
        ## grab a new task
        task = scanqueue.get()
        jsonsuccess = False

        ## first check if there are already results available and use those instead
        if cacheresult:
            jsonresultfilename = os.path.join(cachedir, "%s.json" % task['sha256'])
            if os.path.exists(jsonresultfilename):
                jsonresultfile = open(jsonresultfilename, 'r')
                try:
                    jsonresults = json.load(jsonresultfile)
                    jsonsuccess = True
                except:
                    ## corrupt file
                    jsonresultfile.close()
                    try:
                        os.unlink(jsonresultfilename)
                    except:
                        pass
                jsonresultfile.close()

        results = []
        hashresults = []
        resultcounter = 0

        if cacheresult and jsonsuccess:
            ## split the results
            for f in jsonresults:
                fullfilename = f['fullfilename']
                relativefilename = f['relativefilename']
                filehash = f['sha256']
                results.append((task['package'], task['version'], fullfilename,
                                relativefilename, os.path.basename(fullfilename), filehash))
                if 'tlshhash' in f and not filehash in seensha256:
                    hashresults.append({'sha256': filehash, 'tlshhash': f['tlshhash']})
                    seensha256.add(filehash)
            ## send results to the database
            resultqueue.put(('file', results))
            resultqueue.put(('hashes', hashresults))
            print("Queued\n", task['version'])
            sys.stdout.flush()
            ## send the results for the archive to the database
            resultqueue.put(('archive', copy.deepcopy(task)))
            memstats = psutil.virtual_memory()
            #swapstats = psutil.swap_memory()
            if memstats.percent > 30:
                time.sleep(processsleep)
            ## tell the queue the task is done
            scanqueue.task_done()
            continue

        ## create a temporary directory
        unpackdirectory = tempfile.mkdtemp(dir=unpackprefix)

        ## store the length of the temporary directory so it can be properly
        ## cut from the data that is being stored. Add 1 for the '/' of the path
        unpackdirectorylen = len(unpackdirectory) + 1

        ## For each archive:
        ## 1. unpack the archive
        ## 2. compute hashes for each file found in the archive
        ## 3. report results
        if 'tar' in task['filename'].lower():
            try:
                sourcetar = tarfile.open(os.path.join(sourcesdirectory, task['filename']), 'r')
                sourcetar.extractall(path=unpackdirectory)
                sourcetar.close()
            except:
                ## tar file could not be unpacked, so stop
                shutil.rmtree(unpackdirectory)
                scanqueue.task_done()
                continue
        else:
            ## TODO: add support for ZIP files
            shutil.rmtree(unpackdirectory)
            scanqueue.task_done()
            continue

        cacheresults = []

        ## check to see whether or not part of the path should be removed first
        ## before storing as "relativefilename"
        removetopdirlen = unpackdirectorylen
        removetopdir = False
        if len(os.listdir(unpackdirectory)) == 1:
            topdir = os.listdir(unpackdirectory)[0]
            if topdir == task['package']:
                removetopdir = True
            else:
                for i in ['-', ',', '_', ' ']:
                    if topdir == "%s%s%s" % (task['package'], i, task['version']):
                        removetopdir = True
                        break
        if removetopdir:
            removetopdirlen += len(topdir) + 1

        ## walk the directory
        dirwalk = os.walk(unpackdirectory)
        for direntries in dirwalk:
            ## make sure all subdirectories and files can be accessed
            for subdir in direntries[1]:
                subdirname = os.path.join(direntries[0], subdir)
                if not os.path.islink(subdirname):
                    os.chmod(subdirname, stat.S_IRUSR | stat.S_IWUSR | stat.S_IXUSR)
            for filename in direntries[2]:
                fullfilename = os.path.join(direntries[0], filename)
                if not os.path.islink(fullfilename):
                    os.chmod(fullfilename, stat.S_IRUSR | stat.S_IWUSR | stat.S_IXUSR)
                ## now read the contents of the file
                try:
                    sourcefile = open(fullfilename, 'rb')
                    sourcedata = sourcefile.read()
                    sourcefile.close()
                except:
                    continue
                ## compute hashes
                h = hashlib.new('sha256')
                h.update(sourcedata)
                filehash = h.hexdigest()
                tlshhash = None
                results.append((task['package'], task['version'],
                                fullfilename[unpackdirectorylen:],
                                fullfilename[removetopdirlen:],
                                os.path.basename(fullfilename), filehash))
                if cacheresult:
                    tmpresult = {'sha256': filehash,
                                 'fullfilename': fullfilename[unpackdirectorylen:],
                                 'relativefilename': fullfilename[removetopdirlen:]}
                if usetlsh:
                    ## only compute TLSH for files that are 256 bytes or more
                    if len(sourcedata) >= 256:
                        tlshhash = tlsh.hash(sourcedata)
                        if cacheresult:
                            tmpresult['tlshhash'] = tlshhash
                    if not filehash in seensha256:
                        hashresults.append({'sha256': filehash, 'tlshhash': tlshhash})
                        seensha256.add(filehash)
                if cacheresult:
                    cacheresults.append(tmpresult)
                ## send intermediate results to the database, per 1000 files
                resultcounter += 1
                if resultcounter % 1000 == 0:
                    resultqueue.put(('file', results))
                    resultqueue.put(('hashes', hashresults))
                    results = []
                    hashresults = []

        ## send remaining results to the database
        resultqueue.put(('file', results))
        if hashresults != []:
            resultqueue.put(('hashes', hashresults))

        ## remove the unpacking directory
        shutil.rmtree(unpackdirectory)
        print("Queued\n", task['version'])
        sys.stdout.flush()

        ## send the results for the archive to the database
        resultqueue.put(('archive', copy.deepcopy(task)))
        if cacheresult:
            jsondumpfile = open(os.path.join(cachedir, "%s.json" % task['sha256']), 'w')
            json.dump(cacheresults, jsondumpfile)
            jsondumpfile.close()

        ## tell the queue the task is done
        scanqueue.task_done()
import random
import sys

import tlsh

inserts = int(sys.argv[1])
checksums = {}

# tlsh_3b is expected to be set elsewhere (it selects between the 1-byte
# and 3-byte checksum builds of TLSH)
if tlsh_3b == '':
    buckets = 256.0
else:
    buckets = 16777216.0

# generate random strings, extract the checksum, compare all the checksums
for i in range(inserts):
    s = bytes(j % 256 for j in random.sample(range(10**9), 512))
    if tlsh_3b == '':
        checksum = int(tlsh.hash(s)[:2], 16)
    else:
        checksum = int(tlsh.hash(s)[:6], 16)
    if checksum in checksums:
        checksums[checksum] += 1
    else:
        checksums[checksum] = 1

print(inserts - buckets * (1.0 - ((buckets - 1) / buckets)**inserts))

for k in checksums.keys():
    if checksums[k] > 1:
        print("collision at", k, "for", checksums[k], "times")
def scantlsh(scanqueue, reportqueue, cursor, conn, tlshcutoff):
    while True:
        ## first get the data for a file for which a close match
        ## needs to be computed.
        (directory, filename, sha256) = scanqueue.get()

        ## then compute the TLSH hash and search in the database
        ## for the closest files.
        tlshfile = open(os.path.join(directory, filename), 'rb')
        tlshdata = tlshfile.read()
        tlshfile.close()
        tlshhash = tlsh.hash(tlshdata)

        if tlshhash == '':
            ## file is either too small or a hash cannot be
            ## computed (example: all characters are the same)
            scanqueue.task_done()
            continue

        ## now get checksums for files with the exact same name
        cursor.execute("select distinct checksum from fileinfo where filename=%s", (filename,))
        candidates = cursor.fetchall()
        conn.commit()
        if len(candidates) == 0:
            scanqueue.task_done()
            continue

        ## keep the most promising files in a list
        mostpromising = []

        ## first set the value for the found hash very high
        minhash = sys.maxsize

        for candidate in candidates:
            ## first grab the TLSH value from the database
            cursor.execute("select tlsh from hashes where sha256=%s", candidate)
            tlshresult = cursor.fetchone()
            if tlshresult is None:
                continue

            ## compute the difference with the TLSH value computed above
            ## if the distance is smaller than the distance of the current
            ## best hit, then this will be the new best hit. If it is the
            ## same it is added to the list of best matches.
            tlshdiff = tlsh.diff(tlshhash, tlshresult[0])
            if tlshdiff < minhash:
                minhash = tlshdiff
                mostpromising = [candidate[0]]
            elif tlshdiff == minhash:
                mostpromising.append(candidate[0])

        ## if there are promising files and they are below a specific TLSH
        ## threshold, return the information associated with these files.
        if mostpromising != []:
            if minhash < tlshcutoff:
                candidates = []
                for m in mostpromising:
                    cursor.execute("select packagename, version, fullfilename from fileinfo where checksum=%s", (m,))
                    candidates += cursor.fetchall()
                conn.commit()
                reportqueue.put((directory, filename, candidates, minhash))
        scanqueue.task_done()
def hash(self, data, alghConfig):
    retdata = tlsh.hash(data)
    if not retdata:
        debug.warning("TLSH generated empty hash")
        retdata = '-'
    return retdata
with open(filename, mode='rb') as f:
    if 'PDF document' in filetype:
        try:
            pdf = pdftotext.PDF(f)
            readFile = bytes("\n\n".join(pdf), 'UTF-8')
        except:
            readFile = f.read()
    # ===== Start APK handler =====
    elif filename.endswith('.apk'):
        filelist = apk_handler(filename)
        todo = todo + filelist
        print("printing todo" + str(todo))
        readFile = f.read()
    else:
        readFile = f.read()
    tlshash = tlsh.hash(readFile)
    md5 = hashlib.md5(readFile).hexdigest()
    sha1 = hashlib.sha1(readFile).hexdigest()
    sha256 = hashlib.sha256(readFile).hexdigest()
    CoinCollected = False
    RESULTS_WRITER.writerow([tlshash, md5, sha1, sha256, filename, filetype, "NONE", "Analysed"])
    try:
        if 'PDF document' in filetype:
            data = readFile
        else:
            data = mmap.mmap(f.fileno(), 0, prot=mmap.PROT_READ)
def main():
    if len(sys.argv) < 2:
        print('eg:\n %s images/123.tar.gz' % (sys.argv[0]))
        return
    fname = sys.argv[1]
    m = re.search(r'(\d+)\.tar\.gz', fname)
    if m:
        parent_id = int(m.group(1))
    elif len(sys.argv) > 2:
        parent_id = sys.argv[2]
    else:
        print('please give me firmware image ID')
        sys.exit(1)
    tfile = tarfile.open(fname)
    try:
        db = psycopg2.connect(database='firmware', host='127.0.0.1',
                              user='******', password='******')
        cur = db.cursor()
        for mem in tfile.getmembers():
            fname = mem.name
            if mem.isfile():
                f = tfile.extractfile(mem)
                cont = f.read()
                sha1_hash = hashlib.sha1(cont).hexdigest()
                md5_hash = hashlib.md5(cont).hexdigest()
                filesize = mem.size
                tlsh_hash = tlsh.hash(cont)
                perm = mem.mode
                uid = mem.uid
                gid = mem.gid
                try:
                    cur.execute("INSERT INTO unpacked_fw \
                        (parent_id, filename, sha1_hash, md5_hash, tlsh_hash, filesize, permission, uid, gid) VALUES \
                        (%(parent_id)s, %(fname)s, %(sha1_hash)s, %(md5_hash)s, %(tlsh_hash)s, %(filesize)s, %(perm)s, %(uid)s, %(gid)s)", locals())
                    db.commit()
                    print(fname)
                except psycopg2.Error as ex:
                    if ex.pgcode not in [psqlerr.UNIQUE_VIOLATION]:
                        print(ex)
                    db.rollback()
            elif mem.issym():
                linkpath = mem.linkpath
                try:
                    cur.execute("INSERT INTO unpacked_fw \
                        (parent_id, filename, linkpath) VALUES \
                        (%(parent_id)s, %(fname)s, %(linkpath)s)", locals())
                    db.commit()
                    print(fname)
                except psycopg2.Error as ex:
                    if ex.pgcode not in [psqlerr.UNIQUE_VIOLATION]:
                        print(ex)
                    db.rollback()
    except Exception as ex:
        print(ex)
        traceback.print_exc()
    finally:
        db.close()
import sys

import tlsh

with open(sys.argv[1], 'rb') as f:
    d = f.read()

h1 = tlsh.hash(d)
print(h1)