def match(mod_data1, mod_data2):
    """Report whether two records carry similar TLSH digests.

    Both arguments are indexed with the key ``"tlsh"`` and the two values
    are handed to ``tlsh.diff``.

    Returns ``True`` when the difference score is below 50. Returns
    ``False`` when the score is 50 or more, or when the comparison raises
    (for example a missing key or a digest that ``tlsh.diff`` rejects).
    """
    try:
        diff = tlsh.diff(mod_data1["tlsh"], mod_data2["tlsh"])
    except Exception:
        # Best-effort comparison: any failure counts as "no match".
        # ``Exception`` instead of a bare ``except`` so that
        # KeyboardInterrupt and SystemExit still propagate.
        return False
    return diff < 50
def _compare_tlsh(value1, value2): if value1 == "TNULL" and value2 == "TNULL": return 0 elif value1 == "TNULL" or value2 == "TNULL": return 1 else: return min(tlsh.diff(value1, value2) / 100, 1)
def compute_distance(file1, file2):
    """Return a distance between two file objects.

    The TLSH difference of ``file1.fuzzy_hash`` and ``file2.fuzzy_hash`` is
    returned when ``tlsh.diff`` accepts them. If it raises ``TypeError`` or
    ``ValueError``, the distance is derived from the size of the output of
    ``cmp -l`` on the two paths instead, with a minimum of 1.
    """
    try:
        return tlsh.diff(file1.fuzzy_hash, file2.fuzzy_hash)
    except (TypeError, ValueError):
        # File is too small or doesn't have enough randomness
        pass

    # Fallback: estimate the proportion of bytes changed.
    left = str(file1.path)
    right = str(file2.path)
    largest_size = max(get_file_size(left), get_file_size(right))

    command = ["cmp", "-l", left, right]
    try:
        listing = subprocess.check_output(command, stderr=subprocess.DEVNULL)
    except subprocess.CalledProcessError as err:
        # When files are different, cmp has an exit code of 1
        listing = err.output

    # The output size is multiplied by a constant so the result is of the
    # same order of magnitude as TLSH's distance; the floor is 1.
    scaled = int(10 * len(listing) / max(1, largest_size))
    return max(scaled, 1)
def check_text_similarity(path1, path2):
    """Return the TLSH difference score between the contents of two files.

    Both files are read fully in binary mode, hashed with ``tlsh.hash`` and
    the two digests are compared with ``tlsh.diff``.

    :param path1: path of the first file
    :param path2: path of the second file
    :return: the value returned by ``tlsh.diff`` for the two digests
    """
    # Context managers close the handles even when reading fails.
    with open(path1, 'rb') as first:
        h1 = tlsh.hash(first.read())
    with open(path2, 'rb') as second:
        h2 = tlsh.hash(second.read())
    return tlsh.diff(h1, h2)
def perform_fuzzy_matching(members1, members2):
    """Yield ``(name1, name2, score)`` for fuzzy-matched members.

    Each mapping goes from a name to a 2-tuple whose first item is a file
    object exposing ``is_directory()`` and ``fuzzy_hash``. For every
    eligible member of ``members1`` the member of ``members2`` with the
    lowest ``tlsh.diff`` score is selected; the pair is yielded when that
    score is below ``Config().fuzzy_threshold``. A member of ``members2``
    is yielded at most once.

    Nothing is yielded when ``tlsh`` is ``None`` or the threshold is 0.
    """
    if tlsh is None or Config().fuzzy_threshold == 0:
        return
    already_compared = set()
    # Perform local copies because they will be modified by consumer
    members1 = dict(members1)
    members2 = dict(members2)
    for name1, (file1, _) in members1.items():
        if file1.is_directory() or not file1.fuzzy_hash:
            continue
        comparisons = [
            (tlsh.diff(file1.fuzzy_hash, file2.fuzzy_hash), name2)
            for name2, (file2, _) in members2.items()
            if name2 not in already_compared
            and not file2.is_directory()
            and file2.fuzzy_hash
        ]
        if not comparisons:
            continue
        # min() returns the first lowest-scoring entry, which is what a
        # stable sort followed by [0] selected, without sorting the rest.
        score, name2 = min(comparisons, key=operator.itemgetter(0))
        logger.debug('fuzzy top match %s %s: %d difference score', name1, name2, score)
        if score < Config().fuzzy_threshold:
            yield name1, name2, score
            already_compared.add(name2)
def tlshh(path):
    """Write pairwise TLSH scores for every entry of ``path`` to a CSV file.

    ``file_numbers.csv`` (read from the current directory) maps a file name
    (first column) to a number (second column). Every entry matched by
    ``<path>/*`` is hashed with ``tlsh.hash``; each unordered pair produces
    a row ``[number1, hash1, number2, hash2, 1000 - tlsh.diff(hash1, hash2)]``
    in ``matchestlsh.csv``, below a header row.

    :param path: directory whose entries are compared
    :raises KeyError: when an entry's file name is missing from
        ``file_numbers.csv``
    """
    print("[+] Creating tlsh matches")
    with open('file_numbers.csv', mode='r') as infile:
        mydict = {rows[0]: rows[1] for rows in csv.reader(infile)}

    def describe(filepath):
        """Return ``(number, tlsh_hash)`` for one file."""
        with open(filepath, 'rb') as handle:
            digest = tlsh.hash(handle.read())
        # basename() gives the file name for any directory depth, whereas
        # split("/")[1] was only right for a single relative component.
        return mydict[os.path.basename(filepath)], digest

    # Hash each file once rather than once per pair it takes part in.
    described = [describe(filepath)
                 for filepath in glob.glob(os.path.join(path, '*'))]

    result = [[
        "File 1 Number", "File 1 Hash", "File 2 Number", "File 2 Hash",
        "Score"
    ]]
    for (h1_number, h1), (h2_number, h2) in itertools.combinations(described, r=2):
        score = tlsh.diff(h1, h2)
        result.append([h1_number, h1, h2_number, h2, 1000 - score])

    with open("matchestlsh.csv", "w", newline="") as f:
        csv.writer(f).writerows(result)
def compute_tlsh_distance(hash1, hash2) -> float:
    """
    Compute a scaled hash difference, for TLSH hashes only.

    The raw ``tlsh.diff`` score is divided by ``len(hash1) * 16``.

    :param hash1: first hash
    :param hash2: second hash
    :return: scaled distance between the two hashes
    """
    raw_score = tlsh.diff(hash1, hash2)
    # Original note: 70 hexa character
    scale = len(hash1) * 16
    return raw_score / scale
def TO_OVERWRITE_compute_distance(self, pic1: picture_class.Picture, pic2: picture_class.Picture):
    """Return the TLSH distance between the hashes of two pictures.

    The comparison function is chosen from ``self.conf.ALGO``:
    ``tlsh.diff`` for ``ALGO_TYPE.TLSH`` and ``tlsh.diffxlen`` for
    ``ALGO_TYPE.TLSH_NO_LENGTH``.

    :param pic1: first picture; its ``hash`` attribute is compared
    :param pic2: second picture; its ``hash`` attribute is compared
    :return: the value returned by the selected tlsh function
    :raises Exception: when ``self.conf.ALGO`` is neither of the two types
    """
    algo = self.conf.ALGO

    if algo == configuration.ALGO_TYPE.TLSH:
        return tlsh.diff(pic1.hash, pic2.hash)

    if algo == configuration.ALGO_TYPE.TLSH_NO_LENGTH:
        return tlsh.diffxlen(pic1.hash, pic2.hash)

    raise Exception(
        "Invalid algorithm type for TLSH execution handler during distance computing : "
        + str(algo.name))
def scantlsh(scanqueue, reportqueue, cursor, conn, tlshcutoff):
    """Worker loop that reports the closest database matches for queued files.

    Loops forever. Each item taken from ``scanqueue`` is a
    ``(directory, filename, sha256)`` tuple (``sha256`` is not used here).
    The file is hashed with ``tlsh.hash``, then compared with the stored
    TLSH value of every checksum recorded under the same file name. When
    the smallest difference is below ``tlshcutoff``, a tuple
    ``(directory, filename, candidates, minhash)`` is put on
    ``reportqueue``. ``scanqueue.task_done()`` is called once per item.

    :param scanqueue: queue of work items
    :param reportqueue: queue receiving results
    :param cursor: database cursor used for the queries
    :param conn: database connection; ``commit()`` is called after queries
    :param tlshcutoff: exclusive upper bound on the accepted difference
    """
    while True:
        (directory, filename, sha256) = scanqueue.get()

        ## then compute the TLSH hash and search in the database
        ## for the closest file.
        # The context manager closes the file even when reading fails.
        with open(os.path.join(directory, filename), 'rb') as tlshfile:
            tlshhash = tlsh.hash(tlshfile.read())

        if tlshhash == '':
            ## file is either too small or a hash cannot be
            ## computed (example: all characters are the same)
            scanqueue.task_done()
            continue

        ## now get some candidates
        cursor.execute("select distinct checksum from fileinfo where filename=%s", (filename,))
        candidates = cursor.fetchall()
        conn.commit()
        if not candidates:
            scanqueue.task_done()
            continue

        mostpromising = []
        minhash = sys.maxsize
        for candidate in candidates:
            # candidate is the fetched row and is passed as the query
            # parameter sequence as-is.
            cursor.execute("select tlsh from hashes where sha256=%s", candidate)
            tlshresult = cursor.fetchone()
            if tlshresult is None:
                continue
            tlshdiff = tlsh.diff(tlshhash, tlshresult[0])
            if tlshdiff < minhash:
                minhash = tlshdiff
                mostpromising = [candidate[0]]
            elif tlshdiff == minhash:
                mostpromising.append(candidate[0])

        if mostpromising and minhash < tlshcutoff:
            candidates = []
            for m in mostpromising:
                cursor.execute("select packagename, version, fullfilename from fileinfo where checksum=%s", (m,))
                candidates += cursor.fetchall()
                conn.commit()
            reportqueue.put((directory, filename, candidates, minhash))
        scanqueue.task_done()
def get_distances(telfhash_data, files_combination):
    """Compute the TLSH distance for each pair of files.

    Input:
        telfhash_data - dictionary keyed by filename; each value is indexed
                        with "telfhash" to obtain the hash string
        files_combination - a list of lists; items 0 and 1 of each
                            component are the two filenames to compare

    Returns a list of (file1, file2, distance) tuples, in input order. The
    hash strings are upper-cased before being passed to tlsh.diff.
    """

    def upper_hash(name):
        return telfhash_data[name]["telfhash"].upper()

    return [
        (pair[0], pair[1], tlsh.diff(upper_hash(pair[0]), upper_hash(pair[1])))
        for pair in files_combination
    ]
def perform_fuzzy_matching(members1, members2):
    """Yield ``(name1, name2, score)`` for fuzzy-matched members.

    Each mapping goes from a name to a file object exposing
    ``is_directory()`` and ``fuzzy_hash``. For every eligible member of
    ``members1`` the member of ``members2`` with the lowest ``tlsh.diff``
    score is selected; the pair is yielded when that score is below
    ``Config.general.fuzzy_threshold``. A member of ``members2`` is yielded
    at most once.

    Nothing is yielded when ``tlsh`` is ``None`` or the threshold is 0.
    """
    if tlsh is None or Config.general.fuzzy_threshold == 0:
        return
    already_compared = set()
    # Perform local copies because they will be modified by consumer
    members1 = dict(members1)
    members2 = dict(members2)
    for name1, file1 in members1.items():
        if file1.is_directory() or not file1.fuzzy_hash:
            continue
        comparisons = [
            (tlsh.diff(file1.fuzzy_hash, file2.fuzzy_hash), name2)
            for name2, file2 in members2.items()
            if name2 not in already_compared
            and not file2.is_directory()
            and file2.fuzzy_hash
        ]
        if not comparisons:
            continue
        # min() returns the first lowest-scoring entry, which is what a
        # stable sort followed by [0] selected, without sorting the rest.
        score, name2 = min(comparisons, key=operator.itemgetter(0))
        logger.debug('fuzzy top match %s %s: %d difference score', name1, name2, score)
        if score < Config.general.fuzzy_threshold:
            yield name1, name2, score
            already_compared.add(name2)
def compute_2(path):
    """Hash the file at ``path`` incrementally and return the Tlsh object.

    The file is read in 512-byte chunks, each fed to ``Tlsh.update``;
    ``final()`` is called once the file is exhausted.
    """
    hasher = tlsh.Tlsh()
    with open(path, "rb") as stream:
        while True:
            chunk = stream.read(512)
            if chunk == b"":
                break
            hasher.update(chunk)
    hasher.final()
    return hasher


# Digests computed by compute_1 (defined outside this excerpt), compared
# in both argument orders.
hex1 = compute_1(sys.argv[1])
print("tlsh.hash hex1", hex1)
hex2 = compute_1(sys.argv[2])
print("tlsh.hash hex2", hex2)
print("tlsh.diff(hex1, hex2)", tlsh.diff(hex1, hex2))
print("tlsh.diff(hex2, hex1)", tlsh.diff(hex2, hex1))

# Same files again through the incremental Tlsh objects.
h1 = compute_2(sys.argv[1])
hex1 = h1.hexdigest()
print("tlsh.Tlsh hex1", hex1)
h2 = compute_2(sys.argv[2])
hex2 = h2.hexdigest()
print("tlsh.Tlsh hex2", hex2)

# Object-to-object, then object-to-hex-string comparisons.
print("h1.diff(h2)", h1.diff(h2))
print("h2.diff(h1)", h2.diff(h1))
print("h1.diff(hex2)", h1.diff(hex2))
print("h2.diff(hex1)", h2.diff(hex1))

# Load a Tlsh object from the hex digest string.
h3 = tlsh.Tlsh()
h3.fromTlshStr(hex2)
from __future__ import print_function import sys import tlsh def compute(path): h = tlsh.Tlsh() with open(path, 'rb') as f: for buf in iter(lambda: f.read(512), b''): h.update(buf) h.final() return h h1 = compute(sys.argv[1]) hex1 = h1.hexdigest() print('hex1', hex1) h2 = compute(sys.argv[2]) hex2 = h2.hexdigest() print('hex2', hex2) print('tlsh.diff(hex1, hex2)', tlsh.diff(hex1, hex2)) print('tlsh.diff(hex2, hex1)', tlsh.diff(hex2, hex1)) print('h1.diff(h2)', h1.diff(h2)) print('h2.diff(h1)', h2.diff(h1)) print('h1.diff(hex2)', h1.diff(hex2)) print('h2.diff(hex1)', h2.diff(hex1))
def scantlsh(scanqueue, reportqueue, cursor, conn, tlshcutoff):
    """Worker loop that reports the closest database matches for queued files.

    Loops forever. Each item taken from ``scanqueue`` is a
    ``(directory, filename, sha256)`` tuple (``sha256`` is not used here).
    When the smallest TLSH difference against the stored candidates is
    below ``tlshcutoff``, a tuple
    ``(directory, filename, candidates, minhash)`` is put on
    ``reportqueue``. ``scanqueue.task_done()`` is called once per item.

    :param scanqueue: queue of work items
    :param reportqueue: queue receiving results
    :param cursor: database cursor used for the queries
    :param conn: database connection; ``commit()`` is called after queries
    :param tlshcutoff: exclusive upper bound on the accepted difference
    """
    while True:
        ## first get the data for a file for which a close match
        ## needs to be computed.
        (directory, filename, sha256) = scanqueue.get()

        ## then compute the TLSH hash and search in the database
        ## for the closest files. The context manager closes the file
        ## even when reading fails.
        with open(os.path.join(directory, filename), 'rb') as tlshfile:
            tlshdata = tlshfile.read()
        tlshhash = tlsh.hash(tlshdata)

        if tlshhash == '':
            ## file is either too small or a hash cannot be
            ## computed (example: all characters are the same)
            scanqueue.task_done()
            continue

        ## now get checksums for files with the exact same name
        cursor.execute(
            "select distinct checksum from fileinfo where filename=%s",
            (filename, ))
        candidates = cursor.fetchall()
        conn.commit()
        if not candidates:
            scanqueue.task_done()
            continue

        ## keep the most promising files in a list
        mostpromising = []

        ## first set the value for the found hash very high
        minhash = sys.maxsize

        for candidate in candidates:
            ## first grab the TLSH value from the database
            cursor.execute("select tlsh from hashes where sha256=%s",
                           candidate)
            tlshresult = cursor.fetchone()
            if tlshresult is None:
                continue

            ## compute the difference with the TLSH value computed above.
            ## if the distance is smaller than the distance of the current
            ## best hit, then this will be the new best hit. If it is the
            ## same it is added to the list of best matches.
            tlshdiff = tlsh.diff(tlshhash, tlshresult[0])
            if tlshdiff < minhash:
                minhash = tlshdiff
                mostpromising = [candidate[0]]
            elif tlshdiff == minhash:
                mostpromising.append(candidate[0])

        ## if there are promising files and their distance is below the
        ## TLSH threshold, report the information associated with them.
        if mostpromising and minhash < tlshcutoff:
            candidates = []
            for m in mostpromising:
                cursor.execute(
                    "select packagename, version, fullfilename from fileinfo where checksum=%s",
                    (m, ))
                candidates += cursor.fetchall()
                conn.commit()
            reportqueue.put((directory, filename, candidates, minhash))
        scanqueue.task_done()
def diff_hash(one, two):
    """Return the ``tlsh.diff`` score of the hashes ``one`` and ``two``."""
    score = tlsh.diff(one, two)
    return score
def get_tlsh_comparison(first, second):
    """Return the ``tlsh.diff`` score of the hashes ``first`` and ``second``."""
    difference = tlsh.diff(first, second)
    return difference
# NOTE(review): this is a fragment of a larger script. `csvinput`, `password`,
# `prefixSalt`, `suffixSalt`, `multiplier`, `passwordHashed`,
# `insertionStrHashed` and the result lists are defined outside this view.
# Presumably the per-password statements run inside a loop whose header is
# not visible, with the column assignments after it; confirm the
# indentation against the full file.

# For each variant column: take the first value from the row(s) whose
# "Password" equals `password`, then hash salt + repeated string + salt.
deletionStr = csvinput.loc[csvinput["Password"] == password, 'oneDeletion'].values[0]
deletionStrHashed = tlsh.forcehash((prefixSalt + (deletionStr * multiplier) + suffixSalt).encode("utf-8"))
substitutionStr = csvinput.loc[csvinput["Password"] == password, 'oneSubstitution'].values[0]
substitutionStrHashed = tlsh.forcehash((prefixSalt + (substitutionStr * multiplier) + suffixSalt).encode("utf-8"))
incorrectStr = csvinput.loc[csvinput["Password"] == password, 'incorrect'].values[0]
incorrectStrHashed = tlsh.forcehash((prefixSalt + (incorrectStr * multiplier) + suffixSalt).encode("utf-8"))
capStr = csvinput.loc[csvinput["Password"] == password, 'oneCapMistake'].values[0]
capStrHashed = tlsh.forcehash((prefixSalt + (capStr * multiplier) + suffixSalt).encode("utf-8"))
# subPuncStr = csvinput.loc[csvinput["Password"] == password, 'subPunctuation'].values[0]
# subPuncHashed = tlsh.forcehash((prefixSalt + (subPuncStr * multiplier) + suffixSalt).encode("utf-8"))

# Record the TLSH difference between the password hash and each variant hash.
oneInsertionDifference.append(tlsh.diff(passwordHashed, insertionStrHashed))
oneDeletionDiffernce.append(tlsh.diff(passwordHashed, deletionStrHashed))
oneSubstitutionDifference.append(tlsh.diff(passwordHashed, substitutionStrHashed))
incorrectDifference.append(tlsh.diff(passwordHashed, incorrectStrHashed))
oneCapDifference.append(tlsh.diff(passwordHashed, capStrHashed))
# subPuncDifference.append(tlsh.diff(passwordHashed, subPuncHashed))

# Store the collected differences as integer columns of `csvinput`.
csvinput['oneSubstitutionDifference'] = pd.Series(oneSubstitutionDifference, dtype = int)
#csvinput['oneSubstitutionDifference'] = csvinput['oneSubstitutionDifference'].str[0]
#csvinput['oneSubstitutionDifference'] = csvinput['oneSubstitutionDifference'].astype(object)
csvinput['oneInsertionDifference'] = pd.Series(oneInsertionDifference, dtype = int)
#csvinput['oneInsertionDifference'] = csvinput['oneInsertionDifference'].str[0]
#csvinput['oneInsertionDifference'] = csvinput['oneInsertionDifference'].astype(object)
# The misspelling "Differnce" is both the list name and the column key.
csvinput['oneDeletionDiffernce'] = pd.Series(oneDeletionDiffernce, dtype = int)
# NOTE(review): this is a fragment of a larger script. `filename`, `hashes`
# and `compare_results` are defined outside this view. Presumably the first
# two statements sit inside a per-file loop whose header is not visible;
# confirm the indentation against the full file.

# Run the external ssdeep executable on the file and keep its raw stdout.
output = subprocess.Popen(["ssdeep.exe", filename], stdout=subprocess.PIPE).communicate()[0]
hashes[2].append((filename, output))
#hashes[2].append((filename, str(output.splitlines()[2]).split("\'", 1)[1].split(",", 1)[0]))

# hashes[0]: compare every entry against the first entry.
print("")
print("nilsimsa (different 0 - 128 similar)")
for e in hashes[0]:
    print(str(e.hexdigest()))
    compare_results[0].append(hashes[0][0].compare(e.hexdigest(), True))

# hashes[1]: item [1] of each entry is diffed against the first entry's.
print("")
print("tlsh (different ? - 0 similar)")
for e in hashes[1]:
    print(str(e))
    compare_results[1].append(tlsh.diff(hashes[1][0][1], e[1]))

# hashes[2]: write the first file's ssdeep output to "tmp", then match each
# file (e[0]) against it with `ssdeep.exe -a -m tmp`.
print("")
print("ssdeep (different 0 - 100 similar)")
with open("tmp", "wb") as file:
    file.write(hashes[2][0][1])
for e in hashes[2]:
    print(str(e))
    output = subprocess.Popen(["ssdeep.exe", "-a", "-m", "tmp", e[0]], stdout=subprocess.PIPE).communicate()[0]
    # The score is the integer between the first "(" and the next ")".
    compare_results[2].append(int(str(output).split("(", 1)[1].split(")", 1)[0]))

print("")
print(str(compare_results))
def diff(passwordHash, otherHash):
    '''
    Return the TLSH difference between passwordHash and otherHash.

    Thin wrapper around tlsh.diff, kept so callers use one consistent entry
    point.
    '''
    difference = tlsh.diff(passwordHash, otherHash)
    return difference
import tlsh
import os

# Script: compare the TLSH hash of a salted, repeated password with the
# hashes of a list of incorrect attempts and print each difference score.

password = '******'

# salt.txt supplies two lines: line 0 is prepended, line 1 is appended.
# The context manager closes the file once it has been read.
with open('salt.txt', 'r') as f:
    saltLines = f.read()
salt = saltLines.splitlines()

# Each password is repeated this many times before hashing.
multiplier = 5

incorrectPWArray = [
    'swordfish', 'awordfish', 'aaordfish', 'aaardfish', 'swordfisa',
    'swordfiaa', 'swordfaaa', 'aaaaaaaaa', 'zzzzzzzzz', 'swordfis',
    'wordfish', 'sordfish', 'swordfisha', 'aswordfish', 'aaswordfish',
    'swordfishaa', 'haufkljdioja', ' '
]
# Character difference reported for the attempt at the same index.
incorrectPWCharDifference = [
    0, 1, 2, 3, 1, 2, 3, 9, 9, 1, 1, 1, 1, 1, 2, 2, 12, 9
]

correctCombine = salt[0] + (password * multiplier) + salt[1]
hashOutput = tlsh.forcehash(correctCombine.encode("utf-8"))

for attempt, charDifference in zip(incorrectPWArray, incorrectPWCharDifference):
    incorrectCombine = salt[0] + (attempt * multiplier) + salt[1]
    incorrectHashOutput = tlsh.forcehash(incorrectCombine.encode("utf-8"))
    diff = tlsh.diff(hashOutput, incorrectHashOutput)
    # NOTE(review): the first print below was garbled in the source (it was
    # fused with the next statement); reconstructed as printing the
    # attempted password. Confirm against the original file.
    print('Attempted password: ' + attempt)
    print('Character Difference: ' + str(charDifference))
    print('difference score: ' + str(diff))
    print()
def get_tlsh_comparison(first, second):
    """Return the ``tlsh.diff`` score of the hashes ``first`` and ``second``."""
    difference = tlsh.diff(first, second)  # pylint: disable=c-extension-no-member
    return difference
import tlsh

# Script: measure how the TLSH score reacts to changing the last byte of a
# fixed input. Written so it parses and produces the same input data on
# both Python 2 and Python 3.


def _byte_repr(value):
    """Return what Python 2's ``repr(chr(value))`` produces for 0..255."""
    text = repr(bytes(bytearray([value])))
    # Python 3 prefixes the repr of bytes with "b"; Python 2 does not.
    return text[1:] if text.startswith('b') else text


# The reference string and the shared prefix of every variant do not depend
# on the loop index, so they are built (and the reference hashed) once.
s1 = ''.join([_byte_repr(j % 256) for j in range(1000)])
s2_prefix = ''.join([_byte_repr(j % 256) for j in range(999)])
# The repr text is pure ASCII; tlsh.hash needs bytes on Python 3.
h1 = tlsh.hash(s1.encode('ascii'))

diff = []
# 256 is the maximum number of single byte changes
for i in range(256):
    k = (999 - i) % 256
    s2 = s2_prefix + _byte_repr(k)
    h2 = tlsh.hash(s2.encode('ascii'))
    diff.append(tlsh.diff(h1, h2))

for i in range(1, 256):
    print('diff %s score %s' % (i, diff[i]))
# 0 has no change
print('diff %s score %s' % (0, diff[0]))
import matplotlib.pyplot as plt
from sklearn.cluster import *
from sklearn import metrics
# make_blobs is imported from the package itself: the
# sklearn.datasets.samples_generator module no longer exists in current
# scikit-learn releases.
from sklearn.datasets import make_blobs
from sklearn.preprocessing import StandardScaler

# NOTE(review): `tlsh`, `os` and `np` are expected to be imported above
# this excerpt.

# Build 10001 TLSH hashes of 256 random bytes each.
hash_list = [tlsh.hash(os.urandom(256)) for _ in range(10001)]

# Pairwise TLSH difference matrix. Each pair is diffed once and mirrored
# into both cells instead of being computed for (i, j) and again for (j, i).
n_hashes = len(hash_list)
adj = np.zeros((n_hashes, n_hashes), int)
for i in range(n_hashes):
    for j in range(i, n_hashes):
        d = tlsh.diff(hash_list[i], hash_list[j])
        adj[i][j] = d
        adj[j][i] = d

adj = StandardScaler().fit_transform(adj)
#adj, labels_true = make_blobs(n_samples=1001)
#labels_true = make_blobs(n_samples=1001)

# Compute DBSCAN
#db = DBSCAN(eps=0.4, min_samples=10, metric='precomputed').fit(adj)
#db = DBSCAN(eps=0.4, min_samples=10).fit(adj)
#ms = MeanShift(n_jobs=-1).fit(adj)
ms = MiniBatchKMeans(n_clusters=2).fit(adj)
#db = AgglomerativeClustering(n_clusters=3, affinity='precomputed').fit(adj)
#core_samples_mask = np.zeros_like(db.labels_, dtype=bool)