Ejemplo n.º 1
0
def match(mod_data1, mod_data2):
    """Report whether two module records carry similar TLSH digests.

    Args:
        mod_data1: Mapping holding the first record's digest under ``"tlsh"``.
        mod_data2: Mapping holding the second record's digest under ``"tlsh"``.

    Returns:
        ``True`` when ``tlsh.diff`` scores the two digests below 50;
        ``False`` otherwise, or when the comparison raises.
    """
    try:
        diff = tlsh.diff(mod_data1["tlsh"], mod_data2["tlsh"])
    except Exception:
        # Best effort: a missing or unusable digest simply means "no match".
        # Catching Exception instead of using a bare ``except`` lets
        # KeyboardInterrupt and SystemExit propagate.
        return False
    return diff < 50
Ejemplo n.º 2
0
def _compare_tlsh(value1, value2):
    """Score two TLSH values, treating the string "TNULL" as a missing hash.

    Returns 0 when both values are "TNULL", 1 when exactly one of them is,
    and otherwise ``tlsh.diff(value1, value2) / 100`` capped at 1.
    """
    missing = (value1 == "TNULL", value2 == "TNULL")
    if all(missing):
        return 0
    if any(missing):
        return 1
    scaled = tlsh.diff(value1, value2) / 100
    return min(scaled, 1)
Ejemplo n.º 3
0
def compute_distance(file1, file2):
    """
    Return a distance between two files.

    The TLSH difference of the files' ``fuzzy_hash`` attributes is used when
    available. If that raises ``TypeError`` or ``ValueError``, fall back to
    a byte-level comparison done with the external ``cmp -l`` command.
    """
    try:
        return tlsh.diff(file1.fuzzy_hash, file2.fuzzy_hash)
    except (TypeError, ValueError):
        # File is too small or doesn't have enough randomness
        pass

    # Fallback: estimate the proportion of bytes changed
    first_path, second_path = str(file1.path), str(file2.path)
    largest_size = max(get_file_size(first_path), get_file_size(second_path))

    command = ["cmp", "-l", first_path, second_path]
    try:
        cmp_output = subprocess.check_output(command,
                                             stderr=subprocess.DEVNULL)
    except subprocess.CalledProcessError as err:
        # When files are different, cmp has an exit code of 1
        cmp_output = err.output

    # The score is the size of cmp's output relative to the larger file,
    # multiplied by a constant to be the same order of magnitude as TLSH's
    # distance, and never lower than 1
    scaled = int(10 * len(cmp_output) / max(1, largest_size))
    return max(scaled, 1)
Ejemplo n.º 4
0
def check_text_similarity(path1, path2):
    """Return the TLSH difference between the contents of two files.

    Args:
        path1: Path of the first file; read in binary mode.
        path2: Path of the second file; read in binary mode.

    Returns:
        The value of ``tlsh.diff`` applied to the ``tlsh.hash`` of each
        file's bytes.

    Raises:
        OSError: If either file cannot be opened or read.
    """
    # Context managers close both handles even if a read fails; the
    # original left them open until garbage collection.
    with open(path1, 'rb') as first:
        file1_bytes = first.read()
    with open(path2, 'rb') as second:
        file2_bytes = second.read()
    return tlsh.diff(tlsh.hash(file1_bytes), tlsh.hash(file2_bytes))
Ejemplo n.º 5
0
def perform_fuzzy_matching(members1, members2):
    """Yield the closest fuzzy match in ``members2`` for entries of ``members1``.

    Both arguments are mappings whose values are two-item sequences; only
    the first item (the file object) is used. Directories and files without
    a truthy ``fuzzy_hash`` are skipped on both sides.

    Yields:
        ``(name1, name2, score)`` where ``score`` is the lowest
        ``tlsh.diff`` found for ``name1``. A tuple is produced only when
        that score is below ``Config().fuzzy_threshold``, and each ``name2``
        is yielded at most once.

    Nothing is yielded when ``tlsh`` is ``None`` or the threshold is 0.
    """
    if tlsh is None or Config().fuzzy_threshold == 0:
        return
    already_compared = set()
    # Perform local copies because they will be modified by consumer
    members1 = dict(members1)
    members2 = dict(members2)
    for name1, (file1, _) in members1.items():
        if file1.is_directory() or not file1.fuzzy_hash:
            continue
        comparisons = [
            (tlsh.diff(file1.fuzzy_hash, file2.fuzzy_hash), name2)
            for name2, (file2, _) in members2.items()
            if name2 not in already_compared
            and not file2.is_directory()
            and file2.fuzzy_hash
        ]
        if not comparisons:
            continue
        # min() returns the first lowest-scoring candidate, which is what a
        # stable sort followed by [0] produced, without sorting the list.
        score, name2 = min(comparisons, key=operator.itemgetter(0))
        logger.debug('fuzzy top match %s %s: %d difference score', name1,
                     name2, score)
        if score < Config().fuzzy_threshold:
            yield name1, name2, score
            already_compared.add(name2)
Ejemplo n.º 6
0
def tlshh(path):
    """Write pairwise TLSH scores for every entry of a directory to a CSV.

    Reads ``file_numbers.csv`` (first column: file name, second column:
    file number) from the current directory, hashes every entry matched by
    ``<path>/*`` and writes one row per unordered pair to
    ``matchestlsh.csv``. The "Score" column is ``1000 - tlsh.diff(h1, h2)``.

    Args:
        path: Directory whose entries are compared.

    Raises:
        OSError: If an input file cannot be read or the output cannot be
            written.
        KeyError: If a file name is missing from ``file_numbers.csv``.
    """
    print("[+] Creating tlsh matches")
    with open('file_numbers.csv', mode='r', newline='') as infile:
        numbers = {row[0]: row[1] for row in csv.reader(infile)}

    digests = {}

    def digest_of(filepath):
        # Hash each file once; every file takes part in many pairs. The
        # cache is filled lazily so nothing is read when there are no pairs.
        if filepath not in digests:
            with open(filepath, 'rb') as handle:
                digests[filepath] = tlsh.hash(handle.read())
        return digests[filepath]

    rows = []
    candidates = glob.glob(os.path.join(path, '*'))
    for file1, file2 in itertools.combinations(candidates, r=2):
        # basename() gives the file name for any depth of ``path``;
        # split("/")[1] was only right for a single-component relative path.
        h1 = digest_of(file1)
        h1_number = numbers[os.path.basename(file1)]
        h2 = digest_of(file2)
        h2_number = numbers[os.path.basename(file2)]
        score = tlsh.diff(h1, h2)
        rows.append([h1_number, h1, h2_number, h2, 1000 - score])

    with open("matchestlsh.csv", "w", newline="") as f:
        writer = csv.writer(f)
        writer.writerow([
            "File 1 Number", "File 1 Hash", "File 2 Number", "File 2 Hash",
            "Score"
        ])
        writer.writerows(rows)
Ejemplo n.º 7
0
 def compute_tlsh_distance(hash1, hash2) -> float:
     """
     Scale the TLSH difference of two hashes by the length of the first one.

     :param hash1: first hash
     :param hash2: second hash
     :return: ``tlsh.diff(hash1, hash2)`` divided by ``len(hash1) * 16``
     """
     raw_distance = tlsh.diff(hash1, hash2)
     scale = len(hash1) * 16  # 70 hexa character
     return raw_distance / scale
Ejemplo n.º 8
0
    def TO_OVERWRITE_compute_distance(self, pic1: picture_class.Picture,
                                      pic2: picture_class.Picture):
        """Return the TLSH distance between the hashes of two pictures.

        ``tlsh.diff`` is used when the configured algorithm is ``TLSH`` and
        ``tlsh.diffxlen`` when it is ``TLSH_NO_LENGTH``. Any other configured
        algorithm raises ``Exception`` naming the offending algorithm.
        """
        if self.conf.ALGO == configuration.ALGO_TYPE.TLSH:
            return tlsh.diff(pic1.hash, pic2.hash)
        if self.conf.ALGO == configuration.ALGO_TYPE.TLSH_NO_LENGTH:
            return tlsh.diffxlen(pic1.hash, pic2.hash)

        algo_name = str(self.conf.ALGO.name)
        raise Exception(
            "Invalid algorithm type for TLSH execution handler during distance computing : "
            + algo_name)
Ejemplo n.º 9
0
def scantlsh(scanqueue, reportqueue, cursor, conn, tlshcutoff):
    """Worker loop that looks up the closest known file by TLSH distance.

    For every ``(directory, filename, sha256)`` item taken from
    ``scanqueue`` the file is hashed with TLSH, the checksums stored in
    ``fileinfo`` under the same file name are fetched, and the TLSH value
    stored for each checksum is compared with the computed one. When the
    smallest distance is below ``tlshcutoff`` a
    ``(directory, filename, candidates, minhash)`` tuple is put on
    ``reportqueue``, where ``candidates`` holds the
    ``(packagename, version, fullfilename)`` rows of all best matches.

    The loop never returns on its own. ``scanqueue.task_done()`` is called
    exactly once per item, including when processing raises.
    """
    while True:
        (directory, filename, sha256) = scanqueue.get()
        try:
            ## compute the TLSH hash and search in the database
            ## for the closest file.
            with open(os.path.join(directory, filename), 'rb') as tlshfile:
                tlshhash = tlsh.hash(tlshfile.read())

            if tlshhash == '':
                ## file is either too small or a hash cannot be
                ## computed (example: all characters are the same)
                continue

            ## now get some candidates
            cursor.execute("select distinct checksum from fileinfo where filename=%s", (filename,))
            candidates = cursor.fetchall()
            conn.commit()
            if not candidates:
                continue

            mostpromising = []
            minhash = sys.maxsize
            for candidate in candidates:
                cursor.execute("select tlsh from hashes where sha256=%s", candidate)
                tlshresult = cursor.fetchone()
                if tlshresult is None:
                    continue
                tlshdiff = tlsh.diff(tlshhash, tlshresult[0])
                if tlshdiff < minhash:
                    minhash = tlshdiff
                    mostpromising = [candidate[0]]
                elif tlshdiff == minhash:
                    mostpromising.append(candidate[0])

            if mostpromising and minhash < tlshcutoff:
                matches = []
                for m in mostpromising:
                    cursor.execute("select packagename, version, fullfilename from fileinfo where checksum=%s", (m,))
                    matches += cursor.fetchall()
                    conn.commit()
                reportqueue.put((directory, filename, matches, minhash))
        finally:
            ## always account for the item, even on an error, so that a
            ## caller waiting on the queue is not left waiting for a task
            ## that will never be marked done
            scanqueue.task_done()
Ejemplo n.º 10
0
def get_distances(telfhash_data, files_combination):
    """Compute the TLSH distance for each pair of telfhash values.

    Input:
        telfhash_data - dictionary keyed by filename; each value holds the
                        file's hash under the "telfhash" key
        files_combination - a list of lists, each naming two files

    Returns a list of (file1, file2, distance) tuples, one per pair and in
    the same order as files_combination.
    """

    def _pair_distance(pair):
        first, second = pair[0], pair[1]
        score = tlsh.diff(telfhash_data[first]["telfhash"].upper(),
                          telfhash_data[second]["telfhash"].upper())
        return (first, second, score)

    return [_pair_distance(pair) for pair in files_combination]
Ejemplo n.º 11
0
def perform_fuzzy_matching(members1, members2):
    """Yield the closest fuzzy match in ``members2`` for entries of ``members1``.

    Both arguments are mappings of name to file object. Directories and
    files without a truthy ``fuzzy_hash`` are skipped on both sides.

    Yields:
        ``(name1, name2, score)`` where ``score`` is the lowest
        ``tlsh.diff`` found for ``name1``. A tuple is produced only when
        that score is below ``Config.general.fuzzy_threshold``, and each
        ``name2`` is yielded at most once.

    Nothing is yielded when ``tlsh`` is ``None`` or the threshold is 0.
    """
    if tlsh is None or Config.general.fuzzy_threshold == 0:
        return
    already_compared = set()
    # Perform local copies because they will be modified by consumer
    members1 = dict(members1)
    members2 = dict(members2)
    for name1, file1 in members1.items():
        if file1.is_directory() or not file1.fuzzy_hash:
            continue
        comparisons = [
            (tlsh.diff(file1.fuzzy_hash, file2.fuzzy_hash), name2)
            for name2, file2 in members2.items()
            if name2 not in already_compared
            and not file2.is_directory()
            and file2.fuzzy_hash
        ]
        if not comparisons:
            continue
        # min() returns the first lowest-scoring candidate, which is what a
        # stable sort followed by [0] produced, without sorting the list.
        score, name2 = min(comparisons, key=operator.itemgetter(0))
        logger.debug('fuzzy top match %s %s: %d difference score', name1, name2, score)
        if score < Config.general.fuzzy_threshold:
            yield name1, name2, score
            already_compared.add(name2)
Ejemplo n.º 12
0

def compute_2(path):
    """Feed the file at ``path`` to a ``tlsh.Tlsh`` object in 512-byte reads.

    Returns the ``Tlsh`` object after ``final()`` has been called on it.
    """
    digest = tlsh.Tlsh()
    with open(path, "rb") as stream:
        while True:
            chunk = stream.read(512)
            if chunk == b"":
                # An empty read marks the end of the file.
                break
            digest.update(chunk)
    digest.final()
    return digest


# Compare the two files named on the command line, first through
# compute_1 and the module-level tlsh.diff.
# NOTE(review): compute_1 is defined outside the visible part of this
# snippet; the labels suggest it wraps tlsh.hash -- confirm.
hex1 = compute_1(sys.argv[1])
print("tlsh.hash hex1", hex1)
hex2 = compute_1(sys.argv[2])
print("tlsh.hash hex2", hex2)
# The difference is printed in both argument orders.
print("tlsh.diff(hex1, hex2)", tlsh.diff(hex1, hex2))
print("tlsh.diff(hex2, hex1)", tlsh.diff(hex2, hex1))

# Same comparison through tlsh.Tlsh objects built by compute_2.
h1 = compute_2(sys.argv[1])
hex1 = h1.hexdigest()
print("tlsh.Tlsh hex1", hex1)
h2 = compute_2(sys.argv[2])
hex2 = h2.hexdigest()
print("tlsh.Tlsh hex2", hex2)
# Tlsh.diff is called with another Tlsh object ...
print("h1.diff(h2)", h1.diff(h2))
print("h2.diff(h1)", h2.diff(h1))
# ... and with a hex digest string.
print("h1.diff(hex2)", h1.diff(hex2))
print("h2.diff(hex1)", h2.diff(hex1))

# Load the second file's hex digest into a fresh Tlsh object.
h3 = tlsh.Tlsh()
h3.fromTlshStr(hex2)
Ejemplo n.º 13
0
from __future__ import print_function

import sys
import tlsh


def compute(path):
    """Feed the file at ``path`` to a ``tlsh.Tlsh`` object in 512-byte reads.

    Returns the ``Tlsh`` object after ``final()`` has been called on it.
    """
    digest = tlsh.Tlsh()
    with open(path, 'rb') as stream:
        while True:
            chunk = stream.read(512)
            if chunk == b'':
                # An empty read marks the end of the file.
                break
            digest.update(chunk)
    digest.final()
    return digest


# Hash the two files named on the command line and print their hex digests.
h1 = compute(sys.argv[1])
hex1 = h1.hexdigest()
print('hex1', hex1)
h2 = compute(sys.argv[2])
hex2 = h2.hexdigest()
print('hex2', hex2)
# Print the difference three ways, each in both argument orders:
# module-level tlsh.diff on the hex strings, Tlsh.diff with a Tlsh object,
# and Tlsh.diff with a hex string.
print('tlsh.diff(hex1, hex2)', tlsh.diff(hex1, hex2))
print('tlsh.diff(hex2, hex1)', tlsh.diff(hex2, hex1))
print('h1.diff(h2)', h1.diff(h2))
print('h2.diff(h1)', h2.diff(h1))
print('h1.diff(hex2)', h1.diff(hex2))
print('h2.diff(hex1)', h2.diff(hex1))
Ejemplo n.º 14
0
def scantlsh(scanqueue, reportqueue, cursor, conn, tlshcutoff):
    """Worker loop that looks up the closest known files by TLSH distance.

    For every ``(directory, filename, sha256)`` item taken from
    ``scanqueue`` the file is hashed with TLSH, the checksums stored in
    ``fileinfo`` under the same file name are fetched, and the TLSH value
    stored for each checksum is compared with the computed one. When the
    smallest distance is below ``tlshcutoff`` a
    ``(directory, filename, candidates, minhash)`` tuple is put on
    ``reportqueue``, where ``candidates`` holds the
    ``(packagename, version, fullfilename)`` rows of all best matches.

    The loop never returns on its own. ``scanqueue.task_done()`` is called
    exactly once per item, including when processing raises.
    """
    while True:
        ## first get the data for a file for which a close match
        ## needs to be computed.
        (directory, filename, sha256) = scanqueue.get()
        try:
            ## then compute the TLSH hash and search in the database
            ## for the closest files.
            with open(os.path.join(directory, filename), 'rb') as tlshfile:
                tlshhash = tlsh.hash(tlshfile.read())

            if tlshhash == '':
                ## file is either too small or a hash cannot be
                ## computed (example: all characters are the same)
                continue

            ## now get checksums for files with the exact same name
            cursor.execute(
                "select distinct checksum from fileinfo where filename=%s",
                (filename, ))
            candidates = cursor.fetchall()
            conn.commit()
            if not candidates:
                continue

            ## keep the most promising files in a list
            mostpromising = []

            ## first set the value for the found hash very high
            minhash = sys.maxsize

            for candidate in candidates:
                ## first grab the TLSH value from the database
                cursor.execute("select tlsh from hashes where sha256=%s",
                               candidate)
                tlshresult = cursor.fetchone()
                if tlshresult is None:
                    continue

                ## compute the difference with the TLSH value computed above
                ## if the distance is smaller than the distance of the current
                ## best hit, then this will be the new best hit. If it is the
                ## same it is added to the list of best matches.
                tlshdiff = tlsh.diff(tlshhash, tlshresult[0])
                if tlshdiff < minhash:
                    minhash = tlshdiff
                    mostpromising = [candidate[0]]
                elif tlshdiff == minhash:
                    mostpromising.append(candidate[0])

            ## if there are promising files and their distance is below the
            ## TLSH threshold, report the information associated with them.
            if mostpromising and minhash < tlshcutoff:
                matches = []
                for m in mostpromising:
                    cursor.execute(
                        "select packagename, version, fullfilename from fileinfo where checksum=%s",
                        (m, ))
                    matches += cursor.fetchall()
                    conn.commit()
                reportqueue.put((directory, filename, matches, minhash))
        finally:
            ## always account for the item, even on an error, so that a
            ## caller waiting on the queue is not left waiting for a task
            ## that will never be marked done
            scanqueue.task_done()
Ejemplo n.º 15
0
def diff_hash(one, two):
    """Return the ``tlsh.diff`` score of the two given hashes."""
    score = tlsh.diff(one, two)
    return score
Ejemplo n.º 16
0
def get_tlsh_comparison(first, second):
    """Return the ``tlsh.diff`` score of ``first`` against ``second``."""
    distance = tlsh.diff(first, second)
    return distance
Ejemplo n.º 17
0
	# NOTE(review): this fragment is indented; the enclosing statement and
	# the definitions of password, passwordHashed, insertionStrHashed,
	# prefixSalt, suffixSalt and multiplier are outside the visible snippet.

	# For each variant column, take the first row whose "Password" equals
	# the current password, repeat the variant `multiplier` times between
	# the two salts, and hash the UTF-8 bytes with tlsh.forcehash.
	deletionStr = csvinput.loc[csvinput["Password"] == password, 'oneDeletion'].values[0]
	deletionStrHashed = tlsh.forcehash((prefixSalt + (deletionStr * multiplier) + suffixSalt).encode("utf-8"))

	substitutionStr = csvinput.loc[csvinput["Password"] == password, 'oneSubstitution'].values[0]
	substitutionStrHashed = tlsh.forcehash((prefixSalt + (substitutionStr * multiplier) + suffixSalt).encode("utf-8"))

	incorrectStr = csvinput.loc[csvinput["Password"] == password, 'incorrect'].values[0]
	incorrectStrHashed = tlsh.forcehash((prefixSalt + (incorrectStr * multiplier) + suffixSalt).encode("utf-8"))

	capStr = csvinput.loc[csvinput["Password"] == password, 'oneCapMistake'].values[0]
	capStrHashed = tlsh.forcehash((prefixSalt + (capStr * multiplier) + suffixSalt).encode("utf-8"))

	# subPuncStr = csvinput.loc[csvinput["Password"] == password, 'subPunctuation'].values[0]
	# subPuncHashed = tlsh.forcehash((prefixSalt + (subPuncStr * multiplier) + suffixSalt).encode("utf-8"))

	# Record the TLSH difference between the password's hash and each
	# variant's hash in the matching result list.
	oneInsertionDifference.append(tlsh.diff(passwordHashed, insertionStrHashed))
	oneDeletionDiffernce.append(tlsh.diff(passwordHashed, deletionStrHashed))
	oneSubstitutionDifference.append(tlsh.diff(passwordHashed, substitutionStrHashed))
	incorrectDifference.append(tlsh.diff(passwordHashed, incorrectStrHashed))
	oneCapDifference.append(tlsh.diff(passwordHashed, capStrHashed))
	# subPuncDifference.append(tlsh.diff(passwordHashed, subPuncHashed))

# Store the collected difference scores as integer columns of csvinput.
csvinput['oneSubstitutionDifference'] = pd.Series(oneSubstitutionDifference, dtype = int)
#csvinput['oneSubstitutionDifference'] = csvinput['oneSubstitutionDifference'].str[0]
#csvinput['oneSubstitutionDifference'] = csvinput['oneSubstitutionDifference'].astype(object)

csvinput['oneInsertionDifference'] = pd.Series(oneInsertionDifference, dtype = int)
#csvinput['oneInsertionDifference'] = csvinput['oneInsertionDifference'].str[0]
#csvinput['oneInsertionDifference'] = csvinput['oneInsertionDifference'].astype(object)

# NOTE(review): "Differnce" is misspelled, but it is both a variable name
# and the column key, so renaming it would change the output.
csvinput['oneDeletionDiffernce'] = pd.Series(oneDeletionDiffernce, dtype = int)
Ejemplo n.º 18
0
            # Run the external ssdeep.exe on the file and keep its raw
            # stdout bytes together with the file name.
            # NOTE(review): the enclosing loop is outside the visible snippet.
            output = subprocess.Popen(["ssdeep.exe", filename],
                                      stdout=subprocess.PIPE).communicate()[0]
            hashes[2].append((filename, output))
            #hashes[2].append((filename, str(output.splitlines()[2]).split("\'", 1)[1].split(",", 1)[0]))

# Compare every collected hash against the first entry of its list, one
# section per algorithm, and append the scores to compare_results.
# NOTE(review): hashes and compare_results are built outside the visible
# snippet.

print("")
print("nilsimsa (different 0 - 128 similar)")
for e in hashes[0]:
    print(str(e.hexdigest()))
    # The first nilsimsa object compares itself with each hex digest.
    compare_results[0].append(hashes[0][0].compare(e.hexdigest(), True))

print("")
print("tlsh (different ? - 0 similar)")
for e in hashes[1]:
    print(str(e))
    # Index 1 of each entry is the value handed to tlsh.diff.
    compare_results[1].append(tlsh.diff(hashes[1][0][1], e[1]))

print("")
print("ssdeep (different 0 - 100 similar)")
# Write the first entry's ssdeep output to the file "tmp" so ssdeep.exe can
# match the other files against it with "-m tmp".
with open("tmp", "wb") as file:
    file.write(hashes[2][0][1])
for e in hashes[2]:
    print(str(e))
    output = subprocess.Popen(["ssdeep.exe", "-a", "-m", "tmp", e[0]],
                              stdout=subprocess.PIPE).communicate()[0]
    # The score is the integer between the first "(" and the following ")"
    # of ssdeep's output.
    compare_results[2].append(
        int(str(output).split("(", 1)[1].split(")", 1)[0]))

print("")
print(str(compare_results))
Ejemplo n.º 19
0
def diff(passwordHash, otherHash):
    """Return the TLSH difference score between the two given hashes.

    Thin wrapper around ``tlsh.diff``.
    """
    score = tlsh.diff(passwordHash, otherHash)
    return score
Ejemplo n.º 20
0
import tlsh
import os

# Compare the TLSH hash of a salted, repeated password with the hashes of
# a list of near-miss attempts and print each difference score.

# NOTE(review): this literal looks like a redaction placeholder. The first
# entry of incorrectPWArray has a character difference of 0, so the real
# value was presumably 'swordfish' -- confirm before relying on the output.
password = '******'
# salt.txt supplies the prefix salt (first line) and the suffix salt
# (second line); the context manager closes the file once it is read.
with open('salt.txt', 'r') as f:
    saltLines = f.read()
salt = saltLines.splitlines()
multiplier = 5

incorrectPWArray = [
    'swordfish', 'awordfish', 'aaordfish', 'aaardfish', 'swordfisa',
    'swordfiaa', 'swordfaaa', 'aaaaaaaaa', 'zzzzzzzzz', 'swordfis', 'wordfish',
    'sordfish', 'swordfisha', 'aswordfish', 'aaswordfish', 'swordfishaa',
    'haufkljdioja', ' '
]
# Character difference listed for the attempt at the same index above.
incorrectPWCharDifference = [
    0, 1, 2, 3, 1, 2, 3, 9, 9, 1, 1, 1, 1, 1, 2, 2, 12, 9
]

# The password is repeated `multiplier` times between the two salts before
# it is hashed with tlsh.forcehash.
correctCombine = salt[0] + (password * multiplier) + salt[1]
hashOutput = tlsh.forcehash(correctCombine.encode("utf-8"))

for attempt, charDifference in zip(incorrectPWArray,
                                   incorrectPWCharDifference):
    incorrectCombine = salt[0] + (attempt * multiplier) + salt[1]
    incorrectHashOutput = tlsh.forcehash(incorrectCombine.encode("utf-8"))
    diff = tlsh.diff(hashOutput, incorrectHashOutput)
    # NOTE(review): the original line here was two print calls fused into
    # invalid syntax by a redaction artifact; the first call is
    # reconstructed as printing the attempted password.
    print('Attempted password: ' + attempt)
    print('Character Difference: ' + str(charDifference))
    print('difference score: ' + str(diff))
    print()
Ejemplo n.º 21
0
from __future__ import print_function

import sys
import tlsh

def compute(path):
    """Feed the file at ``path`` to a ``tlsh.Tlsh`` object in 512-byte reads.

    Returns the ``Tlsh`` object after ``final()`` has been called on it.
    """
    digest = tlsh.Tlsh()
    with open(path, 'rb') as stream:
        while True:
            chunk = stream.read(512)
            if chunk == b'':
                # An empty read marks the end of the file.
                break
            digest.update(chunk)
    digest.final()
    return digest

# Hash the two files named on the command line and print their hex digests.
h1 = compute(sys.argv[1])
hex1 = h1.hexdigest()
print('hex1', hex1)
h2 = compute(sys.argv[2])
hex2 = h2.hexdigest()
print('hex2', hex2)
# Print the difference three ways, each in both argument orders:
# module-level tlsh.diff on the hex strings, Tlsh.diff with a Tlsh object,
# and Tlsh.diff with a hex string.
print('tlsh.diff(hex1, hex2)', tlsh.diff(hex1, hex2))
print('tlsh.diff(hex2, hex1)', tlsh.diff(hex2, hex1))
print('h1.diff(h2)', h1.diff(h2))
print('h2.diff(h1)', h2.diff(h1))
print('h1.diff(hex2)', h1.diff(hex2))
print('h2.diff(hex1)', h2.diff(hex1))
Ejemplo n.º 22
0
def get_tlsh_comparison(first, second):
    """Return the ``tlsh.diff`` score of ``first`` against ``second``."""
    distance = tlsh.diff(first, second)  # pylint: disable=c-extension-no-member
    return distance
Ejemplo n.º 23
0
import tlsh

# NOTE: this script is Python 2 (xrange, backtick repr, print statement).
# It measures how the TLSH score reacts when only the last element of a
# 1000-element input changes.
diff = []

# 256 is the maximum number of single byte changes
for i in xrange(256):
  # Backticks are Python 2 shorthand for repr(), so each element is the
  # repr of a one-character string, not the raw character.
  # s1 is the full 1000-element reference; s2 starts as its first 999.
  s1 = ''.join([`chr(j % 256)` for j in xrange(1000)])
  s2 = ''.join([`chr(j % 256)` for j in xrange(999)])

  # Complete s2 with a final element that depends on i. For i == 0,
  # k equals 999 % 256, the value s1 ends with, so s2 is identical to s1.
  k = (999 - i) % 256
  s2 = s2 + `chr(k)`
  h1 = tlsh.hash(s1)
  h2 = tlsh.hash(s2)
  diff.append(tlsh.diff(h1, h2))

# Report the scores of the 255 changed inputs first ...
for i in xrange(1,256):  
  print 'diff', i, 'score', diff[i]

# ... and the unchanged input (index 0) last.
print 'diff', 0, 'score', diff[0]
Ejemplo n.º 24
0
import matplotlib.pyplot as plt

from sklearn.cluster import *
from sklearn import metrics
from sklearn.datasets.samples_generator import make_blobs
from sklearn.preprocessing import StandardScaler

# Build a pool of TLSH hashes from 10001 random 256-byte buffers.
hash_list = [tlsh.hash(os.urandom(256)) for _ in range(10001)]

# Square integer matrix holding the pairwise TLSH differences.
adj = np.zeros((len(hash_list), len(hash_list)), int)

# NOTE(review): every ordered pair is visited and both mirrored cells are
# written each time, so each cell is assigned twice.
for i, row_hash in enumerate(hash_list):
    for j, col_hash in enumerate(hash_list):
        d = tlsh.diff(row_hash, col_hash)
        adj[i, j] = d
        adj[j, i] = d

# Standardise the matrix before clustering.
adj = StandardScaler().fit_transform(adj)
#adj, labels_true = make_blobs(n_samples=1001)

#labels_true = make_blobs(n_samples=1001)

# Cluster the rows into two groups; the commented-out lines are the
# alternative estimators that were tried.
#db = DBSCAN(eps=0.4, min_samples=10, metric='precomputed').fit(adj)
#db = DBSCAN(eps=0.4, min_samples=10).fit(adj)
#ms = MeanShift(n_jobs=-1).fit(adj)
ms = MiniBatchKMeans(n_clusters=2).fit(adj)
#db = AgglomerativeClustering(n_clusters=3, affinity='precomputed').fit(adj)
#core_samples_mask = np.zeros_like(db.labels_, dtype=bool)