def __init__(self, cutoff=None, strings=False, same=False, symlinks=False, name=False, max_results=None, display=False, log=None, csv=False, quiet=False, format_to_screen=False, abspath=False, matches=None, types=None):
    '''
    Class constructor.

    @cutoff           - The fuzzy cutoff which determines if files are different or not.
    @strings          - Only hash strings inside of the file, not the entire file itself.
    @same             - Set to True to show files that are the same, False to show files that are different.
    @symlinks         - Set to True to include symbolic link files.
    @name             - Set to True to only compare files whose base names match.
    @max_results      - Stop searching after x number of matches.
    @display          - Set to True to display results to stdout, or pass an instance of binwalk.prettyprint.PrettyPrint.
    @log              - Specify a log file to log results to.
    @csv              - Set to True to log data in CSV format.
    @quiet            - Set to True to suppress output to stdout.
    @format_to_screen - Set to True to format the output to the terminal window width.
    @abspath          - Set to True to display absolute file paths.
    @matches          - A dictionary of file names to diff (defaults to an empty dictionary).
    @types            - A dictionary of file types to diff (defaults to an empty dictionary).

    Raises an Exception if the fuzzy hashing library cannot be located.

    Returns None.
    '''
    # None is used as the default instead of {} so that instances never share
    # (and never accidentally mutate) a single module-level dictionary.
    if matches is None:
        matches = {}
    if types is None:
        types = {}

    self.cutoff = cutoff
    self.strings = strings
    self.show_same = same
    self.symlinks = symlinks
    self.matches = matches
    self.name = name
    self.abspath = abspath
    self.max_results = max_results

    if display:
        if isinstance(display, PrettyPrint):
            self.pretty_print = display
        else:
            self.pretty_print = PrettyPrint(log=log, csv=csv, format_to_screen=format_to_screen, quiet=quiet)

        self.pretty_print.header(header="PERCENTAGE\t\t\tFILE", csv=True)
    else:
        self.pretty_print = None

    self.total = 0
    self.last_file1 = HashResult(None)
    self.last_file2 = HashResult(None)

    self.magic = magic.open(0)
    self.magic.load()

    # find_library returns None when the library is missing; fail with a
    # clear message instead of handing None to LoadLibrary.
    lib_path = ctypes.util.find_library(self.LIBRARY_NAME)
    if lib_path is None:
        raise Exception('Could not find the hash matching library. Please install libfuzzy from ssdeep.')
    self.lib = ctypes.cdll.LoadLibrary(lib_path)

    if self.cutoff is None:
        self.cutoff = self.DEFAULT_CUTOFF

    # Compile the type filters into a fresh dictionary rather than rewriting
    # the caller's lists in place.
    compiled_types = {}
    for k in get_keys(types):
        compiled_types[k] = [re.compile(pattern) for pattern in types[k]]
    self.types = compiled_types
class HashMatch(object):
    '''
    Class for fuzzy hash matching of files and directories.
    '''
    # Requires libfuzzy.so
    LIBRARY_NAME = "fuzzy"

    # Max result is 148 (http://ssdeep.sourceforge.net/api/html/fuzzy_8h.html)
    FUZZY_MAX_RESULT = 150
    # Files smaller than this won't produce meaningful fuzzy results (from ssdeep.h)
    FUZZY_MIN_FILE_SIZE = 4096

    DEFAULT_CUTOFF = 0
    CONSERVATIVE_CUTOFF = 90

    def __init__(self, cutoff=None, strings=False, same=False, symlinks=False, name=False, max_results=None, display=False, log=None, csv=False, quiet=False, format_to_screen=False, abspath=False, matches=None, types=None):
        '''
        Class constructor.

        @cutoff           - The fuzzy cutoff which determines if files are different or not.
        @strings          - Only hash strings inside of the file, not the entire file itself.
        @same             - Set to True to show files that are the same, False to show files that are different.
        @symlinks         - Set to True to include symbolic link files.
        @name             - Set to True to only compare files whose base names match.
        @max_results      - Stop searching after x number of matches.
        @display          - Set to True to display results to stdout, or pass an instance of binwalk.prettyprint.PrettyPrint.
        @log              - Specify a log file to log results to.
        @csv              - Set to True to log data in CSV format.
        @quiet            - Set to True to suppress output to stdout.
        @format_to_screen - Set to True to format the output to the terminal window width.
        @abspath          - Set to True to display absolute file paths.
        @matches          - A dictionary of file names to diff (defaults to an empty dictionary).
        @types            - A dictionary of file types to diff (defaults to an empty dictionary).

        Raises an Exception if the fuzzy hashing library cannot be located.

        Returns None.
        '''
        # None is used as the default instead of {} so that instances never
        # share (and never accidentally mutate) a single dictionary object.
        if matches is None:
            matches = {}
        if types is None:
            types = {}

        self.cutoff = cutoff
        self.strings = strings
        self.show_same = same
        self.symlinks = symlinks
        self.matches = matches
        self.name = name
        self.abspath = abspath
        self.max_results = max_results

        if display:
            if isinstance(display, PrettyPrint):
                self.pretty_print = display
            else:
                self.pretty_print = PrettyPrint(log=log, csv=csv, format_to_screen=format_to_screen, quiet=quiet)

            self.pretty_print.header(header="PERCENTAGE\t\t\tFILE", csv=True)
        else:
            self.pretty_print = None

        self.total = 0
        self.last_file1 = HashResult(None)
        self.last_file2 = HashResult(None)

        self.magic = magic.open(0)
        self.magic.load()

        # find_library returns None when the library is missing; fail with a
        # clear message instead of handing None to LoadLibrary.
        lib_path = ctypes.util.find_library(self.LIBRARY_NAME)
        if lib_path is None:
            raise Exception('Could not find the hash matching library. Please install libfuzzy from ssdeep.')
        self.lib = ctypes.cdll.LoadLibrary(lib_path)

        if self.cutoff is None:
            self.cutoff = self.DEFAULT_CUTOFF

        # Compile the type filters into a fresh dictionary rather than
        # rewriting the caller's lists in place.
        compiled_types = {}
        for k in get_keys(types):
            compiled_types[k] = [re.compile(pattern) for pattern in types[k]]
        self.types = compiled_types

    def _get_strings(self, fname):
        '''
        Returns all strings (minimum length 10) found in fname, concatenated into one string.
        '''
        return ''.join(binwalk.common.strings(fname, minimum=10))

    def _print(self, match, fname):
        '''
        Displays a single result line, if display output was enabled.

        @match - The match percentage.
        @fname - The file name the percentage applies to.

        Returns None.
        '''
        if self.pretty_print:
            if self.abspath:
                fname = os.path.abspath(fname)
            self.pretty_print._pprint('%4d\t\t\t\t%s\n' % (match, self.pretty_print._format(fname)))

    def _print_footer(self):
        '''
        Displays the output footer, if display output was enabled.
        '''
        if self.pretty_print:
            self.pretty_print.footer()

    def _compare_files(self, file1, file2):
        '''
        Fuzzy diff two files.

        @file1 - The first file to diff.
        @file2 - The second file to diff.

        Returns the match percentage.
        Returns None on error, if either file does not exist, or if name
        matching is enabled and the base names differ.
        '''
        status = 0

        if self.name and os.path.basename(file1) != os.path.basename(file2):
            return None

        if not (os.path.exists(file1) and os.path.exists(file2)):
            return None

        hash1 = ctypes.create_string_buffer(self.FUZZY_MAX_RESULT)
        hash2 = ctypes.create_string_buffer(self.FUZZY_MAX_RESULT)

        # Check if the last file1 or file2 matches this file1 or file2; no need to re-hash if they match.
        file1_dup = bool(file1 == self.last_file1.name and self.last_file1.hash)
        file2_dup = bool(file2 == self.last_file2.name and self.last_file2.hash)

        # When the cached name changes, drop the cached hash/strings as well.
        # Otherwise a failed or short-circuited hash below would leave the new
        # name paired with the previous file's hash, and the next call would
        # wrongly treat it as a cache hit.
        if not file1_dup:
            self.last_file1.name = file1
            self.last_file1.hash = None
            self.last_file1.strings = None
        if not file2_dup:
            self.last_file2.name = file2
            self.last_file2.hash = None
            self.last_file2.strings = None

        try:
            if self.strings:
                if file1_dup:
                    file1_strings = self.last_file1.strings
                else:
                    self.last_file1.strings = file1_strings = self._get_strings(file1)

                if file2_dup:
                    file2_strings = self.last_file2.strings
                else:
                    self.last_file2.strings = file2_strings = self._get_strings(file2)

                # Identical strings are a perfect match; no hashing required.
                if file1_strings == file2_strings:
                    return 100

                if file1_dup:
                    hash1 = self.last_file1.hash
                else:
                    status |= self.lib.fuzzy_hash_buf(str2bytes(file1_strings), len(file1_strings), hash1)

                if file2_dup:
                    hash2 = self.last_file2.hash
                else:
                    status |= self.lib.fuzzy_hash_buf(str2bytes(file2_strings), len(file2_strings), hash2)
            else:
                if file1_dup:
                    hash1 = self.last_file1.hash
                else:
                    status |= self.lib.fuzzy_hash_filename(str2bytes(file1), hash1)

                if file2_dup:
                    hash2 = self.last_file2.hash
                else:
                    status |= self.lib.fuzzy_hash_filename(str2bytes(file2), hash2)

            # Only cache and compare the hashes if every hashing call succeeded.
            if status == 0:
                if not file1_dup:
                    self.last_file1.hash = hash1
                if not file2_dup:
                    self.last_file2.hash = hash2

                if hash1.raw == hash2.raw:
                    return 100
                return self.lib.fuzzy_compare(hash1, hash2)
        except Exception as e:
            # Best effort: report the failure and treat the pair as not comparable.
            print ("WARNING: Exception while doing fuzzy hash: %s" % e)

        return None

    def is_match(self, match):
        '''
        Returns True if this is a good match.
        Returns False if this is not a good match.
        '''
        return (match is not None and ((match >= self.cutoff and self.show_same) or (match < self.cutoff and not self.show_same)))

    def _get_file_list(self, directory):
        '''
        Generates a directory tree, including/excluding files as specified in self.matches and self.types.

        @directory - The root directory to start from.

        Returns a set of file paths, excluding the root directory.
        '''
        file_list = []

        # Normalize directory path so that we can exclude it from each individual file path
        directory = os.path.abspath(directory) + os.path.sep

        for (abs_root, dirs, files) in os.walk(directory):
            # Don't include the root directory in the file paths
            root = ''.join(abs_root.split(directory, 1)[1:])

            # Get a list of files, with or without symlinks as specified during __init__.
            # The symlink test must use the file's real location; a bare file name
            # would be resolved against the current working directory.
            files = [os.path.join(root, f) for f in files if self.symlinks or not os.path.islink(os.path.join(abs_root, f))]

            # If no filters were specified, return all files
            if not self.types and not self.matches:
                file_list += files
                continue

            # Filter based on the file type, as reported by libmagic
            if self.types:
                for f in files:
                    # The libmagic description only depends on the file, so look it up once per file.
                    try:
                        magic_result = self.magic.file(os.path.join(directory, f)).lower()
                    except Exception:
                        magic_result = ''

                    for (include, regex_list) in iterator(self.types):
                        for regex in regex_list:
                            match = regex.match(magic_result)

                            # If this matched an include filter, or didn't match an exclude filter
                            if (match and include) or (not match and not include):
                                file_list.append(f)

            # Filter based on file name
            if self.matches:
                for (include, file_filter_list) in iterator(self.matches):
                    for file_filter in file_filter_list:
                        matching_files = fnmatch.filter(files, file_filter)

                        # If this is an include filter, add all matching files to the list
                        if include:
                            file_list += matching_files
                        # Else, add all files except those that matched to the list
                        else:
                            file_list += list(set(files) - set(matching_files))

        return set(file_list)

    def files(self, needle, haystack):
        '''
        Compare one file against a list of other files.

        @needle   - File to match against.
        @haystack - A list of haystack files.

        Returns a list of tuple results.
        '''
        results = []
        self.total = 0

        for f in haystack:
            m = self._compare_files(needle, f)
            if m is not None and self.is_match(m):
                self._print(m, f)
                results.append((m, f))

                self.total += 1
                if self.max_results and self.total >= self.max_results:
                    break

        self._print_footer()
        return results

    def file(self, needle, haystack):
        '''
        Search for one file inside one or more directories.

        @needle   - File to search for.
        @haystack - List of directories to search in.

        Returns a list of tuple results.
        '''
        matching_files = []
        self.total = 0
        done = False

        for directory in haystack:
            for f in self._get_file_list(directory):
                f = os.path.join(directory, f)
                m = self._compare_files(needle, f)
                if m is not None and self.is_match(m):
                    self._print(m, f)
                    matching_files.append((m, f))

                    self.total += 1
                    if self.max_results and self.total >= self.max_results:
                        done = True
                        break
            if done:
                break

        self._print_footer()
        return matching_files

    def directories(self, needle, haystack):
        '''
        Compare the contents of one directory with the contents of other directories.

        @needle   - Source directory to compare everything to.
        @haystack - Compare files in the source directory to files in these directories.

        Returns a list of tuple results.
        '''
        done = False
        results = []
        self.total = 0

        source_files = self._get_file_list(needle)

        for directory in haystack:
            dir_files = self._get_file_list(directory)

            for f in source_files:
                # Only files present (by relative path) in both trees are compared.
                if f in dir_files:
                    file1 = os.path.join(needle, f)
                    file2 = os.path.join(directory, f)

                    m = self._compare_files(file1, file2)
                    if m is not None and self.is_match(m):
                        self._print(m, file2)
                        results.append((m, file2))

                        self.total += 1
                        if self.max_results and self.total >= self.max_results:
                            done = True
                            break
            if done:
                break

        self._print_footer()
        return results