def hash_files(files: list) -> dict:
    """
    Hash each of the files in the list and group the paths by hash.

    In case of a MemoryError (limitation of e.g. 32-bit Python) write
    out the file names in separate .txt files per PID of the process
    used for hashing.

    :param files: paths of the files to hash
    :return: mapping of file hash -> list of paths having that hash
    """
    hashfiles = {}
    files_len = len(files)
    for idx, fname in enumerate(files):
        LOG.debug('Hashing %d / %d', idx + 1, files_len)
        try:
            fhash = hash_file(fname)
            if not fhash:
                # empty hash means the file could not be read;
                # hash_file() already logged and recorded the path
                continue
            # group paths sharing the same content hash
            # (setdefault + append replaces the manual key check
            # and the single-element extend of the original)
            hashfiles.setdefault(fhash, []).append(fname)
        except MemoryError:
            LOG.critical(
                'Not enough memory while hashing %s, skipping.', fname
            )
            ignore_append(fname)
    return hashfiles
def test_ignore_append():
    """ Test writing ignored path to a file. """
    fake_pid = 12345
    handle = MagicMock()
    ctx_obj = MagicMock(__enter__=MagicMock(return_value=handle))
    with patch('bear.common.getpid', return_value=fake_pid), \
            patch('builtins.open', return_value=ctx_obj) as open_mock:
        ignore_append('lalala')
        # the ignore file is named after the hashing process' PID
        open_mock.assert_called_once_with(f'{fake_pid}_ignored.txt', 'a')
        # path is written first, newline separately
        handle.write.assert_has_calls([call('lalala'), call('\n')])
def hash_files(files: list, hasher: Hasher, master_pid: int = None) -> dict:
    """
    Hash each of the files in the list and group the paths by hash.

    Every successful hash is also appended (tab-separated) to a partial
    result file named 'bear_m<master_pid>_s<slave_pid>_hashes.txt', so a
    crash loses at most the entry currently being written.

    In case of a MemoryError (limitation of e.g. 32-bit Python) write
    out the file names in separate .txt files per PID of the process
    used for hashing.

    Note: master_pid should have a default in case of running out of MP,
    and use master's PID so that master can recognize slave processes'
    files after the slaves in the Pool are terminated.

    :param files: paths of the files to hash
    :param hasher: hashing backend forwarded to hash_file()
    :param master_pid: PID of the master process (None without MP)
    :return: mapping of file hash -> list of paths having that hash
    """
    hashfiles = {}
    files_len = len(files)
    partial_file = f"bear_m{master_pid}_s{getpid()}_hashes.txt"
    # safe-check in case the name changes in the future to prevent
    # creating files that won't be deleted by master process.
    # Raise explicitly rather than assert: asserts are stripped under
    # `python -O`, which would silently disable this guard.
    if f"bear_m{master_pid}_" not in partial_file:
        raise ValueError(
            f"partial file name lost master PID prefix: {partial_file}"
        )
    if "_hashes.txt" not in partial_file:
        raise ValueError(
            f"partial file name lost hashes suffix: {partial_file}"
        )
    for idx, fname in enumerate(files):
        LOG.debug('Hashing %d / %d', idx + 1, files_len)
        try:
            fhash = hash_file(path=fname, hasher=hasher)
            if not fhash:
                # unreadable file; hash_file() already recorded it
                continue
            # persist the result immediately for the master to collect
            with open(partial_file, "a") as file:
                file.write(f"{fhash}\t{fname}\n")
            # group paths sharing the same content hash
            hashfiles.setdefault(fhash, []).append(fname)
        except MemoryError:
            LOG.critical('Not enough memory while hashing %s, skipping.',
                         fname)
            ignore_append(fname)
    return hashfiles
def hash_file(path: str, hasher: Hasher) -> str:
    """
    Read a file's contents and return their hash via hash_text().

    On PermissionError, FileNotFoundError or any other OSError the path
    is recorded with ignore_append() and an empty string is returned.

    :param path: path of the file to hash
    :param hasher: hashing backend forwarded to hash_text()
    :return: hash string, or '' when the file could not be read
    """
    digest = ''
    try:
        with open(path, 'rb') as handle:
            digest = hash_text(inp=handle.read(), hasher=hasher)
    # order matters: PermissionError and FileNotFoundError are
    # subclasses of OSError and must be caught first
    except PermissionError:
        LOG.critical('Could not open %s due to permission error! %s',
                     path, traceback.format_exc())
        ignore_append(path)
    except FileNotFoundError:
        LOG.warning('Could not find file %s! Did you delete it?', path)
        ignore_append(path)
    except OSError:
        LOG.critical('Could not open %s! %s', path, traceback.format_exc())
        ignore_append(path)
    return digest