Example #1
0
def hash_files(files: list) -> dict:
    """
    Hash every file in the list and group the file names by their hash.

    In case of a MemoryError (limitation of e.g. 32-bit Python)
    write out the file names in separate .txt files per PID
    of the process used for hashing.
    """
    grouped = {}
    total = len(files)

    for position, path in enumerate(files, start=1):
        LOG.debug('Hashing %d / %d', position, total)

        try:
            digest = hash_file(path)
            # A falsy digest means the file could not be hashed; skip it.
            if not digest:
                continue
            grouped.setdefault(digest, []).append(path)
        except MemoryError:
            LOG.critical(
                'Not enough memory while hashing %s, skipping.', path
            )
            ignore_append(path)

    return grouped
Example #2
0
    def test_ignore_append():
        """
        Test writing ignored path to a file.
        """
        fake_pid = 12345
        pid_patcher = patch('bear.common.getpid', return_value=fake_pid)

        # Fake file handle returned from the mocked context manager.
        handle = MagicMock()
        ctx_manager = MagicMock(
            __enter__=MagicMock(return_value=handle)
        )
        open_patcher = patch('builtins.open', return_value=ctx_manager)

        with pid_patcher, open_patcher as mocked_open:
            ignore_append('lalala')

            # File named after the (mocked) PID, opened for appending.
            mocked_open.assert_called_once_with(
                f'{fake_pid}_ignored.txt', 'a'
            )
            # Path written followed by a newline.
            handle.write.assert_has_calls([call('lalala'), call('\n')])
Example #3
0
def hash_files(files: list, hasher: Hasher, master_pid: int = None) -> dict:
    """
    Hash each file in the list and return a mapping of hash -> file names.

    Every successful (hash, path) pair is also appended to a per-process
    partial file so results are persisted as they are produced.

    In case of a MemoryError (limitation of e.g. 32-bit Python)
    write out the file names in separate .txt files per PID
    of the process used for hashing.

    Note: master_pid should have a default in case of running out of MP,
          and use master's PID so that master can recognize slave processes'
          files after the slaves in the Pool are terminated.

    :param files: paths of the files to hash
    :param hasher: hashing backend forwarded to hash_file()
    :param master_pid: PID of the coordinating process, embedded in the
        partial file name so the master can find the file later
    :return: dict mapping each hash to the list of file names with that hash
    """

    hashfiles = {}
    files_len = len(files)
    partial_file = f"bear_m{master_pid}_s{getpid()}_hashes.txt"

    # safe-check in case the name changes in the future to prevent
    # creating files that won't be deleted by master process
    # (NOTE: asserts are stripped when running under `python -O`)
    assert f"bear_m{master_pid}_" in partial_file
    assert "_hashes.txt" in partial_file  # no placeholders -> plain string

    for idx, fname in enumerate(files):
        LOG.debug('Hashing %d / %d', idx + 1, files_len)

        try:
            fhash = hash_file(path=fname, hasher=hasher)
            if not fhash:
                # Falsy hash: the file could not be hashed, skip it.
                continue
            # Re-open per record so each line hits the OS right away;
            # presumably deliberate for crash-resilience — TODO confirm.
            with open(partial_file, "a") as file:
                file.write(f"{fhash}\t{fname}\n")
            hashfiles.setdefault(fhash, []).append(fname)
        except MemoryError:
            LOG.critical('Not enough memory while hashing %s, skipping.',
                         fname)
            ignore_append(fname)

    return hashfiles
Example #4
0
def hash_file(path: str, hasher: Hasher) -> str:
    """
    Open a file, read its contents and return its hash digest.

    The digest algorithm is whatever the supplied ``hasher`` implements
    (not necessarily MD5).  On any open/read error the path is recorded
    via ignore_append() and an empty string is returned.

    :param path: path of the file to hash
    :param hasher: hashing backend forwarded to hash_text()
    :return: digest string, or '' if the file could not be read
    """
    result = ''
    try:
        # Reads the whole file into memory at once; very large files can
        # raise MemoryError, which callers (hash_files) handle.
        with open(path, 'rb') as file:
            contents = file.read()
        result = hash_text(inp=contents, hasher=hasher)
    except PermissionError:
        LOG.critical('Could not open %s due to permission error! %s', path,
                     traceback.format_exc())
        ignore_append(path)
    except FileNotFoundError:
        LOG.warning('Could not find file %s! Did you delete it?', path)
        ignore_append(path)
    except OSError:
        # PermissionError and FileNotFoundError are OSError subclasses,
        # so this broader handler must come last.
        LOG.critical('Could not open %s! %s', path, traceback.format_exc())
        ignore_append(path)
    return result