Example #1
    def _file_should_be_hashed(self, filename: str) -> bool:
        global _FOLDER_BLACK_LIST

        if not _FOLDER_BLACK_LIST:
            _FOLDER_BLACK_LIST = FolderBlackList(
                config.get_option("server.folderWatchBlacklist"))

        filepath = os.path.abspath(filename)
        file_is_blacklisted = _FOLDER_BLACK_LIST.is_blacklisted(filepath)
        # Short-circuit for performance.
        if file_is_blacklisted:
            return False
        return file_util.file_is_in_folder_glob(
            filepath, self._get_main_script_directory()
        ) or file_util.file_in_pythonpath(filepath)
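
The snippet lazily builds a module-level _FOLDER_BLACK_LIST from the
server.folderWatchBlacklist config option. FolderBlackList itself is defined
elsewhere in Streamlit; a minimal sketch of a glob-based variant (hypothetical
class name, assuming fnmatch-style matching) might look like this:

from fnmatch import fnmatch
import os


class GlobFolderBlackList:
    # Hypothetical stand-in for Streamlit's FolderBlackList; the real
    # implementation may normalize and match paths differently.
    def __init__(self, patterns):
        self._patterns = list(patterns or [])

    def is_blacklisted(self, filepath):
        filepath = os.path.abspath(filepath)
        # Match the file itself, or anything beneath a blacklisted folder.
        return any(
            fnmatch(filepath, p) or fnmatch(filepath, os.path.join(p, "*"))
            for p in self._patterns
        )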
Example #2
class CodeHasher:
    """A hasher that can hash code objects including dependencies."""
    def __init__(self, name="md5", hasher=None, hash_funcs=None):
        self.hashes = dict()

        self.name = name

        # Running total (in bytes) of the values hashed so far.
        self.size = 0

        # An ever-increasing counter, used to create unique tombstone values.
        self._counter = 0

        if hasher:
            self.hasher = hasher
        else:
            self.hasher = hashlib.new(name)

        self._folder_black_list = FolderBlackList(
            config.get_option("server.folderWatchBlacklist"))

        self.hash_funcs = hash_funcs or {}

    def update(self, obj, context=None):
        """Update the hash with the provided object."""
        self._update(self.hasher, obj, context)

    def digest(self):
        return self.hasher.digest()

    def hexdigest(self):
        return self.hasher.hexdigest()

    def to_bytes(self, obj, context=None):
        """Add memoization to _to_bytes and protect against cycles in data structures."""
        key = _key(obj, context)

        if key is not None:
            if key in self.hashes:
                return self.hashes[key]

            # Add a tombstone hash to break cycles in recursive calls.
            self._counter += 1
            self.hashes[key] = _int_to_bytes(self._counter)

        # hash_stacks is a module-level stack of objects currently being
        # hashed; seeing an object twice means we've hit a reference cycle.
        if obj in hash_stacks:
            return CYCLE_PLACEHOLDER

        hash_stacks.push(obj)

        try:
            b = self._to_bytes(obj, context)

            self.size += sys.getsizeof(b)

            if key is not None:
                self.hashes[key] = b
        finally:
            # In case an UnhashableType (or other) error is raised, clean up
            # the stack so we don't get false positives in future hashing calls.
            hash_stacks.pop()

        return b

    def _update(self, hasher, obj, context=None):
        """Update the provided hasher with the hash of an object."""
        b = self.to_bytes(obj, context)
        hasher.update(b)

    def _file_should_be_hashed(self, filename):
        filepath = os.path.abspath(filename)
        file_is_blacklisted = self._folder_black_list.is_blacklisted(filepath)
        # Short-circuit for performance.
        if file_is_blacklisted:
            return False
        return file_util.file_is_in_folder_glob(
            filepath, self._get_main_script_directory()
        ) or file_util.file_in_pythonpath(filepath)

    def _to_bytes(self, obj, context):
        """Hash objects to bytes, including code with dependencies.
        Python's built in `hash` does not produce consistent results across
        runs."""

        try:
            if _is_magicmock(obj):
                # MagicMock can result in objects that appear to be infinitely
                # deep, so we don't try to hash them at all.
                return self.to_bytes(id(obj))
            elif isinstance(obj, bytes) or isinstance(obj, bytearray):
                return obj
            elif isinstance(obj, string_types):  # noqa: F821
                # Don't allow the user to override strings, since
                # str == bytes on Python 2.
                return obj.encode()
            elif type(obj) in self.hash_funcs:
                # Escape hatch for unsupported objects
                return self.to_bytes(self.hash_funcs[type(obj)](obj))
            elif isinstance(obj, float):
                return self.to_bytes(hash(obj))
            elif isinstance(obj, int):
                return _int_to_bytes(obj)
            elif isinstance(obj, list) or isinstance(obj, tuple):
                h = hashlib.new(self.name)

                # Hash the name of the container so that ["a"] hashes differently from ("a",)
                # Otherwise we'd only be hashing the data and the hashes would be the same.
                self._update(h, type(obj).__name__.encode() + b":")
                for e in obj:
                    self._update(h, e, context)
                return h.digest()
            elif isinstance(obj, dict):
                h = hashlib.new(self.name)

                self._update(h, type(obj).__name__.encode() + b":")
                for e in obj.items():
                    self._update(h, e, context)
                return h.digest()
            elif obj is None:
                # Return a special constant, because Python's `hash` is not
                # consistent across runs and the value would change between
                # sessions.
                return NONESENSE
            elif obj is True:
                return b"bool:1"
            elif obj is False:
                return b"bool:0"
            elif type_util.is_type(
                    obj, "pandas.core.frame.DataFrame") or type_util.is_type(
                        obj, "pandas.core.series.Series"):
                import pandas as pd

                if len(obj) >= PANDAS_ROWS_LARGE:
                    obj = obj.sample(n=PANDAS_SAMPLE_SIZE, random_state=0)
                try:
                    # hash_pandas_object returns a Series of uint64 row
                    # hashes; sum them and serialize the result to bytes.
                    return _int_to_bytes(
                        int(pd.util.hash_pandas_object(obj).sum()))
                except TypeError:
                    # Fall back to pickle if pandas cannot hash the object,
                    # e.g. because it contains unhashable values.
                    return pickle.dumps(obj, pickle.HIGHEST_PROTOCOL)
            elif type_util.is_type(obj, "numpy.ndarray"):
                h = hashlib.new(self.name)
                self._update(h, obj.shape)

                if obj.size >= NP_SIZE_LARGE:
                    import numpy as np

                    state = np.random.RandomState(0)
                    obj = state.choice(obj.flat, size=NP_SAMPLE_SIZE)

                self._update(h, obj.tobytes())
                return h.digest()
            elif inspect.isbuiltin(obj):
                return self.to_bytes(obj.__name__)
            elif hasattr(obj, "name") and (
                    isinstance(obj, io.IOBase)
                    # Handle temporary files used during testing
                    or isinstance(obj, tempfile._TemporaryFileWrapper) or
                (not compatibility.is_running_py3()
                 and isinstance(obj, file))):
                # Hash files as name + last modification date + offset.
                h = hashlib.new(self.name)
                self._update(h, obj.name)
                self._update(h, os.path.getmtime(obj.name))
                self._update(h, obj.tell())
                return h.digest()
            elif inspect.isroutine(obj):
                if hasattr(obj, "__wrapped__"):
                    # Ignore the wrapper of wrapped functions.
                    return self.to_bytes(obj.__wrapped__)

                if obj.__module__.startswith("streamlit"):
                    # Ignore streamlit modules even if they are in the CWD
                    # (e.g. during development).
                    return self.to_bytes("%s.%s" %
                                         (obj.__module__, obj.__name__))

                h = hashlib.new(self.name)
                if self._file_should_be_hashed(obj.__code__.co_filename):
                    context = _get_context(obj)
                    if obj.__defaults__:
                        self._update(h, obj.__defaults__, context)
                    h.update(self._code_to_bytes(obj.__code__, context))
                else:
                    # Don't hash code that lives outside the main script's
                    # directory and the PYTHONPATH; module and name suffice.
                    self._update(h, obj.__module__)
                    self._update(h, obj.__name__)
                return h.digest()
            elif inspect.iscode(obj):
                return self._code_to_bytes(obj, context)
            elif inspect.ismodule(obj):
                # TODO: Figure out how to best show this kind of warning to the
                # user. In the meantime, show nothing. This scenario is too common,
                # so the current warning is quite annoying...
                # st.warning(('Streamlit does not support hashing modules. '
                #             'We did not hash `%s`.') % obj.__name__)
                # TODO: Hash more than just the name for internal modules.
                return self.to_bytes(obj.__name__)
            elif inspect.isclass(obj):
                # TODO: Figure out how to best show this kind of warning to the
                # user. In the meantime, show nothing. This scenario is too common,
                # (e.g. in every "except" statement) so the current warning is
                # quite annoying...
                # st.warning(('Streamlit does not support hashing classes. '
                #             'We did not hash `%s`.') % obj.__name__)
                # TODO: Hash more than just the name of classes.
                return self.to_bytes(obj.__name__)
            elif isinstance(obj, functools.partial):
                # The return value of functools.partial is not a plain
                # function: it's a callable object that remembers the original
                # function plus the arguments bound to it, so we special-case it.
                h = hashlib.new(self.name)
                self._update(h, obj.args)
                self._update(h, obj.func)
                self._update(h, obj.keywords)
                return h.digest()
            else:
                # As a last resort, hash the object's __reduce__ output.
                h = hashlib.new(self.name)

                self._update(h, type(obj).__name__.encode() + b":")
                for e in obj.__reduce__():
                    self._update(h, e, context)
                return h.digest()
        except UnhashableType:
            raise
        except Exception as e:
            LOGGER.error(e)
            msg = _hashing_error_message(type(obj))
            raise UnhashableType(msg)

    def _code_to_bytes(self, code, context):
        h = hashlib.new(self.name)

        # Hash the bytecode.
        self._update(h, code.co_code)

        # Hash constants referenced by the bytecode, ignoring the names of lambdas.
        consts = [
            n for n in code.co_consts
            if not isinstance(n, string_types)  # noqa: F821
            or not n.endswith(".<lambda>")
        ]
        self._update(h, consts, context)

        # Hash non-local names and functions referenced by the bytecode.
        # dis.get_instructions was added in Python 3.4.
        if hasattr(dis, "get_instructions"):
            for ref in get_referenced_objects(code, context):
                self._update(h, ref, context)
        else:
            # This won't correctly follow nested calls like `foo.bar.baz()`.
            for name in code.co_names:
                if name in context.globals:
                    try:
                        self._update(h, context.globals[name], context)
                    except Exception:
                        self._update(h, name)
                else:
                    try:
                        module = importlib.import_module(name)
                        self._update(h, module, context)
                    except ImportError:
                        self._update(h, name, context)

            for name, value in context.cells.items():
                try:
                    self._update(h, value, context)
                except Exception:
                    self._update(h, name)

        return h.digest()

    @staticmethod
    def _get_main_script_directory():
        """Get the directory of the main script.
        """
        import __main__
        import os

        # This works because we set __main__.__file__ to the report
        # script path in ScriptRunner.
        main_path = __main__.__file__
        return os.path.dirname(main_path)
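
A quick usage sketch of the hash_funcs escape hatch handled in _to_bytes
above: map a type to a function that returns a stable, hashable stand-in.
DBConnection and its URL are made up for illustration, and the sketch assumes
the surrounding Streamlit module context (config, FolderBlackList, etc.) is
available:

class DBConnection:
    # Hypothetical type the hasher can't handle on its own.
    def __init__(self, url):
        self.url = url


# Represent DBConnection objects by their URL when hashing.
hasher = CodeHasher(hash_funcs={DBConnection: lambda conn: conn.url})
hasher.update(DBConnection("postgres://localhost/mydb"))
print(hasher.hexdigest())  # Stable across runs for the same URL.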
Example #3
class LocalSourcesWatcher:
    def __init__(self, session_data: SessionData):
        self._session_data = session_data
        self._on_file_changed: List[Callable[[], None]] = []
        self._is_closed = False

        # Blacklist for folders that should not be watched
        self._folder_black_list = FolderBlackList(
            config.get_option("server.folderWatchBlacklist"))

        self._watched_modules: Dict[str, WatchedModule] = {}

        self._register_watcher(
            self._session_data.script_path,
            module_name=None,  # Only the root script has None here.
        )

    def register_file_change_callback(self, cb: Callable[[], None]) -> None:
        self._on_file_changed.append(cb)

    def on_file_changed(self, filepath):
        if filepath not in self._watched_modules:
            LOGGER.error("Received event for non-watched file: %s", filepath)
            return

        # Workaround:
        # Delete all watched modules so we can guarantee changes to the
        # updated module are reflected on reload.
        #
        # In principle, for reloading a given module, we only need to unload
        # the module itself and all of the modules which import it (directly
        # or indirectly) such that when we exec the application code, the
        # changes are reloaded and reflected in the running application.
        #
        # However, determining all import paths for a given loaded module is
        # non-trivial, and so as a workaround we simply unload all watched
        # modules.
        for wm in self._watched_modules.values():
            if wm.module_name is not None and wm.module_name in sys.modules:
                del sys.modules[wm.module_name]

        for cb in self._on_file_changed:
            cb()

    def close(self):
        for wm in self._watched_modules.values():
            wm.watcher.close()
        self._watched_modules = {}
        self._is_closed = True

    def _register_watcher(self, filepath, module_name):
        global FileWatcher
        if FileWatcher is None:
            FileWatcher = get_default_file_watcher_class()

        if FileWatcher is NoOpFileWatcher:
            return

        try:
            wm = WatchedModule(
                watcher=FileWatcher(filepath, self.on_file_changed),
                module_name=module_name,
            )
        except PermissionError:
            # If you don't have permission to read this file, don't even add it
            # to watchers.
            return

        self._watched_modules[filepath] = wm

    def _deregister_watcher(self, filepath):
        if filepath not in self._watched_modules:
            return

        if filepath == self._session_data.script_path:
            return

        wm = self._watched_modules[filepath]
        wm.watcher.close()
        del self._watched_modules[filepath]

    def _file_is_new(self, filepath):
        return filepath not in self._watched_modules

    def _file_should_be_watched(self, filepath):
        # Use short-circuiting for performance.
        return self._file_is_new(filepath) and (
            file_util.file_is_in_folder_glob(filepath,
                                             self._session_data.script_folder)
            or file_util.file_in_pythonpath(filepath))

    def update_watched_modules(self):
        if self._is_closed:
            return

        module_paths = {
            name: self._exclude_blacklisted_paths(get_module_paths(module))
            for name, module in dict(sys.modules).items()
        }

        self._register_necessary_watchers(module_paths)

    def _register_necessary_watchers(
            self, module_paths: Dict[str, Set[str]]) -> None:
        for name, paths in module_paths.items():
            for path in paths:
                if self._file_should_be_watched(path):
                    self._register_watcher(path, name)

    def _exclude_blacklisted_paths(self, paths: Set[str]) -> Set[str]:
        return {
            p for p in paths
            if not self._folder_black_list.is_blacklisted(p)
        }
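
How this class might be wired up, sketched under the assumption that a
SessionData instance is available (the server code that normally owns the
watcher is not shown in this example):

watcher = LocalSourcesWatcher(session_data)  # session_data assumed to exist

# Trigger a rerun whenever any watched source file changes.
watcher.register_file_change_callback(lambda: print("source changed; rerun"))

# After each script execution, scan sys.modules for newly imported
# local modules and start watching them too.
watcher.update_watched_modules()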
Example #4
class LocalSourcesWatcher(object):
    def __init__(self, report, on_file_changed):
        self._report = report
        self._on_file_changed = on_file_changed
        self._is_closed = False

        # Blacklist for folders that should not be watched
        self._folder_black_list = FolderBlackList(
            config.get_option("server.folderWatchBlacklist"))

        # A dict of filepath -> WatchedModule.
        self._watched_modules = {}

        self._register_watcher(
            self._report.script_path,
            module_name=None,  # Only the root script has None here.
        )

    def on_file_changed(self, filepath):
        if filepath not in self._watched_modules:
            LOGGER.error("Received event for non-watched file", filepath)
            return

        # Workaround:
        # Delete all watched modules so we can guarantee changes to the
        # updated module are reflected on reload.
        #
        # In principle, for reloading a given module, we only need to unload
        # the module itself and all of the modules which import it (directly
        # or indirectly) such that when we exec the application code, the
        # changes are reloaded and reflected in the running application.
        #
        # However, determining all import paths for a given loaded module is
        # non-trivial, and so as a workaround we simply unload all watched
        # modules.
        for wm in self._watched_modules.values():
            if wm.module_name is not None and wm.module_name in sys.modules:
                del sys.modules[wm.module_name]

        self._on_file_changed()

    def close(self):
        for wm in self._watched_modules.values():
            wm.watcher.close()
        self._watched_modules = {}
        self._is_closed = True

    def _register_watcher(self, filepath, module_name):
        if compatibility.is_running_py3():
            ErrorType = PermissionError
        else:
            ErrorType = OSError

        try:
            wm = WatchedModule(
                watcher=FileWatcher(filepath, self.on_file_changed),
                module_name=module_name,
            )
        except ErrorType:
            # If you don't have permission to read this file, don't even add it
            # to watchers.
            return

        self._watched_modules[filepath] = wm

    def _deregister_watcher(self, filepath):
        if filepath not in self._watched_modules:
            return

        if filepath == self._report.script_path:
            return

        wm = self._watched_modules[filepath]
        wm.watcher.close()
        del self._watched_modules[filepath]

    def update_watched_modules(self):
        if self._is_closed:
            return

        local_filepaths = []

        # Clone modules dict here because we may alter the original dict inside
        # the loop.
        modules = dict(sys.modules)

        for name, module in modules.items():
            try:
                spec = getattr(module, "__spec__", None)

                if spec is None:
                    filepath = getattr(module, "__file__", None)
                    if filepath is None:
                        # Some modules have neither a spec nor a file. But we
                        # can ignore those since they're not the user-created
                        # modules we want to watch anyway.
                        continue
                else:
                    filepath = spec.origin

                if filepath is None:
                    # Built-in modules (and other stuff) don't have origins.
                    continue

                filepath = os.path.abspath(filepath)

                if not os.path.isfile(filepath):
                    # There are some modules that have a .origin, but don't
                    # point to real files. For example, there's a module where
                    # .origin is 'built-in'.
                    continue

                if self._folder_black_list.is_blacklisted(filepath):
                    continue

                file_is_new = filepath not in self._watched_modules
                file_is_local = util.file_is_in_folder_glob(
                    filepath, self._report.script_folder)

                local_filepaths.append(filepath)

                if file_is_local and file_is_new:
                    self._register_watcher(filepath, name)

            except Exception:
                # In case there's a problem introspecting some specific module,
                # let's not stop the entire loop from running.  For example,
                # the __spec__ field in some modules (like IPython) is actually
                # a dynamic property, which can crash if the underlying
                # module's code has a bug (as discovered by one of our users).
                continue

        # Clone dict here because we may alter the original dict inside the
        # loop.
        watched_modules = dict(self._watched_modules)

        # Remove no-longer-depended-on files from self._watched_modules
        # Will this ever happen?
        for filepath in watched_modules:
            if filepath not in local_filepaths:
                self._deregister_watcher(filepath)
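
The versions above only assume that FileWatcher(filepath, callback) watches a
single file and exposes close(). A minimal polling implementation of that
interface, written here as a sketch (Streamlit's real watchers use watchdog
events or smarter polling internally):

import os
import threading


class PollingFileWatcher:
    # Hypothetical stand-in for the FileWatcher interface assumed above.
    def __init__(self, filepath, on_changed, interval=1.0):
        self._filepath = filepath
        self._on_changed = on_changed
        self._interval = interval
        self._stop = threading.Event()
        self._mtime = os.path.getmtime(filepath)
        self._thread = threading.Thread(target=self._poll, daemon=True)
        self._thread.start()

    def _poll(self):
        while not self._stop.wait(self._interval):
            try:
                mtime = os.path.getmtime(self._filepath)
            except OSError:
                continue  # File may be mid-write or deleted; retry next tick.
            if mtime != self._mtime:
                self._mtime = mtime
                self._on_changed(self._filepath)

    def close(self):
        self._stop.set()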
Example #5
class LocalSourcesWatcher(object):
    def __init__(self, report, on_file_changed):
        self._report = report
        self._on_file_changed = on_file_changed
        self._is_closed = False

        # Blacklist for folders that should not be watched
        self._folder_black_list = FolderBlackList(
            config.get_option("server.folderWatchBlacklist"))

        # A dict of filepath -> WatchedModule.
        self._watched_modules = {}

        self._register_watcher(
            self._report.script_path,
            module_name=None,  # Only the root script has None here.
        )

    def on_file_changed(self, filepath):
        if filepath not in self._watched_modules:
            LOGGER.error("Received event for non-watched file", filepath)
            return

        wm = self._watched_modules[filepath]

        if wm.module_name is not None and wm.module_name in sys.modules:
            del sys.modules[wm.module_name]

        self._on_file_changed()

    def close(self):
        for wm in self._watched_modules.values():
            wm.watcher.close()
        self._watched_modules = {}
        self._is_closed = True

    def _register_watcher(self, filepath, module_name):
        wm = WatchedModule(watcher=FileWatcher(filepath, self.on_file_changed),
                           module_name=module_name)
        self._watched_modules[filepath] = wm

    def _deregister_watcher(self, filepath):
        if filepath not in self._watched_modules:
            return

        if filepath == self._report.script_path:
            return

        wm = self._watched_modules[filepath]
        wm.watcher.close()
        del self._watched_modules[filepath]

    def update_watched_modules(self):
        if self._is_closed:
            return

        local_filepaths = []

        # Clone modules dict here because we may alter the original dict inside
        # the loop.
        modules = dict(sys.modules)

        for name, module in modules.items():
            try:
                spec = getattr(module, "__spec__", None)

                if spec is None:
                    filepath = getattr(module, "__file__", None)
                    if filepath is None:
                        # Some modules have neither a spec nor a file. But we
                        # can ignore those since they're not the user-created
                        # modules we want to watch anyway.
                        continue
                else:
                    filepath = spec.origin

                if filepath is None:
                    # Built-in modules (and other stuff) don't have origins.
                    continue

                filepath = os.path.abspath(filepath)

                if not os.path.isfile(filepath):
                    # There are some modules that have a .origin, but don't
                    # point to real files. For example, there's a module where
                    # .origin is 'built-in'.
                    continue

                if self._folder_black_list.is_blacklisted(filepath):
                    continue

                file_is_new = filepath not in self._watched_modules
                file_is_local = util.file_is_in_folder_glob(
                    filepath, self._report.script_folder)

                local_filepaths.append(filepath)

                if file_is_local and file_is_new:
                    self._register_watcher(filepath, name)

            except Exception:
                # In case there's a problem introspecting some specific module,
                # let's not stop the entire loop from running.  For example,
                # the __spec__ field in some modules (like IPython) is actually
                # a dynamic property, which can crash if the underlying
                # module's code has a bug (as discovered by one of our users).
                continue

        # Clone dict here because we may alter the original dict inside the
        # loop.
        watched_modules = dict(self._watched_modules)

        # Remove no-longer-depended-on files from self._watched_modules
        # Will this ever happen?
        for filepath in watched_modules:
            if filepath not in local_filepaths:
                self._deregister_watcher(filepath)
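
The del sys.modules[...] trick in on_file_changed works because Python caches
imports in sys.modules: removing the cache entry forces the next import to
re-execute the module's source from disk. A small demonstration (the module
name is made up):

import importlib
import sys

mod = importlib.import_module("user_module")  # hypothetical module on disk
# ... user_module.py is edited ...
del sys.modules["user_module"]                # drop the cached module object
mod = importlib.import_module("user_module")  # re-executes the new source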