def _file_should_be_hashed(self, filename: str) -> bool:
    global _FOLDER_BLACK_LIST

    if not _FOLDER_BLACK_LIST:
        _FOLDER_BLACK_LIST = FolderBlackList(
            config.get_option("server.folderWatchBlacklist")
        )

    filepath = os.path.abspath(filename)
    file_is_blacklisted = _FOLDER_BLACK_LIST.is_blacklisted(filepath)
    # Short circuiting for performance.
    if file_is_blacklisted:
        return False
    return file_util.file_is_in_folder_glob(
        filepath, self._get_main_script_directory()
    ) or file_util.file_in_pythonpath(filepath)
def test_do_blacklist_user_configured_folders(self):
    """Files inside user-configured folders should be blacklisted."""
    folder_black_list = FolderBlackList(["/bar/some_folder"])
    is_blacklisted = folder_black_list.is_blacklisted

    self.assertTrue(is_blacklisted("/bar/some_folder/script.py"))
def __init__(self, name="md5", hasher=None):
    self.hashes = dict()

    self.name = name

    # The number of bytes in the hash.
    self.size = 0

    # An ever-increasing counter.
    self._counter = 0

    if hasher:
        self.hasher = hasher
    else:
        self.hasher = hashlib.new(name)

    self._folder_black_list = FolderBlackList(
        config.get_option("server.folderWatchBlacklist")
    )
def test_do_not_blacklist(self):
    """Ensure we're not accidentally blacklisting things we shouldn't be."""
    folder_black_list = FolderBlackList([])
    is_blacklisted = folder_black_list.is_blacklisted

    self.assertFalse(is_blacklisted("/foo/not_blacklisted/script.py"))
    self.assertFalse(is_blacklisted("/foo/not_blacklisted/.hidden_script.py"))
def test_do_blacklist(self):
    """miniconda, anaconda, and .*/ folders should be blacklisted."""
    folder_black_list = FolderBlackList([])
    is_blacklisted = folder_black_list.is_blacklisted

    self.assertTrue(is_blacklisted("/foo/miniconda2/script.py"))
    self.assertTrue(is_blacklisted("/foo/miniconda3/script.py"))
    self.assertTrue(is_blacklisted("/foo/anaconda2/script.py"))
    self.assertTrue(is_blacklisted("/foo/anaconda3/script.py"))
    self.assertTrue(is_blacklisted("/foo/.virtualenv/script.py"))
    self.assertTrue(is_blacklisted("/foo/.venv/script.py"))
    self.assertTrue(is_blacklisted("/foo/.random_hidden_folder/script.py"))
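# The three tests above pin down FolderBlackList's contract: user-configured
# folders are always blacklisted; well-known environment folders (miniconda*,
# anaconda*) and hidden "." directories are blacklisted by default; hidden
# *files* in ordinary folders are not. The class itself is not shown in this
# section, so what follows is only a minimal sketch that satisfies those
# tests, assuming simple fnmatch-based matching. The real implementation may
# differ.

import fnmatch
import os

# Directory-name patterns that are skipped by default (assumed).
_DEFAULT_FOLDER_PATTERNS = ["miniconda*", "anaconda*", ".*"]


class FolderBlackListSketch:
    def __init__(self, folder_blacklist):
        self._configured = [os.path.abspath(p) for p in folder_blacklist]

    def is_blacklisted(self, filepath):
        filepath = os.path.abspath(filepath)

        # Any user-configured folder that is an ancestor of the path wins.
        if any(filepath.startswith(folder + os.sep) for folder in self._configured):
            return True

        # Otherwise match each *directory* component against the default
        # patterns. The final path component is the file itself, so hidden
        # files like ".hidden_script.py" are deliberately not matched.
        dir_parts = os.path.dirname(filepath).split(os.sep)
        return any(
            fnmatch.fnmatch(part, pattern)
            for part in dir_parts
            for pattern in _DEFAULT_FOLDER_PATTERNS
        )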
class CodeHasher:
    """A hasher that can hash code objects including dependencies."""

    def __init__(self, name="md5", hasher=None, hash_funcs=None):
        self.hashes = dict()

        self.name = name

        # The number of bytes in the hash.
        self.size = 0

        # An ever-increasing counter.
        self._counter = 0

        if hasher:
            self.hasher = hasher
        else:
            self.hasher = hashlib.new(name)

        self._folder_black_list = FolderBlackList(
            config.get_option("server.folderWatchBlacklist")
        )

        self.hash_funcs = hash_funcs or {}

    def update(self, obj, context=None):
        """Update the hash with the provided object."""
        self._update(self.hasher, obj, context)

    def digest(self):
        return self.hasher.digest()

    def hexdigest(self):
        return self.hasher.hexdigest()

    def to_bytes(self, obj, context=None):
        """Add memoization to _to_bytes and protect against cycles in data
        structures."""
        key = _key(obj, context)

        if key is not None:
            if key in self.hashes:
                return self.hashes[key]

            # Add a tombstone hash to break recursive calls.
            self._counter += 1
            self.hashes[key] = _int_to_bytes(self._counter)

        if obj in hash_stacks:
            return CYCLE_PLACEHOLDER

        hash_stacks.push(obj)

        try:
            b = self._to_bytes(obj, context)

            self.size += sys.getsizeof(b)

            if key is not None:
                self.hashes[key] = b
        finally:
            # In case an UnhashableType (or other) error is thrown, clean up
            # the stack so we don't get false positives in future hashing
            # calls.
            hash_stacks.pop()

        return b

    def _update(self, hasher, obj, context=None):
        """Update the provided hasher with the hash of an object."""
        b = self.to_bytes(obj, context)
        hasher.update(b)

    def _file_should_be_hashed(self, filename):
        filepath = os.path.abspath(filename)
        file_is_blacklisted = self._folder_black_list.is_blacklisted(filepath)
        # Short circuiting for performance.
        if file_is_blacklisted:
            return False
        return file_util.file_is_in_folder_glob(
            filepath, self._get_main_script_directory()
        ) or file_util.file_in_pythonpath(filepath)

    def _to_bytes(self, obj, context):
        """Hash objects to bytes, including code with dependencies.

        Python's built-in `hash` does not produce consistent results across
        runs.
        """
        try:
            if _is_magicmock(obj):
                # MagicMock can result in objects that appear to be infinitely
                # deep, so we don't try to hash them at all.
                return self.to_bytes(id(obj))
            elif isinstance(obj, bytes) or isinstance(obj, bytearray):
                return obj
            elif isinstance(obj, string_types):  # noqa: F821
                # Don't allow the user to override string since
                # str == bytes on python 2.
                return obj.encode()
            elif type(obj) in self.hash_funcs:
                # Escape hatch for unsupported objects.
                return self.to_bytes(self.hash_funcs[type(obj)](obj))
            elif isinstance(obj, float):
                return self.to_bytes(hash(obj))
            elif isinstance(obj, int):
                return _int_to_bytes(obj)
            elif isinstance(obj, list) or isinstance(obj, tuple):
                h = hashlib.new(self.name)
                # Hash the name of the container so that ["a"] hashes
                # differently from ("a",). Otherwise we'd only be hashing the
                # data and the hashes would be the same.
                self._update(h, type(obj).__name__.encode() + b":")
                for e in obj:
                    self._update(h, e, context)
                return h.digest()
            elif isinstance(obj, dict):
                h = hashlib.new(self.name)
                self._update(h, type(obj).__name__.encode() + b":")
                for e in obj.items():
                    self._update(h, e, context)
                return h.digest()
            elif obj is None:
                # Special string since hashes change between sessions.
                # We don't use Python's `hash` since hashes are not consistent
                # across runs.
                return NONESENSE
            elif obj is True:
                return b"bool:1"
            elif obj is False:
                return b"bool:0"
            elif type_util.is_type(
                obj, "pandas.core.frame.DataFrame"
            ) or type_util.is_type(obj, "pandas.core.series.Series"):
                import pandas as pd

                if len(obj) >= PANDAS_ROWS_LARGE:
                    obj = obj.sample(n=PANDAS_SAMPLE_SIZE, random_state=0)
                try:
                    # Convert the (integer) pandas hash to bytes, since
                    # _to_bytes must return bytes.
                    return b"%s" % pd.util.hash_pandas_object(obj).sum()
                except TypeError:
                    # Use pickle if pandas cannot hash the object, for example
                    # if it contains unhashable objects.
                    return pickle.dumps(obj, pickle.HIGHEST_PROTOCOL)
            elif type_util.is_type(obj, "numpy.ndarray"):
                h = hashlib.new(self.name)
                self._update(h, obj.shape)

                if obj.size >= NP_SIZE_LARGE:
                    import numpy as np

                    state = np.random.RandomState(0)
                    obj = state.choice(obj.flat, size=NP_SAMPLE_SIZE)

                self._update(h, obj.tobytes())
                return h.digest()
            elif inspect.isbuiltin(obj):
                return self.to_bytes(obj.__name__)
            elif hasattr(obj, "name") and (
                isinstance(obj, io.IOBase)
                # Handle temporary files used during testing.
                or isinstance(obj, tempfile._TemporaryFileWrapper)
                or (not compatibility.is_running_py3() and isinstance(obj, file))
            ):
                # Hash files as name + last modification date + offset.
                h = hashlib.new(self.name)
                self._update(h, obj.name)
                self._update(h, os.path.getmtime(obj.name))
                self._update(h, obj.tell())
                return h.digest()
            elif inspect.isroutine(obj):
                if hasattr(obj, "__wrapped__"):
                    # Ignore the wrapper of wrapped functions.
                    return self.to_bytes(obj.__wrapped__)

                if obj.__module__.startswith("streamlit"):
                    # Ignore streamlit modules even if they are in the CWD
                    # (e.g. during development).
                    return self.to_bytes("%s.%s" % (obj.__module__, obj.__name__))

                h = hashlib.new(self.name)
                if self._file_should_be_hashed(obj.__code__.co_filename):
                    context = _get_context(obj)
                    if obj.__defaults__:
                        self._update(h, obj.__defaults__, context)
                    h.update(self._code_to_bytes(obj.__code__, context))
                else:
                    # Don't hash code that is not in the current working
                    # directory.
                    self._update(h, obj.__module__)
                    self._update(h, obj.__name__)
                return h.digest()
            elif inspect.iscode(obj):
                return self._code_to_bytes(obj, context)
            elif inspect.ismodule(obj):
                # TODO: Figure out how to best show this kind of warning to
                # the user. In the meantime, show nothing. This scenario is
                # too common, so the current warning is quite annoying...
                # st.warning(('Streamlit does not support hashing modules. '
                #             'We did not hash `%s`.') % obj.__name__)
                # TODO: Hash more than just the name for internal modules.
                return self.to_bytes(obj.__name__)
            elif inspect.isclass(obj):
                # TODO: Figure out how to best show this kind of warning to
                # the user. In the meantime, show nothing. This scenario is
                # too common (e.g. in every "except" statement), so the
                # current warning is quite annoying...
                # st.warning(('Streamlit does not support hashing classes. '
                #             'We did not hash `%s`.') % obj.__name__)
                # TODO: Hash more than just the name of classes.
                return self.to_bytes(obj.__name__)
            elif isinstance(obj, functools.partial):
                # The return value of functools.partial is not a plain
                # function: it's a callable object that remembers the original
                # function plus the values you pickled into it. So here we
                # need to special-case it.
                h = hashlib.new(self.name)
                self._update(h, obj.args)
                self._update(h, obj.func)
                self._update(h, obj.keywords)
                return h.digest()
            else:
                # As a last resort, hash the output of the object's
                # __reduce__ method.
                h = hashlib.new(self.name)
                self._update(h, type(obj).__name__.encode() + b":")
                for e in obj.__reduce__():
                    self._update(h, e, context)
                return h.digest()
        except UnhashableType as e:
            raise e
        except Exception as e:
            LOGGER.error(e)
            msg = _hashing_error_message(type(obj))
            raise UnhashableType(msg)

    def _code_to_bytes(self, code, context):
        h = hashlib.new(self.name)

        # Hash the bytecode.
        self._update(h, code.co_code)

        # Hash constants that are referenced by the bytecode, but ignore
        # names of lambdas.
        consts = [
            n
            for n in code.co_consts
            if not isinstance(n, string_types)  # noqa: F821
            or not n.endswith(".<lambda>")
        ]
        self._update(h, consts, context)

        # Hash non-local names and functions referenced by the bytecode.
        if hasattr(dis, "get_instructions"):
            # get_instructions is new since Python 3.4.
            for ref in get_referenced_objects(code, context):
                self._update(h, ref, context)
        else:
            # This won't correctly follow nested calls like `foo.bar.baz()`.
            for name in code.co_names:
                if name in context.globals:
                    try:
                        self._update(h, context.globals[name], context)
                    except Exception:
                        self._update(h, name)
                else:
                    try:
                        module = importlib.import_module(name)
                        self._update(h, module, context)
                    except ImportError:
                        self._update(h, name, context)

            for name, value in context.cells.items():
                try:
                    self._update(h, value, context)
                except Exception:
                    self._update(h, name)

        return h.digest()

    @staticmethod
    def _get_main_script_directory():
        """Get the directory of the main script."""
        import __main__
        import os

        # This works because we set __main__.__file__ to the report
        # script path in ScriptRunner.
        main_path = __main__.__file__
        return os.path.dirname(main_path)
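# A hedged usage sketch for the CodeHasher class above (it assumes the
# surrounding module's imports and helpers are available): hashing a function
# walks its bytecode and everything it references, producing a digest that is
# stable across interpreter runs, unlike the built-in hash(). The hash_funcs
# parameter is the escape hatch for unsupported types; the commented
# DBConnection entry is a hypothetical illustration, not a real type.

def get_data(query):
    return query.upper()


hasher = CodeHasher(
    name="md5",
    hash_funcs={
        # Hypothetical: reduce an otherwise-unhashable object to a stable key.
        # DBConnection: lambda conn: conn.connection_string,
    },
)
hasher.update(get_data)
cache_key = hasher.hexdigest()  # same code + same references => same key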
class LocalSourcesWatcher(object):
    def __init__(self, report, on_file_changed):
        self._report = report
        self._on_file_changed = on_file_changed
        self._is_closed = False

        # Blacklist for folders that should not be watched.
        self._folder_black_list = FolderBlackList(
            config.get_option("server.folderWatchBlacklist")
        )

        # A dict of filepath -> WatchedModule.
        self._watched_modules = {}

        self._register_watcher(
            self._report.script_path,
            module_name=None,  # Only the root script has None here.
        )

    def on_file_changed(self, filepath):
        if filepath not in self._watched_modules:
            LOGGER.error("Received event for non-watched file: %s", filepath)
            return

        # Workaround:
        # Delete all watched modules so we can guarantee changes to the
        # updated module are reflected on reload.
        #
        # In principle, for reloading a given module, we only need to unload
        # the module itself and all of the modules which import it (directly
        # or indirectly) such that when we exec the application code, the
        # changes are reloaded and reflected in the running application.
        #
        # However, determining all import paths for a given loaded module is
        # non-trivial, and so as a workaround we simply unload all watched
        # modules. (A minimal demonstration of this sys.modules behavior
        # follows this class.)
        for wm in self._watched_modules.values():
            if wm.module_name is not None and wm.module_name in sys.modules:
                del sys.modules[wm.module_name]

        self._on_file_changed()

    def close(self):
        for wm in self._watched_modules.values():
            wm.watcher.close()
        self._watched_modules = {}
        self._is_closed = True

    def _register_watcher(self, filepath, module_name):
        if compatibility.is_running_py3():
            ErrorType = PermissionError
        else:
            ErrorType = OSError

        try:
            wm = WatchedModule(
                watcher=FileWatcher(filepath, self.on_file_changed),
                module_name=module_name,
            )
        except ErrorType:
            # If you don't have permission to read this file, don't even add
            # it to watchers.
            return

        self._watched_modules[filepath] = wm

    def _deregister_watcher(self, filepath):
        if filepath not in self._watched_modules:
            return

        if filepath == self._report.script_path:
            return

        wm = self._watched_modules[filepath]
        wm.watcher.close()
        del self._watched_modules[filepath]

    def update_watched_modules(self):
        if self._is_closed:
            return

        local_filepaths = []

        # Clone modules dict here because we may alter the original dict
        # inside the loop.
        modules = dict(sys.modules)

        for name, module in modules.items():
            try:
                spec = getattr(module, "__spec__", None)

                if spec is None:
                    filepath = getattr(module, "__file__", None)
                    if filepath is None:
                        # Some modules have neither a spec nor a file. But we
                        # can ignore those since they're not the user-created
                        # modules we want to watch anyway.
                        continue
                else:
                    filepath = spec.origin

                if filepath is None:
                    # Built-in modules (and other stuff) don't have origins.
                    continue

                filepath = os.path.abspath(filepath)

                if not os.path.isfile(filepath):
                    # There are some modules that have a .origin, but don't
                    # point to real files. For example, there's a module where
                    # .origin is 'built-in'.
                    continue

                if self._folder_black_list.is_blacklisted(filepath):
                    continue

                file_is_new = filepath not in self._watched_modules
                file_is_local = util.file_is_in_folder_glob(
                    filepath, self._report.script_folder
                )

                local_filepaths.append(filepath)

                if file_is_local and file_is_new:
                    self._register_watcher(filepath, name)

            except Exception:
                # In case there's a problem introspecting some specific
                # module, let's not stop the entire loop from running. For
                # example, the __spec__ field in some modules (like IPython)
                # is actually a dynamic property, which can crash if the
                # underlying module's code has a bug (as discovered by one of
                # our users).
                continue

        # Clone dict here because we may alter the original dict inside the
        # loop.
        watched_modules = dict(self._watched_modules)

        # Remove no-longer-depended-on files from self._watched_modules.
        # Will this ever happen?
        for filepath in watched_modules:
            if filepath not in local_filepaths:
                self._deregister_watcher(filepath)
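# The "unload all watched modules" workaround above leans on standard Python
# import machinery: deleting an entry from sys.modules forces the next import
# to re-execute the module from source and bind a fresh module object. A
# minimal, self-contained demonstration using the stdlib json module:

import importlib
import sys

import json

first = sys.modules["json"]
del sys.modules["json"]

# With the cache entry gone, import_module re-executes json from source.
reloaded = importlib.import_module("json")
assert reloaded is not first  # a brand-new module object was created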
# If a dataframe has more than this many rows, we consider it large and hash
# a sample instead.
_PANDAS_ROWS_LARGE = 100000
_PANDAS_SAMPLE_SIZE = 10000

# Similar to dataframes, we also sample large numpy arrays.
_NP_SIZE_LARGE = 1000000
_NP_SAMPLE_SIZE = 100000

# Arbitrary item to denote where we found a cycle in a hashed object.
# This allows us to hash self-referencing lists, dictionaries, etc.
_CYCLE_PLACEHOLDER = b"streamlit-57R34ML17-hesamagicalponyflyingthroughthesky-CYCLE"

_FOLDER_BLACK_LIST = FolderBlackList(config.get_option("server.folderWatchBlacklist"))

# FFI objects (objects that interface with C libraries) can be any of these
# types:
_FFI_TYPE_NAMES = [
    "_cffi_backend.FFI",
    "builtins.CompiledFFI",
]

# Keras objects can be any of these types:
_KERAS_TYPE_NAMES = [
    "keras.engine.training.Model",
    "tensorflow.python.keras.engine.training.Model",
    "tensorflow.python.keras.engine.functional.Functional",
]
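# The thresholds above gate a deterministic sampling step: rather than hash
# every row of a very large DataFrame or every element of a very large
# ndarray, the hasher hashes a fixed-size sample drawn with a fixed seed, so
# identical data always produces the identical sample and therefore the
# identical digest. A minimal sketch of that sampling logic in isolation:

import numpy as np
import pandas as pd

df = pd.DataFrame({"x": range(200000)})
if len(df) >= _PANDAS_ROWS_LARGE:
    # random_state=0 keeps the sample (and hence the hash) reproducible.
    df = df.sample(n=_PANDAS_SAMPLE_SIZE, random_state=0)

arr = np.arange(2000000)
if arr.size >= _NP_SIZE_LARGE:
    # A seeded RandomState gives the same element sample on every run.
    state = np.random.RandomState(0)
    arr = state.choice(arr.flat, size=_NP_SAMPLE_SIZE)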
class LocalSourcesWatcher:
    def __init__(self, session_data: SessionData):
        self._session_data = session_data
        self._on_file_changed: List[Callable[[], None]] = []
        self._is_closed = False

        # Blacklist for folders that should not be watched.
        self._folder_black_list = FolderBlackList(
            config.get_option("server.folderWatchBlacklist")
        )

        self._watched_modules: Dict[str, WatchedModule] = {}

        self._register_watcher(
            self._session_data.script_path,
            module_name=None,  # Only the root script has None here.
        )

    def register_file_change_callback(self, cb: Callable[[], None]) -> None:
        self._on_file_changed.append(cb)

    def on_file_changed(self, filepath):
        if filepath not in self._watched_modules:
            LOGGER.error("Received event for non-watched file: %s", filepath)
            return

        # Workaround:
        # Delete all watched modules so we can guarantee changes to the
        # updated module are reflected on reload.
        #
        # In principle, for reloading a given module, we only need to unload
        # the module itself and all of the modules which import it (directly
        # or indirectly) such that when we exec the application code, the
        # changes are reloaded and reflected in the running application.
        #
        # However, determining all import paths for a given loaded module is
        # non-trivial, and so as a workaround we simply unload all watched
        # modules.
        for wm in self._watched_modules.values():
            if wm.module_name is not None and wm.module_name in sys.modules:
                del sys.modules[wm.module_name]

        for cb in self._on_file_changed:
            cb()

    def close(self):
        for wm in self._watched_modules.values():
            wm.watcher.close()
        self._watched_modules = {}
        self._is_closed = True

    def _register_watcher(self, filepath, module_name):
        global FileWatcher
        if FileWatcher is None:
            FileWatcher = get_default_file_watcher_class()

        if FileWatcher is NoOpFileWatcher:
            return

        try:
            wm = WatchedModule(
                watcher=FileWatcher(filepath, self.on_file_changed),
                module_name=module_name,
            )
        except PermissionError:
            # If you don't have permission to read this file, don't even add
            # it to watchers.
            return

        self._watched_modules[filepath] = wm

    def _deregister_watcher(self, filepath):
        if filepath not in self._watched_modules:
            return

        if filepath == self._session_data.script_path:
            return

        wm = self._watched_modules[filepath]
        wm.watcher.close()
        del self._watched_modules[filepath]

    def _file_is_new(self, filepath):
        return filepath not in self._watched_modules

    def _file_should_be_watched(self, filepath):
        # Using short circuiting for performance.
        return self._file_is_new(filepath) and (
            file_util.file_is_in_folder_glob(
                filepath, self._session_data.script_folder
            )
            or file_util.file_in_pythonpath(filepath)
        )

    def update_watched_modules(self):
        if self._is_closed:
            return

        modules_paths = {
            name: self._exclude_blacklisted_paths(get_module_paths(module))
            for name, module in dict(sys.modules).items()
        }

        self._register_necessary_watchers(modules_paths)

    def _register_necessary_watchers(
        self, module_paths: Dict[str, Set[str]]
    ) -> None:
        for name, paths in module_paths.items():
            for path in paths:
                if self._file_should_be_watched(path):
                    self._register_watcher(path, name)

    def _exclude_blacklisted_paths(self, paths: Set[str]) -> Set[str]:
        return {p for p in paths if not self._folder_black_list.is_blacklisted(p)}
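# A hedged usage sketch for the newer LocalSourcesWatcher API above:
# construct it from a SessionData, register a rerun callback, and re-scan
# sys.modules after each script run so newly imported local modules get
# watched too. `session_data` and `rerun_script` are hypothetical stand-ins
# for illustration only.

watcher = LocalSourcesWatcher(session_data)
watcher.register_file_change_callback(rerun_script)

# After every script execution, pick up any newly imported local files:
watcher.update_watched_modules()

# On session shutdown, release all file watchers:
watcher.close()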
class LocalSourcesWatcher(object):
    def __init__(self, report, on_file_changed):
        self._report = report
        self._on_file_changed = on_file_changed
        self._is_closed = False

        # Blacklist for folders that should not be watched.
        self._folder_black_list = FolderBlackList(
            config.get_option("server.folderWatchBlacklist")
        )

        # A dict of filepath -> WatchedModule.
        self._watched_modules = {}

        self._register_watcher(
            self._report.script_path,
            module_name=None,  # Only the root script has None here.
        )

    def on_file_changed(self, filepath):
        if filepath not in self._watched_modules:
            LOGGER.error("Received event for non-watched file: %s", filepath)
            return

        wm = self._watched_modules[filepath]
        if wm.module_name is not None and wm.module_name in sys.modules:
            del sys.modules[wm.module_name]

        self._on_file_changed()

    def close(self):
        for wm in self._watched_modules.values():
            wm.watcher.close()
        self._watched_modules = {}
        self._is_closed = True

    def _register_watcher(self, filepath, module_name):
        wm = WatchedModule(
            watcher=FileWatcher(filepath, self.on_file_changed),
            module_name=module_name,
        )
        self._watched_modules[filepath] = wm

    def _deregister_watcher(self, filepath):
        if filepath not in self._watched_modules:
            return

        if filepath == self._report.script_path:
            return

        wm = self._watched_modules[filepath]
        wm.watcher.close()
        del self._watched_modules[filepath]

    def update_watched_modules(self):
        if self._is_closed:
            return

        local_filepaths = []

        # Clone modules dict here because we may alter the original dict
        # inside the loop.
        modules = dict(sys.modules)

        for name, module in modules.items():
            try:
                spec = getattr(module, "__spec__", None)

                if spec is None:
                    filepath = getattr(module, "__file__", None)
                    if filepath is None:
                        # Some modules have neither a spec nor a file. But we
                        # can ignore those since they're not the user-created
                        # modules we want to watch anyway.
                        continue
                else:
                    filepath = spec.origin

                if filepath is None:
                    # Built-in modules (and other stuff) don't have origins.
                    continue

                filepath = os.path.abspath(filepath)

                if not os.path.isfile(filepath):
                    # There are some modules that have a .origin, but don't
                    # point to real files. For example, there's a module where
                    # .origin is 'built-in'.
                    continue

                if self._folder_black_list.is_blacklisted(filepath):
                    continue

                file_is_new = filepath not in self._watched_modules
                file_is_local = util.file_is_in_folder_glob(
                    filepath, self._report.script_folder
                )

                local_filepaths.append(filepath)

                if file_is_local and file_is_new:
                    self._register_watcher(filepath, name)

            except Exception:
                # In case there's a problem introspecting some specific
                # module, let's not stop the entire loop from running. For
                # example, the __spec__ field in some modules (like IPython)
                # is actually a dynamic property, which can crash if the
                # underlying module's code has a bug (as discovered by one of
                # our users).
                continue

        # Clone dict here because we may alter the original dict inside the
        # loop.
        watched_modules = dict(self._watched_modules)

        # Remove no-longer-depended-on files from self._watched_modules.
        # Will this ever happen?
        for filepath in watched_modules:
            if filepath not in local_filepaths:
                self._deregister_watcher(filepath)