def directory_loaded_as_zipfile_path(dirpath: Path) -> ContextManager[Path]:
    """Yield -- but do not save -- a zipfile using the files in a directory.

    Use `dirpath.name` as `module_id`. Use "dir-{hex-sha1sum of zipfile}" as
    `version`. The yielded path is `{module_id}.{version}.zip`, inside the
    directory provided by `tempdir_context()`.

    Respect `.gitignore` to avoid importing too many files. A missing
    `.gitignore` means no file is ignored.

    Files are added in sorted path order: the version is a hash of the
    zipfile's bytes, so member order must not depend on the (unspecified)
    order in which the filesystem lists directory entries.

    The ModuleZipfile may not be valid. Use `validate_zipfile()` to test it.
    """
    try:
        with (dirpath / ".gitignore").open("rt", encoding="utf-8") as f:
            gitignore = pathspec.PathSpec.from_lines(
                pathspec.patterns.GitWildMatchPattern, f.readlines()
            )
    except FileNotFoundError:
        # No .gitignore: an empty spec matches (ignores) nothing.
        gitignore = pathspec.PathSpec([])

    module_id = dirpath.name
    with tempdir_context(prefix="importdir") as tempdir:
        # We can't know the version until the zipfile is complete, so write
        # under a placeholder name first.
        unversioned_zip_path = tempdir / f"{module_id}.develop.zip"
        with zipfile.ZipFile(unversioned_zip_path, mode="w") as zf:
            for path in sorted(dirpath.glob("**/*")):
                if not path.is_file():
                    continue
                relative_path = str(path.relative_to(dirpath))
                if not gitignore.match_file(relative_path):
                    zf.write(path, relative_path)

        # Rename so the filename carries the content-derived version.
        version = "dir-" + _hexsha1(unversioned_zip_path)
        zip_path = tempdir / f"{module_id}.{version}.zip"
        shutil.move(unversioned_zip_path, zip_path)
        yield zip_path
def test_happy_path(self):
    """A well-formed module zipfile imports as the expected clientside.Module."""
    spec_fields = dict(
        id_name="importmodule",
        name="Importable module",
        category="Clean",
        parameters=[],
    )
    expected = clientside.Module(spec=ModuleSpec(**spec_fields), js_module="")

    with tempdir_context() as tempdir:
        zip_path = tempdir / "importmodule.1.zip"
        # Build the zipfile: a spec file plus a minimal Python module.
        with zipfile.ZipFile(zip_path, mode="w") as zf:
            spec_bytes = json.dumps(spec_fields).encode("utf-8")
            zf.writestr("importmodule.yaml", spec_bytes)
            zf.writestr("importmodule.py", b"def render(table, params): return table")

        clientside_module = import_zipfile(zip_path)
        self.assertEqual(clientside_module, expected)
def import_module_from_github(
    owner: str, repo: str, ref: str = "main"
) -> clientside.Module:
    """Download module data from GitHub and store it in database+s3.

    The archive for `ref` is downloaded to a temporary directory, renamed to
    `{repo}.{sha1}.zip` (the sha1 comes from the zipfile comment) and handed
    to `import_zipfile()`.

    Return a `clientside.Module` on success.

    Raise `WorkbenchModuleImportError` if import fails -- including when
    `owner` is not "cjworkbench" (case-insensitive) or when the downloaded
    archive's comment is not a sha1.
    """
    if owner.lower() != "cjworkbench":
        raise WorkbenchModuleImportError(
            "Refusing to import: according to the GitHub URL, "
            "this module is not owned by 'cjworkbench'"
        )

    with tempdir_context(prefix="importmodule") as td:
        # Download to a tempfile, `download_path`
        download_path = td / "github-download.zip"
        _download_url(
            f"https://github.com/{owner}/{repo}/archive/{ref}.zip",
            download_path,
        )  # raise WorkbenchModuleImportError

        # Read the version (sha1) from zipfile and rename it to match the sha1.
        # (import_zipfile() reads sha1 from filename.)
        with zipfile.ZipFile(download_path, "r") as zf:
            sha1 = zf.comment.decode("latin1")  # cannot error
        # The comment is external data: validate with a real exception rather
        # than `assert`, which is stripped when Python runs with -O.
        if not SHA1_PATTERN.match(sha1):
            raise WorkbenchModuleImportError("GitHub archive comment must be sha1")

        path = td / f"{repo}.{sha1}.zip"
        download_path.rename(path)

        # Import the zipfile
        return import_zipfile(path)  # raise WorkbenchModuleImportError
def import_module_from_test_zip_url(url: str) -> clientside.Module:
    """Import a module from a zipfile hosted at a trusted URL.

    The text after the last "/" in `url` becomes the local filename; the
    file is downloaded into a temporary directory and passed to
    `import_zipfile()`.

    Return a `clientside.Module` on success.

    Raise `WorkbenchModuleImportError` if import fails.
    """
    # Everything after the final slash (or the whole URL if there is none).
    _, _, zipfile_name = url.rpartition("/")

    with tempdir_context(prefix="importmodule") as td:
        local_zip = td / zipfile_name
        _download_url(url, local_zip)  # raise WorkbenchModuleImportError
        module = import_zipfile(local_zip)  # raise WorkbenchModuleImportError
        return module
def test_SECURITY_provide_dir_readable(self):
    """Child code can read a file and a nested file exposed at /data."""
    # Source run by the spawned child. One statement per line, at column 0.
    child_code = r"""
from pathlib import Path
assert Path("/data/foo.txt").read_text() == "foo"
assert Path("/data/subdir/bar.bin").read_text() == "subbar"
"""
    with tempdir_context() as files:
        # Lay out the directory tree that will be provided to the chroot.
        files.chmod(0o755)
        (files / "foo.txt").write_text("foo")
        subdir = files / "subdir"
        subdir.mkdir(0o755)
        (subdir / "bar.bin").write_bytes(b"subbar")

        self._spawn_and_communicate_or_raise(
            child_code,
            chroot_dir=self.chroot_dir,
            chroot_provide_paths=[(Path("/data"), files)],
        )
def test_validate_detect_exec_error(self):
    """A module whose code fails when executed is rejected on import."""
    spec_bytes = json.dumps(
        dict(
            name="Exec-error Python",
            id_name="badpy",
            category="Clean",
            parameters=[],
        )
    ).encode("utf-8")

    with tempdir_context() as tempdir:
        zip_path = tempdir / "badpy.1.zip"
        with zipfile.ZipFile(zip_path, mode="w") as zf:
            zf.writestr("badpy.yaml", spec_bytes)
            # References an undefined name at module level.
            zf.writestr("badpy.py", b"print(badname)")

        with self.assertRaises(WorkbenchModuleImportError) as cm:
            import_zipfile(zip_path)

    cause = cm.exception.__cause__
    self.assertIsInstance(cause, ModuleExitedError)
def test_validate_invalid_spec(self):
    """A spec file lacking `id_name` is rejected with a ValueError cause."""
    # Note the misspelled key: "idname" instead of "id_name".
    bad_spec = (
        b'{"idname": "badyaml",'
        b'"name": "Missing id_name",'
        b'"category": "Clean",'
        b'"parameters": []}'
    )

    with tempdir_context() as tempdir:
        zip_path = tempdir / "badyaml.1.zip"
        with zipfile.ZipFile(zip_path, mode="w") as zf:
            zf.writestr("badyaml.yaml", bad_spec)
            zf.writestr("badyaml.py", "def render(table, params):\n return table")

        with self.assertRaises(WorkbenchModuleImportError) as cm:
            import_zipfile(zip_path)

    cause = cm.exception.__cause__
    self.assertIsInstance(cause, ValueError)
def test_validate_detect_python_syntax_errors(self):
    """A module whose Python file does not parse is rejected on import."""
    spec_bytes = json.dumps(
        dict(
            name="Syntax-error Python",
            id_name="badpy",
            category="Clean",
            parameters=[],
        )
    ).encode("utf-8")
    # Unbalanced parenthesis: this source cannot compile.
    broken_code = 'def render(table, params):\n cols = split(","'

    with tempdir_context() as tempdir:
        zip_path = tempdir / "badpy.1.zip"
        with zipfile.ZipFile(zip_path, mode="w") as zf:
            zf.writestr("badpy.yaml", spec_bytes)
            zf.writestr("badpy.py", broken_code)

        with self.assertRaises(WorkbenchModuleImportError) as cm:
            import_zipfile(zip_path)

    cause = cm.exception.__cause__
    self.assertIsInstance(cause, SyntaxError)
def test_load_dynamic(self):
    """Load a module's code from minio and render a one-cell table with it."""
    module_source = b"def render(table, params):\n return table * 2"
    minio.client.put_object(
        Bucket=minio.ExternalModulesBucket,
        Key="imported/abcdef/imported.py",
        Body=module_source,
        ContentLength=len(module_source),
    )

    module_version = MockModuleVersion(
        "imported", "abcdef", ParamDType.Dict({}), "now"
    )
    with self.assertLogs("cjwstate.modules.loaded_module"):
        lm = LoadedModule.for_module_version(module_version)

    self.assertEqual(lm.name, "imported:abcdef")

    # This ends up being kinda an integration test.
    with tempdir_context(prefix="test-basedir-") as raw_basedir:
        basedir = Path(raw_basedir)
        basedir.chmod(0o755)
        with arrow_table_context({"A": [1]}, dir=basedir) as input_table:
            input_table.path.chmod(0o644)
            with tempfile.NamedTemporaryFile(dir=basedir) as output_tf:
                with self.assertLogs("cjwstate.modules.loaded_module"):
                    output_filename = Path(output_tf.name).name
                    result = lm.render(
                        basedir=basedir,
                        input_table=input_table,
                        params=Params({"col": "A"}),
                        tab=Tab("tab-1", "Tab 1"),
                        fetch_result=None,
                        output_filename=output_filename,
                    )

                    expected = RenderResult(arrow_table({"A": [2]}))
                    assert_render_result_equals(result, expected)
def _chroot_dir_context(
    *, provide_paths: List[Path] = [], extract_paths: List[Path] = []
) -> ContextManager[Path]:
    """
    Prepare paths for forkserver's `chroot_dir` and `chroot_provide_paths`.

    Each of `provide_paths` is a file or directory we will expose to module
    code -- code with an effective UID/GID outside of 0-65535, so we can't
    transfer ownership to it. Each path within each `provide_path` will be
    temporarily set to other-readable. (TODO bind-mount instead of chroot, and
    somehow fiddle with ownership while mounting.)

    Each of `extract_paths` is an empty file that already exists, which we
    allow the module to write to. Each path will be set to world-writable
    within the chroot (so processes with effective UIDs outside of 0-65535 may
    write to it -- e.g., setuid-nonroot processes within forkserver's sandbox).
    Each of `extract_paths` must lie within one of `provide_paths`; otherwise
    a `KeyError` is raised.

    After the context exits -- normally or because of an exception -- the
    original owner and permissions are restored. Outputs are extracted from
    the chroot only when the `with` body completes without raising.

    The caller is expected to expose the `extract_path` through a
    `chroot_provide_paths` argument to forkserver. (For instance, if
    `extract_paths` includes /tmp/basedir/x.arrow, `chroot_provide_paths`
    should include /tmp/basedir or /tmp/basedir/x.arrow.)

    The default lists are only read, never mutated.

    TODO refactor chroot construction so it happens here, not in forkserver.
    The contents of the chroot really depend on the code being run -- in this
    case, code the kernel spawns.
    """
    with tempdir_context(prefix="kernel-chroot-") as chroot:
        chroot.chmod(0o755)

        # Original stat of every path we chmod, so we can undo our changes.
        old_stats: Dict[Path, os.stat_result] = {}

        try:
            for provide_path in provide_paths:
                for dirname, _, filenames in os.walk(provide_path):
                    dirpath = Path(dirname)
                    old_stat = dirpath.stat()
                    old_stats[dirpath] = old_stat
                    # Directories need other-execute so they can be traversed.
                    dirpath.chmod(
                        (old_stat.st_mode & 0o7777) | stat.S_IROTH | stat.S_IXOTH
                    )

                    for filename in filenames:
                        path = dirpath / filename
                        old_stat = path.stat()
                        old_stats[path] = old_stat
                        path.chmod((old_stat.st_mode & 0o7777) | stat.S_IROTH)

            for path in extract_paths:
                # read old_stat from cache, not from file! We changed the file.
                # KeyError? provide_paths+extract_paths disagree
                old_stat = old_stats[path]
                # make it writable
                path.chmod((old_stat.st_mode & 0o7777) | stat.S_IROTH | stat.S_IWOTH)

            yield chroot

            for path in extract_paths:
                # The module ran as a high-UID user. Extract its output from
                # the chroot and give it its original permissions. That way,
                # future module runs won't be allowed to write it (unless
                # old_stats says it was world-writable in the first place).
                _extract_from_chroot(chroot, path)
        finally:
            # Undo every change recorded so far, even if setup, the caller's
            # `with` body or extraction raised: loosened permissions must not
            # outlive this context.
            for path, old_stat in old_stats.items():
                # Restore original owner UID, GID
                os.chown(path, old_stat.st_uid, old_stat.st_gid)
                # Restore original permissions (ref: man inode(7))
                path.chmod(old_stat.st_mode & 0o7777)
async def fetch(*, workflow_id: int, wf_module_id: int, now: Optional[timezone.datetime] = None) -> None:
    """Run a fetch for one WfModule, save its result and schedule the next update.

    `workflow_id` and `wf_module_id` identify the database objects to load.
    `now` is the timestamp passed to the save and next-update calls; when it
    is None, `timezone.now()` is used.

    Return None. If the Workflow or WfModule no longer exists, log and return
    without fetching.
    """
    # 1. Load database objects
    #    - missing WfModule? Return prematurely
    #    - database error? _exit(1)
    # 2. Calculate result
    #    2a. Load module
    #       - no module? Result is user-visible error
    #       - load error? Result is user-visible error
    #       - compile error? Result is user-visible error
    #    2b. Build fetch kwargs
    #       - migrate_params() module error? Result is user-visible error
    #       - migrate_params() validation error? Result is user-visible error
    #    2c. Call fetch (no errors possible -- LoadedModule catches them)
    # 3. Save result (and send delta)
    #    - database errors? _exit(1)
    #    - other error (bug in `save`)? Log exception and ignore
    # 4. Update WfModule last-fetch time
    #    - database errors? _exit(1)
    with crash_on_database_error():
        logger.info("begin fetch(workflow_id=%d, wf_module_id=%d)", workflow_id, wf_module_id)

        try:
            (
                wf_module,
                module_version,
                stored_object,
                input_crr,
            ) = await load_database_objects(workflow_id, wf_module_id)
        except (Workflow.DoesNotExist, WfModule.DoesNotExist):
            logger.info("Skipping fetch of deleted WfModule %d-%d", workflow_id, wf_module_id)
            return

    # Prepare secrets -- mangle user values so modules have all they need.
    #
    # This can involve, e.g., HTTP request to OAuth2 token servers.
    #
    # TODO unit-test this code path
    if module_version is None:
        # No module version: there are no param fields to prepare secrets for.
        secrets = {}
    else:
        secrets = await fetcher.secrets.prepare_secrets(
            module_version.param_fields, wf_module.secrets)

    if now is None:
        now = timezone.now()

    # Everything registered on `ctx` (temp dir, output file, and whatever
    # `_stored_object_to_fetch_result()` registers) is cleaned up when this
    # block exits -- so the result must be saved inside it.
    with contextlib.ExitStack() as ctx:
        basedir = ctx.enter_context(tempdir_context(prefix="fetch-"))
        output_path = ctx.enter_context(
            tempfile_context(prefix="fetch-result-", dir=basedir))
        # get last_fetch_result (This can't error.)
        last_fetch_result = _stored_object_to_fetch_result(
            ctx, stored_object, wf_module.fetch_error, dir=basedir)
        # `fetch_or_wrap_error` is a synchronous callable: run it in the
        # event loop's default executor and await its result.
        result = await asyncio.get_event_loop().run_in_executor(
            None,
            fetch_or_wrap_error,
            ctx,
            basedir,
            wf_module,
            module_version,
            secrets,
            last_fetch_result,
            input_crr,
            output_path,
        )

        try:
            with crash_on_database_error():
                if last_fetch_result is not None and versions.are_fetch_results_equal(
                    last_fetch_result, result):
                    await save.mark_result_unchanged(workflow_id, wf_module, now)
                else:
                    await save.create_result(workflow_id, wf_module, result, now)
        except asyncio.CancelledError:
            # Never swallow cancellation in the catch-all handler below.
            raise
        except Exception:
            # Log exceptions but keep going.
            # TODO [adamhooper, 2019-09-12] really? I think we don't want this.
            # Make `fetch.save()` robust, then nix this handler
            logger.exception(f"Error fetching {wf_module}")

    with crash_on_database_error():
        await update_next_update_time(workflow_id, wf_module, now)
def setUp(self):
    """Open an ExitStack and enter a `tempdir_context()` as `self.basedir`."""
    super().setUp()
    stack = ExitStack()
    self.exit_stack = stack
    # The directory's context is tied to the stack stored on `self`.
    self.basedir = stack.enter_context(tempdir_context())
def setUp(self):
    """Enter a temp dir and a temp file inside it, both on `self.ctx`."""
    super().setUp()
    stack = contextlib.ExitStack()
    self.ctx = stack
    # Both contexts are registered on the stack stored on `self`.
    basedir = stack.enter_context(tempdir_context())
    self.basedir = basedir
    self.output_path = stack.enter_context(tempfile_context(dir=basedir))