Example 1
@contextlib.contextmanager
def directory_loaded_as_zipfile_path(dirpath: Path) -> ContextManager[Path]:
    """Yield -- but do not save -- a zipfile using the files in a directory.

    Use `dirpath.name` as `module_id`. Use "dir-{hex-sha1sum of zipfile}" as
    `version`.

    Respect `.gitignore` to avoid importing too many files.

    The ModuleZipfile may not be valid. Use `validate_zipfile()` to test it.
    """
    try:
        with (dirpath / ".gitignore").open("rt", encoding="utf-8") as f:
            gitignore = pathspec.PathSpec.from_lines(
                pathspec.patterns.GitWildMatchPattern, f.readlines()
            )
    except FileNotFoundError:
        gitignore = pathspec.PathSpec([])

    module_id = dirpath.name
    with tempdir_context(prefix="importdir") as tempdir:
        unversioned_zip_path = tempdir / f"{module_id}.develop.zip"
        with zipfile.ZipFile(unversioned_zip_path, mode="w") as zf:
            for path in dirpath.glob("**/*"):
                if path.is_file():
                    relative_path = str(path.relative_to(dirpath))
                    if not gitignore.match_file(relative_path):
                        zf.write(path, relative_path)

        version = "dir-" + _hexsha1(unversioned_zip_path)
        zip_path = tempdir / f"{module_id}.{version}.zip"

        shutil.move(unversioned_zip_path, zip_path)
        yield zip_path
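
A hypothetical usage sketch (not from the original source): the yielded zipfile lives inside the temporary directory created by `tempdir_context`, so it must be read or copied before the context exits. The module directory path below is invented for illustration.

# Hypothetical usage -- "modules/mymodule" is an invented path.
with directory_loaded_as_zipfile_path(Path("modules/mymodule")) as zip_path:
    # zip_path is named "mymodule.dir-<sha1>.zip" and is deleted when the
    # context exits, so consume it here (e.g. copy it somewhere durable).
    shutil.copy(zip_path, Path("/tmp") / zip_path.name)
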
    def test_happy_path(self):
        with tempdir_context() as tempdir:
            zip_path = tempdir / "importmodule.1.zip"
            with zipfile.ZipFile(zip_path, mode="w") as zf:
                zf.writestr(
                    "importmodule.yaml",
                    json.dumps(
                        dict(
                            id_name="importmodule",
                            name="Importable module",
                            category="Clean",
                            parameters=[],
                        )
                    ).encode("utf-8"),
                )
                zf.writestr(
                    "importmodule.py", b"def render(table, params): return table"
                )
            clientside_module = import_zipfile(zip_path)
        self.assertEqual(
            clientside_module,
            clientside.Module(
                spec=ModuleSpec(
                    id_name="importmodule",
                    name="Importable module",
                    category="Clean",
                    parameters=[],
                ),
                js_module="",
            ),
        )
Example 3
def import_module_from_github(
    owner: str, repo: str, ref: str = "main"
) -> clientside.Module:
    """Download module data from GitHub and store it in database+s3.

    Return a `clientside.Module` on success.

    Raise `WorkbenchModuleImportError` if import fails.
    """
    if owner.lower() != "cjworkbench":
        raise WorkbenchModuleImportError(
            "Refusing to import: according to the GitHub URL, "
            "this module is not owned by 'cjworkbench'"
        )

    with tempdir_context(prefix="importmodule") as td:
        # Download to a tempfile, `download_path`
        download_path = td / "github-download.zip"
        _download_url(
            "https://github.com/%s/%s/archive/%s.zip" % (owner, repo, ref),
            download_path,
        )  # raise WorkbenchModuleImportError

        # Read the version (sha1) from zipfile and rename it to match the sha1.
        # (import_zipfile() reads sha1 from filename.)
        with zipfile.ZipFile(download_path, "r") as zf:
            sha1 = zf.comment.decode("latin1")  # cannot error
            assert SHA1_PATTERN.match(sha1), "GitHub archive comment must be sha1"
        name = "%s.%s.zip" % (repo, sha1)
        path = td / name
        download_path.rename(path)

        # Import the zipfile
        return import_zipfile(path)  # raise WorkbenchModuleImportError
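
A hypothetical call (the repository name and error handling are invented); note that the owner check above compares case-insensitively, so any casing of "cjworkbench" is accepted:

try:
    module = import_module_from_github("CJWorkbench", "somemodule", "main")
except WorkbenchModuleImportError:
    logger.exception("Module import from GitHub failed")
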
Example 4
def import_module_from_test_zip_url(url: str) -> clientside.Module:
    """Download module data from a zipfile at a trusted URL.

    Return a `clientside.Module` on success.

    Raise `WorkbenchModuleImportError` if import fails.
    """
    zipfile_name = url.split("/")[-1]
    with tempdir_context(prefix="importmodule") as td:
        path = td / zipfile_name
        _download_url(url, path)  # raise WorkbenchModuleImportError
        return import_zipfile(path)  # raise WorkbenchModuleImportError
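
Since `import_zipfile()` derives the module id and version from the filename (see the comment in the GitHub importer above), the URL is expected to end in a name of the form `<id>.<version>.zip`. A hypothetical call:

module = import_module_from_test_zip_url(
    "https://example.com/modules/mymodule.1.zip"  # invented trusted URL
)
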
Example 5
    def test_SECURITY_provide_dir_readable(self):
        with tempdir_context() as files:
            files.chmod(0o755)
            (files / "foo.txt").write_text("foo")
            (files / "subdir").mkdir(0o755)
            (files / "subdir" / "bar.bin").write_bytes(b"subbar")

            self._spawn_and_communicate_or_raise(
                r"""
                from pathlib import Path

                assert Path("/data/foo.txt").read_text() == "foo"
                assert Path("/data/subdir/bar.bin").read_text() == "subbar"
                """,
                chroot_dir=self.chroot_dir,
                chroot_provide_paths=[(Path("/data"), files)],
            )
    def test_validate_detect_exec_error(self):
        with tempdir_context() as tempdir:
            zip_path = tempdir / "badpy.1.zip"
            with zipfile.ZipFile(zip_path, mode="w") as zf:
                zf.writestr(
                    "badpy.yaml",
                    json.dumps(
                        dict(
                            name="Exec-error Python",
                            id_name="badpy",
                            category="Clean",
                            parameters=[],
                        )
                    ).encode("utf-8"),
                )
                zf.writestr("badpy.py", b"print(badname)")
            with self.assertRaises(WorkbenchModuleImportError) as cm:
                import_zipfile(zip_path)
        self.assertIsInstance(cm.exception.__cause__, ModuleExitedError)

    def test_validate_invalid_spec(self):
        with tempdir_context() as tempdir:
            zip_path = tempdir / "badyaml.1.zip"
            with zipfile.ZipFile(zip_path, mode="w") as zf:
                zf.writestr(
                    "badyaml.yaml",
                    (
                        b"{"
                        b'"idname": "badyaml",'
                        b'"name": "Missing id_name",'
                        b'"category": "Clean",'
                        b'"parameters": []'
                        b"}"
                    ),
                )
                zf.writestr("badyaml.py", "def render(table, params):\n  return table")
            with self.assertRaises(WorkbenchModuleImportError) as cm:
                import_zipfile(zip_path)
        self.assertIsInstance(cm.exception.__cause__, ValueError)

    def test_validate_detect_python_syntax_errors(self):
        with tempdir_context() as tempdir:
            zip_path = tempdir / "badpy.1.zip"
            with zipfile.ZipFile(zip_path, mode="w") as zf:
                zf.writestr(
                    "badpy.yaml",
                    json.dumps(
                        dict(
                            name="Syntax-error Python",
                            id_name="badpy",
                            category="Clean",
                            parameters=[],
                        )
                    ).encode("utf-8"),
                )
                zf.writestr(
                    "badpy.py", 'def render(table, params):\n  cols = split(","'
                )
            with self.assertRaises(WorkbenchModuleImportError) as cm:
                import_zipfile(zip_path)
        self.assertIsInstance(cm.exception.__cause__, SyntaxError)
Example 9
    def test_load_dynamic(self):
        code = b"def render(table, params):\n    return table * 2"
        minio.client.put_object(
            Bucket=minio.ExternalModulesBucket,
            Key="imported/abcdef/imported.py",
            Body=code,
            ContentLength=len(code),
        )

        with self.assertLogs("cjwstate.modules.loaded_module"):
            lm = LoadedModule.for_module_version(
                MockModuleVersion("imported", "abcdef", ParamDType.Dict({}), "now")
            )

        self.assertEqual(lm.name, "imported:abcdef")

        # This ends up being kinda an integration test.
        with ExitStack() as ctx:
            basedir = Path(ctx.enter_context(tempdir_context(prefix="test-basedir-")))
            basedir.chmod(0o755)
            input_table = ctx.enter_context(
                arrow_table_context({"A": [1]}, dir=basedir)
            )
            input_table.path.chmod(0o644)
            output_tf = ctx.enter_context(tempfile.NamedTemporaryFile(dir=basedir))

            ctx.enter_context(self.assertLogs("cjwstate.modules.loaded_module"))

            result = lm.render(
                basedir=basedir,
                input_table=input_table,
                params=Params({"col": "A"}),
                tab=Tab("tab-1", "Tab 1"),
                fetch_result=None,
                output_filename=Path(output_tf.name).name,
            )

        assert_render_result_equals(result, RenderResult(arrow_table({"A": [2]})))
Example 10
@contextlib.contextmanager
def _chroot_dir_context(
    *, provide_paths: List[Path] = [], extract_paths: List[Path] = []
) -> ContextManager[Path]:
    """
    Prepare paths for forkserver's `chroot_dir` and `chroot_provide_paths`.

    Each of `provide_paths` is a file or directory we will expose to module
    code -- code with an effective UID/GID outside of 0-65535, so we can't
    transfer ownership to it. Each path within each `provide_path` will be
    temporarily set to other-readable. (TODO bind-mount instead of chroot,
    and somehow fiddle with ownership while mounting.)

    Each of `extract_paths` is an empty file that already exists, which we
    allow the module to write to. Each path will be set to world-writable
    within the chroot (so processes with effective UIDs outside of 0-65535 may
    write to it -- e.g., setuid-nonroot processes within forkserver's sandbox).
    After the context exits, the original permissions will be restored.

    The caller is expected to expose each of `extract_paths` through a
    `chroot_provide_paths` argument to forkserver. (For instance, if
    `extract_paths` includes /tmp/basedir/x.arrow, `chroot_provide_paths`
    should include /tmp/basedir or /tmp/basedir/x.arrow.)

    TODO refactor chroot construction so it happens here, not in forkserver.
    The contents of the chroot really depend on the code being run -- in this
    case, code the kernel spawns.
    """
    with tempdir_context(prefix="kernel-chroot-") as chroot:
        chroot.chmod(0o755)

        old_stats: Dict[Path, os.stat_result] = {}

        for provide_path in provide_paths:
            for dirname, _, filenames in os.walk(provide_path):
                dirpath = Path(dirname)
                old_stat = dirpath.stat()
                old_stats[dirpath] = old_stat
                dirpath.chmod(
                    (old_stat.st_mode & 0o7777) | stat.S_IROTH | stat.S_IXOTH
                )
                for filename in filenames:
                    path = dirpath / filename
                    old_stat = path.stat()
                    old_stats[path] = old_stat
                    path.chmod((old_stat.st_mode & 0o7777) | stat.S_IROTH)

        for path in extract_paths:
            # read old_stat from cache, not from file! We changed the file.
            old_stat = old_stats[path]  # KeyError? provide_paths+extract_paths disagree
            # make it writable
            path.chmod((old_stat.st_mode & 0o7777) | stat.S_IROTH | stat.S_IWOTH)

        yield chroot

        for path in extract_paths:
            # The module ran as a high-UID user. Extract its output from
            # the chroot and give it its original permissions. That way,
            # future module runs won't be allowed to write it (unless
            # old_stats says it was world-writable in the first place).
            _extract_from_chroot(chroot, path)

        for path, old_stat in old_stats.items():
            # Restore original owner UID, GID
            os.chown(path, old_stat.st_uid, old_stat.st_gid)
            # Restore original permissions (ref: man inode(7))
            path.chmod(old_stat.st_mode & 0o7777)
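
A minimal usage sketch, assuming only what the docstring above states; the file name is invented and the actual forkserver call is omitted:

with tempdir_context(prefix="basedir-") as basedir:
    input_path = basedir / "input.arrow"
    input_path.write_bytes(b"...")  # hypothetical module input
    with _chroot_dir_context(provide_paths=[basedir]) as chroot_dir:
        # While the context is open, basedir and everything under it are
        # other-readable, so a high-UID sandboxed process could read them.
        assert input_path.stat().st_mode & stat.S_IROTH
        # forkserver would be spawned here with chroot_dir=chroot_dir and a
        # chroot_provide_paths entry mapping basedir into the chroot.
    # On exit, the original modes and ownership are restored.
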
Example 11
async def fetch(*,
                workflow_id: int,
                wf_module_id: int,
                now: Optional[timezone.datetime] = None) -> None:
    # 1. Load database objects
    #    - missing WfModule? Return prematurely
    #    - database error? _exit(1)
    # 2. Calculate result
    #    2a. Load module
    #       - no module? Result is user-visible error
    #       - load error? Result is user-visible error
    #       - compile error? Result is user-visible error
    #    2b. Build fetch kwargs
    #       - migrate_params() module error? Result is user-visible error
    #       - migrate_params() validation error? Result is user-visible error
    #    2c. Call fetch (no errors possible -- LoadedModule catches them)
    # 3. Save result (and send delta)
    #    - database errors? _exit(1)
    #    - other error (bug in `save`)? Log exception and ignore
    # 4. Update WfModule last-fetch time
    #    - database errors? _exit(1)
    with crash_on_database_error():
        logger.info("begin fetch(workflow_id=%d, wf_module_id=%d)",
                    workflow_id, wf_module_id)

        try:
            (
                wf_module,
                module_version,
                stored_object,
                input_crr,
            ) = await load_database_objects(workflow_id, wf_module_id)
        except (Workflow.DoesNotExist, WfModule.DoesNotExist):
            logger.info("Skipping fetch of deleted WfModule %d-%d",
                        workflow_id, wf_module_id)
            return

    # Prepare secrets -- mangle user values so modules have all they need.
    #
    # This can involve, e.g., HTTP request to OAuth2 token servers.
    #
    # TODO unit-test this code path
    if module_version is None:
        secrets = {}
    else:
        secrets = await fetcher.secrets.prepare_secrets(
            module_version.param_fields, wf_module.secrets)

    if now is None:
        now = timezone.now()

    with contextlib.ExitStack() as ctx:
        basedir = ctx.enter_context(tempdir_context(prefix="fetch-"))
        output_path = ctx.enter_context(
            tempfile_context(prefix="fetch-result-", dir=basedir))
        # get last_fetch_result (This can't error.)
        last_fetch_result = _stored_object_to_fetch_result(
            ctx, stored_object, wf_module.fetch_error, dir=basedir)
        result = await asyncio.get_event_loop().run_in_executor(
            None,
            fetch_or_wrap_error,
            ctx,
            basedir,
            wf_module,
            module_version,
            secrets,
            last_fetch_result,
            input_crr,
            output_path,
        )

        try:
            with crash_on_database_error():
                if last_fetch_result is not None and versions.are_fetch_results_equal(
                        last_fetch_result, result):
                    await save.mark_result_unchanged(workflow_id, wf_module,
                                                     now)
                else:
                    await save.create_result(workflow_id, wf_module, result,
                                             now)
        except asyncio.CancelledError:
            raise
        except Exception:
            # Log exceptions but keep going.
            # TODO [adamhooper, 2019-09-12] really? I think we don't want this.
            # Make `fetch.save()` robust, then nix this handler.
            logger.exception(f"Error fetching {wf_module}")

    with crash_on_database_error():
        await update_next_update_time(workflow_id, wf_module, now)
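
A hypothetical invocation (the IDs are invented; in production the arguments come from whatever schedules fetches):

await fetch(workflow_id=123, wf_module_id=456)
# or pin the timestamp passed to save/update_next_update_time:
await fetch(workflow_id=123, wf_module_id=456, now=timezone.now())
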
    def setUp(self):
        super().setUp()
        self.exit_stack = ExitStack()
        self.basedir = self.exit_stack.enter_context(tempdir_context())
Example 13
    def setUp(self):
        super().setUp()
        self.ctx = contextlib.ExitStack()
        self.basedir = self.ctx.enter_context(tempdir_context())
        self.output_path = self.ctx.enter_context(tempfile_context(dir=self.basedir))
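
The matching tearDown is not shown in this excerpt. A setUp that enters an ExitStack needs one (or an `addCleanup` call) so the temporary files are removed after each test; a minimal sketch, assuming nothing beyond the attributes created above:

    def tearDown(self):
        self.ctx.close()  # exits tempfile_context/tempdir_context, deleting the files
        super().tearDown()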