def import_zipfile(path: Path) -> clientside.Module:
    """
    Save a zipfile to database and minio and build a `clientside.Module`.

    The zipfile is validated first, so nothing is uploaded or written to the
    database when validation fails. The file is then uploaded to
    `minio.ExternalModulesBucket` under the key "{module_id}/{path.name}" and
    a `ModuleVersion` row is looked up (or created) from the module id,
    version, spec and JS module.

    Raise `WorkbenchModuleImportError` if `path` points to an invalid module.

    Otherwise, do not raise any errors one can sensibly recover from.
    """
    temp_zipfile = ModuleZipfile(path)
    validate_zipfile(temp_zipfile)  # raise WorkbenchModuleImportError
    module_id = temp_zipfile.module_id
    version = temp_zipfile.version
    module_spec = temp_zipfile.get_spec()
    # A module without JavaScript is stored as the empty string.
    js_module = temp_zipfile.get_optional_js_module() or ""

    minio.fput_file(
        minio.ExternalModulesBucket, "%s/%s" % (module_id, path.name), path
    )
    # NOTE(review): every field is passed as a lookup kwarg (there is no
    # `defaults=`), so this call finds-or-creates a row and never updates an
    # existing one -- confirm that is intended.
    ModuleVersion.objects.update_or_create(
        id_name=module_id,
        source_version_hash=version,
        # Reuse the spec fetched above instead of calling get_spec() again.
        spec=asdict(module_spec),
        js_module=js_module,
    )

    return clientside.Module(module_spec, js_module)
Exemple #2
0
def get_migrated_params(
        wf_module: WfModule,
        *,
        module_zipfile: ModuleZipfile = None) -> Dict[str, Any]:
    """
    Return migrated params for `wf_module`, running migrate_params() if needed.

    Call this within a `Workflow.cooperative_lock()`.

    The cached value (`wf_module.cached_migrated_params`) is returned when
    `wf_module.cached_migrated_params_module_version` equals the module's
    param-schema version. Modules whose version is "develop" are always
    re-migrated. After a migration, both cache fields are written back to
    `wf_module` and saved.

    When `module_zipfile` is omitted, the latest module is looked up in
    `MODULE_REGISTRY` by `wf_module.module_id_name`.

    Raise `ModuleError` if migration fails.

    Raise `KeyError` if the module was deleted.

    Raise `RuntimeError` (unrecoverable) if there is a problem loading or
    executing the module. (Modules are validated before import, so this should
    not happen.)

    The result may be invalid. Call `validate()` to raise a `ValueError` to
    detect that case.

    TODO avoid holding the database lock whilst executing stuff on the kernel.
    (This will involve auditing and modifying all callers to handle new error
    cases.)
    """
    if module_zipfile is None:
        # raise KeyError
        module_zipfile = MODULE_REGISTRY.latest(wf_module.module_id_name)

    # A cached version of None (nothing cached yet) never equals the schema
    # version, so an empty cache is never treated as fresh.
    cache_is_fresh = module_zipfile.version != "develop" and (
        module_zipfile.get_param_schema_version()
        == wf_module.cached_migrated_params_module_version
    )
    if cache_is_fresh:
        return wf_module.cached_migrated_params

    # raise ModuleError
    migrated = invoke_migrate_params(module_zipfile, wf_module.params)
    wf_module.cached_migrated_params = migrated
    wf_module.cached_migrated_params_module_version = (
        module_zipfile.get_param_schema_version())
    cache_fields = [
        "cached_migrated_params",
        "cached_migrated_params_module_version",
    ]
    try:
        wf_module.save(update_fields=cache_fields)
    except ValueError:
        # WfModule was deleted, so we get:
        # "ValueError: Cannot force an update in save() with no primary key."
        # The migrated params are still returned to the caller.
        pass
    return migrated
Exemple #3
0
def download_module_zipfile(
    tempdir: Path,
    module_id: ModuleId,
    version: ModuleVersion,
    *,
    deprecated_spec: Dict[str, Any],
    deprecated_js_module: str,
) -> ModuleZipfile:
    """
    Download a module from minio and return it as a local-path ModuleZipfile.

    The "modern" download is tried first. If it raises `FileNotFoundError`,
    the "deprecated" download is tried with `deprecated_spec` and
    `deprecated_js_module`.

    Raise `RuntimeError` (_from_ another kind of error -- `FileNotFoundError`,
    `KeyError`, `ValueError`, `SyntaxError`, `BadZipFile`,
    `UnicodeDecodeError` or more) if the zipfile is not a valid Workbench
    module. We spend the time testing the zipfile for validity because A) it's
    good to catch errors quickly; and B) fetcher, renderer and server all need
    to execute code on each module, so they're destined to validate the module
    anyway.

    The zipfile is always written to "{tempdir}/{module_id}.{version}.zip".
    This function is not re-entrant when called with the same parameters.
    Callers may use locks to avoid trying to download the same data multiple
    times.
    """
    logger.info("download_module_zipfile(%s.%s.zip)", module_id, version)

    zip_filename = "%s.%s.zip" % (module_id, version)
    destination = tempdir / zip_filename
    try:
        _download_module_zipfile_modern(destination, module_id, version)
    except FileNotFoundError as modern_error:
        # Fall back to the deprecated download. If that is missing too,
        # report the modern lookup's error as the cause.
        try:
            _download_module_zipfile_deprecated(
                destination,
                module_id,
                version,
                spec=deprecated_spec,
                js_module=deprecated_js_module,
            )
        except FileNotFoundError:
            raise RuntimeError from modern_error

    module_zipfile = ModuleZipfile(destination)  # raise ZipfileError
    try:
        # raise KeyError or SyntaxError
        compiled = module_zipfile.compile_code_without_executing()
        module_zipfile.get_spec()  # raise KeyError or ValueError
        cjwstate.modules.kernel.validate(compiled)  # raise ModuleError
    except Exception as cause:
        raise RuntimeError from cause
    else:
        return module_zipfile
Exemple #4
0
def invoke_migrate_params(
    module_zipfile: ModuleZipfile, raw_params: Dict[str, Any]
) -> Dict[str, Any]:
    """Call module `migrate_params()` using (global) kernel.

    Raise ModuleError if module code did not execute.

    The result may not be valid. Call `param_schema.validate(result)` to
    raise `ValueError` on error; or call `param_schema.coerce(result)` to
    guarantee a valid result.

    Log any ModuleError. Also log success. The final log line reports a
    status of "ok", the ModuleError subclass name, or "???" when some other
    exception propagated.
    """
    # Use a monotonic clock: wall-clock time (time.time()) can jump backwards
    # or forwards, which would give a bogus elapsed duration.
    started_at = time.monotonic()
    logger.info("%s:migrate_params() begin", module_zipfile.path.name)
    status = "???"
    try:
        result = cjwstate.modules.kernel.migrate_params(
            module_zipfile.compile_code_without_executing(), raw_params
        )  # raise ModuleError
    except ModuleError as err:
        logger.exception("Exception in %s:migrate_params()", module_zipfile.path.name)
        status = type(err).__name__
        raise
    else:
        status = "ok"
        return result
    finally:
        elapsed_ms = int((time.monotonic() - started_at) * 1000)
        logger.info(
            "%s:migrate_params() => %s in %dms",
            module_zipfile.path.name,
            status,
            elapsed_ms,
        )
def validate_zipfile(module_zipfile: ModuleZipfile) -> None:
    """Ensure `module_zipfile` is a valid ModuleZipfile.

    The spec is read, the code is compiled (without executing) and validated
    by the kernel, and the optional HTML and JS files are read.

    Raise `WorkbenchModuleImportError` with an English-language description
    of the flaw otherwise. (This can help module authors fix their mistakes.)
    The original exception is chained as the cause.
    """
    try:
        module_zipfile.get_spec()  # raise KeyError, ValueError, BadZipFile
        # raise KeyError, UnicodeDecodeError, SyntaxError, BadZipFile
        compiled_module = module_zipfile.compile_code_without_executing()
        cjwstate.modules.kernel.validate(compiled_module)  # raise ModuleError
        module_zipfile.get_optional_html()  # raise UnicodeError, BadZipFile
        module_zipfile.get_optional_js_module()  # raise UnicodeError, BadZipFile
    except zipfile.BadZipFile as err:
        raise WorkbenchModuleImportError("Bad zipfile: %s" % str(err)) from err
    except UnicodeError as err:
        # UnicodeError is a subclass of ValueError, so this handler must come
        # before `except ValueError`; otherwise it is unreachable and invalid
        # UTF-8 is misreported as an invalid .yaml file.
        raise WorkbenchModuleImportError(
            "Module Python, HTML or JS code is invalid UTF-8: %s" % str(err)
        ) from err
    except ValueError as err:
        raise WorkbenchModuleImportError(
            "Module .yaml is invalid: %s" % str(err)
        ) from err
    except KeyError as err:
        raise WorkbenchModuleImportError(
            "Zipfile is missing a required file: %s" % str(err)
        ) from err
    except SyntaxError as err:
        raise WorkbenchModuleImportError(
            "Module Python code has a syntax error: %s" % str(err)
        ) from err
    except ModuleError as err:
        raise WorkbenchModuleImportError(
            "Module Python code failed to run: %s" % str(err)
        ) from err
Exemple #6
0
def invoke_fetch(
    module_zipfile: ModuleZipfile,
    *,
    chroot_context: ChrootContext,
    basedir: Path,
    params: Params,
    secrets: Dict[str, Any],
    last_fetch_result: Optional[FetchResult],
    input_parquet_filename: Optional[str],
    output_filename: str,
) -> FetchResult:
    """
    Use kernel to invoke module `fetch(...)` method and build a `FetchResult`.

    Raise `ModuleError` on error. (This is usually the module author's fault.)

    Log any ModuleError. Also log success. The final log line reports the
    size of the fetched file in MB on success, the ModuleError subclass name
    on failure, or "???" when some other exception propagated.

    This synchronous method can be slow for complex modules, large datasets
    or slow network requests. Consider calling it from an executor.
    """
    # Use a monotonic clock: wall-clock time (time.time()) can jump backwards
    # or forwards, which would give a bogus elapsed duration.
    started_at = time.monotonic()
    status = "???"

    logger.info("%s:fetch() begin", module_zipfile.path.name)
    compiled_module = module_zipfile.compile_code_without_executing()

    try:
        ret = cjwstate.modules.kernel.fetch(
            compiled_module=compiled_module,
            chroot_context=chroot_context,
            basedir=basedir,
            params=params,
            secrets=secrets,
            last_fetch_result=last_fetch_result,
            input_parquet_filename=input_parquet_filename,
            output_filename=output_filename,
        )
        status = "%0.1fMB" % (ret.path.stat().st_size / 1024 / 1024)
        return ret
    except ModuleError as err:
        logger.exception("Exception in %s:fetch", module_zipfile.path.name)
        status = type(err).__name__
        raise
    finally:
        elapsed_ms = int((time.monotonic() - started_at) * 1000)
        logger.info(
            "%s:fetch() => %s in %dms",
            module_zipfile.path.name,
            status,
            elapsed_ms,
        )
Exemple #7
0
def _execute_step_pre(
    *,
    basedir: Path,
    exit_stack: contextlib.ExitStack,
    workflow: Workflow,
    step: Step,
    module_zipfile: ModuleZipfile,
    raw_params: Dict[str, Any],
    input_path: Path,
    input_table_columns: List[Column],
    tab_results: Dict[Tab, Optional[StepResult]],
) -> ExecuteStepPreResult:
    """Gather everything render() needs: the first step of execute_step().

    Loads the step's fetch result and prepares its params, all while holding
    the step lock.

    Raise TabCycleError or TabOutputUnreachableError if the module depends on
    tabs with errors.

    Raise NoLoadedDataError if there is no input table and the module's
    loads_data is False (the default).

    Raise PromptingError if the module parameters are invalid.

    Raise UnneededExecution if `step` has changed.

    (We won't call the render() method in any of these cases.)

    All this runs synchronously within a database lock. (It's a separate
    function so that when we're done awaiting it, we can continue executing in
    a context that doesn't use a database thread.)

    `tab_results.keys()` must be ordered as the Workflow's tabs are.
    """
    # raises UnneededExecution
    with locked_step(workflow, step) as locked:
        loaded_fetch_result = _load_fetch_result(locked, basedir, exit_stack)

        spec = module_zipfile.get_spec()
        # A module that neither loads data nor receives input columns has
        # nothing to work on.
        if not (spec.loads_data or input_table_columns):
            raise NoLoadedDataError

        # raise TabCycleError, TabOutputUnreachableError, PromptingError
        prepared_params, prepared_tab_outputs, prepared_uploads = (
            renderprep.prep_params(
                params=raw_params,
                schema=spec.param_schema,
                step_id=step.id,
                input_table_columns=input_table_columns,
                tab_results=tab_results,
                basedir=basedir,
                exit_stack=exit_stack,
            )
        )

        return ExecuteStepPreResult(
            loaded_fetch_result,
            prepared_params,
            prepared_tab_outputs,
            prepared_uploads,
        )
Exemple #8
0
def extract_module_messages(directory: pathlib.Path):
    """Regenerate the message catalogs of the module stored in `directory`.

    Three kinds of file are refreshed, each only when its content changed:

    * the .po file of the default locale, built from the module's source;
    * the .pot template, a locale-less copy of the source catalog whose
      strings are moved into "default-message" comments;
    * the .po file of every other supported locale, merged from its old
      catalog, the new source catalog and the detected fuzzy messages.

    A catalog file that does not exist yet is treated as an empty catalog.
    """

    def _read_catalog_or(path, make_empty):
        # Read a catalog file, or build an empty catalog when it is missing.
        try:
            return read_po_catalog(path)
        except FileNotFoundError:
            return make_empty()

    with directory_loaded_as_zipfile_path(directory) as zip_path:
        # The zipfile may be invalid.
        source_catalog = _build_source_catalog(ModuleZipfile(zip_path))

    # Default-locale catalog
    source_po_path = _po_path(directory, default_locale)
    old_source_catalog = _read_catalog_or(
        source_po_path, lambda: Catalog(default_locale)
    )
    if not catalogs_are_same(source_catalog, old_source_catalog):
        write_po_catalog(source_po_path, source_catalog)

    # Template catalog: it carries no specific locale.
    template_catalog = copy_catalog(source_catalog, locale=None)
    move_strings_to_comments(template_catalog, comment_tag="default-message")
    pot_path = _pot_path(directory)
    old_template_catalog = _read_catalog_or(pot_path, lambda: Catalog())
    if not catalogs_are_same(template_catalog, old_template_catalog):
        write_po_catalog(
            pot_path,
            template_catalog,
            ignore_obsolete=True,
            # Huge width, so that special comments do not wrap.
            width=10000000,
            # Removes locale and other info from the output file.
            omit_header=True,
        )

    fuzzy = find_fuzzy_messages(
        old_catalog=old_source_catalog, new_catalog=source_catalog
    )

    # Every non-default locale
    for locale_id in supported_locales:
        if locale_id == default_locale:
            continue
        locale_po_path = _po_path(directory, locale_id)
        old_locale_catalog = _read_catalog_or(
            locale_po_path, lambda: Catalog(locale_id)
        )
        merged_catalog = _merge_nonsource_catalog(
            locale_id, old_locale_catalog, source_catalog, fuzzy
        )
        if not catalogs_are_same(merged_catalog, old_locale_catalog):
            write_po_catalog(locale_po_path, merged_catalog)
Exemple #9
0
def _execute_wfmodule_pre(
    basedir: Path,
    exit_stack: contextlib.ExitStack,
    workflow: Workflow,
    wf_module: WfModule,
    module_zipfile: ModuleZipfile,
    raw_params: Dict[str, Any],
    input_table: ArrowTable,
    tab_results: Dict[Tab, Optional[RenderResult]],
) -> ExecuteStepPreResult:
    """
    Gather what render() needs: the first step of execute_wfmodule().

    Loads the fetch result and resolves the param values while holding the
    wf_module lock.

    Raise TabCycleError or TabOutputUnreachableError if the module depends on
    tabs with errors. (We won't call the render() method in that case.)

    Raise PromptingError if the module parameters are invalid. (We'll skip
    render() and prompt the user with quickfixes in that case.)

    Raise UnneededExecution if `wf_module` has changed.

    All this runs synchronously within a database lock. (It's a separate
    function so that when we're done awaiting it, we can continue executing in
    a context that doesn't use a database thread.)

    `tab_results.keys()` must be ordered as the Workflow's tabs are.
    """
    # raises UnneededExecution
    with locked_wf_module(workflow, wf_module) as locked:
        loaded_fetch_result = _load_fetch_result(locked, basedir, exit_stack)

        schema = module_zipfile.get_spec().get_param_schema()
        context = renderprep.RenderContext(
            wf_module.id,
            input_table,
            tab_results,
            basedir,
            exit_stack,
            raw_params,  # ugh
        )
        # raise TabCycleError, TabOutputUnreachableError, PromptingError
        param_values = renderprep.get_param_values(schema, raw_params, context)

        return ExecuteStepPreResult(loaded_fetch_result, param_values)
Exemple #10
0
def _build_source_catalog(module_zipfile: ModuleZipfile) -> Catalog:
    """Build the default-locale catalog of all messages a module declares.

    Messages come from two places: the module spec, and every ".py" file
    inside the zipfile. Messages found in code also carry their comments and
    locations.
    """
    catalog = Catalog(default_locale)

    # Messages declared in the module spec
    spec_messages = find_spec_messages(module_zipfile.get_spec())
    for message_id, source_string in spec_messages.items():
        catalog.add(message_id, string=source_string)

    # Messages found in the module's Python files
    with zipfile.ZipFile(module_zipfile.path, mode="r") as archive:
        for member in archive.infolist():
            if not member.filename.endswith(".py"):
                continue
            with archive.open(member) as code_io:
                code_messages = find_messages_in_module_code(
                    code_io, member.filename
                )
                for message_id, properties in code_messages.items():
                    catalog.add(
                        message_id,
                        string=properties["string"],
                        auto_comments=properties["comments"],
                        locations=properties["locations"],
                    )
    return catalog
Exemple #11
0
def _get_migrated_params(wf_module: WfModule,
                         module_zipfile: ModuleZipfile) -> Dict[str, Any]:
    """
    Build the Params dict which will be passed to render().

    Migrate the params via `get_migrated_params()` so they are up-to-date,
    then validate them against the module's param schema.

    Return `{}` when `module_zipfile` is None (a deleted module).

    On ModuleError, return default params (`param_schema.coerce(None)`). On
    ValueError from validation, log the error and return the coerced result.
    This will render the "wrong" thing ... but the front-end should show the
    migrate error (as it's rendering the form) so users should figure out the
    problem. (What's the alternative? Abort the whole workflow render? We
    can't render _any_ module until we've migrated _all_ modules; and it's
    hard to imagine showing the user a huge, aborted render.)

    Assume we are called within a `workflow.cooperative_lock()`.
    """
    if module_zipfile is None:
        # This is a deleted module. Renderer will pass the input through to
        # the output.
        return {}

    schema = module_zipfile.get_spec().get_param_schema()

    try:
        migrated = get_migrated_params(wf_module, module_zipfile=module_zipfile)
    except ModuleError:
        # The error was already logged; no need to log it again.
        return schema.coerce(None)

    # Is the module buggy? It might be. Log that error, and return a valid
    # set of params anyway -- even if it isn't the params the user wants.
    try:
        schema.validate(migrated)
    except ValueError as err:
        logger.exception(
            "%s:migrate_params() gave wrong retval: %s",
            module_zipfile.path.name,
            str(err),
        )
        return schema.coerce(migrated)
    return migrated
Exemple #12
0
 def _create_localizer_for_module_zipfile(
     cls, module_zipfile: ModuleZipfile
 ) -> Optional[MessageLocalizer]:
     """Build a MessageLocalizer from the module's per-locale .po files.

     Each supported locale's messages are read from `module_zipfile` and
     parsed with `abort_invalid=True`. A locale whose .po file is invalid is
     logged and skipped; a locale that raises `KeyError` is skipped silently
     (presumably the zipfile has no messages for that locale -- TODO confirm).

     Return None when no locale produced a catalog.
     """
     catalogs = {}
     for locale_id in supported_locales:
         try:
             po_bytes = module_zipfile.read_messages_po_for_locale(locale_id)
             catalogs[locale_id] = read_po(BytesIO(po_bytes), abort_invalid=True)
         except PoFileError as err:
             logger.exception(
                 "Invalid po file for module %s in locale %s: %s",
                 module_zipfile.module_id_and_version,
                 locale_id,
                 err,
             )
         except KeyError:
             # Skip this locale and carry on with the others.
             continue
     return MessageLocalizer(catalogs) if catalogs else None
Exemple #13
0
def fetch_or_wrap_error(
    ctx: contextlib.ExitStack,
    chroot_context: ChrootContext,
    basedir: Path,
    module_id_name: str,
    module_zipfile: ModuleZipfile,
    migrated_params_or_error: Union[Dict[str, Any], ModuleError],
    secrets: Dict[str, Any],
    last_fetch_result: Optional[FetchResult],
    maybe_input_crr: Optional[CachedRenderResult],
    output_path: Path,
):
    """
    Fetch, and do not raise any exceptions worth catching.

    Exceptions are wrapped -- the result is a FetchResult with `.errors`.

    This function is slow indeed. Perhaps call it from
    EventLoop.run_in_executor(). (Why not make it async? Because all the logic
    inside -- compile module, fetch() -- is sandboxed, meaning it gets its own
    processes. We may eventually avoid asyncio entirely in `fetcher`.)

    These problems are all handled:

    * Module was deleted (`module_zipfile is None`)
    * Module times out (`cjwkernel.errors.ModuleTimeoutError`), in `fetch()`.
    * Module crashes (`cjwkernel.errors.ModuleExitedError`), in `fetch()`.
    * migrated_params_or_error is a `ModuleError`
    * migrated_params_or_error is invalid (`ValueError`)
    * input_crr points to a nonexistent file (`FileNotFoundError`)
    """
    # module_zipfile=None is allowed: it means the module was deleted.
    if module_zipfile is None:
        logger.info("fetch() deleted module '%s'", module_id_name)
        deleted_message = I18nMessage.trans(
            "py.fetcher.fetch.no_loaded_module",
            default="Cannot fetch: module was deleted",
        )
        return FetchResult(output_path, [RenderError(deleted_message)])

    param_schema = module_zipfile.get_spec().get_param_schema()

    if isinstance(migrated_params_or_error, ModuleError):
        # Raise the exception so logger.exception() can log it.
        try:
            raise migrated_params_or_error
        except ModuleError:
            # We'll always get here
            logger.exception("%s:migrate_params() raised error",
                             module_zipfile.path.name)
        debug_text = format_for_user_debugging(migrated_params_or_error)
        return user_visible_bug_fetch_result(output_path, debug_text)

    migrated_params = migrated_params_or_error
    try:
        param_schema.validate(migrated_params)
    except ValueError:
        logger.exception("Invalid return value from %s:migrate_params()",
                         module_zipfile.path.name)
        invalid_params_text = (
            "%s:migrate_params() output invalid params" %
            module_zipfile.path.name
        )
        return user_visible_bug_fetch_result(output_path, invalid_params_text)

    # get input_metadata, input_parquet_path. (This can't error.)
    input_parquet_path, input_metadata = _download_cached_render_result(
        ctx, maybe_input_crr, dir=basedir)

    # Clean params, so they're of the correct type. (This can't error.)
    cleaned_params = fetchprep.clean_value(
        param_schema, migrated_params, input_metadata)
    params = Params(cleaned_params)

    if input_parquet_path is None:
        input_parquet_filename = None
    else:
        input_parquet_filename = input_parquet_path.name

    # actually fetch
    try:
        return invoke_fetch(
            module_zipfile,
            chroot_context=chroot_context,
            basedir=basedir,
            params=params,
            secrets=secrets,
            last_fetch_result=last_fetch_result,
            input_parquet_filename=input_parquet_filename,
            output_filename=output_path.name,
        )
    except ModuleError as err:
        logger.exception("Error calling %s:fetch()", module_zipfile.path.name)
        fetch_debug_text = format_for_user_debugging(err)
        return user_visible_bug_fetch_result(output_path, fetch_debug_text)
Exemple #14
0
def invoke_render(
    module_zipfile: ModuleZipfile,
    *,
    chroot_context: ChrootContext,
    basedir: Path,
    input_filename: Optional[str],
    params: Dict[str, Any],
    tab_name: str,
    fetch_result: Optional[FetchResult],
    tab_outputs: Dict[str, TabOutput],
    uploaded_files: Dict[str, UploadedFile],
    output_filename: str,
) -> LoadedRenderResult:
    """Use kernel to process `table` with module `render` function.

    After the kernel returns, the output file at `basedir / output_filename`
    is loaded. A zero-byte output file means "no output": an empty table with
    no columns.

    Raise `ModuleError` on error. (This is usually the module author's fault.)
    In particular, raise `ModuleExitedError` if the module wrote an output
    file that fails validation.

    Log any ModuleError. Also log success.

    This synchronous method can be slow for complex modules or large
    datasets. Consider calling it from an executor.
    """
    # Use a monotonic clock: wall-clock time (time.time()) can jump backwards
    # or forwards, which would give a bogus elapsed duration.
    started_at = time.monotonic()
    begin_status_format = "%s:render() (%0.1fMB input)"
    begin_status_args = (
        module_zipfile.path.name,
        (
            (basedir / input_filename).stat().st_size / 1024 / 1024
            if input_filename is not None
            else 0
        ),
    )
    logger.info(begin_status_format + " begin", *begin_status_args)
    status = "???"
    try:
        result = cjwstate.modules.kernel.render(
            module_zipfile.compile_code_without_executing(),
            chroot_context=chroot_context,
            basedir=basedir,
            input_filename=input_filename,
            params=params,
            tab_name=tab_name,
            fetch_result=fetch_result,
            tab_outputs=tab_outputs,
            uploaded_files=uploaded_files,
            output_filename=output_filename,
        )

        output_path = basedir / output_filename
        st_size = output_path.stat().st_size
        if st_size == 0:
            table = pa.table({})
            columns = []
            status = "(no output)"
        else:
            try:
                table, columns = load_untrusted_arrow_file_with_columns(output_path)
            except ValidateError as err:
                # Chain explicitly so the traceback shows the validation
                # failure as the direct cause.
                raise ModuleExitedError(
                    module_zipfile.path.name,
                    0,
                    "Module wrote invalid data: %s" % str(err),
                ) from err
            status = "(%drows, %dcols, %0.1fMB)" % (
                table.num_rows,
                table.num_columns,
                st_size / 1024 / 1024,
            )
        return LoadedRenderResult(
            path=output_path,
            table=table,
            columns=columns,
            errors=result.errors,
            json=result.json,
        )
    except ModuleError as err:
        logger.exception("Exception in %s:render", module_zipfile.path.name)
        status = type(err).__name__
        raise
    finally:
        elapsed_ms = int((time.monotonic() - started_at) * 1000)
        logger.info(
            begin_status_format + " => %s in %dms",
            *begin_status_args,
            status,
            elapsed_ms,
        )
Exemple #15
0
def invoke_render(
    module_zipfile: ModuleZipfile,
    *,
    chroot_context: ChrootContext,
    basedir: Path,
    input_table: ArrowTable,
    params: Params,
    tab: Tab,
    fetch_result: Optional[FetchResult],
    output_filename: str,
) -> RenderResult:
    """
    Use kernel to process `table` with module `render` function.

    Raise `ModuleError` on error. (This is usually the module author's fault.)

    Log any ModuleError. Also log success. The final log line reports the
    output table's rows, columns and size in MB on success, the ModuleError
    subclass name on failure, or "???" when some other exception propagated.

    This synchronous method can be slow for complex modules or large
    datasets. Consider calling it from an executor.
    """
    # Use a monotonic clock: wall-clock time (time.time()) can jump backwards
    # or forwards, which would give a bogus elapsed duration.
    started_at = time.monotonic()
    begin_status_format = "%s:render() (%d rows, %d cols, %0.1fMB)"
    begin_status_args = (
        module_zipfile.path.name,
        input_table.metadata.n_rows,
        len(input_table.metadata.columns),
        input_table.n_bytes_on_disk / 1024 / 1024,
    )
    logger.info(begin_status_format + " begin", *begin_status_args)
    status = "???"
    try:
        result = cjwstate.modules.kernel.render(
            module_zipfile.compile_code_without_executing(),
            chroot_context=chroot_context,
            basedir=basedir,
            input_table=input_table,
            params=params,
            tab=tab,
            fetch_result=fetch_result,
            output_filename=output_filename,
        )
        status = "(%drows, %dcols, %0.1fMB)" % (
            result.table.metadata.n_rows,
            len(result.table.metadata.columns),
            result.table.n_bytes_on_disk / 1024 / 1024,
        )
        return result
    except ModuleError as err:
        logger.exception("Exception in %s:render", module_zipfile.path.name)
        status = type(err).__name__
        raise
    finally:
        elapsed_ms = int((time.monotonic() - started_at) * 1000)
        logger.info(
            begin_status_format + " => %s in %dms",
            *begin_status_args,
            status,
            elapsed_ms,
        )