Example #1
def checkpoint_new(checkpoint, suite, directory, datasource):
    """Create a new checkpoint for easy deployments. (Experimental)"""
    suite_name = suite
    usage_event = "cli.checkpoint.new"
    context = toolkit.load_data_context_with_error_handling(directory)
    _verify_checkpoint_does_not_exist(context, checkpoint, usage_event)
    suite: ExpectationSuite = toolkit.load_expectation_suite(
        context, suite_name, usage_event)
    datasource = toolkit.select_datasource(context, datasource_name=datasource)
    if datasource is None:
        send_usage_message(context, usage_event, success=False)
        sys.exit(1)
    _, _, _, batch_kwargs = toolkit.get_batch_kwargs(context, datasource.name)

    template = _load_checkpoint_yml_template()
    # This picky update helps template comments stay in place
    template["batches"][0]["batch_kwargs"] = dict(batch_kwargs)
    template["batches"][0]["expectation_suite_names"] = [
        suite.expectation_suite_name
    ]

    checkpoint_file = _write_checkpoint_to_disk(context, template, checkpoint)
    cli_message(
        f"""<green>A checkpoint named `{checkpoint}` was added to your project!</green>
  - To edit this checkpoint, edit the checkpoint file: {checkpoint_file}
  - To run this checkpoint, run `great_expectations checkpoint run {checkpoint}`"""
    )
    send_usage_message(context, usage_event, success=True)
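
# A hedged sketch of the template dict after the "picky update" above. The exact
# keys shipped by _load_checkpoint_yml_template may differ; the batch_kwargs
# values here are hypothetical.
#
# template == {
#     "batches": [
#         {
#             "batch_kwargs": {"path": "/data/my_table.csv", "datasource": "my_datasource"},
#             "expectation_suite_names": ["my_suite"],
#         }
#     ],
# }
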
def _suite_scaffold(suite: str, directory: str, jupyter: bool) -> None:
    usage_event = "cli.suite.scaffold"
    suite_name = suite
    context = load_data_context_with_error_handling(directory)
    notebook_filename = f"scaffold_{suite_name}.ipynb"
    notebook_path = _get_notebook_path(context, notebook_filename)

    if suite_name in context.list_expectation_suite_names():
        toolkit.tell_user_suite_exists(suite_name)
        if os.path.isfile(notebook_path):
            cli_message(
                f"  - If you wish to adjust your scaffolding, you can open this notebook with jupyter: `{notebook_path}` <red>(Please note that if you run that notebook, you will overwrite your existing suite.)</red>"
            )
        send_usage_message(data_context=context, event=usage_event, success=False)
        sys.exit(1)

    datasource = toolkit.select_datasource(context)
    if datasource is None:
        send_usage_message(data_context=context, event=usage_event, success=False)
        sys.exit(1)

    _suite = context.create_expectation_suite(suite_name)
    _, _, _, batch_kwargs = get_batch_kwargs(context, datasource_name=datasource.name)
    renderer = SuiteScaffoldNotebookRenderer(context, _suite, batch_kwargs)
    renderer.render_to_disk(notebook_path)

    if jupyter:
        toolkit.launch_jupyter_notebook(notebook_path)
    else:
        cli_message(
            f"To continue scaffolding this suite, run `jupyter notebook {notebook_path}`"
        )

    send_usage_message(data_context=context, event=usage_event, success=True)
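
# A hedged invocation sketch for the helper above (the suite name and directory
# are hypothetical); jupyter=False prints the follow-up command instead of
# launching a notebook:
#
#     _suite_scaffold(suite="my_suite", directory=".", jupyter=False)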
Example #3
def checkpoint_new(checkpoint, suite, directory, datasource, legacy):
    """Create a new checkpoint for easy deployments. (Experimental)"""
    if legacy:
        suite_name = suite
        usage_event = "cli.checkpoint.new"
        context = toolkit.load_data_context_with_error_handling(directory)
        ge_config_version = context.get_config().config_version
        if ge_config_version >= 3:
            cli_message(
                f"""<red>The `checkpoint new` CLI command is not yet implemented for GE config versions >= 3.</red>"""
            )
            send_usage_message(context, usage_event, success=False)
            sys.exit(1)

        _verify_checkpoint_does_not_exist(context, checkpoint, usage_event)
        suite: ExpectationSuite = toolkit.load_expectation_suite(
            context, suite_name, usage_event)
        datasource = toolkit.select_datasource(context,
                                               datasource_name=datasource)
        if datasource is None:
            send_usage_message(context, usage_event, success=False)
            sys.exit(1)
        _, _, _, batch_kwargs = toolkit.get_batch_kwargs(
            context, datasource.name)

        _ = context.add_checkpoint(
            name=checkpoint,
            **{
                "class_name": "LegacyCheckpoint",
                "validation_operator_name": "action_list_operator",
                "batches": [
                    {
                        "batch_kwargs": dict(batch_kwargs),
                        "expectation_suite_names": [suite.expectation_suite_name],
                    }
                ],
            },
        )

        cli_message(
            f"""<green>A checkpoint named `{checkpoint}` was added to your project!</green>
      - To run this checkpoint, run `great_expectations checkpoint run {checkpoint}`"""
        )
        send_usage_message(context, usage_event, success=True)
    # TODO (Rob): Add flow for new-style checkpoints
    else:
        pass
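
# For orientation, the add_checkpoint call in the legacy branch corresponds
# roughly to a checkpoint config like the sketch below (only the keys mirror the
# dict built above; the batch_kwargs values are hypothetical):
#
#   class_name: LegacyCheckpoint
#   validation_operator_name: action_list_operator
#   batches:
#     - batch_kwargs:
#         path: /data/my_table.csv
#         datasource: my_datasource
#       expectation_suite_names:
#         - my_suite
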
def validation_operator_run(name, run_name, validation_config_file, suite, directory):
    # Note: though the long lines here aren't pythonic, they look best if Click does the line wraps.
    """
    Run a validation operator against some data.

    There are two modes to run this command:

    1. Interactive (good for development):

        Specify the name of the validation operator using the --name argument
        and the name of the expectation suite using the --suite argument.

        The cli will help you specify the batch of data that you want to
        validate interactively.


    2. Non-interactive (good for production):

        Use the `--validation_config_file` argument to specify the path of the validation configuration JSON file. This file can be used to instruct a validation operator to validate multiple batches of data and use multiple expectation suites to validate each batch.

        Learn how to create a validation config file here:
        https://great-expectations.readthedocs.io/en/latest/command_line.html#great-expectations-validation-operator-run-validation-config-file-validation-config-file-path

        This command exits with 0 if the validation operator ran and the "success" attribute in its return object is True. Otherwise, the command exits with 1.

    To learn more about validation operators, go here:
    https://great-expectations.readthedocs.io/en/latest/features/validation.html#validation-operators
    """

    try:
        context = DataContext(directory)
    except ge_exceptions.ConfigNotFoundError as err:
        cli_message("Failed to process <red>{}</red>".format(err.message))
        sys.exit(1)

    try:
        if validation_config_file is not None:
            try:
                with open(validation_config_file) as f:
                    validation_config = json.load(f)
            except (OSError, json_parse_exception) as e:
                cli_message(
                    f"Failed to process the --validation_config_file argument: <red>{e}</red>"
                )
                send_usage_message(
                    data_context=context,
                    event="cli.validation_operator.run",
                    success=False,
                )
                sys.exit(1)

            validation_config_error_message = _validate_valdiation_config(
                validation_config)
            if validation_config_error_message is not None:
                cli_message(
                    f"<red>The validation config in {validation_config_file} is misconfigured: {validation_config_error_message}</red>"
                )
                send_usage_message(
                    data_context=context,
                    event="cli.validation_operator.run",
                    success=False,
                )
                sys.exit(1)

        else:
            if suite is None:
                cli_message("""
Please use --suite argument to specify the name of the expectation suite.
Call `great_expectation suite list` command to list the expectation suites in your project.
""")
                send_usage_message(
                    data_context=context,
                    event="cli.validation_operator.run",
                    success=False,
                )
                sys.exit(1)

            suite = toolkit.load_expectation_suite(
                context, suite, "cli.validation_operator.run")

            if name is None:
                cli_message("""
Please use --name argument to specify the name of the validation operator.
Call `great_expectation validation-operator list` command to list the operators in your project.
""")
                send_usage_message(
                    data_context=context,
                    event="cli.validation_operator.run",
                    success=False,
                )
                sys.exit(1)
            else:
                if name not in context.list_validation_operator_names():
                    cli_message(f"""
Could not find a validation operator {name}.
Call `great_expectation validation-operator list` command to list the operators in your project.
""")
                    send_usage_message(
                        data_context=context,
                        event="cli.validation_operator.run",
                        success=False,
                    )
                    sys.exit(1)

            batch_kwargs = None

            cli_message("""
Let us help you specify the batch of data your want the validation operator to validate."""
                        )

            try:
                data_source = toolkit.select_datasource(context)
            except ValueError as ve:
                cli_message("<red>{}</red>".format(ve))
                send_usage_message(
                    data_context=context,
                    event="cli.validation_operator.run",
                    success=False,
                )
                sys.exit(1)

            if not data_source:
                cli_message("<red>No datasources found in the context.</red>")
                send_usage_message(
                    data_context=context,
                    event="cli.validation_operator.run",
                    success=False,
                )
                sys.exit(1)

            if batch_kwargs is None:
                (
                    datasource_name,
                    batch_kwargs_generator,
                    data_asset,
                    batch_kwargs,
                ) = get_batch_kwargs(
                    context,
                    datasource_name=data_source.name,
                    batch_kwargs_generator_name=None,
                    data_asset_name=None,
                    additional_batch_kwargs=None,
                )

            validation_config = {
                "validation_operator_name": name,
                "batches": [
                    {
                        "batch_kwargs": batch_kwargs,
                        "expectation_suite_names": [suite.expectation_suite_name],
                    }
                ],
            }

        try:
            validation_operator_name = validation_config["validation_operator_name"]
            batches_to_validate = []
            for entry in validation_config["batches"]:
                for expectation_suite_name in entry["expectation_suite_names"]:
                    batch = context.get_batch(entry["batch_kwargs"],
                                              expectation_suite_name)
                    batches_to_validate.append(batch)

            if run_name is None:
                run_name = datetime.datetime.now(
                    datetime.timezone.utc).strftime("%Y%m%dT%H%M%S.%fZ")

            run_id = RunIdentifier(run_name=run_name)

            if suite is None or suite.evaluation_parameters is None:
                results = context.run_validation_operator(
                    validation_operator_name,
                    assets_to_validate=batches_to_validate,
                    run_id=run_id,
                )
            else:
                results = context.run_validation_operator(
                    validation_operator_name,
                    assets_to_validate=batches_to_validate,
                    run_id=run_id,
                    evaluation_parameters=suite.evaluation_parameters,
                )
        except (ge_exceptions.DataContextError, OSError, SQLAlchemyError) as e:
            cli_message("<red>{}</red>".format(e))
            send_usage_message(data_context=context,
                               event="cli.validation_operator.run",
                               success=False)
            sys.exit(1)

        if not results["success"]:
            cli_message("Validation failed!")
            send_usage_message(data_context=context,
                               event="cli.validation_operator.run",
                               success=True)
            sys.exit(1)
        else:
            cli_message("Validation succeeded!")
            send_usage_message(data_context=context,
                               event="cli.validation_operator.run",
                               success=True)
            sys.exit(0)
    except Exception as e:
        send_usage_message(data_context=context,
                           event="cli.validation_operator.run",
                           success=False)
        raise e
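
# Hedged CLI sketches for the command above (the operator, suite, and file names
# are hypothetical; the flags mirror the docstring):
#
#   great_expectations validation-operator run --name action_list_operator --suite my_suite
#   great_expectations validation-operator run --validation_config_file my_validation_config.json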
Example #5
def _suite_edit(suite, datasource, directory, jupyter, batch_kwargs, usage_event):
    batch_kwargs_json = batch_kwargs
    batch_kwargs = None
    context = toolkit.load_data_context_with_error_handling(directory)

    try:
        suite = toolkit.load_expectation_suite(context, suite, usage_event)
        citations = suite.get_citations(require_batch_kwargs=True)

        if batch_kwargs_json:
            try:
                batch_kwargs = json.loads(batch_kwargs_json)
                if datasource:
                    batch_kwargs["datasource"] = datasource
                _batch = toolkit.load_batch(context, suite, batch_kwargs)
            except json_parse_exception as je:
                cli_message(
                    "<red>Please check that your batch_kwargs are valid JSON.\n{}</red>".format(
                        je
                    )
                )
                send_usage_message(
                    data_context=context, event=usage_event, success=False
                )
                sys.exit(1)
            except ge_exceptions.DataContextError:
                cli_message(
                    "<red>Please check that your batch_kwargs are able to load a batch.</red>"
                )
                send_usage_message(
                    data_context=context, event=usage_event, success=False
                )
                sys.exit(1)
            except ValueError as ve:
                cli_message(
                    "<red>Please check that your batch_kwargs are able to load a batch.\n{}</red>".format(
                        ve
                    )
                )
                send_usage_message(
                    data_context=context, event=usage_event, success=False
                )
                sys.exit(1)
        elif citations:
            citation = citations[-1]
            batch_kwargs = citation.get("batch_kwargs")

        if not batch_kwargs:
            cli_message(
                """
A batch of data is required to edit the suite - let's help you to specify it."""
            )

            additional_batch_kwargs = None
            try:
                data_source = toolkit.select_datasource(context, datasource_name=datasource)
            except ValueError as ve:
                cli_message("<red>{}</red>".format(ve))
                send_usage_message(
                    data_context=context, event=usage_event, success=False
                )
                sys.exit(1)

            if not data_source:
                cli_message("<red>No datasources found in the context.</red>")
                send_usage_message(
                    data_context=context, event=usage_event, success=False
                )
                sys.exit(1)

            if batch_kwargs is None:
                (
                    datasource_name,
                    batch_kwargs_generator,
                    data_asset,
                    batch_kwargs,
                ) = get_batch_kwargs(
                    context,
                    datasource_name=data_source.name,
                    additional_batch_kwargs=additional_batch_kwargs,
                )

        notebook_name = "edit_{}.ipynb".format(suite.expectation_suite_name)
        notebook_path = _get_notebook_path(context, notebook_name)
        SuiteEditNotebookRenderer().render_to_disk(suite, notebook_path, batch_kwargs)

        if not jupyter:
            cli_message(
                f"To continue editing this suite, run <green>jupyter notebook {notebook_path}</green>"
            )

        payload = edit_expectation_suite_usage_statistics(
            data_context=context, expectation_suite_name=suite.expectation_suite_name
        )

        send_usage_message(
            data_context=context, event=usage_event, event_payload=payload, success=True
        )

        if jupyter:
            toolkit.launch_jupyter_notebook(notebook_path)

    except Exception as e:
        send_usage_message(data_context=context, event=usage_event, success=False)
        raise e
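
# The batch_kwargs argument handled above arrives as a JSON string. A hedged
# invocation sketch (the suite name, path, and usage event are hypothetical):
#
#     _suite_edit(
#         suite="my_suite",
#         datasource=None,
#         directory=".",
#         jupyter=False,
#         batch_kwargs='{"path": "/data/my_table.csv", "datasource": "my_datasource"}',
#         usage_event="cli.suite.edit",
#     )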
Example #6
def _get_datasource(context, datasource):
    datasource = toolkit.select_datasource(context, datasource_name=datasource)
    if not datasource:
        cli_message("<red>No datasources found in the context.</red>")
        sys.exit(1)
    return datasource
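
# Sketch: passing datasource=None makes toolkit.select_datasource prompt the user
# interactively, e.g. `datasource = _get_datasource(context, None)`; the helper
# exits the CLI process if the context has no datasources configured.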
Example #7
def get_batch_kwargs(
    context,
    datasource_name=None,
    batch_kwargs_generator_name=None,
    data_asset_name=None,
    additional_batch_kwargs=None,
):
    """
    This method manages the interaction with user necessary to obtain batch_kwargs for a batch of a data asset.

    In order to get batch_kwargs this method needs datasource_name, batch_kwargs_generator_name and data_asset_name
    to combine them into a fully qualified data asset identifier(datasource_name/batch_kwargs_generator_name/data_asset_name).
    All three arguments are optional. If they are present, the method uses their values. Otherwise, the method
    prompts user to enter them interactively. Since it is possible for any of these three components to be
    passed to this method as empty values and to get their values after interacting with user, this method
    returns these components' values in case they changed.

    If the datasource has batch_kwargs_generators that can list available data asset names, the method lets user choose a name
    from that list (note: if there are multiple batch_kwargs_generators, user has to choose one first). If a name known to
    the chosen batch_kwargs_generator is selected, the batch_kwargs_generators will be able to yield batch_kwargs. The method also gives user
    an alternative to selecting the data asset name from the batch_kwargs_generators's list - user can type in a name for their
    data asset. In this case a passthrough batch kwargs batch_kwargs_generators will be used to construct a fully qualified data asset
    identifier (note: if the datasource has no passthrough batch_kwargs_generators configured, the method will exist with a failure).
    Since no batch_kwargs_generators can yield batch_kwargs for this data asset name, the method prompts user to specify batch_kwargs
    by choosing a file (if the datasource is pandas or spark) or by writing a SQL query (if the datasource points
    to a database).

    :param context:
    :param datasource_name:
    :param batch_kwargs_generator_name:
    :param data_asset_name:
    :param additional_batch_kwargs:
    :return: a tuple: (datasource_name, batch_kwargs_generator_name, data_asset_name, batch_kwargs). The components
                of the tuple were passed into the methods as optional arguments, but their values might
                have changed after this method's execution. If the returned batch_kwargs is None, it means
                that the batch_kwargs_generator will know to yield batch_kwargs when called.
    """
    try:
        available_data_assets_dict = context.get_available_data_asset_names(
            datasource_names=datasource_name)
    except ValueError:
        # the datasource has no batch_kwargs_generators
        available_data_assets_dict = {datasource_name: {}}

    data_source = toolkit.select_datasource(context,
                                            datasource_name=datasource_name)
    datasource_name = data_source.name

    if batch_kwargs_generator_name is None:
        batch_kwargs_generator_name = select_batch_kwargs_generator(
            context,
            datasource_name,
            available_data_assets_dict=available_data_assets_dict,
        )

    # if the user provided us with the batch kwargs generator name and the data asset, we have everything we need -
    # let's ask the generator to build batch kwargs for this asset - we are done.
    if batch_kwargs_generator_name is not None and data_asset_name is not None:
        generator = data_source.get_batch_kwargs_generator(
            batch_kwargs_generator_name)
        # Guard against additional_batch_kwargs being None before unpacking it.
        batch_kwargs = generator.build_batch_kwargs(
            data_asset_name, **(additional_batch_kwargs or {}))
        # Return the full tuple promised by the docstring so callers can unpack
        # it consistently with the final return below.
        return (datasource_name, batch_kwargs_generator_name, data_asset_name,
                batch_kwargs)

    if isinstance(context.get_datasource(datasource_name),
                  (PandasDatasource, SparkDFDatasource)):
        (
            data_asset_name,
            batch_kwargs,
        ) = _get_batch_kwargs_from_generator_or_from_file_path(
            context,
            datasource_name,
            batch_kwargs_generator_name=batch_kwargs_generator_name,
        )

    elif isinstance(context.get_datasource(datasource_name),
                    SqlAlchemyDatasource):
        data_asset_name, batch_kwargs = _get_batch_kwargs_for_sqlalchemy_datasource(
            context,
            datasource_name,
            additional_batch_kwargs=additional_batch_kwargs)

    else:
        raise ge_exceptions.DataContextError(
            "Datasource {:s} is expected to be a PandasDatasource, SparkDFDatasource, or SqlAlchemyDatasource, but is {:s}"
            .format(datasource_name,
                    str(type(context.get_datasource(datasource_name)))))

    return (datasource_name, batch_kwargs_generator_name, data_asset_name,
            batch_kwargs)
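
# A hedged usage sketch (assumes a project with at least one configured
# datasource; the datasource and suite names are hypothetical):
#
#     context = DataContext("great_expectations")
#     datasource_name, generator_name, data_asset_name, batch_kwargs = get_batch_kwargs(
#         context, datasource_name="my_datasource"
#     )
#     batch = context.get_batch(batch_kwargs, "my_suite")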