Exemple #1
0
    def _ge_validation_fn(context, dataset):
        """Validate ``dataset`` with a Great Expectations validator and report the outcome.

        Names such as ``datasource_name``, ``data_connector_name``,
        ``data_asset_name``, ``runtime_method_type``, ``batch_identifiers``,
        ``suite_name`` and ``extra_kwargs`` are not defined in this function;
        they are presumably supplied by an enclosing scope.

        Args:
            context: Execution context; must expose ``resources.ge_data_context``.
            dataset: The data handed to the validator as a runtime parameter.

        Yields:
            An ``ExpectationResult`` carrying the rendered markdown report,
            followed by an ``Output`` wrapping the JSON-dict form of the results.
        """
        ge_context = context.resources.ge_data_context
        # Entries from ``extra_kwargs`` are unpacked last, so they override the
        # defaults listed before them when a key collides.
        validator = ge_context.get_validator(
            **{
                "datasource_name": datasource_name,
                "data_connector_name": data_connector_name,
                "data_asset_name": datasource_name or data_asset_name,
                "runtime_parameters": {runtime_method_type: dataset},
                "batch_identifiers": batch_identifiers,
                "expectation_suite_name": suite_name,
                **extra_kwargs,
            }
        )

        results = validator.validate(
            run_id=dict(
                run_name=datasource_name + " run",
                run_time=datetime.datetime.utcnow(),
            )
        )

        # Render the validation results into a single markdown string.
        rendered_pages = ValidationResultsPageRenderer(run_info_at_end=True).render(
            validation_results=results
        )
        markdown_report = "".join(DefaultMarkdownPageView().render(rendered_pages))

        yield ExpectationResult(
            success=bool(results["success"]),
            metadata_entries=[
                EventMetadataEntry.md(md_str=markdown_report, label="Expectation Results")
            ],
        )
        yield Output(results.to_json_dict())
Exemple #2
0
def event_metadata_entries(metadata_entry_datas):
    """Turn serialized metadata-entry dicts into ``EventMetadataEntry`` objects.

    Each dict must carry ``'__typename'`` and ``'label'``; ``'description'`` is
    optional. The ``'__typename'`` value selects which ``EventMetadataEntry``
    constructor is used and which payload key is read.

    Args:
        metadata_entry_datas: Iterable of dicts, or a falsy value (``None``,
            empty list, ...) in which case nothing is yielded.

    Yields:
        One ``EventMetadataEntry`` per input dict.

    Unknown type names are handed to ``check.not_implemented``.
    """
    # A falsy argument yields nothing at all.
    for entry in metadata_entry_datas or ():
        kind = entry['__typename']
        shared = {'label': entry['label'], 'description': entry.get('description')}

        if kind == 'EventPathMetadataEntry':
            yield EventMetadataEntry.path(path=entry['path'], **shared)
            continue

        if kind == 'EventJsonMetadataEntry':
            raw_json = entry.get('jsonString', '')
            yield EventMetadataEntry.json(data=seven.json.loads(raw_json), **shared)
            continue

        if kind == 'EventMarkdownMetadataEntry':
            yield EventMetadataEntry.md(md_str=entry.get('md_str', ''), **shared)
            continue

        if kind == 'EventTextMetadataEntry':
            yield EventMetadataEntry.text(text=entry['text'], **shared)
            continue

        if kind == 'EventUrlMetadataEntry':
            yield EventMetadataEntry.url(url=entry['url'], **shared)
            continue

        check.not_implemented('TODO for type {}'.format(kind))
Exemple #3
0
 def ge_validation_solid(context, dataset):
     """Run a Great Expectations validation operator against ``dataset``.

     ``validation_operator_name``, ``suite_name``, ``batch_kwargs`` and
     ``datasource_name`` are not defined in this function; they are
     presumably supplied by an enclosing factory scope.

     Args:
         context: Execution context; must expose ``resources.ge_data_context``
             and ``log``.
         dataset: Data used as the ``"dataset"`` batch kwarg when no
             ``batch_kwargs`` were configured.

     Yields:
         An ``ExpectationResult`` carrying the rendered markdown report,
         followed by an ``Output`` wrapping the first validation result.
     """
     data_context = context.resources.ge_data_context
     if validation_operator_name is not None:
         validation_operator = validation_operator_name
     else:
         # No operator configured: register a no-op action list operator.
         data_context.add_validation_operator(
             "ephemeral_validation",
             {"class_name": "ActionListValidationOperator", "action_list": []},
         )
         validation_operator = "ephemeral_validation"
     suite = data_context.get_expectation_suite(suite_name)
     # Work on a copy: writing "datasource" into the shared ``batch_kwargs``
     # object would leak into later invocations and make them log the warning
     # below even when the caller never supplied a datasource.
     final_batch_kwargs = dict(batch_kwargs) if batch_kwargs else {"dataset": dataset}
     # Checking the copy (not ``batch_kwargs``) also keeps this safe when
     # ``batch_kwargs`` is None.
     if "datasource" in final_batch_kwargs:
         context.log.warning(
             "`datasource` field of `batch_kwargs` will be ignored; use the `datasource_name` "
             "parameter of the solid factory instead."
         )
     final_batch_kwargs["datasource"] = datasource_name
     batch = data_context.get_batch(final_batch_kwargs, suite)
     run_id = {
         "run_name": datasource_name + " run",
         "run_time": datetime.datetime.utcnow(),
     }
     results = data_context.run_validation_operator(
         validation_operator, assets_to_validate=[batch], run_id=run_id
     )
     # Only one batch was validated, so the first result is the one of interest.
     res = convert_to_json_serializable(results.list_validation_results())[0]
     md_str = render_multiple_validation_result_pages_markdown(
         validation_operator_result=results,
         run_info_at_end=True,
     )
     meta_stats = EventMetadataEntry.md(md_str=md_str, label="Expectation Results")
     yield ExpectationResult(
         success=res["success"],
         metadata_entries=[meta_stats],
     )
     yield Output(res)
Exemple #4
0
 def _best_n_actions(_, df):
     """Return the ``n`` highest-``"score"`` rows of ``df`` wrapped in an ``Output``.

     ``n`` is not defined in this function; it is presumably supplied by an
     enclosing scope. The first argument is ignored. The selected rows are
     also attached to the output as a markdown metadata entry labelled
     ``"data"``.
     """
     top_rows = df.nlargest(n, "score")
     markdown_preview = EventMetadataEntry.md(top_rows.to_markdown(), "data")
     return Output(top_rows, metadata_entries=[markdown_preview])
Exemple #5
0
    def _handle_spark_output(self, df: SparkDataFrame, schema: str, table: str):
        """Describe a Spark DataFrame's columns, then append it to a Snowflake table.

        This is a generator: the write to Snowflake only happens once the
        caller resumes iteration past the yielded metadata entry.

        Args:
            df: The Spark DataFrame to write.
            schema: Value passed as the ``sfSchema`` connector option.
            table: Value passed as the ``dbtable`` connector option.

        Yields:
            A markdown ``EventMetadataEntry`` labelled ``"DataFrame columns"``.
        """
        cfg = self._config
        connector_options = dict(
            sfURL=f"{cfg['account']}.snowflakecomputing.com",
            sfUser=cfg["user"],
            sfPassword=cfg["password"],
            sfDatabase=cfg["database"],
            sfSchema=schema,
            sfWarehouse=cfg["warehouse"],
            dbtable=table,
        )
        yield EventMetadataEntry.md(spark_columns_to_markdown(df.schema), "DataFrame columns")

        snowflake_writer = df.write.format("net.snowflake.spark.snowflake")
        snowflake_writer.options(**connector_options).mode("append").save()
Exemple #6
0
def many_table_materializations(_context):
    """Yield one ``Materialization`` per entry of ``raw_tables``.

    Every materialization is labelled ``'table_info'`` and carries text,
    filesystem-path, JSON, URL and markdown metadata entries. The markdown
    text is read from the ``MARKDOWN_EXAMPLE`` file located relative to this
    module. ``raw_tables`` and ``MARKDOWN_EXAMPLE`` are defined outside this
    function.
    """
    markdown_path = file_relative_path(__file__, MARKDOWN_EXAMPLE)
    with open(markdown_path, 'r') as markdown_file:
        blurb = markdown_file.read()
        # The loop stays inside the ``with`` block, so the file remains open
        # for as long as the generator is being consumed.
        for table_name in raw_tables:
            entries = [
                EventMetadataEntry.text(text=table_name, label='table_name'),
                EventMetadataEntry.fspath(
                    path='/path/to/{}'.format(table_name),
                    label='table_path',
                ),
                EventMetadataEntry.json(data={'name': table_name}, label='table_data'),
                EventMetadataEntry.url(
                    url='https://bigty.pe/{}'.format(table_name),
                    label='table_name_big',
                ),
                EventMetadataEntry.md(md_str=blurb, label='table_blurb'),
            ]
            yield Materialization(label='table_info', metadata_entries=entries)
Exemple #7
0
    def _handle_pandas_output(self, obj: PandasDataFrame, schema: str, table: str):
        """Report row/column metadata for ``obj`` and append it to a Snowflake table.

        This is a generator: the database write only happens once the caller
        resumes iteration past the two yielded metadata entries.

        Args:
            obj: The pandas DataFrame to store.
            schema: Schema passed to ``connect_snowflake``.
            table: Name of the table passed to ``to_sql``.

        Yields:
            An int entry labelled ``"Rows"`` and a markdown entry labelled
            ``"DataFrame columns"``.
        """
        from snowflake import connector  # pylint: disable=no-name-in-module

        row_count = obj.shape[0]
        yield EventMetadataEntry.int(row_count, "Rows")
        yield EventMetadataEntry.md(pandas_columns_to_markdown(obj), "DataFrame columns")

        connector.paramstyle = "pyformat"
        with connect_snowflake(config=self._config, schema=schema) as connection:
            # Column names are upper-cased before the frame is written.
            upper_cased = obj.rename(str.upper, copy=False, axis="columns")
            upper_cased.to_sql(
                table,
                con=connection,
                if_exists="append",
                index=False,
                method=pd_writer,
            )
    def _handle_dataframe_output(self, context: OutputContext, obj: DataFrame):
        """Report row/column metadata for ``obj`` and write it to a Snowflake table.

        The destination is read from ``context.metadata["table"]``, which must
        split on ``"."`` into exactly a schema and a table name. An existing
        table is replaced (``if_exists="replace"``).

        This is a generator: the database write only happens once the caller
        resumes iteration past the two yielded metadata entries.

        Args:
            context: Output context providing ``metadata`` and ``resource_config``.
            obj: The DataFrame to store.

        Yields:
            An int entry labelled ``"Rows"`` and a markdown entry labelled
            ``"DataFrame columns"``.
        """
        from snowflake import connector  # pylint: disable=no-name-in-module

        row_count = obj.shape[0]
        yield EventMetadataEntry.int(row_count, "Rows")
        yield EventMetadataEntry.md(columns_to_markdown(obj), "DataFrame columns")

        connector.paramstyle = "pyformat"

        schema_name, table_name = context.metadata["table"].split(".")
        with connect_snowflake(config=context.resource_config, schema=schema_name) as connection:
            # Column names are upper-cased before the frame is written.
            upper_cased = obj.rename(str.upper, copy=False, axis="columns")
            upper_cased.to_sql(
                table_name,
                con=connection,
                if_exists="replace",
                index=False,
                method=pd_writer,
            )
Exemple #9
0
def many_table_materializations(_context):
    """Yield one ``AssetMaterialization`` per entry of ``raw_tables``.

    Every materialization uses the asset key ``"table_info"`` and carries
    text, filesystem-path, JSON, URL and markdown metadata entries. The
    markdown text is read from the ``MARKDOWN_EXAMPLE`` file located relative
    to this module. ``raw_tables`` and ``MARKDOWN_EXAMPLE`` are defined
    outside this function.
    """
    markdown_path = file_relative_path(__file__, MARKDOWN_EXAMPLE)
    with open(markdown_path, "r") as markdown_file:
        blurb = markdown_file.read()
        # The loop stays inside the ``with`` block, so the file remains open
        # for as long as the generator is being consumed.
        for table_name in raw_tables:
            entries = [
                EventMetadataEntry.text(text=table_name, label="table_name"),
                EventMetadataEntry.fspath(
                    path="/path/to/{}".format(table_name),
                    label="table_path",
                ),
                EventMetadataEntry.json(data={"name": table_name}, label="table_data"),
                EventMetadataEntry.url(
                    url="https://bigty.pe/{}".format(table_name),
                    label="table_name_big",
                ),
                EventMetadataEntry.md(md_str=blurb, label="table_blurb"),
            ]
            yield AssetMaterialization(asset_key="table_info", metadata_entries=entries)
Exemple #10
0
def event_metadata_entries(metadata_entry_datas):
    """Turn serialized metadata-entry dicts into ``EventMetadataEntry`` objects.

    Each dict must carry ``"__typename"`` and ``"label"``; ``"description"`` is
    optional. The ``"__typename"`` value selects which constructor is used and
    which payload key(s) are read.

    Args:
        metadata_entry_datas: Iterable of dicts, or a falsy value (``None``,
            empty list, ...) in which case nothing is yielded.

    Yields:
        One ``EventMetadataEntry`` per input dict.

    Unknown type names are handed to ``check.not_implemented``.
    """
    # A falsy argument yields nothing at all.
    for entry in metadata_entry_datas or ():
        kind = entry["__typename"]
        shared = {"label": entry["label"], "description": entry.get("description")}

        if kind == "EventPathMetadataEntry":
            yield EventMetadataEntry.path(path=entry["path"], **shared)
        elif kind == "EventJsonMetadataEntry":
            raw_json = entry.get("jsonString", "")
            yield EventMetadataEntry.json(data=seven.json.loads(raw_json), **shared)
        elif kind == "EventMarkdownMetadataEntry":
            yield EventMetadataEntry.md(md_str=entry.get("md_str", ""), **shared)
        elif kind == "EventTextMetadataEntry":
            yield EventMetadataEntry.text(text=entry["text"], **shared)
        elif kind == "EventUrlMetadataEntry":
            yield EventMetadataEntry.url(url=entry["url"], **shared)
        elif kind == "EventPythonArtifactMetadataEntry":
            # No dedicated static constructor is used here; the entry data
            # object is built directly from the module and name fields.
            artifact = PythonArtifactMetadataEntryData(entry["module"], entry["name"])
            yield EventMetadataEntry(entry_data=artifact, **shared)
        else:
            check.not_implemented("TODO for type {}".format(kind))
Exemple #11
0
    def _get_metadata(self, result: Dict[str, Any]) -> List[EventMetadataEntry]:
        """Extend the parent's metadata with a row count and sample rows from Snowflake.

        The table name is the last dot-separated component of
        ``result["unique_id"]``. Two queries run against that table: a
        ``COUNT(*)`` and a 10-row sample.

        Args:
            result: Result dict; must contain a ``"unique_id"`` string.

        Returns:
            The parent class's metadata entries followed by an int entry with
            the row count and a markdown entry with the sampled rows.
        """
        model_table = result["unique_id"].split(".")[-1]
        count_query = f"SELECT COUNT(*) FROM {model_table}"
        sample_query = f"SELECT * FROM {model_table} SAMPLE ROW (10 rows)"

        with connect_snowflake(config=self._snowflake_config, schema=self._dbt_schema) as con:
            count_frame = pandas.read_sql_query(count_query, con)
            sample_frame = pandas.read_sql_query(sample_query, con)

        inherited_entries = super()._get_metadata(result)
        row_count_entry = EventMetadataEntry.int(
            int(count_frame.iloc[0][0]),
            "dbt Model Number of Rows",
        )
        # Values are cast to strings before the sample is rendered as markdown.
        sample_entry = EventMetadataEntry.md(
            sample_frame.astype("str").to_markdown(),
            "dbt Model Sample Rows",
        )
        return inherited_entries + [row_count_entry, sample_entry]
Exemple #12
0
 def ge_validation_solid(context, pandas_df):
     """Run a Great Expectations validation operator against ``pandas_df``.

     ``validation_operator_name``, ``suite_name`` and ``datasource_name`` are
     not defined in this function; they are presumably supplied by an
     enclosing factory scope.

     Args:
         context: Execution context; must expose ``resources.ge_data_context``.
         pandas_df: Data passed to Great Expectations as the ``"dataset"``
             batch kwarg.

     Yields:
         An ``ExpectationResult`` carrying the rendered markdown report,
         followed by an ``Output`` wrapping the first validation result.
     """
     ge_context = context.resources.ge_data_context

     operator_name = validation_operator_name
     if operator_name is None:
         # No operator configured: register a no-op action list operator.
         ge_context.add_validation_operator(
             "ephemeral_validation",
             dict(class_name="ActionListValidationOperator", action_list=[]),
         )
         operator_name = "ephemeral_validation"

     expectation_suite = ge_context.get_expectation_suite(suite_name)
     batch = ge_context.get_batch(
         {"dataset": pandas_df, "datasource": datasource_name},
         expectation_suite,
     )

     operator_result = ge_context.run_validation_operator(
         operator_name,
         assets_to_validate=[batch],
         run_id=dict(
             run_name=datasource_name + " run",
             run_time=datetime.datetime.utcnow(),
         ),
     )

     # Only one batch was validated, so the first result is the one of interest.
     serialized_results = convert_to_json_serializable(operator_result.list_validation_results())
     first_result = serialized_results[0]

     markdown_report = render_multiple_validation_result_pages_markdown(
         validation_operator_result=operator_result,
         run_info_at_end=True,
     )
     yield ExpectationResult(
         success=first_result["success"],
         metadata_entries=[
             EventMetadataEntry.md(md_str=markdown_report, label="Expectation Results"),
         ],
     )
     yield Output(first_result)
Exemple #13
0
def metadata_for_actions(df):
    """Build summary metadata entries for a DataFrame with a ``"score"`` column.

    Returns:
        A list of three entries: the minimum score and the maximum score
        (both converted to ``int``), and a markdown rendering of ``df[:5]``.
    """
    lowest_score = int(df["score"].min())
    highest_score = int(df["score"].max())
    preview_markdown = df[:5].to_markdown()

    entries = []
    entries.append(EventMetadataEntry.int(lowest_score, "min score"))
    entries.append(EventMetadataEntry.int(highest_score, "max score"))
    entries.append(EventMetadataEntry.md(preview_markdown, "sample rows"))
    return entries