Example 1
    def _ge_validation_fn(context, dataset):
        data_context = context.resources.ge_data_context
        # Build a validator against the configured datasource, handing the
        # in-memory dataset over as a runtime parameter.
        validator_kwargs = {
            "datasource_name": datasource_name,
            "data_connector_name": data_connector_name,
            "data_asset_name": data_asset_name or datasource_name,
            "runtime_parameters": {
                runtime_method_type: dataset
            },
            "batch_identifiers": batch_identifiers,
            "expectation_suite_name": suite_name,
            **extra_kwargs,
        }
        validator = data_context.get_validator(**validator_kwargs)

        run_id = {
            "run_name": datasource_name + " run",
            "run_time": datetime.datetime.utcnow(),
        }
        results = validator.validate(run_id=run_id)

        # Render the validation results to Markdown so they can be attached
        # as metadata in the Dagster UI.
        validation_results_page_renderer = ValidationResultsPageRenderer(
            run_info_at_end=True)
        rendered_document_content_list = validation_results_page_renderer.render(
            validation_results=results)
        md_str = "".join(
            DefaultMarkdownPageView().render(rendered_document_content_list))

        meta_stats = MetadataEntry.md(md_str=md_str,
                                      label="Expectation Results")
        yield ExpectationResult(
            success=bool(results["success"]),
            metadata_entries=[meta_stats],
        )
        yield Output(results.to_json_dict())
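This validation function is the kind of inner op that dagster-ge's factory functions return. A minimal sketch of how it might be wired into a job, assuming a Great Expectations project is configured and using hypothetical datasource/suite names (`ge_validation_op_factory` and the `ge_data_context` resource come from `dagster_ge`):

    from dagster import job, op
    from dagster_ge.factory import ge_data_context, ge_validation_op_factory

    # Hypothetical names; they must match the GE project's configuration.
    validate_frame = ge_validation_op_factory(
        name="validate_frame",
        datasource_name="my_datasource",
        suite_name="my_suite",
    )

    @op
    def load_frame():
        ...  # produce the DataFrame to validate

    @job(resource_defs={"ge_data_context": ge_data_context})
    def validation_job():
        validate_frame(load_frame())

The `ge_data_context` resource still needs run config pointing its `ge_root_dir` at the Great Expectations project directory.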
Example 2
    def handle_output(self, context: OutputContext, obj: Union[PandasDataFrame, SparkDataFrame]):
        schema, table = DB_SCHEMA, context.asset_key.path[-1]

        # If this output is partitioned, only the rows in the partition's time
        # window are cleared before the write; otherwise the whole table is.
        time_window = context.asset_partitions_time_window if context.has_asset_partitions else None
        with connect_snowflake(config=self._config, schema=schema) as con:
            con.execute(self._get_cleanup_statement(table, schema, time_window))

        if isinstance(obj, SparkDataFrame):
            yield from self._handle_spark_output(obj, schema, table)
        elif isinstance(obj, PandasDataFrame):
            yield from self._handle_pandas_output(obj, schema, table)
        elif obj is None:  # dbt has already written the table; just emit metadata
            config = dict(SHARED_SNOWFLAKE_CONF)
            config["schema"] = DB_SCHEMA
            with connect_snowflake(config=config) as con:
                df = read_sql(f"SELECT * FROM {context.name} LIMIT 5", con=con)
                num_rows = con.execute(f"SELECT COUNT(*) FROM {context.name}").fetchone()[0]

            yield MetadataEntry.md(df.to_markdown(), "Data sample")
            yield MetadataEntry.int(num_rows, "Rows")
        else:
            raise Exception(
                "SnowflakeIOManager only supports pandas and Spark DataFrames"
            )

        yield MetadataEntry.text(
            self._get_select_statement(
                table,
                schema,
                None,
                time_window,
            ),
            "Query",
        )
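The `_get_cleanup_statement` helper isn't shown above; a minimal sketch of what it might look like, assuming partitioned tables carry a `time` column (the column name is hypothetical):

    def _get_cleanup_statement(self, table: str, schema: str, time_window) -> str:
        """Return a DELETE that clears the rows this write is about to replace."""
        if time_window:
            start, end = time_window
            # Only clear the partition's slice of the table.
            return (
                f"DELETE FROM {schema}.{table} "
                f"WHERE time >= '{start}' AND time < '{end}'"
            )
        # Unpartitioned output: clear the whole table.
        return f"DELETE FROM {schema}.{table}"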
Example 3
    def _handle_spark_output(self, df: SparkDataFrame, schema: str, table: str):
        # Connection options for the Snowflake Spark connector.
        options = {
            "sfURL": f"{self._config['account']}.snowflakecomputing.com",
            "sfUser": self._config["user"],
            "sfPassword": self._config["password"],
            "sfDatabase": self._config["database"],
            "sfSchema": schema,
            "sfWarehouse": self._config["warehouse"],
            "dbtable": table,
        }
        yield MetadataEntry.md(spark_columns_to_markdown(df.schema), "DataFrame columns")

        # Append the DataFrame to the target table via the Spark connector.
        df.write.format("net.snowflake.spark.snowflake").options(**options).mode("append").save()
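The `spark_columns_to_markdown` helper isn't shown either; a plausible sketch that renders a Spark `StructType` as a two-column Markdown table:

    from pyspark.sql.types import StructType

    def spark_columns_to_markdown(schema: StructType) -> str:
        # One row per field: column name and Spark data type.
        header = "| Column | Type |\n| --- | --- |\n"
        return header + "\n".join(
            f"| {field.name} | {field.dataType} |" for field in schema.fields
        )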
Example 4
    def handle_output(self, context, obj: pd.DataFrame):
        """This saves the dataframe as a CSV."""
        fpath = self._get_fs_path(context.asset_key)
        os.makedirs(os.path.dirname(fpath), exist_ok=True)
        obj.to_csv(fpath)
        # Record the resolved version in a sidecar file so later runs can
        # check whether the stored output is still current.
        with open(fpath + ".version", "w") as f:
            f.write(context.version if context.version else "None")

        yield MetadataEntry.int(obj.shape[0], "Rows")
        yield MetadataEntry.path(fpath, "Path")
        yield MetadataEntry.md(obj.head(5).to_markdown(), "Sample")
        yield MetadataEntry.text(context.version, "Resolved version")
        yield MetadataEntry.table_schema(
            self.get_schema(context.dagster_type),
            "Schema",
        )
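`_get_fs_path` is assumed above; a minimal sketch that maps an `AssetKey` to a CSV path under a base directory (the `_base_dir` attribute is hypothetical):

    import os

    def _get_fs_path(self, asset_key) -> str:
        # e.g. AssetKey(["raw", "users"]) -> <base_dir>/raw/users.csv
        return os.path.abspath(
            os.path.join(self._base_dir, *asset_key.path) + ".csv"
        )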
Example 5
    def _get_metadata(self, result: Dict[str, Any]) -> List[MetadataEntry]:
        """
        Here, we run queries against our output Snowflake database tables to add additional context
        to our asset materializations.
        """

        table_name = result["unique_id"].split(".")[-1]
        with connect_snowflake(config=self._snowflake_config, schema=self._dbt_schema) as con:
            n_rows = pandas.read_sql_query(f"SELECT COUNT(*) FROM {table_name}", con)
            # Snowflake's SAMPLE clause returns a fixed number of random rows.
            sample_rows = pandas.read_sql_query(
                f"SELECT * FROM {table_name} SAMPLE ROW (10 rows)", con
            )
        return super()._get_metadata(result) + [
            MetadataEntry.int(int(n_rows.iloc[0][0]), "dbt Model Number of Rows"),
            MetadataEntry.md(sample_rows.astype("str").to_markdown(), "dbt Model Sample Rows"),
        ]
Example 6
    def _handle_pandas_output(self, obj: PandasDataFrame, schema: str, table: str):
        from snowflake import connector  # pylint: disable=no-name-in-module
        from snowflake.connector.pandas_tools import pd_writer

        yield MetadataEntry.int(obj.shape[0], "Rows")
        yield MetadataEntry.md(pandas_columns_to_markdown(obj), "DataFrame columns")

        # pd_writer requires the pyformat paramstyle.
        connector.paramstyle = "pyformat"
        with connect_snowflake(config=self._config, schema=schema) as con:
            # Snowflake folds unquoted identifiers to uppercase.
            with_uppercase_cols = obj.rename(str.upper, copy=False, axis="columns")
            with_uppercase_cols.to_sql(
                table,
                con=con,
                if_exists="append",
                index=False,
                method=pd_writer,
            )
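`connect_snowflake` is used throughout these examples but never shown. Since `to_sql(..., method=pd_writer)` needs a SQLAlchemy connection, it is presumably a context manager around a snowflake-sqlalchemy engine; a sketch under that assumption:

    from contextlib import contextmanager

    from snowflake.sqlalchemy import URL
    from sqlalchemy import create_engine

    @contextmanager
    def connect_snowflake(config, schema="PUBLIC"):
        url = URL(
            account=config["account"],
            user=config["user"],
            password=config["password"],
            database=config["database"],
            warehouse=config["warehouse"],
            schema=schema,
        )
        conn = create_engine(url).connect()
        try:
            yield conn
        finally:
            conn.close()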
Example 7
    def _ge_validation_fn(context, dataset):
        data_context = context.resources.ge_data_context
        # Fall back to an ephemeral, action-free validation operator when none
        # is configured.
        if validation_operator_name is not None:
            validation_operator = validation_operator_name
        else:
            data_context.add_validation_operator(
                "ephemeral_validation",
                {
                    "class_name": "ActionListValidationOperator",
                    "action_list": []
                },
            )
            validation_operator = "ephemeral_validation"
        suite = data_context.get_expectation_suite(suite_name)
        final_batch_kwargs = batch_kwargs or {"dataset": dataset}
        # Check the merged kwargs: batch_kwargs itself may be None.
        if "datasource" in final_batch_kwargs:
            context.log.warning(
                "`datasource` field of `batch_kwargs` will be ignored; use the `datasource_name` "
                f"parameter of the {decorator_name} factory instead.")
        final_batch_kwargs["datasource"] = datasource_name
        batch = data_context.get_batch(final_batch_kwargs, suite)
        run_id = {
            "run_name": datasource_name + " run",
            "run_time": datetime.datetime.utcnow(),
        }
        results = data_context.run_validation_operator(
            validation_operator, assets_to_validate=[batch], run_id=run_id)
        res = convert_to_json_serializable(
            results.list_validation_results())[0]
        validation_results_page_renderer = ValidationResultsPageRenderer(
            run_info_at_end=True)
        rendered_document_content_list = (
            validation_results_page_renderer.render_validation_operator_result(
                results))
        md_str = " ".join(
            DefaultMarkdownPageView().render(rendered_document_content_list))

        meta_stats = MetadataEntry.md(md_str=md_str,
                                      label="Expectation Results")
        yield ExpectationResult(
            success=res["success"],
            metadata_entries=[meta_stats],
        )
        yield Output(res)