def execute(self):
    """Execute a single plan step in this subprocess, yielding Dagster events.

    Reconstructs the pipeline, builds an execution plan restricted to
    ``self.step_key``, reports a delegation engine event, then streams the
    events produced by executing the plan against the instance.
    """
    pipeline = self.recon_pipeline
    with DagsterInstance.from_ref(self.instance_ref) as instance:
        # Watch for termination requests signaled by the parent process.
        start_termination_thread(self.term_event)
        execution_plan = create_execution_plan(
            pipeline=pipeline,
            run_config=self.run_config,
            mode=self.pipeline_run.mode,
            step_keys_to_execute=[self.step_key],
            known_state=self.known_state,
        )
        # Announce which pid is handling this step before any step events.
        yield instance.report_engine_event(
            "Executing step {} in subprocess".format(self.step_key),
            self.pipeline_run,
            EngineEventData(
                [
                    MetadataEntry.text(str(os.getpid()), "pid"),
                    MetadataEntry.text(self.step_key, "step_key"),
                ],
                marker_end=DELEGATE_MARKER,
            ),
            MultiprocessExecutor,
            self.step_key,
        )
        yield from execute_plan_iterator(
            execution_plan,
            pipeline,
            self.pipeline_run,
            run_config=self.run_config,
            retry_mode=self.retry_mode.for_inner_plan(),
            instance=instance,
        )
def _timing_to_metadata(timings: List[Dict[str, Any]]) -> List[MetadataEntry]:
    """Convert dbt timing blocks into start/end/duration metadata entries.

    Only the "execute" and "compile" timing blocks are recognized; any other
    entries are skipped.

    Args:
        timings: dbt result ``timing`` list; each item must carry ``name``,
            ``started_at`` and ``completed_at`` ISO-8601 timestamps.

    Returns:
        Flat list of MetadataEntry objects (3 per recognized timing block).
    """
    metadata = []
    for timing in timings:
        if timing["name"] == "execute":
            desc = "Execution"
        elif timing["name"] == "compile":
            desc = "Compilation"
        else:
            continue

        started_at = dateutil.parser.isoparse(timing["started_at"])
        completed_at = dateutil.parser.isoparse(timing["completed_at"])
        duration = completed_at - started_at
        metadata.extend(
            [
                MetadataEntry.text(
                    text=started_at.isoformat(timespec="seconds"), label=f"{desc} Started At"
                ),
                # BUG FIX: the "Completed At" entry previously reported
                # started_at again; it must report completed_at.
                MetadataEntry.text(
                    text=completed_at.isoformat(timespec="seconds"), label=f"{desc} Completed At"
                ),
                MetadataEntry.float(value=duration.total_seconds(), label=f"{desc} Duration"),
            ]
        )
    return metadata
def handle_output(self, context: OutputContext, obj: Union[PandasDataFrame, SparkDataFrame]):
    """Write *obj* to Snowflake, or record metadata for a dbt-created table.

    Supports pandas and Spark DataFrames. ``obj is None`` signals the table
    was produced externally (dbt); in that case a sample and row count are
    queried from the table directly. Always ends by yielding the SELECT
    statement covering the written partition window.

    Raises:
        Exception: if *obj* is neither a supported DataFrame type nor None.
    """
    schema, table = DB_SCHEMA, context.asset_key.path[-1]

    time_window = context.asset_partitions_time_window if context.has_asset_partitions else None
    with connect_snowflake(config=self._config, schema=schema) as con:
        # Clear any rows for this partition window so the write is idempotent.
        con.execute(self._get_cleanup_statement(table, schema, time_window))

    if isinstance(obj, SparkDataFrame):
        yield from self._handle_spark_output(obj, schema, table)
    elif isinstance(obj, PandasDataFrame):
        yield from self._handle_pandas_output(obj, schema, table)
    elif obj is None:  # dbt
        config = dict(SHARED_SNOWFLAKE_CONF)
        config["schema"] = DB_SCHEMA
        with connect_snowflake(config=config) as con:
            df = read_sql(f"SELECT * FROM {context.name} LIMIT 5", con=con)
            num_rows = con.execute(f"SELECT COUNT(*) FROM {context.name}").fetchone()

        yield MetadataEntry.md(df.to_markdown(), "Data sample")
        # BUG FIX: fetchone() returns a row tuple; pass the scalar count
        # (first column) to MetadataEntry.int, not the tuple itself.
        yield MetadataEntry.int(num_rows[0], "Rows")
    else:
        raise Exception(
            "SnowflakeIOManager only supports pandas DataFrames and spark DataFrames"
        )

    yield MetadataEntry.text(
        self._get_select_statement(
            table,
            schema,
            None,
            time_window,
        ),
        "Query",
    )
def handle_output(self, context, obj):
    """Persist *obj* to a table named after this output and emit metadata."""
    destination = context.name
    write_dataframe_to_table(name=destination, dataframe=obj)

    # Metadata below is attached to the resulting Handled Output event.
    yield MetadataEntry.int(len(obj), label="number of rows")
    yield MetadataEntry.text(destination, label="table name")
def join_q2_data(
    context,
    april_data,
    may_data,
    june_data,
    master_cord_data,
):
    """Join Q2 monthly flight data with airport master-coordinate data.

    Emits two ExpectationResults (required sequence-ID columns present;
    consistent columns across months), subsamples the unioned Q2 data, and
    joins it against origin- and destination-prefixed copies of the master
    coordinate data via Spark SQL. Yields the joined data with lowercased
    column names.
    """
    dfs = {"april": april_data, "may": may_data, "june": june_data}
    missing_things = []
    # Both sequence-ID columns must be present in every month's dataframe.
    for required_column in ["DestAirportSeqID", "OriginAirportSeqID"]:
        for month, df in dfs.items():
            if required_column not in df.columns:
                missing_things.append({"month": month, "missing_column": required_column})
    yield ExpectationResult(
        success=not bool(missing_things),
        label="airport_ids_present",
        description="Sequence IDs present in incoming monthly flight data.",
        metadata_entries=[
            MetadataEntry.json(label="metadata", data={"missing_columns": missing_things})
        ],
    )
    yield ExpectationResult(
        success=set(april_data.columns) == set(may_data.columns) == set(june_data.columns),
        label="flight_data_same_shape",
        metadata_entries=[
            MetadataEntry.json(label="metadata", data={"columns": april_data.columns})
        ],
    )
    q2_data = april_data.union(may_data).union(june_data)
    # Subsample according to the configured percentage (0-100).
    sampled_q2_data = q2_data.sample(
        withReplacement=False, fraction=context.solid_config["subsample_pct"] / 100.0
    )
    sampled_q2_data.createOrReplaceTempView("q2_data")
    # Register dest-/origin-prefixed copies of the coordinate data so the same
    # table can be joined twice without column-name collisions.
    dest_prefixed_master_cord_data = do_prefix_column_names(master_cord_data, "DEST_")
    dest_prefixed_master_cord_data.createOrReplaceTempView("dest_cord_data")
    origin_prefixed_master_cord_data = do_prefix_column_names(master_cord_data, "ORIGIN_")
    origin_prefixed_master_cord_data.createOrReplaceTempView("origin_cord_data")
    full_data = context.resources.pyspark.spark_session.sql(
        """
        SELECT * FROM origin_cord_data
        LEFT JOIN (
            SELECT * FROM q2_data
            LEFT JOIN dest_cord_data ON
            q2_data.DestAirportSeqID = dest_cord_data.DEST_AIRPORT_SEQ_ID
        ) q2_dest_data ON
        origin_cord_data.ORIGIN_AIRPORT_SEQ_ID = q2_dest_data.OriginAirportSeqID
        """
    )
    yield Output(rename_spark_dataframe_columns(full_data, lambda c: c.lower()))
def handle_output(self, context, obj):
    """Save the dataframe as a CSV under the step/output directory and
    report row count plus the mean of ``some_column`` as metadata."""
    target = os.path.join("my_base_dir", context.step_key, context.name)
    obj.to_csv(target)

    yield MetadataEntry.int(obj.shape[0], label="number of rows")
    yield MetadataEntry.float(obj["some_column"].mean(), "some_column mean")
def df_type_check(_, value):
    """Type check asserting *value* is a pandas DataFrame, with shape metadata."""
    if not isinstance(value, pd.DataFrame):
        return TypeCheck(success=False)
    entries = [
        MetadataEntry.text(str(len(value)), "row_count", "Number of rows in DataFrame"),
        # Columns are stringified since they may be things like datetimes.
        MetadataEntry.json({"columns": [str(col) for col in value.columns]}, "metadata"),
    ]
    return TypeCheck(success=True, metadata_entries=entries)
def df_type_check(_, value):
    """Type check asserting *value* is a pandas DataFrame, with shape metadata."""
    if not isinstance(value, pd.DataFrame):
        return TypeCheck(success=False)
    entries = [
        MetadataEntry("row_count", value=str(len(value))),
        # Columns are stringified since they may be things like datetimes.
        MetadataEntry("metadata", value={"columns": [str(col) for col in value.columns]}),
    ]
    return TypeCheck(success=True, metadata_entries=entries)
def _node_result_to_metadata(node_result: Dict[str, Any]) -> List[MetadataEntry]:
    """Build metadata entries describing a single dbt node result."""
    labeled_values = [
        ("Materialization Strategy", node_result["config"]["materialized"]),
        ("Database", node_result["database"]),
        ("Schema", node_result["schema"]),
        ("Alias", node_result["alias"]),
        ("Description", node_result["description"]),
    ]
    return [MetadataEntry.text(text=value, label=label) for label, value in labeled_values]
def _launch_k8s_job_with_args(self, job_name, args, run):
    """Construct and submit the Kubernetes Job that runs the run worker.

    Builds the job config from the run's container context and code origin,
    tags the run with the image used, and reports engine events before and
    after the Kubernetes API call.
    """
    container_context = self.get_container_context_for_run(run)

    pod_name = job_name

    pipeline_origin = run.pipeline_code_origin
    user_defined_k8s_config = get_user_defined_k8s_config(frozentags(run.tags))
    repository_origin = pipeline_origin.repository_origin

    job_config = container_context.get_k8s_job_config(
        job_image=repository_origin.container_image, run_launcher=self
    )

    # Record which image the run worker uses so it is visible on the run.
    self._instance.add_run_tags(
        run.run_id,
        {DOCKER_IMAGE_TAG: job_config.job_image},
    )

    job = construct_dagster_k8s_job(
        job_config=job_config,
        args=args,
        job_name=job_name,
        pod_name=pod_name,
        component="run_worker",
        user_defined_k8s_config=user_defined_k8s_config,
        labels={
            "dagster/job": pipeline_origin.pipeline_name,
            "dagster/run-id": run.run_id,
        },
    )

    self._instance.report_engine_event(
        "Creating Kubernetes run worker job",
        run,
        EngineEventData(
            [
                MetadataEntry("Kubernetes Job name", value=job_name),
                MetadataEntry("Kubernetes Namespace", value=container_context.namespace),
                MetadataEntry("Run ID", value=run.run_id),
            ]
        ),
        cls=self.__class__,
    )

    self._batch_api.create_namespaced_job(body=job, namespace=container_context.namespace)
    self._instance.report_engine_event(
        "Kubernetes run worker job created",
        run,
        cls=self.__class__,
    )
def result_to_materialization(
    result: Dict[str, Any],
    asset_key_prefix: Optional[List[str]] = None,
    docs_url: Optional[str] = None,
) -> Optional[AssetMaterialization]:
    """
    This is a hacky solution that attempts to consolidate parsing many of the
    potential formats that dbt can provide its results in. This is known to
    work for CLI Outputs for dbt versions 0.18+, as well as RPC responses for
    a similar time period, but as the RPC response schema is not documented
    nor enforced, this can become out of date easily.

    Returns None for unsuccessful results and for non-model nodes.
    """
    asset_key_prefix = check.opt_list_param(asset_key_prefix, "asset_key_prefix", of_type=str)

    # status comes from set of fields rather than "status"
    if "fail" in result:
        success = not result.get("fail") and not result.get("skip") and not result.get("error")
    else:
        success = result["status"] == "success"

    if not success:
        return None

    # all versions represent timing the same way
    metadata = [
        MetadataEntry.float(value=result["execution_time"], label="Execution Time (seconds)")
    ] + _timing_to_metadata(result["timing"])

    # working with a response that contains the node block (RPC and CLI 0.18.x)
    if "node" in result:
        unique_id = result["node"]["unique_id"]
        metadata += _node_result_to_metadata(result["node"])
    else:
        unique_id = result["unique_id"]

    id_prefix = unique_id.split(".")

    # only generate materializations for models
    if id_prefix[0] != "model":
        return None

    if docs_url:
        # docs link goes first so it is the most prominent entry
        metadata = [
            MetadataEntry.url(url=f"{docs_url}#!/model/{unique_id}", label="docs_url")
        ] + metadata

    return AssetMaterialization(
        description=f"dbt node: {unique_id}",
        metadata_entries=metadata,
        asset_key=asset_key_prefix + id_prefix,
    )
def load_data_to_database_from_spark(context, data_frame):
    """Load *data_frame* into the configured table and report the persistence."""
    table_name = context.solid_config["table_name"]
    context.resources.db_info.load_table(data_frame, table_name)

    db_info = context.resources.db_info
    yield AssetMaterialization(
        asset_key="table:{table_name}".format(table_name=table_name),
        description=(
            "Persisted table {table_name} in database configured in the db_info resource."
        ).format(table_name=table_name),
        metadata_entries=[
            MetadataEntry.text(label="Host", text=db_info.host),
            MetadataEntry.text(label="Db", text=db_info.db_name),
        ],
    )
    yield Output(value=table_name, output_name="table_name")
def handle_output(
    self, context: OutputContext, obj: Union[pandas.DataFrame, pyspark.sql.DataFrame]
):
    """Write the dataframe as parquet at the context-derived path and emit
    row-count and path metadata."""
    path = self._get_path(context)
    if isinstance(obj, pyspark.sql.DataFrame):
        row_count = obj.count()
        obj.write.parquet(path=path, mode="overwrite")
    elif isinstance(obj, pandas.DataFrame):
        row_count = len(obj)
        obj.to_parquet(path=path, index=False)
    else:
        raise Exception(f"Outputs of type {type(obj)} not supported.")

    yield MetadataEntry.int(value=row_count, label="row_count")
    yield MetadataEntry.path(path=path, label="path")
def raise_for_rpc_error(context: SolidExecutionContext, resp: Response) -> None:
    """Inspect a dbt RPC response and raise or retry on known error codes.

    Transient conditions (currently compiling / runtime / server errors)
    trigger a RetryRequested; other recognized codes raise Failure with
    code-specific metadata entries. Does nothing if the response carries no
    ``error`` field.
    """
    error = resp.json().get("error")
    if error is not None:
        if error["code"] in [
            DBTErrors.project_currently_compiling_error.value,
            DBTErrors.runtime_error.value,
            DBTErrors.server_error.value,
        ]:
            # Transient: log and let Dagster retry the step.
            context.log.warning(error["message"])
            raise RetryRequested(max_retries=5, seconds_to_wait=30)
        elif error["code"] == DBTErrors.project_compile_failure_error.value:
            raise Failure(
                description=error["message"],
                metadata_entries=[
                    MetadataEntry.text(text=str(error["code"]), label="RPC Error Code"),
                    MetadataEntry.text(
                        text=error["data"]["cause"]["message"], label="RPC Error Cause"
                    ),
                ],
            )
        elif error["code"] == DBTErrors.rpc_process_killed_error.value:
            raise Failure(
                description=error["message"],
                metadata_entries=[
                    MetadataEntry.text(text=str(error["code"]), label="RPC Error Code"),
                    MetadataEntry.text(text=str(error["data"]["signum"]), label="RPC Signum"),
                    MetadataEntry.text(text=error["data"]["message"], label="RPC Error Message"),
                ],
            )
        elif error["code"] == DBTErrors.rpc_timeout_error.value:
            raise Failure(
                description=error["message"],
                metadata_entries=[
                    MetadataEntry.text(text=str(error["code"]), label="RPC Error Code"),
                    MetadataEntry.text(text=str(error["data"]["timeout"]), label="RPC Timeout"),
                    MetadataEntry.text(text=error["data"]["message"], label="RPC Error Message"),
                ],
            )
        else:
            # Unknown code: still fail, but only the code is guaranteed present.
            raise Failure(
                description=error["message"],
                metadata_entries=[
                    MetadataEntry.text(text=str(error["code"]), label="RPC Error Code"),
                ],
            )
def handle_output(self, context: OutputContext, obj: Union[PandasDataFrame, SparkDataFrame]):
    """Write *obj* to the Snowflake table named in the output metadata.

    Cleans up any rows for the current partition bounds first (making the
    write idempotent), then dispatches on the dataframe type. Ends by yielding
    the SELECT statement covering what was written.
    """
    schema, table = context.metadata["table"].split(".")

    if context.metadata.get("partitioned") is True:
        partition_bounds = context.resources.partition_bounds
    else:
        partition_bounds = None

    with connect_snowflake(config=self._config, schema=schema) as con:
        con.execute(self._get_cleanup_statement(table, schema, partition_bounds))

    if isinstance(obj, SparkDataFrame):
        yield from self._handle_spark_output(obj, schema, table)
    elif isinstance(obj, PandasDataFrame):
        yield from self._handle_pandas_output(obj, schema, table)
    else:
        raise Exception(
            "SnowflakeIOManager only supports pandas DataFrames and spark DataFrames"
        )

    select_statement = self._get_select_statement(
        table, schema, context.metadata.get("columns"), partition_bounds
    )
    yield MetadataEntry.text(select_statement, "Query")
def test_raise_on_error_true_type_check_returns_unsuccessful_type_check():
    """A failing TypeCheck raises DagsterTypeCheckDidNotPass by default and
    carries the check's metadata; with raise_on_error=False the run instead
    produces a STEP_FAILURE event."""
    FalsyType = DagsterType(
        name="FalsyType",
        type_check_fn=lambda _, _val: TypeCheck(
            success=False, metadata_entries=[MetadataEntry.text("foo", "bar", "baz")]
        ),
    )

    @solid(output_defs=[OutputDefinition(FalsyType)])
    def foo_solid(_):
        return 1

    @pipeline
    def foo_pipeline():
        foo_solid()

    with pytest.raises(DagsterTypeCheckDidNotPass) as e:
        execute_pipeline(foo_pipeline)
    # The type-check metadata is surfaced on the raised exception.
    assert e.value.metadata_entries[0].label == "bar"
    assert e.value.metadata_entries[0].entry_data.text == "foo"
    assert e.value.metadata_entries[0].description == "baz"
    assert isinstance(e.value.dagster_type, DagsterType)

    pipeline_result = execute_pipeline(foo_pipeline, raise_on_error=False)
    assert not pipeline_result.success
    # Exact event sequence: start, output (the value is produced), then the
    # failure from the type check.
    assert [event.event_type_value for event in pipeline_result.step_event_list] == [
        DagsterEventType.STEP_START.value,
        DagsterEventType.STEP_OUTPUT.value,
        DagsterEventType.STEP_FAILURE.value,
    ]
    for event in pipeline_result.step_event_list:
        if event.event_type_value == DagsterEventType.STEP_FAILURE.value:
            assert event.event_specific_data.error.cls_name == "DagsterTypeCheckDidNotPass"
def __init__(self, description: str, logs: List[Dict[str, Any]], raw_output: str):
    """Capture a dbt CLI failure, attaching parsed and raw output as metadata."""
    entries = [
        MetadataEntry.json(
            {"logs": logs},
            label="Parsed CLI Output (JSON)",
        ),
        MetadataEntry.text(
            DagsterDbtCliRuntimeError.stitch_messages(logs),
            label="Parsed CLI Output (JSON) Message Attributes",
        ),
        MetadataEntry.text(
            raw_output,
            label="Raw CLI Output",
        ),
    ]
    super().__init__(description, entries)
def _get_metadata(self, result: Dict[str, Any]) -> List[MetadataEntry]:
    """
    Here, we run queries against our output Snowflake database tables to add
    additional context (row count and a sample) to our asset materializations.
    """
    table_name = result["unique_id"].split(".")[-1]
    with connect_snowflake(config=self._snowflake_config, schema=self._dbt_schema) as con:
        n_rows = pandas.read_sql_query(f"SELECT COUNT(*) FROM {table_name}", con)
        sample_rows = pandas.read_sql_query(
            f"SELECT * FROM {table_name} SAMPLE ROW (10 rows)", con
        )
    extra_entries = [
        MetadataEntry.int(int(n_rows.iloc[0][0]), "dbt Model Number of Rows"),
        MetadataEntry.md(sample_rows.astype("str").to_markdown(), "dbt Model Sample Rows"),
    ]
    return super()._get_metadata(result) + extra_entries
def handle_output(self, context, obj: pd.DataFrame):
    """This saves the dataframe as a CSV, writes a sidecar ``.version`` file,
    and emits row-count, path, sample, version, and schema metadata."""
    fpath = self._get_fs_path(context.asset_key)
    os.makedirs(os.path.dirname(fpath), exist_ok=True)
    obj.to_csv(fpath)
    # Normalize an unset version to the literal "None", matching what is
    # written to the sidecar version file.
    resolved_version = context.version if context.version else "None"
    with open(fpath + ".version", "w") as f:
        f.write(resolved_version)

    yield MetadataEntry.int(obj.shape[0], "Rows")
    yield MetadataEntry.path(fpath, "Path")
    yield MetadataEntry.md(obj.head(5).to_markdown(), "Sample")
    # BUG FIX: previously passed context.version directly, which may be None;
    # MetadataEntry.text expects a string, so use the same fallback as the
    # version file above.
    yield MetadataEntry.text(resolved_version, "Resolved version")
    yield MetadataEntry.table_schema(
        self.get_schema(context.dagster_type),
        "Schema",
    )
def _handle_pandas_output(self, obj: PandasDataFrame, schema: str, table: str):
    """Append a pandas DataFrame to a Snowflake table, yielding shape metadata."""
    from snowflake import connector  # pylint: disable=no-name-in-module

    yield MetadataEntry.int(obj.shape[0], "Rows")
    yield MetadataEntry.md(pandas_columns_to_markdown(obj), "DataFrame columns")

    connector.paramstyle = "pyformat"
    with connect_snowflake(config=self._config, schema=schema) as con:
        # Snowflake identifiers are conventionally uppercase.
        uppercased = obj.rename(str.upper, copy=False, axis="columns")
        uppercased.to_sql(
            table,
            con=con,
            if_exists="append",
            index=False,
            method=pd_writer,
        )
def cache_file_from_s3(context, s3_coordinate: S3Coordinate) -> FileHandle:
    """Download an S3 object into the file cache unless it is already cached.

    The cache key defaults to the final path segment of the S3 key, and can be
    overridden via the ``file_key`` solid config. Yields an ExpectationResult
    confirming the cached object exists, then the cache's FileHandle.
    """
    target_key = context.solid_config.get("file_key", s3_coordinate["key"].split("/")[-1])

    file_cache = context.resources.file_cache
    target_file_handle = file_cache.get_file_handle(target_key)

    if file_cache.overwrite or not file_cache.has_file_object(target_key):
        # Stream S3 -> temp file -> cache rather than holding bytes in memory.
        with get_temp_file_name() as tmp_file:
            context.resources.s3.download_file(
                Bucket=s3_coordinate["bucket"], Key=s3_coordinate["key"], Filename=tmp_file
            )
            context.log.info("File downloaded to {}".format(tmp_file))
            with open(tmp_file, "rb") as tmp_file_object:
                file_cache.write_file_object(target_key, tmp_file_object)
                context.log.info(
                    "File handle written at : {}".format(target_file_handle.path_desc)
                )
    else:
        context.log.info("File {} already present in cache".format(target_file_handle.path_desc))

    yield ExpectationResult(
        success=file_cache.has_file_object(target_key),
        label="file_handle_exists",
        metadata_entries=[
            MetadataEntry.path(path=target_file_handle.path_desc, label=target_key)
        ],
    )
    yield Output(target_file_handle)
def test_explicit_failure():
    """A user-raised Failure in a step should surface its description and
    metadata entries on the step's failure_data."""
    with tempfile.TemporaryDirectory() as tmpdir:
        run_config = {
            "resources": {
                "step_launcher": {"config": {"scratch_dir": tmpdir}},
                "io_manager": {"config": {"base_dir": tmpdir}},
            }
        }
        with instance_for_test() as instance:
            run = execute_pipeline(
                pipeline=reconstructable(_define_failure_job),
                run_config=run_config,
                instance=instance,
                raise_on_error=False,
            )
        fd = run.result_for_solid("retry_op").failure_data
        assert fd.user_failure_data.description == "some failure description"
        assert fd.user_failure_data.metadata_entries == [
            MetadataEntry.float(label="foo", value=1.23)
        ]
def _ge_validation_fn(context, dataset):
    """Validate *dataset* with Great Expectations and report the results.

    NOTE(review): relies on names captured from an enclosing factory scope
    (datasource_name, data_connector_name, data_asset_name,
    runtime_method_type, batch_identifiers, suite_name, extra_kwargs) — they
    are not defined in this block; confirm against the enclosing function.
    """
    data_context = context.resources.ge_data_context

    validator_kwargs = {
        "datasource_name": datasource_name,
        "data_connector_name": data_connector_name,
        "data_asset_name": datasource_name or data_asset_name,
        "runtime_parameters": {runtime_method_type: dataset},
        "batch_identifiers": batch_identifiers,
        "expectation_suite_name": suite_name,
        **extra_kwargs,
    }
    validator = data_context.get_validator(**validator_kwargs)

    run_id = {
        "run_name": datasource_name + " run",
        "run_time": datetime.datetime.utcnow(),
    }
    results = validator.validate(run_id=run_id)

    # Render the validation results as markdown for the metadata entry.
    validation_results_page_renderer = ValidationResultsPageRenderer(run_info_at_end=True)
    rendered_document_content_list = validation_results_page_renderer.render(
        validation_results=results
    )
    md_str = "".join(DefaultMarkdownPageView().render(rendered_document_content_list))

    meta_stats = MetadataEntry("Expectation Results", value=MetadataValue.md(md_str))
    yield ExpectationResult(
        success=bool(results["success"]),
        metadata_entries=[meta_stats],
    )
    yield Output(results.to_json_dict())
def pandera_schema_to_dagster_type(
    schema: Union[pa.DataFrameSchema, Type[pa.SchemaModel]],
) -> DagsterType:
    """
    Convert a Pandera dataframe schema to a `DagsterType`.

    The generated Dagster type will be given an automatically generated `name`. The schema's `title`
    property, `name` property, or class name (in that order) will be used. If neither `title` or
    `name` is defined, a name of the form `DagsterPanderaDataframe<n>` is generated.

    Additional metadata is also extracted from the Pandera schema and attached to the returned
    `DagsterType` in an `MetadataEntry` object. The extracted metadata includes:

    - Descriptions on the schema and constituent columns and checks.
    - Data types for each column.
    - String representations of all column-wise checks.
    - String representations of all row-wise (i.e. "wide") checks.

    The returned `DagsterType` type will call the Pandera schema's `validate()` method in its type
    check function. Validation is done in `lazy` mode, i.e. pandera will attempt to validate all
    values in the dataframe, rather than stopping on the first error.

    If validation fails, the returned `TypeCheck` object will contain two pieces of metadata:

    - `num_failures` total number of validation errors.
    - `failure_sample` a table containing up to the first 10 validation errors.

    Args:
        schema (Union[pa.DataFrameSchema, Type[pa.SchemaModel]]): the Pandera schema to convert.

    Returns:
        DagsterType: Dagster Type constructed from the Pandera schema.
    """
    if not (
        isinstance(schema, pa.DataFrameSchema)
        or (isinstance(schema, type) and issubclass(schema, pa.SchemaModel))
    ):
        raise TypeError(
            "schema must be a pandera `DataFrameSchema` or a subclass of a pandera `SchemaModel`"
        )

    # Name resolution order (title, name, class name) is handled by the helper.
    name = _extract_name_from_pandera_schema(schema)
    # SchemaModel classes are normalized into a DataFrameSchema instance.
    norm_schema = (
        schema.to_schema()  # type: ignore[attr-defined]
        if isinstance(schema, type) and issubclass(schema, pa.SchemaModel)
        else schema
    )
    tschema = _pandera_schema_to_table_schema(norm_schema)
    type_check_fn = _pandera_schema_to_type_check_fn(norm_schema, tschema)

    return DagsterType(
        type_check_fn=type_check_fn,
        name=name,
        description=norm_schema.description,
        metadata_entries=[
            MetadataEntry("schema", value=tschema),
        ],
    )
def handle_output(
    self, context: OutputContext, obj: Union[pandas.DataFrame, pyspark.sql.DataFrame]
):
    """Persist the dataframe as parquet, creating local directories if needed,
    and emit row-count and path metadata."""
    path = self._get_path(context)
    # Only create directories for local (non-URL) base paths.
    if "://" not in self._base_path:
        os.makedirs(os.path.dirname(path), exist_ok=True)

    if isinstance(obj, pyspark.sql.DataFrame):
        row_count = obj.count()
        obj.write.parquet(path=path, mode="overwrite")
    elif isinstance(obj, pandas.DataFrame):
        row_count = len(obj)
        context.log.info(f"Row count: {row_count}")
        obj.to_parquet(path=path, index=False)
    else:
        raise Exception(f"Outputs of type {type(obj)} not supported.")

    yield MetadataEntry.int(value=row_count, label="row_count")
    yield MetadataEntry.path(path=path, label="path")
def handle_output(self, context, obj):
    """Store *obj* in the in-memory values dict keyed by the output identifier,
    interleaving the two metadata-reporting styles (add_output_metadata calls
    and yielded MetadataEntry objects).
    """
    keys = tuple(context.get_output_identifier())
    self.values[keys] = obj

    context.add_output_metadata({"foo": "bar"})
    yield MetadataEntry("baz", value="baz")
    context.add_output_metadata({"bar": "bar"})

    # NOTE(review): `materialization` is not defined in this block; presumably
    # a module-level fixture in the surrounding test file — confirm.
    yield materialization
def __init__(self, invalid_line_nos: List[int]):
    """Error raised when the dbt CLI emits output lines that cannot be parsed."""
    check.list_param(invalid_line_nos, "invalid_line_nos", int)
    joined_line_nos = ", ".join(str(no) for no in invalid_line_nos)
    description = f"dbt CLI emitted unexpected output on lines {joined_line_nos}"
    entries = [
        MetadataEntry.json({"line_nos": invalid_line_nos}, "Invalid CLI Output Line Numbers")
    ]
    super().__init__(description, entries)
    self.invalid_line_nos = invalid_line_nos
def should_fail(_):
    """Unconditionally raise a Failure carrying one sample metadata entry."""
    entry = MetadataEntry.text(label="label", text="text", description="description")
    raise Failure(description="Foolure", metadata_entries=[entry])
def df_type_check(_, value):
    """Type check asserting *value* is a dask DataFrame, with column metadata."""
    if not isinstance(value, dd.DataFrame):
        return TypeCheck(success=False)
    # Columns are stringified since they may be things like datetimes.
    columns = [str(col) for col in value.columns]
    return TypeCheck(
        success=True,
        metadata_entries=[MetadataEntry.json({"columns": columns}, "metadata")],
    )
def handle_output(self, context, obj):
    """Pickle *obj* and upload it to S3, keyed by the asset's final path segment."""
    key = context.asset_key.path[-1]
    bucket = context.resource_config["bucket"]

    context.log.debug("about to pickle object")
    payload = pickle.dumps(obj)
    yield MetadataEntry.int(len(payload), "Bytes")

    client = s3_client()
    context.log.debug("created S3 client")
    client.put_object(Bucket=bucket, Key=key, Body=payload)