Python MetadataValue Beispiele, dagster.MetadataValue Python Beispiele

Beispiel #1

0

Datei anzeigen

Datei: resources.py Projekt: helloworld/dagster

    def poll_sync(
        self,
        connector_id: str,
        initial_last_sync_completion: datetime.datetime,
        poll_interval: float = DEFAULT_POLL_INTERVAL,
        poll_timeout: float = None,
    ) -> Dict[str, Any]:
        """
        Block until a Fivetran connector reports a sync newer than a known baseline.

        The connector's sync status is polled in a loop. A sync is considered finished as
        soon as the reported last-completion timestamp is strictly later than
        ``initial_last_sync_completion``; a change in that timestamp is the only signal
        used to detect completion.

        Args:
            connector_id (str): The Fivetran Connector ID. You can retrieve this value from the
                "Setup" tab of a given connector in the Fivetran UI.
            initial_last_sync_completion (datetime.datetime): Completion timestamp of the most
                recent sync (successful or not) observed before calling this method.
            poll_interval (float): Seconds to sleep between successive polls.
            poll_timeout (float): Maximum number of seconds to keep polling. A falsy value
                (including the default ``None``) disables the timeout.

        Returns:
            Dict[str, Any]: The value returned by ``get_connector_details`` for the connector.

        Raises:
            Failure: If the timeout elapses before a newer sync completes, or if the newly
                completed sync is reported as unsuccessful.
        """
        started_at = datetime.datetime.now()
        while True:
            status = self.get_connector_sync_status(connector_id)
            last_completion, last_succeeded, sync_state = status
            self._log.info(f"Polled '{connector_id}'. Status: [{sync_state}]")

            # A strictly newer completion timestamp means a new sync has finished.
            if last_completion > initial_last_sync_completion:
                break

            if poll_timeout:
                deadline = started_at + datetime.timedelta(seconds=poll_timeout)
                if datetime.datetime.now() > deadline:
                    raise Failure(
                        f"Sync for connector '{connector_id}' timed out after {datetime.datetime.now() - started_at}."
                    )

            # Wait for the configured interval before asking again.
            time.sleep(poll_interval)

        details = self.get_connector_details(connector_id)
        if last_succeeded:
            return details

        # The sync finished but did not succeed: surface the details and a logs link.
        raise Failure(
            f"Sync for connector '{connector_id}' failed!",
            metadata={
                "connector_details": MetadataValue.json(details),
                "log_url": MetadataValue.url(get_fivetran_logs_url(details)),
            },
        )

Beispiel #2

0

Datei anzeigen

 def the_solid(_context):
     """Yield one ``AssetObservation`` for asset ``"foo"`` carrying mixed metadata.

     The metadata mixes raw Python values (str, int, float) with explicit
     ``MetadataValue`` wrappers (a URL and a Python artifact reference).
     """
     observation_metadata = {
         "text": "FOO",
         "int": 22,
         "url": MetadataValue.url("http://fake.com"),
         "float": 0.1,
         "python": MetadataValue.python_artifact(MetadataValue),
     }
     yield AssetObservation(asset_key="foo", metadata=observation_metadata)

Beispiel #3

0

Datei anzeigen

Datei: test_metadata.py Projekt: trevenrawr/dagster

 def the_solid(_context):
     """Yield one ``AssetMaterialization`` for asset ``"foo"`` carrying mixed metadata.

     The metadata mixes raw Python values (str, int, float) with explicit
     ``MetadataValue`` wrappers (a URL, a filesystem path and a Python artifact).
     """
     materialization_metadata = {
         "text": "FOO",
         "int": 22,
         "url": MetadataValue.url("http://fake.com"),
         "float": 0.1,
         "path": MetadataValue.path(Path("/a/b.csv")),
         "python": MetadataValue.python_artifact(MetadataValue),
     }
     yield AssetMaterialization(asset_key="foo", metadata=materialization_metadata)

Beispiel #4

0

Datei anzeigen

Datei: op_events.py Projekt: trevenrawr/dagster

def my_failure_metadata_op():
    """Process the files found under a fixed path, failing when there are none.

    Returns:
        The result of ``some_calculation`` applied to the files returned by ``get_files``.

    Raises:
        Failure: If ``get_files`` returns an empty collection. The failure carries the
            searched path and a dashboard URL as metadata.
    """
    path = "/path/to/files"
    my_files = get_files(path)

    # Happy path first: there is something to process.
    if len(my_files) != 0:
        return some_calculation(my_files)

    failure_metadata = {
        "filepath": MetadataValue.path(path),
        "dashboard_url": MetadataValue.url("http://mycoolsite.com/failures"),
    }
    raise Failure(description="No files to process", metadata=failure_metadata)

Beispiel #5

0

Datei anzeigen

Datei: materializations.py Projekt: helloworld/dagster

def sort_by_calories(context, cereals):
    """Sort cereals by calories, log the extremes, and persist the result as a CSV file.

    Args:
        context: Execution context. ``context.run_id`` is embedded in the output file name
            and ``context.log_event`` receives the resulting materialization.
        cereals: Iterable of mapping rows. Each row must provide a ``"calories"`` value
            convertible with ``int`` and a ``"name"`` value. Must be non-empty, because
            the first and last sorted rows are indexed directly.

    Side effects:
        Creates the ``output`` directory (relative to the current working directory) if
        needed, writes ``calories_sorted_<run_id>.csv`` into it, and logs an
        ``AssetMaterialization`` pointing at the written file.
    """
    sorted_cereals = sorted(cereals,
                            key=lambda cereal: int(cereal["calories"]))
    least_caloric = sorted_cereals[0]["name"]
    most_caloric = sorted_cereals[-1]["name"]

    logger = get_dagster_logger()
    logger.info(f"Least caloric cereal: {least_caloric}")
    logger.info(f"Most caloric cereal: {most_caloric}")

    # Column order is taken from the first (least caloric) row.
    fieldnames = list(sorted_cereals[0].keys())
    sorted_cereals_csv_path = os.path.abspath(
        f"output/calories_sorted_{context.run_id}.csv")
    os.makedirs(os.path.dirname(sorted_cereals_csv_path), exist_ok=True)

    # newline="" hands line-ending control to the csv module. Without it, platforms
    # that translate "\n" on write produce "\r\r\n" and therefore blank rows.
    with open(sorted_cereals_csv_path, "w", newline="") as fd:
        writer = csv.DictWriter(fd, fieldnames)
        writer.writeheader()
        writer.writerows(sorted_cereals)

    context.log_event(
        AssetMaterialization(
            asset_key="sorted_cereals_csv",
            description="Cereals data frame sorted by caloric content",
            metadata={
                "sorted_cereals_csv_path":
                MetadataValue.path(sorted_cereals_csv_path)
            },
        ))

Beispiel #6

0

Datei anzeigen

    def _ge_validation_fn(context, dataset):
        """Validate ``dataset`` with a Great Expectations validator and emit the outcome.

        Several names used here (``datasource_name``, ``data_connector_name``,
        ``data_asset_name``, ``runtime_method_type``, ``batch_identifiers``,
        ``suite_name``, ``extra_kwargs``) are not defined in this function and are
        presumably closed over from an enclosing factory.

        Yields:
            ExpectationResult: Success flag plus a markdown rendering of the results.
            Output: The validation results converted with ``to_json_dict``.
        """
        ge_context = context.resources.ge_data_context

        # NOTE(review): data_asset_name prefers datasource_name over data_asset_name;
        # this mirrors the existing behaviour, confirm it is intended.
        validator_kwargs = {
            "datasource_name": datasource_name,
            "data_connector_name": data_connector_name,
            "data_asset_name": datasource_name or data_asset_name,
            "runtime_parameters": {runtime_method_type: dataset},
            "batch_identifiers": batch_identifiers,
            "expectation_suite_name": suite_name,
            # Caller-supplied extras come last so they can override the defaults above.
            **extra_kwargs,
        }
        validator = ge_context.get_validator(**validator_kwargs)

        results = validator.validate(
            run_id={
                "run_name": datasource_name + " run",
                "run_time": datetime.datetime.utcnow(),
            }
        )

        # Render the validation results into one markdown string for display.
        page_renderer = ValidationResultsPageRenderer(run_info_at_end=True)
        rendered_content = page_renderer.render(validation_results=results)
        md_str = "".join(DefaultMarkdownPageView().render(rendered_content))

        expectation_entry = MetadataEntry(
            "Expectation Results", value=MetadataValue.md(md_str)
        )
        yield ExpectationResult(
            success=bool(results["success"]),
            metadata_entries=[expectation_entry],
        )
        yield Output(results.to_json_dict())

Beispiel #7

0

Datei anzeigen

Datei: materialization_ops.py Projekt: helloworld/dagster

def my_metadata_materialization_op(context):
    """Persist a data frame and log an ``AssetMaterialization`` describing where it went.

    Args:
        context: Execution context whose ``log_event`` receives the materialization.

    Returns:
        The storage path returned by ``persist_to_storage``.
    """
    df = read_df()
    remote_storage_path = persist_to_storage(df)

    event_metadata = {
        "text_metadata": "Text-based metadata for this event",
        "path": MetadataValue.path(remote_storage_path),
        "dashboard_url": MetadataValue.url("http://mycoolsite.com/url_for_my_data"),
        "size (bytes)": calculate_bytes(df),
    }
    materialization = AssetMaterialization(
        asset_key="my_dataset",
        description="Persisted result to storage",
        metadata=event_metadata,
    )
    context.log_event(materialization)
    return remote_storage_path

Beispiel #8

0

Datei anzeigen

Datei: many_events.py Projekt: trevenrawr/dagster

 def raw_file_op(_context):
     """Yield a materialization, an expectation and an output for the raw table ``name``.

     ``name`` is not defined in this function; it is presumably bound by an
     enclosing scope.
     """
     raw_table_path = MetadataValue.path("/path/to/{}.raw".format(name))
     yield AssetMaterialization(
         asset_key="table_info",
         metadata={"table_path": raw_table_path},
     )
     yield do_expectation(_context, name)
     yield Output(name)

Beispiel #9

0

Datei anzeigen

Datei: log_s3.py Projekt: trevenrawr/dagster

def read_s3_key(context):
    """Build an ``s3://`` path from the solid config and report it as a materialization.

    Args:
        context: Execution context providing ``solid_config`` (with ``"s3_key"`` and
            ``"bucket"`` entries) and a ``log`` object.

    Yields:
        AssetMaterialization: Keyed by ``["log_s3", <path>]`` with the path as URL metadata.
        Output: The assembled ``s3://bucket/key`` path.
    """
    solid_config = context.solid_config
    s3_key = solid_config["s3_key"]
    bucket = solid_config["bucket"]

    path = f"s3://{bucket}/{s3_key}"
    context.log.info(f"Found file {path}")

    materialization_key = AssetKey(["log_s3", path])
    yield AssetMaterialization(
        asset_key=materialization_key,
        metadata={"S3 path": MetadataValue.url(path)},
    )
    yield Output(path)

Beispiel #10

0

Datei anzeigen

Datei: test_fs_io_manager.py Projekt: trevenrawr/dagster

def test_fs_io_manager():
    """Run a two-solid pipeline with the filesystem IO manager and check what it wrote.

    Verifies that each handled output and the loaded input report a ``"path"``
    metadata entry pointing at ``<base_dir>/<run_id>/<solid>/result``, and that the
    pickled contents on disk match the expected values.
    """
    with tempfile.TemporaryDirectory() as tmpdir_path:
        io_manager = fs_io_manager.configured({"base_dir": tmpdir_path})
        result = execute_pipeline(define_pipeline(io_manager))
        assert result.success

        handled_output_events = [
            evt for evt in result.event_list if evt.is_handled_output
        ]
        assert len(handled_output_events) == 2

        # --- output of solid_a ---
        filepath_a = os.path.join(tmpdir_path, result.run_id, "solid_a", "result")
        output_entry_a = handled_output_events[0].event_specific_data.metadata_entries[0]
        assert output_entry_a.label == "path"
        assert output_entry_a.value == MetadataValue.path(filepath_a)
        assert os.path.isfile(filepath_a)
        with open(filepath_a, "rb") as read_obj:
            assert pickle.load(read_obj) == [1, 2, 3]

        # --- input loaded by the downstream solid ---
        loaded_input_events = [
            evt for evt in result.event_list if evt.is_loaded_input
        ]
        first_loaded_input = loaded_input_events[0].event_specific_data
        input_entry_a = first_loaded_input.metadata_entries[0]
        assert input_entry_a.label == "path"
        assert input_entry_a.value == MetadataValue.path(filepath_a)
        assert len(loaded_input_events) == 1
        assert "solid_a" == loaded_input_events[0].event_specific_data.upstream_step_key

        # --- output of solid_b ---
        filepath_b = os.path.join(tmpdir_path, result.run_id, "solid_b", "result")
        output_entry_b = handled_output_events[1].event_specific_data.metadata_entries[0]
        assert output_entry_b.label == "path"
        assert output_entry_b.value == MetadataValue.path(filepath_b)
        assert os.path.isfile(filepath_b)
        with open(filepath_b, "rb") as read_obj:
            assert pickle.load(read_obj) == 1

Beispiel #11

0

Datei anzeigen

Datei: materialization_ops.py Projekt: helloworld/dagster

def my_asset_key_materialization_op(context):
    """Persist a data frame and yield a materialization under a structured asset key.

    Yields:
        AssetMaterialization: Keyed by ``["dashboard", "my_cool_site"]`` with a dashboard
            URL and the byte size reported by ``calculate_bytes``.
        Output: The storage path returned by ``persist_to_storage``.
    """
    df = read_df()
    remote_storage_path = persist_to_storage(df)

    dashboard_key = AssetKey(["dashboard", "my_cool_site"])
    event_metadata = {
        "dashboard_url": MetadataValue.url("http://mycoolsite.com/dashboard"),
        "size (bytes)": calculate_bytes(df),
    }
    yield AssetMaterialization(
        asset_key=dashboard_key,
        description="Persisted result to storage",
        metadata=event_metadata,
    )
    yield Output(remote_storage_path)

Beispiel #12

0

Datei anzeigen

def asset_metadata(_context, model_info):
    """Collect a small data sample and a row count for a model's table.

    Args:
        _context: Unused.
        model_info: Mapping with ``"schema"`` and ``"name"`` entries. The name is
            interpolated directly into the SQL text, so it must come from a trusted source.

    Returns:
        dict: ``"Data sample"`` (markdown rendering of up to five rows) and ``"Rows"``
        (first column of the ``COUNT(*)`` query result).
    """
    # Copy the shared config so the schema override stays local to this call.
    config = dict(SHARED_SNOWFLAKE_CONF)
    config["schema"] = model_info["schema"]

    with connect_snowflake(config=config) as con:
        sample_query = f"SELECT * FROM {model_info['name']} LIMIT 5"
        df = pd.read_sql(sample_query, con=con)

        count_query = f"SELECT COUNT(*) FROM {model_info['name']}"
        num_rows = con.execute(count_query).fetchone()

    sample_markdown = MetadataValue.md(df.to_markdown())
    return {"Data sample": sample_markdown, "Rows": num_rows[0]}

Beispiel #13

0

Datei anzeigen

Datei: asset_defs.py Projekt: trevenrawr/dagster

def _columns_to_metadata(
        columns: Mapping[str, Any]) -> Optional[Mapping[str, Any]]:
    return ({
        "schema":
        MetadataValue.table_schema(
            TableSchema(columns=[
                TableColumn(
                    name=name,
                    type=metadata.get("data_type") or "?",
                    description=metadata.get("description"),
                ) for name, metadata in columns.items()
            ]))
    } if len(columns) > 0 else None)

Beispiel #14

0

Datei anzeigen

def _table_data_to_materialization(
    fivetran_output: FivetranOutput,
    asset_key_prefix: List[str],
    schema_name: str,
    table_data: Dict[str, Any],
) -> AssetMaterialization:
    table_name = table_data["name_in_destination"]
    asset_key = asset_key_prefix + [schema_name, table_name]
    if not table_data["enabled"]:
        return None
    metadata = {
        "connector_url": MetadataValue.url(
            get_fivetran_connector_url(fivetran_output.connector_details)
        )
    }
    if table_data.get("columns"):
        metadata["column_info"] = MetadataValue.json(table_data.get("columns"))
    return AssetMaterialization(
        asset_key=asset_key,
        description=f"Table generated via Fivetran sync: {schema_name}.{table_name}",
        metadata=metadata,
    )

Beispiel #15

0

Datei anzeigen

Datei: utils.py Projekt: trevenrawr/dagster

def result_to_materialization(
    result: Dict[str, Any],
    asset_key_prefix: Optional[List[str]] = None,
    docs_url: Optional[str] = None,
) -> Optional[AssetMaterialization]:
    """
    Convert one dbt result dictionary into an ``AssetMaterialization``, when applicable.

    This is a hacky, best-effort parser that tries to cover many of the formats dbt can
    report results in. It is known to work for CLI outputs of dbt 0.18+ and for RPC
    responses from a similar period; since the RPC response schema is neither documented
    nor enforced, it can easily become out of date.

    Returns ``None`` when the result is not successful or when the node is not a model.
    """

    asset_key_prefix = check.opt_list_param(asset_key_prefix, "asset_key_prefix", of_type=str)

    # Some formats report status through a set of flags rather than a "status" field.
    if "fail" in result:
        success = not (result.get("fail") or result.get("skip") or result.get("error"))
    else:
        success = result["status"] == "success"
    if not success:
        return None

    # Timing is represented the same way in every supported format.
    metadata = {"Execution Time (seconds)": result["execution_time"]}
    metadata.update(_timing_to_metadata(result["timing"]))

    # Responses carrying a node block (RPC and CLI 0.18.x) hold the id inside it.
    if "node" in result:
        node = result["node"]
        unique_id = node["unique_id"]
        metadata.update(_node_result_to_metadata(node))
    else:
        unique_id = result["unique_id"]

    id_parts = unique_id.split(".")

    # Materializations are generated for models only.
    if id_parts[0] != "model":
        return None

    if docs_url:
        metadata["docs_url"] = MetadataValue.url(f"{docs_url}#!/model/{unique_id}")

    return AssetMaterialization(
        description=f"dbt node: {unique_id}",
        metadata=metadata,
        asset_key=asset_key_prefix + id_parts,
    )

Beispiel #16

0

Datei anzeigen

Datei: op_events.py Projekt: trevenrawr/dagster

def my_metadata_output(context):
    """Yield a data frame as an ``Output`` annotated with descriptive metadata.

    The metadata carries a text note, a dashboard URL, the row count (``len(df)``)
    and the byte size reported by ``calculate_bytes``.
    """
    df = get_some_data()
    output_metadata = {
        "text_metadata": "Text-based metadata for this event",
        "dashboard_url": MetadataValue.url("http://mycoolsite.com/url_for_my_data"),
        "raw_count": len(df),
        "size (bytes)": calculate_bytes(df),
    }
    yield Output(df, metadata=output_metadata)

Beispiel #17

0

Datei anzeigen

Datei: many_events.py Projekt: trevenrawr/dagster

def many_table_materializations(_context):
    """Yield one ``AssetMaterialization`` per entry of ``raw_tables``.

    Every materialization uses the asset key ``"table_info"`` and carries the table
    name, derived path and URL metadata, a shared markdown blurb read from the
    ``MARKDOWN_EXAMPLE`` file, and a few edge-case values (a very large int and NaN).
    """
    # Read the markdown once and close the file before the first yield. A generator can
    # stay suspended (or be abandoned) at a yield, which would otherwise keep the file
    # handle open for as long as the generator object lives.
    with open(file_relative_path(__file__, MARKDOWN_EXAMPLE), "r") as f:
        md_str = f.read()

    for table in raw_tables:
        yield AssetMaterialization(
            asset_key="table_info",
            metadata={
                "table_name":
                table,
                "table_path":
                MetadataValue.path(f"/path/to/{table}"),
                "table_data": {
                    "name": table
                },
                "table_name_big":
                MetadataValue.url(f"https://bigty.pe/{table}"),
                "table_blurb":
                MetadataValue.md(md_str),
                "big_int":
                29119888133298982934829348,
                "float_nan":
                float("nan"),
            },
        )

Beispiel #18

0

Datei anzeigen

def file_handle_to_s3(context, file_handle):
    """Upload the contents of ``file_handle`` to S3 and report the new object.

    Args:
        context: Execution context providing ``solid_config`` (``"Bucket"`` and ``"Key"``
            entries) and the ``file_manager`` and ``s3`` resources.
        file_handle: Handle of the file to read through the file manager.

    Yields:
        AssetMaterialization: Keyed by the S3 path of the uploaded object.
        Output: The ``S3FileHandle`` under the output name ``"s3_file_handle"``.
    """
    bucket = context.solid_config["Bucket"]
    key = context.solid_config["Key"]

    # Keep the source file open only for the upload itself. Yielding inside the `with`
    # would hold it open for as long as the generator stays suspended or unconsumed.
    with context.resources.file_manager.read(file_handle, "rb") as fileobj:
        context.resources.s3.upload_fileobj(fileobj, bucket, key)

    s3_file_handle = S3FileHandle(bucket, key)

    yield AssetMaterialization(
        asset_key=s3_file_handle.s3_path,
        metadata={
            last_key(key): MetadataValue.path(s3_file_handle.s3_path)
        },
    )

    yield Output(value=s3_file_handle, output_name="s3_file_handle")

Beispiel #19

0

Datei anzeigen

    def _ge_validation_fn(context, dataset):
        """Run a Great Expectations validation operator on ``dataset`` and emit the outcome.

        Several names used here (``validation_operator_name``, ``suite_name``,
        ``batch_kwargs``, ``datasource_name``, ``decorator_name``) are not defined in this
        function and are presumably closed over from an enclosing factory.

        Yields:
            ExpectationResult: Success flag of the first validation result plus a
                markdown rendering of the operator result.
            Output: The first validation result, converted to a JSON-serializable form.
        """
        data_context = context.resources.ge_data_context
        if validation_operator_name is not None:
            validation_operator = validation_operator_name
        else:
            # No operator configured: register a no-op action list operator to run with.
            data_context.add_validation_operator(
                "ephemeral_validation",
                {
                    "class_name": "ActionListValidationOperator",
                    "action_list": []
                },
            )
            validation_operator = "ephemeral_validation"
        suite = data_context.get_expectation_suite(suite_name)
        final_batch_kwargs = batch_kwargs or {"dataset": dataset}
        # Inspect the resolved kwargs rather than the raw `batch_kwargs`: the fallback on
        # the line above allows `batch_kwargs` to be None, and `"datasource" in None`
        # would raise TypeError. For any truthy `batch_kwargs` both names refer to the
        # same object, so the warning behaviour is unchanged.
        if "datasource" in final_batch_kwargs:
            context.log.warning(
                "`datasource` field of `batch_kwargs` will be ignored; use the `datasource_name` "
                f"parameter of the {decorator_name} factory instead.")
        # NOTE(review): when `batch_kwargs` is supplied this writes into that shared dict.
        final_batch_kwargs["datasource"] = datasource_name
        batch = data_context.get_batch(final_batch_kwargs, suite)
        run_id = {
            "run_name": datasource_name + " run",
            "run_time": datetime.datetime.utcnow(),
        }
        results = data_context.run_validation_operator(
            validation_operator, assets_to_validate=[batch], run_id=run_id)
        # Only the first validation result is reported.
        res = convert_to_json_serializable(
            results.list_validation_results())[0]
        validation_results_page_renderer = ValidationResultsPageRenderer(
            run_info_at_end=True)
        rendered_document_content_list = (
            validation_results_page_renderer.render_validation_operator_result(
                results))
        md_str = " ".join(
            DefaultMarkdownPageView().render(rendered_document_content_list))

        meta_stats = MetadataEntry("Expectation Results",
                                   value=MetadataValue.md(md_str))
        yield ExpectationResult(
            success=res["success"],
            metadata_entries=[
                meta_stats,
            ],
        )
        yield Output(res)

Beispiel #20

0

Datei anzeigen

Datei: op_events.py Projekt: trevenrawr/dagster

def my_metadata_expectation_op(context, df):
    """Transform ``df`` and log an expectation that the result has at least one row.

    Args:
        context: Execution context whose ``log_event`` receives the expectation result.
        df: Input passed to ``do_some_transform``.

    Returns:
        The transformed data frame.
    """
    df = do_some_transform(df)

    has_rows = len(df) > 0
    expectation_metadata = {
        "text_metadata": "Text-based metadata for this event",
        "dashboard_url": MetadataValue.url("http://mycoolsite.com/url_for_my_data"),
        "raw_count": len(df),
        "size (bytes)": calculate_bytes(df),
    }
    expectation = ExpectationResult(
        success=has_rows,
        description="ensure dataframe has rows",
        metadata=expectation_metadata,
    )
    context.log_event(expectation)
    return df

Beispiel #21

0

Datei anzeigen

Datei: test_metadata.py Projekt: trevenrawr/dagster

def test_table_metadata_value_schema_inference():
    """Check that a table metadata value built without a schema gets one inferred.

    Two records with a string ``name`` and a boolean ``status`` are expected to yield
    a schema with a ``"string"`` column followed by a ``"bool"`` column.
    """
    records = [
        TableRecord(name="foo", status=False),
        TableRecord(name="bar", status=True),
    ]
    table_metadata_entry = MetadataEntry(
        "foo",
        value=MetadataValue.table(records=records),
    )

    schema = table_metadata_entry.entry_data.schema  # type: ignore
    assert isinstance(schema, TableSchema)

    expected_columns = [
        TableColumn(name="name", type="string"),
        TableColumn(name="status", type="bool"),
    ]
    assert schema.columns == expected_columns

Beispiel #22

0

Datei anzeigen

def _materialization_for_stream(
    name: str,
    stream_schema_props: Dict[str, Any],
    stream_stats: Dict[str, Any],
    asset_key_prefix: List[str],
) -> AssetMaterialization:
    """Build the ``AssetMaterialization`` for a single stream.

    Args:
        name: Stream name; becomes the last component of the asset key.
        stream_schema_props: Mapping from column name to a mapping that may carry a
            ``"type"`` entry (rendered via ``str``; ``"unknown"`` when absent).
        stream_stats: Extra metadata entries; entries whose value is ``None`` are dropped.
            Because they are merged last, a ``"schema"`` stat would replace the schema.
        asset_key_prefix: Leading components of the asset key.
    """
    asset_key = asset_key_prefix + [name]

    schema_columns = [
        TableColumn(name=column_name, type=str(column_info.get("type", "unknown")))
        for column_name, column_info in stream_schema_props.items()
    ]
    schema_metadata = MetadataValue.table_schema(TableSchema(columns=schema_columns))

    present_stats = {
        stat_name: stat_value
        for stat_name, stat_value in stream_stats.items()
        if stat_value is not None
    }

    return AssetMaterialization(
        asset_key=asset_key,
        metadata={"schema": schema_metadata, **present_stats},
    )

Beispiel #23

0

Datei anzeigen

Datei: log_file.py Projekt: trevenrawr/dagster

def read_file(context):
    """Stat a configured file and report it as a materialization plus an output.

    Args:
        context: Execution context providing ``op_config`` (with ``"filename"`` and
            ``"directory"`` entries) and a ``log`` object.

    Yields:
        AssetMaterialization: Keyed by ``["log_file", <filename>]`` with the full path and
            the file's size, ctime and mtime as metadata.
        Output: The relative file name.

    If the file does not exist, an error is logged and nothing is yielded.
    """
    relative_filename = context.op_config["filename"]
    directory = context.op_config["directory"]
    filename = os.path.join(directory, relative_filename)

    # Guard only the stat call. Wrapping the yields as well would report any
    # FileNotFoundError surfacing at a yield point as if the input file were missing.
    try:
        fstats = os.stat(filename)
    except FileNotFoundError:
        context.log.error("No file found: {}".format(relative_filename))
        return

    context.log.info("Found file {}".format(relative_filename))
    yield AssetMaterialization(
        asset_key=AssetKey(["log_file", relative_filename]),
        metadata={
            "path": MetadataValue.path(filename),
            "File status": {
                "size": fstats.st_size,
                "ctime": fstats.st_ctime,
                "mtime": fstats.st_mtime,
            },
        },
    )
    yield Output(relative_filename)

Beispiel #24

0

Datei anzeigen

def build_component_top_stories(model: TruncatedSVD,
                                user_story_matrix: IndexedCooMatrix,
                                story_titles: DataFrame):
    """
    List, for every component of the collaborative filtering model, the titles of the
    stories it is most strongly associated with.

    For each row of ``model.components_`` the ten largest entries are taken (largest
    first), mapped to story ids through ``user_story_matrix.col_index``, and resolved to
    titles via ``story_titles`` (indexed by its ``"id"`` column).

    Yields:
        Output: A data frame with ``component_index`` and ``title`` columns, annotated
        with a markdown rendering produced by ``top_components_to_markdown``.
    """
    n_stories = 10

    component_indices = []
    titles = []

    titles_by_id = story_titles.set_index("id")

    for component_idx in range(model.components_.shape[0]):
        weights = model.components_[component_idx]
        # argsort is ascending: take the last n entries and reverse for largest-first.
        top_positions = weights.argsort()[-n_stories:][::-1]
        top_ids = user_story_matrix.col_index[top_positions]
        component_titles = list(titles_by_id.loc[top_ids]["title"])

        component_indices.extend([component_idx] * len(component_titles))
        titles.extend(component_titles)

    component_top_stories = DataFrame({
        "component_index": Series(component_indices),
        "title": Series(titles),
    })

    blurb = MetadataValue.md(top_components_to_markdown(component_top_stories))
    yield Output(
        component_top_stories,
        metadata={"Top component top stories": blurb},
    )

Beispiel #25

0

Datei anzeigen

Datei: many_events.py Projekt: trevenrawr/dagster

def many_materializations_and_passing_expectations(_context):
    """Yield a materialization followed by a passing expectation for each fixed table.

    Every materialization uses the asset key ``"table_info"`` and records the table's
    raw path; every expectation is a successful row-count check labelled per table.
    """
    tables = (
        "users",
        "groups",
        "events",
        "friends",
        "pages",
        "fans",
        "event_admins",
        "group_admins",
    )

    for table in tables:
        raw_path = MetadataValue.path(f"/path/to/{table}.raw")
        yield AssetMaterialization(
            asset_key="table_info",
            metadata={"table_path": raw_path},
        )

        yield ExpectationResult(
            success=True,
            label="{table}.row_count".format(table=table),
            description="Row count passed for {table}".format(table=table),
        )

Beispiel #26

0

Datei anzeigen

Datei: utils.py Projekt: helloworld/dagster

def _materialization_for_stream(
    name: str,
    stream_info: Dict[str, Any],
    stream_stats: Dict[str, Any],
    asset_key_prefix: List[str],
) -> AssetMaterialization:
    """Build the ``AssetMaterialization`` for a single stream.

    Args:
        name: Stream name; becomes the last component of the asset key.
        stream_info: Stream description; column definitions are read from
            ``stream_info["stream"]["jsonSchema"]["properties"]``, and every column
            entry must provide a ``"type"``.
        stream_stats: Extra metadata entries; entries whose value is ``None`` are dropped.
            Because they are merged last, they can replace ``"schema"`` or ``"columns"``.
        asset_key_prefix: Leading components of the asset key.
    """
    asset_key = asset_key_prefix + [name]
    properties = stream_info["stream"]["jsonSchema"]["properties"]

    schema_columns = [
        TableColumn(name=column_name, type=str(column_info["type"]))
        for column_name, column_info in properties.items()
    ]
    schema_metadata = MetadataValue.table_schema(TableSchema(columns=schema_columns))

    # Comma-separated column names, in the mapping's iteration order.
    column_names = ",".join(properties.keys())

    present_stats = {
        stat_name: stat_value
        for stat_name, stat_value in stream_stats.items()
        if stat_value is not None
    }

    return AssetMaterialization(
        asset_key=asset_key,
        metadata={
            "schema": schema_metadata,
            "columns": column_names,
            **present_stats,
        },
    )

Beispiel #27

0

Datei anzeigen

def metadata_for_actions(df):
    """Summarize ``df``: min and max of its ``"score"`` column plus a five-row sample.

    Returns:
        dict: ``"min_score"`` and ``"max_score"`` as ints, and ``"sample rows"`` as a
        markdown metadata value rendered from the first five rows.
    """
    scores = df["score"]
    lowest = int(scores.min())
    highest = int(scores.max())
    sample = MetadataValue.md(df[:5].to_markdown())
    return {"min_score": lowest, "max_score": highest, "sample rows": sample}

Beispiel #28

0

Datei anzeigen

def daily_top_action(_, df1, df2):
    """Return the single highest-scoring row across ``df1`` and ``df2`` as an ``Output``.

    The output is annotated with a markdown rendering of that row under ``"data"``.
    """
    combined = pd.concat([df1, df2])
    top_row = combined.nlargest(1, "score")
    rendered = MetadataValue.md(top_row.to_markdown())
    return Output(top_row, metadata={"data": rendered})

Beispiel #29

0

Datei anzeigen

 def _best_n_actions(_, df):
     """Return the ``n`` highest-scoring rows of ``df`` as an ``Output``.

     ``n`` is not defined in this function; it is presumably bound by an enclosing
     scope. The output is annotated with a markdown rendering under ``"data"``.
     """
     top_rows = df.nlargest(n, "score")
     rendered = MetadataValue.md(top_rows.to_markdown())
     return Output(top_rows, metadata={"data": rendered})

Beispiel #30

0

Datei anzeigen

 def handle_output(self, context, obj):
     """Delegate to the parent ``handle_output``, then yield a row-count metadata entry.

     The yielded ``MetadataEntry`` reports ``len(obj)`` as an integer metadata value.
     """
     super().handle_output(context, obj)
     # can pretend this actually came from a library call
     rows_written = MetadataValue.int(len(obj))
     yield MetadataEntry(
         label="num rows written to db",
         description=None,
         entry_data=rows_written,
     )