Ejemplo n.º 1
0
def sort_by_calories(context, cereals):
    sorted_cereals = sorted(cereals,
                            key=lambda cereal: int(cereal["calories"]))
    least_caloric = sorted_cereals[0]["name"]
    most_caloric = sorted_cereals[-1]["name"]

    logger = get_dagster_logger()
    logger.info(f"Least caloric cereal: {least_caloric}")
    logger.info(f"Most caloric cereal: {most_caloric}")

    fieldnames = list(sorted_cereals[0].keys())
    sorted_cereals_csv_path = os.path.abspath(
        f"output/calories_sorted_{context.run_id}.csv")
    os.makedirs(os.path.dirname(sorted_cereals_csv_path), exist_ok=True)

    with open(sorted_cereals_csv_path, "w") as fd:
        writer = csv.DictWriter(fd, fieldnames)
        writer.writeheader()
        writer.writerows(sorted_cereals)

    context.log_event(
        AssetMaterialization(
            asset_key="sorted_cereals_csv",
            description="Cereals data frame sorted by caloric content",
            metadata={
                "sorted_cereals_csv_path":
                MetadataValue.path(sorted_cereals_csv_path)
            },
        ))
Ejemplo n.º 2
0
 def raw_file_op(_context):
     yield AssetMaterialization(
         asset_key="table_info",
         metadata={
             "table_path":
             MetadataValue.path("/path/to/{}.raw".format(name))
         },
     )
     yield do_expectation(_context, name)
     yield Output(name)
Ejemplo n.º 3
0
def test_fs_io_manager():
    with tempfile.TemporaryDirectory() as tmpdir_path:
        io_manager = fs_io_manager.configured({"base_dir": tmpdir_path})
        pipeline_def = define_pipeline(io_manager)

        result = execute_pipeline(pipeline_def)
        assert result.success

        handled_output_events = list(
            filter(lambda evt: evt.is_handled_output, result.event_list))
        assert len(handled_output_events) == 2

        filepath_a = os.path.join(tmpdir_path, result.run_id, "solid_a",
                                  "result")
        result_metadata_entry_a = handled_output_events[
            0].event_specific_data.metadata_entries[0]
        assert result_metadata_entry_a.label == "path"
        assert result_metadata_entry_a.value == MetadataValue.path(filepath_a)
        assert os.path.isfile(filepath_a)
        with open(filepath_a, "rb") as read_obj:
            assert pickle.load(read_obj) == [1, 2, 3]

        loaded_input_events = list(
            filter(lambda evt: evt.is_loaded_input, result.event_list))
        input_metadata_entry_a = loaded_input_events[
            0].event_specific_data.metadata_entries[0]
        assert input_metadata_entry_a.label == "path"
        assert input_metadata_entry_a.value == MetadataValue.path(filepath_a)
        assert len(loaded_input_events) == 1
        assert "solid_a" == loaded_input_events[
            0].event_specific_data.upstream_step_key

        filepath_b = os.path.join(tmpdir_path, result.run_id, "solid_b",
                                  "result")
        result_metadata_entry_b = handled_output_events[
            1].event_specific_data.metadata_entries[0]
        assert result_metadata_entry_b.label == "path"
        assert result_metadata_entry_b.value == MetadataValue.path(filepath_b)
        assert os.path.isfile(filepath_b)
        with open(filepath_b, "rb") as read_obj:
            assert pickle.load(read_obj) == 1
Ejemplo n.º 4
0
 def the_solid(_context):
     yield AssetMaterialization(
         asset_key="foo",
         metadata={
             "text": "FOO",
             "int": 22,
             "url": MetadataValue.url("http://fake.com"),
             "float": 0.1,
             "path": MetadataValue.path(Path("/a/b.csv")),
             "python": MetadataValue.python_artifact(MetadataValue),
         },
     )
Ejemplo n.º 5
0
def my_failure_metadata_op():
    path = "/path/to/files"
    my_files = get_files(path)
    if len(my_files) == 0:
        raise Failure(
            description="No files to process",
            metadata={
                "filepath":
                MetadataValue.path(path),
                "dashboard_url":
                MetadataValue.url("http://mycoolsite.com/failures"),
            },
        )
    return some_calculation(my_files)
Ejemplo n.º 6
0
def file_handle_to_s3(context, file_handle):
    bucket = context.solid_config["Bucket"]
    key = context.solid_config["Key"]

    with context.resources.file_manager.read(file_handle, "rb") as fileobj:
        context.resources.s3.upload_fileobj(fileobj, bucket, key)
        s3_file_handle = S3FileHandle(bucket, key)

        yield AssetMaterialization(
            asset_key=s3_file_handle.s3_path,
            metadata={
                last_key(key): MetadataValue.path(s3_file_handle.s3_path)
            },
        )

        yield Output(value=s3_file_handle, output_name="s3_file_handle")
Ejemplo n.º 7
0
def my_metadata_materialization_op(context):
    df = read_df()
    remote_storage_path = persist_to_storage(df)
    context.log_event(
        AssetMaterialization(
            asset_key="my_dataset",
            description="Persisted result to storage",
            metadata={
                "text_metadata":
                "Text-based metadata for this event",
                "path":
                MetadataValue.path(remote_storage_path),
                "dashboard_url":
                MetadataValue.url("http://mycoolsite.com/url_for_my_data"),
                "size (bytes)":
                calculate_bytes(df),
            },
        ))
    return remote_storage_path
Ejemplo n.º 8
0
def read_file(context):
    relative_filename = context.op_config["filename"]
    directory = context.op_config["directory"]
    filename = os.path.join(directory, relative_filename)
    try:
        fstats = os.stat(filename)
        context.log.info("Found file {}".format(relative_filename))
        yield AssetMaterialization(
            asset_key=AssetKey(["log_file", relative_filename]),
            metadata={
                "path": MetadataValue.path(filename),
                "File status": {
                    "size": fstats.st_size,
                    "ctime": fstats.st_ctime,
                    "mtime": fstats.st_mtime,
                },
            },
        )
        yield Output(relative_filename)
    except FileNotFoundError:
        context.log.error("No file found: {}".format(relative_filename))
Ejemplo n.º 9
0
def many_table_materializations(_context):
    with open(file_relative_path(__file__, MARKDOWN_EXAMPLE), "r") as f:
        md_str = f.read()
        for table in raw_tables:
            yield AssetMaterialization(
                asset_key="table_info",
                metadata={
                    "table_name":
                    table,
                    "table_path":
                    MetadataValue.path(f"/path/to/{table}"),
                    "table_data": {
                        "name": table
                    },
                    "table_name_big":
                    MetadataValue.url(f"https://bigty.pe/{table}"),
                    "table_blurb":
                    MetadataValue.md(md_str),
                    "big_int":
                    29119888133298982934829348,
                    "float_nan":
                    float("nan"),
                },
            )
Ejemplo n.º 10
0
def many_materializations_and_passing_expectations(_context):
    tables = [
        "users",
        "groups",
        "events",
        "friends",
        "pages",
        "fans",
        "event_admins",
        "group_admins",
    ]

    for table in tables:
        yield AssetMaterialization(
            asset_key="table_info",
            metadata={
                "table_path": MetadataValue.path(f"/path/to/{table}.raw"),
            },
        )
        yield ExpectationResult(
            success=True,
            label="{table}.row_count".format(table=table),
            description="Row count passed for {table}".format(table=table),
        )
Ejemplo n.º 11
0
       partitions_def=daily_partitions_def)
def downstream_daily_partitioned_asset(upstream_daily_partitioned_asset):
    assert upstream_daily_partitioned_asset is None


@asset(
    metadata={"owner": "*****@*****.**"},
    partitions_def=HourlyPartitionsDefinition(
        start_date=datetime(2022, 3, 12, 0, 0)),
)
def hourly_partitioned_asset():
    pass


@asset(
    metadata={
        "owner":
        "*****@*****.**",
        "text_metadata":
        "Text-based metadata about this asset",
        "path":
        MetadataValue.path("/unpartitioned/asset"),
        "dashboard_url":
        MetadataValue.url("http://mycoolsite.com/url_for_my_asset"),
    }, )
def unpartitioned_asset():
    pass


partitioned_asset_group = AssetGroup.from_current_module()