Example #1
def test_build_output_context_with_cm_resource():
    entered = []

    @resource
    def cm_resource():
        try:
            yield "foo"
        finally:
            entered.append("yes")

    context = build_output_context(step_key="test",
                                   name="test",
                                   resources={"cm_resource": cm_resource})
    with pytest.raises(
            DagsterInvariantViolationError,
            match=re.escape(
                "At least one provided resource is a generator, but attempting to access "
                "resources outside of context manager scope. You can use the following syntax to "
                "open a context manager: `with build_output_context(...) as context:`",
            ),
    ):
        context.resources  # pylint: disable=pointless-statement

    # dropping the last reference tears down the never-entered resource
    # generator, so its finally block has appended to `entered`
    del context

    assert entered == ["yes"]

    with build_output_context(step_key="test",
                              name="test",
                              resources={"cm_resource":
                                         cm_resource}) as context:
        assert context.resources.cm_resource == "foo"

    assert entered == ["yes", "yes"]
Example #2
def test_versioned_pickled_object_filesystem_io_manager():
    with TemporaryDirectory() as temp_dir:
        store = VersionedPickledObjectFilesystemIOManager(temp_dir)
        context = build_output_context(step_key="foo",
                                       name="bar",
                                       version="version1")
        store.handle_output(context, "cat")
        assert store.has_output(context)
        assert store.load_input(
            build_input_context(upstream_output=context)) == "cat"
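        # the same output under a different version is a distinct key, so it reads as missing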
        context_diff_version = build_output_context(step_key="foo",
                                                    name="bar",
                                                    version="version2")
        assert not store.has_output(context_diff_version)
Example #3
def test_handle_output_spark_then_load_input_pandas():
    snowflake_manager = snowflake_io_manager(
        build_init_resource_context(config={"database": "TESTDB"},
                                    resources={"partition_bounds": None}))
    spark = SparkSession.builder.config(
        "spark.jars.packages",
        "net.snowflake:snowflake-jdbc:3.8.0,net.snowflake:spark-snowflake_2.12:2.8.2-spark_3.0",
    ).getOrCreate()

    schema = StructType([
        StructField("col1", StringType()),
        StructField("col2", IntegerType())
    ])
    contents = spark.createDataFrame([Row(col1="Thom", col2=51)], schema)

    with temporary_snowflake_table(PandasDataFrame([{
            "col1": "a",
            "col2": 1
    }])) as temp_table_name:
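        # point the IO manager at the temporary table via the output metadata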
        metadata = {
            "table": f"public.{temp_table_name}",
        }
        output_context = build_output_context(metadata=metadata)

        list(snowflake_manager.handle_output(output_context,
                                             contents))  # exhaust the iterator

        input_context = build_input_context(upstream_output=output_context)
        input_value = snowflake_manager.load_input(input_context)
        contents_pandas = contents.toPandas()
        assert str(input_value) == str(
            contents_pandas), f"{input_value}\n\n{contents_pandas}"
Example #4
def test_context_logging_metadata():
    context = build_output_context()

    context.add_output_metadata({"foo": "bar"})

    assert [entry.label
            for entry in context.get_logged_metadata_entries()] == ["foo"]
Example #5
def test_my_io_manager_load_input():
    manager = my_io_manager(None)
    manager.storage_dict[("123", "abc")] = 5

    context = build_input_context(
        upstream_output=build_output_context(name="abc", step_key="123"))
    assert manager.load_input(context) == 5
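
my_io_manager is defined elsewhere in the module this example was taken from. A minimal dict-backed sketch consistent with this test and with Example #16 might look as follows; the class name and bodies are assumptions, not the original code:

from dagster import IOManager, io_manager


class MyIOManager(IOManager):
    def __init__(self):
        self.storage_dict = {}

    def handle_output(self, context, obj):
        # key stored values by the producing step and output name
        self.storage_dict[(context.step_key, context.name)] = obj

    def load_input(self, context):
        # look up whatever the upstream output wrote
        upstream = context.upstream_output
        return self.storage_dict[(upstream.step_key, upstream.name)]


@io_manager
def my_io_manager(_init_context):
    return MyIOManager()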
Example #6
def test_context_logging_user_events():
    context = build_output_context()

    context.log_event(AssetMaterialization("first"))
    context.log_event(AssetMaterialization("second"))
    assert [event.label
            for event in context.get_logged_events()] == ["first", "second"]
Example #7
def test_mem_io_manager_execution():
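    # write a value with an output context, then read it back through a matching input context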
    mem_io_manager_instance = InMemoryIOManager()
    output_context = build_output_context(step_key="step_key",
                                          name="output_name")
    mem_io_manager_instance.handle_output(output_context, 1)
    input_context = build_input_context(upstream_output=output_context)
    assert mem_io_manager_instance.load_input(input_context) == 1
Example #8
def test_output_identifier_dynamic_memoization():
    context = build_output_context(version="foo",
                                   mapping_key="bar",
                                   step_key="baz",
                                   name="buzz")

    with pytest.raises(
            CheckError,
            match="Mapping key and version both provided for output 'buzz' of step "
            "'baz'. Dynamic mapping is not supported when using versioning.",
    ):
        context.get_output_identifier()
Example #9
def test_df_to_csv_io_manager():
    with tempfile.TemporaryDirectory() as temp_dir:
        my_io_manager = df_to_csv_io_manager(
            build_init_resource_context(config={"base_dir": temp_dir}))
        test_df = pd.DataFrame(data={"col1": [1, 2], "col2": [3, 4]})
        # test handle_output
        output_context = build_output_context(name="abc", step_key="123")
        my_io_manager.handle_output(output_context, test_df)
        output_path = my_io_manager._get_path(output_context)  # pylint:disable=protected-access
        assert os.path.exists(output_path)
        assert test_df.equals(pd.read_csv(output_path))

        # test load_input
        input_context = build_input_context(upstream_output=output_context)
        assert test_df.equals(my_io_manager.load_input(input_context))
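
df_to_csv_io_manager also comes from the surrounding module. A sketch consistent with the calls above (including the protected _get_path helper), assuming one CSV file per step/output pair; names and path layout are guesses:

import os

import pandas as pd
from dagster import IOManager, io_manager


class DataframeToCsvIOManager(IOManager):
    def __init__(self, base_dir):
        self._base_dir = base_dir

    def _get_path(self, context):
        # one file per (step_key, output name)
        return os.path.join(self._base_dir,
                            f"{context.step_key}_{context.name}.csv")

    def handle_output(self, context, obj):
        obj.to_csv(self._get_path(context), index=False)

    def load_input(self, context):
        # resolve the same path the upstream output was written to
        return pd.read_csv(self._get_path(context.upstream_output))


@io_manager(config_schema={"base_dir": str})
def df_to_csv_io_manager(init_context):
    return DataframeToCsvIOManager(base_dir=init_context.resource_config["base_dir"])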
Example #10
def test_handle_output_then_load_input_pandas():
    snowflake_manager = snowflake_io_manager(
        build_init_resource_context(
            config={"database": "TESTDB"}, resources={"partition_bounds": None}
        )
    )
    contents1 = PandasDataFrame([{"col1": "a", "col2": 1}])  # just to get the types right
    contents2 = PandasDataFrame([{"col1": "b", "col2": 2}])  # contents we will insert
    with temporary_snowflake_table(contents1) as temp_table_name:
        metadata = {"table": f"public.{temp_table_name}"}
        output_context = build_output_context(metadata=metadata)

        list(snowflake_manager.handle_output(output_context, contents2))  # exhaust the iterator

        input_context = build_input_context(upstream_output=output_context)
        input_value = snowflake_manager.load_input(input_context)
        assert input_value.equals(contents2), f"{input_value}\n\n{contents2}"
Example #11
def test_handle_output_then_load_input():
    snowflake_config = generate_snowflake_config()
    snowflake_manager = snowflake_io_manager(build_init_resource_context(config=snowflake_config))
    contents1 = DataFrame([{"col1": "a", "col2": 1}])  # just to get the types right
    contents2 = DataFrame([{"col1": "b", "col2": 2}])  # contents we will insert
    with temporary_snowflake_table(contents1) as temp_table_name:
        metadata = {
            "table": f"public.{temp_table_name}",
        }
        output_context = build_output_context(metadata=metadata, resource_config=snowflake_config)

        list(snowflake_manager.handle_output(output_context, contents2))  # exhaust the iterator

        input_context = build_input_context(
            upstream_output=output_context, resource_config=snowflake_config
        )
        input_value = snowflake_manager.load_input(input_context)
        assert input_value.equals(contents2), f"{input_value}\n\n{contents2}"
Example #12
def test_handle_output_then_load_input():
    snowflake_manager = SnowflakeIOManager(config=PROD_SNOWFLAKE_CONF)
    contents1 = DataFrame([{"col1": "a", "col2": 1}])  # just to get the types right
    contents2 = DataFrame([{"col1": "b", "col2": 2}])  # contents we will insert
    with temporary_snowflake_table(contents1) as temp_table_name:

        @solid(output_defs=[OutputDefinition(asset_key=AssetKey(temp_table_name))])
        def my_solid():
            pass

        output_context = build_output_context(
            name="result", solid_def=my_solid, resource_config=PROD_SNOWFLAKE_CONF
        )
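        # with no "table" metadata, the manager presumably derives the target from the output's asset key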

        list(snowflake_manager.handle_output(output_context, contents2))  # exhaust the iterator

        input_context = build_input_context(
            upstream_output=output_context, resource_config=PROD_SNOWFLAKE_CONF
        )
        input_value = snowflake_manager.load_input(input_context)
        assert input_value.equals(contents2), f"{input_value}\n\n{contents2}"
Example #13
def test_gcs_pickle_io_manager_execution(gcs_bucket):
    inty_job = define_inty_job()

    run_config = {
        "resources": {
            "io_manager": {
                "config": {
                    "gcs_bucket": gcs_bucket,
                }
            }
        }
    }

    run_id = make_new_run_id()

    resolved_run_config = ResolvedRunConfig.build(inty_job,
                                                  run_config=run_config)
    execution_plan = ExecutionPlan.build(InMemoryPipeline(inty_job),
                                         resolved_run_config)

    assert execution_plan.get_step_by_key("return_one")

    step_keys = ["return_one"]
    instance = DagsterInstance.ephemeral()
    pipeline_run = PipelineRun(pipeline_name=inty_job.name,
                               run_id=run_id,
                               run_config=run_config)

    return_one_step_events = list(
        execute_plan(
            execution_plan.build_subset_plan(step_keys, inty_job,
                                             resolved_run_config),
            pipeline=InMemoryPipeline(inty_job),
            run_config=run_config,
            pipeline_run=pipeline_run,
            instance=instance,
        ))

    assert get_step_output(return_one_step_events, "return_one")

    # read the pickled step output back directly through the IO manager
    io_manager = PickledObjectGCSIOManager(gcs_bucket, storage.Client())
    step_output_handle = StepOutputHandle("return_one")
    context = build_input_context(upstream_output=build_output_context(
        step_key=step_output_handle.step_key,
        name=step_output_handle.output_name,
        run_id=run_id,
    ))
    assert io_manager.load_input(context) == 1

    add_one_step_events = list(
        execute_plan(
            execution_plan.build_subset_plan(["add_one"], inty_job,
                                             resolved_run_config),
            pipeline=InMemoryPipeline(inty_job),
            run_config=run_config,
            pipeline_run=pipeline_run,
            instance=instance,
        ))

    step_output_handle = StepOutputHandle("add_one")
    context = build_input_context(upstream_output=build_output_context(
        step_key=step_output_handle.step_key,
        name=step_output_handle.output_name,
        run_id=run_id,
    ))

    assert get_step_output(add_one_step_events, "add_one")
    assert io_manager.load_input(context) == 2
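
define_inty_job is a helper from the test module. Inferred from the assertions (return_one loads as 1, add_one as 2), a plausible sketch is the following; the op bodies, job name, and resource wiring are assumptions:

from dagster import job, op
from dagster_gcp.gcs import gcs_pickle_io_manager, gcs_resource


def define_inty_job():
    @op
    def return_one():
        return 1

    @op
    def add_one(num):
        return num + 1

    @job(resource_defs={"io_manager": gcs_pickle_io_manager, "gcs": gcs_resource})
    def basic_job():
        add_one(return_one())

    return basic_job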
Example #14
def test_s3_pickle_io_manager_execution(mock_s3_bucket):
    pipeline_def = define_inty_pipeline()

    run_config = {
        "resources": {
            "io_manager": {
                "config": {
                    "s3_bucket": mock_s3_bucket.name
                }
            }
        }
    }

    run_id = make_new_run_id()

    resolved_run_config = ResolvedRunConfig.build(pipeline_def,
                                                  run_config=run_config)
    execution_plan = ExecutionPlan.build(InMemoryPipeline(pipeline_def),
                                         resolved_run_config)

    assert execution_plan.get_step_by_key("return_one")

    step_keys = ["return_one"]
    instance = DagsterInstance.ephemeral()
    pipeline_run = PipelineRun(pipeline_name=pipeline_def.name,
                               run_id=run_id,
                               run_config=run_config)

    return_one_step_events = list(
        execute_plan(
            execution_plan.build_subset_plan(step_keys, pipeline_def,
                                             resolved_run_config),
            pipeline=InMemoryPipeline(pipeline_def),
            run_config=run_config,
            pipeline_run=pipeline_run,
            instance=instance,
        ))

    assert get_step_output(return_one_step_events, "return_one")

    io_manager = PickledObjectS3IOManager(mock_s3_bucket.name,
                                          construct_s3_client(max_attempts=5),
                                          s3_prefix="dagster")
    step_output_handle = StepOutputHandle("return_one")
    context = build_input_context(upstream_output=build_output_context(
        step_key=step_output_handle.step_key,
        name=step_output_handle.output_name,
        run_id=run_id,
    ))
    assert io_manager.load_input(context) == 1

    add_one_step_events = list(
        execute_plan(
            execution_plan.build_subset_plan(["add_one"], pipeline_def,
                                             resolved_run_config),
            pipeline=InMemoryPipeline(pipeline_def),
            run_config=run_config,
            pipeline_run=pipeline_run,
            instance=instance,
        ))

    step_output_handle = StepOutputHandle("add_one")
    context = build_input_context(upstream_output=build_output_context(
        step_key=step_output_handle.step_key,
        name=step_output_handle.output_name,
        run_id=run_id,
    ))

    assert get_step_output(add_one_step_events, "add_one")
    assert io_manager.load_input(context) == 2
Example #15
def test_adls2_pickle_io_manager_execution(storage_account, file_system,
                                           credential):
    job = define_inty_job()

    run_config = {
        "resources": {
            "io_manager": {
                "config": {
                    "adls2_file_system": file_system
                }
            },
            "adls2": {
                "config": {
                    "storage_account": storage_account,
                    "credential": {
                        "key": credential
                    }
                }
            },
        }
    }

    run_id = make_new_run_id()

    resolved_run_config = ResolvedRunConfig.build(job, run_config=run_config)
    execution_plan = ExecutionPlan.build(InMemoryPipeline(job),
                                         resolved_run_config)

    assert execution_plan.get_step_by_key("return_one")

    step_keys = ["return_one"]
    instance = DagsterInstance.ephemeral()
    pipeline_run = PipelineRun(pipeline_name=job.name,
                               run_id=run_id,
                               run_config=run_config)

    return_one_step_events = list(
        execute_plan(
            execution_plan.build_subset_plan(step_keys, job,
                                             resolved_run_config),
            pipeline=InMemoryPipeline(job),
            run_config=run_config,
            pipeline_run=pipeline_run,
            instance=instance,
        ))

    assert get_step_output(return_one_step_events, "return_one")
    context = build_input_context(upstream_output=build_output_context(
        step_key="return_one",
        name="result",
        run_id=run_id,
    ))

    io_manager = PickledObjectADLS2IOManager(
        file_system=file_system,
        adls2_client=create_adls2_client(storage_account, credential),
        blob_client=create_blob_client(storage_account, credential),
    )
    assert io_manager.load_input(context) == 1

    add_one_step_events = list(
        execute_plan(
            execution_plan.build_subset_plan(["add_one"], job,
                                             resolved_run_config),
            pipeline=InMemoryPipeline(job),
            pipeline_run=pipeline_run,
            run_config=run_config,
            instance=instance,
        ))

    context = build_input_context(upstream_output=build_output_context(
        step_key="add_one",
        name="result",
        run_id=run_id,
        mapping_key="foo",
    ))

    assert get_step_output(add_one_step_events, "add_one")
    assert io_manager.load_input(context) == 2
Example #16
def test_my_io_manager_handle_output():
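    # counterpart to Example #5: handle_output stores values keyed by (step_key, name)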
    manager = my_io_manager(None)
    context = build_output_context(name="abc", step_key="123")
    manager.handle_output(context, 5)
    assert manager.storage_dict[("123", "abc")] == 5
Example #17
def mock_output_context(table_name):
    @asset(name=table_name)
    def my_asset():
        pass

    return build_output_context(op_def=my_asset.op, name="result")
Example #18
def test_basic_build_output_context():
    context = build_output_context()
    assert isinstance(context, OutputContext)
Example #19
def test_basic_build_output_context():
    context = build_output_context("fake_key", "fake_name")
    assert isinstance(context, OutputContext)
    assert context.step_key == "fake_key"
    assert context.name == "fake_name"