Example #1
def test_yield_resource():
    called = {}

    @solid(required_resource_keys={"a_string"})
    def a_solid(context):
        called["yup"] = True
        assert context.resources.a_string == "foo"

    def _do_resource(init_context):
        yield init_context.resource_config

    yield_string_resource = ResourceDefinition(config_schema=String, resource_fn=_do_resource)

    pipeline_def = PipelineDefinition(
        name="with_a_yield_resource",
        solid_defs=[a_solid],
        mode_defs=[ModeDefinition(resource_defs={"a_string": yield_string_resource})],
    )

    result = execute_pipeline(pipeline_def, {"resources": {"a_string": {"config": "foo"}}})

    assert result.success
    assert called["yup"]
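
A resource_fn that yields, as in _do_resource above, can also run teardown code after the yield. A minimal sketch of that pattern, assuming the same legacy dagster API and a hypothetical connect/close pair:

from dagster import resource


@resource(config_schema=String)
def yielding_connection_resource(init_context):
    conn = connect(init_context.resource_config)  # hypothetical connection factory
    try:
        # the connection is available to solids for the duration of the run
        yield conn
    finally:
        # teardown runs once the run no longer needs the resource
        conn.close()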
Example #2
def test_cache_file_from_s3_basic():
    s3_session = mock.MagicMock()
    with get_temp_dir() as temp_dir:
        solid_result = execute_solid(
            cache_file_from_s3,
            ModeDefinition(
                resource_defs={
                    'file_cache': fs_file_cache,
                    's3': ResourceDefinition.hardcoded_resource(S3Resource(s3_session)),
                }
            ),
            environment_dict={
                'solids': {
                    'cache_file_from_s3': {
                        'inputs': {'s3_coordinate': {'bucket': 'some-bucket', 'key': 'some-key'}}
                    }
                },
                'resources': {'file_cache': {'config': {'target_folder': temp_dir}}},
            },
        )

        # assert the download occurred
        assert s3_session.download_file.call_count == 1

        assert solid_result.success

        expectation_results = solid_result.expectation_results_during_compute
        assert len(expectation_results) == 1
        expectation_result = expectation_results[0]
        assert expectation_result.success
        assert expectation_result.label == 'file_handle_exists'
        path_in_metadata = expectation_result.metadata_entries[0].entry_data.path
        assert isinstance(path_in_metadata, str)
        assert os.path.exists(path_in_metadata)

        assert isinstance(solid_result.output_value(), LocalFileHandle)
        assert 'some-key' in solid_result.output_value().path_desc
Example #3
def define_dagma_resource():
    """Returns a ResourceDefinition appropriate for use of the dagma engine.

    Usage:

        from dagster import PipelineContextDefinition

        PipelineContextDefinition(
            ...,
            resources={
                ...,
                'dagma': define_dagma_resource(),
            },
        )
    """
    def _create_dagma_resource(info):
        sessionmaker = lambda: boto3.Session(  # Otherwise, can't be pickled b/c of ssl.SSLContext
            aws_access_key_id=info.config.get('aws_access_key_id'),
            aws_secret_access_key=info.config.get('aws_secret_access_key'),
            aws_session_token=info.config.get('aws_session_token'),
            region_name=info.config['aws_region_name'],
        )

        storage_config = dict(DEFAULT_STORAGE_CONFIG,
                              sessionmaker=sessionmaker,
                              s3_bucket=info.config['s3_bucket'])

        return DagmaResourceType(
            sessionmaker=sessionmaker,
            aws_region_name=info.config['aws_region_name'],
            storage=Storage(storage_config),
            s3_bucket=info.config['s3_bucket'],
            runtime_bucket=info.config['runtime_bucket'],
        )

    return ResourceDefinition(resource_fn=_create_dagma_resource,
                              config_field=Field(DagmaResourceConfig))
Example #4
def test_cache_file_from_s3_basic():
    s3_session = mock.MagicMock()
    with tempfile.TemporaryDirectory() as temp_dir:
        solid_result = execute_solid(
            cache_file_from_s3,
            ModeDefinition(
                resource_defs={
                    "file_cache": fs_file_cache,
                    "s3": ResourceDefinition.hardcoded_resource(s3_session),
                }
            ),
            run_config={
                "solids": {
                    "cache_file_from_s3": {
                        "inputs": {"s3_coordinate": {"bucket": "some-bucket", "key": "some-key"}}
                    }
                },
                "resources": {"file_cache": {"config": {"target_folder": temp_dir}}},
            },
        )

        # assert the download occurred
        assert s3_session.download_file.call_count == 1

        assert solid_result.success

        expectation_results = solid_result.expectation_results_during_compute
        assert len(expectation_results) == 1
        expectation_result = expectation_results[0]
        assert expectation_result.success
        assert expectation_result.label == "file_handle_exists"
        path_in_metadata = expectation_result.metadata_entries[0].entry_data.path
        assert isinstance(path_in_metadata, str)
        assert os.path.exists(path_in_metadata)

        assert isinstance(solid_result.output_value(), LocalFileHandle)
        assert "some-key" in solid_result.output_value().path_desc
Example #5
def test_string_resource():
    called = {}

    @solid
    def solid_test_string(context):
        assert context.resources.test_string == 'foo'
        called['yup'] = True

    pipeline = PipelineDefinition(
        name='test_string_resource',
        solids=[solid_test_string],
        context_definitions={
            'default': PipelineContextDefinition(
                resources={'test_string': ResourceDefinition.string_resource()}
            )
        },
    )

    result = execute_pipeline(
        pipeline, {'context': {'default': {'resources': {'test_string': {'config': 'foo'}}}}}
    )

    assert result.success
    assert called['yup']
Example #6
def test_required_resource_with_required_subfield():
    pipeline_def = PipelineDefinition(
        name='some_pipeline',
        solid_defs=[],
        mode_defs=[
            ModeDefinition(
                resource_defs={
                    'with_required': ResourceDefinition(
                        resource_fn=lambda _: None, config_schema={'required_field': String},
                    )
                }
            )
        ],
    )

    env_type = create_environment_type(pipeline_def)
    assert env_type.fields['solids'].is_required is False
    assert env_type.fields['execution'].is_required is False
    assert env_type.fields['resources'].is_required
    assert nested_field(env_type, 'resources', 'with_required').is_required
    assert nested_field(env_type, 'resources', 'with_required', 'config').is_required
    assert nested_field(
        env_type, 'resources', 'with_required', 'config', 'required_field'
    ).is_required
Example #7
def test_whole_environment():
    pipeline_def = PipelineDefinition(
        name='some_pipeline',
        mode_defs=[
            ModeDefinition(
                name='test_mode',
                resource_defs={
                    'test_resource': ResourceDefinition(resource_fn=lambda _: None, config=Any)
                },
            )
        ],
        solid_defs=[
            SolidDefinition(
                name='int_config_solid',
                config=Int,
                input_defs=[],
                output_defs=[],
                compute_fn=lambda *args: None,
            ),
            SolidDefinition(
                name='no_config_solid', input_defs=[], output_defs=[], compute_fn=lambda *args: None
            ),
        ],
    )

    env = EnvironmentConfig.build(
        pipeline_def,
        {
            'resources': {'test_resource': {'config': 1}},
            'solids': {'int_config_solid': {'config': 123}},
        },
    )

    assert isinstance(env, EnvironmentConfig)
    assert env.solids == {'int_config_solid': SolidConfig(123), 'no_config_solid': SolidConfig()}
    assert env.resources == {'test_resource': {'config': 1}}
Example #8
def test_depends_on_adls2_resource_file_manager(storage_account, file_system):
    bar_bytes = 'bar'.encode()

    @solid(output_defs=[OutputDefinition(ADLS2FileHandle)])
    def emit_file(context):
        return context.file_manager.write_data(bar_bytes)

    @solid(input_defs=[InputDefinition('file_handle', ADLS2FileHandle)])
    def accept_file(context, file_handle):
        local_path = context.file_manager.copy_handle_to_local_temp(
            file_handle)
        assert isinstance(local_path, str)
        assert open(local_path, 'rb').read() == bar_bytes

    adls2_fake_resource = FakeADLS2Resource(storage_account)

    @pipeline(mode_defs=[
        ModeDefinition(
            system_storage_defs=adls2_plus_default_storage_defs,
            resource_defs={
                'adls2':
                ResourceDefinition.hardcoded_resource(adls2_fake_resource)
            },
        )
    ])
    def adls2_file_manager_test():
        accept_file(emit_file())

    result = execute_pipeline(
        adls2_file_manager_test,
        environment_dict={
            'storage': {
                'adls2': {
                    'config': {
                        'adls2_file_system': file_system
                    }
                }
            }
        },
    )

    assert result.success

    keys_in_bucket = set(
        adls2_fake_resource.adls2_client.file_systems[file_system].keys())

    for step_key, output_name in [
        ('emit_file.compute', 'result'),
        ('accept_file.compute', 'result'),
    ]:
        keys_in_bucket.remove(
            create_adls2_key(result.run_id, step_key, output_name))

    assert len(keys_in_bucket) == 1

    file_key = list(keys_in_bucket)[0]
    comps = file_key.split('/')

    assert '/'.join(comps[:-1]) == 'dagster/storage/{run_id}/files'.format(
        run_id=result.run_id)

    assert uuid.UUID(comps[-1])
Example #9
def define_string_resource():
    return ResourceDefinition(
        config_schema=String, resource_fn=lambda init_context: init_context.resource_config
    )
Example #10
from hca_orchestration.support.typing import HcaScratchDatasetName, MetadataType, MetadataTypeFanoutResult
from hca_orchestration.tests.support.gcs import FakeGCSClient, FakeGoogleBucket, HexBlobInfo

test_bucket = FakeGoogleBucket(
    {"gs://my-fake-bucket/fake-prefix": HexBlobInfo(hex_md5="b2d6ec45472467c836f253bd170182c7", content="test content")}
)

test_bucket_name = "my-fake-bucket"

load_table_test_mode = ModeDefinition(
    "test_load_table",
    resource_defs={**test_mode.resource_defs}
)
load_table_test_mode.resource_defs["gcs"] = ResourceDefinition.hardcoded_resource(
    FakeGCSClient(
        buckets={test_bucket_name: test_bucket}
    )
)

run_config = {
    "resources": {
        "scratch_config": {
            "config": {
                "scratch_bucket_name": test_bucket_name,
                "scratch_prefix_name": "prefix_name",
                "scratch_bq_project": "bq_project",
                "scratch_dataset_prefix": "dataset_prefix",
                "scratch_table_expiration_ms": 86400000
            }
        },
        "target_hca_dataset": {
Example #11
    if inputs is None:
        inputs = []

    return SolidDefinition(
        name=name,
        transform_fn=_create_sql_alchemy_transform_fn(sql_text),
        inputs=inputs,
        outputs=[OutputDefinition()],
    )


InMemSqlLiteEngineResource = ResourceDefinition(
    resource_fn=lambda info: in_mem_engine(info.config['num_table']),
    config_field=Field(
        Dict({
            'num_table':
            Field(String, is_optional=True, default_value='num_table')
        })),
)


def test_resource_format():
    sum_sql_text = '''CREATE TABLE sum_table AS
            SELECT num1, num2, num1 + num2 as sum FROM num_table'''

    sum_sq_sql_text = '''CREATE TABLE sum_sq_table AS
            SELECT num1, num2, sum, sum * sum as sum_sq FROM sum_table'''

    sum_sql_solid = create_sql_statement_solid('sum_sql_solid', sum_sql_text)

    sum_sq_sql_solid = create_sql_statement_solid(
Example #12
def test_airline_demo_load_df():
    db_info_mock = DbInfo(
        engine=mock.MagicMock(),
        url="url",
        jdbc_url="url",
        dialect="dialect",
        load_table=mock.MagicMock(),
        host="host",
        db_name="db_name",
    )

    @solid(
        required_resource_keys={"pyspark"},
        output_defs=[OutputDefinition(io_manager_key="pyspark_io_manager")],
    )
    def emit_mock(context):
        return context.resources.pyspark.spark_session.read.csv(
            file_relative_path(__file__, "../data/test.csv"))

    @pipeline(mode_defs=[
        ModeDefinition(
            resource_defs={
                "db_info": ResourceDefinition.hardcoded_resource(db_info_mock),
                "pyspark": pyspark_resource,
                "pyspark_step_launcher": no_step_launcher,
                "pyspark_io_manager": local_parquet_io_manager,
                "io_manager": fs_io_manager,
            })
    ])
    def load_df_test():
        load_data_to_database_from_spark(emit_mock())

    with tempfile.TemporaryDirectory() as temp_dir:
        solid_result = execute_pipeline(
            load_df_test,
            run_config={
                "solids": {
                    "load_data_to_database_from_spark": {
                        "config": {
                            "table_name": "foo"
                        }
                    }
                },
                "resources": {
                    "io_manager": {
                        "config": {
                            "base_dir": temp_dir
                        }
                    },
                    "pyspark_io_manager": {
                        "config": {
                            "base_dir": temp_dir
                        }
                    },
                },
            },
        ).result_for_solid("load_data_to_database_from_spark")

        assert solid_result.success
        mats = solid_result.materializations_during_compute
        assert len(mats) == 1
        mat = mats[0]
        assert len(mat.metadata_entries) == 2
        entries = {me.label: me for me in mat.metadata_entries}
        assert entries["Host"].entry_data.text == "host"
        assert entries["Db"].entry_data.text == "db_name"
Example #13
def test_cache_file_from_s3_overwrite():
    with get_temp_dir() as temp_dir:
        s3_session_one = mock.MagicMock()
        execute_solid(
            cache_file_from_s3,
            ModeDefinition(
                resource_defs={
                    'file_cache': fs_file_cache,
                    's3': ResourceDefinition.hardcoded_resource(
                        s3_session_one),
                }),
            environment_dict={
                'solids': {
                    'cache_file_from_s3': {
                        'inputs': {
                            's3_coordinate': {
                                'bucket': 'some-bucket',
                                'key': 'some-key'
                            }
                        }
                    }
                },
                'resources': {
                    'file_cache': {
                        'config': {
                            'target_folder': temp_dir,
                            'overwrite': True
                        }
                    }
                },
            },
        )

        # assert the download occurred
        assert s3_session_one.download_file.call_count == 1

        s3_session_two = mock.MagicMock()
        execute_solid(
            cache_file_from_s3,
            ModeDefinition(
                resource_defs={
                    'file_cache': fs_file_cache,
                    's3': ResourceDefinition.hardcoded_resource(
                        s3_session_two),
                }),
            environment_dict={
                'solids': {
                    'cache_file_from_s3': {
                        'inputs': {
                            's3_coordinate': {
                                'bucket': 'some-bucket',
                                'key': 'some-key'
                            }
                        }
                    }
                },
                'resources': {
                    'file_cache': {
                        'config': {
                            'target_folder': temp_dir,
                            'overwrite': True
                        }
                    }
                },
            },
        )

        # assert the download did not occur because file is already there
        assert s3_session_two.download_file.call_count == 0
Example #14
def define_errorable_resource():
    return ResourceDefinition(resource_fn=resource_init, config={'throw_on_resource_init': Bool})
Example #15
def test_depends_on_s3_resource_file_manager():
    bar_bytes = "bar".encode()

    @solid(output_defs=[OutputDefinition(S3FileHandle)])
    def emit_file(context):
        return context.file_manager.write_data(bar_bytes)

    @solid(input_defs=[InputDefinition("file_handle", S3FileHandle)])
    def accept_file(context, file_handle):
        local_path = context.file_manager.copy_handle_to_local_temp(
            file_handle)
        assert isinstance(local_path, str)
        assert open(local_path, "rb").read() == bar_bytes

    # Uses mock S3
    s3 = boto3.client("s3")
    s3.create_bucket(Bucket="some-bucket")

    @pipeline(mode_defs=[
        ModeDefinition(
            system_storage_defs=s3_plus_default_storage_defs,
            resource_defs={"s3": ResourceDefinition.hardcoded_resource(s3)},
        )
    ])
    def s3_file_manager_test():
        accept_file(emit_file())

    result = execute_pipeline(
        s3_file_manager_test,
        run_config={
            "storage": {
                "s3": {
                    "config": {
                        "s3_bucket": "some-bucket"
                    }
                }
            }
        },
    )

    assert result.success

    keys_in_bucket = set([
        obj["Key"] for obj in s3.list_objects(Bucket="some-bucket")["Contents"]
    ])

    for step_key, output_name in [
        ("emit_file.compute", "result"),
        ("accept_file.compute", "result"),
    ]:
        keys_in_bucket.remove(
            create_s3_key(result.run_id, step_key, output_name))

    assert len(keys_in_bucket) == 1

    file_key = list(keys_in_bucket)[0]
    comps = file_key.split("/")

    assert "/".join(comps[:-1]) == "dagster/storage/{run_id}/files".format(
        run_id=result.run_id)

    assert uuid.UUID(comps[-1])
Example #16
    """
    Trains a collaborative filtering model that can recommend HN stories to users based on what
    stories they've commented on in the past.
    """
    comment_stories = build_comment_stories()
    user_story_matrix = build_user_story_matrix(comment_stories)
    recommender_model = build_recommender_model(user_story_matrix)
    model_perf_notebook(recommender_model)
    build_component_top_stories(recommender_model, user_story_matrix)
    build_user_top_recommended_stories(recommender_model, user_story_matrix)


story_recommender_prod_job = story_recommender.to_job(resource_defs={
    **RESOURCES_PROD,
    **{
        "partition_bounds": ResourceDefinition.none_resource()
    },
})

story_recommender_staging_job = story_recommender.to_job(
    resource_defs={
        **RESOURCES_STAGING,
        **{
            "partition_bounds": ResourceDefinition.none_resource()
        },
    })

story_recommender_local_job = story_recommender.to_job(
    resource_defs={
        **RESOURCES_LOCAL,
        **{
Example #17
        "org.apache.hadoop.fs.s3native.NativeS3FileSystem",
        "spark.hadoop.fs.s3.awsAccessKeyId":
        os.getenv("AWS_ACCESS_KEY_ID", ""),
        "spark.hadoop.fs.s3.awsSecretAccessKey":
        os.getenv("AWS_SECRET_ACCESS_KEY", ""),
        "spark.hadoop.fs.s3.buffer.dir":
        "/tmp",
    }
})

snowflake_io_manager_prod = snowflake_io_manager.configured(
    {"database": "DEMO_DB_ASSETS"})

RESOURCES_PROD = {
    "s3_bucket":
    ResourceDefinition.hardcoded_resource("hackernews-elementl-prod"),
    "io_manager": common_bucket_s3_pickle_io_manager,
    "s3": s3_resource,
    "parquet_io_manager": s3_partitioned_parquet_io_manager,
    "warehouse_io_manager": snowflake_io_manager_prod,
    "pyspark": configured_pyspark,
    "warehouse_loader": snowflake_io_manager_prod,
}

snowflake_io_manager_staging = snowflake_io_manager.configured(
    {"database": "DEMO_DB_ASSETS_STAGING"})

RESOURCES_STAGING = {
    "s3_bucket":
    ResourceDefinition.hardcoded_resource("hackernews-elementl-dev"),
    "io_manager": common_bucket_s3_pickle_io_manager,
Example #18

@schedule(job=event_tables, cron_schedule="0 0 * * *")
def event_tables_schedule(_):
    return {}


@graph
def event_reports():
    make_event_reports = make_solid("make_event_reports",
                                    required_resource_keys={"mode"})
    make_event_reports()


@sensor(job=event_reports.to_job(
    resource_defs={"mode": ResourceDefinition.none_resource()}))
def event_reports_sensor():
    pass


event_reports_dev = event_reports.to_job(
    resource_defs={"mode": ResourceDefinition.none_resource()})


@graph
def crm_ingest():
    """A graph with multiple production jobs"""
    ingest_users = make_solid("ingest_users", required_resource_keys={"crm"})
    ingest_interactions = make_solid("ingest_interactions",
                                     required_resource_keys={"crm"})
Example #19
    raw_events = make_raw_events()
    clean_events(raw_events)


@schedule(job=event_tables, cron_schedule="0 0 * * *")
def event_tables_schedule(_):
    return {}


@graph
def event_reports():
    make_event_reports = make_solid("make_event_reports", required_resource_keys={"mode"})
    make_event_reports()


@sensor(job=event_reports.to_job(resource_defs={"mode": ResourceDefinition.none_resource()}))
def event_reports_sensor():
    pass


event_reports_dev = event_reports.to_job(resource_defs={"mode": ResourceDefinition.none_resource()})


@graph
def crm_ingest():
    """A graph with multiple production jobs"""
    ingest_users = make_solid("ingest_users", required_resource_keys={"crm"})
    ingest_interactions = make_solid("ingest_interactions", required_resource_keys={"crm"})

    ingest_users()
    ingest_interactions()
Example #20
def define_tempfile_resource():
    return ResourceDefinition(resource_fn=_tempfile_resource_fn)
Example #21
def test_get_out_of_pipeline_context():
    context = dagstermill.get_context(mode_def=ModeDefinition(
        resource_defs={"list": ResourceDefinition(lambda _: [])}))

    assert context.pipeline_name == "ephemeral_dagstermill_pipeline"
    assert context.resources.list == []
Example #22
def define_lambda_resource(func, *args, **kwargs):
    return ResourceDefinition(lambda _info: func(*args, **kwargs))
Example #23
@solid
def a(_):
    pass


@solid
def b(_):
    raise Exception()


mode_defs = [
    ModeDefinition(
        'dev',
        resource_defs={
            'slack': ResourceDefinition.hardcoded_resource(
                slack_resource_mock, 'do not send messages in dev'
            )
        },
    ),
    ModeDefinition('prod', resource_defs={'slack': slack_resource}),
]


@slack_on_failure
@pipeline(mode_defs=mode_defs)
def notif_all():
    # the hook "slack_on_failure" is applied on every solid instance within this pipeline
    a()
    b()
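
The two modes above let the same pipeline run against either the mocked Slack resource or the real one. A minimal execution sketch, assuming the legacy execute_pipeline API and that prod credentials are configured elsewhere:

# In 'dev' mode the hardcoded mock is used, so no Slack messages are actually sent.
result = execute_pipeline(notif_all, mode='dev', raise_on_error=False)
assert not result.success  # solid `b` raises, which is what triggers the failure hook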

Example #24
def define_value_resource(value):
    return ResourceDefinition(lambda _info: value)
Example #25
def a(_):
    pass


@solid
def b(_):
    raise Exception()


# start_repo_marker_3
mode_defs = [
    ModeDefinition(
        "dev",
        resource_defs={
            "slack": ResourceDefinition.hardcoded_resource(
                slack_resource_mock, "do not send messages in dev"
            )
        },
    ),
    ModeDefinition("prod", resource_defs={"slack": slack_resource}),
]
# end_repo_marker_3

# start_repo_marker_1
@slack_message_on_failure
@pipeline(mode_defs=mode_defs)
def notif_all():
    # the hook "slack_message_on_failure" is applied on every solid instance within this pipeline
    a()
    b()
Example #26
def define_string_resource():
    return ResourceDefinition(resource_fn=lambda info: info.config,
                              config_field=Field(String))
Example #27
def define_string_resource():
    return ResourceDefinition(
        config_field=Field(String),
        resource_fn=lambda init_context: init_context.resource_config)
Example #28
def dummy_resource(config_field):
    return ResourceDefinition(lambda _: None, config_field)
Example #29
def test_depends_on_adls2_resource_intermediates(storage_account, file_system):
    @solid(
        input_defs=[
            InputDefinition('num_one', Int),
            InputDefinition('num_two', Int)
        ],
        output_defs=[OutputDefinition(Int)],
    )
    def add_numbers(_, num_one, num_two):
        return num_one + num_two

    adls2_fake_resource = FakeADLS2Resource(storage_account)

    @pipeline(mode_defs=[
        ModeDefinition(
            system_storage_defs=adls2_plus_default_storage_defs,
            resource_defs={
                'adls2':
                ResourceDefinition.hardcoded_resource(adls2_fake_resource)
            },
        )
    ])
    def adls2_internal_pipeline():
        return add_numbers()

    result = execute_pipeline(
        adls2_internal_pipeline,
        environment_dict={
            'solids': {
                'add_numbers': {
                    'inputs': {
                        'num_one': {
                            'value': 2
                        },
                        'num_two': {
                            'value': 4
                        }
                    }
                }
            },
            'storage': {
                'adls2': {
                    'config': {
                        'adls2_file_system': file_system
                    }
                }
            },
        },
    )

    assert result.success
    assert result.result_for_solid('add_numbers').output_value() == 6

    assert file_system in adls2_fake_resource.adls2_client.file_systems

    keys = set()
    for step_key, output_name in [('add_numbers.compute', 'result')]:
        keys.add(create_adls2_key(result.run_id, step_key, output_name))

    assert set(adls2_fake_resource.adls2_client.file_systems[file_system].keys(
    )) == keys
Example #30

# start_resource_example
class ExternalCerealFetcher:
    def fetch_new_cereals(self, start_ts, end_ts):
        pass


@resource
def cereal_fetcher(init_context):
    return ExternalCerealFetcher()


# end_resource_example

resource_a = ResourceDefinition.hardcoded_resource(1)
resource_b = ResourceDefinition.hardcoded_resource(2)

# start_mode_example
mode_def_ab = ModeDefinition(
    "ab_mode",
    resource_defs={
        "a": resource_a,
        "b": resource_b,
    },
)
# end_mode_example

mode_def_c = ModeDefinition("c_mode", resource_defs={"a": resource_a})
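
The mode definitions above can be attached to a pipeline and selected when the run is launched. A minimal sketch, assuming a solid that only needs resource "a", which both modes provide:

@solid(required_resource_keys={"a"})
def consumes_a(context):
    assert context.resources.a == 1


@pipeline(mode_defs=[mode_def_ab, mode_def_c])
def multi_mode_pipeline():
    consumes_a()


# either mode satisfies the solid's requirement for resource "a"
result = execute_pipeline(multi_mode_pipeline, mode="c_mode")
assert result.success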