Example 1
    yield Output(df, output_name="treated_data")
    yield Output(error, output_name="error")


@discord_message_on_failure
@discord_message_on_success
@redis_keepalive_on_failure
@redis_keepalive_on_succes
@pipeline(
    mode_defs=[
        ModeDefinition(
            "dev",
            resource_defs={
                "basedosdados_config": basedosdados_config,
                "timezone_config": timezone_config,
                "discord_webhook": discord_webhook,
                "keepalive_key": keepalive_key,
            },
        ),
    ],
    tags={
        "pipeline": "br_rj_riodejaneiro_stpl_gps_registros",
        "dagster-k8s/config": {
            "container_config": {
                "resources": {
                    "requests": {
                        "cpu": "250m",
                        "memory": "250Mi"
                    },
                    "limits": {
Example 2
        "weight",
    ]
    quantities = [cereal["cups"] for cereal in cereals]
    reweights = [1.0 / float(quantity) for quantity in quantities]

    normalized_cereals = deepcopy(cereals)
    for idx in range(len(normalized_cereals)):
        cereal = normalized_cereals[idx]
        for column in columns_to_normalize:
            cereal[column] = float(cereal[column]) * reweights[idx]

    context.resources.warehouse.update_normalized_cereals(normalized_cereals)


@pipeline(mode_defs=[
    ModeDefinition(
        resource_defs={"warehouse": local_sqlite_warehouse_resource})
])
def resources_pipeline():
    normalize_calories(read_csv())


if __name__ == "__main__":
    run_config = {
        "solids": {
            "read_csv": {
                "inputs": {
                    "csv_path": {
                        "value": "cereal.csv"
                    }
                }
            }
Example 3
from dagster.core.definitions.executor import default_executors
from dagster.core.definitions.reconstructable import ReconstructablePipeline
from dagster.core.events import DagsterEventType
from dagster.core.test_utils import instance_for_test, nesting_composite_pipeline
from dagster.utils import send_interrupt
from dagster_dask import DataFrame, dask_executor
from dask.distributed import Scheduler, Worker


@solid
def simple(_):
    return 1


@pipeline(mode_defs=[
    ModeDefinition(executor_defs=default_executors + [dask_executor])
])
def dask_engine_pipeline():
    simple()


def test_execute_on_dask_local():
    with tempfile.TemporaryDirectory() as tempdir:
        with instance_for_test(temp_dir=tempdir) as instance:
            result = execute_pipeline(
                reconstructable(dask_engine_pipeline),
                run_config={
                    "intermediate_storage": {
                        "filesystem": {
                            "config": {
                                "base_dir": tempdir
Example 4
    def get_context(self, solid_config=None, mode_def=None, run_config=None):
        """Get a dagstermill execution context for interactive exploration and development.

        Args:
            solid_config (Optional[Any]): If specified, this value will be made available on the
                context as its ``solid_config`` property.
            mode_def (Optional[:class:`dagster.ModeDefinition`]): If specified, defines the mode to
                use to construct the context. Specify this if you would like a context constructed
                with specific ``resource_defs`` or ``logger_defs``. By default, an ephemeral mode
                with a console logger will be constructed.
            run_config (Optional[dict]): The environment config dict with which to construct
                the context.

        Returns:
            :py:class:`~dagstermill.DagstermillExecutionContext`
        """
        check.opt_inst_param(mode_def, "mode_def", ModeDefinition)
        run_config = check.opt_dict_param(run_config, "run_config", key_type=str)

        # If we are running non-interactively, and there is already a context reconstituted, return
        # that context rather than overwriting it.
        if self.context is not None and isinstance(
            self.context, DagstermillRuntimeExecutionContext
        ):
            return self.context

        if not mode_def:
            mode_def = ModeDefinition(logger_defs={"dagstermill": colored_console_logger})
            run_config["loggers"] = {"dagstermill": {}}

        solid_def = SolidDefinition(
            name="this_solid",
            input_defs=[],
            compute_fn=lambda *args, **kwargs: None,
            output_defs=[],
            description="Ephemeral solid constructed by dagstermill.get_context()",
            required_resource_keys=mode_def.resource_key_set,
        )

        pipeline_def = PipelineDefinition(
            [solid_def], mode_defs=[mode_def], name="ephemeral_dagstermill_pipeline"
        )

        run_id = make_new_run_id()

        # construct stubbed PipelineRun for notebook exploration...
        # The actual pipeline run during pipeline execution will be serialized and reconstituted
        # in the `reconstitute_pipeline_context` call
        pipeline_run = PipelineRun(
            pipeline_name=pipeline_def.name,
            run_id=run_id,
            run_config=run_config,
            mode=mode_def.name,
            step_keys_to_execute=None,
            status=PipelineRunStatus.NOT_STARTED,
            tags=None,
        )

        self.in_pipeline = False
        self.solid_def = solid_def
        self.pipeline = pipeline_def

        environment_config = EnvironmentConfig.build(pipeline_def, run_config, mode=mode_def.name)

        pipeline = InMemoryPipeline(pipeline_def)
        execution_plan = ExecutionPlan.build(pipeline, environment_config)

        with scoped_pipeline_context(
            execution_plan,
            pipeline,
            run_config,
            pipeline_run,
            DagsterInstance.ephemeral(),
            scoped_resources_builder_cm=self._setup_resources,
        ) as pipeline_context:

            self.context = DagstermillExecutionContext(
                pipeline_context=pipeline_context,
                pipeline_def=pipeline_def,
                solid_config=solid_config,
                resource_keys_to_init=get_required_resource_keys_to_init(
                    execution_plan,
                    pipeline_def,
                    environment_config,
                    pipeline_context.intermediate_storage_def,
                ),
                solid_name=solid_def.name,
            )

        return self.context
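
The docstring above spells out the parameters accepted by get_context (solid_config, mode_def, run_config). A minimal usage sketch of calling it from a notebook follows; the "list" resource key and the empty run_config dict are illustrative choices, not part of the dagstermill API, and the call mirrors the pattern shown in Example 12 below.

# A hedged usage sketch of dagstermill.get_context(); the resource key "list"
# and the empty run_config dict are illustrative assumptions.
import dagstermill
from dagster import ModeDefinition, ResourceDefinition

context = dagstermill.get_context(
    mode_def=ModeDefinition(
        resource_defs={"list": ResourceDefinition(lambda _: [])}),
    run_config={},
)
assert context.pipeline_def.name == "ephemeral_dagstermill_pipeline"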
Example 5
from click.testing import CliRunner
from dagster_celery import celery_executor
from dagster_celery.cli import main

from dagster import ModeDefinition, default_executors, execute_pipeline, pipeline, seven, solid
from dagster.core.definitions.pointer import FileCodePointer
from dagster.core.definitions.reconstructable import ReconstructablePipeline
from dagster.core.instance import DagsterInstance

BUILDKITE = os.getenv('BUILDKITE')
skip_ci = pytest.mark.skipif(
    bool(BUILDKITE),
    reason='Tests hang forever on buildkite for reasons we don\'t currently understand',
)

celery_mode_defs = [ModeDefinition(executor_defs=default_executors + [celery_executor])]


@contextmanager
def execute_pipeline_on_celery(pipeline_name):
    with seven.TemporaryDirectory() as tempdir:
        pipeline_def = ReconstructablePipeline(FileCodePointer(__file__, pipeline_name))
        instance = DagsterInstance.local_temp(tempdir=tempdir)
        result = execute_pipeline(
            pipeline_def,
            environment_dict={
                'storage': {'filesystem': {'config': {'base_dir': tempdir}}},
                'execution': {'celery': {}},
            },
            instance=instance,
        )
Example 6
def test_nb_solid(name, **kwargs):
    output_defs = kwargs.pop("output_defs",
                             [OutputDefinition(is_required=False)])

    return dagstermill.define_dagstermill_solid(
        name=name,
        notebook_path=nb_test_path(name),
        output_notebook="notebook",
        output_defs=output_defs,
        **kwargs,
    )


default_mode_defs = [
    ModeDefinition(resource_defs={"file_manager": local_file_manager})
]

hello_world = test_nb_solid("hello_world", output_defs=[])


@pipeline(mode_defs=default_mode_defs)
def hello_world_pipeline():
    hello_world()


hello_world_with_custom_tags_and_description = test_nb_solid(
    "hello_world",
    output_defs=[],
    tags={"foo": "bar"},
    description="custom description")
Example 7
    load_data_to_database_from_spark,
    process_q2_data,
    process_sfo_weather_data,
    q2_sfo_outbound_flights,
    sfo_delays_by_destination,
    subsample_spark_dataset,
    tickets_with_destination,
    unzip_file,
    westbound_delays,
)

test_mode = ModeDefinition(
    name='test',
    resources={
        'spark': spark_session_local,
        'db_info': redshift_db_info_resource,
        'tempfile': tempfile_resource,
        's3': s3_resource,
    },
)

local_mode = ModeDefinition(
    name='local',
    resources={
        'spark': spark_session_local,
        's3': s3_resource,
        'db_info': postgres_db_info_resource,
        'tempfile': tempfile_resource,
    },
)
Example 8
CONFIG_FILE = """
solids:
  spark_solid:
    config:
      spark_home: /your/spark_home
      application_jar: "{path}"
      deploy_mode: "client"
      application_arguments: "--local-path /tmp/dagster/events/data --date 2019-01-01"
      master_url: "local[*]"
      spark_conf:
        spark:
          app:
            name: "test_app"
"""

MODE_DEF = ModeDefinition(resource_defs={"spark": spark_resource})


def test_jar_not_found():
    spark_solid = create_spark_solid("spark_solid", main_class="something")
    # guid guaranteed to not exist
    run_config = yaml.safe_load(CONFIG_FILE.format(path=str(uuid.uuid4())))

    result = execute_solid(
        spark_solid, run_config=run_config, raise_on_error=False, mode_def=MODE_DEF
    )
    assert result.failure_data
    assert (
        "does not exist. A valid jar must be built before running this op."
        in result.failure_data.error.cause.message
    )
Example 9
from dagster import ModeDefinition, default_executors, fs_io_manager, pipeline, solid
from dagster_dask import dask_executor


@solid
def hello_world(_):
    return "Hello, World!"


@pipeline(mode_defs=[
    ModeDefinition(
        resource_defs={"io_manager": fs_io_manager},
        executor_defs=default_executors + [dask_executor],
    )
])
def dask_pipeline():
    return hello_world()
Example 10
from dagster_pandas import DataFrame

from test2_queries import COPY_INTO, CREATE_STAGE, CREATE_TABLE, TRANSFORMS, parse_s3_config

# Typically we would get these from some other configuration source that we could create on a per-env basis
access_key_id, secret_key = parse_s3_config('test_creds/boto.cfg')
table_name = 'BMESICK.test_data_ingest_dagster'
stage_name = table_name + '_stg'
bucket_name = 'bmez-astronomer'

# Modes allow you to configure substantial behavior based on environment
# (use local disk instead of S3 for local runs, sqlite instead of Snowflake, etc)
prod_mode = ModeDefinition(
    name='prod',
    resource_defs={
        's3': s3_resource,
        'snowflake': snowflake_resource
    },
    system_storage_defs=s3_plus_default_storage_defs,
)

# Presets are a type of layered configuration in which each file can overwrite values from the others;
# this is where per-environment settings (and, presumably, secrets) go.
preset_defs = PresetDefinition.from_files(
    name='prod',
    mode='prod',
    environment_files=[
        file_relative_path(__file__, 'environments/shared.yaml'),
    ],
)
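
The comment at the top of this example describes modes as the place to swap heavyweight services for local equivalents (local disk instead of S3, sqlite instead of Snowflake). A sketch of such a counterpart mode is shown below, under the assumption that local stand-in resources exist; the two @resource functions here are hypothetical placeholders, not resources from the original project.

from dagster import ModeDefinition, resource


# Hypothetical stand-ins for the local-disk and sqlite clients mentioned in the
# comment above; a real project would supply its own implementations.
@resource
def local_disk_resource(_):
    return None  # placeholder local-disk client


@resource
def sqlite_resource(_):
    return None  # placeholder sqlite connection


local_mode = ModeDefinition(
    name='local',
    resource_defs={
        's3': local_disk_resource,
        'snowflake': sqlite_resource,
    },
)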

Example 11
@resource(config=Field(Int))
def multer_resource(init_context):
    return lambda x: x * init_context.resource_config


@resource(config={'num_one': Field(Int), 'num_two': Field(Int)})
def double_adder_resource(init_context):
    return (lambda x: x + init_context.resource_config['num_one'] +
            init_context.resource_config['num_two'])


@pipeline(
    mode_defs=[
        ModeDefinition(
            name='add_mode',
            resource_defs={'op': adder_resource},
            description='Mode that adds things',
        ),
        ModeDefinition(
            name='mult_mode',
            resource_defs={'op': multer_resource},
            description='Mode that multiplies things',
        ),
        ModeDefinition(
            name='double_adder',
            resource_defs={'op': double_adder_resource},
            description='Mode that adds two numbers to thing',
        ),
    ],
    preset_defs=[PresetDefinition.from_files("add", mode="add_mode")],
)
Example 12
def test_get_out_of_pipeline_context():
    context = dagstermill.get_context(mode_def=ModeDefinition(
        resource_defs={'list': ResourceDefinition(lambda _: [])}))

    assert context.pipeline_def.name == 'ephemeral_dagstermill_pipeline'
    assert context.resources.list == []
Example 13
                    "root": {
                        "config": {
                            "sleep_secs": [-10, 30]
                        }
                    }
                },
            },
        ),
        PresetDefinition(
            "sleep",
            {
                "execution": {
                    "multiprocess": {}
                },
                "solids": {
                    "root": {
                        "config": {
                            "sleep_secs": [0, 10]
                        }
                    }
                },
            },
        ),
    ],
    mode_defs=[ModeDefinition(resource_defs={"io_manager": fs_io_manager})],
)
def branch_pipeline():
    out_1, out_2 = root()
    branch("branch_1", out_1, 3)
    branch("branch_2", out_2, 5)
Example 14
def test_input_manager_with_retries():
    _called = False
    _count = {"total": 0}

    @input_manager
    def should_succeed(_, _resource_config):
        if _count["total"] < 2:
            _count["total"] += 1
            raise RetryRequested(max_retries=3)
        return "foo"

    @input_manager
    def should_retry(_, _resource_config):
        raise RetryRequested(max_retries=3)

    @input_manager
    def should_not_execute(_, _resource_config):
        _called = True

    @pipeline(mode_defs=[
        ModeDefinition(
            resource_defs={
                "should_succeed": should_succeed,
                "should_not_execute": should_not_execute,
                "should_retry": should_retry,
            })
    ])
    def simple():
        @solid
        def source_solid(_):
            return "foo"

        @solid(input_defs=[
            InputDefinition("solid_input", manager_key="should_succeed")
        ])
        def take_input_1(_, solid_input):
            return solid_input

        @solid(input_defs=[
            InputDefinition("solid_input", manager_key="should_retry")
        ])
        def take_input_2(_, solid_input):
            return solid_input

        @solid(input_defs=[
            InputDefinition("solid_input", manager_key="should_not_execute")
        ])
        def take_input_3(_, solid_input):
            return solid_input

        take_input_3(take_input_2(take_input_1(source_solid())))

    with seven.TemporaryDirectory() as tmpdir_path:

        instance = DagsterInstance.from_ref(InstanceRef.from_dir(tmpdir_path))

        result = execute_pipeline(simple,
                                  instance=instance,
                                  raise_on_error=False)

        step_stats = instance.get_run_step_stats(result.run_id)
        assert len(step_stats) == 3

        step_stats_1 = instance.get_run_step_stats(result.run_id,
                                                   step_keys=["take_input_1"])
        assert len(step_stats_1) == 1
        step_stat_1 = step_stats_1[0]
        assert step_stat_1.status.value == "SUCCESS"
        assert step_stat_1.attempts == 3

        step_stats_2 = instance.get_run_step_stats(result.run_id,
                                                   step_keys=["take_input_2"])
        assert len(step_stats_2) == 1
        step_stat_2 = step_stats_2[0]
        assert step_stat_2.status.value == "FAILURE"
        assert step_stat_2.attempts == 4

        step_stats_3 = instance.get_run_step_stats(result.run_id,
                                                   step_keys=["take_input_3"])
        assert len(step_stats_3) == 0
        assert _called == False
Example 15
@resource
def add_two_resource(_):
    def add_two(num):
        return num + 2

    return add_two


@solid(required_resource_keys={"adder"})
def solid_that_uses_adder_resource(context, number):
    return context.resources.adder(number)


@pipeline(mode_defs=[
    ModeDefinition(name="add_one", resource_defs={"adder": add_one_resource}),
    ModeDefinition(name="add_two", resource_defs={"adder": add_two_resource}),
])
def pipeline_with_mode():
    solid_that_uses_adder_resource()


_explode_pid = {"pid": None}


# Will throw if the run worker pid tries to access the definition, but subprocesses (the step
# workers) can access the definition
class ExplodingTestPipeline(ReconstructablePipeline):
    def __new__(
        cls,
        repository,
Example 16
        "bar": int
    },
    input_defs=[InputDefinition("people", DataFrame)],
    output_defs=[OutputDefinition(DataFrame)],
    required_resource_keys={"pyspark_step_launcher"},
)
def filter_df_solid(_, people):
    return people.filter(people["age"] < 30)


MODE_DEFS = [
    ModeDefinition(
        "prod",
        resource_defs={
            "pyspark_step_launcher": emr_pyspark_step_launcher,
            "pyspark": pyspark_resource,
            "s3": s3_resource,
        },
        intermediate_storage_defs=s3_plus_default_intermediate_storage_defs,
    ),
    ModeDefinition(
        "local",
        resource_defs={
            "pyspark_step_launcher": no_step_launcher,
            "pyspark": pyspark_resource
        },
    ),
]


@pipeline(mode_defs=MODE_DEFS)
Example 17
        if context.last_completion_time:
            run_config["since"] = context.last_completion_time
        return RunRequest(run_key=None, run_config=run_config)

    return {"foo_sensor": foo_sensor}


@solid(version="foo")
def my_solid():
    return 5


@pipeline(
    name="memoizable",
    mode_defs=[
        ModeDefinition(
            resource_defs={"io_manager": versioned_filesystem_io_manager})
    ],
    tags={MEMOIZED_RUN_TAG: "true"},
)
def memoizable_pipeline():
    my_solid()


@repository
def bar():
    return {
        "pipelines": {
            "foo": foo_pipeline,
            "baz": baz_pipeline,
            "partitioned_scheduled_pipeline": partitioned_scheduled_pipeline,
            "memoizable": memoizable_pipeline,
Example 18

@resource(config_field=Field(Int, is_optional=True))
def a_resource(context):
    raise Exception("Bad Resource")


resources = {'BadResource': a_resource}


@solid(required_resource_keys={'BadResource'})
def one(_):
    return 1


@pipeline(mode_defs=[ModeDefinition(resource_defs=resources)])
def resource_error_pipeline():
    one()


if __name__ == '__main__':
    result = execute_pipeline(
        resource_error_pipeline,
        environment_dict={
            'storage': {
                'filesystem': {}
            },
            'execution': {
                'in_process': {
                    'config': {
                        'raise_on_error': False
Example 19
def test_io_manager_config_inside_composite():
    stored_dict = {}

    @io_manager(output_config_schema={"output_suffix": str})
    def inner_manager(_):
        class MyHardcodedIOManager(IOManager):
            def handle_output(self, context, obj):
                keys = tuple(context.get_run_scoped_output_identifier() +
                             [context.config["output_suffix"]])
                stored_dict[keys] = obj

            def load_input(self, context):
                keys = tuple(context.upstream_output.
                             get_run_scoped_output_identifier() +
                             [context.upstream_output.config["output_suffix"]])
                return stored_dict[keys]

        return MyHardcodedIOManager()

    @solid(output_defs=[OutputDefinition(io_manager_key="inner_manager")])
    def my_solid(_):
        return "hello"

    @solid
    def my_solid_takes_input(_, x):
        assert x == "hello"
        return x

    @composite_solid
    def my_composite_solid():
        return my_solid_takes_input(my_solid())

    @pipeline(mode_defs=[
        ModeDefinition(name="default",
                       resource_defs={"inner_manager": inner_manager})
    ])
    def my_pipeline():
        my_composite_solid()

    result = execute_pipeline(
        my_pipeline,
        run_config={
            "solids": {
                "my_composite_solid": {
                    "solids": {
                        "my_solid": {
                            "outputs": {
                                "result": {
                                    "output_suffix": "my_suffix"
                                }
                            }
                        }
                    },
                }
            }
        },
    )
    assert result.success
    assert result.output_for_solid("my_composite_solid.my_solid") == "hello"
    assert (stored_dict.get((result.run_id, "my_composite_solid.my_solid",
                             "result", "my_suffix")) == "hello")
Example 20
    return number_df.join(string_df, number_df.id == string_df.id, 'inner').drop(string_df.id)


def test_execute_typed_in_mem_lakehouse():
    lakehouse = TypedPySparkMemLakehouse()
    pipeline_result = execute_spark_lakehouse_build(
        tables=[NumberTable, StringTable, JoinTable], lakehouse=lakehouse
    )

    assert pipeline_result.success
    # Row ordering varies on 3.5 - compare as dicts
    assert (
        lakehouse.collected_tables['JoinTable'][0].asDict()
        == Row(id=1, number=2, string='23').asDict()
    )


# for dagit
typed_lakehouse_pipeline = construct_lakehouse_pipeline(
    name='typed_lakehouse_pipeline',
    lakehouse_tables=[NumberTable, StringTable, JoinTable],
    mode_defs=[
        ModeDefinition(
            resource_defs={
                'lakehouse': typed_pyspark_mem_lakehouse,
                'spark': spark_session_resource,
            }
        )
    ],
)
Example 21
    ]
    quantities = [cereal['cups'] for cereal in cereals]
    reweights = [1.0 / float(quantity) for quantity in quantities]

    normalized_cereals = deepcopy(cereals)
    for idx in range(len(normalized_cereals)):
        cereal = normalized_cereals[idx]
        for column in columns_to_normalize:
            cereal[column] = float(cereal[column]) * reweights[idx]

    context.resources.warehouse.update_normalized_cereals(normalized_cereals)


@pipeline(mode_defs=[
    ModeDefinition(
        name='unittest',
        resource_defs={'warehouse': local_sqlite_warehouse_resource},
    ),
    ModeDefinition(
        name='dev',
        resource_defs={'warehouse': sqlachemy_postgres_warehouse_resource},
    ),
])
def modes_pipeline():
    normalize_calories(read_csv())


if __name__ == '__main__':
    environment_dict = {
        'solids': {
            'read_csv': {
                'inputs': {
Example 22
# pylint: disable=no-value-for-parameter

from dagster import execute_pipeline, ModeDefinition, solid, pipeline

from dagster_slack import slack_resource


@solid(required_resource_keys={'slack'})
def post_hello_message(context):
    context.resources.slack.chat.post_message(
        channel='#dagster', text='"Hello, World" from Dagster!')


@pipeline(mode_defs=[ModeDefinition(resource_defs={'slack': slack_resource})])
def resources_pipeline():
    post_hello_message()


if __name__ == '__main__':
    execute_pipeline(
        resources_pipeline,
        environment_dict={
            'resources': {
                'slack': {
                    'config': {
                        'token': 'xoxp-1234123412341234-12341234-1234'
                    }
                }
            }
        },
    )
Example 23
    output_defs = kwargs.pop("output_defs",
                             [OutputDefinition(is_required=False)])

    return dagstermill.define_dagstermill_op(
        name=name,
        notebook_path=path,
        output_notebook_name="notebook",
        output_defs=output_defs,
        **kwargs,
    )


default_mode_defs = [
    ModeDefinition(
        resource_defs={
            "output_notebook_io_manager": local_output_notebook_io_manager,
            "io_manager": fs_io_manager,
        })
]

hello_world = test_nb_solid("hello_world", output_defs=[])


@pipeline(mode_defs=default_mode_defs)
def hello_world_pipeline():
    hello_world()


hello_world_op = test_nb_op(
    "hello_world_op",
    nb_test_path("hello_world"),
Example 24
    load_data_to_database_from_spark,
    process_sfo_weather_data,
    q2_sfo_outbound_flights,
    s3_to_df,
    s3_to_dw_table,
    sfo_delays_by_destination,
    tickets_with_destination,
    westbound_delays,
)

test_mode = ModeDefinition(
    name='test',
    resource_defs={
        'spark': pyspark_resource,
        'db_info': redshift_db_info_resource,
        'tempfile': tempfile_resource,
        's3': s3_resource,
        'file_cache': fs_file_cache,
    },
    system_storage_defs=s3_plus_default_storage_defs,
)


local_mode = ModeDefinition(
    name='local',
    resource_defs={
        'spark': pyspark_resource,
        's3': s3_resource,
        'db_info': postgres_db_info_resource,
        'tempfile': tempfile_resource,
        'file_cache': fs_file_cache,
Example 25
    if context.solid_config["return_wrong_type"]:
        return string + string

    return int(string)


@pipeline(
    description=
    ("Demo pipeline that enables configurable types of errors thrown during pipeline execution, "
     "including solid execution errors, type errors, and resource initialization errors."
     ),
    mode_defs=[
        ModeDefinition(
            name="errorable_mode",
            resource_defs={
                "errorable_resource": define_errorable_resource(),
                "io_manager": errorable_io_manager,
            },
        ),
    ],
    preset_defs=[
        PresetDefinition.from_pkg_resources(
            "passing",
            pkg_resource_defs=[("dagster_test.toys.environments", "error.yaml")
                               ],
            mode="errorable_mode",
        )
    ],
    tags={"monster": "error"},
)
def error_monster():
Example 26
@lambda_solid(input_defs=[InputDefinition('word')])
def count_letters(word):
    counts = defaultdict(int)
    for letter in word:
        counts[letter] += 1
    return dict(counts)


@lambda_solid()
def error_solid():
    raise Exception('Unusual error')


@pipeline(mode_defs=[
    ModeDefinition(system_storage_defs=s3_plus_default_storage_defs,
                   resource_defs={'s3': s3_resource})
])
def demo_pipeline():
    count_letters(multiply_the_word())


@pipeline(mode_defs=[
    ModeDefinition(
        system_storage_defs=gcs_plus_default_storage_defs,
        resource_defs={'gcs': gcs_resource},
    )
])
def demo_pipeline_gcs():
    count_letters(multiply_the_word())

Example 27
    process_sfo_weather_data,
    q2_sfo_outbound_flights,
    s3_to_df,
    s3_to_dw_table,
    sfo_delays_by_destination,
    tickets_with_destination,
    westbound_delays,
)

test_mode = ModeDefinition(
    name="test",
    resource_defs={
        "pyspark_step_launcher": no_step_launcher,
        "pyspark": pyspark_resource,
        "db_info": redshift_db_info_resource,
        "tempfile": tempfile_resource,
        "s3": s3_resource,
        "file_cache": fs_file_cache,
        "file_manager": local_file_manager,
    },
    intermediate_storage_defs=s3_plus_default_intermediate_storage_defs,
)

local_mode = ModeDefinition(
    name="local",
    resource_defs={
        "pyspark_step_launcher": no_step_launcher,
        "pyspark": pyspark_resource,
        "s3": s3_resource,
        "db_info": postgres_db_info_resource,
        "tempfile": tempfile_resource,
Example 28
            "inner_shape_string": String
        }),
        "permissive_complex_shape":
        Permissive(fields={
            "inner_shape_array": Array(str),
            "inner_shape_string": String
        }),
        "noneable_complex_shape":
        Noneable(
            Shape(
                fields={
                    "inner_noneable_shape_array": Array(str),
                    "inner_noneable_shape_string": String,
                })),
    },
)
def test_solid(_):
    return 1


@pipeline(
    mode_defs=[ModeDefinition(resource_defs={"my_resource": my_resource})])
def test_pipeline():
    test_solid()


@repository
def experimental_repository():
    return [test_pipeline, metrics_pipeline, rollup_pipeline
            ] + define_schedules()
Example 29
def test_output_manager_with_failure():
    _called_input_manager = False
    _called_solid = False

    @output_manager
    def should_fail(_, _resource_config, _obj):
        raise Failure(
            description="Foolure",
            metadata_entries=[
                EventMetadataEntry.text(label="label",
                                        text="text",
                                        description="description")
            ],
        )

    @input_manager
    def should_not_enter(_):
        _called_input_manager = True

    @solid(output_defs=[OutputDefinition(manager_key="should_fail")])
    def emit_str(_):
        return "emit"

    @solid(input_defs=[
        InputDefinition(name="_input_str",
                        dagster_type=str,
                        manager_key="should_not_enter")
    ])
    def should_not_call(_, _input_str):
        _called_solid = True

    @pipeline(mode_defs=[
        ModeDefinition(resource_defs={
            "should_fail": should_fail,
            "should_not_enter": should_not_enter
        })
    ])
    def simple():
        should_not_call(emit_str())

    with seven.TemporaryDirectory() as tmpdir_path:

        instance = DagsterInstance.from_ref(InstanceRef.from_dir(tmpdir_path))

        result = execute_pipeline(simple,
                                  instance=instance,
                                  raise_on_error=False)

        assert not result.success

        failure_data = result.result_for_solid("emit_str").failure_data

        assert failure_data.error.cls_name == "Failure"

        assert failure_data.user_failure_data.description == "Foolure"
        assert failure_data.user_failure_data.metadata_entries[
            0].label == "label"
        assert failure_data.user_failure_data.metadata_entries[
            0].entry_data.text == "text"
        assert failure_data.user_failure_data.metadata_entries[
            0].description == "description"

        assert not _called_input_manager and not _called_solid
Example 30
def test_depends_on_adls2_resource_file_manager(storage_account, file_system):
    bar_bytes = b"bar"

    @solid(output_defs=[OutputDefinition(ADLS2FileHandle)],
           required_resource_keys={"file_manager"})
    def emit_file(context):
        return context.resources.file_manager.write_data(bar_bytes)

    @solid(
        input_defs=[InputDefinition("file_handle", ADLS2FileHandle)],
        required_resource_keys={"file_manager"},
    )
    def accept_file(context, file_handle):
        local_path = context.resources.file_manager.copy_handle_to_local_temp(
            file_handle)
        assert isinstance(local_path, str)
        assert open(local_path, "rb").read() == bar_bytes

    adls2_fake_resource = FakeADLS2Resource(storage_account)
    adls2_fake_file_manager = ADLS2FileManager(
        adls2_client=adls2_fake_resource.adls2_client,
        file_system=file_system,
        prefix="some-prefix",
    )

    @pipeline(mode_defs=[
        ModeDefinition(resource_defs={
            "adls2":
            ResourceDefinition.hardcoded_resource(adls2_fake_resource),
            "file_manager":
            ResourceDefinition.hardcoded_resource(adls2_fake_file_manager),
        }, )
    ])
    def adls2_file_manager_test():
        accept_file(emit_file())

    result = execute_pipeline(
        adls2_file_manager_test,
        run_config={
            "resources": {
                "file_manager": {
                    "config": {
                        "adls2_file_system": file_system
                    }
                }
            }
        },
    )

    assert result.success

    keys_in_bucket = set(
        adls2_fake_resource.adls2_client.file_systems[file_system].keys())

    assert len(keys_in_bucket) == 1

    file_key = list(keys_in_bucket)[0]
    comps = file_key.split("/")

    assert "/".join(comps[:-1]) == "some-prefix"

    assert uuid.UUID(comps[-1])