Beispiel #1
0
def define_repo():
    """Build the demo repository.

    The pipeline is registered as a callable so it is constructed lazily,
    on first request, rather than at import time.
    """
    pipelines = {'repo_demo_pipeline': lambda: repo_demo_pipeline}
    return RepositoryDefinition(name='demo_repository', pipeline_dict=pipelines)
Beispiel #2
0
def bar_repo():
    """Build the 'bar' repository with its pipelines, schedules and partitions."""
    pipelines = {
        'foo': define_foo_pipeline,
        'baz': lambda: baz_pipeline,
    }
    return RepositoryDefinition(
        name='bar',
        pipeline_dict=pipelines,
        schedule_defs=define_bar_schedules(),
        partition_set_defs=define_baz_partitions(),
    )
Beispiel #3
0
def define_pandas_repository():
    """Repository exposing the dagstermill pandas demo pipelines."""
    pipelines = [papermill_pandas_hello_world_pipeline, pandas_hello_world]
    return RepositoryDefinition(
        name='test_dagstermill_pandas_solids',
        pipeline_defs=pipelines,
    )
    def test_update_tick_to_success(self, storage):
        """A started tick updated to SUCCESS round-trips through storage
        with its run id attached and no error recorded."""
        assert storage

        repository = RepositoryDefinition("repository_name")
        current_time = time.time()
        tick = storage.create_schedule_tick(repository,
                                            self.build_tick(current_time))

        updated_tick = tick.with_status(ScheduleTickStatus.SUCCESS,
                                        run_id="1234")
        assert updated_tick.status == ScheduleTickStatus.SUCCESS

        storage.update_schedule_tick(repository, updated_tick)

        ticks = storage.get_schedule_ticks_by_schedule(repository,
                                                       "my_schedule")
        assert len(ticks) == 1
        tick = ticks[0]
        assert tick.tick_id == 1
        assert tick.schedule_name == "my_schedule"
        assert tick.cron_schedule == "* * * * *"
        assert tick.timestamp == current_time
        assert tick.status == ScheduleTickStatus.SUCCESS
        assert tick.run_id == "1234"
        # PEP 8: identity check `is None` instead of `== None`.
        assert tick.error is None
def define_part_nine_repo():
    """Repository for the part-nine tutorial pipelines."""
    pipelines = {
        'part_nine_step_one': define_part_nine_step_one,
        'part_nine_final': define_part_nine_final,
    }
    return RepositoryDefinition(name='part_nine_repo', pipeline_dict=pipelines)
Beispiel #6
0
def define_repository():
    """Test repository bundling the fixture pipelines and an hourly schedule."""
    pipelines = [
        composites_pipeline,
        csv_hello_world,
        csv_hello_world_df_input,
        csv_hello_world_two,
        csv_hello_world_with_expectations,
        materialization_pipeline,
        more_complicated_config,
        more_complicated_nested_config,
        multi_mode_with_loggers,
        multi_mode_with_resources,
        naughty_programmer_pipeline,
        no_config_pipeline,
        pipeline_with_enum_config,
        pipeline_with_expectations,
        pipeline_with_list,
        pipeline_with_step_metadata,
        required_resource_pipeline,
        scalar_output_pipeline,
        secret_pipeline,
    ]
    return RepositoryDefinition(
        name='test',
        pipeline_defs=pipelines,
        experimental={'schedule_defs': [no_config_pipeline_hourly_schedule]},
    )
Beispiel #7
0
def define_repository():
    """Test repository: most pipelines are registered lazily via factory
    functions in pipeline_dict; one is passed as a definition directly."""
    factories = {
        'more_complicated_config': define_more_complicated_config,
        'more_complicated_nested_config': define_more_complicated_nested_config,
        'csv_hello_world': define_csv_hello_world,
        'csv_hello_world_two': define_pipeline_two,
        'csv_hello_world_with_expectations': define_csv_hello_world_with_expectations,
        'pipeline_with_list': define_pipeline_with_list,
        'csv_hello_world_df_input': define_pipeline_with_csv_df_input,
        'no_config_pipeline': define_no_config_pipeline,
        'scalar_output_pipeline': define_scalar_output_pipeline,
        'pipeline_with_enum_config': define_pipeline_with_enum_config,
        'naughty_programmer_pipeline': define_naughty_programmer_pipeline,
        'secret_pipeline': define_pipeline_with_secret,
        'pipeline_with_step_metadata': define_pipeline_with_step_metadata,
        'pipeline_with_expectations': define_pipeline_with_expectation,
        'multi_mode_with_resources': define_multi_mode_with_resources_pipeline,
        'multi_mode_with_loggers': define_multi_mode_with_loggers_pipeline,
        'composites_pipeline': define_composites_pipeline,
    }
    return RepositoryDefinition(
        name='test',
        pipeline_dict=factories,
        pipeline_defs=[materialization_pipeline],
    )
Beispiel #8
0
def define_repo():
    """Repository with the two pandas hello-world pipeline factories."""
    pipelines = {
        'pandas_hello_world': define_pipeline_one,
        'pandas_hello_world_two': define_pipeline_two,
    }
    return RepositoryDefinition(name='test', pipeline_dict=pipelines)
Beispiel #9
0
def define_repo():
    """Build the demo repository.

    The factory function itself (not its result) is registered, so the
    pipeline is constructed on demand.
    """
    pipelines = {'repo_demo_pipeline': define_repo_demo_pipeline}
    return RepositoryDefinition(name='demo_repository', pipeline_dict=pipelines)
Beispiel #10
0
    def load(self):
        """Load (or reload) the target object from the configured module.

        Returns the loaded object. When ``self.coerce_to_repo`` is set, a
        bare ``PipelineDefinition`` is wrapped in a single-pipeline
        ``RepositoryDefinition``; any other return type raises
        ``InvalidPipelineLoadingComboError``.
        """
        # Reload only on the second and subsequent calls; the first call
        # uses the module as originally imported.
        if self.loaded:
            reloader.reload(self.module)
        self.loaded = True

        # The entry point must be a callable attribute of the module; calling
        # it produces the object we hand back (or coerce below).
        fn = getattr(self.module, self.fn_name)
        check.is_callable(fn)
        obj = fn()

        # Eventually this class will be generic and not coupled to
        # Pipeline / Repository types. Tracking this issue here:
        # https://github.com/dagster-io/dagster/issues/246
        if self.coerce_to_repo:
            if isinstance(obj, RepositoryDefinition):
                self.object = obj
            elif isinstance(obj, PipelineDefinition):
                # Wrap the lone pipeline in an ephemeral repository so callers
                # can always treat the result as a repository.
                self.object = RepositoryDefinition(
                    name=EMPHERMAL_NAME, pipeline_dict={obj.name: lambda: obj})
            else:
                raise InvalidPipelineLoadingComboError(
                    'entry point must return a repository or pipeline')
        else:
            self.object = obj

        return self.object
Beispiel #11
0
def define_demo_repo():
    """Assemble the demo repository from the example pipelines."""
    # Lazy import here to prevent deps issues

    from dagster import RepositoryDefinition
    from dagster_examples.toys.error_monster import error_monster
    from dagster_examples.toys.sleepy import sleepy_pipeline
    from dagster_examples.toys.log_spew import log_spew
    from dagster_examples.toys.many_events import many_events
    from dagster_examples.toys.composition import composition
    from dagster_examples.toys.pandas_hello_world import pandas_hello_world_pipeline
    from dagster_examples.airline_demo.pipelines import (
        airline_demo_ingest_pipeline,
        airline_demo_warehouse_pipeline,
    )
    from dagster_examples.event_pipeline_demo.pipelines import event_ingest_pipeline
    from dagster_examples.pyspark_pagerank.pyspark_pagerank_pipeline import pyspark_pagerank
    from dagster_pandas.examples import papermill_pandas_hello_world_pipeline

    demo_pipelines = [
        pandas_hello_world_pipeline,
        sleepy_pipeline,
        error_monster,
        log_spew,
        many_events,
        composition,
        airline_demo_ingest_pipeline,
        airline_demo_warehouse_pipeline,
        event_ingest_pipeline,
        pyspark_pagerank,
        papermill_pandas_hello_world_pipeline,
    ]
    return RepositoryDefinition(name='demo_repository', pipeline_defs=demo_pipelines)
Beispiel #12
0
    def test_update_tick_to_skip(self, storage):
        """A started tick updated to SKIPPED round-trips through storage
        with no run id and no error."""
        assert storage

        repository = RepositoryDefinition(
            name="repository_name",
            repository_data=RepositoryData.from_list([]))
        current_time = time.time()
        tick = storage.create_schedule_tick(repository.name,
                                            self.build_tick(current_time))

        updated_tick = tick.with_status(ScheduleTickStatus.SKIPPED)
        assert updated_tick.status == ScheduleTickStatus.SKIPPED

        storage.update_schedule_tick(repository.name, updated_tick)

        ticks = storage.get_schedule_ticks_by_schedule(repository.name,
                                                       "my_schedule")
        assert len(ticks) == 1
        tick = ticks[0]
        assert tick.tick_id == 1
        assert tick.schedule_name == "my_schedule"
        assert tick.cron_schedule == "* * * * *"
        assert tick.timestamp == current_time
        assert tick.status == ScheduleTickStatus.SKIPPED
        # PEP 8: identity checks `is None` instead of `== None`.
        assert tick.run_id is None
        assert tick.error is None
Beispiel #13
0
def define_repo():
    """Repository used by the scheduler demo."""
    pipelines = [hello_world_pipeline, goodbye_world_pipeline, long_running_pipeline]
    return RepositoryDefinition(
        name='scheduler_demo_repository',
        pipeline_defs=pipelines,
    )
Beispiel #14
0
def define_repo():
    """Repository with the pandas hello-world pipelines."""
    pipelines = [pandas_hello_world, papermill_pandas_hello_world_pipeline]
    return RepositoryDefinition(
        name='pandas_hello_world_repo',
        pipeline_defs=pipelines,
    )
Beispiel #15
0
def create_repository():
    """Build a test repository containing one trivial pipeline, a daily
    schedule for it, and the test cron scheduler (via experimental args)."""

    @pipeline
    def no_config_pipeline():
        @lambda_solid
        def return_hello():
            return 'Hello'

        return return_hello()

    no_config_pipeline_hourly_schedule = ScheduleDefinition(
        name="no_config_pipeline_hourly_schedule",
        cron_schedule="0 0 * * *",
        execution_params={
            "environmentConfigData": {"storage": {"filesystem": None}},
            "selector": {"name": "no_config_pipeline", "solidSubset": None},
            "mode": "default",
        },
    )

    return RepositoryDefinition(
        name='test',
        pipeline_defs=[no_config_pipeline],
        experimental={
            'schedule_defs': [no_config_pipeline_hourly_schedule],
            'scheduler': TestSystemCronScheduler,
        },
    )
Beispiel #16
0
def define_repo():
    """Repository for the partitioning tutorial, with schedules and partitions."""
    return RepositoryDefinition(
        name='partitioning-tutorial',
        pipeline_defs=[compute_total_stock_volume],
        schedule_defs=define_schedules(),
        partition_set_defs=define_partitions(),
    )
Beispiel #17
0
def test_dupe_solid_repo_definition():
    """Two pipelines defining a solid with the same name must fail with a
    DagsterInvalidDefinitionError whose message names both pipelines."""
    # Two distinct solids deliberately sharing the name 'same'.
    @lambda_solid(name='same')
    def noop():
        pass

    @lambda_solid(name='same')
    def noop2():
        pass

    repo = RepositoryDefinition(
        'error_repo',
        pipeline_dict={
            'first': lambda: PipelineDefinition(name='first', solids=[noop]),
            'second':
            lambda: PipelineDefinition(name='second', solids=[noop2]),
        },
    )

    # The clash is only detected when all pipelines are materialized.
    with pytest.raises(DagsterInvalidDefinitionError) as exc_info:
        repo.get_all_pipelines()

    assert str(exc_info.value) == (
        'You have defined two solids named "same" in repository "error_repo". '
        'Solid names must be unique within a repository. The solid has been defined '
        'in pipeline "first" and it has been defined again in pipeline "second."'
    )
Beispiel #18
0
def define_repository():
    """Test repository bundling the fixture pipelines."""
    pipelines = [
        composites_pipeline,
        csv_hello_world,
        csv_hello_world_df_input,
        csv_hello_world_two,
        csv_hello_world_with_expectations,
        eventually_successful,
        infinite_loop_pipeline,
        materialization_pipeline,
        more_complicated_config,
        more_complicated_nested_config,
        multi_mode_with_loggers,
        multi_mode_with_resources,
        naughty_programmer_pipeline,
        no_config_pipeline,
        pipeline_with_enum_config,
        pipeline_with_expectations,
        pipeline_with_list,
        pipeline_with_step_metadata,
        required_resource_pipeline,
        scalar_output_pipeline,
        spew_pipeline,
        noop_pipeline,
    ]
    return RepositoryDefinition(name='test', pipeline_defs=pipelines)
Beispiel #19
0
    def make_python_callable(cls, dag_id, pipeline, env_config, step_keys):
        """Build the python_callable an Airflow PythonOperator will invoke.

        Wraps *pipeline* in an ephemeral repository keyed by *dag_id* and
        returns a closure that executes the GraphQL query template against
        it, using the Airflow dag_run's run_id.
        """
        # Imported lazily so this module stays importable in Airflow
        # environments where dagster is not installed.
        try:
            from dagster import RepositoryDefinition
            from dagster.cli.dynamic_loader import RepositoryContainer
            from dagster_graphql.cli import execute_query_from_cli
        except ImportError:
            raise AirflowException(
                'To use the DagsterPythonOperator, dagster and dagster_graphql must be installed '
                'in your Airflow environment.')
        # Single-pipeline ephemeral repository; the lambda defers construction.
        repository = RepositoryDefinition('<<ephemeral repository>>',
                                          {dag_id: lambda: pipeline})
        repository_container = RepositoryContainer(repository=repository)

        def python_callable(**kwargs):
            # Airflow passes context via **kwargs; the dag_run carries run_id.
            run_id = kwargs.get('dag_run').run_id
            query = QUERY_TEMPLATE.format(
                config=env_config,
                run_id=run_id,
                step_keys=json.dumps(step_keys),
                pipeline_name=pipeline.name,
            )
            res = json.loads(
                execute_query_from_cli(repository_container,
                                       query,
                                       variables=None))
            cls.handle_errors(res, None)
            return cls.handle_result(res)

        return python_callable
Beispiel #20
0
    def test_get_schedule_stats(self, storage):
        """Tick stats aggregate counts per status: 2 started, 3 succeeded,
        4 skipped, 5 failed."""
        assert storage

        repository = RepositoryDefinition("repository_name")
        current_time = time.time()

        error = SerializableErrorInfo(message="Error", stack=[], cls_name="TestError")

        # Create ticks; loop variables are `_` where the index is unused.
        for _ in range(2):
            storage.create_schedule_tick(repository.name, self.build_tick(current_time))

        for x in range(3):
            storage.create_schedule_tick(
                repository.name,
                self.build_tick(current_time, ScheduleTickStatus.SUCCESS, run_id=str(x)),
            )

        for _ in range(4):
            storage.create_schedule_tick(
                repository.name, self.build_tick(current_time, ScheduleTickStatus.SKIPPED),
            )

        for _ in range(5):
            storage.create_schedule_tick(
                repository.name,
                self.build_tick(current_time, ScheduleTickStatus.FAILURE, error=error),
            )

        stats = storage.get_schedule_tick_stats_by_schedule(repository.name, "my_schedule")
        assert stats.ticks_started == 2
        assert stats.ticks_succeeded == 3
        assert stats.ticks_skipped == 4
        assert stats.ticks_failed == 5
Beispiel #21
0
    def test_update_tick_to_failure(self, storage):
        """A started tick updated to FAILURE round-trips through storage
        carrying the serialized error and no run id."""
        assert storage

        repository = RepositoryDefinition("repository_name")
        current_time = time.time()
        tick = storage.create_schedule_tick(repository.name, self.build_tick(current_time))

        updated_tick = tick.with_status(
            ScheduleTickStatus.FAILURE,
            error=SerializableErrorInfo(message="Error", stack=[], cls_name="TestError"),
        )
        assert updated_tick.status == ScheduleTickStatus.FAILURE

        storage.update_schedule_tick(repository.name, updated_tick)

        ticks = storage.get_schedule_ticks_by_schedule(repository.name, "my_schedule")
        assert len(ticks) == 1
        tick = ticks[0]
        assert tick.tick_id == 1
        assert tick.schedule_name == "my_schedule"
        assert tick.cron_schedule == "* * * * *"
        assert tick.timestamp == current_time
        assert tick.status == ScheduleTickStatus.FAILURE
        # PEP 8: identity check `is None` instead of `== None`.
        assert tick.run_id is None
        assert tick.error == SerializableErrorInfo(message="Error", stack=[], cls_name="TestError")
Beispiel #22
0
def define_repo():
    """Repository mixing a lazily-built pipeline with a direct definition.

    Registering a callable in ``pipeline_dict`` defers construction until the
    pipeline is first requested — useful if building it is expensive.
    """
    return RepositoryDefinition(
        name='hello_cereal_repository',
        pipeline_dict={'hello_cereal_pipeline': lambda: hello_cereal_pipeline},
        pipeline_defs=[complex_pipeline],
    )
Beispiel #23
0
def define_repo():
    """Repository for the bay-bikes demo extraction pipelines."""
    pipelines = {
        'extract_monthly_bay_bike_pipeline': lambda: extract_monthly_bay_bike_pipeline,
        'extract_daily_weather_data_pipeline': lambda: extract_daily_weather_data_pipeline,
    }
    return RepositoryDefinition(name='bay_bikes_demo', pipeline_dict=pipelines)
Beispiel #24
0
def define_repo():
    """Repository for the bay-bikes demo ETL pipeline (built lazily)."""
    pipelines = {'monthly_bay_bike_etl_pipeline': lambda: monthly_bay_bike_etl_pipeline}
    return RepositoryDefinition(name='bay_bikes_demo', pipeline_dict=pipelines)
Beispiel #25
0
def define_bar_repo():
    """Build the 'bar' repository: one factory, one lazily-wrapped pipeline."""
    pipelines = {
        'foo': define_foo_pipeline,
        'baz': lambda: baz_pipeline,
    }
    return RepositoryDefinition('bar', pipelines)
Beispiel #26
0
    def test_get_schedule_by_name_not_found(self, storage):
        """Looking up a schedule name that was never added returns None."""
        assert storage

        repository = RepositoryDefinition("repository_name")
        storage.add_schedule(repository.name, self.build_schedule("my_schedule", "* * * * *"))

        assert storage.get_schedule_by_name(repository.name, "fake_schedule") is None
Beispiel #27
0
    def test_get_schedule_by_name(self, storage):
        """A schedule that was added can be fetched back by its name."""
        assert storage

        repository = RepositoryDefinition("repository_name")
        added = self.build_schedule("my_schedule", "* * * * *")
        storage.add_schedule(repository.name, added)

        fetched = storage.get_schedule_by_name(repository.name, "my_schedule")
        assert fetched.name == "my_schedule"
Beispiel #28
0
    def test_delete_schedule_not_found(self, storage):
        """Deleting a schedule that was never stored raises an invariant error."""
        assert storage

        repository = RepositoryDefinition("repository_name")
        unknown_schedule = self.build_schedule("my_schedule", "* * * * *")

        with pytest.raises(DagsterInvariantViolationError):
            storage.delete_schedule(repository.name, unknown_schedule)
Beispiel #29
0
def define_demo_execution_repo():
    """Repository for the demo execution pipelines (normal and error)."""
    pipelines = {
        'demo_pipeline': define_demo_execution_pipeline,
        'demo_error_pipeline': define_demo_error_pipeline,
    }
    return RepositoryDefinition(name='demo_execution_repo', pipeline_dict=pipelines)
Beispiel #30
0
def define_repo():
    """Repository exposing the simple pyspark SFO weather pipeline (lazily)."""
    pipelines = {
        'simple_pyspark_sfo_weather_pipeline': define_simple_pyspark_sfo_weather_pipeline,
    }
    return RepositoryDefinition(name='simple_pyspark_repo', pipeline_dict=pipelines)