Example #1
class DagsterAutoRestartTrick(AutoRestartTrick):  # base class assumed: watchdog's AutoRestartTrick
    def __init__(self, *args, **kwargs):
        super(DagsterAutoRestartTrick, self).__init__(*args, **kwargs)
        self.restarting = False

    def on_any_event(self, event):
        self.restarting = True
        super(DagsterAutoRestartTrick, self).on_any_event(event)
        self.restarting = False


def handle_sigterm(_signum, _frame):
    raise KeyboardInterrupt()


# Note: This must be declared outside of `main` or it is cleaned up by
# some weakref magic when the watchmedo restarts Dagit.
host_tempdir = seven.TemporaryDirectory()
watch_tempdir = seven.TemporaryDirectory()


def main():
    # Build the dagit-cli command
    watch_for_reload = True
    fallback_set = False
    command = ['dagit-cli']
    for arg in sys.argv[1:]:
        if arg == '--help':
            watch_for_reload = False
            command.append(arg)
        elif arg == '--version':
            watch_for_reload = False
            command.append(arg)
Example #2
 def temp_storage():
     if DagsterInstance._PROCESS_TEMPDIR is None:
         DagsterInstance._PROCESS_TEMPDIR = seven.TemporaryDirectory()
     return DagsterInstance._PROCESS_TEMPDIR.name
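All of these examples use `seven.TemporaryDirectory()` from Dagster's `seven` compatibility module; on Python 3 it behaves like the standard library's `tempfile.TemporaryDirectory`. A minimal standalone sketch of the same context-manager pattern, using only the standard library:

import os
import tempfile

with tempfile.TemporaryDirectory() as temp_dir:
    # The directory exists for the duration of the block.
    assert os.path.isdir(temp_dir)
    with open(os.path.join(temp_dir, "scratch.txt"), "w") as f:
        f.write("hello")

# On exit the directory and its contents are removed.
assert not os.path.exists(temp_dir)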
Example #3
def test_pipeline_reexecution_successful_launch():
    test_queue = InMemoryRunLauncher()

    with seven.TemporaryDirectory() as temp_dir:
        instance = DagsterInstance(
            instance_type=InstanceType.EPHEMERAL,
            local_artifact_storage=LocalArtifactStorage(temp_dir),
            run_storage=InMemoryRunStorage(),
            event_storage=InMemoryEventLogStorage(),
            compute_log_manager=NoOpComputeLogManager(temp_dir),
            run_launcher=test_queue,
        )

    context = define_context_for_repository_yaml(path=file_relative_path(
        __file__, '../repository.yaml'),
                                                 instance=instance)
    run_id = make_new_run_id()
    result = execute_dagster_graphql(
        context=context,
        query=LAUNCH_PIPELINE_EXECUTION_MUTATION,
        variables={
            'executionParams': {
                'selector': {
                    'name': 'no_config_pipeline'
                },
                'environmentConfigData': {
                    'storage': {
                        'filesystem': {}
                    }
                },
                'executionMetadata': {
                    'runId': run_id
                },
                'mode': 'default',
            }
        },
    )

    assert result.data['launchPipelineExecution'][
        '__typename'] == 'LaunchPipelineExecutionSuccess'
    assert result.data['launchPipelineExecution']['run'][
        'status'] == 'NOT_STARTED'
    test_queue.run_one(instance)
    result = execute_dagster_graphql(context=context,
                                     query=RUN_QUERY,
                                     variables={'runId': run_id})
    assert result.data['pipelineRunOrError']['__typename'] == 'PipelineRun'
    assert result.data['pipelineRunOrError']['status'] == 'SUCCESS'

    # reexecution
    new_run_id = make_new_run_id()
    result = execute_dagster_graphql(
        context=context,
        query=LAUNCH_PIPELINE_REEXECUTION_MUTATION,
        variables={
            'executionParams': {
                'selector': {
                    'name': 'no_config_pipeline'
                },
                'environmentConfigData': {
                    'storage': {
                        'filesystem': {}
                    }
                },
                'executionMetadata': {
                    'runId': new_run_id,
                    'rootRunId': run_id,
                    'parentRunId': run_id,
                },
                'mode': 'default',
            }
        },
    )
    assert (result.data['launchPipelineReexecution']['__typename'] ==
            'LaunchPipelineReexecutionSuccess')

    test_queue.run_one(instance)

    result = execute_dagster_graphql(context=context,
                                     query=RUN_QUERY,
                                     variables={'runId': new_run_id})
    assert result.data['pipelineRunOrError']['__typename'] == 'PipelineRun'
    assert result.data['pipelineRunOrError']['status'] == 'SUCCESS'
Example #4
def create_sqlite_run_event_logstorage():
    with seven.TemporaryDirectory() as tmpdir_path:
        yield SqliteEventLogStorage(tmpdir_path)
Example #5
def create_in_memory_event_log_instance():
    with seven.TemporaryDirectory() as temp_dir:
        asset_storage = InMemoryEventLogStorage()
        instance = get_instance(temp_dir, asset_storage)
        yield [instance, asset_storage]
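In the original module this generator is presumably registered as a pytest fixture (the decorator is not shown), so a test can request it by name and unpack the yielded pair. A minimal sketch with a hypothetical test name, assuming the `get_instance` helper wires `asset_storage` in as the instance's event log storage:

def test_instance_uses_in_memory_event_log(create_in_memory_event_log_instance):
    instance, event_log_storage = create_in_memory_event_log_instance
    # Same private attribute that the migration example further below inspects.
    assert instance._event_storage is event_log_storage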
Example #6
def test_run_groups_over_time():
    with seven.TemporaryDirectory() as tempdir:
        instance = DagsterInstance.local_temp(tempdir=tempdir)

        repo_1 = get_repo_at_time_1()

        full_evolve_run_id = execute_pipeline(
            repo_1.get_pipeline("evolving_pipeline"), instance=instance).run_id
        foo_run_id = execute_pipeline(repo_1.get_pipeline("foo_pipeline"),
                                      instance=instance).run_id
        evolve_a_run_id = execute_pipeline(
            repo_1.get_pipeline("evolving_pipeline").get_pipeline_subset_def(
                {"solid_A"}),
            instance=instance,
        ).run_id
        evolve_b_run_id = execute_pipeline(
            repo_1.get_pipeline("evolving_pipeline").get_pipeline_subset_def(
                {"solid_B"}),
            instance=instance,
        ).run_id

        context_at_time_1 = define_out_of_process_context(
            __file__, "get_repo_at_time_1", instance)

        result = execute_dagster_graphql(context_at_time_1,
                                         ALL_RUN_GROUPS_QUERY)
        assert result.data
        assert "runGroupsOrError" in result.data
        assert "results" in result.data["runGroupsOrError"]
        assert len(result.data["runGroupsOrError"]["results"]) == 4

        t1_runs = {
            run["runId"]: run
            for group in result.data["runGroupsOrError"]["results"]
            for run in group["runs"]
        }

        # test full_evolve_run_id
        assert t1_runs[full_evolve_run_id]["pipeline"] == {
            "__typename": "PipelineSnapshot",
            "name": "evolving_pipeline",
            "solidSelection": None,
        }

        # test foo_run_id
        assert t1_runs[foo_run_id]["pipeline"] == {
            "__typename": "PipelineSnapshot",
            "name": "foo_pipeline",
            "solidSelection": None,
        }

        # test evolve_a_run_id
        assert t1_runs[evolve_a_run_id]["pipeline"] == {
            "__typename": "PipelineSnapshot",
            "name": "evolving_pipeline",
            "solidSelection": ["solid_A"],
        }
        assert t1_runs[evolve_a_run_id]["pipelineSnapshotId"]

        # test evolve_b_run_id
        assert t1_runs[evolve_b_run_id]["pipeline"] == {
            "__typename": "PipelineSnapshot",
            "name": "evolving_pipeline",
            "solidSelection": ["solid_B"],
        }

        context_at_time_2 = define_out_of_process_context(
            __file__, "get_repo_at_time_2", instance)

        result = execute_dagster_graphql(context_at_time_2,
                                         ALL_RUN_GROUPS_QUERY)
        assert "runGroupsOrError" in result.data
        assert "results" in result.data["runGroupsOrError"]
        assert len(result.data["runGroupsOrError"]["results"]) == 4

        t2_runs = {
            run["runId"]: run
            for group in result.data["runGroupsOrError"]["results"]
            for run in group["runs"]
        }

        # test full_evolve_run_id
        assert t2_runs[full_evolve_run_id]["pipeline"] == {
            "__typename": "PipelineSnapshot",
            "name": "evolving_pipeline",
            "solidSelection": None,
        }

        # test evolve_a_run_id
        assert t2_runs[evolve_a_run_id]["pipeline"] == {
            "__typename": "PipelineSnapshot",
            "name": "evolving_pipeline",
            "solidSelection": ["solid_A"],
        }
        assert t2_runs[evolve_a_run_id]["pipelineSnapshotId"]

        # names same
        assert (t1_runs[full_evolve_run_id]["pipeline"]["name"] ==
                t2_runs[evolve_a_run_id]["pipeline"]["name"])

        # snapshots differ
        assert (t1_runs[full_evolve_run_id]["pipelineSnapshotId"] !=
                t2_runs[evolve_a_run_id]["pipelineSnapshotId"])

        # pipeline name changed
        assert t2_runs[foo_run_id]["pipeline"] == {
            "__typename": "PipelineSnapshot",
            "name": "foo_pipeline",
            "solidSelection": None,
        }
        # subset no longer valid - b renamed
        assert t2_runs[evolve_b_run_id]["pipeline"] == {
            "__typename": "PipelineSnapshot",
            "name": "evolving_pipeline",
            "solidSelection": ["solid_B"],
        }
Example #7
def test_priority_pipeline():
    with seven.TemporaryDirectory() as tempdir:
        result = execute_pipeline_on_celery(tempdir, 'priority_pipeline')
        assert result.success
Example #8
def test_0_7_6_postgres_pre_add_pipeline_snapshot(hostname, conn_string):
    engine = create_engine(conn_string)
    engine.execute("drop schema public cascade;")
    engine.execute("create schema public;")

    env = os.environ.copy()
    env["PGPASSWORD"] = "******"
    subprocess.check_call(
        [
            "psql",
            "-h",
            hostname,
            "-p",
            "5432",
            "-U",
            "test",
            "-f",
            file_relative_path(
                __file__, "snapshot_0_7_6_pre_add_pipeline_snapshot/postgres/pg_dump.txt"
            ),
        ],
        env=env,
    )

    run_id = "d5f89349-7477-4fab-913e-0925cef0a959"

    with seven.TemporaryDirectory() as tempdir:
        with open(file_relative_path(__file__, "dagster.yaml"), "r") as template_fd:
            with open(os.path.join(tempdir, "dagster.yaml"), "w") as target_fd:
                template = template_fd.read().format(hostname=hostname)
                target_fd.write(template)

        instance = DagsterInstance.from_config(tempdir)

        @solid
        def noop_solid(_):
            pass

        @pipeline
        def noop_pipeline():
            noop_solid()

        with pytest.raises(
            DagsterInstanceMigrationRequired, match=_migration_regex("run", current_revision=None)
        ):
            execute_pipeline(noop_pipeline, instance=instance)

        # ensure migration is run
        instance.upgrade()

        runs = instance.get_runs()

        assert len(runs) == 1

        assert runs[0].run_id == run_id

        run = instance.get_run_by_id(run_id)

        assert run.run_id == run_id
        assert run.pipeline_snapshot_id is None
        result = execute_pipeline(noop_pipeline, instance=instance)

        assert result.success

        runs = instance.get_runs()
        assert len(runs) == 2

        new_run_id = result.run_id

        new_run = instance.get_run_by_id(new_run_id)

        assert new_run.pipeline_snapshot_id
Example #9
def graphql_context():
    with seven.TemporaryDirectory() as temp_dir:
        yield define_test_context(DagsterInstance.local_temp(temp_dir))
Example #10
    def _post_artifacts(self, log, step_run_ref, run_id, step_key):
        '''
        Synchronize the step run ref and pyspark code to an S3 staging bucket for use on EMR.

        For the zip file, consider the following toy example:

            # Folder: my_pyspark_project/
            # a.py
            def foo():
                print(1)

            # b.py
            def bar():
                print(2)

            # main.py
            from a import foo
            from b import bar

            foo()
            bar()

        This will zip up `my_pyspark_project/` as `my_pyspark_project.zip`. Then, when running
        `spark-submit --py-files my_pyspark_project.zip emr_step_main.py` on EMR this will
        print 1, 2.
        '''

        with seven.TemporaryDirectory() as temp_dir:
            s3 = boto3.client('s3', region_name=self.region_name)

            # Upload step run ref
            def _upload_file_to_s3(local_path, s3_filename):
                key = self._artifact_s3_key(run_id, step_key, s3_filename)
                s3_uri = self._artifact_s3_uri(run_id, step_key, s3_filename)
                log.debug('Uploading file {local_path} to {s3_uri}'.format(
                    local_path=local_path, s3_uri=s3_uri))
                s3.upload_file(Filename=local_path,
                               Bucket=self.staging_bucket,
                               Key=key)

            # Upload main file.
            # The remote Dagster installation should also have the file, but locating it there
            # could be a pain.
            main_local_path = self._main_file_local_path()
            _upload_file_to_s3(main_local_path, self._main_file_name())

            if self.deploy_local_pipeline_package:
                # Zip and upload package containing pipeline
                zip_local_path = os.path.join(temp_dir, CODE_ZIP_NAME)
                build_pyspark_zip(zip_local_path,
                                  self.local_pipeline_package_path)
                _upload_file_to_s3(zip_local_path, CODE_ZIP_NAME)

            # Create step run ref pickle file
            step_run_ref_local_path = os.path.join(
                temp_dir, PICKLED_STEP_RUN_REF_FILE_NAME)
            with open(step_run_ref_local_path, 'wb') as step_pickle_file:
                pickle.dump(step_run_ref, step_pickle_file)

            _upload_file_to_s3(step_run_ref_local_path,
                               PICKLED_STEP_RUN_REF_FILE_NAME)
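The docstring above describes zipping the local package so that `spark-submit --py-files` can import it on EMR. The `build_pyspark_zip` helper itself is not shown here; a rough, hypothetical stand-in (not Dagster's implementation) that packages a directory this way could look like:

import os
import zipfile

def zip_python_package(zip_local_path, package_dir):
    # Store each file under its path relative to the package root so that
    # `from a import foo` resolves once the archive is on the Python path.
    with zipfile.ZipFile(zip_local_path, "w", zipfile.ZIP_DEFLATED) as zf:
        for root, _dirs, files in os.walk(package_dir):
            for filename in files:
                full_path = os.path.join(root, filename)
                zf.write(full_path, os.path.relpath(full_path, package_dir))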
Example #11
def dagster_cli_runner():
    with seven.TemporaryDirectory() as dagster_home_temp:
        with instance_for_test_tempdir(dagster_home_temp):
            yield CliRunner(env={"DAGSTER_HOME": dagster_home_temp})
Example #12
def sqlite_instance_with_manager_disabled():
    with seven.TemporaryDirectory() as temp_dir:
        yield DagsterInstance.local_temp(
            tempdir=temp_dir, overrides={'dagit': {'execution_manager': {'disabled': True}}}
        )
Example #13
def dagster_cli_runner():
    with seven.TemporaryDirectory() as dagster_home_temp:
        yield CliRunner(env={'DAGSTER_HOME': dagster_home_temp})
Example #14
def test_execute_display_command():
    with seven.TemporaryDirectory() as temp_dir:
        run_store = SqliteRunStorage.from_local(temp_dir)
        event_store = ConsolidatedSqliteEventLogStorage(temp_dir)
        compute_log_manager = LocalComputeLogManager(temp_dir)
        instance = DagsterInstance(
            instance_type=InstanceType.PERSISTENT,
            local_artifact_storage=LocalArtifactStorage(temp_dir),
            run_storage=run_store,
            event_storage=event_store,
            compute_log_manager=compute_log_manager,
            run_coordinator=DefaultRunCoordinator(),
            run_launcher=DefaultRunLauncher(),
        )
        run_config = {
            "solids": {
                "create_string_1": {
                    "config": {
                        "input_str": "apple",
                        "base_dir": temp_dir
                    }
                },
                "create_string_2": {
                    "config": {
                        "input_str": "apple",
                        "base_dir": temp_dir
                    }
                },
                "take_string_1": {
                    "config": {
                        "input_str": "apple",
                        "base_dir": temp_dir
                    }
                },
                "take_string_2": {
                    "config": {
                        "input_str": "apple",
                        "base_dir": temp_dir
                    }
                },
                "take_string_two_inputs": {
                    "config": {
                        "input_str": "apple",
                        "base_dir": temp_dir
                    }
                },
            },
            "intermediate_storage": {
                "filesystem": {
                    "config": {
                        "base_dir": temp_dir
                    }
                }
            },
        }

        # write run config to temp file
        # file is temp because intermediate storage directory is temporary
        with open(os.path.join(temp_dir, "pipeline_config.yaml"), "w") as f:
            f.write(yaml.dump(run_config))

        kwargs = {
            "config": (os.path.join(temp_dir, "pipeline_config.yaml"),),
            "pipeline": "basic_pipeline",
            "python_file": file_relative_path(
                __file__,
                "../../core_tests/execution_tests/memoized_dev_loop_pipeline.py",
            ),
            "tags": '{"dagster/is_memoized_run": "true"}',
        }

        with Capturing() as output:
            execute_list_versions_command(kwargs=kwargs, instance=instance)

        assert output

        # execute the pipeline once so that addresses have been populated.

        result = execute_pipeline(
            basic_pipeline,
            run_config=run_config,
            mode="only_mode",
            tags={"dagster/is_memoized_run": "true"},
            instance=instance,
        )
        assert result.success

        with Capturing() as output:
            execute_list_versions_command(kwargs=kwargs, instance=instance)

        assert output
Example #15
def default_instance(overrides=None):
    with seven.TemporaryDirectory() as temp_dir:
        with default_instance_tempdir(temp_dir, overrides) as instance:
            yield instance
Example #16
def test_init_compute_log_with_bad_config():
    with seven.TemporaryDirectory() as tmpdir_path:
        with open(os.path.join(tmpdir_path, 'dagster.yaml'), 'w') as fd:
            yaml.dump({'compute_logs': {'garbage': 'flargh'}}, fd, default_flow_style=False)
        with pytest.raises(DagsterInvalidConfigError, match='Undefined field "garbage"'):
            DagsterInstance.from_ref(InstanceRef.from_dir(tmpdir_path))
Example #17
def schedule_tempdir():
    with seven.TemporaryDirectory() as tempdir:
        yield tempdir
Example #18
def test_init_compute_log_with_bad_config_override():
    with seven.TemporaryDirectory() as tmpdir_path:
        with pytest.raises(DagsterInvalidConfigError, match='Undefined field "garbage"'):
            DagsterInstance.from_ref(
                InstanceRef.from_dir(tmpdir_path, overrides={'compute_logs': {'garbage': 'flargh'}})
            )
Example #19
def create_consolidated_sqlite_run_event_log_storage():
    with seven.TemporaryDirectory() as tmpdir_path:
        yield ConsolidatedSqliteEventLogStorage(tmpdir_path)
Example #20
def tempdir_wrapper(tempdir=None):
    if tempdir:
        yield tempdir
    else:
        with seven.TemporaryDirectory() as t:
            yield t
Example #21
def test_compute_log_manager(s3_bucket):
    @pipeline
    def simple():
        @solid
        def easy(context):
            context.log.info("easy")
            print(HELLO_WORLD)  # pylint: disable=print-call
            return "easy"

        easy()

    # Uses mock S3
    s3 = boto3.client("s3")
    s3.create_bucket(Bucket=s3_bucket)

    with seven.TemporaryDirectory() as temp_dir:
        run_store = SqliteRunStorage.from_local(temp_dir)
        event_store = SqliteEventLogStorage(temp_dir)
        manager = S3ComputeLogManager(bucket=s3_bucket,
                                      prefix="my_prefix",
                                      local_dir=temp_dir)
        instance = DagsterInstance(
            instance_type=InstanceType.PERSISTENT,
            local_artifact_storage=LocalArtifactStorage(temp_dir),
            run_storage=run_store,
            event_storage=event_store,
            compute_log_manager=manager,
            run_launcher=CliApiRunLauncher(),
        )
        result = execute_pipeline(simple, instance=instance)
        compute_steps = [
            event.step_key for event in result.step_event_list
            if event.event_type == DagsterEventType.STEP_START
        ]
        assert len(compute_steps) == 1
        step_key = compute_steps[0]

        stdout = manager.read_logs_file(result.run_id, step_key,
                                        ComputeIOType.STDOUT)
        assert stdout.data == HELLO_WORLD + SEPARATOR

        stderr = manager.read_logs_file(result.run_id, step_key,
                                        ComputeIOType.STDERR)
        for expected in EXPECTED_LOGS:
            assert expected in stderr.data

        # Check S3 directly
        s3_object = s3.get_object(
            Bucket=s3_bucket,
            Key="{prefix}/storage/{run_id}/compute_logs/easy.compute.err".
            format(prefix="my_prefix", run_id=result.run_id),
        )
        stderr_s3 = six.ensure_str(s3_object["Body"].read())
        for expected in EXPECTED_LOGS:
            assert expected in stderr_s3

        # Check download behavior by deleting locally cached logs
        compute_logs_dir = os.path.join(temp_dir, result.run_id,
                                        "compute_logs")
        for filename in os.listdir(compute_logs_dir):
            os.unlink(os.path.join(compute_logs_dir, filename))

        stdout = manager.read_logs_file(result.run_id, step_key,
                                        ComputeIOType.STDOUT)
        assert stdout.data == HELLO_WORLD + SEPARATOR

        stderr = manager.read_logs_file(result.run_id, step_key,
                                        ComputeIOType.STDERR)
        for expected in EXPECTED_LOGS:
            assert expected in stderr.data
Example #22
def test_compute_log_manager(
    mock_create_blob_client, mock_generate_blob_sas, storage_account, container, credential
):
    mock_generate_blob_sas.return_value = "fake-url"
    fake_client = FakeBlobServiceClient(storage_account)
    mock_create_blob_client.return_value = fake_client

    @pipeline
    def simple():
        @solid
        def easy(context):
            context.log.info("easy")
            print(HELLO_WORLD)  # pylint: disable=print-call
            return "easy"

        easy()

    with seven.TemporaryDirectory() as temp_dir:
        run_store = SqliteRunStorage.from_local(temp_dir)
        event_store = SqliteEventLogStorage(temp_dir)
        manager = AzureBlobComputeLogManager(
            storage_account=storage_account,
            container=container,
            prefix="my_prefix",
            local_dir=temp_dir,
            secret_key=credential,
        )
        instance = DagsterInstance(
            instance_type=InstanceType.PERSISTENT,
            local_artifact_storage=LocalArtifactStorage(temp_dir),
            run_storage=run_store,
            event_storage=event_store,
            compute_log_manager=manager,
            run_coordinator=DefaultRunCoordinator(),
            run_launcher=SyncInMemoryRunLauncher(),
        )
        result = execute_pipeline(simple, instance=instance)
        compute_steps = [
            event.step_key
            for event in result.step_event_list
            if event.event_type == DagsterEventType.STEP_START
        ]
        assert len(compute_steps) == 1
        step_key = compute_steps[0]

        stdout = manager.read_logs_file(result.run_id, step_key, ComputeIOType.STDOUT)
        assert stdout.data == HELLO_WORLD + SEPARATOR

        stderr = manager.read_logs_file(result.run_id, step_key, ComputeIOType.STDERR)
        for expected in EXPECTED_LOGS:
            assert expected in stderr.data

        # Check ADLS2 directly
        adls2_object = fake_client.get_blob_client(
            container=container,
            blob="{prefix}/storage/{run_id}/compute_logs/easy.compute.err".format(
                prefix="my_prefix", run_id=result.run_id
            ),
        )
        adls2_stderr = six.ensure_str(adls2_object.download_blob().readall())
        for expected in EXPECTED_LOGS:
            assert expected in adls2_stderr

        # Check download behavior by deleting locally cached logs
        compute_logs_dir = os.path.join(temp_dir, result.run_id, "compute_logs")
        for filename in os.listdir(compute_logs_dir):
            os.unlink(os.path.join(compute_logs_dir, filename))

        stdout = manager.read_logs_file(result.run_id, step_key, ComputeIOType.STDOUT)
        assert stdout.data == HELLO_WORLD + SEPARATOR

        stderr = manager.read_logs_file(result.run_id, step_key, ComputeIOType.STDERR)
        for expected in EXPECTED_LOGS:
            assert expected in stderr.data
Example #23
def test_input_manager_with_retries():
    _called = False
    _count = {"total": 0}

    @input_manager
    def should_succeed(_, _resource_config):
        if _count["total"] < 2:
            _count["total"] += 1
            raise RetryRequested(max_retries=3)
        return "foo"

    @input_manager
    def should_retry(_, _resource_config):
        raise RetryRequested(max_retries=3)

    @input_manager
    def should_not_execute(_, _resource_config):
        _called = True

    @pipeline(mode_defs=[
        ModeDefinition(
            resource_defs={
                "should_succeed": should_succeed,
                "should_not_execute": should_not_execute,
                "should_retry": should_retry,
            })
    ])
    def simple():
        @solid
        def source_solid(_):
            return "foo"

        @solid(input_defs=[
            InputDefinition("solid_input", manager_key="should_succeed")
        ])
        def take_input_1(_, solid_input):
            return solid_input

        @solid(input_defs=[
            InputDefinition("solid_input", manager_key="should_retry")
        ])
        def take_input_2(_, solid_input):
            return solid_input

        @solid(input_defs=[
            InputDefinition("solid_input", manager_key="should_not_execute")
        ])
        def take_input_3(_, solid_input):
            return solid_input

        take_input_3(take_input_2(take_input_1(source_solid())))

    with seven.TemporaryDirectory() as tmpdir_path:

        instance = DagsterInstance.from_ref(InstanceRef.from_dir(tmpdir_path))

        result = execute_pipeline(simple,
                                  instance=instance,
                                  raise_on_error=False)

        step_stats = instance.get_run_step_stats(result.run_id)
        assert len(step_stats) == 3

        step_stats_1 = instance.get_run_step_stats(result.run_id,
                                                   step_keys=["take_input_1"])
        assert len(step_stats_1) == 1
        step_stat_1 = step_stats_1[0]
        assert step_stat_1.status.value == "SUCCESS"
        assert step_stat_1.attempts == 3

        step_stats_2 = instance.get_run_step_stats(result.run_id,
                                                   step_keys=["take_input_2"])
        assert len(step_stats_2) == 1
        step_stat_2 = step_stats_2[0]
        assert step_stat_2.status.value == "FAILURE"
        assert step_stat_2.attempts == 4

        step_stats_3 = instance.get_run_step_stats(result.run_id,
                                                   step_keys=["take_input_3"])
        assert len(step_stats_3) == 0
        assert _called == False
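The asserted attempt counts follow directly from RetryRequested(max_retries=3): should_succeed raises on its first two attempts and returns "foo" on the third (2 failures + 1 success = 3 attempts, SUCCESS), while should_retry raises on every attempt and exhausts its budget (1 initial attempt + 3 retries = 4 attempts, FAILURE), so the downstream take_input_3 step never executes and records no step stats.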
Example #24
def test_0_7_6_postgres_pre_add_pipeline_snapshot(hostname, conn_string):
    engine = create_engine(conn_string)
    engine.execute('drop schema public cascade;')
    engine.execute('create schema public;')

    env = os.environ.copy()
    env['PGPASSWORD'] = '******'
    subprocess.check_call(
        [
            'psql',
            '-h',
            hostname,
            '-p',
            '5432',
            '-U',
            'test',
            '-f',
            file_relative_path(
                __file__,
                'snapshot_0_7_6_pre_add_pipeline_snapshot/postgres/pg_dump.txt'
            ),
        ],
        env=env,
    )

    run_id = 'd5f89349-7477-4fab-913e-0925cef0a959'

    with seven.TemporaryDirectory() as tempdir:
        with open(file_relative_path(__file__, 'dagster.yaml'),
                  'r') as template_fd:
            with open(os.path.join(tempdir, 'dagster.yaml'), 'w') as target_fd:
                template = template_fd.read().format(hostname=hostname)
                target_fd.write(template)

        instance = DagsterInstance.from_config(tempdir)

        @solid
        def noop_solid(_):
            pass

        @pipeline
        def noop_pipeline():
            noop_solid()

        with pytest.raises(DagsterInstanceMigrationRequired,
                           match=_migration_regex(current_revision=None)):
            execute_pipeline(noop_pipeline, instance=instance)

        # ensure migration is run
        instance.upgrade()

        runs = instance.get_runs()

        assert len(runs) == 1

        assert runs[0].run_id == run_id

        run = instance.get_run_by_id(run_id)

        assert run.run_id == run_id
        assert run.pipeline_snapshot_id is None
        result = execute_pipeline(noop_pipeline, instance=instance)

        assert result.success

        runs = instance.get_runs()
        assert len(runs) == 2

        new_run_id = result.run_id

        new_run = instance.get_run_by_id(new_run_id)

        assert new_run.pipeline_snapshot_id
Example #25
def create_consolidated_sqlite_event_log_instance():
    with seven.TemporaryDirectory() as temp_dir:
        asset_storage = ConsolidatedSqliteEventLogStorage(temp_dir)
        instance = get_instance(temp_dir, asset_storage)
        yield [instance, asset_storage]
Example #26
def test_0_7_6_postgres_pre_event_log_migration(hostname, conn_string):
    engine = create_engine(conn_string)
    engine.execute('drop schema public cascade;')
    engine.execute('create schema public;')

    env = os.environ.copy()
    env['PGPASSWORD'] = '******'
    subprocess.check_call(
        [
            'psql',
            '-h',
            hostname,
            '-p',
            '5432',
            '-U',
            'test',
            '-f',
            file_relative_path(
                __file__,
                'snapshot_0_7_6_pre_event_log_migration/postgres/pg_dump.txt'),
        ],
        env=env,
    )

    run_id = 'ca7f1e33-526d-4f75-9bc5-3e98da41ab97'

    with seven.TemporaryDirectory() as tempdir:
        with open(file_relative_path(__file__, 'dagster.yaml'),
                  'r') as template_fd:
            with open(os.path.join(tempdir, 'dagster.yaml'), 'w') as target_fd:
                template = template_fd.read().format(hostname=hostname)
                target_fd.write(template)

        instance = DagsterInstance.from_config(tempdir)

        # Runs will appear in DB, but event logs need migration
        runs = instance.get_runs()
        assert len(runs) == 1
        assert instance.get_run_by_id(run_id)

        # Make sure the schema is migrated
        instance.upgrade()

        assert isinstance(instance._event_storage, SqlEventLogStorage)
        events_by_id = instance._event_storage.get_logs_for_run_by_log_id(
            run_id)
        assert len(events_by_id) == 40

        step_key_records = []
        for record_id, _event in events_by_id.items():
            row_data = instance._event_storage.get_event_log_table_data(
                run_id, record_id)
            if row_data.step_key is not None:
                step_key_records.append(row_data)
        assert len(step_key_records) == 0

        # run the event_log data migration
        migrate_event_log_data(instance=instance)

        step_key_records = []
        for record_id, _event in events_by_id.items():
            row_data = instance._event_storage.get_event_log_table_data(
                run_id, record_id)
            if row_data.step_key is not None:
                step_key_records.append(row_data)
        assert len(step_key_records) > 0
Example #27
def test_dev_loop_changing_versions():
    with seven.TemporaryDirectory() as temp_dir:
        run_store = SqliteRunStorage.from_local(temp_dir)
        event_store = ConsolidatedSqliteEventLogStorage(temp_dir)
        compute_log_manager = LocalComputeLogManager(temp_dir)
        instance = DagsterInstance(
            instance_type=InstanceType.PERSISTENT,
            local_artifact_storage=LocalArtifactStorage(temp_dir),
            run_storage=run_store,
            event_storage=event_store,
            compute_log_manager=compute_log_manager,
            run_coordinator=DefaultRunCoordinator(),
            run_launcher=DefaultRunLauncher(),
        )

        run_config = {
            "solids": {
                "create_string_1": {
                    "config": {
                        "input_str": "apple",
                        "base_dir": temp_dir
                    }
                },
                "create_string_2": {
                    "config": {
                        "input_str": "apple",
                        "base_dir": temp_dir
                    }
                },
                "take_string_1": {
                    "config": {
                        "input_str": "apple",
                        "base_dir": temp_dir
                    }
                },
                "take_string_2": {
                    "config": {
                        "input_str": "apple",
                        "base_dir": temp_dir
                    }
                },
                "take_string_two_inputs": {
                    "config": {
                        "input_str": "apple",
                        "base_dir": temp_dir
                    }
                },
            },
            "intermediate_storage": {
                "filesystem": {
                    "config": {
                        "base_dir": temp_dir
                    }
                }
            },
        }

        result = execute_pipeline(
            basic_pipeline,
            run_config=run_config,
            mode="only_mode",
            tags={"dagster/is_memoized_run": "true"},
            instance=instance,
        )
        assert result.success

        assert not get_step_keys_to_execute(instance, basic_pipeline,
                                            run_config, "only_mode")

        run_config["solids"]["take_string_1"]["config"]["input_str"] = "banana"

        assert set(
            get_step_keys_to_execute(instance, basic_pipeline, run_config,
                                     "only_mode")) == set([
                                         "take_string_1.compute",
                                         "take_string_two_inputs.compute"
                                     ])

        result2 = execute_pipeline(
            basic_pipeline,
            run_config=run_config,
            mode="only_mode",
            tags={"dagster/is_memoized_run": "true"},
            instance=instance,
        )
        assert result2.success

        assert not get_step_keys_to_execute(instance, basic_pipeline,
                                            run_config, "only_mode")

        run_config["solids"]["take_string_two_inputs"]["config"][
            "input_str"] = "banana"

        assert get_step_keys_to_execute(
            instance, basic_pipeline, run_config,
            "only_mode") == ["take_string_two_inputs.compute"]

        result3 = execute_pipeline(
            basic_pipeline,
            run_config=run_config,
            mode="only_mode",
            tags={"dagster/is_memoized_run": "true"},
            instance=instance,
        )
        assert result3.success

        assert not get_step_keys_to_execute(instance, basic_pipeline,
                                            run_config, "only_mode")
Example #28
def create_sqlite_run_storage():
    with seven.TemporaryDirectory() as tempdir:
        yield SqliteRunStorage.from_local(tempdir)
Example #29
def test_output_manager_with_failure():
    _called_input_manager = False
    _called_solid = False

    @output_manager
    def should_fail(_, _resource_config, _obj):
        raise Failure(
            description="Foolure",
            metadata_entries=[
                EventMetadataEntry.text(label="label",
                                        text="text",
                                        description="description")
            ],
        )

    @input_manager
    def should_not_enter(_):
        _called_input_manager = True

    @solid(output_defs=[OutputDefinition(manager_key="should_fail")])
    def emit_str(_):
        return "emit"

    @solid(input_defs=[
        InputDefinition(name="_input_str",
                        dagster_type=str,
                        manager_key="should_not_enter")
    ])
    def should_not_call(_, _input_str):
        _called_solid = True

    @pipeline(mode_defs=[
        ModeDefinition(resource_defs={
            "should_fail": should_fail,
            "should_not_enter": should_not_enter
        })
    ])
    def simple():
        should_not_call(emit_str())

    with seven.TemporaryDirectory() as tmpdir_path:

        instance = DagsterInstance.from_ref(InstanceRef.from_dir(tmpdir_path))

        result = execute_pipeline(simple,
                                  instance=instance,
                                  raise_on_error=False)

        assert not result.success

        failure_data = result.result_for_solid("emit_str").failure_data

        assert failure_data.error.cls_name == "Failure"

        assert failure_data.user_failure_data.description == "Foolure"
        assert failure_data.user_failure_data.metadata_entries[
            0].label == "label"
        assert failure_data.user_failure_data.metadata_entries[
            0].entry_data.text == "text"
        assert failure_data.user_failure_data.metadata_entries[
            0].description == "description"

        assert not _called_input_manager and not _called_solid
Example #30
def test_runs_over_time():
    with seven.TemporaryDirectory() as temp_dir:
        instance = DagsterInstance.local_temp(temp_dir)

        repo_1 = get_repo_at_time_1()

        full_evolve_run_id = execute_pipeline(
            repo_1.get_pipeline("evolving_pipeline"), instance=instance).run_id
        foo_run_id = execute_pipeline(repo_1.get_pipeline("foo_pipeline"),
                                      instance=instance).run_id
        evolve_a_run_id = execute_pipeline(
            repo_1.get_pipeline("evolving_pipeline").get_pipeline_subset_def(
                {"solid_A"}),
            instance=instance,
        ).run_id
        evolve_b_run_id = execute_pipeline(
            repo_1.get_pipeline("evolving_pipeline").get_pipeline_subset_def(
                {"solid_B"}),
            instance=instance,
        ).run_id

        context_at_time_1 = define_context_for_file(__file__,
                                                    "get_repo_at_time_1",
                                                    instance)

        result = execute_dagster_graphql(context_at_time_1, ALL_RUNS_QUERY)
        assert result.data

        t1_runs = {
            run["runId"]: run
            for run in result.data["pipelineRunsOrError"]["results"]
        }

        assert t1_runs[full_evolve_run_id]["pipeline"] == {
            "__typename": "PipelineSnapshot",
            "name": "evolving_pipeline",
            "solidSelection": None,
        }

        assert t1_runs[foo_run_id]["pipeline"] == {
            "__typename": "PipelineSnapshot",
            "name": "foo_pipeline",
            "solidSelection": None,
        }

        assert t1_runs[evolve_a_run_id]["pipeline"] == {
            "__typename": "PipelineSnapshot",
            "name": "evolving_pipeline",
            "solidSelection": ["solid_A"],
        }

        assert t1_runs[evolve_b_run_id]["pipeline"] == {
            "__typename": "PipelineSnapshot",
            "name": "evolving_pipeline",
            "solidSelection": ["solid_B"],
        }

        context_at_time_2 = define_context_for_file(__file__,
                                                    "get_repo_at_time_2",
                                                    instance)

        result = execute_dagster_graphql(context_at_time_2, ALL_RUNS_QUERY)
        assert result.data

        t2_runs = {
            run["runId"]: run
            for run in result.data["pipelineRunsOrError"]["results"]
        }

        assert t2_runs[full_evolve_run_id]["pipeline"] == {
            "__typename": "PipelineSnapshot",
            "name": "evolving_pipeline",
            "solidSelection": None,
        }

        assert t2_runs[evolve_a_run_id]["pipeline"] == {
            "__typename": "PipelineSnapshot",
            "name": "evolving_pipeline",
            "solidSelection": ["solid_A"],
        }
        # pipeline name changed
        assert t2_runs[foo_run_id]["pipeline"] == {
            "__typename": "PipelineSnapshot",
            "name": "foo_pipeline",
            "solidSelection": None,
        }
        # subset no longer valid - b renamed
        assert t2_runs[evolve_b_run_id]["pipeline"] == {
            "__typename": "PipelineSnapshot",
            "name": "evolving_pipeline",
            "solidSelection": ["solid_B"],
        }