Example #1
    def test_paginated_fetch(self, storage):
        assert storage
        one, two, three = [
            make_new_run_id(),
            make_new_run_id(),
            make_new_run_id()
        ]
        storage.add_run(
            TestRunStorage.build_run(run_id=one,
                                     pipeline_name="some_pipeline",
                                     tags={"mytag": "hello"}))
        storage.add_run(
            TestRunStorage.build_run(run_id=two,
                                     pipeline_name="some_pipeline",
                                     tags={"mytag": "hello"}))
        storage.add_run(
            TestRunStorage.build_run(run_id=three,
                                     pipeline_name="some_pipeline",
                                     tags={"mytag": "hello"}))

        all_runs = storage.get_runs()
        assert len(all_runs) == 3
        sliced_runs = storage.get_runs(cursor=three, limit=1)
        assert len(sliced_runs) == 1
        assert sliced_runs[0].run_id == two

        all_runs = storage.get_runs(
            PipelineRunsFilter(pipeline_name="some_pipeline"))
        assert len(all_runs) == 3
        sliced_runs = storage.get_runs(
            PipelineRunsFilter(pipeline_name="some_pipeline"),
            cursor=three,
            limit=1)
        assert len(sliced_runs) == 1
        assert sliced_runs[0].run_id == two

        all_runs = storage.get_runs(
            PipelineRunsFilter(tags={"mytag": "hello"}))
        assert len(all_runs) == 3
        sliced_runs = storage.get_runs(
            PipelineRunsFilter(tags={"mytag": "hello"}), cursor=three, limit=1)
        assert len(sliced_runs) == 1
        assert sliced_runs[0].run_id == two
Example #2
def test_run_priority_pipeline():
    with seven.TemporaryDirectory() as tempdir:
        instance = DagsterInstance.local_temp(tempdir)

        low_done = threading.Event()
        hi_done = threading.Event()

        # enqueue low-priority tasks
        low_thread = threading.Thread(target=execute_on_thread,
                                      args=(tempdir, 'low_pipeline', -3,
                                            low_done))
        low_thread.daemon = True
        low_thread.start()

        time.sleep(1)  # sleep so that we don't hit any sqlite concurrency issues

        # enqueue hi-priority tasks
        hi_thread = threading.Thread(target=execute_on_thread,
                                     args=(tempdir, 'hi_pipeline', 3, hi_done))
        hi_thread.daemon = True
        hi_thread.start()

        time.sleep(5)  # sleep to give queue time to prioritize tasks

        with start_celery_worker():
            while not low_done.is_set() or not hi_done.is_set():
                time.sleep(1)

            low_runs = instance.get_runs(filters=PipelineRunsFilter(
                pipeline_name='low_pipeline'))
            assert len(low_runs) == 1
            low_run = low_runs[0]
            lowstats = instance.get_run_stats(low_run.run_id)
            hi_runs = instance.get_runs(filters=PipelineRunsFilter(
                pipeline_name='hi_pipeline'))
            assert len(hi_runs) == 1
            hi_run = hi_runs[0]
            histats = instance.get_run_stats(hi_run.run_id)

            assert lowstats.start_time < histats.start_time
            assert lowstats.end_time > histats.end_time
Example #3
    def test_fetch_by_snapshot_id(self, storage):
        assert storage
        pipeline_def_a = PipelineDefinition(name="some_pipeline",
                                            solid_defs=[])
        pipeline_def_b = PipelineDefinition(name="some_other_pipeline",
                                            solid_defs=[])
        pipeline_snapshot_a = pipeline_def_a.get_pipeline_snapshot()
        pipeline_snapshot_b = pipeline_def_b.get_pipeline_snapshot()
        pipeline_snapshot_a_id = create_pipeline_snapshot_id(
            pipeline_snapshot_a)
        pipeline_snapshot_b_id = create_pipeline_snapshot_id(
            pipeline_snapshot_b)

        assert storage.add_pipeline_snapshot(
            pipeline_snapshot_a) == pipeline_snapshot_a_id
        assert storage.add_pipeline_snapshot(
            pipeline_snapshot_b) == pipeline_snapshot_b_id

        one = make_new_run_id()
        two = make_new_run_id()
        storage.add_run(
            TestRunStorage.build_run(
                run_id=one,
                pipeline_name="some_pipeline",
                pipeline_snapshot_id=pipeline_snapshot_a_id,
            ))
        storage.add_run(
            TestRunStorage.build_run(
                run_id=two,
                pipeline_name="some_other_pipeline",
                pipeline_snapshot_id=pipeline_snapshot_b_id,
            ))
        assert len(storage.get_runs()) == 2
        runs_a = storage.get_runs(
            PipelineRunsFilter(snapshot_id=pipeline_snapshot_a_id))
        assert len(runs_a) == 1
        assert runs_a[0].run_id == one

        runs_b = storage.get_runs(
            PipelineRunsFilter(snapshot_id=pipeline_snapshot_b_id))
        assert len(runs_b) == 1
        assert runs_b[0].run_id == two
Example #4
def _recent_failed_runs_text(instance):
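    # Fetch up to five of the most recently failed runs and format them into a fixed-width summary.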
    lines = []
    runs = instance.get_runs(
        limit=5,
        filters=PipelineRunsFilter(statuses=[PipelineRunStatus.FAILURE]))
    if len(runs) <= 0:
        return ""
    for run in runs:
        lines.append("{:<50}{:<50}{:<20}".format(run.run_id, run.pipeline_name,
                                                 run.status))
    return "Recently failed runs:\n{}".format("\n".join(lines))
def test_add_get_postgres_run_storage(clean_storage):
    run_storage = clean_storage
    run_id = make_new_run_id()
    run_to_add = build_run(pipeline_name='pipeline_name', run_id=run_id)
    added = run_storage.add_run(run_to_add)
    assert added

    fetched_run = run_storage.get_run_by_id(run_id)

    assert run_to_add == fetched_run

    assert run_storage.has_run(run_id)
    assert not run_storage.has_run(make_new_run_id())

    assert run_storage.get_runs() == [run_to_add]
    assert run_storage.get_runs(PipelineRunsFilter(pipeline_name='pipeline_name')) == [run_to_add]
    assert run_storage.get_runs(PipelineRunsFilter(pipeline_name='nope')) == []

    run_storage.wipe()
    assert run_storage.get_runs() == []
Example #6
    def resolve_runs(self, graphene_info, **kwargs):
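        # Return every run launched by this backfill, capped by the optional "limit" argument.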
        from .pipelines.pipeline import GraphenePipelineRun

        filters = PipelineRunsFilter.for_backfill(self._backfill_id)
        return [
            GraphenePipelineRun(r)
            for r in graphene_info.context.instance.get_runs(
                filters=filters,
                limit=kwargs.get("limit"),
            )
        ]
Example #7
def _fetch_runs_by_partition(instance, partition_set_def, status_filters=None):
    # query runs db for this partition set
    filters = PipelineRunsFilter(tags={"dagster/partition_set": partition_set_def.name})
    partition_set_runs = instance.get_runs(filters)

    runs_by_partition = defaultdict(list)

    for run in partition_set_runs:
        if not status_filters or run.status in status_filters:
            runs_by_partition[run.tags["dagster/partition"]].append(run)

    return runs_by_partition
Example #8
    def resolve_runs(self, graphene_info, **kwargs):
        filters = kwargs.get("filter")
        partition_tags = {
            PARTITION_SET_TAG: self._external_partition_set.name,
            PARTITION_NAME_TAG: self._partition_name,
        }
        if filters is not None:
            filters = filters.to_selector()
            runs_filter = PipelineRunsFilter(
                run_ids=filters.run_ids,
                pipeline_name=filters.pipeline_name,
                statuses=filters.statuses,
                tags=merge_dicts(filters.tags, partition_tags),
            )
        else:
            runs_filter = PipelineRunsFilter(tags=partition_tags)

        return get_runs(graphene_info,
                        runs_filter,
                        cursor=kwargs.get("cursor"),
                        limit=kwargs.get("limit"))
Example #9
def _get_existing_run_for_request(instance, external_schedule, schedule_time, run_request):
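    # Look up a run that was already created for this schedule tick (and run key, if one was provided).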
    tags = merge_dicts(
        PipelineRun.tags_for_schedule(external_schedule),
        {SCHEDULED_EXECUTION_TIME_TAG: schedule_time.in_tz("UTC").isoformat(),},
    )
    if run_request.run_key:
        tags[RUN_KEY_TAG] = run_request.run_key
    runs_filter = PipelineRunsFilter(tags=tags)
    existing_runs = instance.get_runs(runs_filter)
    if not len(existing_runs):
        return None
    return existing_runs[0]
Example #10
def _fetch_runs_by_partition(instance, partition_set_def):
    # query runs db for this partition set
    filters = PipelineRunsFilter(
        tags={'dagster/partition_set': partition_set_def.name})
    partition_set_runs = instance.get_runs(filters)

    runs_by_partition = defaultdict(list)

    for run in partition_set_runs:
        runs_by_partition[run.tags['dagster/partition']].append(run)

    return runs_by_partition
Example #11
    def resolve_numRequested(self, graphene_info):
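        # A completed backfill has requested every partition; otherwise report the larger of the
        # stored run count and the index of the last partition submitted so far.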
        filters = PipelineRunsFilter.for_backfill(self._backfill_job.backfill_id)
        run_count = graphene_info.context.instance.get_runs_count(filters)
        if self._backfill_job.status == BulkActionStatus.COMPLETED:
            return len(self._backfill_job.partition_names)

        checkpoint = self._backfill_job.last_submitted_partition_name
        return max(
            run_count,
            self._backfill_job.partition_names.index(checkpoint) + 1
            if checkpoint and checkpoint in self._backfill_job.partition_names
            else 0,
        )
Example #12
def test_run_record_timestamps():
    with get_instance() as instance:
        freeze_datetime = to_timezone(
            create_pendulum_time(2019, 11, 2, 0, 0, 0, tz="US/Central"), "US/Pacific"
        )

        with pendulum.test(freeze_datetime):
            result = my_job.execute_in_process(instance=instance)
            records = instance.get_run_records(filters=PipelineRunsFilter(run_ids=[result.run_id]))
            assert len(records) == 1
            record = records[0]
            assert record.start_time == 1572670800.0
            assert record.end_time == 1572670800.0
Example #13
def tick_specific_data_from_dagster_tick(graphene_info, tick):
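    # Map a tick to its tick-specific GraphQL payload: the launched run on success, the stored error on failure.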
    from ..pipelines.pipeline import GrapheneRun

    if tick.status == TickStatus.SUCCESS:
        if tick.run_ids and graphene_info.context.instance.has_run(
                tick.run_ids[0]):
            record = graphene_info.context.instance.get_run_records(
                PipelineRunsFilter(run_ids=[tick.run_ids[0]]))[0]
            return GrapheneScheduleTickSuccessData(run=GrapheneRun(record))
        return GrapheneScheduleTickSuccessData(run=None)
    elif tick.status == TickStatus.FAILURE:
        error = tick.error
        return GrapheneScheduleTickFailureData(error=error)
Example #14
    def test_fetch_run_filter(self, storage):
        assert storage
        one = make_new_run_id()
        two = make_new_run_id()

        storage.add_run(
            TestRunStorage.build_run(
                run_id=one, pipeline_name="some_pipeline", status=PipelineRunStatus.SUCCESS,
            )
        )
        storage.add_run(
            TestRunStorage.build_run(
                run_id=two, pipeline_name="some_pipeline", status=PipelineRunStatus.SUCCESS,
            ),
        )

        assert len(storage.get_runs()) == 2

        some_runs = storage.get_runs(PipelineRunsFilter(run_ids=[one, two]))
        count = storage.get_runs_count(PipelineRunsFilter(run_ids=[one, two]))
        assert len(some_runs) == 2
        assert count == 2
Example #15
def get_in_progress_runs_by_step(graphene_info, job_names, step_keys):
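    # For each step key, collect the pending runs currently executing it and those that have not started it yet.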
    from ..schema.pipelines.pipeline import GrapheneInProgressRunsByStep, GrapheneRun

    instance = graphene_info.context.instance

    in_progress_records = []
    for job_name in job_names:
        in_progress_records.extend(
            instance.get_run_records(
                PipelineRunsFilter(pipeline_name=job_name, statuses=PENDING_STATUSES)
            )
        )

    in_progress_runs_by_step = defaultdict(list)
    unstarted_runs_by_step = defaultdict(list)

    for record in in_progress_records:
        run = record.pipeline_run

        asset_names = graphene_info.context.instance.get_execution_plan_snapshot(
            run.execution_plan_snapshot_id
        ).step_keys_to_execute

        if run.status in IN_PROGRESS_STATUSES:
            step_stats = graphene_info.context.instance.get_run_step_stats(run.run_id, step_keys)
            for step_stat in step_stats:
                if step_stat.status == StepEventStatus.IN_PROGRESS:
                    in_progress_runs_by_step[step_stat.step_key].append(GrapheneRun(record))

            for step_key in asset_names:
                # step_stats only contains stats for steps that are in progress or complete
                is_unstarted = (
                    len([step_stat for step_stat in step_stats if step_stat.step_key == step_key])
                    == 0
                )
                if is_unstarted:
                    unstarted_runs_by_step[step_key].append(GrapheneRun(record))
        else:
            # the run never began execution, all steps are unstarted
            for step_key in asset_names:
                unstarted_runs_by_step[step_key].append(GrapheneRun(record))

    all_step_keys = in_progress_runs_by_step.keys() | unstarted_runs_by_step.keys()
    return [
        GrapheneInProgressRunsByStep(
            key,
            unstarted_runs_by_step.get(key, []),
            in_progress_runs_by_step.get(key, []),
        )
        for key in all_step_keys
    ]
Example #16
def get_in_progress_runs_by_step(graphene_info, job_names, step_keys):
    from ..schema.pipelines.pipeline import GrapheneInProgressRunsByStep, GrapheneRun

    instance = graphene_info.context.instance

    in_progress_records = []
    for job_name in job_names:
        in_progress_records.extend(
            instance.get_run_records(
                PipelineRunsFilter(pipeline_name=job_name, statuses=IN_PROGRESS_STATUSES)
            )
        )

    in_progress_runs_by_step = {}
    unstarted_runs_by_step = {}

    for record in in_progress_records:
        run = record.pipeline_run
        step_stats = graphene_info.context.instance.get_run_step_stats(run.run_id, step_keys)
        for step_stat in step_stats:
            if step_stat.status == StepEventStatus.IN_PROGRESS:
                if step_stat.step_key not in in_progress_runs_by_step:
                    in_progress_runs_by_step[step_stat.step_key] = []
                in_progress_runs_by_step[step_stat.step_key].append(GrapheneRun(record))

        asset_names = graphene_info.context.instance.get_execution_plan_snapshot(
            run.execution_plan_snapshot_id
        ).step_keys_to_execute

        for step_key in asset_names:
            # step_stats only contains stats for steps that are in progress or complete
            is_unstarted = (
                len([step_stat for step_stat in step_stats if step_stat.step_key == step_key]) == 0
            )
            if is_unstarted:
                if step_key not in unstarted_runs_by_step:
                    unstarted_runs_by_step[step_key] = []
                unstarted_runs_by_step[step_key].append(GrapheneRun(record))

    step_runs = []
    for key in in_progress_runs_by_step.keys() | unstarted_runs_by_step.keys():
        step_runs.append(
            GrapheneInProgressRunsByStep(
                key,
                unstarted_runs_by_step.get(key, []),
                in_progress_runs_by_step.get(key, []),
            )
        )

    return step_runs
Example #17
    def resolve_runs(self, graphene_info, **kwargs):
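        # Use the batch run-record loader when a limit is given; otherwise query the instance with a sensor or schedule filter.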
        from .pipelines.pipeline import GrapheneRun

        if kwargs.get("limit") and self._batch_loader:
            limit = kwargs["limit"]
            records = (self._batch_loader.get_run_records_for_sensor(
                self._job_state.name, limit)
                       if self._job_state.job_type == InstigatorType.SENSOR
                       else self._batch_loader.get_run_records_for_schedule(
                           self._job_state.name, limit))
            return [GrapheneRun(record) for record in records]

        if self._job_state.job_type == InstigatorType.SENSOR:
            filters = PipelineRunsFilter.for_sensor(self._job_state)
        else:
            filters = PipelineRunsFilter.for_schedule(self._job_state)
        return [
            GrapheneRun(record)
            for record in graphene_info.context.instance.get_run_records(
                filters=filters,
                limit=kwargs.get("limit"),
            )
        ]
Example #18
    def resolve_runs(self, graphene_info):
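        # Resolve the runs launched by this tick, preserving the order of the stored run ids.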
        from .pipelines.pipeline import GrapheneRun

        instance = graphene_info.context.instance
        run_ids = self._tick.origin_run_ids or self._tick.run_ids
        if not run_ids:
            return []

        records_by_id = {
            record.pipeline_run.run_id: record
            for record in instance.get_run_records(PipelineRunsFilter(run_ids=run_ids))
        }

        return [GrapheneRun(records_by_id[run_id]) for run_id in run_ids if run_id in records_by_id]
Example #19
def test_0_12_0_add_mode_column(hostname, conn_string):
    _reconstruct_from_file(
        hostname,
        conn_string,
        file_relative_path(
            __file__,
            "snapshot_0_11_16_pre_add_mode_column/postgres/pg_dump.txt"),
    )

    with tempfile.TemporaryDirectory() as tempdir:
        with open(file_relative_path(__file__, "dagster.yaml"),
                  "r") as template_fd:
            with open(os.path.join(tempdir, "dagster.yaml"), "w") as target_fd:
                template = template_fd.read().format(hostname=hostname)
                target_fd.write(template)

        instance = DagsterInstance.from_config(tempdir)

        # Ensure that you don't get a migration required exception if not trying to use the
        # migration-required column.
        assert len(instance.get_runs()) == 1

        @solid
        def basic():
            pass

        @pipeline
        def noop_pipeline():
            basic()

        # Ensure that you don't get a migration required exception when running a pipeline
        # pre-migration.
        result = execute_pipeline(noop_pipeline, instance=instance)
        assert result.success
        assert len(instance.get_runs()) == 2

        # Ensure that migration required exception throws, since you are trying to use the
        # migration-required column.
        with pytest.raises(
                DagsterInstanceMigrationRequired,
                match=_migration_regex("run", current_revision="7cba9eeaaf1d"),
        ):
            instance.get_runs(filters=PipelineRunsFilter(mode="the_mode"))

        instance.upgrade()

        result = execute_pipeline(noop_pipeline, instance=instance)
        assert result.success
        assert len(instance.get_runs()) == 3
Example #20
    def test_fetch_by_pipeline(self, storage):
        assert storage
        one = make_new_run_id()
        two = make_new_run_id()
        storage.add_run(
            TestRunStorage.build_run(run_id=one,
                                     pipeline_name="some_pipeline"))
        storage.add_run(
            TestRunStorage.build_run(run_id=two,
                                     pipeline_name="some_other_pipeline"))
        assert len(storage.get_runs()) == 2
        some_runs = storage.get_runs(
            PipelineRunsFilter(pipeline_name="some_pipeline"))
        assert len(some_runs) == 1
        assert some_runs[0].run_id == one
Example #21
    def to_selector(self):
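        # Translate the GraphQL filter input into a PipelineRunsFilter understood by the run storage.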
        if self.status:
            status = PipelineRunStatus[self.status]
        else:
            status = None

        if self.tags:
            # We are wrapping self.tags in a list because dauphin.List is not marked as iterable
            tags = {tag['key']: tag['value'] for tag in list(self.tags)}
        else:
            tags = None

        run_ids = [self.run_id] if self.run_id else []
        return PipelineRunsFilter(
            run_ids=run_ids, pipeline_name=self.pipeline_name, tags=tags, status=status,
        )
Example #22
def last_empty_partition(context, partition_set_def):
    check.inst_param(context, 'context', ScheduleExecutionContext)
    partition_set_def = check.inst_param(
        partition_set_def, 'partition_set_def', PartitionSetDefinition
    )
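    # Walk the partitions from newest to oldest and return the first one with no successful run.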
    partitions = partition_set_def.get_partitions()
    if not partitions:
        return None
    selected = None
    for partition in reversed(partitions):
        filters = PipelineRunsFilter.for_partition(partition_set_def, partition)
        matching = context.instance.get_runs(filters)
        if not any(run.status == PipelineRunStatus.SUCCESS for run in matching):
            selected = partition
            break
    return selected
Example #23
def get_in_progress_runs_for_job(graphene_info, job_name):
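    # Any run of this job that has not yet reached a terminal state counts as "in progress".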
    instance = graphene_info.context.instance

    in_progress_runs_filter = PipelineRunsFilter(
        pipeline_name=job_name,
        statuses=[
            PipelineRunStatus.STARTING,
            PipelineRunStatus.MANAGED,
            PipelineRunStatus.NOT_STARTED,
            PipelineRunStatus.QUEUED,
            PipelineRunStatus.STARTED,
            PipelineRunStatus.CANCELING,
        ],
    )

    return instance.get_runs(in_progress_runs_filter)
Example #24
def _fetch_last_run(instance, external_partition_set, partition_name):
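    # Return the most recent run for this partition of the partition set, or None if it has never run.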
    check.inst_param(instance, "instance", DagsterInstance)
    check.inst_param(external_partition_set, "external_partition_set", ExternalPartitionSet)
    check.str_param(partition_name, "partition_name")

    runs = instance.get_runs(
        PipelineRunsFilter(
            pipeline_name=external_partition_set.pipeline_name,
            tags={
                PARTITION_SET_TAG: external_partition_set.name,
                PARTITION_NAME_TAG: partition_name,
            },
        ),
        limit=1,
    )

    return runs[0] if runs else None
Example #25
    def test_by_job(self, storage):
        if not storage.supports_bucket_queries:
            pytest.skip("storage cannot bucket")

        def _add_run(job_name, tags=None):
            return storage.add_run(
                TestRunStorage.build_run(
                    pipeline_name=job_name, run_id=make_new_run_id(), tags=tags
                )
            )

        _a_one = _add_run("a_pipeline", tags={"a": "A"})
        a_two = _add_run("a_pipeline", tags={"a": "A"})
        _b_one = _add_run("b_pipeline", tags={"a": "A"})
        b_two = _add_run("b_pipeline", tags={"a": "A"})
        c_one = _add_run("c_pipeline", tags={"a": "A"})
        c_two = _add_run("c_pipeline", tags={"a": "B"})

        runs_by_job = {
            run.pipeline_name: run
            for run in storage.get_runs(
                bucket_by=JobBucket(
                    job_names=["a_pipeline", "b_pipeline", "c_pipeline"], bucket_limit=1
                )
            )
        }
        assert set(runs_by_job.keys()) == {"a_pipeline", "b_pipeline", "c_pipeline"}
        assert runs_by_job.get("a_pipeline").run_id == a_two.run_id
        assert runs_by_job.get("b_pipeline").run_id == b_two.run_id
        assert runs_by_job.get("c_pipeline").run_id == c_two.run_id

        # fetch with a runs filter applied
        runs_by_job = {
            run.pipeline_name: run
            for run in storage.get_runs(
                filters=PipelineRunsFilter(tags={"a": "A"}),
                bucket_by=JobBucket(
                    job_names=["a_pipeline", "b_pipeline", "c_pipeline"], bucket_limit=1
                ),
            )
        }
        assert set(runs_by_job.keys()) == {"a_pipeline", "b_pipeline", "c_pipeline"}
        assert runs_by_job.get("a_pipeline").run_id == a_two.run_id
        assert runs_by_job.get("b_pipeline").run_id == b_two.run_id
        assert runs_by_job.get("c_pipeline").run_id == c_one.run_id
Example #26
    def test_fetch_run_groups_filter(self, storage):
        assert storage

        root_runs = [
            TestRunStorage.build_run(run_id=make_new_run_id(),
                                     pipeline_name="foo_pipeline")
            for i in range(3)
        ]

        runs = [run for run in root_runs]
        for root_run in root_runs:
            failed_run_id = make_new_run_id()
            runs.append(
                TestRunStorage.build_run(
                    run_id=failed_run_id,
                    pipeline_name="foo_pipeline",
                    tags={
                        PARENT_RUN_ID_TAG: root_run.run_id,
                        ROOT_RUN_ID_TAG: root_run.run_id
                    },
                    status=PipelineRunStatus.FAILURE,
                ))
            for _ in range(3):
                runs.append(
                    TestRunStorage.build_run(
                        run_id=make_new_run_id(),
                        pipeline_name="foo_pipeline",
                        tags={
                            PARENT_RUN_ID_TAG: failed_run_id,
                            ROOT_RUN_ID_TAG: root_run.run_id
                        },
                    ))

        for run in runs:
            storage.add_run(run)

        run_groups = storage.get_run_groups(
            limit=5,
            filters=PipelineRunsFilter(status=PipelineRunStatus.FAILURE))

        assert len(run_groups) == 3

        for root_run_id in run_groups:
            assert len(run_groups[root_run_id]["runs"]) == 2
            assert run_groups[root_run_id]["count"] == 5
Example #27
def get_latest_asset_run_by_step_key(graphene_info, asset_nodes):
    from ..schema.pipelines.pipeline import (
        GrapheneLatestRun,
        GrapheneRun,
    )

    # This method returns the latest run that has occurred for a given step.
    # Because it is expensive to deserialize PipelineRun objects, we limit this
    # query to retrieving the last 5 runs per job. If no runs have occurred, we return
    # a GrapheneLatestRun object with no run. If none of the latest runs contain the
    # step key, we return None.

    instance = graphene_info.context.instance

    latest_run_by_step: Dict[str, PipelineRun] = {}

    for asset_node in asset_nodes:
        job_names = asset_node.job_names
        step_key = asset_node.op_name

        run_records = []
        for job_name in job_names:
            run_records.extend(
                instance.get_run_records(PipelineRunsFilter(pipeline_name=job_name), limit=5)
            )

        if len(run_records) == 0:
            latest_run_by_step[step_key] = GrapheneLatestRun(step_key, None)

        latest_run = None
        for record in run_records:
            run = record.pipeline_run

            asset_names = graphene_info.context.instance.get_execution_plan_snapshot(
                run.execution_plan_snapshot_id
            ).step_keys_to_execute

            if step_key in asset_names:
                if latest_run is None or record.create_timestamp > latest_run.create_timestamp:
                    latest_run = record
        if latest_run:
            latest_run_by_step[step_key] = GrapheneLatestRun(step_key, GrapheneRun(latest_run))

    return latest_run_by_step
Example #28
def poll_for_finished_run(instance, run_id=None, timeout=20, run_tags=None):
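    # Poll the run storage until a matching run reaches a terminal state, raising if `timeout` seconds elapse first.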
    total_time = 0
    interval = 0.01

    filters = PipelineRunsFilter(
        run_ids=[run_id] if run_id else None,
        tags=run_tags,
        statuses=[PipelineRunStatus.SUCCESS, PipelineRunStatus.FAILURE, PipelineRunStatus.CANCELED],
    )

    while True:
        runs = instance.get_runs(filters, limit=1)
        if runs:
            return runs[0]
        else:
            time.sleep(interval)
            total_time += interval
            if total_time > timeout:
                raise Exception("Timed out")
Example #29
def _get_or_create_sensor_run(context, instance, repo_location,
                              external_sensor, external_pipeline, run_request):
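    # Run keys make sensor runs idempotent: if a run with this key already exists, reuse it
    # (or skip it if it already executed) instead of creating a new one.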

    if not run_request.run_key:
        return _create_sensor_run(context, instance, repo_location,
                                  external_sensor, external_pipeline,
                                  run_request)

    existing_runs = instance.get_runs(
        PipelineRunsFilter(tags=merge_dicts(
            PipelineRun.tags_for_sensor(external_sensor),
            {RUN_KEY_TAG: run_request.run_key},
        )))

    if len(existing_runs):
        check.invariant(len(existing_runs) == 1)
        run = existing_runs[0]
        if run.status != PipelineRunStatus.NOT_STARTED:
            # A run already exists and was launched for this time period,
            # but the scheduler must have crashed before the tick could be put
            # into a SUCCESS state

            context.logger.info(
                f"Run {run.run_id} already completed with the run key "
                f"`{run_request.run_key}` for {external_sensor.name}")
            context.add_state(
                JobTickStatus.SUCCESS,
                run_id=run.run_id,
                run_key=run_request.run_key,
            )

            return None
        else:
            context.logger.info(
                f"Run {run.run_id} already created with the run key "
                f"`{run_request.run_key}` for {external_sensor.name}")
            return run

    context.logger.info(f"Creating new run for {external_sensor.name}")

    return _create_sensor_run(context, instance, repo_location,
                              external_sensor, external_pipeline, run_request)
Example #30
    def test_by_tag(self, storage):
        if not storage.supports_bucket_queries:
            pytest.skip("storage cannot bucket")

        def _add_run(job_name, tags=None):
            return storage.add_run(
                TestRunStorage.build_run(
                    pipeline_name=job_name, run_id=make_new_run_id(), tags=tags
                )
            )

        _one = _add_run("a", tags={"a": "1"})
        _two = _add_run("a", tags={"a": "2"})
        three = _add_run("a", tags={"a": "3"})
        _none = _add_run("a")
        b = _add_run("b", tags={"a": "4"})
        one = _add_run("a", tags={"a": "1"})
        two = _add_run("a", tags={"a": "2"})

        runs_by_tag = {
            run.tags.get("a"): run
            for run in storage.get_runs(
                bucket_by=TagBucket(tag_key="a", tag_values=["1", "2", "3", "4"], bucket_limit=1)
            )
        }
        assert set(runs_by_tag.keys()) == {"1", "2", "3", "4"}
        assert runs_by_tag.get("1").run_id == one.run_id
        assert runs_by_tag.get("2").run_id == two.run_id
        assert runs_by_tag.get("3").run_id == three.run_id
        assert runs_by_tag.get("4").run_id == b.run_id

        runs_by_tag = {
            run.tags.get("a"): run
            for run in storage.get_runs(
                filters=PipelineRunsFilter(pipeline_name="a"),
                bucket_by=TagBucket(tag_key="a", tag_values=["1", "2", "3", "4"], bucket_limit=1),
            )
        }
        assert set(runs_by_tag.keys()) == {"1", "2", "3"}
        assert runs_by_tag.get("1").run_id == one.run_id
        assert runs_by_tag.get("2").run_id == two.run_id
        assert runs_by_tag.get("3").run_id == three.run_id