Beispiel #1
0
def submit_backfill_runs(instance, repo_location, backfill_job, partition_names=None):
    """Create and submit one run per partition of a backfill job.

    Args:
        instance: The ``DagsterInstance`` used to submit the created runs.
        repo_location: The ``RepositoryLocation`` expected to contain the
            repository named by the backfill's partition set origin.
        backfill_job: The ``PartitionBackfill`` describing the backfill.
        partition_names: Optional partition names to submit. When falsy
            (``None`` or empty), ``backfill_job.partition_names`` is used.

    Returns:
        list: The run ids that were submitted, in submission order.
        Partitions for which ``create_backfill_run`` returns a falsy value
        are skipped and contribute no run id.

    Raises:
        DagsterBackfillFailedError: If fetching the partition set execution
            data returned an ``ExternalPartitionExecutionErrorData`` payload.
    """
    check.inst_param(instance, "instance", DagsterInstance)
    check.inst_param(repo_location, "repo_location", RepositoryLocation)
    check.inst_param(backfill_job, "backfill_job", PartitionBackfill)

    partition_set_origin = backfill_job.partition_set_origin
    repo_name = partition_set_origin.external_repository_origin.repository_name

    if not partition_names:
        partition_names = backfill_job.partition_names

    check.invariant(
        repo_location.has_repository(repo_name),
        "Could not find repository {repo_name} in location {repo_location_name}".format(
            repo_name=repo_name, repo_location_name=repo_location.name
        ),
    )
    external_repo = repo_location.get_repository(repo_name)
    partition_set_name = partition_set_origin.partition_set_name
    external_partition_set = external_repo.get_external_partition_set(partition_set_name)

    result = repo_location.get_external_partition_set_execution_param_data(
        external_repo.handle, partition_set_name, partition_names
    )
    if isinstance(result, ExternalPartitionExecutionErrorData):
        # Use the `serialized_error_info` keyword, matching the other
        # DagsterBackfillFailedError raise sites; the previous spelling
        # (`serializable_error_info`) did not match them.
        raise DagsterBackfillFailedError(serialized_error_info=result.error)

    assert isinstance(result, ExternalPartitionSetExecutionParamData)
    external_pipeline = external_repo.get_full_external_pipeline(
        external_partition_set.pipeline_name
    )

    submitted = []
    for partition_data in result.partition_data:
        pipeline_run = create_backfill_run(
            instance,
            repo_location,
            external_pipeline,
            external_partition_set,
            backfill_job,
            partition_data,
        )
        if not pipeline_run:
            # Runs are skipped in certain cases, e.g. a `from_failure` backfill
            # whose partition has had a successful run since the backfill was
            # scheduled.
            continue
        instance.submit_run(pipeline_run.run_id, external_pipeline)
        submitted.append(pipeline_run.run_id)
    return submitted
Beispiel #2
0
def _execute_backfill_command_at_location(cli_args, print_fn, instance, workspace, repo_location):
    """Pick a partition set from the CLI arguments and launch a backfill over it.

    The repository and pipeline are looked up from ``cli_args``; the partition
    set is taken from ``cli_args["partition_set"]``, chosen automatically when
    the pipeline has exactly one, or prompted for otherwise. After the user
    confirms (or ``noprompt`` is set), a ``PartitionBackfill`` is built, one
    run per partition is created and submitted, and the backfill is recorded
    on the instance as COMPLETED.

    Args:
        cli_args: Mapping of parsed command-line options.
        print_fn: Callable used for all user-facing output.
        instance: Instance used to submit runs and record the backfill.
        workspace: Passed through to ``instance.submit_run``.
        repo_location: Repository location queried for partition data.

    Returns:
        ``None``, except when fetching the partition execution data raises:
        then the backfill is recorded as FAILED and the return value of
        ``print_fn`` is returned.

    Raises:
        click.UsageError: If no usable partition set can be determined.
        DagsterBackfillFailedError: If fetching the partition names raises.
    """
    repository = get_external_repository_from_repo_location(
        repo_location, cli_args.get("repository")
    )
    pipeline = get_external_pipeline_from_external_repo(repository, cli_args.get("pipeline"))
    skip_prompts = cli_args.get("noprompt")

    # Index the partition sets that target the selected pipeline by name.
    candidates = {}
    for candidate in repository.get_external_partition_sets():
        if candidate.pipeline_name == pipeline.name:
            candidates[candidate.name] = candidate

    if not candidates:
        raise click.UsageError("No partition sets found for pipeline `{}`".format(pipeline.name))

    partition_set_name = cli_args.get("partition_set")
    if not partition_set_name:
        if len(candidates) == 1:
            # Only one option, so no need to ask.
            partition_set_name = next(iter(candidates))
        elif skip_prompts:
            raise click.UsageError("No partition set specified (see option `--partition-set`)")
        else:
            prompt_text = "Select a partition set to use for backfill: {}".format(
                ", ".join(candidates)
            )
            partition_set_name = click.prompt(prompt_text)

    partition_set = candidates.get(partition_set_name)
    if not partition_set:
        raise click.UsageError("No partition set found named `{}`".format(partition_set_name))

    run_tags = get_tags_from_args(cli_args)
    repo_handle = RepositoryHandle(
        repository_name=repository.name,
        repository_location=repo_location,
    )

    try:
        names_result = repo_location.get_external_partition_names(repo_handle, partition_set_name)
    except Exception:  # pylint: disable=broad-except
        error_info = serializable_error_info_from_exc_info(sys.exc_info())
        failure_text = (
            "Failure fetching partition names for {partition_set_name}: {error_message}".format(
                partition_set_name=partition_set_name,
                error_message=error_info.message,
            )
        )
        raise DagsterBackfillFailedError(failure_text, serialized_error_info=error_info)

    partition_names = gen_partition_names_from_args(names_result.partition_names, cli_args)

    # Summarize what is about to be launched.
    print_fn("\n     Pipeline: {}".format(pipeline.name))
    print_fn("Partition set: {}".format(partition_set_name))
    print_fn("   Partitions: {}\n".format(print_partition_format(partition_names, indent_level=15)))

    # Bail out early unless prompts are disabled or the user confirms.
    if not skip_prompts and not click.confirm(
        "Do you want to proceed with the backfill ({} partitions)?".format(len(partition_names))
    ):
        print_fn("Aborted!")
        return

    print_fn("Launching runs... ")

    backfill_id = make_new_backfill_id()
    backfill_job = PartitionBackfill(
        backfill_id=backfill_id,
        partition_set_origin=partition_set.get_external_origin(),
        status=BulkActionStatus.REQUESTED,
        partition_names=partition_names,
        from_failure=False,
        reexecution_steps=None,
        tags=run_tags,
        backfill_timestamp=pendulum.now("UTC").timestamp(),
    )

    try:
        execution_data = repo_location.get_external_partition_set_execution_param_data(
            repository_handle=repo_handle,
            partition_set_name=partition_set_name,
            partition_names=partition_names,
        )
    except Exception:  # pylint: disable=broad-except
        error_info = serializable_error_info_from_exc_info(sys.exc_info())
        failed_job = backfill_job.with_status(BulkActionStatus.FAILED).with_error(error_info)
        instance.add_backfill(failed_job)
        return print_fn("Backfill failed: {}".format(error_info))

    assert isinstance(execution_data, ExternalPartitionSetExecutionParamData)

    for partition_data in execution_data.partition_data:
        created_run = create_backfill_run(
            instance,
            repo_location,
            pipeline,
            partition_set,
            backfill_job,
            partition_data,
        )
        # A falsy result means no run was created for this partition.
        if created_run:
            instance.submit_run(created_run.run_id, workspace)

    instance.add_backfill(backfill_job.with_status(BulkActionStatus.COMPLETED))

    print_fn("Launched backfill job `{}`".format(backfill_id))
Beispiel #3
0
def execute_backfill_iteration(instance,
                               workspace,
                               logger,
                               debug_crash_flags=None):
    """Generator that processes every backfill currently in REQUESTED status.

    For each requested backfill, the repository and partition set are
    validated against the workspace, then partitions are submitted chunk by
    chunk. After each chunk the backfill is either checkpointed (more
    partitions remain) or marked COMPLETED. Any exception raised while
    processing a backfill marks it FAILED, logs the error, and moves on to
    the next backfill.

    Args:
        instance: ``DagsterInstance`` used to read and update backfills.
        workspace: ``IWorkspace`` used to resolve the repository location.
        logger: Logger receiving progress and error messages.
        debug_crash_flags: Passed to ``_check_for_debug_crash`` before and
            after run submission; its semantics are defined by that helper.

    Yields:
        ``None`` when there are no requested backfills, after each submitted
        run, after each checkpoint update, and after a backfill completes;
        the error info object produced by
        ``serializable_error_info_from_exc_info`` when a backfill fails.
    """
    check.inst_param(instance, "instance", DagsterInstance)
    check.inst_param(workspace, "workspace", IWorkspace)

    backfill_jobs = instance.get_backfills(status=BulkActionStatus.REQUESTED)

    if not backfill_jobs:
        logger.info("No backfill jobs requested.")
        # Yield once so the consumer still gets one iteration step.
        yield
        return

    for backfill_job in backfill_jobs:
        backfill_id = backfill_job.backfill_id

        # A previously recorded checkpoint means this backfill is being resumed.
        if not backfill_job.last_submitted_partition_name:
            logger.info(f"Starting backfill for {backfill_id}")
        else:
            logger.info(
                f"Resuming backfill for {backfill_id} from {backfill_job.last_submitted_partition_name}"
            )

        origin = (backfill_job.partition_set_origin.external_repository_origin.
                  repository_location_origin)

        try:
            # Validate that the target repository and partition set still exist
            # before submitting anything.
            repo_location = workspace.get_location(origin)
            repo_name = backfill_job.partition_set_origin.external_repository_origin.repository_name
            partition_set_name = backfill_job.partition_set_origin.partition_set_name
            if not repo_location.has_repository(repo_name):
                raise DagsterBackfillFailedError(
                    f"Could not find repository {repo_name} in location {repo_location.name} to "
                    f"run backfill {backfill_id}.")
            external_repo = repo_location.get_repository(repo_name)
            if not external_repo.has_external_partition_set(
                    partition_set_name):
                raise DagsterBackfillFailedError(
                    f"Could not find partition set {partition_set_name} in repository {repo_name}. "
                )

            has_more = True
            while has_more:
                # Refetch the backfill job on every pass so a status change made
                # elsewhere is observed; stop as soon as it is no longer REQUESTED.
                backfill_job = instance.get_backfill(backfill_job.backfill_id)
                if backfill_job.status != BulkActionStatus.REQUESTED:
                    break

                # presumably `chunk` is the next batch of partition names and
                # `checkpoint` the value to persist for resuming — confirm
                # against _get_partitions_chunk.
                chunk, checkpoint, has_more = _get_partitions_chunk(
                    instance, logger, backfill_job, CHECKPOINT_COUNT)
                _check_for_debug_crash(debug_crash_flags, "BEFORE_SUBMIT")

                if chunk:

                    # NOTE(review): `workspace` is passed as the second positional
                    # argument — confirm this matches the signature of the
                    # submit_backfill_runs that is in scope here.
                    for _run_id in submit_backfill_runs(
                            instance, workspace, repo_location, backfill_job,
                            chunk):
                        # One step per submitted run.
                        yield

                _check_for_debug_crash(debug_crash_flags, "AFTER_SUBMIT")

                if has_more:
                    # Persist progress, then pause before fetching the next chunk.
                    instance.update_backfill(
                        backfill_job.with_partition_checkpoint(checkpoint))
                    yield
                    time.sleep(CHECKPOINT_INTERVAL)
                else:
                    logger.info(
                        f"Backfill completed for {backfill_id} for {len(backfill_job.partition_names)} partitions"
                    )
                    instance.update_backfill(
                        backfill_job.with_status(BulkActionStatus.COMPLETED))
                    yield
        except Exception:
            # Record the failure on the backfill (the most recently fetched
            # `backfill_job`), log it, and hand the error info to the consumer.
            error_info = serializable_error_info_from_exc_info(sys.exc_info())
            instance.update_backfill(
                backfill_job.with_status(
                    BulkActionStatus.FAILED).with_error(error_info))
            logger.error(
                f"Backfill failed for {backfill_id}: {error_info.to_string()}")
            yield error_info
Beispiel #4
0
def _execute_backfill_command_at_location(cli_args, print_fn, instance, repo_location):
    """Pick a partition set from the CLI arguments and launch one run per partition.

    The repository and pipeline are looked up from ``cli_args``; the partition
    set is taken from ``cli_args["partition_set"]``, chosen automatically when
    the pipeline has exactly one, or prompted for otherwise. After the user
    confirms (or ``noprompt`` is set), the execution data for all selected
    partitions is fetched in a single call and a run is created and launched
    for each partition.

    Args:
        cli_args: Mapping of parsed command-line options.
        print_fn: Callable used for all user-facing output.
        instance: Instance used to launch the created runs.
        repo_location: Repository location queried for partition data.

    Returns:
        ``None``, except when the partition execution data comes back as an
        error payload: then the return value of ``print_fn`` is returned.

    Raises:
        click.UsageError: If no usable partition set can be determined.
        DagsterBackfillFailedError: If fetching the partition names returned
            an error payload.
    """
    repository = get_external_repository_from_repo_location(
        repo_location, cli_args.get("repository")
    )
    pipeline = get_external_pipeline_from_external_repo(repository, cli_args.get("pipeline"))
    skip_prompts = cli_args.get("noprompt")

    # Index the partition sets that target the selected pipeline by name.
    candidates = {}
    for candidate in repository.get_external_partition_sets():
        if candidate.pipeline_name == pipeline.name:
            candidates[candidate.name] = candidate

    if not candidates:
        raise click.UsageError("No partition sets found for pipeline `{}`".format(pipeline.name))

    partition_set_name = cli_args.get("partition_set")
    if not partition_set_name:
        if len(candidates) == 1:
            # Only one option, so no need to ask.
            partition_set_name = next(iter(candidates))
        elif skip_prompts:
            raise click.UsageError("No partition set specified (see option `--partition-set`)")
        else:
            prompt_text = "Select a partition set to use for backfill: {}".format(
                ", ".join(candidates)
            )
            partition_set_name = click.prompt(prompt_text)

    partition_set = candidates.get(partition_set_name)
    if not partition_set:
        raise click.UsageError("No partition set found named `{}`".format(partition_set_name))

    mode = partition_set.mode
    solid_selection = partition_set.solid_selection
    run_tags = get_tags_from_args(cli_args)

    repo_handle = RepositoryHandle(
        repository_name=repository.name,
        repository_location_handle=repo_location.location_handle,
    )

    # Work out which partitions to backfill.
    names_result = repo_location.get_external_partition_names(repo_handle, partition_set_name)
    if isinstance(names_result, ExternalPartitionExecutionErrorData):
        fetch_error = names_result.error
        raise DagsterBackfillFailedError(
            "Failure fetching partition names for {partition_set_name}: {error_message}".format(
                partition_set_name=partition_set_name,
                error_message=fetch_error.message,
            ),
            serialized_error_info=fetch_error,
        )

    partition_names = gen_partition_names_from_args(names_result.partition_names, cli_args)

    # Summarize what is about to be launched.
    print_fn("\n     Pipeline: {}".format(pipeline.name))
    print_fn("Partition set: {}".format(partition_set_name))
    print_fn("   Partitions: {}\n".format(print_partition_format(partition_names, indent_level=15)))

    # Bail out early unless prompts are disabled or the user confirms.
    if not skip_prompts and not click.confirm(
        "Do you want to proceed with the backfill ({} partitions)?".format(len(partition_names))
    ):
        print_fn("Aborted!")
        return

    print_fn("Launching runs... ")

    backfill_id = make_new_backfill_id()
    backfill_tags = PipelineRun.tags_for_backfill_id(backfill_id)
    execution_data = repo_location.get_external_partition_set_execution_param_data(
        repository_handle=repo_handle,
        partition_set_name=partition_set_name,
        partition_names=partition_names,
    )

    if isinstance(execution_data, ExternalPartitionExecutionErrorData):
        return print_fn("Backfill failed: {}".format(execution_data.error))

    assert isinstance(execution_data, ExternalPartitionSetExecutionParamData)

    for partition_data in execution_data.partition_data:
        run_config = partition_data.run_config
        # Later arguments to merge_dicts are applied on top of earlier ones:
        # partition tags, then backfill tags, then CLI tags.
        partition_and_backfill_tags = merge_dicts(partition_data.tags, backfill_tags)
        tags = merge_dicts(partition_and_backfill_tags, run_tags)
        if solid_selection:
            selection = frozenset(solid_selection)
        else:
            selection = None

        run = _create_external_pipeline_run(
            instance=instance,
            repo_location=repo_location,
            external_repo=repository,
            external_pipeline=pipeline,
            run_config=run_config,
            mode=mode,
            preset=None,
            tags=tags,
            solid_selection=selection,
        )
        instance.launch_run(run.run_id, pipeline)

    print_fn("Launched backfill job `{}`".format(backfill_id))
Beispiel #5
0
def execute_backfill_command(cli_args, print_fn, instance=None):
    """Run the backfill CLI command: pick a partition set and launch its runs.

    The repository location, repository and pipeline are looked up from
    ``cli_args``; the partition set is taken from ``cli_args['partition_set']``,
    chosen automatically when the pipeline has exactly one, or prompted for
    otherwise. After the user confirms (or ``noprompt`` is set), the run
    config and tags are fetched for each partition in turn and a run is
    created and launched for it.

    Args:
        cli_args: Mapping of parsed command-line options.
        print_fn: Callable used for all user-facing output.
        instance: Instance used to launch runs; when falsy,
            ``DagsterInstance.get()`` is used.

    Raises:
        click.UsageError: If no usable partition set can be determined.
        DagsterBackfillFailedError: If fetching partition names, a
            partition's run config, or a partition's tags returned an error
            payload.
    """
    if not instance:
        instance = DagsterInstance.get()

    repo_location = get_repository_location_from_kwargs(cli_args, instance)
    repository = get_external_repository_from_repo_location(
        repo_location, cli_args.get('repository'))
    pipeline = get_external_pipeline_from_external_repo(
        repository, cli_args.get('pipeline'))
    skip_prompts = cli_args.get('noprompt')

    # Index the partition sets that target the selected pipeline by name.
    candidates = {}
    for candidate in repository.get_external_partition_sets():
        if candidate.pipeline_name == pipeline.name:
            candidates[candidate.name] = candidate

    if not candidates:
        raise click.UsageError(
            'No partition sets found for pipeline `{}`'.format(pipeline.name))

    partition_set_name = cli_args.get('partition_set')
    if not partition_set_name:
        if len(candidates) == 1:
            # Only one option, so no need to ask.
            partition_set_name = next(iter(candidates))
        elif skip_prompts:
            raise click.UsageError(
                'No partition set specified (see option `--partition-set`)')
        else:
            prompt_text = 'Select a partition set to use for backfill: {}'.format(
                ', '.join(candidates))
            partition_set_name = click.prompt(prompt_text)

    partition_set = candidates.get(partition_set_name)
    if not partition_set:
        raise click.UsageError(
            'No partition set found named `{}`'.format(partition_set_name))

    mode = partition_set.mode
    solid_selection = partition_set.solid_selection

    repo_handle = RepositoryHandle(
        repository_name=repository.name,
        repository_location_handle=repo_location.location_handle,
    )

    # Work out which partitions to backfill.
    names_result = repo_location.get_external_partition_names(
        repo_handle, partition_set_name)
    if isinstance(names_result, ExternalPartitionExecutionErrorData):
        names_error = names_result.error
        raise DagsterBackfillFailedError(
            'Failure fetching partition names for {partition_set_name}: {error_message}'
            .format(
                partition_set_name=partition_set_name,
                error_message=names_error.message,
            ),
            serialized_error_info=names_error,
        )

    partition_names = gen_partition_names_from_args(
        names_result.partition_names, cli_args)

    # Summarize what is about to be launched.
    print_fn('\n     Pipeline: {}'.format(pipeline.name))
    print_fn('Partition set: {}'.format(partition_set_name))
    print_fn('   Partitions: {}\n'.format(
        print_partition_format(partition_names, indent_level=15)))

    # Bail out early unless prompts are disabled or the user confirms.
    if not skip_prompts and not click.confirm(
            'Do you want to proceed with the backfill ({} partitions)?'.format(
                len(partition_names))):
        print_fn('Aborted!')
        return

    print_fn('Launching runs... ')
    backfill_id = make_new_backfill_id()

    # Tags shared by every run of this backfill; CLI tags are merged last.
    run_tags = merge_dicts(
        PipelineRun.tags_for_backfill_id(backfill_id),
        get_tags_from_args(cli_args),
    )

    for partition_name in partition_names:
        config_result = repo_location.get_external_partition_config(
            repo_handle, partition_set_name, partition_name)
        if isinstance(config_result, ExternalPartitionExecutionErrorData):
            config_error = config_result.error
            raise DagsterBackfillFailedError(
                'Failure fetching run config for partition {partition_name} in {partition_set_name}: {error_message}'
                .format(
                    partition_name=partition_name,
                    partition_set_name=partition_set_name,
                    error_message=config_error.message,
                ),
                serialized_error_info=config_error,
            )

        tags_result = repo_location.get_external_partition_tags(
            repo_handle, partition_set_name, partition_name)
        if isinstance(tags_result, ExternalPartitionExecutionErrorData):
            tags_error = tags_result.error
            raise DagsterBackfillFailedError(
                'Failure fetching tags for partition {partition_name} in {partition_set_name}: {error_message}'
                .format(
                    partition_name=partition_name,
                    partition_set_name=partition_set_name,
                    error_message=tags_error.message,
                ),
                serialized_error_info=tags_error,
            )

        run_config = config_result.run_config
        tags = merge_dicts(tags_result.tags, run_tags)
        if solid_selection:
            selection = frozenset(solid_selection)
        else:
            selection = None

        run = _create_external_pipeline_run(
            instance=instance,
            repo_location=repo_location,
            external_repo=repository,
            external_pipeline=pipeline,
            run_config=run_config,
            mode=mode,
            preset=None,
            tags=tags,
            solid_selection=selection,
        )

        instance.launch_run(run.run_id, pipeline)
        # Remove once we can handle synchronous execution... currently limited by sqlite
        time.sleep(0.1)

    print_fn('Launched backfill job `{}`'.format(backfill_id))