Example No. 1
def _get_partitions_chunk(instance, logger, backfill_job, chunk_size):
    check.inst_param(backfill_job, "backfill_job", PartitionBackfill)
    partition_names = backfill_job.partition_names
    checkpoint = backfill_job.last_submitted_partition_name

    # resume after the last submitted partition, if any
    if checkpoint and checkpoint in partition_names:
        index = partition_names.index(checkpoint)
        partition_names = partition_names[index + 1:]

    # for idempotence, fetch all runs with the current backfill id
    backfill_runs = instance.get_runs(
        PipelineRunsFilter(
            tags=PipelineRun.tags_for_backfill_id(backfill_job.backfill_id)))
    completed_partitions = set(
        run.tags.get(PARTITION_NAME_TAG) for run in backfill_runs)
    has_more = chunk_size < len(partition_names)
    partitions_chunk = partition_names[:chunk_size]
    next_checkpoint = partitions_chunk[-1]

    to_skip = set(partitions_chunk).intersection(completed_partitions)
    if to_skip:
        logger.info(
            f"Found {len(to_skip)} existing runs for backfill {backfill_job.backfill_id}, skipping"
        )
    to_submit = [
        partition_name for partition_name in partitions_chunk
        if partition_name not in completed_partitions
    ]
    return to_submit, next_checkpoint, has_more
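
A standalone sketch of the checkpoint-and-chunk arithmetic above, using plain lists so it runs without a Dagster instance (the partition names are made up):

partition_names = ["2021-01-01", "2021-01-02", "2021-01-03", "2021-01-04"]
checkpoint = "2021-01-02"  # last_submitted_partition_name
chunk_size = 2

start = partition_names.index(checkpoint) + 1 if checkpoint in partition_names else 0
remaining = partition_names[start:]
has_more = chunk_size < len(remaining)
chunk = remaining[:chunk_size]
print(chunk, has_more)  # ['2021-01-03', '2021-01-04'] False
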
Example No. 2
def execute_partition_set(partition_set, partition_filter, instance=None):
    '''Programmatically perform a backfill over a partition set

    Arguments:
        partition_set (PartitionSet): The base partition set to run the backfill over
        partition_filter (Callable[[List[Partition]], List[Partition]]): A function that takes
            a list of partitions and returns a filtered list of partitions to run the backfill
            over.
        instance (DagsterInstance): The instance to use to perform the backfill
    '''
    check.inst_param(partition_set, 'partition_set', PartitionSetDefinition)
    check.callable_param(partition_filter, 'partition_filter')
    check.opt_inst_param(instance, 'instance', DagsterInstance)

    candidate_partitions = partition_set.get_partitions()
    partitions = partition_filter(candidate_partitions)

    instance = instance or DagsterInstance.ephemeral()

    for partition in partitions:
        run = PipelineRun(
            pipeline_name=partition_set.pipeline_name,
            run_id=make_new_run_id(),
            selector=ExecutionSelector(partition_set.pipeline_name),
            environment_dict=partition_set.environment_dict_for_partition(
                partition),
            mode='default',
            tags=merge_dicts(
                PipelineRun.tags_for_backfill_id(make_new_backfill_id()),
                partition_set.tags_for_partition(partition),
            ),
            status=PipelineRunStatus.NOT_STARTED,
        )

        # Remove once we can handle synchronous execution... currently limited by sqlite
        time.sleep(0.1)

        instance.launch_run(run)
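
A hypothetical invocation sketch: `my_partition_set` stands in for a PartitionSetDefinition defined elsewhere, and the filter keeps only the last seven partitions:

from dagster import DagsterInstance

def keep_last_seven(partitions):
    return partitions[-7:]

execute_partition_set(
    my_partition_set,  # assumed: a PartitionSetDefinition defined elsewhere
    partition_filter=keep_last_seven,
    instance=DagsterInstance.ephemeral(),
)
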
Example No. 3
def create_backfill_run(instance, repo_location, external_pipeline,
                        external_partition_set, backfill_job, partition_data):
    check.inst_param(instance, "instance", DagsterInstance)
    check.inst_param(repo_location, "repo_location", RepositoryLocation)
    check.inst_param(external_pipeline, "external_pipeline", ExternalPipeline)
    check.inst_param(external_partition_set, "external_partition_set",
                     ExternalPartitionSet)
    check.inst_param(backfill_job, "backfill_job", PartitionBackfill)
    check.inst_param(partition_data, "partition_data",
                     ExternalPartitionExecutionParamData)

    full_external_execution_plan = repo_location.get_external_execution_plan(
        external_pipeline,
        partition_data.run_config,
        external_partition_set.mode,
        step_keys_to_execute=None,
        known_state=None,
    )

    tags = merge_dicts(
        external_pipeline.tags,
        partition_data.tags,
        PipelineRun.tags_for_backfill_id(backfill_job.backfill_id),
        backfill_job.tags,
    )

    if not backfill_job.from_failure and not backfill_job.reexecution_steps:
        step_keys_to_execute = None
        parent_run_id = None
        root_run_id = None
        known_state = None

    elif backfill_job.from_failure:
        last_run = _fetch_last_run(instance, external_partition_set,
                                   partition_data.name)
        if not last_run or last_run.status != PipelineRunStatus.FAILURE:
            return None

        parent_run_id = last_run.run_id
        root_run_id = last_run.root_run_id or last_run.run_id
        tags = merge_dicts(
            tags,
            {
                RESUME_RETRY_TAG: "true",
                PARENT_RUN_ID_TAG: parent_run_id,
                ROOT_RUN_ID_TAG: root_run_id,
            },
        )
        step_keys_to_execute, known_state = get_retry_steps_from_execution_plan(
            instance, full_external_execution_plan, parent_run_id)
    elif backfill_job.reexecution_steps:
        last_run = _fetch_last_run(instance, external_partition_set,
                                   partition_data.name)
        parent_run_id = last_run.run_id if last_run else None
        root_run_id = (last_run.root_run_id
                       or last_run.run_id) if last_run else None
        if parent_run_id and root_run_id:
            tags = merge_dicts(tags, {
                PARENT_RUN_ID_TAG: parent_run_id,
                ROOT_RUN_ID_TAG: root_run_id
            })
        step_keys_to_execute = backfill_job.reexecution_steps
        if last_run and last_run.status == PipelineRunStatus.SUCCESS:
            known_state = KnownExecutionState.for_reexecution(
                instance.all_logs(parent_run_id),
                step_keys_to_execute,
            )
        else:
            known_state = None

    if step_keys_to_execute:
        external_execution_plan = repo_location.get_external_execution_plan(
            external_pipeline,
            partition_data.run_config,
            external_partition_set.mode,
            step_keys_to_execute=step_keys_to_execute,
            known_state=known_state,
        )
    else:
        external_execution_plan = full_external_execution_plan

    return instance.create_run(
        pipeline_snapshot=external_pipeline.pipeline_snapshot,
        execution_plan_snapshot=external_execution_plan.execution_plan_snapshot,
        parent_pipeline_snapshot=external_pipeline.parent_pipeline_snapshot,
        pipeline_name=external_pipeline.name,
        run_id=make_new_run_id(),
        solids_to_execute=frozenset(external_partition_set.solid_selection)
        if external_partition_set.solid_selection else None,
        run_config=partition_data.run_config,
        mode=external_partition_set.mode,
        step_keys_to_execute=step_keys_to_execute,
        tags=tags,
        root_run_id=root_run_id,
        parent_run_id=parent_run_id,
        status=PipelineRunStatus.NOT_STARTED,
        external_pipeline_origin=external_pipeline.get_external_origin(),
    )
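
The tag layering for a from_failure retry, reduced to a self-contained toy: later dicts win on conflicting keys, and the literal tag strings are illustrative stand-ins for the real constants.

def merge_dicts(*dicts):
    # minimal stand-in for dagster's merge_dicts
    result = {}
    for d in dicts:
        result.update(d)
    return result

tags = merge_dicts(
    {"dagster/backfill": "abc123"},       # tags_for_backfill_id(...)
    {"dagster/is_resume_retry": "true"},  # RESUME_RETRY_TAG (illustrative key)
    {"dagster/parent_run_id": "run-1",    # PARENT_RUN_ID_TAG (illustrative key)
     "dagster/root_run_id": "run-0"},     # ROOT_RUN_ID_TAG (illustrative key)
)
print(tags)
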
Example No. 4
def execute_backfill_command(cli_args, print_fn, instance=None):
    instance = instance or DagsterInstance.get()
    external_pipeline = get_external_pipeline_from_kwargs(cli_args, instance)
    external_repository = get_external_repository_from_kwargs(cli_args, instance)

    # We should move this to use external repository
    # https://github.com/dagster-io/dagster/issues/2556
    recon_repo = recon_repo_from_external_repo(external_repository)
    repo_def = recon_repo.get_definition()

    noprompt = cli_args.get('noprompt')

    pipeline_def = repo_def.get_pipeline(external_pipeline.name)

    # Resolve partition set
    all_partition_sets = repo_def.partition_set_defs + [
        schedule_def.get_partition_set()
        for schedule_def in repo_def.schedule_defs
        if isinstance(schedule_def, PartitionScheduleDefinition)
    ]

    pipeline_partition_sets = [
        x for x in all_partition_sets if x.pipeline_name == pipeline_def.name
    ]
    if not pipeline_partition_sets:
        raise click.UsageError(
            'No partition sets found for pipeline `{}`'.format(pipeline_def.name)
        )
    partition_set_name = cli_args.get('partition_set')
    if not partition_set_name:
        if len(pipeline_partition_sets) == 1:
            partition_set_name = pipeline_partition_sets[0].name
        elif noprompt:
            raise click.UsageError('No partition set specified (see option `--partition-set`)')
        else:
            partition_set_name = click.prompt(
                'Select a partition set to use for backfill: {}'.format(
                    ', '.join(x.name for x in pipeline_partition_sets)
                )
            )
    partition_set = next((x for x in pipeline_partition_sets if x.name == partition_set_name), None)
    if not partition_set:
        raise click.UsageError('No partition set found named `{}`'.format(partition_set_name))

    # Resolve partitions to backfill
    partitions = gen_partitions_from_args(partition_set, cli_args)

    # Print backfill info
    print_fn('\n     Pipeline: {}'.format(pipeline_def.name))
    print_fn('Partition set: {}'.format(partition_set.name))
    print_fn('   Partitions: {}\n'.format(print_partition_format(partitions, indent_level=15)))

    # Confirm and launch
    if noprompt or click.confirm(
        'Do you want to proceed with the backfill ({} partitions)?'.format(len(partitions))
    ):

        print_fn('Launching runs... ')
        backfill_id = make_new_backfill_id()

        run_tags = merge_dicts(
            PipelineRun.tags_for_backfill_id(backfill_id), get_tags_from_args(cli_args),
        )

        for partition in partitions:
            run = instance.create_run_for_pipeline(
                pipeline_def=pipeline_def,
                mode=partition_set.mode,
                solids_to_execute=frozenset(partition_set.solid_selection)
                if partition_set and partition_set.solid_selection
                else None,
                run_config=partition_set.run_config_for_partition(partition),
                tags=merge_dicts(partition_set.tags_for_partition(partition), run_tags),
            )

            instance.launch_run(run.run_id, external_pipeline)
            # Remove once we can handle synchronous execution... currently limited by sqlite
            time.sleep(0.1)

        print_fn('Launched backfill job `{}`'.format(backfill_id))
    else:
        print_fn(' Aborted!')
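
A hypothetical call sketch, assuming the CLI arguments have already been parsed into a dict; the keys shown are illustrative, not the full CLI schema:

from dagster import DagsterInstance

cli_args = {
    "partition_set": "my_daily_partition_set",  # illustrative name
    "noprompt": True,
}
execute_backfill_command(cli_args, print_fn=print, instance=DagsterInstance.get())
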
Example No. 5
def _execute_backfill_command_at_location(cli_args, print_fn, instance, repo_location):
    external_repo = get_external_repository_from_repo_location(
        repo_location, cli_args.get("repository")
    )

    external_pipeline = get_external_pipeline_from_external_repo(
        external_repo, cli_args.get("pipeline"),
    )

    noprompt = cli_args.get("noprompt")

    pipeline_partition_set_names = {
        external_partition_set.name: external_partition_set
        for external_partition_set in external_repo.get_external_partition_sets()
        if external_partition_set.pipeline_name == external_pipeline.name
    }

    if not pipeline_partition_set_names:
        raise click.UsageError(
            "No partition sets found for pipeline `{}`".format(external_pipeline.name)
        )
    partition_set_name = cli_args.get("partition_set")
    if not partition_set_name:
        if len(pipeline_partition_set_names) == 1:
            partition_set_name = next(iter(pipeline_partition_set_names.keys()))
        elif noprompt:
            raise click.UsageError("No partition set specified (see option `--partition-set`)")
        else:
            partition_set_name = click.prompt(
                "Select a partition set to use for backfill: {}".format(
                    ", ".join(x for x in pipeline_partition_set_names.keys())
                )
            )

    partition_set = pipeline_partition_set_names.get(partition_set_name)

    if not partition_set:
        raise click.UsageError("No partition set found named `{}`".format(partition_set_name))

    mode = partition_set.mode
    solid_selection = partition_set.solid_selection
    run_tags = get_tags_from_args(cli_args)

    repo_handle = RepositoryHandle(
        repository_name=external_repo.name,
        repository_location_handle=repo_location.location_handle,
    )

    # Resolve partitions to backfill
    partition_names_or_error = repo_location.get_external_partition_names(
        repo_handle, partition_set_name,
    )

    if isinstance(partition_names_or_error, ExternalPartitionExecutionErrorData):
        raise DagsterBackfillFailedError(
            "Failure fetching partition names for {partition_set_name}: {error_message}".format(
                partition_set_name=partition_set_name,
                error_message=partition_names_or_error.error.message,
            ),
            serialized_error_info=partition_names_or_error.error,
        )

    partition_names = gen_partition_names_from_args(
        partition_names_or_error.partition_names, cli_args
    )

    # Print backfill info
    print_fn("\n     Pipeline: {}".format(external_pipeline.name))
    print_fn("Partition set: {}".format(partition_set_name))
    print_fn("   Partitions: {}\n".format(print_partition_format(partition_names, indent_level=15)))

    # Confirm and launch
    if noprompt or click.confirm(
        "Do you want to proceed with the backfill ({} partitions)?".format(len(partition_names))
    ):

        print_fn("Launching runs... ")

        backfill_id = make_new_backfill_id()
        backfill_tags = PipelineRun.tags_for_backfill_id(backfill_id)
        partition_execution_data = repo_location.get_external_partition_set_execution_param_data(
            repository_handle=repo_handle,
            partition_set_name=partition_set_name,
            partition_names=partition_names,
        )

        if isinstance(partition_execution_data, ExternalPartitionExecutionErrorData):
            return print_fn("Backfill failed: {}".format(partition_execution_data.error))

        assert isinstance(partition_execution_data, ExternalPartitionSetExecutionParamData)

        for partition_data in partition_execution_data.partition_data:
            run = _create_external_pipeline_run(
                instance=instance,
                repo_location=repo_location,
                external_repo=external_repo,
                external_pipeline=external_pipeline,
                run_config=partition_data.run_config,
                mode=mode,
                preset=None,
                tags=merge_dicts(merge_dicts(partition_data.tags, backfill_tags), run_tags),
                solid_selection=frozenset(solid_selection) if solid_selection else None,
            )

            instance.launch_run(run.run_id, external_pipeline)

        print_fn("Launched backfill job `{}`".format(backfill_id))

    else:
        print_fn("Aborted!")
Example No. 6
def execute_backfill_command(cli_args, print_fn, instance=None):
    pipeline_name = cli_args.pop('pipeline_name')
    repo_args = {k: v for k, v in cli_args.items() if k in REPO_ARG_NAMES}
    if pipeline_name and not isinstance(pipeline_name, six.string_types):
        if len(pipeline_name) == 1:
            pipeline_name = pipeline_name[0]

    instance = instance or DagsterInstance.get()
    handle = handle_for_repo_cli_args(repo_args)
    repository = handle.build_repository_definition()
    noprompt = cli_args.get('noprompt')

    # check run launcher
    if not instance.run_launcher:
        raise click.UsageError(
            'A run launcher must be configured before running a backfill. You can configure a run '
            'launcher (e.g. dagster_graphql.launcher.RemoteDagitRunLauncher) in your instance '
            '`dagster.yaml` settings. See '
            'https://docs.dagster.io/latest/deploying/instance/ for more '
            'information.')

    # Resolve pipeline
    if not pipeline_name and noprompt:
        raise click.UsageError('No pipeline specified')
    if not pipeline_name:
        pipeline_name = click.prompt(
            'Select a pipeline to backfill: {}'.format(', '.join(
                repository.pipeline_names)))
    if not repository.has_pipeline(pipeline_name):
        raise click.UsageError(
            'No pipeline found named `{}`'.format(pipeline_name))

    pipeline = repository.get_pipeline(pipeline_name)

    # Resolve partition set
    all_partition_sets = get_partition_sets_for_handle(handle)
    pipeline_partition_sets = [
        x for x in all_partition_sets if x.pipeline_name == pipeline.name
    ]
    if not pipeline_partition_sets:
        raise click.UsageError(
            'No partition sets found for pipeline `{}`'.format(pipeline.name))
    partition_set_name = cli_args.get('partition_set')
    if not partition_set_name:
        if len(pipeline_partition_sets) == 1:
            partition_set_name = pipeline_partition_sets[0].name
        elif noprompt:
            raise click.UsageError(
                'No partition set specified (see option `--partition-set`)')
        else:
            partition_set_name = click.prompt(
                'Select a partition set to use for backfill: {}'.format(
                    ', '.join(x.name for x in pipeline_partition_sets)))
    partition_set = next(
        (x for x in pipeline_partition_sets if x.name == partition_set_name),
        None)
    if not partition_set:
        raise click.UsageError(
            'No partition set found named `{}`'.format(partition_set_name))

    # Resolve partitions to backfill
    partitions = gen_partitions_from_args(partition_set, cli_args)

    # Resolve priority
    celery_priority = get_backfill_priority_from_args(cli_args)

    # Print backfill info
    print_fn('\n     Pipeline: {}'.format(pipeline.name))
    print_fn('Partition set: {}'.format(partition_set.name))
    print_fn('   Partitions: {}\n'.format(
        print_partition_format(partitions, indent_level=15)))

    # Confirm and launch
    if noprompt or click.confirm(
            'Do you want to proceed with the backfill ({} partitions)?'.format(
                len(partitions))):

        print_fn('Launching runs... ')
        backfill_id = make_new_backfill_id()

        run_tags = merge_dicts(
            PipelineRun.tags_for_backfill_id(backfill_id),
            get_tags_from_args(cli_args),
        )

        # for backwards compatibility - remove once prezi switched over to using tags argument
        if celery_priority is not None:
            run_tags['dagster-celery/run_priority'] = celery_priority

        for partition in partitions:
            run = PipelineRun(
                pipeline_name=pipeline.name,
                run_id=make_new_run_id(),
                selector=ExecutionSelector(pipeline.name),
                environment_dict=partition_set.environment_dict_for_partition(
                    partition),
                mode=cli_args.get('mode') or 'default',
                tags=merge_dicts(partition_set.tags_for_partition(partition),
                                 run_tags),
                status=PipelineRunStatus.NOT_STARTED,
            )
            instance.launch_run(run)
            # Remove once we can handle synchronous execution... currently limited by sqlite
            time.sleep(0.1)

        print_fn('Launched backfill job `{}`'.format(backfill_id))
    else:
        print_fn(' Aborted!')
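
The run launcher precondition at the top of the function, factored into a standalone fail-fast helper; this is a refactoring sketch, not part of the original API:

import click

def require_run_launcher(instance):
    if not instance.run_launcher:
        raise click.UsageError(
            'A run launcher must be configured before running a backfill. '
            'See https://docs.dagster.io/latest/deploying/instance/ for more '
            'information.')
    return instance.run_launcher
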
Example No. 7
def create_backfill_run(instance, repo_location, external_pipeline,
                        external_partition_set, backfill_job, partition_data):
    from dagster.daemon.daemon import get_telemetry_daemon_session_id

    check.inst_param(instance, "instance", DagsterInstance)
    check.inst_param(repo_location, "repo_location", RepositoryLocation)
    check.inst_param(external_pipeline, "external_pipeline", ExternalPipeline)
    check.inst_param(external_partition_set, "external_partition_set",
                     ExternalPartitionSet)
    check.inst_param(backfill_job, "backfill_job", PartitionBackfill)
    check.inst_param(partition_data, "partition_data",
                     ExternalPartitionExecutionParamData)

    log_action(
        instance,
        BACKFILL_RUN_CREATED,
        metadata={
            "DAEMON_SESSION_ID": get_telemetry_daemon_session_id(),
            "repo_hash": hash_name(repo_location.name),
            "pipeline_name_hash": hash_name(external_pipeline.name),
        },
    )

    tags = merge_dicts(
        external_pipeline.tags,
        partition_data.tags,
        PipelineRun.tags_for_backfill_id(backfill_job.backfill_id),
        backfill_job.tags,
    )

    solids_to_execute = None
    solid_selection = None
    if not backfill_job.from_failure and not backfill_job.reexecution_steps:
        step_keys_to_execute = None
        parent_run_id = None
        root_run_id = None
        known_state = None
        if external_partition_set.solid_selection:
            solids_to_execute = frozenset(
                external_partition_set.solid_selection)
            solid_selection = external_partition_set.solid_selection

    elif backfill_job.from_failure:
        last_run = _fetch_last_run(instance, external_partition_set,
                                   partition_data.name)
        if not last_run or last_run.status != PipelineRunStatus.FAILURE:
            return None
        return instance.create_reexecuted_run_from_failure(
            last_run,
            repo_location,
            external_pipeline,
            tags=tags,
            run_config=partition_data.run_config,
            mode=external_partition_set.mode,
        )

    elif backfill_job.reexecution_steps:
        last_run = _fetch_last_run(instance, external_partition_set,
                                   partition_data.name)
        parent_run_id = last_run.run_id if last_run else None
        root_run_id = (last_run.root_run_id
                       or last_run.run_id) if last_run else None
        if parent_run_id and root_run_id:
            tags = merge_dicts(tags, {
                PARENT_RUN_ID_TAG: parent_run_id,
                ROOT_RUN_ID_TAG: root_run_id
            })
        step_keys_to_execute = backfill_job.reexecution_steps
        if last_run and last_run.status == PipelineRunStatus.SUCCESS:
            known_state = KnownExecutionState.for_reexecution(
                instance.all_logs(parent_run_id),
                step_keys_to_execute,
            )
        else:
            known_state = None

        if external_partition_set.solid_selection:
            solids_to_execute = frozenset(
                external_partition_set.solid_selection)
            solid_selection = external_partition_set.solid_selection

    external_execution_plan = repo_location.get_external_execution_plan(
        external_pipeline,
        partition_data.run_config,
        external_partition_set.mode,
        step_keys_to_execute=step_keys_to_execute,
        known_state=known_state,
        instance=instance,
    )

    return instance.create_run(
        pipeline_snapshot=external_pipeline.pipeline_snapshot,
        execution_plan_snapshot=external_execution_plan.execution_plan_snapshot,
        parent_pipeline_snapshot=external_pipeline.parent_pipeline_snapshot,
        pipeline_name=external_pipeline.name,
        run_id=make_new_run_id(),
        solids_to_execute=solids_to_execute,
        run_config=partition_data.run_config,
        mode=external_partition_set.mode,
        step_keys_to_execute=step_keys_to_execute,
        tags=tags,
        root_run_id=root_run_id,
        parent_run_id=parent_run_id,
        status=PipelineRunStatus.NOT_STARTED,
        external_pipeline_origin=external_pipeline.get_external_origin(),
        pipeline_code_origin=external_pipeline.get_python_origin(),
        solid_selection=solid_selection,
    )
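
An illustrative stand-in for the name hashing in the telemetry metadata above; the real hash_name may be implemented differently, but the intent is to log a one-way digest rather than the raw repository or pipeline name:

import hashlib

def hash_name_sketch(name):
    # one-way digest so telemetry never records the raw name
    return hashlib.sha256(name.encode("utf-8")).hexdigest()

print(hash_name_sketch("my_repository")[:16])
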
Example No. 8
def _build_execution_param_list_for_backfill(
    instance,
    partition_data_list,
    backfill_id,
    backfill_params,
    pipeline_selector,
    external_partition_set,
):
    check.inst_param(instance, "instance", DagsterInstance)
    check.list_param(partition_data_list,
                     "partition_data_list",
                     of_type=ExternalPartitionExecutionParamData)
    check.str_param(backfill_id, "backfill_id")
    check.dict_param(backfill_params, "backfill_params")
    check.inst_param(pipeline_selector, "pipeline_selector", PipelineSelector)
    check.inst_param(external_partition_set, "external_partition_set",
                     ExternalPartitionSet)

    backfill_tags = PipelineRun.tags_for_backfill_id(backfill_id)
    execution_tags = {
        t["key"]: t["value"]
        for t in backfill_params.get("tags", [])
    }
    execution_param_list = []
    for partition_data in partition_data_list:
        tags = merge_dicts(merge_dicts(partition_data.tags, backfill_tags),
                           execution_tags)
        if not backfill_params.get("fromFailure") and not backfill_params.get(
                "reexecutionSteps"):
            # full pipeline execution
            execution_param_list.append(
                ExecutionParams(
                    selector=pipeline_selector,
                    run_config=partition_data.run_config,
                    mode=external_partition_set.mode,
                    execution_metadata=ExecutionMetadata(run_id=None,
                                                         tags=tags),
                    step_keys=None,
                ))
            continue

        last_run = _fetch_last_run(instance, external_partition_set,
                                   partition_data.name)

        if backfill_params.get("fromFailure"):
            if not last_run or last_run.status != PipelineRunStatus.FAILURE:
                continue

            execution_param_list.append(
                ExecutionParams(
                    selector=pipeline_selector,
                    run_config=partition_data.run_config,
                    mode=external_partition_set.mode,
                    execution_metadata=ExecutionMetadata(
                        run_id=None,
                        tags=merge_dicts(tags, {RESUME_RETRY_TAG: "true"}),
                        root_run_id=last_run.root_run_id or last_run.run_id,
                        parent_run_id=last_run.run_id,
                    ),
                    step_keys=None,
                ))
            continue

        # partial reexecution from success
        if not last_run or last_run.status != PipelineRunStatus.SUCCESS:
            continue

        execution_param_list.append(
            ExecutionParams(
                selector=pipeline_selector,
                run_config=partition_data.run_config,
                mode=external_partition_set.mode,
                execution_metadata=ExecutionMetadata(
                    run_id=None,
                    tags=tags,
                    root_run_id=last_run.root_run_id or last_run.run_id,
                    parent_run_id=last_run.run_id,
                ),
                step_keys=backfill_params["reexecutionSteps"],
            ))

    return execution_param_list
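
The tag normalization at the top of the function, isolated: GraphQL-style key/value pairs are flattened into a plain dict (the input shape mirrors the backfill_params structure assumed above):

backfill_params = {"tags": [{"key": "team", "value": "data-eng"},
                            {"key": "priority", "value": "high"}]}
execution_tags = {
    t["key"]: t["value"]
    for t in backfill_params.get("tags", [])
}
print(execution_tags)  # {'team': 'data-eng', 'priority': 'high'}
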
Example No. 9
def execute_backfill_command(cli_args, print_fn, instance=None):
    pipeline_name = cli_args.pop('pipeline_name')
    repo_args = {k: v for k, v in cli_args.items() if k in REPO_ARG_NAMES}
    if pipeline_name and not isinstance(pipeline_name, six.string_types):
        if len(pipeline_name) == 1:
            pipeline_name = pipeline_name[0]

    instance = instance or DagsterInstance.get()
    recon_repo = recon_repo_for_cli_args(repo_args)
    repo_def = recon_repo.get_definition()
    noprompt = cli_args.get('noprompt')

    # Resolve pipeline
    if not pipeline_name and noprompt:
        raise click.UsageError('No pipeline specified')
    if not pipeline_name:
        pipeline_name = click.prompt(
            'Select a pipeline to backfill: {}'.format(', '.join(repo_def.pipeline_names))
        )
    if not repo_def.has_pipeline(pipeline_name):
        raise click.UsageError('No pipeline found named `{}`'.format(pipeline_name))

    pipeline_def = repo_def.get_pipeline(pipeline_name)

    # Resolve partition set
    all_partition_sets = repo_def.partition_set_defs + [
        schedule_def.get_partition_set()
        for schedule_def in repo_def.schedule_defs
        if isinstance(schedule_def, PartitionScheduleDefinition)
    ]

    pipeline_partition_sets = [
        x for x in all_partition_sets if x.pipeline_name == pipeline_def.name
    ]
    if not pipeline_partition_sets:
        raise click.UsageError(
            'No partition sets found for pipeline `{}`'.format(pipeline_def.name)
        )
    partition_set_name = cli_args.get('partition_set')
    if not partition_set_name:
        if len(pipeline_partition_sets) == 1:
            partition_set_name = pipeline_partition_sets[0].name
        elif noprompt:
            raise click.UsageError('No partition set specified (see option `--partition-set`)')
        else:
            partition_set_name = click.prompt(
                'Select a partition set to use for backfill: {}'.format(
                    ', '.join(x.name for x in pipeline_partition_sets)
                )
            )
    partition_set = next((x for x in pipeline_partition_sets if x.name == partition_set_name), None)
    if not partition_set:
        raise click.UsageError('No partition set found named `{}`'.format(partition_set_name))

    # Resolve partitions to backfill
    partitions = gen_partitions_from_args(partition_set, cli_args)

    # Print backfill info
    print_fn('\n     Pipeline: {}'.format(pipeline_def.name))
    print_fn('Partition set: {}'.format(partition_set.name))
    print_fn('   Partitions: {}\n'.format(print_partition_format(partitions, indent_level=15)))

    # This whole CLI tool should move to more of a "host process" model - but this is how we start
    repo_location = InProcessRepositoryLocation(recon_repo)
    external_pipeline = repo_location.get_repository(
        repo_def.name).get_full_external_pipeline(pipeline_name)

    # Confirm and launch
    if noprompt or click.confirm(
        'Do you want to proceed with the backfill ({} partitions)?'.format(len(partitions))
    ):

        print_fn('Launching runs... ')
        backfill_id = make_new_backfill_id()

        run_tags = merge_dicts(
            PipelineRun.tags_for_backfill_id(backfill_id), get_tags_from_args(cli_args),
        )

        for partition in partitions:
            run = instance.create_run_for_pipeline(
                pipeline_def=pipeline_def,
                mode=partition_set.mode,
                solids_to_execute=frozenset(partition_set.solid_selection)
                if partition_set and partition_set.solid_selection
                else None,
                environment_dict=partition_set.environment_dict_for_partition(partition),
                tags=merge_dicts(partition_set.tags_for_partition(partition), run_tags),
            )

            instance.launch_run(run.run_id, external_pipeline)
            # Remove once we can handle synchronous execution... currently limited by sqlite
            time.sleep(0.1)

        print_fn('Launched backfill job `{}`'.format(backfill_id))
    else:
        print_fn(' Aborted!')
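
A minimal sketch of the launch throttle used in the loop above: a fixed delay between submissions, per the sqlite comment. launch_fn is a placeholder for a callable like instance.launch_run:

import time

def launch_throttled(runs, launch_fn, delay_seconds=0.1):
    for run in runs:
        launch_fn(run)
        time.sleep(delay_seconds)  # avoid overwhelming sqlite-backed run storage
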
Example No. 10
def execute_backfill_command(cli_args, print_fn, instance=None):
    instance = instance or DagsterInstance.get()
    repo_location = get_repository_location_from_kwargs(cli_args, instance)
    external_repo = get_external_repository_from_repo_location(
        repo_location, cli_args.get('repository'))

    external_pipeline = get_external_pipeline_from_external_repo(
        external_repo,
        cli_args.get('pipeline'),
    )

    noprompt = cli_args.get('noprompt')

    pipeline_partition_set_names = {
        external_partition_set.name: external_partition_set
        for external_partition_set in
        external_repo.get_external_partition_sets()
        if external_partition_set.pipeline_name == external_pipeline.name
    }

    if not pipeline_partition_set_names:
        raise click.UsageError(
            'No partition sets found for pipeline `{}`'.format(
                external_pipeline.name))
    partition_set_name = cli_args.get('partition_set')
    if not partition_set_name:
        if len(pipeline_partition_set_names) == 1:
            partition_set_name = next(iter(
                pipeline_partition_set_names.keys()))
        elif noprompt:
            raise click.UsageError(
                'No partition set specified (see option `--partition-set`)')
        else:
            partition_set_name = click.prompt(
                'Select a partition set to use for backfill: {}'.format(
                    ', '.join(x for x in pipeline_partition_set_names.keys())))

    partition_set = pipeline_partition_set_names.get(partition_set_name)

    if not partition_set:
        raise click.UsageError(
            'No partition set found named `{}`'.format(partition_set_name))

    mode = partition_set.mode
    solid_selection = partition_set.solid_selection

    repo_handle = RepositoryHandle(
        repository_name=external_repo.name,
        repository_location_handle=repo_location.location_handle,
    )

    # Resolve partitions to backfill
    partition_names_or_error = repo_location.get_external_partition_names(
        repo_handle,
        partition_set_name,
    )

    if isinstance(partition_names_or_error,
                  ExternalPartitionExecutionErrorData):
        raise DagsterBackfillFailedError(
            'Failure fetching partition names for {partition_set_name}: {error_message}'
            .format(
                partition_set_name=partition_set_name,
                error_message=partition_names_or_error.error.message,
            ),
            serialized_error_info=partition_names_or_error.error,
        )

    partition_names = gen_partition_names_from_args(
        partition_names_or_error.partition_names, cli_args)

    # Print backfill info
    print_fn('\n     Pipeline: {}'.format(external_pipeline.name))
    print_fn('Partition set: {}'.format(partition_set_name))
    print_fn('   Partitions: {}\n'.format(
        print_partition_format(partition_names, indent_level=15)))

    # Confirm and launch
    if noprompt or click.confirm(
            'Do you want to proceed with the backfill ({} partitions)?'.format(
                len(partition_names))):

        print_fn('Launching runs... ')
        backfill_id = make_new_backfill_id()

        run_tags = merge_dicts(
            PipelineRun.tags_for_backfill_id(backfill_id),
            get_tags_from_args(cli_args),
        )

        for partition_name in partition_names:
            run_config_or_error = repo_location.get_external_partition_config(
                repo_handle, partition_set_name, partition_name)
            if isinstance(run_config_or_error,
                          ExternalPartitionExecutionErrorData):
                raise DagsterBackfillFailedError(
                    'Failure fetching run config for partition {partition_name} in {partition_set_name}: {error_message}'
                    .format(
                        partition_name=partition_name,
                        partition_set_name=partition_set_name,
                        error_message=run_config_or_error.error.message,
                    ),
                    serialized_error_info=run_config_or_error.error,
                )

            tags_or_error = repo_location.get_external_partition_tags(
                repo_handle, partition_set_name, partition_name)
            if isinstance(tags_or_error, ExternalPartitionExecutionErrorData):
                raise DagsterBackfillFailedError(
                    'Failure fetching tags for partition {partition_name} in {partition_set_name}: {error_message}'
                    .format(
                        partition_name=partition_name,
                        partition_set_name=partition_set_name,
                        error_message=tags_or_error.error.message,
                    ),
                    serialized_error_info=tags_or_error.error,
                )
            run = _create_external_pipeline_run(
                instance=instance,
                repo_location=repo_location,
                external_repo=external_repo,
                external_pipeline=external_pipeline,
                run_config=run_config_or_error.run_config,
                mode=mode,
                preset=None,
                tags=merge_dicts(tags_or_error.tags, run_tags),
                solid_selection=frozenset(solid_selection)
                if solid_selection else None,
            )

            instance.launch_run(run.run_id, external_pipeline)
            # Remove once we can handle synchronous execution... currently limited by sqlite
            time.sleep(0.1)

        print_fn('Launched backfill job `{}`'.format(backfill_id))
    else:
        print_fn('Aborted!')
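
The two fetch-and-check blocks in the loop share the same shape; a refactoring sketch that factors them into one helper (it assumes the same imports as the example and is not part of the original API):

def fetch_or_raise(result, what, partition_name, partition_set_name):
    if isinstance(result, ExternalPartitionExecutionErrorData):
        raise DagsterBackfillFailedError(
            'Failure fetching {} for partition {} in {}: {}'.format(
                what, partition_name, partition_set_name,
                result.error.message),
            serialized_error_info=result.error,
        )
    return result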