def _get_partitions_chunk(instance, logger, backfill_job, chunk_size):
    check.inst_param(backfill_job, "backfill_job", PartitionBackfill)
    partition_names = backfill_job.partition_names
    checkpoint = backfill_job.last_submitted_partition_name

    if (
        backfill_job.last_submitted_partition_name
        and backfill_job.last_submitted_partition_name in partition_names
    ):
        index = partition_names.index(backfill_job.last_submitted_partition_name)
        partition_names = partition_names[index + 1 :]

    # for idempotence, fetch all runs with the current backfill id
    backfill_runs = instance.get_runs(
        PipelineRunsFilter(tags=PipelineRun.tags_for_backfill_id(backfill_job.backfill_id))
    )
    completed_partitions = set([run.tags.get(PARTITION_NAME_TAG) for run in backfill_runs])
    # after the slice above, `checkpoint` is no longer present in partition_names,
    # so this recomputation acts only as a safeguard (it evaluates to 0)
    initial_checkpoint = (
        partition_names.index(checkpoint) + 1 if checkpoint and checkpoint in partition_names else 0
    )
    partition_names = partition_names[initial_checkpoint:]
    has_more = chunk_size < len(partition_names)
    partitions_chunk = partition_names[:chunk_size]
    # assumes at least one partition remains to be submitted
    next_checkpoint = partitions_chunk[-1]

    to_skip = set(partitions_chunk).intersection(completed_partitions)
    if to_skip:
        logger.info(
            f"Found {len(to_skip)} existing runs for backfill {backfill_job.backfill_id}, skipping"
        )
    to_submit = [
        partition_name
        for partition_name in partitions_chunk
        if partition_name not in completed_partitions
    ]
    return to_submit, next_checkpoint, has_more

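# Illustrative sketch, not part of the original source: the checkpoint/chunk
# arithmetic from _get_partitions_chunk demonstrated on plain lists, with no
# Dagster objects involved. The helper name and example data are hypothetical.
def _chunk_after_checkpoint(partition_names, checkpoint, chunk_size):
    # drop everything up to and including the checkpoint, as the
    # `initial_checkpoint` slice above does
    start = partition_names.index(checkpoint) + 1 if checkpoint in partition_names else 0
    remaining = partition_names[start:]
    return remaining[:chunk_size], len(remaining) > chunk_size


# resuming after "2020-01-01" with a chunk size of 1 submits only "2020-01-02"
# and reports that more partitions remain:
assert _chunk_after_checkpoint(
    ["2020-01-01", "2020-01-02", "2020-01-03"], "2020-01-01", 1
) == (["2020-01-02"], True)
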
def execute_partition_set(partition_set, partition_filter, instance=None):
    '''Programmatically perform a backfill over a partition set

    Arguments:
        partition_set (PartitionSet): The base partition set to run the backfill over
        partition_filter (Callable[[List[Partition]], List[Partition]]): A function that takes
            a list of partitions and returns a filtered list of partitions to run the backfill
            over.
        instance (DagsterInstance): The instance to use to perform the backfill
    '''
    check.inst_param(partition_set, 'partition_set', PartitionSetDefinition)
    check.callable_param(partition_filter, 'partition_filter')
    check.opt_inst_param(instance, 'instance', DagsterInstance)

    candidate_partitions = partition_set.get_partitions()
    partitions = partition_filter(candidate_partitions)

    instance = instance or DagsterInstance.ephemeral()

    # generate the backfill id once, outside the loop, so that every run in
    # this backfill shares it
    backfill_id = make_new_backfill_id()

    for partition in partitions:
        run = PipelineRun(
            pipeline_name=partition_set.pipeline_name,
            run_id=make_new_run_id(),
            selector=ExecutionSelector(partition_set.pipeline_name),
            environment_dict=partition_set.environment_dict_for_partition(partition),
            mode='default',
            tags=merge_dicts(
                PipelineRun.tags_for_backfill_id(backfill_id),
                partition_set.tags_for_partition(partition),
            ),
            status=PipelineRunStatus.NOT_STARTED,
        )

        # Remove once we can handle synchronous execution... currently limited by sqlite
        time.sleep(0.1)

        instance.launch_run(run)

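# Hypothetical usage sketch for execute_partition_set; `daily_partition_set` is
# an assumed PartitionSetDefinition and is not defined in this source. The point
# being illustrated is the partition_filter contract: it receives every
# candidate partition and returns the subset to backfill.
#
# execute_partition_set(
#     daily_partition_set,
#     partition_filter=lambda partitions: [p for p in partitions if p.name >= '2020-01-01'],
# )
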
def create_backfill_run(
    instance, repo_location, external_pipeline, external_partition_set, backfill_job, partition_data
):
    check.inst_param(instance, "instance", DagsterInstance)
    check.inst_param(repo_location, "repo_location", RepositoryLocation)
    check.inst_param(external_pipeline, "external_pipeline", ExternalPipeline)
    check.inst_param(external_partition_set, "external_partition_set", ExternalPartitionSet)
    check.inst_param(backfill_job, "backfill_job", PartitionBackfill)
    check.inst_param(partition_data, "partition_data", ExternalPartitionExecutionParamData)

    full_external_execution_plan = repo_location.get_external_execution_plan(
        external_pipeline,
        partition_data.run_config,
        external_partition_set.mode,
        step_keys_to_execute=None,
        known_state=None,
    )

    tags = merge_dicts(
        external_pipeline.tags,
        partition_data.tags,
        PipelineRun.tags_for_backfill_id(backfill_job.backfill_id),
        backfill_job.tags,
    )

    if not backfill_job.from_failure and not backfill_job.reexecution_steps:
        step_keys_to_execute = None
        parent_run_id = None
        root_run_id = None
        known_state = None
    elif backfill_job.from_failure:
        last_run = _fetch_last_run(instance, external_partition_set, partition_data.name)
        if not last_run or last_run.status != PipelineRunStatus.FAILURE:
            return None
        parent_run_id = last_run.run_id
        root_run_id = last_run.root_run_id or last_run.run_id
        tags = merge_dicts(
            tags,
            {
                RESUME_RETRY_TAG: "true",
                PARENT_RUN_ID_TAG: parent_run_id,
                ROOT_RUN_ID_TAG: root_run_id,
            },
        )
        step_keys_to_execute, known_state = get_retry_steps_from_execution_plan(
            instance, full_external_execution_plan, parent_run_id
        )
    elif backfill_job.reexecution_steps:
        last_run = _fetch_last_run(instance, external_partition_set, partition_data.name)
        parent_run_id = last_run.run_id if last_run else None
        root_run_id = (last_run.root_run_id or last_run.run_id) if last_run else None
        if parent_run_id and root_run_id:
            tags = merge_dicts(
                tags, {PARENT_RUN_ID_TAG: parent_run_id, ROOT_RUN_ID_TAG: root_run_id}
            )
        step_keys_to_execute = backfill_job.reexecution_steps
        if last_run and last_run.status == PipelineRunStatus.SUCCESS:
            known_state = KnownExecutionState.for_reexecution(
                instance.all_logs(parent_run_id),
                step_keys_to_execute,
            )
        else:
            known_state = None

    if step_keys_to_execute:
        external_execution_plan = repo_location.get_external_execution_plan(
            external_pipeline,
            partition_data.run_config,
            external_partition_set.mode,
            step_keys_to_execute=step_keys_to_execute,
            known_state=known_state,
        )
    else:
        external_execution_plan = full_external_execution_plan

    return instance.create_run(
        pipeline_snapshot=external_pipeline.pipeline_snapshot,
        execution_plan_snapshot=external_execution_plan.execution_plan_snapshot,
        parent_pipeline_snapshot=external_pipeline.parent_pipeline_snapshot,
        pipeline_name=external_pipeline.name,
        run_id=make_new_run_id(),
        solids_to_execute=frozenset(external_partition_set.solid_selection)
        if external_partition_set.solid_selection
        else None,
        run_config=partition_data.run_config,
        mode=external_partition_set.mode,
        step_keys_to_execute=step_keys_to_execute,
        tags=tags,
        root_run_id=root_run_id,
        parent_run_id=parent_run_id,
        status=PipelineRunStatus.NOT_STARTED,
        external_pipeline_origin=external_pipeline.get_external_origin(),
    )

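# Illustrative sketch, assuming dagster's merge_dicts gives later arguments
# precedence: the tag layering in create_backfill_run then means backfill_job.tags
# override the backfill-id tag, which overrides partition tags, which override
# pipeline tags. The helper below is a stand-in, not the library function.
def _merge_right_biased(*dicts):
    merged = {}
    for d in dicts:
        merged.update(d)
    return merged


assert _merge_right_biased(
    {"team": "pipeline-default"},  # external_pipeline.tags
    {"team": "partition-override"},  # partition_data.tags
) == {"team": "partition-override"}
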
def execute_backfill_command(cli_args, print_fn, instance=None):
    instance = instance or DagsterInstance.get()
    external_pipeline = get_external_pipeline_from_kwargs(cli_args, instance)
    external_repository = get_external_repository_from_kwargs(cli_args, instance)

    # We should move this to use external repository
    # https://github.com/dagster-io/dagster/issues/2556
    recon_repo = recon_repo_from_external_repo(external_repository)
    repo_def = recon_repo.get_definition()

    noprompt = cli_args.get('noprompt')

    pipeline_def = repo_def.get_pipeline(external_pipeline.name)

    # Resolve partition set
    all_partition_sets = repo_def.partition_set_defs + [
        schedule_def.get_partition_set()
        for schedule_def in repo_def.schedule_defs
        if isinstance(schedule_def, PartitionScheduleDefinition)
    ]
    pipeline_partition_sets = [
        x for x in all_partition_sets if x.pipeline_name == pipeline_def.name
    ]
    if not pipeline_partition_sets:
        raise click.UsageError(
            'No partition sets found for pipeline `{}`'.format(pipeline_def.name)
        )
    partition_set_name = cli_args.get('partition_set')
    if not partition_set_name:
        if len(pipeline_partition_sets) == 1:
            partition_set_name = pipeline_partition_sets[0].name
        elif noprompt:
            raise click.UsageError('No partition set specified (see option `--partition-set`)')
        else:
            partition_set_name = click.prompt(
                'Select a partition set to use for backfill: {}'.format(
                    ', '.join(x.name for x in pipeline_partition_sets)
                )
            )
    partition_set = next((x for x in pipeline_partition_sets if x.name == partition_set_name), None)
    if not partition_set:
        raise click.UsageError('No partition set found named `{}`'.format(partition_set_name))

    # Resolve partitions to backfill
    partitions = gen_partitions_from_args(partition_set, cli_args)

    # Print backfill info
    print_fn('\n Pipeline: {}'.format(pipeline_def.name))
    print_fn('Partition set: {}'.format(partition_set.name))
    print_fn(' Partitions: {}\n'.format(print_partition_format(partitions, indent_level=15)))

    # Confirm and launch
    if noprompt or click.confirm(
        'Do you want to proceed with the backfill ({} partitions)?'.format(len(partitions))
    ):
        print_fn('Launching runs... ')
        backfill_id = make_new_backfill_id()
        run_tags = merge_dicts(
            PipelineRun.tags_for_backfill_id(backfill_id),
            get_tags_from_args(cli_args),
        )
        for partition in partitions:
            run = instance.create_run_for_pipeline(
                pipeline_def=pipeline_def,
                mode=partition_set.mode,
                solids_to_execute=frozenset(partition_set.solid_selection)
                if partition_set and partition_set.solid_selection
                else None,
                run_config=partition_set.run_config_for_partition(partition),
                tags=merge_dicts(partition_set.tags_for_partition(partition), run_tags),
            )

            instance.launch_run(run.run_id, external_pipeline)
            # Remove once we can handle synchronous execution... currently limited by sqlite
            time.sleep(0.1)

        print_fn('Launched backfill job `{}`'.format(backfill_id))
    else:
        print_fn(' Aborted!')

def _execute_backfill_command_at_location(cli_args, print_fn, instance, repo_location):
    external_repo = get_external_repository_from_repo_location(
        repo_location, cli_args.get("repository")
    )

    external_pipeline = get_external_pipeline_from_external_repo(
        external_repo,
        cli_args.get("pipeline"),
    )

    noprompt = cli_args.get("noprompt")

    pipeline_partition_set_names = {
        external_partition_set.name: external_partition_set
        for external_partition_set in external_repo.get_external_partition_sets()
        if external_partition_set.pipeline_name == external_pipeline.name
    }

    if not pipeline_partition_set_names:
        raise click.UsageError(
            "No partition sets found for pipeline `{}`".format(external_pipeline.name)
        )
    partition_set_name = cli_args.get("partition_set")
    if not partition_set_name:
        if len(pipeline_partition_set_names) == 1:
            partition_set_name = next(iter(pipeline_partition_set_names.keys()))
        elif noprompt:
            raise click.UsageError("No partition set specified (see option `--partition-set`)")
        else:
            partition_set_name = click.prompt(
                "Select a partition set to use for backfill: {}".format(
                    ", ".join(x for x in pipeline_partition_set_names.keys())
                )
            )

    partition_set = pipeline_partition_set_names.get(partition_set_name)

    if not partition_set:
        raise click.UsageError("No partition set found named `{}`".format(partition_set_name))

    mode = partition_set.mode
    solid_selection = partition_set.solid_selection
    run_tags = get_tags_from_args(cli_args)

    repo_handle = RepositoryHandle(
        repository_name=external_repo.name,
        repository_location_handle=repo_location.location_handle,
    )

    # Resolve partitions to backfill
    partition_names_or_error = repo_location.get_external_partition_names(
        repo_handle,
        partition_set_name,
    )
    if isinstance(partition_names_or_error, ExternalPartitionExecutionErrorData):
        raise DagsterBackfillFailedError(
            "Failure fetching partition names for {partition_set_name}: {error_message}".format(
                partition_set_name=partition_set_name,
                error_message=partition_names_or_error.error.message,
            ),
            serialized_error_info=partition_names_or_error.error,
        )

    partition_names = gen_partition_names_from_args(
        partition_names_or_error.partition_names, cli_args
    )

    # Print backfill info
    print_fn("\n Pipeline: {}".format(external_pipeline.name))
    print_fn("Partition set: {}".format(partition_set_name))
    print_fn(" Partitions: {}\n".format(print_partition_format(partition_names, indent_level=15)))

    # Confirm and launch
    if noprompt or click.confirm(
        "Do you want to proceed with the backfill ({} partitions)?".format(len(partition_names))
    ):
        print_fn("Launching runs... ")
        backfill_id = make_new_backfill_id()
        backfill_tags = PipelineRun.tags_for_backfill_id(backfill_id)
        partition_execution_data = repo_location.get_external_partition_set_execution_param_data(
            repository_handle=repo_handle,
            partition_set_name=partition_set_name,
            partition_names=partition_names,
        )

        if isinstance(partition_execution_data, ExternalPartitionExecutionErrorData):
            return print_fn("Backfill failed: {}".format(partition_execution_data.error))

        assert isinstance(partition_execution_data, ExternalPartitionSetExecutionParamData)

        for partition_data in partition_execution_data.partition_data:
            run = _create_external_pipeline_run(
                instance=instance,
                repo_location=repo_location,
                external_repo=external_repo,
                external_pipeline=external_pipeline,
                run_config=partition_data.run_config,
                mode=mode,
                preset=None,
                tags=merge_dicts(merge_dicts(partition_data.tags, backfill_tags), run_tags),
                solid_selection=frozenset(solid_selection) if solid_selection else None,
            )

            instance.launch_run(run.run_id, external_pipeline)

        print_fn("Launched backfill job `{}`".format(backfill_id))
    else:
        print_fn("Aborted!")

def execute_backfill_command(cli_args, print_fn, instance=None):
    pipeline_name = cli_args.pop('pipeline_name')
    repo_args = {k: v for k, v in cli_args.items() if k in REPO_ARG_NAMES}
    if pipeline_name and not isinstance(pipeline_name, six.string_types):
        if len(pipeline_name) == 1:
            pipeline_name = pipeline_name[0]

    instance = instance or DagsterInstance.get()
    handle = handle_for_repo_cli_args(repo_args)
    repository = handle.build_repository_definition()
    noprompt = cli_args.get('noprompt')

    # check run launcher
    if not instance.run_launcher:
        raise click.UsageError(
            'A run launcher must be configured before running a backfill. You can configure a run '
            'launcher (e.g. dagster_graphql.launcher.RemoteDagitRunLauncher) in your instance '
            '`dagster.yaml` settings. See '
            'https://docs.dagster.io/latest/deploying/instance/ for more '
            'information.'
        )

    # Resolve pipeline
    if not pipeline_name and noprompt:
        raise click.UsageError('No pipeline specified')
    if not pipeline_name:
        pipeline_name = click.prompt(
            'Select a pipeline to backfill: {}'.format(', '.join(repository.pipeline_names))
        )

    if not repository.has_pipeline(pipeline_name):
        raise click.UsageError('No pipeline found named `{}`'.format(pipeline_name))

    pipeline = repository.get_pipeline(pipeline_name)

    # Resolve partition set
    all_partition_sets = get_partition_sets_for_handle(handle)
    pipeline_partition_sets = [x for x in all_partition_sets if x.pipeline_name == pipeline.name]
    if not pipeline_partition_sets:
        raise click.UsageError('No partition sets found for pipeline `{}`'.format(pipeline.name))
    partition_set_name = cli_args.get('partition_set')
    if not partition_set_name:
        if len(pipeline_partition_sets) == 1:
            partition_set_name = pipeline_partition_sets[0].name
        elif noprompt:
            raise click.UsageError('No partition set specified (see option `--partition-set`)')
        else:
            partition_set_name = click.prompt(
                'Select a partition set to use for backfill: {}'.format(
                    ', '.join(x.name for x in pipeline_partition_sets)
                )
            )
    partition_set = next((x for x in pipeline_partition_sets if x.name == partition_set_name), None)
    if not partition_set:
        raise click.UsageError('No partition set found named `{}`'.format(partition_set_name))

    # Resolve partitions to backfill
    partitions = gen_partitions_from_args(partition_set, cli_args)

    # Resolve priority
    celery_priority = get_backfill_priority_from_args(cli_args)

    # Print backfill info
    print_fn('\n Pipeline: {}'.format(pipeline.name))
    print_fn('Partition set: {}'.format(partition_set.name))
    print_fn(' Partitions: {}\n'.format(print_partition_format(partitions, indent_level=15)))

    # Confirm and launch
    if noprompt or click.confirm(
        'Do you want to proceed with the backfill ({} partitions)?'.format(len(partitions))
    ):
        print_fn('Launching runs... ')
        backfill_id = make_new_backfill_id()
        run_tags = merge_dicts(
            PipelineRun.tags_for_backfill_id(backfill_id),
            get_tags_from_args(cli_args),
        )

        # for backwards compatibility - remove once prezi switched over to using tags argument
        if celery_priority is not None:
            run_tags['dagster-celery/run_priority'] = celery_priority

        for partition in partitions:
            run = PipelineRun(
                pipeline_name=pipeline.name,
                run_id=make_new_run_id(),
                selector=ExecutionSelector(pipeline.name),
                environment_dict=partition_set.environment_dict_for_partition(partition),
                mode=cli_args.get('mode') or 'default',
                tags=merge_dicts(partition_set.tags_for_partition(partition), run_tags),
                status=PipelineRunStatus.NOT_STARTED,
            )

            instance.launch_run(run)
            # Remove once we can handle synchronous execution... currently limited by sqlite
            time.sleep(0.1)

        print_fn('Launched backfill job `{}`'.format(backfill_id))
    else:
        print_fn(' Aborted!')

def create_backfill_run(
    instance, repo_location, external_pipeline, external_partition_set, backfill_job, partition_data
):
    from dagster.daemon.daemon import get_telemetry_daemon_session_id

    check.inst_param(instance, "instance", DagsterInstance)
    check.inst_param(repo_location, "repo_location", RepositoryLocation)
    check.inst_param(external_pipeline, "external_pipeline", ExternalPipeline)
    check.inst_param(external_partition_set, "external_partition_set", ExternalPartitionSet)
    check.inst_param(backfill_job, "backfill_job", PartitionBackfill)
    check.inst_param(partition_data, "partition_data", ExternalPartitionExecutionParamData)

    log_action(
        instance,
        BACKFILL_RUN_CREATED,
        metadata={
            "DAEMON_SESSION_ID": get_telemetry_daemon_session_id(),
            "repo_hash": hash_name(repo_location.name),
            "pipeline_name_hash": hash_name(external_pipeline.name),
        },
    )

    tags = merge_dicts(
        external_pipeline.tags,
        partition_data.tags,
        PipelineRun.tags_for_backfill_id(backfill_job.backfill_id),
        backfill_job.tags,
    )

    solids_to_execute = None
    solid_selection = None
    if not backfill_job.from_failure and not backfill_job.reexecution_steps:
        step_keys_to_execute = None
        parent_run_id = None
        root_run_id = None
        known_state = None
        if external_partition_set.solid_selection:
            solids_to_execute = frozenset(external_partition_set.solid_selection)
            solid_selection = external_partition_set.solid_selection
    elif backfill_job.from_failure:
        last_run = _fetch_last_run(instance, external_partition_set, partition_data.name)
        if not last_run or last_run.status != PipelineRunStatus.FAILURE:
            return None
        return instance.create_reexecuted_run_from_failure(
            last_run,
            repo_location,
            external_pipeline,
            tags=tags,
            run_config=partition_data.run_config,
            mode=external_partition_set.mode,
        )
    elif backfill_job.reexecution_steps:
        last_run = _fetch_last_run(instance, external_partition_set, partition_data.name)
        parent_run_id = last_run.run_id if last_run else None
        root_run_id = (last_run.root_run_id or last_run.run_id) if last_run else None
        if parent_run_id and root_run_id:
            tags = merge_dicts(
                tags, {PARENT_RUN_ID_TAG: parent_run_id, ROOT_RUN_ID_TAG: root_run_id}
            )
        step_keys_to_execute = backfill_job.reexecution_steps
        if last_run and last_run.status == PipelineRunStatus.SUCCESS:
            known_state = KnownExecutionState.for_reexecution(
                instance.all_logs(parent_run_id),
                step_keys_to_execute,
            )
        else:
            known_state = None
        if external_partition_set.solid_selection:
            solids_to_execute = frozenset(external_partition_set.solid_selection)
            solid_selection = external_partition_set.solid_selection

    external_execution_plan = repo_location.get_external_execution_plan(
        external_pipeline,
        partition_data.run_config,
        external_partition_set.mode,
        step_keys_to_execute=step_keys_to_execute,
        known_state=known_state,
        instance=instance,
    )

    return instance.create_run(
        pipeline_snapshot=external_pipeline.pipeline_snapshot,
        execution_plan_snapshot=external_execution_plan.execution_plan_snapshot,
        parent_pipeline_snapshot=external_pipeline.parent_pipeline_snapshot,
        pipeline_name=external_pipeline.name,
        run_id=make_new_run_id(),
        solids_to_execute=solids_to_execute,
        run_config=partition_data.run_config,
        mode=external_partition_set.mode,
        step_keys_to_execute=step_keys_to_execute,
        tags=tags,
        root_run_id=root_run_id,
        parent_run_id=parent_run_id,
        status=PipelineRunStatus.NOT_STARTED,
        external_pipeline_origin=external_pipeline.get_external_origin(),
        pipeline_code_origin=external_pipeline.get_python_origin(),
        solid_selection=solid_selection,
    )

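# Illustrative sketch of the from_failure gate above, using a stand-in status
# string instead of PipelineRunStatus: a partition is only retried when its most
# recent run exists and actually failed; otherwise create_backfill_run returns
# None and the partition is skipped.
def _eligible_for_retry(last_run_status):
    return last_run_status == "FAILURE"


assert _eligible_for_retry("FAILURE")
assert not _eligible_for_retry("SUCCESS")
assert not _eligible_for_retry(None)  # no prior run at all
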
def _build_execution_param_list_for_backfill(
    instance,
    partition_data_list,
    backfill_id,
    backfill_params,
    pipeline_selector,
    external_partition_set,
):
    check.inst_param(instance, "instance", DagsterInstance)
    check.list_param(
        partition_data_list, "partition_data_list", of_type=ExternalPartitionExecutionParamData
    )
    check.str_param(backfill_id, "backfill_id")
    check.dict_param(backfill_params, "backfill_params")
    check.inst_param(pipeline_selector, "pipeline_selector", PipelineSelector)
    check.inst_param(external_partition_set, "external_partition_set", ExternalPartitionSet)

    backfill_tags = PipelineRun.tags_for_backfill_id(backfill_id)
    execution_tags = {t["key"]: t["value"] for t in backfill_params.get("tags", [])}
    execution_param_list = []
    for partition_data in partition_data_list:
        tags = merge_dicts(merge_dicts(partition_data.tags, backfill_tags), execution_tags)
        if not backfill_params.get("fromFailure") and not backfill_params.get("reexecutionSteps"):
            # full pipeline execution
            execution_param_list.append(
                ExecutionParams(
                    selector=pipeline_selector,
                    run_config=partition_data.run_config,
                    mode=external_partition_set.mode,
                    execution_metadata=ExecutionMetadata(run_id=None, tags=tags),
                    step_keys=None,
                )
            )
            continue

        last_run = _fetch_last_run(instance, external_partition_set, partition_data.name)

        if backfill_params.get("fromFailure"):
            if not last_run or last_run.status != PipelineRunStatus.FAILURE:
                continue

            execution_param_list.append(
                ExecutionParams(
                    selector=pipeline_selector,
                    run_config=partition_data.run_config,
                    mode=external_partition_set.mode,
                    execution_metadata=ExecutionMetadata(
                        run_id=None,
                        tags=merge_dicts(tags, {RESUME_RETRY_TAG: "true"}),
                        root_run_id=last_run.root_run_id or last_run.run_id,
                        parent_run_id=last_run.run_id,
                    ),
                    step_keys=None,
                )
            )
            continue

        # partial reexecution from success
        if not last_run or last_run.status != PipelineRunStatus.SUCCESS:
            continue

        execution_param_list.append(
            ExecutionParams(
                selector=pipeline_selector,
                run_config=partition_data.run_config,
                mode=external_partition_set.mode,
                execution_metadata=ExecutionMetadata(
                    run_id=None,
                    tags=tags,
                    root_run_id=last_run.root_run_id or last_run.run_id,
                    parent_run_id=last_run.run_id,
                ),
                step_keys=backfill_params["reexecutionSteps"],
            )
        )

    return execution_param_list

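# Worked example, illustrative only: the GraphQL layer passes backfill tags as a
# list of {"key": ..., "value": ...} entries, which the comprehension above
# flattens into an ordinary tag dict.
assert {t["key"]: t["value"] for t in [{"key": "team", "value": "ml"}]} == {"team": "ml"}
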
def execute_backfill_command(cli_args, print_fn, instance=None):
    pipeline_name = cli_args.pop('pipeline_name')
    repo_args = {k: v for k, v in cli_args.items() if k in REPO_ARG_NAMES}
    if pipeline_name and not isinstance(pipeline_name, six.string_types):
        if len(pipeline_name) == 1:
            pipeline_name = pipeline_name[0]

    instance = instance or DagsterInstance.get()
    recon_repo = recon_repo_for_cli_args(repo_args)
    repo_def = recon_repo.get_definition()
    noprompt = cli_args.get('noprompt')

    # Resolve pipeline
    if not pipeline_name and noprompt:
        raise click.UsageError('No pipeline specified')
    if not pipeline_name:
        pipeline_name = click.prompt(
            'Select a pipeline to backfill: {}'.format(', '.join(repo_def.pipeline_names))
        )
    if not repo_def.has_pipeline(pipeline_name):
        raise click.UsageError('No pipeline found named `{}`'.format(pipeline_name))

    pipeline_def = repo_def.get_pipeline(pipeline_name)

    # Resolve partition set
    all_partition_sets = repo_def.partition_set_defs + [
        schedule_def.get_partition_set()
        for schedule_def in repo_def.schedule_defs
        if isinstance(schedule_def, PartitionScheduleDefinition)
    ]
    pipeline_partition_sets = [
        x for x in all_partition_sets if x.pipeline_name == pipeline_def.name
    ]
    if not pipeline_partition_sets:
        raise click.UsageError(
            'No partition sets found for pipeline `{}`'.format(pipeline_def.name)
        )
    partition_set_name = cli_args.get('partition_set')
    if not partition_set_name:
        if len(pipeline_partition_sets) == 1:
            partition_set_name = pipeline_partition_sets[0].name
        elif noprompt:
            raise click.UsageError('No partition set specified (see option `--partition-set`)')
        else:
            partition_set_name = click.prompt(
                'Select a partition set to use for backfill: {}'.format(
                    ', '.join(x.name for x in pipeline_partition_sets)
                )
            )
    partition_set = next((x for x in pipeline_partition_sets if x.name == partition_set_name), None)
    if not partition_set:
        raise click.UsageError('No partition set found named `{}`'.format(partition_set_name))

    # Resolve partitions to backfill
    partitions = gen_partitions_from_args(partition_set, cli_args)

    # Print backfill info
    print_fn('\n Pipeline: {}'.format(pipeline_def.name))
    print_fn('Partition set: {}'.format(partition_set.name))
    print_fn(' Partitions: {}\n'.format(print_partition_format(partitions, indent_level=15)))

    # This whole CLI tool should move to more of a "host process" model - but this is how we start
    repo_location = InProcessRepositoryLocation(recon_repo)
    external_pipeline = repo_location.get_repository(repo_def.name).get_full_external_pipeline(
        pipeline_name
    )

    # Confirm and launch
    if noprompt or click.confirm(
        'Do you want to proceed with the backfill ({} partitions)?'.format(len(partitions))
    ):
        print_fn('Launching runs... ')
        backfill_id = make_new_backfill_id()
        run_tags = merge_dicts(
            PipelineRun.tags_for_backfill_id(backfill_id),
            get_tags_from_args(cli_args),
        )
        for partition in partitions:
            run = instance.create_run_for_pipeline(
                pipeline_def=pipeline_def,
                mode=partition_set.mode,
                solids_to_execute=frozenset(partition_set.solid_selection)
                if partition_set and partition_set.solid_selection
                else None,
                environment_dict=partition_set.environment_dict_for_partition(partition),
                tags=merge_dicts(partition_set.tags_for_partition(partition), run_tags),
            )

            instance.launch_run(run.run_id, external_pipeline)
            # Remove once we can handle synchronous execution... currently limited by sqlite
            time.sleep(0.1)

        print_fn('Launched backfill job `{}`'.format(backfill_id))
    else:
        print_fn(' Aborted!')

def execute_backfill_command(cli_args, print_fn, instance=None):
    instance = instance or DagsterInstance.get()
    repo_location = get_repository_location_from_kwargs(cli_args, instance)
    external_repo = get_external_repository_from_repo_location(
        repo_location, cli_args.get('repository')
    )

    external_pipeline = get_external_pipeline_from_external_repo(
        external_repo,
        cli_args.get('pipeline'),
    )

    noprompt = cli_args.get('noprompt')

    pipeline_partition_set_names = {
        external_partition_set.name: external_partition_set
        for external_partition_set in external_repo.get_external_partition_sets()
        if external_partition_set.pipeline_name == external_pipeline.name
    }

    if not pipeline_partition_set_names:
        raise click.UsageError(
            'No partition sets found for pipeline `{}`'.format(external_pipeline.name)
        )
    partition_set_name = cli_args.get('partition_set')
    if not partition_set_name:
        if len(pipeline_partition_set_names) == 1:
            partition_set_name = next(iter(pipeline_partition_set_names.keys()))
        elif noprompt:
            raise click.UsageError('No partition set specified (see option `--partition-set`)')
        else:
            partition_set_name = click.prompt(
                'Select a partition set to use for backfill: {}'.format(
                    ', '.join(x for x in pipeline_partition_set_names.keys())
                )
            )

    partition_set = pipeline_partition_set_names.get(partition_set_name)

    if not partition_set:
        raise click.UsageError('No partition set found named `{}`'.format(partition_set_name))

    mode = partition_set.mode
    solid_selection = partition_set.solid_selection

    repo_handle = RepositoryHandle(
        repository_name=external_repo.name,
        repository_location_handle=repo_location.location_handle,
    )

    # Resolve partitions to backfill
    partition_names_or_error = repo_location.get_external_partition_names(
        repo_handle,
        partition_set_name,
    )
    if isinstance(partition_names_or_error, ExternalPartitionExecutionErrorData):
        raise DagsterBackfillFailedError(
            'Failure fetching partition names for {partition_set_name}: {error_message}'.format(
                partition_set_name=partition_set_name,
                error_message=partition_names_or_error.error.message,
            ),
            serialized_error_info=partition_names_or_error.error,
        )

    partition_names = gen_partition_names_from_args(
        partition_names_or_error.partition_names, cli_args
    )

    # Print backfill info
    print_fn('\n Pipeline: {}'.format(external_pipeline.name))
    print_fn('Partition set: {}'.format(partition_set_name))
    print_fn(' Partitions: {}\n'.format(print_partition_format(partition_names, indent_level=15)))

    # Confirm and launch
    if noprompt or click.confirm(
        'Do you want to proceed with the backfill ({} partitions)?'.format(len(partition_names))
    ):
        print_fn('Launching runs... ')
        backfill_id = make_new_backfill_id()
        run_tags = merge_dicts(
            PipelineRun.tags_for_backfill_id(backfill_id),
            get_tags_from_args(cli_args),
        )

        for partition_name in partition_names:
            run_config_or_error = repo_location.get_external_partition_config(
                repo_handle, partition_set_name, partition_name
            )
            if isinstance(run_config_or_error, ExternalPartitionExecutionErrorData):
                raise DagsterBackfillFailedError(
                    'Failure fetching run config for partition {partition_name} in '
                    '{partition_set_name}: {error_message}'.format(
                        partition_name=partition_name,
                        partition_set_name=partition_set_name,
                        error_message=run_config_or_error.error.message,
                    ),
                    serialized_error_info=run_config_or_error.error,
                )

            tags_or_error = repo_location.get_external_partition_tags(
                repo_handle, partition_set_name, partition_name
            )
            if isinstance(tags_or_error, ExternalPartitionExecutionErrorData):
                raise DagsterBackfillFailedError(
                    'Failure fetching tags for partition {partition_name} in '
                    '{partition_set_name}: {error_message}'.format(
                        partition_name=partition_name,
                        partition_set_name=partition_set_name,
                        error_message=tags_or_error.error.message,
                    ),
                    serialized_error_info=tags_or_error.error,
                )

            run = _create_external_pipeline_run(
                instance=instance,
                repo_location=repo_location,
                external_repo=external_repo,
                external_pipeline=external_pipeline,
                run_config=run_config_or_error.run_config,
                mode=mode,
                preset=None,
                tags=merge_dicts(tags_or_error.tags, run_tags),
                solid_selection=frozenset(solid_selection) if solid_selection else None,
            )

            instance.launch_run(run.run_id, external_pipeline)
            # Remove once we can handle synchronous execution... currently limited by sqlite
            time.sleep(0.1)

        print_fn('Launched backfill job `{}`'.format(backfill_id))
    else:
        print_fn('Aborted!')