def submit_backfill_runs(instance, repo_location, backfill_job, partition_names=None):
    """Create and submit one pipeline run per partition for a backfill job.

    Args:
        instance (DagsterInstance): the instance runs are submitted to.
        repo_location (RepositoryLocation): location hosting the partition set.
        backfill_job (PartitionBackfill): the backfill being executed.
        partition_names (Optional[list]): subset of partitions to submit;
            defaults to all partitions on the backfill job.

    Returns:
        list: run ids that were actually submitted (skipped partitions excluded).
    """
    check.inst_param(instance, "instance", DagsterInstance)
    check.inst_param(repo_location, "repo_location", RepositoryLocation)
    check.inst_param(backfill_job, "backfill_job", PartitionBackfill)

    origin = backfill_job.partition_set_origin
    repo_name = origin.external_repository_origin.repository_name
    partition_set_name = origin.partition_set_name

    if not partition_names:
        partition_names = backfill_job.partition_names

    check.invariant(
        repo_location.has_repository(repo_name),
        "Could not find repository {repo_name} in location {repo_location_name}".format(
            repo_name=repo_name, repo_location_name=repo_location.name
        ),
    )

    external_repo = repo_location.get_repository(repo_name)
    external_partition_set = external_repo.get_external_partition_set(partition_set_name)

    param_data = repo_location.get_external_partition_set_execution_param_data(
        external_repo.handle, partition_set_name, partition_names
    )
    if isinstance(param_data, ExternalPartitionExecutionErrorData):
        raise DagsterBackfillFailedError(serializable_error_info=param_data.error)

    assert isinstance(param_data, ExternalPartitionSetExecutionParamData)

    external_pipeline = external_repo.get_full_external_pipeline(
        external_partition_set.pipeline_name
    )

    submitted_run_ids = []
    for partition_param in param_data.partition_data:
        run = create_backfill_run(
            instance,
            repo_location,
            external_pipeline,
            external_partition_set,
            backfill_job,
            partition_param,
        )
        # create_backfill_run may decline to create a run — e.g. a `from_failure`
        # backfill whose partition already succeeded after the backfill was
        # scheduled — in which case we skip submission for that partition.
        if not run:
            continue
        instance.submit_run(run.run_id, external_pipeline)
        submitted_run_ids.append(run.run_id)
    return submitted_run_ids
def _execute_backfill_command_at_location(cli_args, print_fn, instance, workspace, repo_location):
    """Resolve backfill CLI options against ``repo_location``, confirm with the
    user, then create a PartitionBackfill job and submit one run per partition.

    Args:
        cli_args (dict): parsed CLI options (repository, pipeline, partition_set,
            noprompt, tag/partition selection args).
        print_fn (callable): sink for user-facing output.
        instance: the DagsterInstance backfill state is persisted to.
        workspace: passed through to ``instance.submit_run``.
        repo_location: the code location hosting the partition set.
    """
    external_repo = get_external_repository_from_repo_location(
        repo_location, cli_args.get("repository")
    )
    external_pipeline = get_external_pipeline_from_external_repo(
        external_repo,
        cli_args.get("pipeline"),
    )
    noprompt = cli_args.get("noprompt")

    # Only partition sets that target the selected pipeline are candidates.
    pipeline_partition_set_names = {
        external_partition_set.name: external_partition_set
        for external_partition_set in external_repo.get_external_partition_sets()
        if external_partition_set.pipeline_name == external_pipeline.name
    }
    if not pipeline_partition_set_names:
        raise click.UsageError(
            "No partition sets found for pipeline `{}`".format(external_pipeline.name)
        )

    partition_set_name = cli_args.get("partition_set")
    if not partition_set_name:
        if len(pipeline_partition_set_names) == 1:
            # Unambiguous: use the only candidate without prompting.
            partition_set_name = next(iter(pipeline_partition_set_names))
        elif noprompt:
            raise click.UsageError("No partition set specified (see option `--partition-set`)")
        else:
            partition_set_name = click.prompt(
                "Select a partition set to use for backfill: {}".format(
                    ", ".join(pipeline_partition_set_names)
                )
            )
    partition_set = pipeline_partition_set_names.get(partition_set_name)
    if not partition_set:
        raise click.UsageError("No partition set found named `{}`".format(partition_set_name))

    run_tags = get_tags_from_args(cli_args)

    repo_handle = RepositoryHandle(
        repository_name=external_repo.name,
        repository_location=repo_location,
    )

    try:
        partition_names_or_error = repo_location.get_external_partition_names(
            repo_handle,
            partition_set_name,
        )
    except Exception as exc:  # pylint: disable=broad-except
        error_info = serializable_error_info_from_exc_info(sys.exc_info())
        # Chain the original exception (`from exc`) so the user-code traceback
        # is preserved instead of being discarded by the re-raise.
        raise DagsterBackfillFailedError(
            "Failure fetching partition names for {partition_set_name}: {error_message}".format(
                partition_set_name=partition_set_name,
                error_message=error_info.message,
            ),
            serialized_error_info=error_info,
        ) from exc

    partition_names = gen_partition_names_from_args(
        partition_names_or_error.partition_names, cli_args
    )

    # Print backfill info
    print_fn("\n Pipeline: {}".format(external_pipeline.name))
    print_fn("Partition set: {}".format(partition_set_name))
    print_fn(" Partitions: {}\n".format(print_partition_format(partition_names, indent_level=15)))

    # Confirm and launch
    if noprompt or click.confirm(
        "Do you want to proceed with the backfill ({} partitions)?".format(len(partition_names))
    ):
        print_fn("Launching runs... ")
        backfill_id = make_new_backfill_id()
        backfill_job = PartitionBackfill(
            backfill_id=backfill_id,
            partition_set_origin=partition_set.get_external_origin(),
            status=BulkActionStatus.REQUESTED,
            partition_names=partition_names,
            from_failure=False,
            reexecution_steps=None,
            tags=run_tags,
            backfill_timestamp=pendulum.now("UTC").timestamp(),
        )
        try:
            partition_execution_data = (
                repo_location.get_external_partition_set_execution_param_data(
                    repository_handle=repo_handle,
                    partition_set_name=partition_set_name,
                    partition_names=partition_names,
                )
            )
        except Exception:  # pylint: disable=broad-except
            # Record the failure on the backfill job instead of raising: the
            # CLI reports it and exits normally.
            error_info = serializable_error_info_from_exc_info(sys.exc_info())
            instance.add_backfill(
                backfill_job.with_status(BulkActionStatus.FAILED).with_error(error_info)
            )
            return print_fn("Backfill failed: {}".format(error_info))

        assert isinstance(partition_execution_data, ExternalPartitionSetExecutionParamData)

        for partition_data in partition_execution_data.partition_data:
            pipeline_run = create_backfill_run(
                instance,
                repo_location,
                external_pipeline,
                partition_set,
                backfill_job,
                partition_data,
            )
            # create_backfill_run may skip a partition (returns a falsy value).
            if pipeline_run:
                instance.submit_run(pipeline_run.run_id, workspace)

        instance.add_backfill(backfill_job.with_status(BulkActionStatus.COMPLETED))
        print_fn("Launched backfill job `{}`".format(backfill_id))
    else:
        print_fn("Aborted!")
def execute_backfill_iteration(instance, workspace, logger, debug_crash_flags=None):
    """Daemon loop body: submit runs for each REQUESTED backfill job in chunks.

    Generator — yields ``None`` after each unit of work (one submitted run, one
    checkpoint update, or one completed/empty iteration) so the daemon can
    interleave other work, and yields a serialized ``error_info`` when a
    backfill fails.

    Args:
        instance (DagsterInstance): source/sink of backfill job state.
        workspace (IWorkspace): used to resolve repository locations.
        logger: daemon logger.
        debug_crash_flags (Optional[dict]): test-only hooks consumed by
            _check_for_debug_crash to simulate crashes at checkpoints.
    """
    check.inst_param(instance, "instance", DagsterInstance)
    check.inst_param(workspace, "workspace", IWorkspace)

    backfill_jobs = instance.get_backfills(status=BulkActionStatus.REQUESTED)

    if not backfill_jobs:
        logger.info("No backfill jobs requested.")
        yield
        return

    for backfill_job in backfill_jobs:
        backfill_id = backfill_job.backfill_id

        # A recorded checkpoint means a previous iteration was interrupted
        # mid-backfill; resume from it rather than starting over.
        if not backfill_job.last_submitted_partition_name:
            logger.info(f"Starting backfill for {backfill_id}")
        else:
            logger.info(
                f"Resuming backfill for {backfill_id} from {backfill_job.last_submitted_partition_name}"
            )

        origin = (
            backfill_job.partition_set_origin.external_repository_origin.repository_location_origin
        )

        try:
            repo_location = workspace.get_location(origin)
            repo_name = backfill_job.partition_set_origin.external_repository_origin.repository_name
            partition_set_name = backfill_job.partition_set_origin.partition_set_name
            if not repo_location.has_repository(repo_name):
                raise DagsterBackfillFailedError(
                    f"Could not find repository {repo_name} in location {repo_location.name} to "
                    f"run backfill {backfill_id}."
                )
            external_repo = repo_location.get_repository(repo_name)
            if not external_repo.has_external_partition_set(partition_set_name):
                raise DagsterBackfillFailedError(
                    f"Could not find partition set {partition_set_name} in repository {repo_name}. "
                )
            has_more = True
            while has_more:
                # refetch the backfill job: another process (e.g. a cancel from
                # the UI) may have changed its status since the last chunk.
                backfill_job = instance.get_backfill(backfill_job.backfill_id)
                if backfill_job.status != BulkActionStatus.REQUESTED:
                    break

                chunk, checkpoint, has_more = _get_partitions_chunk(
                    instance, logger, backfill_job, CHECKPOINT_COUNT
                )
                _check_for_debug_crash(debug_crash_flags, "BEFORE_SUBMIT")

                if chunk:
                    # NOTE(review): passes workspace as the second positional
                    # argument — confirm submit_backfill_runs' signature accepts
                    # it (another version in this file takes 4 params).
                    for _run_id in submit_backfill_runs(
                        instance, workspace, repo_location, backfill_job, chunk
                    ):
                        yield

                _check_for_debug_crash(debug_crash_flags, "AFTER_SUBMIT")

                if has_more:
                    # Persist progress so a crash resumes from this checkpoint.
                    instance.update_backfill(backfill_job.with_partition_checkpoint(checkpoint))
                    yield
                    time.sleep(CHECKPOINT_INTERVAL)
                else:
                    logger.info(
                        f"Backfill completed for {backfill_id} for {len(backfill_job.partition_names)} partitions"
                    )
                    instance.update_backfill(backfill_job.with_status(BulkActionStatus.COMPLETED))
                    yield
        except Exception:
            # Any failure marks the whole backfill FAILED and records the error;
            # the daemon keeps processing the remaining backfill jobs.
            error_info = serializable_error_info_from_exc_info(sys.exc_info())
            instance.update_backfill(
                backfill_job.with_status(BulkActionStatus.FAILED).with_error(error_info)
            )
            logger.error(f"Backfill failed for {backfill_id}: {error_info.to_string()}")
            yield error_info
def _execute_backfill_command_at_location(cli_args, print_fn, instance, repo_location):
    """Resolve backfill CLI options against ``repo_location``, confirm with the
    user, then launch one pipeline run per selected partition.
    """
    external_repo = get_external_repository_from_repo_location(
        repo_location, cli_args.get("repository")
    )
    external_pipeline = get_external_pipeline_from_external_repo(
        external_repo,
        cli_args.get("pipeline"),
    )
    noprompt = cli_args.get("noprompt")

    # Candidate partition sets are those defined against the selected pipeline.
    candidate_partition_sets = {
        ps.name: ps
        for ps in external_repo.get_external_partition_sets()
        if ps.pipeline_name == external_pipeline.name
    }
    if not candidate_partition_sets:
        raise click.UsageError(
            "No partition sets found for pipeline `{}`".format(external_pipeline.name)
        )

    partition_set_name = cli_args.get("partition_set")
    if not partition_set_name:
        if len(candidate_partition_sets) == 1:
            # Exactly one candidate — no prompt needed.
            (partition_set_name,) = candidate_partition_sets
        elif noprompt:
            raise click.UsageError("No partition set specified (see option `--partition-set`)")
        else:
            partition_set_name = click.prompt(
                "Select a partition set to use for backfill: {}".format(
                    ", ".join(x for x in candidate_partition_sets.keys())
                )
            )

    partition_set = candidate_partition_sets.get(partition_set_name)
    if not partition_set:
        raise click.UsageError("No partition set found named `{}`".format(partition_set_name))

    mode = partition_set.mode
    solid_selection = partition_set.solid_selection
    run_tags = get_tags_from_args(cli_args)

    repo_handle = RepositoryHandle(
        repository_name=external_repo.name,
        repository_location_handle=repo_location.location_handle,
    )

    # Resolve partitions to backfill
    names_result = repo_location.get_external_partition_names(
        repo_handle,
        partition_set_name,
    )
    if isinstance(names_result, ExternalPartitionExecutionErrorData):
        raise DagsterBackfillFailedError(
            "Failure fetching partition names for {partition_set_name}: {error_message}".format(
                partition_set_name=partition_set_name,
                error_message=names_result.error.message,
            ),
            serialized_error_info=names_result.error,
        )
    partition_names = gen_partition_names_from_args(names_result.partition_names, cli_args)

    # Print backfill info
    print_fn("\n Pipeline: {}".format(external_pipeline.name))
    print_fn("Partition set: {}".format(partition_set_name))
    print_fn(" Partitions: {}\n".format(print_partition_format(partition_names, indent_level=15)))

    # Confirm and launch
    proceed = noprompt or click.confirm(
        "Do you want to proceed with the backfill ({} partitions)?".format(len(partition_names))
    )
    if not proceed:
        print_fn("Aborted!")
        return

    print_fn("Launching runs... ")
    backfill_id = make_new_backfill_id()
    backfill_tags = PipelineRun.tags_for_backfill_id(backfill_id)

    execution_param_data = repo_location.get_external_partition_set_execution_param_data(
        repository_handle=repo_handle,
        partition_set_name=partition_set_name,
        partition_names=partition_names,
    )
    if isinstance(execution_param_data, ExternalPartitionExecutionErrorData):
        return print_fn("Backfill failed: {}".format(execution_param_data.error))

    assert isinstance(execution_param_data, ExternalPartitionSetExecutionParamData)

    for param_data in execution_param_data.partition_data:
        # CLI tags take precedence over backfill tags, which take precedence
        # over partition tags (later merge_dicts args win).
        pipeline_run = _create_external_pipeline_run(
            instance=instance,
            repo_location=repo_location,
            external_repo=external_repo,
            external_pipeline=external_pipeline,
            run_config=param_data.run_config,
            mode=mode,
            preset=None,
            tags=merge_dicts(merge_dicts(param_data.tags, backfill_tags), run_tags),
            solid_selection=frozenset(solid_selection) if solid_selection else None,
        )
        instance.launch_run(pipeline_run.run_id, external_pipeline)

    print_fn("Launched backfill job `{}`".format(backfill_id))
def execute_backfill_command(cli_args, print_fn, instance=None):
    """Execute the backfill CLI command: pick a partition set, resolve the
    partitions to run, confirm with the user, then launch one run per partition.

    Args:
        cli_args (dict): parsed CLI options (repository, pipeline, partition_set,
            noprompt, tag/partition selection args).
        print_fn (callable): sink for user-facing output.
        instance (Optional[DagsterInstance]): defaults to DagsterInstance.get().
    """
    instance = instance or DagsterInstance.get()
    repo_location = get_repository_location_from_kwargs(cli_args, instance)
    external_repo = get_external_repository_from_repo_location(
        repo_location, cli_args.get('repository'))
    external_pipeline = get_external_pipeline_from_external_repo(
        external_repo,
        cli_args.get('pipeline'),
    )
    noprompt = cli_args.get('noprompt')

    # Only partition sets defined against the selected pipeline are candidates.
    pipeline_partition_set_names = {
        external_partition_set.name: external_partition_set
        for external_partition_set in external_repo.get_external_partition_sets()
        if external_partition_set.pipeline_name == external_pipeline.name
    }

    if not pipeline_partition_set_names:
        raise click.UsageError(
            'No partition sets found for pipeline `{}`'.format(
                external_pipeline.name))
    partition_set_name = cli_args.get('partition_set')
    if not partition_set_name:
        if len(pipeline_partition_set_names) == 1:
            # Unambiguous: use the only candidate without prompting.
            partition_set_name = next(iter(
                pipeline_partition_set_names.keys()))
        elif noprompt:
            raise click.UsageError(
                'No partition set specified (see option `--partition-set`)')
        else:
            partition_set_name = click.prompt(
                'Select a partition set to use for backfill: {}'.format(
                    ', '.join(x for x in pipeline_partition_set_names.keys())))

    partition_set = pipeline_partition_set_names.get(partition_set_name)
    if not partition_set:
        raise click.UsageError(
            'No partition set found named `{}`'.format(partition_set_name))

    mode = partition_set.mode
    solid_selection = partition_set.solid_selection

    repo_handle = RepositoryHandle(
        repository_name=external_repo.name,
        repository_location_handle=repo_location.location_handle,
    )

    # Resolve partitions to backfill
    partition_names_or_error = repo_location.get_external_partition_names(
        repo_handle,
        partition_set_name,
    )
    if isinstance(partition_names_or_error,
                  ExternalPartitionExecutionErrorData):
        raise DagsterBackfillFailedError(
            'Failure fetching partition names for {partition_set_name}: {error_message}'
            .format(
                partition_set_name=partition_set_name,
                error_message=partition_names_or_error.error.message,
            ),
            serialized_error_info=partition_names_or_error.error,
        )

    partition_names = gen_partition_names_from_args(
        partition_names_or_error.partition_names, cli_args)

    # Print backfill info
    print_fn('\n Pipeline: {}'.format(external_pipeline.name))
    print_fn('Partition set: {}'.format(partition_set_name))
    print_fn(' Partitions: {}\n'.format(
        print_partition_format(partition_names, indent_level=15)))

    # Confirm and launch
    if noprompt or click.confirm(
            'Do you want to proceed with the backfill ({} partitions)?'.format(
                len(partition_names))):
        print_fn('Launching runs... ')
        backfill_id = make_new_backfill_id()
        # Every run launched below carries the backfill-id tag plus CLI tags.
        run_tags = merge_dicts(
            PipelineRun.tags_for_backfill_id(backfill_id),
            get_tags_from_args(cli_args),
        )

        for partition_name in partition_names:
            # Fetch run config for this partition from the user-code location;
            # errors come back as data (not exceptions) and are re-raised here.
            run_config_or_error = repo_location.get_external_partition_config(
                repo_handle, partition_set_name, partition_name)
            if isinstance(run_config_or_error,
                          ExternalPartitionExecutionErrorData):
                raise DagsterBackfillFailedError(
                    'Failure fetching run config for partition {partition_name} in {partition_set_name}: {error_message}'
                    .format(
                        partition_name=partition_name,
                        partition_set_name=partition_set_name,
                        error_message=run_config_or_error.error.message,
                    ),
                    serialized_error_info=run_config_or_error.error,
                )
            # Same round-trip for the partition's tags.
            tags_or_error = repo_location.get_external_partition_tags(
                repo_handle, partition_set_name, partition_name)
            if isinstance(tags_or_error, ExternalPartitionExecutionErrorData):
                raise DagsterBackfillFailedError(
                    'Failure fetching tags for partition {partition_name} in {partition_set_name}: {error_message}'
                    .format(
                        partition_name=partition_name,
                        partition_set_name=partition_set_name,
                        error_message=tags_or_error.error.message,
                    ),
                    serialized_error_info=tags_or_error.error,
                )
            run = _create_external_pipeline_run(
                instance=instance,
                repo_location=repo_location,
                external_repo=external_repo,
                external_pipeline=external_pipeline,
                run_config=run_config_or_error.run_config,
                mode=mode,
                preset=None,
                tags=merge_dicts(tags_or_error.tags, run_tags),
                solid_selection=frozenset(solid_selection)
                if solid_selection else None,
            )

            instance.launch_run(run.run_id, external_pipeline)
            # Remove once we can handle synchronous execution... currently limited by sqlite
            time.sleep(0.1)

        print_fn('Launched backfill job `{}`'.format(backfill_id))
    else:
        print_fn('Aborted!')