def pipeline_backfill_command(**kwargs):
    with DagsterInstance.get() as instance:
        execute_backfill_command(kwargs, click.echo, instance)
def debug_heartbeat_dump_command():
    with DagsterInstance.get() as instance:
        for daemon_type in DaemonType:
            click.echo(get_daemon_status(instance, daemon_type))
def _make_airflow_dag(
    handle,
    pipeline_name,
    environment_dict=None,
    mode=None,
    instance=None,
    dag_id=None,
    dag_description=None,
    dag_kwargs=None,
    op_kwargs=None,
    operator=DagsterPythonOperator,
):
    check.inst_param(handle, 'handle', ExecutionTargetHandle)
    check.str_param(pipeline_name, 'pipeline_name')
    environment_dict = check.opt_dict_param(environment_dict, 'environment_dict', key_type=str)
    mode = check.opt_str_param(mode, 'mode')

    # Default to use the (persistent) system temp directory rather than a seven.TemporaryDirectory,
    # which would not be consistent between Airflow task invocations.
    instance = (
        check.inst_param(instance, 'instance', DagsterInstance)
        if instance
        else DagsterInstance.get(fallback_storage=seven.get_system_temp_directory())
    )

    # Only used for Airflow; internally we continue to use pipeline.name
    dag_id = check.opt_str_param(dag_id, 'dag_id', _rename_for_airflow(pipeline_name))

    dag_description = check.opt_str_param(
        dag_description, 'dag_description', _make_dag_description(pipeline_name)
    )
    check.subclass_param(operator, 'operator', BaseOperator)

    # black 18.9b0 doesn't support py27-compatible formatting of the below invocation (omitting
    # the trailing comma after **check.opt_dict_param...) -- black 19.3b0 supports multiple python
    # versions, but currently doesn't know what to do with from __future__ import print_function --
    # see https://github.com/ambv/black/issues/768
    # fmt: off
    dag_kwargs = dict(
        {'default_args': DEFAULT_ARGS},
        **check.opt_dict_param(dag_kwargs, 'dag_kwargs', key_type=str)
    )
    # fmt: on

    op_kwargs = check.opt_dict_param(op_kwargs, 'op_kwargs', key_type=str)

    dag = DAG(dag_id=dag_id, description=dag_description, **dag_kwargs)

    pipeline = handle.build_pipeline_definition()

    if mode is None:
        mode = pipeline.get_default_mode_name()

    execution_plan = create_execution_plan(
        pipeline, environment_dict, run_config=RunConfig(mode=mode)
    )

    tasks = {}

    coalesced_plan = coalesce_execution_steps(execution_plan)

    for solid_handle, solid_steps in coalesced_plan.items():
        step_keys = [step.key for step in solid_steps]

        # We separately construct the Airflow operators here with the appropriate args, because if
        # Airflow gets extraneous args/kwargs it emits a warning every time it parses the DAG (and
        # future Airflow versions will mark this a failure).
        # see https://github.com/ambv/black/issues/768
        # fmt: off
        if operator == DagsterPythonOperator:
            task = operator(
                handle=handle,
                pipeline_name=pipeline_name,
                environment_dict=environment_dict,
                mode=mode,
                task_id=solid_handle,
                step_keys=step_keys,
                dag=dag,
                instance_ref=instance.get_ref(),
                **op_kwargs
            )
        else:
            task = operator(
                pipeline_name=pipeline_name,
                environment_dict=environment_dict,
                mode=mode,
                task_id=solid_handle,
                step_keys=step_keys,
                dag=dag,
                instance_ref=instance.get_ref(),
                **op_kwargs
            )
        # fmt: on

        tasks[solid_handle] = task

        for solid_step in solid_steps:
            for step_input in solid_step.step_inputs:
                for key in step_input.dependency_keys:
                    prev_solid_handle = execution_plan.get_step_by_key(key).solid_handle.to_string()
                    if solid_handle != prev_solid_handle:
                        tasks[prev_solid_handle].set_downstream(task)

    return (dag, [tasks[solid_handle] for solid_handle in coalesced_plan.keys()])
def pipeline_backfill_command(mode, *args, **kwargs):
    pipeline_name = kwargs.pop('pipeline_name')
    repo_args = {k: v for k, v in kwargs.items() if k in REPO_ARG_NAMES}
    if pipeline_name and not isinstance(pipeline_name, six.string_types):
        if len(pipeline_name) == 1:
            pipeline_name = pipeline_name[0]

    instance = DagsterInstance.get()
    handle = handle_for_repo_cli_args(repo_args)
    repository = handle.build_repository_definition()

    if not pipeline_name:
        pipeline_name = click.prompt(
            'Select a pipeline to backfill: {}'.format(', '.join(repository.pipeline_names))
        )

    pipeline = handle.with_pipeline_name(pipeline_name).build_pipeline_definition()
    partition_handle = handle.build_partitions_handle()

    if not pipeline:
        raise click.UsageError('No pipeline found named `{}`'.format(pipeline_name))

    if not partition_handle:
        raise click.UsageError('No partition handle found')

    if kwargs.get('partition_set'):
        partition_set = partition_handle.get_partition_set(kwargs['partition_set'])
    else:
        all_partition_sets = [
            x
            for x in partition_handle.get_all_partition_sets()
            if x.pipeline_name == pipeline.name
        ]
        if not all_partition_sets:
            raise click.UsageError(
                'Pipeline `{}` does not have partition sets defined'.format(pipeline.name)
            )
        if len(all_partition_sets) == 1:
            partition_set = all_partition_sets[0]
        else:
            partition_set_name = click.prompt(
                'Select a partition set to use for backfill: {}'.format(
                    ', '.join(x.name for x in all_partition_sets)
                )
            )
            partition_set = next(
                (x for x in all_partition_sets if x.name == partition_set_name), None
            )
            if not partition_set:
                raise click.UsageError(
                    'No partition set found named `{}`'.format(partition_set_name)
                )

    partitions = gen_partition_slice_from_args(partition_set, kwargs)

    click.echo('\n     Pipeline: {}'.format(pipeline.name))
    click.echo('Partition set: {}'.format(partition_set.name))
    click.echo('   Partitions: {}\n'.format(', '.join([x.name for x in partitions])))

    if click.confirm(
        'Do you want to proceed with the backfill ({} partitions)?'.format(len(partitions))
    ):
        click.echo('Launching')
        from dagster_graphql.launcher import RemoteDagitRunLauncher

        run_launcher = RemoteDagitRunLauncher('http://localhost:3333')

        for partition in partitions:
            run = PipelineRun(
                pipeline_name=pipeline.name,
                run_id=make_new_run_id(),
                selector=ExecutionSelector(pipeline.name),
                environment_dict=partition_set.environment_dict_for_partition(partition),
                mode=mode or 'default',
                tags=partition_set.tags_for_partition(partition),
                status=PipelineRunStatus.NOT_STARTED,
            )
            run = run_launcher.launch_run(run)
    else:
        click.echo(' Aborted!')
def wipe_command():
    with DagsterInstance.get() as instance:
        instance.wipe_daemon_heartbeats()
        click.echo("Daemon heartbeats wiped")
def pipeline_list_command(**kwargs):
    return execute_list_command(kwargs, click.echo, DagsterInstance.get())
def execute_backfill_command(cli_args, print_fn, instance=None):
    pipeline_name = cli_args.pop('pipeline_name')
    repo_args = {k: v for k, v in cli_args.items() if k in REPO_ARG_NAMES}
    if pipeline_name and not isinstance(pipeline_name, six.string_types):
        if len(pipeline_name) == 1:
            pipeline_name = pipeline_name[0]

    instance = instance or DagsterInstance.get()
    handle = handle_for_repo_cli_args(repo_args)
    repository = handle.build_repository_definition()
    noprompt = cli_args.get('noprompt')

    # check run launcher
    if not instance.run_launcher:
        raise click.UsageError(
            'A run launcher must be configured before running a backfill. You can configure a run '
            'launcher (e.g. dagster_graphql.launcher.RemoteDagitRunLauncher) in your instance '
            '`dagster.yaml` settings. See '
            'https://dagster.readthedocs.io/en/latest/sections/deploying/instance.html for more '
            'information.'
        )

    # Resolve pipeline
    if not pipeline_name and noprompt:
        raise click.UsageError('No pipeline specified')
    if not pipeline_name:
        pipeline_name = click.prompt(
            'Select a pipeline to backfill: {}'.format(', '.join(repository.pipeline_names))
        )
    repository = handle.build_repository_definition()
    if not repository.has_pipeline(pipeline_name):
        raise click.UsageError('No pipeline found named `{}`'.format(pipeline_name))

    pipeline = repository.get_pipeline(pipeline_name)

    # Resolve partition set
    all_partition_sets = get_partition_sets_for_handle(handle)
    pipeline_partition_sets = [x for x in all_partition_sets if x.pipeline_name == pipeline.name]
    if not pipeline_partition_sets:
        raise click.UsageError('No partition sets found for pipeline `{}`'.format(pipeline.name))
    partition_set_name = cli_args.get('partition_set')
    if not partition_set_name:
        if len(pipeline_partition_sets) == 1:
            partition_set_name = pipeline_partition_sets[0].name
        elif noprompt:
            raise click.UsageError('No partition set specified (see option `--partition-set`)')
        else:
            partition_set_name = click.prompt(
                'Select a partition set to use for backfill: {}'.format(
                    ', '.join(x.name for x in pipeline_partition_sets)
                )
            )
    partition_set = next((x for x in pipeline_partition_sets if x.name == partition_set_name), None)
    if not partition_set:
        raise click.UsageError('No partition set found named `{}`'.format(partition_set_name))

    # Resolve partitions to backfill
    partitions = gen_partitions_from_args(partition_set, cli_args)

    # Print backfill info
    print_fn('\n     Pipeline: {}'.format(pipeline.name))
    print_fn('Partition set: {}'.format(partition_set.name))
    print_fn('   Partitions: {}\n'.format(print_partition_format(partitions, indent_level=15)))

    # Confirm and launch
    if noprompt or click.confirm(
        'Do you want to proceed with the backfill ({} partitions)?'.format(len(partitions))
    ):
        backfill_tag = ''.join(
            random.choice(string.ascii_lowercase) for x in range(BACKFILL_TAG_LENGTH)
        )
        print_fn('Launching runs... ')
        for partition in partitions:
            run = PipelineRun(
                pipeline_name=pipeline.name,
                run_id=make_new_run_id(),
                selector=ExecutionSelector(pipeline.name),
                environment_dict=partition_set.environment_dict_for_partition(partition),
                mode=cli_args.get('mode') or 'default',
                tags=merge_dicts(
                    {'dagster/backfill': backfill_tag},
                    partition_set.tags_for_partition(partition),
                ),
                status=PipelineRunStatus.NOT_STARTED,
            )
            instance.launch_run(run)
            # Remove once we can handle synchronous execution... currently limited by sqlite
            time.sleep(0.1)

        print_fn('Launched backfill job `{}`'.format(backfill_tag))
    else:
        print_fn(' Aborted!')
from dagster_examples.experimental.repo import daily_rollup_schedule
from dagster import execute_partition_set
from dagster.core.instance import DagsterInstance


def select_weekday_partitions(candidate_partitions):
    SATURDAY = 5
    SUNDAY = 6
    return [
        partition
        for partition in candidate_partitions
        if partition.value.weekday() != SATURDAY and partition.value.weekday() != SUNDAY
    ]


partition_set = daily_rollup_schedule.get_partition_set()
execute_partition_set(partition_set, select_weekday_partitions, instance=DagsterInstance.get())
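# A hedged variant of the weekday selector above: take only the trailing N candidate
# partitions instead of filtering out weekends. This assumes, as the example above does,
# that execute_partition_set accepts any callable mapping the candidate partition list
# to the subset that should run; select_last_n_partitions is a hypothetical helper.
def select_last_n_partitions(candidate_partitions, n=7):
    return candidate_partitions[-n:]


execute_partition_set(
    daily_rollup_schedule.get_partition_set(),
    select_last_n_partitions,
    instance=DagsterInstance.get(),
)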
def _make_airflow_dag(
    handle,
    pipeline_name,
    environment_dict=None,
    mode=None,
    instance=None,
    dag_id=None,
    dag_description=None,
    dag_kwargs=None,
    op_kwargs=None,
    operator=DagsterPythonOperator,
):
    check.inst_param(handle, 'handle', ExecutionTargetHandle)
    check.str_param(pipeline_name, 'pipeline_name')
    environment_dict = check.opt_dict_param(environment_dict, 'environment_dict', key_type=str)
    mode = check.opt_str_param(mode, 'mode')

    # Default to use the (persistent) system temp directory rather than a seven.TemporaryDirectory,
    # which would not be consistent between Airflow task invocations.
    instance = (
        check.inst_param(instance, 'instance', DagsterInstance)
        if instance
        else DagsterInstance.get(fallback_storage=seven.get_system_temp_directory())
    )

    # Only used for Airflow; internally we continue to use pipeline.name
    dag_id = check.opt_str_param(dag_id, 'dag_id', _rename_for_airflow(pipeline_name))

    dag_description = check.opt_str_param(
        dag_description, 'dag_description', _make_dag_description(pipeline_name)
    )
    check.subclass_param(operator, 'operator', BaseOperator)

    dag_kwargs = dict(
        {'default_args': DEFAULT_ARGS},
        **check.opt_dict_param(dag_kwargs, 'dag_kwargs', key_type=str)
    )

    op_kwargs = check.opt_dict_param(op_kwargs, 'op_kwargs', key_type=str)

    dag = DAG(dag_id=dag_id, description=dag_description, **dag_kwargs)

    pipeline = handle.build_pipeline_definition()

    if mode is None:
        mode = pipeline.get_default_mode_name()

    execution_plan = create_execution_plan(pipeline, environment_dict, mode=mode)

    tasks = {}

    coalesced_plan = coalesce_execution_steps(execution_plan)

    for solid_handle, solid_steps in coalesced_plan.items():
        step_keys = [step.key for step in solid_steps]

        operator_parameters = DagsterOperatorParameters(
            handle=handle,
            pipeline_name=pipeline_name,
            environment_dict=environment_dict,
            mode=mode,
            task_id=solid_handle,
            step_keys=step_keys,
            dag=dag,
            instance_ref=instance.get_ref(),
            op_kwargs=op_kwargs,
            pipeline_snapshot=pipeline.get_pipeline_snapshot(),
            execution_plan_snapshot=snapshot_from_execution_plan(
                execution_plan, pipeline_snapshot_id=pipeline.get_pipeline_snapshot_id()
            ),
        )
        task = operator(operator_parameters)

        tasks[solid_handle] = task

        for solid_step in solid_steps:
            for step_input in solid_step.step_inputs:
                for key in step_input.dependency_keys:
                    prev_solid_handle = execution_plan.get_step_by_key(key).solid_handle.to_string()
                    if solid_handle != prev_solid_handle:
                        tasks[prev_solid_handle].set_downstream(task)

    return (dag, [tasks[solid_handle] for solid_handle in coalesced_plan.keys()])
def run_list_command(limit):
    with DagsterInstance.get() as instance:
        for run in instance.get_runs(limit=limit):
            click.echo("Run: {}".format(run.run_id))
            click.echo("     Pipeline: {}".format(run.pipeline_name))
def execute_preview_command(cli_args, print_fn):
    with DagsterInstance.get() as instance:
        with get_external_repository_from_kwargs(cli_args) as external_repo:
            check_repo_and_scheduler(external_repo, instance)
            print_changes(external_repo, instance, print_fn, preview=True)
def execute_backfill_command(cli_args, print_fn, instance=None):
    instance = instance or DagsterInstance.get()
    external_pipeline = get_external_pipeline_from_kwargs(cli_args)
    external_repository = get_external_repository_from_kwargs(cli_args)

    # We should move this to use external repository
    # https://github.com/dagster-io/dagster/issues/2556
    recon_repo = recon_repo_from_external_repo(external_repository)
    repo_def = recon_repo.get_definition()

    noprompt = cli_args.get('noprompt')

    pipeline_def = repo_def.get_pipeline(external_pipeline.name)

    # Resolve partition set
    all_partition_sets = repo_def.partition_set_defs + [
        schedule_def.get_partition_set()
        for schedule_def in repo_def.schedule_defs
        if isinstance(schedule_def, PartitionScheduleDefinition)
    ]

    pipeline_partition_sets = [
        x for x in all_partition_sets if x.pipeline_name == pipeline_def.name
    ]

    if not pipeline_partition_sets:
        raise click.UsageError(
            'No partition sets found for pipeline `{}`'.format(pipeline_def.name)
        )
    partition_set_name = cli_args.get('partition_set')
    if not partition_set_name:
        if len(pipeline_partition_sets) == 1:
            partition_set_name = pipeline_partition_sets[0].name
        elif noprompt:
            raise click.UsageError('No partition set specified (see option `--partition-set`)')
        else:
            partition_set_name = click.prompt(
                'Select a partition set to use for backfill: {}'.format(
                    ', '.join(x.name for x in pipeline_partition_sets)
                )
            )
    partition_set = next((x for x in pipeline_partition_sets if x.name == partition_set_name), None)
    if not partition_set:
        raise click.UsageError('No partition set found named `{}`'.format(partition_set_name))

    # Resolve partitions to backfill
    partitions = gen_partitions_from_args(partition_set, cli_args)

    # Print backfill info
    print_fn('\n     Pipeline: {}'.format(pipeline_def.name))
    print_fn('Partition set: {}'.format(partition_set.name))
    print_fn('   Partitions: {}\n'.format(print_partition_format(partitions, indent_level=15)))

    # Confirm and launch
    if noprompt or click.confirm(
        'Do you want to proceed with the backfill ({} partitions)?'.format(len(partitions))
    ):
        print_fn('Launching runs... ')
        backfill_id = make_new_backfill_id()

        run_tags = merge_dicts(
            PipelineRun.tags_for_backfill_id(backfill_id),
            get_tags_from_args(cli_args),
        )

        for partition in partitions:
            run = instance.create_run_for_pipeline(
                pipeline_def=pipeline_def,
                mode=partition_set.mode,
                solids_to_execute=frozenset(partition_set.solid_selection)
                if partition_set and partition_set.solid_selection
                else None,
                run_config=partition_set.run_config_for_partition(partition),
                tags=merge_dicts(partition_set.tags_for_partition(partition), run_tags),
            )

            instance.launch_run(run.run_id, external_pipeline)
            # Remove once we can handle synchronous execution... currently limited by sqlite
            time.sleep(0.1)

        print_fn('Launched backfill job `{}`'.format(backfill_id))
    else:
        print_fn(' Aborted!')
def _logged_pipeline_launch_command(config, preset_name, mode, instance, kwargs):
    check.inst_param(instance, 'instance', DagsterInstance)
    env = (
        canonicalize_backcompat_args(
            (config if config else None),
            '--config',
            (kwargs.get('env') if kwargs.get('env') else None),
            '--env',
            '0.9.0',
            stacklevel=2,  # this stacklevel can point the warning to this line
        )
        or tuple()  # back to default empty tuple
    )

    env = list(check.opt_tuple_param(env, 'env', default=(), of_type=str))

    external_pipeline = get_external_pipeline_from_kwargs(kwargs)
    # We should move this to use external pipeline
    # https://github.com/dagster-io/dagster/issues/2556
    pipeline = recon_pipeline_from_origin(external_pipeline.get_origin())

    log_repo_stats(instance=instance, pipeline=pipeline, source='pipeline_launch_command')

    if preset_name:
        if env:
            raise click.UsageError('Cannot use --preset with --config.')

        if mode:
            raise click.UsageError('Cannot use --preset with --mode.')

        preset = pipeline.get_definition().get_preset(preset_name)
    else:
        preset = None

    run_tags = get_tags_from_args(kwargs)

    solid_selection = get_solid_selection_from_args(kwargs)

    if preset and preset.solid_selection is not None:
        check.invariant(
            solid_selection is None or solid_selection == preset.solid_selection,
            'The solid_selection set in preset \'{preset}\', {preset_subset}, does not agree with '
            'the `solid_selection` argument: {solid_selection}'.format(
                preset=preset,
                preset_subset=preset.solid_selection,
                solid_selection=solid_selection,
            ),
        )
        solid_selection = preset.solid_selection

    # generate pipeline subset from the given solid_selection
    if solid_selection:
        pipeline = pipeline.subset_for_execution(solid_selection)

    # FIXME need to check the env against run_config
    pipeline_run = instance.create_run_for_pipeline(
        pipeline_def=pipeline.get_definition(),
        solid_selection=solid_selection,
        solids_to_execute=pipeline.solids_to_execute,
        run_config=preset.run_config if preset else load_yaml_from_glob_list(env),
        mode=(preset.mode if preset else mode) or 'default',
        tags=run_tags,
    )

    recon_repo = pipeline.get_reconstructable_repository()

    repo_location = InProcessRepositoryLocation(recon_repo)
    external_pipeline = repo_location.get_repository(
        recon_repo.get_definition().name
    ).get_full_external_pipeline(pipeline.get_definition().name)

    return instance.launch_run(pipeline_run.run_id, external_pipeline)
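# For context, a self-contained sketch of the deprecation-shim pattern behind
# canonicalize_backcompat_args above: prefer the new argument, fall back to the
# deprecated one with a warning, and reject passing both. This is a generic
# illustration of the pattern, not dagster's actual implementation.
import warnings


def _canonicalize_backcompat_args(new_val, new_arg, old_val, old_arg, breaking_version):
    if old_val is None:
        return new_val
    if new_val is not None:
        raise ValueError('Do not pass both {} and {}.'.format(new_arg, old_arg))
    warnings.warn(
        '{old} is deprecated and will be removed in {version}; use {new} instead.'.format(
            old=old_arg, version=breaking_version, new=new_arg
        ),
        DeprecationWarning,
        stacklevel=3,
    )
    return old_val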
def run_wipe_command():
    instance = DagsterInstance.get()
    instance.wipe()
    click.echo('Deleted all run history')
def launch_scheduled_execution(output_file, schedule_name, override_system_timezone, **kwargs):
    with (
        mock_system_timezone(override_system_timezone)
        if override_system_timezone
        else nullcontext()
    ):
        with ipc_write_stream(output_file) as stream:
            with DagsterInstance.get() as instance:
                repository_origin = get_repository_origin_from_kwargs(kwargs)
                job_origin = repository_origin.get_job_origin(schedule_name)

                # open the tick scope before we load any external artifacts so that
                # load errors are stored in DB
                with _schedule_tick_context(
                    instance,
                    stream,
                    JobTickData(
                        job_origin_id=job_origin.get_id(),
                        job_name=schedule_name,
                        job_type=JobType.SCHEDULE,
                        status=JobTickStatus.STARTED,
                        timestamp=time.time(),
                    ),
                ) as tick_context:
                    with get_repository_location_from_kwargs(kwargs) as repo_location:
                        repo_dict = repo_location.get_repositories()
                        check.invariant(
                            repo_dict and len(repo_dict) == 1,
                            "Passed in arguments should reference exactly one repository, "
                            "instead there are {num_repos}".format(num_repos=len(repo_dict)),
                        )
                        external_repo = next(iter(repo_dict.values()))
                        if schedule_name not in [
                            schedule.name
                            for schedule in external_repo.get_external_schedules()
                        ]:
                            raise DagsterInvariantViolationError(
                                "Could not find schedule named {schedule_name}".format(
                                    schedule_name=schedule_name
                                ),
                            )
                        external_schedule = external_repo.get_external_schedule(schedule_name)

                        # Validate that either the schedule has no timezone or it matches
                        # the system timezone
                        schedule_timezone = external_schedule.execution_timezone
                        if schedule_timezone:
                            system_timezone = pendulum.now().timezone.name

                            if system_timezone != external_schedule.execution_timezone:
                                raise DagsterInvariantViolationError(
                                    "Schedule {schedule_name} is set to execute in "
                                    "{schedule_timezone}, but this scheduler can only run in "
                                    "the system timezone, {system_timezone}. Use "
                                    "DagsterDaemonScheduler if you want to be able to execute "
                                    "schedules in arbitrary timezones.".format(
                                        schedule_name=external_schedule.name,
                                        schedule_timezone=schedule_timezone,
                                        system_timezone=system_timezone,
                                    ),
                                )

                        _launch_scheduled_executions(
                            instance, repo_location, external_repo, external_schedule, tick_context
                        )
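# The timezone validation above, isolated from the scheduler machinery. A minimal
# sketch assuming pendulum is installed; the schedule timezone here is a hypothetical
# stand-in for external_schedule.execution_timezone.
import pendulum

schedule_timezone = 'US/Central'  # hypothetical
system_timezone = pendulum.now().timezone.name

if system_timezone != schedule_timezone:
    raise Exception(
        'Schedule expects {}, but this scheduler runs in the system timezone {}.'.format(
            schedule_timezone, system_timezone
        )
    )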
def run_list_command():
    instance = DagsterInstance.get()
    for run in instance.get_runs():
        click.echo("Run: {}".format(run.run_id))
        click.echo("     Pipeline: {}".format(run.pipeline_name))
def pipeline_launch_command(config, preset, mode, **kwargs):
    return _logged_pipeline_launch_command(config, preset, mode, DagsterInstance.get(), kwargs)
def execute_preview_command(
    sensor_name, since, last_run_key, cursor, cli_args, print_fn, instance=None
):
    with DagsterInstance.get() as instance:
        with get_repository_location_from_kwargs(
            instance,
            version=dagster_version,
            kwargs=cli_args,
        ) as repo_location:
            try:
                external_repo = get_external_repository_from_repo_location(
                    repo_location, cli_args.get("repository")
                )
                check_repo_and_scheduler(external_repo, instance)
                external_sensor = external_repo.get_external_sensor(sensor_name)
                try:
                    sensor_runtime_data = repo_location.get_external_sensor_execution_data(
                        instance,
                        external_repo.handle,
                        external_sensor.name,
                        since,
                        last_run_key,
                        cursor,
                    )
                except Exception:
                    error_info = serializable_error_info_from_exc_info(sys.exc_info())
                    print_fn(
                        "Failed to resolve sensor for {sensor_name} : {error_info}".format(
                            sensor_name=external_sensor.name,
                            error_info=error_info.to_string(),
                        )
                    )
                    return

                if not sensor_runtime_data.run_requests:
                    if sensor_runtime_data.skip_message:
                        print_fn(
                            "Sensor returned false for {sensor_name}, skipping: "
                            "{skip_message}".format(
                                sensor_name=external_sensor.name,
                                skip_message=sensor_runtime_data.skip_message,
                            )
                        )
                    else:
                        print_fn(
                            "Sensor returned false for {sensor_name}, skipping".format(
                                sensor_name=external_sensor.name
                            )
                        )
                else:
                    print_fn(
                        "Sensor returning run requests for {num} run(s):\n\n{run_requests}".format(
                            num=len(sensor_runtime_data.run_requests),
                            run_requests="\n".join(
                                yaml.safe_dump(run_request.run_config, default_flow_style=False)
                                for run_request in sensor_runtime_data.run_requests
                            ),
                        )
                    )
            except DagsterInvariantViolationError as ex:
                raise click.UsageError(str(ex))
def execute_backfill_command(cli_args, print_fn, instance=None):
    instance = instance or DagsterInstance.get()
    repo_location = get_repository_location_from_kwargs(cli_args, instance)
    external_repo = get_external_repository_from_repo_location(
        repo_location, cli_args.get('repository')
    )

    external_pipeline = get_external_pipeline_from_external_repo(
        external_repo,
        cli_args.get('pipeline'),
    )

    noprompt = cli_args.get('noprompt')

    pipeline_partition_set_names = {
        external_partition_set.name: external_partition_set
        for external_partition_set in external_repo.get_external_partition_sets()
        if external_partition_set.pipeline_name == external_pipeline.name
    }

    if not pipeline_partition_set_names:
        raise click.UsageError(
            'No partition sets found for pipeline `{}`'.format(external_pipeline.name)
        )
    partition_set_name = cli_args.get('partition_set')
    if not partition_set_name:
        if len(pipeline_partition_set_names) == 1:
            partition_set_name = next(iter(pipeline_partition_set_names.keys()))
        elif noprompt:
            raise click.UsageError('No partition set specified (see option `--partition-set`)')
        else:
            partition_set_name = click.prompt(
                'Select a partition set to use for backfill: {}'.format(
                    ', '.join(x for x in pipeline_partition_set_names.keys())
                )
            )

    partition_set = pipeline_partition_set_names.get(partition_set_name)

    if not partition_set:
        raise click.UsageError('No partition set found named `{}`'.format(partition_set_name))

    mode = partition_set.mode
    solid_selection = partition_set.solid_selection

    repo_handle = RepositoryHandle(
        repository_name=external_repo.name,
        repository_location_handle=repo_location.location_handle,
    )

    # Resolve partitions to backfill
    partition_names_or_error = repo_location.get_external_partition_names(
        repo_handle,
        partition_set_name,
    )

    if isinstance(partition_names_or_error, ExternalPartitionExecutionErrorData):
        raise DagsterBackfillFailedError(
            'Failure fetching partition names for {partition_set_name}: {error_message}'.format(
                partition_set_name=partition_set_name,
                error_message=partition_names_or_error.error.message,
            ),
            serialized_error_info=partition_names_or_error.error,
        )

    partition_names = gen_partition_names_from_args(
        partition_names_or_error.partition_names, cli_args
    )

    # Print backfill info
    print_fn('\n     Pipeline: {}'.format(external_pipeline.name))
    print_fn('Partition set: {}'.format(partition_set_name))
    print_fn('   Partitions: {}\n'.format(print_partition_format(partition_names, indent_level=15)))

    # Confirm and launch
    if noprompt or click.confirm(
        'Do you want to proceed with the backfill ({} partitions)?'.format(len(partition_names))
    ):
        print_fn('Launching runs... ')
        backfill_id = make_new_backfill_id()

        run_tags = merge_dicts(
            PipelineRun.tags_for_backfill_id(backfill_id),
            get_tags_from_args(cli_args),
        )

        for partition_name in partition_names:
            run_config_or_error = repo_location.get_external_partition_config(
                repo_handle, partition_set_name, partition_name
            )
            if isinstance(run_config_or_error, ExternalPartitionExecutionErrorData):
                raise DagsterBackfillFailedError(
                    'Failure fetching run config for partition {partition_name} in '
                    '{partition_set_name}: {error_message}'.format(
                        partition_name=partition_name,
                        partition_set_name=partition_set_name,
                        error_message=run_config_or_error.error.message,
                    ),
                    serialized_error_info=run_config_or_error.error,
                )

            tags_or_error = repo_location.get_external_partition_tags(
                repo_handle, partition_set_name, partition_name
            )
            if isinstance(tags_or_error, ExternalPartitionExecutionErrorData):
                raise DagsterBackfillFailedError(
                    'Failure fetching tags for partition {partition_name} in '
                    '{partition_set_name}: {error_message}'.format(
                        partition_name=partition_name,
                        partition_set_name=partition_set_name,
                        error_message=tags_or_error.error.message,
                    ),
                    serialized_error_info=tags_or_error.error,
                )

            run = _create_external_pipeline_run(
                instance=instance,
                repo_location=repo_location,
                external_repo=external_repo,
                external_pipeline=external_pipeline,
                run_config=run_config_or_error.run_config,
                mode=mode,
                preset=None,
                tags=merge_dicts(tags_or_error.tags, run_tags),
                solid_selection=frozenset(solid_selection) if solid_selection else None,
            )

            instance.launch_run(run.run_id, external_pipeline)
            # Remove once we can handle synchronous execution... currently limited by sqlite
            time.sleep(0.1)

        print_fn('Launched backfill job `{}`'.format(backfill_id))
    else:
        print_fn('Aborted!')
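# The isinstance checks against ExternalPartitionExecutionErrorData above follow a
# result-or-error-data protocol: calls that cross into user code return either a
# payload or a serializable error record instead of raising across the process
# boundary. A generic sketch of the same idea, with hypothetical types:
from collections import namedtuple

PartitionNamesData = namedtuple('PartitionNamesData', 'partition_names')
ExecutionErrorData = namedtuple('ExecutionErrorData', 'error')


def unwrap_partition_names(result):
    # Raise locally only once the error record has crossed the boundary intact.
    if isinstance(result, ExecutionErrorData):
        raise Exception('Failure fetching partition names: {}'.format(result.error))
    return result.partition_names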
def pipeline_list_versions_command(**kwargs):
    with DagsterInstance.get() as instance:
        execute_list_versions_command(instance, kwargs)
def _make_airflow_dag(
    recon_repo,
    job_name,
    run_config=None,
    mode=None,
    instance=None,
    dag_id=None,
    dag_description=None,
    dag_kwargs=None,
    op_kwargs=None,
    operator=DagsterPythonOperator,
):
    check.inst_param(recon_repo, "recon_repo", ReconstructableRepository)
    check.str_param(job_name, "job_name")
    run_config = check.opt_dict_param(run_config, "run_config", key_type=str)
    mode = check.opt_str_param(mode, "mode")

    # Default to use the (persistent) system temp directory rather than a TemporaryDirectory,
    # which would not be consistent between Airflow task invocations.
    if instance is None:
        if is_dagster_home_set():
            instance = DagsterInstance.get()
        else:
            instance = DagsterInstance.local_temp(tempdir=seven.get_system_temp_directory())

    check.inst_param(instance, "instance", DagsterInstance)

    # Only used for Airflow; internally we continue to use pipeline.name
    dag_id = check.opt_str_param(dag_id, "dag_id", _rename_for_airflow(job_name))

    dag_description = check.opt_str_param(
        dag_description, "dag_description", _make_dag_description(job_name)
    )
    check.subclass_param(operator, "operator", BaseOperator)

    dag_kwargs = dict(
        {"default_args": DEFAULT_ARGS},
        **check.opt_dict_param(dag_kwargs, "dag_kwargs", key_type=str),
    )

    op_kwargs = check.opt_dict_param(op_kwargs, "op_kwargs", key_type=str)

    dag = DAG(dag_id=dag_id, description=dag_description, **dag_kwargs)

    pipeline = recon_repo.get_definition().get_pipeline(job_name)

    if mode is None:
        mode = pipeline.get_default_mode_name()

    execution_plan = create_execution_plan(pipeline, run_config, mode=mode)

    tasks = {}

    coalesced_plan = coalesce_execution_steps(execution_plan)

    for solid_handle, solid_steps in coalesced_plan.items():
        step_keys = [step.key for step in solid_steps]

        operator_parameters = DagsterOperatorParameters(
            recon_repo=recon_repo,
            pipeline_name=job_name,
            run_config=run_config,
            mode=mode,
            task_id=solid_handle,
            step_keys=step_keys,
            dag=dag,
            instance_ref=instance.get_ref(),
            op_kwargs=op_kwargs,
            pipeline_snapshot=pipeline.get_pipeline_snapshot(),
            execution_plan_snapshot=snapshot_from_execution_plan(
                execution_plan, pipeline_snapshot_id=pipeline.get_pipeline_snapshot_id()
            ),
        )
        task = operator(operator_parameters)

        tasks[solid_handle] = task

        for solid_step in solid_steps:
            for step_input in solid_step.step_inputs:
                for key in step_input.dependency_keys:
                    prev_solid_handle = execution_plan.get_step_by_key(key).solid_handle.to_string()
                    if solid_handle != prev_solid_handle:
                        tasks[prev_solid_handle].set_downstream(task)

    return (dag, [tasks[solid_handle] for solid_handle in coalesced_plan.keys()])
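# A minimal usage sketch for the helper above. The module path, repository function
# name, and job name are hypothetical; it assumes ReconstructableRepository.for_module
# is available to point at a repository defined in an importable module, as in
# dagster of this vintage.
from dagster.core.definitions.reconstructable import ReconstructableRepository

recon_repo = ReconstructableRepository.for_module('my_company.repo', 'my_repository')

dag, tasks = _make_airflow_dag(
    recon_repo=recon_repo,
    job_name='daily_rollup',
    run_config={},  # hypothetical run config
    mode='default',
)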
def pipeline_execute_command(**kwargs):
    with capture_interrupts():
        with DagsterInstance.get() as instance:
            execute_execute_command(instance, kwargs)
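# A generic sketch of the pattern capture_interrupts above provides: defer
# KeyboardInterrupt until the block exits so teardown (like closing the instance
# context manager) runs cleanly. This illustrates the idea, not dagster's
# implementation.
import signal
from contextlib import contextmanager


@contextmanager
def _defer_interrupts():
    interrupted = []
    previous_handler = signal.signal(
        signal.SIGINT, lambda signum, frame: interrupted.append(True)
    )
    try:
        yield
    finally:
        signal.signal(signal.SIGINT, previous_handler)
        if interrupted:
            raise KeyboardInterrupt()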
def execute_backfill_command(cli_args, print_fn, instance=None):
    pipeline_name = cli_args.pop('pipeline_name')
    repo_args = {k: v for k, v in cli_args.items() if k in REPO_ARG_NAMES}
    if pipeline_name and not isinstance(pipeline_name, six.string_types):
        if len(pipeline_name) == 1:
            pipeline_name = pipeline_name[0]

    instance = instance or DagsterInstance.get()
    handle = recon_repo_for_cli_args(repo_args)
    repository = handle.get_definition()
    noprompt = cli_args.get('noprompt')

    # check run launcher
    if not instance.run_launcher:
        raise click.UsageError(
            'A run launcher must be configured before running a backfill. You can configure a run '
            'launcher (e.g. dagster_graphql.launcher.RemoteDagitRunLauncher) in your instance '
            '`dagster.yaml` settings. See '
            'https://docs.dagster.io/docs/deploying/instance/ for more '
            'information.'
        )

    # Resolve pipeline
    if not pipeline_name and noprompt:
        raise click.UsageError('No pipeline specified')
    if not pipeline_name:
        pipeline_name = click.prompt(
            'Select a pipeline to backfill: {}'.format(', '.join(repository.pipeline_names))
        )
    repository = handle.get_definition()
    if not repository.has_pipeline(pipeline_name):
        raise click.UsageError('No pipeline found named `{}`'.format(pipeline_name))

    pipeline = repository.get_pipeline(pipeline_name)

    # Resolve partition set
    all_partition_sets = get_partition_sets_for_handle(handle)
    pipeline_partition_sets = [x for x in all_partition_sets if x.pipeline_name == pipeline.name]
    if not pipeline_partition_sets:
        raise click.UsageError('No partition sets found for pipeline `{}`'.format(pipeline.name))
    partition_set_name = cli_args.get('partition_set')
    if not partition_set_name:
        if len(pipeline_partition_sets) == 1:
            partition_set_name = pipeline_partition_sets[0].name
        elif noprompt:
            raise click.UsageError('No partition set specified (see option `--partition-set`)')
        else:
            partition_set_name = click.prompt(
                'Select a partition set to use for backfill: {}'.format(
                    ', '.join(x.name for x in pipeline_partition_sets)
                )
            )
    partition_set = next((x for x in pipeline_partition_sets if x.name == partition_set_name), None)
    if not partition_set:
        raise click.UsageError('No partition set found named `{}`'.format(partition_set_name))

    # Resolve partitions to backfill
    partitions = gen_partitions_from_args(partition_set, cli_args)

    # Print backfill info
    print_fn('\n     Pipeline: {}'.format(pipeline.name))
    print_fn('Partition set: {}'.format(partition_set.name))
    print_fn('   Partitions: {}\n'.format(print_partition_format(partitions, indent_level=15)))

    # Confirm and launch
    if noprompt or click.confirm(
        'Do you want to proceed with the backfill ({} partitions)?'.format(len(partitions))
    ):
        print_fn('Launching runs... ')
        backfill_id = make_new_backfill_id()

        run_tags = merge_dicts(
            PipelineRun.tags_for_backfill_id(backfill_id),
            get_tags_from_args(cli_args),
        )

        for partition in partitions:
            run = instance.create_run_for_pipeline(
                pipeline_def=pipeline,
                mode=partition_set.mode,
                solid_subset=partition_set.solid_subset,
                environment_dict=partition_set.environment_dict_for_partition(partition),
                tags=merge_dicts(partition_set.tags_for_partition(partition), run_tags),
            )

            instance.launch_run(run.run_id)
            # Remove once we can handle synchronous execution... currently limited by sqlite
            time.sleep(0.1)

        print_fn('Launched backfill job `{}`'.format(backfill_id))
    else:
        print_fn(' Aborted!')
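# For reference, a hedged sketch of how a backfill command like the ones above is
# typically invoked from the shell. The subcommand path and the pipeline / partition-set
# names are hypothetical; `--partition-set`, `--noprompt`, and `--mode` correspond to
# the cli_args keys read in the code above.
#
#   dagster pipeline backfill my_pipeline --partition-set my_daily_partitions --noprompt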
def pipeline_launch_command(**kwargs):
    with DagsterInstance.get() as instance:
        return execute_launch_command(instance, kwargs)
def debug_heartbeat_command():
    with DagsterInstance.get() as instance:
        debug_daemon_heartbeats(instance)
def pipeline_list_command(**kwargs):
    with DagsterInstance.get() as instance:
        return execute_list_command(kwargs, click.echo, instance)
def pipeline_print_command(verbose, **cli_args):
    with DagsterInstance.get() as instance:
        return execute_print_command(verbose, cli_args, click.echo, instance)
def run_command(**kwargs):
    with capture_interrupts():
        with DagsterInstance.get() as instance:
            _daemon_run_command(instance, kwargs)