from dagster.core.execution.context_creation_pipeline import pipeline_initialization_manager
from dagster.core.execution.plan.execute_step import core_dagster_event_sequence_for_step
from dagster.core.execution.retries import Retries
from dagster.core.instance import DagsterInstance
from dagster.core.storage.file_manager import LocalFileHandle

# File names used for the pickled events and the pickled step run ref.
PICKLED_EVENTS_FILE_NAME = 'events.pkl'
PICKLED_STEP_RUN_REF_FILE_NAME = 'step_run_ref.pkl'


# Resource factory: the schema declares a single 'scratch_dir' entry, and the
# whole resource config is forwarded as keyword arguments to the launcher.
@resource(
    config_schema={
        'scratch_dir': Field(
            StringSource,
            description=
            'Directory used to pass files between the plan process and step process.',
        ),
    },
)
def local_external_step_launcher(context):
    return LocalExternalStepLauncher(**context.resource_config)


class LocalExternalStepLauncher(StepLauncher):
    '''Launches each step in its own local process, outside the plan process.'''

    def __init__(self, scratch_dir):
        # check.str_param validates the argument and returns it for storage.
        self.scratch_dir = check.str_param(scratch_dir, 'scratch_dir')

    def launch_step(self, step_context, prior_attempts_count):
        # Build a step run ref from the step context and the prior attempt count.
        step_run_ref = step_context_to_step_run_ref(step_context, prior_attempts_count)
def _multiple_required_fields_config_permissive_dict():
    """Return a ``Field`` over a ``Permissive`` dict declaring two ``String`` fields.

    The declared keys are ``field_one`` and ``field_two``.
    """
    declared_fields = {
        "field_one": Field(String),
        "field_two": Field(String),
    }
    permissive_type = Permissive(declared_fields)
    return Field(permissive_type)
def test_noop_config():
    """A ``Field`` built over ``Any`` must be constructible and truthy."""
    any_field = Field(Any)
    assert any_field
def test_compute_fields_hash():
    """``_hash`` applied to a one-field dict must produce a ``str``."""
    single_int_field = {"some_int": Field(int)}
    digest = _hash(single_int_field)
    assert isinstance(digest, str)
def _single_optional_string_config_dict():
    """Convert a dict holding one non-required ``String`` field.

    The dict maps ``optional_field`` to the field and is passed through
    ``convert_potential_field``; that call's result is returned.
    """
    not_required_string = Field(String, is_required=False)
    return convert_potential_field({"optional_field": not_required_string})
from .decorators import pyspark_solid
from .resources import (
    PySparkResourceDefinition,
    pyspark_resource,
    spark_session_from_config,
    spark_session_resource,
)


# Input selector with a single 'csv' branch. The function receives the selected
# branch name as `file_type` and that branch's options as `file_options`.
# NOTE(review): the schema declares 'header', but only 'path' and 'sep' are
# read below -- confirm whether 'header' is meant to be forwarded.
@input_selector_schema(
    Selector({
        'csv': Field(
            Dict({
                'path': Field(Path),
                'sep': Field(String, is_optional=True),
                'header': Field(Bool, is_optional=True),
            }))
    }))
def load_rdd(context, file_type, file_options):
    if file_type == 'csv':
        # Read the CSV through the 'spark' resource and return the `.rdd` of
        # the result; `sep` is None when the option is absent.
        return context.resources.spark.read.csv(
            file_options['path'], sep=file_options.get('sep')).rdd
    else:
        # Any other branch name is reported through check.failed.
        check.failed('Unsupported file type: {}'.format(file_type))


@output_selector_schema(
    Selector({
        'csv': Field(
def test_construct_different_selectors():
    """Selectors over different fields are distinct objects with distinct keys."""
    over_int = Selector(fields={"an_int": Field(int)})
    over_str = Selector(fields={"a_string": Field(str)})

    assert over_int is not over_str
    assert over_int.key != over_str.key
InputDefinition, Int, ModeDefinition, RepositoryDefinition, String, lambda_solid, pipeline, solid, )
from dagster_aws.s3.resources import s3_resource
from dagster_aws.s3.system_storage import s3_plus_default_storage_defs


# Repeats the input word `factor` times, where `factor` comes from solid config.
@solid(input_defs=[InputDefinition('word', String)], config={'factor': Field(Int)})
def multiply_the_word(context, word):
    return word * context.solid_config['factor']


# Counts how many times each letter occurs in `word`; returns a plain dict.
@lambda_solid(input_defs=[InputDefinition('word')])
def count_letters(word):
    counts = defaultdict(int)
    for letter in word:
        counts[letter] += 1
    # Convert to a regular dict so the defaultdict behaviour does not leak out.
    return dict(counts)


# Always raises.
@lambda_solid()
def error_solid():
    raise Exception('Unusual error')
logger = S3Logger(context.log.debug, bucket, key, target_file, int(headers['ContentLength'])) session.download_file(Bucket=bucket, Key=key, Filename=target_file, Callback=logger) return target_file # This should be ported to use FileHandle-based solids. # See https://github.com/dagster-io/dagster/issues/1476 @solid( name='download_from_s3_to_file', config={ 'bucket': Field(String, description='S3 bucket name'), 'key': Field(String, description='S3 key name'), 'target_folder': Field(Path, description=( 'Specifies the path at which to download the object.')), 'skip_if_present': Field(Bool, is_required=False, default_value=False), }, description='Downloads an object from S3 to a file.', output_defs=[ OutputDefinition(FileExistsAtPath, description='The path to the downloaded object.') ], required_resource_keys={'s3'},
def test_config_with_and_without_config():
    """Execute a pipeline around a configurable composite solid twice.

    The first run supplies an explicit ``prefix`` for the composite; the second
    supplies an empty config so the composite's declared default is used. Each
    run must succeed and ``print_value`` must output the prefix followed by the
    input value.
    """

    @solid(config_schema={"prefix": Field(str, is_required=False, default_value="_")})
    def prefix_value(context, v):
        return "{prefix}{v}".format(prefix=context.solid_config["prefix"], v=v)

    def _forward_prefix(cfg):
        # Map the composite's config onto the config of the wrapped solid.
        return {"prefix_value": {"config": {"prefix": cfg["prefix"]}}}

    @composite_solid(
        config_fn=_forward_prefix,
        config_schema={"prefix": Field(str, is_required=False, default_value="_id_")},
    )
    def prefix_id(val):
        return prefix_value(val)

    @solid
    def print_value(_, v):
        return str(v)

    @pipeline
    def config_issue_pipeline():
        prefixed = prefix_id()
        print_value(prefixed)

    def _run_config(composite_config):
        # Run config for the composite: its own config plus the literal input.
        return {
            "solids": {
                "prefix_id": {
                    "config": composite_config,
                    "inputs": {"val": {"value": "12345"}},
                }
            }
        }

    result = execute_pipeline(
        config_issue_pipeline,
        _run_config({"prefix": "_customprefix_"}),
    )
    assert result.success
    custom_output = result.result_for_solid("print_value").output_value()
    assert custom_output == "_customprefix_12345"

    result_using_default = execute_pipeline(
        config_issue_pipeline,
        _run_config({}),
    )
    assert result_using_default.success
    default_output = result_using_default.result_for_solid("print_value").output_value()
    assert default_output == "_id_12345"
# pylint: disable=no-value-for-parameter
import collections

from dagster import Field, Int, lambda_solid, solid, pipeline, as_dagster_type

# collections.Counter wrapped so it can be used as a solid output type.
Counter = as_dagster_type(collections.Counter)


# Repeats `word` as many times as the 'factor' entry of the solid config says.
@solid(config={'factor': Field(Int)})
def multiply_the_word(context, word: str) -> str:
    factor = context.solid_config['factor']
    return word * factor


# Tallies the characters of `word`.
@lambda_solid
def count_letters(word: str) -> Counter:
    letter_counts = collections.Counter(word)
    return letter_counts


# Feeds the multiplied word into the letter counter.
@pipeline
def configuration_schema_pipeline():
    repeated_word = multiply_the_word()
    return count_letters(repeated_word)
def test_map_shape_complex():
    """Validate string-keyed maps of ``{name, number}`` shapes.

    Both the long form (``Map(str, Shape(...))``) and the short form (a dict
    keyed by ``str``) must return the input unchanged. A non-integer ``number``
    or a non-shape map value must make ``_validate`` raise ``AssertionError``.
    """

    def _long_form():
        return Field(Map(str, Shape({"name": Field(str), "number": Field(int)})))

    def _entries():
        # A fresh dict on every call so input and expected value never alias.
        return {
            "foo": {"name": "test_name", "number": 5},
            "bar": {"name": "other_name", "number": 10},
        }

    # Long form
    assert _validate(_long_form(), _entries()) == _entries()

    # Short form
    assert _validate(
        Field({str: {"name": Field(str), "number": Field(int)}}),
        _entries(),
    ) == _entries()

    with pytest.raises(AssertionError):
        _validate(
            _long_form(),
            {
                "foo": {"name": "test_name", "number": "not_a_number"},
                "bar": {"name": "other_name", "number": 10},
            },
        )

    with pytest.raises(AssertionError):
        _validate(
            _long_form(),
            {
                "foo": {"name": "test_name", "number": 15},
                "baz": "not_a_shape",
            },
        )
def test_permissive_defaults():
    """A default declared inside a ``Permissive`` schema reaches the solid config."""
    schema = Permissive({"four": Field(int, default_value=4)})

    @solid(config_schema=schema)
    def perm_with_defaults(context):
        assert context.solid_config["four"] == 4

    result = execute_solid(perm_with_defaults)
    assert result.success
from dagster.utils import safe_tempfile_path

try:
    import _thread as thread
except ImportError:
    import thread


def _send_kbd_int(temp_files):
    """Poll until every path in ``temp_files`` exists, then interrupt the main thread."""
    while True:
        if all(os.path.exists(path) for path in temp_files):
            break
        time.sleep(0.1)
    thread.interrupt_main()


# Writes 'yup' to the configured path and then sleeps in a loop forever.
@solid(config={'tempfile': Field(String)})
def write_a_file(context):
    target_path = context.solid_config['tempfile']
    with open(target_path, 'w') as handle:
        handle.write('yup')

    while True:
        time.sleep(0.1)


# Four aliased invocations of write_a_file.
@pipeline
def write_files_pipeline():
    for alias_name in ('write_1', 'write_2', 'write_3', 'write_4'):
        write_a_file.alias(alias_name)()
def resource_init(init_context):
    """Return an ``ErrorableResource``, or raise if the config asks for it.

    Raises:
        Exception: when ``throw_on_resource_init`` is truthy in the resource config.
    """
    should_throw = init_context.resource_config['throw_on_resource_init']
    if should_throw:
        raise Exception('throwing from in resource_fn')
    return ErrorableResource()


def define_errorable_resource():
    """Build the ``ResourceDefinition`` backed by ``resource_init``.

    Its config is a dict with one ``Bool`` entry, ``throw_on_resource_init``.
    """
    config_shape = Dict({'throw_on_resource_init': Field(Bool)})
    return ResourceDefinition(
        resource_fn=resource_init,
        config_field=Field(config_shape),
    )


# Config shared by the erroring solid: two Bool switches.
solid_throw_config = Field(
    Dict(
        fields={
            'throw_in_solid': Field(Bool),
            'return_wrong_type': Field(Bool),
        }
    )
)


# Emits 13, unless config asks it to raise or to return a str instead.
@solid(
    name='emit_num',
    output_defs=[OutputDefinition(Int)],
    config_field=solid_throw_config,
)
def emit_num(context):
    if context.solid_config['throw_in_solid']:
        raise Exception('throwing from in the solid')

    wrong_type_requested = context.solid_config['return_wrong_type']
    return 'wow' if wrong_type_requested else 13
from dagster.config.field_utils import Selector
from dagster.core.types.config_schema import input_selector_schema, output_selector_schema

# Set of two constraint classes.
# NOTE(review): how this set is consumed is not visible here.
CONSTRAINT_BLACKLIST = {ColumnDTypeFnConstraint, ColumnDTypeInSetConstraint}


def dict_without_keys(ddict, *keys):
    """Return a new dict with the entries of ``ddict`` whose key is not in ``keys``."""
    return {key: value for key, value in ddict.items() if key not in set(keys)}


# Output selector with 'csv', 'parquet' and 'table' branches. Each takes a
# 'path'; 'csv' additionally takes an optional 'sep' defaulting to ','.
@output_selector_schema(
    Selector(
        {
            'csv': {
                'path': StringSource,
                'sep': Field(
                    StringSource, is_required=False, default_value=','),
            },
            'parquet': {
                'path': StringSource
            },
            'table': {
                'path': StringSource
            },
        },
    ))
def dataframe_output_schema(_context, file_type, file_options, pandas_df):
    # Validate argument types up front.
    check.str_param(file_type, 'file_type')
    check.dict_param(file_options, 'file_options')
    check.inst_param(pandas_df, 'pandas_df', pd.DataFrame)

    if file_type == 'csv':
        path = file_options['path']
def define_errorable_resource():
    """Build the ``ResourceDefinition`` backed by ``resource_init``.

    Its config is a dict with one ``Bool`` entry, ``throw_on_resource_init``.
    """
    config_shape = Dict({'throw_on_resource_init': Field(Bool)})
    return ResourceDefinition(
        resource_fn=resource_init,
        config_field=Field(config_shape),
    )
if not isinstance(value, list): return False fields = [field for field in value[0].keys()] for i in range(len(value)): row = value[i] if not isinstance(row, dict): return False row_fields = [field for field in row.keys()] if fields != row_fields: return False return True @input_hydration_config(Selector({'csv': Field(String)})) def less_simple_data_frame_input_hydration_config(context, selector): with open(selector['csv'], 'r') as fd: lines = [row for row in csv.DictReader(fd)] context.log.info('Read {n_lines} lines'.format(n_lines=len(lines))) return lines if typing.TYPE_CHECKING: LessSimpleDataFrame = list else: LessSimpleDataFrame = DagsterType( name='LessSimpleDataFrame', description= 'A more sophisticated data frame that type checks its structure.',
def test_hash_diff():
    """``_hash`` must differ whenever the field dicts differ.

    Covered differences: field name, field type, ``is_required``,
    ``default_value`` and ``description``.
    """
    differing_pairs = [
        ({"some_int": Field(int)}, {"another_int": Field(int)}),
        ({"same_name": Field(int)}, {"same_name": Field(str)}),
        ({"same_name": Field(int)}, {"same_name": Field(int, is_required=False)}),
        (
            {"same_name": Field(int)},
            {"same_name": Field(int, is_required=False, default_value=2)},
        ),
        (
            {"same_name": Field(int, is_required=False)},
            {"same_name": Field(int, is_required=False, default_value=2)},
        ),
        ({"same_name": Field(int)}, {"same_name": Field(int, description="desc")}),
    ]

    for left, right in differing_pairs:
        assert _hash(left) != _hash(right)
EventMetadataEntry, Field, Materialization, Selector, String, execute_pipeline, input_hydration_config, output_materialization_config, pipeline, seven, solid, usable_as_dagster_type, )


# Input hydration with a single 'csv' branch holding a file path. Reads the
# file with csv.DictReader, logs the row count, and wraps the rows in
# LessSimpleDataFrame.
@input_hydration_config(Selector({'csv': Field(String)}))
def less_simple_data_frame_input_hydration_config(context, selector):
    with open(selector['csv'], 'r') as fd:
        # Materialise all rows before the file is closed.
        lines = [row for row in csv.DictReader(fd)]

    context.log.info('Read {n_lines} lines'.format(n_lines=len(lines)))
    return LessSimpleDataFrame(lines)


@output_materialization_config({
    'csv': Field(
        {
            'path': String,
            'sep': Field(String, is_required=False, default_value=','),
        },
def test_kitchen_sink():
    """Deeply nested shapes compare by structure.

    Two shapes that declare the same fields (positional vs. ``fields=`` keyword,
    selector entries in a different order) must be the same object with the same
    key. A shape that differs only in a deeply nested field must be a different
    object with a different key.
    """
    # First shape: positional arguments throughout.
    selector_positional = Selector({
        "int_field_in_selector": Field(int),
        "permissive_dict_in_selector": Field(Permissive()),
        "permissive_dict_with_fields_in_selector":
        Field(Permissive({"string_field": Field(str)})),
    })
    nested_positional = Shape({
        "nested_field_one": Field(bool),
        "nested_selector": Field(selector_positional),
    })
    reference_shape = Shape({
        "field_one": Field(int, default_value=2, is_required=False),
        "field_two": Field(nested_positional),
    })

    # Second shape: `fields=` keyword for nested types, selector entries reordered.
    selector_keyword = Selector(
        fields={
            "permissive_dict_in_selector": Field(Permissive()),
            "int_field_in_selector": Field(int),
            "permissive_dict_with_fields_in_selector": Field(
                Permissive(fields={"string_field": Field(str)})),
        })
    nested_keyword = Shape(
        fields={
            "nested_field_one": Field(bool),
            "nested_selector": Field(selector_keyword),
        })
    equivalent_shape = Shape({
        "field_one": Field(int, default_value=2, is_required=False),
        "field_two": Field(nested_keyword),
    })

    assert reference_shape is equivalent_shape
    assert reference_shape.key == equivalent_shape.key

    # differs way down in tree
    selector_changed = Selector(
        fields={
            "permissive_dict_in_selector": Field(Permissive()),
            "int_field_in_selector": Field(int),
            "permissive_dict_with_fields_in_selector": Field(
                Permissive(fields={"int_field": Field(int)})),
        })
    nested_changed = Shape(
        fields={
            "nested_field_one": Field(bool),
            "nested_selector": Field(selector_changed),
        })
    changed_shape = Shape({
        "field_one": Field(int, default_value=2, is_required=False),
        "field_two": Field(nested_changed),
    })

    assert reference_shape is not changed_shape
    assert reference_shape.key != changed_shape.key
def bash_command_solid(bash_command, name=None, output_encoding=None):
    '''Build a solid that executes a Bash command.

    The command is written to a temporary script file and run with ``bash`` in
    the system temp directory; stderr is merged into stdout.

    Args:
        bash_command (str): The command text to execute.
        name (Optional[str]): Name of the generated solid. Defaults to
            ``'bash_solid'``.
        output_encoding (Optional[str]): Encoding used to decode the process
            output. Defaults to ``'utf-8'``.

    Returns:
        The solid definition. When executed, the solid returns a ``str`` that
        depends on the ``output_logging`` config:

        * ``STREAM``: each line is logged as it arrives; the last line, with
          trailing whitespace stripped, is returned.
        * ``BUFFER``: the whole output is logged once, then returned.
        * ``NONE``: nothing is logged; the whole output is returned.

    Raises:
        Failure: raised by the solid when the command exits with a non-zero
            return code.
    '''
    check.str_param(bash_command, 'bash_command')
    name = check.opt_str_param(name, 'name', default='bash_solid')
    output_encoding = check.opt_str_param(output_encoding, 'output_encoding', default='utf-8')

    @solid(
        name=name,
        config={
            'output_logging': Field(
                Enum(
                    'OutputType',
                    [
                        EnumValue('STREAM', description='Stream script stdout/stderr.'),
                        EnumValue(
                            'BUFFER',
                            description='Buffer bash script stdout/stderr, then log upon completion.',
                        ),
                        EnumValue('NONE', description='No logging'),
                    ],
                ),
                is_required=False,
                default_value='STREAM',
            ),
            'env': Field(
                Permissive(),
                description='Environment variables to pass to the child process; if not provided, '
                'the current process environment will be passed.',
                is_required=False,
                default_value=None,
            ),
        },
    )
    def _bash_solid(context):
        '''This logic is ported from the Airflow BashOperator implementation.

        https://github.com/apache/airflow/blob/master/airflow/operators/bash_operator.py
        '''

        def log_info_msg(log_str):
            context.log.info('[bash][{name}] '.format(name=name) + log_str)

        tmp_path = seven.get_system_temp_directory()
        log_info_msg('using temporary directory: %s' % tmp_path)

        # Fall back to a copy of the current environment when none is configured.
        env = (
            context.solid_config['env']
            if context.solid_config['env'] is not None
            else os.environ.copy()
        )
        output_logging = context.solid_config['output_logging']

        with NamedTemporaryFile(dir=tmp_path, prefix=name) as tmp_file:
            tmp_file.write(bash_command.encode('utf-8'))
            # Flush so bash sees the full script when it opens the file by name.
            tmp_file.flush()
            script_location = os.path.abspath(tmp_file.name)
            log_info_msg('Temporary script location: {location}'.format(location=script_location))

            def pre_exec():
                # Restore default signal disposition and invoke setsid.
                # hasattr skips names the signal module does not define.
                for sig in ('SIGPIPE', 'SIGXFZ', 'SIGXFSZ'):
                    if hasattr(signal, sig):
                        signal.signal(getattr(signal, sig), signal.SIG_DFL)
                os.setsid()

            log_info_msg('Running command: {command}'.format(command=bash_command))

            # pylint: disable=subprocess-popen-preexec-fn
            sub_process = Popen(
                ['bash', tmp_file.name],
                stdout=PIPE,
                stderr=STDOUT,
                cwd=tmp_path,
                env=env,
                preexec_fn=pre_exec,
            )

            # Bound up front so every mode has a value to return, including
            # commands that produce no output.
            line = ''

            if output_logging == 'STREAM':
                # Stream back logs as they are emitted.
                for raw_line in iter(sub_process.stdout.readline, b''):
                    line = raw_line.decode(output_encoding).rstrip()
                    log_info_msg(line)
                sub_process.wait()
            else:
                # BUFFER and NONE: drain the pipe while waiting. Calling wait()
                # before reading can deadlock once the child fills the OS pipe
                # buffer, so communicate() is used instead.
                raw_output, _ = sub_process.communicate()
                line = raw_output.decode(output_encoding)
                if output_logging == 'BUFFER':
                    log_info_msg(line)

            log_info_msg(
                'Command exited with return code {retcode}'.format(retcode=sub_process.returncode)
            )

            if sub_process.returncode:
                raise Failure(description='[bash][{name}] Bash command failed'.format(name=name))

        return line

    return _bash_solid
def test_construct_different_dicts():
    """Shapes over different fields are distinct objects with distinct keys."""
    over_int = Shape(fields={"an_int": Field(int)})
    over_str = Shape(fields={"a_string": Field(str)})

    assert over_int is not over_str
    assert over_int.key != over_str.key
from slack import WebClient from dagster import Field, StringSource, resource @resource( { "token": Field( StringSource, description= """To configure access to the Slack API, you'll need an access token provisioned with access to your Slack workspace. Tokens are typically either user tokens or bot tokens. For programmatic posting to Slack from this resource, you probably want to provision and use a bot token. More in the Slack API documentation here: https://api.slack.com/docs/token-types """, ) }, description="This resource is for connecting to Slack", ) def slack_resource(context): """This resource is for connecting to Slack. By configuring this Slack resource, you can post messages to Slack from any Dagster solid: Examples: .. code-block:: python
def _single_optional_string_field_config_dict_with_default():
    """Convert a dict holding one non-required ``String`` field with a default.

    The field defaults to ``"some_default"`` and is stored under
    ``optional_field`` before being passed to ``convert_potential_field``.
    """
    schema = {
        "optional_field": Field(String, is_required=False, default_value="some_default"),
    }
    return convert_potential_field(schema)
if self._has_object(key): logging.warning( "Removing existing ADLS2 key: {key}".format(key=key)) self._rm_object(key) pickled_obj = pickle.dumps(obj, PICKLE_PROTOCOL) file = self.file_system_client.create_file(key) with file.acquire_lease(self.lease_duration) as lease: file.upload_data(pickled_obj, lease=lease, overwrite=True) @object_manager( config_schema={ "adls2_file_system": Field(StringSource, description="ADLS Gen2 file system name"), "adls2_prefix": Field(StringSource, is_required=False, default_value="dagster"), }, required_resource_keys={"adls2"}, ) def adls2_object_manager(init_context): """Persistent object manager using Azure Data Lake Storage Gen2 for storage. Suitable for objects storage for distributed executors, so long as each execution node has network connectivity and credentials for ADLS and the backing container. Attach this resource definition to a :py:class:`~dagster.ModeDefinition` in order to make it available to your pipeline:
def _nested_optional_config_with_no_default():
    """Convert a nested dict whose inner ``int_field`` is a non-required ``Int``.

    No default value is declared for the inner field.
    """
    inner_fields = {"int_field": Field(Int, is_required=False)}
    return convert_potential_field({"nested": inner_fields})
class SlackToFile:
    """Object exposing a ``chat`` attribute backed by a ``ChatToFile``."""

    def __init__(self, output_path):
        self.chat = ChatToFile(output_path)


class ChatToFile:
    """Appends posted messages to a file instead of sending them anywhere."""

    def __init__(self, output_path):
        self.output_path = output_path

    def post_message(self, channel, text):
        """Append ``'<channel> -- <text>'`` plus a newline to the output file."""
        # Mode 'a' so earlier messages are kept.
        with open(self.output_path, 'a') as f:
            f.write('%s -- %s\n' % (channel, text))


# Resource configured with a single 'output_path' string.
@resource(Field(Dict({'output_path': Field(String)})))
def slack_to_file_resource(context):
    return SlackToFile(context.resource_config['output_path'])


# Two modes bind the 'slack' resource key: 'production' to slack_resource and
# 'local' to slack_to_file_resource.
@pipeline(mode_definitions=[
    ModeDefinition(name='production', resources={'slack': slack_resource}),
    ModeDefinition(name='local', resources={'slack': slack_to_file_resource}),
])
def resources_pipeline():
    post_hello_message()


if __name__ == '__main__':
    execute_pipeline(
        resources_pipeline,
def test_default_arg():
    """Check a non-required ``Int`` field with ``default_value=2`` against ``{}``.

    ``assert_config_value_success`` is called with the converted config type,
    an empty dict, and ``{"int_field": 2}``.
    """
    schema = {"int_field": Field(Int, default_value=2, is_required=False)}
    config_field = convert_potential_field(schema)

    assert_config_value_success(config_field.config_type, {}, {"int_field": 2})
from dagster.core.events import DagsterEvent
from dagster.core.execution.api import create_execution_plan
from dagster.core.execution.context.system import SystemStepExecutionContext
from dagster.core.execution.context_creation_pipeline import PlanExecutionContextManager
from dagster.core.execution.plan.execute_step import core_dagster_event_sequence_for_step
from dagster.core.instance import DagsterInstance
from dagster.core.storage.file_manager import LocalFileHandle, LocalFileManager

# File names used for the pickled events and the pickled step run ref.
PICKLED_EVENTS_FILE_NAME = "events.pkl"
PICKLED_STEP_RUN_REF_FILE_NAME = "step_run_ref.pkl"


# Resource factory: the schema declares a single "scratch_dir" entry, and the
# whole resource config is forwarded as keyword arguments to the launcher.
@resource(
    config_schema={
        "scratch_dir": Field(
            StringSource,
            description="Directory used to pass files between the plan process and step process.",
        ),
    },
)
def local_external_step_launcher(context):
    return LocalExternalStepLauncher(**context.resource_config)


class LocalExternalStepLauncher(StepLauncher):
    """Launches each step in its own local process, outside the plan process."""

    def __init__(self, scratch_dir: str):
        # check.str_param validates the argument and returns it for storage.
        self.scratch_dir = check.str_param(scratch_dir, "scratch_dir")

    def launch_step(
        self, step_context: SystemStepExecutionContext, prior_attempts_count: int