Ejemplo n.º 1
0
def _dataframe_loader_config():
    """Build the selector config schema used to load a dataframe.

    For every entry of ``DataFrameReadTypes`` a ``Permissive`` schema is built
    from its ``"options"`` mapping, where each option is a triple passed to
    ``Field`` as (config type, ``is_required``, ``description``).

    The returned ``Selector`` exposes those schemas twice: nested under the
    optional ``"read"`` key, and directly at the top level under each read
    type's own name.
    """
    schema_by_read_type = {}
    for read_type, read_spec in DataFrameReadTypes.items():
        option_fields = {}
        for opt_name, opt_spec in read_spec["options"].items():
            option_fields[opt_name] = Field(
                opt_spec[0],
                is_required=opt_spec[1],
                description=opt_spec[2],
            )
        schema_by_read_type[read_type] = Permissive(option_fields)

    selector_fields = {
        "read": Field(Selector(schema_by_read_type), is_required=False),
    }
    # https://github.com/dagster-io/dagster/issues/2872
    # Each read type is also offered as a top-level key next to "read".
    for read_type, read_schema in schema_by_read_type.items():
        selector_fields[read_type] = Field(read_schema, is_required=False)

    return Selector(selector_fields)
Ejemplo n.º 2
0
def _dataframe_materializer_config():
    """Build the selector config schema used to materialize a dataframe.

    For every entry of ``DataFrameToTypes`` a ``Permissive`` schema is built
    from its ``"options"`` mapping, where each option is a triple passed to
    ``Field`` as (config type, ``is_required``, ``description``).

    The returned ``Selector`` exposes those schemas twice: nested under the
    optional ``"to"`` key, and directly at the top level under each write
    target's own name.
    """
    schema_by_target = {}
    for target, target_spec in DataFrameToTypes.items():
        option_fields = {}
        for opt_name, opt_spec in target_spec["options"].items():
            option_fields[opt_name] = Field(
                opt_spec[0],
                is_required=opt_spec[1],
                description=opt_spec[2],
            )
        schema_by_target[target] = Permissive(option_fields)

    selector_fields = {
        "to": Field(Selector(schema_by_target), is_required=False),
    }
    # https://github.com/dagster-io/dagster/issues/2872
    # Each write target is also offered as a top-level key next to "to".
    for target, target_schema in schema_by_target.items():
        selector_fields[target] = Field(target_schema, is_required=False)

    return Selector(selector_fields)
Ejemplo n.º 3
0
def test_construct_same_selectors():
    """Check that two selectors built from equal field specs are one object."""

    def build_int_selector():
        return Selector(fields={'an_int': Field(int)})

    first = build_int_selector()
    second = build_int_selector()

    # The very same object is expected back, not merely an equal one.
    assert first is second
    # The keys must agree as well.
    assert first.key == second.key
Ejemplo n.º 4
0
def test_custom_dagster_dataframe_parametrizable_input():
    """Run a solid whose dataframe type uses selector-based load and materialize hooks.

    The input hook turns the chosen "door" key into a one-row DataFrame, and
    the output hook always emits a single fixed Materialization. The test
    executes a pass-through solid with ``door_a`` selected and checks both the
    resulting frame and the materialization event.
    """
    door_schema = Selector({'door_a': Field(str), 'door_b': Field(str), 'door_c': Field(str),})

    @input_selector_schema(door_schema)
    def silly_hydrator(_, which_door, _field):
        # Doors a and c both yield a goat; only door b yields the car.
        if which_door in ('door_a', 'door_c'):
            return DataFrame({'foo': ['goat']})
        if which_door == 'door_b':
            return DataFrame({'foo': ['car']})
        raise DagsterInvariantViolationError(
            'You did not pick a door. You chose: {which_door}'.format(which_door=which_door)
        )

    sink_schema = Selector({'devnull': Field(str), 'nothing': Field(str)})

    @output_selector_schema(sink_schema)
    def silly_materializer(_, _location, _field, _value):
        return Materialization(label='did nothing', description='just one of those days')

    TestDataFrame = create_dagster_pandas_dataframe_type(
        name='TestDataFrame',
        columns=[PandasColumn.exists('foo'),],
        input_hydration_config=silly_hydrator,
        output_materialization_config=silly_materializer,
    )

    @solid(
        input_defs=[InputDefinition('df', TestDataFrame)],
        output_defs=[OutputDefinition(TestDataFrame)],
    )
    def did_i_win(_, df):
        return df

    solid_io_config = {
        'inputs': {'df': {'door_a': 'bar'}},
        'outputs': [{'result': {'devnull': 'baz'}}],
    }
    solid_result = execute_solid(
        did_i_win,
        run_config={'solids': {'did_i_win': solid_io_config}},
    )
    assert solid_result.success

    # The solid passes its input straight through, so the hydrated frame comes back.
    output_df = solid_result.output_value()
    assert isinstance(output_df, DataFrame)
    assert output_df['foo'].tolist() == ['goat']

    events = solid_result.materialization_events_during_compute
    assert len(events) == 1
    recorded = events[0].event_specific_data.materialization
    assert recorded.label == 'did nothing'
Ejemplo n.º 5
0
def _construct_selector_from_snap(config_type_snap, config_snap_map):
    """Rebuild a ``Selector`` from a config type snapshot.

    ``config_type_snap.fields`` is first checked to be a list of
    ``ConfigFieldSnap``. The selector's fields then come from
    ``_construct_fields`` and its description is copied from the snapshot.
    """
    check.list_param(config_type_snap.fields, "config_field_snap", ConfigFieldSnap)

    rebuilt_fields = _construct_fields(config_type_snap, config_snap_map)
    return Selector(fields=rebuilt_fields, description=config_type_snap.description)
Ejemplo n.º 6
0
def test_scalar_or_selector():
    """Validate a ScalarUnion of ``int`` and a two-field Selector against sample values."""
    selector_branch = Selector({
        "a_string": str,
        "an_int": int
    })
    int_or_selector = ScalarUnion(scalar_type=int, non_scalar_schema=selector_branch)

    # (config value, whether validation is expected to succeed), in check order.
    cases = [
        # scalar branch
        (2, True),
        ("2", False),
        (False, False),
        # selector branch
        ({"a_string": "kjdfk"}, True),
        ({"an_int": 2}, True),
        ({}, False),
        ({"a_string": "kjdfk", "an_int": 2}, False),
        ({"wrong_key": "kjdfd"}, False),
        ({"a_string": 2}, False),
        ({"a_string": "kjdfk", "extra_field": "kd"}, False),
    ]

    for config_value, should_pass in cases:
        outcome = validate_config(int_or_selector, config_value)
        assert bool(outcome.success) == should_pass
def test_kitchen_sink_break_out():
    """Walk the config snapshots produced for a deeply nested solid config schema.

    The schema is a list wrapping a dict that holds an optional list of ints,
    a nested dict (a list of lists plus a selector) and a string-keyed map.
    Each layer is looked up by type key in the pipeline's config snapshot map
    and checked for the expected kind and field count.
    """
    kitchen_sink_schema = [{
        "opt_list_of_int": Field([int], is_required=False),
        "nested_dict": {
            "list_list": [[int]],
            "nested_selector":
            Selector({
                "some_field": int,
                "noneable_list": Noneable([bool])
            }),
        },
        "map": {
            str: {
                "map_a": int,
                "map_b": [str]
            },
        },
    }]

    @solid(config_schema=kitchen_sink_schema)
    def solid_with_kitchen_sink_config(_):
        pass

    @pipeline
    def single_solid_pipeline():
        solid_with_kitchen_sink_config()

    config_snaps = build_config_schema_snapshot(
        single_solid_pipeline).all_config_snaps_by_key

    # Outermost layer: the list itself.
    solid_config_key = solid_with_kitchen_sink_config.config_schema.config_type.key
    assert solid_config_key in config_snaps
    solid_config_snap = config_snaps[solid_config_key]

    assert solid_config_snap.kind == ConfigTypeKind.ARRAY

    # The dict held inside the list.
    dict_within_list = config_snaps[solid_config_snap.inner_type_key]

    assert len(dict_within_list.fields) == 3

    opt_field = dict_within_list.get_field("opt_list_of_int")

    assert opt_field.is_required is False
    assert config_snaps[opt_field.type_key].kind == ConfigTypeKind.ARRAY

    # nested_dict -> nested_selector -> noneable_list -> list of bool.
    nested_dict = config_snaps[dict_within_list.get_field(
        "nested_dict").type_key]
    assert len(nested_dict.fields) == 2
    nested_selector = config_snaps[nested_dict.get_field(
        "nested_selector").type_key]
    noneable_list_bool = config_snaps[nested_selector.get_field(
        "noneable_list").type_key]
    assert noneable_list_bool.kind == ConfigTypeKind.NONEABLE
    list_bool = config_snaps[noneable_list_bool.inner_type_key]
    assert list_bool.kind == ConfigTypeKind.ARRAY

    # Named map_snap rather than map so the builtin is not shadowed.
    map_snap = config_snaps[dict_within_list.get_field("map").type_key]
    assert map_snap.kind == ConfigTypeKind.MAP
    map_dict = config_snaps[map_snap.inner_type_key]
    assert len(map_dict.fields) == 2
    map_a = config_snaps[map_dict.get_field("map_a").type_key]
    assert map_a.kind == ConfigTypeKind.SCALAR
def test_kitchen_sink():
    """Check that the snapshot of a nested config type survives a serialize/deserialize cycle."""
    schema = [{
        "opt_list_of_int":
        Field(int, is_required=False),
        "nested_dict": {
            "list_list": [[int]],
            "nested_selector":
            Field(Selector({
                "some_field": int,
                "more_list": Noneable([bool])
            })),
        },
        "map": {
            str: {
                "map_a": int,
                "map_b": [str]
            },
        },
    }]
    kitchen_sink = resolve_to_config_type(schema)

    original_snap = snap_from_dagster_type(kitchen_sink)

    # Round-trip through the JSON form and compare against the starting snapshot.
    serialized = serialize_dagster_namedtuple(original_snap)
    restored_snap = deserialize_json_to_dagster_namedtuple(serialized)
    assert original_snap == restored_snap
Ejemplo n.º 9
0
def test_kitchen_sink_break_out():
    """Inspect the type references reported in the meta of an array-of-dict config type."""
    nested_schema = {
        'list_list': [[int]],
        'nested_selector':
        Selector({
            'some_field': int,
            'list': Noneable([bool])
        }),
    }
    nested_dict_cls = resolve_to_config_type(nested_schema)

    outer_schema = {
        'opt_list_of_int':
        Field([int], is_optional=True),
        'nested_dict':
        Field(nested_dict_cls)
    }
    dict_within_list_cls = resolve_to_config_type(outer_schema)
    kitchen_sink = Array(dict_within_list_cls)

    expected_inner_key = dict_within_list_cls.key
    array_meta = meta_from_dagster_type(kitchen_sink)

    # The array refers to its element type both as type param and as inner type.
    assert len(array_meta.type_param_refs) == 1
    assert array_meta.type_param_refs[0].key == expected_inner_key
    assert len(array_meta.inner_type_refs) == 1
    assert array_meta.inner_type_refs[0].key == expected_inner_key

    dict_meta = meta_from_dagster_type(dict_within_list_cls)
    assert dict_meta.type_param_refs is None
    # List[int], Int, Shape.XXX
    assert len(dict_meta.inner_type_refs) == 3
    found_keys = sorted(ref.key for ref in dict_meta.inner_type_refs)
    wanted_keys = sorted([nested_dict_cls.key, 'Int', 'Array.Int'])
    assert found_keys == wanted_keys
Ejemplo n.º 10
0
def test_resource_invocation_kitchen_sink_config():
    """Invoke a resource directly and check that it hands back the config it was given."""
    kitchen_sink_schema = {
        "str_field": str,
        "int_field": int,
        "list_int": [int],
        "list_list_int": [[int]],
        "dict_field": {"a_string": str},
        "list_dict_field": [{"an_int": int}],
        "selector_of_things": Selector(
            {"select_list_dict_field": [{"an_int": int}], "select_int": int}
        ),
        "optional_list_of_optional_string": Noneable([Noneable(str)]),
    }

    @resource(config_schema=kitchen_sink_schema)
    def kitchen_sink(context):
        return context.resource_config

    resource_config = {
        "str_field": "kjf",
        "int_field": 2,
        "list_int": [3],
        "list_list_int": [[1], [2, 3]],
        "dict_field": {"a_string": "kdjfkd"},
        "list_dict_field": [{"an_int": 2}, {"an_int": 4}],
        "selector_of_things": {"select_int": 3},
        "optional_list_of_optional_string": ["foo", None],
    }

    init_context = build_init_resource_context(config=resource_config)
    assert kitchen_sink(init_context) == resource_config
Ejemplo n.º 11
0
def test_scalar_or_selector():
    """Validate a ScalarUnion of ``int`` and a two-field Selector against sample values."""
    selector_branch = Selector({
        'a_string': str,
        'an_int': int
    })
    int_or_selector = ScalarUnion(scalar_type=int, non_scalar_schema=selector_branch)

    # (config value, whether validation is expected to succeed), in check order.
    cases = [
        # scalar branch
        (2, True),
        ('2', False),
        (False, False),
        # selector branch
        ({'a_string': 'kjdfk'}, True),
        ({'an_int': 2}, True),
        ({}, False),
        ({'a_string': 'kjdfk', 'an_int': 2}, False),
        ({'wrong_key': 'kjdfd'}, False),
        ({'a_string': 2}, False),
        ({'a_string': 'kjdfk', 'extra_field': 'kd'}, False),
    ]

    for config_value, should_pass in cases:
        outcome = validate_config(int_or_selector, config_value)
        assert bool(outcome.success) == should_pass
Ejemplo n.º 12
0
def test_kitchen_sink():
    """Check that the meta of a nested config type survives a serialize/deserialize cycle."""
    # Pieces are built in the order they appear in the full schema.
    opt_list_field = Field(List[int], is_optional=True)
    tuple_field = Field(Tuple[int, str])
    list_list_field = Field(List[List[int]])
    selector_field = Field(
        Selector({
            'some_field': Field(int),
            'set': Field(Optional[Set[bool]])
        }))
    nested_dict_field = Field(
        Dict({
            'list_list': list_list_field,
            'nested_selector': selector_field,
        }))

    kitchen_sink = List[Dict({
        'opt_list_of_int': opt_list_field,
        'tuple_of_things': tuple_field,
        'nested_dict': nested_dict_field,
    })]

    original_meta = meta_from_dagster_type(kitchen_sink)

    # Round-trip through the JSON form and compare against the starting meta.
    serialized = serialize_dagster_namedtuple(original_meta)
    restored_meta = deserialize_json_to_dagster_namedtuple(serialized)
    assert original_meta == restored_meta
Ejemplo n.º 13
0
def test_invalid_selector_field():
    """Check the exact error raised when a bare ``Int`` is given as a selector field."""
    expected_message = (
        'You have passed a config type "Int" in the parameter "fields" and it is '
        'in the "val" entry of that dict. It is from a Selector with fields '
        '[\'val\']. You have likely forgot to wrap this type in a Field.'
    )

    with pytest.raises(DagsterInvalidDefinitionError) as exc_info:
        Selector({'val': Int})

    actual_message = str(exc_info.value)
    assert actual_message == expected_message
Ejemplo n.º 14
0
def test_selector_of_things():
    """Check the snapshot produced for a Selector with a single ``int`` field."""
    snap = snap_from_dagster_type(Selector({"bar": Field(int)}))

    assert snap.key.startswith("Selector")
    assert snap.kind == ConfigTypeKind.SELECTOR

    # Exactly one field is expected: "bar", typed Int.
    assert snap.fields and len(snap.fields) == 1
    only_field = snap.fields[0]
    assert only_field.name == "bar"
    assert only_field.type_key == "Int"
Ejemplo n.º 15
0
def test_selector_of_things():
    """Check the meta produced for a Selector with a single ``int`` field."""
    meta = meta_from_dagster_type(Selector({'bar': Field(int)}))

    assert meta.key.startswith('Selector')
    assert meta.kind == ConfigTypeKind.SELECTOR

    # Exactly one field is expected: 'bar', referring to the Int type.
    assert meta.fields and len(meta.fields) == 1
    only_field = meta.fields[0]
    assert only_field.name == 'bar'
    assert only_field.type_ref.key == 'Int'
Ejemplo n.º 16
0
def get_retries_config():
    """Return the optional retries Field: an enabled/disabled selector defaulting to enabled."""
    retries_selector = Selector({
        'enabled': {},
        'disabled': {}
    })
    return Field(retries_selector, is_required=False, default_value={'enabled': {}})
Ejemplo n.º 17
0
def test_kitchen_sink():
    """Run a solid that returns its own config and check two configs come back unchanged."""
    kitchen_sink_schema = {
        'str_field': str,
        'int_field': int,
        'list_int': [int],
        'list_list_int': [[int]],
        'dict_field': {'a_string': str},
        'list_dict_field': [{'an_int': int}],
        'selector_of_things': Selector(
            {'select_list_dict_field': [{'an_int': int}], 'select_int': int}
        ),
        # this is a good argument to use () instead of [] for type parameterization in
        # the config system
        'optional_list_of_optional_string': Noneable([Noneable(str)]),
    }

    @solid(config_schema=kitchen_sink_schema)
    def kitchen_sink(context):
        return context.solid_config

    def round_trip(solid_config):
        # Execute the solid with the given config and hand back what it returned.
        result = execute_solid(
            kitchen_sink, run_config={'solids': {'kitchen_sink': {'config': solid_config}}},
        )
        return result.output_value()

    # First variant: selector picks the int, optional list is populated.
    solid_config_one = {
        'str_field': 'kjf',
        'int_field': 2,
        'list_int': [3],
        'list_list_int': [[1], [2, 3]],
        'dict_field': {'a_string': 'kdjfkd'},
        'list_dict_field': [{'an_int': 2}, {'an_int': 4}],
        'selector_of_things': {'select_int': 3},
        'optional_list_of_optional_string': ['foo', None],
    }
    assert round_trip(solid_config_one) == solid_config_one

    # Second variant: selector picks the list of dicts, optional list is None.
    solid_config_two = {
        'str_field': 'kjf',
        'int_field': 2,
        'list_int': [3],
        'list_list_int': [[1], [2, 3]],
        'dict_field': {'a_string': 'kdjfkd'},
        'list_dict_field': [{'an_int': 2}, {'an_int': 4}],
        'selector_of_things': {'select_list_dict_field': [{'an_int': 5}]},
        'optional_list_of_optional_string': None,
    }
    assert round_trip(solid_config_two) == solid_config_two
Ejemplo n.º 18
0
def test_kitchen_sink():
    """Run a solid that returns its own config and check two configs come back unchanged."""
    kitchen_sink_schema = {
        "str_field": str,
        "int_field": int,
        "list_int": [int],
        "list_list_int": [[int]],
        "dict_field": {"a_string": str},
        "list_dict_field": [{"an_int": int}],
        "selector_of_things": Selector(
            {"select_list_dict_field": [{"an_int": int}], "select_int": int}
        ),
        # this is a good argument to use () instead of [] for type parameterization in
        # the config system
        "optional_list_of_optional_string": Noneable([Noneable(str)]),
    }

    @solid(config_schema=kitchen_sink_schema)
    def kitchen_sink(context):
        return context.solid_config

    def round_trip(solid_config):
        # Execute the solid with the given config and hand back what it returned.
        result = execute_solid(
            kitchen_sink, run_config={"solids": {"kitchen_sink": {"config": solid_config}}},
        )
        return result.output_value()

    # First variant: selector picks the int, optional list is populated.
    solid_config_one = {
        "str_field": "kjf",
        "int_field": 2,
        "list_int": [3],
        "list_list_int": [[1], [2, 3]],
        "dict_field": {"a_string": "kdjfkd"},
        "list_dict_field": [{"an_int": 2}, {"an_int": 4}],
        "selector_of_things": {"select_int": 3},
        "optional_list_of_optional_string": ["foo", None],
    }
    assert round_trip(solid_config_one) == solid_config_one

    # Second variant: selector picks the list of dicts, optional list is None.
    solid_config_two = {
        "str_field": "kjf",
        "int_field": 2,
        "list_int": [3],
        "list_list_int": [[1], [2, 3]],
        "dict_field": {"a_string": "kdjfkd"},
        "list_dict_field": [{"an_int": 2}, {"an_int": 4}],
        "selector_of_things": {"select_list_dict_field": [{"an_int": 5}]},
        "optional_list_of_optional_string": None,
    }
    assert round_trip(solid_config_two) == solid_config_two
Ejemplo n.º 19
0
def mysql_config():
    """Return a Selector between a single MySQL URL and a dict of connection parts.

    In the ``mysql_db`` form, ``port`` is optional and defaults to 3306.
    """
    connection_parts = {
        "username": StringSource,
        "password": StringSource,
        "hostname": StringSource,
        "db_name": StringSource,
        "port": Field(IntSource, is_required=False, default_value=3306),
    }
    return Selector({"mysql_url": StringSource, "mysql_db": connection_parts})
Ejemplo n.º 20
0
def pg_config():
    """Return a Selector between a single Postgres URL and a dict of connection parts.

    In the ``postgres_db`` form, ``port`` is optional and defaults to 5432.
    """
    connection_parts = {
        "username": StringSource,
        "password": StringSource,
        "hostname": StringSource,
        "db_name": StringSource,
        "port": Field(IntSource, is_required=False, default_value=5432),
    }
    return Selector({"postgres_url": StringSource, "postgres_db": connection_parts})
Ejemplo n.º 21
0
def _define_task():
    """Return the required Field wrapping a Selector over the four task definitions."""
    task_selector = Selector({
        "notebook_task": _define_notebook_task(),
        "spark_jar_task": _define_spark_jar_task(),
        "spark_python_task": _define_spark_python_task(),
        "spark_submit_task": _define_spark_submit_task(),
    })
    return Field(task_selector, description="The task to run.", is_required=True)
Ejemplo n.º 22
0
def pg_config():
    """Return a Selector between a single Postgres URL and a dict of connection parts.

    The URL form is a plain ``str``. In the ``postgres_db`` form, ``port`` is
    optional and defaults to 5432.
    """
    connection_parts = {
        'username': StringSource,
        'password': StringSource,
        'hostname': StringSource,
        'db_name': StringSource,
        'port': Field(IntSource, is_required=False, default_value=5432),
    }
    return Selector({'postgres_url': str, 'postgres_db': connection_parts})
Ejemplo n.º 23
0
def _define_task():
    """Return the required Field wrapping a Selector over the four task definitions."""
    task_selector = Selector({
        'notebook_task': _define_notebook_task(),
        'spark_jar_task': _define_spark_jar_task(),
        'spark_python_task': _define_spark_python_task(),
        'spark_submit_task': _define_spark_submit_task(),
    })
    return Field(task_selector, description='The task to run.', is_required=True)
Ejemplo n.º 24
0
def get_retries_config():
    """Return the optional retries Field, defaulting to enabled.

    The selector offers ``enabled``, ``disabled`` and ``deferred``. The
    ``deferred`` option carries a permissive ``previous_attempts`` entry.
    """
    deferred_schema = {'previous_attempts': Permissive()}
    retries_selector = Selector({
        'enabled': {},
        'disabled': {},
        'deferred': deferred_schema,
    })
    return Field(retries_selector, is_required=False, default_value={'enabled': {}})
Ejemplo n.º 25
0
def define_databricks_storage_config():
    """Return the optional Field selecting between S3 and ADLS2 storage credentials."""
    storage_selector = Selector({
        "s3": _define_s3_storage_credentials(),
        "adls2": _define_adls2_storage_credentials()
    })
    storage_description = (
        "Databricks storage configuration for either S3 or ADLS2. If access credentials "
        "for your Databricks storage are stored in Databricks secrets, this config indicates the "
        "secret scope and the secret keys used to access either S3 or ADLS2."
    )
    return Field(storage_selector, description=storage_description, is_required=False)
Ejemplo n.º 26
0
def _define_size():
    """Return a Selector between an autoscale definition and a fixed worker count."""
    worker_count_description = (
        'If num_workers, number of worker nodes that this cluster should have. '
        'A cluster has one Spark Driver and num_workers Executors for a total of '
        'num_workers + 1 Spark nodes.'
    )
    num_workers = Field(Int, description=worker_count_description, is_required=True)

    size_options = {
        'autoscale': _define_autoscale(),
        'num_workers': num_workers,
    }
    return Selector(size_options)
Ejemplo n.º 27
0
def define_databricks_storage_config():
    """Return the required Field selecting between S3 and ADLS2 storage credentials."""
    storage_selector = Selector({
        "s3": _define_s3_storage_credentials(),
        "adls2": _define_adls2_storage_credentials()
    })
    storage_description = (
        "Databricks storage configuration. Solids using the "
        "DatabricksPySparkStepLauncher to execute pipeline steps in Databricks MUST configure "
        "storage using this config (either S3 or ADLS2 can be used). Access credentials for the "
        "storage must be stored in Databricks secrets; this config indicates the secret scope "
        "and the secret keys used to access either S3 or ADLS2."
    )
    return Field(storage_selector, description=storage_description, is_required=True)
Ejemplo n.º 28
0
def _define_size():
    """Return a Selector between an autoscale definition and a fixed worker count."""
    worker_count_description = (
        "If num_workers, number of worker nodes that this cluster should have. "
        "A cluster has one Spark Driver and num_workers Executors for a total of "
        "num_workers + 1 Spark nodes."
    )
    num_workers = Field(Int, description=worker_count_description, is_required=True)

    size_options = {
        "autoscale": _define_autoscale(),
        "num_workers": num_workers,
    }
    return Selector(size_options)
Ejemplo n.º 29
0
def _define_cluster_log_conf():
    """Return the optional Field selecting a DBFS or S3 destination for cluster logs."""
    destination_selector = Selector({
        "dbfs": _define_dbfs_storage_info(),
        "s3": _define_s3_storage_info()
    })
    log_conf_description = (
        "Recommended! The configuration for delivering Spark logs to a long-term "
        "storage destination. Only one destination can be specified for one cluster. If the conf "
        "is given, the logs will be delivered to the destination every 5 mins. "
        "The destination of driver logs is <destination>/<cluster-id>/driver, while the "
        "destination of executor logs is <destination>/<cluster-id>/executor."
    )
    return Field(destination_selector, description=log_conf_description, is_required=False)
Ejemplo n.º 30
0
def _define_cluster():
    """Return a Selector between a new cluster definition and an existing cluster's ID."""
    existing_id_description = (
        "The ID of an existing cluster that will be used for all runs "
        "of this job. When running jobs on an existing cluster, you may "
        "need to manually restart the cluster if it stops responding. "
        "Databricks suggests running jobs on new clusters for "
        "greater reliability."
    )
    existing_cluster_id = Field(String, description=existing_id_description, is_required=True)

    cluster_options = {
        "new": _define_new_cluster(),
        "existing": existing_cluster_id,
    }
    return Selector(cluster_options)