def test_make_dagster_type():
    SomeNamedTuple = collections.namedtuple("SomeNamedTuple", "prop")
    DagsterSomeNamedTuple = PythonObjectDagsterType(SomeNamedTuple)
    dagster_type = resolve_dagster_type(DagsterSomeNamedTuple)
    assert dagster_type.name == "SomeNamedTuple"
    assert SomeNamedTuple(prop="foo").prop == "foo"

    DagsterNewNameNamedTuple = PythonObjectDagsterType(SomeNamedTuple, name="OverwriteName")
    dagster_type = resolve_dagster_type(DagsterNewNameNamedTuple)
    assert dagster_type.name == "OverwriteName"
def test_make_dagster_type():
    SomeNamedTuple = collections.namedtuple('SomeNamedTuple', 'prop')
    DagsterSomeNamedTuple = PythonObjectDagsterType(SomeNamedTuple)
    dagster_type = resolve_dagster_type(DagsterSomeNamedTuple)
    assert dagster_type.name == 'SomeNamedTuple'
    assert SomeNamedTuple(prop='foo').prop == 'foo'

    DagsterNewNameNamedTuple = PythonObjectDagsterType(SomeNamedTuple,
                                                       name='OverwriteName')
    dagster_type = resolve_dagster_type(DagsterNewNameNamedTuple)
    assert dagster_type.name == 'OverwriteName'
Example #3
def test_even_type_loader():
    class EvenType:
        def __init__(self, num):
            assert num % 2 == 0
            self.num = num

    @dagster_type_loader(int)
    def load_even_type(_, cfg):
        return EvenType(cfg)

    EvenDagsterType = PythonObjectDagsterType(EvenType, loader=load_even_type)

    @solid
    def double_even(_, even_num: EvenDagsterType) -> EvenDagsterType:
        return EvenType(even_num.num * 2)

    yaml_doc = """
    solids:
        double_even:
            inputs:
                even_num: 2
    """

    assert execute_solid(double_even, run_config=yaml.safe_load(yaml_doc)).success

    assert execute_solid(
        double_even, run_config={"solids": {"double_even": {"inputs": {"even_num": 2}}}}
    ).success

    # An odd input fails the assertion in EvenType's constructor, so the raw AssertionError surfaces
    with pytest.raises(AssertionError):
        execute_solid(
            double_even, run_config={"solids": {"double_even": {"inputs": {"even_num": 3}}}}
        )
Example #4
def test_even_type_materialization_config():
    class EvenType:
        def __init__(self, num):
            assert num % 2 == 0
            self.num = num

    @dagster_type_materializer({"path": str})
    def save_to_file_materialization(_, cfg, value):
        with open(cfg["path"], "w") as ff:
            ff.write(str(value))
            return AssetMaterialization(
                "path", "Wrote out value to {path}".format(path=cfg["path"]), metadata={"path": cfg["path"]}
            )

    EvenDagsterType = PythonObjectDagsterType(EvenType, materializer=save_to_file_materialization)

    @solid
    def double_even(_, even_num: EvenDagsterType) -> EvenDagsterType:
        return EvenType(even_num.num * 2)

    with safe_tempfile_path() as path:
        yaml_doc = """
solids:
    double_even:
        outputs:
            - result:
                path: {path}
 """
        solid_result = execute_solid(
            double_even,
            input_values={"even_num": EvenType(2)},
            run_config=yaml.safe_load(yaml_doc.format(path=path)),
        )
        assert solid_result.success
Example #5
def test_even_type_hydration_config():
    class EvenType:
        def __init__(self, num):
            assert num % 2 == 0
            self.num = num

    @input_hydration_config(int)
    def hydrate_even_type(_, cfg):
        return EvenType(cfg)

    EvenDagsterType = PythonObjectDagsterType(
        EvenType, input_hydration_config=hydrate_even_type)

    @solid
    def double_even(_, even_num: EvenDagsterType) -> EvenDagsterType:
        return EvenType(even_num.num * 2)

    yaml_doc = '''
    solids:
        double_even:
            inputs:
                even_num: 2
    '''

    assert execute_solid(double_even,
                         run_config=yaml.safe_load(yaml_doc)).success

    assert execute_solid(double_even,
                         run_config={
                             'solids': {
                                 'double_even': {
                                     'inputs': {
                                         'even_num': 2
                                     }
                                 }
                             }
                         }).success

    # An odd input fails the assertion in EvenType's constructor, so the raw AssertionError surfaces
    with pytest.raises(AssertionError):
        execute_solid(double_even,
                      run_config={
                          'solids': {
                              'double_even': {
                                  'inputs': {
                                      'even_num': 3
                                  }
                              }
                          }
                      })
Example #6
def test_make_usable_as_dagster_type():
    class EvenType:
        def __init__(self, num):
            assert num % 2 == 0
            self.num = num

    EvenDagsterType = PythonObjectDagsterType(EvenType, name="EvenDagsterType",)

    make_python_type_usable_as_dagster_type(EvenType, EvenDagsterType)

    @solid
    def double_even(_, even_num: EvenType) -> EvenType:
        return EvenType(even_num.num * 2)

    assert execute_solid(double_even, input_values={"even_num": EvenType(2)}).success
Example #7
def test_mypy_compliance():
    class EvenType:
        def __init__(self, num):
            assert num % 2 == 0
            self.num = num

    if typing.TYPE_CHECKING:
        EvenDagsterType = EvenType
    else:
        EvenDagsterType = PythonObjectDagsterType(EvenType)

    @solid
    def double_even(_, even_num: EvenDagsterType) -> EvenDagsterType:
        return EvenType(even_num.num * 2)

    assert execute_solid(double_even, input_values={"even_num": EvenType(2)}).success
Example #8
def test_python_object_dagster_type():
    class EvenType:
        def __init__(self, num):
            assert num % 2 == 0
            self.num = num

    EvenDagsterType = PythonObjectDagsterType(EvenType, name="EvenDagsterType")

    @solid
    def double_even(_, even_num: EvenDagsterType) -> EvenDagsterType:
        # These type annotations are a shorthand for constructing InputDefinitions
        # and OutputDefinitions, and are not mypy compliant
        return EvenType(even_num.num * 2)

    assert execute_solid(double_even, input_values={"even_num": EvenType(2)}).success
    with pytest.raises(AssertionError):
        execute_solid(double_even, input_values={"even_num": EvenType(3)})
def test_validate_inputs():
    @root_input_manager
    def my_loader(_):
        return 5

    @solid(input_defs=[
        InputDefinition("input1",
                        dagster_type=PythonObjectDagsterType(int),
                        root_manager_key="my_loader")
    ])
    def my_solid(_, input1):
        return input1

    @pipeline(
        mode_defs=[ModeDefinition(resource_defs={"my_loader": my_loader})])
    def my_pipeline():
        my_solid()

    execute_pipeline(my_pipeline)
Example #10
def test_even_type_materialization_config():
    class EvenType:
        def __init__(self, num):
            assert num % 2 == 0
            self.num = num

    @output_materialization_config({'path': str})
    def save_to_file_materialization(_, cfg, value):
        with open(cfg['path'], 'w') as ff:
            ff.write(str(value))
            return Materialization(
                'path',
                'Wrote out value to {path}'.format(path=cfg['path']),
                metadata_entries=[EventMetadataEntry.text(cfg['path'], 'path')],
            )

    EvenDagsterType = PythonObjectDagsterType(
        EvenType, output_materialization_config=save_to_file_materialization)

    @solid
    def double_even(_, even_num: EvenDagsterType) -> EvenDagsterType:
        return EvenType(even_num.num * 2)

    with safe_tempfile_path() as path:
        yaml_doc = '''
solids:
    double_even:
        outputs:
            - result:
                path: {path}
 '''
        solid_result = execute_solid(
            double_even,
            input_values={'even_num': EvenType(2)},
            run_config=yaml.safe_load(yaml_doc.format(path=path)),
        )
        assert solid_result.success
Example #11
from dagster import PythonObjectDagsterType, solid


# start_object_type
class EvenType:
    def __init__(self, num):
        assert num % 2 == 0
        self.num = num


EvenDagsterType = PythonObjectDagsterType(EvenType, name="EvenDagsterType")
# end_object_type

# start_use_object_type
@solid
def double_even(even_num: EvenDagsterType) -> EvenDagsterType:
    return EvenType(even_num.num * 2)


# end_use_object_type
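
For completeness, a minimal test sketch (not part of the original snippet) exercising double_even with the same execute_solid pattern used in the tests above; the expected output value is an assumption based on EvenType's doubling logic.

from dagster import execute_solid


def test_double_even_sketch():
    # Hypothetical test: run the solid in-process with an even input.
    result = execute_solid(double_even, input_values={"even_num": EvenType(2)})
    assert result.success
    assert result.output_value().num == 4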
Example #12
        ]


@dagster_type_materializer(String)
def df_output_schema(_context, path, value):
    with open(path, "w") as fd:
        writer = csv.DictWriter(fd, fieldnames=value[0].keys())
        writer.writeheader()
        writer.writerows(rowdicts=value)

    return AssetMaterialization.file(path)
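
The df_input_schema loader used by PoorMansDataFrame below is defined outside this excerpt. A plausible sketch, assuming it mirrors df_output_schema and reads a CSV from a configured string path (hypothetical, not necessarily the repo's actual code):

import csv
from collections import OrderedDict

from dagster import dagster_type_loader, String


# Hypothetical loader counterpart to df_output_schema above.
@dagster_type_loader(String)
def df_input_schema(_context, path):
    # Read the CSV at `path` into a list of ordered row dicts.
    with open(path, "r") as fd:
        return [OrderedDict(sorted(row.items())) for row in csv.DictReader(fd)]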


PoorMansDataFrame = PythonObjectDagsterType(
    python_type=list,
    name="PoorMansDataFrame",
    loader=df_input_schema,
    materializer=df_output_schema,
)


@contextmanager
def define_test_out_of_process_context(instance):
    check.inst_param(instance, "instance", DagsterInstance)
    with define_out_of_process_context(__file__, main_repo_name(),
                                       instance) as context:
        yield context


def create_main_recon_repo():
    return ReconstructableRepository.for_file(__file__, main_repo_name())
Example #13
from dagster import (
    Bool,
    Field,
    Int,
    PythonObjectDagsterType,
    String,
    composite_solid,
    execute_pipeline,
    pipeline,
    solid,
)

if typing.TYPE_CHECKING:
    DataFrame = list
else:
    DataFrame = PythonObjectDagsterType(list, name="DataFrame")  # type: Any


@solid(
    config_schema={
        "delimiter":
        Field(
            String,
            default_value=",",
            is_required=False,
            description=("A one-character string used to separate fields."),
        ),
        "doublequote":
        Field(
            Bool,
            default_value=False,
Example #14
class SparkDataFrameFilesystemStoragePlugin(TypeStoragePlugin):  # pylint: disable=no-init
    @classmethod
    def compatible_with_storage_def(cls, system_storage_def):
        return system_storage_def is fs_system_storage

    @classmethod
    def set_object(cls, intermediate_store, obj, _context, _dagster_type, paths):
        target_path = os.path.join(intermediate_store.root, *paths)
        obj.write.parquet(intermediate_store.uri_for_paths(paths))
        return target_path

    @classmethod
    def get_object(cls, intermediate_store, context, _dagster_type, paths):
        return context.resources.pyspark.spark_session.read.parquet(
            os.path.join(intermediate_store.root, *paths)
        )

    @classmethod
    def required_resource_keys(cls):
        return frozenset({'pyspark'})


DataFrame = PythonObjectDagsterType(
    python_type=NativeSparkDataFrame,
    name='PySparkDataFrame',
    description='A PySpark data frame.',
    auto_plugins=[SparkDataFrameS3StoragePlugin, SparkDataFrameFilesystemStoragePlugin],
    output_materialization_config=spark_df_output_schema,
)
Example #15
"""Type definitions for the airline_demo."""

from collections import namedtuple

import sqlalchemy

from dagster import PythonObjectDagsterType
from dagster.core.types.dagster_type import create_string_type

AirlineDemoResources = namedtuple(
    'AirlineDemoResources',
    ('spark', 's3', 'db_url', 'db_engine', 'db_dialect',
     'redshift_s3_temp_dir', 'db_load'),
)

SqlAlchemyEngineType = PythonObjectDagsterType(
    sqlalchemy.engine.Connectable,
    name='SqlAlchemyEngineType',
    description='A SqlAlchemy Connectable',
)

SqlTableName = create_string_type('SqlTableName',
                                  description='The name of a database table')
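
A minimal sketch (not part of the airline_demo) of how a solid might consume these two types together; it assumes SQLAlchemy 1.x textual execution and a hypothetical solid name.

from dagster import solid


@solid
def count_rows(_context, engine: SqlAlchemyEngineType, table: SqlTableName) -> int:
    # Hypothetical example: run a COUNT(*) against the named table.
    with engine.connect() as conn:
        return conn.execute('SELECT COUNT(*) FROM {table}'.format(table=table)).scalar()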
Example #16
from dagster import (solid, SolidExecutionContext, Field, Array, String,
                     PythonObjectDagsterType,
                     make_python_type_usable_as_dagster_type)
from typing import Any, Optional, List, TYPE_CHECKING
from azmeta.access.specifications import AzureComputeSpecifications, load_compute_specifications

AzureComputeSpecificationsDagsterType = PythonObjectDagsterType(
    AzureComputeSpecifications)
make_python_type_usable_as_dagster_type(AzureComputeSpecifications,
                                        AzureComputeSpecificationsDagsterType)


@solid(
    config_schema={
        'subscription':
        Field(String,
              is_required=False,
              description='The subscription ID to list SKUs from.')
    })
def load_compute_specs(
        context: SolidExecutionContext) -> AzureComputeSpecifications:
    return load_compute_specifications(logger=context.log)
Example #17

PositiveNumber = DagsterType(
    name="PostivieNumber",
    description="Only take in numbers greater than zero",
    type_check_fn=positive_num_check,
    loader=positive_num_loader)


# How to use PythonObjectDagsterType
class PercentType:
    def __init__(self, number):
        self.value = number * 100


PercentDagsterType = PythonObjectDagsterType(PercentType,
                                             name="PercentDagsterType")


@solid(
    input_defs=[
        InputDefinition("num1", PositiveNumber),
        InputDefinition("num2", PositiveNumber)
    ],
    output_defs=[OutputDefinition(PercentDagsterType)]  # mypy compliance
)
def add_two_nums(_context, num1: PositiveNumber,
                 num2: PositiveNumber):  # mypy compliance only works for naked Python types
    adding = num1 + num2  # 2 + 3 => 5
    add_percent_type = PercentType(adding)  # object 5
    yield ExpectationResult(
Example #18
def create_lakehouse_table_def(
    name,
    lakehouse_fn,
    input_tables=None,
    other_input_defs=None,
    required_resource_keys=None,
    tags=None,
    description=None,
):
    input_tables = check.opt_list_param(input_tables,
                                        'input_tables',
                                        of_type=LakehouseTableInputDefinition)
    other_input_defs = check.opt_list_param(other_input_defs,
                                            'other_input_defs',
                                            of_type=InputDefinition)
    required_resource_keys = check.opt_set_param(required_resource_keys,
                                                 'required_resource_keys',
                                                 of_type=str)

    table_type = PythonObjectDagsterType(python_type=ITableHandle,
                                         name=name,
                                         description=description)

    table_input_dict = {
        input_table.name: input_table
        for input_table in input_tables
    }
    input_defs = input_tables + other_input_defs
    validate_solid_fn('@solid', name, lakehouse_fn, input_defs, ['context'])

    def _compute(context, inputs):
        '''
        Workhorse function of lakehouse. The inputs are objects that inherit from ITableHandle.
        This compute_fn:
        (1) Iterates over the input tables and asks the lakehouse resource to
            hydrate their contents, or a representation of their contents
            (e.g. a pyspark dataframe), into memory for computation.
        (2) Passes those into the lakehouse table function, which does the actual work.
        (3) Passes the output of the lakehouse function to the lakehouse materialize function.
        (4) Yields a materialization if the lakehouse function returned one.

        There's an argument that the hydrate and materialize functions should return
        a stream of events, but that started to feel like implementing what should
        be a framework feature.
        '''
        check.inst_param(context.resources.lakehouse,
                         'context.resources.lakehouse', Lakehouse)

        # hydrate tables
        hydrated_tables = {}
        other_inputs = {}
        for input_name, value in inputs.items():
            context.log.info(
                'About to hydrate table {input_name} for use in {name}'.format(
                    input_name=input_name, name=name))
            if input_name in table_input_dict:
                table_handle = value
                input_type = table_input_dict[input_name].runtime_type
                hydrated_tables[
                    input_name] = context.resources.lakehouse.hydrate(
                        context,
                        input_type,
                        table_def_of_type(context.pipeline_def,
                                          input_type.name).tags,
                        table_handle,
                        tags,
                    )
            else:
                other_inputs[input_name] = value

        # call user-provided business logic which operates on the hydrated values
        # (as opposed to the handles)
        computed_output = lakehouse_fn(context, **hydrated_tables,
                                       **other_inputs)

        materialization, output_table_handle = context.resources.lakehouse.materialize(
            context, table_type, tags, computed_output)

        if materialization:
            yield materialization

        # just pass in a dummy handle for now if the materialize function
        # does not return one
        yield Output(
            output_table_handle if output_table_handle else TableHandle())

    required_resource_keys.add('lakehouse')

    return LakehouseTableDefinition(
        lakehouse_fn=lakehouse_fn,
        name=name,
        input_tables=input_tables,
        input_defs=input_defs,
        output_defs=[OutputDefinition(table_type)],
        compute_fn=_compute,
        required_resource_keys=required_resource_keys,
        tags=tags,
        description=description,
    )
Example #19
    def compatible_with_storage_def(cls, system_storage_def):
        return (system_storage_def is fs_system_storage
                or system_storage_def is fs_intermediate_storage)

    @classmethod
    def set_object(cls, intermediate_store, obj, _context, _dagster_type,
                   paths):
        target_path = os.path.join(intermediate_store.root, *paths)
        obj.write.parquet(intermediate_store.uri_for_paths(paths))
        return target_path

    @classmethod
    def get_object(cls, intermediate_store, context, _dagster_type, paths):
        return context.resources.pyspark.spark_session.read.parquet(
            os.path.join(intermediate_store.root, *paths))

    @classmethod
    def required_resource_keys(cls):
        return frozenset({'pyspark'})


DataFrame = PythonObjectDagsterType(
    python_type=NativeSparkDataFrame,
    name='PySparkDataFrame',
    description='A PySpark data frame.',
    auto_plugins=[
        SparkDataFrameS3StoragePlugin, SparkDataFrameFilesystemStoragePlugin
    ],
    materializer=spark_df_materializer,
)
from dagster import (
    Bool,
    Field,
    Int,
    PythonObjectDagsterType,
    String,
    composite_solid,
    execute_pipeline,
    pipeline,
    solid,
)

if typing.TYPE_CHECKING:
    DataFrame = list
else:
    DataFrame = PythonObjectDagsterType(list, name='DataFrame')  # type: Any


@solid(
    config={
        'delimiter':
        Field(
            String,
            default_value=',',
            is_required=False,
            description=('A one-character string used to separate fields.'),
        ),
        'doublequote':
        Field(
            Bool,
            default_value=False,
Example #21
    if file_type == "csv":
        return spark_read.csv(path, **dict_without_keys(file_options, "path"))
    elif file_type == "parquet":
        return spark_read.parquet(path,
                                  **dict_without_keys(file_options, "path"))
    elif file_type == "json":
        return spark_read.json(path, **dict_without_keys(file_options, "path"))
    elif file_type == "jdbc":
        return spark_read.jdbc(**file_options)
    elif file_type == "orc":
        return spark_read.orc(path, **dict_without_keys(file_options, "path"))
    elif file_type == "table":
        return spark_read.table(**file_options)
    elif file_type == "text":
        return spark_read.text(path, **dict_without_keys(file_options, "path"))
    elif file_type == "other":
        return spark_read.load(**file_options)
    else:
        raise DagsterInvariantViolationError(
            "Unsupported file_type {file_type}".format(file_type=file_type))


DataFrame = PythonObjectDagsterType(
    python_type=NativeSparkDataFrame,
    name="PySparkDataFrame",
    description="A PySpark data frame.",
    loader=dataframe_loader,
    materializer=dataframe_materializer,
)
Example #22
        ]


@output_materialization_config(String)
def df_output_schema(_context, path, value):
    with open(path, 'w') as fd:
        writer = csv.DictWriter(fd, fieldnames=value[0].keys())
        writer.writeheader()
        writer.writerows(rowdicts=value)

    return Materialization.file(path)


PoorMansDataFrame = PythonObjectDagsterType(
    python_type=list,
    name='PoorMansDataFrame',
    input_hydration_config=df_input_schema,
    output_materialization_config=df_output_schema,
)


def define_test_subprocess_context(instance):
    check.inst_param(instance, 'instance', DagsterInstance)
    return define_subprocess_context_for_file(__file__, "test_repo", instance)


def define_test_context(instance):
    check.inst_param(instance, 'instance', DagsterInstance)
    return define_context_for_file(__file__, "test_repo", instance)


def create_main_recon_repo():
Example #23
    required_resource_keys={'spark'},
)
def write_rdd(context, file_type, file_options, spark_rdd):
    if file_type == 'csv':
        df = context.resources.spark.spark_session.createDataFrame(spark_rdd)
        context.log.info('DF: {}'.format(df))
        df.write.csv(file_options['path'],
                     header=file_options.get('header'),
                     sep=file_options.get('sep'))
    else:
        check.failed('Unsupported file type: {}'.format(file_type))


SparkRDD = PythonObjectDagsterType(
    python_type=RDD,
    name='SparkRDD',
    input_hydration_config=load_rdd,
    output_materialization_config=write_rdd,
)


@output_selector_schema(
    Selector({
        'csv': {
            'path': Field(Path),
            'sep': Field(String, is_required=False),
            'header': Field(Bool, is_required=False),
        },
    }))
def spark_df_output_schema(_context, file_type, file_options, spark_df):
    if file_type == 'csv':
        spark_df.write.csv(file_options['path'],
Example #24
        target_path = os.path.join(intermediate_storage.root, *paths)
        value.write.parquet(intermediate_storage.uri_for_paths(paths))
        return target_path

    @classmethod
    def get_intermediate_object(
        cls, intermediate_storage, context, _dagster_type, step_output_handle
    ):
        paths = ["intermediates", step_output_handle.step_key, step_output_handle.output_name]
        return context.resources.pyspark.spark_session.read.parquet(
            os.path.join(intermediate_storage.root, *paths)
        )

    @classmethod
    def required_resource_keys(cls):
        return frozenset({"pyspark"})


DataFrame = PythonObjectDagsterType(
    python_type=NativeSparkDataFrame,
    name="PySparkDataFrame",
    description="A PySpark data frame.",
    auto_plugins=[
        SparkDataFrameS3StoragePlugin,
        SparkDataFrameADLS2StoragePlugin,
        SparkDataFrameFilesystemStoragePlugin,
    ],
    loader=dataframe_loader,
    materializer=dataframe_materializer,
)
Example #25
from dagster import (make_python_type_usable_as_dagster_type, PythonObjectDagsterType,
                     input_hydration_config, Selector, Int, Field)
from datetime import date
import pandas as pd


make_python_type_usable_as_dagster_type(pd.DataFrame, PythonObjectDagsterType(pd.DataFrame))


@input_hydration_config(Selector({"date": {"year": Field(Int),
                                           "month": Field(Int),
                                           "day": Field(Int)}}))
def parse_date(context, selector):
    date_selector = selector["date"]
    return date(date_selector["year"], date_selector["month"], date_selector["day"])


make_python_type_usable_as_dagster_type(date, PythonObjectDagsterType(date, input_hydration_config=parse_date))
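
As a usage sketch (hypothetical solid and pipeline names, not from the original module, and assuming the run_config keyword used in the examples above), the date selector would be populated from run config like this:

from dagster import execute_pipeline, pipeline, solid


@solid
def report_for(context, report_date: date):
    # The `date` annotation resolves to the Dagster type registered above,
    # so `report_date` arrives already hydrated from the {"date": {...}} selector.
    context.log.info("Running report for {}".format(report_date.isoformat()))


@pipeline
def report_pipeline():
    report_for()


execute_pipeline(
    report_pipeline,
    run_config={
        "solids": {
            "report_for": {
                "inputs": {"report_date": {"date": {"year": 2020, "month": 1, "day": 15}}}
            }
        }
    },
)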