Example #1
def LR_solid():
    # Factory for a dagstermill solid that executes the 'tutorial_LR' notebook,
    # taking a single DataFrame input named 'df'.
    return dagstermill.define_dagstermill_solid(
        'linear_regression',
        nb_test_path('tutorial_LR'),
        input_defs=[InputDefinition(name='df', dagster_type=DataFrame)],
    )
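A small follow-on sketch (assumed, not part of the original snippet): define_dagstermill_solid returns an ordinary solid definition, so the factory's result can be inspected, or composed into a pipeline, like any hand-written solid.

# Hedged sketch; relies on the same imports/helpers as the factory above
# (dagstermill, nb_test_path, InputDefinition, and the DataFrame dagster type).
lr = LR_solid()
assert lr.name == 'linear_regression'
assert [inp.name for inp in lr.input_defs] == ['df']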
Example #2
import re
from operator import add

from dagster_pyspark import SparkRDD, pyspark_resource

from dagster import InputDefinition, Int, ModeDefinition, OutputDefinition, Path, pipeline, solid


def parseNeighbors(urls):
    """Parses a urls pair string into urls pair."""
    parts = re.split(r'\s+', urls)
    return parts[0], parts[1]


@solid(
    input_defs=[InputDefinition('pagerank_data', Path)], output_defs=[OutputDefinition(SparkRDD)]
)
def parse_pagerank_data_step_five(context, pagerank_data):
    lines = context.resources.spark.spark_session.read.text(pagerank_data).rdd.map(lambda r: r[0])
    return lines.map(parseNeighbors)


@solid(input_defs=[InputDefinition('urls', SparkRDD)], output_defs=[OutputDefinition(SparkRDD)])
def compute_links_step_five(_context, urls):
    return urls.distinct().groupByKey().cache()


def computeContribs(urls, rank):
    """Calculates URL contributions to the rank of other URLs."""
    num_urls = len(urls)
    for url in urls:
        # Each outgoing link receives an equal share of this page's rank.
        yield url, rank / num_urls
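The listing is truncated here by the source site. For orientation, computeContribs comes from the canonical PySpark PageRank example, and the remaining solids wrap an iteration roughly like the standard Spark loop below (a sketch only; `links`, `ranks`, and `iterations` are assumed names, not solids from this pipeline).

# Sketch of the standard PySpark PageRank loop that computeContribs serves
# (illustrative; `links`, `ranks`, and `iterations` are assumed names).
ranks = links.map(lambda url_neighbors: (url_neighbors[0], 1.0))
for _ in range(iterations):
    contribs = links.join(ranks).flatMap(
        lambda url_urls_rank: computeContribs(url_urls_rank[1][0], url_urls_rank[1][1])
    )
    # Standard damping: 85% from link contributions, 15% uniform.
    ranks = contribs.reduceByKey(add).mapValues(lambda rank: rank * 0.85 + 0.15)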
Example #3
def test_failure_propagation():
    '''
      B =========== C
     //             \\
    A                F (skipped)
     \\             //
      D (fails) == E (skipped)
    '''

    solid_a = create_root_success_solid('A')

    def fail_fn(_context, inputs):
        # check.failed always raises a CheckError, so the return below is never reached.
        check.failed('user error')
        return inputs

    def success_fn(_context, inputs):
        return inputs

    solid_b = single_output_solid(
        name='B',
        input_defs=[InputDefinition(name='A')],
        compute_fn=success_fn,
        output_def=OutputDefinition(),
    )

    solid_c = single_output_solid(
        name='C',
        input_defs=[InputDefinition(name='B')],
        compute_fn=success_fn,
        output_def=OutputDefinition(),
    )

    solid_d = single_output_solid(
        name='D',
        input_defs=[InputDefinition(name='A')],
        compute_fn=fail_fn,
        output_def=OutputDefinition(),
    )

    solid_e = single_output_solid(
        name='E',
        input_defs=[InputDefinition(name='D')],
        compute_fn=success_fn,
        output_def=OutputDefinition(),
    )

    solid_f = single_output_solid(
        name='F',
        input_defs=[InputDefinition(name='C'),
                    InputDefinition(name='E')],
        compute_fn=success_fn,
        output_def=OutputDefinition(),
    )

    pipeline_def = PipelineDefinition(
        solid_defs=[solid_a, solid_b, solid_c, solid_d, solid_e, solid_f],
        dependencies={
            'B': {'A': DependencyDefinition(solid_a.name)},
            'D': {'A': DependencyDefinition(solid_a.name)},
            'C': {'B': DependencyDefinition(solid_b.name)},
            'E': {'D': DependencyDefinition(solid_d.name)},
            'F': {
                'C': DependencyDefinition(solid_c.name),
                'E': DependencyDefinition(solid_e.name),
            },
        },
    )

    pipeline_result = execute_pipeline(
        pipeline_def, run_config=RunConfig.nonthrowing_in_process())

    assert pipeline_result.result_for_solid('A').success
    assert pipeline_result.result_for_solid('B').success
    assert pipeline_result.result_for_solid('C').success
    assert not pipeline_result.result_for_solid('D').success
    assert pipeline_result.result_for_solid('D').failure_data.error.cls_name == 'CheckError'
    assert not pipeline_result.result_for_solid('E').success
    assert pipeline_result.result_for_solid('E').skipped
    assert not pipeline_result.result_for_solid('F').success
    assert pipeline_result.result_for_solid('F').skipped
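The test relies on helpers that the listing does not show; create_root_success_solid presumably builds a no-input solid that always succeeds. Below is a rough sketch (an assumption, for illustration) of what single_output_solid might look like; the real helper in the test suite may differ.

# A rough sketch (assumed) of the single_output_solid test helper used above.
from dagster import Output, SolidDefinition

def single_output_solid(name, input_defs, compute_fn, output_def):
    def _compute(context, input_values):
        # Delegate to the user-supplied compute_fn and wrap its return value
        # in the solid's single declared output.
        yield Output(compute_fn(context, input_values), output_def.name)

    return SolidDefinition(
        name=name,
        input_defs=input_defs,
        compute_fn=_compute,
        output_defs=[output_def],
    )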
Example #4
    return {'solids': solids_config}


def create_sum_table():
    def compute(_context, inputs):
        num_csv = inputs['num_csv']
        check.inst_param(num_csv, 'num_csv', pd.DataFrame)
        num_csv['sum'] = num_csv['num1'] + num_csv['num2']
        return num_csv

    return _dataframe_solid(name='sum_table',
                            input_defs=[InputDefinition('num_csv', DataFrame)],
                            compute_fn=compute)


@lambda_solid(input_defs=[InputDefinition('num_csv', DataFrame)],
              output_def=OutputDefinition(DataFrame))
def sum_table(num_csv):
    check.inst_param(num_csv, 'num_csv', pd.DataFrame)
    num_csv['sum'] = num_csv['num1'] + num_csv['num2']
    return num_csv


@lambda_solid(input_defs=[InputDefinition('sum_df', DataFrame)],
              output_def=OutputDefinition(DataFrame))
def sum_sq_table(sum_df):
    sum_df['sum_squared'] = sum_df['sum'] * sum_df['sum']
    return sum_df


@lambda_solid(
Example #5
        Row(name="John", age=19),
        Row(name="Jennifer", age=29),
        Row(name="Henry", age=50)
    ]
    return context.resources.pyspark.spark_session.createDataFrame(
        rows, schema)


@solid(
    name="blah",
    description="this is a test",
    config_schema={
        "foo": str,
        "bar": int
    },
    input_defs=[InputDefinition("people", DataFrame)],
    output_defs=[OutputDefinition(DataFrame)],
    required_resource_keys={"pyspark_step_launcher"},
)
def filter_df_solid(_, people):
    return people.filter(people["age"] < 30)


MODE_DEFS = [
    ModeDefinition(
        "prod",
        resource_defs={
            "pyspark_step_launcher": emr_pyspark_step_launcher,
            "pyspark": pyspark_resource,
            "s3": s3_resource,
        },
Example #6
def test_solid_definition_errors():
    with pytest.raises(DagsterInvalidDefinitionError,
                       match="positional vararg"):

        @solid(input_defs=[InputDefinition(name="foo")],
               output_defs=[OutputDefinition()])
        def vargs(context, foo, *args):
            pass

    with pytest.raises(DagsterInvalidDefinitionError):

        @solid(input_defs=[InputDefinition(name="foo")],
               output_defs=[OutputDefinition()])
        def wrong_name(context, bar):
            pass

    with pytest.raises(DagsterInvalidDefinitionError):

        @solid(
            input_defs=[
                InputDefinition(name="foo"),
                InputDefinition(name="bar")
            ],
            output_defs=[OutputDefinition()],
        )
        def wrong_name_2(context, foo):
            pass

    with pytest.raises(DagsterInvalidDefinitionError):

        @solid(input_defs=[InputDefinition(name="foo")],
               output_defs=[OutputDefinition()])
        def no_context(foo):
            pass

    with pytest.raises(DagsterInvalidDefinitionError):

        @solid(input_defs=[InputDefinition(name="foo")],
               output_defs=[OutputDefinition()])
        def extras(_context, foo, bar):
            pass

    @solid(
        input_defs=[InputDefinition(name="foo"),
                    InputDefinition(name="bar")],
        output_defs=[OutputDefinition()],
    )
    def valid_kwargs(context, **kwargs):
        pass

    @solid(
        input_defs=[InputDefinition(name="foo"),
                    InputDefinition(name="bar")],
        output_defs=[OutputDefinition()],
    )
    def valid(context, foo, bar):
        pass

    @solid
    def valid_because_inference(context, foo, bar):
        pass
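For contrast with the failing cases, a hedged illustration (not part of the original test) of what the inference case provides: without explicit input_defs, the @solid decorator infers the inputs from the function signature, excluding the leading context argument. Appended inside the test, something like this should hold:

    # Hedged illustration (assumed): "foo" and "bar" are inferred as inputs
    # from the signature of valid_because_inference; "context" is excluded.
    assert [inp.name for inp in valid_because_inference.input_defs] == ["foo", "bar"]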
Example #7
def _define_nothing_dep_pipeline():
    @lambda_solid(output_def=OutputDefinition(Nothing, "complete"))
    def start_nothing():
        pass

    @lambda_solid(input_defs=[
        InputDefinition("add_complete", Nothing),
        InputDefinition("yield_complete", Nothing),
    ])
    def end_nothing():
        pass

    @lambda_solid(output_def=OutputDefinition(Int))
    def emit_value():
        return 1

    @lambda_solid(
        input_defs=[
            InputDefinition("on_complete", Nothing),
            InputDefinition("num", Int)
        ],
        output_def=OutputDefinition(Int),
    )
    def add_value(num):
        return 1 + num

    @solid(
        name="yield_values",
        input_defs=[InputDefinition("on_complete", Nothing)],
        output_defs=[
            OutputDefinition(Int, "num_1"),
            OutputDefinition(Int, "num_2"),
            OutputDefinition(Nothing, "complete"),
        ],
    )
    def yield_values(_context):
        yield Output(1, "num_1")
        yield Output(2, "num_2")
        yield Output(None, "complete")

    return PipelineDefinition(
        name="simple_exc",
        solid_defs=[
            emit_value, add_value, start_nothing, end_nothing, yield_values
        ],
        dependencies={
            "add_value": {
                "on_complete": DependencyDefinition("start_nothing",
                                                    "complete"),
                "num": DependencyDefinition("emit_value"),
            },
            "yield_values": {
                "on_complete": DependencyDefinition("start_nothing",
                                                    "complete")
            },
            "end_nothing": {
                "add_complete": DependencyDefinition("add_value"),
                "yield_complete": DependencyDefinition("yield_values",
                                                       "complete"),
            },
        },
    )
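A minimal usage sketch (assumed, based on the legacy dagster API used above; not part of the original source): the returned pipeline can be executed directly, and the Nothing dependencies only enforce ordering, so add_value still receives its num from emit_value.

# Minimal sketch, assuming the legacy dagster API used above.
from dagster import execute_pipeline

result = execute_pipeline(_define_nothing_dep_pipeline())
assert result.success
# The Nothing inputs carry no value; add_value computes 1 + emit_value() == 2.
assert result.result_for_solid("add_value").output_value() == 2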
Example #8
                working_directory=None,
                location_name=main_repo_location_name(),
            ),
    ) as workspace_process_context:
        yield workspace_process_context.create_request_context()


@contextmanager
def get_main_external_repo(instance):
    with get_main_workspace(instance) as workspace:
        location = workspace.get_repository_location(main_repo_location_name())
        yield location.get_repository(main_repo_name())


@lambda_solid(
    input_defs=[InputDefinition("num", PoorMansDataFrame)],
    output_def=OutputDefinition(PoorMansDataFrame),
)
def sum_solid(num):
    sum_df = deepcopy(num)
    for x in sum_df:
        x["sum"] = int(x["num1"]) + int(x["num2"])
    return sum_df


@lambda_solid(
    input_defs=[InputDefinition("sum_df", PoorMansDataFrame)],
    output_def=OutputDefinition(PoorMansDataFrame),
)
def sum_sq_solid(sum_df):
    sum_sq_df = deepcopy(sum_df)