Code Example #1
import time

from dagster import Float, InputDefinition, Int, Nothing, String, pipeline, solid


@solid
def div_2(_, x: Float) -> float:
    return x / 2


@solid
def concat(_, x: String, y: str) -> str:
    return x + y


@solid
def wait(_) -> Nothing:
    time.sleep(0.2)
    return


@solid(input_defs=[InputDefinition("ready", dagster_type=Nothing)])
def done(_) -> str:
    return "done"


@pipeline
def nothing_pipeline():
    done(wait())


@solid
def wait_int(_) -> Int:
    time.sleep(0.2)
    return 1
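
A minimal usage sketch (not part of the original snippet): executing nothing_pipeline shows that the Nothing-typed "ready" input only orders execution, so done runs after wait finishes without receiving any value from it.

from dagster import execute_pipeline

result = execute_pipeline(nothing_pipeline)
assert result.success
assert result.result_for_solid("done").output_value() == "done"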

Code Example #2
# pylint: disable=unused-argument
from dagster import InputDefinition, ModeDefinition, input_manager, pipeline, solid


def read_dataframe_from_table(**_kwargs):
    pass


# start_marker
@solid(
    input_defs=[InputDefinition("dataframe", manager_key="my_root_manager")])
def my_solid(_, dataframe):
    """Do some stuff"""


@input_manager
def table1_loader(_):
    return read_dataframe_from_table(name="table1")


@pipeline(mode_defs=[
    ModeDefinition(resource_defs={"my_root_manager": table1_loader})
])
def my_pipeline():
    my_solid()


# end_marker
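
A minimal usage sketch (not part of the original snippet): because the pipeline's default mode binds "my_root_manager" to table1_loader, executing my_pipeline loads the "dataframe" input through the input manager instead of from an upstream solid.

from dagster import execute_pipeline

result = execute_pipeline(my_pipeline)
assert result.success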
Code Example #3
    return {'solids': solids_config}


def create_sum_table():
    def transform(_context, inputs):
        num_csv = inputs['num_csv']
        check.inst_param(num_csv, 'num_csv', pd.DataFrame)
        num_csv['sum'] = num_csv['num1'] + num_csv['num2']
        return num_csv

    return _dataframe_solid(name='sum_table',
                            input_defs=[InputDefinition('num_csv', DataFrame)],
                            compute_fn=transform)


@lambda_solid(input_defs=[InputDefinition('num_csv', DataFrame)],
              output_def=OutputDefinition(DataFrame))
def sum_table(num_csv):
    check.inst_param(num_csv, 'num_csv', pd.DataFrame)
    num_csv['sum'] = num_csv['num1'] + num_csv['num2']
    return num_csv


@lambda_solid(input_defs=[InputDefinition('sum_df', DataFrame)],
              output_def=OutputDefinition(DataFrame))
def sum_sq_table(sum_df):
    sum_df['sum_squared'] = sum_df['sum'] * sum_df['sum']
    return sum_df


@lambda_solid(
Code Example #4
def create_sum_table():
    def transform(_context, inputs):
        num_csv = inputs['num_csv']
        check.inst_param(num_csv, 'num_csv', pd.DataFrame)
        num_csv['sum'] = num_csv['num1'] + num_csv['num2']
        return num_csv

    return _dataframe_solid(
        name='sum_table',
        inputs=[InputDefinition('num_csv', dagster_pd.DataFrame)],
        transform_fn=transform,
    )


@lambda_solid(
    inputs=[InputDefinition('num_csv', dagster_pd.DataFrame)],
    output=OutputDefinition(dagster_pd.DataFrame),
)
def sum_table(num_csv):
    check.inst_param(num_csv, 'num_csv', pd.DataFrame)
    num_csv['sum'] = num_csv['num1'] + num_csv['num2']
    return num_csv


@lambda_solid(
    inputs=[InputDefinition('sum_df', dagster_pd.DataFrame)],
    output=OutputDefinition(dagster_pd.DataFrame),
)
def sum_sq_table(sum_df):
    sum_df['sum_squared'] = sum_df['sum'] * sum_df['sum']
    return sum_df
Code Example #5
    execution_manager = MultiprocessingExecutionManager()
    execution_manager.execute_pipeline(handle,
                                       crashy_pipeline,
                                       pipeline_run,
                                       raise_on_error=False)
    execution_manager.join()
    assert pipeline_run.status == PipelineRunStatus.FAILURE
    last_log = pipeline_run.all_logs()[-1]
    print(last_log.message)
    assert last_log.message.startswith(
        'Exception: Pipeline execution process for {run_id} unexpectedly exited\n'
        .format(run_id=run_id))


@lambda_solid(
    input_defs=[InputDefinition('num', PoorMansDataFrame)],
    output_def=OutputDefinition(PoorMansDataFrame),
)
def sum_solid(num):
    sum_df = deepcopy(num)
    for x in sum_df:
        x['sum'] = x['num1'] + x['num2']
    return PoorMansDataFrame(sum_df)


@lambda_solid(
    input_defs=[InputDefinition('sum_df', PoorMansDataFrame)],
    output_def=OutputDefinition(PoorMansDataFrame),
)
def error_solid(sum_df):  # pylint: disable=W0613
    raise Exception('foo')
Code Example #6
File: repository.py Project: nikie/dagster
def LR_solid():
    return dagstermill.define_dagstermill_solid(
        'linear_regression',
        nb_test_path('tutorial_LR'),
        input_defs=[InputDefinition(name='df', dagster_type=DataFrame)],
    )
Code Example #7
def test_pipeline():
    return simple()


@pipeline(mode_defs=celery_mode_defs)
def test_serial_pipeline():
    return add_one(simple())


@solid(output_defs=[OutputDefinition(name='value_one'), OutputDefinition(name='value_two')])
def emit_values(_context):
    yield Output(1, 'value_one')
    yield Output(2, 'value_two')


@lambda_solid(input_defs=[InputDefinition('num_one'), InputDefinition('num_two')])
def subtract(num_one, num_two):
    return num_one - num_two


@pipeline(mode_defs=celery_mode_defs)
def test_diamond_pipeline():
    value_one, value_two = emit_values()
    return subtract(num_one=add_one(num=value_one), num_two=add_one.alias('renamed')(num=value_two))


@pipeline(mode_defs=celery_mode_defs)
def test_parallel_pipeline():
    value = simple()
    for i in range(10):
        add_one.alias('add_one_' + str(i))(value)
Code Example #8
    Float,
    InputDefinition,
    Int,
    Output,
    RunConfig,
    String,
    composite_solid,
    execute_pipeline,
    lambda_solid,
    pipeline,
    solid,
)


# have to use "pipe" solid since "result_for_solid" doesn't work with composite mappings
@lambda_solid(input_defs=[InputDefinition('input_str')])
def pipe(input_str):
    return input_str


@solid(config_field=Field(String, is_optional=True))
def scalar_config_solid(context):
    yield Output(context.solid_config)


@composite_solid(
    config={'override_str': Field(String)},
    config_fn=lambda _, cfg:
    {'scalar_config_solid': {
        'config': cfg['override_str']
    }},
Code Example #9
def test_wrap_all_config_one_input():
    @solid(
        config={
            'config_field_a': Field(String),
            'config_field_b': Field(String)
        },
        input_defs=[
            InputDefinition('input_a', String),
            InputDefinition('input_b', String)
        ],
    )
    def basic(context, input_a, input_b):
        res = '.'.join([
            context.solid_config['config_field_a'],
            context.solid_config['config_field_b'],
            input_a,
            input_b,
        ])
        yield Output(res)

    @composite_solid(
        input_defs=[InputDefinition('input_a', String)],
        config_fn=lambda _, cfg: {
            'basic': {
                'config': {
                    'config_field_a': cfg['config_field_a'],
                    'config_field_b': cfg['config_field_b'],
                },
                'inputs': {
                    'input_b': {
                        'value': 'set_input_b'
                    }
                },
            }
        },
        config={
            'config_field_a': Field(String),
            'config_field_b': Field(String)
        },
    )
    def wrap_all_config_one_input(input_a):
        return basic(input_a)

    @pipeline(name='config_mapping')
    def config_mapping_pipeline():
        return pipe(wrap_all_config_one_input())

    result = execute_pipeline(
        config_mapping_pipeline,
        {
            'solids': {
                'wrap_all_config_one_input': {
                    'config': {
                        'config_field_a': 'override_a',
                        'config_field_b': 'override_b'
                    },
                    'inputs': {
                        'input_a': {
                            'value': 'set_input_a'
                        }
                    },
                }
            }
        },
    )
    assert result.success
    assert (result.result_for_solid('pipe').output_value() ==
            'override_a.override_b.set_input_a.set_input_b')

    with pytest.raises(DagsterInvalidConfigError) as exc_info:
        result = execute_pipeline(
            config_mapping_pipeline,
            {
                'solids': {
                    'wrap_all_config_one_input': {
                        'config': {
                            'config_field_a': 1234,
                            'config_field_b': 'override_b'
                        },
                        'inputs': {
                            'input_a': {
                                'value': 'set_input_a'
                            }
                        },
                    }
                }
            },
        )
    assert len(exc_info.value.errors) == 1
    assert exc_info.value.errors[0].message == (
        'Value at path root:solids:wrap_all_config_one_input:config:config_field_a is not valid. '
        'Expected "String"')

    with pytest.raises(DagsterInvalidConfigError) as exc_info:
        result = execute_pipeline(
            config_mapping_pipeline,
            {
                'solids': {
                    'wrap_all_config_one_input': {
                        'config': {
                            'config_field_a': 'override_a',
                            'config_field_b': 'override_b'
                        },
                        'inputs': {
                            'input_a': {
                                'value': 1234
                            }
                        },
                    }
                }
            },
        )
    assert len(exc_info.value.errors) == 1
    assert exc_info.value.errors[0].message == (
        'Value at path root:solids:wrap_all_config_one_input:inputs:input_a:value is not valid. '
        'Expected "String"')
Code Example #10
File: test_ops.py Project: trevenrawr/dagster
def test_pd_df_load():
    dataset = get_dataset()
    table = "%s.%s" % (dataset, "df")

    test_df = pd.DataFrame({"num1": [1, 3], "num2": [2, 4]})

    create_op = bq_create_dataset.alias("create_op")
    load_op = import_df_to_bq.alias("load_op")
    query_op = bq_op_for_queries(["SELECT num1, num2 FROM %s" % table
                                  ]).alias("query_op")
    delete_op = bq_delete_dataset.alias("delete_op")

    @op(input_defs=[InputDefinition("success", Nothing)],
        output_defs=[OutputDefinition(DataFrame)])
    def return_df(_context):  # pylint: disable=unused-argument
        return test_df

    @job(resource_defs={"bigquery": bigquery_resource})
    def bq_circle_of_life():
        delete_op(query_op(load_op(return_df(create_op()))))

    result = bq_circle_of_life.execute_in_process(
        run_config={
            "ops": {
                "create_op": {
                    "config": {
                        "dataset": dataset,
                        "exists_ok": True
                    }
                },
                "load_op": {
                    "config": {
                        "destination": table
                    }
                },
                "delete_op": {
                    "config": {
                        "dataset": dataset,
                        "delete_contents": True
                    }
                },
            }
        })
    assert result.success

    values = result.output_for_node("query_op")
    assert values[0].to_dict() == test_df.to_dict()

    # BQ loads should throw an exception if pyarrow and fastparquet aren't available
    with mock.patch.dict(sys.modules, {"pyarrow": None, "fastparquet": None}):
        with pytest.raises(DagsterExecutionStepExecutionError) as exc_info:
            bq_circle_of_life.execute_in_process(
                run_config={
                    "ops": {
                        "create_op": {
                            "config": {
                                "dataset": dataset,
                                "exists_ok": True
                            }
                        },
                        "load_op": {
                            "config": {
                                "destination": table
                            }
                        },
                        "delete_op": {
                            "config": {
                                "dataset": dataset,
                                "delete_contents": True
                            }
                        },
                    }
                })
        assert (
            "loading data to BigQuery from pandas DataFrames requires either pyarrow or fastparquet"
            " to be installed" in str(exc_info.value.user_exception))

        @job(resource_defs={"bigquery": bigquery_resource})
        def cleanup_bq():
            delete_op()

        result = cleanup_bq.execute_in_process(
            run_config={
                "ops": {
                    "delete_op": {
                        "config": {
                            "dataset": dataset,
                            "delete_contents": True
                        }
                    }
                }
            })
        assert result.success

    assert not dataset_exists(dataset)
Code Example #11
File: test_ops.py Project: trevenrawr/dagster
def test_gcs_load():
    dataset = get_dataset()
    table = "%s.%s" % (dataset, "df")

    create_op = bq_create_dataset.alias("create_op")
    query_op = bq_op_for_queries([
        "SELECT string_field_0, string_field_1 FROM %s ORDER BY string_field_0 ASC LIMIT 1"
        % table
    ]).alias("query_op")
    delete_op = bq_delete_dataset.alias("delete_op")

    @op(input_defs=[InputDefinition("success", Nothing)],
        output_defs=[OutputDefinition(List[str])])
    def return_gcs_uri(_context):  # pylint: disable=unused-argument
        return ["gs://cloud-samples-data/bigquery/us-states/us-states.csv"]

    @job(resource_defs={"bigquery": bigquery_resource})
    def bq_from_gcs():
        delete_op(query_op(import_gcs_paths_to_bq(return_gcs_uri(
            create_op()))))

    result = bq_from_gcs.execute_in_process(
        run_config={
            "ops": {
                "create_op": {
                    "config": {
                        "dataset": dataset,
                        "exists_ok": True
                    }
                },
                "import_gcs_paths_to_bq": {
                    "config": {
                        "destination": table,
                        "load_job_config": {
                            "autodetect": True,
                            "skip_leading_rows": 1,
                            "source_format": "CSV",
                            "write_disposition": "WRITE_TRUNCATE",
                        },
                    }
                },
                "delete_op": {
                    "config": {
                        "dataset": dataset,
                        "delete_contents": True
                    }
                },
            }
        })
    assert result.success

    values = result.output_for_node("query_op")
    assert values[0].to_dict() == {
        "string_field_0": {
            0: "Alabama"
        },
        "string_field_1": {
            0: "AL"
        }
    }

    assert not dataset_exists(dataset)
Code Example #12
File: solids.py Project: yingjiebyron/dagster
def create_databricks_job_solid(
        name="databricks_job",
        num_inputs=1,
        description=None,
        required_resource_keys=frozenset(["databricks_client"]),
):
    """
    Creates a solid that launches a Databricks job.

    As config, the solid accepts a blob of the form described in Databricks' job API:
    https://docs.databricks.com/dev-tools/api/latest/jobs.html.

    Returns:
        SolidDefinition: A solid definition.
    """
    check.str_param(name, "name")
    check.opt_str_param(description, "description")
    check.int_param(num_inputs, "num_inputs")
    check.set_param(required_resource_keys,
                    "required_resource_keys",
                    of_type=str)

    input_defs = [
        InputDefinition("input_" + str(i), Nothing) for i in range(num_inputs)
    ]

    @solid(
        name=name,
        description=description,
        config_schema={
            "job":
            Field(
                Permissive(),
                description=
                "Databricks job run configuration, in the form described in "
                "Databricks' job API: https://docs.databricks.com/dev-tools/api/latest/jobs.html",
            ),
            "poll_interval_sec":
            Field(
                float,
                description="Check whether the job is done at this interval.",
                default_value=10,
            ),
            "max_wait_time_sec":
            Field(
                float,
                description=
                "If the job is not complete after this length of time, raise an error.",
                default_value=(24 * 60 * 60),
            ),
        },
        input_defs=input_defs,
        output_defs=[OutputDefinition(Nothing)],
        required_resource_keys=required_resource_keys,
        tags={"kind": "databricks"},
    )
    def databricks_solid(context):
        job_config = context.solid_config["job"]
        databricks_client = context.resources.databricks_client
        run_id = databricks_client.submit_run(**job_config)

        context.log.info(
            "Launched Databricks job with run id {run_id}. UI: {url}. Waiting for run to complete..."
            .format(run_id=run_id,
                    url=create_ui_url(databricks_client,
                                      context.solid_config)))
        wait_for_run_to_complete(
            databricks_client,
            context.log,
            run_id,
            context.solid_config["poll_interval_sec"],
            context.solid_config["max_wait_time_sec"],
        )

    return databricks_solid
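
A minimal wiring sketch (not part of the original snippet; pipeline and ModeDefinition are assumed to be imported from dagster, and my_databricks_client_resource is a hypothetical stand-in for whatever resource provides the Databricks API client): the factory's return value is used like any other solid. Executing the pipeline would still require run config for the solid's "job" field.

run_sparkpi = create_databricks_job_solid(name="run_sparkpi", num_inputs=0)


@pipeline(mode_defs=[
    ModeDefinition(
        resource_defs={"databricks_client": my_databricks_client_resource})
])
def databricks_pipeline():
    run_sparkpi()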
Code Example #13
def define_test_all_scalars_pipeline():
    @lambda_solid(input_defs=[InputDefinition('num', Int)])
    def take_int(num):
        return num

    @lambda_solid(output_def=OutputDefinition(Int))
    def produce_int():
        return 2

    @lambda_solid(input_defs=[InputDefinition('string', String)])
    def take_string(string):
        return string

    @lambda_solid(output_def=OutputDefinition(String))
    def produce_string():
        return 'foo'

    @lambda_solid(input_defs=[InputDefinition('path', Path)])
    def take_path(path):
        return path

    @lambda_solid(output_def=OutputDefinition(Path))
    def produce_path():
        return '/path/to/foo'

    @lambda_solid(input_defs=[InputDefinition('float_number', Float)])
    def take_float(float_number):
        return float_number

    @lambda_solid(output_def=OutputDefinition(Float))
    def produce_float():
        return 3.14

    @lambda_solid(input_defs=[InputDefinition('bool_value', Bool)])
    def take_bool(bool_value):
        return bool_value

    @lambda_solid(output_def=OutputDefinition(Bool))
    def produce_bool():
        return True

    @lambda_solid(input_defs=[InputDefinition('any_value', Any)])
    def take_any(any_value):
        return any_value

    @lambda_solid(output_def=OutputDefinition(Any))
    def produce_any():
        return True

    @lambda_solid(input_defs=[InputDefinition('string_list', List[String])])
    def take_string_list(string_list):
        return string_list

    @lambda_solid(
        input_defs=[InputDefinition('nullable_string', Optional[String])])
    def take_nullable_string(nullable_string):
        return nullable_string

    return PipelineDefinition(
        name='test_all_scalars_pipeline',
        solid_defs=[
            produce_any,
            produce_bool,
            produce_float,
            produce_int,
            produce_path,
            produce_string,
            take_any,
            take_bool,
            take_float,
            take_int,
            take_nullable_string,
            take_path,
            take_string,
            take_string_list,
        ],
    )
Code Example #14
    lambda_solid,
    solid,
    pipeline,
)


def builder(graph):
    return graph.add_one(graph.return_one())


@lambda_solid
def return_one():
    return 1


@lambda_solid(input_defs=[InputDefinition('num')])
def add_one(num):
    return num + 1


def test_basic_use_case():
    pipeline_def = PipelineDefinition(
        name='basic',
        solid_defs=[return_one, add_one],
        dependencies={'add_one': {'num': DependencyDefinition('return_one')}},
    )

    assert execute_pipeline(pipeline_def).result_for_solid('add_one').output_value() == 2


def test_basic_use_case_with_dsl():
Code Example #15
File: test_definitions.py Project: zorrock/dagster
def test_solid_def():
    @lambda_solid
    def produce_string():
        return 'foo'

    @solid(
        inputs=[InputDefinition('input_one', types.String)],
        outputs=[OutputDefinition(types.Any)],
        config_field=Field(Dict({'another_field': Field(types.Int)})),
    )
    def solid_one(_context, input_one):
        raise Exception('should not execute')

    pipeline_def = PipelineDefinition(
        solids=[produce_string, solid_one],
        dependencies={
            'solid_one': {
                'input_one': DependencyDefinition('produce_string')
            }
        },
    )

    assert len(pipeline_def.solids[0].output_handles()) == 1

    assert isinstance(pipeline_def.solid_named('solid_one'), Solid)

    solid_one_solid = pipeline_def.solid_named('solid_one')

    assert solid_one_solid.has_input('input_one')

    assert isinstance(solid_one_solid.input_def_named('input_one'),
                      InputDefinition)

    assert len(solid_one_solid.input_defs) == 1
    assert len(solid_one_solid.output_defs) == 1

    assert str(solid_one_solid.input_handle('input_one')) == (
        'SolidInputHandle(definition_name="\'solid_one\'", input_name="\'input_one\'", '
        'solid_name="\'solid_one\'")')

    assert repr(solid_one_solid.input_handle('input_one')) == (
        'SolidInputHandle(definition_name="\'solid_one\'", input_name="\'input_one\'", '
        'solid_name="\'solid_one\'")')

    assert str(solid_one_solid.output_handle('result')) == (
        'SolidOutputHandle(definition_name="\'solid_one\'", output_name="\'result\'", '
        'solid_name="\'solid_one\'")')

    assert repr(solid_one_solid.output_handle('result')) == (
        'SolidOutputHandle(definition_name="\'solid_one\'", output_name="\'result\'", '
        'solid_name="\'solid_one\'")')

    assert solid_one_solid.output_handle('result') == SolidOutputHandle(
        solid_one_solid, solid_one_solid.output_defs[0])

    assert len(
        pipeline_def.dependency_structure.deps_of_solid_with_input(
            'solid_one')) == 1

    assert len(
        pipeline_def.dependency_structure.depended_by_of_solid(
            'produce_string')) == 1

    assert len(pipeline_def.dependency_structure.input_handles()) == 1

    assert len(pipeline_def.dependency_structure.items()) == 1
Code Example #16
def test_wrap_all_config_and_inputs():
    @solid(
        config={
            'config_field_a': Field(String),
            'config_field_b': Field(String)
        },
        input_defs=[
            InputDefinition('input_a', String),
            InputDefinition('input_b', String)
        ],
    )
    def basic(context, input_a, input_b):
        res = '.'.join([
            context.solid_config['config_field_a'],
            context.solid_config['config_field_b'],
            input_a,
            input_b,
        ])
        yield Output(res)

    @composite_solid(
        config_fn=lambda _, cfg: {
            'basic': {
                'config': {
                    'config_field_a': cfg['config_field_a'],
                    'config_field_b': cfg['config_field_b'],
                },
                'inputs': {
                    'input_a': {
                        'value': 'override_input_a'
                    },
                    'input_b': {
                        'value': 'override_input_b'
                    },
                },
            }
        },
        config={
            'config_field_a': Field(String),
            'config_field_b': Field(String)
        },
    )
    def wrap_all():
        return basic()

    @pipeline(name='config_mapping')
    def config_mapping_pipeline():
        return pipe(wrap_all())

    result = execute_pipeline(
        config_mapping_pipeline,
        {
            'solids': {
                'wrap_all': {
                    'config': {
                        'config_field_a': 'override_a',
                        'config_field_b': 'override_b'
                    }
                }
            }
        },
    )

    assert result.success
    assert (result.result_for_solid('pipe').output_value() ==
            'override_a.override_b.override_input_a.override_input_b')

    with pytest.raises(DagsterInvalidConfigError) as exc_info:
        result = execute_pipeline(
            config_mapping_pipeline,
            {
                'solids': {
                    'wrap_all': {
                        'config': {
                            'config_field_a': 'override_a',
                            'this_key_doesnt_exist': 'override_b',
                        }
                    }
                }
            },
        )

    assert len(exc_info.value.errors) == 2
    assert exc_info.value.errors[0].message == (
        'Field "this_key_doesnt_exist" is not defined at path root:solids:wrap_all:config '
        'Expected: "{ config_field_a: String config_field_b: String }"')

    assert (
        exc_info.value.errors[1].message ==
        'Missing required field "config_field_b" at path root:solids:wrap_all:config '
        'Available Fields: "[\'config_field_a\', \'config_field_b\']".')
Code Example #17
File: test_input_manager.py Project: prezi/dagster
def test_input_manager_with_retries():
    _count = {"total": 0}

    @root_input_manager
    def should_succeed_after_retries(_):
        if _count["total"] < 2:
            _count["total"] += 1
            raise RetryRequested(max_retries=3)
        return "foo"

    @root_input_manager
    def should_retry(_):
        raise RetryRequested(max_retries=3)

    @solid(input_defs=[
        InputDefinition("solid_input",
                        root_manager_key="should_succeed_after_retries")
    ])
    def take_input_1(_, solid_input):
        return solid_input

    @solid(input_defs=[
        InputDefinition("solid_input", root_manager_key="should_retry")
    ])
    def take_input_2(_, solid_input):
        return solid_input

    @solid
    def take_input_3(_, _input1, _input2):
        assert False, "should not be called"

    @pipeline(mode_defs=[
        ModeDefinition(
            resource_defs={
                "should_succeed_after_retries": should_succeed_after_retries,
                "should_retry": should_retry,
            })
    ])
    def simple():
        take_input_3(take_input_2(), take_input_1())

    with tempfile.TemporaryDirectory() as tmpdir_path:

        instance = DagsterInstance.from_ref(InstanceRef.from_dir(tmpdir_path))

        result = execute_pipeline(simple,
                                  instance=instance,
                                  raise_on_error=False)

        step_stats = instance.get_run_step_stats(result.run_id)
        assert len(step_stats) == 2

        step_stats_1 = instance.get_run_step_stats(result.run_id,
                                                   step_keys=["take_input_1"])
        assert len(step_stats_1) == 1
        step_stat_1 = step_stats_1[0]
        assert step_stat_1.status.value == "SUCCESS"
        assert step_stat_1.attempts == 3

        step_stats_2 = instance.get_run_step_stats(result.run_id,
                                                   step_keys=["take_input_2"])
        assert len(step_stats_2) == 1
        step_stat_2 = step_stats_2[0]
        assert step_stat_2.status.value == "FAILURE"
        assert step_stat_2.attempts == 4

        step_stats_3 = instance.get_run_step_stats(result.run_id,
                                                   step_keys=["take_input_3"])
        assert len(step_stats_3) == 0
Code Example #18
    OutputDefinition,
    Path,
    PipelineDefinition,
    solid,
)

from dagster_pyspark import spark_session_resource, SparkRDD


def parseNeighbors(urls):
    """Parses a urls pair string into urls pair."""
    parts = re.split(r'\s+', urls)
    return parts[0], parts[1]


@solid(inputs=[InputDefinition('pagerank_data', Path)],
       outputs=[OutputDefinition(SparkRDD)])
def parse_pagerank_data_step_five(context, pagerank_data):
    lines = context.resources.spark.read.text(pagerank_data).rdd.map(
        lambda r: r[0])
    return lines.map(parseNeighbors)


@solid(inputs=[InputDefinition('urls', SparkRDD)],
       outputs=[OutputDefinition(SparkRDD)])
def compute_links_step_five(_context, urls):
    return urls.distinct().groupByKey().cache()


def computeContribs(urls, rank):
    """Calculates URL contributions to the rank of other URLs."""
Code Example #19
File: repository.py Project: nikie/dagster
def RF_solid():
    return dagstermill.define_dagstermill_solid(
        'random_forest_regression',
        nb_test_path('tutorial_RF'),
        input_defs=[InputDefinition(name='df', dagster_type=DataFrame)],
    )
Code Example #20
def test_depends_on_adls2_resource_file_manager(storage_account, file_system):
    bar_bytes = "bar".encode()

    @solid(output_defs=[OutputDefinition(ADLS2FileHandle)],
           required_resource_keys={"file_manager"})
    def emit_file(context):
        return context.resources.file_manager.write_data(bar_bytes)

    @solid(
        input_defs=[InputDefinition("file_handle", ADLS2FileHandle)],
        required_resource_keys={"file_manager"},
    )
    def accept_file(context, file_handle):
        local_path = context.resources.file_manager.copy_handle_to_local_temp(
            file_handle)
        assert isinstance(local_path, str)
        assert open(local_path, "rb").read() == bar_bytes

    adls2_fake_resource = FakeADLS2Resource(storage_account)
    adls2_fake_file_manager = ADLS2FileManager(
        adls2_client=adls2_fake_resource.adls2_client,
        file_system=file_system,
        prefix="some-prefix",
    )

    @pipeline(mode_defs=[
        ModeDefinition(
            intermediate_storage_defs=
            adls2_plus_default_intermediate_storage_defs,
            resource_defs={
                "adls2":
                ResourceDefinition.hardcoded_resource(adls2_fake_resource),
                "file_manager":
                ResourceDefinition.hardcoded_resource(adls2_fake_file_manager),
            },
        )
    ])
    def adls2_file_manager_test():
        accept_file(emit_file())

    result = execute_pipeline(
        adls2_file_manager_test,
        run_config={
            "intermediate_storage": {
                "adls2": {
                    "config": {
                        "adls2_file_system": file_system
                    }
                }
            }
        },
    )

    assert result.success

    keys_in_bucket = set(
        adls2_fake_resource.adls2_client.file_systems[file_system].keys())

    for step_key, output_name in [
        ("emit_file", "result"),
        ("accept_file", "result"),
    ]:
        keys_in_bucket.remove(
            create_adls2_key(result.run_id, step_key, output_name))

    assert len(keys_in_bucket) == 1

    file_key = list(keys_in_bucket)[0]
    comps = file_key.split("/")

    assert "/".join(comps[:-1]) == "some-prefix"

    assert uuid.UUID(comps[-1])
Code Example #21
    return DagsterGraphQLContext(
        instance=DagsterInstance.ephemeral(),
        locations=[InProcessRepositoryLocation(create_main_recon_repo())],
    )


def main_repo_location_name():
    return '<<in_process>>'


def main_repo_name():
    return 'test_repo'


@lambda_solid(
    input_defs=[InputDefinition('num', PoorMansDataFrame)],
    output_def=OutputDefinition(PoorMansDataFrame),
)
def sum_solid(num):
    sum_df = deepcopy(num)
    for x in sum_df:
        x['sum'] = int(x['num1']) + int(x['num2'])
    return sum_df


@lambda_solid(
    input_defs=[InputDefinition('sum_df', PoorMansDataFrame)],
    output_def=OutputDefinition(PoorMansDataFrame),
)
def sum_sq_solid(sum_df):
    sum_sq_df = deepcopy(sum_df)
Code Example #22
def test_depends_on_adls2_resource_intermediates(storage_account, file_system):
    @solid(
        input_defs=[
            InputDefinition("num_one", Int),
            InputDefinition("num_two", Int)
        ],
        output_defs=[OutputDefinition(Int)],
    )
    def add_numbers(_, num_one, num_two):
        return num_one + num_two

    adls2_fake_resource = FakeADLS2Resource(storage_account)

    @pipeline(mode_defs=[
        ModeDefinition(
            intermediate_storage_defs=
            adls2_plus_default_intermediate_storage_defs,
            resource_defs={
                "adls2":
                ResourceDefinition.hardcoded_resource(adls2_fake_resource)
            },
        )
    ])
    def adls2_internal_pipeline():
        return add_numbers()

    result = execute_pipeline(
        adls2_internal_pipeline,
        run_config={
            "solids": {
                "add_numbers": {
                    "inputs": {
                        "num_one": {
                            "value": 2
                        },
                        "num_two": {
                            "value": 4
                        }
                    }
                }
            },
            "intermediate_storage": {
                "adls2": {
                    "config": {
                        "adls2_file_system": file_system
                    }
                }
            },
        },
    )

    assert result.success
    assert result.result_for_solid("add_numbers").output_value() == 6

    assert file_system in adls2_fake_resource.adls2_client.file_systems

    keys = set()
    for step_key, output_name in [("add_numbers", "result")]:
        keys.add(create_adls2_key(result.run_id, step_key, output_name))

    assert set(adls2_fake_resource.adls2_client.file_systems[file_system].keys(
    )) == keys
Code Example #23
    for chunk in response.iter_content(chunk_size=chunk_size):
        if chunk:
            output_fp.write(chunk)


def _download_zipfile_from_url(url: str, target: str, chunk_size=8192) -> str:
    with requests.get(url, stream=True) as response, open(target,
                                                          'wb+') as output_fp:
        response.raise_for_status()
        _write_chunks_to_fp(response, output_fp, chunk_size)
    return target


@solid(
    input_defs=[
        InputDefinition('file_name', str),
        InputDefinition('base_url', str)
    ],
    output_defs=[OutputDefinition(str)],
    config={'chunk_size': Field(int, is_required=False, default_value=8192)},
    required_resource_keys={'volume'},
)
def download_zipfile_from_url(context, file_name: str, base_url: str) -> str:
    # mount dirs onto volume
    target = os.path.join(context.resources.volume, file_name)
    if not os.path.exists(target):
        _download_zipfile_from_url(
            "/".join([base_url, file_name]),
            target,
            context.solid_config['chunk_size'],
        )
Code Example #24
@lambda_solid
def return_one():
    return 1


@lambda_solid
def return_two():
    return 2


@lambda_solid
def return_three():
    return 3


@lambda_solid(input_defs=[InputDefinition("num")])
def add_one(num):
    return num + 1


def test_basic_use_case():
    pipeline_def = PipelineDefinition(
        name="basic",
        solid_defs=[return_one, add_one],
        dependencies={"add_one": {
            "num": DependencyDefinition("return_one")
        }},
    )

    assert execute_pipeline(pipeline_def).result_for_solid(
        "add_one").output_value() == 2
Code Example #25
File: repo.py Project: sd2k/dagster
from dagster import InputDefinition, List, OutputDefinition, pipeline, repository, solid


@solid(output_defs=[OutputDefinition(int)])
def return_one(_):
    return 1


@solid(input_defs=[InputDefinition("nums", List[int])],
       output_defs=[OutputDefinition(int)])
def sum_fan_in(_, nums):
    return sum(nums)


@pipeline
def fan_in_pipeline():
    fan_outs = []
    for i in range(0, 10):
        fan_outs.append(return_one.alias("return_one_{}".format(i))())
    sum_fan_in(fan_outs)


@repository
def fan_in_pipeline_repository():
    return [fan_in_pipeline]
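
A minimal usage sketch (not part of the original snippet): the ten aliased return_one outputs fan in as a single List[int] value, so sum_fan_in should return 10.

from dagster import execute_pipeline

result = execute_pipeline(fan_in_pipeline)
assert result.success
assert result.result_for_solid("sum_fan_in").output_value() == 10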
Code Example #26
)
from dagster.utils import file_relative_path


@solid
def pandas_yielder(_):
    return read_csv("./basic.csv")


@solid(required_resource_keys={"pyspark"})
def pyspark_yielder(context):
    return (context.resources.pyspark.spark_session.read.format("csv").options(
        header="true", inferSchema="true").load("./basic.csv"))


@solid(input_defs=[InputDefinition(name="res")])
def reyielder(_context, res):
    yield Output((res["statistics"], res["results"]))


@pipeline(
    mode_defs=[
        ModeDefinition("basic",
                       resource_defs={"ge_data_context": ge_data_context})
    ], )
def hello_world_pandas_pipeline():
    return reyielder(
        ge_validation_solid_factory("getest",
                                    "basic.warning")(pandas_yielder()))

Code Example #27
def create_solid_with_deps(name, *solid_deps):
    inputs = [InputDefinition(solid_dep.name) for solid_dep in solid_deps]

    return SolidDefinition(
        name=name, inputs=inputs, compute_fn=_compute_fn, outputs=[OutputDefinition()]
    )
Code Example #28
                working_directory=None,
                location_name=main_repo_location_name(),
            ),
    ) as workspace_process_context:
        yield workspace_process_context.create_request_context()


@contextmanager
def get_main_external_repo(instance):
    with get_main_workspace(instance) as workspace:
        location = workspace.get_repository_location(main_repo_location_name())
        yield location.get_repository(main_repo_name())


@lambda_solid(
    input_defs=[InputDefinition("num", PoorMansDataFrame)],
    output_def=OutputDefinition(PoorMansDataFrame),
)
def sum_solid(num):
    sum_df = deepcopy(num)
    for x in sum_df:
        x["sum"] = int(x["num1"]) + int(x["num2"])
    return sum_df


@lambda_solid(
    input_defs=[InputDefinition("sum_df", PoorMansDataFrame)],
    output_def=OutputDefinition(PoorMansDataFrame),
)
def sum_sq_solid(sum_df):
    sum_sq_df = deepcopy(sum_df)
Code Example #29
@solid
def return_one(context):
    return 1


# start_composite_solid_example_marker
@solid
def add_one(_, number: int):
    return number + 1


@solid
def multiply_by_three(_, number: int):
    return number * 3


@composite_solid(input_defs=[InputDefinition("number", int)])
def add_one_times_three_solid(number):
    return multiply_by_three(add_one(number))


# end_composite_solid_example_marker
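
A minimal usage sketch (not part of the original snippet; pipeline and execute_pipeline are assumed to be imported from dagster, and the pipeline name is hypothetical): the composite's "number" input has no upstream solid here, so it is supplied through run config, yielding (2 + 1) * 3 = 9.

@pipeline
def composite_example_pipeline():
    add_one_times_three_solid()


result = execute_pipeline(
    composite_example_pipeline,
    run_config={
        "solids": {
            "add_one_times_three_solid": {
                "inputs": {"number": {"value": 2}}
            }
        }
    },
)
assert result.success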

# start_composite_solid_config_marker


@solid(config_schema={"n": int})
def add_n(context, number: int):
    return number + context.solid_config["n"]


@solid(config_schema={"m": int})
Code Example #30
File: solids.py Project: syrusakbary/dagster
            )
        ],
        compute_fn=compute_fn,
        description=description,
        metadata={
            'kind': 'sql',
            'sql': sql_statement
        },
    )


@solid(
    name='unzip_file',
    inputs=[
        InputDefinition('archive_file',
                        Bytes,
                        description='The archive to unzip'),
        InputDefinition('archive_member',
                        String,
                        description='The archive member to extract.'),
    ],
    description='Extracts an archive member from a zip archive.',
    outputs=[
        OutputDefinition(Bytes, description='The unzipped archive member.')
    ],
)
def unzip_file(_context, archive_file, archive_member):

    with zipfile.ZipFile(archive_file) as zip_ref:
        return BytesIO(zip_ref.open(archive_member).read())