Example #1
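Note: this example uses the legacy Dagster API (@solid, @pipeline, OutputDefinition, EventMetadataEntry, execute_pipeline); Example #2 is the same test written against the current @op/@graph API, and several later examples likewise appear in legacy/current pairs.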
def test_basic_pipeline_with_pandas_dataframe_dagster_type():
    def compute_event_metadata(dataframe):
        return [
            EventMetadataEntry.text(str(max(dataframe['pid'])), 'max_pid', 'maximum pid'),
        ]

    BasicDF = create_dagster_pandas_dataframe_type(
        name='BasicDF',
        columns=[
            PandasColumn.integer_column('pid', non_nullable=True),
            PandasColumn.string_column('names'),
        ],
        event_metadata_fn=compute_event_metadata,
    )

    @solid(output_defs=[OutputDefinition(name='basic_dataframe', dagster_type=BasicDF)])
    def create_dataframe(_):
        yield Output(
            DataFrame({'pid': [1, 2, 3], 'names': ['foo', 'bar', 'baz']}),
            output_name='basic_dataframe',
        )

    @pipeline
    def basic_pipeline():
        return create_dataframe()

    result = execute_pipeline(basic_pipeline)
    assert result.success
    for event in result.event_list:
        if event.event_type_value == 'STEP_OUTPUT':
            mock_df_output_event_metadata = (
                event.event_specific_data.type_check_data.metadata_entries
            )
            assert len(mock_df_output_event_metadata) == 1
            assert any([entry.label == 'max_pid' for entry in mock_df_output_event_metadata])
Example #2
def test_basic_pipeline_with_pandas_dataframe_dagster_type():
    def compute_event_metadata(dataframe):
        return {"max_pid": str(max(dataframe["pid"]))}

    BasicDF = create_dagster_pandas_dataframe_type(
        name="BasicDF",
        columns=[
            PandasColumn.integer_column("pid", non_nullable=True),
            PandasColumn.string_column("names"),
        ],
        event_metadata_fn=compute_event_metadata,
    )

    @op(out={"basic_dataframe": Out(dagster_type=BasicDF)})
    def create_dataframe(_):
        yield Output(
            DataFrame({"pid": [1, 2, 3], "names": ["foo", "bar", "baz"]}),
            output_name="basic_dataframe",
        )

    @graph
    def basic_graph():
        return create_dataframe()

    result = basic_graph.execute_in_process()
    assert result.success
    for event in result.all_node_events:
        if event.event_type_value == "STEP_OUTPUT":
            mock_df_output_event_metadata = (
                event.event_specific_data.type_check_data.metadata_entries
            )
            assert len(mock_df_output_event_metadata) == 1
            assert any([entry.label == "max_pid" for entry in mock_df_output_event_metadata])
Example #3
def test_datetime_column_with_min_max_constraints_ok():
    assert (
        validate_constraints(
            DataFrame(
                {
                    "datetime": [Timestamp("2021-03-14T12:34:56")],
                    "datetime_utc_min_max_no_tz": [Timestamp("2021-03-14T12:34:56Z")],
                    "datetime_utc_min_max_same_tz": [Timestamp("2021-03-14T12:34:56Z")],
                    "datetime_utc_min_max_from_different_tz": [Timestamp("2021-03-14T12:34:56Z")],
                }
            ),
            pandas_columns=[
                PandasColumn.datetime_column(
                    "datetime_utc_min_max_no_tz",
                    tz="UTC",
                    min_datetime=Timestamp.min,
                    max_datetime=Timestamp.max,
                ),
                PandasColumn.datetime_column(
                    "datetime_utc_min_max_same_tz",
                    tz="UTC",
                    min_datetime=Timestamp("2021-01-01T00:00:00Z"),
                    max_datetime=Timestamp("2021-12-01T00:00:00Z"),
                ),
                PandasColumn.datetime_column(
                    "datetime_utc_min_max_from_different_tz",
                    tz="UTC",
                    min_datetime=Timestamp("2021-01-01T00:00:00Z", tz="US/Eastern"),
                    max_datetime=Timestamp("2021-12-01T00:00:00Z"),
                ),
            ],
        )
        is None
    )
Example #4
def test_shape_validation_throw_error():
    with pytest.raises(ConstraintViolationException):
        validate_constraints(
            DataFrame({"foo": [2], "bar": ["hello"]}),
            pandas_columns=[
                PandasColumn.integer_column("foo", min_value=0),
                PandasColumn.string_column("bar"),
            ],
            dataframe_constraints=[RowCountConstraint(2)],
        )
Example #5
def test_create_dagster_pandas_dataframe_type_with_null_event_metadata_fn():
    BasicDF = create_dagster_pandas_dataframe_type(
        name="BasicDF",
        columns=[
            PandasColumn.integer_column("pid", non_nullable=True),
            PandasColumn.string_column("names"),
        ],
        event_metadata_fn=None,
    )
    assert isinstance(BasicDF, DagsterType)
    basic_type_check = check_dagster_type(BasicDF, DataFrame({"pid": [1], "names": ["foo"]}))
    assert basic_type_check.success
Example #6
def test_shape_validation_ok():
    assert (validate_constraints(
        DataFrame({
            'foo': [2],
            'bar': ['hello']
        }),
        pandas_columns=[
            PandasColumn.integer_column('foo', min_value=0),
            PandasColumn.string_column('bar'),
        ],
        dataframe_constraints=[RowCountConstraint(1)],
    ) is None)
Example #7
def test_shape_validation_ok():
    assert (
        validate_constraints(
            DataFrame({"foo": [2], "bar": ["hello"]}),
            pandas_columns=[
                PandasColumn.integer_column("foo", min_value=0),
                PandasColumn.string_column("bar"),
            ],
            dataframe_constraints=[RowCountConstraint(1)],
        )
        is None
    )
Example #8
def test_create_dagster_pandas_dataframe_type_with_null_event_metadata_fn():
    BasicDF = create_dagster_pandas_dataframe_type(
        name='BasicDF',
        columns=[
            PandasColumn.integer_column('pid', non_nullable=True),
            PandasColumn.string_column('names'),
        ],
        event_metadata_fn=None,
    )
    assert isinstance(BasicDF, DagsterType)
    basic_type_check = check_dagster_type(BasicDF, DataFrame({'pid': [1], 'names': ['foo']}))
    assert basic_type_check.success
Example #9
def test_shape_validation_throw_error():
    with pytest.raises(ConstraintViolationException):
        validate_constraints(
            DataFrame({
                'foo': [2],
                'bar': ['hello']
            }),
            pandas_columns=[
                PandasColumn.integer_column('foo', min_value=0),
                PandasColumn.string_column('bar'),
            ],
            dataframe_constraints=[RowCountConstraint(2)],
        )
Example #10
def test_validate_constraints_ok():
    column_constraints = [
        PandasColumn(name='foo', constraints=[ColumnTypeConstraint('object')]),
    ]
    dataframe = DataFrame({'foo': ['bar', 'baz']})
    assert validate_constraints(dataframe,
                                pandas_columns=column_constraints) is None
Example #11
def test_custom_dagster_dataframe_hydration_ok():
    input_dataframe = DataFrame({'foo': [1, 2, 3]})
    with safe_tempfile_path() as input_csv_fp, safe_tempfile_path() as output_csv_fp:
        input_dataframe.to_csv(input_csv_fp)
        TestDataFrame = create_dagster_pandas_dataframe_type(
            name='TestDataFrame', columns=[PandasColumn.exists('foo'),]
        )

        @solid(
            input_defs=[InputDefinition('test_df', TestDataFrame)],
            output_defs=[OutputDefinition(TestDataFrame)],
        )
        def use_test_dataframe(_, test_df):
            test_df['bar'] = [2, 4, 6]
            return test_df

        solid_result = execute_solid(
            use_test_dataframe,
            run_config={
                'solids': {
                    'use_test_dataframe': {
                        'inputs': {'test_df': {'csv': {'path': input_csv_fp}}},
                        'outputs': [{'result': {'csv': {'path': output_csv_fp}}},],
                    }
                }
            },
        )

        assert solid_result.success
        solid_output_df = read_csv(output_csv_fp)
        assert all(solid_output_df['bar'] == [2, 4, 6])
Example #12
def test_missing_column_validation_with_optional_column():
    column_constraints = [
        PandasColumn(
            name="qux", constraints=[ColumnDTypeInSetConstraint({"object"})], is_required=False
        ),
    ]
    dataframe = DataFrame({"foo": ["bar", "baz"]})
    assert validate_constraints(dataframe, pandas_columns=column_constraints) is None
Example #13
def test_create_pandas_dataframe_dagster_type():
    TestDataFrame = create_dagster_pandas_dataframe_type(
        name='TestDataFrame',
        columns=[
            PandasColumn(name='foo',
                         constraints=[ColumnTypeConstraint('int64')])
        ],
    )
    assert isinstance(TestDataFrame, RuntimeType)
Example #14
def test_missing_column_validation_with_optional_column():
    column_constraints = [
        PandasColumn(name='qux',
                     constraints=[ColumnDTypeInSetConstraint({'object'})],
                     is_required=False),
    ]
    dataframe = DataFrame({'foo': ['bar', 'baz']})
    assert validate_constraints(dataframe,
                                pandas_columns=column_constraints) is None
Example #15
def test_dataframe_description_generation_just_type_constraint():
    TestDataFrame = create_dagster_pandas_dataframe_type(
        name="TestDataFrame",
        columns=[
            PandasColumn(name="foo",
                         constraints=[ColumnDTypeInSetConstraint({"int64"})])
        ],
    )
    assert TestDataFrame.description == "\n### Columns\n**foo**: `int64`\n\n"
Example #16
def test_missing_column_validation():
    column_constraints = [
        PandasColumn(name="qux", constraints=[ColumnDTypeInSetConstraint({"object"})]),
    ]
    dataframe = DataFrame({"foo": ["bar", "baz"]})
    with pytest.raises(
        ConstraintViolationException, match="Required column qux not in dataframe with columns"
    ):
        validate_constraints(dataframe, pandas_columns=column_constraints)
Example #17
def test_create_pandas_dataframe_dagster_type():
    TestDataFrame = create_dagster_pandas_dataframe_type(
        name="TestDataFrame",
        columns=[
            PandasColumn(name="foo",
                         constraints=[ColumnDTypeInSetConstraint({"int64"})])
        ],
    )
    assert isinstance(TestDataFrame, DagsterType)
Example #18
def test_dataframe_description_generation_just_type_constraint():
    TestDataFrame = create_dagster_pandas_dataframe_type(
        name='TestDataFrame',
        columns=[
            PandasColumn(name='foo',
                         constraints=[ColumnTypeConstraint('int64')])
        ],
    )
    assert TestDataFrame.description == "\n### Columns\n**foo**: `int64`\n\n"
Example #19
def test_basic_pipeline_with_pandas_dataframe_dagster_type():
    def compute_event_metadata(dataframe):
        return [
            EventMetadataEntry.text(str(max(dataframe["pid"])), "max_pid",
                                    "maximum pid"),
        ]

    BasicDF = create_dagster_pandas_dataframe_type(
        name="BasicDF",
        columns=[
            PandasColumn.integer_column("pid", non_nullable=True),
            PandasColumn.string_column("names"),
        ],
        event_metadata_fn=compute_event_metadata,
    )

    @solid(output_defs=[
        OutputDefinition(name="basic_dataframe", dagster_type=BasicDF)
    ])
    def create_dataframe(_):
        yield Output(
            DataFrame({
                "pid": [1, 2, 3],
                "names": ["foo", "bar", "baz"]
            }),
            output_name="basic_dataframe",
        )

    @pipeline
    def basic_pipeline():
        return create_dataframe()

    result = execute_pipeline(basic_pipeline)
    assert result.success
    for event in result.event_list:
        if event.event_type_value == "STEP_OUTPUT":
            mock_df_output_event_metadata = (
                event.event_specific_data.type_check_data.metadata_entries)
            assert len(mock_df_output_event_metadata) == 1
            assert any([
                entry.label == "max_pid"
                for entry in mock_df_output_event_metadata
            ])
Example #20
def test_missing_column_validation():
    column_constraints = [
        PandasColumn(name='qux',
                     constraints=[ColumnDTypeInSetConstraint({'object'})]),
    ]
    dataframe = DataFrame({'foo': ['bar', 'baz']})
    with pytest.raises(
            ConstraintViolationException,
            match="Required column qux not in dataframe with columns"):
        validate_constraints(dataframe, pandas_columns=column_constraints)
Example #21
def test_datetime_column_with_tz_validation_fails_when_incorrect_tz():
    with pytest.raises(ConstraintViolationException):
        validate_constraints(
            DataFrame(
                {
                    "datetime_utc": [Timestamp("2021-03-14T12:34:56")],
                }
            ),
            pandas_columns=[
                PandasColumn.datetime_column("datetime_utc", tz="UTC"),
            ],
        )
Example #22
def test_custom_dagster_dataframe_parametrizable_input():
    @input_selector_schema(
        Selector({'door_a': Field(str), 'door_b': Field(str), 'door_c': Field(str),})
    )
    def silly_hydrator(_, which_door, _field):
        if which_door == 'door_a':
            return DataFrame({'foo': ['goat']})
        elif which_door == 'door_b':
            return DataFrame({'foo': ['car']})
        elif which_door == 'door_c':
            return DataFrame({'foo': ['goat']})
        raise DagsterInvariantViolationError(
            'You did not pick a door. You chose: {which_door}'.format(which_door=which_door)
        )

    @output_selector_schema(Selector({'devnull': Field(str), 'nothing': Field(str)}))
    def silly_materializer(_, _location, _field, _value):
        return Materialization(label='did nothing', description='just one of those days')

    TestDataFrame = create_dagster_pandas_dataframe_type(
        name='TestDataFrame',
        columns=[PandasColumn.exists('foo'),],
        input_hydration_config=silly_hydrator,
        output_materialization_config=silly_materializer,
    )

    @solid(
        input_defs=[InputDefinition('df', TestDataFrame)],
        output_defs=[OutputDefinition(TestDataFrame)],
    )
    def did_i_win(_, df):
        return df

    solid_result = execute_solid(
        did_i_win,
        run_config={
            'solids': {
                'did_i_win': {
                    'inputs': {'df': {'door_a': 'bar'}},
                    'outputs': [{'result': {'devnull': 'baz'}}],
                }
            }
        },
    )
    assert solid_result.success
    output_df = solid_result.output_value()
    assert isinstance(output_df, DataFrame)
    assert output_df['foo'].tolist() == ['goat']
    materialization_events = solid_result.materialization_events_during_compute
    assert len(materialization_events) == 1
    assert materialization_events[0].event_specific_data.materialization.label == 'did nothing'
Example #23
def test_datetime_column_with_tz_validation_ok():
    assert (
        validate_constraints(
            DataFrame(
                {
                    "datetime": [Timestamp("2021-03-14T12:34:56")],
                    "datetime_utc": [Timestamp("2021-03-14T12:34:56Z")],
                    "datetime_dublin": [Timestamp("2021-03-14T12:34:56", tz="Europe/Dublin")],
                    "datetime_est": [Timestamp("2021-03-14T12:34:56", tz="US/Eastern")],
                    "datetime_chatham": [Timestamp("2021-03-14T12:34:56", tz="Pacific/Chatham")],
                    "datetime_utc_with_min_max": [Timestamp("2021-03-14T12:34:56Z")],
                }
            ),
            pandas_columns=[
                PandasColumn.datetime_column("datetime"),
                PandasColumn.datetime_column("datetime_utc", tz="UTC"),
                PandasColumn.datetime_column("datetime_dublin", tz="Europe/Dublin"),
                PandasColumn.datetime_column("datetime_est", tz="US/Eastern"),
                PandasColumn.datetime_column("datetime_chatham", tz="Pacific/Chatham"),
            ],
        )
        is None
    )
Example #24
def test_custom_dagster_dataframe_loading_ok():
    input_dataframe = DataFrame({"foo": [1, 2, 3]})
    with safe_tempfile_path() as input_csv_fp, safe_tempfile_path(
    ) as output_csv_fp:
        input_dataframe.to_csv(input_csv_fp)
        TestDataFrame = create_dagster_pandas_dataframe_type(
            name="TestDataFrame",
            columns=[
                PandasColumn.exists("foo"),
            ],
        )

        @op(
            ins={"test_df": In(TestDataFrame)},
            out=Out(TestDataFrame),
        )
        def use_test_dataframe(_, test_df):
            test_df["bar"] = [2, 4, 6]
            return test_df

        @graph
        def basic_graph():
            use_test_dataframe()

        result = basic_graph.execute_in_process(
            run_config={
                "ops": {
                    "use_test_dataframe": {
                        "inputs": {
                            "test_df": {
                                "csv": {
                                    "path": input_csv_fp
                                }
                            }
                        },
                        "outputs": [
                            {
                                "result": {
                                    "csv": {
                                        "path": output_csv_fp
                                    }
                                }
                            },
                        ],
                    }
                }
            })
        assert result.success
        output_df = read_csv(output_csv_fp)
        assert all(output_df["bar"] == [2, 4, 6])
Example #25
def test_custom_dagster_dataframe_loading_ok():
    input_dataframe = DataFrame({"foo": [1, 2, 3]})
    with safe_tempfile_path() as input_csv_fp, safe_tempfile_path(
    ) as output_csv_fp:
        input_dataframe.to_csv(input_csv_fp)
        TestDataFrame = create_dagster_pandas_dataframe_type(
            name="TestDataFrame",
            columns=[
                PandasColumn.exists("foo"),
            ],
        )

        @solid(
            input_defs=[InputDefinition("test_df", TestDataFrame)],
            output_defs=[OutputDefinition(TestDataFrame)],
        )
        def use_test_dataframe(_, test_df):
            test_df["bar"] = [2, 4, 6]
            return test_df

        solid_result = execute_solid(
            use_test_dataframe,
            run_config={
                "solids": {
                    "use_test_dataframe": {
                        "inputs": {
                            "test_df": {
                                "csv": {
                                    "path": input_csv_fp
                                }
                            }
                        },
                        "outputs": [
                            {
                                "result": {
                                    "csv": {
                                        "path": output_csv_fp
                                    }
                                }
                            },
                        ],
                    }
                }
            },
        )

        assert solid_result.success
        solid_output_df = read_csv(output_csv_fp)
        assert all(solid_output_df["bar"] == [2, 4, 6])
Example #26
def test_dataframe_description_generation_multi_constraints():
    TestDataFrame = create_dagster_pandas_dataframe_type(
        name='TestDataFrame',
        columns=[
            PandasColumn(
                name='foo',
                constraints=[
                    ColumnTypeConstraint('int64'),
                    InRangeColumnConstraint(0, 100),
                    NonNullableColumnConstraint(),
                ],
            ),
        ],
    )
    assert (
        TestDataFrame.description ==
        "\n### Columns\n**foo**: `int64`\n+ 0 < values < 100\n+ No Null values allowed.\n\n"
    )
Example #27
def test_dataframe_description_generation_multi_constraints():
    TestDataFrame = create_dagster_pandas_dataframe_type(
        name="TestDataFrame",
        columns=[
            PandasColumn(
                name="foo",
                constraints=[
                    ColumnDTypeInSetConstraint({"int64"}),
                    InRangeColumnConstraint(0, 100, ignore_missing_vals=False),
                    NonNullableColumnConstraint(),
                ],
            ),
        ],
    )
    assert (
        TestDataFrame.description ==
        "\n### Columns\n**foo**: `int64`\n+ 0 < values < 100\n+ No Null values allowed.\n\n"
    )
Example #28
        EventMetadataEntry.text(
            str(min(dataframe["start_time"])),
            "min_start_time",
            "Date data collection started",
        ),
        EventMetadataEntry.text(str(max(dataframe["end_time"])),
                                "max_end_time", "Timestamp of last trip"),
        EventMetadataEntry.text(str(len(dataframe)), "n_rows",
                                "Number of rows seen in the dataframe"),
        EventMetadataEntry.text(str(dataframe.columns), "columns",
                                "Keys of columns seen in the dataframe"),
    ]


TripDataFrameSchema = [
    PandasColumn.integer_column("bike_id", min_value=0),
    PandasColumn.datetime_column(
        "start_time",
        min_datetime=Timestamp(year=2017, month=1, day=1),
    ),
    PandasColumn.datetime_column(
        "end_time",
        min_datetime=Timestamp(year=2017, month=1, day=1),
    ),
    PandasColumn.string_column("interval_date"),
]

RawTripDataFrame = create_dagster_pandas_dataframe_type(
    name="RawTripDataFrame",
    columns=[
        PandasColumn(column.name) for column in TripDataFrameSchema
Example #29
        EventMetadataEntry.text(
            str(min(dataframe['start_time'])),
            'min_start_time',
            'Date data collection started',
        ),
        EventMetadataEntry.text(str(max(dataframe['end_time'])),
                                'max_end_time', 'Timestamp of last trip'),
        EventMetadataEntry.text(str(len(dataframe)), 'n_rows',
                                'Number of rows seen in the dataframe'),
        EventMetadataEntry.text(str(dataframe.columns), 'columns',
                                'Keys of columns seen in the dataframe'),
    ]


TripDataFrameSchema = [
    PandasColumn.integer_column('bike_id', min_value=0),
    PandasColumn.datetime_column(
        'start_time',
        min_datetime=Timestamp(year=2018, month=1, day=1),
    ),
    PandasColumn.datetime_column(
        'end_time',
        min_datetime=Timestamp(year=2018, month=1, day=1),
    ),
    PandasColumn.string_column('interval_date'),
]

RawTripDataFrame = create_dagster_pandas_dataframe_type(
    name='RawTripDataFrame',
    columns=[
        PandasColumn(column.name) for column in TripDataFrameSchema
Example #30
        PandasColumn(name='qux',
                     constraints=[ColumnDTypeInSetConstraint({'object'})],
                     is_required=False),
    ]
    dataframe = DataFrame({'foo': ['bar', 'baz']})
    assert validate_constraints(dataframe,
                                pandas_columns=column_constraints) is None


@pytest.mark.parametrize(
    'column_constraints, dataframe',
    [
        (
            [
                PandasColumn(
                    name='foo',
                    constraints=[ColumnDTypeInSetConstraint({'int64'})])
            ],
            DataFrame({'foo': ['bar', 'baz']}),
        ),
        (
            [
                PandasColumn(
                    name='foo',
                    constraints=[ColumnDTypeInSetConstraint({'object'})])
            ],
            DataFrame({'bar': ['bar', 'baz']}),
        ),
    ],
)
def test_validate_constraints_throw_error(column_constraints, dataframe):
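The excerpt ends before the test body. A plausible completion, mirroring the error-path pattern of Examples #4, #16, and #21 (a sketch, not necessarily the verbatim original):

    # Each parametrized case pairs column constraints with a dataframe that
    # violates them (wrong dtype, missing column), so validation should raise.
    with pytest.raises(ConstraintViolationException):
        validate_constraints(dataframe, pandas_columns=column_constraints)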