def test_basic_pipeline_with_pandas_dataframe_dagster_type():
    """Check that ``event_metadata_fn`` entries reach the STEP_OUTPUT events.

    A ``BasicDF`` dagster type is created with a metadata function that
    returns a single ``max_pid`` text entry.  A one-solid pipeline yields a
    dataframe as an output of that type, and every STEP_OUTPUT event's type
    check data must carry exactly that one entry.
    """

    def compute_event_metadata(dataframe):
        # One text entry holding the largest value found in the 'pid' column.
        return [
            EventMetadataEntry.text(str(max(dataframe['pid'])), 'max_pid', 'maximum pid'),
        ]

    BasicDF = create_dagster_pandas_dataframe_type(
        name='BasicDF',
        columns=[
            PandasColumn.integer_column('pid', non_nullable=True),
            PandasColumn.string_column('names'),
        ],
        event_metadata_fn=compute_event_metadata,
    )

    @solid(output_defs=[OutputDefinition(name='basic_dataframe', dagster_type=BasicDF)])
    def create_dataframe(_):
        yield Output(
            DataFrame({'pid': [1, 2, 3], 'names': ['foo', 'bar', 'baz']}),
            output_name='basic_dataframe',
        )

    @pipeline
    def basic_pipeline():
        return create_dataframe()

    result = execute_pipeline(basic_pipeline)
    assert result.success

    step_output_events = [
        event for event in result.event_list if event.event_type_value == 'STEP_OUTPUT'
    ]
    # Guard against a vacuous pass: with no STEP_OUTPUT events the metadata
    # assertions below would never execute.
    assert step_output_events, 'expected at least one STEP_OUTPUT event'

    for event in step_output_events:
        metadata_entries = event.event_specific_data.type_check_data.metadata_entries
        assert len(metadata_entries) == 1
        assert any(entry.label == 'max_pid' for entry in metadata_entries)
def test_basic_pipeline_with_pandas_dataframe_dagster_type():
    """Check that ``event_metadata_fn`` output reaches the STEP_OUTPUT events.

    A ``BasicDF`` dagster type is created with a metadata function that
    returns a one-key dict (``max_pid``).  A single-op graph yields a
    dataframe as an output of that type and is executed in process; every
    STEP_OUTPUT event's type check data must carry exactly one metadata
    entry, labelled ``max_pid``.
    """

    def compute_event_metadata(dataframe):
        # Largest value of the "pid" column, stringified.
        return {"max_pid": str(max(dataframe["pid"]))}

    BasicDF = create_dagster_pandas_dataframe_type(
        name="BasicDF",
        columns=[
            PandasColumn.integer_column("pid", non_nullable=True),
            PandasColumn.string_column("names"),
        ],
        event_metadata_fn=compute_event_metadata,
    )

    @op(out={"basic_dataframe": Out(dagster_type=BasicDF)})
    def create_dataframe(_):
        yield Output(
            DataFrame({"pid": [1, 2, 3], "names": ["foo", "bar", "baz"]}),
            output_name="basic_dataframe",
        )

    @graph
    def basic_graph():
        return create_dataframe()

    result = basic_graph.execute_in_process()
    assert result.success

    step_output_events = [
        event for event in result.all_node_events if event.event_type_value == "STEP_OUTPUT"
    ]
    # Guard against a vacuous pass: with no STEP_OUTPUT events the metadata
    # assertions below would never execute.
    assert step_output_events, "expected at least one STEP_OUTPUT event"

    for event in step_output_events:
        metadata_entries = event.event_specific_data.type_check_data.metadata_entries
        assert len(metadata_entries) == 1
        assert any(entry.label == "max_pid" for entry in metadata_entries)
def test_shape_validation_throw_error():
    """Expect a ConstraintViolationException for a one-row frame under RowCountConstraint(2)."""
    single_row_frame = DataFrame({"foo": [2], "bar": ["hello"]})
    column_specs = [
        PandasColumn.integer_column("foo", min_value=0),
        PandasColumn.string_column("bar"),
    ]
    shape_rules = [RowCountConstraint(2)]

    # Only the validation call itself sits inside the raises-context.
    with pytest.raises(ConstraintViolationException):
        validate_constraints(
            single_row_frame,
            pandas_columns=column_specs,
            dataframe_constraints=shape_rules,
        )
def test_shape_validation_ok():
    """validate_constraints returns None for a one-row frame under RowCountConstraint(1)."""
    frame = DataFrame({'foo': [2], 'bar': ['hello']})
    column_specs = [
        PandasColumn.integer_column('foo', min_value=0),
        PandasColumn.string_column('bar'),
    ]

    outcome = validate_constraints(
        frame,
        pandas_columns=column_specs,
        dataframe_constraints=[RowCountConstraint(1)],
    )

    assert outcome is None
def test_create_dagster_pandas_dataframe_type_with_null_event_metadata_fn():
    """A dataframe type built with ``event_metadata_fn=None`` still type-checks a frame."""
    column_specs = [
        PandasColumn.integer_column('pid', non_nullable=True),
        PandasColumn.string_column('names'),
    ]
    frame_type = create_dagster_pandas_dataframe_type(
        name='BasicDF',
        columns=column_specs,
        event_metadata_fn=None,
    )
    assert isinstance(frame_type, DagsterType)

    # A one-row frame with both declared columns should pass the check.
    sample_frame = DataFrame({'pid': [1], 'names': ['foo']})
    check_outcome = check_dagster_type(frame_type, sample_frame)
    assert check_outcome.success
def test_shape_validation_ok():
    """A one-row frame validated with RowCountConstraint(1) yields None."""
    one_row = DataFrame({"foo": [2], "bar": ["hello"]})
    columns = [
        PandasColumn.integer_column("foo", min_value=0),
        PandasColumn.string_column("bar"),
    ]
    row_rules = [RowCountConstraint(1)]

    validation_result = validate_constraints(
        one_row,
        pandas_columns=columns,
        dataframe_constraints=row_rules,
    )

    assert validation_result is None
def test_create_dagster_pandas_dataframe_type_with_null_event_metadata_fn():
    """Passing ``event_metadata_fn=None`` yields a usable DagsterType."""
    pid_column = PandasColumn.integer_column("pid", non_nullable=True)
    names_column = PandasColumn.string_column("names")

    dataframe_type = create_dagster_pandas_dataframe_type(
        name="BasicDF",
        columns=[pid_column, names_column],
        event_metadata_fn=None,
    )
    assert isinstance(dataframe_type, DagsterType)

    # Type-check a minimal frame that has both declared columns.
    type_check = check_dagster_type(
        dataframe_type,
        DataFrame({"pid": [1], "names": ["foo"]}),
    )
    assert type_check.success
def test_shape_validation_throw_error():
    """Validating a one-row frame with RowCountConstraint(2) must raise ConstraintViolationException."""
    frame = DataFrame({'foo': [2], 'bar': ['hello']})
    foo_column = PandasColumn.integer_column('foo', min_value=0)
    bar_column = PandasColumn.string_column('bar')
    row_count_rule = RowCountConstraint(2)

    # Keep the raises-context limited to the call under test.
    with pytest.raises(ConstraintViolationException):
        validate_constraints(
            frame,
            pandas_columns=[foo_column, bar_column],
            dataframe_constraints=[row_count_rule],
        )
def test_basic_pipeline_with_pandas_dataframe_dagster_type():
    """Check that ``event_metadata_fn`` entries reach the STEP_OUTPUT events.

    The ``BasicDF`` dagster type is created with a metadata function that
    returns one ``max_pid`` text entry.  After executing a one-solid pipeline
    that yields a dataframe as an output of that type, each STEP_OUTPUT
    event's type check data must contain exactly that single entry.
    """

    def compute_event_metadata(dataframe):
        # One text entry holding the largest value found in the "pid" column.
        return [
            EventMetadataEntry.text(str(max(dataframe["pid"])), "max_pid", "maximum pid"),
        ]

    BasicDF = create_dagster_pandas_dataframe_type(
        name="BasicDF",
        columns=[
            PandasColumn.integer_column("pid", non_nullable=True),
            PandasColumn.string_column("names"),
        ],
        event_metadata_fn=compute_event_metadata,
    )

    @solid(output_defs=[OutputDefinition(name="basic_dataframe", dagster_type=BasicDF)])
    def create_dataframe(_):
        yield Output(
            DataFrame({"pid": [1, 2, 3], "names": ["foo", "bar", "baz"]}),
            output_name="basic_dataframe",
        )

    @pipeline
    def basic_pipeline():
        return create_dataframe()

    result = execute_pipeline(basic_pipeline)
    assert result.success

    step_output_events = [
        event for event in result.event_list if event.event_type_value == "STEP_OUTPUT"
    ]
    # Guard against a vacuous pass: with no STEP_OUTPUT events the metadata
    # assertions below would never execute.
    assert step_output_events, "expected at least one STEP_OUTPUT event"

    for event in step_output_events:
        metadata_entries = event.event_specific_data.type_check_data.metadata_entries
        assert len(metadata_entries) == 1
        assert any(entry.label == "max_pid" for entry in metadata_entries)
EventMetadataEntry.text( str(min(dataframe["start_time"])), "min_start_time", "Date data collection started", ), EventMetadataEntry.text(str(max(dataframe["end_time"])), "max_end_time", "Timestamp of last trip"), EventMetadataEntry.text(str(len(dataframe)), "n_rows", "Number of rows seen in the dataframe"), EventMetadataEntry.text(str(dataframe.columns), "columns", "Keys of columns seen in the dataframe"), ] TripDataFrameSchema = [ PandasColumn.integer_column("bike_id", min_value=0), PandasColumn.datetime_column( "start_time", min_datetime=Timestamp(year=2017, month=1, day=1), ), PandasColumn.datetime_column( "end_time", min_datetime=Timestamp(year=2017, month=1, day=1), ), PandasColumn.string_column("interval_date"), ] RawTripDataFrame = create_dagster_pandas_dataframe_type( name="RawTripDataFrame", columns=[ PandasColumn(column.name) for column in TripDataFrameSchema
EventMetadataEntry.text( str(min(dataframe['start_time'])), 'min_start_time', 'Date data collection started', ), EventMetadataEntry.text(str(max(dataframe['end_time'])), 'max_end_time', 'Timestamp of last trip'), EventMetadataEntry.text(str(len(dataframe)), 'n_rows', 'Number of rows seen in the dataframe'), EventMetadataEntry.text(str(dataframe.columns), 'columns', 'Keys of columns seen in the dataframe'), ] TripDataFrameSchema = [ PandasColumn.integer_column('bike_id', min_value=0), PandasColumn.datetime_column( 'start_time', min_datetime=Timestamp(year=2018, month=1, day=1), ), PandasColumn.datetime_column( 'end_time', min_datetime=Timestamp(year=2018, month=1, day=1), ), PandasColumn.string_column('interval_date'), ] RawTripDataFrame = create_dagster_pandas_dataframe_type( name='RawTripDataFrame', columns=[ PandasColumn(column.name) for column in TripDataFrameSchema