def type_check_method(_, value):
    if not isinstance(value, dict):
        return TypeCheck(
            False,
            description='Value {value} should be of type {type_name}.'.format(
                value=value, type_name=name
            ),
        )
    for key in value:
        if key not in permitted_key_names:
            return TypeCheck(
                False,
                description=(
                    'Key {name} is not a permitted value, values can only be of: '
                    '{name_list}'
                ).format(name=key, name_list=permitted_key_names),
            )
    return TypeCheck(
        True,
        metadata_entries=[
            EventMetadataEntry.text(label='row_count', text=str(len(value))),
            EventMetadataEntry.text(label='series_names', text=', '.join(value.keys())),
        ],
    )

def df_type_check(_, value):
    if not isinstance(value, dd.DataFrame):
        return TypeCheck(success=False)
    return TypeCheck(
        success=True,
        metadata_entries=[
            # string cast columns since they may be things like datetime
            EventMetadataEntry.json({"columns": list(map(str, value.columns))}, "metadata"),
        ],
    )

def df_type_check(_, value):
    if not isinstance(value, pd.DataFrame):
        return TypeCheck(success=False)
    return TypeCheck(
        success=True,
        metadata_entries=[
            EventMetadataEntry.text(str(len(value)), 'row_count', 'Number of rows in DataFrame'),
            # string cast columns since they may be things like datetime
            EventMetadataEntry.json({'columns': list(map(str, value.columns))}, 'metadata'),
        ],
    )

def df_type_check(_, value): if not isinstance(value, pd.DataFrame): return TypeCheck(success=False) return TypeCheck( success=True, metadata_entries=[ MetadataEntry("row_count", value=str(len(value))), # string cast columns since they may be things like datetime MetadataEntry("metadata", value={"columns": list(map(str, value.columns))}), ], )
def df_type_check(_, value):
    if not isinstance(value, pd.DataFrame):
        return TypeCheck(success=False)
    return TypeCheck(
        success=True,
        metadata_entries=[
            MetadataEntry.text(str(len(value)), "row_count", "Number of rows in DataFrame"),
            # string cast columns since they may be things like datetime
            MetadataEntry.json({"columns": list(map(str, value.columns))}, "metadata"),
        ],
    )

def less_simple_data_frame_type_check(_, value):
    if not isinstance(value, list):
        return TypeCheck(
            success=False,
            description=(
                'LessSimpleDataFrame should be a list of dicts, got '
                '{type_}'
            ).format(type_=type(value)),
        )
    fields = [field for field in value[0].keys()]
    for i in range(len(value)):
        row = value[i]
        if not isinstance(row, dict):
            return TypeCheck(
                success=False,
                description=(
                    'LessSimpleDataFrame should be a list of dicts, '
                    'got {type_} for row {idx}'
                ).format(type_=type(row), idx=(i + 1)),
            )
        row_fields = [field for field in row.keys()]
        if fields != row_fields:
            return TypeCheck(
                success=False,
                description=(
                    'Rows in LessSimpleDataFrame should have the same fields, '
                    'got {actual} for row {idx}, expected {expected}'
                ).format(actual=row_fields, idx=(i + 1), expected=fields),
            )
    return TypeCheck(
        success=True,
        description='LessSimpleDataFrame summary statistics',
        metadata_entries=[
            EventMetadataEntry.text(
                str(len(value)),
                'n_rows',
                'Number of rows seen in the data frame',
            ),
            EventMetadataEntry.text(
                str(len(value[0].keys()) if len(value) > 0 else 0),
                'n_cols',
                'Number of columns seen in the data frame',
            ),
            EventMetadataEntry.text(
                str(list(value[0].keys()) if len(value) > 0 else []),
                'column_names',
                'Keys of columns seen in the data frame',
            ),
        ],
    )

def validate(self, data, *columns, **kwargs):
    if len(columns) == 0:
        columns = data.columns
    relevant_data = data[list(columns)]
    offending = {}
    offending_values = {}
    # TODO: grab metadata from here
    inverse_validation = lambda x: not self.validation_fn(x)[0]
    for column in columns:
        results = relevant_data[relevant_data[column].apply(inverse_validation)]
        if len(results.index.tolist()) > 0:
            offending[column] = ['row ' + str(i) for i in results.index.tolist()]
            offending_values[column] = results[column].tolist()
    if len(offending) == 0:
        if not self.raise_or_typecheck:
            return TypeCheck(success=True)
    else:
        metadict = {
            'expectation': self.validation_fn.__doc__,
            'actual': offending_values,
            'offending': offending,
        }
        exc = self.resulting_exception(
            constraint_name=self.name, constraint_description=self.description, **metadict
        )
        if self.raise_or_typecheck:
            raise exc
        else:
            return exc.return_as_typecheck()

def validate(self, data, *columns, **kwargs):
    if len(columns) == 0:
        columns = data.columns
    relevant_data = data[list(columns)]
    offending_columns = set()
    offending_values = {}
    for column in columns:
        # TODO: grab extra metadata
        if not self.validation_fn(relevant_data[column])[0]:
            offending_columns.add(column)
            offending_values[column] = relevant_data[column].to_numpy()
    if len(offending_columns) == 0 and not self.raise_or_typecheck:
        return TypeCheck(success=True)
    elif len(offending_columns) > 0:
        metadict = {
            'expectation': self.description.replace('Confirms', ''),
            'actual': offending_values,
            'offending': offending_columns,
        }
        exc = self.resulting_exception(
            constraint_name=self.name, constraint_description=self.description, **metadict
        )
        if self.raise_or_typecheck:
            raise exc
        else:
            return exc.return_as_typecheck()

def _pandera_errors_to_type_check(
    error: pa.errors.SchemaErrors, _table_schema: TableSchema
) -> TypeCheck:
    return TypeCheck(
        success=False,
        description=str(error),
    )

def validate(self, data, *columns, **kwargs):
    if len(columns) == 0:
        columns = data.columns
    columns = [column for column in columns if column in data.columns]
    relevant_data = data[list(columns)]
    offending_columns = set()
    offending_values = {}
    for column in columns:
        # TODO: grab extra metadata
        res = self.validation_fn(relevant_data[column])
        if not res[0]:
            offending_columns.add(column)
            if res[1].get("actual") is not None:
                offending_values[column] = [x.item() for x in res[1].get("actual").to_numpy()]
            else:
                offending_values[column] = [x.item() for x in relevant_data[column].to_numpy()]
    if len(offending_columns) == 0 and not self.raise_or_typecheck:
        return TypeCheck(success=True)
    elif len(offending_columns) > 0:
        metadict = {
            "expectation": self.description.replace("Confirms", ""),
            "actual": offending_values,
            "offending": offending_columns,
        }
        exc = self.resulting_exception(
            constraint_name=self.name, constraint_description=self.description, **metadict
        )
        if self.raise_or_typecheck:
            raise exc
        else:
            return exc.return_as_typecheck()

def test_raise_on_error_true_type_check_returns_unsuccessful_type_check():
    FalsyType = DagsterType(
        name="FalsyType",
        type_check_fn=lambda _, _val: TypeCheck(
            success=False, metadata_entries=[EventMetadataEntry.text("foo", "bar", "baz")]
        ),
    )

    @solid(output_defs=[OutputDefinition(FalsyType)])
    def foo_solid(_):
        return 1

    @pipeline
    def foo_pipeline():
        foo_solid()

    with pytest.raises(DagsterTypeCheckDidNotPass) as e:
        execute_pipeline(foo_pipeline)
    assert e.value.metadata_entries[0].label == "bar"
    assert e.value.metadata_entries[0].entry_data.text == "foo"
    assert e.value.metadata_entries[0].description == "baz"
    assert isinstance(e.value.dagster_type, DagsterType)

    pipeline_result = execute_pipeline(foo_pipeline, raise_on_error=False)
    assert not pipeline_result.success
    assert [event.event_type_value for event in pipeline_result.step_event_list] == [
        DagsterEventType.STEP_START.value,
        DagsterEventType.STEP_OUTPUT.value,
        DagsterEventType.STEP_FAILURE.value,
    ]
    for event in pipeline_result.step_event_list:
        if event.event_type_value == DagsterEventType.STEP_FAILURE.value:
            assert event.event_specific_data.error.cls_name == "DagsterTypeCheckDidNotPass"

def validate_snapshot_timeseries(training_set_data):
    if len(training_set_data) != 2:
        return TypeCheck(
            success=False,
            description=(
                'Invalid training set. The tuple must consist of a training set, '
                'output vector, and feature_names'
            ),
        )
    # tuple argument types
    X, y = training_set_data
    if not (isinstance(X, ndarray) and isinstance(y, ndarray)):
        return TypeCheck(
            success=False,
            description=(
                'Both input matrix and output vector must be numpy arrays. '
                'X: {} | y: {}'
            ).format(type(X), type(y)),
        )
    timeseries_length, snapshot_length, num_timeseries = X.shape
    output_vector_length = y.shape[0]
    if num_timeseries == 0 or output_vector_length == 0:
        return TypeCheck(
            success=False,
            description='No empty training sets allowed',
        )
    if timeseries_length != output_vector_length:
        return TypeCheck(
            success=False,
            description='Every timeseries must have as many snapshots as outputs',
        )
    return TypeCheck(
        success=True,
        metadata_entries=[
            EventMetadataEntry.text(
                str(num_timeseries), 'num_ts', 'Number of parallel timeseries.'
            ),
            EventMetadataEntry.text(
                str(timeseries_length), 'timeseries_length', 'Length of each timeseries.'
            ),
            EventMetadataEntry.text(
                str(snapshot_length),
                'snapshot_length',
                'Number of past observations for each input.',
            ),
        ],
    )

def _dagster_type_check(_, value): if not isinstance(value, pd.DataFrame): return TypeCheck( success=False, description= "Must be a pandas.DataFrame. Got value of type. {type_name}". format(type_name=type(value).__name__), ) individual_result_dict = {} if dataframe_validator is not None: individual_result_dict["dataframe"] = dataframe_validator.validate( value) if columns_validator is not None: individual_result_dict["columns"] = columns_validator.validate( value) if columns_aggregate_validator is not None: individual_result_dict[ "column-aggregates"] = columns_aggregate_validator.validate( value) typechecks_succeeded = True metadata = [] overall_description = "Failed Constraints: {}" constraint_clauses = [] for key, result in individual_result_dict.items(): result_val = result.success if result_val: continue typechecks_succeeded = typechecks_succeeded and result_val result_dict = result.metadata_entries[0].entry_data.data metadata.append( EventMetadataEntry.json( result_dict, "{}-constraint-metadata".format(key), )) constraint_clauses.append("{} failing constraints, {}".format( key, result.description)) # returns aggregates, then column, then dataframe return TypeCheck( success=typechecks_succeeded, description=overall_description.format(constraint_clauses), metadata_entries=sorted(metadata, key=lambda x: x.label), )
def file_exists_at_path_type_check(_, value):
    if not isinstance(value, six.string_types):
        return TypeCheck(
            success=False,
            description='FileExistsAtPath must be a string in memory. Got {value}'.format(
                value=repr(value)
            ),
        )
    if not safe_isfile(value):
        return TypeCheck(
            success=False,
            description=(
                'FileExistsAtPath must be a path that points to a file that '
                'exists. "{value}" does not exist on disk'
            ).format(value=value),
        )
    return True

def type_check_fn(_context, value: object) -> TypeCheck:
    if isinstance(value, VALID_DATAFRAME_CLASSES):
        try:
            # `lazy` instructs pandera to capture every (not just the first) validation error
            schema.validate(value, lazy=True)
        except pa.errors.SchemaErrors as e:
            return _pandera_errors_to_type_check(e, table_schema)
        except Exception as e:
            return TypeCheck(
                success=False,
                description=f"Unexpected error during validation: {e}",
            )
    else:
        return TypeCheck(
            success=False,
            description=f"Must be one of {VALID_DATAFRAME_CLASSES}, not {type(value).__name__}.",
        )
    return TypeCheck(success=True)

def type_check_fn(_context, value):
    if not isinstance(value, pd.DataFrame):
        return TypeCheck(
            success=False,
            description=f"Must be pandas.DataFrame, not {type(value).__name__}.",
        )
    try:
        # `lazy` instructs pandera to capture every (not just the first) validation error
        schema.validate(value, lazy=True)
    except pa.errors.SchemaErrors as e:
        return TypeCheck(
            success=False,
            description=str(e),
            metadata={
                "num_violations": len(e.failure_cases),
            },
        )
    return TypeCheck(success=True)

def _dagster_type_check(_, value):
    if not isinstance(value, pd.DataFrame):
        return TypeCheck(
            success=False,
            description='Must be a pandas.DataFrame. Got value of type {type_name}.'.format(
                type_name=type(value).__name__
            ),
        )
    individual_result_dict = {}
    if columns_validator is not None:
        individual_result_dict["columns"] = columns_validator.validate(value)
    if columns_aggregate_validator is not None:
        individual_result_dict["column aggregates"] = columns_aggregate_validator.validate(value)
    if dataframe_validator is not None:
        individual_result_dict["dataframe"] = dataframe_validator.validate(value)

    typechecks_succeeded = True
    metadata = []
    overall_description = ""
    for key, result in individual_result_dict.items():
        result_val = result.success
        if result_val:
            continue
        typechecks_succeeded = typechecks_succeeded and result_val
        result_dict = result.metadata_entries[0].entry_data.data
        metadata.append(
            EventMetadataEntry.json(
                result_dict,
                '{}-constraint-metadata'.format(key),
            )
        )
        overall_description += "{} failing constraints, requiring {}".format(
            key, result.description
        )
    return TypeCheck(
        success=typechecks_succeeded,
        description=overall_description,
        metadata_entries=metadata,
    )

def _dagster_type_check(value):
    if not isinstance(value, DataFrame):
        return TypeCheck(
            success=False,
            description='Must be a pandas.DataFrame. Got value of type {type_name}.'.format(
                type_name=type(value).__name__
            ),
        )
    if columns is not None:
        try:
            validate_collection_schema(columns, value)
        except ConstraintViolationException as e:
            return TypeCheck(success=False, description=str(e))
    return TypeCheck(
        success=True,
        metadata_entries=_execute_summary_stats(name, value, summary_statistics)
        if summary_statistics
        else None,
    )

def _dagster_type_check(_, value):
    if not isinstance(value, pd.DataFrame):
        return TypeCheck(
            success=False,
            description='Must be a pandas.DataFrame. Got value of type {type_name}.'.format(
                type_name=type(value).__name__
            ),
        )
    try:
        validate_constraints(
            value, pandas_columns=columns, dataframe_constraints=dataframe_constraints
        )
    except ConstraintViolationException as e:
        return TypeCheck(success=False, description=str(e))
    return TypeCheck(
        success=True,
        metadata_entries=_execute_summary_stats(name, value, event_metadata_fn)
        if event_metadata_fn
        else None,
    )

def less_simple_data_frame_type_check(_, value): if not isinstance(value, list): return TypeCheck( success=False, description= f"LessSimpleDataFrame should be a list of dicts, got {type(value)}", ) fields = [field for field in value[0].keys()] for i in range(len(value)): row = value[i] idx = i + 1 if not isinstance(row, dict): return TypeCheck( success=False, description= (f"LessSimpleDataFrame should be a list of dicts, got {type(row)} for row {idx}" ), ) row_fields = [field for field in row.keys()] if fields != row_fields: return TypeCheck( success=False, description= (f"Rows in LessSimpleDataFrame should have the same fields, got {row_fields} " f"for row {idx}, expected {fields}"), ) return TypeCheck( success=True, description="LessSimpleDataFrame summary statistics", metadata={ "n_rows": len(value), "n_cols": len(value[0].keys()) if len(value) > 0 else 0, "column_names": str(list(value[0].keys()) if len(value) > 0 else []), }, )
def positive_num_check(_, value):
    # return True if value > 0 else False
    if value <= 0:
        return TypeCheck(
            success=False,
            description=(
                "Numbers cannot be 0 or negative, got {value} for PositiveNumber type"
            ).format(value=value),
            metadata_entries=[EventMetadataEntry.int(value, "The input number")],
        )
    else:
        return True

def type_check_method(_, value): if not isinstance(value, dict): return TypeCheck( False, description="Value {value} should be of type {type_name}.".format( value=value, type_name=name ), ) for key in value: if not key in permitted_key_names: return TypeCheck( False, description=( "Key {name} is not a permitted value, values can only be of: " "{name_list}" ).format(name=value.name, name_list=permitted_key_names), ) return TypeCheck( True, metadata_entries=[ MetadataEntry("row_count", value=str(len(value))), MetadataEntry("series_names", value=", ".join(value.keys())), ], )
def validate(self, data, *args, **kwargs):
    res = self.validation_fn(data, *args, **kwargs)
    if not res[0]:
        exc = self.resulting_exception(
            constraint_name=self.name, constraint_description=self.description, **res[1]
        )
        if self.raise_or_typecheck:
            raise exc
        else:
            return exc.return_as_typecheck()
    else:
        if res[0]:
            return TypeCheck(success=True)

def type_check(self, value):
    if not isinstance(value, dict):
        raise Failure(
            'Value {value} should be of type {type_name}.'.format(
                value=value, type_name=self.name
            )
        )
    for key in value:
        if key not in permitted_key_names:
            raise Failure(
                'Key {name} is not a permitted value, values can only be of: {name_list}'.format(
                    name=key, name_list=permitted_key_names
                )
            )
    return TypeCheck(
        metadata_entries=[
            EventMetadataEntry.text(label='row_count', text=str(len(value))),
            EventMetadataEntry.text(label='series_names', text=', '.join(value.keys())),
        ]
    )

def check_dagster_type(dagster_type, value):
    '''Test a custom Dagster type.

    Args:
        dagster_type (Any): The Dagster type to test. Should be one of the
            :ref:`built-in types <builtin>`, a dagster type explicitly constructed with
            :py:func:`as_dagster_type`, :py:func:`@usable_as_dagster_type <dagster_type>`, or
            :py:func:`PythonObjectDagsterType`, or a Python type.
        value (Any): The runtime value to test.

    Returns:
        TypeCheck: The result of the type check.

    Examples:

        .. code-block:: python

            assert check_dagster_type(Dict[Any, Any], {'foo': 'bar'}).success
    '''
    if is_typing_type(dagster_type):
        raise DagsterInvariantViolationError(
            (
                'Must pass in a type from dagster module. You passed {dagster_type} '
                'which is part of python\'s typing module.'
            ).format(dagster_type=dagster_type)
        )

    dagster_type = resolve_dagster_type(dagster_type)
    with yield_empty_pipeline_context() as pipeline_context:
        context = pipeline_context.for_type(dagster_type)
        try:
            type_check = dagster_type.type_check(context, value)
        except Failure as failure:
            return TypeCheck(success=False, description=failure.description)

        if not isinstance(type_check, TypeCheck):
            raise DagsterInvariantViolationError(
                'Type checks can only return TypeCheck. Type {type_name} returned {value}.'.format(
                    type_name=dagster_type.name, value=repr(type_check)
                )
            )
        return type_check

def less_simple_data_frame_typecheck_metadata_fn(value) -> TypeCheck:
    return TypeCheck(
        'LessSimpleDataFrame summary statistics',
        [
            EventMetadataEntry.text(
                str(len(value)),
                'n_rows',
                'Number of rows seen in the data frame',
            ),
            EventMetadataEntry.text(
                str(len(value[0].keys()) if len(value) > 0 else 0),
                'n_cols',
                'Number of columns seen in the data frame',
            ),
            EventMetadataEntry.text(
                str(list(value[0].keys()) if len(value) > 0 else []),
                'column_names',
                'Keys of columns seen in the data frame',
            ),
        ],
    )

def validate_trip_dataframe(dataframe):
    TRIP_CONFIG = {
        'bike_id': {
            'bounds': (0, float('inf')),
            'expected_dtypes': {'int64'},
        },
        'start_time': {
            'bounds': (Timestamp(year=2018, month=1, day=1), Timestamp(year=2020, month=1, day=1)),
            'expected_dtypes': {'<M8[ns]', 'datetime64[ns]'},
        },
        'end_time': {
            'bounds': (Timestamp(year=2018, month=1, day=1), Timestamp(year=2020, month=1, day=1)),
            'expected_dtypes': {'<M8[ns]', 'datetime64[ns]'},
        },
        'interval_date': {
            'expected_dtypes': {'str', 'object'},
        },
    }
    failed_type_check = DataFrameValidator(TRIP_CONFIG).validate_columns_in_dataframe(dataframe)
    return (
        failed_type_check
        if failed_type_check
        else TypeCheck(
            success=True,
            description='Yay',
            metadata_entries=[
                EventMetadataEntry.text(
                    str(min(dataframe['start_time'])),
                    'min_start_time',
                    'Date data collection started',
                ),
                EventMetadataEntry.text(
                    str(max(dataframe['end_time'])), 'max_end_time', 'Timestamp of last trip'
                ),
                EventMetadataEntry.text(
                    str(len(dataframe)), 'n_rows', 'Number of rows seen in the dataframe'
                ),
                EventMetadataEntry.text(
                    str(dataframe.columns), 'columns', 'Keys of columns seen in the dataframe'
                ),
            ],
        )
    )

def test_raise_on_error_true_type_check_returns_successful_type_check():
    TruthyExceptionType = DagsterType(
        name="TruthyExceptionType",
        type_check_fn=lambda _, _val: TypeCheck(
            success=True, metadata_entries=[EventMetadataEntry.text("foo", "bar", "baz")]
        ),
    )

    @solid(output_defs=[OutputDefinition(TruthyExceptionType)])
    def foo_solid(_):
        return 1

    @pipeline
    def foo_pipeline():
        foo_solid()

    pipeline_result = execute_pipeline(foo_pipeline)
    assert pipeline_result.success
    for event in pipeline_result.step_event_list:
        if event.event_type_value == DagsterEventType.STEP_OUTPUT.value:
            assert event.event_specific_data.type_check_data
            assert event.event_specific_data.type_check_data.metadata_entries[0].label == "bar"
            assert (
                event.event_specific_data.type_check_data.metadata_entries[0].entry_data.text
                == "foo"
            )
            assert (
                event.event_specific_data.type_check_data.metadata_entries[0].description == "baz"
            )

    pipeline_result = execute_pipeline(foo_pipeline, raise_on_error=False)
    assert pipeline_result.success
    assert set(
        [
            DagsterEventType.STEP_START.value,
            DagsterEventType.STEP_OUTPUT.value,
            DagsterEventType.STEP_SUCCESS.value,
        ]
    ).issubset([event.event_type_value for event in pipeline_result.step_event_list])

def validate_traffic_dataframe(dataframe):
    TRAFFIC_CONFIG = {
        'interval_date': {
            'expected_dtypes': {'str', 'object'},
        },
        'peak_traffic_load': {
            'bounds': (0, float('inf')),
            'expected_dtypes': {'int64'},
        },
    }
    failed_type_check = DataFrameValidator(TRAFFIC_CONFIG).validate_columns_in_dataframe(dataframe)
    return (
        failed_type_check
        if failed_type_check
        else TypeCheck(
            success=True,
            description='Yay',
            metadata_entries=[
                EventMetadataEntry.text(
                    str(min(dataframe['peak_traffic_load'])), 'min_traffic_load', 'Best Peak Load'
                ),
                EventMetadataEntry.text(
                    str(max(dataframe['peak_traffic_load'])), 'max_traffic_load', 'Worst Peak Load'
                ),
                EventMetadataEntry.text(
                    str(mean(dataframe['peak_traffic_load'])),
                    'mean_traffic_load',
                    'Mean peak traffic',
                ),
                EventMetadataEntry.text(
                    str(median(dataframe['peak_traffic_load'])),
                    'median_traffic_load',
                    'Median peak traffic',
                ),
                EventMetadataEntry.text(
                    str(len(dataframe)), 'n_rows', 'Number of rows seen in the dataframe'
                ),
                EventMetadataEntry.text(
                    str(dataframe.columns), 'columns', 'Keys of columns seen in the dataframe'
                ),
            ],
        )
    )

if file_type == 'csv':
    path = file_options['path']
    return pd.read_csv(path, **dict_without_keys(file_options, 'path'))
elif file_type == 'parquet':
    return pd.read_parquet(file_options['path'])
elif file_type == 'table':
    return pd.read_csv(file_options['path'], sep='\t')
else:
    raise DagsterInvariantViolationError(
        'Unsupported file_type {file_type}'.format(file_type=file_type)
    )


DataFrame = as_dagster_type(
    pd.DataFrame,
    name='PandasDataFrame',
    description='''Two-dimensional size-mutable, potentially heterogeneous tabular data
    structure with labeled axes (rows and columns). See http://pandas.pydata.org/''',
    input_hydration_config=dataframe_input_schema,
    output_materialization_config=dataframe_output_schema,
    typecheck_metadata_fn=lambda value: TypeCheck(
        metadata_entries=[
            EventMetadataEntry.text(str(len(value)), 'row_count', 'Number of rows in DataFrame'),
            # string cast columns since they may be things like datetime
            EventMetadataEntry.json({'columns': list(map(str, value.columns))}, 'metadata'),
        ]
    ),
)
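

# A minimal, hypothetical wiring sketch (not taken from the snippets above). It assumes a
# recent dagster release that exports DagsterType, TypeCheck, and check_dagster_type, and
# shows how a type_check_fn like those collected in this file is attached to a type and
# exercised in isolation. The NonEmptyDict type and its check are illustrative only.
from dagster import DagsterType, TypeCheck, check_dagster_type


def non_empty_dict_type_check(_context, value):
    # Fail with a descriptive message, mirroring the failure pattern used throughout this file.
    if not isinstance(value, dict) or len(value) == 0:
        return TypeCheck(
            success=False,
            description="Expected a non-empty dict, got {!r}.".format(value),
        )
    # Attach lightweight metadata on success so the check is visible in event logs.
    return TypeCheck(success=True, metadata={"row_count": len(value)})


NonEmptyDict = DagsterType(name="NonEmptyDict", type_check_fn=non_empty_dict_type_check)

# check_dagster_type runs the type check outside of a job/pipeline, returning a TypeCheck.
assert check_dagster_type(NonEmptyDict, {"a": 1}).success
assert not check_dagster_type(NonEmptyDict, {}).success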