Example #1
def type_check_method(_, value):
    if not isinstance(value, dict):
        return TypeCheck(
            False,
            description='Value {value} should be of type {type_name}.'.format(
                value=value, type_name=name
            ),
        )
    for key in value:
        if key not in permitted_key_names:
            return TypeCheck(
                False,
                description=(
                    'Key {name} is not a permitted value, values can only be of: '
                    '{name_list}'
                ).format(name=key, name_list=permitted_key_names),
            )
    return TypeCheck(
        True,
        metadata_entries=[
            EventMetadataEntry.text(label='row_count', text=str(len(value))),
            EventMetadataEntry.text(label='series_names', text=', '.join(value.keys())),
        ],
    )
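Every snippet on this page is meant to be registered as the type_check_fn of a DagsterType; none of them runs on its own. A minimal sketch of that wiring, with a made-up type name and check (everything here besides DagsterType and TypeCheck is an assumption for illustration):

from dagster import DagsterType, TypeCheck


def dict_type_check(_context, value):
    # Succeed only when the value is a plain dict.
    return TypeCheck(success=isinstance(value, dict))


SimpleDict = DagsterType(
    name='SimpleDict',
    type_check_fn=dict_type_check,
)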
Example #2
def df_type_check(_, value):
    if not isinstance(value, dd.DataFrame):
        return TypeCheck(success=False)
    return TypeCheck(
        success=True,
        metadata_entries=[
            # string cast columns since they may be things like datetime
            EventMetadataEntry.json({"columns": list(map(str, value.columns))}, "metadata"),
        ],
    )
Example #3
def df_type_check(_, value):
    if not isinstance(value, pd.DataFrame):
        return TypeCheck(success=False)
    return TypeCheck(
        success=True,
        metadata_entries=[
            EventMetadataEntry.text(str(len(value)), 'row_count', 'Number of rows in DataFrame'),
            # string cast columns since they may be things like datetime
            EventMetadataEntry.json({'columns': list(map(str, value.columns))}, 'metadata'),
        ],
    )
Example #4
def df_type_check(_, value):
    if not isinstance(value, pd.DataFrame):
        return TypeCheck(success=False)
    return TypeCheck(
        success=True,
        metadata_entries=[
            MetadataEntry("row_count", value=str(len(value))),
            # string cast columns since they may be things like datetime
            MetadataEntry("metadata", value={"columns": list(map(str, value.columns))}),
        ],
    )
Example #5
def df_type_check(_, value):
    if not isinstance(value, pd.DataFrame):
        return TypeCheck(success=False)
    return TypeCheck(
        success=True,
        metadata_entries=[
            MetadataEntry.text(str(len(value)), "row_count", "Number of rows in DataFrame"),
            # string cast columns since they may be things like datetime
            MetadataEntry.json({"columns": list(map(str, value.columns))}, "metadata"),
        ],
    )
Example #6
def less_simple_data_frame_type_check(_, value):
    if not isinstance(value, list):
        return TypeCheck(
            success=False,
            description=(
                'LessSimpleDataFrame should be a list of dicts, got '
                '{type_}'
            ).format(type_=type(value)),
        )

    fields = list(value[0].keys())

    for i, row in enumerate(value):
        if not isinstance(row, dict):
            return TypeCheck(
                success=False,
                description=(
                    'LessSimpleDataFrame should be a list of dicts, '
                    'got {type_} for row {idx}'
                ).format(type_=type(row), idx=(i + 1)),
            )
        row_fields = list(row.keys())
        if fields != row_fields:
            return TypeCheck(
                success=False,
                description=(
                    'Rows in LessSimpleDataFrame should have the same fields, '
                    'got {actual} for row {idx}, expected {expected}'
                ).format(actual=row_fields, idx=(i + 1), expected=fields),
            )

    return TypeCheck(
        success=True,
        description='LessSimpleDataFrame summary statistics',
        metadata_entries=[
            EventMetadataEntry.text(
                str(len(value)),
                'n_rows',
                'Number of rows seen in the data frame',
            ),
            EventMetadataEntry.text(
                str(len(value[0].keys()) if len(value) > 0 else 0),
                'n_cols',
                'Number of columns seen in the data frame',
            ),
            EventMetadataEntry.text(
                str(list(value[0].keys()) if len(value) > 0 else []),
                'column_names',
                'Keys of columns seen in the data frame',
            ),
        ],
    )
Example #7
    def validate(self, data, *columns, **kwargs):
        if len(columns) == 0:
            columns = data.columns
        relevant_data = data[list(columns)]
        offending = {}
        offending_values = {}
        # TODO: grab metadata from here
        def inverse_validation(x):
            return not self.validation_fn(x)[0]

        for column in columns:
            results = relevant_data[relevant_data[column].apply(inverse_validation)]
            if len(results.index) > 0:
                offending[column] = ['row ' + str(i) for i in results.index.tolist()]
                offending_values[column] = results[column].tolist()
        if len(offending) == 0:
            if not self.raise_or_typecheck:
                return TypeCheck(success=True)
        else:
            metadict = {
                'expectation': self.validation_fn.__doc__,
                'actual': offending_values,
                'offending': offending,
            }
            exc = self.resulting_exception(
                constraint_name=self.name,
                constraint_description=self.description,
                **metadict)

            if self.raise_or_typecheck:
                raise exc
            else:
                return exc.return_as_typecheck()
Example #8
    def validate(self, data, *columns, **kwargs):
        if len(columns) == 0:
            columns = data.columns
        relevant_data = data[list(columns)]
        offending_columns = set()
        offending_values = {}
        for column in columns:
            # TODO: grab extra metadata
            if not self.validation_fn(relevant_data[column])[0]:
                offending_columns.add(column)
                offending_values[column] = relevant_data[column].to_numpy()
        if len(offending_columns) == 0 and not self.raise_or_typecheck:
            return TypeCheck(success=True)
        elif len(offending_columns) > 0:
            metadict = {
                'expectation': self.description.replace('Confirms', ''),
                'actual': offending_values,
                'offending': offending_columns,
            }
            exc = self.resulting_exception(
                constraint_name=self.name,
                constraint_description=self.description,
                **metadict)

            if self.raise_or_typecheck:
                raise exc
            else:
                return exc.return_as_typecheck()
Example #9
def _pandera_errors_to_type_check(
    error: pa.errors.SchemaErrors, _table_schema: TableSchema
) -> TypeCheck:
    return TypeCheck(
        success=False,
        description=str(error),
    )
Example #10
    def validate(self, data, *columns, **kwargs):
        if len(columns) == 0:
            columns = data.columns
        columns = [column for column in columns if column in data.columns]
        relevant_data = data[list(columns)]

        offending_columns = set()
        offending_values = {}
        for column in columns:
            # TODO: grab extra metadata
            res = self.validation_fn(relevant_data[column])
            if not res[0]:
                offending_columns.add(column)
                if res[1].get("actual") is not None:
                    offending_values[column] = [x.item() for x in res[1].get("actual").to_numpy()]
                else:
                    offending_values[column] = [x.item() for x in relevant_data[column].to_numpy()]
        if len(offending_columns) == 0 and not self.raise_or_typecheck:
            return TypeCheck(success=True)
        elif len(offending_columns) > 0:
            metadict = {
                "expectation": self.description.replace("Confirms", ""),
                "actual": offending_values,
                "offending": offending_columns,
            }
            exc = self.resulting_exception(
                constraint_name=self.name, constraint_description=self.description, **metadict
            )

            if self.raise_or_typecheck:
                raise exc
            else:
                return exc.return_as_typecheck()
Example #11
def test_raise_on_error_true_type_check_returns_unsuccessful_type_check():
    FalsyType = DagsterType(
        name="FalsyType",
        type_check_fn=lambda _, _val: TypeCheck(
            success=False, metadata_entries=[EventMetadataEntry.text("foo", "bar", "baz")]
        ),
    )

    @solid(output_defs=[OutputDefinition(FalsyType)])
    def foo_solid(_):
        return 1

    @pipeline
    def foo_pipeline():
        foo_solid()

    with pytest.raises(DagsterTypeCheckDidNotPass) as e:
        execute_pipeline(foo_pipeline)
    assert e.value.metadata_entries[0].label == "bar"
    assert e.value.metadata_entries[0].entry_data.text == "foo"
    assert e.value.metadata_entries[0].description == "baz"
    assert isinstance(e.value.dagster_type, DagsterType)

    pipeline_result = execute_pipeline(foo_pipeline, raise_on_error=False)
    assert not pipeline_result.success
    assert [event.event_type_value for event in pipeline_result.step_event_list] == [
        DagsterEventType.STEP_START.value,
        DagsterEventType.STEP_OUTPUT.value,
        DagsterEventType.STEP_FAILURE.value,
    ]
    for event in pipeline_result.step_event_list:
        if event.event_type_value == DagsterEventType.STEP_FAILURE.value:
            assert event.event_specific_data.error.cls_name == "DagsterTypeCheckDidNotPass"
Example #12
def validate_snapshot_timeseries(training_set_data):
    if len(training_set_data) != 2:
        return TypeCheck(
            success=False,
            description=(
                'Invalid training set. The tuple must consist of a training '
                'set and an output vector.'
            ),
        )
    # tuple argument types
    X, y = training_set_data
    if not (isinstance(X, ndarray) and isinstance(y, ndarray)):
        return TypeCheck(
            success=False,
            description=(
                'Both input matrix and output vector must be numpy arrays. '
                'X: {} | y: {}'
            ).format(type(X), type(y)),
        )

    timeseries_length, snapshot_length, num_timeseries = X.shape
    output_vector_length = y.shape[0]
    if num_timeseries == 0 or output_vector_length == 0:
        return TypeCheck(
            success=False,
            description='No empty training sets allowed',
        )

    if timeseries_length != output_vector_length:
        return TypeCheck(
            success=False,
            description='Every timeseries must have as many snapshots as outputs.',
        )

    return TypeCheck(
        success=True,
        metadata_entries=[
            EventMetadataEntry.text(
                str(num_timeseries), 'num_ts', 'Number of parallel timeseries.'
            ),
            EventMetadataEntry.text(
                str(timeseries_length), 'timeseries_length', 'Length of each timeseries.'
            ),
            EventMetadataEntry.text(
                str(snapshot_length),
                'snapshot_length',
                'Number of past observations for each input.',
            ),
        ],
    )
Example #13
    def _dagster_type_check(_, value):
        if not isinstance(value, pd.DataFrame):
            return TypeCheck(
                success=False,
                description="Must be a pandas.DataFrame. Got value of type {type_name}.".format(
                    type_name=type(value).__name__
                ),
            )
        individual_result_dict = {}

        if dataframe_validator is not None:
            individual_result_dict["dataframe"] = dataframe_validator.validate(value)
        if columns_validator is not None:
            individual_result_dict["columns"] = columns_validator.validate(value)

        if columns_aggregate_validator is not None:
            individual_result_dict["column-aggregates"] = columns_aggregate_validator.validate(value)

        typechecks_succeeded = True
        metadata = []
        overall_description = "Failed Constraints: {}"
        constraint_clauses = []
        for key, result in individual_result_dict.items():
            result_val = result.success
            if result_val:
                continue
            typechecks_succeeded = typechecks_succeeded and result_val
            result_dict = result.metadata_entries[0].entry_data.data
            metadata.append(
                EventMetadataEntry.json(
                    result_dict,
                    "{}-constraint-metadata".format(key),
                ))
            constraint_clauses.append("{} failing constraints, {}".format(
                key, result.description))
        # returns aggregates, then column, then dataframe
        return TypeCheck(
            success=typechecks_succeeded,
            description=overall_description.format("; ".join(constraint_clauses)),
            metadata_entries=sorted(metadata, key=lambda x: x.label),
        )
Example #14
def file_exists_at_path_type_check(_, value):
    if not isinstance(value, six.string_types):
        return TypeCheck(
            success=False,
            description='FileExistsAtPath must be a string in memory. Got {value}'.format(
                value=repr(value)
            ),
        )
    if not safe_isfile(value):
        return TypeCheck(
            success=False,
            description=(
                'FileExistsAtPath must be a path that points to a file that '
                'exists. "{value}" does not exist on disk'
            ).format(value=value),
        )

    return TypeCheck(success=True)
Example #15
    def type_check_fn(_context, value: object) -> TypeCheck:
        if isinstance(value, VALID_DATAFRAME_CLASSES):
            try:
                # `lazy` instructs pandera to capture every (not just the first) validation error
                schema.validate(value, lazy=True)
            except pa.errors.SchemaErrors as e:
                return _pandera_errors_to_type_check(e, table_schema)
            except Exception as e:
                return TypeCheck(
                    success=False,
                    description=f"Unexpected error during validation: {e}",
                )
        else:
            return TypeCheck(
                success=False,
                description=f"Must be one of {VALID_DATAFRAME_CLASSES}, not {type(value).__name__}.",
            )

        return TypeCheck(success=True)
Example #16
    def type_check_fn(_context, value):
        if not isinstance(value, pd.DataFrame):
            return TypeCheck(
                success=False,
                description=f"Must be pandas.DataFrame, not {type(value).__name__}.",
            )
        try:
            # `lazy` instructs pandera to capture every (not just the first) validation error
            schema.validate(value, lazy=True)
        except pa.errors.SchemaErrors as e:
            return TypeCheck(
                success=False,
                description=str(e),
                metadata={
                    "num_violations": len(e.failure_cases),
                },
            )

        return TypeCheck(success=True)
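For context, a hedged sketch of how a pandera-backed type_check_fn like the one above gets attached to a DagsterType; the schema and the NonNegativeFrame name are assumptions for illustration, not from the source:

import pandas as pd
import pandera as pa
from dagster import DagsterType, TypeCheck

# Hypothetical schema: a single non-negative integer column.
schema = pa.DataFrameSchema({"value": pa.Column(int, pa.Check.ge(0))})


def type_check_fn(_context, value):
    if not isinstance(value, pd.DataFrame):
        return TypeCheck(success=False, description=f"Must be pandas.DataFrame, not {type(value).__name__}.")
    try:
        schema.validate(value, lazy=True)  # collect every violation, not just the first
    except pa.errors.SchemaErrors as e:
        return TypeCheck(success=False, description=str(e))
    return TypeCheck(success=True)


NonNegativeFrame = DagsterType(name="NonNegativeFrame", type_check_fn=type_check_fn)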
Example #17
    def _dagster_type_check(_, value):
        if not isinstance(value, pd.DataFrame):
            return TypeCheck(
                success=False,
                description='Must be a pandas.DataFrame. Got value of type {type_name}.'.format(
                    type_name=type(value).__name__
                ),
            )
        individual_result_dict = {}
        if columns_validator is not None:
            individual_result_dict["columns"] = columns_validator.validate(value)

        if columns_aggregate_validator is not None:
            individual_result_dict["column aggregates"] = columns_aggregate_validator.validate(value)

        if dataframe_validator is not None:
            individual_result_dict["dataframe"] = dataframe_validator.validate(value)

        typechecks_succeeded = True
        metadata = []
        overall_description = ""
        for key, result in individual_result_dict.items():
            result_val = result.success
            if result_val:
                continue
            typechecks_succeeded = typechecks_succeeded and result_val
            result_dict = result.metadata_entries[0].entry_data.data
            metadata.append(
                EventMetadataEntry.json(
                    result_dict,
                    '{}-constraint-metadata'.format(key),
                )
            )
            overall_description += "{} failing constraints, requiring {}".format(
                key, result.description
            )
        return TypeCheck(
            success=typechecks_succeeded,
            description=overall_description,
            metadata_entries=metadata,
        )
Example #18
    def _dagster_type_check(value):
        if not isinstance(value, DataFrame):
            return TypeCheck(
                success=False,
                description='Must be a pandas.DataFrame. Got value of type {type_name}.'.format(
                    type_name=type(value).__name__
                ),
            )

        if columns is not None:
            try:
                validate_collection_schema(columns, value)
            except ConstraintViolationException as e:
                return TypeCheck(success=False, description=str(e))

        return TypeCheck(
            success=True,
            metadata_entries=_execute_summary_stats(name, value, summary_statistics)
            if summary_statistics
            else None,
        )
Example #19
    def _dagster_type_check(_, value):
        if not isinstance(value, pd.DataFrame):
            return TypeCheck(
                success=False,
                description='Must be a pandas.DataFrame. Got value of type {type_name}.'.format(
                    type_name=type(value).__name__
                ),
            )

        try:
            validate_constraints(
                value, pandas_columns=columns, dataframe_constraints=dataframe_constraints
            )
        except ConstraintViolationException as e:
            return TypeCheck(success=False, description=str(e))

        return TypeCheck(
            success=True,
            metadata_entries=_execute_summary_stats(name, value, event_metadata_fn)
            if event_metadata_fn
            else None,
        )
Example #20
def less_simple_data_frame_type_check(_, value):
    if not isinstance(value, list):
        return TypeCheck(
            success=False,
            description=f"LessSimpleDataFrame should be a list of dicts, got {type(value)}",
        )

    fields = list(value[0].keys())

    for i, row in enumerate(value):
        idx = i + 1
        if not isinstance(row, dict):
            return TypeCheck(
                success=False,
                description=f"LessSimpleDataFrame should be a list of dicts, got {type(row)} for row {idx}",
            )
        row_fields = list(row.keys())
        if fields != row_fields:
            return TypeCheck(
                success=False,
                description=(
                    f"Rows in LessSimpleDataFrame should have the same fields, "
                    f"got {row_fields} for row {idx}, expected {fields}"
                ),
            )

    return TypeCheck(
        success=True,
        description="LessSimpleDataFrame summary statistics",
        metadata={
            "n_rows": len(value),
            "n_cols": len(value[0].keys()) if len(value) > 0 else 0,
            "column_names":
            str(list(value[0].keys()) if len(value) > 0 else []),
        },
    )
Example #21
def positive_num_check(_, value):
    if value <= 0:
        return TypeCheck(
            success=False,
            description=(
                "Numbers cannot be 0 or negative, got {value} for "
                "PositiveNumber type"
            ).format(value=value),
            metadata_entries=[EventMetadataEntry.int(value, "The input number")],
        )
    else:
        return TypeCheck(success=True)
Example #22
def type_check_method(_, value):
    if not isinstance(value, dict):
        return TypeCheck(
            False,
            description="Value {value} should be of type {type_name}.".format(
                value=value, type_name=name
            ),
        )
    for key in value:
        if key not in permitted_key_names:
            return TypeCheck(
                False,
                description=(
                    "Key {name} is not a permitted value, values can only be of: {name_list}"
                ).format(name=key, name_list=permitted_key_names),
            )
    return TypeCheck(
        True,
        metadata_entries=[
            MetadataEntry("row_count", value=str(len(value))),
            MetadataEntry("series_names", value=", ".join(value.keys())),
        ],
    )
Example #23
    def validate(self, data, *args, **kwargs):
        res = self.validation_fn(data, *args, **kwargs)
        if not res[0]:
            exc = self.resulting_exception(
                constraint_name=self.name, constraint_description=self.description, **res[1]
            )

            if self.raise_or_typecheck:
                raise exc
            else:
                return exc.return_as_typecheck()

        else:
            return TypeCheck(success=True)
Example #24
    def type_check(self, value):
        if not isinstance(value, dict):
            raise Failure(
                'Value {value} should be of type {type_name}.'.format(
                    value=value, type_name=self.name
                )
            )
        for key in value:
            if key not in permitted_key_names:
                raise Failure(
                    'Key {name} is not a permitted value, values can only be of: '
                    '{name_list}'.format(name=key, name_list=permitted_key_names)
                )
        return TypeCheck(metadata_entries=[
            EventMetadataEntry.text(label='row_count', text=str(len(value))),
            EventMetadataEntry.text(label='series_names', text=', '.join(value.keys())),
        ])
Example #25
def check_dagster_type(dagster_type, value):
    '''Test a custom Dagster type.

    Args:
        dagster_type (Any): The Dagster type to test. Should be one of the
            :ref:`built-in types <builtin>`, a dagster type explicitly constructed with
            :py:func:`as_dagster_type`, :py:func:`@usable_as_dagster_type <dagster_type>`, or
            :py:func:`PythonObjectDagsterType`, or a Python type.
        value (Any): The runtime value to test.

    Returns:
        TypeCheck: The result of the type check.


    Examples:

        .. code-block:: python

            assert check_dagster_type(Dict[Any, Any], {'foo': 'bar'}).success
    '''

    if is_typing_type(dagster_type):
        raise DagsterInvariantViolationError(
            (
                'Must pass in a type from dagster module. You passed {dagster_type} '
                'which is part of python\'s typing module.'
            ).format(dagster_type=dagster_type)
        )

    dagster_type = resolve_dagster_type(dagster_type)
    with yield_empty_pipeline_context() as pipeline_context:
        context = pipeline_context.for_type(dagster_type)
        try:
            type_check = dagster_type.type_check(context, value)
        except Failure as failure:
            return TypeCheck(success=False, description=failure.description)

        if not isinstance(type_check, TypeCheck):
            raise DagsterInvariantViolationError(
                'Type checks can only return TypeCheck. Type {type_name} returned {value}.'.format(
                    type_name=dagster_type.name, value=repr(type_check)
                )
            )
        return type_check
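A hedged usage sketch for check_dagster_type with a custom type; EvenType is hypothetical and defined here only for illustration:

from dagster import DagsterType, TypeCheck, check_dagster_type

EvenType = DagsterType(
    name='EvenType',
    type_check_fn=lambda _, value: TypeCheck(
        success=isinstance(value, int) and value % 2 == 0,
        description='Value must be an even integer.',
    ),
)

assert check_dagster_type(EvenType, 2).success
assert not check_dagster_type(EvenType, 3).success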
Example #26
def less_simple_data_frame_typecheck_metadata_fn(value) -> TypeCheck:
    return TypeCheck(
        description='LessSimpleDataFrame summary statistics',
        metadata_entries=[
            EventMetadataEntry.text(
                str(len(value)),
                'n_rows',
                'Number of rows seen in the data frame',
            ),
            EventMetadataEntry.text(
                str(len(value[0].keys()) if len(value) > 0 else 0),
                'n_cols',
                'Number of columns seen in the data frame',
            ),
            EventMetadataEntry.text(
                str(list(value[0].keys()) if len(value) > 0 else []),
                'column_names',
                'Keys of columns seen in the data frame',
            ),
        ],
    )
Example #27
def validate_trip_dataframe(dataframe):
    TRIP_CONFIG = {
        'bike_id': {
            'bounds': (0, float('inf')),
            'expected_dtypes': {'int64'}
        },
        'start_time': {
            'bounds': (
                Timestamp(year=2018, month=1, day=1),
                Timestamp(year=2020, month=1, day=1),
            ),
            'expected_dtypes': {'<M8[ns]', 'datetime64[ns]'},
        },
        'end_time': {
            'bounds': (
                Timestamp(year=2018, month=1, day=1),
                Timestamp(year=2020, month=1, day=1),
            ),
            'expected_dtypes': {'<M8[ns]', 'datetime64[ns]'},
        },
        'interval_date': {
            'expected_dtypes': {'str', 'object'}
        },
    }
    failed_type_check = DataFrameValidator(TRIP_CONFIG).validate_columns_in_dataframe(dataframe)
    if failed_type_check:
        return failed_type_check
    return TypeCheck(
        success=True,
        description='Yay',
        metadata_entries=[
            EventMetadataEntry.text(
                str(min(dataframe['start_time'])),
                'min_start_time',
                'Date data collection started',
            ),
            EventMetadataEntry.text(
                str(max(dataframe['end_time'])), 'max_end_time', 'Timestamp of last trip'
            ),
            EventMetadataEntry.text(
                str(len(dataframe)), 'n_rows', 'Number of rows seen in the dataframe'
            ),
            EventMetadataEntry.text(
                str(dataframe.columns), 'columns', 'Keys of columns seen in the dataframe'
            ),
        ],
    )
Example #28
def test_raise_on_error_true_type_check_returns_successful_type_check():
    TruthyExceptionType = DagsterType(
        name="TruthyExceptionType",
        type_check_fn=lambda _, _val: TypeCheck(
            success=True, metadata_entries=[EventMetadataEntry.text("foo", "bar", "baz")]
        ),
    )

    @solid(output_defs=[OutputDefinition(TruthyExceptionType)])
    def foo_solid(_):
        return 1

    @pipeline
    def foo_pipeline():
        foo_solid()

    pipeline_result = execute_pipeline(foo_pipeline)
    assert pipeline_result.success
    for event in pipeline_result.step_event_list:
        if event.event_type_value == DagsterEventType.STEP_OUTPUT.value:
            assert event.event_specific_data.type_check_data
            assert event.event_specific_data.type_check_data.metadata_entries[0].label == "bar"
            assert (
                event.event_specific_data.type_check_data.metadata_entries[0].entry_data.text
                == "foo"
            )
            assert (
                event.event_specific_data.type_check_data.metadata_entries[0].description == "baz"
            )

    pipeline_result = execute_pipeline(foo_pipeline, raise_on_error=False)
    assert pipeline_result.success
    assert set(
        [
            DagsterEventType.STEP_START.value,
            DagsterEventType.STEP_OUTPUT.value,
            DagsterEventType.STEP_SUCCESS.value,
        ]
    ).issubset([event.event_type_value for event in pipeline_result.step_event_list])
Example #29
def validate_traffic_dataframe(dataframe):
    TRAFFIC_CONFIG = {
        'interval_date': {
            'expected_dtypes': {'str', 'object'}
        },
        'peak_traffic_load': {
            'bounds': (0, float('inf')),
            'expected_dtypes': {'int64'}
        },
    }
    failed_type_check = DataFrameValidator(TRAFFIC_CONFIG).validate_columns_in_dataframe(dataframe)
    if failed_type_check:
        return failed_type_check
    return TypeCheck(
        success=True,
        description='Yay',
        metadata_entries=[
            EventMetadataEntry.text(
                str(min(dataframe['peak_traffic_load'])), 'min_traffic_load', 'Best Peak Load'
            ),
            EventMetadataEntry.text(
                str(max(dataframe['peak_traffic_load'])), 'max_traffic_load', 'Worst Peak Load'
            ),
            EventMetadataEntry.text(
                str(mean(dataframe['peak_traffic_load'])),
                'mean_traffic_load',
                'Mean peak traffic',
            ),
            EventMetadataEntry.text(
                str(median(dataframe['peak_traffic_load'])),
                'median_traffic_load',
                'Median peak traffic',
            ),
            EventMetadataEntry.text(
                str(len(dataframe)), 'n_rows', 'Number of rows seen in the dataframe'
            ),
            EventMetadataEntry.text(
                str(dataframe.columns), 'columns', 'Keys of columns seen in the dataframe'
            ),
        ],
    )
Example #30
    if file_type == 'csv':
        path = file_options['path']
        return pd.read_csv(path, **dict_without_keys(file_options, 'path'))
    elif file_type == 'parquet':
        return pd.read_parquet(file_options['path'])
    elif file_type == 'table':
        return pd.read_csv(file_options['path'], sep='\t')
    else:
        raise DagsterInvariantViolationError(
            'Unsupported file_type {file_type}'.format(file_type=file_type)
        )


DataFrame = as_dagster_type(
    pd.DataFrame,
    name='PandasDataFrame',
    description='''Two-dimensional size-mutable, potentially heterogeneous
    tabular data structure with labeled axes (rows and columns).
    See http://pandas.pydata.org/''',
    input_hydration_config=dataframe_input_schema,
    output_materialization_config=dataframe_output_schema,
    typecheck_metadata_fn=lambda value: TypeCheck(
        metadata_entries=[
            EventMetadataEntry.text(str(len(value)), 'row_count', 'Number of rows in DataFrame'),
            # string cast columns since they may be things like datetime
            EventMetadataEntry.json({'columns': list(map(str, value.columns))}, 'metadata'),
        ]
    ),
)