예제 #1
0
    def _validate_data(self, table_definition, data):
        """Validates that the dtypes in data match the expected types in the table_definition.
        Pandas makes this difficult because DataFrames with no length have Object type,
        and those with nulls become float type.

        Dismod-AT has its own set of rules about representation of null values.

         * For a text column, a missing value is an empty string, ``""``.
         * For an integer column, a missing value is the minimum integer,
           but no integer value should ever be missing.
         * For a float column, infinity is the maximum float value,
           which is ``10e318`` or minimum, which is ``-10e318`` according to
           Dismod-AT's arbitrary version of calculating this.
        """
        if len(data) == 0:
            # Length zero columns get converted on write.
            return

        columns_checked = set()

        for column_name, column_definition in table_definition.c.items():
            if column_name in data:
                actual_type = data[column_name].dtype
                is_pandas_extension = isinstance(actual_type, ExtensionDtype)
                expected_type = self._expected_type(column_definition)
                is_nullable_numeric = (column_definition.nullable
                                       and expected_type in [int, float])
                if is_nullable_numeric:
                    data[column_name] = data[column_name].fillna(value=np.nan)

                if expected_type is int:
                    self._check_int_type(actual_type, column_name,
                                         is_pandas_extension, table_definition)
                elif expected_type is float:
                    self._check_float_type(actual_type, column_name,
                                           table_definition)
                elif expected_type is str:
                    self._check_str_type(actual_type, column_name, data,
                                         table_definition)
                else:
                    raise RuntimeError(
                        f"Unexpected type from column definitions: {expected_type}."
                    )
            elif not (column_definition.primary_key
                      or column_definition.nullable):
                raise DismodFileError(
                    f"Missing column in data for table '{table_definition.name}': '{column_name}'"
                )
            columns_checked.add(column_name)

        extra_columns = set(data.columns).difference(table_definition.c.keys())
        if extra_columns:
            raise DismodFileError(
                f"extra columns in data for table '{table_definition.name}': {extra_columns}"
            )
예제 #2
0
 def _check_int_type(actual_type, column_name, is_pandas_extension, table_definition):
     if is_pandas_extension:
         if actual_type.is_dtype(pd.Int64Dtype()):
             return
         else:
             raise DismodFileError(
                 f"column '{column_name}' in data for table '{table_definition.name}' must be integer"
             )
     else:
         # Permit np.float because an int column with a None is cast to float.
         # Same for object. This is cast on write.
         # Because we use metadata, this will be converted for us to int when it is written.
         allowed = [np.integer, np.floating]
         if not any(np.issubdtype(actual_type, given_type) for given_type in allowed):
             raise DismodFileError(
                 f"column '{column_name}' in data for table '{table_definition.name}' must be integer"
             )
예제 #3
0
 def _check_str_type(actual_type, column_name, data, table_definition):
     if len(data) > 0:
         correct = data[column_name].dtype == np.dtype('O')
         if not correct:
             raise DismodFileError(
                 f"column '{column_name}' in data for table '{table_definition.name}' must be string "
                 f"but type is {actual_type}.")
     else:
         pass  # Will convert to string on write of empty rows.
예제 #4
0
 def _check_float_type(actual_type, column_name, table_definition):
     if not np.issubdtype(actual_type, np.number):
         raise DismodFileError(
             f"column '{column_name}' in data for table '{table_definition.name}' must be numeric"
         )