def _validate_data(self, table_definition, data):
    """Validate that the dtypes in ``data`` match ``table_definition``.

    Pandas makes this difficult because DataFrames with no length have
    Object type, and those with nulls become float type.

    Dismod-AT has its own set of rules about representation of null values.

     * For a text column, a missing value is an empty string, ``""``.
     * For an integer column, a missing value is the minimum integer, but
       no integer value should ever be missing.
     * For a float column, infinity is the maximum float value, which is
       ``10e318`` or minimum, which is ``-10e318`` according to Dismod-AT's
       arbitrary version of calculating this.

    Side effect: every nullable int/float column present in ``data`` is
    rewritten in place with ``fillna(value=np.nan)`` before it is checked.

    Args:
        table_definition: Object exposing ``name`` and a mapping ``c`` of
            column name to column definition; each definition is read for
            ``nullable`` and ``primary_key``.
        data: Table contents, indexable by column name and exposing
            ``columns`` (presumably a ``pandas.DataFrame``).

    Raises:
        DismodFileError: A column that is neither nullable nor a primary
            key is missing from ``data``, ``data`` has columns the table
            does not define, or a per-type check rejects a column's dtype.
        RuntimeError: ``self._expected_type`` returned something other
            than ``int``, ``float`` or ``str``.
    """
    if len(data) == 0:
        # Length zero columns get converted on write.
        return

    for column_name, column_definition in table_definition.c.items():
        if column_name not in data:
            # Only primary keys and nullable columns may be left out.
            if not (column_definition.primary_key or column_definition.nullable):
                raise DismodFileError(
                    f"Missing column in data for table '{table_definition.name}': '{column_name}'"
                )
            continue

        expected_type = self._expected_type(column_definition)
        is_nullable_numeric = column_definition.nullable and expected_type in (int, float)
        if is_nullable_numeric:
            data[column_name] = data[column_name].fillna(value=np.nan)

        # Read the dtype only after the normalisation above: fillna may
        # hand back a column with a different dtype, and the checks must
        # describe the data that is actually stored in ``data``.
        actual_type = data[column_name].dtype
        is_pandas_extension = isinstance(actual_type, ExtensionDtype)

        if expected_type is int:
            self._check_int_type(actual_type, column_name, is_pandas_extension, table_definition)
        elif expected_type is float:
            self._check_float_type(actual_type, column_name, table_definition)
        elif expected_type is str:
            self._check_str_type(actual_type, column_name, data, table_definition)
        else:
            raise RuntimeError(
                f"Unexpected type from column definitions: {expected_type}."
            )

    extra_columns = set(data.columns).difference(table_definition.c.keys())
    if extra_columns:
        raise DismodFileError(
            f"extra columns in data for table '{table_definition.name}': {extra_columns}"
        )
def _check_int_type(actual_type, column_name, is_pandas_extension, table_definition):
    """Raise unless ``actual_type`` is acceptable for an integer column.

    A pandas extension dtype passes only when ``actual_type.is_dtype``
    accepts ``pd.Int64Dtype()``. Any other dtype passes when NumPy
    classifies it as an integer or floating sub-dtype.

    NOTE(review): there is no ``self`` parameter although
    ``_validate_data`` calls this as ``self._check_int_type(...)`` --
    presumably a ``@staticmethod``; confirm the decorator is present.

    Args:
        actual_type: dtype found on the column.
        column_name: Column name, used in the error message.
        is_pandas_extension: Whether ``actual_type`` is a pandas extension
            dtype; selects which of the two checks is applied.
        table_definition: Object whose ``name`` is used in the error message.

    Raises:
        DismodFileError: ``actual_type`` is not an accepted dtype.
    """
    if is_pandas_extension:
        acceptable = actual_type.is_dtype(pd.Int64Dtype())
    else:
        # Floats are tolerated because an int column holding a None is cast
        # to float; the column metadata converts it back to int on write.
        # NOTE(review): the earlier comment said object dtype is tolerated
        # too, but object is not accepted by this check.
        acceptable = any(
            np.issubdtype(actual_type, numeric_kind)
            for numeric_kind in (np.integer, np.floating)
        )

    if not acceptable:
        raise DismodFileError(
            f"column '{column_name}' in data for table '{table_definition.name}' must be integer"
        )
def _check_str_type(actual_type, column_name, data, table_definition):
    """Raise unless the text column ``column_name`` of ``data`` has object dtype.

    The dtype is read from ``data[column_name]``; ``actual_type`` is only
    used in the error message. When ``data`` has no rows nothing is checked,
    because empty columns are converted to string on write.

    NOTE(review): there is no ``self`` parameter although
    ``_validate_data`` calls this as ``self._check_str_type(...)`` --
    presumably a ``@staticmethod``; confirm the decorator is present.

    Args:
        actual_type: dtype reported in the error message.
        column_name: Name of the column to inspect.
        data: Table contents, indexable by column name.
        table_definition: Object whose ``name`` is used in the error message.

    Raises:
        DismodFileError: ``data`` has rows and the column dtype is not
            ``np.dtype('O')``.
    """
    if len(data) == 0:
        # Will convert to string on write of empty rows.
        return

    is_object_column = data[column_name].dtype == np.dtype("O")
    if is_object_column:
        return

    raise DismodFileError(
        f"column '{column_name}' in data for table '{table_definition.name}' must be string "
        f"but type is {actual_type}.")
def _check_float_type(actual_type, column_name, table_definition):
    """Raise unless ``actual_type`` is a NumPy numeric dtype.

    Any sub-dtype of ``np.number`` is accepted, so integer dtypes pass as
    well as floating ones.

    NOTE(review): there is no ``self`` parameter although
    ``_validate_data`` calls this as ``self._check_float_type(...)`` --
    presumably a ``@staticmethod``; confirm the decorator is present.

    Args:
        actual_type: dtype found on the column.
        column_name: Column name, used in the error message.
        table_definition: Object whose ``name`` is used in the error message.

    Raises:
        DismodFileError: ``actual_type`` is not a sub-dtype of ``np.number``.
    """
    if np.issubdtype(actual_type, np.number):
        return

    raise DismodFileError(
        f"column '{column_name}' in data for table '{table_definition.name}' must be numeric"
    )