def _validate_table(self, table_name, table_meta, table_data=None):
    """Check the metadata of one table and, optionally, its data.

    The expected dtypes are obtained from ``get_dtypes`` with ``ids=True``,
    so any error raised there propagates to the caller. If the table
    declares a ``primary_key``, it must name an existing field whose type
    is ``id``. When ``table_data`` is given, every column must be declared
    in the metadata and be castable (after dropping nulls) to its expected
    dtype, and every declared field must be present as a column.

    Args:
        table_name (str):
            Name of the table to validate.
        table_meta (dict):
            Metadata of the table to validate.
        table_data (pandas.DataFrame):
            If provided, make sure that the data matches the one described
            on the metadata.

    Raises:
        MetadataError:
            If the primary key is invalid, or the data has unexpected
            columns, invalid values or missing columns.
    """
    expected_dtypes = self.get_dtypes(table_name, ids=True)

    # A declared primary key has to point at an existing field of type 'id'.
    pk_name = table_meta.get('primary_key')
    if pk_name:
        pk_meta = table_meta['fields'].get(pk_name)
        if not pk_meta:
            raise MetadataError('Primary key is not an existing field.')

        if pk_meta['type'] != 'id':
            raise MetadataError('Primary key is not of type `id`.')

    # Without data there is nothing left to check.
    if table_data is None:
        return

    for column_name in table_data:
        try:
            # Popping consumes the entry, so whatever is left over after
            # the loop corresponds to columns missing from the data.
            expected = expected_dtypes.pop(column_name)
            non_null = table_data[column_name].dropna()
            non_null.astype(expected)
        except KeyError:
            raise MetadataError(
                'Unexpected column in table `{}`: `{}`'.format(
                    table_name, column_name)
            ) from None
        except ValueError as cast_error:
            raise MetadataError(
                'Invalid values found in column `{}` of table `{}`: `{}`'.format(
                    column_name, table_name, cast_error)
            ) from None

    if expected_dtypes:
        missing_columns = list(expected_dtypes)
        raise MetadataError('Missing columns on table {}: {}.'.format(
            table_name, missing_columns))
def validate(self, tables=None):
    """Validate this metadata.

    For each table listed under the ``tables`` entry of the metadata:

    * If table data is available, check that the table exists in the
      ``tables`` dictionary.
    * Run ``_validate_table`` on the table metadata (and its data, when
      available).
    * Run ``_validate_circular_relationships`` for the table.

    Every ``MetadataError`` raised by those checks is collected, and a
    single ``MetadataError`` listing all the problems is raised at the end.

    Args:
        tables (bool, dict):
            If a dict of tables is passed, validate that the columns and
            dtypes match the metadata. If any other truthy value (e.g.
            ``True``) is passed, load the tables with ``load_tables``
            instead. If falsy, omit the data validation.
            Defaults to ``None``.

    Raises:
        MetadataError:
            If the metadata has no ``tables`` entry, or if any of the
            per-table validations failed.
    """
    tables_meta = self._metadata.get('tables')
    if not tables_meta:
        raise MetadataError('"tables" entry not found in Metadata.')

    if tables and not isinstance(tables, dict):
        tables = self.load_tables()

    errors = []
    for table_name, table_meta in tables_meta.items():
        table = None
        if tables:
            table = tables.get(table_name)
            if table is None:
                errors.append('Table `{}` not found in tables'.format(table_name))

        # The helpers report problems by raising MetadataError and do not
        # accept an ``errors`` argument, so collect their messages here
        # instead of aborting on the first failure.
        try:
            self._validate_table(table_name, table_meta, table)
        except MetadataError as error:
            errors.append(str(error))

        try:
            self._validate_circular_relationships(table_name)
        except MetadataError as error:
            errors.append(str(error))

    if errors:
        raise MetadataError('Invalid Metadata specification:\n - ' + '\n - '.join(errors))
def get_dtypes(self, table_name, ids=False):
    """Build a ``dict`` mapping each field of a table to its ``dtype``.

    The dtype of every field is looked up in ``self._DTYPES`` using the
    ``(type, subtype)`` pair found in the field metadata. Fields of type
    ``id`` are only included when ``ids`` is ``True``; in that case each
    of them must be the table ``primary_key``, have a ``ref`` entry, or
    match the foreign key reported by ``get_foreign_key`` for one of the
    children of the table.

    Args:
        table_name (str):
            Table name for which to retrieve the ``dtypes``.
        ids (bool):
            Whether or not to include the id fields. Defaults to ``False``.

    Returns:
        dict:
            Dictionary that contains the field names and data types
            from a table.

    Raises:
        MetadataError:
            If a field has an invalid type and subtype combination, or
            if an ``id`` field is neither a primary nor a foreign key.
            Errors raised by ``get_table_meta`` propagate unchanged.
    """
    table_meta = self.get_table_meta(table_name)
    result = {}

    for field_name, field_meta in table_meta['fields'].items():
        kind = field_meta['type']
        subkind = field_meta.get('subtype')
        dtype = self._DTYPES.get((kind, subkind))
        if not dtype:
            raise MetadataError(
                'Invalid type and subtype combination for field {}: ({}, {})'
                .format(field_name, kind, subkind))

        if kind == 'id':
            if not ids:
                # id fields are left out unless explicitly requested.
                continue

            is_primary = field_name == table_meta.get('primary_key')
            if not is_primary and not field_meta.get('ref'):
                # Last chance: the field may be the foreign key that links
                # one of the child tables to this one.
                is_foreign = any(
                    field_name == self.get_foreign_key(table_name, child_table)
                    for child_table in self.get_children(table_name)
                )
                if not is_foreign:
                    raise MetadataError(
                        'id field `{}` is neither a primary or a foreign key'
                        .format(field_name))

        result[field_name] = dtype

    return result
def _get_field_dtype(self, field_name, field_metadata):
    """Return the dtype registered for the type and subtype of a field.

    Args:
        field_name (str):
            Name of the field, used only in the error message.
        field_metadata (dict):
            Metadata of the field. Must contain a ``type`` entry and may
            contain a ``subtype`` entry.

    Returns:
        The value stored in ``self._TYPES_TO_DTYPES`` for the
        ``(type, subtype)`` pair.

    Raises:
        MetadataError:
            If the ``(type, subtype)`` pair has no entry, or a falsy one,
            in ``self._TYPES_TO_DTYPES``.
    """
    type_key = (field_metadata['type'], field_metadata.get('subtype'))
    dtype = self._TYPES_TO_DTYPES.get(type_key)
    if dtype:
        return dtype

    raise MetadataError(
        'Invalid type and subtype combination for field {}: ({}, {})'.
        format(field_name, type_key[0], type_key[1]))
def _validate_circular_relationships(self, parent, children=None):
    """Validate that ``parent`` is not one of its own descendants.

    The descendants of ``parent`` are explored through ``get_children``,
    starting from ``children``. Each table is expanded at most once, so
    the traversal terminates even when the relationships contain a cycle
    that does not go through ``parent``.

    Args:
        parent (str):
            Name of the table to look for among its own descendants.
        children (iterable):
            Tables from which to start the search. If ``None``, the
            children of ``parent`` are used. Defaults to ``None``.

    Raises:
        MetadataError:
            If ``parent`` is reachable from ``children``.
    """
    if children is None:
        children = self.get_children(parent)

    pending = list(children)
    visited = set()
    while pending:
        table = pending.pop()
        if table == parent:
            raise MetadataError('Circular relationship found for table "{}"'.format(parent))

        # Skip tables that were already expanded: they cannot reveal
        # anything new, and revisiting them would loop forever on cycles
        # that do not include ``parent``.
        if table in visited:
            continue

        visited.add(table)
        pending.extend(self.get_children(table))