def validate_reload(schemas: List[DataWarehouseSchema], relations: List[RelationDescription], keep_going: bool):
    """
    Verify that columns between unloaded tables and reloaded tables are the same.

    Once the designs are validated, we can unload a relation 's.t' with a target 'u'
    and then extract and load it back into 'u.t'.

    Note that the order matters for these lists of columns.  (Which is also why we
    can't take the (symmetric) difference between columns but must be careful
    checking the column lists.)

    Raises TableDesignValidationError for an invalid unload target or mismatched
    columns, unless keep_going is true — then the error is logged, the shared
    error flag is set, and validation continues with the next relation.
    """
    unloaded_relations = [d for d in relations if d.is_unloadable]
    target_lookup = {schema.name for schema in schemas if schema.is_an_unload_target}
    relations_lookup = {d.identifier: d for d in relations}
    for unloaded in unloaded_relations:
        try:
            # Guard clause: the unload target must name a known unload schema.
            if unloaded.unload_target not in target_lookup:
                raise TableDesignValidationError(
                    "invalid target '{}' in unloadable relation '{}'".format(
                        unloaded.unload_target, unloaded.identifier
                    )
                )
            logger.debug("Checking whether '%s' is loaded back in", unloaded.identifier)
            reloaded = TableName(unloaded.unload_target, unloaded.target_table_name.table)
            if reloaded.identifier not in relations_lookup:
                # Not loaded back in — nothing to compare against.
                continue
            relation = relations_lookup[reloaded.identifier]
            logger.info("Checking for consistency between '%s' and '%s'", unloaded.identifier, relation.identifier)
            unloaded_columns = unloaded.unquoted_columns
            reloaded_columns = relation.unquoted_columns
            if unloaded_columns != reloaded_columns:
                diff = get_list_difference(reloaded_columns, unloaded_columns)
                logger.error(
                    "Column difference detected between '%s' and '%s'", unloaded.identifier, relation.identifier
                )
                logger.error(
                    "You need to replace, insert and/or delete in '%s' some column(s): %s",
                    relation.identifier,
                    join_with_single_quotes(diff),
                )
                raise TableDesignValidationError(
                    "unloaded relation '%s' failed to match counterpart" % unloaded.identifier
                )
        except TableDesignValidationError:
            if keep_going:
                _error_occurred.set()
                logger.exception(
                    "Ignoring failure to validate '%s' and proceeding as requested:", unloaded.identifier
                )
            else:
                raise
def validate_upstream_constraints(conn: connection, table: RelationDescription) -> None:
    """
    Compare table constraints between database and table design file.

    Note that "natural_key" or "surrogate_key" constraints are not valid in upstream (source) tables.
    Also, a "primary_key" in upstream may be used as a "unique" constraint in the design (but not vice versa).

    Raises TableDesignValidationError when a design constraint is not enforced upstream;
    upstream constraints that the design does not use are only logged as warnings.
    """
    current_constraint = etl.design.bootstrap.fetch_constraints(conn, table.source_table_name)
    design_constraint = table.table_design.get("constraints", [])
    # Flatten the per-constraint column lists into sets for order-independent comparison.
    current_primary_key = frozenset([col for c in current_constraint for col in c.get("primary_key", [])])
    current_uniques = [frozenset(c["unique"]) for c in current_constraint if "unique" in c]
    design_primary_key = frozenset([col for c in design_constraint for col in c.get("primary_key", [])])
    design_uniques = [frozenset(c["unique"]) for c in design_constraint if "unique" in c]
    # We'll pluck from the not_used info and report if anything wasn't used in the design at the end.
    not_used = deepcopy(current_constraint)
    if design_primary_key:
        if current_primary_key == design_primary_key:
            # Matched: remove the (single) upstream primary_key entry from the leftovers.
            # Deleting by index is safe because we break immediately after the del.
            for i in range(len(not_used)):
                if "primary_key" in not_used[i]:
                    del not_used[i]
                    break
        elif current_primary_key:
            raise TableDesignValidationError("the primary_key constraint in '%s' (%s) does not match upstream (%s)" %
                                             (table.identifier,
                                              join_with_quotes(design_primary_key),
                                              join_with_quotes(current_primary_key)))
        else:
            raise TableDesignValidationError("the primary key constraint in '%s' (%s) is not enforced upstream" %
                                             (table.identifier, join_with_quotes(design_primary_key)))
    for design_unique in design_uniques:
        # An upstream primary key may back a "unique" constraint in the design.
        if current_primary_key == design_unique:
            for i in range(len(not_used)):
                if "primary_key" in not_used[i]:
                    del not_used[i]
                    break
        if design_unique in current_uniques:
            for i in range(len(not_used)):
                if "unique" in not_used[i] and frozenset(not_used[i]["unique"]) == design_unique:
                    del not_used[i]
                    break
        # Neither the upstream primary key nor any upstream unique constraint covers this one.
        if current_primary_key != design_unique and design_unique not in current_uniques:
            raise TableDesignValidationError("the unique constraint in '%s' (%s) is not enforced upstream" %
                                             (table.identifier, join_with_quotes(design_unique)))
    # Anything still in not_used exists upstream but was never referenced by the design.
    for constraint in not_used:
        for constraint_type, columns in constraint.items():
            logger.warning("Upstream source has additional %s constraint (%s) for '%s'",
                           constraint_type, join_with_quotes(columns), table.table_design["source_name"])
def validate_dependencies(conn: Connection, relation: RelationDescription, tmp_view_name: TempTableName) -> None:
    """
    Download the dependencies (based on a temporary view) and compare with table design.

    For late-binding views the catalog cannot be consulted, so dependency hints are
    extracted from the query plan instead; if even those are unavailable (relation
    depends on external tables), validation is skipped with a warning.

    Raises TableDesignValidationError when the "depends_on" section of the design
    does not match what the database reports.
    """
    if tmp_view_name.is_late_binding_view:
        dependencies = etl.design.bootstrap.fetch_dependency_hints(conn, relation.query_stmt)
        if dependencies is None:
            logger.warning("Unable to validate '%s' which depends on external tables", relation.identifier)
            return
        logger.info(
            "Dependencies of '%s' per query plan: %s", relation.identifier, join_with_single_quotes(dependencies)
        )
    else:
        dependencies = etl.design.bootstrap.fetch_dependencies(conn, tmp_view_name)
        logger.info(
            "Dependencies of '%s' per catalog: %s", relation.identifier, join_with_single_quotes(dependencies)
        )
    difference = compare_query_to_design(dependencies, relation.table_design.get("depends_on", []))
    if difference:
        # Lazy %-style args: message is only formatted if this level is actually emitted.
        logger.error("Mismatch in dependencies of '%s': %s", relation.identifier, difference)
        raise TableDesignValidationError("mismatched dependencies in '%s'" % relation.identifier)
    logger.info("Dependencies listing in design file for '%s' matches SQL", relation.identifier)
def validate_column_ordering(conn: connection, relation: RelationDescription, tmp_view_name: TempTableName) -> None:
    """
    Download the column order (using the temporary view) and compare with table design.
    """
    observed_attributes = etl.design.bootstrap.fetch_attributes(conn, tmp_view_name)
    observed_columns = [attr.name for attr in observed_attributes]
    if not observed_columns and tmp_view_name.is_late_binding_view:
        # Thanks to late-binding views it is not an error for a view to not be able to resolve its columns.
        logger.warning(
            "Order of columns in design of '%s' cannot be validated because external table is missing",
            relation.identifier,
        )
        return
    # Identity columns are inserted after the query has been run, so skip them here.
    declared_columns = []
    for column in relation.table_design["columns"]:
        if column.get("skipped") or column.get("identity"):
            continue
        declared_columns.append(column["name"])
    mismatch = get_list_difference(declared_columns, observed_columns)
    if not mismatch:
        logger.info("Order of columns in design of '%s' matches result of running SQL query", relation.identifier)
        return
    logger.error(
        "Order of columns in design of '%s' does not match result of running its query", relation.identifier
    )
    logger.error(
        "You need to replace, insert and/or delete in '%s' some column(s): %s",
        relation.identifier,
        join_with_quotes(mismatch),
    )
    raise TableDesignValidationError("invalid columns or column order in '%s'" % relation.identifier)
def validate_dependencies(conn: connection, relation: RelationDescription, tmp_view_name: TempTableName) -> None:
    """
    Download the dependencies (usually, based on the temporary view) and compare with table design.

    Late-binding views depend on external tables whose dependencies cannot be looked
    up in the catalog, so they are skipped with a warning.

    Raises TableDesignValidationError when the "depends_on" section does not match.
    """
    if tmp_view_name.is_late_binding_view:
        logger.warning(
            "Dependencies of '%s' cannot be verified because it depends on an external table",
            relation.identifier,
        )
        return
    dependencies = etl.design.bootstrap.fetch_dependencies(conn, tmp_view_name)
    # We break with tradition and show the list of dependencies such that they can be copied into a design file.
    logger.info("Dependencies of '%s' per catalog: %s", relation.identifier, json.dumps(dependencies))
    difference = compare_query_to_design(dependencies, relation.table_design.get("depends_on", []))
    if difference:
        # Lazy %-style args: message is only formatted if this level is actually emitted.
        logger.error("Mismatch in dependencies of '%s': %s", relation.identifier, difference)
        raise TableDesignValidationError("mismatched dependencies in '%s'" % relation.identifier)
    logger.info("Dependencies listing in design file matches SQL")
def validate_upstream_columns(conn: connection, table: RelationDescription) -> None:
    """
    Compare columns in upstream table to the table design file.
    """
    source_table_name = table.source_table_name
    columns_info = etl.design.bootstrap.fetch_attributes(conn, source_table_name)
    if not columns_info:
        raise UpstreamValidationError("table '%s' is gone or has no columns left" % source_table_name.identifier)
    logger.info("Found %d column(s) in relation '%s'", len(columns_info), source_table_name.identifier)

    upstream_columns = frozenset(info.name for info in columns_info)
    # Calculated columns (prefix "etl__") never exist upstream, so keep them out of the comparison.
    declared_columns = frozenset(
        column["name"] for column in table.table_design["columns"] if not column["name"].startswith("etl__")
    )
    if not upstream_columns.issuperset(declared_columns):
        missing_upstream = declared_columns.difference(upstream_columns)
        raise UpstreamValidationError(
            "design of '%s' has columns that do not exist upstream: %s"
            % (source_table_name.identifier, join_with_quotes(missing_upstream))
        )
    undeclared_columns = upstream_columns.difference(declared_columns)
    if undeclared_columns:
        logger.warning(
            "Column(s) that exist upstream in '%s' but not in the design '%s': %s",
            table.source_name,
            table.identifier,
            join_with_quotes(undeclared_columns),
        )
    upstream_not_null = {info.name for info in columns_info if info.not_null}
    for column in table.table_design["columns"]:
        if column.get("not_null") and column["name"] not in upstream_not_null:
            raise TableDesignValidationError(
                "not null constraint of column '%s' in '%s' not enforced upstream"
                % (column["name"], table.identifier)
            )
def validate_upstream_columns(conn: connection, table: RelationDescription) -> None:
    """
    Compare columns in upstream table to the table design file.

    It is an ERROR if the design lists columns that do not exist in the upstream table.
    Exceptions here are calculated columns (those starting with etl__) or columns that
    are marked as skipped.

    It causes a WARNING to have more columns in the upstream table than are defined in
    the design or to have columns skipped in the design that do not exist upstream.
    """
    source_table_name = table.source_table_name
    columns_info = etl.design.bootstrap.fetch_attributes(conn, source_table_name)
    if not columns_info:
        raise UpstreamValidationError("table '%s' is gone or has no columns left" % source_table_name.identifier)
    logger.info("Found %d column(s) in relation '%s'", len(columns_info), source_table_name.identifier)

    current_columns = frozenset(info.name for info in columns_info)
    # Partition the design's columns: calculated columns (etl__ prefix) are ignored entirely,
    # skipped columns are compared leniently, all others are required to exist upstream.
    design_columns = set()
    design_required_columns = set()
    for column in table.table_design["columns"]:
        name = column["name"]
        if name.startswith("etl__"):
            continue
        design_columns.add(name)
        if not column.get("skipped", False):
            design_required_columns.add(name)

    missing_required_columns = design_required_columns - current_columns
    if missing_required_columns:
        raise UpstreamValidationError(
            "design of '%s' has columns that do not exist upstream: %s"
            % (source_table_name.identifier, join_with_quotes(missing_required_columns))
        )
    extra_design_columns = design_columns - current_columns
    if extra_design_columns:
        logger.warning(
            "Column(s) that are in the design of '%s' but do not exist upstream in '%s': %s",
            table.identifier,
            table.source_name,
            join_with_quotes(extra_design_columns),
        )
    missing_design_columns = current_columns - design_columns
    if missing_design_columns:
        logger.warning(
            "Column(s) that exist upstream in '%s' but not in the design '%s': %s",
            table.source_name,
            table.identifier,
            join_with_quotes(missing_design_columns),
        )
    current_is_not_null = {info.name for info in columns_info if info.not_null}
    for column in table.table_design["columns"]:
        if column.get("not_null") and column["name"] not in current_is_not_null:
            raise TableDesignValidationError(
                "not null constraint of column '%s' in '%s' not enforced upstream"
                % (column["name"], table.identifier)
            )
def bootstrap_transformations(local_dir, local_files, source_name, check_only=False, update=False, replace=False, dry_run=False):
    """
    Download design information for transformations by test-running in the data warehouse.

    "source_name" should be "CTAS" or "VIEW" or None (in which case the relation type
    currently specified will continue to be used).

    This is a callback of a command.

    In check_only mode nothing is written; a TableDesignValidationError is raised at
    the end if any table design would have been rewritten.
    """
    dw_config = etl.config.get_dw_config()
    transformation_schema = {schema.name for schema in dw_config.schemas if schema.has_transformations}
    transforms = [file_set for file_set in local_files if file_set.source_name in transformation_schema]
    if not (check_only or replace or update):
        # Filter down to new transformations: SQL files without matching YAML file.
        transforms = [file_set for file_set in transforms if not file_set.design_file_name]
    if not transforms:
        logger.warning("Found no new queries without matching design files")
        return
    relations = [RelationDescription(file_set) for file_set in transforms]
    if check_only or update or (replace and source_name is None):
        logger.info("Loading existing table design file(s)")
        try:
            RelationDescription.load_in_parallel(relations)
        except Exception:
            logger.warning("Make sure that table design files exist and are valid before trying to update")
            raise

    check_only_errors = 0
    with closing(etl.db.connection(dw_config.dsn_etl, autocommit=True)) as conn:
        for index, relation in enumerate(relations):
            logger.info("Working on transformation '%s' (%d/%d)", relation.identifier, index + 1, len(relations))
            # Be careful to not trigger a load of an unknown design file by accessing "kind".
            actual_kind = source_name or (relation.kind if relation.design_file_name else None)
            try:
                table_design = create_table_design_for_transformation(conn, actual_kind, relation, update or check_only)
            except RuntimeError as exc:
                if check_only:
                    print(f"Failed to create table design for {relation:x}: {exc}")
                    check_only_errors += 1
                    continue
                else:
                    raise

            if check_only:
                if relation.table_design != table_design:
                    check_only_errors += 1
                    print(f"Change detected in table design for {relation:x}")
                    print(diff_table_designs(relation.table_design, table_design, relation.design_file_name, "bootstrap"))
                continue

            if update and relation.table_design == table_design:
                # Lazy %-style args instead of an eagerly-formatted f-string.
                logger.info("No updates detected in table design for %s, skipping write", format(relation, "x"))
                continue

            source_dir = os.path.join(local_dir, relation.source_name)
            # Derive preferred name from the current design or SQL file.
            if relation.design_file_name is not None:
                filename = relation.design_file_name
            elif relation.sql_file_name is not None:
                # Escape the dot: the previous r".sql$" matched ANY character before "sql".
                filename = re.sub(r"\.sql$", ".yaml", relation.sql_file_name)
            else:
                filename = os.path.join(
                    source_dir,
                    f"{relation.target_table_name.schema}-{relation.target_table_name.table}.yaml",
                )
            save_table_design(
                relation.target_table_name, table_design, filename, overwrite=update or replace, dry_run=dry_run
            )
    if check_only_errors:
        raise TableDesignValidationError(f"found {check_only_errors} table design(s) that would be rewritten")
    if check_only:
        print("Congratulations. There were no changes in table design files.")