def fetch_tables(cx: Connection, source: DataWarehouseSchema, selector: TableSelector) -> List[TableName]:
    """
    Retrieve tables (matching selector) for this source, return as a list of TableName instances.

    The :source configuration contains an "allowlist" (which tables to include) and a "denylist"
    (which tables to exclude). Note that "exclude" always overrides "include."
    The list of tables matching the allowlist but not the denylist can be further narrowed down
    by the pattern in :selector.
    """
    # Look for relations ('r', ordinary tables), materialized views ('m'), and views ('v').
    result = etl.db.query(
        cx,
        """
        SELECT nsp.nspname AS "schema"
             , cls.relname AS "table"
          FROM pg_catalog.pg_class AS cls
          JOIN pg_catalog.pg_namespace AS nsp ON cls.relnamespace = nsp.oid
         WHERE cls.relname NOT LIKE 'tmp%%'
           AND cls.relname NOT LIKE 'pg_%%'
           AND cls.relkind IN ('r', 'm', 'v')
         ORDER BY nsp.nspname
             , cls.relname
        """,
    )
    found = []
    for row in result:
        source_table_name = TableName(row["schema"], row["table"])
        target_table_name = TableName(source.name, row["table"])
        for reject_pattern in source.exclude_tables:
            if source_table_name.match_pattern(reject_pattern):
                logger.debug("Table '%s' matches denylist", source_table_name.identifier)
                break
        else:
            for accept_pattern in source.include_tables:
                if source_table_name.match_pattern(accept_pattern):
                    if selector.match(target_table_name):
                        found.append(source_table_name)
                        logger.debug("Table '%s' is included in result set", source_table_name.identifier)
                        break
                    else:
                        logger.debug(
                            "Table '%s' matches allowlist but is not selected", source_table_name.identifier
                        )
    logger.info(
        "Found %d table(s) matching patterns; allowlist=%s, denylist=%s, subset='%s'",
        len(found),
        source.include_tables,
        source.exclude_tables,
        selector,
    )
    return found
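# Usage sketch (illustrative only, not part of the original module): a hypothetical helper that
# opens a read-only connection to one upstream source, mirroring the connection pattern used in
# create_table_designs_from_source below, and returns the selected tables. The names `source` and
# `selector` are assumed to be a configured DataWarehouseSchema and TableSelector.
def _example_list_upstream_tables(source: DataWarehouseSchema, selector: TableSelector) -> List[TableName]:
    with closing(etl.db.connection(source.dsn, autocommit=True, readonly=True)) as conn:
        return fetch_tables(conn, source, selector)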
def build_column_description(column: Dict[str, str], skip_identity=False, skip_references=False) -> str:
    """
    Return the description of a table column suitable for a table creation.

    See build_columns.

    >>> build_column_description({"name": "key", "sql_type": "int", "not_null": True, "encoding": "raw"})
    '"key" int ENCODE raw NOT NULL'
    >>> build_column_description({"name": "my_key", "sql_type": "int", "references": ["sch.ble", ["key"]]})
    '"my_key" int REFERENCES "sch"."ble" ( "key" )'
    """
    column_ddl = '"{name}" {sql_type}'.format(**column)
    if column.get("identity", False) and not skip_identity:
        column_ddl += " IDENTITY(1, 1)"
    if "encoding" in column:
        column_ddl += " ENCODE {}".format(column["encoding"])
    if column.get("not_null", False):
        column_ddl += " NOT NULL"
    if "references" in column and not skip_references:
        [table_identifier, [foreign_column]] = column["references"]
        foreign_name = TableName.from_identifier(table_identifier)
        column_ddl += ' REFERENCES {} ( "{}" )'.format(foreign_name, foreign_column)
    return column_ddl
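# Illustrative sketch (not part of the original module): the per-column descriptions can be joined
# into the column list of a CREATE TABLE statement. With skip_references=True, the REFERENCES
# clause from the second doctest above is dropped. The helper name is hypothetical.
def _example_column_list(columns: List[Dict[str, str]]) -> str:
    """
    >>> _example_column_list([
    ...     {"name": "key", "sql_type": "int", "not_null": True, "encoding": "raw"},
    ...     {"name": "my_key", "sql_type": "int", "references": ["sch.ble", ["key"]]},
    ... ])
    '"key" int ENCODE raw NOT NULL, "my_key" int'
    """
    return ", ".join(build_column_description(column, skip_references=True) for column in columns)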
def fetch_dependencies(cx: Connection, table_name: TableName) -> List[str]:
    """
    Lookup dependencies (other relations).

    Note that this will return an empty list for a late-binding view.
    """
    # See https://github.com/awslabs/amazon-redshift-utils/blob/master/src/AdminViews/v_constraint_dependency.sql
    stmt = """
        SELECT DISTINCT
               target_ns.nspname AS "schema"
             , target_cls.relname AS "table"
          FROM pg_catalog.pg_class AS cls
          JOIN pg_catalog.pg_namespace AS ns ON cls.relnamespace = ns.oid
          JOIN pg_catalog.pg_depend AS dep ON cls.oid = dep.refobjid
          JOIN pg_catalog.pg_depend AS target_dep ON dep.objid = target_dep.objid
          JOIN pg_catalog.pg_class AS target_cls ON target_dep.refobjid = target_cls.oid
                                                AND cls.oid <> target_cls.oid
          JOIN pg_catalog.pg_namespace AS target_ns ON target_cls.relnamespace = target_ns.oid
         WHERE ns.nspname LIKE %s
           AND cls.relname = %s
         ORDER BY "schema", "table"
        """
    dependencies = etl.db.query(cx, stmt, (table_name.schema, table_name.table))
    return [TableName(**row).identifier for row in dependencies]
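# Illustrative sketch (assumed usage, not part of the original module): the returned identifiers
# have the same shape as the "depends_on" entries that the dependencies method below reads back
# from a table design, so a design could record a relation's upstream dependencies like this.
# The helper name and the in-place update of `table_design` are hypothetical.
def _example_record_dependencies(cx: Connection, table_name: TableName, table_design: dict) -> None:
    table_design["depends_on"] = fetch_dependencies(cx, table_name)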
def dependencies(self) -> FrozenSet[TableName]:
    """Return relations named in the table design's "depends_on" list, cached after the first lookup."""
    if self._dependencies is None:
        dependent_table_names = [
            TableName.from_identifier(d) for d in self.table_design.get("depends_on", [])
        ]
        self._dependencies = frozenset(dependent_table_names)
    return self._dependencies
def validate_reload(schemas: List[DataWarehouseSchema], relations: List[RelationDescription], keep_going: bool):
    """
    Verify that columns between unloaded tables and reloaded tables are the same.

    Once the designs are validated, we can unload a relation 's.t' with a target 'u' and
    then extract and load it back into 'u.t'.

    Note that the order matters for these lists of columns. (Which is also why we can't take
    the (symmetric) difference between columns but must be careful checking the column lists.)
    """
    unloaded_relations = [d for d in relations if d.is_unloadable]
    target_lookup = {schema.name for schema in schemas if schema.is_an_unload_target}
    relations_lookup = {d.identifier: d for d in relations}

    for unloaded in unloaded_relations:
        try:
            if unloaded.unload_target not in target_lookup:
                raise TableDesignValidationError(
                    "invalid target '{}' in unloadable relation '{}'".format(
                        unloaded.unload_target, unloaded.identifier
                    )
                )
            else:
                logger.debug("Checking whether '%s' is loaded back in", unloaded.identifier)
                reloaded = TableName(unloaded.unload_target, unloaded.target_table_name.table)
                if reloaded.identifier in relations_lookup:
                    relation = relations_lookup[reloaded.identifier]
                    logger.info(
                        "Checking for consistency between '%s' and '%s'",
                        unloaded.identifier,
                        relation.identifier,
                    )
                    unloaded_columns = unloaded.unquoted_columns
                    reloaded_columns = relation.unquoted_columns
                    if unloaded_columns != reloaded_columns:
                        diff = get_list_difference(reloaded_columns, unloaded_columns)
                        logger.error(
                            "Column difference detected between '%s' and '%s'",
                            unloaded.identifier,
                            relation.identifier,
                        )
                        logger.error(
                            "You need to replace, insert and/or delete in '%s' some column(s): %s",
                            relation.identifier,
                            join_with_single_quotes(diff),
                        )
                        raise TableDesignValidationError(
                            "unloaded relation '%s' failed to match counterpart" % unloaded.identifier
                        )
        except TableDesignValidationError:
            if keep_going:
                _error_occurred.set()
                logger.exception(
                    "Ignoring failure to validate '%s' and proceeding as requested:", unloaded.identifier
                )
            else:
                raise
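# Worked example (illustrative, not part of the original module): validate_reload compares the
# column lists positionally, so two relations with the same column names in a different order
# still fail the check, which is exactly what a symmetric set difference would miss:
#
#     unloaded_columns = ["id", "created_at", "amount"]
#     reloaded_columns = ["id", "amount", "created_at"]
#     unloaded_columns != reloaded_columns            # True: flagged as a mismatch
#     set(unloaded_columns) ^ set(reloaded_columns)   # empty set: would not catch it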
def create_table_designs_from_source(source, selector, local_dir, local_files, dry_run=False):
    """
    Create table design files for tables from a single source to local directory.

    Whenever some table designs already exist locally, validate them against the information
    found from upstream.
    """
    source_dir = os.path.join(local_dir, source.name)
    normalize_and_create(source_dir, dry_run=dry_run)
    source_files = {
        file_set.source_table_name: file_set
        for file_set in local_files
        if file_set.source_name == source.name and file_set.design_file_name
    }
    try:
        logger.info("Connecting to database source '%s' to look for tables", source.name)
        with closing(etl.db.connection(source.dsn, autocommit=True, readonly=True)) as conn:
            source_tables = fetch_tables(conn, source, selector)
            for source_table_name in source_tables:
                if source_table_name in source_files:
                    logger.info(
                        "Skipping '%s' from source '%s' because table design file exists: '%s'",
                        source_table_name.identifier,
                        source.name,
                        source_files[source_table_name].design_file_name,
                    )
                else:
                    target_table_name = TableName(source.name, source_table_name.table)
                    table_design = create_table_design_for_source(conn, source_table_name, target_table_name)
                    save_table_design(
                        source_dir, source_table_name, target_table_name, table_design, dry_run=dry_run
                    )
        logger.info("Done with %d table(s) from source '%s'", len(source_tables), source.name)
    except Exception:
        logger.error("Error while processing source '%s'", source.name)
        raise

    existent = frozenset(name.identifier for name in source_files)
    upstream = frozenset(name.identifier for name in source_tables)
    not_found = upstream.difference(existent)
    if not_found:
        logger.warning(
            "New table(s) in source '%s' without local design: %s", source.name, join_with_quotes(not_found)
        )
    too_many = existent.difference(upstream)
    if too_many:
        logger.error(
            "Local table design(s) without table in source '%s': %s", source.name, join_with_quotes(too_many)
        )

    return len(source_tables)
def _find_file_sets_from(iterable, selector):
    """
    Return list of file sets ordered by (target) schema name, (source) schema_name and (source) table_name.

    Remember that the (target) schema name is the same as the source name (for upstream sources).
    The selector's base schemas (if present) will override alphabetical sorting for the source_name.
    """
    target_map = {}

    # Always return files sorted by sources (in original order) and target name.
    schema_index = {name: index for index, name in enumerate(selector.base_schemas)}

    for filename, values in _find_matching_files_from(iterable, selector):
        source_table_name = TableName(values["schema_name"], values["table_name"])
        target_table_name = TableName(values["source_name"], values["table_name"])

        if target_table_name.identifier not in target_map:
            natural_order = schema_index.get(values["source_name"]), source_table_name.identifier
            target_map[target_table_name.identifier] = TableFileSet(
                source_table_name, target_table_name, natural_order
            )

        file_set = target_map[target_table_name.identifier]
        file_type = values["file_type"]
        if file_type == "yaml":
            file_set.design_file_name = filename
        elif file_type == "sql":
            file_set.sql_file_name = filename
        elif file_type == "manifest":
            file_set.manifest_file_name = filename
        elif file_type == "data":
            file_set.add_data_file(filename)

    file_sets = sorted(target_map.values())
    logger.info("Found %d matching file(s) for %d table(s)", sum(len(fs) for fs in file_sets), len(file_sets))
    return file_sets
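# Illustrative note (assumption about the sort key): natural_order is the tuple of
# (position of the source in selector.base_schemas, source table identifier), so configuring
# base_schemas as, say, ["www", "store"] makes every file set from "www" sort ahead of those
# from "store" even though "store" comes first alphabetically:
#
#     schema_index = {name: index for index, name in enumerate(["www", "store"])}
#     (schema_index["www"], "public.orders") < (schema_index["store"], "public.orders")  # True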
def _find_matching_files_from(iterable, pattern, return_success_file=False):
    """
    Match file names against the target pattern and expected path format, return file information
    based on source name, source schema, table name and file type.

    This generator provides all files that are possibly relevant and it is up to the consumer
    to ensure consistency (e.g. that a design file exists or that a SQL file is not present
    along with a manifest). Files ending in '_$folder$' (created by some Spark jobs) are ignored,
    and '_SUCCESS' files are skipped as well unless return_success_file is set.
    """
    file_names_re = re.compile(
        r"""(?:^schemas|/schemas|^data|/data)
            /(?P<source_name>\w+)
            /(?P<schema_name>\w+)-(?P<table_name>\w+)
            (?:(?P<file_ext>.yaml|.sql|.manifest|/csv/(?:part-.*(?:\.gz)?|_SUCCESS)))$
        """,
        re.VERBOSE,
    )
    for filename in iterable:
        match = file_names_re.search(filename)
        if match:
            values = match.groupdict()
            target_table_name = TableName(values["source_name"], values["table_name"])
            if pattern.match(target_table_name):
                file_ext = values["file_ext"]
                if file_ext in [".yaml", ".sql", ".manifest"]:
                    values["file_type"] = file_ext[1:]
                elif file_ext.endswith("_SUCCESS"):
                    values["file_type"] = "success"
                elif file_ext.startswith("/csv"):
                    values["file_type"] = "data"
                # E.g. when deleting files out of a folder we want to know about the /csv/_SUCCESS file.
                if return_success_file or values["file_type"] != "success":
                    yield (filename, values)
        elif not filename.endswith("_$folder$"):
            logger.warning("Found file not matching expected format: '%s'", filename)
def _find_matching_files_from(iterable, pattern):
    """
    Return file information based on source name, source schema, table name and file type.

    This generator provides all files that are relevant and match the pattern. It is up to the
    consumer to ensure consistency (e.g. that a design file exists or that a SQL file is not
    present along with a manifest).

    Files ending in '_$folder$' are ignored. (They are created by some Spark jobs.)

    >>> found = list(_find_matching_files_from([
    ...     "/schemas/store/public-orders.yaml",
    ...     "/data/store/public-orders.manifest",
    ...     "/data/store/public-orders/csv/_SUCCESS",
    ...     "/data/store/public-orders/csv/part-0.gz",
    ...     "/schemas/dw/orders.sql",
    ...     "/schemas/dw/orders.yaml",
    ...     "/data/events/kinesis-stream/json/part-0.gz",
    ...     "/schemas/store/not-selected.yaml",
    ... ], pattern=TableSelector(["dw.*", "events", "store.orders"])))
    >>> files = {file_info.filename: file_info for file_info in found}
    >>> len(files)
    7
    >>> files["/schemas/store/public-orders.yaml"].file_type
    'yaml'
    >>> files["/schemas/store/public-orders.yaml"].schema_name
    'store'
    >>> files["/schemas/store/public-orders.yaml"].source_schema_name
    'public'
    >>> files["/schemas/store/public-orders.yaml"].table_name
    'orders'
    >>> files["/data/store/public-orders.manifest"].file_type
    'manifest'
    >>> files["/data/store/public-orders/csv/_SUCCESS"].file_type
    'success'
    >>> files["/data/store/public-orders/csv/part-0.gz"].file_type
    'data'
    >>> files["/data/store/public-orders/csv/part-0.gz"].file_format
    'CSV'
    >>> files["/schemas/dw/orders.sql"].schema_name
    'dw'
    >>> files["/schemas/dw/orders.sql"].source_schema_name
    'dw'
    >>> files["/schemas/dw/orders.sql"].table_name
    'orders'
    >>> files["/schemas/dw/orders.sql"].file_type
    'sql'
    >>> files["/schemas/dw/orders.yaml"].file_type
    'yaml'
    >>> files["/data/events/kinesis-stream/json/part-0.gz"].file_format
    'JSON'
    """
    for filename in iterable:
        file_info = FileInfo.from_filename(filename)
        if file_info is None:
            if not filename.endswith("_$folder$"):
                logger.warning("Found file not matching expected format: '%s'", filename)
            continue
        target_table_name = TableName(file_info.schema_name, file_info.table_name)
        if pattern.match(target_table_name):
            yield file_info
def create_table_designs_from_source(
    source, selector, local_dir, relations, update=False, replace=False, dry_run=False
):
    """
    Create table design files for tables from a single upstream source to local directory.

    Whenever some table designs already exist locally, validate them, update them or replace
    them against the information found from the upstream source.
    """
    source_dir = os.path.join(local_dir, source.name)
    normalize_and_create(source_dir, dry_run=dry_run)
    relation_lookup = {relation.source_table_name: relation for relation in relations}

    try:
        logger.info("Connecting to database source '%s' to look for tables", source.name)
        with closing(etl.db.connection(source.dsn, autocommit=True, readonly=True)) as conn:
            source_tables = fetch_tables(conn, source, selector)
            for source_table_name in source_tables:
                if source_table_name in relation_lookup and not (update or replace):
                    logger.info(
                        "Skipping '%s' from source '%s' because table design already exists: '%s'",
                        source_table_name.identifier,
                        source.name,
                        relation_lookup[source_table_name].design_file_name,
                    )
                    continue
                relation = relation_lookup.get(source_table_name) if update else None
                target_table_name = TableName(source.name, source_table_name.table)
                table_design = create_table_design_for_source(
                    conn, source_table_name, target_table_name, relation
                )
                if relation is not None and relation.table_design == table_design:
                    logger.info(f"No updates detected in table design for {target_table_name:x}, skipping write")
                    continue
                filename = os.path.join(source_dir, f"{source_table_name.schema}-{source_table_name.table}.yaml")
                save_table_design(
                    target_table_name, table_design, filename, overwrite=update or replace, dry_run=dry_run
                )
        logger.info("Done with %d table(s) from source '%s'", len(source_tables), source.name)
    except Exception:
        logger.error("Error while processing source '%s'", source.name)
        raise

    existent = frozenset(relation.source_table_name.identifier for relation in relations)
    upstream = frozenset(name.identifier for name in source_tables)
    not_found = upstream.difference(existent)
    if not_found:
        logger.warning(
            "New table(s) in source '%s' without local design: %s",
            source.name,
            join_with_single_quotes(not_found),
        )
    too_many = existent.difference(upstream)
    if too_many:
        logger.error(
            "Local table design(s) without table in source '%s': %s",
            source.name,
            join_with_single_quotes(too_many),
        )

    return len(source_tables)
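# Illustrative sketch (assumed driver loop, not part of the original module): since the function
# returns the number of upstream tables it saw, a caller can iterate over all configured sources
# and report a total. The parameter name `database_sources` is hypothetical; how schemas are
# filtered down to database-backed sources may differ in the actual code.
def _example_bootstrap_sources(database_sources, selector, local_dir, relations, dry_run=False) -> int:
    return sum(
        create_table_designs_from_source(source, selector, local_dir, relations, dry_run=dry_run)
        for source in database_sources
    )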