def validate_dependencies(conn: Connection, relation: RelationDescription, tmp_view_name: TempTableName) -> None:
    """Download the dependencies (based on a temporary view) and compare with table design."""
    if tmp_view_name.is_late_binding_view:
        dependencies = etl.design.bootstrap.fetch_dependency_hints(conn, relation.query_stmt)
        if dependencies is None:
            logger.warning("Unable to validate '%s' which depends on external tables", relation.identifier)
            return
        logger.info(
            "Dependencies of '%s' per query plan: %s", relation.identifier, join_with_single_quotes(dependencies)
        )
    else:
        dependencies = etl.design.bootstrap.fetch_dependencies(conn, tmp_view_name)
        logger.info(
            "Dependencies of '%s' per catalog: %s", relation.identifier, join_with_single_quotes(dependencies)
        )

    difference = compare_query_to_design(dependencies, relation.table_design.get("depends_on", []))
    if difference:
        logger.error("Mismatch in dependencies of '%s': %s", relation.identifier, difference)
        raise TableDesignValidationError("mismatched dependencies in '%s'" % relation.identifier)
    logger.info("Dependencies listing in design file for '%s' matches SQL", relation.identifier)

def compare_query_to_design(from_query: Iterable, from_design: Iterable) -> Optional[str]:
    """
    Calculate difference between the two lists and return human-interpretable string.

    The assumption here is that we can run a query to find dependencies and compare that list
    to what is stored in the table design file.

    >>> compare_query_to_design(["a", "b"], ["b", "a"])
    >>> compare_query_to_design(["a", "b"], ["a"])
    "not listed in design = 'b'"
    >>> compare_query_to_design(["a"], ["a", "b"])
    "not found from query = 'b'"
    >>> compare_query_to_design(["a", "b"], ["d", "c", "a"])
    "not listed in design = 'b'; not found from query = 'c', 'd'"
    """
    actual = frozenset(from_query)
    design = frozenset(from_design)

    not_in_design = join_with_single_quotes(actual - design)
    not_in_query = join_with_single_quotes(design - actual)

    if not_in_design and not_in_query:
        return "not listed in design = {}; not found from query = {}".format(not_in_design, not_in_query)
    elif not_in_design:
        return "not listed in design = {}".format(not_in_design)
    elif not_in_query:
        return "not found from query = {}".format(not_in_query)
    else:
        return None

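# A database-free illustration of how this comparison is used by validate_dependencies above:
# the design side is simply the "depends_on" list from a table design file. The relation names
# below are made up for the example.
example_design = {"depends_on": ["www.orders", "www.customers"]}
dependencies_from_catalog = ["www.orders", "www.payments"]
print(compare_query_to_design(dependencies_from_catalog, example_design.get("depends_on", [])))
# -> not listed in design = 'www.payments'; not found from query = 'www.customers'
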
def _sanitize_dependencies(descriptions: Sequence[SortableRelationDescription]) -> None:
    """
    Pass 1 of ordering -- make sure to drop unknown dependencies.

    This will change the sortable relations in place.
    """
    known_tables = frozenset({description.target_table_name for description in descriptions})
    has_unknown_dependencies = set()
    has_pg_catalog_dependencies = set()
    known_unknowns = set()

    for initial_order, description in enumerate(descriptions):
        unmanaged_dependencies = frozenset(dep for dep in description.dependencies if not dep.is_managed)
        pg_catalog_dependencies = frozenset(dep for dep in description.dependencies if dep.schema == "pg_catalog")
        unknowns = description.dependencies - known_tables - unmanaged_dependencies
        if unknowns:
            known_unknowns.update(unknowns)
            has_unknown_dependencies.add(description.target_table_name)
            # Drop the unknowns from the list of dependencies so that the loop below doesn't wait
            # for their resolution.
            description.dependencies = description.dependencies.difference(unknowns)
        if unmanaged_dependencies:
            logger.info(
                "The following dependencies for relation '%s' are not managed by Arthur: %s",
                description.identifier,
                join_with_single_quotes([dep.identifier for dep in unmanaged_dependencies]),
            )
        if pg_catalog_dependencies:
            has_pg_catalog_dependencies.add(description.target_table_name)

    if has_unknown_dependencies:
        logger.warning(
            "These relations were unknown during dependency ordering: %s",
            join_with_single_quotes([dep.identifier for dep in known_unknowns]),
        )
        logger.warning(
            "This caused these relations to have dependencies that are not known: %s",
            join_with_single_quotes([dep.identifier for dep in has_unknown_dependencies]),
        )

    # Make tables that depend on tables in pg_catalog depend on all our tables (except those
    # depending on pg_catalog tables).
    has_no_internal_dependencies = known_tables - known_unknowns - has_pg_catalog_dependencies
    for description in descriptions:
        if description.target_table_name in has_pg_catalog_dependencies:
            description.dependencies.update(has_no_internal_dependencies)

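# A hedged, self-contained sketch of how _sanitize_dependencies rewrites dependencies in place.
# The stand-in classes below are illustrative assumptions that only mimic the attributes the
# function actually touches (they are not the real TableName/SortableRelationDescription), and
# the sketch assumes the module-level logger and join_with_single_quotes used above are available.
from dataclasses import dataclass, field
from typing import Set


@dataclass(frozen=True)
class FakeTableName:
    schema: str
    table: str

    @property
    def identifier(self) -> str:
        return f"{self.schema}.{self.table}"

    @property
    def is_managed(self) -> bool:
        # Assumption for this sketch: anything outside pg_catalog counts as managed.
        return self.schema != "pg_catalog"


@dataclass
class FakeSortableRelation:
    target_table_name: FakeTableName
    dependencies: Set[FakeTableName] = field(default_factory=set)

    @property
    def identifier(self) -> str:
        return self.target_table_name.identifier


orders = FakeSortableRelation(FakeTableName("analytics", "orders"))
users = FakeSortableRelation(
    FakeTableName("analytics", "users"),
    {FakeTableName("analytics", "orders"), FakeTableName("analytics", "gone_missing")},
)
catalog_report = FakeSortableRelation(
    FakeTableName("analytics", "catalog_report"), {FakeTableName("pg_catalog", "pg_table_def")}
)

_sanitize_dependencies([orders, users, catalog_report])
# The unknown "analytics.gone_missing" was dropped, and the relation that reads from pg_catalog
# now also depends on every relation that has no internal dependencies.
assert FakeTableName("analytics", "gone_missing") not in users.dependencies
assert FakeTableName("analytics", "orders") in catalog_report.dependencies
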
def show_pipelines(selection: Sequence[str], as_json=False) -> None:
    """
    List the currently installed pipelines, possibly using a subset based on the selection pattern.

    Without a selection, prints an overview of the pipelines.
    With selection of a single pipeline, digs into details of that selected pipeline.
    If "as json" is chosen, then the output is JSON-formatted and includes all fields,
    not just the ones shown in the tables.
    """
    pipelines = list_pipelines(selection)

    if not pipelines:
        if selection:
            logger.warning("Found no pipelines matching glob pattern")
        else:
            logger.warning("Found no pipelines")
        if not as_json:
            print("*** No pipelines found ***")
        return

    if selection:
        if len(pipelines) > 1:
            logger.warning("Selection matched more than one pipeline")
        logger.info(
            "Currently selected pipelines: %s",
            join_with_single_quotes(pipeline.pipeline_id for pipeline in pipelines),
        )
    else:
        logger.info(
            "Available pipelines: %s",
            join_with_single_quotes(pipeline.pipeline_id for pipeline in pipelines),
        )

    if as_json:
        print(DataPipeline.as_json(pipelines))
        return

    print(
        etl.text.format_lines(
            [
                (
                    pipeline.pipeline_id,
                    pipeline.name,
                    pipeline.health_status or "---",
                    pipeline.state or "---",
                    pipeline.latest_run_time or "---",
                    pipeline.next_run_time or "---",
                )
                for pipeline in pipelines
            ],
            header_row=["Pipeline ID", "Name", "Health", "State", "Latest Run Time", "Next Run Time"],
            max_column_width=80,
        )
    )
    if len(pipelines) == 1:
        _show_pipeline_details(pipelines.pop())

def validate_column_references(table_design):
    """Make sure that table attributes and constraints only reference columns that actually exist."""
    column_list_references = [
        ("constraints", "primary_key"),
        ("constraints", "natural_key"),
        ("constraints", "surrogate_key"),
        ("constraints", "unique"),
        ("attributes", "interleaved_sort"),
        ("attributes", "compound_sort"),
    ]
    valid_columns = frozenset(column["name"] for column in table_design["columns"] if not column.get("skipped"))

    constraints = table_design.get("constraints", [])
    for obj, key in column_list_references:
        if obj == "constraints":
            # This evaluates all unique constraints at once by concatenating all of the columns.
            cols = [col for constraint in constraints for col in constraint.get(key, [])]
        else:  # 'attributes'
            cols = table_design.get(obj, {}).get(key, [])
        unknown = join_with_single_quotes(frozenset(cols).difference(valid_columns))
        if unknown:
            raise TableDesignSemanticError(
                "{key} columns in {obj} contain unknown column(s): {unknown}".format(obj=obj, key=key, unknown=unknown)
            )

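# A small illustration of the check above with a hand-written design dict; the column and
# constraint values here are made up for the example.
example_design = {
    "columns": [
        {"name": "order_id"},
        {"name": "customer_id"},
        {"name": "old_total", "skipped": True},
    ],
    "constraints": [{"primary_key": ["order_id"]}],
    "attributes": {"compound_sort": ["order_id", "customer_id"]},
}
validate_column_references(example_design)  # passes quietly

example_design["constraints"].append({"unique": ["old_total"]})
# Now the design is invalid: "old_total" is skipped, so it is not a valid column reference.
# validate_column_references(example_design)  # would raise TableDesignSemanticError
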
def backup_schemas(schemas: Iterable[DataWarehouseSchema], dry_run=False) -> None:
    """
    For existing schemas, rename them and drop access.

    Once the access is revoked, the backup schemas "disappear" from BI tools.
    """
    schema_lookup = {schema.name: schema for schema in schemas}

    dsn_etl = etl.config.get_dw_config().dsn_etl
    with closing(etl.db.connection(dsn_etl, autocommit=True, readonly=dry_run)) as conn:
        found = etl.db.select_schemas(conn, schema_lookup.keys())
        if not found:
            logger.info("Found no existing schemas to backup")
            return
        selected_names = join_with_single_quotes(found)
        if dry_run:
            logger.info("Dry-run: Skipping backup of schema(s): %s", selected_names)
            return

        logger.info("Creating backup of schema(s) %s", selected_names)
        for schema in [schema_lookup[name] for name in found]:
            logger.info("Revoking access from readers and writers to schema '%s' before backup", schema.name)
            revoke_schema_permissions(conn, schema)
            logger.info("Renaming schema '%s' to backup '%s'", schema.name, schema.backup_name)
            etl.db.drop_schema(conn, schema.backup_name)
            etl.db.alter_schema_rename(conn, schema.name, schema.backup_name)

def from_file_sets(cls, file_sets, required_relation_selector=None) -> List["RelationDescription"]:
    """
    Return a list of relation descriptions based on a list of file sets.

    If there's a file set without a table design file, then there's a warning and that file set
    is skipped. (This comes in handy when creating the design file for a CTAS or VIEW automatically.)

    If provided, the required_relation_selector will be used to mark dependencies as high priority.
    A failure to dump or load these relations will end the ETL run.
    """
    relations = []
    for file_set in file_sets:
        if file_set.design_file_name is not None:
            relations.append(cls(file_set))
        else:
            logger.warning(
                "Found file(s) without matching table design: %s", join_with_single_quotes(file_set.files)
            )

    if required_relation_selector:
        set_required_relations(relations, required_relation_selector)

    return relations

def __init__(self, source_relations, extracted_targets):
    missing_relations = [
        relation for relation in source_relations if relation.identifier not in extracted_targets
    ]
    self.message = (
        "Some source relations did not have extract events after the step start time: "
        + join_with_single_quotes(missing_relations)
    )

def validate_reload(schemas: List[DataWarehouseSchema], relations: List[RelationDescription], keep_going: bool):
    """
    Verify that columns between unloaded tables and reloaded tables are the same.

    Once the designs are validated, we can unload a relation 's.t' with a target 'u' and then
    extract and load it back into 'u.t'.

    Note that the order matters for these lists of columns. (Which is also why we can't take
    the (symmetric) difference between columns but must be careful checking the column lists.)
    """
    unloaded_relations = [d for d in relations if d.is_unloadable]
    target_lookup = {schema.name for schema in schemas if schema.is_an_unload_target}
    relations_lookup = {d.identifier: d for d in relations}

    for unloaded in unloaded_relations:
        try:
            if unloaded.unload_target not in target_lookup:
                raise TableDesignValidationError(
                    "invalid target '{}' in unloadable relation '{}'".format(
                        unloaded.unload_target, unloaded.identifier
                    )
                )
            else:
                logger.debug("Checking whether '%s' is loaded back in", unloaded.identifier)
                reloaded = TableName(unloaded.unload_target, unloaded.target_table_name.table)
                if reloaded.identifier in relations_lookup:
                    relation = relations_lookup[reloaded.identifier]
                    logger.info(
                        "Checking for consistency between '%s' and '%s'", unloaded.identifier, relation.identifier
                    )
                    unloaded_columns = unloaded.unquoted_columns
                    reloaded_columns = relation.unquoted_columns
                    if unloaded_columns != reloaded_columns:
                        diff = get_list_difference(reloaded_columns, unloaded_columns)
                        logger.error(
                            "Column difference detected between '%s' and '%s'",
                            unloaded.identifier,
                            relation.identifier,
                        )
                        logger.error(
                            "You need to replace, insert and/or delete in '%s' some column(s): %s",
                            relation.identifier,
                            join_with_single_quotes(diff),
                        )
                        raise TableDesignValidationError(
                            "unloaded relation '%s' failed to match counterpart" % unloaded.identifier
                        )
        except TableDesignValidationError:
            if keep_going:
                _error_occurred.set()
                logger.exception(
                    "Ignoring failure to validate '%s' and proceeding as requested:", unloaded.identifier
                )
            else:
                raise

def extract_sources(self) -> None:
    """Iterate over sources to be extracted and parallelize extraction at the source level."""
    self.logger.info(
        "Starting to extract %d relation(s) in %d schema(s)", len(self.relations), len(self.schemas)
    )
    self.failed_sources.clear()
    max_workers = len(self.schemas)

    with concurrent.futures.ThreadPoolExecutor(
        max_workers=max_workers, thread_name_prefix="extract-source"
    ) as executor:
        futures = []
        for source_name, relation_group in groupby(self.relations, attrgetter("source_name")):
            future = executor.submit(self.extract_source, self.schemas[source_name], list(relation_group))
            futures.append(future)
        if self.keep_going:
            done, not_done = concurrent.futures.wait(futures, return_when=concurrent.futures.ALL_COMPLETED)
        else:
            done, not_done = concurrent.futures.wait(futures, return_when=concurrent.futures.FIRST_EXCEPTION)

    if self.failed_sources:
        self.logger.error(
            "Failed to extract from these source(s): %s", join_with_single_quotes(self.failed_sources)
        )

    # Note that iterating over result of futures may raise an exception which surfaces
    # exceptions from threads. This happens when there is (at least) one required table
    # that failed to extract.
    missing_tables: List[str] = []
    for future in done:
        missing_tables.extend(future.result())
    if missing_tables:
        self.logger.warning(
            "Failed to extract %d relation(s): %s",
            len(missing_tables),
            join_with_single_quotes(missing_tables),
        )
    if not_done:
        raise DataExtractError("Extract failed to complete for {:d} source(s)".format(len(not_done)))

def filter_relations_for_sources(
    source_lookup: Dict[str, DataWarehouseSchema], relations: List[RelationDescription]
) -> List[RelationDescription]:
    """Filter for the relations that a given "extract" stage cares about."""
    selected = [relation for relation in relations if relation.source_name in source_lookup]
    if selected:
        sources = frozenset(relation.source_name for relation in selected)
        logger.info("Selected %d relation(s) from source(s): %s", len(selected), join_with_single_quotes(sources))
    return selected

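# A hedged usage sketch with stand-in objects that only provide the attributes used above;
# real callers pass DataWarehouseSchema and RelationDescription instances, and the example
# names are made up.
from types import SimpleNamespace

example_sources = {"www": SimpleNamespace(name="www"), "crm": SimpleNamespace(name="crm")}
example_relations = [
    SimpleNamespace(source_name="www", identifier="www.orders"),
    SimpleNamespace(source_name="events", identifier="events.clicks"),
]
print([r.identifier for r in filter_relations_for_sources(example_sources, example_relations)])
# -> ['www.orders']  (the "events" relation is dropped because it has no matching source schema)
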
def _promote_schemas(schemas: Iterable[DataWarehouseSchema], from_where: str, dry_run=False) -> None:
    """
    Promote (staging or backup) schemas into their standard names and permissions.

    Changes schema.from_name_attr -> schema.name; expects from_name_attr to be
    'backup_name' or 'staging_name'.
    """
    attr_name = from_where + "_name"
    assert attr_name in ("backup_name", "staging_name")
    from_name_schema_lookup = {getattr(schema, attr_name): schema for schema in schemas}

    dsn_etl = etl.config.get_dw_config().dsn_etl
    with closing(etl.db.connection(dsn_etl, autocommit=True, readonly=dry_run)) as conn:
        need_promotion = etl.db.select_schemas(conn, from_name_schema_lookup.keys())
        if not need_promotion:
            logger.info("Found no %s schemas to promote", from_where)
            return

        # Always log the original names, not the ones found in need_promotion.
        selected_names = join_with_single_quotes(
            from_name_schema_lookup[from_name].name for from_name in need_promotion
        )
        if dry_run:
            logger.info(
                "Dry-run: Skipping promotion of %d schema(s) from %s position: %s",
                len(need_promotion),
                from_where,
                selected_names,
            )
            return

        logger.info("Promoting %d schema(s) from %s position: %s", len(need_promotion), from_where, selected_names)
        for from_name in need_promotion:
            schema = from_name_schema_lookup[from_name]
            logger.info("Renaming schema '%s' from '%s'", schema.name, from_name)
            etl.db.drop_schema(conn, schema.name)
            etl.db.alter_schema_rename(conn, from_name, schema.name)
            logger.info("Granting readers and writers access to schema '%s' after promotion", schema.name)
            grant_schema_permissions(conn, schema)

def create_schema_and_grant_access(conn, schema, owner=None, use_staging=False, dry_run=False) -> None:
    group_names = join_with_single_quotes(schema.groups)
    name = schema.staging_name if use_staging else schema.name

    if dry_run:
        logger.info("Dry-run: Skipping creating schema '%s'", name)
    else:
        logger.info("Creating schema '%s'", name)
        etl.db.create_schema(conn, name, owner)
        etl.db.grant_all_on_schema_to_user(conn, name, schema.owner)

    if not schema.groups or use_staging:
        # Don't grant usage on staging schemas to readers/writers (if any).
        return None

    if dry_run:
        # Note: group_names is already quoted by join_with_single_quotes.
        logger.info("Dry-run: Skipping granting access in '%s' to %s", name, group_names)
    else:
        # Readers/writers are differentiated in table permissions, not schema permissions.
        logger.info("Granting access in '%s' to %s", name, group_names)
        etl.db.grant_usage(conn, name, schema.groups)

def validate_column_ordering(conn: Connection, relation: RelationDescription, tmp_view_name: TempTableName) -> None:
    """Download the column order (using the temporary view) and compare with table design."""
    attributes = etl.design.bootstrap.fetch_attributes(conn, tmp_view_name)
    actual_columns = [attribute.name for attribute in attributes]

    if not actual_columns and tmp_view_name.is_late_binding_view:
        # Thanks to late-binding views it is not an error for a view to not be able to resolve
        # its columns.
        logger.warning(
            "Order of columns in design of '%s' cannot be validated because external table is missing",
            relation.identifier,
        )
        return

    # Identity columns are inserted after the query has been run, so skip them here.
    expected_columns = [
        column["name"]
        for column in relation.table_design["columns"]
        if not (column.get("skipped") or column.get("identity"))
    ]

    diff = get_list_difference(expected_columns, actual_columns)
    if diff:
        logger.error(
            "Order of columns in design of '%s' does not match result of running its query", relation.identifier
        )
        logger.error(
            "You need to replace, insert and/or delete in '%s' some column(s): %s",
            relation.identifier,
            join_with_single_quotes(diff),
        )
        raise TableDesignValidationError("invalid columns or column order in '%s'" % relation.identifier)
    else:
        logger.info("Order of columns in design of '%s' matches result of running SQL query", relation.identifier)

def validate_upstream_constraints(conn: Connection, table: RelationDescription) -> None:
    """
    Compare table constraints between database and table design file.

    Note that "natural_key" or "surrogate_key" constraints are not valid in upstream (source) tables.
    Also, a "primary_key" in upstream may be used as a "unique" constraint in the design
    (but not vice versa).
    """
    current_constraint = etl.design.bootstrap.fetch_constraints(conn, table.source_table_name)
    design_constraint = table.table_design.get("constraints", [])

    current_primary_key = frozenset([col for c in current_constraint for col in c.get("primary_key", [])])
    current_uniques = [frozenset(c["unique"]) for c in current_constraint if "unique" in c]

    design_primary_key = frozenset([col for c in design_constraint for col in c.get("primary_key", [])])
    design_uniques = [frozenset(c["unique"]) for c in design_constraint if "unique" in c]

    # We'll pluck from the not_used info and report if anything wasn't used in the design.
    not_used = deepcopy(current_constraint)

    if design_primary_key:
        if current_primary_key == design_primary_key:
            for i in range(len(not_used)):
                if "primary_key" in not_used[i]:
                    del not_used[i]
                    break
        elif current_primary_key:
            raise TableDesignValidationError(
                "the primary_key constraint in '%s' (%s) does not match upstream (%s)"
                % (
                    table.identifier,
                    join_with_single_quotes(design_primary_key),
                    join_with_single_quotes(current_primary_key),
                )
            )
        else:
            raise TableDesignValidationError(
                "the primary_key constraint in '%s' (%s) is not enforced upstream"
                % (table.identifier, join_with_single_quotes(design_primary_key))
            )

    for design_unique in design_uniques:
        if current_primary_key == design_unique:
            for i in range(len(not_used)):
                if "primary_key" in not_used[i]:
                    del not_used[i]
                    break
        if design_unique in current_uniques:
            for i in range(len(not_used)):
                if "unique" in not_used[i] and frozenset(not_used[i]["unique"]) == design_unique:
                    del not_used[i]
                    break
        if current_primary_key != design_unique and design_unique not in current_uniques:
            raise TableDesignValidationError(
                "the unique constraint in '%s' (%s) is not enforced upstream"
                % (table.identifier, join_with_single_quotes(design_unique))
            )

    for constraint in not_used:
        for constraint_type, columns in constraint.items():
            logger.warning(
                "Upstream source has additional %s constraint (%s) for '%s'",
                constraint_type,
                join_with_single_quotes(columns),
                table.table_design["source_name"],
            )

def validate_upstream_columns(conn: Connection, table: RelationDescription) -> None:
    """
    Compare columns in upstream table to the table design file.

    It is an ERROR if the design lists columns that do not exist in the upstream table.
    Exceptions here are calculated columns (those starting with etl__) or columns that are
    marked as skipped.

    It causes a WARNING to have more columns in the upstream table than are defined in the design
    or to have columns skipped in the design that do not exist upstream.
    """
    source_table_name = table.source_table_name

    columns_info = etl.design.bootstrap.fetch_attributes(conn, source_table_name)
    if not columns_info:
        raise UpstreamValidationError("table '%s' is gone or has no columns left" % source_table_name.identifier)
    logger.info("Found %d column(s) in relation '%s'", len(columns_info), source_table_name.identifier)

    current_columns = frozenset(column.name for column in columns_info)
    design_columns = frozenset(
        column["name"] for column in table.table_design["columns"] if not column["name"].startswith("etl__")
    )
    design_required_columns = frozenset(
        column["name"]
        for column in table.table_design["columns"]
        if column["name"] in design_columns and not column.get("skipped", False)
    )

    missing_required_columns = design_required_columns.difference(current_columns)
    if missing_required_columns:
        raise UpstreamValidationError(
            "design of '%s' has columns that do not exist upstream: %s"
            % (source_table_name.identifier, join_with_single_quotes(missing_required_columns))
        )

    extra_design_columns = design_columns.difference(current_columns)
    if extra_design_columns:
        logger.warning(
            "Column(s) that are in the design of '%s' but do not exist upstream in '%s': %s",
            table.identifier,
            table.source_name,
            join_with_single_quotes(extra_design_columns),
        )

    missing_design_columns = current_columns.difference(design_columns)
    if missing_design_columns:
        logger.warning(
            "Column(s) that exist upstream in '%s' but not in the design '%s': %s",
            table.source_name,
            table.identifier,
            join_with_single_quotes(missing_design_columns),
        )

    current_is_not_null = {column.name for column in columns_info if column.not_null}
    for column in table.table_design["columns"]:
        if column.get("not_null") and column["name"] not in current_is_not_null:
            raise TableDesignValidationError(
                "not null constraint of column '{}' in '{}' not enforced upstream".format(
                    column["name"], table.identifier
                )
            )

def __str__(self) -> str:
    # See __init__ for tests.
    if len(self._patterns) == 0:
        return "['*.*']"
    else:
        return "[{}]".format(join_with_single_quotes(p.identifier for p in self._patterns))

def create_index(
    relations: Sequence[RelationDescription], groups: Iterable[str], with_columns: Optional[bool]
) -> None:
    """
    Create an "index" page with Markdown that lists all schemas and their tables.

    The groups parameter filters schemas to those that can be accessed by at least one of the given groups.
    """
    group_set = frozenset(groups)
    show_details = True if with_columns else False

    # We iterate over the list of relations so that we preserve their order with respect to schemas.
    schemas: Dict[str, dict] = OrderedDict()
    for relation in relations:
        if not group_set.intersection(relation.schema_config.reader_groups):
            continue
        schema_name = relation.target_table_name.schema
        if schema_name not in schemas:
            schemas[schema_name] = {"description": relation.schema_config.description, "relations": []}
        schemas[schema_name]["relations"].append(relation)
    if not schemas:
        logger.info("List of schemas is empty, selected groups: %s", join_with_single_quotes(group_set))
        return

    print("# List Of Relations By Schema")
    for schema_name, schema_info in schemas.items():
        print(f"""\n## Schema: "{schema_name}"\n""")
        if schema_info["description"]:
            print(f"{schema_info['description']}\n")
        rows = ([relation.target_table_name.table, relation.description] for relation in schema_info["relations"])
        print(tabulate(rows, headers=["Relation", "Description"], tablefmt="pipe"))
        if not show_details:
            continue

        for relation in schema_info["relations"]:
            relation_kind = "View" if relation.is_view_relation else "Table"
            print(f"""\n### {relation_kind}: "{relation.identifier}"\n""")
            if relation.description:
                print(f"{relation.description}\n")

            key_columns: FrozenSet[str] = frozenset()
            for constraint in relation.table_design.get("constraints", []):
                for constraint_name, constraint_columns in constraint.items():
                    if constraint_name in ("primary_key", "surrogate_key"):
                        key_columns = frozenset(constraint_columns)
                        break
            rows = (
                [
                    ":key:" if column["name"] in key_columns else "",
                    column["name"],
                    column.get("type", ""),
                    column.get("description", ""),
                ]
                for column in relation.table_design["columns"]
            )
            print(
                tabulate(rows, headers=["Key?", "Column Name", "Column Type", "Column Description"], tablefmt="pipe")
            )

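# For reference, tabulate's "pipe" table format produces GitHub-flavored Markdown, which is what
# makes the generated index render as tables. A tiny standalone example (the row content is made up):
from tabulate import tabulate

print(tabulate([["orders", "All customer orders"]], headers=["Relation", "Description"], tablefmt="pipe"))
# Prints a Markdown table: a header row, an alignment row of dashes and colons, then one data row.
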