Example #1
def validate_dependencies(conn: Connection, relation: RelationDescription,
                          tmp_view_name: TempTableName) -> None:
    """Download the dependencies (based on a temporary view) and compare with table design."""
    if tmp_view_name.is_late_binding_view:
        dependencies = etl.design.bootstrap.fetch_dependency_hints(
            conn, relation.query_stmt)
        if dependencies is None:
            logger.warning(
                "Unable to validate '%s' which depends on external tables",
                relation.identifier)
            return
        logger.info("Dependencies of '%s' per query plan: %s",
                    relation.identifier, join_with_single_quotes(dependencies))
    else:
        dependencies = etl.design.bootstrap.fetch_dependencies(
            conn, tmp_view_name)
        logger.info("Dependencies of '%s' per catalog: %s",
                    relation.identifier, join_with_single_quotes(dependencies))

    difference = compare_query_to_design(
        dependencies, relation.table_design.get("depends_on", []))
    if difference:
        logger.error("Mismatch in dependencies of '{}': {}".format(
            relation.identifier, difference))
        raise TableDesignValidationError("mismatched dependencies in '%s'" %
                                         relation.identifier)
    logger.info("Dependencies listing in design file for '%s' matches SQL",
                relation.identifier)
Example #2
def compare_query_to_design(from_query: Iterable,
                            from_design: Iterable) -> Optional[str]:
    """
    Calculate the difference between the two lists and return a human-readable string.

    The assumption here is that we can run a query to find dependencies
    and compare that list to what is stored in the table design file.

    >>> compare_query_to_design(["a", "b"], ["b", "a"])
    >>> compare_query_to_design(["a", "b"], ["a"])
    "not listed in design = 'b'"
    >>> compare_query_to_design(["a"], ["a", "b"])
    "not found from query = 'b'"
    >>> compare_query_to_design(["a", "b"], ["d", "c", "a"])
    "not listed in design = 'b'; not found from query = 'c', 'd'"
    """
    actual = frozenset(from_query)
    design = frozenset(from_design)

    not_in_design = join_with_single_quotes(actual - design)
    not_in_query = join_with_single_quotes(design - actual)

    if not_in_design and not_in_query:
        return "not listed in design = {}; not found from query = {}".format(
            not_in_design, not_in_query)
    elif not_in_design:
        return "not listed in design = {}".format(not_in_design)
    elif not_in_query:
        return "not found from query = {}".format(not_in_query)
    else:
        return None
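
The helper join_with_single_quotes is shared by all of these examples. Judging from the doctest output above (items come back sorted, single-quoted, and comma-separated), a minimal sketch could look like the following; this is an assumption for illustration, not the project's actual implementation.

from typing import Iterable


def join_with_single_quotes(names: Iterable) -> str:
    # Sketch only: sort for stable output, wrap each item in single quotes, join with ", ".
    return ", ".join("'{}'".format(name) for name in sorted(str(name) for name in names))
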
Example #3
def _sanitize_dependencies(
        descriptions: Sequence[SortableRelationDescription]) -> None:
    """
    Pass 1 of ordering -- make sure to drop unknown dependencies.

    This will change the sortable relations in place.
    """
    known_tables = frozenset(
        {description.target_table_name
         for description in descriptions})
    has_unknown_dependencies = set()
    has_pg_catalog_dependencies = set()
    known_unknowns = set()

    for initial_order, description in enumerate(descriptions):
        unmanaged_dependencies = frozenset(dep
                                           for dep in description.dependencies
                                           if not dep.is_managed)
        pg_catalog_dependencies = frozenset(dep
                                            for dep in description.dependencies
                                            if dep.schema == "pg_catalog")
        unknowns = description.dependencies - known_tables - unmanaged_dependencies
        if unknowns:
            known_unknowns.update(unknowns)
            has_unknown_dependencies.add(description.target_table_name)
            # Drop the unknowns from the list of dependencies so that the loop below doesn't wait
            # for their resolution.
            description.dependencies = description.dependencies.difference(
                unknowns)
        if unmanaged_dependencies:
            logger.info(
                "The following dependencies for relation '%s' are not managed by Arthur: %s",
                description.identifier,
                join_with_single_quotes(
                    [dep.identifier for dep in unmanaged_dependencies]),
            )
        if pg_catalog_dependencies:
            has_pg_catalog_dependencies.add(description.target_table_name)

    if has_unknown_dependencies:
        logger.warning(
            "These relations were unknown during dependency ordering: %s",
            join_with_single_quotes([dep.identifier
                                     for dep in known_unknowns]),
        )
        logger.warning(
            "This caused these relations to have dependencies that are not known: %s",
            join_with_single_quotes(
                [dep.identifier for dep in has_unknown_dependencies]),
        )

    # Make tables that depend on tables in pg_catalog depend on all our tables (except those
    # depending on pg_catalog tables).
    has_no_internal_dependencies = known_tables - known_unknowns - has_pg_catalog_dependencies
    for description in descriptions:
        if description.target_table_name in has_pg_catalog_dependencies:
            description.dependencies.update(has_no_internal_dependencies)
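
A small, self-contained illustration of that final pg_catalog rewrite, using invented table names:

# Hypothetical names: "reporting.report" reads from pg_catalog, the others do not.
known_tables = {"www.users", "www.orders", "reporting.report"}
known_unknowns = set()
has_pg_catalog_dependencies = {"reporting.report"}

has_no_internal_dependencies = known_tables - known_unknowns - has_pg_catalog_dependencies
assert has_no_internal_dependencies == {"www.users", "www.orders"}
# "reporting.report" now also depends on "www.users" and "www.orders", so it sorts after them.
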
Example #4
def show_pipelines(selection: Sequence[str], as_json=False) -> None:
    """
    List the currently installed pipelines, possibly using a subset based on the selection pattern.

    Without a selection, prints an overview of the pipelines.
    With selection of a single pipeline, digs into details of that selected pipeline.

    If "as json" is chosen, then the output is JSON-formatted
    and includes all fields, not just the ones shown in the tables.
    """
    pipelines = list_pipelines(selection)

    if not pipelines:
        if selection:
            logger.warning("Found no pipelines matching glob pattern")
        else:
            logger.warning("Found no pipelines")
        if not as_json:
            print("*** No pipelines found ***")
        return

    if selection:
        if len(pipelines) > 1:
            logger.warning("Selection matched more than one pipeline")
        logger.info(
            "Currently selected pipelines: %s",
            join_with_single_quotes(pipeline.pipeline_id for pipeline in pipelines),
        )
    else:
        logger.info(
            "Available pipelines: %s",
            join_with_single_quotes(pipeline.pipeline_id for pipeline in pipelines),
        )
    if as_json:
        print(DataPipeline.as_json(pipelines))
        return

    print(
        etl.text.format_lines(
            [
                (
                    pipeline.pipeline_id,
                    pipeline.name,
                    pipeline.health_status or "---",
                    pipeline.state or "---",
                    pipeline.latest_run_time or "---",
                    pipeline.next_run_time or "---",
                )
                for pipeline in pipelines
            ],
            header_row=["Pipeline ID", "Name", "Health", "State", "Latest Run Time", "Next Run Time"],
            max_column_width=80,
        )
    )
    if len(pipelines) == 1:
        _show_pipeline_details(pipelines.pop())
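
Possible invocations, assuming pipelines are selected by a glob pattern as the docstring and warning message suggest (the pattern itself is made up):

# Overview of every installed pipeline:
show_pipelines([])
# Details of one pipeline, selected by a (hypothetical) glob pattern, printed as JSON:
show_pipelines(["df-0123*"], as_json=True)
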
Example #5
def validate_column_references(table_design):
    """Make sure that table attributes and constraints only reference columns that actually exist."""
    column_list_references = [
        ("constraints", "primary_key"),
        ("constraints", "natural_key"),
        ("constraints", "surrogate_key"),
        ("constraints", "unique"),
        ("attributes", "interleaved_sort"),
        ("attributes", "compound_sort"),
    ]
    valid_columns = frozenset(column["name"]
                              for column in table_design["columns"]
                              if not column.get("skipped"))

    constraints = table_design.get("constraints", [])
    for obj, key in column_list_references:
        if obj == "constraints":
            # Collect the columns from all constraints of this type at once (there may be
            # several "unique" constraints, for example).
            cols = [
                col for constraint in constraints
                for col in constraint.get(key, [])
            ]
        else:  # 'attributes'
            cols = table_design.get(obj, {}).get(key, [])
        unknown = join_with_single_quotes(
            frozenset(cols).difference(valid_columns))
        if unknown:
            raise TableDesignSemanticError(
                "{key} columns in {obj} contain unknown column(s): {unknown}".
                format(obj=obj, key=key, unknown=unknown))
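
A minimal design fragment (invented for illustration) that would trip this check, because "deleted_at" is skipped and therefore not a valid column reference:

table_design = {
    "columns": [
        {"name": "id"},
        {"name": "name"},
        {"name": "deleted_at", "skipped": True},
    ],
    "constraints": [{"primary_key": ["id"]}, {"unique": ["deleted_at"]}],
}
# validate_column_references(table_design) would raise TableDesignSemanticError:
#   unique columns in constraints contain unknown column(s): 'deleted_at'
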
Example #6
def backup_schemas(schemas: Iterable[DataWarehouseSchema],
                   dry_run=False) -> None:
    """
    For existing schemas, rename them and drop access.

    Once the access is revoked, the backup schemas "disappear" from BI tools.
    """
    schema_lookup = {schema.name: schema for schema in schemas}
    dsn_etl = etl.config.get_dw_config().dsn_etl
    with closing(etl.db.connection(dsn_etl, autocommit=True,
                                   readonly=dry_run)) as conn:
        found = etl.db.select_schemas(conn, schema_lookup.keys())
        if not found:
            logger.info("Found no existing schemas to backup")
            return

        selected_names = join_with_single_quotes(found)
        if dry_run:
            logger.info("Dry-run: Skipping backup of schema(s): %s",
                        selected_names)
            return

        logger.info("Creating backup of schema(s) %s", selected_names)
        for schema in [schema_lookup[name] for name in found]:
            logger.info(
                "Revoking access from readers and writers to schema '%s' before backup",
                schema.name)
            revoke_schema_permissions(conn, schema)
            logger.info("Renaming schema '%s' to backup '%s'", schema.name,
                        schema.backup_name)
            etl.db.drop_schema(conn, schema.backup_name)
            etl.db.alter_schema_rename(conn, schema.name, schema.backup_name)
Example #7
    def from_file_sets(
            cls,
            file_sets,
            required_relation_selector=None) -> List["RelationDescription"]:
        """
        Return a list of relation descriptions based on a list of file sets.

        If there's a file set without a table design file, then there's a warning and that file set
        is skipped. (This comes in handy when creating the design file for a CTAS or VIEW
        automatically.)

        If provided, the required_relation_selector will be used to mark dependencies as
        high-priority. A failure to dump or load these relations will end the ETL run.
        """
        relations = []
        for file_set in file_sets:
            if file_set.design_file_name is not None:
                relations.append(cls(file_set))
            else:
                logger.warning(
                    "Found file(s) without matching table design: %s",
                    join_with_single_quotes(file_set.files))

        if required_relation_selector:
            set_required_relations(relations, required_relation_selector)

        return relations
Example #8
    def __init__(self, source_relations, extracted_targets):
        missing_relations = [
            relation for relation in source_relations
            if relation.identifier not in extracted_targets
        ]
        self.message = (
            "Some source relations did not have extract events after the step start time: "
            + join_with_single_quotes(missing_relations))
Example #9
def validate_reload(schemas: List[DataWarehouseSchema],
                    relations: List[RelationDescription], keep_going: bool):
    """
    Verify that columns between unloaded tables and reloaded tables are the same.

    Once the designs are validated, we can unload a relation 's.t' with a target 'u' and
    then extract and load it back into 'u.t'.

    Note that the order matters for these lists of columns. (This is also why we can't take
    the (symmetric) difference between columns but must be careful when checking the column lists.)
    """
    unloaded_relations = [d for d in relations if d.is_unloadable]
    target_lookup = {
        schema.name
        for schema in schemas if schema.is_an_unload_target
    }
    relations_lookup = {d.identifier: d for d in relations}

    for unloaded in unloaded_relations:
        try:
            if unloaded.unload_target not in target_lookup:
                raise TableDesignValidationError(
                    "invalid target '{}' in unloadable relation '{}'".format(
                        unloaded.unload_target, unloaded.identifier))
            else:
                logger.debug("Checking whether '%s' is loaded back in",
                             unloaded.identifier)
                reloaded = TableName(unloaded.unload_target,
                                     unloaded.target_table_name.table)
                if reloaded.identifier in relations_lookup:
                    relation = relations_lookup[reloaded.identifier]
                    logger.info(
                        "Checking for consistency between '%s' and '%s'",
                        unloaded.identifier, relation.identifier)
                    unloaded_columns = unloaded.unquoted_columns
                    reloaded_columns = relation.unquoted_columns
                    if unloaded_columns != reloaded_columns:
                        diff = get_list_difference(reloaded_columns,
                                                   unloaded_columns)
                        logger.error(
                            "Column difference detected between '%s' and '%s'",
                            unloaded.identifier, relation.identifier)
                        logger.error(
                            "You need to replace, insert and/or delete in '%s' some column(s): %s",
                            relation.identifier,
                            join_with_single_quotes(diff),
                        )
                        raise TableDesignValidationError(
                            "unloaded relation '%s' failed to match counterpart"
                            % unloaded.identifier)
        except TableDesignValidationError:
            if keep_going:
                _error_occurred.set()
                logger.exception(
                    "Ignoring failure to validate '%s' and proceeding as requested:",
                    unloaded.identifier)
            else:
                raise
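
To make the docstring's naming concrete (schema and table names are invented): a relation 'web.orders' with unload_target 'data_lake' is expected to be loaded back as 'data_lake.orders', with exactly the same column order.

# Hypothetical example, assuming TableName(schema, table) and an identifier of "<schema>.<table>":
reloaded = TableName("data_lake", "orders")
assert reloaded.identifier == "data_lake.orders"
# The column lists of web.orders and data_lake.orders must match element by element;
# a (symmetric) set difference would miss pure ordering changes.
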
Example #10
    def extract_sources(self) -> None:
        """Iterate over sources to be extracted and parallelize extraction at the source level."""
        self.logger.info("Starting to extract %d relation(s) in %d schema(s)",
                         len(self.relations), len(self.schemas))
        self.failed_sources.clear()
        max_workers = len(self.schemas)

        with concurrent.futures.ThreadPoolExecutor(
                max_workers=max_workers,
                thread_name_prefix="extract-source") as executor:
            futures = []
            for source_name, relation_group in groupby(
                    self.relations, attrgetter("source_name")):
                future = executor.submit(self.extract_source,
                                         self.schemas[source_name],
                                         list(relation_group))
                futures.append(future)
            if self.keep_going:
                done, not_done = concurrent.futures.wait(
                    futures, return_when=concurrent.futures.ALL_COMPLETED)
            else:
                done, not_done = concurrent.futures.wait(
                    futures, return_when=concurrent.futures.FIRST_EXCEPTION)
        if self.failed_sources:
            self.logger.error("Failed to extract from these source(s): %s",
                              join_with_single_quotes(self.failed_sources))

        # Note that iterating over the results of the futures may raise an exception, which
        # surfaces exceptions from the threads. This happens when there is (at least) one
        # required table that failed to extract.
        missing_tables: List[str] = []
        for future in done:
            missing_tables.extend(future.result())

        if missing_tables:
            self.logger.warning(
                "Failed to extract %d relation(s): %s",
                len(missing_tables),
                join_with_single_quotes(missing_tables),
            )
        if not_done:
            raise DataExtractError(
                "Extract failed to complete for {:d} source(s)".format(
                    len(not_done)))
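
The keep_going flag only changes when the pool stops waiting. A standalone sketch of the two wait modes (the worker function and its failure are made up):

import concurrent.futures


def _fake_extract(fail: bool) -> str:
    if fail:
        raise RuntimeError("extract failed")
    return "ok"


with concurrent.futures.ThreadPoolExecutor(max_workers=2) as executor:
    futures = [executor.submit(_fake_extract, True), executor.submit(_fake_extract, False)]
    # keep_going=True: wait for everything and collect failures afterwards.
    done, not_done = concurrent.futures.wait(futures, return_when=concurrent.futures.ALL_COMPLETED)
    # keep_going=False uses FIRST_EXCEPTION instead, so waiting may stop early and the
    # failing future's exception only surfaces once its result() is read.
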
Example #11
def filter_relations_for_sources(
        source_lookup: Dict[str, DataWarehouseSchema],
        relations: List[RelationDescription]) -> List[RelationDescription]:
    """Filter for the relations that a given "extract" stage cares about."""
    selected = [
        relation for relation in relations
        if relation.source_name in source_lookup
    ]
    if selected:
        sources = frozenset(relation.source_name for relation in selected)
        logger.info("Selected %d relation(s) from source(s): %s",
                    len(selected), join_with_single_quotes(sources))
    return selected
Example #12
def _promote_schemas(schemas: Iterable[DataWarehouseSchema],
                     from_where: str,
                     dry_run=False) -> None:
    """
    Promote (staging or backup) schemas into their standard names and permissions.

    Changes schema.from_name_attr -> schema.name; expects from_name_attr to be 'backup_name'
    or 'staging_name'
    """
    attr_name = from_where + "_name"
    assert attr_name in ("backup_name", "staging_name")
    from_name_schema_lookup = {
        getattr(schema, attr_name): schema
        for schema in schemas
    }

    dsn_etl = etl.config.get_dw_config().dsn_etl
    with closing(etl.db.connection(dsn_etl, autocommit=True,
                                   readonly=dry_run)) as conn:
        need_promotion = etl.db.select_schemas(conn,
                                               from_name_schema_lookup.keys())
        if not need_promotion:
            logger.info("Found no %s schemas to promote", from_where)
            return

        # Always log the original names, not the ones found in need_promotion.
        selected_names = join_with_single_quotes(
            from_name_schema_lookup[from_name].name
            for from_name in need_promotion)
        if dry_run:
            logger.info(
                "Dry-run: Skipping promotion of %d schema(s) from %s position: %s",
                len(need_promotion),
                from_where,
                selected_names,
            )
            return

        logger.info("Promoting %d schema(s) from %s position: %s",
                    len(need_promotion), from_where, selected_names)
        for from_name in need_promotion:
            schema = from_name_schema_lookup[from_name]
            logger.info("Renaming schema '%s' from '%s'", schema.name,
                        from_name)
            etl.db.drop_schema(conn, schema.name)
            etl.db.alter_schema_rename(conn, from_name, schema.name)
            logger.info(
                "Granting readers and writers access to schema '%s' after promotion",
                schema.name)
            grant_schema_permissions(conn, schema)
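
Given the assertion on attr_name, only two values of from_where are valid; a dry-run sketch with a hypothetical schemas list:

# schemas would be the configured DataWarehouseSchema objects.
_promote_schemas(schemas, from_where="staging", dry_run=True)  # staging_name -> name, log only
_promote_schemas(schemas, from_where="backup")                 # backup_name -> name, with grants
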
Example #13
def create_schema_and_grant_access(conn,
                                   schema,
                                   owner=None,
                                   use_staging=False,
                                   dry_run=False) -> None:
    group_names = join_with_single_quotes(schema.groups)
    name = schema.staging_name if use_staging else schema.name
    if dry_run:
        logger.info("Dry-run: Skipping creating schema '%s'", name)
    else:
        logger.info("Creating schema '%s'", name)
        etl.db.create_schema(conn, name, owner)
        etl.db.grant_all_on_schema_to_user(conn, name, schema.owner)
    if not schema.groups or use_staging:
        # Don't grant usage on staging schemas to readers/writers (if any)
        return None
    if dry_run:
        logger.info("Dry-run: Skipping granting access in '%s' to '%s'", name,
                    group_names)
    else:
        # Readers/writers are differentiated in table permissions, not schema permissions
        logger.info("Granting access in '%s' to %s", name, group_names)
        etl.db.grant_usage(conn, name, schema.groups)
Example #14
def validate_column_ordering(conn: Connection, relation: RelationDescription,
                             tmp_view_name: TempTableName) -> None:
    """Download the column order (using the temporary view) and compare with table design."""
    attributes = etl.design.bootstrap.fetch_attributes(conn, tmp_view_name)
    actual_columns = [attribute.name for attribute in attributes]

    if not actual_columns and tmp_view_name.is_late_binding_view:
        # Thanks to late-binding views it is not an error for a view to not be able to resolve
        # its columns.
        logger.warning(
            "Order of columns in design of '%s' cannot be validated because external table is missing",
            relation.identifier,
        )
        return

    # Identity columns are inserted after the query has been run, so skip them here.
    expected_columns = [
        column["name"] for column in relation.table_design["columns"]
        if not (column.get("skipped") or column.get("identity"))
    ]

    diff = get_list_difference(expected_columns, actual_columns)
    if diff:
        logger.error(
            "Order of columns in design of '%s' does not match result of running its query",
            relation.identifier)
        logger.error(
            "You need to replace, insert and/or delete in '%s' some column(s): %s",
            relation.identifier,
            join_with_single_quotes(diff),
        )
        raise TableDesignValidationError(
            "invalid columns or column order in '%s'" % relation.identifier)
    else:
        logger.info(
            "Order of columns in design of '%s' matches result of running SQL query",
            relation.identifier)
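
For illustration (column names invented): a design that lists the columns in a different order than the query returns them fails this validation.

# Hypothetical mismatch that would trip the check above:
expected_columns = ["id", "created_at", "amount"]  # from the table design (identity/skipped removed)
actual_columns = ["id", "amount", "created_at"]    # from fetch_attributes on the temporary view
# get_list_difference(expected_columns, actual_columns) would report the out-of-order column(s),
# and validate_column_ordering raises TableDesignValidationError for the relation.
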
Example #15
def validate_upstream_constraints(conn: Connection,
                                  table: RelationDescription) -> None:
    """
    Compare table constraints between database and table design file.

    Note that "natural_key" or "surrogate_key" constraints are not valid in upstream
    (source) tables. Also, a "primary_key" in upstream may be used as a "unique" constraint
    in the design (but not vice versa).
    """
    current_constraint = etl.design.bootstrap.fetch_constraints(
        conn, table.source_table_name)
    design_constraint = table.table_design.get("constraints", [])

    current_primary_key = frozenset(
        [col for c in current_constraint for col in c.get("primary_key", [])])
    current_uniques = [
        frozenset(c["unique"]) for c in current_constraint if "unique" in c
    ]

    design_primary_key = frozenset(
        [col for c in design_constraint for col in c.get("primary_key", [])])
    design_uniques = [
        frozenset(c["unique"]) for c in design_constraint if "unique" in c
    ]

    # We'll pluck from the not_used info and report if anything wasn't used in the design.
    not_used = deepcopy(current_constraint)

    if design_primary_key:
        if current_primary_key == design_primary_key:
            for i in range(len(not_used)):
                if "primary_key" in not_used[i]:
                    del not_used[i]
                    break
        elif current_primary_key:
            raise TableDesignValidationError(
                "the primary_key constraint in '%s' (%s) does not match upstream (%s)"
                % (
                    table.identifier,
                    join_with_single_quotes(design_primary_key),
                    join_with_single_quotes(current_primary_key),
                ))
        else:
            raise TableDesignValidationError(
                "the primary key constraint in '%s' (%s) is not enforced upstream"
                % (table.identifier,
                   join_with_single_quotes(design_primary_key)))

    for design_unique in design_uniques:
        if current_primary_key == design_unique:
            for i in range(len(not_used)):
                if "primary_key" in not_used[i]:
                    del not_used[i]
                    break
        if design_unique in current_uniques:
            for i in range(len(not_used)):
                if "unique" in not_used[i] and frozenset(
                        not_used[i]["unique"]) == design_unique:
                    del not_used[i]
                    break
        if current_primary_key != design_unique and design_unique not in current_uniques:
            raise TableDesignValidationError(
                "the unique constraint in '%s' (%s) is not enforced upstream" %
                (table.identifier, join_with_single_quotes(design_unique)))

    for constraint in not_used:
        for constraint_type, columns in constraint.items():
            logger.warning(
                "Upstream source has additional %s constraint (%s) for '%s'",
                constraint_type,
                join_with_single_quotes(columns),
                table.table_design["source_name"],
            )
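
Restating the docstring's rules with invented column names:

# Upstream enforces primary_key (user_id); either of these designs passes:
design_ok_1 = [{"primary_key": ["user_id"]}]  # exact match with upstream
design_ok_2 = [{"unique": ["user_id"]}]       # upstream primary key reused as a unique constraint
# This design raises TableDesignValidationError because upstream does not enforce it:
design_bad = [{"primary_key": ["email"]}]
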
Example #16
def validate_upstream_columns(conn: Connection,
                              table: RelationDescription) -> None:
    """
    Compare columns in upstream table to the table design file.

    It is an ERROR if the design lists columns that do not exist in the upstream table. Exceptions
    here are calculated columns (those starting with etl__) or columns that are marked as skipped.

    It causes a WARNING to have more columns in the upstream table than are defined in the design
    or to have columns skipped in the design that do not exist upstream.
    """
    source_table_name = table.source_table_name

    columns_info = etl.design.bootstrap.fetch_attributes(
        conn, source_table_name)
    if not columns_info:
        raise UpstreamValidationError(
            "table '%s' is gone or has no columns left" %
            source_table_name.identifier)
    logger.info("Found %d column(s) in relation '%s'", len(columns_info),
                source_table_name.identifier)

    current_columns = frozenset(column.name for column in columns_info)
    design_columns = frozenset(column["name"]
                               for column in table.table_design["columns"]
                               if not column["name"].startswith("etl__"))
    design_required_columns = frozenset(
        column["name"] for column in table.table_design["columns"] if
        column["name"] in design_columns and not column.get("skipped", False))

    missing_required_columns = design_required_columns.difference(
        current_columns)
    if missing_required_columns:
        raise UpstreamValidationError(
            "design of '%s' has columns that do not exist upstream: %s" %
            (source_table_name.identifier,
             join_with_single_quotes(missing_required_columns)))

    extra_design_columns = design_columns.difference(current_columns)
    if extra_design_columns:
        logger.warning(
            "Column(s) that are in the design of '%s' but do not exist upstream in '%s': %s",
            table.identifier,
            table.source_name,
            join_with_single_quotes(extra_design_columns),
        )

    missing_design_columns = current_columns.difference(design_columns)
    if missing_design_columns:
        logger.warning(
            "Column(s) that exist upstream in '%s' but not in the design '%s': %s",
            table.source_name,
            table.identifier,
            join_with_single_quotes(missing_design_columns),
        )

    current_is_not_null = {
        column.name
        for column in columns_info if column.not_null
    }
    for column in table.table_design["columns"]:
        if column.get(
                "not_null") and column["name"] not in current_is_not_null:
            raise TableDesignValidationError(
                "not null constraint of column '{}' in '{}' not enforced upstream"
                .format(column["name"], table.identifier))
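
A compact illustration of the error and warning cases, with made-up columns:

# Upstream has: id, name, created_at. The design has: id, name, legacy_flag (skipped),
# plus a calculated etl__loaded_at column which is excluded up front.
current_columns = frozenset({"id", "name", "created_at"})
design_columns = frozenset({"id", "name", "legacy_flag"})
design_required_columns = frozenset({"id", "name"})

assert not design_required_columns.difference(current_columns)        # no error
assert design_columns.difference(current_columns) == {"legacy_flag"}  # warning: only in design
assert current_columns.difference(design_columns) == {"created_at"}   # warning: only upstream
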
Example #17
    def __str__(self) -> str:
        # See __init__ for tests
        if len(self._patterns) == 0:
            return "['*.*']"
        else:
            return "[{}]".format(join_with_single_quotes(p.identifier for p in self._patterns))
Example #18
def create_index(relations: Sequence[RelationDescription],
                 groups: Iterable[str], with_columns: Optional[bool]) -> None:
    """
    Create an "index" page with Markdown that lists all schemas and their tables.

    The parameter groups filters schemas to those that can be accessed by those groups.
    """
    group_set = frozenset(groups)
    show_details = bool(with_columns)

    # We iterate over the list of relations so that we preserve their order with respect to schemas.
    schemas: Dict[str, dict] = OrderedDict()
    for relation in relations:
        if not group_set.intersection(relation.schema_config.reader_groups):
            continue
        schema_name = relation.target_table_name.schema
        if schema_name not in schemas:
            schemas[schema_name] = {
                "description": relation.schema_config.description,
                "relations": []
            }
        schemas[schema_name]["relations"].append(relation)

    if not schemas:
        logger.info("List of schemas is empty, selected groups: %s",
                    join_with_single_quotes(group_set))
        return

    print("# List Of Relations By Schema")
    for schema_name, schema_info in schemas.items():
        print(f"""\n## Schema: "{schema_name}"\n""")
        if schema_info["description"]:
            print(f"{schema_info['description']}\n")

        rows = ([relation.target_table_name.table, relation.description]
                for relation in schema_info["relations"])
        print(
            tabulate(rows,
                     headers=["Relation", "Description"],
                     tablefmt="pipe"))

        if not show_details:
            continue

        for relation in schema_info["relations"]:
            relation_kind = "View" if relation.is_view_relation else "Table"
            print(f"""\n### {relation_kind}: "{relation.identifier}"\n""")
            if relation.description:
                print(f"{relation.description}\n")

            key_columns: FrozenSet[str] = frozenset()
            for constraint in relation.table_design.get("constraints", []):
                for constraint_name, constraint_columns in constraint.items():
                    if constraint_name in ("primary_key", "surrogate_key"):
                        key_columns = frozenset(constraint_columns)
                        break
            rows = ([
                ":key:" if column["name"] in key_columns else "",
                column["name"],
                column.get("type", ""),
                column.get("description", ""),
            ] for column in relation.table_design["columns"])
            print(
                tabulate(rows,
                         headers=[
                             "Key?", "Column Name", "Column Type",
                             "Column Description"
                         ],
                         tablefmt="pipe"))