def set_required_relations(relations: List[RelationDescription],
                           required_selector: TableSelector) -> None:
    """
    Set the required property of the relations if they are directly or indirectly feeding
    into relations selected by the :required_selector.

    Every relation in :relations has its private ``_is_required`` flag reset, then the flag
    is set on (1) relations matching :required_selector and (2) any relation that some
    already-required relation depends on, directly or transitively.
    """
    logger.info(
        "Loading table design for %d relation(s) to mark required relations",
        len(relations))
    ordered_descriptions = order_by_dependencies(relations)
    # Start with all descriptions that are matching the required selector
    required_relations = [
        description for description in ordered_descriptions
        if required_selector.match(description.target_table_name)
    ]
    # Walk through descriptions in reverse dependency order, expanding required set based
    # on dependency fan-out. Skip descriptions already in the set so that a relation that
    # both matches the selector and feeds a required relation is not counted twice.
    for description in reversed(ordered_descriptions):
        if description in required_relations:
            continue
        if any(description.target_table_name in required.dependencies
               for required in required_relations):
            required_relations.append(description)

    # Reset every flag first so stale True values from a previous run cannot survive.
    for relation in ordered_descriptions:
        relation._is_required = False
    for relation in required_relations:
        relation._is_required = True

    logger.info("Marked %d relation(s) as required based on selector: %s",
                len(required_relations), required_selector)
def find_matches(relations: Sequence[RelationDescription],
                 selector: TableSelector):
    """Return list of matching relations."""
    matched = []
    for candidate in relations:
        if selector.match(candidate.target_table_name):
            matched.append(candidate)
    return matched
def fetch_tables(cx: Connection, source: DataWarehouseSchema,
                 selector: TableSelector) -> List[TableName]:
    """
    Retrieve tables (matching selector) for this source, return as a list of TableName instances.

    The :source configuration contains an "allowlist" (which tables to include) and a
    "denylist" (which tables to exclude). Note that "exclude" always overrides "include."
    The list of tables matching the allowlist but not the denylist can be further narrowed
    down by the pattern in :selector.
    """
    # Look for relations ('r', ordinary tables), materialized views ('m'), and views ('v').
    rows = etl.db.query(
        cx,
        """
        SELECT nsp.nspname AS "schema"
             , cls.relname AS "table"
          FROM pg_catalog.pg_class AS cls
          JOIN pg_catalog.pg_namespace AS nsp ON cls.relnamespace = nsp.oid
         WHERE cls.relname NOT LIKE 'tmp%%'
           AND cls.relname NOT LIKE 'pg_%%'
           AND cls.relkind IN ('r', 'm', 'v')
         ORDER BY nsp.nspname
                , cls.relname
         """,
    )
    found = []
    for row in rows:
        source_table_name = TableName(row["schema"], row["table"])
        target_table_name = TableName(source.name, row["table"])
        # Denylist wins unconditionally: any matching exclude pattern drops the table.
        if any(source_table_name.match_pattern(pattern) for pattern in source.exclude_tables):
            logger.debug("Table '%s' matches denylist",
                         source_table_name.identifier)
            continue
        for pattern in source.include_tables:
            if not source_table_name.match_pattern(pattern):
                continue
            if selector.match(target_table_name):
                found.append(source_table_name)
                logger.debug("Table '%s' is included in result set",
                             source_table_name.identifier)
                break
            # Allowed by the source config but filtered out by the narrower selector.
            logger.debug(
                "Table '%s' matches allowlist but is not selected",
                source_table_name.identifier)
    logger.info(
        "Found %d table(s) matching patterns; allowlist=%s, denylist=%s, subset='%s'",
        len(found),
        source.include_tables,
        source.exclude_tables,
        selector,
    )
    return found
# Example #4
def find_data_files_in_s3(bucket_name: str, prefix: str) -> Iterator[str]:
    """Return paths of data files."""
    listing = etl.s3.list_objects_for_prefix(bucket_name, prefix)
    yield from (
        info.filename
        for info in _find_matching_files_from(listing, TableSelector())
        if info.file_type == "data"
    )