Example no. 1
def fetch_tables(cx: Connection, source: DataWarehouseSchema,
                 selector: TableSelector) -> List[TableName]:
    """
    Retrieve tables (matching the selector) for this source and return them as a list of TableName instances.

    The :source configuration contains an "allowlist" (which tables to include) and a
    "denylist" (which tables to exclude). Note that "exclude" always overrides "include."
    The list of tables matching the allowlist but not the denylist can be further narrowed
    down by the pattern in :selector.
    """
    # Look for relations ('r', ordinary tables), materialized views ('m'), and views ('v').
    result = etl.db.query(
        cx,
        """
        SELECT nsp.nspname AS "schema"
             , cls.relname AS "table"
          FROM pg_catalog.pg_class AS cls
          JOIN pg_catalog.pg_namespace AS nsp ON cls.relnamespace = nsp.oid
         WHERE cls.relname NOT LIKE 'tmp%%'
           AND cls.relname NOT LIKE 'pg_%%'
           AND cls.relkind IN ('r', 'm', 'v')
         ORDER BY nsp.nspname
                , cls.relname
         """,
    )
    found = []
    for row in result:
        source_table_name = TableName(row["schema"], row["table"])
        target_table_name = TableName(source.name, row["table"])
        for reject_pattern in source.exclude_tables:
            if source_table_name.match_pattern(reject_pattern):
                logger.debug("Table '%s' matches denylist",
                             source_table_name.identifier)
                break
        else:
            for accept_pattern in source.include_tables:
                if source_table_name.match_pattern(accept_pattern):
                    if selector.match(target_table_name):
                        found.append(source_table_name)
                        logger.debug("Table '%s' is included in result set",
                                     source_table_name.identifier)
                        break
                    else:
                        logger.debug(
                            "Table '%s' matches allowlist but is not selected",
                            source_table_name.identifier)
    logger.info(
        "Found %d table(s) matching patterns; allowlist=%s, denylist=%s, subset='%s'",
        len(found),
        source.include_tables,
        source.exclude_tables,
        selector,
    )
    return found
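
The allowlist/denylist precedence described in the docstring boils down to "a denylist match always wins." Here is a minimal, self-contained sketch of that rule, using fnmatch as a stand-in for TableName.match_pattern (an assumption about its glob-style matching), not the project's implementation:

from fnmatch import fnmatch

def is_selected(identifier, include_patterns, exclude_patterns):
    # "Exclude" always overrides "include": a single denylist hit rejects the table.
    if any(fnmatch(identifier, pattern) for pattern in exclude_patterns):
        return False
    return any(fnmatch(identifier, pattern) for pattern in include_patterns)

# Matches only the allowlist -> included; matches both lists -> the denylist wins.
assert is_selected("public.orders", ["public.*"], ["public.tmp_*"])
assert not is_selected("public.tmp_orders", ["public.*"], ["public.tmp_*"])
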
Example no. 2
def build_column_description(column: Dict[str, str],
                             skip_identity=False,
                             skip_references=False) -> str:
    """
    Return the description of a table column, suitable for use in a CREATE TABLE statement.
    See build_columns.

    >>> build_column_description({"name": "key", "sql_type": "int", "not_null": True, "encoding": "raw"})
    '"key" int ENCODE raw NOT NULL'
    >>> build_column_description({"name": "my_key", "sql_type": "int", "references": ["sch.ble", ["key"]]})
    '"my_key" int REFERENCES "sch"."ble" ( "key" )'
    """
    column_ddl = '"{name}" {sql_type}'.format(**column)
    if column.get("identity", False) and not skip_identity:
        column_ddl += " IDENTITY(1, 1)"
    if "encoding" in column:
        column_ddl += " ENCODE {}".format(column["encoding"])
    if column.get("not_null", False):
        column_ddl += " NOT NULL"
    if "references" in column and not skip_references:
        [table_identifier, [foreign_column]] = column["references"]
        foreign_name = TableName.from_identifier(table_identifier)
        column_ddl += ' REFERENCES {} ( "{}" )'.format(foreign_name,
                                                       foreign_column)
    return column_ddl
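
The effect of the skip_identity and skip_references flags can be read straight off the function body. A few doctest-style calls against the function above (expected output derived from the code shown, not verified against a database):

>>> build_column_description({"name": "id", "sql_type": "bigint", "identity": True})
'"id" bigint IDENTITY(1, 1)'
>>> build_column_description({"name": "id", "sql_type": "bigint", "identity": True}, skip_identity=True)
'"id" bigint'
>>> build_column_description({"name": "my_key", "sql_type": "int", "references": ["sch.ble", ["key"]]}, skip_references=True)
'"my_key" int'
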
Example no. 3
def fetch_dependencies(cx: Connection, table_name: TableName) -> List[str]:
    """
    Look up dependencies (other relations).

    Note that this will return an empty list for a late-binding view.
    """
    # See https://github.com/awslabs/amazon-redshift-utils/blob/master/src/AdminViews/v_constraint_dependency.sql
    stmt = """
        SELECT DISTINCT
               target_ns.nspname AS "schema"
             , target_cls.relname AS "table"
          FROM pg_catalog.pg_class AS cls
          JOIN pg_catalog.pg_namespace AS ns ON cls.relnamespace = ns.oid
          JOIN pg_catalog.pg_depend AS dep ON cls.oid = dep.refobjid
          JOIN pg_catalog.pg_depend AS target_dep ON dep.objid = target_dep.objid
          JOIN pg_catalog.pg_class AS target_cls ON target_dep.refobjid = target_cls.oid
                                                AND cls.oid <> target_cls.oid
          JOIN pg_catalog.pg_namespace AS target_ns ON target_cls.relnamespace = target_ns.oid
         WHERE ns.nspname LIKE %s
           AND cls.relname = %s
         ORDER BY "schema", "table"
        """
    dependencies = etl.db.query(cx, stmt,
                                (table_name.schema, table_name.table))
    return [TableName(**row).identifier for row in dependencies]
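
The final list comprehension turns each result row into a dotted identifier. A tiny stand-in to show the return shape (the TableName below is a hypothetical simplification, not the project's class):

from typing import NamedTuple

class TableName(NamedTuple):
    # Hypothetical stand-in for the project's TableName, reduced to what this example needs.
    schema: str
    table: str

    @property
    def identifier(self) -> str:
        return f"{self.schema}.{self.table}"

rows = [{"schema": "public", "table": "orders"}, {"schema": "public", "table": "users"}]
print([TableName(**row).identifier for row in rows])  # ['public.orders', 'public.users']
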
Example no. 4
def dependencies(self) -> FrozenSet[TableName]:
    if self._dependencies is None:
        dependent_table_names = [
            TableName.from_identifier(d)
            for d in self.table_design.get("depends_on", [])
        ]
        self._dependencies = frozenset(dependent_table_names)
    return self._dependencies
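
The accessor above lazily computes the dependency set once and caches it on the instance. A self-contained sketch of the same pattern (RelationSketch is a made-up class, not the project's RelationDescription):

from typing import FrozenSet, Optional

class RelationSketch:
    def __init__(self, table_design: dict) -> None:
        self.table_design = table_design
        self._dependencies: Optional[FrozenSet[str]] = None

    def dependencies(self) -> FrozenSet[str]:
        # Compute the frozenset from the design once, then reuse the cached value.
        if self._dependencies is None:
            self._dependencies = frozenset(self.table_design.get("depends_on", []))
        return self._dependencies

relation = RelationSketch({"depends_on": ["dw.orders", "dw.users"]})
assert relation.dependencies() is relation.dependencies()  # same cached object on every call
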
Example no. 5
def validate_reload(schemas: List[DataWarehouseSchema],
                    relations: List[RelationDescription], keep_going: bool):
    """
    Verify that columns between unloaded tables and reloaded tables are the same.

    Once the designs are validated, we can unload a relation 's.t' with a target 'u' and
    then extract and load it back into 'u.t'.

    Note that the order of these column lists matters. (This is also why we cannot simply take
    the (symmetric) set difference between the columns but must compare the lists carefully.)
    """
    unloaded_relations = [d for d in relations if d.is_unloadable]
    target_lookup = {
        schema.name
        for schema in schemas if schema.is_an_unload_target
    }
    relations_lookup = {d.identifier: d for d in relations}

    for unloaded in unloaded_relations:
        try:
            if unloaded.unload_target not in target_lookup:
                raise TableDesignValidationError(
                    "invalid target '{}' in unloadable relation '{}'".format(
                        unloaded.unload_target, unloaded.identifier))
            else:
                logger.debug("Checking whether '%s' is loaded back in",
                             unloaded.identifier)
                reloaded = TableName(unloaded.unload_target,
                                     unloaded.target_table_name.table)
                if reloaded.identifier in relations_lookup:
                    relation = relations_lookup[reloaded.identifier]
                    logger.info(
                        "Checking for consistency between '%s' and '%s'",
                        unloaded.identifier, relation.identifier)
                    unloaded_columns = unloaded.unquoted_columns
                    reloaded_columns = relation.unquoted_columns
                    if unloaded_columns != reloaded_columns:
                        diff = get_list_difference(reloaded_columns,
                                                   unloaded_columns)
                        logger.error(
                            "Column difference detected between '%s' and '%s'",
                            unloaded.identifier, relation.identifier)
                        logger.error(
                            "You need to replace, insert and/or delete in '%s' some column(s): %s",
                            relation.identifier,
                            join_with_single_quotes(diff),
                        )
                        raise TableDesignValidationError(
                            "unloaded relation '%s' failed to match counterpart"
                            % unloaded.identifier)
        except TableDesignValidationError:
            if keep_going:
                _error_occurred.set()
                logger.exception(
                    "Ignoring failure to validate '%s' and proceeding as requested:",
                    unloaded.identifier)
            else:
                raise
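
The docstring's point about ordering is easy to demonstrate: two column lists can be equal as sets yet still fail the reload check. A small illustration in plain Python (not the project's get_list_difference helper):

unloaded_columns = ["id", "created_at", "amount"]
reloaded_columns = ["id", "amount", "created_at"]

# The symmetric set difference is empty even though the orders disagree ...
assert set(unloaded_columns) ^ set(reloaded_columns) == set()
# ... so the validation must compare the lists position by position instead.
assert unloaded_columns != reloaded_columns
mismatches = [
    (position, left, right)
    for position, (left, right) in enumerate(zip(unloaded_columns, reloaded_columns))
    if left != right
]
print(mismatches)  # [(1, 'created_at', 'amount'), (2, 'amount', 'created_at')]
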
Example no. 6
def create_table_designs_from_source(source,
                                     selector,
                                     local_dir,
                                     local_files,
                                     dry_run=False):
    """
    Create table design files in the local directory for tables from a single source.
    Whenever table designs already exist locally, validate them against the information found upstream.
    """
    source_dir = os.path.join(local_dir, source.name)
    normalize_and_create(source_dir, dry_run=dry_run)

    source_files = {
        file_set.source_table_name: file_set
        for file_set in local_files
        if file_set.source_name == source.name and file_set.design_file_name
    }
    try:
        logger.info("Connecting to database source '%s' to look for tables",
                    source.name)
        with closing(
                etl.db.connection(source.dsn, autocommit=True,
                                  readonly=True)) as conn:
            source_tables = fetch_tables(conn, source, selector)
            for source_table_name in source_tables:
                if source_table_name in source_files:
                    logger.info(
                        "Skipping '%s' from source '%s' because table design file exists: '%s'",
                        source_table_name.identifier, source.name,
                        source_files[source_table_name].design_file_name)
                else:
                    target_table_name = TableName(source.name,
                                                  source_table_name.table)
                    table_design = create_table_design_for_source(
                        conn, source_table_name, target_table_name)
                    save_table_design(source_dir,
                                      source_table_name,
                                      target_table_name,
                                      table_design,
                                      dry_run=dry_run)
        logger.info("Done with %d table(s) from source '%s'",
                    len(source_tables), source.name)
    except Exception:
        logger.error("Error while processing source '%s'", source.name)
        raise

    existent = frozenset(name.identifier for name in source_files)
    upstream = frozenset(name.identifier for name in source_tables)
    not_found = upstream.difference(existent)
    if not_found:
        logger.warning("New table(s) in source '%s' without local design: %s",
                       source.name, join_with_quotes(not_found))
    too_many = existent.difference(upstream)
    if too_many:
        logger.error("Local table design(s) without table in source '%s': %s",
                     source.name, join_with_quotes(too_many))

    return len(source_tables)
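
The reconciliation at the end compares the set of locally known tables with what was found upstream; the two set differences drive a warning versus an error. A tiny illustration with made-up identifiers:

existent = frozenset({"public.orders", "public.legacy"})   # tables with a local design file
upstream = frozenset({"public.orders", "public.users"})    # tables found in the source database

print(upstream - existent)  # new upstream tables without a local design -> warning
print(existent - upstream)  # local designs whose upstream table is gone -> error
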
Example no. 7
def _find_file_sets_from(iterable, selector):
    """
    Return a list of file sets ordered by (target) schema name, (source) schema_name, and (source) table_name.
    Remember that the (target) schema name is the same as the source name (for upstream sources).
    The selector's base schemas (if present) will override alphabetical sorting for the source_name.
    """
    target_map = {}

    # Always return files sorted by sources (in original order) and target name.
    schema_index = {
        name: index
        for index, name in enumerate(selector.base_schemas)
    }

    for filename, values in _find_matching_files_from(iterable, selector):
        source_table_name = TableName(values["schema_name"],
                                      values["table_name"])
        target_table_name = TableName(values["source_name"],
                                      values["table_name"])

        if target_table_name.identifier not in target_map:
            natural_order = schema_index.get(
                values["source_name"]), source_table_name.identifier
            target_map[target_table_name.identifier] = TableFileSet(
                source_table_name, target_table_name, natural_order)

        file_set = target_map[target_table_name.identifier]
        file_type = values["file_type"]
        if file_type == "yaml":
            file_set.design_file_name = filename
        elif file_type == "sql":
            file_set.sql_file_name = filename
        elif file_type == "manifest":
            file_set.manifest_file_name = filename
        elif file_type == "data":
            file_set.add_data_file(filename)

    file_sets = sorted(target_map.values())
    logger.info("Found %d matching file(s) for %d table(s)",
                sum(len(fs) for fs in file_sets), len(file_sets))
    return file_sets
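
The "natural order" tuple sorts file sets by the selector's schema order first and only then alphabetically by source table. A minimal sketch of that sort key (base_schemas and the identifiers here are made up):

base_schemas = ["events", "store", "dw"]  # order as given by the selector (assumed)
schema_index = {name: index for index, name in enumerate(base_schemas)}

identifiers = [("store", "public.orders"), ("dw", "dw.orders"), ("events", "kinesis.stream")]
ordered = sorted(identifiers, key=lambda pair: (schema_index[pair[0]], pair[1]))
print(ordered)  # [('events', 'kinesis.stream'), ('store', 'public.orders'), ('dw', 'dw.orders')]
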
Example no. 8
def _find_matching_files_from(iterable, pattern, return_success_file=False):
    """
    Match file names against the target pattern and the expected path format, and return
    file information based on source name, source schema, table name, and file type.

    This generator provides all files that are possibly relevant and it
    is up to the consumer to ensure consistency (e.g. that a design
    file exists or that a SQL file is not present along with a manifest).

    Files ending in '_SUCCESS' or '_$folder$' (created by some Spark jobs) are ignored.
    """
    file_names_re = re.compile(
        r"""(?:^schemas|/schemas|^data|/data)
            /(?P<source_name>\w+)
            /(?P<schema_name>\w+)-(?P<table_name>\w+)
            (?:(?P<file_ext>\.yaml|\.sql|\.manifest|/csv/(?:part-.*(?:\.gz)?|_SUCCESS)))$
        """,
        re.VERBOSE,
    )

    for filename in iterable:
        match = file_names_re.search(filename)
        if match:
            values = match.groupdict()
            target_table_name = TableName(values["source_name"],
                                          values["table_name"])
            if pattern.match(target_table_name):
                file_ext = values["file_ext"]
                if file_ext in [".yaml", ".sql", ".manifest"]:
                    values["file_type"] = file_ext[1:]
                elif file_ext.endswith("_SUCCESS"):
                    values["file_type"] = "success"
                elif file_ext.startswith("/csv"):
                    values["file_type"] = "data"
                # E.g. when deleting files out of a folder we want to know about the /csv/_SUCCESS file.
                if return_success_file or values["file_type"] != "success":
                    yield (filename, values)
        elif not filename.endswith("_$folder$"):
            logger.warning("Found file not matching expected format: '%s'",
                           filename)
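
The regular expression encodes the layout "schemas/<source>/<schema>-<table><ext>" (and the same under data/, with a /csv/ suffix for data files). A quick, standalone check of what the named groups capture for a design file:

import re

file_names_re = re.compile(
    r"""(?:^schemas|/schemas|^data|/data)
        /(?P<source_name>\w+)
        /(?P<schema_name>\w+)-(?P<table_name>\w+)
        (?:(?P<file_ext>\.yaml|\.sql|\.manifest|/csv/(?:part-.*(?:\.gz)?|_SUCCESS)))$
    """,
    re.VERBOSE,
)

match = file_names_re.search("/schemas/store/public-orders.yaml")
print(match.groupdict())
# {'source_name': 'store', 'schema_name': 'public', 'table_name': 'orders', 'file_ext': '.yaml'}
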
Example no. 9
def _find_matching_files_from(iterable, pattern):
    """
    Return file information based on source name, source schema, table name and file type.

    This generator provides all files that are relevant and match the pattern. It
    is up to the consumer to ensure consistency (e.g. that a design
    file exists or that a SQL file is not present along with a manifest).

    Files ending in '_$folder$' are ignored. (They are created by some Spark jobs.)

    >>> found = list(_find_matching_files_from([
    ...     "/schemas/store/public-orders.yaml",
    ...     "/data/store/public-orders.manifest",
    ...     "/data/store/public-orders/csv/_SUCCESS",
    ...     "/data/store/public-orders/csv/part-0.gz",
    ...     "/schemas/dw/orders.sql",
    ...     "/schemas/dw/orders.yaml",
    ...     "/data/events/kinesis-stream/json/part-0.gz",
    ...     "/schemas/store/not-selected.yaml",
    ... ], pattern=TableSelector(["dw.*", "events", "store.orders"])))
    >>> files = {file_info.filename: file_info for file_info in found}
    >>> len(files)
    7
    >>> files["/schemas/store/public-orders.yaml"].file_type
    'yaml'
    >>> files["/schemas/store/public-orders.yaml"].schema_name
    'store'
    >>> files["/schemas/store/public-orders.yaml"].source_schema_name
    'public'
    >>> files["/schemas/store/public-orders.yaml"].table_name
    'orders'
    >>> files["/data/store/public-orders.manifest"].file_type
    'manifest'
    >>> files["/data/store/public-orders/csv/_SUCCESS"].file_type
    'success'
    >>> files["/data/store/public-orders/csv/part-0.gz"].file_type
    'data'
    >>> files["/data/store/public-orders/csv/part-0.gz"].file_format
    'CSV'
    >>> files["/schemas/dw/orders.sql"].schema_name
    'dw'
    >>> files["/schemas/dw/orders.sql"].source_schema_name
    'dw'
    >>> files["/schemas/dw/orders.sql"].table_name
    'orders'
    >>> files["/schemas/dw/orders.sql"].file_type
    'sql'
    >>> files["/schemas/dw/orders.yaml"].file_type
    'yaml'
    >>> files["/data/events/kinesis-stream/json/part-0.gz"].file_format
    'JSON'
    """
    for filename in iterable:
        file_info = FileInfo.from_filename(filename)
        if file_info is None:
            if not filename.endswith("_$folder$"):
                logger.warning("Found file not matching expected format: '%s'", filename)
            continue

        target_table_name = TableName(file_info.schema_name, file_info.table_name)
        if pattern.match(target_table_name):
            yield file_info
Example no. 10
def create_table_designs_from_source(source,
                                     selector,
                                     local_dir,
                                     relations,
                                     update=False,
                                     replace=False,
                                     dry_run=False):
    """
    Create table design files for tables from a single upstream source to local directory.

    Whenever table designs already exist locally, validate, update, or replace them using
    the information found in the upstream source.
    """
    source_dir = os.path.join(local_dir, source.name)
    normalize_and_create(source_dir, dry_run=dry_run)
    relation_lookup = {
        relation.source_table_name: relation
        for relation in relations
    }

    try:
        logger.info("Connecting to database source '%s' to look for tables",
                    source.name)
        with closing(
                etl.db.connection(source.dsn, autocommit=True,
                                  readonly=True)) as conn:
            source_tables = fetch_tables(conn, source, selector)
            for source_table_name in source_tables:
                if source_table_name in relation_lookup and not (update
                                                                 or replace):
                    logger.info(
                        "Skipping '%s' from source '%s' because table design already exists: '%s'",
                        source_table_name.identifier,
                        source.name,
                        relation_lookup[source_table_name].design_file_name,
                    )
                    continue
                relation = relation_lookup.get(
                    source_table_name) if update else None
                target_table_name = TableName(source.name,
                                              source_table_name.table)
                table_design = create_table_design_for_source(
                    conn, source_table_name, target_table_name, relation)
                if relation is not None and relation.table_design == table_design:
                    logger.info(
                        f"No updates detected in table design for {target_table_name:x}, skipping write"
                    )
                    continue
                filename = os.path.join(
                    source_dir,
                    f"{source_table_name.schema}-{source_table_name.table}.yaml"
                )
                save_table_design(target_table_name,
                                  table_design,
                                  filename,
                                  overwrite=update or replace,
                                  dry_run=dry_run)

        logger.info("Done with %d table(s) from source '%s'",
                    len(source_tables), source.name)
    except Exception:
        logger.error("Error while processing source '%s'", source.name)
        raise

    existent = frozenset(relation.source_table_name.identifier
                         for relation in relations)
    upstream = frozenset(name.identifier for name in source_tables)
    not_found = upstream.difference(existent)
    if not_found:
        logger.warning("New table(s) in source '%s' without local design: %s",
                       source.name, join_with_single_quotes(not_found))
    too_many = existent.difference(upstream)
    if too_many:
        logger.error("Local table design(s) without table in source '%s': %s",
                     source.name, join_with_single_quotes(too_many))

    return len(source_tables)
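
The interplay of update, replace, an existing design, and an unchanged design determines whether a file is written. A hypothetical summary of that control flow (planned_action is not a project function, just a condensed restatement of the branches above):

def planned_action(design_exists: bool, update: bool, replace: bool, design_unchanged: bool) -> str:
    if design_exists and not (update or replace):
        return "skip: design file already exists"
    if update and design_exists and design_unchanged:
        return "skip: no updates detected"
    return "write design file"

print(planned_action(design_exists=True, update=False, replace=False, design_unchanged=True))
# skip: design file already exists
print(planned_action(design_exists=True, update=True, replace=False, design_unchanged=False))
# write design file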