Example 1
0
    def test_render_test_template_with_macro(self):
        """The unique_rows test template renders the multi-column hash SQL."""
        sql_template = "unique_rows.sql"
        expected_contents = f"""
        with hashed_rows as (
            select
                md5(concat(
            coalesce(cast({test_column}_1 as varchar), ''),'|',coalesce(cast({test_column}_2 as varchar), ''))) as row_hash
            from {test_schema}.{test_entity}
        )
        select (count(*) - count(distinct row_hash)) as test_result
        from hashed_rows
        """.strip()

        render_args = {
            "schema": test_schema,
            "entity": test_entity,
            "kwargs": {
                # Two hashed columns, built from the shared test fixture name.
                "columns": [f"{test_column}_{suffix}" for suffix in (1, 2)]
            },
        }

        template_contents = self._render_template("tests", sql_template,
                                                  render_args)
        logger.info(template_contents)
        assert template_contents is not None
        assert strings_match_ignore_whitespace(template_contents,
                                               expected_contents)
Example 2
0
    def _process_database_schema(self, aggregation_type):
        """Run "tests" or "metrics" aggregations across every configured
        database/schema and return the collected results.

        Parameters
        ----------
        aggregation_type : str
            Either "tests" or "metrics"; selects the aggregator class and the
            key in each entity schema that holds the aggregation definitions.

        Returns
        -------
        list
            Aggregation result records from every processed entity.

        Raises
        ------
        KeyError
            If *aggregation_type* is not supported (raised before any
            database work is done).
        """
        supported_aggregators = {
            "tests": TestAggregator,
            "metrics": MetricsAggregator,
        }
        # Resolve once, outside the loops, so an unsupported type fails fast.
        aggregator_cls = supported_aggregators[aggregation_type]

        aggregation_results = []
        # Schema yml files are searched under ./output; hoisted out of the
        # loops (it was previously rebuilt per schema and shadowed by the
        # file-loop variable, which made the code hard to follow).
        output_root = Path("output")

        for database_name in self._databases:
            self._connector.database = database_name

            logger.info(f"Inspecting database {database_name}...")

            schemas = self._databases[database_name]
            if schemas is None:
                logger.info(
                    "No schemas specified, getting all schemas from database..."
                )
                schemas = self._connector.get_schema_names()

            for schema_name in schemas:

                schema_path = Path(self._project_name).joinpath(
                    database_name, schema_name, "*.yml")
                schema_files = sorted(output_root.glob(str(schema_path)))

                for schema_file in schema_files:

                    entity_schema = read_yaml(schema_file.resolve())

                    for entity in entity_schema["models"]:

                        entity_name = entity["name"]

                        entity_aggregator = aggregator_cls(self._connector,
                                                           self._project_name,
                                                           database_name,
                                                           schema_name,
                                                           entity_name)

                        # .get avoids the redundant `in`-then-index lookup;
                        # entities with no block of this type are skipped.
                        entity_aggregations = entity.get(aggregation_type)
                        if entity_aggregations:
                            entity_aggregation_results = entity_aggregator.run_entity_aggregations(
                                entity_aggregations)
                            aggregation_results += entity_aggregation_results

                        # Column-level aggregations always run; "columns" is
                        # assumed present in every entity schema.
                        columns = entity["columns"]
                        column_aggregation_results = entity_aggregator.run_column_aggregations(
                            columns)
                        aggregation_results += column_aggregation_results

        return aggregation_results
Example 3
0
def read_yaml(yaml_path, storage_model='local'):
    """Read a YAML file and return its parsed contents.

    Parameters
    ----------
    yaml_path : str or Path
        Path to the YAML file to load.
    storage_model : str
        Currently unused; presumably reserved for non-local storage
        backends -- confirm before removing.

    Raises
    ------
    FileNotFoundError
        Re-raised (after logging) when *yaml_path* does not exist.
    """
    resolved_path = Path(yaml_path).resolve()
    try:
        with open(resolved_path, 'r') as yaml_file:
            parsed = yaml.load(yaml_file, Loader=yaml.FullLoader)
    except FileNotFoundError:
        logger.info(f"Could not find {yaml_path}. Please check that {yaml_path} exists.")
        raise

    return parsed
Example 4
0
    def generate_entity_schema(self):
        """Fetch the entity's columns, build its schema YAML, write the
        YAML file, and return the YAML text."""
        columns = self._connector.get_columns(self._entity, self._schema)
        self._columns = columns

        logger.info(
            f"Generating schema for {self._database}.{self._schema}.{self._entity} ({len(self._columns)} columns)"
        )

        yml = self._make_schema_yaml()
        self._write_entity_schema_yaml(yml)
        return yml
Example 5
0
def main(
    action,
    project=None,
    config_file="config.yml",
    connections_file="connections.yml",
):
    """Entry point: run one schemaql action over the configured projects.

    Parameters
    ----------
    action : str
        One of "generate", "test" or "agg".
    project : str, optional
        Restrict the run to a single project; all projects when None.
    config_file : str
        Path to the main configuration YAML.
    connections_file : str
        Path to the connections YAML.

    Exits the process with the total failure count as the exit code.
    """
    actions = ("generate", "test", "agg")
    assert (action in actions), f"'{action}' is currently not supported"

    config = read_yaml(config_file)
    connections = read_yaml(connections_file)

    assert ("collector"
            in config), "'collector' needs to be specified in config.yml"

    collector_config = config["collector"]
    collector = _get_collector(collector_config, connections)

    projects_config = config["projects"]
    if project is not None:
        # Narrow the config to the single requested project.
        projects_config = {project: projects_config[project]}

    failures = 0

    for project_name in projects_config:

        connector, databases = _get_project_config(projects_config,
                                                   project_name, connections)

        # Local name chosen so the `project` CLI argument is not shadowed.
        current_project = Project(project_name, connector, databases)

        # Reset per iteration so a previous project's results are never
        # re-checked or re-saved when the current action yields none.
        results = None

        if action == "generate":
            current_project.generate_database_schema()

        elif action == "test":
            results = current_project.test_database_schema()

        elif action == "agg":
            results = current_project.aggregate_database_schema()

        if results:
            failures += _check_for_failures(results)
            collector.save_test_results(project_name, results)

    logger.info("Done!")

    # Non-zero exit when any task failed.
    sys.exit(failures)
Example 6
0
    def generate_database_schema(self):
        """Generates yaml output file for connection and databases"""
        for database in self._databases:

            logger.info(f"database: {database}")
            self._connector.database = database

            schemas = self._databases[database]
            logger.info(f"schemas: {schemas}")

            if schemas is None:
                logger.info(
                    "No schemas specified, getting all schemas from database..."
                )
                # NOTE(review): here get_schema_names takes the database name;
                # other call sites pass no argument -- confirm the connector
                # signature.
                schemas = self._connector.get_schema_names(database)

            for schema in schemas:
                logger.info(f"schema: {schema}")

                # Strip any schema prefix from table names
                # (this can happen on BigQuery).
                prefix = f"{schema}."
                table_names = [
                    name.replace(prefix, "")
                    for name in self._connector.get_table_names(schema)
                ]

                for table in table_names:
                    EntitySchemaGenerator(self._project_name, self._connector,
                                          database, schema,
                                          table).generate_entity_schema()
Example 7
0
def _check_for_failures(results):
    """Log a pass/fail summary for *results* and return the failure count.

    Parameters
    ----------
    results : list of dict
        Each dict has at least an "aggregation_passed" bool and an
        "aggregation_name" string.

    Returns
    -------
    int
        Number of failed aggregations (0 when all passed).
    """
    results_count = len(results)
    # Single pass over results: the original counted failures with sum()
    # and then filtered the same list again.
    failed_tasks = [a for a in results if not a["aggregation_passed"]]
    failures = len(failed_tasks)

    logger.info((LINE_WIDTH - 18) * "-")

    if failures == 0:
        logger.info(
            color_me(f"{results_count} task(s) ran successfully.", "green"))
    else:
        logger.info(
            color_me(f"{failures} failure(s) out of {results_count} task(s).",
                     "red"))
        for task in failed_tasks:
            task_desc = f'- {task["aggregation_name"]}'
            logger.info(task_desc)

    logger.info((LINE_WIDTH - 18) * "-")

    return failures
Example 8
0
 def engine(self):
     """Return the engine, lazily creating and caching it on first access."""
     if self._engine is not None:
         return self._engine
     self._engine = self._make_engine()
     logger.info(f"Connecting to {self.connector_type}...")
     return self._engine
Example 9
0
 def _log_result(self, bullet, aggregation_name, result_msg):
     """Log one aggregation outcome as a left/right aligned line."""
     left = self._format_left_align(f"{bullet} {aggregation_name}")
     right = self._format_right_align(result_msg)
     logger.info(left + right)
Example 10
0
 def engine(self):
     """Lazily build the engine once, log it, and return the cached value."""
     if self._engine is None:
         created = self._make_engine()
         self._engine = created
         logger.info(created)
     return self._engine
Example 11
0
 def log(self, context, msg):
     """Emit *msg* via the module logger and render as an empty string.

     NOTE(review): returning "" suggests this is exposed to templates so a
     {{ log(...) }} call produces no output -- confirm against the
     environment setup.
     """
     logger.info(msg)
     return ""