Example #1
    def analyze(self, warehouse: Warehouse, table_name: str):
        dialect = warehouse.dialect
        qualified_table_name = dialect.qualify_table_name(table_name)

        analyze_results: List[ColumnAnalysisResult] = []

        sql = dialect.sql_columns_metadata_query(table_name)
        column_tuples = warehouse.sql_fetchall(sql)
        for column_tuple in column_tuples:
            column_name = column_tuple[0]
            source_type = column_tuple[1]

            column_analysis_result = ColumnAnalysisResult(
                column_name=column_name, source_type=source_type)
            analyze_results.append(column_analysis_result)

            qualified_column_name = dialect.qualify_column_name(column_name)

            if dialect.is_text(source_type):
                column_analysis_result.is_text = True

                validity_format_count_fields = []
                validity_counts = []
                for validity_format in Validity.FORMATS:
                    format_regex = Validity.FORMATS[validity_format]
                    validity_counts.append({'format': validity_format})
                    qualified_regex = dialect.qualify_regex(format_regex)
                    regexp_like = dialect.sql_expr_regexp_like(
                        qualified_column_name, qualified_regex)
                    count_field = f'COUNT(CASE WHEN {regexp_like} THEN 1 END)'
                    validity_format_count_fields.append(count_field)

                # One query per text column: a conditional count per validity
                # format, plus the total non-null value count as the last field.
                row = warehouse.sql_fetchone(
                    'SELECT \n  ' +
                    (',\n  '.join(validity_format_count_fields)) + ',\n'
                    f'  COUNT({qualified_column_name}) \n'
                    f'FROM {qualified_table_name} \n'
                    f'LIMIT 1000')

                values_count = row[len(validity_counts)]
                column_analysis_result.values_count = values_count

                if values_count > 0:
                    for i in range(len(validity_counts)):
                        validity_count = validity_counts[i]
                        validity_count['count'] = row[i]

                    sorted_validity_counts = sorted(validity_counts,
                                                    key=lambda c: c['count'],
                                                    reverse=True)
                    most_frequent_validity_format = sorted_validity_counts[0]
                    valid_count = most_frequent_validity_format['count']
                    column_analysis_result.valid_count = valid_count

                    if valid_count > (values_count / 2):
                        column_analysis_result.validity_format = most_frequent_validity_format[
                            'format']

        return analyze_results
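
A minimal usage sketch for this method, assuming a soda-sql Warehouse has already been built from a warehouse YAML file (as in Example #3) and that the method lives on DatasetAnalyzer, the class Example #3 instantiates; the table name is illustrative:

analyzer = DatasetAnalyzer()
for result in analyzer.analyze(warehouse, 'CUSTOMERS'):
    # Each ColumnAnalysisResult carries the column name, its source type and,
    # for text columns, the most frequent validity format (if one matched the
    # majority of values), plus the counts behind that decision.
    print(result.column_name, result.source_type,
          result.validity_format, result.valid_count, result.values_count)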
Example #2
    def execute_metric(self,
                       warehouse: Warehouse,
                       metric: dict,
                       scan_dict: dict = None):
        dialect = warehouse.dialect
        if not scan_dict:
            scan_dict = {}
        if KEY_TABLE_NAME not in scan_dict:
            scan_dict[KEY_TABLE_NAME] = self.default_test_table_name
        scan_configuration_parser = ScanYmlParser(scan_dict, 'test-scan')
        scan_configuration_parser.assert_no_warnings_or_errors()
        scan = warehouse.create_scan(
            scan_yml=scan_configuration_parser.scan_yml)
        scan.close_warehouse = False
        scan.execute()

        fields: List[str] = []
        group_by_column_names: List[str] = metric.get('groupBy')
        if group_by_column_names:
            for group_by_column in group_by_column_names:
                fields.append(dialect.qualify_column_name(group_by_column))

        column_name: str = metric.get('columnName')
        qualified_column_name = dialect.qualify_column_name(column_name)

        metric_type = metric['type']
        if metric_type == Metric.ROW_COUNT:
            fields.append('COUNT(*)')
        elif metric_type == Metric.MIN:
            fields.append(f'MIN({qualified_column_name})')
        elif metric_type == Metric.MAX:
            fields.append(f'MAX({qualified_column_name})')
        elif metric_type == Metric.SUM:
            fields.append(f'SUM({qualified_column_name})')

        sql = 'SELECT \n  ' + ',\n  '.join(fields) + ' \n' \
              'FROM ' + scan.qualified_table_name

        where_clauses = []

        metric_filter = metric.get('filter')
        if metric_filter:
            where_clauses.append(dialect.sql_expression(metric_filter))

        scan_column: ScanColumn = scan.scan_columns.get(column_name)
        if scan_column and scan_column.non_missing_and_valid_condition:
            where_clauses.append(scan_column.non_missing_and_valid_condition)

        if where_clauses:
            sql += '\nWHERE ' + '\n      AND '.join(where_clauses)

        if group_by_column_names:
            sql += '\nGROUP BY ' + ', '.join(group_by_column_names)

        return warehouse.sql_fetchall(sql)
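
A usage sketch for this test helper, assuming it lives on a test case that already holds a warehouse and a default_test_table_name; the metric dict uses the keys the method reads (type, columnName, groupBy, filter), and the column, group-by and table names are illustrative:

rows = self.execute_metric(
    self.warehouse,
    metric={
        'type': Metric.MIN,
        'columnName': 'size',
        'groupBy': ['country']
    },
    scan_dict={KEY_TABLE_NAME: 'demodata'})
for row in rows:
    # One row per group: the group-by column(s) first, then the aggregate.
    print(row)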
Example #3
def analyze(warehouse_file: str, include: str, exclude: str):
    """
    Analyzes tables in the warehouse and creates scan YAML files based on the data in the table. By default it creates
    files in a subdirectory called "tables" on the same level as the warehouse file.

    WAREHOUSE_FILE contains the connection details to the warehouse. This file can be created using the `soda create` command.
    The warehouse file argument is optional and defaults to 'warehouse.yml'.
    """
    logging.info(SODA_SQL_VERSION)
    file_system = FileSystemSingleton.INSTANCE
    warehouse = None

    try:
        logging.info(f'Analyzing {warehouse_file} ...')

        warehouse_yml_dict = read_warehouse_yml_file(warehouse_file)
        warehouse_yml_parser = WarehouseYmlParser(warehouse_yml_dict,
                                                  warehouse_file)
        warehouse = Warehouse(warehouse_yml_parser.warehouse_yml)

        logging.info('Querying warehouse for tables')
        warehouse_dir = file_system.dirname(warehouse_file)

        def fileify(name: str):
            return re.sub(r'[^A-Za-z0-9_.]+', '_', name).lower()

        table_dir = file_system.join(warehouse_dir, 'tables')
        if not file_system.file_exists(table_dir):
            logging.info(f'Creating tables directory {table_dir}')
            file_system.mkdirs(table_dir)
        else:
            logging.info(f'Directory {table_dir} already exists')

        first_table_scan_yml_file = None

        dialect = warehouse.dialect
        tables_metadata_query = dialect.sql_tables_metadata_query()
        rows = warehouse.sql_fetchall(tables_metadata_query)

        table_include_regex = create_table_filter_regex(include)
        table_exclude_regex = create_table_filter_regex(exclude)

        for row in rows:
            table_name = row[0]

            if (matches_table_include(table_name, table_include_regex) and
                    matches_table_exclude(table_name, table_exclude_regex)):
                dataset_analyzer = DatasetAnalyzer()
                dataset_analyze_results = dataset_analyzer.analyze(
                    warehouse, table_name)

                table_scan_yaml_file = file_system.join(
                    table_dir, f'{fileify(table_name)}.yml')

                if not first_table_scan_yml_file:
                    first_table_scan_yml_file = table_scan_yaml_file

                if file_system.file_exists(table_scan_yaml_file):
                    logging.info(
                        f"Scan file {table_scan_yaml_file} already exists")
                else:
                    logging.info(f"Creating {table_scan_yaml_file} ...")
                    from sodasql.scan.scan_yml_parser import (
                        KEY_METRICS, KEY_TABLE_NAME, KEY_TESTS, KEY_COLUMNS,
                        COLUMN_KEY_VALID_FORMAT, COLUMN_KEY_TESTS)
                    scan_yaml_dict = {
                        KEY_TABLE_NAME: table_name,
                        KEY_METRICS:
                            [Metric.ROW_COUNT] +
                            Metric.METRIC_GROUPS[Metric.METRIC_GROUP_MISSING] +
                            Metric.METRIC_GROUPS[Metric.METRIC_GROUP_VALIDITY] +
                            Metric.METRIC_GROUPS[Metric.METRIC_GROUP_LENGTH] +
                            Metric.METRIC_GROUPS[Metric.METRIC_GROUP_STATISTICS],
                        KEY_TESTS: ['row_count > 0']
                    }

                    columns = {}
                    for column_analysis_result in dataset_analyze_results:
                        if column_analysis_result.validity_format:
                            column_yml = {
                                COLUMN_KEY_VALID_FORMAT: column_analysis_result.validity_format
                            }
                            values_count = column_analysis_result.values_count
                            valid_count = column_analysis_result.valid_count
                            if valid_count > (values_count * .8):
                                # Derive an invalid_percentage test with ~10% slack
                                # on the observed invalid percentage, rounded up.
                                valid_percentage = valid_count * 100 / values_count
                                invalid_threshold = (100 - valid_percentage) * 1.1
                                invalid_threshold_rounded = ceil(invalid_threshold)
                                invalid_comparator = '==' if invalid_threshold_rounded == 0 else '<='
                                column_yml[COLUMN_KEY_TESTS] = [
                                    f'invalid_percentage {invalid_comparator} {invalid_threshold_rounded}'
                                ]
                                columns[column_analysis_result.column_name] = column_yml

                    if columns:
                        scan_yaml_dict[KEY_COLUMNS] = columns

                    scan_yml_str = yaml.dump(scan_yaml_dict,
                                             sort_keys=False,
                                             Dumper=IndentingDumper,
                                             default_flow_style=False)
                    file_system.file_write_from_str(table_scan_yaml_file,
                                                    scan_yml_str)
            else:
                logging.info(f"Skipping table {table_name}")

        logging.info(
            f"Next run 'soda scan {warehouse_file} {first_table_scan_yml_file}' to calculate measurements and run tests"
        )

    except Exception as e:
        logging.exception(f'Exception: {str(e)}')
        sys.exit(1)

    finally:
        if warehouse and warehouse.connection:
            try:
                warehouse.connection.close()
            except Exception as e:
                logging.debug(f'Closing connection failed: {str(e)}')
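
A minimal sketch of exercising this function directly rather than through the CLI wrapper (in the packaged tool it presumably backs the `soda analyze` command); the include/exclude values are illustrative, and how they are interpreted depends on create_table_filter_regex, which is not shown here:

# Analyze the tables whose names pass the filters and write one scan YAML
# per table into ./tables next to warehouse.yml.
analyze('warehouse.yml', include='customer%', exclude='tmp%')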