Example #1
    def analyze(self, warehouse: Warehouse, table_name: str):
        dialect = warehouse.dialect
        qualified_table_name = dialect.qualify_table_name(table_name)

        analyze_results: List[ColumnAnalysisResult] = []

        sql = dialect.sql_columns_metadata_query(table_name)
        column_tuples = warehouse.sql_fetchall(sql)
        for column_tuple in column_tuples:
            column_name = column_tuple[0]
            source_type = column_tuple[1]

            column_analysis_result = ColumnAnalysisResult(
                column_name=column_name, source_type=source_type)
            analyze_results.append(column_analysis_result)

            qualified_column_name = dialect.qualify_column_name(column_name)

            if dialect.is_text(source_type):
                column_analysis_result.is_text = True

                validity_format_count_fields = []
                validity_counts = []
                for validity_format in Validity.FORMATS:
                    format_regex = Validity.FORMATS[validity_format]
                    validity_counts.append({'format': validity_format})
                    qualified_regex = dialect.qualify_regex(format_regex)
                    regexp_like = dialect.sql_expr_regexp_like(
                        qualified_column_name, qualified_regex)
                    count_field = f'COUNT(CASE WHEN {regexp_like} THEN 1 END)'
                    validity_format_count_fields.append(count_field)

                row = warehouse.sql_fetchone(
                    'SELECT \n  ' +
                    ',\n  '.join(validity_format_count_fields) + ',\n'
                    f'  COUNT({qualified_column_name}) \n'
                    f'FROM {qualified_table_name} \n'
                    'LIMIT 1000')

                values_count = row[len(validity_counts)]
                column_analysis_result.values_count = values_count

                if values_count > 0:
                    for i in range(len(validity_counts)):
                        validity_count = validity_counts[i]
                        validity_count['count'] = row[i]

                    sorted_validity_counts = sorted(validity_counts,
                                                    key=lambda c: c['count'],
                                                    reverse=True)
                    most_frequent_validity_format = sorted_validity_counts[0]
                    valid_count = most_frequent_validity_format['count']
                    column_analysis_result.valid_count = valid_count

                    if valid_count > (values_count / 2):
                        column_analysis_result.validity_format = \
                            most_frequent_validity_format['format']

        return analyze_results
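
The method packs all validity-format checks for a column into a single query, then reads the trailing total via `row[len(validity_counts)]`. A minimal sketch of the query shape, assuming a connected warehouse, two hypothetical formats and a PostgreSQL-style `~*` regex operator (the real format names and regexes come from Validity.FORMATS and the dialect):

    # Sketch of the query analyze() assembles for one text column; the
    # formats 'date_eu' and 'uuid', the column "city" and the table
    # "demodata" are assumptions for illustration.
    sql = ('SELECT \n'
           "  COUNT(CASE WHEN \"city\" ~* '<date_eu regex>' THEN 1 END),\n"
           "  COUNT(CASE WHEN \"city\" ~* '<uuid regex>' THEN 1 END),\n"
           '  COUNT("city") \n'
           'FROM "demodata" \n'
           'LIMIT 1000')
    row = warehouse.sql_fetchone(sql)
    # row[0], row[1]: per-format match counts; row[2]: non-null values_count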
Example #2
    def execute_metric(self,
                       warehouse: Warehouse,
                       metric: dict,
                       scan_dict: dict = None):
        dialect = warehouse.dialect
        if not scan_dict:
            scan_dict = {}
        if KEY_TABLE_NAME not in scan_dict:
            scan_dict[KEY_TABLE_NAME] = self.default_test_table_name
        scan_configuration_parser = ScanYmlParser(scan_dict, 'test-scan')
        scan_configuration_parser.assert_no_warnings_or_errors()
        scan = warehouse.create_scan(
            scan_yml=scan_configuration_parser.scan_yml)
        scan.close_warehouse = False
        scan.execute()

        fields: List[str] = []
        group_by_column_names: List[str] = metric.get('groupBy')
        if group_by_column_names:
            for group_by_column in group_by_column_names:
                fields.append(dialect.qualify_column_name(group_by_column))

        column_name: str = metric.get('columnName')
        # columnName is optional (e.g. for ROW_COUNT); only qualify it when present
        qualified_column_name = dialect.qualify_column_name(column_name) if column_name else None

        metric_type = metric['type']
        if metric_type == Metric.ROW_COUNT:
            fields.append('COUNT(*)')
        elif metric_type == Metric.MIN:
            fields.append(f'MIN({qualified_column_name})')
        elif metric_type == Metric.MAX:
            fields.append(f'MAX({qualified_column_name})')
        elif metric_type == Metric.SUM:
            fields.append(f'SUM({qualified_column_name})')

        sql = ('SELECT \n  ' + ',\n  '.join(fields) + ' \n'
               'FROM ' + scan.qualified_table_name)

        where_clauses = []

        metric_filter = metric.get('filter')
        if metric_filter:
            where_clauses.append(dialect.sql_expression(metric_filter))

        scan_column: ScanColumn = scan.scan_columns.get(column_name)
        if scan_column and scan_column.non_missing_and_valid_condition:
            where_clauses.append(scan_column.non_missing_and_valid_condition)

        if where_clauses:
            sql += '\nWHERE ' + '\n      AND '.join(where_clauses)

        if group_by_column_names:
            sql += '\nGROUP BY ' + ', '.join(group_by_column_names)

        return warehouse.sql_fetchall(sql)
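
A hedged usage sketch: assuming a SqlTestCase-style suite where self.warehouse is already set up, the metric dict carries the keys the method reads above ('type', 'columnName', 'groupBy', 'filter'):

    # Hypothetical call from a test method; self.warehouse, the column
    # names 'size' and 'country' are assumptions about the test setup.
    rows = self.execute_metric(
        self.warehouse,
        metric={'type': Metric.MAX,
                'columnName': 'size',
                'groupBy': ['country']})
    # each row is (country, MAX(size)) for one group
    for row in rows:
        print(row)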
Example #3
    def __init__(self, target: str) -> None:
        super().__init__()
        self.target: str = target
        self.dialect = self.create_dialect(self.target)
        self.warehouse_yml = WarehouseYml(dialect=self.dialect, name='test_warehouse')
        self.warehouse: Optional[Warehouse] = Warehouse(self.warehouse_yml)
        self.database: Optional[str] = None
        self.create_database()
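
A minimal instantiation sketch, assuming the WarehouseFixture subclass for the target implements the create_dialect() and create_database() hooks invoked above:

    # Hypothetical usage; 'postgres' is an assumed target name.
    fixture = WarehouseFixture('postgres')
    print(fixture.warehouse_yml.name)   # 'test_warehouse'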
Example #4
    def build(self):
        self._build_warehouse_yml()
        self._build_scan_yml()

        for parser in self.parsers:
            parser.assert_no_warnings_or_errors()
        if not self.scan_yml or not self.warehouse_yml:
            return

        from sodasql.scan.warehouse import Warehouse
        warehouse = Warehouse(self.warehouse_yml)

        self._create_soda_server_client()

        return Scan(warehouse=warehouse,
                    scan_yml=self.scan_yml,
                    variables=self.variables,
                    soda_server_client=self.soda_server_client,
                    time=self.time)
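
A hedged usage sketch of the builder; the file-path attribute names are assumptions, since only the build step is shown here:

    # Hypothetical wiring of ScanBuilder; the *_file attribute names are
    # assumptions about the parts of the class not shown above.
    scan_builder = ScanBuilder()
    scan_builder.warehouse_yml_file = 'warehouse.yml'
    scan_builder.scan_yml_file = 'tables/demodata.yml'
    scan = scan_builder.build()
    if scan:  # build() returns None if either YAML could not be built
        scan_result = scan.execute()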
Example #5
    def setup_get_warehouse_fixture(self):
        """self.target may be initialized by a test suite"""
        if self.target is None:
            self.target = os.getenv('SODA_TEST_TARGET', TARGET_POSTGRES)

        warehouse_fixture = SqlTestCase.warehouse_fixtures_by_target.get(
            self.target)
        if warehouse_fixture is None:
            logging.debug(f'Creating warehouse {self.target}')

            warehouse_fixture = WarehouseFixture.create(self.target)
            dialect = self.create_dialect(self.target)

            warehouse_yml = WarehouseYml(dialect=dialect)
            warehouse_fixture.warehouse = Warehouse(warehouse_yml)
            warehouse_fixture.create_database()
            SqlTestCase.warehouse_fixtures_by_target[self.target] = warehouse_fixture

        return warehouse_fixture
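
Called from a test's setUp, this yields one cached warehouse per target across the suite; a sketch assuming the surrounding SqlTestCase class:

    # Hypothetical setUp wiring inside SqlTestCase; repeated calls for the
    # same target reuse the fixture cached in warehouse_fixtures_by_target.
    def setUp(self):
        super().setUp()
        warehouse_fixture = self.setup_get_warehouse_fixture()
        self.warehouse = warehouse_fixture.warehouse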
Example #6
def init(warehouse_file: str):
    """
    Finds tables in the warehouse and creates scan YAML files based on the data in those tables. By default, it
    creates the files in a subdirectory called "tables" next to the warehouse file.

    WAREHOUSE_FILE contains the connection details for the warehouse. This file can be created using the `soda create`
    command. The warehouse file argument is optional and defaults to 'warehouse.yml'.
    """
    logging.info(SODA_SQL_VERSION)
    file_system = FileSystemSingleton.INSTANCE
    warehouse = None

    try:
        logging.info(f'Initializing {warehouse_file} ...')

        warehouse_yml_dict = read_warehouse_yml_file(warehouse_file)
        warehouse_yml_parser = WarehouseYmlParser(warehouse_yml_dict,
                                                  warehouse_file)
        warehouse = Warehouse(warehouse_yml_parser.warehouse_yml)

        logging.info('Querying warehouse for tables')
        warehouse_dir = file_system.dirname(warehouse_file)
        scan_initializer = ScanInitializer(warehouse, warehouse_dir)
        scan_initializer.initialize_scan_ymls()

        logging.info(
            f"Next run 'soda scan {warehouse_file} {scan_initializer.first_table_scan_yml_file}' to calculate measurements and run tests"
        )

    except Exception as e:
        logging.exception(f'Exception: {str(e)}')
        return 1

    finally:
        if warehouse and warehouse.connection:
            try:
                warehouse.connection.close()
            except Exception as e:
                logging.debug(f'Closing connection failed: {str(e)}')
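
As a usage sketch, the undecorated function body can be driven directly; in the actual CLI it is presumably wrapped as a command, invoked from a shell as `soda init warehouse.yml`:

    # Hedged sketch: calling the function directly (bypassing any click
    # wrapping). init() returns 1 on failure and None on success, so any
    # truthy result is an error.
    exit_code = init('warehouse.yml')
    if exit_code:
        print('init failed; see the logged exception')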
Example #7
    def build(self, offline: bool = False):
        self._build_warehouse_yml()
        self._build_scan_yml()

        for parser in self.parsers:
            parser.assert_no_warnings_or_errors()
        if not self.scan_yml or not self.warehouse_yml:
            return

        from sodasql.scan.warehouse import Warehouse
        warehouse = Warehouse(self.warehouse_yml)

        if not offline:
            self._create_soda_server_client()

        return Scan(warehouse=warehouse,
                    scan_yml=self.scan_yml,
                    variables=self.variables,
                    soda_server_client=self.soda_server_client,
                    time=self.time,
                    scan_results_file=self.scan_results_json_path,
                    failed_rows_processor=self.failed_rows_processor)
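
Compared with the build() in Example #4, the offline flag skips creating the Soda Cloud client; a small hedged sketch of the difference, reusing the hypothetical scan_builder wiring from that example:

    # Hypothetical calls: with offline=True, _create_soda_server_client()
    # is never invoked, so the Scan presumably keeps soda_server_client=None.
    scan = scan_builder.build(offline=True)   # local-only scan
    scan = scan_builder.build()               # default: may contact Soda Cloud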
Example #8
            KEY_METRICS: ['distinct', 'uniqueness'],
            'tests': {
                'nomissing': 'missing_percentage < 3.0',
                'noinvalid': 'invalid_count == 0'
            }
        }
    }
}

scan_configuration_parser = ScanYmlParser(scan_configuration_dict,
                                          'demodata-scan')
scan_configuration_parser.assert_no_warnings_or_errors()

dialect = SqlTestCase.create_dialect('postgres')
warehouse_yml = WarehouseYml(dialect=dialect)
warehouse = Warehouse(warehouse_yml)

row = warehouse.sql_fetchone('SELECT MIN(date), MAX(date) FROM demodata')
min_date = row[0]
max_date = row[1]

scan_results = []

date = min_date
while date != max_date:
    timeslice = datetime(year=date.year, month=date.month,
                         day=date.day).isoformat()
    variables = {'date': date.strftime("%Y-%m-%d")}
    scan = Scan(warehouse=warehouse,
                scan_yml=scan_configuration_parser.scan_yml,
                variables=variables,
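
The fragment breaks off mid-call; a hedged completion of the loop body, assuming the timeslice is passed to the Scan, the result is collected, and the date advances one day per iteration (inferred from the timeslice variable and the min/max date range):

    # Hedged completion of the truncated loop above; the 'time' keyword
    # argument and the one-day step are assumptions.
    from datetime import timedelta

    scan = Scan(warehouse=warehouse,
                scan_yml=scan_configuration_parser.scan_yml,
                variables=variables,
                time=timeslice)
    scan_results.append(scan.execute())
    date += timedelta(days=1)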
Example #9
def analyze(warehouse_file: str, include: str, exclude: str):
    """
    Analyzes tables in the warehouse and creates scan YAML files based on the data in those tables. By default, it
    creates the files in a subdirectory called "tables" next to the warehouse file.

    WAREHOUSE_FILE contains the connection details for the warehouse. This file can be created using the `soda create`
    command. The warehouse file argument is optional and defaults to 'warehouse.yml'.
    """
    logging.info(SODA_SQL_VERSION)
    file_system = FileSystemSingleton.INSTANCE
    warehouse = None

    try:
        logging.info(f'Analyzing {warehouse_file} ...')

        warehouse_yml_dict = read_warehouse_yml_file(warehouse_file)
        warehouse_yml_parser = WarehouseYmlParser(warehouse_yml_dict,
                                                  warehouse_file)
        warehouse = Warehouse(warehouse_yml_parser.warehouse_yml)

        logging.info('Querying warehouse for tables')
        warehouse_dir = file_system.dirname(warehouse_file)

        def fileify(name: str):
            return re.sub(r'[^A-Za-z0-9_.]+', '_', name).lower()

        table_dir = file_system.join(warehouse_dir, 'tables')
        if not file_system.file_exists(table_dir):
            logging.info(f'Creating tables directory {table_dir}')
            file_system.mkdirs(table_dir)
        else:
            logging.info(f'Directory {table_dir} already exists')

        first_table_scan_yml_file = None

        dialect = warehouse.dialect
        tables_metadata_query = dialect.sql_tables_metadata_query()
        rows = warehouse.sql_fetchall(tables_metadata_query)

        table_include_regex = create_table_filter_regex(include)
        table_exclude_regex = create_table_filter_regex(exclude)

        for row in rows:
            table_name = row[0]

            if (matches_table_include(table_name, table_include_regex) and
                    matches_table_exclude(table_name, table_exclude_regex)):
                dataset_analyzer = DatasetAnalyzer()
                dataset_analyze_results = dataset_analyzer.analyze(
                    warehouse, table_name)

                table_scan_yaml_file = file_system.join(
                    table_dir, f'{fileify(table_name)}.yml')

                if not first_table_scan_yml_file:
                    first_table_scan_yml_file = table_scan_yaml_file

                if file_system.file_exists(table_scan_yaml_file):
                    logging.info(
                        f"Scan file {table_scan_yaml_file} already exists")
                else:
                    logging.info(f"Creating {table_scan_yaml_file} ...")
                    from sodasql.scan.scan_yml_parser import (
                        KEY_METRICS, KEY_TABLE_NAME, KEY_TESTS, KEY_COLUMNS,
                        COLUMN_KEY_VALID_FORMAT, COLUMN_KEY_TESTS)
                    scan_yaml_dict = {
                        KEY_TABLE_NAME: table_name,
                        KEY_METRICS:
                            [Metric.ROW_COUNT] +
                            Metric.METRIC_GROUPS[Metric.METRIC_GROUP_MISSING] +
                            Metric.METRIC_GROUPS[Metric.METRIC_GROUP_VALIDITY] +
                            Metric.METRIC_GROUPS[Metric.METRIC_GROUP_LENGTH] +
                            Metric.METRIC_GROUPS[Metric.METRIC_GROUP_STATISTICS],
                        KEY_TESTS: ['row_count > 0']
                    }

                    columns = {}
                    for column_analysis_result in dataset_analyze_results:
                        if column_analysis_result.validity_format:
                            column_yml = {
                                COLUMN_KEY_VALID_FORMAT: column_analysis_result.validity_format
                            }
                            values_count = column_analysis_result.values_count
                            valid_count = column_analysis_result.valid_count
                            if valid_count > (values_count * .8):
                                valid_percentage = valid_count * 100 / values_count
                                invalid_threshold = (100 - valid_percentage) * 1.1
                                invalid_threshold_rounded = ceil(invalid_threshold)
                                invalid_comparator = '==' if invalid_threshold_rounded == 0 else '<='
                                column_yml[COLUMN_KEY_TESTS] = [
                                    f'invalid_percentage {invalid_comparator} {invalid_threshold_rounded}'
                                ]
                                columns[column_analysis_result.column_name] = column_yml

                    if columns:
                        scan_yaml_dict[KEY_COLUMNS] = columns

                    scan_yml_str = yaml.dump(scan_yaml_dict,
                                             sort_keys=False,
                                             Dumper=IndentingDumper,
                                             default_flow_style=False)
                    file_system.file_write_from_str(table_scan_yaml_file,
                                                    scan_yml_str)
            else:
                logging.info(f"Skipping table {table_name}")

        logging.info(
            f"Next run 'soda scan {warehouse_file} {first_table_scan_yml_file}' to calculate measurements and run tests"
        )

    except Exception as e:
        logging.exception(f'Exception: {str(e)}')
        sys.exit(1)

    finally:
        if warehouse and warehouse.connection:
            try:
                warehouse.connection.close()
            except Exception as e:
                logging.debug(f'Closing connection failed: {str(e)}')
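
To make the threshold arithmetic in the column loop concrete, a worked example with assumed counts:

    # Worked example of the invalid_percentage test derivation above,
    # assuming 950 of 1000 non-null values match the dominant format.
    from math import ceil

    values_count, valid_count = 1000, 950
    valid_percentage = valid_count * 100 / values_count    # 95.0
    invalid_threshold = (100 - valid_percentage) * 1.1     # ~5.5 (10% headroom)
    invalid_threshold_rounded = ceil(invalid_threshold)    # 6
    # generated test: 'invalid_percentage <= 6'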