def _build_scan_yml(self):
    if not self.scan_yml_file and not self.scan_yml_dict and not self.scan_yml:
        logger.error('No scan file specified')
        return
    elif self.scan_yml_file and not self.scan_yml_dict and not self.scan_yml:
        scan_yml_file_str: Optional[str] = None
        if isinstance(self.scan_yml_file, pathlib.PurePath):
            scan_yml_file_str = str(self.scan_yml_file)
        elif isinstance(self.scan_yml_file, str):
            scan_yml_file_str = self.scan_yml_file

        if not isinstance(scan_yml_file_str, str):
            logger.error(
                f'scan_builder.scan_yml_file must be str, but was {type(scan_yml_file_str)}: {scan_yml_file_str}')
        elif self.file_system.is_readable_file(scan_yml_file_str):
            scan_yml_str = self.file_system.file_read_as_str(scan_yml_file_str)
            if scan_yml_str:
                self.scan_yml_dict = YamlHelper.parse_yaml(scan_yml_str, scan_yml_file_str)
            else:
                logger.error(f'Failed to parse scan yaml file: {scan_yml_file_str}')

    if self.scan_yml_dict and not self.scan_yml:
        from sodasql.scan.scan_yml_parser import ScanYmlParser
        scan_yml_parser = ScanYmlParser(self.scan_yml_dict, self.scan_yml_file)
        scan_yml_parser.log()
        self.parsers.append(scan_yml_parser)
        self.scan_yml = scan_yml_parser.scan_yml
def _build_scan_yml(self):
    file_system = FileSystemSingleton.INSTANCE
    if not self.scan_yml_file and not self.scan_yml_dict and not self.scan_yml:
        logging.error('No scan specified')
        return
    elif self.scan_yml_file and not self.scan_yml_dict and not self.scan_yml:
        if not isinstance(self.scan_yml_file, str):
            logging.error(
                f'scan_builder.scan_yml_file must be str, but was {type(self.scan_yml_file)}: {self.scan_yml_file}')
        elif file_system.is_readable_file(self.scan_yml_file):
            scan_yml_str = file_system.file_read_as_str(self.scan_yml_file)
            if scan_yml_str:
                self.scan_yml_dict = YamlHelper.parse_yaml(scan_yml_str, self.scan_yml_file)
            else:
                logging.error(f'Failed to read scan yaml file: {self.scan_yml_file}')

    if self.scan_yml_dict and not self.scan_yml:
        from sodasql.scan.scan_yml_parser import ScanYmlParser
        scan_yml_parser = ScanYmlParser(self.scan_yml_dict, self.scan_yml_file)
        scan_yml_parser.log()
        self.parsers.append(scan_yml_parser)
        self.scan_yml = scan_yml_parser.scan_yml
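# A minimal usage sketch for the builder above, assuming ScanBuilder exposes the
# warehouse_yml_file / scan_yml_file attributes shown here and a build() method that
# wires the parsed scan_yml into a Scan; the file paths are illustrative only.
from sodasql.scan.scan_builder import ScanBuilder

scan_builder = ScanBuilder()
scan_builder.warehouse_yml_file = 'warehouse.yml'   # connection settings (assumed path)
scan_builder.scan_yml_file = 'tables/demodata.yml'  # scan configuration (assumed path)
scan = scan_builder.build()
scan_result = scan.execute()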
def execute_metric(self, warehouse: Warehouse, metric: dict, scan_dict: dict = None):
    dialect = warehouse.dialect

    if not scan_dict:
        scan_dict = {}
    if KEY_TABLE_NAME not in scan_dict:
        scan_dict[KEY_TABLE_NAME] = self.default_test_table_name
    scan_configuration_parser = ScanYmlParser(scan_dict, 'test-scan')
    scan_configuration_parser.assert_no_warnings_or_errors()
    scan = warehouse.create_scan(scan_yml=scan_configuration_parser.scan_yml)
    scan.close_warehouse = False
    scan.execute()

    fields: List[str] = []
    group_by_column_names: List[str] = metric.get('groupBy')
    if group_by_column_names:
        for group_by_column in group_by_column_names:
            fields.append(dialect.qualify_column_name(group_by_column))

    column_name: str = metric.get('columnName')
    qualified_column_name = dialect.qualify_column_name(column_name)

    metric_type = metric['type']
    if metric_type == Metric.ROW_COUNT:
        fields.append('COUNT(*)')
    elif metric_type == Metric.MIN:
        fields.append(f'MIN({qualified_column_name})')
    elif metric_type == Metric.MAX:
        fields.append(f'MAX({qualified_column_name})')
    elif metric_type == Metric.SUM:
        fields.append(f'SUM({qualified_column_name})')

    sql = 'SELECT \n  ' + ',\n  '.join(fields) + ' \n' \
          'FROM ' + scan.qualified_table_name

    where_clauses = []
    metric_filter = metric.get('filter')
    if metric_filter:
        where_clauses.append(dialect.sql_expression(metric_filter))

    scan_column: ScanColumn = scan.scan_columns.get(column_name)
    if scan_column and scan_column.non_missing_and_valid_condition:
        where_clauses.append(scan_column.non_missing_and_valid_condition)

    if where_clauses:
        sql += '\nWHERE ' + '\n  AND '.join(where_clauses)

    if group_by_column_names:
        sql += '\nGROUP BY ' + ', '.join(group_by_column_names)

    return warehouse.sql_fetchall(sql)
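# Hypothetical companion test for the helper above: compute MIN(size) per country.
# The method name, table contents and column names are illustrative assumptions;
# the metric keys ('type', 'columnName', 'groupBy') mirror the ones read by
# execute_metric. Each returned row holds the group-by values followed by the MIN.
def test_min_per_group_example(self):
    rows = self.execute_metric(self.warehouse, {
        'type': Metric.MIN,
        'columnName': 'size',
        'groupBy': ['country']
    })
    for row in rows:
        logging.debug(f'country={row[0]} min_size={row[1]}')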
def scan(self, scan_yml_dict: Optional[dict] = None, variables: Optional[dict] = None) -> ScanResult:
    if not scan_yml_dict:
        scan_yml_dict = {}
    if KEY_TABLE_NAME not in scan_yml_dict:
        scan_yml_dict[KEY_TABLE_NAME] = self.default_test_table_name
    logging.debug('Scan configuration \n' + json.dumps(scan_yml_dict, indent=2))
    scan_configuration_parser = ScanYmlParser(scan_yml_dict, 'test-scan')
    scan_configuration_parser.assert_no_warnings_or_errors()
    scan = self.warehouse.create_scan(scan_yml=scan_configuration_parser.scan_yml,
                                      variables=variables,
                                      soda_server_client=self.mock_soda_server_client,
                                      time=datetime.now().isoformat(timespec='seconds'))
    scan.close_warehouse = False
    return scan.execute()
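# A minimal sketch of driving the scan() helper above: the table name is defaulted
# by the helper and the test expression reuses the row_count metric name, as in the
# test below. The method name is illustrative; whether the test passes depends on
# the contents of the default test table.
def test_row_count_example(self):
    scan_result = self.scan({
        'tests': [f'{Metric.ROW_COUNT} > 0']
    })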
def test_soda_server_client(self):
    self.sql_recreate_table(
        [f"name {self.dialect.data_type_integer}"],
        ["(1)", "(2)", "(3)", "(null)"])

    scan_yml_dict = {
        KEY_TABLE_NAME: self.default_test_table_name,
        KEY_METRIC_GROUPS: [
            Metric.METRIC_GROUP_MISSING,
            Metric.METRIC_GROUP_VALIDITY,
            Metric.METRIC_GROUP_DUPLICATES,
            Metric.METRIC_GROUP_STATISTICS,
            Metric.METRIC_GROUP_LENGTH,
            Metric.METRIC_GROUP_PROFILING
        ],
        'tests': [f'{Metric.ROW_COUNT} > 0'],
        KEY_SQL_METRICS: [{
            SQL_METRIC_KEY_SQL: f'SELECT 0 AS zero FROM {self.default_test_table_name}',
            SQL_METRIC_KEY_TESTS: ['zero == 0']
        }],
        KEY_COLUMNS: {
            'name': {
                COLUMN_KEY_TESTS: [
                    f'{Metric.MISSING_COUNT} < 1',
                ]
            }
        }
    }

    scan_configuration_parser = ScanYmlParser(scan_yml_dict, 'test-scan')
    scan_configuration_parser.assert_no_warnings_or_errors()

    soda_server_client = SodaServerClient(host='localhost',
                                          port='5000',
                                          protocol='http',
                                          token='testtoken')

    scan = self.warehouse.create_scan(scan_yml=scan_configuration_parser.scan_yml,
                                      soda_server_client=soda_server_client,
                                      time=datetime.now().isoformat())
    scan.close_warehouse = False
    return scan.execute()
def test_invalid_column_metric(self):
    parser = ScanYmlParser({
        KEY_TABLE_NAME: 't',
        KEY_METRICS: ['revenue']
    }, 'Test scan')
    log = parser.logs[0]
    self.assertIn(WARNING, log.level)
    self.assertIn('Invalid key', log.message)
    self.assertIn('metrics', log.message)
    self.assertIn('revenue', log.message)
def test_metrics_not_a_list(self):
    parser = ScanYmlParser({
        KEY_TABLE_NAME: 't',
        KEY_METRICS: 'txt'
    }, 'Test scan')
    log = parser.logs[0]
    self.assertIn(ERROR, log.level)
    self.assertIn('Invalid metrics', log.message)
    self.assertIn('list', log.message)
    self.assertIn('str', log.message)
def test_invalid_valid_format(self):
    parser = ScanYmlParser({
        KEY_TABLE_NAME: 't',
        KEY_COLUMNS: {
            'col': {
                'valid_format': 'buzz'
            }
        }
    }, 'Test scan')
    log = parser.logs[0]
    self.assertIn(WARNING, log.level)
    self.assertIn('Invalid', log.message)
    self.assertIn('valid_format', log.message)
    self.assertIn('buzz', log.message)
    KEY_METRICS: [
        'missing',
        'validity',
        'min',
        'max',
        'avg',
        'sum',
        'min_length',
        'max_length',
        'avg_length'
    ],
    KEY_COLUMNS: {
        'ID': {
            KEY_METRICS: ['distinct', 'uniqueness'],
            'tests': {
                'nomissing': 'missing_percentage < 3.0',
                'noinvalid': 'invalid_count == 0'
            }
        }
    }
}

scan_configuration_parser = ScanYmlParser(scan_configuration_dict, 'demodata-scan')
scan_configuration_parser.assert_no_warnings_or_errors()

dialect = SqlTestCase.create_dialect('postgres')
warehouse_yml = WarehouseYml(dialect=dialect)
warehouse = Warehouse(warehouse_yml)

row = warehouse.sql_fetchone('SELECT MIN(date), MAX(date) FROM demodata')
min_date = row[0]
max_date = row[1]

scan_results = []

date = min_date
while date != max_date:
    timeslice = datetime(year=date.year, month=date.month,
def test_table_name_required(self):
    parser = ScanYmlParser({}, 'Test scan')
    log = parser.logs[0]
    self.assertIn(ERROR, log.level)
    self.assertIn('table_name', log.message)
    self.assertIn('does not exist', log.message)
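# For contrast with the failing cases above, a sketch of a configuration that should
# parse cleanly: the metric names are a subset of the list the demodata-scan
# configuration accepts without warnings or errors. The method name is illustrative.
def test_valid_configuration_example(self):
    parser = ScanYmlParser({
        KEY_TABLE_NAME: 't',
        KEY_METRICS: ['missing', 'validity', 'min', 'max']
    }, 'Test scan')
    parser.assert_no_warnings_or_errors()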