Example #1
0
    def _build_scan_yml(self):
        """Populate ``self.scan_yml`` from the configured scan YAML source.

        Precedence of the sources:

        1. an already present ``self.scan_yml`` is left untouched;
        2. otherwise ``self.scan_yml_dict`` is parsed with ``ScanYmlParser``;
        3. if there is no dict either, ``self.scan_yml_file`` (a ``str`` or a
           ``pathlib.PurePath``) is read through ``self.file_system`` and
           parsed into ``self.scan_yml_dict`` first.

        Problems detected here are reported through ``logger.error``. The
        parser that was used is appended to ``self.parsers``.
        """
        if self.scan_yml:
            # A scan yml is already available; none of the branches below apply.
            return

        if not self.scan_yml_dict:
            if not self.scan_yml_file:
                logger.error('No scan file specified')
                return

            # Normalise the configured path to a plain string, if possible.
            scan_yml_file_str: Optional[str] = None
            if isinstance(self.scan_yml_file, pathlib.PurePath):
                scan_yml_file_str = str(self.scan_yml_file)
            elif isinstance(self.scan_yml_file, str):
                scan_yml_file_str = self.scan_yml_file

            if not isinstance(scan_yml_file_str, str):
                # Report the value that was actually configured; the normalised
                # variable is always None in this branch and says nothing useful.
                logger.error(
                    f'scan_builder.scan_yml_file must be str, but was {type(self.scan_yml_file)}: {self.scan_yml_file}'
                )
            elif self.file_system.is_readable_file(scan_yml_file_str):
                scan_yml_str = self.file_system.file_read_as_str(
                    scan_yml_file_str)

                if scan_yml_str:
                    self.scan_yml_dict = YamlHelper.parse_yaml(
                        scan_yml_str, scan_yml_file_str)
                else:
                    logger.error(
                        f'Failed to parse scan yaml file: {scan_yml_file_str}')

        if self.scan_yml_dict:
            # Imported locally, as in the original code.
            from sodasql.scan.scan_yml_parser import ScanYmlParser
            scan_yml_parser = ScanYmlParser(self.scan_yml_dict,
                                            self.scan_yml_file)
            scan_yml_parser.log()
            self.parsers.append(scan_yml_parser)
            self.scan_yml = scan_yml_parser.scan_yml
Example #2
0
    def _build_scan_yml(self):
        """Populate ``self.scan_yml`` from the configured scan YAML source.

        An already present ``self.scan_yml`` is left untouched. Otherwise
        ``self.scan_yml_dict`` is parsed with ``ScanYmlParser``; when no dict
        is set, ``self.scan_yml_file`` (must be a ``str``) is read through the
        ``FileSystemSingleton`` instance and parsed into ``self.scan_yml_dict``
        first.

        Problems detected here are reported through ``logging.error``. The
        parser that was used is appended to ``self.parsers``.
        """
        file_system = FileSystemSingleton.INSTANCE

        if self.scan_yml:
            # A scan yml is already available; none of the branches below apply.
            return

        if not self.scan_yml_dict:
            if not self.scan_yml_file:
                logging.error('No scan specified')
                return

            if not isinstance(self.scan_yml_file, str):
                logging.error(
                    f'scan_builder.scan_yml_file must be str, but was {type(self.scan_yml_file)}: {self.scan_yml_file}'
                )
            elif file_system.is_readable_file(self.scan_yml_file):
                # Use the same file system object for the readability check and
                # the read itself instead of mixing it with self.file_system.
                scan_yml_str = file_system.file_read_as_str(self.scan_yml_file)
                if scan_yml_str:
                    self.scan_yml_dict = YamlHelper.parse_yaml(
                        scan_yml_str, self.scan_yml_file)
                else:
                    logging.error(
                        f'Failed to read scan yaml file: {self.scan_yml_file}')

        if self.scan_yml_dict:
            # Imported locally, as in the original code.
            from sodasql.scan.scan_yml_parser import ScanYmlParser
            scan_yml_parser = ScanYmlParser(self.scan_yml_dict,
                                            self.scan_yml_file)
            scan_yml_parser.log()
            self.parsers.append(scan_yml_parser)
            self.scan_yml = scan_yml_parser.scan_yml
Example #3
0
    def execute_metric(self,
                       warehouse: Warehouse,
                       metric: dict,
                       scan_dict: dict = None):
        """Run a scan, then compute one metric directly in SQL.

        Args:
            warehouse: warehouse used to create the scan and run the query.
            metric: metric description. Keys read here: ``'type'`` (required),
                ``'columnName'``, ``'groupBy'`` and ``'filter'`` (optional).
            scan_dict: optional scan configuration. The caller's dict is not
                modified; the default test table name is filled in on a copy
                when ``KEY_TABLE_NAME`` is absent.

        Returns:
            Whatever ``warehouse.sql_fetchall`` returns for the built query.
        """
        dialect = warehouse.dialect

        # Copy so that injecting the table name never leaks into the caller's dict.
        scan_dict = dict(scan_dict) if scan_dict else {}
        if KEY_TABLE_NAME not in scan_dict:
            scan_dict[KEY_TABLE_NAME] = self.default_test_table_name
        scan_configuration_parser = ScanYmlParser(scan_dict, 'test-scan')
        scan_configuration_parser.assert_no_warnings_or_errors()
        scan = warehouse.create_scan(
            scan_yml=scan_configuration_parser.scan_yml)
        # Keep the warehouse open: it is used again below for the metric query.
        scan.close_warehouse = False
        scan.execute()

        group_by_column_names: List[str] = metric.get('groupBy') or []
        qualified_group_by: List[str] = [
            dialect.qualify_column_name(group_by_column)
            for group_by_column in group_by_column_names
        ]
        fields: List[str] = list(qualified_group_by)

        column_name: str = metric.get('columnName')
        # Metrics such as the row count have no column; do not qualify None.
        qualified_column_name = (dialect.qualify_column_name(column_name)
                                 if column_name else None)

        metric_type = metric['type']
        if metric_type == Metric.ROW_COUNT:
            fields.append('COUNT(*)')
        elif metric_type == Metric.MIN:
            fields.append(f'MIN({qualified_column_name})')
        elif metric_type == Metric.MAX:
            fields.append(f'MAX({qualified_column_name})')
        elif metric_type == Metric.SUM:
            fields.append(f'SUM({qualified_column_name})')

        sql = 'SELECT \n  ' + ',\n  '.join(fields) + ' \n' \
              'FROM ' + scan.qualified_table_name

        where_clauses = []

        metric_filter = metric.get('filter')
        if metric_filter:
            where_clauses.append(dialect.sql_expression(metric_filter))

        scan_column: ScanColumn = scan.scan_columns.get(column_name)
        if scan_column and scan_column.non_missing_and_valid_condition:
            where_clauses.append(scan_column.non_missing_and_valid_condition)

        if where_clauses:
            sql += '\nWHERE ' + '\n      AND '.join(where_clauses)

        if qualified_group_by:
            # Group by the same qualified names that appear in the SELECT list.
            sql += '\nGROUP BY ' + ', '.join(qualified_group_by)

        return warehouse.sql_fetchall(sql)
Example #4
0
    def scan(self,
             scan_yml_dict: Optional[dict] = None,
             variables: Optional[dict] = None) -> ScanResult:
        """Parse a scan configuration and execute the scan on ``self.warehouse``.

        Args:
            scan_yml_dict: optional scan configuration. The caller's dict is
                not modified; the default test table name is filled in on a
                shallow copy when ``KEY_TABLE_NAME`` is absent.
            variables: optional variables forwarded to ``create_scan``.

        Returns:
            The result of ``scan.execute()``.
        """
        # Shallow copy so that injecting the table name never leaks to the caller.
        scan_yml_dict = dict(scan_yml_dict) if scan_yml_dict else {}
        if KEY_TABLE_NAME not in scan_yml_dict:
            scan_yml_dict[KEY_TABLE_NAME] = self.default_test_table_name
        logging.debug('Scan configuration \n' + json.dumps(scan_yml_dict, indent=2))
        scan_configuration_parser = ScanYmlParser(scan_yml_dict, 'test-scan')
        scan_configuration_parser.assert_no_warnings_or_errors()

        scan = self.warehouse.create_scan(scan_yml=scan_configuration_parser.scan_yml,
                                          variables=variables,
                                          soda_server_client=self.mock_soda_server_client,
                                          time=datetime.now().isoformat(timespec='seconds'))
        # Leave the warehouse open after the scan has executed.
        scan.close_warehouse = False
        return scan.execute()
    def test_soda_server_client(self):
        """Execute a scan wired to a ``SodaServerClient`` on localhost:5000.

        Recreates the test table with a single integer ``name`` column holding
        1, 2, 3 and a null, enables every metric group, adds a table test, a
        SQL metric and a column test, then runs the scan and returns its
        result.
        """
        integer_type = self.dialect.data_type_integer
        self.sql_recreate_table([f"name {integer_type}"],
                                ["(1)", "(2)", "(3)", "(null)"])

        table_name = self.default_test_table_name

        metric_groups = [
            Metric.METRIC_GROUP_MISSING,
            Metric.METRIC_GROUP_VALIDITY,
            Metric.METRIC_GROUP_DUPLICATES,
            Metric.METRIC_GROUP_STATISTICS,
            Metric.METRIC_GROUP_LENGTH,
            Metric.METRIC_GROUP_PROFILING,
        ]
        zero_sql_metric = {
            SQL_METRIC_KEY_SQL: f'SELECT 0 AS zero FROM {table_name}',
            SQL_METRIC_KEY_TESTS: ['zero == 0'],
        }
        name_column = {
            COLUMN_KEY_TESTS: [f'{Metric.MISSING_COUNT} < 1'],
        }

        scan_yml_dict = {
            KEY_TABLE_NAME: table_name,
            KEY_METRIC_GROUPS: metric_groups,
            'tests': [f'{Metric.ROW_COUNT} > 0'],
            KEY_SQL_METRICS: [zero_sql_metric],
            KEY_COLUMNS: {'name': name_column},
        }

        parser = ScanYmlParser(scan_yml_dict, 'test-scan')
        parser.assert_no_warnings_or_errors()

        client = SodaServerClient(host='localhost',
                                  port='5000',
                                  protocol='http',
                                  token='testtoken')

        scan = self.warehouse.create_scan(scan_yml=parser.scan_yml,
                                          soda_server_client=client,
                                          time=datetime.now().isoformat())

        # Leave the warehouse open after the scan has executed.
        scan.close_warehouse = False
        return scan.execute()
Example #6
0
    def test_invalid_column_metric(self):
        """The first parser log for the metric 'revenue' is a warning naming it."""
        scan_dict = {KEY_TABLE_NAME: 't', KEY_METRICS: ['revenue']}
        parser = ScanYmlParser(scan_dict, 'Test scan')

        first_log = parser.logs[0]
        self.assertIn(WARNING, first_log.level)
        # The message must name the problem, the offending key and the value.
        for expected_fragment in ('Invalid key', 'metrics', 'revenue'):
            self.assertIn(expected_fragment, first_log.message)
Example #7
0
    def test_metrics_not_a_list(self):
        """A string under the metrics key yields an error log as first log."""
        scan_dict = {KEY_TABLE_NAME: 't', KEY_METRICS: 'txt'}
        parser = ScanYmlParser(scan_dict, 'Test scan')

        first_log = parser.logs[0]
        self.assertIn(ERROR, first_log.level)
        # The message must mention the problem plus the words 'list' and 'str'.
        for expected_fragment in ('Invalid metrics', 'list', 'str'):
            self.assertIn(expected_fragment, first_log.message)
Example #8
0
    def test_invalid_valid_format(self):
        """A ``valid_format`` of 'buzz' yields a warning log as first log."""
        column_config = {'valid_format': 'buzz'}
        scan_dict = {
            KEY_TABLE_NAME: 't',
            KEY_COLUMNS: {'col': column_config},
        }
        parser = ScanYmlParser(scan_dict, 'Test scan')

        first_log = parser.logs[0]
        self.assertIn(WARNING, first_log.level)
        # The message must name the problem, the setting and the rejected value.
        for expected_fragment in ('Invalid', 'valid_format', 'buzz'):
            self.assertIn(expected_fragment, first_log.message)
Example #9
0
    KEY_METRICS: [
        'missing', 'validity', 'min', 'max', 'avg', 'sum', 'min_length',
        'max_length', 'avg_length'
    ],
    KEY_COLUMNS: {
        'ID': {
            KEY_METRICS: ['distinct', 'uniqueness'],
            'tests': {
                'nomissing': 'missing_percentage < 3.0',
                'noinvalid': 'invalid_count == 0'
            }
        }
    }
}

# Parse the scan configuration dict under the name 'demodata-scan'.
# NOTE(review): assert_no_warnings_or_errors() presumably fails when the
# parser logged warnings or errors -- confirm against ScanYmlParser.
scan_configuration_parser = ScanYmlParser(scan_configuration_dict,
                                          'demodata-scan')
scan_configuration_parser.assert_no_warnings_or_errors()

# Build a warehouse on top of the 'postgres' dialect created by SqlTestCase.
dialect = SqlTestCase.create_dialect('postgres')
warehouse_yml = WarehouseYml(dialect=dialect)
warehouse = Warehouse(warehouse_yml)

# Fetch the smallest and largest value of the date column in demodata.
row = warehouse.sql_fetchone('SELECT MIN(date), MAX(date) FROM demodata')
min_date = row[0]
max_date = row[1]

# Starts empty; presumably filled by the loop below -- confirm.
scan_results = []

# Loop cursor for the while loop that follows; starts at the earliest date.
date = min_date
while date != max_date:
    timeslice = datetime(year=date.year, month=date.month,
Example #10
0
 def test_table_name_required(self):
     """An empty scan dict yields an error log about the missing table_name."""
     empty_scan_dict = {}
     first_log = ScanYmlParser(empty_scan_dict, 'Test scan').logs[0]
     self.assertIn(ERROR, first_log.level)
     # The message must name the key and state that it does not exist.
     for expected_fragment in ('table_name', 'does not exist'):
         self.assertIn(expected_fragment, first_log.message)