Beispiel #1
0
def build_warehouse_yml_parser(
        warehouse_yml_file: Optional[str] = None,
        warehouse_yml_dict: Optional[dict] = None) -> WarehouseYmlParser:
    """
    Build a warehouse yml parser.

    Parameters
    ----------
    warehouse_yml_file : Optional[str], optional (default: None)
        A warehouse yml file.
    warehouse_yml_dict : Optional[dict], optional (default: None)
        A warehouse yml dict.

    Returns
    -------
    out : WarehouseYmlParser
        The warehouse yml parser.
    """
    if not warehouse_yml_dict:
        if isinstance(warehouse_yml_file, pathlib.PurePath):
            warehouse_yml_file_str = str(warehouse_yml_file)
        elif isinstance(warehouse_yml_file, str):
            warehouse_yml_file_str = warehouse_yml_file
        else:
            logger.error(
                "scan_builder.warehouse_yml_file must be an instance of Purepath or str, "
                f"but was {type(warehouse_yml_file)}: {warehouse_yml_file}")

        warehouse_yml_dict = read_warehouse_yml_file(warehouse_yml_file_str)

    warehouse_yml_parser = WarehouseYmlParser(warehouse_yml_dict,
                                              warehouse_yml_file)

    return warehouse_yml_parser
Beispiel #2
0
    def _build_warehouse_yml(self):
        if not self.warehouse_yml_file and not self.warehouse_yml_dict and not self.warehouse_yml:
            logging.error(f'No warehouse specified')
            return

        elif self.warehouse_yml_file and not self.warehouse_yml_dict and not self.warehouse_yml:
            if not isinstance(self.warehouse_yml_file, str):
                logging.error(
                    f'scan_builder.warehouse_yml_file must be str, but was {type(self.warehouse_yml_file)}: {self.warehouse_yml_file}'
                )
            else:
                self.warehouse_yml_dict = read_warehouse_yml_file(
                    self.warehouse_yml_file)

        if self.warehouse_yml_dict and not self.warehouse_yml:
            from sodasql.scan.warehouse_yml_parser import WarehouseYmlParser
            self.parse_warehouse_yml(
                WarehouseYmlParser(self.warehouse_yml_dict,
                                   self.warehouse_yml_file))
Beispiel #3
0
def init(warehouse_file: str):
    """
    Finds tables in the warehouse and creates scan YAML files based on the data in the table. By default it creates
    files in a subdirectory called "tables" on the same level as the warehouse file.

    WAREHOUSE_FILE contains the connection details to the warehouse. This file can be created using the `soda create` command.
    The warehouse file argument is optional and defaults to 'warehouse.yml'.
    """
    logging.info(SODA_SQL_VERSION)
    file_system = FileSystemSingleton.INSTANCE
    warehouse = None

    try:
        logging.info(f'Initializing {warehouse_file} ...')

        warehouse_yml_dict = read_warehouse_yml_file(warehouse_file)
        warehouse_yml_parser = WarehouseYmlParser(warehouse_yml_dict,
                                                  warehouse_file)
        warehouse = Warehouse(warehouse_yml_parser.warehouse_yml)

        logging.info('Querying warehouse for tables')
        warehouse_dir = file_system.dirname(warehouse_file)
        scan_initializer = ScanInitializer(warehouse, warehouse_dir)
        scan_initializer.initialize_scan_ymls()

        logging.info(
            f"Next run 'soda scan {warehouse_file} {scan_initializer.first_table_scan_yml_file}' to calculate measurements and run tests"
        )

    except Exception as e:
        logging.exception(f'Exception: {str(e)}')
        return 1

    finally:
        if warehouse and warehouse.connection:
            try:
                warehouse.connection.close()
            except Exception as e:
                logging.debug(f'Closing connection failed: {str(e)}')
Beispiel #4
0
def analyze(warehouse_file: str, include: str, exclude: str):
    """
    Analyzes tables in the warehouse and creates scan YAML files based on the data in the table. By default it creates
    files in a subdirectory called "tables" on the same level as the warehouse file.

    WAREHOUSE_FILE contains the connection details to the warehouse. This file can be created using the `soda create` command.
    The warehouse file argument is optional and defaults to 'warehouse.yml'.
    """
    logging.info(SODA_SQL_VERSION)
    file_system = FileSystemSingleton.INSTANCE
    warehouse = None

    try:
        logging.info(f'Analyzing {warehouse_file} ...')

        warehouse_yml_dict = read_warehouse_yml_file(warehouse_file)
        warehouse_yml_parser = WarehouseYmlParser(warehouse_yml_dict,
                                                  warehouse_file)
        warehouse = Warehouse(warehouse_yml_parser.warehouse_yml)

        logging.info('Querying warehouse for tables')
        warehouse_dir = file_system.dirname(warehouse_file)

        file_system = FileSystemSingleton.INSTANCE

        def fileify(name: str):
            return re.sub(r'[^A-Za-z0-9_.]+', '_', name).lower()

        table_dir = file_system.join(warehouse_dir, 'tables')
        if not file_system.file_exists(table_dir):
            logging.info(f'Creating tables directory {table_dir}')
            file_system.mkdirs(table_dir)
        else:
            logging.info(f'Directory {table_dir} already exists')

        first_table_scan_yml_file = None

        dialect = warehouse.dialect
        tables_metadata_query = dialect.sql_tables_metadata_query()
        rows = warehouse.sql_fetchall(tables_metadata_query)

        table_include_regex = create_table_filter_regex(include)
        table_exclude_regex = create_table_filter_regex(exclude)

        for row in rows:
            table_name = row[0]

            if (matches_table_include(table_name, table_include_regex) and
                    matches_table_exclude(table_name, table_exclude_regex)):
                dataset_analyzer = DatasetAnalyzer()
                dataset_analyze_results = dataset_analyzer.analyze(
                    warehouse, table_name)

                table_scan_yaml_file = file_system.join(
                    table_dir, f'{fileify(table_name)}.yml')

                if not first_table_scan_yml_file:
                    first_table_scan_yml_file = table_scan_yaml_file

                if file_system.file_exists(table_scan_yaml_file):
                    logging.info(
                        f"Scan file {table_scan_yaml_file} already exists")
                else:
                    logging.info(f"Creating {table_scan_yaml_file} ...")
                    from sodasql.scan.scan_yml_parser import (
                        KEY_METRICS, KEY_TABLE_NAME, KEY_TESTS, KEY_COLUMNS,
                        COLUMN_KEY_VALID_FORMAT, COLUMN_KEY_TESTS)
                    scan_yaml_dict = {
                        KEY_TABLE_NAME:
                        table_name,
                        KEY_METRICS: [Metric.ROW_COUNT] +
                        Metric.METRIC_GROUPS[Metric.METRIC_GROUP_MISSING] +
                        Metric.METRIC_GROUPS[Metric.METRIC_GROUP_VALIDITY] +
                        Metric.METRIC_GROUPS[Metric.METRIC_GROUP_LENGTH] +
                        Metric.METRIC_GROUPS[Metric.METRIC_GROUP_STATISTICS],
                        KEY_TESTS: ['row_count > 0']
                    }

                    columns = {}
                    for column_analysis_result in dataset_analyze_results:
                        if column_analysis_result.validity_format:
                            column_yml = {
                                COLUMN_KEY_VALID_FORMAT:
                                column_analysis_result.validity_format
                            }
                            values_count = column_analysis_result.values_count
                            valid_count = column_analysis_result.valid_count
                            if valid_count > (values_count * .8):
                                valid_percentage = valid_count * 100 / values_count
                                invalid_threshold = (100 -
                                                     valid_percentage) * 1.1
                                invalid_threshold_rounded = ceil(
                                    invalid_threshold)
                                invalid_comparator = '==' if invalid_threshold_rounded == 0 else '<='
                                column_yml[COLUMN_KEY_TESTS] = [
                                    f'invalid_percentage {invalid_comparator} {str(invalid_threshold_rounded)}'
                                ]
                                columns[column_analysis_result.
                                        column_name] = column_yml

                    if columns:
                        scan_yaml_dict[KEY_COLUMNS] = columns

                    scan_yml_str = yaml.dump(scan_yaml_dict,
                                             sort_keys=False,
                                             Dumper=IndentingDumper,
                                             default_flow_style=False)
                    file_system.file_write_from_str(table_scan_yaml_file,
                                                    scan_yml_str)
            else:
                logging.info(f"Skipping table {table_name}")

        logging.info(
            f"Next run 'soda scan {warehouse_file} {first_table_scan_yml_file}' to calculate measurements and run tests"
        )

    except Exception as e:
        logging.exception(f'Exception: {str(e)}')
        sys.exit(1)

    finally:
        if warehouse and warehouse.connection:
            try:
                warehouse.connection.close()
            except Exception as e:
                logging.debug(f'Closing connection failed: {str(e)}')