def __init__(self, warehouse_yml_dict: dict, warehouse_yml_path: str = 'warehouse-dict'):
    """Parse a warehouse YAML dict into `self.warehouse_yml` (a WarehouseYml).

    Reads the warehouse name, loads its env vars, then parses the optional
    `connection`, soda-account and ingest sections, validating keys as it goes.
    Parse problems are reported through the parser's error mechanism rather
    than raised.
    """
    super().__init__(description=warehouse_yml_path)

    # Guard clause: anything that is not a dict cannot be parsed at all.
    if not isinstance(warehouse_yml_dict, dict):
        self.error('No warehouse configuration provided')
        return

    self._push_context(object=warehouse_yml_dict, name=self.description)
    self.warehouse_yml = WarehouseYml()
    self.warehouse_yml.name = self.get_str_required(KEY_NAME)
    # Env vars are loaded per warehouse name so connection sections can
    # reference them.
    EnvVars.load_env_vars(self.warehouse_yml.name)

    connection_cfg = self.get_dict_required(KEY_CONNECTION)
    if connection_cfg:
        self._push_context(object=connection_cfg, name=KEY_CONNECTION)
        self.warehouse_yml.dialect = Dialect.create(self)
        if self.warehouse_yml.dialect:
            self.warehouse_yml.dialect.validate_connection()
        self._pop_context()

    soda_cfg = self.get_dict_optional(KEY_SODA_ACCOUNT)
    if soda_cfg:
        self._push_context(object=soda_cfg, name=KEY_SODA_ACCOUNT)
        self.warehouse_yml.soda_host = self.get_str_optional(SODA_KEY_HOST, 'cloud.soda.io')
        self.warehouse_yml.soda_port = self.get_int_optional(SODA_KEY_PORT, 443)
        self.warehouse_yml.soda_protocol = self.get_str_optional(SODA_KEY_PROTOCOL, 'https')
        # API credentials must come from env vars, not plain yml values.
        self.warehouse_yml.soda_api_key_id = self.get_str_required_env(SODA_KEY_API_KEY_ID)
        self.warehouse_yml.soda_api_key_secret = self.get_str_required_env(SODA_KEY_API_KEY_SECRET)
        self._pop_context()

    ingest_cfg = self.get_dict_optional(KEY_INGEST)
    if ingest_cfg:
        self._push_context(object=ingest_cfg, name=KEY_INGEST)
        self.warehouse_yml.dbt_cloud_api_token = self.get_str_optional(DBT_CLOUD_KEY_API_TOKEN)
        self._pop_context()

    self.check_invalid_keys(VALID_WAREHOUSE_KEYS)
def __get_valid_condition(self, column_metadata: ColumnMetadata, validity: Validity, dialect: Dialect):
    """Build the SQL predicate that a column value must satisfy to count as valid.

    Returns a tuple ``(condition_sql, is_empty)``: the parenthesised AND-joined
    condition (or '' when there is nothing to check) and a flag that is True
    when no validity clause applies.
    """
    col = dialect.qualify_column_name(column_metadata.name, column_metadata.data_type)
    if validity is None:
        return '', True

    clauses = []

    # Pattern-based checks only make sense for text columns.
    if self.is_text:
        if validity.format is not None:
            pattern = dialect.qualify_regex(Validity.FORMATS.get(validity.format))
            clauses.append(dialect.sql_expr_regexp_like(col, pattern))
        if validity.regex is not None:
            clauses.append(dialect.sql_expr_regexp_like(col, dialect.qualify_regex(validity.regex)))

    if validity.values is not None:
        clauses.append(dialect.sql_expr_in(col, dialect.sql_expr_list(column_metadata, validity.values)))

    if self.is_text:
        if validity.min_length is not None:
            clauses.append(f'{dialect.sql_expr_length(col)} >= {validity.min_length}')
        if validity.max_length is not None:
            clauses.append(f'{dialect.sql_expr_length(col)} <= {validity.max_length}')

    # Numeric bounds compare directly on number columns ...
    if self.is_number:
        if validity.min is not None:
            clauses.append(f'{col} >= {validity.min}')
        if validity.max is not None:
            clauses.append(f'{col} <= {validity.max}')

    # ... and via a text-to-number cast on text columns.
    if self.is_text:
        if validity.min is not None:
            clauses.append(f'{dialect.sql_expr_cast_text_to_number(col, self.validity_format)} >= {validity.min}')
        if validity.max is not None:
            clauses.append(f'{dialect.sql_expr_cast_text_to_number(col, self.validity_format)} <= {validity.max}')

    if not clauses:
        return '', True
    return '(' + ' AND '.join(clauses) + ')', False
def __get_missing_condition(cls, column_metadata: ColumnMetadata, missing: Missing, dialect: Dialect):
    """Build the SQL predicate that flags a column value as missing.

    NULL always counts as missing; an optional Missing configuration adds
    explicit values, a named format regex, or a custom regex. Returns a tuple
    ``(condition_sql, is_default)`` where the flag is True when only the
    implicit IS NULL clause applies.
    """
    col = dialect.qualify_column_name(column_metadata.name)
    conditions = [f'{col} IS NULL']
    if missing:
        if missing.values:
            in_list_sql = dialect.sql_expr_list(column_metadata, missing.values)
            conditions.append(f'{col} IN {in_list_sql}')
        if missing.format:
            # Named formats are looked up in the shared Missing.FORMATS table.
            pattern = dialect.qualify_regex(Missing.FORMATS.get(missing.format))
            conditions.append(dialect.sql_expr_regexp_like(col, pattern))
        if missing.regex:
            conditions.append(dialect.sql_expr_regexp_like(col, dialect.qualify_regex(missing.regex)))
    return " OR ".join(conditions), len(conditions) == 1
def __get_valid_condition(cls, column_metadata: ColumnMetadata, validity: Validity, dialect: Dialect):
    """Build the SQL predicate that a column value must satisfy to count as valid.

    Returns a tuple ``(condition_sql, is_empty)``: the parenthesised AND-joined
    condition (or '' when there is nothing to check) and a flag that is True
    when no validity clause applies.
    """
    qualified_column_name = dialect.qualify_column_name(column_metadata.name)
    if validity is None:
        return '', True
    validity_clauses = []
    if validity.format:
        format_regex = Validity.FORMATS.get(validity.format)
        qualified_regex = dialect.qualify_regex(format_regex)
        validity_clauses.append(
            dialect.sql_expr_regexp_like(qualified_column_name, qualified_regex))
    if validity.regex:
        qualified_regex = dialect.qualify_regex(validity.regex)
        validity_clauses.append(
            dialect.sql_expr_regexp_like(qualified_column_name, qualified_regex))
    if validity.values:
        valid_values_sql = dialect.literal_list(validity.values)
        validity_clauses.append(
            dialect.sql_expr_in(qualified_column_name, valid_values_sql))
    # Bound checks use `is not None` (not truthiness) so that a legitimate
    # bound of 0 is not silently ignored.
    if validity.min_length is not None:
        validity_clauses.append(
            f'{dialect.sql_expr_length(qualified_column_name)} >= {validity.min_length}')
    if validity.max_length is not None:
        validity_clauses.append(
            f'{dialect.sql_expr_length(qualified_column_name)} <= {validity.max_length}')
    if validity.min is not None:
        validity_clauses.append(f'{qualified_column_name} >= {validity.min}')
    if validity.max is not None:
        validity_clauses.append(f'{qualified_column_name} <= {validity.max}')
    if not validity_clauses:
        # Previously this fell through and returned '()', which is not valid
        # SQL; mirror the validity-is-None case instead (consistent with the
        # sibling implementation of this method).
        return '', True
    return '(' + ' AND '.join(validity_clauses) + ')', False
def create(warehouse_type: str, file: Optional[str], warehouse: Optional[str],
           database: Optional[str], username: Optional[str], password: Optional[str]):
    """
    Creates a new warehouse.yml file and prepares credentials in your ~/.soda/env_vars.yml
    Nothing will be overwritten or removed, only added if it does not exist yet.

    WAREHOUSE_TYPE is one of {postgres, snowflake, redshift, bigquery, athena}
    """
    try:
        """ Creates a warehouse.yml file """
        logging.info(f"Soda CLI version {SODA_SQL_VERSION}")
        file_system = FileSystemSingleton.INSTANCE

        # if not warehouse:
        #     warehouse_dir_parent, warehouse_dir_name = file_system.split(warehouse_dir)
        #     warehouse = warehouse_dir_name if warehouse_dir_name != '.' else warehouse_type

        # Imported locally, presumably to avoid a module-level import cycle —
        # TODO confirm.
        from sodasql.scan.dialect import ALL_WAREHOUSE_TYPES, Dialect
        dialect = Dialect.create_for_warehouse_type(warehouse_type)
        if not dialect:
            logging.info(
                f"Invalid warehouse type {warehouse_type}, use one of {str(ALL_WAREHOUSE_TYPES)}")
            sys.exit(1)

        # Only forward CLI options that were actually provided (isinstance
        # filters out None defaults).
        configuration_params = {}
        if isinstance(database, str):
            configuration_params['database'] = database
        if isinstance(username, str):
            configuration_params['username'] = username
        if isinstance(password, str):
            configuration_params['password'] = password
        connection_properties = dialect.default_connection_properties(configuration_params)
        warehouse_env_vars_dict = dialect.default_env_vars(configuration_params)

        # Write the warehouse yml file, unless one already exists (never
        # overwrite).
        if file_system.file_exists(file):
            logging.info(f"Warehouse file {file} already exists")
        else:
            logging.info(f"Creating warehouse YAML file {file} ...")
            file_system.mkdirs(file_system.dirname(file))
            if not warehouse:
                # Fall back to the warehouse type as the warehouse name.
                warehouse = warehouse_type
            warehouse_dict = {
                'name': warehouse,
                'connection': connection_properties
            }
            warehouse_yml_str = yaml.dump(warehouse_dict,
                                          default_flow_style=False,
                                          sort_keys=False)
            file_system.file_write_from_str(file, warehouse_yml_str)

        # Ensure ~/.soda exists, then add an env-vars section for this
        # warehouse (again, only if not already present).
        dot_soda_dir = file_system.join(file_system.user_home_dir(), '.soda')
        if not file_system.file_exists(dot_soda_dir):
            file_system.mkdirs(dot_soda_dir)
        env_vars_file = file_system.join(dot_soda_dir, 'env_vars.yml')
        env_vars_yml_str = ''
        env_vars_file_exists = file_system.file_exists(env_vars_file)
        if env_vars_file_exists:
            env_vars_yml_str = file_system.file_read_as_str(env_vars_file)
            existing_env_vars_yml_dict = yaml.load(env_vars_yml_str, Loader=yaml.SafeLoader)
            if isinstance(existing_env_vars_yml_dict, dict) and warehouse in existing_env_vars_yml_dict:
                logging.info(
                    f"Warehouse section {warehouse} already exists in {env_vars_file}. Skipping...")
                # None doubles as the "skip writing env vars" marker below.
                warehouse_env_vars_dict = None
        if warehouse_env_vars_dict:
            warehouse_env_vars_dict = {warehouse: warehouse_env_vars_dict}
            if len(env_vars_yml_str) > 0:
                env_vars_yml_str += '\n'
            env_vars_yml_str += yaml.dump(warehouse_env_vars_dict,
                                          default_flow_style=False,
                                          sort_keys=False)
            if env_vars_file_exists:
                logging.info(f"Adding env vars for {warehouse} to {env_vars_file}")
            else:
                logging.info(
                    f"Creating {env_vars_file} with example env vars in section {warehouse}")
            file_system.file_write_from_str(env_vars_file, env_vars_yml_str)

        # Tell the user what to review next.
        logging.info(f"Review warehouse.yml by running command")
        logging.info(f"  cat {file}")
        if warehouse_env_vars_dict:
            logging.info(
                f"Review section {warehouse} in ~/.soda/env_vars.yml by running command")
            logging.info(f"  cat ~/.soda/env_vars.yml")
        logging.info(f"Then run the soda analyze command")
    except Exception as e:
        # CLI boundary: log the full traceback and exit non-zero.
        logging.exception(f'Exception: {str(e)}')
        sys.exit(1)
class TestSqlExpressions(TestCase):
    """Tests Dialect.sql_expression: rendering of type-tagged expression dicts
    into SQL text, one test per node type."""

    # Shared across tests; sql_expression appears to be stateless.
    dialect = Dialect('test')

    def test_string(self):
        # String literals render single-quoted.
        self.assertEqual(
            "'hello'",
            self.dialect.sql_expression({
                'type': 'string',
                'value': 'hello'
            }))

    def test_number(self):
        # Both ints and floats render verbatim.
        self.assertEqual(
            "93",
            self.dialect.sql_expression({
                'type': 'number',
                'value': 93
            }))
        self.assertEqual(
            "93.11",
            self.dialect.sql_expression({
                'type': 'number',
                'value': 93.11
            }))

    def test_column_value(self):
        self.assertEqual(
            "col_name",
            self.dialect.sql_expression({
                'type': 'columnValue',
                'columnName': 'col_name'
            }))

    def test_collection(self):
        # Mixed string/number collections render as a parenthesised list.
        self.assertEqual(
            "('a','b',3,4)",
            self.dialect.sql_expression({
                'type': 'collection',
                'value': ['a', 'b', 3, 4]
            }))

    def test_equals(self):
        self.assertEqual(
            "name = 't'",
            self.dialect.sql_expression({
                'type': 'equals',
                'left': {
                    'type': 'columnValue',
                    'columnName': 'name'
                },
                'right': {
                    'type': 'string',
                    'value': 't'
                }
            }))

    def test_less_than(self):
        self.assertEqual(
            "name < 3",
            self.dialect.sql_expression({
                'type': 'lessThan',
                'left': {
                    'type': 'columnValue',
                    'columnName': 'name'
                },
                'right': {
                    'type': 'number',
                    'value': 3
                }
            }))

    def test_less_than_or_equals(self):
        self.assertEqual(
            "name <= 3",
            self.dialect.sql_expression({
                'type': 'lessThanOrEqual',
                'left': {
                    'type': 'columnValue',
                    'columnName': 'name'
                },
                'right': {
                    'type': 'number',
                    'value': 3
                }
            }))

    def test_greater_than(self):
        self.assertEqual(
            "name > 3",
            self.dialect.sql_expression({
                'type': 'greaterThan',
                'left': {
                    'type': 'columnValue',
                    'columnName': 'name'
                },
                'right': {
                    'type': 'number',
                    'value': 3
                }
            }))

    def test_greater_than_or_equals(self):
        self.assertEqual(
            "name >= 3",
            self.dialect.sql_expression({
                'type': 'greaterThanOrEqual',
                'left': {
                    'type': 'columnValue',
                    'columnName': 'name'
                },
                'right': {
                    'type': 'number',
                    'value': 3
                }
            }))

    def test_between(self):
        # Full range: inclusive lower bound (gte), exclusive upper bound (lt).
        self.assertEqual(
            "3 <= name AND name < 44",
            self.dialect.sql_expression({
                'type': 'between',
                'gte': 3,
                'value': {
                    'type': 'columnValue',
                    'columnName': 'name'
                },
                'lt': 44
            }))
        # One-sided: only an exclusive lower bound (gt).
        self.assertEqual(
            "3 < name",
            self.dialect.sql_expression({
                'type': 'between',
                'gt': 3,
                'value': {
                    'type': 'columnValue',
                    'columnName': 'name'
                }
            }))
        # One-sided: only an inclusive upper bound (lte).
        self.assertEqual(
            "name <= 44",
            self.dialect.sql_expression({
                'type': 'between',
                'value': {
                    'type': 'columnValue',
                    'columnName': 'name'
                },
                'lte': 44
            }))

    def test_in(self):
        self.assertEqual(
            "name IN ('a','b','c')",
            self.dialect.sql_expression({
                'type': 'in',
                'left': {
                    'type': 'columnValue',
                    'columnName': 'name'
                },
                'right': {
                    'type': 'collection',
                    'value': ['a', 'b', 'c']
                }
            }))

    def test_contains(self):
        # contains / startsWith / endsWith all map onto LIKE patterns.
        self.assertEqual(
            "name LIKE '%t%'",
            self.dialect.sql_expression({
                'type': 'contains',
                'left': {
                    'type': 'columnValue',
                    'columnName': 'name'
                },
                'right': {
                    'type': 'string',
                    'value': 't'
                }
            }))

    def test_starts_with(self):
        self.assertEqual(
            "name LIKE 't%'",
            self.dialect.sql_expression({
                'type': 'startsWith',
                'left': {
                    'type': 'columnValue',
                    'columnName': 'name'
                },
                'right': {
                    'type': 'string',
                    'value': 't'
                }
            }))

    def test_ends_with(self):
        self.assertEqual(
            "name LIKE '%t'",
            self.dialect.sql_expression({
                'type': 'endsWith',
                'left': {
                    'type': 'columnValue',
                    'columnName': 'name'
                },
                'right': {
                    'type': 'string',
                    'value': 't'
                }
            }))

    def test_not(self):
        # The inner expression is parenthesised before negation.
        self.assertEqual(
            "NOT (name = 't')",
            self.dialect.sql_expression({
                'type': 'not',
                'expression': {
                    'type': 'equals',
                    'left': {
                        'type': 'columnValue',
                        'columnName': 'name'
                    },
                    'right': {
                        'type': 'string',
                        'value': 't'
                    }
                }
            }))

    def test_and(self):
        # Each operand is parenthesised and joined with AND.
        self.assertEqual(
            "(name = 't') AND (size = 3)",
            self.dialect.sql_expression({
                'type': 'and',
                'andExpressions': [{
                    'type': 'equals',
                    'left': {
                        'type': 'columnValue',
                        'columnName': 'name'
                    },
                    'right': {
                        'type': 'string',
                        'value': 't'
                    }
                }, {
                    'type': 'equals',
                    'left': {
                        'type': 'columnValue',
                        'columnName': 'size'
                    },
                    'right': {
                        'type': 'number',
                        'value': 3
                    }
                }]
            }))

    def test_or(self):
        # Each operand is parenthesised and joined with OR.
        self.assertEqual(
            "(name = 't') OR (size = 3)",
            self.dialect.sql_expression({
                'type': 'or',
                'orExpressions': [{
                    'type': 'equals',
                    'left': {
                        'type': 'columnValue',
                        'columnName': 'name'
                    },
                    'right': {
                        'type': 'string',
                        'value': 't'
                    }
                }, {
                    'type': 'equals',
                    'left': {
                        'type': 'columnValue',
                        'columnName': 'size'
                    },
                    'right': {
                        'type': 'number',
                        'value': 3
                    }
                }]
            }))
def __init__(self, warehouse_connection_dict: dict):
    """Parse a bare warehouse connection dict and create its Dialect.

    Pushes the connection dict as parser context so Dialect.create can read
    the connection keys from this parser.
    """
    super().__init__('warehouse')
    self._push_context(warehouse_connection_dict, 'connection')
    # Dialect.create reads the connection settings via this parser instance.
    self.dialect = Dialect.create(self)
def obtain_datasource_hash(dialect: Dialect):
    """Return the dialect's safe hash identifying the datasource."""
    datasource_hash = dialect.generate_hash_safe()
    return datasource_hash
def create_dialect(target):
    """Build a Dialect for the given warehouse target type string."""
    parser = DialectParser({KEY_WAREHOUSE_TYPE: target})
    return Dialect.create(parser)