def _set_overriding_params_for_node(self, relation: Relation,
                                    configs: Configuration) -> Relation:
    """Finds and applies specific params from config.

    If multiple conflicting specific params are found, they will be applied
    in descending order from the originating replica file.

    Args:
        relation: A :class:`Relation <snowshu.core.models.relation.Relation>` to be tested for specific configs.
        configs: :class:`Configuration <snowshu.core.configuration_parser.Configuration>` object to search for matches and specified params.

    Returns:
        The :class:`Relation <snowshu.core.models.relation.Relation>` with all updated params applied.
    """
    for pattern in configs.specified_relations:
        if single_full_pattern_match(relation, pattern):
            for attr in ('unsampled', 'include_outliers',):
                pattern_val = getattr(pattern, attr, None)
                if pattern_val is not None:
                    relation.__dict__[attr] = pattern_val

            if getattr(pattern, 'sampling', None) is not None:
                relation.sampling = pattern.sampling
    return relation
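# A minimal, self-contained sketch of the override precedence described in the
# docstring above, using plain namespaces in place of real Configuration patterns.
# The names and objects here are hypothetical stand-ins, not part of the SnowShu
# API: a value of None means "leave the relation as-is", and later matching
# patterns win for any attribute they set.
from types import SimpleNamespace

relation_stub = SimpleNamespace(unsampled=False, include_outliers=False)
matching_patterns = [
    SimpleNamespace(unsampled=True, include_outliers=None),   # earlier pattern
    SimpleNamespace(unsampled=None, include_outliers=True),   # later pattern
]

for pattern in matching_patterns:
    for attr in ('unsampled', 'include_outliers'):
        value = getattr(pattern, attr, None)
        if value is not None:
            setattr(relation_stub, attr, value)

print(relation_stub.unsampled, relation_stub.include_outliers)  # True True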
def test_directional_statement():
    sf = SnowflakeAdapter()
    DATABASE, SCHEMA, TABLE, LOCAL_KEY, REMOTE_KEY = [
        rand_string(10) for _ in range(5)
    ]
    relation = Relation(database=DATABASE,
                        schema=SCHEMA,
                        name=TABLE,
                        materialization=TABLE,
                        attributes=[])
    relation.core_query = f"""
        SELECT * FROM
            {DATABASE}.{SCHEMA}.{TABLE}
        SAMPLE BERNOULLI (10)
    """
    statement = sf.predicate_constraint_statement(relation, True, LOCAL_KEY,
                                                  REMOTE_KEY)
    assert query_equalize(statement) == query_equalize(f"""
        {LOCAL_KEY} IN (
            SELECT
                {REMOTE_KEY} AS {LOCAL_KEY}
            FROM (
                SELECT * FROM
                    {DATABASE}.{SCHEMA}.{TABLE}
                SAMPLE BERNOULLI (10)
            ))
    """)
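# These assertions compare SQL via query_equalize, a test helper that makes the
# multi-line f-strings comparable regardless of layout. A minimal sketch of such
# a whitespace normalizer follows (an assumption for illustration; the actual
# helper in the test suite may be implemented differently):
import re

def query_equalize_sketch(query: str) -> str:
    # Collapse every run of whitespace to a single space and trim the ends,
    # so two queries that differ only in formatting compare equal.
    return re.sub(r"\s+", " ", query).strip()

assert query_equalize_sketch("SELECT *\n  FROM foo") == query_equalize_sketch("SELECT * FROM foo")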
def _set_globals_for_node(self, relation: Relation,
                          configs: Configuration) -> Relation:
    """Sets the initial (default) node values from the config.

    Args:
        relation: the :class:`Relation <snowshu.core.models.relation>` to set values of.
        configs: the :class:`Configuration <snowshu.core.configuration_parser.Configuration>` object to derive default values from.

    Returns:
        The updated :class:`Relation <snowshu.core.models.relation>`
    """
    relation.sampling = configs.sampling
    relation.include_outliers = configs.include_outliers
    relation.max_number_of_outliers = configs.max_number_of_outliers
    return relation
def get_relations_from_database(self, database: str) -> List[Relation]:
    relations_sql = f"""
        SELECT
            m.table_schema AS schema,
            m.table_name AS relation,
            m.table_type AS materialization,
            c.column_name AS attribute,
            c.ordinal_position AS ordinal,
            c.data_type AS data_type
        FROM
            {database}.INFORMATION_SCHEMA.TABLES m
        INNER JOIN
            {database}.INFORMATION_SCHEMA.COLUMNS c
        ON
            c.table_schema = m.table_schema
        AND
            c.table_name = m.table_name
        WHERE
            m.table_schema <> 'INFORMATION_SCHEMA'
    """
    logger.debug(
        f'Collecting detailed relations from database {database}...')
    relations_frame = self._safe_query(relations_sql)
    unique_relations = (relations_frame['schema'] + '.' +
                        relations_frame['relation']).unique().tolist()
    logger.debug(
        f'Done collecting relations. Found a total of {len(unique_relations)} '
        f'unique relations in database {database}')
    relations = list()
    for relation in unique_relations:
        logger.debug(f'Building relation {database + "." + relation}...')
        attributes = list()
        for attribute in relations_frame.loc[(
                relations_frame['schema'] + '.' +
                relations_frame['relation']) == relation].itertuples():
            logger.debug(
                f'adding attribute {attribute.attribute} to relation..')
            attributes.append(
                Attribute(self._correct_case(attribute.attribute),
                          self._get_data_type(attribute.data_type)))

        relation = Relation(
            self._correct_case(database),
            self._correct_case(attribute.schema),
            self._correct_case(attribute.relation),
            self.MATERIALIZATION_MAPPINGS[attribute.materialization],
            attributes)
        logger.debug(f'Added relation {relation.dot_notation} to pool.')
        relations.append(relation)

    logger.debug(
        f'Acquired {len(relations)} total relations from database {database}.')
    return relations
def test_sample_statement():
    sf = SnowflakeAdapter()
    DATABASE, SCHEMA, TABLE = [rand_string(10) for _ in range(3)]
    relation = Relation(database=DATABASE,
                        schema=SCHEMA,
                        name=TABLE,
                        materialization=TABLE,
                        attributes=[])
    sample = sf.sample_statement_from_relation(
        relation, BernoulliSampleMethod(10, units="probability"))
    assert query_equalize(sample) == query_equalize(f"""
        SELECT * FROM
            {DATABASE}.{SCHEMA}.{TABLE}
        SAMPLE BERNOULLI (10)
    """)
def test_analyze_wrap_statement():
    sf = SnowflakeAdapter()
    DATABASE, SCHEMA, NAME = [rand_string(10) for _ in range(3)]
    relation = Relation(database=DATABASE,
                        schema=SCHEMA,
                        name=NAME,
                        materialization=TABLE,
                        attributes=[])
    sql = "SELECT * FROM some_crazy_query"
    statement = sf.analyze_wrap_statement(sql, relation)
    assert query_equalize(statement) == query_equalize(f"""
        WITH
            {relation.scoped_cte('SNOWSHU_COUNT_POPULATION')} AS (
                SELECT
                    COUNT(*) AS population_size
                FROM
                    {relation.quoted_dot_notation}
            )
            ,{relation.scoped_cte('SNOWSHU_CORE_SAMPLE')} AS (
                {sql}
            )
            ,{relation.scoped_cte('SNOWSHU_CORE_SAMPLE_COUNT')} AS (
                SELECT
                    COUNT(*) AS sample_size
                FROM
                    {relation.scoped_cte('SNOWSHU_CORE_SAMPLE')}
            )
        SELECT
            s.sample_size AS sample_size
            ,p.population_size AS population_size
        FROM
            {relation.scoped_cte('SNOWSHU_CORE_SAMPLE_COUNT')} s
        INNER JOIN
            {relation.scoped_cte('SNOWSHU_COUNT_POPULATION')} p
        ON
            1=1
        LIMIT 1
    """)
def test_x00_replacement():
    adapter = PostgresAdapter(replica_metadata={})
    id_col = "id"
    content_col = "content"
    normal_val = "normal_value"
    weird_value = "weird\x00value"
    custom_replacement = "__CUSTOM_VALUE__"
    cols = [
        Attribute(id_col, data_types.BIGINT),
        Attribute(content_col, data_types.VARCHAR)
    ]

    # test default replacement
    relation = Relation("db", "schema", "relation", TABLE, cols)
    relation.data = DataFrame({
        id_col: [1, 2],
        content_col: [normal_val, weird_value]
    })
    fixed_relation = adapter.replace_x00_values(relation)
    assert all(fixed_relation.data.loc[fixed_relation.data[id_col] == 1,
                                       [content_col]] == normal_val)
    assert all(fixed_relation.data.loc[fixed_relation.data[id_col] == 2,
                                       [content_col]] == "weirdvalue")

    # test custom replacement
    adapter = PostgresAdapter(replica_metadata={},
                              pg_0x00_replacement=custom_replacement)
    relation = Relation("db", "schema", "relation", TABLE, cols)
    relation.data = DataFrame({
        id_col: [1, 2],
        content_col: [normal_val, weird_value]
    })
    fixed_relation = adapter.replace_x00_values(relation)
    assert all(fixed_relation.data.loc[fixed_relation.data[id_col] == 1,
                                       [content_col]] == normal_val)
    assert all(fixed_relation.data.loc[fixed_relation.data[id_col] == 2,
                                       [content_col]] == f"weird{custom_replacement}value")
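# The test above exercises replace_x00_values, which substitutes NUL bytes before
# data is loaded into Postgres, since Postgres text columns cannot store \x00.
# Below is a minimal sketch of such a replacement over the string columns of a
# pandas DataFrame; it is an illustration under that assumption, not the
# adapter's actual implementation.
import pandas as pd

def replace_x00_sketch(frame: pd.DataFrame, replacement: str = "") -> pd.DataFrame:
    # Substitute every NUL byte in string-typed columns with the configured replacement.
    for col in frame.select_dtypes(include="object").columns:
        frame[col] = frame[col].str.replace("\x00", replacement, regex=False)
    return frame

demo = pd.DataFrame({"content": ["normal_value", "weird\x00value"]})
print(replace_x00_sketch(demo)["content"].tolist())  # ['normal_value', 'weirdvalue']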
def _get_relations_from_database(
        self, schema_obj: BaseTargetAdapter._DatabaseObject) -> List[Relation]:
    quoted_database = schema_obj.full_relation.quoted(
        schema_obj.full_relation.database)  # quoted db name
    relation_database = schema_obj.full_relation.database  # case corrected db name
    case_sensitive_schema = schema_obj.case_sensitive_name  # case sensitive schema name
    relations_sql = f"""
        SELECT
            m.table_schema AS schema,
            m.table_name AS relation,
            m.table_type AS materialization,
            c.column_name AS attribute,
            c.ordinal_position AS ordinal,
            c.data_type AS data_type
        FROM
            {quoted_database}.information_schema.tables m
        INNER JOIN
            {quoted_database}.information_schema.columns c
        ON
            c.table_schema = m.table_schema
        AND
            c.table_name = m.table_name
        WHERE
            m.table_schema = '{case_sensitive_schema}'
        AND
            m.table_schema NOT IN ('information_schema', 'pg_catalog')
        AND
            m.table_type <> 'external'
    """
    logger.debug(
        f'Collecting detailed relations from database {quoted_database}...')
    relations_frame = self._safe_query(relations_sql, quoted_database)
    unique_relations = (relations_frame['schema'] + '.' +
                        relations_frame['relation']).unique().tolist()
    logger.debug(
        f'Done collecting relations. Found a total of {len(unique_relations)} '
        f'unique relations in database {quoted_database}')
    relations = list()
    for relation in unique_relations:
        logger.debug(
            f'Building relation {quoted_database + "." + relation}...')
        attributes = list()
        for attribute in relations_frame.loc[(
                relations_frame['schema'] + '.' +
                relations_frame['relation']) == relation].itertuples():
            logger.debug(
                f'adding attribute {attribute.attribute} to relation..')
            attributes.append(
                Attribute(self._correct_case(attribute.attribute),
                          self._get_data_type(attribute.data_type)))

        relation = Relation(
            relation_database,
            self._correct_case(attribute.schema),  # noqa pylint: disable=undefined-loop-variable
            self._correct_case(attribute.relation),  # noqa pylint: disable=undefined-loop-variable
            self.MATERIALIZATION_MAPPINGS[
                attribute.materialization.replace(" ", "_")],  # noqa pylint: disable=undefined-loop-variable
            attributes)
        logger.debug(f'Added relation {relation.dot_notation} to pool.')
        relations.append(relation)

    logger.debug(
        f'Acquired {len(relations)} total relations from database {quoted_database}.')
    return relations
def test_run_deps_mixed_multi_deps():
    r"""
        a --bidir--> c <--dir-- b
         \          / \
          \     bidir  dir
           \       |    |
            \      V    V
             dir-> d    e
    """
    relation_helper = RelationTestHelper()
    relation_a = Relation(name='rel_a', **relation_helper.rand_relation_helper())
    relation_a.attributes = [
        Attribute('col_a_c', dt.INTEGER),
        Attribute('col_a_d', dt.VARCHAR)
    ]
    relation_a.data = pd.DataFrame({
        "col_a_c": [1, 2, 3, 4, 5],
        "col_a_d": ["var_a_1", "var_a_2", "var_a_3", "var_a_1", "var_a_2"],
    })

    relation_b = Relation(name='rel_b', **relation_helper.rand_relation_helper())
    relation_b.attributes = [Attribute('col_b_c', dt.VARCHAR)]
    relation_b.data = pd.DataFrame({
        "col_b_c": ["val1", "val2", "val3", "val4", "val5"],
    })

    relation_c = Relation(name='rel_c', **relation_helper.rand_relation_helper())
    relation_c.attributes = [
        Attribute('col_c_ae', dt.INTEGER),
        Attribute('col_c_bd', dt.VARCHAR)
    ]
    relation_c.data = pd.DataFrame({
        "col_c_ae": [1, 1, 2, 2, 5, 5, 5],
        "col_c_bd": ["val1", "val1", "val2", "val2", "val5", "val5", "val5"]
    })

    relation_d = Relation(name='rel_d', **relation_helper.rand_relation_helper())
    relation_d.attributes = [
        Attribute('col_d_a', dt.INTEGER),
        Attribute('col_d_c', dt.INTEGER)
    ]

    relation_e = Relation(name='rel_e', **relation_helper.rand_relation_helper())
    relation_e.attributes = [Attribute('col_e_c', dt.INTEGER)]

    for relation in (relation_a, relation_b, relation_c, relation_d, relation_e,):
        relation = stub_out_sampling(relation)

    dag = nx.DiGraph()
    dag.add_edge(relation_a,
                 relation_c,
                 direction="bidirectional",
                 remote_attribute="col_a_c",
                 local_attribute="col_c_ae")
    dag.add_edge(relation_a,
                 relation_d,
                 direction="directional",
                 remote_attribute="col_a_d",
                 local_attribute="col_d_a")
    dag.add_edge(relation_b,
                 relation_c,
                 direction="directional",
                 remote_attribute="col_b_c",
                 local_attribute="col_c_bd")
    dag.add_edge(relation_c,
                 relation_d,
                 direction="bidirectional",
                 remote_attribute="col_c_bd",
                 local_attribute="col_d_c")
    dag.add_edge(relation_c,
                 relation_e,
                 direction="directional",
                 remote_attribute="col_c_ae",
                 local_attribute="col_e_c")

    adapter = SnowflakeAdapter()
    RuntimeSourceCompiler.compile_queries_for_relation(relation_a, dag, adapter, False)
    RuntimeSourceCompiler.compile_queries_for_relation(relation_b, dag, adapter, False)
    RuntimeSourceCompiler.compile_queries_for_relation(relation_c, dag, adapter, False)
    RuntimeSourceCompiler.compile_queries_for_relation(relation_d, dag, adapter, False)
    RuntimeSourceCompiler.compile_queries_for_relation(relation_e, dag, adapter, False)

    assert query_equalize(relation_a.compiled_query) == query_equalize(f"""
        WITH {relation_a.scoped_cte('SNOWSHU_FINAL_SAMPLE')} AS (
            SELECT * FROM {relation_a.quoted_dot_notation}
            WHERE col_a_c in (SELECT col_c_ae FROM {relation_c.quoted_dot_notation})
        )
        ,{relation_a.scoped_cte('SNOWSHU_DIRECTIONAL_SAMPLE')} AS (
            SELECT * FROM {relation_a.scoped_cte('SNOWSHU_FINAL_SAMPLE')} SAMPLE BERNOULLI (1500 ROWS)
        )
        SELECT * FROM {relation_a.scoped_cte('SNOWSHU_DIRECTIONAL_SAMPLE')}
    """)

    assert query_equalize(relation_b.compiled_query) == query_equalize(f"""
        SELECT * FROM {relation_b.quoted_dot_notation} SAMPLE BERNOULLI (1500 ROWS)
    """)

    assert query_equalize(relation_c.compiled_query) == query_equalize(f"""
        SELECT * FROM {relation_c.quoted_dot_notation}
        WHERE col_c_bd in (SELECT col_d_c FROM {relation_d.quoted_dot_notation})
        AND col_c_ae IN (1,2,3,4,5)
        AND col_c_bd IN ('val1','val2','val3','val4','val5')
    """)

    assert query_equalize(relation_d.compiled_query) == query_equalize(f"""
        SELECT * FROM {relation_d.quoted_dot_notation}
        WHERE col_d_a IN ('var_a_1','var_a_2','var_a_3')
        AND col_d_c IN ('val1','val2','val5')
    """)

    assert query_equalize(relation_e.compiled_query) == query_equalize(f"""
        WITH {relation_e.scoped_cte('SNOWSHU_FINAL_SAMPLE')} AS (
            SELECT * FROM {relation_e.quoted_dot_notation}
            WHERE col_e_c IN (1,2,5)
        )
        ,{relation_e.scoped_cte('SNOWSHU_DIRECTIONAL_SAMPLE')} AS (
            SELECT * FROM {relation_e.scoped_cte('SNOWSHU_FINAL_SAMPLE')} SAMPLE BERNOULLI (1500 ROWS)
        )
        SELECT * FROM {relation_e.scoped_cte('SNOWSHU_DIRECTIONAL_SAMPLE')}
    """)
def test_run_deps_bidirectional_multi_deps():
    """
        a --bidir--> c <--bidir-- b
    """
    relation_helper = RelationTestHelper()
    relation_a = Relation(name='rel_a', **relation_helper.rand_relation_helper())
    relation_a.attributes = [Attribute('col_a', dt.INTEGER)]
    relation_a.data = pd.DataFrame({"col_a": [1, 2, 3, 4, 5]})

    relation_b = Relation(name='rel_b', **relation_helper.rand_relation_helper())
    relation_b.attributes = [Attribute('col_b', dt.VARCHAR)]
    relation_b.data = pd.DataFrame({
        "col_b": ["val1", "val2", "val3", "val4", "val5"],
    })

    relation_c = Relation(name='rel_c', **relation_helper.rand_relation_helper())
    relation_c.attributes = [
        Attribute('col_c_a', dt.INTEGER),
        Attribute('col_c_b', dt.VARCHAR)
    ]

    for relation in (relation_a, relation_b, relation_c,):
        relation = stub_out_sampling(relation)

    dag = nx.DiGraph()
    dag.add_edge(relation_a,
                 relation_c,
                 direction="bidirectional",
                 remote_attribute="col_a",
                 local_attribute="col_c_a")
    dag.add_edge(relation_b,
                 relation_c,
                 direction="bidirectional",
                 remote_attribute="col_b",
                 local_attribute="col_c_b")

    adapter = SnowflakeAdapter()
    RuntimeSourceCompiler.compile_queries_for_relation(relation_a, dag, adapter, False)
    RuntimeSourceCompiler.compile_queries_for_relation(relation_b, dag, adapter, False)
    RuntimeSourceCompiler.compile_queries_for_relation(relation_c, dag, adapter, False)

    assert query_equalize(relation_a.compiled_query) == query_equalize(f"""
        WITH {relation_a.scoped_cte('SNOWSHU_FINAL_SAMPLE')} AS (
            SELECT * FROM {relation_a.quoted_dot_notation}
            WHERE col_a in (SELECT col_c_a FROM {relation_c.quoted_dot_notation})
        )
        ,{relation_a.scoped_cte('SNOWSHU_DIRECTIONAL_SAMPLE')} AS (
            SELECT * FROM {relation_a.scoped_cte('SNOWSHU_FINAL_SAMPLE')} SAMPLE BERNOULLI (1500 ROWS)
        )
        SELECT * FROM {relation_a.scoped_cte('SNOWSHU_DIRECTIONAL_SAMPLE')}
    """)

    assert query_equalize(relation_b.compiled_query) == query_equalize(f"""
        WITH {relation_b.scoped_cte('SNOWSHU_FINAL_SAMPLE')} AS (
            SELECT * FROM {relation_b.quoted_dot_notation}
            WHERE col_b in (SELECT col_c_b FROM {relation_c.quoted_dot_notation})
        )
        ,{relation_b.scoped_cte('SNOWSHU_DIRECTIONAL_SAMPLE')} AS (
            SELECT * FROM {relation_b.scoped_cte('SNOWSHU_FINAL_SAMPLE')} SAMPLE BERNOULLI (1500 ROWS)
        )
        SELECT * FROM {relation_b.scoped_cte('SNOWSHU_DIRECTIONAL_SAMPLE')}
    """)

    assert query_equalize(relation_c.compiled_query) == query_equalize(f"""
        SELECT * FROM {relation_c.quoted_dot_notation}
        WHERE col_c_a IN (1,2,3,4,5)
        AND col_c_b IN ('val1','val2','val3','val4','val5')
    """)
def stub_out_sampling(rel: Relation) -> Relation:
    rel.sampling = DefaultSampling()
    rel.sampling.sample_method = BernoulliSampleMethod(1500, units='rows')
    return rel