def test_sets_only_existing_adapters():
    """An unknown sampling name must raise during parsing; a known one must
    override the relation's default sampling."""
    shgraph = SnowShuGraph()
    test_relation = Relation(database='SNOWSHU_DEVELOPMENT',
                             schema='SOURCE_SYSTEM',
                             name='ORDER_ITEMS',
                             materialization=mz.TABLE,
                             attributes=[])
    test_relation.include_outliers = False
    test_relation.unsampled = False
    test_relation.sampling = DefaultSampling()

    config_dict = copy.deepcopy(CONFIGURATION)
    config_dict['preserve_case'] = True

    # a bogus sampling name should blow up while the config is parsed,
    # leaving the relation's default sampling untouched
    config_dict['source']['specified_relations'][1]['sampling'] = 'lucky_guess'
    with pytest.raises(AttributeError):
        config = ConfigurationParser().from_file_or_path(
            StringIO(yaml.dump(config_dict)))
        assert isinstance(test_relation.sampling, DefaultSampling)

    # a recognized sampling name should be applied to the node
    config_dict['source']['specified_relations'][1]['sampling'] = 'brute_force'
    config = ConfigurationParser().from_file_or_path(
        StringIO(yaml.dump(config_dict)))
    overridden = shgraph._set_overriding_params_for_node(test_relation, config)
    assert isinstance(overridden.sampling, BruteForceSampling)
def test_graph_difference_more_both_isolated_non_isolated_relations_source(
        stub_graph_set, stub_relation_set):
    """ Tests graph_difference returns graph with expected nodes
        if source graph has non-isolated and isolated nodes
        which are not present in target catalog
    """
    _, vals = stub_graph_set

    def fresh_relation():
        # a randomly named relation that exists in only one catalog
        return Relation(name=rand_string(10),
                        **stub_relation_set.rand_relation_helper())

    shared_relation = fresh_relation()
    source_catalog = [
        shared_relation,
        fresh_relation(),
        fresh_relation(),
        vals.downstream_relation,
        vals.upstream_relation,
        vals.birelation_right,
    ]
    target_catalog = {
        shared_relation,
        fresh_relation(),
        fresh_relation(),
        vals.birelation_left,
        vals.birelation_right,
    }

    config_dict = copy.deepcopy(BASIC_CONFIGURATION)
    directional_relationship = {
        "local_attribute": vals.directional_key,
        "database": ".*",
        "schema": ".*",
        "relation": ".*relation.*$",
        "remote_attribute": vals.directional_key,
    }
    config_dict["source"]["specified_relations"] = [{
        "database": vals.downstream_relation.database,
        "schema": vals.downstream_relation.schema,
        "relation": vals.downstream_relation.name,
        "relationships": {"directional": [directional_relationship]},
    }]
    config = ConfigurationParser().from_file_or_path(
        StringIO(yaml.dump(config_dict)))

    graph_builder = SnowShuGraph()
    with mock.MagicMock() as adapter_mock:
        adapter_mock.build_catalog.return_value = source_catalog
        config.source_profile.adapter = adapter_mock
        graph_builder.build_graph(config)
        # every source node except the one shared with the target should remain
        difference = SnowShuGraph.catalog_difference(graph_builder, target_catalog)
        assert list(difference.nodes) == source_catalog[1:]
def _get_filtered_schemas(
        self, filters: Iterable[dict]) -> List[_DatabaseObject]:
    """ Get all of the filtered schema structures based on the provided filters.

    Widens each relation-level filter to a schema-level filter (name -> ".*")
    and each schema-level filter to a database-level filter (schema -> ".*"),
    then returns the schema objects whose relations match at least one
    schema-level filter within a matching database.
    """
    # widen relation filters to schema scope, deduplicating in order
    schema_filters: List[dict] = []
    for original_filter in filters:
        widened = original_filter.copy()
        widened["name"] = ".*"
        if widened not in schema_filters:
            schema_filters.append(widened)

    # widen schema filters to database scope, deduplicating in order
    db_filters: List[dict] = []
    for schema_filter in schema_filters:
        widened = schema_filter.copy()
        widened["schema"] = ".*"
        if widened not in db_filters:
            db_filters.append(widened)

    databases = self._get_all_databases()
    database_relations = [
        Relation(self._correct_case(database), "", "", None, None)
        for database in databases
    ]
    filtered_databases = [
        db_relation for db_relation in database_relations
        if at_least_one_full_pattern_match(db_relation, db_filters)
    ]

    # collect every schema in each matching database, keeping only those
    # that satisfy at least one schema-level filter
    filtered_schemas: List[BaseSourceAdapter._DatabaseObject] = []
    for db_rel in filtered_databases:
        for schema in self._get_all_schemas(
                database=db_rel.quoted(db_rel.database)):
            candidate = BaseSourceAdapter._DatabaseObject(
                schema,
                Relation(db_rel.database, self._correct_case(schema),
                         "", None, None))
            if at_least_one_full_pattern_match(candidate.full_relation,
                                               schema_filters):
                filtered_schemas.append(candidate)
    return filtered_schemas
def _initialize_snowshu_meta_database(self) -> None:
    """Create the snowshu meta database/schema and load the replica_meta table
    with this replica's descriptive metadata."""
    self.create_database_if_not_exists('snowshu')
    self.create_schema_if_not_exists('snowshu', 'snowshu')
    meta_attributes = [
        Attribute(column, column_type)
        for column, column_type in (
            ('created_at', dt.TIMESTAMP_TZ),
            ('name', dt.VARCHAR),
            ('short_description', dt.VARCHAR),
            ('long_description', dt.VARCHAR),
        )
    ]
    meta_relation = Relation("snowshu", "snowshu", "replica_meta", mz.TABLE,
                             meta_attributes)
    # single-row frame describing this replica build
    meta_relation.data = pd.DataFrame([{
        'created_at': datetime.now(),
        'name': self.replica_meta['name'],
        'short_description': self.replica_meta['short_description'],
        'long_description': self.replica_meta['long_description'],
    }])
    self.create_and_load_relation(meta_relation)
def __init__(self):
    """Build the stub relation fixtures and join keys shared by graph tests."""
    self.downstream_relation = Relation(
        name='downstream_relation', **self.rand_relation_helper())
    self.upstream_relation = Relation(
        name='upstream_relation', **self.rand_relation_helper())
    self.iso_relation = Relation(
        name='iso_relation', **self.rand_relation_helper())
    self.birelation_left = Relation(
        name='birelation_left', **self.rand_relation_helper())
    self.birelation_right = Relation(
        name='birelation_right', **self.rand_relation_helper())
    self.view_relation = Relation(
        name='view_relation', **self.rand_relation_helper())

    # BUG FIX: these two assignments previously ended with a trailing comma,
    # which made each key a 1-element tuple instead of a string, so the
    # Attribute names built from them below were tuples.
    self.bidirectional_key_left = rand_string(10)
    self.bidirectional_key_right = rand_string(8)
    self.directional_key = rand_string(15)

    # update specifics
    self.view_relation.materialization = mz.VIEW

    for n in ('downstream_relation', 'upstream_relation',):
        self.__dict__[n].attributes = [
            Attribute(self.directional_key, dt.INTEGER)]

    self.birelation_right.attributes = [
        Attribute(self.bidirectional_key_right, dt.VARCHAR)]
    self.birelation_left.attributes = [
        Attribute(self.bidirectional_key_left, dt.VARCHAR)]

    # every stub relation starts with an empty compiled query
    for r in ('downstream_relation', 'upstream_relation', 'iso_relation',
              'birelation_left', 'birelation_right', 'view_relation',):
        self.__dict__[r].compiled_query = ''
def test_build_catalog():
    """build_catalog should keep relations matched by the include patterns
    (tables plus the explicitly listed view) and drop other ``*_view`` relations."""
    config_patterns = [
        dict(database="snowshu_development", schema=".*", name="(?i)^.*(?<!_view)$"),
        dict(database="snowshu_development", schema="source_system", name="order_items_view")
    ]
    mock_filtered_schema = [
        BaseSQLAdapter._DatabaseObject(
            "SOURCE_SYSTEM",
            Relation("snowshu_development", "source_system", "", None, None)),
        BaseSQLAdapter._DatabaseObject(
            "Cased_Schema",
            Relation("snowshu_development", "Cased_Schema", "", None, None)),
    ]
    included_relations = [
        # included tables
        Relation("snowshu_development", "source_system", "fake_table_1", mz.TABLE, []),
        Relation("snowshu_development", "Cased_Schema", "fake_table_2", mz.TABLE, []),
        # included view
        Relation("snowshu_development", "source_system", "order_items_view", mz.VIEW, []),
    ]
    excluded_relations = [
        # excluded _view
        Relation("snowshu_development", "source_system", "some_other_view", mz.VIEW, []),
        Relation("snowshu_development", "Cased_Schema", "another_view", mz.VIEW, []),
    ]
    mock_relations = included_relations + excluded_relations

    def mock_get_relations_func(schema_obj: BaseSQLAdapter._DatabaseObject):
        # return only the relations that live in the requested schema
        return [r for r in mock_relations
                if schema_obj.full_relation.schema == r.schema]

    # stubbed version of the BaseSourceAdapter with the required class vars
    class StubbedSourceAdapter(BaseSourceAdapter):
        REQUIRED_CREDENTIALS = []
        ALLOWED_CREDENTIALS = []
        MATERIALIZATION_MAPPINGS = {}
        DATA_TYPE_MAPPINGS = {}
        SUPPORTED_SAMPLE_METHODS = []

    with patch("snowshu.adapters.BaseSQLAdapter._get_filtered_schemas",
               return_value=mock_filtered_schema), \
         patch("snowshu.adapters.BaseSQLAdapter._get_relations_from_database",
               side_effect=mock_get_relations_func):
        # BUG FIX: previously instantiated an undefined name `StubbedAdapter`,
        # leaving the locally defined StubbedSourceAdapter stub unused.
        adapter = StubbedSourceAdapter()
        catalog = adapter.build_catalog(config_patterns, thread_workers=1)
        for r in excluded_relations:
            assert r not in catalog
        for r in included_relations:
            assert r in catalog
def test_get_relations_from_database(end_to_end):
    """The replica's snowshu.snowshu.replica_meta relation should be
    discoverable through build_catalog."""
    adapter = PostgresAdapter(replica_metadata={})
    if adapter.target != "localhost":
        # outside a local run, the replica lives on the integration-test host
        adapter._credentials.host = 'integration-test'
    config_patterns = [dict(database="snowshu", schema=".*", name=".*")]
    expected_attributes = [
        Attribute('created_at', data_types.TIMESTAMP_TZ),
        Attribute('config_json', data_types.JSON),
        Attribute('name', data_types.VARCHAR),
        Attribute('short_description', data_types.VARCHAR),
        Attribute('long_description', data_types.VARCHAR),
    ]
    expected_relation = Relation("snowshu", "snowshu", "replica_meta", TABLE,
                                 expected_attributes)
    catalog = adapter.build_catalog(config_patterns, thread_workers=1)
    # compare full attribute dicts rather than relying on Relation equality
    discovered = [rel.__dict__.items() for rel in catalog]
    assert expected_relation.__dict__.items() in discovered
def compile_queries_for_relation(
        relation: Relation,  # noqa pylint: disable=too-many-branches
        dag: networkx.Graph,
        source_adapter: Type[BaseSourceAdapter],
        analyze: bool) -> Relation:
    """ Generates the sql statements for the given relation

    Args:
        relation (Relation): the relation to generate the sql for
        dag (Graph): the connected dependency graph that contains the relation
        source_adapter (BaseSourceAdapter): the source adapter for the sql dialect
        analyze (bool): whether to generate sql statements for analyze or actual sampling

    Returns:
        Relation: the given relation with `compiled_query` populated
    """
    # views are reproduced verbatim: both queries are the view DDL statement
    if relation.is_view:
        relation.core_query, relation.compiled_query = [
            source_adapter.view_creation_statement(relation) for _ in range(2)
        ]
        return relation
    if relation.unsampled:
        # unsampled relations are copied in full, no constraints needed
        query = source_adapter.unsampled_statement(relation)
    else:
        do_not_sample = False
        predicates = list()          # AND-joined WHERE constraints
        unions = list()              # outlier UNION clauses
        polymorphic_predicates = []  # OR-joined, wrapped into one predicate below
        for child in dag.successors(relation):
            # parallel edges aren't currently supported
            edge = dag.edges[relation, child]
            if edge['direction'] == 'bidirectional':
                # bidirectional children constrain this relation upstream
                predicates.append(
                    source_adapter.upstream_constraint_statement(
                        child, edge['remote_attribute'],
                        edge['local_attribute']))
            if relation.include_outliers and edge[
                    'direction'] == 'polymorphic':
                logger.warning(
                    "Polymorphic relationships currently do not support including outliers. "
                    "Ignoring include_outliers flag for edge "
                    f"from {relation.dot_notation} to {child.dot_notation}. ")
            elif relation.include_outliers:
                unions.append(
                    source_adapter.union_constraint_statement(
                        relation, child, edge['remote_attribute'],
                        edge['local_attribute'],
                        relation.max_number_of_outliers))
        for parent in dag.predecessors(relation):
            edge = dag.edges[parent, relation]
            # if any incoming edge is bidirectional or polymorphic set do_not_sample flag
            # do_not_sample is set since those types are most likely already restricted
            do_not_sample = (edge['direction'] in (
                'bidirectional',
                'polymorphic',
            ) or do_not_sample)
            if edge['direction'] == 'polymorphic':
                # if the local type attribute is set, the constraint needs to account for it
                # otherwise we only need the normal predicate constraint
                if 'local_type_attribute' in edge:
                    local_type_override = edge['local_type_overrides'].get(
                        parent.dot_notation, None)
                    polymorphic_predicates.append(
                        source_adapter.polymorphic_constraint_statement(
                            parent, analyze, edge['local_attribute'],
                            edge['remote_attribute'],
                            edge['local_type_attribute'],
                            local_type_override))
                else:
                    polymorphic_predicates.append(
                        source_adapter.predicate_constraint_statement(
                            parent, analyze, edge['local_attribute'],
                            edge['remote_attribute']))
            else:
                predicates.append(
                    source_adapter.predicate_constraint_statement(
                        parent, analyze, edge['local_attribute'],
                        edge['remote_attribute']))
            if relation.include_outliers and edge[
                    'direction'] == 'polymorphic':
                logger.warning(
                    "Polymorphic relationships currently do not support including outliers. "
                    "Ignoring include_outliers flag for edge "
                    f"from {parent.dot_notation} to {relation.dot_notation}. ")
            elif relation.include_outliers:
                unions.append(
                    source_adapter.union_constraint_statement(
                        relation, parent, edge['local_attribute'],
                        edge['remote_attribute'],
                        relation.max_number_of_outliers))
        # if polymorphic predicates are set up, then generate the or predicate
        if polymorphic_predicates:
            full_polymorphic_predicate = " OR ".join(
                polymorphic_predicates)
            predicates.append(f"( {full_polymorphic_predicate} )")
        # sampling is skipped at this stage when predicates will restrict rows
        query = source_adapter.sample_statement_from_relation(
            relation,
            (None if predicates else relation.sampling.sample_method))
        if predicates:
            query += " WHERE " + ' AND '.join(predicates)
        query = source_adapter.directionally_wrap_statement(
            query, relation,
            (None if do_not_sample else relation.sampling.sample_method))
        if unions:
            # leading '' yields " UNION <stmt>" for each outlier clause
            query += " UNION ".join([''] + unions)
    relation.core_query = query
    if analyze:
        # wrap to return row counts instead of materializing the sample
        query = source_adapter.analyze_wrap_statement(query, relation)
    relation.compiled_query = query
    return relation
def compile_queries_for_relation(relation: Relation, dag: networkx.Graph,
                                 source_adapter: Type[BaseSourceAdapter],
                                 analyze: bool) -> Relation:
    """Generates and populates the compiled sql for each relation in a dag.

    Args:
        relation: the relation to generate the sql for
        dag: the connected dependency graph that contains the relation
        source_adapter: the source adapter for the sql dialect
        analyze: whether to generate analyze statements or actual sampling sql

    Returns:
        the given relation with ``core_query`` and ``compiled_query`` populated
    """
    query = str()
    # views are reproduced verbatim: both queries are the view DDL statement
    if relation.is_view:
        relation.core_query, relation.compiled_query = [
            source_adapter.view_creation_statement(relation) for _ in range(2)
        ]
        return relation
    if relation.unsampled:
        # unsampled relations are copied in full, no constraints needed
        query = source_adapter.unsampled_statement(relation)
    else:
        do_not_sample = False
        predicates = list()
        unions = list()
        for child in dag.successors(relation):
            # NOTE(review): dag.edges((relation, child), True) treats the pair
            # as an nbunch (edges incident to either node), not as a single
            # edge lookup — confirm this is intended vs dag.edges[relation, child].
            for edge in dag.edges((relation, child), True):
                edge_data = edge[2]
                if edge_data['direction'] == 'bidirectional':
                    predicates.append(
                        source_adapter.upstream_constraint_statement(
                            child, edge_data['remote_attribute'],
                            edge_data['local_attribute']))
                if relation.include_outliers:
                    unions.append(
                        source_adapter.union_constraint_statement(
                            relation, child, edge_data['remote_attribute'],
                            edge_data['local_attribute'],
                            relation.max_number_of_outliers))
        for parent in dag.predecessors(relation):
            for edge in dag.edges((
                    parent,
                    relation,
            ), True):
                edge_data = edge[2]
                # BUG FIX: previously `do_not_sample = ... == 'bidirectional'`
                # was overwritten on every edge, so a later non-bidirectional
                # edge re-enabled sampling. Accumulate the flag instead, so any
                # bidirectional incoming edge disables sampling.
                do_not_sample = (edge_data['direction'] == 'bidirectional'
                                 or do_not_sample)
                predicates.append(
                    source_adapter.predicate_constraint_statement(
                        parent, analyze, edge_data['local_attribute'],
                        edge_data['remote_attribute']))
                if relation.include_outliers:
                    unions.append(
                        source_adapter.union_constraint_statement(
                            relation, parent, edge_data['local_attribute'],
                            edge_data['remote_attribute'],
                            relation.max_number_of_outliers))
        # sampling is skipped at this stage when predicates will restrict rows
        query = source_adapter.sample_statement_from_relation(
            relation, (None if predicates else relation.sampling.sample_method))
        if predicates:
            query += " WHERE " + ' AND '.join(predicates)
        query = source_adapter.directionally_wrap_statement(
            query, relation,
            (None if do_not_sample else relation.sampling.sample_method))
        if unions:
            # leading '' yields " UNION <stmt>" for each outlier clause
            query += " UNION ".join([''] + unions)
    relation.core_query = query
    if analyze:
        # wrap to return row counts instead of materializing the sample
        query = source_adapter.analyze_wrap_statement(query, relation)
    relation.compiled_query = query
    return relation
schema='THING', relation='.*poly2$', relationships=dict(polymorphic=[ dict(local_attribute='id', local_type_attribute='', remote_attribute='parent_id', database='', schema='', relation='^poly_child_[0-9]_items$') ], )), ]), target=dict(adapter='default'), storage=dict(profile='default')) MOCKED_CATALOG = ( Relation('snowyes', 'thing', 'foo_suffix', mz.TABLE, []), Relation('SNOWYES', 'thing', 'bar_suffix', mz.TABLE, []), Relation('SNOWNO', 'THING', 'nevermatch_except_bidirectional', mz.TABLE, []), Relation('noperope', 'thing', 'foo_suffix', mz.TABLE, []), Relation('SNOWNO', 'THING', 'bar_suffix', mz.TABLE, []), Relation('SNOWNO', 'dont_match', 'nevermatch_except_bidirectional', mz.TABLE, []), Relation('snowno', 'thing', 'matches_in_directional', mz.TABLE, []), Relation('SNOWYES', 'thing', 'nevermatch_except_bidirectional', mz.TABLE, []), Relation('snowyes', 'thing', 'nevermatch_except_bidirectional', mz.TABLE, []), Relation('snowyes', 'thing', 'parent_poly', mz.TABLE, []), Relation('snowyes', 'thing', 'parent_poly2', mz.TABLE, []), Relation('snowyes', 'thing', 'poly_child_1_items', mz.TABLE, []),
schema='', relation='nevermatch_except_bidirectional') ], directional=[ dict(local_attribute='id', remote_attribute='id', database='snowno', schema='THING', relation='matches_in_directional') ])) ]), target=dict(adapter='default'), storage=dict(profile='default')) MOCKED_CATALOG = ( Relation('snowyes', 'thing', 'foo_suffix', mz.TABLE, []), Relation('SNOWYES', 'thing', 'bar_suffix', mz.TABLE, []), Relation('SNOWNO', 'THING', 'nevermatch_except_bidirectional', mz.TABLE, []), Relation('noperope', 'thing', 'foo_suffix', mz.TABLE, []), Relation('SNOWNO', 'THING', 'bar_suffix', mz.TABLE, []), Relation('SNOWNO', 'dont_match', 'nevermatch_except_bidirectional', mz.TABLE, []), Relation('snowno', 'thing', 'matches_in_directional', mz.TABLE, []), Relation('SNOWYES', 'thing', 'nevermatch_except_bidirectional', mz.TABLE, []), Relation('snowyes', 'thing', 'nevermatch_except_bidirectional', mz.TABLE, []), )
def __init__(self):
    """Construct the stub relations, join keys and attributes shared by
    the graph test-suite (wildcard, polymorphic and bidirectional cases)."""
    # randomly-placed relations, created in a fixed order
    for relation_name in ('downstream_relation', 'upstream_relation',
                          'iso_relation', 'birelation_left',
                          'birelation_right', 'view_relation',
                          'downstream_wildcard_relation_1',
                          'downstream_wildcard_relation_2'):
        setattr(self, relation_name,
                Relation(name=relation_name, **self.rand_relation_helper()))

    # each upstream wildcard relation shares its downstream partner's location
    for index in (1, 2):
        downstream = getattr(self, f'downstream_wildcard_relation_{index}')
        setattr(self, f'upstream_wildcard_relation_{index}',
                Relation(name=f'upstream_wildcard_relation_{index}',
                         schema=downstream.schema,
                         database=downstream.database,
                         materialization=mz.TABLE,
                         attributes=[]))

    self.parent_relation_childid_type = Relation(
        name='parent_relation_childid_type', **self.rand_relation_helper())
    self.parent_relation_parentid = Relation(
        name='parent_relation_parentid', **self.rand_relation_helper())
    for index in (1, 2, 3):
        setattr(self, f'child_relation_type_{index}',
                Relation(name=f'child_type_{index}_records',
                         **self.rand_relation_helper()))

    # random join keys
    self.bidirectional_key_left = rand_string(10)
    self.bidirectional_key_right = rand_string(8)
    self.directional_key = rand_string(15)
    self.parentid_key = rand_string(15)
    self.childid_key = rand_string(15)
    self.childtype_key = rand_string(15)
    self.child2override_key = rand_string(20)

    # update specifics
    self.view_relation.materialization = mz.VIEW

    for relation_name in ('downstream_relation', 'upstream_relation',
                          'downstream_wildcard_relation_1',
                          'downstream_wildcard_relation_2',
                          'upstream_wildcard_relation_1',
                          'upstream_wildcard_relation_2'):
        getattr(self, relation_name).attributes = [
            Attribute(self.directional_key, dt.INTEGER)
        ]
    for relation_name in ('child_relation_type_1', 'child_relation_type_2',
                          'child_relation_type_3'):
        getattr(self, relation_name).attributes = [
            Attribute(self.parentid_key, dt.VARCHAR),
            Attribute(self.childid_key, dt.VARCHAR)
        ]
    self.parent_relation_childid_type.attributes = [
        Attribute(self.childid_key, dt.VARCHAR),
        Attribute(self.childtype_key, dt.VARCHAR)
    ]
    self.parent_relation_parentid.attributes = [
        Attribute(self.parentid_key, dt.VARCHAR)
    ]
    self.birelation_right.attributes = [
        Attribute(self.bidirectional_key_right, dt.VARCHAR)
    ]
    self.birelation_left.attributes = [
        Attribute(self.bidirectional_key_left, dt.VARCHAR)
    ]

    # every stub relation starts with an empty compiled query
    for relation_name in ('downstream_relation', 'upstream_relation',
                          'iso_relation', 'birelation_left',
                          'birelation_right', 'view_relation',
                          'downstream_wildcard_relation_1',
                          'downstream_wildcard_relation_2',
                          'upstream_wildcard_relation_1',
                          'upstream_wildcard_relation_2',
                          'child_relation_type_1', 'child_relation_type_2',
                          'child_relation_type_3',
                          'parent_relation_childid_type',
                          'parent_relation_parentid'):
        getattr(self, relation_name).compiled_query = ''
def compile_queries_for_relation(relation: Relation, dag: networkx.Graph,
                                 source_adapter: Type[BaseSourceAdapter],
                                 analyze: bool) -> Relation:
    """ Generates the sql statements for the given relation

    Args:
        relation (Relation): the relation to generate the sql for
        dag (Graph): the connected dependency graph that contains the relation
        source_adapter (BaseSourceAdapter): the source adapter for the sql dialect
        analyze (bool): whether to generate sql statements for analyze or actual sampling

    Returns:
        Relation: the given relation with `compiled_query` populated
    """
    query = str()
    # views are reproduced verbatim: both queries are the view DDL statement
    if relation.is_view:
        relation.core_query, relation.compiled_query = [
            source_adapter.view_creation_statement(relation) for _ in range(2)
        ]
        return relation
    if relation.unsampled:
        # unsampled relations are copied in full, no constraints needed
        query = source_adapter.unsampled_statement(relation)
    else:
        do_not_sample = False
        predicates = list()  # AND-joined WHERE constraints
        unions = list()      # outlier UNION clauses
        for child in dag.successors(relation):
            # parallel edges aren't currently supported
            edge = dag.edges[relation, child]
            if edge['direction'] == 'bidirectional':
                # bidirectional children constrain this relation upstream
                predicates.append(
                    source_adapter.upstream_constraint_statement(
                        child, edge['remote_attribute'],
                        edge['local_attribute']))
            if relation.include_outliers:
                unions.append(
                    source_adapter.union_constraint_statement(
                        relation, child, edge['remote_attribute'],
                        edge['local_attribute'],
                        relation.max_number_of_outliers))
        for parent in dag.predecessors(relation):
            edge = dag.edges[parent, relation]
            # if any incoming edge is bidirectional set do_not_sample flag
            do_not_sample = (edge['direction'] == 'bidirectional'
                             or do_not_sample)
            predicates.append(
                source_adapter.predicate_constraint_statement(
                    parent, analyze, edge['local_attribute'],
                    edge['remote_attribute']))
            if relation.include_outliers:
                unions.append(
                    source_adapter.union_constraint_statement(
                        relation, parent, edge['local_attribute'],
                        edge['remote_attribute'],
                        relation.max_number_of_outliers))
        # sampling is skipped at this stage when predicates will restrict rows
        query = source_adapter.sample_statement_from_relation(
            relation,
            (None if predicates else relation.sampling.sample_method))
        if predicates:
            query += " WHERE " + ' AND '.join(predicates)
        query = source_adapter.directionally_wrap_statement(
            query, relation,
            (None if do_not_sample else relation.sampling.sample_method))
        if unions:
            # leading '' yields " UNION <stmt>" for each outlier clause
            query += " UNION ".join([''] + unions)
    relation.core_query = query
    if analyze:
        # wrap to return row counts instead of materializing the sample
        query = source_adapter.analyze_wrap_statement(query, relation)
    relation.compiled_query = query
    return relation