def test_split_dag_to_parallel():
    """Check how ``_split_dag_for_parallel`` groups the nodes of a DAG.

    The DAG is built from the edges 1->2, 1->4, 2->3 and 5->6; the split is
    expected to yield exactly the node groups {1, 2, 3, 4} and {5, 6}.
    """
    shgraph = SnowShuGraph()
    dag = nx.DiGraph()
    dag.add_edges_from([(1, 2), (1, 4), (2, 3), (5, 6)])

    node_groups = shgraph._split_dag_for_parallel(dag)

    # Compare as a set of frozensets so neither group order nor node order matters.
    expected_groups = {frozenset([1, 2, 4, 3]), frozenset([5, 6])}
    assert {frozenset(group) for group in node_groups} == expected_groups
def test_build_graph_fails_many_to_many(stub_graph_set):
    """ Tests build_graph raises when a specification is many-to-many """
    _, vals = stub_graph_set
    full_catalog = [
        vals.iso_relation,
        vals.view_relation,
        vals.downstream_relation,
        vals.upstream_relation,
        vals.birelation_left,
        vals.birelation_right,
    ]

    # Both sides of the relationship are patterns, so several local relations
    # are paired with several remote relations.
    upstream_spec = {
        "local_attribute": vals.directional_key,
        "database": ".*",
        "schema": ".*",
        "relation": ".*relation$",  # non birelations
        "remote_attribute": vals.directional_key,
    }
    downstream_spec = {
        "database": ".*",
        "schema": ".*",
        "relation": ".*relation_.*$",  # birelations
        "relationships": {"directional": [upstream_spec]},
    }
    config_dict = copy.deepcopy(BASIC_CONFIGURATION)
    config_dict["source"]["specified_relations"] = [downstream_spec]
    config = ConfigurationParser().from_file_or_path(
        StringIO(yaml.dump(config_dict)))

    shgraph = SnowShuGraph()
    with mock.MagicMock() as adapter_mock:
        # The mocked source adapter serves the stub catalog to build_graph.
        adapter_mock.build_catalog.return_value = full_catalog
        config.source_profile.adapter = adapter_mock
        with pytest.raises(InvalidRelationshipException) as exc:
            shgraph.build_graph(config)

        error_text = str(exc.value)
        assert "defines a many-to-many relationship" in error_text
        assert "Many-to-many relationship are not allowed by SnowShu" in error_text
def test_graph_allows_upstream_wildcards(stub_graph_set):
    """Check that a bidirectional relationship with empty database/schema still links.

    The upstream relation is moved into the downstream relation's database and
    schema, and the relationship is specified with ``database=''`` and
    ``schema=''``. The resulting graph must contain the upstream -> downstream edge.
    """
    shgraph = SnowShuGraph()
    _, vals = stub_graph_set
    downstream = vals.downstream_relation
    upstream = vals.upstream_relation

    # Put both relations in the same database and schema.
    upstream.database = downstream.database
    upstream.schema = downstream.schema

    full_catalog = [
        vals.iso_relation,
        vals.view_relation,
        downstream,
        upstream,
        vals.birelation_left,
        vals.birelation_right,
    ]

    bidirectional_spec = {
        'relation': upstream.name,
        'database': '',
        'schema': '',
        'local_attribute': downstream.attributes[0].name,
        'remote_attribute': upstream.attributes[0].name,
    }
    config_dict = copy.deepcopy(CONFIGURATION)
    config_dict['source']['specified_relations'] = [{
        'relation': downstream.name,
        'database': downstream.database,
        'schema': downstream.schema,
        'unsampled': False,
        'relationships': {
            'directional': [],
            'bidirectional': [bidirectional_spec],
        },
    }]
    config = ConfigurationParser().from_file_or_path(
        StringIO(yaml.dump(config_dict)))

    modified_graph = shgraph._apply_specifications(
        config, nx.DiGraph(), full_catalog)

    assert (upstream, downstream) in modified_graph.edges
def test_sets_only_existing_adapters():
    """Check that only known sampling names are accepted and applied.

    Parsing a configuration whose sampling is ``'lucky_guess'`` must raise
    ``AttributeError`` and leave the relation's sampling untouched; with
    ``'brute_force'`` the node override must yield ``BruteForceSampling``.
    """
    shgraph = SnowShuGraph()

    test_relation = Relation(database='SNOWSHU_DEVELOPMENT',
                             schema='SOURCE_SYSTEM',
                             name='ORDER_ITEMS',
                             materialization=mz.TABLE,
                             attributes=[])
    test_relation.include_outliers = False
    test_relation.unsampled = False
    test_relation.sampling = DefaultSampling()

    config_dict = copy.deepcopy(CONFIGURATION)
    config_dict['preserve_case'] = True
    second_spec = config_dict['source']['specified_relations'][1]

    # An unknown sampling name is rejected while the configuration is parsed.
    second_spec['sampling'] = 'lucky_guess'
    with pytest.raises(AttributeError):
        ConfigurationParser().from_file_or_path(
            StringIO(yaml.dump(config_dict)))
    assert isinstance(test_relation.sampling, DefaultSampling)

    # A known sampling name parses and is applied to the matching node.
    second_spec['sampling'] = 'brute_force'
    config = ConfigurationParser().from_file_or_path(
        StringIO(yaml.dump(config_dict)))
    overridden = shgraph._set_overriding_params_for_node(test_relation, config)
    assert isinstance(overridden.sampling, BruteForceSampling)
def test_unsampled(stub_graph_set):
    """Check that an ``unsampled=True`` specification is applied to its relation.

    The isolated relation starts with ``unsampled`` False; after
    ``_apply_specifications`` runs with a specification that names it and sets
    ``unsampled=True``, the flag on the relation must be True.
    """
    shgraph = SnowShuGraph()
    _, vals = stub_graph_set
    full_catalog = [
        vals.iso_relation,
        vals.view_relation,
        vals.downstream_relation,
        vals.upstream_relation,
        vals.birelation_left,
        vals.birelation_right,
    ]

    config_dict = copy.deepcopy(CONFIGURATION)
    config_dict['source']['specified_relations'] = [
        dict(relation=vals.iso_relation.name,
             database=vals.iso_relation.database,
             schema=vals.iso_relation.schema,
             unsampled=True)
    ]
    config = ConfigurationParser().from_file_or_path(
        StringIO(yaml.dump(config_dict)))

    # Precondition: the flag is not set before the specification is applied.
    assert vals.iso_relation.unsampled is False

    # Only the side effect on the catalog relations is checked, so the returned
    # graph is not kept; a single application is enough.
    shgraph._apply_specifications(config, nx.DiGraph(), full_catalog)

    assert vals.iso_relation.unsampled is True
def test_graph_difference_more_both_isolated_non_isolated_relations_source(
        stub_graph_set, stub_relation_set):
    """ Tests graph_difference keeps every source node missing from the target,
        when the source graph holds both isolated and non-isolated nodes that
        the target catalog does not contain """
    _, vals = stub_graph_set

    def random_relation():
        # Fresh relation with a random name, used as filler in either catalog.
        return Relation(name=rand_string(10),
                        **stub_relation_set.rand_relation_helper())

    common_relation = random_relation()
    source_catalog = [
        common_relation,
        random_relation(),
        random_relation(),
        vals.downstream_relation,
        vals.upstream_relation,
        vals.birelation_right,
    ]
    target_catalog = {
        common_relation,
        random_relation(),
        random_relation(),
        vals.birelation_left,
        vals.birelation_right,
    }

    downstream = vals.downstream_relation
    upstream_spec = {
        "local_attribute": vals.directional_key,
        "database": ".*",
        "schema": ".*",
        "relation": ".*relation.*$",
        "remote_attribute": vals.directional_key,
    }
    config_dict = copy.deepcopy(BASIC_CONFIGURATION)
    config_dict["source"]["specified_relations"] = [{
        "database": downstream.database,
        "schema": downstream.schema,
        "relation": downstream.name,
        "relationships": {"directional": [upstream_spec]},
    }]
    config = ConfigurationParser().from_file_or_path(
        StringIO(yaml.dump(config_dict)))

    shgraph = SnowShuGraph()
    with mock.MagicMock() as adapter_mock:
        adapter_mock.build_catalog.return_value = source_catalog
        config.source_profile.adapter = adapter_mock
        shgraph.build_graph(config)

        # Everything after the first source entry (the common relation) is expected back.
        expected_nodes = source_catalog[1:]
        actual = SnowShuGraph.catalog_difference(shgraph, target_catalog)
        assert list(actual.nodes) == expected_nodes
def test_included_and_excluded(adapter, target):
    """Check which entries of MOCKED_CATALOG end up as nodes in the built graph.

    With MOCKED_CONFIG, catalog entries 0, 1 and 6 must be present in the
    graph and entries 2 through 5 must be absent.
    """
    shgraph = SnowShuGraph()
    conf_obj = ConfigurationParser().from_file_or_path(
        StringIO(yaml.dump(MOCKED_CONFIG)))

    shgraph.build_graph(conf_obj, MOCKED_CATALOG)
    graph_nodes = shgraph.graph.nodes

    # Expected membership per catalog index, checked in catalog order.
    expected_membership = (True, True, False, False, False, False, True)
    for index, should_be_present in enumerate(expected_membership):
        if should_be_present:
            assert MOCKED_CATALOG[index] in graph_nodes
        else:
            assert MOCKED_CATALOG[index] not in graph_nodes
def test_graph_builds_dags_correctly(stub_graph_set):
    """Check that ``get_graphs`` returns only ``nx.DiGraph`` instances.

    The graph assigned to the SnowShuGraph is an undirected ``nx.Graph``
    holding the stub relations as nodes, with no edges.
    """
    _, vals = stub_graph_set
    catalog_nodes = [
        vals.iso_relation,
        vals.view_relation,
        vals.downstream_relation,
        vals.upstream_relation,
        vals.birelation_left,
        vals.birelation_right,
    ]

    undirected = nx.Graph()
    undirected.add_nodes_from(catalog_nodes)

    shgraph = SnowShuGraph()
    shgraph.graph = undirected

    assert all(isinstance(sub, nx.DiGraph) for sub in shgraph.get_graphs())
def test_included_and_excluded(target, adapter):
    """Check which MOCKED_CATALOG entries survive graph building.

    The source adapter is mocked to serve MOCKED_CATALOG. Entries 0, 1 and 6
    must be nodes of the built graph; entries 2 through 5 must not be.
    """
    conf_obj = ConfigurationParser().from_file_or_path(
        StringIO(yaml.dump(MOCKED_CONFIG)))
    shgraph = SnowShuGraph()

    with mock.MagicMock() as adapter_mock:
        adapter_mock.build_catalog.return_value = MOCKED_CATALOG
        conf_obj.source_profile.adapter = adapter_mock
        shgraph.build_graph(conf_obj)

        graph_nodes = shgraph.graph.nodes
        included_0, included_1, excluded_2, excluded_3, excluded_4, excluded_5, included_6 = (
            MOCKED_CATALOG[:7])

        assert included_0 in graph_nodes
        assert included_1 in graph_nodes
        assert excluded_2 not in graph_nodes
        assert excluded_3 not in graph_nodes
        assert excluded_4 not in graph_nodes
        assert excluded_5 not in graph_nodes
        assert included_6 in graph_nodes
def test_build_graph_fails_no_downstream():
    """ Tests build_graph raises when no downstream relations are matched """
    # use the "live" config on random test data
    config = ConfigurationParser().from_file_or_path(
        StringIO(yaml.dump(copy.deepcopy(CONFIGURATION))))

    empty_catalog = []  # no relations in filtered catalog
    shgraph = SnowShuGraph()

    with mock.MagicMock() as adapter_mock:
        adapter_mock.build_catalog.return_value = empty_catalog
        config.source_profile.adapter = adapter_mock

        # building the graph should raise when no downstream relations are found
        with pytest.raises(InvalidRelationshipException) as exc:
            shgraph.build_graph(config)

        assert "does not match any relations" in str(exc.value)
def test_build_graph_partitions_wildcards(stub_graph_set):
    """ Tests build_graph pairs wildcard relations one-to-one: upstream 1 with
        downstream 1 and upstream 2 with downstream 2, with no cross edges """
    _, vals = stub_graph_set
    down_1 = vals.downstream_wildcard_relation_1
    down_2 = vals.downstream_wildcard_relation_2
    up_1 = vals.upstream_wildcard_relation_1
    up_2 = vals.upstream_wildcard_relation_2
    full_catalog = [down_1, down_2, up_1, up_2]

    # The upstream side leaves database and schema empty.
    upstream_spec = {
        "local_attribute": vals.directional_key,
        "database": "",
        "schema": "",
        "relation": ".*upstream.*$",
        "remote_attribute": vals.directional_key,
    }
    config_dict = copy.deepcopy(BASIC_CONFIGURATION)
    config_dict["source"]["specified_relations"] = [{
        "database": f"({down_1.database}|{down_2.database})",
        "schema": f"({down_1.schema}|{down_2.schema})",
        "relation": ".*downstream.*$",
        "relationships": {"directional": [upstream_spec]},
    }]
    config = ConfigurationParser().from_file_or_path(
        StringIO(yaml.dump(config_dict)))

    shgraph = SnowShuGraph()
    with mock.MagicMock() as adapter_mock:
        adapter_mock.build_catalog.return_value = full_catalog
        config.source_profile.adapter = adapter_mock
        shgraph.build_graph(config)

        assert len(shgraph.graph.edges()) == 2
        assert (up_1, down_1) in shgraph.graph.edges()
        assert (up_1, down_2) not in shgraph.graph.edges()
        assert (up_2, down_1) not in shgraph.graph.edges()
        assert (up_2, down_2) in shgraph.graph.edges()
def _execute(self, barf: bool = False, name: Union[str, None] = None) -> Union[str, None]:
    """Build the relation graph from the configuration and run it against the target.

    Args:
        barf: Passed through unchanged to ``GraphSetRunner.execute_graph_set``.
        name: When not None, overrides ``self.config.name`` before the graph is built.

    Returns:
        A short message when the configuration yields no graphs, otherwise the
        value of ``printable_result`` for the executed graphs.
    """
    graph = SnowShuGraph()
    if name is not None:
        self.config.name = name

    graph.build_graph(self.config)
    graphs = graph.get_graphs()
    if len(graphs) < 1:
        return "No relations found per provided replica configuration, exiting."

    # TODO replica container should not be started for analyze commands
    self.config.target_profile.adapter.initialize_replica(
        self.config.source_profile.name)

    runner = GraphSetRunner()
    runner.execute_graph_set(graphs,
                             self.config.source_profile.adapter,
                             self.config.target_profile.adapter,
                             threads=self.config.threads,
                             analyze=self.run_analyze,
                             barf=barf)

    # Target post-processing is skipped entirely for analyze runs.
    if not self.run_analyze:
        relations = [
            relation for graph in graphs for relation in graph.nodes]
        if self.config.source_profile.adapter.SUPPORTS_CROSS_DATABASE:
            logger.info('Creating x-database links in target...')
            self.config.target_profile.adapter.enable_cross_database(
                relations)
            logger.info('X-database enabled.')

        logger.info(
            'Applying %s emulation functions to target...',
            self.config.source_profile.adapter.name)
        for function in self.config.source_profile.adapter.SUPPORTED_FUNCTIONS:
            self.config.target_profile.adapter.create_function_if_available(
                function, relations)
        logger.info('Emulation functions applied.')
        self.config.target_profile.adapter.finalize_replica()

    return printable_result(
        graph_to_result_list(graphs),
        self.run_analyze)
def test_no_duplicates(stub_graph_set):
    """Check that no node appears more than once across the graphs from ``get_graphs``."""
    _, vals = stub_graph_set
    full_catalog = [
        vals.iso_relation,
        vals.view_relation,
        vals.downstream_relation,
        vals.upstream_relation,
        vals.birelation_left,
        vals.birelation_right,
    ]
    config = ConfigurationParser().from_file_or_path(
        StringIO(yaml.dump(copy.deepcopy(CONFIGURATION))))

    shgraph = SnowShuGraph()
    shgraph.build_graph(config, full_catalog)

    # Flatten the nodes of every graph; a duplicate would shrink the set.
    all_nodes = []
    for graph in shgraph.get_graphs():
        all_nodes.extend(graph.nodes)

    assert len(set(all_nodes)) == len(all_nodes)
def test_graph_difference_less_relations_source(stub_graph_set):
    """ Tests graph_difference returns a graph with no nodes when the target
        catalog contains every source relation plus an extra one """
    _, vals = stub_graph_set
    downstream = vals.downstream_relation
    source_catalog = [
        downstream,
        vals.upstream_relation,
        vals.birelation_left,
        vals.birelation_right,
    ]
    # Same relations as the source, followed by one the source does not have.
    target_catalog = [
        downstream,
        vals.upstream_relation,
        vals.birelation_left,
        vals.birelation_right,
        vals.iso_relation,
    ]

    upstream_spec = {
        "local_attribute": vals.directional_key,
        "database": ".*",
        "schema": ".*",
        "relation": ".*relation.*$",  # incl birelations
        "remote_attribute": vals.directional_key,
    }
    config_dict = copy.deepcopy(BASIC_CONFIGURATION)
    config_dict["source"]["specified_relations"] = [{
        "database": downstream.database,
        "schema": downstream.schema,
        "relation": downstream.name,
        "relationships": {"directional": [upstream_spec]},
    }]
    config = ConfigurationParser().from_file_or_path(
        StringIO(yaml.dump(config_dict)))

    shgraph = SnowShuGraph()
    with mock.MagicMock() as adapter_mock:
        adapter_mock.build_catalog.return_value = source_catalog
        config.source_profile.adapter = adapter_mock
        shgraph.build_graph(config)

        actual = SnowShuGraph.catalog_difference(shgraph, target_catalog)
        assert len(actual) == 0
def test_build_graph_allows_upstream_regex(stub_graph_set):
    """ Tests build_graph creates one edge per upstream relation matched by a regex """
    _, vals = stub_graph_set
    downstream = vals.downstream_relation
    expected_upstreams = (
        vals.upstream_relation,
        vals.birelation_left,
        vals.birelation_right,
    )
    full_catalog = [downstream, *expected_upstreams]

    upstream_spec = {
        "local_attribute": vals.directional_key,
        "database": ".*",
        "schema": ".*",
        "relation": ".*relation.*$",  # incl birelations
        "remote_attribute": vals.directional_key,
    }
    config_dict = copy.deepcopy(BASIC_CONFIGURATION)
    config_dict["source"]["specified_relations"] = [{
        "database": downstream.database,
        "schema": downstream.schema,
        "relation": downstream.name,
        "relationships": {"directional": [upstream_spec]},
    }]
    config = ConfigurationParser().from_file_or_path(
        StringIO(yaml.dump(config_dict)))

    shgraph = SnowShuGraph()
    with mock.MagicMock() as adapter_mock:
        adapter_mock.build_catalog.return_value = full_catalog
        config.source_profile.adapter = adapter_mock
        shgraph.build_graph(config)

        assert len(shgraph.graph.edges()) == 3
        # Every other relation in the catalog must point at the downstream relation.
        for upstream in expected_upstreams:
            assert (upstream, downstream) in shgraph.graph.edges()
def test_build_graph_fails_no_distinct_upstream(stub_graph_set):
    """ Tests build_graph raises when the only upstream match is the downstream relation itself """
    _, vals = stub_graph_set
    downstream = vals.downstream_relation
    full_catalog = [
        vals.iso_relation,
        vals.view_relation,
        downstream,
        vals.upstream_relation,
    ]

    # add relationship where downstream == upstream
    self_reference = {
        "local_attribute": vals.directional_key,
        "database": downstream.database,
        "schema": downstream.schema,
        "relation": downstream.name,
        "remote_attribute": vals.directional_key,
    }
    config_dict = copy.deepcopy(BASIC_CONFIGURATION)
    config_dict["source"]["specified_relations"] = [{
        "database": downstream.database,
        "schema": downstream.schema,
        "relation": downstream.name,
        "relationships": {"directional": [self_reference]},
    }]
    config = ConfigurationParser().from_file_or_path(
        StringIO(yaml.dump(config_dict)))

    shgraph = SnowShuGraph()
    with mock.MagicMock() as adapter_mock:
        adapter_mock.build_catalog.return_value = full_catalog
        config.source_profile.adapter = adapter_mock
        with pytest.raises(InvalidRelationshipException) as exc:
            shgraph.build_graph(config)

        error_text = str(exc.value)
        assert "was specified as a dependency, but it does not exist." in error_text
def test_sets_outliers(stub_graph_set):
    """Check that source-level outlier settings are applied to catalog relations.

    With ``include_outliers`` True and ``max_number_of_outliers`` 1000 set on
    the source configuration, the isolated relation must carry both values
    once the graph has been built.
    """
    shgraph = SnowShuGraph()
    _, vals = stub_graph_set
    full_catalog = [
        vals.iso_relation,
        vals.view_relation,
        vals.downstream_relation,
        vals.upstream_relation,
        vals.birelation_left,
        vals.birelation_right,
    ]

    config_dict = copy.deepcopy(CONFIGURATION)
    config_dict['source']['include_outliers'] = True
    config_dict['source']['max_number_of_outliers'] = 1000
    config = ConfigurationParser().from_file_or_path(
        StringIO(yaml.dump(config_dict)))

    # Only the side effect on the catalog relations is checked, so the return
    # value of build_graph is not kept.
    shgraph.build_graph(config, full_catalog)

    assert vals.iso_relation.include_outliers is True
    assert vals.iso_relation.max_number_of_outliers == 1000
def test_no_duplicates(stub_graph_set):
    """Check that no node is shared between the graphs from ``get_connected_subgraphs``."""
    _, vals = stub_graph_set
    full_catalog = [
        vals.iso_relation,
        vals.view_relation,
        vals.downstream_relation,
        vals.upstream_relation,
        vals.birelation_left,
        vals.birelation_right,
    ]
    config = ConfigurationParser().from_file_or_path(
        StringIO(yaml.dump(copy.deepcopy(BASIC_CONFIGURATION))))

    shgraph = SnowShuGraph()
    with mock.MagicMock() as adapter_mock:
        adapter_mock.build_catalog.return_value = full_catalog
        config.source_profile.adapter = adapter_mock
        shgraph.build_graph(config)

        # Flatten the nodes of every subgraph; a duplicate would shrink the set.
        all_nodes = []
        for subgraph in shgraph.get_connected_subgraphs():
            all_nodes.extend(subgraph.nodes)

        assert len(set(all_nodes)) == len(all_nodes)
def test_graph_difference_empty_target(stub_graph_set):
    """ Tests graph_difference hands back the full source graph when the
        target catalog is empty """
    _, vals = stub_graph_set
    downstream = vals.downstream_relation
    full_catalog = [
        downstream,
        vals.upstream_relation,
        vals.birelation_left,
        vals.birelation_right,
    ]

    upstream_spec = {
        "local_attribute": vals.directional_key,
        "database": ".*",
        "schema": ".*",
        "relation": ".*relation.*$",
        "remote_attribute": vals.directional_key,
    }
    config_dict = copy.deepcopy(BASIC_CONFIGURATION)
    config_dict["source"]["specified_relations"] = [{
        "database": downstream.database,
        "schema": downstream.schema,
        "relation": downstream.name,
        "relationships": {"directional": [upstream_spec]},
    }]
    config = ConfigurationParser().from_file_or_path(
        StringIO(yaml.dump(config_dict)))

    shgraph = SnowShuGraph()
    with mock.MagicMock() as adapter_mock:
        adapter_mock.build_catalog.return_value = full_catalog
        config.source_profile.adapter = adapter_mock
        shgraph.build_graph(config)

        # The built graph itself is the expected result.
        expected = shgraph.graph
        empty_target = set()
        actual = SnowShuGraph.catalog_difference(shgraph, empty_target)
        assert actual == expected
def test_sets_outliers(stub_graph_set):
    """Check that source-level outlier settings reach the catalog relations.

    The source configuration sets ``include_outliers`` to True and
    ``max_number_of_outliers`` to 1000; after the graph is built the isolated
    relation must carry both values.
    """
    _, vals = stub_graph_set
    isolated = vals.iso_relation
    full_catalog = [
        isolated,
        vals.view_relation,
        vals.downstream_relation,
        vals.upstream_relation,
        vals.birelation_left,
        vals.birelation_right,
    ]

    config_dict = copy.deepcopy(BASIC_CONFIGURATION)
    source_section = config_dict['source']
    source_section['include_outliers'] = True
    source_section['max_number_of_outliers'] = 1000
    config = ConfigurationParser().from_file_or_path(
        StringIO(yaml.dump(config_dict)))

    shgraph = SnowShuGraph()
    with mock.MagicMock() as adapter_mock:
        adapter_mock.build_catalog.return_value = full_catalog
        config.source_profile.adapter = adapter_mock
        shgraph.build_graph(config)

        assert isolated.include_outliers is True
        assert isolated.max_number_of_outliers == 1000
def _execute(self,
             barf: bool = False,
             name: Optional[str] = None) -> Optional[str]:
    """Build the relation graph, optionally reduce it incrementally, and run it.

    Args:
        barf: Passed through unchanged to ``GraphSetRunner.execute_graph_set``.
        name: When not None, overrides ``self.config.name`` before the graph is built.

    Returns:
        A short message when there are no graphs to process, otherwise the
        value of ``printable_result`` for the executed graphs.
    """
    graph = SnowShuGraph()
    if name is not None:
        self.config.name = name

    graph.build_graph(self.config)

    if self.incremental:
        # TODO replica container should not be started for analyze commands
        self.config.target_profile.adapter.initialize_replica(
            self.config.source_profile.name, self.incremental)

        # Catalog the existing target and keep only what the difference returns.
        sum_patterns = SnowShuGraph.build_sum_patterns_from_configs(self.config)
        incremental_target_catalog = self.config.target_profile.adapter.build_catalog(
            patterns=sum_patterns,
            thread_workers=self.config.threads)
        graph.graph = SnowShuGraph.catalog_difference(
            graph.graph, incremental_target_catalog)

    graphs = graph.get_connected_subgraphs()
    if len(graphs) < 1:
        if self.incremental:
            args = (' new ', ' incremental ', '; image up-to-date')
        else:
            args = (' ', ' ', '')
        return "No{}relations found per provided{}replica configuration{}, exiting.".format(
            *args)

    if not self.config.target_profile.adapter.container:
        # TODO replica container should not be started for analyze commands
        self.config.target_profile.adapter.initialize_replica(
            self.config.source_profile.name)

    GraphSetRunner().execute_graph_set(graphs,
                                       self.config.source_profile.adapter,
                                       self.config.target_profile.adapter,
                                       threads=self.config.threads,
                                       analyze=self.run_analyze,
                                       barf=barf)

    # Analyze runs skip all target post-processing and go straight to the report.
    if self.run_analyze:
        return printable_result(graph_to_result_list(graphs), self.run_analyze)

    relations = []
    for subgraph in graphs:
        relations.extend(subgraph.nodes)

    if self.config.source_profile.adapter.SUPPORTS_CROSS_DATABASE:
        logger.info('Creating x-database links in target...')
        self.config.target_profile.adapter.enable_cross_database()
        logger.info('X-database enabled.')
    self.config.target_profile.adapter.create_all_database_extensions()

    logger.info('Applying %s emulation functions to target...',
                self.config.source_profile.adapter.name)
    for function in self.config.source_profile.adapter.SUPPORTED_FUNCTIONS:
        self.config.target_profile.adapter.create_function_if_available(
            function, relations)
    logger.info('Emulation functions applied.')
    self.config.target_profile.adapter.finalize_replica()

    return printable_result(graph_to_result_list(graphs), self.run_analyze)