Example #1
0
    def _set_overriding_params_for_node(self, relation: Relation,
                                        configs: Configuration) -> Relation:
        """Finds and applies specific params from config.

        Every entry of ``configs.specified_relations`` that fully matches the
        relation is applied in iteration order, so when several matching
        entries set the same param the value from the last one wins.
        If multiple conflicting specific params are found they will be applied
        in descending order from the originating replica file.

        Args:
            relation: A :class:`Relation <snowshu.core.models.relation.Relation>` to be tested for specific configs.
            configs: :class:`Configuration <snowshu.core.configuration_parser.Configuration>` object to search for matches and specified params.
        Returns:
            The :class:`Relation <snowshu.core.models.relation.Relation>` with all updated params applied.
            This is the same object that was passed in, modified in place.
        """
        for pattern in configs.specified_relations:
            if not single_full_pattern_match(relation, pattern):
                continue

            for attr in ('unsampled', 'include_outliers'):
                pattern_val = getattr(pattern, attr, None)
                # None (or a missing attribute) means "not specified by this
                # pattern": leave whatever the relation already carries.
                if pattern_val is not None:
                    relation.__dict__[attr] = pattern_val

            sampling = getattr(pattern, 'sampling', None)
            if sampling is not None:
                relation.sampling = sampling
        return relation
Example #2
0
def test_single_full_pattern_match():
    """Checks ``single_full_pattern_match`` with matching and non-matching patterns.

    One relation is tested against several database/schema/name pattern
    dicts: four that are expected to match and one that is not.
    """
    test_relation = relation.Relation(
        database='TEST_DATABASE',
        schema="TEST_SCHEMA",
        name="TEST_RELATION",
        materialization=TABLE,
        attributes=[])

    # patterns that are all expected to match the relation above
    matching_patterns = (
        dict(database=".*", schema=".*", name=".*"),
        dict(database="TEST_DATABASE", schema=".*", name=".*"),
        dict(database="(?i)test_.*", schema=".*", name=".*"),
        dict(database="(?i)test_.*", schema="TEST_SCHEMA", name=".*"),
    )
    for matching_pattern in matching_patterns:
        assert relation.single_full_pattern_match(test_relation, matching_pattern)

    # the lowercase database pattern without the (?i) prefix is expected
    # NOT to match the uppercase database name
    mismatching_pattern = dict(database="test_.*", schema="TEST_SCHEMA", name=".*")
    assert not relation.single_full_pattern_match(test_relation, mismatching_pattern)
Example #3
0
    def _process_downstream_relation_set(
            relationship: dict,
            downstream_set: Set[Relation],
            graph: networkx.DiGraph,
            full_relation_set: Set[Relation]) -> networkx.Graph:
        """ Validates a single relationship and adds its edges to the graph.

            Args:
                relationship: Pattern dict read for its ``database``, ``schema``,
                    ``name`` and ``edge_attributes`` keys.
                downstream_set: The relations that receive the incoming edges.
                graph: The graph the edges are added to (modified in place).
                full_relation_set: The relations searched for upstream matches.

            Returns:
                - The graph that was passed in, with an edge from every usable
                  upstream relation to every downstream relation.

            Raises:
                InvalidRelationshipException: when no upstream relation remains
                    after removing the downstream ones, when the relationship
                    would be many-to-many, or when a matched upstream relation
                    is a view.
        """
        # every relation whose database/schema/name matches the relationship pattern
        matched_upstream = {
            candidate
            for candidate in full_relation_set
            if single_full_pattern_match(candidate, relationship)
        }
        # upstream matches that are not also downstream relations
        usable_upstream = matched_upstream.difference(downstream_set)

        # nothing left upstream: the dependency does not exist
        if not usable_upstream:
            missing_message = (
                f'It looks like the relation '
                f'{relationship["database"]}.{relationship["schema"]}.{relationship["name"]} '
                f'was specified as a dependency, but it does not exist.'
            )
            raise InvalidRelationshipException(missing_message)

        # some upstream matches were dropped because they are also downstream
        if len(matched_upstream) != len(usable_upstream):
            logger.warning(
                f'Relationship {relationship} defines at least one downstream '
                f'relation in the upstream relation set. Ignoring the occurrence '
                f'in the upstream set. Please verify replica config file.'
            )

        # several upstream AND several downstream relations is many-to-many
        many_upstream = len(usable_upstream) > 1
        if many_upstream and len(downstream_set) > 1:
            many_to_many_message = (
                f'Relationship {relationship} defines a many-to-many '
                f'relationship between tables in the source location. '
                f'Many-to-many relationship are not allowed by SnowShu '
                f'as they are usually unintended side effects of lenient regex.'
            )
            raise InvalidRelationshipException(many_to_many_message)

        # views are rejected; note this inspects every matched upstream
        # relation, including the ones excluded from the usable set above
        view_relations = [
            upstream.quoted_dot_notation
            for upstream in matched_upstream
            if upstream.is_view
        ]
        if view_relations:
            view_message = (
                f'Relations {view_relations} are views, '
                f'but have been specified as an upstream dependency for '
                f'the relationship {relationship}. '
                f'View dependencies are not allowed by SnowShu.'
            )
            raise InvalidRelationshipException(view_message)

        # edges always point from the upstream relation into the downstream one
        for target in downstream_set:
            for source in usable_upstream:
                graph.add_edge(source, target, **relationship['edge_attributes'])
        return graph
Example #4
0
    def _apply_specifications(
            self, configs: Configuration, graph: networkx.DiGraph,
            available_nodes: Set[Relation]) -> networkx.DiGraph:
        """Takes a configuration file, a graph and a collection of available
        nodes, applies configs as edges and returns the graph.

        For every entry in ``configs.specified_relations`` the matching nodes
        are either flagged as unsampled (and added to the graph), or connected
        to the upstream relation of each ``bidirectional`` / ``directional``
        relationship. Edges point from the upstream relation into the matching
        (downstream) relation and carry ``direction``, ``remote_attribute`` and
        ``local_attribute`` as edge data.

        Args:
            configs: Configuration whose ``specified_relations`` are applied.
            graph: The graph to add nodes and edges to (modified in place).
            available_nodes: The relations that patterns are matched against.

        Returns:
            The graph that was passed in, with the configured edges added.

        Raises:
            ValueError: if an upstream relation cannot be found, or if the
                final ``graph.is_directed()`` check fails.
            InvalidRelationshipException: if an upstream relation is a view.
        """
        for relation in configs.specified_relations:
            relation_dict = dict(name=relation.relation_pattern,
                                 database=relation.database_pattern,
                                 schema=relation.schema_pattern)
            # Nodes matched by this specified relation. The match does not
            # depend on the edge being processed, so compute it only once.
            matching_relations = {
                node for node in available_nodes
                if single_full_pattern_match(node, relation_dict)
            }

            if relation.unsampled:
                for rel in matching_relations:
                    rel.unsampled = True
                    graph.add_node(rel)
                continue

            edges = [
                dict(direction=direction,
                     database=val.database_pattern,
                     schema=val.schema_pattern,
                     relation=val.relation_pattern,
                     remote_attribute=val.remote_attribute,
                     local_attribute=val.local_attribute)
                for direction in ('bidirectional', 'directional')
                for val in relation.relationships.__dict__[direction]
            ]

            for edge in edges:
                for rel in matching_relations:
                    # Populate wildcard (None) database/schema from the
                    # downstream relation. Work on a copy: writing into the
                    # shared edge dict would make every later downstream
                    # relation reuse the first relation's database/schema.
                    resolved_edge = dict(edge)
                    for attr in ('database', 'schema'):
                        if resolved_edge[attr] is None:
                            resolved_edge[attr] = getattr(rel, attr)

                    upstream_relation = lookup_single_relation(
                        resolved_edge, available_nodes)
                    if upstream_relation is None:
                        raise ValueError(
                            f'It looks like the wildcard relation '
                            f'{resolved_edge["database"]}.{resolved_edge["schema"]}.{resolved_edge["relation"]} '
                            f'was specified as a dependency, but it does not exist.'
                        )
                    if upstream_relation.is_view:
                        # Report the concrete downstream relation; the loop
                        # variable `relation` is the config pattern entry.
                        raise InvalidRelationshipException(
                            f'Relation {upstream_relation.quoted_dot_notation} is a view, '
                            f'but has been specified as an upstream dependency for '
                            f'relation {rel.quoted_dot_notation}. '
                            f'View dependencies are not allowed by SnowShu.'
                        )
                    # a relation never depends on itself
                    if upstream_relation == rel:
                        continue
                    graph.add_edge(upstream_relation,
                                   rel,
                                   direction=resolved_edge['direction'],
                                   remote_attribute=resolved_edge['remote_attribute'],
                                   local_attribute=resolved_edge['local_attribute'])

        # NOTE(review): is_directed() reports the graph type rather than the
        # absence of cycles, so for a networkx.DiGraph this guard presumably
        # never fires. If cycle detection is intended,
        # networkx.is_directed_acyclic_graph(graph) is the usual check --
        # confirm with callers before switching, as it changes behavior.
        if not graph.is_directed():
            raise ValueError(
                'The graph created by the specified trail path is not directed (circular reference detected).'
            )
        return graph
Example #5
0
    def _apply_specifications(
            configs: Configuration, graph: networkx.DiGraph,
            available_nodes: Set[Relation]) -> networkx.DiGraph:
        """ Takes a configuration file, a graph and a collection of available
            nodes, applies configs as edges and returns the graph.

            When edges are added, they are always out of the remote relation
            and into the local relation. The other details of the relationship
            are included in the edge data.

            Relations matched by an ``unsampled`` specification are flagged as
            unsampled and added as nodes; no edges are created for them.

            Args:
                configs: Configuration to translate into a digraph
                graph: The graph object to apply edges to. Assumed to have most nodes included already
                available_nodes: The set of nodes that are available to be in the graph

            Returns:
                - The final digraph with edges that represents the given configuration

            Raises:
                ValueError: if an upstream relation cannot be found.
                InvalidRelationshipException: if an upstream relation is a view.
        """
        for relation in configs.specified_relations:
            relation_dict = dict(name=relation.relation_pattern,
                                 database=relation.database_pattern,
                                 schema=relation.schema_pattern)
            # Nodes matched by this specified relation. The match does not
            # depend on the edge being processed, so compute it only once.
            matching_relations = {
                node for node in available_nodes
                if single_full_pattern_match(node, relation_dict)
            }

            if relation.unsampled:
                for rel in matching_relations:
                    rel.unsampled = True
                    graph.add_node(rel)
                continue

            edges = [
                dict(direction=direction,
                     database=val.database_pattern,
                     schema=val.schema_pattern,
                     relation=val.relation_pattern,
                     remote_attribute=val.remote_attribute,
                     local_attribute=val.local_attribute)
                for direction in ('bidirectional', 'directional')
                for val in relation.relationships.__dict__[direction]
            ]

            for edge in edges:
                for rel in matching_relations:
                    # Populate wildcard (None) database/schema from the
                    # downstream relation. Work on a copy: writing into the
                    # shared edge dict would make every later downstream
                    # relation reuse the first relation's database/schema.
                    resolved_edge = dict(edge)
                    for attr in ('database', 'schema'):
                        if resolved_edge[attr] is None:
                            resolved_edge[attr] = getattr(rel, attr)

                    upstream_relation = lookup_single_relation(
                        resolved_edge, available_nodes)
                    if upstream_relation is None:
                        raise ValueError(
                            f'It looks like the wildcard relation '
                            f'{resolved_edge["database"]}.{resolved_edge["schema"]}.{resolved_edge["relation"]} '
                            f'was specified as a dependency, but it does not exist.'
                        )
                    if upstream_relation.is_view:
                        # Report the concrete downstream relation; the loop
                        # variable `relation` is the config pattern entry.
                        raise InvalidRelationshipException(
                            f'Relation {upstream_relation.quoted_dot_notation} is a view, '
                            f'but has been specified as an upstream dependency for '
                            f'relation {rel.quoted_dot_notation}. '
                            f'View dependencies are not allowed by SnowShu.')
                    # a relation never depends on itself
                    if upstream_relation == rel:
                        continue
                    graph.add_edge(upstream_relation,
                                   rel,
                                   direction=resolved_edge['direction'],
                                   remote_attribute=resolved_edge['remote_attribute'],
                                   local_attribute=resolved_edge['local_attribute'])
        return graph
Example #6
0
    def _apply_specifications(  # noqa pylint: disable=too-many-locals
            configs: Configuration,
            graph: networkx.DiGraph,
            available_nodes: Set[Relation]) -> networkx.DiGraph:
        """ Translates the specified relations of a configuration into graph
            nodes and edges.

            Edges are created by ``SnowShuGraph._process_downstream_relation_set``
            and always run out of the remote (upstream) relation and into the
            local (downstream) relation; the remaining relationship details
            travel as edge data.

            Args:
                configs: Configuration to translate into a digraph
                graph: The graph object to apply edges to. Assumed to have most nodes included already
                available_nodes: The set of nodes that are available to be in the graph

            Returns:
                - The final digraph with edges that represents the given configuration

            Raises:
                InvalidRelationshipException: when a specified (non-unsampled)
                    relation pattern matches none of the available nodes.
        """
        for relation in configs.specified_relations:
            # pattern dict used to match nodes against the specified relation
            relation_pattern_dict = {
                "name": relation.relation_pattern,
                "database": relation.database_pattern,
                "schema": relation.schema_pattern,
            }

            # unsampled specification: flag every matching node, add it, move on
            if relation.unsampled:
                nodes_to_unsample = {
                    node for node in available_nodes
                    if single_full_pattern_match(node, relation_pattern_dict)
                }
                for node in nodes_to_unsample:
                    node.unsampled = True
                    graph.add_node(node)
                continue

            # remote patterns + edge data for bidirectional/directional relationships
            relationship_dicts = [
                {
                    "database": spec.database_pattern,
                    "schema": spec.schema_pattern,
                    "name": spec.relation_pattern,
                    "edge_attributes": {
                        "direction": relationship_type,
                        "remote_attribute": spec.remote_attribute,
                        "local_attribute": spec.local_attribute,
                    },
                }
                for relationship_type in ('bidirectional', 'directional')
                for spec in relation.relationships.__dict__[relationship_type]
            ]

            # polymorphic relationships may carry extra type information
            for poly in relation.relationships.polymorphic:
                poly_attributes = {
                    "direction": "polymorphic",
                    "remote_attribute": poly.remote_attribute,
                    "local_attribute": poly.local_attribute,
                }
                if poly.local_type_attribute:
                    poly_attributes["local_type_attribute"] = poly.local_type_attribute
                    poly_attributes["local_type_overrides"] = poly.local_type_overrides

                relationship_dicts.append({
                    "database": poly.database_pattern,
                    "schema": poly.schema_pattern,
                    "name": poly.relation_pattern,
                    "edge_attributes": poly_attributes,
                })

            # the downstream side is every node matching the specified pattern
            downstream_relations = {
                node for node in available_nodes
                if single_full_pattern_match(node, relation_pattern_dict)
            }
            if not downstream_relations:
                raise InvalidRelationshipException(
                    f'Relationship {relation_pattern_dict} was specified, '
                    f'but does not match any relations. '
                    f'Please verify replica configuration.'
                )

            for relationship in relationship_dicts:
                # database/schema left as None act as wildcards
                wildcard_attrs = [
                    attr for attr in ('database', 'schema')
                    if relationship[attr] is None
                ]

                # fully specified relationship: one call for the whole downstream set
                if not wildcard_attrs:
                    graph = SnowShuGraph._process_downstream_relation_set(
                        relationship, downstream_relations, graph, available_nodes)
                    continue

                # group downstream relations by their values for the wildcard
                # attributes (joined into one key when there are several)
                partitions = {}
                for down_rel in downstream_relations:
                    partition_key = '|'.join(
                        getattr(down_rel, attr) for attr in wildcard_attrs)
                    partitions.setdefault(partition_key, []).append(down_rel)

                for partition_members in partitions.values():
                    # fill the wildcards from the first member of the partition
                    representative = partition_members[0]
                    for attr in wildcard_attrs:
                        relationship[attr] = getattr(representative, attr)

                    graph = SnowShuGraph._process_downstream_relation_set(
                        relationship, partition_members, graph, available_nodes)

        return graph