Example #1
0
    def _set_overriding_params_for_node(self, relation: Relation,
                                        configs: Configuration) -> Relation:
        """Finds and applies specific params from config.

        If multiple conflicting specific params are found they will be applied in descending order from the originating replica file.

        Args:
            relation: A :class:`Relation <snowshu.core.models.relation.Relation>` to be tested for specific configs.
            configs: :class:`Configuration <snowshu.core.configuration_parser.Configuration>` object to search for matches and specified params.
        Returns:
            The :class:`Relation <snowshu.core.models.relation.Relation>` with all updated params applied.
        """
        for pattern in configs.specified_relations:
            if single_full_pattern_match(relation, pattern):
                # Only override when the matching pattern explicitly sets a
                # value; otherwise the relation keeps its current value.
                for attr in (
                        'unsampled',
                        'include_outliers',
                ):
                    pattern_val = getattr(pattern, attr, None)
                    if pattern_val is not None:
                        setattr(relation, attr, pattern_val)

                if getattr(pattern, 'sampling', None) is not None:
                    relation.sampling = pattern.sampling
        # NOTE: an unreachable statement block that followed this return
        # (referencing an undefined name ``config``) has been removed.
        return relation
def test_directional_statement():
    """A directional constraint restricts local_key to remote_key values drawn from the relation's core query."""
    adapter = SnowflakeAdapter()
    database, schema, table, local_key, remote_key = (
        rand_string(10) for _ in range(5))
    relation = Relation(database=database,
                        schema=schema,
                        name=table,
                        materialization=table,
                        attributes=[])
    relation.core_query = f"""
SELECT
    *
FROM 
    {database}.{schema}.{table}
    SAMPLE BERNOULLI (10)
"""
    expected = f"""
{local_key} IN 
    ( SELECT  
        {remote_key}
      AS {local_key}
    FROM (
SELECT
    *
FROM 
    {database}.{schema}.{table}
    SAMPLE BERNOULLI (10)
))
"""
    statement = adapter.predicate_constraint_statement(relation, True,
                                                       local_key, remote_key)
    assert query_equalize(statement) == query_equalize(expected)
Example #3
0
 def _set_globals_for_node(self, relation: Relation,
                           configs: Configuration) -> Relation:
     """Sets the initial (default) node values from the config.

     Args:
         relation: the :class:`Relation <snowshu.core.models.relation>` to set values of.
         configs: the :class:`Configuration <snowshu.core.configuration_parser.Configuration>` object to derive default values from.

     Returns:
         The updated :class:`Relation <snowshu.core.models.relation>`
     """
     # Copy each configuration-wide default straight onto the relation.
     for default_attr in ('sampling', 'include_outliers',
                          'max_number_of_outliers'):
         setattr(relation, default_attr, getattr(configs, default_attr))
     return relation
    def get_relations_from_database(self, database: str) -> List[Relation]:
        """Build a :class:`Relation` for every table/view found in *database*.

        Runs one INFORMATION_SCHEMA query joining TABLES to COLUMNS (one
        result row per column), then groups rows by ``schema.relation`` to
        assemble each Relation together with its attributes.

        Args:
            database: name of the database to introspect.

        Returns:
            A list of populated :class:`Relation` objects (the
            INFORMATION_SCHEMA schema itself is excluded).
        """
        # One row per column; table-level fields repeat on every row.
        relations_sql = f"""
                                 SELECT
                                    m.table_schema AS schema,
                                    m.table_name AS relation,
                                    m.table_type AS materialization,
                                    c.column_name AS attribute,
                                    c.ordinal_position AS ordinal,
                                    c.data_type AS data_type
                                 FROM
                                    {database}.INFORMATION_SCHEMA.TABLES m
                                 INNER JOIN
                                    {database}.INFORMATION_SCHEMA.COLUMNS c
                                 ON
                                    c.table_schema = m.table_schema
                                 AND
                                    c.table_name = m.table_name
                                 WHERE
                                    m.table_schema <> 'INFORMATION_SCHEMA'
                              """

        logger.debug(
            f'Collecting detailed relations from database {database}...')
        relations_frame = self._safe_query(relations_sql)
        # Distinct "schema.relation" keys used to group the per-column rows.
        unique_relations = (relations_frame['schema'] + '.' +
                            relations_frame['relation']).unique().tolist()
        logger.debug(
            f'Done collecting relations. Found a total of {len(unique_relations)} unique relations in database {database}'
        )
        relations = list()
        for relation in unique_relations:
            logger.debug(f'Building relation { database + "." + relation }...')
            attributes = list()

            for attribute in relations_frame.loc[(
                    relations_frame['schema'] + '.' +
                    relations_frame['relation']) == relation].itertuples():
                logger.debug(
                    f'adding attribute {attribute.attribute} to relation..')
                attributes.append(
                    Attribute(self._correct_case(attribute.attribute),
                              self._get_data_type(attribute.data_type)))

            # NOTE(review): reads the leaked loop variable ``attribute`` after
            # the loop for schema/name/materialization — a relation key with
            # zero column rows would raise NameError here (keys come from the
            # same frame, so presumably that cannot happen; confirm).
            # Also rebinds the outer loop variable ``relation`` from the
            # "schema.relation" string to the Relation object.
            relation = Relation(
                self._correct_case(database),
                self._correct_case(attribute.schema),
                self._correct_case(attribute.relation),
                self.MATERIALIZATION_MAPPINGS[attribute.materialization],
                attributes)
            logger.debug(f'Added relation {relation.dot_notation} to pool.')
            relations.append(relation)

        logger.debug(
            f'Acquired {len(relations)} total relations from database {database}.'
        )
        return relations
def test_sample_statement():
    """sample_statement_from_relation renders a Bernoulli probability sample over the relation."""
    adapter = SnowflakeAdapter()
    database, schema, table = (rand_string(10) for _ in range(3))
    relation = Relation(database=database,
                        schema=schema,
                        name=table,
                        materialization=table,
                        attributes=[])
    expected = f"""
SELECT
    *
FROM 
    {database}.{schema}.{table}
    SAMPLE BERNOULLI (10)
"""
    sample = adapter.sample_statement_from_relation(
        relation, BernoulliSampleMethod(10, units="probability"))
    assert query_equalize(sample) == query_equalize(expected)
def test_analyze_wrap_statement():
    """analyze_wrap_statement wraps a core query in population/sample-count CTEs."""
    sf = SnowflakeAdapter()
    DATABASE, SCHEMA, NAME = [rand_string(10) for _ in range(3)]
    relation = Relation(database=DATABASE,
                        schema=SCHEMA,
                        name=NAME,
                        materialization=TABLE,
                        attributes=[])
    # Plain string literal: the original used an f-string with no placeholders.
    sql = "SELECT * FROM some_crazy_query"
    statement = sf.analyze_wrap_statement(sql, relation)
    assert query_equalize(statement) == query_equalize(f"""
WITH
    {relation.scoped_cte('SNOWSHU_COUNT_POPULATION')} AS (
SELECT
    COUNT(*) AS population_size
FROM
    {relation.quoted_dot_notation}
)
,{relation.scoped_cte('SNOWSHU_CORE_SAMPLE')} AS (
{sql}
)
,{relation.scoped_cte('SNOWSHU_CORE_SAMPLE_COUNT')} AS (
SELECT
    COUNT(*) AS sample_size
FROM
    {relation.scoped_cte('SNOWSHU_CORE_SAMPLE')}
)
SELECT
    s.sample_size AS sample_size
    ,p.population_size AS population_size
FROM
    {relation.scoped_cte('SNOWSHU_CORE_SAMPLE_COUNT')} s
INNER JOIN
    {relation.scoped_cte('SNOWSHU_COUNT_POPULATION')} p
ON
    1=1
LIMIT 1
""")
def test_x00_replacement():
    """replace_x00_values strips NUL bytes by default and honors a custom replacement string."""
    id_col = "id"
    content_col = "content"
    normal_val = "normal_value"
    weird_value = "weird\x00value"
    custom_replacement = "__CUSTOM_VALUE__"

    cols = [
        Attribute(id_col, data_types.BIGINT),
        Attribute(content_col, data_types.VARCHAR)
    ]

    def build_relation():
        # Fresh relation with one clean row and one NUL-containing row.
        rel = Relation("db", "schema", "relation", TABLE, cols)
        rel.data = DataFrame({
            id_col: [1, 2],
            content_col: [normal_val, weird_value]
        })
        return rel

    def content_of(rel, row_id):
        # Content cell(s) for the row with the given id.
        return rel.data.loc[rel.data[id_col] == row_id, [content_col]]

    # test default replacement (NUL removed entirely)
    adapter = PostgresAdapter(replica_metadata={})
    fixed_relation = adapter.replace_x00_values(build_relation())
    assert all(content_of(fixed_relation, 1) == normal_val)
    assert all(content_of(fixed_relation, 2) == "weirdvalue")

    # test custom replacement
    adapter = PostgresAdapter(replica_metadata={},
                              pg_0x00_replacement=custom_replacement)
    fixed_relation = adapter.replace_x00_values(build_relation())
    assert all(content_of(fixed_relation, 1) == normal_val)
    assert all(content_of(fixed_relation, 2) ==
               f"weird{custom_replacement}value")
Example #8
0
    def _get_relations_from_database(
            self,
            schema_obj: BaseTargetAdapter._DatabaseObject) -> List[Relation]:
        """Build a :class:`Relation` for every table/view in one schema.

        Queries information_schema (tables joined to columns, one result row
        per column) for the schema wrapped by *schema_obj*, then groups rows
        by ``schema.relation`` into Relation objects.

        Args:
            schema_obj: the database/schema pair to introspect.

        Returns:
            A list of populated :class:`Relation` objects, excluding system
            schemas and external tables.
        """
        quoted_database = schema_obj.full_relation.quoted(
            schema_obj.full_relation.database)  # quoted db name
        relation_database = schema_obj.full_relation.database  # case corrected db name
        case_sensitive_schema = schema_obj.case_sensitive_name  # case sensitive schema name
        # One row per column; table-level fields repeat on every row.
        relations_sql = f"""
                                 SELECT
                                    m.table_schema AS schema,
                                    m.table_name AS relation,
                                    m.table_type AS materialization,
                                    c.column_name AS attribute,
                                    c.ordinal_position AS ordinal,
                                    c.data_type AS data_type
                                 FROM
                                    {quoted_database}.information_schema.tables m
                                 INNER JOIN
                                    {quoted_database}.information_schema.columns c
                                 ON
                                    c.table_schema = m.table_schema
                                 AND
                                    c.table_name = m.table_name
                                 WHERE
                                    m.table_schema = '{case_sensitive_schema}'
                                    AND m.table_schema NOT IN ('information_schema', 'pg_catalog')
                                    AND m.table_type <> 'external'
                              """

        logger.debug(
            f'Collecting detailed relations from database {quoted_database}...'
        )
        relations_frame = self._safe_query(relations_sql, quoted_database)
        # Distinct "schema.relation" keys used to group the per-column rows.
        unique_relations = (relations_frame['schema'] + '.' +
                            relations_frame['relation']).unique().tolist()
        logger.debug(
            f'Done collecting relations. Found a total of {len(unique_relations)} '
            f'unique relations in database {quoted_database}')
        relations = list()
        for relation in unique_relations:
            logger.debug(
                f'Building relation { quoted_database + "." + relation }...')
            attributes = list()

            for attribute in relations_frame.loc[(
                    relations_frame['schema'] + '.' +
                    relations_frame['relation']) == relation].itertuples():
                logger.debug(
                    f'adding attribute {attribute.attribute} to relation..')
                attributes.append(
                    Attribute(self._correct_case(attribute.attribute),
                              self._get_data_type(attribute.data_type)))

                # NOTE(review): the Relation is rebuilt on every column row
                # (rebinding the outer loop variable ``relation``); since
                # ``attributes`` is the same growing list, the last iteration
                # leaves a complete Relation. The noqa'd loop-variable reads
                # below depend on this placement — consider hoisting out of
                # the loop only with care.
                relation = Relation(
                    relation_database,
                    self._correct_case(attribute.schema),  # noqa pylint: disable=undefined-loop-variable
                    self._correct_case(attribute.relation),  # noqa pylint: disable=undefined-loop-variable
                    self.MATERIALIZATION_MAPPINGS[
                        attribute.materialization.replace(" ", "_")],  # noqa pylint: disable=undefined-loop-variable
                    attributes)
            logger.debug(f'Added relation {relation.dot_notation} to pool.')
            relations.append(relation)
        logger.debug(
            f'Acquired {len(relations)} total relations from database {quoted_database}.'
        )
        return relations
Example #9
0
def test_run_deps_mixed_multi_deps():
    r"""
        a --bidir--> c <--dir-- b
         \          / \
          \     bidir  dir
           \       |   |
            \      V   V
             dir-> d   e
    """
    # Relations a, b, c carry seed data; d and e only declare attributes.
    relation_helper = RelationTestHelper()
    relation_a = Relation(name='rel_a',
                          **relation_helper.rand_relation_helper())
    relation_a.attributes = [
        Attribute('col_a_c', dt.INTEGER),
        Attribute('col_a_d', dt.VARCHAR)
    ]
    relation_a.data = pd.DataFrame({
        "col_a_c": [
            1,
            2,
            3,
            4,
            5,
        ],
        "col_a_d": ["var_a_1", "var_a_2", "var_a_3", "var_a_1", "var_a_2"],
    })
    relation_b = Relation(name='rel_b',
                          **relation_helper.rand_relation_helper())
    relation_b.attributes = [Attribute('col_b_c', dt.VARCHAR)]
    relation_b.data = pd.DataFrame({
        "col_b_c": [
            "val1",
            "val2",
            "val3",
            "val4",
            "val5",
        ],
    })

    relation_c = Relation(name='rel_c',
                          **relation_helper.rand_relation_helper())
    relation_c.attributes = [
        Attribute('col_c_ae', dt.INTEGER),
        Attribute('col_c_bd', dt.VARCHAR)
    ]
    relation_c.data = pd.DataFrame({
        "col_c_ae": [
            1,
            1,
            2,
            2,
            5,
            5,
            5,
        ],
        "col_c_bd": [
            "val1",
            "val1",
            "val2",
            "val2",
            "val5",
            "val5",
            "val5",
        ]
    })

    relation_d = Relation(name='rel_d',
                          **relation_helper.rand_relation_helper())
    relation_d.attributes = [
        Attribute('col_d_a', dt.INTEGER),
        Attribute('col_d_c', dt.INTEGER)
    ]

    relation_e = Relation(name='rel_e',
                          **relation_helper.rand_relation_helper())
    relation_e.attributes = [Attribute('col_e_c', dt.INTEGER)]

    # Give every relation a deterministic sampling stub (1500-row Bernoulli).
    for relation in (
            relation_a,
            relation_b,
            relation_c,
            relation_d,
            relation_e,
    ):
        relation = stub_out_sampling(relation)

    # Wire the DAG exactly as drawn in the docstring above.
    dag = nx.DiGraph()
    dag.add_edge(relation_a,
                 relation_c,
                 direction="bidirectional",
                 remote_attribute="col_a_c",
                 local_attribute="col_c_ae")
    dag.add_edge(relation_a,
                 relation_d,
                 direction="directional",
                 remote_attribute="col_a_d",
                 local_attribute="col_d_a")
    dag.add_edge(relation_b,
                 relation_c,
                 direction="directional",
                 remote_attribute="col_b_c",
                 local_attribute="col_c_bd")
    dag.add_edge(relation_c,
                 relation_d,
                 direction="bidirectional",
                 remote_attribute="col_c_bd",
                 local_attribute="col_d_c")
    dag.add_edge(relation_c,
                 relation_e,
                 direction="directional",
                 remote_attribute="col_c_ae",
                 local_attribute="col_e_c")
    adapter = SnowflakeAdapter()
    # NOTE(review): compilation order a..e appears deliberate — later calls
    # presumably read the data of relations compiled earlier; confirm.
    RuntimeSourceCompiler.compile_queries_for_relation(relation_a, dag,
                                                       adapter, False)
    RuntimeSourceCompiler.compile_queries_for_relation(relation_b, dag,
                                                       adapter, False)
    RuntimeSourceCompiler.compile_queries_for_relation(relation_c, dag,
                                                       adapter, False)
    RuntimeSourceCompiler.compile_queries_for_relation(relation_d, dag,
                                                       adapter, False)
    RuntimeSourceCompiler.compile_queries_for_relation(relation_e, dag,
                                                       adapter, False)
    # a: bidirectional edge into c -> subquery constraint on c, then sampled.
    assert query_equalize(relation_a.compiled_query) == query_equalize(f"""
        WITH {relation_a.scoped_cte('SNOWSHU_FINAL_SAMPLE')} AS ( 
        SELECT 
            * 
        FROM 
            {relation_a.quoted_dot_notation} 
        WHERE 
            col_a_c 
        in (SELECT 
                col_c_ae 
            FROM 
                {relation_c.quoted_dot_notation}) ) 
        ,{relation_a.scoped_cte('SNOWSHU_DIRECTIONAL_SAMPLE')} AS ( 
            SELECT 
                * 
            FROM 
                {relation_a.scoped_cte('SNOWSHU_FINAL_SAMPLE')} SAMPLE BERNOULLI (1500 ROWS) 
        ) 
        SELECT 
            * 
        FROM 
        {relation_a.scoped_cte('SNOWSHU_DIRECTIONAL_SAMPLE')}
    """)
    # b: no incoming constraints -> plain sample.
    assert query_equalize(relation_b.compiled_query) == query_equalize(f"""
        SELECT
            *
        FROM
            {relation_b.quoted_dot_notation}
        SAMPLE BERNOULLI (1500 ROWS)
    """)
    # c: subquery constraint from bidirectional child d, plus literal IN-lists
    # from already-compiled parents a and b.
    assert query_equalize(relation_c.compiled_query) == query_equalize(f"""
        SELECT 
            * 
        FROM 
            {relation_c.quoted_dot_notation} 
        WHERE 
            col_c_bd 
        in (SELECT 
                col_d_c 
            FROM 
                {relation_d.quoted_dot_notation})
        AND
            col_c_ae IN (1,2,3,4,5)
        AND
            col_c_bd IN ('val1','val2','val3','val4','val5')
    """)
    # d: literal IN-lists from parents a (directional) and c (bidirectional).
    assert query_equalize(relation_d.compiled_query) == query_equalize(f"""
        SELECT 
            * 
        FROM 
            {relation_d.quoted_dot_notation}
        WHERE 
            col_d_a IN ('var_a_1','var_a_2','var_a_3') 
        AND
            col_d_c IN ('val1','val2','val5') 
    """)
    # e: literal IN-list from parent c, then sampled.
    assert query_equalize(relation_e.compiled_query) == query_equalize(f"""
        WITH 
        {relation_e.scoped_cte('SNOWSHU_FINAL_SAMPLE')} AS ( 
        SELECT 
            * 
        FROM 
        {relation_e.quoted_dot_notation}
        WHERE 
            col_e_c IN (1,2,5) 
        )
        ,{relation_e.scoped_cte('SNOWSHU_DIRECTIONAL_SAMPLE')} AS ( 
        SELECT 
            * 
        FROM 
        {relation_e.scoped_cte('SNOWSHU_FINAL_SAMPLE')} SAMPLE BERNOULLI (1500 ROWS) 
        ) 
        SELECT 
            * 
        FROM 
        {relation_e.scoped_cte('SNOWSHU_DIRECTIONAL_SAMPLE')}
    """)
Example #10
0
def test_run_deps_bidirectional_multi_deps():
    """
        a --bidir--> c <--bidir-- b
    """
    # Two parents (a with ints, b with strings) feeding a shared child c.
    relation_helper = RelationTestHelper()
    relation_a = Relation(name='rel_a',
                          **relation_helper.rand_relation_helper())
    relation_a.attributes = [Attribute('col_a', dt.INTEGER)]
    relation_a.data = pd.DataFrame({"col_a": [1, 2, 3, 4, 5]})

    relation_b = Relation(name='rel_b',
                          **relation_helper.rand_relation_helper())
    relation_b.attributes = [Attribute('col_b', dt.VARCHAR)]
    relation_b.data = pd.DataFrame(
        {"col_b": ["val1", "val2", "val3", "val4", "val5"]})

    relation_c = Relation(name='rel_c',
                          **relation_helper.rand_relation_helper())
    relation_c.attributes = [
        Attribute('col_c_a', dt.INTEGER),
        Attribute('col_c_b', dt.VARCHAR)
    ]

    # Deterministic sampling stub for every relation.
    for rel in (relation_a, relation_b, relation_c):
        stub_out_sampling(rel)

    # Both edges are bidirectional: (parent, child, remote, local).
    dag = nx.DiGraph()
    for parent, child, remote, local in (
            (relation_a, relation_c, "col_a", "col_c_a"),
            (relation_b, relation_c, "col_b", "col_c_b"),
    ):
        dag.add_edge(parent, child,
                     direction="bidirectional",
                     remote_attribute=remote,
                     local_attribute=local)

    adapter = SnowflakeAdapter()
    for rel in (relation_a, relation_b, relation_c):
        RuntimeSourceCompiler.compile_queries_for_relation(
            rel, dag, adapter, False)

    assert query_equalize(relation_a.compiled_query) == query_equalize(f"""
        WITH {relation_a.scoped_cte('SNOWSHU_FINAL_SAMPLE')} AS ( 
        SELECT 
            * 
        FROM 
            {relation_a.quoted_dot_notation} 
        WHERE 
            col_a 
        in (SELECT 
                col_c_a 
            FROM 
                {relation_c.quoted_dot_notation}) ) 
        ,{relation_a.scoped_cte('SNOWSHU_DIRECTIONAL_SAMPLE')} AS ( 
            SELECT 
                * 
            FROM 
                {relation_a.scoped_cte('SNOWSHU_FINAL_SAMPLE')} SAMPLE BERNOULLI (1500 ROWS) 
        ) 
        SELECT 
            * 
        FROM 
        {relation_a.scoped_cte('SNOWSHU_DIRECTIONAL_SAMPLE')}
    """)
    assert query_equalize(relation_b.compiled_query) == query_equalize(f"""
        WITH 
        {relation_b.scoped_cte('SNOWSHU_FINAL_SAMPLE')} AS ( 
        SELECT 
            * 
        FROM 
        {relation_b.quoted_dot_notation}
        WHERE 
            col_b 
        in (SELECT 
                col_c_b 
            FROM 
                {relation_c.quoted_dot_notation})
        )
        ,{relation_b.scoped_cte('SNOWSHU_DIRECTIONAL_SAMPLE')} AS ( 
        SELECT 
            * 
        FROM 
        {relation_b.scoped_cte('SNOWSHU_FINAL_SAMPLE')} SAMPLE BERNOULLI (1500 ROWS) 
        ) 
        SELECT 
            * 
        FROM 
        {relation_b.scoped_cte('SNOWSHU_DIRECTIONAL_SAMPLE')}
    """)
    assert query_equalize(relation_c.compiled_query) == query_equalize(f"""
            SELECT
                *
            FROM {relation_c.quoted_dot_notation} 
            WHERE
                col_c_a IN (1,2,3,4,5)
            AND
                col_c_b IN ('val1','val2','val3','val4','val5')
    """)
Example #11
0
def stub_out_sampling(rel: Relation) -> Relation:
    """Give *rel* a deterministic sampling stub (1500-row Bernoulli) and return it."""
    sampling = DefaultSampling()
    sampling.sample_method = BernoulliSampleMethod(1500, units='rows')
    rel.sampling = sampling
    return rel