Esempio n. 1
0
def test_run_deps_polymorphic_parentid(stub_relation_set):
    child1 = stub_relation_set.child_relation_type_1
    child2 = stub_relation_set.child_relation_type_2
    child3 = stub_relation_set.child_relation_type_3
    parent = stub_relation_set.parent_relation_parentid
    parentid = stub_relation_set.parentid_key
    for relation in (
            child1,
            child2,
            child3,
            parent,
    ):
        relation = stub_out_sampling(relation)

    child1.data = pd.DataFrame([{parentid: "1"}, {parentid: "10"}])
    child2.data = pd.DataFrame([{parentid: "2"}, {parentid: "20"}])
    child3.data = pd.DataFrame([{parentid: "3"}, {parentid: "30"}])
    dag = nx.DiGraph()
    dag.add_edge(child1,
                 parent,
                 direction="polymorphic",
                 remote_attribute=parentid,
                 local_attribute=parentid)
    dag.add_edge(child2,
                 parent,
                 direction="polymorphic",
                 remote_attribute=parentid,
                 local_attribute=parentid)
    dag.add_edge(child3,
                 parent,
                 direction="polymorphic",
                 remote_attribute=parentid,
                 local_attribute=parentid)
    adapter = SnowflakeAdapter()
    child1 = RuntimeSourceCompiler.compile_queries_for_relation(
        child1, dag, adapter, False)
    child2 = RuntimeSourceCompiler.compile_queries_for_relation(
        child2, dag, adapter, False)
    child3 = RuntimeSourceCompiler.compile_queries_for_relation(
        child3, dag, adapter, False)
    parent = RuntimeSourceCompiler.compile_queries_for_relation(
        parent, dag, adapter, False)

    expected_query = f"""
        SELECT 
            * 
        FROM 
        {parent.quoted_dot_notation}
        WHERE ( {parentid} IN ('1','10') OR {parentid} IN ('2','20') OR {parentid} IN ('3','30') )
    """
    assert query_equalize(
        parent.compiled_query) == query_equalize(expected_query)
Esempio n. 2
0
def test_run_deps_bidirectional_exclude_outliers(stub_relation_set):
    upstream = stub_relation_set.upstream_relation
    downstream = stub_relation_set.downstream_relation
    for relation in (
            downstream,
            upstream,
    ):
        relation.attributes = [Attribute('id', dt.INTEGER)]
        relation = stub_out_sampling(relation)
    upstream.data = pd.DataFrame([dict(id=1), dict(id=2), dict(id=3)])

    dag = nx.DiGraph()
    dag.add_edge(upstream,
                 downstream,
                 direction="bidirectional",
                 remote_attribute='id',
                 local_attribute='id')
    adapter = SnowflakeAdapter()
    RuntimeSourceCompiler.compile_queries_for_relation(upstream, dag, adapter,
                                                       False)
    RuntimeSourceCompiler.compile_queries_for_relation(downstream, dag,
                                                       adapter, False)
    assert query_equalize(downstream.compiled_query) == query_equalize(f"""
            SELECT
                *
            FROM {downstream.quoted_dot_notation}
            WHERE id IN (1,2,3)
    """)

    assert query_equalize(upstream.compiled_query) == query_equalize(f"""
        WITH {upstream.scoped_cte('SNOWSHU_FINAL_SAMPLE')} AS ( 
        SELECT 
            * 
        FROM 
            {upstream.quoted_dot_notation} 
        WHERE 
            id 
        in (SELECT 
                id 
            FROM 
                {downstream.quoted_dot_notation}) ) 
        ,{upstream.scoped_cte('SNOWSHU_DIRECTIONAL_SAMPLE')} AS ( 
            SELECT 
                * 
            FROM 
                {upstream.scoped_cte('SNOWSHU_FINAL_SAMPLE')} SAMPLE BERNOULLI (1500 ROWS) 
        ) 
        SELECT 
            * 
        FROM 
        {upstream.scoped_cte('SNOWSHU_DIRECTIONAL_SAMPLE')}
    """)
Esempio n. 3
0
def test_run_iso(stub_relation_set):
    iso = stub_relation_set.iso_relation
    iso = stub_out_sampling(iso)
    dag = nx.DiGraph()
    dag.add_nodes_from([iso])
    compiler = RuntimeSourceCompiler()
    adapter = SnowflakeAdapter()
    result = compiler.compile_queries_for_relation(iso, dag, adapter, False)
    assert query_equalize(iso.compiled_query) == query_equalize(f"""
SELECT
    *
FROM 
    {iso.quoted_dot_notation}
    SAMPLE BERNOULLI (1500 ROWS)
""")
Esempio n. 4
0
def test_analyze_unsampled(stub_relation_set):
    upstream = stub_relation_set.upstream_relation
    upstream.unsampled = True
    dag = nx.DiGraph()
    dag.add_edges_from([(
        upstream,
        stub_relation_set.downstream_relation,
    )])
    compiler = RuntimeSourceCompiler()
    adapter, unsampled_method = [mock.MagicMock() for _ in range(2)]
    adapter.unsampled_statement = unsampled_method

    result = compiler.compile_queries_for_relation(upstream, dag, adapter,
                                                   True)
    assert unsampled_method.called
Esempio n. 5
0
def test_analyze_iso(stub_relation_set):
    iso = stub_relation_set.iso_relation
    iso = stub_out_sampling(iso)
    dag = nx.DiGraph()
    dag.add_nodes_from([iso])
    compiler = RuntimeSourceCompiler()
    adapter = SnowflakeAdapter()
    result = compiler.compile_queries_for_relation(iso, dag, adapter, True)
    assert query_equalize(iso.compiled_query) == query_equalize(f"""
WITH
    {iso.scoped_cte('SNOWSHU_COUNT_POPULATION')} AS (
SELECT
    COUNT(*) AS population_size
FROM
    {iso.quoted_dot_notation}
)
,{iso.scoped_cte('SNOWSHU_CORE_SAMPLE')} AS (
SELECT
    *
FROM 
    {iso.quoted_dot_notation}
    SAMPLE BERNOULLI (1500 ROWS)
)
,{iso.scoped_cte('SNOWSHU_CORE_SAMPLE')}_COUNT AS (
SELECT
    COUNT(*) AS sample_size
FROM
    {iso.scoped_cte('SNOWSHU_CORE_SAMPLE')}
)
SELECT
    s.sample_size AS sample_size
    ,p.population_size AS population_size
FROM
    {iso.scoped_cte('SNOWSHU_CORE_SAMPLE')}_COUNT s
INNER JOIN
    {iso.scoped_cte('SNOWSHU_COUNT_POPULATION')} p
ON
    1=1
LIMIT 1
""")
Esempio n. 6
0
def test_run_unsampled(stub_relation_set):
    upstream = stub_relation_set.upstream_relation
    upstream.unsampled = True
    dag = nx.DiGraph()
    dag.add_edges_from([(
        upstream,
        stub_relation_set.downstream_relation,
    )])
    adapter = SnowflakeAdapter()
    upstream = RuntimeSourceCompiler.compile_queries_for_relation(
        upstream, dag, adapter, False)
    assert query_equalize(upstream.compiled_query) == query_equalize(f"""
        SELECT
            *
        FROM {upstream.quoted_dot_notation}
    """)
Esempio n. 7
0
def test_analyze_unsampled(stub_relation_set):
    upstream = stub_relation_set.upstream_relation
    upstream.unsampled = True
    dag = nx.DiGraph()
    dag.add_edges_from([(
        upstream,
        stub_relation_set.downstream_relation,
    )])
    adapter = SnowflakeAdapter()
    upstream = RuntimeSourceCompiler.compile_queries_for_relation(
        upstream, dag, adapter, True)
    assert query_equalize(upstream.compiled_query) == query_equalize(f"""
        WITH
            {upstream.scoped_cte('SNOWSHU_COUNT_POPULATION')} AS (
        SELECT
            COUNT(*) AS population_size
        FROM
            {upstream.quoted_dot_notation}
        )
        ,{upstream.scoped_cte('SNOWSHU_CORE_SAMPLE')} AS (
        SELECT
            *
        FROM
            {upstream.quoted_dot_notation}
        )
        ,{upstream.scoped_cte('SNOWSHU_CORE_SAMPLE_COUNT')} AS (
        SELECT
            COUNT(*) AS sample_size
        FROM
            {upstream.scoped_cte('SNOWSHU_CORE_SAMPLE')}
        )
        SELECT
            s.sample_size AS sample_size
            ,p.population_size AS population_size
        FROM
            {upstream.scoped_cte('SNOWSHU_CORE_SAMPLE_COUNT')} s
        INNER JOIN
            {upstream.scoped_cte('SNOWSHU_COUNT_POPULATION')} p
        ON
            1=1
        LIMIT 1
    """)
Esempio n. 8
0
def test_run_deps_mixed_multi_deps():
    r"""
        a --bidir--> c <--dir-- b
         \          / \
          \     bidir  dir
           \       |   |
            \      V   V
             dir-> d   e
    """
    relation_helper = RelationTestHelper()
    relation_a = Relation(name='rel_a',
                          **relation_helper.rand_relation_helper())
    relation_a.attributes = [
        Attribute('col_a_c', dt.INTEGER),
        Attribute('col_a_d', dt.VARCHAR)
    ]
    relation_a.data = pd.DataFrame({
        "col_a_c": [
            1,
            2,
            3,
            4,
            5,
        ],
        "col_a_d": ["var_a_1", "var_a_2", "var_a_3", "var_a_1", "var_a_2"],
    })
    relation_b = Relation(name='rel_b',
                          **relation_helper.rand_relation_helper())
    relation_b.attributes = [Attribute('col_b_c', dt.VARCHAR)]
    relation_b.data = pd.DataFrame({
        "col_b_c": [
            "val1",
            "val2",
            "val3",
            "val4",
            "val5",
        ],
    })

    relation_c = Relation(name='rel_c',
                          **relation_helper.rand_relation_helper())
    relation_c.attributes = [
        Attribute('col_c_ae', dt.INTEGER),
        Attribute('col_c_bd', dt.VARCHAR)
    ]
    relation_c.data = pd.DataFrame({
        "col_c_ae": [
            1,
            1,
            2,
            2,
            5,
            5,
            5,
        ],
        "col_c_bd": [
            "val1",
            "val1",
            "val2",
            "val2",
            "val5",
            "val5",
            "val5",
        ]
    })

    relation_d = Relation(name='rel_d',
                          **relation_helper.rand_relation_helper())
    relation_d.attributes = [
        Attribute('col_d_a', dt.INTEGER),
        Attribute('col_d_c', dt.INTEGER)
    ]

    relation_e = Relation(name='rel_e',
                          **relation_helper.rand_relation_helper())
    relation_e.attributes = [Attribute('col_e_c', dt.INTEGER)]

    for relation in (
            relation_a,
            relation_b,
            relation_c,
            relation_d,
            relation_e,
    ):
        relation = stub_out_sampling(relation)

    dag = nx.DiGraph()
    dag.add_edge(relation_a,
                 relation_c,
                 direction="bidirectional",
                 remote_attribute="col_a_c",
                 local_attribute="col_c_ae")
    dag.add_edge(relation_a,
                 relation_d,
                 direction="directional",
                 remote_attribute="col_a_d",
                 local_attribute="col_d_a")
    dag.add_edge(relation_b,
                 relation_c,
                 direction="directional",
                 remote_attribute="col_b_c",
                 local_attribute="col_c_bd")
    dag.add_edge(relation_c,
                 relation_d,
                 direction="bidirectional",
                 remote_attribute="col_c_bd",
                 local_attribute="col_d_c")
    dag.add_edge(relation_c,
                 relation_e,
                 direction="directional",
                 remote_attribute="col_c_ae",
                 local_attribute="col_e_c")
    adapter = SnowflakeAdapter()
    RuntimeSourceCompiler.compile_queries_for_relation(relation_a, dag,
                                                       adapter, False)
    RuntimeSourceCompiler.compile_queries_for_relation(relation_b, dag,
                                                       adapter, False)
    RuntimeSourceCompiler.compile_queries_for_relation(relation_c, dag,
                                                       adapter, False)
    RuntimeSourceCompiler.compile_queries_for_relation(relation_d, dag,
                                                       adapter, False)
    RuntimeSourceCompiler.compile_queries_for_relation(relation_e, dag,
                                                       adapter, False)
    assert query_equalize(relation_a.compiled_query) == query_equalize(f"""
        WITH {relation_a.scoped_cte('SNOWSHU_FINAL_SAMPLE')} AS ( 
        SELECT 
            * 
        FROM 
            {relation_a.quoted_dot_notation} 
        WHERE 
            col_a_c 
        in (SELECT 
                col_c_ae 
            FROM 
                {relation_c.quoted_dot_notation}) ) 
        ,{relation_a.scoped_cte('SNOWSHU_DIRECTIONAL_SAMPLE')} AS ( 
            SELECT 
                * 
            FROM 
                {relation_a.scoped_cte('SNOWSHU_FINAL_SAMPLE')} SAMPLE BERNOULLI (1500 ROWS) 
        ) 
        SELECT 
            * 
        FROM 
        {relation_a.scoped_cte('SNOWSHU_DIRECTIONAL_SAMPLE')}
    """)
    assert query_equalize(relation_b.compiled_query) == query_equalize(f"""
        SELECT
            *
        FROM
            {relation_b.quoted_dot_notation}
        SAMPLE BERNOULLI (1500 ROWS)
    """)
    assert query_equalize(relation_c.compiled_query) == query_equalize(f"""
        SELECT 
            * 
        FROM 
            {relation_c.quoted_dot_notation} 
        WHERE 
            col_c_bd 
        in (SELECT 
                col_d_c 
            FROM 
                {relation_d.quoted_dot_notation})
        AND
            col_c_ae IN (1,2,3,4,5)
        AND
            col_c_bd IN ('val1','val2','val3','val4','val5')
    """)
    assert query_equalize(relation_d.compiled_query) == query_equalize(f"""
        SELECT 
            * 
        FROM 
            {relation_d.quoted_dot_notation}
        WHERE 
            col_d_a IN ('var_a_1','var_a_2','var_a_3') 
        AND
            col_d_c IN ('val1','val2','val5') 
    """)
    assert query_equalize(relation_e.compiled_query) == query_equalize(f"""
        WITH 
        {relation_e.scoped_cte('SNOWSHU_FINAL_SAMPLE')} AS ( 
        SELECT 
            * 
        FROM 
        {relation_e.quoted_dot_notation}
        WHERE 
            col_e_c IN (1,2,5) 
        )
        ,{relation_e.scoped_cte('SNOWSHU_DIRECTIONAL_SAMPLE')} AS ( 
        SELECT 
            * 
        FROM 
        {relation_e.scoped_cte('SNOWSHU_FINAL_SAMPLE')} SAMPLE BERNOULLI (1500 ROWS) 
        ) 
        SELECT 
            * 
        FROM 
        {relation_e.scoped_cte('SNOWSHU_DIRECTIONAL_SAMPLE')}
    """)
Esempio n. 9
0
def test_run_deps_bidirectional_multi_deps():
    """
        a --bidir--> c <--bidir-- b
    """
    relation_helper = RelationTestHelper()
    relation_a = Relation(name='rel_a',
                          **relation_helper.rand_relation_helper())
    relation_a.attributes = [Attribute('col_a', dt.INTEGER)]
    relation_a.data = pd.DataFrame({"col_a": [
        1,
        2,
        3,
        4,
        5,
    ]})

    relation_b = Relation(name='rel_b',
                          **relation_helper.rand_relation_helper())
    relation_b.attributes = [Attribute('col_b', dt.VARCHAR)]
    relation_b.data = pd.DataFrame({
        "col_b": [
            "val1",
            "val2",
            "val3",
            "val4",
            "val5",
        ],
    })

    relation_c = Relation(name='rel_c',
                          **relation_helper.rand_relation_helper())
    relation_c.attributes = [
        Attribute('col_c_a', dt.INTEGER),
        Attribute('col_c_b', dt.VARCHAR)
    ]

    for relation in (
            relation_a,
            relation_b,
            relation_c,
    ):
        relation = stub_out_sampling(relation)

    dag = nx.DiGraph()
    dag.add_edge(relation_a,
                 relation_c,
                 direction="bidirectional",
                 remote_attribute="col_a",
                 local_attribute="col_c_a")
    dag.add_edge(relation_b,
                 relation_c,
                 direction="bidirectional",
                 remote_attribute="col_b",
                 local_attribute="col_c_b")
    adapter = SnowflakeAdapter()
    RuntimeSourceCompiler.compile_queries_for_relation(relation_a, dag,
                                                       adapter, False)
    RuntimeSourceCompiler.compile_queries_for_relation(relation_b, dag,
                                                       adapter, False)
    RuntimeSourceCompiler.compile_queries_for_relation(relation_c, dag,
                                                       adapter, False)
    assert query_equalize(relation_a.compiled_query) == query_equalize(f"""
        WITH {relation_a.scoped_cte('SNOWSHU_FINAL_SAMPLE')} AS ( 
        SELECT 
            * 
        FROM 
            {relation_a.quoted_dot_notation} 
        WHERE 
            col_a 
        in (SELECT 
                col_c_a 
            FROM 
                {relation_c.quoted_dot_notation}) ) 
        ,{relation_a.scoped_cte('SNOWSHU_DIRECTIONAL_SAMPLE')} AS ( 
            SELECT 
                * 
            FROM 
                {relation_a.scoped_cte('SNOWSHU_FINAL_SAMPLE')} SAMPLE BERNOULLI (1500 ROWS) 
        ) 
        SELECT 
            * 
        FROM 
        {relation_a.scoped_cte('SNOWSHU_DIRECTIONAL_SAMPLE')}
    """)
    assert query_equalize(relation_b.compiled_query) == query_equalize(f"""
        WITH 
        {relation_b.scoped_cte('SNOWSHU_FINAL_SAMPLE')} AS ( 
        SELECT 
            * 
        FROM 
        {relation_b.quoted_dot_notation}
        WHERE 
            col_b 
        in (SELECT 
                col_c_b 
            FROM 
                {relation_c.quoted_dot_notation})
        )
        ,{relation_b.scoped_cte('SNOWSHU_DIRECTIONAL_SAMPLE')} AS ( 
        SELECT 
            * 
        FROM 
        {relation_b.scoped_cte('SNOWSHU_FINAL_SAMPLE')} SAMPLE BERNOULLI (1500 ROWS) 
        ) 
        SELECT 
            * 
        FROM 
        {relation_b.scoped_cte('SNOWSHU_DIRECTIONAL_SAMPLE')}
    """)
    assert query_equalize(relation_c.compiled_query) == query_equalize(f"""
            SELECT
                *
            FROM {relation_c.quoted_dot_notation} 
            WHERE
                col_c_a IN (1,2,3,4,5)
            AND
                col_c_b IN ('val1','val2','val3','val4','val5')
    """)
Esempio n. 10
0
def test_run_deps_polymorphic_idtype(stub_relation_set):
    child1 = stub_relation_set.child_relation_type_1
    child2 = stub_relation_set.child_relation_type_2
    child3 = stub_relation_set.child_relation_type_3
    parent = stub_relation_set.parent_relation_childid_type
    childid = stub_relation_set.childid_key
    childtype = stub_relation_set.childtype_key
    child2type_override = stub_relation_set.child2override_key
    local_overrides = {child2.dot_notation: child2type_override}
    for relation in (
            child1,
            child2,
            child3,
            parent,
    ):
        relation = stub_out_sampling(relation)

    child1.data = pd.DataFrame([{childid: "1"}, {childid: "2"}])
    child2.data = pd.DataFrame([{childid: "1"}, {childid: "3"}])
    child3.data = pd.DataFrame([{childid: "1"}, {childid: "4"}])
    dag = nx.DiGraph()
    dag.add_edge(child1,
                 parent,
                 direction="polymorphic",
                 remote_attribute=childid,
                 local_attribute=childid,
                 local_type_attribute=childtype,
                 local_type_overrides=local_overrides)
    dag.add_edge(child2,
                 parent,
                 direction="polymorphic",
                 remote_attribute=childid,
                 local_attribute=childid,
                 local_type_attribute=childtype,
                 local_type_overrides=local_overrides)
    dag.add_edge(child3,
                 parent,
                 direction="polymorphic",
                 remote_attribute=childid,
                 local_attribute=childid,
                 local_type_attribute=childtype,
                 local_type_overrides=local_overrides)
    adapter = SnowflakeAdapter()
    child1 = RuntimeSourceCompiler.compile_queries_for_relation(
        child1, dag, adapter, False)
    child2 = RuntimeSourceCompiler.compile_queries_for_relation(
        child2, dag, adapter, False)
    child3 = RuntimeSourceCompiler.compile_queries_for_relation(
        child3, dag, adapter, False)
    parent = RuntimeSourceCompiler.compile_queries_for_relation(
        parent, dag, adapter, False)

    expected_query = f"""
        SELECT 
            * 
        FROM 
        {parent.quoted_dot_notation}
        WHERE ( ({childid} IN ('1','2') AND LOWER({childtype}) = LOWER('child_type_1_record') ) OR ({childid} IN ('1','3') AND LOWER({childtype}) = LOWER('{child2type_override}') ) OR ({childid} IN ('1','4') AND LOWER({childtype}) = LOWER('child_type_3_record') ) )
    """
    assert query_equalize(
        parent.compiled_query) == query_equalize(expected_query)
Esempio n. 11
0
    def _traverse_and_execute(
            self, executable: GraphExecutable
    ) -> None:  # noqa mccabe: disable=MC0001
        """ Processes a single graph and loads the data into the replica if required

            To save memory after processing, the loaded dataframes are deleted, and
            garbage collection manually called.

            Args:
                executable (GraphExecutable): object that contains all of the necessary info for
                    executing a sample and loading it into the target
        """
        start_time = time.time()
        if self.barf:
            with open(
                    os.path.join(
                        self.barf_output,
                        f'{[n for n in executable.graph.nodes][0].dot_notation}.component'
                    ), 'wb') as cmp_file:  # noqa pylint: disable=unnecessary-comprehension
                nx.write_multiline_adjlist(executable.graph, cmp_file)
        try:
            logger.debug(
                f"Executing graph with {len(executable.graph)} relations in it..."
            )
            for i, relation in enumerate(
                    nx.algorithms.dag.topological_sort(executable.graph)):
                relation.population_size = executable.source_adapter.scalar_query(
                    executable.source_adapter.population_count_statement(
                        relation))
                logger.info(
                    f'Executing source query for relation {relation.dot_notation} '
                    f'({i+1} of {len(executable.graph)} in graph)...')

                relation.sampling.prepare(relation, executable.source_adapter)
                relation = RuntimeSourceCompiler.compile_queries_for_relation(
                    relation, executable.graph, executable.source_adapter,
                    executable.analyze)

                if executable.analyze:
                    if relation.is_view:
                        relation.population_size = "N/A"
                        relation.sample_size = "N/A"
                        logger.info(
                            f'Relation {relation.dot_notation} is a view, skipping.'
                        )
                    else:
                        result = executable.source_adapter.check_count_and_query(
                            relation.compiled_query, MAX_ALLOWED_ROWS,
                            relation.unsampled).iloc[0]
                        relation.population_size = result.population_size
                        relation.sample_size = result.sample_size
                        logger.info(
                            f'Analysis of relation {relation.dot_notation} completed in {duration(start_time)}.'
                        )
                else:
                    executable.target_adapter.create_database_if_not_exists(
                        relation.quoted(relation.database))
                    executable.target_adapter.create_schema_if_not_exists(
                        relation.quoted(relation.database),
                        relation.quoted(relation.schema))
                    if relation.is_view:
                        logger.info(
                            f'Retrieving DDL statement for view {relation.dot_notation} in source...'
                        )
                        relation.population_size = "N/A"
                        relation.sample_size = "N/A"
                        try:
                            relation.view_ddl = executable.source_adapter.scalar_query(
                                relation.compiled_query)
                        except Exception:
                            raise SystemError(
                                f'Failed to extract DDL statement: {relation.compiled_query}'
                            )
                        logger.info(
                            f'Successfully extracted DDL statement for view {relation.quoted_dot_notation}'
                        )
                    else:
                        logger.info(
                            f'Retrieving records from source {relation.dot_notation}...'
                        )
                        try:
                            relation.data = executable.source_adapter.check_count_and_query(
                                relation.compiled_query, MAX_ALLOWED_ROWS,
                                relation.unsampled)
                        except Exception as exc:
                            raise SystemError(
                                f'Failed execution of extraction sql statement: {relation.compiled_query} {exc}'
                            )

                        relation.sample_size = len(relation.data)
                        logger.info(
                            f'{relation.sample_size} records retrieved for relation {relation.dot_notation}.'
                        )

                    logger.info(
                        f'Inserting relation {relation.quoted_dot_notation} into target...'
                    )
                    try:
                        executable.target_adapter.create_and_load_relation(
                            relation)
                    except Exception as exc:
                        raise SystemError(
                            f'Failed to load relation {relation.quoted_dot_notation} into target: {exc}'
                        )

                    logger.info(
                        f'Done replication of relation {relation.dot_notation} in {duration(start_time)}.'
                    )
                    relation.target_loaded = True
                relation.source_extracted = True
                logger.info(
                    f'population:{relation.population_size}, sample:{relation.sample_size}'
                )
                if self.barf:
                    with open(
                            os.path.join(self.barf_output,
                                         f'{relation.dot_notation}.sql'),
                            'w') as barf_file:
                        barf_file.write(relation.compiled_query)
            try:
                for relation in executable.graph.nodes:
                    del relation.data
            except AttributeError:
                pass
            gc.collect()
        except Exception as exc:
            logger.error(f'failed with error of type {type(exc)}: {str(exc)}')
            raise exc
Esempio n. 12
0
    def _traverse_and_execute(self, executable: GraphExecutable,
                              start_time: int) -> None:
        try:
            logger.debug(
                f"Executing graph with {len(executable.graph)} relations in it..."
            )
            for i, relation in enumerate(
                    nx.algorithms.dag.topological_sort(executable.graph)):
                relation.population_size = executable.source_adapter.scalar_query(
                    executable.source_adapter.population_count_statement(
                        relation))
                logger.info(
                    f'Executing graph {i+1} of {len(executable.graph)} source query for relation {relation.dot_notation}...'
                )

                relation.sampling.prepare(relation, executable.source_adapter)
                relation = RuntimeSourceCompiler.compile_queries_for_relation(
                    relation, executable.graph, executable.source_adapter,
                    executable.analyze)

                if executable.analyze:
                    if relation.is_view:
                        relation.population_size = "N/A"
                        relation.sample_size = "N/A"
                        logger.info(
                            f'Relation {relation.dot_notation} is a view, skipping.'
                        )
                    else:
                        result = [
                            row for row in
                            executable.source_adapter.check_count_and_query(
                                relation.compiled_query,
                                MAX_ALLOWED_ROWS).itertuples()
                        ][0]
                        relation.population_size = result.population_size
                        relation.sample_size = result.sample_size
                        logger.info(
                            f'Analysis of relation {relation.dot_notation} completed in {duration(start_time)}.'
                        )
                else:
                    executable.target_adapter.create_database_if_not_exists(
                        relation.quoted(relation.database))
                    executable.target_adapter.create_schema_if_not_exists(
                        relation.quoted(relation.database),
                        relation.quoted(relation.schema))
                    if relation.is_view:
                        logger.info(
                            f'Retrieving DDL statement for view {relation.dot_notation} in source...'
                        )
                        relation.population_size = "N/A"
                        relation.sample_size = "N/A"
                        try:
                            relation.view_ddl = executable.source_adapter.scalar_query(
                                relation.compiled_query)
                        except Exception:
                            raise SystemError(
                                f'Failed to extract DDL statement: {relation.compiled_query}'
                            )
                        logger.info(
                            f'Successfully extracted DDL statement for view {relation.quoted_dot_notation}'
                        )
                    else:
                        logger.info(
                            f'Retrieving records from source {relation.dot_notation}...'
                        )
                        try:
                            relation.data = executable.source_adapter.check_count_and_query(
                                relation.compiled_query, MAX_ALLOWED_ROWS)
                        except Exception as e:
                            raise SystemError(
                                f'Failed execution of extraction sql statement: {relation.compiled_query} {e}'
                            )

                        relation.sample_size = len(relation.data)
                        logger.info(
                            f'{relation.sample_size} records retrieved for relation {relation.dot_notation}.'
                        )

                    logger.info(
                        f'Inserting relation {relation.quoted_dot_notation} into target...'
                    )
                    try:
                        executable.target_adapter.create_and_load_relation(
                            relation)
                    except Exception as e:
                        raise SystemError(
                            f'Failed to load relation {relation.quoted_dot_notation} into target: {e}'
                        )

                    logger.info(
                        f'Done replication of relation {relation.dot_notation} in {duration(start_time)}.'
                    )
                    relation.target_loaded = True
                relation.source_extracted = True
                logger.info(
                    f'population:{relation.population_size}, sample:{relation.sample_size}'
                )
                if self.barf:
                    with open(
                            os.path.join(self.barf_output,
                                         f'{relation.dot_notation}.sql'),
                            'w') as f:
                        f.write(relation.compiled_query)
            try:
                for relation in executable.graph.nodes:
                    del relation.data
            except AttributeError:
                pass
            gc.collect()
        except Exception as e:
            logger.error(f'failed with error of type {type(e)}: {str(e)}')
            raise e