def test_enrichment_query_by_points_two_variables_different_datasets(
            self, geography_get_mock, dataset_get_mock):
        """Variables from two different datasets produce one enrichment query each."""
        enrichment = Enrichment(credentials=self.credentials)

        temp_table_name = 'test_table'
        project = 'project'
        geo_table = 'geo_table'
        # One spec per variable: (dataset, table, variable name, column).
        specs = [
            ('dataset1', 'table1', 'variable1', 'column1'),
            ('dataset2', 'table2', 'variable2', 'column2'),
        ]

        variables = [
            Variable({
                'id': '{}.{}.{}.{}'.format(project, ds, tbl, var_name),
                'column_name': col,
                'dataset_id': 'fake_name'
            })
            for ds, tbl, var_name, col in specs
        ]

        # The geography view is derived from the first dataset.
        geo_view = 'view_{}_{}'.format(specs[0][0], geo_table)

        dataset_get_mock.return_value = CatalogEntityWithGeographyMock(
            '{}.{}.{}'.format(project, specs[0][0], geo_table))
        geography_get_mock.return_value = GeographyMock()

        actual_queries = enrichment._get_points_enrichment_sql(
            temp_table_name, variables, [])

        expected_queries = [
            get_query([col], self.username,
                      'view_{}_{}'.format(ds, tbl),
                      geo_view, temp_table_name)
            for ds, tbl, _, col in specs
        ]

        # Compare normalized, order-independent query text.
        assert sorted(_clean_queries(actual_queries)) == \
            sorted(_clean_queries(expected_queries))
    def test_build_where_conditions_by_variable(self):
        """Filters keyed by a variable's id become WHERE conditions; anything
        else yields None."""
        variable = Variable({
            'id': 'id',
            'column_name': 'column',
            'dataset_id': 'fake_name',
            'agg_method': 'sum'
        })

        # No filters at all -> nothing to build.
        assert _build_where_conditions_by_variable(variable, {}) is None

        # Filters exist but none match this variable's id.
        assert _build_where_conditions_by_variable(
            variable, {'unexistingid': ''}) is None

        # A single operator string becomes a single condition.
        assert _build_where_conditions_by_variable(
            variable, {variable.id: '> 50'}
        ) == ["enrichment_table.{} > 50".format(variable.column_name)]

        # A list of operators becomes one condition per entry, in order.
        assert _build_where_conditions_by_variable(
            variable, {variable.id: ['> 50', '< 100']}
        ) == [
            "enrichment_table.{} > 50".format(variable.column_name),
            "enrichment_table.{} < 100".format(variable.column_name)
        ]
    def test_build_polygons_query_variables_with_aggregation(self, column_query_mock):
        # Stub column builder: encodes whether the suffix flag was passed.
        # NOTE: the parameter name `sufix` is kept as-is in case production
        # code calls it by keyword.
        def fake_column(variable, aggregation, sufix=False):
            return '{}_{}'.format(variable.column_name, str(sufix))

        column_query_mock.side_effect = fake_column

        variable = Variable({
            'id': 'id',
            'column_name': 'column',
            'dataset_id': 'fake_name',
            'agg_method': 'SUM'
        })

        unsuffixed = '{}_{}'.format(variable.column_name, 'False')

        # Any single-aggregation form (defaults, a global string, a matching
        # or non-matching per-variable dict) yields one unsuffixed column.
        for aggregation in (AGGREGATION_DEFAULT,
                            AGGREGATION_NONE,
                            'AVG',
                            {variable.id: 'AVG'},
                            {'unexisting_id': 'AVG'}):
            assert _build_polygons_query_variables_with_aggregation(
                [variable], aggregation) == unsuffixed

        # Multiple aggregations for one variable -> suffixed columns, joined.
        suffixed = '{}_{}, {}_{}'.format(variable.column_name, 'True',
                                         variable.column_name, 'True')
        assert _build_polygons_query_variables_with_aggregation(
            [variable], {variable.id: ['AVG', 'SUM']}) == suffixed
    def test_prepare_variables_without_agg_method_and_custom_agg(self, get_mock, _validate_bq_operations_mock):
        """A variable without agg_method survives preparation only when an
        applicable aggregation is available."""
        _validate_bq_operations_mock.return_value = True

        variable_id = 'project.dataset.table.variable'
        variable = Variable({
            'id': variable_id,
            'column_name': 'column',
            'dataset_id': 'fake_name',
            'agg_method': None
        })
        get_mock.return_value = variable

        credentials = Credentials('fake_user', '1234')

        # Each scenario: (extra kwargs for prepare_variables, expected result).
        scenarios = [
            ({}, [variable]),
            ({'aggregation': {}}, []),
            ({'aggregation': {variable_id: 'SUM'}}, [variable]),
        ]

        # Both the id string and the Variable instance must behave the same.
        for kwargs, expected in scenarios:
            for case in (variable_id, variable):
                assert prepare_variables(case, credentials, **kwargs) == expected
    def test_prepare_variables_works_with_private_and_subscribed(self, get_mock, entity_repo, get_all_mock):
        """A private dataset the user is subscribed to does not block preparation."""
        dataset = Dataset({
            'id': 'id',
            'slug': 'slug',
            'name': 'name',
            'description': 'description',
            'available_in': ['bq'],
            'geography_id': 'geography',
            'is_public_data': False
        })
        entity_repo.return_value = dataset     # dataset lookup
        get_all_mock.return_value = [dataset]  # user's subscriptions include it

        variable = Variable({
            'id': 'id',
            'column_name': 'column',
            'dataset_id': 'fake_name',
            'slug': 'slug'
        })
        get_mock.return_value = variable

        credentials = Credentials('fake_user', '1234')

        assert prepare_variables(variable, credentials) == [variable]
    def test_prepare_variables_fails_with_private(self, get_mock, entity_repo, get_all_mock):
        """A private dataset without a subscription raises EnrichmentError."""
        dataset = Dataset({
            'id': 'id',
            'slug': 'slug',
            'name': 'name',
            'description': 'description',
            'available_in': ['bq'],
            'geography_id': 'geography',
            'is_public_data': False
        })
        entity_repo.return_value = dataset  # dataset lookup
        get_all_mock.return_value = []      # user has no subscriptions

        variable = Variable({
            'id': 'id',
            'column_name': 'column',
            'dataset_id': 'fake_name',
            'slug': 'slug'
        })
        get_mock.return_value = variable

        credentials = Credentials('fake_user', '1234')

        with pytest.raises(EnrichmentError) as e:
            prepare_variables(variable, credentials)

        expected_message = """
            You are not subscribed to the Dataset '{}' yet. Please, use the subscribe method first.
        """.format(dataset.id)
        assert str(e.value) == expected_message
    def test_prepare_variables_raises_if_not_available_in_bq(self, get_mock, entity_repo, get_all_mock):
        """A dataset not marked as available in BigQuery raises EnrichmentError."""
        dataset = Dataset({
            'id': 'id',
            'slug': 'slug',
            'name': 'name',
            'description': 'description',
            'available_in': [],
            'geography_id': 'geography',
            'is_public_data': False
        })
        entity_repo.return_value = dataset     # dataset lookup
        get_all_mock.return_value = [dataset]  # subscribed, but not in 'bq'

        variable = Variable({
            'id': 'id',
            'column_name': 'column',
            'dataset_id': 'fake_name',
            'slug': 'slug'
        })
        get_mock.return_value = variable

        credentials = Credentials('fake_user', '1234')

        with pytest.raises(EnrichmentError) as e:
            prepare_variables(variable, credentials)

        # NOTE(review): this formats the whole Dataset object while the
        # subscription test formats dataset.id — verify against the
        # implementation's actual message.
        expected_message = """
            The Dataset '{}' is not ready for Enrichment. Please, contact us for more information.
        """.format(dataset)
        assert str(e.value) == expected_message
    def test_prepare_variables(self, get_mock, _validate_bq_operations_mock):
        """prepare_variables resolves ids and instances, alone or in lists."""
        _validate_bq_operations_mock.return_value = True

        variable_id = 'project.dataset.table.variable'
        variable = Variable({
            'id': variable_id,
            'column_name': 'column',
            'dataset_id': 'fake_name'
        })
        get_mock.return_value = variable

        credentials = Credentials('fake_user', '1234')

        # A single variable, given as id string or instance, resolves to a
        # one-element list.
        for case in (variable_id, variable):
            assert prepare_variables(case, credentials) == [variable]

        # Lists resolve element-wise, whatever mix of ids/instances they hold.
        for case in ([variable_id, variable_id],
                     [variable, variable],
                     [variable, variable_id]):
            assert prepare_variables(case, credentials) == [variable, variable]
    def test_get_aggregation(self):
        """_get_aggregation resolves the effective aggregation from the
        variable's own agg_method and the user-provided aggregation value."""
        variable_agg = Variable({
            'id': 'id',
            'column_name': 'column',
            'dataset_id': 'fake_name',
            'agg_method': 'SUM'
        })

        # Defaults defer to the variable's own (lowercased) method.
        assert _get_aggregation(variable_agg, AGGREGATION_DEFAULT) == variable_agg.agg_method.lower()
        assert _get_aggregation(variable_agg, AGGREGATION_NONE) is None
        # A global string aggregation overrides, case-insensitively.
        assert _get_aggregation(variable_agg, 'sum') == 'sum'
        assert _get_aggregation(variable_agg, 'SUM') == 'sum'
        assert _get_aggregation(variable_agg, 'avg') == 'avg'
        assert _get_aggregation(variable_agg, 'AVG') == 'avg'
        # A per-variable dict overrides when it contains the variable's id...
        assert _get_aggregation(variable_agg, {variable_agg.id: 'AVG'}) == 'avg'
        # ...and falls back to the variable's own method when it does not.
        assert _get_aggregation(variable_agg, {}) == variable_agg.agg_method.lower()
        # A list of methods is returned lowercased, order preserved.
        assert _get_aggregation(variable_agg, {variable_agg.id: ['sum', 'avg']}) == ['sum', 'avg']
        assert _get_aggregation(variable_agg, {variable_agg.id: ['SUM', 'aVg']}) == ['sum', 'avg']

        variable_agg_none = Variable({
            'id': 'id',
            'column_name': 'column',
            'dataset_id': 'fake_name',
            'agg_method': None
        })

        # Without a declared method the defaults resolve to None...
        assert _get_aggregation(variable_agg_none, AGGREGATION_DEFAULT) is None
        assert _get_aggregation(variable_agg_none, AGGREGATION_NONE) is None
        # ...but explicit overrides still apply.
        assert _get_aggregation(variable_agg_none, 'sum') == 'sum'
        assert _get_aggregation(variable_agg_none, 'SUM') == 'sum'
        assert _get_aggregation(variable_agg_none, 'avg') == 'avg'
        assert _get_aggregation(variable_agg_none, 'AVG') == 'avg'
        # FIX: these dicts were previously keyed on variable_agg.id, which only
        # worked because both test variables happen to share the id 'id'.
        assert _get_aggregation(variable_agg_none, {variable_agg_none.id: 'AVG'}) == 'avg'
        assert _get_aggregation(variable_agg_none, {}) is None
        assert _get_aggregation(variable_agg_none, {variable_agg_none.id: ['sum', 'avg']}) == ['sum', 'avg']
        assert _get_aggregation(variable_agg_none, {variable_agg_none.id: ['suM', 'AVG']}) == ['sum', 'avg']
    def test_build_polygons_column_with_aggregation(self):
        """_build_polygons_column_with_aggregation builds the SELECT column
        for one variable: 'sum' gets area-weighted interpolation, other
        aggregations a plain call; the third argument prefixes the alias
        with the aggregation name.

        Note: the first two assertions compare the SQL strings exactly,
        including leading/trailing whitespace, so the triple-quoted literals
        below are indentation-sensitive.
        """
        variable = Variable({
            'id': 'id',
            'column_name': 'column',
            'dataset_id': 'fake_name',
            'agg_method': 'sum'
        })

        # 'sum' without suffix: area-weighted value, alias = column name.
        aggregation = 'sum'
        expected_sql = """
            sum(
                enrichment_table.{column} * (
                    ST_AREA(ST_INTERSECTION(enrichment_geo_table.geom, data_table.{geo_column}))
                    /
                    NULLIF(ST_AREA(enrichment_geo_table.geom), 0)
                )
            ) AS {column_name}
            """.format(
                column=variable.column_name,
                column_name=variable.column_name,
                geo_column=_GEOM_COLUMN)
        sql = _build_polygons_column_with_aggregation(variable, aggregation)
        assert sql == expected_sql

        # 'sum' with suffix flag: alias becomes 'sum_<column>'.
        aggregation = 'sum'
        expected_sql = """
            sum(
                enrichment_table.{column} * (
                    ST_AREA(ST_INTERSECTION(enrichment_geo_table.geom, data_table.{geo_column}))
                    /
                    NULLIF(ST_AREA(enrichment_geo_table.geom), 0)
                )
            ) AS {column_name}
            """.format(
                column=variable.column_name,
                column_name='sum_{}'.format(variable.column_name),
                geo_column=_GEOM_COLUMN)
        sql = _build_polygons_column_with_aggregation(variable, aggregation, True)
        assert sql == expected_sql

        # 'avg' without suffix: plain aggregation call, alias = column name.
        aggregation = 'avg'
        expected_sql = 'avg(enrichment_table.{column}) AS {column_name}'.format(
            column=variable.column_name,
            column_name=variable.column_name)

        sql = _build_polygons_column_with_aggregation(variable, aggregation)
        assert sql.strip() == expected_sql.strip()

        # 'avg' with suffix flag: alias becomes 'avg_<column>'.
        aggregation = 'avg'
        expected_sql = 'avg(enrichment_table.{column}) AS {column_name}'.format(
            column=variable.column_name,
            column_name='avg_{}'.format(variable.column_name))
        sql = _build_polygons_column_with_aggregation(variable, aggregation, True)
        assert sql.strip() == expected_sql.strip()
    def test_enrichment_query_by_points_with_filters(self, geography_get_mock,
                                                     dataset_get_mock,
                                                     _is_available_in_bq_mock):
        """Filters keyed by the variable's id surface as WHERE conditions in
        the generated point-enrichment query."""
        _is_available_in_bq_mock.return_value = True

        enrichment = Enrichment(credentials=self.credentials)

        temp_table_name = 'test_table'
        project = 'project'
        dataset = 'dataset'
        table = 'table'
        geo_table = 'geo_table'
        column = 'column1'

        variable = Variable({
            'id': '{}.{}.{}.{}'.format(project, dataset, table, 'variable1'),
            'column_name': column,
            'dataset_id': 'fake_name'
        })

        filters = {variable.id: "= 'a string'"}

        dataset_get_mock.return_value = CatalogEntityWithGeographyMock(
            '{}.{}.{}'.format(project, dataset, geo_table))
        geography_get_mock.return_value = GeographyMock()

        actual_queries = enrichment._get_points_enrichment_sql(
            temp_table_name, [variable], filters)

        expected_queries = [
            get_query([column], self.username,
                      'view_{}_{}'.format(dataset, table),
                      'view_{}_{}'.format(dataset, geo_table),
                      temp_table_name,
                      ["{} = 'a string'".format(variable.column_name)])
        ]

        # Compare normalized, order-independent query text.
        assert sorted(_clean_queries(actual_queries)) == \
            sorted(_clean_queries(expected_queries))
    def test_enrichment_query_by_points_one_variable(self, geography_get_mock,
                                                     dataset_get_mock):
        """A single variable produces exactly one point-enrichment query."""
        enrichment = Enrichment(credentials=self.credentials)

        temp_table_name = 'test_table'
        project = 'project'
        dataset = 'dataset'
        table = 'table'
        geo_table = 'geo_table'
        column = 'column1'

        variable = Variable({
            'id': '{}.{}.{}.{}'.format(project, dataset, table, 'variable1'),
            'column_name': column,
            'dataset_id': 'fake_name'
        })

        dataset_get_mock.return_value = CatalogEntityWithGeographyMock(
            '{}.{}.{}'.format(project, dataset, geo_table))
        geography_get_mock.return_value = GeographyMock()

        actual_queries = enrichment._get_points_enrichment_sql(
            temp_table_name, [variable], [])

        expected_queries = [
            get_query([column], self.username,
                      'view_{}_{}'.format(dataset, table),
                      'view_{}_{}'.format(dataset, geo_table),
                      temp_table_name)
        ]

        # Compare normalized, order-independent query text.
        assert sorted(_clean_queries(actual_queries)) == \
            sorted(_clean_queries(expected_queries))
# Beispiel #13 ("Example #13" — separator artifact from the snippet source)
# 0
from cartoframes.data.observatory import Enrichment, Variable


def file_path(path):
    """Return *path* joined onto this file's absolute parent directory."""
    base_dir = Path(__file__).parent.absolute()
    return '{}/{}'.format(base_dir, path)


def clean_gdf(gdf, sort_column=None):
    """Normalize a (Geo)DataFrame for comparison: alphabetize columns, round
    values to 5 decimals, reset the index, optionally sorting rows first.

    If *sort_column* is truthy the rows are sorted by that column before the
    index is reset; otherwise row order is kept.
    """
    normalized = gdf.sort_index(axis=1)
    if sort_column:
        normalized = normalized.sort_values(by=sort_column)
    return normalized.round(5).reset_index(drop=True)


# Catalog variables used as fixtures by the e2e tests below.
# NOTE(review): Variable.get resolves against the CARTO Data Observatory
# catalog at import time — presumably requires network access; confirm
# before running offline. Trailing comments note each variable's
# type and default aggregation.
public_variable1 = Variable.get('poverty_a86da569')  # FLOAT, AVG
public_variable2 = Variable.get('one_car_f7f299a7')  # FLOAT, SUM
public_variable3 = Variable.get('geoid_e99a58c1')  # STRING, NONE
private_variable1 = Variable.get('RSGCY7224_cb77b41d')  # INTEGER, SUM
private_variable2 = Variable.get('MLTCY7224_4ba39c69')  # INTEGER, SUM
private_variable3 = Variable.get('BLOCKGROUP_f1b3a750')  # STRING, NONE


class TestEnrichment(object):
    def setup_method(self):
        if (os.environ.get('APIKEY') and os.environ.get('USERNAME')):
            self.apikey = os.environ['APIKEY']
            self.username = os.environ['USERNAME']
        else:
            creds = json.loads(open('tests/e2e/secret.json').read())
            self.apikey = creds['APIKEY']