def test_enrichment_query_by_points_two_variables_different_datasets(
        self, geography_get_mock, dataset_get_mock):
    """Point enrichment emits one query per variable when the variables
    come from different datasets (one dataset view per query)."""
    enrichment = Enrichment(credentials=self.credentials)

    temp_table_name = 'test_table'
    project = 'project'
    dataset1, dataset2 = 'dataset1', 'dataset2'
    table1, table2 = 'table1', 'table2'
    variable1_name, variable2_name = 'variable1', 'variable2'
    column1, column2 = 'column1', 'column2'
    geo_table = 'geo_table'

    view1 = 'view_{}_{}'.format(dataset1, table1)
    view2 = 'view_{}_{}'.format(dataset2, table2)
    geo_view = 'view_{}_{}'.format(dataset1, geo_table)

    variable1 = Variable({
        'id': '{}.{}.{}.{}'.format(project, dataset1, table1, variable1_name),
        'column_name': column1,
        'dataset_id': 'fake_name'
    })
    variable2 = Variable({
        'id': '{}.{}.{}.{}'.format(project, dataset2, table2, variable2_name),
        'column_name': column2,
        'dataset_id': 'fake_name'
    })
    variables = [variable1, variable2]

    # The mocked catalog resolves the geography through the first dataset.
    catalog = CatalogEntityWithGeographyMock(
        '{}.{}.{}'.format(project, dataset1, geo_table))
    dataset_get_mock.return_value = catalog
    geography_get_mock.return_value = GeographyMock()

    actual_queries = enrichment._get_points_enrichment_sql(
        temp_table_name, variables, [])
    expected_queries = [
        get_query([column1], self.username, view1, geo_view, temp_table_name),
        get_query([column2], self.username, view2, geo_view, temp_table_name)
    ]

    # Query order is not guaranteed: compare after cleaning and sorting.
    actual = sorted(_clean_queries(actual_queries))
    expected = sorted(_clean_queries(expected_queries))
    assert actual == expected
def test_build_where_conditions_by_variable(self):
    """WHERE fragments are produced only for filters keyed by the
    variable's id; a value can be one condition or a list of them."""
    variable = Variable({
        'id': 'id',
        'column_name': 'column',
        'dataset_id': 'fake_name',
        'agg_method': 'sum'
    })

    # No filters at all -> nothing to build.
    assert _build_where_conditions_by_variable(variable, {}) is None

    # Filters keyed by another id are ignored.
    assert _build_where_conditions_by_variable(
        variable, {'unexistingid': ''}) is None

    # A single condition string yields a one-element list.
    result = _build_where_conditions_by_variable(
        variable, {variable.id: '> 50'})
    assert result == ["enrichment_table.{} > 50".format(variable.column_name)]

    # A list of conditions yields one fragment per condition, in order.
    result = _build_where_conditions_by_variable(
        variable, {variable.id: ['> 50', '< 100']})
    assert result == [
        "enrichment_table.{} > 50".format(variable.column_name),
        "enrichment_table.{} < 100".format(variable.column_name)
    ]
def test_build_polygons_query_variables_with_aggregation(self, column_query_mock):
    """One aggregated column is built per variable/aggregation pair; the
    mocked column builder encodes whether the suffix flag was requested."""
    def fake_column(variable, aggregation, sufix=False):
        # Tag the produced column with the sufix flag so each call can be
        # told apart in the expected output.
        return '{}_{}'.format(variable.column_name, str(sufix))

    column_query_mock.side_effect = fake_column

    variable = Variable({
        'id': 'id',
        'column_name': 'column',
        'dataset_id': 'fake_name',
        'agg_method': 'SUM'
    })

    no_suffix = '{}_{}'.format(variable.column_name, 'False')

    # Default, none, and a global string aggregation -> single unsuffixed column.
    for aggregation in (AGGREGATION_DEFAULT, AGGREGATION_NONE, 'AVG'):
        assert _build_polygons_query_variables_with_aggregation(
            [variable], aggregation) == no_suffix

    # A per-variable dict with a single aggregation -> unsuffixed column.
    assert _build_polygons_query_variables_with_aggregation(
        [variable], {variable.id: 'AVG'}) == no_suffix

    # A dict keyed by an unknown id falls back to the default -> unsuffixed.
    assert _build_polygons_query_variables_with_aggregation(
        [variable], {'unexisting_id': 'AVG'}) == no_suffix

    # Several aggregations for one variable -> one suffixed column each.
    with_suffix = '{}_{}, {}_{}'.format(
        variable.column_name, 'True', variable.column_name, 'True')
    assert _build_polygons_query_variables_with_aggregation(
        [variable], {variable.id: ['AVG', 'SUM']}) == with_suffix
def test_prepare_variables_without_agg_method_and_custom_agg(self, get_mock, _validate_bq_operations_mock):
    """A variable lacking an agg_method is dropped when aggregation is
    requested, unless a custom aggregation is supplied for it."""
    _validate_bq_operations_mock.return_value = True

    variable_id = 'project.dataset.table.variable'
    variable = Variable({
        'id': variable_id,
        'column_name': 'column',
        'dataset_id': 'fake_name',
        'agg_method': None
    })
    get_mock.return_value = variable

    credentials = Credentials('fake_user', '1234')

    # Both a bare id and a Variable instance are accepted as input.
    for case in (variable_id, variable):
        # No aggregation requested: the variable passes through.
        assert prepare_variables(case, credentials) == [variable]

        # Aggregation requested but no agg_method and no custom entry: dropped.
        assert prepare_variables(case, credentials, aggregation={}) == []

        # Custom aggregation provided for this variable: kept.
        assert prepare_variables(
            case, credentials, aggregation={variable_id: 'SUM'}) == [variable]
def test_prepare_variables_works_with_private_and_subscribed(self, get_mock, entity_repo, get_all_mock):
    """A private dataset passes validation when the user is subscribed."""
    dataset = Dataset({
        'id': 'id',
        'slug': 'slug',
        'name': 'name',
        'description': 'description',
        'available_in': ['bq'],
        'geography_id': 'geography',
        'is_public_data': False
    })
    entity_repo.return_value = dataset     # catalog resolves the dataset
    get_all_mock.return_value = [dataset]  # user IS subscribed to it

    variable = Variable({
        'id': 'id',
        'column_name': 'column',
        'dataset_id': 'fake_name',
        'slug': 'slug'
    })
    get_mock.return_value = variable

    credentials = Credentials('fake_user', '1234')
    assert prepare_variables(variable, credentials) == [variable]
# NOTE(review): left byte-identical on purpose. The `error` triple-quoted
# string is compared with exact `==` against the exception text raised by
# project code, and this file's original line wrapping/indentation (including
# the newlines inside that string) was lost when the source was collapsed onto
# one line — any reformat here would change the runtime string and break the
# assertion. TODO: restore the original formatting from version control.
# Purpose: a private dataset the user is NOT subscribed to (get_all_mock
# returns []) must make prepare_variables raise EnrichmentError.
def test_prepare_variables_fails_with_private(self, get_mock, entity_repo, get_all_mock): dataset = Dataset({ 'id': 'id', 'slug': 'slug', 'name': 'name', 'description': 'description', 'available_in': ['bq'], 'geography_id': 'geography', 'is_public_data': False }) # mock dataset entity_repo.return_value = dataset # mock subscriptions get_all_mock.return_value = [] variable = Variable({ 'id': 'id', 'column_name': 'column', 'dataset_id': 'fake_name', 'slug': 'slug' }) get_mock.return_value = variable credentials = Credentials('fake_user', '1234') with pytest.raises(EnrichmentError) as e: prepare_variables(variable, credentials) error = """ You are not subscribed to the Dataset '{}' yet. Please, use the subscribe method first. """.format(dataset.id) assert str(e.value) == error
# NOTE(review): left byte-identical on purpose. As in the test above, the
# `error` triple-quoted string is compared with exact `==` and its original
# internal newlines/indentation were destroyed by the one-line collapse, so a
# reformat would alter the runtime string. TODO: restore from version control.
# Purpose: a dataset with 'available_in': [] (not enabled for BigQuery) must
# make prepare_variables raise EnrichmentError even though the user is
# subscribed. NOTE(review): the message is formatted with `dataset` (the
# object) rather than `dataset.id` as the sibling test does — presumably
# matching what the project code emits; verify against the implementation.
def test_prepare_variables_raises_if_not_available_in_bq(self, get_mock, entity_repo, get_all_mock): dataset = Dataset({ 'id': 'id', 'slug': 'slug', 'name': 'name', 'description': 'description', 'available_in': [], 'geography_id': 'geography', 'is_public_data': False }) # mock dataset entity_repo.return_value = dataset # mock subscriptions get_all_mock.return_value = [dataset] variable = Variable({ 'id': 'id', 'column_name': 'column', 'dataset_id': 'fake_name', 'slug': 'slug' }) get_mock.return_value = variable credentials = Credentials('fake_user', '1234') with pytest.raises(EnrichmentError) as e: prepare_variables(variable, credentials) error = """ The Dataset '{}' is not ready for Enrichment. Please, contact us for more information. """.format(dataset) assert str(e.value) == error
def test_prepare_variables(self, get_mock, _validate_bq_operations_mock):
    """prepare_variables accepts a single id/Variable or a list mixing
    both forms, and always returns a list of Variable instances."""
    _validate_bq_operations_mock.return_value = True

    variable_id = 'project.dataset.table.variable'
    variable = Variable({
        'id': variable_id,
        'column_name': 'column',
        'dataset_id': 'fake_name'
    })
    get_mock.return_value = variable

    credentials = Credentials('fake_user', '1234')

    # A bare id or a Variable instance both resolve to [variable].
    for case in (variable_id, variable):
        assert prepare_variables(case, credentials) == [variable]

    # Lists of ids, instances, or a mix resolve element-wise.
    for case in ([variable_id, variable_id],
                 [variable, variable],
                 [variable, variable_id]):
        assert prepare_variables(case, credentials) == [variable, variable]
def test_get_aggregation(self):
    """_get_aggregation resolves the effective aggregation for a variable:
    the catalog default, an explicit override (global string or per-variable
    dict, possibly a list), or None."""
    variable_agg = Variable({
        'id': 'id',
        'column_name': 'column',
        'dataset_id': 'fake_name',
        'agg_method': 'SUM'
    })

    # Default: the variable's own agg_method, lowercased.
    assert _get_aggregation(variable_agg, AGGREGATION_DEFAULT) == variable_agg.agg_method.lower()
    # Explicit "no aggregation".
    assert _get_aggregation(variable_agg, AGGREGATION_NONE) is None
    # A global string override is normalized to lowercase.
    for override, expected in (('sum', 'sum'), ('SUM', 'sum'),
                               ('avg', 'avg'), ('AVG', 'avg')):
        assert _get_aggregation(variable_agg, override) == expected
    # Per-variable dict override.
    assert _get_aggregation(variable_agg, {variable_agg.id: 'AVG'}) == 'avg'
    # Empty dict falls back to the variable's own agg_method.
    assert _get_aggregation(variable_agg, {}) == variable_agg.agg_method.lower()
    # A list of aggregations is kept, normalized to lowercase.
    assert _get_aggregation(variable_agg, {variable_agg.id: ['sum', 'avg']}) == ['sum', 'avg']
    assert _get_aggregation(variable_agg, {variable_agg.id: ['SUM', 'aVg']}) == ['sum', 'avg']

    variable_agg_none = Variable({
        'id': 'id',
        'column_name': 'column',
        'dataset_id': 'fake_name',
        'agg_method': None
    })

    # Without an own agg_method, default and empty-dict cases resolve to None.
    assert _get_aggregation(variable_agg_none, AGGREGATION_DEFAULT) is None
    assert _get_aggregation(variable_agg_none, AGGREGATION_NONE) is None
    for override, expected in (('sum', 'sum'), ('SUM', 'sum'),
                               ('avg', 'avg'), ('AVG', 'avg')):
        assert _get_aggregation(variable_agg_none, override) == expected
    # Both variables share the same id ('id'), so overrides keyed by
    # variable_agg.id also apply to variable_agg_none.
    assert _get_aggregation(variable_agg_none, {variable_agg.id: 'AVG'}) == 'avg'
    assert _get_aggregation(variable_agg_none, {}) is None
    assert _get_aggregation(variable_agg_none, {variable_agg.id: ['sum', 'avg']}) == ['sum', 'avg']
    assert _get_aggregation(variable_agg_none, {variable_agg.id: ['suM', 'AVG']}) == ['sum', 'avg']
# NOTE(review): left byte-identical on purpose. The two 'sum' branches
# compare `expected_sql` (a triple-quoted template) with exact `==` against
# the SQL produced by project code; the template's original newlines and
# indentation were lost when this file was collapsed onto one line, so any
# reformat would change the expected string and break the assertions. (The
# 'avg' branches compare with .strip() and would tolerate it, but the block
# is kept whole.) TODO: restore formatting from version control.
# Purpose: 'sum' aggregation produces an area-weighted intersection formula;
# 'avg' produces a plain avg(...) column; passing True as the third argument
# prefixes the output column name with the aggregation ('sum_'/'avg_').
def test_build_polygons_column_with_aggregation(self): variable = Variable({ 'id': 'id', 'column_name': 'column', 'dataset_id': 'fake_name', 'agg_method': 'sum' }) aggregation = 'sum' expected_sql = """ sum( enrichment_table.{column} * ( ST_AREA(ST_INTERSECTION(enrichment_geo_table.geom, data_table.{geo_column})) / NULLIF(ST_AREA(enrichment_geo_table.geom), 0) ) ) AS {column_name} """.format( column=variable.column_name, column_name=variable.column_name, geo_column=_GEOM_COLUMN) sql = _build_polygons_column_with_aggregation(variable, aggregation) assert sql == expected_sql aggregation = 'sum' expected_sql = """ sum( enrichment_table.{column} * ( ST_AREA(ST_INTERSECTION(enrichment_geo_table.geom, data_table.{geo_column})) / NULLIF(ST_AREA(enrichment_geo_table.geom), 0) ) ) AS {column_name} """.format( column=variable.column_name, column_name='sum_{}'.format(variable.column_name), geo_column=_GEOM_COLUMN) sql = _build_polygons_column_with_aggregation(variable, aggregation, True) assert sql == expected_sql aggregation = 'avg' expected_sql = 'avg(enrichment_table.{column}) AS {column_name}'.format( column=variable.column_name, column_name=variable.column_name) sql = _build_polygons_column_with_aggregation(variable, aggregation) assert sql.strip() == expected_sql.strip() aggregation = 'avg' expected_sql = 'avg(enrichment_table.{column}) AS {column_name}'.format( column=variable.column_name, column_name='avg_{}'.format(variable.column_name)) sql = _build_polygons_column_with_aggregation(variable, aggregation, True) assert sql.strip() == expected_sql.strip()
def test_enrichment_query_by_points_with_filters(self, geography_get_mock, dataset_get_mock, _is_available_in_bq_mock):
    """Filters keyed by variable id become WHERE conditions on the
    generated point-enrichment query."""
    _is_available_in_bq_mock.return_value = True
    enrichment = Enrichment(credentials=self.credentials)

    temp_table_name = 'test_table'
    project = 'project'
    dataset = 'dataset'
    table = 'table'
    variable_name = 'variable1'
    column = 'column1'
    geo_table = 'geo_table'

    view = 'view_{}_{}'.format(dataset, table)
    geo_view = 'view_{}_{}'.format(dataset, geo_table)

    variable = Variable({
        'id': '{}.{}.{}.{}'.format(project, dataset, table, variable_name),
        'column_name': column,
        'dataset_id': 'fake_name'
    })

    filters = {variable.id: "= 'a string'"}
    expected_filters = ["{} = 'a string'".format(variable.column_name)]

    # The mocked catalog resolves the dataset's geography.
    dataset_get_mock.return_value = CatalogEntityWithGeographyMock(
        '{}.{}.{}'.format(project, dataset, geo_table))
    geography_get_mock.return_value = GeographyMock()

    actual_queries = enrichment._get_points_enrichment_sql(
        temp_table_name, [variable], filters)
    expected_queries = [
        get_query([column], self.username, view, geo_view,
                  temp_table_name, expected_filters)
    ]

    # Query order is not guaranteed: compare after cleaning and sorting.
    actual = sorted(_clean_queries(actual_queries))
    expected = sorted(_clean_queries(expected_queries))
    assert actual == expected
def test_enrichment_query_by_points_one_variable(self, geography_get_mock, dataset_get_mock):
    """A single variable produces a single point-enrichment query over the
    dataset view joined to the geography view."""
    enrichment = Enrichment(credentials=self.credentials)

    temp_table_name = 'test_table'
    project = 'project'
    dataset = 'dataset'
    table = 'table'
    variable_name = 'variable1'
    column = 'column1'
    geo_table = 'geo_table'

    view = 'view_{}_{}'.format(dataset, table)
    geo_view = 'view_{}_{}'.format(dataset, geo_table)

    variable = Variable({
        'id': '{}.{}.{}.{}'.format(project, dataset, table, variable_name),
        'column_name': column,
        'dataset_id': 'fake_name'
    })

    # The mocked catalog resolves the dataset's geography.
    dataset_get_mock.return_value = CatalogEntityWithGeographyMock(
        '{}.{}.{}'.format(project, dataset, geo_table))
    geography_get_mock.return_value = GeographyMock()

    actual_queries = enrichment._get_points_enrichment_sql(
        temp_table_name, [variable], [])
    expected_queries = [
        get_query([column], self.username, view, geo_view, temp_table_name)
    ]

    # Query order is not guaranteed: compare after cleaning and sorting.
    actual = sorted(_clean_queries(actual_queries))
    expected = sorted(_clean_queries(expected_queries))
    assert actual == expected
# NOTE(review): left byte-identical on purpose. This collapsed line mixes an
# import, two helper functions (file_path, clean_gdf), module-level
# Variable.get(...) catalog lookups, and the START of class TestEnrichment
# whose setup_method is cut off at the end of this chunk — the class is
# incomplete from this view, so it cannot be safely reformatted here.
# Also, everything from the first '#' onward on the collapsed line below is
# lexically a single comment, so splitting it risks changing which statements
# actually execute. TODO: restore the original multi-line layout from version
# control before editing.
from cartoframes.data.observatory import Enrichment, Variable def file_path(path): return '{}/{}'.format(Path(__file__).parent.absolute(), path) def clean_gdf(gdf, sort_column=None): if sort_column: return gdf.sort_index(axis=1).sort_values( by=sort_column).round(5).reset_index(drop=True) else: return gdf.sort_index(axis=1).round(5).reset_index(drop=True) public_variable1 = Variable.get('poverty_a86da569') # FLOAT, AVG public_variable2 = Variable.get('one_car_f7f299a7') # FLOAT, SUM public_variable3 = Variable.get('geoid_e99a58c1') # STRING, NONE private_variable1 = Variable.get('RSGCY7224_cb77b41d') # INTEGER, SUM private_variable2 = Variable.get('MLTCY7224_4ba39c69') # INTEGER, SUM private_variable3 = Variable.get('BLOCKGROUP_f1b3a750') # STRING, NONE class TestEnrichment(object): def setup_method(self): if (os.environ.get('APIKEY') and os.environ.get('USERNAME')): self.apikey = os.environ['APIKEY'] self.username = os.environ['USERNAME'] else: creds = json.loads(open('tests/e2e/secret.json').read()) self.apikey = creds['APIKEY']