def test_DODataset_ttl_seconds(api_key_auth_client_usr):
    bq_user_dataset = DODataset(auth_client=api_key_auth_client_usr)
    ttl_seconds = 10
    result = bq_user_dataset.ttl_seconds(ttl_seconds)
    assert bq_user_dataset == result
    assert bq_user_dataset._ttl_seconds == ttl_seconds

def test_DODataset_name(api_key_auth_client_usr):
    bq_user_dataset = DODataset(auth_client=api_key_auth_client_usr)
    name = 'fake-name'
    result = bq_user_dataset.name(name)
    assert bq_user_dataset == result
    assert bq_user_dataset._name == name

def test_can_download_to_dataframe(mocker, api_key_auth_client_usr):
    # mock
    fake_response = ResponseMock(StringIO(CSV_SAMPLE_REDUCED))
    mocker.patch.object(APIKeyAuthClient, 'send', return_value=fake_response)
    bq_user_dataset = DODataset(auth_client=api_key_auth_client_usr)

    # test
    result = bq_user_dataset.name('census_tracts_american_samoa').download_stream()
    assert isinstance(result, ResponseStream)

def test_can_upload_from_file_object(mocker, api_key_auth_client_usr):
    # mock
    fake_response = ResponseMock()
    mocker.patch.object(APIKeyAuthClient, 'send', return_value=fake_response)
    bq_user_dataset = DODataset(auth_client=api_key_auth_client_usr)

    # test
    unique_table_name = 'cf_test_table_' + str(uuid.uuid4()).replace('-', '_')
    file_object = StringIO(CSV_SAMPLE_REDUCED)
    result = bq_user_dataset.name(unique_table_name).upload_file_object(file_object)
    assert result == fake_response

def _upload_data(self, temp_table_name, geodataframe):
    reduced_geodataframe = geodataframe[[_ENRICHMENT_ID, _GEOM_COLUMN]]

    dataset = DODataset(auth_client=self.auth_client).name(temp_table_name) \
        .column(_ENRICHMENT_ID, 'INT64') \
        .column(_GEOM_COLUMN, 'GEOMETRY') \
        .ttl_seconds(_TTL_IN_SECONDS)
    dataset.create()

    status = dataset.upload_dataframe(reduced_geodataframe, _GEOM_COLUMN)
    if status not in ['success']:
        raise EnrichmentError("Couldn't upload the dataframe to be enriched. The job hasn't finished successfully")

    return dataset

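# A minimal usage sketch of the DODataset builder chain used by _upload_data.
# Chaining works because each setter returns the dataset itself, as the unit
# tests above verify. The table name, column names and TTL here are
# illustrative assumptions, not values taken from the library.
def example_create_temp_table(auth_client):
    dataset = DODataset(auth_client=auth_client) \
        .name('example_temp_table') \
        .column('enrichment_id', 'INT64') \
        .column('geom', 'GEOMETRY') \
        .ttl_seconds(300)
    dataset.create()  # creates the empty table in the user's DO Project
    return dataset
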
def _download(self, credentials, file_path=None, limit=None, order_by=None, sql_query=None, add_geom=None):
    auth_client = credentials.get_api_key_auth_client()

    is_geography = None
    if sql_query is not None:
        is_geography = self.__class__.__name__ == 'Geography'

    rows = DODataset(auth_client=auth_client).name(self.id).download_stream(
        limit=limit,
        order_by=order_by,
        sql_query=sql_query,
        add_geom=add_geom,
        is_geography=is_geography)

    if file_path:
        # stream the raw CSV bytes straight to disk
        with open(file_path, 'w') as csvfile:
            for row in rows:
                csvfile.write(row.decode('utf-8'))

        log.info('Data saved: {}'.format(file_path))
        if self.__class__.__name__ == 'Dataset':
            log.info(_DATASET_READ_MSG.format(file_path))
        elif self.__class__.__name__ == 'Geography':
            log.info(_GEOGRAPHY_READ_MSG.format(file_path))
    else:
        # no file path given: load the stream into a DataFrame instead
        dataframe = pd.read_csv(rows)
        return dataframe

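# Hedged sketch of the second branch above: download_stream() yields a
# file-like object of CSV bytes that pandas can consume directly. The
# dataset id is a hypothetical placeholder.
def example_download_to_dataframe(auth_client):
    rows = DODataset(auth_client=auth_client).name('some_dataset_id').download_stream()
    return pd.read_csv(rows)
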
def test_DODataset_column(api_key_auth_client_usr):
    bq_user_dataset = DODataset(auth_client=api_key_auth_client_usr)

    invalid_cases = [{'column_name': 'fake-name', 'column_type': 'fake-type'}]
    for c in invalid_cases:
        with pytest.raises(Exception) as e:
            bq_user_dataset.column(c['column_name'], c['column_type'])
        assert str(e.value) == 'Invalid type {}'.format(c['column_type'].upper())

    column_name = 'column'
    column_type = VALID_TYPES[0]
    result = bq_user_dataset.column(column_name, column_type)
    assert bq_user_dataset == result
    assert bq_user_dataset._columns == [(column_name, column_type)]

def test_can_import_a_dataset(mocker, api_key_auth_client_usr):
    # mock
    fake_response = ResponseMock({'item_queue_id': '123'})
    mocker.patch.object(APIKeyAuthClient, 'send', return_value=fake_response)
    bq_user_dataset = DODataset(auth_client=api_key_auth_client_usr)

    # test
    unique_table_name = 'cf_test_table_' + str(uuid.uuid4()).replace('-', '_')
    file_object = StringIO(CSV_SAMPLE_REDUCED)
    dataset = bq_user_dataset.name(unique_table_name) \
        .column(name='id', type='INT64') \
        .column('geom', 'GEOMETRY') \
        .ttl_seconds(30)
    dataset.create()
    dataset.upload_file_object(file_object)
    job = dataset.import_dataset()

    assert isinstance(job, DODatasetJob)

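# Assumed consumption pattern for the DODatasetJob returned by
# import_dataset(), based on the calls exercised in the e2e suite below:
# status() returns a snapshot, result() blocks until a terminal state.
def example_wait_for_import(dataset):
    job = dataset.import_dataset()
    snapshot = job.status()  # 'pending', 'running', 'cancelled', 'success' or 'failure'
    final_status = job.result()  # waits for the job to finish
    return snapshot, final_status
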
def set_external_credentials(self):
    # This must be checked every time to allow the definition of
    # "default_do_credentials" at any point in the code because
    # every repo uses a singleton instance of this client
    external_credentials = defaults.get_default_do_credentials()
    if external_credentials is not None:
        external_auth_client = external_credentials.get_api_key_auth_client()
        self._external_do_dataset = DODataset(auth_client=external_auth_client)
    else:
        self._external_do_dataset = None

def _execute_enrichment(self, dataset, temp_table_name, geom_type, variables, filters, aggregation):
    output_name = '{}_result'.format(temp_table_name)

    status = dataset.enrichment(geom_type=geom_type,
                                variables=variables,
                                filters=filters,
                                aggregation=aggregation,
                                output_name=output_name)
    if status not in ['success']:
        raise EnrichmentError("Couldn't enrich the dataframe. The job hasn't finished successfully")

    result = DODataset(auth_client=self.auth_client).name(output_name).download_stream()
    enriched_dataframe = pandas.read_csv(result)

    return enriched_dataframe

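# Minimal sketch of the enrichment round trip performed by _execute_enrichment:
# enrich into '<table>_result', then stream that table back into pandas. The
# variable slug mirrors the one used in the e2e tests below; treat it and
# GEOM_TYPE_POINTS as illustrative assumptions here.
def example_enrichment(dataset, auth_client, temp_table_name):
    output_name = '{}_result'.format(temp_table_name)
    status = dataset.enrichment(geom_type=GEOM_TYPE_POINTS,
                                variables=['poverty_a86da569'],
                                output_name=output_name)
    assert status in ['success']
    stream = DODataset(auth_client=auth_client).name(output_name).download_stream()
    return pandas.read_csv(stream)
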
def __init__(self):
    default_credentials = Credentials(DEFAULT_USER)
    default_auth_client = default_credentials.get_api_key_auth_client()
    self._default_do_dataset = DODataset(auth_client=default_auth_client)
    self._user_do_dataset = None
    self._external_do_dataset = None

def set_user_credentials(self, credentials):
    if credentials is not None:
        auth_client = credentials.get_api_key_auth_client()
        self._user_do_dataset = DODataset(auth_client=auth_client)
    else:
        self._user_do_dataset = None

class RepoClient:

    def __init__(self):
        self._do_dataset = None
        default_credentials = defaults.get_default_credentials() or Credentials(DEFAULT_USER)
        default_auth_client = default_credentials.get_api_key_auth_client()
        self._default_do_dataset = DODataset(auth_client=default_auth_client)

    def set_user_credentials(self, credentials):
        if credentials is not None:
            auth_client = credentials.get_api_key_auth_client()
            self._do_dataset = DODataset(auth_client=auth_client)
        else:
            self._do_dataset = None

    def reset_user_credentials(self):
        self._do_dataset = None

    def get_countries(self, filters=None):
        return self._get_entity('countries', filters)

    def get_categories(self, filters=None):
        return self._get_entity('categories', filters)

    def get_providers(self, filters=None):
        return self._get_entity('providers', filters)

    def get_datasets(self, filters=None):
        return self._get_entity('datasets', filters, use_slug=True)

    def get_geographies(self, filters=None):
        return self._get_entity('geographies', filters, use_slug=True)

    def get_variables(self, filters=None):
        filter_id = self._get_filter_id(filters, use_slug=True)
        if filter_id:
            return self._fetch_entity_id('variables', filter_id)
        else:
            entity = 'datasets/{}/variables'.format(filters.pop('dataset'))
            return self._fetch_entity(entity, filters)

    def get_variables_groups(self, filters=None):
        filter_id = self._get_filter_id(filters, use_slug=True)
        if filter_id:
            return self._fetch_entity_id('variables_groups', filter_id)
        else:
            entity = 'datasets/{0}/variables_groups'.format(filters.pop('dataset'))
            return self._fetch_entity(entity, filters)

    def _get_filter_id(self, filters, use_slug=False):
        if isinstance(filters, dict):
            filter_id = filters.get('id')
            if not filter_id and use_slug:
                filter_id = filters.get('slug')
            return filter_id

    def _get_entity(self, entity, filters=None, use_slug=False):
        filter_id = self._get_filter_id(filters, use_slug)
        if filter_id:
            return self._fetch_entity_id(entity, filter_id)
        else:
            return self._fetch_entity(entity, filters)

    def _fetch_entity_id(self, entity, filter_id):
        if isinstance(filter_id, list):
            return list(filter(None, [
                self._fetch_entity('{0}/{1}'.format(entity, _id))
                for _id in filter_id
            ]))
        else:
            return self._fetch_entity('{0}/{1}'.format(entity, filter_id))

    def _fetch_entity(self, entity, filters=None):
        if self._do_dataset:
            return self._do_dataset.metadata(entity, filters)
        else:
            return self._default_do_dataset.metadata(entity, filters)

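# Assumed usage of RepoClient: each getter takes an optional filters dict;
# an 'id' key (or 'slug', where use_slug=True) routes the call to a
# single-entity fetch, per _get_filter_id above. The slug and id values
# below are hypothetical.
def example_repo_queries():
    client = RepoClient()
    countries = client.get_countries()                                   # full listing
    dataset = client.get_datasets({'slug': 'example_dataset_slug'})      # single entity by slug
    variables = client.get_variables({'dataset': 'example_dataset_id'})  # variables nested under a dataset
    return countries, dataset, variables
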
class TestDODataset(unittest.TestCase):
    """This test suite needs APIKEY, USERNAME and USERURL (pointing to a
    working DO API) either as environment variables or in
    "tests/e2e/secret.json". The DO API must have the user/apikey mapping
    set to get access to the user's DO Project in GCP.
    """

    def setUp(self):
        if os.environ.get('APIKEY') and os.environ.get('USERNAME') and os.environ.get('USERURL'):
            self.apikey = os.environ['APIKEY']
            self.username = os.environ['USERNAME']
            self.base_url = os.environ['USERURL']
        else:
            with open('tests/e2e/secret.json') as f:
                creds = json.load(f)
            self.apikey = creds['APIKEY']
            self.username = creds['USERNAME']
            self.base_url = creds['USERURL']

        credentials = Credentials(username=self.username, api_key=self.apikey, base_url=self.base_url)
        auth_client = credentials.get_api_key_auth_client()
        self.do_dataset = DODataset(auth_client=auth_client)

    def test_can_upload_from_dataframe(self):
        sample = StringIO(CSV_SAMPLE_REDUCED)
        df = pandas.read_csv(sample)
        unique_table_name = 'cf_test_table_' + str(uuid.uuid4()).replace('-', '_')
        self.do_dataset.name(unique_table_name).upload(df)

    def test_can_upload_from_file_object(self):
        unique_table_name = 'cf_test_table_' + str(uuid.uuid4()).replace('-', '_')
        file_object = StringIO(CSV_SAMPLE_REDUCED)
        self.do_dataset.name(unique_table_name).upload_file_object(file_object)

    def test_can_import_a_dataset(self):
        unique_table_name = 'cf_test_table_' + str(uuid.uuid4()).replace('-', '_')
        file_object = StringIO(CSV_SAMPLE_REDUCED)
        dataset = self.do_dataset.name(unique_table_name) \
            .column(name='id', type='INT64') \
            .column('geom', 'GEOMETRY') \
            .ttl_seconds(30)
        dataset.create()
        dataset.upload_file_object(file_object)
        job = dataset.import_dataset()
        self.assertIsInstance(job, DODatasetJob)

    def test_can_get_status_from_import(self):
        unique_table_name = 'cf_test_table_' + str(uuid.uuid4()).replace('-', '_')
        file_object = StringIO(CSV_SAMPLE_REDUCED)
        dataset = self.do_dataset.name(unique_table_name) \
            .column(name='id', type='INT64') \
            .column('geom', 'GEOMETRY') \
            .ttl_seconds(30)
        dataset.create()
        dataset.upload_file_object(file_object)
        job = dataset.import_dataset()
        status = job.status()
        self.assertIn(status, ['pending', 'running', 'cancelled', 'success', 'failure'])

    def test_can_wait_for_job_completion(self):
        unique_table_name = 'cf_test_table_' + str(uuid.uuid4()).replace('-', '_')
        file_object = StringIO(CSV_SAMPLE_REDUCED)
        dataset = self.do_dataset.name(unique_table_name) \
            .column(name='id', type='INT64') \
            .column('geom', 'GEOMETRY') \
            .ttl_seconds(30)
        dataset.create()
        dataset.upload_file_object(file_object, geom_column='geom')
        job = dataset.import_dataset()
        status = job.result()
        self.assertIn(status, ['success'])

    def test_can_upload_a_dataframe_and_wait_for_completion(self):
        unique_table_name = 'cf_test_table_' + str(uuid.uuid4()).replace('-', '_')
        sample = StringIO(CSV_SAMPLE_REDUCED)
        df = pandas.read_csv(sample)
        dataset = self.do_dataset.name(unique_table_name) \
            .column(name='id', type='INT64') \
            .column('geom', 'GEOMETRY') \
            .ttl_seconds(30)
        dataset.create()
        status = dataset.upload_dataframe(df, geom_column='geom')
        self.assertIn(status, ['success'])

    def test_can_download_to_dataframe(self):
        result = self.do_dataset.name('census_tracts_american_samoa').download_stream()
        df = pandas.read_csv(result)

        self.assertEqual(df.shape, (18, 13))

        # do some checks on the contents
        sample = pandas.DataFrame(
            df.head(),
            columns=('state_fips_code', 'county_fips_code', 'geo_id', 'tract_name', 'internal_point_geo'))
        sample['internal_point_geo'] = df['internal_point_geo'].apply(wkt.loads)
        geosample = geopandas.GeoDataFrame(sample, geometry='internal_point_geo')

        self.assertEqual(geosample.to_csv(index=False), EXPECTED_CSV_SAMPLE)

    def test_creation_of_dataset(self):
        unique_table_name = 'cf_test_table_' + str(uuid.uuid4()).replace('-', '_')
        dataset = self.do_dataset.name(unique_table_name) \
            .column(name='cartodb_id', type='INT64') \
            .column('the_geom', 'GEOMETRY') \
            .ttl_seconds(30)
        dataset.create()

        # do a quick check on the resulting table
        result = dataset.download_stream()
        df = pandas.read_csv(result)
        self.assertEqual(df.shape, (0, 2))
        self.assertEqual(df.to_csv(index=False), 'cartodb_id,the_geom\n')

    def test_points_enrichment_dataset(self):
        variable_slug = 'poverty_a86da569'
        variable_column_name = 'poverty'

        unique_table_name = 'cf_test_table_' + str(uuid.uuid4()).replace('-', '_')
        gdf = read_file(file_path('../observatory/enrichment/files/points.geojson'))
        gdf[_ENRICHMENT_ID] = range(gdf.shape[0])
        gdf[_GEOM_COLUMN] = gdf.geometry
        gdf = gdf[[_ENRICHMENT_ID, _GEOM_COLUMN]]

        dataset = self.do_dataset.name(unique_table_name) \
            .column(_ENRICHMENT_ID, 'INT64') \
            .column(_GEOM_COLUMN, 'GEOMETRY') \
            .ttl_seconds(_TTL_IN_SECONDS)
        dataset.create()

        status = dataset.upload_dataframe(gdf, geom_column=_GEOM_COLUMN)
        self.assertIn(status, ['success'])

        geom_type = GEOM_TYPE_POINTS
        variables = [variable_slug]
        output_name = '{}_result'.format(unique_table_name)
        status = dataset.enrichment(geom_type=geom_type, variables=variables, output_name=output_name)
        self.assertIn(status, ['success'])

        result = self.do_dataset.name(output_name).download_stream()
        result_df = pandas.read_csv(result)
        self.assertIn(variable_column_name, result_df.columns)

    def test_polygons_enrichment_dataset(self):
        variable_slug = 'poverty_a86da569'
        variable_column_name = 'poverty'

        unique_table_name = 'cf_test_table_' + str(uuid.uuid4()).replace('-', '_')
        gdf = read_file(file_path('../observatory/enrichment/files/polygon.geojson'))
        gdf[_ENRICHMENT_ID] = range(gdf.shape[0])
        gdf[_GEOM_COLUMN] = gdf.geometry
        gdf = gdf[[_ENRICHMENT_ID, _GEOM_COLUMN]]

        dataset = self.do_dataset.name(unique_table_name) \
            .column(_ENRICHMENT_ID, 'INT64') \
            .column(_GEOM_COLUMN, 'GEOMETRY') \
            .ttl_seconds(_TTL_IN_SECONDS)
        dataset.create()

        status = dataset.upload_dataframe(gdf, geom_column=_GEOM_COLUMN)
        self.assertIn(status, ['success'])

        geom_type = GEOM_TYPE_POLYGONS
        variables = [variable_slug]
        output_name = '{}_result'.format(unique_table_name)
        status = dataset.enrichment(geom_type=geom_type, variables=variables, output_name=output_name)
        self.assertIn(status, ['success'])

        result = self.do_dataset.name(output_name).download_stream()
        df = pandas.read_csv(result)
        self.assertIn(variable_column_name, df.columns)

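# Running this e2e suite (see setUp): either export APIKEY, USERNAME and
# USERURL, or provide tests/e2e/secret.json shaped like
# {"APIKEY": "...", "USERNAME": "...", "USERURL": "..."}.
# The invocation below assumes the repo's tests/e2e layout:
#
#   APIKEY=... USERNAME=... USERURL=... python -m pytest tests/e2e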