Example #1
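A unit test checking that ttl_seconds() stores the TTL on the dataset and returns the same instance for fluent chaining.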
def test_DODataset_ttl_seconds(api_key_auth_client_usr):
    bq_user_dataset = DODataset(auth_client=api_key_auth_client_usr)

    ttl_seconds = 10
    result = bq_user_dataset.ttl_seconds(ttl_seconds)
    assert bq_user_dataset == result
    assert bq_user_dataset._ttl_seconds == ttl_seconds
Example #2
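A unit test checking that name() stores the dataset name and returns the same instance for fluent chaining.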
def test_DODataset_name(api_key_auth_client_usr):
    bq_user_dataset = DODataset(auth_client=api_key_auth_client_usr)

    name = 'fake-name'
    result = bq_user_dataset.name(name)
    assert bq_user_dataset == result
    assert bq_user_dataset._name == name
Example #3
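A unit test that mocks the API client and checks that download_stream() yields a ResponseStream.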
def test_can_download_to_dataframe(mocker, api_key_auth_client_usr):
    # mock
    fake_response = ResponseMock(StringIO(CSV_SAMPLE_REDUCED))
    mocker.patch.object(APIKeyAuthClient, 'send', return_value=fake_response)

    bq_user_dataset = DODataset(auth_client=api_key_auth_client_usr)

    # test
    result = bq_user_dataset.name(
        'census_tracts_american_samoa').download_stream()
    assert isinstance(result, ResponseStream)
Example #4
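A unit test that mocks the API client and checks that a CSV file object can be uploaded through upload_file_object().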
def test_can_upload_from_file_object(mocker, api_key_auth_client_usr):
    # mock
    fake_response = ResponseMock()
    mocker.patch.object(APIKeyAuthClient, 'send', return_value=fake_response)

    bq_user_dataset = DODataset(auth_client=api_key_auth_client_usr)

    # test
    unique_table_name = 'cf_test_table_' + str(uuid.uuid4()).replace('-', '_')
    file_object = StringIO(CSV_SAMPLE_REDUCED)
    result = bq_user_dataset.name(unique_table_name).upload_file_object(
        file_object)
    assert result == fake_response
Example #5
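A helper that creates a temporary table with the enrichment schema, uploads the reduced geodataframe, and raises EnrichmentError if the upload job does not finish successfully.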
    def _upload_data(self, temp_table_name, geodataframe):
        reduced_geodataframe = geodataframe[[_ENRICHMENT_ID, _GEOM_COLUMN]]

        dataset = DODataset(auth_client=self.auth_client).name(temp_table_name) \
            .column(_ENRICHMENT_ID, 'INT64') \
            .column(_GEOM_COLUMN, 'GEOMETRY') \
            .ttl_seconds(_TTL_IN_SECONDS)
        dataset.create()

        status = dataset.upload_dataframe(reduced_geodataframe, _GEOM_COLUMN)

        if status not in ['success']:
            raise EnrichmentError(
                "Couldn't upload the dataframe to be enriched. "
                "The job hasn't finished successfully")

        return dataset
Example #6
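A download helper that streams dataset rows and either writes them to a CSV file or returns them as a pandas dataframe.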
    def _download(self,
                  credentials,
                  file_path=None,
                  limit=None,
                  order_by=None,
                  sql_query=None,
                  add_geom=None):
        auth_client = credentials.get_api_key_auth_client()

        is_geography = None
        if sql_query is not None:
            is_geography = self.__class__.__name__ == 'Geography'

        rows = DODataset(auth_client=auth_client).name(
            self.id).download_stream(limit=limit,
                                     order_by=order_by,
                                     sql_query=sql_query,
                                     add_geom=add_geom,
                                     is_geography=is_geography)
        if file_path:
            with open(file_path, 'w') as csvfile:
                for row in rows:
                    csvfile.write(row.decode('utf-8'))

            log.info('Data saved: {}'.format(file_path))
            if self.__class__.__name__ == 'Dataset':
                log.info(_DATASET_READ_MSG.format(file_path))
            elif self.__class__.__name__ == 'Geography':
                log.info(_GEOGRAPHY_READ_MSG.format(file_path))
        else:
            dataframe = pd.read_csv(rows)
            return dataframe
Example #7
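A unit test checking that column() rejects invalid column types and stores valid (name, type) pairs.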
def test_DODataset_column(api_key_auth_client_usr):
    bq_user_dataset = DODataset(auth_client=api_key_auth_client_usr)

    invalid_cases = [{'column_name': 'fake-name', 'column_type': 'fake-type'}]

    for c in invalid_cases:
        with pytest.raises(Exception) as e:
            bq_user_dataset.column(c['column_name'], c['column_type'])
        assert str(e.value) == 'Invalid type {}'.format(
            c['column_type'].upper())

    column_name = 'column'
    column_type = VALID_TYPES[0]
    result = bq_user_dataset.column(column_name, column_type)
    assert bq_user_dataset == result
    assert bq_user_dataset._columns == [(column_name, column_type)]
Example #8
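Test setup that reads APIKEY, USERNAME and USERURL from the environment or from tests/e2e/secret.json and builds the DODataset used by the e2e tests.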
    def setUp(self):
        if os.environ.get('APIKEY') and os.environ.get(
                'USERNAME') and os.environ.get('USERURL'):
            self.apikey = os.environ['APIKEY']
            self.username = os.environ['USERNAME']
            self.base_url = os.environ['USERURL']
        else:
            with open('tests/e2e/secret.json') as secret_file:
                creds = json.load(secret_file)
            self.apikey = creds['APIKEY']
            self.username = creds['USERNAME']
            self.base_url = creds['USERURL']

        credentials = Credentials(username=self.username,
                                  api_key=self.apikey,
                                  base_url=self.base_url)
        auth_client = credentials.get_api_key_auth_client()
        self.do_dataset = DODataset(auth_client=auth_client)
Example #9
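A unit test that mocks the API client and runs the create/upload/import flow, asserting that import_dataset() returns a DODatasetJob.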
def test_can_import_a_dataset(mocker, api_key_auth_client_usr):
    # mock
    fake_response = ResponseMock({'item_queue_id': '123'})
    mocker.patch.object(APIKeyAuthClient, 'send', return_value=fake_response)

    bq_user_dataset = DODataset(auth_client=api_key_auth_client_usr)

    # test
    unique_table_name = 'cf_test_table_' + str(uuid.uuid4()).replace('-', '_')
    file_object = StringIO(CSV_SAMPLE_REDUCED)

    dataset = bq_user_dataset.name(unique_table_name) \
        .column(name='id', type='INT64') \
        .column('geom', 'GEOMETRY') \
        .ttl_seconds(30)
    dataset.create()
    dataset.upload_file_object(file_object)
    job = dataset.import_dataset()

    assert isinstance(job, DODatasetJob)
Example #10
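A setter that rebuilds the external DODataset from defaults.get_default_do_credentials() on every call, since each repo uses a singleton instance of this client.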
    def set_external_credentials(self):
        # This must be checked every time to allow the definition of
        # "default_do_credentials" at any point in the code because
        # every repo uses a singleton instance of this client
        external_credentials = defaults.get_default_do_credentials()
        if external_credentials is not None:
            external_auth_client = \
                external_credentials.get_api_key_auth_client()
            self._external_do_dataset = DODataset(
                auth_client=external_auth_client)
        else:
            self._external_do_dataset = None
Example #11
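A helper that runs the enrichment job against a temporary table and, on success, downloads the result into a pandas dataframe.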
    def _execute_enrichment(self, dataset, temp_table_name, geom_type, variables, filters, aggregation):
        output_name = '{}_result'.format(temp_table_name)
        status = dataset.enrichment(geom_type=geom_type,
                                    variables=variables,
                                    filters=filters,
                                    aggregation=aggregation,
                                    output_name=output_name)

        if status not in ['success']:
            raise EnrichmentError(
                "Couldn't enrich the dataframe. "
                "The job hasn't finished successfully")

        result = DODataset(auth_client=self.auth_client).name(output_name).download_stream()
        enriched_dataframe = pandas.read_csv(result)

        return enriched_dataframe
Example #12
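A constructor that builds the default DODataset from the default user's credentials and leaves the user and external datasets unset.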
    def __init__(self):
        default_credentials = Credentials(DEFAULT_USER)
        default_auth_client = default_credentials.get_api_key_auth_client()
        self._default_do_dataset = DODataset(auth_client=default_auth_client)
        self._user_do_dataset = None
        self._external_do_dataset = None
Example #13
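A setter that creates a user-scoped DODataset when credentials are given and clears it otherwise.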
    def set_user_credentials(self, credentials):
        if credentials is not None:
            auth_client = credentials.get_api_key_auth_client()
            self._user_do_dataset = DODataset(auth_client=auth_client)
        else:
            self._user_do_dataset = None
Example #14
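A constructor that builds the default DODataset, falling back to DEFAULT_USER when no default credentials are configured.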
    def __init__(self):
        self._do_dataset = None
        default_credentials = defaults.get_default_credentials() \
            or Credentials(DEFAULT_USER)
        default_auth_client = default_credentials.get_api_key_auth_client()
        self._default_do_dataset = DODataset(auth_client=default_auth_client)
Example #15
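A complete repo client that routes Data Observatory metadata queries through the user-scoped DODataset when credentials are set, and through the default one otherwise.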
class RepoClient:
    def __init__(self):
        self._do_dataset = None
        default_credentials = defaults.get_default_credentials() \
            or Credentials(DEFAULT_USER)
        default_auth_client = default_credentials.get_api_key_auth_client()
        self._default_do_dataset = DODataset(auth_client=default_auth_client)

    def set_user_credentials(self, credentials):
        if credentials is not None:
            auth_client = credentials.get_api_key_auth_client()
            self._do_dataset = DODataset(auth_client=auth_client)
        else:
            self._do_dataset = None

    def reset_user_credentials(self):
        self._do_dataset = None

    def get_countries(self, filters=None):
        return self._get_entity('countries', filters)

    def get_categories(self, filters=None):
        return self._get_entity('categories', filters)

    def get_providers(self, filters=None):
        return self._get_entity('providers', filters)

    def get_datasets(self, filters=None):
        return self._get_entity('datasets', filters, use_slug=True)

    def get_geographies(self, filters=None):
        return self._get_entity('geographies', filters, use_slug=True)

    def get_variables(self, filters=None):
        filter_id = self._get_filter_id(filters, use_slug=True)
        if filter_id:
            return self._fetch_entity_id('variables', filter_id)
        else:
            entity = 'datasets/{}/variables'.format(filters.pop('dataset'))
            return self._fetch_entity(entity, filters)

    def get_variables_groups(self, filters=None):
        filter_id = self._get_filter_id(filters, use_slug=True)
        if filter_id:
            return self._fetch_entity_id('variables_groups', filter_id)
        else:
            entity = 'datasets/{0}/variables_groups'.format(
                filters.pop('dataset'))
            return self._fetch_entity(entity, filters)

    def _get_filter_id(self, filters, use_slug=False):
        if isinstance(filters, dict):
            filter_id = filters.get('id')
            if not filter_id and use_slug:
                filter_id = filters.get('slug')
            return filter_id

    def _get_entity(self, entity, filters=None, use_slug=False):
        filter_id = self._get_filter_id(filters, use_slug)
        if filter_id:
            return self._fetch_entity_id(entity, filter_id)
        else:
            return self._fetch_entity(entity, filters)

    def _fetch_entity_id(self, entity, filter_id):
        if isinstance(filter_id, list):
            return list(
                filter(None, [
                    self._fetch_entity('{0}/{1}'.format(entity, _id))
                    for _id in filter_id
                ]))
        else:
            return self._fetch_entity('{0}/{1}'.format(entity, filter_id))

    def _fetch_entity(self, entity, filters=None):
        if self._do_dataset:
            return self._do_dataset.metadata(entity, filters)
        else:
            return self._default_do_dataset.metadata(entity, filters)
Example #16
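An end-to-end test suite covering the full DODataset lifecycle: creation, upload, import, download and enrichment.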
class TestDODataset(unittest.TestCase):
    """This test suite needs the ENV variable USERURL pointing to a working DO API in "tests/e2e/secret.json".
    DO API must have the user/apikey mapping set to get access to the user's DO Project in GCP.
    """
    def setUp(self):
        if os.environ.get('APIKEY') and os.environ.get(
                'USERNAME') and os.environ.get('USERURL'):
            self.apikey = os.environ['APIKEY']
            self.username = os.environ['USERNAME']
            self.base_url = os.environ['USERURL']
        else:
            with open('tests/e2e/secret.json') as secret_file:
                creds = json.load(secret_file)
            self.apikey = creds['APIKEY']
            self.username = creds['USERNAME']
            self.base_url = creds['USERURL']

        credentials = Credentials(username=self.username,
                                  api_key=self.apikey,
                                  base_url=self.base_url)
        auth_client = credentials.get_api_key_auth_client()
        self.do_dataset = DODataset(auth_client=auth_client)

    def test_can_upload_from_dataframe(self):
        sample = StringIO(CSV_SAMPLE_REDUCED)
        df = pandas.read_csv(sample)
        unique_table_name = 'cf_test_table_' + str(uuid.uuid4()).replace(
            '-', '_')
        self.do_dataset.name(unique_table_name).upload(df)

    def test_can_upload_from_file_object(self):
        unique_table_name = 'cf_test_table_' + str(uuid.uuid4()).replace(
            '-', '_')
        file_object = StringIO(CSV_SAMPLE_REDUCED)
        self.do_dataset.name(unique_table_name).upload_file_object(file_object)

    def test_can_import_a_dataset(self):
        unique_table_name = 'cf_test_table_' + str(uuid.uuid4()).replace(
            '-', '_')
        file_object = StringIO(CSV_SAMPLE_REDUCED)

        dataset = self.do_dataset.name(unique_table_name) \
            .column(name='id', type='INT64') \
            .column('geom', 'GEOMETRY') \
            .ttl_seconds(30)
        dataset.create()
        dataset.upload_file_object(file_object)
        job = dataset.import_dataset()

        self.assertIsInstance(job, DODatasetJob)

    def test_can_get_status_from_import(self):
        unique_table_name = 'cf_test_table_' + str(uuid.uuid4()).replace(
            '-', '_')
        file_object = StringIO(CSV_SAMPLE_REDUCED)

        dataset = self.do_dataset.name(unique_table_name) \
            .column(name='id', type='INT64') \
            .column('geom', 'GEOMETRY') \
            .ttl_seconds(30)
        dataset.create()
        dataset.upload_file_object(file_object)
        job = dataset.import_dataset()
        status = job.status()

        self.assertIn(
            status, ['pending', 'running', 'cancelled', 'success', 'failure'])

    def test_can_wait_for_job_completion(self):
        unique_table_name = 'cf_test_table_' + str(uuid.uuid4()).replace(
            '-', '_')
        file_object = StringIO(CSV_SAMPLE_REDUCED)

        dataset = self.do_dataset.name(unique_table_name) \
            .column(name='id', type='INT64') \
            .column('geom', 'GEOMETRY') \
            .ttl_seconds(30)
        dataset.create()
        dataset.upload_file_object(file_object, geom_column='geom')
        job = dataset.import_dataset()
        status = job.result()

        self.assertIn(status, ['success'])

    def test_can_upload_a_dataframe_and_wait_for_completion(self):
        unique_table_name = 'cf_test_table_' + str(uuid.uuid4()).replace(
            '-', '_')
        sample = StringIO(CSV_SAMPLE_REDUCED)
        df = pandas.read_csv(sample)

        dataset = self.do_dataset.name(unique_table_name) \
            .column(name='id', type='INT64') \
            .column('geom', 'GEOMETRY') \
            .ttl_seconds(30)
        dataset.create()
        status = dataset.upload_dataframe(df, geom_column='geom')

        self.assertIn(status, ['success'])

    def test_can_download_to_dataframe(self):
        result = self.do_dataset.name(
            'census_tracts_american_samoa').download_stream()
        df = pandas.read_csv(result)

        self.assertEqual(df.shape, (18, 13))

        # do some checks on the contents
        sample = pandas.DataFrame(df.head(),
                                  columns=('state_fips_code',
                                           'county_fips_code', 'geo_id',
                                           'tract_name', 'internal_point_geo'))
        sample['internal_point_geo'] = df['internal_point_geo'].apply(
            wkt.loads)
        geosample = geopandas.GeoDataFrame(sample,
                                           geometry='internal_point_geo')

        self.assertEqual(geosample.to_csv(index=False), EXPECTED_CSV_SAMPLE)

    def test_creation_of_dataset(self):
        unique_table_name = 'cf_test_table_' + str(uuid.uuid4()).replace(
            '-', '_')

        dataset = self.do_dataset.name(unique_table_name) \
            .column(name='cartodb_id', type='INT64') \
            .column('the_geom', 'GEOMETRY') \
            .ttl_seconds(30)
        dataset.create()

        # do a quick check on the resulting table
        result = dataset.download_stream()
        df = pandas.read_csv(result)
        self.assertEqual(df.shape, (0, 2))
        self.assertEqual(df.to_csv(index=False), 'cartodb_id,the_geom\n')

    def test_points_enrichment_dataset(self):
        variable_slug = 'poverty_a86da569'
        variable_column_name = 'poverty'

        unique_table_name = 'cf_test_table_' + str(uuid.uuid4()).replace(
            '-', '_')
        gdf = read_file(
            file_path('../observatory/enrichment/files/points.geojson'))
        gdf[_ENRICHMENT_ID] = range(gdf.shape[0])
        gdf[_GEOM_COLUMN] = gdf.geometry
        gdf = gdf[[_ENRICHMENT_ID, _GEOM_COLUMN]]

        dataset = self.do_dataset.name(unique_table_name) \
            .column(_ENRICHMENT_ID, 'INT64') \
            .column(_GEOM_COLUMN, 'GEOMETRY') \
            .ttl_seconds(_TTL_IN_SECONDS)
        dataset.create()
        status = dataset.upload_dataframe(gdf, geom_column=_GEOM_COLUMN)
        self.assertIn(status, ['success'])

        geom_type = GEOM_TYPE_POINTS
        variables = [variable_slug]
        output_name = '{}_result'.format(unique_table_name)
        status = dataset.enrichment(geom_type=geom_type,
                                    variables=variables,
                                    output_name=output_name)

        self.assertIn(status, ['success'])

        result = self.do_dataset.name(output_name).download_stream()
        result_df = pandas.read_csv(result)

        self.assertIn(variable_column_name, result_df.columns)

    def test_polygons_enrichment_dataset(self):
        variable_slug = 'poverty_a86da569'
        variable_column_name = 'poverty'

        unique_table_name = 'cf_test_table_' + str(uuid.uuid4()).replace(
            '-', '_')
        gdf = read_file(
            file_path('../observatory/enrichment/files/polygon.geojson'))
        gdf[_ENRICHMENT_ID] = range(gdf.shape[0])
        gdf[_GEOM_COLUMN] = gdf.geometry
        gdf = gdf[[_ENRICHMENT_ID, _GEOM_COLUMN]]

        dataset = self.do_dataset.name(unique_table_name) \
            .column(_ENRICHMENT_ID, 'INT64') \
            .column(_GEOM_COLUMN, 'GEOMETRY') \
            .ttl_seconds(_TTL_IN_SECONDS)
        dataset.create()
        status = dataset.upload_dataframe(gdf, geom_column=_GEOM_COLUMN)
        self.assertIn(status, ['success'])

        geom_type = GEOM_TYPE_POLYGONS
        variables = [variable_slug]
        output_name = '{}_result'.format(unique_table_name)
        status = dataset.enrichment(geom_type=geom_type,
                                    variables=variables,
                                    output_name=output_name)

        self.assertIn(status, ['success'])

        result = self.do_dataset.name(output_name).download_stream()
        df = pandas.read_csv(result)

        self.assertIn(variable_column_name, df.columns)
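
Taken together, the examples trace a single fluent workflow: build a DODataset, declare its schema, create the table, upload data, import it, and wait for the job. The sketch below condenses that flow; it is a minimal illustration, and the import paths and credential values are assumptions to adapt to your project (the tests above mock or configure them elsewhere).

# Minimal sketch of the DODataset lifecycle shown above. The import paths
# are assumptions (Credentials from cartoframes.auth, DODataset from the
# carto SDK); adjust them to the modules your project actually uses.
from io import StringIO

from cartoframes.auth import Credentials
from carto.do_dataset import DODataset

credentials = Credentials(username='user',           # hypothetical account
                          api_key='your-api-key',    # hypothetical key
                          base_url='https://user.carto.com')
auth_client = credentials.get_api_key_auth_client()

# Declare the schema fluently, then create the table server-side.
dataset = DODataset(auth_client=auth_client) \
    .name('cf_example_table') \
    .column('id', 'INT64') \
    .column('geom', 'GEOMETRY') \
    .ttl_seconds(30)
dataset.create()

# Upload a CSV file object, import it and block until the job finishes.
dataset.upload_file_object(StringIO('id,geom\n'))
job = dataset.import_dataset()
status = job.result()  # 'pending'/'running'/'cancelled'/'success'/'failure'
assert status == 'success'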