    def test_handle_bq_export_task_county(self, mock_export, mock_project_id):
        """Tests that the export is called for a given table and module when
        the /export_manager/export endpoint is hit for a table in the COUNTY
        module."""
        mock_export.return_value = True

        mock_project_id.return_value = 'test-project'
        self.mock_client.dataset_ref_for_id.return_value = DatasetReference(
            mock_project_id.return_value, self.mock_dataset_name)
        table = 'fake_table'
        module = 'JAILS'
        route = '/export'
        data = {"table_name": table, "schema_type": module}

        response = self.mock_flask_client.post(
            route,
            data=json.dumps(data),
            content_type='application/json',
            headers={'X-Appengine-Inbound-Appid': 'test-project'})
        assert response.status_code == HTTPStatus.OK
        mock_export.assert_called_with(
            self.mock_client, table,
            DatasetReference.from_string(self.mock_dataset_name,
                                         mock_project_id.return_value),
            SchemaType.JAILS)

    def test_handle_bq_export_task_state(self, mock_export, mock_project_id):
        """Tests that the export is called for a given table and module when
        the /export_manager/export endpoint is hit for a table in the STATE
        module."""
        mock_export.return_value = True
        self.mock_export_config.STATE_BASE_TABLES_BQ_DATASET.return_value = 'state'

        mock_project_id.return_value = 'test-project'
        self.mock_client.dataset_ref_for_id.return_value = \
            DatasetReference.from_string('dataset', 'test-project')

        table = 'fake_table'
        module = 'STATE'
        route = '/export'
        data = {"table_name": table, "schema_type": module}

        response = self.mock_flask_client.post(
            route,
            data=json.dumps(data),
            content_type='application/json',
            headers={'X-Appengine-Inbound-Appid': 'test-project'})
        assert response.status_code == HTTPStatus.OK
        mock_export.assert_called_with(
            self.mock_client, table,
            DatasetReference.from_string('dataset', 'test-project'),
            SchemaType.STATE)
Example #3
    def test_table_lookup(self):
        # type: () -> None
        dataset_ref1 = DatasetReference('my_project', 'dataset1')
        table_ref1 = TableReference(dataset_ref1, 'table1')
        table1 = Table(table_ref1, _TEST_SCHEMA)

        # Trying to get the same dataset/table in another project doesn't work.
        with self.assertRaisesRegexp(NotFound, 'other_project'):
            self.bq_client.get_table(
                TableReference(DatasetReference('other_project', 'dataset1'),
                               'table1'))

        # Trying to get the table before the dataset exists doesn't work
        with self.assertRaisesRegexp(NotFound, 'dataset1'):
            self.bq_client.get_table(table_ref1)
        self.bq_client.create_dataset(Dataset(dataset_ref1))

        # Trying to get the table before the table exists doesn't work
        with self.assertRaises(NotFound):
            self.bq_client.get_table(table_ref1)

        self.bq_client.create_table(table1)

        # Assert the created table has the expected properties.
        table_found = self.bq_client.get_table(table_ref1)
        self.assertEqual(table_found.project, "my_project")
        self.assertEqual(table_found.dataset_id, "dataset1")
        self.assertEqual(table_found.table_id, "table1")
        six.assertCountEqual(self, table_found.schema, _TEST_SCHEMA)
Example #4
    def test_update_tables_for_state(self, mock_region_config_fn) -> None:
        mock_region_config = FakeDirectIngestRegionRawFileConfig(self.test_region.region_code)
        mock_region_config_fn.return_value = mock_region_config
        self.mock_raw_file_configs = mock_region_config.raw_file_configs

        self.update_controller = DirectIngestRawDataTableLatestViewUpdater(
            state_code=self.test_region.region_code,
            project_id=self.project_id,
            bq_client=self.mock_big_query_client
        )

        with local_project_id_override(self.project_id):
            self.update_controller.update_views_for_state()

            self.assertEqual(self.mock_big_query_client.create_or_update_view.call_count, 2)

            raw_data_dataset = DatasetReference(self.project_id, 'us_xx_raw_data')
            self.mock_big_query_client.table_exists.assert_has_calls([
                mock.call(raw_data_dataset, 'tagA'),
                mock.call(raw_data_dataset, 'tagB'),
                mock.call(raw_data_dataset, 'tagC'),
                mock.call(raw_data_dataset, 'tagWeDoNotIngest')
            ])

            mock_views = [DirectIngestRawDataTableLatestView(region_code=self.test_region.region_code,
                                                             raw_file_config=self.mock_raw_file_configs['tagA']),
                          DirectIngestRawDataTableLatestView(region_code=self.test_region.region_code,
                                                             raw_file_config=self.mock_raw_file_configs['tagC'])]
            views_dataset = DatasetReference(self.project_id, 'us_xx_raw_data_up_to_date_views')

            self.mock_big_query_client.create_or_update_view.assert_has_calls([mock.call(views_dataset, x)
                                                                               for x in mock_views])

            self.mock_big_query_client.create_dataset_if_necessary.assert_called_once()
            self.mock_big_query_client.create_dataset_if_necessary.assert_has_calls([mock.call(views_dataset)])
Example #5
    def test_dataset_lookup(self):
        # type: () -> None
        with self.assertRaisesRegexp(NotFound, 'some_other_project'):
            self.bq_client.get_dataset(DatasetReference('some_other_project', 'dataset1'))
        with self.assertRaisesRegexp(NotFound, 'dataset1'):
            self.bq_client.get_dataset(DatasetReference('my_project', 'dataset1'))
        dataset1 = Dataset(DatasetReference('my_project', 'dataset1'))
        self.bq_client.create_dataset(dataset1)

        self.assertDatasetReferenceEqual(
            self.bq_client.get_dataset(DatasetReference('my_project', 'dataset1')).reference,
            dataset1.reference)
        self.assertDatasetReferenceEqual(self.bq_client.dataset('dataset1'),
                                         dataset1.reference)
        self.assertDatasetReferenceEqual(self.bq_client.dataset('dataset1', 'my_project'),
                                         dataset1.reference)
Example #6
    def __create_job_config(
            self, ems_query_job_config: EmsQueryJobConfig) -> QueryJobConfig:
        job_config = QueryJobConfig()
        job_config.priority = ems_query_job_config.priority.value
        job_config.use_legacy_sql = False
        job_config.use_query_cache = ems_query_job_config.use_query_cache
        job_config.labels = ems_query_job_config.labels
        if ems_query_job_config.destination_table is not None:
            job_config.time_partitioning = TimePartitioning("DAY")
            table_reference = TableReference(
                DatasetReference(
                    ems_query_job_config.destination_project_id
                    or self.__project_id,
                    ems_query_job_config.destination_dataset),
                ems_query_job_config.destination_table)
            job_config.destination = table_reference
            job_config.write_disposition = ems_query_job_config.write_disposition.value
            job_config.create_disposition = ems_query_job_config.create_disposition.value
        partitioning = ems_query_job_config.time_partitioning
        if partitioning is not None:
            job_config.time_partitioning = TimePartitioning(
                partitioning.type.value, partitioning.field,
                partitioning.expiration_ms,
                partitioning.require_partition_filter)
        if ems_query_job_config.table_definitions is not None:
            job_config.table_definitions = ems_query_job_config.table_definitions
        return job_config
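
For reference, the destination setup above maps onto the client library directly; a minimal sketch, where the project, dataset, and table names are hypothetical:

# Minimal sketch of the equivalent destination configuration;
# 'my-project', 'my_dataset', and 'my_table' are hypothetical names.
from google.cloud.bigquery import (DatasetReference, QueryJobConfig,
                                   TableReference, TimePartitioning,
                                   WriteDisposition)

job_config = QueryJobConfig()
job_config.use_legacy_sql = False
job_config.destination = TableReference(
    DatasetReference('my-project', 'my_dataset'), 'my_table')
job_config.time_partitioning = TimePartitioning('DAY')
job_config.write_disposition = WriteDisposition.WRITE_TRUNCATE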
Example #7
    def _table_ref(self, table_id):
        """Return a BigQuery client library table reference"""
        from google.cloud.bigquery import DatasetReference
        from google.cloud.bigquery import TableReference

        return TableReference(
            DatasetReference(self.project_id, self.dataset_id), table_id)
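
When the fully-qualified table path is already in hand, the client library's own parser does the same job; the string below is illustrative:

# Equivalent construction from a fully-qualified table ID string
# (hypothetical names).
from google.cloud.bigquery import TableReference

table_ref = TableReference.from_string('my-project.my_dataset.my_table')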
Example #8
    def _load_table_from_cloud_storage_async(
            self, source_uri: str,
            destination_dataset_ref: bigquery.DatasetReference,
            destination_table_id: str,
            destination_table_schema: List[bigquery.SchemaField],
            write_disposition: bigquery.WriteDisposition
    ) -> bigquery.job.LoadJob:

        self.create_dataset_if_necessary(destination_dataset_ref)

        destination_table_ref = destination_dataset_ref.table(
            destination_table_id)

        job_config = bigquery.LoadJobConfig()
        job_config.schema = destination_table_schema
        job_config.source_format = bigquery.SourceFormat.CSV
        job_config.allow_quoted_newlines = True
        job_config.write_disposition = write_disposition

        load_job = self.client.load_table_from_uri(source_uri,
                                                   destination_table_ref,
                                                   job_config=job_config)

        logging.info("Started load job [%s] for table [%s.%s.%s]",
                     load_job.job_id, destination_table_ref.project,
                     destination_table_ref.dataset_id,
                     destination_table_ref.table_id)

        return load_job
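
Since the job is started asynchronously, callers block on it when they need the load to finish; a minimal sketch, where `bq_client` stands in for an instance of the class above and the other names are hypothetical:

from google.cloud import bigquery

# bq_client: an instance of the class above (hypothetical setup).
dataset_ref = bigquery.DatasetReference('my-project', 'my_dataset')
schema = [bigquery.SchemaField('id', 'INT64')]
load_job = bq_client._load_table_from_cloud_storage_async(
    source_uri='gs://my-bucket/data.csv',
    destination_dataset_ref=dataset_ref,
    destination_table_id='my_table',
    destination_table_schema=schema,
    write_disposition=bigquery.WriteDisposition.WRITE_APPEND)
load_job.result()  # blocks until the load completes; raises on failure
print(load_job.output_rows)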
Example #9
def make_table(project,
               dataset_id,
               table_id,
               friendly_name=None,
               description=None,
               expires=None,
               partitioning_type=None,
               view_use_legacy_sql=None,
               view_query=None,
               schema=None,
               labels=None):
    dataset_ref = DatasetReference(project, dataset_id)
    table_ref = TableReference(dataset_ref, table_id)
    table = Table(table_ref)
    table.friendly_name = friendly_name
    table.description = description
    table.expires = expires
    table.partitioning_type = partitioning_type
    if view_use_legacy_sql is not None:
        table.view_use_legacy_sql = view_use_legacy_sql
    if view_query is not None:
        table.view_query = view_query
    table.schema = schema
    if labels is not None:
        table.labels = labels
    return table
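
A hypothetical call, building a day-partitioned table definition (all values illustrative):

from google.cloud.bigquery import SchemaField

events_table = make_table(
    'my-project', 'my_dataset', 'events',
    description='Raw event stream',
    partitioning_type='DAY',
    schema=[SchemaField('event_id', 'STRING'),
            SchemaField('ts', 'TIMESTAMP')],
    labels={'env': 'dev'})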
Example #10
    def test_listing_datasets(self):
        # type: () -> None
        self.assertFalse(self.bq_client.list_datasets())
        self.assertFalse(self.bq_client.list_datasets('my_project'))
        self.bq_client.create_dataset(Dataset(DatasetReference('my_project', 'dataset1')))
        self.bq_client.create_dataset(Dataset(DatasetReference('my_project', 'dataset2')))
        self.bq_client.create_dataset(Dataset(DatasetReference('other_project', 'dataset3')))
        six.assertCountEqual(
            self,
            [dataset.dataset_id for dataset in self.bq_client.list_datasets()],
            ['dataset1', 'dataset2'])
        six.assertCountEqual(
            self,
            [dataset.dataset_id for dataset in self.bq_client.list_datasets('my_project')],
            ['dataset1', 'dataset2'])
        six.assertCountEqual(
            self,
            [dataset.dataset_id for dataset in self.bq_client.list_datasets('other_project')],
            ['dataset3'])
Example #11
0
    def setUp(self):
        # Input parameters expected by the class
        self.project_id = 'bar_project'
        self.dataset_id = 'foo_dataset'
        self.description = 'fake_description'
        self.existing_labels_or_tags = {'label': 'value', 'tag': ''}
        self.new_labels_or_tags = {'label': 'new_value', 'new_tag': ''}
        self.updated = {'tag': '', 'label': 'new_value', 'new_tag': ''}
        self.dataset_ref = DatasetReference(self.project_id, self.dataset_id)
Example #12
    def test_listing_tables_with_max(self):
        # type: () -> None
        dataset_ref1 = DatasetReference('my_project', 'dataset1')
        self.bq_client.create_dataset(Dataset(dataset_ref1))
        for i in range(10):
            self.bq_client.create_table(Table(TableReference(dataset_ref1, 'table{}'.format(i)),
                                              _TEST_SCHEMA))
        self.assertEqual(5, len(self.bq_client.list_tables(dataset_ref1, max_results=5)))
        self.assertEqual(10, len(self.bq_client.list_tables(dataset_ref1, max_results=20)))
        self.assertEqual(10, len(self.bq_client.list_tables(dataset_ref1)))
Example #13
    def test_listing_tables(self):
        # type: () -> None
        dataset_ref1 = DatasetReference('my_project', 'dataset1')
        self.bq_client.create_dataset(Dataset(dataset_ref1))
        self.bq_client.create_table(Table(TableReference(dataset_ref1, 'table1'), _TEST_SCHEMA))
        self.bq_client.create_table(Table(TableReference(dataset_ref1, 'table2'), []))
        six.assertCountEqual(
            self,
            [table_ref.table_id for table_ref in self.bq_client.list_tables(dataset_ref1)],
            ['table1', 'table2'])
Example #14
    def test_delete_dataset_if_exists_WhenItIsNotEmpty(self):
        dataset_id = self.__generate_test_name("dataset")
        table_name = self.__generate_test_name("table")
        self.client.create_dataset_if_not_exists(dataset_id)
        self.__create_test_table(
            table_name,
            Dataset(DatasetReference(GCP_PROJECT_ID, dataset_id)).reference)

        self.client.delete_dataset_if_exists(dataset_id, delete_contents=True)

        self.assertFalse(self.client.dataset_exists(dataset_id))
Example #15
    def table_exists(self, dataset_ref: bigquery.DatasetReference,
                     table_id: str) -> bool:
        table_ref = dataset_ref.table(table_id)

        try:
            self.client.get_table(table_ref)
            return True
        except exceptions.NotFound:
            logging.warning("Table [%s] does not exist in dataset [%s]",
                            table_id, str(dataset_ref))
            return False
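
A minimal usage sketch; `bq_client` stands in for an instance of the class above, and the names are hypothetical:

from google.cloud import bigquery

dataset_ref = bigquery.DatasetReference('my-project', 'my_dataset')
if not bq_client.table_exists(dataset_ref, 'my_table'):
    pass  # e.g. create the table before querying or loading into it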
Example #16
    def run_async_load_job(self, job_id_prefix: str,
                           config: EmsLoadJobConfig) -> str:
        return self.__bigquery_client.load_table_from_uri(
            source_uris=config.source_uri_template,
            destination=TableReference(
                DatasetReference(config.destination_project_id,
                                 config.destination_dataset),
                config.destination_table),
            job_id_prefix=job_id_prefix,
            location=self.__location,
            job_config=self.__create_load_job_config(config)).job_id
Example #17
    def dataset(self, dataset_id, project=None):
        # type: (str, Optional[str]) -> DatasetReference
        """Constructs a reference to a dataset.

        Args:
            dataset_id: Dataset to look up
            project: If specified, project to find the dataset in, otherwise client's default.

        Returns:
            Reference to the dataset found.
        """
        project = project or self.project
        return DatasetReference(project, dataset_id)
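
Both forms below resolve to the same reference when the client's default project is 'my_project'; `bq_client` stands in for an instance of the class above, and the names are illustrative:

ref_default = bq_client.dataset('dataset1')
ref_explicit = bq_client.dataset('dataset1', 'my_project')
assert ref_default == ref_explicit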
Example #18
    def test_dataset_delete(self):
        # type: () -> None
        dataset1 = Dataset(DatasetReference('my_project', 'dataset1'))

        # Can't delete dataset, doesn't exist yet.
        with self.assertRaisesRegexp(NotFound, 'dataset1'):
            self.bq_client.delete_dataset(dataset1.reference)

        self.bq_client.create_dataset(dataset1)
        self.bq_client.create_table(Table(TableReference(dataset1.reference, 'table1'),
                                          _TEST_SCHEMA))

        # Can't delete dataset, not empty
        with self.assertRaises(BadRequest):
            self.bq_client.delete_dataset(dataset1.reference)

        # Okay to delete, specifically requesting to delete contents.
        self.bq_client.delete_dataset(dataset1.reference, delete_contents=True)

        # And now dataset is gone again.
        with self.assertRaisesRegexp(NotFound, 'dataset1'):
            self.bq_client.get_dataset(DatasetReference('my_project', 'dataset1'))
Example #19
    def get_raw_details(self, dataset_id, view_id):
        if self.did_full_update:
            return self.raw_details[dataset_id][view_id]
        if dataset_id not in self.raw_details:
            self.raw_details[dataset_id] = dict()
        if view_id not in self.raw_details[dataset_id]:
            try:
                self.raw_details[dataset_id][view_id] = self.bq_client._client.get_table(
                    TableReference(
                        DatasetReference(self.bq_conductor_conf.GOOGLE_CLOUD_PROJECT,
                                         dataset_id.split('.')[-1]),
                        view_id)).to_api_repr()
            except Exception:  # cache the miss instead of propagating lookup errors
                self.raw_details[dataset_id][view_id] = None
        return self.raw_details[dataset_id][view_id]
Example #20
    def setUp(self):
        project_id = 'fake_project_id'
        dataset_id = 'fake_dataset_id'
        bucket_name = 'fake_bucket'
        dataset_ref = DatasetReference(project_id, dataset_id)
        self.dst_dataset = Dataset(dataset_ref)
        self.bq_client = mock.MagicMock()
        self.gcs_client = mock.MagicMock()
        self.bucket_name = bucket_name
        self.all_blobs = [
            Blob(f'{table}.csv', self.bucket_name)
            for table in common.VOCABULARY_TABLES
        ]
Example #21
    def setUp(self):
        self.project_id = 'fake_project'
        self.dataset_id = 'fake_dataset'
        self.dataset = DatasetReference(self.project_id, self.dataset_id)
        self.hpo_ids = ['fake_hpo1', 'fake_hpo2']
        self.hpo_cdm_tables = [
            self._table_list_item(hpo_id, cdm_table) for hpo_id in self.hpo_ids
            for cdm_table in CDM_TABLES
        ]
        self.hpo_cdm_and_achilles_tables = [
            self._table_list_item(hpo_id, cdm_table) for hpo_id in self.hpo_ids
            for cdm_table in CDM_TABLES + ['achilles', 'achilles_results']
        ]
Example #22
    def setUp(self):
        super(ClientWriteFromQueryTest, self).setUp()
        dataset_ref = DatasetReference(self.bq_client.project, 'my_dataset')
        schema = [SchemaField(name="a", field_type='INT64'),
                  SchemaField(name="b", field_type='FLOAT64'),
                  ]
        self.source_table = Table(TableReference(dataset_ref, 'source_table'), schema)
        self.destination_table = Table(TableReference(dataset_ref, 'destination_table'), schema)
        self.bq_client.create_dataset(Dataset(dataset_ref))
        self.bq_client.create_table(self.source_table)
        # We don't create the destination table here; some tests do not want it created.

        # Stick two rows into source_table
        self.assertFalse(self.bq_client.insert_rows(self.source_table,
                                                    [{'a': 1, 'b': 2.5}, {'a': 3, 'b': 4.25}]))
Example #23
    def get_mapping_table_ids(self, client):
        """
        Returns all the mapping table ids found in the mapping dataset.

        :param client: a BigQuery client used to list tables in the
            mapping dataset
        :return: list of mapping table ids
        """
        dataset_ref = DatasetReference(self.project_id,
                                       self._mapping_dataset_id)
        table_objs = client.list_tables(dataset_ref)
        mapping_table_ids = [
            table_obj.table_id for table_obj in table_objs
            if table_obj.table_id in MAPPING_TABLES
        ]
        return mapping_table_ids
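
A hypothetical call, with `cleaner` standing in for an instance of the class above:

from google.cloud import bigquery

client = bigquery.Client(project='my-project')  # hypothetical project
mapping_table_ids = cleaner.get_mapping_table_ids(client)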
Example #24
    def to_dataset(project, model):
        access_entries = model.access_entries
        if access_entries:
            access_entries = tuple(
                BigQueryAccessEntry.to_access_entry(a) for a in access_entries)
        else:
            access_entries = ()
        dataset_ref = DatasetReference(project, model.dataset_id)
        dataset = Dataset(dataset_ref)
        dataset.friendly_name = model.friendly_name
        dataset.description = model.description
        dataset.default_table_expiration_ms = model.default_table_expiration_ms
        dataset.location = model.location
        dataset.access_entries = access_entries
        dataset.labels = model.labels if model.labels is not None else dict()
        return dataset
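
A minimal sketch of a call; the real model class is not shown in this snippet, so a SimpleNamespace stands in for it and all values are illustrative:

from types import SimpleNamespace

model = SimpleNamespace(
    dataset_id='my_dataset', friendly_name=None, description=None,
    default_table_expiration_ms=None, location='US',
    access_entries=None, labels=None)
dataset = to_dataset('my-project', model)  # hypothetical project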
Example #25
    def setUp(self):
        self.project_id = os.environ.get(PROJECT_ID)
        self.dataset_id = os.environ.get('COMBINED_DATASET_ID')
        self.dataset_ref = DatasetReference(self.project_id, self.dataset_id)
        self.client = bq.get_client(self.project_id)

        self.schema = [
            SchemaField("person_id", "INT64"),
            SchemaField("first_name", "STRING"),
            SchemaField("last_name", "STRING"),
            SchemaField("algorithm", "STRING")
        ]

        self.ps_api_fields = [
            dict(name='person_id', type='integer', mode='nullable'),
            dict(name='first_name', type='string', mode='nullable'),
            dict(name='last_name', type='string', mode='nullable')
        ]

        self.id_match_fields = [
            dict(name='person_id', type='integer', mode='nullable'),
            dict(name='first_name', type='string', mode='nullable'),
            dict(name='last_name', type='string', mode='nullable'),
            dict(name='algorithm', type='string', mode='nullable')
        ]

        self.hpo_id = 'fake_site'
        self.id_match_table_id = f'{IDENTITY_MATCH_TABLE}_{self.hpo_id}'
        self.ps_values_table_id = f'ps_api_values_{self.hpo_id}'

        # Create and populate the ps_values site table

        schema = bq.get_table_schema(PS_API_VALUES)
        tablename = self.ps_values_table_id

        table = Table(f'{self.project_id}.{self.dataset_id}.{tablename}',
                      schema=schema)
        table.time_partitioning = TimePartitioning(
            type_=TimePartitioningType.HOUR)
        table = self.client.create_table(table)

        populate_query = POPULATE_PS_VALUES.render(
            project_id=self.project_id,
            drc_dataset_id=self.dataset_id,
            ps_values_table_id=self.ps_values_table_id)
        job = self.client.query(populate_query)
        job.result()
Example #26
def ensure_dataset(project_id, dataset_id, location):
    from google.cloud.bigquery import DatasetReference
    client = bigquery.Client(project=project_id, location=location)

    dataset_ref = DatasetReference(project_id, dataset_id)
    try:
        client.create_dataset(dataset_ref)
    except exceptions.GoogleAPICallError as e:
        if e.response.status_code == 409:  # dataset exists
            pass
        else:
            logger.critical(
                f"unable to create dataset {dataset_id} in project {project_id}; Exception {e}"
            )
            return 2  # sys.exit(2)

    return client, Dataset(dataset_ref)
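
Note that the error path returns the integer 2 rather than a (client, dataset) tuple, so callers should check the shape of the result; a hypothetical call:

result = ensure_dataset('my-project', 'my_dataset', 'US')  # hypothetical names
if result == 2:
    raise RuntimeError('could not create or reuse the dataset')
client, dataset = result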
Example #27
def send_to_bq(dataset, table, row):
    bigquery_client = bigquery.Client(project='theo-home')

    table_ref = TableReference(
        dataset_ref=DatasetReference(dataset_id=dataset, project='theo-home'),
        table_id=table,
    )

    schema = [SchemaField(name=field, field_type=bq_types[type(data)]) for field, data in row.items()]

    table = bigquery_client.create_table(
        Table(table_ref, schema=schema),
        exists_ok=True
    )

    errors = bigquery_client.insert_rows(table, [row])
    if errors:
        print(errors, file=sys.stderr)
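
The function relies on a module-level `bq_types` mapping from Python types to BigQuery field types, which is not shown; a minimal sketch of that assumed mapping and a call:

# Assumed shape of the bq_types mapping; the original is not shown.
bq_types = {str: 'STRING', int: 'INT64', float: 'FLOAT64', bool: 'BOOL'}

send_to_bq('metrics', 'temperatures', {'room': 'office', 'celsius': 21.5})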
Example #28
def make_dataset(project,
                 dataset_id,
                 friendly_name=None,
                 description=None,
                 default_table_expiration_ms=None,
                 location=None,
                 labels=None,
                 access_entries=None):
    dataset_ref = DatasetReference(project, dataset_id)
    dataset = Dataset(dataset_ref)
    dataset.friendly_name = friendly_name
    dataset.description = description
    dataset.default_table_expiration_ms = default_table_expiration_ms
    dataset.location = location
    if labels is not None:
        dataset.labels = labels
    if access_entries is not None:
        dataset.access_entries = access_entries
    return dataset
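
A hypothetical call (all values illustrative):

dataset = make_dataset(
    'my-project', 'my_dataset',
    friendly_name='My dataset',
    default_table_expiration_ms=24 * 60 * 60 * 1000,  # one day
    location='EU',
    labels={'env': 'dev'})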
Example #29
    def test_load_table_from_file(self, column_to_null):
        # type: (Optional[int]) -> None
        dataset_ref = DatasetReference('my_project', 'my_dataset')
        dataset = Dataset(dataset_ref)
        table = Table(TableReference(dataset_ref, 'table1'), _TEST_SCHEMA)
        self.bq_client.create_dataset(dataset)
        self.bq_client.create_table(table)
        output = cStringIO()
        csv_out = csv.writer(output)
        input_row = list(self.INPUT_ROW)
        expected_row = list(self.EXPECTED_ROW)
        if column_to_null is not None:
            input_row[column_to_null] = 'NULL'
            expected_row[column_to_null] = None
        csv_out.writerow(input_row)
        self.bq_client.load_table_from_file(output, table.reference, job_config=None, rewind=True)
        self.assertRowsExpected(
            self.bq_client.query('SELECT * FROM `my_project.my_dataset.table1`',
                                 QueryJobConfig()),
            [expected_row])
Example #30
    def copy_view(
            self, view: BigQueryView, destination_client: BigQueryClient,
            destination_dataset_ref: bigquery.DatasetReference
    ) -> bigquery.Table:

        if destination_client.table_exists(destination_dataset_ref,
                                           view.view_id):
            raise ValueError(
                f"Table [{view.view_id}] already exists in dataset!")

        # Create the destination dataset if it doesn't yet exist
        destination_client.create_dataset_if_necessary(destination_dataset_ref)

        new_view_ref = destination_dataset_ref.table(view.view_id)
        new_view = bigquery.Table(new_view_ref)
        new_view.view_query = view.view_query.format(
            destination_client.project_id, destination_dataset_ref.dataset_id,
            view.view_id)
        table = destination_client.create_table(new_view)
        logging.info("Created %s", new_view_ref)
        return table
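
A usage sketch only; BigQueryView and BigQueryClient are project-specific types, so the objects below stand in for instances constructed elsewhere:

copied_table = source_client.copy_view(
    view=my_view,                            # a BigQueryView built elsewhere
    destination_client=destination_client,   # a BigQueryClient for the target project
    destination_dataset_ref=destination_dataset_ref)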