Example #1
 def test_list_tables(self):
     table_ids = ['table_1', 'table_2']
     table_count = len(table_ids)
     expected_max_results = table_count + bq._MAX_RESULTS_PADDING
     # mock client calls
     client = self._mock_client_with(table_ids)
     bq.list_tables(client, self.dataset_ref)
     client.list_tables.assert_called_with(dataset=self.dataset_ref,
                                           max_results=expected_max_results)
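The wrapper exercised by this test is not shown on the page. A minimal sketch of what it plausibly looks like, inferred only from the assertion above; the padding value and the _get_table_count helper are assumptions:

from google.cloud import bigquery

# Assumed padding value; the test above only shows that it is added to the
# current table count to derive max_results.
_MAX_RESULTS_PADDING = 100


def _get_table_count(client: bigquery.Client,
                     dataset: bigquery.DatasetReference) -> int:
    # Hypothetical helper: count the dataset's tables via INFORMATION_SCHEMA.
    query = (f'SELECT COUNT(1) AS n FROM '
             f'`{dataset.project}.{dataset.dataset_id}.INFORMATION_SCHEMA.TABLES`')
    return next(iter(client.query(query).result())).n


def list_tables(client: bigquery.Client,
                dataset: bigquery.DatasetReference):
    # Pad max_results so tables created between the count and the listing
    # call still fit in a single page.
    table_count = _get_table_count(client, dataset)
    return client.list_tables(dataset=dataset,
                              max_results=table_count + _MAX_RESULTS_PADDING)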
Example #2
def exclude_site_submission(client: bigquery.Client,
                            dataset: bigquery.DatasetReference,
                            hpo_ids: List[str]) -> bigquery.QueryJob:
    """
    Empty all CDM tables associated with one or more HPO sites
    
    :param client: Active bigquery client object 
    :param dataset: the dataset to exclude site data from
    :param hpo_ids: Identifies the HPO sites whose data should be excluded
    :return: Query job associated with removing all the records
    :raises RuntimeError: if CDM tables associated with a site are not found in the dataset
    """
    LOGGER.debug(
        f'exclude_site_submission called with dataset={dataset.dataset_id} and hpo_ids={hpo_ids}'
    )
    all_tables = list(bq.list_tables(client, dataset))
    tables_to_empty = []
    for hpo_id in hpo_ids:
        hpo_tables = _filter_hpo_tables(all_tables, hpo_id)
        if not hpo_tables:
            raise RuntimeError(
                f'No tables found for {hpo_id} in dataset {dataset.dataset_id}. '
                f'Ensure the specified arguments are correct.')
        tables_to_empty.extend(hpo_tables)
    script = DELETE_QUERY_TPL.render(tables_to_empty=tables_to_empty)
    LOGGER.debug(f'exclude_site_submission about to start script:\n {script}')
    return client.query(script)
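DELETE_QUERY_TPL and _filter_hpo_tables are defined elsewhere in the module. A minimal sketch of what they might look like, assuming Jinja2 templating (suggested by the .render() call) and the common <hpo_id>_<cdm_table> naming convention:

from typing import List

from google.cloud import bigquery
from jinja2 import Template

# Hypothetical reconstruction; only the two names appear in the snippet above.
DELETE_QUERY_TPL = Template('''
{% for table in tables_to_empty %}
DELETE FROM `{{ table.project }}.{{ table.dataset_id }}.{{ table.table_id }}`
WHERE TRUE;
{% endfor %}
''')


def _filter_hpo_tables(tables: List[bigquery.table.TableListItem],
                       hpo_id: str) -> List[bigquery.table.TableListItem]:
    # Assumes site tables are named with the HPO ID as a prefix.
    prefix = f'{hpo_id}_'
    return [table for table in tables if table.table_id.startswith(prefix)]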
Example #3
    def validate_rule(self, client, *args, **keyword_args):
        """
        Validates the cleaning rule which deletes or updates the data from the tables

        Method to run validation on cleaning rules that will be updating the values.
        For example:
        if your class updates all the datetime fields you should be implementing the
        validation that checks if the date time values that needs to be updated no
        longer exists in the table.

        if your class deletes a subset of rows in the tables you should be implementing
        the validation that checks if the count of final final row counts + deleted rows
        should equals to initial row counts of the affected tables.

        Raises RunTimeError if the validation fails.
        """

        dataset_ref = bigquery.DatasetReference(client.project, self.dataset_id)
        current_tables = list_tables(client, dataset_ref)
        current_tables = [table.table_id for table in current_tables]
        extra_tables = list(set(current_tables) - set(FINAL_TABLES))

        if extra_tables:
            raise RuntimeError(
                f'Some extra tables remain in the dataset: {extra_tables}')
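As a concrete illustration of the row-count pattern the docstring describes, a deletion rule's check might look roughly like the sketch below; self.initial_counts and self.deleted_counts are hypothetical attributes recorded earlier in the rule's lifecycle.

    def _validate_deleted_counts(self, client):
        # Hypothetical check: remaining rows plus deleted rows must equal
        # the initial row count recorded before the rule ran.
        for table, initial in self.initial_counts.items():
            query = f'SELECT COUNT(*) AS n FROM `{table}`'
            remaining = next(iter(client.query(query).result())).n
            deleted = self.deleted_counts[table]
            if remaining + deleted != initial:
                raise RuntimeError(
                    f'{table}: {remaining} remaining + {deleted} deleted '
                    f'!= {initial} initial rows')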
Example #4
    def setup_rule(self, client, *args, **keyword_args):
        """
        Load required resources prior to executing cleaning rule queries.

        Method to run data upload options before executing the first cleaning
        rule of a class.  For example, if your class requires loading a static
        table, that load operation should be defined here.  It SHOULD NOT BE
        defined as part of get_query_specs().
        """

        dataset_ref = bigquery.DatasetReference(client.project, self.dataset_id)
        current_tables = list_tables(client, dataset_ref)
        current_tables = [table.table_id for table in current_tables]
        self.extra_tables = list(set(current_tables) - set(FINAL_TABLES))
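For the static-table case the docstring mentions, the load step might look roughly like the following sketch; the table name, rows, and job options are placeholders, and bigquery is assumed to be google.cloud.bigquery as in the snippet above.

    def _load_static_table(self, client):
        # Hypothetical load of a small static lookup table, following the
        # pattern described in the setup_rule docstring.
        table_id = f'{client.project}.{self.dataset_id}.lookup_table'
        rows = [{'concept_id': 0, 'concept_name': 'No matching concept'}]
        job_config = bigquery.LoadJobConfig(
            write_disposition=bigquery.WriteDisposition.WRITE_TRUNCATE,
            autodetect=True)
        client.load_table_from_json(rows, table_id,
                                    job_config=job_config).result()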
Example #5
 @classmethod
 def copy_vocab_tables(cls, vocabulary_id):
     """
     A function for copying the vocab tables to the test dataset
     :param vocabulary_id: identifies the dataset containing the vocabulary tables to copy
     """
     # Copy vocab tables over to the test dataset
     vocabulary_dataset = cls.client.get_dataset(vocabulary_id)
     for src_table in bq.list_tables(cls.client, vocabulary_dataset):
         schema = bq.get_table_schema(src_table.table_id)
         destination = f'{cls.project_id}.{cls.dataset_id}.{src_table.table_id}'
         dst_table = cls.client.create_table(Table(destination,
                                                   schema=schema),
                                             exists_ok=True)
         cls.client.copy_table(src_table, dst_table)
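In a test class this helper would typically run once during fixture setup; a usage sketch with placeholder project, dataset, and vocabulary IDs (assumes from google.cloud import bigquery):

 @classmethod
 def setUpClass(cls):
     cls.project_id = 'my-test-project'  # placeholder
     cls.dataset_id = 'curation_test'  # placeholder
     cls.client = bigquery.Client(project=cls.project_id)
     cls.copy_vocab_tables('vocabulary_dataset')  # placeholder dataset ID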