def update_person(client, project_id, dataset_id):
    """
    Populates person table with two additional columns from the person_ext table

    :param client: bigquery client
    :param project_id: identifies the project
    :param dataset_id: identifies the dataset
    :return:
    """
    person_table_id = f'{project_id}.{dataset_id}.person'
    person_ext_table_id = f'{project_id}.{dataset_id}.person_ext'
    person = client.get_table(person_table_id)
    person_ext = client.get_table(person_ext_table_id)

    # update person schema
    person_ext_schema = person_ext.schema
    person_schema = person.schema
    new_person_schema = person_schema[:]  # Creates a copy of the schema.
    for schema_field in person_ext_schema:
        if schema_field.name in ('state_of_residence_concept_id',
                                 'state_of_residence_source_value'):
            new_person_schema.append(schema_field)
    person.schema = new_person_schema
    person = client.update_table(person, ["schema"])
    LOGGER.info(f'Updated person table schema to {person.schema}')

    query_job = client.query(query=bq.get_create_or_replace_table_ddl(
        project_id=project_id,
        dataset_id=dataset_id,
        table_id='person',
        schema=person.schema,
        as_query=NEW_PERSON_QUERY_TMPL.render(person=person,
                                              person_ext=person_ext)))
    query_job.result()
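# Usage sketch (illustrative, not part of the original module): invoke
# update_person against an existing dataset. Assumes application default
# credentials and that the `person` and `person_ext` tables already exist;
# the project and dataset names below are placeholders.
from google.cloud import bigquery

client = bigquery.Client()
update_person(client, project_id='my-project', dataset_id='my_dataset')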
def test_get_table_ddl(self):
    # Schema is determined by table name
    ddl = bq.get_create_or_replace_table_ddl(self.project_id, self.dataset_id,
                                             'observation').strip()
    self.assertTrue(
        ddl.startswith(
            f'CREATE OR REPLACE TABLE `{self.project_id}.{self.dataset_id}.observation`'
        ))
    self.assertTrue(ddl.endswith(')'))

    # Explicitly provided table name and schema are rendered
    observation_schema = bq.get_table_schema('observation')
    ddl = bq.get_create_or_replace_table_ddl(
        self.project_id,
        self.dataset_id,
        table_id='custom_observation',
        schema=observation_schema).strip()
    self.assertTrue(
        ddl.startswith(
            f'CREATE OR REPLACE TABLE `{self.project_id}.{self.dataset_id}.custom_observation`'
        ))
    # Sanity check that observation schema is rendered
    self.assertTrue(
        all(field.description in ddl for field in observation_schema))
    self.assertTrue(ddl.endswith(')'))

    # Parameter as_query is rendered
    fake_as_query = "SELECT 1 FROM fake"
    ddl = bq.get_create_or_replace_table_ddl(
        self.project_id, self.dataset_id, 'observation',
        as_query=fake_as_query).strip()
    self.assertTrue(
        ddl.startswith(
            f'CREATE OR REPLACE TABLE `{self.project_id}.{self.dataset_id}.observation`'
        ))
    self.assertTrue(ddl.endswith(fake_as_query))
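# Illustrative sketch of calling the DDL helper directly, mirroring what the
# test above exercises. Assumptions: the helper accepts any list of
# google.cloud.bigquery.SchemaField objects, and 'my-project', 'my_dataset',
# and 'fake_table' are placeholder names, not real resources.
from google.cloud import bigquery

fake_schema = [
    bigquery.SchemaField('person_id', 'INT64', description='Unique person identifier'),
    bigquery.SchemaField('value', 'STRING', description='An example value'),
]
ddl = bq.get_create_or_replace_table_ddl(
    'my-project',
    'my_dataset',
    table_id='fake_table',
    schema=fake_schema,
    as_query='SELECT 1 AS person_id, "a" AS value')
print(ddl)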
def backup_rows_to_drop_ddl(self) -> str:
    """
    Get a DDL statement which loads a backup table with rows to be dropped

    :return: the DDL statement
    """
    observation_schema = bq.get_table_schema(OBSERVATION)
    query = BACKUP_ROWS_QUERY.render(lookup_table=self.lookup_table,
                                     src_table=self.observation_table)
    return bq.get_create_or_replace_table_ddl(
        project_id=self.backup_table.project,
        dataset_id=self.backup_table.dataset_id,
        table_id=self.backup_table.table_id,
        schema=observation_schema,
        as_query=query)
def stage_to_target_ddl(self) -> str:
    """
    Get a DDL statement which drops and creates the observation table with rows from stage

    :return: the DDL statement
    """
    observation_schema = bq.get_table_schema(OBSERVATION)
    stage = self.stage_table
    query = f'''SELECT * FROM `{stage.project}.{stage.dataset_id}.{stage.table_id}`'''
    return bq.get_create_or_replace_table_ddl(
        project_id=self.observation_table.project,
        dataset_id=self.observation_table.dataset_id,
        schema=observation_schema,
        table_id=self.observation_table.table_id,
        as_query=query)
def stage_cleaned_table_ddl(self) -> str:
    """
    Get a DDL statement which stages the cleaned table

    Note: This avoids a potential partitioning mismatch error when directly
          overwriting the observation table

    :return: the DDL statement
    """
    observation_schema = bq.get_table_schema(OBSERVATION)
    query = CLEANED_ROWS_QUERY.render(src=self.observation_table,
                                      backup=self.backup_table)
    return bq.get_create_or_replace_table_ddl(
        project_id=self.stage_table.project,
        dataset_id=self.stage_table.dataset_id,
        table_id=self.stage_table.table_id,
        schema=observation_schema,
        as_query=query)
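# Orchestration sketch (assumptions: `cleaner` is an instance of the class that
# defines the three DDL-building methods above, and `client` is a
# google.cloud.bigquery.Client; neither name comes from the original source).
# The backup table is written first so dropped rows remain recoverable, the
# cleaned rows are staged, and only then is the observation table replaced.
for ddl in (cleaner.backup_rows_to_drop_ddl(),
            cleaner.stage_cleaned_table_ddl(),
            cleaner.stage_to_target_ddl()):
    client.query(ddl).result()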