def test_natality_tutorial():
    client = bigquery.Client()
    dataset_ref = client.dataset('natality_regression')
    assert not dataset_exists(dataset_ref, client)

    natality_tutorial.run_natality_tutorial()

    assert dataset_exists(dataset_ref, client)
    table = client.get_table(
        bigquery.Table(dataset_ref.table('regression_input')))
    assert table.num_rows > 0

    client.delete_dataset(dataset_ref, delete_contents=True)
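# Hedged sketch (assumption): the `dataset_exists` helper used by the test above is not
# part of this collection; a minimal implementation could look like this.
from google.cloud.exceptions import NotFound


def dataset_exists(dataset_ref, client):
    """Return True if the referenced dataset exists, False otherwise."""
    try:
        client.get_dataset(dataset_ref)  # Raises NotFound when the dataset is absent.
        return True
    except NotFound:
        return False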
def create_table(self, dataset_id, table_id, schema):
    # TODO(developer): Set table_id to the ID of the table to create.
    table_id = f"{self.client.project}.{dataset_id}.{table_id}"
    table = bigquery.Table(table_id, schema=schema)
    try:
        table = self.client.create_table(table)  # Make an API request.
        print("Created table {}.{}.{}".format(table.project, table.dataset_id,
                                              table.table_id))
    except Exception as e:
        print(e)
def test_client_query_relax_column(capsys, client, random_table_id):
    schema = [
        bigquery.SchemaField("full_name", "STRING", mode="REQUIRED"),
        bigquery.SchemaField("age", "INTEGER", mode="REQUIRED"),
    ]
    client.create_table(bigquery.Table(random_table_id, schema=schema))

    client_query_relax_column.client_query_relax_column(client, random_table_id)

    out, err = capsys.readouterr()
    assert "2 fields in the schema are required." in out
    assert "0 fields in the schema are now required." in out
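# Hedged sketch (assumption): the `client_query_relax_column` sample exercised by the
# test above is not included in this collection. A minimal version consistent with the
# asserted output might append query results to the table with ALLOW_FIELD_RELAXATION
# enabled, which turns the REQUIRED columns NULLABLE.
from google.cloud import bigquery


def client_query_relax_column(client, table_id):
    table = client.get_table(table_id)  # Make an API request.
    original_required_fields = sum(field.mode == "REQUIRED" for field in table.schema)
    print("{} fields in the schema are required.".format(original_required_fields))

    job_config = bigquery.QueryJobConfig(
        destination=table_id,
        schema_update_options=[bigquery.SchemaUpdateOption.ALLOW_FIELD_RELAXATION],
        write_disposition=bigquery.WriteDisposition.WRITE_APPEND,
    )
    query_job = client.query(
        'SELECT "Beyonce" AS full_name;',  # Illustrative query; omits the `age` column.
        job_config=job_config,
    )
    query_job.result()  # Wait for the job to complete.

    table = client.get_table(table_id)
    current_required_fields = sum(field.mode == "REQUIRED" for field in table.schema)
    print("{} fields in the schema are now required.".format(current_required_fields))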
def create_table(self, dataset_id, table_name, schema):
    """Creates a BigQuery table.

    Args:
        dataset_id (str): BigQuery dataset id.
        table_name (str): BigQuery table name.
        schema (List[google.cloud.bigquery.schema.SchemaField]): Schema of
            the table to be created.
    """
    dataset_ref = self.client.dataset(dataset_id)
    table_ref = dataset_ref.table(table_name)
    table = bigquery.Table(table_ref, schema=schema)
    self.client.create_table(table)
def create_ga_kb_exit_rate_by_product():
    schema = [
        bigquery.SchemaField('ga_date', 'DATE', mode='NULLABLE'),
        bigquery.SchemaField('ga_exitPagePath', 'STRING', mode='NULLABLE'),
        bigquery.SchemaField('ga_exitRate', 'FLOAT', mode='NULLABLE'),
        bigquery.SchemaField('ga_exits', 'INTEGER', mode='NULLABLE'),
        bigquery.SchemaField('ga_pageviews', 'INTEGER', mode='NULLABLE'),
        bigquery.SchemaField('product', 'STRING', mode='NULLABLE'),
    ]
    table_ref = dataset_ref.table('ga_kb_exit_rate_by_product')
    table = bigquery.Table(table_ref, schema=schema)
    table = client.create_table(table)  # API request
    assert table.table_id == 'ga_kb_exit_rate_by_product'
def create_new_table(client, dataset):
    """
    Create the corona cases table in the given dataset.

    :param client: BigQuery client instance
    :param dataset: dataset instance
    :return: table instance
    """
    table_id = "{}.{}.corona_cases_table".format(client.project, dataset.dataset_id)
    table = bigquery.Table(table_id)
    table = client.create_table(table, exists_ok=True)
    print(
        "Created table {}.{}.{}".format(table.project, table.dataset_id, table.table_id)
    )
    return table
def bq_create_table(client, dataset, tablename, schemaref):
    bigquery_client = client
    dataset_ref = bigquery_client.dataset(dataset)

    # Prepares a reference to the table
    table_ref = dataset_ref.table(tablename)

    try:
        bigquery_client.get_table(table_ref)
    except Exception:
        schema = schemaref
        table = bigquery.Table(table_ref, schema=schema)
        table = bigquery_client.create_table(table)
        print('table {} created.'.format(table.table_id))
def test_table_insert_rows_explicit_none_insert_ids(capsys, random_table_id, client):
    schema = [
        bigquery.SchemaField("full_name", "STRING", mode="REQUIRED"),
        bigquery.SchemaField("age", "INTEGER", mode="REQUIRED"),
    ]

    table = bigquery.Table(random_table_id, schema=schema)
    table = client.create_table(table)

    mut.table_insert_rows_explicit_none_insert_ids(random_table_id)

    out, err = capsys.readouterr()
    assert "New rows have been added." in out
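# Hedged sketch (assumption): the `mut.table_insert_rows_explicit_none_insert_ids`
# sample exercised by the test above is not shown here. A minimal version might pass
# explicit None insert IDs so BigQuery skips best-effort deduplication.
from google.cloud import bigquery


def table_insert_rows_explicit_none_insert_ids(table_id):
    client = bigquery.Client()
    rows_to_insert = [
        {"full_name": "Phred Phlyntstone", "age": 32},
        {"full_name": "Wylma Phlyntstone", "age": 29},
    ]
    errors = client.insert_rows_json(
        table_id, rows_to_insert, row_ids=[None] * len(rows_to_insert)
    )  # Make an API request.
    if errors == []:
        print("New rows have been added.")
    else:
        print("Encountered errors while inserting rows: {}".format(errors))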
def make_bq_empty_table(
    self,
    aa_datafeed_header_file_path: str,
    bq_table_id: str,
) -> bool:
    """Create an empty BigQuery table."""
    if len(self.bq_schema) == 0:
        self.__make_schema(aa_datafeed_header_file_path)
    bq_table = self.bq_dataset.table(bq_table_id)
    bq_table_ref = bigquery.Table(bq_table, schema=self.bq_schema)
    self.bigquery_client.create_table(bq_table_ref)
    return True
def build_and_copy_contents(client, src_dataset, dest_dataset):
    """
    Uses google client object to copy non-schemaed data to schemaed table.

    :param client: google client object with permissions
    :param src_dataset: The dataset to copy data from
    :param dest_dataset: The dataset to copy data to.  Its tables are created
        with valid schemas before inserting data.
    """
    LOGGER.info(f'Beginning copy of data from unschemaed dataset, '
                f'`{src_dataset}`, to schemaed dataset, `{dest_dataset}`.')
    table_list = client.list_tables(src_dataset)

    for table_item in table_list:
        # create empty schemaed table with client object
        schema_list = get_table_schema(table_item.table_id)
        dest_table = f'{client.project}.{dest_dataset}.{table_item.table_id}'
        dest_table = bigquery.Table(dest_table, schema=schema_list)
        dest_table = client.create_table(dest_table)  # Make an API request.
        LOGGER.info(
            f'Created empty table `{dest_table.project}.{dest_table.dataset_id}.{dest_table.table_id}`'
        )

        fields_name_str = ',\n'.join([item.name for item in schema_list])

        # copy contents from non-schemaed source to schemaed dest
        sql = (f'SELECT {fields_name_str} '
               f'FROM `{table_item.project}.{table_item.dataset_id}.{table_item.table_id}`')
        job_config = bigquery.job.QueryJobConfig(
            write_disposition=bigquery.job.WriteDisposition.WRITE_EMPTY,
            priority=bigquery.job.QueryPriority.BATCH,
            destination=dest_table,
            labels={
                'table_name': table_item.table_id,
                'copy_from': table_item.dataset_id,
                'copy_to': dest_dataset
            })
        job_id = (f'schemaed_copy_{table_item.table_id.lower()}_'
                  f'{datetime.now().strftime("%Y%m%d_%H%M%S")}')
        job = client.query(sql, job_config=job_config, job_id=job_id)
        job.result()  # Wait for the job to complete.

        LOGGER.info(f'Table contents `{table_item.project}.{table_item.dataset_id}.'
                    f'{table_item.table_id}` were copied to `{dest_table.project}.'
                    f'{dest_table.dataset_id}.{dest_table.table_id}`')

    LOGGER.info(f'Completed copy of data from unschemaed dataset, '
                f'`{src_dataset}`, to schemaed dataset, `{dest_dataset}`.')
def create_gtrends_queries(dataset_name, table_name):
    schema = [
        bigquery.SchemaField('update_date', 'DATE', mode='NULLABLE'),
        bigquery.SchemaField('region', 'STRING', mode='NULLABLE'),
        bigquery.SchemaField('original_query', 'STRING', mode='NULLABLE'),
        bigquery.SchemaField('translated_query', 'STRING', mode='NULLABLE'),
        bigquery.SchemaField('query_key_ts', 'STRING', mode='NULLABLE'),
        bigquery.SchemaField('search_increase_pct', 'INTEGER', mode='NULLABLE'),
    ]
    dataset_ref = client.dataset(dataset_name)
    table_ref = dataset_ref.table(table_name)
    table = bigquery.Table(table_ref, schema=schema)
    table = client.create_table(table)
def test_remove_unused_fields_from_schema_no_missing_fields(self) -> None:
    """Tests that remove_unused_fields_from_schema() does nothing if there are no missing fields."""
    table_ref = bigquery.TableReference(self.mock_dataset_ref, self.mock_table_id)
    schema_fields = [bigquery.SchemaField("field_1", "STRING")]
    table = bigquery.Table(table_ref, schema_fields)

    self.mock_client.get_table.return_value = table

    new_schema_fields = [bigquery.SchemaField("field_1", "STRING")]

    self.bq_client.remove_unused_fields_from_schema(
        self.mock_dataset_id, self.mock_table_id, new_schema_fields
    )

    self.mock_client.query.assert_not_called()
def test_add_missing_fields_to_schema(self) -> None:
    """Tests that the add_missing_fields_to_schema function calls the client to update the table."""
    table_ref = bigquery.TableReference(self.mock_dataset_ref, self.mock_table_id)
    schema_fields = [bigquery.SchemaField("fake_schema_field", "STRING")]
    table = bigquery.Table(table_ref, schema_fields)

    self.mock_client.get_table.return_value = table

    new_schema_fields = [bigquery.SchemaField("new_schema_field", "STRING")]

    self.bq_client.add_missing_fields_to_schema(
        self.mock_dataset_id, self.mock_table_id, new_schema_fields
    )

    self.mock_client.update_table.assert_called()
def add_bigquery(temp_data, table_name, table_path, dataset_name, schema):
    """
    Given a dataset name and a table_name (in bigquery), create a table in
    bigquery with schema given in 'schema' and the data stored in the Google
    Storage path 'table_path'. It deduces the file format (NJSON or CSV)
    from temp_data.
    """
    if local:
        os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = '/tmp/key.json'

    bq = bigquery.Client(project='gabinete-compartilhado')
    ds = bq.dataset(dataset_name)

    # Create dataset if non-existent:
    try:
        bq.create_dataset(ds)
    except Exception:
        pass

    # Create a local object (bigQuery table):
    table_ref = ds.table(table_name)
    table = bigquery.Table(table_ref)

    # Configure the bigquery table:
    if os.path.exists(temp_data):
        external_config = bigquery.ExternalConfig('NEWLINE_DELIMITED_JSON')
    elif os.path.exists(temp_data.replace('.json', '.csv')):
        external_config = bigquery.ExternalConfig('CSV')
    else:
        raise Exception('unknown temp_data file extension')

    external_config.schema = schema
    # external_config.autodetect = True
    external_config.ignore_unknown_values = True
    external_config.max_bad_records = 100
    source_uris = [table_path]
    external_config.source_uris = source_uris
    table.external_data_configuration = external_config

    # Create table (first delete existing table):
    try:
        bq.delete_table(table)
    except Exception as e:
        print(e)

    bq.create_table(table)
    print('Table created.')
def check_table_existance(client, full_table_id, schema):
    '''
    @parameters client, full_table_id, schema

    Using the full table ID, see if the table already exists and, if not,
    create the empty table.
    '''
    try:
        client.get_table(full_table_id)
        table_exists = True
    except NotFound:
        table_exists = False

    if table_exists:
        print(f'{full_table_id} exists. Deleting the table')
        client.delete_table(full_table_id)
        table = bigquery.Table(full_table_id, schema=schema)
        table.clustering_fields = [
            "CHROM", "ID", "analysis_workflow_type", "project_short_name"
        ]
        table = client.create_table(table)
        print(
            f"Created clustered table {table.project}, {table.dataset_id}, {table.table_id}"
        )
    else:
        print(f'{full_table_id} does not exist. Creating the table')
        table = bigquery.Table(full_table_id, schema=schema)
        table.clustering_fields = [
            "CHROM", "ID", "analysis_workflow_type", "project_short_name"
        ]
        table = client.create_table(table)
        print(
            f"Created clustered table {table.project}, {table.dataset_id}, {table.table_id}"
        )
def bigquery_commit(element):
    """
    Initialize bigquery client.
    Specify dataset, table and schema.
    Append keys to key list.
    Commit batches to bigquery in stream format.
    """
    source = contact_options.source.get()
    logging.info("Element BigQuery length: {}".format(len(element)))

    client = bigquery.Client(project=PROJECT)
    dataset_ref = client.dataset('campaign_contact')
    table_ref = dataset_ref.table(source)
    schema = [bigquery.SchemaField('id', 'STRING', mode='REQUIRED')]

    # BigQuery table checker function
    def if_tbl_exists(clients, table_refs):
        """Check if table exists; if False, create table"""
        logging.info("Check Table")
        try:
            clients.get_table(table_refs)
            return True
        except NotFound:
            return False

    if if_tbl_exists(client, table_ref) is False:
        table = bigquery.Table(table_ref, schema=schema)
        client.create_table(table)
        logging.info("Table created: {}".format(table_ref))

    key_data = []
    for elem in element:
        key_data.append({"id": elem[0][0]})

    if len(key_data) == len(element):
        client.insert_rows_json(table_ref, key_data,
                                ignore_unknown_values=True,
                                skip_invalid_rows=True)

    logging.info(
        "BigQuery Batch Commit, number of elements: {}".format(len(key_data)))
def generate_bigquery_assets(client_bq, bigquery_asset_list):
    """
    This function provisions all required BigQuery datasets & tables.

    We combine datasets & tables within a single function because creating
    tables requires the same references used when creating datasets.
    """
    from common.bigquery_schema import bigquery_table_schema_dict
    from google.cloud.exceptions import NotFound
    from google.cloud import bigquery

    def asset_exists(client, asset_type, reference):
        if asset_type not in ['dataset', 'table']:
            raise TypeError('Error - asset_type must be one of: {dataset, table}!')
        try:
            if asset_type == 'dataset':
                client.get_dataset(reference)
            elif asset_type == 'table':
                client.get_table(reference)
            return True
        except NotFound:
            return False

    # Create each dataset if it does not exist.
    for dataset_name in set([bq_asset[0] for bq_asset in bigquery_asset_list]):
        dataset_ref = client_bq.dataset(dataset_name)
        if not asset_exists(client_bq, 'dataset', dataset_ref):
            dataset = bigquery.Dataset(dataset_ref)
            client_bq.create_dataset(dataset)

    # Create each table if it does not exist.
    table_list = []
    for bq_asset in bigquery_asset_list:
        dataset_name, table_name, table_schema, table_partition = bq_asset
        table_ref = client_bq.dataset(dataset_name).table(table_name)
        if not asset_exists(client_bq, 'table', table_ref):
            table = bigquery.Table(
                table_ref, schema=bigquery_table_schema_dict[table_schema])
            if table_partition:
                table.time_partitioning = bigquery.TimePartitioning(
                    type_=bigquery.TimePartitioningType.DAY,
                    field=table_partition)
            table_list.append(client_bq.create_table(table))
        else:
            table_list.append(client_bq.get_table(table_ref))

    return table_list
def merge_task(country, **context):
    client = bigquery.Client()

    table_ref = create_dataset(
        client,
        destination_dataset_name,
        project=destination_dataset_project_id,
    ).table(destination_table_name)

    if not does_table_exist(client, table_ref):
        table = bigquery.Table(table_ref, schema=read_bigquery_schema("world_pop"))
        table.range_partitioning = RangePartitioning(
            field="year",
            range_=PartitionRange(start=1900, end=2100, interval=1),
        )
        table.clustering_fields = [
            "geography",
            "geography_polygon",
            "country",
        ]
        client.create_table(table)

    job_config = bigquery.QueryJobConfig()
    job_config.priority = bigquery.QueryPriority.INTERACTIVE

    sql_path = os.path.join(
        dags_folder, "resources/stages/load/sqls/merge_worldpop.sql")
    sql_template = read_file(sql_path)

    execution_date = context["execution_date"]
    year = execution_date.strftime("%Y")
    staging_table_name = "{table}_{country}_{year}".format(
        table=destination_table_name, country=country, year=year)

    template_context = {
        "year": year,
        "country": country,
        "source_table": staging_table_name,
        "source_project_id": staging_dataset_project_id,
        "source_dataset_name": staging_dataset_name,
        "destination_table": destination_table_name,
        "destination_dataset_project_id": destination_dataset_project_id,
        "destination_dataset_name": destination_dataset_name,
    }

    sql = context["task"].render_template(sql_template, template_context)
    job = client.query(sql, location="US", job_config=job_config)
    submit_bigquery_job(job, job_config)
    assert job.state == "DONE"
def create_table_with_schema(out_ref, flavor):
    table = bq.Table(out_ref, schema=bq_schema.make_aug_npl_schema(flavor))
    if any(
        out_ref.table_id == existing.table_id
        for existing in client.list_tables(out_ref.dataset_id)
    ):
        client.delete_table(table)
        msg.info(
            f"{out_ref.dataset_id}.{out_ref.table_id} already exists. Will overwrite."
        )
    client.create_table(table)
def test_table_exists(client, to_delete):
    """Determine if a table exists."""
    DATASET_ID = 'get_table_dataset_{}'.format(_millis())
    TABLE_ID = 'get_table_table_{}'.format(_millis())
    dataset = bigquery.Dataset(client.dataset(DATASET_ID))
    dataset = client.create_dataset(dataset)
    to_delete.append(dataset)

    table_ref = dataset.table(TABLE_ID)
    table = bigquery.Table(table_ref, schema=SCHEMA)
    table = client.create_table(table)
    to_delete.insert(0, table)

    assert table_exists(client, table_ref)
    assert not table_exists(client, dataset.table('i_dont_exist'))
def create_mapping_table(self):
    """
    Create a mapping table with a mapping table schema.
    """
    # create a fake mapping table
    schema = [
        bigquery.SchemaField("person_id", "INTEGER", mode="REQUIRED"),
        bigquery.SchemaField("research_id", "INTEGER", mode="REQUIRED"),
        bigquery.SchemaField("shift", "INTEGER", mode="REQUIRED"),
    ]

    table = bigquery.Table(self.fq_mapping_tablename, schema=schema)
    table = self.client.create_table(table)  # Make an API request.
def table(dataset, scope='function'):
    name = f'{TEST_PROJECT_ID}.{dataset.dataset_id}.{__generate_uuid()}_quickstart_test_table'
    schema = [
        bigquery.SchemaField('name', 'STRING', 'REQUIRED'),
        bigquery.SchemaField('email', 'STRING', 'REQUIRED')
    ]
    table = bigquery_client.create_table(bigquery.Table(name, schema=schema))
    time.sleep(2)  # Wait a few seconds for Data Catalog's search index sync/update.

    yield table

    bigquery_client.delete_table(table)
    time.sleep(2)  # Wait a few seconds for Data Catalog's search index sync/update.
def newCase():
    uid = uuid.uuid4()
    name = request.args.get("name")
    day = datetime.datetime.now().strftime("%Y-%m-%d")
    if not name:
        name = ""

    # Write the new Case details to Datastore
    wazePut = caseModel(uid=str(uid), day=day, name=name)
    wazeKey = wazePut.put()

    # Create the BigQuery Client
    client = bigquery.Client()
    datasetRef = client.dataset(bqDataset)
    tableSuffix = str(uid).replace('-', '_')

    # Create the Jams Table
    jamsTable = 'jams_' + tableSuffix
    tableRef = datasetRef.table(jamsTable)
    table = bigquery.Table(tableRef, schema=jamsSchema)
    table = client.create_table(table)
    assert table.table_id == jamsTable

    # Create the Alerts Table
    alertsTable = 'alerts_' + tableSuffix
    tableRef = datasetRef.table(alertsTable)
    table = bigquery.Table(tableRef, schema=alertsSchema)
    table = client.create_table(table)
    assert table.table_id == alertsTable

    # Create the Irregularities Table
    irregularitiesTable = 'irregularities_' + tableSuffix
    tableRef = datasetRef.table(irregularitiesTable)
    table = bigquery.Table(tableRef, schema=irregularitiesSchema)
    table = client.create_table(table)
    assert table.table_id == irregularitiesTable
def create_table(datasetId, tableId):
    bq_client = bigquery.Client()
    dataset_ref = bq_client.dataset(datasetId)

    # Prepares a reference to the table
    table_ref = dataset_ref.table(tableId)

    try:
        bq_client.get_table(table_ref)
    except Exception:
        schema = [
            bigquery.SchemaField('slug', 'STRING', mode='REQUIRED'),
            bigquery.SchemaField('environment', 'STRING', mode='REQUIRED')
        ]
        table = bigquery.Table(table_ref, schema=schema)
        table = bq_client.create_table(table)
        return 'table {} created.'.format(table.table_id)
def create_view(client, dataset_id, view_id, query_string):
    view_id = f"zapp-case-study.{dataset_id}.{view_id}"
    view = bigquery.Table(view_id)

    # The source table in this example is created from a CSV file in Google
    # Cloud Storage located at
    # `gs://cloud-samples-data/bigquery/us-states/us-states.csv`. It contains
    # 50 US states, while the view returns only those states with names
    # starting with the letter 'W'.
    view.view_query = query_string

    # Make an API request to create the view.
    view = client.create_table(view)
    print(f"Created {view.table_type}: {str(view.reference)}")
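# Hypothetical usage (assumption: the dataset ID, view name, and query string below are
# illustrative and not part of the original snippet; the project prefix is hard-coded
# inside create_view above).
if __name__ == "__main__":
    client = bigquery.Client()
    create_view(
        client,
        "my_dataset",        # assumed dataset ID in the zapp-case-study project
        "usa_states_view",   # assumed view name
        "SELECT name, post_abbr "
        "FROM `zapp-case-study.my_dataset.us_states` "
        "WHERE name LIKE 'W%'",
    )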
def deploy(self, destination_table: str):
    """Deploy the schema to the BigQuery table named by destination_table."""
    client = bigquery.Client()
    tmp_schema_file = NamedTemporaryFile()
    self.to_json_file(Path(tmp_schema_file.name))
    bigquery_schema = client.schema_from_json(tmp_schema_file.name)
    try:
        # destination table already exists, update schema
        table = client.get_table(destination_table)
        table.schema = bigquery_schema
        client.update_table(table, ["schema"])
    except NotFound:
        table = bigquery.Table(destination_table, schema=bigquery_schema)
        client.create_table(table)
def create_table(client: bigquery.Client, dataset_id: str, table_id: str,
                 fields: List[Dict]):
    dataset = bigquery.Dataset("{}.{}".format(client.project, dataset_id))
    table_ref = dataset.table(table_id)
    table = bigquery.Table(table_ref, schema=fields)
    table.time_partitioning = bigquery.table.TimePartitioning()  # partition by day
    try:
        table = client.create_table(table)
        print("Created table {}.{}.{}".format(table.project, table.dataset_id,
                                              table.table_id))
    except gexceptions.GoogleAPIError as e:
        print("Table {} could not be created: {}. Skipping...".format(table_id, e))
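# Hypothetical usage (assumption: the dataset ID, table ID, and field definitions below
# are illustrative). This assumes a google-cloud-bigquery version that accepts plain
# dicts in place of SchemaField objects, matching the List[Dict] type hint above.
if __name__ == "__main__":
    create_table(
        bigquery.Client(),
        "my_dataset",      # assumed dataset ID
        "events_by_day",   # assumed table ID
        [
            {"name": "event_id", "type": "STRING", "mode": "REQUIRED"},
            {"name": "event_ts", "type": "TIMESTAMP", "mode": "NULLABLE"},
        ],
    )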
def test_update_table_require_partition_filter(capsys, random_table_id, client):
    # Make a partitioned table.
    schema = [bigquery.SchemaField("transaction_timestamp", "TIMESTAMP")]
    table = bigquery.Table(random_table_id, schema=schema)
    table.time_partitioning = bigquery.TimePartitioning(field="transaction_timestamp")
    table = client.create_table(table)

    update_table_require_partition_filter.update_table_require_partition_filter(
        random_table_id)

    out, _ = capsys.readouterr()
    assert ("Updated table '{}' with require_partition_filter=True".format(
        random_table_id) in out)
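# Hedged sketch (assumption): the `update_table_require_partition_filter` sample
# exercised by the test above is not shown here; a minimal version might toggle the
# table's require_partition_filter property and persist it with update_table.
from google.cloud import bigquery


def update_table_require_partition_filter(table_id):
    client = bigquery.Client()
    table = client.get_table(table_id)  # Make an API request.
    table.require_partition_filter = True
    table = client.update_table(table, ["require_partition_filter"])
    print(
        "Updated table '{}' with require_partition_filter={}".format(
            table_id, table.require_partition_filter
        )
    )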
def create_table(project, dataset_id, table_id):
    try:
        client = bigquery.Client(project=project)
        dataset_ref = client.dataset(dataset_id)
        schema = [
            bigquery.SchemaField('full_name', 'STRING', mode='REQUIRED'),
            bigquery.SchemaField('age', 'INTEGER', mode='REQUIRED'),
        ]
        table_ref = dataset_ref.table(table_id)
        table = bigquery.Table(table_ref, schema=schema)
        table = client.create_table(table)
        print('Table {} created'.format(table_id))
    except Exception:
        print('Table {} already exists'.format(table_id))
def run_query_job(self, query, table_type='flat'):
    client = bigquery.Client()

    table_name = "{p}.{ds}.{t}_{d}".format(
        p=self.gcp_project, ds=self.dataset, t=table_type, d=self.date_shard)
    table_id = bigquery.Table(table_name)

    query_job_config = bigquery.QueryJobConfig(
        destination=table_id,
        dry_run=False,
        use_query_cache=False,
        labels={"queryfunction": "flatteningquery"},  # TODO: apply proper labels
        write_disposition=bigquery.WriteDisposition.WRITE_TRUNCATE)

    query_job = client.query(query, job_config=query_job_config)
    query_job.result()  # Waits for job to complete.
    return