def copy_table(dataset_name, table_name, new_table_name, project=None):
    """Copies a table.

    If no project is specified, then the currently active project is used.

    Args:
        dataset_name: Name of the dataset containing the source table.
        table_name: Name of the table to copy.
        new_table_name: Name of the destination table.
        project: Optional project ID; defaults to the active project.
    """
    bigquery_client = bigquery.Client(project=project)
    dataset = bigquery_client.dataset(dataset_name)
    table = dataset.table(table_name)

    # This sample shows the destination table in the same dataset and project,
    # however, it's possible to copy across datasets and projects. You can
    # also copy multiple source tables into a single destination table by
    # providing additional arguments to `copy_table`.
    destination_table = dataset.table(new_table_name)

    # Create a job to copy the table to the destination table.
    # A random UUID serves as the unique job ID required by the API.
    job_id = str(uuid.uuid4())
    job = bigquery_client.copy_table(
        job_id, destination_table, table)

    # Create the table if it doesn't exist.
    job.create_disposition = (
        gcloud.bigquery.job.CreateDisposition.CREATE_IF_NEEDED)

    # Start the job.
    job.begin()

    # Wait for the job to finish.
    print('Waiting for job to finish...')
    wait_for_job(job)

    print('Table {} copied to {}.'.format(table_name, new_table_name))
def handler(event, context):
    """AWS Lambda entry point: decode Kinesis records and stream them
    into a BigQuery table.

    Args:
        event: Lambda event dict; each item of event['Records'] carries a
            base64-encoded JSON payload under ['kinesis']['data'].
        context: Lambda context object (unused).
    """
    rows = []
    for record in event['Records']:
        payload = record['kinesis']['data']
        try:
            data = json.loads(base64.b64decode(payload))
            row = []
            for key in ['time', 'tag', 'value']:
                if key == 'time':
                    # The payload carries an epoch timestamp; convert it to
                    # a datetime for the BigQuery TIMESTAMP column.
                    row.append(datetime.datetime.fromtimestamp(data[key]))
                else:
                    row.append(data[key])
            rows.append(tuple(row))
        except Exception as e:
            # Best-effort: log and skip malformed records rather than
            # failing the whole batch. (Original had a redundant `pass`.)
            print('Invalid data "{0}": {1}'.format(payload, e))

    if not rows:
        return

    # Decrypt the base64-encoded, KMS-encrypted service-account key held
    # in BQ_CREDENTIALS, then build BigQuery credentials from it.
    kms = boto3.client('kms')
    blob = base64.b64decode(BQ_CREDENTIALS)
    dec = kms.decrypt(CiphertextBlob=blob)
    keyfile_dict = json.loads(dec['Plaintext'])
    credentials = ServiceAccountCredentials.from_json_keyfile_dict(
        keyfile_dict)

    bq = bigquery.Client(credentials=credentials, project=BQ_PROJECT)
    dataset = bq.dataset(BQ_DATASET)
    table = dataset.table(BQ_TABLE)
    # Fetch the schema; insert_data() requires it to be populated locally.
    table.reload()
    res = table.insert_data(rows)
    print(res)
def async_query(query):
    """Runs `query` as an asynchronous BigQuery job (standard SQL) and
    prints every result row, paging 10 rows at a time."""
    client = bigquery.Client()
    # A random UUID serves as the unique job name required by the API.
    query_job = client.run_async_query(str(uuid.uuid4()), query)
    query_job.use_legacy_sql = False
    query_job.begin()

    wait_for_job(query_job)

    # Manually construct the QueryResults.
    # TODO: The client library will provide a helper method that does this.
    # https://github.com/GoogleCloudPlatform/gcloud-python/issues/2083
    # NOTE: this reaches into the private `_properties` dict to attach the
    # finished job's reference, so fetch_data() reads that job's results.
    query_results = bigquery.query.QueryResults('', client)
    query_results._properties['jobReference'] = {
        'jobId': query_job.name,
        'projectId': query_job.project
    }

    # Drain the query results by requesting a page at a time.
    page_token = None

    while True:
        rows, total_rows, page_token = query_results.fetch_data(
            max_results=10, page_token=page_token)

        for row in rows:
            print(row)

        # A missing page token means the last page has been fetched.
        if not page_token:
            break
def delete_table(dataset_name, table_name, project=None):
    """Deletes a table in a given dataset.

    If no project is specified, then the currently active project is used.
    """
    client = bigquery.Client(project=project)
    # Resolve the table reference and drop it in one chained expression.
    client.dataset(dataset_name).table(table_name).delete()
    print('Table {}:{} deleted.'.format(dataset_name, table_name))
def test_delete_table(capsys):
    # Provision a throwaway table so there is something to delete.
    client = bigquery.Client()
    table = client.dataset(DATASET_ID).table('test_delete_table')

    if not table.exists():
        table.schema = [bigquery.SchemaField('id', 'INTEGER')]
        table.create()

    snippets.delete_table(DATASET_ID, table.name)

    # The snippet must have removed the table from the service.
    assert not table.exists()
def export_data_to_gcs(dataset_name, table_name, destination):
    """Extracts the given table to a Cloud Storage destination URI."""
    client = bigquery.Client()
    source_table = client.dataset(dataset_name).table(table_name)

    # Launch the extract job under a fresh, unique job name.
    extract_job = client.extract_table_to_storage(
        str(uuid.uuid4()), source_table, destination)
    extract_job.begin()

    wait_for_job(extract_job)

    print('Exported {}:{} to {}'.format(
        dataset_name, table_name, destination))
def load_data_from_gcs(dataset_name, table_name, source):
    """Loads data from a Cloud Storage source URI into the given table."""
    client = bigquery.Client()
    target = client.dataset(dataset_name).table(table_name)

    # Launch the load job under a fresh, unique job name.
    load_job = client.load_table_from_storage(
        str(uuid.uuid4()), target, source)
    load_job.begin()

    wait_for_job(load_job)

    print('Loaded {} rows into {}:{}.'.format(
        load_job.output_rows, dataset_name, table_name))
def stream_data(dataset_name, table_name, json_data):
    """Streams a single JSON-encoded row into the given table."""
    client = bigquery.Client()
    table = client.dataset(dataset_name).table(table_name)
    row = json.loads(json_data)

    # Reload the table to get the schema; insert_data() requires it.
    table.reload()

    errors = table.insert_data([row])

    if errors:
        print('Errors:')
        pprint(errors)
    else:
        print('Loaded 1 row into {}:{}'.format(dataset_name, table_name))
def load_data_from_file(dataset_name, table_name, source_file_name):
    """Uploads a local CSV file into the given table."""
    client = bigquery.Client()
    table = client.dataset(dataset_name).table(table_name)

    # Reload the table so the upload knows the column layout.
    table.reload()

    with open(source_file_name, 'rb') as data_file:
        # This example uses CSV, but you can use other formats.
        # See https://cloud.google.com/bigquery/loading-data
        load_job = table.upload_from_file(
            data_file, source_format='text/csv')

    load_job.begin()

    wait_for_job(load_job)

    print('Loaded {} rows into {}:{}.'.format(
        load_job.output_rows, dataset_name, table_name))
def list_rows(dataset_name, table_name, project=None):
    """Prints rows in the given table.

    Will print 25 rows at most for brevity as tables can contain large
    amounts of rows.

    If no project is specified, then the currently active project is used.
    """
    client = bigquery.Client(project=project)
    table = client.dataset(dataset_name).table(table_name)

    if not table.exists():
        print('Table {}:{} does not exist.'.format(dataset_name, table_name))
        return

    # Reload the table so that the schema is available.
    table.reload()

    # Load at most 25 results. You can change this to `while True` and change
    # the max_results argument to load more rows from BigQuery, but note
    # that this can take some time. It's preferred to use a query.
    rows = []
    token = None
    while len(rows) < 25:
        page, total_rows, token = table.fetch_data(
            max_results=25, page_token=token)
        rows.extend(page)
        if not token:
            break

    # Use format to create a simple table.
    template = '{:<16} ' * len(table.schema)

    # Print schema field names
    print(template.format(*[field.name for field in table.schema]))

    for row in rows:
        print(template.format(*row))
def list_datasets(project=None):
    """Lists all datasets in a given project.

    If no project is specified, then the currently active project is used
    """
    client = bigquery.Client(project=project)

    # Accumulate every page of dataset results before printing.
    found = []
    token = None
    while True:
        page, token = client.list_datasets(page_token=token)
        found.extend(page)
        if not token:
            break

    for dataset in found:
        print(dataset.name)
def _init_bigquery_dataset(self):
    """Creates the BigQuery dataset used as a logging-export destination
    and returns its resource URI; the dataset is registered for cleanup."""
    from gcloud import bigquery
    from gcloud.bigquery.dataset import AccessGrant
    DATASET_URI = 'bigquery.googleapis.com/projects/%s/datasets/%s' % (
        Config.CLIENT.project, DATASET_NAME,)

    # Create the destination dataset, and set up the ACL to allow
    # Stackdriver Logging to write into it.
    bigquery_client = bigquery.Client()
    dataset = bigquery_client.dataset(DATASET_NAME)
    dataset.create()
    # Register for deletion in the test's tearDown.
    self.to_delete.append(dataset)
    # Reload so access_grants reflects the server-side state.
    dataset.reload()
    grants = dataset.access_grants
    # NOTE: the grantee address below appears redacted in this copy.
    grants.append(
        AccessGrant('WRITER', 'groupByEmail', '*****@*****.**'))
    dataset.access_grants = grants
    dataset.update()
    return DATASET_URI
def sync_query(query):
    """Runs `query` synchronously and prints every result row."""
    client = bigquery.Client()
    results = client.run_sync_query(query)

    # Use standard SQL syntax for queries.
    # See: https://cloud.google.com/bigquery/sql-reference/
    results.use_legacy_sql = False
    results.run()

    # Drain the query results by requesting a page at a time.
    token = None
    while True:
        page, total_rows, token = results.fetch_data(
            max_results=10, page_token=token)
        for row in page:
            print(row)
        if not token:
            break
def list_tables(dataset_name, project=None):
    """Lists all of the tables in a given dataset.

    If no project is specified, then the currently active project is used.
    """
    client = bigquery.Client(project=project)
    dataset = client.dataset(dataset_name)

    # Guard clause: nothing to list if the dataset is missing.
    if not dataset.exists():
        print('Dataset {} does not exist.'.format(dataset_name))
        return

    # Accumulate every page of table results before printing.
    found = []
    token = None
    while True:
        page, token = dataset.list_tables(page_token=token)
        found.extend(page)
        if not token:
            break

    for table in found:
        print(table.name)
def create_table(dataset_name, table_name, project=None):
    """Creates a simple table in the given dataset.

    If no project is specified, then the currently active project is used.
    """
    client = bigquery.Client(project=project)
    dataset = client.dataset(dataset_name)

    # Guard clause: the dataset must already exist.
    if not dataset.exists():
        print('Dataset {} does not exist.'.format(dataset_name))
        return

    new_table = dataset.table(table_name)

    # Set the table schema
    new_table.schema = (
        bigquery.SchemaField('Name', 'STRING'),
        bigquery.SchemaField('Age', 'INTEGER'),
        bigquery.SchemaField('Weight', 'FLOAT'),
    )

    new_table.create()

    print('Created table {} in dataset {}.'.format(table_name, dataset_name))
def update_table_schema(destination_table, source_vcf, description=None):
    """Updates a BigQuery table with the variants schema using a VCF header.

    Field descriptions are taken from the VCF headers: the (non-fixed)
    variant field descriptions come from the ##INFO headers, and the
    (non-fixed) call field descriptions can come from the ##FORMAT headers
    as well as the ##INFO headers.

    Args:
      destination_table: BigQuery table name,
        PROJECT_ID.DATASET_NAME.TABLE_NAME.
      source_vcf: Path to local or remote (Cloud Storage) VCF or gzipped
        VCF file.
      description: Optional description for the BigQuery table.

    Raises:
      ValueError: If destination_table cannot be parsed.
    """
    dest_table = tokenize_table_name(destination_table)
    dest_project_id, dest_dataset_name, dest_table_name = dest_table

    # Load the source VCF to collect the header descriptions.
    descriptions = Descriptions()
    descriptions.add_from_vcf(source_vcf)

    # Initialize the BQ client.
    client = bigquery.Client(project=dest_project_id)

    # Load the destination table; reload() pulls down the current schema.
    dest_dataset = client.dataset(dest_dataset_name)
    dest_dataset.reload()
    dest_table = dest_dataset.table(dest_table_name)
    dest_table.reload()

    if description is not None:
        # Patch with a truncated description, then warn if it was truncated.
        dest_table.patch(description=description[:_MAX_LENGTH])
        if len(description) > _MAX_LENGTH:
            logging.warning(_TRUNCATION_WARNING, 'table description')

    # Process variant fields.
    call_field = None
    for field in dest_table.schema:
        if field.name.lower() in _FIXED_VARIANT_FIELDS:
            field.description = _FIXED_VARIANT_FIELDS[field.name.lower()]
            logging.debug('Variant(fixed): %s: %s',
                          field.name, field.description)
        elif field.name in descriptions.info_fields:
            field.description = descriptions.info_fields[field.name]
            logging.debug('Variant(INFO) %s: %s',
                          field.name, field.description)
        elif field.name.lower() == 'filter':
            field.description = descriptions.filter_description

        # Remember the 'call' record so its subfields can be processed below.
        if field.name == 'call':
            call_field = field

        if (field.description is not None and
                len(field.description) > _MAX_LENGTH):
            logging.warning(_TRUNCATION_WARNING, field.name)
            field.description = field.description[:_MAX_LENGTH]

    # Process call fields.
    # Bug fix: the original dereferenced call_field unconditionally, which
    # raised AttributeError when the schema had no 'call' field.
    if call_field is not None:
        for field in call_field.fields:
            if field.name.lower() in _FIXED_CALL_FIELDS:
                field.description = _FIXED_CALL_FIELDS[field.name.lower()]
                logging.debug('Call(fixed): %s: %s',
                              field.name, field.description)
            elif field.name in descriptions.format_fields:
                field.description = descriptions.format_fields[field.name]
                logging.debug('Call(FORMAT) %s: %s',
                              field.name, field.description)
            elif field.name in descriptions.info_fields:
                field.description = descriptions.info_fields[field.name]
                logging.debug('Call(INFO) %s: %s',
                              field.name, field.description)
            elif field.name.lower() == 'filter':
                field.description = descriptions.filter_description

            if (field.description is not None and
                    len(field.description) > _MAX_LENGTH):
                logging.warning(_TRUNCATION_WARNING, field.name)
                field.description = field.description[:_MAX_LENGTH]

    logging.info('Updating table %s', dest_table.path)
    dest_table.patch(schema=dest_table.schema)
TABLE_NAME = 'airport'
BUCKET_NAME = 'satish123'
FILE = 'airport.csv'

# NOTE(review): the original applied .format(BUCKET_NAME, FILE) to this URL,
# but the string contains no {} placeholders, so the call was a no-op and has
# been removed (value unchanged). A BigQuery load source is normally a
# 'gs://{}/{}' URI built from BUCKET_NAME and FILE — confirm intent.
SOURCE = 'https://storage.cloud.google.com/satish123/airport.csv?_ga=2.200274028.-331489596.1519587350&_gac=1.252996475.1519744301.CjwKCAiAoNTUBRBUEiwAWje2ltt6Onlm-oURmJ0zEqOD_dy_wmi_5yUsCdGXFro37ANM_5QjwIFk5RoC4PUQAvD_BwE'

# Destination table schema: four required STRING columns.
SCHEMA = [
    bq.SchemaField('name', 'STRING', mode='required'),
    bq.SchemaField('country', 'STRING', mode='required'),
    bq.SchemaField('area_code', 'STRING', mode='required'),
    bq.SchemaField('origin', 'STRING', mode='required')
]

# CREDENTIALS = GoogleCredentials.get_application_default()

client = bq.Client(project=BILLING_PROJECT_ID)


# Dataset
# Check if the dataset exists
def create_datasets(name):
    """Creates the named dataset if it does not already exist.

    The original used `assert` statements (swallowed by a blanket
    `except AssertionError: pass`) for control flow; asserts are stripped
    under `python -O`, so explicit exists() checks are used instead.
    """
    dataset = client.dataset(name)
    if not dataset.exists():
        dataset.create()
        print("Dataset {} created".format(name))
def setUpModule():
    """Points the shared test helpers at the system-test project and
    creates the single BigQuery client used by the tests in this module."""
    _helpers.PROJECT = TESTS_PROJECT
    Config.CLIENT = bigquery.Client()
* A query is run against the public dataset,
  bigquery-public-data.samples.natality, selecting only the data of
  interest to the regression, the output of which is stored in the
  “regression_input” table.
* The output table is moved over the wire to the user's default project
  via the built-in BigQuery Connector for Spark that bridges BigQuery
  and Cloud Dataproc.
"""

from gcloud import bigquery
from gcloud.bigquery import job
# NOTE(review): star import kept because later code relies on SchemaField
# (and possibly other names) coming from this module.
from gcloud.bigquery.table import *

# Create a new Google BigQuery client using Google Cloud Platform project
# defaults.
bq = bigquery.Client()

# Create a new BigQuery dataset.
reg_dataset = bq.dataset("natality_regression")
reg_dataset.create()

# In the new BigQuery dataset, create a new table.
table = reg_dataset.table(name="regression_input")

# The table needs a schema before it can be created and accept data.
# We create an ordered list of the columns using SchemaField objects.
schema = []
schema.append(SchemaField("weight_pounds", "float"))
schema.append(SchemaField("mother_age", "integer"))
schema.append(SchemaField("father_age", "integer"))
schema.append(SchemaField("gestation_weeks", "integer"))
schema.append(SchemaField("weight_gain_pounds", "integer"))
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import operator import time import unittest2 from gcloud import _helpers from gcloud.environment_vars import TESTS_PROJECT from gcloud import bigquery _helpers.PROJECT = TESTS_PROJECT CLIENT = bigquery.Client() DATASET_NAME = 'system_tests_%012d' % (1000 * time.time(),) class TestBigQuery(unittest2.TestCase): def setUp(self): self.to_delete = [] def tearDown(self): for doomed in self.to_delete: doomed.delete() def test_create_dataset(self): dataset = CLIENT.dataset(DATASET_NAME) self.assertFalse(dataset.exists())