from google.cloud.bigquery import DatasetReference, Table, TableReference


def make_table(
    project,
    dataset_id,
    table_id,
    friendly_name=None,
    description=None,
    expires=None,
    partitioning_type=None,
    view_use_legacy_sql=None,
    view_query=None,
    schema=None,
    labels=None,
):
    # Build a Table resource locally from its parts; no API call is made here.
    dataset_ref = DatasetReference(project, dataset_id)
    table_ref = TableReference(dataset_ref, table_id)
    table = Table(table_ref)
    table.friendly_name = friendly_name
    table.description = description
    table.expires = expires
    table.partitioning_type = partitioning_type
    if view_use_legacy_sql is not None:
        table.view_use_legacy_sql = view_use_legacy_sql
    if view_query is not None:
        table.view_query = view_query
    table.schema = schema
    if labels is not None:
        table.labels = labels
    return table
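A minimal usage sketch for make_table, assuming a google-cloud-bigquery Client; the project, dataset, table, and field names are placeholders:

from google.cloud.bigquery import Client, SchemaField

client = Client()
table = make_table(
    "my-project",   # hypothetical project id
    "my_dataset",   # hypothetical dataset id
    "events",       # hypothetical table id
    description="Example table built locally, then created via the API.",
    schema=[SchemaField("event_id", "STRING", mode="REQUIRED")],
)
client.create_table(table)  # materializes the table in BigQuery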
def create(self, table_id, schema):
    """Create a table in Google BigQuery given a table name and schema.

    Parameters
    ----------
    table_id : str
        Name of the table to be created
    schema : dict
        Use the generate_bq_schema to generate your table schema
        from a dataframe.
    """
    from google.cloud.bigquery import SchemaField
    from google.cloud.bigquery import Table

    if self.exists(table_id):
        raise TableCreationError(
            "Table {0} already exists".format(table_id)
        )

    if not _Dataset(self.project_id, credentials=self.credentials).exists(
        self.dataset_id
    ):
        _Dataset(
            self.project_id,
            credentials=self.credentials,
            location=self.location,
        ).create(self.dataset_id)

    table_ref = self.client.dataset(self.dataset_id).table(table_id)
    table = Table(table_ref)

    # Manually create the schema objects, adding NULLABLE mode
    # as a workaround for
    # https://github.com/GoogleCloudPlatform/google-cloud-python/issues/4456
    for field in schema["fields"]:
        if "mode" not in field:
            field["mode"] = "NULLABLE"

    table.schema = [
        SchemaField.from_api_repr(field) for field in schema["fields"]
    ]

    try:
        self.client.create_table(table)
    except self.http_error as ex:
        self.process_http_error(ex)
def create(self, table_id, schema):
    """Create a table in Google BigQuery given a table name and schema.

    Parameters
    ----------
    table_id : str
        Name of the table to be created
    schema : dict
        Use the generate_bq_schema to generate your table schema
        from a dataframe.
    """
    from google.cloud.bigquery import SchemaField
    from google.cloud.bigquery import Table

    if self.exists(table_id):
        raise TableCreationError(
            "Table {0} already exists".format(table_id)
        )

    if not _Dataset(self.project_id, credentials=self.credentials).exists(
        self.dataset_id
    ):
        _Dataset(
            self.project_id,
            credentials=self.credentials,
            location=self.location,
        ).create(self.dataset_id)

    table_ref = self.client.dataset(self.dataset_id).table(table_id)
    table = Table(table_ref)

    # Fill in a default NULLABLE mode for fields that do not specify one.
    schema = pandas_gbq.schema.add_default_nullable_mode(schema)

    table.schema = [
        SchemaField.from_api_repr(field) for field in schema["fields"]
    ]

    try:
        self.client.create_table(table)
    except self.http_error as ex:
        self.process_http_error(ex)
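The add_default_nullable_mode call above presumably encapsulates the manual loop from the previous variant; a minimal sketch of an equivalent helper, assuming the same schema-dict shape:

import copy


def add_default_nullable_mode(schema):
    # Sketch: copy the schema dict so the caller's copy is not mutated,
    # then default each field's mode to NULLABLE, as in the manual loop.
    result = copy.deepcopy(schema)
    for field in result["fields"]:
        field.setdefault("mode", "NULLABLE")
    return result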
def create(self, table_id, schema):
    """Create a table in Google BigQuery given a table name and schema.

    Parameters
    ----------
    table_id : str
        Name of the table to be created
    schema : dict
        Use the generate_bq_schema to generate your table schema
        from a dataframe.
    """
    from google.cloud.bigquery import SchemaField
    from google.cloud.bigquery import Table

    if self.exists(table_id):
        raise TableCreationError(
            "Table {0} already exists".format(table_id)
        )

    if not _Dataset(self.project_id, private_key=self.private_key).exists(
        self.dataset_id
    ):
        _Dataset(self.project_id, private_key=self.private_key).create(
            self.dataset_id
        )

    table_ref = self.client.dataset(self.dataset_id).table(table_id)
    table = Table(table_ref)

    # Default missing field modes to NULLABLE before building SchemaField
    # objects from the API representation.
    for field in schema['fields']:
        if 'mode' not in field:
            field['mode'] = 'NULLABLE'

    table.schema = [
        SchemaField.from_api_repr(field) for field in schema['fields']
    ]

    try:
        self.client.create_table(table)
    except self.http_error as ex:
        self.process_http_error(ex)
def create(self, table_id, schema):
    """Create a table in Google BigQuery given a table name and schema.

    Parameters
    ----------
    table_id : str
        Name of the table to be created
    schema : dict
        Use the generate_bq_schema to generate your table schema
        from a dataframe.
    """
    from google.cloud.bigquery import DatasetReference
    from google.cloud.bigquery import Table
    from google.cloud.bigquery import TableReference

    if self.exists(table_id):
        raise TableCreationError(
            "Table {0} already exists".format(table_id)
        )

    if not _Dataset(self.project_id, credentials=self.credentials).exists(
        self.dataset_id
    ):
        _Dataset(
            self.project_id,
            credentials=self.credentials,
            location=self.location,
        ).create(self.dataset_id)

    table_ref = TableReference(
        DatasetReference(self.project_id, self.dataset_id), table_id
    )
    table = Table(table_ref)
    table.schema = pandas_gbq.schema.to_google_cloud_bigquery(schema)

    try:
        self.client.create_table(table)
    except self.http_error as ex:
        self.process_http_error(ex)
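A sketch of the conversion that pandas_gbq.schema.to_google_cloud_bigquery delegates to here, assuming the same schema-dict shape and the add_default_nullable_mode helper sketched earlier:

from google.cloud.bigquery import SchemaField


def to_google_cloud_bigquery(schema):
    # Sketch: default missing modes first (SchemaField.from_api_repr expects
    # one), then build client-library SchemaField objects from each field.
    schema = add_default_nullable_mode(schema)
    return [SchemaField.from_api_repr(field) for field in schema["fields"]]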
from google.api_core.exceptions import NotFound
from google.cloud.bigquery import SchemaField, Table

print(dataset_pointz)

_transactions_ref = dataset_pointz.table('transactions')
try:
    transactions_table = client.get_table(_transactions_ref)
except NotFound:
    # Table does not exist yet; create it with an explicit schema.
    transactions_table = Table(_transactions_ref)
    _schema = [
        SchemaField('id', 'INT64', 'REQUIRED', None),
        SchemaField('sale', 'INT64', 'REQUIRED', None),
        SchemaField('pointz_sale', 'INT64', 'REQUIRED', None),
        SchemaField('pointz', 'INT64', 'REQUIRED', None),
        SchemaField('year', 'INT64', 'REQUIRED', None),
        SchemaField('month', 'INT64', 'REQUIRED', None),
        SchemaField('day', 'INT64', 'REQUIRED', None),
        SchemaField('store_name', 'STRING', 'REQUIRED', None),
        SchemaField('store_id', 'INT64', 'REQUIRED', None),
        SchemaField('region_name', 'STRING', 'REQUIRED', None),
        SchemaField('region_id', 'INT64', 'REQUIRED', None),
        SchemaField('partner_name', 'STRING', 'REQUIRED', None),
        SchemaField('partner_id', 'INT64', 'REQUIRED', None),
        SchemaField('segment_name', 'STRING', 'REQUIRED', None),
    ]
    transactions_table.schema = _schema
    transactions_table = client.create_table(transactions_table)
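The try/except above is a get-or-create pattern; a reusable sketch of the same idea, assuming a google-cloud-bigquery Client and a list of SchemaField objects:

from google.api_core.exceptions import NotFound
from google.cloud.bigquery import Client, Table, TableReference


def get_or_create_table(client, table_ref, schema):
    # Return the table if it already exists, else create it with the schema.
    try:
        return client.get_table(table_ref)
    except NotFound:
        return client.create_table(Table(table_ref, schema=schema))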
def enrich_task():
    # Defined inside a DAG factory: task, dags_folder, dataset_name_temp,
    # time_partitioning_field, environment, dataset_name, and
    # destination_dataset_project_id come from the enclosing scope.
    client = Client()

    # Need to use a temporary table because bq query sets field modes to
    # NULLABLE and descriptions to null when writeDisposition is
    # WRITE_TRUNCATE

    # Create a temporary table
    temp_table_name = '{task}_{milliseconds}'.format(
        task=task, milliseconds=int(round(time.time() * 1000)))
    temp_table_ref = client.dataset(dataset_name_temp).table(temp_table_name)
    table = Table(temp_table_ref)

    description_path = os.path.join(
        dags_folder,
        'resources/stages/enrich/descriptions/{task}.txt'.format(task=task))
    table.description = read_file(description_path)
    if time_partitioning_field is not None:
        table.time_partitioning = TimePartitioning(
            field=time_partitioning_field)
    logging.info('Creating table: ' + json.dumps(table.to_api_repr()))

    schema_path = os.path.join(
        dags_folder,
        'resources/stages/enrich/schemas/{task}.json'.format(task=task))
    schema = read_bigquery_schema_from_file(schema_path)
    table.schema = schema

    table = client.create_table(table)
    assert table.table_id == temp_table_name

    # Query from raw to temporary table
    query_job_config = QueryJobConfig()
    # Finishes faster, query limit for concurrent interactive queries is 50
    query_job_config.priority = QueryPriority.INTERACTIVE
    query_job_config.destination = temp_table_ref
    sql_path = os.path.join(
        dags_folder,
        'resources/stages/enrich/sqls/{task}.sql'.format(task=task))
    sql = read_file(sql_path, environment)
    query_job = client.query(sql, location='US', job_config=query_job_config)
    submit_bigquery_job(query_job, query_job_config)
    assert query_job.state == 'DONE'

    # Copy temporary table to destination
    copy_job_config = CopyJobConfig()
    copy_job_config.write_disposition = 'WRITE_TRUNCATE'

    dest_table_name = '{task}'.format(task=task)
    dest_table_ref = client.dataset(
        dataset_name,
        project=destination_dataset_project_id).table(dest_table_name)
    copy_job = client.copy_table(
        temp_table_ref, dest_table_ref, location='US',
        job_config=copy_job_config)
    submit_bigquery_job(copy_job, copy_job_config)
    assert copy_job.state == 'DONE'

    # Delete temp table
    client.delete_table(temp_table_ref)
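submit_bigquery_job is a project-level helper that is not shown here; a hypothetical minimal sketch that blocks until the job completes and surfaces errors on failure:

import logging


def submit_bigquery_job(job, job_config):
    # Hypothetical sketch of the helper used above: wait for the BigQuery
    # job to finish, logging the job's errors if it fails.
    try:
        return job.result()  # blocks until the job completes
    except Exception:
        logging.error('BigQuery job failed: %s', job.errors)
        raise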
def enrich_task(ds, **kwargs):
    # Airflow-templated variant: task, dags_folder, dataset_name_temp,
    # time_partitioning_field, environment, load_all_partitions,
    # dataset_name, and destination_dataset_project_id come from the
    # enclosing scope.
    template_context = kwargs.copy()
    template_context['ds'] = ds
    template_context['params'] = environment

    client = Client()

    # Need to use a temporary table because bq query sets field modes to
    # NULLABLE and descriptions to null when writeDisposition is
    # WRITE_TRUNCATE

    # Create a temporary table
    temp_table_name = '{task}_{milliseconds}'.format(
        task=task, milliseconds=int(round(time.time() * 1000)))
    temp_table_ref = client.dataset(dataset_name_temp).table(temp_table_name)
    table = Table(temp_table_ref)

    description_path = os.path.join(
        dags_folder,
        'resources/stages/enrich/descriptions/{task}.txt'.format(task=task))
    table.description = read_file(description_path)
    table.time_partitioning = TimePartitioning(field=time_partitioning_field)
    logging.info('Creating table: ' + json.dumps(table.to_api_repr()))

    schema_path = os.path.join(
        dags_folder,
        'resources/stages/enrich/schemas/{task}.json'.format(task=task))
    schema = read_bigquery_schema_from_file(schema_path)
    table.schema = schema

    table = client.create_table(table)
    assert table.table_id == temp_table_name

    # Query from raw to temporary table
    query_job_config = QueryJobConfig()
    # Finishes faster, query limit for concurrent interactive queries is 50
    query_job_config.priority = QueryPriority.INTERACTIVE
    query_job_config.destination = temp_table_ref
    sql_path = os.path.join(
        dags_folder,
        'resources/stages/enrich/sqls/{task}.sql'.format(task=task))
    sql_template = read_file(sql_path)
    sql = kwargs['task'].render_template('', sql_template, template_context)
    print('Enrichment sql:')
    print(sql)
    query_job = client.query(sql, location='US', job_config=query_job_config)
    submit_bigquery_job(query_job, query_job_config)
    assert query_job.state == 'DONE'

    if load_all_partitions:
        # Copy temporary table to destination
        copy_job_config = CopyJobConfig()
        copy_job_config.write_disposition = 'WRITE_TRUNCATE'

        dest_table_name = '{task}'.format(task=task)
        dest_table_ref = client.dataset(
            dataset_name,
            project=destination_dataset_project_id).table(dest_table_name)
        copy_job = client.copy_table(
            temp_table_ref, dest_table_ref, location='US',
            job_config=copy_job_config)
        submit_bigquery_job(copy_job, copy_job_config)
        assert copy_job.state == 'DONE'
    else:
        # Merge into the destination instead of overwriting it.
        # https://cloud.google.com/bigquery/docs/reference/standard-sql/dml-syntax#merge_statement
        merge_job_config = QueryJobConfig()
        # Finishes faster, query limit for concurrent interactive queries is 50
        merge_job_config.priority = QueryPriority.INTERACTIVE

        merge_sql_path = os.path.join(
            dags_folder,
            'resources/stages/enrich/sqls/merge_{task}.sql'.format(task=task))
        merge_sql_template = read_file(merge_sql_path)
        template_context['params']['source_table'] = temp_table_name
        merge_sql = kwargs['task'].render_template(
            '', merge_sql_template, template_context)
        print('Merge sql:')
        print(merge_sql)
        merge_job = client.query(
            merge_sql, location='US', job_config=merge_job_config)
        submit_bigquery_job(merge_job, merge_job_config)
        assert merge_job.state == 'DONE'

    # Delete temp table
    client.delete_table(temp_table_ref)
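A hedged illustration of the kind of statement the merge_{task}.sql template might render; the table and column names below are placeholders, not the project's actual templates:

# Hypothetical MERGE statement in the spirit of the template above; all
# identifiers are placeholders for illustration only.
MERGE_SQL_EXAMPLE = '''
MERGE `dest_project.dest_dataset.events` AS dest
USING `temp_dataset.events_1234567890` AS source
ON dest.id = source.id
WHEN MATCHED THEN
    UPDATE SET value = source.value
WHEN NOT MATCHED THEN
    INSERT (id, value) VALUES (source.id, source.value)
'''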