def execute(self, context):
    self.log.info(
        'Executing extract of %s into: %s',
        self.source_project_dataset_table,
        self.destination_cloud_storage_uris,
    )
    hook = BigQueryHook(
        bigquery_conn_id=self.gcp_conn_id,
        delegate_to=self.delegate_to,
        location=self.location,
    )
    conn = hook.get_conn()
    cursor = conn.cursor()
    cursor.run_extract(
        source_project_dataset_table=self.source_project_dataset_table,
        destination_cloud_storage_uris=self.destination_cloud_storage_uris,
        compression=self.compression,
        export_format=self.export_format,
        field_delimiter=self.field_delimiter,
        print_header=self.print_header,
        labels=self.labels,
    )
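# Illustrative only: a minimal sketch of how an execute() like the one above is
# typically driven from a DAG, assuming it belongs to an operator such as
# BigQueryToGCSOperator (class name and import path are assumptions, not stated in
# the snippet; the keyword arguments mirror the self.* attributes the method reads,
# and the table/bucket names are made-up examples).
from datetime import datetime

from airflow import DAG
from airflow.providers.google.cloud.transfers.bigquery_to_gcs import BigQueryToGCSOperator

with DAG(dag_id='example_bq_extract', start_date=datetime(2021, 1, 1), schedule_interval=None) as dag:
    extract = BigQueryToGCSOperator(
        task_id='bq_to_gcs',
        source_project_dataset_table='my-project.my_dataset.my_table',        # assumed example table
        destination_cloud_storage_uris=['gs://my-bucket/export/data-*.csv'],  # assumed example bucket
        export_format='CSV',
        compression='NONE',
        field_delimiter=',',
        print_header=True,
    )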
def execute(self, context):
    self.log.info(
        'Executing copy of %s into: %s',
        self.source_project_dataset_tables,
        self.destination_project_dataset_table,
    )
    hook = BigQueryHook(
        bigquery_conn_id=self.gcp_conn_id,
        delegate_to=self.delegate_to,
        location=self.location,
    )
    conn = hook.get_conn()
    cursor = conn.cursor()
    cursor.run_copy(
        source_project_dataset_tables=self.source_project_dataset_tables,
        destination_project_dataset_table=self.destination_project_dataset_table,
        write_disposition=self.write_disposition,
        create_disposition=self.create_disposition,
        labels=self.labels,
        encryption_configuration=self.encryption_configuration,
    )
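# Illustrative only: a hedged usage sketch for the copy execute() above, assuming it
# lives in an operator like BigQueryToBigQueryOperator (class name and import path
# assumed); the keyword arguments follow the self.* attributes passed to run_copy(),
# and the table identifiers are made-up examples.
from airflow.providers.google.cloud.transfers.bigquery_to_bigquery import BigQueryToBigQueryOperator

copy_table = BigQueryToBigQueryOperator(
    task_id='bq_copy',
    source_project_dataset_tables=['my-project.my_dataset.source_table'],   # assumed example
    destination_project_dataset_table='my-project.my_dataset.dest_table',   # assumed example
    write_disposition='WRITE_TRUNCATE',
    create_disposition='CREATE_IF_NEEDED',
)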
def _bq_get_data(self):
    self.log.info('Fetching Data from:')
    self.log.info('Dataset: %s ; Table: %s', self.dataset_id, self.table_id)
    hook = BigQueryHook(
        bigquery_conn_id=self.gcp_conn_id,
        delegate_to=self.delegate_to,
        location=self.location,
        impersonation_chain=self.impersonation_chain,
    )
    conn = hook.get_conn()
    cursor = conn.cursor()
    i = 0
    while True:
        # Page through the table in batches of `batch_size` rows.
        response = cursor.get_tabledata(
            dataset_id=self.dataset_id,
            table_id=self.table_id,
            max_results=self.batch_size,
            selected_fields=self.selected_fields,
            start_index=i * self.batch_size,
        )
        if 'rows' in response:
            rows = response['rows']
        else:
            # No more rows: the previous page was the last one.
            self.log.info('Job Finished')
            return
        self.log.info('Total Extracted rows: %s', len(rows) + i * self.batch_size)
        # Flatten the tabledata response ({'f': [{'v': ...}, ...]}) into plain lists of cell values.
        table_data = []
        for dict_row in rows:
            single_row = []
            for fields in dict_row['f']:
                single_row.append(fields['v'])
            table_data.append(single_row)
        yield table_data
        i += 1
def _bq_get_data(self):
    hook = BigQueryHook(
        bigquery_conn_id=self.gcp_conn_id,
        delegate_to=self.delegate_to,
        location=self.location,
        impersonation_chain=self.impersonation_chain,
    )
    # Parse the fully qualified "project.dataset.table" identifier into its components.
    table_ref = TableReference.from_string(self.source_project_dataset_table)
    self.log.info('Fetching Data from:')
    self.log.info('Dataset: %s, Table: %s', table_ref.dataset_id, table_ref.table_id)
    conn = hook.get_conn()
    cursor = conn.cursor()
    i = 0
    while True:
        # Page through the table in batches of `batch_size` rows.
        response = cursor.get_tabledata(
            dataset_id=table_ref.dataset_id,
            table_id=table_ref.table_id,
            max_results=self.batch_size,
            selected_fields=self.selected_fields,
            start_index=i * self.batch_size,
        )
        if 'rows' not in response:
            self.log.info('Job Finished')
            return
        rows = response['rows']
        self.log.info('Total Extracted rows: %s', len(rows) + i * self.batch_size)
        # Flatten the tabledata response into a list of rows, each row a list of cell values.
        table_data = [[fields['v'] for fields in dict_row['f']] for dict_row in rows]
        yield table_data
        i += 1
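# A small sketch of the TableReference parsing used above (google-cloud-bigquery):
# from_string() splits a "project.dataset.table" identifier into its components.
# The table name is a made-up example.
from google.cloud.bigquery.table import TableReference

ref = TableReference.from_string('my-project.my_dataset.my_table')
print(ref.project, ref.dataset_id, ref.table_id)  # my-project my_dataset my_table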
def execute(self, context):
    bq_hook = BigQueryHook(
        bigquery_conn_id=self.bigquery_conn_id,
        delegate_to=self.delegate_to,
        location=self.location,
        impersonation_chain=self.impersonation_chain,
    )
    # Resolve the schema: explicit schema_fields, a schema object stored in GCS, or autodetect.
    if not self.schema_fields:
        if self.schema_object and self.source_format != 'DATASTORE_BACKUP':
            gcs_hook = GCSHook(
                gcp_conn_id=self.google_cloud_storage_conn_id,
                delegate_to=self.delegate_to,
                impersonation_chain=self.impersonation_chain,
            )
            blob = gcs_hook.download(
                bucket_name=self.bucket,
                object_name=self.schema_object,
            )
            schema_fields = json.loads(blob.decode("utf-8"))
        elif self.schema_object is None and self.autodetect is False:
            raise AirflowException(
                'At least one of `schema_fields`, `schema_object`, or `autodetect` must be passed.'
            )
        else:
            schema_fields = None
    else:
        schema_fields = self.schema_fields

    source_uris = [f'gs://{self.bucket}/{source_object}' for source_object in self.source_objects]
    conn = bq_hook.get_conn()
    cursor = conn.cursor()

    if self.external_table:
        # Define an external table over the GCS files instead of loading them into BigQuery storage.
        cursor.create_external_table(
            external_project_dataset_table=self.destination_project_dataset_table,
            schema_fields=schema_fields,
            source_uris=source_uris,
            source_format=self.source_format,
            compression=self.compression,
            skip_leading_rows=self.skip_leading_rows,
            field_delimiter=self.field_delimiter,
            max_bad_records=self.max_bad_records,
            quote_character=self.quote_character,
            ignore_unknown_values=self.ignore_unknown_values,
            allow_quoted_newlines=self.allow_quoted_newlines,
            allow_jagged_rows=self.allow_jagged_rows,
            encoding=self.encoding,
            src_fmt_configs=self.src_fmt_configs,
            encryption_configuration=self.encryption_configuration,
            labels=self.labels,
            description=self.description,
        )
    else:
        # Run a load job from GCS into the destination table.
        cursor.run_load(
            destination_project_dataset_table=self.destination_project_dataset_table,
            schema_fields=schema_fields,
            source_uris=source_uris,
            source_format=self.source_format,
            autodetect=self.autodetect,
            create_disposition=self.create_disposition,
            skip_leading_rows=self.skip_leading_rows,
            write_disposition=self.write_disposition,
            field_delimiter=self.field_delimiter,
            max_bad_records=self.max_bad_records,
            quote_character=self.quote_character,
            ignore_unknown_values=self.ignore_unknown_values,
            allow_quoted_newlines=self.allow_quoted_newlines,
            allow_jagged_rows=self.allow_jagged_rows,
            encoding=self.encoding,
            schema_update_options=self.schema_update_options,
            src_fmt_configs=self.src_fmt_configs,
            time_partitioning=self.time_partitioning,
            cluster_fields=self.cluster_fields,
            encryption_configuration=self.encryption_configuration,
            labels=self.labels,
            description=self.description,
        )

    # Escape the destination table name according to the SQL dialect in use.
    if cursor.use_legacy_sql:
        escaped_table_name = f'[{self.destination_project_dataset_table}]'
    else:
        escaped_table_name = f'`{self.destination_project_dataset_table}`'

    if self.max_id_key:
        # Log the maximum value of the configured key column after the load.
        cursor.execute(f'SELECT MAX({self.max_id_key}) FROM {escaped_table_name}')
        row = cursor.fetchone()
        max_id = row[0] if row[0] else 0
        self.log.info(
            'Loaded BQ data with max %s.%s=%s',
            self.destination_project_dataset_table,
            self.max_id_key,
            max_id,
        )
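# Illustrative only: a minimal task-level sketch of the load execute() above, assuming
# it belongs to an operator like GCSToBigQueryOperator (class name and import path are
# assumptions); the arguments mirror the self.* attributes the method reads, and the
# bucket/object/table names are made-up examples.
from airflow.providers.google.cloud.transfers.gcs_to_bigquery import GCSToBigQueryOperator

load_csv = GCSToBigQueryOperator(
    task_id='gcs_to_bq',
    bucket='my-bucket',                                                    # assumed example bucket
    source_objects=['data/part-*.csv'],                                    # assumed example objects
    destination_project_dataset_table='my-project.my_dataset.my_table',   # assumed example table
    source_format='CSV',
    skip_leading_rows=1,
    autodetect=True,                    # let BigQuery infer the schema, as in the else-branch above
    write_disposition='WRITE_TRUNCATE',
    max_id_key='id',                    # logs MAX(id) after the load, mirroring the final block above
)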