def execute(self, context):
    for i in range(len(self.source_project_dataset_table)):
        try:
            self.log.info('Executing %d/%d extracts', i + 1,
                          len(self.source_project_dataset_table))
            self.log.info('Executing extract of %s into: %s',
                          self.source_project_dataset_table[i],
                          self.destination_cloud_storage_uris[i])
            hook = BigQueryHook(bigquery_conn_id=self.bigquery_conn_id,
                                delegate_to=self.delegate_to)
            conn = hook.get_conn()
            cursor = conn.cursor()
            cursor.run_extract(
                self.source_project_dataset_table[i],
                self.destination_cloud_storage_uris[i],
                self.compression,
                self.export_format,
                self.field_delimiter,
                self.print_header,
                self.labels)
        except Exception as e:
            self.log.error('Exception: %s', e)
            self.log.info('Waiting %d seconds before retry', self.lazy_retry_wait)
            time.sleep(self.lazy_retry_wait)
            hook = BigQueryHook(bigquery_conn_id=self.bigquery_conn_id,
                                delegate_to=self.delegate_to)
            conn = hook.get_conn()
            cursor = conn.cursor()
            cursor.run_extract(
                self.source_project_dataset_table[i],
                self.destination_cloud_storage_uris[i],
                self.compression,
                self.export_format,
                self.field_delimiter,
                self.print_header,
                self.labels)
def execute(self, context):
    bq_hook = BigQueryHook(bigquery_conn_id=self.bigquery_conn_id)

    # Delete the table if it already exists
    full_table_name = '%s.%s.%s' % (self.project_id, self.dataset_id, self.table_id)
    bq_hook.get_conn().cursor().run_table_delete(full_table_name,
                                                 ignore_if_missing=True)
def count_rows(project, dataset, table):
    hook = BigQueryHook()
    conn = hook.get_conn()
    cursor = conn.cursor()
    # use the function arguments rather than a module-level constant
    cursor.execute(f"SELECT COUNT(*) FROM `{project}.{dataset}.{table}`")
    res = cursor.fetchone()
    return res[0]
def _load_bq_cursor(self):
    if self.bq_cursor is None:
        hook = BigQueryHook(bigquery_conn_id=self.bigquery_conn_id,
                            use_legacy_sql=self.use_legacy_sql,
                            delegate_to=self.delegate_to)
        conn = hook.get_conn()
        self.bq_cursor = conn.cursor()
def dry_run_bql(task):
    """
    Call the BigQuery dry run API for the rendered query.

    :param task: BigQueryOperator task whose query needs to be rendered
    :type task: BigQueryOperator
    :return: query reply from the API
    :rtype: json
    """
    query = getattr(task, 'bql')
    if query is None:
        query = getattr(task, 'sql')
    hook = BigQueryHook(bigquery_conn_id=task.bigquery_conn_id,
                        delegate_to=task.delegate_to)
    conn = hook.get_conn()
    cursor = conn.cursor()
    job_data = {
        'configuration': {
            'dryRun': True,
            'query': {
                'query': query,
                'useLegacySql': task.use_legacy_sql,
                'maximumBillingTier': task.maximum_billing_tier
            }
        }
    }
    jobs = cursor.service.jobs()
    query_reply = jobs \
        .insert(projectId=cursor.project_id, body=job_data) \
        .execute()
    return query_reply
def execute(self, context):
    self.log.info('Deleting: %s', self.deletion_dataset_table)
    hook = BigQueryHook(bigquery_conn_id=self.bigquery_conn_id,
                        delegate_to=self.delegate_to)
    conn = hook.get_conn()
    cursor = conn.cursor()
    cursor.run_table_delete(self.deletion_dataset_table, self.ignore_if_missing)
def execute(self, context):
    if self.bq_cursor is None:
        self.log.info('Executing: %s', self.sql)
        hook = BigQueryHook(bigquery_conn_id=self.bigquery_conn_id,
                            use_legacy_sql=self.use_legacy_sql,
                            delegate_to=self.delegate_to)
        conn = hook.get_conn()
        self.bq_cursor = conn.cursor()
    self.bq_cursor.run_query(
        self.sql,
        destination_dataset_table=self.destination_dataset_table,
        write_disposition=self.write_disposition,
        allow_large_results=self.allow_large_results,
        flatten_results=self.flatten_results,
        udf_config=self.udf_config,
        maximum_billing_tier=self.maximum_billing_tier,
        maximum_bytes_billed=self.maximum_bytes_billed,
        create_disposition=self.create_disposition,
        query_params=self.query_params,
        labels=self.labels,
        schema_update_options=self.schema_update_options,
        priority=self.priority,
        time_partitioning=self.time_partitioning,
        api_resource_configs=self.api_resource_configs,
        cluster_fields=self.cluster_fields,
    )
def execute(self, context):
    bq_hook = BigQueryHook(bigquery_conn_id=self.bigquery_conn_id,
                           delegate_to=self.delegate_to)

    if not self.schema_fields and self.gcs_schema_object:
        gcs_bucket, gcs_object = _parse_gcs_url(self.gcs_schema_object)
        gcs_hook = GoogleCloudStorageHook(
            google_cloud_storage_conn_id=self.google_cloud_storage_conn_id,
            delegate_to=self.delegate_to)
        schema_fields = json.loads(gcs_hook.download(
            gcs_bucket, gcs_object).decode("utf-8"))
    else:
        schema_fields = self.schema_fields

    conn = bq_hook.get_conn()
    cursor = conn.cursor()

    cursor.create_empty_table(
        project_id=self.project_id,
        dataset_id=self.dataset_id,
        table_id=self.table_id,
        schema_fields=schema_fields,
        time_partitioning=self.time_partitioning
    )
def _run_bq_query(self, context):
    self.log.info('Running BigQuery query: %s', self.sql)
    if self.bq_cursor is None:
        hook = BigQueryHook(bigquery_conn_id=self.bigquery_conn_id,
                            use_legacy_sql=self.use_legacy_sql,
                            delegate_to=self.delegate_to,
                            location=None)
        conn = hook.get_conn()
        self.bq_cursor = conn.cursor()
    job_id = self.bq_cursor.run_query(
        sql=self.sql,
        destination_dataset_table=self.tmp_dataset_table,
        write_disposition='WRITE_TRUNCATE',
        allow_large_results=self.allow_large_results,
        flatten_results=None,
        udf_config=None,
        maximum_billing_tier=None,
        maximum_bytes_billed=None,
        create_disposition='CREATE_IF_NEEDED',
        query_params=None,
        labels=self.labels,
        schema_update_options=(),
        priority='INTERACTIVE',
        time_partitioning=None,
        api_resource_configs=None,
        cluster_fields=None,
    )
    context['task_instance'].xcom_push(key='job_id', value=job_id)
def execute(self, context):
    if self.dataset:
        raw_tables = [
            f"{self.dataset}.{tbl}" for tbl in self.dst_table_names
        ]
    else:
        raw_tables = self.dst_table_names

    dst_table_names = [format_table_name(x) for x in raw_tables]
    src_table_names = [
        format_table_name(x, is_staging=True) for x in raw_tables
    ]

    bq_hook = BigQueryHook(bigquery_conn_id=self.bigquery_conn_id)
    conn = bq_hook.get_conn()
    cursor = conn.cursor()

    for src, dst in zip(src_table_names, dst_table_names):
        cursor.run_copy(src, dst, write_disposition=self.write_disposition)

    # once all tables are moved, delete the staging tables
    for src in src_table_names:
        cursor.run_table_delete(src)

    return dst_table_names
def execute(self, context):
    bq_hook = BigQueryHook(bigquery_conn_id=self.bigquery_conn_id,
                           delegate_to=self.delegate_to)

    if not self.schema_fields and self.schema_object \
            and self.source_format != 'DATASTORE_BACKUP':
        gcs_hook = GoogleCloudStorageHook(
            google_cloud_storage_conn_id=self.google_cloud_storage_conn_id,
            delegate_to=self.delegate_to)
        schema_fields = json.loads(gcs_hook.download(
            self.bucket, self.schema_object).decode("utf-8"))
    else:
        schema_fields = self.schema_fields

    source_uris = ['gs://{}/{}'.format(self.bucket, source_object)
                   for source_object in self.source_objects]
    conn = bq_hook.get_conn()
    cursor = conn.cursor()

    cursor.create_external_table(
        external_project_dataset_table=self.destination_project_dataset_table,
        schema_fields=schema_fields,
        source_uris=source_uris,
        source_format=self.source_format,
        compression=self.compression,
        skip_leading_rows=self.skip_leading_rows,
        field_delimiter=self.field_delimiter,
        max_bad_records=self.max_bad_records,
        quote_character=self.quote_character,
        allow_quoted_newlines=self.allow_quoted_newlines,
        allow_jagged_rows=self.allow_jagged_rows,
        src_fmt_configs=self.src_fmt_configs
    )
def execute(self, context):
    bq_hook = BigQueryHook(bigquery_conn_id=self.bigquery_conn_id,
                           delegate_to=self.delegate_to)

    if not self.schema_fields and self.schema_object \
            and self.source_format != 'DATASTORE_BACKUP':
        gcs_hook = GoogleCloudStorageHook(
            google_cloud_storage_conn_id=self.google_cloud_storage_conn_id,
            delegate_to=self.delegate_to)
        schema_fields = json.loads(gcs_hook.download(
            self.bucket, self.schema_object).decode("utf-8"))
    else:
        schema_fields = self.schema_fields

    source_uris = ['gs://{}/{}'.format(self.bucket, source_object)
                   for source_object in self.source_objects]
    conn = bq_hook.get_conn()
    cursor = conn.cursor()

    cursor.create_external_table(
        external_project_dataset_table=self.destination_project_dataset_table,
        schema_fields=schema_fields,
        source_uris=source_uris,
        source_format=self.source_format,
        compression=self.compression,
        skip_leading_rows=self.skip_leading_rows,
        field_delimiter=self.field_delimiter,
        max_bad_records=self.max_bad_records,
        quote_character=self.quote_character,
        allow_quoted_newlines=self.allow_quoted_newlines,
        allow_jagged_rows=self.allow_jagged_rows,
        src_fmt_configs=self.src_fmt_configs,
        labels=self.labels
    )
def execute(self, context):
    logging.info('Deleting: %s', self.deletion_dataset_table)
    hook = BigQueryHook(bigquery_conn_id=self.bigquery_conn_id,
                        delegate_to=self.delegate_to)
    conn = hook.get_conn()
    cursor = conn.cursor()
    cursor.run_table_delete(self.deletion_dataset_table, self.ignore_if_missing)
def execute(self, context):
    self.log.info('Fetching Data from:')
    self.log.info('Dataset: %s ; Table: %s ; Max Results: %s',
                  self.dataset_id, self.table_id, self.max_results)

    hook = BigQueryHook(bigquery_conn_id=self.bigquery_conn_id,
                        delegate_to=self.delegate_to)

    conn = hook.get_conn()
    cursor = conn.cursor()
    response = cursor.get_tabledata(dataset_id=self.dataset_id,
                                    table_id=self.table_id,
                                    max_results=self.max_results,
                                    selected_fields=self.selected_fields)

    self.log.info('Total Extracted rows: %s', response['totalRows'])
    rows = response['rows']

    table_data = []
    for dict_row in rows:
        single_row = []
        for fields in dict_row['f']:
            single_row.append(fields['v'])
        table_data.append(single_row)

    return table_data
def _bq_get_data(self):
    self.log.info('Fetching Data from:')
    self.log.info('Dataset: %s ; Table: %s', self.dataset_id, self.table_id)

    hook = BigQueryHook(bigquery_conn_id=self.gcp_conn_id,
                        delegate_to=self.delegate_to)

    conn = hook.get_conn()
    cursor = conn.cursor()
    i = 0
    while True:
        response = cursor.get_tabledata(dataset_id=self.dataset_id,
                                        table_id=self.table_id,
                                        max_results=self.batch_size,
                                        selected_fields=self.selected_fields,
                                        start_index=i * self.batch_size)

        if 'rows' in response:
            rows = response['rows']
        else:
            self.log.info('Job Finished')
            return

        self.log.info('Total Extracted rows: %s',
                      len(rows) + i * self.batch_size)

        table_data = []
        for dict_row in rows:
            single_row = []
            for fields in dict_row['f']:
                single_row.append(fields['v'])
            table_data.append(single_row)

        yield table_data
        i += 1
def execute(self, context):
    gcs_hook = GoogleCloudStorageHook(
        google_cloud_storage_conn_id=self.google_cloud_storage_conn_id,
        delegate_to=self.delegate_to)
    bq_hook = BigQueryHook(bigquery_conn_id=self.bigquery_conn_id,
                           delegate_to=self.delegate_to)

    schema_fields = self.schema_fields if self.schema_fields else json.loads(
        gcs_hook.download(self.bucket, self.schema_object))
    source_uris = map(
        lambda schema_object: 'gs://{}/{}'.format(self.bucket, schema_object),
        self.source_objects)
    conn = bq_hook.get_conn()
    cursor = conn.cursor()
    cursor.run_load(
        destination_dataset_table=self.destination_dataset_table,
        schema_fields=schema_fields,
        source_uris=source_uris,
        source_format=self.source_format,
        create_disposition=self.create_disposition,
        skip_leading_rows=self.skip_leading_rows,
        write_disposition=self.write_disposition,
        field_delimiter=self.field_delimiter)

    if self.max_id_key:
        cursor.execute('SELECT MAX({}) FROM {}'.format(
            self.max_id_key, self.destination_dataset_table))
        row = cursor.fetchone()
        max_id = row[0] if row[0] else 0
        logging.info('Loaded BQ data with max {}.{}={}'.format(
            self.destination_dataset_table, self.max_id_key, max_id))
        return max_id
def execute(self, context):
    full_table_name = format_table_name(self.dst_table_name)
    print(full_table_name)

    bq_hook = BigQueryHook(bigquery_conn_id=self.bigquery_conn_id)
    conn = bq_hook.get_conn()
    cursor = conn.cursor()
    print(self.sql)

    # table_resource = {
    #     "tableReference": {"table_id": table_id},
    #     "materializedView": {"query": self.sql}
    # }
    # bigquery.Table.from_api_repr(table_resource)
    try:
        cursor.run_query(
            sql=self.sql,
            destination_dataset_table=full_table_name,
            write_disposition="WRITE_TRUNCATE",
            create_disposition=self.create_disposition,
            use_legacy_sql=False,
        )
        self.log.info("Query table was created successfully: {}".format(
            full_table_name))
    except HttpError as err:
        raise AirflowException("BigQuery error: %s" % err.content)
def execute(self, context):
    bq_hook = BigQueryHook(bigquery_conn_id=self.bigquery_conn_id,
                           delegate_to=self.delegate_to)

    if not self.schema_fields and self.gcs_schema_object:
        gcs_bucket, gcs_object = _parse_gcs_url(self.gcs_schema_object)
        gcs_hook = GoogleCloudStorageHook(
            google_cloud_storage_conn_id=self.google_cloud_storage_conn_id,
            delegate_to=self.delegate_to)
        schema_fields = json.loads(gcs_hook.download(
            gcs_bucket, gcs_object).decode("utf-8"))
    else:
        schema_fields = self.schema_fields

    conn = bq_hook.get_conn()
    cursor = conn.cursor()

    cursor.create_empty_table(
        project_id=self.project_id,
        dataset_id=self.dataset_id,
        table_id=self.table_id,
        schema_fields=schema_fields,
        time_partitioning=self.time_partitioning,
        labels=self.labels
    )
def execute(self, context):
    full_table_name = format_table_name(self.src_table)
    dataset_id, table_id = full_table_name.split(".")

    bq_hook = BigQueryHook(bigquery_conn_id=self.bigquery_conn_id)
    conn = bq_hook.get_conn()
    cursor = conn.cursor()

    table_resource = {
        "tableReference": {
            "table_id": table_id
        },
        "materializedView": {
            "query": self.sql
        },
    }
    # bigquery.Table.from_api_repr(table_resource)
    project_id = get_project_id()
    try:
        cursor.service.tables().insert(
            projectId=project_id,
            datasetId=dataset_id,
            body=table_resource).execute(num_retries=self.num_retries)
        self.log.info("Table created successfully: %s:%s.%s",
                      project_id, dataset_id, table_id)
    except HttpError as err:
        raise AirflowException("BigQuery error: %s" % err.content)
def execute(self, context):
    if self.bq_cursor is None:
        self.log.info('Executing: %s', self.sql)
        hook = BigQueryHook(
            bigquery_conn_id=self.bigquery_conn_id,
            use_legacy_sql=self.use_legacy_sql,
            delegate_to=self.delegate_to,
            location=self.location,
        )
        conn = hook.get_conn()
        self.bq_cursor = conn.cursor()
    self.bq_cursor.run_query(
        sql=self.sql,
        destination_dataset_table=self.destination_dataset_table,
        write_disposition=self.write_disposition,
        allow_large_results=self.allow_large_results,
        flatten_results=self.flatten_results,
        udf_config=self.udf_config,
        maximum_billing_tier=self.maximum_billing_tier,
        maximum_bytes_billed=self.maximum_bytes_billed,
        create_disposition=self.create_disposition,
        query_params=self.query_params,
        labels=self.labels,
        schema_update_options=self.schema_update_options,
        priority=self.priority,
        time_partitioning=self.time_partitioning,
        api_resource_configs=self.api_resource_configs,
        cluster_fields=self.cluster_fields,
    )
def execute(self, context):
    self.log.info(f'Executing query """\n{self.sql}\n""" '
                  f'and save to table: "{self.destination_dataset_table}"')
    hook = BigQueryHook(bigquery_conn_id=self.bigquery_conn_id,
                        delegate_to=self.delegate_to)
    conn = hook.get_conn()
    cursor = conn.cursor()
    cursor.run_query(
        sql=self.sql,
        destination_dataset_table=self.destination_dataset_table,
        write_disposition=self.write_disposition,
        create_disposition=self.create_disposition,
        labels=self.labels,
        encryption_configuration=self.encryption_configuration,
        allow_large_results=self.allow_large_results,
        flatten_results=self.flatten_results,
        udf_config=self.udf_config,
        use_legacy_sql=self.use_legacy_sql,
        maximum_billing_tier=self.maximum_billing_tier,
        maximum_bytes_billed=self.maximum_bytes_billed,
        query_params=self.query_params,
        schema_update_options=self.schema_update_options,
        priority=self.priority,
        time_partitioning=self.time_partitioning,
        api_resource_configs=self.api_resource_configs,
        cluster_fields=self.cluster_fields,
        location=self.location)
def execute(self, context):
    gcs_hook = GoogleCloudStorageHook(
        google_cloud_storage_conn_id=self.google_cloud_storage_conn_id,
        delegate_to=self.delegate_to)
    bq_hook = BigQueryHook(bigquery_conn_id=self.bigquery_conn_id,
                           delegate_to=self.delegate_to)

    schema_fields = self.schema_fields if self.schema_fields else json.loads(
        gcs_hook.download(self.bucket, self.schema_object))
    source_uris = map(
        lambda schema_object: 'gs://{}/{}'.format(self.bucket, schema_object),
        self.source_objects)
    conn = bq_hook.get_conn()
    cursor = conn.cursor()
    cursor.run_load(
        destination_project_dataset_table=self.destination_project_dataset_table,
        schema_fields=schema_fields,
        source_uris=source_uris,
        source_format=self.source_format,
        create_disposition=self.create_disposition,
        skip_leading_rows=self.skip_leading_rows,
        write_disposition=self.write_disposition,
        field_delimiter=self.field_delimiter)

    if self.max_id_key:
        cursor.execute('SELECT MAX({}) FROM {}'.format(
            self.max_id_key, self.destination_project_dataset_table))
        row = cursor.fetchone()
        max_id = row[0] if row[0] else 0
        logging.info('Loaded BQ data with max {}.{}={}'.format(
            self.destination_project_dataset_table, self.max_id_key, max_id))
        return max_id
class GoogleDisplayVideo360ERFToBigQueryOperator(BaseOperator):
    """Upload multiple Entity Read Files to the specified BigQuery dataset."""

    def __init__(self,
                 gcp_conn_id='google_cloud_default',
                 report_body=None,
                 yesterday=False,
                 entity_type=None,
                 file_creation_date=None,
                 cloud_project_id=None,
                 bq_table=None,
                 schema=None,
                 gcs_bucket=None,
                 erf_bucket=None,
                 partner_ids=[],
                 write_disposition='WRITE_TRUNCATE',
                 *args,
                 **kwargs):
        super(GoogleDisplayVideo360ERFToBigQueryOperator, self).__init__(*args, **kwargs)
        self.gcp_conn_id = gcp_conn_id
        self.service = None
        self.bq_hook = None
        self.gcs_hook = None
        self.report_body = report_body
        self.erf_bucket = erf_bucket
        self.yesterday = yesterday
        self.cloud_project_id = cloud_project_id
        self.bq_table = bq_table
        self.gcs_bucket = gcs_bucket
        self.schema = schema
        self.entity_type = entity_type
        self.erf_object = 'entity/%s.0.%s.json' % (file_creation_date, entity_type)
        self.partner_ids = partner_ids
        self.write_disposition = write_disposition
        self.file_creation_date = file_creation_date

    def execute(self, context):
        if self.gcs_hook is None:
            self.gcs_hook = GoogleCloudStorageHook(
                google_cloud_storage_conn_id=self.gcp_conn_id)
        if self.bq_hook is None:
            self.bq_hook = BigQueryHook(bigquery_conn_id=self.gcp_conn_id)

        for i, partner_id in enumerate(self.partner_ids):
            filename = erf_utils.download_and_transform_erf(self, partner_id)
            entity_read_file_ndj = 'gs://%s/%s' % (self.gcs_bucket, filename)
            if i > 0:
                self.write_disposition = 'WRITE_APPEND'

            bq_base_cursor = self.bq_hook.get_conn().cursor()
            bq_base_cursor.run_load(
                destination_project_dataset_table=self.bq_table,
                schema_fields=self.schema,
                source_uris=[entity_read_file_ndj],
                source_format='NEWLINE_DELIMITED_JSON',
                write_disposition=self.write_disposition)
            self.gcs_hook.delete(self.gcs_bucket, filename)
def execute(self, context):
    logging.info('Executing: %s', self.bql)
    hook = BigQueryHook(bigquery_conn_id=self.bigquery_conn_id,
                        delegate_to=self.delegate_to)
    conn = hook.get_conn()
    cursor = conn.cursor()
    cursor.run_query(self.bql, self.destination_dataset_table,
                     self.write_disposition, self.allow_large_results,
                     self.udf_config, self.use_legacy_sql)
def execute(self, context):
    logging.info('Executing: %s', str(self.bql))
    hook = BigQueryHook(bigquery_conn_id=self.bigquery_conn_id,
                        delegate_to=self.delegate_to)
    conn = hook.get_conn()
    cursor = conn.cursor()
    cursor.run_query(self.bql, self.destination_dataset_table,
                     self.write_disposition, self.allow_large_results)
def execute(self, context):
    if self.bq_cursor is None:
        self.log.info('Executing: %s', self.sql)
        hook = BigQueryHook(
            bigquery_conn_id=self.bigquery_conn_id,
            use_legacy_sql=self.use_legacy_sql,
            delegate_to=self.delegate_to,
            location=self.location,
        )
        conn = hook.get_conn()
        self.bq_cursor = conn.cursor()
    if isinstance(self.sql, str):
        job_id = self.bq_cursor.run_query(
            sql=self.sql,
            destination_dataset_table=self.destination_dataset_table,
            write_disposition=self.write_disposition,
            allow_large_results=self.allow_large_results,
            flatten_results=self.flatten_results,
            udf_config=self.udf_config,
            maximum_billing_tier=self.maximum_billing_tier,
            maximum_bytes_billed=self.maximum_bytes_billed,
            create_disposition=self.create_disposition,
            query_params=self.query_params,
            labels=self.labels,
            schema_update_options=self.schema_update_options,
            priority=self.priority,
            time_partitioning=self.time_partitioning,
            api_resource_configs=self.api_resource_configs,
            cluster_fields=self.cluster_fields,
            encryption_configuration=self.encryption_configuration
        )
    elif isinstance(self.sql, Iterable):
        job_id = [
            self.bq_cursor.run_query(
                sql=s,
                destination_dataset_table=self.destination_dataset_table,
                write_disposition=self.write_disposition,
                allow_large_results=self.allow_large_results,
                flatten_results=self.flatten_results,
                udf_config=self.udf_config,
                maximum_billing_tier=self.maximum_billing_tier,
                maximum_bytes_billed=self.maximum_bytes_billed,
                create_disposition=self.create_disposition,
                query_params=self.query_params,
                labels=self.labels,
                schema_update_options=self.schema_update_options,
                priority=self.priority,
                time_partitioning=self.time_partitioning,
                api_resource_configs=self.api_resource_configs,
                cluster_fields=self.cluster_fields,
                encryption_configuration=self.encryption_configuration
            )
            for s in self.sql]
    else:
        raise AirflowException(
            "argument 'sql' of type {} is neither a string nor an iterable".format(
                type(self.sql)))
    context['task_instance'].xcom_push(key='job_id', value=job_id)
def execute(self, context):
    bq_hook = BigQueryHook(bigquery_conn_id=self.bigquery_conn_id,
                           delegate_to=self.delegate_to)
    conn = bq_hook.get_conn()
    cursor = conn.cursor()
    cursor.delete_dataset(project_id=self.project_id,
                          dataset_id=self.dataset_id)
def execute(self, context):
    bq_hook = BigQueryHook(bigquery_conn_id=self.gcp_conn_id,
                           delegate_to=self.delegate_to)
    conn = bq_hook.get_conn()
    cursor = conn.cursor()
    cursor.create_empty_dataset(project_id=self.project_id,
                                dataset_id=self.dataset_id,
                                dataset_reference=self.dataset_reference)
def execute(self, context):
    self.log.info('Executing: %s', self.bql)
    hook = BigQueryHook(bigquery_conn_id=self.bigquery_conn_id,
                        delegate_to=self.delegate_to)
    conn = hook.get_conn()
    cursor = conn.cursor()
    cursor.run_query(self.bql, self.destination_dataset_table,
                     self.write_disposition, self.allow_large_results,
                     self.udf_config, self.use_legacy_sql,
                     self.maximum_billing_tier, self.create_disposition,
                     self.query_params)
def execute(self, context):
    logging.info('Executing copy of %s into: %s',
                 self.source_project_dataset_tables,
                 self.destination_project_dataset_table)
    hook = BigQueryHook(bigquery_conn_id=self.bigquery_conn_id,
                        delegate_to=self.delegate_to)
    conn = hook.get_conn()
    cursor = conn.cursor()
    cursor.run_copy(
        self.source_project_dataset_tables,
        self.destination_project_dataset_table,
        self.write_disposition,
        self.create_disposition)
def execute(self, context):
    bq_hook = BigQueryHook(bigquery_conn_id=self.bigquery_conn_id,
                           delegate_to=self.delegate_to,
                           use_legacy_sql=False)
    bq_cursor = bq_hook.get_conn().cursor()
    sql = self.SQL_TEMPLATE.format(**self.sql_template_params)
    bq_cursor.execute(sql)
    result = bq_cursor.fetchall()
    # getting the 1st cell of the 1st row of the resultset
    return result[0][0]
def execute(self, context):
    if self.bq_cursor is None:
        self.log.info('Executing: %s', self.bql)
        hook = BigQueryHook(bigquery_conn_id=self.bigquery_conn_id,
                            delegate_to=self.delegate_to)
        conn = hook.get_conn()
        self.bq_cursor = conn.cursor()
    self.bq_cursor.run_query(self.bql,
                             self.destination_dataset_table,
                             self.write_disposition,
                             self.allow_large_results,
                             self.udf_config,
                             self.use_legacy_sql,
                             self.maximum_billing_tier,
                             self.create_disposition,
                             self.query_params)
def execute(self, context):
    bq_hook = BigQueryHook(bigquery_conn_id=self.gcp_conn_id,
                           delegate_to=self.delegate_to)
    conn = bq_hook.get_conn()
    cursor = conn.cursor()
    self.log.info('Start getting dataset: %s:%s', self.project_id, self.dataset_id)

    return cursor.get_dataset(
        dataset_id=self.dataset_id,
        project_id=self.project_id)
def bq_to_gcs(**kwargs):
    date_stamp = kwargs['ts']

    # get the last current date from Postgres
    conn = PostgresHook(postgres_conn_id='my_local_db').get_conn()
    cursor = conn.cursor()
    cursor.execute('SELECT MAX(last_update_date) FROM airflow.austin_service_reports;')
    recent_ds = cursor.fetchone()[0]
    if recent_ds is not None:
        recent_ds += timedelta(seconds=1)
        last = recent_ds
    else:
        last = kwargs['start_date'] - timedelta(days=1)
    cursor.close()
    conn.close()

    # open connection to BigQuery
    hook = BigQueryHook(
        bigquery_conn_id='my_gcp_connection',
        use_legacy_sql=False
    )
    conn = hook.get_conn()
    cursor = conn.cursor()
    with open(SQL_PATH + 'query_bq_dataset.sql', 'r') as f:
        query = f.read()
    query = query.format(last, date_stamp)
    cursor.execute(query)

    # write to gcs bucket
    # Each returned row of the result gives:
    # result = [unique_key, complaint_type, complaint_description, owning_department, source,
    #           status, created_date, last_update_date, close_date, city]
    with BUCKET.open('bq_bucket/bq_dataset.txt', 'w') as f:
        while True:
            result = cursor.fetchone()
            if result is None:
                break
            if result[8] is None:
                result[8] = ''
            else:
                result[8] = datetime.utcfromtimestamp(result[8])
            if result[9] is None:
                result[9] = ''
            result[7] = datetime.utcfromtimestamp(result[7])
            result[6] = datetime.utcfromtimestamp(result[6])
            f.write('|'.join([str(val) for val in result]) + '\n')
    cursor.close()
    conn.close()
def execute(self, context):
    bq_hook = BigQueryHook(bigquery_conn_id=self.bigquery_conn_id,
                           delegate_to=self.delegate_to)
    conn = bq_hook.get_conn()
    cursor = conn.cursor()
    cursor.create_empty_dataset(
        project_id=self.project_id,
        dataset_id=self.dataset_id,
        dataset_reference=self.dataset_reference)
def execute(self, context):
    logging.info('Executing extract of %s into: %s',
                 self.source_project_dataset_table,
                 self.destination_cloud_storage_uris)
    hook = BigQueryHook(bigquery_conn_id=self.bigquery_conn_id,
                        delegate_to=self.delegate_to)
    conn = hook.get_conn()
    cursor = conn.cursor()
    cursor.run_extract(
        self.source_project_dataset_table,
        self.destination_cloud_storage_uris,
        self.compression,
        self.export_format,
        self.field_delimiter,
        self.print_header)
def execute(self, context):
    logging.info('Executing extract of %s into: %s',
                 self.source_project_dataset_table,
                 self.destination_cloud_storage_uris)
    hook = BigQueryHook(bigquery_conn_id=self.bigquery_conn_id,
                        delegate_to=self.delegate_to)
    conn = hook.get_conn()
    cursor = conn.cursor()
    cursor.run_extract(self.source_project_dataset_table,
                       self.destination_cloud_storage_uris,
                       self.compression,
                       self.export_format,
                       self.field_delimiter,
                       self.print_header)
def bq_to_gcs(**kwargs):
    ds = kwargs['ds']
    previous = datetime.strptime(kwargs['prev_ds'], '%Y-%m-%d').date()

    # get the last current date from Postgres
    conn = PostgresHook(postgres_conn_id='my_local_db').get_conn()
    cursor = conn.cursor()
    cursor.execute('SELECT MAX(CAST(created_date AS DATE)) FROM airflow.austin_service_reports;')
    recent_ds = cursor.fetchone()[0]
    if recent_ds is not None:
        recent_ds += timedelta(days=1)
        if recent_ds < previous:
            prev_ds = datetime.strftime(recent_ds, '%Y-%m-%d')
        else:
            prev_ds = kwargs['prev_ds']
    else:
        prev_ds = datetime.strftime(kwargs['start_date'] - timedelta(days=1), '%Y-%m-%d')
    cursor.close()
    conn.close()

    # open connection to BigQuery
    hook = BigQueryHook(
        bigquery_conn_id='my_gcp_connection',
        use_legacy_sql=False
    )
    conn = hook.get_conn()
    cursor = conn.cursor()
    with open(SQL_PATH + 'query_bq_dataset.sql', 'r') as f:
        query = f.read()
    query = query.format(prev_ds, ds)
    cursor.execute(query)

    # write to gcs bucket
    with BUCKET.open('bq_bucket/bq_dataset.csv', 'w') as f:
        while True:
            result = cursor.fetchone()
            if result is None:
                break
            if result[6] is None:
                result[6] = ''
            else:
                result[6] = datetime.utcfromtimestamp(result[6])
            result[5] = datetime.utcfromtimestamp(result[5])
            f.write(','.join([str(val) for val in result]) + '\n')
    cursor.close()
    conn.close()
def execute(self, context):
    self.log.info('Dataset id: %s Project id: %s', self.dataset_id, self.project_id)

    bq_hook = BigQueryHook(bigquery_conn_id=self.bigquery_conn_id,
                           delegate_to=self.delegate_to)
    conn = bq_hook.get_conn()
    cursor = conn.cursor()
    cursor.create_empty_dataset(
        project_id=self.project_id,
        dataset_id=self.dataset_id,
        dataset_reference=self.dataset_reference)
def execute(self, context):
    bq_hook = BigQueryHook(bigquery_conn_id=self.bigquery_conn_id,
                           delegate_to=self.delegate_to)

    if not self.schema_fields and self.schema_object \
            and self.source_format != 'DATASTORE_BACKUP':
        gcs_hook = GoogleCloudStorageHook(
            google_cloud_storage_conn_id=self.google_cloud_storage_conn_id,
            delegate_to=self.delegate_to)
        schema_fields = json.loads(gcs_hook.download(
            self.bucket, self.schema_object).decode("utf-8"))
    else:
        schema_fields = self.schema_fields

    source_uris = ['gs://{}/{}'.format(self.bucket, source_object)
                   for source_object in self.source_objects]
    conn = bq_hook.get_conn()
    cursor = conn.cursor()

    cursor.run_load(
        destination_project_dataset_table=self.destination_project_dataset_table,
        schema_fields=schema_fields,
        source_uris=source_uris,
        source_format=self.source_format,
        create_disposition=self.create_disposition,
        skip_leading_rows=self.skip_leading_rows,
        write_disposition=self.write_disposition,
        field_delimiter=self.field_delimiter,
        max_bad_records=self.max_bad_records,
        quote_character=self.quote_character,
        allow_quoted_newlines=self.allow_quoted_newlines,
        allow_jagged_rows=self.allow_jagged_rows,
        schema_update_options=self.schema_update_options,
        src_fmt_configs=self.src_fmt_configs,
        time_partitioning=self.time_partitioning)

    if self.max_id_key:
        cursor.execute('SELECT MAX({}) FROM {}'.format(
            self.max_id_key,
            self.destination_project_dataset_table))
        row = cursor.fetchone()
        max_id = row[0] if row[0] else 0
        self.log.info(
            'Loaded BQ data with max %s.%s=%s',
            self.destination_project_dataset_table, self.max_id_key, max_id
        )
        return max_id
def execute(self, context):
    if self.bq_cursor is None:
        self.log.info('Executing: %s', self.bql)
        hook = BigQueryHook(
            bigquery_conn_id=self.bigquery_conn_id,
            use_legacy_sql=self.use_legacy_sql,
            delegate_to=self.delegate_to)
        conn = hook.get_conn()
        self.bq_cursor = conn.cursor()
    self.bq_cursor.run_query(
        self.bql,
        destination_dataset_table=self.destination_dataset_table,
        write_disposition=self.write_disposition,
        allow_large_results=self.allow_large_results,
        udf_config=self.udf_config,
        maximum_billing_tier=self.maximum_billing_tier,
        create_disposition=self.create_disposition,
        query_params=self.query_params,
        schema_update_options=self.schema_update_options)
def execute(self, context):
    logging.info('Executing: %s', str(self.bql))
    hook = BigQueryHook(bigquery_conn_id=self.bigquery_conn_id,
                        delegate_to=self.delegate_to)
    conn = hook.get_conn()
    cursor = conn.cursor()
    cursor.run_query(self.bql, self.destination_dataset_table,
                     self.write_disposition)