def tearDown(self):
    self.delete_gcs_bucket(BUCKET)
    with provide_gcp_context(GCP_BIGQUERY_KEY, scopes=SCOPES):
        hook = BigQueryHook()
        hook.delete_dataset(dataset_id='airflow_test', delete_contents=True)
    super().tearDown()
def execute(self, context: 'Context'):
    self.log.info(
        'Executing extract of %s into: %s',
        self.source_project_dataset_table,
        self.destination_cloud_storage_uris,
    )
    hook = BigQueryHook(
        gcp_conn_id=self.gcp_conn_id,
        delegate_to=self.delegate_to,
        location=self.location,
        impersonation_chain=self.impersonation_chain,
    )
    job_id = hook.run_extract(
        source_project_dataset_table=self.source_project_dataset_table,
        destination_cloud_storage_uris=self.destination_cloud_storage_uris,
        compression=self.compression,
        export_format=self.export_format,
        field_delimiter=self.field_delimiter,
        print_header=self.print_header,
        labels=self.labels,
    )
    job = hook.get_job(job_id=job_id).to_api_repr()
    conf = job["configuration"]["extract"]["sourceTable"]
    dataset_id, project_id, table_id = conf["datasetId"], conf["projectId"], conf["tableId"]
    BigQueryTableLink.persist(
        context=context,
        task_instance=self,
        dataset_id=dataset_id,
        project_id=project_id,
        table_id=table_id,
    )
def execute(self, context: 'Context'):
    self.log.info(
        'Executing extract of %s into: %s',
        self.source_project_dataset_table,
        self.destination_cloud_storage_uris,
    )
    hook = BigQueryHook(
        gcp_conn_id=self.gcp_conn_id,
        delegate_to=self.delegate_to,
        location=self.location,
        impersonation_chain=self.impersonation_chain,
    )
    table_ref = TableReference.from_string(self.source_project_dataset_table, hook.project_id)
    configuration: Dict[str, Any] = {
        'extract': {
            'sourceTable': table_ref.to_api_repr(),
            'compression': self.compression,
            'destinationUris': self.destination_cloud_storage_uris,
            'destinationFormat': self.export_format,
        }
    }
    if self.labels:
        configuration['labels'] = self.labels
    if self.export_format == 'CSV':
        # Only set fieldDelimiter and printHeader fields if using CSV.
        # Google does not like it if you set these fields for other export formats.
        configuration['extract']['fieldDelimiter'] = self.field_delimiter
        configuration['extract']['printHeader'] = self.print_header
    hook.insert_job(configuration=configuration)
def execute(self, context: 'Context') -> None:
    self.log.info(
        'Executing copy of %s into: %s',
        self.source_project_dataset_tables,
        self.destination_project_dataset_table,
    )
    hook = BigQueryHook(
        gcp_conn_id=self.gcp_conn_id,
        delegate_to=self.delegate_to,
        location=self.location,
        impersonation_chain=self.impersonation_chain,
    )
    with warnings.catch_warnings():
        warnings.simplefilter("ignore", DeprecationWarning)
        job_id = hook.run_copy(
            source_project_dataset_tables=self.source_project_dataset_tables,
            destination_project_dataset_table=self.destination_project_dataset_table,
            write_disposition=self.write_disposition,
            create_disposition=self.create_disposition,
            labels=self.labels,
            encryption_configuration=self.encryption_configuration,
        )
    job = hook.get_job(job_id=job_id).to_api_repr()
    conf = job["configuration"]["copy"]["destinationTable"]
    BigQueryTableLink.persist(
        context=context,
        task_instance=self,
        dataset_id=conf["datasetId"],
        project_id=conf["projectId"],
        table_id=conf["tableId"],
    )
def execute(self, context: 'Context') -> None:
    self.log.info(
        'Executing copy of %s into: %s',
        self.source_project_dataset_tables,
        self.destination_project_dataset_table,
    )
    hook = BigQueryHook(
        bigquery_conn_id=self.gcp_conn_id,
        delegate_to=self.delegate_to,
        location=self.location,
        impersonation_chain=self.impersonation_chain,
    )
    with warnings.catch_warnings():
        warnings.simplefilter("ignore", DeprecationWarning)
        hook.run_copy(
            source_project_dataset_tables=self.source_project_dataset_tables,
            destination_project_dataset_table=self.destination_project_dataset_table,
            write_disposition=self.write_disposition,
            create_disposition=self.create_disposition,
            labels=self.labels,
            encryption_configuration=self.encryption_configuration,
        )
def poke(self, context):
    table_uri = '{0}:{1}.{2}'.format(self.project_id, self.dataset_id, self.table_id)
    self.log.info('Sensor checks existence of table: %s', table_uri)
    hook = BigQueryHook(bigquery_conn_id=self.bigquery_conn_id, delegate_to=self.delegate_to)
    return hook.table_exists(self.project_id, self.dataset_id, self.table_id)
def extract_google_analytics_data(gcp_conn_id: str, start_date: str, end_date: str, locations: list):
    bq_hook = BigQueryHook(gcp_conn_id=gcp_conn_id)
    cities = ','.join(['"{}"'.format(c["name"]) for c in locations])
    countries = ','.join(['"{}"'.format(c["country"]) for c in locations])
    states = ','.join(['"{}"'.format(c["state"]) for c in locations])
    query = """SELECT
        fullVisitorId,
        date,
        geoNetwork.city as city,
        geoNetwork.country as country,
        geoNetwork.region as region,
        product.v2ProductCategory as productCategory,
        product.v2ProductName as productName,
        hits.eCommerceAction.action_type as action_type,
        hits.eCommerceAction.step as action_step,
        product.productQuantity as quantity,
        product.productPrice as price,
        product.productRevenue as revenue,
        product.isImpression as isImpression,
        hits.transaction.transactionId as transactionId,
        hits.transaction.transactionRevenue as transactionRevenue,
        hits.transaction.transactionTax as transactionTax,
        hits.transaction.transactionShipping as transactionShipping
    FROM `bigquery-public-data.google_analytics_sample.ga_sessions_*`,
        UNNEST(hits) as hits,
        UNNEST(hits.product) as product
    WHERE _TABLE_SUFFIX BETWEEN '{start_date}' AND '{end_date}'
        AND geoNetwork.country IN ({countries})
        AND geoNetwork.city IN ({cities})
        AND geoNetwork.region IN ({states})
    ORDER BY date ASC""".format(
        start_date=start_date, end_date=end_date, countries=countries, cities=cities, states=states
    )
    df = bq_hook.get_pandas_df(sql=query, dialect="standard")
    df["location_name"] = df.apply(
        lambda row: get_location_string(row['country'], row['city'], row['region']), axis=1
    )
    df["date"] = pd.to_datetime(df["date"])
    df["price"] = df["price"] / (10**6)
    df["revenue"] = df["revenue"] / (10**6)
    df["transactionRevenue"] = df["transactionRevenue"] / (10**6)
    df["transactionTax"] = df["transactionTax"] / (10**6)
    df["transactionShipping"] = df["transactionShipping"] / (10**6)
    df = df.set_index(["date", "location_name"])
    return df.to_csv(date_format="%Y-%m-%d %H:%M:%S")
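# Hedged usage sketch (assumption, not part of the snippet above): from how `locations`
# is consumed, it is a list of dicts with "name", "country", and "state" keys rather than
# a string. The connection id and dates below are illustrative placeholders; ga_sessions_*
# table suffixes use the YYYYMMDD format.
locations = [
    {"name": "New York", "country": "United States", "state": "New York"},
    {"name": "Mountain View", "country": "United States", "state": "California"},
]
csv_data = extract_google_analytics_data(
    gcp_conn_id="google_cloud_default",  # assumed connection id
    start_date="20170101",
    end_date="20170131",
    locations=locations,
)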
def poke(self, context: dict) -> bool:
    table_uri = f'{self.project_id}:{self.dataset_id}.{self.table_id}'
    self.log.info('Sensor checks existence of table: %s', table_uri)
    hook = BigQueryHook(
        bigquery_conn_id=self.bigquery_conn_id,
        delegate_to=self.delegate_to,
        impersonation_chain=self.impersonation_chain,
    )
    return hook.table_exists(project_id=self.project_id, dataset_id=self.dataset_id, table_id=self.table_id)
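# Hedged usage sketch (not from the snippet above): this poke() method backs Airflow's
# BigQueryTableExistenceSensor, which is typically wired into a DAG roughly like this.
# The project, dataset, and table names are illustrative placeholders.
from airflow.providers.google.cloud.sensors.bigquery import BigQueryTableExistenceSensor

check_table = BigQueryTableExistenceSensor(
    task_id='check_table_exists',
    project_id='my-gcp-project',  # placeholder
    dataset_id='my_dataset',      # placeholder
    table_id='my_table',          # placeholder
    poke_interval=60,             # seconds between poke() calls
)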
def poke(self, context: 'Context') -> bool:
    table_uri = f'{self.project_id}:{self.dataset_id}.{self.table_id}'
    self.log.info('Sensor checks existence of partition: "%s" in table: %s', self.partition_id, table_uri)
    hook = BigQueryHook(
        gcp_conn_id=self.gcp_conn_id,
        delegate_to=self.delegate_to,
        impersonation_chain=self.impersonation_chain,
    )
    return hook.table_partition_exists(
        project_id=self.project_id,
        dataset_id=self.dataset_id,
        table_id=self.table_id,
        partition_id=self.partition_id,
    )
def poke(self, context: dict) -> bool:
    table_uri = '{0}:{1}.{2}'.format(self.project_id, self.dataset_id, self.table_id)
    self.log.info('Sensor checks existence of partition: "%s" in table: %s', self.partition_id, table_uri)
    hook = BigQueryHook(
        bigquery_conn_id=self.bigquery_conn_id,
        delegate_to=self.delegate_to,
        impersonation_chain=self.impersonation_chain,
    )
    return hook.table_partition_exists(
        project_id=self.project_id,
        dataset_id=self.dataset_id,
        table_id=self.table_id,
        partition_id=self.partition_id,
    )
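# Hedged usage sketch (assumption, not from the snippets above): the partition poke()
# variants back BigQueryTablePartitionExistenceSensor. For a DAY-partitioned table the
# partition_id is the YYYYMMDD suffix; all names below are illustrative placeholders.
from airflow.providers.google.cloud.sensors.bigquery import BigQueryTablePartitionExistenceSensor

check_partition = BigQueryTablePartitionExistenceSensor(
    task_id='check_partition_exists',
    project_id='my-gcp-project',      # placeholder
    dataset_id='my_dataset',          # placeholder
    table_id='my_partitioned_table',  # placeholder
    partition_id='{{ ds_nodash }}',   # e.g. '20240101' for a DAY-partitioned table
)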
def execute(self, context: 'Context') -> None:
    big_query_hook = BigQueryHook(
        gcp_conn_id=self.gcp_conn_id,
        delegate_to=self.delegate_to,
        location=self.location,
        impersonation_chain=self.impersonation_chain,
    )
    project_id, dataset_id, table_id = self.source_project_dataset_table.split('.')
    BigQueryTableLink.persist(
        context=context,
        task_instance=self,
        dataset_id=dataset_id,
        project_id=project_id,
        table_id=table_id,
    )
    mssql_hook = MsSqlHook(mssql_conn_id=self.mssql_conn_id, schema=self.database)
    for rows in bigquery_get_data(
        self.log,
        self.dataset_id,
        self.table_id,
        big_query_hook,
        self.batch_size,
        self.selected_fields,
    ):
        mssql_hook.insert_rows(
            table=self.mssql_table,
            rows=rows,
            target_fields=self.selected_fields,
            replace=self.replace,
        )
def bqcreate():
    bqcreate = BigQueryHook().create_empty_table(
        project_id="project-staging",
        dataset_id='test',
        table_id="partitiontable",
        table_resource=json.loads(tableresource),
    )
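# Hedged sketch (assumption, not part of the snippet above): `tableresource` is expected
# to be a JSON string in the BigQuery Tables REST resource format. A minimal example of
# what it might contain for a day-partitioned table; the schema fields are illustrative.
tableresource = """
{
  "tableReference": {"projectId": "project-staging", "datasetId": "test", "tableId": "partitiontable"},
  "schema": {"fields": [
    {"name": "id", "type": "STRING", "mode": "REQUIRED"},
    {"name": "created_at", "type": "TIMESTAMP", "mode": "NULLABLE"}
  ]},
  "timePartitioning": {"type": "DAY", "field": "created_at"}
}
"""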
def bigquery_get_data(
    logger: Logger,
    dataset_id: str,
    table_id: str,
    big_query_hook: BigQueryHook,
    batch_size: int,
    selected_fields: Optional[Union[List[str], str]],
) -> Iterator:
    logger.info('Fetching Data from:')
    logger.info('Dataset: %s ; Table: %s', dataset_id, table_id)
    i = 0
    while True:
        rows: List[Row] = big_query_hook.list_rows(
            dataset_id=dataset_id,
            table_id=table_id,
            max_results=batch_size,
            selected_fields=selected_fields,
            start_index=i * batch_size,
        )
        if len(rows) == 0:
            logger.info('Job Finished')
            return
        logger.info('Total Extracted rows: %s', len(rows) + i * batch_size)
        yield [row.values() for row in rows]
        i += 1
def func_appy_bq(**kwargs):
    ti = kwargs['ti']
    ts = ti.xcom_pull(task_ids='gen_table_resource')
    BigQueryHook().create_empty_table(
        project_id="mygcp-project",
        dataset_id='test',
        table_id="partition_demo",
        table_resource=json.loads(ts),
    )
def execute(self, context):
    self.log.info(
        'Executing copy of %s into: %s',
        self.source_project_dataset_tables,
        self.destination_project_dataset_table,
    )
    hook = BigQueryHook(
        bigquery_conn_id=self.gcp_conn_id,
        delegate_to=self.delegate_to,
        location=self.location,
    )
    conn = hook.get_conn()
    cursor = conn.cursor()
    cursor.run_copy(
        source_project_dataset_tables=self.source_project_dataset_tables,
        destination_project_dataset_table=self.destination_project_dataset_table,
        write_disposition=self.write_disposition,
        create_disposition=self.create_disposition,
        labels=self.labels,
        encryption_configuration=self.encryption_configuration,
    )
def execute(self, context):
    self.log.info(
        'Executing extract of %s into: %s',
        self.source_project_dataset_table,
        self.destination_cloud_storage_uris,
    )
    hook = BigQueryHook(
        bigquery_conn_id=self.gcp_conn_id,
        delegate_to=self.delegate_to,
        location=self.location,
    )
    conn = hook.get_conn()
    cursor = conn.cursor()
    cursor.run_extract(
        source_project_dataset_table=self.source_project_dataset_table,
        destination_cloud_storage_uris=self.destination_cloud_storage_uris,
        compression=self.compression,
        export_format=self.export_format,
        field_delimiter=self.field_delimiter,
        print_header=self.print_header,
        labels=self.labels,
    )
def _bq_get_data(self):
    self.log.info('Fetching Data from:')
    self.log.info('Dataset: %s ; Table: %s', self.dataset_id, self.table_id)
    hook = BigQueryHook(
        bigquery_conn_id=self.gcp_conn_id,
        delegate_to=self.delegate_to,
        location=self.location,
        impersonation_chain=self.impersonation_chain,
    )
    conn = hook.get_conn()
    cursor = conn.cursor()
    i = 0
    while True:
        response = cursor.get_tabledata(
            dataset_id=self.dataset_id,
            table_id=self.table_id,
            max_results=self.batch_size,
            selected_fields=self.selected_fields,
            start_index=i * self.batch_size,
        )
        if 'rows' in response:
            rows = response['rows']
        else:
            self.log.info('Job Finished')
            return
        self.log.info('Total Extracted rows: %s', len(rows) + i * self.batch_size)
        table_data = []
        for dict_row in rows:
            single_row = []
            for fields in dict_row['f']:
                single_row.append(fields['v'])
            table_data.append(single_row)
        yield table_data
        i += 1
def _bq_get_data(self):
    hook = BigQueryHook(
        bigquery_conn_id=self.gcp_conn_id,
        delegate_to=self.delegate_to,
        location=self.location,
        impersonation_chain=self.impersonation_chain,
    )
    table_ref = TableReference.from_string(self.source_project_dataset_table)
    self.log.info('Fetching Data from:')
    self.log.info('Dataset: %s, Table: %s', table_ref.dataset_id, table_ref.table_id)
    conn = hook.get_conn()
    cursor = conn.cursor()
    i = 0
    while True:
        response = cursor.get_tabledata(
            dataset_id=table_ref.dataset_id,
            table_id=table_ref.table_id,
            max_results=self.batch_size,
            selected_fields=self.selected_fields,
            start_index=i * self.batch_size,
        )
        if 'rows' not in response:
            self.log.info('Job Finished')
            return
        rows = response['rows']
        self.log.info('Total Extracted rows: %s', len(rows) + i * self.batch_size)
        table_data = [[fields['v'] for fields in dict_row['f']] for dict_row in rows]
        yield table_data
        i += 1
def execute(self, context: 'Context'):
    self.log.info(
        'Executing extract of %s into: %s',
        self.source_project_dataset_table,
        self.destination_cloud_storage_uris,
    )
    hook = BigQueryHook(
        gcp_conn_id=self.gcp_conn_id,
        delegate_to=self.delegate_to,
        location=self.location,
        impersonation_chain=self.impersonation_chain,
    )
    hook.run_extract(
        source_project_dataset_table=self.source_project_dataset_table,
        destination_cloud_storage_uris=self.destination_cloud_storage_uris,
        compression=self.compression,
        export_format=self.export_format,
        field_delimiter=self.field_delimiter,
        print_header=self.print_header,
        labels=self.labels,
    )
def execute(self, context):
    bq = BigQueryHook(gcp_conn_id=self.gcp_conn_id, use_legacy_sql=False)
    total_rows = []
    for table in self.table_list:
        q = self.sql.format(table)
        response = bq.get_pandas_df(q)
        if response.empty:
            response_value = 0
        else:
            response_value = response['f0_'].iloc[0]
        total_rows.append(response_value)
    if sum(total_rows) != self.pass_value:
        logging.info(sum(total_rows))
        logging.info('Tests failed')
        logging.info(total_rows)
        logging.info(self.pass_value)
        raise ValueError('Data quality check failed')
def load(gcp_conn_id: str, combined_data: str, gcs_bucket: str, gcs_object: str):
    gcs_hook = GCSHook(gcp_conn_id=gcp_conn_id)
    gcs_hook.upload(bucket_name=gcs_bucket, data=combined_data, object_name=gcs_object)
    bq_hook = BigQueryHook(bigquery_conn_id=gcp_conn_id)
    bq_hook.run_load(
        destination_project_dataset_table="augmented-works-297410.demo_dataset.sales_interactions2",
        source_uris="gs://{}/{}".format(gcs_bucket, gcs_object),
        write_disposition="WRITE_APPEND",
        source_format="CSV",
        skip_leading_rows=1,
        autodetect=False,
        schema_fields=[
            bigquery.SchemaField("date", "DATETIME").to_api_repr(),
            bigquery.SchemaField("location_name", "STRING").to_api_repr(),
            bigquery.SchemaField("average_temp", "FLOAT").to_api_repr(),
            bigquery.SchemaField("fullVisitorId", "STRING").to_api_repr(),
            bigquery.SchemaField("city", "STRING").to_api_repr(),
            bigquery.SchemaField("country", "STRING").to_api_repr(),
            bigquery.SchemaField("region", "STRING").to_api_repr(),
            bigquery.SchemaField("productCategory", "STRING").to_api_repr(),
            bigquery.SchemaField("productName", "STRING").to_api_repr(),
            bigquery.SchemaField("action_type", "INTEGER").to_api_repr(),
            bigquery.SchemaField("action_step", "INTEGER").to_api_repr(),
            bigquery.SchemaField("quantity", "FLOAT").to_api_repr(),
            bigquery.SchemaField("price", "FLOAT").to_api_repr(),
            bigquery.SchemaField("revenue", "FLOAT").to_api_repr(),
            bigquery.SchemaField("isImpression", "BOOL").to_api_repr(),
            bigquery.SchemaField("transactionId", "STRING").to_api_repr(),
            bigquery.SchemaField("transactionRevenue", "FLOAT").to_api_repr(),
            bigquery.SchemaField("transactionTax", "FLOAT").to_api_repr(),
            bigquery.SchemaField("transactionShipping", "FLOAT").to_api_repr(),
        ],
    )
def _bq_get_data(self):
    self.log.info('Fetching Data from:')
    self.log.info('Dataset: %s ; Table: %s', self.dataset_id, self.table_id)
    hook = BigQueryHook(
        gcp_conn_id=self.gcp_conn_id,
        delegate_to=self.delegate_to,
        location=self.location,
        impersonation_chain=self.impersonation_chain,
    )
    i = 0
    while True:
        rows = hook.list_rows(
            dataset_id=self.dataset_id,
            table_id=self.table_id,
            max_results=self.batch_size,
            selected_fields=self.selected_fields,
            start_index=i * self.batch_size,
        )
        if len(rows) == 0:
            self.log.info('Job Finished')
            return
        self.log.info('Total Extracted rows: %s', len(rows) + i * self.batch_size)
        # list_rows returns google.cloud.bigquery Row objects, not the legacy
        # {'f': [{'v': ...}]} payload, so unpack each row via its values().
        table_data = [list(row.values()) for row in rows]
        yield table_data
        i += 1
def execute(self, context: 'Context') -> None:
    big_query_hook = BigQueryHook(
        gcp_conn_id=self.gcp_conn_id,
        delegate_to=self.delegate_to,
        location=self.location,
        impersonation_chain=self.impersonation_chain,
    )
    mysql_hook = MySqlHook(schema=self.database, mysql_conn_id=self.mysql_conn_id)
    for rows in bigquery_get_data(
        self.log,
        self.dataset_id,
        self.table_id,
        big_query_hook,
        self.batch_size,
        self.selected_fields,
    ):
        mysql_hook.insert_rows(
            table=self.mysql_table,
            rows=rows,
            target_fields=self.selected_fields,
            replace=self.replace,
        )
def execute(self, context: 'Context'):
    self.log.info(
        'Executing extract of %s into: %s',
        self.source_project_dataset_table,
        self.destination_cloud_storage_uris,
    )
    hook = BigQueryHook(
        gcp_conn_id=self.gcp_conn_id,
        delegate_to=self.delegate_to,
        location=self.location,
        impersonation_chain=self.impersonation_chain,
    )
    self.hook = hook
    configuration = self._prepare_configuration()
    job_id = hook.generate_job_id(
        job_id=self.job_id,
        dag_id=self.dag_id,
        task_id=self.task_id,
        logical_date=context["logical_date"],
        configuration=configuration,
        force_rerun=self.force_rerun,
    )
    try:
        self.log.info("Executing: %s", configuration)
        job: ExtractJob = hook.insert_job(
            job_id=job_id,
            configuration=configuration,
            project_id=self.project_id,
            location=self.location,
            timeout=self.result_timeout,
            retry=self.result_retry,
        )
        self._handle_job_error(job)
    except Conflict:
        # If the job already exists retrieve it
        job = hook.get_job(
            project_id=self.project_id,
            location=self.location,
            job_id=job_id,
        )
        if job.state in self.reattach_states:
            # We are reattaching to a job
            job.result(timeout=self.result_timeout, retry=self.result_retry)
            self._handle_job_error(job)
        else:
            # Same job configuration so we need force_rerun
            raise AirflowException(
                f"Job with id: {job_id} already exists and is in {job.state} state. If you "
                f"want to force rerun it consider setting `force_rerun=True`."
                f"Or, if you want to reattach in this scenario add {job.state} to `reattach_states`"
            )
    conf = job.to_api_repr()["configuration"]["extract"]["sourceTable"]
    dataset_id, project_id, table_id = conf["datasetId"], conf["projectId"], conf["tableId"]
    BigQueryTableLink.persist(
        context=context,
        task_instance=self,
        dataset_id=dataset_id,
        project_id=project_id,
        table_id=table_id,
    )
def execute(self, context):
    bq_hook = BigQueryHook(
        bigquery_conn_id=self.bigquery_conn_id,
        delegate_to=self.delegate_to,
        location=self.location,
        impersonation_chain=self.impersonation_chain,
    )
    if not self.schema_fields:
        if self.schema_object and self.source_format != 'DATASTORE_BACKUP':
            gcs_hook = GCSHook(
                gcp_conn_id=self.google_cloud_storage_conn_id,
                delegate_to=self.delegate_to,
                impersonation_chain=self.impersonation_chain,
            )
            blob = gcs_hook.download(
                bucket_name=self.bucket,
                object_name=self.schema_object,
            )
            schema_fields = json.loads(blob.decode("utf-8"))
        elif self.schema_object is None and self.autodetect is False:
            raise AirflowException(
                'At least one of `schema_fields`, `schema_object`, or `autodetect` must be passed.'
            )
        else:
            schema_fields = None
    else:
        schema_fields = self.schema_fields
    source_uris = [f'gs://{self.bucket}/{source_object}' for source_object in self.source_objects]
    conn = bq_hook.get_conn()
    cursor = conn.cursor()
    if self.external_table:
        cursor.create_external_table(
            external_project_dataset_table=self.destination_project_dataset_table,
            schema_fields=schema_fields,
            source_uris=source_uris,
            source_format=self.source_format,
            compression=self.compression,
            skip_leading_rows=self.skip_leading_rows,
            field_delimiter=self.field_delimiter,
            max_bad_records=self.max_bad_records,
            quote_character=self.quote_character,
            ignore_unknown_values=self.ignore_unknown_values,
            allow_quoted_newlines=self.allow_quoted_newlines,
            allow_jagged_rows=self.allow_jagged_rows,
            encoding=self.encoding,
            src_fmt_configs=self.src_fmt_configs,
            encryption_configuration=self.encryption_configuration,
            labels=self.labels,
            description=self.description,
        )
    else:
        cursor.run_load(
            destination_project_dataset_table=self.destination_project_dataset_table,
            schema_fields=schema_fields,
            source_uris=source_uris,
            source_format=self.source_format,
            autodetect=self.autodetect,
            create_disposition=self.create_disposition,
            skip_leading_rows=self.skip_leading_rows,
            write_disposition=self.write_disposition,
            field_delimiter=self.field_delimiter,
            max_bad_records=self.max_bad_records,
            quote_character=self.quote_character,
            ignore_unknown_values=self.ignore_unknown_values,
            allow_quoted_newlines=self.allow_quoted_newlines,
            allow_jagged_rows=self.allow_jagged_rows,
            encoding=self.encoding,
            schema_update_options=self.schema_update_options,
            src_fmt_configs=self.src_fmt_configs,
            time_partitioning=self.time_partitioning,
            cluster_fields=self.cluster_fields,
            encryption_configuration=self.encryption_configuration,
            labels=self.labels,
            description=self.description,
        )
    if cursor.use_legacy_sql:
        escaped_table_name = f'[{self.destination_project_dataset_table}]'
    else:
        escaped_table_name = f'`{self.destination_project_dataset_table}`'
    if self.max_id_key:
        cursor.execute(f'SELECT MAX({self.max_id_key}) FROM {escaped_table_name}')
        row = cursor.fetchone()
        max_id = row[0] if row[0] else 0
        self.log.info(
            'Loaded BQ data with max %s.%s=%s',
            self.destination_project_dataset_table,
            self.max_id_key,
            max_id,
        )
def execute(self, context: 'Context'):
    bq_hook = BigQueryHook(
        gcp_conn_id=self.gcp_conn_id,
        delegate_to=self.delegate_to,
        location=self.location,
        impersonation_chain=self.impersonation_chain,
    )
    if not self.schema_fields:
        if self.schema_object and self.source_format != 'DATASTORE_BACKUP':
            gcs_hook = GCSHook(
                gcp_conn_id=self.gcp_conn_id,
                delegate_to=self.delegate_to,
                impersonation_chain=self.impersonation_chain,
            )
            blob = gcs_hook.download(
                bucket_name=self.bucket,
                object_name=self.schema_object,
            )
            schema_fields = json.loads(blob.decode("utf-8"))
        else:
            schema_fields = None
    else:
        schema_fields = self.schema_fields
    self.source_objects = (
        self.source_objects if isinstance(self.source_objects, list) else [self.source_objects]
    )
    source_uris = [f'gs://{self.bucket}/{source_object}' for source_object in self.source_objects]
    if self.external_table:
        with warnings.catch_warnings():
            warnings.simplefilter("ignore", DeprecationWarning)
            bq_hook.create_external_table(
                external_project_dataset_table=self.destination_project_dataset_table,
                schema_fields=schema_fields,
                source_uris=source_uris,
                source_format=self.source_format,
                autodetect=self.autodetect,
                compression=self.compression,
                skip_leading_rows=self.skip_leading_rows,
                field_delimiter=self.field_delimiter,
                max_bad_records=self.max_bad_records,
                quote_character=self.quote_character,
                ignore_unknown_values=self.ignore_unknown_values,
                allow_quoted_newlines=self.allow_quoted_newlines,
                allow_jagged_rows=self.allow_jagged_rows,
                encoding=self.encoding,
                src_fmt_configs=self.src_fmt_configs,
                encryption_configuration=self.encryption_configuration,
                labels=self.labels,
                description=self.description,
            )
    else:
        with warnings.catch_warnings():
            warnings.simplefilter("ignore", DeprecationWarning)
            bq_hook.run_load(
                destination_project_dataset_table=self.destination_project_dataset_table,
                schema_fields=schema_fields,
                source_uris=source_uris,
                source_format=self.source_format,
                autodetect=self.autodetect,
                create_disposition=self.create_disposition,
                skip_leading_rows=self.skip_leading_rows,
                write_disposition=self.write_disposition,
                field_delimiter=self.field_delimiter,
                max_bad_records=self.max_bad_records,
                quote_character=self.quote_character,
                ignore_unknown_values=self.ignore_unknown_values,
                allow_quoted_newlines=self.allow_quoted_newlines,
                allow_jagged_rows=self.allow_jagged_rows,
                encoding=self.encoding,
                schema_update_options=self.schema_update_options,
                src_fmt_configs=self.src_fmt_configs,
                time_partitioning=self.time_partitioning,
                cluster_fields=self.cluster_fields,
                encryption_configuration=self.encryption_configuration,
                labels=self.labels,
                description=self.description,
            )
    if self.max_id_key:
        select_command = f'SELECT MAX({self.max_id_key}) FROM `{self.destination_project_dataset_table}`'
        with warnings.catch_warnings():
            warnings.simplefilter("ignore", DeprecationWarning)
            job_id = bq_hook.run_query(
                sql=select_command,
                use_legacy_sql=False,
            )
        row = list(bq_hook.get_job(job_id).result())
        if row:
            max_id = row[0] if row[0] else 0
            self.log.info(
                'Loaded BQ data with max %s.%s=%s',
                self.destination_project_dataset_table,
                self.max_id_key,
                max_id,
            )
        else:
            raise RuntimeError(f"The {select_command} returned no rows!")
def get_hook(self):
    if self.conn_type == 'mysql':
        from airflow.providers.mysql.hooks.mysql import MySqlHook
        return MySqlHook(mysql_conn_id=self.conn_id)
    elif self.conn_type == 'google_cloud_platform':
        from airflow.providers.google.cloud.hooks.bigquery import BigQueryHook
        return BigQueryHook(bigquery_conn_id=self.conn_id)
    elif self.conn_type == 'postgres':
        from airflow.providers.postgres.hooks.postgres import PostgresHook
        return PostgresHook(postgres_conn_id=self.conn_id)
    elif self.conn_type == 'pig_cli':
        from airflow.providers.apache.pig.hooks.pig import PigCliHook
        return PigCliHook(pig_cli_conn_id=self.conn_id)
    elif self.conn_type == 'hive_cli':
        from airflow.providers.apache.hive.hooks.hive import HiveCliHook
        return HiveCliHook(hive_cli_conn_id=self.conn_id)
    elif self.conn_type == 'presto':
        from airflow.providers.presto.hooks.presto import PrestoHook
        return PrestoHook(presto_conn_id=self.conn_id)
    elif self.conn_type == 'hiveserver2':
        from airflow.providers.apache.hive.hooks.hive import HiveServer2Hook
        return HiveServer2Hook(hiveserver2_conn_id=self.conn_id)
    elif self.conn_type == 'sqlite':
        from airflow.providers.sqlite.hooks.sqlite import SqliteHook
        return SqliteHook(sqlite_conn_id=self.conn_id)
    elif self.conn_type == 'jdbc':
        from airflow.providers.jdbc.hooks.jdbc import JdbcHook
        return JdbcHook(jdbc_conn_id=self.conn_id)
    elif self.conn_type == 'mssql':
        from airflow.providers.microsoft.mssql.hooks.mssql import MsSqlHook
        return MsSqlHook(mssql_conn_id=self.conn_id)
    elif self.conn_type == 'odbc':
        from airflow.providers.odbc.hooks.odbc import OdbcHook
        return OdbcHook(odbc_conn_id=self.conn_id)
    elif self.conn_type == 'oracle':
        from airflow.providers.oracle.hooks.oracle import OracleHook
        return OracleHook(oracle_conn_id=self.conn_id)
    elif self.conn_type == 'vertica':
        from airflow.providers.vertica.hooks.vertica import VerticaHook
        return VerticaHook(vertica_conn_id=self.conn_id)
    elif self.conn_type == 'cloudant':
        from airflow.providers.cloudant.hooks.cloudant import CloudantHook
        return CloudantHook(cloudant_conn_id=self.conn_id)
    elif self.conn_type == 'jira':
        from airflow.providers.jira.hooks.jira import JiraHook
        return JiraHook(jira_conn_id=self.conn_id)
    elif self.conn_type == 'redis':
        from airflow.providers.redis.hooks.redis import RedisHook
        return RedisHook(redis_conn_id=self.conn_id)
    elif self.conn_type == 'wasb':
        from airflow.providers.microsoft.azure.hooks.wasb import WasbHook
        return WasbHook(wasb_conn_id=self.conn_id)
    elif self.conn_type == 'docker':
        from airflow.providers.docker.hooks.docker import DockerHook
        return DockerHook(docker_conn_id=self.conn_id)
    elif self.conn_type == 'azure_data_lake':
        from airflow.providers.microsoft.azure.hooks.azure_data_lake import AzureDataLakeHook
        return AzureDataLakeHook(azure_data_lake_conn_id=self.conn_id)
    elif self.conn_type == 'azure_cosmos':
        from airflow.providers.microsoft.azure.hooks.azure_cosmos import AzureCosmosDBHook
        return AzureCosmosDBHook(azure_cosmos_conn_id=self.conn_id)
    elif self.conn_type == 'cassandra':
        from airflow.providers.apache.cassandra.hooks.cassandra import CassandraHook
        return CassandraHook(cassandra_conn_id=self.conn_id)
    elif self.conn_type == 'mongo':
        from airflow.providers.mongo.hooks.mongo import MongoHook
        return MongoHook(conn_id=self.conn_id)
    elif self.conn_type == 'gcpcloudsql':
        from airflow.providers.google.cloud.hooks.cloud_sql import CloudSQLDatabaseHook
        return CloudSQLDatabaseHook(gcp_cloudsql_conn_id=self.conn_id)
    elif self.conn_type == 'grpc':
        from airflow.providers.grpc.hooks.grpc import GrpcHook
        return GrpcHook(grpc_conn_id=self.conn_id)
    raise AirflowException("Unknown hook type {}".format(self.conn_type))