def execute(self, context):
    if self.dataset:
        raw_tables = [f"{self.dataset}.{tbl}" for tbl in self.dst_table_names]
    else:
        raw_tables = self.dst_table_names

    dst_table_names = [format_table_name(x) for x in raw_tables]
    src_table_names = [
        format_table_name(x, is_staging=True) for x in raw_tables
    ]

    bq_hook = BigQueryHook(bigquery_conn_id=self.bigquery_conn_id)
    conn = bq_hook.get_conn()
    cursor = conn.cursor()

    # Copy each staging table over its production counterpart.
    for src, dst in zip(src_table_names, dst_table_names):
        cursor.run_copy(src, dst, write_disposition=self.write_disposition)

    # Once all tables have been copied, delete the staging tables.
    for src in src_table_names:
        cursor.run_table_delete(src)

    return dst_table_names
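# Illustrative note (names below are hypothetical, not from this repo): with
# self.dataset = "gtfs" and self.dst_table_names = ["calendar", "trips"], the
# loop above copies
#     format_table_name("gtfs.calendar", is_staging=True) -> format_table_name("gtfs.calendar")
#     format_table_name("gtfs.trips", is_staging=True)    -> format_table_name("gtfs.trips")
# i.e. each staging table is copied over its production counterpart before the
# staging tables are deleted.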
def execute(self, context):
    full_table_name = format_table_name(self.dst_table_name)

    bq_hook = BigQueryHook(bigquery_conn_id=self.bigquery_conn_id)
    conn = bq_hook.get_conn()
    cursor = conn.cursor()

    self.log.info("Running query into destination table: %s", full_table_name)
    self.log.info(self.sql)

    try:
        cursor.run_query(
            sql=self.sql,
            destination_dataset_table=full_table_name,
            write_disposition="WRITE_TRUNCATE",
            create_disposition=self.create_disposition,
            use_legacy_sql=False,
        )
        self.log.info("Query table created successfully: %s", full_table_name)
    except HttpError as err:
        raise AirflowException("BigQuery error: %s" % err.content)
def execute(self, context):
    full_table_name = format_table_name(self.src_table)
    dataset_id, table_id = full_table_name.split(".")

    bq_hook = BigQueryHook(bigquery_conn_id=self.bigquery_conn_id)
    conn = bq_hook.get_conn()
    cursor = conn.cursor()

    project_id = get_project_id()

    # tables().insert expects a Table resource with a camelCase tableReference.
    table_resource = {
        "tableReference": {
            "projectId": project_id,
            "datasetId": dataset_id,
            "tableId": table_id,
        },
        "materializedView": {"query": self.sql},
    }

    try:
        cursor.service.tables().insert(
            projectId=project_id,
            datasetId=dataset_id,
            body=table_resource,
        ).execute(num_retries=self.num_retries)
        self.log.info(
            "Table created successfully: %s:%s.%s",
            project_id, dataset_id, table_id,
        )
    except HttpError as err:
        raise AirflowException("BigQuery error: %s" % err.content)
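# Alternative sketch (an assumption, not what the operator above uses): the
# same materialized view could be created through the google-cloud-bigquery
# client via Table.from_api_repr, following the hint that was previously left
# as a comment in this method. The helper name and arguments here are
# illustrative only.
def _create_materialized_view_with_client(project_id, dataset_id, table_id, sql):
    from google.cloud import bigquery

    client = bigquery.Client(project=project_id)
    table = bigquery.Table.from_api_repr({
        "tableReference": {
            "projectId": project_id,
            "datasetId": dataset_id,
            "tableId": table_id,
        },
        "materializedView": {"query": sql},
    })
    # create_table sends the full table resource, including materializedView.
    return client.create_table(table)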
def execute(self, context):
    dst_table_name = format_table_name(self.dst_table_name, is_staging=True)

    bq_hook = BigQueryHook(bigquery_conn_id=self.bigquery_conn_id)
    conn = bq_hook.get_conn()
    cursor = conn.cursor()

    bucket = get_bucket()
    src_uris = f"{bucket}/{self.src_uris}"

    # The load job's sourceUris field expects a list of URI strings.
    cursor.run_load(
        dst_table_name,
        source_uris=[src_uris],
        schema_fields=self.schema_fields,
        autodetect=self.autodetect,
        skip_leading_rows=self.skip_leading_rows,
        write_disposition=self.write_disposition,
    )
def __new__(
    cls,
    parent_id,
    gcs_dirs_xcom,
    dst_dir,
    filename,
    schema_fields,
    table_name,
    task_id,
    dag,
):
    from airflow.utils.dates import days_ago

    args = {
        "start_date": days_ago(2),
    }

    bucket = get_bucket().replace("gs://", "", 1)
    full_table_name = format_table_name(table_name, is_staging=True)

    subdag = DAG(dag_id=f"{parent_id}.{task_id}", default_args=args)

    column_names = [schema["name"] for schema in schema_fields]

    # By convention, preface task names with the dag_id.
    op_col_select = PythonTaskflowOperator(
        task_id="select_cols",
        python_callable=_keep_columns,
        # Note that this input should have the form schedule/{execution_date}/...
        taskflow={"gcs_dirs": {"dag_id": parent_id, "task_ids": gcs_dirs_xcom}},
        op_kwargs={
            "dst_dir": dst_dir,
            "filename": filename,
            "required_cols": [],
            "optional_cols": column_names,
        },
        dag=subdag,
    )

    op_stage_bq = GoogleCloudStorageToBigQueryOperator(
        task_id="stage_bigquery",
        bucket=bucket,
        # We can't pull a list out of XCom without subclassing operators, so we
        # rely on knowing that the task passing in gcs_dirs_xcom data uses the
        # schedule/{execution_date} layout.
        source_objects=[
            "schedule/{{execution_date}}/*/%s/%s" % (dst_dir, filename)
        ],
        schema_fields=schema_fields,
        destination_project_dataset_table=full_table_name,
        create_disposition="CREATE_IF_NEEDED",
        write_disposition="WRITE_TRUNCATE",
        # The _keep_columns function includes headers in its output.
        skip_leading_rows=1,
        dag=subdag,
    )

    op_col_select >> op_stage_bq

    return SubDagOperator(subdag=subdag, dag=dag, task_id=task_id)
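# The helpers used throughout this module (format_table_name, get_bucket,
# get_project_id) are defined elsewhere in the project. The sketch below is a
# minimal illustration of the behavior the operators above assume; the staging
# convention and the bucket source are assumptions, not the project's actual
# implementation.
def _format_table_name_sketch(table_name, is_staging=False):
    # Hypothetical convention: staging tables live in a "staging_"-prefixed dataset.
    dataset, _, table = table_name.partition(".")
    if is_staging:
        dataset = f"staging_{dataset}"
    return f"{dataset}.{table}"


def _get_bucket_sketch():
    # Hypothetical: the target GCS bucket comes from an environment variable.
    import os

    return os.environ.get("GCS_BUCKET", "gs://example-bucket")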