Example #1
    def execute(self, context):
        if self.dataset:
            raw_tables = [
                f"{self.dataset}.{tbl}" for tbl in self.dst_table_names
            ]
        else:
            raw_tables = self.dst_table_names

        dst_table_names = [format_table_name(x) for x in raw_tables]

        src_table_names = [
            format_table_name(x, is_staging=True) for x in raw_tables
        ]

        bq_hook = BigQueryHook(bigquery_conn_id=self.bigquery_conn_id)
        conn = bq_hook.get_conn()
        cursor = conn.cursor()

        for src, dst in zip(src_table_names, dst_table_names):
            cursor.run_copy(src, dst, write_disposition=self.write_disposition)

        # once all tables have been copied, delete the staging tables
        for src in src_table_names:
            cursor.run_table_delete(src)

        return dst_table_names
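All of the examples on this page call a project-local format_table_name helper that is not shown here. Judging from how it is used (an is_staging keyword, and a "dataset.table" string that Example #3 splits on "."), a minimal hypothetical sketch could look like the following; the staging prefix and the default-dataset fallback are assumptions, not the real implementation.

# Hypothetical sketch of format_table_name; prefix and default dataset are assumptions.
def format_table_name(table_name: str, is_staging: bool = False) -> str:
    dataset, sep, table = table_name.partition(".")
    if not sep:
        # bare table name passed in; fall back to an assumed default dataset
        dataset, table = "my_default_dataset", table_name
    prefix = "staging_" if is_staging else ""
    return f"{dataset}.{prefix}{table}"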
Example #2
    def execute(self, context):
        full_table_name = format_table_name(self.dst_table_name)
        print(full_table_name)

        bq_hook = BigQueryHook(bigquery_conn_id=self.bigquery_conn_id)
        conn = bq_hook.get_conn()
        cursor = conn.cursor()

        print(self.sql)

        # table_resource = {
        #    "tableReference": {"table_id": table_id},
        #    "materializedView": {"query": self.sql}
        # }

        # bigquery.Table.from_api_repr(table_resource)

        try:
            cursor.run_query(
                sql=self.sql,
                destination_dataset_table=full_table_name,
                write_disposition="WRITE_TRUNCATE",
                create_disposition=self.create_disposition,
                use_legacy_sql=False,
            )

            self.log.info("Query table as created successfully: {}".format(
                full_table_name))
        except HttpError as err:
            raise AirflowException("BigQuery error: %s" % err.content)
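Assuming the execute() above belongs to a query-to-table operator class, the sketch below shows how it might be wired into a DAG. The operator class name, import path, SQL, connection id, and table name are all placeholders, not taken from the source.

# Hypothetical wiring sketch only; BigQueryQueryToTableOperator is a placeholder
# name for whatever class defines the execute() above.
from airflow import DAG
from airflow.utils.dates import days_ago

from my_project.operators import BigQueryQueryToTableOperator  # assumed import path

with DAG(
    dag_id="rebuild_reporting_table",
    default_args={"start_date": days_ago(1)},
    schedule_interval="@daily",
) as dag:
    rebuild_table = BigQueryQueryToTableOperator(
        task_id="rebuild_table",
        sql="SELECT * FROM `my_dataset.source_view`",  # placeholder query
        dst_table_name="my_dataset.reporting_table",   # placeholder table
        create_disposition="CREATE_IF_NEEDED",
        bigquery_conn_id="bigquery_default",
    )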
Example #3
    def execute(self, context):
        full_table_name = format_table_name(self.src_table)
        dataset_id, table_id = full_table_name.split(".")

        bq_hook = BigQueryHook(bigquery_conn_id=self.bigquery_conn_id)
        conn = bq_hook.get_conn()
        cursor = conn.cursor()

        table_resource = {
            "tableReference": {
                "table_id": table_id
            },
            "materializedView": {
                "query": self.sql
            },
        }

        # bigquery.Table.from_api_repr(table_resource)
        project_id = get_project_id()

        try:
            cursor.service.tables().insert(
                projectId=project_id,
                datasetId=dataset_id,
                body=table_resource).execute(num_retries=self.num_retries)

            self.log.info("Table created successfully: %s:%s.%s", project_id,
                          dataset_id, table_id)
        except HttpError as err:
            raise AirflowException("BigQuery error: %s" % err.content)
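The commented-out bigquery.Table.from_api_repr line above hints that the same table resource can be used with the google-cloud-bigquery client instead of the raw discovery API. A hedged sketch of that alternative, with placeholder dataset, table, and query values:

# Sketch only: create the same materialized view via the google-cloud-bigquery
# client; dataset, table, and query values are placeholders.
from google.cloud import bigquery

client = bigquery.Client()

table_resource = {
    "tableReference": {
        "projectId": client.project,
        "datasetId": "my_dataset",          # placeholder
        "tableId": "my_materialized_view",  # placeholder
    },
    # camelCase keys, matching BigQuery's REST representation
    "materializedView": {
        "query": "SELECT col, COUNT(*) AS n FROM `my_dataset.src_table` GROUP BY col",
    },
}

# Build the Table object from the REST resource and create it.
client.create_table(bigquery.Table.from_api_repr(table_resource))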
Example #4
    def execute(self, context):
        dst_table_name = format_table_name(self.dst_table_name,
                                           is_staging=True)

        bq_hook = BigQueryHook(bigquery_conn_id=self.bigquery_conn_id)
        conn = bq_hook.get_conn()
        cursor = conn.cursor()

        bucket = get_bucket()
        src_uris = f"{bucket}/{self.src_uris}"

        cursor.run_load(
            dst_table_name,
            source_uris=src_uris,
            schema_fields=self.schema_fields,
            autodetect=self.autodetect,
            skip_leading_rows=self.skip_leading_rows,
            write_disposition=self.write_disposition,
        )
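Examples #3–#5 also rely on get_project_id and get_bucket helpers that are not shown. From their usage, get_bucket returns a gs:// URI (Example #4 joins it into load source URIs; Example #5 strips the gs:// prefix), and get_project_id returns the GCP project id. A hypothetical sketch, assuming both read environment variables; the variable names and defaults are placeholders.

# Hypothetical sketches of the configuration helpers used in these examples.
import os

def get_bucket() -> str:
    # Full "gs://..." URI; Example #5 strips the scheme when it needs a bare bucket name.
    return os.environ.get("DATA_BUCKET", "gs://my-data-bucket")

def get_project_id() -> str:
    # GCP project that owns the BigQuery datasets.
    return os.environ.get("GOOGLE_CLOUD_PROJECT", "my-gcp-project")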
Example #5
    def __new__(
        cls,
        parent_id,
        gcs_dirs_xcom,
        dst_dir,
        filename,
        schema_fields,
        table_name,
        task_id,
        dag,
    ):
        from airflow.utils.dates import days_ago

        args = {
            "start_date": days_ago(2),
        }

        bucket = get_bucket().replace("gs://", "", 1)
        full_table_name = format_table_name(table_name, is_staging=True)

        subdag = DAG(dag_id=f"{parent_id}.{task_id}", default_args=args)

        column_names = [schema["name"] for schema in schema_fields]

        # by convention, preface task names with dag_id
        op_col_select = PythonTaskflowOperator(
            task_id="select_cols",
            python_callable=_keep_columns,
            # note that this input should have form schedule/{execution_date}/...
            taskflow={
                "gcs_dirs": {
                    "dag_id": parent_id,
                    "task_ids": gcs_dirs_xcom
                }
            },
            op_kwargs={
                "dst_dir": dst_dir,
                "filename": filename,
                "required_cols": [],
                "optional_cols": column_names,
            },
            dag=subdag,
        )

        op_stage_bq = GoogleCloudStorageToBigQueryOperator(
            task_id="stage_bigquery",
            bucket=bucket,
            # note that we can't really pull a list out of XCom without subclassing
            # operators, so we rely on knowing that the task passing in
            # gcs_dirs_xcom data is using schedule/{execution_date}
            source_objects=[
                "schedule/{{execution_date}}/*/%s/%s" % (dst_dir, filename)
            ],
            schema_fields=schema_fields,
            destination_project_dataset_table=full_table_name,
            create_disposition="CREATE_IF_NEEDED",
            write_disposition="WRITE_TRUNCATE",
            # _keep_columns function includes headers in output
            skip_leading_rows=1,
            dag=subdag,
        )

        op_col_select >> op_stage_bq

        return SubDagOperator(subdag=subdag, dag=dag, task_id=task_id)
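For context, a hypothetical call site for the factory above; its class name is not given on this page, so CsvToBigQuerySubDagOperator and every id, file, schema, and table value below are placeholders. The keyword arguments mirror the __new__ signature.

# Hypothetical usage sketch; class name, upstream task id, and all names are placeholders.
from airflow import DAG
from airflow.utils.dates import days_ago

from my_project.operators import CsvToBigQuerySubDagOperator  # assumed import path

with DAG(dag_id="load_schedule_feeds", default_args={"start_date": days_ago(2)}) as dag:
    stage_routes = CsvToBigQuerySubDagOperator(
        parent_id="load_schedule_feeds",
        gcs_dirs_xcom="list_gcs_dirs",   # upstream task id whose XCom holds the GCS dirs
        dst_dir="processed",
        filename="routes.csv",
        schema_fields=[{"name": "route_id", "type": "STRING", "mode": "NULLABLE"}],
        table_name="gtfs.routes",
        task_id="stage_routes",
        dag=dag,
    )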