Example #1
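    # mock_hook and mock_md5 come from @mock.patch decorators omitted from the snippet
    # (they patch the BigQuery hook and the md5 used to hash the job id).
    # Insert hits a Conflict, the existing job is already DONE (not in reattach_states)
    # and force_rerun is off, so execute() must raise AirflowException.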
    def test_execute_no_force_rerun(self, mock_hook, mock_md5):
        job_id = "123456"
        hash_ = "hash"
        real_job_id = f"{job_id}_{hash_}"
        mock_md5.return_value.hexdigest.return_value = hash_

        configuration = {
            "query": {
                "query": "SELECT * FROM any",
                "useLegacySql": False,
            }
        }

        mock_hook.return_value.insert_job.return_value.result.side_effect = Conflict(
            "any")
        job = MagicMock(
            job_id=real_job_id,
            error_result=False,
            state="DONE",
            done=lambda: True,
        )
        mock_hook.return_value.get_job.return_value = job

        op = BigQueryInsertJobOperator(
            task_id="insert_query_job",
            configuration=configuration,
            location=TEST_DATASET_LOCATION,
            job_id=job_id,
            project_id=TEST_GCP_PROJECT_ID,
            reattach_states={"PENDING"},
        )
        # No force rerun
        with pytest.raises(AirflowException):
            op.execute({})
Example #2
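    # Happy path: insert_job succeeds, the hashed job id is forwarded to the hook,
    # and execute() returns that job id.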
    def test_execute_success(self, mock_hook, mock_md5):
        job_id = "123456"
        hash_ = "hash"
        real_job_id = f"{job_id}_{hash_}"
        mock_md5.return_value.hexdigest.return_value = hash_

        configuration = {
            "query": {
                "query": "SELECT * FROM any",
                "useLegacySql": False,
            }
        }
        mock_hook.return_value.insert_job.return_value = MagicMock(
            job_id=real_job_id, error_result=False)

        op = BigQueryInsertJobOperator(
            task_id="insert_query_job",
            configuration=configuration,
            location=TEST_DATASET_LOCATION,
            job_id=job_id,
            project_id=TEST_GCP_PROJECT_ID,
        )
        result = op.execute({})

        mock_hook.return_value.insert_job.assert_called_once_with(
            configuration=configuration,
            location=TEST_DATASET_LOCATION,
            job_id=real_job_id,
            project_id=TEST_GCP_PROJECT_ID,
        )

        assert result == real_job_id
Example #3
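    # Verifies that configuration, location, job_id and project_id are passed through
    # unchanged to the hook's insert_job and that execute() returns the job id.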
    def test_execute(self, mock_hook):
        job_id = "123456"
        configuration = {
            "query": {
                "query": "SELECT * FROM any",
                "useLegacySql": False,
            }
        }
        mock_hook.return_value.insert_job.return_value = MagicMock(
            job_id=job_id)

        op = BigQueryInsertJobOperator(task_id="insert_query_job",
                                       configuration=configuration,
                                       location=TEST_DATASET_LOCATION,
                                       job_id=job_id,
                                       project_id=TEST_GCP_PROJECT_ID)
        result = op.execute({})

        mock_hook.return_value.insert_job.assert_called_once_with(
            configuration=configuration,
            location=TEST_DATASET_LOCATION,
            job_id=job_id,
            project_id=TEST_GCP_PROJECT_ID,
        )

        assert result == job_id
Example #4
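    # Idempotent reattach: insert raises Conflict, so the operator fetches the existing
    # job via get_job and polls it (MockJob.reload) until done() returns True;
    # mock_sleep_generator zeroes out the back-off between polls.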
    def test_execute_idempotency(self, mock_hook, mock_sleep_generator):
        job_id = "123456"

        configuration = {
            "query": {
                "query": "SELECT * FROM any",
                "useLegacySql": False,
            }
        }

        class MockJob:
            _call_no = 0
            _done = False

            def __init__(self):
                pass

            def reload(self):
                if MockJob._call_no == 3:
                    MockJob._done = True
                else:
                    MockJob._call_no += 1

            def done(self):
                return MockJob._done

            @property
            def job_id(self):
                return job_id

        mock_hook.return_value.insert_job.return_value.result.side_effect = Conflict(
            "any")
        mock_sleep_generator.return_value = [0, 0, 0, 0, 0]
        mock_hook.return_value.get_job.return_value = MockJob()

        op = BigQueryInsertJobOperator(task_id="insert_query_job",
                                       configuration=configuration,
                                       location=TEST_DATASET_LOCATION,
                                       job_id=job_id,
                                       project_id=TEST_GCP_PROJECT_ID)
        result = op.execute({})

        assert MockJob._call_no == 3

        mock_hook.return_value.get_job.assert_called_once_with(
            location=TEST_DATASET_LOCATION,
            job_id=job_id,
            project_id=TEST_GCP_PROJECT_ID,
        )

        assert result == job_id
Example #5
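    # Parametrized over (test_dag_id, expected_job_id): checks that _job_id() builds a
    # valid BigQuery job id from the DAG and task ids, the execution date and a
    # configuration hash.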
    def test_job_id_validity(self, mock_md5, test_dag_id, expected_job_id):
        hash_ = "hash"
        mock_md5.return_value.hexdigest.return_value = hash_
        context = {"execution_date": datetime(2020, 1, 23)}
        configuration = {
            "query": {
                "query": "SELECT * FROM any",
                "useLegacySql": False,
            }
        }
        with DAG(dag_id=test_dag_id, start_date=datetime(2020, 1, 23)):
            op = BigQueryInsertJobOperator(task_id="test_job_id",
                                           configuration=configuration,
                                           project_id=TEST_GCP_PROJECT_ID)
        assert op._job_id(context) == expected_job_id
Example #6
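    # Reattach: insert raises Conflict and the existing job is PENDING, which is listed
    # in reattach_states, so the operator re-attaches, waits on job.result() and
    # returns the job id.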
    def test_execute_reattach(self, mock_hook, mock_md5):
        job_id = "123456"
        hash_ = "hash"
        real_job_id = f"{job_id}_{hash_}"
        mock_md5.return_value.hexdigest.return_value = hash_

        configuration = {
            "query": {
                "query": "SELECT * FROM any",
                "useLegacySql": False,
            }
        }

        mock_hook.return_value.insert_job.return_value.result.side_effect = Conflict(
            "any")
        job = MagicMock(
            job_id=real_job_id,
            error_result=False,
            state="PENDING",
            done=lambda: False,
        )
        mock_hook.return_value.get_job.return_value = job

        op = BigQueryInsertJobOperator(
            task_id="insert_query_job",
            configuration=configuration,
            location=TEST_DATASET_LOCATION,
            job_id=job_id,
            project_id=TEST_GCP_PROJECT_ID,
            reattach_states={"PENDING"},
        )
        result = op.execute({})

        mock_hook.return_value.get_job.assert_called_once_with(
            location=TEST_DATASET_LOCATION,
            job_id=real_job_id,
            project_id=TEST_GCP_PROJECT_ID,
        )

        job.result.assert_called_once_with()

        assert result == real_job_id
Example #7
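    # The inserted job reports error_result=True, so execute() must raise AirflowException.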
    def test_execute_failure(self, mock_hook, mock_md5):
        job_id = "123456"
        hash_ = "hash"
        real_job_id = f"{job_id}_{hash_}"
        mock_md5.return_value.hexdigest.return_value = hash_

        configuration = {
            "query": {
                "query": "SELECT * FROM any",
                "useLegacySql": False,
            }
        }
        mock_hook.return_value.insert_job.return_value = MagicMock(job_id=real_job_id, error_result=True)

        op = BigQueryInsertJobOperator(
            task_id="insert_query_job",
            configuration=configuration,
            location=TEST_DATASET_LOCATION,
            job_id=job_id,
            project_id=TEST_GCP_PROJECT_ID,
        )
        with pytest.raises(AirflowException):
            op.execute({})
Example #8
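# Builds a multi-row INSERT statement against DATASET_NAME.TABLE_1 and submits it to
# BigQuery as a standard-SQL query job.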
def bq_insert():

    INSERT_ROWS_QUERY = (
        f"INSERT into {DATASET_NAME}.{TABLE_1} VALUES ( 'storage#object','tl-alp-storage-transfer/TempusVCF/1.6.0/~/xO_germline_db.rdata/1613720449739061','https://www.googleapis.com/storage/v1/b/dag_test_bucket_sub/o/test1%2F','TempusVCF/1.10.0/xF_bgrd_ave.csv','tl-alp-storage-transfer','1613720449739061','1','application/x-directory; charset=UTF-8','2021-02-25 07:40:49.822 UTC','2021-02-25 07:40:49.822 UTC','STANDARD','2021-02-19 07:40:49.822 UTC','823540771','1B2M2Y8AsgTpgAmY7PhCfg==','https://www.googleapis.com/download/storage/v1/b/dag_test_bucket_sub/o/test1%2F?generation=1613720449739061&alt=media','AAAAAA==','CLXSsZG59e4CEAE='),( 'storage#object','tl-alp-storage-transfer/TempusVCF/1.6.5/hs_depth.csv/1613720449739061','https://www.googleapis.com/storage/v1/b/dag_test_bucket_sub/o/test1%2F','TempusVCF/1.6.5/hs_depth.csv','tl-alp-storage-transfer','1613720449739061','1','application/x-directory; charset=UTF-8','2021-02-25 07:40:49.822 UTC','2021-02-25 07:40:49.822 UTC','STANDARD','2021-02-19 07:40:49.822 UTC','443893876','1B2M2Y8AsgTpgAmY7PhCfg==','https://www.googleapis.com/download/storage/v1/b/dag_test_bucket_sub/o/test1%2F?generation=1613720449739061&alt=media','AAAAAA==','CLXSsZG59e4CEAE='),( 'storage#object','tl-alp-storage-transfer/TempusVCF/1.6.0/~/xO_germline_db.rdata/1613720449739061','https://www.googleapis.com/storage/v1/b/dag_test_bucket_sub/o/test1%2F','TempusVCF/1.6.5/xf_whitelist_cpras.txt','tl-alp-storage-transfer','1613720449739061','1','application/x-directory; charset=UTF-8','2021-02-25 07:40:49.822 UTC','2021-02-25 07:40:49.822 UTC','STANDARD','2021-02-19 07:40:49.822 UTC','8400','1B2M2Y8AsgTpgAmY7PhCfg==','https://www.googleapis.com/download/storage/v1/b/dag_test_bucket_sub/o/test1%2F?generation=1613720449739061&alt=media','AAAAAA==','CLXSsZG59e4CEAE='),( 'storage#object','tl-alp-storage-transfer/TempusVCF/1.6.0/~/xO_germline_db.rdata/1613720449739061','https://www.googleapis.com/storage/v1/b/dag_test_bucket_sub/o/test1%2F','TempusVCF/1.6.3/xe_gene_coordinates.bed','tl-alp-storage-transfer','1613720449739061','1','application/x-directory; charset=UTF-8','2021-02-25 07:40:49.822 UTC','2021-02-25 07:40:49.822 UTC','STANDARD','2021-02-19 07:40:49.822 UTC','5803378','1B2M2Y8AsgTpgAmY7PhCfg==','https://www.googleapis.com/download/storage/v1/b/dag_test_bucket_sub/o/test1%2F?generation=1613720449739061&alt=media','AAAAAA==','CLXSsZG59e4CEAE='),( 'storage#object','tl-alp-storage-transfer/TempusVCF/1.6.0/~/xO_germline_db.rdata/1613720449739061','https://www.googleapis.com/storage/v1/b/dag_test_bucket_sub/o/test1%2F','TempusVCF/1.10.0/xF_germline_db.csv','tl-alp-storage-transfer','1613720449739061','1','application/x-directory; charset=UTF-8','2021-02-25 07:40:49.822 UTC','2021-02-25 07:40:49.822 UTC','STANDARD','2021-02-19 07:40:49.822 UTC','3225246','1B2M2Y8AsgTpgAmY7PhCfg==','https://www.googleapis.com/download/storage/v1/b/dag_test_bucket_sub/o/test1%2F?generation=1613720449739061&alt=media','AAAAAA==','CLXSsZG59e4CEAE=');"
    )

    print(INSERT_ROWS_QUERY)

    # task_id=f"insert_query_{name}"
    bq_job = BigQueryInsertJobOperator(task_id='insert_job',
                                       configuration={
                                           "query": {
                                               "query": INSERT_ROWS_QUERY,
                                               "useLegacySql": False,
                                           }
                                       })
Example #9
    )
    # [END howto_operator_facebook_ads_to_gcs]

    load_csv = GCSToBigQueryOperator(
        task_id='gcs_to_bq_example',
        bucket=GCS_BUCKET,
        source_objects=[GCS_OBJ_PATH],
        destination_project_dataset_table=f"{DATASET_NAME}.{TABLE_NAME}",
        write_disposition='WRITE_TRUNCATE',
    )

    read_data_from_gcs_many_chunks = BigQueryInsertJobOperator(
        task_id="read_data_from_gcs_many_chunks",
        configuration={
            "query": {
                "query": f"SELECT COUNT(*) FROM `{GCP_PROJECT_ID}.{DATASET_NAME}.{TABLE_NAME}`",
                "useLegacySql": False,
            }
        },
    )

    delete_bucket = GCSDeleteBucketOperator(
        task_id="delete_bucket",
        bucket_name=GCS_BUCKET,
    )

    delete_dataset = BigQueryDeleteDatasetOperator(
        task_id="delete_dataset",
        project_id=GCP_PROJECT_ID,
        dataset_id=DATASET_NAME,
        delete_contents=True,
    )
Example #10
            schema_fields=SCHEMA,
            location=location,
        )

        create_dataset >> [create_table_1, create_table_2]

        delete_dataset = BigQueryDeleteDatasetOperator(
            task_id="delete_dataset", dataset_id=DATASET_NAME, delete_contents=True
        )

        # [START howto_operator_bigquery_insert_job]
        insert_query_job = BigQueryInsertJobOperator(
            task_id="insert_query_job",
            configuration={
                "query": {
                    "query": INSERT_ROWS_QUERY,
                    "useLegacySql": "False",
                }
            },
            location=location,
        )
        # [END howto_operator_bigquery_insert_job]

        # [START howto_operator_bigquery_select_job]
        select_query_job = BigQueryInsertJobOperator(
            task_id="select_query_job",
            configuration={
                "query": {
                    "query": "{% include 'example_bigquery_query.sql' %}",
                    "useLegacySql": False,
                }
            },
Example #11
            },
            "externalDataConfiguration": {
                "sourceFormat": "NEWLINE_DELIMITED_JSON",
                "compression": "NONE",
                "csvOptions": {"skipLeadingRows": 1},
            },
        },
        schema_object=f"{safe_name(SOURCE_MULTIPLE_TYPES)}-schema.json",
    )
    # [END howto_operator_create_external_table_multiple_types]

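    # Sanity check: count the rows now visible through the external table created over
    # the exported GCS objects.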
    read_data_from_gcs_multiple_types = BigQueryInsertJobOperator(
        task_id="read_data_from_gcs_multiple_types",
        configuration={
            "query": {
                "query": f"SELECT COUNT(*) FROM `{GCP_PROJECT_ID}.{DATASET_NAME}."
                f"{safe_name(SOURCE_MULTIPLE_TYPES)}`",
                "useLegacySql": False,
            }
        },
    )

    # [START howto_operator_presto_to_gcs_many_chunks]
    presto_to_gcs_many_chunks = PrestoToGCSOperator(
        task_id="presto_to_gcs_many_chunks",
        sql=f"select * from {SOURCE_CUSTOMER_TABLE}",
        bucket=GCS_BUCKET,
        filename=f"{safe_name(SOURCE_CUSTOMER_TABLE)}.{{}}.json",
        schema_filename=f"{safe_name(SOURCE_CUSTOMER_TABLE)}-schema.json",
        approx_max_file_size_bytes=10_000_000,
        gzip=False,
    )
Example #12
            schema_fields=SCHEMA,
            location=location,
        )

        create_dataset >> [create_table_1, create_table_2]

        delete_dataset = BigQueryDeleteDatasetOperator(
            task_id="delete_dataset", dataset_id=DATASET_NAME, delete_contents=True
        )

        # [START howto_operator_bigquery_insert_job]
        insert_query_job = BigQueryInsertJobOperator(
            task_id="insert_query_job",
            configuration={
                "query": {
                    "query": INSERT_ROWS_QUERY,
                    "useLegacySql": False,
                }
            },
            location=location,
        )
        # [END howto_operator_bigquery_insert_job]

        # [START howto_operator_bigquery_select_job]
        select_query_job = BigQueryInsertJobOperator(
            task_id="select_query_job",
            configuration={
                "query": {
                    "query": "{% include 'example_bigquery_query.sql' %}",
                    "useLegacySql": False,
                }
            },
            "field": "ds",
        },
    )
    # [START howto_sensor_bigquery_table]
    check_table_exists = BigQueryTableExistenceSensor(
        task_id="check_table_exists",
        project_id=PROJECT_ID,
        dataset_id=DATASET_NAME,
        table_id=TABLE_NAME)
    # [END howto_sensor_bigquery_table]

    execute_insert_query = BigQueryInsertJobOperator(
        task_id="execute_insert_query",
        configuration={
            "query": {
                "query": INSERT_ROWS_QUERY,
                "useLegacySql": False,
            }
        },
    )

    # [START howto_sensor_bigquery_table_partition]
    check_table_partition_exists = BigQueryTablePartitionExistenceSensor(
        task_id="check_table_partition_exists",
        project_id=PROJECT_ID,
        dataset_id=DATASET_NAME,
        table_id=TABLE_NAME,
        partition_id=PARTITION_NAME,
    )
    # [END howto_sensor_bigquery_table_partition]
Example #14
    create_warehouse_reviews_table = BigQueryCreateEmptyTableOperator(
        task_id='create_warehouse_reviews_table',
        dataset_id=os.environ['GCP_BQ_DATASET_WAREHOUSE'],
        table_id=GCP_BQ_TABLE_REVIEWS,
        project_id=GCP_PROJECT_ID,
        bigquery_conn_id='gr_bigquery_conn',
        schema_fields=warehouse.reviews_schema)

    load_warehouse_data = DummyOperator(task_id='load_warehouse_data')

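    # Each warehouse load query (defined in the external `warehouse` module) is submitted
    # as its own BigQuery job over the 'gr_bigquery_conn' connection.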
    load_warehouse_users_data = BigQueryInsertJobOperator(
        task_id="load_warehouse_users_data",
        project_id=os.environ['GCP_PROJECT_ID'],
        job_id='users',
        configuration={
            "query": {
                "query": warehouse.load_users,
                "useLegacySql": False,
            }
        },
        gcp_conn_id='gr_bigquery_conn')

    load_warehouse_authors_data = BigQueryInsertJobOperator(
        task_id="load_warehouse_authors_data",
        project_id=os.environ['GCP_PROJECT_ID'],
        job_id='users',
        configuration={
            "query": {
                "query": warehouse.load_authors,
                "useLegacySql": False,
            }
Example #15
            table_id=TABLE_2,
            schema_fields=SCHEMA,
            location=location,
        )

        create_dataset >> [create_table_1, create_table_2]

        delete_dataset = BigQueryDeleteDatasetOperator(
            task_id="delete_dataset", dataset_id=DATASET, delete_contents=True)

        # [START howto_operator_bigquery_insert_job]
        insert_query_job = BigQueryInsertJobOperator(
            task_id="insert_query_job",
            configuration={
                "query": {
                    "query": INSERT_ROWS_QUERY,
                    "useLegacySql": False,
                }
            },
            location=location,
        )
        # [END howto_operator_bigquery_insert_job]

        # [START howto_operator_bigquery_select_job]
        select_query_job = BigQueryInsertJobOperator(
            task_id="select_query_job",
            configuration={
                "query": {
                    "query": "{% include 'example_bigquery_query.sql' %}",
                    "useLegacySql": False,
                }
            },
Example #16
 join_customer_purchase = BigQueryInsertJobOperator(
     task_id='bq_join_customer_purchase',
     gcp_conn_id='bigquery_default',
     project_id=TRF_PRJ,
     location=BQ_LOCATION,
     configuration={
         'jobType': 'QUERY',
         'query': {
             'query':
             """SELECT
               c.id as customer_id,
               p.id as purchase_id,
               p.item as item,
               p.price as price,
               p.timestamp as timestamp
             FROM `{dwh_0_prj}.{dwh_0_dataset}.customers` c
             JOIN `{dwh_0_prj}.{dwh_0_dataset}.purchases` p ON c.id = p.customer_id
           """.format(
                 dwh_0_prj=DWH_LAND_PRJ,
                 dwh_0_dataset=DWH_LAND_BQ_DATASET,
             ),
             'destinationTable': {
                 'projectId': DWH_CURATED_PRJ,
                 'datasetId': DWH_CURATED_BQ_DATASET,
                 'tableId': 'customer_purchase'
             },
             'writeDisposition':
             'WRITE_TRUNCATE',
             "useLegacySql":
             False
         }
     },
     impersonation_chain=[TRF_SA_BQ])
Example #17
            location=location,
        )

        create_dataset >> [create_table_1, create_table_2]

        delete_dataset = BigQueryDeleteDatasetOperator(
            task_id="delete_dataset",
            dataset_id=DATASET_NAME,
            delete_contents=True)

        # [START howto_operator_bigquery_insert_job]
        insert_query_job = BigQueryInsertJobOperator(
            task_id="insert_query_job",
            configuration={
                "query": {
                    "query": INSERT_ROWS_QUERY,
                    "useLegacySql": False,
                }
            },
            location=location,
        )
        # [END howto_operator_bigquery_insert_job]

        execute_insert_query = BigQueryExecuteQueryOperator(
            task_id="execute_insert_query",
            sql=INSERT_ROWS_QUERY,
            use_legacy_sql=False,
            location=location)

        bigquery_execute_multi_query = BigQueryExecuteQueryOperator(
            task_id="execute_multi_query",
            sql=[
Example #18
        ##############################################################
        # Check mandatory columns for null values:                   #
        # this check passes only if the number of NULL values in     #
        # the title column equals 1.                                 #
        ##############################################################
        validateTitleNull = BigQueryValueCheckOperator(
          task_id="validateNullTitle",
          sql=f"SELECT COUNT(*) FROM {DATASET_NAME}.{TABLE_NAME} WHERE title IS NULL",
          pass_value=1,
          use_legacy_sql=False
        )


        ##############################################################
        # Update every row to set tbd1 to the lowercase title        #
        ##############################################################
        UPDATE_STATEMENT = f"""UPDATE `{DATASET_NAME}.{TABLE_NAME}` SET tbd1 = LOWER(title) WHERE TRUE"""
        
        updateTableColumns = BigQueryInsertJobOperator(
          task_id="select_query_job",
          configuration={
            "query": {
              "query": UPDATE_STATEMENT,
              "useLegacySql": False,
            }
          }
        )

        loadGcsToBq >> validateTitleNull >> updateTableColumns
Example #19
    logs_to_parquet = MyDataprocSubmitJobOperator(
        task_id='logs_to_parquet',
        job={
            'placement': {
                'cluster_name': dataproc_config['cluster_name']
            },
            'pyspark_job': {
                'main_python_file_uri':
                f"gs://{bucket_config['jobs']}/jobs/UploadFilesToParquet.py",
                'properties': {
                    'spark.master': 'yarn',
                    'spark.deploy-mode': 'cluster',
                    'spark.driver.cores': '1',
                    'spark.driver.memory': '1g',
                    'spark.executor.cores': '2',
                    'spark.executor.memory': '4g',
                    'spark.executor.instances': '4'
                },
                'args': [
                    '--GCS_UPLOAD_URI',
                    f"gs://{bucket_config['upload_data']}/logs/20210101/*.log",
                    '--GCS_DATA_LAKE_URI', '/user/hdfs/raw_data/'
                ]
            }
        })

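    # bigquery_load_storage_config is defined elsewhere in this DAG file; it is expected
    # to hold the "load" job configuration that imports the Parquet output into BigQuery.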
    hdfs_to_bigquery = BigQueryInsertJobOperator(
        task_id='hdfs_to_bigquery', configuration=bigquery_load_storage_config)

logs_to_parquet >> hdfs_to_bigquery
Example #20
                "csvOptions": {
                    "skipLeadingRows": 1
                },
            },
        },
        source_objects=[
            f"{EXPORT_PREFIX}/all_namespaces/kind_{EXPORT_COLLECTION_ID}"
            f"/all_namespaces_kind_{EXPORT_COLLECTION_ID}.export_metadata"
        ],
    )
    # [END howto_operator_create_external_table_multiple_types]

    read_data_from_gcs_multiple_types = BigQueryInsertJobOperator(
        task_id="execute_query",
        configuration={
            "query": {
                "query":
                f"SELECT COUNT(*) FROM `{GCP_PROJECT_ID}.{DATASET_NAME}.firestore_data`",
                "useLegacySql": False,
            }
        },
    )

    # Firestore
    export_database_to_gcs >> create_dataset

    # BigQuery
    create_dataset >> create_external_table_multiple_types
    create_external_table_multiple_types >> read_data_from_gcs_multiple_types
    read_data_from_gcs_multiple_types >> delete_dataset
Example #21
    dataset_id="{{ var.json.env.stg }}",
    dag=dag,
)

create_production_dataset = BigQueryCreateEmptyDatasetOperator(
    task_id="create_production_dataset",
    gcp_conn_id="my_gcp_connection",
    dataset_id="{{ var.json.env.production }}",
    dag=dag,
)

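# The fact and geo tables are built from SQL files that Jinja renders at runtime via
# {% include %} in the templated "query" field.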
create_fact_table = BigQueryInsertJobOperator(
    task_id="create_fact_table",
    configuration={
        "query": {
            "query": "{% include 'create_fact_table.sql' %}",
            "use_legacy_sql": False,
        }
    },
    dag=dag,
)

create_geo_table = BigQueryInsertJobOperator(
    task_id="create_geo_table",
    configuration={
        "query": {
            "query": "{% include 'create_geo_table.sql' %}",
            "use_legacy_sql": False,
        }
    },
    dag=dag,
)