Example #1
    def test_execute(self, mock_hook):
        """Verify that executing the operator forwards all default
        external-table settings to ``BigQueryHook.create_external_table``.

        ``mock_hook`` is the patched BigQuery hook class injected by the test
        decorator; only the call made on its return value is inspected.
        """
        # One table spec serves both the operator input and the expected call,
        # so the two cannot drift apart (the original duplicated the
        # ``'{}.{}'.format(...)`` expression; f-strings match the rest of the
        # file's style).
        destination = f"{TEST_DATASET}.{TEST_TABLE_ID}"
        operator = BigQueryCreateExternalTableOperator(
            task_id=TASK_ID,
            destination_project_dataset_table=destination,
            schema_fields=[],
            bucket=TEST_GCS_BUCKET,
            source_objects=TEST_GCS_DATA,
            source_format=TEST_SOURCE_FORMAT,
        )

        operator.execute(None)

        expected_source_uris = [
            f"gs://{TEST_GCS_BUCKET}/{source_object}"
            for source_object in TEST_GCS_DATA
        ]
        # Every argument not set on the operator must arrive at the hook with
        # its documented default value.
        mock_hook.return_value.create_external_table.assert_called_once_with(
            external_project_dataset_table=destination,
            schema_fields=[],
            source_uris=expected_source_uris,
            source_format=TEST_SOURCE_FORMAT,
            compression='NONE',
            skip_leading_rows=0,
            field_delimiter=',',
            max_bad_records=0,
            quote_character=None,
            allow_quoted_newlines=False,
            allow_jagged_rows=False,
            src_fmt_configs={},
            labels=None,
            encryption_configuration=None,
        )
Example #2
 # Expose the Datastore/Firestore export files in GCS as a BigQuery external
 # table; schema and source format are supplied via one ``table_resource``.
 create_external_table_multiple_types = BigQueryCreateExternalTableOperator(
     task_id="create_external_table",
     bucket=BUCKET_NAME,
     source_objects=[
         f"{EXPORT_PREFIX}/all_namespaces/kind_{EXPORT_COLLECTION_ID}"
         f"/all_namespaces_kind_{EXPORT_COLLECTION_ID}.export_metadata"
     ],
     table_resource={
         "tableReference": {
             "projectId": GCP_PROJECT_ID,
             "datasetId": DATASET_NAME,
             "tableId": "firestore_data",
         },
         "schema": {
             "fields": [
                 {"name": "name", "type": "STRING"},
                 {"name": "post_abbr", "type": "STRING"},
             ]
         },
         "externalDataConfiguration": {
             "sourceFormat": "DATASTORE_BACKUP",
             "compression": "NONE",
             "csvOptions": {"skipLeadingRows": 1},
         },
     },
 )
Example #3
        location=DATASET_LOCATION,
        project_id=GCP_PROJECT_ID,
    )

    # Remove the dataset together with any tables inside it, so the example
    # DAG can be re-run from a clean slate.
    delete_dataset = BigQueryDeleteDatasetOperator(
        task_id="delete_dataset",
        dataset_id=DATASET_NAME,
        project_id=GCP_PROJECT_ID,
        delete_contents=True,
    )

    # [START howto_operator_create_external_table_multiple_types]
    # Register the Firestore export in GCS as a BigQuery external table.
    create_external_table_multiple_types = BigQueryCreateExternalTableOperator(
        task_id="create_external_table",
        bucket=BUCKET_NAME,
        destination_project_dataset_table=f"{GCP_PROJECT_ID}.{DATASET_NAME}.firestore_data",
        source_format="DATASTORE_BACKUP",
        source_objects=[
            f"{EXPORT_PREFIX}/all_namespaces/kind_{EXPORT_COLLECTION_ID}"
            f"/all_namespaces_kind_{EXPORT_COLLECTION_ID}.export_metadata"
        ],
    )
    # [END howto_operator_create_external_table_multiple_types]

    # Sanity check: count the rows BigQuery can read through the external table.
    read_data_from_gcs_multiple_types = BigQueryExecuteQueryOperator(
        task_id="execute_query",
        use_legacy_sql=False,
        sql=f"SELECT COUNT(*) FROM `{GCP_PROJECT_ID}.{DATASET_NAME}.firestore_data`",
    )

    # Firestore
Example #4
 # [START howto_operator_create_external_table_multiple_types]
 # Declare the external table entirely through ``table_resource``; the
 # Datastore backup files are read from ``BUCKET_NAME``.
 create_external_table_multiple_types = BigQueryCreateExternalTableOperator(
     task_id="create_external_table",
     bucket=BUCKET_NAME,
     table_resource={
         "tableReference": {
             "projectId": GCP_PROJECT_ID,
             "datasetId": DATASET_NAME,
             "tableId": "firestore_data",
         },
         "schema": {
             "fields": [
                 {"name": "name", "type": "STRING"},
                 {"name": "post_abbr", "type": "STRING"},
             ]
         },
         "externalDataConfiguration": {
             "sourceFormat": "DATASTORE_BACKUP",
             "compression": "NONE",
             "csvOptions": {"skipLeadingRows": 1},
         },
     },
 )
 # [END howto_operator_create_external_table_multiple_types]
Example #5
        max_results="10",
        selected_fields="value,to_address",
    )

    # Echo the rows the preceding "get-data" task pushed to XCom.
    get_data_result = BashOperator(
        task_id="get_data_result",
        bash_command="echo \"{{ task_instance.xcom_pull('get-data') }}\"",
    )

    # External table backed by the CSV sample file; the first row is a header
    # and is skipped during reads.
    create_external_table = BigQueryCreateExternalTableOperator(
        task_id="create_external_table",
        bucket=DATA_SAMPLE_GCS_BUCKET_NAME,
        source_objects=[DATA_SAMPLE_GCS_OBJECT_NAME],
        # f-string replaces str.format for consistency with the rest of the
        # file; the resulting table spec is identical.
        destination_project_dataset_table=f"{DATASET_NAME}.external_table",
        skip_leading_rows=1,
        schema_fields=[
            {"name": "name", "type": "STRING"},
            {"name": "post_abbr", "type": "STRING"},
        ],
    )

    # Query the external table and write the selection into a result table.
    # Both table specs use f-strings instead of the original awkwardly wrapped
    # ``str.format`` calls, matching the file's prevailing style; the rendered
    # strings are identical.
    execute_query_external_table = BigQueryExecuteQueryOperator(
        task_id="execute_query_external_table",
        destination_dataset_table=f"{DATASET_NAME}.selected_data_from_external_table",
        sql=f'SELECT * FROM `{DATASET_NAME}.external_table` WHERE name LIKE "W%"',
        use_legacy_sql=False,
    )
    # [END howto_operator_presto_to_gcs_multiple_types]

    # [START howto_operator_create_external_table_multiple_types]
    # NOTE(review): this example passes ``table_resource`` alongside
    # ``bucket``/``source_objects``/``schema_object``; some provider versions
    # treat these as mutually exclusive — confirm against the pinned
    # apache-airflow-providers-google version.
    create_external_table_multiple_types = BigQueryCreateExternalTableOperator(
        task_id="create_external_table_multiple_types",
        bucket=GCS_BUCKET,
        source_objects=[f"{safe_name(SOURCE_MULTIPLE_TYPES)}.*.json"],
        schema_object=f"{safe_name(SOURCE_MULTIPLE_TYPES)}-schema.json",
        table_resource={
            "tableReference": {
                "projectId": GCP_PROJECT_ID,
                "datasetId": DATASET_NAME,
                "tableId": f"{safe_name(SOURCE_MULTIPLE_TYPES)}",
            },
            "schema": {
                "fields": [
                    {
                        "name": "name",
                        "type": "STRING",
                    },
                    {
                        "name": "post_abbr",
                        "type": "STRING",
                    },
                ]
            },
            "externalDataConfiguration": {
                "sourceFormat": "NEWLINE_DELIMITED_JSON",
                "compression": "NONE",
                "csvOptions": {
                    "skipLeadingRows": 1,
                },
            },
        },
    )
    # [END howto_operator_create_external_table_multiple_types]

    read_data_from_gcs_multiple_types = BigQueryInsertJobOperator(
        task_id="read_data_from_gcs_multiple_types",
 # [START howto_operator_bigquery_create_external_table]
 # CSV-backed external table; ``sourceUris`` points directly at the sample
 # file, so no ``bucket``/``source_objects`` arguments are needed.
 create_external_table = BigQueryCreateExternalTableOperator(
     task_id="create_external_table",
     table_resource={
         "tableReference": {
             "projectId": PROJECT_ID,
             "datasetId": DATASET_NAME,
             "tableId": "external_table",
         },
         "schema": {
             "fields": [
                 {"name": "name", "type": "STRING"},
                 {"name": "post_abbr", "type": "STRING"},
             ]
         },
         "externalDataConfiguration": {
             "sourceFormat": "CSV",
             "compression": "NONE",
             "csvOptions": {"skipLeadingRows": 1},
             "sourceUris": [DATA_SAMPLE_GCS_URL],
         },
     },
 )
 # [END howto_operator_bigquery_create_external_table]
    # Export the Presto table into GCS as uncompressed JSON chunks plus a
    # BigQuery schema file; ``{{}}`` in the filename renders as a literal
    # ``{}`` chunk placeholder.
    presto_to_gcs_multiple_types = PrestoToGCSOperator(
        task_id="presto_to_gcs_multiple_types",
        sql=f"select * from {SOURCE_MULTIPLE_TYPES}",
        bucket=GCS_BUCKET,
        gzip=False,
        filename=f"{safe_name(SOURCE_MULTIPLE_TYPES)}.{{}}.json",
        schema_filename=f"{safe_name(SOURCE_MULTIPLE_TYPES)}-schema.json",
    )
    # [END howto_operator_presto_to_gcs_multiple_types]

    # [START howto_operator_create_external_table_multiple_types]
    # Register the exported JSON chunks as a single external table, using the
    # schema file the export task wrote next to them.
    create_external_table_multiple_types = BigQueryCreateExternalTableOperator(
        task_id="create_external_table_multiple_types",
        bucket=GCS_BUCKET,
        source_objects=[f"{safe_name(SOURCE_MULTIPLE_TYPES)}.*.json"],
        source_format="NEWLINE_DELIMITED_JSON",
        schema_object=f"{safe_name(SOURCE_MULTIPLE_TYPES)}-schema.json",
        destination_project_dataset_table=f"{DATASET_NAME}.{safe_name(SOURCE_MULTIPLE_TYPES)}",
    )
    # [END howto_operator_create_external_table_multiple_types]

    # Verify the registration by counting rows through the external table.
    read_data_from_gcs_multiple_types = BigQueryExecuteQueryOperator(
        task_id="read_data_from_gcs_multiple_types",
        use_legacy_sql=False,
        sql=f"SELECT COUNT(*) FROM `{GCP_PROJECT_ID}.{DATASET_NAME}.{safe_name(SOURCE_MULTIPLE_TYPES)}`",
    )

    # [START howto_operator_presto_to_gcs_many_chunks]
    presto_to_gcs_many_chunks = PrestoToGCSOperator(
        },
    )

    # TODO: Homework - research and try XCOM to communicate output values between 2 tasks/operators
    # Upload the locally generated Parquet file into the bucket's "raw" prefix.
    local_to_gcs_task = PythonOperator(
        task_id="local_to_gcs_task",
        python_callable=upload_to_gcs,
        op_kwargs=dict(
            bucket=BUCKET,
            object_name=f"raw/{parquet_file}",
            local_file=f"{path_to_local_home}/{parquet_file}",
        ),
    )

    # Point a BigQuery external table at the Parquet file uploaded above.
    bigquery_external_table_task = BigQueryCreateExternalTableOperator(
        task_id="bigquery_external_table_task",
        table_resource={
            "tableReference": {
                "projectId": PROJECT_ID,
                "datasetId": BIGQUERY_DATASET,
                "tableId": "external_table",
            },
            # No "schema" section here — presumably relying on the source
            # format carrying its own schema; confirm against BigQuery docs.
            "externalDataConfiguration": {
                "sourceFormat": "PARQUET",
                "sourceUris": [f"gs://{BUCKET}/raw/{parquet_file}"],
            },
        },
    )

    download_dataset_task >> format_to_parquet_task >> local_to_gcs_task >> bigquery_external_table_task