Example #1
    def test_execute(self, mock_hook):
        operator = BigQueryCreateEmptyDatasetOperator(
            task_id=TASK_ID,
            dataset_id=TEST_DATASET,
            project_id=TEST_GCP_PROJECT_ID,
            location=TEST_DATASET_LOCATION)

        operator.execute(None)
        mock_hook.return_value.create_empty_dataset.assert_called_once_with(
            dataset_id=TEST_DATASET,
            project_id=TEST_GCP_PROJECT_ID,
            location=TEST_DATASET_LOCATION,
            dataset_reference={},
            exists_ok=False,
        )
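
The assertion in this test mirrors the hook API the operator delegates to. As a point of reference, here is a minimal sketch of calling BigQueryHook.create_empty_dataset directly with the same arguments; the connection id and sample values are assumptions, not part of the test above.

from airflow.providers.google.cloud.hooks.bigquery import BigQueryHook

# Sketch only: invoke the hook method the operator wraps. The connection id
# and the sample values below are placeholders.
hook = BigQueryHook(gcp_conn_id="google_cloud_default")
hook.create_empty_dataset(
    dataset_id="my_dataset",
    project_id="my-gcp-project",
    location="EU",
    dataset_reference={},   # optionally a full Dataset resource body
    exists_ok=False,        # fail if the dataset already exists
)
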
Example #2
        tags=["example"],
) as dag:
    # [START howto_operator_export_database_to_gcs]
    export_database_to_gcs = CloudFirestoreExportDatabaseOperator(
        task_id="export_database_to_gcs",
        project_id=FIRESTORE_PROJECT_ID,
        body={
            "outputUriPrefix": EXPORT_DESTINATION_URL,
            "collectionIds": [EXPORT_COLLECTION_ID]
        },
    )
    # [END howto_operator_export_database_to_gcs]

    create_dataset = BigQueryCreateEmptyDatasetOperator(
        task_id="create_dataset",
        dataset_id=DATASET_NAME,
        location=DATASET_LOCATION,
        project_id=GCP_PROJECT_ID,
    )

    delete_dataset = BigQueryDeleteDatasetOperator(task_id="delete_dataset",
                                                   dataset_id=DATASET_NAME,
                                                   project_id=GCP_PROJECT_ID,
                                                   delete_contents=True)

    # [START howto_operator_create_external_table_multiple_types]
    create_external_table_multiple_types = BigQueryCreateExternalTableOperator(
        task_id="create_external_table",
        bucket=BUCKET_NAME,
        table_resource={
            "tableReference": {
                "projectId": GCP_PROJECT_ID,
Example #3
with models.DAG(
    "example_facebook_ads_to_gcs",
    schedule_interval='@once',  # Override to match your needs
    start_date=datetime(2021, 1, 1),
    catchup=False,
) as dag:

    create_bucket = GCSCreateBucketOperator(
        task_id="create_bucket",
        bucket_name=GCS_BUCKET,
        project_id=GCP_PROJECT_ID,
    )

    create_dataset = BigQueryCreateEmptyDatasetOperator(
        task_id="create_dataset",
        dataset_id=DATASET_NAME,
    )

    create_table = BigQueryCreateEmptyTableOperator(
        task_id="create_table",
        dataset_id=DATASET_NAME,
        table_id=TABLE_NAME,
        schema_fields=[
            {'name': 'campaign_name', 'type': 'STRING', 'mode': 'NULLABLE'},
            {'name': 'campaign_id', 'type': 'STRING', 'mode': 'NULLABLE'},
            {'name': 'ad_id', 'type': 'STRING', 'mode': 'NULLABLE'},
            {'name': 'clicks', 'type': 'STRING', 'mode': 'NULLABLE'},
            {'name': 'impressions', 'type': 'STRING', 'mode': 'NULLABLE'},
        ],
    )
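
The snippet stops before any ordering is declared; below is a minimal sketch of chaining the setup tasks shown above (the original DAG presumably continues with the Facebook Ads transfer itself, which is not reproduced here).

    # Sketch only: run the resource-creation tasks in sequence.
    create_bucket >> create_dataset >> create_table
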
Example #4
        source_project_dataset_tables="{}.selected_data_from_external_table".format(DATASET_NAME),
        destination_project_dataset_table="{}.copy_of_selected_data_from_external_table".format(DATASET_NAME),
    )

    bigquery_to_gcs = BigQueryToGCSOperator(
        task_id="bigquery_to_gcs",
        source_project_dataset_table="{}.selected_data_from_external_table".format(DATASET_NAME),
        destination_cloud_storage_uris=[
            "gs://{}/export-bigquery.csv".format(DATA_EXPORT_BUCKET_NAME)
        ],
    )

    create_dataset = BigQueryCreateEmptyDatasetOperator(
        task_id="create-dataset", dataset_id=DATASET_NAME)

    create_dataset_with_location = BigQueryCreateEmptyDatasetOperator(
        task_id="create_dataset_with_location",
        dataset_id=LOCATION_DATASET_NAME,
        location=BQ_LOCATION)

    create_table = BigQueryCreateEmptyTableOperator(
        task_id="create_table",
        dataset_id=DATASET_NAME,
        table_id="test_table",
        schema_fields=[
            {
                "name": "emp_name",
                "type": "STRING",
                "mode": "REQUIRED"
Example #5
                gcs['updated_at']) + dt.timedelta(hours=7)
            json_gcs.append(gcs)

    storage_client = storage.Client()
    bucket = storage_client.get_bucket("airflow-postgres")
    blob = bucket.blob("users.csv")
    # to_csv() without a target path returns the CSV payload as a string,
    # which blob.upload_from_string() then writes to GCS.
    df = pd.DataFrame(data=json_gcs).to_csv(sep=",",
                                            header=False,
                                            index=False,
                                            quotechar='"',
                                            quoting=csv.QUOTE_ALL,
                                            encoding='utf-8')
    blob.upload_from_string(data=df)


create_users_dataset = BigQueryCreateEmptyDatasetOperator(
    task_id='users_dataset', dataset_id=DATASET_NAME, dag=dag)

convert_input_file = PythonOperator(task_id='convert_users',
                                    python_callable=users_converter,
                                    dag=dag)

load_users = GCSToBigQueryOperator(
    task_id='gcs_to_bigquery_users',
    bucket='airflow-postgres',
    source_objects=['users.csv'],
    destination_project_dataset_table=f"{DATASET_NAME}.{TABLE_NAME}",
    schema_fields=[
        {
            'name': 'created_at',
            'type': 'TIMESTAMP',
            'mode': 'REQUIRED'
Example #6
dag_id = "example_bigquery_sensors"

with models.DAG(
        dag_id,
        schedule_interval=None,  # Override to match your needs
        start_date=days_ago(1),
        tags=["example"],
        user_defined_macros={
            "DATASET": DATASET_NAME,
            "TABLE": TABLE_NAME
        },
        default_args={"project_id": PROJECT_ID},
) as dag_with_locations:
    create_dataset = BigQueryCreateEmptyDatasetOperator(
        task_id="create-dataset",
        dataset_id=DATASET_NAME,
        project_id=PROJECT_ID)

    create_table = BigQueryCreateEmptyTableOperator(
        task_id="create_table",
        dataset_id=DATASET_NAME,
        table_id=TABLE_NAME,
        schema_fields=SCHEMA,
        time_partitioning={
            "type": "DAY",
            "field": "ds",
        },
    )
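
    # SCHEMA is defined outside this snippet; for the DAY partitioning on "ds"
    # above to work, it has to include a DATE or TIMESTAMP field of that name.
    # Hedged sketch of a compatible definition (the original constant may differ):
    #
    #   SCHEMA = [
    #       {"name": "value", "type": "INTEGER", "mode": "REQUIRED"},
    #       {"name": "name", "type": "STRING", "mode": "NULLABLE"},
    #       {"name": "ds", "type": "DATE", "mode": "NULLABLE"},  # partitioning column
    #   ]
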
    # [START howto_sensor_bigquery_table]
    check_table_exists = BigQueryTableExistenceSensor(
        task_id="check_table_exists",
Example #7
    BigQueryDeleteDatasetOperator,
)
from airflow.providers.google.cloud.operators.gcs_to_bigquery import GCSToBigQueryOperator
from airflow.utils.dates import days_ago

DATASET_NAME = os.environ.get("GCP_DATASET_NAME", 'airflow_test')
TABLE_NAME = os.environ.get("GCP_TABLE_NAME", 'gcs_to_bq_table')

args = {'start_date': days_ago(2)}

dag = models.DAG(dag_id='example_gcs_to_bigquery_operator',
                 default_args=args,
                 schedule_interval=None,
                 tags=['example'])

create_test_dataset = BigQueryCreateEmptyDatasetOperator(
    task_id='create_airflow_test_dataset', dataset_id=DATASET_NAME, dag=dag)

# [START howto_operator_gcs_to_bigquery]
load_csv = GCSToBigQueryOperator(
    task_id='gcs_to_bigquery_example',
    bucket='cloud-samples-data',
    source_objects=['bigquery/us-states/us-states.csv'],
    destination_project_dataset_table=f"{DATASET_NAME}.{TABLE_NAME}",
    schema_fields=[
        {
            'name': 'name',
            'type': 'STRING',
            'mode': 'NULLABLE'
        },
        {
            'name': 'post_abbr',
Example #8
        query=QUERY,
        include_deleted=True,
        bucket_name=GCS_BUCKET,
        object_name=GCS_OBJ_PATH,
        salesforce_conn_id=SALESFORCE_CONN_ID,
        export_format='csv',
        coerce_to_timestamp=False,
        record_time_added=False,
        gcp_conn_id=GCS_CONN_ID,
        task_id="upload_to_gcs",
        dag=dag,
    )
    # [END howto_operator_salesforce_to_gcs]

    create_dataset = BigQueryCreateEmptyDatasetOperator(
        task_id="create_dataset", dataset_id=DATASET_NAME, project_id=GCP_PROJECT_ID, gcp_conn_id=GCS_CONN_ID
    )

    create_table = BigQueryCreateEmptyTableOperator(
        task_id="create_table",
        dataset_id=DATASET_NAME,
        table_id=TABLE_NAME,
        schema_fields=[
            {'name': 'id', 'type': 'STRING', 'mode': 'NULLABLE'},
            {'name': 'name', 'type': 'STRING', 'mode': 'NULLABLE'},
            {'name': 'company', 'type': 'STRING', 'mode': 'NULLABLE'},
            {'name': 'phone', 'type': 'STRING', 'mode': 'NULLABLE'},
            {'name': 'email', 'type': 'STRING', 'mode': 'NULLABLE'},
            {'name': 'createddate', 'type': 'STRING', 'mode': 'NULLABLE'},
            {'name': 'lastmodifieddate', 'type': 'STRING', 'mode': 'NULLABLE'},
            {'name': 'isdeleted', 'type': 'BOOL', 'mode': 'NULLABLE'},
Example #9
        f"(42, 'monthy python', '{INSERT_DATE}'), "
        f"(42, 'fishy fish', '{INSERT_DATE}');"
    )
    # [END howto_operator_bigquery_query]

    with models.DAG(
        dag_id,
        schedule_interval='@once',  # Override to match your needs
        start_date=datetime(2021, 1, 1),
        catchup=False,
        tags=["example"],
        user_defined_macros={"DATASET": DATASET, "TABLE": TABLE_1},
    ) as dag_with_locations:
        create_dataset = BigQueryCreateEmptyDatasetOperator(
            task_id="create-dataset",
            dataset_id=DATASET,
            location=location,
        )

        create_table_1 = BigQueryCreateEmptyTableOperator(
            task_id="create_table_1",
            dataset_id=DATASET,
            table_id=TABLE_1,
            schema_fields=SCHEMA,
            location=location,
        )

        create_table_2 = BigQueryCreateEmptyTableOperator(
            task_id="create_table_2",
            dataset_id=DATASET,
            table_id=TABLE_2,
Example #10
    resource="{{ task_instance.xcom_pull(task_ids='call_parameters', key='endPoint') }}",
    object_name='data/{{execution_date.strftime("%Y") }}/{{execution_date.strftime("%m") }}/{{execution_date.strftime("%d") }}',
    bucket_name="{{ var.json.env.bucket }}",
    partitioned_key="{{ task_instance.xcom_pull(task_ids='call_parameters', key='pickup_dimension_name') }}",
    query="{% include 'api_params.sql' %}",
    max_rows="{{ task_instance.xcom_pull(task_ids='total_record', key='return_value') }}",
    dag=dag,
)

create_staging_dataset = BigQueryCreateEmptyDatasetOperator(
    task_id="create_staging_dataset",
    gcp_conn_id="my_gcp_connection",
    dataset_id="{{ var.json.env.stg }}",
    dag=dag,
)

create_production_dataset = BigQueryCreateEmptyDatasetOperator(
    task_id="create_production_dataset",
    gcp_conn_id="my_gcp_connection",
    dataset_id="{{ var.json.env.production }}",
    dag=dag,
)
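
The templated dataset ids above are read from a single JSON Airflow Variable named env. Below is a minimal sketch of seeding that Variable so the {{ var.json.env.* }} templates resolve; the key names come from the templates above, while the values are placeholders.

import json

from airflow.models import Variable

# Sketch only: store the "env" Variable as a JSON string; var.json.env.<key>
# deserializes it at render time. The values here are placeholders.
Variable.set(
    "env",
    json.dumps({
        "bucket": "my-landing-bucket",
        "stg": "my_staging_dataset",
        "production": "my_production_dataset",
    }),
)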

create_fact_table = BigQueryInsertJobOperator(
    task_id="create_fact_table",
    configuration={
        "query": {
            "query": "{% include 'create_fact_table.sql' %}",
Example #11
IMPERSONATION_CHAIN = f"impersonated_account@{PROJECT_ID}.iam.gserviceaccount.com"

DATA_SAMPLE_GCS_URL_PARTS = urlparse(DATA_SAMPLE_GCS_URL)
DATA_SAMPLE_GCS_BUCKET_NAME = DATA_SAMPLE_GCS_URL_PARTS.netloc
DATA_SAMPLE_GCS_OBJECT_NAME = DATA_SAMPLE_GCS_URL_PARTS.path[1:]


with models.DAG(
    "example_bigquery_operations",
    schedule_interval=None,  # Override to match your needs
    start_date=days_ago(1),
    tags=["example"],
) as dag:
    # [START howto_operator_bigquery_create_dataset]
    create_dataset = BigQueryCreateEmptyDatasetOperator(
        task_id="create-dataset", 
        dataset_id=DATASET_NAME,
        impersonation_chain=IMPERSONATION_CHAIN
    )
    # [END howto_operator_bigquery_create_dataset]


    # [START howto_operator_bigquery_delete_dataset]
    delete_dataset = BigQueryDeleteDatasetOperator(
        task_id="delete_dataset", 
        dataset_id=DATASET_NAME, 
        delete_contents=True,
        impersonation_chain=IMPERSONATION_CHAIN
    )
    # [END howto_operator_bigquery_delete_dataset]
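
The snippet ends without declaring task order; a minimal sketch of the dependency that keeps the delete step after the create step (still inside the with block above):

    # Sketch only: create the dataset before deleting it again.
    create_dataset >> delete_dataset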