Example #1
    def test_execute(self, mock_hook):
        template_id = "template_id"
        version = 6
        parameters = {}

        op = DataprocInstantiateWorkflowTemplateOperator(
            task_id=TASK_ID,
            template_id=template_id,
            region=GCP_LOCATION,
            project_id=GCP_PROJECT,
            version=version,
            parameters=parameters,
            request_id=REQUEST_ID,
            retry=RETRY,
            timeout=TIMEOUT,
            metadata=METADATA,
            gcp_conn_id=GCP_CONN_ID,
            impersonation_chain=IMPERSONATION_CHAIN,
        )
        op.execute(context={})
        mock_hook.assert_called_once_with(gcp_conn_id=GCP_CONN_ID, impersonation_chain=IMPERSONATION_CHAIN)
        mock_hook.return_value.instantiate_workflow_template.assert_called_once_with(
            template_name=template_id,
            location=GCP_LOCATION,
            project_id=GCP_PROJECT,
            version=version,
            parameters=parameters,
            request_id=REQUEST_ID,
            retry=RETRY,
            timeout=TIMEOUT,
            metadata=METADATA,
        )
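The excerpt begins at the test method, so the decorator that supplies mock_hook is not shown. A minimal sketch of that scaffolding, assuming the hook is patched where the operator module imports it (the patch target and class name below are assumptions, not part of the original excerpt):

# Sketch of the scaffolding the excerpt relies on: DataprocHook is patched on the
# operators module so that the operator under test receives the mock.
from unittest import mock

class TestDataprocInstantiateWorkflowTemplateOperator:
    @mock.patch("airflow.providers.google.cloud.operators.dataproc.DataprocHook")
    def test_execute(self, mock_hook):
        ...  # body as shown in the excerpt above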
Example #2
    )
    # [END how_to_cloud_dataproc_update_cluster_operator]

    # [START how_to_cloud_dataproc_create_workflow_template]
    create_workflow_template = DataprocCreateWorkflowTemplateOperator(
        task_id="create_workflow_template",
        template=WORKFLOW_TEMPLATE,
        project_id=PROJECT_ID,
        region=REGION,
    )
    # [END how_to_cloud_dataproc_create_workflow_template]

    # [START how_to_cloud_dataproc_trigger_workflow_template]
    trigger_workflow = DataprocInstantiateWorkflowTemplateOperator(
        task_id="trigger_workflow",
        region=REGION,
        project_id=PROJECT_ID,
        template_id=WORKFLOW_NAME)
    # [END how_to_cloud_dataproc_trigger_workflow_template]

    pig_task = DataprocSubmitJobOperator(task_id="pig_task",
                                         job=PIG_JOB,
                                         region=REGION,
                                         project_id=PROJECT_ID)
    spark_sql_task = DataprocSubmitJobOperator(task_id="spark_sql_task",
                                               job=SPARK_SQL_JOB,
                                               region=REGION,
                                               project_id=PROJECT_ID)

    spark_task = DataprocSubmitJobOperator(task_id="spark_task",
                                           job=SPARK_JOB,
                                           region=REGION,
                                           project_id=PROJECT_ID)
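The DAG above also references WORKFLOW_NAME and WORKFLOW_TEMPLATE, which the excerpt does not show. A minimal, illustrative sketch of such definitions (the ids, names, and Pig query below are placeholders, and the cluster config is elided):

# Illustrative sketch only: the example file defines these constants elsewhere.
WORKFLOW_NAME = "example-workflow"
WORKFLOW_TEMPLATE = {
    "id": WORKFLOW_NAME,
    "placement": {
        "managed_cluster": {
            "cluster_name": "workflow-cluster",
            "config": {},  # cluster config elided for brevity
        }
    },
    "jobs": [
        {
            "step_id": "pig_job_1",
            "pig_job": {"query_list": {"queries": ["define sin HiveUDF('sin');"]}},
        }
    ],
}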
Example #3
default_args = {
    # Tell Airflow to start one day ago so that the DAG runs as soon as it is uploaded
    "start_date": days_ago(1),
    "project_id": project_id,
}

# Define a DAG (directed acyclic graph) of tasks.
# Any task you create within the context manager is automatically added to the
# DAG object.
with models.DAG(
        # The id you will see in the Airflow DAG page
        "dataproc_workflow_dag",
        default_args=default_args,
        # The interval with which to schedule the DAG (override to match your needs)
        schedule_interval=datetime.timedelta(days=1),
) as dag:

    start_template_job = DataprocInstantiateWorkflowTemplateOperator(
        # The task id of your job
        task_id="dataproc_workflow_dag",
        # The template id of your workflow
        template_id="sparkpi",
        project_id=project_id,
        # The region for the template
        region="us-central1",
    )

# [END composer_dataproc_workflow_instantiate_operator_tutorial]
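The tutorial snippet depends on a short preamble that is outside the excerpt. A minimal sketch of it, assuming Airflow 2 with the Google provider installed (the project_id value is a placeholder; Cloud Composer environments usually read it from an Airflow Variable or the environment):

# Preamble assumed by the snippet above; module paths are for Airflow 2.x with
# the apache-airflow-providers-google package installed.
import datetime

from airflow import models
from airflow.providers.google.cloud.operators.dataproc import (
    DataprocInstantiateWorkflowTemplateOperator,
)
from airflow.utils.dates import days_ago

project_id = "your-project-id"  # placeholder value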
Example #4
        task_id='transfer_grabbed_data',
        source_bucket=os.environ['GCP_GCS_BUCKET_LANDING'],
        destination_bucket=os.environ['GCP_GCS_BUCKET_WORKING'],
        gcp_conn_id='gr_storage_conn',
        source_object='{{ run_id }}',
        destination_object='{{ run_id }}',
    )

    spark_etl = DataprocInstantiateWorkflowTemplateOperator(
        task_id='spark_etl',
        template_id=os.environ['GCP_DATAPROC_TEMPLATE_ID'],
        project_id=os.environ['GCP_PROJECT_ID'],
        region=os.environ['GCP_REGION'],
        parameters={
            'PATH_TO_ETL_FILE': f"gs://{os.environ['GCP_GCS_BUCKET_ROUTINE']}/etl/etl.py",
            'CLUSTER_NAME': 'goodreads-etl',
            'ARG_SOURCE_BUCKET': os.environ['GCP_GCS_BUCKET_WORKING'],
            'ARG_DESTINATION_BUCKET': os.environ['GCP_GCS_BUCKET_PROCESSED'],
            'ARG_OBJECT_PREFIX': '{{ run_id }}',
        },
        gcp_conn_id='gr_dataproc_conn',
    )

    drop_stage_user_data = BigQueryDeleteTableOperator(
        task_id="drop_stage_user_data",
        deletion_dataset_table=f"{GCP_PROJECT_ID}.{GCP_BQ_DATASET_STAGE}.{GCP_BQ_TABLE_USERS}",
        gcp_conn_id='gr_bigquery_conn',
        ignore_if_missing=True,
    )
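The keys passed to parameters above must match parameters declared on the Dataproc workflow template that GCP_DATAPROC_TEMPLATE_ID points to, which is not shown here. A minimal sketch of such declarations, assuming a single PySpark step with step id 'etl' (the step id and field paths are assumptions, written in the Dataproc workflow-parameter field syntax):

# Hypothetical parameter declarations on the referenced workflow template.
# The step id 'etl' and the field paths are assumed for illustration.
TEMPLATE_PARAMETERS = [
    {"name": "PATH_TO_ETL_FILE", "fields": ["jobs['etl'].pysparkJob.mainPythonFileUri"]},
    {"name": "CLUSTER_NAME", "fields": ["placement.managedCluster.clusterName"]},
    {"name": "ARG_SOURCE_BUCKET", "fields": ["jobs['etl'].pysparkJob.args[0]"]},
    {"name": "ARG_DESTINATION_BUCKET", "fields": ["jobs['etl'].pysparkJob.args[1]"]},
    {"name": "ARG_OBJECT_PREFIX", "fields": ["jobs['etl'].pysparkJob.args[2]"]},
]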