Example #1
    def test_on_kill(self, mock_hook):
        job = {}
        job_id = "job_id"
        mock_hook.return_value.wait_for_job.return_value = None
        mock_hook.return_value.submit_job.return_value.reference.job_id = job_id

        op = DataprocSubmitJobOperator(
            task_id=TASK_ID,
            location=GCP_LOCATION,
            project_id=GCP_PROJECT,
            job=job,
            gcp_conn_id=GCP_CONN_ID,
            retry=RETRY,
            timeout=TIMEOUT,
            metadata=METADATA,
            request_id=REQUEST_ID,
            impersonation_chain=IMPERSONATION_CHAIN,
            cancel_on_kill=False,
        )
        op.execute(context={})

        op.on_kill()
        mock_hook.return_value.cancel_job.assert_not_called()

        op.cancel_on_kill = True
        op.on_kill()
        mock_hook.return_value.cancel_job.assert_called_once_with(
            project_id=GCP_PROJECT, location=GCP_LOCATION, job_id=job_id
        )
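Both test excerpts (Example #1 above and Example #2 below) are methods of a test class: the `mock_hook` argument is injected by a `mock.patch` decorator and the upper-case names are module-level constants. A minimal sketch of that scaffolding, assuming placeholder values and a patch target that are not shown in the excerpts themselves:

from unittest import mock

from airflow.providers.google.cloud.operators.dataproc import DataprocSubmitJobOperator

# All of these values are placeholders for illustration only.
DATAPROC_PATH = "airflow.providers.google.cloud.operators.dataproc.{}"
TASK_ID = "task-id"
GCP_LOCATION = "test-location"
GCP_PROJECT = "test-project"
GCP_CONN_ID = "google_cloud_default"
RETRY = mock.MagicMock()
TIMEOUT = 120
METADATA = [("key", "value")]
REQUEST_ID = "request_id_uuid"
IMPERSONATION_CHAIN = ["ACCOUNT_1", "ACCOUNT_2", "ACCOUNT_3"]

class TestDataprocSubmitJobOperator:
    @mock.patch(DATAPROC_PATH.format("DataprocHook"))
    def test_on_kill(self, mock_hook):
        ...  # body as shown in Example #1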
Example #2
    def test_execute(self, mock_hook):
        job = {}
        job_id = "job_id"
        mock_hook.return_value.wait_for_job.return_value = None
        mock_hook.return_value.submit_job.return_value.reference.job_id = job_id

        op = DataprocSubmitJobOperator(
            task_id=TASK_ID,
            location=GCP_LOCATION,
            project_id=GCP_PROJECT,
            job=job,
            gcp_conn_id=GCP_CONN_ID,
            retry=RETRY,
            timeout=TIMEOUT,
            metadata=METADATA,
            request_id=REQUEST_ID,
        )
        op.execute(context={})

        mock_hook.assert_called_once_with(gcp_conn_id=GCP_CONN_ID)
        mock_hook.return_value.submit_job.assert_called_once_with(
            project_id=GCP_PROJECT,
            location=GCP_LOCATION,
            job=job,
            request_id=REQUEST_ID,
            retry=RETRY,
            timeout=TIMEOUT,
            metadata=METADATA,
        )
        mock_hook.return_value.wait_for_job.assert_called_once_with(
            job_id=job_id, project_id=GCP_PROJECT, location=GCP_LOCATION)
Example #3
                                                   project_id=PROJECT_ID,
                                                   cluster=CLUSTER,
                                                   region=REGION)

    scale_cluster = DataprocUpdateClusterOperator(
        task_id="scale_cluster",
        cluster_name=CLUSTER_NAME,
        cluster=CLUSTER_UPDATE,
        update_mask=UPDATE_MASK,
        graceful_decommission_timeout=TIMEOUT,
        project_id=PROJECT_ID,
        location=REGION,
    )

    pig_task = DataprocSubmitJobOperator(task_id="pig_task",
                                         job=PIG_JOB,
                                         location=REGION,
                                         project_id=PROJECT_ID)

    spark_sql_task = DataprocSubmitJobOperator(
        task_id="spark_sql_task",
        job=SPARK_SQL_JOB,
        location=REGION,
        project_id=PROJECT_ID,
    )

    spark_task = DataprocSubmitJobOperator(task_id="spark_task",
                                           job=SPARK_JOB,
                                           location=REGION,
                                           project_id=PROJECT_ID)

    pyspark_task = DataprocSubmitJobOperator(task_id="pyspark_task",
Example #4
        template=WORKFLOW_TEMPLATE,
        project_id=PROJECT_ID,
        region=REGION,
    )
    # [END how_to_cloud_dataproc_create_workflow_template]

    # [START how_to_cloud_dataproc_trigger_workflow_template]
    trigger_workflow = DataprocInstantiateWorkflowTemplateOperator(
        task_id="trigger_workflow",
        region=REGION,
        project_id=PROJECT_ID,
        template_id=WORKFLOW_NAME)
    # [END how_to_cloud_dataproc_trigger_workflow_template]

    pig_task = DataprocSubmitJobOperator(task_id="pig_task",
                                         job=PIG_JOB,
                                         region=REGION,
                                         project_id=PROJECT_ID)
    spark_sql_task = DataprocSubmitJobOperator(task_id="spark_sql_task",
                                               job=SPARK_SQL_JOB,
                                               region=REGION,
                                               project_id=PROJECT_ID)

    spark_task = DataprocSubmitJobOperator(task_id="spark_task",
                                           job=SPARK_JOB,
                                           region=REGION,
                                           project_id=PROJECT_ID)

    # [START cloud_dataproc_async_submit_sensor]
    spark_task_async = DataprocSubmitJobOperator(task_id="spark_task_async",
                                                 job=SPARK_JOB,
                                                 region=REGION,
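Example #4's `[START cloud_dataproc_async_submit_sensor]` block is cut off above. The usual pattern pairs `asynchronous=True` on DataprocSubmitJobOperator with a DataprocJobSensor that waits on the submitted job id; a sketch of how the block typically continues (the sensor arguments shown here are assumptions, not taken from the excerpt):

    # Requires: from airflow.providers.google.cloud.sensors.dataproc import DataprocJobSensor
    spark_task_async = DataprocSubmitJobOperator(
        task_id="spark_task_async",
        job=SPARK_JOB,
        region=REGION,
        project_id=PROJECT_ID,
        asynchronous=True,  # return as soon as the job is submitted instead of waiting for it
    )
    spark_task_async_sensor = DataprocJobSensor(
        task_id="spark_task_async_sensor_task",
        region=REGION,
        project_id=PROJECT_ID,
        dataproc_job_id=spark_task_async.output,  # job id pushed to XCom by the submit task
        poke_interval=10,
    )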
Example #5
    # By default you won't have permission to run `gcloud dataproc jobs submit` against the cluster you created.
    # Running this script lets you submit jobs to the cluster through gcloud.
    # Be sure to pass the correct cluster name, cluster region and your group entity.
    assign_permissions = BashOperator(
        task_id="assign_permissions_for_dataproc_cluster",
        bash_command=f"bash {DAGS_FOLDER}/dataproc-set-iam.sh {CLUSTER_NAME} {REGION} group:{GROUP_NAME}",
    )

    # BashOperator that delays the Dataproc delete operator for the specified sleep time (keeps the cluster alive)
    # sleep_task = BashOperator(task_id="sleep_task_to_keep_dataproc_cluster_alive_3h", bash_command="sleep 8h",)

    cg_producer_pyspark_task = DataprocSubmitJobOperator(
        task_id="run_cee_cg_producer",
        impersonation_chain=CONNECT_SA,
        job=CG_PYSPARK_JOB,
        location=REGION,
        cluster_name=CLUSTER_NAME,
        cluster_config=CLUSTER_CONFIG,
        labels={"tenant": TENANT, "created-by": USER, },
    )

    pr_producer_pyspark_task = DataprocSubmitJobOperator(
        task_id="run_cee_pr_producer",
        impersonation_chain=CONNECT_SA,
        job=PR_PYSPARK_JOB,
        location=REGION,
        cluster_name=CLUSTER_NAME,
        cluster_config=CLUSTER_CONFIG,
        labels={"tenant": TENANT, "created-by": USER, },
    )
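CG_PYSPARK_JOB and PR_PYSPARK_JOB are defined elsewhere in this DAG file. For DataprocSubmitJobOperator a PySpark job is a plain dict following the Dataproc Job resource; a hypothetical sketch of the shape such a constant would take (the file URI and args are invented):

CG_PYSPARK_JOB = {
    "reference": {"project_id": PROJECT_ID},
    "placement": {"cluster_name": CLUSTER_NAME},
    "pyspark_job": {
        "main_python_file_uri": "gs://some-bucket/jobs/cee_cg_producer.py",  # invented path
        "args": ["--tenant", TENANT],
    },
}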
Example #6
    },
}

with models.DAG("gcp_dataproc_spark", start_date=days_ago(1), schedule_interval=None) as dag:
    # [START how_to_cloud_dataproc_create_cluster_operator]
    create_cluster = DataprocCreateClusterOperator(
        task_id="create_cluster",
        project_id=PROJECT_ID,
        cluster_config=CLUSTER_CONFIG,
        region=REGION,
        cluster_name=CLUSTER_NAME,
    )
    # [END how_to_cloud_dataproc_create_cluster_operator]

    spark_task = DataprocSubmitJobOperator(
    	task_id="spark_task", job=SPARK_JOB, location=REGION, project_id=PROJECT_ID
    )

    # [START how_to_cloud_dataproc_delete_cluster_operator]
    delete_cluster = DataprocDeleteClusterOperator(
        task_id="delete_cluster", project_id=PROJECT_ID, cluster_name=CLUSTER_NAME, region=REGION
    )
    # [END how_to_cloud_dataproc_delete_cluster_operator]
    

    create_cluster >> spark_task >> delete_cluster
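CLUSTER_CONFIG and SPARK_JOB in Example #6 are defined above the excerpt. A sketch of plausible definitions, following the shapes the Dataproc API expects (machine types, disk sizes and the jar path are placeholder choices):

CLUSTER_CONFIG = {
    "master_config": {
        "num_instances": 1,
        "machine_type_uri": "n1-standard-4",
        "disk_config": {"boot_disk_type": "pd-standard", "boot_disk_size_gb": 1024},
    },
    "worker_config": {
        "num_instances": 2,
        "machine_type_uri": "n1-standard-4",
        "disk_config": {"boot_disk_type": "pd-standard", "boot_disk_size_gb": 1024},
    },
}

SPARK_JOB = {
    "reference": {"project_id": PROJECT_ID},
    "placement": {"cluster_name": CLUSTER_NAME},
    "spark_job": {
        "jar_file_uris": ["file:///usr/lib/spark/examples/jars/spark-examples.jar"],
        "main_class": "org.apache.spark.examples.SparkPi",
    },
}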




Example #7
    default_args=default_dag_args) as dag:

    create_dataproc_acme_sales_cluster = dataproc_operator.DataprocClusterCreateOperator(
        task_id='create_dataproc_acme_sales_cluster',
        cluster_name=DATAPROC_CLUSTER_NAME,
        region=REGION,
        zone=ZONE,
        num_workers=3,
        master_machine_type=DATAPROC_MASTER_MACHINE_TYPE,
        worker_machine_type=DATAPROC_MASTER_MACHINE_TYPE,
        image_version=IMAGE_VERSION,
        project_id=PROJECT_ID)

    locations_staging_spark_job = DataprocSubmitJobOperator(
        task_id="locations_staging_spark_job",
        job=LOCATIONS_STAGING_SPARK_JOB,
        location=REGION,
        project_id=PROJECT_ID)

    products_staging_spark_job = DataprocSubmitJobOperator(
        task_id="products_staging_spark_job",
        job=PRODUCTS_STAGING_SPARK_JOB,
        location=REGION,
        project_id=PROJECT_ID)

    transactions_staging_spark_job = DataprocSubmitJobOperator(
        task_id="transactions_staging_spark_job",
        job=TRANSACTIONS_STAGING_SPARK_JOB,
        location=REGION,
        project_id=PROJECT_ID)
Example #8
    schedule_interval='00 6 * * *',
    max_active_runs=1,
    default_args=default_args
) as dag_daily:
    create_cluster = DataprocCreateClusterOperator(
        task_id="create_cluster",
        cluster_name=CLUSTER_NAME_DAILY,
        region=REGION,
        project_id=PROJECT_ID,
        cluster_config=CLUSTER_CONFIGURATION,
    )

    pig_job_nltk_stopwords = DataprocSubmitJobOperator(
        task_id="pig_job_nltk_stopwords",
        job=get_pig_job_config("sh python -m nltk.downloader stopwords",
                               CLUSTER_NAME_DAILY),
        location=REGION,
        project_id=PROJECT_ID
    )

    pig_job_spacy_vocabulary = DataprocSubmitJobOperator(
        task_id="pig_job_spacy_vocabulary",
        job=get_pig_job_config("sh python -m spacy download es_core_news_lg",
                               CLUSTER_NAME_DAILY),
        location=REGION,
        project_id=PROJECT_ID
    )
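`get_pig_job_config` is a helper defined elsewhere in this DAG file; a hypothetical reconstruction of what it presumably returns, using the Dataproc Pig job dict shape accepted by DataprocSubmitJobOperator:

def get_pig_job_config(query: str, cluster_name: str) -> dict:
    # Wrap a single Pig statement (e.g. "sh python -m nltk.downloader stopwords")
    # into a job dict targeted at the given cluster.
    return {
        "reference": {"project_id": PROJECT_ID},
        "placement": {"cluster_name": cluster_name},
        "pig_job": {"query_list": {"queries": [query]}},
    }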

    python_task_config_file = PythonOperator(
        task_id='python_task_config_file',
        python_callable=write_str_to_gcp,