def test_on_kill(self, mock_hook):
    job = {}
    job_id = "job_id"
    mock_hook.return_value.wait_for_job.return_value = None
    mock_hook.return_value.submit_job.return_value.reference.job_id = job_id

    op = DataprocSubmitJobOperator(
        task_id=TASK_ID,
        location=GCP_LOCATION,
        project_id=GCP_PROJECT,
        job=job,
        gcp_conn_id=GCP_CONN_ID,
        retry=RETRY,
        timeout=TIMEOUT,
        metadata=METADATA,
        request_id=REQUEST_ID,
        impersonation_chain=IMPERSONATION_CHAIN,
        cancel_on_kill=False,
    )
    op.execute(context={})

    op.on_kill()
    mock_hook.return_value.cancel_job.assert_not_called()

    op.cancel_on_kill = True
    op.on_kill()
    mock_hook.return_value.cancel_job.assert_called_once_with(
        project_id=GCP_PROJECT, location=GCP_LOCATION, job_id=job_id
    )
def test_execute(self, mock_hook):
    job = {}
    job_id = "job_id"
    mock_hook.return_value.wait_for_job.return_value = None
    mock_hook.return_value.submit_job.return_value.reference.job_id = job_id

    op = DataprocSubmitJobOperator(
        task_id=TASK_ID,
        location=GCP_LOCATION,
        project_id=GCP_PROJECT,
        job=job,
        gcp_conn_id=GCP_CONN_ID,
        retry=RETRY,
        timeout=TIMEOUT,
        metadata=METADATA,
        request_id=REQUEST_ID,
    )
    op.execute(context={})

    mock_hook.assert_called_once_with(gcp_conn_id=GCP_CONN_ID)
    mock_hook.return_value.submit_job.assert_called_once_with(
        project_id=GCP_PROJECT,
        location=GCP_LOCATION,
        job=job,
        request_id=REQUEST_ID,
        retry=RETRY,
        timeout=TIMEOUT,
        metadata=METADATA,
    )
    mock_hook.return_value.wait_for_job.assert_called_once_with(
        job_id=job_id, project_id=GCP_PROJECT, location=GCP_LOCATION
    )
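# Both test methods above take a `mock_hook` argument, which means the DataprocHook used by
# the operator is patched on the test method. A minimal sketch of that wiring, assuming the
# hook is patched where the operators module imports it (the patch path and class name below
# are assumptions, not shown in the snippets):
from unittest import mock

DATAPROC_PATH = "airflow.providers.google.cloud.operators.dataproc.{}"


class TestDataprocSubmitJobOperator:
    @mock.patch(DATAPROC_PATH.format("DataprocHook"))
    def test_execute(self, mock_hook):
        ...  # body as in the snippet above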
    project_id=PROJECT_ID, cluster=CLUSTER, region=REGION
)

scale_cluster = DataprocUpdateClusterOperator(
    task_id="scale_cluster",
    cluster_name=CLUSTER_NAME,
    cluster=CLUSTER_UPDATE,
    update_mask=UPDATE_MASK,
    graceful_decommission_timeout=TIMEOUT,
    project_id=PROJECT_ID,
    location=REGION,
)

pig_task = DataprocSubmitJobOperator(
    task_id="pig_task", job=PIG_JOB, location=REGION, project_id=PROJECT_ID
)
spark_sql_task = DataprocSubmitJobOperator(
    task_id="spark_sql_task", job=SPARK_SQL_JOB, location=REGION, project_id=PROJECT_ID
)
spark_task = DataprocSubmitJobOperator(
    task_id="spark_task", job=SPARK_JOB, location=REGION, project_id=PROJECT_ID
)
pyspark_task = DataprocSubmitJobOperator(
    task_id="pyspark_task",
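# PIG_JOB, SPARK_SQL_JOB, SPARK_JOB and PYSPARK_JOB are not defined in this snippet. They are
# plain Dataproc Job specs expressed as dicts. A minimal sketch of their usual shape, assuming
# a module-level PYSPARK_URI constant; the query text and jar URI are illustrative values, not
# taken from this DAG:
PIG_JOB = {
    "reference": {"project_id": PROJECT_ID},
    "placement": {"cluster_name": CLUSTER_NAME},
    "pig_job": {"query_list": {"queries": ["define sin HiveUDF('sin');"]}},
}
SPARK_JOB = {
    "reference": {"project_id": PROJECT_ID},
    "placement": {"cluster_name": CLUSTER_NAME},
    "spark_job": {
        "jar_file_uris": ["file:///usr/lib/spark/examples/jars/spark-examples.jar"],
        "main_class": "org.apache.spark.examples.SparkPi",
    },
}
PYSPARK_JOB = {
    "reference": {"project_id": PROJECT_ID},
    "placement": {"cluster_name": CLUSTER_NAME},
    "pyspark_job": {"main_python_file_uri": PYSPARK_URI},
}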
    template=WORKFLOW_TEMPLATE,
    project_id=PROJECT_ID,
    region=REGION,
)
# [END how_to_cloud_dataproc_create_workflow_template]

# [START how_to_cloud_dataproc_trigger_workflow_template]
trigger_workflow = DataprocInstantiateWorkflowTemplateOperator(
    task_id="trigger_workflow", region=REGION, project_id=PROJECT_ID, template_id=WORKFLOW_NAME
)
# [END how_to_cloud_dataproc_trigger_workflow_template]

pig_task = DataprocSubmitJobOperator(
    task_id="pig_task", job=PIG_JOB, region=REGION, project_id=PROJECT_ID
)
spark_sql_task = DataprocSubmitJobOperator(
    task_id="spark_sql_task", job=SPARK_SQL_JOB, region=REGION, project_id=PROJECT_ID
)
spark_task = DataprocSubmitJobOperator(
    task_id="spark_task", job=SPARK_JOB, region=REGION, project_id=PROJECT_ID
)

# [START cloud_dataproc_async_submit_sensor]
spark_task_async = DataprocSubmitJobOperator(
    task_id="spark_task_async", job=SPARK_JOB, region=REGION,
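# The call above is cut off. In the asynchronous pattern the submit operator usually sets
# asynchronous=True so it returns without waiting, and a DataprocJobSensor polls the job id
# pushed to XCom. A minimal sketch of how this typically continues (the sensor task_id,
# poke_interval and the use of .output are assumptions; older provider versions take
# location= instead of region= on the sensor):
from airflow.providers.google.cloud.sensors.dataproc import DataprocJobSensor

spark_task_async = DataprocSubmitJobOperator(
    task_id="spark_task_async", job=SPARK_JOB, region=REGION, project_id=PROJECT_ID, asynchronous=True
)
spark_task_async_sensor = DataprocJobSensor(
    task_id="spark_task_async_sensor_task",
    region=REGION,
    project_id=PROJECT_ID,
    dataproc_job_id=spark_task_async.output,  # job id returned by the submit task
    poke_interval=10,
)
# [END cloud_dataproc_async_submit_sensor]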
# By default you won't have access to use `gcloud dataproc jobs submit` on the cluster that you created.
# Running this script lets you submit jobs to the cluster through gcloud.
# Be sure to pass the correct cluster name, cluster region and your group entity.
assign_permissions = BashOperator(
    task_id="assign_permissions_for_dataproc_cluster",
    bash_command=f"bash {DAGS_FOLDER}/dataproc-set-iam.sh {CLUSTER_NAME} {REGION} group:{GROUP_NAME}",
)

# BashOperator to delay the Dataproc delete operator for the specified sleep time
# sleep_task = BashOperator(task_id="sleep_task_to_keep_dataproc_cluster_alive_3h", bash_command="sleep 8h",)

cg_producer_pyspark_task = DataprocSubmitJobOperator(
    task_id="run_cee_cg_producer",
    impersonation_chain=CONNECT_SA,
    job=CG_PYSPARK_JOB,
    location=REGION,
    cluster_name=CLUSTER_NAME,
    cluster_config=CLUSTER_CONFIG,
    labels={"tenant": TENANT, "created-by": USER},
)
pr_producer_pyspark_task = DataprocSubmitJobOperator(
    task_id="run_cee_pr_producer",
    impersonation_chain=CONNECT_SA,
    job=PR_PYSPARK_JOB,
    location=REGION,
    cluster_name=CLUSTER_NAME,
    cluster_config=CLUSTER_CONFIG,
    labels={"tenant": TENANT, "created-by": USER},
)
    },
}

with models.DAG("gcp_dataproc_spark", start_date=days_ago(1), schedule_interval=None) as dag:
    # [START how_to_cloud_dataproc_create_cluster_operator]
    create_cluster = DataprocCreateClusterOperator(
        task_id="create_cluster",
        project_id=PROJECT_ID,
        cluster_config=CLUSTER_CONFIG,
        region=REGION,
        cluster_name=CLUSTER_NAME,
    )
    # [END how_to_cloud_dataproc_create_cluster_operator]

    spark_task = DataprocSubmitJobOperator(
        task_id="spark_task", job=SPARK_JOB, location=REGION, project_id=PROJECT_ID
    )

    # [START how_to_cloud_dataproc_delete_cluster_operator]
    delete_cluster = DataprocDeleteClusterOperator(
        task_id="delete_cluster", project_id=PROJECT_ID, cluster_name=CLUSTER_NAME, region=REGION
    )
    # [END how_to_cloud_dataproc_delete_cluster_operator]

    create_cluster >> spark_task >> delete_cluster
    default_args=default_dag_args,
) as dag:
    create_dataproc_acme_sales_cluster = dataproc_operator.DataprocClusterCreateOperator(
        task_id='create_dataproc_acme_sales_cluster',
        cluster_name=DATAPROC_CLUSTER_NAME,
        region=REGION,
        zone=ZONE,
        num_workers=3,
        master_machine_type=DATAPROC_MASTER_MACHINE_TYPE,
        worker_machine_type=DATAPROC_MASTER_MACHINE_TYPE,
        image_version=IMAGE_VERSION,
        project_id=PROJECT_ID,
    )

    locations_staging_spark_job = DataprocSubmitJobOperator(
        task_id="locations_staging_spark_job",
        job=LOCATIONS_STAGING_SPARK_JOB,
        location=REGION,
        project_id=PROJECT_ID,
    )
    products_staging_spark_job = DataprocSubmitJobOperator(
        task_id="products_staging_spark_job",
        job=PRODUCTS_STAGING_SPARK_JOB,
        location=REGION,
        project_id=PROJECT_ID,
    )
    transactions_staging_spark_job = DataprocSubmitJobOperator(
        task_id="transactions_staging_spark_job",
        job=TRANSACTIONS_STAGING_SPARK_JOB,
        location=REGION,
        project_id=PROJECT_ID,
    )
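# The cluster above is created with the legacy contrib-style DataprocClusterCreateOperator.
# A rough sketch of the same cluster expressed with the current provider's ClusterGenerator
# helper and DataprocCreateClusterOperator (the field-for-field mapping is an assumption,
# not verified against this DAG; worker nodes reuse the master machine type constant, as the
# original does):
from airflow.providers.google.cloud.operators.dataproc import (
    ClusterGenerator,
    DataprocCreateClusterOperator,
)

acme_sales_cluster_config = ClusterGenerator(
    project_id=PROJECT_ID,
    zone=ZONE,
    master_machine_type=DATAPROC_MASTER_MACHINE_TYPE,
    worker_machine_type=DATAPROC_MASTER_MACHINE_TYPE,
    num_workers=3,
    image_version=IMAGE_VERSION,
).make()

create_dataproc_acme_sales_cluster = DataprocCreateClusterOperator(
    task_id="create_dataproc_acme_sales_cluster",
    cluster_name=DATAPROC_CLUSTER_NAME,
    cluster_config=acme_sales_cluster_config,
    region=REGION,
    project_id=PROJECT_ID,
)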
    schedule_interval='00 6 * * *',
    max_active_runs=1,
    default_args=default_args,
) as dag_daily:
    create_cluster = DataprocCreateClusterOperator(
        task_id="create_cluster",
        cluster_name=CLUSTER_NAME_DAILY,
        region=REGION,
        project_id=PROJECT_ID,
        cluster_config=CLUSTER_CONFIGURATION,
    )

    pig_job_nltk_stopwords = DataprocSubmitJobOperator(
        task_id="pig_job_nltk_stopwords",
        job=get_pig_job_config("sh python -m nltk.downloader stopwords", CLUSTER_NAME_DAILY),
        location=REGION,
        project_id=PROJECT_ID,
    )
    pig_job_spacy_vocabulary = DataprocSubmitJobOperator(
        task_id="pig_job_spacy_vocabulary",
        job=get_pig_job_config("sh python -m spacy download es_core_news_lg", CLUSTER_NAME_DAILY),
        location=REGION,
        project_id=PROJECT_ID,
    )

    python_task_config_file = PythonOperator(
        task_id='python_task_config_file',
        python_callable=write_str_to_gcp,
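# get_pig_job_config is not shown in this snippet. A plausible sketch of such a helper,
# assuming it wraps the given shell command in a one-query Pig job targeted at the given
# cluster and that PROJECT_ID is a module-level constant (the exact dict shape used by the
# original DAG is an assumption):
def get_pig_job_config(query: str, cluster_name: str) -> dict:
    """Build a Dataproc Pig job spec that runs a single query on the given cluster."""
    return {
        "reference": {"project_id": PROJECT_ID},
        "placement": {"cluster_name": cluster_name},
        "pig_job": {"query_list": {"queries": [query]}},
    }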