Example 1
    def test_deprecation_warning(self):
        with pytest.warns(DeprecationWarning) as warnings:
            op = DataprocCreateClusterOperator(
                task_id=TASK_ID,
                region=GCP_LOCATION,
                project_id=GCP_PROJECT,
                cluster_name="cluster_name",
                num_workers=2,
                zone="zone",
            )
        assert_warning("Passing cluster parameters by keywords", warnings)

        assert op.project_id == GCP_PROJECT
        assert op.cluster_name == "cluster_name"
        assert op.cluster_config['worker_config']['num_instances'] == 2
        assert "zones/zone" in op.cluster_config['master_config'][
            "machine_type_uri"]

        with pytest.warns(DeprecationWarning) as warnings:
            op_default_region = DataprocCreateClusterOperator(
                task_id=TASK_ID,
                project_id=GCP_PROJECT,
                cluster_name="cluster_name",
                cluster_config=op.cluster_config,
            )
        assert_warning("Default region value", warnings)
        assert op_default_region.region == 'global'
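
Note: the deprecation warning exercised above is raised because cluster parameters (num_workers, zone, and so on) are passed as individual keywords. A minimal sketch of the keyword-free form the warning points toward, using the provider's ClusterGenerator helper; the project, region, zone, and machine-type values here are placeholders, not taken from the tests:

from airflow.providers.google.cloud.operators.dataproc import (
    ClusterGenerator,
    DataprocCreateClusterOperator,
)

# Build cluster_config once with ClusterGenerator instead of passing
# num_workers/zone directly to the operator (the deprecated keyword path).
generated_config = ClusterGenerator(
    project_id="my-project",              # placeholder
    zone="us-central1-a",                 # placeholder
    master_machine_type="n1-standard-4",  # placeholder
    worker_machine_type="n1-standard-4",  # placeholder
    num_workers=2,
).make()

create_cluster = DataprocCreateClusterOperator(
    task_id="create_cluster",
    project_id="my-project",              # placeholder
    region="us-central1",                 # placeholder
    cluster_name="cluster_name",
    cluster_config=generated_config,
)
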
Example 2
 def test_execute_if_cluster_exists(self, mock_hook):
     mock_hook.return_value.create_cluster.side_effect = [
         AlreadyExists("test")
     ]
     op = DataprocCreateClusterOperator(
         task_id=TASK_ID,
         region=GCP_LOCATION,
         project_id=GCP_PROJECT,
         cluster=CLUSTER,
         gcp_conn_id=GCP_CONN_ID,
         retry=RETRY,
         timeout=TIMEOUT,
         metadata=METADATA,
         request_id=REQUEST_ID,
     )
     op.execute(context={})
     mock_hook.assert_called_once_with(gcp_conn_id=GCP_CONN_ID)
     mock_hook.return_value.create_cluster.assert_called_once_with(
         region=GCP_LOCATION,
         project_id=GCP_PROJECT,
         cluster=CLUSTER,
         request_id=REQUEST_ID,
         retry=RETRY,
         timeout=TIMEOUT,
         metadata=METADATA,
     )
     mock_hook.return_value.get_cluster.assert_called_once_with(
         region=GCP_LOCATION,
         project_id=GCP_PROJECT,
         cluster_name=CLUSTER_NAME,
         retry=RETRY,
         timeout=TIMEOUT,
         metadata=METADATA,
     )
Example 3
    def test_deprecation_warning(self):
        with self.assertWarns(DeprecationWarning) as warning:
            op = DataprocCreateClusterOperator(
                task_id=TASK_ID,
                region=GCP_LOCATION,
                project_id=GCP_PROJECT,
                cluster_name="cluster_name",
                num_workers=2,
                zone="zone",
            )
        assert_warning("Passing cluster parameters by keywords", warning)

        self.assertEqual(op.project_id, GCP_PROJECT)
        self.assertEqual(op.cluster_name, "cluster_name")
        self.assertEqual(op.cluster_config['worker_config']['num_instances'], 2)
        self.assertIn("zones/zone", op.cluster_config['master_config']["machine_type_uri"])

        with self.assertWarns(DeprecationWarning) as warning:
            op_default_region = DataprocCreateClusterOperator(
                task_id=TASK_ID,
                project_id=GCP_PROJECT,
                cluster_name="cluster_name",
                cluster_config=op.cluster_config,
            )
        assert_warning("Default region value", warning)
        self.assertEqual(op_default_region.region, 'global')
Example 4
 def test_execute(self, mock_hook):
     op = DataprocCreateClusterOperator(
         task_id=TASK_ID,
         region=GCP_LOCATION,
         labels=LABELS,
         cluster_name=CLUSTER_NAME,
         project_id=GCP_PROJECT,
         cluster_config=CONFIG,
         request_id=REQUEST_ID,
         gcp_conn_id=GCP_CONN_ID,
         retry=RETRY,
         timeout=TIMEOUT,
         metadata=METADATA,
         impersonation_chain=IMPERSONATION_CHAIN,
     )
     op.execute(context={})
     mock_hook.assert_called_once_with(gcp_conn_id=GCP_CONN_ID, impersonation_chain=IMPERSONATION_CHAIN)
     mock_hook.return_value.create_cluster.assert_called_once_with(
         region=GCP_LOCATION,
         project_id=GCP_PROJECT,
         cluster_config=CONFIG,
         labels=LABELS,
         cluster_name=CLUSTER_NAME,
         request_id=REQUEST_ID,
         retry=RETRY,
         timeout=TIMEOUT,
         metadata=METADATA,
     )
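
For orientation: the IMPERSONATION_CHAIN fixture forwarded to the hook above is, in the Google provider, either a single service account email or an ordered list of emails forming a delegation chain, with the last account in the list being the one actually impersonated. A placeholder sketch (the account names are hypothetical):

# A single service account to impersonate ...
IMPERSONATION_CHAIN = "dataproc-runner@my-project.iam.gserviceaccount.com"
# ... or an ordered chain of accounts used to obtain short-lived credentials,
# with the last entry being the identity the operator acts as.
# IMPERSONATION_CHAIN = [
#     "intermediate-sa@my-project.iam.gserviceaccount.com",
#     "dataproc-runner@my-project.iam.gserviceaccount.com",
# ]
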
Example 5
    def test_execute_if_cluster_exists_in_error_state(self, mock_hook):
        mock_hook.return_value.create_cluster.side_effect = [AlreadyExists("test")]
        cluster_status = mock_hook.return_value.get_cluster.return_value.status
        cluster_status.state = 0
        cluster_status.ERROR = 0

        op = DataprocCreateClusterOperator(
            task_id=TASK_ID,
            region=GCP_LOCATION,
            project_id=GCP_PROJECT,
            cluster_config=CONFIG,
            labels=LABELS,
            cluster_name=CLUSTER_NAME,
            delete_on_error=True,
            gcp_conn_id=GCP_CONN_ID,
            retry=RETRY,
            timeout=TIMEOUT,
            metadata=METADATA,
            request_id=REQUEST_ID,
        )
        with self.assertRaises(AirflowException):
            op.execute(context={})

        mock_hook.return_value.diagnose_cluster.assert_called_once_with(
            region=GCP_LOCATION, project_id=GCP_PROJECT, cluster_name=CLUSTER_NAME
        )
        mock_hook.return_value.delete_cluster.assert_called_once_with(
            region=GCP_LOCATION, project_id=GCP_PROJECT, cluster_name=CLUSTER_NAME
        )
Example 6
    def test_execute_if_cluster_exists_in_deleting_state(
        self, mock_hook, mock_get_cluster, mock_create_cluster, mock_generator
    ):
        cluster = mock.MagicMock()
        cluster.status.state = 0
        cluster.status.DELETING = 0

        cluster2 = mock.MagicMock()
        cluster2.status.state = 0
        cluster2.status.ERROR = 0

        mock_create_cluster.side_effect = [AlreadyExists("test"), cluster2]
        mock_generator.return_value = [0]
        mock_get_cluster.side_effect = [cluster, NotFound("test")]

        op = DataprocCreateClusterOperator(
            task_id=TASK_ID,
            region=GCP_LOCATION,
            project_id=GCP_PROJECT,
            cluster_config=CONFIG,
            labels=LABELS,
            cluster_name=CLUSTER_NAME,
            delete_on_error=True,
            gcp_conn_id=GCP_CONN_ID,
        )
        with self.assertRaises(AirflowException):
            op.execute(context={})

        calls = [mock.call(mock_hook.return_value), mock.call(mock_hook.return_value)]
        mock_get_cluster.assert_has_calls(calls)
        mock_create_cluster.assert_has_calls(calls)
        mock_hook.return_value.diagnose_cluster.assert_called_once_with(
            region=GCP_LOCATION, project_id=GCP_PROJECT, cluster_name=CLUSTER_NAME
        )
Example 7
 def test_execute_if_cluster_exists_do_not_use(self, mock_hook):
     mock_hook.return_value.create_cluster.side_effect = [AlreadyExists("test")]
     mock_hook.return_value.get_cluster.return_value.status.state = 0
     op = DataprocCreateClusterOperator(
         task_id=TASK_ID,
         region=GCP_LOCATION,
         project_id=GCP_PROJECT,
         cluster=CLUSTER,
         gcp_conn_id=GCP_CONN_ID,
         retry=RETRY,
         timeout=TIMEOUT,
         metadata=METADATA,
         request_id=REQUEST_ID,
         use_if_exists=False,
     )
     with self.assertRaises(AlreadyExists):
         op.execute(context={})
Example 8
 def test_execute_if_cluster_exists(self, mock_hook, to_dict_mock):
     mock_hook.return_value.create_cluster.side_effect = [
         AlreadyExists("test")
     ]
     mock_hook.return_value.get_cluster.return_value.status.state = 0
     op = DataprocCreateClusterOperator(
         task_id=TASK_ID,
         region=GCP_LOCATION,
         project_id=GCP_PROJECT,
         cluster_config=CONFIG,
         labels=LABELS,
         cluster_name=CLUSTER_NAME,
         gcp_conn_id=GCP_CONN_ID,
         retry=RETRY,
         timeout=TIMEOUT,
         metadata=METADATA,
         request_id=REQUEST_ID,
         impersonation_chain=IMPERSONATION_CHAIN,
     )
     op.execute(context={})
     mock_hook.assert_called_once_with(
         gcp_conn_id=GCP_CONN_ID, impersonation_chain=IMPERSONATION_CHAIN)
     mock_hook.return_value.create_cluster.assert_called_once_with(
         region=GCP_LOCATION,
         project_id=GCP_PROJECT,
         cluster_config=CONFIG,
         labels=LABELS,
         cluster_name=CLUSTER_NAME,
         request_id=REQUEST_ID,
         retry=RETRY,
         timeout=TIMEOUT,
         metadata=METADATA,
     )
     mock_hook.return_value.get_cluster.assert_called_once_with(
         region=GCP_LOCATION,
         project_id=GCP_PROJECT,
         cluster_name=CLUSTER_NAME,
         retry=RETRY,
         timeout=TIMEOUT,
         metadata=METADATA,
     )
     to_dict_mock.assert_called_once_with(
         mock_hook.return_value.get_cluster.return_value)
Example 9
 def test_execute(self, mock_hook):
     op = DataprocCreateClusterOperator(
         task_id=TASK_ID,
         region=GCP_LOCATION,
         project_id=GCP_PROJECT,
         cluster=CLUSTER,
         request_id=REQUEST_ID,
         gcp_conn_id=GCP_CONN_ID,
         retry=RETRY,
         timeout=TIMEOUT,
         metadata=METADATA,
     )
     op.execute(context={})
     mock_hook.assert_called_once_with(gcp_conn_id=GCP_CONN_ID)
     mock_hook.return_value.create_cluster.assert_called_once_with(
         region=GCP_LOCATION,
         project_id=GCP_PROJECT,
         cluster=CLUSTER,
         request_id=REQUEST_ID,
         retry=RETRY,
         timeout=TIMEOUT,
         metadata=METADATA,
     )
Example 10
    def test_deprecation_warning(self):
        with self.assertWarns(DeprecationWarning) as warning:
            cluster_operator = DataprocCreateClusterOperator(
                task_id=TASK_ID,
                region=GCP_LOCATION,
                project_id=GCP_PROJECT,
                cluster_name="cluster_name",
                num_workers=2,
                zone="zone",
            )
        assert_warning("Passing cluster parameters by keywords", warning)
        cluster = cluster_operator.cluster

        self.assertEqual(cluster['project_id'], GCP_PROJECT)
        self.assertEqual(cluster['cluster_name'], "cluster_name")
        self.assertEqual(cluster['config']['worker_config']['num_instances'], 2)
        self.assertIn("zones/zone", cluster["config"]['master_config']["machine_type_uri"])
Example 11
 def test_depreciation_warning(self, mock_generator, mock_signature):
     mock_signature.return_value.parameters = cluster_params
     with self.assertWarns(DeprecationWarning) as warning:
         DataprocCreateClusterOperator(
             task_id=TASK_ID,
             region=GCP_LOCATION,
             project_id=GCP_PROJECT,
             cluster_name="cluster_name",
             num_workers=2,
             zone="zone",
         )
     assert_warning("Passing cluster parameters by keywords", warning)
     mock_generator.assert_called_once_with(
         task_id=TASK_ID,
         region=GCP_LOCATION,
         project_id=GCP_PROJECT,
         cluster_name="cluster_name",
         num_workers=2,
         zone="zone",
     )
    "retries": 0,
    "retry_delay": datetime.timedelta(minutes=5),
    "project_id": PROJECT,
}

with models.DAG(
        "Weekly-ETL-DAG-5",
        schedule_interval=None,
        start_date=datetime.datetime.combine(datetime.datetime.today(),
                                             datetime.datetime.min.time()),
) as dag:

    create_dataproc_cluster = DataprocCreateClusterOperator(
        task_id="create_dataproc_cluster",
        project_id=PROJECT,
        region=REGION,
        cluster=get_dataproc_config(),
        trigger_rule="all_done",
    )

    create_firewall_rule = PythonOperator(
        task_id="create_firewall_rule",
        provide_context=True,
        python_callable=add_firewall_function,
        dag=dag,
        trigger_rule="all_done",
    )

    start_pipelines = []
    for x in range(len(DF_PIPELINES)):
        start_pipelines.append(
Example 13
gcp_config = Variable.get('gcp_project_1', deserialize_json=True)
dataproc_config = gcp_config['dataproc']
bucket_config = dataproc_config['bucket']
cluster_config = DataprocCreateClusterConfig.make(gcp_config)

default_args = {
    'owner': 'airflow',
    'depends_on_past': True,
    'start_date': datetime(2021, 3, 9, tzinfo=local_tz),
    'email': ['*****@*****.**'],
    'email_on_failure': True,
    'email_on_retry': True,
    'retries': 1,
    'retry_delay': timedelta(minutes=1),
    'project_id': gcp_config['project_id'],
    'region': gcp_config['region'],
    'gcp_conn_id': gcp_config['conn_id']
}

with DAG('create_dataproc',
         default_args=default_args,
         description='create_dataproc',
         schedule_interval='@once') as dag:

    create_dataproc = DataprocCreateClusterOperator(
        task_id='create_dataproc',
        cluster_name=dataproc_config['cluster_name'],
        cluster_config=cluster_config)

create_dataproc
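
A sketch of the shape the gcp_project_1 Variable appears to have, inferred only from the keys read in this DAG; every value below is a placeholder, and whatever additional fields DataprocCreateClusterConfig.make() expects are not shown:

# Stored as a JSON-serialized Airflow Variable named "gcp_project_1".
{
    "project_id": "my-project",
    "region": "us-central1",
    "conn_id": "google_cloud_default",
    "dataproc": {
        "cluster_name": "my-cluster",
        "bucket": "my-dataproc-bucket"
    }
}
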
Example 14
    project_id="test",
    zone="us-central1-a",
    master_machine_type="n1-standard-4",
    worker_machine_type="n1-standard-4",
    num_workers=2,
    storage_bucket="test",
    init_actions_uris=[path],
    metadata={
        'PIP_PACKAGES': 'pyyaml requests pandas openpyxl'
    },
).make()

create_cluster_operator = DataprocCreateClusterOperator(
    task_id='create_dataproc_cluster',
    cluster_name="test",
    project_id="test",
    region="us-central1",
    cluster_config=CLUSTER_GENERATOR_CONFIG,
)
# [END how_to_cloud_dataproc_create_cluster_generate_cluster_config]

# Update options
# [START how_to_cloud_dataproc_updatemask_cluster_operator]
CLUSTER_UPDATE = {
    "config": {
        "worker_config": {
            "num_instances": 3
        },
        "secondary_worker_config": {
            "num_instances": 3
        }
Example 15
        "cluster_name": CLUSTER_NAME
    },
    "hadoop_job": {
        "main_jar_file_uri":
        "file:///usr/lib/hadoop-mapreduce/hadoop-mapreduce-examples.jar",
        "args": ["wordcount", "gs://pub/shakespeare/rose.txt", OUTPUT_PATH],
    },
}

with models.DAG(
        "example_gcp_dataproc",
        default_args={"start_date": days_ago(1)},
        schedule_interval=None,
) as dag:
    create_cluster = DataprocCreateClusterOperator(task_id="create_cluster",
                                                   project_id=PROJECT_ID,
                                                   cluster=CLUSTER,
                                                   region=REGION)

    scale_cluster = DataprocUpdateClusterOperator(
        task_id="scale_cluster",
        cluster_name=CLUSTER_NAME,
        cluster=CLUSTER_UPDATE,
        update_mask=UPDATE_MASK,
        graceful_decommission_timeout=TIMEOUT,
        project_id=PROJECT_ID,
        location=REGION,
    )

    pig_task = DataprocSubmitJobOperator(task_id="pig_task",
                                         job=PIG_JOB,
                                         location=REGION,
Example 16
        "step_id": "pig_job_1",
        "pig_job": PIG_JOB["pig_job"]
    }],
}

with models.DAG(
        "example_gcp_dataproc",
        schedule_interval='@once',
        start_date=datetime(2021, 1, 1),
        catchup=False,
) as dag:
    # [START how_to_cloud_dataproc_create_cluster_operator]
    create_cluster = DataprocCreateClusterOperator(
        task_id="create_cluster",
        project_id=PROJECT_ID,
        cluster_config=CLUSTER_CONFIG,
        region=REGION,
        cluster_name=CLUSTER_NAME,
    )
    # [END how_to_cloud_dataproc_create_cluster_operator]

    # [START how_to_cloud_dataproc_update_cluster_operator]
    scale_cluster = DataprocUpdateClusterOperator(
        task_id="scale_cluster",
        cluster_name=CLUSTER_NAME,
        cluster=CLUSTER_UPDATE,
        update_mask=UPDATE_MASK,
        graceful_decommission_timeout=TIMEOUT,
        project_id=PROJECT_ID,
        region=REGION,
    )
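
The UPDATE_MASK referenced by scale_cluster is cut off in these snippets. Assuming it matches the upstream Airflow example DAG, it pairs with the CLUSTER_UPDATE dict from Example 14 roughly as follows (a sketch, not taken verbatim from this code):

# CLUSTER_UPDATE carries the new values; UPDATE_MASK lists which fields
# of the cluster resource those values should overwrite.
CLUSTER_UPDATE = {
    "config": {
        "worker_config": {"num_instances": 3},
        "secondary_worker_config": {"num_instances": 3},
    }
}
UPDATE_MASK = {
    "paths": [
        "config.worker_config.num_instances",
        "config.secondary_worker_config.num_instances",
    ]
}
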
Example 17
                    "properties": {"spark.yarn.queue": "default"}
                    },
}

with models.DAG(
        DAG_ID,
        schedule_interval=None,
        default_args=default_dag_args,
        tags=DAG_TAGS,
) as dag:  # Must specify your tenant name and owner of the dag

    # Create a Cloud Dataproc cluster.
    create_dataproc_cluster = DataprocCreateClusterOperator(
        task_id="create_dataproc_cluster",
        impersonation_chain=CONNECT_SA,
        cluster_name=CLUSTER_NAME,  # include your dataproc cluster name
        region=REGION,
        cluster_config=CLUSTER_CONFIG,
        labels={"tenant": TENANT, "created-by": USER, },  # specify your tenant's name
    )

    # By default, you won't be able to use `gcloud dataproc jobs submit` on the cluster you created.
    # Running this script lets you submit jobs to the cluster through gcloud.
    # Be sure to pass the correct cluster name, cluster region, and your group entity.
    assign_permissions = BashOperator(
        task_id="assign_permissions_for_dataproc_cluster",
        bash_command=f"bash {DAGS_FOLDER}/dataproc-set-iam.sh {CLUSTER_NAME} {REGION} group:{GROUP_NAME}",
    )

    # BashOperator that holds off the Dataproc delete operator for a specified sleep time
    # sleep_task = BashOperator(task_id="sleep_task_to_keep_dataproc_cluster_alive_3h", bash_command="sleep 8h",)
Example 18
"""
###############################################################################
# DAG
###############################################################################

with DAG(
    dag_id="DAF_PIPELINE_FOOD_ONTOLOGY_PREDICT_DAG",
    catchup=True,
    schedule_interval='00 6 * * *',
    max_active_runs=1,
    default_args=default_args
) as dag_daily:
    create_cluster = DataprocCreateClusterOperator(
        task_id="create_cluster",
        cluster_name=CLUSTER_NAME_DAILY,
        region=REGION,
        project_id=PROJECT_ID,
        cluster_config=CLUSTER_CONFIGURATION,
    )

    pig_job_nltk_stopwords = DataprocSubmitJobOperator(
        task_id="pig_job_nltk_stopwords",
        job=get_pig_job_config("sh python -m nltk.downloader stopwords",
                               CLUSTER_NAME_DAILY),
        location=REGION,
        project_id=PROJECT_ID
    )

    pig_job_spacy_vocabulary = DataprocSubmitJobOperator(
        task_id="pig_job_spacy_vocabulary",
        job=get_pig_job_config("sh python -m spacy download es_core_news_lg",