Example #1
 def test_cluster_name_log_no_sub(self):
     with patch('airflow.contrib.hooks.gcp_dataproc_hook.DataProcHook') as mock_hook:
         mock_hook.return_value.get_conn = self.mock_conn
         dataproc_task = DataprocClusterDeleteOperator(
             task_id=TASK_ID,
             cluster_name=CLUSTER_NAME,
             project_id=PROJECT_ID,
             dag=self.dag
         )
         with patch.object(dataproc_task.log, 'info') as mock_info:
             with self.assertRaises(TypeError):
                 dataproc_task.execute(None)
             mock_info.assert_called_with('Deleting cluster: %s', CLUSTER_NAME)
Example #2
 def test_cluster_name_log_no_sub(self):
     with patch('airflow.contrib.hooks.gcp_dataproc_hook.DataProcHook') as mock_hook:
         mock_hook.return_value.get_conn = self.mock_conn
         dataproc_task = DataprocClusterDeleteOperator(
             task_id=TASK_ID,
             cluster_name=CLUSTER_NAME,
             project_id=GCP_PROJECT_ID,
             dag=self.dag
         )
         with patch.object(dataproc_task.log, 'info') as mock_info:
             with self.assertRaises(TypeError):
                 dataproc_task.execute(None)
             mock_info.assert_called_with('Deleting cluster: %s', CLUSTER_NAME)
Example #3
    def test_cluster_name_log_no_sub(self):
        with patch('airflow.contrib.hooks.gcp_dataproc_hook.DataProcHook') \
            as mock_hook, patch('logging.info') as l:
            mock_hook.return_value.get_conn = self.mock_conn
            dataproc_task = DataprocClusterDeleteOperator(
                task_id=TASK_ID,
                cluster_name=CLUSTER_NAME,
                project_id=PROJECT_ID,
                dag=self.dag
            )

            with self.assertRaises(TypeError) as _:
                dataproc_task.execute(None)
            l.assert_called_with('Deleting cluster: ' + CLUSTER_NAME)
Example #4
    def test_cluster_name_log_sub(self):
        with patch('airflow.contrib.operators.dataproc_operator.DataProcHook') as mock_hook:
            mock_hook.return_value.get_conn = self.mock_conn
            dataproc_task = DataprocClusterDeleteOperator(
                task_id=TASK_ID,
                cluster_name='smoke-cluster-{{ ts_nodash }}',
                project_id=PROJECT_ID,
                dag=self.dag
            )

            with patch.object(dataproc_task.log, 'info') as mock_info:
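                # execute() is called directly with a None context, so the
                # templated cluster_name has to be rendered by hand first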
                context = {'ts_nodash': 'testnodash'}

                rendered = dataproc_task.render_template(
                    'cluster_name', getattr(dataproc_task, 'cluster_name'), context)
                setattr(dataproc_task, 'cluster_name', rendered)
                with self.assertRaises(TypeError) as _:
                    dataproc_task.execute(None)
                mock_info.assert_called_with('Deleting cluster: %s', u'smoke-cluster-testnodash')
Example #5
 def delete_cluster(self):
     """
     Returns a DataprocClusterDeleteOperator
     """
     return DataprocClusterDeleteOperator(
         task_id='delete_dataproc_cluster',
         trigger_rule=TriggerRule.ALL_DONE,
         cluster_name=self.cluster_name,
         region=self.region,
         gcp_conn_id=self.gcp_conn_id,
         project_id=self.connection.project_id)
Example #6
    def test_delete_cluster(self):
        with patch(HOOK) as mock_hook:
            hook = mock_hook()
            hook.get_conn.return_value = self.mock_conn
            hook.wait.return_value = None

            dataproc_task = DataprocClusterDeleteOperator(
                task_id=TASK_ID,
                region=GCP_REGION,
                project_id=GCP_PROJECT_ID,
                cluster_name=CLUSTER_NAME,
                dag=self.dag)
            dataproc_task.execute(None)

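            # the operator generates a fresh requestId at runtime, hence mock.ANY below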
            self.mock_clusters.delete.assert_called_once_with(
                region=GCP_REGION,
                projectId=GCP_PROJECT_ID,
                clusterName=CLUSTER_NAME,
                requestId=mock.ANY)
            hook.wait.assert_called_once_with(self.operation)
Example #7
        task_id='weekend_analytics',
        main='gs://fsp-logistics-spark-bucket/pyspark/weekend/gas_composition_count.py',
        cluster_name='spark-cluster-{{ ds_nodash }}',
        region='europe-central2',
        dataproc_pyspark_jars='gs://spark-lib/bigquery/spark-bigquery-latest.jar',
    )

    weekday_analytics = SubDagOperator(
        task_id='weekday_analytics',
        subdag=weekday_subdag(
            parent_dag='bigquery_data_analytics',
            task_id='weekday_analytics',
            schedule_interval='0 20 * * *',
            default_args=default_arguments
        )
    )

    delete_cluster = DataprocClusterDeleteOperator(
        task_id='delete_cluster',
        project_id='fsp-airflow',
        cluster_name='spark-cluster-{{ ds_nodash }}',
        trigger_rule='all_done',
        region='europe-central2'
    )

create_cluster >> weekday_or_weekend >> [
    weekday_analytics,
    weekend_analytics
] >> delete_cluster
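
The weekday_or_weekend task referenced in the chain above is not part of this excerpt. As a minimal sketch, assuming it is a BranchPythonOperator defined inside the same DAG as the tasks above (the callable below is hypothetical):

from airflow.operators.python_operator import BranchPythonOperator

def _weekday_or_weekend(execution_date, **_):
    # route Saturday/Sunday runs to the weekend job, all other days to the weekday subdag
    return 'weekend_analytics' if execution_date.weekday() >= 5 else 'weekday_analytics'

weekday_or_weekend = BranchPythonOperator(
    task_id='weekday_or_weekend',
    python_callable=_weekday_or_weekend,
    provide_context=True,  # Airflow 1.10.x: pass execution_date into the callable
)

Because only one branch runs per execution, delete_cluster uses trigger_rule='all_done' so the cleanup still fires when its other upstream task is skipped.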

Example #8
          default_args=args,
          on_failure_callback=_on_failure_callback,
          description="Own stuff",
          schedule_interval="0 0 * * *") as dag:

    exchange_to_gcs = HttpToGcsOperator(gcs_bucket='land_data_training_jjac_airflow',
                                        gcs_path='exchange-rates/exchange-rates-{{ds}}.json',
                                        endpoint='/history?start_at={{ds}}&end_at={{tomorrow_ds}}&symbols=EUR&base=GBP',
                                        task_id="get_data")

    start_dataproc = DataprocClusterCreateOperator(project_id='airflowbolcomdec-7601d68caa710',
                                                   cluster_name='test-dataproc-jjac-{{ds}}',
                                                   num_workers=4,
                                                   region='europe-west1',
                                                   task_id='start_dataproc')
    proc_dataproc = DataProcPySparkOperator(main=path.join(path.dirname(__file__), 'spark/build_statistics.py'),
                                            project_id='airflowbolcomdec-7601d68caa710',
                                            cluster_name='test-dataproc-jjac-{{ds}}',
                                            region='europe-west1',
                                            arguments=['inp_prop', 'inp_curren', 'target_path', 'tar_curr', 'tar_date'],
                                            task_id="proc_dataproc")
    delete_dataproc = DataprocClusterDeleteOperator(project_id='airflowbolcomdec-7601d68caa710',
                                                    cluster_name='test-dataproc-jjac-{{ds}}',
                                                    region='europe-west1',
                                                    task_id="delete_dataproc", trigger_rule=TriggerRule.ALL_DONE)

    exchange_to_gcs >> start_dataproc >> proc_dataproc >> delete_dataproc
Example #9
PYSPARK_JOB = 'gs://' + GCS_BUCKET + '/spark-jobs/compute_aggregates.py'

with models.DAG('ComputeStats',
                default_args=default_args,
                schedule_interval="0 0 * * *") as dag:

    create_cluster = DataprocClusterCreateOperator(
        task_id='CreateCluster',
        cluster_name="analyse-pricing-{{ ds }}",
        project_id="afspfeb3-9d4bdb09f618016d0bc39",
        num_workers=2,
        zone="europe-west4-a",
        dag=dag,
    )

    compute_aggregates = DataProcPySparkOperator(
        task_id="compute_aggregates",
        main=PYSPARK_JOB,
        cluster_name="analyse-pricing-{{ ds }}",
        dag=dag,
    )

    delete_cluster = DataprocClusterDeleteOperator(
        task_id='DeleteCluster',
        cluster_name="analyse-pricing-{{ ds }}",
        project_id="afspfeb3-9d4bdb09f618016d0bc39",
        dag=dag,
    )

    create_cluster >> compute_aggregates >> delete_cluster
Example #10
    project_id="airflowbolcom-b9aabd6971d488d9",
    num_workers=2,
    zone="europe-west1-d",
    dag=dag)

compute_aggregates = DataProcPySparkOperator(
    task_id="compute_aggregates",
    main='gs://europe-west1-training-airfl-68071199-bucket/other/build_statistics_simple.py',
    cluster_name="dataproc-cluster-dag-training-{{ ds }}",
    arguments=["{{ ds_nodash }}"],
    dag=dag)

dataproc_delete_cluster = DataprocClusterDeleteOperator(
    task_id="delete_dataproc_cluster",
    cluster_name="dataproc-cluster-dag-training-{{ ds }}",
    project_id="airflowbolcom-b9aabd6971d488d9",
    trigger_rule=TriggerRule.ALL_DONE,
    dag=dag)

dest_table = "airflowbolcom-b9aabd6971d488d9:airflow_training_dataset.land_registry_${{ ds_nodash }}"
bucket_to_bq = GoogleCloudStorageToBigQueryOperator(
    task_id="gcs_to_bq",
    bucket="airflow_training_data",
    source_objects=["average_prices/transfer_date={{ds_nodash}}/*.parquet"],
    destination_project_dataset_table=dest_table,
    source_format="PARQUET",
    write_disposition="WRITE_TRUNCATE",
    dag=dag)

# currencies = ['USD', 'EUR']
# endpoint = "https://europe-west1-gdd-airflow-training.cloudfunctions.net/airflow-training-transform-valutas?date=1970-01-01&from=GBP&to=EUR"  # noqa: E501
Example #11
def export_to_parquet(
    table,
    arguments=[],
    dag_name="export_to_parquet",
    parent_dag_name=None,
    default_args=None,
    aws_conn_id="aws_dev_iam_s3",
    gcp_conn_id="google_cloud_derived_datasets",
    dataproc_zone="us-central1-a",
    dataproc_storage_bucket="moz-fx-data-derived-datasets-parquet",
    num_preemptible_workers=0,
):

    """ Export a BigQuery table to Parquet.

    https://github.com/mozilla/bigquery-etl/blob/master/script/pyspark/export_to_parquet.py

    :param str table:                             [Required] BigQuery table name
    :param List[str] arguments:                   Additional pyspark arguments
    :param str dag_name:                          Name of DAG
    :param Optional[str] parent_dag_name:         Parent DAG name
    :param Optional[Dict[str, Any]] default_args: DAG configuration
    :param str aws_conn_id:                       Airflow connection id for AWS access
    :param str gcp_conn_id:                       Airflow connection id for GCP access
    :param str dataproc_storage_bucket:           Dataproc staging GCS bucket
    :param str dataproc_zone:                     GCP zone to launch dataproc clusters
    :param int num_preemptible_workers:           Number of Dataproc preemptible workers

    :return: airflow.models.DAG
    """

    # limit cluster name to 42 characters then suffix with -YYYYMMDD
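    # e.g. a long name ending in "-v<N>" keeps that version suffix and is trimmed
    # in the middle; names without a version suffix are simply cut at 42 characters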
    cluster_name = table.replace("_", "-")
    if len(cluster_name) > 42:
        if cluster_name.rsplit("-v", 1)[-1].isdigit():
            prefix, version = cluster_name.rsplit("-v", 1)
            cluster_name = prefix[:40 - len(version)] + "-v" + version
        else:
            cluster_name = cluster_name[:42]
    cluster_name += "-{{ ds_nodash }}"

    dag_prefix = parent_dag_name + "." if parent_dag_name else ""
    connection = GoogleCloudBaseHook(gcp_conn_id=gcp_conn_id)
    properties = {
        "core:fs.s3a." + key: value
        for key, value in zip(
            ("access.key", "secret.key", "session.token"),
            AwsHook(aws_conn_id).get_credentials(),
        )
        if value is not None
    }

    with models.DAG(dag_id=dag_prefix + dag_name, default_args=default_args) as dag:

        create_dataproc_cluster = DataprocClusterCreateOperator(
            task_id="create_dataproc_cluster",
            cluster_name=cluster_name,
            gcp_conn_id=gcp_conn_id,
            project_id=connection.project_id,
            properties=properties,
            num_workers=2,
            image_version="1.3",
            storage_bucket=dataproc_storage_bucket,
            zone=dataproc_zone,
            master_machine_type="n1-standard-8",
            worker_machine_type="n1-standard-8",
            num_preemptible_workers=num_preemptible_workers,
        )

        run_dataproc_pyspark = DataProcPySparkOperator(
            task_id="run_dataproc_pyspark",
            cluster_name=cluster_name,
            dataproc_pyspark_jars=[
                "gs://mozilla-bigquery-etl/jars/spark-bigquery-0.5.1-beta-SNAPSHOT.jar"
            ],
            main="https://raw.githubusercontent.com/mozilla/bigquery-etl/master"
            "/script/pyspark/export_to_parquet.py",
            arguments=[table] + arguments,
            gcp_conn_id=gcp_conn_id,
        )

        delete_dataproc_cluster = DataprocClusterDeleteOperator(
            task_id="delete_dataproc_cluster",
            cluster_name=cluster_name,
            gcp_conn_id=gcp_conn_id,
            project_id=connection.project_id,
            trigger_rule=trigger_rule.TriggerRule.ALL_DONE,
        )

        create_dataproc_cluster >> run_dataproc_pyspark >> delete_dataproc_cluster

        return dag
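
The factory above returns an airflow.models.DAG, so it is typically embedded in a parent DAG through a SubDagOperator. A minimal, hedged usage sketch (not taken from the example; the parent dag_id, schedule, table name, and default_args below are hypothetical, and the AWS/GCP connections named in the factory's defaults must already exist):

from datetime import datetime

from airflow import models
from airflow.operators.subdag_operator import SubDagOperator

default_args = {"owner": "airflow", "start_date": datetime(2019, 1, 1)}  # hypothetical

with models.DAG("parquet_export", default_args=default_args,
                schedule_interval="0 1 * * *") as parent_dag:

    export = SubDagOperator(
        # task_id must equal the factory's dag_name so the generated dag_id
        # follows the "<parent>.<child>" naming convention SubDagOperator expects
        task_id="export_to_parquet",
        subdag=export_to_parquet(
            table="example_table_v1",  # hypothetical table name
            parent_dag_name=parent_dag.dag_id,
            default_args=default_args,
        ),
    )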