    def test_hook_correct_region(self):
        with patch('airflow.contrib.operators.dataproc_operator.DataProcHook') as mock_hook:
            dataproc_task = DataProcPySparkOperator(
                task_id=TASK_ID,
                main=MAIN_URI,
                region=REGION
            )

            dataproc_task.execute(None)
            mock_hook.return_value.submit.assert_called_once_with(mock.ANY, mock.ANY, REGION)

    def test_hook_correct_region(self):
        with patch(HOOK) as mock_hook:
            dataproc_task = DataProcPySparkOperator(
                task_id=TASK_ID,
                main=MAIN_URI,
                region=GCP_REGION
            )

            dataproc_task.execute(None)
            mock_hook.return_value.submit.assert_called_once_with(mock.ANY, mock.ANY,
                                                                  GCP_REGION, mock.ANY)
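
        # The names used above (HOOK, TASK_ID, MAIN_URI, REGION, GCP_REGION) are assumed to be
        # module-level constants of the test file, roughly like the following (values are
        # illustrative, not from the source):
        #
        #   HOOK = 'airflow.contrib.operators.dataproc_operator.DataProcHook'
        #   TASK_ID = 'test-dataproc-task'
        #   MAIN_URI = 'gs://example-bucket/jobs/job.py'
        #   REGION = GCP_REGION = 'test-region'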
Example #3
)

write_prices_to_bq = DataFlowPythonOperator(
    task_id="write_prices_to_bq",
    dataflow_default_options={
        "project": PROJECT_ID,
        "region": "europe-west1"
    },
    py_file="gs://" + BUCKET + "/scripts/dataflow_job.py",
    dag=dag,
)

dataproc_compute_aggregates = DataProcPySparkOperator(
    task_id="dataproc_compute_aggregates",
    main="gs://airflow-training-data-tim/scripts/build_statistics.py",
    cluster_name="analyse-pricing-{{ ds }}",
    arguments=["{{ ds }}"],
    dag=dag,
)

dataproc_create_cluster = DataprocClusterCreateOperator(
    task_id="dataproc_create_cluster",
    cluster_name="analyse-pricing-{{ ds }}",
    project_id=PROJECT_ID,
    num_workers=2,
    zone="europe-west4-a",
    dag=dag,
    pool="dataproc",
)

query = """
Example #4
    def test_dataproc_job_id_is_set(self):
        with patch(HOOK) as mock_hook:
            dataproc_task = DataProcPySparkOperator(task_id=TASK_ID,
                                                    main=MAIN_URI)

            _assert_dataproc_job_id(mock_hook, dataproc_task)
}

with DAG("flights_delay_etl", default_args=DEFAULT_DAG_ARGS) as dag:
    create_cluster = DataprocClusterCreateOperator(
        task_id="create_dataproc_cluster",
        cluster_name="ephemeral-spark-cluster-{{ds_nodash}}",
        master_machine_type="n1-standard-1",
        worker_machine_type="n1-standard-2",
        num_workers=2,
        region="us-central1",
        zone="us-central1-a"
    )

    submit_pyspark = DataProcPySparkOperator(
        task_id="run_pyspark_etl",
        main=PYSPARK_JOB,
        cluster_name="ephemeral-spark-cluster-{{ds_nodash}}",
        region="us-central1"
    )

    bq_load_delay_by_flight_nums = GoogleCloudStorageToBigQueryOperator(
        task_id="bq_load_avg_delays_by_flight_nums",
        bucket="spark-etl-1",
        source_objects=["flights_data_output/"+file_name+"_flight_nums/*.json"],
        destination_project_dataset_table="bigdata-etl-20201027.data_analysis.avg_delays_by_flight_nums",
        autodetect=True,
        source_format="NEWLINE_DELIMITED_JSON",
        create_disposition="CREATE_IF_NEEDED",
        skip_leading_rows=0,
        write_disposition="WRITE_APPEND",
        max_bad_records=0
    )
Example #6
def export_to_parquet(
    table,
    arguments=[],
    dag_name="export_to_parquet",
    parent_dag_name=None,
    default_args=None,
    aws_conn_id="aws_dev_iam_s3",
    gcp_conn_id="google_cloud_derived_datasets",
    dataproc_zone="us-central1-a",
    dataproc_storage_bucket="moz-fx-data-derived-datasets-parquet",
    num_preemptible_workers=0,
):

    """ Export a BigQuery table to Parquet.

    https://github.com/mozilla/bigquery-etl/blob/master/script/pyspark/export_to_parquet.py

    :param str table:                             [Required] BigQuery table name
    :param List[str] arguments:                   Additional pyspark arguments
    :param str dag_name:                          Name of DAG
    :param Optional[str] parent_dag_name:         Parent DAG name
    :param Optional[Dict[str, Any]] default_args: DAG configuration
    :param str aws_conn_id:                       Airflow connection id for AWS (S3) access
    :param str gcp_conn_id:                       Airflow connection id for GCP access
    :param str dataproc_storage_bucket:           Dataproc staging GCS bucket
    :param str dataproc_zone:                     GCP zone to launch dataproc clusters
    :param int num_preemptible_workers:           Number of Dataproc preemptible workers

    :return: airflow.models.DAG
    """

    # limit cluster name to 42 characters then suffix with -YYYYMMDD
    cluster_name = table.replace("_", "-")
    if len(cluster_name) > 42:
        if cluster_name.rsplit("-v", 1)[-1].isdigit():
            prefix, version = cluster_name.rsplit("-v", 1)
            cluster_name = prefix[:40 - len(version)] + "-v" + version
        else:
            cluster_name = cluster_name[:42]
    cluster_name += "-{{ ds_nodash }}"
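    # Illustrative examples of the naming rule above (not from the source):
    #   "telemetry_core_parquet_v3"  -> cluster "telemetry-core-parquet-v3-{{ ds_nodash }}" (25 chars, kept as-is)
    #   names longer than 42 characters are cut, preserving a trailing "-v<N>" version suffix.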

    dag_prefix = parent_dag_name + "." if parent_dag_name else ""
    connection = GoogleCloudBaseHook(gcp_conn_id=gcp_conn_id)
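    # Map the AWS credentials from the Airflow connection onto Hadoop s3a properties so the
    # cluster can read from S3; entries whose value is None (e.g. a missing session token)
    # are dropped by the comprehension below.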
    properties = {
        "core:fs.s3a." + key: value
        for key, value in zip(
            ("access.key", "secret.key", "session.token"),
            AwsHook(aws_conn_id).get_credentials(),
        )
        if value is not None
    }

    with models.DAG(dag_id=dag_prefix + dag_name, default_args=default_args) as dag:

        create_dataproc_cluster = DataprocClusterCreateOperator(
            task_id="create_dataproc_cluster",
            cluster_name=cluster_name,
            gcp_conn_id=gcp_conn_id,
            project_id=connection.project_id,
            properties=properties,
            num_workers=2,
            image_version="1.3",
            storage_bucket=dataproc_storage_bucket,
            zone=dataproc_zone,
            master_machine_type="n1-standard-8",
            worker_machine_type="n1-standard-8",
            num_preemptible_workers=num_preemptible_workers,
        )

        run_dataproc_pyspark = DataProcPySparkOperator(
            task_id="run_dataproc_pyspark",
            cluster_name=cluster_name,
            dataproc_pyspark_jars=[
                "gs://mozilla-bigquery-etl/jars/spark-bigquery-0.5.1-beta-SNAPSHOT.jar"
            ],
            main="https://raw.githubusercontent.com/mozilla/bigquery-etl/master"
            "/script/pyspark/export_to_parquet.py",
            arguments=[table] + arguments,
            gcp_conn_id=gcp_conn_id,
        )

        delete_dataproc_cluster = DataprocClusterDeleteOperator(
            task_id="delete_dataproc_cluster",
            cluster_name=cluster_name,
            gcp_conn_id=gcp_conn_id,
            project_id=connection.project_id,
            trigger_rule=trigger_rule.TriggerRule.ALL_DONE,
        )

        create_dataproc_cluster >> run_dataproc_pyspark >> delete_dataproc_cluster

        return dag
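

# A minimal usage sketch for the factory above (assumption, not part of the original example):
# the function returns a DAG, so it can be mounted from a parent DAG with SubDagOperator.
# The parent DAG, table name, and default_args below are illustrative.
from datetime import datetime

from airflow import models
from airflow.operators.subdag_operator import SubDagOperator

parent_default_args = {"owner": "airflow", "start_date": datetime(2020, 1, 1)}
parent_dag = models.DAG("parquet_export_parent", default_args=parent_default_args,
                        schedule_interval="@daily")

export_my_table = SubDagOperator(
    task_id="export_my_table",
    dag=parent_dag,
    subdag=export_to_parquet(
        table="my_dataset.my_table_v1",
        parent_dag_name=parent_dag.dag_id,
        dag_name="export_my_table",
        default_args=parent_default_args,
    ),
)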
Example #7
properties = {"spark.jars.packages": "io.delta:delta-core_2.11:0.5.0"}

t1 = DataprocClusterCreateOperator(task_id="create_cluster",
                                   gcp_conn_id='google_cloud_default',
                                   project_id=project_id,
                                   region=region,
                                   zone=zone,
                                   cluster_name=cluster_name,
                                   storage_bucket=storage_bucket,
                                   num_workers=0,
                                   master_machine_type='n1-standard-2',
                                   image_version=image_version,
                                   dag=dag)

t2 = DataProcPySparkOperator(task_id="run",
                             gcp_conn_id='google_cloud_default',
                             project_id=project_id,
                             main=job_file,
                             job_name='test',
                             dataproc_pyspark_properties=properties,
                             cluster_name=cluster_name,
                             region=region,
                             dag=dag)

t3 = DataprocClusterDeleteOperator(task_id='delete_cluster',
                                   project_id=project_id,
                                   cluster_name=cluster_name,
                                   region=region,
                                   dag=dag)

t1 >> t2 >> t3
Example #8
def moz_dataproc_pyspark_runner(
    parent_dag_name=None,
    dag_name='run_pyspark_on_dataproc',
    default_args=None,
    cluster_name=None,
    num_workers=2,
    image_version='1.4',
    region='us-west1',
    subnetwork_uri=None,
    internal_ip_only=None,
    idle_delete_ttl='10800',
    auto_delete_ttl='21600',
    master_machine_type='n1-standard-8',
    worker_machine_type='n1-standard-4',
    num_preemptible_workers=0,
    service_account='*****@*****.**',
    init_actions_uris=None,
    additional_metadata=None,
    additional_properties=None,
    optional_components=['ANACONDA'],
    install_component_gateway=True,
    python_driver_code=None,
    py_args=None,
    job_name=None,
    aws_conn_id=None,
    gcp_conn_id='google_cloud_airflow_dataproc',
    artifact_bucket='moz-fx-data-prod-airflow-dataproc-artifacts',
    storage_bucket='moz-fx-data-prod-dataproc-scratch',
    master_disk_type='pd-standard',
    worker_disk_type='pd-standard',
    master_disk_size=1024,
    worker_disk_size=1024,
    master_num_local_ssds=0,
    worker_num_local_ssds=0,
):
    """
    This initially creates a GCP Dataproc cluster with Anaconda/Jupyter/Component Gateway,
    then calls DataProcPySparkOperator to execute the PySpark script defined by the argument
    python_driver_code. Once that succeeds, the cluster is torn down.

    **Example**: ::

        # Unsalted cluster name so subsequent runs fail if the cluster name exists
        cluster_name = 'test-dataproc-cluster-hwoo'

        # Defined in Airflow's UI -> Admin -> Connections
        gcp_conn_id = 'google_cloud_airflow_dataproc'

        run_dataproc_pyspark = SubDagOperator(
            task_id='run_dataproc_pyspark',
            dag=dag,
            subdag = moz_dataproc_pyspark_runner(
                parent_dag_name=dag.dag_id,
                dag_name='run_dataproc_pyspark',
                job_name='Do_something_on_pyspark',
                default_args=default_args,
                cluster_name=cluster_name,
                python_driver_code='gs://some_bucket/some_py_script.py',
                py_args=["-d", "{{ ds_nodash }}"],
                gcp_conn_id=gcp_conn_id)
        )

    Airflow related args:
    ---
    :param str parent_dag_name:           Parent dag name.
    :param str dag_name:                  Dag name.
    :param dict default_args:             Dag configuration.

    Dataproc Cluster related args:
    ---
    :param str cluster_name:              The name of the dataproc cluster.
    :param int num_workers:               The number of spark workers.
    :param str image_version:             The image version of software to use for dataproc
                                          cluster.
    :param str region:                    Region where the dataproc cluster will be located.
                                          The zone will be chosen automatically.
    :param str subnetwork_uri:            The subnetwork uri to be used for machine communication,
                                          cannot be specified with network_uri. Only need this if
                                          setting internal_ip_only = True. (See next parameter)
    :param bool internal_ip_only:         If True, cluster nodes will only have internal IP addresses.
                                          Can only be enabled with subnetwork_uri enabled networks.
                                          We use this for NAT'd dataproc clusters whose outbound traffic
                                          needs to be whitelisted. To use a NAT'd cluster, set
                                          subnetwork_uri='default', internal_ip_only=True, and
                                          region=us-west2-a|b|c
    :param str idle_delete_ttl:           The duration in seconds to keep idle cluster alive.
    :param str auto_delete_ttl:           The duration in seconds that the cluster will live.
    :param str master_machine_type:       Compute engine machine type to use for master.
    :param str worker_machine_type:       Compute engine machine type to use for the workers.
    :param int num_preemptible_workers:   Number of preemptible worker nodes to spin up.
    :param str service_account:           The service account for spark VMs to use. For example
                                          if cross project access is needed. Note that this svc
                                          account needs the following permissions:
                                          roles/logging.logWriter and roles/storage.objectAdmin.
    :param list init_actions_uris:        List of GCS uri's containing dataproc init scripts.
    :param dict additional_metadata:      Custom metadata keys and values, might be used to
                                          configure initialization actions.
    :param dict additional_properties:    Custom cluster properties, can be used to configure
                                          cluster components, add Spark packages, etc.
    :param str job_name:                  Name of the spark job to run.

    :param str aws_conn_id:               Airflow connection id for S3 access (if needed).
    :param str gcp_conn_id:               The connection ID to use connecting to GCP.
    :param str artifact_bucket:           Path to resources for bootstrapping the dataproc cluster
    :param str storage_bucket:            Path to scratch bucket for intermediate cluster results
    :param list optional_components:      List of optional components to install on cluster
                                          Defaults to ['ANACONDA'] for now since JUPYTER is broken.
    :param bool install_component_gateway: Enable alpha feature component gateway.
    :param master_disk_type:              Type of the boot disk for the master node
                                            (default is ``pd-standard``).
                                            Valid values: ``pd-ssd`` (Persistent Disk Solid State Drive) or
                                            ``pd-standard`` (Persistent Disk Hard Disk Drive).
    :type master_disk_type: str
    :param master_disk_size:              Disk size for the master node
    :type master_disk_size: int
    :param master_num_local_ssds:         Number of local SSDs to mount (default is 0).
    :type master_num_local_ssds: int
    :param worker_disk_type:              Type of the boot disk for the worker node
                                            (default is ``pd-standard``).
                                            Valid values: ``pd-ssd`` (Persistent Disk Solid State Drive) or
                                            ``pd-standard`` (Persistent Disk Hard Disk Drive).
    :type worker_disk_type: str
    :param worker_disk_size:              Disk size for the worker node
    :type worker_disk_size: int
    :param worker_num_local_ssds:         Number of local SSDs to mount (default is 0).
    :type worker_num_local_ssds: int

    Pyspark related args:
    ---
    :param str python_driver_code:        The Hadoop Compatible Filesystem (HCFS) URI of the main
                                          Python file to use as the driver. Must be a .py file.
    :param list py_args:                  Arguments for the pyspark job.

    """

    if cluster_name is None or python_driver_code is None:
        raise AirflowException(
            'Please specify cluster_name and/or python_driver_code.')

    dataproc_helper = DataProcHelper(
        cluster_name=cluster_name,
        job_name=job_name,
        num_workers=num_workers,
        image_version=image_version,
        region=region,
        subnetwork_uri=subnetwork_uri,
        internal_ip_only=internal_ip_only,
        idle_delete_ttl=idle_delete_ttl,
        auto_delete_ttl=auto_delete_ttl,
        master_machine_type=master_machine_type,
        worker_machine_type=worker_machine_type,
        num_preemptible_workers=num_preemptible_workers,
        service_account=service_account,
        init_actions_uris=init_actions_uris,
        optional_components=optional_components,
        additional_metadata=additional_metadata,
        additional_properties=additional_properties,
        install_component_gateway=install_component_gateway,
        aws_conn_id=aws_conn_id,
        gcp_conn_id=gcp_conn_id,
        artifact_bucket=artifact_bucket,
        storage_bucket=storage_bucket,
        master_disk_type=master_disk_type,
        master_disk_size=master_disk_size,
        worker_disk_type=worker_disk_type,
        worker_disk_size=worker_disk_size,
        master_num_local_ssds=master_num_local_ssds,
        worker_num_local_ssds=worker_num_local_ssds,
    )

    _dag_name = '{}.{}'.format(parent_dag_name, dag_name)

    with models.DAG(_dag_name, default_args=default_args) as dag:
        create_dataproc_cluster = dataproc_helper.create_cluster()

        run_pyspark_on_dataproc = DataProcPySparkOperator(
            task_id='run_dataproc_pyspark',
            job_name=job_name,
            cluster_name=cluster_name,
            region=region,
            main=python_driver_code,
            arguments=py_args,
            gcp_conn_id=gcp_conn_id,
        )

        delete_dataproc_cluster = dataproc_helper.delete_cluster()

        create_dataproc_cluster >> run_pyspark_on_dataproc >> delete_dataproc_cluster
        return dag
Example #9
def spark_subdag(
    parent_dag_name,
    child_dag_name,
    default_args,
    gcp_conn_id,
    service_account,
    main,
    pyfiles,
    arguments,
    bootstrap_bucket,
    dataproc_region="us-west1",
    num_preemptible_workers=10,
):
    """Run the PySpark job for unnesting and range-partitioning Prio pings from
    the ingestion service.

    :param str parent_dag_name:         Name of the parent DAG.
    :param str child_dag_name:          Name of the child DAG.
    :param Dict[str, Any] default_args: Default arguments for the child DAG.
    :param str gcp_conn_id:             Name of the Airflow GCP connection.
    :param str service_account:         The address of the service account.
    :param str main:                    URI of the main Python file for the PySpark job.
    :param List[str] pyfiles:           Additional Python files passed to the job.
    :param List[str] arguments:         Arguments passed to the PySpark job.
    :param str bootstrap_bucket:        GCS location of the cluster init action script.
    :param str dataproc_region:         The region of the Dataproc cluster.
    :param int num_preemptible_workers: The number of preemptible workers.
    :return: DAG
    """

    connection = GoogleCloudBaseHook(gcp_conn_id=gcp_conn_id)

    shared_config = {
        "cluster_name": "prio-staging-{{ds_nodash}}",
        "gcp_conn_id": gcp_conn_id,
        "project_id": connection.project_id,
        # From an error when not specifying the region:
        # - Dataproc images 2.0 and higher do not support the to-be
        #   deprecated global region. Please use any non-global Dataproc
        #   region instead
        #  - Must specify a zone in GCE configuration when using
        #    'regions/global'. To use auto zone placement, specify
        #    regions/<non-global-region> in request path, e.g.
        #    regions/us-central1
        "region": dataproc_region,
    }

    with DAG(f"{parent_dag_name}.{child_dag_name}",
             default_args=default_args) as dag:
        create_dataproc_cluster = DataprocClusterCreateOperator(
            task_id="create_dataproc_cluster",
            image_version="preview-ubuntu18",
            service_account=service_account,
            master_machine_type="n1-standard-4",
            worker_machine_type="n1-standard-4",
            num_workers=2,
            num_preemptible_workers=num_preemptible_workers,
            init_actions_uris=[
                f"{bootstrap_bucket}/install-python-requirements.sh"
            ],
            idle_delete_ttl=600,
            dag=dag,
            **shared_config,
        )

        run_dataproc_spark = DataProcPySparkOperator(
            task_id="run_dataproc_spark",
            main=main,
            dataproc_pyspark_jars=[
                "gs://spark-lib/bigquery/spark-bigquery-latest_2.12.jar"
            ],
            pyfiles=pyfiles,
            arguments=arguments,
            dag=dag,
            **shared_config,
        )

        delete_dataproc_cluster = DataprocClusterDeleteOperator(
            task_id="delete_dataproc_cluster",
            trigger_rule="all_done",
            dag=dag,
            **shared_config,
        )
        create_dataproc_cluster >> run_dataproc_spark >> delete_dataproc_cluster
        return dag
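

# A usage sketch for the subdag factory above (assumption; connection ids, buckets, and URIs
# are illustrative, not from the source), following the same SubDagOperator pattern shown in
# the moz_dataproc_pyspark_runner docstring earlier on this page:
#
#     staging = SubDagOperator(
#         task_id="staging",
#         dag=dag,
#         subdag=spark_subdag(
#             parent_dag_name=dag.dag_id,
#             child_dag_name="staging",
#             default_args=default_args,
#             gcp_conn_id="google_cloud_prio",
#             service_account="prio-runner@example-project.iam.gserviceaccount.com",
#             main="gs://example-bucket/spark/main.py",
#             pyfiles=["gs://example-bucket/spark/prio_processor.egg"],
#             arguments=["--date", "{{ ds }}"],
#             bootstrap_bucket="gs://example-bucket/bootstrap",
#         ),
#     )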
Example #10
    region=region,
    start_date=start_date),
                                              task_id=catalog_task_id,
                                              dag=dag)

consume_show_comments_job_path = "gs://" + gcs_netflix_bucket + "/spark-jobs/consume_reddit_comments.py"
reddit_destination_path = "gs://" + gcs_netflix_bucket + "/comments/raw/comments.parquet"
gcp_netflix_catalog_path = "gs://" + gcs_netflix_bucket + "/" + catalog_path

consume_show_comment_to_datalake = DataProcPySparkOperator(
    task_id='consume_show_comment_to_datalake',
    main=consume_show_comments_job_path,
    cluster_name=cluster_name,
    job_name='consume_show_comments',
    region=region,
    arguments=[
        Variable.get("reddit_client_id"),
        Variable.get("reddit_client_secret"), gcp_netflix_catalog_path,
        ["netflix NetflixBestOf bestofnetflix"], reddit_destination_path
    ],
    gcp_conn_id=gcp_conn,
    dag=dag)

generate_show_comments_job_path = "gs://" + gcs_netflix_bucket + "/spark-jobs/generate_show_comments.py"
generate_show_comment_to_datalake = DataProcPySparkOperator(
    task_id='generate_show_comment_to_datalake',
    main=generate_show_comments_job_path,
    cluster_name=cluster_name,
    job_name='generate_show_comments',
    region=region,
    arguments=[gcp_netflix_catalog_path, reddit_destination_path],
dataproc_create_cluster = DataprocClusterCreateOperator(
    task_id="create_dataproc",
    cluster_name="analyse-pricing-{{ ds }}",
    project_id="airflowbolcom-4b5ba3f7fec9aea9",
    num_workers=2,
    zone="europe-west4-a",
    dag=dag,
)

compute_aggregates = DataProcPySparkOperator(
    task_id='compute_aggregates',
    main='gs://dpranantha/statistics/build_statistics.py',
    cluster_name='analyse-pricing-{{ ds }}',
    arguments=[
        "gs://dpranantha/{{ ds }}/land_price_uk_*.json",
        "gs://dpranantha/{{ ds }}/currency_*.json",
        "gs://dpranantha/{{ ds }}/average/"
    ],
    dag=dag,
)

dataproc_delete_cluster = DataprocClusterDeleteOperator(
    task_id="delete_dataproc",
    cluster_name="analyse-pricing-{{ ds }}",
    project_id="airflowbolcom-4b5ba3f7fec9aea9",
    trigger_rule=TriggerRule.ALL_DONE,
    dag=dag,
)

gcsBq = GoogleCloudStorageToBigQueryOperator(
Example #12
    crear_cluster = DataprocClusterCreateOperator(
        task_id="create_cluster",
        cluster_name="ephemeral-spark-cluster-{{ds_nodash}}",
        master_machine_type="n1-standard-1",
        master_disk_size=50,
        worker_machine_type="n1-standard-1",
        worker_disk_size=50,
        num_workers=2,
        region="us-west1",
        zone="us-west1-a",
        image_version='1.4')

    pyspark = DataProcPySparkOperator(
        task_id="run_pyspark",
        main=route,
        cluster_name="ephemeral-spark-cluster-{{ds_nodash}}",
        region="us-west1")

    borrar_cluster = DataprocClusterDeleteOperator(
        task_id="borrar_cluster",
        cluster_name="ephemeral-spark-cluster-{{ds_nodash}}",
        region="us-west1",
        trigger_rule=TriggerRule.ALL_DONE)

    dummy_final = DummyOperator(task_id="prueba_final")

    delay = PythonOperator(task_id="delay1",
                           python_callable=my_function,
                           op_args=[200])
Example #13
        # in YYYYMMDD format. See docs https://airflow.apache.org/code.html?highlight=macros#macros
        cluster_name='ephemeral-spark-cluster-{{ ds_nodash }}',
        image_version='1.5-debian10',
        num_workers=0,
        master_machine_type='n1-standard-2',
        num_masters=1,
        storage_bucket='egen-training-286300-dataproc-staging',
        zone='us-central1-b')

    # Submit the PySpark job.
    submit_pyspark = DataProcPySparkOperator(
        task_id='run_dataproc_pyspark',
        main='gs://egen-training-286300' +
        '/spark-jobs/sparkify_etl_2.py',
        # Obviously needs to match the name of cluster created in the prior Operator.
        cluster_name='ephemeral-spark-cluster-{{ ds_nodash }}',
        # Let's template our arguments for the pyspark job from the POST payload.
        arguments=[
            "--bucket={{ dag_run.conf['bucket'] }}",
            "--raw_file_name={{dag_run.conf['raw_file_name']}}"
        ])

    # Load the transformed files to a BigQuery table.
    bq_load = GoogleCloudStorageToBigQueryOperator(
        task_id='GCS_to_BigQuery',
        bucket="{{dag_run.conf['bucket']}}",
        # Wildcard for objects created by spark job to be written to BigQuery
        # Reads the relative path to the objects transformed by the spark job from the POST message.
        source_objects=["transformed/partitioned/users_table.parquet/part-*"],
        destination_project_dataset_table='egen.users_table_no_partition',
        autodetect=True,
        master_disk_size=50,
        worker_machine_type="n1-standard-1",
        worker_disk_size=50,
        num_workers=2,
        region="us-east1",
        zone="us-east1-b",
        init_actions_uris=[
            'gs://dataproc-initialization-actions/python/pip-install.sh'
        ],
        optional_components=["ANACONDA"],
        metadata={'PIP_PACKAGES': 'google-cloud-storage'},
    )

    submit_pyspark = DataProcPySparkOperator(
        task_id="run_pyspark_etl",
        main=PYSPARK_JOB,
        cluster_name="ephemeral-spark-cluster-{{ds_nodash}}",
        region="us-east1")

    bq_load_profeco_data = GoogleCloudStorageToBigQueryOperator(
        task_id="bq_load_csv_profeco",
        bucket='gnp-storage',
        source_objects=["Profeco/resources/Sin-fecha/profeco.pdf"],
        destination_project_dataset_table=PROJECT_ID + ".GNP.Profeco_table",
        autodetect=True,
        source_format="CSV",
        field_delimiter=',',
        create_disposition="CREATE_IF_NEEDED",
        skip_leading_rows=0,
        write_disposition="WRITE_APPEND",
        max_bad_records=0)
Example #15
    create_cluster = DataprocClusterCreateOperator(
        task_id="create_cluster",
        project_id=HYDRO_DATA_PROJECT_ID,
        cluster_name="spark-cluster-{{ ds_nodash }}",  # spark-cluster-YYYMMDD
        num_workers=2,
        storage_bucket=SPARK_BUCKET,
        region="us-west1",
        zone="us-west1-a",
        idle_delete_ttl=300,  #  5 mins is the min. value
        gcp_conn_id=GOOGLE_CLOUD_STORAGE_CONN_ID,
    )

    calculate_daily_average_kwh = DataProcPySparkOperator(
        task_id="calculate_daily_average_kwh",
        main=f"gs://{SPARK_BUCKET}/pyspark/daily_average_kwh.py",
        cluster_name="spark-cluster-{{ ds_nodash }}",
        dataproc_pyspark_jars="gs://spark-lib/bigquery/spark-bigquery-latest.jar",
        gcp_conn_id=GOOGLE_CLOUD_STORAGE_CONN_ID,
        region="us-west1",
    )

    calculate_daily_sum_kwh = DataProcPySparkOperator(
        task_id="calculate_daily_sum_kwh",
        main=f"gs://{SPARK_BUCKET}/pyspark/daily_sum_kwh.py",
        cluster_name="spark-cluster-{{ ds_nodash }}",
        dataproc_pyspark_jars="gs://spark-lib/bigquery/spark-bigquery-latest.jar",
        gcp_conn_id=GOOGLE_CLOUD_STORAGE_CONN_ID,
        region="us-west1"
    )

    delete_cluster = DataprocClusterDeleteOperator(
        task_id="delete_cluster",
    ],
    metadata={
        'PIP_PACKAGES':
        'tensorflow==2.0.0 pyarrow==0.15.1 sentencepiece==0.1.85 gcsfs nltk tensorflow-hub tables bert-for-tf2 absl-py google-cloud-storage google-cloud-logging '
    },
    image_version='1.4.22-debian9',
    master_machine_type=MASTER_MACHINE_TYPE,
    worker_machine_type=WORKER_MACHINE_TYPE,
    properties={
        "dataproc:dataproc.logging.stackdriver.job.driver.enable": "true"
    },
    region=REGION,
    task_id='create_spark',
    dag=dag1)

run_spark = DataProcPySparkOperator(
    main='gs://topic-sentiment-1/code/data_wrangling.py',
    arguments=[RAW_DATA, TOKENIZED_DATA_DIR, THRESHOLD],
    task_id='run_spark',
    cluster_name=SPARK_CLUSTER,
    region=REGION,
    dag=dag1)

delete_spark = DataprocClusterDeleteOperator(cluster_name=SPARK_CLUSTER,
                                             project_id=PROJECT,
                                             region=REGION,
                                             task_id='delete_spark')

# Dag definition
begin >> create_spark >> run_spark >> delete_spark >> end
    dag=dag,
)

dataproc_remove_cluster = DataprocClusterDeleteOperator(
    task_id="dataproc_remove_cluster",
    cluster_name="analyse-pricing-{{ ds }}",
    project_id="airflowbolcom-may2829-b2a87b4d",
    dag=dag,
)

dataproc_run_pyspark = DataProcPySparkOperator(
    task_id="dataproc_run_pyspark",
    main="gs://een_emmer/build_statistics.py",
    cluster_name="analyse-pricing-{{ ds }}",
    arguments=[
        "gs://een_emmer/daily_load_{{ ds }}",
        "gs://een_emmer/exchangerate_{{ ds }}.txt",
        "gs://een_emmer/dataproc_output_{{ ds }}",
    ],
    dag=dag,
)

prices_uk_from_postgres_to_cloudstorage = PostgresToGoogleCloudStorageOperator(
    task_id="prices_uk_from_postgres_to_cloudstorage",
    sql=
    "SELECT * FROM land_registry_price_paid_uk WHERE transfer_date = '{{ ds }}'",
    bucket="een_emmer",
    filename="daily_load_{{ ds }}",
    postgres_conn_id="stuff_postgres",
    dag=dag,
)
Example #18
        'bucket': 'europe-west1-training-airfl-bb0beabce-bucket',
        'job_name': '{{ task_instance_key_str }}'
    },
    py_file="gs://airflow-daniel/dataflow_job.py",
    dag=dag)

from airflow.contrib.operators.dataproc_operator import (DataprocClusterCreateOperator, DataprocClusterDeleteOperator,
                                                         DataProcPySparkOperator, )

dataproc_create_cluster = DataprocClusterCreateOperator(task_id="create_dataproc",
                                                        cluster_name="analyse-pricing-{{ ds }}",
                                                        project_id="airflowbolcom-b01c3abbfb10e7ee",
                                                        num_workers=2, zone="europe-west4-a", dag=dag, )
compute_aggregates = DataProcPySparkOperator(task_id='compute_aggregates',
                                             main='gs://airflow-daniel/build_statistics.py',
                                             cluster_name='analyse-pricing-{{ ds }}', arguments=[
        "gs://airflow-daniel/land_registry_price_paid_uk/{{ ds }}/*.json",
        "gs://airflow-daniel/currency/{{ ds }}/*.json", "gs://airflow-daniel/average_prices/{{ ds }}/"],
                                             dag=dag)
from airflow.utils.trigger_rule import TriggerRule

dataproc_delete_cluster = DataprocClusterDeleteOperator(
    task_id="delete_dataproc", cluster_name="analyse-pricing-{{ ds }}", project_id="airflowbolcom-b01c3abbfb10e7ee",
    trigger_rule=TriggerRule.ALL_DONE, dag=dag)

write_to_bq = GoogleCloudStorageToBigQueryOperator(task_id="write_to_bq",
                                                   bucket="airflow-daniel",
                                                   source_objects=["average_prices/{{ ds }}/*.parquet"],
                                                   destination_project_dataset_table="airflow.airflow{{ ds_nodash }}",
                                                   source_format="PARQUET", write_disposition="WRITE_TRUNCATE",
                                                   dag=dag, )
pgsql_to_gcs >> load_into_bigquery
Example #19
    dag.doc_md = __doc__

    create_cluster_task = DataprocClusterCreateOperator(
        task_id='create_cluster',
        project_id='youtubelist-256522',
        cluster_name='spark-cluster1-{{ ds_nodash }}',
        num_workers=2,
        storage_bucket="opi_staging_bucket",
        metadata={'PIP_PACKAGES': 'pandas praw google-cloud-storage'},
        region="us-central1")

    feature_engineering_task = DataProcPySparkOperator(
        task_id='feature_engineering',
        main='gs://opi_processed_data/pyspark/create_feature_store.py',
        cluster_name='spark-cluster1-{{ ds_nodash }}',
        dataproc_pyspark_jars=
        "gs://spark-lib/bigquery/spark-bigquery-latest_2.12.jar",
        arguments=['--timestamp', '{{ ts_nodash }}', '--train_perc', "70"],
        region="us-central1")

    delete_cluster_task = DataprocClusterDeleteOperator(
        task_id="delete_cluster",
        project_id="youtubelist-256522",
        cluster_name="spark-cluster1-{{ ds_nodash }}",
        region="us-central1")

    train_task = PythonOperator(
        task_id="train_model",
        python_callable=train_model,
        op_args=['{{ ts_nodash }}', 0.9, 0.5],
    )
Example #20
    gcs_path='currency/{{ds}}.json')

dataproc_create_cluster = DataprocClusterCreateOperator(
    task_id="create_dataproc",
    cluster_name="analyse-pricing-{{ ds }}",
    project_id='airflowbolcom-bc4a05f9b43155a6',
    num_workers=2,
    zone="europe-west4-a",
    dag=dag,
)
compute_aggregates = DataProcPySparkOperator(
    task_id='compute_aggregates',
    main='gs://riccardos_bucket/build_statistics.py',
    cluster_name='analyse-pricing-{{ ds }}',
    arguments=[
        "gs://riccardos_bucket/house_data/{{ ds }}.json",
        "gs://riccardos_bucket/currency/{{ ds }}.json",
        "gs://riccardos_bucket/average_prices/{{ ds }}/"
    ],
    dag=dag,
)

dataproc_delete_cluster = DataprocClusterDeleteOperator(
    task_id="delete_dataproc",
    cluster_name="analyse-pricing-{{ ds }}",
    project_id='airflowbolcom-bc4a05f9b43155a6',
    trigger_rule=TriggerRule.ALL_DONE,
    dag=dag,
)

[pgsl_to_gcs, currency_gcs
Example #21
"""

query_all = """
create external table if not exists customer_all
 (cust_details string)  
 STORED AS TEXTFILE LOCATION 'gs://mysqldataflow/json/customer/'
"""
dag = airflow.DAG('Import-MySQL-to-GS-and-DataProc',
                  catchup=False,
                  default_args=default_args,
                  schedule_interval=datetime.timedelta(days=1))

t1 = DataProcPySparkOperator(
    task_id='import-mysql-data',
    main='gs://mysqlnosql/spark_jdbc_to_gs.py',
    cluster_name='mydataproc2',
    region='us-central1',
    dataproc_pyspark_jars=['gs://mysqlnosql/spark-avro.jar'],
    dag=dag)

t2 = DataProcHiveOperator(query=query_part,
                          cluster_name='mydataproc2',
                          region='us-central1',
                          task_id='create_table_in_hive_2_cols',
                          dag=dag)

t3 = DataProcHiveOperator(query=query_all,
                          cluster_name='mydataproc2',
                          region='us-central1',
                          task_id='create_table_in_hive_all_cols',
                          dag=dag)
Example #22
    project_id="airflowbolcom-jan2829-b51a8ad2",
    region='europe-west4',
    dag=dag)

arguments = [
    'gs://europe-west1-training-airfl-a98394bc-bucket/data/properties/properties_*.json',  #input_properties
    'gs://europe-west1-training-airfl-a98394bc-bucket/data/use_case_ivan/exchange_rates.json',  #input_currencies
    'gs://europe-west1-training-airfl-a98394bc-bucket/use_case_output',  #target_path
    'EUR',  #target_currency
    '2018-01-03',  #target_date
]

run_spark = DataProcPySparkOperator(
    task_id="run_spark",
    main="gs://europe-west1-training-airfl-a98394bc-bucket/build_statistics.py",
    cluster_name="my-dataproc-cluster",
    region='europe-west4',
    arguments=arguments,
    dag=dag)
# delete_dataproc_cluster = DataprocClusterDeleteOperator(task_id="delete_dataproc_cluster",
#                                                         cluster_name="my-dataproc-cluster",
#                                                         project_id="airflowbolcom-jan2829-b51a8ad2",
#                                                         region='europe-west4',
#                                                         dag=dag)
# fetch_exchange_rates
# fetch_exchange_rates >> create_dataproc_cluster
# fetch_exchange_rates >> create_dataproc_cluster >> run_spark >> delete_dataproc_cluster
fetch_exchange_rates >> create_dataproc_cluster >> run_spark
# write_response_to_gcs = LaunchToGcsOperator(task_id="write_response_to_gcs",
#                                             python_callable=_connect,
#                                             provide_context=True,
Example #23
}

with DAG("movies_etl", default_args=DEFAULT_DAG_ARGS) as dag:

    create_cluster = DataprocClusterCreateOperator(
        task_id="create_dataproc_cluster",
        cluster_name="ephemeral-spark-cluster-{{ds_nodash}}",
        master_machine_type="n1-standard-1",
        worker_machine_type="n1-standard-2",
        num_workers=2,
        region="asia-southeast2",
        zone="asia-southeast2-a")

    submit_pyspark = DataProcPySparkOperator(
        task_id="run_pyspark_etl",
        main=PYSPARK_JOB,
        cluster_name="ephemeral-spark-cluster-{{ds_nodash}}",
        region="asia-southeast2")

    bq_load_movies_1 = GoogleCloudStorageToBigQueryOperator(
        task_id="bq_load_movies_1",
        bucket='bigdata-etl-2_flights',
        source_objects=[
            "movies_data_output/" + current_date + "_datamart_1/*.json"
        ],
        destination_project_dataset_table=
        f'{PROJECT_ID}:qoala_test.movies_datamart_1',
        autodetect=True,
        source_format="NEWLINE_DELIMITED_JSON",
        create_disposition="CREATE_IF_NEEDED",
        skip_leading_rows=0,
Example #24
				 "yarn:yarn:scheduler.maximum-allocation-mb": "50000",
				 "yarn:yarn.nodemanager.resource.memory-mb": "50000",
				 "spark:spark.driver.maxResultsSize": "55g",
				 "spark:spark.driver.memory": "19g",
				 "spark:spark.executor.memory": "19g",
				 "spark:spark.executor.extraJavaOptions": "-XX:+PrintGCDetails"},
	master_machine_type='n1-standard-16',
	worker_machine_type='n1-standard-16',
	zone='us-central1-b',
	gcp_conn_id='google-cloud-default',
	service_account='*****@*****.**',
	service_account_scopes=['https://www.googleapis.com/auth/cloud-platform'],
	delegate_to='*****@*****.**',
	dag=dag)

t4 = DataProcPySparkOperator(
	task_id='submit_job',
	main='gs://store_recs/development/model/spark_store_recs_v2.py',
	cluster_name='cluster-1',
	dag=dag
	)

t5 = DataprocClusterDeleteOperator(
	task_id='delete_cluster',
	cluster_name='cluster-1',
	project_id='my_project',
	dag=dag)

t2.set_upstream(t1)
t4.set_upstream([t3, t2])
t5.set_upstream(t4)
Example #25
    create_cluster = DataprocClusterCreateOperator(
        task_id='create_dataproc_cluster',
        # ds_nodash is an airflow macro for "[Execution] Date string no dashes"
        # in YYYYMMDD format. See docs https://airflow.apache.org/code.html?highlight=macros#macros
        cluster_name='ephemeral-spark-cluster-{{ ds_nodash }}',
        num_workers=2,
        num_preemptible_workers=2,
        zone=Variable.get('gce_zone'))

    # Submit the PySpark job.
    submit_pyspark = DataProcPySparkOperator(
        task_id='run_dataproc_pyspark',
        main=PYSPARK_JOB,
        # Obviously needs to match the name of cluster created in the prior Operator.
        cluster_name='ephemeral-spark-cluster-{{ ds_nodash }}',
        # Let's template our arguments for the pyspark job from the POST payload.
        arguments=[
            "--gcs_path_raw={{ dag_run.conf['raw_path'] }}",
            "--gcs_path_transformed=gs://" + BUCKET +
            "/{{ dag_run.conf['transformed_path'] }}"
        ])
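    # The dag_run.conf values templated above are supplied when this DAG is triggered with a
    # run configuration, e.g. via the Airflow 1.x CLI (paths are illustrative, not from the source):
    #   airflow trigger_dag <dag_id> --conf '{"raw_path": "gs://bucket/raw/file.json", "transformed_path": "transformed/out"}'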

    # Load the transformed files to a BigQuery table.
    bq_load = GoogleCloudStorageToBigQueryOperator(
        task_id='GCS_to_BigQuery',
        bucket=BUCKET,
        # Wildcard for objects created by spark job to be written to BigQuery
        # Reads the relative path to the objects transformed by the spark job from the POST message.
        source_objects=["{{ dag_run.conf['transformed_path'] }}/part-*"],
        destination_project_dataset_table=OUTPUT_TABLE,
        schema_fields=None,
     gcs_bucket='output_bucket_for_airflow',
     gcs_path='exchange-rates-{{ ds }}.json',
     method="GET",
     http_conn_id="http_default",
     gcs_conn_id="google_cloud_default")
 create_cluster = DataprocClusterCreateOperator(
     task_id='create_cluster',
     project_id='afspfeb3-28e3a1b32a56613ef127e',
     cluster_name='analyse-pricing-{{ ds }}',
     num_workers=2,
     zone='europe-west4-a')
 calculate_statistics = DataProcPySparkOperator(
     task_id='calculate_statistics',
     main='gs://output_bucket_for_airflow/build_statistics.py',
     arguments=[
         'gs://output_bucket_for_airflow/prices-{{ ds }}.json',
         'gs://output_bucket_for_airflow/exchange-rates-{{ ds }}.json',
         'gs://output_bucket_for_airflow/output.parquet', 'EUR',
         '{{ yesterday_ds }}'
     ],
     cluster_name='analyse-pricing-{{ ds }}')
 delete_cluster = DataprocClusterDeleteOperator(
     task_id='delete_cluster',
     cluster_name='analyse-pricing-{{ds}}',
     project_id='afspfeb3-28e3a1b32a56613ef127e',
     region='global')
 # statistics_to_big_query = GoogleCloudStorageToBigQueryOperator(task_id='statistics_to_big_query',
 #                                                                bucket='output_bucket_for_airflow',
 #                                                                source_objects=[''],
 #                                                                destination_project_dataset_table,
 #                                                                schema_fields=None,
 #                                                                schema_object=None,
Example #27
def export_to_parquet(
    table,
    destination_table=None,
    static_partitions=[],
    arguments=[],
    use_storage_api=False,
    dag_name="export_to_parquet",
    parent_dag_name=None,
    default_args=None,
    gcp_conn_id="google_cloud_derived_datasets",
    dataproc_zone="us-central1-a",
    dataproc_storage_bucket="moz-fx-data-derived-datasets-parquet",
    num_workers=2,
    num_preemptible_workers=0,
    gcs_output_bucket="moz-fx-data-derived-datasets-parquet",
):

    """ Export a BigQuery table to Parquet.

    https://github.com/mozilla/bigquery-etl/blob/master/script/pyspark/export_to_parquet.py

    :param str table:                             [Required] BigQuery table name
    :param Optional[str] destination_table:       Output table name, defaults to table,
                                                  will have r'_v[0-9]+$' replaced with
                                                  r'/v[0-9]+'
    :param List[str] static_partitions:           Static partition values appended to the
                                                  export prefix
    :param List[str] arguments:                   Additional pyspark arguments
    :param bool use_storage_api:                  Whether to read from the BigQuery
                                                  Storage API or an AVRO export
    :param str dag_name:                          Name of DAG
    :param Optional[str] parent_dag_name:         Parent DAG name
    :param Optional[Dict[str, Any]] default_args: DAG configuration
    :param str gcp_conn_id:                       Airflow connection id for GCP access
    :param str dataproc_storage_bucket:           Dataproc staging GCS bucket
    :param str dataproc_zone:                     GCP zone to launch dataproc clusters
    :param int num_workers:                       Number of Dataproc workers
    :param int num_preemptible_workers:           Number of Dataproc preemptible workers
    :param str gcs_output_bucket:                 GCS bucket for exported output

    :return: airflow.models.DAG
    """

    # remove the dataset prefix and partition suffix from table
    table_id = table.rsplit(".", 1)[-1]
    unqualified_table, _, partition_id = table_id.partition("$")
    # limit cluster name to 35 characters plus suffix of -export-YYYYMMDD (51 total)
    cluster_name = unqualified_table.replace("_", "-")
    if len(cluster_name) > 35:
        # preserve version when truncating cluster name to 35 characters
        prefix, version = re.match(r"(.*?)(-v[0-9]+)?$", cluster_name).groups("")
        cluster_name = prefix[:35 - len(version)] + version
    cluster_name += "-export-{{ ds_nodash }}"

    dag_prefix = parent_dag_name + "." if parent_dag_name else ""
    connection = GoogleCloudBaseHook(gcp_conn_id=gcp_conn_id)

    if destination_table is None:
        destination_table = unqualified_table
    # separate version using "/" instead of "_"
    export_prefix = re.sub(r"_(v[0-9]+)$", r"/\1", destination_table) + "/"
    if static_partitions:
        export_prefix += "/".join(static_partitions) + "/"
    avro_prefix = "avro/" + export_prefix
    if not static_partitions and partition_id:
        avro_prefix += "partition_id=" + partition_id + "/"
    avro_path = "gs://" + gcs_output_bucket + "/" + avro_prefix + "*.avro"

    with models.DAG(dag_id=dag_prefix + dag_name, default_args=default_args) as dag:

        create_dataproc_cluster = DataprocClusterCreateOperator(
            task_id="create_dataproc_cluster",
            cluster_name=cluster_name,
            gcp_conn_id=gcp_conn_id,
            project_id=connection.project_id,
            num_workers=num_workers,
            image_version="1.4",
            storage_bucket=dataproc_storage_bucket,
            zone=dataproc_zone,
            master_machine_type="n1-standard-8",
            worker_machine_type="n1-standard-8",
            num_preemptible_workers=num_preemptible_workers,
            init_actions_uris=[
                "gs://dataproc-initialization-actions/python/pip-install.sh",
            ],
            metadata={"PIP_PACKAGES": "google-cloud-bigquery==1.20.0"},
        )

        run_dataproc_pyspark = DataProcPySparkOperator(
            task_id="run_dataproc_pyspark",
            cluster_name=cluster_name,
            dataproc_pyspark_jars=[
                "gs://spark-lib/bigquery/spark-bigquery-latest.jar"
            ],
            dataproc_pyspark_properties={
                "spark.jars.packages": "org.apache.spark:spark-avro_2.11:2.4.4",
            },
            main="https://raw.githubusercontent.com/mozilla/bigquery-etl/master"
            "/script/pyspark/export_to_parquet.py",
            arguments=[table]
            + [
                "--" + key + "=" + value
                for key, value in {
                    "avro-path": (not use_storage_api) and avro_path,
                    "destination": "gs://" + gcs_output_bucket,
                    "destination-table": destination_table,
                }.items()
                if value
            ]
            + (["--static-partitions"] if static_partitions else [])
            + [static_partitions]
            + arguments,
            gcp_conn_id=gcp_conn_id,
        )

        delete_dataproc_cluster = DataprocClusterDeleteOperator(
            task_id="delete_dataproc_cluster",
            cluster_name=cluster_name,
            gcp_conn_id=gcp_conn_id,
            project_id=connection.project_id,
            trigger_rule=trigger_rule.TriggerRule.ALL_DONE,
        )

        if not use_storage_api:
            avro_export = BigQueryToCloudStorageOperator(
                task_id="avro_export",
                source_project_dataset_table=table,
                destination_cloud_storage_uris=avro_path,
                compression=None,
                export_format="AVRO",
                bigquery_conn_id=gcp_conn_id,
            )
            avro_delete = GoogleCloudStorageDeleteOperator(
                task_id="avro_delete",
                bucket_name=gcs_output_bucket,
                prefix=avro_prefix,
                google_cloud_storage_conn_id=gcp_conn_id,
                trigger_rule=trigger_rule.TriggerRule.ALL_DONE,
            )
            avro_export >> run_dataproc_pyspark >> avro_delete

        create_dataproc_cluster >> run_dataproc_pyspark >> delete_dataproc_cluster

        return dag
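

# A registration sketch (assumption, not from the source): when parent_dag_name is omitted the
# returned DAG can be assigned to a module-level name so the scheduler picks it up. The table,
# partition, and default_args are illustrative.
from datetime import datetime

example_export = export_to_parquet(
    table="my_dataset.my_table_v1$20200101",
    static_partitions=["submission_date=2020-01-01"],
    default_args={"owner": "airflow", "start_date": datetime(2020, 1, 1)},
)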
Example #28
    HttpToGcsOperator(
        task_id="get_currency_" + currency,
        method="GET",
        endpoint="airflow-training-transform-valutas?date={{ ds }}&from=GBP&to=" + currency,
        http_conn_id="http_airflow_training",
        gcs_conn_id="google_cloud_default",
        gcs_bucket="airflow-training-knab-geert",
        gcs_path="currency/{{ ds }}-" + currency + ".json",
        dag=dag
    ) >> dataproc_create_cluster


compute_aggregates = DataProcPySparkOperator(
    task_id='compute_aggregates',
    main='gs://airflow-training-knab-geert/build_statistics.py',
    cluster_name='analyse-pricing-{{ ds }}',
    arguments=["{{ ds }}"],
    dag=dag,
)
dataproc_create_cluster >> compute_aggregates


dataproc_delete_cluster = DataprocClusterDeleteOperator(
    task_id="delete_cluster",
    cluster_name="analyse-pricing-{{ ds }}",
    project_id="gdd-25d677142443a8e2ace1927d48",
    trigger_rule=TriggerRule.ALL_DONE,
    dag=dag,
)
compute_aggregates >> dataproc_delete_cluster
Example #29
    bucket="airflow_training_data",
    filename="data_{{ds_nodash}}/land_registry_price.json",
    dag=dag)

dataproc_create_cluster = DataprocClusterCreateOperator(
    task_id="create_dataproc_cluster",
    cluster_name="dataproc-cluster-dag-training-{{ ds }}",
    project_id="airflowbolcom-b9aabd6971d488d9",
    num_workers=2,
    zone="europe-west1-d",
    dag=dag)

compute_aggregates = DataProcPySparkOperator(
    task_id="compute_aggregates",
    main=
    'gs://europe-west1-training-airfl-68071199-bucket/other/build_statistics_simple.py',
    cluster_name="dataproc-cluster-dag-training-{{ ds }}",
    arguments=["{{ ds_nodash }}"],
    dag=dag)

dataproc_delete_cluster = DataprocClusterDeleteOperator(
    task_id="delete_dataproc_cluster",
    cluster_name="dataproc-cluster-dag-training-{{ ds }}",
    project_id="airflowbolcom-b9aabd6971d488d9",
    trigger_rule=TriggerRule.ALL_DONE,
    dag=dag)

dest_table = "airflowbolcom-b9aabd6971d488d9:airflow_training_dataset.land_registry_${{ ds_nodash }}"
bucket_to_bq = GoogleCloudStorageToBigQueryOperator(
    task_id="gcs_to_bq",
    bucket="airflow_training_data",
Example #30
    DataprocClusterCreateOperator, DataprocClusterDeleteOperator,
    DataProcPySparkOperator)

dataproc_create_cluster = DataprocClusterCreateOperator(
    task_id="create_dataproc",
    cluster_name="analyse-pricing-{{ ds }}",
    project_id=PROJECT_ID,
    num_workers=2,
    zone="europe-west4-a",
    dag=dag)

compute_aggregates = DataProcPySparkOperator(
    task_id='compute_aggregates',
    main='gs://europe-west1-training-airfl-67643e8c-bucket/build_statistics.py',
    cluster_name='analyse-pricing-{{ ds }}',
    arguments=[
        "gs://gabriele-bucket/pg_export/{{ ds }}/*.json",
        "gs://gabriele-bucket/currency/{{ ds }}/*.json",
        "gs://gabriele-bucket/average_prices/{{ ds }}/"
    ],
    dag=dag)

from airflow.utils.trigger_rule import TriggerRule
dataproc_delete_cluster = DataprocClusterDeleteOperator(
    task_id="delete_dataproc",
    cluster_name="analyse-pricing-{{ ds }}",
    project_id=PROJECT_ID,
    trigger_rule=TriggerRule.ALL_DONE,
    dag=dag)

from airflow_training.operators.gcs_to_bq import GoogleCloudStorageToBigQueryOperator
Example #31
                                         endpoint='airflow-training-transform-valutas?date={{ ds }}&to=EUR',
                                         bucket="bvb-data",
                                         filename="exchange_rate_{{ ds }}",
                                         dag=dag)

dataproc_create_cluster = DataprocClusterCreateOperator(task_id="dataproc_create",
                                                        cluster_name="analyse-pricing-{{ ds }}",
                                                        project_id='airflowbolcom-may2829-aaadbb22',
                                                        num_workers=2,
                                                        zone="europe-west4-a",
                                                        dag=dag)

compute_aggregates = DataProcPySparkOperator(task_id="dataproc_run",
                                             main="gs://europe-west1-training-airfl-4ecc4ae4-bucket/build_statistics.py",
                                             cluster_name="analyse-pricing-{{ ds }}",
                                             arguments=["gs://bvb-data/daily_load_{{ ds}}",
                                                        "gs://bvb-data/exchange_rate_{{ ds }}",
                                                        "gs://bvb-data/output_file_{{ ds }}"],
                                             dag=dag)

dataproc_delete_cluster = DataprocClusterDeleteOperator(task_id="dataproc_delete",
                                                        cluster_name="analyse-pricing-{{ ds }}",
                                                        project_id='airflowbolcom-may2829-aaadbb22',
                                                        dag=dag)

gcstobq = GoogleCloudStorageToBigQueryOperator(task_id="gcs_to_bq",
                                               bucket="bvb-data",
                                               source_objects=["output_file_{{ ds }}/part-*"],
                                               destination_project_dataset_table="airflowbolcom-may2829-aaadbb22:prices.land_registry_price${{ ds_nodash }}",
                                               source_format="PARQUET",
                                               write_disposition="WRITE_TRUNCATE",
Example #32
        query="SHOW DATABASES;",
        region=REGION,
        cluster_name=CLUSTER_NAME,
    )

    spark_task = DataProcSparkOperator(
        task_id="spark_task",
        main_class="org.apache.spark.examples.SparkPi",
        dataproc_jars="file:///usr/lib/spark/examples/jars/spark-examples.jar",
        region=REGION,
        cluster_name=CLUSTER_NAME,
    )

    pyspark_task = DataProcPySparkOperator(
        task_id="pyspark_task",
        main=PYSPARK_URI,
        region=REGION,
        cluster_name=CLUSTER_NAME,
    )

    hive_task = DataProcHiveOperator(
        task_id="hive_task",
        query="SHOW DATABASES;",
        region=REGION,
        cluster_name=CLUSTER_NAME,
    )

    hadoop_task = DataProcHadoopOperator(
        task_id="hadoop_task",
        main_jar="file:///usr/lib/hadoop-mapreduce/hadoop-mapreduce-examples.jar",
        arguments=["wordcount", "gs://pub/shakespeare/rose.txt", OUTPUT_PATH],
        region=REGION,