def test_hook_correct_region(self):
    with patch('airflow.contrib.operators.dataproc_operator.DataProcHook') as mock_hook:
        dataproc_task = DataProcPySparkOperator(
            task_id=TASK_ID,
            main=MAIN_URI,
            region=REGION
        )

        dataproc_task.execute(None)
        mock_hook.return_value.submit.assert_called_once_with(
            mock.ANY, mock.ANY, REGION)
def test_hook_correct_region():
    with patch(HOOK) as mock_hook:
        dataproc_task = DataProcPySparkOperator(
            task_id=TASK_ID,
            main=MAIN_URI,
            region=GCP_REGION
        )

        dataproc_task.execute(None)
        mock_hook.return_value.submit.assert_called_once_with(
            mock.ANY, mock.ANY, GCP_REGION, mock.ANY)
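# Both test variants above assume module-level fixtures for the patched hook path
# and the operator arguments. A minimal sketch of those constants, with
# illustrative values only (the real test module defines its own), might be:
from unittest import mock
from unittest.mock import patch

HOOK = 'airflow.contrib.operators.dataproc_operator.DataProcHook'
TASK_ID = 'test-dataproc-pyspark-task'
MAIN_URI = 'gs://example-bucket/jobs/example_job.py'
GCP_REGION = 'us-central1'
REGION = GCP_REGION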
)

write_prices_to_bq = DataFlowPythonOperator(
    task_id="write_prices_to_bq",
    dataflow_default_options={
        "project": PROJECT_ID,
        "region": "europe-west1"
    },
    py_file="gs://" + BUCKET + "/scripts/dataflow_job.py",
    dag=dag,
)

dataproc_compute_aggregates = DataProcPySparkOperator(
    task_id="dataproc_compute_aggregates",
    main="gs://airflow-training-data-tim/scripts/build_statistics.py",
    cluster_name="analyse-pricing-{{ ds }}",
    arguments=["{{ ds }}"],
    dag=dag,
)

dataproc_create_cluster = DataprocClusterCreateOperator(
    task_id="dataproc_create_cluster",
    cluster_name="analyse-pricing-{{ ds }}",
    project_id=PROJECT_ID,
    num_workers=2,
    zone="europe-west4-a",
    dag=dag,
    pool="dataproc",
)

query = """
def test_dataproc_job_id_is_set():
    with patch(HOOK) as mock_hook:
        dataproc_task = DataProcPySparkOperator(
            task_id=TASK_ID,
            main=MAIN_URI
        )

        _assert_dataproc_job_id(mock_hook, dataproc_task)
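# _assert_dataproc_job_id is a shared helper in the original test module that is
# not shown here. A hedged sketch of what such a check might look like (the body
# below is an assumption for illustration, not the actual Airflow test code):
def _assert_dataproc_job_id(mock_hook, dataproc_task):
    # Execute the operator against the patched hook and verify that a
    # Dataproc job id was recorded on the task afterwards.
    dataproc_task.execute(None)
    assert dataproc_task.dataproc_job_id is not None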
} with DAG("flights_delay_etl", default_args=DEFAULT_DAG_ARGS) as dag: create_cluster = DataprocClusterCreateOperator( task_id="create_dataproc_cluster", cluster_name="ephemeral-spark-cluster-{{ds_nodash}}", master_machine_type="n1-standard-1", worker_machine_type="n1-standard-2", num_workers=2, region="us-central1", zone="us-central1-a" ) submit_pyspark = DataProcPySparkOperator( task_id="run_pyspark_etl", main=PYSPARK_JOB, cluster_name="ephemeral-spark-cluster-{{ds_nodash}}", region="us-central1" ) bq_load_delay_by_flight_nums=GoogleCloudStorageToBigQueryOperator( task_id="bq_load_avg_delays_by_flight_nums", bucket="spark-etl-1", source_objects=["flights_data_output/"+file_name+"_flight_nums/*.json"], destination_project_dataset_table="bigdata-etl-20201027.data_analysis.avg_delays_by_flight_nums", autodetect=True, source_format="NEWLINE_DELIMITED_JSON", create_disposition="CREATE_IF_NEEDED", skip_leading_rows=0, write_disposition="WRITE_APPEND", max_bad_records=0 )
def export_to_parquet(
    table,
    arguments=[],
    dag_name="export_to_parquet",
    parent_dag_name=None,
    default_args=None,
    aws_conn_id="aws_dev_iam_s3",
    gcp_conn_id="google_cloud_derived_datasets",
    dataproc_zone="us-central1-a",
    dataproc_storage_bucket="moz-fx-data-derived-datasets-parquet",
    num_preemptible_workers=0,
):
    """
    Export a BigQuery table to Parquet.

    https://github.com/mozilla/bigquery-etl/blob/master/script/pyspark/export_to_parquet.py

    :param str table: [Required] BigQuery table name
    :param List[str] arguments: Additional pyspark arguments
    :param str dag_name: Name of DAG
    :param Optional[str] parent_dag_name: Parent DAG name
    :param Optional[Dict[str, Any]] default_args: DAG configuration
    :param str gcp_conn_id: Airflow connection id for GCP access
    :param str dataproc_storage_bucket: Dataproc staging GCS bucket
    :param str dataproc_zone: GCP zone to launch dataproc clusters
    :param int num_preemptible_workers: Number of Dataproc preemptible workers

    :return: airflow.models.DAG
    """
    # limit cluster name to 42 characters then suffix with -YYYYMMDD
    cluster_name = table.replace("_", "-")
    if len(cluster_name) > 42:
        if cluster_name.rsplit("-v", 1)[-1].isdigit():
            prefix, version = cluster_name.rsplit("-v", 1)
            cluster_name = prefix[:40 - len(version)] + "-v" + version
        else:
            cluster_name = cluster_name[:42]
    cluster_name += "-{{ ds_nodash }}"

    dag_prefix = parent_dag_name + "." if parent_dag_name else ""
    connection = GoogleCloudBaseHook(gcp_conn_id=gcp_conn_id)

    properties = {
        "core:fs.s3a." + key: value
        for key, value in zip(
            ("access.key", "secret.key", "session.token"),
            AwsHook(aws_conn_id).get_credentials(),
        )
        if value is not None
    }

    with models.DAG(dag_id=dag_prefix + dag_name, default_args=default_args) as dag:

        create_dataproc_cluster = DataprocClusterCreateOperator(
            task_id="create_dataproc_cluster",
            cluster_name=cluster_name,
            gcp_conn_id=gcp_conn_id,
            project_id=connection.project_id,
            properties=properties,
            num_workers=2,
            image_version="1.3",
            storage_bucket=dataproc_storage_bucket,
            zone=dataproc_zone,
            master_machine_type="n1-standard-8",
            worker_machine_type="n1-standard-8",
            num_preemptible_workers=num_preemptible_workers,
        )

        run_dataproc_pyspark = DataProcPySparkOperator(
            task_id="run_dataproc_pyspark",
            cluster_name=cluster_name,
            dataproc_pyspark_jars=[
                "gs://mozilla-bigquery-etl/jars/spark-bigquery-0.5.1-beta-SNAPSHOT.jar"
            ],
            main="https://raw.githubusercontent.com/mozilla/bigquery-etl/master"
            "/script/pyspark/export_to_parquet.py",
            arguments=[table] + arguments,
            gcp_conn_id=gcp_conn_id,
        )

        delete_dataproc_cluster = DataprocClusterDeleteOperator(
            task_id="delete_dataproc_cluster",
            cluster_name=cluster_name,
            gcp_conn_id=gcp_conn_id,
            project_id=connection.project_id,
            trigger_rule=trigger_rule.TriggerRule.ALL_DONE,
        )

        create_dataproc_cluster >> run_dataproc_pyspark >> delete_dataproc_cluster

        return dag
properties = {"spark.jars.packages": "io.delta:delta-core_2.11:0.5.0"} t1 = DataprocClusterCreateOperator(task_id="create_cluster", gcp_conn_id='google_cloud_default', project_id=project_id, region=region, zone=zone, cluster_name=cluster_name, storage_bucket=storage_bucket, num_workers=0, master_machine_type='n1-standard-2', image_version=image_version, dag=dag) t2 = DataProcPySparkOperator(task_id="run", gcp_conn_id='google_cloud_default', project_id=project_id, main=job_file, job_name='test', dataproc_pyspark_properties=properties, cluster_name=cluster_name, region=region, dag=dag) t3 = DataprocClusterDeleteOperator(task_id='delete_cluster', project_id=project_id, cluster_name=cluster_name, region=region, dag=dag) t1 >> t2 >> t3
def moz_dataproc_pyspark_runner(
    parent_dag_name=None,
    dag_name='run_pyspark_on_dataproc',
    default_args=None,
    cluster_name=None,
    num_workers=2,
    image_version='1.4',
    region='us-west1',
    subnetwork_uri=None,
    internal_ip_only=None,
    idle_delete_ttl='10800',
    auto_delete_ttl='21600',
    master_machine_type='n1-standard-8',
    worker_machine_type='n1-standard-4',
    num_preemptible_workers=0,
    service_account='*****@*****.**',
    init_actions_uris=None,
    additional_metadata=None,
    additional_properties=None,
    optional_components=['ANACONDA'],
    install_component_gateway=True,
    python_driver_code=None,
    py_args=None,
    job_name=None,
    aws_conn_id=None,
    gcp_conn_id='google_cloud_airflow_dataproc',
    artifact_bucket='moz-fx-data-prod-airflow-dataproc-artifacts',
    storage_bucket='moz-fx-data-prod-dataproc-scratch',
    master_disk_type='pd-standard',
    worker_disk_type='pd-standard',
    master_disk_size=1024,
    worker_disk_size=1024,
    master_num_local_ssds=0,
    worker_num_local_ssds=0,
):
    """
    This will initially create a GCP Dataproc cluster with
    Anaconda/Jupyter/Component gateway. Then we call DataProcPySparkOperator to
    execute the pyspark script defined by the argument python_driver_code. Once
    that succeeds, we tear down the cluster.

    **Example**: ::

        # Unsalted cluster name so subsequent runs fail if the cluster name exists
        cluster_name = 'test-dataproc-cluster-hwoo'

        # Defined in Airflow's UI -> Admin -> Connections
        gcp_conn_id = 'google_cloud_airflow_dataproc'

        run_dataproc_pyspark = SubDagOperator(
            task_id='run_dataproc_pyspark',
            dag=dag,
            subdag=moz_dataproc_pyspark_runner(
                parent_dag_name=dag.dag_id,
                dag_name='run_dataproc_pyspark',
                job_name='Do_something_on_pyspark',
                default_args=default_args,
                cluster_name=cluster_name,
                python_driver_code='gs://some_bucket/some_py_script.py',
                py_args=["-d", "{{ ds_nodash }}"],
                gcp_conn_id=gcp_conn_id)
        )

    Airflow related args:
    ---
    :param str parent_dag_name: Parent dag name.
    :param str dag_name: Dag name.
    :param dict default_args: Dag configuration.

    Dataproc Cluster related args:
    ---
    :param str cluster_name: The name of the dataproc cluster.
    :param int num_workers: The number of spark workers.
    :param str image_version: The image version of software to use for dataproc cluster.
    :param str region: Region where the dataproc cluster will be located. Zone will
        be chosen automatically.
    :param str subnetwork_uri: The subnetwork uri to be used for machine communication,
        cannot be specified with network_uri. Only needed if setting
        internal_ip_only = True (see next parameter).
    :param bool internal_ip_only: If True, cluster nodes will only have internal IP
        addresses. Can only be enabled with subnetwork_uri enabled networks. We use
        this for NAT'd dataproc clusters whose outbound traffic needs to be
        whitelisted. To use a NAT'd cluster, set subnetwork_uri='default',
        internal_ip_only=True, and region=us-west2-a|b|c.
    :param str idle_delete_ttl: The duration in seconds to keep an idle cluster alive.
    :param str auto_delete_ttl: The duration in seconds that the cluster will live.
    :param str master_machine_type: Compute engine machine type to use for master.
    :param str worker_machine_type: Compute engine machine type to use for the workers.
    :param int num_preemptible_workers: Number of preemptible worker nodes to spin up.
    :param str service_account: The service account for spark VMs to use. For example,
        if cross project access is needed. Note that this svc account needs the
        following permissions: roles/logging.logWriter and roles/storage.objectAdmin.
    :param list init_actions_uris: List of GCS uris containing dataproc init scripts.
    :param dict additional_metadata: Custom metadata keys and values, might be used to
        configure initialization actions.
    :param dict additional_properties: Custom cluster properties, can be used to
        configure cluster components, add Spark packages, etc.
    :param str job_name: Name of the spark job to run.
    :param str aws_conn_id: Airflow connection id for S3 access (if needed).
    :param str gcp_conn_id: The connection ID to use connecting to GCP.
    :param str artifact_bucket: Path to resources for bootstrapping the dataproc cluster.
    :param str storage_bucket: Path to scratch bucket for intermediate cluster results.
    :param list optional_components: List of optional components to install on cluster.
        Defaults to ['ANACONDA'] for now since JUPYTER is broken.
    :param bool install_component_gateway: Enable alpha feature component gateway.
    :param master_disk_type: Type of the boot disk for the master node
        (default is ``pd-standard``). Valid values: ``pd-ssd`` (Persistent Disk Solid
        State Drive) or ``pd-standard`` (Persistent Disk Hard Disk Drive).
    :type master_disk_type: str
    :param master_disk_size: Disk size for the master node.
    :type master_disk_size: int
    :param master_num_local_ssds: Number of local SSDs to mount (default is 0).
    :type master_num_local_ssds: int
    :param worker_disk_type: Type of the boot disk for the worker node
        (default is ``pd-standard``). Valid values: ``pd-ssd`` (Persistent Disk Solid
        State Drive) or ``pd-standard`` (Persistent Disk Hard Disk Drive).
    :type worker_disk_type: str
    :param worker_disk_size: Disk size for the worker node.
    :type worker_disk_size: int
    :param worker_num_local_ssds: Number of local SSDs to mount (default is 0).
    :type worker_num_local_ssds: int

    Pyspark related args:
    ---
    :param str python_driver_code: The Hadoop Compatible Filesystem (HCFS) URI of the
        main Python file to use as the driver. Must be a .py file.
    :param list py_args: Arguments for the pyspark job.
    """
    if cluster_name is None or python_driver_code is None:
        raise AirflowException(
            'Please specify both cluster_name and python_driver_code.')

    dataproc_helper = DataProcHelper(
        cluster_name=cluster_name,
        job_name=job_name,
        num_workers=num_workers,
        image_version=image_version,
        region=region,
        subnetwork_uri=subnetwork_uri,
        internal_ip_only=internal_ip_only,
        idle_delete_ttl=idle_delete_ttl,
        auto_delete_ttl=auto_delete_ttl,
        master_machine_type=master_machine_type,
        worker_machine_type=worker_machine_type,
        num_preemptible_workers=num_preemptible_workers,
        service_account=service_account,
        init_actions_uris=init_actions_uris,
        optional_components=optional_components,
        additional_metadata=additional_metadata,
        additional_properties=additional_properties,
        install_component_gateway=install_component_gateway,
        aws_conn_id=aws_conn_id,
        gcp_conn_id=gcp_conn_id,
        artifact_bucket=artifact_bucket,
        storage_bucket=storage_bucket,
        master_disk_type=master_disk_type,
        master_disk_size=master_disk_size,
        worker_disk_type=worker_disk_type,
        worker_disk_size=worker_disk_size,
        master_num_local_ssds=master_num_local_ssds,
        worker_num_local_ssds=worker_num_local_ssds,
    )

    _dag_name = '{}.{}'.format(parent_dag_name, dag_name)

    with models.DAG(_dag_name, default_args=default_args) as dag:
        create_dataproc_cluster = dataproc_helper.create_cluster()

        run_pyspark_on_dataproc = DataProcPySparkOperator(
            task_id='run_dataproc_pyspark',
            job_name=job_name,
            cluster_name=cluster_name,
            region=region,
            main=python_driver_code,
            arguments=py_args,
            gcp_conn_id=gcp_conn_id,
        )

        delete_dataproc_cluster = dataproc_helper.delete_cluster()

        create_dataproc_cluster >> run_pyspark_on_dataproc >> delete_dataproc_cluster
        return dag
def spark_subdag(
    parent_dag_name,
    child_dag_name,
    default_args,
    gcp_conn_id,
    service_account,
    main,
    pyfiles,
    arguments,
    bootstrap_bucket,
    dataproc_region="us-west1",
    num_preemptible_workers=10,
):
    """Run the PySpark job for unnesting and range-partitioning Prio pings from
    the ingestion service.

    :param str parent_dag_name: Name of the parent DAG.
    :param str child_dag_name: Name of the child DAG.
    :param Dict[str, Any] default_args: Default arguments for the child DAG.
    :param str gcp_conn_id: Name of the connection string.
    :param str service_account: The address of the service account.
    :param str dataproc_region: The region of the DataProc cluster.
    :param str main: GCS uri of the main PySpark driver script.
    :param List[str] pyfiles: Supporting Python files passed to the job.
    :param List[str] arguments: Arguments passed to the PySpark job.
    :param str bootstrap_bucket: GCS bucket containing the cluster init scripts.
    :param int num_preemptible_workers: The number of preemptible workers.
    :return: DAG
    """
    connection = GoogleCloudBaseHook(gcp_conn_id=gcp_conn_id)

    shared_config = {
        "cluster_name": "prio-staging-{{ds_nodash}}",
        "gcp_conn_id": gcp_conn_id,
        "project_id": connection.project_id,
        # From an error when not specifying the region:
        # - Dataproc images 2.0 and higher do not support the to-be
        #   deprecated global region. Please use any non-global Dataproc
        #   region instead
        # - Must specify a zone in GCE configuration when using
        #   'regions/global'. To use auto zone placement, specify
        #   regions/<non-global-region> in request path, e.g.
        #   regions/us-central1
        "region": dataproc_region,
    }

    with DAG(f"{parent_dag_name}.{child_dag_name}", default_args=default_args) as dag:
        create_dataproc_cluster = DataprocClusterCreateOperator(
            task_id="create_dataproc_cluster",
            image_version="preview-ubuntu18",
            service_account=service_account,
            master_machine_type="n1-standard-4",
            worker_machine_type="n1-standard-4",
            num_workers=2,
            num_preemptible_workers=num_preemptible_workers,
            init_actions_uris=[
                f"{bootstrap_bucket}/install-python-requirements.sh"
            ],
            idle_delete_ttl=600,
            dag=dag,
            **shared_config,
        )

        run_dataproc_spark = DataProcPySparkOperator(
            task_id="run_dataproc_spark",
            main=main,
            dataproc_pyspark_jars=[
                "gs://spark-lib/bigquery/spark-bigquery-latest_2.12.jar"
            ],
            pyfiles=pyfiles,
            arguments=arguments,
            dag=dag,
            **shared_config,
        )

        delete_dataproc_cluster = DataprocClusterDeleteOperator(
            task_id="delete_dataproc_cluster",
            trigger_rule="all_done",
            dag=dag,
            **shared_config,
        )

        create_dataproc_cluster >> run_dataproc_spark >> delete_dataproc_cluster
        return dag
        region=region,
        start_date=start_date),
    task_id=catalog_task_id,
    dag=dag)

consume_show_comments_job_path = "gs://" + gcs_netflix_bucket + "/spark-jobs/consume_reddit_comments.py"
reddit_destination_path = "gs://" + gcs_netflix_bucket + "/comments/raw/comments.parquet"
gcp_netflix_catalog_path = "gs://" + gcs_netflix_bucket + "/" + catalog_path

consume_show_comment_to_datalake = DataProcPySparkOperator(
    task_id='consume_show_comment_to_datalake',
    main=consume_show_comments_job_path,
    cluster_name=cluster_name,
    job_name='consume_show_comments',
    region=region,
    arguments=[
        Variable.get("reddit_client_id"),
        Variable.get("reddit_client_secret"),
        gcp_netflix_catalog_path,
        ["netflix NetflixBestOf bestofnetflix"],
        reddit_destination_path
    ],
    gcp_conn_id=gcp_conn,
    dag=dag)

generate_show_comments_job_path = "gs://" + gcs_netflix_bucket + "/spark-jobs/generate_show_comments.py"

generate_show_comment_to_datalake = DataProcPySparkOperator(
    task_id='generate_show_comment_to_datalake',
    main=generate_show_comments_job_path,
    cluster_name=cluster_name,
    job_name='generate_show_comments',
    region=region,
    arguments=[gcp_netflix_catalog_path, reddit_destination_path],
dataproc_create_cluster = DataprocClusterCreateOperator(
    task_id="create_dataproc",
    cluster_name="analyse-pricing-{{ ds }}",
    project_id="airflowbolcom-4b5ba3f7fec9aea9",
    num_workers=2,
    zone="europe-west4-a",
    dag=dag,
)

compute_aggregates = DataProcPySparkOperator(
    task_id='compute_aggregates',
    main='gs://dpranantha/statistics/build_statistics.py',
    cluster_name='analyse-pricing-{{ ds }}',
    arguments=[
        "gs://dpranantha/{{ ds }}/land_price_uk_*.json",
        "gs://dpranantha/{{ ds }}/currency_*.json",
        "gs://dpranantha/{{ ds }}/average/"
    ],
    dag=dag,
)

dataproc_delete_cluster = DataprocClusterDeleteOperator(
    task_id="delete_dataproc",
    cluster_name="analyse-pricing-{{ ds }}",
    project_id="airflowbolcom-4b5ba3f7fec9aea9",
    trigger_rule=TriggerRule.ALL_DONE,
    dag=dag,
)

gcsBq = GoogleCloudStorageToBigQueryOperator(
crear_cluster = DataprocClusterCreateOperator(
    task_id="create_cluster",
    cluster_name="ephemeral-spark-cluster-{{ds_nodash}}",
    master_machine_type="n1-standard-1",
    master_disk_size=50,
    worker_machine_type="n1-standard-1",
    worker_disk_size=50,
    num_workers=2,
    region="us-west1",
    zone="us-west1-a",
    image_version='1.4')

pyspark = DataProcPySparkOperator(
    task_id="run_pyspark",
    main=route,
    cluster_name="ephemeral-spark-cluster-{{ds_nodash}}",
    region="us-west1")

borrar_cluster = DataprocClusterDeleteOperator(
    task_id="borrar_cluster",
    cluster_name="ephemeral-spark-cluster-{{ds_nodash}}",
    region="us-west1",
    trigger_rule=TriggerRule.ALL_DONE)

dummy_final = DummyOperator(task_id="prueba_final")

delay = PythonOperator(
    task_id="delay1",
    python_callable=my_function,
    op_args=[200])
    # in YYYYMMDD format. See docs https://airflow.apache.org/code.html?highlight=macros#macros
    cluster_name='ephemeral-spark-cluster-{{ ds_nodash }}',
    image_version='1.5-debian10',
    num_workers=0,
    master_machine_type='n1-standard-2',
    num_masters=1,
    storage_bucket='egen-training-286300-dataproc-staging',
    zone='us-central1-b')

# Submit the PySpark job.
submit_pyspark = DataProcPySparkOperator(
    task_id='run_dataproc_pyspark',
    main='gs://egen-training-286300' + '/spark-jobs/sparkify_etl_2.py',
    # Obviously needs to match the name of cluster created in the prior Operator.
    cluster_name='ephemeral-spark-cluster-{{ ds_nodash }}',
    # Let's template our arguments for the pyspark job from the POST payload.
    arguments=[
        "--bucket={{ dag_run.conf['bucket'] }}",
        "--raw_file_name={{dag_run.conf['raw_file_name']}}"
    ])

# Load the transformed files to a BigQuery table.
bq_load = GoogleCloudStorageToBigQueryOperator(
    task_id='GCS_to_BigQuery',
    bucket="{{dag_run.conf['bucket']}}",
    # Wildcard for objects created by spark job to be written to BigQuery
    # Reads the relative path to the objects transformed by the spark job from the POST message.
    source_objects=["transformed/partitioned/users_table.parquet/part-*"],
    destination_project_dataset_table='egen.users_table_no_partition',
    autodetect=True,
    master_disk_size=50,
    worker_machine_type="n1-standard-1",
    worker_disk_size=50,
    num_workers=2,
    region="us-east1",
    zone="us-east1-b",
    init_actions_uris=[
        'gs://dataproc-initialization-actions/python/pip-install.sh'
    ],
    optional_components=["ANACONDA"],
    metadata={'PIP_PACKAGES': 'google-cloud-storage'},
)

submit_pyspark = DataProcPySparkOperator(
    task_id="run_pyspark_etl",
    main=PYSPARK_JOB,
    cluster_name="ephemeral-spark-cluster-{{ds_nodash}}",
    region="us-east1")

bq_load_profeco_data = GoogleCloudStorageToBigQueryOperator(
    task_id="bq_load_csv_profeco",
    bucket='gnp-storage',
    source_objects=["Profeco/resources/Sin-fecha/profeco.pdf"],
    destination_project_dataset_table=PROJECT_ID + ".GNP.Profeco_table",
    autodetect=True,
    source_format="CSV",
    field_delimiter=',',
    create_disposition="CREATE_IF_NEEDED",
    skip_leading_rows=0,
    write_disposition="WRITE_APPEND",
    max_bad_records=0)
create_cluster = DataprocClusterCreateOperator(
    task_id="create_cluster",
    project_id=HYDRO_DATA_PROJECT_ID,
    cluster_name="spark-cluster-{{ ds_nodash }}",  # spark-cluster-YYYYMMDD
    num_workers=2,
    storage_bucket=SPARK_BUCKET,
    region="us-west1",
    zone="us-west1-a",
    idle_delete_ttl=300,  # 5 mins is the min. value
    gcp_conn_id=GOOGLE_CLOUD_STORAGE_CONN_ID,
)

calculate_daily_average_kwh = DataProcPySparkOperator(
    task_id="calculate_daily_average_kwh",
    main=f"gs://{SPARK_BUCKET}/pyspark/daily_average_kwh.py",
    cluster_name="spark-cluster-{{ ds_nodash }}",
    dataproc_pyspark_jars="gs://spark-lib/bigquery/spark-bigquery-latest.jar",
    gcp_conn_id=GOOGLE_CLOUD_STORAGE_CONN_ID,
    region="us-west1",
)

calculate_daily_sum_kwh = DataProcPySparkOperator(
    task_id="calculate_daily_sum_kwh",
    main=f"gs://{SPARK_BUCKET}/pyspark/daily_sum_kwh.py",
    cluster_name="spark-cluster-{{ ds_nodash }}",
    dataproc_pyspark_jars="gs://spark-lib/bigquery/spark-bigquery-latest.jar",
    gcp_conn_id=GOOGLE_CLOUD_STORAGE_CONN_ID,
    region="us-west1"
)

delete_cluster = DataprocClusterDeleteOperator(
    task_id="delete_cluster",
    ],
    metadata={
        'PIP_PACKAGES': 'tensorflow==2.0.0 pyarrow==0.15.1 sentencepiece==0.1.85 gcsfs nltk tensorflow-hub tables bert-for-tf2 absl-py google-cloud-storage google-cloud-logging '
    },
    image_version='1.4.22-debian9',
    master_machine_type=MASTER_MACHINE_TYPE,
    worker_machine_type=WORKER_MACHINE_TYPE,
    properties={
        "dataproc:dataproc.logging.stackdriver.job.driver.enable": "true"
    },
    region=REGION,
    task_id='create_spark',
    dag=dag1)

run_spark = DataProcPySparkOperator(
    main='gs://topic-sentiment-1/code/data_wrangling.py',
    arguments=[RAW_DATA, TOKENIZED_DATA_DIR, THRESHOLD],
    task_id='run_spark',
    cluster_name=SPARK_CLUSTER,
    region=REGION,
    dag=dag1)

delete_spark = DataprocClusterDeleteOperator(
    cluster_name=SPARK_CLUSTER,
    project_id=PROJECT,
    region=REGION,
    task_id='delete_spark')

# Dag definition
begin >> create_spark >> run_spark >> delete_spark >> end
    dag=dag,
)

dataproc_remove_cluster = DataprocClusterDeleteOperator(
    task_id="dataproc_remove_cluster",
    cluster_name="analyse-pricing-{{ ds }}",
    project_id="airflowbolcom-may2829-b2a87b4d",
    dag=dag,
)

dataproc_run_pyspark = DataProcPySparkOperator(
    task_id="dataproc_run_pyspark",
    main="gs://een_emmer/build_statistics.py",
    cluster_name="analyse-pricing-{{ ds }}",
    arguments=[
        "gs://een_emmer/daily_load_{{ ds }}",
        "gs://een_emmer/exchangerate_{{ ds }}.txt",
        "gs://een_emmer/dataproc_output_{{ ds }}",
    ],
    dag=dag,
)

prices_uk_from_postgres_to_cloudstorage = PostgresToGoogleCloudStorageOperator(
    task_id="prices_uk_from_postgres_to_cloudstorage",
    sql="SELECT * FROM land_registry_price_paid_uk WHERE transfer_date = '{{ ds }}'",
    bucket="een_emmer",
    filename="daily_load_{{ ds }}",
    postgres_conn_id="stuff_postgres",
    dag=dag,
)
        'bucket': 'europe-west1-training-airfl-bb0beabce-bucket',
        'job_name': '{{ task_instance_key_str }}'
    },
    py_file="gs://airflow-daniel/dataflow_job.py",
    dag=dag)

from airflow.contrib.operators.dataproc_operator import (
    DataprocClusterCreateOperator,
    DataprocClusterDeleteOperator,
    DataProcPySparkOperator,
)

dataproc_create_cluster = DataprocClusterCreateOperator(
    task_id="create_dataproc",
    cluster_name="analyse-pricing-{{ ds }}",
    project_id="airflowbolcom-b01c3abbfb10e7ee",
    num_workers=2,
    zone="europe-west4-a",
    dag=dag,
)

compute_aggregates = DataProcPySparkOperator(
    task_id='compute_aggregates',
    main='gs://airflow-daniel/build_statistics.py',
    cluster_name='analyse-pricing-{{ ds }}',
    arguments=[
        "gs://airflow-daniel/land_registry_price_paid_uk/{{ ds }}/*.json",
        "gs://airflow-daniel/currency/{{ ds }}/*.json",
        "gs://airflow-daniel/average_prices/{{ ds }}/"
    ],
    dag=dag)

from airflow.utils.trigger_rule import TriggerRule

dataproc_delete_cluster = DataprocClusterDeleteOperator(
    task_id="delete_dataproc",
    cluster_name="analyse-pricing-{{ ds }}",
    project_id="airflowbolcom-b01c3abbfb10e7ee",
    trigger_rule=TriggerRule.ALL_DONE,
    dag=dag)

write_to_bq = GoogleCloudStorageToBigQueryOperator(
    task_id="write_to_bq",
    bucket="airflow-daniel",
    source_objects=["average_prices/{{ ds }}/*.parquet"],
    destination_project_dataset_table="airflow.airflow{{ ds_nodash }}",
    source_format="PARQUET",
    write_disposition="WRITE_TRUNCATE",
    dag=dag,
)

pgsql_to_gcs >> load_into_bigquery
dag.doc_md = __doc__

create_cluster_task = DataprocClusterCreateOperator(
    task_id='create_cluster',
    project_id='youtubelist-256522',
    cluster_name='spark-cluster1-{{ ds_nodash }}',
    num_workers=2,
    storage_bucket="opi_staging_bucket",
    metadata={'PIP_PACKAGES': 'pandas praw google-cloud-storage'},
    region="us-central1")

feature_engineering_task = DataProcPySparkOperator(
    task_id='feature_engineering',
    main='gs://opi_processed_data/pyspark/create_feature_store.py',
    cluster_name='spark-cluster1-{{ ds_nodash }}',
    dataproc_pyspark_jars="gs://spark-lib/bigquery/spark-bigquery-latest_2.12.jar",
    arguments=['--timestamp', '{{ ts_nodash }}', '--train_perc', "70"],
    region="us-central1")

delete_cluster_task = DataprocClusterDeleteOperator(
    task_id="delete_cluster",
    project_id="youtubelist-256522",
    cluster_name="spark-cluster1-{{ ds_nodash }}",
    region="us-central1")

train_task = PythonOperator(
    task_id="train_model",
    python_callable=train_model,
    op_args=['{{ ts_nodash }}', 0.9, 0.5],
)
    gcs_path='currency/{{ds}}.json')

dataproc_create_cluster = DataprocClusterCreateOperator(
    task_id="create_dataproc",
    cluster_name="analyse-pricing-{{ ds }}",
    project_id='airflowbolcom-bc4a05f9b43155a6',
    num_workers=2,
    zone="europe-west4-a",
    dag=dag,
)

compute_aggregates = DataProcPySparkOperator(
    task_id='compute_aggregates',
    main='gs://riccardos_bucket/build_statistics.py',
    cluster_name='analyse-pricing-{{ ds }}',
    arguments=[
        "gs://riccardos_bucket/house_data/{{ ds }}.json",
        "gs://riccardos_bucket/currency/{{ ds }}.json",
        "gs://riccardos_bucket/average_prices/{{ ds }}/"
    ],
    dag=dag,
)

dataproc_delete_cluster = DataprocClusterDeleteOperator(
    task_id="delete_dataproc",
    cluster_name="analyse-pricing-{{ ds }}",
    project_id='airflowbolcom-bc4a05f9b43155a6',
    trigger_rule=TriggerRule.ALL_DONE,
    dag=dag,
)

[pgsl_to_gcs, currency_gcs
""" query_all = """ create external table if not exists customer_all (cust_details string) STORED AS TEXTFILE LOCATION 'gs://mysqldataflow/json/customer/' """ dag = airflow.DAG('Import-MySQL-to-GS-and-DataProc', 'catchup=False', default_args=default_args, schedule_interval=datetime.timedelta(days=1)) t1 = DataProcPySparkOperator( task_id='import-mysql-data', main='gs://mysqlnosql/spark_jdbc_to_gs.py', cluster_name='mydataproc2', region='us-central1', dataproc_pyspark_jars=['gs://mysqlnosql/spark-avro.jar'], dag=dag) t2 = DataProcHiveOperator(query=query_part, cluster_name='mydataproc2', region='us-central1', task_id='create_table_in_hive_2_cols', dag=dag) t3 = DataProcHiveOperator(query=query_all, cluster_name='mydataproc2', region='us-central1', task_id='create_table_in_hive_all_cols', dag=dag)
project_id="airflowbolcom-jan2829-b51a8ad2", region='europe-west4', dag=dag) arguments = [ 'gs://europe-west1-training-airfl-a98394bc-bucket/data/properties/properties_*.json', #input_properties 'gs://europe-west1-training-airfl-a98394bc-bucket/data/use_case_ivan/exchange_rates.json', #input_currencies 'gs://europe-west1-training-airfl-a98394bc-bucket/use_case_output', #target_path 'EUR', #target_currency '2018-01-03', #target_date ] run_spark = DataProcPySparkOperator( task_id="run_spark", main="gs://europe-west1-training-airfl-a98394bc-bucket/build_statistics.py", cluster_name="my-dataproc-cluster", region='europe-west4', arguments=arguments, dag=dag) # delete_dataproc_cluster = DataprocClusterDeleteOperator(task_id="delete_dataproc_cluster", # cluster_name="my-dataproc-cluster", # project_id="airflowbolcom-jan2829-b51a8ad2", # region='europe-west4', # dag=dag) # fetch_exchange_rates # fetch_exchange_rates >> create_dataproc_cluster # fetch_exchange_rates >> create_dataproc_cluster >> run_spark >> delete_dataproc_cluster fetch_exchange_rates >> create_dataproc_cluster >> run_spark # write_response_to_gcs = LaunchToGcsOperator(task_id="write_response_to_gcs", # python_callable=_connect, # provide_context=True,
} with DAG("movies_etl", default_args=DEFAULT_DAG_ARGS) as dag: create_cluster = DataprocClusterCreateOperator( task_id="create_dataproc_cluster", cluster_name="ephemeral-spark-cluster-{{ds_nodash}}", master_machine_type="n1-standard-1", worker_machine_type="n1-standard-2", num_workers=2, region="asia-southeast2", zone="asia-southeast2-a") submit_pyspark = DataProcPySparkOperator( task_id="run_pyspark_etl", main=PYSPARK_JOB, cluster_name="ephemeral-spark-cluster-{{ds_nodash}}", region="asia-southeast2") bq_load_movies_1 = GoogleCloudStorageToBigQueryOperator( task_id="bq_load_movies_1", bucket='bigdata-etl-2_flights', source_objects=[ "movies_data_output/" + current_date + "_datamart_1/*.json" ], destination_project_dataset_table= f'{PROJECT_ID}:qoala_test.movies_datamart_1', autodetect=True, source_format="NEWLINE_DELIMITED_JSON", create_disposition="CREATE_IF_NEEDED", skip_leading_rows=0,
"yarn:yarn:scheduler.maximum-allocation-mb": "50000", "yarn:yarn.nodemanager.resource.memory-mb": "50000", "spark:spark.driver.maxResultsSize": "55g", "spark:spark.driver.memory": "19g", "spark:spark.executor.memory": "19g", "spark:spark.executor.extraJavaOptions": "-XX:+PrintGCDetails"}, master_machine_type='n1-standard-16', worker_machine_type='n1-standard-16', zone='us-central1-b', gcp_conn_id='google-cloud-default', service_account='*****@*****.**', service_account_scopes=['https://www.googleapis.com/auth/cloud-platform'], delegate_to='*****@*****.**', dag=dag) t4 = DataProcPySparkOperator( task_id='submit_job', main='gs://store_recs/development/model/spark_store_recs_v2.py', cluster_name='cluster-1', dag=dag ) t5 = DataprocClusterDeleteOperator( task_id='delete_cluster', cluster_name='cluster-1', project_id='my_project', dag=dag) t2.set_upstream(t1) t4.set_upstream([t3, t2]) t5.set_upstream(t4)
create_cluster = DataprocClusterCreateOperator(
    task_id='create_dataproc_cluster',
    # ds_nodash is an airflow macro for "[Execution] Date string no dashes"
    # in YYYYMMDD format. See docs https://airflow.apache.org/code.html?highlight=macros#macros
    cluster_name='ephemeral-spark-cluster-{{ ds_nodash }}',
    num_workers=2,
    num_preemptible_workers=2,
    zone=Variable.get('gce_zone'))

# Submit the PySpark job.
submit_pyspark = DataProcPySparkOperator(
    task_id='run_dataproc_pyspark',
    main=PYSPARK_JOB,
    # Obviously needs to match the name of cluster created in the prior Operator.
    cluster_name='ephemeral-spark-cluster-{{ ds_nodash }}',
    # Let's template our arguments for the pyspark job from the POST payload.
    arguments=[
        "--gcs_path_raw={{ dag_run.conf['raw_path'] }}",
        "--gcs_path_transformed=gs://" + BUCKET + "/{{ dag_run.conf['transformed_path'] }}"
    ])

# Load the transformed files to a BigQuery table.
bq_load = GoogleCloudStorageToBigQueryOperator(
    task_id='GCS_to_BigQuery',
    bucket=BUCKET,
    # Wildcard for objects created by spark job to be written to BigQuery
    # Reads the relative path to the objects transformed by the spark job from the POST message.
    source_objects=["{{ dag_run.conf['transformed_path'] }}/part-*"],
    destination_project_dataset_table=OUTPUT_TABLE,
    schema_fields=None,
    gcs_bucket='output_bucket_for_airflow',
    gcs_path='exchange-rates-{{ ds }}.json',
    method="GET",
    http_conn_id="http_default",
    gcs_conn_id="google_cloud_default")

create_cluster = DataprocClusterCreateOperator(
    task_id='create_cluster',
    project_id='afspfeb3-28e3a1b32a56613ef127e',
    cluster_name='analyse-pricing-{{ ds }}',
    num_workers=2,
    zone='europe-west4-a')

calculate_statistics = DataProcPySparkOperator(
    task_id='calculate_statistics',
    main='gs://output_bucket_for_airflow/build_statistics.py',
    arguments=[
        'gs://output_bucket_for_airflow/prices-{{ ds }}.json',
        'gs://output_bucket_for_airflow/exchange-rates-{{ ds }}.json',
        'gs://output_bucket_for_airflow/output.parquet',
        'EUR',
        '{{ yesterday_ds }}'
    ],
    cluster_name='analyse-pricing-{{ ds }}')

delete_cluster = DataprocClusterDeleteOperator(
    task_id='delete_cluster',
    cluster_name='analyse-pricing-{{ds}}',
    project_id='afspfeb3-28e3a1b32a56613ef127e',
    region='global')

# statistics_to_big_query = GoogleCloudStorageToBigQueryOperator(task_id='statistics_to_big_query',
#                                                                bucket='output_bucket_for_airflow',
#                                                                source_objects=[''],
#                                                                destination_project_dataset_table,
#                                                                schema_fields=None,
#                                                                schema_object=None,
def export_to_parquet(
    table,
    destination_table=None,
    static_partitions=[],
    arguments=[],
    use_storage_api=False,
    dag_name="export_to_parquet",
    parent_dag_name=None,
    default_args=None,
    gcp_conn_id="google_cloud_derived_datasets",
    dataproc_zone="us-central1-a",
    dataproc_storage_bucket="moz-fx-data-derived-datasets-parquet",
    num_workers=2,
    num_preemptible_workers=0,
    gcs_output_bucket="moz-fx-data-derived-datasets-parquet",
):
    """
    Export a BigQuery table to Parquet.

    https://github.com/mozilla/bigquery-etl/blob/master/script/pyspark/export_to_parquet.py

    :param str table: [Required] BigQuery table name
    :param Optional[str] destination_table: Output table name, defaults to table,
        will have r'_v[0-9]+$' replaced with r'/v[0-9]+'
    :param List[str] static_partitions: Static partition values appended to the export prefix
    :param List[str] arguments: Additional pyspark arguments
    :param bool use_storage_api: Whether to read from the BigQuery Storage API or an AVRO export
    :param str dag_name: Name of DAG
    :param Optional[str] parent_dag_name: Parent DAG name
    :param Optional[Dict[str, Any]] default_args: DAG configuration
    :param str gcp_conn_id: Airflow connection id for GCP access
    :param str dataproc_storage_bucket: Dataproc staging GCS bucket
    :param str dataproc_zone: GCP zone to launch dataproc clusters
    :param int num_workers: Number of Dataproc workers
    :param int num_preemptible_workers: Number of Dataproc preemptible workers
    :param str gcs_output_bucket: GCS bucket for AVRO and Parquet output

    :return: airflow.models.DAG
    """
    # remove the dataset prefix and partition suffix from table
    table_id = table.rsplit(".", 1)[-1]
    unqualified_table, _, partition_id = table_id.partition("$")
    # limit cluster name to 35 characters plus suffix of -export-YYYYMMDD (51 total)
    cluster_name = unqualified_table.replace("_", "-")
    if len(cluster_name) > 35:
        # preserve version when truncating cluster name to 35 characters
        prefix, version = re.match(r"(.*?)(-v[0-9]+)?$", cluster_name).groups("")
        cluster_name = prefix[:35 - len(version)] + version
    cluster_name += "-export-{{ ds_nodash }}"

    dag_prefix = parent_dag_name + "." if parent_dag_name else ""
    connection = GoogleCloudBaseHook(gcp_conn_id=gcp_conn_id)

    if destination_table is None:
        destination_table = unqualified_table
    # separate version using "/" instead of "_"
    export_prefix = re.sub(r"_(v[0-9]+)$", r"/\1", destination_table) + "/"
    if static_partitions:
        export_prefix += "/".join(static_partitions) + "/"
    avro_prefix = "avro/" + export_prefix
    if not static_partitions and partition_id:
        avro_prefix += "partition_id=" + partition_id + "/"
    avro_path = "gs://" + gcs_output_bucket + "/" + avro_prefix + "*.avro"

    with models.DAG(dag_id=dag_prefix + dag_name, default_args=default_args) as dag:

        create_dataproc_cluster = DataprocClusterCreateOperator(
            task_id="create_dataproc_cluster",
            cluster_name=cluster_name,
            gcp_conn_id=gcp_conn_id,
            project_id=connection.project_id,
            num_workers=num_workers,
            image_version="1.4",
            storage_bucket=dataproc_storage_bucket,
            zone=dataproc_zone,
            master_machine_type="n1-standard-8",
            worker_machine_type="n1-standard-8",
            num_preemptible_workers=num_preemptible_workers,
            init_actions_uris=[
                "gs://dataproc-initialization-actions/python/pip-install.sh",
            ],
            metadata={"PIP_PACKAGES": "google-cloud-bigquery==1.20.0"},
        )

        run_dataproc_pyspark = DataProcPySparkOperator(
            task_id="run_dataproc_pyspark",
            cluster_name=cluster_name,
            dataproc_pyspark_jars=[
                "gs://spark-lib/bigquery/spark-bigquery-latest.jar"
            ],
            dataproc_pyspark_properties={
                "spark.jars.packages": "org.apache.spark:spark-avro_2.11:2.4.4",
            },
            main="https://raw.githubusercontent.com/mozilla/bigquery-etl/master"
            "/script/pyspark/export_to_parquet.py",
            arguments=[table]
            + [
                "--" + key + "=" + value
                for key, value in {
                    "avro-path": (not use_storage_api) and avro_path,
                    "destination": "gs://" + gcs_output_bucket,
                    "destination-table": destination_table,
                }.items()
                if value
            ]
            + (["--static-partitions"] if static_partitions else [])
            + static_partitions
            + arguments,
            gcp_conn_id=gcp_conn_id,
        )

        delete_dataproc_cluster = DataprocClusterDeleteOperator(
            task_id="delete_dataproc_cluster",
            cluster_name=cluster_name,
            gcp_conn_id=gcp_conn_id,
            project_id=connection.project_id,
            trigger_rule=trigger_rule.TriggerRule.ALL_DONE,
        )

        if not use_storage_api:
            avro_export = BigQueryToCloudStorageOperator(
                task_id="avro_export",
                source_project_dataset_table=table,
                destination_cloud_storage_uris=avro_path,
                compression=None,
                export_format="AVRO",
                bigquery_conn_id=gcp_conn_id,
            )
            avro_delete = GoogleCloudStorageDeleteOperator(
                task_id="avro_delete",
                bucket_name=gcs_output_bucket,
                prefix=avro_prefix,
                google_cloud_storage_conn_id=gcp_conn_id,
                trigger_rule=trigger_rule.TriggerRule.ALL_DONE,
            )
            avro_export >> run_dataproc_pyspark >> avro_delete

        create_dataproc_cluster >> run_dataproc_pyspark >> delete_dataproc_cluster

        return dag
HttpToGcsOperator(
    task_id="get_currency_" + currency,
    method="GET",
    endpoint="airflow-training-transform-valutas?date={{ ds }}&from=GBP&to=" + currency,
    http_conn_id="http_airflow_training",
    gcs_conn_id="google_cloud_default",
    gcs_bucket="airflow-training-knab-geert",
    gcs_path="currency/{{ ds }}-" + currency + ".json",
    dag=dag
) >> dataproc_create_cluster

compute_aggregates = DataProcPySparkOperator(
    task_id='compute_aggregates',
    main='gs://airflow-training-knab-geert/build_statistics.py',
    cluster_name='analyse-pricing-{{ ds }}',
    arguments=["{{ ds }}"],
    dag=dag,
)

dataproc_create_cluster >> compute_aggregates

dataproc_delete_cluster = DataprocClusterDeleteOperator(
    task_id="delete_cluster",
    cluster_name="analyse-pricing-{{ ds }}",
    project_id="gdd-25d677142443a8e2ace1927d48",
    trigger_rule=TriggerRule.ALL_DONE,
    dag=dag,
)

compute_aggregates >> dataproc_delete_cluster
bucket="airflow_training_data", filename="data_{{ds_nodash}}/land_registry_price.json", dag=dag) dataproc_create_cluster = DataprocClusterCreateOperator( task_id="create_dataproc_cluster", cluster_name="dataproc-cluster-dag-training-{{ ds }}", project_id="airflowbolcom-b9aabd6971d488d9", num_workers=2, zone="europe-west1-d", dag=dag) compute_aggregates = DataProcPySparkOperator( task_id="compute_aggregates", main= 'gs://europe-west1-training-airfl-68071199-bucket/other/build_statistics_simple.py', cluster_name="dataproc-cluster-dag-training-{{ ds }}", arguments=["{{ ds_nodash }}"], dag=dag) dataproc_delete_cluster = DataprocClusterDeleteOperator( task_id="delete_dataproc_cluster", cluster_name="dataproc-cluster-dag-training-{{ ds }}", project_id="airflowbolcom-b9aabd6971d488d9", trigger_rule=TriggerRule.ALL_DONE, dag=dag) dest_table = "airflowbolcom-b9aabd6971d488d9:airflow_training_dataset.land_registry_${{ ds_nodash }}" bucket_to_bq = GoogleCloudStorageToBigQueryOperator( task_id="gcs_to_bq", bucket="airflow_training_data",
    DataprocClusterCreateOperator,
    DataprocClusterDeleteOperator,
    DataProcPySparkOperator)

dataproc_create_cluster = DataprocClusterCreateOperator(
    task_id="create_dataproc",
    cluster_name="analyse-pricing-{{ ds }}",
    project_id=PROJECT_ID,
    num_workers=2,
    zone="europe-west4-a",
    dag=dag)

compute_aggregates = DataProcPySparkOperator(
    task_id='compute_aggregates',
    main='gs://europe-west1-training-airfl-67643e8c-bucket/build_statistics.py',
    cluster_name='analyse-pricing-{{ ds }}',
    arguments=[
        "gs://gabriele-bucket/pg_export/{{ ds }}/*.json",
        "gs://gabriele-bucket/currency/{{ ds }}/*.json",
        "gs://gabriele-bucket/average_prices/{{ ds }}/"
    ],
    dag=dag)

from airflow.utils.trigger_rule import TriggerRule

dataproc_delete_cluster = DataprocClusterDeleteOperator(
    task_id="delete_dataproc",
    cluster_name="analyse-pricing-{{ ds }}",
    project_id=PROJECT_ID,
    trigger_rule=TriggerRule.ALL_DONE,
    dag=dag)

from airflow_training.operators.gcs_to_bq import GoogleCloudStorageToBigQueryOperator
    endpoint='airflow-training-transform-valutas?date={{ ds }}&to=EUR',
    bucket="bvb-data",
    filename="exchange_rate_{{ ds }}",
    dag=dag)

dataproc_create_cluster = DataprocClusterCreateOperator(
    task_id="dataproc_create",
    cluster_name="analyse-pricing-{{ ds }}",
    project_id='airflowbolcom-may2829-aaadbb22',
    num_workers=2,
    zone="europe-west4-a",
    dag=dag)

compute_aggregates = DataProcPySparkOperator(
    task_id="dataproc_run",
    main="gs://europe-west1-training-airfl-4ecc4ae4-bucket/build_statistics.py",
    cluster_name="analyse-pricing-{{ ds }}",
    arguments=[
        "gs://bvb-data/daily_load_{{ ds }}",
        "gs://bvb-data/exchange_rate_{{ ds }}",
        "gs://bvb-data/output_file_{{ ds }}"
    ],
    dag=dag)

dataproc_delete_cluster = DataprocClusterDeleteOperator(
    task_id="dataproc_delete",
    cluster_name="analyse-pricing-{{ ds }}",
    project_id='airflowbolcom-may2829-aaadbb22',
    dag=dag)

gcstobq = GoogleCloudStorageToBigQueryOperator(
    task_id="gcs_to_bq",
    bucket="bvb-data",
    source_objects=["output_file_{{ ds }}/part-*"],
    destination_project_dataset_table="airflowbolcom-may2829-aaadbb22:prices.land_registry_price${{ ds_nodash }}",
    source_format="PARQUET",
    write_disposition="WRITE_TRUNCATE",
query="SHOW DATABASES;", region=REGION, cluster_name=CLUSTER_NAME, ) spark_task = DataProcSparkOperator( task_id="spark_task", main_class="org.apache.spark.examples.SparkPi", dataproc_jars="file:///usr/lib/spark/examples/jars/spark-examples.jar", region=REGION, cluster_name=CLUSTER_NAME, ) pyspark_task = DataProcPySparkOperator( task_id="pyspark_task", main=PYSPARK_URI, region=REGION, cluster_name=CLUSTER_NAME, ) hive_task = DataProcHiveOperator( task_id="hive_task", query="SHOW DATABASES;", region=REGION, cluster_name=CLUSTER_NAME, ) hadoop_task = DataProcHadoopOperator( task_id="hadoop_task", main_jar="file:///usr/lib/hadoop-mapreduce/hadoop-mapreduce-examples.jar", arguments=["wordcount", "gs://pub/shakespeare/rose.txt", OUTPUT_PATH], region=REGION,