# create_dataproc_cluster = dataproc_operator.DataprocClusterCreateOperator(
#     task_id='create_dataproc_cluster',
#     # Give the cluster a unique name by appending the date scheduled.
#     # See https://airflow.apache.org/code.html#default-variables
#     cluster_name='composer-hadoop-tutorial-cluster-{{ ds_nodash }}',
#     num_workers=2,
#     zone=models.Variable.get('gce_zone'),
#     master_machine_type='n1-standard-1',
#     worker_machine_type='n1-standard-1')

# Submit the PySpark job.
submit_pyspark1 = dataproc_operator.DataProcPySparkOperator(
    task_id='submit_pyspark1',
    main=PYSPARK_JOB,
    # Must name an existing cluster (the create operator above is commented out).
    cluster_name='cicd-demo-cluster',
    region='us-central1',
    dataproc_jars='gs://spark-lib/bigquery/spark-bigquery-latest.jar',
    dataproc_pyspark_jars='gs://spark-lib/bigquery/spark-bigquery-latest.jar')

submit_pyspark2 = dataproc_operator.DataProcPySparkOperator(
    task_id='submit_pyspark2',
    main=PYSPARK_JOB,
    # Must name an existing cluster (the create operator above is commented out).
    cluster_name='cicd-demo-cluster',
    region='us-central1',
    dataproc_jars='gs://spark-lib/bigquery/spark-bigquery-latest.jar',
    dataproc_pyspark_jars='gs://spark-lib/bigquery/spark-bigquery-latest.jar')
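# The jobs above ship the spark-bigquery connector jar with the job. PYSPARK_JOB
# itself is not shown in the source; the following is only a minimal sketch of
# the kind of script such a job typically points at. The project, dataset,
# table, and bucket names are hypothetical placeholders, not taken from the DAG.
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName('bigquery-example').getOrCreate()

# Read a BigQuery table through the connector supplied via dataproc_pyspark_jars.
df = spark.read.format('bigquery') \
    .option('table', 'my-project.my_dataset.source_table') \
    .load()

# Write results back to BigQuery, staging through a (hypothetical) GCS bucket.
df.write.format('bigquery') \
    .option('table', 'my-project.my_dataset.target_table') \
    .option('temporaryGcsBucket', 'my-temp-bucket') \
    .mode('overwrite') \
    .save()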
    tags=['all-bastion-ssh', 'dataproc', 'cassandra'],
    storage_bucket='hd-personalization-dev-batch',
    properties={'dataproc:dataproc.allow.zero.workers': 'true'},
    dag=dag)

dataproc_pyspark_submit = dataproc_operator.DataProcPySparkOperator(
    task_id='pyspark_task',
    main='gs://hd-personalization-dev-artifacts/releases/com.homedepot.recommendations/collections-model-training/python-scripts/v0.0.0+16/__main__.py',
    pyfiles=[
        'gs://hd-personalization-dev-artifacts/releases/com.homedepot.recommendations/collections-model-training/python-scripts/v0.0.0+16/collections_model_training-0.0.1-py3.7.egg'
    ],
    arguments=[
        'LSTM_DATAGEN',
        '--project', 'hd-personalization-dev',
        '--category', 'AreaRugs',
        '--dupletsData', 'gs://hd-personalization-dev-data/vdc2136/training/duplets/2020-06-01/',
        '--featuresData', 'gs://hd-personalization-dev-data/vdc2136/training/data/AllFeatures.csv',
        '--finalOutputPath', 'gs://hd-personalization-dev-data/vdc2136/training/lstm/2020-06-02/',
        '--appName', 'LSTM_DATA_GEN',
        '--mode=cluster'
    ],
    job_name='airflow_pyspark_job',
    cluster_name='airflow-dataproc-cluster',
    project_id='hd-personalization-dev',
    dag=dag)

delete_dataproc_cluster = dataproc_operator.DataprocClusterDeleteOperator(
    task_id='delete_dataproc_cluster',
    cluster_name='airflow-dataproc-cluster',
    # Setting trigger_rule to ALL_DONE causes the cluster to be deleted
    # even if the Dataproc job fails.
    trigger_rule=trigger_rule.TriggerRule.ALL_DONE,
    dag=dag)
# BashOperator
# Print the current date.
print_date = BashOperator(task_id='print_date', bash_command='date')

# dataproc_operator
# Create a small Dataproc cluster.
create_dataproc = dataproc_operator.DataprocClusterCreateOperator(
    task_id='create_dataproc',
    cluster_name='dataproc-cluster-demo-{{ ds_nodash }}',
    num_workers=2,
    zone=models.Variable.get('dataproc_zone'),
    master_machine_type='n1-standard-1',
    worker_machine_type='n1-standard-1')

# Run the PySpark job.
run_spark = dataproc_operator.DataProcPySparkOperator(
    task_id='run_spark',
    main=SPARK_CODE,
    cluster_name='dataproc-cluster-demo-{{ ds_nodash }}',
    job_name=dataproc_job_name)

# dataproc_operator
# Delete the Cloud Dataproc cluster.
delete_dataproc = dataproc_operator.DataprocClusterDeleteOperator(
    task_id='delete_dataproc',
    cluster_name='dataproc-cluster-demo-{{ ds_nodash }}',
    trigger_rule=trigger_rule.TriggerRule.ALL_DONE)

# STEP 6: Set DAG dependencies.
# Each task runs only after the previous task has finished.
print_date >> create_dataproc >> run_spark >> delete_dataproc
import datetime
import os

from airflow import models
from airflow.contrib.operators import dataproc_operator
from airflow.utils import trigger_rule

output_file = os.path.join(
    models.Variable.get('gcs_bucket'), 'dataproc_simple',
    datetime.datetime.now().strftime('%Y%m%d-%H%M%S')) + os.sep

yesterday = datetime.datetime.combine(
    datetime.datetime.today() - datetime.timedelta(1),
    datetime.datetime.min.time())

args = {
    'start_date': yesterday,
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 1,
    'retry_delay': datetime.timedelta(minutes=5),
    'project_id': models.Variable.get('gcp_project')
}

with models.DAG('spark_simple',
                schedule_interval=datetime.timedelta(days=1),
                default_args=args) as dag:

    run_step = dataproc_operator.DataProcPySparkOperator(
        task_id='run_spark',
        cluster_name='cluster-9c11',
        region='europe-west1',
        main='gs://bigdataupv_code/compras_top_ten_countries.py',
        files=['gs://bigdataupv_code/helpers.py'])
# [END composer_quickstart_schedule]

# Create a Cloud Dataproc cluster.
create_dataproc_cluster = dataproc_operator.DataprocClusterCreateOperator(
    task_id='create_dataproc_cluster',
    # Unlike the quickstart, a fixed cluster name is used here instead of
    # appending the scheduled date ({{ ds_nodash }}).
    # See https://airflow.apache.org/code.html#default-variables
    cluster_name='parquetconverter2',
    num_workers=3,
    zone='europe-west1-b',
    master_machine_type='n1-standard-1',
    worker_machine_type='n1-standard-1')

# Run the PySpark CSV-to-Parquet conversion job.
run_dataproc_csv2parquet = dataproc_operator.DataProcPySparkOperator(
    task_id='run_dataproc_parquetconvert',
    cluster_name='parquetconverter2',
    main='gs://alex-code/convert.py')

# Delete the Cloud Dataproc cluster.
delete_dataproc_cluster = dataproc_operator.DataprocClusterDeleteOperator(
    task_id='delete_dataproc_cluster',
    cluster_name='parquetconverter2',
    # Setting trigger_rule to ALL_DONE causes the cluster to be deleted
    # even if the Dataproc job fails.
    trigger_rule=trigger_rule.TriggerRule.ALL_DONE)

# [START composer_quickstart_steps]
# Define DAG dependencies.
create_dataproc_cluster >> run_dataproc_csv2parquet >> delete_dataproc_cluster
# [END composer_quickstart_steps]
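# convert.py itself is not included in the source. The following is only a
# minimal sketch of the kind of CSV-to-Parquet conversion such a job performs;
# the input and output GCS paths below are hypothetical placeholders.
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName('csv2parquet').getOrCreate()

# Read CSV files with a header row, inferring the schema.
df = spark.read.option('header', 'true').option('inferSchema', 'true') \
    .csv('gs://example-bucket/input/*.csv')

# Write the same data back out as Parquet.
df.write.mode('overwrite').parquet('gs://example-bucket/output/parquet/')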
    },
    optional_components=['ANACONDA', 'JUPYTER', 'ZEPPELIN'],
    enable_optional_components=True,
    enable_http_port_access=True,
    zone="europe-west1-b",
    storage_bucket="vf-polimi-batch-data",
    idle_delete_ttl=3601,
    internal_ip_only=False,
    init_actions_uris=[
        'gs://goog-dataproc-initialization-actions-europe-west1/python/pip-install.sh'
    ])

run_batch_kpi_scheduled = dataproc_operator.DataProcPySparkOperator(
    task_id="submit_batch-kpi-scheduled",
    cluster_name='vf-polimi-demo',
    region='europe-west1',
    main='gs://vf-polimi-batch-data/dev/compute-kpi-batch.py',
    dataproc_pyspark_jars='gs://spark-lib/bigquery/spark-bigquery-latest.jar',
    xcom_push=True)

remove_cluster = dataproc_operator.DataprocClusterDeleteOperator(
    project_id=PROJECT,
    task_id="delete_cluster",
    cluster_name='vf-polimi-demo',
    region='europe-west1')

def check_batch_kpi_scheduled_cluster_running(**kwargs):
    ti = kwargs['ti']
    xcom_value = ti.xcom_pull(task_ids='batch_kpi_scheduled_cluster')
    if xcom_value == "vf-polimi-demo":
        return 'delete_cluster'
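# A minimal sketch (not part of the original snippet) of how a callable that
# returns a task_id, like check_batch_kpi_scheduled_cluster_running above, is
# typically wired into the DAG with a BranchPythonOperator. The skip_delete
# task is a hypothetical no-op branch; only the 'delete_cluster' task_id
# appears in the source.
from airflow.operators.dummy_operator import DummyOperator
from airflow.operators.python_operator import BranchPythonOperator

check_cluster = BranchPythonOperator(
    task_id='check_batch_kpi_scheduled_cluster_running',
    python_callable=check_batch_kpi_scheduled_cluster_running,
    provide_context=True)  # Airflow 1.x: pass the task instance in via **kwargs

skip_delete = DummyOperator(task_id='skip_delete')  # hypothetical no-op path

# The branch either proceeds to delete the cluster or skips the deletion.
check_cluster >> [remove_cluster, skip_delete]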
with airflow.DAG('gcs_composer_trigger_dag',
                 default_args=default_args,
                 schedule_interval=None) as dag:

    create_dataproc_cluster = dataproc_operator.DataprocClusterCreateOperator(
        task_id='create_dataproc_cluster',
        cluster_name='composer-311-complaints-{{ ds_nodash }}',
        num_workers=2,
        region=models.Variable.get('region'),
        zone=models.Variable.get('gce_zone'),
        project_id=models.Variable.get('project_id'),
        master_machine_type='n1-standard-1',
        worker_machine_type='n1-standard-1')

    run_dataproc_job = dataproc_operator.DataProcPySparkOperator(
        task_id="run_dataproc_job",
        main="gs://311-complaints-spark_jobs/spark_job.py",
        cluster_name='composer-311-complaints-{{ ds_nodash }}',
        region=models.Variable.get('region'),
        dataproc_pyspark_jars=['gs://spark-lib/bigquery/spark-bigquery-latest.jar'],
        # Renders to gs://<bucket>/<object> from the dag_run.conf payload
        # supplied when the DAG is triggered.
        arguments=['gs://{{ dag_run.conf.get("bucket") }}/{{ dag_run.conf.get("name") }}'])

    delete_dataproc_cluster = dataproc_operator.DataprocClusterDeleteOperator(
        task_id='delete_dataproc_cluster',
        cluster_name='composer-311-complaints-{{ ds_nodash }}',
        project_id=models.Variable.get('project_id'),
        region=models.Variable.get('region'),
        trigger_rule=trigger_rule.TriggerRule.ALL_DONE)

    bigquery_transformations = BigQueryOperator(
        sql='/sql/job.sql',
        task_id='bigquery_transformations',
        use_legacy_sql=False,
import logging

logging.info('Hello World!')

create_dataproc_cluster = dataproc_operator.DataprocClusterCreateOperator(
    task_id='create_dataproc_cluster',
    # Give the cluster a unique name by appending the date scheduled.
    # See https://airflow.apache.org/code.html#default-variables
    cluster_name='composer-dataproc-{{ ds_nodash }}',
    num_workers=2,
    region='asia-south1',
    zone='asia-south1-a',
    master_machine_type='n1-standard-1',
    worker_machine_type='n1-standard-1')

dataprod_pyspark = dataproc_operator.DataProcPySparkOperator(
    task_id='pyspark',
    main='gs://code_deploy/dataproc_read_bucket_to_bigquery.py',
    cluster_name='composer-dataproc-{{ ds_nodash }}',
    region='asia-south1',
    dataproc_pyspark_jars=[])

delete_dataproc_cluster = dataproc_operator.DataprocClusterDeleteOperator(
    task_id='delete_dataproc_cluster',
    cluster_name='composer-dataproc-{{ ds_nodash }}',
    region='asia-south1',
    # Setting trigger_rule to ALL_DONE causes the cluster to be deleted
    # even if the Dataproc job fails.
    trigger_rule=trigger_rule.TriggerRule.ALL_DONE)

# An instance of an operator is called a task. In this case, the
# hello_python task calls the "greeting" Python function.
hello_python = python_operator.PythonOperator(task_id='hello',
                                              python_callable=greeting)
    task_id='create_dataproc_cluster',
    # Give the cluster a unique name by appending the date scheduled.
    # See https://airflow.apache.org/code.html#default-variables
    cluster_name=pipeline_cluster_name,
    num_workers=2,
    region='us-central1',
    autoscaling_policy=(
        'projects/{}/regions/us-central1/autoscalingPolicies/'
        'ephimeral-scaling-policy'.format(os.environ['PROJECT_ID'])),
    master_machine_type='n1-standard-1',
    worker_machine_type='n1-standard-1')

run_py_spark = dataproc_operator.DataProcPySparkOperator(
    task_id='run_py_spark',
    region='us-central1',
    main='gs://{}/data/compute-pi-pipeline/calculate-pi.py'.format(
        os.environ['COMPOSER_BUCKET']),
    arguments=[models.Variable.get("NUM_SAMPLES")],
    cluster_name=pipeline_cluster_name)

# Delete the Cloud Dataproc cluster.
delete_dataproc_cluster = dataproc_operator.DataprocClusterDeleteOperator(
    task_id='delete_dataproc_cluster',
    region='us-central1',
    cluster_name=pipeline_cluster_name,
    # Setting trigger_rule to ALL_DONE causes the cluster to be deleted
    # even if the Dataproc job fails.
    trigger_rule=trigger_rule.TriggerRule.ALL_DONE)

# Define DAG dependencies.
create_dataproc_cluster >> run_py_spark >> delete_dataproc_cluster
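# calculate-pi.py is not included in the source. The following is only a
# minimal sketch of the classic Monte Carlo pi estimation such a job usually
# performs, reading the sample count forwarded above via arguments=[NUM_SAMPLES];
# the script structure here is an assumption, not the original code.
import sys
from operator import add
from random import random

from pyspark.sql import SparkSession

spark = SparkSession.builder.appName('calculate-pi').getOrCreate()
num_samples = int(sys.argv[1])  # NUM_SAMPLES forwarded by the DAG

def inside(_):
    # Draw a random point in the unit square and test whether it falls
    # inside the quarter circle of radius 1.
    x, y = random(), random()
    return 1 if x * x + y * y < 1 else 0

count = spark.sparkContext.parallelize(range(num_samples)).map(inside).reduce(add)
print('Pi is roughly {}'.format(4.0 * count / num_samples))

spark.stop()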
delete_ml_partition = bash_operator.BashOperator(
    task_id='delete_ml_partition',
    dag=dag,
    bash_command=
    '''bq rm -f -t 'dataops_demo_ml_dev.t_twitter_google${{ macros.ds_format(ds, "%Y-%m-%d", "%Y%m%d") }}' ''',
)

# Execute PySpark job
run_pyspark_job_splitting = dataproc_operator.DataProcPySparkOperator(
    task_id='run_pyspark_job_splitting',
    dag=dag,
    main='gs://' + Variable.get('v_composer_bucket') +
    '/dags/dataproc/twitterPySparkSplitting.py',
    cluster_name='twitter-dataproc-mlanciau-{{ ds_nodash }}',
    dataproc_pyspark_jars=[
        'gs://spark-lib/bigquery/spark-bigquery-latest.jar'
    ],
    arguments=[
        "--dataproc=1.4", "--job_date={{ ds }}",
        "--bucket=dataproc_dataops_tmp"
    ])

run_pyspark_job_frequency = dataproc_operator.DataProcPySparkOperator(
    task_id='run_pyspark_job_frequency',
    dag=dag,
    main='gs://' + Variable.get('v_composer_bucket') +
    '/dags/dataproc/twitterPySparkFrequency.py',
    cluster_name='twitter-dataproc-mlanciau-{{ ds_nodash }}',
    dataproc_pyspark_jars=[
        'gs://spark-lib/bigquery/spark-bigquery-latest.jar'
# dataproc_operator
# Create small dataproc cluster
create_dataproc = dataproc_operator.DataprocClusterCreateOperator(
    task_id='create_dataproc',
    cluster_name='dataproc-cluster-demo-{{ ds_nodash }}',
    num_workers=2,
    zone=models.Variable.get('dataproc_zone'),
    master_machine_type='e2-standard-4',
    worker_machine_type='e2-standard-8')

# Run the PySpark job
run_spark0 = dataproc_operator.DataProcPySparkOperator(
    task_id='run_spark0',
    main=SPARK_CODE0,
    dataproc_pyspark_jars=jarpath,
    arguments=arglist,
    cluster_name='dataproc-cluster-demo-{{ ds_nodash }}',
    job_name=dataproc_job_name + '0')

run_spark1 = dataproc_operator.DataProcPySparkOperator(
    task_id='run_spark1',
    main=SPARK_CODE1,
    dataproc_pyspark_jars=jarpath,
    arguments=arglist,
    cluster_name='dataproc-cluster-demo-{{ ds_nodash }}',
    job_name=dataproc_job_name + '1')

run_spark2 = dataproc_operator.DataProcPySparkOperator(
    task_id='run_spark2',
    main=SPARK_CODE2,
# dataproc_operator
# Create a small Dataproc cluster.
create_dataproc = dataproc_operator.DataprocClusterCreateOperator(
    task_id='create_dataproc',
    cluster_name='dataproc-cluster-demo-{{ ds_nodash }}',
    num_workers=2,
    zone=None,
    master_machine_type='n1-standard-1',
    worker_machine_type='n1-standard-1',
    region=models.Variable.get('dataproc_zone'))

# Run the PySpark job.
run_spark = dataproc_operator.DataProcPySparkOperator(
    task_id='run_spark',
    main=SPARK_CODE,
    cluster_name='dataproc-cluster-demo-{{ ds_nodash }}',
    job_name=dataproc_job_name,
    region=models.Variable.get('dataproc_zone'))

# dataproc_operator
# Delete the Cloud Dataproc cluster.
delete_dataproc = dataproc_operator.DataprocClusterDeleteOperator(
    task_id='delete_dataproc',
    cluster_name='dataproc-cluster-demo-{{ ds_nodash }}',
    trigger_rule=trigger_rule.TriggerRule.ALL_DONE,
    region=models.Variable.get('dataproc_zone'))

# STEP 6: Set DAG dependencies.
# Each task runs only after the previous task has finished.
print_date >> create_dataproc >> run_spark >> delete_dataproc
    # See https://airflow.apache.org/code.html#default-variables
    cluster_name='accomodation-cluster-{{ ds_nodash }}',
    num_workers=2,
    init_actions_uris=['gs://able-cogency-234306/tmp/cloud-sql-proxy.sh'],
    service_account_scopes=[
        'https://www.googleapis.com/auth/cloud-platform',
        'https://www.googleapis.com/auth/sqlservice.admin'
    ],
    metadata={
        'enable-cloud-sql-hive-metastore': 'false',
        'additional-cloud-sql-instances': 'able-cogency-234306:us-central1:testddd'
    },
    region='us-central1',
    zone=models.Variable.get('gce_zone'),
    master_machine_type='n1-standard-1',
    worker_machine_type='n1-standard-1')

# Run the PySpark CSV import job on the Cloud Dataproc cluster.
csv_import_job = dataproc_operator.DataProcPySparkOperator(
    task_id='csv_import_job',
    main=CSVIMPORTPY,
    cluster_name='accomodation-cluster-{{ ds_nodash }}',
    job_name='csv_import_job',
    region='us-central1')

# Run the PySpark accommodation model job on the Cloud Dataproc cluster.
accomodation_model_job = dataproc_operator.DataProcPySparkOperator(
    task_id='accomodation_model_job',
    main=MODELPY,
    cluster_name='accomodation-cluster-{{ ds_nodash }}',
    job_name='accomodation_model_job',
    region='us-central1')

# Delete the Cloud Dataproc cluster.
delete_dataproc_cluster = dataproc_operator.DataprocClusterDeleteOperator(
    task_id='delete_dataproc_cluster',
    # Give the cluster a unique name by appending the date scheduled.
    # See https://airflow.apache.org/code.html#default-variables
    cluster_name='composer-hadoop-tutorial-cluster-{{ ds_nodash }}',
    num_workers=2,
    zone=models.Variable.get('gce_zone'),
    master_machine_type='n1-standard-1',
    worker_machine_type='n1-standard-1')

# Submit the PySpark job.
submit_pyspark1 = dataproc_operator.DataProcPySparkOperator(
    task_id='submit_pyspark1',
    main=PYSPARK_JOB,
    # Must match the name of the cluster created by the operator above.
    cluster_name='composer-hadoop-tutorial-cluster-{{ ds_nodash }}',
    dataproc_jars='gs://spark-lib/bigquery/spark-bigquery-latest.jar',
    dataproc_pyspark_jars='gs://spark-lib/bigquery/spark-bigquery-latest.jar')

submit_pyspark2 = dataproc_operator.DataProcPySparkOperator(
    task_id='submit_pyspark2',
    main=PYSPARK_JOB,
    # Must match the name of the cluster created by the operator above.
    cluster_name='composer-hadoop-tutorial-cluster-{{ ds_nodash }}',
    dataproc_jars='gs://spark-lib/bigquery/spark-bigquery-latest.jar',
    dataproc_pyspark_jars='gs://spark-lib/bigquery/spark-bigquery-latest.jar')
create_dataproc_cluster = dataproc_operator.DataprocClusterCreateOperator(
    task_id='create_dataproc_cluster',
    project_id='dataproc-300110',
    cluster_name='cluster-58-wb',
    num_workers=2,
    region='us-east1',
    init_actions_uris=['gs://worldbank2021/code/init_cluster.sh'],
    master_machine_type='n1-standard-2',
    worker_machine_type='n1-standard-2')

dataproc_pyspark_1 = dataproc_operator.DataProcPySparkOperator(
    task_id='Load_BQ_spark_job_1',
    # PySpark script to run for the load job.
    # main='gs://dataproc-nyc-taxi-2020/code_deploy/dataproc_wb.py',
    main='gs://worldbank2021/code/dataproc_load_bq.py',
    cluster_name='cluster-58-wb',
    region='us-east1',
    arguments=['wb_country_series_definition'],
    dataproc_pyspark_jars=[
        'gs://spark-lib/bigquery/spark-bigquery-latest.jar'
    ])

dataproc_pyspark_2 = dataproc_operator.DataProcPySparkOperator(
    task_id='Load_BQ_spark_job_2',
    main='gs://worldbank2021/code/dataproc_load_bq.py',
    cluster_name='cluster-58-wb',
    region='us-east1',
    arguments=['wb_country_summary'],
    dataproc_pyspark_jars=[
        'gs://spark-lib/bigquery/spark-bigquery-latest.jar'
    ])
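# The snippet above stops before any cleanup or dependency wiring. A hedged
# sketch of how such a pipeline is typically completed, following the same
# delete-and-chain pattern used by the other DAGs in this collection; the
# delete task below is an assumption, not part of the original source.
delete_dataproc_cluster = dataproc_operator.DataprocClusterDeleteOperator(
    task_id='delete_dataproc_cluster',
    project_id='dataproc-300110',
    cluster_name='cluster-58-wb',
    region='us-east1',
    # ALL_DONE removes the cluster even if one of the load jobs fails.
    trigger_rule=trigger_rule.TriggerRule.ALL_DONE)

create_dataproc_cluster >> dataproc_pyspark_1 >> dataproc_pyspark_2 >> delete_dataproc_cluster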