def create_engine(cls):
    from airflow.contrib.hooks.gcp_dataproc_hook import DataProcHook
    from airflow.contrib.operators import dataproc_operator

    from dbnd._core.current import get_settings

    cloud = get_settings().get_env_config(CloudType.gcp)
    gcp_conn_id = cloud.conn_id
    dataproc_config = DataprocConfig()
    cluster_hook = DataProcHook(gcp_conn_id=gcp_conn_id)
    return dataproc_operator.DataprocClusterCreateOperator(
        task_id="create_dataproc_cluster",
        project_id=cluster_hook.project_id,
        cluster_name=dataproc_config.cluster,
        gcp_conn_id=gcp_conn_id,
        num_workers=dataproc_config.num_workers,
        zone=dataproc_config.zone,
        network_uri=dataproc_config.network_uri,
        subnetwork_uri=dataproc_config.subnetwork_uri,
        tags=dataproc_config.tags,
        storage_bucket=dataproc_config.storage_bucket,
        init_actions_uris=dataproc_config.init_actions_uris,
        init_action_timeout=dataproc_config.init_action_timeout,
        metadata=dataproc_config.metadata,
        image_version=dataproc_config.image_version,
        properties=dataproc_config.properties,
        master_machine_type=dataproc_config.master_machine_type,
        master_disk_size=dataproc_config.master_disk_size,
        worker_machine_type=dataproc_config.worker_machine_type,
        worker_disk_size=dataproc_config.worker_disk_size,
        num_preemptible_workers=dataproc_config.num_preemptible_workers,
        labels=dataproc_config.labels,
        delegate_to=dataproc_config.delegate_to,
        service_account=dataproc_config.service_account,
        service_account_scopes=dataproc_config.service_account_scopes,
        idle_delete_ttl=dataproc_config.idle_delete_ttl,
        auto_delete_time=dataproc_config.auto_delete_time,
        auto_delete_ttl=dataproc_config.auto_delete_ttl,
    )

        'automation_dataproc_hive_create',
        schedule_interval=datetime.timedelta(days=1),
        default_args=default_dag_args) as dag:

    # Create a Cloud Dataproc cluster.
    create_dataproc_cluster = dataproc_operator.DataprocClusterCreateOperator(
        task_id='create_dataproc_cluster',
        cluster_name='dataproc',
        # project_id=models.Variable.get('PROJECT_ID', 'dataproc-usecase-276215'),
        project_id='dataproc-usecase-276215',
        region='us-west1',
        num_masters=1,
        num_workers=2,
        zone='us-west1-b',
        master_machine_type='n1-standard-1',
        master_disk_size=100,
        worker_disk_size=100,
        num_preemptible_workers=0,
        worker_machine_type='n1-standard-1',
        idle_delete_ttl=1800,
        subnetwork_uri='ctl',
        optional_components=['PRESTO', 'SOLR', 'RANGER'],
        # service_account_scopes=['cloud-platform', 'default', 'sql-admin'],
        # init_actions_uris=['gs://us-west4-test1-d1c785e8-bucket/data/cloud-sql-proxy/cloud-sql-proxy.sh'],
        # metadata={'kms-key-uri': 'projects/dataproc-usecase-276215/locations/global/keyRings/my-key-ring/cryptoKeys/migration-key', 'db-hive-password-uri': 'gs://secrectkeybucket2/hive-password.encrypted'},
        metadata={
            'kms-key-uri': 'projects/dataproc-usecase-276215/locations/global/keyRings/my-key-ring/cryptoKeys/migration-key',
            'db-hive-password-uri': 'gs://secrectkeybucket2/hive-password.encrypted',
            'use-cloud-sql-private-ip': 'true',
            'hive-metastore-instance': 'dataproc-usecase-276215:us-west1:hive-metadata',
        })

    # Run the Hive job on the Cloud Dataproc cluster.
    run_dataproc_hive_create_db = DataProcHiveOperator(
        task_id='create_db',

default_dag_args = {
    'start_date': yesterday,
    'email_on_failure': False,
    'email_on_retry': False,
    'email': ['*****@*****.**'],
}

with DAG('gcp_data_platform',
         schedule_interval=datetime.timedelta(days=1),
         default_args=default_dag_args) as dag:
    create_dataproc_cluster = dataproc_operator.DataprocClusterCreateOperator(
        project_id=PROJECT_ID,
        task_id='create_dataproc_cluster',
        cluster_name='gcp-data-platform',
        num_workers=0,
        zone='us-west1-a',
        master_machine_type='n1-highmem-4',
    )

    run_dataproc_spark = dataproc_operator.DataProcSparkOperator(
        task_id='events_dataproc',
        cluster_name='gcp-data-platform',
        region=REGION,
        main_class='io.dagster.events.EventPipeline',
        dataproc_spark_jars=[
            '%s/events-assembly-%s.jar' % (DEPLOY_BUCKET_PREFIX, LATEST_JAR_HASH)
        ],
        arguments=[
            '--gcs-input-bucket',

    'retries': 1,
    'retry_delay': datetime.timedelta(minutes=5),
    'project_id': 'upc-bdm'
}

with models.DAG(
        'store_data_integration',
        schedule_interval=datetime.timedelta(days=1),
        default_args=default_dag_args) as dag:

    # Create a Cloud Dataproc cluster.
    create_dataproc_cluster = dataproc_operator.DataprocClusterCreateOperator(
        task_id='create_dataproc_cluster',
        cluster_name='composer-data-integration-cluster-{{ ds_nodash }}',
        storage_bucket='listery-staging',
        num_workers=2,
        master_disk_size=20,
        worker_disk_size=20,
        num_preemptible_workers=1,
        zone='us-east1-c',
        master_machine_type='n1-standard-4',
        worker_machine_type='n1-standard-4')

    run_integration_job = dataproc_operator.DataProcSparkOperator(
        task_id='run_integration_job',
        main_jar=SPARK_JOBS_JAR,
        cluster_name='composer-data-integration-cluster-{{ ds_nodash }}',
        arguments=["--local", "false", "--subprogram", "integration"])

    offer_integration = dataproc_operator.DataProcSparkOperator(
        task_id='offer_integration',
        main_jar=SPARK_JOBS_JAR,

# Request the Dataproc job.
request_job = PythonOperator(
    task_id='Request_for_dataproc_job',
    python_callable=notify_success,
    dag=dag,
    on_success_callback=alert_job_requester
)

# Create the Dataproc cluster.
create_dataproc_cluster = dpo.DataprocClusterCreateOperator(
    project_id=projectID,
    task_id='create_dataproc_cluster',
    cluster_name=cluster_name,
    num_workers=num_workers,
    region=region,
    # zone='us-central1-a',
    # network_uri='default',
    subnetwork_uri=subnet,
    properties=cluster_properties,
    on_success_callback=task_success_slack_alert,
    trigger_rule=trigger_rule.TriggerRule.ALL_SUCCESS
)

# Run the Spark job on the cluster created above.
run_spark_job = BashOperator(
    task_id='run_spark_job',
    bash_command=bash_command,
    dag=dag,
    on_success_callback=task_success_slack_alert,
    trigger_rule=trigger_rule.TriggerRule.ALL_SUCCESS
)

dag = DAG(
    'dataproc_dag',
    default_args=default_args,
    schedule_interval="@once",
)

# Create a Cloud Dataproc cluster.
create_dataproc_cluster = dataproc_operator.DataprocClusterCreateOperator(
    task_id='create_dataproc_cluster',
    cluster_name='airflow-dataproc-cluster',
    project_id='hd-personalization-dev',
    num_workers=0,
    init_actions_uris=[
        'gs://hd-personalization-dev-data/vdc2136/training/updated/initialization_actions.sh'
    ],
    zone='us-east1-c',
    master_machine_type='n1-standard-8',
    subnetwork_uri='https://www.googleapis.com/compute/v1/projects/hd-personalization-dev/regions/us-east1/subnetworks/batch-us-east1-subnet',
    tags=['all-bastion-ssh', 'dataproc', 'cassandra'],
    storage_bucket='hd-personalization-dev-batch',
    properties={'dataproc:dataproc.allow.zero.workers': 'true'},
    dag=dag)

dataproc_pyspark_submit = dataproc_operator.DataProcPySparkOperator(
    task_id='pyspark_task',
    main='gs://hd-personalization-dev-artifacts/releases/com.homedepot.recommendations/collections-model-training/python-scripts/v0.0.0+16/__main__.py',
    pyfiles=[
        'gs://hd-personalization-dev-artifacts/releases/com.homedepot.recommendations/collections-model-training/python-scripts/v0.0.0+16/collections_model_training-0.0.1-py3.7.egg'
    ],

}

# [START composer_quickstart_schedule]
with models.DAG(
        'composer_csv2parquetv4',
        # Continue to run the DAG once per day.
        schedule_interval=datetime.timedelta(days=1),
        default_args=default_dag_args) as dag:
    # [END composer_quickstart_schedule]

    # Create a Cloud Dataproc cluster.
    create_dataproc_cluster = dataproc_operator.DataprocClusterCreateOperator(
        task_id='create_dataproc_cluster',
        cluster_name='parquetconverter2',
        num_workers=3,
        zone='europe-west1-b',
        master_machine_type='n1-standard-1',
        worker_machine_type='n1-standard-1')

    # Run the PySpark CSV2PARQUET example.
    run_dataproc_csv2parquet = dataproc_operator.DataProcPySparkOperator(
        task_id='run_dataproc_parquetconvert',
        cluster_name='parquetconverter2',
        main='gs://alex-code/convert.py')

    # Delete the Cloud Dataproc cluster.
    delete_dataproc_cluster = dataproc_operator.DataprocClusterDeleteOperator(
        task_id='delete_dataproc_cluster',
        cluster_name='parquetconverter2',

default_dag_args = {
    'start_date': yesterday,
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 1,
    'retry_delay': dt.timedelta(seconds=30),
    'project_id': models.Variable.get('gcp_project')
}

with DAG('dataproc_spark_submit',
         schedule_interval='0 17 * * *',
         default_args=default_dag_args) as dag:
    create_dataproc_cluster = dpo.DataprocClusterCreateOperator(
        project_id=default_dag_args['project_id'],
        task_id='create_dataproc_cluster',
        cluster_name=CLUSTER_NAME,
        num_workers=2,
        zone=models.Variable.get('gce_zone')
    )

    run_spark_job = dpo.DataProcSparkOperator(
        task_id='run_spark_job',
        # main_jar=MAIN_JAR,
        main_class=MAIN_CLASS,
        cluster_name=CLUSTER_NAME
    )

    delete_dataproc_cluster = dpo.DataprocClusterDeleteOperator(
        project_id=default_dag_args['project_id'],
        task_id='delete_dataproc_cluster',
        cluster_name=CLUSTER_NAME,

pipeline_cluster_name = 'cluster-2-compute-pi-{{ ds_nodash }}'

with models.DAG(
        'Compute-PI',
        # Continue to run the DAG once per day.
        schedule_interval=datetime.timedelta(days=1),
        default_args=default_dag_args) as dag:

    # Create a Cloud Dataproc cluster.
    create_dataproc_cluster = dataproc_operator.DataprocClusterCreateOperator(
        task_id='create_dataproc_cluster',
        # Give the cluster a unique name by appending the date scheduled.
        # See https://airflow.apache.org/code.html#default-variables
        cluster_name=pipeline_cluster_name,
        num_workers=2,
        region='us-central1',
        autoscaling_policy=(
            'projects/{}/regions/us-central1/autoscalingPolicies/ephimeral-scaling-policy'
            .format(os.environ['PROJECT_ID'])),
        master_machine_type='n1-standard-1',
        worker_machine_type='n1-standard-1')

    run_py_spark = dataproc_operator.DataProcPySparkOperator(
        task_id='run_py_spark',
        region='us-central1',
        main='gs://{}/data/compute-pi-pipeline/calculate-pi.py'.format(
            os.environ['COMPOSER_BUCKET']),
        arguments=[models.Variable.get("NUM_SAMPLES")],
        cluster_name=pipeline_cluster_name)

    # Delete the Cloud Dataproc cluster.

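# The cluster above references an autoscaling policy that must already exist in
# the same project and region. A minimal sketch of such a policy as a Dataproc
# API resource dict (all instance counts, weights, and timings below are
# illustrative assumptions, not values taken from this pipeline):
example_autoscaling_policy = {
    'id': 'ephimeral-scaling-policy',
    'workerConfig': {'minInstances': 2, 'maxInstances': 10, 'weight': 1},
    'secondaryWorkerConfig': {'minInstances': 0, 'maxInstances': 20},
    'basicAlgorithm': {
        'cooldownPeriod': '120s',
        'yarnConfig': {
            'scaleUpFactor': 0.5,
            'scaleDownFactor': 1.0,
            'gracefulDecommissionTimeout': '3600s',
        },
    },
}
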
        # Continue to run the DAG once per day.
        schedule_interval=datetime.timedelta(days=1),
        default_args=default_dag_args) as dag:

    # Create a Cloud Dataproc cluster.
    create_dataproc_cluster = dataproc_operator.DataprocClusterCreateOperator(
        task_id='create_dataproc_cluster',
        # Give the cluster a unique name by appending the date scheduled.
        # See https://airflow.apache.org/code.html#default-variables
        cluster_name='sqoop-cluster-{{ ds_nodash }}',
        num_workers=2,
        zone='us-east4-a',
        master_machine_type='n1-standard-1',
        worker_machine_type='n1-standard-1',
        init_actions_uris=[
            'gs://dataproc-initialization-actions/cloud-sql-proxy/cloud-sql-proxy.sh'
        ],
        properties={
            'hive:hive.metastore.warehouse.dir': 'gs://dannydataproc/hive-warehouse'
        },
        metadata={
            'enable-cloud-sql-hive-metastore': 'false',
            'additional-cloud-sql-instances': 'dannydataproc:us-central1:sqooptest',
            'hive-metastore-instance': 'dannydataproc:us-central1:sqooptest'
        })

    # Run the Hadoop wordcount example installed on the Cloud Dataproc cluster
    # master node.
    run_dataproc_hadoop = dataproc_operator.DataProcHadoopOperator(
        task_id='run_dataproc_hadoop',

    # detected in the Cloud Storage bucket.
    'start_date': days_ago(1),
    'retries': 1,
    'retry_delay': datetime.timedelta(minutes=5)
}

with models.DAG(
        'acme_sales_staging_composer',
        schedule_interval=datetime.timedelta(days=1),
        default_args=default_dag_args) as dag:
    create_dataproc_acme_sales_cluster = dataproc_operator.DataprocClusterCreateOperator(
        task_id='create_dataproc_acme_sales_cluster',
        cluster_name=DATAPROC_CLUSTER_NAME,
        region=REGION,
        zone=ZONE,
        num_workers=3,
        master_machine_type=DATAPROC_MASTER_MACHINE_TYPE,
        worker_machine_type=DATAPROC_MASTER_MACHINE_TYPE,
        image_version=IMAGE_VERSION,
        project_id=PROJECT_ID)

    locations_staging_spark_job = DataprocSubmitJobOperator(
        task_id="locations_staging_spark_job",
        job=LOCATIONS_STAGING_SPARK_JOB,
        location=REGION,
        project_id=PROJECT_ID)

    products_staging_spark_job = DataprocSubmitJobOperator(
        task_id="products_staging_spark_job",
        job=PRODUCTS_STAGING_SPARK_JOB,
        location=REGION,

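# LOCATIONS_STAGING_SPARK_JOB and PRODUCTS_STAGING_SPARK_JOB are defined outside
# this excerpt. DataprocSubmitJobOperator takes a Dataproc Job resource as a
# dict; a minimal sketch of one such job (the jar path and main class are
# illustrative assumptions):
LOCATIONS_STAGING_SPARK_JOB = {
    "reference": {"project_id": PROJECT_ID},
    "placement": {"cluster_name": DATAPROC_CLUSTER_NAME},
    "spark_job": {
        "jar_file_uris": ["gs://acme-artifacts/staging-jobs.jar"],
        "main_class": "com.acme.staging.LocationsStaging",
    },
}
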
# [START composer_quickstart_schedule]
with models.DAG(
        'composer_accomodation_model',
        # Continue to run the DAG once per day.
        schedule_interval=datetime.timedelta(days=1),
        default_args=default_dag_args) as dag:
    # [END composer_quickstart_schedule]

    # Create a Cloud Dataproc cluster.
    create_dataproc_cluster = dataproc_operator.DataprocClusterCreateOperator(
        task_id='create_dataproc_cluster',
        # Give the cluster a unique name by appending the date scheduled.
        # See https://airflow.apache.org/code.html#default-variables
        cluster_name='accomodation-cluster-{{ ds_nodash }}',
        num_workers=2,
        init_actions_uris=['gs://able-cogency-234306/tmp/cloud-sql-proxy.sh'],
        service_account_scopes=[
            'https://www.googleapis.com/auth/cloud-platform',
            'https://www.googleapis.com/auth/sqlservice.admin'
        ],
        metadata={
            'enable-cloud-sql-hive-metastore': 'false',
            'additional-cloud-sql-instances': 'able-cogency-234306:us-central1:testddd'
        },
        region='us-central1',
        zone=models.Variable.get('gce_zone'),
        master_machine_type='n1-standard-1',
        worker_machine_type='n1-standard-1')

    # Run the CSV import PySpark job on the cluster.
    csv_import_job = dataproc_operator.DataProcPySparkOperator(
        task_id='csv_import_job',
        main=CSVIMPORTPY,
        cluster_name='accomodation-cluster-{{ ds_nodash }}',
        job_name='csv_import_job',
        region='us-central1')

        default_args=default_dag_args) as dag:

    # Create a Cloud Dataproc cluster.
    create_dataproc_cluster = dataproc_operator.DataprocClusterCreateOperator(
        task_id='create_dataproc_cluster',
        cluster_name='dataproc',
        # project_id=models.Variable.get('PROJECT_ID', 'dataproc-usecase-276215'),
        project_id='dataproc-usecase-276215',
        region='us-west1',
        num_masters=1,
        num_workers=2,
        zone='us-west1-b',
        master_machine_type='n1-standard-1',
        master_disk_size=100,
        worker_disk_size=100,
        num_preemptible_workers=0,
        worker_machine_type='n1-standard-1',
        idle_delete_ttl=1800,
        subnetwork_uri='ctl',
        optional_components=['PRESTO', 'SOLR', 'RANGER'],
        service_account_scopes=['https://www.googleapis.com/auth/cloud-platform'],
        # service_account_scopes=['default'],
        # internal_ip_only expects a boolean; the string 'false' would be truthy.
        internal_ip_only=False,
        # init_actions_uris=['gs://goog-dataproc-initialization-actions-us-west1/cloud-sql-proxy/cloud-sql-proxy.sh'],
        # metadata={'kms-key-uri': 'projects/dataproc-usecase-276215/locations/global/keyRings/my-key-ring/cryptoKeys/migration-key', 'db-hive-password-uri': 'gs://secrectkeybucket2/hive-password.encrypted'},
        metadata={
            'kms-key-uri': 'projects/dataproc-usecase-276215/locations/global/keyRings/my-key-ring/cryptoKeys/migration-key',
            'db-hive-password-uri': 'gs://secrectkeybucket2/hive-password.encrypted',
            'use-cloud-sql-private-ip': 'true',
            'db-admin-password-uri': 'gs://secrectkeybucket2/admin-password.encrypted',
            'hive-metastore-instance': 'dataproc-usecase-276215:us-west1:hive-metadata',
        },
        init_actions_uris=['gs://dataproc-staging-us-west4-17809115036-8cg5zgf1/cloud-sql-proxy/cloud-sql-proxy.sh'],
        init_action_timeout="10m")

    'retries': 1,
    'retry_delay': datetime.timedelta(minutes=5),
    'project_id': models.Variable.get('gcp_project')
}

with models.DAG(
        'composer_sample_quickstart',
        # Continue to run the DAG once per day.
        schedule_interval=datetime.timedelta(days=1),
        default_args=default_dag_args) as dag:

    # Create a Cloud Dataproc cluster.
    create_dataproc_cluster = dataproc_operator.DataprocClusterCreateOperator(
        task_id='create_dataproc_cluster',
        # Give the cluster a unique name by appending the date scheduled.
        # See https://airflow.apache.org/code.html#default-variables
        cluster_name='composer-hadoop-tutorial-cluster-{{ ds_nodash }}',
        num_workers=2,
        region='us-central1',
        zone=models.Variable.get('gce_zone'),
        image_version='2.0',
        master_machine_type='n1-standard-2',
        worker_machine_type='n1-standard-2')

    # Run the Hadoop wordcount example installed on the Cloud Dataproc cluster
    # master node.
    run_dataproc_hadoop = dataproc_operator.DataProcHadoopOperator(
        task_id='run_dataproc_hadoop',
        region='us-central1',
        main_jar=WORDCOUNT_JAR,
        cluster_name='composer-hadoop-tutorial-cluster-{{ ds_nodash }}',
        arguments=wordcount_args)

    # Delete the Cloud Dataproc cluster.
    delete_dataproc_cluster = dataproc_operator.DataprocClusterDeleteOperator(

    'start_date': yesterday,
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 1,
    'retry_delay': datetime.timedelta(minutes=5),
    'project_id': models.Variable.get('gcp_project')
}

with models.DAG('composer_hadoop_wordcount',
                schedule_interval=datetime.timedelta(days=1),
                default_args=default_dag_args) as dag:
    create_dataproc_cluster = dataproc_operator.DataprocClusterCreateOperator(
        task_id='create_dataproc_cluster',
        cluster_name='spikey-wordcount-cluster-{{ ds_nodash }}',
        num_workers=2,
        zone=models.Variable.get('gce_zone'),
        master_machine_type='n1-standard-1',
        worker_machine_type='n1-standard-1')

    run_dataproc_hadoop = dataproc_operator.DataProcHadoopOperator(
        task_id='run_dataproc_hadoop',
        main_jar=WORDCOUNT_JAR,
        cluster_name='spikey-wordcount-cluster-{{ ds_nodash }}',
        arguments=wordcount_args)

    delete_dataproc_cluster = dataproc_operator.DataprocClusterDeleteOperator(
        task_id='delete_dataproc_cluster',
        cluster_name='spikey-wordcount-cluster-{{ ds_nodash }}',
        trigger_rule=trigger_rule.TriggerRule.ALL_DONE)

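    # The excerpt ends before the dependencies are wired up. A minimal sketch of
    # the usual create -> run -> delete chain for this pattern (assumed, not part
    # of the original snippet); ALL_DONE on the delete task tears the cluster
    # down even if the wordcount job fails:
    create_dataproc_cluster >> run_dataproc_hadoop >> delete_dataproc_cluster
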
    default_args=default_dag_args,
    description='ETL using ephemeral Hadoop cluster',
    dagrun_timeout=timedelta(minutes=50),
    max_active_runs=1)

# Create a Cloud Dataproc cluster.
create_dataproc_cluster = dataproc_operator.DataprocClusterCreateOperator(
    task_id='create_dataproc_cluster',
    dag=dag,
    project_id=os.environ.get('GCP_PROJECT'),
    cluster_name='twitter-dataproc-mlanciau-{{ ds_nodash }}',
    num_workers=2,
    num_preemptible_workers=1,
    zone='europe-west6-c',
    master_machine_type='n1-standard-1',
    worker_machine_type='n1-standard-1',
    idle_delete_ttl=3600,
    image_version='1.4',
    storage_bucket='dataproc_dataops_tmp',
    # storage_bucket='gs://{{ var.value.v_twitter_temp_bucket }}',
    subnetwork_uri='https://www.googleapis.com/compute/v1/projects/'
                   + os.environ.get('GCP_PROJECT')
                   + '/regions/europe-west6/subnetworks/default',
    # Requires Private Google Access on subnetwork 'default':
    # gcloud compute networks subnets update default --region=europe-west6 --enable-private-ip-google-access
    internal_ip_only=True
)

delete_ml_partition = bash_operator.BashOperator(
    task_id='delete_ml_partition',
    dag=dag,
    bash_command='''bq rm -f -t 'dataops_demo_ml_dev.t_twitter_google${{ macros.ds_format(ds, "%Y-%m-%d", "%Y%m%d") }}' ''',

        description='DAG for deploying a Dataproc cluster',
        schedule_interval=timedelta(days=1),
        default_args=default_dag_args) as dag:

    # STEP 5: Set operators.

    # BashOperator: simply print the date.
    print_date = BashOperator(task_id='print_date', bash_command='date')

    # dataproc_operator: create a small Dataproc cluster.
    create_dataproc = dataproc_operator.DataprocClusterCreateOperator(
        task_id='create_dataproc',
        cluster_name='dataproc-cluster-demo-{{ ds_nodash }}',
        num_workers=2,
        zone=models.Variable.get('dataproc_zone'),
        master_machine_type='n1-standard-1',
        worker_machine_type='n1-standard-1')

    # Run the PySpark job.
    run_spark = dataproc_operator.DataProcPySparkOperator(
        task_id='run_spark',
        main=SPARK_CODE,
        cluster_name='dataproc-cluster-demo-{{ ds_nodash }}',
        job_name=dataproc_job_name)

    # dataproc_operator: delete the Cloud Dataproc cluster.
    delete_dataproc = dataproc_operator.DataprocClusterDeleteOperator(
        task_id='delete_dataproc',

    'project_id': models.Variable.get('gcp_project')
}

with models.DAG('wordcount_hadoop',
                schedule_interval=datetime.timedelta(days=1),
                default_args=default_dag_args) as dag:

    # Check that the input file exists.
    check_file_existence = BashOperator(
        task_id='check_file_existence',
        bash_command='if [ ! -f "{}" ]; then exit 1; fi'.format(input_file)
    )

    # Create the Dataproc cluster.
    create_dataproc_cluster = dataproc_operator.DataprocClusterCreateOperator(
        task_id='create_dataproc_cluster',
        cluster_name='quickstart-cluster-{{ ds_nodash }}',
        num_workers=2,
        zone=models.Variable.get('gce_zone'),
        master_machine_type='n1-standard-1',
        worker_machine_type='n1-standard-1'
    )

    # Submit an Apache Hadoop job.
    run_dataproc_hadoop = dataproc_operator.DataProcHadoopOperator(
        task_id='run_dataproc_hadoop',
        main_jar=WORDCOUNT_JAR,
        cluster_name='quickstart-cluster-{{ ds_nodash }}',
        arguments=wordcount_args
    )

}

# [START composer_hadoop_schedule_airflow_1]
with models.DAG(
        'composer_hadoop_tutorial',
        # Continue to run the DAG once per day.
        schedule_interval=datetime.timedelta(days=1),
        default_args=default_dag_args) as dag:
    # [END composer_hadoop_schedule_airflow_1]

    # Create a Cloud Dataproc cluster.
    create_dataproc_cluster = dataproc_operator.DataprocClusterCreateOperator(
        task_id='create_dataproc_cluster',
        # Give the cluster a unique name by appending the date scheduled.
        # See https://airflow.apache.org/docs/apache-airflow/stable/macros-ref.html
        cluster_name='composer-hadoop-tutorial-cluster-{{ ds_nodash }}',
        num_workers=2,
        zone=models.Variable.get('gce_zone'),
        master_machine_type='n1-standard-2',
        worker_machine_type='n1-standard-2')

    # Run the Hadoop wordcount example installed on the Cloud Dataproc cluster
    # master node.
    run_dataproc_hadoop = dataproc_operator.DataProcHadoopOperator(
        task_id='run_dataproc_hadoop',
        main_jar=WORDCOUNT_JAR,
        cluster_name='composer-hadoop-tutorial-cluster-{{ ds_nodash }}',
        arguments=wordcount_args)

    # Delete the Cloud Dataproc cluster.
    delete_dataproc_cluster = dataproc_operator.DataprocClusterDeleteOperator(

    'start_date': datetime.datetime(2017, 1, 1),
}

def print_context(**kwargs):
    print(kwargs)
    file_name = kwargs['dag_run'].conf['name']
    bucket = kwargs['dag_run'].conf['bucket']
    bucket_path = "gs://{}/{}".format(bucket, file_name)
    kwargs['ti'].xcom_push(key="bucket_path", value=bucket_path)

with airflow.DAG('gcs_composer_trigger_dag',
                 default_args=default_args,
                 schedule_interval=None) as dag:
    create_dataproc_cluster = dataproc_operator.DataprocClusterCreateOperator(
        task_id='create_dataproc_cluster',
        cluster_name='composer-311-complaints-{{ ds_nodash }}',
        num_workers=2,
        region=models.Variable.get('region'),
        zone=models.Variable.get('gce_zone'),
        project_id=models.Variable.get('project_id'),
        master_machine_type='n1-standard-1',
        worker_machine_type='n1-standard-1')

    run_dataproc_job = dataproc_operator.DataProcPySparkOperator(
        task_id="run_dataproc_job",
        main="gs://311-complaints-spark_jobs/spark_job.py",
        cluster_name='composer-311-complaints-{{ ds_nodash }}',
        region=models.Variable.get('region'),
        dataproc_pyspark_jars=['gs://spark-lib/bigquery/spark-bigquery-latest.jar'],
        arguments=['gs://{{ dag_run.conf.get("bucket") }}/{{ dag_run.conf.get("name") }}'])

    delete_dataproc_cluster = dataproc_operator.DataprocClusterDeleteOperator(
        task_id='delete_dataproc_cluster',

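# This DAG has schedule_interval=None and reads the triggering GCS object out of
# dag_run.conf (typically populated by a Cloud Function watching the bucket). A
# sketch of an equivalent manual trigger using the Airflow 1.x CLI that matches
# these contrib operators; the bucket and object names are illustrative:
#
#   airflow trigger_dag gcs_composer_trigger_dag \
#       --conf '{"bucket": "my-landing-bucket", "name": "complaints/2020-01-01.csv"}'
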
    'retries': 1,
    'retry_delay': datetime.timedelta(minutes=5),
    'project_id': project_id
}

with models.DAG(
        'nyc_collisions_dag',
        schedule_interval=datetime.timedelta(days=1),
        default_args=default_dag_args) as dag:

    mapreduce_cluster_name = 'airflow-mapreduce-cluster'

    create_dataproc_cluster = dataproc_operator.DataprocClusterCreateOperator(
        task_id='create_dataproc_cluster',
        cluster_name=mapreduce_cluster_name,
        num_workers=2,
        zone=gce_zone,
        master_machine_type=machine_type,
        worker_machine_type=machine_type)

    hadoop_job = dataproc_operator.DataProcHadoopOperator(
        task_id='hadoop_job',
        cluster_name=mapreduce_cluster_name,
        main_jar=hadoop_job_jar_uri,
        arguments=[
            collisions_dataset_uri,
            f'{hadoop_job_output_bucket}/{exec_dt}'
        ])

    hive_job = dataproc_operator.DataProcHiveOperator(
        task_id='hive_job',

with models.DAG(
        'dataproc_cluster_create_pysparkjob_and_delete_1',
        # Continue to run the DAG once per day.
        schedule_interval=datetime.timedelta(days=1),
        default_args=default_dag_args) as dag:

    def greeting():
        import logging
        logging.info('Hello World!')

    create_dataproc_cluster = dataproc_operator.DataprocClusterCreateOperator(
        task_id='create_dataproc_cluster',
        # Give the cluster a unique name by appending the date scheduled.
        # See https://airflow.apache.org/code.html#default-variables
        cluster_name='composer-dataproc-{{ ds_nodash }}',
        num_workers=2,
        region='asia-south1',
        zone='asia-south1-a',
        master_machine_type='n1-standard-1',
        worker_machine_type='n1-standard-1')

    dataprod_pyspark = dataproc_operator.DataProcPySparkOperator(
        task_id='pyspark',
        main='gs://code_deploy/dataproc_read_bucket_to_bigquery.py',
        cluster_name='composer-dataproc-{{ ds_nodash }}',
        region='asia-south1',
        dataproc_pyspark_jars=[])

    delete_dataproc_cluster = dataproc_operator.DataprocClusterDeleteOperator(
        task_id='delete_dataproc_cluster',
        cluster_name='composer-dataproc-{{ ds_nodash }}',
        region='asia-south1',
        # Setting trigger_rule to ALL_DONE causes the cluster to be deleted

    'retry_delay': datetime.timedelta(minutes=5),
    'project_id': models.Variable.get('gcp_project')
}

with models.DAG(
        'composer_sample_quickstart',
        # Continue to run the DAG once per day.
        schedule_interval=datetime.timedelta(days=1),
        default_args=default_dag_args) as dag:

    # Create a Cloud Dataproc cluster.
    create_dataproc_cluster = dataproc_operator.DataprocClusterCreateOperator(
        task_id='create_dataproc_cluster',
        # Give the cluster a unique name by appending the date scheduled.
        # See https://airflow.apache.org/code.html#default-variables
        cluster_name='quickstart-cluster-{{ ds_nodash }}',
        num_workers=2,
        zone=models.Variable.get('gce_zone'),
        master_machine_type='n1-standard-1',
        worker_machine_type='n1-standard-1')

    # Run the Hadoop wordcount example installed on the Cloud Dataproc cluster
    # master node.
    run_dataproc_hadoop = dataproc_operator.DataProcHadoopOperator(
        task_id='run_dataproc_hadoop',
        main_jar=WORDCOUNT_JAR,
        cluster_name='quickstart-cluster-{{ ds_nodash }}',
        arguments=wordcount_args)

    # Delete the Cloud Dataproc cluster.
    delete_dataproc_cluster = dataproc_operator.DataprocClusterDeleteOperator(

# def greeting():
#     import logging
#     logging.info('Hello DataPipeline for World Health Data!')

# An instance of an operator is called a task. In this case, the
# hello_python task calls the "greeting" Python function.
# hello_python = python_operator.PythonOperator(
#     task_id='KickOff',
#     python_callable=greeting)

create_dataproc_cluster = dataproc_operator.DataprocClusterCreateOperator(
    task_id='create_dataproc_cluster',
    project_id='dataproc-300110',
    cluster_name='cluster-58-wb',
    num_workers=2,
    region='us-east1',
    init_actions_uris=['gs://worldbank2021/code/init_cluster.sh'],
    master_machine_type='n1-standard-2',
    worker_machine_type='n1-standard-2')

dataproc_pyspark_1 = dataproc_operator.DataProcPySparkOperator(
    task_id='Load_BQ_spark_job_1',
    # Call the .py file that does the processing.
    # main='gs://dataproc-nyc-taxi-2020/code_deploy/dataproc_wb.py',
    main='gs://worldbank2021/code/dataproc_load_bq.py',
    cluster_name='cluster-58-wb',
    region='us-east1',
    arguments=['wb_country_series_definition'],
    dataproc_pyspark_jars=[
        'gs://spark-lib/bigquery/spark-bigquery-latest.jar'