Example #1
      job_name=dag_name + 'bike_share_retention_d7',
      cluster_name='{{ ti.xcom_pull(key="cluster_name", task_ids="push-cluster-name") }}'
      + '4',
      execution_timeout=timedelta(minutes=180),
      arguments=args)

  unique_user_sensor = GoogleCloudStorageObjectSensor(
      task_id='unique_user_sensor',
      bucket='jiuzhangsuanfa',
      object='bike/unique-user/_SUCCESS',
      poke_interval=30,
      timeout=2700)

  # Task dependencies: each ephemeral cluster is created, runs its job(s),
  # and is then destroyed once the results are produced.
  unique_user.set_upstream(dataproc_create_cluster_1)
  unique_user.set_downstream(bike_share_aggregator)
  bike_share_aggregator.set_downstream(dataproc_destroy_cluster_1)

  bike_share_retention_d1.set_upstream(dataproc_create_cluster_2)
  bike_share_retention_d1.set_downstream(dataproc_destroy_cluster_2)

  bike_share_retention_d3.set_upstream(dataproc_create_cluster_3)
  bike_share_retention_d3.set_downstream(dataproc_destroy_cluster_3)

  bike_share_retention_d7.set_upstream(dataproc_create_cluster_4)
  bike_share_retention_d7.set_downstream(dataproc_destroy_cluster_4)
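
The cluster_name in Example #1 is rendered from XCom ({{ ti.xcom_pull(key="cluster_name", task_ids="push-cluster-name") }} plus a numeric suffix), so a push-cluster-name task has to run earlier in the DAG. A minimal sketch of what that upstream task might look like, using Airflow 1.10-style operators as in the excerpt; the callable body and the naming scheme are assumptions, not taken from the source:

from airflow.operators.python_operator import PythonOperator

def push_cluster_name(**kwargs):
    # Hypothetical naming scheme: build a per-run cluster name from the
    # execution date and publish it under the XCom key the snippet pulls.
    kwargs['ti'].xcom_push(key='cluster_name',
                           value='bike-share-{}'.format(kwargs['ds_nodash']))

push_unique_cluster_name = PythonOperator(
    task_id='push-cluster-name',
    python_callable=push_cluster_name,
    provide_context=True,  # Airflow 1.10: pass ti / ds_nodash into the callable
    dag=dag)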
Example #2
        },
        {
            'name': 'retention',
            'type': 'INTEGER',
            'mode': 'NULLABLE'
        },
    ],
    create_disposition='CREATE_IF_NEEDED',
    write_disposition='WRITE_APPEND',
    dag=dag)

# =================
# == tasks flow ===
# =================

# dataproc upstream & downstream for both the create-cluster and delete-cluster tasks
create_cluster_1.set_upstream(push_unique_cluster_name)
create_cluster_1.set_upstream(sensor_task)
create_cluster_2.set_upstream(push_unique_cluster_name)
create_cluster_2.set_upstream(delete_cluster_1)
# create job upstream & downstream
calc_unique_users.set_upstream(create_cluster_1)
calc_unique_users.set_downstream(bq_load_user)
calc_unique_users.set_downstream(calc_agg)
calc_agg.set_downstream(bq_load_agg)
calc_retention_day1.set_upstream(create_cluster_2)
calc_retention_day1.set_downstream(bq_load_retention)
# gcs_to_bq upstream & downstream
bq_load_agg.set_downstream(delete_cluster_1)
bq_load_retention.set_downstream(delete_cluster_2)
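
For readability, the same wiring can be expressed with Airflow's bitshift operators, which are equivalent to set_upstream/set_downstream; a sketch using the task names from the excerpt:

# Chains read left to right: create each cluster, run its jobs,
# load the results into BigQuery, then delete the cluster.
push_unique_cluster_name >> create_cluster_1
sensor_task >> create_cluster_1
create_cluster_1 >> calc_unique_users
calc_unique_users >> bq_load_user
calc_unique_users >> calc_agg >> bq_load_agg >> delete_cluster_1
push_unique_cluster_name >> create_cluster_2
delete_cluster_1 >> create_cluster_2
create_cluster_2 >> calc_retention_day1 >> bq_load_retention >> delete_cluster_2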
Example #3
        # in YYYYMMDD format. See docs https://airflow.apache.org/code.html?highlight=macros#macros
        cluster_name='ephemeral-spark-cluster-{{ ds_nodash }}',
        image_version='1.5-debian10',
        num_workers=2,
        storage_bucket=CLUSTER_STORAGE_BUCKET,
        region=REGION,
        zone=ZONE)

    # Submit our Spark Job
    submit_scalaspark = DataProcSparkOperator(
        task_id=TASK_ID,
        region=REGION,
        main_class=MAIN_CLASS,
        cluster_name='ephemeral-spark-cluster-{{ ds_nodash }}',
        dataproc_spark_jars=MAIN_JAR)

    # Delete the Cloud Dataproc cluster.
    delete_cluster = DataprocClusterDeleteOperator(
        task_id='delete_dataproc_cluster',
        region=REGION,
        # Must match the cluster_name used by the create and submit operators above.
        cluster_name='ephemeral-spark-cluster-{{ ds_nodash }}',
        # This will tear down the cluster even if there are failures in upstream tasks.
        trigger_rule=TriggerRule.ALL_DONE)

    create_cluster.dag = dag

    # Chain the tasks: create the cluster, submit the Spark job, then delete
    # the cluster (the delete runs even on failure, per TriggerRule.ALL_DONE).
    create_cluster.set_downstream(submit_scalaspark)
    submit_scalaspark.set_downstream(delete_cluster)
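
The DAG object that Example #3 attaches its tasks to is not shown in the excerpt. A minimal sketch of what the surrounding definition might look like; the dag_id, schedule, and dates below are placeholders, not taken from the source:

from datetime import datetime, timedelta

from airflow import DAG
from airflow.utils.trigger_rule import TriggerRule  # used by delete_cluster above

# Hypothetical DAG definition; only the TriggerRule import is actually
# required by the excerpt, the rest is illustrative.
default_args = {
    'owner': 'airflow',
    'retries': 1,
    'retry_delay': timedelta(minutes=5),
}

dag = DAG(
    dag_id='ephemeral_dataproc_spark',
    default_args=default_args,
    start_date=datetime(2020, 1, 1),
    schedule_interval='@daily')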