default_args=default_dag_args) as dag:

    # Create a Cloud Dataproc cluster.
    create_dataproc_cluster = dataproc_operator.DataprocClusterCreateOperator(
        task_id='create_dataproc_cluster',
        # Give the cluster a unique name by appending the date scheduled.
        # See https://airflow.apache.org/code.html#default-variables
        cluster_name='quickstart-cluster-{{ ds_nodash }}',
        num_workers=2,
        zone=models.Variable.get('gce_zone'),
        master_machine_type='n1-standard-1',
        worker_machine_type='n1-standard-1')

    # Run the Hadoop wordcount example installed on the Cloud Dataproc cluster
    # master node.
    run_dataproc_hadoop = dataproc_operator.DataProcHadoopOperator(
        task_id='run_dataproc_hadoop',
        main_jar=WORDCOUNT_JAR,
        cluster_name='quickstart-cluster-{{ ds_nodash }}',
        arguments=wordcount_args)

    # Delete Cloud Dataproc cluster.
    delete_dataproc_cluster = dataproc_operator.DataprocClusterDeleteOperator(
        task_id='delete_dataproc_cluster',
        cluster_name='quickstart-cluster-{{ ds_nodash }}',
        # Setting trigger_rule to ALL_DONE causes the cluster to be deleted
        # even if the Dataproc job fails.
        trigger_rule=trigger_rule.TriggerRule.ALL_DONE)

    # Define DAG dependencies.
    create_dataproc_cluster >> run_dataproc_hadoop >> delete_dataproc_cluster
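
This first excerpt opens on the closing line of its `with models.DAG(...)` statement, so the imports, wordcount arguments, and default DAG arguments it relies on are not shown. Below is a minimal sketch of that surrounding context, assuming Airflow 1.x contrib-style imports; the DAG id, schedule, jar path, and bucket are illustrative assumptions, not recovered from the excerpt.

import datetime

from airflow import models
from airflow.contrib.operators import dataproc_operator
from airflow.utils import trigger_rule

# Hadoop MapReduce examples jar preinstalled on Dataproc nodes (assumed path).
WORDCOUNT_JAR = 'file:///usr/lib/hadoop-mapreduce/hadoop-mapreduce-examples.jar'

# Wordcount arguments: subcommand, input file, per-run output prefix
# (bucket name is hypothetical).
wordcount_args = ['wordcount',
                  'gs://pub/shakespeare/rose.txt',
                  'gs://your-bucket/wordcount/{{ ds_nodash }}']

default_dag_args = {
    # Fixed start date plus the project to bill, picked up by the operators.
    'start_date': datetime.datetime(2018, 1, 1),
    'project_id': models.Variable.get('gcp_project'),
}

with models.DAG(
        'quickstart_wordcount',  # hypothetical DAG id
        schedule_interval=datetime.timedelta(days=1),
        default_args=default_dag_args) as dag:
    ...  # the three tasks shown in the example above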
Example 2
        default_args=default_dag_args) as dag:

    mapreduce_cluster_name = 'airflow-mapreduce-cluster'

    create_dataproc_cluster = dataproc_operator.DataprocClusterCreateOperator(
        task_id='create_dataproc_cluster',
        cluster_name=mapreduce_cluster_name,
        num_workers=2,
        zone=gce_zone,
        master_machine_type=machine_type,
        worker_machine_type=machine_type)

    hadoop_job = dataproc_operator.DataProcHadoopOperator(
        task_id='hadoop_job',
        cluster_name=mapreduce_cluster_name,
        main_jar=hadoop_job_jar_uri,
        arguments=[
            collisions_dataset_uri,
            f'{hadoop_job_output_bucket}/{exec_dt}'
        ])

    hive_job = dataproc_operator.DataProcHiveOperator(
        task_id='hive_job',
        cluster_name=mapreduce_cluster_name,
        dataproc_hive_jars=[hive_hcatalog_jar_uri],
        query_uri=hive_job_hql_uri,
        variables={
            'collisions_job_output_bucket': f'{hadoop_job_output_bucket}/{exec_dt}',
            'hive_job_output_bucket':       f'{hive_job_output_bucket}/{exec_dt}',
            'hive_hcatalog_jar':            hive_hcatalog_jar_uri,
            'zips_boroughs_bucket':         zips_boroughs_bucket_uri
        })
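
Example 2 references several names that the excerpt never defines, and it stops before the tasks are chained. The sketch below shows how those names might be defined and how the pipeline would likely be wired; every value is illustrative and only the variable names come from the excerpt.

from airflow import models

gce_zone = models.Variable.get('gce_zone')
machine_type = 'n1-standard-1'

# Templated execution date, rendered by Airflow, so each run writes to its
# own output prefix.
exec_dt = '{{ ds_nodash }}'

# All URIs below are hypothetical placeholders.
hadoop_job_jar_uri = 'gs://your-bucket/jars/collisions-mapreduce.jar'
collisions_dataset_uri = 'gs://your-bucket/data/collisions.csv'
hadoop_job_output_bucket = 'gs://your-bucket/output/hadoop'
hive_job_output_bucket = 'gs://your-bucket/output/hive'
hive_hcatalog_jar_uri = 'file:///usr/lib/hive-hcatalog/share/hcatalog/hive-hcatalog-core.jar'
hive_job_hql_uri = 'gs://your-bucket/queries/collisions.hql'
zips_boroughs_bucket_uri = 'gs://your-bucket/data/zips_boroughs.csv'

# The Hive query reads the MapReduce output, so a linear chain is the natural
# ordering; a delete-cluster task (as in the other examples) could follow.
create_dataproc_cluster >> hadoop_job >> hive_job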
Example 3
            'enable-cloud-sql-hive-metastore': 'false',
            'additional-cloud-sql-instances':
            'dannydataproc:us-central1:sqooptest',
            'hive-metastore-instance': 'dannydataproc:us-central1:sqooptest'
        })

    # Run a Sqoop import on the Cloud Dataproc cluster: copy the MySQL
    # 'entries' table to Cloud Storage as Avro files.
    run_dataproc_hadoop = dataproc_operator.DataProcHadoopOperator(
        task_id='run_dataproc_hadoop',
        main_class='org.apache.sqoop.Sqoop',
        dataproc_hadoop_jars=[
            'gs://dannydataproc/sqoop-1.4.7-hadoop260.jar',
            'file:///usr/share/java/mysql-connector-java-5.1.42.jar',
            'gs://dannydataproc/avro-tools-1.8.2.jar'
        ],
        arguments=[
            'import', '-Dmapreduce.job.user.classpath.first=true',
            '--connect=jdbc:mysql://127.0.0.1/guestbook', '--username=sqoop',
            '--password-file=gs://dannydataproc/passwordFile.txt',
            '--target-dir=gs://dannydataproc/entities', '--table=entries',
            '--as-avrodatafile', '--delete-target-dir'
        ],
        cluster_name='sqoop-cluster-{{ ds_nodash }}')

    # Delete Cloud Dataproc cluster.
    delete_dataproc_cluster = dataproc_operator.DataprocClusterDeleteOperator(
        task_id='delete_dataproc_cluster',
        cluster_name='sqoop-cluster-{{ ds_nodash }}',
        # Setting trigger_rule to ALL_DONE causes the cluster to be deleted
        # even if the Dataproc job fails.
        trigger_rule=trigger_rule.TriggerRule.ALL_DONE)
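
The opening lines of this example's cluster-create call were cut off; what remains are metadata keys consumed by Dataproc's cloud-sql-proxy initialization action. Purely as a sketch (the init-action URI, zone source, and worker count are assumptions, not recovered from the excerpt), a compatible call and the usual task chain might look like the following; inside the DAG these lines would sit under `with ... as dag:`.

from airflow import models
from airflow.contrib.operators import dataproc_operator

create_dataproc_cluster = dataproc_operator.DataprocClusterCreateOperator(
    task_id='create_dataproc_cluster',
    cluster_name='sqoop-cluster-{{ ds_nodash }}',
    num_workers=2,
    zone=models.Variable.get('gce_zone'),
    init_actions_uris=[
        # Assumed: the public cloud-sql-proxy initialization action.
        'gs://dataproc-initialization-actions/cloud-sql-proxy/cloud-sql-proxy.sh'
    ],
    metadata={
        'enable-cloud-sql-hive-metastore': 'false',
        'additional-cloud-sql-instances': 'dannydataproc:us-central1:sqooptest',
        'hive-metastore-instance': 'dannydataproc:us-central1:sqooptest'
    })

# Chain the tasks so the cluster exists before the Sqoop import runs and is
# always torn down afterwards.
create_dataproc_cluster >> run_dataproc_hadoop >> delete_dataproc_cluster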
Example 4

    # Create a Cloud Dataproc cluster.
    create_dataproc_cluster = dataproc_operator.DataprocClusterCreateOperator(
        task_id='create_dataproc_cluster',
        # Give the cluster a unique name by appending the date scheduled.
        # See https://airflow.apache.org/code.html#default-variables
        cluster_name='composer-hadoop-tutorial-cluster-{{ ds_nodash }}',
        num_workers=2,
        region='us-central1',
        zone=models.Variable.get('gce_zone'),
        image_version='2.0',
        master_machine_type='n1-standard-2',
        worker_machine_type='n1-standard-2')

    # Run the Hadoop wordcount example installed on the Cloud Dataproc cluster
    # master node.
    run_dataproc_hadoop = dataproc_operator.DataProcHadoopOperator(
        task_id='run_dataproc_hadoop',
        region='us-central1',
        main_jar=WORDCOUNT_JAR,
        cluster_name='composer-hadoop-tutorial-cluster-{{ ds_nodash }}',
        arguments=wordcount_args)

    # Delete Cloud Dataproc cluster.
    delete_dataproc_cluster = dataproc_operator.DataprocClusterDeleteOperator(
        task_id='delete_dataproc_cluster',
        region='us-central1',
        cluster_name='composer-hadoop-tutorial-cluster-{{ ds_nodash }}',
        # Setting trigger_rule to ALL_DONE causes the cluster to be deleted
        # even if the Dataproc job fails.
        trigger_rule=trigger_rule.TriggerRule.ALL_DONE)

    # Define DAG dependencies.
    create_dataproc_cluster >> run_dataproc_hadoop >> delete_dataproc_cluster
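
Example 4 (like Example 1) leaves `wordcount_args` undefined, and its output path must be unique per run so repeated executions do not collide. One way to build it is sketched below, assuming a `gcs_bucket` Airflow Variable; the bucket and path layout are illustrative.

import datetime
import os

from airflow import models

# Per-run output prefix under a bucket stored as an Airflow Variable. Note
# this is evaluated when the DAG file is parsed, not at task run time.
output_file = os.path.join(
    models.Variable.get('gcs_bucket'), 'wordcount',
    datetime.datetime.now().strftime('%Y%m%d-%H%M%S')) + os.sep

# Arguments for the wordcount job: subcommand, input file, output prefix.
wordcount_args = ['wordcount', 'gs://pub/shakespeare/rose.txt', output_file]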
Example 5
    # Create a Cloud Dataproc cluster.
    create_dataproc_cluster = dataproc_operator.DataprocClusterCreateOperator(
        task_id='create_dataproc_cluster',
        # Give the cluster a unique name by appending the date scheduled.
        # See https://airflow.apache.org/code.html#default-variables
        cluster_name=naming_cluster + '-{{ ds_nodash }}',
        num_workers=2,
        zone=var_zone,
        master_machine_type='n1-standard-1',
        worker_machine_type='n1-standard-1')

    # Submit the job jar (SPARK_JAR) to the Cloud Dataproc cluster via the
    # Hadoop job operator.
    run_dataproc_spark = dataproc_operator.DataProcHadoopOperator(
        task_id='run_dataproc_hadoop',
        main_jar=SPARK_JAR,
        cluster_name=naming_cluster + '-{{ ds_nodash }}',
        arguments=executor_args)

    # Delete Cloud Dataproc cluster.
    delete_dataproc_cluster = dataproc_operator.DataprocClusterDeleteOperator(
        task_id='delete_dataproc_cluster',
        cluster_name=naming_cluster + '-{{ ds_nodash }}',
        # Setting trigger_rule to ALL_DONE causes the cluster to be deleted
        # even if the Dataproc job fails.
        trigger_rule=trigger_rule.TriggerRule.ALL_DONE)

    # [START composer_hadoop_steps]
    # Define DAG dependencies.
    create_dataproc_cluster >> run_dataproc_spark >> delete_dataproc_cluster
    # [END composer_hadoop_steps]
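
Example 5 references `naming_cluster`, `var_zone`, `SPARK_JAR`, and `executor_args` without defining them; because the jar is submitted through `DataProcHadoopOperator`, it needs a runnable main class. A sketch of how those names might be defined follows; every value is an illustrative assumption.

from airflow import models

naming_cluster = 'composer-spark-cluster'          # hypothetical base cluster name
var_zone = models.Variable.get('gce_zone')         # zone taken from an Airflow Variable
SPARK_JAR = 'gs://your-bucket/jars/spark-job.jar'  # hypothetical jar with a main class
executor_args = [                                  # hypothetical job arguments
    'gs://your-bucket/input/data.csv',
    'gs://your-bucket/output/{{ ds_nodash }}',
]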