Example #1
    #create_dataproc_cluster = dataproc_operator.DataprocClusterCreateOperator(
    #    task_id='create_dataproc_cluster',
    #    # Give the cluster a unique name by appending the date scheduled.
    #    # See https://airflow.apache.org/code.html#default-variables
    #    cluster_name='composer-hadoop-tutorial-cluster-{{ ds_nodash }}',
    #    num_workers=2,
    #    zone=models.Variable.get('gce_zone'),
    #    master_machine_type='n1-standard-1',
    #    worker_machine_type='n1-standard-1')

    # Submit the PySpark job.
    submit_pyspark1 = dataproc_operator.DataProcPySparkOperator(
        task_id='submit_pyspark1',
        main=PYSPARK_JOB,
        # The cluster name must match an existing cluster, since the create
        # step above is commented out.
        cluster_name='cicd-demo-cluster',
        region='us-central1',
        dataproc_jars='gs://spark-lib/bigquery/spark-bigquery-latest.jar',
        dataproc_pyspark_jars='gs://spark-lib/bigquery/spark-bigquery-latest.jar')

    submit_pyspark2 = dataproc_operator.DataProcPySparkOperator(
        task_id='submit_pyspark2',
        main=PYSPARK_JOB,
        # The cluster name must match an existing cluster, since the create
        # step above is commented out.
        cluster_name='cicd-demo-cluster',
        region='us-central1',
        dataproc_jars='gs://spark-lib/bigquery/spark-bigquery-latest.jar',
        dataproc_pyspark_jars='gs://spark-lib/bigquery/spark-bigquery-latest.jar')
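
This fragment assumes a surrounding DAG definition and a PYSPARK_JOB constant that are not shown. A minimal sketch of that assumed context, with illustrative names and paths that are not from the original, could look like:

import datetime

from airflow import models
from airflow.contrib.operators import dataproc_operator

# Assumed constant: GCS path of the PySpark script submitted by the tasks above.
PYSPARK_JOB = 'gs://my-bucket/jobs/my_pyspark_job.py'  # illustrative path

default_dag_args = {
    'start_date': datetime.datetime(2021, 1, 1),
    'project_id': 'my-gcp-project',  # illustrative project id
}

with models.DAG('cicd_demo_dag',
                schedule_interval=datetime.timedelta(days=1),
                default_args=default_dag_args) as dag:
    # The submit_pyspark1 and submit_pyspark2 tasks shown above go here.
    pass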
Example #2
    tags=['all-bastion-ssh', 'dataproc', 'cassandra'],
    storage_bucket='hd-personalization-dev-batch',
    properties={'dataproc:dataproc.allow.zero.workers': 'true'},
    dag=dag)

dataproc_pyspark_submit = dataproc_operator.DataProcPySparkOperator(
    task_id='pyspark_task',
    main=
    'gs://hd-personalization-dev-artifacts/releases/com.homedepot.recommendations/collections-model-training/python-scripts/v0.0.0+16/__main__.py',
    pyfiles=[
        'gs://hd-personalization-dev-artifacts/releases/com.homedepot.recommendations/collections-model-training/python-scripts/v0.0.0+16/collections_model_training-0.0.1-py3.7.egg'
    ],
    arguments=[
        'LSTM_DATAGEN', '--project', 'hd-personalization-dev', '--category',
        'AreaRugs', '--dupletsData',
        'gs://hd-personalization-dev-data/vdc2136/training/duplets/2020-06-01/',
        '--featuresData',
        'gs://hd-personalization-dev-data/vdc2136/training/data/AllFeatures.csv',
        '--finalOutputPath',
        'gs://hd-personalization-dev-data/vdc2136/training/lstm/2020-06-02/',
        '--appName', 'LSTM_DATA_GEN', '--mode=cluster'
    ],
    job_name='airflow_pyspark_job',
    cluster_name='airflow-dataproc-cluster',
    project_id='hd-personalization-dev',
    dag=dag)

delete_dataproc_cluster = dataproc_operator.DataprocClusterDeleteOperator(
    task_id='delete_dataproc_cluster',
    cluster_name='airflow-dataproc-cluster',
    # Setting trigger_rule to ALL_DONE causes the cluster to be deleted
    # even if the Dataproc job fails.
    trigger_rule=trigger_rule.TriggerRule.ALL_DONE,
    dag=dag)
Example #3
    # BashOperator
    # A simple print date
    print_date = BashOperator(task_id='print_date', bash_command='date')

    # dataproc_operator
    # Create small dataproc cluster
    create_dataproc = dataproc_operator.DataprocClusterCreateOperator(
        task_id='create_dataproc',
        cluster_name='dataproc-cluster-demo-{{ ds_nodash }}',
        num_workers=2,
        zone=models.Variable.get('dataproc_zone'),
        master_machine_type='n1-standard-1',
        worker_machine_type='n1-standard-1')

    # Run the PySpark job
    run_spark = dataproc_operator.DataProcPySparkOperator(
        task_id='run_spark',
        main=SPARK_CODE,
        cluster_name='dataproc-cluster-demo-{{ ds_nodash }}',
        job_name=dataproc_job_name)

    # dataproc_operator
    # Delete Cloud Dataproc cluster.
    delete_dataproc = dataproc_operator.DataprocClusterDeleteOperator(
        task_id='delete_dataproc',
        cluster_name='dataproc-cluster-demo-{{ ds_nodash }}',
        trigger_rule=trigger_rule.TriggerRule.ALL_DONE)

    # STEP 6: Set DAG dependencies
    # Each task runs only after the previous task has finished.
    print_date >> create_dataproc >> run_spark >> delete_dataproc
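
Note that this example depends on SPARK_CODE, dataproc_job_name and a surrounding DAG block that the fragment omits. A hedged sketch of that assumed setup (names, paths and dates are illustrative):

import datetime

from airflow import models
from airflow.contrib.operators import dataproc_operator
from airflow.operators.bash_operator import BashOperator
from airflow.utils import trigger_rule

# Assumed values referenced by the tasks above; replace with your own.
SPARK_CODE = 'gs://my-bucket/spark/transform.py'
dataproc_job_name = 'dataproc_demo_job'

default_dag_args = {
    'start_date': datetime.datetime(2021, 1, 1),
    'project_id': 'my-gcp-project',  # illustrative project id
}

with models.DAG('dataproc_demo',
                schedule_interval=datetime.timedelta(days=1),
                default_args=default_dag_args) as dag:
    # The print_date, create_dataproc, run_spark and delete_dataproc tasks
    # shown above go here.
    pass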
Example #4
import datetime
import os

from airflow import models
from airflow.contrib.operators import dataproc_operator
from airflow.utils import trigger_rule

output_file = os.path.join(
    models.Variable.get('gcs_bucket'), 'dataproc_simple',
    datetime.datetime.now().strftime('%Y%m%d-%H%M%S')) + os.sep

yesterday = datetime.datetime.combine(
    datetime.datetime.today() - datetime.timedelta(1),
    datetime.datetime.min.time())

args = {
    'start_date': yesterday,
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 1,
    'retry_delay': datetime.timedelta(minutes=5),
    'project_id': models.Variable.get('gcp_project')
}

with models.DAG('spark_simple',
                schedule_interval=datetime.timedelta(days=1),
                default_args=args) as dag:
    run_step = dataproc_operator.DataProcPySparkOperator(
        task_id='run_spark',
        cluster_name='cluster-9c11',
        region='europe-west1',
        main='gs://bigdataupv_code/compras_top_ten_countries.py',
        files=['gs://bigdataupv_code/helpers.py'])
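
The snippet reads the Airflow Variables gcs_bucket and gcp_project, which must already exist in the environment. A hedged way to seed them from code running against the same Airflow metadata database (the values are placeholders):

from airflow import models

# Placeholder values; replace with your own bucket and project.
models.Variable.set('gcs_bucket', 'gs://my-composer-bucket')
models.Variable.set('gcp_project', 'my-gcp-project')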
Example #5
    # [END composer_quickstart_schedule]

    # Create a Cloud Dataproc cluster.
    create_dataproc_cluster = dataproc_operator.DataprocClusterCreateOperator(
        task_id='create_dataproc_cluster',
        # Give the cluster a unique name by appending the date scheduled.
        # See https://airflow.apache.org/code.html#default-variables
        cluster_name='parquetconverter2',
        num_workers=3,
        zone='europe-west1-b',
        master_machine_type='n1-standard-1',
        worker_machine_type='n1-standard-1')

    # Run the pyspark CSV2PARQUET example
    run_dataproc_csv2parquet = dataproc_operator.DataProcPySparkOperator(
        task_id='run_dataproc_parquetconvert',
        cluster_name='parquetconverter2',
        main='gs://alex-code/convert.py')

    # Delete Cloud Dataproc cluster.
    delete_dataproc_cluster = dataproc_operator.DataprocClusterDeleteOperator(
        task_id='delete_dataproc_cluster',
        cluster_name='parquetconverter2',
        # Setting trigger_rule to ALL_DONE causes the cluster to be deleted
        # even if the Dataproc job fails.
        trigger_rule=trigger_rule.TriggerRule.ALL_DONE)

    # [START composer_quickstart_steps]
    # Define DAG dependencies.
    create_dataproc_cluster >> run_dataproc_csv2parquet >> delete_dataproc_cluster
    # [END composer_quickstart_steps]
Example #6
        },
        optional_components=['ANACONDA', 'JUPYTER', 'ZEPPELIN'],
        enable_optional_components=True,
        enable_http_port_access=True,
        zone="europe-west1-b",
        storage_bucket="vf-polimi-batch-data",
        idle_delete_ttl=3601,
        internal_ip_only=False,
        init_actions_uris=[
            'gs://goog-dataproc-initialization-actions-europe-west1/python/pip-install.sh'
        ])

    run_batch_kpi_scheduled = dataproc_operator.DataProcPySparkOperator(
        task_id="submit_batch-kpi-scheduled",
        cluster_name='vf-polimi-demo',
        region='europe-west1',
        main='gs://vf-polimi-batch-data/dev/compute-kpi-batch.py',
        dataproc_pyspark_jars='gs://spark-lib/bigquery/spark-bigquery-latest.jar',
        xcom_push=True)

    remove_cluster = dataproc_operator.DataprocClusterDeleteOperator(
        project_id=PROJECT,
        task_id="delete_cluster",
        cluster_name='vf-polimi-demo',
        region='europe-west1')

    def check_batch_kpi_scheduled_cluster_running(**kwargs):
        ti = kwargs['ti']
        xcom_value = ti.xcom_pull(task_ids='batch_kpi_scheduled_cluster')
        if xcom_value == "vf-polimi-demo":
            return 'delete_cluster'
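
The helper above is cut off and only returns 'delete_cluster' when the XCom value matches, which suggests a branching step. One plausible wiring with BranchPythonOperator, reusing the task names from the snippet (the skip_delete task and the else branch are assumptions, not part of the original), would sit inside the same DAG block:

from airflow.operators.dummy_operator import DummyOperator
from airflow.operators.python_operator import BranchPythonOperator

# Assumed branch: the callable would also need to return 'skip_delete' when the
# XCom value does not match, so the branch always resolves to a task.
check_cluster = BranchPythonOperator(
    task_id='check_cluster',
    python_callable=check_batch_kpi_scheduled_cluster_running,
    provide_context=True)

skip_delete = DummyOperator(task_id='skip_delete')

run_batch_kpi_scheduled >> check_cluster >> [remove_cluster, skip_delete]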
Example #7
with airflow.DAG('gcs_composer_trigger_dag', default_args=default_args, schedule_interval=None) as dag:

     create_dataproc_cluster = dataproc_operator.DataprocClusterCreateOperator(
        task_id='create_dataproc_cluster',
        cluster_name='composer-311-complaints-{{ ds_nodash }}',
        num_workers=2,
        region=models.Variable.get('region'),
        zone=models.Variable.get('gce_zone'),
        project_id=models.Variable.get('project_id'),
        master_machine_type='n1-standard-1',
        worker_machine_type='n1-standard-1')

     run_dataproc_job = dataproc_operator.DataProcPySparkOperator(
        task_id="run_dataproc_job",
        main="gs://311-complaints-spark_jobs/spark_job.py",
        cluster_name='composer-311-complaints-{{ ds_nodash }}',
        region=models.Variable.get('region'),
        dataproc_pyspark_jars=['gs://spark-lib/bigquery/spark-bigquery-latest.jar'],
        arguments=['gs://{{ dag_run.conf.get("bucket") }}/{{ dag_run.conf.get("name") }}'])

     delete_dataproc_cluster = dataproc_operator.DataprocClusterDeleteOperator(
        task_id='delete_dataproc_cluster',
        cluster_name='composer-311-complaints-{{ ds_nodash }}',
        project_id=models.Variable.get('project_id'),
        region=models.Variable.get('region'),
        trigger_rule=trigger_rule.TriggerRule.ALL_DONE)

     bigquery_transformations = BigQueryOperator(
        sql='/sql/job.sql',
        task_id='bigquery_transformations',
        use_legacy_sql=False,
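
The fragment is truncated and relies on a default_args dict and on values passed in through dag_run.conf, neither of which is shown. A minimal sketch of that assumed setup (values are illustrative):

import datetime

# Assumed default_args for the DAG above; values are illustrative.
default_args = {
    'start_date': datetime.datetime(2021, 1, 1),
    'retries': 1,
    'retry_delay': datetime.timedelta(minutes=5),
}

# The arguments template reads dag_run.conf, so this DAG expects to be triggered
# with a payload such as {"bucket": "my-bucket", "name": "my-file.csv"}, for
# example by a Cloud Function that fires on a GCS object-finalize event.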
Example #8
        import logging
        logging.info('Hello World!')

    create_dataproc_cluster = dataproc_operator.DataprocClusterCreateOperator(
        task_id='create_dataproc_cluster',
        # Give the cluster a unique name by appending the date scheduled.
        # See https://airflow.apache.org/code.html#default-variables
        cluster_name='composer-dataproc-{{ ds_nodash }}',
        num_workers=2,
        region='asia-south1',
        zone='asia-south1-a',
        master_machine_type='n1-standard-1',
        worker_machine_type='n1-standard-1')
    dataprod_pyspark = dataproc_operator.DataProcPySparkOperator(
        task_id='pyspark',
        main='gs://code_deploy/dataproc_read_bucket_to_bigquery.py',
        cluster_name='composer-dataproc-{{ ds_nodash }}',
        region='asia-south1',
        dataproc_pyspark_jars=[])
    delete_dataproc_cluster = dataproc_operator.DataprocClusterDeleteOperator(
        task_id='delete_dataproc_cluster',
        cluster_name='composer-dataproc-{{ ds_nodash }}',
        region='asia-south1',
        # Setting trigger_rule to ALL_DONE causes the cluster to be deleted
        # even if the Dataproc job fails.
        trigger_rule=trigger_rule.TriggerRule.ALL_DONE)

    # An instance of an operator is called a task. In this case, the
    # hello_python task calls the "greeting" Python function.
    hello_python = python_operator.PythonOperator(task_id='hello',
                                                  python_callable=greeting)
Example #9

    create_dataproc_cluster = dataproc_operator.DataprocClusterCreateOperator(
        task_id='create_dataproc_cluster',
        # Give the cluster a unique name by appending the date scheduled.
        # See https://airflow.apache.org/code.html#default-variables
        cluster_name=pipeline_cluster_name,
        num_workers=2,
        region='us-central1',
        autoscaling_policy=
        'projects/{}/regions/us-central1/autoscalingPolicies/ephimeral-scaling-policy'
        .format(os.environ['PROJECT_ID']),
        master_machine_type='n1-standard-1',
        worker_machine_type='n1-standard-1')

    run_py_spark = dataproc_operator.DataProcPySparkOperator(
        task_id='run_py_spark',
        region='us-central1',
        main='gs://{}/data/compute-pi-pipeline/calculate-pi.py'.format(
            os.environ['COMPOSER_BUCKET']),
        arguments=[models.Variable.get("NUM_SAMPLES")],
        cluster_name=pipeline_cluster_name)

    # Delete Cloud Dataproc cluster.
    delete_dataproc_cluster = dataproc_operator.DataprocClusterDeleteOperator(
        task_id='delete_dataproc_cluster',
        region='us-central1',
        cluster_name=pipeline_cluster_name,
        # Setting trigger_rule to ALL_DONE causes the cluster to be deleted
        # even if the Dataproc job fails.
        trigger_rule=trigger_rule.TriggerRule.ALL_DONE)

    # Define DAG dependencies.
    create_dataproc_cluster >> run_py_spark >> delete_dataproc_cluster
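
This example references pipeline_cluster_name, two environment variables and the Airflow Variable NUM_SAMPLES, all defined outside the fragment. A hedged sketch of that assumed setup (names are placeholders):

import os

# Assumed definitions used by the tasks above; values are placeholders.
pipeline_cluster_name = 'compute-pi-cluster-{{ ds_nodash }}'
os.environ.setdefault('PROJECT_ID', 'my-gcp-project')
os.environ.setdefault('COMPOSER_BUCKET', 'my-composer-bucket')
# The run_py_spark task also reads the Airflow Variable NUM_SAMPLES when the
# DAG is parsed.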
Example #10
delete_ml_partition = bash_operator.BashOperator(
    task_id='delete_ml_partition',
    dag=dag,
    bash_command=
    '''bq rm -f -t 'dataops_demo_ml_dev.t_twitter_google${{ macros.ds_format(ds, "%Y-%m-%d", "%Y%m%d") }}' ''',
)

# Execute PySpark job
run_pyspark_job_splitting = dataproc_operator.DataProcPySparkOperator(
    task_id='run_pyspark_job_splitting',
    dag=dag,
    main='gs://' + Variable.get('v_composer_bucket') +
    '/dags/dataproc/twitterPySparkSplitting.py',
    cluster_name='twitter-dataproc-mlanciau-{{ ds_nodash }}',
    dataproc_pyspark_jars=[
        'gs://spark-lib/bigquery/spark-bigquery-latest.jar'
    ],
    arguments=[
        "--dataproc=1.4", "--job_date={{ ds }}",
        "--bucket=dataproc_dataops_tmp"
    ])

run_pyspark_job_frequency = dataproc_operator.DataProcPySparkOperator(
    task_id='run_pyspark_job_frequency',
    dag=dag,
    main='gs://' + Variable.get('v_composer_bucket') +
    '/dags/dataproc/twitterPySparkFrequency.py',
    cluster_name='twitter-dataproc-mlanciau-{{ ds_nodash }}',
    dataproc_pyspark_jars=[
        'gs://spark-lib/bigquery/spark-bigquery-latest.jar'
Example #11
    # dataproc_operator
    # Create small dataproc cluster
    create_dataproc = dataproc_operator.DataprocClusterCreateOperator(
        task_id='create_dataproc',
        cluster_name='dataproc-cluster-demo-{{ ds_nodash }}',
        num_workers=2,
        zone=models.Variable.get('dataproc_zone'),
        master_machine_type='e2-standard-4',
        worker_machine_type='e2-standard-8')

    # Run the PySpark job

    run_spark0 = dataproc_operator.DataProcPySparkOperator(
        task_id='run_spark0',
        main=SPARK_CODE0,
        dataproc_pyspark_jars=jarpath,
        arguments=arglist,
        cluster_name='dataproc-cluster-demo-{{ ds_nodash }}',
        job_name=dataproc_job_name + '0')

    run_spark1 = dataproc_operator.DataProcPySparkOperator(
        task_id='run_spark1',
        main=SPARK_CODE1,
        dataproc_pyspark_jars=jarpath,
        arguments=arglist,
        cluster_name='dataproc-cluster-demo-{{ ds_nodash }}',
        job_name=dataproc_job_name + '1')

    run_spark2 = dataproc_operator.DataProcPySparkOperator(
        task_id='run_spark2',
        main=SPARK_CODE2,
        dataproc_pyspark_jars=jarpath,
        arguments=arglist,
        cluster_name='dataproc-cluster-demo-{{ ds_nodash }}',
        job_name=dataproc_job_name + '2')
Example #12

    # dataproc_operator
    # Create small dataproc cluster
    create_dataproc = dataproc_operator.DataprocClusterCreateOperator(
        task_id='create_dataproc',
        cluster_name='dataproc-cluster-demo-{{ ds_nodash }}',
        num_workers=2,
        zone=None,
        master_machine_type='n1-standard-1',
        worker_machine_type='n1-standard-1',
        region=models.Variable.get('dataproc_zone'))

    # Run the PySpark job
    run_spark = dataproc_operator.DataProcPySparkOperator(
        task_id='run_spark',
        main=SPARK_CODE,
        cluster_name='dataproc-cluster-demo-{{ ds_nodash }}',
        job_name=dataproc_job_name,
        region=models.Variable.get('dataproc_zone'))

    # dataproc_operator
    # Delete Cloud Dataproc cluster.
    delete_dataproc = dataproc_operator.DataprocClusterDeleteOperator(
        task_id='delete_dataproc',
        cluster_name='dataproc-cluster-demo-{{ ds_nodash }}',
        trigger_rule=trigger_rule.TriggerRule.ALL_DONE,
        region=models.Variable.get('dataproc_zone'))

    # STEP 6: Set DAG dependencies
    # Each task runs only after the previous task has finished.
    print_date >> create_dataproc >> run_spark >> delete_dataproc
Example #13
        # See https://airflow.apache.org/code.html#default-variables
        cluster_name='accomodation-cluster-{{ ds_nodash }}',
        num_workers=2,
        init_actions_uris=['gs://able-cogency-234306/tmp/cloud-sql-proxy.sh'],
        service_account_scopes=['https://www.googleapis.com/auth/cloud-platform','https://www.googleapis.com/auth/sqlservice.admin'],
        metadata={'enable-cloud-sql-hive-metastore':'false','additional-cloud-sql-instances':'able-cogency-234306:us-central1:testddd'},
        region='us-central1',
        zone=models.Variable.get('gce_zone'),
        master_machine_type='n1-standard-1',
        worker_machine_type='n1-standard-1')

    # Run the CSV import PySpark job on the Cloud Dataproc cluster.
    csv_import_job = dataproc_operator.DataProcPySparkOperator(
        task_id='csv_import_job',
        main=CSVIMPORTPY,
        cluster_name='accomodation-cluster-{{ ds_nodash }}',
        job_name='csv_import_job',
        region='us-central1')

    # Run the accommodation model PySpark job on the Cloud Dataproc cluster.
    accomodation_model_job = dataproc_operator.DataProcPySparkOperator(
        task_id='accomodation_model_job',
        main=MODELPY,
        cluster_name='accomodation-cluster-{{ ds_nodash }}',
        job_name='accomodation_model_job',
        region='us-central1')

    # Delete Cloud Dataproc cluster.
    delete_dataproc_cluster = dataproc_operator.DataprocClusterDeleteOperator(
        task_id='delete_dataproc_cluster',
Example #14
        # Give the cluster a unique name by appending the date scheduled.
        # See https://airflow.apache.org/code.html#default-variables
        cluster_name='composer-hadoop-tutorial-cluster-{{ ds_nodash }}',
        num_workers=2,
        zone=models.Variable.get('gce_zone'),
        master_machine_type='n1-standard-1',
        worker_machine_type='n1-standard-1')

    # Submit the PySpark job.
    submit_pyspark1 = dataproc_operator.DataProcPySparkOperator(
        task_id='submit_pyspark1',
        main=PYSPARK_JOB,
        # Must match the name of the cluster created by the prior operator.
        cluster_name='composer-hadoop-tutorial-cluster-{{ ds_nodash }}',
        dataproc_jars='gs://spark-lib/bigquery/spark-bigquery-latest.jar',
        dataproc_pyspark_jars='gs://spark-lib/bigquery/spark-bigquery-latest.jar')


    submit_pyspark2 = dataproc_operator.DataProcPySparkOperator(
        task_id='submit_pyspark2',
        main=PYSPARK_JOB,
        # Must match the name of the cluster created by the prior operator.
        cluster_name='composer-hadoop-tutorial-cluster-{{ ds_nodash }}',
        dataproc_jars='gs://spark-lib/bigquery/spark-bigquery-latest.jar',
        dataproc_pyspark_jars='gs://spark-lib/bigquery/spark-bigquery-latest.jar')


Example #15
    create_dataproc_cluster = dataproc_operator.DataprocClusterCreateOperator(
        task_id='create_dataproc_cluster',
        project_id='dataproc-300110',
        cluster_name='cluster-58-wb',
        num_workers=2,
        region='us-east1',
        init_actions_uris=['gs://worldbank2021/code/init_cluster.sh'],
        master_machine_type='n1-standard-2',
        worker_machine_type='n1-standard-2')

dataproc_pyspark_1 = dataproc_operator.DataProcPySparkOperator(
    task_id='Load_BQ_spark_job_1',
    # call the py file for processing
    #    main='gs://dataproc-nyc-taxi-2020/code_deploy/dataproc_wb.py',
    main='gs://worldbank2021/code/dataproc_load_bq.py',
    cluster_name='cluster-58-wb',
    region='us-east1',
    arguments=['wb_country_series_definition'],
    dataproc_pyspark_jars=[
        'gs://spark-lib/bigquery/spark-bigquery-latest.jar'
    ])

dataproc_pyspark_2 = dataproc_operator.DataProcPySparkOperator(
    task_id='Load_BQ_spark_job_2',
    main='gs://worldbank2021/code/dataproc_load_bq.py',
    cluster_name='cluster-58-wb',
    region='us-east1',
    arguments=['wb_country_summary'],
    dataproc_pyspark_jars=[
        'gs://spark-lib/bigquery/spark-bigquery-latest.jar'
    ])