Example #1
File: dataproc.py Project: turbaszek/dbnd
    def create_engine(cls):
        from airflow.contrib.hooks.gcp_dataproc_hook import DataProcHook
        from airflow.contrib.operators import dataproc_operator

        from dbnd._core.current import get_settings

        cloud = get_settings().get_env_config(CloudType.gcp)

        gcp_conn_id = cloud.conn_id

        dataproc_config = DataprocConfig()
        cluster_hook = DataProcHook(gcp_conn_id=gcp_conn_id)

        return dataproc_operator.DataprocClusterCreateOperator(
            task_id="create_dataproc_cluster",
            project_id=cluster_hook.project_id,
            cluster_name=dataproc_config.cluster,
            gcp_conn_id=gcp_conn_id,
            num_workers=dataproc_config.num_workers,
            zone=dataproc_config.zone,
            network_uri=dataproc_config.network_uri,
            subnetwork_uri=dataproc_config.subnetwork_uri,
            tags=dataproc_config.tags,
            storage_bucket=dataproc_config.storage_bucket,
            init_actions_uris=dataproc_config.init_actions_uris,
            init_action_timeout=dataproc_config.init_action_timeout,
            metadata=dataproc_config.metadata,
            image_version=dataproc_config.image_version,
            properties=dataproc_config.properties,
            master_machine_type=dataproc_config.master_machine_type,
            master_disk_size=dataproc_config.master_disk_size,
            worker_machine_type=dataproc_config.worker_machine_type,
            worker_disk_size=dataproc_config.worker_disk_size,
            num_preemptible_workers=dataproc_config.num_preemptible_workers,
            labels=dataproc_config.labels,
            delegate_to=dataproc_config.delegate_to,
            service_account=dataproc_config.service_account,
            service_account_scopes=dataproc_config.service_account_scopes,
            idle_delete_ttl=dataproc_config.idle_delete_ttl,
            auto_delete_time=dataproc_config.auto_delete_time,
            auto_delete_ttl=dataproc_config.auto_delete_ttl,
        )
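Every field read from DataprocConfig() above comes from configuration that this excerpt does not show. Purely for orientation, a hypothetical stand-in with the same attribute names might look like the sketch below; all defaults are assumptions, not dbnd's actual values.

# Hypothetical sketch: dbnd's real DataprocConfig (dbnd-gcp plugin) is richer than this;
# the attribute names simply mirror the ones read by create_engine() above.
from dataclasses import dataclass, field
from typing import Dict, List, Optional

@dataclass
class DataprocConfigSketch:
    cluster: str = "dbnd-dataproc"              # assumed default
    num_workers: int = 2                        # assumed default
    zone: Optional[str] = None
    network_uri: Optional[str] = None
    subnetwork_uri: Optional[str] = None
    tags: List[str] = field(default_factory=list)
    storage_bucket: Optional[str] = None
    init_actions_uris: List[str] = field(default_factory=list)
    init_action_timeout: str = "10m"            # assumed default
    metadata: Dict[str, str] = field(default_factory=dict)
    image_version: Optional[str] = None
    properties: Dict[str, str] = field(default_factory=dict)
    master_machine_type: str = "n1-standard-2"  # assumed default
    master_disk_size: int = 500                 # assumed default
    worker_machine_type: str = "n1-standard-2"  # assumed default
    worker_disk_size: int = 500                 # assumed default
    num_preemptible_workers: int = 0
    labels: Dict[str, str] = field(default_factory=dict)
    delegate_to: Optional[str] = None
    service_account: Optional[str] = None
    service_account_scopes: List[str] = field(default_factory=list)
    idle_delete_ttl: Optional[int] = None
    auto_delete_time: Optional[str] = None
    auto_delete_ttl: Optional[int] = None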
Example #2
        'automation_dataproc_hive_create',
        schedule_interval=datetime.timedelta(days=1),
        default_args=default_dag_args) as dag:

    # Create a Cloud Dataproc cluster.
    create_dataproc_cluster = dataproc_operator.DataprocClusterCreateOperator(
        task_id='create_dataproc_cluster',
        cluster_name='dataproc',
        # project_id=models.Variable.get('PROJECT_ID', 'dataproc-usecase-276215'),
        project_id='dataproc-usecase-276215',
        region='us-west1',
        num_masters=1,
        num_workers=2,
        zone='us-west1-b',
        master_machine_type='n1-standard-1',
        master_disk_size=100,
        worker_disk_size=100,
        num_preemptible_workers=0,
        worker_machine_type='n1-standard-1',
        idle_delete_ttl=1800,
        subnetwork_uri='ctl',
        optional_components=['PRESTO', 'SOLR', 'RANGER'],
        # service_account_scopes=['cloud-platform', 'default', 'sql-admin'],
        # init_actions_uris=['gs://us-west4-test1-d1c785e8-bucket/data/cloud-sql-proxy/cloud-sql-proxy.sh'],
        metadata={
            'kms-key-uri': 'projects/dataproc-usecase-276215/locations/global/keyRings/my-key-ring/cryptoKeys/migration-key',
            'db-hive-password-uri': 'gs://secrectkeybucket2/hive-password.encrypted',
            'use-cloud-sql-private-ip': 'true',
            'hive-metastore-instance': 'dataproc-usecase-276215:us-west1:hive-metadata'})
		
		
    # Run the Hive job on the Cloud Dataproc cluster
    run_dataproc_hive_create_db = DataProcHiveOperator(
        task_id='create_db',
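The excerpt stops inside the DataProcHiveOperator call. A hedged completion that reuses the cluster and region defined above (the HiveQL statement is a placeholder, not taken from the source) might continue:

        # Assumed continuation of the call above; the query is illustrative only.
        region='us-west1',
        cluster_name='dataproc',
        query='CREATE DATABASE IF NOT EXISTS automation_db;')

    # Assumed ordering: the cluster must exist before the Hive job runs.
    create_dataproc_cluster >> run_dataproc_hive_create_db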
Example #3
default_dag_args = {
    'start_date': yesterday,
    'email_on_failure': False,
    'email_on_retry': False,
    'email': ['*****@*****.**'],
}

with DAG('gcp_data_platform',
         schedule_interval=datetime.timedelta(days=1),
         default_args=default_dag_args) as dag:

    create_dataproc_cluster = dataproc_operator.DataprocClusterCreateOperator(
        project_id=PROJECT_ID,
        task_id='create_dataproc_cluster',
        cluster_name='gcp-data-platform',
        num_workers=0,
        zone='us-west1-a',
        master_machine_type='n1-highmem-4',
    )

    run_dataproc_spark = dataproc_operator.DataProcSparkOperator(
        task_id='events_dataproc',
        cluster_name='gcp-data-platform',
        region=REGION,
        main_class='io.dagster.events.EventPipeline',
        dataproc_spark_jars=[
            '%s/events-assembly-%s.jar' %
            (DEPLOY_BUCKET_PREFIX, LATEST_JAR_HASH)
        ],
        arguments=[
            '--gcs-input-bucket',
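The argument list is cut off here. Whatever the remaining flags are, the two tasks in this DAG would normally be ordered so the cluster exists before the Spark job is submitted; a hedged sketch of that wiring:

    # Assumed wiring, not shown in the excerpt.
    create_dataproc_cluster >> run_dataproc_spark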
Example #4
    'retries': 1,
    'retry_delay': datetime.timedelta(minutes=5),
    'project_id': 'upc-bdm'
}

with models.DAG(
        'store_data_integration',
        schedule_interval=datetime.timedelta(days=1),
        default_args=default_dag_args) as dag:
    # Create a Cloud Dataproc cluster.
    create_dataproc_cluster = dataproc_operator.DataprocClusterCreateOperator(
        task_id='create_dataproc_cluster',
        cluster_name='composer-data-integration-cluster-{{ ds_nodash }}',
        storage_bucket='listery-staging',
        num_workers=2,
        master_disk_size=20,
        worker_disk_size=20,
        num_preemptible_workers=1,
        zone='us-east1-c',
        master_machine_type='n1-standard-4',
        worker_machine_type='n1-standard-4')

    run_integration_job = dataproc_operator.DataProcSparkOperator(
        task_id='run_integration_job',
        main_jar=SPARK_JOBS_JAR,
        cluster_name='composer-data-integration-cluster-{{ ds_nodash }}',
        arguments=["--local", "false", "--subprogram", "integration"])

    offer_integration = dataproc_operator.DataProcSparkOperator(
        task_id='offer_integration',
        main_jar=SPARK_JOBS_JAR,
Example #5
    # Request for job
    request_job = PythonOperator(
        task_id='Request_for_dataproc_job',
        python_callable=notify_success,
        dag=dag,
        on_success_callback=alert_job_requester
    )

    #Creating dataproc cluster.
    create_dataproc_cluster = dpo.DataprocClusterCreateOperator(
        project_id = projectID,
        task_id = 'create_dataproc_cluster',
        cluster_name = cluster_name,
        num_workers = num_workers,
        region = region,
        #zone = 'us-central1-a',
        #network_uri = 'default',
        subnetwork_uri = subnet,
        properties = cluster_properties,
        on_success_callback=task_success_slack_alert,
        trigger_rule = trigger_rule.TriggerRule.ALL_SUCCESS
    )

    #Run spark job on the above cluster.
    run_spark_job = BashOperator(
        task_id = 'run_spark_job',
        bash_command = bash_command,
        dag=dag,
        on_success_callback=task_success_slack_alert,
        trigger_rule = trigger_rule.TriggerRule.ALL_SUCCESS
    )
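This excerpt defines the request, cluster-creation, and Spark-submit tasks but ends before any dependencies are declared. A plausible chain, using only the task variables defined above, would be:

    # Assumed dependency chain (not shown in the excerpt).
    request_job >> create_dataproc_cluster >> run_spark_job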
Example #6
dag = DAG(
    'dataproc_dag',
    default_args=default_args,
    schedule_interval="@once",
)

# Create a Cloud Dataproc cluster.
create_dataproc_cluster = dataproc_operator.DataprocClusterCreateOperator(
    task_id='create_dataproc_cluster',
    cluster_name='airflow-dataproc-cluster',
    project_id='hd-personalization-dev',
    num_workers=0,
    init_actions_uris=[
        'gs://hd-personalization-dev-data/vdc2136/training/updated/initialization_actions.sh'
    ],
    zone='us-east1-c',
    master_machine_type='n1-standard-8',
    subnetwork_uri=
    'https://www.googleapis.com/compute/v1/projects/hd-personalization-dev/regions/us-east1/subnetworks/batch-us-east1-subnet',
    tags=['all-bastion-ssh', 'dataproc', 'cassandra'],
    storage_bucket='hd-personalization-dev-batch',
    properties={'dataproc:dataproc.allow.zero.workers': 'true'},
    dag=dag)

dataproc_pyspark_submit = dataproc_operator.DataProcPySparkOperator(
    task_id='pyspark_task',
    main=
    'gs://hd-personalization-dev-artifacts/releases/com.homedepot.recommendations/collections-model-training/python-scripts/v0.0.0+16/__main__.py',
    pyfiles=[
        'gs://hd-personalization-dev-artifacts/releases/com.homedepot.recommendations/collections-model-training/python-scripts/v0.0.0+16/collections_model_training-0.0.1-py3.7.egg'
    ],
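The PySpark submit call is cut off after pyfiles. A hedged continuation, reusing the cluster name from this example (the region is an assumption matching the us-east1-c zone above):

    # Assumed continuation of the call above.
    cluster_name='airflow-dataproc-cluster',
    region='us-east1',
    dag=dag)

# Assumed ordering: create the cluster before submitting the PySpark job.
create_dataproc_cluster >> dataproc_pyspark_submit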
Example #7
default_dag_args = {
    "start_date": yesterday,
    "email_on_failure": False,
    "email_on_retry": False,
    "email": ["*****@*****.**"],
}

with DAG("gcp_data_platform",
         schedule_interval=datetime.timedelta(days=1),
         default_args=default_dag_args) as dag:

    create_dataproc_cluster = dataproc_operator.DataprocClusterCreateOperator(
        project_id=PROJECT_ID,
        task_id="create_dataproc_cluster",
        cluster_name="gcp-data-platform",
        num_workers=0,
        zone="us-west1a",
        master_machine_type="n1-highmem-4",
    )

    run_dataproc_spark = dataproc_operator.DataProcSparkOperator(
        task_id="events_dataproc",
        cluster_name="gcp-data-platform",
        region=REGION,
        main_class="io.dagster.events.EventPipeline",
        dataproc_spark_jars=[
            "%s/events-assembly-%s.jar" %
            (DEPLOY_BUCKET_PREFIX, LATEST_JAR_HASH)
        ],
        arguments=[
            "--gcs-input-bucket",
Example #8
}

# [START composer_quickstart_schedule]
with models.DAG(
        'composer_csv2parquetv4',
        # Continue to run DAG once per day
        schedule_interval=datetime.timedelta(days=1),
        default_args=default_dag_args) as dag:
    # [END composer_quickstart_schedule]

    # Create a Cloud Dataproc cluster.
    create_dataproc_cluster = dataproc_operator.DataprocClusterCreateOperator(
        task_id='create_dataproc_cluster',
        # Give the cluster a unique name by appending the date scheduled.
        # See https://airflow.apache.org/code.html#default-variables
        cluster_name='parquetconverter2',
        num_workers=3,
        zone='europe-west1-b',
        master_machine_type='n1-standard-1',
        worker_machine_type='n1-standard-1')

    # Run the pyspark CSV2PARQUET example
    run_dataproc_csv2parquet = dataproc_operator.DataProcPySparkOperator(
        task_id='run_dataproc_parquetconvert',
        cluster_name='parquetconverter2',
        main='gs://alex-code/convert.py')

    # Delete Cloud Dataproc cluster.
    delete_dataproc_cluster = dataproc_operator.DataprocClusterDeleteOperator(
        task_id='delete_dataproc_cluster',
        cluster_name='parquetconverter2',
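The delete step is cut off above. In the other quickstart-style examples on this page it finishes with a trigger rule so teardown runs even if the job fails, followed by the task chain; a hedged continuation (assuming the usual from airflow.utils import trigger_rule at the top of the file):

        # Assumed continuation: delete the cluster even if the PySpark job failed.
        trigger_rule=trigger_rule.TriggerRule.ALL_DONE)

    # Assumed task ordering.
    create_dataproc_cluster >> run_dataproc_csv2parquet >> delete_dataproc_cluster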
Example #9
default_dag_args = {
    'start_date': yesterday,
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 1,
    'retry_delay': dt.timedelta(seconds=30),
    'project_id': models.Variable.get('gcp_project')
}

with DAG('dataproc_spark_submit', schedule_interval='0 17 * * *',
    default_args=default_dag_args) as dag:

    create_dataproc_cluster = dpo.DataprocClusterCreateOperator(
        project_id = default_dag_args['project_id'],
        task_id = 'create_dataproc_cluster',
        cluster_name = CLUSTER_NAME,
        num_workers = 2,
        zone = models.Variable.get('gce_zone')
    )

    run_spark_job = dpo.DataProcSparkOperator(
        task_id = 'run_spark_job',
        #main_jar = MAIN_JAR,
        main_class = MAIN_CLASS,
        cluster_name = CLUSTER_NAME
    )

    delete_dataproc_cluster = dpo.DataprocClusterDeleteOperator(
        project_id = default_dag_args['project_id'],
        task_id = 'delete_dataproc_cluster',
        cluster_name = CLUSTER_NAME,
Example #10
pipeline_cluster_name = 'cluster-2-compute-pi-{{ ds_nodash }}'

with models.DAG(
        'Compute-PI',
        # Continue to run DAG once per day
        schedule_interval=datetime.timedelta(days=1),
        default_args=default_dag_args) as dag:

    # Create a Cloud Dataproc cluster.
    create_dataproc_cluster = dataproc_operator.DataprocClusterCreateOperator(
        task_id='create_dataproc_cluster',
        # Give the cluster a unique name by appending the date scheduled.
        # See https://airflow.apache.org/code.html#default-variables
        cluster_name=pipeline_cluster_name,
        num_workers=2,
        region='us-central1',
        autoscaling_policy=
        'projects/{}/regions/us-central1/autoscalingPolicies/ephimeral-scaling-policy'
        .format(os.environ['PROJECT_ID']),
        master_machine_type='n1-standard-1',
        worker_machine_type='n1-standard-1')

    run_py_spark = dataproc_operator.DataProcPySparkOperator(
        task_id='run_py_spark',
        region='us-central1',
        main='gs://{}/data/compute-pi-pipeline/calculate-pi.py'.format(
            os.environ['COMPOSER_BUCKET']),
        arguments=[models.Variable.get("NUM_SAMPLES")],
        cluster_name=pipeline_cluster_name)

    # Delete Cloud Dataproc cluster.
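The excerpt ends at the comment announcing the delete step. A hedged sketch of that step and the wiring, reusing pipeline_cluster_name, the region, and the PROJECT_ID environment variable from this snippet (trigger_rule import assumed):

    # Sketch of the announced delete step; values reuse names defined in this snippet.
    delete_dataproc_cluster = dataproc_operator.DataprocClusterDeleteOperator(
        task_id='delete_dataproc_cluster',
        cluster_name=pipeline_cluster_name,
        region='us-central1',
        project_id=os.environ['PROJECT_ID'],
        trigger_rule=trigger_rule.TriggerRule.ALL_DONE)

    # Assumed ordering.
    create_dataproc_cluster >> run_py_spark >> delete_dataproc_cluster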
Example #11
        # Continue to run DAG once per day
        schedule_interval=datetime.timedelta(days=1),
        default_args=default_dag_args) as dag:

    # Create a Cloud Dataproc cluster.
    create_dataproc_cluster = dataproc_operator.DataprocClusterCreateOperator(
        task_id='create_dataproc_cluster',
        # Give the cluster a unique name by appending the date scheduled.
        # See https://airflow.apache.org/code.html#default-variables
        cluster_name='sqoop-cluster-{{ ds_nodash }}',
        num_workers=2,
        zone='us-east4-a',
        master_machine_type='n1-standard-1',
        worker_machine_type='n1-standard-1',
        init_actions_uris=[
            'gs://dataproc-initialization-actions/cloud-sql-proxy/cloud-sql-proxy.sh'
        ],
        properties={
            'hive:hive.metastore.warehouse.dir':
            'gs://dannydataproc/hive-warehouse'
        },
        metadata={
            'enable-cloud-sql-hive-metastore': 'false',
            'additional-cloud-sql-instances':
            'dannydataproc:us-central1:sqooptest',
            'hive-metastore-instance': 'dannydataproc:us-central1:sqooptest'
        })

    # Run the Hadoop wordcount example installed on the Cloud Dataproc cluster
    # master node.
    run_dataproc_hadoop = dataproc_operator.DataProcHadoopOperator(
        task_id='run_dataproc_hadoop',
Example #12
    # detected in the Cloud Storage bucket.
    'start_date': days_ago(1),
    'retries': 1,
    'retry_delay': datetime.timedelta(minutes=5)
}

with models.DAG(
    'acme_sales_staging_composer',
    schedule_interval = datetime.timedelta(days=1),
    default_args=default_dag_args) as dag:

    create_dataproc_acme_sales_cluster = dataproc_operator.DataprocClusterCreateOperator(
        task_id='create_dataproc_acme_sales_cluster',
        cluster_name=DATAPROC_CLUSTER_NAME,
        region=REGION,
        zone=ZONE,
        num_workers=3,
        master_machine_type=DATAPROC_MASTER_MACHINE_TYPE,
        worker_machine_type=DATAPROC_MASTER_MACHINE_TYPE,
        image_version=IMAGE_VERSION,
        project_id=PROJECT_ID)

    locations_staging_spark_job = DataprocSubmitJobOperator(
        task_id="locations_staging_spark_job",
        job=LOCATIONS_STAGING_SPARK_JOB,
        location=REGION,
        project_id=PROJECT_ID)

    products_staging_spark_job = DataprocSubmitJobOperator(
        task_id="products_staging_spark_job",
        job=PRODUCTS_STAGING_SPARK_JOB,
        location=REGION,
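LOCATIONS_STAGING_SPARK_JOB and PRODUCTS_STAGING_SPARK_JOB are defined outside this excerpt. DataprocSubmitJobOperator expects a Dataproc Job dictionary, so such a constant presumably looks roughly like the sketch below (the class name and JAR URI are placeholders, not from the source):

# Hypothetical job spec for DataprocSubmitJobOperator; main_class and the JAR URI are placeholders.
LOCATIONS_STAGING_SPARK_JOB = {
    "reference": {"project_id": PROJECT_ID},
    "placement": {"cluster_name": DATAPROC_CLUSTER_NAME},
    "spark_job": {
        "main_class": "com.example.staging.LocationsStagingJob",
        "jar_file_uris": ["gs://<staging-bucket>/jars/sales-staging-assembly.jar"],
    },
}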
Example #13
# [START composer_quickstart_schedule]
with models.DAG(
        'composer_accomodation_model',
        # Continue to run DAG once per day
        schedule_interval=datetime.timedelta(days=1),
        default_args=default_dag_args) as dag:
    # [END composer_quickstart_schedule]

    # Create a Cloud Dataproc cluster.
    create_dataproc_cluster = dataproc_operator.DataprocClusterCreateOperator(
        task_id='create_dataproc_cluster',
        # Give the cluster a unique name by appending the date scheduled.
        # See https://airflow.apache.org/code.html#default-variables
        cluster_name='accomodation-cluster-{{ ds_nodash }}',
        num_workers=2,
        init_actions_uris=['gs://able-cogency-234306/tmp/cloud-sql-proxy.sh'],
        service_account_scopes=['https://www.googleapis.com/auth/cloud-platform','https://www.googleapis.com/auth/sqlservice.admin'],
        metadata={'enable-cloud-sql-hive-metastore':'false','additional-cloud-sql-instances':'able-cogency-234306:us-central1:testddd'},
        region='us-central1',
        zone=models.Variable.get('gce_zone'),
        master_machine_type='n1-standard-1',
        worker_machine_type='n1-standard-1')

    # Run the Hadoop wordcount example installed on the Cloud Dataproc cluster
    # master node.
    csv_import_job = dataproc_operator.DataProcPySparkOperator(
        task_id='csv_import_job',
        main=CSVIMPORTPY,
        cluster_name='accomodation-cluster-{{ ds_nodash }}',
        job_name='csv_import_job',
        region='us-central1')
Example #14
        default_args=default_dag_args) as dag:

    # Create a Cloud Dataproc cluster.
    create_dataproc_cluster = dataproc_operator.DataprocClusterCreateOperator(
        task_id='create_dataproc_cluster',
        cluster_name='dataproc',
        # project_id=models.Variable.get('PROJECT_ID', 'dataproc-usecase-276215'),
        project_id='dataproc-usecase-276215',
        region='us-west1',
        num_masters=1,
        num_workers=2,
        zone='us-west1-b',
        master_machine_type='n1-standard-1',
        master_disk_size=100,
        worker_disk_size=100,
        num_preemptible_workers=0,
        worker_machine_type='n1-standard-1',
        idle_delete_ttl=1800,
        subnetwork_uri='ctl',
        optional_components=['PRESTO', 'SOLR', 'RANGER'],
        service_account_scopes=['https://www.googleapis.com/auth/cloud-platform'],
        # service_account_scopes=['default'],
        internal_ip_only=False,
        # init_actions_uris=['gs://goog-dataproc-initialization-actions-us-west1/cloud-sql-proxy/cloud-sql-proxy.sh'],
        metadata={
            'kms-key-uri': 'projects/dataproc-usecase-276215/locations/global/keyRings/my-key-ring/cryptoKeys/migration-key',
            'db-hive-password-uri': 'gs://secrectkeybucket2/hive-password.encrypted',
            'use-cloud-sql-private-ip': 'true',
            'db-admin-password-uri': 'gs://secrectkeybucket2/admin-password.encrypted',
            'hive-metastore-instance': 'dataproc-usecase-276215:us-west1:hive-metadata'},
        init_actions_uris=['gs://dataproc-staging-us-west4-17809115036-8cg5zgf1/cloud-sql-proxy/cloud-sql-proxy.sh'],
        init_action_timeout="10m")
        
		
		
Example #15
    'retries': 1,
    'retry_delay': datetime.timedelta(minutes=5),
    'project_id': models.Variable.get('gcp_project')
}
with models.DAG(
        'composer_sample_quickstart',
        # Continue to run DAG once per day
        schedule_interval=datetime.timedelta(days=1),
        default_args=default_dag_args) as dag:
    # Create a Cloud Dataproc cluster.
    create_dataproc_cluster = dataproc_operator.DataprocClusterCreateOperator(
        task_id='create_dataproc_cluster',
        # Give the cluster a unique name by appending the date scheduled.
        # See https://airflow.apache.org/code.html#default-variables
        cluster_name='composer-hadoop-tutorial-cluster-{{ ds_nodash }}',
        num_workers=2,
        region='us-central1',
        zone=models.Variable.get('gce_zone'),
        image_version='2.0',
        master_machine_type='n1-standard-2',
        worker_machine_type='n1-standard-2')
    # Run the Hadoop wordcount example installed on the Cloud Dataproc cluster
    # master node.
    run_dataproc_hadoop = dataproc_operator.DataProcHadoopOperator(
        task_id='run_dataproc_hadoop',
        region='us-central1',
        main_jar=WORDCOUNT_JAR,
        cluster_name='composer-hadoop-tutorial-cluster-{{ ds_nodash }}',
        arguments=wordcount_args)
    # Delete Cloud Dataproc cluster.
    delete_dataproc_cluster = dataproc_operator.DataprocClusterDeleteOperator(
Example #16
    'start_date': yesterday,
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 1,
    'retry_delay': datetime.timedelta(minutes=5),
    'project_id': models.Variable.get('gcp_project')
}

with models.DAG('composer_hadoop_wordcount',
                schedule_interval=datetime.timedelta(days=1),
                default_args=default_dag_args) as dag:

    create_dataproc_cluster = dataproc_operator.DataprocClusterCreateOperator(
        task_id='create_dataproc_cluster',
        cluster_name='spikey-wordcount-cluster-{{ ds_nodash }}',
        num_workers=2,
        zone=models.Variable.get('gce_zone'),
        master_machine_type='n1-standard-1',
        worker_machine_type='n1-standard-1')

    run_dataproc_hadoop = dataproc_operator.DataProcHadoopOperator(
        task_id='run_dataproc_hadoop',
        main_jar=WORDCOUNT_JAR,
        cluster_name='spikey-wordcount-cluster-{{ ds_nodash }}',
        arguments=wordcount_args)

    delete_dataproc_cluster = dataproc_operator.DataprocClusterDeleteOperator(
        task_id='delete_dataproc_cluster',
        cluster_name='spikey-wordcount-cluster-{{ ds_nodash }}',
        trigger_rule=trigger_rule.TriggerRule.ALL_DONE)
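The excerpt ends right after the delete operator; the wordcount examples elsewhere on this page normally finish by chaining the three tasks, e.g.:

    # Assumed dependency chain (not shown in the excerpt).
    create_dataproc_cluster >> run_dataproc_hadoop >> delete_dataproc_cluster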
Example #17
          default_args=default_dag_args,
          description='ETL using ephemeral Hadoop cluster',
          dagrun_timeout=timedelta(minutes=50),
          max_active_runs=1)

# Create a Cloud Dataproc cluster.
create_dataproc_cluster = dataproc_operator.DataprocClusterCreateOperator(
    task_id='create_dataproc_cluster',
    dag=dag,
    project_id=os.environ.get('GCP_PROJECT'),
    cluster_name='twitter-dataproc-mlanciau-{{ ds_nodash }}',
    num_workers=2,
    num_preemptible_workers=1,
    zone='europe-west6-c',
    master_machine_type='n1-standard-1',
    worker_machine_type='n1-standard-1',
    idle_delete_ttl=3600,
    image_version='1.4',
    storage_bucket='dataproc_dataops_tmp',
    #    storage_bucket='gs://{{ var.value.v_twitter_temp_bucket }}',
    subnetwork_uri='https://www.googleapis.com/compute/v1/projects/' +
    os.environ.get('GCP_PROJECT') +
    '/regions/europe-west6/subnetworks/default',
    # Requires Private Google Access on subnetwork 'default':
    #   gcloud compute networks subnets update default \
    #     --region=europe-west6 --enable-private-ip-google-access
    internal_ip_only=True,
)

delete_ml_partition = bash_operator.BashOperator(
    task_id='delete_ml_partition',
    dag=dag,
    bash_command=
    '''bq rm -f -t 'dataops_demo_ml_dev.t_twitter_google${{ macros.ds_format(ds, "%Y-%m-%d", "%Y%m%d") }}' ''',
Example #18
                description='DAG for deployment a Dataproc Cluster',
                schedule_interval=timedelta(days=1),
                default_args=default_dag_args) as dag:

    # STEP 5: Set Operators

    # BashOperator
    # A simple print date
    print_date = BashOperator(task_id='print_date', bash_command='date')

    # dataproc_operator
    # Create small dataproc cluster
    create_dataproc = dataproc_operator.DataprocClusterCreateOperator(
        task_id='create_dataproc',
        cluster_name='dataproc-cluster-demo-{{ ds_nodash }}',
        num_workers=2,
        zone=models.Variable.get('dataproc_zone'),
        master_machine_type='n1-standard-1',
        worker_machine_type='n1-standard-1')

    # Run the PySpark job
    run_spark = dataproc_operator.DataProcPySparkOperator(
        task_id='run_spark',
        main=SPARK_CODE,
        cluster_name='dataproc-cluster-demo-{{ ds_nodash }}',
        job_name=dataproc_job_name)

    # dataproc_operator
    # Delete Cloud Dataproc cluster.
    delete_dataproc = dataproc_operator.DataprocClusterDeleteOperator(
        task_id='delete_dataproc',
Example #19
    'project_id': models.Variable.get('gcp_project')
}

with models.DAG('wordcount_hadoop',
                schedule_interval=datetime.timedelta(days=1),
                default_args=default_dag_args) as dag:

    # Check that the input file exists before doing anything else.
    check_file_existence = BashOperator(
        task_id='check_file_existence',
        bash_command='if [ ! -f "{}" ]; then exit 1; fi'.format(input_file))

    # Create the Dataproc cluster.
    create_dataproc_cluster = dataproc_operator.DataprocClusterCreateOperator(
        task_id='create_dataproc_cluster',
        cluster_name='quickstart-cluster-{{ ds_nodash }}',
        num_workers=2,
        zone=models.Variable.get('gce_zone'),
        master_machine_type='n1-standard-1',
        worker_machine_type='n1-standard-1')

    # Submit an Apache Hadoop job.
    run_dataproc_hadoop = dataproc_operator.DataProcHadoopOperator(
        task_id='run_dataproc_hadoop',
        main_jar=WORDCOUNT_JAR,
        cluster_name='quickstart-cluster-{{ ds_nodash }}',
        arguments=wordcount_args)
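As in the other examples, the excerpt ends before any dependencies are declared; a plausible chain using only the task ids defined above:

    # Assumed wiring: verify the input, create the cluster, then run the wordcount job.
    check_file_existence >> create_dataproc_cluster >> run_dataproc_hadoop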
Example #20
}

# [START composer_hadoop_schedule_airflow_1]
with models.DAG(
        'composer_hadoop_tutorial',
        # Continue to run DAG once per day
        schedule_interval=datetime.timedelta(days=1),
        default_args=default_dag_args) as dag:
    # [END composer_hadoop_schedule_airflow_1]

    # Create a Cloud Dataproc cluster.
    create_dataproc_cluster = dataproc_operator.DataprocClusterCreateOperator(
        task_id='create_dataproc_cluster',
        # Give the cluster a unique name by appending the date scheduled.
        # See https://airflow.apache.org/docs/apache-airflow/stable/macros-ref.html
        cluster_name='composer-hadoop-tutorial-cluster-{{ ds_nodash }}',
        num_workers=2,
        zone=models.Variable.get('gce_zone'),
        master_machine_type='n1-standard-2',
        worker_machine_type='n1-standard-2')

    # Run the Hadoop wordcount example installed on the Cloud Dataproc cluster
    # master node.
    run_dataproc_hadoop = dataproc_operator.DataProcHadoopOperator(
        task_id='run_dataproc_hadoop',
        main_jar=WORDCOUNT_JAR,
        cluster_name='composer-hadoop-tutorial-cluster-{{ ds_nodash }}',
        arguments=wordcount_args)

    # Delete Cloud Dataproc cluster.
    delete_dataproc_cluster = dataproc_operator.DataprocClusterDeleteOperator(
Example #21
            'start_date': datetime.datetime(2017, 1, 1),
}
def print_context(**kwargs):
    print(kwargs)
    file_name=kwargs['dag_run'].conf['name']
    bucket=kwargs['dag_run'].conf['bucket']
    bucket_path="gs://{}/{}".format(bucket,file_name)
    kwargs['ti'].xcom_push(key="bucket_path",value=bucket_path)

with airflow.DAG('gcs_composer_trigger_dag',default_args=default_args, schedule_interval=None) as dag:

     create_dataproc_cluster = dataproc_operator.DataprocClusterCreateOperator(
        task_id='create_dataproc_cluster',
        cluster_name='composer-311-complaints-{{ ds_nodash }}',
        num_workers=2,
        region=models.Variable.get('region'),
        zone=models.Variable.get('gce_zone'),
        project_id=models.Variable.get('project_id'),
        master_machine_type='n1-standard-1',
        worker_machine_type='n1-standard-1')

     run_dataproc_job = dataproc_operator.DataProcPySparkOperator(
        task_id="run_dataproc_job",
        main="gs://311-complaints-spark_jobs/spark_job.py",
        cluster_name='composer-311-complaints-{{ ds_nodash }}',
        region=models.Variable.get('region'),
        dataproc_pyspark_jars=['gs://spark-lib/bigquery/spark-bigquery-latest.jar'],
        arguments=['gs://{{ dag_run.conf.get("bucket") }}/{{ dag_run.conf.get("name") }}'])

     delete_dataproc_cluster = dataproc_operator.DataprocClusterDeleteOperator(
        task_id='delete_dataproc_cluster',
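The delete call is cut off above. A hedged continuation using the same cluster name, region, and project variables defined in this snippet (trigger_rule import assumed), followed by the usual ordering:

        # Assumed continuation of the call above.
        cluster_name='composer-311-complaints-{{ ds_nodash }}',
        region=models.Variable.get('region'),
        project_id=models.Variable.get('project_id'),
        trigger_rule=trigger_rule.TriggerRule.ALL_DONE)

     # Assumed ordering: create the cluster, run the PySpark job, then tear down.
     create_dataproc_cluster >> run_dataproc_job >> delete_dataproc_cluster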
Example #22
    'retries': 1,
    'retry_delay': datetime.timedelta(minutes=5),
    'project_id': project_id
}

with models.DAG(
        'nyc_collisions_dag',
        schedule_interval=datetime.timedelta(days=1),
        default_args=default_dag_args) as dag:

    mapreduce_cluster_name = 'airflow-mapreduce-cluster'

    create_dataproc_cluster = dataproc_operator.DataprocClusterCreateOperator(
        task_id='create_dataproc_cluster',
        cluster_name=mapreduce_cluster_name,
        num_workers=2,
        zone=gce_zone,
        master_machine_type=machine_type,
        worker_machine_type=machine_type)

    hadoop_job = dataproc_operator.DataProcHadoopOperator(
        task_id='hadoop_job',
        cluster_name=mapreduce_cluster_name,
        main_jar=hadoop_job_jar_uri,
        arguments=[
            collisions_dataset_uri,
            f'{hadoop_job_output_bucket}/{exec_dt}'
        ])

    hive_job = dataproc_operator.DataProcHiveOperator(
        task_id='hive_job',
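The Hive task is cut off after its task_id. A hedged continuation (the HiveQL statement is a placeholder, not from the source) and the chain implied by the task names:

        # Assumed continuation; the query is illustrative only.
        cluster_name=mapreduce_cluster_name,
        query='SHOW DATABASES;')

    # Assumed ordering for this pipeline.
    create_dataproc_cluster >> hadoop_job >> hive_job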
Example #23
with models.DAG(
        'dataproc_cluster_create_pysparkjob_and_delete_1',
        # Continue to run DAG once per day
        schedule_interval=datetime.timedelta(days=1),
        default_args=default_dag_args) as dag:

    def greeting():
        import logging
        logging.info('Hello World!')

    create_dataproc_cluster = dataproc_operator.DataprocClusterCreateOperator(
        task_id='create_dataproc_cluster',
        # Give the cluster a unique name by appending the date scheduled.
        # See https://airflow.apache.org/code.html#default-variables
        cluster_name='composer-dataproc-{{ ds_nodash }}',
        num_workers=2,
        region='asia-south1',
        zone='asia-south1-a',
        master_machine_type='n1-standard-1',
        worker_machine_type='n1-standard-1')
    dataprod_pyspark = dataproc_operator.DataProcPySparkOperator(
        task_id='pyspark',
        main='gs://code_deploy/dataproc_read_bucket_to_bigquery.py',
        cluster_name='composer-dataproc-{{ ds_nodash }}',
        region='asia-south1',
        dataproc_pyspark_jars=[])
    delete_dataproc_cluster = dataproc_operator.DataprocClusterDeleteOperator(
        task_id='delete_dataproc_cluster',
        cluster_name='composer-dataproc-{{ ds_nodash }}',
        region='asia-south1',
        # Setting trigger_rule to ALL_DONE causes the cluster to be deleted
Example #24
    'retry_delay': datetime.timedelta(minutes=5),
    'project_id': models.Variable.get('gcp_project')
}

with models.DAG(
        'composer_sample_quickstart',
        # Continue to run DAG once per day
        schedule_interval=datetime.timedelta(days=1),
        default_args=default_dag_args) as dag:

    # Create a Cloud Dataproc cluster.
    create_dataproc_cluster = dataproc_operator.DataprocClusterCreateOperator(
        task_id='create_dataproc_cluster',
        # Give the cluster a unique name by appending the date scheduled.
        # See https://airflow.apache.org/code.html#default-variables
        cluster_name='quickstart-cluster-{{ ds_nodash }}',
        num_workers=2,
        zone=models.Variable.get('gce_zone'),
        master_machine_type='n1-standard-1',
        worker_machine_type='n1-standard-1')

    # Run the Hadoop wordcount example installed on the Cloud Dataproc cluster
    # master node.
    run_dataproc_hadoop = dataproc_operator.DataProcHadoopOperator(
        task_id='run_dataproc_hadoop',
        main_jar=WORDCOUNT_JAR,
        cluster_name='quickstart-cluster-{{ ds_nodash }}',
        arguments=wordcount_args)

    # Delete Cloud Dataproc cluster.
    delete_dataproc_cluster = dataproc_operator.DataprocClusterDeleteOperator(
Example #25
    # def greeting():
    #     import logging
    #     logging.info('Hello DataPipeline for World Health Data!')

    # An instance of an operator is called a task. In this case, the
    # hello_python task calls the "greeting" Python function.
    # hello_python = python_operator.PythonOperator(
    #     task_id='KickOff',
    #     python_callable=greeting)

    create_dataproc_cluster = dataproc_operator.DataprocClusterCreateOperator(
        task_id='create_dataproc_cluster',
        project_id='dataproc-300110',
        cluster_name='cluster-58-wb',
        num_workers=2,
        region='us-east1',
        init_actions_uris=['gs://worldbank2021/code/init_cluster.sh'],
        master_machine_type='n1-standard-2',
        worker_machine_type='n1-standard-2')

dataproc_pyspark_1 = dataproc_operator.DataProcPySparkOperator(
    task_id='Load_BQ_spark_job_1',
    # call the py file for processing
    #    main='gs://dataproc-nyc-taxi-2020/code_deploy/dataproc_wb.py',
    main='gs://worldbank2021/code/dataproc_load_bq.py',
    cluster_name='cluster-58-wb',
    region='us-east1',
    arguments=['wb_country_series_definition'],
    dataproc_pyspark_jars=[
        'gs://spark-lib/bigquery/spark-bigquery-latest.jar'