Code Example #1
File: dataproc.py  Project: turbaszek/dbnd
    def terminate_engine(cls):
        from airflow.contrib.hooks.gcp_dataproc_hook import DataProcHook
        from airflow.contrib.operators import dataproc_operator

        dataproc_config = DataprocConfig()

        gcp_conn_id = get_settings().get_env_config(CloudType.gcp).conn_id

        cluster_hook = DataProcHook(gcp_conn_id=gcp_conn_id)
        delete_cluster = dataproc_operator.DataprocClusterDeleteOperator(
            task_id="delete_dataproc_cluster",
            cluster_name=dataproc_config.cluster,
            project_id=cluster_hook.project_id,
            gcp_conn_id=gcp_conn_id,
            region=dataproc_config.region,
        )

        return delete_cluster
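
Note that this classmethod only builds and returns the delete operator; binding it to a DAG and sequencing it after other tasks happens at the call site. A minimal wiring sketch, in which DataprocCtrl, run_job, and dag are hypothetical placeholders rather than names from the original project:

# Hypothetical call-site wiring; DataprocCtrl, run_job and dag are placeholders.
delete_cluster = DataprocCtrl.terminate_engine()
delete_cluster.dag = dag   # bind the returned operator to an existing DAG
run_job >> delete_cluster  # tear the cluster down once the job task finishes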
Code Example #2
        'LSTM_DATAGEN', '--project', 'hd-personalization-dev', '--category',
        'AreaRugs', '--dupletsData',
        'gs://hd-personalization-dev-data/vdc2136/training/duplets/2020-06-01/',
        '--featuresData',
        'gs://hd-personalization-dev-data/vdc2136/training/data/AllFeatures.csv',
        '--finalOutputPath',
        'gs://hd-personalization-dev-data/vdc2136/training/lstm/2020-06-02/',
        '--appName', 'LSTM_DATA_GEN', '--mode=cluster'
    ],
    job_name='airflow_pyspark_job',
    cluster_name='airflow-dataproc-cluster',
    project_id='hd-personalization-dev',
    dag=dag)

delete_dataproc_cluster = dataproc_operator.DataprocClusterDeleteOperator(
    task_id='delete_dataproc_cluster',
    cluster_name='airflow-dataproc-cluster',
    # Setting trigger_rule to ALL_DONE causes the cluster to be deleted
    # even if the Dataproc job fails.
    trigger_rule=trigger_rule.TriggerRule.ALL_DONE,
    project_id='hd-personalization-dev',
    dag=dag)

start_dag = DummyOperator(
    task_id='start',
    default_args=default_args,
    dag=dag,
)

start_dag >> create_dataproc_cluster >> dataproc_pyspark_submit >> delete_dataproc_cluster
Code Example #3
            "%s/events-assembly-%s.jar" %
            (DEPLOY_BUCKET_PREFIX, LATEST_JAR_HASH)
        ],
        arguments=[
            "--gcs-input-bucket",
            INPUT_BUCKET,
            "--gcs-output-bucket",
            OUTPUT_BUCKET,
            "--date",
            "{{ ds }}",
        ],
    )

    delete_dataproc_cluster = dataproc_operator.DataprocClusterDeleteOperator(
        project_id=PROJECT_ID,
        task_id="delete_dataproc_cluster",
        cluster_name="gcp-data-platform",
        trigger_rule=trigger_rule.TriggerRule.ALL_DONE,
    )

    gcs_to_bigquery = GoogleCloudStorageToBigQueryOperator(
        task_id="gcs_to_bigquery",
        bucket=OUTPUT_BUCKET,
        source_objects=['{{ ds_format(ds, "%Y/%m/%d") }}/*.parquet'],
        destination_project_dataset_table=
        "{project_id}.events.events{{ ds_nodash }}".format(
            project_id=PROJECT_ID),
        source_format="PARQUET",
        create_disposition="CREATE_IF_NEEDED",
        write_disposition="WRITE_TRUNCATE",
    )
Code Example #4
        init_actions_uris=[
            'gs://goog-dataproc-initialization-actions-europe-west1/python/pip-install.sh'
        ])

    run_batch_kpi_scheduled = dataproc_operator.DataProcPySparkOperator(
        task_id="submit_batch-kpi-scheduled",
        cluster_name='vf-polimi-demo',
        region='europe-west1',
        main='gs://vf-polimi-batch-data/dev/compute-kpi-batch.py',
        dataproc_pyspark_jars=
        'gs://spark-lib/bigquery/spark-bigquery-latest.jar',
        xcom_push=True)

    remove_cluster = dataproc_operator.DataprocClusterDeleteOperator(
        project_id=PROJECT,
        task_id="delete_cluster",
        cluster_name='vf-polimi-demo',
        region='europe-west1')

    def check_batch_kpi_scheduled_cluster_running(**kwargs):
        ti = kwargs['ti']
        xcom_value = ti.xcom_pull(task_ids='batch_kpi_scheduled_cluster')
        if xcom_value == "vf-polimi-demo":
            return 'delete_cluster'
        else:
            return 'end'

    branch_batch_kpi_scheduled_active_cluster = BranchPythonOperator(
        task_id='check_batch_kpi_scheduled_cluster',
        provide_context=True,
        python_callable=check_batch_kpi_scheduled_cluster_running)
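
The branch callable returns either 'delete_cluster' or 'end', so tasks with both of those ids must be immediate downstream tasks of the BranchPythonOperator; the snippet stops before that wiring. A minimal sketch, assuming an end DummyOperator that is not shown above (import path is the Airflow 1.x one):

    # Assumed terminal task; not part of the original snippet.
    # Assumes: from airflow.operators.dummy_operator import DummyOperator
    end = DummyOperator(task_id='end')

    # Every task_id the branch callable can return must be directly downstream of the branch.
    branch_batch_kpi_scheduled_active_cluster >> [remove_cluster, end]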
Code Example #5
    # BashOperator
    # A simple print date
    print_date = BashOperator(task_id='print_date', bash_command='date')

    # dataproc_operator
    # Create small dataproc cluster
    create_dataproc = dataproc_operator.DataprocClusterCreateOperator(
        task_id='create_dataproc',
        cluster_name='dataproc-cluster-demo-{{ ds_nodash }}',
        num_workers=2,
        zone=models.Variable.get('dataproc_zone'),
        master_machine_type='n1-standard-1',
        worker_machine_type='n1-standard-1')

    # Run the PySpark job
    run_spark = dataproc_operator.DataProcPySparkOperator(
        task_id='run_spark',
        main=SPARK_CODE,
        cluster_name='dataproc-cluster-demo-{{ ds_nodash }}',
        job_name=dataproc_job_name)

    # dataproc_operator
    # Delete Cloud Dataproc cluster.
    delete_dataproc = dataproc_operator.DataprocClusterDeleteOperator(
        task_id='delete_dataproc',
        cluster_name='dataproc-cluster-demo-{{ ds_nodash }}',
        trigger_rule=trigger_rule.TriggerRule.ALL_DONE)

    # STEP 6: Set DAG dependencies
    # Each task should run after the previous task has finished.
    print_date >> create_dataproc >> run_spark >> delete_dataproc
Code Example #6
        default_args=default_dag_args) as dag:

    # Create a Cloud Dataproc cluster.
    create_dataproc_cluster = dataproc_operator.DataprocClusterCreateOperator(
        task_id='create_dataproc_cluster',
        # Give the cluster a unique name by appending the date scheduled.
        # See https://airflow.apache.org/code.html#default-variables
        cluster_name='quickstart-cluster-{{ ds_nodash }}',
        num_workers=2,
        zone=models.Variable.get('gce_zone'),
        master_machine_type='n1-standard-1',
        worker_machine_type='n1-standard-1')

    # Run the Hadoop wordcount example installed on the Cloud Dataproc cluster
    # master node.
    run_dataproc_hadoop = dataproc_operator.DataProcHadoopOperator(
        task_id='run_dataproc_hadoop',
        main_jar=WORDCOUNT_JAR,
        cluster_name='quickstart-cluster-{{ ds_nodash }}',
        arguments=wordcount_args)

    # Delete Cloud Dataproc cluster.
    delete_dataproc_cluster = dataproc_operator.DataprocClusterDeleteOperator(
        task_id='delete_dataproc_cluster',
        cluster_name='quickstart-cluster-{{ ds_nodash }}',
        # Setting trigger_rule to ALL_DONE causes the cluster to be deleted
        # even if the Dataproc job fails.
        trigger_rule=trigger_rule.TriggerRule.ALL_DONE)

    # Define DAG dependencies.
    create_dataproc_cluster >> run_dataproc_hadoop >> delete_dataproc_cluster
Code Example #7
    # dataproc_operator
    # Create small dataproc cluster
    create_dataproc = dataproc_operator.DataprocClusterCreateOperator(
        task_id='create_dataproc',
        cluster_name='dataproc-cluster-demo-{{ ds_nodash }}',
        num_workers=2,
        zone=None,
        master_machine_type='n1-standard-1',
        worker_machine_type='n1-standard-1',
        region=models.Variable.get('dataproc_zone'))

    # Run the PySpark job
    run_spark = dataproc_operator.DataProcPySparkOperator(
        task_id='run_spark',
        main=SPARK_CODE,
        cluster_name='dataproc-cluster-demo-{{ ds_nodash }}',
        job_name=dataproc_job_name,
        region=models.Variable.get('dataproc_zone'))

    # dataproc_operator
    # Delete Cloud Dataproc cluster.
    delete_dataproc = dataproc_operator.DataprocClusterDeleteOperator(
        task_id='delete_dataproc',
        cluster_name='dataproc-cluster-demo-{{ ds_nodash }}',
        trigger_rule=trigger_rule.TriggerRule.ALL_DONE,
        region=models.Variable.get('dataproc_zone'))

    # STEP 6: Set DAG dependencies
    # Each task should run after the previous task has finished.
    print_date >> create_dataproc >> run_spark >> delete_dataproc
Code Example #8
        cluster_name='composer-dataproc-{{ ds_nodash }}',
        num_workers=2,
        region='asia-south1',
        zone='asia-south1-a',
        master_machine_type='n1-standard-1',
        worker_machine_type='n1-standard-1')
    dataprod_pyspark = dataproc_operator.DataProcPySparkOperator(
        task_id='pyspark',
        main='gs://code_deploy/dataproc_read_bucket_to_bigquery.py',
        cluster_name='composer-dataproc-{{ ds_nodash }}',
        region='asia-south1',
        dataproc_pyspark_jars=[])
    delete_dataproc_cluster = dataproc_operator.DataprocClusterDeleteOperator(
        task_id='delete_dataproc_cluster',
        cluster_name='composer-dataproc-{{ ds_nodash }}',
        region='asia-south1',
        # Setting trigger_rule to ALL_DONE causes the cluster to be deleted
        # even if the Dataproc job fails.
        trigger_rule=trigger_rule.TriggerRule.ALL_DONE)

    # An instance of an operator is called a task. In this case, the
    # hello_python task calls the "greeting" Python function.
    hello_python = python_operator.PythonOperator(task_id='hello',
                                                  python_callable=greeting)

    # Likewise, the goodbye_bash task calls a Bash script.
    goodbye_bash = bash_operator.BashOperator(task_id='bye',
                                              bash_command='echo Goodbye.')

    # Define the order in which the tasks complete by using the >> and <<
    # operators. In this example, hello_python executes before goodbye_bash.
Code Example #9
    # Create a Cloud Dataproc cluster.
    create_dataproc_cluster = dataproc_operator.DataprocClusterCreateOperator(
        task_id='create_dataproc_cluster',
        # Give the cluster a unique name by appending the date scheduled.
        # See https://airflow.apache.org/code.html#default-variables
        cluster_name='composer-hadoop-tutorial-cluster-{{ ds_nodash }}',
        num_workers=2,
        region='us-central1',
        zone=models.Variable.get('gce_zone'),
        image_version='2.0',
        master_machine_type='n1-standard-2',
        worker_machine_type='n1-standard-2')
    # Run the Hadoop wordcount example installed on the Cloud Dataproc cluster
    # master node.
    run_dataproc_hadoop = dataproc_operator.DataProcHadoopOperator(
        task_id='run_dataproc_hadoop',
        region='us-central1',
        main_jar=WORDCOUNT_JAR,
        cluster_name='composer-hadoop-tutorial-cluster-{{ ds_nodash }}',
        arguments=wordcount_args)
    # Delete Cloud Dataproc cluster.
    delete_dataproc_cluster = dataproc_operator.DataprocClusterDeleteOperator(
        task_id='delete_dataproc_cluster',
        region='us-central1',
        cluster_name='composer-hadoop-tutorial-cluster-{{ ds_nodash }}',
        # Setting trigger_rule to ALL_DONE causes the cluster to be deleted
        # even if the Dataproc job fails.
        trigger_rule=trigger_rule.TriggerRule.ALL_DONE)
    # Define DAG dependencies.
    create_dataproc_cluster >> run_dataproc_hadoop >> delete_dataproc_cluster
Code Example #10
    arguments=[
        "--dataproc=1.4", "--job_date={{ ds }}",
        "--bucket=dataproc_dataops_tmp"
    ])

run_pyspark_job_frequency = dataproc_operator.DataProcPySparkOperator(
    task_id='run_pyspark_job_frequency',
    dag=dag,
    main='gs://' + Variable.get('v_composer_bucket') +
    '/dags/dataproc/twitterPySparkFrequency.py',
    cluster_name='twitter-dataproc-mlanciau-{{ ds_nodash }}',
    dataproc_pyspark_jars=[
        'gs://spark-lib/bigquery/spark-bigquery-latest.jar'
    ],
    arguments=[
        "--dataproc=1.4", "--job_date={{ ds }}",
        "--bucket=dataproc_dataops_tmp"
    ])

# Delete Cloud Dataproc cluster.
delete_dataproc_cluster = dataproc_operator.DataprocClusterDeleteOperator(
    task_id='delete_dataproc_cluster',
    dag=dag,
    project_id=os.environ.get('GCP_PROJECT'),
    cluster_name='twitter-dataproc-mlanciau-{{ ds_nodash }}'  #,
    #trigger_rule=trigger_rule.TriggerRule.ALL_DONE
)

create_dataproc_cluster >> delete_ml_partition >> run_pyspark_job_splitting >> delete_dataproc_cluster
create_dataproc_cluster >> run_pyspark_job_frequency >> delete_dataproc_cluster
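
Because the trigger_rule argument is commented out here, the delete task keeps the default ALL_SUCCESS rule, so a failed PySpark job leaves the cluster running. A variant of the same operator with the cleanup-always rule re-enabled, matching the other examples on this page:

# Variant: re-enable ALL_DONE so the cluster is deleted even if an upstream job fails.
delete_dataproc_cluster = dataproc_operator.DataprocClusterDeleteOperator(
    task_id='delete_dataproc_cluster',
    dag=dag,
    project_id=os.environ.get('GCP_PROJECT'),
    cluster_name='twitter-dataproc-mlanciau-{{ ds_nodash }}',
    trigger_rule=trigger_rule.TriggerRule.ALL_DONE)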
Code Example #11
        cluster_name='composer-311-complaints-{{ ds_nodash }}',
        num_workers=2,
        region=models.Variable.get('region'),
        zone=models.Variable.get('gce_zone'),
        project_id=models.Variable.get('project_id'),
        master_machine_type='n1-standard-1',
        worker_machine_type='n1-standard-1')

    run_dataproc_job = dataproc_operator.DataProcPySparkOperator(
        task_id="run_dataproc_job",
        main="gs://311-complaints-spark_jobs/spark_job.py",
        cluster_name='composer-311-complaints-{{ ds_nodash }}',
        region=models.Variable.get('region'),
        dataproc_pyspark_jars=['gs://spark-lib/bigquery/spark-bigquery-latest.jar'],
        arguments=['gs://{{ dag_run.conf.get("bucket") }}/{{ dag_run.conf.get("name") }}'])

    delete_dataproc_cluster = dataproc_operator.DataprocClusterDeleteOperator(
        task_id='delete_dataproc_cluster',
        cluster_name='composer-311-complaints-{{ ds_nodash }}',
        project_id=models.Variable.get('project_id'),
        region=models.Variable.get('region'),
        trigger_rule=trigger_rule.TriggerRule.ALL_DONE)

    bigquery_transformations = BigQueryOperator(
        sql='/sql/job.sql',
        task_id='bigquery_transformations',
        use_legacy_sql=False,
    )

    create_dataproc_cluster >> run_dataproc_job >> delete_dataproc_cluster >> bigquery_transformations
Code Example #12
File: sparkjob.py  Project: sanjeevkanabargi/python
    'retries': 1,
    'retry_delay': dt.timedelta(seconds=30),
    'project_id': models.Variable.get('gcp_project')
}

with DAG('dataproc_spark_submit', schedule_interval='0 17 * * *',
    default_args=default_dag_args) as dag:

    create_dataproc_cluster = dpo.DataprocClusterCreateOperator(
        project_id = default_dag_args['project_id'],
        task_id = 'create_dataproc_cluster',
        cluster_name = CLUSTER_NAME,
        num_workers = 2,
        zone = models.Variable.get('gce_zone')
    )

    run_spark_job = dpo.DataProcSparkOperator(
        task_id = 'run_spark_job',
        #main_jar = MAIN_JAR,
        main_class = MAIN_CLASS,
        cluster_name = CLUSTER_NAME
    )

    delete_dataproc_cluster = dpo.DataprocClusterDeleteOperator(
        project_id = default_dag_args['project_id'],
        task_id = 'delete_dataproc_cluster',
        cluster_name = CLUSTER_NAME,
        trigger_rule = trigger_rule.TriggerRule.ALL_DONE
    )

    create_dataproc_cluster >> run_spark_job >> delete_dataproc_cluster
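
With main_jar commented out, the Spark job relies on MAIN_CLASS already being on the cluster's default classpath. A hedged variant that ships the jar explicitly instead, assuming MAIN_JAR is a gs:// URI as the commented-out line suggests:

    # Variant sketch: supply the jar so MAIN_CLASS is resolvable on the cluster.
    # Assumption: MAIN_JAR points at a jar in GCS; not taken from the original snippet.
    run_spark_job = dpo.DataProcSparkOperator(
        task_id='run_spark_job',
        main_class=MAIN_CLASS,
        dataproc_spark_jars=[MAIN_JAR],
        cluster_name=CLUSTER_NAME
    )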
Code Example #13
        # Give the cluster a unique name by appending the date scheduled.
        # See https://airflow.apache.org/code.html#default-variables
        cluster_name=pipeline_cluster_name,
        num_workers=2,
        region='us-central1',
        autoscaling_policy=
        'projects/{}/regions/us-central1/autoscalingPolicies/ephimeral-scaling-policy'
        .format(os.environ['PROJECT_ID']),
        master_machine_type='n1-standard-1',
        worker_machine_type='n1-standard-1')

    run_py_spark = dataproc_operator.DataProcPySparkOperator(
        task_id='run_py_spark',
        region='us-central1',
        main='gs://{}/data/compute-pi-pipeline/calculate-pi.py'.format(
            os.environ['COMPOSER_BUCKET']),
        arguments=[models.Variable.get("NUM_SAMPLES")],
        cluster_name=pipeline_cluster_name)

    # Delete Cloud Dataproc cluster.
    delete_dataproc_cluster = dataproc_operator.DataprocClusterDeleteOperator(
        task_id='delete_dataproc_cluster',
        region='us-central1',
        cluster_name=pipeline_cluster_name,
        # Setting trigger_rule to ALL_DONE causes the cluster to be deleted
        # even if the Dataproc job fails.
        trigger_rule=trigger_rule.TriggerRule.ALL_DONE)

    # Define DAG dependencies.
    create_dataproc_cluster >> run_py_spark >> delete_dataproc_cluster
    # [END composer_hadoop_steps]
Code Example #14
        job=PRODUCTS_STAGING_SPARK_JOB,
        location=REGION,
        project_id=PROJECT_ID)

    transactions_staging_spark_job = DataprocSubmitJobOperator(
        task_id="transactions_staging_spark_job",
        job=TRANSACTIONS_STAGING_SPARK_JOB,
        location=REGION,
        project_id=PROJECT_ID)

    enrich_staging_spark_job = DataprocSubmitJobOperator(
        task_id="enrich_staging_spark_job",
        job=ENRICH_STAGING_SPARK_JOB,
        location=REGION,
        project_id=PROJECT_ID)

    delete_dataproc_acme_sales_cluster = dataproc_operator.DataprocClusterDeleteOperator(
        task_id="delete_dataproc_acme_sales_cluster",
        cluster_name=DATAPROC_CLUSTER_NAME,
        region=REGION,
        # Setting trigger_rule to ALL_DONE causes the cluster to be deleted
        # even if the Dataproc job fails.
        trigger_rule=trigger_rule.TriggerRule.ALL_DONE,
        project_id=PROJECT_ID)

    create_dataproc_acme_sales_cluster >> [locations_staging_spark_job,products_staging_spark_job,transactions_staging_spark_job] >> enrich_staging_spark_job >> delete_dataproc_acme_sales_cluster

if __name__ == '__main__':
    dag.clear(dag_run_state=State.NONE)
    dag.run()
Code Example #15
        worker_machine_type=machine_type)

    hadoop_job = dataproc_operator.DataProcHadoopOperator(
        task_id='hadoop_job',
        cluster_name=mapreduce_cluster_name,
        main_jar=hadoop_job_jar_uri,
        arguments=[
            collisions_dataset_uri,
            f'{hadoop_job_output_bucket}/{exec_dt}'
        ])

    hive_job = dataproc_operator.DataProcHiveOperator(
        task_id='hive_job',
        cluster_name=mapreduce_cluster_name,
        dataproc_hive_jars=[hive_hcatalog_jar_uri],
        query_uri=hive_job_hql_uri,
        variables={
            'collisions_job_output_bucket': f'{hadoop_job_output_bucket}/{exec_dt}',
            'hive_job_output_bucket':       f'{hive_job_output_bucket}/{exec_dt}',
            'hive_hcatalog_jar':            hive_hcatalog_jar_uri,
            'zips_boroughs_bucket':         zips_boroughs_bucket_uri
        }
    )

    delete_dataproc_cluster = dataproc_operator.DataprocClusterDeleteOperator(
        task_id='delete_dataproc_cluster',
        cluster_name=mapreduce_cluster_name,
        trigger_rule=trigger_rule.TriggerRule.ALL_DONE)

    create_dataproc_cluster >> hadoop_job >> hive_job >> delete_dataproc_cluster
Code Example #16
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 1,
    'retry_delay': datetime.timedelta(minutes=5),
    'project_id': models.Variable.get('gcp_project')
}

with models.DAG('composer_hadoop_wordcount',
                schedule_interval=datetime.timedelta(days=1),
                default_args=default_dag_args) as dag:

    create_dataproc_cluster = dataproc_operator.DataprocClusterCreateOperator(
        task_id='create_dataproc_cluster',
        cluster_name='spikey-wordcount-cluster-{{ ds_nodash }}',
        num_workers=2,
        zone=models.Variable.get('gce_zone'),
        master_machine_type='n1-standard-1',
        worker_machine_type='n1-standard-1')

    run_dataproc_hadoop = dataproc_operator.DataProcHadoopOperator(
        task_id='run_dataproc_hadoop',
        main_jar=WORDCOUNT_JAR,
        cluster_name='spikey-wordcount-cluster-{{ ds_nodash }}',
        arguments=wordcount_args)

    delete_dataproc_cluster = dataproc_operator.DataprocClusterDeleteOperator(
        task_id='delete_dataproc_cluster',
        cluster_name='spikey-wordcount-cluster-{{ ds_nodash }}',
        trigger_rule=trigger_rule.TriggerRule.ALL_DONE)

    create_dataproc_cluster >> run_dataproc_hadoop >> delete_dataproc_cluster
Code Example #17
        #network_uri = 'default',
        subnetwork_uri = subnet,
        properties = cluster_properties,
        on_success_callback=task_success_slack_alert,
        trigger_rule = trigger_rule.TriggerRule.ALL_SUCCESS
    )

    #Run spark job on the above cluster.
    run_spark_job = BashOperator(
        task_id = 'run_spark_job',
        bash_command = bash_command,
        dag=dag,
        on_success_callback=task_success_slack_alert,
        trigger_rule = trigger_rule.TriggerRule.ALL_SUCCESS
    )

    #Delete the cluster.
    delete_dataproc_cluster = dpo.DataprocClusterDeleteOperator(
        project_id = projectID,
        task_id = 'delete_dataproc_cluster',
        cluster_name = cluster_name,
        region = region,
        #zone = 'us-central1-a',
        #network_uri = 'default',
        subnetwork_uri = subnet,
        on_success_callback=task_success_slack_alert,
        trigger_rule = trigger_rule.TriggerRule.ALL_DONE
    )


    request_job >> create_dataproc_cluster >> run_spark_job >> delete_dataproc_cluster
Code Example #18
    create_dataproc_cluster = dataproc_operator.DataprocClusterCreateOperator(
        task_id='create_dataproc_cluster',
        # Give the cluster a unique name by appending the date scheduled.
        # See https://airflow.apache.org/code.html#default-variables
        cluster_name='parquetconverter2',
        num_workers=3,
        zone='europe-west1-b',
        master_machine_type='n1-standard-1',
        worker_machine_type='n1-standard-1')

    # Run the pyspark CSV2PARQUET example
    run_dataproc_csv2parquet = dataproc_operator.DataProcPySparkOperator(
        task_id='run_dataproc_parquetconvert',
        cluster_name='parquetconverter2',
        main='gs://alex-code/convert.py')

    # Delete Cloud Dataproc cluster.
    delete_dataproc_cluster = dataproc_operator.DataprocClusterDeleteOperator(
        task_id='delete_dataproc_cluster',
        cluster_name='parquetconverter2',
        # Setting trigger_rule to ALL_DONE causes the cluster to be deleted
        # even if the Dataproc job fails.
        trigger_rule=trigger_rule.TriggerRule.ALL_DONE)

    # [START composer_quickstart_steps]
    # Define DAG dependencies.
    create_dataproc_cluster >> run_dataproc_csv2parquet >> delete_dataproc_cluster
    # [END composer_quickstart_steps]

# [END composer_quickstart]
Code Example #19
        "INSERT INTO TABLE default.chicago_taxi_trips_parquet_autotestbq9 SELECT * FROM default.chicago_taxi_trips_csv_autotestbq9;",
        cluster_name='dataproc',
        region='us-west1',
        dag=dag)

    dataproc_hive_count_table_csv = DataProcHiveOperator(
        task_id='dataproc_hive_count_table_csv',
        gcp_conn_id='google_cloud_default',
        query="select count(*) from default.chicago_taxi_trips_csv_autotestbq9",
        cluster_name='dataproc',
        region='us-west1',
        dag=dag)

    delete_dataproc_cluster = dataproc_operator.DataprocClusterDeleteOperator(
        task_id='delete_dataproc_cluster',
        cluster_name='dataproc',
        region='us-west1',
        trigger_rule=trigger_rule.TriggerRule.ALL_DONE)

    load_parquet_bqt = GoogleCloudStorageToBigQueryOperator(
        task_id='load_parquet_bqt',
        bucket='dphivedb',
        source_objects=['HQL/PARQUET/*'],
        schema_fields=None,
        schema_object=None,
        source_format='parquet',
        destination_project_dataset_table='bqdataset.test3',
        bigquery_conn_id='bigquery_default',
        google_cloud_storage_conn_id='google_cloud_default',
        write_disposition='WRITE_TRUNCATE',
        autodetect=True,
Code Example #20
File: intergration.py  Project: snithish/listery-data
        cluster_name='composer-data-integration-cluster-{{ ds_nodash }}',
        arguments=["--local", "false", "--subprogram", "integration"])

    offer_integration = dataproc_operator.DataProcSparkOperator(
        task_id='offer_integration',
        main_jar=SPARK_JOBS_JAR,
        cluster_name='composer-data-integration-cluster-{{ ds_nodash }}',
        arguments=["--local", "false", "--subprogram", "offerIntegration"])

    es_refresh = dataproc_operator.DataProcSparkOperator(
        task_id='es_refresh',
        main_jar=SPARK_JOBS_JAR,
        cluster_name='composer-data-integration-cluster-{{ ds_nodash }}',
        arguments=["--local", "false", "--subprogram", "refreshEs"])

    price_diff = dataproc_operator.DataProcSparkOperator(
        task_id='price_diff',
        main_jar=SPARK_JOBS_JAR,
        cluster_name='composer-data-integration-cluster-{{ ds_nodash }}',
        arguments=["--local", "false", "--subprogram", "priceDiff"])

    delete_dataproc_cluster = dataproc_operator.DataprocClusterDeleteOperator(
        task_id='delete_dataproc_cluster',
        cluster_name='composer-data-integration-cluster-{{ ds_nodash }}',
        trigger_rule=trigger_rule.TriggerRule.ALL_DONE)

    create_dataproc_cluster >> [run_integration_job, offer_integration]
    run_integration_job >> price_diff
    [run_integration_job, offer_integration] >> es_refresh
    [price_diff, es_refresh] >> delete_dataproc_cluster