def setUp(self):
     # instantiate two different test cases with different labels.
     self.labels = [LABEL1, LABEL2]
     self.dataproc_operators = []
     for labels in self.labels:
          self.dataproc_operators.append(
             DataprocClusterCreateOperator(
                 task_id=TASK_ID,
                 cluster_name=CLUSTER_NAME,
                 project_id=PROJECT_ID,
                 num_workers=NUM_WORKERS,
                 zone=ZONE,
                 storage_bucket=STORAGE_BUCKET,
                 image_version=IMAGE_VERSION,
                 master_machine_type=MASTER_MACHINE_TYPE,
                 master_disk_size=MASTER_DISK_SIZE,
                 worker_machine_type=WORKER_MACHINE_TYPE,
                 worker_disk_size=WORKER_DISK_SIZE,
                 num_preemptible_workers=NUM_PREEMPTIBLE_WORKERS,
                 labels=deepcopy(labels)
             )
          )
     self.dag = DAG(
         'test_dag',
         default_args={
             'owner': 'airflow',
             'start_date': DEFAULT_DATE,
             'end_date': DEFAULT_DATE,
         },
         schedule_interval='@daily')
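The setUp above only builds the two operators; the page does not show a test that uses them. A minimal sketch of one, assuming (as the request-body assertion in Example #3 suggests) that user-supplied labels are carried into the 'labels' section of the body returned by _build_cluster_data(); the test name and the label contents are hypothetical:

 def test_build_cluster_data_carries_labels(self):
     # Hypothetical illustration: the LABEL1/LABEL2 values are not shown on
     # this page, so only check that whatever was passed in comes back out.
     for operator, labels in zip(self.dataproc_operators, self.labels):
         cluster_data = operator._build_cluster_data()
         for key, value in labels.items():
             self.assertEqual(cluster_data['labels'].get(key), value)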
Example #2
 def setUp(self):
     # instantiate two different test cases with different labels.
     self.labels = [LABEL1, LABEL2]
     self.dataproc_operators = []
     self.mock_conn = Mock()
     for labels in self.labels:
         self.dataproc_operators.append(
             DataprocClusterCreateOperator(
                 task_id=TASK_ID,
                 cluster_name=CLUSTER_NAME,
                 project_id=PROJECT_ID,
                 num_workers=NUM_WORKERS,
                 zone=ZONE,
                 network_uri=NETWORK_URI,
                 subnetwork_uri=SUBNETWORK_URI,
                 tags=TAGS,
                 storage_bucket=STORAGE_BUCKET,
                 image_version=IMAGE_VERSION,
                 master_machine_type=MASTER_MACHINE_TYPE,
                 master_disk_size=MASTER_DISK_SIZE,
                 worker_machine_type=WORKER_MACHINE_TYPE,
                 worker_disk_size=WORKER_DISK_SIZE,
                 num_preemptible_workers=NUM_PREEMPTIBLE_WORKERS,
                 labels=deepcopy(labels),
                 service_account_scopes=SERVICE_ACCOUNT_SCOPES,
                 idle_delete_ttl=IDLE_DELETE_TTL,
                 auto_delete_time=AUTO_DELETE_TIME,
                 auto_delete_ttl=AUTO_DELETE_TTL))
     self.dag = DAG('test_dag',
                    default_args={
                        'owner': 'airflow',
                        'start_date': DEFAULT_DATE,
                        'end_date': DEFAULT_DATE,
                    },
                    schedule_interval='@daily')
Example #3
    def test_create_cluster(self):
        # Setup service.projects().regions().clusters().create()
        #              .execute()
        self.operation = {'name': 'operation', 'done': True}
        self.mock_execute = Mock()
        self.mock_execute.execute.return_value = self.operation
        self.mock_clusters = Mock()
        self.mock_clusters.create.return_value = self.mock_execute
        self.mock_regions = Mock()
        self.mock_regions.clusters.return_value = self.mock_clusters
        self.mock_projects = Mock()
        self.mock_projects.regions.return_value = self.mock_regions
        self.mock_conn = Mock()
        self.mock_conn.projects.return_value = self.mock_projects

        with patch(HOOK) as MockHook:
            hook = MockHook()
            hook.get_conn.return_value = self.mock_conn
            hook.wait.return_value = None

            dataproc_task = DataprocClusterCreateOperator(
                task_id=TASK_ID,
                region=GCP_REGION,
                cluster_name=CLUSTER_NAME,
                project_id=GCP_PROJECT_ID,
                num_workers=NUM_WORKERS,
                zone=GCE_ZONE,
                dag=self.dag
            )
            dataproc_task.execute(None)

            project_uri = 'https://www.googleapis.com/compute/v1/projects/test-project-id'
            machine_type_uri = project_uri + '/zones/us-central1-a/machineTypes/n1-standard-4'
            zone_uri = project_uri + '/zones/us-central1-a'

            self.mock_clusters.create.assert_called_once_with(
                region=GCP_REGION,
                projectId=GCP_PROJECT_ID,
                requestId=mock.ANY,
                body={
                    'projectId': 'test-project-id',
                    'clusterName': 'test-cluster-name',
                    'config': {
                        'gceClusterConfig':
                            {'zoneUri': zone_uri},
                        'masterConfig': {
                            'numInstances': 1,
                            'machineTypeUri': machine_type_uri,
                            'diskConfig': {'bootDiskType': 'pd-standard', 'bootDiskSizeGb': 1024}},
                        'workerConfig': {
                            'numInstances': 123,
                            'machineTypeUri': machine_type_uri,
                            'diskConfig': {'bootDiskType': 'pd-standard', 'bootDiskSizeGb': 1024}},
                        'secondaryWorkerConfig': {},
                        'softwareConfig': {},
                        'lifecycleConfig': {},
                        'encryptionConfig': {}},
                    'labels': {'airflow-version': mock.ANY}})
            hook.wait.assert_called_once_with(self.operation)
Example #4
 def test_init_with_image_version_and_custom_image_both_set(self):
     with self.assertRaises(AssertionError):
         DataprocClusterCreateOperator(task_id=TASK_ID,
                                       cluster_name=CLUSTER_NAME,
                                       project_id=PROJECT_ID,
                                       num_workers=NUM_WORKERS,
                                       zone=ZONE,
                                       dag=self.dag,
                                       image_version=IMAGE_VERSION,
                                       custom_image=CUSTOM_IMAGE)
Example #5
 def test_init_cluster_with_zero_workers_and_not_non_zero_preemptibles(self):
     with self.assertRaises(AssertionError):
         DataprocClusterCreateOperator(
             task_id=TASK_ID,
             cluster_name=CLUSTER_NAME,
             project_id=PROJECT_ID,
             num_workers=0,
             num_preemptible_workers=2,
             zone=ZONE,
             dag=self.dag,
             image_version=IMAGE_VERSION,
         )
Example #6
        def create_cluster_with_invalid_internal_ip_only_setup():
            # Given
            create_cluster = DataprocClusterCreateOperator(
                task_id=TASK_ID,
                cluster_name=CLUSTER_NAME,
                project_id=GCP_PROJECT_ID,
                num_workers=NUM_WORKERS,
                zone=GCE_ZONE,
                dag=self.dag,
                internal_ip_only=True)

            # When
            create_cluster._build_cluster_data()
Example #7
 def test_build_cluster_data_with_auto_zone(self):
     dataproc_operator = DataprocClusterCreateOperator(
         task_id=TASK_ID,
         cluster_name=CLUSTER_NAME,
         project_id=GCP_PROJECT_ID,
         num_workers=NUM_WORKERS,
         master_machine_type=MASTER_MACHINE_TYPE,
         worker_machine_type=WORKER_MACHINE_TYPE
     )
     cluster_data = dataproc_operator._build_cluster_data()
     self.assertNotIn('zoneUri', cluster_data['config']['gceClusterConfig'])
     self.assertEqual(cluster_data['config']['masterConfig']['machineTypeUri'], MASTER_MACHINE_TYPE)
     self.assertEqual(cluster_data['config']['workerConfig']['machineTypeUri'], WORKER_MACHINE_TYPE)
Example #8
 def test_build_cluster_data_with_auto_delete_time(self):
     dataproc_operator = DataprocClusterCreateOperator(
         task_id=TASK_ID,
         cluster_name=CLUSTER_NAME,
         project_id=GCP_PROJECT_ID,
         num_workers=NUM_WORKERS,
         zone=GCE_ZONE,
         dag=self.dag,
         auto_delete_time=AUTO_DELETE_TIME,
     )
     cluster_data = dataproc_operator._build_cluster_data()
     self.assertEqual(cluster_data['config']['lifecycleConfig']['autoDeleteTime'],
                      "2017-06-07T00:00:00.000000Z")
Example #9
 def test_build_cluster_data_with_autoDeleteTtl(self):
     dataproc_operator = DataprocClusterCreateOperator(
         task_id=TASK_ID,
         cluster_name=CLUSTER_NAME,
         project_id=PROJECT_ID,
         num_workers=NUM_WORKERS,
         zone=ZONE,
         dag=self.dag,
         auto_delete_ttl=AUTO_DELETE_TTL,
     )
     cluster_data = dataproc_operator._build_cluster_data()
     self.assertEqual(
         cluster_data['config']['lifecycleConfig']['autoDeleteTtl'], "654s")
Example #10
 def test_build_single_node_cluster(self):
     dataproc_operator = DataprocClusterCreateOperator(
         task_id=TASK_ID,
         cluster_name=CLUSTER_NAME,
         project_id=PROJECT_ID,
         num_workers=0,
         num_preemptible_workers=0,
         zone=ZONE,
         dag=self.dag)
     cluster_data = dataproc_operator._build_cluster_data()
     self.assertEqual(
         cluster_data['config']['softwareConfig']['properties']
         ['dataproc:dataproc.allow.zero.workers'], "true")
 def setUp(self):
     self.dataproc = DataprocClusterCreateOperator(
         task_id=TASK_ID,
         cluster_name=CLUSTER_NAME,
         project_id=PROJECT_ID,
         num_workers=NUM_WORKERS,
         zone=ZONE,
         storage_bucket=STORAGE_BUCKET,
         image_version=IMAGE_VERSION,
         master_machine_type=MASTER_MACHINE_TYPE,
         master_disk_size=MASTER_DISK_SIZE,
         worker_machine_type=WORKER_MACHINE_TYPE,
         worker_disk_size=WORKER_DISK_SIZE,
         num_preemptible_workers=NUM_PREEMPTIBLE_WORKERS)
    def test_cluster_name_log_no_sub(self):
        with patch('airflow.contrib.operators.dataproc_operator.DataProcHook') \
            as mock_hook, patch('logging.info') as l:
            dataproc_task = DataprocClusterCreateOperator(
                task_id=TASK_ID,
                cluster_name=CLUSTER_NAME,
                project_id=PROJECT_ID,
                num_workers=NUM_WORKERS,
                zone=ZONE,
                dag=self.dag)

            with self.assertRaises(TypeError) as _:
                dataproc_task.execute(None)
            l.assert_called_with(('Creating cluster: ' + CLUSTER_NAME))
 def test_cluster_name_log_no_sub(self):
     with patch('airflow.contrib.operators.dataproc_operator.DataProcHook') as mock_hook:
         mock_hook.return_value.get_conn = self.mock_conn
         dataproc_task = DataprocClusterCreateOperator(
             task_id=TASK_ID,
             cluster_name=CLUSTER_NAME,
             project_id=PROJECT_ID,
             num_workers=NUM_WORKERS,
             zone=ZONE,
             dag=self.dag
         )
         with patch.object(dataproc_task.logger, 'info') as mock_info:
             with self.assertRaises(TypeError) as _:
                 dataproc_task.execute(None)
             mock_info.assert_called_with('Creating cluster: %s', CLUSTER_NAME)
 def test_build_cluster_data_with_autoDeleteTime_and_autoDeleteTtl(self):
     dataproc_operator = DataprocClusterCreateOperator(
         task_id=TASK_ID,
         cluster_name=CLUSTER_NAME,
         project_id=PROJECT_ID,
         num_workers=NUM_WORKERS,
         zone=ZONE,
         dag=self.dag,
         auto_delete_time=AUTO_DELETE_TIME,
         auto_delete_ttl=AUTO_DELETE_TTL,
     )
     cluster_data = dataproc_operator._build_cluster_data()
     if 'autoDeleteTtl' in cluster_data['config']['lifecycleConfig']:
         self.fail("If 'auto_delete_time' and 'auto_delete_ttl' is set, " +
                   "only `auto_delete_time` is used")
     self.assertEqual(cluster_data['config']['lifecycleConfig']['autoDeleteTime'],
                      "2017-06-07T00:00:00.000000Z")
Example #15
    def test_init_with_custom_image(self):
        dataproc_operator = DataprocClusterCreateOperator(
            task_id=TASK_ID,
            cluster_name=CLUSTER_NAME,
            project_id=PROJECT_ID,
            num_workers=NUM_WORKERS,
            zone=ZONE,
            dag=self.dag,
            custom_image=CUSTOM_IMAGE)

        cluster_data = dataproc_operator._build_cluster_data()
        expected_custom_image_url = \
            'https://www.googleapis.com/compute/beta/projects/' \
            '{}/global/images/{}'.format(PROJECT_ID, CUSTOM_IMAGE)
        self.assertEqual(cluster_data['config']['masterConfig']['imageUri'],
                         expected_custom_image_url)
        self.assertEqual(cluster_data['config']['workerConfig']['imageUri'],
                         expected_custom_image_url)
    def test_cluster_name_log_sub(self):
        with patch('airflow.contrib.operators.dataproc_operator.DataProcHook') \
            as mock_hook, patch('logging.info') as l:
            dataproc_task = DataprocClusterCreateOperator(
                task_id=TASK_ID,
                cluster_name='smoke-cluster-{{ ts_nodash }}',
                project_id=PROJECT_ID,
                num_workers=NUM_WORKERS,
                zone=ZONE,
                dag=self.dag
            )

            context = {'ts_nodash': 'testnodash'}

            rendered = dataproc_task.render_template(
                'cluster_name', getattr(dataproc_task, 'cluster_name'), context)
            setattr(dataproc_task, 'cluster_name', rendered)
            with self.assertRaises(TypeError) as _:
                dataproc_task.execute(None)
            l.assert_called_with(('Creating cluster: smoke-cluster-testnodash'))
 def setUp(self):
     # instantiate two different test cases with different labels.
     self.labels = [LABEL1, LABEL2]
     self.dataproc_operators = []
     for labels in self.labels:
         self.dataproc_operators.append(
             DataprocClusterCreateOperator(
                 task_id=TASK_ID,
                 cluster_name=CLUSTER_NAME,
                 project_id=PROJECT_ID,
                 num_workers=NUM_WORKERS,
                 zone=ZONE,
                 storage_bucket=STORAGE_BUCKET,
                 image_version=IMAGE_VERSION,
                 master_machine_type=MASTER_MACHINE_TYPE,
                 master_disk_size=MASTER_DISK_SIZE,
                 worker_machine_type=WORKER_MACHINE_TYPE,
                 worker_disk_size=WORKER_DISK_SIZE,
                 num_preemptible_workers=NUM_PREEMPTIBLE_WORKERS,
                 labels=deepcopy(labels)))
Example #18
    def _internal(begin_task, end_task):
        task_id = 'cluster-{}'.format(job_name)
        cluster_name = '{}-cluster'.format(job_name)

        cluster = DataprocClusterCreateOperator(
            task_id=task_id,
            project_id=project_id,
            zone=zone,
            cluster_name=cluster_name,
            num_workers=2,
            num_preemptible_workers=2,
            storage_bucket=storage_bucket,
            master_machine_type='n1-standard-2',
            master_disk_size=200,
            worker_machine_type='n1-standard-4',
            worker_disk_size=200,
            init_actions_uris=[
                'gs://dataproc-initialization-actions/connectors/connectors.sh'
            ],
            metadata={
                'gcs-connector-version': '1.9.16',
                'bigquery-connector-version': '0.13.16'
            },
            subnetwork_uri=subnetwork_uri,
            internal_ip_only=True,
            region=region,
            idle_delete_ttl=600,
            **default_args)

        job = DataProcPySparkOperator(task_id=job_name,
                                      main=main_file,
                                      arguments=arguments,
                                      pyfiles=extra_files,
                                      job_name=job_name,
                                      cluster_name=cluster_name,
                                      region=region,
                                      **default_args)

        begin_task >> cluster >> job >> end_task
Example #19
    def test_cluster_name_log_sub(self):
        with patch('airflow.contrib.operators.dataproc_operator.DataProcHook'
                   ) as mock_hook:
            mock_hook.return_value.get_conn = self.mock_conn
            dataproc_task = DataprocClusterCreateOperator(
                task_id=TASK_ID,
                cluster_name='smoke-cluster-{{ ts_nodash }}',
                project_id=GCP_PROJECT_ID,
                num_workers=NUM_WORKERS,
                zone=GCE_ZONE,
                dag=self.dag)
            with patch.object(dataproc_task.log, 'info') as mock_info:
                context = {'ts_nodash': 'testnodash'}

                rendered = dataproc_task.render_template(
                    'cluster_name', getattr(dataproc_task, 'cluster_name'),
                    context)
                setattr(dataproc_task, 'cluster_name', rendered)
                with self.assertRaises(TypeError):
                    dataproc_task.execute(None)
                mock_info.assert_called_with('Creating cluster: %s',
                                             u'smoke-cluster-testnodash')
args = {
    "owner": "sacha_roggeveen",
    "schedule_interval": "@daily",
    "start_date": airflow.utils.dates.days_ago(1),
}
dag = DAG(dag_id="daggerd", default_args=args, description="clustertjerunnen")

t_start = BashOperator(task_id="print_execution_date",
                       bash_command="date",
                       dag=dag)

dataproc_create_cluster = DataprocClusterCreateOperator(
    task_id="dataproc_create_cluster",
    cluster_name="analyse-pricing-{{ ds }}",
    project_id="airflowbolcom-may2829-257c0046",
    num_workers=4,
    zone="europe-west4-a",
    dag=dag)

compute_aggregates = DataProcPySparkOperator(
    task_id="compute_aggregates",
    main="gs://europe-west1-training-airfl-48bde282-bucket/build_statistics.py",
    cluster_name="analyse-pricing-{{ ds }}",
    arguments=[
        "gs://buckster/daily_load_{{ ds }}",
        "gs://buckster/bucketie",
        "gs://buckster/results",
    ],
    dag=dag)
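The DAG above creates the cluster and submits the PySpark job, but never removes the cluster. A hedged sketch of one way to finish the flow, reusing the names from this snippet and the DataprocClusterDeleteOperator pattern used by later examples on this page (the delete task and the ordering line are not part of the original snippet):

# Sketch only: DataprocClusterDeleteOperator lives in the same
# airflow.contrib.operators.dataproc_operator module as the create operator.
dataproc_delete_cluster = DataprocClusterDeleteOperator(
    task_id="dataproc_delete_cluster",
    cluster_name="analyse-pricing-{{ ds }}",
    project_id="airflowbolcom-may2829-257c0046",
    dag=dag)

t_start >> dataproc_create_cluster >> compute_aggregates >> dataproc_delete_cluster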
        response = http.run(self.endpoint)
        self.log.info(response.text)

        with NamedTemporaryFile() as tmp_file:
            tmp_file.write(response.content)
            tmp_file.flush()

            hook = GoogleCloudStorageHook(google_cloud_storage_conn_id=self.gcs_conn_id)
            hook.upload(bucket=self.bucket, object=self.gcs_path, filename=tmp_file.name)

PROJECT_ID = 'airflowbolcom-9362d2a84f6f553b'

dataproc_create_cluster = DataprocClusterCreateOperator(
    task_id="create_dataproc",
    cluster_name="pricing-analysis-{{ ds }}",
    project_id=PROJECT_ID,
    num_workers=2,
    zone="europe-west4-a",
    dag=dag,)


for target_currency in ['EUR', 'USD']:
    HttpToGcsOperator(
        task_id='get_currency_' + str(target_currency),
        # when there are multiple options (E.g. in a loop), make task_id parameterized
        gcs_conn_id='postgres_conn',
        gcs_path="currency/{{ ds }}/" + target_currency + ".json",
        http_conn_id='http_new',
        bucket='marloes_bucket',
        endpoint="/convert-currency?date={{ ds }}&from=GBP&to=" + str(target_currency),
        dag=dag,
    )
Example #22
    postgres_conn_id="my_database_connection",
    sql=
    "SELECT * FROM land_registry_price_paid_uk WHERE transfer_date = '{{ ds }}'",
    bucket='airflow_training_bucket',
    filename='land_registry_price_paid_uk/{{ ds }}/result.json',
    dag=dag)

my_task = PythonOperator(task_id="task_name",
                         python_callable=print_exec_date,
                         provide_context=True,
                         dag=dag)

create_cluster = DataprocClusterCreateOperator(
    task_id="create_dataproc",
    cluster_name="analyse-pricing-{{ ds }}",
    project_id='airflowbolcom-20165e4959a78c1d',
    num_workers=2,
    zone="europe-west4-a",
    dag=dag,
)

comp_aggregate = DataProcPySparkOperator(
    task_id='compute_aggregates',
    main=
    'gs://europe-west1-training-airfl-159310f1-bucket/other/build_statistics_simple.py',
    cluster_name='analyse-pricing-{{ ds }}',
    arguments=["{{ ds }}"],
    dag=dag,
)

del_cluster = DataprocClusterDeleteOperator(
    task_id="delete_dataproc",
    # Alternatively, this could be set to '@daily' to run the job once a day.
    # more options at https://airflow.apache.org/scheduler.html#dag-runs
}

# Create Directed Acyclic Graph for Airflow
with DAG('ephemeral_dataproc_spark_dag',
         default_args=DEFAULT_DAG_ARGS,
         schedule_interval=None) as dag:  # Here we are using dag as context.
    # Create the Cloud Dataproc cluster.
    # Note: this operator will be flagged a success if the cluster by this name
    # already exists.
    create_cluster = DataprocClusterCreateOperator(
        task_id='create_dataproc_cluster',
        # ds_nodash is an airflow macro for "[Execution] Date string no dashes"
        # in YYYYMMDD format.
        # See docs https://airflow.apache.org/code.html?highlight=macros#macros
        cluster_name='ephemeral-spark-cluster-{{ ds_nodash }}',
        image_version='1.5-debian10',
        num_workers=2,
        storage_bucket=Variable.get('dataproc_bucket'),
        zone=Variable.get('gce_zone'))

    # Submit the PySpark job.
    submit_pyspark = DataProcPySparkOperator(
        task_id='run_dataproc_pyspark',
        main='gs://' + Variable.get('gcs_bucket') +
        '/spark-jobs/spark_avg_speed.py',
        # Obviously needs to match the name of cluster created in the prior
        # Operator.
        cluster_name='ephemeral-spark-cluster-{{ ds_nodash }}',
        # Let's template our arguments for the pyspark job from the POST
        # payload.
Example #24
# unique tasks
push_unique_cluster_name = PythonOperator(
    task_id='generate_unique_cluster_name',
    provide_context=True,
    python_callable=push_unique_cluster_name,
    dag=dag)

# replicate tasks (dataproc_list)

create_cluster_1 = DataprocClusterCreateOperator(
    task_id='create_cluster_1',
    cluster_name=
    '{{ ti.xcom_pull(key=unique_cluster_name, task_ids="generate_unique_cluster_name") }}'
    + '1',
    project_id=Variable.get('project_id', default_var=None),
    region='us-west1',
    master_machine_type='n1-standard-2',
    worker_machine_type='n1-standard-2',
    num_workers=2,
    execution_timeout=timedelta(minutes=20),
    dag=dag)

delete_cluster_1 = DataprocClusterDeleteOperator(
    task_id='delete_cluster_1',
    cluster_name=
    '{{ ti.xcom_pull(key=unique_cluster_name, task_ids="generate_unique_cluster_name") }}'
    + '1',
    region='us-west1',
    project_id=Variable.get('project_id', default_var=None),
    execution_timeout=timedelta(minutes=20),
    dag=dag)
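Example #24 stops before any dependency wiring. Because both Dataproc tasks template their cluster name from the XCom pushed by generate_unique_cluster_name, a plausible ordering (a sketch, not taken from the original DAG; any job tasks for this cluster would sit between create and delete) is:

push_unique_cluster_name >> create_cluster_1 >> delete_cluster_1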
    'schedule_interval': "30 2 * * *"
}

with DAG('sqoop-incremental-dag', default_args=DEFAULT_DAG_ARGS) as dag:

    create_cluster = DataprocClusterCreateOperator(
        task_id='create_sqoop_cluster',
        cluster_name="ephemeral-spark-cluster-{{ds_nodash}}",
        master_machine_type='n1-standard-1',
        worker_machine_type='n1-standard-2',
        init_actions_uris=[
            "gs://dataproc-initialization-actions/cloud-sql-proxy/cloud-sql-proxy.sh"
        ],
        num_workers=2,
        region='asia-southeast2',
        zone='asia-southeast2-a',
        service_account_scopes=[
            "https://www.googleapis.com/auth/sqlservice.admin"
        ],
        properties={
            "hive:hive.metastore.warehouse.dir": BUCKET + "/hive-warehouse"
        },
        metadata={
            "additional-cloud-sql-instances": instance_name + "=tcp:3307",
            "enable-cloud-sql-hive-metastore": "false"
        },
        image_version="1.2")

    sqoop_inc_import = BashOperator(
        task_id='sqoop_incremental_import',
        bash_command=
        "bash /home/airflow/gcs/plugins/sqoop-incremental-imports.sh ephemeral-spark-cluster-{{ds_nodash}}",
     'dfp_data_transfer_unified_impressions_from_hadoop5',
     schedule_interval='@once', 
     #schedule_interval='@once', 
     default_args=default_dag_args) as dag:
  
  # insert overwrite is not working via sparkSQL, and the file is not working in Hive
  # GoogleCloudStorageDeleteOperator will only be supported in v.1.10
  
  
  create_dataproc_cluster = DataprocClusterCreateOperator(
      task_id='create_dataproc_cluster',
      cluster_name=my_cluster_name,
      region=my_region,
      num_workers=2,
      storage_bucket=my_bucket,
      master_machine_type=my_instance,
      master_disk_size=my_disk_size,
      worker_machine_type=my_instance,
      worker_disk_size=my_disk_size,
      num_preemptible_workers=0,  # use scale out/in operator
      zone=my_zone,
      idle_delete_ttl=my_idle_delete_ttl,
      dag=dag)

  drop_if_exists_src_table = DataProcHiveOperator(
      task_id='drop_if_exists_src_table',
      job_name='drop_if_exists_src_table_job_name',
      cluster_name=my_cluster_name,
      region=my_region,
      query=drop_if_exists_src_table
  )

Example #27
# Create Directed Acyclic Graph for Airflow
SPARK_DAG = DAG('Reporting_scheduler_DAG',
                default_args=default_args,
                schedule_interval=timedelta(days=1))

logger.debug('Starting task to create cluster on GCP')

#Start DataProc Cluster
start_cluster = DataprocClusterCreateOperator(
    task_id='start_cluster',
    cluster_name='Reporting-smoke-cluster-{{ ds_nodash }}',
    num_workers=2,
    worker_machine_type='n1-standard-1',
    properties={
        'spark:spark.executor.cores': '1',
        'spark:spark.executor.memory': '1g',
        # The maximum number of bytes to pack into a single partition when reading files. 256MB
        'spark:spark.files.maxPartitionBytes': '268435456'
    },
    zone=Variable.get('gcp_zone'),
    dag=SPARK_DAG)

logger.debug('Submitting spark job on cluster')
#Submit Spark Job
submit_pyspark = DataProcPySparkOperator(
    task_id='run_dataproc_pyspark_job',
    main=PYSPARK_JOB,
    cluster_name='Reporting-smoke-cluster-{{ ds_nodash }}',
    arguments=[CONFIG_FILE_ARG],
    dag=SPARK_DAG)
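The snippet ends before the task ordering is set. A one-line sketch of the obvious dependency (not shown in the original), since the PySpark job needs the cluster to exist first:

start_cluster >> submit_pyspark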
Example #28
    'depends_on_past': False,
    "start_date": datetime.utcnow(),
    "email_on_failure": False,
    "email_on_retry": False,
    "retries": 1,
    "retry_delay": timedelta(minutes=5),
    "project_id": PROJECT_ID,
    "scheduled_interval": "30 2 * * *"
}

with DAG("flights_delay_etl", default_args=DEFAULT_DAG_ARGS) as dag:

    create_cluster = DataprocClusterCreateOperator(
        task_id="create_dataproc_cluster",
        cluster_name="ephemeral-spark-cluster-{{ds_nodash}}",
        master_machine_type="n1-standard-1",
        worker_machine_type="n1-standard-2",
        num_workers=2,
        region="asia-east1",
        zone="asia-east1-a")

    submit_pyspark = DataProcPySparkOperator(
        task_id="run_pyspark_etl",
        main=PYSPARK_JOB,
        cluster_name="ephemeral-spark-cluster-{{ds_nodash}}",
        region="asia-east1")

    bq_load_delays_by_distance = GoogleCloudStorageToBigQueryOperator(
        task_id="bq_load_avg_delays_by_distance",
        bucket='enr1qu319-data-engineer-1',
        source_objects=[
            "flights_data_output/" + current_date + "_distance_category/part-*"
Example #29
    "project_id": "bigdata-etl-20201027",
    "scheduled_interval": "30 2 * * *"  # every day at 2:30 am utc
}

with DAG("sqoop_import_full_table", default_args=DEFAULT_DAG_ARGS) as dag:
    create_cluster = DataprocClusterCreateOperator(
        task_id="create_dataproc_cluster",
        cluster_name="ephemeral-spark-cluster-{{ds_nodash}}",
        master_machine_type="n1-standard-1",
        worker_machine_type="n1-standard-2",
        init_actions_uris=[
            'gs://dataproc-initialization-actions/cloud-sql-proxy/cloud-sql-proxy.sh'
        ],
        num_workers=2,
        region="us-central1",
        zone="us-central1-a",
        service_account_scopes=[
            'https://www.googleapis.com/auth/sqlservice.admin'
        ],
        properties={
            'hive:hive.metastore.warehouse.dir': BUCKET + '/hive-warehouse'
        },
        metadata={
            'enable-cloud-sql-hive-metastore': 'false',
            'additional-cloud-sql-instances': INSTANCE_NAME
        },
        image_version='1.5')

    submit_sqoop = BashOperator(
        task_id="sqoop_full_table_import",
        bash_command=
        'bash /home/airflow/gcs/plugins/sqoop_simple_table_imports_for_airflow.sh ephemeral-spark-cluster-{{ds_nodash}}'
Example #30
pgsl_to_gcs = PostgresToGoogleCloudStorageOperator(
    task_id="postgres_to_gcs",
    postgres_conn_id="airflow-training-postgres",
    sql="SELECT * FROM land_registry_price_paid_uk WHERE transfer_date = '{{ ds }}'",
    bucket="airflow-training-knab-jochem",
    filename="land_registry_price_paid_uk/{{ ds }}/properties_{}.json",
    dag=dag,
)


dataproc_create_cluster = DataprocClusterCreateOperator(
    task_id="create_dataproc",
    cluster_name="analyse-pricing-{{ ds }}",
    project_id="gdd-ea393e48abe0a85089b6b551da",
    num_workers=2,
    zone="europe-west4-a",
    dag=dag,
    auto_delete_ttl=5 * 60,  # Autodelete after 5 minutes
)


df_to_bq = DataFlowPythonOperator(
    task_id="land_registry_prices_to_bigquery",
    dataflow_default_options={
        "project": "gdd-ea393e48abe0a85089b6b551da",
        "region": "europe-west1",
    },
    py_file="gs://airflow-training-knab-jochem/dataflow_job.py",
    dag=dag,
)