def test_hook_correct_region():
    with patch(HOOK) as mock_hook:
        dataproc_task = DataProcHiveOperator(task_id=TASK_ID, region=GCP_REGION)
        dataproc_task.execute(None)
        mock_hook.return_value.submit.assert_called_once_with(
            mock.ANY, mock.ANY, GCP_REGION, mock.ANY)
def test_hook_correct_region(self):
    with patch('airflow.contrib.operators.dataproc_operator.DataProcHook') as mock_hook:
        dataproc_task = DataProcHiveOperator(task_id=TASK_ID, region=REGION)
        dataproc_task.execute(None)
        mock_hook.return_value.submit.assert_called_once_with(
            mock.ANY, mock.ANY, REGION)
def test_dataproc_job_id_is_set():
    with patch(HOOK) as mock_hook:
        dataproc_task = DataProcHiveOperator(task_id=TASK_ID)
        _assert_dataproc_job_id(mock_hook, dataproc_task)
def test_correct_job_definition(self, mock_hook, mock_uuid):
    # Expected job
    job_definition = deepcopy(DATAPROC_JOB_TO_SUBMIT)
    job_definition['job']['hiveJob'] = {'queryFileUri': None}
    job_definition['job']['reference']['projectId'] = None
    job_definition['job']['reference']['jobId'] = DATAPROC_JOB_ID + "_test"

    # Prepare job using operator
    task = DataProcHiveOperator(
        task_id=TASK_ID,
        region=GCP_REGION,
        cluster_name=CLUSTER_NAME,
        job_name=DATAPROC_JOB_ID,
        labels=LABELS,
    )
    task.execute(context=None)
    self.assertDictEqual(job_definition, task.job_template.job)
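# The tests above refer to module-level fixtures that are not shown in this
# fragment.  A minimal sketch of what they might look like; every value below
# is an illustrative assumption, not taken from the original test module.
# (test_correct_job_definition is presumably decorated with @patch for the
# hook and for uuid, which is why it receives mock_hook and mock_uuid.)
from copy import deepcopy
from unittest import mock
from unittest.mock import patch

from airflow.contrib.operators.dataproc_operator import DataProcHiveOperator

HOOK = 'airflow.contrib.operators.dataproc_operator.DataProcHook'
TASK_ID = 'test-dataproc-hive'
REGION = GCP_REGION = 'europe-west1'
CLUSTER_NAME = 'test-cluster'
DATAPROC_JOB_ID = 'test-job'
LABELS = {'label-a': 'value-a'}

# Skeleton of the payload that test_correct_job_definition() copies and mutates.
DATAPROC_JOB_TO_SUBMIT = {
    'job': {
        'reference': {'projectId': None, 'jobId': None},
        'placement': {'clusterName': CLUSTER_NAME},
        'labels': LABELS,
    }
}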
def hive_task(owner, dag, task_id, query):
    return DataProcHiveOperator(
        dag=dag,
        task_id=task_id,
        cluster_name='cluster-dataproc',
        region='europe-west3',
        job_name='_'.join((owner, task_id, '{{ execution_date.year }}', '{{ params.job_suffix }}')),
        query=query,
        params={'job_suffix': randint(0, 100000)},
    )
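# Hypothetical use of the hive_task() factory defined above; the dag object,
# owner and query are placeholders for illustration, not from the original source.
ods_issue = hive_task(
    owner='etl_user',
    dag=dag,
    task_id='ods_issue',
    query='SHOW DATABASES;',
)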
USERNAME = '******'

default_args = {'owner': USERNAME, 'start_date': datetime(2012, 1, 1, 0, 0, 0)}

dag = DAG(
    USERNAME + '_data_lake_etl_issue',
    default_args=default_args,
    description='Data Lake ETL tasks',
    schedule_interval="0 0 1 1 *",
)

ods_issue = DataProcHiveOperator(
    task_id='ods_issue',
    dag=dag,
    query="""
        INSERT OVERWRITE TABLE asamoilov.ods_issue PARTITION (year = {{ execution_date.year }})
        SELECT CAST(user_id AS BIGINT) as user_id,
            CAST(start_time AS TIMESTAMP) AS start_time,
            CAST(end_time AS TIMESTAMP) AS end_time,
            title,
            description,
            service
        FROM asamoilov.stg_issue
        WHERE year(start_time) = {{ execution_date.year }};
    """,
    cluster_name='cluster-dataproc',
    job_name=USERNAME + '_ods_issue_{{ execution_date.year }}_{{ params.job_suffix }}',
    params={"job_suffix": randint(0, 100000)},
    region='europe-west3',
)
start_date = datetime(2019, 11, 18)

default_args = {
    'owner': 'Airflow',
    'depends_on_past': False,
    'start_date': start_date,
    'email': ['*****@*****.**'],
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 1,
    'retry_delay': timedelta(minutes=1)
}

with DAG(dag_id='hive-query-submit',
         default_args=default_args,
         start_date=start_date,
         schedule_interval=None) as dag:

    submit_hive_task = DataProcHiveOperator(
        task_id='HiveSubmit',
        project_id='gcp-cicd',
        cluster_name='cluster-1',
        query_uri=HQL_BUCKET + HQL_SCRIPT_NAME,
        # dataproc_hive_jars=[UDF_BUCKET + UDF_JAR_MANE],
        # variables={'PROJECT_ID': PROJECT_ID},
        region='europe-west1')

    dummy_task = DummyOperator(task_id='DummyTask')

    dummy_task >> submit_hive_task
    datetime.datetime.min.time())

default_dag_args = {
    'start_date': yesterday,
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 1,
    'retry_delay': datetime.timedelta(minutes=5),
    'project_id': models.Variable.get('PROJECT_ID', 'dataproc-usecase-276411')
}

with models.DAG(
        'EndToEndDAG14',
        schedule_interval=datetime.timedelta(days=1),
        default_args=default_dag_args) as dag:

    run_dataproc_hive_create_db = DataProcHiveOperator(
        task_id='create_db',
        gcp_conn_id='google_cloud_default',
        query="CREATE DATABASE IF NOT EXISTS default3 LOCATION 'gs://dphivedb/HQL/CSV/test/';",
        cluster_name='dataproc',
        region='us-west1',
        dag=dag)

    delete_dataproc_cluster = dataproc_operator.DataprocClusterDeleteOperator(
        task_id='delete_dataproc_cluster',
        cluster_name='dataproc',
        region='us-west1',
        trigger_rule=trigger_rule.TriggerRule.ALL_DONE)

    run_dataproc_hive_create_db >> delete_dataproc_cluster
        from bektova.stg_traffic
        where year(from_unixtime(cast(`timestamp`/1000 as int))) = {{ execution_date.year }};
        """
    elif task == 'payment':
        query = """
        insert overwrite table bektova.ods_payment partition (year='{{ execution_date.year }}')
        select user_id, pay_doc_type, pay_doc_num, account, phone, billing_period,
            cast(pay_date as DATE), cast(sum as DECIMAL(10,2))
        from bektova.stg_payment
        where year(pay_date) = {{ execution_date.year }};
        """

    ods.append(
        DataProcHiveOperator(
            task_id='ods_' + task,
            dag=dag,
            query=query,
            cluster_name='cluster-dataproc',
            job_name=username + '_{{ execution_date.year }}_ods_' + task + '_{{ params.job_suffix }}',
            params={"job_suffix": randint(0, 100000)},
            region='europe-west3',
        ))

dm = DataProcHiveOperator(
    task_id='dm_traffic',
    dag=dag,
    query="""
        insert overwrite table bektova.dm_traffic partition (year='{{ execution_date.year }}')
        select user_id, max(bytes_received), min(bytes_received),
            round(avg(bytes_received)) as avg_bytes_received
        from bektova.ods_traffic
        where year = {{ execution_date.year }}
        group by user_id
        order by avg_bytes_received;
    """,
    cluster_name='cluster-dataproc',
    job_name=username +
        shiftleft(cast(split(device_ip_addr, '[\.]')[1] as int), 16) +
        shiftleft(cast(split(device_ip_addr, '[\.]')[2] as int), 8) +
        cast(split(device_ip_addr, '[\.]')[3] as int),
        cast(bytes_sent as bigint),
        cast(bytes_received as bigint)
        -- Below: checked it, only a double cast gives a correct result in year; otherwise the milliseconds would have to be trimmed
        from izykov.stg_traffic
        where year(cast(cast(`timestamp` as bigint) as timestamp)) = {{ execution_date.year }};
        """
}

for table, hiveql in ods_tables.items():
    dpho = DataProcHiveOperator(
        task_id=user_name + '_ods_' + table,
        dag=dag,
        query=hiveql,
        cluster_name='cluster-dataproc',
        job_name=user_name + '_ods_' + table + '_{{ execution_date.year }}_{{ params.job_suffix }}',
        params={"job_suffix": randint(0, 100000)},
        region='europe-west3',
    )
    if table == 'traffic':
        dphoc = DataProcHiveOperator(
            task_id=user_name + '_dm_' + table,
            dag=dag,
            query="""
            /* Notes on the data mart:
               Despite a recent Hive version (maybe this particular build is to blame?),
               the create materialized view ... partitioned on / by variant would not work at all.
               The errors were like:
               FAILED: ParseException line 1:52 mismatched input 'partitioned' expecting AS near 'rewrite' in create materialized view statement
    'dm_traffic': [
        'user_id, MAX(bytes_received), MIN(bytes_received), AVG(bytes_received)',
        'ods_traffic',
        'year'
    ]
}

for i in tables:
    params = randint(0, 100000)
    if i != 'dm_traffic':
        data_proc = DataProcHiveOperator(
            task_id=i,
            dag=dag,
            query="INSERT OVERWRITE TABLE alevanov." + i + " PARTITION (year={{ execution_date.year }})\n" +
                  "SELECT " + tables[i][0] + " FROM alevanov." + tables[i][1] +
                  " WHERE year(" + tables[i][2] + ") = '{{ execution_date.year }}';",
            cluster_name='cluster-dataproc',
            job_name=USERNAME + '_' + i + '_{{ execution_date.year }}_{{ params.job_suffix }}',
            params={"job_suffix": randint(0, 100000)},
            region='europe-west3',
        )
    if i == 'ods_traffic':
        date_proc_dm = DataProcHiveOperator(
            task_id='dm_traffic',
            dag=dag,
            query="INSERT OVERWRITE TABLE alevanov.dm_traffic PARTITION (year={{ execution_date.year }})\n" +
                  "SELECT " + tables['dm_traffic'][0] + " FROM alevanov." + tables['dm_traffic'][1] +
                  " WHERE " + tables['dm_traffic'][2] + " = '{{ execution_date.year }}' GROUP BY user_id;",
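# The loop above indexes tables[i] as [select expression, source table, date
# column].  A guessed sketch of how the full `tables` dict might be laid out;
# the 'ods_billing' entry below is illustrative only (column names assumed),
# while the 'dm_traffic' entry is the one shown in the fragment above.
tables = {
    'ods_billing': [
        'user_id, billing_period, service, tariff, sum, created_at',
        'stg_billing',
        'created_at'
    ],
    'dm_traffic': [
        'user_id, MAX(bytes_received), MIN(bytes_received), AVG(bytes_received)',
        'ods_traffic',
        'year'
    ],
}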
from datetime import timedelta, datetime
from random import randint

from airflow import DAG
from airflow.contrib.operators.dataproc_operator import DataProcHiveOperator

USERNAME = '******'

default_args = {'owner': USERNAME, 'start_date': datetime(2012, 1, 1, 0, 0, 0)}

dag = DAG(
    USERNAME + '_data_lake_etl',
    default_args=default_args,
    description='Data Lake ETL tasks',
    schedule_interval="0 0 1 1 *",
)

ods_billing = DataProcHiveOperator(
    task_id='ods_billing',
    dag=dag,
    query="""
        insert overwrite table emateshuk.ods_billing partition (year='{{ execution_date.year }}')
        select * from emateshuk.stg_billing
        where year(created_at) = {{ execution_date.year }};
    """,
    cluster_name='cluster-dataproc',
    job_name=USERNAME + '_ods_billing_{{ execution_date.year }}_{{ params.job_suffix }}',
    params={"job_suffix": randint(0, 100000)},
    region='europe-west3',
)
    # If a task fails, retry it once after waiting at least 5 minutes
    'retries': 1,
    'retry_delay': datetime.timedelta(minutes=5),
    'project_id': models.Variable.get('gcp_project')
}

with models.DAG(
        'composer_sample_quickstart',
        # Continue to run DAG once per day
        schedule_interval=datetime.timedelta(days=1),
        default_args=default_dag_args) as dag:

    HiveInsertingTable = DataProcHiveOperator(
        task_id='HiveInsertingTable',
        gcp_conn_id='google_cloud_default',
        query="CREATE EXTERNAL TABLE trades_sample6(trading_date_time TIMESTAMP,network CHAR(1),message_category CHAR(1),message_type CHAR(1),message_sequence BIGINT,market_exchange CHAR(1),symbol VARCHAR(10),trade_price DOUBLE,trade_size BIGINT,trade_conditions VARCHAR(6),trade_conditions2 VARCHAR(6) )ROW FORMAT DELIMITED FIELDS TERMINATED BY ','LOCATION 'gs://market-data11-bucket/data/';",
        cluster_name='YourClusterName',
        region='us-central1',
        dag=dag)

    QuerytoGS = DataProcHiveOperator(
        task_id='QuerytoGS',
        gcp_conn_id='google_cloud_default',
        query="INSERT OVERWRITE DIRECTORY 'gs://market-data11-bucket/output/' ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' LINES TERMINATED BY '\n' SELECT * FROM trades_sample6;",
        cluster_name='YourClusterName',
        region='us-central1',
        dag=dag)

    QuerytoGS.set_upstream(HiveInsertingTable)
dag = DAG(
    USERNAME + '_data_lake_complete',
    default_args=default_args,
    description='Data Lake Complete ETL by sperfilyev',
    schedule_interval="@yearly",
)

ods_tasks = {}
for ods_table in metadata_ods.keys():
    ods_tasks[ods_table] = DataProcHiveOperator(
        query=generate_ods_fill(
            ods_table,
            metadata_ods[ods_table]['field_of_partition'],
            metadata_ods[ods_table]['fields_to_import'],
        ),
        cluster_name='cluster-dataproc',
        job_name=generate_ods_job(ods_table),
        params={"job_suffix": randint(0, 100000)},
        region='europe-west3',
        task_id='ods_' + ods_table,
        dag=dag,
    )

dm_tasks = {}
for dm_table in metadata_dm.keys():
    dm_tasks[dm_table] = DataProcHiveOperator(
        query=generate_dm_fill(
            dm_table,
            metadata_dm[dm_table]['field_aggregate'],
            metadata_dm[dm_table]['field_group_by'],
        ),
    main_class="org.apache.spark.examples.SparkPi",
    dataproc_jars="file:///usr/lib/spark/examples/jars/spark-examples.jar",
    region=REGION,
    cluster_name=CLUSTER_NAME,
)

pyspark_task = DataProcPySparkOperator(
    task_id="pyspark_task",
    main=PYSPARK_URI,
    region=REGION,
    cluster_name=CLUSTER_NAME,
)

hive_task = DataProcHiveOperator(
    task_id="hive_task",
    query="SHOW DATABASES;",
    region=REGION,
    cluster_name=CLUSTER_NAME,
)

hadoop_task = DataProcHadoopOperator(
    task_id="hadoop_task",
    main_jar="file:///usr/lib/hadoop-mapreduce/hadoop-mapreduce-examples.jar",
    arguments=["wordcount", "gs://pub/shakespeare/rose.txt", OUTPUT_PATH],
    region=REGION,
    cluster_name=CLUSTER_NAME,
)

delete_cluster = DataprocClusterDeleteOperator(
    task_id="delete_cluster",
    project_id=PROJECT_ID,
    cluster_name=CLUSTER_NAME,
        query = """
        insert overwrite table ods.issue partition (year='{{ execution_date.year }}')
        select cast(user_id as INT), cast(start_time as TIMESTAMP), cast(end_time as TIMESTAMP),
            title, description, service
        from stg.issue
        where year(start_time) = {{ execution_date.year }};
        """
    elif task == 'payment':
        query = """
        insert overwrite table ods.payment partition (year='{{ execution_date.year }}')
        select user_id, pay_doc_type, pay_doc_num, account, phone, billing_period,
            cast(pay_date as DATE), cast(sum as DECIMAL(10,2))
        from stg.payment
        where year(pay_date) = {{ execution_date.year }};
        """

    ods.append(
        DataProcHiveOperator(
            task_id='ods_' + task,
            dag=dag,
            query=query,
            cluster_name='cluster-dataproc',
            region='us-central1',
        ))

dm = DataProcHiveOperator(
    task_id='dm_traffic',
    dag=dag,
    query="""
        insert overwrite table dm.traffic partition (year='{{ execution_date.year }}')
        select user_id, max(bytes_received), min(bytes_received),
            round(avg(bytes_received)) as avg_traf
        from ods.traffic
        where year = {{ execution_date.year }}
        group by user_id
        order by avg_traf;
    """,
    cluster_name='cluster-dataproc',
    region='us-central1',
)
    USERNAME + '_data_lake_etl_traffic',
    default_args=default_args,
    description='Data Lake ETL tasks',
    schedule_interval="0 0 1 1 *",
)

ods_traffic = DataProcHiveOperator(
    task_id='ods_traffic',
    dag=dag,
    query="""
        INSERT OVERWRITE TABLE asamoilov.ods_traffic PARTITION (year = {{ execution_date.year }})
        SELECT user_id,
            from_unixtime(CAST(`timestamp`/1000 as BIGINT)) AS traffic_time,
            device_id,
            device_ip_addr,
            bytes_sent,
            bytes_received
        FROM asamoilov.stg_traffic
        WHERE year(from_unixtime(CAST(`timestamp`/1000 as BIGINT))) = {{ execution_date.year }};
    """,
    cluster_name='cluster-dataproc',
    job_name=USERNAME + '_ods_traffic_{{ execution_date.year }}_{{ params.job_suffix }}',
    params={"job_suffix": randint(0, 100000)},
    region='europe-west3',
)

dm_traffic = DataProcHiveOperator(
    task_id='dm_traffic',
    dag=dag,
    query="""
        INSERT OVERWRITE TABLE asamoilov.dm_traffic PARTITION (year = {{ execution_date.year }})
        schedule_interval=None) as dag:

    # Create a Cloud Dataproc cluster with one node
    create_dataproc_cluster = DataprocClusterCreateOperator(
        task_id='create_dataproc_cluster',
        project_id=PROJECT_ID,
        cluster_name=CLUSTER_NAME,
        num_workers=0,
        region=REGION_ID,
        zone=ZONE,
        # service_account='*****@*****.**',
        # service_account='*****@*****.**',
        master_machine_type='n1-standard-1')

    submit_hive_task = DataProcHiveOperator(
        task_id='hive_submit',
        project_id=PROJECT_ID,
        cluster_name=CLUSTER_NAME,
        query_uri=HQL_BUCKET + HQL_SCRIPT_NAME,
        # dataproc_hive_jars=[UDF_BUCKET + UDF_JAR_MANE],
        # variables={'PROJECT_ID': PROJECT_ID},
        region=REGION_ID)

    delete_dataproc_cluster = DataprocClusterDeleteOperator(
        task_id='delete_dataproc_cluster',
        project_id=PROJECT_ID,
        cluster_name=CLUSTER_NAME,
        trigger_rule=TriggerRule.ALL_DONE)

    create_dataproc_cluster >> submit_hive_task >> delete_dataproc_cluster
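# Placeholder definitions for the constants the create/submit/delete DAG above
# relies on; the values are assumptions for illustration, not the originals.
PROJECT_ID = 'my-gcp-project'
CLUSTER_NAME = 'ephemeral-hive-cluster'
REGION_ID = 'europe-west1'
ZONE = 'europe-west1-b'
HQL_BUCKET = 'gs://my-hql-bucket/'
HQL_SCRIPT_NAME = 'queries/sample.hql'
UDF_BUCKET = 'gs://my-udf-bucket/'
UDF_JAR_MANE = 'udfs/custom-udf.jar'  # identifier kept exactly as referenced in the snippet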
    region=my_region,
    num_workers=2,
    storage_bucket=my_bucket,
    master_machine_type=my_instance,
    master_disk_size=my_disk_size,
    worker_machine_type=my_instance,
    worker_disk_size=my_disk_size,
    num_preemptible_workers=0,  # use scale out/in operator
    zone=my_zone,
    idle_delete_ttl=my_idle_delete_ttl,
    dag=dag)

drop_if_exists_src_table = DataProcHiveOperator(
    task_id='drop_if_exists_src_table',
    job_name='drop_if_exists_src_table_job_name',
    cluster_name=my_cluster_name,
    region=my_region,
    query=drop_if_exists_src_table
)

drop_if_exists_dst_table = DataProcHiveOperator(
    task_id='drop_if_exists_dst_table',
    job_name='drop_if_exists_dst_table_job_name',
    cluster_name=my_cluster_name,
    region=my_region,
    query=drop_if_exists_dst_table
)

create_external_src_table = DataProcHiveOperator(
    task_id='create_external_src_table',
    job_name='create_external_src_table_job_name',
default_args = {'owner': USERNAME, 'start_date': datetime(2012, 1, 1, 0, 0, 0)}

dag = DAG(
    USERNAME + '_data_lake_etl_payment',
    default_args=default_args,
    description='Data Lake ETL tasks',
    schedule_interval="0 0 1 1 *",
)

ods_payment = DataProcHiveOperator(
    task_id='ods_payment',
    dag=dag,
    query="""
        INSERT OVERWRITE TABLE asamoilov.ods_payment PARTITION (year = {{ execution_date.year }})
        SELECT user_id,
            pay_doc_type,
            pay_doc_num,
            account,
            phone,
            CAST(from_unixtime(unix_timestamp(billing_period, 'yyyy-MM')) AS DATE) AS billing_period,
            CAST(pay_date AS TIMESTAMP) AS pay_date,
            CAST(sum AS DECIMAL(10,2)) AS sum
        FROM asamoilov.stg_payment
        WHERE year(pay_date) = {{ execution_date.year }};
    """,
    cluster_name='cluster-dataproc',
    job_name=USERNAME + '_ods_payment_{{ execution_date.year }}_{{ params.job_suffix }}',
    params={"job_suffix": randint(0, 100000)},
    region='europe-west3',
)
USERNAME = '******'

default_args = {'owner': USERNAME, 'start_date': datetime(2012, 1, 1, 0, 0, 0)}

dag = DAG(
    USERNAME + '_data_lake_etl_billing',
    default_args=default_args,
    description='Data Lake ETL tasks',
    schedule_interval="0 0 1 1 *",
)

ods_billing = DataProcHiveOperator(
    task_id='ods_billing',
    dag=dag,
    query="""
        INSERT OVERWRITE TABLE asamoilov.ods_billing PARTITION (year = {{ execution_date.year }})
        SELECT user_id,
            CAST(from_unixtime(unix_timestamp(billing_period, 'yyyy-MM')) AS DATE) AS billing_period,
            service,
            tariff,
            CAST(sum AS DECIMAL(10,2)) AS sum,
            CAST(created_at AS TIMESTAMP) AS created_at
        FROM asamoilov.stg_billing
        WHERE year(created_at) = {{ execution_date.year }};
    """,
    cluster_name='cluster-dataproc',
    job_name=USERNAME + '_ods_billing_{{ execution_date.year }}_{{ params.job_suffix }}',
    params={"job_suffix": randint(0, 100000)},
    region='europe-west3',
)
default_args = {'owner': USERNAME, 'start_date': datetime(2012, 1, 1, 0, 0, 0)}

dag = DAG(
    USERNAME + '_data_lake_etl',
    default_args=default_args,
    description='Data Lake ETL tasks',
    schedule_interval="0 0 1 1 *",
)

ods_billing = DataProcHiveOperator(
    task_id='ods_billing',
    dag=dag,
    query="""
        insert overwrite table dlybin.ods_billing partition (year={{ execution_date.year }})
        select user_id,cast(replace(billing_period,"-","") as int),service,tariff,sum,created_at
        from dlybin.stg_billing
        where year(created_at) = {{ execution_date.year }};
    """,
    cluster_name='cluster-dataproc',
    job_name=USERNAME + '_ods_billing_{{ execution_date.year }}_{{ params.job_suffix }}',
    params={"job_suffix": randint(0, 100000)},
    region='europe-west3',
)

ods_issue = DataProcHiveOperator(
    task_id='ods_issue',
    dag=dag,
    query="""
        insert overwrite table dlybin.ods_issue partition (year={{ execution_date.year }})
        select user_id,cast(start_time as timestamp),cast(end_time as timestamp),title,description,service
        from dlybin.stg_issue
        where year(end_time) = {{ execution_date.year }};
    """,
    cluster_name='cluster-dataproc',
dag = airflow.DAG('Import-MySQL-to-GS-and-DataProc',
                  catchup=False,
                  default_args=default_args,
                  schedule_interval=datetime.timedelta(days=1))

t1 = DataProcPySparkOperator(
    task_id='import-mysql-data',
    main='gs://mysqlnosql/spark_jdbc_to_gs.py',
    cluster_name='mydataproc2',
    region='us-central1',
    dataproc_pyspark_jars=['gs://mysqlnosql/spark-avro.jar'],
    dag=dag)

t2 = DataProcHiveOperator(
    query=query_part,
    cluster_name='mydataproc2',
    region='us-central1',
    task_id='create_table_in_hive_2_cols',
    dag=dag)

t3 = DataProcHiveOperator(
    query=query_all,
    cluster_name='mydataproc2',
    region='us-central1',
    task_id='create_table_in_hive_all_cols',
    dag=dag)

t4 = DataFlowPythonOperator(
    py_file='gs://mysqlnosql/beam_gcs_bt.py',
    task_id='loadfrom-gcs-to-bt',
    dataflow_default_options={'project': '<yourprojectid>'},
    options={
        'avro_input': 'gs://mysqldataflow/avro/customer/',
    project_id='dataproc-usecase-276215',
    region='us-west1',
    num_masters=1,
    num_workers=2,
    zone='us-west1-b',
    master_machine_type='n1-standard-1',
    master_disk_size=100,
    worker_disk_size=100,
    num_preemptible_workers=0,
    worker_machine_type='n1-standard-1',
    idle_delete_ttl=1800,
    subnetwork_uri='ctl',
    optional_components=['PRESTO', 'SOLR', 'RANGER'],
    # service_account_scopes=['cloud-platform','default','sql-admin'],
    # init_actions_uris=['gs://us-west4-test1-d1c785e8-bucket/data/cloud-sql-proxy/cloud-sql-proxy.sh'])
    # metadata={'kms-key-uri': 'projects/dataproc-usecase-276215/locations/global/keyRings/my-key-ring/cryptoKeys/migration-key', 'db-hive-password-uri': 'gs://secrectkeybucket2/hive-password.encrypted'})
    metadata={'kms-key-uri': 'projects/dataproc-usecase-276215/locations/global/keyRings/my-key-ring/cryptoKeys/migration-key',
              'db-hive-password-uri': 'gs://secrectkeybucket2/hive-password.encrypted',
              'use-cloud-sql-private-ip': 'true',
              'hive-metastore-instance': 'dataproc-usecase-276215:us-west1:hive-metadata'})

# Run the Hive job on the Cloud Dataproc cluster
run_dataproc_hive_create_db = DataProcHiveOperator(
    task_id='create_db',
    gcp_conn_id='google_cloud_default',
    query_uri="gs://data-bucket-hive2/HQL/create_db.hql",
    cluster_name='dataproc',
    region='us-west1',
    dag=dag)

# Define DAG dependencies.
# create_dataproc_cluster >> run_dataproc_hive >> delete_dataproc_cluster
create_dataproc_cluster >> run_dataproc_hive_create_db
}

with models.DAG(
        'gcp_poc_automation1',
        schedule_interval=datetime.timedelta(days=1),
        default_args=default_dag_args) as dag:

    create_dataproc_cluster = BashOperator(
        task_id='create_dataproc_cluster',
        bash_command='gcloud dataproc clusters create test --region us-central1 --subnet default --zone us-central1-a --master-machine-type n1-standard-1 --master-boot-disk-size 100 --num-workers 2 --worker-machine-type n1-standard-1 --worker-boot-disk-size 100 --image-version 1.3-debian10 --project dataproc-usecase-286310',
        dag=dag)

    dataproc_hive_create_db = DataProcHiveOperator(
        task_id='create_db',
        gcp_conn_id='google_cloud_default',
        query="CREATE DATABASE IF NOT EXISTS default_autotestbqdb9 LOCATION 'gs://dataproc-staging-us-central1-788915459809-fk8wm6rc/HQL/';",
        cluster_name='dataproc',
        region='us-west1',
        dag=dag)

    dataproc_hive_create_table_par = DataProcHiveOperator(
        task_id='dataproc_hive_create_table_par',
        gcp_conn_id='google_cloud_default',
        query="CREATE EXTERNAL TABLE IF NOT EXISTS default.chicago_taxi_trips_parquet_autotestbq9(unique_key STRING,taxi_id STRING,trip_start_timestamp STRING, trip_end_timestamp STRING, trip_seconds STRING, trip_miles STRING, pickup_census_tract STRING, dropoff_census_tract STRING, pickup_community_area STRING, dropoff_community_area STRING, fare STRING, tips STRING, tolls STRING, extras STRING, trip_total STRING, payment_type STRING, company STRING, pickup_latitude STRING, pickup_longitude STRING, pickup_location STRING,dropoff_latitude STRING, dropoff_longitude STRING, dropoff_location STRING) STORED AS PARQUET location 'gs://dataproc-staging-us-central1-788915459809-fk8wm6rc/HQL/PARQUET/';",
        cluster_name='dataproc',
        region='us-west1',
        dag=dag)

    dataproc_hive_create_table_csv = DataProcHiveOperator(
        task_id='dataproc_hive_create_table_csv',
            , `account`
            , `phone`
            , cast(concat(`billing_period`,'-01') as DATE)
            , cast(`pay_date` as DATE)
            , `sum`
        from pryzhov.stg_payment
        where year(cast(`pay_date` as DATE)) = {{ execution_date.year }}
    '''

    ods = DataProcHiveOperator(
        task_id='ods_%s' % table,
        dag=dag,
        query=query,
        job_name='%s_{{ execution_date.year }}_ods_%s_{{ params.job_suffix }}' % (USERNAME, table),
        params={"job_suffix": randint(0, 100000)},
        cluster_name='cluster-dataproc',
        region='europe-west3',
    )

    load_op >> ods

    if table == 'traffic':
        dm = DataProcHiveOperator(
            task_id='dm_%s' % table,
            dag=dag,
            query='''
                insert overwrite table pryzhov.dm_traffic partition (year={{ execution_date.year }})
                select `user_id`
                    , max(`bytes_received`)