def test_hiveconf_jinja_translate(self): hql = "SELECT ${num_col} FROM ${hiveconf:table};" t = HiveOperator( hiveconf_jinja_translate=True, task_id='dry_run_basic_hql', hql=hql, dag=self.dag) t.prepare_template() self.assertEqual(t.hql, "SELECT {{ num_col }} FROM {{ table }};")
def setUp(self): configuration.load_test_config() args = {'owner': 'airflow', 'start_date': DEFAULT_DATE} self.dag = DAG('test_dag_id', default_args=args) self.next_day = (DEFAULT_DATE + datetime.timedelta(days=1)).isoformat()[:10] self.database = 'airflow' self.partition_by = 'ds' self.table = 'static_babynames_partitioned' self.hql = """ CREATE DATABASE IF NOT EXISTS {{ params.database }}; USE {{ params.database }}; DROP TABLE IF EXISTS {{ params.table }}; CREATE TABLE IF NOT EXISTS {{ params.table }} ( state string, year string, name string, gender string, num int) PARTITIONED BY ({{ params.partition_by }} string); ALTER TABLE {{ params.table }} ADD PARTITION({{ params.partition_by }}='{{ ds }}'); """ self.hook = HiveMetastoreHook() t = HiveOperator( task_id='HiveHook_' + str(random.randint(1, 10000)), params={ 'database': self.database, 'table': self.table, 'partition_by': self.partition_by }, hive_cli_conn_id='beeline_default', hql=self.hql, dag=self.dag) t.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE, ignore_ti_state=True)
def setUp(self): configuration.load_test_config() self._upload_dataframe() args = {'owner': 'airflow', 'start_date': DEFAULT_DATE} self.dag = DAG('test_dag_id', default_args=args) self.database = 'airflow' self.table = 'hive_server_hook' self.hql = """ CREATE DATABASE IF NOT EXISTS {{ params.database }}; USE {{ params.database }}; DROP TABLE IF EXISTS {{ params.table }}; CREATE TABLE IF NOT EXISTS {{ params.table }} ( a int, b int) ROW FORMAT DELIMITED FIELDS TERMINATED BY ','; LOAD DATA LOCAL INPATH '{{ params.csv_path }}' OVERWRITE INTO TABLE {{ params.table }}; """ self.columns = ['{}.a'.format(self.table), '{}.b'.format(self.table)] self.hook = HiveMetastoreHook() t = HiveOperator( task_id='HiveHook_' + str(random.randint(1, 10000)), params={ 'database': self.database, 'table': self.table, 'csv_path': self.local_path }, hive_cli_conn_id='beeline_default', hql=self.hql, dag=self.dag) t.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE, ignore_ti_state=True)
def test_hive_queues(self): t = HiveOperator( task_id='test_hive_queues', hql=self.hql, mapred_queue='default', mapred_queue_priority='HIGH', mapred_job_name='airflow.test_hive_queues', dag=self.dag) t.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE, ignore_ti_state=True)
def test_hiveconf(self): hql = "SELECT * FROM ${hiveconf:table} PARTITION (${hiveconf:day});" t = HiveOperator( hiveconfs={'table': 'static_babynames', 'day': '{{ ds }}'}, task_id='dry_run_basic_hql', hql=hql, dag=self.dag) t.prepare_template() self.assertEqual( t.hql, "SELECT * FROM ${hiveconf:table} PARTITION (${hiveconf:day});")
def test_hive_airflow_default_config_queue_override(self): specific_mapred_queue = 'default' t = HiveOperator( task_id='test_default_config_queue', hql=self.hql, mapred_queue=specific_mapred_queue, mapred_queue_priority='HIGH', mapred_job_name='airflow.test_default_config_queue', dag=self.dag) self.assertEqual(t.get_hook().mapred_queue, specific_mapred_queue)
def test_hive_airflow_default_config_queue(self): t = HiveOperator( task_id='test_default_config_queue', hql=self.hql, mapred_queue_priority='HIGH', mapred_job_name='airflow.test_default_config_queue', dag=self.dag) # just check that the correct default value in test_default.cfg is used test_config_hive_mapred_queue = configuration.conf.get( 'hive', 'default_hive_mapred_queue' ) self.assertEqual(t.get_hook().mapred_queue, test_config_hive_mapred_queue)
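# A minimal sketch (not the actual HiveCliHook source) of the queue-selection behaviour
# the two queue tests above exercise: an explicit mapred_queue passed to the operator
# wins, otherwise the hook falls back to [hive] default_hive_mapred_queue from the
# Airflow config. resolve_mapred_queue is a hypothetical helper name.
from airflow import configuration

def resolve_mapred_queue(explicit_queue=None):
    if explicit_queue:
        return explicit_queue
    # same lookup the test uses to read the expected default from test_default.cfg
    return configuration.conf.get('hive', 'default_hive_mapred_queue')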
def test_mapred_job_name(self, mock_get_hook): mock_hook = mock.MagicMock() mock_get_hook.return_value = mock_hook t = HiveOperator( task_id='test_mapred_job_name', hql=self.hql, dag=self.dag) fake_execution_date = timezone.datetime(2018, 6, 19) fake_ti = TaskInstance(task=t, execution_date=fake_execution_date) fake_ti.hostname = 'fake_hostname' fake_context = {'ti': fake_ti} t.execute(fake_context) self.assertEqual( "Airflow HiveOperator task for {}.{}.{}.{}" .format(fake_ti.hostname, self.dag.dag_id, t.task_id, fake_execution_date.isoformat()), mock_hook.mapred_job_name)
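# The extra mock_get_hook parameter in test_mapred_job_name implies the test body runs
# under a mock.patch decorator roughly like the sketch below; the exact dotted path
# being patched is an assumption, it is not shown in the snippet above.
from unittest import mock

@mock.patch('airflow.operators.hive_operator.HiveOperator.get_hook')
def test_mapred_job_name(self, mock_get_hook):
    ...  # body as above: execute the operator, then assert on mock_hook.mapred_job_name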
bash_command='hadoop fs -rm -r -f ' + wf_data_path + '/dt={{ yesterday_ds }}/_SUCCESS', dag=dag ) delete_hdfs_file_2 = BashOperator( task_id='delete_hdfs_file_2', bash_command='hadoop fs -rm -r -f ' + wf_data_path + '/dt={{ ds }}/_SUCCESS', dag=dag ) af_adt_advertiser_revenue_report_ag_hql = HiveOperator( task_id='af_adt_advertiser_revenue_report_ag_hql', hive_cli_conn_id='hiveserver2_default', depends_on_past=True, hql=open(af_hql_path + "/hql_adt_advertiser_revenue_report_ag.hql", 'r').read().replace("hivevar", "hiveconf"), hiveconfs={'data_date': '{{ ds }}', 'data_date_before': '{{ yesterday_ds }}', 'data_date_last_hour': '{{ (execution_date - macros.timedelta(hours=3)).strftime("%Y-%m-%d") }}', 'data_date_next_hour': '{{ (execution_date + macros.timedelta(hours=3)).strftime("%Y-%m-%d") }}'}, params={'wf_data_path': wf_data_path, 'date_enable': True}, dag=dag ) """ Create the _SUCCESS marker file """ create_hdfs_file = BashOperator( task_id='create_hdfs_file', bash_command='hadoop fs -touchz ' + wf_data_path + '/dt={{ yesterday_ds }}/_SUCCESS', dag=dag )
def test_beeline(self): t = HiveOperator( task_id='beeline_hql', hive_cli_conn_id='beeline_default', hql=self.hql, dag=self.dag) t.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE, ignore_ti_state=True)
'db': obus_table.get('db'), 'table': obus_table.get('table'), 'conn': obus_table.get('conn') }, dag=dag ) ''' Add the Hive table partition ''' add_partitions = HiveOperator( task_id='add_partitions_{}'.format(hive_table.format(bs=obus_table.get('table'))), priority_weight=obus_table.get('priority_weight'), hql=''' ALTER TABLE {hive_db}.{table} ADD IF NOT EXISTS PARTITION (country_code='nal', dt='{{{{ ds }}}}') '''.format( hive_db=hive_db, table=hive_table.format(bs=obus_table.get('table')) ), schema=hive_db, dag=dag ) ''' Verify that the data was imported correctly ''' validate_all_data = PythonOperator( task_id='validate_data_{}'.format(hive_table.format(bs=obus_table.get('table'))), priority_weight=obus_table.get('priority_weight'), python_callable=validata_data, provide_context=True, op_kwargs={
dag = DAG('dw_commerce_contract_d', default_args=default_args, schedule_interval='4 1 * * *') start = SqlSensor( task_id='start', conn_id='etl_db', sql= "SELECT * FROM etl.signal WHERE name='dw_sso_basic_d' AND value='{{ macros.ds(ti) }}';", dag=dag) # contract del_partiton_stg_contract = HiveOperator( task_id='del_partiton_stg_contract', hive_cli_conn_id='spark_thrift', hql= "alter table stg.newuuabc_contract drop if exists PARTITION (etl_date='{{ macros.ds(ti) }}');\n ", dag=dag) src_stg_contract = BashOperator( task_id='src_stg_contract', bash_command= 'dataship extract uuold_newuuabc.contract {{ macros.ds(ti) }} {{ macros.tomorrow_ds(ti) }}', pool="embulk_pool", dag=dag) add_partiton_stg_contract = HiveOperator( task_id='add_partiton_stg_contract', hive_cli_conn_id='spark_thrift', hql= "alter table stg.newuuabc_contract add PARTITION (etl_date='{{ macros.ds(ti) }}');\n ",
dag = airflow.DAG('init_datavault2_example', schedule_interval="@once", default_args=args, template_searchpath='/usr/local/airflow/sql', max_active_runs=1) t1 = PythonOperator(task_id='init_datavault2_example', python_callable=init_datavault2_example, provide_context=False, dag=dag) t2 = HiveOperator( task_id='create_stg_database', hive_cli_conn_id='hive_default', schema='default', hql='CREATE DATABASE IF NOT EXISTS {0}'.format(ADVWORKS_STAGING), dag=dag) t3 = HiveOperator(task_id='create_dv_database', hive_cli_conn_id='hive_default', schema='default', hql='CREATE DATABASE IF NOT EXISTS {0}'.format(DATAVAULT), dag=dag) t4 = HiveOperator(task_id='create_dv_temp', hive_cli_conn_id='hive_default', schema='default', hql='CREATE DATABASE IF NOT EXISTS {0}'.format(DV_TEMP), dag=dag)
dag = airflow.DAG('okash_hourly', schedule_interval="10 * * * *", default_args=args) check_client_file = OssSensor( task_id='check_client_file', bucket_key= 'okash/okash/{table}/dt={{{{ ds }}}}/hour={{{{ execution_date.strftime("%H") }}}}/' .format(table='client'), bucket_name='okash', timeout=3600, dag=dag) add_client_partitions = HiveOperator(task_id='add_client_partitions', hql=""" ALTER TABLE ods_log_client_hi ADD IF NOT EXISTS PARTITION (dt = '{{ ds }}', hour = '{{ execution_date.strftime("%H") }}'); """, schema='okash_dw', dag=dag) export_to_mysql = HiveToMySqlTransfer(task_id='export_to_mysql', sql=""" --SET mapreduce.job.queuename=root.airflow; SELECT *, from_unixtime(unix_timestamp()) FROM okash_dw.ods_log_client_hi WHERE dt='{{ ds }}' AND hour='{{ execution_date.strftime("%H") }}' """,
# - 家长表 dag = DAG('dw_student_basic_d', default_args=default_args, schedule_interval='0 1 * * *') start = SqlSensor( task_id='start', conn_id='src_main_db', sql= "SELECT * FROM restore_tracking.restore_log WHERE date(restored_time) = current_date;", dag=dag) del_partiton_stg_parents = HiveOperator( task_id='del_partiton_stg_parents', hive_cli_conn_id='spark_thrift', hql= "alter table stg.newuuabc_parents drop if exists PARTITION (etl_date='{{ macros.ds(ti) }}');\n ", dag=dag) src_stg_parents = BashOperator( task_id='src_stg_parents', bash_command= 'dataship extract uuold_newuuabc.parents {{ macros.ds(ti) }} {{ macros.tomorrow_ds(ti) }}', pool="embulk_pool", dag=dag) add_partiton_stg_parents = HiveOperator( task_id='add_partiton_stg_parents', hive_cli_conn_id='spark_thrift', hql= "alter table stg.newuuabc_parents add PARTITION (etl_date='{{ macros.ds(ti) }}');\n ",
create_obus_client_event = HiveOperator(task_id='create_obus_client_event', hql=""" CREATE TABLE IF NOT EXISTS `ods_log_client_event`( `ip` string, `server_ip` string, `timestamp` bigint, `common` struct< `user_id`:string, `user_number`:string, `client_timestamp`:string, `platform`:string, `os_version`:string, `app_name`:string, `app_version`:string, `locale`:string, `device_id`:string, `device_screen`:string, `device_model`:string, `device_manufacturer`:string, `is_root`:string, `channel`:string, `subchannel`:string, `gaid`:string, `appsflyer_id`:string >, `events` Array< struct< `event_time`:string, `event_name`:string, `page`:string, `source`:string, `event_value`:string > > ) PARTITIONED BY ( `dt` string, `hour` string) ROW FORMAT SERDE 'org.openx.data.jsonserde.JsonSerDe' with SERDEPROPERTIES("ignore.malformed.json"="true") LOCATION 's3a://opay-bi/obus_buried/obdm.client_event' """, schema='obus_dw_ods', dag=dag)
dag = airflow.DAG('oride_daily', schedule_interval="00 03 * * *", default_args=args) create_oride_driver_overview = HiveOperator( task_id='create_oride_driver_overview', hql=""" CREATE TABLE IF NOT EXISTS oride_driver_overview( driver_id bigint, order_status int, payment_mode int, payment_status int, order_num_total bigint, price_total decimal(10,2), distance_total bigint, duration_total bigint ) PARTITIONED BY ( dt STRING ) STORED AS PARQUET """, schema='dashboard', dag=dag) insert_oride_driver_overview = HiveOperator( task_id='insert_oride_driver_overview', hql=""" INSERT OVERWRITE TABLE oride_driver_overview PARTITION (dt='{{ ds }}')
def test_hive_dryrun(self): t = HiveOperator(task_id='dry_run_basic_hql', hql=self.hql, dag=self.dag) t.dry_run()
hql = """ CREATE EXTERNAL TABLE IF NOT EXISTS dwdii2.noaa_temps ( station string, year int, jan string, feb string, mar string, apr string, may string, jun string, jul string, aug string, sep string, oct string, nov string, dec string ) PARTITIONED BY (exdt string) ROW FORMAT DELIMITED FIELDS TERMINATED BY ' ' STORED AS TEXTFILE LOCATION '{{ params.hdfs_destination }}'; """ createTable = HiveOperator(task_id='hive_create_ext_table', hql=hql, params=s3fetch_params, dag=dag) t1.set_upstream(cleanHdfs) createTable.set_upstream(t1)
from settings import default_args dag = DAG('dw_sso_basic_d', default_args=default_args, schedule_interval='1 1 * * *') start = SqlSensor( task_id='start', conn_id='src_main_db', sql="SELECT * FROM restore_tracking.restore_log WHERE date(restored_time) = current_date;", dag=dag) # bk_user_info delpart_stg_bk_user_info = HiveOperator( task_id='delpart_stg_bk_user_info', hive_cli_conn_id='spark_thrift', hql="alter table stg.sishu_bk_user_info drop if exists PARTITION (etl_date='{{ macros.ds(ti) }}');\n ", dag=dag) stg_bk_user_info = BashOperator( task_id='stg_bk_user_info', bash_command='dataship extract uuold_sishu.bk_user_info {{ macros.ds(ti) }} {{ macros.tomorrow_ds(ti) }}', pool="embulk_pool", dag=dag) addpart_stg_bk_user_info = HiveOperator( task_id='addpart_stg_bk_user_info', hive_cli_conn_id='spark_thrift', hql="alter table stg.sishu_bk_user_info add PARTITION (etl_date='{{ macros.ds(ti) }}');\n ", dag=dag)
def hook(dag, conn_id, tables, staging_dir='/tmp/airflow', staging_db=None, **options): staging_db = staging_db or 'staging_%s' % conn_id create_staging_db = HiveOperator(task_id='create_staging_db', hql='create database if not exists %s;' % staging_db, dag=dag) create_staging_dir = BashOperator(task_id='create_staging_dir', bash_command='hdfs dfs -mkdir -p %s' % staging_dir, dag=dag) for tbl in tables: table = { 'hive-database': None, 'hive-table': None, 'mappers': 1, 'direct': False, 'format': 'parquet', 'format-options': None, 'partition_fields': [], 'bucket_fields': [] } table.update(tbl) assert table['hive-database'] is not None if table['hive-table'] is None: table['hive-table'] = table['name'] staging_tbl_dir = os.path.join(staging_dir, conn_id, table['name']) clean_sqoop_staging = BashOperator( task_id=('clean_sqoop_staging_dir.%s' % (table['name'])).lower(), bash_command='hdfs dfs -rm -R -f %s' % staging_tbl_dir, dag=dag) clean_staging_tbl = HiveOperator( task_id=('clean_staging_table.%s' % (table['name'])).lower(), hql='''drop table if exists %(staging_db)s.%(staging_tbl)s''' % { 'staging_db': staging_db, 'staging_tbl': table['name'] }, dag=dag) sqoop = SqoopOperator(task_id=('sqoop.%s' % (table['name'])).lower(), conn_id=conn_id, table=table['name'], split_by=table['split_by'], num_mappers=table['mappers'], direct=table['direct'], target_dir=staging_tbl_dir, extra_import_options={ 'hive-import': '', 'hive-database': staging_db, 'hive-table': table['name'], 'hive-delims-replacement': ' ', 'temporary-rootdir': staging_dir, }, dag=dag) create_statement = ('create table %s.%s_tmp\n') % ( table['hive-database'], table['hive-table']) create_statement += 'stored as %s\n' % table['format'] format_opts = table.get('format-options', None) if format_opts: create_statement += '%s\n' % format_opts convert_to_parquet = HiveOperator( task_id=('hive_convert_format.%s' % (table['name'])).lower(), hql= ('create database if not exists %(dst_db)s;\n' 'drop table if exists %(dst_db)s.%(dst_tbl)s_tmp;\n' '%(create_statement)s' 'as select * from %(staging_db)s.%(staging_tbl)s;\n' 'drop table if exists %(dst_db)s.%(dst_tbl)s;\n' 'alter table %(dst_db)s.%(dst_tbl)s_tmp rename to %(dst_db)s.%(dst_tbl)s;\n' ) % { 'dst_db': table['hive-database'], 'dst_tbl': table['hive-table'], 'staging_db': staging_db, 'staging_tbl': table['name'], 'create_statement': create_statement }, dag=dag) clean_staging_tbl.set_upstream(create_staging_db) clean_sqoop_staging.set_upstream(create_staging_dir) sqoop.set_upstream(clean_sqoop_staging) sqoop.set_upstream(clean_staging_tbl) convert_to_parquet.set_upstream(sqoop)
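# A hypothetical invocation of the hook() helper above. The connection id and table
# spec are assumptions; only keys the function actually reads are shown (name,
# hive-database, split_by), everything else falls back to the defaults it defines.
from airflow import DAG
from airflow.utils.dates import days_ago

ingest_dag = DAG('sqoop_to_hive_ingest', start_date=days_ago(1), schedule_interval='@daily')
hook(
    ingest_dag,
    conn_id='mysql_default',           # assumed source connection
    tables=[{
        'name': 'orders',              # source table; also used as the Hive table name
        'hive-database': 'warehouse',  # target Hive database
        'split_by': 'id',              # column Sqoop splits the import on
    }],
    staging_dir='/tmp/airflow',
)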
CREATE EXTERNAL TABLE IF NOT EXISTS airflow_hive( `Global_new_confirmed` int, `Global_new_deaths` int, `Global_new_recovered` int, `Global_total_confirmed` int, `Global_total_deaths` int, `Global_total_recovered` int, `Country_code` string, `Country_name` string, `Country_new_deaths` int, `Country_new_recovered` int, `Country_newconfirmed` int, `Country_slug` string, `Country_total_confirmed` int, `Country_total_deaths` int, `Country_total_recovered` int, `Extracted_timestamp` timestamp); insert into table airflow_hive select * from corona; """ hive_task = HiveOperator(hql = hql_query, task_id = "hive_script_task", hive_cli_conn_id = "hive_local", dag = dag_hive ) hive_task if __name__ == '__main__': dag_hive.cli()
task_id='upload_to_hdfs', bash_command= f'hdfs dfs -put -f {DATA_FOLDER}/products-with-good-columns.csv /products-with-good-columns.csv', dag=dag, ) # Create Hive table create_product_lookup_table = HiveOperator( task_id='create_product_lookup_table', hive_cli_conn_id='my_hive_conn', hql=''' CREATE TABLE IF NOT EXISTS dim_product_lookup ( UPC STRING, DESCRIPTION STRING, MANUFACTURER STRING, CATEGORY STRING, SUB_CATEGORY STRING, PRODUCT_SIZE STRING ) ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' STORED AS TEXTFILE ''', dag=dag, ) # Load data in Hive table load_data_to_hive_table = HiveOperator( task_id='load_data_to_hive_table', hive_cli_conn_id='my_hive_conn', hql=''' LOAD DATA INPATH '/products-with-good-columns.csv' OVERWRITE INTO TABLE dim_product_lookup;
hql += str(line) if not line: break print("___________") print(hql) print("_____________________________________________________") return hql args['pool'] = 'pool_dw' # set the pool parameter separately for the sub DAG with DAG( # DAG_NAME, dag_id='test_dag', default_args=args, schedule_interval=None, description='dw test') as dag: dummy_dw_mysqlapp = DummyOperator(task_id="DW_Tasks_Start") dag >> dummy_dw_mysqlapp last_task = dummy_dw_mysqlapp for hqlfilepath in dw_list: task_id = hqlfilepath.split("/")[-1].split(".")[0] print('task_id ', task_id) current_task = HiveOperator(hive_cli_conn_id='hive_cli_emr', task_id=task_id, hiveconf_jinja_translate=True, hql=read_hql_file(hqlfilepath), trigger_rule='all_done', dag=dag) current_task.set_upstream(last_task) last_task = current_task
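# A hedged reconstruction of the read_hql_file helper whose tail appears above: it
# reads an .hql file line by line into a single string that is handed to HiveOperator.
# The readline loop and the open() arguments are assumptions.
def read_hql_file(hqlfilepath):
    hql = ''
    with open(hqlfilepath, 'r') as f:
        while True:
            line = f.readline()
            hql += str(line)
            if not line:
                break
    print("___________")
    print(hql)
    print("_____________________________________________________")
    return hql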
import reddit_scrape, clean_data from datetime import datetime from airflow import DAG from airflow.contrib.sensors.file_sensor import FileSensor from airflow.operators.python_operator import PythonOperator from airflow.operators.bash_operator import BashOperator from airflow.operators.hive_operator import HiveOperator default_args = {"start_date": datetime(2020, 1, 1), "owner": "jmce619"} with DAG(dag_id="subreddit_dag", schedule_interval="@daily", default_args=default_args, catchup=False) as dag: subreddit_scrape = PythonOperator(task_id="subreddit_scrape", python_callable=reddit_scrape.main) data_clean = PythonOperator(task_id="data_clean", python_callable=clean_data.main) storing_posts = BashOperator( task_id="storing_posts", bash_command='hadoop fs -put -f /tmp/subreddit_clean.csv /tmp/') loading_posts = HiveOperator( task_id="loading_posts", hql='LOAD DATA INPATH "/tmp/subreddit_clean.csv" INTO TABLE subreddit') subreddit_scrape >> data_clean >> storing_posts >> loading_posts
partition_names=[ 'fact_transactions/execution_date={{ macros.ds_add(ds, -1) }}' ], metastore_conn_id='my_hive_metastore_conn', poke_interval=30, dag=dag, ) create_product_transactions_table = HiveOperator( task_id='create_product_transactions_table', hive_cli_conn_id='my_hive_conn', hql=''' CREATE TABLE IF NOT EXISTS zkan_product_transactions ( product_description STRING, price DECIMAL(10, 2), units INT, visits INT ) PARTITIONED BY (execution_date DATE) ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' LINES TERMINATED BY '\n' STORED AS TEXTFILE; ''', dag=dag, ) add_new_product_transactions = HiveOperator( task_id='add_new_product_transactions', hive_cli_conn_id='my_hive_conn', hiveconfs={'hive.exec.dynamic.partition.mode': 'nonstrict'}, hql=''' INSERT INTO TABLE zkan_product_transactions SELECT dim_product_lookup.description,
args = { 'owner': 'Airflow', 'start_date': airflow.utils.dates.days_ago(1), } dag_prjt_main = DAG( dag_id=DAG_NAME, default_args=args, schedule_interval='* * * * *' #"@once" ) SQOOP_Task1 = BashOperator(task_id="Sqoop", bash_command='~/sqoop-1.4.7.bin__hadoop-2.6.0/bin/sqoop job --exec Sqoop_weblogdetails_test37', dag=dag_prjt_main) hive_cmd= """use test1; set hive.exec.dynamic.partition=true; set hive.exec.dynamic.partition.mode=nonstrict; set hive.exec.max.dynamic.partitions=1000; insert into weblog_partiton_table partition(host) select id, datevalue, ipaddress, url, responsecode, host from weblog_external as a where not exists(select b.id from weblog_partiton_table as b where a.id = b.id);""" hive_part = HiveOperator(hive_cli_conn_id='hive_cli_default', hql=hive_cmd, task_id = 'Hive', dag=dag_prjt_main) finish_task = DummyOperator(task_id="finaltask", dag=dag_prjt_main) SQOOP_Task1 >> hive_part >> finish_task if __name__ == '__main__': dag_prjt_main.cli()
create_hdfs_spotify_raw_categories_dir = HdfsMkdirFileOperator( task_id='create_hdfs_spotify_raw_categories_dir', directory='/user/hadoop/spotify/rawCategories', hdfs_conn_id='hdfs', dag=dag, ) create_hdfs_spotify_final_dir = HdfsMkdirFileOperator( task_id='create_hdfs_spotify_final_dir', directory='/user/hadoop/spotify/final', hdfs_conn_id='hdfs', dag=dag, ) ########################################################### S Q L ################################################################ createTableRawPlaylist = HiveOperator(task_id='create_table_raw_playlist', hql=hiveSql_createTableRawPlaylist, hive_cli_conn_id='beeline', dag=dag) createTableRawTrackData = HiveOperator(task_id='createTableRawTrackData', hql=hiveSql_createTableRawTrackData, hive_cli_conn_id='beeline', dag=dag) createTableRawCategorie = HiveOperator(task_id='createTableRawCategorie', hql=hiveSQL_createTableRawCategories, hive_cli_conn_id='beeline', dag=dag) createTableFinal = HiveOperator(task_id='createTableFinal', hql=hiveSQL_createTableFinal, hive_cli_conn_id='beeline',
# check whether the formatted file exists waiting_file_formated = FileSensor( task_id="waiting_file_formated", fs_conn_id="fs_default", filepath=LOCAL_PATH + FILE_FORMATED, poke_interval=5) # Test => docker-compose -f docker-compose.yml run --rm webserver airflow test trade_etanol_anidro move_file_formated 2020-04-02 # move the formatted file to the formatted folder move_file_formated = PythonOperator( task_id="move_file_formated", python_callable=move_file_formated, provide_context=True, op_kwargs={'conn_id': 'ftp_servidor'}) # Test => docker-compose -f docker-compose.yml run --rm webserver airflow test trade_etanol_anidro load_file_formated 2020-04-02 # print the formatted file to the log load_file_formated = PythonOperator( task_id="load_file_formated", python_callable=print_file_formated, provide_context=True) # Test => docker-compose -f docker-compose.yml run --rm webserver airflow test trade_etanol_anidro transfer_into_hive 2020-04-02 # load the formatted file into the Hive table transfer_into_hive = HiveOperator( task_id="transfer_into_hive", hql="LOAD DATA INPATH '/tmp/trade_etanol_anidro_formated.csv' INTO TABLE etanol") end_dag = DummyOperator(task_id='end_dag') start_dag >> waiting_file_original >> execute_file_original >> waiting_file_formated >> [move_file_formated, load_file_formated] >> transfer_into_hive >> end_dag
path = 'oss://opay-datalake/ofood/client' create_ods_log_client_event_hi = HiveOperator( task_id='create_ods_log_client_event_hi', hql=''' CREATE EXTERNAL TABLE if not exists ods_log_client_event_hi ( user_id bigint comment 'user ID', user_number string comment 'user number', client_timestamp int comment 'client timestamp', platform string comment 'platform: ios/android', os_version string comment 'OS version', app_name string comment 'app name', app_version string comment 'app version', locale string comment 'local language', device_id string comment 'device ID', device_screen string comment 'device screen resolution', device_model string comment 'device model', device_manufacturer string comment 'device brand', is_root string comment 'whether the device is rooted', event_name string comment 'event name', page string comment 'page', event_values string comment 'event content' ) PARTITIONED BY (`dt` string, `hour` string) ROW FORMAT SERDE 'org.openx.data.jsonserde.JsonSerDe' with SERDEPROPERTIES("ignore.malformed.json"="true") LOCATION '{location_path}' '''.format(location_path=path), schema='ofood_dw_ods', dag=dag) ods_log_client_event_hi_partition = HiveOperator(
def test_hive(self): t = HiveOperator( task_id='basic_hql', hql=self.hql, dag=self.dag) t.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE, ignore_ti_state=True)
task_id='check_table_{}'.format(hive_table_name), priority_weight=priority_weight_nm, python_callable=run_check_table, provide_context=True, op_kwargs={ 'db_name': db_name, 'table_name': table_name, 'conn_id': conn_id, 'hive_table_name': hive_table_name }, dag=dag) # add partitions add_partitions = HiveOperator( task_id='add_partitions_{}'.format(hive_table_name), priority_weight=priority_weight_nm, hql=''' ALTER TABLE {table} ADD IF NOT EXISTS PARTITION (dt = '{{{{ ds }}}}') '''.format(table=hive_table_name), schema=HIVE_DB, dag=dag) validate_all_data = PythonOperator( task_id='validate_data_{}'.format(hive_table_name), priority_weight=priority_weight_nm, python_callable=validata_data, provide_context=True, op_kwargs={ 'db': HIVE_DB, 'table_name': hive_table_name, 'table_format': HIVE_TABLE, 'table_core_list': table_core_list, 'table_not_core_list': table_not_core_list
def test_hive_dryrun(self): t = HiveOperator( task_id='dry_run_basic_hql', hql=self.hql, dag=self.dag) t.dry_run()
dag = DAG('dw_tchops_basic_d', default_args=default_args, schedule_interval='7 1 * * *') start = SqlSensor( task_id='start', conn_id='src_main_db', sql= "SELECT * FROM restore_tracking.restore_log WHERE date(restored_time) = current_date;", dag=dag) # teacher_signed_log delpart_stg_teacher_signed_log = HiveOperator( task_id='delpart_stg_teacher_signed_log', hive_cli_conn_id='spark_thrift', hql= "alter table stg.teacher_contract_teacher_signed_log drop if exists PARTITION (etl_date='{{ macros.ds(ti) }}');\n ", dag=dag) stg_teacher_signed_log = BashOperator( task_id='stg_teacher_signed_log', bash_command= 'dataship extract uuold_teacher_contract.teacher_signed_log {{ macros.ds(ti) }} {{ macros.tomorrow_ds(ti) }}', pool="embulk_pool", dag=dag) addpart_stg_teacher_signed_log = HiveOperator( task_id='addpart_stg_teacher_signed_log', hive_cli_conn_id='spark_thrift', hql= "alter table stg.teacher_contract_teacher_signed_log add PARTITION (etl_date='{{ macros.ds(ti) }}');\n ",
def test_hive(self): t = HiveOperator(task_id='basic_hql', hql=self.hql, dag=self.dag) t.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE, ignore_ti_state=True)
bash_command= f'hdfs dfs -put -f {DATA_FOLDER}/stores-with-good-columns.csv {CLEANED_ZONE}/stores-with-good-columns.csv', dag=dag, ) # Create Hive table create_store_lookup_table = HiveOperator( task_id='create_store_lookup_table', hive_cli_conn_id='my_hive_conn', hql=''' CREATE TABLE IF NOT EXISTS dim_store_lookup ( store_id INT, store_name VARCHAR(100), address_city_name VARCHAR(300), address_state_prov_code VARCHAR(2), msa_code VARCHAR(100), seg_value_name VARCHAR(100), parking_space_qty DECIMAL(38, 2), sales_area_size_num INT, avg_weekly_baskets DECIMAL(38, 2) ) ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' LINES TERMINATED BY '\n' STORED AS TEXTFILE; ''', dag=dag, ) # Load data in Hive table load_data_to_hive_table = HiveOperator( task_id='load_data_to_hive_table', hive_cli_conn_id='my_hive_conn', hql=f'''
from airflow import DAG from airflow.contrib.sensors.file_sensor import FileSensor from datetime import datetime from airflow.operators.python_operator import PythonOperator from airflow.operators.bash_operator import BashOperator from airflow.operators.hive_operator import HiveOperator import fetching_tweet, cleaning_tweet default_args = {"start_date": datetime(2020, 1, 1), "owner": "airflow"} with DAG(dag_id="twitter_dag", schedule_interval="@daily", default_args=default_args) as dag: waiting_for_tweets = FileSensor(task_id="waiting_for_tweets", fs_conn_id="fs_tweet", filepath="data.csv", poke_interval=5) fetching_tweets = PythonOperator(task_id="fetching_tweets", python_callable=fetching_tweet.main) cleaning_tweets = PythonOperator(task_id="cleaning_tweets", python_callable=cleaning_tweet.main) storing_tweets = BashOperator( task_id="storing_tweets", bash_command="hadoop fs -put -f /tmp/data_cleaned.csv /tmp/") loading_tweets = HiveOperator( task_id="loading_tweets", hql="LOAD DATA INPATH '/tmp/data_cleaned.csv' INTO TABLE tweets")
for channel in to_channels: file_name = "to_" + channel + "_" + yesterday.strftime( "%Y-%m-%d") + ".csv" load_to_hdfs = BashOperator( task_id="put_" + channel + "_to_hdfs", bash_command="HADOOP_USER_NAME=hdfs hadoop fs -put -f " + local_dir + file_name + " " + hdfs_dir + channel + "/") load_to_hdfs << analyze_tweets load_to_hive = HiveOperator(task_id="load_" + channel + "_to_hive", hql="LOAD DATA INPATH '" + hdfs_dir + channel + "/" + file_name + "' " "INTO TABLE " + channel + " " "PARTITION(dt='" + dt + "')") load_to_hive << load_to_hdfs load_to_hive >> hive_to_mysql for channel in from_channels: file_name = "from_" + channel + "_" + yesterday.strftime( "%Y-%m-%d") + ".csv" load_to_hdfs = BashOperator( task_id="put_" + channel + "_to_hdfs", bash_command="HADOOP_USER_NAME=hdfs hadoop fs -put -f " + local_dir + file_name + " " + hdfs_dir + channel + "/") load_to_hdfs << analyze_tweets
file_name = "to_" + channel + "_" + yesterday.strftime("%Y-%m-%d") + ".csv" load_to_hdfs = BashOperator( task_id="put_" + channel + "_to_hdfs", bash_command="HADOOP_USER_NAME=hdfs hadoop fs -put -f " + local_dir + file_name + hdfs_dir + channel + "/", dag=dag) load_to_hdfs.set_upstream(analyze_tweets) load_to_hive = HiveOperator( task_id="load_" + channel + "_to_hive", hql="LOAD DATA INPATH '" + hdfs_dir + channel + "/" + file_name + "' " "INTO TABLE " + channel + " " "PARTITION(dt='" + dt + "')", dag=dag) load_to_hive.set_upstream(load_to_hdfs) load_to_hive.set_downstream(hive_to_mysql) for channel in from_channels: file_name = "from_" + channel + "_" + yesterday.strftime("%Y-%m-%d") + ".csv" load_to_hdfs = BashOperator( task_id="put_" + channel + "_to_hdfs", bash_command="HADOOP_USER_NAME=hdfs hadoop fs -put -f " + local_dir + file_name + hdfs_dir + channel + "/", dag=dag)
bash_command=""" hdfs dfs -mkdir -p /forex && \ hdfs dfs -put -f $AIRFLOW_HOME/dags/files/forex_rates.json /forex """ ) creating_forex_rates_table = HiveOperator( task_id="creating_forex_rates_table", hive_cli_conn_id="hive_conn", hql=""" CREATE EXTERNAL TABLE IF NOT EXISTS forex_rates( base STRING, last_update DATE, eur DOUBLE, usd DOUBLE, nzd DOUBLE, gbp DOUBLE, jpy DOUBLE, cad DOUBLE ) ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' STORED AS TEXTFILE """ ) forex_processing = SparkSubmitOperator( task_id="forex_processing", conn_id="spark_conn", application="/usr/local/airflow/dags/scripts/forex_processing.py", verbose=False
} with DAG('twitter_dag_v2', start_date=datetime(2018, 10, 1), schedule_interval="@daily", default_args=DAG_DEFAULT_ARGS, catchup=False) as dag: waiting_file_task = FileSensor( task_id="waiting_file_task", fs_conn_id="fs_default", filepath="/home/airflow/airflow_files/data.csv", poke_interval=5) fetching_tweet_task = PythonOperator(task_id="fetching_tweet_task", python_callable=fetching_tweet.main) cleaning_tweet_task = PythonOperator(task_id="cleaning_tweet_task", python_callable=cleaning_tweet.main) load_into_hdfs_task = BashOperator( task_id="load_into_hdfs_task", bash_command="hadoop fs -put -f /tmp/data_cleaned.csv /tmp/") transfer_into_hive_task = HiveOperator( task_id="transfer_into_hive_task", hql= "LOAD DATA INPATH '/tmp/data_cleaned.csv' INTO TABLE tweets PARTITION(dt='2018-10-01')" ) waiting_file_task >> fetching_tweet_task >> cleaning_tweet_task >> load_into_hdfs_task >> transfer_into_hive_task
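# A hedged sketch of the partitioned 'tweets' table the LOAD DATA statement above
# assumes already exists. Only the dt partition key is implied by the DAG; the column
# list here is purely illustrative. If placed inside the same `with DAG` block, the
# task picks up the DAG from context like the other operators.
create_tweets_table_task = HiveOperator(
    task_id="create_tweets_table_task",
    hql="""
        CREATE TABLE IF NOT EXISTS tweets (
            tweet_id STRING,
            tweet_text STRING,
            created_at STRING
        )
        PARTITIONED BY (dt STRING)
        ROW FORMAT DELIMITED FIELDS TERMINATED BY ','
        STORED AS TEXTFILE
    """)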