load_to_hdfs = BashOperator( task_id="put_" + channel + "_to_hdfs", bash_command="HADOOP_USER_NAME=hdfs hadoop fs -put -f " + local_dir + file_name + hdfs_dir + channel + "/", dag=dag) load_to_hdfs.set_upstream(analyze_tweets) load_to_hive = HiveOperator(task_id="load_" + channel + "_to_hive", hql="LOAD DATA INPATH '" + hdfs_dir + channel + "/" + file_name + "' " "INTO TABLE " + channel + " " "PARTITION(dt='" + dt + "')", dag=dag) load_to_hive.set_upstream(load_to_hdfs) load_to_hive.set_downstream(hive_to_mysql) for channel in from_channels: file_name = "from_" + channel + "_" + yesterday.strftime( "%Y-%m-%d") + ".csv" load_to_hdfs = BashOperator( task_id="put_" + channel + "_to_hdfs", bash_command="HADOOP_USER_NAME=hdfs hadoop fs -put -f " + local_dir + file_name + hdfs_dir + channel + "/", dag=dag) load_to_hdfs.set_upstream(analyze_tweets) load_to_hive = HiveOperator(task_id="load_" + channel + "_to_hive", hql="LOAD DATA INPATH '" + hdfs_dir + channel +
    dag=dag)

# Run PySpark analysis
# (the pyspark directory defaults to the home directory)
"""
t6 = SparkSubmitOperator(
    task_id='run_spark_analysis',
    application='/tmp/pyspark/sparkHiveAirflow.py',
    name='Spark Analysis',
    executor_cores=1,
    executor_memory='2g',
    driver_memory='4g',
    verbose=True,
    dag=dag)
"""

pyspark_job = """
/usr/hdp/current/spark2-client/bin/spark-submit /tmp/pyspark/sparkHiveAirflow.py
"""

# Run PySpark via BashOperator
t7 = BashOperator(
    task_id='run_pyspark',
    bash_command=pyspark_job,
    # env={'SPARK_MAJOR_VERSION': '2'},
    dag=dag)

# Define the job dependencies
t2.set_upstream(t1)
t3.set_upstream(t2)
t4.set_upstream(t3)
t5.set_upstream(t4)
# t6.set_upstream(t5)
t7.set_upstream(t5)
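If the SPARK_MAJOR_VERSION pin is ever needed, BashOperator's env parameter takes a dict, not a string like the commented-out form above. A sketch:

# env must be a mapping; note that when env is set it replaces, rather than
# extends, the inherited process environment.
t7 = BashOperator(
    task_id='run_pyspark',
    bash_command=pyspark_job,
    env={'SPARK_MAJOR_VERSION': '2'},
    dag=dag)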
# Dictionary that stores tag_id -> datatype mappings
tag_and_datatype_dict = {}

# Build the DAG workflow as a sequential chain of ETL tasks
tail_etl_task = None
for tp in sorted_files:
    file_name = tp[1]
    etl_name = tp[1].split('.')[1]
    etl_task = HiveOperator(task_id='user_tag_etl__' + etl_name,
                            hql=file_name,
                            hive_cli_conn_id=HIVE_CONN_ID,
                            dag=dag)
    # chain each task behind the previous one
    if tail_etl_task is not None:
        etl_task.set_upstream(tail_etl_task)
    tail_etl_task = etl_task
    collect_tag_id_and_datatype_from_sql_file(file_name)

print('tag_and_datatype_dict: {}'.format(tag_and_datatype_dict))

user_tag_to_wide_sql = '''
drop table if exists user_tag_wide;
create table user_tag_wide as
select user_id
'''
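The wide-table statement above is only the head of the query; presumably tag_and_datatype_dict drives the generation of one pivoted column per tag. A minimal sketch of that step, assuming the dict maps tag IDs to Hive datatypes and that a source table user_tag holds (user_id, tag_id, tag_value) rows; those names and the cast-per-datatype pivot are assumptions, not the original code.

# Hypothetical continuation: append one pivoted column per collected tag.
for tag_id, datatype in sorted(tag_and_datatype_dict.items()):
    user_tag_to_wide_sql += (
        ", max(case when tag_id = '{tag}' then cast(tag_value as {dt}) end) "
        "as tag_{tag}".format(tag=tag_id, dt=datatype))
user_tag_to_wide_sql += '\nfrom user_tag\ngroup by user_id'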
load_to_hdfs = BashOperator( task_id= "put_" + chanel + "_to_hdfs", bash_command= "HADOOP_USER_NAME=hdfs hadoop fs -put -f" + local_dir + file_name + hdfs_dir + chanel + "/", dag = dag ) load_to_hdfs.set_upstream(analyze_tweets) load_to_hive = HiveOperator( task_id="load_" + channel + "_to_hive", hql="LOAD DATA INPATH '" + hdfs_dir + channel + "/" + file_name + "' " "INTO TABLE " + channel + " " "PARTITION(dt='" + dt + "')", dag=dag) load_to_hive.set_upstream(load_to_hdfs) load_to_hive.set_upstream(hive_to_mysql) for chanel in from_channels: file_name = "from_" + chanel + "_" + yesterday.strftime("%Y-%m-%d") + ".csv" load_to_hdfs = BashOperator( task_id = "put_" + chanel + "_to_hdfs", bash_command = "HADOOP_USER_NAME=hdfs hadoop fs -put -f" + local_dir + file_name + hdfs_dir + chanel + "/", dag=dag ) load_to_hdfs.set_upstream(analyze_tweets) load_to_hive = HiveOperator( task_id = "load_" + chanel + "_to_hive", hql="LOAD DATA INPATH '" +
task_id="put_" + channel + "_to_hdfs", bash_command="HADOOP_USER_NAME=hdfs hadoop fs -put -f " + local_dir + file_name + hdfs_dir + channel + "/", dag=dag) load_to_hdfs.set_upstream(analyze_tweets) load_to_hive = HiveOperator( task_id="load_" + channel + "_to_hive", hql="LOAD DATA INPATH '" + hdfs_dir + channel + "/" + file_name + "' " "INTO TABLE " + channel + " " "PARTITION(dt='" + dt + "')", dag=dag) load_to_hive.set_upstream(load_to_hdfs) load_to_hive.set_downstream(hive_to_mysql) for channel in from_channels: file_name = "from_" + channel + "_" + yesterday.strftime("%Y-%m-%d") + ".csv" load_to_hdfs = BashOperator( task_id="put_" + channel + "_to_hdfs", bash_command="HADOOP_USER_NAME=hdfs hadoop fs -put -f " + local_dir + file_name + hdfs_dir + channel + "/", dag=dag) load_to_hdfs.set_upstream(analyze_tweets) load_to_hive = HiveOperator( task_id="load_" + channel + "_to_hive",
def hook(dag, conn_id, tables, staging_dir='/tmp/airflow', staging_db=None,
         **options):
    staging_db = staging_db or 'staging_%s' % conn_id

    create_staging_db = HiveOperator(
        task_id='create_staging_db',
        hql='create database if not exists %s;' % staging_db,
        dag=dag)
    create_staging_dir = BashOperator(
        task_id='create_staging_dir',
        bash_command='hdfs dfs -mkdir -p %s' % staging_dir,
        dag=dag)

    for tbl in tables:
        # Per-table defaults, overridden by the caller's table spec
        table = {
            'hive-database': None,
            'hive-table': None,
            'mappers': 1,
            'direct': False,
            'format': 'parquet',
            'format-options': None,
            'partition_fields': [],
            'bucket_fields': []
        }
        table.update(tbl)
        assert table['hive-database'] is not None
        if table['hive-table'] is None:
            table['hive-table'] = table['name']

        staging_tbl_dir = os.path.join(staging_dir, conn_id, table['name'])

        clean_sqoop_staging = BashOperator(
            task_id=('clean_sqoop_staging_dir.%s' % table['name']).lower(),
            bash_command='hdfs dfs -rm -R -f %s' % staging_tbl_dir,
            dag=dag)
        clean_staging_tbl = HiveOperator(
            task_id=('clean_staging_table.%s' % table['name']).lower(),
            hql='''drop table if exists %(staging_db)s.%(staging_tbl)s''' % {
                'staging_db': staging_db,
                'staging_tbl': table['name']
            },
            dag=dag)

        sqoop = SqoopOperator(
            task_id=('sqoop.%s' % table['name']).lower(),
            conn_id=conn_id,
            table=table['name'],
            split_by=table['split_by'],
            num_mappers=table['mappers'],
            direct=table['direct'],
            target_dir=staging_tbl_dir,
            extra_import_options={
                'hive-import': '',
                'hive-database': staging_db,
                'hive-table': table['name'],
                'hive-delims-replacement': ' ',
                'temporary-rootdir': staging_dir,
            },
            dag=dag)

        # Rewrite the sqoop-imported staging table into the target format,
        # then swap it in via drop + rename
        create_statement = 'create table %s.%s_tmp\n' % (
            table['hive-database'], table['hive-table'])
        create_statement += 'stored as %s\n' % table['format']
        format_opts = table.get('format-options', None)
        if format_opts:
            create_statement += '%s\n' % format_opts

        convert_to_parquet = HiveOperator(
            task_id=('hive_convert_format.%s' % table['name']).lower(),
            hql=('create database if not exists %(dst_db)s;\n'
                 'drop table if exists %(dst_db)s.%(dst_tbl)s_tmp;\n'
                 '%(create_statement)s'
                 'as select * from %(staging_db)s.%(staging_tbl)s;\n'
                 'drop table if exists %(dst_db)s.%(dst_tbl)s;\n'
                 'alter table %(dst_db)s.%(dst_tbl)s_tmp '
                 'rename to %(dst_db)s.%(dst_tbl)s;\n') % {
                     'dst_db': table['hive-database'],
                     'dst_tbl': table['hive-table'],
                     'staging_db': staging_db,
                     'staging_tbl': table['name'],
                     'create_statement': create_statement
                 },
            dag=dag)

        clean_staging_tbl.set_upstream(create_staging_db)
        clean_sqoop_staging.set_upstream(create_staging_dir)
        sqoop.set_upstream(clean_sqoop_staging)
        sqoop.set_upstream(clean_staging_tbl)
        convert_to_parquet.set_upstream(sqoop)
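A hypothetical invocation of the hook() helper above; the connection id, database, and table names are made up for illustration. Note that 'split_by' has no entry in the defaults dict, so each table spec must supply it or the SqoopOperator construction raises a KeyError.

# Illustrative call; conn_id and table spec values are assumptions.
hook(dag,
     conn_id='mysql_orders',
     tables=[{
         'name': 'orders',
         'split_by': 'order_id',
         'hive-database': 'warehouse',
         'mappers': 4,
     }])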
def read_hql_file(hqlfilepath):
    # Read an HQL script into a single string (the function header is assumed
    # from the call site below)
    hql = ''
    with open(hqlfilepath) as f:
        while True:
            line = f.readline()
            hql += str(line)
            if not line:
                break
    print("___________")
    print(hql)
    print("_____________________________________________________")
    return hql


args['pool'] = 'pool_dw'  # set the pool parameter separately for the sub-DAG

with DAG(
        # DAG_NAME,
        dag_id='test_dag',
        default_args=args,
        schedule_interval=None,
        description='dw test') as dag:

    dummy_dw_mysqlapp = DummyOperator(task_id="DW_Tasks_Start")

    # Chain one HiveOperator per HQL file, each downstream of the previous
    last_task = dummy_dw_mysqlapp
    for hqlfilepath in dw_list:
        task_id = hqlfilepath.split("/")[-1].split(".")[0]
        print('task_id ', task_id)
        current_task = HiveOperator(hive_cli_conn_id='hive_cli_emr',
                                    task_id=task_id,
                                    hiveconf_jinja_translate=True,
                                    hql=read_hql_file(hqlfilepath),
                                    trigger_rule='all_done',
                                    dag=dag)
        current_task.set_upstream(last_task)
        last_task = current_task
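The manual last_task bookkeeping can also be expressed with Airflow's chain() helper (airflow.utils.helpers.chain in Airflow 1.10). A sketch under the same names as above; purely an alternative wiring, not the original code.

# Equivalent sequential wiring via chain(); the task list is built the same way.
from airflow.utils.helpers import chain

tasks = [dummy_dw_mysqlapp] + [
    HiveOperator(hive_cli_conn_id='hive_cli_emr',
                 task_id=p.split("/")[-1].split(".")[0],
                 hiveconf_jinja_translate=True,
                 hql=read_hql_file(p),
                 trigger_rule='all_done',
                 dag=dag)
    for p in dw_list
]
chain(*tasks)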
hql = """ CREATE EXTERNAL TABLE IF NOT EXISTS dwdii2.noaa_temps ( station string, year int, jan string, feb string, mar string, apr string, may string, jun string, jul string, aug string, sep string, oct string, nov string, dec string ) PARTITIONED BY (exdt string) ROW FORMAT DELIMITED FIELDS TERMINATED BY ' ' STORED AS TEXTFILE LOCATION '{{ params.hdfs_destination }}'; """ createTable = HiveOperator(task_id='hive_create_ext_table', hql=hql, params=s3fetch_params, dag=dag) t1.set_upstream(cleanHdfs) createTable.set_upstream(t1)
source_data_sensor = WebHdfsSensor(
    task_id='source_data_sensor',
    filepath='/data/mydata/{{ ds }}/mydata.csv',
    poke_interval=10,
    timeout=5,
    dag=dag
)

create_hive_db = HiveOperator(
    task_id='create_hive_db',
    hql="DROP DATABASE IF EXISTS {db} CASCADE; CREATE DATABASE {db};".format(db='my_hive_db'),
    dag=dag
)
create_hive_db.set_upstream(source_data_sensor)

hdfs_to_hive_transfer = HiveOperator(
    task_id='hdfs_to_hive_transfer',
    hql=hql.HQL_HDFS_TO_HIVE_TRANSFER.format(table_name='mydata',
                                             tmp_table_name='mydata_tmp',
                                             hdfs_path='/data/mydata/{{ ds }}'),
    schema='my_hive_db',
    dag=dag
)
hdfs_to_hive_transfer.set_upstream(create_hive_db)

count_data_rows = BranchPythonOperator(
    task_id='count_data_rows',
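The BranchPythonOperator is cut off above. A minimal sketch of how such a branching task could be completed follows; the callable, the HiveServer2 connection, and the downstream task ids ('process_data', 'no_data') are all illustrative assumptions, not the original code.

# Hypothetical completion of the truncated operator: branch on the row count
# of the freshly loaded table.
def check_data_rows(**context):
    from airflow.hooks.hive_hooks import HiveServer2Hook
    hook = HiveServer2Hook()
    rows = hook.get_records("SELECT count(*) FROM my_hive_db.mydata")
    return 'process_data' if rows[0][0] > 0 else 'no_data'

count_data_rows = BranchPythonOperator(
    task_id='count_data_rows',
    python_callable=check_data_rows,
    provide_context=True,
    dag=dag
)
count_data_rows.set_upstream(hdfs_to_hive_transfer)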
task_id = "ads_bi_risk_acct_c_zy", hql = ads_bi_risk.sqlads_bi_risk_acct_c_zy, hive_cli_conn_id = 'hive', dag = dag ) ods_data_check = BashOperator( task_id='ods_data_check', bash_command='date', dag=dag) #设置依赖关系 #先跑数据层,并在跑批前检查当天ods跑数情况 apt_card_txn_cu.set_upstream(ods_data_check) apt_card_stmt_cu.set_upstream(ods_data_check) apt_card_mpur_cu.set_upstream(ods_data_check) apt_card_jorj_cu.set_upstream(ods_data_check) apt_card_commtxn_cu.set_upstream(ods_data_check) apt_card_cnta_cu.set_upstream(ods_data_check) apt_card_chgs_cu.set_upstream(ods_data_check) apt_card_ccdcust_cu.set_upstream(ods_data_check) apt_card_card_cu.set_upstream(ods_data_check) apt_card_appr_cu.set_upstream(ods_data_check) apt_card_apma_cu.set_upstream(ods_data_check) apt_card_acct_cu.set_upstream(ods_data_check) #stg层 stg_card_acct_cu.set_upstream(apt_card_acct_cu) stg_card_acct_cu.set_upstream(dwd_card_stmt_hs)