Example no. 1
    load_to_hdfs = BashOperator(
        task_id="put_" + channel + "_to_hdfs",
        bash_command="HADOOP_USER_NAME=hdfs hadoop fs -put -f " + local_dir +
        file_name + hdfs_dir + channel + "/",
        dag=dag)

    load_to_hdfs.set_upstream(analyze_tweets)

    load_to_hive = HiveOperator(task_id="load_" + channel + "_to_hive",
                                hql="LOAD DATA INPATH '" + hdfs_dir + channel +
                                "/" + file_name + "' "
                                "INTO TABLE " + channel + " "
                                "PARTITION(dt='" + dt + "')",
                                dag=dag)
    load_to_hive.set_upstream(load_to_hdfs)
    load_to_hive.set_downstream(hive_to_mysql)

for channel in from_channels:
    file_name = "from_" + channel + "_" + yesterday.strftime(
        "%Y-%m-%d") + ".csv"
    load_to_hdfs = BashOperator(
        task_id="put_" + channel + "_to_hdfs",
        bash_command="HADOOP_USER_NAME=hdfs hadoop fs -put -f " + local_dir +
        file_name + hdfs_dir + channel + "/",
        dag=dag)

    load_to_hdfs.set_upstream(analyze_tweets)

    load_to_hive = HiveOperator(task_id="load_" + channel + "_to_hive",
                                hql="LOAD DATA INPATH '" + hdfs_dir + channel +
Example no. 2
                  dag=dag)

# Run PySpark analysis (the pyspark directory is the home directory by default)
""" t6 = SparkSubmitOperator(
    task_id= 'run_spark_analysis',
    application='/tmp/pyspark/sparkHiveAirflow.py',
    name='Spark Analysis',
    executor_cores=1,
    executor_memory='2g',
    driver_memory='4g',
    verbose='true',
    dag=dag) """

pyspark_job = """
    /usr/hdp/current/spark2-client/bin/spark-submit /tmp/pyspark/sparkHiveAirflow.py
    """

# Run PySpark via BashOperator
t7 = BashOperator(
    task_id='run_pyspark',
    bash_command=pyspark_job,
    # env={'SPARK_MAJOR_VERSION': '2'},  # BashOperator's env expects a dict
    dag=dag)

# define the job dependencies
t2.set_upstream(t1)
t3.set_upstream(t2)
t4.set_upstream(t3)
t5.set_upstream(t4)
# t6.set_upstream(t5)
t7.set_upstream(t5)
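# For reference, the same chain can be written with Airflow's bitshift
# syntax (not part of the original example):
# t1 >> t2 >> t3 >> t4 >> t5 >> t7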
Example no. 3
# dictionary for storing tag_id and datatype
tag_and_datatype_dict = {}

# build a DAG workflow whose ETL tasks run sequentially
tail_etl_task = None

for tp in sorted_files:
    file_name = tp[1]
    etl_name = tp[1].split('.')[1]
    etl_task = HiveOperator(task_id='user_tag_etl__' + etl_name,
                            hql=file_name,
                            hive_cli_conn_id=HIVE_CONN_ID,
                            dag=dag)

    if tail_etl_task is not None:
        etl_task.set_upstream(tail_etl_task)
    tail_etl_task = etl_task

    collect_tag_id_and_datatype_from_sql_file(file_name)

print('tag_and_datatype_dict: {}'.format(tag_and_datatype_dict))

user_tag_to_wide_sql = '''
drop table if exists user_tag_wide;
create table user_tag_wide
as
select
user_id
'''
Example no. 4
    load_to_hdfs = BashOperator(
        task_id= "put_" + chanel + "_to_hdfs",
        bash_command= "HADOOP_USER_NAME=hdfs hadoop fs -put -f" +
                        local_dir + file_name + hdfs_dir + chanel + "/",
        dag = dag
    )
    load_to_hdfs.set_upstream(analyze_tweets)

    load_to_hive = HiveOperator(
        task_id="load_" + channel + "_to_hive",
        hql="LOAD DATA INPATH '" +
            hdfs_dir + channel + "/" + file_name + "' "
            "INTO TABLE " + channel + " "
            "PARTITION(dt='" + dt + "')",
        dag=dag)
    load_to_hive.set_upstream(load_to_hdfs)
    load_to_hive.set_downstream(hive_to_mysql)

for channel in from_channels:
    file_name = "from_" + channel + "_" + yesterday.strftime("%Y-%m-%d") + ".csv"
    load_to_hdfs = BashOperator(
        task_id="put_" + channel + "_to_hdfs",
        bash_command="HADOOP_USER_NAME=hdfs hadoop fs -put -f " +
                     local_dir + file_name + hdfs_dir + channel + "/",
        dag=dag
    )
    load_to_hdfs.set_upstream(analyze_tweets)

    load_to_hive = HiveOperator(
        task_id="load_" + channel + "_to_hive",
        hql="LOAD DATA INPATH '" +
            hdfs_dir + channel + "/" + file_name + "' "
            "INTO TABLE " + channel + " "
            "PARTITION(dt='" + dt + "')",
        dag=dag)
    load_to_hive.set_upstream(load_to_hdfs)
    load_to_hive.set_downstream(hive_to_mysql)
Example no. 5
    load_to_hdfs = BashOperator(
        task_id="put_" + channel + "_to_hdfs",
        bash_command="HADOOP_USER_NAME=hdfs hadoop fs -put -f " +
                     local_dir + file_name +
                     hdfs_dir + channel + "/",
        dag=dag)

    load_to_hdfs.set_upstream(analyze_tweets)

    load_to_hive = HiveOperator(
        task_id="load_" + channel + "_to_hive",
        hql="LOAD DATA INPATH '" +
            hdfs_dir + channel + "/" + file_name + "' "
            "INTO TABLE " + channel + " "
            "PARTITION(dt='" + dt + "')",
        dag=dag)
    load_to_hive.set_upstream(load_to_hdfs)
    load_to_hive.set_downstream(hive_to_mysql)

for channel in from_channels:
    file_name = "from_" + channel + "_" + yesterday.strftime("%Y-%m-%d") + ".csv"
    load_to_hdfs = BashOperator(
        task_id="put_" + channel + "_to_hdfs",
        bash_command="HADOOP_USER_NAME=hdfs hadoop fs -put -f " +
                     local_dir + file_name +
                     hdfs_dir + channel + "/",
        dag=dag)

    load_to_hdfs.set_upstream(analyze_tweets)

    load_to_hive = HiveOperator(
        task_id="load_" + channel + "_to_hive",
def hook(dag,
         conn_id,
         tables,
         staging_dir='/tmp/airflow',
         staging_db=None,
         **options):
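    """Wire a Sqoop-to-Hive ingestion pipeline into the given DAG.

    For each entry in ``tables``, the source table is imported into a staging
    Hive database via Sqoop and then rewritten into the destination database
    in the requested storage format (parquet by default).
    """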

    staging_db = staging_db or 'staging_%s' % conn_id

    create_staging_db = HiveOperator(task_id='create_staging_db',
                                     hql='create database if not exists %s;' %
                                     staging_db,
                                     dag=dag)

    create_staging_dir = BashOperator(task_id='create_staging_dir',
                                      bash_command='hdfs dfs -mkdir -p %s' %
                                      staging_dir,
                                      dag=dag)

    for tbl in tables:
        table = {
            'hive-database': None,
            'hive-table': None,
            'mappers': 1,
            'direct': False,
            'format': 'parquet',
            'format-options': None,
            'partition_fields': [],
            'bucket_fields': []
        }
        table.update(tbl)
        assert table['hive-database'] is not None
        if table['hive-table'] is None:
            table['hive-table'] = table['name']

        staging_tbl_dir = os.path.join(staging_dir, conn_id, table['name'])

        clean_sqoop_staging = BashOperator(
            task_id=('clean_sqoop_staging_dir.%s' % (table['name'])).lower(),
            bash_command='hdfs dfs -rm -R -f %s' % staging_tbl_dir,
            dag=dag)

        clean_staging_tbl = HiveOperator(
            task_id=('clean_staging_table.%s' % (table['name'])).lower(),
            hql='''drop table if exists %(staging_db)s.%(staging_tbl)s''' % {
                'staging_db': staging_db,
                'staging_tbl': table['name']
            },
            dag=dag)

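        # Sqoop-import the source table into the staging Hive database,
        # replacing Hive delimiter characters in the data and keeping
        # temporary files under the staging directory.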
        sqoop = SqoopOperator(task_id=('sqoop.%s' % (table['name'])).lower(),
                              conn_id=conn_id,
                              table=table['name'],
                              split_by=table['split_by'],
                              num_mappers=table['mappers'],
                              direct=table['direct'],
                              target_dir=staging_tbl_dir,
                              extra_import_options={
                                  'hive-import': '',
                                  'hive-database': staging_db,
                                  'hive-table': table['name'],
                                  'hive-delims-replacement': ' ',
                                  'temporary-rootdir': staging_dir,
                              },
                              dag=dag)

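        # Build the CREATE TABLE ... STORED AS <format> statement used below
        # to materialise the staged data in the destination database.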
        create_statement = ('create table %s.%s_tmp\n') % (
            table['hive-database'], table['hive-table'])

        create_statement += 'stored as %s\n' % table['format']

        format_opts = table.get('format-options', None)
        if format_opts:
            create_statement += '%s\n' % format_opts

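        # Recreate the destination table from the staged copy, then swap it
        # in by renaming the _tmp table over the old one.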
        convert_to_parquet = HiveOperator(
            task_id=('hive_convert_format.%s' % (table['name'])).lower(),
            hql=
            ('create database if not exists %(dst_db)s;\n'
             'drop table if exists %(dst_db)s.%(dst_tbl)s_tmp;\n'
             '%(create_statement)s'
             'as select * from %(staging_db)s.%(staging_tbl)s;\n'
             'drop table if exists %(dst_db)s.%(dst_tbl)s;\n'
             'alter table %(dst_db)s.%(dst_tbl)s_tmp rename to  %(dst_db)s.%(dst_tbl)s;\n'
             ) % {
                 'dst_db': table['hive-database'],
                 'dst_tbl': table['hive-table'],
                 'staging_db': staging_db,
                 'staging_tbl': table['name'],
                 'create_statement': create_statement
             },
            dag=dag)

        clean_staging_tbl.set_upstream(create_staging_db)
        clean_sqoop_staging.set_upstream(create_staging_dir)
        sqoop.set_upstream(clean_sqoop_staging)
        sqoop.set_upstream(clean_staging_tbl)
        convert_to_parquet.set_upstream(sqoop)
Example no. 7
            hql += str(line)
            if not line:
                break
    print("___________")
    print(hql)
    print("_____________________________________________________")
    return hql


args['pool'] = 'pool_dw'  # set the pool parameter for the sub DAG separately
with DAG(
        # DAG_NAME,
        dag_id='test_dag',
        default_args=args,
        schedule_interval=None,
        description='dw test') as dag:
    dummy_dw_mysqlapp = DummyOperator(task_id="DW_Tasks_Start")
    dag >> dummy_dw_mysqlapp
    last_task = dummy_dw_mysqlapp
    for hqlfilepath in dw_list:
        task_id = hqlfilepath.split("/")[-1].split(".")[0]
        print('task_id ', task_id)
        current_task = HiveOperator(hive_cli_conn_id='hive_cli_emr',
                                    task_id=task_id,
                                    hiveconf_jinja_translate=True,
                                    hql=read_hql_file(hqlfilepath),
                                    trigger_rule='all_done',
                                    dag=dag)
        current_task.set_upstream(last_task)
        last_task = current_task
Example no. 8
hql = """
    CREATE EXTERNAL TABLE IF NOT EXISTS dwdii2.noaa_temps 
    (
        station string,
        year int,
        jan string,
        feb string,
        mar string,
        apr string,
        may string,
        jun string,
        jul string,
        aug string,
        sep string,
        oct string,
        nov string,
        dec string
    ) PARTITIONED BY (exdt string)
    ROW FORMAT DELIMITED FIELDS TERMINATED BY ' '
    STORED AS TEXTFILE
    LOCATION '{{ params.hdfs_destination }}';
"""

createTable = HiveOperator(task_id='hive_create_ext_table',
                           hql=hql,
                           params=s3fetch_params,
                           dag=dag)

t1.set_upstream(cleanHdfs)
createTable.set_upstream(t1)
Example no. 9
source_data_sensor = WebHdfsSensor(
    task_id='source_data_sensor',
    filepath='/data/mydata/{{ ds }}/mydata.csv',
    poke_interval=10,
    timeout=5,
    dag=dag
)

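# Recreate my_hive_db from scratch on each run before loading the day's data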
create_hive_db = HiveOperator(
    task_id='create_hive_db',
    hql="DROP DATABASE IF EXISTS {db} CASCADE; CREATE DATABASE {db};".format(db='my_hive_db'),
    provide_context=True,
    dag=dag
)
create_hive_db.set_upstream(source_data_sensor)

hdfs_to_hive_trasfer = HiveOperator(
    task_id='hdfs_to_hive_trasfer',
    hql=hql.HQL_HDFS_TO_HIVE_TRANSFER.format(table_name='mydata',
                                             tmp_table_name='mydata_tmp',
                                             hdfs_path='/data/mydata/{{ ds }}'),
    schema='my_hive_db',
    provide_context=True,
    dag=dag
)
hdfs_to_hive_trasfer.set_upstream(create_hive_db)


count_data_rows = BranchPythonOperator(
    task_id='count_data_rows',
Example no. 10
      task_id = "ads_bi_risk_acct_c_zy",
      hql = ads_bi_risk.sqlads_bi_risk_acct_c_zy,
      hive_cli_conn_id = 'hive',
      dag = dag
    )


ods_data_check = BashOperator(
    task_id='ods_data_check',
    bash_command='date',
    dag=dag)
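# ods_data_check only runs `date` here, presumably as a stand-in for a real
# check that the day's ODS loads have finished.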


# set up the task dependencies
# run the data layer first, and check the day's ODS load status before the batch run
apt_card_txn_cu.set_upstream(ods_data_check)
apt_card_stmt_cu.set_upstream(ods_data_check)
apt_card_mpur_cu.set_upstream(ods_data_check)
apt_card_jorj_cu.set_upstream(ods_data_check)
apt_card_commtxn_cu.set_upstream(ods_data_check)
apt_card_cnta_cu.set_upstream(ods_data_check)
apt_card_chgs_cu.set_upstream(ods_data_check)
apt_card_ccdcust_cu.set_upstream(ods_data_check)
apt_card_card_cu.set_upstream(ods_data_check)
apt_card_appr_cu.set_upstream(ods_data_check)
apt_card_apma_cu.set_upstream(ods_data_check)
apt_card_acct_cu.set_upstream(ods_data_check)

#stg层
stg_card_acct_cu.set_upstream(apt_card_acct_cu)
stg_card_acct_cu.set_upstream(dwd_card_stmt_hs)