Code Example #1
File: hive_to_druid.py  Project: asnir/airflow
    def execute(self, context):
        hive = HiveCliHook(hive_cli_conn_id=self.hive_cli_conn_id)
        logging.info("Extracting data from Hive")
        hive_table = "druid." + context["task_instance_key_str"].replace(".", "_")
        sql = self.sql.strip().strip(";")
        hql = """\
        set mapred.output.compress=false;
        set hive.exec.compress.output=false;
        DROP TABLE IF EXISTS {hive_table};
        CREATE TABLE {hive_table}
        ROW FORMAT DELIMITED FIELDS TERMINATED BY  '\t'
        STORED AS TEXTFILE
        TBLPROPERTIES ('serialization.null.format' = '')
        AS
        {sql}
        """.format(
            **locals()
        )
        logging.info("Running command:\n {}".format(hql))
        hive.run_cli(hql)

        m = HiveMetastoreHook(self.metastore_conn_id)
        t = m.get_table(hive_table)

        columns = [col.name for col in t.sd.cols]

        hdfs_uri = m.get_table(hive_table).sd.location
        pos = hdfs_uri.find("/user")
        static_path = hdfs_uri[pos:]

        schema, table = hive_table.split(".")

        druid = DruidHook(druid_ingest_conn_id=self.druid_ingest_conn_id)
        logging.info("Inserting rows into Druid")
        logging.info("HDFS path: " + static_path)

        try:
            druid.load_from_hdfs(
                datasource=self.druid_datasource,
                intervals=self.intervals,
                static_path=static_path,
                ts_dim=self.ts_dim,
                columns=columns,
                num_shards=self.num_shards,
                target_partition_size=self.target_partition_size,
                query_granularity=self.query_granularity,
                segment_granularity=self.segment_granularity,
                metric_spec=self.metric_spec,
                hadoop_dependency_coordinates=self.hadoop_dependency_coordinates,
            )
            logging.info("Load seems to have succeeded!")
        finally:
            logging.info("Cleaning up by dropping the temp " "Hive table {}".format(hive_table))
            hql = "DROP TABLE IF EXISTS {}".format(hive_table)
            hive.run_cli(hql)
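
Note: the temporary table name above comes from the task instance key. A small illustration, assuming task_instance_key_str has the usual "<dag_id>__<task_id>__<ds_nodash>" shape (the concrete values below are hypothetical):

context = {"task_instance_key_str": "hive_to_druid_dag__load_druid__20200101"}  # hypothetical
hive_table = "druid." + context["task_instance_key_str"].replace(".", "_")
print(hive_table)  # druid.hive_to_druid_dag__load_druid__20200101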
Code Example #2
    def execute(self, context):
        hive = HiveCliHook(hive_cli_conn_id=self.hive_cli_conn_id)
        self.log.info("Extracting data from Hive")
        hive_table = 'druid.' + context['task_instance_key_str'].replace('.', '_')
        sql = self.sql.strip().strip(';')
        tblproperties = ''.join([", '{}' = '{}'"
                                .format(k, v)
                                 for k, v in self.hive_tblproperties.items()])
        hql = """\
        SET mapred.output.compress=false;
        SET hive.exec.compress.output=false;
        DROP TABLE IF EXISTS {hive_table};
        CREATE TABLE {hive_table}
        ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
        STORED AS TEXTFILE
        TBLPROPERTIES ('serialization.null.format' = ''{tblproperties})
        AS
        {sql}
        """.format(hive_table=hive_table, tblproperties=tblproperties, sql=sql)
        self.log.info("Running command:\n %s", hql)
        hive.run_cli(hql)

        m = HiveMetastoreHook(self.metastore_conn_id)

        # Get the Hive table and extract the columns
        t = m.get_table(hive_table)
        columns = [col.name for col in t.sd.cols]

        # Get the path on hdfs
        static_path = m.get_table(hive_table).sd.location

        schema, table = hive_table.split('.')

        druid = DruidHook(druid_ingest_conn_id=self.druid_ingest_conn_id)

        try:
            index_spec = self.construct_ingest_query(
                static_path=static_path,
                columns=columns,
            )

            self.log.info("Inserting rows into Druid, hdfs path: %s", static_path)

            druid.submit_indexing_job(index_spec)

            self.log.info("Load seems to have succeeded!")
        finally:
            self.log.info(
                "Cleaning up by dropping the temp Hive table %s",
                hive_table
            )
            hql = "DROP TABLE IF EXISTS {}".format(hive_table)
            hive.run_cli(hql)
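
A small sketch of how the hive_tblproperties mapping in this example is folded into the TBLPROPERTIES clause (the extra property below is made up for illustration):

hive_tblproperties = {'druid.datasource': 'events'}  # hypothetical extra properties
tblproperties = ''.join([", '{}' = '{}'".format(k, v)
                         for k, v in hive_tblproperties.items()])
print("TBLPROPERTIES ('serialization.null.format' = ''{})".format(tblproperties))
# TBLPROPERTIES ('serialization.null.format' = '', 'druid.datasource' = 'events')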
Code Example #3
    def test_run_cli_with_hive_conf(self):
        hql = "set key;\n" \
              "set airflow.ctx.dag_id;\nset airflow.ctx.dag_run_id;\n" \
              "set airflow.ctx.task_id;\nset airflow.ctx.execution_date;\n"

        dag_id_ctx_var_name = \
            AIRFLOW_VAR_NAME_FORMAT_MAPPING['AIRFLOW_CONTEXT_DAG_ID']['env_var_format']
        task_id_ctx_var_name = \
            AIRFLOW_VAR_NAME_FORMAT_MAPPING['AIRFLOW_CONTEXT_TASK_ID']['env_var_format']
        execution_date_ctx_var_name = \
            AIRFLOW_VAR_NAME_FORMAT_MAPPING['AIRFLOW_CONTEXT_EXECUTION_DATE'][
                'env_var_format']
        dag_run_id_ctx_var_name = \
            AIRFLOW_VAR_NAME_FORMAT_MAPPING['AIRFLOW_CONTEXT_DAG_RUN_ID'][
                'env_var_format']
        os.environ[dag_id_ctx_var_name] = 'test_dag_id'
        os.environ[task_id_ctx_var_name] = 'test_task_id'
        os.environ[execution_date_ctx_var_name] = 'test_execution_date'
        os.environ[dag_run_id_ctx_var_name] = 'test_dag_run_id'

        hook = HiveCliHook()
        output = hook.run_cli(hql=hql, hive_conf={'key': 'value'})
        self.assertIn('value', output)
        self.assertIn('test_dag_id', output)
        self.assertIn('test_task_id', output)
        self.assertIn('test_execution_date', output)
        self.assertIn('test_dag_run_id', output)

        del os.environ[dag_id_ctx_var_name]
        del os.environ[task_id_ctx_var_name]
        del os.environ[execution_date_ctx_var_name]
        del os.environ[dag_run_id_ctx_var_name]
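
The test above relies on run_cli forwarding the hive_conf entries (plus the Airflow context variables) to the Hive CLI as -hiveconf pairs. A minimal usage sketch, assuming a reachable Hive CLI connection; the key and value are arbitrary:

from airflow.hooks.hive_hooks import HiveCliHook  # module path differs in Airflow 2.x providers

hook = HiveCliHook()
# 'set <key>;' makes Hive echo the effective value, so it appears in the returned output
output = hook.run_cli(hql="set my.custom.key;", hive_conf={'my.custom.key': 'my-value'})
assert 'my-value' in output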
Code Example #5
def merge_pre_hi_with_full_data_task(hive_db, hive_h_his_table_name,
                                     hive_hi_table_name, mysql_db_name,
                                     mysql_table_name, mysql_conn,
                                     sqoop_temp_db_name, sqoop_table_name, pt,
                                     now_hour, pre_day, pre_hour_day, pre_hour,
                                     is_must_have_data, **kwargs):
    sqoopSchema = SqoopSchemaUpdate()

    hive_columns = sqoopSchema.get_hive_column_name(hive_db,
                                                    hive_h_his_table_name)
    mysql_columns = sqoopSchema.get_mysql_column_name(mysql_db_name,
                                                      mysql_table_name,
                                                      mysql_conn)
    pre_day_ms = int(time.mktime(time.strptime(pre_day, "%Y-%m-%d"))) * 1000

    hql = MERGE_HI_WITH_FULL_SQL.format(
        columns=',\n'.join(hive_columns),
        pt=pt,
        now_hour=now_hour,
        db_name=hive_db,
        mysql_db_name=mysql_db_name,
        hive_h_his_table_name=hive_h_his_table_name,
        hive_hi_table_name=hive_hi_table_name,
        mysql_table_name=mysql_table_name,
        pre_day_ms=pre_day_ms,
        mysql_columns=',\n'.join(mysql_columns),
        sqoop_temp_db_name=sqoop_temp_db_name,
        sqoop_table_name=sqoop_table_name)

    hive_hook = HiveCliHook()

    # Read the SQL
    logging.info('Executing: %s', hql)

    # Execute in Hive
    hive_hook.run_cli(hql)

    # Generate the _SUCCESS flag
    """
    First argument true: the data directory has a country_code partition; false: it does not.
    Second argument true: _SUCCESS is generated only when data exists; false: _SUCCESS is generated even when there is no data.
    """
    TaskTouchzSuccess().countries_touchz_success(
        pt, hive_db, hive_h_his_table_name,
        H_HIS_OSS_PATH % hive_h_his_table_name, "false", is_must_have_data,
        now_hour)
Code Example #6
def create_hive_external_table(db, table, conn, **op_kwargs):
    sqoopSchema = SqoopSchemaUpdate()
    response = sqoopSchema.update_hive_schema(
        hive_db=hive_db,
        hive_table=hive_table.format(bs=table),
        mysql_db=db,
        mysql_table=table,
        mysql_conn=conn
    )
    #if response:
    #    return True

    mysql_conn = get_db_conn(conn)
    mcursor = mysql_conn.cursor()
    sql = '''
        select 
            COLUMN_NAME, 
            DATA_TYPE, 
            COLUMN_COMMENT,
            COLUMN_TYPE 
        from information_schema.COLUMNS 
        where TABLE_SCHEMA='{db}' and 
            TABLE_NAME='{table}' 
        order by ORDINAL_POSITION
    '''.format(db=db, table=table)
    # logging.info(sql)
    mcursor.execute(sql)
    res = mcursor.fetchall()
    # logging.info(res)
    columns = []
    for (name, type, comment, co_type) in res:
        if type.upper() == 'DECIMAL':
            columns.append("`%s` %s comment '%s'" % (name, co_type.replace('unsigned', '').replace('signed', ''), comment))
        else:
            columns.append("`%s` %s comment '%s'" % (name, mysql_type_to_hive.get(type.upper(), 'string'), comment))
    mysql_conn.close()
    # SQL to create the Hive table
    hql = ods_create_table_hql.format(
        db_name=hive_db,
        table_name=hive_table.format(bs=table),
        columns=",\n".join(columns),
        hdfs_path=hdfs_path.format(bs=table)
    )
    logging.info(hql)
    hive_hook = HiveCliHook()
    logging.info('Executing: %s', hql)
    hive_hook.run_cli(hql)
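
mysql_type_to_hive above is defined elsewhere in that project; the sketch below is only a plausible guess at such a mapping, consistent with the fallback .get(type.upper(), 'string') and with the string/decimal handling in Code Example #26:

mysql_type_to_hive = {  # assumed contents, not taken from the source project
    'TINYINT': 'tinyint',
    'SMALLINT': 'smallint',
    'INT': 'int',
    'BIGINT': 'bigint',
    'FLOAT': 'float',
    'DOUBLE': 'double',
    'VARCHAR': 'string',
    'CHAR': 'string',
    'TEXT': 'string',
    'DATETIME': 'string',
    'TIMESTAMP': 'string',
}
# Unknown types fall back to 'string'; DECIMAL keeps its MySQL precision/scale, as handled explicitly above.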
Code Example #7
def execution_data_task_id(ds, **kwargs):
    v_date = kwargs.get('v_execution_date')
    v_day = kwargs.get('v_execution_day')
    v_hour = kwargs.get('v_execution_hour')

    hive_hook = HiveCliHook()
    """
            #功能函数
            alter语句: alter_partition
            删除分区: delete_partition
            生产success: touchz_success

            #参数
            第一个参数true: 所有国家是否上线。false 没有
            第二个参数true: 数据目录是有country_code分区。false 没有
            第三个参数true: 数据有才生成_SUCCESS false 数据没有也生成_SUCCESS 

            #读取sql
            %_sql(ds,v_hour)

            第一个参数ds: 天级任务
            第二个参数v_hour: 小时级任务,需要使用

        """
    cf = CountriesPublicFrame("true", ds, db_name, table_name, hdfs_path,
                              "true", "true")

    # Drop the partition
    # cf.delete_partition()

    # Read the SQL
    _sql = "\n" + cf.alter_partition() + "\n" + dm_oride_passenger_base_cube_sql_task(ds)

    logging.info('Executing: %s', _sql)

    # Execute in Hive
    hive_hook.run_cli(_sql)

    # Data circuit breaker, for when the row count must not be 0
    # check_key_data_cnt_task(ds)

    # Data circuit breaker
    check_key_data_cnt_task(ds)

    # Generate success
    cf.touchz_success()
Code Example #8
    def ddl(self):
        """
        Retrieve table ddl
        """
        table = request.args.get("table")
        sql = "SHOW CREATE TABLE {table};".format(table=table)
        hook = HiveCliHook(HIVE_CLI_CONN_ID)
        return hook.run_cli(sql)
Code Example #9
def execution_data_task_id(ds, **kargs):
    hive_hook = HiveCliHook()

    # Read the SQL
    _sql = ods_sqoop_base_bd_agent_df_sql_task(ds)

    logging.info('Executing: %s', _sql)

    # Execute in Hive
    hive_hook.run_cli(_sql)

    # Generate the _SUCCESS flag
    """
    First argument true: the data directory has a country_code partition; false: it does not.
    Second argument true: _SUCCESS is generated only when data exists; false: _SUCCESS is generated even when there is no data.
    """
    TaskTouchzSuccess().countries_touchz_success(ds, db_name, table_name, hdfs_path, "false", "true")
Code Example #10
def execution_data_task_id(ds, **kargs):
    hive_hook = HiveCliHook()

    # Read the SQL
    _sql = dwm_oride_driver_act_w_sql_task(ds)

    logging.info('Executing: %s', _sql)

    # Execute in Hive
    hive_hook.run_cli(_sql)

    # Generate the _SUCCESS flag
    """
    First argument true: the data directory has a country_code partition; false: it does not.
    Second argument true: _SUCCESS is generated only when data exists; false: _SUCCESS is generated even when there is no data.
    """
    TaskTouchzSuccess().countries_touchz_success('{pt}'.format(pt=airflow.macros.ds_add(ds, +6)), db_name, table_name, hdfs_path, "true", "true")
Code Example #11
def execution_act_driver_task(ds, **kargs):
    hive_hook = HiveCliHook()

    # Read the SQL
    _sql = app_oride_act_driver_cohort_w_sql_task(ds)

    # Execute in Hive
    hive_hook.run_cli(_sql)

    # Generate the _SUCCESS flag
    """
    First argument true: the data directory has a country_code partition; false: it does not.
    Second argument true: _SUCCESS is generated only when data exists; false: _SUCCESS is generated even when there is no data.
    """
    pt = airflow.macros.ds_add(ds, +6)
    hdfs_path = get_table_info(3)[1]
    TaskTouchzSuccess().countries_touchz_success(pt, "oride_dw", get_table_info(3)[0], hdfs_path, "true", "true")
Code Example #12
def execution_data_task_id(ds, ds_nodash,  **kargs):
    hive_hook = HiveCliHook()

    # Read the SQL
    _sql = app_opay_active_user_report_w_sql_task(ds, ds_nodash)

    logging.info('Executing: %s', _sql)

    # Execute in Hive
    hive_hook.run_cli(_sql)

    # Generate the _SUCCESS flag
    """
    First argument true: the data directory has a country_code partition; false: it does not.
    Second argument true: _SUCCESS is generated only when data exists; false: _SUCCESS is generated even when there is no data.
    """
    TaskTouchzSuccess().countries_touchz_success(ds, db_name, table_name, hdfs_path, "true", "true")
Code Example #13
def execution_data_task_id(ds, **kargs):
    hive_hook = HiveCliHook()

    # Read the SQL
    _sql = dwd_opay_topup_with_card_record_di_sql_task(ds)

    logging.info('Executing: %s', _sql)

    # Execute in Hive
    hive_hook.run_cli(_sql)

    # Generate the _SUCCESS flag
    """
    First argument true: the data directory has a country_code partition; false: it does not.
    Second argument true: _SUCCESS is generated only when data exists; false: _SUCCESS is generated even when there is no data.
    """
    TaskTouchzSuccess().countries_touchz_success(ds, db_name, table_name, hdfs_path, "true", "true")
Code Example #14
def execution_new_driver_task(ds, **kargs):
    hive_hook = HiveCliHook()

    # Read the SQL
    _sql = app_oride_new_driver_cohort_m_sql_task(ds)

    # Execute in Hive
    hive_hook.run_cli(_sql)

    # Generate the _SUCCESS flag
    """
    First argument true: the data directory has a country_code partition; false: it does not.
    Second argument true: _SUCCESS is generated only when data exists; false: _SUCCESS is generated even when there is no data.
    """
    hdfs_path = get_table_info(1)[1]
    TaskTouchzSuccess().countries_touchz_success(ds, "oride_dw",
                                                 get_table_info(1)[0],
                                                 hdfs_path, "true", "true")
Code Example #15
def execution_data_task_id(ds,**kargs):
    hive_hook = HiveCliHook()

    # Read the SQL
    _sql = dim_oride_passenger_whitelist_base_sql_task(ds)

    # Execute in Hive
    hive_hook.run_cli(_sql)

    # Data circuit breaker
    check_key_data_task(ds)

    # Generate the _SUCCESS flag
    """
    First argument true: the data directory has a country_code partition; false: it does not.
    Second argument true: _SUCCESS is generated only when data exists; false: _SUCCESS is generated even when there is no data.
    """
    TaskTouchzSuccess().countries_touchz_success(ds, db_name, table_name, hdfs_path, "true", "true")
Code Example #16
def execution_data_task_id(ds, **kargs):
    hive_hook = HiveCliHook()

    # Read the SQL
    _sql = app_opay_owealth_report_d_19_sql_task(ds)

    logging.info('Executing: %s', _sql)

    # Execute in Hive
    hive_hook.run_cli(_sql)

    # Generate the _SUCCESS flag
    """
    First argument true: the data directory has a country_code partition; false: it does not.
    Second argument true: _SUCCESS is generated only when data exists; false: _SUCCESS is generated even when there is no data.
    """
    TaskTouchzSuccess().countries_touchz_success(airflow.macros.ds_add(ds, +1),
                                                 db_name, table_name,
                                                 hdfs_path, "true", "true")
Code Example #17
def execution_data_task_id(ds, **kargs):
    hive_hook = HiveCliHook()

    # Read the SQL
    _sql = app_oride_driver_transport_capacity_d_sql_task(ds)

    logging.info('Executing: %s', _sql)

    # Execute in Hive
    hive_hook.run_cli(_sql)

    # Data circuit breaker
    # check_key_data_task(ds)

    # Generate the _SUCCESS flag
    """
    First argument true: the data directory has a country_code partition; false: it does not.
    Second argument true: _SUCCESS is generated only when data exists; false: _SUCCESS is generated even when there is no data.
    """
    TaskTouchzSuccess().countries_touchz_success(ds, db_name, table_name, hdfs_path, "true", "true")
Code Example #18
def execution_data_task_id(ds, **kwargs):
    hive_hook = HiveCliHook()

    v_hour = kwargs.get('v_execution_hour')

    # Read the SQL
    _sql = dwd_driver_track_data_hi_sql_task(ds, v_hour)

    logging.info('Executing: %s', _sql)

    # Execute in Hive
    hive_hook.run_cli(_sql)

    # Generate the _SUCCESS flag
    """
    First argument true: the data directory has a country_code partition; false: it does not.
    Second argument true: _SUCCESS is generated only when data exists; false: _SUCCESS is generated even when there is no data.
    """
    TaskTouchzSuccess().countries_touchz_success(ds, db_name, table_name,
                                                 hdfs_path, "true", "false",
                                                 v_hour)
Code Example #19
def execution_data_task_id(ds, execution_date, **kargs):
    hive_hook = HiveCliHook()

    # Read the SQL
    _sql = dwd_oride_location_driver_event_hi_sql_task(ds, execution_date)

    logging.info('Executing: %s', _sql)

    # Execute in Hive
    hive_hook.run_cli(_sql)

    # Data circuit breaker
    # check_key_data_task(ds, execution_date)

    # Generate the _SUCCESS flag
    """
    First argument true: the data directory has a country_code partition; false: it does not.
    Second argument true: _SUCCESS is generated only when data exists; false: _SUCCESS is generated even when there is no data.
    """
    TaskTouchzSuccess().countries_touchz_success(ds, db_name, table_name, hdfs_path, "true", "false",
                                                 execution_date.strftime("%H"))
Code Example #20
def merge_pre_hi_data_task(hive_db, hive_all_hi_table_name, hive_hi_table_name, is_must_have_data, pt, now_hour,
                           pre_hour_day, pre_hour,
                           **kwargs):
    sqoopSchema = SqoopSchemaUpdate()
    hive_columns = sqoopSchema.get_hive_column_name(hive_db, hive_all_hi_table_name)

    hql = ADD_HI_SQL.format(
        db_name=hive_db,
        hive_all_hi_table_name=hive_all_hi_table_name,
        hive_hi_table_name=hive_hi_table_name,
        pt=pt,
        now_hour=now_hour,
        pre_hour_day=pre_hour_day,
        pre_hour=pre_hour,
        columns=',\n'.join(hive_columns)
    )

    hive_hook = HiveCliHook()

    # Read the SQL
    logging.info('Executing: %s', hql)

    # Execute in Hive
    hive_hook.run_cli(hql)

    # Generate the _SUCCESS flag
    """
    First argument true: the data directory has a country_code partition; false: it does not.
    Second argument true: _SUCCESS is generated only when data exists; false: _SUCCESS is generated even when there is no data.
    """
    TaskTouchzSuccess().countries_touchz_success(pt, hive_db, hive_all_hi_table_name,
                                                 ALL_HI_OSS_PATH % hive_all_hi_table_name,
                                                 "false",
                                                 is_must_have_data,
                                                 now_hour)
Code Example #21
def execution_data_task_id(ds, **kargs):
    hive_hook = HiveCliHook()

    # Read the SQL
    _sql = dwd_active_user_week_di_sql_task(ds)

    logging.info('Executing: %s', _sql)

    # Execute in Hive
    hive_hook.run_cli(_sql)

    # Data circuit breaker
    # check_key_data_task(ds)

    # Generate the _SUCCESS flag
    """
    First argument true: the data directory has a country_code partition; false: it does not.
    Second argument true: _SUCCESS is generated only when data exists; false: _SUCCESS is generated even when there is no data.
    """
    after_6_day = airflow.macros.ds_add(ds, +6)
    TaskTouchzSuccess().countries_touchz_success(after_6_day, db_name,
                                                 table_name, hdfs_path, "true",
                                                 "true")
Code Example #22
File: mapr_tasks_dag.py  Project: verdyr/mapr-airflow
def query_hive(**kwargs):
    ti = kwargs['ti']
    # get sha of latest commit
    v1 = ti.xcom_pull(key=None, task_ids='get_last_commit_task')
    json_value = json.loads(v1)
    sha = json_value['sha']

    hive_cli = HiveCliHook()
    hql = "select * from mapr_music_updates where commit_sha = '" + sha + "';"
    latest_commit = hive_cli.run_cli(hql)

    changed = latest_commit.find(sha) == -1
    ti.xcom_push(key='sha', value=sha)
    ti.xcom_push(key='is_changed', value=changed)

    return 'reimport_dataset_task' if changed else 'skip_reimport_dataset_task'
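
Because query_hive returns a task id, it is presumably wired into the DAG with a BranchPythonOperator. A hedged sketch of that wiring (the task id is hypothetical; dag is assumed to be defined elsewhere in mapr_tasks_dag.py):

from airflow.operators.python_operator import BranchPythonOperator  # Airflow 1.x module path

query_hive_task = BranchPythonOperator(
    task_id='query_hive_task',   # hypothetical task id
    python_callable=query_hive,
    provide_context=True,        # needed so kwargs['ti'] is populated
    dag=dag,
)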
Code Example #23
    def execute(self, context=None):
        try:
            metastore_hook = HiveMetastoreHook(metastore_conn_id=self.metastore_conn_id)
            table_metadata = metastore_hook.get_table(self.table, db=self.schema)
            is_partitioned = len(table_metadata.partitionKeys) > 0
            column_string = ', '.join([col.name for col in table_metadata.sd.cols])

            where_clause = 'WHERE {}'.format(self.partition) if is_partitioned else ''
            self.hql = "SELECT COUNT(col2sum) FROM (SELECT COUNT(1) AS col2sum FROM {}.{} {} GROUP BY {}) t2 " \
                       "WHERE t2.col2sum > 1".format(self.schema, self.table, where_clause, column_string)

            hook = HiveCliHook(hive_cli_conn_id=self.hive_cli_conn_id, mapred_queue=self.mapred_queue)
            hook.hive_cli_params = '-S'  # suppress hive junk output
            output = hook.run_cli(hql=self.hql, schema=self.schema)
            output_row = int(output.strip())

            if output_row > self.max_duplicates:
                raise AirflowException('There are {} duplicate records found whereas the max number of duplicates'
                                       ' can be {}'.format(output_row, self.max_duplicates))

        except Exception as e:
            raise AirflowException('An error occurred with the following duplicate check query:\n\t{}\n{}'
                                   .format(self.hql, e))
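
For clarity, this is what the duplicate-check query above renders to for a hypothetical partitioned table my_db.events with columns id and name:

schema, table = 'my_db', 'events'          # hypothetical
where_clause = "WHERE ds = '2020-01-01'"   # only present for partitioned tables
column_string = 'id, name'
hql = "SELECT COUNT(col2sum) FROM (SELECT COUNT(1) AS col2sum FROM {}.{} {} GROUP BY {}) t2 " \
      "WHERE t2.col2sum > 1".format(schema, table, where_clause, column_string)
print(hql)
# SELECT COUNT(col2sum) FROM (SELECT COUNT(1) AS col2sum FROM my_db.events WHERE ds = '2020-01-01' GROUP BY id, name) t2 WHERE t2.col2sum > 1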
Code Example #24
    def ddl(self):
        table = request.args.get("table")
        sql = "SHOW CREATE TABLE {table};".format(table=table)
        h = HiveCliHook(HIVE_CLI_CONN_ID)
        return h.run_cli(sql)
Code Example #25
def create_table_info(hive_db, hive_table, dwd_db_name, dwd_table_name):
    """
        Table creation information
    """
    hive_hook = HiveCliHook()

    hive_cursor = get_hive_cursor()

    hql = '''
        DESCRIBE FORMATTED {db}.{table} 
    '''.format(db=hive_db, table=hive_table)
    #logging.info(hql)
    hive_cursor.execute(hql)
    res = hive_cursor.fetchall()

    hive_schema = []

    hive_schema_exp = []

    location = None

    for (column_name, column_type, column_comment) in res:
        col_name = column_name.lower().strip()
        column_type = str(column_type).strip()

        if col_name == 'location:':
            location = column_type
            break

        # Replace empty comments with "未知" (unknown)
        if column_comment == "" or column_comment == "from deserializer":
            column_comment = "未知"

        if col_name == '# col_name' or col_name == '':
            continue

        if col_name == '# partition information':
            if column_comment is None:
                column_comment = "未知"
            break

        _schema = col_name + " " + column_type + " " + "COMMENT" + " '" + column_comment.replace(
            "\\n", "") + "',"

        hive_schema_exp.append(_schema)

    nm = len(hive_schema_exp)
    a = 0
    b = ""

    for i in hive_schema_exp:
        a = a + 1

        # Strip the trailing ',' from the last entry
        if a == nm:
            i = i.replace(",", "")

        b = b + "\n" + i

    create_str = """
    use {dwd_hive_db};
    CREATE EXTERNAL TABLE {dwd_hive_db}.{dwd_hive_table}(
    {schema_sql}
    )
    comment ""
    partitioned by (country_code string comment '二位国家码',dt string comment '分区时间')
        STORED AS orc
        LOCATION
        "ufile://opay-datalake/oride/{dwd_hive_db}/{dwd_hive_table}";
    """.format(dwd_hive_db=dwd_db_name,
               dwd_hive_table=dwd_table_name,
               schema_sql=b)

    hive_hook.run_cli(create_str)
Code Example #26
def run_check_table(schema_table_db_name, schema_table_name,
                    target_table_db_name, target_table_name, conn_id,
                    hive_table_name, server_name, **kwargs):
    # SHOW TABLES in oride_db LIKE 'data_aa'
    check_sql = 'SHOW TABLES in %s LIKE \'%s\'' % (HIVE_DB, hive_table_name)
    hive2_conn = HiveServer2Hook().get_conn()
    cursor = hive2_conn.cursor()
    cursor.execute(check_sql)
    if len(cursor.fetchall()) == 0:
        logging.info('Create Hive Table: %s.%s', HIVE_DB, hive_table_name)
        # get table column
        column_sql = '''
                SELECT
                    COLUMN_NAME,
                    DATA_TYPE,
                    NUMERIC_PRECISION,
                    NUMERIC_SCALE,
                    COLUMN_COMMENT
                FROM
                    information_schema.columns
                WHERE
                    table_schema='{db_name}' and table_name='{table_name}'
            '''.format(db_name=schema_table_db_name,
                       table_name=schema_table_name)
        mysql_hook = MySqlHook(conn_id)
        mysql_conn = mysql_hook.get_conn()
        mysql_cursor = mysql_conn.cursor()
        mysql_cursor.execute(column_sql)
        results = mysql_cursor.fetchall()
        rows = []
        for result in results:
            if result[0] == 'dt':
                col_name = '_dt'
            else:
                col_name = result[0]
            if result[1] in ('timestamp', 'varchar', 'char', 'text', 'longtext',
                             'mediumtext', 'json', 'datetime'):
                data_type = 'string'
            elif result[1] == 'decimal':
                data_type = result[1] + "(" + str(result[2]) + "," + str(
                    result[3]) + ")"
            else:
                data_type = result[1]
            rows.append("`%s` %s comment '%s'" %
                        (col_name, data_type, str(result[4]).replace(
                            '\n', '').replace('\r', '')))
        mysql_conn.close()

        # hive create table
        hive_hook = HiveCliHook()
        sql = ODS_CREATE_TABLE_SQL.format(
            db_name=HIVE_DB,
            table_name=hive_table_name,
            columns=",\n".join(rows),
            oss_path=OSS_PATH % ("{server_name}.{db_name}.{table_name}".format(
                server_name=server_name,
                db_name=target_table_db_name,
                table_name=target_table_name)))
        logging.info('Executing: %s', sql)
        hive_hook.run_cli(sql)

    else:
        sqoopSchema = SqoopSchemaUpdate()
        response = sqoopSchema.append_hive_schema(
            hive_db=HIVE_DB,
            hive_table=hive_table_name,
            mysql_db=schema_table_db_name,
            mysql_table=schema_table_name,
            mysql_conn=conn_id,
            oss_path=OSS_PATH % ("{server_name}.{db_name}.{table_name}".format(
                server_name=server_name,
                db_name=target_table_db_name,
                table_name=target_table_name)))
        if response:
            return True
    return
Code Example #27
    def execute(self, context):
        ti = context['ti']
        host, dagid, taskid, exectime = ti.hostname.split(
            '.')[0], ti.dag_id, ti.task_id, ti.execution_date.isoformat()
        hook = HiveCliHook(
            hive_cli_conn_id=self.hive_cli_conn_id,
            mapred_queue=self.mapred_queue,
            mapred_job_name='Airflow HiveEmailOperator task for {}.{}.{}.{}'.
            format(host, dagid, taskid, exectime))
        hook.hive_cli_params = '-S'  # suppress hive junk output
        output = hook.run_cli(hql=self.hql,
                              schema=self.schema,
                              hive_conf={'hive.cli.print.header': 'true'})

        output_rows = [line for line in output.split('\n') if line]
        col_names = output_rows[0].split('\t')
        output_rows = output_rows[1:]

        if len(output_rows) > self.cutoff:
            msg = 'The query returned > {} rows.. Adding tsv as an attachment.'.format(
                self.cutoff)
            logging.warning(msg)
            f = tempfile.NamedTemporaryFile(mode='w', delete=False)
            f.write(output)
            f.close()
            self.files = [f.name]
            self.html_content = '{}<br>Dag id: {}<br>Task id: {}<br>Execution Time: {}'.format(
                msg, dagid, taskid, exectime)
        else:
            context.update({
                'hql': self.hql,
                'rows': output_rows,
                'col_names': col_names
            })

            if not self.html_content:
                check_path = os.path.join(os.path.dirname(__file__), '..',
                                          'templates',
                                          'hive_email_default.html')
            else:
                dag_path = conf.get('core', 'dags_folder')
                check_path = os.path.join(
                    dag_path, os.path.dirname(context['dag'].filepath),
                    self.html_content)

            if os.path.exists(check_path):
                path, filename = os.path.split(os.path.abspath(check_path))
                template = Environment(
                    loader=FileSystemLoader(path)).get_template(filename)
                logging.info("Using templated file located at: {path}".format(
                    path=check_path))
            else:
                template = Environment(loader=BaseLoader()).from_string(
                    self.html_content)

            self.html_content = template.render(**context)

        super(HiveEmailOperator, self).execute(context)

        # delete the temp file after successfully attached to email
        if len(output_rows) > self.cutoff:
            os.unlink(f.name)
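
The parsing above depends on hive.cli.print.header=true putting the column names in the first line of the CLI output. A small illustration with fabricated output:

output = "id\tname\n1\talice\n2\tbob"     # fabricated CLI output
output_rows = [line for line in output.split('\n') if line]
col_names = output_rows[0].split('\t')    # ['id', 'name']
output_rows = output_rows[1:]             # ['1\talice', '2\tbob']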
Code Example #28
    def test_run_cli(self):
        hook = HiveCliHook()
        hook.run_cli("SHOW DATABASES")
Code Example #29
def execution_data_task_id(ds, dag, **kwargs):
    v_date = kwargs.get('v_execution_date')
    v_day = kwargs.get('v_execution_day')
    v_hour = kwargs.get('v_execution_hour')

    hive_hook = HiveCliHook()

    # Read the SQL
    # _sql = app_otrade_b2b_order_target_retailer_di_sql_task(ds)

    # logging.info('Executing: %s', _sql)

    # Execute in Hive
    # hive_hook.run_cli(_sql)

    # Data circuit breaker
    # check_key_data_task(ds)

    # Generate the _SUCCESS flag
    """
    First argument true: the data directory has a country_code partition; false: it does not.
    Second argument true: _SUCCESS is generated only when data exists; false: _SUCCESS is generated even when there is no data.
    """
    # TaskTouchzSuccess().countries_touchz_success(ds, db_name, table_name, hdfs_path, "true", "false")
    """
        #功能函数
            alter语句: alter_partition()
            删除分区: delete_partition()
            生产success: touchz_success()

        #参数
            is_countries_online --是否开通多国家业务 默认(true 开通)
            db_name --hive 数据库的名称
            table_name --hive 表的名称
            data_oss_path --oss 数据目录的地址
            is_country_partition --是否有国家码分区,[默认(true 有country_code分区)]
            is_result_force_exist --数据是否强行产出,[默认(true 必须有数据才生成_SUCCESS)] false 数据没有也生成_SUCCESS 
            execute_time --当前脚本执行时间(%Y-%m-%d %H:%M:%S)
            is_hour_task --是否开通小时级任务,[默认(false)]
            frame_type --模板类型(只有 is_hour_task:'true' 时生效): utc 产出分区为utc时间,local 产出分区为本地时间,[默认(utc)]。
            is_offset --是否开启时间前后偏移(影响success 文件)
            execute_time_offset --执行时间偏移值(-1、0、1),在当前执行时间上,前后偏移原有时间,用于产出前后小时分区
            business_key --产品线名称

        #读取sql
            %_sql(ds,v_hour)

    """

    args = [{
        "dag": dag,
        "is_countries_online": "true",
        "db_name": db_name,
        "table_name": table_name,
        "data_oss_path": hdfs_path,
        "is_country_partition": "true",
        "is_result_force_exist": "false",
        "execute_time": v_date,
        "is_hour_task": "false",
        "frame_type": "local",
        "is_offset": "true",
        "execute_time_offset": -1,
        "business_key": "otrade"
    }]

    cf = CountriesAppFrame(args)

    # Drop the partition
    cf.delete_partition()

    # Read the SQL
    _sql = "\n" + cf.alter_partition() + "\n" + app_otrade_b2b_order_target_retailer_di_sql_task(ds)

    logging.info('Executing: %s', _sql)

    # Execute in Hive
    hive_hook.run_cli(_sql)

    # Generate success
    cf.touchz_success()
Code Example #30
    def append_hive_schema(self, **info):
        try:
            hive_db = info.get('hive_db', None)
            hive_table = info.get('hive_table', None)
            mysql_db = info.get('mysql_db', None)
            mysql_table = info.get('mysql_table', None)
            mysql_conn = info.get('mysql_conn', None)
            oss_path = info.get('oss_path', None)

            if hive_db is None or hive_table is None or mysql_db is None or mysql_table is None or mysql_conn is None:
                return None

            hive_info = self.__get_hive_table_and_type_schema(hive_db, hive_table)
            hive_columns = list(hive_info[0])
            hive_schema = list(hive_info[1])

            mysql_info = self.__get_mysql_table_schema_and_column_name(mysql_db, mysql_table, mysql_conn)
            mysql_schema = mysql_info[0]
            mysql_column = mysql_info[1]
            add_columns = set(mysql_column).difference(set(hive_columns))
            add_columns = list(add_columns)

            if len(add_columns) == 0:
                return True

            columns = []
            for add_column in add_columns:
                for column in mysql_schema:
                    if column.get('column', None) == add_column:
                        column_info = column.get('column_info', None)
                        if column_info:
                            columns.append(column_info)

            hive_schema.extend(columns)

            hql = '''
                DROP TABLE IF EXISTS {db}.`{table}`;
                CREATE EXTERNAL TABLE IF NOT EXISTS {db}.`{table}`(
                    {columns}
                )
                PARTITIONED BY (
                  `dt` string,
                  `hour` string
                )
                ROW FORMAT SERDE 
                    'org.openx.data.jsonserde.JsonSerDe' 
                WITH SERDEPROPERTIES ( 
                    'ignore.malformed.json'='true') 
                STORED AS INPUTFORMAT 
                    'org.apache.hadoop.mapred.TextInputFormat' 
                OUTPUTFORMAT 
                    'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'
                LOCATION
                  '{oss_path}';
                MSCK REPAIR TABLE {db}.`{table}`;
                
            '''.format(
                db=hive_db,
                table=hive_table,
                columns=",\n".join(hive_schema),
                oss_path=oss_path
            )

            logging.info(hql)
            hive_hook = HiveCliHook()
            hive_hook.run_cli(hql)
            return True
        except BaseException as e:
            logging.info("Exception Info::")
            logging.info(e)
            raise AirflowException('sqoop append hive schema error')
Code Example #31
File: main.py  Project: AdamUnger/incubator-airflow
    def ddl(self):
        table = request.args.get("table")
        sql = "SHOW CREATE TABLE {table};".format(table=table)
        h = HiveCliHook(HIVE_CLI_CONN_ID)
        return h.run_cli(sql)
Code Example #32
def move_data_mssql(templates_dict, **kwargs):

    tmp_table = 'clean_mydata_tmp'
    table = 'clean_mydata'
    delimiter = "','"
    quote_char = "'\"'"
    escape_char = "'\\\\'"
    number_of_mappers = '4'
    mssql_table = 'mydata'

    # -------Convert Hive Table from PARQUET to TEXT --------------------------------------------

    hook = HiveCliHook()

    query = """
        DROP TABLE {tmp_table};

        CREATE TABLE {tmp_table}
            ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.OpenCSVSerde'
            WITH SERDEPROPERTIES (
               "separatorChar" = {delimiter},
               "quoteChar"     = {quote_char},
               "escapeChar"    = {escape_char}
            )
        STORED AS TEXTFILE
        AS
        SELECT column1, column2, column3, column4, column5, column6,column7, column8, column9, column10, source_id
        FROM {table}

    """.format(tmp_table=tmp_table, table=table,
               delimiter=delimiter, quote_char=quote_char, escape_char=escape_char)

    hook.run_cli(schema=templates_dict['schema'], hql=query)

    # --------------Run sqoop--------------------------------------------------------------------

    # get default mssql connection
    sql_conn = BaseHook.get_connection('mssql_default')
    conn_str = "'jdbc:sqlserver://{host};databaseName={database}'".format(
        host=sql_conn.host,
        database=sql_conn.extra_dejson.get('database')
    )

    # get default hive cli connection
    hive_conn = BaseHook.get_connection('hive_cli_default')
    hdfs_export_dir = 'hdfs://{host}:{port}/user/hive/warehouse/my_hive_db.db/{table}'.format(
        host=hive_conn.host,
        port=8020,
        table=tmp_table
    )

    cmd = ['/usr/bin/sqoop', 'export', '--connect', conn_str,
           '--username', sql_conn.login, '--password', sql_conn.password,
           '--table', mssql_table,
           '--export-dir', hdfs_export_dir,
           '-m', number_of_mappers,
           '--input-fields-terminated-by', delimiter,
           '--input-enclosed-by', quote_char,
           '--input-escaped-by', escape_char,
           '--input-null-string', "'\\\\N'"]
    cmd = ' '.join(cmd)

    print(cmd)

    logging.info("Executing sqoop")
    sp = Popen(cmd, shell=True)

    sp.wait()
    logging.info("Command exited with return code {0}".format(sp.returncode))
Code Example #33
    def test_run_cli(self):
        hook = HiveCliHook()
        hook.run_cli("SHOW DATABASES")