def execute(self, context):
    hive = HiveCliHook(hive_cli_conn_id=self.hive_cli_conn_id)
    logging.info("Extracting data from Hive")
    hive_table = "druid." + context["task_instance_key_str"].replace(".", "_")
    sql = self.sql.strip().strip(";")
    hql = """\
    set mapred.output.compress=false;
    set hive.exec.compress.output=false;
    DROP TABLE IF EXISTS {hive_table};
    CREATE TABLE {hive_table}
    ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
    STORED AS TEXTFILE
    TBLPROPERTIES ('serialization.null.format' = '')
    AS
    {sql}
    """.format(**locals())
    logging.info("Running command:\n {}".format(hql))
    hive.run_cli(hql)

    m = HiveMetastoreHook(self.metastore_conn_id)
    t = m.get_table(hive_table)
    columns = [col.name for col in t.sd.cols]

    hdfs_uri = m.get_table(hive_table).sd.location
    pos = hdfs_uri.find("/user")
    static_path = hdfs_uri[pos:]

    schema, table = hive_table.split(".")

    druid = DruidHook(druid_ingest_conn_id=self.druid_ingest_conn_id)
    logging.info("Inserting rows into Druid")
    logging.info("HDFS path: " + static_path)

    try:
        druid.load_from_hdfs(
            datasource=self.druid_datasource,
            intervals=self.intervals,
            static_path=static_path,
            ts_dim=self.ts_dim,
            columns=columns,
            num_shards=self.num_shards,
            target_partition_size=self.target_partition_size,
            query_granularity=self.query_granularity,
            segment_granularity=self.segment_granularity,
            metric_spec=self.metric_spec,
            hadoop_dependency_coordinates=self.hadoop_dependency_coordinates,
        )
        logging.info("Load seems to have succeeded!")
    finally:
        logging.info("Cleaning up by dropping the temp "
                     "Hive table {}".format(hive_table))
        hql = "DROP TABLE IF EXISTS {}".format(hive_table)
        hive.run_cli(hql)
def execute(self, context):
    hive = HiveCliHook(hive_cli_conn_id=self.hive_cli_conn_id)
    self.log.info("Extracting data from Hive")
    hive_table = 'druid.' + context['task_instance_key_str'].replace('.', '_')
    sql = self.sql.strip().strip(';')
    tblproperties = ''.join([", '{}' = '{}'".format(k, v)
                             for k, v in self.hive_tblproperties.items()])
    hql = """\
    SET mapred.output.compress=false;
    SET hive.exec.compress.output=false;
    DROP TABLE IF EXISTS {hive_table};
    CREATE TABLE {hive_table}
    ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
    STORED AS TEXTFILE
    TBLPROPERTIES ('serialization.null.format' = ''{tblproperties})
    AS
    {sql}
    """.format(hive_table=hive_table, tblproperties=tblproperties, sql=sql)
    self.log.info("Running command:\n %s", hql)
    hive.run_cli(hql)

    m = HiveMetastoreHook(self.metastore_conn_id)

    # Get the Hive table and extract the columns
    t = m.get_table(hive_table)
    columns = [col.name for col in t.sd.cols]

    # Get the path on hdfs
    static_path = m.get_table(hive_table).sd.location

    schema, table = hive_table.split('.')

    druid = DruidHook(druid_ingest_conn_id=self.druid_ingest_conn_id)

    try:
        index_spec = self.construct_ingest_query(
            static_path=static_path,
            columns=columns,
        )
        self.log.info("Inserting rows into Druid, hdfs path: %s", static_path)
        druid.submit_indexing_job(index_spec)
        self.log.info("Load seems to have succeeded!")
    finally:
        self.log.info(
            "Cleaning up by dropping the temp Hive table %s",
            hive_table
        )
        hql = "DROP TABLE IF EXISTS {}".format(hive_table)
        hive.run_cli(hql)
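# For reference, a minimal sketch of how the temporary table name used by both
# operator variants above is derived. The task_instance_key_str value below is
# purely illustrative (of the usual "<dag_id>__<task_id>__<ds_nodash>" shape),
# not taken from the snippets themselves.
context = {"task_instance_key_str": "hive_to_druid__load_data__20200101"}

# Dots are replaced so the result is always a valid "<db>.<table>" identifier.
hive_table = "druid." + context["task_instance_key_str"].replace(".", "_")
print(hive_table)  # druid.hive_to_druid__load_data__20200101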
def test_run_cli_with_hive_conf(self):
    hql = "set key;\n" \
          "set airflow.ctx.dag_id;\nset airflow.ctx.dag_run_id;\n" \
          "set airflow.ctx.task_id;\nset airflow.ctx.execution_date;\n"

    dag_id_ctx_var_name = \
        AIRFLOW_VAR_NAME_FORMAT_MAPPING['AIRFLOW_CONTEXT_DAG_ID']['env_var_format']
    task_id_ctx_var_name = \
        AIRFLOW_VAR_NAME_FORMAT_MAPPING['AIRFLOW_CONTEXT_TASK_ID']['env_var_format']
    execution_date_ctx_var_name = \
        AIRFLOW_VAR_NAME_FORMAT_MAPPING['AIRFLOW_CONTEXT_EXECUTION_DATE'][
            'env_var_format']
    dag_run_id_ctx_var_name = \
        AIRFLOW_VAR_NAME_FORMAT_MAPPING['AIRFLOW_CONTEXT_DAG_RUN_ID'][
            'env_var_format']

    os.environ[dag_id_ctx_var_name] = 'test_dag_id'
    os.environ[task_id_ctx_var_name] = 'test_task_id'
    os.environ[execution_date_ctx_var_name] = 'test_execution_date'
    os.environ[dag_run_id_ctx_var_name] = 'test_dag_run_id'

    hook = HiveCliHook()
    output = hook.run_cli(hql=hql, hive_conf={'key': 'value'})
    self.assertIn('value', output)
    self.assertIn('test_dag_id', output)
    self.assertIn('test_task_id', output)
    self.assertIn('test_execution_date', output)
    self.assertIn('test_dag_run_id', output)

    del os.environ[dag_id_ctx_var_name]
    del os.environ[task_id_ctx_var_name]
    del os.environ[execution_date_ctx_var_name]
    del os.environ[dag_run_id_ctx_var_name]
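# The test above relies on run_cli forwarding the hive_conf dict (plus the
# Airflow context variables picked up from the environment) to the hive CLI as
# configuration overrides. A minimal sketch of the kind of argument list this
# corresponds to; to_hiveconf_args is a hypothetical helper written only for
# illustration, not the hook's internal API.
def to_hiveconf_args(hive_conf):
    """Render a dict into repeated -hiveconf key=value CLI arguments."""
    args = []
    for key, value in hive_conf.items():
        args.extend(["-hiveconf", "{}={}".format(key, value)])
    return args


print(to_hiveconf_args({"key": "value", "airflow.ctx.dag_id": "test_dag_id"}))
# ['-hiveconf', 'key=value', '-hiveconf', 'airflow.ctx.dag_id=test_dag_id']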
def merge_pre_hi_with_full_data_task(hive_db, hive_h_his_table_name, hive_hi_table_name,
                                     mysql_db_name, mysql_table_name, mysql_conn,
                                     sqoop_temp_db_name, sqoop_table_name, pt, now_hour,
                                     pre_day, pre_hour_day, pre_hour, is_must_have_data,
                                     **kwargs):
    sqoopSchema = SqoopSchemaUpdate()
    hive_columns = sqoopSchema.get_hive_column_name(hive_db, hive_h_his_table_name)
    mysql_columns = sqoopSchema.get_mysql_column_name(mysql_db_name, mysql_table_name, mysql_conn)
    pre_day_ms = int(time.mktime(time.strptime(pre_day, "%Y-%m-%d"))) * 1000
    hql = MERGE_HI_WITH_FULL_SQL.format(
        columns=',\n'.join(hive_columns),
        pt=pt,
        now_hour=now_hour,
        db_name=hive_db,
        mysql_db_name=mysql_db_name,
        hive_h_his_table_name=hive_h_his_table_name,
        hive_hi_table_name=hive_hi_table_name,
        mysql_table_name=mysql_table_name,
        pre_day_ms=pre_day_ms,
        mysql_columns=',\n'.join(mysql_columns),
        sqoop_temp_db_name=sqoop_temp_db_name,
        sqoop_table_name=sqoop_table_name
    )

    hive_hook = HiveCliHook()

    # Read the SQL
    logging.info('Executing: %s', hql)

    # Run it through Hive
    hive_hook.run_cli(hql)

    # Write the _SUCCESS marker
    """
    First argument "true": the data directory has a country_code partition; "false": it does not.
    Second argument "true": _SUCCESS is written only when data exists; "false": it is written even without data.
    """
    TaskTouchzSuccess().countries_touchz_success(pt, hive_db, hive_h_his_table_name,
                                                 H_HIS_OSS_PATH % hive_h_his_table_name,
                                                 "false", is_must_have_data, now_hour)
def create_hive_external_table(db, table, conn, **op_kwargs):
    sqoopSchema = SqoopSchemaUpdate()
    response = sqoopSchema.update_hive_schema(
        hive_db=hive_db,
        hive_table=hive_table.format(bs=table),
        mysql_db=db,
        mysql_table=table,
        mysql_conn=conn
    )
    # if response:
    #     return True

    mysql_conn = get_db_conn(conn)
    mcursor = mysql_conn.cursor()
    sql = '''
        select
            COLUMN_NAME,
            DATA_TYPE,
            COLUMN_COMMENT,
            COLUMN_TYPE
        from information_schema.COLUMNS
        where TABLE_SCHEMA='{db}'
            and TABLE_NAME='{table}'
        order by ORDINAL_POSITION
    '''.format(db=db, table=table)
    # logging.info(sql)
    mcursor.execute(sql)
    res = mcursor.fetchall()
    # logging.info(res)
    columns = []
    for (name, type, comment, co_type) in res:
        if type.upper() == 'DECIMAL':
            columns.append("`%s` %s comment '%s'" %
                           (name, co_type.replace('unsigned', '').replace('signed', ''), comment))
        else:
            columns.append("`%s` %s comment '%s'" %
                           (name, mysql_type_to_hive.get(type.upper(), 'string'), comment))
    mysql_conn.close()

    # SQL that creates the Hive table
    hql = ods_create_table_hql.format(
        db_name=hive_db,
        table_name=hive_table.format(bs=table),
        columns=",\n".join(columns),
        hdfs_path=hdfs_path.format(bs=table)
    )
    logging.info(hql)
    hive_hook = HiveCliHook()
    logging.info('Executing: %s', hql)
    hive_hook.run_cli(hql)
def execution_data_task_id(ds, **kwargs):
    v_date = kwargs.get('v_execution_date')
    v_day = kwargs.get('v_execution_day')
    v_hour = kwargs.get('v_execution_hour')

    hive_hook = HiveCliHook()

    """
    # Helper functions
        ALTER statement:  alter_partition
        Drop partition:   delete_partition
        Produce _SUCCESS: touchz_success

    # Arguments
        First argument "true": whether all countries are live; "false": they are not.
        Second argument "true": the data directory has a country_code partition; "false": it does not.
        Third argument "true": _SUCCESS is written only when data exists; "false": it is written even without data.

    # Reading the SQL
        %_sql(ds, v_hour)
        First argument ds: daily task.
        Second argument v_hour: hourly task, required when used.
    """

    cf = CountriesPublicFrame("true", ds, db_name, table_name, hdfs_path, "true", "true")

    # Drop the partition
    # cf.delete_partition()

    # Read the SQL
    _sql = "\n" + cf.alter_partition() + "\n" + dm_oride_passenger_base_cube_sql_task(ds)

    logging.info('Executing: %s', _sql)

    # Run it through Hive
    hive_hook.run_cli(_sql)

    # Circuit-breaker check: the data count must not be 0
    # check_key_data_cnt_task(ds)

    # Circuit-breaker check
    check_key_data_cnt_task(ds)

    # Produce _SUCCESS
    cf.touchz_success()
def ddl(self):
    """
    Retrieve table ddl
    """
    table = request.args.get("table")
    sql = "SHOW CREATE TABLE {table};".format(table=table)
    hook = HiveCliHook(HIVE_CLI_CONN_ID)
    return hook.run_cli(sql)
def execution_data_task_id(ds, **kargs):
    hive_hook = HiveCliHook()

    # Read the SQL
    _sql = ods_sqoop_base_bd_agent_df_sql_task(ds)

    logging.info('Executing: %s', _sql)

    # Run it through Hive
    hive_hook.run_cli(_sql)

    # Write the _SUCCESS marker
    """
    First argument "true": the data directory has a country_code partition; "false": it does not.
    Second argument "true": _SUCCESS is written only when data exists; "false": it is written even without data.
    """
    TaskTouchzSuccess().countries_touchz_success(ds, db_name, table_name, hdfs_path, "false", "true")
def execution_data_task_id(ds, **kargs):
    hive_hook = HiveCliHook()

    # Read the SQL
    _sql = dwm_oride_driver_act_w_sql_task(ds)

    logging.info('Executing: %s', _sql)

    # Run it through Hive
    hive_hook.run_cli(_sql)

    # Write the _SUCCESS marker
    """
    First argument "true": the data directory has a country_code partition; "false": it does not.
    Second argument "true": _SUCCESS is written only when data exists; "false": it is written even without data.
    """
    TaskTouchzSuccess().countries_touchz_success('{pt}'.format(pt=airflow.macros.ds_add(ds, +6)),
                                                 db_name, table_name, hdfs_path, "true", "true")
def execution_act_driver_task(ds, **kargs):
    hive_hook = HiveCliHook()

    # Read the SQL
    _sql = app_oride_act_driver_cohort_w_sql_task(ds)

    # Run it through Hive
    hive_hook.run_cli(_sql)

    # Write the _SUCCESS marker
    """
    First argument "true": the data directory has a country_code partition; "false": it does not.
    Second argument "true": _SUCCESS is written only when data exists; "false": it is written even without data.
    """
    pt = airflow.macros.ds_add(ds, +6)
    hdfs_path = get_table_info(3)[1]
    TaskTouchzSuccess().countries_touchz_success(pt, "oride_dw", get_table_info(3)[0],
                                                 hdfs_path, "true", "true")
def execution_data_task_id(ds, ds_nodash, **kargs):
    hive_hook = HiveCliHook()

    # Read the SQL
    _sql = app_opay_active_user_report_w_sql_task(ds, ds_nodash)

    logging.info('Executing: %s', _sql)

    # Run it through Hive
    hive_hook.run_cli(_sql)

    # Write the _SUCCESS marker
    """
    First argument "true": the data directory has a country_code partition; "false": it does not.
    Second argument "true": _SUCCESS is written only when data exists; "false": it is written even without data.
    """
    TaskTouchzSuccess().countries_touchz_success(ds, db_name, table_name, hdfs_path, "true", "true")
def execution_data_task_id(ds, **kargs):
    hive_hook = HiveCliHook()

    # Read the SQL
    _sql = dwd_opay_topup_with_card_record_di_sql_task(ds)

    logging.info('Executing: %s', _sql)

    # Run it through Hive
    hive_hook.run_cli(_sql)

    # Write the _SUCCESS marker
    """
    First argument "true": the data directory has a country_code partition; "false": it does not.
    Second argument "true": _SUCCESS is written only when data exists; "false": it is written even without data.
    """
    TaskTouchzSuccess().countries_touchz_success(ds, db_name, table_name, hdfs_path, "true", "true")
def execution_new_driver_task(ds, **kargs):
    hive_hook = HiveCliHook()

    # Read the SQL
    _sql = app_oride_new_driver_cohort_m_sql_task(ds)

    # Run it through Hive
    hive_hook.run_cli(_sql)

    # Write the _SUCCESS marker
    """
    First argument "true": the data directory has a country_code partition; "false": it does not.
    Second argument "true": _SUCCESS is written only when data exists; "false": it is written even without data.
    """
    hdfs_path = get_table_info(1)[1]
    TaskTouchzSuccess().countries_touchz_success(ds, "oride_dw", get_table_info(1)[0],
                                                 hdfs_path, "true", "true")
def execution_data_task_id(ds, **kargs):
    hive_hook = HiveCliHook()

    # Read the SQL
    _sql = dim_oride_passenger_whitelist_base_sql_task(ds)

    # Run it through Hive
    hive_hook.run_cli(_sql)

    # Circuit-breaker check
    check_key_data_task(ds)

    # Write the _SUCCESS marker
    """
    First argument "true": the data directory has a country_code partition; "false": it does not.
    Second argument "true": _SUCCESS is written only when data exists; "false": it is written even without data.
    """
    TaskTouchzSuccess().countries_touchz_success(ds, db_name, table_name, hdfs_path, "true", "true")
def execution_data_task_id(ds, **kargs):
    hive_hook = HiveCliHook()

    # Read the SQL
    _sql = app_opay_owealth_report_d_19_sql_task(ds)

    logging.info('Executing: %s', _sql)

    # Run it through Hive
    hive_hook.run_cli(_sql)

    # Write the _SUCCESS marker
    """
    First argument "true": the data directory has a country_code partition; "false": it does not.
    Second argument "true": _SUCCESS is written only when data exists; "false": it is written even without data.
    """
    TaskTouchzSuccess().countries_touchz_success(airflow.macros.ds_add(ds, +1),
                                                 db_name, table_name, hdfs_path, "true", "true")
def execution_data_task_id(ds, **kargs):
    hive_hook = HiveCliHook()

    # Read the SQL
    _sql = app_oride_driver_transport_capacity_d_sql_task(ds)

    logging.info('Executing: %s', _sql)

    # Run it through Hive
    hive_hook.run_cli(_sql)

    # Circuit-breaker check
    # check_key_data_task(ds)

    # Write the _SUCCESS marker
    """
    First argument "true": the data directory has a country_code partition; "false": it does not.
    Second argument "true": _SUCCESS is written only when data exists; "false": it is written even without data.
    """
    TaskTouchzSuccess().countries_touchz_success(ds, db_name, table_name, hdfs_path, "true", "true")
def execution_data_task_id(ds, **kwargs):
    hive_hook = HiveCliHook()
    v_hour = kwargs.get('v_execution_hour')

    # Read the SQL
    _sql = dwd_driver_track_data_hi_sql_task(ds, v_hour)

    logging.info('Executing: %s', _sql)

    # Run it through Hive
    hive_hook.run_cli(_sql)

    # Write the _SUCCESS marker
    """
    First argument "true": the data directory has a country_code partition; "false": it does not.
    Second argument "true": _SUCCESS is written only when data exists; "false": it is written even without data.
    """
    TaskTouchzSuccess().countries_touchz_success(ds, db_name, table_name, hdfs_path,
                                                 "true", "false", v_hour)
def execution_data_task_id(ds, execution_date, **kargs):
    hive_hook = HiveCliHook()

    # Read the SQL
    _sql = dwd_oride_location_driver_event_hi_sql_task(ds, execution_date)

    logging.info('Executing: %s', _sql)

    # Run it through Hive
    hive_hook.run_cli(_sql)

    # Circuit-breaker check
    # check_key_data_task(ds, execution_date)

    # Write the _SUCCESS marker
    """
    First argument "true": the data directory has a country_code partition; "false": it does not.
    Second argument "true": _SUCCESS is written only when data exists; "false": it is written even without data.
    """
    TaskTouchzSuccess().countries_touchz_success(ds, db_name, table_name, hdfs_path,
                                                 "true", "false", execution_date.strftime("%H"))
def merge_pre_hi_data_task(hive_db, hive_all_hi_table_name, hive_hi_table_name,
                           is_must_have_data, pt, now_hour, pre_hour_day, pre_hour, **kwargs):
    sqoopSchema = SqoopSchemaUpdate()
    hive_columns = sqoopSchema.get_hive_column_name(hive_db, hive_all_hi_table_name)
    hql = ADD_HI_SQL.format(
        db_name=hive_db,
        hive_all_hi_table_name=hive_all_hi_table_name,
        hive_hi_table_name=hive_hi_table_name,
        pt=pt,
        now_hour=now_hour,
        pre_hour_day=pre_hour_day,
        pre_hour=pre_hour,
        columns=',\n'.join(hive_columns)
    )

    hive_hook = HiveCliHook()

    # Read the SQL
    logging.info('Executing: %s', hql)

    # Run it through Hive
    hive_hook.run_cli(hql)

    # Write the _SUCCESS marker
    """
    First argument "true": the data directory has a country_code partition; "false": it does not.
    Second argument "true": _SUCCESS is written only when data exists; "false": it is written even without data.
    """
    TaskTouchzSuccess().countries_touchz_success(pt, hive_db, hive_all_hi_table_name,
                                                 ALL_HI_OSS_PATH % hive_all_hi_table_name,
                                                 "false", is_must_have_data, now_hour)
def execution_data_task_id(ds, **kargs):
    hive_hook = HiveCliHook()

    # Read the SQL
    _sql = dwd_active_user_week_di_sql_task(ds)

    logging.info('Executing: %s', _sql)

    # Run it through Hive
    hive_hook.run_cli(_sql)

    # Circuit-breaker check
    # check_key_data_task(ds)

    # Write the _SUCCESS marker
    """
    First argument "true": the data directory has a country_code partition; "false": it does not.
    Second argument "true": _SUCCESS is written only when data exists; "false": it is written even without data.
    """
    after_6_day = airflow.macros.ds_add(ds, +6)
    TaskTouchzSuccess().countries_touchz_success(after_6_day, db_name, table_name,
                                                 hdfs_path, "true", "true")
def query_hive(**kwargs):
    ti = kwargs['ti']

    # get sha of latest commit
    v1 = ti.xcom_pull(key=None, task_ids='get_last_commit_task')
    json_value = json.loads(v1)
    sha = json_value['sha']

    hive_cli = HiveCliHook()
    hql = "select * from mapr_music_updates where commit_sha = '" + sha + "';"
    latest_commit = hive_cli.run_cli(hql)
    changed = latest_commit.find(sha) == -1

    ti.xcom_push(key='sha', value=sha)
    ti.xcom_push(key='is_changed', value=changed)

    return 'reimport_dataset_task' if changed else 'skip_reimport_dataset_task'
def execute(self, context=None):
    try:
        metastore_hook = HiveMetastoreHook(metastore_conn_id=self.metastore_conn_id)
        table_metadata = metastore_hook.get_table(self.table, db=self.schema)
        is_partitioned = len(table_metadata.partitionKeys) > 0
        column_string = ', '.join([col.name for col in table_metadata.sd.cols])
        where_clause = 'WHERE {}'.format(self.partition) if is_partitioned else ''
        self.hql = "SELECT COUNT(col2sum) FROM (SELECT COUNT(1) AS col2sum FROM {}.{} {} GROUP BY {}) t2 " \
                   "WHERE t2.col2sum > 1".format(self.schema, self.table, where_clause, column_string)

        hook = HiveCliHook(hive_cli_conn_id=self.hive_cli_conn_id,
                           mapred_queue=self.mapred_queue)
        hook.hive_cli_params = '-S'  # suppress hive junk output
        output = hook.run_cli(hql=self.hql, schema=self.schema)
        output_row = int(output.strip())

        if output_row > self.max_duplicates:
            raise AirflowException('There are {} duplicate records found whereas the max number of duplicates'
                                   ' can be {}'.format(output_row, self.max_duplicates))
    except Exception as e:
        raise AirflowException('An error occurred with the following duplicate check query:\n\t{}\n{}'
                               .format(self.hql, e))
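# To make the duplicate-check template above concrete, a quick rendering with
# hypothetical values (table my_db.my_table with columns id, name and a date
# partition filter); the real operator fills these in from the metastore.
schema, table = "my_db", "my_table"        # hypothetical schema and table
column_string = "id, name"                 # hypothetical column list
where_clause = "WHERE ds='2020-01-01'"     # hypothetical partition filter

hql = ("SELECT COUNT(col2sum) FROM (SELECT COUNT(1) AS col2sum FROM {}.{} {} GROUP BY {}) t2 "
       "WHERE t2.col2sum > 1".format(schema, table, where_clause, column_string))
print(hql)
# The query counts how many (id, name) groups occur more than once, i.e. the number of duplicated keys.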
def ddl(self):
    table = request.args.get("table")
    sql = "SHOW CREATE TABLE {table};".format(table=table)
    h = HiveCliHook(HIVE_CLI_CONN_ID)
    return h.run_cli(sql)
def create_table_info(hive_db, hive_table, dwd_db_name, dwd_table_name):
    """
    Table-creation info
    """
    hive_hook = HiveCliHook()
    hive_cursor = get_hive_cursor()
    hql = '''
        DESCRIBE FORMATTED {db}.{table}
    '''.format(db=hive_db, table=hive_table)
    # logging.info(hql)
    hive_cursor.execute(hql)
    res = hive_cursor.fetchall()

    hive_schema = []
    hive_schema_exp = []
    location = None
    for (column_name, column_type, column_comment) in res:
        col_name = column_name.lower().strip()
        column_type = str(column_type).strip()
        if col_name == 'location:':
            location = column_type
            break
        # Replace empty comments with the "未知" (unknown) placeholder
        if column_comment == "" or column_comment == "from deserializer":
            column_comment = "未知"
        if col_name == '# col_name' or col_name == '':
            continue
        if col_name == '# partition information':
            if column_comment is None:
                column_comment = "未知"
            break
        _schema = col_name + " " + column_type + " " + "COMMENT" + " '" + \
            column_comment.replace("\\n", "") + "',"
        hive_schema_exp.append(_schema)

    nm = len(hive_schema_exp)
    a = 0
    b = ""
    for i in hive_schema_exp:
        a = a + 1
        # Strip the trailing ',' from the last column
        if a == nm:
            i = i.replace(",", "")
        b = b + "\n" + i

    create_str = """
        use {dwd_hive_db};
        CREATE EXTERNAL TABLE {dwd_hive_db}.{dwd_hive_table}(
            {schema_sql}
        )
        comment ""
        partitioned by (country_code string comment '二位国家码', dt string comment '分区时间')
        STORED AS orc
        LOCATION "ufile://opay-datalake/oride/{dwd_hive_db}/{dwd_hive_table}";
    """.format(dwd_hive_db=dwd_db_name, dwd_hive_table=dwd_table_name, schema_sql=b)

    hive_hook.run_cli(create_str)
def run_check_table(schema_table_db_name, schema_table_name, target_table_db_name,
                    target_table_name, conn_id, hive_table_name, server_name, **kwargs):
    # SHOW TABLES in oride_db LIKE 'data_aa'
    check_sql = 'SHOW TABLES in %s LIKE \'%s\'' % (HIVE_DB, hive_table_name)
    hive2_conn = HiveServer2Hook().get_conn()
    cursor = hive2_conn.cursor()
    cursor.execute(check_sql)
    if len(cursor.fetchall()) == 0:
        logging.info('Create Hive Table: %s.%s', HIVE_DB, hive_table_name)
        # get table column
        column_sql = '''
            SELECT
                COLUMN_NAME,
                DATA_TYPE,
                NUMERIC_PRECISION,
                NUMERIC_SCALE,
                COLUMN_COMMENT
            FROM information_schema.columns
            WHERE table_schema='{db_name}' and table_name='{table_name}'
        '''.format(db_name=schema_table_db_name, table_name=schema_table_name)
        mysql_hook = MySqlHook(conn_id)
        mysql_conn = mysql_hook.get_conn()
        mysql_cursor = mysql_conn.cursor()
        mysql_cursor.execute(column_sql)
        results = mysql_cursor.fetchall()
        rows = []
        for result in results:
            if result[0] == 'dt':
                col_name = '_dt'
            else:
                col_name = result[0]
            if result[1] == 'timestamp' or result[1] == 'varchar' or result[1] == 'char' or result[1] == 'text' or \
                    result[1] == 'longtext' or \
                    result[1] == 'mediumtext' or \
                    result[1] == 'json' or \
                    result[1] == 'datetime':
                data_type = 'string'
            elif result[1] == 'decimal':
                data_type = result[1] + "(" + str(result[2]) + "," + str(result[3]) + ")"
            else:
                data_type = result[1]
            rows.append("`%s` %s comment '%s'" %
                        (col_name, data_type,
                         str(result[4]).replace('\n', '').replace('\r', '')))
        mysql_conn.close()

        # hive create table
        hive_hook = HiveCliHook()
        sql = ODS_CREATE_TABLE_SQL.format(
            db_name=HIVE_DB,
            table_name=hive_table_name,
            columns=",\n".join(rows),
            oss_path=OSS_PATH % ("{server_name}.{db_name}.{table_name}".format(
                server_name=server_name,
                db_name=target_table_db_name,
                table_name=target_table_name))
        )
        logging.info('Executing: %s', sql)
        hive_hook.run_cli(sql)
    else:
        sqoopSchema = SqoopSchemaUpdate()
        response = sqoopSchema.append_hive_schema(
            hive_db=HIVE_DB,
            hive_table=hive_table_name,
            mysql_db=schema_table_db_name,
            mysql_table=schema_table_name,
            mysql_conn=conn_id,
            oss_path=OSS_PATH % ("{server_name}.{db_name}.{table_name}".format(
                server_name=server_name,
                db_name=target_table_db_name,
                table_name=target_table_name))
        )
        if response:
            return True
    return
def execute(self, context):
    ti = context['ti']
    host, dagid, taskid, exectime = ti.hostname.split('.')[0], ti.dag_id, \
        ti.task_id, ti.execution_date.isoformat()

    hook = HiveCliHook(
        hive_cli_conn_id=self.hive_cli_conn_id,
        mapred_queue=self.mapred_queue,
        mapred_job_name='Airflow HiveEmailOperator task for {}.{}.{}.{}'.format(
            host, dagid, taskid, exectime))
    hook.hive_cli_params = '-S'  # suppress hive junk output

    output = hook.run_cli(hql=self.hql, schema=self.schema,
                          hive_conf={'hive.cli.print.header': 'true'})

    output_rows = [line for line in output.split('\n') if line]
    col_names = output_rows[0].split('\t')
    output_rows = output_rows[1:]

    if len(output_rows) > self.cutoff:
        msg = 'The query returned > {} rows.. Adding tsv as an attachment.'.format(self.cutoff)
        logging.warn(msg)
        f = tempfile.NamedTemporaryFile(delete=False)
        f.write(output)
        f.close()
        self.files = [f.name]
        self.html_content = '{}<br>Dag id: {}<br>Task id: {}<br>Execution Time: {}'.format(
            msg, dagid, taskid, exectime)
    else:
        context.update({
            'hql': self.hql,
            'rows': output_rows,
            'col_names': col_names
        })
        if not self.html_content:
            check_path = os.path.join(os.path.dirname(__file__), '..',
                                      'templates', 'hive_email_default.html')
        else:
            dag_path = conf.get('core', 'dags_folder')
            check_path = os.path.join(dag_path,
                                      os.path.dirname(context['dag'].filepath),
                                      self.html_content)

        if os.path.exists(check_path):
            path, filename = os.path.split(os.path.abspath(check_path))
            template = Environment(loader=FileSystemLoader(path)).get_template(filename)
            logging.info("Using templated file located at: {path}".format(path=check_path))
        else:
            template = Environment(loader=BaseLoader()).from_string(self.html_content)

        self.html_content = template.render(**context)

    super(HiveEmailOperator, self).execute(context)

    # delete the temp file after successfully attached to email
    if len(output_rows) > self.cutoff:
        os.unlink(f.name)
def test_run_cli(self):
    hook = HiveCliHook()
    hook.run_cli("SHOW DATABASES")
def execution_data_task_id(ds, dag, **kwargs):
    v_date = kwargs.get('v_execution_date')
    v_day = kwargs.get('v_execution_day')
    v_hour = kwargs.get('v_execution_hour')

    hive_hook = HiveCliHook()

    # Read the SQL
    # _sql = app_otrade_b2b_order_target_retailer_di_sql_task(ds)

    # logging.info('Executing: %s', _sql)

    # Run it through Hive
    # hive_hook.run_cli(_sql)

    # Circuit-breaker check
    # check_key_data_task(ds)

    # Write the _SUCCESS marker
    """
    First argument "true": the data directory has a country_code partition; "false": it does not.
    Second argument "true": _SUCCESS is written only when data exists; "false": it is written even without data.
    """
    # TaskTouchzSuccess().countries_touchz_success(ds, db_name, table_name, hdfs_path, "true", "false")

    """
    # Helper functions
        ALTER statement:  alter_partition()
        Drop partition:   delete_partition()
        Produce _SUCCESS: touchz_success()

    # Arguments
        is_countries_online    -- whether multi-country business is enabled, default "true"
        db_name                -- Hive database name
        table_name             -- Hive table name
        data_oss_path          -- OSS path of the data directory
        is_country_partition   -- whether there is a country_code partition, default "true"
        is_result_force_exist  -- whether data must exist before _SUCCESS is produced, default "true";
                                  "false" produces _SUCCESS even without data
        execute_time           -- current script execution time (%Y-%m-%d %H:%M:%S)
        is_hour_task           -- whether the hourly task is enabled, default "false"
        frame_type             -- template type (only effective when is_hour_task is "true"):
                                  "utc" produces partitions in UTC time, "local" in local time, default "utc"
        is_offset              -- whether to enable time offsetting (affects the _SUCCESS file)
        execute_time_offset    -- execution-time offset (-1, 0, 1); shifts the current execution time
                                  to produce the previous/next hourly partition
        business_key           -- product-line name

    # Reading the SQL
        %_sql(ds, v_hour)
    """
    args = [
        {
            "dag": dag,
            "is_countries_online": "true",
            "db_name": db_name,
            "table_name": table_name,
            "data_oss_path": hdfs_path,
            "is_country_partition": "true",
            "is_result_force_exist": "false",
            "execute_time": v_date,
            "is_hour_task": "false",
            "frame_type": "local",
            "is_offset": "true",
            "execute_time_offset": -1,
            "business_key": "otrade"
        }
    ]

    cf = CountriesAppFrame(args)

    # Drop the partition
    cf.delete_partition()

    # Read the SQL
    _sql = "\n" + cf.alter_partition() + "\n" + app_otrade_b2b_order_target_retailer_di_sql_task(ds)

    logging.info('Executing: %s', _sql)

    # Run it through Hive
    hive_hook.run_cli(_sql)

    # Produce _SUCCESS
    cf.touchz_success()
def append_hive_schema(self, **info):
    try:
        hive_db = info.get('hive_db', None)
        hive_table = info.get('hive_table', None)
        mysql_db = info.get('mysql_db', None)
        mysql_table = info.get('mysql_table', None)
        mysql_conn = info.get('mysql_conn', None)
        oss_path = info.get('oss_path', None)
        if hive_db is None or hive_table is None or mysql_db is None or mysql_table is None or mysql_conn is None:
            return None

        hive_info = self.__get_hive_table_and_type_schema(hive_db, hive_table)
        hive_columns = list(hive_info[0])
        hive_schema = list(hive_info[1])

        mysql_info = self.__get_mysql_table_schema_and_column_name(mysql_db, mysql_table, mysql_conn)
        mysql_schema = mysql_info[0]
        mysql_column = mysql_info[1]

        add_columns = set(mysql_column).difference(set(hive_columns))
        add_columns = list(add_columns)
        if len(add_columns) == 0:
            return True

        columns = []
        for add_column in add_columns:
            for column in mysql_schema:
                if column.get('column', None) == add_column:
                    column_info = column.get('column_info', None)
                    if column_info:
                        columns.append(column_info)

        hive_schema.extend(columns)

        hql = '''
            DROP TABLE IF EXISTS {db}.`{table}`;
            CREATE EXTERNAL TABLE IF NOT EXISTS {db}.`{table}`(
                {columns}
            )
            PARTITIONED BY (
                `dt` string,
                `hour` string
            )
            ROW FORMAT SERDE
                'org.openx.data.jsonserde.JsonSerDe'
            WITH SERDEPROPERTIES (
                'ignore.malformed.json'='true')
            STORED AS INPUTFORMAT
                'org.apache.hadoop.mapred.TextInputFormat'
            OUTPUTFORMAT
                'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'
            LOCATION
                '{oss_path}';
            MSCK REPAIR TABLE {db}.`{table}`;
        '''.format(
            db=hive_db,
            table=hive_table,
            columns=",\n".join(hive_schema),
            oss_path=oss_path
        )
        logging.info(hql)

        hive_hook = HiveCliHook()
        hive_hook.run_cli(hql)
        return True
    except BaseException as e:
        logging.info("Exception Info::")
        logging.info(e)
        raise AirflowException('sqoop append hive schema error')
def move_data_mssql(templates_dict, **kwargs):
    tmp_table = 'clean_mydata_tmp'
    table = 'clean_mydata'
    delimiter = "','"
    quote_char = "'\"'"
    escape_char = "'\\\\'"
    number_of_mappers = '4'
    mssql_table = 'mydata'

    # ------- Convert Hive table from PARQUET to TEXT --------------------------------------------
    hook = HiveCliHook()
    query = """
        DROP TABLE {tmp_table};
        CREATE TABLE {tmp_table}
        ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.OpenCSVSerde'
        WITH SERDEPROPERTIES (
            "separatorChar" = {delimiter},
            "quoteChar"     = {quote_char},
            "escapeChar"    = {escape_char}
        )
        STORED AS TEXTFILE AS
        SELECT column1, column2, column3, column4, column5,
               column6, column7, column8, column9, column10, source_id
        FROM {table}
    """.format(tmp_table=tmp_table, table=table, delimiter=delimiter,
               quote_char=quote_char, escape_char=escape_char)
    hook.run_cli(schema=templates_dict['schema'], hql=query)

    # -------------- Run sqoop --------------------------------------------------------------------
    # get default mssql connection
    sql_conn = BaseHook.get_connection('mssql_default')
    conn_str = "'jdbc:sqlserver://{host};databaseName={database}'".format(
        host=sql_conn.host,
        database=sql_conn.extra_dejson.get('database')
    )

    # get default hive cli connection
    hive_conn = BaseHook.get_connection('hive_cli_default')
    hdfs_export_dir = 'hdfs://{host}:{port}/user/hive/warehouse/my_hive_db.db/{table}'.format(
        host=hive_conn.host,
        port=8020,
        table=tmp_table
    )

    cmd = ['/usr/bin/sqoop', 'export',
           '--connect', conn_str,
           '--username', sql_conn.login,
           '--password', sql_conn.password,
           '--table', mssql_table,
           '--export-dir', hdfs_export_dir,
           '-m', number_of_mappers,
           '--input-fields-terminated-by', delimiter,
           '--input-enclosed-by', quote_char,
           '--input-escaped-by', escape_char,
           '--input-null-string', "'\\\N'"]
    cmd = ' '.join(cmd)
    print(cmd)

    logging.info("Executing sqoop")
    sp = Popen(cmd, shell=True)
    sp.wait()
    logging.info("Command exited with return code {0}".format(sp.returncode))