def execute(self, context: Dict[str, Any]) -> None:
    """Stage the query result in a temporary Hive table and index it into Druid.

    The operator's ``sql`` is materialised as a tab-delimited text table named
    ``druid.<task_instance_key_str>`` (dots replaced by underscores), the
    table's columns and storage location are read from the metastore, and an
    indexing job built by ``construct_ingest_query`` is submitted to Druid.
    The staging table is dropped afterwards whether or not indexing succeeded.

    :param context: task context; only ``task_instance_key_str`` is read here.
    """
    hive = HiveCliHook(hive_cli_conn_id=self.hive_cli_conn_id)
    self.log.info("Extracting data from Hive")
    hive_table = 'druid.' + context['task_instance_key_str'].replace('.', '_')
    # A trailing ';' would terminate the CREATE TABLE ... AS statement early.
    sql = self.sql.strip().strip(';')
    tblproperties = ''.join(
        ", '{}' = '{}'".format(k, v)
        for k, v in self.hive_tblproperties.items()
    )
    hql = f"""\
    SET mapred.output.compress=false;
    SET hive.exec.compress.output=false;
    DROP TABLE IF EXISTS {hive_table};
    CREATE TABLE {hive_table}
    ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
    STORED AS TEXTFILE
    TBLPROPERTIES ('serialization.null.format' = ''{tblproperties})
    AS
    {sql}
    """
    self.log.info("Running command:\n %s", hql)
    hive.run_cli(hql)

    # From here on the staging table exists, so everything that can fail is
    # kept inside the try block to guarantee the table gets dropped.
    try:
        meta_hook = HiveMetastoreHook(self.metastore_conn_id)

        # One metastore lookup provides both the column names and the
        # storage location of the staging table.
        table = meta_hook.get_table(hive_table)
        columns = [col.name for col in table.sd.cols]
        static_path = table.sd.location

        druid = DruidHook(druid_ingest_conn_id=self.druid_ingest_conn_id)
        index_spec = self.construct_ingest_query(
            static_path=static_path,
            columns=columns,
        )

        self.log.info("Inserting rows into Druid, hdfs path: %s", static_path)
        druid.submit_indexing_job(index_spec)
        self.log.info("Load seems to have succeeded!")
    finally:
        self.log.info("Cleaning up by dropping the temp Hive table %s", hive_table)
        hql = "DROP TABLE IF EXISTS {}".format(hive_table)
        hive.run_cli(hql)
def test_run_cli_with_hive_conf(self):
    """Check that ``run_cli`` output echoes the hive_conf value and context values.

    The Airflow context environment variables are patched into ``os.environ``
    and the HQL asks Hive to print each corresponding setting; every patched
    value, plus the explicit ``hive_conf`` value, must appear in the output.
    """
    hql = (
        "set key;\n"
        "set airflow.ctx.dag_id;\nset airflow.ctx.dag_run_id;\n"
        "set airflow.ctx.task_id;\nset airflow.ctx.execution_date;\n"
    )

    def env_var_name(context_key):
        # Environment-variable spelling registered for a context key.
        return AIRFLOW_VAR_NAME_FORMAT_MAPPING[context_key]['env_var_format']

    patched_environ = {
        env_var_name('AIRFLOW_CONTEXT_DAG_ID'): 'test_dag_id',
        env_var_name('AIRFLOW_CONTEXT_TASK_ID'): 'test_task_id',
        env_var_name('AIRFLOW_CONTEXT_EXECUTION_DATE'): 'test_execution_date',
        env_var_name('AIRFLOW_CONTEXT_DAG_RUN_ID'): 'test_dag_run_id',
    }
    expected_fragments = (
        'value',
        'test_dag_id',
        'test_task_id',
        'test_execution_date',
        'test_dag_run_id',
    )

    with mock.patch.dict('os.environ', patched_environ):
        cli_hook = HiveCliHook()
        output = cli_hook.run_cli(hql=hql, hive_conf={'key': 'value'})
        for fragment in expected_fragments:
            self.assertIn(fragment, output)
def ddl(self):
    """
    Retrieve table ddl

    Reads the ``table`` request argument, checks that it is a plain
    ``table`` or ``db.table`` identifier, and returns the output of
    ``SHOW CREATE TABLE`` run through the Hive CLI.

    :raises ValueError: if ``table`` is missing or is not a plain identifier.
    """
    import re

    table = request.args.get("table")
    # The value comes straight from the HTTP request and is interpolated into
    # an HQL statement, so only bare identifiers are accepted; anything else
    # (quotes, whitespace, ';') could smuggle extra statements into the CLI.
    # fullmatch is used so a trailing newline cannot slip past the pattern.
    if not table or not re.fullmatch(
        r"[A-Za-z0-9_]+(?:\.[A-Za-z0-9_]+)?", table
    ):
        raise ValueError("Invalid table name: {!r}".format(table))

    sql = "SHOW CREATE TABLE {table};".format(table=table)
    hook = HiveCliHook(HIVE_CLI_CONN_ID)
    return hook.run_cli(sql)
def test_run_cli(self):
    """Smoke test: a default ``HiveCliHook`` can run a trivial statement."""
    HiveCliHook().run_cli("SHOW DATABASES")