Example #1
    def execute(self, context: Dict[str, Any]) -> None:
        hive = HiveCliHook(hive_cli_conn_id=self.hive_cli_conn_id)
        self.log.info("Extracting data from Hive")
        # Derive a unique temp table name from the task instance key; the
        # dots in that key would break the table identifier, so replace them.
        hive_table = 'druid.' + context['task_instance_key_str'].replace(
            '.', '_')
        # Strip a trailing semicolon so the query can be embedded in the
        # CTAS statement below.
        sql = self.sql.strip().strip(';')
        # Render extra table properties as ", 'key' = 'value'" fragments that
        # splice in after the default serialization.null.format property.
        tblproperties = ''.join([
            ", '{}' = '{}'".format(k, v)
            for k, v in self.hive_tblproperties.items()
        ])
        hql = f"""\
        SET mapred.output.compress=false;
        SET hive.exec.compress.output=false;
        DROP TABLE IF EXISTS {hive_table};
        CREATE TABLE {hive_table}
        ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
        STORED AS TEXTFILE
        TBLPROPERTIES ('serialization.null.format' = ''{tblproperties})
        AS
        {sql}
        """
        self.log.info("Running command:\n %s", hql)
        hive.run_cli(hql)

        meta_hook = HiveMetastoreHook(self.metastore_conn_id)

        # Get the Hive table and extract the columns
        table = meta_hook.get_table(hive_table)
        columns = [col.name for col in table.sd.cols]

        # Get the table's location on HDFS (reuse the table object fetched
        # above instead of a second metastore round trip)
        static_path = table.sd.location

        druid = DruidHook(druid_ingest_conn_id=self.druid_ingest_conn_id)

        try:
            index_spec = self.construct_ingest_query(
                static_path=static_path,
                columns=columns,
            )

            self.log.info("Inserting rows into Druid, hdfs path: %s",
                          static_path)

            druid.submit_indexing_job(index_spec)

            self.log.info("Load seems to have succeeded!")
        finally:
            self.log.info("Cleaning up by dropping the temp Hive table %s",
                          hive_table)
            hql = "DROP TABLE IF EXISTS {}".format(hive_table)
            hive.run_cli(hql)
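
The CTAS statement above splices user-supplied table properties in after a
default 'serialization.null.format' entry. A minimal standalone sketch of that
assembly; hive_tblproperties here is an assumed example value standing in for
the operator attribute:

# Sketch of the TBLPROPERTIES assembly from execute() above.
# hive_tblproperties is an assumed example value, not the real attribute.
hive_tblproperties = {'druid.datasource': 'my_datasource'}

tblproperties = ''.join([
    ", '{}' = '{}'".format(k, v)
    for k, v in hive_tblproperties.items()
])

# Prints: ('serialization.null.format' = '', 'druid.datasource' = 'my_datasource')
print("('serialization.null.format' = ''{})".format(tblproperties))

Each fragment carries its own leading ", " so the result reads correctly when
appended directly after the default property.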
Example #2
    def test_run_cli_with_hive_conf(self):
        hql = "set key;\n" \
              "set airflow.ctx.dag_id;\nset airflow.ctx.dag_run_id;\n" \
              "set airflow.ctx.task_id;\nset airflow.ctx.execution_date;\n"

        dag_id_ctx_var_name = AIRFLOW_VAR_NAME_FORMAT_MAPPING[
            'AIRFLOW_CONTEXT_DAG_ID']['env_var_format']
        task_id_ctx_var_name = AIRFLOW_VAR_NAME_FORMAT_MAPPING[
            'AIRFLOW_CONTEXT_TASK_ID']['env_var_format']
        execution_date_ctx_var_name = AIRFLOW_VAR_NAME_FORMAT_MAPPING[
            'AIRFLOW_CONTEXT_EXECUTION_DATE']['env_var_format']
        dag_run_id_ctx_var_name = AIRFLOW_VAR_NAME_FORMAT_MAPPING[
            'AIRFLOW_CONTEXT_DAG_RUN_ID']['env_var_format']
        with mock.patch.dict(
                'os.environ', {
                    dag_id_ctx_var_name: 'test_dag_id',
                    task_id_ctx_var_name: 'test_task_id',
                    execution_date_ctx_var_name: 'test_execution_date',
                    dag_run_id_ctx_var_name: 'test_dag_run_id',
                }):
            hook = HiveCliHook()
            output = hook.run_cli(hql=hql, hive_conf={'key': 'value'})
            self.assertIn('value', output)
            self.assertIn('test_dag_id', output)
            self.assertIn('test_task_id', output)
            self.assertIn('test_execution_date', output)
            self.assertIn('test_dag_run_id', output)
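
The assertions pass because run_cli forwards both the explicit hive_conf
entries and the Airflow context variables it reads from the environment, which
is why the test patches os.environ with the env_var_format names. A hedged
sketch of resolving one of those names; the import path is an assumption and
has moved between Airflow releases:

# Hedged sketch: resolving the env-var name the test patches for dag_id.
# The import path is an assumption; it differs across Airflow versions.
from airflow.utils.operator_helpers import AIRFLOW_VAR_NAME_FORMAT_MAPPING

# Expected to print something like 'AIRFLOW_CTX_DAG_ID'.
print(AIRFLOW_VAR_NAME_FORMAT_MAPPING['AIRFLOW_CONTEXT_DAG_ID']['env_var_format'])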
Example #3
    def ddl(self):
        """
        Retrieve the DDL (SHOW CREATE TABLE output) for a table
        """
        # The table name comes straight from the request's query string.
        table = request.args.get("table")
        sql = "SHOW CREATE TABLE {table};".format(table=table)
        hook = HiveCliHook(HIVE_CLI_CONN_ID)
        return hook.run_cli(sql)
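
The same hook can be driven outside the web view. A minimal sketch, assuming a
configured Hive CLI connection named 'hive_cli_default' and the Airflow 1.x
import path:

# Minimal sketch; the connection name and import path are assumptions.
from airflow.hooks.hive_hooks import HiveCliHook

hook = HiveCliHook(hive_cli_conn_id='hive_cli_default')
# run_cli returns the Hive CLI's stdout as a string.
print(hook.run_cli("SHOW CREATE TABLE my_db.my_table;"))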
Example #4
    def test_run_cli(self):
        hook = HiveCliHook()
        hook.run_cli("SHOW DATABASES")