def execute(self, context: Dict[str, Any]) -> None:
    hive = HiveCliHook(hive_cli_conn_id=self.hive_cli_conn_id)
    self.log.info("Extracting data from Hive")
    hive_table = 'druid.' + context['task_instance_key_str'].replace('.', '_')
    sql = self.sql.strip().strip(';')
    tblproperties = ''.join(
        ", '{}' = '{}'".format(k, v) for k, v in self.hive_tblproperties.items()
    )
    # Materialize the query result as an uncompressed, tab-delimited text
    # table so the Druid indexing task can read it straight off HDFS.
    hql = f"""\
    SET mapred.output.compress=false;
    SET hive.exec.compress.output=false;
    DROP TABLE IF EXISTS {hive_table};
    CREATE TABLE {hive_table}
    ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
    STORED AS TEXTFILE
    TBLPROPERTIES ('serialization.null.format' = ''{tblproperties})
    AS
    {sql}
    """
    self.log.info("Running command:\n %s", hql)
    hive.run_cli(hql)

    meta_hook = HiveMetastoreHook(self.metastore_conn_id)

    # Get the Hive table and extract the columns
    table = meta_hook.get_table(hive_table)
    columns = [col.name for col in table.sd.cols]

    # Get the path on HDFS (reuse the table object already fetched above;
    # no need for a second metastore round trip)
    static_path = table.sd.location

    druid = DruidHook(druid_ingest_conn_id=self.druid_ingest_conn_id)

    try:
        index_spec = self.construct_ingest_query(
            static_path=static_path,
            columns=columns,
        )

        self.log.info("Inserting rows into Druid, hdfs path: %s", static_path)
        druid.submit_indexing_job(index_spec)
        self.log.info("Load seems to have succeeded!")
    finally:
        # Always drop the staging table, even if the Druid load failed.
        self.log.info("Cleaning up by dropping the temp Hive table %s", hive_table)
        hql = f"DROP TABLE IF EXISTS {hive_table}"
        hive.run_cli(hql)
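# Usage sketch, assuming this execute() belongs to Airflow's HiveToDruidOperator
# from apache-airflow-providers-apache-druid on a recent Airflow 2.x. The DAG id,
# SQL, datasource, timestamp column, and connection IDs below are illustrative
# placeholders, not values taken from this file.
from datetime import datetime

from airflow import DAG
from airflow.providers.apache.druid.transfers.hive_to_druid import HiveToDruidOperator

with DAG(dag_id="hive_to_druid_example", start_date=datetime(2024, 1, 1), schedule=None) as dag:
    load_segments = HiveToDruidOperator(
        task_id="hive_to_druid",
        sql="SELECT * FROM staging.events WHERE ds = '{{ ds }}'",  # hypothetical source query
        druid_datasource="events",  # hypothetical target datasource
        ts_dim="event_time",  # timestamp column used by the ingest spec
        hive_cli_conn_id="hive_cli_default",
        druid_ingest_conn_id="druid_ingest_default",
        metastore_conn_id="metastore_default",
    )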
def table(self):
    """Create table view"""
    table_name = request.args.get("table")
    metastore = HiveMetastoreHook(METASTORE_CONN_ID)
    table = metastore.get_table(table_name)
    # datetime and int are passed through so the Jinja template can format
    # timestamps and cast values when rendering the table metadata.
    return self.render_template(
        "metastore_browser/table.html",
        table=table,
        table_name=table_name,
        datetime=datetime,
        int=int,
    )
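# Sketch of how a view method like table() is usually wired up, assuming it
# lives on a Flask-AppBuilder view inside an Airflow plugin (e.g. the old
# metastore_browser plugin). The class name and route below are assumptions.
from flask_appbuilder import BaseView, expose

class MetastoreBrowserView(BaseView):
    default_view = "index"

    @expose("/table/")
    def table(self):
        # Body as above; reached via a URL like
        # /metastorebrowserview/table/?table=default.my_table
        ...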
def execute(self, context: Optional[Dict[str, Any]] = None) -> None:
    metastore = HiveMetastoreHook(metastore_conn_id=self.metastore_conn_id)
    table = metastore.get_table(table_name=self.table)
    field_types = {col.name: col.type for col in table.sd.cols}

    # Build one aggregate expression per (column, metric) pair; a COUNT(*)
    # over the partition is always included.
    exprs: Any = {('', 'count'): 'COUNT(*)'}
    for col, col_type in list(field_types.items()):
        if self.assignment_func:
            assign_exprs = self.assignment_func(col, col_type)
            if assign_exprs is None:
                assign_exprs = self.get_default_exprs(col, col_type)
        else:
            assign_exprs = self.get_default_exprs(col, col_type)
        exprs.update(assign_exprs)
    exprs.update(self.extra_exprs)
    exprs = OrderedDict(exprs)
    exprs_str = ",\n        ".join(
        v + " AS " + k[0] + '__' + k[1] for k, v in exprs.items()
    )

    where_clause_ = ["{} = '{}'".format(k, v) for k, v in self.partition.items()]
    where_clause = " AND\n        ".join(where_clause_)
    sql = "SELECT {exprs_str} FROM {table} WHERE {where_clause};".format(
        exprs_str=exprs_str, table=self.table, where_clause=where_clause
    )

    presto = PrestoHook(presto_conn_id=self.presto_conn_id)
    self.log.info('Executing SQL check: %s', sql)
    row = presto.get_first(hql=sql)
    self.log.info("Record: %s", row)
    if not row:
        raise AirflowException("The query returned None")

    part_json = json.dumps(self.partition, sort_keys=True)

    self.log.info("Deleting rows from previous runs if they exist")
    mysql = MySqlHook(self.mysql_conn_id)
    sql = """
    SELECT 1 FROM hive_stats
    WHERE
        table_name='{table}' AND
        partition_repr='{part_json}' AND
        dttm='{dttm}'
    LIMIT 1;
    """.format(table=self.table, part_json=part_json, dttm=self.dttm)
    if mysql.get_records(sql):
        sql = """
        DELETE FROM hive_stats
        WHERE
            table_name='{table}' AND
            partition_repr='{part_json}' AND
            dttm='{dttm}';
        """.format(table=self.table, part_json=part_json, dttm=self.dttm)
        mysql.run(sql)

    self.log.info("Pivoting and loading cells into the Airflow db")
    # Iterating the OrderedDict yields (col, metric) keys in the same order
    # as the SELECT list, so zip pairs each key with its value in the row.
    rows = [
        (self.ds, self.dttm, self.table, part_json) + (r[0][0], r[0][1], r[1])
        for r in zip(exprs, row)
    ]
    mysql.insert_rows(
        table='hive_stats',
        rows=rows,
        target_fields=[
            'ds',
            'dttm',
            'table_name',
            'partition_repr',
            'col',
            'metric',
            'value',
        ],
    )
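# Usage sketch, assuming this execute() belongs to Airflow's
# HiveStatsCollectionOperator from apache-airflow-providers-apache-hive on a
# recent Airflow 2.x. The table, partition spec, and connection IDs below are
# illustrative placeholders.
from datetime import datetime

from airflow import DAG
from airflow.providers.apache.hive.operators.hive_stats import HiveStatsCollectionOperator

with DAG(dag_id="hive_stats_example", start_date=datetime(2024, 1, 1), schedule=None) as dag:
    collect_stats = HiveStatsCollectionOperator(
        task_id="collect_hive_stats",
        table="default.events",  # hypothetical Hive table to profile
        partition={"ds": "{{ ds }}"},  # partition to profile, templated per run
        metastore_conn_id="metastore_default",
        presto_conn_id="presto_default",
        mysql_conn_id="airflow_db",
    )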