def execute_hql(self, sql):
    """Run an HQL statement through the hive CLI on the configured queue.

    The statement is prefixed with ``set mapred.job.queue.name=<queue>`` so it
    lands on self.fair_scheduling_queue.

    Returns:
        The (result, stdout, stderr) tuple from os_util.execute_command on
        success, or the legacy sentinel ``0`` when the command exits non-zero
        (kept as-is for existing callers).
    """
    cmd = [hive_util.get_hive(), "-e",
           "set mapred.job.queue.name=%s; %s" % (self.fair_scheduling_queue, sql)]
    (result, stdout, stderr) = os_util.execute_command(cmd, do_sideput=True)
    if result != 0:
        # %r replaces the Py2-only backtick repr; lazy %-args avoid building
        # the message when INFO is filtered out.
        logging.info("failed run cmd=%r stdout=%s stderr=%s", cmd, stdout, stderr)
        return 0
    return (result, stdout, stderr)
def get_tab_schema_detail(self, table_name):
    """Run ``describe formatted <table_name>`` and parse the output.

    Returns:
        A dict with keys 'cols', 'part_cols', 'details' and 'storage', each
        mapping lower-cased parameter names to their values.  Values matching
        hive's date format (e.g. "Wed Mar 11 20:07:23 UTC 2015") are converted
        to datetime objects.  On command failure the empty skeleton dict is
        returned.
    """
    kv_pattern = re.compile(r'(\w+): (.*)$')
    time_pattern = re.compile(r'\w+ (\w+) (\d+) (\d+):(\d+):(\d+) \w+ (\d+)')
    # The HQL is one argv element; the other hive helpers in this class do NOT
    # wrap it in literal double quotes — doing so here passed the quote
    # characters through to hive itself, so they are removed for consistency.
    cmd = [hive_util.get_hive(), "-e",
           "use %s;describe formatted %s" % (self.schema, table_name)]
    logging.info("Running %s", " ".join(cmd))
    schema_hash = {'cols': {}, 'part_cols': {}, 'details': {}, 'storage': {}}
    (result, stdout, stderr) = os_util.execute_command(cmd, do_sideput=True)
    if result != 0:
        return schema_hash
    # Section headers ("# ...") in the describe output select which
    # sub-dict subsequent key/value lines are stored into.
    part_info = "# Partition Information"
    detailed_info = "# Detailed Table Information"
    storage_info = "# Storage Information"
    curr_key = 'cols'
    for line in stdout.split('\n'):
        line = line.strip()
        if not line:
            continue
        if line[0] == "#":
            if part_info in line:
                curr_key = 'part_cols'
            if detailed_info in line:
                curr_key = 'details'
            if storage_info in line:
                curr_key = 'storage'
            continue
        kv = kv_pattern.match(line)
        if kv:
            parm = kv.group(1).lower().strip()
            val = kv.group(2).strip()
            tval = time_pattern.match(val)
            if tval:
                # Reassemble into a strptime-friendly "Mon DD YYYY H:M:S";
                # in hive's format the year is the trailing group (6).
                datestr = "%s %s %s %s:%s:%s" % (
                    tval.group(1), tval.group(2), tval.group(6),
                    tval.group(3), tval.group(4), tval.group(5))
                val = datetime.datetime.strptime(datestr, "%b %d %Y %H:%M:%S")
            schema_hash[curr_key][parm] = val
    logging.info("table %s desc: %s", table_name, stdout)
    return schema_hash
def get_latest_update_time(self, history_table, pkid, hdfs_temp_dir=None):
    """Return ``(maxUpdateAt, minPKID, maxPKID)`` for a history table.

    Args:
        history_table: hive table to inspect.
        pkid: primary-key column name, cast to int for the max() aggregate.
        hdfs_temp_dir: specified for merge and load operations, in which case
            the aggregates come from self.get_aggregate(); None is used for
            sqoop, which queries hive directly.
    """
    maxUpdateAt = None
    maxPKID = 0
    minPKID = 0
    if hdfs_temp_dir:
        maxUpdateAt = self.get_aggregate(hdfs_temp_dir, "max_date")
        maxPKID = self.get_aggregate(hdfs_temp_dir, "max_pkid", asString=False)
        minPKID = self.get_aggregate(hdfs_temp_dir, "min_pkid",
                                     asString=False, getMax=False)
        logging.info("get_aggregate: (%s, %s, %s)", maxUpdateAt, minPKID, maxPKID)
    else:
        cols = self.get_table_schema(history_table)
        col_names = [nm for (nm, ty) in cols]
        max_id_func = "max(cast(%s as int))" % pkid
        max_time_col = "'0000-00-00 00:00:00'"
        # Pick the first known timestamp column present on the table.
        # (membership test on col_names replaces the Py2-only zip(*cols)[0],
        # which also duplicated the col_names computation above)
        for col in timestamp_cols:
            if col in col_names:
                max_time_col = "max(%s)" % col
                break
        cmd = [hive_util.get_hive(), "-e",
               "use %s; select CONCAT(%s, ', ', %s ) from %s limit 1" % (
                   self.schema, max_id_func, max_time_col, history_table)]
        logging.info("Running hive cmd %s", " ".join(cmd))
        (result, stdout, stderr) = os_util.execute_command(cmd, do_sideput=True)
        if result is not None and stdout.strip() != 'NULL':
            if result != 0:
                logging.info("failed run cmd=%r stdout=%s stderr=%s",
                             cmd, stdout, stderr)
                maxUpdateAt = ""
            else:
                for line in stdout.split('\n'):
                    if "WARN" in line:
                        continue
                    maxList = line.strip().split(",")
                    # ''.split(',') yields [''] (truthy), so the old
                    # "if not maxList" default was dead code and a blank
                    # trailing line crashed on maxList[1]; skip short lines.
                    if len(maxList) < 2:
                        continue
                    maxPKID = maxList[0]
                    maxUpdateAt = maxList[1]
        logging.info("get_aggregate. ELSE: (%s, %s, %s)",
                     maxUpdateAt, minPKID, maxPKID)
    return (maxUpdateAt, int(minPKID), int(maxPKID))
def table_partitions(self, table_name):
    """Run ``show partitions <table_name>`` and return partition spec lines.

    Returns:
        A list of stripped partition-spec strings; [] on command failure.
        Blank lines (e.g. the trailing newline in stdout) are dropped so
        callers never see empty partition entries.
    """
    cmd = [hive_util.get_hive(), "-e",
           "use %s;show partitions %s" % (self.schema, table_name)]
    (result, stdout, stderr) = os_util.execute_command(cmd, do_sideput=False)
    if result != 0:
        # Include cmd in the failure log, matching the sibling hive helpers.
        logging.info("failed run cmd=%r stdout=%s stderr=%s", cmd, stdout, stderr)
        return []
    return [line.strip() for line in stdout.split('\n') if line.strip()]
def execute_hql_get_int(self, sql):
    """Run an HQL statement and parse its stdout as an integer.

    The statement is run on self.fair_scheduling_queue like execute_hql.

    Returns:
        The integer value printed by hive, or 0 when the command fails or
        the output is not parseable as an int.
    """
    int_value = 0
    cmd = [hive_util.get_hive(), "-e",
           "set mapred.job.queue.name=%s; %s" % (self.fair_scheduling_queue, sql)]
    (result, stdout, stderr) = os_util.execute_command(cmd, do_sideput=True)
    if result != 0:
        logging.info("failed run cmd=%r stdout=%s stderr=%s", cmd, stdout, stderr)
    else:
        try:
            int_value = int(stdout.strip())
        except ValueError:
            # Non-numeric output (e.g. stray hive warnings) → keep 0;
            # narrowed from a bare except that hid real errors.
            pass
    return int_value
def get_row_count(self, table_name):
    """Run ``select count(1)`` against table_name.

    Returns:
        The row count as the string hive printed, or "" on command failure.
    """
    cmd = [hive_util.get_hive(), "-e",
           "use %s;select count(1) from %s" % (self.schema, table_name)]
    (result, stdout, stderr) = os_util.execute_command(cmd, do_sideput=True)
    if result != 0:
        # %r replaces the Py2-only backtick repr; lazy %-args for logging.
        logging.info("failed run cmd=%r stdout=%s stderr=%s", cmd, stdout, stderr)
        return ""
    return stdout.strip()
def get_table_schema(self, table_name):
    """Run ``describe <table_name>`` and return its columns.

    Returns:
        A list of (name, type) tuples; [] when the command fails or the
        table does not exist.
    """
    cols = []
    cmd = [hive_util.get_hive(), "-e",
           "use %s;describe %s" % (self.schema, table_name)]
    logging.info("Running hive cmd %s", " ".join(cmd))
    (result, stdout, stderr) = os_util.execute_command(cmd, do_sideput=True)
    if result != 0:
        logging.info("failed run stdout=%s stderr=%s", stdout, stderr)
        return []
    if "Table not found" in stdout:
        logging.info("table does not exist stdout=%s stderr=%s", stdout, stderr)
        return []
    for line in stdout.split('\n'):
        # Py3-compatible replacement for the old `print >> sys.stderr` echo.
        sys.stderr.write(line + '\n')
        if "WARN" in line:
            continue
        tokens = shlex.split(line)
        # Guard the unpack: a blank line or a line carrying a comment column
        # previously raised ValueError on the bare 2-tuple unpack.
        if len(tokens) >= 2:
            cols.append((tokens[0], tokens[1]))
    return cols