Code example #1
File: sql_adaptor.py    Project: seshendranath/Grpn
    def execute_hql(self, sql):
        cmd = [hive_util.get_hive(), "-e",
               "set mapred.job.queue.name=%s; %s" % (self.fair_scheduling_queue, sql)]
        (result, stdout, stderr) = os_util.execute_command(cmd, do_sideput=True)
        if result != 0:
            logging.info("failed run cmd=" + repr(cmd) + " stdout=" + stdout + " stderr=" + stderr)
            # Note: failure returns 0, while success returns the full tuple below.
            return 0
        return (result, stdout, stderr)
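
For reference, a minimal, self-contained sketch of the argument list this method hands to os_util.execute_command; the hive binary path, queue name, and query below are placeholders, not values from the project:

fair_scheduling_queue = "etl"                  # hypothetical queue name
sql = "select count(1) from some_table"        # hypothetical query
cmd = ["hive", "-e",
       "set mapred.job.queue.name=%s; %s" % (fair_scheduling_queue, sql)]
print(cmd)
# ['hive', '-e', 'set mapred.job.queue.name=etl; select count(1) from some_table']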
Code example #2
    def get_tab_schema_detail(self, table_name):
        """ executes hive describe formatted <table_name>'. Returns map of results
            Also and date formats: Wed Mar 11 20:07:23 UTC 2015
        """
        kvre = re.compile(r'(\w+): (.*)$')
        timere = re.compile(r'\w+ (\w+) (\d+) (\d+):(\d+):(\d+) \w+ (\d+)')

        cmd = [
            hive_util.get_hive(), "-e",
            '"use %s;describe formatted %s"' % (self.schema, table_name)
        ]
        logging.info("Running " + " ".join(cmd))
        schema_hash = {'cols': {}, 'part_cols': {}, 'details': {}, 'storage': {}}
        (result, stdout, stderr) = os_util.execute_command(cmd,
                                                           do_sideput=True)
        if result != 0:
            return schema_hash
        cols = []
        part_info = "# Partition Information"
        detailed_info = "# Detailed Table Information"
        storage_info = "# Storage Information"
        curr_key = 'cols'
        for line in stdout.split('\n'):
            line = line.strip()
            if not line:
                continue
            if line[0] == "#":
                if part_info in line:
                    curr_key = 'part_cols'
                if detailed_info in line:
                    curr_key = 'details'
                if storage_info in line:
                    curr_key = 'storage'
                continue
            kv = kvre.match(line)
            if kv:
                parm = kv.group(1).lower().strip()
                val = kv.group(2).strip()
                tval_s = timere.match(val)
                if tval_s:
                    m = tval_s.group(1)
                    d = tval_s.group(2)
                    Y = tval_s.group(6)
                    H = tval_s.group(3)
                    M = tval_s.group(4)
                    S = tval_s.group(5)
                    datestr = "{m} {d} {Y} {H}:{M}:{S}".format(**locals())
                    val = datetime.datetime.strptime(datestr,
                                                     "%b %d %Y %H:%M:%S")

                schema_hash[curr_key][parm] = val

        logging.info("table {table_name} desc: {stdout}".format(**locals()))
        return schema_hash
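
The timestamp handling above can be checked in isolation. A self-contained sketch that reproduces the regex plus strptime conversion on the sample value quoted in the docstring:

import datetime
import re

val = "Wed Mar 11 20:07:23 UTC 2015"   # sample value from the docstring
timere = re.compile(r'\w+ (\w+) (\d+) (\d+):(\d+):(\d+) \w+ (\d+)')
tval_s = timere.match(val)
# Reorder the captured fields into "%b %d %Y %H:%M:%S" order.
datestr = "%s %s %s %s:%s:%s" % (tval_s.group(1), tval_s.group(2), tval_s.group(6),
                                 tval_s.group(3), tval_s.group(4), tval_s.group(5))
print(datetime.datetime.strptime(datestr, "%b %d %Y %H:%M:%S"))
# 2015-03-11 20:07:23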
Code example #3
    def get_latest_update_time(self, history_table, pkid, hdfs_temp_dir=None):
        """ get_row_count """
        """ hdfs_temp_dir is specified for merge and load operations. None is used for sqoop"""

        maxUpdateAt = None
        maxPKID = 0
        minPKID = 0
        result = None
        if hdfs_temp_dir:
            maxUpdateAt = self.get_aggregate(hdfs_temp_dir, "max_date")
            maxPKID = self.get_aggregate(hdfs_temp_dir,
                                         "max_pkid",
                                         asString=False)
            minPKID = self.get_aggregate(hdfs_temp_dir,
                                         "min_pkid",
                                         asString=False,
                                         getMax=False)
            logging.info("get_aggregate: (%s, %s, %s)" %
                         (maxUpdateAt, minPKID, maxPKID))
        else:
            cols = self.get_table_schema(history_table)
            col_names = [nm for (nm, ty) in cols]
            max_id_func = "max(cast(%s as int))" % pkid
            max_time_col = "'0000-00-00 00:00:00'"
            for col in timestamp_cols:
                if col in col_names:
                    max_time_col = "max(%s)" % col
                    break

            cmd = [
                hive_util.get_hive(), "-e",
                "use %s; select CONCAT(%s, ', ', %s ) from %s limit 1" %
                (self.schema, max_id_func, max_time_col, history_table)
            ]
            logging.info("Running hive cmd %s" % " ".join(cmd))
            (result, stdout, stderr) = os_util.execute_command(cmd,
                                                               do_sideput=True)

            if result is not None and stdout.strip() != 'NULL':
                if result != 0:
                    logging.info("failed run cmd=" + repr(cmd) + " stdout=" +
                                 stdout + " stderr=" + stderr)
                    maxUpdateAt = ""
                else:
                    maxList = []
                    for line in stdout.split('\n'):
                        if "WARN" not in line:
                            maxList = line.strip().split(",")
                    if not maxList:
                        maxList = ['0', '0000-00-00 00:00:00']
                    maxPKID = maxList[0]
                    maxUpdateAt = maxList[1]
            logging.info("get_aggregate. ELSE: (%s, %s, %s)" %
                         (maxUpdateAt, minPKID, maxPKID))
        return (maxUpdateAt, int(minPKID), int(maxPKID))
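
To make the else branch easier to follow, here is a standalone sketch of the query string it builds; the schema, table, and column names are hypothetical:

schema = "warehouse"                       # hypothetical schema
history_table = "orders_history"           # hypothetical table
pkid = "order_id"                          # hypothetical primary-key column
max_id_func = "max(cast(%s as int))" % pkid
max_time_col = "max(updated_at)"           # used when a known timestamp column is present
query = "use %s; select CONCAT(%s, ', ', %s ) from %s limit 1" % (
    schema, max_id_func, max_time_col, history_table)
print(query)
# use warehouse; select CONCAT(max(cast(order_id as int)), ', ', max(updated_at) ) from orders_history limit 1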
Code example #4
    def table_partitions(self, table_name):
        """ executes hive `show partitions <table_name>`. Returns the result lines """
        cmd = [
            hive_util.get_hive(), "-e",
            "use %s;show partitions %s" % (self.schema, table_name)
        ]
        (result, stdout, stderr) = os_util.execute_command(cmd,
                                                           do_sideput=False)
        if result != 0:
            logging.info("failed run stdout=" + stdout + " stderr=" + stderr)
            return []
        return [line.strip() for line in stdout.split('\n')]
Code example #5
File: sql_adaptor.py    Project: seshendranath/Grpn
    def execute_hql_get_int(self, sql):
        int_value = 0
        cmd = [hive_util.get_hive(), "-e",
               "set mapred.job.queue.name=%s; %s" % (self.fair_scheduling_queue, sql)]
        (result, stdout, stderr) = os_util.execute_command(cmd, do_sideput=True)
        if result != 0:
            logging.info("failed run cmd=" + repr(cmd) + " stdout=" + stdout + " stderr=" + stderr)
        else:
            try:
                int_value = int(stdout.strip())
            except ValueError:
                pass

        return int_value
Code example #6
    def get_row_count(self, table_name):
        """ get_row_count """
        cmd = [
            hive_util.get_hive(), "-e",
            "use %s;select count(1) from %s" % (self.schema, table_name)
        ]
        (result, stdout, stderr) = os_util.execute_command(cmd,
                                                           do_sideput=True)
        if result != 0:
            logging.info("failed run cmd=" + repr(cmd) + " stdout=" + stdout +
                         " stderr=" + stderr)
            return ""

        return stdout.strip()
Code example #7
    def get_table_schema(self, table_name):
        """ executes hive `describe <table_name>`. Returns a list of (name, type) tuples """
        cols = []
        cmd = [
            hive_util.get_hive(), "-e",
            "use %s;describe %s" % (self.schema, table_name)
        ]
        logging.info("Running hive cmd %s" % " ".join(cmd))
        (result, stdout, stderr) = os_util.execute_command(cmd,
                                                           do_sideput=True)
        if result != 0:
            logging.info("failed run stdout=" + stdout + " stderr=" + stderr)
            return []
        if "Table not found" in stdout:
            logging.info("table does not exist stdout=" + stdout +
                         " stderr=" + stderr)
            return []
        for line in stdout.split('\n'):
            sys.stderr.write(line + "\n")
            if line.strip() and "WARN" not in line:
                # Each describe line is "<column_name> <data_type> [<comment>]".
                name, col_type = shlex.split(line)[:2]
                cols.append((name, col_type))
        return cols
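
A small, self-contained sketch of the line parsing used above, with a hypothetical describe output line:

import shlex

line = "order_id            \tbigint"      # hypothetical describe output line
name, col_type = shlex.split(line)[:2]
print((name, col_type))
# ('order_id', 'bigint')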