# Standard-library imports used by the utilities below.
import operator
import sys
from collections import namedtuple

# HdfsUtil, PostgresUtil, Config, queryPostgres, getCommandOutput, printInfo,
# printError, bytes2human, pprinttable and the *Row classes are project-internal
# helpers assumed to be importable from the surrounding package.


class HawqUtil:
    def __init__(self, conf):
        self.conf = conf
        self.hdfsUtil = HdfsUtil(conf)
        self.pgUtil = PostgresUtil(conf)
        self.segDirs = self.getSegmentDirs()   
        
    def printReport(self):
        printInfo("Getting database names from HAWQ")
        dbNames = self.getDatabaseNames()
        printInfo("Getting quotas from these databases %s" % (dbNames))    
        for db in dbNames:
            self.printDatabaseQuota(db)
        self.printTopKLargestDatabases(dbNames)
        
    def getDatabaseNames(self):
        names = queryPostgres(self.conf.get(Config.HAWQ_PORT), self.conf.get(Config.HAWQ_USERNAME), self.conf.get(Config.HAWQ_METADATA_DB), "SELECT DISTINCT datname FROM pg_database", split=False)
        
        retval = []
        for name in names.split():
            if name not in self.conf.get(Config.HAWQ_SYSTEM_DB_BLACKLIST):
                retval.append(name)
        return retval
        
    def getDatabaseOID(self, name):
        oids = queryPostgres(self.conf.get(Config.HAWQ_PORT), self.conf.get(Config.HAWQ_USERNAME), self.conf.get(Config.HAWQ_METADATA_DB), "SELECT oid FROM pg_database WHERE datname = '%s'" % (name))
        if oids is None:
            printError("Database %s not found" % (name))
            return None
        elif len(oids) == 1:
            return oids[0]
        else:
            printError("Received %i OIDs, expecting 1" % (len(oids)))
            return None
        
    def getTableOID(self, database, table):
        oids = queryPostgres(self.conf.get(Config.HAWQ_PORT), self.conf.get(Config.HAWQ_USERNAME), database, "SELECT oid FROM pg_class WHERE relname = '%s'" % (table))
        if len(oids) == 1:
            return oids[0]
        else:
            printError("Received %i OIDs, expecting 1" %(len(oids)))
            return None
        
    def getSegmentDirs(self):
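        # List the HAWQ data directory and keep only the gpseg* segment paths
        # ($8 of 'hdfs dfs -ls' output is the path column).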
        cmd = "hdfs dfs -ls %s | grep gpseg | awk '{print $8}'" % (self.conf.get(Config.HAWQ_HDFS_DIR))
        dirs = getCommandOutput(cmd).split()
    
        if len(dirs) == 0:
            printError("Failed to get any segment directories from HDFS")
            sys.exit(1)  
            
        return dirs

    def getSchemaTables(self, database):
        output = queryPostgres(self.conf.get(Config.HAWQ_PORT), self.conf.get(Config.HAWQ_USERNAME), database, "SELECT table_schema, table_name FROM information_schema.tables", split=False)

        schemaTableMap = dict()
        for record in output.split('\n'):
            record = record.strip().replace('|', '')
            if not record:
                continue
            (schema, table) = record.split()

            # self.schemaBlackList is assumed to be provided elsewhere; it is not
            # set in __init__ above.
            if schema not in self.schemaBlackList:
                schemaTableMap.setdefault(schema, []).append(table)

        return schemaTableMap

    def printDatabaseQuota(self, db):
        dbOID = self.getDatabaseOID(db)

        if dbOID is not None:
            printInfo("Getting quota status for database %s" % (db))
            hdfsDBDirs = []
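            # Each segment keeps this database's files under <segDir>/16385/<dbOID>;
            # 16385 is assumed here to be the OID of HAWQ's dfs tablespace, matching
            # the hard-coded path below.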
            for segDir in self.segDirs:
                hdfsDBDirs.append("%s/16385/%s" % (segDir, dbOID))

            quotas = self.hdfsUtil.getSpaceQuotas(hdfsDBDirs)
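            # getSpaceQuotas is assumed to return (directory, quota, remainingQuota)
            # tuples, with quota == 'none' and remainingQuota == 'inf' when no quota
            # is set (mirroring 'hdfs dfs -count -q' output).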

            self.__printDBQuotaInserts(db, quotas)

            row = namedtuple('Row', ['Database', 'Directory', 'Quota', 'Remaining', 'QuotaHR', 'RemainingHR'])

            toPrint = []
            for (directory, quota, remainingQuota) in quotas:
                quotaHR = bytes2human(quota) if quota != 'none' else quota
                remainingQuotaHR = bytes2human(remainingQuota) if remainingQuota != 'inf' else remainingQuota
                toPrint.append(row(db, directory, quota, remainingQuota, quotaHR, remainingQuotaHR))
            
            pprinttable(toPrint)

    def __printDBQuotaInserts(self, db, quotas):        
        for (directory, quota, remainingQuota) in quotas:
            row = HawqDBQuotaRow()
            row.database = db
            row.dir = directory
            
            if quota != 'none':
                row.quota = int(quota)
                row.quotaRemaining = int(remainingQuota)
                row.quotaUsed = row.quota - row.quotaRemaining
            else:
                row.quota = None
                row.quotaRemaining = None
                row.quotaUsed = None
                
            self.pgUtil.writeInsert(row)           
            
    def getDatabaseSize(self, db):
        dbOID = self.getDatabaseOID(db)

        if dbOID is not None:
            dbDir = "%s/*/16385/%s" % (self.conf.get(Config.HAWQ_HDFS_DIR), dbOID)

            sizes = self.hdfsUtil.getDirSizes([dbDir])

            total = 0
            for (dir, size) in sizes:
                total += size

            return (db, total)
        else:
            return None
            
    def printTopKLargestDatabases(self, dbNames):
        k = self.conf.get(Config.REPORTER_K)
        printInfo("Getting top %s largest HAWQ databases" % (k))
        
        dbSizes = []
        for db in dbNames:
            tDbSize = self.getDatabaseSize(db)
            if tDbSize is not None:
                dbSizes.append(tDbSize)

        if len(dbSizes) == 0:
            printInfo("No HAWQ databases found in HDFS")
            return
        
        dbSizes.sort(key=operator.itemgetter(1), reverse=True)
        
        if len(dbSizes) > k:
            dbSizes = dbSizes[0:k]
            
        self.__printTopKLargestDatabasesInserts(dbSizes)
            
        # print sizes
        row = namedtuple('Row', ['Database', 'Size', 'SizeHR'])

        toPrint = []
        for (db, size) in dbSizes:
            sizeHR = bytes2human(size)
            toPrint.append(row(db, str(size), str(sizeHR)))
        pprinttable(toPrint)

    def __printTopKLargestDatabasesInserts(self, dbSizes):
    
        for (db, size) in dbSizes:
            row = HawqDBSizeRow()
            row.database = db
            row.size = size
            
            self.pgUtil.writeInsert(row)       
    
    def setDatabaseQuota(self, db, quota):
    
        if db in self.conf.get(Config.HAWQ_DB_BLACKLIST).split():
            printError("Database %s is in the blacklist. Remove to set quota" % (db))
            return
            
        dbOID = self.getDatabaseOID(db)

        if dbOID is not None:
            printInfo("Setting quota for %s to %s bytes" % (db, quota))
            hdfsDBDirs = []
            for segDir in self.segDirs:
                hdfsDBDirs.append("%s/16385/%s" % (segDir, dbOID))

            self.hdfsUtil.setSpaceQuotas(hdfsDBDirs, quota)
        else:
            sys.exit(1)

    def clearDatabaseQuota(self, db):
        dbOID = self.getDatabaseOID(db)

        if dbOID is not None:
            printInfo("Clearing quota for database %s" % (db))
            hdfsDBDirs = []
            for segDir in self.segDirs:
                hdfsDBDirs.append("%s/16385/%s" % (segDir, dbOID))

            self.hdfsUtil.clearSpaceQuotas(hdfsDBDirs)
        else:
            sys.exit(1)
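
# Hedged usage sketch (illustrative, not part of the original source): HawqUtil
# reports and manages per-database HDFS space quotas for HAWQ. The 'analytics'
# database name below is hypothetical.
#
#   hawq = HawqUtil(conf)
#   hawq.printReport()                                    # quotas plus top-k sizes
#   hawq.setDatabaseQuota('analytics', 10 * 1024 ** 3)    # 10 GiB space quota
#   hawq.clearDatabaseQuota('analytics')
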
class UserUtil:
    def __init__(self, conf):
        self.conf = conf
        self.hdfsUtil = HdfsUtil(conf)
        self.pgUtil = PostgresUtil(conf)

    def printReport(self):
        self.printUserSpaceQuotas()
        self.printUserINodeQuotas()
        self.printTopKSpaceUsers()
        self.printTopKINodeUsers()

    def printUserSpaceQuotas(self):
        printInfo("Getting space quota status for users")

        quotas = self.hdfsUtil.getSpaceQuotas(self.getUserDirectories())

        if len(quotas) == 0:
            printInfo("No user directories found in HDFS")
            return

        quotas.sort()

        self.__printUserSpaceQuotasInserts(quotas)

        row = namedtuple(
            'Row',
            ['Directory', 'Quota', 'Remaining', 'QuotaHR', 'RemainingHR'])

        toPrint = []
        for (directory, quota, remainingQuota) in quotas:
            quotaHR = bytes2human(quota) if quota != 'none' else quota
            # Sometimes the remaining quota is negative...
            if remainingQuota != 'inf':
                if long(remainingQuota) < 0:
                    remainingQuotaHR = "-" + bytes2human(-long(remainingQuota))
                else:
                    remainingQuotaHR = bytes2human(remainingQuota)
            else:
                remainingQuotaHR = remainingQuota
            toPrint.append(
                row(directory, quota, remainingQuota, quotaHR,
                    remainingQuotaHR))

        pprinttable(toPrint)

    def __printUserSpaceQuotasInserts(self, quotas):
        for (directory, quota, remainingQuota) in quotas:
            row = UserSpaceQuotaRow()
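            # Directories are of the form /user/<name>; dropping the first six
            # characters ("/user/") leaves the username.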
            row.username = directory[6:]
            row.dir = directory

            if quota != 'none':
                row.quota = int(quota)
                row.quotaRemaining = int(remainingQuota)
                row.quotaUsed = row.quota - row.quotaRemaining
            else:
                row.quota = None
                row.quotaRemaining = None
                row.quotaUsed = None

            self.pgUtil.writeInsert(row)

    def printUserINodeQuotas(self):
        printInfo("Getting inode quota status for users")

        quotas = self.hdfsUtil.getINodeQuotas(self.getUserDirectories())

        if len(quotas) == 0:
            printInfo("No user directories found in HDFS")
            return

        quotas.sort()

        self.__printUserINodeQuotasInserts(quotas)

        row = namedtuple('Row', ['Directory', 'Quota', 'Remaining'])

        toPrint = []
        for (directory, quota, remainingQuota) in quotas:
            toPrint.append(row(directory, quota, remainingQuota))

        pprinttable(toPrint)

    def __printUserINodeQuotasInserts(self, quotas):
        for (directory, quota, remainingQuota) in quotas:
            row = UserINodeQuotaRow()
            row.username = directory[6:]
            row.dir = directory

            if quota != 'none':
                row.quota = int(quota)
                row.quotaRemaining = int(remainingQuota)
                row.quotaUsed = row.quota - row.quotaRemaining
            else:
                row.quota = None
                row.quotaRemaining = None
                row.quotaUsed = None

            self.pgUtil.writeInsert(row)

    def printUserSpaceQuota(self, user):
        printInfo("Getting space quota status for user %s" % (user))

        quotas = self.hdfsUtil.getSpaceQuotas(["/user/%s" % (user)])

        if len(quotas) == 0:
            printInfo("Directory for user %s not found in HDFS" % (quotas))
            return

        row = namedtuple(
            'Row',
            ['Directory', 'Quota', 'Remaining', 'QuotaHR', 'RemainingHR'])

        toPrint = []
        for (directory, quota, remainingQuota) in quotas:
            quotaHR = bytes2human(quota) if quota != 'none' else quota
            remainingQuotaHR = bytes2human(
                remainingQuota) if remainingQuota != 'inf' else remainingQuota
            toPrint.append(
                row(directory, quota, remainingQuota, quotaHR,
                    remainingQuotaHR))

        pprinttable(toPrint)

    def printUserINodeQuota(self, user):
        printInfo("Getting inode quota status for user %s" % (user))

        quotas = self.hdfsUtil.getINodeQuotas(["/user/%s" % (user)])

        if len(quotas) == 0:
            printInfo("Directory for user %s not found in HDFS" % (quotas))
            return

        row = namedtuple('Row', ['Directory', 'Quota', 'Remaining'])

        toPrint = []
        for (directory, quota, remainingQuota) in quotas:
            toPrint.append(row(directory, quota, remainingQuota))

        pprinttable(toPrint)

    def printTopKSpaceUsers(self):
        k = self.conf.get(Config.REPORTER_K)
        printInfo("Getting top %s space users" % (k))

        sizes = self.hdfsUtil.getDirSizes(['/user'])

        if len(sizes) == 0:
            printInfo("No user directories found in HDFS")
            return

        sizes.sort(key=operator.itemgetter(1), reverse=True)

        if len(sizes) > k:
            sizes = sizes[0:k]

        self.__printTopKSpaceInserts(sizes)

        row = namedtuple('Row', ['User', 'Size', 'SizeHR'])

        toPrint = []
        for (dir, size) in sizes:
            sizeHR = bytes2human(size)
            toPrint.append(row(dir, str(size), str(sizeHR)))

        pprinttable(toPrint)

    def __printTopKSpaceInserts(self, sizes):
        for (dir, size) in sizes:
            row = UserSpaceSizeRow()
            row.username = dir[6:]
            row.dir = dir
            row.size = size

            self.pgUtil.writeInsert(row)

    def printTopKINodeUsers(self):
        k = self.conf.get(Config.REPORTER_K)
        printInfo("Getting top %s inode users" % (k))

        counts = self.hdfsUtil.getINodeCounts(self.getUserDirectories())

        if len(counts) == 0:
            printInfo("No user directories found in HDFS")
            return

        counts.sort(key=operator.itemgetter(1), reverse=True)

        if len(counts) > k:
            counts = counts[0:k]

        self.__printTopKINodeUsersInserts(counts)

        row = namedtuple('Row', ['User', 'INodes'])

        toPrint = []
        for (dir, count) in counts:
            toPrint.append(row(dir, str(count)))

        pprinttable(toPrint)

    def __printTopKINodeUsersInserts(self, counts):
        for (dir, count) in counts:
            row = UserINodeSizeRow()
            row.username = dir[6:]
            row.dir = dir
            row.size = count

            self.pgUtil.writeInsert(row)

    def setUserSpaceQuota(self, user, quota):
        if user in self.conf.get(Config.USER_DIR_BLACKLIST).split():
            printError("User %s is in the blacklist. Remove to set quota" %
                       (user))
            return

        self.hdfsUtil.setSpaceQuotas(["/user/%s" % (user)], quota)

    def clearUserSpaceQuota(self, user):
        self.hdfsUtil.clearSpaceQuotas(["/user/%s" % (user)])

    def setUserINodeQuota(self, user, quota):
        if user in self.conf.get(Config.USER_DIR_BLACKLIST).split():
            printError("User %s is in the blacklist. Remove to set quota" %
                       (user))
            return

        self.hdfsUtil.setINodeQuotas(["/user/%s" % (user)], quota)

    def clearUserINodeQuota(self, user):
        self.hdfsUtil.clearINodeQuotas(["/user/%s" % (user)])

    def getUserDirectories(self):
        return self.hdfsUtil.listDirs(['/user'])
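
# Hedged usage sketch (illustrative, not part of the original source): UserUtil
# manages per-user HDFS space and inode quotas under /user. The username and
# limits below are hypothetical.
#
#   users = UserUtil(conf)
#   users.printUserSpaceQuota('alice')
#   users.setUserSpaceQuota('alice', 50 * 1024 ** 3)      # 50 GiB space quota
#   users.setUserINodeQuota('alice', 100000)              # 100k file/inode limit
#   users.clearUserSpaceQuota('alice')
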
class HiveUtil:
    def __init__(self, conf):
        self.conf = conf
        self.hdfsUtil = HdfsUtil(conf)
        self.pgUtil = PostgresUtil(conf)

    def printReport(self):
        printInfo("Fetching contents of Hive warehouse")

        hivedbdirs = self.getHiveDatabaseDirectories()

        self.printDatabaseQuotas(hivedbdirs)
        self.printTopKLargestDatabases(hivedbdirs)

    def getHiveDatabaseDirectories(self):
        hivedirs = self.hdfsUtil.listDirs(
            [self.conf.get(Config.HIVE_WAREHOUSE_DIR)])
        retval = []
        for dir in hivedirs:
            if dir.endswith(".db"):
                retval.append(dir)
        return retval

    def printDatabaseQuota(self, db):
        printInfo("Getting quota status for Hive database %s" % (db))

        quotas = self.hdfsUtil.getSpaceQuotas(
            ["%s%s.db" % (self.conf.get(Config.HIVE_WAREHOUSE_DIR), db)])

        if len(quotas) == 0:
            printInfo("No Hive databases found")
            return

        row = namedtuple('Row', [
            'Database', 'Directory', 'Quota', 'Remaining', 'QuotaHR',
            'RemainingHR'
        ])

        toPrint = []
        for (directory, quota, remainingQuota) in quotas:
            dbName = directory.replace(".db", "").replace(
                self.conf.get(Config.HIVE_WAREHOUSE_DIR), "")
            quotaHR = bytes2human(quota) if quota != 'none' else quota
            remainingQuotaHR = bytes2human(
                remainingQuota) if remainingQuota != 'inf' else remainingQuota
            toPrint.append(
                row(dbName, directory, quota, remainingQuota, quotaHR,
                    remainingQuotaHR))

        pprinttable(toPrint)

    def printDatabaseQuotas(self, hivedbdirs):
        printInfo("Getting quota status for Hive databases")

        hdfsDirs = []
        for dir in hivedbdirs:
            db = self.getDbNameFromPath(dir)
            hdfsDirs.append("%s/%s.db" %
                            (self.conf.get(Config.HIVE_WAREHOUSE_DIR), db))

        quotas = self.hdfsUtil.getSpaceQuotas(hdfsDirs)

        if len(quotas) == 0:
            printInfo("No Hive databases found")
            return

        quotas.sort()

        self.__printDBQuotasInserts(quotas)

        row = namedtuple('Row', [
            'Database', 'Directory', 'Quota', 'Remaining', 'QuotaHR',
            'RemainingHR'
        ])

        toPrint = []
        for (directory, quota, remainingQuota) in quotas:
            dbName = directory.replace(".db", "").replace(
                self.conf.get(Config.HIVE_WAREHOUSE_DIR), "")
            quotaHR = bytes2human(quota) if quota != 'none' else quota
            remainingQuotaHR = bytes2human(
                remainingQuota) if remainingQuota != 'inf' else remainingQuota
            toPrint.append(
                row(dbName, directory, quota, remainingQuota, quotaHR,
                    remainingQuotaHR))

        pprinttable(toPrint)

    def __printDBQuotasInserts(self, quotas):
        for (directory, quota, remainingQuota) in quotas:
            row = HiveDBQuotaRow()
            row.database = directory.replace(".db", "").replace(
                self.conf.get(Config.HIVE_WAREHOUSE_DIR), "")
            row.dir = directory

            if quota != 'none':
                row.quota = int(quota)
                row.quotaRemaining = int(remainingQuota)
                row.quotaUsed = row.quota - row.quotaRemaining
            else:
                row.quota = None
                row.quotaRemaining = None
                row.quotaUsed = None

            self.pgUtil.writeInsert(row)

    def getDatabaseSize(self, dbDir):
        sizes = self.hdfsUtil.getDirSizes([dbDir])

        total = 0
        for (dir, size) in sizes:
            total += size

        return (dbDir, total)

    def printTopKLargestDatabases(self, hivedbdirs):
        k = self.conf.get(Config.REPORTER_K)
        printInfo("Getting top %s largest Hive databases" % (k))

        dbSizes = []
        for dbDir in hivedbdirs:
            tDbSize = self.getDatabaseSize(dbDir)
            if tDbSize is not None:
                dbSizes.append(tDbSize)

        if len(dbSizes) == 0:
            printInfo("No Hive databases found in HDFS")
            return

        dbSizes.sort(key=operator.itemgetter(1), reverse=True)

        if len(dbSizes) > k:
            dbSizes = dbSizes[0:k]

        self.__printTopKLargestDatabases(dbSizes)

        # print sizes
        row = namedtuple('Row', ['Database', 'Size', 'SizeHR'])

        toPrint = []
        for (db, size) in dbSizes:
            sizeHR = bytes2human(size)
            toPrint.append(row(db, str(size), str(sizeHR)))
        pprinttable(toPrint)

    def __printTopKLargestDatabases(self, dbSizes):

        for (db, size) in dbSizes:
            row = HiveDBSizeRow()
            row.database = db
            row.size = size

            self.pgUtil.writeInsert(row)

    def setDatabaseQuota(self, db, quota):

        if db in self.conf.get(Config.HIVE_DB_BLACKLIST).split():
            printError("Database %s is in the blacklist. Remove to set quota" %
                       (db))
            return

        printInfo("Setting quota for %s to %s bytes" % (db, quota))

        self.hdfsUtil.setSpaceQuotas([self.getDbPathFromName(db)], quota)

    def clearDatabaseQuota(self, db):
        printInfo("Clearing quota for database %s" % (db))
        self.hdfsUtil.clearSpaceQuotas([self.getDbPathFromName(db)])

    def getDbNameFromPath(self, dir):
        return dir.replace(self.conf.get(Config.HIVE_WAREHOUSE_DIR),
                           "").replace(".db", "")

    def getDbPathFromName(self, db):
        return "%s%s.db" % (self.conf.get(Config.HIVE_WAREHOUSE_DIR), db)