class UserUtil: def __init__(self, conf): self.conf = conf self.hdfsUtil = HdfsUtil(conf) self.pgUtil = PostgresUtil(conf) def printReport(self): self.printUserSpaceQuotas() self.printUserINodeQuotas() self.printTopKSpaceUsers() self.printTopKINodeUsers() def printUserSpaceQuotas(self): printInfo("Getting space quota status for users") quotas = self.hdfsUtil.getSpaceQuotas(self.getUserDirectories()) if len(quotas) == 0: printInfo("No user directories found in HDFS") return quotas.sort() self.__printUserSpaceQuotasInserts(quotas) row = namedtuple( 'Row', ['Directory', 'Quota', 'Remaining', 'QuotaHR', 'RemainingHR']) toPrint = [] for (directory, quota, remainingQuota) in quotas: quotaHR = bytes2human(quota) if quota != 'none' else quota # Sometimes the remaining quota is negative... if remainingQuota != 'inf': if long(remainingQuotaHR) < 0: remainingQuotaHR = "-" + bytes2human(-long(remainingQuota)) else: remainingQuotaHR = bytes2human(remainingQuota) else: remainingQuotaHR = remainingQuota toPrint.append( row(directory, quota, remainingQuota, quotaHR, remainingQuotaHR)) pprinttable(toPrint) def __printUserSpaceQuotasInserts(self, quotas): for (directory, quota, remainingQuota) in quotas: row = UserSpaceQuotaRow() row.username = directory[6:] row.dir = directory if not quota == 'none': row.quota = int(quota) row.quotaRemaining = int(remainingQuota) row.quotaUsed = row.quota - row.quotaRemaining else: row.quota = None row.quotaRemaining = None row.quotaUsed = None self.pgUtil.writeInsert(row) def printUserINodeQuotas(self): printInfo("Getting inode quota status for users") quotas = self.hdfsUtil.getINodeQuotas(self.getUserDirectories()) if len(quotas) == 0: printInfo("No user directories found in HDFS") return quotas.sort() self.__printUserINodeQuotasInserts(quotas) row = namedtuple('Row', ['Directory', 'Quota', 'Remaining']) toPrint = [] for (directory, quota, remainingQuota) in quotas: toPrint.append(row(directory, quota, remainingQuota)) pprinttable(toPrint) def __printUserINodeQuotasInserts(self, quotas): for (directory, quota, remainingQuota) in quotas: row = UserINodeQuotaRow() row.username = directory[6:] row.dir = directory if not quota == 'none': row.quota = int(quota) row.quotaRemaining = int(remainingQuota) row.quotaUsed = row.quota - row.quotaRemaining else: row.quota = None row.quotaRemaining = None row.quotaUsed = None self.pgUtil.writeInsert(row) def printUserSpaceQuota(self, user): printInfo("Getting space quota status for user %s" % (user)) quotas = self.hdfsUtil.getSpaceQuotas(["/user/%s" % (user)]) if len(quotas) == 0: printInfo("Directory for user %s not found in HDFS" % (quotas)) return row = namedtuple( 'Row', ['Directory', 'Quota', 'Remaining', 'QuotaHR', 'RemainingHR']) toPrint = [] for (directory, quota, remainingQuota) in quotas: quotaHR = bytes2human(quota) if quota != 'none' else quota remainingQuotaHR = bytes2human( remainingQuota) if remainingQuota != 'inf' else remainingQuota toPrint.append( row(directory, quota, remainingQuota, quotaHR, remainingQuotaHR)) pprinttable(toPrint) def printUserINodeQuota(self, user): printInfo("Getting inode quota status for user %s" % (user)) quotas = self.hdfsUtil.getINodeQuotas(["/user/%s" % (user)]) if len(quotas) == 0: printInfo("Directory for user %s not found in HDFS" % (quotas)) return row = namedtuple('Row', ['Directory', 'Quota', 'Remaining']) toPrint = [] for (directory, quota, remainingQuota) in quotas: toPrint.append(row(directory, quota, remainingQuota)) pprinttable(toPrint) def printTopKSpaceUsers(self): k = self.conf.get(Config.REPORTER_K) printInfo("Getting top %s space users" % (k)) sizes = self.hdfsUtil.getDirSizes(['/user']) if len(sizes) == 0: printInfo("No user directories found in HDFS") return sizes.sort(key=operator.itemgetter(1), reverse=True) if len(sizes) > k: sizes = sizes[0:k] self.__printTopKSpaceInserts(sizes) row = namedtuple('Row', ['User', 'Size', 'SizeHR']) toPrint = [] for (dir, size) in sizes: sizeHR = bytes2human(size) toPrint.append(row(dir, str(size), str(sizeHR))) pprinttable(toPrint) def __printTopKSpaceInserts(self, sizes): for (dir, size) in sizes: row = UserSpaceSizeRow() row.username = dir[6:] row.dir = dir row.size = size self.pgUtil.writeInsert(row) def printTopKINodeUsers(self): k = self.conf.get(Config.REPORTER_K) printInfo("Getting top %s inode users" % (k)) counts = self.hdfsUtil.getINodeCounts(self.getUserDirectories()) if len(counts) == 0: printInfo("No user directories found in HDFS") return counts.sort(key=operator.itemgetter(1), reverse=True) if len(counts) > k: counts = counts[0:k] self.__printTopKINodeUsersInserts(counts) row = namedtuple('Row', ['User', 'INodes']) toPrint = [] for (dir, count) in counts: toPrint.append(row(dir, str(count))) pprinttable(toPrint) def __printTopKINodeUsersInserts(self, counts): for (dir, count) in counts: row = UserINodeSizeRow() row.username = dir[6:] row.dir = dir row.size = count self.pgUtil.writeInsert(row) def setUserSpaceQuota(self, user, quota): if user == self.conf.get(Config.USER_DIR_BLACKLIST).split(): printError("User %s is in the blacklist. Remove to set quota" % (db)) return self.hdfsUtil.setSpaceQuotas(["/user/%s" % (user)], quota) def clearUserSpaceQuota(self, user): self.hdfsUtil.clearSpaceQuotas(["/user/%s" % (user)]) def setUserINodeQuota(self, user, quota): if user == self.conf.get(Config.USER_DIR_BLACKLIST).split(): printError("User %s is in the blacklist. Remove to set quota" % (db)) return self.hdfsUtil.setINodeQuotas(["/user/%s" % (user)], quota) def clearUserINodeQuota(self, user): self.hdfsUtil.clearINodeQuotas(["/user/%s" % (user)]) def getUserDirectories(self): return self.hdfsUtil.listDirs(['/user'])
class HiveUtil: def __init__(self, conf): self.conf = conf self.hdfsUtil = HdfsUtil(conf) self.pgUtil = PostgresUtil(conf) def printReport(self): printInfo("Fetching contents of Hive warehouse") hivedbdirs = self.getHiveDatabaseDirectories() self.printDatabaseQuotas(hivedbdirs) self.printTopKLargestDatabases(hivedbdirs) def getHiveDatabaseDirectories(self): hivedirs = self.hdfsUtil.listDirs( [self.conf.get(Config.HIVE_WAREHOUSE_DIR)]) retval = [] for dir in hivedirs: if dir.endswith(".db"): retval.append(dir) return retval def printDatabaseQuota(self, db): printInfo("Getting quota status for Hive database %s" % (db)) quotas = self.hdfsUtil.getSpaceQuotas( ["%s%s.db" % (self.conf.get(Config.HIVE_WAREHOUSE_DIR), db)]) if len(quotas) == 0: printInfo("No Hive databases found") return row = namedtuple('Row', [ 'Database', 'Directory', 'Quota', 'Remaining', 'QuotaHR', 'RemainingHR' ]) toPrint = [] for (directory, quota, remainingQuota) in quotas: dbName = directory.replace(".db", "").replace( self.conf.get(Config.HIVE_WAREHOUSE_DIR), "") quotaHR = bytes2human(quota) if quota != 'none' else quota remainingQuotaHR = bytes2human( remainingQuota) if remainingQuota != 'inf' else remainingQuota toPrint.append( row(dbName, directory, quota, remainingQuota, quotaHR, remainingQuotaHR)) pprinttable(toPrint) def printDatabaseQuotas(self, hivedbdirs): printInfo("Getting quota status for Hive databases") hdfsDirs = [] for dir in hivedbdirs: db = self.getDbNameFromPath(dir) hdfsDirs.append("%s/%s.db" % (self.conf.get(Config.HIVE_WAREHOUSE_DIR), db)) quotas = self.hdfsUtil.getSpaceQuotas(hdfsDirs) if len(quotas) == 0: printInfo("No Hive databases found") return quotas.sort() self.__printDBQuotasInserts(quotas) row = namedtuple('Row', [ 'Database', 'Directory', 'Quota', 'Remaining', 'QuotaHR', 'RemainingHR' ]) toPrint = [] for (directory, quota, remainingQuota) in quotas: dbName = directory.replace(".db", "").replace( self.conf.get(Config.HIVE_WAREHOUSE_DIR), "") quotaHR = bytes2human(quota) if quota != 'none' else quota remainingQuotaHR = bytes2human( remainingQuota) if remainingQuota != 'inf' else remainingQuota toPrint.append( row(dbName, directory, quota, remainingQuota, quotaHR, remainingQuotaHR)) pprinttable(toPrint) def __printDBQuotasInserts(self, quotas): for (directory, quota, remainingQuota) in quotas: row = HiveDBQuotaRow() row.database = directory.replace(".db", "").replace( self.conf.get(Config.HIVE_WAREHOUSE_DIR), "") row.dir = directory if not quota == 'none': row.quota = int(quota) row.quotaRemaining = int(remainingQuota) row.quotaUsed = row.quota - row.quotaRemaining else: row.quota = None row.quotaRemaining = None row.quotaUsed = None self.pgUtil.writeInsert(row) def getDatabaseSize(self, dbDir): sizes = self.hdfsUtil.getDirSizes([dbDir]) sum = 0 for (dir, size) in sizes: sum += size return (dbDir, sum) def printTopKLargestDatabases(self, hivedbdirs): k = self.conf.get(Config.REPORTER_K) printInfo("Getting top %s largest Hive databases" % (k)) dbSizes = [] for dbDir in hivedbdirs: tDbSize = self.getDatabaseSize(dbDir) if not tDbSize is None: dbSizes.append(tDbSize) if len(dbSizes) == 0: printInfo("No Hive databases found in HDFS") return dbSizes.sort(key=operator.itemgetter(1), reverse=True) if len(dbSizes) > k: dbSizes = dbSizes[0:k] self.__printTopKLargestDatabases(dbSizes) # print sizes row = namedtuple('Row', ['Database', 'Size', 'SizeHR']) toPrint = [] for (db, size) in dbSizes: sizeHR = bytes2human(size) toPrint.append(row(db, str(size), str(sizeHR))) pprinttable(toPrint) def __printTopKLargestDatabases(self, dbSizes): for (db, size) in dbSizes: row = HiveDBSizeRow() row.database = db row.size = size self.pgUtil.writeInsert(row) def setDatabaseQuota(self, db, quota): if db == self.conf.get(Config.HIVE_DB_BLACKLIST).split(): printError("Database %s is in the blacklist. Remove to set quota" % (db)) return printInfo("Setting quota for %s to %s bytes" % (db, quota)) self.hdfsUtil.setSpaceQuotas([self.getDbPathFromName(db)], quota) def clearDatabaseQuota(self, db): printInfo("Clearing quota for database %s" % (db)) self.hdfsUtil.clearSpaceQuotas([self.getDbPathFromName(db)]) def getDbNameFromPath(self, dir): return dir.replace(self.conf.get(Config.HIVE_WAREHOUSE_DIR), "").replace(".db", "") def getDbPathFromName(self, db): return "%s%s.db" % (self.conf.get(Config.HIVE_WAREHOUSE_DIR), db)
class UserUtil: def __init__(self, conf): self.conf = conf self.hdfsUtil = HdfsUtil(conf) self.pgUtil = PostgresUtil(conf) def printReport(self): self.printUserSpaceQuotas() self.printUserINodeQuotas() self.printTopKSpaceUsers() self.printTopKINodeUsers() def printUserSpaceQuotas(self): printInfo("Getting space quota status for users") quotas = self.hdfsUtil.getSpaceQuotas(self.getUserDirectories()) if len(quotas) == 0: printInfo("No user directories found in HDFS") return quotas.sort() self.__printUserSpaceQuotasInserts(quotas) row = namedtuple('Row', ['Directory', 'Quota', 'Remaining', 'QuotaHR', 'RemainingHR']) toPrint = [] for (directory, quota, remainingQuota) in quotas: quotaHR = bytes2human(quota) if quota != 'none' else quota # Sometimes the remaining quota is negative... if remainingQuota != 'inf': if long(remainingQuotaHR) < 0: remainingQuotaHR = "-" + bytes2human(-long(remainingQuota)) else: remainingQuotaHR = bytes2human(remainingQuota) else: remainingQuotaHR = remainingQuota toPrint.append(row(directory, quota, remainingQuota, quotaHR, remainingQuotaHR)) pprinttable(toPrint) def __printUserSpaceQuotasInserts(self, quotas): for (directory, quota, remainingQuota) in quotas: row = UserSpaceQuotaRow() row.username = directory[6:] row.dir = directory if not quota == 'none': row.quota = int(quota) row.quotaRemaining = int(remainingQuota) row.quotaUsed = row.quota - row.quotaRemaining else: row.quota = None row.quotaRemaining = None row.quotaUsed = None self.pgUtil.writeInsert(row) def printUserINodeQuotas(self): printInfo("Getting inode quota status for users") quotas = self.hdfsUtil.getINodeQuotas(self.getUserDirectories()) if len(quotas) == 0: printInfo("No user directories found in HDFS") return quotas.sort() self.__printUserINodeQuotasInserts(quotas) row = namedtuple('Row', ['Directory', 'Quota', 'Remaining']) toPrint = [] for (directory, quota, remainingQuota) in quotas: toPrint.append(row(directory, quota, remainingQuota)) pprinttable(toPrint) def __printUserINodeQuotasInserts(self, quotas): for (directory, quota, remainingQuota) in quotas: row = UserINodeQuotaRow() row.username = directory[6:] row.dir = directory if not quota == 'none': row.quota = int(quota) row.quotaRemaining = int(remainingQuota) row.quotaUsed = row.quota - row.quotaRemaining else: row.quota = None row.quotaRemaining = None row.quotaUsed = None self.pgUtil.writeInsert(row) def printUserSpaceQuota(self, user): printInfo("Getting space quota status for user %s" % (user)) quotas = self.hdfsUtil.getSpaceQuotas(["/user/%s" % (user)]) if len(quotas) == 0: printInfo("Directory for user %s not found in HDFS" % (quotas)) return row = namedtuple('Row', ['Directory', 'Quota', 'Remaining', 'QuotaHR', 'RemainingHR']) toPrint = [] for (directory, quota, remainingQuota) in quotas: quotaHR = bytes2human(quota) if quota != 'none' else quota remainingQuotaHR = bytes2human(remainingQuota) if remainingQuota != 'inf' else remainingQuota toPrint.append(row(directory, quota, remainingQuota, quotaHR, remainingQuotaHR)) pprinttable(toPrint) def printUserINodeQuota(self, user): printInfo("Getting inode quota status for user %s" % (user)) quotas = self.hdfsUtil.getINodeQuotas(["/user/%s" % (user)]) if len(quotas) == 0: printInfo("Directory for user %s not found in HDFS" % (quotas)) return row = namedtuple('Row', ['Directory', 'Quota', 'Remaining']) toPrint = [] for (directory, quota, remainingQuota) in quotas: toPrint.append(row(directory, quota, remainingQuota)) pprinttable(toPrint) def printTopKSpaceUsers(self): k = self.conf.get(Config.REPORTER_K) printInfo("Getting top %s space users" % (k)) sizes = self.hdfsUtil.getDirSizes(['/user']) if len(sizes) == 0: printInfo("No user directories found in HDFS") return sizes.sort(key=operator.itemgetter(1), reverse=True) if len(sizes) > k: sizes = sizes[0:k] self.__printTopKSpaceInserts(sizes) row = namedtuple('Row', ['User', 'Size', 'SizeHR']) toPrint = [] for (dir, size) in sizes: sizeHR = bytes2human(size) toPrint.append(row(dir, str(size), str(sizeHR))) pprinttable(toPrint) def __printTopKSpaceInserts(self, sizes): for (dir, size) in sizes: row = UserSpaceSizeRow() row.username = dir[6:] row.dir = dir row.size = size self.pgUtil.writeInsert(row) def printTopKINodeUsers(self): k = self.conf.get(Config.REPORTER_K) printInfo("Getting top %s inode users" % (k)) counts = self.hdfsUtil.getINodeCounts(self.getUserDirectories()) if len(counts) == 0: printInfo("No user directories found in HDFS") return counts.sort(key=operator.itemgetter(1), reverse=True) if len(counts) > k: counts = counts[0:k] self.__printTopKINodeUsersInserts(counts) row = namedtuple('Row', ['User', 'INodes']) toPrint = [] for (dir, count) in counts: toPrint.append(row(dir, str(count))) pprinttable(toPrint) def __printTopKINodeUsersInserts(self, counts): for (dir, count) in counts: row = UserINodeSizeRow() row.username = dir[6:] row.dir = dir row.size = count self.pgUtil.writeInsert(row) def setUserSpaceQuota(self, user, quota): if user == self.conf.get(Config.USER_DIR_BLACKLIST).split(): printError("User %s is in the blacklist. Remove to set quota" % (db)) return self.hdfsUtil.setSpaceQuotas(["/user/%s" % (user)], quota) def clearUserSpaceQuota(self, user): self.hdfsUtil.clearSpaceQuotas(["/user/%s" % (user)]) def setUserINodeQuota(self, user, quota): if user == self.conf.get(Config.USER_DIR_BLACKLIST).split(): printError("User %s is in the blacklist. Remove to set quota" % (db)) return self.hdfsUtil.setINodeQuotas(["/user/%s" % (user)], quota) def clearUserINodeQuota(self, user): self.hdfsUtil.clearINodeQuotas(["/user/%s" % (user)]) def getUserDirectories(self): return self.hdfsUtil.listDirs(['/user'])
class HiveUtil: def __init__(self, conf): self.conf = conf self.hdfsUtil = HdfsUtil(conf) self.pgUtil = PostgresUtil(conf) def printReport(self): printInfo("Fetching contents of Hive warehouse") hivedbdirs = self.getHiveDatabaseDirectories() self.printDatabaseQuotas(hivedbdirs) self.printTopKLargestDatabases(hivedbdirs) def getHiveDatabaseDirectories(self): hivedirs = self.hdfsUtil.listDirs([self.conf.get(Config.HIVE_WAREHOUSE_DIR)]) retval = [] for dir in hivedirs: if dir.endswith(".db"): retval.append(dir) return retval def printDatabaseQuota(self, db): printInfo("Getting quota status for Hive database %s" % (db)) quotas = self.hdfsUtil.getSpaceQuotas(["%s%s.db" % (self.conf.get(Config.HIVE_WAREHOUSE_DIR), db)]) if len(quotas) == 0: printInfo("No Hive databases found") return; row = namedtuple('Row', ['Database', 'Directory', 'Quota', 'Remaining', 'QuotaHR', 'RemainingHR']) toPrint = [] for (directory, quota, remainingQuota) in quotas: dbName = directory.replace(".db", "").replace(self.conf.get(Config.HIVE_WAREHOUSE_DIR), "") quotaHR = bytes2human(quota) if quota != 'none' else quota remainingQuotaHR = bytes2human(remainingQuota) if remainingQuota != 'inf' else remainingQuota toPrint.append(row(dbName, directory, quota, remainingQuota, quotaHR, remainingQuotaHR)) pprinttable(toPrint) def printDatabaseQuotas(self, hivedbdirs): printInfo("Getting quota status for Hive databases") hdfsDirs = [] for dir in hivedbdirs: db = self.getDbNameFromPath(dir) hdfsDirs.append("%s/%s.db" % (self.conf.get(Config.HIVE_WAREHOUSE_DIR), db)) quotas = self.hdfsUtil.getSpaceQuotas(hdfsDirs) if len(quotas) == 0: printInfo("No Hive databases found") return; quotas.sort() self.__printDBQuotasInserts(quotas) row = namedtuple('Row', ['Database', 'Directory', 'Quota', 'Remaining', 'QuotaHR', 'RemainingHR']) toPrint = [] for (directory, quota, remainingQuota) in quotas: dbName = directory.replace(".db", "").replace(self.conf.get(Config.HIVE_WAREHOUSE_DIR), "") quotaHR = bytes2human(quota) if quota != 'none' else quota remainingQuotaHR = bytes2human(remainingQuota) if remainingQuota != 'inf' else remainingQuota toPrint.append(row(dbName, directory, quota, remainingQuota, quotaHR, remainingQuotaHR)) pprinttable(toPrint) def __printDBQuotasInserts(self, quotas): for (directory, quota, remainingQuota) in quotas: row = HiveDBQuotaRow() row.database = directory.replace(".db", "").replace(self.conf.get(Config.HIVE_WAREHOUSE_DIR), "") row.dir = directory if not quota == 'none': row.quota = int(quota) row.quotaRemaining = int(remainingQuota) row.quotaUsed = row.quota - row.quotaRemaining else: row.quota = None row.quotaRemaining = None row.quotaUsed = None self.pgUtil.writeInsert(row) def getDatabaseSize(self, dbDir): sizes = self.hdfsUtil.getDirSizes([dbDir]) sum = 0 for (dir, size) in sizes: sum += size return (dbDir, sum) def printTopKLargestDatabases(self, hivedbdirs): k = self.conf.get(Config.REPORTER_K) printInfo("Getting top %s largest Hive databases" % (k)) dbSizes = [] for dbDir in hivedbdirs: tDbSize = self.getDatabaseSize(dbDir) if not tDbSize is None: dbSizes.append(tDbSize) if len(dbSizes) == 0: printInfo("No Hive databases found in HDFS") return dbSizes.sort(key=operator.itemgetter(1), reverse=True) if len(dbSizes) > k: dbSizes = dbSizes[0:k] self.__printTopKLargestDatabases(dbSizes) # print sizes row = namedtuple('Row', ['Database', 'Size', 'SizeHR']) toPrint = [] for (db, size) in dbSizes: sizeHR = bytes2human(size) toPrint.append(row(db, str(size), str(sizeHR))) pprinttable(toPrint) def __printTopKLargestDatabases(self, dbSizes): for (db, size) in dbSizes: row = HiveDBSizeRow() row.database = db row.size = size self.pgUtil.writeInsert(row) def setDatabaseQuota(self, db, quota): if db == self.conf.get(Config.HIVE_DB_BLACKLIST).split(): printError("Database %s is in the blacklist. Remove to set quota" % (db)) return printInfo("Setting quota for %s to %s bytes" % (db, quota)) self.hdfsUtil.setSpaceQuotas([self.getDbPathFromName(db)], quota) def clearDatabaseQuota(self, db): printInfo("Clearing quota for database %s" % (db)) self.hdfsUtil.clearSpaceQuotas([self.getDbPathFromName(db)]) def getDbNameFromPath(self, dir): return dir.replace(self.conf.get(Config.HIVE_WAREHOUSE_DIR), "").replace(".db", "") def getDbPathFromName(self, db): return "%s%s.db" % (self.conf.get(Config.HIVE_WAREHOUSE_DIR), db);