Example #1
def __init__(self, conf):
    self.conf = conf
    self.hdfsUtil = HdfsUtil(conf)
    self.pgUtil = PostgresUtil(conf)
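The same constructor pattern repeats in every utility class on this page: take a Config object and build the helpers the class needs. PostgresUtil itself is never shown; the report code only calls open(), writeInsert(row), and close() on it, so a hypothetical stand-in for trying the snippets locally could be:

class PostgresUtil(object):
    # Hypothetical stand-in: the report code on this page relies only on
    # open(), writeInsert(row), and close(). The real class presumably
    # renders each row object as an SQL INSERT statement.
    def __init__(self, conf):
        self.conf = conf
        self.rows = []

    def open(self):
        self.rows = []

    def writeInsert(self, row):
        # Collect the row objects instead of emitting SQL, for inspection.
        self.rows.append(row)

    def close(self):
        pass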
Example #2
class FsUtil:
    def __init__(self, conf):
        self.conf = conf
        self.pgUtil = PostgresUtil(conf)

    def printReport(self):
        printInfo("Grabbing and printing drive metrics")
        self.printDriveUsage()
        self.printDriveWriteTest()

    # Prints usage statistics for the drives that are available.
    # An initial existence test is run on each host; drives that do not
    # exist on a host are omitted from the report.
    def printDriveUsage(self):
        hostfile = self.conf.get(Config.HOST_FILE)
        dataDrives = self.conf.get(Config.DATA_DRIVES)

        driveMetrics = []
        for dataDrive in dataDrives.split(" "):
            # First, test whether the drive exists on each host, collecting the hosts where it does
            output = getCommandOutput("massh %s worked test -e %s" %
                                      (hostfile, dataDrive))
            if len(output) > 0:
                # We have some hosts that have this drive
                tmpHostfile = self.writeHostFile(output.split("\n"))
                cmd = "massh %s verbose \"df %s | grep -v Filesystem\" | awk '{print $1,$8,$7,$4,$5,$6}'" % (
                    tmpHostfile, dataDrive)
                output = getCommandOutput(cmd).split("\n")

                for line in output:
                    (host, drive, perc, size, used, avail) = line.split(" ")
                    driveMetrics.append(
                        (host.replace("[", "").replace("]", ""), drive,
                         perc.replace("%", ""), size, used, avail))

        driveMetrics.sort()

        self.__printDriveUsageInserts(driveMetrics)

        row = namedtuple(
            'Row', ['Host', 'Drive', 'PercentUsed', 'Size', 'Used', 'Avail'])

        toPrint = []
        for (host, drive, perc, size, used, avail) in driveMetrics:
            toPrint.append(row(host, drive, perc, size, used, avail))

        pprinttable(toPrint)

    def __printDriveUsageInserts(self, driveMetrics):
        for (host, drive, perc, size, used, avail) in driveMetrics:
            row = DriveUsageRow()
            row.host = host
            row.drive = drive
            row.perc = perc
            row.size = size
            row.used = used
            row.avail = avail

            self.pgUtil.writeInsert(row)

    def printDriveWriteTest(self):
        printInfo("Getting non-writeable drives")
        hostfile = self.conf.get(Config.HOST_FILE)
        dataDrives = self.conf.get(Config.DATA_DRIVES)

        failedDrives = []
        for drive in dataDrives.split(" "):
            # Check if the drives exist
            output = getCommandOutput("massh %s bombed sudo test -e %s" %
                                      (hostfile, drive))
            if len(output) > 0:
                for host in output.split("\n"):
                    failedDrives.append((host, drive, 'dne'))

            output = getCommandOutput("massh %s worked sudo test -e %s" %
                                      (hostfile, drive))
            if len(output) > 0:
                tmpHostFile = self.writeHostFile(output.split("\n"))
                output = getCommandOutput("massh %s bombed sudo test -w %s" %
                                          (tmpHostFile, drive))
                if len(output) > 0:
                    for host in output.split("\n"):
                        failedDrives.append((host, drive, 'ro'))

        if len(failedDrives) == 0:
            printInfo("No non-writeable drives to report")
        else:
            row = namedtuple('Row', ['Host', 'Drive', 'Reason'])

            failedDrives.sort()

            self.__printDriveWriteTest(failedDrives)

            toPrint = []
            for (host, drive, reason) in failedDrives:
                toPrint.append(row(host, drive, reason))

            pprinttable(toPrint)

    def __printDriveWriteTest(self, failedDrives):
        for (host, drive, reason) in failedDrives:
            row = DriveWriteTestRow()
            row.host = host
            row.drive = drive
            row.reason = reason

            self.pgUtil.writeInsert(row)

    def writeHostFile(self, hosts):
        fName = self.conf.get(Config.TMP_DIR) + "/fsutil.txt"
        f = open(fName, 'w')

        for item in hosts:
            f.write(item + "\n")

        f.flush()
        f.close()
        return fName
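FsUtil, like the other classes here, shells out through a getCommandOutput helper and logs via printInfo/printError, none of which appear on this page. A minimal sketch consistent with how they are used (the command strings contain shell pipes, and the HdfsUtil example below catches subprocess.CalledProcessError) might be:

import subprocess
import sys

def getCommandOutput(cmd):
    # The command strings contain pipes and quoting, so they must go
    # through a shell. check_output raises CalledProcessError on a
    # non-zero exit status, matching the callers' try/except blocks.
    return subprocess.check_output(cmd, shell=True).strip()

def printInfo(msg):
    print "INFO: %s" % (msg)

def printError(msg):
    sys.stderr.write("ERROR: %s\n" % (msg))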
Example #3
class UserUtil:
    def __init__(self, conf):
        self.conf = conf
        self.hdfsUtil = HdfsUtil(conf)
        self.pgUtil = PostgresUtil(conf)

    def printReport(self):
        self.printUserSpaceQuotas()
        self.printUserINodeQuotas()
        self.printTopKSpaceUsers()
        self.printTopKINodeUsers()

    def printUserSpaceQuotas(self):
        printInfo("Getting space quota status for users")

        quotas = self.hdfsUtil.getSpaceQuotas(self.getUserDirectories())

        if len(quotas) == 0:
            printInfo("No user directories found in HDFS")
            return

        quotas.sort()

        self.__printUserSpaceQuotasInserts(quotas)

        row = namedtuple(
            'Row',
            ['Directory', 'Quota', 'Remaining', 'QuotaHR', 'RemainingHR'])

        toPrint = []
        for (directory, quota, remainingQuota) in quotas:
            quotaHR = bytes2human(quota) if quota != 'none' else quota
            # Sometimes the remaining quota is negative...
            if remainingQuota != 'inf':
                if long(remainingQuota) < 0:
                    remainingQuotaHR = "-" + bytes2human(-long(remainingQuota))
                else:
                    remainingQuotaHR = bytes2human(remainingQuota)
            else:
                remainingQuotaHR = remainingQuota
            toPrint.append(
                row(directory, quota, remainingQuota, quotaHR,
                    remainingQuotaHR))

        pprinttable(toPrint)

    def __printUserSpaceQuotasInserts(self, quotas):
        for (directory, quota, remainingQuota) in quotas:
            row = UserSpaceQuotaRow()
            row.username = directory[6:]
            row.dir = directory

            if not quota == 'none':
                row.quota = int(quota)
                row.quotaRemaining = int(remainingQuota)
                row.quotaUsed = row.quota - row.quotaRemaining
            else:
                row.quota = None
                row.quotaRemaining = None
                row.quotaUsed = None

            self.pgUtil.writeInsert(row)

    def printUserINodeQuotas(self):
        printInfo("Getting inode quota status for users")

        quotas = self.hdfsUtil.getINodeQuotas(self.getUserDirectories())

        if len(quotas) == 0:
            printInfo("No user directories found in HDFS")
            return

        quotas.sort()

        self.__printUserINodeQuotasInserts(quotas)

        row = namedtuple('Row', ['Directory', 'Quota', 'Remaining'])

        toPrint = []
        for (directory, quota, remainingQuota) in quotas:
            toPrint.append(row(directory, quota, remainingQuota))

        pprinttable(toPrint)

    def __printUserINodeQuotasInserts(self, quotas):
        for (directory, quota, remainingQuota) in quotas:
            row = UserINodeQuotaRow()
            row.username = directory[6:]
            row.dir = directory

            if not quota == 'none':
                row.quota = int(quota)
                row.quotaRemaining = int(remainingQuota)
                row.quotaUsed = row.quota - row.quotaRemaining
            else:
                row.quota = None
                row.quotaRemaining = None
                row.quotaUsed = None

            self.pgUtil.writeInsert(row)

    def printUserSpaceQuota(self, user):
        printInfo("Getting space quota status for user %s" % (user))

        quotas = self.hdfsUtil.getSpaceQuotas(["/user/%s" % (user)])

        if len(quotas) == 0:
            printInfo("Directory for user %s not found in HDFS" % (quotas))
            return

        row = namedtuple(
            'Row',
            ['Directory', 'Quota', 'Remaining', 'QuotaHR', 'RemainingHR'])

        toPrint = []
        for (directory, quota, remainingQuota) in quotas:
            quotaHR = bytes2human(quota) if quota != 'none' else quota
            remainingQuotaHR = bytes2human(
                remainingQuota) if remainingQuota != 'inf' else remainingQuota
            toPrint.append(
                row(directory, quota, remainingQuota, quotaHR,
                    remainingQuotaHR))

        pprinttable(toPrint)

    def printUserINodeQuota(self, user):
        printInfo("Getting inode quota status for user %s" % (user))

        quotas = self.hdfsUtil.getINodeQuotas(["/user/%s" % (user)])

        if len(quotas) == 0:
            printInfo("Directory for user %s not found in HDFS" % (quotas))
            return

        row = namedtuple('Row', ['Directory', 'Quota', 'Remaining'])

        toPrint = []
        for (directory, quota, remainingQuota) in quotas:
            toPrint.append(row(directory, quota, remainingQuota))

        pprinttable(toPrint)

    def printTopKSpaceUsers(self):
        k = self.conf.get(Config.REPORTER_K)
        printInfo("Getting top %s space users" % (k))

        sizes = self.hdfsUtil.getDirSizes(['/user'])

        if len(sizes) == 0:
            printInfo("No user directories found in HDFS")
            return

        sizes.sort(key=operator.itemgetter(1), reverse=True)

        if len(sizes) > k:
            sizes = sizes[0:k]

        self.__printTopKSpaceInserts(sizes)

        row = namedtuple('Row', ['User', 'Size', 'SizeHR'])

        toPrint = []
        for (dir, size) in sizes:
            sizeHR = bytes2human(size)
            toPrint.append(row(dir, str(size), str(sizeHR)))

        pprinttable(toPrint)

    def __printTopKSpaceInserts(self, sizes):
        for (dir, size) in sizes:
            row = UserSpaceSizeRow()
            row.username = dir[6:]
            row.dir = dir
            row.size = size

            self.pgUtil.writeInsert(row)

    def printTopKINodeUsers(self):
        k = self.conf.get(Config.REPORTER_K)
        printInfo("Getting top %s inode users" % (k))

        counts = self.hdfsUtil.getINodeCounts(self.getUserDirectories())

        if len(counts) == 0:
            printInfo("No user directories found in HDFS")
            return

        counts.sort(key=operator.itemgetter(1), reverse=True)

        if len(counts) > k:
            counts = counts[0:k]

        self.__printTopKINodeUsersInserts(counts)

        row = namedtuple('Row', ['User', 'INodes'])

        toPrint = []
        for (dir, count) in counts:
            toPrint.append(row(dir, str(count)))

        pprinttable(toPrint)

    def __printTopKINodeUsersInserts(self, counts):
        for (dir, count) in counts:
            row = UserINodeSizeRow()
            row.username = dir[6:]
            row.dir = dir
            row.size = count

            self.pgUtil.writeInsert(row)

    def setUserSpaceQuota(self, user, quota):
        if user in self.conf.get(Config.USER_DIR_BLACKLIST).split():
            printError("User %s is in the blacklist.  Remove to set quota" %
                       (user))
            return

        self.hdfsUtil.setSpaceQuotas(["/user/%s" % (user)], quota)

    def clearUserSpaceQuota(self, user):
        self.hdfsUtil.clearSpaceQuotas(["/user/%s" % (user)])

    def setUserINodeQuota(self, user, quota):
        if user in self.conf.get(Config.USER_DIR_BLACKLIST).split():
            printError("User %s is in the blacklist.  Remove to set quota" %
                       (user))
            return

        self.hdfsUtil.setINodeQuotas(["/user/%s" % (user)], quota)

    def clearUserINodeQuota(self, user):
        self.hdfsUtil.clearINodeQuotas(["/user/%s" % (user)])

    def getUserDirectories(self):
        return self.hdfsUtil.listDirs(['/user'])
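bytes2human is another assumed helper: it turns a raw byte count into the short human-readable strings shown in the *HR report columns. A plausible minimal version:

def bytes2human(n):
    # Minimal sketch: render a byte count with binary-prefix units.
    n = float(n)
    for unit in ('B', 'K', 'M', 'G', 'T', 'P'):
        if abs(n) < 1024.0:
            return "%.1f%s" % (n, unit)
        n /= 1024.0
    return "%.1f%s" % (n, 'E')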
Example #6
def __init__(self, conf):
    self.conf = conf
    self.pgutil = PostgresUtil(conf)
        printError("`which hdfs` returned a non-zero exit code.  Make sur eyou are using this utility from an HDFS node")
        sys.exit(1)
        
    if getCommandOutput("whoami") != "gpadmin":
        printError("Please execute this utility as gpadmin")
        sys.exit(2)
        
# Report option
    if sys.argv[1] == "report":
        parser = OptionParser()
        parser.add_option("-c", "--config", dest="configFile", help="Configuration file (default phd-metrics.ini)", default="phd-metrics.ini")
        parser.add_option("-s", "--sqlfile", dest="sqlFile", help="Filename to write SQL statements to (default none)", default=None)
            
        conf = Config(parser, sys.argv[2:])
        
        pgutil = PostgresUtil(conf)
        pgutil.open()
        
        HdfsUtil(conf).printReport()
        HawqUtil(conf).printReport()
        HiveUtil(conf).printReport()
        UserUtil(conf).printReport()
        FsUtil(conf).printReport()
        
        pgutil.close()
        
# Local filesystem option
    elif sys.argv[1] == "fs-util":
        parser = OptionParser()
        parser.add_option("-c", "--config", dest="configFile", help="Configuration file (default phd-metrics.ini)", default="phd-metrics.ini")
        parser.add_option("-a", "--action", dest="action", help="Choose an action: report", default=None)
Example #8
class HiveUtil:
    def __init__(self, conf):
        self.conf = conf
        self.hdfsUtil = HdfsUtil(conf)
        self.pgUtil = PostgresUtil(conf)

    def printReport(self):
        printInfo("Fetching contents of Hive warehouse")

        hivedbdirs = self.getHiveDatabaseDirectories()

        self.printDatabaseQuotas(hivedbdirs)
        self.printTopKLargestDatabases(hivedbdirs)

    def getHiveDatabaseDirectories(self):
        hivedirs = self.hdfsUtil.listDirs(
            [self.conf.get(Config.HIVE_WAREHOUSE_DIR)])
        retval = []
        for dir in hivedirs:
            if dir.endswith(".db"):
                retval.append(dir)
        return retval

    def printDatabaseQuota(self, db):
        printInfo("Getting quota status for Hive database %s" % (db))

        quotas = self.hdfsUtil.getSpaceQuotas(
            ["%s%s.db" % (self.conf.get(Config.HIVE_WAREHOUSE_DIR), db)])

        if len(quotas) == 0:
            printInfo("No Hive databases found")
            return

        row = namedtuple('Row', [
            'Database', 'Directory', 'Quota', 'Remaining', 'QuotaHR',
            'RemainingHR'
        ])

        toPrint = []
        for (directory, quota, remainingQuota) in quotas:
            dbName = directory.replace(".db", "").replace(
                self.conf.get(Config.HIVE_WAREHOUSE_DIR), "")
            quotaHR = bytes2human(quota) if quota != 'none' else quota
            remainingQuotaHR = bytes2human(
                remainingQuota) if remainingQuota != 'inf' else remainingQuota
            toPrint.append(
                row(dbName, directory, quota, remainingQuota, quotaHR,
                    remainingQuotaHR))

        pprinttable(toPrint)

    def printDatabaseQuotas(self, hivedbdirs):
        printInfo("Getting quota status for Hive databases")

        hdfsDirs = []
        for dir in hivedbdirs:
            db = self.getDbNameFromPath(dir)
            hdfsDirs.append("%s/%s.db" %
                            (self.conf.get(Config.HIVE_WAREHOUSE_DIR), db))

        quotas = self.hdfsUtil.getSpaceQuotas(hdfsDirs)

        if len(quotas) == 0:
            printInfo("No Hive databases found")
            return

        quotas.sort()

        self.__printDBQuotasInserts(quotas)

        row = namedtuple('Row', [
            'Database', 'Directory', 'Quota', 'Remaining', 'QuotaHR',
            'RemainingHR'
        ])

        toPrint = []
        for (directory, quota, remainingQuota) in quotas:
            dbName = directory.replace(".db", "").replace(
                self.conf.get(Config.HIVE_WAREHOUSE_DIR), "")
            quotaHR = bytes2human(quota) if quota != 'none' else quota
            remainingQuotaHR = bytes2human(
                remainingQuota) if remainingQuota != 'inf' else remainingQuota
            toPrint.append(
                row(dbName, directory, quota, remainingQuota, quotaHR,
                    remainingQuotaHR))

        pprinttable(toPrint)

    def __printDBQuotasInserts(self, quotas):
        for (directory, quota, remainingQuota) in quotas:
            row = HiveDBQuotaRow()
            row.database = directory.replace(".db", "").replace(
                self.conf.get(Config.HIVE_WAREHOUSE_DIR), "")
            row.dir = directory

            if not quota == 'none':
                row.quota = int(quota)
                row.quotaRemaining = int(remainingQuota)
                row.quotaUsed = row.quota - row.quotaRemaining
            else:
                row.quota = None
                row.quotaRemaining = None
                row.quotaUsed = None

            self.pgUtil.writeInsert(row)

    def getDatabaseSize(self, dbDir):
        sizes = self.hdfsUtil.getDirSizes([dbDir])

        sum = 0
        for (dir, size) in sizes:
            sum += size

        return (dbDir, sum)

    def printTopKLargestDatabases(self, hivedbdirs):
        k = self.conf.get(Config.REPORTER_K)
        printInfo("Getting top %s largest Hive databases" % (k))

        dbSizes = []
        for dbDir in hivedbdirs:
            tDbSize = self.getDatabaseSize(dbDir)
            if not tDbSize is None:
                dbSizes.append(tDbSize)

        if len(dbSizes) == 0:
            printInfo("No Hive databases found in HDFS")
            return

        dbSizes.sort(key=operator.itemgetter(1), reverse=True)

        if len(dbSizes) > k:
            dbSizes = dbSizes[0:k]

        self.__printTopKLargestDatabases(dbSizes)

        # print sizes
        row = namedtuple('Row', ['Database', 'Size', 'SizeHR'])

        toPrint = []
        for (db, size) in dbSizes:
            sizeHR = bytes2human(size)
            toPrint.append(row(db, str(size), str(sizeHR)))
        pprinttable(toPrint)

    def __printTopKLargestDatabases(self, dbSizes):

        for (db, size) in dbSizes:
            row = HiveDBSizeRow()
            row.database = db
            row.size = size

            self.pgUtil.writeInsert(row)

    def setDatabaseQuota(self, db, quota):

        if db in self.conf.get(Config.HIVE_DB_BLACKLIST).split():
            printError("Database %s is in the blacklist. Remove to set quota" %
                       (db))
            return

        printInfo("Setting quota for %s to %s bytes" % (db, quota))

        self.hdfsUtil.setSpaceQuotas([self.getDbPathFromName(db)], quota)

    def clearDatabaseQuota(self, db):
        printInfo("Clearing quota for database %s" % (db))
        self.hdfsUtil.clearSpaceQuotas([self.getDbPathFromName(db)])

    def getDbNameFromPath(self, dir):
        return dir.replace(self.conf.get(Config.HIVE_WAREHOUSE_DIR),
                           "").replace(".db", "")

    def getDbPathFromName(self, db):
        return "%s%s.db" % (self.conf.get(Config.HIVE_WAREHOUSE_DIR), db)
Example #9
class HdfsUtil:
    def __init__(self, conf):
        self.conf = conf
        self.pgutil = PostgresUtil(conf)

    def printReport(self):
        self.printFsckSummary()
        self.printNameNodeReport()

    def listDirs(self, directories):
        if len(directories) == 0:
            return []

        dirStr = ""
        for d in directories:
            dirStr = dirStr + "%s " % (d)

        cmd = "hdfs dfs -ls %s | awk '{print $8}'" % (dirStr)

        out = getCommandOutput(cmd)

        if len(out) > 0:
            return out.split("\n")
        else:
            # Return an empty list (not the empty string) so callers can
            # always iterate over the result.
            return []

    def getDirSizes(self, directories):
        if len(directories) == 0:
            return []

        cmd = "hdfs dfs -du "
        for directory in directories:
            cmd = cmd + " " + directory

        cmd = cmd + " | awk '{print $1,$2}'"

        out = getCommandOutput(cmd)

        if len(out) == 0:
            return []
        else:
            retval = []
            for line in out.split('\n'):
                # Returns list of (dir, size) pairs
                retval.append((line.split(' ')[1], int(line.split(' ')[0])))

            return retval

    def printFsckSummary(self):
        printInfo("Getting FSCK summary")
        # Send log output on stderr to /dev/null
        cmd = "hdfs fsck / 2> /dev/null | grep -v \"^\.\""

        out = getCommandOutput(cmd)

        self.__printFsckInserts(out)

        print out

    def __printFsckInserts(self, lines):

        row = FsckRow()

        for line in lines.split("\n"):
            if "Total size" in line:
                row.totalSize = int(re.sub(r"\D", "", line))
            elif "Total dirs" in line:
                row.totalDirs = int(re.sub(r"\D", "", line))
            elif "Total files" in line:
                row.totalFiles = int(re.sub(r"\D", "", line))
            elif "Total symlinks" in line:
                row.totalSymlinks = int(re.sub(r"\D", "", line))
            elif "Total blocks" in line:
                tmp = line.split('\t')[1]
                row.totalBlocks = int(tmp[0:tmp.index(' ')])
            elif "Minimally replicated blocks" in line:
                tmp = line.split('\t')[1]
                row.minRepBlocks = int(tmp[0:tmp.index(' ')])
            elif "Over-replicated blocks" in line:
                tmp = line.split('\t')[1]
                row.overRepBlocks = int(tmp[0:tmp.index(' ')])
            elif "Under-replicated blocks" in line:
                tmp = line.split('\t')[1]
                row.underRepBlocks = int(tmp[0:tmp.index(' ')])
            elif "Mis-replicated blocks" in line:
                tmp = line.split('\t')[2]
                row.misRepBlocks = int(tmp[0:tmp.index(' ')])
            elif "Corrupt blocks" in line:
                row.corruptBlocks = int(re.sub(r"\D", "", line))
            elif "Missing replicas" in line:
                tmp = line.split('\t')[2]
                row.missReplicas = int(tmp[0:tmp.index(' ')])
            elif "Number of data-nodes" in line:
                row.numDataNodes = int(re.sub(r"\D", "", line))
            elif "Number of racks" in line:
                row.numRacks = int(re.sub(r"\D", "", line))

        self.pgutil.writeInsert(row)

    def printNameNodeReport(self):
        printInfo("Getting NameNode report")
        # Send log output on stderr to /dev/null
        cmd = "hdfs dfsadmin -report 2> /dev/null | grep -v \"^\.\""
        out = getCommandOutput(cmd)

        self.__printNameNodeReportInserts(out)

        print out

    def __printNameNodeReportInserts(self, lines):

        row = None
        alive = True
        hitLive = False
        for line in lines.split("\n"):
            if "Live datanodes:" in line:
                alive = True
                hitLive = True
            elif "Dead datanodes:" in line:
                alive = False

            if not hitLive:
                continue

            if "Name:" in line:
                # Write out the previous row if we've hit a new node report
                if not row is None:
                    self.pgutil.writeInsert(row)
                # make a new row
                row = HdfsReportRow()
                row.name = line[line.index(' ') + 1:line.index('(') - 1]
                row.alive = alive
            elif "Hostname:" in line:
                row.hostname = line[line.index(' ') + 1:]
            elif "Rack:" in line:
                row.rack = line[line.index(' ') + 1:]
            elif "Decommission Status :" in line:
                row.decommission_status = line.split(' ')[3]
            elif "Configured Capacity:" in line:
                row.conf_capacity = int(line.split(' ')[2])
            elif "DFS Used:" in line[0:9]:
                row.dfs_used = int(line.split(' ')[2])
            elif "Non DFS Used:" in line:
                row.non_dfs_used = int(line.split(' ')[3])
            elif "DFS Remaining:" in line:
                row.dfs_remaining = int(line.split(' ')[2])
            elif "DFS Used%:" in line:
                row.dfs_used_perc = float(
                    line.split(' ')[2][0:len(line.split(' ')[2]) - 1])
            elif "DFS Remaining%:" in line:
                row.dfs_remaining_perc = float(
                    line.split(' ')[2][0:len(line.split(' ')[2]) - 1])
            elif "Last contact:" in line:
                row.last_contact = line[14:]

        # Write out the last row
        if not row is None:
            self.pgutil.writeInsert(row)

    def getINodeCounts(self, directories):
        if len(directories) == 0:
            return []

        retval = []
        for directory in directories:
            # Send log output on stderr to /dev/null
            cmd = "hdfs fsck %s 2> /dev/null | grep Total | egrep \"Total dirs|Total files|Total blocks\"" % (
                directory)

            iNodeCount = 0
            for line in getCommandOutput(cmd).split('\n'):
                if 'dirs' in line:
                    iNodeCount += int(line.split('\t')[1])
                if 'files' in line:
                    iNodeCount += int(line.split('\t')[1])
                if 'blocks' in line:
                    # Parse the leading count, e.g. "1234 (Total open file blocks...)"
                    iNodeCount += int(line.split('\t')[1].split(' ')[0])

            retval.append((directory, iNodeCount))

        return retval

    def getSpaceQuotas(self, directories):
        if len(directories) == 0:
            return []

        cmd = "hdfs dfs -count -q"
        for directory in directories:
            cmd = cmd + " " + directory

        try:
            quotas = getCommandOutput(cmd).split("\n")
        except subprocess.CalledProcessError:
            printError("Directories not found: %s" % (cmd))
            sys.exit(1)

        retval = []
        for quota in quotas:
            # Returns list of (directory, quota, remainingQuota)
            retval.append(
                (quota.split()[7], quota.split()[2], quota.split()[3]))
        return retval

    def setSpaceQuotas(self, directories, quota):
        if len(directories) == 0:
            return []

        cmd = "hdfs dfsadmin -setSpaceQuota %s" % (quota)
        for directory in directories:
            cmd = cmd + " " + directory

        try:
            getCommandOutput(cmd)
        except subprocess.CalledProcessError:
            printError("Directories not found: %s" % (cmd))
            sys.exit(1)

    def clearSpaceQuotas(self, directories):
        if len(directories) == 0:
            return []

        cmd = "hdfs dfsadmin -clrSpaceQuota"
        for directory in directories:
            cmd = cmd + " " + directory

        try:
            getCommandOutput(cmd)
        except subprocess.CalledProcessError:
            printError("Directories not found: %s" % (cmd))
            sys.exit(1)

    def getINodeQuotas(self, directories):
        if len(directories) == 0:
            return []

        cmd = "hdfs dfs -count -q"
        for directory in directories:
            cmd = cmd + " " + directory

        try:
            quotas = getCommandOutput(cmd).split("\n")
        except subprocess.CalledProcessError:
            printError("Directories not found: %s" % (cmd))
            sys.exit(1)

        retval = []
        for quota in quotas:
            # TODO get the proper indexes from the count
            retval.append(
                (quota.split()[7], quota.split()[0], quota.split()[1]))
        return retval

    def setINodeQuotas(self, directories, quota):
        if len(directories) == 0:
            return []

        cmd = "hdfs dfsadmin -setQuota %s" % (quota)
        for directory in directories:
            cmd = cmd + " " + directory

        try:
            getCommandOutput(cmd).split("\n")
        except subprocess.CalledProcessError:
            printError("Directories not found: %s" % (cmd))
            sys.exit(1)

    def clearINodeQuotas(self, directories):
        if len(directories) == 0:
            return []

        cmd = "hdfs dfsadmin -clrQuota"
        for directory in directories:
            cmd = cmd + " " + directory

        try:
            getCommandOutput(cmd).split("\n")
        except subprocess.CalledProcessError:
            printError("Directories not found: %s" % (cmd))
            sys.exit(1)
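The TODO in getINodeQuotas concerns column positions. `hdfs dfs -count -q` prints eight columns per directory, which matches the indexes used above: [0]/[1] are the name (inode) quota and its remainder, [2]/[3] the space quota and its remainder, and [7] the path. A small parsing sketch:

def parseCountQuotaLine(line):
    # Column layout of `hdfs dfs -count -q`, one row per directory:
    #   0 QUOTA, 1 REMAINING_QUOTA, 2 SPACE_QUOTA, 3 REMAINING_SPACE_QUOTA,
    #   4 DIR_COUNT, 5 FILE_COUNT, 6 CONTENT_SIZE, 7 PATHNAME
    f = line.split()
    return {
        'quota': f[0], 'remainingQuota': f[1],
        'spaceQuota': f[2], 'remainingSpaceQuota': f[3],
        'dirCount': f[4], 'fileCount': f[5],
        'contentSize': f[6], 'path': f[7],
    }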
Example #10
        parser = OptionParser()
        parser.add_option("-c",
                          "--config",
                          dest="configFile",
                          help="Configuration file (default phd-metrics.ini)",
                          default="phd-metrics.ini")
        parser.add_option(
            "-s",
            "--sqlfile",
            dest="sqlFile",
            help="Filename to write SQL statements to (default none)",
            default=None)

        conf = Config(parser, sys.argv[2:])

        pgutil = PostgresUtil(conf)
        pgutil.open()

        HdfsUtil(conf).printReport()
        HawqUtil(conf).printReport()
        HiveUtil(conf).printReport()
        UserUtil(conf).printReport()
        FsUtil(conf).printReport()

        pgutil.close()

# Local filesystem option
    elif sys.argv[1] == "fs-util":
        parser = OptionParser()
        parser.add_option("-c",
                          "--config",