This repository has been archived by the owner on Jan 5, 2021. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 4
/
HiveUtil.py
executable file
·167 lines (120 loc) · 5.82 KB
/
HiveUtil.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
#!/usr/bin/python
import ConfigParser
import operator
from collections import namedtuple
from funcs import *
from Config import *
from HdfsUtil import *
from PostgresUtil import *
class HiveUtil:
    """Report on and manage HDFS space quotas of Hive database directories.

    Database directories live under the configured
    ``Config.HIVE_WAREHOUSE_DIR`` and are recognised by a ``.db`` suffix.
    HDFS operations are delegated to ``HdfsUtil``; every reported row is
    also handed to ``PostgresUtil.writeInsert``.
    """

    # Suffix that marks a warehouse sub-directory as a Hive database.
    _DB_SUFFIX = ".db"

    def __init__(self, conf):
        """Store ``conf`` and build the HDFS and Postgres helpers from it."""
        self.conf = conf
        self.hdfsUtil = HdfsUtil(conf)
        self.pgUtil = PostgresUtil(conf)

    # ------------------------------------------------------------------
    # Pure helpers (no I/O, no configuration access)
    # ------------------------------------------------------------------
    @staticmethod
    def _joinDbPath(warehouse, db):
        """Return ``<warehouse>/<db>.db`` with exactly one separating slash.

        Works whether or not ``warehouse`` carries a trailing slash, so all
        callers build the same path for the same database.
        """
        return "%s/%s%s" % (warehouse.rstrip("/"), db.strip("/"),
                            HiveUtil._DB_SUFFIX)

    @staticmethod
    def _stripDbPath(warehouse, path):
        """Return the database name encoded in ``path``.

        Only the first occurrence of ``warehouse`` and a trailing ``.db`` are
        removed, so a database whose name happens to contain ``.db`` or the
        warehouse path is not mangled.
        """
        name = path.replace(warehouse, "", 1).strip("/")
        if name.endswith(HiveUtil._DB_SUFFIX):
            name = name[:-len(HiveUtil._DB_SUFFIX)]
        return name

    @staticmethod
    def _isBlacklisted(db, blacklist):
        """Tell whether ``db`` is one of the whitespace-separated names in
        ``blacklist`` (``None`` or empty means nothing is blacklisted)."""
        return db in (blacklist or "").split()

    @staticmethod
    def _largest(dbSizes, k):
        """Return the ``k`` entries of ``dbSizes`` with the biggest size.

        ``dbSizes`` holds ``(name, size)`` pairs. ``k`` is coerced with
        ``int`` because configuration values may arrive as strings.
        """
        ranked = sorted(dbSizes, key=operator.itemgetter(1), reverse=True)
        return ranked[:int(k)]

    # ------------------------------------------------------------------
    # Reporting
    # ------------------------------------------------------------------
    def printReport(self):
        """Print the quota table and the top-K size table for all databases."""
        printInfo("Fetching contents of Hive warehouse")
        hivedbdirs = self.getHiveDatabaseDirectories()
        self.printDatabaseQuotas(hivedbdirs)
        self.printTopKLargestDatabases(hivedbdirs)

    def getHiveDatabaseDirectories(self):
        """Return the warehouse sub-directories whose name ends in ``.db``."""
        warehouse = self.conf.get(Config.HIVE_WAREHOUSE_DIR)
        return [path for path in self.hdfsUtil.listDirs([warehouse])
                if path.endswith(self._DB_SUFFIX)]

    def printDatabaseQuota(self, db):
        """Print the quota table for the single database named ``db``."""
        printInfo("Getting quota status for Hive database %s" % (db,))
        quotas = self.hdfsUtil.getSpaceQuotas([self.getDbPathFromName(db)])
        if not quotas:
            printInfo("No Hive databases found")
            return
        self._printQuotaTable(quotas)

    def printDatabaseQuotas(self, hivedbdirs):
        """Print (and record via Postgres inserts) quotas for ``hivedbdirs``.

        Args:
            hivedbdirs: database directory paths, as returned by
                ``getHiveDatabaseDirectories``.
        """
        printInfo("Getting quota status for Hive databases")
        # Round-trip through the name so every path has the canonical form.
        hdfsDirs = [self.getDbPathFromName(self.getDbNameFromPath(path))
                    for path in hivedbdirs]
        quotas = self.hdfsUtil.getSpaceQuotas(hdfsDirs)
        if not quotas:
            printInfo("No Hive databases found")
            return
        quotas.sort()
        self.__printDBQuotasInserts(quotas)
        self._printQuotaTable(quotas)

    def _printQuotaTable(self, quotas):
        """Pretty-print ``(directory, quota, remainingQuota)`` tuples.

        The sentinels ``'none'`` (no quota) and ``'inf'`` (unlimited
        remaining) are printed verbatim instead of being humanised.
        """
        row = namedtuple('Row', ['Database', 'Directory', 'Quota',
                                 'Remaining', 'QuotaHR', 'RemainingHR'])
        toPrint = []
        for (directory, quota, remainingQuota) in quotas:
            dbName = self.getDbNameFromPath(directory)
            quotaHR = bytes2human(quota) if quota != 'none' else quota
            if remainingQuota != 'inf':
                remainingQuotaHR = bytes2human(remainingQuota)
            else:
                remainingQuotaHR = remainingQuota
            toPrint.append(row(dbName, directory, quota, remainingQuota,
                               quotaHR, remainingQuotaHR))
        pprinttable(toPrint)

    def __printDBQuotasInserts(self, quotas):
        """Write one ``HiveDBQuotaRow`` insert per quota tuple.

        Numeric fields are left as ``None`` when the directory has no quota
        (``quota == 'none'``).
        """
        for (directory, quota, remainingQuota) in quotas:
            row = HiveDBQuotaRow()
            row.database = self.getDbNameFromPath(directory)
            row.dir = directory
            if quota != 'none':
                row.quota = int(quota)
                row.quotaRemaining = int(remainingQuota)
                row.quotaUsed = row.quota - row.quotaRemaining
            else:
                row.quota = None
                row.quotaRemaining = None
                row.quotaUsed = None
            self.pgUtil.writeInsert(row)

    def getDatabaseSize(self, dbDir):
        """Return ``(dbDir, total)`` where ``total`` sums the sizes that
        ``HdfsUtil.getDirSizes`` reports for ``dbDir``."""
        sizes = self.hdfsUtil.getDirSizes([dbDir])
        total = sum(size for (_, size) in sizes)
        return (dbDir, total)

    def printTopKLargestDatabases(self, hivedbdirs):
        """Print (and record) the ``Config.REPORTER_K`` largest databases.

        Args:
            hivedbdirs: database directory paths to measure.
        """
        k = self.conf.get(Config.REPORTER_K)
        printInfo("Getting top %s largest Hive databases" % (k,))

        dbSizes = []
        for dbDir in hivedbdirs:
            dbSize = self.getDatabaseSize(dbDir)
            if dbSize is not None:
                dbSizes.append(dbSize)

        if not dbSizes:
            printInfo("No Hive databases found in HDFS")
            return

        dbSizes = self._largest(dbSizes, k)
        self.__printTopKLargestDatabases(dbSizes)

        # print sizes
        row = namedtuple('Row', ['Database', 'Size', 'SizeHR'])
        toPrint = [row(db, str(size), str(bytes2human(size)))
                   for (db, size) in dbSizes]
        pprinttable(toPrint)

    def __printTopKLargestDatabases(self, dbSizes):
        """Write one ``HiveDBSizeRow`` insert per ``(database, size)`` pair."""
        for (db, size) in dbSizes:
            row = HiveDBSizeRow()
            row.database = db
            row.size = size
            self.pgUtil.writeInsert(row)

    # ------------------------------------------------------------------
    # Quota management
    # ------------------------------------------------------------------
    def setDatabaseQuota(self, db, quota):
        """Set the HDFS space quota of database ``db`` to ``quota`` bytes.

        Databases listed in ``Config.HIVE_DB_BLACKLIST`` are refused: an
        error is printed and nothing is changed.
        """
        if self._isBlacklisted(db, self.conf.get(Config.HIVE_DB_BLACKLIST)):
            printError("Database %s is in the blacklist. Remove to set quota" % (db,))
            return
        printInfo("Setting quota for %s to %s bytes" % (db, quota))
        self.hdfsUtil.setSpaceQuotas([self.getDbPathFromName(db)], quota)

    def clearDatabaseQuota(self, db):
        """Remove the HDFS space quota of database ``db``."""
        printInfo("Clearing quota for database %s" % (db,))
        self.hdfsUtil.clearSpaceQuotas([self.getDbPathFromName(db)])

    # ------------------------------------------------------------------
    # Name <-> path conversion
    # ------------------------------------------------------------------
    def getDbNameFromPath(self, dir):
        """Return the database name for the warehouse directory ``dir``."""
        return self._stripDbPath(self.conf.get(Config.HIVE_WAREHOUSE_DIR), dir)

    def getDbPathFromName(self, db):
        """Return the warehouse directory path of the database ``db``."""
        return self._joinDbPath(self.conf.get(Config.HIVE_WAREHOUSE_DIR), db)