This repository has been archived by the owner on Jan 5, 2021. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 4
/
HdfsUtil.py
executable file
·286 lines (228 loc) · 9.42 KB
/
HdfsUtil.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
#!/usr/bin/python
import ConfigParser
import re
from collections import namedtuple
from funcs import *
from Config import *
from PostgresUtil import *
class HdfsUtil:
    """Thin wrapper around the ``hdfs`` command-line tools.

    Shells out to ``hdfs dfs``, ``hdfs dfsadmin`` and ``hdfs fsck``,
    parses their text output, and writes report rows to Postgres via
    PostgresUtil.  NOTE(review): ``getCommandOutput``/``printInfo``/
    ``printError`` plus the ``FsckRow``/``HdfsReportRow`` row types are
    expected to come from the star imports above (funcs / PostgresUtil)
    -- confirm against those modules.
    """

    def __init__(self, conf):
        # conf is the parsed tool configuration; it is handed straight
        # to PostgresUtil, which performs the actual inserts.
        self.conf = conf
        self.pgutil = PostgresUtil(conf)

    def printReport(self):
        """Print (and insert into Postgres) the fsck summary and NameNode report."""
        self.printFsckSummary()
        self.printNameNodeReport()

    @staticmethod
    def __leadingInt(text):
        """Return the integer that starts *text*: '1234 (avg ...)' -> 1234.

        Unlike the previous ``text[0:text.index(' ')]`` idiom, this does
        not raise ValueError when nothing follows the number.
        """
        return int(text.split(' ')[0])

    def listDirs(self, directories):
        """List the HDFS paths directly under *directories*.

        Returns a (possibly empty) list of path strings.
        BUGFIX: the original returned the empty *string* when the listing
        was empty; callers now always get a list.
        """
        if not directories:
            return []
        cmd = "hdfs dfs -ls %s | awk '{print $8}'" % (" ".join(directories))
        out = getCommandOutput(cmd)
        return out.split("\n") if out else []

    def getDirSizes(self, directories):
        """Return (directory, sizeInBytes) pairs from ``hdfs dfs -du``."""
        if not directories:
            return []
        cmd = "hdfs dfs -du %s | awk '{print $1,$2}'" % (" ".join(directories))
        out = getCommandOutput(cmd)
        retval = []
        for line in out.split('\n'):
            if not line:
                continue  # tolerate blank/trailing lines in the output
            size, path = line.split(' ')[0:2]
            retval.append((path, int(size)))
        return retval

    def printFsckSummary(self):
        """Run ``hdfs fsck /``, insert the parsed summary row, and print the raw text."""
        printInfo("Getting FSCK summary")
        # 2> /dev/null drops syslog noise; grep -v strips the dot-progress lines.
        cmd = "hdfs fsck / 2> /dev/null | grep -v \"^\.\""
        out = getCommandOutput(cmd)
        self.__printFsckInserts(out)
        print(out)

    def __printFsckInserts(self, lines):
        """Parse ``hdfs fsck`` summary output into an FsckRow and insert it.

        Lines like 'Total size: 123 B' are reduced to their digits; the
        tab-separated 'x blocks' style lines take the leading integer of
        the second (or third) tab field.
        """
        row = FsckRow()
        for line in lines.split("\n"):
            if "Total size" in line:
                row.totalSize = int(re.sub(r"\D", "", line))
            elif "Total dirs" in line:
                row.totalDirs = int(re.sub(r"\D", "", line))
            elif "Total files" in line:
                row.totalFiles = int(re.sub(r"\D", "", line))
            elif "Total symlinks" in line:
                row.totalSymlinks = int(re.sub(r"\D", "", line))
            elif "Total blocks" in line:
                row.totalBlocks = self.__leadingInt(line.split('\t')[1])
            elif "Minimally replicated blocks" in line:
                row.minRepBlocks = self.__leadingInt(line.split('\t')[1])
            elif "Over-replicated blocks" in line:
                row.overRepBlocks = self.__leadingInt(line.split('\t')[1])
            elif "Under-replicated blocks" in line:
                row.underRepBlocks = self.__leadingInt(line.split('\t')[1])
            elif "Mis-replicated blocks" in line:
                row.misRepBlocks = self.__leadingInt(line.split('\t')[2])
            elif "Corrupt blocks" in line:
                row.corruptBlocks = int(re.sub(r"\D", "", line))
            elif "Missing replicas" in line:
                row.missReplicas = self.__leadingInt(line.split('\t')[2])
            elif "Number of data-nodes" in line:
                row.numDataNodes = int(re.sub(r"\D", "", line))
            elif "Number of racks" in line:
                row.numRacks = int(re.sub(r"\D", "", line))
        self.pgutil.writeInsert(row)

    def printNameNodeReport(self):
        """Run ``hdfs dfsadmin -report``, insert one row per datanode, print it."""
        printInfo("Getting NameNode report")
        # Redirecting syslog to /dev/null
        cmd = "hdfs dfsadmin -report 2> /dev/null | grep -v \"^\.\""
        out = getCommandOutput(cmd)
        self.__printNameNodeReportInserts(out)
        print(out)

    def __printNameNodeReportInserts(self, lines):
        """Parse the dfsadmin report into HdfsReportRow objects and insert them."""
        row = None
        alive = True
        hitLive = False  # skip the cluster-wide header before the first datanode section
        for line in lines.split("\n"):
            if "Live datanodes:" in line:
                alive = True
                hitLive = True
            elif "Dead datanodes:" in line:
                alive = False
            if not hitLive:
                continue
            if "Name:" in line:
                # A new per-node section begins; flush the previous row first.
                if row is not None:
                    self.pgutil.writeInsert(row)
                row = HdfsReportRow()
                # "Name: 1.2.3.4:50010 (host)" -> "1.2.3.4:50010"
                row.name = line[line.index(' ') + 1:line.index('(') - 1]
                row.alive = alive
            elif "Hostname:" in line:
                row.hostname = line[line.index(' ') + 1:]
            elif "Rack:" in line:
                row.rack = line[line.index(' ') + 1:]
            elif "Decommission Status :" in line:
                row.decommission_status = line.split(' ')[3]
            elif "Configured Capacity:" in line:
                row.conf_capacity = int(line.split(' ')[2])
            elif line.startswith("DFS Used:"):
                # Prefix match so "Non DFS Used:" / "DFS Used%:" don't hit here.
                row.dfs_used = int(line.split(' ')[2])
            elif "Non DFS Used:" in line:
                row.non_dfs_used = int(line.split(' ')[3])
            elif "DFS Remaining:" in line:
                row.dfs_remaining = int(line.split(' ')[2])
            elif "DFS Used%:" in line:
                row.dfs_used_perc = float(line.split(' ')[2][:-1])  # strip '%'
            elif "DFS Remaining%:" in line:
                row.dfs_remaining_perc = float(line.split(' ')[2][:-1])
            elif "Last contact:" in line:
                row.last_contact = line[14:]  # text after "Last contact: "
        # Flush the final row.
        if row is not None:
            self.pgutil.writeInsert(row)

    def getINodeCounts(self, directories):
        """Return (directory, inodeCount) pairs.

        The count is dirs + files + blocks from an ``hdfs fsck`` of each
        directory.
        """
        if not directories:
            return []
        retval = []
        for directory in directories:
            # Redirecting syslog to /dev/null
            cmd = "hdfs fsck %s 2> /dev/null | grep Total | egrep \"Total dirs|Total files|Total blocks\"" % (directory)
            iNodeCount = 0
            for line in getCommandOutput(cmd).split('\n'):
                if 'dirs' in line or 'files' in line:
                    iNodeCount += int(line.split('\t')[1])
                elif 'blocks' in line:
                    # BUGFIX: previously only the first character of the
                    # block count was read ([0:1]), so e.g. 1234 counted as 1.
                    iNodeCount += self.__leadingInt(line.split('\t')[1])
            retval.append((directory, iNodeCount))
        return retval

    def __runQuotaCmd(self, baseCmd, directories):
        """Run *baseCmd* with *directories* appended and return its output.

        Exits the process with status 1 (matching the original behavior of
        every quota method) when the command fails, e.g. for a missing
        directory.
        """
        cmd = baseCmd + " " + " ".join(directories)
        try:
            return getCommandOutput(cmd)
        except subprocess.CalledProcessError:
            printError("Directories not found: %s" % (cmd))
            sys.exit(1)

    def getSpaceQuotas(self, directories):
        """Return (directory, spaceQuota, remainingSpaceQuota) triples.

        Column layout of ``hdfs dfs -count -q``: QUOTA REM_QUOTA
        SPACE_QUOTA REM_SPACE_QUOTA DIR_COUNT FILE_COUNT CONTENT_SIZE
        PATHNAME.
        """
        if not directories:
            return []
        retval = []
        for quota in self.__runQuotaCmd("hdfs dfs -count -q", directories).split("\n"):
            fields = quota.split()
            retval.append((fields[7], fields[2], fields[3]))
        return retval

    def setSpaceQuotas(self, directories, quota):
        """Set the space quota (bytes) on each directory."""
        if not directories:
            return []
        self.__runQuotaCmd("hdfs dfsadmin -setSpaceQuota %s" % (quota), directories)

    def clearSpaceQuotas(self, directories):
        """Remove the space quota from each directory."""
        if not directories:
            return []
        self.__runQuotaCmd("hdfs dfsadmin -clrSpaceQuota", directories)

    def getINodeQuotas(self, directories):
        """Return (directory, nameQuota, remainingNameQuota) triples."""
        if not directories:
            return []
        retval = []
        for quota in self.__runQuotaCmd("hdfs dfs -count -q", directories).split("\n"):
            fields = quota.split()
            # TODO(review): verify these column indexes (QUOTA, REM_QUOTA)
            # against the actual `hdfs dfs -count -q` output.
            retval.append((fields[7], fields[0], fields[1]))
        return retval

    def setINodeQuotas(self, directories, quota):
        """Set the inode (name) quota on each directory."""
        if not directories:
            return []
        self.__runQuotaCmd("hdfs dfsadmin -setQuota %s" % (quota), directories)

    def clearINodeQuotas(self, directories):
        """Remove the inode (name) quota from each directory."""
        if not directories:
            return []
        self.__runQuotaCmd("hdfs dfsadmin -clrQuota", directories)