-
Notifications
You must be signed in to change notification settings - Fork 0
/
mr.py
executable file
·746 lines (515 loc) · 21.2 KB
/
mr.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
#!/usr/bin/env python
import marshal
import types
import sys
import glob
import os
import os.path as path
import hashlib
import rpyc
from rpyc.utils.server import ForkingServer
import redis
import errno
from socket import gethostname
import time
import logging
import random
from multiprocessing import Process
from heapq import merge
from flask import Flask
logging.basicConfig(level='DEBUG')
#### CONFIG ####
REDIS_HOST = 'localhost'
REDIS_PORT = 6379
REDIS_DB = '10'
USE_SLAVES_CACHE = False # this doesn't seem to work too well, don't use it
REPLICATION_FACTOR = 3
#### UTIL FUNCTIONS ####
def _serialize_function(func):
'''Serializes a function. Note that it only serialized the code, not the
default arguments or the globals.'''
return marshal.dumps(func.func_code)
def _deserialize_function(sfunc):
'''Deserializes a function from _serialize_function. This method returns a function.'''
return types.FunctionType(marshal.loads(sfunc), globals())
def _group_sorted(list_of_kv):
'''Groups a list of key value pairs by keys. This method assumes that list_of_kv is sorted.
For example: [(1,2),(2,3),(2,4)] -> [ (1, [2]), (2, [3,4]) ]
This method produces a generator, not a list.'''
if len(list_of_kv) == 0:
return
cur_k = list_of_kv[0][0]
cur_vs = []
for k, v in list_of_kv:
if k != cur_k:
yield cur_k, cur_vs
cur_k = k
cur_vs = []
cur_vs.append(v)
yield cur_k, cur_vs
def _mkdirp(path):
'''Mimicks the behavior of "mkdir -p". It creates what local directories it needs to create and doens't
complain when something already exists.'''
path = path.strip()
if len(path) == 0:
raise ValueError("I can't make a directory with no name")
try:
os.makedirs(path)
except OSError as exc:
if exc.errno == errno.EEXIST and os.path.isdir(path):
pass
else:
raise exc
def _pathcheck(file_name):
'''Checks to see if 'file_name' is a valid file name for use in our DFS'''
file_name = file_name.strip()
if len(file_name) == 0:
raise ValueError("The file name cannot be empty")
for token in file_name.split('/'):
if not token.isalnum():
raise ValueError("This file name '%s' is not valid, we only accept letters or numbers that are unempty" % file_name)
_REDIS = redis.Redis(host=REDIS_HOST, port=REDIS_PORT, db=REDIS_DB)
def _get_redis():
    '''Return a live redis client built from the config at the top of mr.py.
    The shared client is pinged first and replaced if the connection died.'''
    global _REDIS
    try:
        _REDIS.ping()
    except redis.ConnectionError:
        # connection went stale -- build a fresh client
        _REDIS = redis.Redis(host=REDIS_HOST, port=REDIS_PORT, db=REDIS_DB)
    return _REDIS
def _get_timestamp():
'''Returns the current timestamp.'''
return time.time()
def _cat_host(hostname, port):
'''Returns a single string containing the host and the port '1.2.3.4:444' from a hostname '1.2.3.4' and a port 444'''
return str(hostname) + ':' + str(port)
def _split_hostport(hostnameport):
'''A convenience function that takes a string "x.x.x.x:yyyy' and splits it into "x.x.x.x", yyyy'''
h, p = hostnameport.split(':')
return h, int(p)
def _connect(hostname, port=None):
    '''Open an rpyc connection to a slave.

    Accepts either _connect('1.2.3.4:1234') or _connect('1.2.3.4', 1234).
    On any connection failure the slave is unregistered from the registry
    and the exception is re-raised to the caller.'''
    if port is None:
        hostname, port = hostname.split(':')
    try:
        return rpyc.connect(str(hostname), int(port))
    except Exception as e:
        logging.warning(' '.join(['There was a problem connecting to', hostname, str(port),
                                  ':', str(e), '... i\'m going to unregister it']))
        _unregister(hostname, port)
        raise
#### STANDARD LIBRARY ####
def identity_mapper(key, val, params):
    '''Default mapper: emit the (key, val) pair exactly as received.'''
    yield (key, val)
def identity_reducer(key, values, params):
    '''Default reducer: flatten (key, values) into one (key, v) pair per
    value, preserving order.'''
    for value in values:
        yield key, value
def hash_partitioner(value, num_reducers, params):
    '''Default partitioner: assign `value` to a reducer bucket by taking the
    SHA-1 of its hash() string, modulo num_reducers.
    Returns an int in [0, num_reducers).

    FIX: hashlib.sha1 requires bytes on Python 3; the hash string is now
    ascii-encoded (a no-op byte-wise on Python 2, so digests are unchanged).
    '''
    digest = hashlib.sha1(str(value.__hash__()).encode('ascii')).hexdigest()
    return int(digest, 16) % num_reducers
def dfs_linereader(params):
    '''Default input reader: stream (line_number, line) pairs from the DFS
    file named by params['inputfilepath'].'''
    contents = read(params['inputfilepath'])
    return enumerate(contents.splitlines())
def basic_mapoutput(reducer_number, payload, params):
    '''Default map output: marshal the (key, value) list and store it in the
    DFS at <outputdir>/tmp/<reducer_number>/<inputfilepath>.'''
    dest = '%s/tmp/%s/%s' % (params['outputdir'], reducer_number, params['inputfilepath'])
    write(dest, marshal.dumps(payload))
def dfs_line_output(reducer_number, payload, params):
    '''Default reducer output: one tab-separated "key<TAB>value" record per
    line, written to the DFS file <outputdir>/reduceNNNNNN.'''
    records = [str(k) + '\t' + str(v) for k, v in payload]
    dest = '%s/reduce%s' % (params['outputdir'], str(reducer_number).zfill(6))
    write(dest, '\n'.join(records))
def devnull_output(reducer_number, payload, params):
    '''Reducer output sink that deliberately discards everything it is given.
    Useful when the reduce function's side effects are the real output.'''
    return None
def basic_shuffle(reducer_number, params):
    '''Default shuffle: load and unmarshal every map-output file destined for
    this reducer, i.e. everything under <outputdir>/tmp/<reducer_number>/.'''
    pattern = '%s/tmp/%d/*' % (params['outputdir'], int(reducer_number))
    return [marshal.loads(read(name)) for name in ls(pattern)]
def basic_sort(mapper_outputs, params):
    '''Default sort/merge: lazily merge the already-sorted mapper output
    lists into one ascending stream of (key, [values]) groups.

    mapper_outputs -- list of key-sorted lists of (key, value) pairs.
    Yields (key, list_of_values) groups, keys ascending.

    FIX: the original yielded a bogus (None, []) group when mapper_outputs
    was empty, and built an unused list from a stray merge() pass over the
    outer list; both are removed.
    '''
    cur_k = None
    cur_vs = []
    have_any = False
    for k, v in merge(*mapper_outputs):
        if not have_any:
            cur_k = k
            have_any = True
        if k != cur_k:
            yield cur_k, cur_vs
            cur_k = k
            cur_vs = []
        cur_vs.append(v)
    if have_any:
        yield cur_k, cur_vs
#### CLIENTS and APIs ####
# Process-local cache of the live slave list (see slaves()); _SLAVES_CACHE_TS
# records when it was last refreshed, as a _get_timestamp() value.
_SLAVES_CACHE = None
_SLAVES_CACHE_TS = 0
def slaves(timeout=30, cache_expire=60, be_sure=False):
    '''
    Returns a list of slaves that are alive and available.
    'timeout' (seconds) tells this to check in on a slave if it hasn't registered in a while
    'cache_expire' (seconds) tells the client how long to keep the cache hot for.
    Set it to 0 if you don't want to use it.
    'be_sure' to True makes this method go out and check that each slave is alive.
    It also forces a register for each.
    '''
    global _SLAVES_CACHE
    global _SLAVES_CACHE_TS
    # serve straight from the process-local cache while it is still fresh
    if USE_SLAVES_CACHE and _get_timestamp() < _SLAVES_CACHE_TS + cache_expire:
        return _SLAVES_CACHE
    slave_dict = _get_redis().hgetall('slaves')
    curtime = _get_timestamp()
    out_hosts = []
    for host in slave_dict:
        # actively ping (and force a re-register of) a slave when be_sure is
        # set, or when its last registration is older than `timeout` seconds
        if be_sure or float(slave_dict[host]) + timeout < curtime:
            h,p = host.split(':')
            try:
                c = _connect(h, int(p))
                c.root.register()
            except Exception as e:
                # unreachable slave: leave it out of the result entirely
                continue
        out_hosts.append(host)
    _SLAVES_CACHE = out_hosts
    _SLAVES_CACHE_TS = _get_timestamp()
    return sorted(out_hosts)
def random_slave(*k, **kv):
    '''Pick one available slave at random; all arguments are forwarded to
    slaves(). Good when any slave can service the request.
    Raises OSError when no slaves are up.'''
    available = slaves(*k, **kv)
    if not available:
        raise OSError('There are no slaves currently running, so I can\'t randomly select one')
    return random.choice(available)
#### FILE SYSTEM OPS ####
def _register_file(hostname, port, file_name):
    '''Record in the redis namenode that hostname:port holds a replica of
    file_name, stamped with the current time.'''
    r = _get_redis()
    key = 'file-' + file_name
    r.hset(key, _cat_host(hostname, port), _get_timestamp())
    # dummy field so redis keeps the hash alive even at zero replicas
    r.hset(key, '!', '!')
def _unregister_file(hostname, port, file_name):
    '''Remove hostname:port from the redis replica set of file_name.'''
    replica = _cat_host(hostname, port)
    _get_redis().hdel('file-' + file_name, replica)
def format_fs(are_you_sure=False):
    '''Deletes all files from your DFS, starting from fresh.
    Set are_you_sure=True to actually run this function; otherwise raises
    ValueError as a safety catch.'''
    if not are_you_sure:
        raise ValueError("you have to call format_fs(are_you_sure=True) in order to format")
    for f in ls():
        delete(f)
    # Just in case we have anything lingering.
    # FIX: the glob used to be 'file-*"' (stray trailing quote), which
    # matched nothing, so lingering keys were never cleaned up.
    for fname in _get_redis().keys('file-*'):
        _get_redis().delete(fname)
def check_exists(file_name):
    '''True when file_name is registered in the DFS, falsy otherwise.
    Raises ValueError for an invalid name.'''
    _pathcheck(file_name)
    key = 'file-' + file_name
    return _get_redis().exists(key)
def who_has(file_name):
    '''List of "host:port" slaves currently registered as holding file_name.
    Raises OSError when the file is not registered at all.'''
    _pathcheck(file_name)
    if not check_exists(file_name):
        raise OSError('The file "%s" does not exist' % file_name)
    holders = _get_redis().hkeys('file-' + file_name)
    # '!' is the keep-alive dummy field, not a real holder
    return [h for h in holders if h != '!']
def write(file_name, payload):
    '''Store `payload` as the DFS file `file_name` on a randomly chosen
    slave (the slave handles replication).
    Raises OSError when the file already exists.'''
    _pathcheck(file_name)
    if check_exists(file_name):
        raise OSError('The file "%s" already exists' % file_name)
    slave = _connect(*_split_hostport(random_slave()))
    slave.root.save(file_name, payload)
def put(local_file, file_name):
    '''Copy the contents of the local file `local_file` into the DFS as
    `file_name`.

    FIX: open the local file with a context manager so the handle is closed
    deterministically (the original leaked it until garbage collection).
    '''
    _pathcheck(file_name)
    with open(local_file) as lf:
        write(file_name, lf.read())
def delete(file_name):
    '''Remove file_name from the registry and from every slave holding it.
    Unreachable slaves are logged and the file is unregistered from them.'''
    _pathcheck(file_name)
    # drop the keep-alive dummy first so the hash vanishes with its last replica
    _get_redis().hdel('file-' + file_name, '!')
    for hostport in who_has(file_name):
        try:
            slave = _connect(*_split_hostport(hostport))
            slave.root.delete(file_name)
        except Exception as e:
            logging.warning(' '.join(["I tried to delete", file_name, "from", hostport,
                                      "but he seems to be gone... I'm going to unregister this file from this host."]))
            _unregister_file( *(_split_hostport(hostport) + (file_name,)) )
def deletes(file_glob):
    '''Delete every DFS file whose name matches file_glob, in the registry
    and on the slaves.'''
    for key in _get_redis().keys('file-' + file_glob):
        # registry keys look like 'file-<name>'; strip the prefix
        delete(key.split('-', 1)[1])
def rmdir(directory_name):
    '''Delete every DFS file under directory_name.
    Raises OSError when the directory matches no files.'''
    pattern = 'file-' + directory_name.rstrip('/') + '/*'
    doomed = _get_redis().keys(pattern)
    if not doomed:
        raise OSError(directory_name + ' does not exist')
    for key in doomed:
        delete(key.split('-', 1)[1])
def read(file_name):
    '''Fetch the full contents of `file_name` from a randomly chosen slave
    that holds a replica. Raises OSError when the file does not exist.'''
    _pathcheck(file_name)
    if not check_exists(file_name):
        raise OSError('The file "%s" does not exist' % file_name)
    holder = random.choice(who_has(file_name))
    return _connect(*_split_hostport(holder)).root.fetch(file_name)
def copy(file_name, new_file_name):
    '''Duplicate the DFS file `file_name` under the new name `new_file_name`.'''
    payload = read(file_name)
    write(new_file_name, payload)
def get(file_name, local_file):
    '''Download the DFS file `file_name` into the local path `local_file`.

    FIX: write through a context manager so the local file is flushed and
    closed deterministically (the original left the handle dangling).
    '''
    _pathcheck(file_name)
    with open(local_file, 'w') as lf:
        lf.write(read(file_name))
def ls(file_glob = '*'):
    '''Sorted list of DFS file names matching file_glob; with no argument,
    lists every file.'''
    keys = _get_redis().keys('file-' + file_glob)
    # registry keys look like 'file-<name>'; strip the prefix
    return sorted(key.split('-', 1)[1] for key in keys)
def ll(file_glob = '*'):
    '''Like ls(), but returns sorted (file_name, [holders]) pairs so you can
    see which slaves hold each file.'''
    listing = []
    for key in _get_redis().keys('file-' + file_glob):
        name = key.split('-', 1)[1]
        listing.append((name, who_has(name)))
    return sorted(listing)
#### MAPREDUCE ####
def mapreduce(inputs, output_dir, \
        input_func=dfs_linereader, map_func=identity_mapper, \
        combiner_func=identity_reducer, partitioner_func=hash_partitioner, \
        mapout_func=basic_mapoutput, \
        shuffle_func=basic_shuffle, sort_func=basic_sort, \
        reduce_func=identity_reducer, output_func=dfs_line_output, \
        num_reducers=1, params=None):
    '''Run a MapReduce job over the DFS.

    inputs       -- list of DFS file globs to use as map input
    output_dir   -- DFS directory for intermediate and final output
    *_func       -- pluggable stages; defaults give a plain line-oriented
                    identity pipeline
    num_reducers -- number of reduce partitions (one process each)
    params       -- dict passed to every stage; mutated in place (gets
                    'outputdir' and, per map task, 'inputfilepath')

    Each task runs in its own local process which dispatches the work to a
    slave over rpyc. NOTE(review): params['inputfilepath'] is rebound on each
    loop iteration and snapshotted by the fork in Process.start(), so the
    statement order inside the map loop is load-bearing.
    '''
    sf = _serialize_function # in retrospect, this function name was too long
    # pre-serialize every stage so it can cross the rpyc boundary
    sif = sf(input_func)
    smf = sf(map_func)
    scf = sf(combiner_func)
    spf = sf(partitioner_func)
    smof = sf(mapout_func)
    sshf = sf(shuffle_func)
    ssof = sf(sort_func)
    srf = sf(reduce_func)
    sof = sf(output_func)
    if params is None: params = {}
    params['outputdir'] = output_dir
    logging.info('map stage starting')
    map_tasks = []
    for ins in inputs:
        for f in ls(ins):
            params['inputfilepath'] = f
            p = Process(target=_start_map, args=(sif, smf, scf, spf, smof, num_reducers, params))
            p.start()
            map_tasks.append(p)
    # wait for every map task before starting any reducer
    for p in map_tasks:
        p.join()
    logging.info('map stage complete')
    logging.info('reduce stage starting')
    reduce_tasks = []
    for ri in range(num_reducers):
        p = Process(target=_start_reduce, args=(ri, sshf, ssof, srf, sof, params))
        p.start()
        reduce_tasks.append(p)
    for p in reduce_tasks:
        p.join()
    logging.info('reduce stage complete')
def _start_map(input_func, map_func, combiner_func, partitioner_func, mapout_func, num_reducers, params):
    '''Dispatch one (already-serialized) map task to a random slave that
    holds the input file, for data locality.'''
    holder = random.choice(who_has(params['inputfilepath']))
    conn = _connect(holder)
    conn.root.map(input_func, map_func, combiner_func, partitioner_func, mapout_func, num_reducers, params)
def _start_reduce(reducer_num, shuffle_func, sort_func, reduce_func, output_func, params):
    '''Dispatch one (already-serialized) reduce task to any available slave.'''
    slave = _connect(random_slave())
    slave.root.reduce(reducer_num, shuffle_func, sort_func, reduce_func, output_func, params)
#### SLAVE SERVER ####
def _register(hostname, port):
    '''Mark hostname:port as a live slave in the registry.
    Returns the timestamp that was recorded.'''
    ts = _get_timestamp()
    host = _cat_host(hostname, port)
    r = _get_redis()
    r.hset("slaves", host, ts)
    # it's alive (again): drop any record of its death
    r.hdel('deadslaves', host)
    return ts
def _unregister(hostname, port):
    '''Move hostname:port from the live-slave registry onto the dead list,
    stamped with the time of death.'''
    host = _cat_host(hostname, port)
    r = _get_redis()
    r.hdel('slaves', host)
    r.hset('deadslaves', host, _get_timestamp())
def start_slave(port, data_dir):
    '''Run a single DFS/MapReduce slave listening on `port`, storing its
    file shards under `data_dir` (created if missing). Registers itself,
    blocks for the lifetime of the server, then unregisters on exit.'''
    # set up data directory
    data_dir = path.abspath(data_dir)
    _mkdirp(data_dir)
    # store some config for later
    # (i don't really have anywhere better to put it)
    # NOTE(review): these are class attributes, so one process hosts exactly
    # one slave identity
    SlaveServer._port = int(port)
    SlaveServer._hostname = gethostname()
    SlaveServer._datadir = data_dir
    _mkdirp(path.join(data_dir, 'storage'))
    s = ForkingServer(SlaveServer, port=int(port))
    _register(SlaveServer._hostname, SlaveServer._port)
    s.start()  # blocks until the server shuts down
    _unregister(SlaveServer._hostname, SlaveServer._port)
def start_slaves(start_port, data_dir_root, num_slaves, wait=True):
    '''Launch num_slaves slave processes on consecutive ports starting at
    start_port, each with its own data directory (data_dir_root + port).
    When wait is True, block until they all exit; otherwise return the
    list of Process objects.'''
    processes = []
    for port in range(int(start_port), int(start_port) + int(num_slaves)):
        proc = Process(target=start_slave, args=(port, data_dir_root + str(port)))
        proc.start()
        processes.append(proc)
    if not wait:
        return processes
    for proc in processes:
        proc.join()
class SlaveServer(rpyc.Service):
    '''rpyc service run by each slave: answers ping/register heartbeats,
    executes map and reduce tasks, and serves this slave's shard of the DFS.
    Identity (_hostname/_port/_datadir) is set as class attributes by
    start_slave() before the server starts.'''

    ## general utility commands ##
    def exposed_ping(self):
        '''Liveness probe.'''
        return 'pong'
    def on_connect(self):
        # every incoming connection doubles as a heartbeat
        self.exposed_register()
    def on_disconnect(self):
        pass
    def exposed_register(self):
        # register with redis registry
        _register(self._hostname, self._port)

    ## mapreduce commands ##
    def exposed_map(self, \
            input_func, map_func, \
            combiner_func, partitioner_func, \
            output_func, num_reducers, params):
        '''Run one map task: read input, map, combine (if provided),
        partition into per-reducer buckets and write each bucket out.
        All *_func arguments arrive marshal-serialized.'''
        input_func = _deserialize_function(input_func)
        map_func = _deserialize_function(map_func)
        combiner_func = _deserialize_function(combiner_func)
        partitioner_func = _deserialize_function(partitioner_func)
        output_func = _deserialize_function(output_func)
        map_out = []
        for k,v in input_func(params):
            for k,v in map_func(k, v, params):
                map_out.append((k, v))
        map_out.sort()
        if combiner_func != None:
            combiner_out = []
            for k, vs in _group_sorted(map_out):
                for k, v in combiner_func(k, vs, params):
                    combiner_out.append((k,v))
            map_out = combiner_out
        buckets = {}
        for k, v in map_out:
            reducer_num = partitioner_func(k, num_reducers, params)
            buckets.setdefault(reducer_num, [])
            buckets[reducer_num].append((k,v))
        for reducer_num in buckets:
            output_func(reducer_num, sorted(buckets[reducer_num]), params)
        return True
    def exposed_reduce(self, \
            reducer_num, shuffle_func, sort_func, reduce_func, \
            output_func, params):
        '''Run one reduce task: shuffle in this reducer's buckets, merge-sort
        them into (key, values) groups, reduce, and write the final output.'''
        shuffle_func = _deserialize_function(shuffle_func)
        sort_func = _deserialize_function(sort_func)
        reduce_func = _deserialize_function(reduce_func)
        output_func = _deserialize_function(output_func)
        reduce_out = []
        for k, vs in sort_func(shuffle_func(reducer_num, params), params):
            for k, v in reduce_func(k, vs, params):
                reduce_out.append((k, v))
        output_func(reducer_num, reduce_out, params)

    ## file system commands ##
    def exposed_save(self, file_name, payload, replicate=(REPLICATION_FACTOR - 1)):
        '''Store payload locally, register the replica, and chain-replicate
        to `replicate` more slaves that don't already hold the file.
        FIX: dropped an unreachable `return False` that sat after the raise.'''
        _pathcheck(file_name)
        opath = path.join(self._datadir, 'storage', file_name)
        if path.exists(opath):
            raise OSError("The file '%s' already exists" % file_name)
        _mkdirp(path.split(opath)[0])
        of = open(opath, 'w')
        of.write(payload)
        of.close()
        _register_file(SlaveServer._hostname, SlaveServer._port, file_name)
        if replicate > 0:
            # select a replication target of someone other than someone who has it
            target = random.sample(set(slaves()) - set(who_has(file_name)), 1)[0]
            c = _connect(*_split_hostport(target))
            c.root.save(file_name, payload, replicate - 1)
        return True
    def exposed_fetch(self, file_name):
        '''Return the local contents of file_name; on a local read error the
        replica record is dropped (and None is returned).
        FIX: the warning used `' '.join[...]` (a subscript on the method),
        which raised TypeError instead of logging.'''
        _pathcheck(file_name)
        try:
            return open(path.join(self._datadir, 'storage', file_name)).read()
        except IOError as e:
            logging.warning(' '.join(['unregistering', file_name, 'because of:', str(e)]))
            _unregister_file(self._hostname, self._port, file_name)
    def exposed_delete(self, file_name):
        '''Unregister and remove the local replica of file_name. Returns
        False when the local file was already gone, True otherwise.
        FIX: same `' '.join[...]` subscript bug as exposed_fetch.'''
        _pathcheck(file_name)
        _unregister_file(SlaveServer._hostname, SlaveServer._port, file_name)
        try:
            os.remove(path.join(self._datadir, 'storage', file_name))
        except OSError as e:
            logging.warning(' '.join(["you tried to delete", file_name, "but it was already gone.", str(e)]))
            return False
        return True
    def exposed_pushremote(self, file_name, destination):
        '''Push the local replica of file_name to another slave, given as a
        (hostname, port) pair.'''
        _pathcheck(file_name)
        c = _connect(*destination)
        c.root.save(file_name, open(path.join(self._datadir, 'storage', file_name)).read())
        return True
## slave code ends here ##
#### WEB MONITOR AND CONTROL PAGE ####
def start_monitor(port):
    '''Serve a minimal Flask status page (slave list and file list) on
    `port`. Blocks for the lifetime of the web server.'''
    app = Flask('mr.py monitor')
    @app.route("/")
    def main():
        def table(cells):
            return '<table><tr><td>' + '</td></tr><tr><td>'.join(cells) + '</td></tr></table>'
        parts = ['<h1>mr.py monitor page</h1>']
        parts.append('<h2>slaves</h2>' + table(slaves()))
        parts.append('<h2>files</h2>' + table(sorted(ls())))
        return '\n'.join(parts)
    app.run(port=int(port))
#### SHELL COMMANDS ####
def fs_ls(args):
    '''Shell `ls`: with no arguments list every DFS file; otherwise list the
    matches for each glob argument.

    FIX: the loop ignored its argument and always printed ls() of the whole
    root; it now passes each glob through as ls(f).
    (print(...) with one argument behaves identically on Python 2 and 3.)
    '''
    if not args:
        print('\n'.join(ls()))
    else:
        for f in args:
            print('\n'.join(ls(f)) + '\n')
def fs_put(args):
    '''Shell `put`: copy local file args[0] into DFS file args[1];
    a local name of '-' reads from stdin instead.'''
    local_file, destn_file = args[0], args[1]
    if local_file == '-':
        write(destn_file, sys.stdin.read())
    else:
        put(local_file, destn_file)
def fs_get(args):
    '''Shell `get`: download DFS file args[0] into local path args[1].'''
    get(args[0], args[1])
def fs_cat(args):
    '''Shell `cat`: print the contents of every DFS file matching each
    glob argument, in order.'''
    for fs in args:
        for f in ls(fs):
            # trailing comma (Python 2): suppress print's newline so file
            # contents run together exactly as stored
            print read(f),
def fs_rm(args):
    '''Shell `rm`: delete each argument; names containing wildcard
    characters are treated as globs.'''
    for name in args:
        looks_like_glob = '*' in name or '?' in name
        if looks_like_glob:
            deletes(name)
        else:
            delete(name)
# Dispatch table mapping file-system shell sub-command names to handlers.
_FS_COMMANDS = {'ls' : fs_ls, 'put' : fs_put, 'get' : fs_get, 'cat' : fs_cat, \
                'rm' : fs_rm}
def file_system_shell(args):
    '''Run one file-system shell command: args[0] names the command, the
    remaining items are its arguments. Raises KeyError on unknown commands.'''
    command, command_args = args[0], args[1:]
    _FS_COMMANDS[command](command_args)
#### MAIN ####
if __name__ == "__main__":
    # Command-line entry point: mr.py (slave|slaves|fs|monitor) ...
    # (single-argument print(...) behaves identically on Python 2 and 3)
    if len(sys.argv) <= 1:
        # FIX: the original printed the placeholder string 'asf' here
        print('usage: mr.py slave <port> <data_dir> | slaves <start_port> <data_dir_root> <n> | fs <cmd> [args...] | monitor <port>')
    elif sys.argv[1] == 'slave':
        start_slave(sys.argv[2], sys.argv[3])
    elif sys.argv[1] == 'slaves':
        start_slaves(sys.argv[2], sys.argv[3], sys.argv[4])
    elif sys.argv[1] == 'fs':
        file_system_shell(sys.argv[2:])
    elif sys.argv[1] == 'monitor':
        start_monitor(sys.argv[2])
    else:
        print('you did something wrong')