-
Notifications
You must be signed in to change notification settings - Fork 4
/
celery_map_reduce.py
73 lines (56 loc) · 1.89 KB
/
celery_map_reduce.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
import time
import random
from celery import Celery, chord, chain
from toolz.itertoolz import partition_all, concat
# Celery application: Redis serves as both the message broker and the
# result backend (required here so chord/AsyncResult lookups work).
app = Celery('celery_map_reduce',
             broker='redis://localhost', backend='redis://localhost')
# Restrict serialization to JSON for task arguments and results.
# NOTE(review): these UPPERCASE keys are the old (pre-Celery-4) setting
# names; newer releases expect lowercase (accept_content, task_serializer,
# result_serializer) — confirm against the Celery version in use.
app.conf.CELERY_ACCEPT_CONTENT = ['json']
app.conf.CELERY_TASK_SERIALIZER = 'json'
app.conf.CELERY_RESULT_SERIALIZER = 'json'
@app.task
def reduce(mapped):
    """Reduce step: flatten the per-chunk result lists into one list.

    ``mapped`` is the list of lists produced by the ``map`` workers;
    the chord collects them and hands them here in order.
    """
    return [value for sublist in mapped for value in sublist]
@app.task
def map(data):
    """Map step: sum each chunk of numbers assigned to this worker.

    Parameters
    ----------
    data : iterable of iterables of numbers
        The chunks (lists of ints) this worker should process.

    Returns
    -------
    list
        One sum per chunk, in the same order as the input.
    """
    # NOTE: intentionally shadows the builtin ``map`` — the task name is
    # part of the Celery routing interface, so it is kept unchanged.
    # Idiom fix: list comprehension replaces the manual append loop.
    return [sum(chunk) for chunk in data]
@app.task
def mapreduce(chunk_size):
    """Entry task: generate sample data and fan it out to map workers.

    Builds 10000 rows of random ints, partitions them into chunks of
    ``chunk_size`` rows, launches one ``map`` task per chunk via a chord,
    and wires ``reduce`` as the chord callback.

    Returns a dict carrying the chord result id so callers can poll it.
    """
    # Sample input: each row holds between 5 and 14 random ints < 10000.
    data = [
        [random.randrange(10000) for _ in range(random.randrange(10) + 5)]
        for _ in range(10000)
    ]
    # One map signature per chunk; the chord fires reduce when all finish.
    subtasks = (map.s(chunk) for chunk in partition_all(chunk_size, data))
    async_result = chord(subtasks)(reduce.s())
    return {'chord_id': async_result.id}
def create_work(chunk_size):
    """Fire off the ``mapreduce`` task asynchronously; return its task id."""
    pending = mapreduce.delay(chunk_size)
    return pending.id
def get_work(chord_id):
    """Poll the two-stage pipeline and report its current status.

    Stage 1 is the ``mapreduce`` task itself (identified by ``chord_id``),
    which returns ``{'chord_id': ...}`` pointing at the chord result;
    stage 2 is that chord result.

    Returns a dict with ``status`` of ``'pending'`` (plus the ``stage``
    still running) or ``'success'`` (plus the final ``results`` list).
    """
    # Fix: the original constructed app.AsyncResult(chord_id) twice
    # (once for .ready(), once for .get()), doubling result-backend
    # lookups per poll. Each handle is now built exactly once.
    outer = app.AsyncResult(chord_id)
    if not outer.ready():
        return {'status': 'pending', 'stage': 1}
    inner = app.AsyncResult(outer.get()['chord_id'])
    if not inner.ready():
        return {'status': 'pending', 'stage': 2}
    return {'status': 'success', 'results': inner.get()}
if __name__ == '__main__':
    job_id = create_work(chunk_size=4)
    # Poll once per second, giving up after 100 attempts.
    attempts = 0
    while attempts < 100:
        time.sleep(1)
        results = get_work(job_id)
        if results['status'] == 'success':
            break
        attempts += 1