-
Notifications
You must be signed in to change notification settings - Fork 1
/
runforrest.py
416 lines (326 loc) · 14.7 KB
/
runforrest.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
import os
import signal
import subprocess
import sys
import time
from argparse import ArgumentParser
from pathlib import Path
from subprocess import Popen, STDOUT, PIPE
from uuid import uuid4 as uuid

import dill
def _identity(thing):
    """Return `thing` unchanged.

    Used as the function of a `Task` that wraps plain data, so that
    evaluating the `Task` simply yields that data again.
    """
    return thing
def defer(fun, *args, **kwargs):
    """Wrap a function or data for later execution.

    Nothing is executed here: the call is merely recorded in a `Task`.
    Such a `Task` may be passed as an argument to further deferred
    functions, which builds up a call graph that is executed later.

    `fun` does not have to be callable. Plain data is wrapped in a
    `Task` that evaluates to that very data; `args` and `kwargs` are
    ignored in that case. This is handy for deferring attribute or
    index accesses on existing data.

    Attribute and index access on the returned `Task` is deferred as
    well.
    """
    if callable(fun) or isinstance(fun, Task):
        return Task(fun, args, kwargs)
    # plain data: hand it back verbatim when the Task is evaluated
    return Task(_identity, [fun], {})
class Task:
    """A proxy for a function result.

    A `Task` records a function together with its positional and
    keyword arguments without calling it. Accessing attributes or
    indexing a `Task` returns further proxies (`TaskAttribute` and
    `TaskItem`), so that operations on the not-yet-computed result can
    be recorded as well.
    """

    def __init__(self, fun, args, kwargs):
        self._fun = fun
        self._args = args
        self._kwargs = kwargs
        # identifies this node of the call graph; it is stored on the
        # instance and therefore survives pickling
        self._id = str(id(self))

    def __eq__(self, other):
        # Compare by class name as well as by isinstance, because a
        # pickling round trip can recreate the class object.
        if not (isinstance(other, Task)
                or type(other).__name__ == type(self).__name__):
            return NotImplemented
        return (self._id == other._id
                and self._args == other._args
                and self._fun == other._fun)

    def __hash__(self):
        # Equal tasks share their `_id`, so hashing it is consistent
        # with `__eq__`. Without this, defining `__eq__` would make
        # tasks unhashable.
        return hash(self._id)

    def __getattr__(self, name):
        # Only reached when regular lookup fails. `_id` is missing while
        # an instance is being unpickled, and dunder names are protocol
        # probes (`__getstate__`, `__setstate__`, `__deepcopy__`, ...)
        # that must not be answered with a proxy object.
        if name == '_id' or (name.startswith('__') and name.endswith('__')):
            raise AttributeError(name)
        return TaskAttribute(self, name)

    def __getitem__(self, key):
        return TaskItem(self, key)
class PartOfTask():
    """A proxy for a part of a Task.

    Instances result from accessing attributes or indexes of a `Task`
    or of another `PartOfTask`. Each `PartOfTask` has an `_id` that is
    shared by all equivalent accesses on the same parent, and that
    differs for accesses of a different kind, key, or nesting depth.
    """

    def __init__(self, parent, index):
        self._parent = parent
        self._index = index
        # Tag the id with the kind of access and the repr of the key.
        # Plain concatenation would give `t[12]` and `t[1][2]`, or
        # `t.x` and `t['x']`, the same id.
        self._id = f'{parent._id}/{type(self).__name__}({index!r})'

    def __eq__(self, other):
        # Compare by class name as well as by isinstance, because a
        # pickling round trip can recreate the class object.
        if not (isinstance(other, PartOfTask)
                or type(other).__name__ == type(self).__name__):
            return NotImplemented
        return self._id == other._id and self._parent == other._parent

    def __hash__(self):
        # Equal parts share their `_id`, so hashing it is consistent
        # with `__eq__`.
        return hash(self._id)

    def __getattr__(self, name):
        # Only reached when regular lookup fails. `_id` is missing while
        # an instance is being unpickled, and dunder names are protocol
        # probes (`__getstate__`, `__setstate__`, `__deepcopy__`, ...)
        # that must not be answered with a proxy object.
        if name == '_id' or (name.startswith('__') and name.endswith('__')):
            raise AttributeError(name)
        return TaskAttribute(self, name)

    def __getitem__(self, key):
        return TaskItem(self, key)
class TaskAttribute(PartOfTask):
    """A `PartOfTask` that stands for an attribute access on its parent."""
class TaskItem(PartOfTask):
    """A `PartOfTask` that stands for an index access on its parent."""
class TaskList:
    """Schedule tasks and run them in many processes.

    A `TaskList` schedules `Tasks`, and then executes them on several
    processes in parallel. For each `Task`, it walks the call chain,
    and executes all the code necessary to calculate the return
    values. The `TaskList` takes great pride in not evaluating `Tasks`
    more often than necessary, even if several `PartOfTasks` lead to
    the same original `Task`.

    By default, `schedule` dumps each `Task` in the directory
    `{directory}/todo`. Once the `Task` has been executed, it is
    transferred to either `{directory}/done` or `{directory}/fail`,
    depending on whether it raised errors or not.

    The `TaskList` delegates all the actual running of code to
    `evaluate`, which is called by invoking this very script as a
    command line program. The `TaskList` merely makes sure that a
    fixed number of processes are running at all times.
    """

    def __init__(self, directory, exist_ok=False, pre_clean=True,
                 post_clean=False, logfile=None, noschedule_if_exist=False):
        """Prepare `directory` and its todo/done/fail subdirectories.

        If `directory` already exists, `noschedule_if_exist=True` keeps
        its contents and turns `schedule` into a no-op; otherwise a
        `RuntimeError` is raised unless `exist_ok=True`.

        `pre_clean` empties the directory before use, `post_clean`
        removes it again when the `TaskList` is garbage collected.
        If `logfile` is given, scheduling and execution events are
        appended to it.
        """
        # Armed only at the very end: if construction fails (e.g. the
        # directory exists and must not be touched), `__del__` must
        # not wipe a directory this object never owned.
        self._post_clean = False
        self._directory = Path(directory)
        self._logfile = Path(logfile) if logfile else None
        self._noschedule = False
        self._processes = {}

        if self._directory.exists():
            if noschedule_if_exist:
                self._noschedule = True
                pre_clean = False
            elif not exist_ok:
                raise RuntimeError(f'TaskList directory {str(self._directory)} already exists')

        if pre_clean:
            self.clean()

        for subdir in (self._directory, self._directory / 'todo',
                       self._directory / 'done', self._directory / 'fail'):
            # also create missing parent directories
            subdir.mkdir(parents=True, exist_ok=True)

        self._post_clean = post_clean

    def __del__(self):
        # getattr guards against instances whose __init__ failed early
        if getattr(self, '_post_clean', False):
            self.clean()

    def schedule(self, task, metadata=None):
        """Schedule a task for later execution.

        The task is saved to the `{directory}/todo` directory. Use
        `run` to execute all the tasks in the `{directory}/todo`
        directory.

        If you want, you can attach metadata to the task, which you
        can retrieve as `task.metadata` after the task has been run.
        """
        if self._noschedule:
            return

        task.errorvalue = None
        task.returnvalue = None
        task.metadata = metadata

        taskfilename = str(uuid()) + '.pkl'
        with (self._directory / 'todo' / taskfilename).open('wb') as f:
            dill.dump(task, f)
        self._log('schedule', taskfilename)

    def run(self, nprocesses=4, print_errors=False, save_session=False, autokill=None):
        """Execute all tasks in the `{directory}/todo` directory.

        All tasks are executed in their own processes, and `run` makes
        sure that no more than `nprocesses` are active at any time.

        If `print_errors=True`, processes will print full stack traces
        of failing tasks. Since these errors happen on another
        process, this will not be caught by the debugger, and will not
        stop the `run`.

        Use `save_session` to recreate all current globals in each
        process.

        If `autokill` is given, the process group of any task that
        has been running for more than `autokill` seconds is killed.

        Returns an iterable with a length (the number of tasks found
        in `{directory}/todo`). Processes are only started while it is
        being iterated; iterating yields the finished tasks.
        """
        if save_session:
            dill.dump_session(self._directory / 'session.pkl')

        class TaskIterator:
            def __init__(self, parent, todos, save_session):
                self.parent = parent
                self.todos = todos
                self.save_session = save_session

            def __iter__(self):
                for todo in self.todos:
                    # wait for a free slot before starting the next task
                    yield from self.parent._finish_tasks(nprocesses, autokill=autokill)
                    self.parent._start_task(todo.name, print_errors, save_session)
                # wait for running jobs to finish:
                yield from self.parent._finish_tasks(1, autokill=autokill)

            def __len__(self):
                return len(self.todos)

        return TaskIterator(self, list((self._directory / 'todo').iterdir()), save_session)

    def _start_task(self, taskfilename, print_errors, save_session):
        """Start a new process, and append to self._processes."""
        # Use the interpreter that runs this code instead of whatever
        # `python` happens to be first on PATH.
        args = [sys.executable or 'python', '-m', 'runforrest',
                str(self._directory / 'todo' / taskfilename),
                str(self._directory / 'done' / taskfilename)]
        if print_errors:
            args += ['-p']
        if save_session:
            args += ['-s', str(self._directory / 'session.pkl')]

        # start_new_session puts the child into its own process group,
        # which is what `_finish_tasks` kills as a whole.
        kwargs = dict(start_new_session=True, cwd=os.getcwd())
        if self._logfile:
            # NOTE: the pipe is only read after the process has exited,
            # so a task that prints more than the pipe can buffer may
            # block until it is autokilled.
            kwargs['stdout'] = PIPE
            kwargs['stderr'] = STDOUT

        self._processes[taskfilename] = Popen(args, **kwargs)
        self._processes[taskfilename].start_time = time.perf_counter()
        self._log('start', taskfilename)

    def _finish_tasks(self, nprocesses, autokill):
        """Wait while `nprocesses` are running and return finished tasks.

        Yields every task whose process has exited. Processes that
        run for longer than `autokill` seconds (if set) are killed
        and dropped without yielding their task.
        """
        # A limit below one can never be satisfied and would spin forever.
        nprocesses = max(1, nprocesses)
        while len(self._processes) >= nprocesses:
            for file, proc in list(self._processes.items()):
                if proc.poll() is not None:
                    task = self._retrieve_task(file)
                    try:
                        stdout, _ = proc.communicate(timeout=10)
                        self._log('done' if task.errorvalue is None else 'fail', file)
                        if stdout:
                            # the pipe is binary; log text, not a bytes repr
                            if isinstance(stdout, bytes):
                                stdout = stdout.decode(errors='replace')
                            self._log(stdout.rstrip(), file)
                        yield task
                    except subprocess.TimeoutExpired:
                        # something is wrong. Kill the process and move on.
                        process_group = os.getpgid(proc.pid)
                        os.killpg(process_group, signal.SIGKILL)
                        self._log('lost contact', file)
                    finally:
                        del self._processes[file]
                elif autokill and time.perf_counter() - proc.start_time > autokill:
                    try:
                        # kill the whole process group.
                        # This is a mean thing to do, and might leave dangling
                        # intermediate files. But at this point, the program was
                        # provably not able to terminate on its own, and drastic
                        # measures are our last resort.
                        process_group = os.getpgid(proc.pid)
                        os.killpg(process_group, signal.SIGKILL)
                        self._log('autokilled', file)
                        # sometimes, even the above does not work. In this case,
                        # we will leak the process, but continue anyway:
                        del self._processes[file]
                    except Exception as err:
                        # exceptions have no `.message` in Python 3
                        self._log(str(err), file)
                else:
                    time.sleep(0.1)

    def _retrieve_task(self, taskfilename):
        """Load task, and sort into `{directory}/done` or `{directory}/fail`."""
        try:
            with (self._directory / 'done' / taskfilename).open('rb') as f:
                task = dill.load(f)
        except Exception as error:
            # No readable result was produced: fall back to the task as
            # scheduled and record the load error as its error value.
            with (self._directory / 'todo' / taskfilename).open('rb') as f:
                task = dill.load(f)
            task.returnvalue = None
            task.errorvalue = error
            with (self._directory / 'done' / taskfilename).open('wb') as f:
                dill.dump(task, f)

        (self._directory / 'todo' / taskfilename).unlink()

        if task.errorvalue is not None:
            (self._directory / 'done' / taskfilename).rename(self._directory / 'fail' / taskfilename)

        return task

    def _log(self, message, taskfilename):
        """Append a timestamped line to the logfile, if one is set."""
        if not self._logfile:
            return
        with self._logfile.open('a') as f:
            f.write(f"{time.strftime('%Y-%m-%dT%H:%M:%S')} {taskfilename} {message}\n")

    def todo_tasks(self):
        """Yield all tasks in `{directory}/todo`."""
        for todo in (self._directory / 'todo').iterdir():
            with todo.open('rb') as f:
                yield dill.load(f)

    def done_tasks(self):
        """Yield all tasks in `{directory}/done`."""
        for done in (self._directory / 'done').iterdir():
            with done.open('rb') as f:
                try:  # safeguard against broken tasks:
                    yield dill.load(f)
                except EOFError as err:
                    print(f'skipping {done.name} ({err})')

    def fail_tasks(self):
        """Yield all tasks in `{directory}/fail`."""
        for fail in (self._directory / 'fail').iterdir():
            with fail.open('rb') as f:
                yield dill.load(f)

    def clean(self, clean_todo=True, clean_done=True, clean_fail=True):
        """Remove `{directory}` and all todo/done/fail tasks.

        Each flag selects one subdirectory. `{directory}` itself and
        its `session.pkl` are only removed if all three are cleaned.
        """
        def remove(folder):
            if folder.exists():
                for entry in folder.iterdir():
                    entry.unlink()
                folder.rmdir()

        if clean_todo:
            remove(self._directory / 'todo')
        if clean_fail:
            remove(self._directory / 'fail')
        if clean_done:
            remove(self._directory / 'done')
        if clean_todo and clean_fail and clean_done:
            if (self._directory / 'session.pkl').exists():
                (self._directory / 'session.pkl').unlink()
            remove(self._directory)
def main():
    """Parse the command line and run the task it describes."""
    cli = ArgumentParser(description="Run an enqueued function")

    # positional arguments: where to read the task, where to store it
    for name, text in (('infile', 'contains the enqueued function'),
                       ('outfile', 'contains the evaluation results')):
        cli.add_argument(name, type=Path, help=text)

    cli.add_argument('-s', '--sessionfile', type=Path, action='store', default=None)

    # boolean switches for error reporting
    for short, long in (('-p', '--do_print'), ('-r', '--do_raise')):
        cli.add_argument(short, long, action='store_true', default=False)

    options = cli.parse_args()
    run_task(options.infile, options.outfile, options.sessionfile,
             options.do_print, options.do_raise)
def run_task(infile, outfile, sessionfile, do_print, do_raise):
    """Execute the task stored in `infile` and write it to `outfile`.

    A session is restored from `sessionfile` first, if one is given.

    The task is written back with its `returnvalue`, `errorvalue` and
    `runtime` filled in. With `do_raise`, the error of a failed task is
    re-raised; with `do_print`, it is printed. The process exits with
    status 0 on success and -1 on failure.
    """
    if sessionfile:
        dill.load_session(Path(sessionfile))

    with infile.open('rb') as stream:
        task = dill.load(stream)

    started = time.perf_counter()
    try:
        try:
            outcome = (evaluate(task), None)
        except Exception as err:
            outcome = (None, err)
        task.returnvalue, task.errorvalue = outcome
    finally:
        # the task is stored no matter how the evaluation ended
        task.runtime = time.perf_counter() - started
        with outfile.open('wb') as stream:
            dill.dump(task, stream)

    failed = task.errorvalue is not None
    if failed and do_raise:
        raise task.errorvalue
    if failed and do_print:
        print(f'Error in {infile.name}: {task.errorvalue.__repr__()}')

    sys.exit(-1 if failed else 0)
def _task_kind(obj):
    """Return 'Task', 'TaskItem' or 'TaskAttribute' for proxies, else None."""
    # Pickling breaks isinstance(obj, Task), so classes are recognized
    # by name. The whole MRO is searched so that subclasses still count,
    # while unrelated classes that merely contain "Task" in their name
    # are treated as plain data.
    for klass in type(obj).__mro__:
        if klass.__name__ in ('Task', 'TaskItem', 'TaskAttribute'):
            return klass.__name__
    return None


def evaluate(task, known_results=None):
    """Execute a `task` and calculate its return value.

    `evaluate` walks the call chain to the `task`, and executes all
    the code necessary to calculate the return values. No `task` is
    executed more than once, even if several `PartOfTasks` lead to
    the same original `Task`.

    Anything that is not a task proxy is returned unchanged. If the
    function of a `Task` is itself a `Task`, it is evaluated first and
    its result is called.

    This is a recursive function that passes its state in
    `known_results`, where return values of all executed `Tasks` are
    stored under their `_id`.
    """
    kind = _task_kind(task)
    if kind is None:
        return task

    if known_results is None:
        known_results = {}

    if task._id not in known_results:
        if kind == 'Task':
            fun = evaluate(task._fun, known_results)
            args = [evaluate(arg, known_results) for arg in task._args]
            kwargs = {k: evaluate(v, known_results) for k, v in task._kwargs.items()}
            known_results[task._id] = fun(*args, **kwargs)
        else:
            # a part of a task: compute the parent, then take the part
            returnvalue = evaluate(task._parent, known_results)
            if kind == 'TaskItem':
                known_results[task._id] = returnvalue[task._index]
            else:  # TaskAttribute
                known_results[task._id] = getattr(returnvalue, task._index)

    return known_results[task._id]
# Script entry point: only dispatch to main() when executed directly.
if __name__ == "__main__":
    main()