def run_task(task_data):
    """Deserialize and run one task, returning its serialized outcome.

    Args:
        task_data: compressed, serialized ``(task, task_try_id)`` pair.

    Returns:
        On success: ``(TaskState.finished, payload)`` where ``payload`` is the
        pickled tuple ``((flag, data), accUpdate, env.task_stats)``.
        ``flag`` encodes how ``data`` must be decoded:
        0 = marshal, 1 = pickle, and +2 when ``data`` is not the compressed
        result itself but a location string built from ``env.server_uri``
        pointing at a file holding it.

        On ``FetchFailed``: ``(TaskState.failed, TaskEndReason.fetch_failed,
        str(e), pickled e)``.

        On any other ``Exception``: ``(TaskState.failed,
        '<FATAL|FAILED>_EXCEPTION_<ClassName>', traceback text, pickled e)``.
    """
    try:
        # GC stays off while the task runs; it is collected and re-enabled in
        # the ``finally`` clause below.
        gc.disable()
        task, task_try_id = loads(decompress(task_data))
        ttid = TTID(task_try_id)
        Accumulator.clear()
        result = task.run(ttid.ttid)
        # NOTE(review): the * 1024 assumes ru_maxrss is reported in kilobytes
        # (true on Linux) -- confirm for other platforms.
        env.task_stats.bytes_max_rss = resource.getrusage(
            resource.RUSAGE_SELF).ru_maxrss * 1024
        accUpdate = Accumulator.values()
        MutableDict.flush()

        # Prefer marshal; fall back to pickle when marshal is not applicable
        # or fails.
        if marshalable(result):
            try:
                flag, data = 0, marshal.dumps(result)
            except Exception:
                flag, data = 1, cPickle.dumps(result, -1)
        else:
            flag, data = 1, cPickle.dumps(result, -1)
        data = compress(data)

        if len(data) > TASK_RESULT_LIMIT:
            # Result is too large to send inline: write it to a file and send
            # its location instead.
            # shuffle_id start from 1
            swd = ShuffleWorkDir(0, task.id, ttid.task_try)
            tmppath = swd.alloc_tmp(len(data))
            # The ``with`` block closes the file; no explicit close() needed.
            with open(tmppath, 'wb') as f:
                f.write(data)
            path = swd.export(tmppath)
            data = '/'.join([env.server_uri] + path.split('/')[-3:])
            flag += 2

        return TaskState.finished, cPickle.dumps(
            ((flag, data), accUpdate, env.task_stats), -1)
    except FetchFailed as e:
        return TaskState.failed, TaskEndReason.fetch_failed, str(
            e), cPickle.dumps(e)
    except Exception as e:
        import traceback
        msg = traceback.format_exc()
        ename = e.__class__.__name__
        # Exceptions of these types are labelled FATAL instead of FAILED in
        # the reason string.
        fatal_exceptions = (DparkUserFatalError, ArithmeticError,
                            ValueError, LookupError, SyntaxError,
                            TypeError, AssertionError)
        prefix = "FATAL" if isinstance(e, fatal_exceptions) else "FAILED"
        return TaskState.failed, '{}_EXCEPTION_{}'.format(
            prefix, ename), msg, cPickle.dumps(e)
    finally:
        gc.collect()
        gc.enable()
def run_task(task_data):
    """Deserialize and run one task, returning its serialized outcome.

    Args:
        task_data: compressed, serialized ``(task, task_try_id)`` pair.

    Returns:
        On success: ``(TaskState.finished, payload)`` where ``payload`` is the
        pickled tuple ``((flag, data), accUpdate, env.task_stats)``.
        ``flag`` is 0 for a marshal-encoded result, 1 for a pickled one, and
        is increased by 2 when ``data`` is a location string (built from
        ``LocalFileShuffle.getServerUri()``) rather than the compressed
        result itself.

        On ``FetchFailed``: ``(TaskState.failed, TaskEndReason.fetch_failed,
        str(e), pickled e)``.

        On any other ``Exception``: ``(TaskState.failed,
        'FAILED_EXCEPTION_<ClassName>', traceback text, pickled e)``.
    """
    try:
        # GC stays off while the task runs; it is collected and re-enabled in
        # the ``finally`` clause below.
        gc.disable()
        task, task_try_id = loads(decompress(task_data))
        ttid = TTID(task_try_id)
        Accumulator.clear()
        result = task.run(ttid.ttid)
        # NOTE(review): the * 1024 assumes ru_maxrss is reported in kilobytes
        # (true on Linux) -- confirm for other platforms.
        env.task_stats.bytes_max_rss = resource.getrusage(
            resource.RUSAGE_SELF).ru_maxrss * 1024
        accUpdate = Accumulator.values()
        MutableDict.flush()

        # Prefer marshal; fall back to pickle when marshal is not applicable
        # or fails.
        if marshalable(result):
            try:
                flag, data = 0, marshal.dumps(result)
            except Exception:
                flag, data = 1, cPickle.dumps(result, -1)
        else:
            flag, data = 1, cPickle.dumps(result, -1)
        data = compress(data)

        if len(data) > TASK_RESULT_LIMIT:
            # Result is too large to send inline: write it to a local output
            # file and send its location instead.
            path = LocalFileShuffle.getOutputFile(0, task.id, ttid.task_try,
                                                  len(data))
            # ``with`` guarantees the handle is closed even if write() raises.
            with open(path, 'wb') as f:
                f.write(data)
            data = '/'.join([LocalFileShuffle.getServerUri()] +
                            path.split('/')[-3:])
            flag += 2

        return TaskState.finished, cPickle.dumps(
            ((flag, data), accUpdate, env.task_stats), -1)
    except FetchFailed as e:
        return TaskState.failed, TaskEndReason.fetch_failed, str(
            e), cPickle.dumps(e)
    except Exception as e:
        import traceback
        msg = traceback.format_exc()
        ename = e.__class__.__name__
        return TaskState.failed, 'FAILED_EXCEPTION_{}'.format(
            ename), msg, cPickle.dumps(e)
    finally:
        gc.collect()
        gc.enable()
def run_task(task_data):
    """Deserialize and run one task, returning ``(state, pickled 4-tuple)``.

    Args:
        task_data: compressed, serialized ``(task, task_try_id)`` pair.

    Returns:
        A ``(state, payload)`` pair. ``payload`` is always a pickled 4-tuple
        ``(reason, result, accUpdate, task_stats)``:

        * success: state ``'TASK_FINISHED'``, tuple
          ``(Success(), (flag, data), accUpdate, env.task_stats)``;
        * ``FetchFailed``: state ``'TASK_FAILED'``, tuple
          ``(e, None, None, None)``;
        * anything else: state ``'TASK_FAILED'``, tuple
          ``(OtherFailure(traceback_text), None, None, None)``.

        ``flag`` is 0 for a marshal-encoded result, 1 for a pickled one, and
        is increased by 2 when ``data`` is a location string (built from
        ``LocalFileShuffle.getServerUri()``) rather than the compressed
        result itself.
    """
    try:
        # GC stays off while the task runs; it is collected and re-enabled in
        # the ``finally`` clause below.
        gc.disable()
        task, task_try_id = loads(decompress(task_data))
        ttid = TTID(task_try_id)
        Accumulator.clear()
        result = task.run(ttid.ttid)
        # NOTE(review): the * 1024 assumes ru_maxrss is reported in kilobytes
        # (true on Linux) -- confirm for other platforms.
        env.task_stats.bytes_max_rss = resource.getrusage(
            resource.RUSAGE_SELF).ru_maxrss * 1024
        accUpdate = Accumulator.values()
        MutableDict.flush()

        # Prefer marshal; fall back to pickle when marshal is not applicable
        # or fails.
        if marshalable(result):
            try:
                flag, data = 0, marshal.dumps(result)
            except Exception:
                flag, data = 1, cPickle.dumps(result, -1)
        else:
            flag, data = 1, cPickle.dumps(result, -1)
        data = compress(data)

        if len(data) > TASK_RESULT_LIMIT:
            # Result is too large to send inline: write it to a local output
            # file and send its location instead.
            path = LocalFileShuffle.getOutputFile(0, task.id, ttid.task_try,
                                                  len(data))
            # ``with`` guarantees the handle is closed even if write() raises.
            with open(path, 'wb') as f:
                f.write(data)
            data = '/'.join([LocalFileShuffle.getServerUri()] +
                            path.split('/')[-3:])
            flag += 2

        return 'TASK_FINISHED', cPickle.dumps(
            (Success(), (flag, data), accUpdate, env.task_stats), -1)
    except FetchFailed as e:
        return 'TASK_FAILED', cPickle.dumps((e, None, None, None), -1)
    except:  # noqa: E722
        # NOTE(review): catch-all kept exactly as in the original; it also
        # traps SystemExit/KeyboardInterrupt and reports them as a task
        # failure -- confirm that this is intended before narrowing it.
        import traceback
        msg = traceback.format_exc()
        return 'TASK_FAILED', cPickle.dumps(
            (OtherFailure(msg), None, None, None), -1)
    finally:
        gc.collect()
        gc.enable()
def try_id(self):
    """Return the taskset id built from ``self.id`` and ``self.num_try + 1``."""
    # num_try is incremented after the TaskSet is created (per the original
    # note), so the upcoming try is num_try + 1.
    upcoming_try = self.num_try + 1
    return TTID.make_taskset_id(self.id, upcoming_try)
def killTask(self, task_id, num_try):
    """Ask the driver to kill the given try of a task.

    Args:
        task_id: id of the task, passed to ``TTID.make_ttid``.
        num_try: try number of the task, passed to ``TTID.make_ttid``.
    """
    mesos_task_id = TTID.make_ttid(task_id, num_try)
    # The driver receives an object carrying the id in its ``value`` field.
    target = Dict()
    target.value = mesos_task_id
    self.driver.killTask(target)
def statusUpdate(self, driver, status):
    """Handle a task status update and forward it to the owning taskset.

    Args:
        driver: not used in the body; ``self.driver`` is used instead.
        status: status object providing ``task_id.value``, ``state`` and
            ``get('message')`` / ``get('data')``.

    Behaviour:
        * Updates for tasks whose taskset is no longer active are ignored;
          if such a task is still ``TASK_RUNNING`` it is killed.
        * ``TASK_RUNNING`` is forwarded as-is.
        * For any other state the task is removed from the taskset's
          bookkeeping. ``TASK_FINISHED`` / ``TASK_FAILED`` updates that carry
          data are decoded (downloading the result first when the flag says
          it is stored remotely) and forwarded; a decoding error is reported
          to the taskset as ``TASK_FAILED``. Remaining states are forwarded
          with ``reason or data``.
    """

    def plot_progresses():
        # Redraw one progress line per active taskset (color mode only).
        if self.color:
            total = len(self.active_tasksets)
            logger.info('\x1b[2K\x1b[J\x1b[1A')
            for i, taskset_id in enumerate(self.active_tasksets):
                if i == total - 1:
                    ending = '\x1b[%sA' % total
                else:
                    ending = ''
                active = self.active_tasksets[taskset_id]
                active.progress(ending)

    def fetch_url(url):
        # Download the whole body and always close the response, so a failed
        # read() does not leave the connection open.
        response = urllib.request.urlopen(url)
        try:
            return response.read()
        finally:
            response.close()

    mesos_task_id = status.task_id.value
    state = status.state
    reason = status.get('message')  # set by mesos
    data = status.get('data')
    logger.debug('status update: %s %s', mesos_task_id, state)

    ttid = TTID(mesos_task_id)
    taskset = self.active_tasksets.get(ttid.taskset_id)
    if taskset is None:
        if state == 'TASK_RUNNING':
            logger.debug('kill task %s as its taskset has gone',
                         mesos_task_id)
            self.driver.killTask(Dict(value=mesos_task_id))
        else:
            logger.debug('ignore task %s as its taskset has gone',
                         mesos_task_id)
        return

    if state == 'TASK_RUNNING':
        taskset.statusUpdate(ttid.task_id, ttid.task_try, state)
        if taskset.tasksFinished == 0:
            plot_progresses()
    else:
        if mesos_task_id not in taskset.ttids:
            logger.debug(
                'ignore task %s as it has finished or failed, new msg: %s',
                mesos_task_id, (state, reason))
        else:
            taskset.ttids.remove(mesos_task_id)
            if mesos_task_id in self.ttid_to_agent_id:
                agent_id = self.ttid_to_agent_id[mesos_task_id]
                if agent_id in self.agent_id_to_ttids:
                    self.agent_id_to_ttids[agent_id] -= 1
                del self.ttid_to_agent_id[mesos_task_id]

            if state in ('TASK_FINISHED', 'TASK_FAILED') and data:
                try:
                    # NOTE(review): this unpickles data supplied by the
                    # executor (and, below, data downloaded from a URL it
                    # names); that is only safe if executors are trusted.
                    reason, result, accUpdate, task_stats = cPickle.loads(
                        decode_data(data))
                    if result:
                        flag, data = result
                        if flag >= 2:
                            # flag >= 2: ``data`` is a URL to download the
                            # result from, not the result itself.
                            try:
                                data = fetch_url(data)
                            except IOError:
                                # try again
                                data = fetch_url(data)
                            flag -= 2
                        data = decompress(data)
                        # 0 = marshal-encoded, otherwise pickled.
                        if flag == 0:
                            result = marshal.loads(data)
                        else:
                            result = cPickle.loads(data)
                    taskset.statusUpdate(ttid.task_id, ttid.task_try, state,
                                         reason, result, accUpdate,
                                         task_stats)
                    if state == 'TASK_FINISHED':
                        plot_progresses()
                except Exception as e:
                    logger.warning(
                        'error when cPickle.loads(): %s, data:%s',
                        e, len(data))
                    state = 'TASK_FAILED'
                    taskset.statusUpdate(ttid.task_id, ttid.task_try, state,
                                         'load failed: %s' % e)
            else:
                # killed, lost
                taskset.statusUpdate(ttid.task_id, ttid.task_try, state,
                                     reason or data)