def _fetch_missing(cls, key):
    """Fetch every remote shard of a mutable-dict entry and merge them.

    Each URL returned by the tracker serves a blob laid out as
    ``<I little-endian total length><compressed pickle of {k: (value, generation)}>``.
    Entries with the same key are merged: a newer generation wins; equal
    generations are collected into a ConflictValues wrapper.

    Returns:
        dict mapping key -> (value-or-ConflictValues, generation).
    Raises:
        IOError: on a non-200 response or a length mismatch.
    """
    result = {}
    urls = env.trackerClient.call(GetValueMessage('mutable_dict:%s' % key))
    for url in urls:
        # Context manager guarantees the HTTP response is closed even if
        # an error is raised mid-read (the original leaked it).
        with urllib.request.urlopen(url) as f:
            if f.code is not None and f.code != 200:
                raise IOError('Open %s failed:%s' % (url, f.code))
            data = f.read()
        if len(data) < 4:
            raise IOError('Transfer %s failed: %s received' % (url, len(data)))
        length, = struct.unpack('<I', data[:4])
        if length != len(data):
            raise IOError('Transfer %s failed: %s received, %s expected' %
                          (url, len(data), length))
        # NOTE(review): unpickling data fetched over HTTP -- this assumes
        # trusted cluster-internal traffic; do not expose these URLs.
        data = cPickle.loads(decompress(data[4:]))
        for k, v in data.items():
            if k in result:
                r = result[k]
                if v[1] == r[1]:
                    # Same generation on both sides: keep every value as a
                    # conflict set (flattening any existing ConflictValues).
                    r0 = r[0]
                    v0 = v[0]
                    merged = r0.value if isinstance(r0, ConflictValues) else [r0]
                    merged += v0.value if isinstance(v0, ConflictValues) else [v0]
                    result[k] = (ConflictValues(merged), r[1])
                else:
                    # Different generations: the newer one wins.
                    result[k] = v if v[1] > r[1] else r
            else:
                result[k] = v
    return result
def run_task(task_data):
    """Deserialize and execute one task, returning its serialized outcome.

    Args:
        task_data: compressed pickle of ``(task, task_try_id)``.
    Returns:
        On success: ``(TaskState.finished, pickled ((flag, data), accUpdate, stats))``
        where ``flag`` encodes the codec (0 = marshal, 1 = pickle) and +2 means
        the payload was written to a shuffle file and ``data`` is its URI.
        On failure: a 4-tuple ``(TaskState.failed, reason, message, pickled exc)``.
    """
    try:
        gc.disable()  # avoid GC pauses while the task body runs
        task, task_try_id = loads(decompress(task_data))
        ttid = TTID(task_try_id)
        Accumulator.clear()
        result = task.run(ttid.ttid)
        # ru_maxrss is in KiB on Linux; convert to bytes.
        env.task_stats.bytes_max_rss = resource.getrusage(
            resource.RUSAGE_SELF).ru_maxrss * 1024
        accUpdate = Accumulator.values()
        MutableDict.flush()
        if marshalable(result):
            try:
                flag, data = 0, marshal.dumps(result)
            except Exception:
                # marshalable() is a heuristic; fall back to pickle.
                flag, data = 1, cPickle.dumps(result, -1)
        else:
            flag, data = 1, cPickle.dumps(result, -1)
        data = compress(data)
        if len(data) > TASK_RESULT_LIMIT:
            # Result too large for the status message: spill to a shuffle
            # file and return its URI instead.
            # shuffle_id start from 1
            swd = ShuffleWorkDir(0, task.id, ttid.task_try)
            tmppath = swd.alloc_tmp(len(data))
            with open(tmppath, 'wb') as f:
                f.write(data)
            # (the redundant f.close() inside the with-block was removed)
            path = swd.export(tmppath)
            data = '/'.join([env.server_uri] + path.split('/')[-3:])
            flag += 2
        return TaskState.finished, cPickle.dumps(
            ((flag, data), accUpdate, env.task_stats), -1)
    except FetchFailed as e:
        return TaskState.failed, TaskEndReason.fetch_failed, str(
            e), cPickle.dumps(e)
    except Exception as e:
        import traceback
        msg = traceback.format_exc()
        ename = e.__class__.__name__
        # These exception classes indicate a bug in user code: retrying the
        # task cannot succeed, so mark them FATAL.
        fatal_exceptions = (DparkUserFatalError, ArithmeticError,
                            ValueError, LookupError, SyntaxError,
                            TypeError, AssertionError)
        prefix = "FATAL" if isinstance(e, fatal_exceptions) else "FAILED"
        return TaskState.failed, '{}_EXCEPTION_{}'.format(
            prefix, ename), msg, cPickle.dumps(e)
    finally:
        gc.collect()
        gc.enable()
def run_task(task_data):
    """Deserialize and execute one task, returning its serialized outcome.

    Args:
        task_data: compressed pickle of ``(task, task_try_id)``.
    Returns:
        On success: ``(TaskState.finished, pickled ((flag, data), accUpdate, stats))``
        where ``flag`` encodes the codec (0 = marshal, 1 = pickle) and +2 means
        the payload was written to a shuffle file and ``data`` is its URI.
        On failure: a 4-tuple ``(TaskState.failed, reason, message, pickled exc)``.
    """
    try:
        gc.disable()  # avoid GC pauses while the task body runs
        task, task_try_id = loads(decompress(task_data))
        ttid = TTID(task_try_id)
        Accumulator.clear()
        result = task.run(ttid.ttid)
        # ru_maxrss is in KiB on Linux; convert to bytes.
        env.task_stats.bytes_max_rss = resource.getrusage(
            resource.RUSAGE_SELF).ru_maxrss * 1024
        accUpdate = Accumulator.values()
        MutableDict.flush()
        if marshalable(result):
            try:
                flag, data = 0, marshal.dumps(result)
            except Exception:
                # marshalable() is a heuristic; fall back to pickle.
                flag, data = 1, cPickle.dumps(result, -1)
        else:
            flag, data = 1, cPickle.dumps(result, -1)
        data = compress(data)
        if len(data) > TASK_RESULT_LIMIT:
            # Result too large for the status message: spill to a shuffle
            # file and return its URI instead.
            # shuffle_id start from 1
            swd = ShuffleWorkDir(0, task.id, ttid.task_try)
            tmppath = swd.alloc_tmp(len(data))
            with open(tmppath, 'wb') as f:
                f.write(data)
            # (the redundant f.close() inside the with-block was removed)
            path = swd.export(tmppath)
            data = '/'.join(
                [env.server_uri] + path.split('/')[-3:]
            )
            flag += 2
        return TaskState.finished, cPickle.dumps(
            ((flag, data), accUpdate, env.task_stats), -1)
    except FetchFailed as e:
        return TaskState.failed, TaskEndReason.fetch_failed, str(e), cPickle.dumps(e)
    except Exception as e:
        import traceback
        msg = traceback.format_exc()
        ename = e.__class__.__name__
        # These exception classes indicate a bug in user code: retrying the
        # task cannot succeed, so mark them FATAL.
        fatal_exceptions = (DparkUserFatalError, ArithmeticError,
                            ValueError, LookupError, SyntaxError,
                            TypeError, AssertionError)
        prefix = "FATAL" if isinstance(e, fatal_exceptions) else "FAILED"
        return TaskState.failed, '{}_EXCEPTION_{}'.format(prefix, ename), msg, cPickle.dumps(e)
    finally:
        gc.collect()
        gc.enable()
def run_task(task_data):
    """Deserialize and execute one task, returning its serialized outcome.

    Args:
        task_data: compressed pickle of ``(task, task_try_id)``.
    Returns:
        On success: ``(TaskState.finished, pickled ((flag, data), accUpdate, stats))``
        where ``flag`` encodes the codec (0 = marshal, 1 = pickle) and +2 means
        the payload was written to a local shuffle file and ``data`` is its URI.
        On failure: a 4-tuple ``(TaskState.failed, reason, message, pickled exc)``.
    """
    try:
        gc.disable()  # avoid GC pauses while the task body runs
        task, task_try_id = loads(decompress(task_data))
        ttid = TTID(task_try_id)
        Accumulator.clear()
        result = task.run(ttid.ttid)
        # ru_maxrss is in KiB on Linux; convert to bytes.
        env.task_stats.bytes_max_rss = resource.getrusage(
            resource.RUSAGE_SELF).ru_maxrss * 1024
        accUpdate = Accumulator.values()
        MutableDict.flush()
        if marshalable(result):
            try:
                flag, data = 0, marshal.dumps(result)
            except Exception:
                # marshalable() is a heuristic; fall back to pickle.
                flag, data = 1, cPickle.dumps(result, -1)
        else:
            flag, data = 1, cPickle.dumps(result, -1)
        data = compress(data)
        if len(data) > TASK_RESULT_LIMIT:
            # Result too large for the status message: spill to a local
            # shuffle file and return its URI instead.
            path = LocalFileShuffle.getOutputFile(0, task.id, ttid.task_try,
                                                  len(data))
            # with-statement closes the file even if write() raises
            # (the original open/write/close leaked it on error).
            with open(path, 'wb') as f:
                f.write(data)
            data = '/'.join([LocalFileShuffle.getServerUri()] +
                            path.split('/')[-3:])
            flag += 2
        return TaskState.finished, cPickle.dumps(
            ((flag, data), accUpdate, env.task_stats), -1)
    except FetchFailed as e:
        return TaskState.failed, TaskEndReason.fetch_failed, str(
            e), cPickle.dumps(e)
    except Exception as e:
        import traceback
        msg = traceback.format_exc()
        ename = e.__class__.__name__
        return TaskState.failed, 'FAILED_EXCEPTION_{}'.format(
            ename), msg, cPickle.dumps(e)
    finally:
        gc.collect()
        gc.enable()
def run_task(task_data):
    """Deserialize and execute one task, returning its serialized outcome.

    Args:
        task_data: compressed pickle of ``(task, task_try_id)``.
    Returns:
        ``('TASK_FINISHED', pickled (Success(), (flag, data), accUpdate, stats))``
        on success, where ``flag`` encodes the codec (0 = marshal, 1 = pickle)
        and +2 means the payload was spilled to a local shuffle file;
        ``('TASK_FAILED', pickled (failure, None, None, None))`` otherwise.
    """
    try:
        gc.disable()  # avoid GC pauses while the task body runs
        task, task_try_id = loads(decompress(task_data))
        ttid = TTID(task_try_id)
        Accumulator.clear()
        result = task.run(ttid.ttid)
        # ru_maxrss is in KiB on Linux; convert to bytes.
        env.task_stats.bytes_max_rss = resource.getrusage(
            resource.RUSAGE_SELF).ru_maxrss * 1024
        accUpdate = Accumulator.values()
        MutableDict.flush()
        if marshalable(result):
            try:
                flag, data = 0, marshal.dumps(result)
            except Exception:
                # marshalable() is a heuristic; fall back to pickle.
                flag, data = 1, cPickle.dumps(result, -1)
        else:
            flag, data = 1, cPickle.dumps(result, -1)
        data = compress(data)
        if len(data) > TASK_RESULT_LIMIT:
            # Result too large for the status message: spill to a local
            # shuffle file and return its URI instead.
            path = LocalFileShuffle.getOutputFile(0, task.id, ttid.task_try,
                                                  len(data))
            # with-statement closes the file even if write() raises
            # (the original open/write/close leaked it on error).
            with open(path, 'wb') as f:
                f.write(data)
            data = '/'.join([LocalFileShuffle.getServerUri()] +
                            path.split('/')[-3:])
            flag += 2
        return 'TASK_FINISHED', cPickle.dumps(
            (Success(), (flag, data), accUpdate, env.task_stats), -1)
    except FetchFailed as e:
        return 'TASK_FAILED', cPickle.dumps((e, None, None, None), -1)
    except Exception:
        # Was a bare ``except:``, which also swallowed SystemExit and
        # KeyboardInterrupt; narrowed to Exception so the worker can still
        # be interrupted/terminated cleanly.
        import traceback
        msg = traceback.format_exc()
        return 'TASK_FAILED', cPickle.dumps(
            (OtherFailure(msg), None, None, None), -1)
    finally:
        gc.collect()
        gc.enable()
def load_stream(self, stream):
    """Yield deserialized values from a stream of sorted batches.

    Each batch is a 5-byte header (length + codec + sorted flag, decoded by
    ``unpack_header``) followed by ``length`` bytes of compressed
    marshal/pickle data.

    Args:
        stream: a file-like object supporting ``read(n)``.
    Yields:
        The individual values of every batch, in stream order.
    Raises:
        IOError: if the stream ends in the middle of a header or a batch.
    """
    while True:
        head = stream.read(5)
        if not head:
            return  # clean EOF: stream ended exactly on a batch boundary
        if len(head) < 5:
            # A partial header means the stream was truncated mid-batch;
            # fail loudly instead of crashing inside unpack_header.
            raise IOError("stream truncated: got %d bytes of 5-byte header"
                          % len(head))
        length, is_marshal, is_sorted = unpack_header(head)
        assert (is_sorted)
        buf = stream.read(length)
        if len(buf) < length:
            raise IOError("length not match: expected %d, but got %d" %
                          (length, len(buf)))
        buf = decompress(buf)
        AutoBatchedSerializer.size_loaded += len(buf)
        if is_marshal:
            vs = marshal.loads(buf)
        else:
            vs = pickle.loads(buf)
        for v in vs:
            yield v
def unsorted_batches(self):
    """Yield deserialized unsorted batches fetched via ``self.open()``.

    Reads ``(5-byte header, payload)`` records until EOF, verifying both
    per-batch lengths and the total byte count against the expected size.

    Yields:
        One deserialized batch (the marshal/pickle payload) per record.
    Raises:
        IOError: on a per-batch length mismatch or a total-size mismatch.
    """
    f = None
    try:
        f, exp_size = self.open()
        total_size = 0
        while True:
            head = f.read(5)
            if len(head) == 0:
                break  # EOF
            length, is_marshal, is_sorted = unpack_header(head)
            assert (not is_sorted)
            total_size += length + 5
            d = f.read(length)
            if length != len(d):
                raise IOError(
                    "length not match: expected %d, but got %d" % (length,
                                                                   len(d)))
            d = decompress(d)
            if is_marshal:
                items = marshal.loads(d)
            else:
                try:
                    items = pickle.loads(d)
                except Exception:
                    # Was a bare ``except:``; narrowed so SystemExit and
                    # KeyboardInterrupt are not swallowed by the retry.
                    # Best-effort single retry after a short pause.
                    time.sleep(1)
                    items = pickle.loads(d)
            yield items
        if total_size != exp_size:
            raise IOError(
                "fetch size not match: expected %d, but got %d" % (exp_size,
                                                                   total_size))
        env.task_stats.bytes_fetch += exp_size
    finally:
        if f:
            f.close()
def unsorted_batches(self):
    """Yield deserialized unsorted batches fetched via ``self.open()``.

    Reads ``(5-byte header, payload)`` records until EOF, verifying both
    per-batch lengths and the total byte count against the expected size.

    Yields:
        One deserialized batch (the marshal/pickle payload) per record.
    Raises:
        IOError: on a per-batch length mismatch or a total-size mismatch.
    """
    f = None
    try:
        f, exp_size = self.open()
        total_size = 0
        while True:
            head = f.read(5)
            if len(head) == 0:
                break  # EOF
            length, is_marshal, is_sorted = unpack_header(head)
            assert (not is_sorted)
            total_size += length + 5
            d = f.read(length)
            if length != len(d):
                raise IOError("length not match: expected %d, but got %d"
                              % (length, len(d)))
            d = decompress(d)
            if is_marshal:
                items = marshal.loads(d)
            else:
                try:
                    items = pickle.loads(d)
                except Exception:
                    # Was a bare ``except:``; narrowed so SystemExit and
                    # KeyboardInterrupt are not swallowed by the retry.
                    # Best-effort single retry after a short pause.
                    time.sleep(1)
                    items = pickle.loads(d)
            yield items
        if total_size != exp_size:
            raise IOError("fetch size not match: expected %d, but got %d"
                          % (exp_size, total_size))
        env.task_stats.bytes_fetch += exp_size
    finally:
        if f:
            f.close()
def _fetch_missing(cls, key):
    """Fetch every remote shard of a mutable-dict entry and merge them.

    Each URL returned by the tracker serves a blob laid out as
    ``<I little-endian total length><compressed pickle of {k: (value, generation)}>``.
    Entries with the same key are merged: a newer generation wins; equal
    generations are collected into a ConflictValues wrapper.

    Returns:
        dict mapping key -> (value-or-ConflictValues, generation).
    Raises:
        IOError: on a non-200 response or a length mismatch.
    """
    result = {}
    urls = env.trackerClient.call(GetValueMessage('mutable_dict:%s' % key))
    for url in urls:
        # Context manager guarantees the HTTP response is closed even if
        # an error is raised mid-read (the original leaked it).
        with urllib.request.urlopen(url) as f:
            if f.code is not None and f.code != 200:
                raise IOError('Open %s failed:%s' % (url, f.code))
            data = f.read()
        if len(data) < 4:
            raise IOError('Transfer %s failed: %s received' % (url, len(data)))
        length, = struct.unpack('<I', data[:4])
        if length != len(data):
            raise IOError('Transfer %s failed: %s received, %s expected' %
                          (url, len(data), length))
        # NOTE(review): unpickling data fetched over HTTP -- this assumes
        # trusted cluster-internal traffic; do not expose these URLs.
        data = cPickle.loads(decompress(data[4:]))
        for k, v in data.items():
            if k in result:
                r = result[k]
                if v[1] == r[1]:
                    # Same generation on both sides: keep every value as a
                    # conflict set (flattening any existing ConflictValues).
                    r0 = r[0]
                    v0 = v[0]
                    merged = r0.value if isinstance(
                        r0, ConflictValues) else [r0]
                    merged += v0.value if isinstance(
                        v0, ConflictValues) else [v0]
                    result[k] = (ConflictValues(merged), r[1])
                else:
                    # Different generations: the newer one wins.
                    result[k] = v if v[1] > r[1] else r
            else:
                result[k] = v
    return result
def statusUpdate(self, driver, status):
    """Handle a Mesos task status update.

    Updates taskset/agent bookkeeping for the task, decodes the task
    result carried in (or referenced by) the status `data` field, and
    forwards the update to the owning taskset.

    Args:
        driver: the Mesos scheduler driver (unused here directly).
        status: Mesos TaskStatus-like object with `task_id`, `state`,
            and optional `message`/`data` fields.
    """
    def plot_progresses():
        # Redraw the progress line of every active taskset in place,
        # using ANSI escape sequences (only when color output is on).
        if self.color:
            total = len(self.active_tasksets)
            logger.info('\x1b[2K\x1b[J\x1b[1A')
            for i, taskset_id in enumerate(self.active_tasksets):
                if i == total - 1:
                    # after the last taskset, move the cursor back up
                    ending = '\x1b[%sA' % total
                else:
                    ending = ''
                tasksets = self.active_tasksets[taskset_id]
                tasksets.progress(ending)

    mesos_task_id = status.task_id.value
    state = status.state
    reason = status.get('message')  # set by mesos
    data = status.get('data')
    logger.debug('status update: %s %s', mesos_task_id, state)

    ttid = TTID(mesos_task_id)
    taskset = self.active_tasksets.get(ttid.taskset_id)
    if taskset is None:
        # Owning taskset already finished or was removed: a task still
        # running is orphaned and must be killed; anything else is noise.
        if state == 'TASK_RUNNING':
            logger.debug('kill task %s as its taskset has gone',
                         mesos_task_id)
            self.driver.killTask(Dict(value=mesos_task_id))
        else:
            logger.debug('ignore task %s as its taskset has gone',
                         mesos_task_id)
        return

    if state == 'TASK_RUNNING':
        taskset.statusUpdate(ttid.task_id, ttid.task_try, state)
        if taskset.tasksFinished == 0:
            plot_progresses()
    else:
        # Terminal state: drop the task from the pending set and from
        # the per-agent accounting before processing its result.
        if mesos_task_id not in taskset.ttids:
            logger.debug(
                'ignore task %s as it has finished or failed, new msg: %s',
                mesos_task_id, (state, reason))
        else:
            taskset.ttids.remove(mesos_task_id)
            if mesos_task_id in self.ttid_to_agent_id:
                agent_id = self.ttid_to_agent_id[mesos_task_id]
                if agent_id in self.agent_id_to_ttids:
                    self.agent_id_to_ttids[agent_id] -= 1
                del self.ttid_to_agent_id[mesos_task_id]

            if state in ('TASK_FINISHED', 'TASK_FAILED') and data:
                try:
                    reason, result, accUpdate, task_stats = cPickle.loads(
                        decode_data(data))
                    if result:
                        flag, data = result
                        if flag >= 2:
                            # flag >= 2: the result was too large to inline,
                            # so `data` is a URL; fetch it (one retry).
                            try:
                                data = urllib.request.urlopen(data).read()
                            except IOError:
                                # try again
                                data = urllib.request.urlopen(data).read()
                            flag -= 2
                        data = decompress(data)
                        # remaining flag selects the codec:
                        # 0 = marshal, 1 = pickle
                        if flag == 0:
                            result = marshal.loads(data)
                        else:
                            result = cPickle.loads(data)
                    taskset.statusUpdate(ttid.task_id, ttid.task_try,
                                         state, reason, result, accUpdate,
                                         task_stats)
                    if state == 'TASK_FINISHED':
                        plot_progresses()
                except Exception as e:
                    # Result could not be decoded: demote to TASK_FAILED
                    # with a load-failure reason.
                    logger.warning('error when cPickle.loads(): %s, data:%s',
                                   e, len(data))
                    state = 'TASK_FAILED'
                    taskset.statusUpdate(ttid.task_id, ttid.task_try,
                                         state, 'load failed: %s' % e)
            else:  # killed, lost
                taskset.statusUpdate(ttid.task_id, ttid.task_try, state,
                                     reason or data)
def statusUpdate(self, driver, status):
    """Handle a Mesos task status update (job-based scheduler variant).

    Updates job/agent bookkeeping for the task, decodes the task result
    carried in (or referenced by) the status `data` field, and forwards
    the update to the owning job.

    Args:
        driver: the Mesos scheduler driver (unused here directly).
        status: Mesos TaskStatus-like object with `task_id`, `state`,
            and optional `message`/`data` fields.
    """
    def plot_progresses():
        # Redraw the progress line of every active job in place, using
        # ANSI escape sequences (only when color output is on).
        if self.color:
            total = len(self.activeJobs)
            logger.info('\x1b[2K\x1b[J\x1b[1A')
            for i, job_id in enumerate(self.activeJobs):
                if i == total - 1:
                    # after the last job, move the cursor back up
                    ending = '\x1b[%sA' % total
                else:
                    ending = ''
                jobs = self.activeJobs[job_id]
                jobs.progress(ending)

    tid = status.task_id.value
    state = status.state
    logger.debug('status update: %s %s', tid, state)

    jid = self.taskIdToJobId.get(tid)
    # task id format is "<job>:<task>:<try>"
    _, task_id, tried = list(map(int, tid.split(':')))
    if state == 'TASK_RUNNING':
        if jid in self.activeJobs:
            job = self.activeJobs[jid]
            job.statusUpdate(task_id, tried, state)
            if job.tasksFinished == 0:
                plot_progresses()
        else:
            # Owning job already finished/removed: kill the orphan.
            logger.debug('kill task %s as its job has gone', tid)
            self.driver.killTask(Dict(value=tid))
        return

    # Terminal state: drop the task from all bookkeeping maps.
    self.taskIdToJobId.pop(tid, None)
    if jid in self.jobTasks:
        self.jobTasks[jid].remove(tid)
    if tid in self.taskIdToAgentId:
        agent_id = self.taskIdToAgentId[tid]
        if agent_id in self.agentTasks:
            self.agentTasks[agent_id] -= 1
        del self.taskIdToAgentId[tid]

    if jid not in self.activeJobs:
        logger.debug('ignore task %s as its job has gone', tid)
        return

    job = self.activeJobs[jid]
    reason = status.get('message')
    data = status.get('data')
    if state in ('TASK_FINISHED', 'TASK_FAILED') and data:
        try:
            reason, result, accUpdate, task_stats = cPickle.loads(
                decode_data(data))
            if result:
                flag, data = result
                if flag >= 2:
                    # flag >= 2: the result was too large to inline, so
                    # `data` is a URL; fetch it (one retry on IOError).
                    try:
                        data = urllib.request.urlopen(data).read()
                    except IOError:
                        # try again
                        data = urllib.request.urlopen(data).read()
                    flag -= 2
                data = decompress(data)
                # remaining flag selects the codec: 0 = marshal, 1 = pickle
                if flag == 0:
                    result = marshal.loads(data)
                else:
                    result = cPickle.loads(data)
        except Exception as e:
            # Result could not be decoded: demote to TASK_FAILED with a
            # load-failure reason.
            logger.warning('error when cPickle.loads(): %s, data:%s',
                           e, len(data))
            state = 'TASK_FAILED'
            job.statusUpdate(task_id, tried, state, 'load failed: %s' % e)
            return
        else:
            job.statusUpdate(task_id, tried, state, reason, result,
                             accUpdate, task_stats)
            if state == 'TASK_FINISHED':
                plot_progresses()
            return
    # killed, lost, load failed
    job.statusUpdate(task_id, tried, state,
                     reason or data)