def _fetch_missing(self, key):
    """Fetch every remote update for a mutable-dict ``key`` and merge them.

    Asks the tracker for the URLs holding updates for the key, downloads
    and unpickles each payload, then merges entries by their generation
    number: equal generations from different sources are combined into a
    ConflictValues; otherwise the newer generation wins.

    Returns a dict mapping entry key -> (value, generation) tuple or
    ConflictValues.  Raises IOError on any transfer error.
    """
    result = {}
    urls = env.trackerClient.call(GetValueMessage('mutable_dict:%s' % key))
    for url in urls:
        f = urllib.urlopen(url)
        try:
            if f.code is not None and f.code != 200:
                raise IOError('Open %s failed:%s' % (url, f.code))
            data = f.read()
        finally:
            # the handle was leaked on every iteration before; always release it
            f.close()
        if len(data) < 4:
            raise IOError('Transfer %s failed: %s received' % (url, len(data)))
        # first 4 bytes: little-endian total payload length used as a self-check
        length, = struct.unpack('<I', data[:4])
        if length != len(data):
            raise IOError('Transfer %s failed: %s received, %s expected' % (
                url, len(data), length))
        data = cPickle.loads(decompress(data[4:]))
        for k, v in data.items():
            if k in result:
                r = result[k]
                if v[1] == r[1]:
                    # same generation seen from two sources: keep both as conflicts
                    r = r.value if isinstance(r, ConflictValues) else [r]
                    r += v.value if isinstance(v, ConflictValues) else [v]
                    result[k] = ConflictValues(r)
                else:
                    # different generations: keep the newer one
                    result[k] = v if v[1] > r[1] else r
            else:
                result[k] = v
    return result
def run_task(task_data):
    """Deserialize and execute one task; return (mesos state, pickled payload).

    The success payload is (Success(), (flag, data), accUpdate) where
    ``flag`` encodes the serializer (0 = marshal, 1 = cPickle) and +2 means
    ``data`` is a URL to an out-of-band result file rather than inline bytes.
    """
    try:
        gc.disable()
        task, ntry = cPickle.loads(decompress(task_data))
        setproctitle('dpark worker %s: run task %s' % (Script, task))
        Accumulator.clear()
        result = task.run(ntry)
        accUpdate = Accumulator.values()
        # marshal is faster but only handles plain builtin types
        if marshalable(result):
            flag, data = 0, marshal.dumps(result)
        else:
            flag, data = 1, cPickle.dumps(result, -1)
        data = compress(data)
        if len(data) > TASK_RESULT_LIMIT:
            # too big to inline in the status update: spill to a local file
            # served over HTTP and return its URL instead
            path = LocalFileShuffle.getOutputFile(0, ntry, task.id, len(data))
            # binary mode: `data` is compressed bytes; also ensures the file
            # is closed even if write() fails
            with open(path, 'wb') as f:
                f.write(data)
            data = '/'.join([LocalFileShuffle.getServerUri()] + path.split('/')[-3:])
            flag += 2
        return mesos_pb2.TASK_FINISHED, cPickle.dumps(
            (Success(), (flag, data), accUpdate), -1)
    except FetchFailed as e:
        # modern form of `except FetchFailed, e` (works on py2.6+ and py3)
        return mesos_pb2.TASK_FAILED, cPickle.dumps((e, None, None), -1)
def launchTask(self, driver, task):
    """Dispatch a mesos task to an idle worker pool.

    Replies TASK_RUNNING immediately, then executes the task
    asynchronously; when it completes, the pool is recycled unless the
    idle/task/memory limits are exceeded, in which case it is terminated.
    On any setup error the task is reported TASK_LOST with a traceback.
    """
    try:
        t, ntry = cPickle.loads(decompress(task.data))
        reply_status(driver, task, mesos_pb2.TASK_RUNNING)
        logging.debug("launch task %s", t.id)
        pool = self.get_idle_worker()
        self.busy_workers[task.task_id.value] = (task, pool)

        def callback(res):
            # explicit unpack replaces the py2-only tuple-parameter
            # signature `def callback((state, data))`
            state, data = res
            with self.lock:
                if task.task_id.value not in self.busy_workers:
                    return  # already reclaimed (e.g. the task was killed)
                _, pool = self.busy_workers.pop(task.task_id.value)
                pool.done += 1
                reply_status(driver, task, state, data)
                if (len(self.idle_workers) + len(self.busy_workers) < self.parallel
                        and len(self.idle_workers) < MAX_IDLE_WORKERS
                        and pool.done < MAX_TASKS_PER_WORKER
                        and get_pool_memory(pool) < get_task_memory(task)):
                    # maybe memory leak in executor
                    self.idle_workers.append((time.time(), pool))
                else:
                    try:
                        pool.terminate()
                    except:
                        # best-effort: the worker may already be gone
                        pass

        pool.apply_async(run_task, [t, ntry], callback=callback)
    except Exception:
        import traceback
        msg = traceback.format_exc()
        reply_status(driver, task, mesos_pb2.TASK_LOST, msg)
        return
def run_task(task_data):
    """Deserialize and execute one task; return (mesos state, pickled payload).

    The success payload is (task.id, Success(), (flag, data), accUpdate);
    ``flag`` encodes the serializer (0 = marshal, 1 = cPickle) and +2 means
    ``data`` is a URL to a spilled result file.  Any exception is reported
    as (task.id, OtherFailure(traceback), None, None).
    """
    try:
        gc.disable()
        task, ntry = cPickle.loads(decompress(task_data))
        setproctitle('dpark worker %s: run task %s' % (Script, task))
        Accumulator.clear()
        result = task.run(ntry)
        accUpdate = Accumulator.values()
        # marshal is faster but only handles plain builtin types
        if marshalable(result):
            flag, data = 0, marshal.dumps(result)
        else:
            flag, data = 1, cPickle.dumps(result, -1)
        data = compress(data)
        if len(data) > TASK_RESULT_LIMIT:
            # too big to inline: spill under WORKDIR and ship a URL instead
            workdir = env.get('WORKDIR')
            name = 'task_%s_%s.result' % (task.id, ntry)
            path = os.path.join(workdir, name)
            # binary mode for compressed bytes; `with` guarantees close
            with open(path, 'wb') as f:
                f.write(data)
            data = LocalFileShuffle.getServerUri() + '/' + name
            flag += 2
        return mesos_pb2.TASK_FINISHED, cPickle.dumps(
            (task.id, Success(), (flag, data), accUpdate), -1)
    except Exception:
        # modern form of `except Exception, e` (the bound name was unused)
        import traceback
        msg = traceback.format_exc()
        return mesos_pb2.TASK_FAILED, cPickle.dumps(
            (task.id, OtherFailure(msg), None, None), -1)
def statusUpdate(self, driver, status):
    """Handle a mesos task status update and forward it to the owning job.

    TASK_RUNNING updates are relayed directly (or the task is killed if
    its job is gone).  For terminal states the bookkeeping maps are
    cleaned up first, then the pickled (reason, result, accUpdate)
    payload is decoded -- fetching and decompressing out-of-band results
    when flag >= 2 -- and the job is notified.
    """
    tid = status.task_id.value
    state = status.state
    logger.debug("status update: %s %s", tid, state)
    jid = self.taskIdToJobId.get(tid)
    # task ids have the form "jobid:taskid:tried"
    _, task_id, tried = map(int, tid.split(':'))
    if state == mesos_pb2.TASK_RUNNING:
        if jid in self.activeJobs:
            job = self.activeJobs[jid]
            job.statusUpdate(task_id, tried, state)
        else:
            logger.debug('kill task %s as its job has gone', tid)
            self.driver.killTask(mesos_pb2.TaskID(value=tid))
        return
    # terminal state: drop all bookkeeping for this task
    self.taskIdToJobId.pop(tid, None)
    if jid in self.jobTasks:
        self.jobTasks[jid].remove(tid)
    if tid in self.taskIdToSlaveId:
        slave_id = self.taskIdToSlaveId[tid]
        if slave_id in self.slaveTasks:
            self.slaveTasks[slave_id] -= 1
        del self.taskIdToSlaveId[tid]
    if jid not in self.activeJobs:
        logger.debug('ignore task %s as its job has gone', tid)
        return
    job = self.activeJobs[jid]
    if state in (mesos_pb2.TASK_FINISHED, mesos_pb2.TASK_FAILED) and status.data:
        try:
            reason, result, accUpdate = cPickle.loads(status.data)
            if result:
                flag, data = result
                if flag >= 2:
                    # flag >= 2: result was spilled to a file; fetch it by URL
                    try:
                        data = urllib.urlopen(data).read()
                    except IOError:
                        # try again
                        data = urllib.urlopen(data).read()
                    flag -= 2
                data = decompress(data)
                # flag 0: marshal-encoded; otherwise cPickle
                if flag == 0:
                    result = marshal.loads(data)
                else:
                    result = cPickle.loads(data)
        except Exception, e:
            logger.warning("error when cPickle.loads(): %s, data:%s", e, len(status.data))
            state = mesos_pb2.TASK_FAILED
            return job.statusUpdate(task_id, tried, mesos_pb2.TASK_FAILED, 'load failed: %s' % e)
        else:
            # decoded cleanly: deliver the full result to the job
            return job.statusUpdate(task_id, tried, state, reason, result, accUpdate)
def statusUpdate(self, driver, status):
    """Handle a mesos task status update and forward it to the owning job.

    TASK_RUNNING updates are relayed directly (or the task is killed if
    its job is gone).  For terminal states the bookkeeping maps are
    cleaned up first, then the pickled (reason, result, accUpdate)
    payload is decoded -- fetching and decompressing out-of-band results
    when flag >= 2 -- and the job is notified.
    """
    tid = status.task_id.value
    state = status.state
    logger.debug("status update: %s %s", tid, state)
    jid = self.taskIdToJobId.get(tid)
    # task ids have the form "jobid:taskid:tried"
    _, task_id, tried = map(int, tid.split(':'))
    if state == mesos_pb2.TASK_RUNNING:
        if jid in self.activeJobs:
            job = self.activeJobs[jid]
            job.statusUpdate(task_id, tried, state)
        else:
            logger.debug('kill task %s as its job has gone', tid)
            self.driver.killTask(mesos_pb2.TaskID(value=tid))
        return
    # terminal state: drop all bookkeeping for this task
    self.taskIdToJobId.pop(tid, None)
    if jid in self.jobTasks:
        self.jobTasks[jid].remove(tid)
    if tid in self.taskIdToSlaveId:
        slave_id = self.taskIdToSlaveId[tid]
        if slave_id in self.slaveTasks:
            self.slaveTasks[slave_id] -= 1
        del self.taskIdToSlaveId[tid]
    if jid not in self.activeJobs:
        logger.debug('ignore task %s as its job has gone', tid)
        return
    job = self.activeJobs[jid]
    if state in (mesos_pb2.TASK_FINISHED, mesos_pb2.TASK_FAILED) and status.data:
        try:
            reason,result,accUpdate = cPickle.loads(status.data)
            if result:
                flag, data = result
                if flag >= 2:
                    # flag >= 2: result was spilled to a file; fetch it by URL
                    try:
                        data = urllib.urlopen(data).read()
                    except IOError:
                        # try again
                        data = urllib.urlopen(data).read()
                    flag -= 2
                data = decompress(data)
                # flag 0: marshal-encoded; otherwise cPickle
                if flag == 0:
                    result = marshal.loads(data)
                else:
                    result = cPickle.loads(data)
        except Exception, e:
            logger.warning("error when cPickle.loads(): %s, data:%s", e, len(status.data))
            state = mesos_pb2.TASK_FAILED
            return job.statusUpdate(task_id, tried, mesos_pb2.TASK_FAILED, 'load failed: %s' % e)
        else:
            # decoded cleanly: deliver the full result to the job
            return job.statusUpdate(task_id, tried, state, reason, result, accUpdate)
def run_task(task_data): try: gc.disable() task, ntry = cPickle.loads(decompress(task_data)) Accumulator.clear() result = task.run(ntry) accUpdate = Accumulator.values() MutableDict.flush() if marshalable(result): try: flag, data = 0, marshal.dumps(result) except Exception, e: flag, data = 1, cPickle.dumps(result, -1) else:
def fetch_one(self, uri, shuffleId, part, reduceId):
    """Fetch and deserialize one shuffle output block, with retries.

    Local outputs are read through a file:// URL; remote ones over HTTP.
    The payload layout is: 1 flag byte (b'm' marshal / b'p' pickle),
    4-byte native-endian total length, then compressed data.  Retries
    once on any error, then raises FetchFailed.
    """
    if uri == LocalFileShuffle.getServerUri():
        # urllib can open local file
        url = 'file://' + LocalFileShuffle.getOutputFile(
            shuffleId, part, reduceId)
    else:
        url = "%s/%d/%d/%d" % (uri, shuffleId, part, reduceId)
    logger.debug("fetch %s", url)
    tries = 2
    while True:
        try:
            f = urllib.request.urlopen(url)
            try:
                if f.code == 404:
                    raise IOError("not found")
                d = f.read()
            finally:
                # close on every path (previously leaked whenever
                # read/unpack/decode raised after a 200 response)
                f.close()
            flag = d[:1]
            length, = struct.unpack("I", d[1:5])
            if length != len(d):
                raise ValueError(
                    "length not match: expected %d, but got %d" % (
                        length, len(d)))
            d = decompress(d[5:])
            if flag == b'm':
                d = marshal.loads(d)
            elif flag == b'p':
                d = six.moves.cPickle.loads(d)
            else:
                raise ValueError("invalid flag")
            return d
        except Exception as e:
            logger.debug(
                "Fetch failed for shuffle %d,"
                " reduce %d, %d, %s, %s, try again",
                shuffleId, reduceId, part, url, e)
            tries -= 1
            if not tries:
                logger.warning(
                    "Fetch failed for shuffle %d,"
                    " reduce %d, %d, %s, %s",
                    shuffleId, reduceId, part, url, e)
                from dpark.schedule import FetchFailed
                raise FetchFailed(uri, shuffleId, part, reduceId)
            # short backoff before the retry
            time.sleep(2**(2 - tries) * 0.1)
def run_task(task_data): try: gc.disable() task, ntry = cPickle.loads(decompress(task_data)) setproctitle('dpark worker %s: run task %s' % (Script, task)) Accumulator.clear() result = task.run(ntry) accUpdate = Accumulator.values() if marshalable(result): try: flag, data = 0, marshal.dumps(result) except Exception, e: flag, data = 1, cPickle.dumps(result, -1) else:
def fetch_one(self, uri, shuffleId, part, reduceId):
    """Fetch and deserialize one shuffle output block, with retries.

    Payload layout: 1 flag byte ('m' marshal / 'p' cPickle), 4-byte
    native-endian total length, then compressed data.  Retries once on
    any error, then raises FetchFailed.
    """
    if uri == LocalFileShuffle.getServerUri():
        # urllib can open local file
        url = LocalFileShuffle.getOutputFile(shuffleId, part, reduceId)
    else:
        url = "%s/%d/%d/%d" % (uri, shuffleId, part, reduceId)
    logger.debug("fetch %s", url)
    tries = 2
    while True:
        try:
            f = urllib.urlopen(url)
            try:
                if f.code == 404:
                    raise IOError("not found")
                d = f.read()
            finally:
                # close on every path (previously leaked whenever
                # read/unpack/decode raised after a 200 response)
                f.close()
            flag = d[:1]
            length, = struct.unpack("I", d[1:5])
            if length != len(d):
                raise ValueError(
                    "length not match: expected %d, but got %d"
                    % (length, len(d)))
            d = decompress(d[5:])
            if flag == 'm':
                d = marshal.loads(d)
            elif flag == 'p':
                d = cPickle.loads(d)
            else:
                raise ValueError("invalid flag")
            return d
        except Exception as e:
            logger.debug("Fetch failed for shuffle %d,"
                         " reduce %d, %d, %s, %s, try again",
                         shuffleId, reduceId, part, url, e)
            tries -= 1
            if not tries:
                logger.warning("Fetch failed for shuffle %d,"
                               " reduce %d, %d, %s, %s",
                               shuffleId, reduceId, part, url, e)
                from dpark.schedule import FetchFailed
                raise FetchFailed(uri, shuffleId, part, reduceId)
            # short backoff before the retry
            time.sleep(2 ** (2 - tries) * 0.1)
def run_task(task_data):
    """Execute one serialized task; return ('TASK_*', pickled payload).

    The success payload is (Success(), (flag, data), accUpdate,
    env.task_stats); ``flag`` encodes the serializer (0 = marshal,
    1 = pickle) and +2 means ``data`` is a URL to a spilled result file.
    Failures carry FetchFailed or OtherFailure(traceback).
    """
    try:
        gc.disable()  # avoid GC pauses while the task runs; re-enabled in finally
        task, (job_id, ntry) = loads(decompress(task_data))
        tid = '%s:%s:%s' % (job_id, task.id, ntry)
        Accumulator.clear()
        result = task.run(tid)
        # record peak RSS; the *1024 assumes ru_maxrss is in KB (Linux) --
        # NOTE(review): on macOS it is bytes, confirm target platform
        env.task_stats.bytes_max_rss = resource.getrusage(
            resource.RUSAGE_SELF).ru_maxrss * 1024
        accUpdate = Accumulator.values()
        MutableDict.flush()
        if marshalable(result):
            try:
                flag, data = 0, marshal.dumps(result)
            except Exception as e:
                # marshalable() is a heuristic; fall back to pickle on failure
                flag, data = 1, six.moves.cPickle.dumps(result, -1)
        else:
            flag, data = 1, six.moves.cPickle.dumps(result, -1)
        data = compress(data)
        if len(data) > TASK_RESULT_LIMIT:
            # too large to inline in the status update: spill to a file
            # served by the local shuffle HTTP server and send its URL
            path = LocalFileShuffle.getOutputFile(0, ntry, task.id, len(data))
            f = open(path, 'wb')
            f.write(data)
            f.close()
            data = '/'.join([LocalFileShuffle.getServerUri()] + path.split('/')[-3:])
            flag += 2
        return 'TASK_FINISHED', six.moves.cPickle.dumps(
            (Success(), (flag, data), accUpdate, env.task_stats), -1)
    except FetchFailed as e:
        # shuffle input missing: scheduler will re-run the producing stage
        return 'TASK_FAILED', six.moves.cPickle.dumps((e, None, None, None), -1)
    except:
        import traceback
        msg = traceback.format_exc()
        return 'TASK_FAILED', six.moves.cPickle.dumps(
            (OtherFailure(msg), None, None, None), -1)
    finally:
        gc.collect()
        gc.enable()
def run_task(task_data):
    """Execute one serialized task; return ('TASK_*', pickled payload).

    The success payload is (Success(), (flag, data), accUpdate);
    ``flag`` encodes the serializer (0 = marshal, 1 = cPickle) and +2
    means ``data`` is a URL to a spilled result file.  Failures carry
    FetchFailed or OtherFailure(traceback).
    """
    try:
        gc.disable()  # avoid GC pauses while the task runs; re-enabled in finally
        task, ntry = loads(decompress(task_data))
        Accumulator.clear()
        result = task.run(ntry)
        accUpdate = Accumulator.values()
        MutableDict.flush()
        if marshalable(result):
            try:
                flag, data = 0, marshal.dumps(result)
            except Exception:
                # marshalable() is a heuristic; fall back to pickle on failure
                flag, data = 1, cPickle.dumps(result, -1)
        else:
            flag, data = 1, cPickle.dumps(result, -1)
        data = compress(data)
        if len(data) > TASK_RESULT_LIMIT:
            # too large to inline: spill to a file served by the local
            # shuffle HTTP server and ship its URL instead
            path = LocalFileShuffle.getOutputFile(0, ntry, task.id, len(data))
            # 'wb', not 'w': `data` is compressed bytes (text mode breaks
            # on py3 and corrupts on Windows); `with` guarantees close
            with open(path, 'wb') as f:
                f.write(data)
            data = '/'.join(
                [LocalFileShuffle.getServerUri()] + path.split('/')[-3:]
            )
            flag += 2
        return 'TASK_FINISHED', cPickle.dumps(
            (Success(), (flag, data), accUpdate), -1)
    except FetchFailed as e:
        # shuffle input missing: scheduler will re-run the producing stage
        return 'TASK_FAILED', cPickle.dumps((e, None, None), -1)
    except:
        import traceback
        msg = traceback.format_exc()
        return 'TASK_FAILED', cPickle.dumps(
            (OtherFailure(msg), None, None), -1)
    finally:
        close_mfs()
        gc.collect()
        gc.enable()
def load_stream(self, stream):
    """Yield values from a stream of sorted, compressed batches.

    Each batch begins with a 5-byte header (length, serializer flag,
    sorted flag) followed by ``length`` compressed bytes.  Iteration
    ends cleanly when the stream is exhausted.
    """
    while True:
        header = stream.read(5)
        if not header:
            return
        size, is_marshal, is_sorted = unpack_header(header)
        assert is_sorted
        payload = stream.read(size)
        if len(payload) < size:
            raise IOError("length not match: expected %d, but got %d"
                          % (size, len(payload)))
        payload = decompress(payload)
        AutoBatchedSerializer.size_loaded += len(payload)
        loader = marshal.loads if is_marshal else pickle.loads
        for value in loader(payload):
            yield value
def unsorted_batches(self): f = None #TEST_RETRY = True try: f, exp_size = self.open() total_size = 0 while True: head = f.read(5) if len(head) == 0: break length, is_marshal, is_sorted = unpack_header(head) assert (not is_sorted) total_size += length + 5 d = f.read(length) if length != len(d): raise IOError("length not match: expected %d, but got %d" % (length, len(d))) d = decompress(d) if is_marshal: items = marshal.loads(d) else: try: items = pickle.loads(d) except: time.sleep(1) items = pickle.loads(d) yield items #if TEST_RETRY and self.num_retry == 0: # raise Exception("test_retry") if total_size != exp_size: raise IOError("fetch size not match: expected %d, but got %d" % (exp_size, total_size)) env.task_stats.bytes_fetch += exp_size finally: if f: f.close()
def fetch_one(self, uri, shuffleId, part, reduceId):
    """Fetch and deserialize one shuffle output block, retrying up to 4 times.

    Payload layout: 1 flag byte ('m' marshal / 'p' cPickle), 4-byte
    native-endian total length, then compressed data.  Re-raises the
    last error once all retries are exhausted.
    """
    if uri == LocalFileShuffle.getServerUri():
        # urllib can open local file
        url = LocalFileShuffle.getOutputFile(shuffleId, part, reduceId)
    else:
        url = "%s/%d/%d/%d" % (uri, shuffleId, part, reduceId)
    logger.debug("fetch %s", url)
    tries = 4
    while True:
        try:
            f = urllib.urlopen(url)
            try:
                if f.code == 404:
                    # was IOError("%s not found", url): the comma made the
                    # url a second exception arg, never formatted into the
                    # message -- use %-formatting instead
                    raise IOError("%s not found" % url)
                d = f.read()
            finally:
                # close on every path, not only the 404 branch
                f.close()
            flag = d[:1]
            length, = struct.unpack("I", d[1:5])
            if length != len(d):
                raise ValueError("length not match: expected %d, but got %d" % (length, len(d)))
            d = decompress(d[5:])
            if flag == 'm':
                d = marshal.loads(d)
            elif flag == 'p':
                d = cPickle.loads(d)
            else:
                raise ValueError("invalid flag")
            return d
        except Exception as e:
            logger.debug("Fetch failed for shuffle %d, reduce %d, %d, %s, %s, try again",
                         shuffleId, reduceId, part, url, e)
            tries -= 1
            if not tries:
                logger.error("Fetch failed for shuffle %d, reduce %d, %d, %s, %s",
                             shuffleId, reduceId, part, url, e)
                raise
            # exponential backoff: 1, 2, 4 seconds
            time.sleep(2**(3-tries))
def _fetch_dct(self):
    """Download and deserialize one block from ``self.url``.

    Payload layout: 1 flag byte (b'm' marshal / b'p' pickle), 4-byte
    native-endian total length, then compressed data.  Updates the
    shuffle-read byte counter.  Raises IOError on 404 and ValueError on
    a corrupt payload.
    """
    f = urllib.request.urlopen(self.url)
    try:
        if f.code == 404:
            raise IOError("not found")
        d = f.read()
    finally:
        # previously leaked whenever read() or a later check raised
        f.close()
    flag = d[:1]
    length, = struct.unpack("I", d[1:5])
    if length != len(d):
        raise ValueError("length not match: expected %d, but got %d" % (length, len(d)))
    env.task_stats.bytes_shuffle_read += length
    d = decompress(d[5:])
    if flag == b'm':
        d = marshal.loads(d)
    elif flag == b'p':
        d = pickle.loads(d)
    else:
        raise ValueError("invalid flag")
    return d
def launchTask(self, driver, task):
    """Dispatch a mesos task to an idle worker pool.

    Replies TASK_RUNNING immediately, then executes the task
    asynchronously; on completion the pool is recycled unless the
    idle/task/memory limits are exceeded, in which case it is
    terminated.  On any setup error the task is reported TASK_LOST with
    a traceback.
    """
    try:
        t, ntry = cPickle.loads(decompress(task.data))
        reply_status(driver, task, mesos_pb2.TASK_RUNNING)
        logging.debug("launch task %s", t.id)
        pool = self.get_idle_worker()
        self.busy_workers[task.task_id.value] = (task, pool)

        def callback(res):
            # explicit unpack replaces the py2-only tuple-parameter
            # signature `def callback((state, data))`
            state, data = res
            with self.lock:
                if task.task_id.value not in self.busy_workers:
                    return  # already reclaimed (e.g. the task was killed)
                _, pool = self.busy_workers.pop(task.task_id.value)
                pool.done += 1
                reply_status(driver, task, state, data)
                if (len(self.idle_workers) + len(self.busy_workers) < self.parallel
                        and len(self.idle_workers) < MAX_IDLE_WORKERS
                        and pool.done < MAX_TASKS_PER_WORKER
                        and get_pool_memory(pool) < get_task_memory(task)
                        ):  # maybe memory leak in executor
                    self.idle_workers.append((time.time(), pool))
                else:
                    try:
                        pool.terminate()
                    except:
                        # best-effort: the worker may already be gone
                        pass

        pool.apply_async(run_task, [t, ntry], callback=callback)
    except Exception:
        import traceback
        msg = traceback.format_exc()
        reply_status(driver, task, mesos_pb2.TASK_LOST, msg)
        return
def statusUpdate(self, driver, status):
    """Handle a task status update and forward it to the owning job.

    TASK_RUNNING updates are relayed directly (or the task is killed if
    its job is gone), refreshing the terminal progress display.  For
    terminal states the bookkeeping maps are cleaned up, the pickled
    (reason, result, accUpdate, task_stats) payload is decoded --
    fetching and decompressing out-of-band results when flag >= 2 --
    and the job is notified.
    """
    def plot_progresses():
        # redraw per-job progress bars in place using ANSI cursor moves
        if self.color:
            total = len(self.activeJobs)
            logger.info('\x1b[2K\x1b[J\x1b[1A')
            for i, jid in enumerate(self.activeJobs):
                if i == total - 1:
                    ending = '\x1b[%sA' % total
                else:
                    ending = ''
                jobs = self.activeJobs[jid]
                jobs.progress(ending)

    tid = status.task_id.value
    state = status.state
    logger.debug('status update: %s %s', tid, state)
    jid = self.taskIdToJobId.get(tid)
    # task ids have the form "jobid:taskid:tried"
    _, task_id, tried = list(map(int, tid.split(':')))
    if state == 'TASK_RUNNING':
        if jid in self.activeJobs:
            job = self.activeJobs[jid]
            job.statusUpdate(task_id, tried, state)
            if job.tasksFinished == 0:
                plot_progresses()
        else:
            logger.debug('kill task %s as its job has gone', tid)
            self.driver.killTask(Dict(value=tid))
        return
    # terminal state: drop all bookkeeping for this task
    self.taskIdToJobId.pop(tid, None)
    if jid in self.jobTasks:
        self.jobTasks[jid].remove(tid)
    if tid in self.taskIdToAgentId:
        agent_id = self.taskIdToAgentId[tid]
        if agent_id in self.agentTasks:
            self.agentTasks[agent_id] -= 1
        del self.taskIdToAgentId[tid]
    if jid not in self.activeJobs:
        logger.debug('ignore task %s as its job has gone', tid)
        return
    job = self.activeJobs[jid]
    reason = status.get('message')
    data = status.get('data')
    if state in ('TASK_FINISHED', 'TASK_FAILED') and data:
        try:
            reason, result, accUpdate, task_stats = six.moves.cPickle.loads(
                decode_data(data))
            if result:
                flag, data = result
                if flag >= 2:
                    # flag >= 2: result was spilled to a file; fetch it by URL
                    try:
                        data = urllib.request.urlopen(data).read()
                    except IOError:
                        # try again
                        data = urllib.request.urlopen(data).read()
                    flag -= 2
                data = decompress(data)
                # flag 0: marshal-encoded; otherwise pickle
                if flag == 0:
                    result = marshal.loads(data)
                else:
                    result = six.moves.cPickle.loads(data)
        except Exception as e:
            logger.warning(
                'error when cPickle.loads(): %s, data:%s', e, len(data))
            state = 'TASK_FAILED'
            job.statusUpdate(task_id, tried, state, 'load failed: %s' % e)
            return
        else:
            # decoded cleanly: deliver the full result to the job
            job.statusUpdate(task_id, tried, state, reason, result,
                             accUpdate, task_stats)
            if state == 'TASK_FINISHED':
                plot_progresses()
            return
    # killed, lost, load failed
    job.statusUpdate(task_id, tried, state, reason or data)
def unBlockifyObject(self, blocks):
    """Reassemble an object from its compressed blocks.

    Concatenates the decompressed block payloads and decodes them with
    marshal, falling back to cPickle for objects marshal cannot load.
    """
    payload = ''.join([decompress(block.data) for block in blocks])
    try:
        obj = marshal.loads(payload)
    except Exception:
        obj = cPickle.loads(payload)
    return obj
def statusUpdate(self, driver, status):
    """Handle a task status update and forward it to the owning job.

    TASK_RUNNING updates are relayed directly (or the task is killed if
    its job is gone).  For terminal states the bookkeeping maps are
    cleaned up, the pickled (reason, result, accUpdate) payload is
    decoded -- fetching and decompressing out-of-band results when
    flag >= 2 -- and the job is notified.  Other terminal states
    (killed, lost) pass the raw data through.
    """
    tid = status.task_id.value
    state = status.state
    logger.debug('status update: %s %s', tid, state)
    jid = self.taskIdToJobId.get(tid)
    # task ids have the form "jobid:taskid:tried"
    _, task_id, tried = map(int, tid.split(':'))
    if state == 'TASK_RUNNING':
        if jid in self.activeJobs:
            job = self.activeJobs[jid]
            job.statusUpdate(task_id, tried, state)
        else:
            logger.debug('kill task %s as its job has gone', tid)
            self.driver.killTask(Dict(value=tid))
        return
    # terminal state: drop all bookkeeping for this task
    self.taskIdToJobId.pop(tid, None)
    if jid in self.jobTasks:
        self.jobTasks[jid].remove(tid)
    if tid in self.taskIdToAgentId:
        agent_id = self.taskIdToAgentId[tid]
        if agent_id in self.agentTasks:
            self.agentTasks[agent_id] -= 1
        del self.taskIdToAgentId[tid]
    if jid not in self.activeJobs:
        logger.debug('ignore task %s as its job has gone', tid)
        return
    job = self.activeJobs[jid]
    data = status.get('data')
    if state in ('TASK_FINISHED', 'TASK_FAILED') and data:
        try:
            reason, result, accUpdate = cPickle.loads(
                decode_data(data))
            if result:
                flag, data = result
                if flag >= 2:
                    # flag >= 2: result was spilled to a file; fetch it by URL
                    try:
                        data = urllib.urlopen(data).read()
                    except IOError:
                        # try again
                        data = urllib.urlopen(data).read()
                    flag -= 2
                data = decompress(data)
                # flag 0: marshal-encoded; otherwise cPickle
                if flag == 0:
                    result = marshal.loads(data)
                else:
                    result = cPickle.loads(data)
        except Exception as e:
            logger.warning(
                'error when cPickle.loads(): %s, data:%s', e, len(data))
            state = 'TASK_FAILED'
            return job.statusUpdate(
                task_id, tried, 'TASK_FAILED', 'load failed: %s' % e)
        else:
            # decoded cleanly: deliver the full result to the job
            return job.statusUpdate(task_id, tried, state, reason,
                                    result, accUpdate)
    # killed, lost, load failed
    job.statusUpdate(task_id, tried, state, data)
def unBlockifyObject(self, blocks):
    """Rebuild the original object from a sequence of compressed blocks."""
    chunks = [decompress(b.data) for b in blocks]
    serialized = ''.join(chunks)
    try:
        return marshal.loads(serialized)
    except Exception:
        # not marshal-encoded; fall back to pickle
        return cPickle.loads(serialized)
def statusUpdate(self, driver, status):
    """Handle a task status update and forward it to the owning job.

    TASK_RUNNING updates are relayed directly (or the task is killed if
    its job is gone).  For terminal states the bookkeeping maps are
    cleaned up, the pickled (reason, result, accUpdate) payload is
    decoded -- fetching and decompressing out-of-band results when
    flag >= 2 -- and the job is notified.  Other terminal states
    (killed, lost) pass the raw data through.
    """
    tid = status.task_id.value
    state = status.state
    logger.debug('status update: %s %s', tid, state)
    jid = self.taskIdToJobId.get(tid)
    # task ids have the form "jobid:taskid:tried"
    _, task_id, tried = map(int, tid.split(':'))
    if state == 'TASK_RUNNING':
        if jid in self.activeJobs:
            job = self.activeJobs[jid]
            job.statusUpdate(task_id, tried, state)
        else:
            logger.debug('kill task %s as its job has gone', tid)
            self.driver.killTask(Dict(value=tid))
        return
    # terminal state: drop all bookkeeping for this task
    self.taskIdToJobId.pop(tid, None)
    if jid in self.jobTasks:
        self.jobTasks[jid].remove(tid)
    if tid in self.taskIdToAgentId:
        agent_id = self.taskIdToAgentId[tid]
        if agent_id in self.agentTasks:
            self.agentTasks[agent_id] -= 1
        del self.taskIdToAgentId[tid]
    if jid not in self.activeJobs:
        logger.debug('ignore task %s as its job has gone', tid)
        return
    job = self.activeJobs[jid]
    data = status.get('data')
    if state in ('TASK_FINISHED', 'TASK_FAILED') and data:
        try:
            reason, result, accUpdate = cPickle.loads(decode_data(data))
            if result:
                flag, data = result
                if flag >= 2:
                    # flag >= 2: result was spilled to a file; fetch it by URL
                    try:
                        data = urllib.urlopen(data).read()
                    except IOError:
                        # try again
                        data = urllib.urlopen(data).read()
                    flag -= 2
                data = decompress(data)
                # flag 0: marshal-encoded; otherwise cPickle
                if flag == 0:
                    result = marshal.loads(data)
                else:
                    result = cPickle.loads(data)
        except Exception as e:
            logger.warning('error when cPickle.loads(): %s, data:%s', e, len(data))
            state = 'TASK_FAILED'
            return job.statusUpdate(task_id, tried, 'TASK_FAILED', 'load failed: %s' % e)
        else:
            # decoded cleanly: deliver the full result to the job
            return job.statusUpdate(task_id, tried, state, reason, result, accUpdate)
    # killed, lost, load failed
    job.statusUpdate(task_id, tried, state, data)
def unBlockifyObject(self, blocks):
    """Rebuild an object from compressed blocks.

    The first byte of the joined payload selects the codec: '0' means
    marshal, anything else means cPickle.
    """
    payload = ''.join(decompress(block.data) for block in blocks)
    tag, body = payload[0], payload[1:]
    loader = marshal.loads if tag == '0' else cPickle.loads
    return loader(body)