def frameworkMessage(self, driver, executorId, agentId, message): """ Invoked when an executor sends a message. """ # Take it out of base 64 encoding from Protobuf if USING_PYTHON2: message = decode_data(message) else: message = decode_data(message).decode() log.debug('Got framework message from executor %s running on agent %s: %s', executorId.value, agentId.value, message) message = ast.literal_eval(message) assert isinstance(message, dict) # Handle the mandatory fields of a message nodeAddress = message.pop('address') executor = self._registerNode(nodeAddress, agentId.value) # Handle optional message fields for k, v in iteritems(message): if k == 'nodeInfo': assert isinstance(v, dict) resources = [taskData for taskData in itervalues(self.runningJobMap) if taskData.executorID == executorId.value] requestedCores = sum(taskData.cores for taskData in resources) requestedMemory = sum(taskData.memory for taskData in resources) executor.nodeInfo = NodeInfo(requestedCores=requestedCores, requestedMemory=requestedMemory, **v) self.executors[nodeAddress] = executor else: raise RuntimeError("Unknown message field '%s'." % k)
def launchTask(self, driver, task):
    """Start the given task in a daemonized child process.

    Reports TaskState.running immediately; if spawning the process fails,
    reports TaskState.failed with the traceback and the pickled exception.
    """
    task_id = task.task_id
    reply_status(driver, task_id, TaskState.running)
    logger.debug('launch task %s', task.task_id.value)

    def child_main(title, result_q, tid_value, payload):
        # Runs in the forked process: label the thread, retitle the process,
        # deprioritize it for the OOM killer, bring up the slave environment,
        # then execute the task and push its result onto the queue.
        threading.current_thread().name = "task %s" % (tid_value,)
        setproctitle(title)
        set_oom_score(100)
        env.start_slave()
        result_q.put((tid_value, run_task(payload)))

    try:
        title = '[Task-%s]%s' % (task.task_id.value, Script)
        child = multiprocessing.Process(
            target=child_main,
            args=(title, self.result_queue, task.task_id.value,
                  decode_data(task.data)),
        )
        child.name = title
        child.daemon = True
        child.start()
        self.tasks[task.task_id.value] = (task, child)
    except Exception as e:
        import traceback
        reply_status(driver, task_id, TaskState.failed,
                     TaskEndReason.launch_failed,
                     traceback.format_exc(), cPickle.dumps(e))
def runTask():
    """Unpickle and run one task, reporting its lifecycle via sendUpdate.

    Sends TASK_RUNNING up front, then exactly one terminal update:
    TASK_FINISHED on exit status 0, TASK_KILLED on -9 (SIGKILL), and
    TASK_FAILED on any other exit status, on an unpickling error, or on an
    exception while running the job.
    """
    log.debug("Running task %s", task.task_id.value)
    startTime = time.time()
    sendUpdate(task, 'TASK_RUNNING', wallTime=0)

    # try to unpickle the task
    try:
        taskData = pickle.loads(decode_data(task.data))
    # Narrowed from a bare `except:` so SystemExit/KeyboardInterrupt propagate.
    except Exception:
        exc_info = sys.exc_info()
        log.error('Exception while unpickling task: ', exc_info=exc_info)
        exc_type, exc_value, exc_trace = exc_info
        sendUpdate(task, 'TASK_FAILED', wallTime=0, msg=''.join(
            traceback.format_exception_only(exc_type, exc_value)))
        return

    # This is where task.data is first invoked. Using this position to setup cleanupInfo
    if self.workerCleanupInfo is not None:
        assert self.workerCleanupInfo == taskData.workerCleanupInfo
    else:
        self.workerCleanupInfo = taskData.workerCleanupInfo

    # try to invoke a run on the unpickled task
    try:
        process = runJob(taskData)
        self.runningTasks[task.task_id.value] = process.pid
        try:
            exitStatus = process.wait()
            wallTime = time.time() - startTime
            if 0 == exitStatus:
                sendUpdate(task, 'TASK_FINISHED', wallTime)
            elif -9 == exitStatus:
                sendUpdate(task, 'TASK_KILLED', wallTime)
            else:
                sendUpdate(task, 'TASK_FAILED', wallTime, msg=str(exitStatus))
        finally:
            # Always drop the pid entry, whatever terminal state was sent.
            del self.runningTasks[task.task_id.value]
    except Exception:
        wallTime = time.time() - startTime
        exc_info = sys.exc_info()
        log.error('Exception while running task:', exc_info=exc_info)
        exc_type, exc_value, exc_trace = exc_info
        sendUpdate(task, 'TASK_FAILED', wallTime=wallTime, msg=''.join(
            traceback.format_exception_only(exc_type, exc_value)))
    # BUG FIX: the original ended with an unconditional
    # `sendUpdate(task, 'TASK_FINISHED', wallTime)` here, which emitted a
    # second terminal status update — even after TASK_FAILED or TASK_KILLED.
    # The terminal state is already reported exactly once above.
def run_task(task):
    """Estimate pi with 2,000,000 Monte Carlo samples and report the value.

    Sends TASK_RUNNING, echoes the payload to stderr, sends the estimate
    back as a framework message, sleeps 30s, then sends TASK_FINISHED.
    """
    def notify(state):
        update = Dict()
        update.task_id.value = task.task_id.value
        update.state = state
        update.timestamp = time.time()
        driver.sendStatusUpdate(update)

    notify('TASK_RUNNING')
    # Kept for testing purposes.
    print(decode_data(task.data), file=sys.stderr)

    samples = 2000000
    hits = 0
    for _ in range(samples):
        px = random()
        py = random()
        if px * px + py * py < 1:
            hits += 1
    vPi = 4.0 * hits / samples
    print(vPi)
    driver.sendFrameworkMessage(encode_data(str(vPi)))
    time.sleep(30)
    notify('TASK_FINISHED')
def frameworkMessage(self, driver, executorId, slaveId, message):
    """Accumulate one executor's partial pi estimate.

    Once all self.count executors have reported, average the sum into
    self.Pi, print it, and stop the driver.
    """
    self.sumPi += float(decode_data(message))
    self.temp += 1
    if self.temp >= self.count:
        self.Pi = self.sumPi / self.count
        print(self.Pi)
        driver.stop()
def launchTask(self, driver, task):
    """Spawn a daemonized worker process that runs the task.

    Reports TASK_RUNNING immediately; if spawning fails, reports TASK_LOST
    with the formatted traceback.
    """
    task_id = task.task_id
    reply_status(driver, task_id, 'TASK_RUNNING')
    logger.debug('launch task %s', task.task_id.value)

    def child_main(title, result_q, tid_value, payload):
        # Executed in the child: name the thread, retitle the process,
        # deprioritize it for the OOM killer, initialize the environment,
        # then run the task and queue its result.
        threading.current_thread().name = "task %s" % (tid_value, )
        setproctitle(title)
        set_oom_score(100)
        env.start()
        result_q.put((tid_value, run_task(payload)))

    try:
        title = '[Task-%s]%s' % (task.task_id.value, Script)
        child = multiprocessing.Process(
            target=child_main,
            args=(title, self.result_queue, task.task_id.value,
                  decode_data(task.data)))
        child.name = title
        child.daemon = True
        child.start()
        self.tasks[task.task_id.value] = (task, child)
    except Exception:
        import traceback
        reply_status(driver, task_id, 'TASK_LOST', traceback.format_exc())
def frameworkMessage(self, driver, executorId, slaveId, message):
    """Fold one executor's '<left> <right>' counts into the running totals."""
    payload = decode_data(message)
    print('get an ans %s' % payload)
    fields = payload.split(' ')
    self.left += int(fields[0])
    self.right += int(fields[1])
    self.Task_finished += 1
def run_task(task):
    """Numerically integrate a user-supplied expression over [left, right).

    task.data is 'fun!left!right!step', where fun is a Python statement that
    binds a value to 'res' given 'x'.  The Riemann sum step * sum(res) is
    sent back to the scheduler as a framework message.
    """
    update = Dict()
    update.task_id.value = task.task_id.value
    update.state = 'TASK_RUNNING'
    update.timestamp = time.time()
    driver.sendStatusUpdate(update)

    tmp = decode_data(task.data).split('!')
    fun = tmp[0]
    left = float(tmp[1])
    right = float(tmp[2])
    step = float(tmp[3])

    res_tot = 0
    x = left
    while x < right - 1e-16:
        # BUG FIX: the original used the Python 2 exec-statement form
        # ``exec(fun) in globals(), locals()``.  Under Python 3 that parses
        # as a tuple expression, and assignments made by exec() cannot be
        # read back through a function's locals(), so 'res' was never bound.
        # Run the snippet against an explicit local namespace instead and
        # read 'res' back from it.
        # SECURITY NOTE: 'fun' arrives in task.data and is executed
        # verbatim — acceptable only because the scheduler is trusted.
        namespace = {'x': x}
        exec(fun, globals(), namespace)
        res_tot = res_tot + namespace['res']
        x = x + step
    driver.sendFrameworkMessage(encode_data(repr(step * res_tot)))

    update = Dict()
    update.task_id.value = task.task_id.value
    update.state = 'TASK_FINISHED'
    update.timestamp = time.time()
    driver.sendStatusUpdate(update)
def launchTask(self, driver, task):
    """Fork a daemonized worker that initializes the environment and runs
    the task; report TASK_LOST with a traceback if the fork fails."""
    task_id = task.task_id
    reply_status(driver, task_id, 'TASK_RUNNING')
    logger.debug('launch task %s', task.task_id.value)

    def child_main(title, result_q, tid_value, payload, init_args):
        # Child process: retitle, initialize the environment from the
        # executor's init args, then run the task and queue the result.
        setproctitle(title)
        init_env(init_args)
        result_q.put((tid_value, run_task(payload)))

    try:
        title = '[Task-%s]%s' % (task.task_id.value, Script)
        child = multiprocessing.Process(
            target=child_main,
            args=(title, self.result_queue, task.task_id.value,
                  decode_data(task.data), self.init_args))
        child.name = title
        child.daemon = True
        child.start()
        self.tasks[task.task_id.value] = (task, child)
    except Exception:
        import traceback
        reply_status(driver, task_id, 'TASK_LOST', traceback.format_exc())
def registered(self, driver, executorInfo, frameworkInfo, agent_info):
    """
    Called once when this executor registers with the Mesos agent.

    Unpacks the marshalled configuration shipped in executorInfo.data,
    installs the driver's sys.path/environment, redirects stdout/stderr to
    remote loggers, switches to the job's cwd, prepares the working
    directory, starts the embedded web server, and spawns the memory-checker
    and replier background threads.  Re-raises on any failure after logging.
    """
    try:
        global Script
        (
            Script, cwd, python_path, osenv, self.parallel,
            out_logger, err_logger, logLevel, args
        ) = marshal.loads(decode_data(executorInfo.data))
        self.init_args = args
        sys.path = python_path
        os.environ.update(osenv)
        setproctitle('[Executor]' + Script)

        prefix = '[%s] ' % socket.gethostname()
        fmt = '%(asctime)-15s [%(levelname)s] [%(name)-9s] %(message)s'
        logging.basicConfig(format=fmt, level=logLevel)

        # Redirect fd 1/2 so task stdout/stderr flow to the remote loggers.
        r1 = self.stdout_redirect = Redirect(1, out_logger, prefix)
        sys.stdout = r1.pipe_wfile
        r2 = self.stderr_redirect = Redirect(2, err_logger, prefix)
        sys.stderr = r2.pipe_wfile

        if os.path.exists(cwd):
            try:
                os.chdir(cwd)
            except Exception as e:
                logger.warning('change cwd to %s failed: %s', cwd, e)
        else:
            logger.warning('cwd (%s) not exists', cwd)

        self.workdir = args['WORKDIR']
        main_workdir = self.workdir[0]
        root = os.path.dirname(main_workdir)
        if not os.path.exists(root):
            os.mkdir(root)
            os.chmod(root, 0o777)  # because umask
        mkdir_p(main_workdir)
        self._try_flock(main_workdir)

        args['SERVER_URI'] = startWebServer(main_workdir)
        if 'MESOS_SLAVE_PID' in os.environ:  # make unit test happy
            setup_cleaner_process(self.workdir)

        spawn(self.check_memory, driver)
        spawn(self.replier, driver)
        logger.debug('executor started at %s', agent_info.hostname)
    except Exception as e:
        import traceback
        msg = traceback.format_exc()
        logger.error('init executor failed: %s', msg)
        raise
def frameworkMessage(self, driver, executorId, slaveId, message):
    """Merge one executor's word-count result into self.word_count.

    The message is 'word<inner>count' pairs joined by the outer separator.
    """
    pairs = decode_data(message).decode().split('ThisIsAnOuterSeparator')
    for pair in pairs:
        parts = pair.split('ThisIsAnInnerSeparator')
        word = parts[0]
        self.word_count[word] = self.word_count.get(word, 0) + int(parts[1])
def registered(self, driver, executorInfo, frameworkInfo, agent_info):
    """
    Executor registration callback.

    Restores the driver-side configuration from executorInfo.data
    (marshalled tuple), configures colored logging, redirects stdout/stderr
    to remote loggers, starts the remote console, initializes the per-job
    workdir and web server, then spawns the keep-alive checker, the replier
    thread, and the broadcast download manager.  Re-raises after logging on
    any failure.
    """
    try:
        global Script
        (
            Script, cwd, python_path, osenv, self.parallel,
            out_logger, err_logger, logLevel, use_color, dpark_env
        ) = marshal.loads(decode_data(executorInfo.data))
        sys.path = python_path
        os.environ.update(osenv)
        setproctitle('[Executor]' + Script)

        prefix = formatter_message(
            '{MAGENTA}[%s]{RESET} ' % socket.gethostname().ljust(10),
            use_color
        )
        init_dpark_logger(logLevel, use_color=use_color)
        logging.root.setLevel(logLevel)

        # Route fd 1/2 through pipes so task output reaches remote loggers.
        r1 = self.stdout_redirect = Redirect(1, out_logger, prefix)
        sys.stdout = r1.pipe_wfile
        r2 = self.stderr_redirect = Redirect(2, err_logger, prefix)
        sys.stderr = r2.pipe_wfile
        spawn_rconsole(locals())

        if os.path.exists(cwd):
            try:
                os.chdir(cwd)
            except Exception as e:
                logger.warning('change cwd to %s failed: %s', cwd, e)
        else:
            logger.warning('cwd (%s) not exists', cwd)

        env.workdir.init(dpark_env.get(env.DPARK_ID))
        self._try_flock(env.workdir.main)
        dpark_env['SERVER_URI'] = startWebServer(env.workdir.main)
        if 'MESOS_SLAVE_PID' in os.environ:  # make unit test happy
            env.workdir.setup_cleaner_process()

        spawn(self.check_alive, driver)
        spawn(self.replier, driver)
        env.environ.update(dpark_env)

        from dpark.broadcast import start_download_manager
        start_download_manager()
        logger.debug('executor started at %s', agent_info.hostname)
    except Exception as e:
        import traceback
        msg = traceback.format_exc()
        logger.error('init executor failed: %s', msg)
        raise
def registered(self, driver, executorInfo, frameworkInfo, agent_info):
    """
    Executor registration callback (duplicate variant of the one above).

    Unmarshals the driver-shipped configuration, sets up logging and stdio
    redirection plus the remote console, initializes the workdir and web
    server, and spawns the keep-alive/replier threads and the broadcast
    download manager.  Re-raises after logging on any failure.
    """
    try:
        global Script
        (Script, cwd, python_path, osenv, self.parallel,
         out_logger, err_logger, logLevel, use_color,
         dpark_env) = marshal.loads(decode_data(executorInfo.data))
        sys.path = python_path
        os.environ.update(osenv)
        setproctitle('[Executor]' + Script)

        prefix = formatter_message(
            '{MAGENTA}[%s]{RESET} ' % socket.gethostname().ljust(10),
            use_color)
        init_dpark_logger(logLevel, use_color=use_color)
        logging.root.setLevel(logLevel)

        # Redirect fd 1/2 so task output is forwarded to the remote loggers.
        r1 = self.stdout_redirect = Redirect(1, out_logger, prefix)
        sys.stdout = r1.pipe_wfile
        r2 = self.stderr_redirect = Redirect(2, err_logger, prefix)
        sys.stderr = r2.pipe_wfile
        spawn_rconsole(locals())

        if os.path.exists(cwd):
            try:
                os.chdir(cwd)
            except Exception as e:
                logger.warning('change cwd to %s failed: %s', cwd, e)
        else:
            logger.warning('cwd (%s) not exists', cwd)

        env.workdir.init(dpark_env.get(env.DPARK_ID))
        self._try_flock(env.workdir.main)
        dpark_env['SERVER_URI'] = startWebServer(env.workdir.main)
        if 'MESOS_SLAVE_PID' in os.environ:  # make unit test happy
            env.workdir.setup_cleaner_process()

        spawn(self.check_alive, driver)
        spawn(self.replier, driver)
        env.environ.update(dpark_env)

        from dpark.broadcast import start_download_manager
        start_download_manager()
        logger.debug('executor started at %s', agent_info.hostname)
    except Exception as e:
        import traceback
        msg = traceback.format_exc()
        logger.error('init executor failed: %s', msg)
        raise
def run_task(task):
    """Run the shell command carried in task.data, reporting start/finish."""
    def notify(state):
        sendback = Dict()
        sendback.state = state
        sendback.task_id.value = task.task_id.value
        sendback.timestamp = time.time()
        driver.sendStatusUpdate(sendback)

    notify('TASK_RUNNING')
    # SECURITY NOTE: task.data is executed verbatim as a shell command —
    # acceptable only because the scheduler sending it is trusted.
    os.system(decode_data(task.data))
    notify('TASK_FINISHED')
def run_task(task):
    """Echo the task payload to stderr, idle 30s, and report start/finish."""
    def notify(state):
        update = Dict()
        update.task_id.value = task.task_id.value
        update.state = state
        update.timestamp = time.time()
        driver.sendStatusUpdate(update)

    notify("TASK_RUNNING")
    print(decode_data(task.data), file=sys.stderr)
    time.sleep(30)
    notify("TASK_FINISHED")
def run_task(task):
    """Count the space-separated tokens in the payload and send the count
    back to the scheduler as a framework message."""
    def notify(state):
        msg = Dict()
        msg.task_id.value = task.task_id.value
        msg.state = state
        msg.timestamp = time.time()
        driver.sendStatusUpdate(msg)

    notify('TASK_RUNNING')
    token_count = len(decode_data(task.data).split(' '))
    driver.sendFrameworkMessage(encode_data(str(token_count)))
    notify('TASK_FINISHED')
def run_task(task):
    """Echo the task payload to stderr, sleep 30s, report start and finish."""
    started = Dict()
    started.task_id.value = task.task_id.value
    started.state = 'TASK_RUNNING'
    started.timestamp = time.time()
    driver.sendStatusUpdate(started)

    print(decode_data(task.data), file=sys.stderr)
    time.sleep(30)

    done = Dict()
    done.task_id.value = task.task_id.value
    done.state = 'TASK_FINISHED'
    done.timestamp = time.time()
    driver.sendStatusUpdate(done)
def run_task(task):
    """Report TASK_RUNNING (with a debug log), echo the payload to stderr,
    wait 30s, then report TASK_FINISHED (also logged)."""
    running = Dict()
    running.task_id.value = task.task_id.value
    running.state = 'TASK_RUNNING'
    running.timestamp = time.time()
    logging.debug('Task running: %s %s', running.task_id.value, running.state)
    driver.sendStatusUpdate(running)

    print(decode_data(task.data), file=sys.stderr)
    time.sleep(30)

    finished = Dict()
    finished.task_id.value = task.task_id.value
    finished.state = 'TASK_FINISHED'
    finished.timestamp = time.time()
    logging.debug('Task finished: %s %s', finished.task_id.value,
                  finished.state)
    driver.sendStatusUpdate(finished)
def run_task(task):
    """Sum the integers in [left, right) and send the total back.

    task.data is 'left right'; the sum over range(left, right) is reported
    to the scheduler as a framework message.
    """
    update = Dict()
    update.task_id.value = task.task_id.value
    update.state = 'TASK_RUNNING'
    update.timestamp = time.time()
    driver.sendStatusUpdate(update)

    tmp = decode_data(task.data).split(' ')
    left = int(tmp[0])
    right = int(tmp[1])
    # FIX: `xrange` is Python 2 only (sibling tasks here use `range`);
    # the built-in sum() also replaces the manual accumulation loop.
    res = sum(range(left, right))
    driver.sendFrameworkMessage(encode_data(str(res)))

    update = Dict()
    update.task_id.value = task.task_id.value
    update.state = 'TASK_FINISHED'
    update.timestamp = time.time()
    driver.sendStatusUpdate(update)
def run_task(task):
    """Sum the integers in the space-separated payload and send the total
    back to the scheduler."""
    update = Dict()
    update.task_id.value = task.task_id.value
    update.state = 'TASK_RUNNING'
    update.timestamp = time.time()
    driver.sendStatusUpdate(update)

    tokens = decode_data(task.data).split(' ')
    # Empty tokens arise from repeated spaces; skip them.
    result = sum(int(tok) for tok in tokens if tok != '')
    # send the result to the scheduler
    driver.sendFrameworkMessage(encode_data(str(result)))

    update = Dict()
    update.task_id.value = task.task_id.value
    update.state = 'TASK_FINISHED'
    update.timestamp = time.time()
    driver.sendStatusUpdate(update)
def run_task(task):
    """Classify quadratics by the sign of one root.

    Each payload line holds 'a b c'; the root (-sqrt(b*b - 4*a*c) - b)/(2a)
    is counted as 'right' when positive, 'left' otherwise.  The two counts
    are sent back as 'left right'.
    """
    # Report that the task has started.
    update = Dict()
    update.task_id.value = task.task_id.value
    update.state = 'TASK_RUNNING'
    update.timestamp = time.time()
    driver.sendStatusUpdate(update)

    # Parse the payload: one quadratic per line, stopping at a blank line.
    lines = decode_data(task.data).split('\n')
    left = 0
    right = 0
    for line in lines:
        if line == '':
            break
        parts = line.split(' ')
        a = float(parts[0])
        b = float(parts[1])
        c = float(parts[2])
        deta = math.sqrt(b * b - 4 * a * c)
        pt = (-deta - b) * 0.5 / a
        if pt > 0:
            right = right + 1
        else:
            left = left + 1

    # Send the two counts back to the scheduler.
    ans = str(left) + ' ' + str(right)
    driver.sendFrameworkMessage(encode_data(ans))

    # Report completion.
    update = Dict()
    update.task_id.value = task.task_id.value
    update.state = 'TASK_FINISHED'
    update.timestamp = time.time()
    driver.sendStatusUpdate(update)
def launch_task(task, stdout_name, stderr_name):
    """Launches the task using the command available in the json map from the data
    field.

    Parameters
    ----------
    task: map
        The task to execute.
    stdout_name: string
        The file to use to redirect stdout.
    stderr_name: string
        The file to use to redirect stderr.

    Returns
    -------
    When command is provided and a process can be started, the tuple containing
    the process launched, stdout file, and stderr file. Else it logs the reason
    and returns None.
    """
    stdout = None
    stderr = None
    try:
        data_string = decode_data(task['data']).decode('utf8')
        data_json = json.loads(data_string)
        command = str(data_json['command']).strip()
        logging.info('Command: {}'.format(command))
        if not command:
            logging.warning('No command provided!')
            return None

        stdout = open(stdout_name, 'a+')
        stderr = open(stderr_name, 'a+')
        process = subprocess.Popen(command,
                                   shell=True,
                                   stdout=stdout,
                                   stderr=stderr)
        return process, stdout, stderr
    except Exception:
        logging.exception('Error in launch_task')
        # FIX: close any file opened before the failure so descriptors do
        # not leak when Popen (or anything after open) raises.
        for handle in (stdout, stderr):
            if handle is not None:
                try:
                    handle.close()
                except Exception:
                    logging.exception('Error closing redirect file')
        return None
def run_task(task):
    """Count words in the payload and send 'word<sep>count' pairs back."""
    # Report start.
    update = Dict()
    update.task_id.value = task.task_id.value
    update.state = 'TASK_RUNNING'
    update.timestamp = time.time()
    driver.sendStatusUpdate(update)

    # Split the payload on the agreed separator and tally occurrences.
    words = decode_data(task.data).decode().split('ThisIsASeparator')
    word_count = {}
    for word in words:
        word_count[word] = word_count.get(word, 0) + 1

    # Serialize as word<inner>count entries joined by the outer separator.
    pairs = [word + 'ThisIsAnInnerSeparator' + str(cnt)
             for word, cnt in word_count.items()]
    result = 'ThisIsAnOuterSeparator'.join(pairs)
    driver.sendFrameworkMessage(encode_data(bytes(result, 'utf-8')))

    # Report completion.
    update = Dict()
    update.task_id.value = task.task_id.value
    update.state = 'TASK_FINISHED'
    update.timestamp = time.time()
    driver.sendStatusUpdate(update)
def launch_task(task, environment):
    """Launches the task using the command available in the json map from the data
    field.

    Parameters
    ----------
    task: dictionary
        The task to execute.
    environment: dictionary
        The task environment.

    Returns
    -------
    When command is provided and a process can be started, the process
    launched. Else it logs the reason and returns None.
    """
    try:
        payload = pm.decode_data(task['data']).decode('utf8')
        command = str(json.loads(payload)['command']).strip()
        logging.info('Command: {}'.format(command))
        return cs.launch_process(command, environment)
    except Exception:
        logging.exception('Error in launch_task')
        return None
def statusUpdate(self, driver, status):
    """
    Mesos status-update callback: route a task's state change to its job.

    TASK_RUNNING updates are forwarded (or the task is killed if its job is
    gone).  Terminal updates release the task's bookkeeping, then unpickle
    the (reason, result, accUpdate) payload — fetching and decompressing a
    spilled result over HTTP when flag >= 2 — before notifying the job.
    """
    tid = status.task_id.value
    state = status.state
    logger.debug('status update: %s %s', tid, state)
    jid = self.taskIdToJobId.get(tid)
    # tid has the form '<job>:<task>:<try>'.
    _, task_id, tried = map(int, tid.split(':'))
    if state == 'TASK_RUNNING':
        if jid in self.activeJobs:
            job = self.activeJobs[jid]
            job.statusUpdate(task_id, tried, state)
        else:
            logger.debug('kill task %s as its job has gone', tid)
            self.driver.killTask(Dict(value=tid))
        return

    # Terminal state: drop all per-task bookkeeping.
    self.taskIdToJobId.pop(tid, None)
    if jid in self.jobTasks:
        self.jobTasks[jid].remove(tid)
    if tid in self.taskIdToAgentId:
        agent_id = self.taskIdToAgentId[tid]
        if agent_id in self.agentTasks:
            self.agentTasks[agent_id] -= 1
        del self.taskIdToAgentId[tid]

    if jid not in self.activeJobs:
        logger.debug('ignore task %s as its job has gone', tid)
        return

    job = self.activeJobs[jid]
    data = status.get('data')
    if state in ('TASK_FINISHED', 'TASK_FAILED') and data:
        try:
            reason, result, accUpdate = cPickle.loads(
                decode_data(data))
            if result:
                flag, data = result
                if flag >= 2:
                    # Result was spilled to a file server; fetch it over
                    # HTTP, retrying once on IOError.
                    try:
                        data = urllib.urlopen(data).read()
                    except IOError:
                        # try again
                        data = urllib.urlopen(data).read()
                    flag -= 2
                data = decompress(data)
                if flag == 0:
                    result = marshal.loads(data)
                else:
                    result = cPickle.loads(data)
        except Exception as e:
            logger.warning(
                'error when cPickle.loads(): %s, data:%s', e, len(data))
            state = 'TASK_FAILED'
            return job.statusUpdate(
                task_id, tried, 'TASK_FAILED', 'load failed: %s' % e)
        else:
            return job.statusUpdate(task_id, tried, state, reason,
                                    result, accUpdate)
    # killed, lost, load failed
    job.statusUpdate(task_id, tried, state, data)
def parse_message(encoded_message):
    """Decode a framework message and parse it as UTF-8 JSON."""
    raw = decode_data(encoded_message)
    return json.loads(raw.decode('utf8'))
def statusUpdate(self, driver, status):
    """
    Mesos status-update callback for the taskset scheduler.

    Forwards TASK_RUNNING to the owning taskset (killing orphans whose
    taskset is gone), releases per-task bookkeeping on terminal states, and
    for TASK_FINISHED/TASK_FAILED unpickles
    (reason, result, accUpdate, task_stats) from the payload, fetching a
    spilled result over HTTP when flag >= 2.
    """
    def plot_progresses():
        # Redraw the in-place progress line of every active taskset using
        # ANSI escape sequences; only when color output is enabled.
        if self.color:
            total = len(self.active_tasksets)
            logger.info('\x1b[2K\x1b[J\x1b[1A')
            for i, taskset_id in enumerate(self.active_tasksets):
                if i == total - 1:
                    ending = '\x1b[%sA' % total
                else:
                    ending = ''
                tasksets = self.active_tasksets[taskset_id]
                tasksets.progress(ending)

    mesos_task_id = status.task_id.value
    state = status.state
    reason = status.get('message')  # set by mesos
    data = status.get('data')
    logger.debug('status update: %s %s', mesos_task_id, state)

    ttid = TTID(mesos_task_id)
    taskset = self.active_tasksets.get(ttid.taskset_id)
    if taskset is None:
        if state == 'TASK_RUNNING':
            logger.debug('kill task %s as its taskset has gone',
                         mesos_task_id)
            self.driver.killTask(Dict(value=mesos_task_id))
        else:
            logger.debug('ignore task %s as its taskset has gone',
                         mesos_task_id)
        return

    if state == 'TASK_RUNNING':
        taskset.statusUpdate(ttid.task_id, ttid.task_try, state)
        if taskset.tasksFinished == 0:
            plot_progresses()
    else:
        if mesos_task_id not in taskset.ttids:
            logger.debug(
                'ignore task %s as it has finished or failed, new msg: %s',
                mesos_task_id, (state, reason))
        else:
            # Terminal update for a live task: release bookkeeping first.
            taskset.ttids.remove(mesos_task_id)
            if mesos_task_id in self.ttid_to_agent_id:
                agent_id = self.ttid_to_agent_id[mesos_task_id]
                if agent_id in self.agent_id_to_ttids:
                    self.agent_id_to_ttids[agent_id] -= 1
                del self.ttid_to_agent_id[mesos_task_id]

            if state in ('TASK_FINISHED', 'TASK_FAILED') and data:
                try:
                    reason, result, accUpdate, task_stats = cPickle.loads(
                        decode_data(data))
                    if result:
                        flag, data = result
                        if flag >= 2:
                            # Result spilled to a file server; fetch it,
                            # retrying once on IOError.
                            try:
                                data = urllib.request.urlopen(data).read()
                            except IOError:
                                # try again
                                data = urllib.request.urlopen(data).read()
                            flag -= 2
                        data = decompress(data)
                        if flag == 0:
                            result = marshal.loads(data)
                        else:
                            result = cPickle.loads(data)
                    taskset.statusUpdate(ttid.task_id, ttid.task_try,
                                         state, reason, result,
                                         accUpdate, task_stats)
                    if state == 'TASK_FINISHED':
                        plot_progresses()
                except Exception as e:
                    logger.warning(
                        'error when cPickle.loads(): %s, data:%s',
                        e, len(data))
                    state = 'TASK_FAILED'
                    taskset.statusUpdate(ttid.task_id, ttid.task_try,
                                         state, 'load failed: %s' % e)
            else:  # killed, lost
                taskset.statusUpdate(ttid.task_id, ttid.task_try,
                                     state, reason or data)
def launch_task(self, driver, task):
    """
    Run one shell command shipped in task.data inside this executor.

    Forwards the child's stdout/stderr over sockets to the remote collectors,
    optionally waits for a per-host command map broadcast over zmq (addr3),
    polls the child while sampling its memory against the task's 'mem'
    resource (killing it above 1.5x the limit), and finally replies with the
    terminal task state.
    """
    reply_status(driver, task.task_id, 'TASK_RUNNING')
    host = socket.gethostname()
    cwd, command, _env, shell, addr1, addr2, addr3 = pickle.loads(
        decode_data(task.data)
    )
    prefix = "[%s@%s] " % (str(task.task_id.value), host)
    prefix = prefix.encode('utf-8')

    # Pipes whose read ends are streamed to the remote log collectors.
    outr, outw = os.pipe()
    errr, errw = os.pipe()
    t1 = Thread(target=forword, args=[outr, addr1, prefix])
    t1.daemon = True
    t1.start()
    t2 = Thread(target=forword, args=[errr, addr2, prefix])
    t2.daemon = True
    t2.start()
    wout = os.fdopen(outw, 'wb', 0)
    werr = os.fdopen(errw, 'wb', 0)

    if addr3:
        # The scheduler broadcasts a host->command map over zmq; wait for it
        # (longer timeout for higher task ids) and substitute this host's
        # command line, failing the task if none arrives.
        tid = int(task.task_id.value.split('-')[0])
        subscriber = ctx.socket(zmq.SUB)
        subscriber.connect(addr3)
        subscriber.setsockopt(zmq.SUBSCRIBE, b'')
        poller = zmq.Poller()
        poller.register(subscriber, zmq.POLLIN)
        socks = dict(poller.poll(min(tid / 100.0 + 1, 5) * 60 * 1000))
        if socks and socks.get(subscriber) == zmq.POLLIN:
            hosts = pickle.loads(subscriber.recv(zmq.NOBLOCK))
            line = hosts.get(host)
            if not six.PY2:
                line = line.decode('utf-8')
            if line:
                command = line.split(' ')
            else:
                return reply_status(driver, task.task_id, 'TASK_FAILED')
        else:
            return reply_status(driver, task.task_id, 'TASK_FAILED')

    # Memory limit (MB) granted to this task; defaults to 100.
    mem = 100
    for r in task.resources:
        if r.name == 'mem':
            mem = r.scalar.value
            break

    try:
        env = dict(os.environ)
        env.update(_env)
        if not os.path.exists(cwd):
            print('CWD %s is not exists, use /tmp instead' % cwd, file=werr)
            cwd = '/tmp'
        p = subprocess.Popen(command, stdout=wout, stderr=werr,
                             cwd=cwd, env=env, shell=shell)
        tid = task.task_id.value
        self.ps[tid] = p
        code = None
        last_time = 0
        while True:
            time.sleep(0.1)
            code = p.poll()
            if code is not None:
                break
            now = time.time()
            # Sample memory at most once every 2 seconds.
            if now < last_time + 2:
                continue
            last_time = now
            try:
                import psutil
                process = psutil.Process(p.pid)
                # RSS of the child plus all of its descendants, in MB.
                rss = sum((proc.memory_info().rss
                           for proc in process.get_children(recursive=True)),
                          process.memory_info().rss)
                rss = (rss >> 20)
                if rss > mem * 1.5:
                    print("task %s used too much memory: %dMB > %dMB * 1.5, kill it. "
                          "use -m argument to request more memory." % (
                              tid, rss, mem), file=werr)
                    p.kill()
                elif rss > mem:
                    print("task %s used too much memory: %dMB > %dMB, "
                          "use -m to request for more memory" % (
                              tid, rss, mem), file=werr)
            except Exception:
                # Best-effort memory watchdog; psutil may be missing or the
                # process may have exited between poll() and inspection.
                pass
        if code == 0:
            status = 'TASK_FINISHED'
        else:
            print(' '.join(command) + ' exit with %s' % code, file=werr)
            status = 'TASK_FAILED'
    except Exception:
        status = 'TASK_FAILED'
        import traceback
        print('exception while open ' + ' '.join(command), file=werr)
        # NOTE(review): this iterates the traceback string character by
        # character, writing one byte at a time — works, but writelines or a
        # single write would be cheaper; confirm before changing.
        for line in traceback.format_exc():
            werr.write(line)
    reply_status(driver, task.task_id, status)
    wout.close()
    werr.close()
    t1.join()
    t2.join()
    self.ps.pop(tid, None)
    self.ts.pop(tid, None)
def frameworkMessage(self, driver, executorId, slaveId, message):
    """Add an executor's integer partial result to the running total."""
    partial = int(decode_data(message))
    self.result = self.result + partial
def frameworkMessage(self, driver, executorId, slaveId, message):
    """Accumulate one partial result; when all self.counts executors have
    reported, print the total and stop the driver."""
    self.sum_res += float(decode_data(message))
    self.finished += 1
    if self.finished >= self.counts:
        print(self.sum_res)
        driver.stop()
def assert_message(testcase, expected_message, actual_encoded_message):
    """Decode a framework message and assert it equals the expected value.

    Parameters
    ----------
    testcase: unittest.TestCase
        The test case providing the assertion method.
    expected_message: object
        The expected decoded JSON value.
    actual_encoded_message: bytes
        The encoded message to decode (UTF-8 JSON) and compare.
    """
    actual_message = json.loads(
        decode_data(actual_encoded_message).decode('utf8'))
    # FIX: assertEquals is a deprecated alias (removed in Python 3.12);
    # use the canonical assertEqual.
    testcase.assertEqual(expected_message, actual_message)
def statusUpdate(self, driver, status):
    """
    Mesos status-update callback (duplicate variant of the one above).

    Routes TASK_RUNNING to the owning job (killing orphans), clears per-task
    bookkeeping on terminal updates, then unpickles the
    (reason, result, accUpdate) payload — fetching a spilled result via HTTP
    when flag >= 2 — before notifying the job.
    """
    tid = status.task_id.value
    state = status.state
    logger.debug('status update: %s %s', tid, state)
    jid = self.taskIdToJobId.get(tid)
    # tid has the form '<job>:<task>:<try>'.
    _, task_id, tried = map(int, tid.split(':'))
    if state == 'TASK_RUNNING':
        if jid in self.activeJobs:
            job = self.activeJobs[jid]
            job.statusUpdate(task_id, tried, state)
        else:
            logger.debug('kill task %s as its job has gone', tid)
            self.driver.killTask(Dict(value=tid))
        return

    # Terminal state: release all per-task bookkeeping.
    self.taskIdToJobId.pop(tid, None)
    if jid in self.jobTasks:
        self.jobTasks[jid].remove(tid)
    if tid in self.taskIdToAgentId:
        agent_id = self.taskIdToAgentId[tid]
        if agent_id in self.agentTasks:
            self.agentTasks[agent_id] -= 1
        del self.taskIdToAgentId[tid]

    if jid not in self.activeJobs:
        logger.debug('ignore task %s as its job has gone', tid)
        return

    job = self.activeJobs[jid]
    data = status.get('data')
    if state in ('TASK_FINISHED', 'TASK_FAILED') and data:
        try:
            reason, result, accUpdate = cPickle.loads(decode_data(data))
            if result:
                flag, data = result
                if flag >= 2:
                    # Result spilled to a file server; fetch over HTTP,
                    # retrying once on IOError.
                    try:
                        data = urllib.urlopen(data).read()
                    except IOError:
                        # try again
                        data = urllib.urlopen(data).read()
                    flag -= 2
                data = decompress(data)
                if flag == 0:
                    result = marshal.loads(data)
                else:
                    result = cPickle.loads(data)
        except Exception as e:
            logger.warning('error when cPickle.loads(): %s, data:%s',
                           e, len(data))
            state = 'TASK_FAILED'
            return job.statusUpdate(task_id, tried, 'TASK_FAILED',
                                    'load failed: %s' % e)
        else:
            return job.statusUpdate(task_id, tried, state, reason,
                                    result, accUpdate)
    # killed, lost, load failed
    job.statusUpdate(task_id, tried, state, data)
def statusUpdate(self, driver, status):
    """
    Mesos status-update callback with in-place progress plotting.

    Forwards TASK_RUNNING to the owning job (killing orphans), releases
    per-task bookkeeping on terminal updates, then unpickles
    (reason, result, accUpdate, task_stats) from the payload — fetching a
    spilled result over HTTP when flag >= 2 — before notifying the job and
    redrawing the progress display on TASK_FINISHED.
    """
    def plot_progresses():
        # Redraw every active job's progress line in place via ANSI escape
        # sequences; only when color output is enabled.
        if self.color:
            total = len(self.activeJobs)
            logger.info('\x1b[2K\x1b[J\x1b[1A')
            for i, jid in enumerate(self.activeJobs):
                if i == total - 1:
                    ending = '\x1b[%sA' % total
                else:
                    ending = ''
                jobs = self.activeJobs[jid]
                jobs.progress(ending)

    tid = status.task_id.value
    state = status.state
    logger.debug('status update: %s %s', tid, state)
    jid = self.taskIdToJobId.get(tid)
    # tid has the form '<job>:<task>:<try>'.
    _, task_id, tried = list(map(int, tid.split(':')))
    if state == 'TASK_RUNNING':
        if jid in self.activeJobs:
            job = self.activeJobs[jid]
            job.statusUpdate(task_id, tried, state)
            if job.tasksFinished == 0:
                plot_progresses()
        else:
            logger.debug('kill task %s as its job has gone', tid)
            self.driver.killTask(Dict(value=tid))
        return

    # Terminal state: release all per-task bookkeeping.
    self.taskIdToJobId.pop(tid, None)
    if jid in self.jobTasks:
        self.jobTasks[jid].remove(tid)
    if tid in self.taskIdToAgentId:
        agent_id = self.taskIdToAgentId[tid]
        if agent_id in self.agentTasks:
            self.agentTasks[agent_id] -= 1
        del self.taskIdToAgentId[tid]

    if jid not in self.activeJobs:
        logger.debug('ignore task %s as its job has gone', tid)
        return

    job = self.activeJobs[jid]
    reason = status.get('message')
    data = status.get('data')
    if state in ('TASK_FINISHED', 'TASK_FAILED') and data:
        try:
            reason, result, accUpdate, task_stats = six.moves.cPickle.loads(
                decode_data(data))
            if result:
                flag, data = result
                if flag >= 2:
                    # Result spilled to a file server; fetch over HTTP,
                    # retrying once on IOError.
                    try:
                        data = urllib.request.urlopen(data).read()
                    except IOError:
                        # try again
                        data = urllib.request.urlopen(data).read()
                    flag -= 2
                data = decompress(data)
                if flag == 0:
                    result = marshal.loads(data)
                else:
                    result = six.moves.cPickle.loads(data)
        except Exception as e:
            logger.warning(
                'error when cPickle.loads(): %s, data:%s', e, len(data))
            state = 'TASK_FAILED'
            job.statusUpdate(task_id, tried, state, 'load failed: %s' % e)
            return
        else:
            job.statusUpdate(task_id, tried, state, reason, result,
                             accUpdate, task_stats)
            if state == 'TASK_FINISHED':
                plot_progresses()
            return
    # killed, lost, load failed
    job.statusUpdate(task_id, tried, state, reason or data)