Esempio n. 1
0
 def frameworkMessage(self, driver, executorId, agentId, message):
     """
     Process a framework message received from an executor.

     The message is decoded with ``decode_data`` (and, outside Python 2,
     turned into text), parsed with ``ast.literal_eval`` and must be a dict.
     Its mandatory ``'address'`` entry identifies the node, which is
     registered via ``self._registerNode``. The only optional entry accepted
     is ``'nodeInfo'``; any other key raises ``RuntimeError``.
     """
     # Undo the base 64 encoding applied for Protobuf transport.
     decoded = decode_data(message)
     if not USING_PYTHON2:
         decoded = decoded.decode()

     log.debug('Got framework message from executor %s running on agent %s: %s',
               executorId.value, agentId.value, decoded)
     fields = ast.literal_eval(decoded)
     assert isinstance(fields, dict)

     # Mandatory part: the node address.
     nodeAddress = fields.pop('address')
     executor = self._registerNode(nodeAddress, agentId.value)

     # Optional part: everything still left in the dict.
     for fieldName, fieldValue in iteritems(fields):
         if fieldName != 'nodeInfo':
             raise RuntimeError("Unknown message field '%s'." % fieldName)
         assert isinstance(fieldValue, dict)
         # Total up what the jobs currently running on this executor asked for.
         ownTasks = [running for running in itervalues(self.runningJobMap)
                     if running.executorID == executorId.value]
         coresInUse = sum(running.cores for running in ownTasks)
         memoryInUse = sum(running.memory for running in ownTasks)
         executor.nodeInfo = NodeInfo(requestedCores=coresInUse,
                                      requestedMemory=memoryInUse,
                                      **fieldValue)
         self.executors[nodeAddress] = executor
Esempio n. 2
0
    def launchTask(self, driver, task):
        """Run ``task`` in a daemon child process.

        Reports the running state first, then starts a
        ``multiprocessing.Process`` whose target puts
        ``(task id value, run_task(decoded task data))`` on
        ``self.result_queue``. The ``(task, process)`` pair is stored in
        ``self.tasks``. If anything in the launch raises, a failed status
        with the launch-failed reason, the formatted traceback and the
        pickled exception is reported instead.
        """
        task_id = task.task_id
        reply_status(driver, task_id, TaskState.running)
        logger.debug('launch task %s', task.task_id.value)

        def worker(title, results, tid_value, payload):
            # Runs inside the child process.
            threading.current_thread().name = "task %s" % (tid_value,)
            setproctitle(title)
            set_oom_score(100)
            env.start_slave()
            outcome = run_task(payload)
            results.put((tid_value, outcome))

        try:
            name = '[Task-%s]%s' % (task.task_id.value, Script)
            worker_args = (
                name,
                self.result_queue,
                task.task_id.value,
                decode_data(task.data),
            )
            proc = multiprocessing.Process(target=worker, args=worker_args)
            proc.name = name
            proc.daemon = True
            proc.start()
            self.tasks[task.task_id.value] = (task, proc)

        except Exception as e:
            import traceback
            reply_status(driver, task_id, TaskState.failed,
                         TaskEndReason.launch_failed,
                         traceback.format_exc(), cPickle.dumps(e))
Esempio n. 3
0
        def runTask():
            """
            Run the enclosing ``task`` as a job and report its outcome.

            ``TASK_RUNNING`` is sent first. After that a terminal update is
            sent: ``TASK_FAILED`` if the task data cannot be unpickled or if
            running the job raises; otherwise ``TASK_FINISHED`` for exit
            status 0, ``TASK_KILLED`` for exit status -9 and ``TASK_FAILED``
            (with the exit status as message) for anything else.
            """
            log.debug("Running task %s", task.task_id.value)
            startTime = time.time()
            sendUpdate(task, 'TASK_RUNNING', wallTime=0)

            # try to unpickle the task
            # NOTE(review): pickle.loads trusts its input; this presumes
            # task.data only ever originates from our own scheduler.
            try:
                taskData = pickle.loads(decode_data(task.data))
            except:
                # Kept broad on purpose so that any failure is reported.
                exc_info = sys.exc_info()
                log.error('Exception while unpickling task: ',
                          exc_info=exc_info)
                exc_type, exc_value, exc_trace = exc_info
                sendUpdate(task,
                           'TASK_FAILED',
                           wallTime=0,
                           msg=''.join(
                               traceback.format_exception_only(
                                   exc_type, exc_value)))
                return

            # This is where task.data is first invoked. Using this position to setup cleanupInfo
            if self.workerCleanupInfo is not None:
                assert self.workerCleanupInfo == taskData.workerCleanupInfo
            else:
                self.workerCleanupInfo = taskData.workerCleanupInfo

            # try to invoke a run on the unpickled task
            try:
                process = runJob(taskData)
                self.runningTasks[task.task_id.value] = process.pid
                try:
                    exitStatus = process.wait()
                    wallTime = time.time() - startTime
                    if 0 == exitStatus:
                        sendUpdate(task, 'TASK_FINISHED', wallTime)
                    elif -9 == exitStatus:
                        sendUpdate(task, 'TASK_KILLED', wallTime)
                    else:
                        sendUpdate(task,
                                   'TASK_FAILED',
                                   wallTime,
                                   msg=str(exitStatus))
                finally:
                    # The process is gone either way; stop tracking its pid.
                    del self.runningTasks[task.task_id.value]
            except:
                wallTime = time.time() - startTime
                exc_info = sys.exc_info()
                log.error('Exception while running task:', exc_info=exc_info)
                exc_type, exc_value, exc_trace = exc_info
                sendUpdate(task,
                           'TASK_FAILED',
                           wallTime=wallTime,
                           msg=''.join(
                               traceback.format_exception_only(
                                   exc_type, exc_value)))

            # No unconditional TASK_FINISHED here: every path above has
            # already reported a terminal state, and sending another one
            # would mark failed or killed tasks as finished.
Esempio n. 4
0
        def run_task(task):
            """Estimate pi by random sampling and send it to the scheduler.

            Sends ``TASK_RUNNING``, echoes the decoded task data to stderr,
            draws 2,000,000 random ``(x, y)`` pairs, counts those with
            ``x*x + y*y < 1``, prints ``4.0 * count / N`` and sends it as a
            framework message, sleeps 30 seconds, then sends
            ``TASK_FINISHED``.
            """
            def report(state):
                # Build and send one status update for this task.
                status = Dict()
                status.task_id.value = task.task_id.value
                status.state = state
                status.timestamp = time.time()
                driver.sendStatusUpdate(status)

            report('TASK_RUNNING')

            # Kept for testing purposes.
            print(decode_data(task.data), file=sys.stderr)
            N = 2000000
            samples = ((random(), random()) for _ in range(N))
            cnt = sum(1 for x, y in samples if (x * x + y * y) < 1)
            vPi = 4.0 * cnt / N
            print(vPi)
            driver.sendFrameworkMessage(encode_data(str(vPi)))

            time.sleep(30)

            report('TASK_FINISHED')
Esempio n. 5
0
 def frameworkMessage(self, driver, executorId, slaveId, message):
     """Accumulate one executor's pi estimate; stop once all have arrived.

     The decoded message is parsed as a float and added to ``self.sumPi``;
     ``self.temp`` counts the messages seen. When ``self.temp`` reaches
     ``self.count`` the mean is stored in ``self.Pi``, printed, and the
     driver is stopped.
     """
     self.sumPi += float(decode_data(message))
     self.temp += 1
     if self.temp < self.count:
         return
     self.Pi = self.sumPi / self.count
     print(self.Pi)
     driver.stop()
Esempio n. 6
0
    def launchTask(self, driver, task):
        """Run ``task`` in a daemon child process.

        ``TASK_RUNNING`` is reported up front. The child sets its thread
        name, process title and OOM score, calls ``env.start()`` and puts
        ``(task id value, run_task(decoded task data))`` on
        ``self.result_queue``. The ``(task, process)`` pair is kept in
        ``self.tasks``. A failure while launching is reported as
        ``TASK_LOST`` with the formatted traceback.
        """
        task_id = task.task_id
        reply_status(driver, task_id, 'TASK_RUNNING')
        logger.debug('launch task %s', task.task_id.value)

        def worker(title, results, tid_value, payload):
            # Runs inside the child process.
            threading.current_thread().name = "task %s" % (tid_value, )
            setproctitle(title)
            set_oom_score(100)
            env.start()
            outcome = run_task(payload)
            results.put((tid_value, outcome))

        try:
            name = '[Task-%s]%s' % (task.task_id.value, Script)
            child_args = (name, self.result_queue,
                          task.task_id.value, decode_data(task.data))
            proc = multiprocessing.Process(target=worker, args=child_args)
            proc.name = name
            proc.daemon = True
            proc.start()
            self.tasks[task.task_id.value] = (task, proc)

        except Exception:
            import traceback
            reply_status(driver, task_id, 'TASK_LOST',
                         traceback.format_exc())
Esempio n. 7
0
 def frameworkMessage(self, driver, executorId, slaveId, message):
     """Add an executor's ``"<left> <right>"`` counts to the running totals.

     The decoded message is printed, split on single spaces, and its first
     two fields are added as integers to ``self.left`` and ``self.right``.
     ``self.Task_finished`` is incremented afterwards.
     """
     payload = decode_data(message)
     print('get an ans %s' % payload)
     fields = payload.split(' ')
     self.left += int(fields[0])
     self.right += int(fields[1])
     self.Task_finished += 1
Esempio n. 8
0
        def run_task(task):
            """
            Evaluate code carried in ``task.data`` over a range of ``x`` values
            and send the scaled sum back as a framework message.

            The decoded payload is split on ``'!'`` into a code string ``fun``
            followed by ``left``, ``right`` and ``step`` (parsed as floats).
            ``fun`` is executed once per sample point and the value it leaves
            in ``res`` is accumulated; ``repr(step * res_tot)`` is what is
            sent. ``TASK_RUNNING`` and ``TASK_FINISHED`` status updates
            bracket the work.
            """
            update = Dict()
            update.task_id.value = task.task_id.value
            update.state = 'TASK_RUNNING'
            update.timestamp = time.time()
            driver.sendStatusUpdate(update)

            # Payload layout: fun!left!right!step
            tmp = decode_data(task.data).split('!')
            fun = tmp[0]
            left = float(tmp[1])
            right = float(tmp[2])
            step = float(tmp[3])
            res_tot = 0
            x = left
            # The 1e-16 margin is subtracted from the upper bound;
            # presumably a guard against accumulated float error in x — confirm.
            while x < right - 1e-16:
                # `fun` is expected to assign `res` (it is read on the next
                # line and assigned nowhere else in this function).
                # NOTE(review): this runs code received in the task data, so
                # it is only safe if the scheduler is trusted.
                # NOTE(review): `exec(...) in g, l` is the Python 2 exec
                # statement; under Python 3 it would parse as a membership
                # test on exec's return value — verify the target interpreter.
                exec(fun) in globals(), locals()
                res_tot = res_tot + res
                x = x + step
            driver.sendFrameworkMessage(encode_data(repr(step * res_tot)))

            update = Dict()
            update.task_id.value = task.task_id.value
            update.state = 'TASK_FINISHED'
            update.timestamp = time.time()
            driver.sendStatusUpdate(update)
Esempio n. 9
0
    def launchTask(self, driver, task):
        """Run ``task`` in a daemon child process.

        ``TASK_RUNNING`` is reported up front. The child sets its process
        title, calls ``init_env`` with ``self.init_args`` and puts
        ``(task id value, run_task(decoded task data))`` on
        ``self.result_queue``. The ``(task, process)`` pair is kept in
        ``self.tasks``. A failure while launching is reported as
        ``TASK_LOST`` with the formatted traceback.
        """
        task_id = task.task_id
        reply_status(driver, task_id, 'TASK_RUNNING')
        logger.debug('launch task %s', task.task_id.value)

        def worker(title, results, tid_value, payload, env_args):
            # Runs inside the child process.
            setproctitle(title)
            init_env(env_args)
            outcome = run_task(payload)
            results.put((tid_value, outcome))

        try:
            name = '[Task-%s]%s' % (task.task_id.value, Script)
            child_args = (
                name,
                self.result_queue,
                task.task_id.value,
                decode_data(task.data),
                self.init_args,
            )
            proc = multiprocessing.Process(target=worker, args=child_args)
            proc.name = name
            proc.daemon = True
            proc.start()
            self.tasks[task.task_id.value] = (task, proc)

        except Exception:
            import traceback
            reply_status(driver, task_id, 'TASK_LOST',
                         traceback.format_exc())
Esempio n. 10
0
    def launchTask(self, driver, task):
        """Launch ``task`` in a separate daemon process.

        After reporting ``TASK_RUNNING``, a ``multiprocessing.Process`` is
        started whose target sets the process title, initialises the
        environment from ``self.init_args`` and queues
        ``(task id value, run_task(decoded task data))`` on
        ``self.result_queue``. ``self.tasks`` maps the task id value to
        ``(task, process)``. Any exception during the launch results in a
        ``TASK_LOST`` report carrying the traceback text.
        """
        task_id = task.task_id
        reply_status(driver, task_id, 'TASK_RUNNING')
        logger.debug('launch task %s', task.task_id.value)

        def worker(proc_title, out_queue, tid, blob, setup_args):
            setproctitle(proc_title)
            init_env(setup_args)
            out_queue.put((tid, run_task(blob)))

        try:
            name = '[Task-%s]%s' % (task.task_id.value, Script)
            launch_args = (name, self.result_queue, task.task_id.value,
                           decode_data(task.data), self.init_args)
            proc = multiprocessing.Process(target=worker, args=launch_args)
            proc.name = name
            proc.daemon = True
            proc.start()
            self.tasks[task.task_id.value] = (task, proc)

        except Exception:
            import traceback
            trace_text = traceback.format_exc()
            reply_status(driver, task_id, 'TASK_LOST', trace_text)
Esempio n. 11
0
    def registered(self, driver, executorInfo, frameworkInfo, agent_info):
        """Initialise this executor from the payload in ``executorInfo.data``.

        The payload is decoded with ``decode_data`` and unmarshalled into the
        script name, working directory, Python path, environment, parallelism,
        logger settings, log level and an ``args`` mapping. The method then
        replaces ``sys.path``, updates ``os.environ``, sets the process title,
        configures logging, swaps ``sys.stdout``/``sys.stderr`` for the write
        ends of two ``Redirect`` objects, changes into ``cwd`` when it exists,
        prepares the main work directory, starts the web server, and spawns
        ``self.check_memory`` and ``self.replier``.

        Any exception is logged with its traceback and re-raised.
        """
        try:
            # Script is module-level state; it is rebound by the unpacking below.
            global Script
            (
                Script, cwd, python_path, osenv, self.parallel,
                out_logger, err_logger, logLevel, args
            ) = marshal.loads(decode_data(executorInfo.data))

            self.init_args = args
            sys.path = python_path
            os.environ.update(osenv)
            setproctitle('[Executor]' + Script)

            # Host name prefix handed to both Redirect objects.
            prefix = '[%s] ' % socket.gethostname()

            fmt = '%(asctime)-15s [%(levelname)s] [%(name)-9s] %(message)s'
            logging.basicConfig(format=fmt, level=logLevel)

            # The Redirect objects are kept on self and their pipe write
            # files become the new sys.stdout / sys.stderr.
            r1 = self.stdout_redirect = Redirect(1, out_logger, prefix)
            sys.stdout = r1.pipe_wfile

            r2 = self.stderr_redirect = Redirect(2, err_logger, prefix)
            sys.stderr = r2.pipe_wfile

            # A missing or unusable cwd is only warned about, not fatal.
            if os.path.exists(cwd):
                try:
                    os.chdir(cwd)
                except Exception as e:
                    logger.warning('change cwd to %s failed: %s', cwd, e)
            else:
                logger.warning('cwd (%s) not exists', cwd)

            # args['WORKDIR'] is indexed, so it is a sequence; its first
            # entry is used as the main work directory.
            self.workdir = args['WORKDIR']
            main_workdir = self.workdir[0]

            root = os.path.dirname(main_workdir)
            if not os.path.exists(root):
                os.mkdir(root)
                os.chmod(root, 0o777)  # because umask

            mkdir_p(main_workdir)
            self._try_flock(main_workdir)

            args['SERVER_URI'] = startWebServer(main_workdir)
            if 'MESOS_SLAVE_PID' in os.environ:  # make unit test happy
                setup_cleaner_process(self.workdir)

            spawn(self.check_memory, driver)
            spawn(self.replier, driver)

            logger.debug('executor started at %s', agent_info.hostname)

        except Exception as e:
            # Log the full traceback, then let the error propagate.
            import traceback
            msg = traceback.format_exc()
            logger.error('init executor failed: %s', msg)
            raise
Esempio n. 12
0
 def frameworkMessage(self, driver, executorId, slaveId, message):
     """Merge the word counts sent by an executor into ``self.word_count``.

     The decoded text is split on ``'ThisIsAnOuterSeparator'`` into entries;
     each entry is split on ``'ThisIsAnInnerSeparator'`` into a word and its
     integer count, which is added to any count already stored.
     """
     # merge task result
     payload = decode_data(message).decode()
     for entry in payload.split('ThisIsAnOuterSeparator'):
         fields = entry.split('ThisIsAnInnerSeparator')
         word = fields[0]
         if word not in self.word_count:
             self.word_count[word] = int(fields[1])
         else:
             self.word_count[word] += int(fields[1])
Esempio n. 13
0
    def registered(self, driver, executorInfo, frameworkInfo, agent_info):
        """Initialise this executor from the payload in ``executorInfo.data``.

        The payload is decoded with ``decode_data`` and unmarshalled into the
        script name, working directory, Python path, environment, parallelism,
        logger settings, log level, colour flag and a ``dpark_env`` mapping.
        The method then replaces ``sys.path``, updates ``os.environ``, sets
        the process title, initialises logging, swaps
        ``sys.stdout``/``sys.stderr`` for the write ends of two ``Redirect``
        objects, changes into ``cwd`` when it exists, initialises
        ``env.workdir``, starts the web server, spawns ``self.check_alive``
        and ``self.replier``, updates ``env.environ`` and starts the
        broadcast download manager.

        Any exception is logged with its traceback and re-raised.
        """
        try:
            # Script is module-level state; it is rebound by the unpacking below.
            global Script
            (
                Script, cwd, python_path, osenv, self.parallel,
                out_logger, err_logger, logLevel, use_color, dpark_env
            ) = marshal.loads(decode_data(executorInfo.data))

            sys.path = python_path
            os.environ.update(osenv)
            setproctitle('[Executor]' + Script)

            # Host name (left-justified to 10 columns) handed to both
            # Redirect objects as prefix.
            prefix = formatter_message(
                '{MAGENTA}[%s]{RESET} ' % socket.gethostname().ljust(10),
                use_color
            )

            init_dpark_logger(logLevel, use_color=use_color)
            logging.root.setLevel(logLevel)

            # The Redirect objects are kept on self and their pipe write
            # files become the new sys.stdout / sys.stderr.
            r1 = self.stdout_redirect = Redirect(1, out_logger, prefix)
            sys.stdout = r1.pipe_wfile

            r2 = self.stderr_redirect = Redirect(2, err_logger, prefix)
            sys.stderr = r2.pipe_wfile

            # NOTE(review): presumably starts a remote debugging console
            # with access to these locals — confirm against spawn_rconsole.
            spawn_rconsole(locals())

            # A missing or unusable cwd is only warned about, not fatal.
            if os.path.exists(cwd):
                try:
                    os.chdir(cwd)
                except Exception as e:
                    logger.warning('change cwd to %s failed: %s', cwd, e)
            else:
                logger.warning('cwd (%s) not exists', cwd)

            env.workdir.init(dpark_env.get(env.DPARK_ID))
            self._try_flock(env.workdir.main)
            dpark_env['SERVER_URI'] = startWebServer(env.workdir.main)
            if 'MESOS_SLAVE_PID' in os.environ:  # make unit test happy
                env.workdir.setup_cleaner_process()

            spawn(self.check_alive, driver)
            spawn(self.replier, driver)

            # dpark_env already contains SERVER_URI at this point.
            env.environ.update(dpark_env)
            from dpark.broadcast import start_download_manager
            start_download_manager()

            logger.debug('executor started at %s', agent_info.hostname)

        except Exception as e:
            # Log the full traceback, then let the error propagate.
            import traceback
            msg = traceback.format_exc()
            logger.error('init executor failed: %s', msg)
            raise
Esempio n. 14
0
    def registered(self, driver, executorInfo, frameworkInfo, agent_info):
        """Set up the executor process using the data in ``executorInfo.data``.

        The data is decoded with ``decode_data`` and unmarshalled into ten
        values (script name, cwd, Python path, OS environment, parallelism,
        stdout/stderr logger settings, log level, colour flag, ``dpark_env``).
        Process-wide state is then configured in order: ``sys.path``,
        ``os.environ``, process title, logging, stdout/stderr redirection,
        current directory, ``env.workdir``, web server, the
        ``check_alive``/``replier`` tasks, ``env.environ`` and the broadcast
        download manager.

        Any exception is logged with its traceback and re-raised.
        """
        try:
            # Script is module-level state; it is rebound by the unpacking below.
            global Script
            (Script, cwd, python_path, osenv, self.parallel, out_logger,
             err_logger, logLevel, use_color,
             dpark_env) = marshal.loads(decode_data(executorInfo.data))

            sys.path = python_path
            os.environ.update(osenv)
            setproctitle('[Executor]' + Script)

            # Host name (left-justified to 10 columns) handed to both
            # Redirect objects as prefix.
            prefix = formatter_message(
                '{MAGENTA}[%s]{RESET} ' % socket.gethostname().ljust(10),
                use_color)

            init_dpark_logger(logLevel, use_color=use_color)
            logging.root.setLevel(logLevel)

            # The Redirect objects are kept on self and their pipe write
            # files become the new sys.stdout / sys.stderr.
            r1 = self.stdout_redirect = Redirect(1, out_logger, prefix)
            sys.stdout = r1.pipe_wfile

            r2 = self.stderr_redirect = Redirect(2, err_logger, prefix)
            sys.stderr = r2.pipe_wfile

            # NOTE(review): presumably starts a remote debugging console
            # with access to these locals — confirm against spawn_rconsole.
            spawn_rconsole(locals())

            # A missing or unusable cwd is only warned about, not fatal.
            if os.path.exists(cwd):
                try:
                    os.chdir(cwd)
                except Exception as e:
                    logger.warning('change cwd to %s failed: %s', cwd, e)
            else:
                logger.warning('cwd (%s) not exists', cwd)

            env.workdir.init(dpark_env.get(env.DPARK_ID))
            self._try_flock(env.workdir.main)
            dpark_env['SERVER_URI'] = startWebServer(env.workdir.main)
            if 'MESOS_SLAVE_PID' in os.environ:  # make unit test happy
                env.workdir.setup_cleaner_process()

            spawn(self.check_alive, driver)
            spawn(self.replier, driver)

            # dpark_env already contains SERVER_URI at this point.
            env.environ.update(dpark_env)
            from dpark.broadcast import start_download_manager
            start_download_manager()

            logger.debug('executor started at %s', agent_info.hostname)

        except Exception as e:
            # Log the full traceback, then let the error propagate.
            import traceback
            msg = traceback.format_exc()
            logger.error('init executor failed: %s', msg)
            raise
Esempio n. 15
0
        def run_task(task):
            """Run the decoded task data as a shell command via ``os.system``.

            A ``TASK_RUNNING`` update is sent before the command and a
            ``TASK_FINISHED`` update after it; the command's return value is
            not inspected.
            """
            def notify(state):
                # Build and send one status update for this task.
                sendback = Dict()
                sendback.state = state
                sendback.task_id.value = task.task_id.value
                sendback.timestamp = time.time()
                driver.sendStatusUpdate(sendback)

            notify('TASK_RUNNING')

            command = decode_data(task.data)
            os.system(command)

            notify('TASK_FINISHED')
Esempio n. 16
0
        def run_task(task):
            """Echo the decoded task data to stderr, then idle for 30 seconds.

            ``TASK_RUNNING`` is sent before the work and ``TASK_FINISHED``
            after the sleep.
            """
            def notify(state):
                # Build and send one status update for this task.
                status = Dict()
                status.task_id.value = task.task_id.value
                status.state = state
                status.timestamp = time.time()
                driver.sendStatusUpdate(status)

            notify("TASK_RUNNING")

            payload = decode_data(task.data)
            print(payload, file=sys.stderr)
            time.sleep(30)

            notify("TASK_FINISHED")
Esempio n. 17
0
        def run_task(task):
            """Count the space-separated pieces of the decoded task data.

            The count (as a string) is sent to the scheduler as a framework
            message, bracketed by ``TASK_RUNNING`` and ``TASK_FINISHED``
            status updates.
            """
            def notify(state):
                # Build and send one status update for this task.
                status = Dict()
                status.task_id.value = task.task_id.value
                status.state = state
                status.timestamp = time.time()
                driver.sendStatusUpdate(status)

            notify('TASK_RUNNING')

            pieces = decode_data(task.data).split(' ')
            result = len(pieces)
            driver.sendFrameworkMessage(encode_data(str(result)))

            notify('TASK_FINISHED')
Esempio n. 18
0
        def run_task(task):
            """Print the decoded task data on stderr and wait 30 seconds.

            Status updates with state ``TASK_RUNNING`` and ``TASK_FINISHED``
            are sent before and after, respectively.
            """
            def send_state(state):
                # One status update carrying this task's id and a timestamp.
                msg = Dict()
                msg.task_id.value = task.task_id.value
                msg.state = state
                msg.timestamp = time.time()
                driver.sendStatusUpdate(msg)

            send_state('TASK_RUNNING')

            decoded = decode_data(task.data)
            print(decoded, file=sys.stderr)
            time.sleep(30)

            send_state('TASK_FINISHED')
Esempio n. 19
0
        def run_task(task):
            """Print the decoded task data on stderr and wait 30 seconds.

            A status update is logged at debug level and sent both before
            (``TASK_RUNNING``) and after (``TASK_FINISHED``) the work.
            """
            def announce(state, log_format):
                # Build the update, log it, then send it.
                status = Dict()
                status.task_id.value = task.task_id.value
                status.state = state
                status.timestamp = time.time()
                logging.debug(log_format, status.task_id.value,
                              status.state)
                driver.sendStatusUpdate(status)

            announce('TASK_RUNNING', 'Task running: %s %s')

            decoded = decode_data(task.data)
            print(decoded, file=sys.stderr)
            time.sleep(30)

            announce('TASK_FINISHED', 'Task finished: %s %s')
Esempio n. 20
0
        def run_task(task):
            """Sum the integers in ``[left, right)`` and send the total back.

            The decoded task data is split on single spaces; its first two
            fields are the integer bounds. The total is sent as a framework
            message, bracketed by ``TASK_RUNNING`` and ``TASK_FINISHED``
            status updates.
            """
            def notify(state):
                # Build and send one status update for this task.
                status = Dict()
                status.task_id.value = task.task_id.value
                status.state = state
                status.timestamp = time.time()
                driver.sendStatusUpdate(status)

            notify('TASK_RUNNING')

            bounds = decode_data(task.data).split(' ')
            left = int(bounds[0])
            right = int(bounds[1])
            res = sum(xrange(left, right))
            driver.sendFrameworkMessage(encode_data(str(res)))

            notify('TASK_FINISHED')
Esempio n. 21
0
        def run_task(task):
            """Add up the space-separated integers in the decoded task data.

            Empty fields (from repeated spaces) are skipped. The total is sent
            as a framework message, bracketed by ``TASK_RUNNING`` and
            ``TASK_FINISHED`` status updates.
            """
            def notify(state):
                # Build and send one status update for this task.
                status = Dict()
                status.task_id.value = task.task_id.value
                status.state = state
                status.timestamp = time.time()
                driver.sendStatusUpdate(status)

            notify('TASK_RUNNING')

            tokens = decode_data(task.data).split(' ')
            result = sum(int(token) for token in tokens if token != '')

            # send the result to the scheduler
            driver.sendFrameworkMessage(encode_data(str(result)))

            notify('TASK_FINISHED')
Esempio n. 22
0
        def run_task(task):
            """Classify quadratic roots by sign and report the two counts.

            Each line of the decoded task data holds three space-separated
            floats ``a b c``; processing stops at the first empty line. For
            every line the value ``(-sqrt(b*b - 4*a*c) - b) * 0.5 / a`` is
            computed; positive values increment ``right``, all others
            ``left``. ``"<left> <right>"`` is sent as a framework message,
            bracketed by ``TASK_RUNNING`` and ``TASK_FINISHED`` updates.
            """
            def notify(state):
                # Build and send one status update for this task.
                status = Dict()
                status.task_id.value = task.task_id.value
                status.state = state
                status.timestamp = time.time()
                driver.sendStatusUpdate(status)

            # Update the status to signal that the task has started.
            notify('TASK_RUNNING')
            # Parse the data.
            rows = decode_data(task.data).split('\n')
            # Initialise the counters.
            left = 0
            right = 0
            # Compute.
            for row in rows:
                if row == '':
                    break
                coeffs = row.split(' ')
                a, b, c = float(coeffs[0]), float(coeffs[1]), float(coeffs[2])
                deta = math.sqrt(b * b - 4 * a * c)
                pt = (-deta - b) * 0.5 / a
                if pt > 0:
                    right += 1
                else:
                    left += 1
            # Send back the result.
            ans = ' '.join((str(left), str(right)))
            driver.sendFrameworkMessage(encode_data(ans))

            # Update the status to signal that the computation is done.
            notify('TASK_FINISHED')
Esempio n. 23
0
def launch_task(task, stdout_name, stderr_name):
    """Launches the task using the command available in the json map from the data field.

    Parameters
    ----------
    task: map
        The task to execute. Its 'data' entry is decoded with decode_data,
        read as UTF-8 JSON, and must contain a 'command' entry.
    stdout_name: string
        The file to use to redirect stdout (opened in append mode).
    stderr_name: string
        The file to use to redirect stderr (opened in append mode).

    Returns
    -------
    When command is provided and a process can be started, the tuple containing the process launched, stdout file,
    and stderr file. The caller owns the two file objects.
    Else it logs the reason and returns None. Any file opened before the failure is closed again.
    """
    try:
        data_string = decode_data(task['data']).decode('utf8')
        data_json = json.loads(data_string)
        command = str(data_json['command']).strip()
        logging.info('Command: %s', command)
        if not command:
            logging.warning('No command provided!')
            return None

        stdout = open(stdout_name, 'a+')
        try:
            stderr = open(stderr_name, 'a+')
            try:
                # NOTE(review): the command string comes from the task data
                # and is run through the shell; this presumes the task source
                # is trusted.
                process = subprocess.Popen(command,
                                           shell=True,
                                           stdout=stdout,
                                           stderr=stderr)
            except Exception:
                # Nothing was started, so nobody else will close this file.
                stderr.close()
                raise
        except Exception:
            stdout.close()
            raise

        return process, stdout, stderr
    except Exception:
        logging.exception('Error in launch_task')
        return None
Esempio n. 24
0
        def run_task(task):
            """Count word occurrences in the task data and send them back.

            The decoded text is split on ``'ThisIsASeparator'``. Each distinct
            word is rendered as ``word + 'ThisIsAnInnerSeparator' + count``
            and the entries are joined with ``'ThisIsAnOuterSeparator'``. The
            UTF-8 encoded result is sent as a framework message, bracketed by
            ``TASK_RUNNING`` and ``TASK_FINISHED`` status updates.
            """
            def notify(state):
                # Build and send one status update for this task.
                status = Dict()
                status.task_id.value = task.task_id.value
                status.state = state
                status.timestamp = time.time()
                driver.sendStatusUpdate(status)

            # config start state
            notify('TASK_RUNNING')

            # word count
            # ----------------------------------------
            # data preparation
            words = decode_data(task.data).decode().split('ThisIsASeparator')

            # count words
            word_count = {}
            for word in words:
                word_count[word] = word_count.get(word, 0) + 1

            # prepare result
            result = 'ThisIsAnOuterSeparator'.join(
                word + 'ThisIsAnInnerSeparator' + str(cnt)
                for word, cnt in word_count.items())

            # send result to scheduler
            driver.sendFrameworkMessage(encode_data(bytes(result, 'utf-8')))
            # ----------------------------------------

            # config end state
            notify('TASK_FINISHED')
Esempio n. 25
0
def launch_task(task, environment):
    """Start the process described by the 'command' entry of the task's json data.

    Parameters
    ----------
    task: dictionary
        The task to execute; its 'data' entry is decoded, read as UTF-8 and parsed as json.
    environment: dictionary
        The task environment, handed on to the process launcher.

    Returns
    -------
    The launched process when everything succeeds.
    On any exception the error is logged and None is returned.
    """
    try:
        payload = json.loads(pm.decode_data(task['data']).decode('utf8'))
        command = str(payload['command']).strip()
        logging.info('Command: {}'.format(command))
        process = cs.launch_process(command, environment)
    except Exception:
        logging.exception('Error in launch_task')
        process = None
    return process
Esempio n. 26
0
    def statusUpdate(self, driver, status):
        """Handle a task status update and forward it to the owning job.

        ``TASK_RUNNING`` updates are passed to the job if it is still active;
        otherwise the task is killed. For every other state the task is
        removed from the scheduler's bookkeeping first. ``TASK_FINISHED`` /
        ``TASK_FAILED`` updates that carry data have that data decoded and
        unpickled before the job is notified; a failure while loading it is
        reported to the job as ``TASK_FAILED``.
        """
        tid = status.task_id.value
        state = status.state
        logger.debug('status update: %s %s', tid, state)

        jid = self.taskIdToJobId.get(tid)
        # The task id value is three colon-separated integers; the second
        # and third are used as task id and try number.
        _, task_id, tried = map(int, tid.split(':'))
        if state == 'TASK_RUNNING':
            if jid in self.activeJobs:
                job = self.activeJobs[jid]
                job.statusUpdate(task_id, tried, state)
            else:
                logger.debug('kill task %s as its job has gone', tid)
                self.driver.killTask(Dict(value=tid))

            return

        # Any other state: drop the task from the bookkeeping maps.
        self.taskIdToJobId.pop(tid, None)
        if jid in self.jobTasks:
            self.jobTasks[jid].remove(tid)
        if tid in self.taskIdToAgentId:
            agent_id = self.taskIdToAgentId[tid]
            if agent_id in self.agentTasks:
                self.agentTasks[agent_id] -= 1
            del self.taskIdToAgentId[tid]

        if jid not in self.activeJobs:
            logger.debug('ignore task %s as its job has gone', tid)
            return

        job = self.activeJobs[jid]
        data = status.get('data')
        if state in ('TASK_FINISHED', 'TASK_FAILED') and data:
            try:
                # NOTE(review): unpickling trusts the executor that sent
                # this status — confirm that is acceptable here.
                reason, result, accUpdate = cPickle.loads(
                    decode_data(data))
                if result:
                    # result is a (flag, data) pair; `data` is rebound here.
                    flag, data = result
                    if flag >= 2:
                        # flag >= 2: data is a URL whose content is fetched;
                        # the remaining flag selects the deserialiser.
                        try:
                            data = urllib.urlopen(data).read()
                        except IOError:
                            # try again
                            data = urllib.urlopen(data).read()
                        flag -= 2
                    data = decompress(data)
                    if flag == 0:
                        result = marshal.loads(data)
                    else:
                        result = cPickle.loads(data)
            except Exception as e:
                # `data` is whatever it was last bound to when the error
                # occurred; only its length is logged.
                logger.warning(
                    'error when cPickle.loads(): %s, data:%s', e, len(data))
                state = 'TASK_FAILED'
                return job.statusUpdate(
                    task_id, tried, 'TASK_FAILED', 'load failed: %s' % e)
            else:
                return job.statusUpdate(task_id, tried, state,
                                        reason, result, accUpdate)

        # killed, lost, load failed
        job.statusUpdate(task_id, tried, state, data)
Esempio n. 27
0
File: utils.py Progetto: yueri/Cook
def parse_message(encoded_message):
    """Decode ``encoded_message`` with ``decode_data``, read the bytes as UTF-8 and parse them as JSON."""
    raw_bytes = decode_data(encoded_message)
    text = raw_bytes.decode('utf8')
    return json.loads(text)
Esempio n. 28
0
    def statusUpdate(self, driver, status):
        """Handle a task status update and forward it to the owning taskset.

        Updates for tasksets that are no longer active are ignored (a still
        running task is killed). ``TASK_RUNNING`` is passed straight to the
        taskset. For other states the task is removed from the bookkeeping;
        ``TASK_FINISHED`` / ``TASK_FAILED`` updates that carry data have that
        data decoded and unpickled before the taskset is notified, and any
        exception during that step is reported to the taskset as
        ``TASK_FAILED``.
        """
        def plot_progresses():
            # Only draws when colour output is enabled: each active taskset
            # renders its progress, and the last one receives an escape
            # sequence built from the number of tasksets.
            if self.color:
                total = len(self.active_tasksets)
                logger.info('\x1b[2K\x1b[J\x1b[1A')
                for i, taskset_id in enumerate(self.active_tasksets):
                    if i == total - 1:
                        ending = '\x1b[%sA' % total
                    else:
                        ending = ''

                    tasksets = self.active_tasksets[taskset_id]
                    tasksets.progress(ending)

        mesos_task_id = status.task_id.value
        state = status.state
        reason = status.get('message')  # set by mesos
        data = status.get('data')

        logger.debug('status update: %s %s', mesos_task_id, state)

        # TTID exposes taskset_id, task_id and task_try for this task id.
        ttid = TTID(mesos_task_id)

        taskset = self.active_tasksets.get(ttid.taskset_id)

        if taskset is None:
            if state == 'TASK_RUNNING':
                logger.debug('kill task %s as its taskset has gone',
                             mesos_task_id)
                self.driver.killTask(Dict(value=mesos_task_id))
            else:
                logger.debug('ignore task %s as its taskset has gone',
                             mesos_task_id)
            return

        if state == 'TASK_RUNNING':
            taskset.statusUpdate(ttid.task_id, ttid.task_try, state)
            if taskset.tasksFinished == 0:
                plot_progresses()
        else:
            if mesos_task_id not in taskset.ttids:
                logger.debug(
                    'ignore task %s as it has finished or failed, new msg: %s',
                    mesos_task_id, (state, reason))
            else:
                # Drop the task from the bookkeeping structures.
                taskset.ttids.remove(mesos_task_id)
                if mesos_task_id in self.ttid_to_agent_id:
                    agent_id = self.ttid_to_agent_id[mesos_task_id]
                    if agent_id in self.agent_id_to_ttids:
                        self.agent_id_to_ttids[agent_id] -= 1
                    del self.ttid_to_agent_id[mesos_task_id]

                if state in ('TASK_FINISHED', 'TASK_FAILED') and data:
                    try:
                        # NOTE(review): unpickling trusts the executor that
                        # sent this status — confirm that is acceptable here.
                        reason, result, accUpdate, task_stats = cPickle.loads(
                            decode_data(data))
                        if result:
                            # result is a (flag, data) pair; `data` is
                            # rebound here.
                            flag, data = result
                            if flag >= 2:
                                # flag >= 2: data is a URL whose content is
                                # fetched; the remaining flag selects the
                                # deserialiser.
                                try:
                                    data = urllib.request.urlopen(data).read()
                                except IOError:
                                    # try again
                                    data = urllib.request.urlopen(data).read()
                                flag -= 2
                            data = decompress(data)
                            if flag == 0:
                                result = marshal.loads(data)
                            else:
                                result = cPickle.loads(data)
                        # This call sits inside the try block, so an exception
                        # raised by it is handled by the except below as well.
                        taskset.statusUpdate(ttid.task_id, ttid.task_try,
                                             state, reason, result, accUpdate,
                                             task_stats)
                        if state == 'TASK_FINISHED':
                            plot_progresses()
                    except Exception as e:
                        # `data` is whatever it was last bound to when the
                        # error occurred; only its length is logged.
                        logger.warning(
                            'error when cPickle.loads(): %s, data:%s', e,
                            len(data))
                        state = 'TASK_FAILED'
                        taskset.statusUpdate(ttid.task_id, ttid.task_try,
                                             state, 'load failed: %s' % e)
                else:
                    # killed, lost
                    taskset.statusUpdate(ttid.task_id, ttid.task_try, state,
                                         reason or data)
Esempio n. 29
0
def launch_task(self, driver, task):
    """Run the task's command as a child process and report its outcome.

    ``task.data`` is decoded and unpickled into
    ``(cwd, command, env, shell, addr1, addr2, addr3)``.  The child's stdout
    and stderr are connected to pipes whose read ends are handed to two
    daemon threads running ``forword`` together with ``addr1``/``addr2`` and
    a ``[task_id@host] `` prefix.

    When ``addr3`` is set, the command is replaced by a per-host command
    line received over a ZeroMQ SUB socket connected to ``addr3``.  The wait
    is between one and five minutes, depending on the numeric leading part
    of the task id; if nothing usable arrives for this host the task is
    reported as ``TASK_FAILED``.

    While the child runs, its resident memory (child plus descendants) is
    sampled roughly every two seconds.  Above the requested ``mem`` resource
    (default 100) a warning is written to the task's stderr pipe; above
    1.5x the child is killed.  The sampling is best-effort.

    Args:
        driver: passed through to ``reply_status``.
        task: task description; ``task_id``, ``data`` and ``resources``
            are read.
    """
    reply_status(driver, task.task_id, 'TASK_RUNNING')

    # Key used for self.ps / self.ts.  Bound up front so the cleanup at the
    # end works even when spawning the child fails.
    tid = task.task_id.value
    host = socket.gethostname()
    # NOTE(review): unpickling is only safe if task.data comes from a
    # trusted scheduler -- confirm that is the case.
    cwd, command, _env, shell, addr1, addr2, addr3 = pickle.loads(
        decode_data(task.data)
    )

    prefix = "[%s@%s] " % (str(task.task_id.value), host)
    prefix = prefix.encode('utf-8')
    outr, outw = os.pipe()
    errr, errw = os.pipe()
    t1 = Thread(target=forword, args=[outr, addr1, prefix])
    t1.daemon = True
    t1.start()
    t2 = Thread(target=forword, args=[errr, addr2, prefix])
    t2.daemon = True
    t2.start()
    wout = os.fdopen(outw, 'wb', 0)
    werr = os.fdopen(errw, 'wb', 0)

    def report(text):
        # werr is an unbuffered *binary* stream, so text has to be encoded
        # before writing; print()-ing str to it fails on Python 3.
        if not isinstance(text, bytes):
            text = text.encode('utf-8')
        werr.write(text + b'\n')

    def command_text():
        # With shell=True the command may be a single string; joining it
        # would insert a space between every character.
        if isinstance(command, six.string_types):
            return command
        return ' '.join(command)

    def fail_early():
        # Close the write ends so the pipes do not leak and the readers
        # see end-of-file.
        wout.close()
        werr.close()
        return reply_status(driver, task.task_id, 'TASK_FAILED')

    if addr3:
        seq = int(task.task_id.value.split('-')[0])
        line = None
        subscriber = ctx.socket(zmq.SUB)
        try:
            subscriber.connect(addr3)
            subscriber.setsockopt(zmq.SUBSCRIBE, b'')
            poller = zmq.Poller()
            poller.register(subscriber, zmq.POLLIN)
            # Timeout in milliseconds: 1 to 5 minutes, growing with seq.
            socks = dict(poller.poll(min(seq / 100.0 + 1, 5) * 60 * 1000))
            if socks and socks.get(subscriber) == zmq.POLLIN:
                hosts = pickle.loads(subscriber.recv(zmq.NOBLOCK))
                line = hosts.get(host)
        finally:
            subscriber.close()

        # hosts may have no entry for this host; only decode a real value.
        if line and not six.PY2:
            line = line.decode('utf-8')
        if not line:
            return fail_early()
        command = line.split(' ')

    mem = 100
    for r in task.resources:
        if r.name == 'mem':
            mem = r.scalar.value
            break

    try:
        env = dict(os.environ)
        env.update(_env)
        if not os.path.exists(cwd):
            report('CWD %s is not exists, use /tmp instead' % cwd)
            cwd = '/tmp'
        p = subprocess.Popen(command,
                             stdout=wout, stderr=werr,
                             cwd=cwd, env=env, shell=shell)
        self.ps[tid] = p
        code = None
        last_time = 0
        while True:
            time.sleep(0.1)
            code = p.poll()
            if code is not None:
                break

            # Sample memory at most once every two seconds.
            now = time.time()
            if now < last_time + 2:
                continue

            last_time = now
            try:
                import psutil
                process = psutil.Process(p.pid)

                # ``children`` is the current psutil spelling; the old
                # ``get_children`` alias was removed in psutil 3.0.
                rss = sum((proc.memory_info().rss
                           for proc in process.children(recursive=True)),
                          process.memory_info().rss)
                rss = (rss >> 20)  # bytes -> MB

                if rss > mem * 1.5:
                    report("task %s used too much memory: %dMB > %dMB * 1.5, kill it. "
                           "use -m argument to request more memory." % (
                               tid, rss, mem))
                    p.kill()

                elif rss > mem:
                    report("task %s used too much memory: %dMB > %dMB, "
                           "use -m to request for more memory" % (
                               tid, rss, mem))

            except Exception:
                # Best-effort monitoring: psutil may be missing or the
                # process may have exited between poll() and the sample.
                pass

        if code == 0:
            status = 'TASK_FINISHED'
        else:
            report(command_text() + ' exit with %s' % code)
            status = 'TASK_FAILED'
    except Exception:
        status = 'TASK_FAILED'
        import traceback
        report('exception while open ' + command_text())
        report(traceback.format_exc().rstrip('\n'))

    reply_status(driver, task.task_id, status)

    # Closing the write ends lets the forwarding threads finish.
    wout.close()
    werr.close()
    t1.join()
    t2.join()

    self.ps.pop(tid, None)
    self.ts.pop(tid, None)
Esempio n. 30
0
 def frameworkMessage(self, driver, executorId, slaveId, message):
     """Add the integer carried by an executor message to ``self.result``."""
     increment = int(decode_data(message))
     self.result += increment
Esempio n. 31
0
 def frameworkMessage(self, driver, executorId, slaveId, message):
     """Accumulate one executor result; print the sum and stop when all arrived.

     Each message is decoded and parsed as a float, then added to
     ``self.sum_res``.  Once ``self.finished`` reaches ``self.counts`` the
     total is printed and the driver is stopped.
     """
     value = float(decode_data(message))
     self.sum_res = self.sum_res + value
     self.finished = self.finished + 1
     if self.finished < self.counts:
         return
     print(self.sum_res)
     driver.stop()
Esempio n. 32
0
def assert_message(testcase, expected_message, actual_encoded_message):
    """Assert that an encoded JSON message decodes to ``expected_message``.

    Args:
        testcase: object providing ``assertEqual`` (presumably a
            ``unittest.TestCase`` -- confirm against callers).
        expected_message: the value the decoded JSON must equal.
        actual_encoded_message: encoded payload; it is passed through
            ``decode_data``, decoded as UTF-8 and parsed as JSON.
    """
    actual_message = json.loads(
        decode_data(actual_encoded_message).decode('utf8'))
    # assertEquals is a deprecated alias that was removed in Python 3.12.
    testcase.assertEqual(expected_message, actual_message)
Esempio n. 33
0
    def statusUpdate(self, driver, status):
        """Handle a task status report and forward it to the owning job.

        ``TASK_RUNNING`` is passed to the job if it is still active;
        otherwise the task is killed.  For any other state the task is
        removed from the bookkeeping maps.  If the job is still active and
        the state is ``TASK_FINISHED``/``TASK_FAILED`` with data attached,
        the data is unpickled into ``(reason, result, accUpdate)`` and the
        result is decoded before being handed to the job.  A decoding
        failure is reported to the job as ``TASK_FAILED``.
        """
        tid = status.task_id.value
        state = status.state
        logger.debug('status update: %s %s', tid, state)

        jid = self.taskIdToJobId.get(tid)
        # The task id is three ':'-separated integers; the first is unused.
        _, task_id, tried = map(int, tid.split(':'))

        if state == 'TASK_RUNNING':
            if jid not in self.activeJobs:
                logger.debug('kill task %s as its job has gone', tid)
                self.driver.killTask(Dict(value=tid))
                return
            self.activeJobs[jid].statusUpdate(task_id, tried, state)
            return

        # The task is no longer running: forget it everywhere.
        self.taskIdToJobId.pop(tid, None)
        if jid in self.jobTasks:
            self.jobTasks[jid].remove(tid)
        if tid in self.taskIdToAgentId:
            agent_id = self.taskIdToAgentId.pop(tid)
            if agent_id in self.agentTasks:
                self.agentTasks[agent_id] -= 1

        if jid not in self.activeJobs:
            logger.debug('ignore task %s as its job has gone', tid)
            return

        job = self.activeJobs[jid]
        payload = status.get('data')
        if state not in ('TASK_FINISHED', 'TASK_FAILED') or not payload:
            # killed, lost, or finished/failed without any data
            job.statusUpdate(task_id, tried, state, payload)
            return

        try:
            reason, result, accUpdate = cPickle.loads(decode_data(payload))
            if result:
                flag, payload = result
                if flag >= 2:
                    # The payload is a location to fetch; retry once.
                    try:
                        payload = urllib.urlopen(payload).read()
                    except IOError:
                        payload = urllib.urlopen(payload).read()
                    flag -= 2
                payload = decompress(payload)
                # flag 0 selects marshal, anything else cPickle.
                loads = marshal.loads if flag == 0 else cPickle.loads
                result = loads(payload)
        except Exception as e:
            logger.warning('error when cPickle.loads(): %s, data:%s', e,
                           len(payload))
            return job.statusUpdate(task_id, tried, 'TASK_FAILED',
                                    'load failed: %s' % e)

        return job.statusUpdate(task_id, tried, state, reason, result,
                                accUpdate)
Esempio n. 34
0
    def statusUpdate(self, driver, status):
        """Handle a task status report, forward it to its job, redraw progress.

        ``TASK_RUNNING`` is passed to the job if it is still active;
        otherwise the task is killed.  For any other state the task is
        removed from the bookkeeping maps.  If the job is still active and
        the state is ``TASK_FINISHED``/``TASK_FAILED`` with data attached,
        the data is unpickled into ``(reason, result, accUpdate, task_stats)``
        and the result is decoded before being handed to the job.  A
        decoding failure is reported to the job as ``TASK_FAILED``.
        """

        def plot_progresses():
            # Redraw one progress line per active job (only in colour mode).
            if not self.color:
                return
            total = len(self.activeJobs)
            logger.info('\x1b[2K\x1b[J\x1b[1A')
            last_index = total - 1
            for index, job_id in enumerate(self.activeJobs):
                # Only the last job gets the cursor-up escape sequence.
                ending = '\x1b[%sA' % total if index == last_index else ''
                self.activeJobs[job_id].progress(ending)

        tid = status.task_id.value
        state = status.state
        logger.debug('status update: %s %s', tid, state)

        jid = self.taskIdToJobId.get(tid)
        # The task id is three ':'-separated integers; the first is unused.
        _, task_id, tried = [int(part) for part in tid.split(':')]

        if state == 'TASK_RUNNING':
            if jid not in self.activeJobs:
                logger.debug('kill task %s as its job has gone', tid)
                self.driver.killTask(Dict(value=tid))
                return
            job = self.activeJobs[jid]
            job.statusUpdate(task_id, tried, state)
            if job.tasksFinished == 0:
                plot_progresses()
            return

        # The task is no longer running: forget it everywhere.
        self.taskIdToJobId.pop(tid, None)
        if jid in self.jobTasks:
            self.jobTasks[jid].remove(tid)
        if tid in self.taskIdToAgentId:
            agent_id = self.taskIdToAgentId.pop(tid)
            if agent_id in self.agentTasks:
                self.agentTasks[agent_id] -= 1

        if jid not in self.activeJobs:
            logger.debug('ignore task %s as its job has gone', tid)
            return

        job = self.activeJobs[jid]
        reason = status.get('message')
        payload = status.get('data')
        if state not in ('TASK_FINISHED', 'TASK_FAILED') or not payload:
            # killed, lost, or finished/failed without any data
            job.statusUpdate(task_id, tried, state, reason or payload)
            return

        try:
            reason, result, accUpdate, task_stats = six.moves.cPickle.loads(
                decode_data(payload))
            if result:
                flag, payload = result
                if flag >= 2:
                    # The payload is a location to fetch; retry once.
                    try:
                        payload = urllib.request.urlopen(payload).read()
                    except IOError:
                        payload = urllib.request.urlopen(payload).read()
                    flag -= 2
                payload = decompress(payload)
                # flag 0 selects marshal, anything else cPickle.
                if flag == 0:
                    loads = marshal.loads
                else:
                    loads = six.moves.cPickle.loads
                result = loads(payload)
        except Exception as e:
            logger.warning(
                'error when cPickle.loads(): %s, data:%s', e, len(payload))
            job.statusUpdate(task_id, tried, 'TASK_FAILED',
                             'load failed: %s' % e)
            return

        job.statusUpdate(task_id, tried, state, reason, result, accUpdate,
                         task_stats)
        if state == 'TASK_FINISHED':
            plot_progresses()