# Example 1
    def start(self, ctx, alloc_guard):
        """Wire up sandbox settings from ctx, then launch workers and the RPC server."""
        # Plain configuration values copied straight off the context object.
        for own_attr, ctx_attr in (
            ('_sbx_task_priority', 'sandbox_task_priority'),
            ('_executor_resource_id', 'sandbox_executor_resource_id'),
            ('_rpc_listen_addr', 'sandbox_rpc_listen_addr'),
            ('_sbx_task_kill_timeout', 'sandbox_task_kill_timeout'),
            ('_sbx_task_owner', 'sandbox_task_owner'),
            ('_sbx_python_resource_id', 'sandbox_python_resource_id'),
            ('_default_sandbox_client', 'default_sandbox_client'),
            ('_create_sandbox_client', 'create_sandbox_client'),
        ):
            setattr(self, own_attr, getattr(ctx, ctx_attr))

        # Worker pools: one private pool for RPC I/O, a shared one for Sandbox calls.
        self._rpc_invoker = ActionQueue(
            thread_count=ctx.sandbox_rpc_invoker_thread_pool_size,
            thread_name_prefix='RpcIO')
        self._sbx_invoker = ctx.sandbox_action_queue

        self._tasks_status_awaiter = self.TasksAwaiter(self._default_sandbox_client, self)

        # Re-arm packets restored from a backup before serving anything.
        self._vivify_packets(alloc_guard)

        self._rpc_server = self._create_rpc_server(ctx)
        self._tasks_status_awaiter.start()
        self._rpc_server.start()
# Example 2
class RemotePacketsDispatcher(object):
    _SANDBOX_TASK_CREATION_RETRY_INTERVAL = 10.0
    _RPC_RESEND_INTERVAL = 20.0
    _FINAL_UPDATE_FETCH_TIMEOUT = 30.0

    # It's better to use TOps than inheritance
    class TasksAwaiter(SandboxTaskStateAwaiter):
        """Forwards Sandbox task status notifications to the owning dispatcher.

        Overrides SandboxTaskStateAwaiter._notify so that every status change
        ends up in RemotePacketsDispatcher._on_task_status_change.
        """

        def __init__(self, sandbox, dispatcher):
            SandboxTaskStateAwaiter.__init__(self, sandbox)
            # Back-reference used only to deliver callbacks.
            self.__dispatcher = dispatcher

        def _notify(self, task_id, status, status_group, can_has_res):
            # Called by the base class; relay the event to the dispatcher.
            self.__dispatcher._on_task_status_change(task_id, status, status_group, can_has_res)

    def __init__(self, rhs=None):
        """Create a dispatcher, optionally adopting the task map of *rhs*."""
        if rhs:
            # NOTE: the mapping is shared with rhs, not copied (original behavior).
            self._by_task_id = rhs._by_task_id
        else:
            self._by_task_id = {}

    def __getstate__(self):
        """Snapshot state for pickling; the task map is shallow-copied."""
        sdict = {}
        sdict['_by_task_id'] = self._by_task_id.copy()
        return sdict

    def __setstate__(self, sdict):
        """Restore state previously captured by __getstate__."""
        by_task_id = sdict['_by_task_id']
        self._by_task_id = by_task_id

    def _create_rpc_server(self, ctx):
        """Build (but do not start) the XML-RPC server that executors call back into."""
        server = AsyncXMLRPCServer2(
            ctx.sandbox_rpc_server_thread_pool_size,
            self._rpc_listen_addr,
            allow_none=True)

        #srv.register_function(self._on_rpc_ping, 'ping') # FIXME Don't remember what for
        server.register_function(self._on_rpc_update_graph, 'update_graph')
        return server

    def start(self, ctx, alloc_guard):
        """Wire up sandbox settings from ctx, then launch workers and the RPC server."""
        # Plain configuration values copied straight off the context object.
        for own_attr, ctx_attr in (
            ('_sbx_task_priority', 'sandbox_task_priority'),
            ('_executor_resource_id', 'sandbox_executor_resource_id'),
            ('_rpc_listen_addr', 'sandbox_rpc_listen_addr'),
            ('_sbx_task_kill_timeout', 'sandbox_task_kill_timeout'),
            ('_sbx_task_owner', 'sandbox_task_owner'),
            ('_sbx_python_resource_id', 'sandbox_python_resource_id'),
            ('_default_sandbox_client', 'default_sandbox_client'),
            ('_create_sandbox_client', 'create_sandbox_client'),
        ):
            setattr(self, own_attr, getattr(ctx, ctx_attr))

        # Worker pools: one private pool for RPC I/O, a shared one for Sandbox calls.
        self._rpc_invoker = ActionQueue(
            thread_count=ctx.sandbox_rpc_invoker_thread_pool_size,
            thread_name_prefix='RpcIO')
        self._sbx_invoker = ctx.sandbox_action_queue

        self._tasks_status_awaiter = self.TasksAwaiter(self._default_sandbox_client, self)

        # Re-arm packets restored from a backup before serving anything.
        self._vivify_packets(alloc_guard)

        self._rpc_server = self._create_rpc_server(ctx)
        self._tasks_status_awaiter.start()
        self._rpc_server.start()

    def _vivify_packets(self, alloc_guard):
        """Re-arm packets restored from a backup: client, run guard, awaiting, pending actions."""
        packets = self._by_task_id
        logging.debug('RemotePacketsDispatcher packet count: %d' % len(packets))

        for pck in packets.itervalues():
            logging.debug('VIVIFY %s' % pck.id)
            if pck._sandbox is None:
                # No per-packet client was pickled; fall back to the shared one.
                pck._sandbox = self._default_sandbox_client
            pck._run_guard = alloc_guard()
            self._await_task_status(pck)
            self._reschedule_packet(pck)

    def _reschedule_packet(self, pck):
        """Resume the in-flight async step matching pck's persisted state."""
        # TODO Check
        state = pck._state

        if state == RemotePacketState.CREATING:
            action = self._start_create_sandbox_task
        elif state == RemotePacketState.STARTING:
            action = self._start_start_sandbox_task
        elif state == RemotePacketState.FETCHING_RESOURCE_LIST:
            action = self._start_fetch_resource_list
        elif state == RemotePacketState.FETCHING_FINAL_UPDATE:
            action = self._start_fetch_final_update
        elif state == RemotePacketState.STARTED \
                and pck._target_stop_mode != pck._sent_stop_mode:
            # A stop was requested but never delivered to the remote side.
            action = self._start_packet_stop
        else:
            action = None

        if action is not None:
            logging.debug('action %s' % action)
            action(pck)

    def stop(self):
        """Shut down RPC serving and background workers.

        # NOTE(review): ordering (server, invoker, awaiter) assumed
        # intentional — confirm before reordering.
        """
        self._rpc_server.shutdown()
        self._rpc_invoker.stop()
        self._tasks_status_awaiter.stop()

    # XXX TODO
    # Instance must also ping server (if server doesn't ping instance)
    # so REM-server will register instance's remote_addr after
    # loading old backup (after server's fail)

    def register_packet(self, pck):
        """Attach a Sandbox client to a fresh packet and kick off task creation."""
        if pck._oauth_token:
            # Per-user token -> dedicated client authorized as that user.
            pck._sandbox = self._create_sandbox_client(pck._oauth_token)
        else:
            pck._sandbox = self._default_sandbox_client
        self._start_create_sandbox_task(pck)

        # FIXME We can't identify packet by pck_id in async/delayed calls
        #       because packet may be recreated by PacketBase
        #       (or we must cancel invokers (not only delayed_executor))

    def _start_create_sandbox_task(self, pck):
        """Queue asynchronous Sandbox-task creation for pck."""
        def run():
            self._do_create_sandbox_task(pck)
        self._sbx_invoker.invoke(run)

    def _start_start_sandbox_task(self, pck):
        """Queue asynchronous starting of pck's already-created Sandbox task."""
        def run():
            self._do_start_sandbox_task(pck)
        self._sbx_invoker.invoke(run)

    #def _start_delete_task(self, task_id):
        #self._sbx_invoker.invoke(lambda : pck._sandbox.delete_task(task_id)) # no retries

    # TODO Consider following options:
    #   max_restarts=0
    #   kill_timeout=14 * 86400
    #   fail_on_any_error=False

    def _sbx_create_task(self, pck):
        """Create (without starting) the RUN_REM_JOBPACKET task in Sandbox."""
        # TODO remove flow-fork after update on veles02:7104
        files_setup = pck._custom_resources
        resource_ids = []
        if isinstance(files_setup, PacketResources):
            files_setup, resource_ids = files_setup.files_setup, files_setup.resource_ids

        # Line-wrap the snapshot blob for readability on the Sandbox task page.
        snapshot_data = pck._start_snapshot_data
        if snapshot_data:
            snapshot_data = wrap_string(snapshot_data, 79)
        else:
            snapshot_data = None

        task_ctx = {
            'rem_server_addr': ('%s:%d' % self._rpc_listen_addr),
            'pck_id': pck.id,
            'executor_resource': self._executor_resource_id,
            'snapshot_data': snapshot_data,
            'snapshot_resource_id': pck._start_snapshot_resource_id,
            # '=' to prevent '[object Object]' rendering of parameter on Sandbox task page
            'custom_resources': '=' + json.dumps(files_setup, indent=3),
            'vaults_setup': json.dumps(pck._vaults_setup, indent=3),
            'custom_resources_list': map(str, resource_ids),
            'python_resource': self._sbx_python_resource_id,
            'resume_params': json.dumps({
                'reset_tries': pck._reset_tries_at_start,
                'use_dummy_jobs': USE_DUMMY_JOBS,
            }),
        }

        return pck._sandbox.create_task('RUN_REM_JOBPACKET', task_ctx)

    def _sbx_update_task(self, pck, task):
        """Set meta data on a freshly created Sandbox task.

        Builds a human-readable description (including the serialized job
        graph) and applies owner/priority/kill-timeout settings.  Called
        between create_task and start_task.
        """
        real_pck = pck._ops._ops.pck
        prev_task_id = pck._ops._prev_task_id

        # Serialize the job graph purely for the task description (debug aid).
        jobs = {
            job.id: {
                'command': job.shell,
                'parents': job.parents,
                'pipe_parents': job.inputs,
                'max_try_count': job.max_try_count,
                #'max_working_time': job.max_working_time,
                #retry_delay = retry_delay
                #pipe_fail = pipe_fail
            }
                for job in real_pck.jobs.itervalues()
        }

        description = '''pck_id: {pck_id}
pck_name: {pck_name}
rem_server: {rem_host}:{rem_port}
prev_history: {history}
prev_task: {prev_task}

{job_graph}
'''.format(

            rem_host=self._rpc_listen_addr[0],
            rem_port=self._rpc_listen_addr[1],
            prev_task=prev_task_id,

            pck_id=pck.id,
            pck_name=real_pck.name,
            #pck_name_timestamp_descr=' (1464601024 == 2016-05-30T12:37:20)', # TODO
            history=real_pck.history[:-13], # TODO

            job_graph=json.dumps(jobs, indent=3),
        )

        task.update(
            max_restarts=0,
            kill_timeout=self._sbx_task_kill_timeout,
            owner=self._sbx_task_owner,
            priority=self._sbx_task_priority,
            notifications=[],
            description=description,
            host=pck._host,
            #fail_on_any_error=True, # FIXME What is this?
        )

    def _mark_as_finished(self, pck, reason=None):
        """Move pck to FINISHED, drop bookkeeping and notify the packet's owner.

        Owner notification is skipped if the packet already passed through
        TASK_FIN_WAIT, because _mark_task_fin_wait notified it at that point.
        """
        prev_state = pck._state
        pck._set_state(RemotePacketState.FINISHED, reason)
        pck._run_guard = None # j.i.c

        # TODO NotImplementedError
        #self._tasks_status_awaiter.cancel_wait(pck._sandbox_task_id)

        # Packets cancelled before creation never got a task id, hence the guard.
        if pck._sandbox_task_id:
            self._by_task_id.pop(pck._sandbox_task_id)

        if prev_state != RemotePacketState.TASK_FIN_WAIT:
            pck._ops._on_packet_terminated() # TODO Ugly

    def _mark_task_fin_wait(self, pck, reason=None):
        """Graph work is done; keep waiting for the Sandbox task itself to terminate."""
        pck._set_state(RemotePacketState.TASK_FIN_WAIT, reason)
        pck._ops._on_packet_terminated() # TODO Ugly

    def _do_create_sandbox_task(self, pck):
        """Create and configure the Sandbox task for pck (runs on _sbx_invoker).

        Transient Sandbox errors (network / server-internal) reschedule a
        retry; unexpected errors set an error and finish the packet.  Every
        step aborts quietly if a stop/cancel was requested meanwhile.
        """
        def reschedule_if_need():
            # Retry creation later unless a stop/cancel was requested.
            with pck._lock:
                if pck._target_stop_mode:
                    return

                self._schedule(
                    pck,
                    self._start_create_sandbox_task,
                    timeout=self._SANDBOX_TASK_CREATION_RETRY_INTERVAL)

        with pck._lock:
            if pck._target_stop_mode:
                return

        def handle_unknown_error(e):
            # Non-retryable failure: record the error and finish the packet.
            with pck._lock:
                if not pck._target_stop_mode:
                    pck.set_error(str(e), False)
                self._mark_as_finished(pck, 'Unknown error while creating: %s' % e)

        # FIXME Invert logic: retry everything except permanent

        try:
            task = self._sbx_create_task(pck)
        except (sandbox.NetworkError, sandbox.ServerInternalError) as e:
            reschedule_if_need()
            return
        except Exception as e:
            logging.exception('Failed to create sandbox task for %s' % pck.id)
            handle_unknown_error(e)
            return

        logging.debug('sbx:%d for %s created' % (task.id, pck.id))

        with pck._lock:
            if pck._target_stop_mode:
                return

        try:
            self._sbx_update_task(pck, task)

        except (sandbox.NetworkError, sandbox.ServerInternalError) as e:
            reschedule_if_need()
            #self._start_delete_task(task.id)
            return
        except Exception as e:
            logging.exception('Failed to update (after start) task %s for %s' % (task.id, pck.id))
            handle_unknown_error(e)
            #self._start_delete_task(task.id)
            return

        with pck._lock:
            if pck._target_stop_mode:
                return

            # FIXME fork locking (mallformed pck state)
            pck._set_task_id(task.id)
            pck._set_state(RemotePacketState.STARTING)

            self._by_task_id[task.id] = pck

        self._await_task_status(pck)
        self._do_start_sandbox_task(pck)

    def _await_task_status(self, pck):
        """Subscribe for status-change notifications of pck's Sandbox task."""
        # NOTE: '.await' is a method name here (Python 2 code); 'await' became
        # a reserved keyword in Python 3.7+, so this line would need renaming
        # on migration.
        self._tasks_status_awaiter.await(pck._sandbox_task_id)

    def _do_start_sandbox_task(self, pck):
        """Ask Sandbox to start pck's already-created task.

        On failure the packet moves to CHECKING_START_ERROR (we cannot know
        whether the task actually started); on success to STARTED.  Either
        branch is a no-op if the packet left STARTING meanwhile.
        """
        # Possible events before the lock is acquired below:
        # - non-final GRAPH_UPDATE
        # - final GRAPH_UPDATE
        # - STOP_GRACEFULLY/STOP/CANCEL
        #
        # XXX No longer true:
        # No task status changes may appear here, because awaiting is not racy
        # - final GRAPH_UPDATE + task terminated
        # - task terminated without GRAPH_UPDATE's
        try:
            pck._sandbox.start_task(pck._sandbox_task_id)
        except Exception as e:
            logging.exception('Failed to start task %s' % pck)

            with pck._lock:
                if pck._state != RemotePacketState.STARTING:
                    return

                # TODO Don't forget to take into account ._target_stop_mode in _on_task_status_change

                # Here we don't know if task is really started
                is_error_permanent = \
                    not isinstance(e, (sandbox.NetworkError, sandbox.ServerInternalError))

                pck.set_error(str(e), is_error_permanent)
                pck._set_state(RemotePacketState.CHECKING_START_ERROR)

            return

        with pck._lock:
            if pck._state != RemotePacketState.STARTING:
                return

            pck._set_state(RemotePacketState.STARTED, '._sandbox.start() -> ok')

            assert not pck._peer_addr
            #if pck._target_stop_mode:
            #    <waiting for peer addr>
            #
            #    We can't STOP* for now, because we have no _peer_addr
            #    We theoretically can do CANCEL using Sandbox API, but
            #    I prefer do all STOP*/CANCEL in the same way.

            # XXX FIXME If firewall has no holes for us,
            # we can't do STOP*, but theoretically can do CANCEL

    # The hard way
    # must be called under lock
    #def _schedule(self, pck, impl, timeout=None):
        #id = None
        #wrap = lambda : self._execute_scheduled(pck, id, impl)
        #id = id(wrap)
        #cancel = delayed_executor.schedule(wrap, timeout=timeout)

        #pck._sched = (id, cancel)

    #def _execute_scheduled(self, pck, id, impl):
        #with pck._lock:
            #if not pck._sched or pck._sched[0] != id:
                #return

            #pck._sched = None

            #impl()

    # must be called under lock
    def _schedule(self, pck, impl, timeout=None):
        """Arrange a delayed call of impl(pck) via delayed_executor.

        The wrapper receives an id from delayed_executor — presumably the
        scheduled-entry id that _execute_scheduled compares against
        pck._sched.id to detect stale callbacks.  TODO confirm
        delayed_executor.schedule's callback contract.
        """
        logging.debug("++ ._schedule(%s, %s, timeout=%s)" % (pck, impl, timeout))
        wrap = lambda id: self._execute_scheduled(pck, id, impl)
        pck._sched = delayed_executor.schedule(wrap, timeout=timeout)

    def _execute_scheduled(self, pck, id, impl):
        """Run a previously scheduled impl unless the schedule was cancelled or replaced."""
        with pck._lock:
            # Stale callback: pck._sched was dropped or points to a newer entry.
            if not pck._sched or pck._sched.id != id:
                return

            pck._sched = None

        # ._sched is None means that impl is running
        # When impl stop it will set ._sched again or modify other fields of pck
        impl(pck)

    @traced_rpc_method()
    def _on_rpc_update_graph(self, task_id, peer_addr, state, is_final):
        """RPC endpoint: graph-state update pushed by a running executor.

        Records the executor's peer address on first contact (delivering any
        pending stop request), applies the update unless the packet is being
        cancelled, and on a final update moves the packet to TASK_FIN_WAIT
        when cancelled or successful.

        Raises WrongTaskIdError for unknown task ids.
        """
        pck = self._by_task_id.get(task_id)

        logging.debug('_on_rpc_update_graph: task_id=%s, pck_id=%s; status=%s; is_final=%s' % (
            task_id,
            pck and pck.id,
            GraphState.str(state['state']),
            is_final
        ))

        if not pck:
            raise WrongTaskIdError('No packet for sbx:%s' % task_id)

        with pck._lock:
            # FIXME
            # pck: connect
            # pck: write
            # rem: enter _on_rpc_update_graph
            # pck: CRASHED
            # pck: Sandbox' task FAILURE
            # rem: _on_task_status_change (enter + exit)
            # rem: _on_rpc_update_graph with self.lock <-- OOPS

            assert pck._state not in [
                RemotePacketState.CREATING,
                RemotePacketState.TASK_FIN_WAIT,
                RemotePacketState.FINISHED,
                RemotePacketState.FETCHING_RESOURCE_LIST,
                RemotePacketState.FETCHING_FINAL_UPDATE
            ], "_on_rpc_update_graph in %s state" % pck._state

            if pck._state in [RemotePacketState.STARTING,
                              RemotePacketState.CHECKING_START_ERROR,
                              RemotePacketState.STARTED]:
                # An update proves the executor is alive and running.
                if pck._state != RemotePacketState.STARTED:
                    pck._set_state(RemotePacketState.STARTED, '_on_rpc_update_graph')
                    pck._drop_sched_if_need()
                    assert not pck._peer_addr
                #else:
                    #assert not pck._sched # stop may be scheduled for example

                # First contact: remember where to send stop/cancel RPCs.
                if not pck._peer_addr:
                    pck._peer_addr = peer_addr
                    logging.debug('SET pck._peer_addr = %s for %s' % (peer_addr, pck))

                    if pck._target_stop_mode:
                        if is_final:
                            pck._sent_stop_mode = pck._target_stop_mode # FIXME
                        else:
                            self._start_packet_stop(pck)

            if pck._target_stop_mode != StopMode.CANCEL:
                pck._update_graph(state, is_final)

            if is_final:
                if pck._target_stop_mode == StopMode.CANCEL \
                    or state['state'] == GraphState.SUCCESSFULL:
                    self._mark_task_fin_wait(pck, '_on_rpc_update_graph(SUCCESSFULL)')
                else:
                    pass # XXX WAITING for TaskStateGroups.TERMINATED

    def _start_fetch_resource_list(self, pck):
        """Queue asynchronous fetching of the task's resource list."""
        def run():
            self._fetch_resource_list(pck)
        self._sbx_invoker.invoke(run)

    # FIXME From which task state resources are really ready?
    def _fetch_resource_list(self, pck):
        """Fetch the task's resources and decide how the packet finishes.

        Stores the execution-snapshot resource id, then either finishes the
        packet (final state already known) or proceeds to fetch the final
        graph update.  Fetch failures are retried unless the packet was
        cancelled.
        """
        try:
            ans = pck._sandbox.list_task_resources(pck._sandbox_task_id)
        except Exception:
            # Best-effort retry (was a bare 'except:', which also swallowed
            # SystemExit/KeyboardInterrupt).
            logging.exception('Failed to fetch task resources for %s' % pck)

            with pck._lock:
                if pck._target_stop_mode != StopMode.CANCEL:
                    self._schedule(pck, self._start_fetch_resource_list, self._SANDBOX_TASK_CREATION_RETRY_INTERVAL)

            return

        # TODO We don't have to _fetch_resource_list() in any TERMINATED task
        # state (e.g. ERROR, EXCEPTION)

        res_by_type = {
            resource['type']: resource
                for resource in ans['items']
        }

        with pck._lock:
            resource = res_by_type.get('REM_JOBPACKET_EXECUTION_SNAPSHOT')
            if not resource:
                logging.error("No REM_JOBPACKET_EXECUTION_SNAPSHOT resource in %s" % pck)
                err = "No REM_JOBPACKET_EXECUTION_SNAPSHOT resource"
                pck.set_error(err, False)
                self._mark_as_finished(pck, err)
                return

            pck._result_snapshot_resource_id = resource['id']

            if pck._final_state is None:
                # No final state known yet: the final graph update must be
                # downloaded from the resource proxy.
                # NOTE(review): a missing REM_JOBPACKET_GRAPH_UPDATE resource
                # would raise KeyError here — confirm it always exists.
                pck._final_update_url = res_by_type['REM_JOBPACKET_GRAPH_UPDATE']['http']['proxy']
                pck._set_state(RemotePacketState.FETCHING_FINAL_UPDATE)
            else:
                self._mark_as_finished(pck, '_fetch_resource_list')
                return

        self._fetch_final_update(pck) # not under lock

    def _start_fetch_final_update(self, pck):
        """Queue asynchronous download of the final graph update."""
        def run():
            self._fetch_final_update(pck)
        self._sbx_invoker.invoke(run)

    def _fetch_final_update(self, pck):
        """Download and apply the final graph update resource for pck.

        Transient failures (network errors, HTTP 5xx) are retried; permanent
        failures record an error on the packet and finish it.
        """
        def log_fail(error):
            logging.error('Failed to fetch %s for %s: %s' % (pck._final_update_url, pck, error))

        def reschedule_if_need():
            with pck._lock:
                if pck._target_stop_mode == StopMode.CANCEL:
                    return

                self._schedule(pck,
                            self._start_fetch_final_update,
                            self._SANDBOX_TASK_CREATION_RETRY_INTERVAL)

        try:
            resp = requests.get(pck._final_update_url, timeout=self._FINAL_UPDATE_FETCH_TIMEOUT)
        except Exception as e:
            # FIXME permanent errors?
            log_fail(e)
            reschedule_if_need()
            return

        if resp.status_code != 200:
            # '//' keeps this an int under Python 3 as well.
            http_status_group = resp.status_code // 100

            if http_status_group == 5:
                # Server-side error: transient, retry.
                # BUGFIX: this branch used to call log_fail(e) with 'e'
                # unbound (NameError on any 5xx response).
                log_fail('http status code == %d' % resp.status_code)
                reschedule_if_need()
                return

            elif http_status_group == 3:
                raise NotImplementedError("Redirects handling not implemented")

            else:
                log_fail('http status code == %d' % resp.status_code)

                with pck._lock:
                    if pck._target_stop_mode != StopMode.CANCEL:
                        pck.set_error("Can't fetch final update: status == %s" % resp.status_code, False) # TODO
                    self._mark_as_finished(pck)

                return

        try:
            # XXX security: pickle.loads on data fetched over HTTP — assumed
            # to be a trusted Sandbox-internal resource; never point this at
            # untrusted URLs.
            update = pickle.loads(resp.content)
        except Exception as e:
            log_fail('malformed dump: %s' % e)

            with pck._lock:
                if pck._target_stop_mode != StopMode.CANCEL:
                    pck.set_error('Malformed last update resource data: %s' % e, False)
                self._mark_as_finished(pck)

            return

        with pck._lock:
            if pck._target_stop_mode != StopMode.CANCEL:
                pck._update_graph(update, is_final=True)

            self._mark_as_finished(pck, '_fetch_final_update')

    def _on_task_status_change(self, task_id, _, status_group, can_has_res):
        """Callback from TasksAwaiter: react to a Sandbox task status-group change.

        can_has_res signals that result resources may already be available
        for fetching.  Unknown task ids are ignored.
        """
        #with self.lock:
        if True:
            pck = self._by_task_id.get(task_id)

        if not pck:
            return

        with pck._lock:
            #if pck._status_await_job_id != job_id:
                #return
            #pck._status_await_job_id = None

            state = pck._state

            assert state not in [
                RemotePacketState.FINISHED,
                RemotePacketState.FETCHING_RESOURCE_LIST,
                RemotePacketState.FETCHING_FINAL_UPDATE
            ]

            if state in [
                RemotePacketState.CREATING, # we subscribe just after creation and before start
            ]:
                return

            # TODO Check the code

            if status_group == TaskStateGroups.DRAFT:

                # Task not started yet (or start attempt outcome unknown).
                if state == RemotePacketState.STARTING:
                    pass

                elif state == RemotePacketState.CHECKING_START_ERROR:
                    # DRAFT proves the earlier start attempt didn't take effect.
                    if pck._is_error_permanent:
                        self._mark_as_finished(pck, 'is_error_permanent=True, DRAFT')
                    else:
                        pck._set_state(RemotePacketState.STARTING)
                        self._start_start_sandbox_task(pck)

                elif state in [RemotePacketState.STARTED, RemotePacketState.TASK_FIN_WAIT]:
                    # FIXME Race here between graph updates and _on_task_status_change
                    logging.warning('%s._on_task_status_change(%s, %s)' % (pck, status_group, state))
                    #raise AssertionError()

            elif status_group == TaskStateGroups.ACTIVE:

                if state in [RemotePacketState.STARTING, RemotePacketState.CHECKING_START_ERROR]:
                    pck._set_state(RemotePacketState.STARTED, 'TaskStateGroups.ACTIVE')

            elif status_group == TaskStateGroups.TERMINATED:

                if state in [RemotePacketState.STARTING,
                             RemotePacketState.CHECKING_START_ERROR,
                             RemotePacketState.STARTED]:

                    if can_has_res:
                        pck._set_state(RemotePacketState.FETCHING_RESOURCE_LIST)
                        self._start_fetch_resource_list(pck)
                    else:
                        pck.set_error("Unknown task error", False) # TODO
                        self._mark_as_finished(pck, 'Task TERMINATED and EXCEPTION/FAILURE')

                    # FIXME Does Sandbox delete task's meta info or it's always DELETED

                    # TODO Fetch and interpret ctx['__last_rem_error']

                    #TaskStatus.DELETING:
                    #TaskStatus.DELETED: # has context
                    #TaskStatus.FAILURE:
                    #TaskStatus.EXCEPTION:
                    #TaskStatus.NO_RES:
                    #TaskStatus.TIMEOUT:

                elif state == RemotePacketState.TASK_FIN_WAIT:
                    self._mark_as_finished(pck, 'Task TERMINATED on TASK_FIN_WAIT')


    def restart_packet(self, pck):
        """Restart a remote packet (not implemented yet)."""
        return # TODO

    def resume_packet(self, pck):
        """Resume a remote packet (not implemented yet)."""
        return # TODO

    def list_all_user_processes(self, pck):
        """Ask the packet's running executor for its user-process list via RPC."""
        proxy = self._create_packet_rpc_proxy(pck)
        return proxy.list_all_user_processes(pck._sandbox_task_id)

    def stop_packet(self, pck, kill_jobs):
        """Request a stop; kill_jobs selects hard stop vs graceful stop."""
        if kill_jobs:
            mode = StopMode.STOP
        else:
            mode = StopMode.STOP_GRACEFULLY
        self._stop_packet(pck, mode)

    def cancel_packet(self, pck):
        """Cancel the packet: request the strongest stop mode."""
        self._stop_packet(pck, StopMode.CANCEL)

    def _stop_packet(self, pck, stop_mode):
        """Drive pck towards stop_mode; weaker-or-equal requests are ignored.

        Raises AlreadyTerminated if the packet is already finished (or its
        graph completed successfully and only the task shutdown remains).
        """
        # TODO Check
        with pck._lock:
            if pck._target_stop_mode >= stop_mode:
                return

            def really_cancel():
                pck._drop_sched_if_need()
                self._mark_as_finished(pck, '_stop_packet')

            # BUGFIX: log the newly requested mode (previously logged the
            # old value of pck._target_stop_mode, contradicting the message).
            logging.debug("+ set _target_stop_mode to %s for %s" % (stop_mode, pck))
            pck._target_stop_mode = stop_mode

            state = pck._state

            if state == RemotePacketState.FINISHED \
                or state == RemotePacketState.TASK_FIN_WAIT \
                    and pck._final_state == GraphState.SUCCESSFULL:
                raise AlreadyTerminated()

            elif state == RemotePacketState.CREATING \
                or state == RemotePacketState.STARTING and pck._sched:

                # Nothing is running remotely yet (or only a retry is
                # scheduled): finish locally without any RPC.
                really_cancel()

            elif stop_mode == StopMode.CANCEL \
                and state in [
                    RemotePacketState.FETCHING_RESOURCE_LIST,
                    RemotePacketState.FETCHING_FINAL_UPDATE,
                    RemotePacketState.TASK_FIN_WAIT]:

                really_cancel()

            # FIXME
            elif state == RemotePacketState.STARTED:
                # Can only deliver the stop once the executor told us its address.
                if pck._peer_addr:
                    self._start_packet_stop(pck)

            #elif state in [RemotePacketState.STARTING, RemotePacketState.CHECKING_START_ERROR]:
                #pass # later

            #else:
                #raise Unreachable()

    def _create_packet_rpc_proxy(self, pck):
        """Build an XML-RPC proxy pointed at the packet's executor instance."""
        uri = 'http://%s' % join_host_port(*pck._peer_addr)
        return XMLRPCServerProxy(uri=uri, timeout=15.0)

    def _do_stop_packet(self, pck):
        """Deliver the current stop/cancel request to the executor over RPC.

        Runs on _rpc_invoker.  Reschedules itself while the packet stays
        STARTED and no stronger stop request has superseded this one.
        """
        task_id = pck._sandbox_task_id
        stop_mode = pck._target_stop_mode

        def reschedule_if_need():
            # Must be called under pck._lock.
            if pck._state != RemotePacketState.STARTED:
                return
            if pck._target_stop_mode > stop_mode:
                # A stronger request will be (or was) sent instead.
                return

            if pck._sched:
                # TODO Assert that we have another _do_stop_packet in pck._sched
                return

            self._schedule(
                pck,
                self._start_packet_stop,
                timeout=self._RPC_RESEND_INTERVAL)

        with pck._lock:
            if not pck._peer_addr: # FIXME _do_stop_packet must not be called on this condition
                reschedule_if_need()
                return

        proxy = self._create_packet_rpc_proxy(pck)

        # BUGFIX: format string previously lacked the closing parenthesis.
        logging.debug('_do_stop_packet(%s, stop_mode=%s)' % (pck, stop_mode))

        try:
            if stop_mode == StopMode.CANCEL:
                proxy.cancel(task_id)
            else:
                kill_jobs = stop_mode == StopMode.STOP
                proxy.stop(task_id, kill_jobs)

        except Exception as e:
            logging.warning("Failed to send stop to packet %s: %s" % (pck, e))

            if is_xmlrpc_exception(e, WrongTaskIdError) \
                    or isinstance(e, socket.error) and e.errno == errno.ECONNREFUSED:
                return # FIXME Is enough? # STOP/SUSPEND->EXECUTING

            with pck._lock:
                reschedule_if_need()

        else:
            with pck._lock:
                #assert pck._state == RemotePacketState.STARTED # FIXME XXX
                if pck._state != RemotePacketState.STARTED:
                    return

                if pck._sent_stop_mode < stop_mode:
                    pck._sent_stop_mode = stop_mode

    def _start_packet_stop(self, pck):
        """Queue delivery of the current stop request to the executor."""
        logging.debug('_start_packet_stop(%s, %s)' % (pck, pck._target_stop_mode))
        def run():
            self._do_stop_packet(pck)
        self._rpc_invoker.invoke(run)