Beispiel #1
0
    def stopTask(self, request, context):
        """Handle a stop-task request by terminating the task's process.

        The task id is taken from ``request.id``. The request is logged and
        recorded as a ``stop_task`` user action; the process stored under that
        id in ``self._task_dict`` is then terminated and its entry deleted.

        A ``mxserver_pb2.TaskState`` carrying the task id is returned in every
        outcome: unknown task id, successful termination, or a failure while
        terminating.
        """
        task_id = request.id

        received_msg = (
            'mxnet_service has received a new request to stop the task with id:%s'
            % task_id)
        self._logger.info(received_msg)
        self._record_user_action(task_id, 'stop_task')

        target_process = self._task_dict.get(task_id)

        # Guard clause: nothing is registered under this id.
        if target_process is None:
            self._logger.warn(
                'mxnet_service can not find a task with id: %s' % task_id)
            return mxserver_pb2.TaskState(
                task_id=task_id,
                state_code=TASK_STATE_CODES[1],
                state_desc=TASK_STATES[2])

        try:
            target_process.terminate()
            self._logger.warn(
                'mxnet_service has terminated the task with id: %s' % task_id)
            # The entry is only dropped once terminate() has returned.
            del self._task_dict[task_id]
            return mxserver_pb2.TaskState(
                task_id=task_id,
                state_code=TASK_STATE_CODES[0],
                state_desc=TASK_STATES[3])
        except BaseException as err:
            failure_msg = (
                'mxnet_service can not terminate the task with id: %s! Because %s'
                % (task_id, exception_msg(err)))
            self._logger.warn(failure_msg)
            return mxserver_pb2.TaskState(
                task_id=task_id,
                state_code=TASK_STATE_CODES[1],
                state_desc=TASK_STATES[4])
Beispiel #2
0
def query_gpu():
    """Return the local GPU information as a JSON response.

    The payload produced by ``gpu_monitor.query_gpu()`` is wrapped in a
    response whose ``Content-Type`` is ``application/json``. If anything
    fails while building that response, the error is logged and an empty
    JSON list is returned instead.
    """
    mxserver_flask_logger.info(
        'The mxserver_flask_server receives a request to query local GPU infos')
    try:
        gpu_response = make_response(gpu_monitor.query_gpu())
        gpu_response.headers['Content-Type'] = 'application/json'
    except BaseException as err:
        failure_msg = (
            'The mxserver_flask_server fails to query local GPU infos! Error message: %s'
            % exception_msg(err))
        mxserver_flask_logger.error(failure_msg)
        return jsonify([])
    return gpu_response
Beispiel #3
0
    return task_state


def __task_state_2_json(task_state):
    """Serialize a task state object to a JSON object string.

    Reads ``id``, ``state_code`` and ``state_desc`` from *task_state* and
    returns a JSON object with the keys ``task_id``, ``state_code`` and
    ``state_desc``, in that order. Every value is emitted as a JSON string,
    exactly as the previous ``%s``-based template did.

    Unlike plain string interpolation, quotes, backslashes and control
    characters inside the values are escaped, so the result is always valid
    JSON. Non-ASCII characters are emitted as ``\\uXXXX`` escapes.

    NOTE(review): this reads ``task_state.id``; confirm that the state object
    passed in exposes ``id`` (and not e.g. ``task_id``).
    """
    # Local imports keep this block self-contained.
    import json
    from collections import OrderedDict

    # OrderedDict keeps the historical key order on every Python version.
    payload = OrderedDict([
        ('task_id', '%s' % (task_state.id,)),
        ('state_code', '%s' % (task_state.state_code,)),
        ('state_desc', '%s' % (task_state.state_desc,)),
    ])
    return json.dumps(payload)


if __name__ == '__main__':
    # Startup sequence: register with ZooKeeper first, then serve. The two
    # informational messages are only logged when ZkRegister.use_zk() is true;
    # the registration call itself is always made. Any failure during
    # registration is logged and aborts the process.
    try:
        if ZkRegister.use_zk():
            mxserver_flask_logger.info(
                'The mxserver flask server is trying to register to ZooKeeper'
            )

        zk_register = ZkRegister()
        zk_register.register_flask_to_zk()

        if ZkRegister.use_zk():
            mxserver_flask_logger.info(
                'The mxserver flask server has registered to ZooKeeper'
            )
    except BaseException as register_error:
        failure_text = (
            'The mxserver flask server can not register to ZooKeeper! System exists! '
            'Error message: \n%s' % exception_msg(register_error)
        )
        mxserver_flask_logger.error(failure_text)
        sys.exit('Failed to register to ZooKeeper')

    # Registration succeeded (or raised nothing): start the Flask app on the
    # configured host and port.
    mxserver_flask_logger.info('The mxserver flask server has been started')
    app.run(
        host=mxserver_flask_config['host'],
        port=mxserver_flask_config['port'],
    )
# -*- coding: utf-8 -*-

# @Author: Terence Wu
# @Time: 26/02/18 上午 11:37
from requests import post
from test_resources import STOP_TEST_URL, STOP_REQUEST_JSON
from util.logger_generator import get_logger
from util.exception_handler import exception_msg


if __name__ == '__main__':
    # Manual smoke test: POST the stop-request JSON to STOP_TEST_URL and log
    # the status code and body of the reply. Any failure is logged instead of
    # being raised.
    logger = get_logger('test_stop_request')
    logger.info('Begin to test API for deep learning training')
    logger.info('Begin to send request to url: %s' % STOP_TEST_URL)
    try:
        response = post(json=STOP_REQUEST_JSON, url=STOP_TEST_URL)
        logger.info('Receive a response')
        status_line = "Response's status code: %s" % response.status_code
        logger.info(status_line)
        content_line = "Response's content: %s" % response.content
        logger.info(content_line)
    except BaseException as e:
        error_line = 'Fail! Error message: %s\n' % exception_msg(e)
        logger.error(error_line)
    def run(self):
        """Run the task: prepare its data, create the executor, execute it.

        An empty progress record is inserted for this task first, and the task
        description is parsed into the executor parameters and data config.
        The work then proceeds in three stages, each updating the task state:

        1. data preparation through ``DataManager.prepare_data()``;
        2. executor creation through ``Executor.create_executor(...)``;
        3. execution through ``executor.execute()``.

        An exception raised inside any of the three stages is caught, logged
        and recorded as the matching failure state, and the method returns
        without running the later stages.
        """
        self._task_progress_recorder.insert_one({
            'task_id': self._process_id,
            'task_progresses': []
        })

        for_training, exec_type, executor_params_dict = parse_task_desc(
            self._task_desc)
        executor_params_dict['task_id'] = self._process_id
        data_config = get_data_config(self._task_desc)

        # Stage 1: build the data iterators.
        self._update_task_state('TASK_BEGIN_PREPARE_DATA')
        try:
            data_manager = DataManager(for_training=for_training,
                                       target=exec_type,
                                       data_config=data_config)
            data_iters = data_manager.prepare_data()
            self._update_task_state('TASK_PREPARE_DATA_DONE')
        except BaseException as e:
            self._update_task_state('TASK_BEGIN_PREPARE_DATA_FAILED')
            _logger.error(
                'Task_%s\'s DataIter instances creation failed! Because %s' %
                (self._process_id, exception_msg(e)))
            return

        if for_training:
            # Training uses the first iterator for training data and, when a
            # second one is present, that one for validation.
            executor_params_dict['train_iter'] = data_iters[0]
            if len(data_iters) == 1:
                executor_params_dict['val_iter'] = None
            else:
                executor_params_dict['val_iter'] = data_iters[1]
        else:
            executor_params_dict['data_batch_list'] = data_iters

        # Stage 2: create the executor.
        try:
            executor = Executor.create_executor(for_training=for_training,
                                                exec_type=exec_type,
                                                **executor_params_dict)
            self._update_task_state('TASK_EXECUTOR_CREATION_DONE')
            # Successful creation is informational, not an error.
            _logger.info('Task_%s\'s Executor instances creation done!' %
                         self._process_id)
        except BaseException as e:
            self._update_task_state('TASK_EXECUTOR_CREATION_FAILED')
            _logger.error(
                'Task_%s\'s Executor instances creation failed! Because %s' %
                (self._process_id, exception_msg(e)))
            return

        # Stage 3: run the executor.
        try:
            self._update_task_state('TASK_BEGIN_RUNNING')
            _logger.info('Task_%s running is starting now' % self._process_id)
            executor.execute()
            self._update_task_state('TASK_DONE_SUCCESSFULLY')
            _logger.info('Task_%s running is done successfully' %
                         self._process_id)
        except BaseException as e:
            self._update_task_state('TASK_TERMINATED_BY_INTERNAL_ERROR')
            _logger.error(
                'Task_%s has been terminated by server internal error! Because %s'
                % (self._process_id, exception_msg(e)))
Beispiel #6
0
# Add rcnn package to sys.path
sys.path.append(mxserver_mxnet_config['rcnn-path'])
# Parenthesised single-argument print emits the same text on Python 2 and is
# also valid syntax on Python 3 (the bare print statement is not).
print(sys.path)

if __name__ == '__main__':
    main_logger = get_logger('mxserver_worker_logger')
    try:
        if ZkRegister.use_zk():
            main_logger.info('The mxserver worker is trying to register to ZooKeeper')
        zk_register = ZkRegister()
        zk_register.register_worker_to_zk()
        if ZkRegister.use_zk():
            main_logger.info('The mxserver worker has registered to ZooKeeper')
    except BaseException as e:
        main_logger.error('The mxserver worker can not register to ZooKeeper! System exists! Error message: \n%s'
                          % exception_msg(e))
        sys.exit('Failed to register to ZooKeeper')
    task_queue = Queue(int(mxserver_task_queue_config['queue-max-size']))
    try:
        executor_process_manager = ExecutorProcessManager(task_queue=task_queue)
        executor_process_manager.start()

        server = grpc.server(futures.ThreadPoolExecutor(max_workers=int(mxserver_rpc_config['max-thread-num'])))
        mxserver_pb2_grpc.add_MXNetServiceServicer_to_server(MXNetService(task_queue), server)

        uri = mxserver_rpc_config['host'] + ':' + str(mxserver_rpc_config['port'])
        server.add_insecure_port(uri)

        server.start()
        main_logger.info('The mxserver worker has been started at: %s, waiting for request.' % uri)
        try: