コード例 #1
0
    def RequestDataBlock(self, request, context):
        """Validate the requesting worker, then delegate block allocation.

        Args:
            request: message carrying ``worker_rank``.
            context: RPC context; not used by this method.

        Returns:
            A ``DataBlockResponse`` with ``STATUS_INVALID_REQUEST`` when the
            rank is not in ``self._running_workers`` or is already in
            ``self._completed_workers``; otherwise whatever
            ``self._request_data_block(request)`` returns.
        """
        rank = request.worker_rank
        if rank not in self._running_workers:
            rejection = "unregistered worker"
        elif rank in self._completed_workers:
            rejection = "worker has completed"
        else:
            return self._request_data_block(request)

        # Both rejection cases share the same status code and differ only
        # in the message text.
        invalid = common_pb.Status(
            code=common_pb.StatusCode.STATUS_INVALID_REQUEST,
            error_message=rejection)
        return tm_pb.DataBlockResponse(status=invalid)
コード例 #2
0
ファイル: follower_tm.py プロジェクト: piiswrong/fedlearner
 def _data_block_response(self, request):
     """Build the reply for a worker asking for the block ``request.block_id``.

     The block is looked up through ``self._alloc_data_block``. A truthy
     result produces STATUS_SUCCESS with the block's file path and id (both
     converted to ``str``) and an empty meta path. Otherwise the reply is
     STATUS_NO_MORE_DATA when ``self._online_training`` is truthy and
     STATUS_DATA_FINISHED when it is not. Every outcome is logged at INFO.
     """
     reply = tm_pb.DataBlockResponse()
     block = self._alloc_data_block(block_id=request.block_id)
     owner = self.__class__.__name__

     if block:
         logging.info("%s allocated worker_%d with block id %s",
                      owner, request.worker_rank, block.block_id)
         reply.status.code = common_pb.STATUS_SUCCESS
         reply.status.error_message = 'success'
         info = reply.data_block_info
         info.data_path = str(block.data_block_fpath)
         info.meta_path = ''
         info.block_id = str(block.block_id)
         return reply

     # No block available: choose between "wait" and "finished".
     if self._online_training:
         log_fmt = ("%s allocated worker_%d with empty data block. "
                    "wait for new data block since online traning")
         code = common_pb.STATUS_NO_MORE_DATA
         message = 'please wait for datablock ready'
     else:
         log_fmt = ("%s allocated worker_%d with empty data block. "
                    "exit running since since batch traning")
         code = common_pb.STATUS_DATA_FINISHED
         message = 'datablock finished'

     logging.info(log_fmt, owner, request.worker_rank)
     reply.status.code = code
     reply.status.error_message = message
     return reply
コード例 #3
0
 def RequestDataBlock(self, request, context):
     """Serve a data-block request by delegating to ``self._receiver_fn``.

     Args:
         request: the incoming request, handed to the receiver unchanged.
         context: RPC context; not used by this method.

     Returns:
         The receiver's return value. If the receiver raises any
         ``Exception``, a fresh ``DataBlockResponse`` whose status code is
         ``STATUS_UNKNOWN_ERROR`` and whose error message is the formatted
         traceback of that exception.
     """
     # Function-scope import so the method does not depend on a module-level
     # import that may not exist in this file.
     import traceback  # pylint: disable=import-outside-toplevel

     response = tm_pb.DataBlockResponse()
     try:
         response = self._receiver_fn(request)
     except Exception:  # pylint: disable=broad-except
         response.status.code = common_pb.STATUS_UNKNOWN_ERROR
         # sys.exc_info() is a (type, value, traceback) tuple, not message
         # text; assigning it would itself fail inside this handler. Store
         # the formatted traceback string instead.
         response.status.error_message = traceback.format_exc()
     return response
コード例 #4
0
 def _request_data_block(self, request):
     """Resolve ``request.block_id`` to a data block and wrap it in a reply.

     The lookup goes through ``self._data_visitor.get_datablock_by_id``.
     A falsy result is logged as an error and answered with
     ``STATUS_INVALID_DATA_BLOCK``; otherwise the reply carries
     ``STATUS_SUCCESS`` together with the block's ``id`` and ``data_path``.
     """
     block = self._data_visitor.get_datablock_by_id(request.block_id)

     if not block:
         fl_logging.error("invalid data block id: %s", request.block_id)
         failure = common_pb.Status(
             code=common_pb.StatusCode.STATUS_INVALID_DATA_BLOCK,
             error_message="invalid data block")
         return tm_pb.DataBlockResponse(status=failure)

     fl_logging.info("allocated worker_%d with block: %s",
                     request.worker_rank, block.id)
     success = common_pb.Status(code=common_pb.StatusCode.STATUS_SUCCESS)
     return tm_pb.DataBlockResponse(
         status=success,
         block_id=block.id,
         data_path=block.data_path,
     )
コード例 #5
0
 def RequestDataBlock(self, request, context):
     """Serve a data-block request by delegating to ``self._receiver_fn``.

     Args:
         request: the incoming request, handed to the receiver unchanged.
         context: RPC context; not used by this method.

     Returns:
         The receiver's return value, or, when the receiver raises any
         ``Exception``, a ``DataBlockResponse`` marked
         ``STATUS_UNKNOWN_ERROR`` whose error message is the formatted
         traceback.
     """
     fallback = tm_pb.DataBlockResponse()
     try:
         return self._receiver_fn(request)
     except Exception:  # pylint: disable=broad-except
         fallback.status.code = common_pb.STATUS_UNKNOWN_ERROR
         # format_exc() renders the exception currently being handled.
         fallback.status.error_message = traceback.format_exc()
         return fallback
コード例 #6
0
    def _request_data_block(self, request):
        """Hand the next data block from ``self._data_visitor`` to a worker.

        Args:
            request: message carrying ``worker_rank`` (used for logging).

        Returns:
            A ``DataBlockResponse`` with ``STATUS_SUCCESS`` plus the block's
            ``id`` and ``data_path`` when the visitor yields a truthy block;
            otherwise one with ``STATUS_DATA_FINISHED``.
        """
        # next() with a default returns None once the visitor is exhausted,
        # replacing the explicit StopIteration handler.
        data_block = next(self._data_visitor, None)

        # A falsy block is treated the same as exhaustion, as before.
        if not data_block:
            return tm_pb.DataBlockResponse(status=common_pb.Status(
                code=common_pb.StatusCode.STATUS_DATA_FINISHED,
                error_message="data block finished"))

        fl_logging.info("allocated worker_%d with block: %s",
                        request.worker_rank, data_block.id)
        return tm_pb.DataBlockResponse(
            status=common_pb.Status(
                code=common_pb.StatusCode.STATUS_SUCCESS),
            block_id=data_block.id,
            data_path=data_block.data_path,
        )
コード例 #7
0
ファイル: leader_tm.py プロジェクト: flyfoxCI/fedlearner
    def _data_block_response(self, request):
        """Build the leader's reply to a worker's data-block request.

        The master status is checked first through ``self._check_status``;
        anything other than ``True`` coming back from that check is returned
        to the caller as-is. Otherwise a block is requested from
        ``self._alloc_data_block`` and the reply is filled in:

        * truthy block: STATUS_SUCCESS with path and id (as ``str``);
        * no block, ``self._online_training`` truthy: STATUS_NO_MORE_DATA;
        * no block otherwise: STATUS_DATA_FINISHED.

        When the reply ends up as STATUS_DATA_FINISHED, a RUNNING ->
        FINISHED transition is requested via ``self._transfer_status``.
        """
        def _gate(master_status):
            # Returns True only for RUNNING; any other state yields a
            # ready-made response explaining why nothing is allocated.
            verdict = tm_pb.DataBlockResponse()
            terminal = (tm_pb.MasterStatus.FINISHED,
                        tm_pb.MasterStatus.ERROR)
            if master_status in terminal:
                verdict.status.code = common_pb.STATUS_DATA_FINISHED
                verdict.status.error_message = 'datablock finished'
            elif master_status != tm_pb.MasterStatus.RUNNING:
                verdict.status.code = \
                    common_pb.STATUS_WAIT_FOR_SYNCING_CHECKPOINT
                verdict.status.error_message = \
                    "must sync data checkpoint before alloc"
            else:
                return True
            return verdict

        outcome = self._check_status(_gate)
        if outcome is not True:
            return outcome

        reply = tm_pb.DataBlockResponse()
        block = self._alloc_data_block(block_id=request.block_id)
        owner = self.__class__.__name__

        if block:
            logging.debug("%s allocated worker_%d with block id %s",
                          owner, request.worker_rank, block.block_id)
            reply.status.code = common_pb.STATUS_SUCCESS
            reply.status.error_message = 'success'
            info = reply.data_block_info
            info.data_path = str(block.data_block_fpath)
            info.meta_path = ''
            info.block_id = str(block.block_id)
        elif self._online_training:
            logging.debug("%s allocated worker_%d with empty data block. "
                          "wait for new data block since online traning",
                          owner, request.worker_rank)
            reply.status.code = common_pb.STATUS_NO_MORE_DATA
            reply.status.error_message = 'please wait for datablock ready'
        else:
            logging.debug("%s allocated worker_%d with empty data block. "
                          "exit running since since batch traning",
                          owner, request.worker_rank)
            reply.status.code = common_pb.STATUS_DATA_FINISHED
            reply.status.error_message = 'datablock finished'

        if reply.status.code == common_pb.STATUS_DATA_FINISHED:
            self._transfer_status(tm_pb.MasterStatus.RUNNING,
                                  tm_pb.MasterStatus.FINISHED)
        return reply
コード例 #8
0
ファイル: leader_tm.py プロジェクト: flyfoxCI/fedlearner
 def status_check_fn(status):
     """Decide whether a data block may be allocated in master state *status*.

     Returns ``True`` only when *status* is RUNNING. For FINISHED or ERROR it
     returns a ``DataBlockResponse`` with STATUS_DATA_FINISHED; for any other
     state, one with STATUS_WAIT_FOR_SYNCING_CHECKPOINT.
     """
     verdict = tm_pb.DataBlockResponse()
     terminal = (tm_pb.MasterStatus.FINISHED, tm_pb.MasterStatus.ERROR)
     if status in terminal:
         verdict.status.code = common_pb.STATUS_DATA_FINISHED
         verdict.status.error_message = 'datablock finished'
     elif status != tm_pb.MasterStatus.RUNNING:
         verdict.status.code = common_pb.STATUS_WAIT_FOR_SYNCING_CHECKPOINT
         verdict.status.error_message = \
             "must sync data checkpoint before alloc"
     else:
         # RUNNING is the only state that allows allocation.
         return True
     return verdict
コード例 #9
0
ファイル: trainer_master.py プロジェクト: eddyJ/fedlearner
 def _data_block_response(self, request):
     """Base-class reply builder for a data-block request.

     Asks ``self._alloc_data_block`` for the block named by
     ``request.block_id``. A truthy block gives STATUS_SUCCESS together with
     the block's ``data_path``, ``meta_path`` and ``block_id``; a falsy one
     gives STATUS_NO_MORE_DATA.
     """
     logging.debug(
         "In Base TrainerMaster::_data_block_response  block_id = %s",
         request.block_id)
     block = self._alloc_data_block(block_id=request.block_id)
     reply = tm_pb.DataBlockResponse()

     if not block:
         reply.status.code = common_pb.STATUS_NO_MORE_DATA
         reply.status.error_message = 'no more datablock to alloc'
         return reply

     reply.status.code = common_pb.STATUS_SUCCESS
     reply.status.error_message = 'success'
     info = reply.data_block_info
     info.data_path = block.data_path
     info.meta_path = block.meta_path
     info.block_id = block.block_id
     return reply
コード例 #10
0
 def _data_block_response(self, request):
     """Base-class reply builder for a data-block request.

     Asks ``self._alloc_data_block`` for the block named by
     ``request.block_id``. A truthy block gives STATUS_SUCCESS with its file
     path and id (both converted to ``str``) and an empty meta path. With no
     block, the reply is STATUS_NO_MORE_DATA when ``self._online_training``
     is truthy and STATUS_DATA_FINISHED otherwise.
     """
     logging.debug("In Base TrainerMaster::_data_block_response "
                   "block_id = %s", request.block_id)
     block = self._alloc_data_block(block_id=request.block_id)
     reply = tm_pb.DataBlockResponse()

     if block:
         code, message = common_pb.STATUS_SUCCESS, 'success'
         info = reply.data_block_info
         info.data_path = str(block.data_block_fpath)
         info.meta_path = ''
         info.block_id = str(block.block_id)
     elif self._online_training:
         code = common_pb.STATUS_NO_MORE_DATA
         message = 'please wait for datablock ready'
     else:
         code = common_pb.STATUS_DATA_FINISHED
         message = 'datablock finished'

     reply.status.code = code
     reply.status.error_message = message
     return reply
コード例 #11
0
    def request_data_block(self, block_id=None):
        """Allocate a data block according to this master's role.

        Args:
            block_id: must be ``None`` when ``self._role`` is ``'leader'``
                and truthy for any other role (both enforced by ``assert``).

        Returns:
            * A ``DataBlockResponse`` with
              ``STATUS_WAIT_FOR_SYNCING_CHECKPOINT`` when ``self._status`` is
              not RUNNING.
            * Leader: the first entry popped from ``self._block_queue`` (its
              ``block_id`` is recorded in ``self._allocated_data_blockids``),
              or ``None`` when the queue is empty.
            * Otherwise: ``self._block_map[block_id]``, or ``None`` when the
              id is not in the map.
        """
        if self._status != tm_pb.MasterStatus.RUNNING:
            response = tm_pb.DataBlockResponse()
            response.status.code = \
                common_pb.STATUS_WAIT_FOR_SYNCING_CHECKPOINT
            response.status.error_message = \
                "must sync data checkpoint before alloc"
            return response

        if self._role == 'leader':
            assert block_id is None, "Must not set block_id for leader"
            # The original `while` always returned on its first pass, so a
            # plain emptiness check states the intent directly.
            if not self._block_queue:
                return None
            ret = self._block_queue.pop(0)
            logging.debug('Fetch data block %s, ckpt is %s', ret,
                          ",".join(self._allocated_data_blockids))
            self._allocated_data_blockids.add(ret.block_id)
            logging.info('Fetch data block %s done', ret)
            return ret

        assert block_id, "Must set block_id for follower"
        if block_id not in self._block_map:
            return None
        return self._block_map[block_id]