def RequestDataBlock(self, request, context):
    """Validate the calling worker, then delegate to ``_request_data_block``.

    A request is rejected with ``STATUS_INVALID_REQUEST`` when its
    ``worker_rank`` is not in ``self._running_workers`` or is already in
    ``self._completed_workers``.

    Args:
        request: message exposing ``worker_rank``; it is forwarded unchanged
            to ``_request_data_block`` when accepted.
        context: unused by this method.

    Returns:
        A ``tm_pb.DataBlockResponse``: either the rejection or whatever
        ``_request_data_block`` returns.
    """
    rank = request.worker_rank

    # Decide on a rejection reason first; the membership in
    # ``_completed_workers`` is only consulted for registered workers.
    rejection = None
    if rank not in self._running_workers:
        rejection = "unregistered worker"
    elif rank in self._completed_workers:
        rejection = "worker has completed"

    if rejection is None:
        return self._request_data_block(request)

    invalid = common_pb.Status(
        code=common_pb.StatusCode.STATUS_INVALID_REQUEST,
        error_message=rejection)
    return tm_pb.DataBlockResponse(status=invalid)
def _data_block_response(self, request):
    """Allocate a data block for the requesting worker and build the reply.

    Args:
        request: message exposing ``block_id`` (passed to
            ``_alloc_data_block``) and ``worker_rank`` (used for logging).

    Returns:
        A ``tm_pb.DataBlockResponse`` whose status is
        ``STATUS_SUCCESS`` when a block was allocated,
        ``STATUS_NO_MORE_DATA`` when nothing was allocated and
        ``self._online_training`` is truthy, and ``STATUS_DATA_FINISHED``
        otherwise.
    """
    response = tm_pb.DataBlockResponse()
    data_block = self._alloc_data_block(block_id=request.block_id)
    owner = self.__class__.__name__
    rank = request.worker_rank

    if not data_block:
        # Nothing to hand out: online jobs are told to retry later, batch
        # jobs are told the data is exhausted.
        if self._online_training:
            logging.info("%s allocated worker_%d with empty data block. "
                         "wait for new data block since online traning",
                         owner, rank)
            code = common_pb.STATUS_NO_MORE_DATA
            message = 'please wait for datablock ready'
        else:
            logging.info("%s allocated worker_%d with empty data block. "
                         "exit running since since batch traning",
                         owner, rank)
            code = common_pb.STATUS_DATA_FINISHED
            message = 'datablock finished'
        response.status.code = code
        response.status.error_message = message
        return response

    logging.info("%s allocated worker_%d with block id %s",
                 owner, rank, data_block.block_id)
    response.status.code = common_pb.STATUS_SUCCESS
    response.status.error_message = 'success'
    response.data_block_info.data_path = str(data_block.data_block_fpath)
    response.data_block_info.meta_path = ''
    response.data_block_info.block_id = str(data_block.block_id)
    return response
def RequestDataBlock(self, request, context):
    """Serve a data-block request through ``self._receiver_fn``.

    Any exception raised by ``self._receiver_fn`` is converted into a
    response carrying ``STATUS_UNKNOWN_ERROR`` and the formatted traceback,
    so the failure is reported in the reply instead of propagating.

    Args:
        request: the incoming request, forwarded unchanged to
            ``self._receiver_fn``.
        context: unused by this method.

    Returns:
        The value returned by ``self._receiver_fn`` on success, otherwise a
        ``tm_pb.DataBlockResponse`` describing the error.
    """
    # Local import so this block does not depend on a module-level import.
    import traceback

    response = tm_pb.DataBlockResponse()
    try:
        response = self._receiver_fn(request)
    except Exception:  # pylint: disable=broad-except
        response.status.code = common_pb.STATUS_UNKNOWN_ERROR
        # error_message is assigned strings everywhere else; the previous
        # code stored the raw ``sys.exc_info()`` tuple here, which is not a
        # string. Store the formatted traceback text instead.
        response.status.error_message = traceback.format_exc()
    return response
def _request_data_block(self, request):
    """Look up the data block named by ``request.block_id`` and reply.

    Args:
        request: message exposing ``block_id`` and ``worker_rank``.

    Returns:
        A ``tm_pb.DataBlockResponse`` with ``STATUS_SUCCESS`` plus the
        block's ``id`` and ``data_path`` when
        ``self._data_visitor.get_datablock_by_id`` returns a truthy block,
        otherwise one with ``STATUS_INVALID_DATA_BLOCK``.
    """
    block = self._data_visitor.get_datablock_by_id(request.block_id)

    if not block:
        fl_logging.error("invalid data block id: %s", request.block_id)
        failure = common_pb.Status(
            code=common_pb.StatusCode.STATUS_INVALID_DATA_BLOCK,
            error_message="invalid data block")
        return tm_pb.DataBlockResponse(status=failure)

    fl_logging.info("allocated worker_%d with block: %s",
                    request.worker_rank, block.id)
    success = common_pb.Status(code=common_pb.StatusCode.STATUS_SUCCESS)
    return tm_pb.DataBlockResponse(
        status=success,
        block_id=block.id,
        data_path=block.data_path,
    )
def RequestDataBlock(self, request, context):
    """Serve a data-block request through ``self._receiver_fn``.

    Exceptions raised by ``self._receiver_fn`` do not propagate; they are
    reported as a ``STATUS_UNKNOWN_ERROR`` response whose error message is
    the formatted traceback.

    Args:
        request: the incoming request, forwarded unchanged to
            ``self._receiver_fn``.
        context: unused by this method.

    Returns:
        The value returned by ``self._receiver_fn`` on success, otherwise a
        ``tm_pb.DataBlockResponse`` describing the error.
    """
    try:
        return self._receiver_fn(request)
    except Exception:  # pylint: disable=broad-except
        exc_type, exc_value, exc_tb = sys.exc_info()
        trace_lines = traceback.format_exception(exc_type, exc_value, exc_tb)

        failure = tm_pb.DataBlockResponse()
        failure.status.code = common_pb.STATUS_UNKNOWN_ERROR
        failure.status.error_message = ''.join(trace_lines)
        return failure
def _request_data_block(self, request):
    """Hand the next data block from ``self._data_visitor`` to a worker.

    Args:
        request: message exposing ``worker_rank`` (used for logging only).

    Returns:
        A ``tm_pb.DataBlockResponse`` with ``STATUS_SUCCESS`` plus the
        block's ``id`` and ``data_path`` when the visitor yields a truthy
        block, otherwise one with ``STATUS_DATA_FINISHED``.
    """
    # next() with a default replaces the explicit StopIteration handler; an
    # exhausted visitor is treated the same as a falsy block.
    data_block = next(self._data_visitor, None)

    # The previous code also allocated a placeholder response here that both
    # branches immediately overwrote; that dead allocation is removed.
    if not data_block:
        return tm_pb.DataBlockResponse(status=common_pb.Status(
            code=common_pb.StatusCode.STATUS_DATA_FINISHED,
            error_message="data block finished"))

    fl_logging.info("allocated worker_%d with block: %s",
                    request.worker_rank, data_block.id)
    return tm_pb.DataBlockResponse(
        status=common_pb.Status(
            code=common_pb.StatusCode.STATUS_SUCCESS),
        block_id=data_block.id,
        data_path=data_block.data_path,
    )
def _data_block_response(self, request):
    """Allocate a data block for a worker, gated on the master's status.

    ``self._check_status`` is given a callback that receives a status value
    and returns ``True`` only for ``MasterStatus.RUNNING``; any other result
    of ``_check_status`` is returned to the caller as-is. When allocation
    ends with ``STATUS_DATA_FINISHED`` the master is asked to transition
    from ``RUNNING`` to ``FINISHED``.

    Args:
        request: message exposing ``block_id`` (passed to
            ``_alloc_data_block``) and ``worker_rank`` (used for logging).

    Returns:
        A ``tm_pb.DataBlockResponse`` (or whatever non-``True`` value
        ``_check_status`` produced).
    """

    def status_check_fn(status):
        # Map a non-running master status to a ready-made rejection; return
        # True only when the status is RUNNING.
        master = tm_pb.MasterStatus
        if status in (master.FINISHED, master.ERROR):
            code = common_pb.STATUS_DATA_FINISHED
            message = 'datablock finished'
        elif status != master.RUNNING:
            code = common_pb.STATUS_WAIT_FOR_SYNCING_CHECKPOINT
            message = "must sync data checkpoint before alloc"
        else:
            return True
        rejection = tm_pb.DataBlockResponse()
        rejection.status.code = code
        rejection.status.error_message = message
        return rejection

    verdict = self._check_status(status_check_fn)
    if verdict is not True:
        return verdict

    response = tm_pb.DataBlockResponse()
    data_block = self._alloc_data_block(block_id=request.block_id)
    owner = self.__class__.__name__
    rank = request.worker_rank

    if data_block:
        logging.debug("%s allocated worker_%d with block id %s",
                      owner, rank, data_block.block_id)
        response.status.code = common_pb.STATUS_SUCCESS
        response.status.error_message = 'success'
        block_info = response.data_block_info
        block_info.data_path = str(data_block.data_block_fpath)
        block_info.meta_path = ''
        block_info.block_id = str(data_block.block_id)
    elif self._online_training:
        logging.debug("%s allocated worker_%d with empty data block. "
                      "wait for new data block since online traning",
                      owner, rank)
        response.status.code = common_pb.STATUS_NO_MORE_DATA
        response.status.error_message = 'please wait for datablock ready'
    else:
        logging.debug("%s allocated worker_%d with empty data block. "
                      "exit running since since batch traning",
                      owner, rank)
        response.status.code = common_pb.STATUS_DATA_FINISHED
        response.status.error_message = 'datablock finished'

    finished = response.status.code == common_pb.STATUS_DATA_FINISHED
    if finished:
        self._transfer_status(tm_pb.MasterStatus.RUNNING,
                              tm_pb.MasterStatus.FINISHED)
    return response
def status_check_fn(status):
    """Translate a master status into ``True`` or a rejection response.

    Args:
        status: a value compared against ``tm_pb.MasterStatus`` members.

    Returns:
        ``True`` when ``status`` is ``RUNNING``. Otherwise a
        ``tm_pb.DataBlockResponse`` with ``STATUS_DATA_FINISHED`` for
        ``FINISHED``/``ERROR``, or ``STATUS_WAIT_FOR_SYNCING_CHECKPOINT``
        for any other status.
    """
    master = tm_pb.MasterStatus
    if status in (master.FINISHED, master.ERROR):
        code = common_pb.STATUS_DATA_FINISHED
        message = 'datablock finished'
    elif status != master.RUNNING:
        code = common_pb.STATUS_WAIT_FOR_SYNCING_CHECKPOINT
        message = "must sync data checkpoint before alloc"
    else:
        # Only reached when status is RUNNING.
        return True

    rejection = tm_pb.DataBlockResponse()
    rejection.status.code = code
    rejection.status.error_message = message
    return rejection
def _data_block_response(self, request):
    """Allocate the block named by ``request.block_id`` and build the reply.

    Args:
        request: message exposing ``block_id``.

    Returns:
        A ``tm_pb.DataBlockResponse`` with ``STATUS_SUCCESS`` and the
        block's ``data_path``, ``meta_path`` and ``block_id`` when
        ``_alloc_data_block`` returns a truthy block, otherwise one with
        ``STATUS_NO_MORE_DATA``.
    """
    logging.debug(
        "In Base TrainerMaster::_data_block_response block_id = %s",
        request.block_id)
    allocated = self._alloc_data_block(block_id=request.block_id)
    response = tm_pb.DataBlockResponse()

    if not allocated:
        response.status.code = common_pb.STATUS_NO_MORE_DATA
        response.status.error_message = 'no more datablock to alloc'
        return response

    response.status.code = common_pb.STATUS_SUCCESS
    response.status.error_message = 'success'
    block_info = response.data_block_info
    block_info.data_path = allocated.data_path
    block_info.meta_path = allocated.meta_path
    block_info.block_id = allocated.block_id
    return response
def _data_block_response(self, request):
    """Allocate the block named by ``request.block_id`` and build the reply.

    Args:
        request: message exposing ``block_id``.

    Returns:
        A ``tm_pb.DataBlockResponse`` whose status is ``STATUS_SUCCESS``
        when a block was allocated, ``STATUS_NO_MORE_DATA`` when nothing was
        allocated and ``self._online_training`` is truthy, and
        ``STATUS_DATA_FINISHED`` otherwise.
    """
    logging.debug("In Base TrainerMaster::_data_block_response "
                  "block_id = %s", request.block_id)
    allocated = self._alloc_data_block(block_id=request.block_id)
    response = tm_pb.DataBlockResponse()

    if allocated:
        response.status.code = common_pb.STATUS_SUCCESS
        response.status.error_message = 'success'
        block_info = response.data_block_info
        block_info.data_path = str(allocated.data_block_fpath)
        block_info.meta_path = ''
        block_info.block_id = str(allocated.block_id)
        return response

    # No block available: online jobs wait for more data, batch jobs finish.
    if self._online_training:
        code = common_pb.STATUS_NO_MORE_DATA
        message = 'please wait for datablock ready'
    else:
        code = common_pb.STATUS_DATA_FINISHED
        message = 'datablock finished'
    response.status.code = code
    response.status.error_message = message
    return response
def request_data_block(self, block_id=None):
    """Fetch a data block, by queue order (leader) or by id (follower).

    Args:
        block_id: must be ``None`` when ``self._role`` is ``'leader'`` and
            truthy otherwise; both conditions are enforced with ``assert``.

    Returns:
        - a ``tm_pb.DataBlockResponse`` with
          ``STATUS_WAIT_FOR_SYNCING_CHECKPOINT`` when ``self._status`` is
          not ``MasterStatus.RUNNING``;
        - for the leader, the first entry popped from ``self._block_queue``
          (its ``block_id`` is recorded in
          ``self._allocated_data_blockids``), or ``None`` when the queue is
          empty;
        - for a follower, ``self._block_map[block_id]`` or ``None`` when the
          id is not in the map.
    """
    if self._status != tm_pb.MasterStatus.RUNNING:
        not_ready = tm_pb.DataBlockResponse()
        not_ready.status.code = common_pb.STATUS_WAIT_FOR_SYNCING_CHECKPOINT
        not_ready.status.error_message = \
            "must sync data checkpoint before alloc"
        return not_ready

    if self._role != 'leader':
        assert block_id, "Must set block_id for follower"
        found = block_id in self._block_map
        return self._block_map[block_id] if found else None

    assert block_id is None, "Must not set block_id for leader"
    if not self._block_queue:
        return None

    # Take the block at the head of the queue and remember it as allocated.
    block = self._block_queue.pop(0)
    logging.debug('Fetch data block %s, ckpt is %s', block,
                  ",".join(self._allocated_data_blockids))
    self._allocated_data_blockids.add(block.block_id)
    logging.info('Fetch data block %s done', block)
    return block