async def _handle_join(
    self, conn: IncomingConnection, msg: JoinMessageModel
) -> None:
    """Answer a JOIN request received on an incoming connection.

    An ERROR message is sent back, and nothing is recorded, when the
    request's token differs from ``self._token``, when the request has no
    address or hostname, or when the local ceph.conf or admin keyring
    cannot be read or is empty.

    Otherwise a WELCOME message is sent carrying the public key reported by
    the orchestrator plus the contents of ceph.conf and the admin keyring,
    and the node is recorded in ``self._joining`` under the connection's
    address. If sending the WELCOME message raises, the error is logged and
    the node is not recorded.

    Args:
        conn: Connection the request arrived on; replies go through it.
        msg: The join request.
    """

    async def reject(what: str, code: int) -> None:
        # All refusals share the same ERROR envelope.
        await conn.send_msg(
            MessageModel(
                type=MessageTypeEnum.ERROR,
                data=ErrorMessageModel(what=what, code=code),
            )
        )

    logger.debug(f"handle join {msg}")
    assert self._state is not None

    if msg.token != self._token:
        logger.info(f"handle join > bad token from {conn}")
        await reject("bad token", status.HTTP_401_UNAUTHORIZED)
        return

    if not msg.address or not msg.hostname:
        logger.info(f"handle join > missing address or host from {conn}")
        await reject(
            "missing address or hostname", status.HTTP_400_BAD_REQUEST
        )
        return

    orch = Orchestrator(self.gstate.ceph_mgr)
    pubkey: str = orch.get_public_key()

    cephconf_path: Path = Path("/etc/ceph/ceph.conf")
    keyring_path: Path = Path("/etc/ceph/ceph.client.admin.keyring")

    # Explicit checks instead of asserts: asserts vanish under -O, and a
    # failed assert would leave the peer waiting for a reply that never
    # comes. Tell it the join cannot proceed instead.
    try:
        cephconf: str = cephconf_path.read_text("utf-8")
        keyring: str = keyring_path.read_text("utf-8")
    except (OSError, UnicodeDecodeError) as e:
        logger.error(
            f"handle join > unable to read ceph config or keyring: {e}"
        )
        await reject(
            "unable to read cluster configuration",
            status.HTTP_500_INTERNAL_SERVER_ERROR,
        )
        return

    if not cephconf or not keyring:
        logger.error("handle join > empty ceph config or keyring")
        await reject(
            "cluster configuration is empty",
            status.HTTP_500_INTERNAL_SERVER_ERROR,
        )
        return

    logger.debug(f"handle join > pubkey: {pubkey}")

    welcome = WelcomeMessageModel(
        pubkey=pubkey, cephconf=cephconf, keyring=keyring
    )
    try:
        # Never log the welcome payload itself: it holds the admin keyring.
        logger.debug(f"handle join > send welcome to {conn}")
        await conn.send_msg(
            MessageModel(type=MessageTypeEnum.WELCOME, data=welcome.dict())
        )
    except Exception as e:
        logger.error(f"handle join > error: {str(e)}")
        return

    logger.debug(f"handle join > welcome sent to {conn}")
    self._joining[conn.address] = JoiningNodeModel(
        address=msg.address, hostname=msg.hostname
    )
async def _handle_ready_to_add(
    self, conn: IncomingConnection, msg: ReadyToAddMessageModel
) -> None:
    """Add a node that announced READY_TO_ADD to the orchestrator.

    If the connection's address is not a key of ``self._joining``, an ERROR
    message ("node not joining") is sent back and nothing else happens.
    Otherwise the recorded hostname and address are handed to the
    orchestrator's ``host_add``, the replicated ruleset is set through the
    ceph mon, and ``self._set_pool_default_size()`` is awaited. Failures of
    ``host_add`` and ``set_replicated_ruleset`` are only logged; processing
    continues.

    Args:
        conn: Connection the message arrived on.
        msg: The ready-to-add message; its contents are not read here.
    """
    logger.debug(f"handle ready to add from {conn}")
    peer_address: str = conn.address

    if peer_address not in self._joining:
        logger.info(f"handle ready to add > unknown node {conn}")
        not_joining = ErrorMessageModel(
            what="node not joining",
            code=status.HTTP_428_PRECONDITION_REQUIRED,
        )
        await conn.send_msg(
            MessageModel(type=MessageTypeEnum.ERROR, data=not_joining)
        )
        return

    joining: JoiningNodeModel = self._joining[peer_address]
    logger.info(
        f"handle ready to add > hostname: {joining.hostname}, "
        f"address: {joining.address}"
    )

    orchestrator = Orchestrator(self.gstate.ceph_mgr)
    host_added = orchestrator.host_add(joining.hostname, joining.address)
    if not host_added:
        logger.error("handle ready > failed adding host to orch")

    # reset default crush ruleset, and adjust pools to use a multi-node
    # ruleset, spreading replicas across hosts rather than osds.
    ruleset_ok = self.gstate.ceph_mon.set_replicated_ruleset()
    if not ruleset_ok:
        logger.error("handle ready to add > unable to set replicated ruleset")

    await self._set_pool_default_size()
async def _handle_join(
    self, conn: IncomingConnection, msg: JoinMessageModel
) -> None:
    """Answer a JOIN request received on an incoming connection.

    Sends an ERROR message and stops when the request's token differs from
    ``self._token`` or when the request has no address or hostname.
    Otherwise sends a WELCOME message with the public key reported by the
    orchestrator and records the node in ``self._joining`` under the
    connection's address. If sending the WELCOME message raises, the error
    is logged and the node is not recorded.

    Args:
        conn: Connection the request arrived on; replies go through it.
        msg: The join request.
    """
    logger.debug(f"handle join {msg}")
    assert self._state is not None

    if msg.token != self._token:
        logger.info(f"handle join > bad token from {conn}")
        bad_token = ErrorMessageModel(
            what="bad token", code=status.HTTP_401_UNAUTHORIZED
        )
        await conn.send_msg(
            MessageModel(type=MessageTypeEnum.ERROR, data=bad_token)
        )
        return

    if not (msg.address and msg.hostname):
        logger.info(f"handle join > missing address or host from {conn}")
        incomplete = ErrorMessageModel(
            what="missing address or hostname",
            code=status.HTTP_400_BAD_REQUEST,
        )
        await conn.send_msg(
            MessageModel(type=MessageTypeEnum.ERROR, data=incomplete)
        )
        return

    public_key: str = Orchestrator().get_public_key()
    logger.debug(f"handle join > pubkey: {public_key}")

    greeting = WelcomeMessageModel(pubkey=public_key)
    try:
        logger.debug(f"handle join > send welcome: {greeting}")
        await conn.send_msg(
            MessageModel(type=MessageTypeEnum.WELCOME, data=greeting.dict())
        )
    except Exception as err:
        logger.error(f"handle join > error: {err!s}")
        return

    logger.debug(f"handle join > welcome sent: {greeting}")
    self._joining[conn.address] = JoiningNodeModel(
        address=msg.address, hostname=msg.hostname
    )
async def _handle_ready_to_add(
    self, conn: IncomingConnection, msg: ReadyToAddMessageModel
) -> None:
    """Add a node that announced READY_TO_ADD to the orchestrator.

    If the connection's address is not a key of ``self._joining``, an ERROR
    message ("node not joining") is sent back and nothing else happens.
    Otherwise the recorded hostname and address are handed to the
    orchestrator's ``host_add``; a failure there is only logged.

    Args:
        conn: Connection the message arrived on.
        msg: The ready-to-add message; its contents are not read here.
    """
    logger.debug(f"handle ready to add from {conn}")
    peer_address: str = conn.address

    if peer_address not in self._joining:
        logger.info(f"handle ready to add > unknown node {conn}")
        not_joining = ErrorMessageModel(
            what="node not joining",
            code=status.HTTP_428_PRECONDITION_REQUIRED,
        )
        await conn.send_msg(
            MessageModel(type=MessageTypeEnum.ERROR, data=not_joining)
        )
        return

    joining: JoiningNodeModel = self._joining[peer_address]
    logger.info(
        f"handle ready to add > hostname: {joining.hostname}, "
        f"address: {joining.address}"
    )

    host_added = Orchestrator().host_add(joining.hostname, joining.address)
    if not host_added:
        logger.error("handle ready > failed adding host to orch")
async def on_receive(self, websocket: WebSocket, data: Any) -> None:
    """Parse a payload received on a websocket and hand it to the ConnMgr.

    The payload is parsed with ``MessageModel.parse_raw`` and forwarded,
    together with this object, to the connection manager's
    ``on_incoming_receive``. The connection manager must already be started.

    Args:
        websocket: Websocket the payload arrived on (used for logging).
        data: Raw payload to parse.
    """
    logger.debug(f"incoming -- recv from {websocket.client}: {data}")

    manager: ConnMgr = get_conn_mgr()
    assert manager.is_started()

    parsed: MessageModel = MessageModel.parse_raw(data)
    await manager.on_incoming_receive(self, parsed)
async def receive(self) -> MessageModel:
    """Wait for the next payload on the websocket and parse it.

    Returns:
        The received payload parsed with ``MessageModel.parse_raw``.
    """
    assert self._ws
    return MessageModel.parse_raw(await self._ws.recv())
async def send(self, msg: MessageModel) -> None:
    """Serialize *msg* with its ``json()`` method and send it on the websocket.

    Args:
        msg: Message to transmit.
    """
    assert self._ws
    payload = msg.json()
    await self._ws.send(payload)
async def send_msg(self, data: MessageModel) -> None:
    """Serialize *data* with its ``json()`` method and send it as text.

    Args:
        data: Message to transmit over the websocket.
    """
    logger.debug(f"incoming -- send to {self._ws} data {data}")
    assert self._ws
    serialized = data.json()
    await self._ws.send_text(serialized)
async def join(self, leader_address: str, token: str) -> bool:
    """Join an existing cluster by handshaking with its leader.

    Connects to the leader's websocket endpoint, sends a JOIN message built
    from the local state and *token*, and waits for the reply. On a WELCOME
    reply the leader's public key is appended to root's authorized_keys, a
    READY_TO_ADD message is sent, and the local state is saved as
    READY / FOLLOWER before the token is stored and the node is started.

    The connection is closed on every path out of the handshake, including
    when an exception is raised part-way through.

    Args:
        leader_address: Host (and port) of the leader, used to build the
            ``ws://`` URI.
        token: Token presented to the leader in the JOIN message.

    Returns:
        True when the handshake completed; False when the leader replied
        with an ERROR message (the stage is reset to NONE in that case).

    Raises:
        NodeNotStartedError: The init stage is NONE.
        NodeCantJoinError: The init stage is past PRESTART.
        NodeBootstrappingError: The node stage is BOOTSTRAPPING.
        NodeHasBeenDeployedError: The node stage is BOOTSTRAPPED.
        NodeAlreadyJoiningError: The node stage is JOINING.
        NodeHasJoinedError: The node stage is READY.
    """
    logger.debug(f"join > with leader {leader_address}, token: {token}")

    if self._init_stage == NodeInitStage.NONE:
        raise NodeNotStartedError()
    elif self._init_stage > NodeInitStage.PRESTART:
        raise NodeCantJoinError()

    assert self._state
    assert self._state.hostname
    assert self._state.address

    if self._state.stage == NodeStageEnum.BOOTSTRAPPING:
        raise NodeBootstrappingError()
    elif self._state.stage == NodeStageEnum.BOOTSTRAPPED:
        raise NodeHasBeenDeployedError()
    elif self._state.stage == NodeStageEnum.JOINING:
        raise NodeAlreadyJoiningError()
    elif self._state.stage == NodeStageEnum.READY:
        raise NodeHasJoinedError()

    assert self._state.stage == NodeStageEnum.NONE
    assert self._state.role == NodeRoleEnum.NONE

    uri: str = f"ws://{leader_address}/api/nodes/ws"
    conn = await self._connmgr.connect(uri)
    logger.debug(f"join > conn: {conn}")

    # From here on the connection must be released whatever happens.
    try:
        joinmsg = JoinMessageModel(
            uuid=self._state.uuid,
            hostname=self._state.hostname,
            address=self._state.address,
            token=token,
        )
        msg = MessageModel(type=MessageTypeEnum.JOIN, data=joinmsg.dict())
        await conn.send(msg)

        self._state.stage = NodeStageEnum.JOINING
        self._save_state()

        reply: MessageModel = await conn.receive()
        logger.debug(f"join > recv: {reply}")
        if reply.type == MessageTypeEnum.ERROR:
            errmsg = ErrorMessageModel.parse_obj(reply.data)
            logger.error(f"join > error: {errmsg.what}")
            self._state.stage = NodeStageEnum.NONE
            self._save_state()
            return False

        assert reply.type == MessageTypeEnum.WELCOME
        welcome = WelcomeMessageModel.parse_obj(reply.data)
        assert welcome.pubkey

        authorized_keys: Path = Path("/root/.ssh/authorized_keys")
        if not authorized_keys.parent.exists():
            authorized_keys.parent.mkdir(0o700)

        # authorized_keys holds one key per line. Make sure the new entry
        # starts on its own line and is newline-terminated, so it is never
        # glued to a neighbouring key.
        needs_separator = False
        if authorized_keys.exists():
            current = authorized_keys.read_bytes()
            needs_separator = bool(current) and not current.endswith(b"\n")
        with authorized_keys.open("a") as fd:
            if needs_separator:
                fd.write("\n")
            fd.write(welcome.pubkey.rstrip() + "\n")
        logger.debug(f"join > wrote pubkey to {authorized_keys}")

        readymsg = ReadyToAddMessageModel()
        await conn.send(
            MessageModel(type=MessageTypeEnum.READY_TO_ADD, data=readymsg)
        )
    finally:
        await conn.close()

    self._state.stage = NodeStageEnum.READY
    self._state.role = NodeRoleEnum.FOLLOWER
    self._save_state()

    self._token = token
    self._save_token(should_exist=False)

    self._node_start()
    return True
async def join(
    self,
    leader_address: str,
    token: str,
    uuid: UUID,
    hostname: str,
    address: str,
    disks: DeploymentDisksConfig,
) -> bool:
    """Join an existing cluster by handshaking with its leader.

    Connects to the leader's websocket endpoint, sends a JOIN message and
    waits for the reply. On a WELCOME reply the system disk is created and
    enabled, the state is marked as joining, the hostname is set, the
    leader's public key is appended to root's authorized_keys, ceph.conf and
    the admin keyring are written, the NTP address is read from the kvstore
    and applied, and READY_TO_ADD is sent. The method then waits up to 30
    seconds for the orchestrator to report the host as added, assimilates
    the storage devices and marks the state ready.

    The connection is closed on every path out of the handshake, including
    when an exception is raised part-way through.

    Args:
        leader_address: Host (and port) of the leader, used to build the
            ``ws://`` URI.
        token: Token presented to the leader in the JOIN message.
        uuid: Identifier sent in the JOIN message.
        hostname: Hostname sent to the leader and applied locally.
        address: Address sent in the JOIN message.
        disks: Supplies ``disks.system`` for the system disk and
            ``disks.storage`` for device assimilation.

    Returns:
        True when the join completed; False when the leader replied with an
        ERROR message (the state is marked with CANT_JOIN in that case).

    Raises:
        NodeBootstrappingError: The state reports bootstrapping.
        NodeHasBeenDeployedError: The state reports deployed.
        NodeAlreadyJoiningError: The state reports joining.
        NodeHasJoinedError: The state reports ready.
        NodeCantJoinError: Creating the system disk failed, the host was
            not added within the timeout, or assimilating devices failed.
    """
    logger.debug(f"join > with leader {leader_address}, token: {token}")

    assert self._state
    assert hostname
    assert address

    if self._state.bootstrapping:
        raise NodeBootstrappingError()
    elif self._state.deployed:
        raise NodeHasBeenDeployedError()
    elif self._state.joining:
        raise NodeAlreadyJoiningError()
    elif self._state.ready:
        raise NodeHasJoinedError()
    assert self._state.nostage

    uri: str = f"ws://{leader_address}/api/nodes/ws"
    conn = await self._connmgr.connect(uri)
    logger.debug(f"join > conn: {conn}")

    # From here on the connection must be released whatever happens.
    try:
        joinmsg = JoinMessageModel(
            uuid=uuid, hostname=hostname, address=address, token=token
        )
        msg = MessageModel(type=MessageTypeEnum.JOIN, data=joinmsg.dict())
        await conn.send(msg)

        reply: MessageModel = await conn.receive()
        logger.debug(f"join > recv: {reply}")
        if reply.type == MessageTypeEnum.ERROR:
            errmsg = ErrorMessageModel.parse_obj(reply.data)
            logger.error(f"join > error: {errmsg.what}")
            self._state.mark_error(
                code=DeploymentErrorEnum.CANT_JOIN, msg=errmsg.what
            )
            return False

        assert reply.type == MessageTypeEnum.WELCOME
        welcome = WelcomeMessageModel.parse_obj(reply.data)
        assert welcome.pubkey
        assert welcome.cephconf
        assert welcome.keyring

        # create system disk after we are certain we are joining.
        # ensure all state writes happen only after the disk has been
        # created.
        systemdisk = SystemDisk(self._gstate)
        try:
            await systemdisk.create(disks.system)
            await systemdisk.enable()
        except GravelError as e:
            raise NodeCantJoinError(e.message) from e

        self._state.mark_join()

        await self._set_hostname(hostname)

        authorized_keys: Path = Path("/root/.ssh/authorized_keys")
        if not authorized_keys.parent.exists():
            authorized_keys.parent.mkdir(0o700)

        # authorized_keys holds one key per line. Make sure the new entry
        # starts on its own line and is newline-terminated, so it is never
        # glued to a neighbouring key.
        needs_separator = False
        if authorized_keys.exists():
            current = authorized_keys.read_bytes()
            needs_separator = bool(current) and not current.endswith(b"\n")
        with authorized_keys.open("a") as fd:
            if needs_separator:
                fd.write("\n")
            fd.write(welcome.pubkey.rstrip() + "\n")
        logger.debug(f"join > wrote pubkey to {authorized_keys}")

        cephconf_path: Path = Path("/etc/ceph/ceph.conf")
        keyring_path: Path = Path("/etc/ceph/ceph.client.admin.keyring")
        if not cephconf_path.parent.exists():
            cephconf_path.parent.mkdir(0o755)
        cephconf_path.write_text(welcome.cephconf)
        keyring_path.write_text(welcome.keyring)
        keyring_path.chmod(0o600)
        cephconf_path.chmod(0o644)

        # We've got ceph.conf and the admin keyring now, kick the kvstore
        # to get a connection.
        await self._gstate.store.ensure_connection()

        # get NTP address
        ntp_addr = await self._gstate.store.get("/nodes/ntp_addr")
        assert ntp_addr
        await self._set_ntp_addr(ntp_addr)

        readymsg = ReadyToAddMessageModel()
        await conn.send(
            MessageModel(type=MessageTypeEnum.READY_TO_ADD, data=readymsg)
        )
    finally:
        await conn.close()

    logger.debug("join > wait for host to be added")
    orch = Orchestrator(self._gstate.ceph_mgr)
    try:
        await asyncio.wait_for(orch.wait_host_added(hostname), 30.0)
    except (asyncio.TimeoutError, TimeoutError) as e:
        # Before Python 3.11 wait_for raises asyncio.TimeoutError, which is
        # not the builtin TimeoutError; catch both spellings.
        logger.error("join > timeout waiting for host to be added")
        raise NodeCantJoinError("host was not added to the cluster") from e
    logger.debug("join > host added, continue")

    try:
        await self._assimilate_devices(hostname, disks.storage)
    except DeploymentError as e:
        raise NodeCantJoinError(e.message) from e

    self._state.mark_ready()
    return True
async def join(
    self,
    leader_address: str,
    token: str,
    uuid: UUID,
    hostname: str,
    address: str,
    disks: DeploymentDisksConfig,
) -> bool:
    """Join an existing cluster by handshaking with its leader.

    Connects to the leader's websocket endpoint, sends a JOIN message and
    waits for the reply. On a WELCOME reply the state is marked as joining,
    ``self._prepare_node`` is awaited with the system disk, hostname and the
    public key, keyring and ceph.conf from the reply, and READY_TO_ADD is
    sent. The method then waits up to 30 seconds for the orchestrator to
    report the host as added, assimilates the storage devices and marks the
    state ready.

    The connection is closed on every path out of the handshake, including
    when an exception is raised part-way through.

    Args:
        leader_address: Host (and port) of the leader, used to build the
            ``ws://`` URI.
        token: Token presented to the leader in the JOIN message.
        uuid: Identifier sent in the JOIN message.
        hostname: Hostname sent to the leader and passed to node
            preparation.
        address: Address sent in the JOIN message.
        disks: Supplies ``disks.system`` for node preparation and
            ``disks.storage`` for device assimilation.

    Returns:
        True when the join completed; False when the leader replied with an
        ERROR message (the state is marked with CANT_JOIN in that case).

    Raises:
        NodeBootstrappingError: The state reports bootstrapping.
        NodeHasBeenDeployedError: The state reports deployed.
        NodeAlreadyJoiningError: The state reports joining.
        NodeHasJoinedError: The state reports ready.
        NodeCantJoinError: The host was not added within the timeout, or
            assimilating devices failed.
    """
    logger.debug(f"join > with leader {leader_address}, token: {token}")

    assert self._state
    assert hostname
    assert address

    if self._state.bootstrapping:
        raise NodeBootstrappingError()
    elif self._state.deployed:
        raise NodeHasBeenDeployedError()
    elif self._state.joining:
        raise NodeAlreadyJoiningError()
    elif self._state.ready:
        raise NodeHasJoinedError()
    assert self._state.nostage

    uri: str = f"ws://{leader_address}/api/nodes/ws"
    conn = await self._connmgr.connect(uri)
    logger.debug(f"join > conn: {conn}")

    # From here on the connection must be released whatever happens.
    try:
        joinmsg = JoinMessageModel(
            uuid=uuid, hostname=hostname, address=address, token=token
        )
        msg = MessageModel(type=MessageTypeEnum.JOIN, data=joinmsg.dict())
        await conn.send(msg)

        reply: MessageModel = await conn.receive()
        logger.debug(f"join > recv: {reply}")
        if reply.type == MessageTypeEnum.ERROR:
            errmsg = ErrorMessageModel.parse_obj(reply.data)
            logger.error(f"join > error: {errmsg.what}")
            self._state.mark_error(
                code=DeploymentErrorEnum.CANT_JOIN, msg=errmsg.what
            )
            return False

        assert reply.type == MessageTypeEnum.WELCOME
        welcome = WelcomeMessageModel.parse_obj(reply.data)
        assert welcome.pubkey
        assert welcome.cephconf
        assert welcome.keyring

        self._state.mark_join()

        await self._prepare_node(
            disks.system,
            hostname,
            ntpaddr=None,
            pubkey=welcome.pubkey,
            keyring=welcome.keyring,
            cephconf=welcome.cephconf,
            containerconf=None,
            is_join=True,
            progress_cb=None,
        )

        readymsg = ReadyToAddMessageModel()
        await conn.send(
            MessageModel(type=MessageTypeEnum.READY_TO_ADD, data=readymsg)
        )
    finally:
        await conn.close()

    logger.debug("join > wait for host to be added")
    orch = Orchestrator(self._gstate.ceph_mgr)
    try:
        await asyncio.wait_for(orch.wait_host_added(hostname), 30.0)
    except (asyncio.TimeoutError, TimeoutError) as e:
        # Before Python 3.11 wait_for raises asyncio.TimeoutError, which is
        # not the builtin TimeoutError; catch both spellings.
        logger.error("join > timeout waiting for host to be added")
        raise NodeCantJoinError("Host was not added to the cluster.") from e
    logger.debug("join > host added, continue")

    try:
        await self._assimilate_devices(hostname, disks.storage)
    except DeploymentError as e:
        raise NodeCantJoinError(e.message) from e

    self._state.mark_ready()
    return True