Exemple #1
0
    async def _handle_join(self, conn: IncomingConnection,
                           msg: JoinMessageModel) -> None:
        """Handle a JOIN request received on an incoming connection.

        The request is answered with an ERROR message when its token does
        not match ours (401), or when its address or hostname is missing
        (400). Otherwise a WELCOME message is sent back carrying the public
        key obtained from the orchestrator plus the contents of ceph.conf
        and of the admin keyring, and the node is recorded in
        ``self._joining`` under the connection's address.

        Secrets (the join token and the admin keyring) are deliberately kept
        out of the log output.

        Args:
            conn: Connection the request arrived on; replies are sent on it.
            msg: The join request; its token, address and hostname are read.
        """
        # Log only identifying fields: `msg` also carries the join token.
        logger.debug(f"handle join > hostname: {msg.hostname}, "
                     f"address: {msg.address}")
        assert self._state is not None

        if msg.token != self._token:
            logger.info(f"handle join > bad token from {conn}")
            await conn.send_msg(
                MessageModel(
                    type=MessageTypeEnum.ERROR,
                    data=ErrorMessageModel(what="bad token",
                                           code=status.HTTP_401_UNAUTHORIZED),
                ))
            return

        if not msg.address or not msg.hostname:
            logger.info(f"handle join > missing address or host from {conn}")
            await conn.send_msg(
                MessageModel(
                    type=MessageTypeEnum.ERROR,
                    data=ErrorMessageModel(
                        what="missing address or hostname",
                        code=status.HTTP_400_BAD_REQUEST,
                    ),
                ))
            return

        orch = Orchestrator(self.gstate.ceph_mgr)
        pubkey: str = orch.get_public_key()

        cephconf_path: Path = Path("/etc/ceph/ceph.conf")
        keyring_path: Path = Path("/etc/ceph/ceph.client.admin.keyring")
        assert cephconf_path.exists()
        assert keyring_path.exists()

        cephconf: str = cephconf_path.read_text("utf-8")
        keyring: str = keyring_path.read_text("utf-8")
        assert len(cephconf) > 0
        assert len(keyring) > 0

        logger.debug(f"handle join > pubkey: {pubkey}")

        welcome = WelcomeMessageModel(pubkey=pubkey,
                                      cephconf=cephconf,
                                      keyring=keyring)
        try:
            # Do not log `welcome` itself: it contains the admin keyring.
            logger.debug(f"handle join > send welcome to {conn}")
            await conn.send_msg(
                MessageModel(type=MessageTypeEnum.WELCOME,
                             data=welcome.dict()))
        except Exception as e:
            # Best effort: a node we could not welcome is simply not
            # registered as joining.
            logger.error(f"handle join > error: {str(e)}")
            return

        logger.debug(f"handle join > welcome sent to {conn}")
        self._joining[conn.address] = JoiningNodeModel(address=msg.address,
                                                       hostname=msg.hostname)
Exemple #2
0
    async def _handle_ready_to_add(self, conn: IncomingConnection,
                                   msg: ReadyToAddMessageModel) -> None:
        """Handle a READY_TO_ADD message from a node that previously joined.

        A connection whose address is not in ``self._joining`` gets an ERROR
        reply (428). Otherwise the node is added to the orchestrator, the
        replicated ruleset is set through the monitor and the pool default
        size is adjusted. Failures of the first two steps are only logged.

        Args:
            conn: Connection the message arrived on.
            msg: The ready-to-add message; its contents are not read here.
        """
        logger.debug(f"handle ready to add from {conn}")
        peer: str = conn.address

        if peer not in self._joining:
            logger.info(f"handle ready to add > unknown node {conn}")
            rejection = ErrorMessageModel(
                what="node not joining",
                code=status.HTTP_428_PRECONDITION_REQUIRED,
            )
            await conn.send_msg(
                MessageModel(type=MessageTypeEnum.ERROR, data=rejection))
            return

        node: JoiningNodeModel = self._joining[peer]
        logger.info("handle ready to add > "
                    f"hostname: {node.hostname}, address: {node.address}")

        host_added = Orchestrator(self.gstate.ceph_mgr).host_add(
            node.hostname, node.address)
        if not host_added:
            logger.error("handle ready > failed adding host to orch")

        # Switch away from the default crush ruleset so that pools use a
        # multi-node ruleset: replicas get spread across hosts, not osds.
        ruleset_changed = self.gstate.ceph_mon.set_replicated_ruleset()
        if not ruleset_changed:
            logger.error(
                "handle ready to add > unable to set replicated ruleset")

        await self._set_pool_default_size()
Exemple #3
0
    async def _handle_join(self, conn: IncomingConnection,
                           msg: JoinMessageModel) -> None:
        """Answer a JOIN request with either an ERROR or a WELCOME message.

        An ERROR is sent when the token differs from ours (401) or when the
        address or hostname is missing (400). Otherwise the public key
        obtained from the orchestrator is sent in a WELCOME message and, if
        sending succeeded, the node is recorded in ``self._joining`` under
        the connection's address.

        Args:
            conn: Connection the request arrived on; replies are sent on it.
            msg: The join request; its token, address and hostname are read.
        """
        logger.debug(f"handle join {msg}")
        assert self._state is not None

        if msg.token != self._token:
            logger.info(f"handle join > bad token from {conn}")
            bad_token = ErrorMessageModel(what="bad token",
                                          code=status.HTTP_401_UNAUTHORIZED)
            await conn.send_msg(
                MessageModel(type=MessageTypeEnum.ERROR, data=bad_token))
            return

        if not (msg.address and msg.hostname):
            logger.info(f"handle join > missing address or host from {conn}")
            incomplete = ErrorMessageModel(what="missing address or hostname",
                                           code=status.HTTP_400_BAD_REQUEST)
            await conn.send_msg(
                MessageModel(type=MessageTypeEnum.ERROR, data=incomplete))
            return

        pubkey: str = Orchestrator().get_public_key()
        logger.debug(f"handle join > pubkey: {pubkey}")

        welcome = WelcomeMessageModel(pubkey=pubkey)
        try:
            logger.debug(f"handle join > send welcome: {welcome}")
            reply = MessageModel(type=MessageTypeEnum.WELCOME,
                                 data=welcome.dict())
            await conn.send_msg(reply)
        except Exception as e:
            # A node we could not welcome is not registered as joining.
            logger.error(f"handle join > error: {str(e)}")
        else:
            logger.debug(f"handle join > welcome sent: {welcome}")
            self._joining[conn.address] = JoiningNodeModel(
                address=msg.address, hostname=msg.hostname)
Exemple #4
0
    async def _handle_ready_to_add(self, conn: IncomingConnection,
                                   msg: ReadyToAddMessageModel) -> None:
        """Add a previously joined node to the orchestrator.

        A connection whose address is not in ``self._joining`` gets an ERROR
        reply (428). Otherwise the node's hostname and address are handed to
        the orchestrator; a failure there is only logged.

        Args:
            conn: Connection the message arrived on.
            msg: The ready-to-add message; its contents are not read here.
        """
        logger.debug(f"handle ready to add from {conn}")
        peer: str = conn.address

        if peer not in self._joining:
            logger.info(f"handle ready to add > unknown node {conn}")
            rejection = ErrorMessageModel(
                what="node not joining",
                code=status.HTTP_428_PRECONDITION_REQUIRED)
            await conn.send_msg(
                MessageModel(type=MessageTypeEnum.ERROR, data=rejection))
            return

        node: JoiningNodeModel = self._joining[peer]
        logger.info("handle ready to add > "
                    f"hostname: {node.hostname}, address: {node.address}")

        host_added = Orchestrator().host_add(node.hostname, node.address)
        if not host_added:
            logger.error("handle ready > failed adding host to orch")
Exemple #5
0
 async def on_receive(self, websocket: WebSocket, data: Any) -> None:
     """Parse an incoming websocket payload and pass it to the conn manager.

     Args:
         websocket: Socket the payload was received on (used for logging).
         data: Raw payload, parsed into a ``MessageModel``.
     """
     logger.debug(f"incoming -- recv from {websocket.client}: {data}")

     manager: ConnMgr = get_conn_mgr()
     # Messages are only expected once the connection manager is running.
     assert manager.is_started()

     parsed: MessageModel = MessageModel.parse_raw(data)
     await manager.on_incoming_receive(self, parsed)
Exemple #6
0
 async def receive(self) -> MessageModel:
     """Wait for the next payload on the websocket and parse it.

     Returns:
         The received payload parsed into a ``MessageModel``.
     """
     assert self._ws
     payload = await self._ws.recv()
     message = MessageModel.parse_raw(payload)
     return message
Exemple #7
0
 async def send(self, msg: MessageModel) -> None:
     """Serialize ``msg`` to JSON and send it over the websocket.

     Args:
         msg: Message to send.
     """
     assert self._ws
     socket = self._ws
     await socket.send(msg.json())
Exemple #8
0
 async def send_msg(self, data: MessageModel) -> None:
     """Serialize ``data`` to JSON and send it as text over the websocket.

     Args:
         data: Message to send.
     """
     logger.debug(f"incoming -- send to {self._ws} data {data}")
     assert self._ws
     socket = self._ws
     await socket.send_text(data.json())
Exemple #9
0
    async def join(self, leader_address: str, token: str) -> bool:
        """Join an existing cluster through its leader.

        Connects to the leader's websocket endpoint, sends a JOIN message
        carrying this node's uuid, hostname and address plus the token, and
        waits for the reply. On WELCOME the received public key is appended
        to root's authorized_keys, READY_TO_ADD is sent, the node is marked
        READY with the FOLLOWER role, the token is saved and the node is
        started.

        Args:
            leader_address: Address of the leader; used as the host part of
                the websocket URI.
            token: Token presented to the leader in the JOIN message.

        Returns:
            True once the node has joined; False if the leader replied with
            an ERROR message (the stage is then reset to NONE).

        Raises:
            NodeNotStartedError: The init stage is NONE.
            NodeCantJoinError: The init stage is past PRESTART.
            NodeBootstrappingError: The node is bootstrapping.
            NodeHasBeenDeployedError: The node has been bootstrapped.
            NodeAlreadyJoiningError: The node is already joining.
            NodeHasJoinedError: The node is already in the READY stage.
        """
        # The token is a shared secret: keep it out of the logs.
        logger.debug(f"join > with leader {leader_address}")

        if self._init_stage == NodeInitStage.NONE:
            raise NodeNotStartedError()
        elif self._init_stage > NodeInitStage.PRESTART:
            raise NodeCantJoinError()

        assert self._state
        assert self._state.hostname
        assert self._state.address

        if self._state.stage == NodeStageEnum.BOOTSTRAPPING:
            raise NodeBootstrappingError()
        elif self._state.stage == NodeStageEnum.BOOTSTRAPPED:
            raise NodeHasBeenDeployedError()
        elif self._state.stage == NodeStageEnum.JOINING:
            raise NodeAlreadyJoiningError()
        elif self._state.stage == NodeStageEnum.READY:
            raise NodeHasJoinedError()
        assert self._state.stage == NodeStageEnum.NONE
        assert self._state.role == NodeRoleEnum.NONE

        uri: str = f"ws://{leader_address}/api/nodes/ws"
        conn = await self._connmgr.connect(uri)
        logger.debug(f"join > conn: {conn}")

        joinmsg = JoinMessageModel(uuid=self._state.uuid,
                                   hostname=self._state.hostname,
                                   address=self._state.address,
                                   token=token)
        msg = MessageModel(type=MessageTypeEnum.JOIN, data=joinmsg.dict())
        await conn.send(msg)

        self._state.stage = NodeStageEnum.JOINING
        self._save_state()

        reply: MessageModel = await conn.receive()
        logger.debug(f"join > recv: {reply}")
        if reply.type == MessageTypeEnum.ERROR:
            errmsg = ErrorMessageModel.parse_obj(reply.data)
            logger.error(f"join > error: {errmsg.what}")
            await conn.close()
            # The leader refused us: go back to the initial stage.
            self._state.stage = NodeStageEnum.NONE
            self._save_state()
            return False

        assert reply.type == MessageTypeEnum.WELCOME
        welcome = WelcomeMessageModel.parse_obj(reply.data)
        assert welcome.pubkey

        authorized_keys: Path = Path("/root/.ssh/authorized_keys")
        if not authorized_keys.parent.exists():
            authorized_keys.parent.mkdir(0o700)
        # authorized_keys holds one key per line. Make sure the new key
        # starts on a line of its own and is newline-terminated, so it can
        # neither fuse with the previous entry nor with a later one.
        existing_keys: str = ""
        if authorized_keys.exists():
            existing_keys = authorized_keys.read_text()
        with authorized_keys.open("a") as fd:
            if existing_keys and not existing_keys.endswith("\n"):
                fd.write("\n")
            fd.write(welcome.pubkey.strip() + "\n")
            logger.debug(f"join > wrote pubkey to {authorized_keys}")

        readymsg = ReadyToAddMessageModel()
        await conn.send(
            MessageModel(type=MessageTypeEnum.READY_TO_ADD, data=readymsg))
        await conn.close()

        self._state.stage = NodeStageEnum.READY
        self._state.role = NodeRoleEnum.FOLLOWER
        self._save_state()

        self._token = token
        self._save_token(should_exist=False)

        self._node_start()
        return True
Exemple #10
0
    async def join(
        self,
        leader_address: str,
        token: str,
        uuid: UUID,
        hostname: str,
        address: str,
        disks: DeploymentDisksConfig,
    ) -> bool:
        """Join an existing cluster through its leader.

        Sends a JOIN message to the leader and waits for the reply. On
        WELCOME the system disk is created and enabled, the hostname is set,
        the received public key, ceph.conf and admin keyring are written to
        disk, the NTP address is read from the store and applied, and
        READY_TO_ADD is sent. The node then waits up to 30 seconds for the
        orchestrator to report the host as added before assimilating the
        storage devices and marking the state as ready.

        Args:
            leader_address: Address of the leader; used as the host part of
                the websocket URI.
            token: Token presented to the leader in the JOIN message.
            uuid: This node's uuid, sent in the JOIN message.
            hostname: Hostname to announce to the leader and to set locally.
            address: This node's address, sent in the JOIN message.
            disks: Provides the system disk (``disks.system``) and the
                storage devices to assimilate (``disks.storage``).

        Returns:
            True once the node has joined; False if the leader replied with
            an ERROR message (the state is then marked with CANT_JOIN).

        Raises:
            NodeBootstrappingError: The node is bootstrapping.
            NodeHasBeenDeployedError: The node has already been deployed.
            NodeAlreadyJoiningError: The node is already joining.
            NodeHasJoinedError: The node is already ready.
            NodeCantJoinError: Creating the system disk failed, the host was
                not added within the timeout, or device assimilation failed.
        """
        # The token is a shared secret: keep it out of the logs.
        logger.debug(f"join > with leader {leader_address}")

        assert self._state
        assert hostname
        assert address

        if self._state.bootstrapping:
            raise NodeBootstrappingError()
        elif self._state.deployed:
            raise NodeHasBeenDeployedError()
        elif self._state.joining:
            raise NodeAlreadyJoiningError()
        elif self._state.ready:
            raise NodeHasJoinedError()
        assert self._state.nostage

        uri: str = f"ws://{leader_address}/api/nodes/ws"
        conn = await self._connmgr.connect(uri)
        logger.debug(f"join > conn: {conn}")

        joinmsg = JoinMessageModel(
            uuid=uuid, hostname=hostname, address=address, token=token
        )
        msg = MessageModel(type=MessageTypeEnum.JOIN, data=joinmsg.dict())
        await conn.send(msg)

        reply: MessageModel = await conn.receive()
        # Log only the type: a WELCOME reply carries the admin keyring.
        logger.debug(f"join > recv: {reply.type}")
        if reply.type == MessageTypeEnum.ERROR:
            errmsg = ErrorMessageModel.parse_obj(reply.data)
            logger.error(f"join > error: {errmsg.what}")
            await conn.close()
            self._state.mark_error(
                code=DeploymentErrorEnum.CANT_JOIN, msg=errmsg.what
            )
            return False

        assert reply.type == MessageTypeEnum.WELCOME
        welcome = WelcomeMessageModel.parse_obj(reply.data)
        assert welcome.pubkey
        assert welcome.cephconf
        assert welcome.keyring

        # create system disk after we are certain we are joining.
        # ensure all state writes happen only after the disk has been created.
        systemdisk = SystemDisk(self._gstate)
        try:
            await systemdisk.create(disks.system)
            await systemdisk.enable()
        except GravelError as e:
            raise NodeCantJoinError(e.message) from e

        self._state.mark_join()
        await self._set_hostname(hostname)

        authorized_keys: Path = Path("/root/.ssh/authorized_keys")
        if not authorized_keys.parent.exists():
            authorized_keys.parent.mkdir(0o700)
        # authorized_keys holds one key per line. Make sure the new key
        # starts on a line of its own and is newline-terminated, so it can
        # neither fuse with the previous entry nor with a later one.
        existing_keys: str = ""
        if authorized_keys.exists():
            existing_keys = authorized_keys.read_text()
        with authorized_keys.open("a") as fd:
            if existing_keys and not existing_keys.endswith("\n"):
                fd.write("\n")
            fd.write(welcome.pubkey.strip() + "\n")
            logger.debug(f"join > wrote pubkey to {authorized_keys}")

        cephconf_path: Path = Path("/etc/ceph/ceph.conf")
        keyring_path: Path = Path("/etc/ceph/ceph.client.admin.keyring")
        if not cephconf_path.parent.exists():
            cephconf_path.parent.mkdir(0o755)
        cephconf_path.write_text(welcome.cephconf)
        cephconf_path.chmod(0o644)
        # Restrict the keyring to its owner *before* the secret is written,
        # so it is never readable by others, not even briefly. touch() only
        # applies the mode to a new file, hence the explicit chmod as well.
        keyring_path.touch(mode=0o600)
        keyring_path.chmod(0o600)
        keyring_path.write_text(welcome.keyring)

        # We've got ceph.conf and the admin keyring now, kick the kvstore
        # to get a connection.
        await self._gstate.store.ensure_connection()

        # get NTP address
        ntp_addr = await self._gstate.store.get("/nodes/ntp_addr")
        assert ntp_addr
        await self._set_ntp_addr(ntp_addr)

        readymsg = ReadyToAddMessageModel()
        await conn.send(
            MessageModel(type=MessageTypeEnum.READY_TO_ADD, data=readymsg)
        )
        await conn.close()

        logger.debug("join > wait for host to be added")
        orch = Orchestrator(self._gstate.ceph_mgr)
        try:
            await asyncio.wait_for(orch.wait_host_added(hostname), 30.0)
        except asyncio.TimeoutError as e:
            # wait_for() raises asyncio.TimeoutError, which is only an alias
            # of the builtin TimeoutError from Python 3.11 onwards.
            logger.error("join > timeout waiting for host to be added")
            raise NodeCantJoinError(
                "host was not added to the cluster"
            ) from e
        logger.debug("join > host added, continue")

        try:
            await self._assimilate_devices(hostname, disks.storage)
        except DeploymentError as e:
            raise NodeCantJoinError(e.message) from e

        self._state.mark_ready()
        return True
Exemple #11
0
    async def join(
        self,
        leader_address: str,
        token: str,
        uuid: UUID,
        hostname: str,
        address: str,
        disks: DeploymentDisksConfig,
    ) -> bool:
        """Join an existing cluster through its leader.

        Sends a JOIN message to the leader and waits for the reply. On
        WELCOME the state is marked as joining and the node is prepared via
        ``_prepare_node`` with the received public key, keyring and
        ceph.conf, after which READY_TO_ADD is sent. The node then waits up
        to 30 seconds for the orchestrator to report the host as added
        before assimilating the storage devices and marking the state as
        ready.

        Args:
            leader_address: Address of the leader; used as the host part of
                the websocket URI.
            token: Token presented to the leader in the JOIN message.
            uuid: This node's uuid, sent in the JOIN message.
            hostname: Hostname announced to the leader and used to prepare
                the node.
            address: This node's address, sent in the JOIN message.
            disks: Provides the system disk (``disks.system``) and the
                storage devices to assimilate (``disks.storage``).

        Returns:
            True once the node has joined; False if the leader replied with
            an ERROR message (the state is then marked with CANT_JOIN).

        Raises:
            NodeBootstrappingError: The node is bootstrapping.
            NodeHasBeenDeployedError: The node has already been deployed.
            NodeAlreadyJoiningError: The node is already joining.
            NodeHasJoinedError: The node is already ready.
            NodeCantJoinError: The host was not added within the timeout, or
                device assimilation failed.
        """
        # The token is a shared secret: keep it out of the logs.
        logger.debug(f"join > with leader {leader_address}")

        assert self._state
        assert hostname
        assert address

        if self._state.bootstrapping:
            raise NodeBootstrappingError()
        elif self._state.deployed:
            raise NodeHasBeenDeployedError()
        elif self._state.joining:
            raise NodeAlreadyJoiningError()
        elif self._state.ready:
            raise NodeHasJoinedError()
        assert self._state.nostage

        uri: str = f"ws://{leader_address}/api/nodes/ws"
        conn = await self._connmgr.connect(uri)
        logger.debug(f"join > conn: {conn}")

        joinmsg = JoinMessageModel(uuid=uuid,
                                   hostname=hostname,
                                   address=address,
                                   token=token)
        msg = MessageModel(type=MessageTypeEnum.JOIN, data=joinmsg.dict())
        await conn.send(msg)

        reply: MessageModel = await conn.receive()
        # Log only the type: a WELCOME reply carries the admin keyring.
        logger.debug(f"join > recv: {reply.type}")
        if reply.type == MessageTypeEnum.ERROR:
            errmsg = ErrorMessageModel.parse_obj(reply.data)
            logger.error(f"join > error: {errmsg.what}")
            await conn.close()
            self._state.mark_error(code=DeploymentErrorEnum.CANT_JOIN,
                                   msg=errmsg.what)
            return False

        assert reply.type == MessageTypeEnum.WELCOME
        welcome = WelcomeMessageModel.parse_obj(reply.data)
        assert welcome.pubkey
        assert welcome.cephconf
        assert welcome.keyring

        self._state.mark_join()
        await self._prepare_node(
            disks.system,
            hostname,
            ntpaddr=None,
            pubkey=welcome.pubkey,
            keyring=welcome.keyring,
            cephconf=welcome.cephconf,
            containerconf=None,
            is_join=True,
            progress_cb=None,
        )

        readymsg = ReadyToAddMessageModel()
        await conn.send(
            MessageModel(type=MessageTypeEnum.READY_TO_ADD, data=readymsg))
        await conn.close()

        logger.debug("join > wait for host to be added")
        orch = Orchestrator(self._gstate.ceph_mgr)
        try:
            await asyncio.wait_for(orch.wait_host_added(hostname), 30.0)
        except asyncio.TimeoutError as e:
            # wait_for() raises asyncio.TimeoutError, which is only an alias
            # of the builtin TimeoutError from Python 3.11 onwards.
            logger.error("join > timeout waiting for host to be added")
            raise NodeCantJoinError(
                "Host was not added to the cluster.") from e
        logger.debug("join > host added, continue")

        try:
            await self._assimilate_devices(hostname, disks.storage)
        except DeploymentError as e:
            raise NodeCantJoinError(e.message) from e

        self._state.mark_ready()
        return True