예제 #1
0
파일: node.py 프로젝트: sdss/fliswarm
    async def report_status(
        self,
        command: Command,
        volumes: bool = True,
        containers: bool = True,
    ):
        """Reports the status of the node to an actor.

        Parameters
        ----------
        command
            The command that is requesting the status. Its actor's
            configuration supplies ``ping_timeout``, ``image``, ``registry``
            and ``volumes``.
        volumes
            Whether to report the volumes connected to the node Docker engine.
        containers
            Whether to report the containers running. Only reports running
            containers whose ancestor matches the ``config['image']``.

        Notes
        -----
        Outputs the ``node`` keyword, with format
        ``node={node_name, addr, daemon_addr, node_alive, docker_alive}``.
        If ``containers=True``, outputs the ``container`` keyword with
        format ``container={node_name, container_short_id}``; the short id
        is ``"NA"`` unless exactly one matching container is running. If
        ``volumes=True``, reports one ``volume`` keyword per configured
        volume with format ``volume={node_name, volume, present, device}``,
        where ``device`` is ``"NA"`` when the volume is missing or does not
        declare a ``device`` option.

        If the node has no client, only a warning is issued. If the node does
        not ping back or its Docker client is not connected, the ``node``
        keyword is still output, the client is closed, and nothing else is
        reported.
        """

        # Positions 3 and 4 are node_alive and docker_alive; both start False
        # and are flipped as each check below succeeds.
        status = [self.name, self.addr, self.daemon_addr, False, False]

        config = command.actor.config

        if not self.client:
            command.warning(text=f"Node {self.addr} has no client.")
            return

        if not (await self.ping(timeout=config["ping_timeout"])):
            command.warning(text=f"Node {self.addr} is not pinging back.")
            command.info(node=status)
            if self.client:
                self.client.close()
            return

        status[3] = True  # The NUC is responding.

        if not (await self.client_alive()):
            command.warning(text=f"Docker client on node {self.addr} is not connected.")
            command.info(node=status)
            if self.client:
                self.client.close()
            return

        status[4] = True
        command.info(node=status)

        if containers:

            # Match on the image name without its tag, prefixed with the
            # registry when one is configured.
            image = config["image"].split(":")[0]
            if config["registry"]:
                image = config["registry"] + "/" + image

            container_list: List[Any] = await self._run(
                self.client.containers.list,
                all=True,
                filters={"ancestor": image, "status": "running"},
            )

            if not container_list:
                command.warning(text=f"No containers running on {self.addr}.")
                command.debug(container=[self.name, "NA"])
            elif len(container_list) > 1:
                command.warning(
                    text=f"Multiple containers with image {image} "
                    f"running on node {self.addr}."
                )
                command.debug(container=[self.name, "NA"])
            else:
                command.debug(container=[self.name, container_list[0].short_id])

        if volumes:
            for vname in config["volumes"]:
                volume: Any = await self.get_volume(vname)
                if volume is False:
                    command.warning(text=f"Volume {vname} not present in {self.name}.")
                    command.debug(volume=[self.name, vname, False, "NA"])
                    continue

                # A volume created without driver options may report a null
                # or empty ``Options`` entry; fall back to "NA" instead of
                # raising in the middle of the status report.
                options = volume.attrs.get("Options") or {}
                device = options.get("device", "NA")
                command.debug(volume=[self.name, vname, True, device])
예제 #2
0
async def reconnect(
    command: Command,
    nodes: Dict[str, Node],
    names: str,
    category: str,
    force: bool,
):
    """Recreates volumes and restarts the Docker containers.

    The connected camera devices of the selected nodes are stopped first.
    The nodes are then reconnected concurrently (container stopped, volumes
    created, container started). After a five-second pause, the device of
    every node whose container is running is restarted and the outcome is
    reported. The command is finished at the end.

    Parameters
    ----------
    command
        The command driving the operation. Its actor provides the
        configuration, the ``flicameras`` mapping and the observatory.
    nodes
        The available nodes, passed to ``select_nodes``.
    names
        Passed to ``select_nodes`` together with ``category``.
    category
        Passed to ``select_nodes`` together with ``names``.
    force
        Forwarded as ``force`` to ``node.create_volume``.
    """

    assert command.actor
    config = command.actor.config

    async def _bring_up(node):
        """Reconnect one node and restart its container.

        Returns the result of ``node.run_container``, or ``None`` when the
        node cannot be reached.
        """

        actor = command.actor
        assert actor

        try:
            await node.connect()
            reachable = await node.connected()
        except ConnectionError:
            reachable = False

        if not reachable:
            command.warning(
                text=f"Node {node.name} is not pinging back or "
                "the Docker daemon is not running. Try "
                "rebooting the computer."
            )
            return

        # The container has to go first: volumes attached to a running
        # container cannot be removed.
        await node.stop_container(
            config["container_name"] + f"-{node.name}",
            config["image"],
            force=True,
            command=command,
        )

        for vname, vconfig in config["volumes"].items():
            await node.create_volume(
                vname,
                driver=vconfig["driver"],
                opts=vconfig["opts"],
                force=force,
                command=command,
            )

        started = await node.run_container(
            actor.get_container_name(node),
            config["image"],
            volumes=list(config["volumes"]),
            privileged=True,
            registry=config["registry"],
            ports=[config["nodes"][actor.observatory][node.name]["port"]],
            envs={"ACTOR_NAME": node.name, "OBSERVATORY": actor.observatory},
            force=True,
            command=command,
        )
        return started

    selected = select_nodes(nodes, category, names)

    # Release the devices before touching the containers; otherwise we get
    # weird hangups.
    for node in selected:
        camera = command.actor.flicameras[node.name]
        if camera.is_connected():
            await camera.stop()

    await asyncio.gather(*map(_bring_up, selected))

    command.info(text="Waiting 5 seconds before reconnecting the devices ...")
    await asyncio.sleep(5)

    for node in selected:

        container_name = config["container_name"] + f"-{node.name}"
        if await node.is_container_running(container_name):
            camera = command.actor.flicameras[node.name]
            await camera.restart()

            if not camera.is_connected():
                command.warning(text=f"{node.name}: failed to connect to device.")
                continue

            port = camera.port
            await node.report_status(command)
            command.debug(text=f"{node.name}: reconnected to device on port {port}.")

    command.finish()