def _inspect_containers(
    config: Dict[str, Any],
    lock: synchronize.Lock,
    replications_queue: Queue,  # type: ignore
    replication_statuses: Dict[str, str],
    containers: List[Any],
) -> Tuple[List[Dict[str, str]], List[str]]:
    statuses: List[str] = []
    desired_containers: List[Dict[str, str]] = []

    for container in containers:
        name, image = ((container["name"],
                        container["image"]) if isinstance(container, dict) else
                       (container.name, container.image))
        desired_image = _get_desired_image(config, image)
        logger.debug("Container Image: %s -> %s", image, desired_image)
        desired_containers.append({"name": name, "image": desired_image})

        if image != desired_image:
            status = _get_replication_status(lock, replications_queue,
                                             replication_statuses, image,
                                             desired_image)
            logger.info("Image: %s -> %s, Replication Status: %s", image,
                        desired_image, status)
            statuses.append(status)

    return desired_containers, statuses
def _replicate_image(config: Dict[str, Any], src: str, dest: str) -> str:
    """Start a CodeBuild build that replicates ``src`` to ``dest`` and wait for it.

    The buildspec comes from ``_generate_buildspec`` and is passed to
    CodeBuild as an override with no source. The build is then polled every
    10 seconds until it leaves the ``IN_PROGRESS`` state.

    Args:
        config: Settings providing ``repo_host``, ``repo_prefix``,
            ``codebuild_project`` and ``codebuild_timeout``.
        src: Source image reference.
        dest: Destination image reference.

    Returns:
        The first ``buildStatus`` reported for the build that is not
        ``"IN_PROGRESS"``.
    """
    logger.info("Replicating Image: %s -> %s", src, dest)

    spec_document = _generate_buildspec(
        config["repo_host"], config["repo_prefix"], src, dest
    )
    buildspec = yaml.safe_dump(spec_document)
    logger.debug("BuildSpec:\n%s", buildspec)

    codebuild = boto3.client("codebuild")
    started = codebuild.start_build(
        projectName=config["codebuild_project"],
        sourceTypeOverride="NO_SOURCE",
        buildspecOverride=buildspec,
        timeoutInMinutesOverride=config["codebuild_timeout"],
        privilegedModeOverride=True,
    )
    build_id = started["build"]["id"]

    logger.info("Started CodeBuild Id: %s", build_id)

    while True:
        latest = codebuild.batch_get_builds(ids=[build_id])["builds"][0]
        build_status: str = latest["buildStatus"]
        build_phase: str = latest["currentPhase"]

        logger.debug(
            "CodeBuild Id: %s, Phase: %s,  Status: %s",
            build_id,
            build_phase,
            build_status,
        )

        # Any status other than IN_PROGRESS ends the wait.
        if build_status != "IN_PROGRESS":
            return build_status

        time.sleep(10)
def _inspect_standalone_pods(
    config: Dict[str, Any],
    lock: synchronize.Lock,
    replications_queue: Queue,  # type: ignore
    replication_statuses: Dict[str, str],
) -> None:
    """Patch standalone and Job-owned pods once their images are replicated.

    Lists pods across all namespaces and skips any pod whose first owner
    reference is not a ``Job``. For the remaining pods, the desired image of
    every container and init container is resolved. When at least one image
    needed replication and every such replication reports ``"Complete"``,
    the tracked statuses for the pod's images are dropped and the pod is
    patched to use the desired images.

    Args:
        config: Replicator settings, forwarded to ``_inspect_containers``.
        lock: Lock guarding ``replication_statuses``.
        replications_queue: Queue forwarded to ``_inspect_containers``.
        replication_statuses: Shared mapping of desired image to status.
    """
    core_api = CoreV1Api()
    pods = core_api.list_pod_for_all_namespaces()
    for pod in pods.items:
        owner_references = pod.metadata.owner_references
        if owner_references:
            owner_reference = owner_references[0]
            if owner_reference.kind != "Job":
                logger.debug("Skipping Owned Pod: %s/%s",
                             pod.metadata.namespace, pod.metadata.name)
                continue
            logger.info("Found Pod: %s/%s for Job: %s",
                        pod.metadata.namespace, pod.metadata.name,
                        owner_reference.name)

        spec = pod.spec
        containers, statuses = _inspect_containers(config, lock,
                                                   replications_queue,
                                                   replication_statuses,
                                                   spec.containers)
        if spec.init_containers:
            init_containers, init_statuses = _inspect_containers(
                config, lock, replications_queue, replication_statuses,
                spec.init_containers)
        else:
            init_containers, init_statuses = [], []

        all_statuses = statuses + init_statuses
        # Nothing to patch unless something needed replication and all of it
        # has finished.
        if not all_statuses or any(
                status != "Complete" for status in all_statuses):
            continue

        with lock:
            for container in containers + init_containers:
                # A container that was already on its desired image never had
                # a status recorded, and two containers may share one image.
                # pop() tolerates the missing key where ``del`` raised
                # KeyError.
                replication_statuses.pop(container["image"], None)

        body = {
            "spec": {
                "containers": containers,
                "initContainers": init_containers,
            }
        }

        core_api.patch_namespaced_pod(
            name=pod.metadata.name,
            namespace=pod.metadata.namespace,
            body=body,
        )
        logger.info("Patched Pod: %s, Namespace: %s", pod.metadata.name,
                    pod.metadata.namespace)
def _inspect_daemon_sets(
    config: Dict[str, Any],
    lock: synchronize.Lock,
    replications_queue: Queue,  # type: ignore
    replication_statuses: Dict[str, str],
) -> None:
    """Patch daemon sets once all of their images have been replicated.

    Lists daemon sets across all namespaces and resolves the desired image
    of every container and init container in each pod template. When at
    least one image needed replication and every such replication reports
    ``"Complete"``, the tracked statuses for the template's images are
    dropped and the daemon set is patched to use the desired images.

    Args:
        config: Replicator settings, forwarded to ``_inspect_containers``.
        lock: Lock guarding ``replication_statuses``.
        replications_queue: Queue forwarded to ``_inspect_containers``.
        replication_statuses: Shared mapping of desired image to status.
    """
    apps_api = AppsV1Api()
    daemon_sets = apps_api.list_daemon_set_for_all_namespaces()
    for daemon_set in daemon_sets.items:
        spec = daemon_set.spec.template.spec
        containers, statuses = _inspect_containers(config, lock,
                                                   replications_queue,
                                                   replication_statuses,
                                                   spec.containers)
        if spec.init_containers:
            init_containers, init_statuses = _inspect_containers(
                config, lock, replications_queue, replication_statuses,
                spec.init_containers)
        else:
            init_containers, init_statuses = [], []

        all_statuses = statuses + init_statuses
        # Nothing to patch unless something needed replication and all of it
        # has finished.
        if not all_statuses or any(
                status != "Complete" for status in all_statuses):
            continue

        with lock:
            for container in containers + init_containers:
                # A container that was already on its desired image never had
                # a status recorded, and two containers may share one image.
                # pop() tolerates the missing key where ``del`` raised
                # KeyError.
                replication_statuses.pop(container["image"], None)

        body = {
            "spec": {
                "template": {
                    "spec": {
                        "containers": containers,
                        "initContainers": init_containers,
                    }
                }
            }
        }

        apps_api.patch_namespaced_daemon_set(
            name=daemon_set.metadata.name,
            namespace=daemon_set.metadata.namespace,
            body=body,
        )
        logger.info("Patched Daemon Set: %s, Namespace: %s",
                    daemon_set.metadata.name,
                    daemon_set.metadata.namespace)
def monitor(
    config: Dict[str, Any],
    lock: synchronize.Lock,
    replications_queue: Queue,  # type: ignore
    replication_statuses: Dict[str, str],
) -> int:
    """Repeatedly inspect cluster workloads for images that need replicating.

    Each pass reloads the cluster configuration, inspects deployments,
    daemon sets and standalone pods in that order, then sleeps 20 seconds.

    Args:
        config: Replicator settings; ``in_cluster_deployment`` is passed to
            ``load_config``.
        lock: Lock forwarded to the inspectors.
        replications_queue: Queue forwarded to the inspectors.
        replication_statuses: Status mapping forwarded to the inspectors.

    Returns:
        ``-1`` after logging the first exception raised during a pass. The
        loop has no other exit.
    """
    try:
        while True:
            load_config(config["in_cluster_deployment"])

            # Job inspection is currently disabled; Job-owned pods are
            # handled by the standalone pod pass instead.
            passes = (
                ("Monitoring Deployments", _inspect_deployments),
                ("Monitoring Daemon Sets", _inspect_daemon_sets),
                ("Monitoring Standalone Pods", _inspect_standalone_pods),
            )
            for announcement, inspector in passes:
                logger.info(announcement)
                inspector(config, lock, replications_queue,
                          replication_statuses)

            time.sleep(20)
    except Exception as error:
        logger.exception(error)
        return -1
def _get_replication_status(
    lock: synchronize.Lock,
    replications_queue: Queue,  # type: ignore
    replication_statuses: Dict[str, str],
    image: str,
    desired_image: str,
) -> str:
    with lock:
        status = replication_statuses.get(desired_image, "Unknown")

        if status == "Unknown":
            if image_replicated(desired_image):
                logger.info(
                    "Skipping previously completed Replication Task: %s -> %s",
                    image, desired_image)
                status = "Complete"
                replication_statuses[desired_image] = status
            else:
                logger.info("Queueing Replication Task: %s -> %s", image,
                            desired_image)
                status = "Pending:1"
                replication_statuses[desired_image] = status
                replications_queue.put({"src": image, "dest": desired_image})
        elif status.startswith("Failed"):
            attempt = int(status.split(":")[1])
            if attempt < 3:
                attempt = attempt + 1
                logger.info(
                    "Queueing Failed Replication Task Attemp %s: %s -> %s",
                    attempt, image, desired_image)
                replication_statuses[desired_image] = f"Pending:{attempt}"
                replications_queue.put({"src": image, "dest": desired_image})
            else:
                logger.error("Too many failed replication attempts: %s -> %s",
                             image, desired_image)

        return status
def replicate(
    config: Dict[str, Any],
    lock: synchronize.Lock,
    replications_queue: Queue,  # type: ignore
    replication_statuses: Dict[str, str],
    replicator_id: int,
) -> int:
    """Worker loop: take replication tasks off the queue and run them.

    Each iteration reloads the cluster configuration and blocks until a
    ``{"src", "dest"}`` task is available. Tasks whose status is
    ``"Complete"``, ``"Failed:*"`` or ``"Replicating:*"`` are skipped.
    Otherwise the task is marked ``"Replicating:<attempt>"``, the image is
    replicated, and the outcome is stored as ``"Complete"`` or
    ``"Failed:<attempt>"``. Every iteration ends with a 5 second sleep.

    Args:
        config: Replicator settings; ``in_cluster_deployment`` is passed to
            ``load_config`` and the whole mapping to ``_replicate_image``.
        lock: Lock guarding ``replication_statuses``.
        replications_queue: Queue of pending replication tasks.
        replication_statuses: Shared mapping of desired image to status.
        replicator_id: Identifier used only for logging.

    Returns:
        Never returns normally; the loop runs until the process is stopped.
    """
    logger.info("Started Replicator Id: %s", replicator_id)

    while True:
        # Reset per-iteration state so the error handler can tell whether a
        # task was actually claimed during *this* iteration.
        replication_task: Optional[Dict[str, str]] = None
        attempt: Optional[int] = None

        try:
            load_config(config["in_cluster_deployment"])

            queue_size = replications_queue.qsize()
            logger.info("Queue Size: %s", queue_size)

            replication_task = cast(
                Dict[str, str], replications_queue.get(block=True,
                                                       timeout=None))
            src, dest = replication_task["src"], replication_task["dest"]

            with lock:
                logger.info("Got Replication Task: %s -> %s", src, dest)

                status = replication_statuses[dest]
                if status == "Complete":
                    logger.info("Skipping Completed Task: %s -> %s", src, dest)
                    continue
                elif status.startswith("Failed"):
                    logger.info("Skipping Failed Task: %s -> %s", src, dest)
                    continue
                elif status.startswith("Replicating"):
                    logger.info("Skipping Replicating Task: %s -> %s", src,
                                dest)
                    continue
                else:
                    attempt = int(status.split(":")[1])
                    replication_statuses[dest] = f"Replicating:{attempt}"

            result = _replicate_image(config, src, dest)

            with lock:
                if result == "SUCCEEDED":
                    logger.info("Replication Complete: %s -> %s", src, dest)
                    replication_statuses[dest] = "Complete"
                else:
                    logger.error(
                        "Image Replication Attempt %s Failed: %s -> %s",
                        attempt,
                        src,
                        dest,
                    )
                    replication_statuses[dest] = f"Failed:{attempt}"

        except Exception as e:
            if attempt is None:
                # The failure happened before a task was claimed (config
                # load, queue read or status lookup). ``src``/``dest`` are
                # then unbound or left over from an earlier task, so no
                # status may be touched.
                logger.exception(e)
            else:
                with lock:
                    logger.error(
                        "Image Replication Attempt %s Failed: %s -> %s",
                        attempt,
                        src,
                        dest,
                    )
                    logger.exception(e)
                    replication_statuses[dest] = f"Failed:{attempt}"
        finally:
            time.sleep(5)
# Example #8
# 0
def main(
    repo_host: Optional[str],
    repo_prefix: Optional[str],
    codebuild_project: Optional[str],
    codebuild_timeout: Optional[int],
    replicate_external_repos: Optional[bool],
    in_cluster_deployment: Optional[bool],
    replicator_processes: Optional[int],
    debug: bool,
) -> int:
    """Resolve settings, then run one monitor process and the replicator workers.

    String and integer settings that are falsy, and boolean settings that
    are ``None``, fall back to the matching ``IMAGE_REPLICATOR_*``
    environment variable and then to a built-in default. The resolved
    settings are logged and validated. A managed lock, queue and status
    dict are then shared between one monitor process and
    ``replicator_processes`` replicator processes. When the monitor process
    exits, the replicators are terminated.

    Args:
        repo_host: Destination repository host.
        repo_prefix: Destination repository prefix.
        codebuild_project: CodeBuild project name.
        codebuild_timeout: CodeBuild timeout; defaults to 30.
        replicate_external_repos: Defaults to false.
        in_cluster_deployment: Defaults to true.
        replicator_processes: Number of replicator processes; defaults to 3.
        debug: When true, the logger level is set to ``DEBUG``.

    Returns:
        ``0`` after the monitor process has exited.

    Raises:
        click.ClickException: If ``repo_host``, ``repo_prefix``,
            ``codebuild_project``, ``codebuild_timeout`` or
            ``replicator_processes`` resolves to a falsy value.
    """

    def env_flag(variable: str, fallback: str) -> bool:
        # Only "true", "yes" and "1" (case-insensitive) count as enabled.
        return os.environ.get(variable, fallback).lower() in ["true", "yes", "1"]

    if debug:
        logger.setLevel(logging.DEBUG)

    repo_host = repo_host or os.environ.get("IMAGE_REPLICATOR_REPO_HOST", "")
    repo_prefix = repo_prefix or os.environ.get("IMAGE_REPLICATOR_REPO_PREFIX", "")
    codebuild_project = codebuild_project or os.environ.get("IMAGE_REPLICATOR_CODEBUILD_PROJECT", "")
    codebuild_timeout = codebuild_timeout or int(os.environ.get("IMAGE_REPLICATOR_CODEBUILD_TIMEOUT", "30"))
    # Booleans fall back only when unset, so an explicit False is respected.
    if replicate_external_repos is None:
        replicate_external_repos = env_flag("IMAGE_REPLICATOR_REPLICATE_EXTERNAL_REPOS", "False")
    if in_cluster_deployment is None:
        in_cluster_deployment = env_flag("IMAGE_REPLICATOR_IN_CLUSTER_DEPLOYMENT", "True")
    replicator_processes = replicator_processes or int(os.environ.get("IMAGE_REPLICATOR_REPLICATOR_PROCESSES", "3"))

    for template, value in (
        ("repo_host: %s", repo_host),
        ("repo_prefix: %s", repo_prefix),
        ("codebuild_project: %s", codebuild_project),
        ("codebuild_timeout: %s", codebuild_timeout),
        ("replicate_external_repos: %s", replicate_external_repos),
        ("in_cluster_deployment: %s", in_cluster_deployment),
        ("replicator_processes: %s", replicator_processes),
    ):
        logger.info(template, value)

    required = (repo_host, repo_prefix, codebuild_project, codebuild_timeout, replicator_processes)
    if not all(required):
        error = click.ClickException(
            "All of repo_host, repo_prefix, codebuild_project, codebuild_timeout, "
            "and replicator_processes are required."
        )
        logger.error(error)
        raise error

    with Manager() as manager:
        sync_manager = cast(SyncManager, manager)
        lock = sync_manager.Lock()
        replications_queue = sync_manager.Queue()
        replication_statuses: Dict[str, str] = sync_manager.dict()
        config = {
            "repo_host": repo_host,
            "repo_prefix": repo_prefix,
            "codebuild_project": codebuild_project,
            "codebuild_timeout": codebuild_timeout,
            "replicate_external_repos": replicate_external_repos,
            "in_cluster_deployment": in_cluster_deployment,
            "replicator_processes": replicator_processes,
        }
        # Every child process receives the same shared objects.
        shared_kwargs = {
            "config": config,
            "lock": lock,
            "replications_queue": replications_queue,
            "replication_statuses": replication_statuses,
        }

        logger.info("Starting Monitoring Process")
        monitor_process = Process(target=image_monitor.monitor, kwargs=dict(shared_kwargs))
        monitor_process.start()

        workers = []
        for worker_id in range(replicator_processes):
            logger.info("Starting Replication Process: %s", worker_id)
            worker = Process(
                target=image_replicator.replicate,
                kwargs={**shared_kwargs, "replicator_id": worker_id},
            )
            workers.append(worker)
            worker.start()

        # The replicators loop forever, so they are stopped once the monitor
        # process has exited.
        monitor_process.join()
        for worker in workers:
            worker.terminate()

    return 0