Exemple #1
0
def start_scraper(docker_client, task, dns, host_workdir):
    """Start the scraper container of a task and return run_container()'s result.

    A leftover container bearing the same name is removed first, the scraper
    image is pulled, then the container is started detached with
    `host_workdir` bind-mounted at the mount point set in the task config.

    Args:
        docker_client: client handed to every docker helper called here.
        task: task document; `_id`, `schedule_name` and `config`
            (`task_name`, `image`, `mount_point`, `str_command`,
            `resources`) are read from it.
        dns: forwarded untouched to run_container().
        host_workdir: host-side path bound to the scraper's mount point.
    """
    config = task["config"]
    ident = f"{CONTAINER_SCRAPER_IDENT}_{config['task_name']}"
    task_id = task["_id"]
    container_name = get_container_name(ident, task_id)

    # no such container is expected at this stage; drop it if there is one
    try:
        remove_container(docker_client, container_name)
    except docker.errors.NotFound:
        pass

    # the scraper image is pulled on every start
    image_conf = config["image"]
    tag = f'{image_conf["name"]}:{image_conf["tag"]}'
    logger.debug(f"Pulling image {tag}")
    docker_image = pull_image(docker_client, tag)

    # source is a path on the host's filesystem, target is inside the scraper
    workdir_mount = Mount(str(config["mount_point"]), str(host_workdir), type="bind")

    command = config["str_command"]
    resources = config["resources"]
    cpu_shares = resources["cpu"] * DEFAULT_CPU_SHARE
    mem_limit = resources["memory"]
    disk_limit = resources["disk"]
    shm_size = resources.get("shm")
    cap_add = resources.get("cap_add", [])
    cap_drop = resources.get("cap_drop", [])

    labels = {
        "zimfarm": "",
        "zimscraper": "yes",
        "task_id": task_id,
        "tid": short_id(task_id),
        "schedule_name": task["schedule_name"],
        # display-only, human-readable resources
        "human.cpu": str(resources["cpu"]),
        "human.memory": format_size(mem_limit),
        "human.disk": format_size(disk_limit),
    }

    # disk is only exposed through a label: it is already reserved on zimtask
    return run_container(
        docker_client,
        image=docker_image,
        command=command,
        cpu_shares=cpu_shares,
        mem_limit=mem_limit,
        dns=dns,
        detach=True,
        labels=labels,
        mem_swappiness=0,
        shm_size=shm_size,
        cap_add=cap_add,
        cap_drop=cap_drop,
        mounts=[workdir_mount],
        name=container_name,
        remove=False,  # scraper container will be removed once log&zim handled
    )
Exemple #2
0
def start_task_worker(docker_client, task, webapi_uri, username, workdir, worker_name):
    """Start the task-worker container of a task and return run_container()'s result.

    A leftover container bearing the same name is removed first. The image
    named by TASK_WORKER_IMAGE is looked up locally when its tag is `local`
    and pulled otherwise.

    Args:
        docker_client: client handed to every docker helper called here.
        task: task document; `_id` and `schedule_name` are read from it.
        webapi_uri: exported to the container as `WEB_API_URI`.
        username: exported to the container as `USERNAME`.
        workdir: path mounted read-write in the container, exported as `WORKDIR`.
        worker_name: exported to the container as `WORKER_NAME`.
    """
    task_id = task["_id"]
    container_name = task_container_name(task_id)

    # no such container is expected at this stage; drop it if there is one
    try:
        remove_container(docker_client, container_name)
    except docker.errors.NotFound:
        pass

    image, tag = TASK_WORKER_IMAGE.rsplit(":", 1)
    if tag != "local":
        logger.debug(f"pulling image {image}:{tag}")
        docker_image = pull_image(docker_client, image, tag=tag)
    else:
        docker_image = get_image(docker_client, TASK_WORKER_IMAGE)

    # bind sources must be paths on the host's filesystem, not this one
    # NOTE(review): a path missing from host_mounts presumably ends up as the
    # string "None" here (str() around .get()) — confirm that cannot happen
    host_mounts = query_host_mounts(docker_client, workdir)
    src_workdir, src_socket, src_key = (
        str(host_mounts.get(path)) for path in (workdir, DOCKER_SOCKET, PRIVATE_KEY)
    )
    mounts = [
        Mount(str(workdir), src_workdir, type="bind"),
        Mount(str(DOCKER_SOCKET), src_socket, type="bind", read_only=True),
        Mount(str(PRIVATE_KEY), src_key, type="bind", read_only=True),
    ]

    command = ["task-worker", "--task-id", task_id]
    logger.debug(f"running {command}")

    environment = {
        "USERNAME": username,
        "WORKDIR": str(workdir),
        "WEB_API_URI": webapi_uri,
        "UPLOAD_URI": UPLOAD_URI,
        "WORKER_NAME": worker_name,
        # forwarded from this process' own environment
        "ZIMFARM_DISK": os.getenv("ZIMFARM_DISK"),
        "ZIMFARM_CPUS": os.getenv("ZIMFARM_CPUS"),
        "ZIMFARM_MEMORY": os.getenv("ZIMFARM_MEMORY"),
        "DEBUG": os.getenv("DEBUG"),
        "USE_PUBLIC_DNS": "1" if USE_PUBLIC_DNS else "",
    }
    labels = {
        "zimfarm": "",
        "zimtask": "yes",
        "task_id": task_id,
        "tid": short_id(task_id),
        "schedule_name": task["schedule_name"],
    }

    return run_container(
        docker_client,
        image=docker_image,
        command=command,
        detach=True,
        environment=environment,
        labels=labels,
        mem_swappiness=0,
        mounts=mounts,
        name=container_name,
        remove=False,  # zimtask containers are pruned periodically
    )
Exemple #3
0
def start_scraper(docker_client, task, dns, host_workdir):
    """Start the scraper container of a task and return run_container()'s result.

    A leftover container bearing the same name is removed first, the scraper
    image is pulled, then the container is started detached with
    `host_workdir` bind-mounted at the mount point set in the task config.

    Args:
        docker_client: client handed to every docker helper called here.
        task: task document; `_id`, `schedule_name` and `config`
            (`task_name`, `image`, `mount_point`, `str_command`,
            `resources`) are read from it.
        dns: forwarded untouched to run_container().
        host_workdir: host-side path bound to the scraper's mount point.
    """
    config = task["config"]
    task_id = task["_id"]
    container_name = scraper_container_name(task_id, config["task_name"])

    # no such container is expected at this stage; drop it if there is one
    try:
        remove_container(docker_client, container_name)
    except docker.errors.NotFound:
        pass

    image_name = config["image"]["name"]
    image_tag = config["image"]["tag"]
    logger.debug(f"pulling image {image_name}:{image_tag}")
    docker_image = pull_image(
        docker_client, config["image"]["name"], tag=config["image"]["tag"]
    )

    # source is a path on the host's filesystem, target is inside the scraper
    workdir_mount = Mount(str(config["mount_point"]), str(host_workdir), type="bind")

    command = config["str_command"]
    resources = config["resources"]
    cpu_shares = resources["cpu"] * DEFAULT_CPU_SHARE
    mem_limit = resources["memory"]
    disk_limit = resources["disk"]

    labels = {
        "zimfarm": "",
        "zimscraper": "yes",
        "task_id": task_id,
        "tid": short_id(task_id),
        "schedule_name": task["schedule_name"],
        # disk is not passed as a limit, only recorded as a label
        RESOURCES_DISK_LABEL: str(disk_limit),
        # display-only, human-readable resources
        "human.cpu": str(resources["cpu"]),
        "human.memory": format_size(mem_limit),
        "human.disk": format_size(disk_limit),
    }

    return run_container(
        docker_client,
        image=docker_image,
        command=command,
        cpu_shares=cpu_shares,
        mem_limit=mem_limit,
        dns=dns,
        detach=True,
        labels=labels,
        mem_swappiness=0,
        mounts=[workdir_mount],
        name=container_name,
        remove=False,  # scraper container will be removed once log&zim handled
    )
Exemple #4
0
def start_dnscache(docker_client, task):
    """Start the dnscache container of a task and return run_container()'s result.

    The image named by DNSCACHE_IMAGE is obtained through get_or_pull_image()
    and started detached; the container is not auto-removed on exit.

    Args:
        docker_client: client handed to the docker helpers called here.
        task: task document; `_id` and `schedule_name` are read from it.
    """
    task_id = task["_id"]
    container_name = get_container_name("dnscache", task_id)

    # tells the container whether public DNS servers are to be used
    use_public_dns = "yes" if USE_PUBLIC_DNS else "no"
    environment = {"USE_PUBLIC_DNS": use_public_dns}

    docker_image = get_or_pull_image(docker_client, DNSCACHE_IMAGE)

    labels = {
        "zimfarm": "",
        "task_id": task_id,
        "tid": short_id(task_id),
        "schedule_name": task["schedule_name"],
    }
    return run_container(
        docker_client,
        image=docker_image,
        detach=True,
        name=container_name,
        environment=environment,
        remove=False,
        labels=labels,
    )
Exemple #5
0
def start_dnscache(docker_client, task):
    """Start the dnscache container of a task and return run_container()'s result.

    The `openzim/dnscache:latest` image is pulled on every call and started
    detached with `remove=True`.

    Args:
        docker_client: client handed to the docker helpers called here.
        task: task document; `_id` and `schedule_name` are read from it.
    """
    task_id = task["_id"]
    container_name = dnscache_container_name(task_id)

    # tells the container whether public DNS servers are to be used
    use_public_dns = "yes" if USE_PUBLIC_DNS else "no"
    environment = {"USE_PUBLIC_DNS": use_public_dns}

    docker_image = pull_image(docker_client, "openzim/dnscache", tag="latest")

    labels = {
        "zimfarm": "",
        "task_id": task_id,
        "tid": short_id(task_id),
        "schedule_name": task["schedule_name"],
    }
    return run_container(
        docker_client,
        image=docker_image,
        detach=True,
        name=container_name,
        environment=environment,
        remove=True,
        labels=labels,
    )
Exemple #6
0
def start_checker(docker_client, task, host_workdir, filename):
    """Start a zimcheck container on `filename` and return run_container()'s result.

    `host_workdir` is bind-mounted read-only at `/data` in the container and
    zimcheck is run on `/data/<filename>`, with the options found in
    `task["upload"]["zim"]["zimcheck"]` or `-A` when that value is falsy.

    Args:
        docker_client: client handed to the docker helpers called here.
        task: task document; `_id`, `schedule_name` and
            `upload.zim.zimcheck` are read from it.
        host_workdir: host-side folder holding the file to check.
        filename: name of the file to check, relative to the workdir.
    """
    task_id = task["_id"]
    container_name = get_container_name("checker", task_id)
    docker_image = get_or_pull_image(docker_client, CHECKER_IMAGE)

    # no such container is expected at this stage; drop it if there is one
    # NOTE(review): pruning shares the try block with the removal, so it is
    # skipped whenever remove_container() raises NotFound — confirm intended
    try:
        remove_container(docker_client, container_name)
        prune_containers(docker_client, {"label": [f"filename={filename}"]})
    except docker.errors.NotFound:
        pass

    # paths as seen from inside the container
    data_dir = pathlib.Path("/data")
    target = data_dir.joinpath(filename)
    data_mount = Mount(str(data_dir), str(host_workdir), type="bind", read_only=True)

    zimcheck_option = task["upload"]["zim"]["zimcheck"] or "-A"
    command = ["zimcheck", zimcheck_option, str(target)]

    labels = {
        "zimfarm": "",
        "task_id": task_id,
        "tid": short_id(task_id),
        "schedule_name": task["schedule_name"],
        "filename": filename,
    }
    return run_container(
        docker_client,
        image=docker_image,
        command=command,
        detach=True,
        name=container_name,
        mounts=[data_mount],
        labels=labels,
        remove=False,
    )
Exemple #7
0
def start_uploader(
    docker_client,
    task,
    kind,
    username,
    host_workdir,
    upload_dir,
    filename,
    move,
    delete,
    compress,
    resume,
    watch=False,
):
    """Start an uploader container for `filename` and return run_container()'s result.

    `host_workdir` is bind-mounted at `/data` (writable only when `delete` is
    set) along with the private key (read-only). The destination is built
    from `task["upload"][kind]["upload_uri"]`, to whose path `upload_dir` and
    the file name are appended.

    Args:
        docker_client: client handed to the docker helpers called here.
        task: task document; `_id`, `schedule_name` and `upload[kind]`
            (`upload_uri`, `expiration`) are read from it.
        kind: key of `task["upload"]` selecting the upload settings to use.
        username: passed to the uploader as `--username`.
        host_workdir: host-side folder holding the file to upload.
        upload_dir: folder, relative to the upload URI, to upload into;
            a single leading `/` is dropped.
        filename: name of the file to upload, relative to the workdir.
        move: adds `--move` when truthy.
        delete: adds `--delete` when truthy (and makes the workdir writable).
        compress: adds `--compress` when truthy.
        resume: adds `--resume` when truthy.
        watch: adds `--watch <value>` when truthy.
    """
    task_id = task["_id"]
    container_name = upload_container_name(task_id, filename, False)

    # remove container should it exists (should not)
    # NOTE(review): pruning shares the try block with the removal, so it is
    # skipped whenever remove_container() raises NotFound — confirm intended
    try:
        remove_container(docker_client, container_name)
        prune_containers(docker_client, {"label": [f"filename={filename}"]})
    except docker.errors.NotFound:
        pass

    docker_image = get_or_pull_image(docker_client, UPLOADER_IMAGE)

    # in container paths
    workdir = pathlib.Path("/data")
    filepath = workdir.joinpath(filename)

    host_mounts = query_host_mounts(docker_client)
    host_private_key = str(host_mounts[PRIVATE_KEY])
    mounts = [
        # uploader only needs write access to the workdir to delete the file
        Mount(str(workdir), str(host_workdir), type="bind", read_only=not delete),
        Mount(str(PRIVATE_KEY), host_private_key, type="bind", read_only=True),
    ]

    # append the upload_dir and filename to upload_uri (index 2 is the path)
    upload_settings = task["upload"][kind]
    parts = list(urllib.parse.urlparse(upload_settings["upload_uri"]))
    # make sure we have a valid upload path
    if not parts[2].endswith("/"):
        parts[2] += "/"
    # ensure upload_dir is not absolute; `count` is passed by keyword as
    # passing it positionally is deprecated in recent Python versions
    relative_dir = re.sub(r"^/", "", upload_dir, count=1)
    parts[2] += os.path.join(relative_dir, filepath.name)
    upload_uri = urllib.parse.urlunparse(parts)

    command = [
        "uploader",
        "--file",
        str(filepath),
        "--upload-uri",
        upload_uri,
        "--username",
        username,
    ]
    if compress:
        command.append("--compress")
    if resume:
        command.append("--resume")
    if move:
        command.append("--move")
    if delete:
        command.append("--delete")
    if watch:
        command += ["--watch", str(watch)]
    if task["upload"][kind]["expiration"]:
        command += ["--delete-after", str(task["upload"][kind]["expiration"])]

    return run_container(
        docker_client,
        image=docker_image,
        command=command,
        detach=True,
        # in-container location of the key mounted above
        environment={"RSA_KEY": str(PRIVATE_KEY)},
        labels={
            "zimfarm": "",
            "task_id": task_id,
            "tid": short_id(task_id),
            "schedule_name": task["schedule_name"],
            "filename": filename,
        },
        mem_swappiness=0,
        mounts=mounts,
        name=container_name,
        remove=False,
    )
Exemple #8
0
def start_task_worker(docker_client, task, webapi_uri, username, workdir, worker_name):
    """Start the task-worker container of a task and return run_container()'s result.

    A leftover container bearing the same name is removed first. The image
    named by TASK_WORKER_IMAGE is pulled when it carries a `:tag` and looked
    up locally otherwise.

    Args:
        docker_client: client handed to every docker helper called here.
        task: task document; `_id`, `schedule_name` and `config.resources`
            (`cpu`, `memory`, `disk`) are read from it.
        webapi_uri: exported to the container as `WEB_API_URI`.
        username: exported to the container as `USERNAME`.
        workdir: path mounted read-write in the container, exported as `WORKDIR`.
        worker_name: exported to the container as `WORKER_NAME`.
    """
    container_name = get_container_name(CONTAINER_TASK_IDENT, task["_id"])

    # remove container should it exists (should not)
    try:
        remove_container(docker_client, container_name)
    except docker.errors.NotFound:
        pass

    logger.debug(f"getting image {TASK_WORKER_IMAGE}")
    # a tagged task worker image is pulled on every start so that code updates
    # are picked up
    if ":" not in TASK_WORKER_IMAGE:
        # consider missing :tag info as a local image for tests
        docker_image = get_image(docker_client, TASK_WORKER_IMAGE)
    else:
        docker_image = pull_image(docker_client, TASK_WORKER_IMAGE)

    # mounts will be attached to host's fs, not this one
    # NOTE(review): a path missing from host_mounts presumably ends up as the
    # string "None" here (str() around .get()) — confirm that cannot happen
    host_mounts = query_host_mounts(docker_client, workdir)
    host_task_workdir = str(host_mounts.get(workdir))
    host_docker_socket = str(host_mounts.get(DOCKER_SOCKET))
    host_private_key = str(host_mounts.get(PRIVATE_KEY))
    mounts = [
        Mount(str(workdir), host_task_workdir, type="bind"),
        Mount(str(DOCKER_SOCKET), host_docker_socket, type="bind", read_only=True),
        Mount(str(PRIVATE_KEY), host_private_key, type="bind", read_only=True),
    ]
    command = ["task-worker", "--task-id", task["_id"]]

    resources = task["config"]["resources"]

    logger.debug(f"running {command}")
    return run_container(
        docker_client,
        image=docker_image,
        command=command,
        detach=True,
        environment={
            "USERNAME": username,
            "WORKDIR": str(workdir),
            "WEB_API_URI": webapi_uri,
            "WORKER_NAME": worker_name,
            # forwarded from this process' own environment
            "ZIMFARM_DISK": os.getenv("ZIMFARM_DISK"),
            "ZIMFARM_CPUS": os.getenv("ZIMFARM_CPUS"),
            "ZIMFARM_MEMORY": os.getenv("ZIMFARM_MEMORY"),
            "DEBUG": os.getenv("DEBUG"),
            "USE_PUBLIC_DNS": "1" if USE_PUBLIC_DNS else "",
            "UPLOADER_IMAGE": UPLOADER_IMAGE,
            "CHECKER_IMAGE": CHECKER_IMAGE,
            "DNSCACHE_IMAGE": DNSCACHE_IMAGE,
            # stringified like every other path handed to docker in here
            "DOCKER_SOCKET": str(DOCKER_SOCKET),
        },
        labels={
            "zimfarm": "",
            "zimtask": "yes",
            "task_id": task["_id"],
            "tid": short_id(task["_id"]),
            "schedule_name": task["schedule_name"],
            # disk usage is accounted for on this container
            RESOURCES_DISK_LABEL: str(resources["disk"]),
            # display-only human-readable values
            "human.cpu": str(resources["cpu"]),
            "human.memory": format_size(resources["memory"]),
            "human.disk": format_size(resources["disk"]),
        },
        mem_swappiness=0,
        mounts=mounts,
        name=container_name,
        remove=False,  # zimtask containers are pruned periodically
    )
Exemple #9
0
def start_uploader(
    docker_client,
    task,
    username,
    host_workdir,
    upload_dir,
    filename,
    move,
    delete,
    compress,
    resume,
    watch,
):
    """Start an uploader container for `filename` and return run_container()'s result.

    `host_workdir` is bind-mounted at `/data` (writable only when `delete` is
    set) along with the private key (read-only). The file is uploaded to
    `<UPLOAD_URI>/<upload_dir>/<file name>`.

    Args:
        docker_client: client handed to the docker helpers called here.
        task: task document; `_id` and `schedule_name` are read from it.
        username: passed to the uploader as `--username`.
        host_workdir: host-side folder holding the file to upload.
        upload_dir: folder, relative to UPLOAD_URI, to upload into.
        filename: name of the file to upload, relative to the workdir.
        move: adds `--move` when truthy.
        delete: adds `--delete` when truthy (and makes the workdir writable).
        compress: adds `--compress` when truthy.
        resume: adds `--resume` when truthy.
        watch: adds `--watch <value>` when truthy.
    """
    task_id = task["_id"]
    container_name = upload_container_name(task_id, filename, False)

    # no such container is expected at this stage; drop it if there is one
    # NOTE(review): pruning shares the try block with the removal, so it is
    # skipped whenever remove_container() raises NotFound — confirm intended
    try:
        remove_container(docker_client, container_name)
        prune_containers(docker_client, {"label": [f"filename={filename}"]})
    except docker.errors.NotFound:
        pass

    docker_image = pull_image(docker_client, "openzim/uploader", tag="latest")

    # paths as seen from inside the container
    data_dir = pathlib.Path("/data")
    target = data_dir.joinpath(filename)

    host_mounts = query_host_mounts(docker_client)
    key_source = str(host_mounts[PRIVATE_KEY])
    mounts = [
        # uploader only needs write access to the workdir to delete the file
        Mount(str(data_dir), str(host_workdir), type="bind", read_only=not delete),
        Mount(str(PRIVATE_KEY), key_source, type="bind", read_only=True),
    ]

    command = [
        "uploader",
        "--file",
        str(target),
        "--upload-uri",
        f"{UPLOAD_URI}/{upload_dir}/{target.name}",
        "--username",
        username,
    ]
    # value-less switches, appended in this fixed order
    switches = (
        (compress, "--compress"),
        (resume, "--resume"),
        (move, "--move"),
        (delete, "--delete"),
    )
    for enabled, switch in switches:
        if enabled:
            command.append(switch)
    if watch:
        command += ["--watch", str(watch)]

    labels = {
        "zimfarm": "",
        "task_id": task_id,
        "tid": short_id(task_id),
        "schedule_name": task["schedule_name"],
        "filename": filename,
    }
    return run_container(
        docker_client,
        image=docker_image,
        command=command,
        detach=True,
        # in-container location of the key mounted above
        environment={"RSA_KEY": str(PRIVATE_KEY)},
        labels=labels,
        mem_swappiness=0,
        mounts=mounts,
        name=container_name,
        remove=False,
    )