def start_scraper(docker_client, task, dns, host_workdir):
    """Start the scraper container for ``task``.

    A container already using the computed name is removed beforehand and the
    configured image is pulled on every call.

    Args:
        docker_client: passed through to every docker helper used here.
        task: mapping providing ``_id``, ``schedule_name`` and ``config``.
        dns: forwarded unchanged as the ``dns`` argument of ``run_container``.
        host_workdir: source of the bind mount attached at
            ``config["mount_point"]``.

    Returns:
        Whatever ``run_container`` returns.
    """
    task_id = task["_id"]
    config = task["config"]
    container_name = get_container_name(
        f"{CONTAINER_SCRAPER_IDENT}_{config['task_name']}", task_id
    )

    # a container with this name is not expected to exist; drop it if it does
    try:
        remove_container(docker_client, container_name)
    except docker.errors.NotFound:
        pass

    # the scraper image is pulled on every start
    image_ref = f'{config["image"]["name"]}:{config["image"]["tag"]}'
    logger.debug(f"Pulling image {image_ref}")
    scraper_image = pull_image(docker_client, image_ref)

    # target is the path inside the scraper; source lives on the host's fs
    mounts = [Mount(str(config["mount_point"]), str(host_workdir), type="bind")]

    command = config["str_command"]
    resources = config["resources"]
    cpu_shares = resources["cpu"] * DEFAULT_CPU_SHARE
    mem_limit = resources["memory"]
    disk_limit = resources["disk"]
    shm_size = resources.get("shm")
    cap_add = resources.get("cap_add", [])
    cap_drop = resources.get("cap_drop", [])

    labels = {
        "zimfarm": "",
        "zimscraper": "yes",
        "task_id": task_id,
        "tid": short_id(task_id),
        "schedule_name": task["schedule_name"],
        "human.cpu": str(resources["cpu"]),
        "human.memory": format_size(mem_limit),
        "human.disk": format_size(disk_limit),
    }

    # disk is already reserved on zimtask, hence no disk constraint here
    return run_container(
        docker_client,
        image=scraper_image,
        command=command,
        cpu_shares=cpu_shares,
        mem_limit=mem_limit,
        dns=dns,
        detach=True,
        labels=labels,
        mem_swappiness=0,
        shm_size=shm_size,
        cap_add=cap_add,
        cap_drop=cap_drop,
        mounts=mounts,
        name=container_name,
        remove=False,  # scraper container will be removed once log&zim handled
    )
def start_task_worker(docker_client, task, webapi_uri, username, workdir, worker_name):
    """Start the task-worker container for ``task``.

    A container already using the computed name is removed beforehand. The
    image named by ``TASK_WORKER_IMAGE`` is looked up locally when its tag is
    ``local`` or when it carries no tag at all; otherwise it is pulled.

    Args:
        docker_client: passed through to every docker helper used here.
        task: mapping providing ``_id`` and ``schedule_name``.
        webapi_uri: exported to the container as ``WEB_API_URI``.
        username: exported to the container as ``USERNAME``.
        workdir: in-container working directory; also the key used to find
            the matching host path in ``query_host_mounts``'s result.
        worker_name: exported to the container as ``WORKER_NAME``.

    Returns:
        Whatever ``run_container`` returns.
    """
    task_id = task["_id"]
    container_name = task_container_name(task_id)

    # a container with this name is not expected to exist; drop it if it does
    try:
        remove_container(docker_client, container_name)
    except docker.errors.NotFound:
        pass

    # An untagged name used to break the two-value unpacking of
    # ``rsplit(":", 1)`` with a ValueError; it is now looked up locally,
    # exactly like an explicit ``:local`` tag.
    image, _, tag = TASK_WORKER_IMAGE.rpartition(":")
    if not image or tag == "local":
        docker_image = get_image(docker_client, TASK_WORKER_IMAGE)
    else:
        logger.debug(f"pulling image {image}:{tag}")
        docker_image = pull_image(docker_client, image, tag=tag)

    # mounts will be attached to host's fs, not this one
    # NOTE(review): a path missing from host_mounts becomes the string "None"
    # here — confirm query_host_mounts always returns all three entries.
    host_mounts = query_host_mounts(docker_client, workdir)
    host_task_workdir = str(host_mounts.get(workdir))
    host_docker_socket = str(host_mounts.get(DOCKER_SOCKET))
    host_private_key = str(host_mounts.get(PRIVATE_KEY))
    mounts = [
        Mount(str(workdir), host_task_workdir, type="bind"),
        Mount(str(DOCKER_SOCKET), host_docker_socket, type="bind", read_only=True),
        Mount(str(PRIVATE_KEY), host_private_key, type="bind", read_only=True),
    ]

    command = ["task-worker", "--task-id", task_id]
    logger.debug(f"running {command}")

    return run_container(
        docker_client,
        image=docker_image,
        command=command,
        detach=True,
        environment={
            "USERNAME": username,
            "WORKDIR": str(workdir),
            "WEB_API_URI": webapi_uri,
            "UPLOAD_URI": UPLOAD_URI,
            "WORKER_NAME": worker_name,
            "ZIMFARM_DISK": os.getenv("ZIMFARM_DISK"),
            "ZIMFARM_CPUS": os.getenv("ZIMFARM_CPUS"),
            "ZIMFARM_MEMORY": os.getenv("ZIMFARM_MEMORY"),
            "DEBUG": os.getenv("DEBUG"),
            "USE_PUBLIC_DNS": "1" if USE_PUBLIC_DNS else "",
        },
        labels={
            "zimfarm": "",
            "zimtask": "yes",
            "task_id": task_id,
            "tid": short_id(task_id),
            "schedule_name": task["schedule_name"],
        },
        mem_swappiness=0,
        mounts=mounts,
        name=container_name,
        remove=False,  # zimtask containers are pruned periodically
    )
def start_scraper(docker_client, task, dns, host_workdir):
    """Start the scraper container for ``task``.

    A container already using the computed name is removed beforehand, then
    the image named in ``task["config"]["image"]`` is pulled.

    Args:
        docker_client: passed through to every docker helper used here.
        task: mapping providing ``_id``, ``schedule_name`` and ``config``.
        dns: forwarded unchanged as the ``dns`` argument of ``run_container``.
        host_workdir: source of the bind mount attached at
            ``config["mount_point"]``.

    Returns:
        Whatever ``run_container`` returns.
    """
    task_id = task["_id"]
    config = task["config"]
    container_name = scraper_container_name(task_id, config["task_name"])

    # a container with this name is not expected to exist; drop it if it does
    try:
        remove_container(docker_client, container_name)
    except docker.errors.NotFound:
        pass

    logger.debug(f'pulling image {config["image"]["name"]}:{config["image"]["tag"]}')
    scraper_image = pull_image(
        docker_client, config["image"]["name"], tag=config["image"]["tag"]
    )

    # target is the path inside the scraper; source lives on the host's fs
    mounts = [Mount(str(config["mount_point"]), str(host_workdir), type="bind")]

    command = config["str_command"]
    resources = config["resources"]
    cpu_shares = resources["cpu"] * DEFAULT_CPU_SHARE
    mem_limit = resources["memory"]
    disk_limit = resources["disk"]

    labels = {
        "zimfarm": "",
        "zimscraper": "yes",
        "task_id": task_id,
        "tid": short_id(task_id),
        "schedule_name": task["schedule_name"],
        RESOURCES_DISK_LABEL: str(disk_limit),
        "human.cpu": str(resources["cpu"]),
        "human.memory": format_size(mem_limit),
        "human.disk": format_size(disk_limit),
    }

    return run_container(
        docker_client,
        image=scraper_image,
        command=command,
        cpu_shares=cpu_shares,
        mem_limit=mem_limit,
        dns=dns,
        detach=True,
        labels=labels,
        mem_swappiness=0,
        mounts=mounts,
        name=container_name,
        remove=False,  # scraper container will be removed once log&zim handled
    )
def start_dnscache(docker_client, task):
    """Start the dnscache container for ``task``.

    Args:
        docker_client: passed through to the docker helpers used here.
        task: mapping providing ``_id`` and ``schedule_name``.

    Returns:
        Whatever ``run_container`` returns.
    """
    task_id = task["_id"]
    container_name = get_container_name("dnscache", task_id)

    # the container is told through its environment whether to use public DNS
    if USE_PUBLIC_DNS:
        environment = {"USE_PUBLIC_DNS": "yes"}
    else:
        environment = {"USE_PUBLIC_DNS": "no"}

    dnscache_image = get_or_pull_image(docker_client, DNSCACHE_IMAGE)

    labels = {
        "zimfarm": "",
        "task_id": task_id,
        "tid": short_id(task_id),
        "schedule_name": task["schedule_name"],
    }
    return run_container(
        docker_client,
        image=dnscache_image,
        detach=True,
        name=container_name,
        environment=environment,
        remove=False,
        labels=labels,
    )
def start_dnscache(docker_client, task):
    """Start the dnscache container for ``task`` from ``openzim/dnscache:latest``.

    The image is pulled on every call and the container is started with
    ``remove=True``.

    Args:
        docker_client: passed through to the docker helpers used here.
        task: mapping providing ``_id`` and ``schedule_name``.

    Returns:
        Whatever ``run_container`` returns.
    """
    task_id = task["_id"]
    container_name = dnscache_container_name(task_id)

    # the container is told through its environment whether to use public DNS
    if USE_PUBLIC_DNS:
        environment = {"USE_PUBLIC_DNS": "yes"}
    else:
        environment = {"USE_PUBLIC_DNS": "no"}

    dnscache_image = pull_image(docker_client, "openzim/dnscache", tag="latest")

    labels = {
        "zimfarm": "",
        "task_id": task_id,
        "tid": short_id(task_id),
        "schedule_name": task["schedule_name"],
    }
    return run_container(
        docker_client,
        image=dnscache_image,
        detach=True,
        name=container_name,
        environment=environment,
        remove=True,
        labels=labels,
    )
def start_checker(docker_client, task, host_workdir, filename):
    """Start a container running ``zimcheck`` on ``filename``.

    ``host_workdir`` is bind-mounted read-only at ``/data`` and the file is
    checked at ``/data/<filename>``.

    Args:
        docker_client: passed through to the docker helpers used here.
        task: mapping providing ``_id``, ``schedule_name`` and
            ``upload.zim.zimcheck``.
        host_workdir: source of the read-only bind mount.
        filename: name of the file to check, relative to the mount.

    Returns:
        Whatever ``run_container`` returns.
    """
    task_id = task["_id"]
    container_name = get_container_name("checker", task_id)
    checker_image = get_or_pull_image(docker_client, CHECKER_IMAGE)

    # a container with this name is not expected to exist; drop it if it does
    # NOTE(review): prune_containers shares this try block, so it is skipped
    # whenever remove_container raises NotFound — confirm this is intended.
    try:
        remove_container(docker_client, container_name)
        prune_containers(docker_client, {"label": [f"filename={filename}"]})
    except docker.errors.NotFound:
        pass

    # paths as seen from inside the container
    container_workdir = pathlib.Path("/data")
    container_filepath = container_workdir.joinpath(filename)
    mounts = [
        Mount(str(container_workdir), str(host_workdir), type="bind", read_only=True)
    ]

    # fall back to "-A" when the task carries no zimcheck option
    zimcheck_option = task["upload"]["zim"]["zimcheck"] or "-A"
    command = ["zimcheck", zimcheck_option, str(container_filepath)]

    labels = {
        "zimfarm": "",
        "task_id": task_id,
        "tid": short_id(task_id),
        "schedule_name": task["schedule_name"],
        "filename": filename,
    }
    return run_container(
        docker_client,
        image=checker_image,
        command=command,
        detach=True,
        name=container_name,
        mounts=mounts,
        labels=labels,
        remove=False,
    )
def start_uploader(
    docker_client,
    task,
    kind,
    username,
    host_workdir,
    upload_dir,
    filename,
    move,
    delete,
    compress,
    resume,
    watch=False,
):
    """Start an uploader container sending ``filename`` to the task's upload URI.

    The destination is ``task["upload"][kind]["upload_uri"]`` with
    ``upload_dir`` and the file name appended to its path.

    Args:
        docker_client: passed through to the docker helpers used here.
        task: mapping providing ``_id``, ``schedule_name`` and ``upload``.
        kind: key into ``task["upload"]`` selecting ``upload_uri`` and
            ``expiration``.
        username: passed to the uploader as ``--username``.
        host_workdir: source of the bind mount attached at ``/data``.
        upload_dir: directory appended to the upload URI's path; one leading
            ``/`` is stripped.
        filename: file to upload, relative to the mount.
        move: adds ``--move`` when truthy.
        delete: adds ``--delete`` when truthy and makes the workdir mount
            writable.
        compress: adds ``--compress`` when truthy.
        resume: adds ``--resume`` when truthy.
        watch: when truthy, passed as the value of ``--watch``.

    Returns:
        Whatever ``run_container`` returns.
    """
    container_name = upload_container_name(task["_id"], filename, False)

    # a container with this name is not expected to exist; drop it if it does
    # NOTE(review): prune_containers shares this try block, so it is skipped
    # whenever remove_container raises NotFound — confirm this is intended.
    try:
        remove_container(docker_client, container_name)
        prune_containers(docker_client, {"label": [f"filename={filename}"]})
    except docker.errors.NotFound:
        pass

    docker_image = get_or_pull_image(docker_client, UPLOADER_IMAGE)

    # paths as seen from inside the container
    workdir = pathlib.Path("/data")
    filepath = workdir.joinpath(filename)

    host_mounts = query_host_mounts(docker_client)
    host_private_key = str(host_mounts[PRIVATE_KEY])
    mounts = [
        # writable only when the uploader is asked to delete the file
        Mount(str(workdir), str(host_workdir), type="bind", read_only=not delete),
        Mount(str(PRIVATE_KEY), host_private_key, type="bind", read_only=True),
    ]

    # append the upload_dir and filename to upload_uri
    upload_uri = urllib.parse.urlparse(task["upload"][kind]["upload_uri"])
    parts = list(upload_uri)
    # make sure the path component ends with a slash before extending it
    parts[2] += "/" if not parts[2].endswith("/") else ""
    # ensure upload_dir is not absolute; ``count`` is passed by keyword
    # because passing it positionally is deprecated since Python 3.13
    parts[2] += os.path.join(re.sub(r"^/", "", upload_dir, count=1), filepath.name)
    upload_uri = urllib.parse.urlunparse(parts)

    command = [
        "uploader",
        "--file",
        str(filepath),
        "--upload-uri",
        upload_uri,
        "--username",
        username,
    ]
    if compress:
        command.append("--compress")
    if resume:
        command.append("--resume")
    if move:
        command.append("--move")
    if delete:
        command.append("--delete")
    if watch:
        command += ["--watch", str(watch)]
    if task["upload"][kind]["expiration"]:
        command += ["--delete-after", str(task["upload"][kind]["expiration"])]

    return run_container(
        docker_client,
        image=docker_image,
        command=command,
        detach=True,
        environment={"RSA_KEY": str(PRIVATE_KEY)},
        labels={
            "zimfarm": "",
            "task_id": task["_id"],
            "tid": short_id(task["_id"]),
            "schedule_name": task["schedule_name"],
            "filename": filename,
        },
        mem_swappiness=0,
        mounts=mounts,
        name=container_name,
        remove=False,
    )
def start_task_worker(docker_client, task, webapi_uri, username, workdir, worker_name):
    """Start the task-worker container for ``task``.

    A container already using the computed name is removed beforehand. The
    image named by ``TASK_WORKER_IMAGE`` is pulled when the name contains a
    ``:`` and looked up locally otherwise.

    Args:
        docker_client: passed through to every docker helper used here.
        task: mapping providing ``_id``, ``schedule_name`` and
            ``config.resources`` (``cpu``, ``memory``, ``disk``).
        webapi_uri: exported to the container as ``WEB_API_URI``.
        username: exported to the container as ``USERNAME``.
        workdir: in-container working directory; also the key used to find
            the matching host path in ``query_host_mounts``'s result.
        worker_name: exported to the container as ``WORKER_NAME``.

    Returns:
        Whatever ``run_container`` returns.
    """
    task_id = task["_id"]
    container_name = get_container_name(CONTAINER_TASK_IDENT, task_id)

    # a container with this name is not expected to exist; drop it if it does
    try:
        remove_container(docker_client, container_name)
    except docker.errors.NotFound:
        pass

    logger.debug(f"getting image {TASK_WORKER_IMAGE}")
    if ":" in TASK_WORKER_IMAGE:
        # task worker is always pulled to ensure we can update our code
        worker_image = pull_image(docker_client, TASK_WORKER_IMAGE)
    else:
        # consider missing :tag info as a local image for tests
        worker_image = get_image(docker_client, TASK_WORKER_IMAGE)

    # mounts will be attached to host's fs, not this one
    host_mounts = query_host_mounts(docker_client, workdir)
    mounts = [Mount(str(workdir), str(host_mounts.get(workdir)), type="bind")]
    mounts += [
        Mount(str(path), str(host_mounts.get(path)), type="bind", read_only=True)
        for path in (DOCKER_SOCKET, PRIVATE_KEY)
    ]

    command = ["task-worker", "--task-id", task_id]
    logger.debug(f"running {command}")

    environment = {
        "USERNAME": username,
        "WORKDIR": str(workdir),
        "WEB_API_URI": webapi_uri,
        "WORKER_NAME": worker_name,
        "ZIMFARM_DISK": os.getenv("ZIMFARM_DISK"),
        "ZIMFARM_CPUS": os.getenv("ZIMFARM_CPUS"),
        "ZIMFARM_MEMORY": os.getenv("ZIMFARM_MEMORY"),
        "DEBUG": os.getenv("DEBUG"),
        "USE_PUBLIC_DNS": "1" if USE_PUBLIC_DNS else "",
        "UPLOADER_IMAGE": UPLOADER_IMAGE,
        "CHECKER_IMAGE": CHECKER_IMAGE,
        "DNSCACHE_IMAGE": DNSCACHE_IMAGE,
        "DOCKER_SOCKET": DOCKER_SOCKET,
    }

    resources = task["config"]["resources"]
    labels = {
        "zimfarm": "",
        "zimtask": "yes",
        "task_id": task_id,
        "tid": short_id(task_id),
        "schedule_name": task["schedule_name"],
        # disk usage is accounted for on this container
        RESOURCES_DISK_LABEL: str(resources["disk"]),
        # display-only human-readable values
        "human.cpu": str(resources["cpu"]),
        "human.memory": format_size(resources["memory"]),
        "human.disk": format_size(resources["disk"]),
    }

    return run_container(
        docker_client,
        image=worker_image,
        command=command,
        detach=True,
        environment=environment,
        labels=labels,
        mem_swappiness=0,
        mounts=mounts,
        name=container_name,
        remove=False,  # zimtask containers are pruned periodically
    )
def start_uploader(
    docker_client,
    task,
    username,
    host_workdir,
    upload_dir,
    filename,
    move,
    delete,
    compress,
    resume,
    watch,
):
    """Start an uploader container sending ``filename`` below ``UPLOAD_URI``.

    The destination is ``{UPLOAD_URI}/{upload_dir}/{name}`` where ``name`` is
    the final path component of ``filename``. The ``openzim/uploader:latest``
    image is pulled on every call.

    Args:
        docker_client: passed through to the docker helpers used here.
        task: mapping providing ``_id`` and ``schedule_name``.
        username: passed to the uploader as ``--username``.
        host_workdir: source of the bind mount attached at ``/data``.
        upload_dir: directory segment inserted into the upload URI.
        filename: file to upload, relative to the mount.
        move: adds ``--move`` when truthy.
        delete: adds ``--delete`` when truthy and makes the workdir mount
            writable.
        compress: adds ``--compress`` when truthy.
        resume: adds ``--resume`` when truthy.
        watch: when truthy, passed as the value of ``--watch``.

    Returns:
        Whatever ``run_container`` returns.
    """
    task_id = task["_id"]
    container_name = upload_container_name(task_id, filename, False)

    # a container with this name is not expected to exist; drop it if it does
    # NOTE(review): prune_containers shares this try block, so it is skipped
    # whenever remove_container raises NotFound — confirm this is intended.
    try:
        remove_container(docker_client, container_name)
        prune_containers(docker_client, {"label": [f"filename={filename}"]})
    except docker.errors.NotFound:
        pass

    uploader_image = pull_image(docker_client, "openzim/uploader", tag="latest")

    # paths as seen from inside the container
    container_workdir = pathlib.Path("/data")
    filepath = container_workdir.joinpath(filename)

    host_private_key = str(query_host_mounts(docker_client)[PRIVATE_KEY])
    mounts = [
        # writable only when the uploader is asked to delete the file
        Mount(
            str(container_workdir),
            str(host_workdir),
            type="bind",
            read_only=not delete,
        ),
        Mount(str(PRIVATE_KEY), host_private_key, type="bind", read_only=True),
    ]

    command = [
        "uploader",
        "--file",
        str(filepath),
        "--upload-uri",
        f"{UPLOAD_URI}/{upload_dir}/{filepath.name}",
        "--username",
        username,
    ]
    # boolean switches, appended in a fixed order
    switches = (
        (compress, "--compress"),
        (resume, "--resume"),
        (move, "--move"),
        (delete, "--delete"),
    )
    for enabled, flag in switches:
        if enabled:
            command.append(flag)
    if watch:
        command += ["--watch", str(watch)]

    labels = {
        "zimfarm": "",
        "task_id": task_id,
        "tid": short_id(task_id),
        "schedule_name": task["schedule_name"],
        "filename": filename,
    }
    return run_container(
        docker_client,
        image=uploader_image,
        command=command,
        detach=True,
        environment={"RSA_KEY": str(PRIVATE_KEY)},
        labels=labels,
        mem_swappiness=0,
        mounts=mounts,
        name=container_name,
        remove=False,
    )