Code example #1
def start_scraper(docker_client, task, dns, host_workdir):
    config = task["config"]
    offliner = config["task_name"]
    container_name = get_container_name(
        f"{CONTAINER_SCRAPER_IDENT}_{offliner}", task["_id"]
    )

    # remove the container should it exist (it should not)
    try:
        remove_container(docker_client, container_name)
    except docker.errors.NotFound:
        pass

    # the scraper image is always pulled before starting
    tag = f'{config["image"]["name"]}:{config["image"]["tag"]}'
    logger.debug(f"Pulling image {tag}")
    docker_image = pull_image(docker_client, tag)

    # where to mount volume inside scraper
    mount_point = config["mount_point"]

    # mounts will be attached to host's fs, not this one
    mounts = [Mount(str(mount_point), str(host_workdir), type="bind")]

    command = config["str_command"]
    cpu_shares = config["resources"]["cpu"] * DEFAULT_CPU_SHARE
    mem_limit = config["resources"]["memory"]
    disk_limit = config["resources"]["disk"]
    shm_size = config["resources"].get("shm")
    cap_add = config["resources"].get("cap_add", [])
    cap_drop = config["resources"].get("cap_drop", [])

    return run_container(
        docker_client,
        image=docker_image,
        command=command,
        # disk is already reserved on zimtask
        cpu_shares=cpu_shares,
        mem_limit=mem_limit,
        dns=dns,
        detach=True,
        labels={
            "zimfarm": "",
            "zimscraper": "yes",
            "task_id": task["_id"],
            "tid": short_id(task["_id"]),
            "schedule_name": task["schedule_name"],
            "human.cpu": str(config["resources"]["cpu"]),
            "human.memory": format_size(mem_limit),
            "human.disk": format_size(disk_limit),
        },
        mem_swappiness=0,
        shm_size=shm_size,
        cap_add=cap_add,
        cap_drop=cap_drop,
        mounts=mounts,
        name=container_name,
        remove=False,  # scraper container is removed once logs & ZIM files are handled
    )
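Note: every snippet on this page renders byte counts with format_size, whose import is not shown here. To run a snippet in isolation, a minimal stand-in could look like the sketch below; it is hypothetical, uses 1024-based units, and the project's real helper (e.g. the humanfriendly library's) may format differently.

def format_size(num_bytes):
    # Hypothetical stand-in for the format_size helper used above.
    # Renders a byte count with 1024-based units.
    size = float(num_bytes)
    for unit in ("B", "KiB", "MiB", "GiB", "TiB"):
        if size < 1024 or unit == "TiB":
            return f"{size:.1f} {unit}"
        size /= 1024

For example, format_size(2 ** 31) returns "2.0 GiB".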
Code example #2
File: docker.py  Project: deborahlow97/zimfarm
def start_scraper(docker_client, task, dns, host_workdir):
    config = task["config"]
    offliner = config["task_name"]
    container_name = scraper_container_name(task["_id"], offliner)

    # remove the container should it exist (it should not)
    try:
        remove_container(docker_client, container_name)
    except docker.errors.NotFound:
        pass

    logger.debug(
        f'pulling image {config["image"]["name"]}:{config["image"]["tag"]}')
    docker_image = pull_image(docker_client,
                              config["image"]["name"],
                              tag=config["image"]["tag"])

    # where to mount volume inside scraper
    mount_point = config["mount_point"]

    # mounts will be attached to host's fs, not this one
    mounts = [Mount(str(mount_point), str(host_workdir), type="bind")]

    command = config["str_command"]
    cpu_shares = config["resources"]["cpu"] * DEFAULT_CPU_SHARE
    mem_limit = config["resources"]["memory"]
    disk_limit = config["resources"]["disk"]

    return run_container(
        docker_client,
        image=docker_image,
        command=command,
        cpu_shares=cpu_shares,
        mem_limit=mem_limit,
        dns=dns,
        detach=True,
        labels={
            "zimfarm": "",
            "zimscraper": "yes",
            "task_id": task["_id"],
            "tid": short_id(task["_id"]),
            "schedule_name": task["schedule_name"],
            RESOURCES_DISK_LABEL: str(disk_limit),
            "human.cpu": str(config["resources"]["cpu"]),
            "human.memory": format_size(mem_limit),
            "human.disk": format_size(disk_limit),
        },
        mem_swappiness=0,
        mounts=mounts,
        name=container_name,
        remove=False,  # scraper container is removed once logs & ZIM files are handled
    )
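Note: both start_scraper variants above read the same keys off the task dict. Inferred from those accesses, the expected shape is roughly the following; the field names come from the code, the values are invented placeholders.

task = {
    "_id": "task-id-0000",                # placeholder identifier
    "schedule_name": "example_schedule",  # placeholder schedule name
    "config": {
        "task_name": "mwoffliner",        # offliner name (illustrative)
        "image": {"name": "openzim/mwoffliner", "tag": "latest"},
        "mount_point": "/output",         # mount path inside the scraper
        "str_command": "mwoffliner --help",  # illustrative command string
        "resources": {"cpu": 3, "memory": 2 ** 31, "disk": 2 ** 33},
        # example #1 additionally reads optional "shm", "cap_add" and
        # "cap_drop" keys under "resources"
    },
}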
Code example #3
    def __init__(self, **kwargs):

        # print config
        self.print_config(**kwargs)

        # check workdir
        self.check_workdir()

        # check SSH private key
        self.check_private_key()

        # ensure we have valid credentials
        self.check_auth()

        # ensure we have access to docker API
        self.check_docker()

        cont_stats = query_container_stats(self.workdir)
        logger.info(
            "Container resources:"
            "\n\tRAM  (total): {mem_total}"
            "\n\tRAM  (avail): {mem_avail}"
            "\n\tCPUs: {cpu_total}"
            "\n\tDisk: {disk_avail}".format(
                mem_total=format_size(cont_stats["memory"]["total"]),
                mem_avail=format_size(cont_stats["memory"]["available"]),
                cpu_total=cont_stats["cpu"]["total"],
                disk_avail=format_size(cont_stats["disk"]["available"]),
            ))

        self.task = None
        self.should_stop = False
        self.task_workdir = None
        self.progress_file = None
        self.host_task_workdir = None  # path on host for task_dir

        self.dnscache = None  # dnscache container
        self.dns = None  # list of DNS IPs or None

        self.zim_files = {}  # ZIM files registry
        self.zim_retries = {}  # ZIM files with upload errors (registry)
        self.uploader = None  # zim-files uploader container
        self.checker = None  # zim-files checker container

        self.scraper = None  # scraper container
        self.log_uploader = None  # scraper log uploader container
        self.host_logsdir = None  # path on host where logs are stored
        self.scraper_succeeded = None  # whether scraper succeeded

        # register stop/^C
        self.register_signals()
Code example #4
File: worker.py  Project: shuntaroy12/zimfarm
    def __init__(self, **kwargs):
        # include our class config values in the config print
        kwargs.update({k: getattr(self, k) for k in self.config_keys})
        kwargs.update({"OFFLINERS": SUPPORTED_OFFLINERS})
        kwargs.update({"PLATFORMS_TASKS": PLATFORMS_TASKS})
        self.print_config(**kwargs)

        # set data holders
        self.tasks = {}
        self.last_poll = datetime.datetime(2020, 1, 1)
        self.should_stop = False

        # check workdir
        self.check_workdir()

        # check SSH private key
        self.check_private_key()

        # ensure we have valid credentials
        self.check_auth()

        # ensure we have access to docker API
        self.check_docker()

        # display resources
        host_stats = query_host_stats(self.docker, self.workdir)
        logger.info(
            "Host hardware resources:"
            "\n\tCPU : {cpu_total} (total) ;  {cpu_avail} (avail)"
            "\n\tRAM : {mem_total} (total) ;  {mem_avail} (avail)"
            "\n\tDisk: {disk_total} (configured) ; {disk_avail} (avail)".format(
                mem_total=format_size(host_stats["memory"]["total"]),
                mem_avail=format_size(host_stats["memory"]["available"]),
                cpu_total=host_stats["cpu"]["total"],
                cpu_avail=host_stats["cpu"]["available"],
                disk_avail=format_size(host_stats["disk"]["available"]),
                disk_total=format_size(host_stats["disk"]["total"]),
            ))

        if host_stats["disk"]["available"] < host_stats["disk"]["total"]:
            self.should_stop = True
            logger.critical("Configured disk space is not available. Exiting.")
            return

        self.check_in()

        # register stop/^C
        self.register_signals()

        self.sync_tasks_and_containers()
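Note: the __init__ above only touches a handful of keys on the stats dict. Inferred from those accesses, query_host_stats presumably returns something shaped like this (the values are illustrative, not from the project):

host_stats = {
    "cpu": {"total": 8, "available": 5},
    "memory": {"total": 16 * 2 ** 30, "available": 9 * 2 ** 30},
    "disk": {"total": 200 * 2 ** 30, "available": 120 * 2 ** 30},
}

query_container_stats in example #3 is read the same way, minus the available CPU count and the total disk.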
Code example #5
File: app.py  Project: timlardner/Arrow
    def cache_data_frame(self, df, key, force_eviction=False):
        object_key = self.get_cache_key(key)
        object_id = plasma.ObjectID(object_key)
        if self.client.contains(object_id):
            string = 'DataWriter: Object exists in cache'
            if force_eviction:
                print('{} - evicting'.format(string))
                self.client.release(object_id)
            else:
                raise Exception(string)

        record_batch = pa.RecordBatch.from_pandas(df)

        # Work out how large our data frame is
        mock_sink = pa.MockOutputStream()
        stream_writer = pa.RecordBatchStreamWriter(mock_sink, record_batch.schema)
        stream_writer.write_batch(record_batch)
        stream_writer.close()
        data_size = mock_sink.size()
        print('DataWriter: Data size is {}'.format(format_size(data_size)))

        # Actually write the data frame to the cache
        buf = self.client.create(object_id, data_size)
        stream = pa.FixedSizeBufferWriter(buf)
        stream_writer = pa.RecordBatchStreamWriter(stream, record_batch.schema)
        stream_writer.write_batch(record_batch)
        stream_writer.close()

        # Make item available to other processes
        self.client.seal(object_id)
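Note: to consume the cached object from another process, the reverse path deserializes the sealed buffer back into a DataFrame. Below is a sketch against the pyarrow.plasma API (available in pyarrow releases before 12.0); read_data_frame is a hypothetical helper, not part of this project.

import pyarrow as pa
import pyarrow.plasma as plasma  # removed from pyarrow 12.0 onwards

def read_data_frame(client, object_id):
    # Fetch the sealed, immutable buffer and rebuild the DataFrame
    # from the Arrow IPC stream that cache_data_frame wrote into it.
    [buf] = client.get_buffers([object_id])
    reader = pa.ipc.open_stream(pa.BufferReader(buf))
    return reader.read_pandas()

Usage: connect with client = plasma.connect("/tmp/plasma"), then call read_data_frame(client, object_id).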
Code example #6
File: worker.py  Project: satyamtg/zimfarm
    def mark_file_created(self, filename, filesize):
        human_fsize = format_size(filesize)
        logger.info(f"ZIM file created: {filename}, {human_fsize}")
        self.patch_task(
            {
                "event": "created_file",
                "payload": {"file": {"name": filename, "size": filesize}},
            }
        )
Code example #7
    def cleanup_workdir(self):
        logger.info(f"Removing task workdir {self.task_workdir}")
        zim_files = [
            (f.name, format_size(f.stat().st_size))
            for f in self.task_workdir.glob("*.zim")
        ]
        if zim_files:
            logger.warning(f"ZIM files exist; removing anyway: {zim_files}")
        try:
            shutil.rmtree(self.task_workdir)
        except Exception as exc:
            logger.error(f"Failed to remove workdir: {exc}")
Code example #8
def start_task_worker(docker_client, task, webapi_uri, username, workdir, worker_name):
    container_name = get_container_name(CONTAINER_TASK_IDENT, task["_id"])

    # remove the container should it exist (it should not)
    try:
        remove_container(docker_client, container_name)
    except docker.errors.NotFound:
        pass

    logger.debug(f"getting image {TASK_WORKER_IMAGE}")
    # task worker is always pulled to ensure we can update our code
    if ":" not in TASK_WORKER_IMAGE:
        # consider missing :tag info as a local image for tests
        docker_image = get_image(docker_client, TASK_WORKER_IMAGE)
    else:
        docker_image = pull_image(docker_client, TASK_WORKER_IMAGE)

    # mounts will be attached to host's fs, not this one
    host_mounts = query_host_mounts(docker_client, workdir)
    host_task_workdir = str(host_mounts.get(workdir))
    host_docker_socket = str(host_mounts.get(DOCKER_SOCKET))
    host_private_key = str(host_mounts.get(PRIVATE_KEY))
    mounts = [
        Mount(str(workdir), host_task_workdir, type="bind"),
        Mount(str(DOCKER_SOCKET), host_docker_socket, type="bind", read_only=True),
        Mount(str(PRIVATE_KEY), host_private_key, type="bind", read_only=True),
    ]
    command = ["task-worker", "--task-id", task["_id"]]

    logger.debug(f"running {command}")
    return run_container(
        docker_client,
        image=docker_image,
        command=command,
        detach=True,
        environment={
            "USERNAME": username,
            "WORKDIR": str(workdir),
            "WEB_API_URI": webapi_uri,
            "WORKER_NAME": worker_name,
            "ZIMFARM_DISK": os.getenv("ZIMFARM_DISK"),
            "ZIMFARM_CPUS": os.getenv("ZIMFARM_CPUS"),
            "ZIMFARM_MEMORY": os.getenv("ZIMFARM_MEMORY"),
            "DEBUG": os.getenv("DEBUG"),
            "USE_PUBLIC_DNS": "1" if USE_PUBLIC_DNS else "",
            "UPLOADER_IMAGE": UPLOADER_IMAGE,
            "CHECKER_IMAGE": CHECKER_IMAGE,
            "DNSCACHE_IMAGE": DNSCACHE_IMAGE,
            "DOCKER_SOCKET": DOCKER_SOCKET,
        },
        labels={
            "zimfarm": "",
            "zimtask": "yes",
            "task_id": task["_id"],
            "tid": short_id(task["_id"]),
            "schedule_name": task["schedule_name"],
            # disk usage is accounted for on this container
            RESOURCES_DISK_LABEL: str(task["config"]["resources"]["disk"]),
            # display-only human-readable values
            "human.cpu": str(task["config"]["resources"]["cpu"]),
            "human.memory": format_size(task["config"]["resources"]["memory"]),
            "human.disk": format_size(task["config"]["resources"]["disk"]),
        },
        mem_swappiness=0,
        mounts=mounts,
        name=container_name,
        remove=False,  # zimtask containers are pruned periodically
    )
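Note: inside the container, the task worker presumably reads this configuration back from the environment. A hypothetical sketch follows; the key names are the ones set above, the defaults are invented.

import os

username = os.getenv("USERNAME")
workdir = os.getenv("WORKDIR", "/data")             # invented default
web_api_uri = os.getenv("WEB_API_URI")
use_public_dns = bool(os.getenv("USE_PUBLIC_DNS"))  # set to "1" or "" above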