Example #1
def _create_mount(mount_name: str, path: str) -> Tuple[types.Mount, str]:
  """Creates a read-only bind mount of `path`'s parent directory and returns
  the mount plus the corresponding path inside the container."""
  path = os.path.abspath(path)
  source_path = os.path.dirname(path)
  target_path = os.path.join(_ROOT_MOUNT_DIRECTORY, mount_name)
  logging.info('Mounting %s -> %s', source_path, target_path)
  mount = types.Mount(target_path, source_path, type='bind', read_only=True)
  return mount, os.path.join(target_path, os.path.basename(path))
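A minimal usage sketch, assuming `_ROOT_MOUNT_DIRECTORY` is set to `/mnt` (the concrete paths here are illustrative only):

mount, container_path = _create_mount('data_dir', '/home/user/databases/params')
# The parent directory is bind-mounted read-only:
#   /home/user/databases  ->  /mnt/data_dir
# and the returned in-container path points at the original entry:
#   container_path == '/mnt/data_dir/params'
# Both values can then be passed to client.containers.run(mounts=[mount], ...)
# and to the command line of the containerized tool, as Example #8 does.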
Example #2
    def MakeDockerRunParams(self,
                            host_port: int,
                            remote_model_base_path: Optional[Text] = None,
                            host_model_base_path: Optional[Text] = None):
        """Make parameters for docker `client.containers.run`.

    Args:
      host_port: Available port in the host to bind with container port.
      remote_model_base_path: (Optional) Model base path in the remote
          destination (e.g. `gs://your_bucket/model_base_path`). Use this
          argument if your model is stored in a remote location.
      host_model_base_path: (Optional) Model base path on the host machine
          (i.e. a local path during execution). Setting this creates a bind
          mount from `host_model_base_path` to the container model base path
          (i.e. `/model`).

    Returns:
      A dictionary of docker run parameters.
    """
        result = dict(self._base_docker_run_args,
                      image=self._image,
                      ports={'{}/tcp'.format(self.container_port): host_port},
                      environment=self.MakeEnvVars(
                          model_base_path=remote_model_base_path))

        if host_model_base_path is not None:
            # TODO(b/149534564): Replace os.path with pathlib.PurePosixPath after py3.
            result.update(mounts=[
                docker_types.Mount(type='bind',
                                   target=self._DEFAULT_MODEL_BASE_PATH,
                                   source=host_model_base_path,
                                   read_only=True)
            ])

        return result
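A hedged usage sketch for the method above; the `runner` object, the port, and the bucket path are assumptions, not part of the example:

import docker

client = docker.from_env()
params = runner.MakeDockerRunParams(
    host_port=8500,
    remote_model_base_path='gs://your_bucket/model_base_path')
# detach/auto-remove flags are expected to come from the base run args.
container = client.containers.run(**params)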
Example #3
def gen_shell(answer, path):
    # Run /opt/gen.py inside a throwaway `python` container, piping `answer`
    # to it on stdin; the host directory `path` is bind-mounted at /opt.
    # Note: `answer` is interpolated into a shell command, so it must be
    # trusted (or sanitised by the caller) to avoid shell injection.
    try:
        result = run_container(
            'python', f'sh -c "echo -n {answer} | python /opt/gen.py"',
            [types.Mount(
                type='bind',
                source=path,
                target='/opt',
            )])
        return result
    except Exception:
        # Return empty output if the container could not be run.
        return b''
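The `run_container` helper used above is not shown; a minimal sketch of what it might look like on top of the docker SDK (the name and behaviour are assumptions):

import docker
from docker import types


def run_container(image, command, mounts):
    # Hypothetical helper: run a one-off container with the given mounts,
    # return its stdout as bytes, and remove the container afterwards.
    client = docker.from_env()
    return client.containers.run(image, command, mounts=mounts, remove=True)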
Example #4
    def MakeDockerRunParams(self,
                            host_port: int,
                            model_base_path: Optional[Text] = None,
                            host_model_path: Optional[Text] = None):
        """Make parameters for docker `client.containers.run`.

    Args:
      host_port: Available port in the host to bind with container port.
      model_base_path: (Optional) Model base path for TensorFlow Serving. If
          the model is exported to a remote destination, specify its location
          (e.g. `gs://your_bucket/model_base_path`) and gfile will resolve it.
          If the model lives on the local host machine, leave `model_base_path`
          at its default value (`/model`) and use the `host_model_path`
          argument to configure a bind mount from the host into the container.
      host_model_path: (Optional) Host path of the exported model. Use this
          only if you have an exported SavedModel on the local host machine.
          Setting it creates a bind mount from `host_model_path` to
          `{model_base_path}/{model_name}` inside the container.

    Returns:
      A dictionary of docker run parameters.
    """
        result = dict(
            self._base_docker_run_args,
            image=self._image,
            ports={'{}/tcp'.format(self.container_port): host_port},
            environment=self.MakeEnvVars(model_base_path=model_base_path))

        if host_model_path is not None:
            # TODO(b/149534564): Replace os.path with pathlib.PurePosixPath after py3.
            result.update(mounts=[
                docker_types.Mount(
                    type='bind',
                    target=os.path.join(
                        model_base_path or self._DEFAULT_MODEL_BASE_PATH,
                        self._model_name),
                    source=host_model_path,
                    read_only=True)
            ])

        return result
Example #5
    def MakeDockerRunParams(self, host_port: int, model_path: Text,
                            needs_mount: bool) -> Dict[Text, Any]:
        """Make parameters for docker `client.containers.run`.

    Args:
      host_port: Available port in the host to bind with container port.
      model_path: A path to the model.
      needs_mount: If True, model_path will be mounted to the container.

    Returns:
      A dictionary of docker run parameters.
    """
        result = dict(self._BASE_DOCKER_RUN_PARAMS,
                      image=self._image,
                      ports={'{}/tcp'.format(self.container_port): host_port})

        if needs_mount:
            # model_path should be a local directory. To make TF Serving see the
            # host model path, we bind-mount it into the container.
            assert os.path.isdir(model_path), '{} does not exist'.format(
                model_path)
            container_model_path = tf_serving_flavor.make_model_path(
                model_base_path=self._DEFAULT_MODEL_BASE_PATH,
                model_name=self._model_name,
                version=1)
            result.update(environment=self.MakeEnvVars(),
                          mounts=[
                              docker_types.Mount(type='bind',
                                                 target=container_model_path,
                                                 source=model_path,
                                                 read_only=True)
                          ])
        else:
            # model_path is presumably a remote URI. TF Serving can read the
            # remote model directly via gfile, so all we need to do is set the
            # environment variables correctly.
            result.update(environment=self.MakeEnvVars(model_path=model_path))

        return result
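A hedged usage sketch for the local-model branch; the `runner` and `client` objects and the paths are assumptions:

params = runner.MakeDockerRunParams(
    host_port=8501,
    model_path='/tmp/exported_model',  # must be an existing local directory
    needs_mount=True)
# params['mounts'] bind-mounts /tmp/exported_model read-only at roughly
# /model/<model_name>/1 inside the container, and params['ports'] maps the
# serving port to 8501 on the host.
container = client.containers.run(**params)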
Example #6
    def generate_docker_py_service_description(self, name, docker_networks):
        mounts = []
        for mount_config in self.mounts:
            mounts.append(
                types.Mount(target=mount_config['target'],
                            source=mount_config['source'],
                            type=mount_config['type'],
                            read_only=mount_config['readonly']))

        configs = []
        for config_config in self.configs:
            configs.append(
                types.ConfigReference(config_id=config_config['config_id'],
                                      config_name=config_config['config_name'],
                                      filename=config_config.get('filename'),
                                      uid=config_config.get('uid'),
                                      gid=config_config.get('gid'),
                                      mode=config_config.get('mode')))
        secrets = []
        for secret_config in self.secrets:
            secrets.append(
                types.SecretReference(secret_id=secret_config['secret_id'],
                                      secret_name=secret_config['secret_name'],
                                      filename=secret_config.get('filename'),
                                      uid=secret_config.get('uid'),
                                      gid=secret_config.get('gid'),
                                      mode=secret_config.get('mode')))

        cspec = types.ContainerSpec(image=self.image,
                                    user=self.user,
                                    dns_config=types.DNSConfig(
                                        nameservers=self.dns,
                                        search=self.dns_search,
                                        options=self.dns_options),
                                    args=self.args,
                                    env=self.env,
                                    tty=self.tty,
                                    hostname=self.hostname,
                                    labels=self.container_labels,
                                    mounts=mounts,
                                    secrets=secrets,
                                    configs=configs)

        log_driver = types.DriverConfig(name=self.log_driver,
                                        options=self.log_driver_options)

        placement = types.Placement(constraints=self.constraints)

        restart_policy = types.RestartPolicy(
            condition=self.restart_policy,
            delay=self.restart_policy_delay,
            max_attempts=self.restart_policy_attempts,
            window=self.restart_policy_window)

        # Docker expresses CPU limits/reservations in NanoCPUs (1 CPU = 1e9).
        resources = types.Resources(
            cpu_limit=int(self.limit_cpu * 1000000000.0),
            mem_limit=self.limit_memory,
            cpu_reservation=int(self.reserve_cpu * 1000000000.0),
            mem_reservation=self.reserve_memory)

        update_policy = types.UpdateConfig(
            parallelism=self.update_parallelism,
            delay=self.update_delay,
            failure_action=self.update_failure_action,
            monitor=self.update_monitor,
            max_failure_ratio=self.update_max_failure_ratio,
            order=self.update_order)

        task_template = types.TaskTemplate(container_spec=cspec,
                                           log_driver=log_driver,
                                           restart_policy=restart_policy,
                                           placement=placement,
                                           resources=resources,
                                           force_update=self.force_update)

        if self.mode == 'global':
            self.replicas = None

        mode = types.ServiceMode(self.mode, replicas=self.replicas)

        networks = []
        for network_name in self.networks:
            network_id = None
            try:
                network_id = list(
                    filter(lambda n: n['name'] == network_name,
                           docker_networks))[0]['id']
            except (IndexError, KeyError):
                pass
            if network_id:
                networks.append({'Target': network_id})
            else:
                raise Exception("no docker networks named: %s" % network_name)

        ports = {}
        for port in self.publish:
            ports[int(port['published_port'])] = (int(port['target_port']),
                                                  port['protocol'],
                                                  port['mode'])
        endpoint_spec = types.EndpointSpec(mode=self.endpoint_mode,
                                           ports=ports)
        return update_policy, task_template, networks, endpoint_spec, mode, self.labels
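A sketch of how the returned tuple might be consumed with the docker-py low-level API; the `service` object and the service name are assumptions:

import docker

client = docker.APIClient()
docker_networks = [{'name': n['Name'], 'id': n['Id']} for n in client.networks()]
(update_policy, task_template, networks, endpoint_spec, mode,
 labels) = service.generate_docker_py_service_description(name, docker_networks)
client.create_service(task_template,
                      name=name,
                      labels=labels,
                      mode=mode,
                      update_config=update_policy,
                      networks=networks,
                      endpoint_spec=endpoint_spec)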
Example #7
    def run(self, time_add, time_to_run, peer_number, runs=1):
        """
        Run the benchmark

        :param peer_number: How many starting peers
        :param time_add: How much time before starting the benchmark
        :param time_to_run: How much time to run the benchmark for
        :param runs: RUn the benchmark how many time
        """
        time_add *= 1000
        time_to_run *= 1000
        if self.local:
            service_image = self.app_config['service']['name']
            if self.use_tracker:
                tracker_image = self.app_config['tracker']['name']
            with subprocess.Popen(['../gradlew', '-p', '..', 'docker'],
                                  stdin=subprocess.PIPE,
                                  stdout=subprocess.PIPE,
                                  stderr=subprocess.PIPE,
                                  universal_newlines=True) as p:
                for line in p.stdout:
                    print(line, end='')
        else:
            service_image = (self.app_config['repository']['name'] +
                             self.app_config['service']['name'])
            self.logger.info(self.client.images.pull(service_image))

            if self.use_tracker:
                tracker_image = (self.app_config['repository']['name'] +
                                 self.app_config['tracker']['name'])
                self.logger.info(self.client.images.pull(tracker_image))

            try:
                self.client.swarm.init()
                if not self.local:
                    self.logger.info('Joining Swarm on every host:')
                    token = self.client.swarm.attrs['JoinTokens']['Worker']
                    subprocess.call([
                        'parallel-ssh', '-t', '0', '-h', 'config/hosts',
                        'docker', 'swarm', 'join', '--token', token,
                        '{:s}:2377'.format(self.cluster_config['manager_ip'])
                    ])
                ipam_pool = utils.create_ipam_pool(
                    subnet=self.app_config['service']['network']['subnet'])
                ipam_config = utils.create_ipam_config(
                    pool_configs=[ipam_pool])
                self.client.networks.create(
                    self.app_config['service']['network']['name'],
                    driver='overlay',
                    ipam=ipam_config)
            except errors.APIError:
                self.logger.info('Host is already part of a swarm')
                if not self.client.networks.list(
                        names=[self.app_config['service']['network']['name']]):
                    self.logger.error('Network doesn\'t exist!')
                    exit(1)

        for run_nb, _ in enumerate(range(runs), 1):
            if self.use_tracker:
                self._create_service(
                    self.app_config['tracker']['name'],
                    tracker_image,
                    placement=['node.role == manager'],
                    mem_limit=self.app_config['service']['mem_limit'])
                self._wait_on_service(self.app_config['tracker']['name'], 1)
            time_to_start = int((time.time() * 1000) + time_add)
            self.logger.debug(
                datetime.utcfromtimestamp(time_to_start / 1000).isoformat())

            environment_vars = {
                'PEER_NUMBER': peer_number,
                'TIME': time_to_start,
                'TIME_TO_RUN': time_to_run
            }
            if 'parameters' in self.app_config['service']:
                environment_vars.update(
                    self.app_config['service']['parameters'])
            environment_vars = [
                '{:s}={}'.format(k, v) for k, v in environment_vars.items()
            ]
            self.logger.debug(environment_vars)

            service_replicas = 0 if self.churn else peer_number
            log_storage = (self.cluster_config['local_data'] if self.local else
                           self.cluster_config['cluster_data'])

            if 'mem_limit' in self.app_config['service']:
                self._create_service(
                    self.app_config['service']['name'],
                    service_image,
                    env=environment_vars,
                    mounts=[
                        types.Mount(target='/data',
                                    source=log_storage,
                                    type='bind')
                    ],
                    replicas=service_replicas,
                    mem_limit=self.app_config['service']['mem_limit'])
            else:
                self._create_service(self.app_config['service']['name'],
                                     service_image,
                                     env=environment_vars,
                                     mounts=[
                                         types.Mount(target='/data',
                                                     source=log_storage,
                                                     type='bind')
                                     ],
                                     replicas=service_replicas)

            self.logger.info(
                'Running Benchmark -> Experiment: {:d}/{:d}'.format(
                    run_nb, runs))
            if self.churn:
                thread = threading.Thread(
                    target=self._run_churn,
                    args=[time_to_start + self.churn.delay],
                    daemon=True)
                thread.start()
                self._wait_on_service(self.app_config['service']['name'],
                                      0,
                                      inverse=True)
                self.logger.info('Running with churn')
                if self.churn.synthetic:
                    # Wait for some peers to at least start
                    time.sleep(120)
                    total = [
                        sum(x)
                        for x in zip(*self.churn.churn_params['synthetic'])
                    ]
                    # Wait until only stopped containers are still alive
                    self._wait_on_service(self.app_config['service']['name'],
                                          containers_nb=total[0],
                                          total_nb=total[1])
                else:
                    # TODO not the most elegant solution
                    thread.join()  # Wait for churn to finish
                    time.sleep(300)  # Wait 5 more minutes

            else:
                self._wait_on_service(self.app_config['service']['name'],
                                      0,
                                      inverse=True)
                self.logger.info('Running without churn')
                self._wait_on_service(self.app_config['service']['name'], 0)
            self.stop()

            self.logger.info('Services removed')
            time.sleep(30)

            if not self.local:
                subprocess.call(
                    'parallel-ssh -t 0 -h config/hosts'
                    ' "mkdir -p {path}/test-{nb}/capture &&'
                    ' mv {path}/*.txt {path}/test-{nb}/ &&'
                    ' mv {path}/capture/*.csv {path}/test-{nb}/capture/"'.
                    format(path=self.cluster_config['cluster_data'],
                           nb=run_nb),
                    shell=True)

            subprocess.call('mkdir -p {path}/test-{nb}/capture'.format(
                path=log_storage, nb=run_nb),
                            shell=True)
            subprocess.call('mv {path}/*.txt {path}/test-{nb}/'.format(
                path=log_storage, nb=run_nb),
                            shell=True)
            subprocess.call(
                'mv {path}/capture/*.csv {path}/test-{nb}/capture/'.format(
                    path=log_storage, nb=run_nb),
                shell=True)

        self.logger.info('Benchmark done!')
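The `_create_service` helper is not shown in this example; a minimal sketch of what it might look like with the docker SDK (all names and defaults are assumptions):

    def _create_service(self, name, image, env=None, mounts=None,
                        placement=None, replicas=1, mem_limit=None):
        # Hypothetical helper wrapping client.services.create with the
        # arguments used above: environment, bind mounts, placement
        # constraints, replica count and an optional memory limit.
        self.client.services.create(
            image,
            name=name,
            env=env or [],
            mounts=mounts or [],
            constraints=placement,
            mode=types.ServiceMode('replicated', replicas=replicas),
            resources=types.Resources(mem_limit=mem_limit) if mem_limit else None,
            networks=[self.app_config['service']['network']['name']])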
Example #8
def main(argv):
  if len(argv) > 1:
    raise app.UsageError('Too many command-line arguments.')

  mounts = []
  command_args = []

  # Mount each fasta path as a unique target directory.
  target_fasta_paths = []
  for i, fasta_path in enumerate(FLAGS.fasta_paths):
    mount, target_path = _create_mount(f'fasta_path_{i}', fasta_path)
    mounts.append(mount)
    target_fasta_paths.append(target_path)
  command_args.append(f'--fasta_paths={",".join(target_fasta_paths)}')

  for name, path in [('uniref90_database_path', uniref90_database_path),
                     ('mgnify_database_path', mgnify_database_path),
                     ('uniclust30_database_path', uniclust30_database_path),
                     ('bfd_database_path', bfd_database_path),
                     ('pdb70_database_path', pdb70_database_path),
                     ('data_dir', data_dir),
                     ('template_mmcif_dir', template_mmcif_dir),
                     ('obsolete_pdbs_path', obsolete_pdbs_path)]:
    if path:
      mount, target_path = _create_mount(name, path)
      mounts.append(mount)
      command_args.append(f'--{name}={target_path}')

  output_target_path = os.path.join(_ROOT_MOUNT_DIRECTORY, 'output')
  mounts.append(types.Mount(output_target_path, output_dir, type='bind'))

  command_args.extend([
      f'--output_dir={output_target_path}',
      f'--model_names={",".join(model_names)}',
      f'--max_template_date={FLAGS.max_template_date}',
      f'--preset={FLAGS.preset}',
      f'--benchmark={FLAGS.benchmark}',
      '--logtostderr',
  ])

  client = docker.from_env()
  container = client.containers.run(
      image=docker_image_name,
      command=command_args,
      runtime='nvidia' if FLAGS.use_gpu else None,
      remove=True,
      detach=True,
      mounts=mounts,
      environment={
          'NVIDIA_VISIBLE_DEVICES': FLAGS.gpu_devices,
          # The following flags allow us to make predictions on proteins that
          # would typically be too long to fit into GPU memory.
          'TF_FORCE_UNIFIED_MEMORY': '1',
          'XLA_PYTHON_CLIENT_MEM_FRACTION': '4.0',
      })

  # Add signal handler to ensure CTRL+C also stops the running container.
  signal.signal(signal.SIGINT,
                lambda unused_sig, unused_frame: container.kill())

  for line in container.logs(stream=True):
    logging.info(line.strip().decode('utf-8'))
Example #9
    async def run_container(
        self,
        name: str,
        image: str,
        volumes: List[Any] = [],
        privileged: bool = False,
        registry: Optional[Any] = None,
        envs: Dict[str, Any] = {},
        ports: Union[List[int], Dict[str, Tuple[str, int]]] = [],
        force: bool = False,
        command: Optional[Union[Command, FakeCommand]] = None,
    ):
        """Runs a container in the node, in detached mode.

        Parameters
        ----------
        name
            The name to assign to the container.
        image
            The image to run.
        volumes
            Names of the volumes to mount. The mount point in the container
            will match the original device. The volumes must already exist
            in the node Docker engine.
        privileged
            Whether to run the container in privileged mode.
        registry
            The registry from which to pull the image, if it doesn't exist
            locally.
        envs
            A dictionary mapping environment variable names to the values to
            pass to the container.
        ports
            Ports to bind inside the container. The format must be
            ``{'2222/tcp': 3333}`` which will expose port 2222 inside the
            container as port 3333 on the node. Also accepted is a list of
            integers; each integer port will be exposed in the container
            and bound to the same port in the node.
        force
            If `True`, removes any running container with the same name, or
            any container with the same image as an ancestor.
        command
            A command object to which progress messages are reported.

        Returns
        -------
        :
            The container object.
        """

        assert self.client, "Client is not connected."

        # This is the command we aim to run.
        # docker --context gfa1 run
        #        --rm -d --network host
        #        --mount source=data,target=/data
        #        --mount source=home,target=/home/sdss
        #        --env OBSERVATORY=APO --env ACTOR_NAME=gfa
        #        --privileged
        #        sdss-hub:5000/flicamera:latest

        command = command or FakeCommand()

        if (await self.is_container_running(name)) and not force:
            command.debug(text=f"{self.name}: container already running.")
            return

        await self.stop_container(name, image, force=force, command=command)

        if registry:
            image = registry + "/" + image

        if isinstance(ports, (list, tuple)):
            ports = {f"{port}/tcp": ("0.0.0.0", port) for port in ports}

        mounts = []
        for vname in volumes:
            volume = await self._run(self.client.volumes.get, vname)
            target = volume.attrs["Options"]["device"].strip(":")
            mounts.append(types.Mount(target, vname))

        command.debug(text=f"{self.name}: pulling latest image.")
        await self._run(self.client.images.pull, image)

        command.info(text=f"{self.name}: running {name} from {image}.")
        container = await self._run(
            self.client.containers.run,
            image,
            name=name,
            tty=False,
            detach=True,
            remove=True,
            environment=envs,
            # ports=ports,
            privileged=privileged,
            mounts=mounts,
            stdin_open=False,
            stdout=False,
            network="host",
        )

        return container
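A hedged usage sketch from an asyncio context; the node object, image, registry, and volume names are assumptions:

import asyncio

async def start_camera(node):
    return await node.run_container(
        'flicamera',
        'flicamera:latest',
        volumes=['data'],
        envs={'OBSERVATORY': 'APO'},
        privileged=True,
        registry='sdss-hub:5000',
        force=True)

# asyncio.run(start_camera(node))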