def _create_mount(mount_name: str, path: str) -> Tuple[types.Mount, str]:
    """Build a read-only bind mount for the directory that holds `path`.

    Args:
        mount_name: Name of the directory created under
            `_ROOT_MOUNT_DIRECTORY` to serve as the mount target.
        path: Host path (file or directory); it is made absolute first and
            its parent directory is what gets mounted.

    Returns:
        A `(mount, mounted_path)` pair, where `mounted_path` is the location
        of `path`'s final component underneath the mount target.
    """
    # os.path.split yields exactly (dirname, basename) of the absolute path.
    host_dir, leaf_name = os.path.split(os.path.abspath(path))
    container_dir = os.path.join(_ROOT_MOUNT_DIRECTORY, mount_name)
    logging.info('Mounting %s -> %s', host_dir, container_dir)
    bind_mount = types.Mount(
        container_dir, host_dir, type='bind', read_only=True)
    return bind_mount, os.path.join(container_dir, leaf_name)
def MakeDockerRunParams(self,
                        host_port: int,
                        remote_model_base_path: Optional[Text] = None,
                        host_model_base_path: Optional[Text] = None):
    """Assemble the keyword arguments for docker `client.containers.run`.

    Args:
        host_port: Host port that the container port is published on.
        remote_model_base_path: (Optional) Model base path at a remote
            destination (e.g. `gs://your_bucket/model_base_path`). It is
            forwarded to `MakeEnvVars`.
        host_model_base_path: (Optional) Model base path on the host
            machine. When given, it is bind-mounted read-only onto the
            container model base path (`self._DEFAULT_MODEL_BASE_PATH`).

    Returns:
        A dictionary of docker run parameters.
    """
    # Start from a copy of the shared defaults so they are never mutated.
    params = dict(self._base_docker_run_args)
    params['image'] = self._image
    params['ports'] = {'{}/tcp'.format(self.container_port): host_port}
    params['environment'] = self.MakeEnvVars(
        model_base_path=remote_model_base_path)

    if host_model_base_path is None:
        return params

    # TODO(b/149534564): Replace os.path to pathlib.PurePosixPath after py3.
    model_mount = docker_types.Mount(
        type='bind',
        target=self._DEFAULT_MODEL_BASE_PATH,
        source=host_model_base_path,
        read_only=True)
    params['mounts'] = [model_mount]
    return params
def gen_shell(answer, path):
    """Pipe `answer` into `/opt/gen.py` inside a container and return the result.

    `path` is bind-mounted at `/opt`, so the script executed is `gen.py`
    from that host directory.

    Args:
        answer: Text echoed (without trailing newline) to the script's
            standard input. It is converted with `str()` and shell-quoted, so
            spaces, quotes and metacharacters such as `;`, `$` or backticks
            are passed through literally instead of being interpreted.
        path: Host directory mounted at `/opt` in the container.

    Returns:
        Whatever `run_container` returns, or `b''` if anything raises.
    """
    import logging
    import shlex

    # Two levels of quoting are required: the inner one protects `answer`
    # from the `sh` that runs the pipeline, the outer one keeps the whole
    # pipeline a single `-c` argument when the command string is split.
    pipeline = f'echo -n {shlex.quote(str(answer))} | python /opt/gen.py'
    command = f'sh -c {shlex.quote(pipeline)}'
    try:
        return run_container(
            'python',
            command,
            [types.Mount(type='bind', source=path, target='/opt')],
        )
    except Exception:
        # Best-effort contract: callers get an empty result on failure, but
        # the cause is logged instead of being discarded.
        logging.getLogger(__name__).exception(
            'gen_shell failed for mount source %r', path)
        return b''
def MakeDockerRunParams(self,
                        host_port: int,
                        model_base_path: Optional[Text] = None,
                        host_model_path: Optional[Text] = None):
    """Make parameters for docker `client.containers.run`.

    Args:
        host_port: Available port in the host to bind with container port.
        model_base_path: (Optional) Model base path for the tensorflow
            serving. If the model is exported to a remote destination,
            specify its location (e.g. `gs://your_bucket/model_base_path`).
            If the model is on the local host machine, leave this unset (the
            default `self._DEFAULT_MODEL_BASE_PATH` is used for the mount
            target) and pass `host_model_path` instead.
        host_model_path: (Optional) Host path of an exported model. When
            given, it is bind-mounted read-only at
            `{model_base_path}/{model_name}` inside the container.

    Returns:
        A dictionary of docker run parameters.
    """
    # The mount target is a path inside the container, so it must be joined
    # with POSIX separators regardless of the host OS running this code.
    import posixpath

    result = dict(
        self._base_docker_run_args,
        image=self._image,
        ports={'{}/tcp'.format(self.container_port): host_port},
        environment=self.MakeEnvVars(model_base_path=model_base_path))

    if host_model_path is not None:
        container_model_path = posixpath.join(
            model_base_path or self._DEFAULT_MODEL_BASE_PATH,
            self._model_name)
        result.update(mounts=[
            docker_types.Mount(
                type='bind',
                target=container_model_path,
                source=host_model_path,
                read_only=True)
        ])

    return result
def MakeDockerRunParams(self, host_port: int, model_path: Text,
                        needs_mount: bool) -> Dict[Text, Any]:
    """Assemble the keyword arguments for docker `client.containers.run`.

    Args:
        host_port: Host port that the container port is published on.
        model_path: Location of the model.
        needs_mount: When True, `model_path` must be an existing local
            directory and is bind-mounted read-only into the container;
            otherwise it is handed to `MakeEnvVars` as-is.

    Returns:
        A dictionary of docker run parameters.
    """
    params = dict(
        self._BASE_DOCKER_RUN_PARAMS,
        image=self._image,
        ports={'{}/tcp'.format(self.container_port): host_port})

    if not needs_mount:
        # model_path is presumably a remote URI that TF Serving can read
        # directly, so only the environment variables need to carry it.
        params['environment'] = self.MakeEnvVars(model_path=model_path)
        return params

    # Local model: the host directory has to be made visible inside the
    # container at the path built by make_model_path.
    assert os.path.isdir(model_path), '{} does not exist'.format(model_path)
    mount_target = tf_serving_flavor.make_model_path(
        model_base_path=self._DEFAULT_MODEL_BASE_PATH,
        model_name=self._model_name,
        version=1)
    params['environment'] = self.MakeEnvVars()
    params['mounts'] = [
        docker_types.Mount(
            type='bind',
            target=mount_target,
            source=model_path,
            read_only=True)
    ]
    return params
def generate_docker_py_service_description(self, name, docker_networks):
    """Translate this object's settings into docker-py service spec objects.

    Args:
        name: Not used by this method.
        docker_networks: Iterable of mappings, each read through its
            ``'name'`` and ``'id'`` keys, used to resolve the names in
            ``self.networks`` to network ids.

    Returns:
        The tuple ``(update_policy, task_template, networks, endpoint_spec,
        mode, labels)`` where ``networks`` is a list of ``{'Target': id}``
        dicts and ``labels`` is ``self.labels``.

    Raises:
        Exception: If a name in ``self.networks`` cannot be resolved to a
            non-empty id from ``docker_networks``.

    Side effects:
        Sets ``self.replicas`` to ``None`` when ``self.mode`` is
        ``'global'``.
    """
    mounts = [
        types.Mount(target=mount_config['target'],
                    source=mount_config['source'],
                    type=mount_config['type'],
                    read_only=mount_config['readonly'])
        for mount_config in self.mounts
    ]
    configs = [
        types.ConfigReference(config_id=config_config['config_id'],
                              config_name=config_config['config_name'],
                              filename=config_config.get('filename'),
                              uid=config_config.get('uid'),
                              gid=config_config.get('gid'),
                              mode=config_config.get('mode'))
        for config_config in self.configs
    ]
    secrets = [
        types.SecretReference(secret_id=secret_config['secret_id'],
                              secret_name=secret_config['secret_name'],
                              filename=secret_config.get('filename'),
                              uid=secret_config.get('uid'),
                              gid=secret_config.get('gid'),
                              mode=secret_config.get('mode'))
        for secret_config in self.secrets
    ]

    cspec = types.ContainerSpec(
        image=self.image,
        user=self.user,
        dns_config=types.DNSConfig(nameservers=self.dns,
                                   search=self.dns_search,
                                   options=self.dns_options),
        args=self.args,
        env=self.env,
        tty=self.tty,
        hostname=self.hostname,
        labels=self.container_labels,
        mounts=mounts,
        secrets=secrets,
        configs=configs)
    log_driver = types.DriverConfig(name=self.log_driver,
                                    options=self.log_driver_options)
    placement = types.Placement(constraints=self.constraints)
    restart_policy = types.RestartPolicy(
        condition=self.restart_policy,
        delay=self.restart_policy_delay,
        max_attempts=self.restart_policy_attempts,
        window=self.restart_policy_window)
    # CPU limits/reservations are scaled by 1e9 before being handed over
    # (presumably CPUs -> nano-CPUs; confirm against the docker API).
    resources = types.Resources(
        cpu_limit=int(self.limit_cpu * 1000000000.0),
        mem_limit=self.limit_memory,
        cpu_reservation=int(self.reserve_cpu * 1000000000.0),
        mem_reservation=self.reserve_memory)
    update_policy = types.UpdateConfig(
        parallelism=self.update_parallelism,
        delay=self.update_delay,
        failure_action=self.update_failure_action,
        monitor=self.update_monitor,
        max_failure_ratio=self.update_max_failure_ratio,
        order=self.update_order)
    task_template = types.TaskTemplate(container_spec=cspec,
                                       log_driver=log_driver,
                                       restart_policy=restart_policy,
                                       placement=placement,
                                       resources=resources,
                                       force_update=self.force_update)

    # A replica count is dropped for global mode. NOTE: this is persisted on
    # the instance, not just applied to the returned ServiceMode.
    if self.mode == 'global':
        self.replicas = None
    mode = types.ServiceMode(self.mode, replicas=self.replicas)

    networks = []
    for network_name in self.networks:
        # Only lookup-related failures are treated as "not found"; anything
        # else (e.g. KeyboardInterrupt) is allowed to propagate.
        try:
            network_id = next(
                (n['id'] for n in docker_networks
                 if n['name'] == network_name),
                None)
        except (KeyError, TypeError):
            network_id = None
        if not network_id:
            raise Exception("no docker networks named: %s" % network_name)
        networks.append({'Target': network_id})

    ports = {
        int(port['published_port']): (int(port['target_port']),
                                      port['protocol'],
                                      port['mode'])
        for port in self.publish
    }
    endpoint_spec = types.EndpointSpec(mode=self.endpoint_mode, ports=ports)

    return (update_policy, task_template, networks, endpoint_spec, mode,
            self.labels)
def run(self, time_add, time_to_run, peer_number, runs=1):
    """
    Run the benchmark.

    Prepares the images (local Gradle build or registry pull), makes sure a
    swarm and the service overlay network exist, then for each run creates
    the services, waits for them to finish, removes them and moves the
    produced files into a ``test-<run number>`` directory.

    :param time_add: How much time to wait before starting the benchmark.
                     Multiplied by 1000 and added to the current time in
                     milliseconds.
    :param time_to_run: How much time to run the benchmark for. Multiplied
                        by 1000 before being passed to the service as
                        ``TIME_TO_RUN``.
    :param peer_number: How many starting peers.
    :param runs: How many times to run the benchmark.
    """
    time_add *= 1000
    time_to_run *= 1000
    if self.local:
        # Local mode: image names are used as-is and the images are built
        # with the Gradle `docker` task from the parent directory.
        service_image = self.app_config['service']['name']
        if self.use_tracker:
            tracker_image = self.app_config['tracker']['name']
        # NOTE(review): stderr is piped but never read; only stdout is
        # echoed.
        with subprocess.Popen(['../gradlew', '-p', '..', 'docker'],
                              stdin=subprocess.PIPE,
                              stdout=subprocess.PIPE,
                              stderr=subprocess.PIPE,
                              universal_newlines=True) as p:
            for line in p.stdout:
                print(line, end='')
    else:
        # Cluster mode: image names are prefixed with the repository name
        # and pulled through the docker client.
        service_image = (self.app_config['repository']['name'] +
                         self.app_config['service']['name'])
        self.logger.info(self.client.images.pull(service_image))
        if self.use_tracker:
            tracker_image = (self.app_config['repository']['name'] +
                             self.app_config['tracker']['name'])
            self.logger.info(self.client.images.pull(tracker_image))
    try:
        self.client.swarm.init()
        if not self.local:
            # Make every host listed in config/hosts join as a worker.
            self.logger.info('Joining Swarm on every hosts:')
            token = self.client.swarm.attrs['JoinTokens']['Worker']
            subprocess.call([
                'parallel-ssh', '-t', '0', '-h', 'config/hosts', 'docker',
                'swarm', 'join', '--token', token,
                '{:s}:2377'.format(self.cluster_config['manager_ip'])
            ])
        # Create the overlay network the services attach to.
        ipam_pool = utils.create_ipam_pool(
            subnet=self.app_config['service']['network']['subnet'])
        ipam_config = utils.create_ipam_config(
            pool_configs=[ipam_pool])
        self.client.networks.create(
            self.app_config['service']['network']['name'],
            driver='overlay',
            ipam=ipam_config)
    except errors.APIError:
        # Any APIError raised above is interpreted as "swarm already
        # initialised"; in that case the network must already exist.
        self.logger.info('Host is already part of a swarm')
        if not self.client.networks.list(
                names=[self.app_config['service']['network']['name']]):
            self.logger.error('Network doesn\'t exist!')
            exit(1)

    # run_nb is the 1-based index of the current experiment.
    for run_nb, _ in enumerate(range(runs), 1):
        if self.use_tracker:
            # NOTE(review): the tracker's mem_limit is read from the
            # *service* config and is not guarded by a key check, unlike
            # the service creation below -- confirm this is intended.
            self._create_service(
                self.app_config['tracker']['name'],
                tracker_image,
                placement=['node.role == manager'],
                mem_limit=self.app_config['service']['mem_limit'])
            self._wait_on_service(self.app_config['tracker']['name'], 1)
        # Start timestamp in milliseconds since the epoch.
        time_to_start = int((time.time() * 1000) + time_add)
        self.logger.debug(
            datetime.utcfromtimestamp(time_to_start / 1000).isoformat())

        environment_vars = {
            'PEER_NUMBER': peer_number,
            'TIME': time_to_start,
            'TIME_TO_RUN': time_to_run
        }
        # Optional extra parameters from the config may override the
        # defaults above.
        if 'parameters' in self.app_config['service']:
            environment_vars.update(
                self.app_config['service']['parameters'])
        # Flatten to the "KEY=value" list form passed as `env`.
        environment_vars = [
            '{:s}={}'.format(k, v) for k, v in environment_vars.items()
        ]
        self.logger.debug(environment_vars)

        # With churn the service starts with zero replicas.
        service_replicas = 0 if self.churn else peer_number
        log_storage = (self.cluster_config['local_data']
                       if self.local else
                       self.cluster_config['cluster_data'])

        # The two branches differ only in whether mem_limit is passed.
        if 'mem_limit' in self.app_config['service']:
            self._create_service(
                self.app_config['service']['name'],
                service_image,
                env=environment_vars,
                mounts=[
                    types.Mount(target='/data',
                                source=log_storage,
                                type='bind')
                ],
                replicas=service_replicas,
                mem_limit=self.app_config['service']['mem_limit'])
        else:
            self._create_service(self.app_config['service']['name'],
                                 service_image,
                                 env=environment_vars,
                                 mounts=[
                                     types.Mount(target='/data',
                                                 source=log_storage,
                                                 type='bind')
                                 ],
                                 replicas=service_replicas)

        self.logger.info(
            'Running Benchmark -> Experiment: {:d}/{:d}'.format(
                run_nb, runs))
        if self.churn:
            # Churn is driven from a daemon thread, given the start
            # timestamp shifted by the configured churn delay.
            thread = threading.Thread(
                target=self._run_churn,
                args=[time_to_start + self.churn.delay],
                daemon=True)
            thread.start()
            self._wait_on_service(self.app_config['service']['name'],
                                  0,
                                  inverse=True)
            self.logger.info('Running with churn')
            if self.churn.synthetic:
                # Wait for some peers to at least start
                time.sleep(120)
                # Column-wise sums over the synthetic churn parameters.
                total = [
                    sum(x)
                    for x in zip(*self.churn.churn_params['synthetic'])
                ]
                # Wait until only stopped containers are still alive
                self._wait_on_service(self.app_config['service']['name'],
                                      containers_nb=total[0],
                                      total_nb=total[1])
            else:
                # TODO not the most elegant solution
                thread.join()  # Wait for churn to finish
                time.sleep(300)  # Wait 5 more minutes
        else:
            self._wait_on_service(self.app_config['service']['name'],
                                  0,
                                  inverse=True)
            self.logger.info('Running without churn')
            self._wait_on_service(self.app_config['service']['name'], 0)
        self.stop()

        self.logger.info('Services removed')
        time.sleep(30)

        # Archive this run's output under test-<run_nb>, first on the
        # remote hosts (cluster mode only), then on this machine.
        # NOTE(review): paths are interpolated into shell=True commands;
        # this assumes the configured paths contain no shell
        # metacharacters.
        if not self.local:
            subprocess.call(
                'parallel-ssh -t 0 -h config/hosts'
                ' "mkdir -p {path}/test-{nb}/capture &&'
                ' mv {path}/*.txt {path}/test-{nb}/ &&'
                ' mv {path}/capture/*.csv {path}/test-{nb}/capture/"'.
                format(path=self.cluster_config['cluster_data'],
                       nb=run_nb),
                shell=True)

        subprocess.call('mkdir -p {path}/test-{nb}/capture'.format(
            path=log_storage, nb=run_nb),
                        shell=True)
        subprocess.call('mv {path}/*.txt {path}/test-{nb}/'.format(
            path=log_storage, nb=run_nb),
                        shell=True)
        subprocess.call(
            'mv {path}/capture/*.csv {path}/test-{nb}/capture/'.format(
                path=log_storage, nb=run_nb),
            shell=True)

    self.logger.info('Benchmark done!')
def main(argv):
    """Launch the docker image with all inputs bind-mounted and stream its logs.

    Each FASTA file and each configured database/data path is mounted via
    `_create_mount`, the matching command-line flags are rewritten to the
    in-container paths, and the container is started detached. CTRL+C kills
    the container.

    Args:
        argv: Command-line arguments; more than one entry is a usage error.

    Raises:
        app.UsageError: If more than one positional argument is given.
    """
    if len(argv) > 1:
        raise app.UsageError('Too many command-line arguments.')

    mounts = []

    # Every FASTA file gets its own target directory inside the container.
    container_fasta_paths = []
    for index, host_fasta in enumerate(FLAGS.fasta_paths):
        fasta_mount, container_fasta = _create_mount(
            f'fasta_path_{index}', host_fasta)
        mounts.append(fasta_mount)
        container_fasta_paths.append(container_fasta)
    command_args = [f'--fasta_paths={",".join(container_fasta_paths)}']

    # Flag name -> host path; entries with an empty path are skipped.
    host_paths_by_flag = (
        ('uniref90_database_path', uniref90_database_path),
        ('mgnify_database_path', mgnify_database_path),
        ('uniclust30_database_path', uniclust30_database_path),
        ('bfd_database_path', bfd_database_path),
        ('pdb70_database_path', pdb70_database_path),
        ('data_dir', data_dir),
        ('template_mmcif_dir', template_mmcif_dir),
        ('obsolete_pdbs_path', obsolete_pdbs_path),
    )
    for flag_name, host_path in host_paths_by_flag:
        if not host_path:
            continue
        data_mount, container_path = _create_mount(flag_name, host_path)
        mounts.append(data_mount)
        command_args.append(f'--{flag_name}={container_path}')

    # The output directory is mounted without read_only=True.
    output_target_path = os.path.join(_ROOT_MOUNT_DIRECTORY, 'output')
    mounts.append(types.Mount(output_target_path, output_dir, type='bind'))

    command_args += [
        f'--output_dir={output_target_path}',
        f'--model_names={",".join(model_names)}',
        f'--max_template_date={FLAGS.max_template_date}',
        f'--preset={FLAGS.preset}',
        f'--benchmark={FLAGS.benchmark}',
        '--logtostderr',
    ]

    client = docker.from_env()
    container_env = {
        'NVIDIA_VISIBLE_DEVICES': FLAGS.gpu_devices,
        # The following flags allow us to make predictions on proteins that
        # would typically be too long to fit into GPU memory.
        'TF_FORCE_UNIFIED_MEMORY': '1',
        'XLA_PYTHON_CLIENT_MEM_FRACTION': '4.0',
    } if False else None
    container = client.containers.run(
        image=docker_image_name,
        command=command_args,
        runtime='nvidia' if FLAGS.use_gpu else None,
        remove=True,
        detach=True,
        mounts=mounts,
        environment={
            'NVIDIA_VISIBLE_DEVICES': FLAGS.gpu_devices,
            # The following flags allow us to make predictions on proteins
            # that would typically be too long to fit into GPU memory.
            'TF_FORCE_UNIFIED_MEMORY': '1',
            'XLA_PYTHON_CLIENT_MEM_FRACTION': '4.0',
        })

    def _kill_container(unused_sig, unused_frame):
        container.kill()

    # Add signal handler to ensure CTRL+C also stops the running container.
    signal.signal(signal.SIGINT, _kill_container)

    for raw_line in container.logs(stream=True):
        logging.info(raw_line.strip().decode('utf-8'))
async def run_container(
    self,
    name: str,
    image: str,
    volumes: List[Any] = [],
    privileged: bool = False,
    registry: Optional[Any] = None,
    envs: Dict[str, Any] = {},
    ports: Union[List[int], Dict[str, Tuple[str, int]]] = [],
    force: bool = False,
    command: Optional[Union[Command, FakeCommand]] = None,
):
    """Start a detached container on the node.

    Parameters
    ----------
    name
        Name given to the container.
    image
        Image to run.
    volumes
        Names of the volumes to mount. Each volume is looked up in the node
        Docker engine (so it must already exist) and mounted at the path
        taken from its ``device`` option.
    privileged
        Whether the container runs in privileged mode.
    registry
        When set, it is prefixed to ``image`` as ``registry/image`` before
        pulling.
    envs
        Mapping of environment variable names to values for the container.
    ports
        Either a mapping such as ``{'2222/tcp': 3333}`` or a list of integer
        ports, which is expanded to ``{'<port>/tcp': ('0.0.0.0', <port>)}``.
        The value is currently not forwarded to Docker: the container is
        started with ``network="host"``.
    force
        If `True`, the container is stopped and started again even when it
        is already running.
    command
        Command used to report progress messages. A `FakeCommand` is used
        when omitted.

    Returns
    -------
    :
        The container object, or `None` when the container is already
        running and ``force`` is `False`.

    """

    assert self.client, "Client is not connected."

    # Equivalent command line that this method reproduces:
    # docker --context gfa1 run
    #        --rm -d --network host
    #        --mount source=data,target=/data
    #        --mount source=home,target=/home/sdss
    #        --env OBSERVATORY=APO --env ACTOR_NAME=gfa
    #        --privileged
    #        sdss-hub:5000/flicamera:latest

    command = command or FakeCommand()

    already_running = await self.is_container_running(name)
    if already_running and not force:
        command.debug(text=f"{self.name}: container already running.")
        return

    await self.stop_container(name, image, force=force, command=command)

    if registry:
        image = registry + "/" + image

    if isinstance(ports, (list, tuple)):
        # Kept for parity with the dict form; see the disabled `ports`
        # option below.
        ports = {f"{port}/tcp": ("0.0.0.0", port) for port in ports}

    mounts = []
    for volume_name in volumes:
        volume_info = await self._run(self.client.volumes.get, volume_name)
        mount_point = volume_info.attrs["Options"]["device"].strip(":")
        mounts.append(types.Mount(mount_point, volume_name))

    command.debug(text=f"{self.name}: pulling latest image.")
    await self._run(self.client.images.pull, image)

    command.info(text=f"{self.name}: running {name} from {image}.")
    run_options = dict(
        name=name,
        tty=False,
        detach=True,
        remove=True,
        environment=envs,
        # ports=ports,
        privileged=privileged,
        mounts=mounts,
        stdin_open=False,
        stdout=False,
        network="host",
    )
    started = await self._run(self.client.containers.run, image, **run_options)

    return started