def launch(self, task_id, options, resource, docker_registry, docker_image, docker_tag,
           docker_command, docker_files, wait_after_launch):
    """Start an EC2 instance and launch the task on it over SSH.

    Args:
        task_id: Identifier forwarded to ``common.launch_task``.
        options: Not used by this method.
        resource: Passed to ``_run_instance`` to select what to start.
        docker_registry, docker_image, docker_tag, docker_command,
        docker_files, wait_after_launch: Forwarded unchanged to
            ``common.launch_task``.

    Returns:
        The value returned by ``common.launch_task`` with an additional
        ``"instance_id"`` entry holding the EC2 instance id.

    Raises:
        RuntimeError: If ``_run_instance`` returns ``None`` or reports no
            created instance.
        Exception: Any error raised after the instance was created is
            re-raised unchanged, after the instance has been terminated
            (unless the ``terminateOnError`` config entry is false).
    """
    ec2_client = self._session.client("ec2")
    response = _run_instance(ec2_client, resource)
    if response is None:
        raise RuntimeError("empty response from boto3.run_instances")
    if not response["Instances"]:
        raise RuntimeError("no instances were created")
    instance_id = response["Instances"][0]["InstanceId"]
    ec2 = self._session.resource("ec2")
    instance = ec2.Instance(instance_id)

    client = paramiko.SSHClient()
    try:
        # From here on the instance exists: every failure, including a
        # failed wait, must go through the terminateOnError handling below
        # so the instance is not left running.
        instance.wait_until_running()
        logger.info("Instance %s is running.", instance.id)

        key_path = os.path.join(self._config["privateKeysDirectory"],
                                "%s.pem" % instance.key_pair.name)
        common.ssh_connect_with_retry(
            client,
            instance.public_dns_name,
            self._config["amiUsername"],
            key_path,
            delay=self._config["sshConnectionDelay"],
            retry=self._config["maxSshConnectionRetry"])
        common.fuse_s3_bucket(client, self._config["corpus"])
        gpu_id = 1 if common.has_gpu_support(client) else 0
        task = common.launch_task(
            task_id,
            client,
            gpu_id,
            self._config["logDir"],
            self._config["docker"],
            docker_registry,
            docker_image,
            docker_tag,
            docker_command,
            docker_files,
            wait_after_launch,
            self._config.get('storages'),
            self._config.get('callback_url'),
            self._config.get('callback_interval'))
    except Exception:
        if self._config.get("terminateOnError", True):
            instance.terminate()
        # Bare raise keeps the original traceback; the SSH client is closed
        # exactly once by the finally clause.
        raise
    finally:
        client.close()

    task["instance_id"] = instance.id
    return task
def launch(self,
           task_id,
           options,
           xpulist,
           resource,
           storages,  # pylint: disable=unused-argument
           docker_config,
           docker_registry,
           docker_image,
           docker_tag,
           docker_command,
           docker_files,
           wait_after_launch,
           auth_token,
           support_statistics):
    """Start an EC2 instance and launch the task on it over SSH.

    Args:
        task_id: Identifier forwarded to ``_run_instance`` and
            ``common.launch_task``; also the key cleared from
            ``num_of_inits`` on success.
        options: Launch options; mutated in place (``options['server']`` is
            set to ``resource``) before being resolved by ``_get_params``.
        xpulist: Sequence whose first element is passed to
            ``common.launch_task`` as ``(xpulist[0], None)``.
        resource: Stored as ``options['server']``.
        storages: Not used by this method; the ``storages`` config entry is
            forwarded instead.
        docker_config, docker_registry, docker_image, docker_tag,
        docker_command, docker_files, wait_after_launch,
        support_statistics: Forwarded unchanged to ``common.launch_task``.
        auth_token: When truthy and a callback URL is configured, it is
            embedded in that URL as ``<scheme>://<auth_token>:x@...``.

    Returns:
        The value returned by ``common.launch_task`` with an additional
        ``"instance_id"`` entry holding the EC2 instance id.

    Raises:
        EnvironmentError: If ``_run_instance`` raises ``ClientError``.
        RuntimeError: If ``_run_instance`` returns ``None`` or reports no
            created instance.
        Exception: Any error raised after the instance was created is
            re-raised unchanged, after the instance has been terminated
            (unless the ``terminateOnError`` variable is false).
    """
    global num_of_inits

    options['server'] = resource
    params = _get_params(self._templates, options)
    ec2_client = self._session.client("ec2")
    try:
        response = _run_instance(
            ec2_client,
            params["name"],
            task_id=task_id,
            instance_init_limit=self._config["variables"].get("instanceInitLimit"))
    except ClientError as error:
        raise EnvironmentError('Create instance failed: %s' % error) from error
    if response is None:
        raise RuntimeError("empty response from boto3.run_instances")
    if not response["Instances"]:
        raise RuntimeError("no instances were created")
    instance_id = response["Instances"][0]["InstanceId"]
    ec2 = self._session.resource("ec2")
    instance = ec2.Instance(instance_id)
    variables = self._config["variables"]

    # The connected client is whatever ssh_connect_with_retry returns, so no
    # placeholder SSHClient is created; None marks "nothing to close yet".
    client = None
    try:
        # From here on the instance exists: every failure, including a
        # failed wait, must go through the terminateOnError handling below
        # so the instance is not left running.
        instance.wait_until_running()
        logger.info("EC2 - Instance %s is running.", instance.id)

        client = common.ssh_connect_with_retry(
            instance.public_dns_name,
            22,
            params['login'],
            pkey=self._config.get('pkey'),
            key_filename=self._config.get('key_filename') or self._config.get('privateKey'),
            delay=variables["sshConnectionDelay"],
            retry=variables["maxSshConnectionRetry"])

        # mounting corpus and temporary model directories
        corpus_dir = self._config["corpus"]
        if not isinstance(corpus_dir, list):
            corpus_dir = [corpus_dir]
        for corpus_description in corpus_dir:
            common.fuse_s3_bucket(client, corpus_description)
        if variables.get("temporary_model_storage"):
            common.fuse_s3_bucket(client, variables["temporary_model_storage"])

        callback_url = self._config.get('callback_url')
        # Only inject credentials when a callback URL is configured;
        # calling .replace on a missing (None) URL would abort the launch.
        if auth_token and callback_url:
            callback_url = callback_url.replace("://", "://" + auth_token + ":x@")

        task = common.launch_task(
            task_id,
            client,
            (xpulist[0], None),
            params,
            docker_config,
            docker_registry,
            docker_image,
            docker_tag,
            docker_command,
            docker_files,
            wait_after_launch,
            self._config.get('storages'),
            callback_url,
            self._config.get('callback_interval'),
            support_statistics=support_statistics)
    except Exception:
        if variables.get("terminateOnError", True):
            instance.terminate()
            logger.info("Terminated instance (on launch error): %s.", instance_id)
        # Bare raise keeps the original traceback; the SSH client is closed
        # exactly once by the finally clause.
        raise
    finally:
        if client is not None:
            client.close()

    # Successful launch: drop this task's entry from num_of_inits.
    # NOTE(review): the entry is left in place when the launch fails, as in
    # the original code - confirm that is intended.
    if num_of_inits.get(task_id):
        num_of_inits.pop(task_id)

    task["instance_id"] = instance.id
    return task