def launch(self, task_id, options, xpulist, resource, storages, docker_config,
           docker_registry, docker_image, docker_tag, docker_command,
           docker_files, wait_after_launch, auth_token, support_statistics):
    """Launch a task on ``resource`` using a client from ``self._get_client``.

    Args:
        task_id: Identifier forwarded to ``common.launch_task``.
        options: Mutable mapping of launch options; ``options['server']`` is
            overwritten with ``resource`` before the parameters are resolved.
        xpulist: Forwarded unchanged to ``common.launch_task``.
        resource: Value stored in ``options['server']``.
        storages: Forwarded unchanged to ``common.launch_task``.
        docker_config, docker_registry, docker_image, docker_tag,
        docker_command, docker_files, wait_after_launch: Forwarded unchanged
            to ``common.launch_task``.
        auth_token: When truthy (and a callback URL is configured), embedded
            in the callback URL as ``<token>:x@`` right after the scheme.
        support_statistics: Forwarded as the keyword argument of the same
            name.

    Returns:
        The parameters returned by ``_get_params``, with ``'model'`` and
        ``'pgid'`` copied from the task returned by ``common.launch_task``.

    Raises:
        Any exception raised while resolving parameters, creating the client
        or launching the task; the client is closed before it propagates.
    """
    options['server'] = resource
    params = _get_params(self._config, options)
    client = self._get_client(params=params)
    try:
        callback_url = self._config.get('callback_url')
        # Only the scheme separator must receive the credentials: limit the
        # replacement to the first "://" so a URL embedded later (e.g. in a
        # query string) is left intact. A missing callback URL is passed
        # through as-is instead of failing on ``None.replace``.
        if auth_token and callback_url:
            callback_url = callback_url.replace(
                "://", "://" + auth_token + ":x@", 1)
        task = common.launch_task(
            task_id,
            client,
            xpulist,
            params,
            docker_config,
            docker_registry,
            docker_image,
            docker_tag,
            docker_command,
            docker_files,
            wait_after_launch,
            storages,
            callback_url,
            self._config.get('callback_interval'),
            requirements=self._config.get("requirements"),
            support_statistics=support_statistics)
    finally:
        # Always release the client, whether or not the launch succeeded.
        client.close()
    params['model'] = task['model']
    params['pgid'] = task['pgid']
    return params
def launch(self, task_id, options, gpulist, resource, docker_registry,
           docker_image, docker_tag, docker_command, docker_files,
           wait_after_launch):
    """Start a task on ``resource`` over an SSH connection.

    ``options['server']`` is set to ``resource`` before the launch parameters
    are resolved with ``_get_params``. The connection is opened with
    ``common.ssh_connect_with_retry`` and is always closed once
    ``common.launch_task`` has returned or raised.

    Args:
        task_id: Identifier forwarded to ``common.launch_task``.
        options: Mutable mapping of launch options (modified in place).
        gpulist: Forwarded unchanged to ``common.launch_task``.
        resource: Value stored in ``options['server']``.
        docker_registry, docker_image, docker_tag, docker_command,
        docker_files, wait_after_launch: Forwarded unchanged to
            ``common.launch_task``.

    Returns:
        The resolved parameters, extended with the ``'model'`` and ``'pgid'``
        entries of the launched task.
    """
    options['server'] = resource
    settings = _get_params(self._config, options)
    connection = common.ssh_connect_with_retry(
        settings['server'],
        settings['login'],
        self._config['privateKey'],
        login_cmd=settings['login_cmd'],
    )
    try:
        run_task = common.launch_task
        # Required entries are looked up inside the ``try`` so that a missing
        # key still closes the connection before the error propagates.
        log_dir = settings['log_dir']
        docker_settings = self._config['docker']
        launched = run_task(
            task_id,
            connection,
            gpulist,
            log_dir,
            docker_settings,
            docker_registry,
            docker_image,
            docker_tag,
            docker_command,
            docker_files,
            wait_after_launch,
            self._config.get('storages'),
            self._config.get('callback_url'),
            self._config.get('callback_interval'),
            requirements=self._config.get("requirements"),
        )
    finally:
        connection.close()
    # Copy the task identifiers onto the parameters handed back to the caller.
    for key in ('model', 'pgid'):
        settings[key] = launched[key]
    return settings
def launch(self, task_id, options, resource, docker_registry, docker_image,
           docker_tag, docker_command, docker_files, wait_after_launch):
    """Create an EC2 instance for ``resource`` and launch a task on it.

    The instance is created through ``_run_instance``; once it is running an
    SSH connection is opened, the configured corpus bucket is mounted with
    ``common.fuse_s3_bucket`` and the task is started with
    ``common.launch_task``.

    Args:
        task_id: Identifier forwarded to ``common.launch_task``.
        options: Not used by this implementation.
        resource: Forwarded to ``_run_instance``.
        docker_registry, docker_image, docker_tag, docker_command,
        docker_files, wait_after_launch: Forwarded unchanged to
            ``common.launch_task``.

    Returns:
        The task returned by ``common.launch_task`` with ``'instance_id'``
        set to the id of the created instance.

    Raises:
        RuntimeError: If ``_run_instance`` returns ``None`` or a response
            whose ``"Instances"`` entry is empty.
        Exception: Any error raised while connecting, mounting or launching
            is re-raised after the optional instance termination.
    """
    ec2_client = self._session.client("ec2")
    response = _run_instance(ec2_client, resource)
    if response is None:
        raise RuntimeError("empty response from boto3.run_instances")
    if not response["Instances"]:
        raise RuntimeError("no instances were created")
    instance_id = response["Instances"][0]["InstanceId"]
    ec2 = self._session.resource("ec2")
    instance = ec2.Instance(instance_id)
    instance.wait_until_running()
    logger.info("Instance %s is running.", instance.id)

    key_path = os.path.join(
        self._config["privateKeysDirectory"],
        "%s.pem" % instance.key_pair.name)
    client = paramiko.SSHClient()
    try:
        common.ssh_connect_with_retry(
            client,
            instance.public_dns_name,
            self._config["amiUsername"],
            key_path,
            delay=self._config["sshConnectionDelay"],
            retry=self._config["maxSshConnectionRetry"])
        common.fuse_s3_bucket(client, self._config["corpus"])
        gpu_id = 1 if common.has_gpu_support(client) else 0
        task = common.launch_task(
            task_id,
            client,
            gpu_id,
            self._config["logDir"],
            self._config["docker"],
            docker_registry,
            docker_image,
            docker_tag,
            docker_command,
            docker_files,
            wait_after_launch,
            self._config.get('storages'),
            self._config.get('callback_url'),
            self._config.get('callback_interval'))
    except Exception:
        # Do not leave a half-initialised instance running unless the
        # configuration explicitly asks to keep it.
        if self._config.get("terminateOnError", True):
            instance.terminate()
        # Bare ``raise`` keeps the original traceback; the connection is
        # closed once, by the ``finally`` clause below.
        raise
    finally:
        client.close()
    task["instance_id"] = instance.id
    return task
def launch(self, task_id, options, xpulist, resource, docker_registry,
           docker_image, docker_tag, docker_command, docker_files,
           wait_after_launch, auth_token, support_statistics):
    """Launch a task on ``resource`` over an SSH connection.

    Args:
        task_id: Identifier forwarded to ``common.launch_task``.
        options: Mutable mapping of launch options; ``options['server']`` is
            overwritten with ``resource`` before the parameters are resolved.
        xpulist: Forwarded unchanged to ``common.launch_task``.
        resource: Value stored in ``options['server']``.
        docker_registry, docker_image, docker_tag, docker_command,
        docker_files, wait_after_launch: Forwarded unchanged to
            ``common.launch_task``.
        auth_token: When truthy (and a callback URL is configured), embedded
            in the callback URL as ``<token>:x@`` right after the scheme.
        support_statistics: Forwarded as the keyword argument of the same
            name.

    Returns:
        The parameters returned by ``_get_params``, with ``'model'`` and
        ``'pgid'`` copied from the task returned by ``common.launch_task``.

    Raises:
        Any exception raised while connecting or launching the task; once the
        connection exists it is closed before the error propagates.
    """
    options['server'] = resource
    params = _get_params(self._config, options)
    client = common.ssh_connect_with_retry(
        params['host'],
        params['port'],
        params['login'],
        pkey=self._config.get('pkey'),
        # 'privateKey' is only consulted when 'key_filename' is unset/empty.
        key_filename=(self._config.get('key_filename')
                      or self._config.get('privateKey')),
        login_cmd=params['login_cmd'])
    try:
        callback_url = self._config.get('callback_url')
        # Only the scheme separator must receive the credentials: limit the
        # replacement to the first "://" so a URL embedded later (e.g. in a
        # query string) is left intact. A missing callback URL is passed
        # through as-is instead of failing on ``None.replace``.
        if auth_token and callback_url:
            callback_url = callback_url.replace(
                "://", "://" + auth_token + ":x@", 1)
        task = common.launch_task(
            task_id,
            client,
            xpulist,
            params['log_dir'],
            self._config['docker'],
            docker_registry,
            docker_image,
            docker_tag,
            docker_command,
            docker_files,
            wait_after_launch,
            self._config.get('storages'),
            callback_url,
            self._config.get('callback_interval'),
            requirements=self._config.get("requirements"),
            support_statistics=support_statistics)
    finally:
        # Always release the SSH connection, whatever the launch outcome.
        client.close()
    params['model'] = task['model']
    params['pgid'] = task['pgid']
    return params
def launch(self,
           task_id,
           options,
           xpulist,
           resource,
           storages,  # pylint: disable=unused-argument
           docker_config,
           docker_registry,
           docker_image,
           docker_tag,
           docker_command,
           docker_files,
           wait_after_launch,
           auth_token,
           support_statistics):
    """Create an EC2 instance for ``resource`` and launch a task on it.

    The instance is created through ``_run_instance``. Once it is running, an
    SSH connection is opened, the configured corpus bucket(s) and the optional
    temporary model storage are mounted with ``common.fuse_s3_bucket``, and
    the task is started with ``common.launch_task``.

    Args:
        task_id: Identifier forwarded to ``_run_instance`` and
            ``common.launch_task``.
        options: Mutable mapping of launch options; ``options['server']`` is
            overwritten with ``resource`` before the parameters are resolved.
        xpulist: Sequence whose first element is passed to
            ``common.launch_task`` as ``(xpulist[0], None)``.
        resource: Value stored in ``options['server']``.
        storages: Unused; ``self._config.get('storages')`` is passed instead.
        docker_config, docker_registry, docker_image, docker_tag,
        docker_command, docker_files, wait_after_launch: Forwarded unchanged
            to ``common.launch_task``.
        auth_token: When truthy (and a callback URL is configured), embedded
            in the callback URL as ``<token>:x@`` right after the scheme.
        support_statistics: Forwarded as the keyword argument of the same
            name.

    Returns:
        The task returned by ``common.launch_task`` with ``'instance_id'``
        set to the id of the created instance.

    Raises:
        EnvironmentError: If ``_run_instance`` raises ``ClientError``.
        RuntimeError: If ``_run_instance`` returns ``None`` or a response
            whose ``"Instances"`` entry is empty.
        Exception: Any error raised while connecting, mounting or launching
            is re-raised after the optional instance termination.
    """
    options['server'] = resource
    params = _get_params(self._templates, options)
    ec2_client = self._session.client("ec2")
    try:
        response = _run_instance(
            ec2_client,
            params["name"],
            task_id=task_id,
            instance_init_limit=self._config["variables"].get(
                "instanceInitLimit"))
    except ClientError as error:
        raise EnvironmentError('Create instance failed: %s' % error) from error
    if response is None:
        raise RuntimeError("empty response from boto3.run_instances")
    if not response["Instances"]:
        raise RuntimeError("no instances were created")
    instance_id = response["Instances"][0]["InstanceId"]
    ec2 = self._session.resource("ec2")
    instance = ec2.Instance(instance_id)
    instance.wait_until_running()
    logger.info("EC2 - Instance %s is running.", instance.id)

    # ``ssh_connect_with_retry`` returns the connected client, so no
    # placeholder client is needed; ``None`` marks "not connected yet".
    client = None
    try:
        client = common.ssh_connect_with_retry(
            instance.public_dns_name,
            22,
            params['login'],
            pkey=self._config.get('pkey'),
            # 'privateKey' is only consulted when 'key_filename' is unset.
            key_filename=(self._config.get('key_filename')
                          or self._config.get('privateKey')),
            delay=self._config["variables"]["sshConnectionDelay"],
            retry=self._config["variables"]["maxSshConnectionRetry"])

        # mounting corpus and temporary model directories
        corpus_dir = self._config["corpus"]
        if not isinstance(corpus_dir, list):
            corpus_dir = [corpus_dir]
        for corpus_description in corpus_dir:
            common.fuse_s3_bucket(client, corpus_description)
        if self._config["variables"].get("temporary_model_storage"):
            common.fuse_s3_bucket(
                client, self._config["variables"]["temporary_model_storage"])

        callback_url = self._config.get('callback_url')
        # Only the scheme separator must receive the credentials: limit the
        # replacement to the first "://" and pass a missing callback URL
        # through as-is instead of failing on ``None.replace``.
        if auth_token and callback_url:
            callback_url = callback_url.replace(
                "://", "://" + auth_token + ":x@", 1)
        task = common.launch_task(
            task_id,
            client,
            (xpulist[0], None),
            params,
            docker_config,
            docker_registry,
            docker_image,
            docker_tag,
            docker_command,
            docker_files,
            wait_after_launch,
            self._config.get('storages'),
            callback_url,
            self._config.get('callback_interval'),
            support_statistics=support_statistics)
    except Exception:
        # Do not leave a half-initialised instance running unless the
        # configuration explicitly asks to keep it.
        if self._config["variables"].get("terminateOnError", True):
            instance.terminate()
            logger.info("Terminated instance (on launch error): %s.",
                        instance_id)
        # Bare ``raise`` keeps the original traceback; the connection is
        # closed once, by the ``finally`` clause below.
        raise
    finally:
        if client is not None:
            client.close()

    # NOTE(review): ``num_of_inits`` is a module-level mapping defined outside
    # this block (presumably a per-task init-attempt counter - confirm). The
    # entry is dropped here, after a successful launch; verify that this
    # matches the original placement relative to the ``finally`` clause.
    if num_of_inits.get(task_id):
        num_of_inits.pop(task_id)
    task["instance_id"] = instance.id
    return task
def launch(self,
           task_id,
           options,
           xpulist,
           resource,
           storages,  # pylint: disable=unused-argument
           docker_config,
           docker_registry,
           docker_image,
           docker_tag,
           docker_command,
           docker_files,
           wait_after_launch,
           auth_token,
           support_statistics):
    """Create a nova instance for ``resource`` and launch a task on it.

    The instance is created through ``_run_instance``; its first IPv4 address
    on the ``'Ext-Net'`` network becomes ``params['host']``. An SSH client is
    then obtained with ``get_client`` and the task is started with
    ``common.launch_task``.

    Args:
        task_id: Identifier forwarded to ``_run_instance`` and
            ``common.launch_task``.
        options: Mutable mapping of launch options; ``options['server']`` is
            overwritten with ``resource`` before the parameters are resolved.
        xpulist: Sequence whose first element is passed to
            ``common.launch_task`` as ``(xpulist[0], None)``.
        resource: Value stored in ``options['server']``.
        storages: Unused; ``self._config.get('storages')`` is passed instead.
        docker_config, docker_registry, docker_image, docker_tag,
        docker_command, docker_files, wait_after_launch: Forwarded unchanged
            to ``common.launch_task``.
        auth_token: When truthy (and a callback URL is configured), embedded
            in the callback URL as ``<token>:x@`` right after the scheme.
        support_statistics: Forwarded as the keyword argument of the same
            name.

    Returns:
        The task returned by ``common.launch_task``, extended with
        ``'instance_id'``, ``'host'``, ``'port'``, ``'login'``, ``'log_dir'``
        and ``'dynamic'`` taken from the instance and resolved parameters.

    Raises:
        RuntimeError: If ``_run_instance`` returns a falsy value.
        IndexError: If the instance has no IPv4 address on ``'Ext-Net'``.
        Exception: Any error raised while connecting or launching is
            re-raised after the optional instance termination.
    """
    options['server'] = resource
    params = _get_params(self._templates, options)
    params['service'] = 'nova'
    if not params.get('port'):
        params['port'] = 22
    nova_client = self._nova_client
    instance = _run_instance(nova_client, params, self._config,
                             task_id=task_id)
    if not instance:
        raise RuntimeError("no instances were created")
    logger.info("OVH - Instance %s is running.", instance.id)

    ipv4_addresses = [addr for addr in instance.addresses['Ext-Net']
                      if addr.get('version') == 4]
    params['host'] = ipv4_addresses[0]['addr']

    # ``get_client`` returns the client to use, so no placeholder client is
    # needed; ``None`` marks "not connected yet".
    ssh_client = None
    try:
        ssh_client = get_client(params, self._config)
        callback_url = self._config.get('callback_url')
        # Only the scheme separator must receive the credentials: limit the
        # replacement to the first "://" and pass a missing callback URL
        # through as-is instead of failing on ``None.replace``.
        if auth_token and callback_url:
            callback_url = callback_url.replace(
                "://", "://" + auth_token + ":x@", 1)
        task = common.launch_task(
            task_id,
            ssh_client,
            (xpulist[0], None),
            params,
            docker_config,
            docker_registry,
            docker_image,
            docker_tag,
            docker_command,
            docker_files,
            wait_after_launch,
            self._config.get('storages'),
            callback_url,
            self._config.get('callback_interval'),
            support_statistics=support_statistics)
    except Exception:
        # Do not leave a half-initialised instance running unless the
        # configuration explicitly asks to keep it.
        if self._config["variables"].get("terminateOnError", True):
            params['instance_id'] = instance.id
            self.terminate(params)
            logger.info("Terminated instance (on launch error): %s.",
                        instance.id)
        # Bare ``raise`` keeps the original traceback; the connection is
        # closed once, by the ``finally`` clause below.
        raise
    finally:
        if ssh_client is not None:
            ssh_client.close()

    task['instance_id'] = instance.id
    task['host'] = params['host']
    task['port'] = params['port']
    task['login'] = params['login']
    task['log_dir'] = params['log_dir']
    task['dynamic'] = params.get('dynamic')
    return task